From 5c2ae01e8dadfd8f22346afb31ceb974759051c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 24 Sep 2025 11:00:41 +0000 Subject: [PATCH 001/248] chore: Add codeowners file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- CODEOWNERS | 63 +----------------------------------------------------- 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 6f59d98afb6..48513c28d32 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,64 +1,7 @@ # Core -[Core-ADLR] @mcore-reviewers/core-adlr +[Core] @mcore-reviewers/dev-core megatron/core/ -[Core-NeMo] @mcore-reviewers/core-nemo -megatron/core/ - -^[Core-MLPerf] @mcore-reviewers/mlperf -megatron/core/ - -[GPT] @mcore-reviewers/gpt -megatron/core/models/gpt/ - -[Multimodal] @mcore-reviewers/multi-modal -megatron/core/models/multimodal/ - -[Hybrid-mamba] @mcore-reviewers/hybrid-mamba -megatron/core/models/mamba/ - -# Distributed Checkpointing -[Distributed Checkpointing] @mcore-reviewers/dist-checkpointing -megatron/core/dist_checkpointing/ - -# Distributed Optimizer -[Distributed Optimizer] @mcore-reviewers/dist-optimizer -megatron/core/optimizer/distrib_optimizer/ - -# Quantization and Inference (QAT) -[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference -megatron/core/inference/modelopt_support - -# Datasets -[Datasets] @mcore-reviewers/datasets -megatron/core/datasets/ - -# Parallelism -[Pipeline Parallelism] @mcore-reviewers/pipeline-parallelism -megatron/core/pipeline_parallel/ - -# Transformer -[Transformer] @mcore-reviewers/core-adlr @mcore-reviewers/core-nemo -megatron/core/transformer/ - -[MoE-ADLR] @mcore-reviewers/moe-adlr -megatron/core/transformer/moe/ - -[MoE-Moe] @mcore-reviewers/moe-moe -megatron/core/transformer/moe/ - -# Inference -[Inference] @mcore-reviewers/inference -megatron/core/inference/ - -# Parallel State -[ParallelState] @mcore-reviewers/core-adlr @mcore-reviewers/core-nemo -megatron/core/parallel_state.py - -[Post-Training] @mcore-reviewers/post-training -megatron/core/post_training/ -megatron/post_training - [CI][1] @mcore-reviewers/ci .gitlab/ .github/ @@ -68,7 +11,3 @@ Dockerfile.ci.dev tests/ megatron/core/transformer/transformer_block.py megatron/core/transformer/transformer_layer.py - -[RL] @mcore-reviewers/rl -megatron/rl/ -examples/rl/ \ No newline at end of file From 454e7b5ecfb7e19e2d06dce153e90690587cce70 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Wed, 24 Sep 2025 15:18:22 -0700 Subject: [PATCH 002/248] ADLR/megatron-lm!4065 - ci: Add main/dev branching to queuemanager --- .gitlab/stages/02.test.yml | 2 +- .../python_scripts/wait_for_resources.py | 29 +++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index ed050e19864..8abdf310156 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -36,7 +36,7 @@ wait_for_resources: - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} - export GITLAB_ENDPOINT - export NUM_CONCURRENT_JOBS - - python tests/test_utils/python_scripts/wait_for_resources.py --pipeline-id $CI_PIPELINE_ID + - python tests/test_utils/python_scripts/wait_for_resources.py --pipeline-id $CI_PIPELINE_ID --target-branch $CI_MERGE_REQUEST_TARGET_BRANCH_NAME rules: - if: $CI_MERGE_REQUEST_LABELS =~ /fast-track/ when: never diff --git a/tests/test_utils/python_scripts/wait_for_resources.py b/tests/test_utils/python_scripts/wait_for_resources.py index 6b20fc55c96..c653567c0f6 100644 --- a/tests/test_utils/python_scripts/wait_for_resources.py +++ b/tests/test_utils/python_scripts/wait_for_resources.py @@ -2,7 +2,9 @@ import logging import os +import re import time +from typing import Literal import click import gitlab @@ -11,7 +13,7 @@ PROJECT_ID = int(os.getenv("CI_PROJECT_ID", 19378)) GITLAB_ENDPOINT = os.getenv("GITLAB_ENDPOINT") RO_API_TOKEN = os.getenv("RO_API_TOKEN") -NUM_CONCURRENT_JOBS = int(os.getenv("NUM_CONCURRENT_JOBS", 2)) +NUM_CONCURRENT_JOBS = int(os.getenv("NUM_CONCURRENT_JOBS", 2)) // 2 # for main and dev branch logging.basicConfig() logger = logging.getLogger(__name__) @@ -22,12 +24,14 @@ def get_gitlab_handle(): return gitlab.Gitlab(f"https://{GITLAB_ENDPOINT}", private_token=os.getenv("RO_API_TOKEN")) -def ci_is_busy(pipeline): +def ci_is_busy(pipeline, target_branch: str): """List all merge request pipelines created before the given pipeline that are still pending or running.""" mr_pipelines = ( get_gitlab_handle() .projects.get(PROJECT_ID) - .pipelines.list(source="merge_request_event", get_all=True) + .pipelines.list( + source="merge_request_event", per_page=100, page=1, order_by="id", sort="desc" + ) ) pipeline_time = pipeline.attributes["created_at"] @@ -36,22 +40,32 @@ def ci_is_busy(pipeline): p for p in mr_pipelines if p.attributes["created_at"] < pipeline_time + if ( + get_gitlab_handle() + .projects.get(PROJECT_ID) + .mergerequests.get( + int(re.search(r'merge-requests/(\d+)', p.attributes["ref"]).group(1)) + ) + .target_branch + == target_branch + ) and p.attributes["status"] in ("pending", "running") ] ) - logger.info(f"In queue: {in_queue}. Waiting for resources...") + logger.info(f"Position in queue: {in_queue+1}. Waiting for resources...") return in_queue > NUM_CONCURRENT_JOBS @click.command() @click.option("--pipeline-id", required=True, type=int, help="CI pipeline ID to check") -def main(pipeline_id): +@click.option("--target-branch", required=True, type=str, help="Target branch to check") +def main(pipeline_id, target_branch): pipeline = get_gitlab_handle().projects.get(PROJECT_ID).pipelines.get(pipeline_id) logger.info(f"Job concurrency: {NUM_CONCURRENT_JOBS}") while True: try: - is_busy = ci_is_busy(pipeline) + is_busy = ci_is_busy(pipeline, target_branch) if not is_busy: break time.sleep(60) @@ -60,6 +74,9 @@ def main(pipeline_id): logger.info(f"Network error. Retrying... {e}") time.sleep(15) continue + except Exception as e: + logger.error(f"Error: {e}") + break if __name__ == "__main__": From c0188dc2aa94e68cd3521176dbc549970ab686cb Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 29 Sep 2025 09:34:04 -0700 Subject: [PATCH 003/248] ADLR/megatron-lm!4090 - cp: `!4084 - ci: Send dev alerts to separate channel` --- .gitlab/scripts/build.sh | 1 - .gitlab/stages/02.test.yml | 9 +++++++-- .gitlab/stages/04.functional-tests.yml | 8 ++++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh index edb774e72bd..960af104628 100644 --- a/.gitlab/scripts/build.sh +++ b/.gitlab/scripts/build.sh @@ -44,7 +44,6 @@ JET_API_VERSION=$(curl -s -u "$ARTIFACTORY_USER:$ARTIFACTORY_TOKEN" "https://sc- DOCKER_BUILDKIT=1 docker build \ --secret id=JET_INDEX_URLS \ --secret id=LOGGER_INDEX_URL \ - --secret id=EXPERIMENTAL_FLASH_ATTN \ --target $STAGE \ -f docker/$FILE \ -t ${IMAGE}:${CI_PIPELINE_ID} \ diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 8abdf310156..72f1491b07c 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -216,7 +216,12 @@ test:unit_tests_notify: - team/megatron script: - env - - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - | + if [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} + else + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + fi - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} - export GITLAB_ENDPOINT - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || "0") @@ -232,7 +237,7 @@ test:unit_tests_notify: paths: - scripts rules: - - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended" + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == "ci-unit-test-extended" || "ci-dev-unit-test-extended") when: always - when: never diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index a8575e921ee..084787e8ec3 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -202,12 +202,16 @@ functional:x_notify: - purpose/utility - team/megatron variables: - WEBHOOK_URL: ${MCORE_NOTIFICATION_HOOK} RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE} CONTEXT: $FUNCTIONAL_TEST_SCOPE script: - env - - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - | + if [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} + else + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + fi - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} - export GITLAB_ENDPOINT - export CONTEXT=$FUNCTIONAL_TEST_SCOPE From 4808e33c6052fcfd2da66f82c35b3957ddf3c2d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 30 Sep 2025 08:48:15 +0000 Subject: [PATCH 004/248] ci(hotfix): Nightly runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 72f1491b07c..6eb60d03ec7 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -237,7 +237,7 @@ test:unit_tests_notify: paths: - scripts rules: - - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == "ci-unit-test-extended" || "ci-dev-unit-test-extended") + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == "ci-unit-test-extended" || $CI_COMMIT_BRANCH == "ci-dev-unit-test-extended") when: always - when: never From a43c0483c8f472e7954ecca5c919868400a3d951 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 3 Oct 2025 08:40:37 -0700 Subject: [PATCH 005/248] ADLR/megatron-lm!4127 - ADLR/megatron-lm!4084 - ci: Send dev alerts to separate channel --- .gitlab/stages/02.test.yml | 6 +++++- .gitlab/stages/04.functional-tests.yml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 6eb60d03ec7..49135bda6af 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -163,6 +163,8 @@ test:unit_tests_pyt(DEV)_mcore(legacy): ENVIRONMENT: dev TAG: legacy rules: + - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' + when: never - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ when: never - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' @@ -179,6 +181,8 @@ test:unit_tests_pyt(LTS)_mcore(legacy): ENVIRONMENT: lts TAG: legacy rules: + - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' + when: never - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ when: never - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' @@ -217,7 +221,7 @@ test:unit_tests_notify: script: - env - | - if [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + if [[ "$CI_COMMIT_BRANCH" == "*dev*" ]]; then export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} else export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index 084787e8ec3..4b7c17668fe 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -207,7 +207,7 @@ functional:x_notify: script: - env - | - if [[ "$CI_COMMIT_BRANCH" == "dev" ]]; then + if [[ "$CI_COMMIT_BRANCH" == "*dev*" ]]; then export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} else export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} From c862095921ad876628bc27f72505dfc6ad407e8f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 3 Oct 2025 09:16:40 -0700 Subject: [PATCH 006/248] ADLR/megatron-lm!4128 - ci: Auto-cherrypick MR into main --- .gitlab/stages/00.pre.yml | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 5e209e62548..c91ffc80995 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -141,6 +141,69 @@ pre:label_merge_request: source labels curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT +pre:maybe_cherry_pick_to_main: + rules: + - if: "$CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' && $CI_MERGE_REQUEST_LABELS =~ /mirror-to-main/" + - when: never + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + stage: .pre + image: nentangso/alpine-git-curl-jq + variables: + GIT_STRATEGY: "clone" + script: + - | + set -x + MR_ID=$CI_MERGE_REQUEST_IID + TARGET_BRANCH="cp/$MR_ID-into-main" + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$TARGET_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "true" ]]; then + echo Target branch already exists, will not cherry-pick again. + exit 0 + fi + + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') + + git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" + git config --global user.email "mcore-bot@nvidia.com" + git config --global user.name "Mcore Bot" + + git fetch origin dev + git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME + git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME + START_COMMIT=$(git merge-base origin/dev origin/$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME) + END_COMMIT=$(git rev-parse HEAD) + + git fetch origin main + git checkout main + git checkout -b $TARGET_BRANCH + + git cherry-pick $START_COMMIT..$END_COMMIT + git push -u origin $TARGET_BRANCH + + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=$TARGET_BRANCH" \ + -d "target_branch=main" \ + -d "title=cp MR !$MR_ID from dev: \`$TITLE\`" \ + -d "labels=cherry-picked-from-dev" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry picked \`$TITLE (!$MR_ID)\` into \`main\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" + pre:maybe_cherry_pick_commit: rules: - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"' From f9bb58c87e5e78fa031259cfe48bffc4ad12da0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 10 Oct 2025 09:16:49 +0000 Subject: [PATCH 007/248] ci: Re-add safe-imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 49135bda6af..b271f72b3bd 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -410,7 +410,7 @@ test:safe_imports: - python -m pip install --no-cache-dir click - python .gitlab/scripts/check_imports.py --package-name megatron.core rules: - - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'dev' when: never - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true From 2a6ca17db30d0e0daf501a0838720c417a88894c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 10 Oct 2025 09:20:03 +0000 Subject: [PATCH 008/248] ci: No legacy for unit test extended MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index b271f72b3bd..e3ea9fdd68c 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -165,6 +165,8 @@ test:unit_tests_pyt(DEV)_mcore(legacy): rules: - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' when: never + - if: $CI_COMMIT_BRANCH == 'ci-dev-unit-test-extended' + when: never - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ when: never - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' @@ -183,6 +185,8 @@ test:unit_tests_pyt(LTS)_mcore(legacy): rules: - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' when: never + - if: $CI_COMMIT_BRANCH == 'ci-dev-unit-test-extended' + when: never - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ when: never - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' From 54825abc134efe545dff8669039f0f3fe74f6999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 10 Oct 2025 09:22:58 +0000 Subject: [PATCH 009/248] ci: Reduce number of repeats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index be4b658f2d6..6b46d92aacb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,7 +6,7 @@ INTEGRATION_TEST_SCOPE: mr FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr-slim - FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 CLUSTER_A100: "" CLUSTER_H100: "" @@ -72,7 +72,7 @@ workflow: INTEGRATION_TEST_SCOPE: mr FUNCTIONAL_TEST: "no" FUNCTIONAL_TEST_SCOPE: mr-slim - FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 CLUSTER_A100: "" CLUSTER_H100: "" @@ -119,7 +119,7 @@ workflow: INTEGRATION_TEST: "no" FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr - FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 CLUSTER_A100: "" CLUSTER_H100: "" From 15819b664c52c5426a6110d088fab9e121de5f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 10 Oct 2025 14:34:30 +0000 Subject: [PATCH 010/248] ci: Fix notification channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- .gitlab/stages/04.functional-tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index e3ea9fdd68c..71f49f55055 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -225,7 +225,7 @@ test:unit_tests_notify: script: - env - | - if [[ "$CI_COMMIT_BRANCH" == "*dev*" ]]; then + if [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} else export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index 4b7c17668fe..7fe8aad0771 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -207,7 +207,7 @@ functional:x_notify: script: - env - | - if [[ "$CI_COMMIT_BRANCH" == "*dev*" ]]; then + if [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} else export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} From 879a7a1e33cddf88523a587ffb4b9f1c7e163591 Mon Sep 17 00:00:00 2001 From: Deyu Fu Date: Fri, 10 Oct 2025 07:34:34 -0700 Subject: [PATCH 011/248] ADLR/megatron-lm!4106 - [DEV] Add muon and layer-wise distributed optimizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Zijie Yan Co-authored-by: Hao Wu Co-authored-by: oliver könig Co-authored-by: Boxiang Wang Co-authored-by: mikail --- docker/Dockerfile.ci.dev | 2 +- .../core/optimizer/layer_wise_optimizer.py | 158 +++++++++ megatron/core/optimizer/muon.py | 307 ++++++++++++++++++ megatron/core/optimizer/optimizer_config.py | 25 +- megatron/core/tensor_parallel/layers.py | 1 + megatron/training/arguments.py | 28 +- megatron/training/checkpointing.py | 14 +- megatron/training/training.py | 36 +- pyproject.toml | 3 + tests/unit_tests/test_muon_optimizer.py | 245 ++++++++++++++ uv.lock | 14 + 11 files changed, 818 insertions(+), 15 deletions(-) create mode 100644 megatron/core/optimizer/layer_wise_optimizer.py create mode 100644 megatron/core/optimizer/muon.py create mode 100644 tests/unit_tests/test_muon_optimizer.py diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 45b0cba871c..b3295697f31 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -32,7 +32,7 @@ COPY megatron/core/package_info.py /workspace/megatron/core/ RUN --mount=type=cache,target=/root/.cache/uv \ bash -ex <<"EOF" uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages - uv sync --extra dev --extra mlm --link-mode copy --locked \ + uv sync --extra dev --extra mlm --link-mode copy --locked --all-groups \ --no-install-package torch \ --no-install-package torchvision \ --no-install-package triton \ diff --git a/megatron/core/optimizer/layer_wise_optimizer.py b/megatron/core/optimizer/layer_wise_optimizer.py new file mode 100644 index 00000000000..b398a645ce3 --- /dev/null +++ b/megatron/core/optimizer/layer_wise_optimizer.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch + +from .optimizer import ChainedOptimizer, MegatronOptimizer, Float16OptimizerWithFloat16Params +from .optimizer_config import OptimizerConfig +from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 + +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.utils import get_pg_rank, get_pg_size + + +class LayerWiseDistributedOptimizer(ChainedOptimizer): + """Layer-wise distributed optimizer for Megatron-core models. + + This is a experimental distributed optimizer wrapper that distributes weight to DP ranks by full layer. + Implemented as ChainedOptimizer to support different weights use different optimizers (e.g. muon+adam) + When using, keep all megatron distributed optimizer related options OFF. + + How LayerWiseDistributedOptimizer work: + 1. weights are splited into lists and each rank only keep its shard in its optimizer + 2. Megatron DDP handle allreduce grad for all params, note that each rank have full model and grad + 3. optimizer is already modified so only param belong to this DP rank is updated + 3. grad_norm and zero counting will reduce metrics globally in step function + 4. Do regular update with chained optimizers, optimizer is already modified so partial update happens + 5. allgather updated params to every rank(currently through broadcast loop) + """ + def __init__( + self, + optimizers: List[MegatronOptimizer], + config: OptimizerConfig, + pg_collection: Optional[ProcessGroupCollection] = None, + ) -> None: + self.pg_collection = pg_collection + self.shard_params(optimizers) + # wrap optimizer after sharding to avoid unnecessary master weight creation + # TODO(deyuf): check if underlying optimizer.config need to fixed and if so can use that instead of passing + if config.bf16: + if isinstance(optimizers[0], Float16OptimizerWithFloat16Params): + raise TypeError('LayerWiseDistributedOptimizer received Float16 optimizer already.') + optimizers = [Float16OptimizerWithFloat16Params(optim, config, None, None) for optim in optimizers] + super().__init__(optimizers) + + # TODO(kunlun, deyuf): potential future perf optimization + # since allreduce is unchanged and handled by megatron DDP, they're already in contiguous gbuf + # so instead of shard param by layer randomly, we can still shard by buf range but keep some "extras" + # to keep boundary weight not sharded. This way each rank do some duplicated work but we can call + # single allgather later and all current distopt optimization can be applied + + def shard_params(self, optimizers): + """Shard all params into lists by rank. """ + # We'll optimize sharding later if there is perf issue. should be ok since linear are grouped already + # Key is to create separate sharding for dp/expt parallel, saved in dp_cp_params_list, expt_dp_params_list + # example of 4 dp rank and 10 non-expert parameters p0-p9, then dp_cp_params_list will look like + # [[p0, p4, p8], [p1, p5, p9], [p2, p6], [p3, p7]] + + # simplify when dp_cp group size is 1 + if get_pg_size(self.pg_collection.dp_cp) == 1: + self.dp_cp_params_list = None + self.expt_dp_params_list = None + return + + dp_cp_idx, expt_dp_idx = 0, 0 + dp_cp_size = get_pg_size(self.pg_collection.dp_cp) + expt_dp_size = get_pg_size(self.pg_collection.expt_dp) + self.dp_cp_params_list = [[] for _ in range(dp_cp_size)] + self.expt_dp_params_list = [[] for _ in range(expt_dp_size)] + # get all param groups, this is called before init so cannot rely on Chained optimizer method + param_groups = [] + for optimizer in optimizers: + param_groups += optimizer.param_groups + for group in param_groups: + params_this_rank = [] + if group["is_expert_parallel"]: + for p in group["params"]: + if expt_dp_idx == get_pg_rank(self.pg_collection.expt_dp): + params_this_rank.append(p) + self.expt_dp_params_list[expt_dp_idx].append(p) + expt_dp_idx = (expt_dp_idx + 1) % expt_dp_size + else: + for p in group["params"]: + if dp_cp_idx == get_pg_rank(self.pg_collection.dp_cp): + params_this_rank.append(p) + self.dp_cp_params_list[dp_cp_idx].append(p) + dp_cp_idx = (dp_cp_idx + 1) % dp_cp_size + # now we modify the group to only handle local params + group["params"] = params_this_rank + + # simplify when expt_dp group size is 1 or expert parallel is off + if expt_dp_size == 1 or len(self.expt_dp_params_list[0]) == 0: + self.expt_dp_params_list = None + + @torch.no_grad() + def broadcast_params(self): + """All rank broadcast updated local params(allgatherv). """ + # Broadcast linear layer weights to all other ranks. + # This may not be slower than PyTorch allgatherv which calls broadcast internally. + # TODO(skyw): Profile and implement more efficient version. + if self.dp_cp_params_list is None: + return + for i, params in enumerate(self.dp_cp_params_list): + src_global_rank = torch.distributed.get_global_rank(self.pg_collection.dp_cp, i) + for p in params: + torch.distributed.broadcast(p, src_global_rank, self.pg_collection.dp_cp) + if self.expt_dp_params_list is None: + return + for i, params in enumerate(self.expt_dp_params_list): + src_global_rank = torch.distributed.get_global_rank(self.pg_collection.expt_dp, i) + for p in params: + torch.distributed.broadcast(p, src_global_rank, self.pg_collection.expt_dp) + + @torch.no_grad() + def get_grad_norm(self): + # similar to dist opt, always aggregate globally + grads_for_norm = [] + for optimizer in self.chained_optimizers: + grads_for_norm += optimizer.get_main_grads_for_grad_norm() + grad_norm = get_grad_norm_fp32( + grads_for_norm, grad_stats_parallel_group=None + ) + return grad_norm + + @torch.no_grad() + def count_zeros(self): + params = [] + for optimizer in self.chained_optimizers: + params += optimizer.get_parameters() + return count_zeros_fp32( + params, + grad_stats_parallel_group=None, + use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8, + ) + + @torch.no_grad() + def step(self): # type: ignore[no-untyped-def] + """step function for layer-wise optimizer.""" + update_successful, grad_norm, num_zeros_in_grad = super().step() + + # All gather updated params. + self.broadcast_params() + + return update_successful, grad_norm, num_zeros_in_grad + + def save_state_dict_to_file(self, filename: str) -> None: + """Save the parameter state of the optimizer. + + Args: + filename: The filename to save the parameter state. + """ + torch.save(super().state_dict(), filename) + + def load_state_dict_from_file(self, filename: str) -> None: + """Load the parameter state of the optimizer.""" + super().load_state_dict(torch.load(filename)) + + diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py new file mode 100644 index 00000000000..d2dc7533bf9 --- /dev/null +++ b/megatron/core/optimizer/muon.py @@ -0,0 +1,307 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +"""Megatron muon optimizer wrapper to handle tensor-parallel.""" + +import logging +from functools import partial +from typing import Callable, List, Literal, Optional + +import torch +from torch.optim.optimizer import ParamsT + +from megatron.core import parallel_state +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.module import MegatronModule +from megatron.core.utils import get_pg_size, log_single_rank + +from . import _get_param_groups, get_megatron_optimizer +from .layer_wise_optimizer import LayerWiseDistributedOptimizer +from .optimizer import ( + ChainedOptimizer, + Float16OptimizerWithFloat16Params, + FP32Optimizer, + MegatronOptimizer, +) +from .optimizer_config import OptimizerConfig + +try: + from emerging_optimizers.orthogonalized_optimizers import ( + OrthogonalizedOptimizer, + get_muon_scale_factor, + ) + from emerging_optimizers.orthogonalized_optimizers.muon_utils import newton_schulz_tp + + HAVE_EMERGING_OPTIMIZERS = True +except ImportError: + HAVE_EMERGING_OPTIMIZERS = False + OrthogonalizedOptimizer = object + + +logger = logging.getLogger(__name__) + + +class TensorParallelMuon(OrthogonalizedOptimizer): + """Tensor Parallel Muon optimizer.""" + + def __init__( + self, + params: ParamsT, + lr: float = 3e-4, + momentum_beta: float = 0.95, + use_nesterov: bool = True, + weight_decay: float = 0.01, + use_decoupled_weight_decay: bool = True, + split_qkv: bool = False, + is_qkv_fn: Callable[[torch.Tensor], bool] | None = None, + qkv_split_shapes: tuple[int, int, int] | None = None, + fp32_matmul_prec: str = "medium", + coefficient_type: str = "quintic", + num_ns_steps: int = 5, + scale_mode: str = "spectral", + extra_scale_factor: float = 1.0, + pg_collection: Optional[ProcessGroupCollection] = None, + mode: Literal["blockwise", "duplicated", "distributed"] = "duplicated", + ) -> None: + if num_ns_steps < 1: + raise ValueError(f"num_ns_steps must be at least 1, got {num_ns_steps}") + + orthogonalize_fn = partial( + newton_schulz_tp, + steps=num_ns_steps, + coefficient_type=coefficient_type, + mode="duplicated" if mode == "blockwise" else mode, + ) + scale_factor_fn = partial( + get_muon_scale_factor, mode=scale_mode, extra_scale_factor=extra_scale_factor + ) + + def orthogonalize_fn_tp( + x: torch.Tensor, + tp_group: torch.distributed.ProcessGroup, + partition_dim: int | None = None, + ) -> torch.Tensor: + return orthogonalize_fn(x, tp_group=tp_group, partition_dim=partition_dim) + + def scale_factor_fn_tp( + size_out: int, size_in: int, partition_dim: int | None = None + ) -> float: + if partition_dim is None: + return scale_factor_fn(size_out, size_in) + + size = [size_out, size_in] + size[partition_dim] *= get_pg_size(pg_collection.tp) if pg_collection else 1 + return scale_factor_fn(*size) + + self.pg_collection = pg_collection + self.mode = mode + + super().__init__( + params, + lr, + momentum_beta, + use_nesterov, + weight_decay, + use_decoupled_weight_decay, + split_qkv, + is_qkv_fn, + qkv_split_shapes, + fp32_matmul_prec, + orthogonalize_fn_tp, + scale_factor_fn_tp, + ) + + def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor: + """Orthogonalize the momentum. + + Args: + p: The parameter tensor. i is necessary to pass param tensor in addition to momentum + because a lot of information is only available in the param tensor, + attributes for example. + grad: The momentum tensor. + + Returns: + The orthogonalized gradient tensor. + """ + if self.pg_collection: + tp_group = ( + self.pg_collection.expt_tp + if getattr(p, 'expert_tp', False) + else self.pg_collection.tp + ) + else: + tp_group = None + partition_dim = None if self.mode == "blockwise" else getattr(p, "partition_dim", None) + if partition_dim == -1: + # llm-shower use different default value for partition_dim than TE. + # Because -1 is a valid index for ndarray, we decided to not overload it. + partition_dim = None + if self.split_qkv and self.is_qkv_fn(p): # type: ignore[misc] + # split grouped attention parameters (e.g., QKV, GQA, etc.) + qkv_grads = torch.split(grad, self.qkv_split_shapes, dim=0) + + # Apply Newton-Schulz to each component + qkv_whitened = [ + self.orthogonalize_fn(g, tp_group=tp_group, partition_dim=partition_dim) + for g in qkv_grads + ] + qkv_scales = [ + self.scale_factor_fn(g.size(0), g.size(1), partition_dim) for g in qkv_grads + ] + + # Apply individual scales to each component and concatenate + grad = torch.cat( + [whitened * scale for whitened, scale in zip(qkv_whitened, qkv_scales)] + ) + else: + grad = self.orthogonalize_fn( + grad, tp_group=tp_group, partition_dim=partition_dim + ) * self.scale_factor_fn(grad.size(0), grad.size(1), partition_dim) + return grad + + +def get_megatron_muon_optimizer( + config: OptimizerConfig, + model_chunks: List[MegatronModule], + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + use_gloo_process_groups: bool = True, + layer_wise_distributed_optimizer: bool = False, + pg_collection: Optional[ProcessGroupCollection] = None, +) -> MegatronOptimizer: + """This function is used to get the muon optimizer for the model chunks. + It is used to get the muon optimizer for the model chunks. + + Args: + config (OptimizerConfig): optimizer configuration object. + model_chunks (List[MegatronModule]): model chunks to get optimizer for. + no_weight_decay_cond (func, optional): function to determine whether a parameter + should not perform weight decay. Defaults to None. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. Defaults to None. + lr_mult (float, optional): learning rate multiplier for parameters that + satisfy scale_lr_cond. Defaults to 1.0. + use_gloo_process_groups (bool): if false, disable use of Gloo process groups + in underlying Megatron optimizers. + layer_wise_distributed_optimizer (bool): if true, use layer-wise distributed optimizer. + Defaults to False. + """ + assert HAVE_EMERGING_OPTIMIZERS, "Emerging Optimizers is not installed." + + # dist-optim is not supported due to strong coupling with how DDP init grad buffer + # in thoery we can put some weight to use non-dist-muon and rest to dist-adam + # but there are strong dependency and assumption in DDP that prevent it + if config.use_distributed_optimizer: + raise Exception('muon with dist optimizer is not supported.') + + # before this function receive properly created collection + if pg_collection is None: + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True) + pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group() + + log_single_rank(logger, logging.INFO, f'Setting up emerging optimizer with config {config}') + + optimizers = [] + # record list of non/linear params + linear_params = [] + nonlinear_params = [] + for model_chunk in model_chunks: + for name, param in model_chunk.named_parameters(): + if not param.requires_grad: + continue + # add flag for expert weight so optimizer can figure which tp group it uses + # alternatively, create new param group and save tp_group. this require more + # change in optimizer + if 'experts' in name and 'shared' not in name: + param.expert_tp = True + # TODO(deyuf): might not be sufficient for future algorithm. revisit this conditioning + if not getattr(param, 'is_embedding_or_output_parameter', False) and not ( + len(param.shape) == 1 + ): + linear_params.append(param) + else: + nonlinear_params.append(param) + + # freezing nonlinear params and get param groups for muon + for param in nonlinear_params: + param.requires_grad = False + + linear_param_groups = _get_param_groups( + model_chunks, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, + lr=config.lr, + min_lr=config.min_lr, + decoupled_lr=config.decoupled_lr, + decoupled_min_lr=config.decoupled_min_lr, + ) + + # TODO(deyuf): support qkv split + optimizer = TensorParallelMuon( + linear_param_groups, + lr=config.lr, + momentum_beta=config.muon_momentum, + use_nesterov=config.muon_use_nesterov, + weight_decay=config.weight_decay, + fp32_matmul_prec=config.muon_fp32_matmul_prec, + num_ns_steps=config.muon_num_ns_steps, + scale_mode=config.muon_scale_mode, + split_qkv=False, + qkv_split_shapes=None, + extra_scale_factor=config.muon_extra_scale_factor, + pg_collection=pg_collection, + mode=config.muon_tp_mode, + ) + + # set config here to: + # 1. get adam for rest of layer + # 2. avoid ChainedOptimizer check fail that assert all optimizers are same kind + # side effect is muon optimizer will have wrong name str, i.e. config.optimizer == 'adam' + # TODO(deyuf): allow user to select optimizer mix and relax ChainedOptimizer design + config.optimizer = 'adam' + + # need to wrap into megatron mix precision optimizer. (only support bf16 w/o loss scale now) + if config.fp16: + raise Exception('muon with fp16 is not supported.') + reset_config_bf16 = False + if config.bf16: + if layer_wise_distributed_optimizer: + # creating master weight before layerwise sharding will lead to unnecessary master + # weight so here we delay master weight creation into layer_wise unset config.bf16 + # will also result in all optimizers below(adam) to also not be wrapped + config.bf16 = False + reset_config_bf16 = True + else: + # if not using layer_wise wrapper, just create master weight here is fine + optimizer = Float16OptimizerWithFloat16Params(optimizer, config, None, None) + else: + optimizer = FP32Optimizer(optimizer, config, None) + + optimizers.append(optimizer) + + # done with muon, unfreeze nonlinear and freeze linear + for param in nonlinear_params: + param.requires_grad = True + for param in linear_params: + param.requires_grad = False + + # call original get. linear params will be skipped since they're freezed + chained_adam = get_megatron_optimizer( + config, model_chunks, no_weight_decay_cond, scale_lr_cond, lr_mult, use_gloo_process_groups + ) + + # unfreeze everything + for param in linear_params: + param.requires_grad = True + + # chain everything together + optimizers += chained_adam.chained_optimizers + + if layer_wise_distributed_optimizer: + log_single_rank(logger, logging.INFO, 'Using LayerWiseDistributedOptimizer for Muon') + if reset_config_bf16: + config.bf16 = True + return LayerWiseDistributedOptimizer(optimizers, config, pg_collection) + return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 8151d5e9de1..65e1fd6a71f 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -16,7 +16,7 @@ class OptimizerConfig: # General ############## optimizer: str = 'adam' - """Optimizer to use (one of Adam or SGD).""" + """Optimizer to use (one of Adam, SGD, or Muon).""" lr: Optional[float] = None """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each @@ -124,6 +124,29 @@ class OptimizerConfig: sgd_momentum: float = 0.9 """Momentum factor for SGD optimizer.""" + # Muon + muon_momentum: float = 0.95 + """The momentum used by the internal SGD.""" + + muon_use_nesterov: bool = True + """Whether to use Nesterov-style momentum in the internal SGD.""" + + muon_scale_mode: str = "spectral" + """The mode to use for the scale factor. Defaults to "spectral".""" + + muon_fp32_matmul_prec: str = "medium" + """The precision to use for the fp32 matmul. Defaults to "medium".""" + + muon_num_ns_steps: int = 5 + """The number of iteration steps to use in the Newton-Schulz iteration.""" + + muon_tp_mode: str = "blockwise" + """How to perform NS calculation for tensor parallel weights. Defaults to "blockwise".""" + + muon_extra_scale_factor: float = 1.0 + """Additional scale factor for the muon update.""" + + ####################### # Distributed optimizer ####################### diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index e6e65425b23..773c61597bc 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -56,6 +56,7 @@ HAVE_TE = False _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { + "expert_tp": False, "tensor_model_parallel": False, "partition_dim": -1, "partition_stride": 1, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 11fa9ad2d58..dc33a639e8d 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1121,6 +1121,13 @@ def validate_args(args, defaults={}): args.no_load_rng = True print('Warning: disabling --no-load-rng for upcycling.') + # Muon optimizercheck + if 'muon' in args.optimizer: + assert not args.use_distributed_optimizer, "Muon optimizer does not support distributed optimizer for now." + assert not args.use_torch_fsdp2, "Muon optimizer does not support Torch-FSDP2 for now." + assert not args.use_megatron_fsdp, "Muon optimizer does not support Megatron-FSDP for now." + assert args.ckpt_format == "torch", "Muon optimizer only supports torch checkpoint format for now." + # Optimizer CPU offload check if args.optimizer_cpu_offload: assert args.use_precision_aware_optimizer, ( @@ -1866,6 +1873,25 @@ def _add_regularization_args(parser): 'numerical stability') group.add_argument('--sgd-momentum', type=float, default=0.9, help='Momentum factor for sgd') + group.add_argument('--muon-momentum', type=float, default=0.95, + help='Momentum factor for Muon optimizer') + group.add_argument('--muon-no-use-nesterov', action='store_false', default=True, + dest='muon_use_nesterov', + help='Whether to use Nesterov-style momentum in the internal SGD') + group.add_argument('--muon-scale-mode', type=str, default='spectral', + choices=['spectral', 'unit_rms_norm', 'shape_scaling'], + help='Scale mode for Muon optimizer') + group.add_argument('--muon-fp32-matmul-prec', type=str, default='medium', + choices=['low', 'medium', 'high'], + help='FP32 matmul precision for Newton-Schulz iteration') + group.add_argument('--muon-num-ns-steps', type=int, default=5, + help='Number of Newton-Schulz steps for Muon optimizer') + group.add_argument('--muon-tp-mode', type=str, default='blockwise', + choices=['blockwise', 'duplicated', 'distributed'], + help='How to perform NS calculation for tensor model parallel weights') + group.add_argument('--muon-extra-scale-factor', type=float, default=1.0, + help='Additional scale factor for the muon update') + return parser @@ -2152,7 +2178,7 @@ def _add_training_args(parser): help='Enable bias only in the QKV linear layers', dest='add_qkv_bias') group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'sgd'], + choices=['adam', 'sgd', 'muon', 'dist_muon'], help='Optimizer function') group.add_argument('--optimizer-cpu-offload', action='store_true', help='Offload optimizer state to CPU') diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 4302b3fa8fd..deff728aa23 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -486,6 +486,14 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati if not optimizer.is_stub_optimizer: optimizer.save_parameter_state(optim_checkpoint_name) + # LayerWiseDistributedOptimizer save + if getattr(args, "optimizer", "adam").startswith("dist_"): + dp_rank = mpu.get_data_parallel_rank() + optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt") + ensure_directory_exists(optim_checkpoint_name) + if not optimizer.is_stub_optimizer: + optimizer.save_state_dict_to_file(optim_checkpoint_name) + async_save_request = None if args.async_save: if ckpt_type == CheckpointType.LEGACY: @@ -1655,7 +1663,11 @@ def load_model_state_dict(module, state_dict, strict: bool): if not release and not args.finetune and not args.no_load_optim: try: # Load state dict. - if not skip_load_to_model_and_opt and optimizer is not None and not optimizer.is_stub_optimizer: + if getattr(args, "optimizer", "adam").startswith("dist_"): + dp_rank = mpu.get_data_parallel_rank() + optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt") + optimizer.load_state_dict_from_file(optim_checkpoint_name) + elif not skip_load_to_model_and_opt and optimizer is not None and not optimizer.is_stub_optimizer: optimizer.load_state_dict(state_dict['optimizer']) # Load distributed optimizer's custom parameter state. diff --git a/megatron/training/training.py b/megatron/training/training.py index 23a6ba6170f..bc5fefa86ba 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -75,6 +75,7 @@ from megatron.core.distributed import finalize_model_grads from megatron.core.enums import ModelType from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig +from megatron.core.optimizer.muon import get_megatron_muon_optimizer from megatron.core.rerun_state_machine import ( get_rerun_state_machine, destroy_rerun_state_machine, @@ -1090,17 +1091,30 @@ def setup_model_and_optimizer( kwargs[f.name] = getattr(args, f.name) config = OptimizerConfig(**kwargs) config.timers = timers - optimizer = get_megatron_optimizer( - config, - model, - no_wd_decay_cond, - scale_lr_cond, - lr_mult, - use_gloo_process_groups=args.enable_gloo_process_groups, - # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings - # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 - default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, - ) + + if 'muon' not in config.optimizer: + optimizer = get_megatron_optimizer( + config, + model, + no_wd_decay_cond, + scale_lr_cond, + lr_mult, + use_gloo_process_groups=args.enable_gloo_process_groups, + # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings + # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 + default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, + ) + else: + optimizer = get_megatron_muon_optimizer( + config, + model, + no_wd_decay_cond, + scale_lr_cond, + lr_mult, + use_gloo_process_groups=args.enable_gloo_process_groups, + layer_wise_distributed_optimizer='dist' in config.optimizer, + ) + opt_param_scheduler = get_optimizer_param_scheduler(optimizer) one_logger and one_logger.log_metrics({"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()}) diff --git a/pyproject.toml b/pyproject.toml index 71e87bc8b83..3362a0181c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -139,6 +139,7 @@ linting = [ ] ci = ["python-gitlab", "slack-sdk", "pandas"] flash_mla = ["flash_mla"] +emerging_optimizers = ["emerging_optimizers"] [tool.uv] default-groups = ["linting", "build", "test"] @@ -165,7 +166,9 @@ override-dependencies = [ flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] + # transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "0289e76380088358a584d809faf69effab1a7cda" } # on `release_v2.7 +emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev= "fb1add873e7851ec34b48581ea1b15761b73d189"} [tool.isort] profile = "black" # black-compatible diff --git a/tests/unit_tests/test_muon_optimizer.py b/tests/unit_tests/test_muon_optimizer.py new file mode 100644 index 00000000000..d5dffcd0e19 --- /dev/null +++ b/tests/unit_tests/test_muon_optimizer.py @@ -0,0 +1,245 @@ +import os +import pytest + +from packaging.version import Version + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig +from megatron.core.optimizer.muon import get_megatron_muon_optimizer, TensorParallelMuon +from megatron.core.transformer import TransformerConfig +from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(80, 48) + self.fc2 = nn.Linear(48, 10) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return x + + +@pytest.mark.skipif( + Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), + reason="Skip muon optimizer for LTS test", +) +def test_muon_optimizer_smoke(): + """Smoke test for TensorParallelMuon optimizer.""" + # Create a simple linear model for testing + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + # Create TensorParallelMuon optimizer + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + momentum_beta=0.95, + use_nesterov=True, + weight_decay=0.01, + use_decoupled_weight_decay=True, + split_qkv=False, + fp32_matmul_prec="medium", + num_ns_steps=5, + scale_mode="spectral", + extra_scale_factor=1.0, + pg_collection=None, + mode="duplicated", + ) + + # Test basic properties + assert optimizer is not None, "Optimizer should not be None" + assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups" + assert len(optimizer.param_groups) > 0, "Optimizer should have at least one parameter group" + + # Test forward and backward pass + input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + # Store original weight + original_weight = model.weight.data.clone() + + # Test optimizer step + optimizer.step() + + # Verify weight was updated + assert not torch.equal( + model.weight.data, original_weight + ), "Weight should be updated after optimizer step" + + # Test zero_grad + optimizer.zero_grad() + assert model.weight.grad is None or torch.all( + model.weight.grad == 0 + ), "Gradients should be zeroed" + + # Test state_dict and load_state_dict + state_dict = optimizer.state_dict() + assert 'state' in state_dict, "State dict should contain state" + assert 'param_groups' in state_dict, "State dict should contain param_groups" + + # Load state dict should not raise error + optimizer.load_state_dict(state_dict) + + +@pytest.mark.skipif( + Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), + reason="Skip muon optimizer for LTS test", +) +def test_get_megatron_muon_optimizer_smoke(): + """Smoke test for get_megatron_muon_optimizer function.""" + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + + # Setup: distributed, model + _init_distributed(world, rank) + Utils.initialize_model_parallel() + + # Create a model with both linear and non-linear parameters + model = Net().bfloat16().cuda() + model.requires_grad_(True) + + # Wrap in DDP (required for Megatron optimizer) + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + # Ensure all parameters require gradients + for param in model.parameters(): + assert param.requires_grad, "All parameters should require gradients" + + # Create optimizer config for Muon + optimizer_config = OptimizerConfig( + optimizer='muon', # This will be changed internally to 'adam' for non-linear params + lr=0.01, + weight_decay=0.01, + bf16=True, + use_distributed_optimizer=False, # Muon doesn't support distributed optimizer + muon_momentum=0.95, + muon_use_nesterov=True, + muon_fp32_matmul_prec="medium", + muon_num_ns_steps=5, + muon_scale_mode="spectral", + muon_tp_mode="duplicated", + ) + + # Test creating the optimizer + optimizer = get_megatron_muon_optimizer( + config=optimizer_config, + model_chunks=[model], + use_gloo_process_groups=True, + layer_wise_distributed_optimizer=False, + ) + + # Test basic properties + assert optimizer is not None, "Optimizer should not be None" + assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups" + assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer" + assert len(optimizer.chained_optimizers) >= 1, "Should have at least one chained optimizer" + + # Test forward and backward pass + input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + # Store original parameters + original_params = {} + for name, param in model.named_parameters(): + original_params[name] = param.data.clone() + + # Test optimizer step + optimizer.step() + + # Verify at least some parameters were updated + params_updated = 0 + for name, param in model.named_parameters(): + if not torch.equal(param.data, original_params[name]): + params_updated += 1 + + assert params_updated > 0, "At least some parameters should be updated after optimizer step" + + # Test zero_grad + optimizer.zero_grad() + for param in model.parameters(): + assert param.grad is None or torch.all( + param.grad == 0 + ), f"Gradients should be zeroed for all parameters" + + # Test state_dict and load_state_dict + state_dict = optimizer.state_dict() + assert isinstance(state_dict, list), "State dict should be a list" + + # Load state dict should not raise error + optimizer.load_state_dict(state_dict) + + _deinit_distributed() + + +@pytest.mark.skipif( + Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), + reason="Skip muon optimizer for LTS test", +) +def test_get_megatron_muon_optimizer_validation(): + """Test validation logic for get_megatron_muon_optimizer.""" + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + + # Setup: distributed, model + _init_distributed(world, rank) + Utils.initialize_model_parallel() + + # Create a simple model + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.bfloat16, device='cuda') + model.requires_grad_(True) + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + # Test 1: Distributed optimizer should raise exception + optimizer_config_dist = OptimizerConfig( + optimizer='muon', + lr=0.01, + bf16=True, + use_distributed_optimizer=True, # This should cause an exception + ) + + with pytest.raises(Exception, match='muon with dist optimizer is not supported'): + get_megatron_muon_optimizer(config=optimizer_config_dist, model_chunks=[model]) + + # Test 2: FP16 should raise exception + optimizer_config_fp16 = OptimizerConfig( + optimizer='muon', + lr=0.01, + fp16=True, # This should cause an exception + use_distributed_optimizer=False, + ) + + with pytest.raises(Exception, match='muon with fp16 is not supported'): + get_megatron_muon_optimizer(config=optimizer_config_fp16, model_chunks=[model]) + + # Test 3: Invalid num_ns_steps should raise exception + optimizer_config_invalid_ns = OptimizerConfig( + optimizer='muon', + lr=0.01, + bf16=True, + use_distributed_optimizer=False, + muon_num_ns_steps=0, # This should cause an exception + ) + + with pytest.raises(ValueError, match='num_ns_steps must be at least 1'): + get_megatron_muon_optimizer(config=optimizer_config_invalid_ns, model_chunks=[model]) + + _deinit_distributed() diff --git a/uv.lock b/uv.lock index 6a674513f11..84da2bd685a 100644 --- a/uv.lock +++ b/uv.lock @@ -1181,6 +1181,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359, upload-time = "2025-02-09T03:17:01.998Z" }, ] +[[package]] +name = "emerging-optimizers" +version = "0.1.0" +source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189#fb1add873e7851ec34b48581ea1b15761b73d189" } +dependencies = [ + { name = "absl-py" }, + { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -2227,6 +2237,9 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] +emerging-optimizers = [ + { name = "emerging-optimizers" }, +] flash-mla = [ { name = "flash-mla" }, ] @@ -2314,6 +2327,7 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] +emerging-optimizers = [{ name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189" }] flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }] linting = [ { name = "black", specifier = "==24.4.2" }, From 4bdc4e279c43b58bbbb61cdcbe533d4f5d7c4b45 Mon Sep 17 00:00:00 2001 From: Santosh Bhavani Date: Sat, 11 Oct 2025 07:21:56 -0700 Subject: [PATCH 012/248] ADLR/megatron-lm!4060 - Update dev branch README Co-authored-by: Santosh Bhavani --- README.md | 460 ++++-------------------------------------------------- 1 file changed, 32 insertions(+), 428 deletions(-) diff --git a/README.md b/README.md index 85f21a4322e..6765569370b 100644 --- a/README.md +++ b/README.md @@ -10,461 +10,65 @@ Megatron-LM & Megatron Core
-## ⚡ Quick Start +> ## 🚨 **DEVELOPMENT BRANCH** +> ⚠️ **EXPERIMENTAL FEATURES** - This is the **dev branch** with experimental features. +> +> **→ For releases and comprehensive documentation, visit the [main branch](https://github.com/NVIDIA/Megatron-LM)** -```bash -# 1. Install Megatron Core with required dependencies -pip install megatron-core -pip install --no-build-isolation transformer-engine[pytorch] +## ⚡ Quickstart -# 2. Clone repository for examples -git clone https://github.com/NVIDIA/Megatron-LM.git +```bash +# Clone the dev branch +git clone -b dev https://github.com/NVIDIA/Megatron-LM.git cd Megatron-LM -``` - -**→ [Complete Installation Guide](#installation)** - Docker, pip variants (dev,lts,etc.), source installation, and system requirements - -# Latest News - -- 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models. -- 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements. -- 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core. -- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools. -- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)). - -
-Previous News - -- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)). -- **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba). -- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#Megatron Core) for more details. -
+# Install from source with dev dependencies (includes transformer_engine) +pip install -e .[mlm,dev] +```
Table of Contents **Getting Started** -- [Quick Start](#-quick-start) -- [Latest News](#latest-news) -- [Megatron Overview](#megatron-overview) - - [Project Structure](#project-structure) - - [Megatron-LM: Reference Implementation](#megatron-lm-reference-implementation) - - [Megatron Core: Production Library](#megatron-core-production-library) -- [Installation](#installation) - - [Docker (Recommended)](#-docker-recommended) - - [Pip Installation](#-pip-installation) - - [Source Installation](#-source-installation) - - [System Requirements](#system-requirements) - -**Core Features** -- [Performance Benchmarking](#performance-benchmarking) - - [Weak Scaling Results](#weak-scaling-results) - - [Strong Scaling Results](#strong-scaling-results) -- [Ecosystem Libraries](#ecosystem-libraries) - -**Training** -- [Training](#training) - - [Getting Started](#getting-started) - - [Data Preparation](#data-preparation) -- [Parallelism Strategies](#parallelism-strategies) - - [Data Parallelism (DP)](#data-parallelism-dp) - - [Tensor Parallelism (TP)](#tensor-parallelism-tp) - - [Pipeline Parallelism (PP)](#pipeline-parallelism-pp) - - [Context Parallelism (CP)](#context-parallelism-cp) - - [Expert Parallelism (EP)](#expert-parallelism-ep) - - [Parallelism Selection Guide](#parallelism-selection-guide) -- [Performance Optimizations](#performance-optimizations) +- [⚡ Quick Start](#-quick-start) +- [🧠 Dev Branch Philosophy](#-dev-branch-philosophy) +- [📊 Performance & Benchmarking](#-performance--benchmarking) +- [👥 Community & Support](#-community--support) -**Resources** -- [Examples](./examples/) - Training scripts and tutorials -- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs -- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking -- [Community & Support](#-community--support) - Get help and contribute - - [Getting Help](#getting-help) - - [Contributing](#contributing) - - [Citation](#citation) +**For Complete Documentation** → [Main Branch](https://github.com/NVIDIA/Megatron-LM) | [Official Docs](https://docs.nvidia.com/Megatron-Core/)
-# Megatron Overview - -## Project Structure -``` -Megatron-LM/ -├── megatron/ -│ ├── core/ # Megatron Core (kernels, parallelism, building blocks) -│ │ ├── models/ # Transformer models -│ │ ├── transformer/ # Transformer building blocks -│ │ ├── tensor_parallel/ # Tensor parallelism -│ │ ├── pipeline_parallel/ # Pipeline parallelism -│ │ ├── distributed/ # Distributed training (FSDP, DDP) -│ │ ├── optimizer/ # Optimizers -│ │ ├── datasets/ # Dataset loaders -│ │ ├── inference/ # Inference engines -│ │ └── export/ # Model export (e.g. TensorRT-LLM) -│ ├── training/ # Training scripts -│ ├── inference/ # Inference server -│ ├── legacy/ # Legacy components -│ └── post_training/ # Post-training (RLHF, etc.) -├── examples/ # Ready-to-use training examples -├── tools/ # Utility tools -├── tests/ # Comprehensive test suite -└── docs/ # Documentation -``` - -### Megatron-LM: Reference Implementation -**Reference implementation** that includes Megatron Core plus everything needed to train models. - -**Best for:** -- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware -- **Research teams** exploring new architectures and training techniques -- **Learning distributed training** concepts and best practices -- **Quick experimentation** with proven model configurations - -**What you get:** -- Pre-configured training scripts for GPT, LLama, DeepSeek, Qwen, and more. -- End-to-end examples from data prep to evaluation -- Research-focused tools and utilities - -### Megatron Core: Composable Library -**Composable library** with GPU-optimized building blocks for custom training frameworks. - -**Best for:** -- **Framework developers** building on top of modular and optimized components -- **Research teams** needing custom training loops, optimizers, or data pipelines -- **ML engineers** requiring fault-tolerant training pipelines - -**What you get:** -- Composable transformer building blocks (attention, MLP, etc.) -- Advanced parallelism strategies (TP, PP, DP, EP, CP) -- Pipeline schedules and distributed optimizers -- Mixed precision support (FP16, BF16, FP8) -- GPU-optimized kernels and memory management -- High-performance dataloaders and dataset utilities -- Model architectures (LLaMA, Qwen, GPT, Mixtral, Mamba, etc.) - -## Ecosystem Libraries - -**Libraries used by Megatron Core:** - -- **[Megatron Energon](https://github.com/NVIDIA/Megatron-Energon)** 📣 **NEW!** - Multi-modal data loader (text, images, video, audio) with distributed loading and dataset blending -- **[Transformer Engine](https://github.com/NVIDIA/TransformerEngine)** - Optimized kernels and FP8 mixed precision support -- **[Resiliency Extension (NVRx)](https://github.com/NVIDIA/nvidia-resiliency-ext)** - Fault tolerant training with failure detection and recovery - -**Libraries using Megatron Core:** - -- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes -- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods -- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples -- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation - -**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed) - -# Installation - -## 🐳 Docker (Recommended) - -We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability. - -This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs: - -- PyTorch (latest stable version) -- CUDA, cuDNN, NCCL (latest stable versions) -- Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs -- For best performance, use NVIDIA Turing GPU architecture generations and later - -```bash -# Run container with mounted directories -docker run --runtime --nvidia --gpus all -it --rm \ - -v /path/to/megatron:/workspace/megatron \ - -v /path/to/dataset:/workspace/dataset \ - -v /path/to/checkpoints:/workspace/checkpoints \ - nvcr.io/nvidia/pytorch:25.04-py3 -``` - -## Pip Installation - -Megatron Core offers support for two NGC PyTorch containers: - -- `dev`: Moving head that supports the most recent upstream dependencies -- `lts`: Long-term support of NGC PyTorch 24.01 - -Both containers can be combined with `mlm` which adds package dependencies for Megatron-LM on top of Megatron Core. - -```bash -# Install the latest release with minimal dependencies (no Transformer Engine) -pip install megatron-core[dev] -``` - -```bash -# Install packages for LTS support NGC PyTorch 24.01 -pip install megatron-core[lts] -``` - -For a version of Megatron Core with only torch, run: - -```bash -pip install megatron-core -``` - -For dependencies required by Megatron-LM, please run: - -```bash -pip install megatron-core[mlm] -``` - -## Source Installation - -For development or latest features: - -For Hybrid models, Megatron Core requires [mamba](https://github.com/state-spaces/mamba). If the pre-built wheel in PyPI does not fit your environment, you can fall back to an install script Megatron Core uses in its CI system. For this, please install `uv` first: - -```bash -export UV_VERSION=0.7.2 -export PATH="$HOME/.local/bin:$PATH" -curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh -export UV_PROJECT_ENVIRONMENT=./venv -export PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" -export UV_LINK_MODE=copy -``` - -Run the following command to build upstream dependencies from source: - -```bash -# Clone and install -git clone https://github.com/NVIDIA/Megatron-LM.git -cd Megatron-LM - -# Optional: checkout specific release -git checkout core_r0.13.0 - -bash docker/common/install.sh --environment {dev,lts} -``` - -## System Requirements -### Hardware Requirements -- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs -- **Recommended**: NVIDIA Turing architecture or later -### Software Requirements -- **CUDA/cuDNN/NCCL**: Latest stable versions -- **PyTorch**: Latest stable version -- **Transformer Engine**: Latest stable version -- **Python**: 3.12 recommended -# Performance Benchmarking -For our latest performance benchmarking results, please refer to [NVIDIA NeMo Framework Performance Summary](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html). -Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters. +## Dev Branch Philosophy -![Model table](images/model_table.png) - -**Benchmark Configuration:** -- **Vocabulary size**: 131,072 tokens -- **Sequence length**: 4096 tokens -- **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts -- **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default) - -**Key Results:** -- **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training -- **Superlinear scaling**: MFU increases from 41% to 47-48% with model size -- **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging) -- **Production ready**: Full training pipeline with checkpointing and fault tolerance -- *Note: Performance results measured without training to convergence* - -## Weak Scaling Results -Our weak scaled results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute. - -![Weak scaling](images/weak_scaling.png) - -## Strong Scaling Results -We also strong scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%. - -![Strong scaling](images/strong_scaling.png) - -# Training - -## Getting Started - -### Simple Training Example -```bash -# Distributed training example (2 GPUs, mock data) -torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py -``` - -### LLama-3 Training Example -```bash -# 8 GPUs, FP8 precision, mock data -./examples/llama/train_llama3_8b_fp8.sh -``` - -## Data Preparation - -### JSONL Data Format -```json -{"text": "Your training text here..."} -{"text": "Another training sample..."} -``` - -### Basic Preprocessing -```bash -python tools/preprocess_data.py \ - --input data.jsonl \ - --output-prefix processed_data \ - --tokenizer-type HuggingFaceTokenizer \ - --tokenizer-model /path/to/tokenizer.model \ - --workers 8 \ - --append-eod -``` - -### Key Arguments -- `--input`: Path to input JSON/JSONL file -- `--output-prefix`: Prefix for output binary files (.bin and .idx) -- `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.) -- `--tokenizer-model`: Path to tokenizer model file -- `--workers`: Number of parallel workers for processing -- `--append-eod`: Add end-of-document token - - - -# Parallelism Strategies - -## Data Parallelism (DP) - -### Standard Data Parallel -```bash -# Standard DDP - replicate model on each GPU -torchrun --nproc_per_node=8 pretrain_gpt.py \ - --data-parallel-sharding-strategy no_shard -``` - -### Fully Sharded Data Parallel (FSDP) -```bash -# Megatron's optimized FSDP (~15% faster than PyTorch FSDP2) ---use-custom-fsdp - -# PyTorch FSDP2 ---use-torch-fsdp2 - -# Sharding strategies ---data-parallel-sharding-strategy optim # Shard optimizer states (ZeRO-1) ---data-parallel-sharding-strategy optim_grads # Shard gradients + optimizer (ZeRO-2) ---data-parallel-sharding-strategy optim_grads_params # Shard parameters + gradients + optimizer (ZeRO-3) -``` - -## Tensor Parallelism (TP) -Split individual model layers across GPUs: -```bash ---tensor-model-parallel-size 4 # 4-way tensor parallelism ---sequence-parallel # Enable sequence parallelism (recommended with TP) -``` - -## Pipeline Parallelism (PP) -Split model depth across GPUs: -```bash ---pipeline-model-parallel-size 8 # 8 pipeline stages ---virtual-pipeline-model-parallel-size 4 # Virtual pipeline for better load balancing -``` - -## Context Parallelism (CP) -Split long sequences across GPUs for handling long contexts: -```bash ---context-parallel-size 2 # 2-way context parallelism ---cp-comm-type p2p # Communication: p2p, a2a, allgather, a2a+p2p ---hierarchical-context-parallel-sizes 2 4 # Hierarchical context parallelism -``` - -## Expert Parallelism (EP) -For Mixture of Experts (MoE) models: -```bash ---expert-model-parallel-size 4 # 4-way expert parallelism ---num-experts 8 # 8 experts per MoE layer ---moe-grouped-gemm # Optimize expert computation -``` - -## Combining Parallelism Strategies - -### Parallelism Selection Guide - -Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance/recommended_model_configs): - -| Model | Size | GPUs | TP | PP | CP | EP | Notes | -|-------|------|------|----|----|----|----|-------| -| **LLama-3** | 8B | 8 | 1 | 1 | 2 | 1 | CP for long seqlen (8K) | -| **LLama-3** | 70B | 64 | 4 | 4 | 2 | 1 | TP+PP | -| **LLama-3.1** | 405B | 1024 | 8 | 8 | 2 | 1 | 3D parallelism for scale | -| **GPT-3** | 175B | 128-512 | 4 | 8 | 1 | 1 | Large model config | -| **Mixtral** | 8x7B | 64 | 1 | 4 | 1 | 8 | EP for MoE | -| **Mixtral** | 8x22B | 256 | 4 | 4 | 8 | 8 | Combined TP+EP for large MoE | -| **DeepSeek-V3** | 671B | 1024 | 2 | 16 | 1 | 64 | Large MoE config | - -### MoE-Specific Requirements - -**Important**: When combining Expert Parallelism (EP) with Tensor Parallelism (TP), **Sequence Parallelism (SP) must be enabled**. - -## Performance Optimizations - -| Feature | Flag | Benefit | -|---------|------|---------| -| **FlashAttention** | `--attention-backend` | Faster attention and lower memory usage | -| **FP8 Training** | `--fp8-hybrid` | Faster training | -| **Activation Checkpointing** | `--recompute-activations` | Reduced memory usage | -| **Data Parallelism Communication Overlap** | `--overlap-grad-reduce` | Faster distributed training | -| **Distributed Optimizer** | `--use-distributed-optimizer` | Reduced checkpointing time | - -**→ [NVIDIA NeMo Framework Performance Tuning Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#performance-tuning-guide)** - Comprehensive performance optimization guide covering advanced tuning techniques, communication overlaps, memory optimizations, and profiling options. - -### FlashAttention -[FlashAttention](https://github.com/Dao-AILab/flash-attention) is a fast and memory-efficient attention algorithm. We recommend the default usage, which uses cuDNN for attention via Transformer Engine and provides up to 50% speedups on forward and 84% on backward propagation with FP8 kernels. The `flash-attn` package is also supported via `--use-flash-attn`. - -### Mixed Precision Training -```bash ---fp16 # Standard FP16 ---bf16 # BFloat16 (recommended for large models) ---fp8-hybrid # FP8 training (Hopper, Ada, and Blackwell GPUs) -``` - -### Activation Checkpointing and Recomputation -```bash -# For limited memory ---recompute-activations - -# For extreme memory constraints ---recompute-granularity full \ ---recompute-method uniform -``` - -### Data Parallelism Communication Overlap - -```bash ---overlap-grad-reduce ---overlap-param-gather -``` - -### Distributed Optimizer -```bash ---use-distributed-optimizer -``` +### Fast Iteration +- **Streamlined Review**: 1 code owner + 1 dev approver (can delegate review) + CI/CD -# Roadmaps +### Feature Lifecycle (Coming Soon) +- **6-Month Timeline**: Experimental features must graduate to stable or be deprecated +- **Migration Support**: Assistance provided for feature transitions -Stay up-to-date with our development roadmaps and planned features: +### Stability Expectations +- **Experimental Nature**: Features may change or be removed as development progresses +- **Testing**: All features will pass convergence and performance validation before inclusion +- **Support**: Dev branch issues should include `[DEV]` prefix -- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements -- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions +## Performance & Benchmarking -*More roadmap trackers will be added soon.* +🚧 **Coming Soon** - We will update this section with performance benchmarks of experimental features as they become available. -# Community & Support +## Community & Support -## Getting Help +### Getting Help - 📖 **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation - 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests -## Contributing +### Contributing We ❤️ contributions! Ways to contribute: - 🐛 **Report bugs** - Help us improve reliability - 💡 **Suggest features** - Shape the future of Megatron Core @@ -473,7 +77,7 @@ We ❤️ contributions! Ways to contribute: **→ [Contributing Guide](./CONTRIBUTING.md)** -## Citation +### Citation ```bibtex @article{megatron-lm, title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, From eff3f6ab9f074a2f8882c3f222539e2d16912d60 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sun, 12 Oct 2025 10:27:14 -0700 Subject: [PATCH 013/248] ADLR/megatron-lm!4223 - Ko3n1g/cp/4213 to dev Co-authored-by: Mcore Bot --- .../core/optimizer/layer_wise_optimizer.py | 23 +- megatron/core/optimizer/optimizer_config.py | 1 - .../python_test_utils/common.py | 39 +- .../get_test_results_from_tensorboard_logs.py | 1 - .../shell_test_utils/run_ci_test.sh | 5 +- .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 162 +++++ .../golden_values_dev_dgxh100_eos.json | 162 +++++ .../golden_values_dev_dgxh100_coreweave.json | 162 +++++ .../golden_values_dev_dgxh100_eos.json | 162 +++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 609 +++++++++++++++--- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 271 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 271 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 247 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 263 +++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 271 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 281 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 285 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 243 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 521 +++++++++++++-- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 481 ++++++++++++-- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 487 ++++++++++++-- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_lts_dgx_a100.json | 538 +++++++++++++++- ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 538 +++++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_lts_dgx_a100.json | 538 +++++++++++++++- ...olden_values_lts_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_lts_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 265 +++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 269 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgx_h100.json | 297 +++++++-- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 301 +++++++-- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_lts_dgx_a100.json | 311 +++++++-- ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_dev_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 344 ++++++++++ .../golden_values_dev_dgxh100_eos.json | 344 ++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 597 ++++++++++++++--- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 227 ++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ ...olden_values_lts_dgxa100_dracooci-ord.json | 287 +++++++++ .../golden_values_lts_dgxa100_dracooci.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 271 ++++++-- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgxh100_coreweave.json | 160 +++++ .../golden_values_dev_dgxh100_eos.json | 160 +++++ .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 1 + .../golden_values_dev_dgxh100_eos.json | 1 + .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 288 ++++++++- .../golden_values_dev_dgxh100_coreweave.json | 287 +++++++++ .../golden_values_dev_dgxh100_eos.json | 287 +++++++++ .../golden_values_dev_dgx_h100.json | 163 ++++- .../golden_values_dev_dgxh100_coreweave.json | 162 +++++ .../golden_values_dev_dgxh100_eos.json | 162 +++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../golden_values_dev_dgx_h100.json | 447 ++++++++++++- ...olden_values_dev_dgxa100_dracooci-ord.json | 537 +++++++++++++++ .../golden_values_dev_dgxa100_dracooci.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 537 +++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 537 +++++++++++++++ .../python_scripts/download_golden_values.py | 40 +- .../python_scripts/launch_jet_workload.py | 7 +- tests/test_utils/recipes/bert.yaml | 2 +- ...pt-dynamic-inference-with-coordinator.yaml | 3 +- .../recipes/gpt-dynamic-inference.yaml | 3 +- tests/test_utils/recipes/gpt-grads.yaml | 2 +- tests/test_utils/recipes/gpt-nemo.yaml | 2 +- .../recipes/gpt-static-inference.yaml | 4 +- tests/test_utils/recipes/gpt.yaml | 2 +- .../recipes/mamba-static-inference.yaml | 4 +- tests/test_utils/recipes/mamba.yaml | 2 +- tests/test_utils/recipes/mimo.yaml | 2 +- .../recipes/moe-dynamic-inference.yaml | 6 +- .../recipes/moe-static-inference.yaml | 6 +- tests/test_utils/recipes/moe.yaml | 14 +- .../test_utils/recipes/multimodal-llava.yaml | 2 +- tests/test_utils/recipes/t5.yaml | 2 +- tests/unit_tests/test_muon_optimizer.py | 7 +- 433 files changed, 158359 insertions(+), 2068 deletions(-) create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json diff --git a/megatron/core/optimizer/layer_wise_optimizer.py b/megatron/core/optimizer/layer_wise_optimizer.py index b398a645ce3..6c77be48e30 100644 --- a/megatron/core/optimizer/layer_wise_optimizer.py +++ b/megatron/core/optimizer/layer_wise_optimizer.py @@ -4,13 +4,13 @@ import torch -from .optimizer import ChainedOptimizer, MegatronOptimizer, Float16OptimizerWithFloat16Params -from .optimizer_config import OptimizerConfig -from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 - from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.utils import get_pg_rank, get_pg_size +from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 +from .optimizer import ChainedOptimizer, Float16OptimizerWithFloat16Params, MegatronOptimizer +from .optimizer_config import OptimizerConfig + class LayerWiseDistributedOptimizer(ChainedOptimizer): """Layer-wise distributed optimizer for Megatron-core models. @@ -27,6 +27,7 @@ class LayerWiseDistributedOptimizer(ChainedOptimizer): 4. Do regular update with chained optimizers, optimizer is already modified so partial update happens 5. allgather updated params to every rank(currently through broadcast loop) """ + def __init__( self, optimizers: List[MegatronOptimizer], @@ -40,7 +41,9 @@ def __init__( if config.bf16: if isinstance(optimizers[0], Float16OptimizerWithFloat16Params): raise TypeError('LayerWiseDistributedOptimizer received Float16 optimizer already.') - optimizers = [Float16OptimizerWithFloat16Params(optim, config, None, None) for optim in optimizers] + optimizers = [ + Float16OptimizerWithFloat16Params(optim, config, None, None) for optim in optimizers + ] super().__init__(optimizers) # TODO(kunlun, deyuf): potential future perf optimization @@ -50,7 +53,7 @@ def __init__( # single allgather later and all current distopt optimization can be applied def shard_params(self, optimizers): - """Shard all params into lists by rank. """ + """Shard all params into lists by rank.""" # We'll optimize sharding later if there is perf issue. should be ok since linear are grouped already # Key is to create separate sharding for dp/expt parallel, saved in dp_cp_params_list, expt_dp_params_list # example of 4 dp rank and 10 non-expert parameters p0-p9, then dp_cp_params_list will look like @@ -94,7 +97,7 @@ def shard_params(self, optimizers): @torch.no_grad() def broadcast_params(self): - """All rank broadcast updated local params(allgatherv). """ + """All rank broadcast updated local params(allgatherv).""" # Broadcast linear layer weights to all other ranks. # This may not be slower than PyTorch allgatherv which calls broadcast internally. # TODO(skyw): Profile and implement more efficient version. @@ -117,9 +120,7 @@ def get_grad_norm(self): grads_for_norm = [] for optimizer in self.chained_optimizers: grads_for_norm += optimizer.get_main_grads_for_grad_norm() - grad_norm = get_grad_norm_fp32( - grads_for_norm, grad_stats_parallel_group=None - ) + grad_norm = get_grad_norm_fp32(grads_for_norm, grad_stats_parallel_group=None) return grad_norm @torch.no_grad() @@ -154,5 +155,3 @@ def save_state_dict_to_file(self, filename: str) -> None: def load_state_dict_from_file(self, filename: str) -> None: """Load the parameter state of the optimizer.""" super().load_state_dict(torch.load(filename)) - - diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 65e1fd6a71f..ced3845804f 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -146,7 +146,6 @@ class OptimizerConfig: muon_extra_scale_factor: float = 1.0 """Additional scale factor for the muon update.""" - ####################### # Distributed optimizer ####################### diff --git a/tests/functional_tests/python_test_utils/common.py b/tests/functional_tests/python_test_utils/common.py index 23d512f1125..4af4bd36167 100644 --- a/tests/functional_tests/python_test_utils/common.py +++ b/tests/functional_tests/python_test_utils/common.py @@ -218,25 +218,18 @@ def pipeline( ] if metric_name == "iteration-time": - if len(actual_value_list) >= 10: - actual_value_list = actual_value_list[3:-3] - golden_value_list = golden_value_list[3:-3] - total_steps_evaluated = ( - golden_value.end_step / golden_value.step_interval + 1 - 3 - 3 - ) - else: - actual_value_list = actual_value_list[3:-1] - golden_value_list = golden_value_list[3:-1] - total_steps_evaluated = ( - golden_value.end_step / golden_value.step_interval + 1 - 3 - 1 - ) - logger.info( - "For metric `%s`, the first and last 3 scalars are removed from the list to reduce noise.", - metric_name, - ) - - actual_value_list = [np.inf if type(v) is str else v for v in actual_value_list] - golden_value_list = [np.inf if type(v) is str else v for v in golden_value_list] + actual_value_list = [ + np.median([np.inf if type(v) is str else v for v in actual_value_list]) + ] + golden_value_list = [ + np.median([np.inf if type(v) is str else v for v in golden_value_list]) + ] + total_steps_evaluated = 1 + else: + total_steps_evaluated = golden_value.end_step / golden_value.step_interval + 1 + + actual_value_list = [np.inf if type(v) is str else v for v in actual_value_list] + golden_value_list = [np.inf if type(v) is str else v for v in golden_value_list] actual = np.array(actual_value_list) golden = np.array(golden_value_list) @@ -248,8 +241,12 @@ def pipeline( passing = np.mean(is_close) >= (num_failing_steps_allowed / total_steps_evaluated) if not passing: - logger.info("Actual values: %s", ", ".join([str(v) for v in actual_value_list])) - logger.info("Golden values: %s", ", ".join([str(v) for v in golden_value_list])) + logger.info( + "Actual values: %s", ", ".join([str(v) for v in (*actual_value_list,)]) + ) + logger.info( + "Golden values: %s", ", ".join([str(v) for v in (*golden_value_list,)]) + ) raise test.error_message(metric_name) result = f"{test.type_of_test_result.name} test for metric {metric_name}: PASSED" diff --git a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py index 50e7e03b0c2..7b74a6879ad 100644 --- a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py +++ b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py @@ -29,7 +29,6 @@ default=False, ) @click.option("--step-size", required=False, default=5, type=int, help="Step size of sampling") -@click.option("--step-size", required=False, default=5, type=int, help="Step size of sampling") def collect_train_test_metrics( logs_dir: str, train_iters: str, diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 872053a8d3f..b24423773e5 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -231,7 +231,7 @@ for i in $(seq 1 $N_REPEAT); do if [[ "$TEST_TYPE" == "release" ]]; then EXTRACT_ARGS=("--is-convergence-test") else - EXTRACT_ARGS=("--is-normal-test") + EXTRACT_ARGS=("--is-normal-test" "--step-size" "1") fi # Read test values from Tensorboard for non-inference tests. @@ -285,7 +285,8 @@ for i in $(seq 1 $N_REPEAT); do --logs-dir $TENSORBOARD_PATH \ --train-iters $TRAIN_ITERS \ --output-path "${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH .json)_2nd.json" \ - --is-second-run + --is-second-run \ + "${EXTRACT_ARGS[@]}" echo "Running pytest 1st vs 2nd run comparison" uv run --no-sync pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_pretraining_resume_checkpoint_pipeline.py \ diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index c9c84707301..a7cfd87bc71 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, "50": 9.72982 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, "50": 2267.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, "50": 1784014336.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, "5": 3108323328.0, + "6": 3108323328.0, + "7": 3108323328.0, + "8": 3108323328.0, + "9": 3108323328.0, "10": 3108323328.0, + "11": 3108323328.0, + "12": 3108323328.0, + "13": 3108323328.0, + "14": 3108323328.0, "15": 3108323328.0, + "16": 3108323328.0, + "17": 3108323328.0, + "18": 3108323328.0, + "19": 3108323328.0, "20": 3108323328.0, + "21": 3108323328.0, + "22": 3108323328.0, + "23": 3108323328.0, + "24": 3108323328.0, "25": 3108323328.0, + "26": 3108323328.0, + "27": 3108323328.0, + "28": 3108323328.0, + "29": 3108323328.0, "30": 3108323328.0, + "31": 3108323328.0, + "32": 3108323328.0, + "33": 3108323328.0, + "34": 3108323328.0, "35": 3108323328.0, + "36": 3108323328.0, + "37": 3108323328.0, + "38": 3108323328.0, + "39": 3108323328.0, "40": 3108323328.0, + "41": 3108323328.0, + "42": 3108323328.0, + "43": 3108323328.0, + "44": 3108323328.0, "45": 3108323328.0, + "46": 3108323328.0, + "47": 3108323328.0, + "48": 3108323328.0, + "49": 3108323328.0, "50": 3108323328.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 12.77355, - "5": 0.85924, - "10": 0.86109, - "15": 0.87427, - "20": 1.11915, - "25": 0.87738, - "30": 0.86647, - "35": 0.84584, - "40": 0.86114, - "45": 1.15934, - "50": 0.84601 + "1": 11.95325, + "2": 1.03495, + "3": 1.01983, + "4": 1.02247, + "5": 1.02376, + "6": 1.01057, + "7": 1.00305, + "8": 1.00511, + "9": 1.01164, + "10": 1.00809, + "11": 1.00401, + "12": 1.01195, + "13": 1.00522, + "14": 1.01037, + "15": 1.01016, + "16": 1.00481, + "17": 1.00787, + "18": 1.00866, + "19": 1.0117, + "20": 1.43302, + "21": 1.37362, + "22": 1.11681, + "23": 1.05672, + "24": 1.00983, + "25": 1.01065, + "26": 1.00572, + "27": 1.00992, + "28": 1.00576, + "29": 1.00599, + "30": 1.00468, + "31": 1.00657, + "32": 1.00207, + "33": 1.00815, + "34": 1.01333, + "35": 1.00888, + "36": 1.01481, + "37": 1.32861, + "38": 1.01215, + "39": 1.00755, + "40": 1.00235, + "41": 1.00954, + "42": 1.00544, + "43": 1.0136, + "44": 1.34075, + "45": 1.00937, + "46": 1.0108, + "47": 1.01217, + "48": 1.11889, + "49": 1.34225, + "50": 1.09191 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..2e4f3c6e211 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, + "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, + "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, + "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, + "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, + "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, + "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, + "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, + "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, + "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, + "50": 9.72982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, + "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, + "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, + "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, + "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, + "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, + "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, + "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, + "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, + "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, + "50": 2267.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, + "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, + "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, + "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, + "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, + "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, + "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, + "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, + "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, + "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, + "50": 1784014336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, + "5": 3108323328.0, + "6": 3108323328.0, + "7": 3108323328.0, + "8": 3108847104.0, + "9": 3108847104.0, + "10": 3108847104.0, + "11": 3108847104.0, + "12": 3108847104.0, + "13": 3108847104.0, + "14": 3108847104.0, + "15": 3108847104.0, + "16": 3108847104.0, + "17": 3108847104.0, + "18": 3108847104.0, + "19": 3108847104.0, + "20": 3108847104.0, + "21": 3108847104.0, + "22": 3108847104.0, + "23": 3108847104.0, + "24": 3108847104.0, + "25": 3108847104.0, + "26": 3108847104.0, + "27": 3108847104.0, + "28": 3108847104.0, + "29": 3108847104.0, + "30": 3108847104.0, + "31": 3108847104.0, + "32": 3108847104.0, + "33": 3108847104.0, + "34": 3108847104.0, + "35": 3108847104.0, + "36": 3108847104.0, + "37": 3108847104.0, + "38": 3108847104.0, + "39": 3108847104.0, + "40": 3108847104.0, + "41": 3108847104.0, + "42": 3108847104.0, + "43": 3108847104.0, + "44": 3108847104.0, + "45": 3108847104.0, + "46": 3108847104.0, + "47": 3108847104.0, + "48": 3108847104.0, + "49": 3108847104.0, + "50": 3108847104.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.28863, + "2": 1.02215, + "3": 0.91269, + "4": 0.90798, + "5": 0.9095, + "6": 0.89623, + "7": 0.91406, + "8": 0.93659, + "9": 0.98867, + "10": 0.97926, + "11": 0.92244, + "12": 0.93168, + "13": 0.91684, + "14": 0.92151, + "15": 0.90545, + "16": 0.92975, + "17": 0.9771, + "18": 0.91421, + "19": 0.91325, + "20": 1.37492, + "21": 1.35582, + "22": 0.90471, + "23": 0.90119, + "24": 0.9066, + "25": 0.89745, + "26": 0.90071, + "27": 0.90705, + "28": 0.91467, + "29": 0.90066, + "30": 0.94983, + "31": 0.9257, + "32": 0.92349, + "33": 0.92172, + "34": 0.93247, + "35": 0.91594, + "36": 0.9259, + "37": 0.91518, + "38": 0.91714, + "39": 0.91191, + "40": 0.91531, + "41": 0.91413, + "42": 0.92876, + "43": 0.95961, + "44": 0.90524, + "45": 0.89573, + "46": 0.90239, + "47": 0.89546, + "48": 1.05878, + "49": 1.18954, + "50": 1.15643 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..1352649be85 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, + "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, + "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, + "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, + "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, + "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, + "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, + "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, + "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, + "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, + "50": 9.72982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, + "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, + "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, + "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, + "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, + "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, + "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, + "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, + "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, + "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, + "50": 2267.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, + "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, + "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, + "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, + "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, + "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, + "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, + "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, + "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, + "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, + "50": 1784014336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, + "5": 3108323328.0, + "6": 3108842496.0, + "7": 3108842496.0, + "8": 3108842496.0, + "9": 3108842496.0, + "10": 3108842496.0, + "11": 3108842496.0, + "12": 3108842496.0, + "13": 3108842496.0, + "14": 3108842496.0, + "15": 3108842496.0, + "16": 3108842496.0, + "17": 3108842496.0, + "18": 3108842496.0, + "19": 3108842496.0, + "20": 3108842496.0, + "21": 3108842496.0, + "22": 3108842496.0, + "23": 3108842496.0, + "24": 3108842496.0, + "25": 3108842496.0, + "26": 3108842496.0, + "27": 3108842496.0, + "28": 3108842496.0, + "29": 3108842496.0, + "30": 3108842496.0, + "31": 3108842496.0, + "32": 3108842496.0, + "33": 3108842496.0, + "34": 3108842496.0, + "35": 3108842496.0, + "36": 3108842496.0, + "37": 3108842496.0, + "38": 3108842496.0, + "39": 3108842496.0, + "40": 3108842496.0, + "41": 3108842496.0, + "42": 3108842496.0, + "43": 3108842496.0, + "44": 3108842496.0, + "45": 3108842496.0, + "46": 3108842496.0, + "47": 3108842496.0, + "48": 3108842496.0, + "49": 3108842496.0, + "50": 3108842496.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.98661, + "2": 1.05916, + "3": 1.01721, + "4": 1.02611, + "5": 1.02779, + "6": 1.11252, + "7": 1.0176, + "8": 1.02427, + "9": 1.02561, + "10": 1.01845, + "11": 1.02419, + "12": 1.01745, + "13": 1.01224, + "14": 1.02388, + "15": 1.03687, + "16": 1.01886, + "17": 1.01708, + "18": 1.01143, + "19": 1.01902, + "20": 1.49878, + "21": 1.47537, + "22": 1.01801, + "23": 1.05158, + "24": 1.03481, + "25": 1.01773, + "26": 1.01186, + "27": 1.02203, + "28": 1.01824, + "29": 1.01865, + "30": 1.02165, + "31": 1.0184, + "32": 1.02106, + "33": 1.04655, + "34": 1.03129, + "35": 1.01893, + "36": 1.02153, + "37": 1.02154, + "38": 1.0213, + "39": 1.14846, + "40": 1.02149, + "41": 1.01905, + "42": 1.02038, + "43": 1.03126, + "44": 1.04155, + "45": 1.01649, + "46": 1.01742, + "47": 1.02406, + "48": 1.27122, + "49": 1.15085, + "50": 1.10861 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f38e38fdb9c..fb44f049ad6 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, "50": 9.73004 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, "50": 2145.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, "50": 1767237120.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, "50": 3079487488.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.51607, - "5": 0.70637, - "10": 0.74903, - "15": 0.69218, - "20": 0.94021, - "25": 0.69, - "30": 0.69576, - "35": 0.69538, - "40": 0.69122, - "45": 1.04545, - "50": 0.69215 + "1": 11.5674, + "2": 0.87925, + "3": 0.84214, + "4": 0.85037, + "5": 0.85134, + "6": 0.84821, + "7": 0.84955, + "8": 0.84912, + "9": 0.85227, + "10": 0.84641, + "11": 0.84805, + "12": 0.84791, + "13": 0.86059, + "14": 0.86196, + "15": 1.10537, + "16": 1.03739, + "17": 0.8309, + "18": 0.82806, + "19": 1.30044, + "20": 0.83029, + "21": 0.82677, + "22": 1.30745, + "23": 0.85382, + "24": 0.83942, + "25": 0.83871, + "26": 0.8337, + "27": 0.83434, + "28": 0.8309, + "29": 0.83936, + "30": 0.83788, + "31": 0.83476, + "32": 0.83236, + "33": 0.83163, + "34": 0.84328, + "35": 0.83702, + "36": 0.83877, + "37": 0.83834, + "38": 0.83145, + "39": 0.83941, + "40": 0.84432, + "41": 1.16619, + "42": 1.1534, + "43": 1.08513, + "44": 0.84537, + "45": 0.99113, + "46": 0.84419, + "47": 0.89066, + "48": 0.83549, + "49": 1.01154, + "50": 0.96557 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0ff198806cb --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, + "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, + "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, + "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, + "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, + "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, + "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, + "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, + "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, + "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, + "50": 9.73004 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, + "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, + "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, + "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, + "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, + "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, + "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, + "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, + "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, + "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, + "50": 2145.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, + "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, + "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, + "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, + "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, + "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, + "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, + "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, + "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, + "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, + "50": 1767237120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, + "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, + "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, + "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, + "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, + "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, + "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, + "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, + "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, + "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, + "50": 3079487488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.70758, + "2": 0.8354, + "3": 0.78875, + "4": 0.77893, + "5": 0.81797, + "6": 0.77299, + "7": 0.76726, + "8": 0.77744, + "9": 0.77036, + "10": 0.76808, + "11": 0.77009, + "12": 0.77543, + "13": 0.78463, + "14": 0.77498, + "15": 0.76065, + "16": 1.28888, + "17": 0.78476, + "18": 0.77415, + "19": 0.77341, + "20": 1.04994, + "21": 1.25413, + "22": 0.7709, + "23": 0.85615, + "24": 0.76186, + "25": 0.75903, + "26": 0.75431, + "27": 0.76868, + "28": 0.7776, + "29": 0.74989, + "30": 0.75136, + "31": 0.7956, + "32": 0.74247, + "33": 0.73237, + "34": 0.73066, + "35": 0.74241, + "36": 0.74361, + "37": 0.77983, + "38": 0.77753, + "39": 0.75036, + "40": 0.75188, + "41": 0.75332, + "42": 0.89635, + "43": 0.73883, + "44": 0.92932, + "45": 0.73444, + "46": 0.73103, + "47": 1.01543, + "48": 1.06091, + "49": 0.92342, + "50": 1.25669 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..bf20b2b00e3 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, + "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, + "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, + "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, + "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, + "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, + "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, + "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, + "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, + "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, + "50": 9.73004 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, + "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, + "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, + "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, + "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, + "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, + "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, + "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, + "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, + "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, + "50": 2145.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, + "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, + "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, + "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, + "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, + "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, + "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, + "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, + "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, + "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, + "50": 1767237120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, + "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, + "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, + "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, + "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, + "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, + "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, + "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, + "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, + "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, + "50": 3079487488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.68301, + "2": 0.87796, + "3": 0.84756, + "4": 0.85513, + "5": 0.85643, + "6": 0.85366, + "7": 0.8468, + "8": 0.84974, + "9": 0.84989, + "10": 0.8464, + "11": 0.84369, + "12": 0.84972, + "13": 0.84311, + "14": 0.85648, + "15": 1.1084, + "16": 0.8827, + "17": 0.87952, + "18": 0.88554, + "19": 0.82673, + "20": 0.82222, + "21": 1.06414, + "22": 1.09134, + "23": 1.02591, + "24": 0.82601, + "25": 0.82277, + "26": 0.81844, + "27": 0.82627, + "28": 0.82854, + "29": 0.82653, + "30": 0.82247, + "31": 0.82906, + "32": 0.82363, + "33": 0.82944, + "34": 0.82401, + "35": 0.82902, + "36": 0.83537, + "37": 0.8265, + "38": 0.82728, + "39": 0.82087, + "40": 0.82525, + "41": 0.82691, + "42": 1.14473, + "43": 0.97566, + "44": 0.82343, + "45": 0.82956, + "46": 0.82572, + "47": 0.83635, + "48": 0.94255, + "49": 0.99753, + "50": 1.10127 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 325bd59c44d..8063c892338 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, "50": 9.72982, + "51": 9.71485, + "52": 9.66475, + "53": 9.60919, + "54": 9.62705, "55": 9.61012, + "56": 9.617, + "57": 9.56786, + "58": 9.52731, + "59": 9.51668, "60": 9.51865, + "61": 9.53132, + "62": 9.45016, + "63": 9.45725, + "64": 9.43435, "65": 9.45801, + "66": 9.4368, + "67": 9.3968, + "68": 9.36474, + "69": 9.4095, "70": 9.376, + "71": 9.41716, + "72": 9.42574, + "73": 9.37581, + "74": 9.41547, "75": 9.37891, + "76": 9.28017, + "77": 9.32205, + "78": 9.35754, + "79": 9.32162, "80": 9.31486, + "81": 9.2678, + "82": 9.34178, + "83": 9.32145, + "84": 9.24785, "85": 9.35023, + "86": 9.22392, + "87": 9.3062, + "88": 9.29891, + "89": 9.22716, "90": 9.28483, + "91": 9.23109, + "92": 9.27463, + "93": 9.19241, + "94": 9.23984, "95": 9.28006, + "96": 9.17526, + "97": 9.21894, + "98": 9.17192, + "99": 9.16446, "100": 9.14816 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, "50": 2267.0, + "51": 1923.0, + "52": 2515.0, + "53": 3615.0, + "54": 3426.0, "55": 3436.0, + "56": 4411.0, + "57": 4095.0, + "58": 4308.0, + "59": 1687.0, "60": 2431.0, + "61": 2151.0, + "62": 3986.0, + "63": 3558.0, + "64": 4286.0, "65": 3052.0, + "66": 1720.0, + "67": 1910.0, + "68": 4193.0, + "69": 4347.0, "70": 4596.0, + "71": 2078.0, + "72": 4406.0, + "73": 4062.0, + "74": 3358.0, "75": 4606.0, + "76": 2187.0, + "77": 4854.0, + "78": 4098.0, + "79": 2652.0, "80": 3776.0, + "81": 3550.0, + "82": 3031.0, + "83": 5345.0, + "84": 4396.0, "85": 4354.0, + "86": 3332.0, + "87": 4815.0, + "88": 3303.0, + "89": 4611.0, "90": 4346.0, + "91": 4361.0, + "92": 3502.0, + "93": 5624.0, + "94": 3733.0, "95": 4728.0, + "96": 3534.0, + "97": 3873.0, + "98": 4525.0, + "99": 4329.0, "100": 3365.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, "50": 1784014336.0, + "51": 1784014336.0, + "52": 1784014336.0, + "53": 1784014336.0, + "54": 1784014336.0, "55": 1784014336.0, + "56": 1784014336.0, + "57": 1784014336.0, + "58": 1784014336.0, + "59": 1784014336.0, "60": 1784014336.0, + "61": 1784014336.0, + "62": 1784014336.0, + "63": 1784014336.0, + "64": 1784014336.0, "65": 1784014336.0, + "66": 1784014336.0, + "67": 1784014336.0, + "68": 1784014336.0, + "69": 1784014336.0, "70": 1784014336.0, + "71": 1784014336.0, + "72": 1784014336.0, + "73": 1784014336.0, + "74": 1784014336.0, "75": 1784014336.0, + "76": 1784014336.0, + "77": 1784014336.0, + "78": 1784014336.0, + "79": 1784014336.0, "80": 1784014336.0, + "81": 1784014336.0, + "82": 1784014336.0, + "83": 1784014336.0, + "84": 1784014336.0, "85": 1784014336.0, + "86": 1784014336.0, + "87": 1784014336.0, + "88": 1784014336.0, + "89": 1784014336.0, "90": 1784014336.0, + "91": 1784014336.0, + "92": 1784014336.0, + "93": 1784014336.0, + "94": 1784014336.0, "95": 1784014336.0, + "96": 1784014336.0, + "97": 1784014336.0, + "98": 1784014336.0, + "99": 1784014336.0, "100": 1784014336.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, "5": 3108323328.0, + "6": 3108323328.0, + "7": 3108323328.0, + "8": 3108323328.0, + "9": 3108323328.0, "10": 3108323328.0, + "11": 3108323328.0, + "12": 3108323328.0, + "13": 3108323328.0, + "14": 3108323328.0, "15": 3108323328.0, + "16": 3108323328.0, + "17": 3108323328.0, + "18": 3108323328.0, + "19": 3108323328.0, "20": 3108323328.0, + "21": 3108323328.0, + "22": 3108323328.0, + "23": 3108323328.0, + "24": 3108323328.0, "25": 3108323328.0, + "26": 3108323328.0, + "27": 3108323328.0, + "28": 3108323328.0, + "29": 3108323328.0, "30": 3108323328.0, + "31": 3108323328.0, + "32": 3108323328.0, + "33": 3108323328.0, + "34": 3108323328.0, "35": 3108323328.0, + "36": 3108323328.0, + "37": 3108323328.0, + "38": 3108323328.0, + "39": 3108323328.0, "40": 3108323328.0, + "41": 3108323328.0, + "42": 3108323328.0, + "43": 3108323328.0, + "44": 3108323328.0, "45": 3108323328.0, + "46": 3108323328.0, + "47": 3108323328.0, + "48": 3108323328.0, + "49": 3108323328.0, "50": 3108323328.0, + "51": 3108323328.0, + "52": 3108323328.0, + "53": 3108323328.0, + "54": 3108323328.0, "55": 3108323328.0, + "56": 3108323328.0, + "57": 3108323328.0, + "58": 3108323328.0, + "59": 3108323328.0, "60": 3108323328.0, + "61": 3108323328.0, + "62": 3108323328.0, + "63": 3108323328.0, + "64": 3108323328.0, "65": 3108323328.0, + "66": 3108323328.0, + "67": 3108323328.0, + "68": 3108323328.0, + "69": 3108323328.0, "70": 3108323328.0, + "71": 3108323328.0, + "72": 3108323328.0, + "73": 3108323328.0, + "74": 3108323328.0, "75": 3108323328.0, + "76": 3108323328.0, + "77": 3108323328.0, + "78": 3108323328.0, + "79": 3108323328.0, "80": 3108323328.0, + "81": 3108323328.0, + "82": 3108323328.0, + "83": 3108323328.0, + "84": 3108323328.0, "85": 3108323328.0, + "86": 3108323328.0, + "87": 3108323328.0, + "88": 3108323328.0, + "89": 3108323328.0, "90": 3108323328.0, + "91": 3108323328.0, + "92": 3108323328.0, + "93": 3108323328.0, + "94": 3108323328.0, "95": 3108323328.0, + "96": 3108323328.0, + "97": 3108323328.0, + "98": 3108323328.0, + "99": 3108323328.0, "100": 3108323328.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 11.15622, - "5": 0.89876, - "10": 0.89356, - "15": 0.87954, - "20": 0.86205, - "25": 0.856, - "30": 0.88843, - "35": 0.85722, - "40": 0.87142, - "45": 1.00082, - "50": 1.22422, - "55": 1.51231, - "60": 0.8651, - "65": 0.85577, - "70": 0.86627, - "75": 0.94057, - "80": 0.86318, - "85": 1.18974, - "90": 0.85756, - "95": 0.85398, - "100": 0.85745 + "1": 12.25998, + "2": 1.04599, + "3": 1.00983, + "4": 1.01193, + "5": 1.01326, + "6": 1.01181, + "7": 1.01264, + "8": 1.01822, + "9": 1.02424, + "10": 1.0191, + "11": 1.01303, + "12": 1.00485, + "13": 1.0025, + "14": 1.00999, + "15": 1.00956, + "16": 1.00094, + "17": 1.00769, + "18": 1.01014, + "19": 1.01639, + "20": 1.22304, + "21": 1.4851, + "22": 1.19412, + "23": 1.01165, + "24": 1.0106, + "25": 1.01512, + "26": 1.00595, + "27": 1.01769, + "28": 1.01182, + "29": 1.00676, + "30": 1.00481, + "31": 1.1042, + "32": 1.00908, + "33": 1.01083, + "34": 1.00353, + "35": 1.00454, + "36": 1.00641, + "37": 1.00279, + "38": 1.00471, + "39": 1.00143, + "40": 1.00802, + "41": 1.00755, + "42": 1.00913, + "43": 1.00814, + "44": 1.00935, + "45": 1.00635, + "46": 1.01076, + "47": 1.01077, + "48": 1.14065, + "49": 1.24856, + "50": 1.09012, + "51": 1.03825, + "52": 1.44742, + "53": 1.3184, + "54": 1.01374, + "55": 1.01506, + "56": 1.01099, + "57": 1.04106, + "58": 1.02232, + "59": 1.01748, + "60": 1.00992, + "61": 1.02073, + "62": 1.02809, + "63": 1.34383, + "64": 1.38941, + "65": 1.10673, + "66": 1.01505, + "67": 1.00839, + "68": 1.00645, + "69": 1.01066, + "70": 1.01137, + "71": 1.35475, + "72": 1.02215, + "73": 1.0187, + "74": 1.01939, + "75": 1.10218, + "76": 1.12059, + "77": 1.12057, + "78": 1.03631, + "79": 1.12601, + "80": 1.33494, + "81": 1.09935, + "82": 1.06264, + "83": 1.31187, + "84": 1.0139, + "85": 1.00708, + "86": 1.02816, + "87": 1.02033, + "88": 1.01728, + "89": 1.2628, + "90": 1.01941, + "91": 1.01944, + "92": 1.0295, + "93": 1.01897, + "94": 1.01663, + "95": 1.02386, + "96": 1.00901, + "97": 1.00751, + "98": 1.0074, + "99": 1.00366, + "100": 1.00628 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..137f195264d --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, + "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, + "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, + "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, + "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, + "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, + "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, + "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, + "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, + "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, + "50": 9.72982, + "51": 9.71485, + "52": 9.66475, + "53": 9.60919, + "54": 9.62705, + "55": 9.61012, + "56": 9.617, + "57": 9.56786, + "58": 9.52731, + "59": 9.51668, + "60": 9.51865, + "61": 9.53132, + "62": 9.45016, + "63": 9.45725, + "64": 9.43435, + "65": 9.45801, + "66": 9.4368, + "67": 9.3968, + "68": 9.36474, + "69": 9.4095, + "70": 9.376, + "71": 9.41716, + "72": 9.42574, + "73": 9.37581, + "74": 9.41547, + "75": 9.37891, + "76": 9.28017, + "77": 9.32205, + "78": 9.35754, + "79": 9.32162, + "80": 9.31486, + "81": 9.2678, + "82": 9.34178, + "83": 9.32145, + "84": 9.24785, + "85": 9.35023, + "86": 9.22392, + "87": 9.3062, + "88": 9.29891, + "89": 9.22716, + "90": 9.28483, + "91": 9.23109, + "92": 9.27463, + "93": 9.19241, + "94": 9.23984, + "95": 9.28006, + "96": 9.17526, + "97": 9.21894, + "98": 9.17192, + "99": 9.16446, + "100": 9.14816 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, + "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, + "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, + "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, + "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, + "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, + "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, + "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, + "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, + "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, + "50": 2267.0, + "51": 1923.0, + "52": 2515.0, + "53": 3615.0, + "54": 3426.0, + "55": 3436.0, + "56": 4411.0, + "57": 4095.0, + "58": 4308.0, + "59": 1687.0, + "60": 2431.0, + "61": 2151.0, + "62": 3986.0, + "63": 3558.0, + "64": 4286.0, + "65": 3052.0, + "66": 1720.0, + "67": 1910.0, + "68": 4193.0, + "69": 4347.0, + "70": 4596.0, + "71": 2078.0, + "72": 4406.0, + "73": 4062.0, + "74": 3358.0, + "75": 4606.0, + "76": 2187.0, + "77": 4854.0, + "78": 4098.0, + "79": 2652.0, + "80": 3776.0, + "81": 3550.0, + "82": 3031.0, + "83": 5345.0, + "84": 4396.0, + "85": 4354.0, + "86": 3332.0, + "87": 4815.0, + "88": 3303.0, + "89": 4611.0, + "90": 4346.0, + "91": 4361.0, + "92": 3502.0, + "93": 5624.0, + "94": 3733.0, + "95": 4728.0, + "96": 3534.0, + "97": 3873.0, + "98": 4525.0, + "99": 4329.0, + "100": 3365.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, + "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, + "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, + "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, + "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, + "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, + "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, + "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, + "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, + "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, + "50": 1784014336.0, + "51": 1784014336.0, + "52": 1784014336.0, + "53": 1784014336.0, + "54": 1784014336.0, + "55": 1784014336.0, + "56": 1784014336.0, + "57": 1784014336.0, + "58": 1784014336.0, + "59": 1784014336.0, + "60": 1784014336.0, + "61": 1784014336.0, + "62": 1784014336.0, + "63": 1784014336.0, + "64": 1784014336.0, + "65": 1784014336.0, + "66": 1784014336.0, + "67": 1784014336.0, + "68": 1784014336.0, + "69": 1784014336.0, + "70": 1784014336.0, + "71": 1784014336.0, + "72": 1784014336.0, + "73": 1784014336.0, + "74": 1784014336.0, + "75": 1784014336.0, + "76": 1784014336.0, + "77": 1784014336.0, + "78": 1784014336.0, + "79": 1784014336.0, + "80": 1784014336.0, + "81": 1784014336.0, + "82": 1784014336.0, + "83": 1784014336.0, + "84": 1784014336.0, + "85": 1784014336.0, + "86": 1784014336.0, + "87": 1784014336.0, + "88": 1784014336.0, + "89": 1784014336.0, + "90": 1784014336.0, + "91": 1784014336.0, + "92": 1784014336.0, + "93": 1784014336.0, + "94": 1784014336.0, + "95": 1784014336.0, + "96": 1784014336.0, + "97": 1784014336.0, + "98": 1784014336.0, + "99": 1784014336.0, + "100": 1784014336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, + "5": 3108323328.0, + "6": 3108323328.0, + "7": 3108323328.0, + "8": 3108323328.0, + "9": 3108323328.0, + "10": 3108845568.0, + "11": 3108845568.0, + "12": 3108845568.0, + "13": 3108845568.0, + "14": 3108845568.0, + "15": 3108845568.0, + "16": 3108845568.0, + "17": 3108845568.0, + "18": 3108845568.0, + "19": 3108845568.0, + "20": 3108845568.0, + "21": 3108845568.0, + "22": 3108845568.0, + "23": 3108845568.0, + "24": 3108845568.0, + "25": 3108845568.0, + "26": 3108845568.0, + "27": 3108845568.0, + "28": 3108845568.0, + "29": 3108845568.0, + "30": 3108845568.0, + "31": 3108845568.0, + "32": 3108845568.0, + "33": 3108845568.0, + "34": 3108845568.0, + "35": 3108845568.0, + "36": 3108845568.0, + "37": 3108846080.0, + "38": 3108846080.0, + "39": 3108846080.0, + "40": 3108846080.0, + "41": 3108846080.0, + "42": 3108846080.0, + "43": 3108846080.0, + "44": 3108846080.0, + "45": 3108846080.0, + "46": 3108846080.0, + "47": 3108846080.0, + "48": 3108846080.0, + "49": 3108846080.0, + "50": 3108846080.0, + "51": 3108846080.0, + "52": 3108846080.0, + "53": 3108846080.0, + "54": 3108846080.0, + "55": 3108846080.0, + "56": 3108846080.0, + "57": 3108846080.0, + "58": 3108846080.0, + "59": 3108846080.0, + "60": 3108846080.0, + "61": 3108846080.0, + "62": 3108847616.0, + "63": 3108847616.0, + "64": 3108847616.0, + "65": 3108847616.0, + "66": 3108847616.0, + "67": 3108847616.0, + "68": 3108847616.0, + "69": 3108847616.0, + "70": 3108847616.0, + "71": 3108847616.0, + "72": 3108847616.0, + "73": 3108847616.0, + "74": 3108847616.0, + "75": 3108847616.0, + "76": 3108847616.0, + "77": 3108847616.0, + "78": 3108847616.0, + "79": 3108847616.0, + "80": 3108847616.0, + "81": 3108847616.0, + "82": 3108847616.0, + "83": 3108847616.0, + "84": 3108847616.0, + "85": 3108847616.0, + "86": 3108847616.0, + "87": 3108847616.0, + "88": 3108847616.0, + "89": 3108847616.0, + "90": 3108847616.0, + "91": 3108847616.0, + "92": 3108847616.0, + "93": 3108847616.0, + "94": 3108847616.0, + "95": 3108847616.0, + "96": 3108847616.0, + "97": 3108847616.0, + "98": 3108847616.0, + "99": 3108847616.0, + "100": 3108847616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.09913, + "2": 1.02984, + "3": 0.9509, + "4": 0.92961, + "5": 0.88057, + "6": 0.86499, + "7": 0.87435, + "8": 0.87748, + "9": 0.88481, + "10": 0.87813, + "11": 0.88937, + "12": 0.91092, + "13": 0.85441, + "14": 0.87519, + "15": 0.89434, + "16": 1.08771, + "17": 0.87461, + "18": 0.8785, + "19": 1.08419, + "20": 1.00138, + "21": 0.98051, + "22": 1.32806, + "23": 0.85982, + "24": 0.88387, + "25": 0.88245, + "26": 0.87335, + "27": 0.88317, + "28": 0.88985, + "29": 0.895, + "30": 0.87281, + "31": 0.88109, + "32": 0.87358, + "33": 0.89681, + "34": 0.91049, + "35": 0.89763, + "36": 0.89169, + "37": 0.89357, + "38": 0.89732, + "39": 0.88241, + "40": 0.90292, + "41": 0.88715, + "42": 0.90721, + "43": 1.00024, + "44": 1.05261, + "45": 0.88589, + "46": 0.89065, + "47": 1.19824, + "48": 1.03763, + "49": 0.88362, + "50": 2.54681, + "51": 0.88554, + "52": 1.29624, + "53": 0.90469, + "54": 1.25859, + "55": 0.8959, + "56": 0.89223, + "57": 0.91307, + "58": 0.9046, + "59": 0.90217, + "60": 1.19764, + "61": 0.96385, + "62": 1.26273, + "63": 1.00365, + "64": 0.95065, + "65": 0.87723, + "66": 0.87675, + "67": 0.8752, + "68": 1.1677, + "69": 0.87584, + "70": 0.88581, + "71": 1.19607, + "72": 0.88789, + "73": 1.11276, + "74": 0.89256, + "75": 0.8887, + "76": 1.28091, + "77": 0.93746, + "78": 0.87892, + "79": 1.07934, + "80": 0.88837, + "81": 0.87726, + "82": 0.87655, + "83": 0.89632, + "84": 0.90579, + "85": 0.88535, + "86": 0.8924, + "87": 0.8763, + "88": 0.8769, + "89": 0.87952, + "90": 0.89745, + "91": 0.8736, + "92": 0.8825, + "93": 0.8845, + "94": 0.87495, + "95": 0.88075, + "96": 0.94076, + "97": 0.87753, + "98": 0.88407, + "99": 0.89106, + "100": 0.88092 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..dc5d31f8f8b --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.48367, + "2": 10.48426, + "3": 10.48254, + "4": 10.48311, + "5": 10.4764, + "6": 10.4844, + "7": 10.48458, + "8": 10.48829, + "9": 10.49008, + "10": 10.47268, + "11": 10.47256, + "12": 10.48259, + "13": 10.47857, + "14": 10.45154, + "15": 10.47925, + "16": 10.45346, + "17": 10.45145, + "18": 10.46238, + "19": 10.44113, + "20": 10.45448, + "21": 10.43454, + "22": 10.40592, + "23": 10.39961, + "24": 10.37579, + "25": 10.38182, + "26": 10.35147, + "27": 10.35388, + "28": 10.34937, + "29": 10.28711, + "30": 10.21159, + "31": 10.1726, + "32": 10.13421, + "33": 10.14744, + "34": 10.10737, + "35": 10.10581, + "36": 10.08735, + "37": 10.08157, + "38": 10.07233, + "39": 10.00094, + "40": 9.98143, + "41": 9.92541, + "42": 9.87527, + "43": 9.88711, + "44": 9.80642, + "45": 9.82325, + "46": 9.73785, + "47": 9.74817, + "48": 9.71609, + "49": 9.74484, + "50": 9.72982, + "51": 9.71485, + "52": 9.66475, + "53": 9.60919, + "54": 9.62705, + "55": 9.61012, + "56": 9.617, + "57": 9.56786, + "58": 9.52731, + "59": 9.51668, + "60": 9.51865, + "61": 9.53132, + "62": 9.45016, + "63": 9.45725, + "64": 9.43435, + "65": 9.45801, + "66": 9.4368, + "67": 9.3968, + "68": 9.36474, + "69": 9.4095, + "70": 9.376, + "71": 9.41716, + "72": 9.42574, + "73": 9.37581, + "74": 9.41547, + "75": 9.37891, + "76": 9.28017, + "77": 9.32205, + "78": 9.35754, + "79": 9.32162, + "80": 9.31486, + "81": 9.2678, + "82": 9.34178, + "83": 9.32145, + "84": 9.24785, + "85": 9.35023, + "86": 9.22392, + "87": 9.3062, + "88": 9.29891, + "89": 9.22716, + "90": 9.28483, + "91": 9.23109, + "92": 9.27463, + "93": 9.19241, + "94": 9.23984, + "95": 9.28006, + "96": 9.17526, + "97": 9.21894, + "98": 9.17192, + "99": 9.16446, + "100": 9.14816 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2570.0, + "2": 1923.0, + "3": 1512.0, + "4": 2322.0, + "5": 2033.0, + "6": 1774.0, + "7": 2781.0, + "8": 2460.0, + "9": 2308.0, + "10": 2635.0, + "11": 2397.0, + "12": 1817.0, + "13": 2348.0, + "14": 2749.0, + "15": 2027.0, + "16": 2719.0, + "17": 2487.0, + "18": 2533.0, + "19": 2547.0, + "20": 2850.0, + "21": 1990.0, + "22": 2884.0, + "23": 2857.0, + "24": 2685.0, + "25": 2514.0, + "26": 2958.0, + "27": 2673.0, + "28": 2723.0, + "29": 2571.0, + "30": 2858.0, + "31": 2157.0, + "32": 2357.0, + "33": 2242.0, + "34": 2464.0, + "35": 2544.0, + "36": 2933.0, + "37": 3293.0, + "38": 2730.0, + "39": 2795.0, + "40": 3310.0, + "41": 1816.0, + "42": 1467.0, + "43": 1817.0, + "44": 2633.0, + "45": 3576.0, + "46": 3015.0, + "47": 2805.0, + "48": 3071.0, + "49": 2974.0, + "50": 2267.0, + "51": 1923.0, + "52": 2515.0, + "53": 3615.0, + "54": 3426.0, + "55": 3436.0, + "56": 4411.0, + "57": 4095.0, + "58": 4308.0, + "59": 1687.0, + "60": 2431.0, + "61": 2151.0, + "62": 3986.0, + "63": 3558.0, + "64": 4286.0, + "65": 3052.0, + "66": 1720.0, + "67": 1910.0, + "68": 4193.0, + "69": 4347.0, + "70": 4596.0, + "71": 2078.0, + "72": 4406.0, + "73": 4062.0, + "74": 3358.0, + "75": 4606.0, + "76": 2187.0, + "77": 4854.0, + "78": 4098.0, + "79": 2652.0, + "80": 3776.0, + "81": 3550.0, + "82": 3031.0, + "83": 5345.0, + "84": 4396.0, + "85": 4354.0, + "86": 3332.0, + "87": 4815.0, + "88": 3303.0, + "89": 4611.0, + "90": 4346.0, + "91": 4361.0, + "92": 3502.0, + "93": 5624.0, + "94": 3733.0, + "95": 4728.0, + "96": 3534.0, + "97": 3873.0, + "98": 4525.0, + "99": 4329.0, + "100": 3365.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1784014336.0, + "2": 1784014336.0, + "3": 1784014336.0, + "4": 1784014336.0, + "5": 1784014336.0, + "6": 1784014336.0, + "7": 1784014336.0, + "8": 1784014336.0, + "9": 1784014336.0, + "10": 1784014336.0, + "11": 1784014336.0, + "12": 1784014336.0, + "13": 1784014336.0, + "14": 1784014336.0, + "15": 1784014336.0, + "16": 1784014336.0, + "17": 1784014336.0, + "18": 1784014336.0, + "19": 1784014336.0, + "20": 1784014336.0, + "21": 1784014336.0, + "22": 1784014336.0, + "23": 1784014336.0, + "24": 1784014336.0, + "25": 1784014336.0, + "26": 1784014336.0, + "27": 1784014336.0, + "28": 1784014336.0, + "29": 1784014336.0, + "30": 1784014336.0, + "31": 1784014336.0, + "32": 1784014336.0, + "33": 1784014336.0, + "34": 1784014336.0, + "35": 1784014336.0, + "36": 1784014336.0, + "37": 1784014336.0, + "38": 1784014336.0, + "39": 1784014336.0, + "40": 1784014336.0, + "41": 1784014336.0, + "42": 1784014336.0, + "43": 1784014336.0, + "44": 1784014336.0, + "45": 1784014336.0, + "46": 1784014336.0, + "47": 1784014336.0, + "48": 1784014336.0, + "49": 1784014336.0, + "50": 1784014336.0, + "51": 1784014336.0, + "52": 1784014336.0, + "53": 1784014336.0, + "54": 1784014336.0, + "55": 1784014336.0, + "56": 1784014336.0, + "57": 1784014336.0, + "58": 1784014336.0, + "59": 1784014336.0, + "60": 1784014336.0, + "61": 1784014336.0, + "62": 1784014336.0, + "63": 1784014336.0, + "64": 1784014336.0, + "65": 1784014336.0, + "66": 1784014336.0, + "67": 1784014336.0, + "68": 1784014336.0, + "69": 1784014336.0, + "70": 1784014336.0, + "71": 1784014336.0, + "72": 1784014336.0, + "73": 1784014336.0, + "74": 1784014336.0, + "75": 1784014336.0, + "76": 1784014336.0, + "77": 1784014336.0, + "78": 1784014336.0, + "79": 1784014336.0, + "80": 1784014336.0, + "81": 1784014336.0, + "82": 1784014336.0, + "83": 1784014336.0, + "84": 1784014336.0, + "85": 1784014336.0, + "86": 1784014336.0, + "87": 1784014336.0, + "88": 1784014336.0, + "89": 1784014336.0, + "90": 1784014336.0, + "91": 1784014336.0, + "92": 1784014336.0, + "93": 1784014336.0, + "94": 1784014336.0, + "95": 1784014336.0, + "96": 1784014336.0, + "97": 1784014336.0, + "98": 1784014336.0, + "99": 1784014336.0, + "100": 1784014336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2365860864.0, + "2": 3108323328.0, + "3": 3108323328.0, + "4": 3108323328.0, + "5": 3108323328.0, + "6": 3108323328.0, + "7": 3108323328.0, + "8": 3108323328.0, + "9": 3108323328.0, + "10": 3108323328.0, + "11": 3108323328.0, + "12": 3108323328.0, + "13": 3108323328.0, + "14": 3108323328.0, + "15": 3108323328.0, + "16": 3108323328.0, + "17": 3108323328.0, + "18": 3108323328.0, + "19": 3108323328.0, + "20": 3108323328.0, + "21": 3108323328.0, + "22": 3108323328.0, + "23": 3108323328.0, + "24": 3108323328.0, + "25": 3108323328.0, + "26": 3108323328.0, + "27": 3108323328.0, + "28": 3108323328.0, + "29": 3108323328.0, + "30": 3108323328.0, + "31": 3108323328.0, + "32": 3108323328.0, + "33": 3108323328.0, + "34": 3108323328.0, + "35": 3108323328.0, + "36": 3108323328.0, + "37": 3108323328.0, + "38": 3108323328.0, + "39": 3108323328.0, + "40": 3108323328.0, + "41": 3108323328.0, + "42": 3108323328.0, + "43": 3108323328.0, + "44": 3108323328.0, + "45": 3108323328.0, + "46": 3108323328.0, + "47": 3108323328.0, + "48": 3108323328.0, + "49": 3108323328.0, + "50": 3108323328.0, + "51": 3108323328.0, + "52": 3108323328.0, + "53": 3108323328.0, + "54": 3108323328.0, + "55": 3108323328.0, + "56": 3108323328.0, + "57": 3108842496.0, + "58": 3108842496.0, + "59": 3108842496.0, + "60": 3108842496.0, + "61": 3108842496.0, + "62": 3108842496.0, + "63": 3108842496.0, + "64": 3108842496.0, + "65": 3108842496.0, + "66": 3108842496.0, + "67": 3108842496.0, + "68": 3108842496.0, + "69": 3108842496.0, + "70": 3108842496.0, + "71": 3108842496.0, + "72": 3108842496.0, + "73": 3108842496.0, + "74": 3108842496.0, + "75": 3108844544.0, + "76": 3108844544.0, + "77": 3108844544.0, + "78": 3108844544.0, + "79": 3108844544.0, + "80": 3108844544.0, + "81": 3108844544.0, + "82": 3108844544.0, + "83": 3108844544.0, + "84": 3108844544.0, + "85": 3108844544.0, + "86": 3108844544.0, + "87": 3108844544.0, + "88": 3108844544.0, + "89": 3108844544.0, + "90": 3108844544.0, + "91": 3108844544.0, + "92": 3108844544.0, + "93": 3108844544.0, + "94": 3108844544.0, + "95": 3108844544.0, + "96": 3108844544.0, + "97": 3108844544.0, + "98": 3108844544.0, + "99": 3108844544.0, + "100": 3108844544.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.84806, + "2": 1.03522, + "3": 1.00793, + "4": 1.00939, + "5": 1.00929, + "6": 1.01517, + "7": 1.01009, + "8": 1.01561, + "9": 1.02131, + "10": 1.01787, + "11": 1.01149, + "12": 1.0128, + "13": 1.01358, + "14": 1.01768, + "15": 1.23565, + "16": 1.01096, + "17": 1.19479, + "18": 1.01674, + "19": 1.01808, + "20": 1.23016, + "21": 1.01908, + "22": 1.11536, + "23": 1.0888, + "24": 1.02965, + "25": 1.03972, + "26": 1.00766, + "27": 1.00981, + "28": 1.01339, + "29": 1.01801, + "30": 1.01655, + "31": 1.01796, + "32": 1.01286, + "33": 1.01823, + "34": 1.00604, + "35": 1.01493, + "36": 1.01106, + "37": 1.00783, + "38": 1.01573, + "39": 1.01525, + "40": 1.09842, + "41": 1.39919, + "42": 1.22658, + "43": 1.00841, + "44": 0.99932, + "45": 1.00156, + "46": 1.18473, + "47": 1.01528, + "48": 1.00768, + "49": 1.00498, + "50": 0.9957, + "51": 1.29149, + "52": 1.10051, + "53": 1.00264, + "54": 1.00531, + "55": 1.30558, + "56": 0.99836, + "57": 1.00645, + "58": 1.00413, + "59": 1.00106, + "60": 1.00076, + "61": 1.32205, + "62": 1.00795, + "63": 1.2523, + "64": 1.01369, + "65": 1.01151, + "66": 1.01484, + "67": 1.00831, + "68": 1.01849, + "69": 1.01821, + "70": 1.01316, + "71": 1.01068, + "72": 1.01792, + "73": 1.47417, + "74": 1.01143, + "75": 1.14077, + "76": 1.01286, + "77": 1.08819, + "78": 1.01005, + "79": 1.0069, + "80": 1.01196, + "81": 1.0882, + "82": 1.00417, + "83": 1.29479, + "84": 1.0044, + "85": 1.0103, + "86": 1.00862, + "87": 1.01863, + "88": 1.2549, + "89": 1.0075, + "90": 1.00874, + "91": 1.0111, + "92": 1.01049, + "93": 1.01084, + "94": 1.01043, + "95": 1.01246, + "96": 1.01317, + "97": 1.09821, + "98": 1.01406, + "99": 1.00578, + "100": 1.09442 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 0019ac97573..b5f4b597886 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, "50": 9.73004, + "51": 9.71503, + "52": 9.66484, + "53": 9.60935, + "54": 9.62735, "55": 9.61036, + "56": 9.61745, + "57": 9.56794, + "58": 9.52742, + "59": 9.51685, "60": 9.51873, + "61": 9.53147, + "62": 9.45024, + "63": 9.45733, + "64": 9.43455, "65": 9.4582, + "66": 9.43694, + "67": 9.39693, + "68": 9.36491, + "69": 9.40957, "70": 9.37605, + "71": 9.41735, + "72": 9.42581, + "73": 9.37614, + "74": 9.41544, "75": 9.37897, + "76": 9.28015, + "77": 9.32215, + "78": 9.35752, + "79": 9.32154, "80": 9.31496, + "81": 9.26776, + "82": 9.34189, + "83": 9.32163, + "84": 9.24791, "85": 9.35021, + "86": 9.22383, + "87": 9.30627, + "88": 9.29884, + "89": 9.22708, "90": 9.28475, + "91": 9.23116, + "92": 9.27477, + "93": 9.1922, + "94": 9.23984, "95": 9.27996, + "96": 9.17534, + "97": 9.21892, + "98": 9.1719, + "99": 9.1646, "100": 9.14809 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, "50": 2145.0, + "51": 1964.0, + "52": 2437.0, + "53": 3823.0, + "54": 3427.0, "55": 3392.0, + "56": 4421.0, + "57": 4003.0, + "58": 4224.0, + "59": 1816.0, "60": 2520.0, + "61": 2106.0, + "62": 4011.0, + "63": 3637.0, + "64": 4375.0, "65": 3080.0, + "66": 1753.0, + "67": 1913.0, + "68": 4407.0, + "69": 4475.0, "70": 4419.0, + "71": 2152.0, + "72": 4399.0, + "73": 4134.0, + "74": 3315.0, "75": 4815.0, + "76": 2322.0, + "77": 5019.0, + "78": 4171.0, + "79": 2788.0, "80": 3831.0, + "81": 3411.0, + "82": 3004.0, + "83": 5145.0, + "84": 4399.0, "85": 4295.0, + "86": 3410.0, + "87": 4880.0, + "88": 3350.0, + "89": 4659.0, "90": 4370.0, + "91": 4273.0, + "92": 3325.0, + "93": 5509.0, + "94": 3804.0, "95": 4711.0, + "96": 3631.0, + "97": 3774.0, + "98": 4477.0, + "99": 4459.0, "100": 3220.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, "50": 1767237120.0, + "51": 1767237120.0, + "52": 1767237120.0, + "53": 1767237120.0, + "54": 1767237120.0, "55": 1767237120.0, + "56": 1767237120.0, + "57": 1767237120.0, + "58": 1767237120.0, + "59": 1767237120.0, "60": 1767237120.0, + "61": 1767237120.0, + "62": 1767237120.0, + "63": 1767237120.0, + "64": 1767237120.0, "65": 1767237120.0, + "66": 1767237120.0, + "67": 1767237120.0, + "68": 1767237120.0, + "69": 1767237120.0, "70": 1767237120.0, + "71": 1767237120.0, + "72": 1767237120.0, + "73": 1767237120.0, + "74": 1767237120.0, "75": 1767237120.0, + "76": 1767237120.0, + "77": 1767237120.0, + "78": 1767237120.0, + "79": 1767237120.0, "80": 1767237120.0, + "81": 1767237120.0, + "82": 1767237120.0, + "83": 1767237120.0, + "84": 1767237120.0, "85": 1767237120.0, + "86": 1767237120.0, + "87": 1767237120.0, + "88": 1767237120.0, + "89": 1767237120.0, "90": 1767237120.0, + "91": 1767237120.0, + "92": 1767237120.0, + "93": 1767237120.0, + "94": 1767237120.0, "95": 1767237120.0, + "96": 1767237120.0, + "97": 1767237120.0, + "98": 1767237120.0, + "99": 1767237120.0, "100": 1767237120.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, "50": 3079487488.0, + "51": 3079487488.0, + "52": 3079487488.0, + "53": 3079487488.0, + "54": 3079487488.0, "55": 3079487488.0, + "56": 3079487488.0, + "57": 3079487488.0, + "58": 3079487488.0, + "59": 3079487488.0, "60": 3079487488.0, + "61": 3079487488.0, + "62": 3079487488.0, + "63": 3079487488.0, + "64": 3079487488.0, "65": 3079487488.0, + "66": 3079487488.0, + "67": 3079487488.0, + "68": 3079487488.0, + "69": 3079487488.0, "70": 3079487488.0, + "71": 3079487488.0, + "72": 3079487488.0, + "73": 3079487488.0, + "74": 3079487488.0, "75": 3079487488.0, + "76": 3079487488.0, + "77": 3079487488.0, + "78": 3079487488.0, + "79": 3079487488.0, "80": 3079487488.0, + "81": 3079487488.0, + "82": 3079487488.0, + "83": 3079487488.0, + "84": 3079487488.0, "85": 3079487488.0, + "86": 3079487488.0, + "87": 3079487488.0, + "88": 3079487488.0, + "89": 3079487488.0, "90": 3079487488.0, + "91": 3079487488.0, + "92": 3079487488.0, + "93": 3079487488.0, + "94": 3079487488.0, "95": 3079487488.0, + "96": 3079487488.0, + "97": 3079487488.0, + "98": 3079487488.0, + "99": 3079487488.0, "100": 3079487488.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 12.8928, - "5": 0.79082, - "10": 0.75815, - "15": 0.75209, - "20": 0.9959, - "25": 0.75483, - "30": 0.74868, - "35": 0.75419, - "40": 0.75497, - "45": 0.9028, - "50": 0.80341, - "55": 1.06556, - "60": 0.72403, - "65": 0.72429, - "70": 1.04312, - "75": 1.09577, - "80": 0.77413, - "85": 0.72501, - "90": 0.72387, - "95": 0.72312, - "100": 0.80268 + "1": 11.88602, + "2": 0.95024, + "3": 0.88873, + "4": 0.84081, + "5": 0.8407, + "6": 0.841, + "7": 0.83666, + "8": 0.83819, + "9": 0.83577, + "10": 0.83982, + "11": 0.83346, + "12": 0.8683, + "13": 0.84255, + "14": 0.83676, + "15": 1.08071, + "16": 1.25785, + "17": 0.83186, + "18": 0.8423, + "19": 0.84907, + "20": 0.84641, + "21": 0.84182, + "22": 1.26058, + "23": 0.86142, + "24": 0.84798, + "25": 0.84097, + "26": 0.84232, + "27": 0.85483, + "28": 0.85596, + "29": 0.85197, + "30": 0.85702, + "31": 0.85002, + "32": 0.85132, + "33": 0.85438, + "34": 0.86588, + "35": 0.87207, + "36": 0.85768, + "37": 0.87379, + "38": 0.85134, + "39": 0.8537, + "40": 0.84912, + "41": 0.85397, + "42": 0.9623, + "43": 1.06611, + "44": 0.98659, + "45": 1.18823, + "46": 0.86085, + "47": 0.85574, + "48": 0.8596, + "49": 0.97573, + "50": 0.95882, + "51": 0.86517, + "52": 0.85872, + "53": 0.86263, + "54": 0.86436, + "55": 0.89018, + "56": 0.8674, + "57": 0.86176, + "58": 0.85395, + "59": 1.16789, + "60": 0.85822, + "61": 1.20441, + "62": 0.85426, + "63": 0.85652, + "64": 0.85392, + "65": 0.86218, + "66": 0.88112, + "67": 1.16257, + "68": 0.85308, + "69": 1.00689, + "70": 0.86168, + "71": 1.01898, + "72": 1.007, + "73": 1.32547, + "74": 0.87953, + "75": 0.86331, + "76": 1.21865, + "77": 0.97064, + "78": 0.86068, + "79": 0.97841, + "80": 0.87282, + "81": 0.87319, + "82": 0.86404, + "83": 0.85854, + "84": 0.86686, + "85": 1.10394, + "86": 0.88271, + "87": 0.88117, + "88": 0.86213, + "89": 0.86328, + "90": 0.86472, + "91": 0.86372, + "92": 0.86414, + "93": 0.86268, + "94": 0.86412, + "95": 0.86343, + "96": 0.86012, + "97": 1.00046, + "98": 1.16876, + "99": 0.86021, + "100": 0.86224 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..1c7c359e92d --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, + "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, + "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, + "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, + "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, + "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, + "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, + "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, + "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, + "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, + "50": 9.73004, + "51": 9.71503, + "52": 9.66484, + "53": 9.60935, + "54": 9.62735, + "55": 9.61036, + "56": 9.61745, + "57": 9.56794, + "58": 9.52742, + "59": 9.51685, + "60": 9.51873, + "61": 9.53147, + "62": 9.45024, + "63": 9.45733, + "64": 9.43455, + "65": 9.4582, + "66": 9.43694, + "67": 9.39693, + "68": 9.36491, + "69": 9.40957, + "70": 9.37605, + "71": 9.41735, + "72": 9.42581, + "73": 9.37614, + "74": 9.41544, + "75": 9.37897, + "76": 9.28015, + "77": 9.32215, + "78": 9.35752, + "79": 9.32154, + "80": 9.31496, + "81": 9.26776, + "82": 9.34189, + "83": 9.32163, + "84": 9.24791, + "85": 9.35021, + "86": 9.22383, + "87": 9.30627, + "88": 9.29884, + "89": 9.22708, + "90": 9.28475, + "91": 9.23116, + "92": 9.27477, + "93": 9.1922, + "94": 9.23984, + "95": 9.27996, + "96": 9.17534, + "97": 9.21892, + "98": 9.1719, + "99": 9.1646, + "100": 9.14809 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, + "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, + "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, + "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, + "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, + "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, + "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, + "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, + "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, + "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, + "50": 2145.0, + "51": 1964.0, + "52": 2437.0, + "53": 3823.0, + "54": 3427.0, + "55": 3392.0, + "56": 4421.0, + "57": 4003.0, + "58": 4224.0, + "59": 1816.0, + "60": 2520.0, + "61": 2106.0, + "62": 4011.0, + "63": 3637.0, + "64": 4375.0, + "65": 3080.0, + "66": 1753.0, + "67": 1913.0, + "68": 4407.0, + "69": 4475.0, + "70": 4419.0, + "71": 2152.0, + "72": 4399.0, + "73": 4134.0, + "74": 3315.0, + "75": 4815.0, + "76": 2322.0, + "77": 5019.0, + "78": 4171.0, + "79": 2788.0, + "80": 3831.0, + "81": 3411.0, + "82": 3004.0, + "83": 5145.0, + "84": 4399.0, + "85": 4295.0, + "86": 3410.0, + "87": 4880.0, + "88": 3350.0, + "89": 4659.0, + "90": 4370.0, + "91": 4273.0, + "92": 3325.0, + "93": 5509.0, + "94": 3804.0, + "95": 4711.0, + "96": 3631.0, + "97": 3774.0, + "98": 4477.0, + "99": 4459.0, + "100": 3220.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, + "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, + "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, + "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, + "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, + "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, + "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, + "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, + "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, + "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, + "50": 1767237120.0, + "51": 1767237120.0, + "52": 1767237120.0, + "53": 1767237120.0, + "54": 1767237120.0, + "55": 1767237120.0, + "56": 1767237120.0, + "57": 1767237120.0, + "58": 1767237120.0, + "59": 1767237120.0, + "60": 1767237120.0, + "61": 1767237120.0, + "62": 1767237120.0, + "63": 1767237120.0, + "64": 1767237120.0, + "65": 1767237120.0, + "66": 1767237120.0, + "67": 1767237120.0, + "68": 1767237120.0, + "69": 1767237120.0, + "70": 1767237120.0, + "71": 1767237120.0, + "72": 1767237120.0, + "73": 1767237120.0, + "74": 1767237120.0, + "75": 1767237120.0, + "76": 1767237120.0, + "77": 1767237120.0, + "78": 1767237120.0, + "79": 1767237120.0, + "80": 1767237120.0, + "81": 1767237120.0, + "82": 1767237120.0, + "83": 1767237120.0, + "84": 1767237120.0, + "85": 1767237120.0, + "86": 1767237120.0, + "87": 1767237120.0, + "88": 1767237120.0, + "89": 1767237120.0, + "90": 1767237120.0, + "91": 1767237120.0, + "92": 1767237120.0, + "93": 1767237120.0, + "94": 1767237120.0, + "95": 1767237120.0, + "96": 1767237120.0, + "97": 1767237120.0, + "98": 1767237120.0, + "99": 1767237120.0, + "100": 1767237120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, + "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, + "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, + "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, + "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, + "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, + "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, + "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, + "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, + "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, + "50": 3079487488.0, + "51": 3079487488.0, + "52": 3079487488.0, + "53": 3079487488.0, + "54": 3079487488.0, + "55": 3079487488.0, + "56": 3079487488.0, + "57": 3079487488.0, + "58": 3079487488.0, + "59": 3079487488.0, + "60": 3079487488.0, + "61": 3079487488.0, + "62": 3079487488.0, + "63": 3079487488.0, + "64": 3079487488.0, + "65": 3079487488.0, + "66": 3079487488.0, + "67": 3079487488.0, + "68": 3079487488.0, + "69": 3079487488.0, + "70": 3079487488.0, + "71": 3079487488.0, + "72": 3079487488.0, + "73": 3079487488.0, + "74": 3079487488.0, + "75": 3079487488.0, + "76": 3079487488.0, + "77": 3079487488.0, + "78": 3079487488.0, + "79": 3079487488.0, + "80": 3079487488.0, + "81": 3079487488.0, + "82": 3079487488.0, + "83": 3079487488.0, + "84": 3079487488.0, + "85": 3079487488.0, + "86": 3079487488.0, + "87": 3079487488.0, + "88": 3079487488.0, + "89": 3079487488.0, + "90": 3079487488.0, + "91": 3079487488.0, + "92": 3079487488.0, + "93": 3079487488.0, + "94": 3079487488.0, + "95": 3079487488.0, + "96": 3079487488.0, + "97": 3079487488.0, + "98": 3079487488.0, + "99": 3079487488.0, + "100": 3079487488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.43441, + "2": 0.78136, + "3": 0.7462, + "4": 0.7121, + "5": 0.71539, + "6": 0.71675, + "7": 0.71163, + "8": 0.71648, + "9": 0.72398, + "10": 0.71927, + "11": 0.80592, + "12": 0.70909, + "13": 0.71547, + "14": 0.71572, + "15": 0.70839, + "16": 0.71281, + "17": 0.71709, + "18": 0.70875, + "19": 0.71455, + "20": 0.989, + "21": 0.98319, + "22": 0.95078, + "23": 0.94171, + "24": 0.71144, + "25": 0.70971, + "26": 0.71131, + "27": 0.70864, + "28": 0.72406, + "29": 0.71861, + "30": 0.71986, + "31": 0.71003, + "32": 0.70772, + "33": 0.71322, + "34": 0.70935, + "35": 0.71103, + "36": 0.70629, + "37": 0.71354, + "38": 0.71466, + "39": 0.71799, + "40": 0.71635, + "41": 0.72804, + "42": 0.71281, + "43": 0.7097, + "44": 0.71324, + "45": 0.70979, + "46": 0.7111, + "47": 0.71491, + "48": 1.05833, + "49": 0.89093, + "50": 0.8836, + "51": 0.72864, + "52": 0.72146, + "53": 0.72243, + "54": 0.71938, + "55": 0.71917, + "56": 0.71867, + "57": 0.72048, + "58": 0.72484, + "59": 0.72197, + "60": 0.7218, + "61": 0.728, + "62": 0.71944, + "63": 0.73343, + "64": 5.90055, + "65": 5.53828, + "66": 0.91077, + "67": 1.09715, + "68": 0.70698, + "69": 0.70556, + "70": 1.00845, + "71": 0.71076, + "72": 0.71777, + "73": 0.71659, + "74": 0.71156, + "75": 0.8128, + "76": 0.7115, + "77": 0.97488, + "78": 0.89177, + "79": 0.87098, + "80": 1.01456, + "81": 0.81896, + "82": 0.71793, + "83": 1.04586, + "84": 0.72118, + "85": 1.02779, + "86": 0.72077, + "87": 0.71418, + "88": 0.71356, + "89": 0.74602, + "90": 0.77996, + "91": 1.05945, + "92": 0.72043, + "93": 0.72396, + "94": 0.72365, + "95": 0.72843, + "96": 0.71516, + "97": 0.71321, + "98": 0.72468, + "99": 0.72441, + "100": 0.71951 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..27a34e32198 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.4837, + "2": 10.48435, + "3": 10.48251, + "4": 10.48303, + "5": 10.47647, + "6": 10.48423, + "7": 10.48457, + "8": 10.48837, + "9": 10.49003, + "10": 10.47255, + "11": 10.47245, + "12": 10.4828, + "13": 10.47855, + "14": 10.45162, + "15": 10.47936, + "16": 10.45364, + "17": 10.45143, + "18": 10.46239, + "19": 10.44136, + "20": 10.45438, + "21": 10.43469, + "22": 10.40587, + "23": 10.39982, + "24": 10.37585, + "25": 10.38173, + "26": 10.35154, + "27": 10.35401, + "28": 10.3497, + "29": 10.28714, + "30": 10.21194, + "31": 10.17274, + "32": 10.13439, + "33": 10.14753, + "34": 10.10759, + "35": 10.10592, + "36": 10.08756, + "37": 10.08177, + "38": 10.07257, + "39": 10.0013, + "40": 9.9816, + "41": 9.92549, + "42": 9.87529, + "43": 9.88742, + "44": 9.80641, + "45": 9.82342, + "46": 9.73815, + "47": 9.74831, + "48": 9.71619, + "49": 9.74504, + "50": 9.73004, + "51": 9.71503, + "52": 9.66484, + "53": 9.60935, + "54": 9.62735, + "55": 9.61036, + "56": 9.61745, + "57": 9.56794, + "58": 9.52742, + "59": 9.51685, + "60": 9.51873, + "61": 9.53147, + "62": 9.45024, + "63": 9.45733, + "64": 9.43455, + "65": 9.4582, + "66": 9.43694, + "67": 9.39693, + "68": 9.36491, + "69": 9.40957, + "70": 9.37605, + "71": 9.41735, + "72": 9.42581, + "73": 9.37614, + "74": 9.41544, + "75": 9.37897, + "76": 9.28015, + "77": 9.32215, + "78": 9.35752, + "79": 9.32154, + "80": 9.31496, + "81": 9.26776, + "82": 9.34189, + "83": 9.32163, + "84": 9.24791, + "85": 9.35021, + "86": 9.22383, + "87": 9.30627, + "88": 9.29884, + "89": 9.22708, + "90": 9.28475, + "91": 9.23116, + "92": 9.27477, + "93": 9.1922, + "94": 9.23984, + "95": 9.27996, + "96": 9.17534, + "97": 9.21892, + "98": 9.1719, + "99": 9.1646, + "100": 9.14809 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2554.0, + "2": 1919.0, + "3": 1521.0, + "4": 2330.0, + "5": 2010.0, + "6": 1725.0, + "7": 2803.0, + "8": 2435.0, + "9": 2286.0, + "10": 2570.0, + "11": 2438.0, + "12": 1829.0, + "13": 2332.0, + "14": 2832.0, + "15": 2008.0, + "16": 2659.0, + "17": 2454.0, + "18": 2500.0, + "19": 2588.0, + "20": 2834.0, + "21": 2042.0, + "22": 3037.0, + "23": 2702.0, + "24": 2700.0, + "25": 2568.0, + "26": 2896.0, + "27": 2735.0, + "28": 2699.0, + "29": 2548.0, + "30": 2843.0, + "31": 2160.0, + "32": 2458.0, + "33": 2130.0, + "34": 2517.0, + "35": 2597.0, + "36": 3001.0, + "37": 3305.0, + "38": 2682.0, + "39": 2805.0, + "40": 3425.0, + "41": 1812.0, + "42": 1481.0, + "43": 1726.0, + "44": 2575.0, + "45": 3438.0, + "46": 2960.0, + "47": 2792.0, + "48": 3107.0, + "49": 2854.0, + "50": 2145.0, + "51": 1964.0, + "52": 2437.0, + "53": 3823.0, + "54": 3427.0, + "55": 3392.0, + "56": 4421.0, + "57": 4003.0, + "58": 4224.0, + "59": 1816.0, + "60": 2520.0, + "61": 2106.0, + "62": 4011.0, + "63": 3637.0, + "64": 4375.0, + "65": 3080.0, + "66": 1753.0, + "67": 1913.0, + "68": 4407.0, + "69": 4475.0, + "70": 4419.0, + "71": 2152.0, + "72": 4399.0, + "73": 4134.0, + "74": 3315.0, + "75": 4815.0, + "76": 2322.0, + "77": 5019.0, + "78": 4171.0, + "79": 2788.0, + "80": 3831.0, + "81": 3411.0, + "82": 3004.0, + "83": 5145.0, + "84": 4399.0, + "85": 4295.0, + "86": 3410.0, + "87": 4880.0, + "88": 3350.0, + "89": 4659.0, + "90": 4370.0, + "91": 4273.0, + "92": 3325.0, + "93": 5509.0, + "94": 3804.0, + "95": 4711.0, + "96": 3631.0, + "97": 3774.0, + "98": 4477.0, + "99": 4459.0, + "100": 3220.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1767237120.0, + "2": 1767237120.0, + "3": 1767237120.0, + "4": 1767237120.0, + "5": 1767237120.0, + "6": 1767237120.0, + "7": 1767237120.0, + "8": 1767237120.0, + "9": 1767237120.0, + "10": 1767237120.0, + "11": 1767237120.0, + "12": 1767237120.0, + "13": 1767237120.0, + "14": 1767237120.0, + "15": 1767237120.0, + "16": 1767237120.0, + "17": 1767237120.0, + "18": 1767237120.0, + "19": 1767237120.0, + "20": 1767237120.0, + "21": 1767237120.0, + "22": 1767237120.0, + "23": 1767237120.0, + "24": 1767237120.0, + "25": 1767237120.0, + "26": 1767237120.0, + "27": 1767237120.0, + "28": 1767237120.0, + "29": 1767237120.0, + "30": 1767237120.0, + "31": 1767237120.0, + "32": 1767237120.0, + "33": 1767237120.0, + "34": 1767237120.0, + "35": 1767237120.0, + "36": 1767237120.0, + "37": 1767237120.0, + "38": 1767237120.0, + "39": 1767237120.0, + "40": 1767237120.0, + "41": 1767237120.0, + "42": 1767237120.0, + "43": 1767237120.0, + "44": 1767237120.0, + "45": 1767237120.0, + "46": 1767237120.0, + "47": 1767237120.0, + "48": 1767237120.0, + "49": 1767237120.0, + "50": 1767237120.0, + "51": 1767237120.0, + "52": 1767237120.0, + "53": 1767237120.0, + "54": 1767237120.0, + "55": 1767237120.0, + "56": 1767237120.0, + "57": 1767237120.0, + "58": 1767237120.0, + "59": 1767237120.0, + "60": 1767237120.0, + "61": 1767237120.0, + "62": 1767237120.0, + "63": 1767237120.0, + "64": 1767237120.0, + "65": 1767237120.0, + "66": 1767237120.0, + "67": 1767237120.0, + "68": 1767237120.0, + "69": 1767237120.0, + "70": 1767237120.0, + "71": 1767237120.0, + "72": 1767237120.0, + "73": 1767237120.0, + "74": 1767237120.0, + "75": 1767237120.0, + "76": 1767237120.0, + "77": 1767237120.0, + "78": 1767237120.0, + "79": 1767237120.0, + "80": 1767237120.0, + "81": 1767237120.0, + "82": 1767237120.0, + "83": 1767237120.0, + "84": 1767237120.0, + "85": 1767237120.0, + "86": 1767237120.0, + "87": 1767237120.0, + "88": 1767237120.0, + "89": 1767237120.0, + "90": 1767237120.0, + "91": 1767237120.0, + "92": 1767237120.0, + "93": 1767237120.0, + "94": 1767237120.0, + "95": 1767237120.0, + "96": 1767237120.0, + "97": 1767237120.0, + "98": 1767237120.0, + "99": 1767237120.0, + "100": 1767237120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2336500736.0, + "2": 3079487488.0, + "3": 3079487488.0, + "4": 3079487488.0, + "5": 3079487488.0, + "6": 3079487488.0, + "7": 3079487488.0, + "8": 3079487488.0, + "9": 3079487488.0, + "10": 3079487488.0, + "11": 3079487488.0, + "12": 3079487488.0, + "13": 3079487488.0, + "14": 3079487488.0, + "15": 3079487488.0, + "16": 3079487488.0, + "17": 3079487488.0, + "18": 3079487488.0, + "19": 3079487488.0, + "20": 3079487488.0, + "21": 3079487488.0, + "22": 3079487488.0, + "23": 3079487488.0, + "24": 3079487488.0, + "25": 3079487488.0, + "26": 3079487488.0, + "27": 3079487488.0, + "28": 3079487488.0, + "29": 3079487488.0, + "30": 3079487488.0, + "31": 3079487488.0, + "32": 3079487488.0, + "33": 3079487488.0, + "34": 3079487488.0, + "35": 3079487488.0, + "36": 3079487488.0, + "37": 3079487488.0, + "38": 3079487488.0, + "39": 3079487488.0, + "40": 3079487488.0, + "41": 3079487488.0, + "42": 3079487488.0, + "43": 3079487488.0, + "44": 3079487488.0, + "45": 3079487488.0, + "46": 3079487488.0, + "47": 3079487488.0, + "48": 3079487488.0, + "49": 3079487488.0, + "50": 3079487488.0, + "51": 3079487488.0, + "52": 3079487488.0, + "53": 3079487488.0, + "54": 3079487488.0, + "55": 3079487488.0, + "56": 3079487488.0, + "57": 3079487488.0, + "58": 3079487488.0, + "59": 3079487488.0, + "60": 3079487488.0, + "61": 3079487488.0, + "62": 3079487488.0, + "63": 3079487488.0, + "64": 3079487488.0, + "65": 3079487488.0, + "66": 3079487488.0, + "67": 3079487488.0, + "68": 3079487488.0, + "69": 3079487488.0, + "70": 3079487488.0, + "71": 3079487488.0, + "72": 3079487488.0, + "73": 3079487488.0, + "74": 3079487488.0, + "75": 3079487488.0, + "76": 3079487488.0, + "77": 3079487488.0, + "78": 3079487488.0, + "79": 3079487488.0, + "80": 3079487488.0, + "81": 3079487488.0, + "82": 3079487488.0, + "83": 3079487488.0, + "84": 3079487488.0, + "85": 3079487488.0, + "86": 3079487488.0, + "87": 3079487488.0, + "88": 3079487488.0, + "89": 3079487488.0, + "90": 3079487488.0, + "91": 3079487488.0, + "92": 3079487488.0, + "93": 3079487488.0, + "94": 3079487488.0, + "95": 3079487488.0, + "96": 3079487488.0, + "97": 3079487488.0, + "98": 3079487488.0, + "99": 3079487488.0, + "100": 3079487488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.74907, + "2": 0.85881, + "3": 0.84325, + "4": 0.84358, + "5": 0.84379, + "6": 0.84251, + "7": 0.84123, + "8": 0.8499, + "9": 0.8999, + "10": 0.92522, + "11": 0.94116, + "12": 0.85793, + "13": 0.84568, + "14": 0.84264, + "15": 0.84084, + "16": 0.84084, + "17": 0.83843, + "18": 0.8412, + "19": 0.84178, + "20": 1.1044, + "21": 1.21871, + "22": 1.25946, + "23": 0.85008, + "24": 0.91404, + "25": 0.84787, + "26": 0.84792, + "27": 0.85174, + "28": 0.84996, + "29": 0.84337, + "30": 0.84498, + "31": 0.8486, + "32": 0.84203, + "33": 0.84451, + "34": 0.85648, + "35": 0.83537, + "36": 0.84205, + "37": 0.83563, + "38": 0.84541, + "39": 0.84231, + "40": 0.84639, + "41": 0.84365, + "42": 0.84512, + "43": 0.84437, + "44": 0.84299, + "45": 0.85866, + "46": 0.84237, + "47": 0.84617, + "48": 1.18328, + "49": 0.88875, + "50": 0.96388, + "51": 0.98149, + "52": 0.89905, + "53": 0.84382, + "54": 0.85382, + "55": 0.84338, + "56": 0.84282, + "57": 0.92404, + "58": 0.84627, + "59": 0.83811, + "60": 0.83802, + "61": 0.85109, + "62": 0.83231, + "63": 0.83505, + "64": 1.15842, + "65": 1.1324, + "66": 0.83972, + "67": 0.82896, + "68": 0.82596, + "69": 0.83118, + "70": 0.84229, + "71": 0.8328, + "72": 0.82924, + "73": 0.83555, + "74": 0.83422, + "75": 0.90796, + "76": 0.85077, + "77": 1.07568, + "78": 1.30938, + "79": 1.12037, + "80": 0.82751, + "81": 0.83544, + "82": 0.88688, + "83": 1.16362, + "84": 0.83207, + "85": 0.83917, + "86": 1.14681, + "87": 1.17025, + "88": 0.82985, + "89": 0.82492, + "90": 0.90586, + "91": 0.83299, + "92": 0.83139, + "93": 0.83405, + "94": 0.83756, + "95": 0.83351, + "96": 0.83063, + "97": 0.83499, + "98": 0.84617, + "99": 0.83623, + "100": 0.84014 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json index 7a7d567ec46..2219c242a8b 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.47723, + "2": 10.47576, + "3": 10.46809, + "4": 10.47326, "5": 10.47148, + "6": 10.46049, + "7": 10.46357, + "8": 10.47334, + "9": 10.48063, "10": 10.46319, + "11": 10.47102, + "12": 10.45502, + "13": 10.44665, + "14": 10.451, "15": 10.48846, + "16": 10.4509, + "17": 10.44648, + "18": 10.44272, + "19": 10.43057, "20": 10.44534, + "21": 10.41778, + "22": 10.38667, + "23": 10.39322, + "24": 10.37847, "25": 10.35474, + "26": 10.35955, + "27": 10.34527, + "28": 10.33539, + "29": 10.25416, "30": 10.23011, + "31": 10.14092, + "32": 10.13601, + "33": 10.13944, + "34": 10.11377, "35": 10.0888, + "36": 10.09247, + "37": 10.06836, + "38": 10.04664, + "39": 9.97584, "40": 9.93781, + "41": 9.90867, + "42": 9.84873, + "43": 9.8577, + "44": 9.79259, "45": 9.8035, + "46": 9.7029, + "47": 9.73432, + "48": 9.70106, + "49": 9.69981, "50": 9.70258 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2137.0, + "2": 1618.0, + "3": 1561.0, + "4": 1871.0, "5": 1983.0, + "6": 1565.0, + "7": 2779.0, + "8": 2108.0, + "9": 2008.0, "10": 2086.0, + "11": 2534.0, + "12": 1686.0, + "13": 2120.0, + "14": 2814.0, "15": 1735.0, + "16": 2535.0, + "17": 2409.0, + "18": 2345.0, + "19": 2374.0, "20": 2739.0, + "21": 2030.0, + "22": 2819.0, + "23": 2763.0, + "24": 2731.0, "25": 2429.0, + "26": 2817.0, + "27": 2944.0, + "28": 2741.0, + "29": 2639.0, "30": 2723.0, + "31": 2158.0, + "32": 2242.0, + "33": 2046.0, + "34": 2139.0, "35": 2492.0, + "36": 2641.0, + "37": 2853.0, + "38": 2705.0, + "39": 2807.0, "40": 3333.0, + "41": 1762.0, + "42": 1410.0, + "43": 1558.0, + "44": 2384.0, "45": 3170.0, + "46": 2664.0, + "47": 2641.0, + "48": 3490.0, + "49": 2928.0, "50": 2487.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3404871168.0, + "2": 3404871168.0, + "3": 3404871168.0, + "4": 3404871168.0, "5": 3404871168.0, + "6": 3404871168.0, + "7": 3404871168.0, + "8": 3404871168.0, + "9": 3404871168.0, "10": 3404871168.0, + "11": 3404871168.0, + "12": 3404871168.0, + "13": 3404871168.0, + "14": 3404871168.0, "15": 3404871168.0, + "16": 3404871168.0, + "17": 3404871168.0, + "18": 3404871168.0, + "19": 3404871168.0, "20": 3404871168.0, + "21": 3404871168.0, + "22": 3404871168.0, + "23": 3404871168.0, + "24": 3404871168.0, "25": 3404871168.0, + "26": 3404871168.0, + "27": 3404871168.0, + "28": 3404871168.0, + "29": 3404871168.0, "30": 3404871168.0, + "31": 3404871168.0, + "32": 3404871168.0, + "33": 3404871168.0, + "34": 3404871168.0, "35": 3404871168.0, + "36": 3404871168.0, + "37": 3404871168.0, + "38": 3404871168.0, + "39": 3404871168.0, "40": 3404871168.0, + "41": 3404871168.0, + "42": 3404871168.0, + "43": 3404871168.0, + "44": 3404871168.0, "45": 3404871168.0, + "46": 3404871168.0, + "47": 3404871168.0, + "48": 3404871168.0, + "49": 3404871168.0, "50": 3404871168.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4194526208.0, + "2": 5660965888.0, + "3": 5660965888.0, + "4": 5660965888.0, "5": 5660965888.0, + "6": 5660965888.0, + "7": 5660965888.0, + "8": 5660965888.0, + "9": 5660965888.0, "10": 5660965888.0, + "11": 5660965888.0, + "12": 5660965888.0, + "13": 5660965888.0, + "14": 5660965888.0, "15": 5660965888.0, + "16": 5660965888.0, + "17": 5660965888.0, + "18": 5660965888.0, + "19": 5660965888.0, "20": 5660965888.0, + "21": 5660965888.0, + "22": 5660965888.0, + "23": 5660965888.0, + "24": 5660965888.0, "25": 5660965888.0, + "26": 5660965888.0, + "27": 5660965888.0, + "28": 5660965888.0, + "29": 5660965888.0, "30": 5660965888.0, + "31": 5660965888.0, + "32": 5660965888.0, + "33": 5660965888.0, + "34": 5660965888.0, "35": 5660965888.0, + "36": 5660965888.0, + "37": 5660965888.0, + "38": 5660965888.0, + "39": 5660965888.0, "40": 5660965888.0, + "41": 5660965888.0, + "42": 5660965888.0, + "43": 5660965888.0, + "44": 5660965888.0, "45": 5660965888.0, + "46": 5660965888.0, + "47": 5660965888.0, + "48": 5660965888.0, + "49": 5660965888.0, "50": 5660965888.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 10.04018, - "5": 0.49888, - "10": 0.45046, - "15": 0.45352, - "20": 0.46632, - "25": 0.44805, - "30": 0.58321, - "35": 0.60604, - "40": 0.44629, - "45": 0.75157, - "50": 0.44163 + "1": 10.41177, + "2": 0.63219, + "3": 0.53615, + "4": 0.53244, + "5": 0.53041, + "6": 0.53364, + "7": 0.53797, + "8": 0.52807, + "9": 0.53172, + "10": 0.53116, + "11": 0.52906, + "12": 0.53113, + "13": 0.52796, + "14": 0.52974, + "15": 0.52875, + "16": 0.52005, + "17": 0.51948, + "18": 0.52008, + "19": 0.52456, + "20": 0.52593, + "21": 0.52988, + "22": 0.52281, + "23": 0.51971, + "24": 0.52235, + "25": 0.54145, + "26": 0.52876, + "27": 0.51926, + "28": 0.51381, + "29": 0.51526, + "30": 0.51632, + "31": 0.52532, + "32": 0.61496, + "33": 0.59949, + "34": 0.52069, + "35": 0.52649, + "36": 0.66485, + "37": 0.52497, + "38": 0.52464, + "39": 0.76801, + "40": 0.52465, + "41": 0.69091, + "42": 0.74369, + "43": 0.5242, + "44": 0.75825, + "45": 0.68331, + "46": 0.75831, + "47": 0.51724, + "48": 0.51305, + "49": 0.51686, + "50": 0.52176 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..8ff12f47d08 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.47723, + "2": 10.47576, + "3": 10.46809, + "4": 10.47326, + "5": 10.47148, + "6": 10.46049, + "7": 10.46357, + "8": 10.47334, + "9": 10.48063, + "10": 10.46319, + "11": 10.47102, + "12": 10.45502, + "13": 10.44665, + "14": 10.451, + "15": 10.48846, + "16": 10.4509, + "17": 10.44648, + "18": 10.44272, + "19": 10.43057, + "20": 10.44534, + "21": 10.41778, + "22": 10.38667, + "23": 10.39322, + "24": 10.37847, + "25": 10.35474, + "26": 10.35955, + "27": 10.34527, + "28": 10.33539, + "29": 10.25416, + "30": 10.23011, + "31": 10.14092, + "32": 10.13601, + "33": 10.13944, + "34": 10.11377, + "35": 10.0888, + "36": 10.09247, + "37": 10.06836, + "38": 10.04664, + "39": 9.97584, + "40": 9.93781, + "41": 9.90867, + "42": 9.84873, + "43": 9.8577, + "44": 9.79259, + "45": 9.8035, + "46": 9.7029, + "47": 9.73432, + "48": 9.70106, + "49": 9.69981, + "50": 9.70258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2137.0, + "2": 1618.0, + "3": 1561.0, + "4": 1871.0, + "5": 1983.0, + "6": 1565.0, + "7": 2779.0, + "8": 2108.0, + "9": 2008.0, + "10": 2086.0, + "11": 2534.0, + "12": 1686.0, + "13": 2120.0, + "14": 2814.0, + "15": 1735.0, + "16": 2535.0, + "17": 2409.0, + "18": 2345.0, + "19": 2374.0, + "20": 2739.0, + "21": 2030.0, + "22": 2819.0, + "23": 2763.0, + "24": 2731.0, + "25": 2429.0, + "26": 2817.0, + "27": 2944.0, + "28": 2741.0, + "29": 2639.0, + "30": 2723.0, + "31": 2158.0, + "32": 2242.0, + "33": 2046.0, + "34": 2139.0, + "35": 2492.0, + "36": 2641.0, + "37": 2853.0, + "38": 2705.0, + "39": 2807.0, + "40": 3333.0, + "41": 1762.0, + "42": 1410.0, + "43": 1558.0, + "44": 2384.0, + "45": 3170.0, + "46": 2664.0, + "47": 2641.0, + "48": 3490.0, + "49": 2928.0, + "50": 2487.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3404871168.0, + "2": 3404871168.0, + "3": 3404871168.0, + "4": 3404871168.0, + "5": 3404871168.0, + "6": 3404871168.0, + "7": 3404871168.0, + "8": 3404871168.0, + "9": 3404871168.0, + "10": 3404871168.0, + "11": 3404871168.0, + "12": 3404871168.0, + "13": 3404871168.0, + "14": 3404871168.0, + "15": 3404871168.0, + "16": 3404871168.0, + "17": 3404871168.0, + "18": 3404871168.0, + "19": 3404871168.0, + "20": 3404871168.0, + "21": 3404871168.0, + "22": 3404871168.0, + "23": 3404871168.0, + "24": 3404871168.0, + "25": 3404871168.0, + "26": 3404871168.0, + "27": 3404871168.0, + "28": 3404871168.0, + "29": 3404871168.0, + "30": 3404871168.0, + "31": 3404871168.0, + "32": 3404871168.0, + "33": 3404871168.0, + "34": 3404871168.0, + "35": 3404871168.0, + "36": 3404871168.0, + "37": 3404871168.0, + "38": 3404871168.0, + "39": 3404871168.0, + "40": 3404871168.0, + "41": 3404871168.0, + "42": 3404871168.0, + "43": 3404871168.0, + "44": 3404871168.0, + "45": 3404871168.0, + "46": 3404871168.0, + "47": 3404871168.0, + "48": 3404871168.0, + "49": 3404871168.0, + "50": 3404871168.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4194526208.0, + "2": 5660965888.0, + "3": 5660965888.0, + "4": 5660965888.0, + "5": 5660965888.0, + "6": 5660965888.0, + "7": 5660965888.0, + "8": 5660965888.0, + "9": 5660965888.0, + "10": 5660965888.0, + "11": 5660965888.0, + "12": 5660965888.0, + "13": 5660965888.0, + "14": 5660965888.0, + "15": 5660965888.0, + "16": 5660965888.0, + "17": 5660965888.0, + "18": 5660965888.0, + "19": 5660965888.0, + "20": 5660965888.0, + "21": 5660965888.0, + "22": 5660965888.0, + "23": 5660965888.0, + "24": 5660965888.0, + "25": 5660965888.0, + "26": 5660965888.0, + "27": 5660965888.0, + "28": 5660965888.0, + "29": 5660965888.0, + "30": 5660965888.0, + "31": 5660965888.0, + "32": 5660965888.0, + "33": 5660965888.0, + "34": 5660965888.0, + "35": 5660965888.0, + "36": 5660965888.0, + "37": 5660965888.0, + "38": 5660965888.0, + "39": 5660965888.0, + "40": 5660965888.0, + "41": 5660965888.0, + "42": 5660965888.0, + "43": 5660965888.0, + "44": 5660965888.0, + "45": 5660965888.0, + "46": 5660965888.0, + "47": 5660965888.0, + "48": 5660965888.0, + "49": 5660965888.0, + "50": 5660965888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.13654, + "2": 0.5493, + "3": 0.46515, + "4": 0.45431, + "5": 0.46032, + "6": 0.45814, + "7": 0.45793, + "8": 0.46137, + "9": 0.46682, + "10": 0.46519, + "11": 0.46206, + "12": 0.46526, + "13": 0.46309, + "14": 0.46231, + "15": 0.47151, + "16": 0.4581, + "17": 0.4833, + "18": 0.47393, + "19": 0.48513, + "20": 0.47017, + "21": 0.47471, + "22": 0.46394, + "23": 0.46475, + "24": 0.46879, + "25": 0.46294, + "26": 0.46242, + "27": 0.4645, + "28": 0.4715, + "29": 0.46842, + "30": 0.46401, + "31": 0.96127, + "32": 0.4785, + "33": 0.62004, + "34": 0.4827, + "35": 0.47953, + "36": 0.48459, + "37": 0.48738, + "38": 0.49573, + "39": 0.58967, + "40": 0.79369, + "41": 0.46618, + "42": 0.72243, + "43": 0.63291, + "44": 0.62301, + "45": 0.68335, + "46": 0.48579, + "47": 0.46817, + "48": 0.46582, + "49": 0.46457, + "50": 0.46777 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..df02cb774f4 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.47723, + "2": 10.47576, + "3": 10.46809, + "4": 10.47326, + "5": 10.47148, + "6": 10.46049, + "7": 10.46357, + "8": 10.47334, + "9": 10.48063, + "10": 10.46319, + "11": 10.47102, + "12": 10.45502, + "13": 10.44665, + "14": 10.451, + "15": 10.48846, + "16": 10.4509, + "17": 10.44648, + "18": 10.44272, + "19": 10.43057, + "20": 10.44534, + "21": 10.41778, + "22": 10.38667, + "23": 10.39322, + "24": 10.37847, + "25": 10.35474, + "26": 10.35955, + "27": 10.34527, + "28": 10.33539, + "29": 10.25416, + "30": 10.23011, + "31": 10.14092, + "32": 10.13601, + "33": 10.13944, + "34": 10.11377, + "35": 10.0888, + "36": 10.09247, + "37": 10.06836, + "38": 10.04664, + "39": 9.97584, + "40": 9.93781, + "41": 9.90867, + "42": 9.84873, + "43": 9.8577, + "44": 9.79259, + "45": 9.8035, + "46": 9.7029, + "47": 9.73432, + "48": 9.70106, + "49": 9.69981, + "50": 9.70258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2137.0, + "2": 1618.0, + "3": 1561.0, + "4": 1871.0, + "5": 1983.0, + "6": 1565.0, + "7": 2779.0, + "8": 2108.0, + "9": 2008.0, + "10": 2086.0, + "11": 2534.0, + "12": 1686.0, + "13": 2120.0, + "14": 2814.0, + "15": 1735.0, + "16": 2535.0, + "17": 2409.0, + "18": 2345.0, + "19": 2374.0, + "20": 2739.0, + "21": 2030.0, + "22": 2819.0, + "23": 2763.0, + "24": 2731.0, + "25": 2429.0, + "26": 2817.0, + "27": 2944.0, + "28": 2741.0, + "29": 2639.0, + "30": 2723.0, + "31": 2158.0, + "32": 2242.0, + "33": 2046.0, + "34": 2139.0, + "35": 2492.0, + "36": 2641.0, + "37": 2853.0, + "38": 2705.0, + "39": 2807.0, + "40": 3333.0, + "41": 1762.0, + "42": 1410.0, + "43": 1558.0, + "44": 2384.0, + "45": 3170.0, + "46": 2664.0, + "47": 2641.0, + "48": 3490.0, + "49": 2928.0, + "50": 2487.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3404871168.0, + "2": 3404871168.0, + "3": 3404871168.0, + "4": 3404871168.0, + "5": 3404871168.0, + "6": 3404871168.0, + "7": 3404871168.0, + "8": 3404871168.0, + "9": 3404871168.0, + "10": 3404871168.0, + "11": 3404871168.0, + "12": 3404871168.0, + "13": 3404871168.0, + "14": 3404871168.0, + "15": 3404871168.0, + "16": 3404871168.0, + "17": 3404871168.0, + "18": 3404871168.0, + "19": 3404871168.0, + "20": 3404871168.0, + "21": 3404871168.0, + "22": 3404871168.0, + "23": 3404871168.0, + "24": 3404871168.0, + "25": 3404871168.0, + "26": 3404871168.0, + "27": 3404871168.0, + "28": 3404871168.0, + "29": 3404871168.0, + "30": 3404871168.0, + "31": 3404871168.0, + "32": 3404871168.0, + "33": 3404871168.0, + "34": 3404871168.0, + "35": 3404871168.0, + "36": 3404871168.0, + "37": 3404871168.0, + "38": 3404871168.0, + "39": 3404871168.0, + "40": 3404871168.0, + "41": 3404871168.0, + "42": 3404871168.0, + "43": 3404871168.0, + "44": 3404871168.0, + "45": 3404871168.0, + "46": 3404871168.0, + "47": 3404871168.0, + "48": 3404871168.0, + "49": 3404871168.0, + "50": 3404871168.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4194526208.0, + "2": 5660965888.0, + "3": 5660965888.0, + "4": 5660965888.0, + "5": 5660965888.0, + "6": 5660965888.0, + "7": 5660965888.0, + "8": 5660965888.0, + "9": 5660965888.0, + "10": 5660965888.0, + "11": 5660965888.0, + "12": 5660965888.0, + "13": 5660965888.0, + "14": 5660965888.0, + "15": 5660965888.0, + "16": 5660965888.0, + "17": 5660965888.0, + "18": 5660965888.0, + "19": 5660965888.0, + "20": 5660965888.0, + "21": 5660965888.0, + "22": 5660965888.0, + "23": 5660965888.0, + "24": 5660965888.0, + "25": 5660965888.0, + "26": 5660965888.0, + "27": 5660965888.0, + "28": 5660965888.0, + "29": 5660965888.0, + "30": 5660965888.0, + "31": 5660965888.0, + "32": 5660965888.0, + "33": 5660965888.0, + "34": 5660965888.0, + "35": 5660965888.0, + "36": 5660965888.0, + "37": 5660965888.0, + "38": 5660965888.0, + "39": 5660965888.0, + "40": 5660965888.0, + "41": 5660965888.0, + "42": 5660965888.0, + "43": 5660965888.0, + "44": 5660965888.0, + "45": 5660965888.0, + "46": 5660965888.0, + "47": 5660965888.0, + "48": 5660965888.0, + "49": 5660965888.0, + "50": 5660965888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.44279, + "2": 0.55345, + "3": 0.53909, + "4": 0.52187, + "5": 0.52958, + "6": 0.5241, + "7": 0.5353, + "8": 0.51946, + "9": 0.52732, + "10": 0.52759, + "11": 0.51849, + "12": 0.52326, + "13": 0.52472, + "14": 0.52577, + "15": 0.51817, + "16": 0.51922, + "17": 0.51686, + "18": 0.5248, + "19": 0.51945, + "20": 0.74697, + "21": 0.51544, + "22": 0.52412, + "23": 0.66206, + "24": 0.51781, + "25": 0.52429, + "26": 0.52068, + "27": 0.62432, + "28": 0.52016, + "29": 0.52217, + "30": 0.51949, + "31": 0.69033, + "32": 0.52127, + "33": 0.52602, + "34": 0.6403, + "35": 0.51723, + "36": 0.52445, + "37": 0.51746, + "38": 0.52296, + "39": 0.52159, + "40": 0.6718, + "41": 0.58171, + "42": 0.7393, + "43": 0.54277, + "44": 0.81615, + "45": 0.52284, + "46": 0.71947, + "47": 0.52219, + "48": 0.51866, + "49": 0.51764, + "50": 0.51841 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json index 8101027dc18..edd42f32479 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.55236, + "2": 10.52891, + "3": 10.55085, + "4": 10.55035, "5": 10.52311, + "6": 10.53328, + "7": 10.53097, + "8": 10.54323, + "9": 10.54514, "10": 10.53676, + "11": 10.53791, + "12": 10.54319, + "13": 10.5263, + "14": 10.5316, "15": 10.52714, + "16": 10.50594, + "17": 10.5009, + "18": 10.51023, + "19": 10.493, "20": 10.48862, + "21": 10.47473, + "22": 10.42799, + "23": 10.42684, + "24": 10.4036, "25": 10.39991, + "26": 10.38461, + "27": 10.38216, + "28": 10.36877, + "29": 10.32192, "30": 10.2204, + "31": 10.17094, + "32": 10.12605, + "33": 10.10628, + "34": 10.09438, "35": 10.07042, + "36": 10.07481, + "37": 10.03644, + "38": 10.01812, + "39": 9.96852, "40": 9.93082, + "41": 9.87316, + "42": 9.81842, + "43": 9.8156, + "44": 9.73841, "45": 9.7628, + "46": 9.67691, + "47": 9.68688, + "48": 9.66292, + "49": 9.67587, "50": 9.67446 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2320.0, + "2": 2645.0, + "3": 2441.0, + "4": 2417.0, "5": 2730.0, + "6": 2332.0, + "7": 1661.0, + "8": 2386.0, + "9": 2256.0, "10": 2428.0, + "11": 2152.0, + "12": 2337.0, + "13": 2643.0, + "14": 2209.0, "15": 2607.0, + "16": 2411.0, + "17": 2529.0, + "18": 2418.0, + "19": 2363.0, "20": 2323.0, + "21": 2401.0, + "22": 2588.0, + "23": 2338.0, + "24": 2305.0, "25": 2702.0, + "26": 2370.0, + "27": 2462.0, + "28": 2407.0, + "29": 2240.0, "30": 2850.0, + "31": 2882.0, + "32": 2837.0, + "33": 2645.0, + "34": 2874.0, "35": 2913.0, + "36": 3000.0, + "37": 3122.0, + "38": 2680.0, + "39": 2216.0, "40": 2211.0, + "41": 3456.0, + "42": 3624.0, + "43": 3364.0, + "44": 4026.0, "45": 4145.0, + "46": 2924.0, + "47": 1942.0, + "48": 3363.0, + "49": 3532.0, "50": 3710.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2061524480.0, + "2": 2061524480.0, + "3": 2061524480.0, + "4": 2061524480.0, "5": 2061524480.0, + "6": 2061524480.0, + "7": 2061524480.0, + "8": 2061524480.0, + "9": 2061524480.0, "10": 2061524480.0, + "11": 2061524480.0, + "12": 2061524480.0, + "13": 2061524480.0, + "14": 2061524480.0, "15": 2061524480.0, + "16": 2061524480.0, + "17": 2061524480.0, + "18": 2061524480.0, + "19": 2061524480.0, "20": 2061524480.0, + "21": 2061524480.0, + "22": 2061524480.0, + "23": 2061524480.0, + "24": 2061524480.0, "25": 2061524480.0, + "26": 2061524480.0, + "27": 2061524480.0, + "28": 2061524480.0, + "29": 2061524480.0, "30": 2061524480.0, + "31": 2061524480.0, + "32": 2061524480.0, + "33": 2061524480.0, + "34": 2061524480.0, "35": 2061524480.0, + "36": 2061524480.0, + "37": 2061524480.0, + "38": 2061524480.0, + "39": 2061524480.0, "40": 2061524480.0, + "41": 2061524480.0, + "42": 2061524480.0, + "43": 2061524480.0, + "44": 2061524480.0, "45": 2061524480.0, + "46": 2061524480.0, + "47": 2061524480.0, + "48": 2061524480.0, + "49": 2061524480.0, "50": 2061524480.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4385424896.0, + "2": 5245672960.0, + "3": 5245672960.0, + "4": 5245672960.0, "5": 5245672960.0, + "6": 5245672960.0, + "7": 5245672960.0, + "8": 5245672960.0, + "9": 5245672960.0, "10": 5245672960.0, + "11": 5245672960.0, + "12": 5245672960.0, + "13": 5245672960.0, + "14": 5245672960.0, "15": 5245672960.0, + "16": 5245672960.0, + "17": 5245672960.0, + "18": 5245672960.0, + "19": 5245672960.0, "20": 5245672960.0, + "21": 5245672960.0, + "22": 5245672960.0, + "23": 5245672960.0, + "24": 5245672960.0, "25": 5245672960.0, + "26": 5245672960.0, + "27": 5245672960.0, + "28": 5245672960.0, + "29": 5245672960.0, "30": 5245672960.0, + "31": 5245672960.0, + "32": 5245672960.0, + "33": 5245672960.0, + "34": 5245672960.0, "35": 5245672960.0, + "36": 5245672960.0, + "37": 5245672960.0, + "38": 5245672960.0, + "39": 5245672960.0, "40": 5245672960.0, + "41": 5245672960.0, + "42": 5245672960.0, + "43": 5245672960.0, + "44": 5245672960.0, "45": 5245672960.0, + "46": 5245672960.0, + "47": 5245672960.0, + "48": 5245672960.0, + "49": 5245672960.0, "50": 5245672960.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 13.96724, - "5": 0.61599, - "10": 0.61805, - "15": 0.63435, - "20": 1.30403, - "25": 0.62544, - "30": 0.59341, - "35": 0.60604, - "40": 0.61527, - "45": 1.34256, - "50": 0.59871 + "1": 14.52125, + "2": 0.80201, + "3": 0.7469, + "4": 0.73694, + "5": 0.7315, + "6": 0.74178, + "7": 0.74868, + "8": 0.76041, + "9": 0.73349, + "10": 0.73103, + "11": 0.72627, + "12": 1.24485, + "13": 0.92369, + "14": 0.9992, + "15": 0.71522, + "16": 0.72059, + "17": 0.70821, + "18": 0.72513, + "19": 0.92847, + "20": 1.55552, + "21": 1.65501, + "22": 1.61714, + "23": 1.01208, + "24": 0.97003, + "25": 0.73922, + "26": 0.76213, + "27": 0.71228, + "28": 0.74068, + "29": 0.70429, + "30": 0.73547, + "31": 0.73693, + "32": 0.72401, + "33": 0.73688, + "34": 0.73718, + "35": 0.70434, + "36": 0.71346, + "37": 0.71973, + "38": 0.70358, + "39": 1.01971, + "40": 0.72495, + "41": 1.04905, + "42": 0.71671, + "43": 0.89934, + "44": 0.71242, + "45": 0.70583, + "46": 0.69596, + "47": 1.2374, + "48": 1.16, + "49": 1.08122, + "50": 1.48874 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..b825cf8964e --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.55236, + "2": 10.52891, + "3": 10.55085, + "4": 10.55035, + "5": 10.52311, + "6": 10.53328, + "7": 10.53097, + "8": 10.54323, + "9": 10.54514, + "10": 10.53676, + "11": 10.53791, + "12": 10.54319, + "13": 10.5263, + "14": 10.5316, + "15": 10.52714, + "16": 10.50594, + "17": 10.5009, + "18": 10.51023, + "19": 10.493, + "20": 10.48862, + "21": 10.47473, + "22": 10.42799, + "23": 10.42684, + "24": 10.4036, + "25": 10.39991, + "26": 10.38461, + "27": 10.38216, + "28": 10.36877, + "29": 10.32192, + "30": 10.2204, + "31": 10.17094, + "32": 10.12605, + "33": 10.10628, + "34": 10.09438, + "35": 10.07042, + "36": 10.07481, + "37": 10.03644, + "38": 10.01812, + "39": 9.96852, + "40": 9.93082, + "41": 9.87316, + "42": 9.81842, + "43": 9.8156, + "44": 9.73841, + "45": 9.7628, + "46": 9.67691, + "47": 9.68688, + "48": 9.66292, + "49": 9.67587, + "50": 9.67446 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2320.0, + "2": 2645.0, + "3": 2441.0, + "4": 2417.0, + "5": 2730.0, + "6": 2332.0, + "7": 1661.0, + "8": 2386.0, + "9": 2256.0, + "10": 2428.0, + "11": 2152.0, + "12": 2337.0, + "13": 2643.0, + "14": 2209.0, + "15": 2607.0, + "16": 2411.0, + "17": 2529.0, + "18": 2418.0, + "19": 2363.0, + "20": 2323.0, + "21": 2401.0, + "22": 2588.0, + "23": 2338.0, + "24": 2305.0, + "25": 2702.0, + "26": 2370.0, + "27": 2462.0, + "28": 2407.0, + "29": 2240.0, + "30": 2850.0, + "31": 2882.0, + "32": 2837.0, + "33": 2645.0, + "34": 2874.0, + "35": 2913.0, + "36": 3000.0, + "37": 3122.0, + "38": 2680.0, + "39": 2216.0, + "40": 2211.0, + "41": 3456.0, + "42": 3624.0, + "43": 3364.0, + "44": 4026.0, + "45": 4145.0, + "46": 2924.0, + "47": 1942.0, + "48": 3363.0, + "49": 3532.0, + "50": 3710.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2061524480.0, + "2": 2061524480.0, + "3": 2061524480.0, + "4": 2061524480.0, + "5": 2061524480.0, + "6": 2061524480.0, + "7": 2061524480.0, + "8": 2061524480.0, + "9": 2061524480.0, + "10": 2061524480.0, + "11": 2061524480.0, + "12": 2061524480.0, + "13": 2061524480.0, + "14": 2061524480.0, + "15": 2061524480.0, + "16": 2061524480.0, + "17": 2061524480.0, + "18": 2061524480.0, + "19": 2061524480.0, + "20": 2061524480.0, + "21": 2061524480.0, + "22": 2061524480.0, + "23": 2061524480.0, + "24": 2061524480.0, + "25": 2061524480.0, + "26": 2061524480.0, + "27": 2061524480.0, + "28": 2061524480.0, + "29": 2061524480.0, + "30": 2061524480.0, + "31": 2061524480.0, + "32": 2061524480.0, + "33": 2061524480.0, + "34": 2061524480.0, + "35": 2061524480.0, + "36": 2061524480.0, + "37": 2061524480.0, + "38": 2061524480.0, + "39": 2061524480.0, + "40": 2061524480.0, + "41": 2061524480.0, + "42": 2061524480.0, + "43": 2061524480.0, + "44": 2061524480.0, + "45": 2061524480.0, + "46": 2061524480.0, + "47": 2061524480.0, + "48": 2061524480.0, + "49": 2061524480.0, + "50": 2061524480.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4385424896.0, + "2": 5245672960.0, + "3": 5245672960.0, + "4": 5245672960.0, + "5": 5245672960.0, + "6": 5245672960.0, + "7": 5245672960.0, + "8": 5245672960.0, + "9": 5245672960.0, + "10": 5245672960.0, + "11": 5245672960.0, + "12": 5245672960.0, + "13": 5245672960.0, + "14": 5245672960.0, + "15": 5245672960.0, + "16": 5245672960.0, + "17": 5245672960.0, + "18": 5245672960.0, + "19": 5245672960.0, + "20": 5245672960.0, + "21": 5245672960.0, + "22": 5245672960.0, + "23": 5245672960.0, + "24": 5245672960.0, + "25": 5245672960.0, + "26": 5245672960.0, + "27": 5245672960.0, + "28": 5245672960.0, + "29": 5245672960.0, + "30": 5245672960.0, + "31": 5245672960.0, + "32": 5245672960.0, + "33": 5245672960.0, + "34": 5245672960.0, + "35": 5245672960.0, + "36": 5245672960.0, + "37": 5245672960.0, + "38": 5245672960.0, + "39": 5245672960.0, + "40": 5245672960.0, + "41": 5245672960.0, + "42": 5245672960.0, + "43": 5245672960.0, + "44": 5245672960.0, + "45": 5245672960.0, + "46": 5245672960.0, + "47": 5245672960.0, + "48": 5245672960.0, + "49": 5245672960.0, + "50": 5245672960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.04066, + "2": 0.7032, + "3": 0.64317, + "4": 0.64902, + "5": 0.64969, + "6": 0.63112, + "7": 0.65022, + "8": 0.64825, + "9": 0.6561, + "10": 0.65389, + "11": 0.63629, + "12": 0.61059, + "13": 0.61378, + "14": 0.63387, + "15": 0.63512, + "16": 0.67245, + "17": 1.84585, + "18": 0.92074, + "19": 0.88511, + "20": 1.52328, + "21": 1.57421, + "22": 1.42349, + "23": 0.90417, + "24": 0.62214, + "25": 0.61751, + "26": 0.62328, + "27": 0.63404, + "28": 0.64274, + "29": 0.61224, + "30": 0.6522, + "31": 0.65622, + "32": 0.64451, + "33": 0.65916, + "34": 0.67975, + "35": 0.63318, + "36": 0.63519, + "37": 0.62099, + "38": 0.63824, + "39": 0.65345, + "40": 0.63256, + "41": 0.64564, + "42": 0.61807, + "43": 0.84645, + "44": 0.85427, + "45": 0.85855, + "46": 0.97022, + "47": 1.2994, + "48": 1.26968, + "49": 1.21118, + "50": 1.43722 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..0d85e13b23b --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.55236, + "2": 10.52891, + "3": 10.55085, + "4": 10.55035, + "5": 10.52311, + "6": 10.53328, + "7": 10.53097, + "8": 10.54323, + "9": 10.54514, + "10": 10.53676, + "11": 10.53791, + "12": 10.54319, + "13": 10.5263, + "14": 10.5316, + "15": 10.52714, + "16": 10.50594, + "17": 10.5009, + "18": 10.51023, + "19": 10.493, + "20": 10.48862, + "21": 10.47473, + "22": 10.42799, + "23": 10.42684, + "24": 10.4036, + "25": 10.39991, + "26": 10.38461, + "27": 10.38216, + "28": 10.36877, + "29": 10.32192, + "30": 10.2204, + "31": 10.17094, + "32": 10.12605, + "33": 10.10628, + "34": 10.09438, + "35": 10.07042, + "36": 10.07481, + "37": 10.03644, + "38": 10.01812, + "39": 9.96852, + "40": 9.93082, + "41": 9.87316, + "42": 9.81842, + "43": 9.8156, + "44": 9.73841, + "45": 9.7628, + "46": 9.67691, + "47": 9.68688, + "48": 9.66292, + "49": 9.67587, + "50": 9.67446 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2320.0, + "2": 2645.0, + "3": 2441.0, + "4": 2417.0, + "5": 2730.0, + "6": 2332.0, + "7": 1661.0, + "8": 2386.0, + "9": 2256.0, + "10": 2428.0, + "11": 2152.0, + "12": 2337.0, + "13": 2643.0, + "14": 2209.0, + "15": 2607.0, + "16": 2411.0, + "17": 2529.0, + "18": 2418.0, + "19": 2363.0, + "20": 2323.0, + "21": 2401.0, + "22": 2588.0, + "23": 2338.0, + "24": 2305.0, + "25": 2702.0, + "26": 2370.0, + "27": 2462.0, + "28": 2407.0, + "29": 2240.0, + "30": 2850.0, + "31": 2882.0, + "32": 2837.0, + "33": 2645.0, + "34": 2874.0, + "35": 2913.0, + "36": 3000.0, + "37": 3122.0, + "38": 2680.0, + "39": 2216.0, + "40": 2211.0, + "41": 3456.0, + "42": 3624.0, + "43": 3364.0, + "44": 4026.0, + "45": 4145.0, + "46": 2924.0, + "47": 1942.0, + "48": 3363.0, + "49": 3532.0, + "50": 3710.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2061524480.0, + "2": 2061524480.0, + "3": 2061524480.0, + "4": 2061524480.0, + "5": 2061524480.0, + "6": 2061524480.0, + "7": 2061524480.0, + "8": 2061524480.0, + "9": 2061524480.0, + "10": 2061524480.0, + "11": 2061524480.0, + "12": 2061524480.0, + "13": 2061524480.0, + "14": 2061524480.0, + "15": 2061524480.0, + "16": 2061524480.0, + "17": 2061524480.0, + "18": 2061524480.0, + "19": 2061524480.0, + "20": 2061524480.0, + "21": 2061524480.0, + "22": 2061524480.0, + "23": 2061524480.0, + "24": 2061524480.0, + "25": 2061524480.0, + "26": 2061524480.0, + "27": 2061524480.0, + "28": 2061524480.0, + "29": 2061524480.0, + "30": 2061524480.0, + "31": 2061524480.0, + "32": 2061524480.0, + "33": 2061524480.0, + "34": 2061524480.0, + "35": 2061524480.0, + "36": 2061524480.0, + "37": 2061524480.0, + "38": 2061524480.0, + "39": 2061524480.0, + "40": 2061524480.0, + "41": 2061524480.0, + "42": 2061524480.0, + "43": 2061524480.0, + "44": 2061524480.0, + "45": 2061524480.0, + "46": 2061524480.0, + "47": 2061524480.0, + "48": 2061524480.0, + "49": 2061524480.0, + "50": 2061524480.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4385424896.0, + "2": 5245672960.0, + "3": 5245672960.0, + "4": 5245672960.0, + "5": 5245672960.0, + "6": 5245672960.0, + "7": 5245672960.0, + "8": 5245672960.0, + "9": 5245672960.0, + "10": 5245672960.0, + "11": 5245672960.0, + "12": 5245672960.0, + "13": 5245672960.0, + "14": 5245672960.0, + "15": 5245672960.0, + "16": 5245672960.0, + "17": 5245672960.0, + "18": 5245672960.0, + "19": 5245672960.0, + "20": 5245672960.0, + "21": 5245672960.0, + "22": 5245672960.0, + "23": 5245672960.0, + "24": 5245672960.0, + "25": 5245672960.0, + "26": 5245672960.0, + "27": 5245672960.0, + "28": 5245672960.0, + "29": 5245672960.0, + "30": 5245672960.0, + "31": 5245672960.0, + "32": 5245672960.0, + "33": 5245672960.0, + "34": 5245672960.0, + "35": 5245672960.0, + "36": 5245672960.0, + "37": 5245672960.0, + "38": 5245672960.0, + "39": 5245672960.0, + "40": 5245672960.0, + "41": 5245672960.0, + "42": 5245672960.0, + "43": 5245672960.0, + "44": 5245672960.0, + "45": 5245672960.0, + "46": 5245672960.0, + "47": 5245672960.0, + "48": 5245672960.0, + "49": 5245672960.0, + "50": 5245672960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.48983, + "2": 0.782, + "3": 0.71913, + "4": 0.71541, + "5": 0.71528, + "6": 0.7219, + "7": 0.72729, + "8": 0.72714, + "9": 0.7634, + "10": 0.71523, + "11": 0.72303, + "12": 1.34179, + "13": 0.93338, + "14": 0.72484, + "15": 0.70784, + "16": 0.72443, + "17": 0.72151, + "18": 0.71102, + "19": 1.13624, + "20": 1.56469, + "21": 1.66622, + "22": 0.9574, + "23": 0.69921, + "24": 0.70477, + "25": 0.73932, + "26": 0.74798, + "27": 0.72633, + "28": 0.72782, + "29": 0.73646, + "30": 0.73665, + "31": 0.74301, + "32": 0.73363, + "33": 0.71952, + "34": 0.7406, + "35": 0.71103, + "36": 0.70026, + "37": 0.71087, + "38": 0.88272, + "39": 0.71279, + "40": 0.92123, + "41": 1.20193, + "42": 0.72924, + "43": 0.70749, + "44": 0.72158, + "45": 0.71169, + "46": 1.23637, + "47": 1.13432, + "48": 1.26896, + "49": 1.13682, + "50": 1.21366 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json index 6ca48489088..36ea57771ea 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.42626, + "2": 10.41171, + "3": 10.41885, + "4": 10.42153, "5": 10.42192, + "6": 10.41563, + "7": 10.42859, + "8": 10.42079, + "9": 10.43014, "10": 10.40859, + "11": 10.43501, + "12": 10.4025, + "13": 10.42274, + "14": 10.41249, "15": 10.40948, + "16": 10.40806, + "17": 10.3892, + "18": 10.38857, + "19": 10.37147, "20": 10.40453, + "21": 10.36615, + "22": 10.34963, + "23": 10.35388, + "24": 10.30136, "25": 10.31117, + "26": 10.30241, + "27": 10.2821, + "28": 10.27928, + "29": 10.23928, "30": 10.14742, + "31": 10.10532, + "32": 10.09426, + "33": 10.09032, + "34": 10.06437, "35": 10.04643, + "36": 10.03306, + "37": 10.00505, + "38": 10.00274, + "39": 9.91418, "40": 9.91103, + "41": 9.86562, + "42": 9.78095, + "43": 9.79496, + "44": 9.73077, "45": 9.7428, + "46": 9.63829, + "47": 9.6868, + "48": 9.637, + "49": 9.6554, "50": 9.65776 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3452.0, + "2": 2890.0, + "3": 1856.0, + "4": 3256.0, "5": 3333.0, + "6": 2985.0, + "7": 3208.0, + "8": 3314.0, + "9": 3134.0, "10": 3124.0, + "11": 3913.0, + "12": 3008.0, + "13": 3108.0, + "14": 3652.0, "15": 3267.0, + "16": 3662.0, + "17": 3680.0, + "18": 3708.0, + "19": 3375.0, "20": 3449.0, + "21": 3115.0, + "22": 3545.0, + "23": 3516.0, + "24": 3789.0, "25": 3570.0, + "26": 3719.0, + "27": 2808.0, + "28": 3823.0, + "29": 3626.0, "30": 4136.0, + "31": 2541.0, + "32": 3945.0, + "33": 3501.0, + "34": 3795.0, "35": 3652.0, + "36": 4269.0, + "37": 4152.0, + "38": 3787.0, + "39": 3873.0, "40": 4661.0, + "41": 2846.0, + "42": 1556.0, + "43": 2809.0, + "44": 4030.0, "45": 4724.0, + "46": 4587.0, + "47": 3120.0, + "48": 4366.0, + "49": 3839.0, "50": 3146.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1661765632.0, + "2": 1661765632.0, + "3": 1661765632.0, + "4": 1661765632.0, "5": 1661765632.0, + "6": 1661765632.0, + "7": 1661765632.0, + "8": 1661765632.0, + "9": 1661765632.0, "10": 1661765632.0, + "11": 1661765632.0, + "12": 1661765632.0, + "13": 1661765632.0, + "14": 1661765632.0, "15": 1661765632.0, + "16": 1661765632.0, + "17": 1661765632.0, + "18": 1661765632.0, + "19": 1661765632.0, "20": 1661765632.0, + "21": 1661765632.0, + "22": 1661765632.0, + "23": 1661765632.0, + "24": 1661765632.0, "25": 1661765632.0, + "26": 1661765632.0, + "27": 1661765632.0, + "28": 1661765632.0, + "29": 1661765632.0, "30": 1661765632.0, + "31": 1661765632.0, + "32": 1661765632.0, + "33": 1661765632.0, + "34": 1661765632.0, "35": 1661765632.0, + "36": 1661765632.0, + "37": 1661765632.0, + "38": 1661765632.0, + "39": 1661765632.0, "40": 1661765632.0, + "41": 1661765632.0, + "42": 1661765632.0, + "43": 1661765632.0, + "44": 1661765632.0, "45": 1661765632.0, + "46": 1661765632.0, + "47": 1661765632.0, + "48": 1661765632.0, + "49": 1661765632.0, "50": 1661765632.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2506479104.0, + "2": 3205449216.0, + "3": 3205449216.0, + "4": 3205449216.0, "5": 3205449216.0, + "6": 3205449216.0, + "7": 3205449216.0, + "8": 3205449216.0, + "9": 3205449216.0, "10": 3205449216.0, + "11": 3205449216.0, + "12": 3205449216.0, + "13": 3205449216.0, + "14": 3205449216.0, "15": 3205449216.0, + "16": 3205449216.0, + "17": 3205449216.0, + "18": 3205449216.0, + "19": 3205449216.0, "20": 3205449216.0, + "21": 3205449216.0, + "22": 3205449216.0, + "23": 3205449216.0, + "24": 3205449216.0, "25": 3205449216.0, + "26": 3205449216.0, + "27": 3205449216.0, + "28": 3205449216.0, + "29": 3205449216.0, "30": 3205449216.0, + "31": 3205449216.0, + "32": 3205449216.0, + "33": 3205449216.0, + "34": 3205449216.0, "35": 3205449216.0, + "36": 3205449216.0, + "37": 3205449216.0, + "38": 3205449216.0, + "39": 3205449216.0, "40": 3205449216.0, + "41": 3205449216.0, + "42": 3205449216.0, + "43": 3205449216.0, + "44": 3205449216.0, "45": 3205449216.0, + "46": 3205449216.0, + "47": 3205449216.0, + "48": 3205449216.0, + "49": 3205449216.0, "50": 3205449216.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.57532, - "5": 1.46202, - "10": 1.45865, - "15": 1.46969, - "20": 1.46895, - "25": 1.45633, - "30": 1.74568, - "35": 1.47151, - "40": 1.4582, - "45": 1.45697, - "50": 1.45728 + "1": 10.20165, + "2": 1.76894, + "3": 1.75257, + "4": 1.76371, + "5": 1.76165, + "6": 1.76697, + "7": 1.7566, + "8": 1.76422, + "9": 1.76493, + "10": 1.76085, + "11": 1.75557, + "12": 1.7612, + "13": 1.84209, + "14": 1.7609, + "15": 1.75819, + "16": 1.76084, + "17": 2.14365, + "18": 1.77031, + "19": 1.77623, + "20": 1.81462, + "21": 2.1764, + "22": 1.76578, + "23": 1.75799, + "24": 2.18418, + "25": 1.76236, + "26": 2.12149, + "27": 2.09277, + "28": 1.77853, + "29": 1.83529, + "30": 1.77362, + "31": 1.77704, + "32": 1.78154, + "33": 1.76732, + "34": 1.77318, + "35": 1.77963, + "36": 1.77541, + "37": 1.77626, + "38": 1.77185, + "39": 1.78486, + "40": 1.78003, + "41": 1.78092, + "42": 1.77118, + "43": 1.77626, + "44": 1.78384, + "45": 1.78376, + "46": 1.84893, + "47": 1.78761, + "48": 1.79814, + "49": 1.79323, + "50": 1.77941 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..73cbc43b7f2 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.42626, + "2": 10.41171, + "3": 10.41885, + "4": 10.42153, + "5": 10.42192, + "6": 10.41563, + "7": 10.42859, + "8": 10.42079, + "9": 10.43014, + "10": 10.40859, + "11": 10.43501, + "12": 10.4025, + "13": 10.42274, + "14": 10.41249, + "15": 10.40948, + "16": 10.40806, + "17": 10.3892, + "18": 10.38857, + "19": 10.37147, + "20": 10.40453, + "21": 10.36615, + "22": 10.34963, + "23": 10.35388, + "24": 10.30136, + "25": 10.31117, + "26": 10.30241, + "27": 10.2821, + "28": 10.27928, + "29": 10.23928, + "30": 10.14742, + "31": 10.10532, + "32": 10.09426, + "33": 10.09032, + "34": 10.06437, + "35": 10.04643, + "36": 10.03306, + "37": 10.00505, + "38": 10.00274, + "39": 9.91418, + "40": 9.91103, + "41": 9.86562, + "42": 9.78095, + "43": 9.79496, + "44": 9.73077, + "45": 9.7428, + "46": 9.63829, + "47": 9.6868, + "48": 9.637, + "49": 9.6554, + "50": 9.65776 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3452.0, + "2": 2890.0, + "3": 1856.0, + "4": 3256.0, + "5": 3333.0, + "6": 2985.0, + "7": 3208.0, + "8": 3314.0, + "9": 3134.0, + "10": 3124.0, + "11": 3913.0, + "12": 3008.0, + "13": 3108.0, + "14": 3652.0, + "15": 3267.0, + "16": 3662.0, + "17": 3680.0, + "18": 3708.0, + "19": 3375.0, + "20": 3449.0, + "21": 3115.0, + "22": 3545.0, + "23": 3516.0, + "24": 3789.0, + "25": 3570.0, + "26": 3719.0, + "27": 2808.0, + "28": 3823.0, + "29": 3626.0, + "30": 4136.0, + "31": 2541.0, + "32": 3945.0, + "33": 3501.0, + "34": 3795.0, + "35": 3652.0, + "36": 4269.0, + "37": 4152.0, + "38": 3787.0, + "39": 3873.0, + "40": 4661.0, + "41": 2846.0, + "42": 1556.0, + "43": 2809.0, + "44": 4030.0, + "45": 4724.0, + "46": 4587.0, + "47": 3120.0, + "48": 4366.0, + "49": 3839.0, + "50": 3146.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1661765632.0, + "2": 1661765632.0, + "3": 1661765632.0, + "4": 1661765632.0, + "5": 1661765632.0, + "6": 1661765632.0, + "7": 1661765632.0, + "8": 1661765632.0, + "9": 1661765632.0, + "10": 1661765632.0, + "11": 1661765632.0, + "12": 1661765632.0, + "13": 1661765632.0, + "14": 1661765632.0, + "15": 1661765632.0, + "16": 1661765632.0, + "17": 1661765632.0, + "18": 1661765632.0, + "19": 1661765632.0, + "20": 1661765632.0, + "21": 1661765632.0, + "22": 1661765632.0, + "23": 1661765632.0, + "24": 1661765632.0, + "25": 1661765632.0, + "26": 1661765632.0, + "27": 1661765632.0, + "28": 1661765632.0, + "29": 1661765632.0, + "30": 1661765632.0, + "31": 1661765632.0, + "32": 1661765632.0, + "33": 1661765632.0, + "34": 1661765632.0, + "35": 1661765632.0, + "36": 1661765632.0, + "37": 1661765632.0, + "38": 1661765632.0, + "39": 1661765632.0, + "40": 1661765632.0, + "41": 1661765632.0, + "42": 1661765632.0, + "43": 1661765632.0, + "44": 1661765632.0, + "45": 1661765632.0, + "46": 1661765632.0, + "47": 1661765632.0, + "48": 1661765632.0, + "49": 1661765632.0, + "50": 1661765632.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2506479104.0, + "2": 3205449216.0, + "3": 3205449216.0, + "4": 3205449216.0, + "5": 3205449216.0, + "6": 3205449216.0, + "7": 3205449216.0, + "8": 3205449216.0, + "9": 3205449216.0, + "10": 3205449216.0, + "11": 3205449216.0, + "12": 3205449216.0, + "13": 3205449216.0, + "14": 3205449216.0, + "15": 3205449216.0, + "16": 3205449216.0, + "17": 3205449216.0, + "18": 3205449216.0, + "19": 3205449216.0, + "20": 3205449216.0, + "21": 3205449216.0, + "22": 3205449216.0, + "23": 3205449216.0, + "24": 3205449216.0, + "25": 3205449216.0, + "26": 3205449216.0, + "27": 3205449216.0, + "28": 3205449216.0, + "29": 3205449216.0, + "30": 3205449216.0, + "31": 3205449216.0, + "32": 3205449216.0, + "33": 3205449216.0, + "34": 3205449216.0, + "35": 3205449216.0, + "36": 3205449216.0, + "37": 3205449216.0, + "38": 3205449216.0, + "39": 3205449216.0, + "40": 3205449216.0, + "41": 3205449216.0, + "42": 3205449216.0, + "43": 3205449216.0, + "44": 3205449216.0, + "45": 3205449216.0, + "46": 3205449216.0, + "47": 3205449216.0, + "48": 3205449216.0, + "49": 3205449216.0, + "50": 3205449216.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.60443, + "2": 1.59144, + "3": 1.53882, + "4": 1.56784, + "5": 1.52207, + "6": 1.53885, + "7": 1.52214, + "8": 1.52095, + "9": 1.51957, + "10": 1.51224, + "11": 1.49689, + "12": 1.5078, + "13": 1.50118, + "14": 1.4917, + "15": 1.60359, + "16": 1.55447, + "17": 1.55262, + "18": 1.84594, + "19": 1.55841, + "20": 1.7545, + "21": 1.48478, + "22": 1.49549, + "23": 1.81525, + "24": 1.79126, + "25": 2.12023, + "26": 1.49775, + "27": 1.80406, + "28": 1.49411, + "29": 1.96966, + "30": 1.48009, + "31": 1.47915, + "32": 1.48757, + "33": 1.47812, + "34": 1.4701, + "35": 1.47099, + "36": 1.47773, + "37": 1.48414, + "38": 1.51352, + "39": 1.48595, + "40": 1.49001, + "41": 1.48545, + "42": 1.50863, + "43": 1.47565, + "44": 1.48135, + "45": 1.48123, + "46": 1.48152, + "47": 1.48884, + "48": 1.56195, + "49": 1.55628, + "50": 1.48725 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..88adf60a26e --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.42626, + "2": 10.41171, + "3": 10.41885, + "4": 10.42153, + "5": 10.42192, + "6": 10.41563, + "7": 10.42859, + "8": 10.42079, + "9": 10.43014, + "10": 10.40859, + "11": 10.43501, + "12": 10.4025, + "13": 10.42274, + "14": 10.41249, + "15": 10.40948, + "16": 10.40806, + "17": 10.3892, + "18": 10.38857, + "19": 10.37147, + "20": 10.40453, + "21": 10.36615, + "22": 10.34963, + "23": 10.35388, + "24": 10.30136, + "25": 10.31117, + "26": 10.30241, + "27": 10.2821, + "28": 10.27928, + "29": 10.23928, + "30": 10.14742, + "31": 10.10532, + "32": 10.09426, + "33": 10.09032, + "34": 10.06437, + "35": 10.04643, + "36": 10.03306, + "37": 10.00505, + "38": 10.00274, + "39": 9.91418, + "40": 9.91103, + "41": 9.86562, + "42": 9.78095, + "43": 9.79496, + "44": 9.73077, + "45": 9.7428, + "46": 9.63829, + "47": 9.6868, + "48": 9.637, + "49": 9.6554, + "50": 9.65776 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3452.0, + "2": 2890.0, + "3": 1856.0, + "4": 3256.0, + "5": 3333.0, + "6": 2985.0, + "7": 3208.0, + "8": 3314.0, + "9": 3134.0, + "10": 3124.0, + "11": 3913.0, + "12": 3008.0, + "13": 3108.0, + "14": 3652.0, + "15": 3267.0, + "16": 3662.0, + "17": 3680.0, + "18": 3708.0, + "19": 3375.0, + "20": 3449.0, + "21": 3115.0, + "22": 3545.0, + "23": 3516.0, + "24": 3789.0, + "25": 3570.0, + "26": 3719.0, + "27": 2808.0, + "28": 3823.0, + "29": 3626.0, + "30": 4136.0, + "31": 2541.0, + "32": 3945.0, + "33": 3501.0, + "34": 3795.0, + "35": 3652.0, + "36": 4269.0, + "37": 4152.0, + "38": 3787.0, + "39": 3873.0, + "40": 4661.0, + "41": 2846.0, + "42": 1556.0, + "43": 2809.0, + "44": 4030.0, + "45": 4724.0, + "46": 4587.0, + "47": 3120.0, + "48": 4366.0, + "49": 3839.0, + "50": 3146.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1661765632.0, + "2": 1661765632.0, + "3": 1661765632.0, + "4": 1661765632.0, + "5": 1661765632.0, + "6": 1661765632.0, + "7": 1661765632.0, + "8": 1661765632.0, + "9": 1661765632.0, + "10": 1661765632.0, + "11": 1661765632.0, + "12": 1661765632.0, + "13": 1661765632.0, + "14": 1661765632.0, + "15": 1661765632.0, + "16": 1661765632.0, + "17": 1661765632.0, + "18": 1661765632.0, + "19": 1661765632.0, + "20": 1661765632.0, + "21": 1661765632.0, + "22": 1661765632.0, + "23": 1661765632.0, + "24": 1661765632.0, + "25": 1661765632.0, + "26": 1661765632.0, + "27": 1661765632.0, + "28": 1661765632.0, + "29": 1661765632.0, + "30": 1661765632.0, + "31": 1661765632.0, + "32": 1661765632.0, + "33": 1661765632.0, + "34": 1661765632.0, + "35": 1661765632.0, + "36": 1661765632.0, + "37": 1661765632.0, + "38": 1661765632.0, + "39": 1661765632.0, + "40": 1661765632.0, + "41": 1661765632.0, + "42": 1661765632.0, + "43": 1661765632.0, + "44": 1661765632.0, + "45": 1661765632.0, + "46": 1661765632.0, + "47": 1661765632.0, + "48": 1661765632.0, + "49": 1661765632.0, + "50": 1661765632.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2506479104.0, + "2": 3205449216.0, + "3": 3205449216.0, + "4": 3205449216.0, + "5": 3205449216.0, + "6": 3205449216.0, + "7": 3205449216.0, + "8": 3205449216.0, + "9": 3205449216.0, + "10": 3205449216.0, + "11": 3205449216.0, + "12": 3205449216.0, + "13": 3205449216.0, + "14": 3205449216.0, + "15": 3205449216.0, + "16": 3205449216.0, + "17": 3205449216.0, + "18": 3205449216.0, + "19": 3205449216.0, + "20": 3205449216.0, + "21": 3205449216.0, + "22": 3205449216.0, + "23": 3205449216.0, + "24": 3205449216.0, + "25": 3205449216.0, + "26": 3205449216.0, + "27": 3205449216.0, + "28": 3205449216.0, + "29": 3205449216.0, + "30": 3205449216.0, + "31": 3205449216.0, + "32": 3205449216.0, + "33": 3205449216.0, + "34": 3205449216.0, + "35": 3205449216.0, + "36": 3205449216.0, + "37": 3205449216.0, + "38": 3205449216.0, + "39": 3205449216.0, + "40": 3205449216.0, + "41": 3205449216.0, + "42": 3205449216.0, + "43": 3205449216.0, + "44": 3205449216.0, + "45": 3205449216.0, + "46": 3205449216.0, + "47": 3205449216.0, + "48": 3205449216.0, + "49": 3205449216.0, + "50": 3205449216.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.29331, + "2": 1.82828, + "3": 1.75745, + "4": 1.75149, + "5": 1.76912, + "6": 1.75888, + "7": 1.75313, + "8": 1.75423, + "9": 1.74482, + "10": 1.84387, + "11": 2.01499, + "12": 1.74448, + "13": 1.75425, + "14": 2.09351, + "15": 1.77765, + "16": 1.76841, + "17": 1.75495, + "18": 2.05727, + "19": 1.77481, + "20": 2.11285, + "21": 1.77659, + "22": 1.75669, + "23": 1.75872, + "24": 2.1065, + "25": 2.02543, + "26": 1.84773, + "27": 1.76632, + "28": 1.76482, + "29": 1.75732, + "30": 1.75335, + "31": 1.75453, + "32": 1.80627, + "33": 1.757, + "34": 1.75719, + "35": 1.75478, + "36": 1.76009, + "37": 1.75602, + "38": 1.75806, + "39": 1.75609, + "40": 1.75247, + "41": 1.75179, + "42": 1.75873, + "43": 1.77534, + "44": 1.80833, + "45": 1.74663, + "46": 1.75048, + "47": 1.7473, + "48": 1.75253, + "49": 1.76783, + "50": 1.75365 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json index a8c99cdd960..fbdb62b88ac 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json @@ -2,140 +2,535 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.89618, + "2": 10.89538, + "3": 10.88915, + "4": 10.89094, "5": 10.8927, + "6": 10.90148, + "7": 10.89392, + "8": 10.90369, + "9": 10.90794, "10": 10.89108, + "11": 10.88762, + "12": 10.9076, + "13": 10.91429, + "14": 10.90654, "15": 10.90227, + "16": 10.91042, + "17": 10.89896, + "18": 10.90666, + "19": 10.89908, "20": 10.90133, + "21": 10.91713, + "22": 10.89139, + "23": 10.90085, + "24": 10.89366, "25": 10.89372, + "26": 10.87372, + "27": 10.87917, + "28": 10.88756, + "29": 10.85461, "30": 10.83891, + "31": 10.75166, + "32": 10.8278, + "33": 10.80306, + "34": 10.73559, "35": 10.7301, + "36": 10.69318, + "37": 10.72854, + "38": 10.65364, + "39": 10.71672, "40": 10.56996, + "41": 10.58467, + "42": 10.59853, + "43": 10.3948, + "44": 10.44431, "45": 10.3452, + "46": 10.31919, + "47": 10.49671, + "48": 10.31281, + "49": 10.09084, "50": 10.31089, + "51": 10.25547, + "52": 10.15856, + "53": 10.38114, + "54": 10.2992, "55": 10.23806, + "56": 10.00726, + "57": 9.87765, + "58": 10.15279, + "59": 9.94207, "60": 9.8666, + "61": 10.00032, + "62": 10.23443, + "63": 9.71917, + "64": 10.04209, "65": 9.30009, + "66": 9.95537, + "67": 9.6499, + "68": 10.00402, + "69": 9.99988, "70": 9.96383, + "71": 9.84259, + "72": 9.81258, + "73": 9.70921, + "74": 9.19832, "75": 9.61686, + "76": 9.28859, + "77": 10.20416, + "78": 9.88378, + "79": 9.54296, "80": 9.57095, + "81": 9.64006, + "82": 9.83648, + "83": 9.47691, + "84": 9.54866, "85": 9.75198, + "86": 9.21427, + "87": 9.70607, + "88": 9.87307, + "89": 9.72876, "90": 9.92353, + "91": 9.48236, + "92": 9.47671, + "93": 9.20895, + "94": 8.9625, "95": 9.62369, + "96": 9.64228, + "97": 9.41575, + "98": 9.77515, + "99": 9.00692, "100": 9.51305 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 454770688.0, + "2": 454770688.0, + "3": 454770688.0, + "4": 454770688.0, "5": 454770688.0, + "6": 454770688.0, + "7": 454770688.0, + "8": 454770688.0, + "9": 454770688.0, "10": 454770688.0, + "11": 454770688.0, + "12": 454770688.0, + "13": 454770688.0, + "14": 454770688.0, "15": 454770688.0, + "16": 454770688.0, + "17": 454770688.0, + "18": 518880768.0, + "19": 518880768.0, "20": 518880768.0, + "21": 518880768.0, + "22": 518880768.0, + "23": 518880768.0, + "24": 518880768.0, "25": 518880768.0, + "26": 518880768.0, + "27": 518880768.0, + "28": 518880768.0, + "29": 518880768.0, "30": 518880768.0, + "31": 518880768.0, + "32": 518880768.0, + "33": 518880768.0, + "34": 518880768.0, "35": 518880768.0, + "36": 518880768.0, + "37": 518880768.0, + "38": 518880768.0, + "39": 518880768.0, "40": 518880768.0, + "41": 518880768.0, + "42": 518880768.0, + "43": 518880768.0, + "44": 518880768.0, "45": 518880768.0, + "46": 518880768.0, + "47": 518880768.0, + "48": 518880768.0, + "49": 518880768.0, "50": 518880768.0, + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, "100": 518880768.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4511150592.0, + "2": 4544705536.0, + "3": 4544705536.0, + "4": 4544705536.0, "5": 4544705536.0, + "6": 4544705536.0, + "7": 4544705536.0, + "8": 4544705536.0, + "9": 4544705536.0, "10": 4544705536.0, + "11": 4544705536.0, + "12": 4544705536.0, + "13": 4544705536.0, + "14": 4544705536.0, "15": 4544705536.0, + "16": 4544705536.0, + "17": 4544705536.0, + "18": 4544705536.0, + "19": 4607767040.0, "20": 4607767040.0, + "21": 4607767040.0, + "22": 4607767040.0, + "23": 4607767040.0, + "24": 4607767040.0, "25": 4607767040.0, + "26": 4607767040.0, + "27": 4607767040.0, + "28": 4607767040.0, + "29": 4607767040.0, "30": 4607767040.0, + "31": 4607767040.0, + "32": 4607767040.0, + "33": 4607767040.0, + "34": 4607767040.0, "35": 4607767040.0, + "36": 4607767040.0, + "37": 4607767040.0, + "38": 4607767040.0, + "39": 4607767040.0, "40": 4607767040.0, + "41": 4607767040.0, + "42": 4607767040.0, + "43": 4607767040.0, + "44": 4607767040.0, "45": 4607767040.0, + "46": 4607767040.0, + "47": 4607767040.0, + "48": 4607767040.0, + "49": 4607767040.0, "50": 4607767040.0, + "51": 4607767040.0, + "52": 4607767040.0, + "53": 4607767040.0, + "54": 4607767040.0, "55": 4607767040.0, + "56": 4607767040.0, + "57": 4607767040.0, + "58": 4607767040.0, + "59": 4607767040.0, "60": 4607767040.0, + "61": 4607767040.0, + "62": 4607767040.0, + "63": 4607767040.0, + "64": 4607767040.0, "65": 4607767040.0, + "66": 4607767040.0, + "67": 4607767040.0, + "68": 4607767040.0, + "69": 4607767040.0, "70": 4607767040.0, + "71": 4607767040.0, + "72": 4607767040.0, + "73": 4607767040.0, + "74": 4607767040.0, "75": 4607767040.0, + "76": 4607767040.0, + "77": 4607767040.0, + "78": 4607767040.0, + "79": 4607767040.0, "80": 4607767040.0, + "81": 4607767040.0, + "82": 4607767040.0, + "83": 4607767040.0, + "84": 4607767040.0, "85": 4607767040.0, + "86": 4607767040.0, + "87": 4607767040.0, + "88": 4607767040.0, + "89": 4607767040.0, "90": 4607767040.0, + "91": 4607767040.0, + "92": 4607767040.0, + "93": 4607767040.0, + "94": 4607767040.0, "95": 4607767040.0, + "96": 4607767040.0, + "97": 4607767040.0, + "98": 4607767040.0, + "99": 4607767040.0, "100": 4607767040.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 6.03441, - "5": 0.05457, - "10": 0.0555, - "15": 0.05442, - "20": 0.05936, - "25": 0.06165, - "30": 0.05917, - "35": 0.06761, - "40": 0.06021, - "45": 0.06061, - "50": 0.05916, - "55": 0.06279, - "60": 0.05959, - "65": 0.05975, - "70": 0.05984, - "75": 0.05968, - "80": 0.06032, - "85": 0.05993, - "90": 0.06577, - "95": 0.0595, - "100": 0.06114 + "1": 6.44783, + "2": 0.09007, + "3": 0.06737, + "4": 0.06577, + "5": 0.06617, + "6": 0.06499, + "7": 0.06848, + "8": 0.06519, + "9": 0.06616, + "10": 0.06552, + "11": 0.06475, + "12": 0.06425, + "13": 0.06448, + "14": 0.0646, + "15": 0.06511, + "16": 0.06475, + "17": 0.06554, + "18": 0.11461, + "19": 0.07217, + "20": 0.07186, + "21": 0.07086, + "22": 0.06865, + "23": 0.07004, + "24": 0.07096, + "25": 0.071, + "26": 0.07082, + "27": 0.07253, + "28": 0.07103, + "29": 0.07101, + "30": 0.07144, + "31": 0.07157, + "32": 0.07144, + "33": 0.07102, + "34": 0.0715, + "35": 0.07197, + "36": 0.07104, + "37": 0.07183, + "38": 0.07076, + "39": 0.07174, + "40": 0.07198, + "41": 0.0728, + "42": 0.07014, + "43": 0.07139, + "44": 0.07151, + "45": 0.0731, + "46": 0.07262, + "47": 0.07101, + "48": 0.07085, + "49": 0.07236, + "50": 0.07208, + "51": 0.10876, + "52": 0.07904, + "53": 0.07811, + "54": 0.07594, + "55": 0.07858, + "56": 0.08222, + "57": 0.08161, + "58": 0.0804, + "59": 0.07879, + "60": 0.07013, + "61": 0.06958, + "62": 0.07024, + "63": 0.06986, + "64": 0.07068, + "65": 0.07096, + "66": 0.07033, + "67": 0.07005, + "68": 0.07023, + "69": 0.07133, + "70": 0.07104, + "71": 0.0717, + "72": 0.07141, + "73": 0.07155, + "74": 0.07093, + "75": 0.07044, + "76": 0.06976, + "77": 0.07009, + "78": 0.07092, + "79": 0.07151, + "80": 0.07062, + "81": 0.07312, + "82": 0.07117, + "83": 0.07287, + "84": 0.07054, + "85": 0.07186, + "86": 0.0698, + "87": 0.07076, + "88": 0.0702, + "89": 0.07128, + "90": 0.07039, + "91": 0.07054, + "92": 0.07169, + "93": 0.07155, + "94": 0.07057, + "95": 0.07134, + "96": 0.07134, + "97": 0.07146, + "98": 0.07223, + "99": 0.07189, + "100": 0.07136 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", "15": "nan", + "16": "nan", + "17": "nan", + "18": 1155.0, + "19": 1454.0, "20": 1095.0, + "21": 1230.0, + "22": "nan", + "23": 1357.0, + "24": 1150.0, "25": 1228.0, + "26": 1202.0, + "27": 1326.0, + "28": 1466.0, + "29": 1438.0, "30": 1238.0, + "31": 1008.0, + "32": 1160.0, + "33": 1371.0, + "34": 1154.0, "35": 1295.0, + "36": 1156.0, + "37": 1403.0, + "38": 1487.0, + "39": 1429.0, "40": 1412.0, + "41": 1458.0, + "42": 1316.0, + "43": 1193.0, + "44": 1323.0, "45": 1297.0, + "46": 1276.0, + "47": 1868.0, + "48": 1251.0, + "49": 1272.0, "50": 1524.0, + "51": 1367.0, + "52": 1372.0, + "53": 1715.0, + "54": 1485.0, "55": 1482.0, + "56": 1473.0, + "57": 1539.0, + "58": 1736.0, + "59": 1661.0, "60": 1586.0, + "61": 1691.0, + "62": 1865.0, + "63": 1395.0, + "64": 1846.0, "65": 1428.0, + "66": 1717.0, + "67": 1700.0, + "68": 1750.0, + "69": 1681.0, "70": 1861.0, + "71": 2048.0, + "72": 1552.0, + "73": 2010.0, + "74": 1344.0, "75": 1840.0, + "76": 1846.0, + "77": 2034.0, + "78": 2170.0, + "79": 1949.0, "80": 2077.0, + "81": 2381.0, + "82": 2390.0, + "83": 1843.0, + "84": 2060.0, "85": 2317.0, + "86": 1958.0, + "87": 2829.0, + "88": 2046.0, + "89": 2260.0, "90": 2545.0, + "91": 1801.0, + "92": 2505.0, + "93": 2064.0, + "94": 2223.0, "95": 2379.0, + "96": 2579.0, + "97": 2411.0, + "98": 2500.0, + "99": 2124.0, "100": 2119.0 } } diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..68de1078bf3 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89618, + "2": 10.89538, + "3": 10.88915, + "4": 10.89094, + "5": 10.8927, + "6": 10.90148, + "7": 10.89392, + "8": 10.90369, + "9": 10.90794, + "10": 10.89108, + "11": 10.88762, + "12": 10.9076, + "13": 10.91429, + "14": 10.90654, + "15": 10.90227, + "16": 10.91042, + "17": 10.89896, + "18": 10.90666, + "19": 10.89908, + "20": 10.90133, + "21": 10.91713, + "22": 10.89139, + "23": 10.90085, + "24": 10.89366, + "25": 10.89372, + "26": 10.87372, + "27": 10.87917, + "28": 10.88756, + "29": 10.85461, + "30": 10.83891, + "31": 10.75166, + "32": 10.8278, + "33": 10.80306, + "34": 10.73559, + "35": 10.7301, + "36": 10.69318, + "37": 10.72854, + "38": 10.65364, + "39": 10.71672, + "40": 10.56996, + "41": 10.58467, + "42": 10.59853, + "43": 10.3948, + "44": 10.44431, + "45": 10.3452, + "46": 10.31919, + "47": 10.49671, + "48": 10.31281, + "49": 10.09084, + "50": 10.31089, + "51": 10.25547, + "52": 10.15856, + "53": 10.38114, + "54": 10.2992, + "55": 10.23806, + "56": 10.00726, + "57": 9.87765, + "58": 10.15279, + "59": 9.94207, + "60": 9.8666, + "61": 10.00032, + "62": 10.23443, + "63": 9.71917, + "64": 10.04209, + "65": 9.30009, + "66": 9.95537, + "67": 9.6499, + "68": 10.00402, + "69": 9.99988, + "70": 9.96383, + "71": 9.84259, + "72": 9.81258, + "73": 9.70921, + "74": 9.19832, + "75": 9.61686, + "76": 9.28859, + "77": 10.20416, + "78": 9.88378, + "79": 9.54296, + "80": 9.57095, + "81": 9.64006, + "82": 9.83648, + "83": 9.47691, + "84": 9.54866, + "85": 9.75198, + "86": 9.21427, + "87": 9.70607, + "88": 9.87307, + "89": 9.72876, + "90": 9.92353, + "91": 9.48236, + "92": 9.47671, + "93": 9.20895, + "94": 8.9625, + "95": 9.62369, + "96": 9.64228, + "97": 9.41575, + "98": 9.77515, + "99": 9.00692, + "100": 9.51305 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 454770688.0, + "2": 454770688.0, + "3": 454770688.0, + "4": 454770688.0, + "5": 454770688.0, + "6": 454770688.0, + "7": 454770688.0, + "8": 454770688.0, + "9": 454770688.0, + "10": 454770688.0, + "11": 454770688.0, + "12": 454770688.0, + "13": 454770688.0, + "14": 454770688.0, + "15": 454770688.0, + "16": 454770688.0, + "17": 454770688.0, + "18": 518880768.0, + "19": 518880768.0, + "20": 518880768.0, + "21": 518880768.0, + "22": 518880768.0, + "23": 518880768.0, + "24": 518880768.0, + "25": 518880768.0, + "26": 518880768.0, + "27": 518880768.0, + "28": 518880768.0, + "29": 518880768.0, + "30": 518880768.0, + "31": 518880768.0, + "32": 518880768.0, + "33": 518880768.0, + "34": 518880768.0, + "35": 518880768.0, + "36": 518880768.0, + "37": 518880768.0, + "38": 518880768.0, + "39": 518880768.0, + "40": 518880768.0, + "41": 518880768.0, + "42": 518880768.0, + "43": 518880768.0, + "44": 518880768.0, + "45": 518880768.0, + "46": 518880768.0, + "47": 518880768.0, + "48": 518880768.0, + "49": 518880768.0, + "50": 518880768.0, + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, + "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, + "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, + "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, + "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, + "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, + "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, + "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, + "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, + "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, + "100": 518880768.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4511150592.0, + "2": 4544705536.0, + "3": 4544705536.0, + "4": 4544705536.0, + "5": 4544705536.0, + "6": 4544705536.0, + "7": 4544705536.0, + "8": 4544705536.0, + "9": 4544705536.0, + "10": 4544705536.0, + "11": 4544705536.0, + "12": 4544705536.0, + "13": 4544705536.0, + "14": 4544705536.0, + "15": 4544705536.0, + "16": 4544705536.0, + "17": 4544705536.0, + "18": 4544705536.0, + "19": 4607767040.0, + "20": 4607767040.0, + "21": 4607767040.0, + "22": 4607767040.0, + "23": 4607767040.0, + "24": 4607767040.0, + "25": 4607767040.0, + "26": 4607767040.0, + "27": 4607767040.0, + "28": 4607767040.0, + "29": 4607767040.0, + "30": 4607767040.0, + "31": 4607767040.0, + "32": 4607767040.0, + "33": 4607767040.0, + "34": 4607767040.0, + "35": 4607767040.0, + "36": 4607767040.0, + "37": 4607767040.0, + "38": 4607767040.0, + "39": 4607767040.0, + "40": 4607767040.0, + "41": 4607767040.0, + "42": 4607767040.0, + "43": 4607767040.0, + "44": 4607767040.0, + "45": 4607767040.0, + "46": 4607767040.0, + "47": 4607767040.0, + "48": 4607767040.0, + "49": 4607767040.0, + "50": 4607767040.0, + "51": 4607767040.0, + "52": 4607767040.0, + "53": 4607767040.0, + "54": 4607767040.0, + "55": 4607767040.0, + "56": 4607767040.0, + "57": 4607767040.0, + "58": 4607767040.0, + "59": 4607767040.0, + "60": 4607767040.0, + "61": 4607767040.0, + "62": 4607767040.0, + "63": 4607767040.0, + "64": 4607767040.0, + "65": 4607767040.0, + "66": 4607767040.0, + "67": 4607767040.0, + "68": 4607767040.0, + "69": 4607767040.0, + "70": 4607767040.0, + "71": 4607767040.0, + "72": 4607767040.0, + "73": 4607767040.0, + "74": 4607767040.0, + "75": 4607767040.0, + "76": 4607767040.0, + "77": 4607767040.0, + "78": 4607767040.0, + "79": 4607767040.0, + "80": 4607767040.0, + "81": 4607767040.0, + "82": 4607767040.0, + "83": 4607767040.0, + "84": 4607767040.0, + "85": 4607767040.0, + "86": 4607767040.0, + "87": 4607767040.0, + "88": 4607767040.0, + "89": 4607767040.0, + "90": 4607767040.0, + "91": 4607767040.0, + "92": 4607767040.0, + "93": 4607767040.0, + "94": 4607767040.0, + "95": 4607767040.0, + "96": 4607767040.0, + "97": 4607767040.0, + "98": 4607767040.0, + "99": 4607767040.0, + "100": 4607767040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.06687, + "2": 0.09744, + "3": 0.05659, + "4": 0.05607, + "5": 0.05508, + "6": 0.05545, + "7": 0.06728, + "8": 0.06907, + "9": 0.06794, + "10": 0.05561, + "11": 0.05366, + "12": 0.05478, + "13": 0.05682, + "14": 0.0602, + "15": 0.05987, + "16": 0.05524, + "17": 0.05387, + "18": 0.0976, + "19": 0.06103, + "20": 0.06125, + "21": 0.06399, + "22": 0.06406, + "23": 0.05846, + "24": 0.0595, + "25": 0.05948, + "26": 0.05947, + "27": 0.05843, + "28": 0.06573, + "29": 0.06497, + "30": 0.05987, + "31": 0.05899, + "32": 0.05983, + "33": 0.05828, + "34": 0.06034, + "35": 0.06568, + "36": 0.0606, + "37": 0.05892, + "38": 0.05998, + "39": 0.06244, + "40": 0.06557, + "41": 0.05845, + "42": 0.06012, + "43": 0.05942, + "44": 0.05983, + "45": 0.06123, + "46": 0.06648, + "47": 0.06513, + "48": 0.0599, + "49": 0.05866, + "50": 0.06093, + "51": 0.06536, + "52": 0.06086, + "53": 0.05831, + "54": 0.06064, + "55": 0.05976, + "56": 0.06762, + "57": 0.06301, + "58": 0.05996, + "59": 0.05844, + "60": 0.06016, + "61": 0.05903, + "62": 0.05975, + "63": 0.06658, + "64": 0.06396, + "65": 0.05913, + "66": 0.06025, + "67": 0.0595, + "68": 0.06002, + "69": 0.05954, + "70": 0.06032, + "71": 0.06012, + "72": 0.06048, + "73": 0.05933, + "74": 0.05958, + "75": 0.06007, + "76": 0.06034, + "77": 0.05974, + "78": 0.06035, + "79": 0.06014, + "80": 0.06072, + "81": 0.06083, + "82": 0.062, + "83": 0.05964, + "84": 0.06048, + "85": 0.0602, + "86": 0.0607, + "87": 0.05907, + "88": 0.0636, + "89": 0.06003, + "90": 0.06002, + "91": 0.05858, + "92": 0.06008, + "93": 0.05932, + "94": 0.05884, + "95": 0.05815, + "96": 0.05789, + "97": 0.05853, + "98": 0.05852, + "99": 0.05895, + "100": 0.0617 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": 1155.0, + "19": 1454.0, + "20": 1095.0, + "21": 1230.0, + "22": "nan", + "23": 1357.0, + "24": 1150.0, + "25": 1228.0, + "26": 1202.0, + "27": 1326.0, + "28": 1466.0, + "29": 1438.0, + "30": 1238.0, + "31": 1008.0, + "32": 1160.0, + "33": 1371.0, + "34": 1154.0, + "35": 1295.0, + "36": 1156.0, + "37": 1403.0, + "38": 1487.0, + "39": 1429.0, + "40": 1412.0, + "41": 1458.0, + "42": 1316.0, + "43": 1193.0, + "44": 1323.0, + "45": 1297.0, + "46": 1276.0, + "47": 1868.0, + "48": 1251.0, + "49": 1272.0, + "50": 1524.0, + "51": 1367.0, + "52": 1372.0, + "53": 1715.0, + "54": 1485.0, + "55": 1482.0, + "56": 1473.0, + "57": 1539.0, + "58": 1736.0, + "59": 1661.0, + "60": 1586.0, + "61": 1691.0, + "62": 1865.0, + "63": 1395.0, + "64": 1846.0, + "65": 1428.0, + "66": 1717.0, + "67": 1700.0, + "68": 1750.0, + "69": 1681.0, + "70": 1861.0, + "71": 2048.0, + "72": 1552.0, + "73": 2010.0, + "74": 1344.0, + "75": 1840.0, + "76": 1846.0, + "77": 2034.0, + "78": 2170.0, + "79": 1949.0, + "80": 2077.0, + "81": 2381.0, + "82": 2390.0, + "83": 1843.0, + "84": 2060.0, + "85": 2317.0, + "86": 1958.0, + "87": 2829.0, + "88": 2046.0, + "89": 2260.0, + "90": 2545.0, + "91": 1801.0, + "92": 2505.0, + "93": 2064.0, + "94": 2223.0, + "95": 2379.0, + "96": 2579.0, + "97": 2411.0, + "98": 2500.0, + "99": 2124.0, + "100": 2119.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..8828025e4b4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89618, + "2": 10.89538, + "3": 10.88915, + "4": 10.89094, + "5": 10.8927, + "6": 10.90148, + "7": 10.89392, + "8": 10.90369, + "9": 10.90794, + "10": 10.89108, + "11": 10.88762, + "12": 10.9076, + "13": 10.91429, + "14": 10.90654, + "15": 10.90227, + "16": 10.91042, + "17": 10.89896, + "18": 10.90666, + "19": 10.89908, + "20": 10.90133, + "21": 10.91713, + "22": 10.89139, + "23": 10.90085, + "24": 10.89366, + "25": 10.89372, + "26": 10.87372, + "27": 10.87917, + "28": 10.88756, + "29": 10.85461, + "30": 10.83891, + "31": 10.75166, + "32": 10.8278, + "33": 10.80306, + "34": 10.73559, + "35": 10.7301, + "36": 10.69318, + "37": 10.72854, + "38": 10.65364, + "39": 10.71672, + "40": 10.56996, + "41": 10.58467, + "42": 10.59853, + "43": 10.3948, + "44": 10.44431, + "45": 10.3452, + "46": 10.31919, + "47": 10.49671, + "48": 10.31281, + "49": 10.09084, + "50": 10.31089, + "51": 10.25547, + "52": 10.15856, + "53": 10.38114, + "54": 10.2992, + "55": 10.23806, + "56": 10.00726, + "57": 9.87765, + "58": 10.15279, + "59": 9.94207, + "60": 9.8666, + "61": 10.00032, + "62": 10.23443, + "63": 9.71917, + "64": 10.04209, + "65": 9.30009, + "66": 9.95537, + "67": 9.6499, + "68": 10.00402, + "69": 9.99988, + "70": 9.96383, + "71": 9.84259, + "72": 9.81258, + "73": 9.70921, + "74": 9.19832, + "75": 9.61686, + "76": 9.28859, + "77": 10.20416, + "78": 9.88378, + "79": 9.54296, + "80": 9.57095, + "81": 9.64006, + "82": 9.83648, + "83": 9.47691, + "84": 9.54866, + "85": 9.75198, + "86": 9.21427, + "87": 9.70607, + "88": 9.87307, + "89": 9.72876, + "90": 9.92353, + "91": 9.48236, + "92": 9.47671, + "93": 9.20895, + "94": 8.9625, + "95": 9.62369, + "96": 9.64228, + "97": 9.41575, + "98": 9.77515, + "99": 9.00692, + "100": 9.51305 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 454770688.0, + "2": 454770688.0, + "3": 454770688.0, + "4": 454770688.0, + "5": 454770688.0, + "6": 454770688.0, + "7": 454770688.0, + "8": 454770688.0, + "9": 454770688.0, + "10": 454770688.0, + "11": 454770688.0, + "12": 454770688.0, + "13": 454770688.0, + "14": 454770688.0, + "15": 454770688.0, + "16": 454770688.0, + "17": 454770688.0, + "18": 518880768.0, + "19": 518880768.0, + "20": 518880768.0, + "21": 518880768.0, + "22": 518880768.0, + "23": 518880768.0, + "24": 518880768.0, + "25": 518880768.0, + "26": 518880768.0, + "27": 518880768.0, + "28": 518880768.0, + "29": 518880768.0, + "30": 518880768.0, + "31": 518880768.0, + "32": 518880768.0, + "33": 518880768.0, + "34": 518880768.0, + "35": 518880768.0, + "36": 518880768.0, + "37": 518880768.0, + "38": 518880768.0, + "39": 518880768.0, + "40": 518880768.0, + "41": 518880768.0, + "42": 518880768.0, + "43": 518880768.0, + "44": 518880768.0, + "45": 518880768.0, + "46": 518880768.0, + "47": 518880768.0, + "48": 518880768.0, + "49": 518880768.0, + "50": 518880768.0, + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, + "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, + "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, + "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, + "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, + "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, + "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, + "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, + "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, + "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, + "100": 518880768.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4511150592.0, + "2": 4544705536.0, + "3": 4544705536.0, + "4": 4544705536.0, + "5": 4544705536.0, + "6": 4544705536.0, + "7": 4544705536.0, + "8": 4544705536.0, + "9": 4544705536.0, + "10": 4544705536.0, + "11": 4544705536.0, + "12": 4544705536.0, + "13": 4544705536.0, + "14": 4544705536.0, + "15": 4544705536.0, + "16": 4544705536.0, + "17": 4544705536.0, + "18": 4544705536.0, + "19": 4607767040.0, + "20": 4607767040.0, + "21": 4607767040.0, + "22": 4607767040.0, + "23": 4607767040.0, + "24": 4607767040.0, + "25": 4607767040.0, + "26": 4607767040.0, + "27": 4607767040.0, + "28": 4607767040.0, + "29": 4607767040.0, + "30": 4607767040.0, + "31": 4607767040.0, + "32": 4607767040.0, + "33": 4607767040.0, + "34": 4607767040.0, + "35": 4607767040.0, + "36": 4607767040.0, + "37": 4607767040.0, + "38": 4607767040.0, + "39": 4607767040.0, + "40": 4607767040.0, + "41": 4607767040.0, + "42": 4607767040.0, + "43": 4607767040.0, + "44": 4607767040.0, + "45": 4607767040.0, + "46": 4607767040.0, + "47": 4607767040.0, + "48": 4607767040.0, + "49": 4607767040.0, + "50": 4607767040.0, + "51": 4607767040.0, + "52": 4607767040.0, + "53": 4607767040.0, + "54": 4607767040.0, + "55": 4607767040.0, + "56": 4607767040.0, + "57": 4607767040.0, + "58": 4607767040.0, + "59": 4607767040.0, + "60": 4607767040.0, + "61": 4607767040.0, + "62": 4607767040.0, + "63": 4607767040.0, + "64": 4607767040.0, + "65": 4607767040.0, + "66": 4607767040.0, + "67": 4607767040.0, + "68": 4607767040.0, + "69": 4607767040.0, + "70": 4607767040.0, + "71": 4607767040.0, + "72": 4607767040.0, + "73": 4607767040.0, + "74": 4607767040.0, + "75": 4607767040.0, + "76": 4607767040.0, + "77": 4607767040.0, + "78": 4607767040.0, + "79": 4607767040.0, + "80": 4607767040.0, + "81": 4607767040.0, + "82": 4607767040.0, + "83": 4607767040.0, + "84": 4607767040.0, + "85": 4607767040.0, + "86": 4607767040.0, + "87": 4607767040.0, + "88": 4607767040.0, + "89": 4607767040.0, + "90": 4607767040.0, + "91": 4607767040.0, + "92": 4607767040.0, + "93": 4607767040.0, + "94": 4607767040.0, + "95": 4607767040.0, + "96": 4607767040.0, + "97": 4607767040.0, + "98": 4607767040.0, + "99": 4607767040.0, + "100": 4607767040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.49307, + "2": 0.10356, + "3": 0.08062, + "4": 0.0772, + "5": 0.07555, + "6": 0.06677, + "7": 0.06434, + "8": 0.06228, + "9": 0.0624, + "10": 0.06213, + "11": 0.06353, + "12": 0.0622, + "13": 0.06377, + "14": 0.06323, + "15": 0.06296, + "16": 0.06251, + "17": 0.06382, + "18": 0.11433, + "19": 0.07262, + "20": 0.07222, + "21": 0.07613, + "22": 0.06977, + "23": 0.06664, + "24": 0.07256, + "25": 0.07344, + "26": 0.0723, + "27": 0.07264, + "28": 0.0697, + "29": 0.06998, + "30": 0.06785, + "31": 0.07022, + "32": 0.06834, + "33": 0.06679, + "34": 0.0678, + "35": 0.0679, + "36": 0.0679, + "37": 0.06826, + "38": 0.06821, + "39": 0.0665, + "40": 0.06798, + "41": 0.06816, + "42": 0.06816, + "43": 0.06901, + "44": 0.06772, + "45": 0.06849, + "46": 0.06843, + "47": 0.06773, + "48": 0.06705, + "49": 0.06755, + "50": 0.06844, + "51": 0.0971, + "52": 0.06968, + "53": 0.06915, + "54": 0.06982, + "55": 0.0703, + "56": 0.07014, + "57": 0.07047, + "58": 0.06835, + "59": 0.07077, + "60": 0.06886, + "61": 0.06929, + "62": 0.06887, + "63": 0.06946, + "64": 0.06924, + "65": 0.06987, + "66": 0.06898, + "67": 0.06873, + "68": 0.0695, + "69": 0.0712, + "70": 0.06928, + "71": 0.0692, + "72": 0.07014, + "73": 0.06964, + "74": 0.06884, + "75": 0.06897, + "76": 0.07036, + "77": 0.0693, + "78": 0.06905, + "79": 0.0698, + "80": 0.06831, + "81": 0.06969, + "82": 0.06871, + "83": 0.07059, + "84": 0.06905, + "85": 0.06955, + "86": 0.06926, + "87": 0.06905, + "88": 0.06912, + "89": 0.07039, + "90": 0.06895, + "91": 0.069, + "92": 0.0698, + "93": 0.06946, + "94": 0.06825, + "95": 0.06933, + "96": 0.06851, + "97": 0.06883, + "98": 0.07421, + "99": 0.06926, + "100": 0.07018 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": 1155.0, + "19": 1454.0, + "20": 1095.0, + "21": 1230.0, + "22": "nan", + "23": 1357.0, + "24": 1150.0, + "25": 1228.0, + "26": 1202.0, + "27": 1326.0, + "28": 1466.0, + "29": 1438.0, + "30": 1238.0, + "31": 1008.0, + "32": 1160.0, + "33": 1371.0, + "34": 1154.0, + "35": 1295.0, + "36": 1156.0, + "37": 1403.0, + "38": 1487.0, + "39": 1429.0, + "40": 1412.0, + "41": 1458.0, + "42": 1316.0, + "43": 1193.0, + "44": 1323.0, + "45": 1297.0, + "46": 1276.0, + "47": 1868.0, + "48": 1251.0, + "49": 1272.0, + "50": 1524.0, + "51": 1367.0, + "52": 1372.0, + "53": 1715.0, + "54": 1485.0, + "55": 1482.0, + "56": 1473.0, + "57": 1539.0, + "58": 1736.0, + "59": 1661.0, + "60": 1586.0, + "61": 1691.0, + "62": 1865.0, + "63": 1395.0, + "64": 1846.0, + "65": 1428.0, + "66": 1717.0, + "67": 1700.0, + "68": 1750.0, + "69": 1681.0, + "70": 1861.0, + "71": 2048.0, + "72": 1552.0, + "73": 2010.0, + "74": 1344.0, + "75": 1840.0, + "76": 1846.0, + "77": 2034.0, + "78": 2170.0, + "79": 1949.0, + "80": 2077.0, + "81": 2381.0, + "82": 2390.0, + "83": 1843.0, + "84": 2060.0, + "85": 2317.0, + "86": 1958.0, + "87": 2829.0, + "88": 2046.0, + "89": 2260.0, + "90": 2545.0, + "91": 1801.0, + "92": 2505.0, + "93": 2064.0, + "94": 2223.0, + "95": 2379.0, + "96": 2579.0, + "97": 2411.0, + "98": 2500.0, + "99": 2124.0, + "100": 2119.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..2dcf90e989f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84012, + "2": 10.83568, + "3": 10.83117, + "4": 10.81867, + "5": 10.84147, + "6": 10.87385, + "7": 10.83678, + "8": 10.84423, + "9": 10.84878, + "10": 10.82107, + "11": 10.85906, + "12": 10.85708, + "13": 10.88186, + "14": 10.87751, + "15": 10.85423, + "16": 10.85071, + "17": 10.84178, + "18": 10.86599, + "19": 10.86171, + "20": 10.85111, + "21": 10.85522, + "22": 10.82384, + "23": 10.86861, + "24": 10.82812, + "25": 10.82229, + "26": 10.83266, + "27": 10.82346, + "28": 10.84367, + "29": 10.83315, + "30": 10.75584, + "31": 10.66438, + "32": 10.78744, + "33": 10.76542, + "34": 10.67705, + "35": 10.68389, + "36": 10.63442, + "37": 10.68265, + "38": 10.6013, + "39": 10.69422, + "40": 10.52756, + "41": 10.54166, + "42": 10.56471, + "43": 10.34495, + "44": 10.38785, + "45": 10.3119, + "46": 10.3021, + "47": 10.479, + "48": 10.28168, + "49": 10.05783, + "50": 10.29392, + "51": 10.2381, + "52": 10.15425, + "53": 10.35958, + "54": 10.26866, + "55": 10.21882, + "56": 9.9963, + "57": 9.87322, + "58": 10.14154, + "59": 9.93616, + "60": 9.8477, + "61": 9.98627, + "62": 10.21642, + "63": 9.69005, + "64": 10.01919, + "65": 9.30027, + "66": 9.9353, + "67": 9.63074, + "68": 9.99036, + "69": 9.98369, + "70": 9.92473, + "71": 9.81441, + "72": 9.79281, + "73": 9.67937, + "74": 9.19331, + "75": 9.60615, + "76": 9.28477, + "77": 10.18543, + "78": 9.86681, + "79": 9.52304, + "80": 9.55867, + "81": 9.62718, + "82": 9.81491, + "83": 9.45803, + "84": 9.53679, + "85": 9.7331, + "86": 9.20021, + "87": 9.69537, + "88": 9.85367, + "89": 9.7164, + "90": 9.91024, + "91": 9.46125, + "92": 9.46592, + "93": 9.19252, + "94": 8.94116, + "95": 9.60586, + "96": 9.62228, + "97": 9.39813, + "98": 9.76041, + "99": 8.9914, + "100": 9.49453 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 454770688.0, + "2": 454770688.0, + "3": 454770688.0, + "4": 454770688.0, + "5": 454770688.0, + "6": 454770688.0, + "7": 454770688.0, + "8": 454770688.0, + "9": 454770688.0, + "10": 454770688.0, + "11": 454770688.0, + "12": 454770688.0, + "13": 454770688.0, + "14": 454770688.0, + "15": 454770688.0, + "16": 454770688.0, + "17": 454770688.0, + "18": 518880768.0, + "19": 518880768.0, + "20": 518880768.0, + "21": 518880768.0, + "22": 518880768.0, + "23": 518880768.0, + "24": 518880768.0, + "25": 518880768.0, + "26": 518880768.0, + "27": 518880768.0, + "28": 518880768.0, + "29": 518880768.0, + "30": 518880768.0, + "31": 518880768.0, + "32": 518880768.0, + "33": 518880768.0, + "34": 518880768.0, + "35": 518880768.0, + "36": 518880768.0, + "37": 518880768.0, + "38": 518880768.0, + "39": 518880768.0, + "40": 518880768.0, + "41": 518880768.0, + "42": 518880768.0, + "43": 518880768.0, + "44": 518880768.0, + "45": 518880768.0, + "46": 518880768.0, + "47": 518880768.0, + "48": 518880768.0, + "49": 518880768.0, + "50": 518880768.0, + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, + "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, + "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, + "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, + "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, + "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, + "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, + "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, + "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, + "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, + "100": 518880768.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4511150592.0, + "2": 4544705536.0, + "3": 4544705536.0, + "4": 4544705536.0, + "5": 4544705536.0, + "6": 4544705536.0, + "7": 4544705536.0, + "8": 4544705536.0, + "9": 4544705536.0, + "10": 4544705536.0, + "11": 4544705536.0, + "12": 4544705536.0, + "13": 4544705536.0, + "14": 4544705536.0, + "15": 4544705536.0, + "16": 4544705536.0, + "17": 4544705536.0, + "18": 4544705536.0, + "19": 4607767040.0, + "20": 4607767040.0, + "21": 4607767040.0, + "22": 4607767040.0, + "23": 4607767040.0, + "24": 4607767040.0, + "25": 4607767040.0, + "26": 4607767040.0, + "27": 4607767040.0, + "28": 4607767040.0, + "29": 4607767040.0, + "30": 4607767040.0, + "31": 4607767040.0, + "32": 4607767040.0, + "33": 4607767040.0, + "34": 4607767040.0, + "35": 4607767040.0, + "36": 4607767040.0, + "37": 4607767040.0, + "38": 4607767040.0, + "39": 4607767040.0, + "40": 4607767040.0, + "41": 4607767040.0, + "42": 4607767040.0, + "43": 4607767040.0, + "44": 4607767040.0, + "45": 4607767040.0, + "46": 4607767040.0, + "47": 4607767040.0, + "48": 4607767040.0, + "49": 4607767040.0, + "50": 4607767040.0, + "51": 4607767040.0, + "52": 4607767040.0, + "53": 4607767040.0, + "54": 4607767040.0, + "55": 4607767040.0, + "56": 4607767040.0, + "57": 4607767040.0, + "58": 4607767040.0, + "59": 4607767040.0, + "60": 4607767040.0, + "61": 4607767040.0, + "62": 4607767040.0, + "63": 4607767040.0, + "64": 4607767040.0, + "65": 4607767040.0, + "66": 4607767040.0, + "67": 4607767040.0, + "68": 4607767040.0, + "69": 4607767040.0, + "70": 4607767040.0, + "71": 4607767040.0, + "72": 4607767040.0, + "73": 4607767040.0, + "74": 4607767040.0, + "75": 4607767040.0, + "76": 4607767040.0, + "77": 4607767040.0, + "78": 4607767040.0, + "79": 4607767040.0, + "80": 4607767040.0, + "81": 4607767040.0, + "82": 4607767040.0, + "83": 4607767040.0, + "84": 4607767040.0, + "85": 4607767040.0, + "86": 4607767040.0, + "87": 4607767040.0, + "88": 4607767040.0, + "89": 4607767040.0, + "90": 4607767040.0, + "91": 4607767040.0, + "92": 4607767040.0, + "93": 4607767040.0, + "94": 4607767040.0, + "95": 4607767040.0, + "96": 4607767040.0, + "97": 4607767040.0, + "98": 4607767040.0, + "99": 4607767040.0, + "100": 4607767040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.15163, + "2": 0.14001, + "3": 0.09738, + "4": 0.09666, + "5": 0.09591, + "6": 0.09502, + "7": 0.30332, + "8": 0.09429, + "9": 0.09574, + "10": 0.32414, + "11": 0.10077, + "12": 0.09969, + "13": 0.10068, + "14": 0.09948, + "15": 0.09294, + "16": 0.09255, + "17": 0.09477, + "18": 0.14327, + "19": 0.10341, + "20": 0.10247, + "21": 0.11373, + "22": 0.09883, + "23": 0.1005, + "24": 0.10247, + "25": 0.10217, + "26": 0.10239, + "27": 0.36118, + "28": 0.10234, + "29": 0.1012, + "30": 0.10299, + "31": 0.1015, + "32": 0.10188, + "33": 0.32101, + "34": 0.10218, + "35": 0.10166, + "36": 0.10235, + "37": 0.10172, + "38": 0.10247, + "39": 0.10164, + "40": 0.10267, + "41": 0.1028, + "42": 0.10313, + "43": 0.1019, + "44": 0.10268, + "45": 0.10251, + "46": 0.10335, + "47": 0.10126, + "48": 0.10332, + "49": 0.10228, + "50": 0.10227, + "51": 0.10617, + "52": 0.10408, + "53": 0.10202, + "54": 0.10229, + "55": 0.10292, + "56": 0.10208, + "57": 0.10265, + "58": 0.10167, + "59": 0.1041, + "60": 0.10412, + "61": 0.10262, + "62": 0.10173, + "63": 0.10364, + "64": 0.10282, + "65": 0.10402, + "66": 0.10211, + "67": 0.10345, + "68": 0.10307, + "69": 0.10364, + "70": 0.10244, + "71": 0.10307, + "72": 0.10282, + "73": 0.10422, + "74": 0.1031, + "75": 0.10272, + "76": 0.10576, + "77": 0.10322, + "78": 0.10398, + "79": 0.10274, + "80": 0.10278, + "81": 0.10314, + "82": 0.10329, + "83": 0.10412, + "84": 0.10207, + "85": 0.10239, + "86": 0.10321, + "87": 0.10221, + "88": 0.10195, + "89": 0.10399, + "90": 0.10279, + "91": 0.10252, + "92": 0.10385, + "93": 0.10387, + "94": 0.10226, + "95": 0.10105, + "96": 0.10245, + "97": 0.10298, + "98": 0.1036, + "99": 0.10248, + "100": 0.10187 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": 1199.0, + "19": 1499.0, + "20": 1143.0, + "21": 1307.0, + "22": "nan", + "23": 1326.0, + "24": 1091.0, + "25": 1185.0, + "26": 1131.0, + "27": 1294.0, + "28": 1528.0, + "29": 1487.0, + "30": 1375.0, + "31": 1058.0, + "32": 1170.0, + "33": 1406.0, + "34": 1265.0, + "35": 1207.0, + "36": 1197.0, + "37": 1581.0, + "38": 1477.0, + "39": 1542.0, + "40": 1423.0, + "41": 1538.0, + "42": 1460.0, + "43": 1153.0, + "44": 1282.0, + "45": 1344.0, + "46": 1162.0, + "47": 1831.0, + "48": 1308.0, + "49": 1218.0, + "50": 1559.0, + "51": 1515.0, + "52": 1569.0, + "53": 1758.0, + "54": 1439.0, + "55": 1573.0, + "56": 1418.0, + "57": 1514.0, + "58": 1624.0, + "59": 1622.0, + "60": 1564.0, + "61": 1714.0, + "62": 1854.0, + "63": 1577.0, + "64": 1773.0, + "65": 1496.0, + "66": 1668.0, + "67": 1597.0, + "68": 1804.0, + "69": 1804.0, + "70": 1898.0, + "71": 1957.0, + "72": 1568.0, + "73": 2020.0, + "74": 1322.0, + "75": 1893.0, + "76": 1826.0, + "77": 2136.0, + "78": 2137.0, + "79": 1990.0, + "80": 2134.0, + "81": 2465.0, + "82": 2240.0, + "83": 1883.0, + "84": 2128.0, + "85": 2231.0, + "86": 1998.0, + "87": 2747.0, + "88": 2122.0, + "89": 2331.0, + "90": 2378.0, + "91": 1880.0, + "92": 2563.0, + "93": 2065.0, + "94": 2127.0, + "95": 2285.0, + "96": 2665.0, + "97": 2514.0, + "98": 2516.0, + "99": 2265.0, + "100": 2233.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..ad019904f52 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84012, + "2": 10.83568, + "3": 10.83117, + "4": 10.81867, + "5": 10.84147, + "6": 10.87385, + "7": 10.83678, + "8": 10.84423, + "9": 10.84878, + "10": 10.82107, + "11": 10.85906, + "12": 10.85708, + "13": 10.88186, + "14": 10.87751, + "15": 10.85423, + "16": 10.85071, + "17": 10.84178, + "18": 10.86599, + "19": 10.86171, + "20": 10.85111, + "21": 10.85522, + "22": 10.82384, + "23": 10.86861, + "24": 10.82812, + "25": 10.82229, + "26": 10.83266, + "27": 10.82346, + "28": 10.84367, + "29": 10.83315, + "30": 10.75584, + "31": 10.66438, + "32": 10.78744, + "33": 10.76542, + "34": 10.67705, + "35": 10.68389, + "36": 10.63442, + "37": 10.68265, + "38": 10.6013, + "39": 10.69422, + "40": 10.52756, + "41": 10.54166, + "42": 10.56471, + "43": 10.34495, + "44": 10.38785, + "45": 10.3119, + "46": 10.3021, + "47": 10.479, + "48": 10.28168, + "49": 10.05783, + "50": 10.29392, + "51": 10.2381, + "52": 10.15425, + "53": 10.35958, + "54": 10.26866, + "55": 10.21882, + "56": 9.9963, + "57": 9.87322, + "58": 10.14154, + "59": 9.93616, + "60": 9.8477, + "61": 9.98627, + "62": 10.21642, + "63": 9.69005, + "64": 10.01919, + "65": 9.30027, + "66": 9.9353, + "67": 9.63074, + "68": 9.99036, + "69": 9.98369, + "70": 9.92473, + "71": 9.81441, + "72": 9.79281, + "73": 9.67937, + "74": 9.19331, + "75": 9.60615, + "76": 9.28477, + "77": 10.18543, + "78": 9.86681, + "79": 9.52304, + "80": 9.55867, + "81": 9.62718, + "82": 9.81491, + "83": 9.45803, + "84": 9.53679, + "85": 9.7331, + "86": 9.20021, + "87": 9.69537, + "88": 9.85367, + "89": 9.7164, + "90": 9.91024, + "91": 9.46125, + "92": 9.46592, + "93": 9.19252, + "94": 8.94116, + "95": 9.60586, + "96": 9.62228, + "97": 9.39813, + "98": 9.76041, + "99": 8.9914, + "100": 9.49453 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 454770688.0, + "2": 454770688.0, + "3": 454770688.0, + "4": 454770688.0, + "5": 454770688.0, + "6": 454770688.0, + "7": 454770688.0, + "8": 454770688.0, + "9": 454770688.0, + "10": 454770688.0, + "11": 454770688.0, + "12": 454770688.0, + "13": 454770688.0, + "14": 454770688.0, + "15": 454770688.0, + "16": 454770688.0, + "17": 454770688.0, + "18": 518880768.0, + "19": 518880768.0, + "20": 518880768.0, + "21": 518880768.0, + "22": 518880768.0, + "23": 518880768.0, + "24": 518880768.0, + "25": 518880768.0, + "26": 518880768.0, + "27": 518880768.0, + "28": 518880768.0, + "29": 518880768.0, + "30": 518880768.0, + "31": 518880768.0, + "32": 518880768.0, + "33": 518880768.0, + "34": 518880768.0, + "35": 518880768.0, + "36": 518880768.0, + "37": 518880768.0, + "38": 518880768.0, + "39": 518880768.0, + "40": 518880768.0, + "41": 518880768.0, + "42": 518880768.0, + "43": 518880768.0, + "44": 518880768.0, + "45": 518880768.0, + "46": 518880768.0, + "47": 518880768.0, + "48": 518880768.0, + "49": 518880768.0, + "50": 518880768.0, + "51": 518880768.0, + "52": 518880768.0, + "53": 518880768.0, + "54": 518880768.0, + "55": 518880768.0, + "56": 518880768.0, + "57": 518880768.0, + "58": 518880768.0, + "59": 518880768.0, + "60": 518880768.0, + "61": 518880768.0, + "62": 518880768.0, + "63": 518880768.0, + "64": 518880768.0, + "65": 518880768.0, + "66": 518880768.0, + "67": 518880768.0, + "68": 518880768.0, + "69": 518880768.0, + "70": 518880768.0, + "71": 518880768.0, + "72": 518880768.0, + "73": 518880768.0, + "74": 518880768.0, + "75": 518880768.0, + "76": 518880768.0, + "77": 518880768.0, + "78": 518880768.0, + "79": 518880768.0, + "80": 518880768.0, + "81": 518880768.0, + "82": 518880768.0, + "83": 518880768.0, + "84": 518880768.0, + "85": 518880768.0, + "86": 518880768.0, + "87": 518880768.0, + "88": 518880768.0, + "89": 518880768.0, + "90": 518880768.0, + "91": 518880768.0, + "92": 518880768.0, + "93": 518880768.0, + "94": 518880768.0, + "95": 518880768.0, + "96": 518880768.0, + "97": 518880768.0, + "98": 518880768.0, + "99": 518880768.0, + "100": 518880768.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4511150592.0, + "2": 4544705536.0, + "3": 4544705536.0, + "4": 4544705536.0, + "5": 4544705536.0, + "6": 4544705536.0, + "7": 4544705536.0, + "8": 4544705536.0, + "9": 4544705536.0, + "10": 4544705536.0, + "11": 4544705536.0, + "12": 4544705536.0, + "13": 4544705536.0, + "14": 4544705536.0, + "15": 4544705536.0, + "16": 4544705536.0, + "17": 4544705536.0, + "18": 4544705536.0, + "19": 4607767040.0, + "20": 4607767040.0, + "21": 4607767040.0, + "22": 4607767040.0, + "23": 4607767040.0, + "24": 4607767040.0, + "25": 4607767040.0, + "26": 4607767040.0, + "27": 4607767040.0, + "28": 4607767040.0, + "29": 4607767040.0, + "30": 4607767040.0, + "31": 4607767040.0, + "32": 4607767040.0, + "33": 4607767040.0, + "34": 4607767040.0, + "35": 4607767040.0, + "36": 4607767040.0, + "37": 4607767040.0, + "38": 4607767040.0, + "39": 4607767040.0, + "40": 4607767040.0, + "41": 4607767040.0, + "42": 4607767040.0, + "43": 4607767040.0, + "44": 4607767040.0, + "45": 4607767040.0, + "46": 4607767040.0, + "47": 4607767040.0, + "48": 4607767040.0, + "49": 4607767040.0, + "50": 4607767040.0, + "51": 4607767040.0, + "52": 4607767040.0, + "53": 4607767040.0, + "54": 4607767040.0, + "55": 4607767040.0, + "56": 4607767040.0, + "57": 4607767040.0, + "58": 4607767040.0, + "59": 4607767040.0, + "60": 4607767040.0, + "61": 4607767040.0, + "62": 4607767040.0, + "63": 4607767040.0, + "64": 4607767040.0, + "65": 4607767040.0, + "66": 4607767040.0, + "67": 4607767040.0, + "68": 4607767040.0, + "69": 4607767040.0, + "70": 4607767040.0, + "71": 4607767040.0, + "72": 4607767040.0, + "73": 4607767040.0, + "74": 4607767040.0, + "75": 4607767040.0, + "76": 4607767040.0, + "77": 4607767040.0, + "78": 4607767040.0, + "79": 4607767040.0, + "80": 4607767040.0, + "81": 4607767040.0, + "82": 4607767040.0, + "83": 4607767040.0, + "84": 4607767040.0, + "85": 4607767040.0, + "86": 4607767040.0, + "87": 4607767040.0, + "88": 4607767040.0, + "89": 4607767040.0, + "90": 4607767040.0, + "91": 4607767040.0, + "92": 4607767040.0, + "93": 4607767040.0, + "94": 4607767040.0, + "95": 4607767040.0, + "96": 4607767040.0, + "97": 4607767040.0, + "98": 4607767040.0, + "99": 4607767040.0, + "100": 4607767040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.67451, + "2": 0.15078, + "3": 0.09855, + "4": 0.09629, + "5": 0.09742, + "6": 0.09583, + "7": 0.09793, + "8": 0.09606, + "9": 0.10504, + "10": 0.09835, + "11": 0.0952, + "12": 0.09441, + "13": 0.0944, + "14": 0.0943, + "15": 0.09542, + "16": 0.09535, + "17": 0.0966, + "18": 0.13822, + "19": 0.10314, + "20": 0.10196, + "21": 0.10307, + "22": 0.09787, + "23": 0.11254, + "24": 0.10384, + "25": 0.10311, + "26": 0.10301, + "27": 0.10387, + "28": 0.10266, + "29": 0.10411, + "30": 0.11398, + "31": 0.32837, + "32": 0.10305, + "33": 0.10287, + "34": 0.10161, + "35": 0.10254, + "36": 0.10257, + "37": 0.10309, + "38": 0.10366, + "39": 0.1025, + "40": 0.1018, + "41": 0.10351, + "42": 0.10149, + "43": 0.10316, + "44": 0.10083, + "45": 0.10239, + "46": 0.34508, + "47": 0.10287, + "48": 0.36063, + "49": 0.10328, + "50": 0.10084, + "51": 0.10526, + "52": 0.10046, + "53": 0.09909, + "54": 0.09965, + "55": 0.09957, + "56": 0.09996, + "57": 0.09902, + "58": 0.1004, + "59": 0.10194, + "60": 0.101, + "61": 0.09902, + "62": 0.10015, + "63": 0.09937, + "64": 0.1003, + "65": 0.09988, + "66": 0.10055, + "67": 0.09976, + "68": 0.10001, + "69": 0.10157, + "70": 0.10136, + "71": 0.09951, + "72": 0.10026, + "73": 0.09946, + "74": 0.10113, + "75": 0.09881, + "76": 0.1007, + "77": 0.09917, + "78": 0.09983, + "79": 0.10051, + "80": 0.10101, + "81": 0.09942, + "82": 0.09995, + "83": 0.09932, + "84": 0.10088, + "85": 0.0992, + "86": 0.10084, + "87": 0.099, + "88": 0.0997, + "89": 0.10146, + "90": 0.10228, + "91": 0.09992, + "92": 0.09981, + "93": 0.09937, + "94": 0.10022, + "95": 0.09934, + "96": 0.10011, + "97": 0.09912, + "98": 0.09963, + "99": 0.10098, + "100": 0.10322 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": "nan", + "18": 1199.0, + "19": 1499.0, + "20": 1143.0, + "21": 1307.0, + "22": "nan", + "23": 1326.0, + "24": 1091.0, + "25": 1185.0, + "26": 1131.0, + "27": 1294.0, + "28": 1528.0, + "29": 1487.0, + "30": 1375.0, + "31": 1058.0, + "32": 1170.0, + "33": 1406.0, + "34": 1265.0, + "35": 1207.0, + "36": 1197.0, + "37": 1581.0, + "38": 1477.0, + "39": 1542.0, + "40": 1423.0, + "41": 1538.0, + "42": 1460.0, + "43": 1153.0, + "44": 1282.0, + "45": 1344.0, + "46": 1162.0, + "47": 1831.0, + "48": 1308.0, + "49": 1218.0, + "50": 1559.0, + "51": 1515.0, + "52": 1569.0, + "53": 1758.0, + "54": 1439.0, + "55": 1573.0, + "56": 1418.0, + "57": 1514.0, + "58": 1624.0, + "59": 1622.0, + "60": 1564.0, + "61": 1714.0, + "62": 1854.0, + "63": 1577.0, + "64": 1773.0, + "65": 1496.0, + "66": 1668.0, + "67": 1597.0, + "68": 1804.0, + "69": 1804.0, + "70": 1898.0, + "71": 1957.0, + "72": 1568.0, + "73": 2020.0, + "74": 1322.0, + "75": 1893.0, + "76": 1826.0, + "77": 2136.0, + "78": 2137.0, + "79": 1990.0, + "80": 2134.0, + "81": 2465.0, + "82": 2240.0, + "83": 1883.0, + "84": 2128.0, + "85": 2231.0, + "86": 1998.0, + "87": 2747.0, + "88": 2122.0, + "89": 2331.0, + "90": 2378.0, + "91": 1880.0, + "92": 2563.0, + "93": 2065.0, + "94": 2127.0, + "95": 2285.0, + "96": 2665.0, + "97": 2514.0, + "98": 2516.0, + "99": 2265.0, + "100": 2233.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json index 2e7b0f25d33..f558db5b4f0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86114, + "2": 10.86847, + "3": 10.86465, + "4": 10.86473, "5": 10.87296, + "6": 10.88615, + "7": 10.8645, + "8": 10.87335, + "9": 10.87481, "10": 10.83903, + "11": 10.86614, + "12": 10.86169, + "13": 10.87354, + "14": 10.87593, "15": 10.8216, + "16": 10.83071, + "17": 10.79411, + "18": 10.81433, + "19": 10.80011, "20": 10.71697, + "21": 10.70154, + "22": 10.57235, + "23": 10.70749, + "24": 10.6006, "25": 10.5566, + "26": 10.60138, + "27": 10.60955, + "28": 10.55626, + "29": 10.57268, "30": 10.36032, + "31": 10.11454, + "32": 10.45937, + "33": 10.45389, + "34": 10.21168, "35": 10.26583, + "36": 10.21483, + "37": 10.34814, + "38": 10.19787, + "39": 10.39713, "40": 10.08719, + "41": 10.13539, + "42": 10.20638, + "43": 9.82769, + "44": 9.95444, "45": 9.82374, + "46": 9.79864, + "47": 10.12579, + "48": 9.83547, + "49": 9.51888, "50": 9.90498 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1696.0, + "2": 1671.0, + "3": 1537.0, + "4": 1705.0, "5": 1776.0, + "6": 1735.0, + "7": 1767.0, + "8": 1569.0, + "9": 1750.0, "10": 1413.0, + "11": 1746.0, + "12": 1681.0, + "13": 1828.0, + "14": 1739.0, "15": 1801.0, + "16": 1895.0, + "17": 1781.0, + "18": 1693.0, + "19": 1705.0, "20": 1624.0, + "21": 1838.0, + "22": 1792.0, + "23": 2005.0, + "24": 1601.0, "25": 1483.0, + "26": 1615.0, + "27": 1844.0, + "28": 1961.0, + "29": 2012.0, "30": 1856.0, + "31": 1502.0, + "32": 1794.0, + "33": 2118.0, + "34": 1742.0, "35": 1953.0, + "36": 1940.0, + "37": 2324.0, + "38": 2109.0, + "39": 2369.0, "40": 2183.0, + "41": 2063.0, + "42": 2232.0, + "43": 1917.0, + "44": 2084.0, "45": 2058.0, + "46": 2144.0, + "47": 2488.0, + "48": 2407.0, + "49": 2125.0, "50": 2134.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 952847360.0, + "2": 952847360.0, + "3": 952847360.0, + "4": 952847360.0, "5": 952847360.0, + "6": 952847360.0, + "7": 952847360.0, + "8": 952847360.0, + "9": 952847360.0, "10": 952847360.0, + "11": 952847360.0, + "12": 952847360.0, + "13": 952847360.0, + "14": 952847360.0, "15": 952847360.0, + "16": 952847360.0, + "17": 952847360.0, + "18": 952847360.0, + "19": 952847360.0, "20": 952847360.0, + "21": 952847360.0, + "22": 952847360.0, + "23": 952847360.0, + "24": 952847360.0, "25": 952847360.0, + "26": 952847360.0, + "27": 952847360.0, + "28": 952847360.0, + "29": 952847360.0, "30": 952847360.0, + "31": 952847360.0, + "32": 952847360.0, + "33": 952847360.0, + "34": 952847360.0, "35": 952847360.0, + "36": 952847360.0, + "37": 952847360.0, + "38": 952847360.0, + "39": 952847360.0, "40": 952847360.0, + "41": 952847360.0, + "42": 952847360.0, + "43": 952847360.0, + "44": 952847360.0, "45": 952847360.0, + "46": 952847360.0, + "47": 952847360.0, + "48": 952847360.0, + "49": 952847360.0, "50": 952847360.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3275808768.0, + "2": 3637371904.0, + "3": 3637371904.0, + "4": 3637371904.0, "5": 3637371904.0, + "6": 3637371904.0, + "7": 3637371904.0, + "8": 3637371904.0, + "9": 3637371904.0, "10": 3637371904.0, + "11": 3637371904.0, + "12": 3637371904.0, + "13": 3637371904.0, + "14": 3637371904.0, "15": 3637371904.0, + "16": 3637371904.0, + "17": 3637371904.0, + "18": 3637371904.0, + "19": 3637371904.0, "20": 3637371904.0, + "21": 3637371904.0, + "22": 3637371904.0, + "23": 3637371904.0, + "24": 3637371904.0, "25": 3637371904.0, + "26": 3637371904.0, + "27": 3637371904.0, + "28": 3637371904.0, + "29": 3637371904.0, "30": 3637371904.0, + "31": 3637371904.0, + "32": 3637371904.0, + "33": 3637371904.0, + "34": 3637371904.0, "35": 3637371904.0, + "36": 3637371904.0, + "37": 3637371904.0, + "38": 3637371904.0, + "39": 3637371904.0, "40": 3637371904.0, + "41": 3637371904.0, + "42": 3637371904.0, + "43": 3637371904.0, + "44": 3637371904.0, "45": 3637371904.0, + "46": 3637371904.0, + "47": 3637371904.0, + "48": 3637371904.0, + "49": 3637371904.0, "50": 3637371904.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 7.61967, - "5": 0.10355, - "10": 0.08878, - "15": 0.08692, - "20": 0.08664, - "25": 0.0863, - "30": 0.08732, - "35": 0.08763, - "40": 0.08674, - "45": 0.087, - "50": 0.08652 + "1": 8.69225, + "2": 0.11422, + "3": 0.10425, + "4": 0.10234, + "5": 0.10569, + "6": 0.10564, + "7": 0.1017, + "8": 0.10104, + "9": 0.10184, + "10": 0.10389, + "11": 0.10239, + "12": 0.10308, + "13": 0.10366, + "14": 0.10282, + "15": 0.10527, + "16": 0.10468, + "17": 0.10379, + "18": 0.10311, + "19": 0.10589, + "20": 0.1039, + "21": 0.10317, + "22": 0.10318, + "23": 0.10407, + "24": 0.1045, + "25": 0.10518, + "26": 0.10372, + "27": 0.10299, + "28": 0.1034, + "29": 0.1018, + "30": 0.10184, + "31": 0.10197, + "32": 0.10201, + "33": 0.10166, + "34": 0.1031, + "35": 0.1016, + "36": 0.10083, + "37": 0.09963, + "38": 0.10028, + "39": 0.10032, + "40": 0.10016, + "41": 0.09952, + "42": 0.09904, + "43": 0.09972, + "44": 0.10089, + "45": 0.10162, + "46": 0.10079, + "47": 0.09922, + "48": 0.10128, + "49": 0.09992, + "50": 0.0985 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..64d215b77ba --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86114, + "2": 10.86847, + "3": 10.86465, + "4": 10.86473, + "5": 10.87296, + "6": 10.88615, + "7": 10.8645, + "8": 10.87335, + "9": 10.87481, + "10": 10.83903, + "11": 10.86614, + "12": 10.86169, + "13": 10.87354, + "14": 10.87593, + "15": 10.8216, + "16": 10.83071, + "17": 10.79411, + "18": 10.81433, + "19": 10.80011, + "20": 10.71697, + "21": 10.70154, + "22": 10.57235, + "23": 10.70749, + "24": 10.6006, + "25": 10.5566, + "26": 10.60138, + "27": 10.60955, + "28": 10.55626, + "29": 10.57268, + "30": 10.36032, + "31": 10.11454, + "32": 10.45937, + "33": 10.45389, + "34": 10.21168, + "35": 10.26583, + "36": 10.21483, + "37": 10.34814, + "38": 10.19787, + "39": 10.39713, + "40": 10.08719, + "41": 10.13539, + "42": 10.20638, + "43": 9.82769, + "44": 9.95444, + "45": 9.82374, + "46": 9.79864, + "47": 10.12579, + "48": 9.83547, + "49": 9.51888, + "50": 9.90498 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1696.0, + "2": 1671.0, + "3": 1537.0, + "4": 1705.0, + "5": 1776.0, + "6": 1735.0, + "7": 1767.0, + "8": 1569.0, + "9": 1750.0, + "10": 1413.0, + "11": 1746.0, + "12": 1681.0, + "13": 1828.0, + "14": 1739.0, + "15": 1801.0, + "16": 1895.0, + "17": 1781.0, + "18": 1693.0, + "19": 1705.0, + "20": 1624.0, + "21": 1838.0, + "22": 1792.0, + "23": 2005.0, + "24": 1601.0, + "25": 1483.0, + "26": 1615.0, + "27": 1844.0, + "28": 1961.0, + "29": 2012.0, + "30": 1856.0, + "31": 1502.0, + "32": 1794.0, + "33": 2118.0, + "34": 1742.0, + "35": 1953.0, + "36": 1940.0, + "37": 2324.0, + "38": 2109.0, + "39": 2369.0, + "40": 2183.0, + "41": 2063.0, + "42": 2232.0, + "43": 1917.0, + "44": 2084.0, + "45": 2058.0, + "46": 2144.0, + "47": 2488.0, + "48": 2407.0, + "49": 2125.0, + "50": 2134.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 952847360.0, + "2": 952847360.0, + "3": 952847360.0, + "4": 952847360.0, + "5": 952847360.0, + "6": 952847360.0, + "7": 952847360.0, + "8": 952847360.0, + "9": 952847360.0, + "10": 952847360.0, + "11": 952847360.0, + "12": 952847360.0, + "13": 952847360.0, + "14": 952847360.0, + "15": 952847360.0, + "16": 952847360.0, + "17": 952847360.0, + "18": 952847360.0, + "19": 952847360.0, + "20": 952847360.0, + "21": 952847360.0, + "22": 952847360.0, + "23": 952847360.0, + "24": 952847360.0, + "25": 952847360.0, + "26": 952847360.0, + "27": 952847360.0, + "28": 952847360.0, + "29": 952847360.0, + "30": 952847360.0, + "31": 952847360.0, + "32": 952847360.0, + "33": 952847360.0, + "34": 952847360.0, + "35": 952847360.0, + "36": 952847360.0, + "37": 952847360.0, + "38": 952847360.0, + "39": 952847360.0, + "40": 952847360.0, + "41": 952847360.0, + "42": 952847360.0, + "43": 952847360.0, + "44": 952847360.0, + "45": 952847360.0, + "46": 952847360.0, + "47": 952847360.0, + "48": 952847360.0, + "49": 952847360.0, + "50": 952847360.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3275808768.0, + "2": 3637371904.0, + "3": 3637371904.0, + "4": 3637371904.0, + "5": 3637371904.0, + "6": 3637371904.0, + "7": 3637371904.0, + "8": 3637371904.0, + "9": 3637371904.0, + "10": 3637371904.0, + "11": 3637371904.0, + "12": 3637371904.0, + "13": 3637371904.0, + "14": 3637371904.0, + "15": 3637371904.0, + "16": 3637371904.0, + "17": 3637371904.0, + "18": 3637371904.0, + "19": 3637371904.0, + "20": 3637371904.0, + "21": 3637371904.0, + "22": 3637371904.0, + "23": 3637371904.0, + "24": 3637371904.0, + "25": 3637371904.0, + "26": 3637371904.0, + "27": 3637371904.0, + "28": 3637371904.0, + "29": 3637371904.0, + "30": 3637371904.0, + "31": 3637371904.0, + "32": 3637371904.0, + "33": 3637371904.0, + "34": 3637371904.0, + "35": 3637371904.0, + "36": 3637371904.0, + "37": 3637371904.0, + "38": 3637371904.0, + "39": 3637371904.0, + "40": 3637371904.0, + "41": 3637371904.0, + "42": 3637371904.0, + "43": 3637371904.0, + "44": 3637371904.0, + "45": 3637371904.0, + "46": 3637371904.0, + "47": 3637371904.0, + "48": 3637371904.0, + "49": 3637371904.0, + "50": 3637371904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7.73281, + "2": 0.12339, + "3": 0.09356, + "4": 0.09244, + "5": 0.0876, + "6": 0.08746, + "7": 0.08714, + "8": 0.08631, + "9": 0.08986, + "10": 0.09011, + "11": 0.09237, + "12": 0.09085, + "13": 0.09077, + "14": 0.09007, + "15": 0.0931, + "16": 0.09275, + "17": 0.08996, + "18": 0.0933, + "19": 0.09008, + "20": 0.0898, + "21": 0.08974, + "22": 0.09148, + "23": 0.09027, + "24": 0.09097, + "25": 0.08936, + "26": 0.08932, + "27": 0.09046, + "28": 0.09053, + "29": 0.08937, + "30": 0.08941, + "31": 0.09008, + "32": 0.08969, + "33": 0.08975, + "34": 0.09039, + "35": 0.08967, + "36": 0.08981, + "37": 0.09109, + "38": 0.08894, + "39": 0.09029, + "40": 0.09, + "41": 0.0901, + "42": 0.08944, + "43": 0.09026, + "44": 0.09008, + "45": 0.09096, + "46": 0.08999, + "47": 0.08974, + "48": 0.08959, + "49": 0.09001, + "50": 0.08972 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..6660a5e446e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86114, + "2": 10.86847, + "3": 10.86465, + "4": 10.86473, + "5": 10.87296, + "6": 10.88615, + "7": 10.8645, + "8": 10.87335, + "9": 10.87481, + "10": 10.83903, + "11": 10.86614, + "12": 10.86169, + "13": 10.87354, + "14": 10.87593, + "15": 10.8216, + "16": 10.83071, + "17": 10.79411, + "18": 10.81433, + "19": 10.80011, + "20": 10.71697, + "21": 10.70154, + "22": 10.57235, + "23": 10.70749, + "24": 10.6006, + "25": 10.5566, + "26": 10.60138, + "27": 10.60955, + "28": 10.55626, + "29": 10.57268, + "30": 10.36032, + "31": 10.11454, + "32": 10.45937, + "33": 10.45389, + "34": 10.21168, + "35": 10.26583, + "36": 10.21483, + "37": 10.34814, + "38": 10.19787, + "39": 10.39713, + "40": 10.08719, + "41": 10.13539, + "42": 10.20638, + "43": 9.82769, + "44": 9.95444, + "45": 9.82374, + "46": 9.79864, + "47": 10.12579, + "48": 9.83547, + "49": 9.51888, + "50": 9.90498 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1696.0, + "2": 1671.0, + "3": 1537.0, + "4": 1705.0, + "5": 1776.0, + "6": 1735.0, + "7": 1767.0, + "8": 1569.0, + "9": 1750.0, + "10": 1413.0, + "11": 1746.0, + "12": 1681.0, + "13": 1828.0, + "14": 1739.0, + "15": 1801.0, + "16": 1895.0, + "17": 1781.0, + "18": 1693.0, + "19": 1705.0, + "20": 1624.0, + "21": 1838.0, + "22": 1792.0, + "23": 2005.0, + "24": 1601.0, + "25": 1483.0, + "26": 1615.0, + "27": 1844.0, + "28": 1961.0, + "29": 2012.0, + "30": 1856.0, + "31": 1502.0, + "32": 1794.0, + "33": 2118.0, + "34": 1742.0, + "35": 1953.0, + "36": 1940.0, + "37": 2324.0, + "38": 2109.0, + "39": 2369.0, + "40": 2183.0, + "41": 2063.0, + "42": 2232.0, + "43": 1917.0, + "44": 2084.0, + "45": 2058.0, + "46": 2144.0, + "47": 2488.0, + "48": 2407.0, + "49": 2125.0, + "50": 2134.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 952847360.0, + "2": 952847360.0, + "3": 952847360.0, + "4": 952847360.0, + "5": 952847360.0, + "6": 952847360.0, + "7": 952847360.0, + "8": 952847360.0, + "9": 952847360.0, + "10": 952847360.0, + "11": 952847360.0, + "12": 952847360.0, + "13": 952847360.0, + "14": 952847360.0, + "15": 952847360.0, + "16": 952847360.0, + "17": 952847360.0, + "18": 952847360.0, + "19": 952847360.0, + "20": 952847360.0, + "21": 952847360.0, + "22": 952847360.0, + "23": 952847360.0, + "24": 952847360.0, + "25": 952847360.0, + "26": 952847360.0, + "27": 952847360.0, + "28": 952847360.0, + "29": 952847360.0, + "30": 952847360.0, + "31": 952847360.0, + "32": 952847360.0, + "33": 952847360.0, + "34": 952847360.0, + "35": 952847360.0, + "36": 952847360.0, + "37": 952847360.0, + "38": 952847360.0, + "39": 952847360.0, + "40": 952847360.0, + "41": 952847360.0, + "42": 952847360.0, + "43": 952847360.0, + "44": 952847360.0, + "45": 952847360.0, + "46": 952847360.0, + "47": 952847360.0, + "48": 952847360.0, + "49": 952847360.0, + "50": 952847360.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3275808768.0, + "2": 3637371904.0, + "3": 3637371904.0, + "4": 3637371904.0, + "5": 3637371904.0, + "6": 3637371904.0, + "7": 3637371904.0, + "8": 3637371904.0, + "9": 3637371904.0, + "10": 3637371904.0, + "11": 3637371904.0, + "12": 3637371904.0, + "13": 3637371904.0, + "14": 3637371904.0, + "15": 3637371904.0, + "16": 3637371904.0, + "17": 3637371904.0, + "18": 3637371904.0, + "19": 3637371904.0, + "20": 3637371904.0, + "21": 3637371904.0, + "22": 3637371904.0, + "23": 3637371904.0, + "24": 3637371904.0, + "25": 3637371904.0, + "26": 3637371904.0, + "27": 3637371904.0, + "28": 3637371904.0, + "29": 3637371904.0, + "30": 3637371904.0, + "31": 3637371904.0, + "32": 3637371904.0, + "33": 3637371904.0, + "34": 3637371904.0, + "35": 3637371904.0, + "36": 3637371904.0, + "37": 3637371904.0, + "38": 3637371904.0, + "39": 3637371904.0, + "40": 3637371904.0, + "41": 3637371904.0, + "42": 3637371904.0, + "43": 3637371904.0, + "44": 3637371904.0, + "45": 3637371904.0, + "46": 3637371904.0, + "47": 3637371904.0, + "48": 3637371904.0, + "49": 3637371904.0, + "50": 3637371904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 8.92875, + "2": 0.12034, + "3": 0.10184, + "4": 0.10215, + "5": 0.10291, + "6": 0.10167, + "7": 0.09936, + "8": 0.10097, + "9": 0.10127, + "10": 0.10171, + "11": 0.10013, + "12": 0.09898, + "13": 0.10085, + "14": 0.10081, + "15": 0.10088, + "16": 0.10002, + "17": 0.0999, + "18": 0.10168, + "19": 0.10032, + "20": 0.09815, + "21": 0.10018, + "22": 0.09914, + "23": 0.1005, + "24": 0.10106, + "25": 0.10086, + "26": 0.10152, + "27": 0.1, + "28": 0.10161, + "29": 0.10038, + "30": 0.10045, + "31": 0.10187, + "32": 0.10055, + "33": 0.11357, + "34": 0.10266, + "35": 0.10298, + "36": 0.10061, + "37": 0.10166, + "38": 0.10185, + "39": 0.09925, + "40": 0.10087, + "41": 0.10001, + "42": 0.1, + "43": 0.10286, + "44": 0.10227, + "45": 0.10327, + "46": 0.10041, + "47": 0.10091, + "48": 0.10215, + "49": 0.10017, + "50": 0.10055 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..1306e400ed7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8337, + "2": 10.83217, + "3": 10.83141, + "4": 10.80035, + "5": 10.85677, + "6": 10.86685, + "7": 10.84597, + "8": 10.84289, + "9": 10.8558, + "10": 10.80851, + "11": 10.89022, + "12": 10.87084, + "13": 10.87527, + "14": 10.8902, + "15": 10.79856, + "16": 10.81047, + "17": 10.78972, + "18": 10.824, + "19": 10.80709, + "20": 10.71089, + "21": 10.68461, + "22": 10.54244, + "23": 10.71826, + "24": 10.58552, + "25": 10.5436, + "26": 10.60978, + "27": 10.61027, + "28": 10.57094, + "29": 10.5905, + "30": 10.35069, + "31": 10.08989, + "32": 10.47124, + "33": 10.45479, + "34": 10.19985, + "35": 10.26074, + "36": 10.21478, + "37": 10.33663, + "38": 10.17509, + "39": 10.39333, + "40": 10.07155, + "41": 10.14016, + "42": 10.19706, + "43": 9.81234, + "44": 9.93566, + "45": 9.81507, + "46": 9.80601, + "47": 10.12818, + "48": 9.82423, + "49": 9.50741, + "50": 9.88952 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563.0, + "2": 1726.0, + "3": 1587.0, + "4": 1729.0, + "5": 1808.0, + "6": 1766.0, + "7": 1701.0, + "8": 1761.0, + "9": 1852.0, + "10": 1377.0, + "11": 1784.0, + "12": 1773.0, + "13": 1887.0, + "14": 1869.0, + "15": 1872.0, + "16": 1819.0, + "17": 1779.0, + "18": 1669.0, + "19": 1838.0, + "20": 1675.0, + "21": 1847.0, + "22": 1671.0, + "23": 1931.0, + "24": 1672.0, + "25": 1549.0, + "26": 1756.0, + "27": 1756.0, + "28": 1977.0, + "29": 1963.0, + "30": 2043.0, + "31": 1615.0, + "32": 1875.0, + "33": 2095.0, + "34": 1910.0, + "35": 2002.0, + "36": 1897.0, + "37": 2269.0, + "38": 2215.0, + "39": 2342.0, + "40": 2311.0, + "41": 2338.0, + "42": 2189.0, + "43": 1957.0, + "44": 2119.0, + "45": 2149.0, + "46": 2258.0, + "47": 2617.0, + "48": 2367.0, + "49": 2311.0, + "50": 2368.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 952847360.0, + "2": 952847360.0, + "3": 952847360.0, + "4": 952847360.0, + "5": 952847360.0, + "6": 952847360.0, + "7": 952847360.0, + "8": 952847360.0, + "9": 952847360.0, + "10": 952847360.0, + "11": 952847360.0, + "12": 952847360.0, + "13": 952847360.0, + "14": 952847360.0, + "15": 952847360.0, + "16": 952847360.0, + "17": 952847360.0, + "18": 952847360.0, + "19": 952847360.0, + "20": 952847360.0, + "21": 952847360.0, + "22": 952847360.0, + "23": 952847360.0, + "24": 952847360.0, + "25": 952847360.0, + "26": 952847360.0, + "27": 952847360.0, + "28": 952847360.0, + "29": 952847360.0, + "30": 952847360.0, + "31": 952847360.0, + "32": 952847360.0, + "33": 952847360.0, + "34": 952847360.0, + "35": 952847360.0, + "36": 952847360.0, + "37": 952847360.0, + "38": 952847360.0, + "39": 952847360.0, + "40": 952847360.0, + "41": 952847360.0, + "42": 952847360.0, + "43": 952847360.0, + "44": 952847360.0, + "45": 952847360.0, + "46": 952847360.0, + "47": 952847360.0, + "48": 952847360.0, + "49": 952847360.0, + "50": 952847360.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3275808768.0, + "2": 3637371904.0, + "3": 3637371904.0, + "4": 3637371904.0, + "5": 3637371904.0, + "6": 3637371904.0, + "7": 3637371904.0, + "8": 3637371904.0, + "9": 3637371904.0, + "10": 3637371904.0, + "11": 3637371904.0, + "12": 3637371904.0, + "13": 3637371904.0, + "14": 3637371904.0, + "15": 3637371904.0, + "16": 3637371904.0, + "17": 3637371904.0, + "18": 3637371904.0, + "19": 3637371904.0, + "20": 3637371904.0, + "21": 3637371904.0, + "22": 3637371904.0, + "23": 3637371904.0, + "24": 3637371904.0, + "25": 3637371904.0, + "26": 3637371904.0, + "27": 3637371904.0, + "28": 3637371904.0, + "29": 3637371904.0, + "30": 3637371904.0, + "31": 3637371904.0, + "32": 3637371904.0, + "33": 3637371904.0, + "34": 3637371904.0, + "35": 3637371904.0, + "36": 3637371904.0, + "37": 3637371904.0, + "38": 3637371904.0, + "39": 3637371904.0, + "40": 3637371904.0, + "41": 3637371904.0, + "42": 3637371904.0, + "43": 3637371904.0, + "44": 3637371904.0, + "45": 3637371904.0, + "46": 3637371904.0, + "47": 3637371904.0, + "48": 3637371904.0, + "49": 3637371904.0, + "50": 3637371904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.97292, + "2": 0.1992, + "3": 0.16312, + "4": 0.15734, + "5": 0.40689, + "6": 0.36557, + "7": 0.15246, + "8": 0.14808, + "9": 0.14741, + "10": 0.14777, + "11": 0.14712, + "12": 0.1483, + "13": 0.14786, + "14": 0.14918, + "15": 0.1483, + "16": 0.14751, + "17": 0.14865, + "18": 0.14757, + "19": 0.14736, + "20": 0.14811, + "21": 0.14912, + "22": 0.14808, + "23": 0.14726, + "24": 0.14827, + "25": 0.14733, + "26": 0.14693, + "27": 0.14758, + "28": 0.14719, + "29": 0.14607, + "30": 0.14763, + "31": 0.14698, + "32": 0.14682, + "33": 0.14766, + "34": 0.14759, + "35": 0.14762, + "36": 0.14523, + "37": 0.14552, + "38": 0.14636, + "39": 0.14736, + "40": 0.14684, + "41": 0.14843, + "42": 0.14643, + "43": 0.1472, + "44": 0.34866, + "45": 0.14782, + "46": 0.14753, + "47": 0.14656, + "48": 0.14734, + "49": 0.14632, + "50": 0.14628 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..d92033a2e8a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8337, + "2": 10.83217, + "3": 10.83141, + "4": 10.80035, + "5": 10.85677, + "6": 10.86685, + "7": 10.84597, + "8": 10.84289, + "9": 10.8558, + "10": 10.80851, + "11": 10.89022, + "12": 10.87084, + "13": 10.87527, + "14": 10.8902, + "15": 10.79856, + "16": 10.81047, + "17": 10.78972, + "18": 10.824, + "19": 10.80709, + "20": 10.71089, + "21": 10.68461, + "22": 10.54244, + "23": 10.71826, + "24": 10.58552, + "25": 10.5436, + "26": 10.60978, + "27": 10.61027, + "28": 10.57094, + "29": 10.5905, + "30": 10.35069, + "31": 10.08989, + "32": 10.47124, + "33": 10.45479, + "34": 10.19985, + "35": 10.26074, + "36": 10.21478, + "37": 10.33663, + "38": 10.17509, + "39": 10.39333, + "40": 10.07155, + "41": 10.14016, + "42": 10.19706, + "43": 9.81234, + "44": 9.93566, + "45": 9.81507, + "46": 9.80601, + "47": 10.12818, + "48": 9.82423, + "49": 9.50741, + "50": 9.88952 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563.0, + "2": 1726.0, + "3": 1587.0, + "4": 1729.0, + "5": 1808.0, + "6": 1766.0, + "7": 1701.0, + "8": 1761.0, + "9": 1852.0, + "10": 1377.0, + "11": 1784.0, + "12": 1773.0, + "13": 1887.0, + "14": 1869.0, + "15": 1872.0, + "16": 1819.0, + "17": 1779.0, + "18": 1669.0, + "19": 1838.0, + "20": 1675.0, + "21": 1847.0, + "22": 1671.0, + "23": 1931.0, + "24": 1672.0, + "25": 1549.0, + "26": 1756.0, + "27": 1756.0, + "28": 1977.0, + "29": 1963.0, + "30": 2043.0, + "31": 1615.0, + "32": 1875.0, + "33": 2095.0, + "34": 1910.0, + "35": 2002.0, + "36": 1897.0, + "37": 2269.0, + "38": 2215.0, + "39": 2342.0, + "40": 2311.0, + "41": 2338.0, + "42": 2189.0, + "43": 1957.0, + "44": 2119.0, + "45": 2149.0, + "46": 2258.0, + "47": 2617.0, + "48": 2367.0, + "49": 2311.0, + "50": 2368.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 952847360.0, + "2": 952847360.0, + "3": 952847360.0, + "4": 952847360.0, + "5": 952847360.0, + "6": 952847360.0, + "7": 952847360.0, + "8": 952847360.0, + "9": 952847360.0, + "10": 952847360.0, + "11": 952847360.0, + "12": 952847360.0, + "13": 952847360.0, + "14": 952847360.0, + "15": 952847360.0, + "16": 952847360.0, + "17": 952847360.0, + "18": 952847360.0, + "19": 952847360.0, + "20": 952847360.0, + "21": 952847360.0, + "22": 952847360.0, + "23": 952847360.0, + "24": 952847360.0, + "25": 952847360.0, + "26": 952847360.0, + "27": 952847360.0, + "28": 952847360.0, + "29": 952847360.0, + "30": 952847360.0, + "31": 952847360.0, + "32": 952847360.0, + "33": 952847360.0, + "34": 952847360.0, + "35": 952847360.0, + "36": 952847360.0, + "37": 952847360.0, + "38": 952847360.0, + "39": 952847360.0, + "40": 952847360.0, + "41": 952847360.0, + "42": 952847360.0, + "43": 952847360.0, + "44": 952847360.0, + "45": 952847360.0, + "46": 952847360.0, + "47": 952847360.0, + "48": 952847360.0, + "49": 952847360.0, + "50": 952847360.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3275808768.0, + "2": 3637371904.0, + "3": 3637371904.0, + "4": 3637371904.0, + "5": 3637371904.0, + "6": 3637371904.0, + "7": 3637371904.0, + "8": 3637371904.0, + "9": 3637371904.0, + "10": 3637371904.0, + "11": 3637371904.0, + "12": 3637371904.0, + "13": 3637371904.0, + "14": 3637371904.0, + "15": 3637371904.0, + "16": 3637371904.0, + "17": 3637371904.0, + "18": 3637371904.0, + "19": 3637371904.0, + "20": 3637371904.0, + "21": 3637371904.0, + "22": 3637371904.0, + "23": 3637371904.0, + "24": 3637371904.0, + "25": 3637371904.0, + "26": 3637371904.0, + "27": 3637371904.0, + "28": 3637371904.0, + "29": 3637371904.0, + "30": 3637371904.0, + "31": 3637371904.0, + "32": 3637371904.0, + "33": 3637371904.0, + "34": 3637371904.0, + "35": 3637371904.0, + "36": 3637371904.0, + "37": 3637371904.0, + "38": 3637371904.0, + "39": 3637371904.0, + "40": 3637371904.0, + "41": 3637371904.0, + "42": 3637371904.0, + "43": 3637371904.0, + "44": 3637371904.0, + "45": 3637371904.0, + "46": 3637371904.0, + "47": 3637371904.0, + "48": 3637371904.0, + "49": 3637371904.0, + "50": 3637371904.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.79244, + "2": 0.18866, + "3": 0.15434, + "4": 0.15761, + "5": 0.15724, + "6": 0.15378, + "7": 0.15381, + "8": 0.15636, + "9": 0.15341, + "10": 0.15408, + "11": 0.15704, + "12": 0.15148, + "13": 0.14733, + "14": 0.14655, + "15": 0.15415, + "16": 0.15103, + "17": 0.1512, + "18": 0.15478, + "19": 0.15325, + "20": 0.14874, + "21": 0.14873, + "22": 0.15363, + "23": 0.14741, + "24": 0.14761, + "25": 0.14905, + "26": 0.14826, + "27": 0.14811, + "28": 0.14877, + "29": 0.15462, + "30": 0.15391, + "31": 0.15501, + "32": 0.15366, + "33": 0.15348, + "34": 0.15427, + "35": 0.15377, + "36": 0.15502, + "37": 0.15312, + "38": 0.15305, + "39": 0.15313, + "40": 0.15265, + "41": 0.15294, + "42": 0.15318, + "43": 0.15372, + "44": 0.1524, + "45": 0.15283, + "46": 0.15215, + "47": 0.15253, + "48": 0.15208, + "49": 0.15253, + "50": 0.15255 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..9669534a70b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.83568, + "2": 10.83266, + "3": 10.83151, + "4": 10.80343, + "5": 10.8567, + "6": 10.86778, + "7": 10.84836, + "8": 10.84624, + "9": 10.85924, + "10": 10.81478, + "11": 10.89821, + "12": 10.88433, + "13": 10.88963, + "14": 10.90075, + "15": 10.85098, + "16": 10.86603, + "17": 10.85455, + "18": 10.88507, + "19": 10.8773, + "20": 10.85865, + "21": 10.85654, + "22": 10.79685, + "23": 10.88724, + "24": 10.82649, + "25": 10.81343, + "26": 10.82705, + "27": 10.84612, + "28": 10.84227, + "29": 10.85329, + "30": 10.74969, + "31": 10.63041, + "32": 10.79004, + "33": 10.77234, + "34": 10.65722, + "35": 10.65857, + "36": 10.61583, + "37": 10.67536, + "38": 10.58101, + "39": 10.69083, + "40": 10.50359, + "41": 10.52777, + "42": 10.55371, + "43": 10.28636, + "44": 10.36369, + "45": 10.2738, + "46": 10.24567, + "47": 10.45103, + "48": 10.23707, + "49": 9.99555, + "50": 10.25588, + "51": 10.20129, + "52": 10.10855, + "53": 10.34609, + "54": 10.24857, + "55": 10.18782, + "56": 9.95521, + "57": 9.81221, + "58": 10.10875, + "59": 9.8863, + "60": 9.80901, + "61": 9.94824, + "62": 10.1999, + "63": 9.64431, + "64": 9.9951, + "65": 9.24475, + "66": 9.90917, + "67": 9.59735, + "68": 9.97285, + "69": 9.96332, + "70": 9.91039, + "71": 9.78596, + "72": 9.77263, + "73": 9.6618, + "74": 9.16289, + "75": 9.5812, + "76": 9.26137, + "77": 10.17615, + "78": 9.85644, + "79": 9.50644, + "80": 9.54102, + "81": 9.61313, + "82": 9.80669, + "83": 9.44696, + "84": 9.52782, + "85": 9.72633, + "86": 9.19099, + "87": 9.68736, + "88": 9.85216, + "89": 9.71335, + "90": 9.90316, + "91": 9.46064, + "92": 9.46059, + "93": 9.19418, + "94": 8.93434, + "95": 9.60258, + "96": 9.61852, + "97": 9.39594, + "98": 9.76012, + "99": 8.98668, + "100": 9.49405 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 684610560.0, + "2": 684610560.0, + "3": 684610560.0, + "4": 684610560.0, + "5": 684610560.0, + "6": 684610560.0, + "7": 684610560.0, + "8": 684610560.0, + "9": 684610560.0, + "10": 684610560.0, + "11": 684610560.0, + "12": 684610560.0, + "13": 684610560.0, + "14": 684610560.0, + "15": 684610560.0, + "16": 684610560.0, + "17": 1043027456.0, + "18": 1043027456.0, + "19": 1043027456.0, + "20": 1043027456.0, + "21": 1043027456.0, + "22": 1043027456.0, + "23": 1043027456.0, + "24": 1043027456.0, + "25": 1043027456.0, + "26": 1043027456.0, + "27": 1043027456.0, + "28": 1043027456.0, + "29": 1043027456.0, + "30": 1043027456.0, + "31": 1043027456.0, + "32": 1043027456.0, + "33": 1043027456.0, + "34": 1043027456.0, + "35": 1043027456.0, + "36": 1043027456.0, + "37": 1043027456.0, + "38": 1043027456.0, + "39": 1043027456.0, + "40": 1043027456.0, + "41": 1043027456.0, + "42": 1043027456.0, + "43": 1043027456.0, + "44": 1043027456.0, + "45": 1043027456.0, + "46": 1043027456.0, + "47": 1043027456.0, + "48": 1043027456.0, + "49": 1043027456.0, + "50": 1043027456.0, + "51": 1043027456.0, + "52": 1043027456.0, + "53": 1043027456.0, + "54": 1043027456.0, + "55": 1043027456.0, + "56": 1043027456.0, + "57": 1043027456.0, + "58": 1043027456.0, + "59": 1043027456.0, + "60": 1043027456.0, + "61": 1043027456.0, + "62": 1043027456.0, + "63": 1043027456.0, + "64": 1043027456.0, + "65": 1043027456.0, + "66": 1043027456.0, + "67": 1043027456.0, + "68": 1043027456.0, + "69": 1043027456.0, + "70": 1043027456.0, + "71": 1043027456.0, + "72": 1043027456.0, + "73": 1043027456.0, + "74": 1043027456.0, + "75": 1043027456.0, + "76": 1043027456.0, + "77": 1043027456.0, + "78": 1043027456.0, + "79": 1043027456.0, + "80": 1043027456.0, + "81": 1043027456.0, + "82": 1043027456.0, + "83": 1043027456.0, + "84": 1043027456.0, + "85": 1043027456.0, + "86": 1043027456.0, + "87": 1043027456.0, + "88": 1043027456.0, + "89": 1043027456.0, + "90": 1043027456.0, + "91": 1043027456.0, + "92": 1043027456.0, + "93": 1043027456.0, + "94": 1043027456.0, + "95": 1043027456.0, + "96": 1043027456.0, + "97": 1043027456.0, + "98": 1043027456.0, + "99": 1043027456.0, + "100": 1043027456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3187304960.0, + "2": 3187305472.0, + "3": 3187305472.0, + "4": 3187305472.0, + "5": 3187305472.0, + "6": 3187305472.0, + "7": 3187305472.0, + "8": 3187305472.0, + "9": 3187305472.0, + "10": 3187305472.0, + "11": 3187305472.0, + "12": 3187305472.0, + "13": 3187305472.0, + "14": 3187305472.0, + "15": 3187305472.0, + "16": 3187305472.0, + "17": 3187305472.0, + "18": 3547033088.0, + "19": 3547033088.0, + "20": 3547033088.0, + "21": 3547033088.0, + "22": 3547033088.0, + "23": 3547033088.0, + "24": 3547033088.0, + "25": 3547033088.0, + "26": 3547033088.0, + "27": 3547033088.0, + "28": 3547033088.0, + "29": 3547033088.0, + "30": 3547033088.0, + "31": 3547033088.0, + "32": 3547033088.0, + "33": 3547033088.0, + "34": 3547033088.0, + "35": 3547033088.0, + "36": 3547033088.0, + "37": 3547033088.0, + "38": 3547033088.0, + "39": 3547033088.0, + "40": 3547033088.0, + "41": 3547033088.0, + "42": 3547033088.0, + "43": 3547033088.0, + "44": 3547033088.0, + "45": 3547033088.0, + "46": 3547033088.0, + "47": 3547033088.0, + "48": 3547033088.0, + "49": 3547033088.0, + "50": 3547033088.0, + "51": 3547033088.0, + "52": 3547033088.0, + "53": 3547033088.0, + "54": 3547033088.0, + "55": 3547033088.0, + "56": 3547033088.0, + "57": 3547033088.0, + "58": 3547033088.0, + "59": 3547033088.0, + "60": 3547033088.0, + "61": 3547033088.0, + "62": 3547033088.0, + "63": 3547033088.0, + "64": 3547033088.0, + "65": 3547033088.0, + "66": 3547033088.0, + "67": 3547033088.0, + "68": 3547033088.0, + "69": 3547033088.0, + "70": 3547033088.0, + "71": 3547033088.0, + "72": 3547033088.0, + "73": 3547033088.0, + "74": 3547033088.0, + "75": 3547033088.0, + "76": 3547033088.0, + "77": 3547033088.0, + "78": 3547033088.0, + "79": 3547033088.0, + "80": 3547033088.0, + "81": 3547033088.0, + "82": 3547033088.0, + "83": 3547033088.0, + "84": 3547033088.0, + "85": 3547033088.0, + "86": 3547033088.0, + "87": 3547033088.0, + "88": 3547033088.0, + "89": 3547033088.0, + "90": 3547033088.0, + "91": 3547033088.0, + "92": 3547033088.0, + "93": 3547033088.0, + "94": 3547033088.0, + "95": 3547033088.0, + "96": 3547033088.0, + "97": 3547033088.0, + "98": 3547033088.0, + "99": 3547033088.0, + "100": 3547033088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.2136, + "2": 0.17385, + "3": 0.1375, + "4": 0.14124, + "5": 0.13525, + "6": 0.13546, + "7": 0.13534, + "8": 0.13459, + "9": 0.13505, + "10": 0.1463, + "11": 0.13547, + "12": 0.14518, + "13": 0.13738, + "14": 0.13687, + "15": 0.14389, + "16": 0.13574, + "17": 0.2165, + "18": 0.15319, + "19": 0.14548, + "20": 0.15335, + "21": 0.14926, + "22": 0.13834, + "23": 0.14513, + "24": 0.14572, + "25": 0.14607, + "26": 0.14645, + "27": 0.14591, + "28": 0.14675, + "29": 0.14668, + "30": 0.1468, + "31": 0.14701, + "32": 0.14635, + "33": 0.14655, + "34": 0.14999, + "35": 0.14702, + "36": 0.14559, + "37": 0.14632, + "38": 0.15055, + "39": 0.1456, + "40": 0.15293, + "41": 0.14613, + "42": 0.14562, + "43": 0.15546, + "44": 0.14537, + "45": 0.14571, + "46": 0.14754, + "47": 0.14944, + "48": 0.14875, + "49": 0.14515, + "50": 0.14462, + "51": 0.15106, + "52": 0.1468, + "53": 0.14697, + "54": 0.14607, + "55": 0.14673, + "56": 0.1478, + "57": 0.14729, + "58": 0.14787, + "59": 0.14686, + "60": 0.14664, + "61": 0.14613, + "62": 0.14473, + "63": 0.14534, + "64": 0.14576, + "65": 0.14698, + "66": 0.14626, + "67": 0.14642, + "68": 0.14692, + "69": 0.14497, + "70": 0.14585, + "71": 0.14658, + "72": 0.14646, + "73": 0.14784, + "74": 0.14641, + "75": 0.14604, + "76": 0.14649, + "77": 0.14675, + "78": 0.14677, + "79": 0.14639, + "80": 0.14873, + "81": 0.14632, + "82": 0.14642, + "83": 0.14666, + "84": 0.14579, + "85": 0.14675, + "86": 0.14449, + "87": 0.14611, + "88": 0.1466, + "89": 0.14651, + "90": 0.14511, + "91": 0.14613, + "92": 0.14552, + "93": 0.14658, + "94": 0.14599, + "95": 0.14588, + "96": 0.14535, + "97": 0.14603, + "98": 0.14551, + "99": 0.14681, + "100": 0.14606 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2249.0, + "18": 2165.0, + "19": 2362.0, + "20": 1953.0, + "21": 1898.0, + "22": "nan", + "23": 2371.0, + "24": 1984.0, + "25": 1818.0, + "26": 1980.0, + "27": 2078.0, + "28": 2467.0, + "29": 2395.0, + "30": 2298.0, + "31": 1682.0, + "32": 2236.0, + "33": 2192.0, + "34": 1800.0, + "35": 2083.0, + "36": 2139.0, + "37": 2498.0, + "38": 2218.0, + "39": 2642.0, + "40": 2287.0, + "41": 2344.0, + "42": 2304.0, + "43": 2098.0, + "44": 2107.0, + "45": 2243.0, + "46": 1960.0, + "47": 2729.0, + "48": 2418.0, + "49": 1910.0, + "50": 2426.0, + "51": 2335.0, + "52": 2407.0, + "53": 2888.0, + "54": 2477.0, + "55": 2440.0, + "56": 2286.0, + "57": 2340.0, + "58": 2652.0, + "59": 2321.0, + "60": 2493.0, + "61": 2812.0, + "62": 2711.0, + "63": 2367.0, + "64": 2802.0, + "65": 2411.0, + "66": 2869.0, + "67": 2577.0, + "68": 2859.0, + "69": 2524.0, + "70": 3119.0, + "71": 2926.0, + "72": 2251.0, + "73": 2929.0, + "74": 2110.0, + "75": 2884.0, + "76": 2992.0, + "77": 3380.0, + "78": 3484.0, + "79": 3533.0, + "80": 3549.0, + "81": 3616.0, + "82": 3347.0, + "83": 3124.0, + "84": 3276.0, + "85": 3721.0, + "86": 3207.0, + "87": 3941.0, + "88": 3250.0, + "89": 3863.0, + "90": 3452.0, + "91": 2630.0, + "92": 3431.0, + "93": 3123.0, + "94": 3671.0, + "95": 3340.0, + "96": 3874.0, + "97": 3519.0, + "98": 3727.0, + "99": 3447.0, + "100": 3338.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..fbf4935d854 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.83568, + "2": 10.83266, + "3": 10.83151, + "4": 10.80343, + "5": 10.8567, + "6": 10.86778, + "7": 10.84836, + "8": 10.84624, + "9": 10.85924, + "10": 10.81478, + "11": 10.89821, + "12": 10.88433, + "13": 10.88963, + "14": 10.90075, + "15": 10.85098, + "16": 10.86603, + "17": 10.85455, + "18": 10.88507, + "19": 10.8773, + "20": 10.85865, + "21": 10.85654, + "22": 10.79685, + "23": 10.88724, + "24": 10.82649, + "25": 10.81343, + "26": 10.82705, + "27": 10.84612, + "28": 10.84227, + "29": 10.85329, + "30": 10.74969, + "31": 10.63041, + "32": 10.79004, + "33": 10.77234, + "34": 10.65722, + "35": 10.65857, + "36": 10.61583, + "37": 10.67536, + "38": 10.58101, + "39": 10.69083, + "40": 10.50359, + "41": 10.52777, + "42": 10.55371, + "43": 10.28636, + "44": 10.36369, + "45": 10.2738, + "46": 10.24567, + "47": 10.45103, + "48": 10.23707, + "49": 9.99555, + "50": 10.25588, + "51": 10.20129, + "52": 10.10855, + "53": 10.34609, + "54": 10.24857, + "55": 10.18782, + "56": 9.95521, + "57": 9.81221, + "58": 10.10875, + "59": 9.8863, + "60": 9.80901, + "61": 9.94824, + "62": 10.1999, + "63": 9.64431, + "64": 9.9951, + "65": 9.24475, + "66": 9.90917, + "67": 9.59735, + "68": 9.97285, + "69": 9.96332, + "70": 9.91039, + "71": 9.78596, + "72": 9.77263, + "73": 9.6618, + "74": 9.16289, + "75": 9.5812, + "76": 9.26137, + "77": 10.17615, + "78": 9.85644, + "79": 9.50644, + "80": 9.54102, + "81": 9.61313, + "82": 9.80669, + "83": 9.44696, + "84": 9.52782, + "85": 9.72633, + "86": 9.19099, + "87": 9.68736, + "88": 9.85216, + "89": 9.71335, + "90": 9.90316, + "91": 9.46064, + "92": 9.46059, + "93": 9.19418, + "94": 8.93434, + "95": 9.60258, + "96": 9.61852, + "97": 9.39594, + "98": 9.76012, + "99": 8.98668, + "100": 9.49405 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 684610560.0, + "2": 684610560.0, + "3": 684610560.0, + "4": 684610560.0, + "5": 684610560.0, + "6": 684610560.0, + "7": 684610560.0, + "8": 684610560.0, + "9": 684610560.0, + "10": 684610560.0, + "11": 684610560.0, + "12": 684610560.0, + "13": 684610560.0, + "14": 684610560.0, + "15": 684610560.0, + "16": 684610560.0, + "17": 1043027456.0, + "18": 1043027456.0, + "19": 1043027456.0, + "20": 1043027456.0, + "21": 1043027456.0, + "22": 1043027456.0, + "23": 1043027456.0, + "24": 1043027456.0, + "25": 1043027456.0, + "26": 1043027456.0, + "27": 1043027456.0, + "28": 1043027456.0, + "29": 1043027456.0, + "30": 1043027456.0, + "31": 1043027456.0, + "32": 1043027456.0, + "33": 1043027456.0, + "34": 1043027456.0, + "35": 1043027456.0, + "36": 1043027456.0, + "37": 1043027456.0, + "38": 1043027456.0, + "39": 1043027456.0, + "40": 1043027456.0, + "41": 1043027456.0, + "42": 1043027456.0, + "43": 1043027456.0, + "44": 1043027456.0, + "45": 1043027456.0, + "46": 1043027456.0, + "47": 1043027456.0, + "48": 1043027456.0, + "49": 1043027456.0, + "50": 1043027456.0, + "51": 1043027456.0, + "52": 1043027456.0, + "53": 1043027456.0, + "54": 1043027456.0, + "55": 1043027456.0, + "56": 1043027456.0, + "57": 1043027456.0, + "58": 1043027456.0, + "59": 1043027456.0, + "60": 1043027456.0, + "61": 1043027456.0, + "62": 1043027456.0, + "63": 1043027456.0, + "64": 1043027456.0, + "65": 1043027456.0, + "66": 1043027456.0, + "67": 1043027456.0, + "68": 1043027456.0, + "69": 1043027456.0, + "70": 1043027456.0, + "71": 1043027456.0, + "72": 1043027456.0, + "73": 1043027456.0, + "74": 1043027456.0, + "75": 1043027456.0, + "76": 1043027456.0, + "77": 1043027456.0, + "78": 1043027456.0, + "79": 1043027456.0, + "80": 1043027456.0, + "81": 1043027456.0, + "82": 1043027456.0, + "83": 1043027456.0, + "84": 1043027456.0, + "85": 1043027456.0, + "86": 1043027456.0, + "87": 1043027456.0, + "88": 1043027456.0, + "89": 1043027456.0, + "90": 1043027456.0, + "91": 1043027456.0, + "92": 1043027456.0, + "93": 1043027456.0, + "94": 1043027456.0, + "95": 1043027456.0, + "96": 1043027456.0, + "97": 1043027456.0, + "98": 1043027456.0, + "99": 1043027456.0, + "100": 1043027456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3187304960.0, + "2": 3187305472.0, + "3": 3187305472.0, + "4": 3187305472.0, + "5": 3187305472.0, + "6": 3187305472.0, + "7": 3187305472.0, + "8": 3187305472.0, + "9": 3187305472.0, + "10": 3187305472.0, + "11": 3187305472.0, + "12": 3187305472.0, + "13": 3187305472.0, + "14": 3187305472.0, + "15": 3187305472.0, + "16": 3187305472.0, + "17": 3187305472.0, + "18": 3547033088.0, + "19": 3547033088.0, + "20": 3547033088.0, + "21": 3547033088.0, + "22": 3547033088.0, + "23": 3547033088.0, + "24": 3547033088.0, + "25": 3547033088.0, + "26": 3547033088.0, + "27": 3547033088.0, + "28": 3547033088.0, + "29": 3547033088.0, + "30": 3547033088.0, + "31": 3547033088.0, + "32": 3547033088.0, + "33": 3547033088.0, + "34": 3547033088.0, + "35": 3547033088.0, + "36": 3547033088.0, + "37": 3547033088.0, + "38": 3547033088.0, + "39": 3547033088.0, + "40": 3547033088.0, + "41": 3547033088.0, + "42": 3547033088.0, + "43": 3547033088.0, + "44": 3547033088.0, + "45": 3547033088.0, + "46": 3547033088.0, + "47": 3547033088.0, + "48": 3547033088.0, + "49": 3547033088.0, + "50": 3547033088.0, + "51": 3547033088.0, + "52": 3547033088.0, + "53": 3547033088.0, + "54": 3547033088.0, + "55": 3547033088.0, + "56": 3547033088.0, + "57": 3547033088.0, + "58": 3547033088.0, + "59": 3547033088.0, + "60": 3547033088.0, + "61": 3547033088.0, + "62": 3547033088.0, + "63": 3547033088.0, + "64": 3547033088.0, + "65": 3547033088.0, + "66": 3547033088.0, + "67": 3547033088.0, + "68": 3547033088.0, + "69": 3547033088.0, + "70": 3547033088.0, + "71": 3547033088.0, + "72": 3547033088.0, + "73": 3547033088.0, + "74": 3547033088.0, + "75": 3547033088.0, + "76": 3547033088.0, + "77": 3547033088.0, + "78": 3547033088.0, + "79": 3547033088.0, + "80": 3547033088.0, + "81": 3547033088.0, + "82": 3547033088.0, + "83": 3547033088.0, + "84": 3547033088.0, + "85": 3547033088.0, + "86": 3547033088.0, + "87": 3547033088.0, + "88": 3547033088.0, + "89": 3547033088.0, + "90": 3547033088.0, + "91": 3547033088.0, + "92": 3547033088.0, + "93": 3547033088.0, + "94": 3547033088.0, + "95": 3547033088.0, + "96": 3547033088.0, + "97": 3547033088.0, + "98": 3547033088.0, + "99": 3547033088.0, + "100": 3547033088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5.93467, + "2": 0.17161, + "3": 0.14039, + "4": 0.13829, + "5": 0.13749, + "6": 0.13944, + "7": 0.13824, + "8": 0.13871, + "9": 0.13838, + "10": 0.13737, + "11": 0.13776, + "12": 0.13721, + "13": 0.13753, + "14": 0.13754, + "15": 0.13872, + "16": 0.13797, + "17": 0.20803, + "18": 0.15259, + "19": 0.14464, + "20": 0.14422, + "21": 0.14345, + "22": 0.13549, + "23": 0.14245, + "24": 0.14329, + "25": 0.14394, + "26": 0.14405, + "27": 0.14342, + "28": 0.14331, + "29": 0.14487, + "30": 0.14483, + "31": 0.14485, + "32": 0.14456, + "33": 0.14289, + "34": 0.14297, + "35": 0.14395, + "36": 0.14402, + "37": 0.14382, + "38": 0.13994, + "39": 0.14081, + "40": 0.14133, + "41": 0.14193, + "42": 0.14096, + "43": 0.14276, + "44": 0.14166, + "45": 0.13978, + "46": 0.1416, + "47": 0.14022, + "48": 0.14002, + "49": 0.14073, + "50": 0.14162, + "51": 0.14791, + "52": 0.14124, + "53": 0.14062, + "54": 0.14018, + "55": 0.14011, + "56": 0.13945, + "57": 0.14062, + "58": 0.14119, + "59": 0.14089, + "60": 0.14102, + "61": 0.13963, + "62": 0.14092, + "63": 0.14055, + "64": 0.14084, + "65": 0.14007, + "66": 0.13972, + "67": 0.14119, + "68": 0.13979, + "69": 0.14005, + "70": 0.14035, + "71": 0.14023, + "72": 0.14046, + "73": 0.1403, + "74": 0.13974, + "75": 0.14059, + "76": 0.1405, + "77": 0.14012, + "78": 0.14025, + "79": 0.13985, + "80": 0.1396, + "81": 0.1399, + "82": 0.14103, + "83": 0.13999, + "84": 0.13938, + "85": 0.13986, + "86": 0.14082, + "87": 0.13988, + "88": 0.13941, + "89": 0.13979, + "90": 0.13994, + "91": 0.14044, + "92": 0.13957, + "93": 0.14067, + "94": 0.13918, + "95": 0.14088, + "96": 0.14093, + "97": 0.13871, + "98": 0.13964, + "99": 0.13894, + "100": 0.13923 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2249.0, + "18": 2165.0, + "19": 2362.0, + "20": 1953.0, + "21": 1898.0, + "22": "nan", + "23": 2371.0, + "24": 1984.0, + "25": 1818.0, + "26": 1980.0, + "27": 2078.0, + "28": 2467.0, + "29": 2395.0, + "30": 2298.0, + "31": 1682.0, + "32": 2236.0, + "33": 2192.0, + "34": 1800.0, + "35": 2083.0, + "36": 2139.0, + "37": 2498.0, + "38": 2218.0, + "39": 2642.0, + "40": 2287.0, + "41": 2344.0, + "42": 2304.0, + "43": 2098.0, + "44": 2107.0, + "45": 2243.0, + "46": 1960.0, + "47": 2729.0, + "48": 2418.0, + "49": 1910.0, + "50": 2426.0, + "51": 2335.0, + "52": 2407.0, + "53": 2888.0, + "54": 2477.0, + "55": 2440.0, + "56": 2286.0, + "57": 2340.0, + "58": 2652.0, + "59": 2321.0, + "60": 2493.0, + "61": 2812.0, + "62": 2711.0, + "63": 2367.0, + "64": 2802.0, + "65": 2411.0, + "66": 2869.0, + "67": 2577.0, + "68": 2859.0, + "69": 2524.0, + "70": 3119.0, + "71": 2926.0, + "72": 2251.0, + "73": 2929.0, + "74": 2110.0, + "75": 2884.0, + "76": 2992.0, + "77": 3380.0, + "78": 3484.0, + "79": 3533.0, + "80": 3549.0, + "81": 3616.0, + "82": 3347.0, + "83": 3124.0, + "84": 3276.0, + "85": 3721.0, + "86": 3207.0, + "87": 3941.0, + "88": 3250.0, + "89": 3863.0, + "90": 3452.0, + "91": 2630.0, + "92": 3431.0, + "93": 3123.0, + "94": 3671.0, + "95": 3340.0, + "96": 3874.0, + "97": 3519.0, + "98": 3727.0, + "99": 3447.0, + "100": 3338.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..809ba358612 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8337, + "2": 10.83216, + "3": 10.83262, + "4": 10.80149, + "5": 10.85789, + "6": 10.86796, + "7": 10.84795, + "8": 10.84663, + "9": 10.86076, + "10": 10.81578, + "11": 10.89921, + "12": 10.88475, + "13": 10.89093, + "14": 10.9047, + "15": 10.84971, + "16": 10.86517, + "17": 10.85475, + "18": 10.8881, + "19": 10.87622, + "20": 10.85686, + "21": 10.85506, + "22": 10.79694, + "23": 10.88579, + "24": 10.8279, + "25": 10.81326, + "26": 10.82693, + "27": 10.846, + "28": 10.84147, + "29": 10.8522, + "30": 10.74663, + "31": 10.62679, + "32": 10.79112, + "33": 10.77171, + "34": 10.65521, + "35": 10.65647, + "36": 10.61755, + "37": 10.67472, + "38": 10.58181, + "39": 10.69126, + "40": 10.50351, + "41": 10.53015, + "42": 10.55529, + "43": 10.28638, + "44": 10.36341, + "45": 10.27258, + "46": 10.24593, + "47": 10.45076, + "48": 10.23738, + "49": 9.99756, + "50": 10.25445, + "51": 10.20109, + "52": 10.10787, + "53": 10.34615, + "54": 10.24765, + "55": 10.18699, + "56": 9.95445, + "57": 9.81113, + "58": 10.10718, + "59": 9.88656, + "60": 9.8098, + "61": 9.95021, + "62": 10.20123, + "63": 9.64325, + "64": 9.99571, + "65": 9.24409, + "66": 9.90919, + "67": 9.59742, + "68": 9.97199, + "69": 9.96262, + "70": 9.91024, + "71": 9.78581, + "72": 9.77311, + "73": 9.66157, + "74": 9.16191, + "75": 9.58173, + "76": 9.26165, + "77": 10.17527, + "78": 9.85663, + "79": 9.50663, + "80": 9.54167, + "81": 9.61305, + "82": 9.80599, + "83": 9.44744, + "84": 9.52725, + "85": 9.7262, + "86": 9.1912, + "87": 9.68768, + "88": 9.85199, + "89": 9.71342, + "90": 9.90242, + "91": 9.4603, + "92": 9.46187, + "93": 9.19485, + "94": 8.93416, + "95": 9.60208, + "96": 9.61859, + "97": 9.39629, + "98": 9.76032, + "99": 8.98677, + "100": 9.49424 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 684610560.0, + "2": 684610560.0, + "3": 684610560.0, + "4": 684610560.0, + "5": 684610560.0, + "6": 684610560.0, + "7": 684610560.0, + "8": 684610560.0, + "9": 684610560.0, + "10": 684610560.0, + "11": 684610560.0, + "12": 684610560.0, + "13": 684610560.0, + "14": 684610560.0, + "15": 684610560.0, + "16": 684610560.0, + "17": 1043027456.0, + "18": 1043027456.0, + "19": 1043027456.0, + "20": 1043027456.0, + "21": 1043027456.0, + "22": 1043027456.0, + "23": 1043027456.0, + "24": 1043027456.0, + "25": 1043027456.0, + "26": 1043027456.0, + "27": 1043027456.0, + "28": 1043027456.0, + "29": 1043027456.0, + "30": 1043027456.0, + "31": 1043027456.0, + "32": 1043027456.0, + "33": 1043027456.0, + "34": 1043027456.0, + "35": 1043027456.0, + "36": 1043027456.0, + "37": 1043027456.0, + "38": 1043027456.0, + "39": 1043027456.0, + "40": 1043027456.0, + "41": 1043027456.0, + "42": 1043027456.0, + "43": 1043027456.0, + "44": 1043027456.0, + "45": 1043027456.0, + "46": 1043027456.0, + "47": 1043027456.0, + "48": 1043027456.0, + "49": 1043027456.0, + "50": 1043027456.0, + "51": 1043027456.0, + "52": 1043027456.0, + "53": 1043027456.0, + "54": 1043027456.0, + "55": 1043027456.0, + "56": 1043027456.0, + "57": 1043027456.0, + "58": 1043027456.0, + "59": 1043027456.0, + "60": 1043027456.0, + "61": 1043027456.0, + "62": 1043027456.0, + "63": 1043027456.0, + "64": 1043027456.0, + "65": 1043027456.0, + "66": 1043027456.0, + "67": 1043027456.0, + "68": 1043027456.0, + "69": 1043027456.0, + "70": 1043027456.0, + "71": 1043027456.0, + "72": 1043027456.0, + "73": 1043027456.0, + "74": 1043027456.0, + "75": 1043027456.0, + "76": 1043027456.0, + "77": 1043027456.0, + "78": 1043027456.0, + "79": 1043027456.0, + "80": 1043027456.0, + "81": 1043027456.0, + "82": 1043027456.0, + "83": 1043027456.0, + "84": 1043027456.0, + "85": 1043027456.0, + "86": 1043027456.0, + "87": 1043027456.0, + "88": 1043027456.0, + "89": 1043027456.0, + "90": 1043027456.0, + "91": 1043027456.0, + "92": 1043027456.0, + "93": 1043027456.0, + "94": 1043027456.0, + "95": 1043027456.0, + "96": 1043027456.0, + "97": 1043027456.0, + "98": 1043027456.0, + "99": 1043027456.0, + "100": 1043027456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3187304960.0, + "2": 3187305472.0, + "3": 3187305472.0, + "4": 3187305472.0, + "5": 3187305472.0, + "6": 3187305472.0, + "7": 3187305472.0, + "8": 3187305472.0, + "9": 3187305472.0, + "10": 3187305472.0, + "11": 3187305472.0, + "12": 3187305472.0, + "13": 3187305472.0, + "14": 3187305472.0, + "15": 3187305472.0, + "16": 3187305472.0, + "17": 3187305472.0, + "18": 3547033088.0, + "19": 3547033088.0, + "20": 3547033088.0, + "21": 3547033088.0, + "22": 3547033088.0, + "23": 3547033088.0, + "24": 3547033088.0, + "25": 3547033088.0, + "26": 3547033088.0, + "27": 3547033088.0, + "28": 3547033088.0, + "29": 3547033088.0, + "30": 3547033088.0, + "31": 3547033088.0, + "32": 3547033088.0, + "33": 3547033088.0, + "34": 3547033088.0, + "35": 3547033088.0, + "36": 3547033088.0, + "37": 3547033088.0, + "38": 3547033088.0, + "39": 3547033088.0, + "40": 3547033088.0, + "41": 3547033088.0, + "42": 3547033088.0, + "43": 3547033088.0, + "44": 3547033088.0, + "45": 3547033088.0, + "46": 3547033088.0, + "47": 3547033088.0, + "48": 3547033088.0, + "49": 3547033088.0, + "50": 3547033088.0, + "51": 3547033088.0, + "52": 3547033088.0, + "53": 3547033088.0, + "54": 3547033088.0, + "55": 3547033088.0, + "56": 3547033088.0, + "57": 3547033088.0, + "58": 3547033088.0, + "59": 3547033088.0, + "60": 3547033088.0, + "61": 3547033088.0, + "62": 3547033088.0, + "63": 3547033088.0, + "64": 3547033088.0, + "65": 3547033088.0, + "66": 3547033088.0, + "67": 3547033088.0, + "68": 3547033088.0, + "69": 3547033088.0, + "70": 3547033088.0, + "71": 3547033088.0, + "72": 3547033088.0, + "73": 3547033088.0, + "74": 3547033088.0, + "75": 3547033088.0, + "76": 3547033088.0, + "77": 3547033088.0, + "78": 3547033088.0, + "79": 3547033088.0, + "80": 3547033088.0, + "81": 3547033088.0, + "82": 3547033088.0, + "83": 3547033088.0, + "84": 3547033088.0, + "85": 3547033088.0, + "86": 3547033088.0, + "87": 3547033088.0, + "88": 3547033088.0, + "89": 3547033088.0, + "90": 3547033088.0, + "91": 3547033088.0, + "92": 3547033088.0, + "93": 3547033088.0, + "94": 3547033088.0, + "95": 3547033088.0, + "96": 3547033088.0, + "97": 3547033088.0, + "98": 3547033088.0, + "99": 3547033088.0, + "100": 3547033088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.37801, + "2": 0.17868, + "3": 0.15737, + "4": 0.15064, + "5": 0.14295, + "6": 0.14401, + "7": 0.14335, + "8": 0.14238, + "9": 0.14059, + "10": 0.14021, + "11": 0.14214, + "12": 0.14087, + "13": 0.13924, + "14": 0.13916, + "15": 0.13973, + "16": 0.13895, + "17": 0.19936, + "18": 0.22469, + "19": 0.1492, + "20": 0.1494, + "21": 0.14972, + "22": 0.1406, + "23": 0.14885, + "24": 0.15067, + "25": 0.14941, + "26": 0.14905, + "27": 0.14895, + "28": 0.1478, + "29": 0.14932, + "30": 0.14921, + "31": 0.15043, + "32": 0.15028, + "33": 0.14795, + "34": 0.14864, + "35": 0.14904, + "36": 0.1491, + "37": 0.14886, + "38": 0.14931, + "39": 0.1489, + "40": 0.14851, + "41": 0.14847, + "42": 0.14829, + "43": 0.15254, + "44": 0.1485, + "45": 0.14926, + "46": 0.1481, + "47": 0.14794, + "48": 0.14884, + "49": 0.1478, + "50": 0.14737, + "51": 0.15947, + "52": 0.15469, + "53": 0.15082, + "54": 0.15106, + "55": 0.15266, + "56": 0.15055, + "57": 0.15141, + "58": 0.15117, + "59": 0.15229, + "60": 0.15163, + "61": 0.1511, + "62": 0.15177, + "63": 0.1513, + "64": 0.15114, + "65": 0.1506, + "66": 0.15109, + "67": 0.15009, + "68": 0.1507, + "69": 0.15042, + "70": 0.15201, + "71": 0.15105, + "72": 0.1509, + "73": 0.1504, + "74": 0.15078, + "75": 0.15053, + "76": 0.14994, + "77": 0.14987, + "78": 0.15076, + "79": 0.15058, + "80": 0.1508, + "81": 0.15114, + "82": 0.15016, + "83": 0.15085, + "84": 0.15149, + "85": 0.15054, + "86": 0.15154, + "87": 0.15001, + "88": 0.14995, + "89": 0.15097, + "90": 0.15063, + "91": 0.15144, + "92": 0.15033, + "93": 0.14991, + "94": 0.15161, + "95": 0.15125, + "96": 0.1519, + "97": 0.15146, + "98": 0.15186, + "99": 0.153, + "100": 0.15275 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2245.0, + "18": 2160.0, + "19": 2344.0, + "20": 1969.0, + "21": 1966.0, + "22": "nan", + "23": 2369.0, + "24": 1914.0, + "25": 1863.0, + "26": 1931.0, + "27": 2040.0, + "28": 2378.0, + "29": 2411.0, + "30": 2312.0, + "31": 1759.0, + "32": 2303.0, + "33": 2170.0, + "34": 1860.0, + "35": 2063.0, + "36": 2040.0, + "37": 2464.0, + "38": 2129.0, + "39": 2616.0, + "40": 2212.0, + "41": 2402.0, + "42": 2290.0, + "43": 2083.0, + "44": 2083.0, + "45": 2333.0, + "46": 1979.0, + "47": 2653.0, + "48": 2424.0, + "49": 1878.0, + "50": 2369.0, + "51": 2318.0, + "52": 2456.0, + "53": 2905.0, + "54": 2495.0, + "55": 2357.0, + "56": 2295.0, + "57": 2256.0, + "58": 2752.0, + "59": 2319.0, + "60": 2500.0, + "61": 2883.0, + "62": 2791.0, + "63": 2396.0, + "64": 2838.0, + "65": 2438.0, + "66": 2880.0, + "67": 2596.0, + "68": 2940.0, + "69": 2730.0, + "70": 3075.0, + "71": 2957.0, + "72": 2334.0, + "73": 2995.0, + "74": 2178.0, + "75": 2803.0, + "76": 3073.0, + "77": 3411.0, + "78": 3517.0, + "79": 3430.0, + "80": 3568.0, + "81": 3657.0, + "82": 3328.0, + "83": 3188.0, + "84": 3296.0, + "85": 3675.0, + "86": 3300.0, + "87": 3966.0, + "88": 3275.0, + "89": 3995.0, + "90": 3397.0, + "91": 2658.0, + "92": 3409.0, + "93": 3067.0, + "94": 3727.0, + "95": 3468.0, + "96": 3802.0, + "97": 3448.0, + "98": 3735.0, + "99": 3426.0, + "100": 3267.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..de5bb1034d5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8337, + "2": 10.83216, + "3": 10.83262, + "4": 10.80149, + "5": 10.85789, + "6": 10.86796, + "7": 10.84795, + "8": 10.84663, + "9": 10.86076, + "10": 10.81578, + "11": 10.89921, + "12": 10.88475, + "13": 10.89093, + "14": 10.9047, + "15": 10.84971, + "16": 10.86517, + "17": 10.85475, + "18": 10.8881, + "19": 10.87622, + "20": 10.85686, + "21": 10.85506, + "22": 10.79694, + "23": 10.88579, + "24": 10.8279, + "25": 10.81326, + "26": 10.82693, + "27": 10.846, + "28": 10.84147, + "29": 10.8522, + "30": 10.74663, + "31": 10.62679, + "32": 10.79112, + "33": 10.77171, + "34": 10.65521, + "35": 10.65647, + "36": 10.61755, + "37": 10.67472, + "38": 10.58181, + "39": 10.69126, + "40": 10.50351, + "41": 10.53015, + "42": 10.55529, + "43": 10.28638, + "44": 10.36341, + "45": 10.27258, + "46": 10.24593, + "47": 10.45076, + "48": 10.23738, + "49": 9.99756, + "50": 10.25445, + "51": 10.20109, + "52": 10.10787, + "53": 10.34615, + "54": 10.24765, + "55": 10.18699, + "56": 9.95445, + "57": 9.81113, + "58": 10.10718, + "59": 9.88656, + "60": 9.8098, + "61": 9.95021, + "62": 10.20123, + "63": 9.64325, + "64": 9.99571, + "65": 9.24409, + "66": 9.90919, + "67": 9.59742, + "68": 9.97199, + "69": 9.96262, + "70": 9.91024, + "71": 9.78581, + "72": 9.77311, + "73": 9.66157, + "74": 9.16191, + "75": 9.58173, + "76": 9.26165, + "77": 10.17527, + "78": 9.85663, + "79": 9.50663, + "80": 9.54167, + "81": 9.61305, + "82": 9.80599, + "83": 9.44744, + "84": 9.52725, + "85": 9.7262, + "86": 9.1912, + "87": 9.68768, + "88": 9.85199, + "89": 9.71342, + "90": 9.90242, + "91": 9.4603, + "92": 9.46187, + "93": 9.19485, + "94": 8.93416, + "95": 9.60208, + "96": 9.61859, + "97": 9.39629, + "98": 9.76032, + "99": 8.98677, + "100": 9.49424 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 684610560.0, + "2": 684610560.0, + "3": 684610560.0, + "4": 684610560.0, + "5": 684610560.0, + "6": 684610560.0, + "7": 684610560.0, + "8": 684610560.0, + "9": 684610560.0, + "10": 684610560.0, + "11": 684610560.0, + "12": 684610560.0, + "13": 684610560.0, + "14": 684610560.0, + "15": 684610560.0, + "16": 684610560.0, + "17": 1043027456.0, + "18": 1043027456.0, + "19": 1043027456.0, + "20": 1043027456.0, + "21": 1043027456.0, + "22": 1043027456.0, + "23": 1043027456.0, + "24": 1043027456.0, + "25": 1043027456.0, + "26": 1043027456.0, + "27": 1043027456.0, + "28": 1043027456.0, + "29": 1043027456.0, + "30": 1043027456.0, + "31": 1043027456.0, + "32": 1043027456.0, + "33": 1043027456.0, + "34": 1043027456.0, + "35": 1043027456.0, + "36": 1043027456.0, + "37": 1043027456.0, + "38": 1043027456.0, + "39": 1043027456.0, + "40": 1043027456.0, + "41": 1043027456.0, + "42": 1043027456.0, + "43": 1043027456.0, + "44": 1043027456.0, + "45": 1043027456.0, + "46": 1043027456.0, + "47": 1043027456.0, + "48": 1043027456.0, + "49": 1043027456.0, + "50": 1043027456.0, + "51": 1043027456.0, + "52": 1043027456.0, + "53": 1043027456.0, + "54": 1043027456.0, + "55": 1043027456.0, + "56": 1043027456.0, + "57": 1043027456.0, + "58": 1043027456.0, + "59": 1043027456.0, + "60": 1043027456.0, + "61": 1043027456.0, + "62": 1043027456.0, + "63": 1043027456.0, + "64": 1043027456.0, + "65": 1043027456.0, + "66": 1043027456.0, + "67": 1043027456.0, + "68": 1043027456.0, + "69": 1043027456.0, + "70": 1043027456.0, + "71": 1043027456.0, + "72": 1043027456.0, + "73": 1043027456.0, + "74": 1043027456.0, + "75": 1043027456.0, + "76": 1043027456.0, + "77": 1043027456.0, + "78": 1043027456.0, + "79": 1043027456.0, + "80": 1043027456.0, + "81": 1043027456.0, + "82": 1043027456.0, + "83": 1043027456.0, + "84": 1043027456.0, + "85": 1043027456.0, + "86": 1043027456.0, + "87": 1043027456.0, + "88": 1043027456.0, + "89": 1043027456.0, + "90": 1043027456.0, + "91": 1043027456.0, + "92": 1043027456.0, + "93": 1043027456.0, + "94": 1043027456.0, + "95": 1043027456.0, + "96": 1043027456.0, + "97": 1043027456.0, + "98": 1043027456.0, + "99": 1043027456.0, + "100": 1043027456.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3187304960.0, + "2": 3187305472.0, + "3": 3187305472.0, + "4": 3187305472.0, + "5": 3187305472.0, + "6": 3187305472.0, + "7": 3187305472.0, + "8": 3187305472.0, + "9": 3187305472.0, + "10": 3187305472.0, + "11": 3187305472.0, + "12": 3187305472.0, + "13": 3187305472.0, + "14": 3187305472.0, + "15": 3187305472.0, + "16": 3187305472.0, + "17": 3187305472.0, + "18": 3547033088.0, + "19": 3547033088.0, + "20": 3547033088.0, + "21": 3547033088.0, + "22": 3547033088.0, + "23": 3547033088.0, + "24": 3547033088.0, + "25": 3547033088.0, + "26": 3547033088.0, + "27": 3547033088.0, + "28": 3547033088.0, + "29": 3547033088.0, + "30": 3547033088.0, + "31": 3547033088.0, + "32": 3547033088.0, + "33": 3547033088.0, + "34": 3547033088.0, + "35": 3547033088.0, + "36": 3547033088.0, + "37": 3547033088.0, + "38": 3547033088.0, + "39": 3547033088.0, + "40": 3547033088.0, + "41": 3547033088.0, + "42": 3547033088.0, + "43": 3547033088.0, + "44": 3547033088.0, + "45": 3547033088.0, + "46": 3547033088.0, + "47": 3547033088.0, + "48": 3547033088.0, + "49": 3547033088.0, + "50": 3547033088.0, + "51": 3547033088.0, + "52": 3547033088.0, + "53": 3547033088.0, + "54": 3547033088.0, + "55": 3547033088.0, + "56": 3547033088.0, + "57": 3547033088.0, + "58": 3547033088.0, + "59": 3547033088.0, + "60": 3547033088.0, + "61": 3547033088.0, + "62": 3547033088.0, + "63": 3547033088.0, + "64": 3547033088.0, + "65": 3547033088.0, + "66": 3547033088.0, + "67": 3547033088.0, + "68": 3547033088.0, + "69": 3547033088.0, + "70": 3547033088.0, + "71": 3547033088.0, + "72": 3547033088.0, + "73": 3547033088.0, + "74": 3547033088.0, + "75": 3547033088.0, + "76": 3547033088.0, + "77": 3547033088.0, + "78": 3547033088.0, + "79": 3547033088.0, + "80": 3547033088.0, + "81": 3547033088.0, + "82": 3547033088.0, + "83": 3547033088.0, + "84": 3547033088.0, + "85": 3547033088.0, + "86": 3547033088.0, + "87": 3547033088.0, + "88": 3547033088.0, + "89": 3547033088.0, + "90": 3547033088.0, + "91": 3547033088.0, + "92": 3547033088.0, + "93": 3547033088.0, + "94": 3547033088.0, + "95": 3547033088.0, + "96": 3547033088.0, + "97": 3547033088.0, + "98": 3547033088.0, + "99": 3547033088.0, + "100": 3547033088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.07277, + "2": 0.17981, + "3": 0.14386, + "4": 0.1435, + "5": 0.14361, + "6": 0.14398, + "7": 0.14414, + "8": 0.14134, + "9": 0.14066, + "10": 0.14194, + "11": 0.14352, + "12": 0.14166, + "13": 0.14151, + "14": 0.1412, + "15": 0.14002, + "16": 0.13993, + "17": 0.30867, + "18": 0.15579, + "19": 0.15102, + "20": 0.15133, + "21": 0.14959, + "22": 0.14048, + "23": 0.14802, + "24": 0.14897, + "25": 0.14939, + "26": 0.14898, + "27": 0.14842, + "28": 0.14823, + "29": 0.14857, + "30": 0.14925, + "31": 0.15012, + "32": 0.14855, + "33": 0.14814, + "34": 0.14919, + "35": 0.14741, + "36": 0.14744, + "37": 0.14683, + "38": 0.14765, + "39": 0.14761, + "40": 0.14793, + "41": 0.1474, + "42": 0.14696, + "43": 0.1474, + "44": 0.14654, + "45": 0.14791, + "46": 0.14781, + "47": 0.14668, + "48": 0.14704, + "49": 0.14651, + "50": 0.14572, + "51": 0.15362, + "52": 0.14601, + "53": 0.14563, + "54": 0.14741, + "55": 0.14637, + "56": 0.14559, + "57": 0.14652, + "58": 0.14699, + "59": 0.14779, + "60": 0.1462, + "61": 0.14772, + "62": 0.14661, + "63": 0.14845, + "64": 0.14671, + "65": 0.1482, + "66": 0.14822, + "67": 0.14825, + "68": 0.14639, + "69": 0.15372, + "70": 0.14987, + "71": 0.15493, + "72": 0.1481, + "73": 0.15538, + "74": 0.14975, + "75": 0.15142, + "76": 0.15038, + "77": 0.15289, + "78": 0.14615, + "79": 0.14637, + "80": 0.14753, + "81": 0.14757, + "82": 0.14613, + "83": 0.14695, + "84": 0.14643, + "85": 0.14587, + "86": 0.15058, + "87": 0.14782, + "88": 0.1457, + "89": 0.14638, + "90": 0.14656, + "91": 0.14569, + "92": 0.14658, + "93": 0.14636, + "94": 0.14616, + "95": 0.14633, + "96": 0.14546, + "97": 0.14634, + "98": 0.14579, + "99": 0.14537, + "100": 0.14711 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2245.0, + "18": 2160.0, + "19": 2344.0, + "20": 1969.0, + "21": 1966.0, + "22": "nan", + "23": 2369.0, + "24": 1914.0, + "25": 1863.0, + "26": 1931.0, + "27": 2040.0, + "28": 2378.0, + "29": 2411.0, + "30": 2312.0, + "31": 1759.0, + "32": 2303.0, + "33": 2170.0, + "34": 1860.0, + "35": 2063.0, + "36": 2040.0, + "37": 2464.0, + "38": 2129.0, + "39": 2616.0, + "40": 2212.0, + "41": 2402.0, + "42": 2290.0, + "43": 2083.0, + "44": 2083.0, + "45": 2333.0, + "46": 1979.0, + "47": 2653.0, + "48": 2424.0, + "49": 1878.0, + "50": 2369.0, + "51": 2318.0, + "52": 2456.0, + "53": 2905.0, + "54": 2495.0, + "55": 2357.0, + "56": 2295.0, + "57": 2256.0, + "58": 2752.0, + "59": 2319.0, + "60": 2500.0, + "61": 2883.0, + "62": 2791.0, + "63": 2396.0, + "64": 2838.0, + "65": 2438.0, + "66": 2880.0, + "67": 2596.0, + "68": 2940.0, + "69": 2730.0, + "70": 3075.0, + "71": 2957.0, + "72": 2334.0, + "73": 2995.0, + "74": 2178.0, + "75": 2803.0, + "76": 3073.0, + "77": 3411.0, + "78": 3517.0, + "79": 3430.0, + "80": 3568.0, + "81": 3657.0, + "82": 3328.0, + "83": 3188.0, + "84": 3296.0, + "85": 3675.0, + "86": 3300.0, + "87": 3966.0, + "88": 3275.0, + "89": 3995.0, + "90": 3397.0, + "91": 2658.0, + "92": 3409.0, + "93": 3067.0, + "94": 3727.0, + "95": 3468.0, + "96": 3802.0, + "97": 3448.0, + "98": 3735.0, + "99": 3426.0, + "100": 3267.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json index 66d41feb78a..6a5be6c0d9c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85678, + "2": 10.86405, + "3": 10.86854, + "4": 10.85128, "5": 10.88398, + "6": 10.89024, + "7": 10.86645, + "8": 10.86924, + "9": 10.87305, "10": 10.84079, + "11": 10.87928, + "12": 10.8729, + "13": 10.87791, + "14": 10.8901, "15": 10.82504, + "16": 10.8296, + "17": 10.80874, + "18": 10.8116, + "19": 10.81543, "20": 10.71912, + "21": 10.70404, + "22": 10.56645, + "23": 10.71858, + "24": 10.60989, "25": 10.55479, + "26": 10.60874, + "27": 10.62302, + "28": 10.56954, + "29": 10.57966, "30": 10.35998, + "31": 10.11311, + "32": 10.46587, + "33": 10.45154, + "34": 10.20826, "35": 10.26937, + "36": 10.21924, + "37": 10.33852, + "38": 10.186, + "39": 10.3997, "40": 10.08396, + "41": 10.13418, + "42": 10.20887, + "43": 9.82537, + "44": 9.95906, "45": 9.82563, + "46": 9.80623, + "47": 10.13499, + "48": 9.84002, + "49": 9.52482, "50": 9.90725 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1654.0, + "2": 1646.0, + "3": 1565.0, + "4": 1759.0, "5": 1860.0, + "6": 1741.0, + "7": 1752.0, + "8": 1579.0, + "9": 1849.0, "10": 1317.0, + "11": 1901.0, + "12": 1702.0, + "13": 1872.0, + "14": 1781.0, "15": 1759.0, + "16": 1820.0, + "17": 1819.0, + "18": 1721.0, + "19": 1828.0, "20": 1730.0, + "21": 1935.0, + "22": 1764.0, + "23": 1962.0, + "24": 1564.0, "25": 1552.0, + "26": 1668.0, + "27": 1803.0, + "28": 1988.0, + "29": 1966.0, "30": 1895.0, + "31": 1532.0, + "32": 1866.0, + "33": 2026.0, + "34": 1906.0, "35": 1987.0, + "36": 1863.0, + "37": 2231.0, + "38": 2109.0, + "39": 2277.0, "40": 2099.0, + "41": 2209.0, + "42": 2227.0, + "43": 1913.0, + "44": 2129.0, "45": 1993.0, + "46": 2288.0, + "47": 2458.0, + "48": 2418.0, + "49": 2155.0, "50": 2085.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 777900032.0, + "2": 777900032.0, + "3": 777900032.0, + "4": 777900032.0, "5": 777900032.0, + "6": 777900032.0, + "7": 777900032.0, + "8": 777900032.0, + "9": 777900032.0, "10": 777900032.0, + "11": 777900032.0, + "12": 777900032.0, + "13": 777900032.0, + "14": 777900032.0, "15": 777900032.0, + "16": 777900032.0, + "17": 777900032.0, + "18": 777900032.0, + "19": 777900032.0, "20": 777900032.0, + "21": 777900032.0, + "22": 777900032.0, + "23": 777900032.0, + "24": 777900032.0, "25": 777900032.0, + "26": 777900032.0, + "27": 777900032.0, + "28": 777900032.0, + "29": 777900032.0, "30": 777900032.0, + "31": 777900032.0, + "32": 777900032.0, + "33": 777900032.0, + "34": 777900032.0, "35": 777900032.0, + "36": 777900032.0, + "37": 777900032.0, + "38": 777900032.0, + "39": 777900032.0, "40": 777900032.0, + "41": 777900032.0, + "42": 777900032.0, + "43": 777900032.0, + "44": 777900032.0, "45": 777900032.0, + "46": 777900032.0, + "47": 777900032.0, + "48": 777900032.0, + "49": 777900032.0, "50": 777900032.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2463815680.0, + "2": 2744478720.0, + "3": 2744478720.0, + "4": 2744478720.0, "5": 2744478720.0, + "6": 2744478720.0, + "7": 2744478720.0, + "8": 2744478720.0, + "9": 2744478720.0, "10": 2744478720.0, + "11": 2744478720.0, + "12": 2744478720.0, + "13": 2744478720.0, + "14": 2744478720.0, "15": 2744478720.0, + "16": 2744478720.0, + "17": 2744478720.0, + "18": 2744478720.0, + "19": 2744478720.0, "20": 2744478720.0, + "21": 2744478720.0, + "22": 2744478720.0, + "23": 2744478720.0, + "24": 2744478720.0, "25": 2744478720.0, + "26": 2744478720.0, + "27": 2744478720.0, + "28": 2744478720.0, + "29": 2744478720.0, "30": 2744478720.0, + "31": 2744478720.0, + "32": 2744478720.0, + "33": 2744478720.0, + "34": 2744478720.0, "35": 2744478720.0, + "36": 2744478720.0, + "37": 2744478720.0, + "38": 2744478720.0, + "39": 2744478720.0, "40": 2744478720.0, + "41": 2744478720.0, + "42": 2744478720.0, + "43": 2744478720.0, + "44": 2744478720.0, "45": 2744478720.0, + "46": 2744478720.0, + "47": 2744478720.0, + "48": 2744478720.0, + "49": 2744478720.0, "50": 2744478720.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 12.50471, - "5": 0.10661, - "10": 0.10734, - "15": 0.1053, - "20": 0.10696, - "25": 0.10794, - "30": 0.10635, - "35": 0.10713, - "40": 0.10333, - "45": 0.10618, - "50": 0.10738 + "1": 11.05472, + "2": 0.1429, + "3": 0.12828, + "4": 0.12976, + "5": 0.12969, + "6": 0.12181, + "7": 0.12512, + "8": 0.12267, + "9": 0.12362, + "10": 0.12382, + "11": 0.1219, + "12": 0.12295, + "13": 0.12406, + "14": 0.12396, + "15": 0.12483, + "16": 0.12596, + "17": 0.12252, + "18": 0.12284, + "19": 0.12465, + "20": 0.12674, + "21": 0.12398, + "22": 0.12376, + "23": 0.12244, + "24": 0.12641, + "25": 0.1234, + "26": 0.12355, + "27": 0.12183, + "28": 0.12355, + "29": 0.12372, + "30": 0.12258, + "31": 0.1231, + "32": 0.12444, + "33": 0.12266, + "34": 0.12208, + "35": 0.12181, + "36": 0.12028, + "37": 0.12298, + "38": 0.1214, + "39": 0.12242, + "40": 0.12058, + "41": 0.12169, + "42": 0.1223, + "43": 0.1221, + "44": 0.12176, + "45": 0.12039, + "46": 0.12206, + "47": 0.12138, + "48": 0.12715, + "49": 0.12339, + "50": 0.12175 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..e8f7325e5f3 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.85678, + "2": 10.86405, + "3": 10.86854, + "4": 10.85128, + "5": 10.88398, + "6": 10.89024, + "7": 10.86645, + "8": 10.86924, + "9": 10.87305, + "10": 10.84079, + "11": 10.87928, + "12": 10.8729, + "13": 10.87791, + "14": 10.8901, + "15": 10.82504, + "16": 10.8296, + "17": 10.80874, + "18": 10.8116, + "19": 10.81543, + "20": 10.71912, + "21": 10.70404, + "22": 10.56645, + "23": 10.71858, + "24": 10.60989, + "25": 10.55479, + "26": 10.60874, + "27": 10.62302, + "28": 10.56954, + "29": 10.57966, + "30": 10.35998, + "31": 10.11311, + "32": 10.46587, + "33": 10.45154, + "34": 10.20826, + "35": 10.26937, + "36": 10.21924, + "37": 10.33852, + "38": 10.186, + "39": 10.3997, + "40": 10.08396, + "41": 10.13418, + "42": 10.20887, + "43": 9.82537, + "44": 9.95906, + "45": 9.82563, + "46": 9.80623, + "47": 10.13499, + "48": 9.84002, + "49": 9.52482, + "50": 9.90725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1654.0, + "2": 1646.0, + "3": 1565.0, + "4": 1759.0, + "5": 1860.0, + "6": 1741.0, + "7": 1752.0, + "8": 1579.0, + "9": 1849.0, + "10": 1317.0, + "11": 1901.0, + "12": 1702.0, + "13": 1872.0, + "14": 1781.0, + "15": 1759.0, + "16": 1820.0, + "17": 1819.0, + "18": 1721.0, + "19": 1828.0, + "20": 1730.0, + "21": 1935.0, + "22": 1764.0, + "23": 1962.0, + "24": 1564.0, + "25": 1552.0, + "26": 1668.0, + "27": 1803.0, + "28": 1988.0, + "29": 1966.0, + "30": 1895.0, + "31": 1532.0, + "32": 1866.0, + "33": 2026.0, + "34": 1906.0, + "35": 1987.0, + "36": 1863.0, + "37": 2231.0, + "38": 2109.0, + "39": 2277.0, + "40": 2099.0, + "41": 2209.0, + "42": 2227.0, + "43": 1913.0, + "44": 2129.0, + "45": 1993.0, + "46": 2288.0, + "47": 2458.0, + "48": 2418.0, + "49": 2155.0, + "50": 2085.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 777900032.0, + "2": 777900032.0, + "3": 777900032.0, + "4": 777900032.0, + "5": 777900032.0, + "6": 777900032.0, + "7": 777900032.0, + "8": 777900032.0, + "9": 777900032.0, + "10": 777900032.0, + "11": 777900032.0, + "12": 777900032.0, + "13": 777900032.0, + "14": 777900032.0, + "15": 777900032.0, + "16": 777900032.0, + "17": 777900032.0, + "18": 777900032.0, + "19": 777900032.0, + "20": 777900032.0, + "21": 777900032.0, + "22": 777900032.0, + "23": 777900032.0, + "24": 777900032.0, + "25": 777900032.0, + "26": 777900032.0, + "27": 777900032.0, + "28": 777900032.0, + "29": 777900032.0, + "30": 777900032.0, + "31": 777900032.0, + "32": 777900032.0, + "33": 777900032.0, + "34": 777900032.0, + "35": 777900032.0, + "36": 777900032.0, + "37": 777900032.0, + "38": 777900032.0, + "39": 777900032.0, + "40": 777900032.0, + "41": 777900032.0, + "42": 777900032.0, + "43": 777900032.0, + "44": 777900032.0, + "45": 777900032.0, + "46": 777900032.0, + "47": 777900032.0, + "48": 777900032.0, + "49": 777900032.0, + "50": 777900032.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2463815680.0, + "2": 2744478720.0, + "3": 2744478720.0, + "4": 2744478720.0, + "5": 2744478720.0, + "6": 2744478720.0, + "7": 2744478720.0, + "8": 2744478720.0, + "9": 2744478720.0, + "10": 2744478720.0, + "11": 2744478720.0, + "12": 2744478720.0, + "13": 2744478720.0, + "14": 2744478720.0, + "15": 2744478720.0, + "16": 2744478720.0, + "17": 2744478720.0, + "18": 2744478720.0, + "19": 2744478720.0, + "20": 2744478720.0, + "21": 2744478720.0, + "22": 2744478720.0, + "23": 2744478720.0, + "24": 2744478720.0, + "25": 2744478720.0, + "26": 2744478720.0, + "27": 2744478720.0, + "28": 2744478720.0, + "29": 2744478720.0, + "30": 2744478720.0, + "31": 2744478720.0, + "32": 2744478720.0, + "33": 2744478720.0, + "34": 2744478720.0, + "35": 2744478720.0, + "36": 2744478720.0, + "37": 2744478720.0, + "38": 2744478720.0, + "39": 2744478720.0, + "40": 2744478720.0, + "41": 2744478720.0, + "42": 2744478720.0, + "43": 2744478720.0, + "44": 2744478720.0, + "45": 2744478720.0, + "46": 2744478720.0, + "47": 2744478720.0, + "48": 2744478720.0, + "49": 2744478720.0, + "50": 2744478720.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.35419, + "2": 0.13991, + "3": 0.10767, + "4": 0.10938, + "5": 0.10724, + "6": 0.10478, + "7": 0.10552, + "8": 0.10656, + "9": 0.10556, + "10": 0.10532, + "11": 0.10534, + "12": 0.10534, + "13": 0.10527, + "14": 0.10709, + "15": 0.10495, + "16": 0.10604, + "17": 0.10965, + "18": 0.1088, + "19": 0.1041, + "20": 0.10506, + "21": 0.1048, + "22": 0.10602, + "23": 0.10565, + "24": 0.1054, + "25": 0.10522, + "26": 0.10463, + "27": 0.10589, + "28": 0.10459, + "29": 0.10668, + "30": 0.10356, + "31": 0.10981, + "32": 0.10384, + "33": 0.1044, + "34": 0.10384, + "35": 0.10498, + "36": 0.10335, + "37": 0.10417, + "38": 0.10399, + "39": 0.10546, + "40": 0.10397, + "41": 0.10485, + "42": 0.104, + "43": 0.10561, + "44": 0.10556, + "45": 0.10548, + "46": 0.10502, + "47": 0.10566, + "48": 0.10496, + "49": 0.1064, + "50": 0.10702 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..5517997e6c1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.85678, + "2": 10.86405, + "3": 10.86854, + "4": 10.85128, + "5": 10.88398, + "6": 10.89024, + "7": 10.86645, + "8": 10.86924, + "9": 10.87305, + "10": 10.84079, + "11": 10.87928, + "12": 10.8729, + "13": 10.87791, + "14": 10.8901, + "15": 10.82504, + "16": 10.8296, + "17": 10.80874, + "18": 10.8116, + "19": 10.81543, + "20": 10.71912, + "21": 10.70404, + "22": 10.56645, + "23": 10.71858, + "24": 10.60989, + "25": 10.55479, + "26": 10.60874, + "27": 10.62302, + "28": 10.56954, + "29": 10.57966, + "30": 10.35998, + "31": 10.11311, + "32": 10.46587, + "33": 10.45154, + "34": 10.20826, + "35": 10.26937, + "36": 10.21924, + "37": 10.33852, + "38": 10.186, + "39": 10.3997, + "40": 10.08396, + "41": 10.13418, + "42": 10.20887, + "43": 9.82537, + "44": 9.95906, + "45": 9.82563, + "46": 9.80623, + "47": 10.13499, + "48": 9.84002, + "49": 9.52482, + "50": 9.90725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1654.0, + "2": 1646.0, + "3": 1565.0, + "4": 1759.0, + "5": 1860.0, + "6": 1741.0, + "7": 1752.0, + "8": 1579.0, + "9": 1849.0, + "10": 1317.0, + "11": 1901.0, + "12": 1702.0, + "13": 1872.0, + "14": 1781.0, + "15": 1759.0, + "16": 1820.0, + "17": 1819.0, + "18": 1721.0, + "19": 1828.0, + "20": 1730.0, + "21": 1935.0, + "22": 1764.0, + "23": 1962.0, + "24": 1564.0, + "25": 1552.0, + "26": 1668.0, + "27": 1803.0, + "28": 1988.0, + "29": 1966.0, + "30": 1895.0, + "31": 1532.0, + "32": 1866.0, + "33": 2026.0, + "34": 1906.0, + "35": 1987.0, + "36": 1863.0, + "37": 2231.0, + "38": 2109.0, + "39": 2277.0, + "40": 2099.0, + "41": 2209.0, + "42": 2227.0, + "43": 1913.0, + "44": 2129.0, + "45": 1993.0, + "46": 2288.0, + "47": 2458.0, + "48": 2418.0, + "49": 2155.0, + "50": 2085.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 777900032.0, + "2": 777900032.0, + "3": 777900032.0, + "4": 777900032.0, + "5": 777900032.0, + "6": 777900032.0, + "7": 777900032.0, + "8": 777900032.0, + "9": 777900032.0, + "10": 777900032.0, + "11": 777900032.0, + "12": 777900032.0, + "13": 777900032.0, + "14": 777900032.0, + "15": 777900032.0, + "16": 777900032.0, + "17": 777900032.0, + "18": 777900032.0, + "19": 777900032.0, + "20": 777900032.0, + "21": 777900032.0, + "22": 777900032.0, + "23": 777900032.0, + "24": 777900032.0, + "25": 777900032.0, + "26": 777900032.0, + "27": 777900032.0, + "28": 777900032.0, + "29": 777900032.0, + "30": 777900032.0, + "31": 777900032.0, + "32": 777900032.0, + "33": 777900032.0, + "34": 777900032.0, + "35": 777900032.0, + "36": 777900032.0, + "37": 777900032.0, + "38": 777900032.0, + "39": 777900032.0, + "40": 777900032.0, + "41": 777900032.0, + "42": 777900032.0, + "43": 777900032.0, + "44": 777900032.0, + "45": 777900032.0, + "46": 777900032.0, + "47": 777900032.0, + "48": 777900032.0, + "49": 777900032.0, + "50": 777900032.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2463815680.0, + "2": 2744478720.0, + "3": 2744478720.0, + "4": 2744478720.0, + "5": 2744478720.0, + "6": 2744478720.0, + "7": 2744478720.0, + "8": 2744478720.0, + "9": 2744478720.0, + "10": 2744478720.0, + "11": 2744478720.0, + "12": 2744478720.0, + "13": 2744478720.0, + "14": 2744478720.0, + "15": 2744478720.0, + "16": 2744478720.0, + "17": 2744478720.0, + "18": 2744478720.0, + "19": 2744478720.0, + "20": 2744478720.0, + "21": 2744478720.0, + "22": 2744478720.0, + "23": 2744478720.0, + "24": 2744478720.0, + "25": 2744478720.0, + "26": 2744478720.0, + "27": 2744478720.0, + "28": 2744478720.0, + "29": 2744478720.0, + "30": 2744478720.0, + "31": 2744478720.0, + "32": 2744478720.0, + "33": 2744478720.0, + "34": 2744478720.0, + "35": 2744478720.0, + "36": 2744478720.0, + "37": 2744478720.0, + "38": 2744478720.0, + "39": 2744478720.0, + "40": 2744478720.0, + "41": 2744478720.0, + "42": 2744478720.0, + "43": 2744478720.0, + "44": 2744478720.0, + "45": 2744478720.0, + "46": 2744478720.0, + "47": 2744478720.0, + "48": 2744478720.0, + "49": 2744478720.0, + "50": 2744478720.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.34716, + "2": 0.14227, + "3": 0.12689, + "4": 0.13008, + "5": 0.12281, + "6": 0.12008, + "7": 0.11926, + "8": 0.11756, + "9": 0.11844, + "10": 0.11959, + "11": 0.11763, + "12": 0.11828, + "13": 0.11955, + "14": 0.11929, + "15": 0.11867, + "16": 0.11859, + "17": 0.12095, + "18": 0.11695, + "19": 0.11774, + "20": 0.11863, + "21": 0.11942, + "22": 0.12117, + "23": 0.11884, + "24": 0.12003, + "25": 0.11915, + "26": 0.11977, + "27": 0.11816, + "28": 0.12705, + "29": 0.11815, + "30": 0.12166, + "31": 0.12023, + "32": 0.12154, + "33": 0.12781, + "34": 0.12209, + "35": 0.12372, + "36": 0.12109, + "37": 0.11897, + "38": 0.12385, + "39": 0.11961, + "40": 0.11846, + "41": 0.11902, + "42": 0.11915, + "43": 0.12286, + "44": 0.11759, + "45": 0.11912, + "46": 0.1204, + "47": 0.12027, + "48": 0.12073, + "49": 0.1164, + "50": 0.11734 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..e6214f74d31 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79203, + "2": 10.80273, + "3": 10.80585, + "4": 10.77187, + "5": 10.84593, + "6": 10.86693, + "7": 10.82551, + "8": 10.81215, + "9": 10.83332, + "10": 10.76951, + "11": 10.89062, + "12": 10.84504, + "13": 10.85859, + "14": 10.8801, + "15": 10.78971, + "16": 10.78188, + "17": 10.75787, + "18": 10.79172, + "19": 10.79529, + "20": 10.67886, + "21": 10.65973, + "22": 10.50045, + "23": 10.71219, + "24": 10.55058, + "25": 10.50431, + "26": 10.5802, + "27": 10.58378, + "28": 10.55688, + "29": 10.55907, + "30": 10.33089, + "31": 10.08209, + "32": 10.44504, + "33": 10.44161, + "34": 10.19769, + "35": 10.25278, + "36": 10.19158, + "37": 10.31839, + "38": 10.16293, + "39": 10.37474, + "40": 10.05241, + "41": 10.13501, + "42": 10.18884, + "43": 9.8066, + "44": 9.92658, + "45": 9.80259, + "46": 9.81165, + "47": 10.12682, + "48": 9.8236, + "49": 9.51061, + "50": 9.88804 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1649.0, + "2": 1710.0, + "3": 1754.0, + "4": 1850.0, + "5": 1890.0, + "6": 1767.0, + "7": 1830.0, + "8": 1723.0, + "9": 1758.0, + "10": 1397.0, + "11": 1890.0, + "12": 1657.0, + "13": 1761.0, + "14": 1813.0, + "15": 1928.0, + "16": 1828.0, + "17": 1933.0, + "18": 1633.0, + "19": 1777.0, + "20": 1565.0, + "21": 1807.0, + "22": 1678.0, + "23": 2014.0, + "24": 1766.0, + "25": 1699.0, + "26": 1741.0, + "27": 1800.0, + "28": 1937.0, + "29": 1921.0, + "30": 1943.0, + "31": 1527.0, + "32": 1848.0, + "33": 2144.0, + "34": 1925.0, + "35": 2018.0, + "36": 1937.0, + "37": 2297.0, + "38": 2214.0, + "39": 2374.0, + "40": 2191.0, + "41": 2369.0, + "42": 2299.0, + "43": 1963.0, + "44": 2146.0, + "45": 2207.0, + "46": 2332.0, + "47": 2590.0, + "48": 2428.0, + "49": 2255.0, + "50": 2362.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 782094336.0, + "2": 782094336.0, + "3": 782094336.0, + "4": 782094336.0, + "5": 782094336.0, + "6": 782094336.0, + "7": 782094336.0, + "8": 782094336.0, + "9": 782094336.0, + "10": 782094336.0, + "11": 782094336.0, + "12": 782094336.0, + "13": 782094336.0, + "14": 782094336.0, + "15": 782094336.0, + "16": 782094336.0, + "17": 782094336.0, + "18": 782094336.0, + "19": 782094336.0, + "20": 782094336.0, + "21": 782094336.0, + "22": 782094336.0, + "23": 782094336.0, + "24": 782094336.0, + "25": 782094336.0, + "26": 782094336.0, + "27": 782094336.0, + "28": 782094336.0, + "29": 782094336.0, + "30": 782094336.0, + "31": 782094336.0, + "32": 782094336.0, + "33": 782094336.0, + "34": 782094336.0, + "35": 782094336.0, + "36": 782094336.0, + "37": 782094336.0, + "38": 782094336.0, + "39": 782094336.0, + "40": 782094336.0, + "41": 782094336.0, + "42": 782094336.0, + "43": 782094336.0, + "44": 782094336.0, + "45": 782094336.0, + "46": 782094336.0, + "47": 782094336.0, + "48": 782094336.0, + "49": 782094336.0, + "50": 782094336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2462767104.0, + "2": 2748673024.0, + "3": 2748673024.0, + "4": 2748673024.0, + "5": 2748673024.0, + "6": 2748673024.0, + "7": 2748673024.0, + "8": 2748673024.0, + "9": 2748673024.0, + "10": 2748673024.0, + "11": 2748673024.0, + "12": 2748673024.0, + "13": 2748673024.0, + "14": 2748673024.0, + "15": 2748673024.0, + "16": 2748673024.0, + "17": 2748673024.0, + "18": 2748673024.0, + "19": 2748673024.0, + "20": 2748673024.0, + "21": 2748673024.0, + "22": 2748673024.0, + "23": 2748673024.0, + "24": 2748673024.0, + "25": 2748673024.0, + "26": 2748673024.0, + "27": 2748673024.0, + "28": 2748673024.0, + "29": 2748673024.0, + "30": 2748673024.0, + "31": 2748673024.0, + "32": 2748673024.0, + "33": 2748673024.0, + "34": 2748673024.0, + "35": 2748673024.0, + "36": 2748673024.0, + "37": 2748673024.0, + "38": 2748673024.0, + "39": 2748673024.0, + "40": 2748673024.0, + "41": 2748673024.0, + "42": 2748673024.0, + "43": 2748673024.0, + "44": 2748673024.0, + "45": 2748673024.0, + "46": 2748673024.0, + "47": 2748673024.0, + "48": 2748673024.0, + "49": 2748673024.0, + "50": 2748673024.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.75952, + "2": 0.21448, + "3": 0.18235, + "4": 0.18003, + "5": 0.17893, + "6": 0.17927, + "7": 0.1794, + "8": 0.17993, + "9": 0.17782, + "10": 0.17913, + "11": 0.18107, + "12": 0.18068, + "13": 0.18061, + "14": 0.17963, + "15": 0.17853, + "16": 0.17955, + "17": 0.17969, + "18": 0.17916, + "19": 0.18341, + "20": 0.18099, + "21": 0.18071, + "22": 0.17995, + "23": 0.17926, + "24": 0.17948, + "25": 0.18014, + "26": 0.17924, + "27": 0.1802, + "28": 0.17909, + "29": 0.18091, + "30": 0.18001, + "31": 0.17868, + "32": 0.17758, + "33": 0.1779, + "34": 0.17881, + "35": 0.17826, + "36": 0.1779, + "37": 0.17715, + "38": 0.17751, + "39": 0.17819, + "40": 0.17892, + "41": 0.17948, + "42": 0.45058, + "43": 0.18152, + "44": 0.17768, + "45": 0.17817, + "46": 0.17937, + "47": 0.17662, + "48": 0.17804, + "49": 0.17764, + "50": 0.17626 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..e0e25d127f8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79203, + "2": 10.80273, + "3": 10.80585, + "4": 10.77187, + "5": 10.84593, + "6": 10.86693, + "7": 10.82551, + "8": 10.81215, + "9": 10.83332, + "10": 10.76951, + "11": 10.89062, + "12": 10.84504, + "13": 10.85859, + "14": 10.8801, + "15": 10.78971, + "16": 10.78188, + "17": 10.75787, + "18": 10.79172, + "19": 10.79529, + "20": 10.67886, + "21": 10.65973, + "22": 10.50045, + "23": 10.71219, + "24": 10.55058, + "25": 10.50431, + "26": 10.5802, + "27": 10.58378, + "28": 10.55688, + "29": 10.55907, + "30": 10.33089, + "31": 10.08209, + "32": 10.44504, + "33": 10.44161, + "34": 10.19769, + "35": 10.25278, + "36": 10.19158, + "37": 10.31839, + "38": 10.16293, + "39": 10.37474, + "40": 10.05241, + "41": 10.13501, + "42": 10.18884, + "43": 9.8066, + "44": 9.92658, + "45": 9.80259, + "46": 9.81165, + "47": 10.12682, + "48": 9.8236, + "49": 9.51061, + "50": 9.88804 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1649.0, + "2": 1710.0, + "3": 1754.0, + "4": 1850.0, + "5": 1890.0, + "6": 1767.0, + "7": 1830.0, + "8": 1723.0, + "9": 1758.0, + "10": 1397.0, + "11": 1890.0, + "12": 1657.0, + "13": 1761.0, + "14": 1813.0, + "15": 1928.0, + "16": 1828.0, + "17": 1933.0, + "18": 1633.0, + "19": 1777.0, + "20": 1565.0, + "21": 1807.0, + "22": 1678.0, + "23": 2014.0, + "24": 1766.0, + "25": 1699.0, + "26": 1741.0, + "27": 1800.0, + "28": 1937.0, + "29": 1921.0, + "30": 1943.0, + "31": 1527.0, + "32": 1848.0, + "33": 2144.0, + "34": 1925.0, + "35": 2018.0, + "36": 1937.0, + "37": 2297.0, + "38": 2214.0, + "39": 2374.0, + "40": 2191.0, + "41": 2369.0, + "42": 2299.0, + "43": 1963.0, + "44": 2146.0, + "45": 2207.0, + "46": 2332.0, + "47": 2590.0, + "48": 2428.0, + "49": 2255.0, + "50": 2362.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 782094336.0, + "2": 782094336.0, + "3": 782094336.0, + "4": 782094336.0, + "5": 782094336.0, + "6": 782094336.0, + "7": 782094336.0, + "8": 782094336.0, + "9": 782094336.0, + "10": 782094336.0, + "11": 782094336.0, + "12": 782094336.0, + "13": 782094336.0, + "14": 782094336.0, + "15": 782094336.0, + "16": 782094336.0, + "17": 782094336.0, + "18": 782094336.0, + "19": 782094336.0, + "20": 782094336.0, + "21": 782094336.0, + "22": 782094336.0, + "23": 782094336.0, + "24": 782094336.0, + "25": 782094336.0, + "26": 782094336.0, + "27": 782094336.0, + "28": 782094336.0, + "29": 782094336.0, + "30": 782094336.0, + "31": 782094336.0, + "32": 782094336.0, + "33": 782094336.0, + "34": 782094336.0, + "35": 782094336.0, + "36": 782094336.0, + "37": 782094336.0, + "38": 782094336.0, + "39": 782094336.0, + "40": 782094336.0, + "41": 782094336.0, + "42": 782094336.0, + "43": 782094336.0, + "44": 782094336.0, + "45": 782094336.0, + "46": 782094336.0, + "47": 782094336.0, + "48": 782094336.0, + "49": 782094336.0, + "50": 782094336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2462767104.0, + "2": 2748673024.0, + "3": 2748673024.0, + "4": 2748673024.0, + "5": 2748673024.0, + "6": 2748673024.0, + "7": 2748673024.0, + "8": 2748673024.0, + "9": 2748673024.0, + "10": 2748673024.0, + "11": 2748673024.0, + "12": 2748673024.0, + "13": 2748673024.0, + "14": 2748673024.0, + "15": 2748673024.0, + "16": 2748673024.0, + "17": 2748673024.0, + "18": 2748673024.0, + "19": 2748673024.0, + "20": 2748673024.0, + "21": 2748673024.0, + "22": 2748673024.0, + "23": 2748673024.0, + "24": 2748673024.0, + "25": 2748673024.0, + "26": 2748673024.0, + "27": 2748673024.0, + "28": 2748673024.0, + "29": 2748673024.0, + "30": 2748673024.0, + "31": 2748673024.0, + "32": 2748673024.0, + "33": 2748673024.0, + "34": 2748673024.0, + "35": 2748673024.0, + "36": 2748673024.0, + "37": 2748673024.0, + "38": 2748673024.0, + "39": 2748673024.0, + "40": 2748673024.0, + "41": 2748673024.0, + "42": 2748673024.0, + "43": 2748673024.0, + "44": 2748673024.0, + "45": 2748673024.0, + "46": 2748673024.0, + "47": 2748673024.0, + "48": 2748673024.0, + "49": 2748673024.0, + "50": 2748673024.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.23168, + "2": 0.20941, + "3": 0.18259, + "4": 0.18034, + "5": 0.18066, + "6": 0.17945, + "7": 0.17976, + "8": 0.18065, + "9": 0.18143, + "10": 0.18186, + "11": 0.18118, + "12": 0.17934, + "13": 0.17804, + "14": 0.17863, + "15": 0.17803, + "16": 0.35778, + "17": 0.17914, + "18": 0.17741, + "19": 0.17754, + "20": 0.17681, + "21": 0.17586, + "22": 0.17817, + "23": 0.17672, + "24": 0.17747, + "25": 0.17716, + "26": 0.17607, + "27": 0.17666, + "28": 0.17643, + "29": 0.17611, + "30": 0.17755, + "31": 0.17964, + "32": 0.17651, + "33": 0.18061, + "34": 0.17677, + "35": 0.179, + "36": 0.17888, + "37": 0.17609, + "38": 0.17685, + "39": 0.17655, + "40": 0.37865, + "41": 0.17694, + "42": 0.17631, + "43": 0.17661, + "44": 0.17607, + "45": 0.17551, + "46": 0.1785, + "47": 0.17532, + "48": 0.17603, + "49": 0.17585, + "50": 0.17631 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..987f9cc4371 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79449, + "2": 10.80656, + "3": 10.80727, + "4": 10.77389, + "5": 10.84829, + "6": 10.86736, + "7": 10.82922, + "8": 10.81537, + "9": 10.83956, + "10": 10.77652, + "11": 10.90107, + "12": 10.85927, + "13": 10.87396, + "14": 10.89723, + "15": 10.83961, + "16": 10.83508, + "17": 10.82101, + "18": 10.86029, + "19": 10.86558, + "20": 10.82896, + "21": 10.83275, + "22": 10.75286, + "23": 10.88062, + "24": 10.78219, + "25": 10.76607, + "26": 10.79522, + "27": 10.79866, + "28": 10.81697, + "29": 10.82169, + "30": 10.69891, + "31": 10.55698, + "32": 10.75759, + "33": 10.74362, + "34": 10.59976, + "35": 10.61772, + "36": 10.56389, + "37": 10.63614, + "38": 10.53029, + "39": 10.65358, + "40": 10.44072, + "41": 10.49636, + "42": 10.50954, + "43": 10.22362, + "44": 10.30902, + "45": 10.21065, + "46": 10.19943, + "47": 10.41641, + "48": 10.18128, + "49": 9.94311, + "50": 10.21224, + "51": 10.16759, + "52": 10.06895, + "53": 10.30707, + "54": 10.20911, + "55": 10.15688, + "56": 9.91474, + "57": 9.77696, + "58": 10.07417, + "59": 9.86333, + "60": 9.77328, + "61": 9.9292, + "62": 10.17156, + "63": 9.62041, + "64": 9.97113, + "65": 9.21979, + "66": 9.88693, + "67": 9.58363, + "68": 9.94922, + "69": 9.95271, + "70": 9.89312, + "71": 9.77658, + "72": 9.75435, + "73": 9.6497, + "74": 9.1439, + "75": 9.56121, + "76": 9.25111, + "77": 10.17063, + "78": 9.85402, + "79": 9.49965, + "80": 9.53086, + "81": 9.60555, + "82": 9.80179, + "83": 9.43744, + "84": 9.51987, + "85": 9.7196, + "86": 9.18595, + "87": 9.68687, + "88": 9.8443, + "89": 9.70586, + "90": 9.89977, + "91": 9.45029, + "92": 9.45356, + "93": 9.18554, + "94": 8.92968, + "95": 9.59767, + "96": 9.61491, + "97": 9.39084, + "98": 9.75667, + "99": 8.97921, + "100": 9.49001 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 570640384.0, + "2": 570640384.0, + "3": 570640384.0, + "4": 570640384.0, + "5": 570640384.0, + "6": 570640384.0, + "7": 570640384.0, + "8": 570640384.0, + "9": 570640384.0, + "10": 570640384.0, + "11": 570640384.0, + "12": 570640384.0, + "13": 570640384.0, + "14": 570640384.0, + "15": 570640384.0, + "16": 570640384.0, + "17": 852351488.0, + "18": 852351488.0, + "19": 852351488.0, + "20": 852351488.0, + "21": 852351488.0, + "22": 852351488.0, + "23": 852351488.0, + "24": 852351488.0, + "25": 852351488.0, + "26": 852351488.0, + "27": 852351488.0, + "28": 852351488.0, + "29": 852351488.0, + "30": 852351488.0, + "31": 852351488.0, + "32": 852351488.0, + "33": 852351488.0, + "34": 852351488.0, + "35": 852351488.0, + "36": 852351488.0, + "37": 852351488.0, + "38": 852351488.0, + "39": 852351488.0, + "40": 852351488.0, + "41": 852351488.0, + "42": 852351488.0, + "43": 852351488.0, + "44": 852351488.0, + "45": 852351488.0, + "46": 852351488.0, + "47": 852351488.0, + "48": 852351488.0, + "49": 852351488.0, + "50": 852351488.0, + "51": 852351488.0, + "52": 852351488.0, + "53": 852351488.0, + "54": 852351488.0, + "55": 852351488.0, + "56": 852351488.0, + "57": 852351488.0, + "58": 852351488.0, + "59": 852351488.0, + "60": 852351488.0, + "61": 852351488.0, + "62": 852351488.0, + "63": 852351488.0, + "64": 852351488.0, + "65": 852351488.0, + "66": 852351488.0, + "67": 852351488.0, + "68": 852351488.0, + "69": 852351488.0, + "70": 852351488.0, + "71": 852351488.0, + "72": 852351488.0, + "73": 852351488.0, + "74": 852351488.0, + "75": 852351488.0, + "76": 852351488.0, + "77": 852351488.0, + "78": 852351488.0, + "79": 852351488.0, + "80": 852351488.0, + "81": 852351488.0, + "82": 852351488.0, + "83": 852351488.0, + "84": 852351488.0, + "85": 852351488.0, + "86": 852351488.0, + "87": 852351488.0, + "88": 852351488.0, + "89": 852351488.0, + "90": 852351488.0, + "91": 852351488.0, + "92": 852351488.0, + "93": 852351488.0, + "94": 852351488.0, + "95": 852351488.0, + "96": 852351488.0, + "97": 852351488.0, + "98": 852351488.0, + "99": 852351488.0, + "100": 852351488.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2393217536.0, + "2": 2393218048.0, + "3": 2393218048.0, + "4": 2393218048.0, + "5": 2393218048.0, + "6": 2393218048.0, + "7": 2393218048.0, + "8": 2393218048.0, + "9": 2393218048.0, + "10": 2393218048.0, + "11": 2393218048.0, + "12": 2393218048.0, + "13": 2393218048.0, + "14": 2393218048.0, + "15": 2393218048.0, + "16": 2393218048.0, + "17": 2393218048.0, + "18": 2675191296.0, + "19": 2675191296.0, + "20": 2675191296.0, + "21": 2675191296.0, + "22": 2675191296.0, + "23": 2675191296.0, + "24": 2675191296.0, + "25": 2675191296.0, + "26": 2675191296.0, + "27": 2675191296.0, + "28": 2675191296.0, + "29": 2675191296.0, + "30": 2675191296.0, + "31": 2675191296.0, + "32": 2675191296.0, + "33": 2675191296.0, + "34": 2675191296.0, + "35": 2675191296.0, + "36": 2675191296.0, + "37": 2675191296.0, + "38": 2675191296.0, + "39": 2675191296.0, + "40": 2675191296.0, + "41": 2675191296.0, + "42": 2675191296.0, + "43": 2675191296.0, + "44": 2675191296.0, + "45": 2675191296.0, + "46": 2675191296.0, + "47": 2675191296.0, + "48": 2675191296.0, + "49": 2675191296.0, + "50": 2675191296.0, + "51": 2675191296.0, + "52": 2675191296.0, + "53": 2675191296.0, + "54": 2675191296.0, + "55": 2675191296.0, + "56": 2675191296.0, + "57": 2675191296.0, + "58": 2675191296.0, + "59": 2675191296.0, + "60": 2675191296.0, + "61": 2675191296.0, + "62": 2675191296.0, + "63": 2675191296.0, + "64": 2675191296.0, + "65": 2675191296.0, + "66": 2675191296.0, + "67": 2675191296.0, + "68": 2675191296.0, + "69": 2675191296.0, + "70": 2675191296.0, + "71": 2675191296.0, + "72": 2675191296.0, + "73": 2675191296.0, + "74": 2675191296.0, + "75": 2675191296.0, + "76": 2675191296.0, + "77": 2675191296.0, + "78": 2675191296.0, + "79": 2675191296.0, + "80": 2675191296.0, + "81": 2675191296.0, + "82": 2675191296.0, + "83": 2675191296.0, + "84": 2675191296.0, + "85": 2675191296.0, + "86": 2675191296.0, + "87": 2675191296.0, + "88": 2675191296.0, + "89": 2675191296.0, + "90": 2675191296.0, + "91": 2675191296.0, + "92": 2675191296.0, + "93": 2675191296.0, + "94": 2675191296.0, + "95": 2675191296.0, + "96": 2675191296.0, + "97": 2675191296.0, + "98": 2675191296.0, + "99": 2675191296.0, + "100": 2675191296.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.25711, + "2": 0.20442, + "3": 0.31053, + "4": 0.17506, + "5": 0.17361, + "6": 0.16764, + "7": 0.16815, + "8": 0.16765, + "9": 0.16758, + "10": 0.17113, + "11": 0.16809, + "12": 0.17003, + "13": 0.16677, + "14": 0.16938, + "15": 0.16824, + "16": 0.16835, + "17": 0.24523, + "18": 0.17988, + "19": 0.17563, + "20": 0.17432, + "21": 0.17506, + "22": 0.17636, + "23": 0.17595, + "24": 0.17331, + "25": 0.17442, + "26": 0.17591, + "27": 0.17526, + "28": 0.17471, + "29": 0.17521, + "30": 0.17559, + "31": 0.17578, + "32": 0.17405, + "33": 0.17441, + "34": 0.17455, + "35": 0.17668, + "36": 0.17388, + "37": 0.17292, + "38": 0.17248, + "39": 0.17218, + "40": 0.17206, + "41": 0.17379, + "42": 0.17175, + "43": 0.17411, + "44": 0.17163, + "45": 0.17284, + "46": 0.17334, + "47": 0.17308, + "48": 0.17237, + "49": 0.17279, + "50": 0.17287, + "51": 0.18182, + "52": 0.17476, + "53": 0.17364, + "54": 0.17347, + "55": 0.1738, + "56": 0.17294, + "57": 0.17424, + "58": 0.17414, + "59": 0.17308, + "60": 0.17396, + "61": 0.17298, + "62": 0.17287, + "63": 0.17296, + "64": 0.17278, + "65": 0.17319, + "66": 0.17283, + "67": 0.17327, + "68": 0.17328, + "69": 0.17196, + "70": 0.17288, + "71": 0.1729, + "72": 0.1733, + "73": 0.17323, + "74": 0.17351, + "75": 0.17316, + "76": 0.17296, + "77": 0.17287, + "78": 0.17254, + "79": 0.17342, + "80": 0.17324, + "81": 0.17326, + "82": 0.17333, + "83": 0.17397, + "84": 0.17448, + "85": 0.17529, + "86": 0.17422, + "87": 0.17326, + "88": 0.17393, + "89": 0.17292, + "90": 0.17379, + "91": 0.17366, + "92": 0.17324, + "93": 0.17397, + "94": 0.17409, + "95": 0.17371, + "96": 0.17366, + "97": 0.17346, + "98": 0.17343, + "99": 0.17375, + "100": 0.17351 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2437.0, + "18": 2405.0, + "19": 2950.0, + "20": 1827.0, + "21": 2154.0, + "22": 2731.0, + "23": 2609.0, + "24": 2290.0, + "25": 2325.0, + "26": 2079.0, + "27": 2138.0, + "28": 2702.0, + "29": 2576.0, + "30": 2528.0, + "31": 1895.0, + "32": 2628.0, + "33": 2325.0, + "34": 1928.0, + "35": 2061.0, + "36": 2153.0, + "37": 2600.0, + "38": 2350.0, + "39": 2997.0, + "40": 2053.0, + "41": 3352.0, + "42": 2497.0, + "43": 2867.0, + "44": 2109.0, + "45": 2490.0, + "46": 2279.0, + "47": 3051.0, + "48": 2527.0, + "49": 1973.0, + "50": 2887.0, + "51": 2310.0, + "52": 2526.0, + "53": 3705.0, + "54": 2888.0, + "55": 2440.0, + "56": 2496.0, + "57": 2338.0, + "58": 3283.0, + "59": 2849.0, + "60": 2893.0, + "61": 2956.0, + "62": 3134.0, + "63": 3275.0, + "64": 3176.0, + "65": 2318.0, + "66": 3857.0, + "67": 2606.0, + "68": 3313.0, + "69": 2826.0, + "70": 3665.0, + "71": 3011.0, + "72": 2693.0, + "73": 3357.0, + "74": 2271.0, + "75": 2955.0, + "76": 3617.0, + "77": 3936.0, + "78": 3951.0, + "79": 4065.0, + "80": 3665.0, + "81": 5191.0, + "82": 3511.0, + "83": 3263.0, + "84": 3876.0, + "85": 4048.0, + "86": 3414.0, + "87": 3980.0, + "88": 3617.0, + "89": 4400.0, + "90": 3695.0, + "91": 2857.0, + "92": 4432.0, + "93": 3494.0, + "94": 4438.0, + "95": 4076.0, + "96": 3948.0, + "97": 4242.0, + "98": 4943.0, + "99": 3861.0, + "100": 3631.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..2bcd6d2eaf1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79449, + "2": 10.80656, + "3": 10.80727, + "4": 10.77389, + "5": 10.84829, + "6": 10.86736, + "7": 10.82922, + "8": 10.81537, + "9": 10.83956, + "10": 10.77652, + "11": 10.90107, + "12": 10.85927, + "13": 10.87396, + "14": 10.89723, + "15": 10.83961, + "16": 10.83508, + "17": 10.82101, + "18": 10.86029, + "19": 10.86558, + "20": 10.82896, + "21": 10.83275, + "22": 10.75286, + "23": 10.88062, + "24": 10.78219, + "25": 10.76607, + "26": 10.79522, + "27": 10.79866, + "28": 10.81697, + "29": 10.82169, + "30": 10.69891, + "31": 10.55698, + "32": 10.75759, + "33": 10.74362, + "34": 10.59976, + "35": 10.61772, + "36": 10.56389, + "37": 10.63614, + "38": 10.53029, + "39": 10.65358, + "40": 10.44072, + "41": 10.49636, + "42": 10.50954, + "43": 10.22362, + "44": 10.30902, + "45": 10.21065, + "46": 10.19943, + "47": 10.41641, + "48": 10.18128, + "49": 9.94311, + "50": 10.21224, + "51": 10.16759, + "52": 10.06895, + "53": 10.30707, + "54": 10.20911, + "55": 10.15688, + "56": 9.91474, + "57": 9.77696, + "58": 10.07417, + "59": 9.86333, + "60": 9.77328, + "61": 9.9292, + "62": 10.17156, + "63": 9.62041, + "64": 9.97113, + "65": 9.21979, + "66": 9.88693, + "67": 9.58363, + "68": 9.94922, + "69": 9.95271, + "70": 9.89312, + "71": 9.77658, + "72": 9.75435, + "73": 9.6497, + "74": 9.1439, + "75": 9.56121, + "76": 9.25111, + "77": 10.17063, + "78": 9.85402, + "79": 9.49965, + "80": 9.53086, + "81": 9.60555, + "82": 9.80179, + "83": 9.43744, + "84": 9.51987, + "85": 9.7196, + "86": 9.18595, + "87": 9.68687, + "88": 9.8443, + "89": 9.70586, + "90": 9.89977, + "91": 9.45029, + "92": 9.45356, + "93": 9.18554, + "94": 8.92968, + "95": 9.59767, + "96": 9.61491, + "97": 9.39084, + "98": 9.75667, + "99": 8.97921, + "100": 9.49001 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 570640384.0, + "2": 570640384.0, + "3": 570640384.0, + "4": 570640384.0, + "5": 570640384.0, + "6": 570640384.0, + "7": 570640384.0, + "8": 570640384.0, + "9": 570640384.0, + "10": 570640384.0, + "11": 570640384.0, + "12": 570640384.0, + "13": 570640384.0, + "14": 570640384.0, + "15": 570640384.0, + "16": 570640384.0, + "17": 852351488.0, + "18": 852351488.0, + "19": 852351488.0, + "20": 852351488.0, + "21": 852351488.0, + "22": 852351488.0, + "23": 852351488.0, + "24": 852351488.0, + "25": 852351488.0, + "26": 852351488.0, + "27": 852351488.0, + "28": 852351488.0, + "29": 852351488.0, + "30": 852351488.0, + "31": 852351488.0, + "32": 852351488.0, + "33": 852351488.0, + "34": 852351488.0, + "35": 852351488.0, + "36": 852351488.0, + "37": 852351488.0, + "38": 852351488.0, + "39": 852351488.0, + "40": 852351488.0, + "41": 852351488.0, + "42": 852351488.0, + "43": 852351488.0, + "44": 852351488.0, + "45": 852351488.0, + "46": 852351488.0, + "47": 852351488.0, + "48": 852351488.0, + "49": 852351488.0, + "50": 852351488.0, + "51": 852351488.0, + "52": 852351488.0, + "53": 852351488.0, + "54": 852351488.0, + "55": 852351488.0, + "56": 852351488.0, + "57": 852351488.0, + "58": 852351488.0, + "59": 852351488.0, + "60": 852351488.0, + "61": 852351488.0, + "62": 852351488.0, + "63": 852351488.0, + "64": 852351488.0, + "65": 852351488.0, + "66": 852351488.0, + "67": 852351488.0, + "68": 852351488.0, + "69": 852351488.0, + "70": 852351488.0, + "71": 852351488.0, + "72": 852351488.0, + "73": 852351488.0, + "74": 852351488.0, + "75": 852351488.0, + "76": 852351488.0, + "77": 852351488.0, + "78": 852351488.0, + "79": 852351488.0, + "80": 852351488.0, + "81": 852351488.0, + "82": 852351488.0, + "83": 852351488.0, + "84": 852351488.0, + "85": 852351488.0, + "86": 852351488.0, + "87": 852351488.0, + "88": 852351488.0, + "89": 852351488.0, + "90": 852351488.0, + "91": 852351488.0, + "92": 852351488.0, + "93": 852351488.0, + "94": 852351488.0, + "95": 852351488.0, + "96": 852351488.0, + "97": 852351488.0, + "98": 852351488.0, + "99": 852351488.0, + "100": 852351488.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2393217536.0, + "2": 2393218048.0, + "3": 2393218048.0, + "4": 2393218048.0, + "5": 2393218048.0, + "6": 2393218048.0, + "7": 2393218048.0, + "8": 2393218048.0, + "9": 2393218048.0, + "10": 2393218048.0, + "11": 2393218048.0, + "12": 2393218048.0, + "13": 2393218048.0, + "14": 2393218048.0, + "15": 2393218048.0, + "16": 2393218048.0, + "17": 2393218048.0, + "18": 2675191296.0, + "19": 2675191296.0, + "20": 2675191296.0, + "21": 2675191296.0, + "22": 2675191296.0, + "23": 2675191296.0, + "24": 2675191296.0, + "25": 2675191296.0, + "26": 2675191296.0, + "27": 2675191296.0, + "28": 2675191296.0, + "29": 2675191296.0, + "30": 2675191296.0, + "31": 2675191296.0, + "32": 2675191296.0, + "33": 2675191296.0, + "34": 2675191296.0, + "35": 2675191296.0, + "36": 2675191296.0, + "37": 2675191296.0, + "38": 2675191296.0, + "39": 2675191296.0, + "40": 2675191296.0, + "41": 2675191296.0, + "42": 2675191296.0, + "43": 2675191296.0, + "44": 2675191296.0, + "45": 2675191296.0, + "46": 2675191296.0, + "47": 2675191296.0, + "48": 2675191296.0, + "49": 2675191296.0, + "50": 2675191296.0, + "51": 2675191296.0, + "52": 2675191296.0, + "53": 2675191296.0, + "54": 2675191296.0, + "55": 2675191296.0, + "56": 2675191296.0, + "57": 2675191296.0, + "58": 2675191296.0, + "59": 2675191296.0, + "60": 2675191296.0, + "61": 2675191296.0, + "62": 2675191296.0, + "63": 2675191296.0, + "64": 2675191296.0, + "65": 2675191296.0, + "66": 2675191296.0, + "67": 2675191296.0, + "68": 2675191296.0, + "69": 2675191296.0, + "70": 2675191296.0, + "71": 2675191296.0, + "72": 2675191296.0, + "73": 2675191296.0, + "74": 2675191296.0, + "75": 2675191296.0, + "76": 2675191296.0, + "77": 2675191296.0, + "78": 2675191296.0, + "79": 2675191296.0, + "80": 2675191296.0, + "81": 2675191296.0, + "82": 2675191296.0, + "83": 2675191296.0, + "84": 2675191296.0, + "85": 2675191296.0, + "86": 2675191296.0, + "87": 2675191296.0, + "88": 2675191296.0, + "89": 2675191296.0, + "90": 2675191296.0, + "91": 2675191296.0, + "92": 2675191296.0, + "93": 2675191296.0, + "94": 2675191296.0, + "95": 2675191296.0, + "96": 2675191296.0, + "97": 2675191296.0, + "98": 2675191296.0, + "99": 2675191296.0, + "100": 2675191296.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.71736, + "2": 0.20733, + "3": 0.16848, + "4": 0.16524, + "5": 0.16238, + "6": 0.16187, + "7": 0.16222, + "8": 0.16966, + "9": 0.16728, + "10": 0.16645, + "11": 0.16656, + "12": 0.16608, + "13": 0.16573, + "14": 0.16701, + "15": 0.16496, + "16": 0.16669, + "17": 0.23079, + "18": 0.1849, + "19": 0.17171, + "20": 0.17096, + "21": 0.17174, + "22": 0.17119, + "23": 0.17277, + "24": 0.17201, + "25": 0.17439, + "26": 0.17169, + "27": 0.17161, + "28": 0.17192, + "29": 0.17194, + "30": 0.17228, + "31": 0.17292, + "32": 0.17122, + "33": 0.17157, + "34": 0.1724, + "35": 0.17452, + "36": 0.17212, + "37": 0.17181, + "38": 0.17195, + "39": 0.17197, + "40": 0.17277, + "41": 0.17339, + "42": 0.17111, + "43": 0.17212, + "44": 0.17128, + "45": 0.17186, + "46": 0.17214, + "47": 0.17062, + "48": 0.17161, + "49": 0.17218, + "50": 0.17161, + "51": 0.17752, + "52": 0.17189, + "53": 0.17103, + "54": 0.17149, + "55": 0.1719, + "56": 0.17107, + "57": 0.17148, + "58": 0.17125, + "59": 0.17359, + "60": 0.172, + "61": 0.17008, + "62": 0.17062, + "63": 0.17153, + "64": 0.17237, + "65": 0.1724, + "66": 0.17702, + "67": 0.17451, + "68": 0.17335, + "69": 0.17257, + "70": 0.17296, + "71": 0.17324, + "72": 0.17308, + "73": 0.1733, + "74": 0.17393, + "75": 0.17307, + "76": 0.17314, + "77": 0.17235, + "78": 0.17169, + "79": 0.17051, + "80": 0.17076, + "81": 0.17091, + "82": 0.1698, + "83": 0.16956, + "84": 0.16892, + "85": 0.17014, + "86": 0.16969, + "87": 0.16994, + "88": 0.17052, + "89": 0.1722, + "90": 0.16945, + "91": 0.17051, + "92": 0.16932, + "93": 0.17024, + "94": 0.1701, + "95": 0.16924, + "96": 0.16933, + "97": 0.17042, + "98": 0.16973, + "99": 0.17021, + "100": 0.17096 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2437.0, + "18": 2405.0, + "19": 2950.0, + "20": 1827.0, + "21": 2154.0, + "22": 2731.0, + "23": 2609.0, + "24": 2290.0, + "25": 2325.0, + "26": 2079.0, + "27": 2138.0, + "28": 2702.0, + "29": 2576.0, + "30": 2528.0, + "31": 1895.0, + "32": 2628.0, + "33": 2325.0, + "34": 1928.0, + "35": 2061.0, + "36": 2153.0, + "37": 2600.0, + "38": 2350.0, + "39": 2997.0, + "40": 2053.0, + "41": 3352.0, + "42": 2497.0, + "43": 2867.0, + "44": 2109.0, + "45": 2490.0, + "46": 2279.0, + "47": 3051.0, + "48": 2527.0, + "49": 1973.0, + "50": 2887.0, + "51": 2310.0, + "52": 2526.0, + "53": 3705.0, + "54": 2888.0, + "55": 2440.0, + "56": 2496.0, + "57": 2338.0, + "58": 3283.0, + "59": 2849.0, + "60": 2893.0, + "61": 2956.0, + "62": 3134.0, + "63": 3275.0, + "64": 3176.0, + "65": 2318.0, + "66": 3857.0, + "67": 2606.0, + "68": 3313.0, + "69": 2826.0, + "70": 3665.0, + "71": 3011.0, + "72": 2693.0, + "73": 3357.0, + "74": 2271.0, + "75": 2955.0, + "76": 3617.0, + "77": 3936.0, + "78": 3951.0, + "79": 4065.0, + "80": 3665.0, + "81": 5191.0, + "82": 3511.0, + "83": 3263.0, + "84": 3876.0, + "85": 4048.0, + "86": 3414.0, + "87": 3980.0, + "88": 3617.0, + "89": 4400.0, + "90": 3695.0, + "91": 2857.0, + "92": 4432.0, + "93": 3494.0, + "94": 4438.0, + "95": 4076.0, + "96": 3948.0, + "97": 4242.0, + "98": 4943.0, + "99": 3861.0, + "100": 3631.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..66f5a69ba1b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79219, + "2": 10.80294, + "3": 10.80725, + "4": 10.77342, + "5": 10.84727, + "6": 10.8682, + "7": 10.8278, + "8": 10.81626, + "9": 10.83861, + "10": 10.77729, + "11": 10.90005, + "12": 10.85954, + "13": 10.87494, + "14": 10.8953, + "15": 10.84106, + "16": 10.83779, + "17": 10.82436, + "18": 10.85906, + "19": 10.86597, + "20": 10.82889, + "21": 10.83382, + "22": 10.75171, + "23": 10.8822, + "24": 10.78198, + "25": 10.7666, + "26": 10.79421, + "27": 10.79973, + "28": 10.81809, + "29": 10.81973, + "30": 10.69961, + "31": 10.55541, + "32": 10.75748, + "33": 10.7417, + "34": 10.59849, + "35": 10.61845, + "36": 10.56439, + "37": 10.63758, + "38": 10.53033, + "39": 10.65378, + "40": 10.44051, + "41": 10.49785, + "42": 10.50842, + "43": 10.22237, + "44": 10.30681, + "45": 10.20859, + "46": 10.20077, + "47": 10.41716, + "48": 10.18042, + "49": 9.94398, + "50": 10.21168, + "51": 10.16603, + "52": 10.06842, + "53": 10.30736, + "54": 10.20998, + "55": 10.15675, + "56": 9.91528, + "57": 9.77636, + "58": 10.07274, + "59": 9.86327, + "60": 9.77265, + "61": 9.92815, + "62": 10.17249, + "63": 9.62223, + "64": 9.97162, + "65": 9.22128, + "66": 9.88606, + "67": 9.5836, + "68": 9.95061, + "69": 9.95306, + "70": 9.89371, + "71": 9.77681, + "72": 9.75545, + "73": 9.64983, + "74": 9.14359, + "75": 9.56098, + "76": 9.25119, + "77": 10.16981, + "78": 9.854, + "79": 9.49956, + "80": 9.5311, + "81": 9.60482, + "82": 9.80129, + "83": 9.43763, + "84": 9.51982, + "85": 9.71911, + "86": 9.18564, + "87": 9.68731, + "88": 9.84403, + "89": 9.7063, + "90": 9.89983, + "91": 9.45059, + "92": 9.45364, + "93": 9.18519, + "94": 8.92953, + "95": 9.59785, + "96": 9.61472, + "97": 9.39069, + "98": 9.75698, + "99": 8.9803, + "100": 9.49009 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 570640384.0, + "2": 570640384.0, + "3": 570640384.0, + "4": 570640384.0, + "5": 570640384.0, + "6": 570640384.0, + "7": 570640384.0, + "8": 570640384.0, + "9": 570640384.0, + "10": 570640384.0, + "11": 570640384.0, + "12": 570640384.0, + "13": 570640384.0, + "14": 570640384.0, + "15": 570640384.0, + "16": 570640384.0, + "17": 852351488.0, + "18": 852351488.0, + "19": 852351488.0, + "20": 852351488.0, + "21": 852351488.0, + "22": 852351488.0, + "23": 852351488.0, + "24": 852351488.0, + "25": 852351488.0, + "26": 852351488.0, + "27": 852351488.0, + "28": 852351488.0, + "29": 852351488.0, + "30": 852351488.0, + "31": 852351488.0, + "32": 852351488.0, + "33": 852351488.0, + "34": 852351488.0, + "35": 852351488.0, + "36": 852351488.0, + "37": 852351488.0, + "38": 852351488.0, + "39": 852351488.0, + "40": 852351488.0, + "41": 852351488.0, + "42": 852351488.0, + "43": 852351488.0, + "44": 852351488.0, + "45": 852351488.0, + "46": 852351488.0, + "47": 852351488.0, + "48": 852351488.0, + "49": 852351488.0, + "50": 852351488.0, + "51": 852351488.0, + "52": 852351488.0, + "53": 852351488.0, + "54": 852351488.0, + "55": 852351488.0, + "56": 852351488.0, + "57": 852351488.0, + "58": 852351488.0, + "59": 852351488.0, + "60": 852351488.0, + "61": 852351488.0, + "62": 852351488.0, + "63": 852351488.0, + "64": 852351488.0, + "65": 852351488.0, + "66": 852351488.0, + "67": 852351488.0, + "68": 852351488.0, + "69": 852351488.0, + "70": 852351488.0, + "71": 852351488.0, + "72": 852351488.0, + "73": 852351488.0, + "74": 852351488.0, + "75": 852351488.0, + "76": 852351488.0, + "77": 852351488.0, + "78": 852351488.0, + "79": 852351488.0, + "80": 852351488.0, + "81": 852351488.0, + "82": 852351488.0, + "83": 852351488.0, + "84": 852351488.0, + "85": 852351488.0, + "86": 852351488.0, + "87": 852351488.0, + "88": 852351488.0, + "89": 852351488.0, + "90": 852351488.0, + "91": 852351488.0, + "92": 852351488.0, + "93": 852351488.0, + "94": 852351488.0, + "95": 852351488.0, + "96": 852351488.0, + "97": 852351488.0, + "98": 852351488.0, + "99": 852351488.0, + "100": 852351488.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2393217536.0, + "2": 2393218048.0, + "3": 2393218048.0, + "4": 2393218048.0, + "5": 2393218048.0, + "6": 2393218048.0, + "7": 2393218048.0, + "8": 2393218048.0, + "9": 2393218048.0, + "10": 2393218048.0, + "11": 2393218048.0, + "12": 2393218048.0, + "13": 2393218048.0, + "14": 2393218048.0, + "15": 2393218048.0, + "16": 2393218048.0, + "17": 2393218048.0, + "18": 2675191296.0, + "19": 2675191296.0, + "20": 2675191296.0, + "21": 2675191296.0, + "22": 2675191296.0, + "23": 2675191296.0, + "24": 2675191296.0, + "25": 2675191296.0, + "26": 2675191296.0, + "27": 2675191296.0, + "28": 2675191296.0, + "29": 2675191296.0, + "30": 2675191296.0, + "31": 2675191296.0, + "32": 2675191296.0, + "33": 2675191296.0, + "34": 2675191296.0, + "35": 2675191296.0, + "36": 2675191296.0, + "37": 2675191296.0, + "38": 2675191296.0, + "39": 2675191296.0, + "40": 2675191296.0, + "41": 2675191296.0, + "42": 2675191296.0, + "43": 2675191296.0, + "44": 2675191296.0, + "45": 2675191296.0, + "46": 2675191296.0, + "47": 2675191296.0, + "48": 2675191296.0, + "49": 2675191296.0, + "50": 2675191296.0, + "51": 2675191296.0, + "52": 2675191296.0, + "53": 2675191296.0, + "54": 2675191296.0, + "55": 2675191296.0, + "56": 2675191296.0, + "57": 2675191296.0, + "58": 2675191296.0, + "59": 2675191296.0, + "60": 2675191296.0, + "61": 2675191296.0, + "62": 2675191296.0, + "63": 2675191296.0, + "64": 2675191296.0, + "65": 2675191296.0, + "66": 2675191296.0, + "67": 2675191296.0, + "68": 2675191296.0, + "69": 2675191296.0, + "70": 2675191296.0, + "71": 2675191296.0, + "72": 2675191296.0, + "73": 2675191296.0, + "74": 2675191296.0, + "75": 2675191296.0, + "76": 2675191296.0, + "77": 2675191296.0, + "78": 2675191296.0, + "79": 2675191296.0, + "80": 2675191296.0, + "81": 2675191296.0, + "82": 2675191296.0, + "83": 2675191296.0, + "84": 2675191296.0, + "85": 2675191296.0, + "86": 2675191296.0, + "87": 2675191296.0, + "88": 2675191296.0, + "89": 2675191296.0, + "90": 2675191296.0, + "91": 2675191296.0, + "92": 2675191296.0, + "93": 2675191296.0, + "94": 2675191296.0, + "95": 2675191296.0, + "96": 2675191296.0, + "97": 2675191296.0, + "98": 2675191296.0, + "99": 2675191296.0, + "100": 2675191296.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 19.89272, + "2": 0.22107, + "3": 0.18275, + "4": 0.18107, + "5": 0.17886, + "6": 0.18018, + "7": 0.17948, + "8": 0.18069, + "9": 0.17962, + "10": 0.17963, + "11": 0.17947, + "12": 0.17823, + "13": 0.17865, + "14": 0.17837, + "15": 0.17763, + "16": 0.1799, + "17": 0.22816, + "18": 0.19169, + "19": 0.18609, + "20": 0.18543, + "21": 0.18512, + "22": 0.1854, + "23": 0.18528, + "24": 0.18513, + "25": 0.18379, + "26": 0.18616, + "27": 0.18415, + "28": 0.18391, + "29": 0.18338, + "30": 0.18284, + "31": 0.18419, + "32": 0.18271, + "33": 0.18342, + "34": 0.18309, + "35": 0.18499, + "36": 0.18314, + "37": 0.18313, + "38": 0.18318, + "39": 0.18257, + "40": 0.18362, + "41": 0.18408, + "42": 0.18593, + "43": 0.18429, + "44": 0.18306, + "45": 0.18258, + "46": 0.18357, + "47": 0.18345, + "48": 0.18361, + "49": 0.18333, + "50": 0.18415, + "51": 0.19311, + "52": 0.18608, + "53": 0.18549, + "54": 0.18334, + "55": 0.38073, + "56": 0.18342, + "57": 0.18432, + "58": 0.18626, + "59": 0.18513, + "60": 0.18344, + "61": 0.18248, + "62": 0.18332, + "63": 0.18441, + "64": 0.18566, + "65": 0.18351, + "66": 0.1834, + "67": 0.18454, + "68": 0.18312, + "69": 0.18334, + "70": 0.18273, + "71": 0.18529, + "72": 0.18793, + "73": 0.18357, + "74": 0.18295, + "75": 0.18311, + "76": 0.18315, + "77": 0.18309, + "78": 0.1831, + "79": 0.18331, + "80": 0.18243, + "81": 0.1841, + "82": 0.18426, + "83": 0.18296, + "84": 0.18393, + "85": 0.18305, + "86": 0.18319, + "87": 0.18267, + "88": 0.18256, + "89": 0.18287, + "90": 0.18205, + "91": 0.18594, + "92": 0.18287, + "93": 0.18383, + "94": 0.18383, + "95": 0.183, + "96": 0.18259, + "97": 0.18302, + "98": 0.18382, + "99": 0.18264, + "100": 0.18713 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2468.0, + "18": 2395.0, + "19": 3001.0, + "20": 1788.0, + "21": 2162.0, + "22": 2852.0, + "23": 2510.0, + "24": 2287.0, + "25": 2300.0, + "26": 2108.0, + "27": 2155.0, + "28": 2751.0, + "29": 2604.0, + "30": 2419.0, + "31": 1842.0, + "32": 2598.0, + "33": 2277.0, + "34": 1897.0, + "35": 2097.0, + "36": 2176.0, + "37": 2715.0, + "38": 2423.0, + "39": 3095.0, + "40": 2126.0, + "41": 3441.0, + "42": 2505.0, + "43": 2679.0, + "44": 2086.0, + "45": 2520.0, + "46": 2259.0, + "47": 3003.0, + "48": 2604.0, + "49": 1956.0, + "50": 2929.0, + "51": 2283.0, + "52": 2458.0, + "53": 3770.0, + "54": 2965.0, + "55": 2457.0, + "56": 2411.0, + "57": 2342.0, + "58": 3450.0, + "59": 2845.0, + "60": 2961.0, + "61": 2897.0, + "62": 3092.0, + "63": 3200.0, + "64": 3129.0, + "65": 2359.0, + "66": 3857.0, + "67": 2591.0, + "68": 3272.0, + "69": 2823.0, + "70": 3633.0, + "71": 3058.0, + "72": 2755.0, + "73": 3353.0, + "74": 2201.0, + "75": 2932.0, + "76": 3649.0, + "77": 4022.0, + "78": 3953.0, + "79": 4091.0, + "80": 3595.0, + "81": 5179.0, + "82": 3499.0, + "83": 3262.0, + "84": 3902.0, + "85": 3959.0, + "86": 3288.0, + "87": 4032.0, + "88": 3628.0, + "89": 4405.0, + "90": 3785.0, + "91": 2856.0, + "92": 4187.0, + "93": 3564.0, + "94": 4347.0, + "95": 4072.0, + "96": 3833.0, + "97": 4121.0, + "98": 4897.0, + "99": 4120.0, + "100": 3581.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..77c8aa6317e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79219, + "2": 10.80294, + "3": 10.80725, + "4": 10.77342, + "5": 10.84727, + "6": 10.8682, + "7": 10.8278, + "8": 10.81626, + "9": 10.83861, + "10": 10.77729, + "11": 10.90005, + "12": 10.85954, + "13": 10.87494, + "14": 10.8953, + "15": 10.84106, + "16": 10.83779, + "17": 10.82436, + "18": 10.85906, + "19": 10.86597, + "20": 10.82889, + "21": 10.83382, + "22": 10.75171, + "23": 10.8822, + "24": 10.78198, + "25": 10.7666, + "26": 10.79421, + "27": 10.79973, + "28": 10.81809, + "29": 10.81973, + "30": 10.69961, + "31": 10.55541, + "32": 10.75748, + "33": 10.7417, + "34": 10.59849, + "35": 10.61845, + "36": 10.56439, + "37": 10.63758, + "38": 10.53033, + "39": 10.65378, + "40": 10.44051, + "41": 10.49785, + "42": 10.50842, + "43": 10.22237, + "44": 10.30681, + "45": 10.20859, + "46": 10.20077, + "47": 10.41716, + "48": 10.18042, + "49": 9.94398, + "50": 10.21168, + "51": 10.16603, + "52": 10.06842, + "53": 10.30736, + "54": 10.20998, + "55": 10.15675, + "56": 9.91528, + "57": 9.77636, + "58": 10.07274, + "59": 9.86327, + "60": 9.77265, + "61": 9.92815, + "62": 10.17249, + "63": 9.62223, + "64": 9.97162, + "65": 9.22128, + "66": 9.88606, + "67": 9.5836, + "68": 9.95061, + "69": 9.95306, + "70": 9.89371, + "71": 9.77681, + "72": 9.75545, + "73": 9.64983, + "74": 9.14359, + "75": 9.56098, + "76": 9.25119, + "77": 10.16981, + "78": 9.854, + "79": 9.49956, + "80": 9.5311, + "81": 9.60482, + "82": 9.80129, + "83": 9.43763, + "84": 9.51982, + "85": 9.71911, + "86": 9.18564, + "87": 9.68731, + "88": 9.84403, + "89": 9.7063, + "90": 9.89983, + "91": 9.45059, + "92": 9.45364, + "93": 9.18519, + "94": 8.92953, + "95": 9.59785, + "96": 9.61472, + "97": 9.39069, + "98": 9.75698, + "99": 8.9803, + "100": 9.49009 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 570640384.0, + "2": 570640384.0, + "3": 570640384.0, + "4": 570640384.0, + "5": 570640384.0, + "6": 570640384.0, + "7": 570640384.0, + "8": 570640384.0, + "9": 570640384.0, + "10": 570640384.0, + "11": 570640384.0, + "12": 570640384.0, + "13": 570640384.0, + "14": 570640384.0, + "15": 570640384.0, + "16": 570640384.0, + "17": 852351488.0, + "18": 852351488.0, + "19": 852351488.0, + "20": 852351488.0, + "21": 852351488.0, + "22": 852351488.0, + "23": 852351488.0, + "24": 852351488.0, + "25": 852351488.0, + "26": 852351488.0, + "27": 852351488.0, + "28": 852351488.0, + "29": 852351488.0, + "30": 852351488.0, + "31": 852351488.0, + "32": 852351488.0, + "33": 852351488.0, + "34": 852351488.0, + "35": 852351488.0, + "36": 852351488.0, + "37": 852351488.0, + "38": 852351488.0, + "39": 852351488.0, + "40": 852351488.0, + "41": 852351488.0, + "42": 852351488.0, + "43": 852351488.0, + "44": 852351488.0, + "45": 852351488.0, + "46": 852351488.0, + "47": 852351488.0, + "48": 852351488.0, + "49": 852351488.0, + "50": 852351488.0, + "51": 852351488.0, + "52": 852351488.0, + "53": 852351488.0, + "54": 852351488.0, + "55": 852351488.0, + "56": 852351488.0, + "57": 852351488.0, + "58": 852351488.0, + "59": 852351488.0, + "60": 852351488.0, + "61": 852351488.0, + "62": 852351488.0, + "63": 852351488.0, + "64": 852351488.0, + "65": 852351488.0, + "66": 852351488.0, + "67": 852351488.0, + "68": 852351488.0, + "69": 852351488.0, + "70": 852351488.0, + "71": 852351488.0, + "72": 852351488.0, + "73": 852351488.0, + "74": 852351488.0, + "75": 852351488.0, + "76": 852351488.0, + "77": 852351488.0, + "78": 852351488.0, + "79": 852351488.0, + "80": 852351488.0, + "81": 852351488.0, + "82": 852351488.0, + "83": 852351488.0, + "84": 852351488.0, + "85": 852351488.0, + "86": 852351488.0, + "87": 852351488.0, + "88": 852351488.0, + "89": 852351488.0, + "90": 852351488.0, + "91": 852351488.0, + "92": 852351488.0, + "93": 852351488.0, + "94": 852351488.0, + "95": 852351488.0, + "96": 852351488.0, + "97": 852351488.0, + "98": 852351488.0, + "99": 852351488.0, + "100": 852351488.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2393217536.0, + "2": 2393218048.0, + "3": 2393218048.0, + "4": 2393218048.0, + "5": 2393218048.0, + "6": 2393218048.0, + "7": 2393218048.0, + "8": 2393218048.0, + "9": 2393218048.0, + "10": 2393218048.0, + "11": 2393218048.0, + "12": 2393218048.0, + "13": 2393218048.0, + "14": 2393218048.0, + "15": 2393218048.0, + "16": 2393218048.0, + "17": 2393218048.0, + "18": 2675191296.0, + "19": 2675191296.0, + "20": 2675191296.0, + "21": 2675191296.0, + "22": 2675191296.0, + "23": 2675191296.0, + "24": 2675191296.0, + "25": 2675191296.0, + "26": 2675191296.0, + "27": 2675191296.0, + "28": 2675191296.0, + "29": 2675191296.0, + "30": 2675191296.0, + "31": 2675191296.0, + "32": 2675191296.0, + "33": 2675191296.0, + "34": 2675191296.0, + "35": 2675191296.0, + "36": 2675191296.0, + "37": 2675191296.0, + "38": 2675191296.0, + "39": 2675191296.0, + "40": 2675191296.0, + "41": 2675191296.0, + "42": 2675191296.0, + "43": 2675191296.0, + "44": 2675191296.0, + "45": 2675191296.0, + "46": 2675191296.0, + "47": 2675191296.0, + "48": 2675191296.0, + "49": 2675191296.0, + "50": 2675191296.0, + "51": 2675191296.0, + "52": 2675191296.0, + "53": 2675191296.0, + "54": 2675191296.0, + "55": 2675191296.0, + "56": 2675191296.0, + "57": 2675191296.0, + "58": 2675191296.0, + "59": 2675191296.0, + "60": 2675191296.0, + "61": 2675191296.0, + "62": 2675191296.0, + "63": 2675191296.0, + "64": 2675191296.0, + "65": 2675191296.0, + "66": 2675191296.0, + "67": 2675191296.0, + "68": 2675191296.0, + "69": 2675191296.0, + "70": 2675191296.0, + "71": 2675191296.0, + "72": 2675191296.0, + "73": 2675191296.0, + "74": 2675191296.0, + "75": 2675191296.0, + "76": 2675191296.0, + "77": 2675191296.0, + "78": 2675191296.0, + "79": 2675191296.0, + "80": 2675191296.0, + "81": 2675191296.0, + "82": 2675191296.0, + "83": 2675191296.0, + "84": 2675191296.0, + "85": 2675191296.0, + "86": 2675191296.0, + "87": 2675191296.0, + "88": 2675191296.0, + "89": 2675191296.0, + "90": 2675191296.0, + "91": 2675191296.0, + "92": 2675191296.0, + "93": 2675191296.0, + "94": 2675191296.0, + "95": 2675191296.0, + "96": 2675191296.0, + "97": 2675191296.0, + "98": 2675191296.0, + "99": 2675191296.0, + "100": 2675191296.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.43358, + "2": 0.22524, + "3": 0.17789, + "4": 0.17624, + "5": 0.17537, + "6": 0.17509, + "7": 0.17504, + "8": 0.17597, + "9": 0.33529, + "10": 0.1733, + "11": 0.17189, + "12": 0.36273, + "13": 0.33105, + "14": 0.17358, + "15": 0.17041, + "16": 0.17127, + "17": 0.22308, + "18": 0.18489, + "19": 0.17575, + "20": 0.17774, + "21": 0.17576, + "22": 0.17856, + "23": 0.17708, + "24": 0.17716, + "25": 0.17653, + "26": 0.17714, + "27": 0.17666, + "28": 0.17607, + "29": 0.17677, + "30": 0.17713, + "31": 0.17662, + "32": 0.17475, + "33": 0.17536, + "34": 0.17541, + "35": 0.17373, + "36": 0.17425, + "37": 0.17642, + "38": 0.17354, + "39": 0.1728, + "40": 0.17398, + "41": 0.17325, + "42": 0.17407, + "43": 0.17446, + "44": 0.17406, + "45": 0.17259, + "46": 0.17351, + "47": 0.17206, + "48": 0.17349, + "49": 0.17325, + "50": 0.17301, + "51": 0.1847, + "52": 0.17696, + "53": 0.17664, + "54": 0.17578, + "55": 0.17469, + "56": 0.1747, + "57": 0.17669, + "58": 0.46947, + "59": 0.17866, + "60": 0.18128, + "61": 0.1841, + "62": 0.18126, + "63": 0.18539, + "64": 0.18121, + "65": 0.18392, + "66": 0.18089, + "67": 0.18156, + "68": 0.18143, + "69": 0.18341, + "70": 0.18174, + "71": 0.18035, + "72": 0.18154, + "73": 0.18372, + "74": 0.18315, + "75": 0.18495, + "76": 0.18114, + "77": 0.18247, + "78": 0.18539, + "79": 0.18003, + "80": 0.18064, + "81": 0.18357, + "82": 0.18141, + "83": 0.18237, + "84": 0.1825, + "85": 0.1832, + "86": 0.18311, + "87": 0.18223, + "88": 0.18193, + "89": 0.18393, + "90": 0.18315, + "91": 0.18376, + "92": 0.1829, + "93": 0.18319, + "94": 0.18381, + "95": 0.18373, + "96": 0.18292, + "97": 0.18321, + "98": 0.18299, + "99": 0.1838, + "100": 0.18438 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": "nan", + "17": 2468.0, + "18": 2395.0, + "19": 3001.0, + "20": 1788.0, + "21": 2162.0, + "22": 2852.0, + "23": 2510.0, + "24": 2287.0, + "25": 2300.0, + "26": 2108.0, + "27": 2155.0, + "28": 2751.0, + "29": 2604.0, + "30": 2419.0, + "31": 1842.0, + "32": 2598.0, + "33": 2277.0, + "34": 1897.0, + "35": 2097.0, + "36": 2176.0, + "37": 2715.0, + "38": 2423.0, + "39": 3095.0, + "40": 2126.0, + "41": 3441.0, + "42": 2505.0, + "43": 2679.0, + "44": 2086.0, + "45": 2520.0, + "46": 2259.0, + "47": 3003.0, + "48": 2604.0, + "49": 1956.0, + "50": 2929.0, + "51": 2283.0, + "52": 2458.0, + "53": 3770.0, + "54": 2965.0, + "55": 2457.0, + "56": 2411.0, + "57": 2342.0, + "58": 3450.0, + "59": 2845.0, + "60": 2961.0, + "61": 2897.0, + "62": 3092.0, + "63": 3200.0, + "64": 3129.0, + "65": 2359.0, + "66": 3857.0, + "67": 2591.0, + "68": 3272.0, + "69": 2823.0, + "70": 3633.0, + "71": 3058.0, + "72": 2755.0, + "73": 3353.0, + "74": 2201.0, + "75": 2932.0, + "76": 3649.0, + "77": 4022.0, + "78": 3953.0, + "79": 4091.0, + "80": 3595.0, + "81": 5179.0, + "82": 3499.0, + "83": 3262.0, + "84": 3902.0, + "85": 3959.0, + "86": 3288.0, + "87": 4032.0, + "88": 3628.0, + "89": 4405.0, + "90": 3785.0, + "91": 2856.0, + "92": 4187.0, + "93": 3564.0, + "94": 4347.0, + "95": 4072.0, + "96": 3833.0, + "97": 4121.0, + "98": 4897.0, + "99": 4120.0, + "100": 3581.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json index d7a8a24cd68..dabf1673e8e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json @@ -2,140 +2,535 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, "100": 9.50415 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, "100": 416513536.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, "100": 1597092352.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 6.08145, - "5": 0.19699, - "10": 0.18649, - "15": 0.1857, - "20": 0.2021, - "25": 0.20057, - "30": 0.19804, - "35": 0.19848, - "40": 0.20241, - "45": 0.19796, - "50": 0.19684, - "55": 0.19872, - "60": 0.19694, - "65": 0.19755, - "70": 0.19889, - "75": 0.19755, - "80": 0.20241, - "85": 0.20082, - "90": 0.19963, - "95": 0.20089, - "100": 0.19724 + "1": 6.78518, + "2": 0.23744, + "3": 0.21193, + "4": 0.21211, + "5": 0.21234, + "6": 0.21714, + "7": 0.21381, + "8": 0.21678, + "9": 0.21057, + "10": 0.21454, + "11": 0.21268, + "12": 0.21347, + "13": 0.209, + "14": 0.20717, + "15": 0.20674, + "16": 0.28167, + "17": 0.21476, + "18": 0.22185, + "19": 0.22342, + "20": 0.21927, + "21": 0.21844, + "22": 0.20869, + "23": 0.21636, + "24": 0.22148, + "25": 0.21904, + "26": 0.21751, + "27": 0.21967, + "28": 0.21863, + "29": 0.21626, + "30": 0.22036, + "31": 0.21954, + "32": 0.22158, + "33": 0.22026, + "34": 0.21931, + "35": 0.21953, + "36": 0.22128, + "37": 0.22086, + "38": 0.22232, + "39": 0.22188, + "40": 0.22409, + "41": 0.22246, + "42": 0.22597, + "43": 0.22399, + "44": 0.22475, + "45": 0.22278, + "46": 0.22509, + "47": 0.2265, + "48": 0.22645, + "49": 0.22526, + "50": 0.22341, + "51": 0.22545, + "52": 0.22535, + "53": 0.22576, + "54": 0.2245, + "55": 0.22609, + "56": 0.2228, + "57": 0.22559, + "58": 0.22342, + "59": 0.22459, + "60": 0.2267, + "61": 0.22697, + "62": 0.22521, + "63": 0.22584, + "64": 0.22709, + "65": 0.22302, + "66": 0.22625, + "67": 0.22446, + "68": 0.22406, + "69": 0.22377, + "70": 0.22903, + "71": 0.2251, + "72": 0.22663, + "73": 0.2167, + "74": 0.21951, + "75": 0.22056, + "76": 0.22119, + "77": 0.21831, + "78": 0.21638, + "79": 0.22219, + "80": 0.21903, + "81": 0.21864, + "82": 0.22289, + "83": 0.21759, + "84": 0.21896, + "85": 0.21769, + "86": 0.21796, + "87": 0.22137, + "88": 0.2181, + "89": 0.22173, + "90": 0.21854, + "91": 0.21692, + "92": 0.21712, + "93": 0.21996, + "94": 0.2158, + "95": 0.21804, + "96": 0.21776, + "97": 0.21778, + "98": 0.21975, + "99": 0.21815, + "100": 0.21699 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, "100": 3390.0 } } diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..eaee6a60f26 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, + "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, + "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, + "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, + "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, + "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, + "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, + "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, + "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, + "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, + "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, + "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, + "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, + "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, + "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, + "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, + "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, + "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, + "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, + "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, + "100": 9.50415 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.37179, + "2": 0.21537, + "3": 0.18911, + "4": 0.18458, + "5": 0.18487, + "6": 0.18754, + "7": 0.18665, + "8": 0.1878, + "9": 0.18553, + "10": 0.1849, + "11": 0.18796, + "12": 0.18834, + "13": 0.19005, + "14": 0.18356, + "15": 0.18558, + "16": 0.27381, + "17": 0.18936, + "18": 0.19528, + "19": 0.19364, + "20": 0.1953, + "21": 0.19158, + "22": 0.18527, + "23": 0.1891, + "24": 0.19114, + "25": 0.19216, + "26": 0.19001, + "27": 0.19218, + "28": 0.19054, + "29": 0.19151, + "30": 0.19191, + "31": 0.19643, + "32": 0.19421, + "33": 0.19414, + "34": 0.19615, + "35": 0.19402, + "36": 0.19651, + "37": 0.19212, + "38": 0.19469, + "39": 0.19904, + "40": 0.19924, + "41": 0.19587, + "42": 0.21217, + "43": 0.21187, + "44": 0.19529, + "45": 0.20033, + "46": 0.20271, + "47": 0.19543, + "48": 0.20218, + "49": 0.20489, + "50": 0.19921, + "51": 0.2115, + "52": 0.20718, + "53": 0.19391, + "54": 0.19638, + "55": 0.19472, + "56": 0.19481, + "57": 0.19264, + "58": 0.19802, + "59": 0.19862, + "60": 0.19826, + "61": 0.19634, + "62": 0.19752, + "63": 0.19602, + "64": 0.19649, + "65": 0.19524, + "66": 0.19483, + "67": 0.19471, + "68": 0.19619, + "69": 0.19456, + "70": 0.1972, + "71": 0.19562, + "72": 0.1963, + "73": 0.19559, + "74": 0.1958, + "75": 0.2007, + "76": 0.19838, + "77": 0.1931, + "78": 0.19809, + "79": 0.19589, + "80": 0.19799, + "81": 0.19659, + "82": 0.19661, + "83": 0.20092, + "84": 0.19558, + "85": 0.19886, + "86": 0.20355, + "87": 0.19808, + "88": 0.19948, + "89": 0.19521, + "90": 0.19741, + "91": 0.19953, + "92": 0.19688, + "93": 0.19645, + "94": 0.19575, + "95": 0.19574, + "96": 0.19609, + "97": 0.19745, + "98": 0.19491, + "99": 0.19618, + "100": 0.19576 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, + "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, + "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, + "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, + "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, + "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, + "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, + "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, + "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, + "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, + "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, + "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, + "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, + "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, + "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, + "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, + "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, + "100": 3390.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..47fa63fad72 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, + "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, + "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, + "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, + "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, + "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, + "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, + "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, + "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, + "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, + "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, + "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, + "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, + "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, + "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, + "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, + "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, + "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, + "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, + "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, + "100": 9.50415 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.02035, + "2": 0.23195, + "3": 0.20851, + "4": 0.20697, + "5": 0.20737, + "6": 0.20888, + "7": 0.2126, + "8": 0.21169, + "9": 0.21057, + "10": 0.21255, + "11": 0.21108, + "12": 0.21506, + "13": 0.21085, + "14": 0.21072, + "15": 0.20967, + "16": 0.28325, + "17": 0.21485, + "18": 0.21984, + "19": 0.22277, + "20": 0.22004, + "21": 0.2242, + "22": 0.21349, + "23": 0.22346, + "24": 0.22444, + "25": 0.22521, + "26": 0.22267, + "27": 0.22592, + "28": 0.22136, + "29": 0.22802, + "30": 0.2227, + "31": 0.22084, + "32": 0.22099, + "33": 0.22019, + "34": 0.22336, + "35": 0.23024, + "36": 0.23188, + "37": 0.21929, + "38": 0.22277, + "39": 0.22303, + "40": 0.22269, + "41": 0.22539, + "42": 0.22835, + "43": 0.22379, + "44": 0.22103, + "45": 0.21919, + "46": 0.22653, + "47": 0.21996, + "48": 0.22399, + "49": 0.22202, + "50": 0.22099, + "51": 0.21773, + "52": 0.22165, + "53": 0.2208, + "54": 0.22241, + "55": 0.22007, + "56": 0.22113, + "57": 0.22282, + "58": 0.22209, + "59": 0.22153, + "60": 0.22251, + "61": 0.22383, + "62": 0.22477, + "63": 0.22389, + "64": 0.22518, + "65": 0.22491, + "66": 0.22204, + "67": 0.23149, + "68": 0.22301, + "69": 0.2298, + "70": 0.23059, + "71": 0.22412, + "72": 0.21788, + "73": 0.2209, + "74": 0.22227, + "75": 0.22603, + "76": 0.22022, + "77": 0.22045, + "78": 0.22051, + "79": 0.22157, + "80": 0.22544, + "81": 0.22703, + "82": 0.23226, + "83": 0.23535, + "84": 0.22503, + "85": 0.21869, + "86": 0.21989, + "87": 0.21782, + "88": 0.22296, + "89": 0.24294, + "90": 0.27356, + "91": 0.2182, + "92": 0.22138, + "93": 0.21695, + "94": 0.22172, + "95": 0.21947, + "96": 0.21792, + "97": 0.22243, + "98": 0.21902, + "99": 0.2202, + "100": 0.22043 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, + "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, + "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, + "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, + "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, + "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, + "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, + "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, + "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, + "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, + "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, + "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, + "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, + "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, + "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, + "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, + "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, + "100": 3390.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..9f83699719d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86104, + "2": 10.85751, + "3": 10.86157, + "4": 10.84944, + "5": 10.88371, + "6": 10.88763, + "7": 10.86427, + "8": 10.87317, + "9": 10.86952, + "10": 10.84263, + "11": 10.88626, + "12": 10.88784, + "13": 10.89496, + "14": 10.90319, + "15": 10.87935, + "16": 10.88588, + "17": 10.86428, + "18": 10.88923, + "19": 10.88151, + "20": 10.87405, + "21": 10.88996, + "22": 10.83151, + "23": 10.89289, + "24": 10.85821, + "25": 10.82867, + "26": 10.82729, + "27": 10.85428, + "28": 10.84631, + "29": 10.85408, + "30": 10.77191, + "31": 10.67404, + "32": 10.78923, + "33": 10.7757, + "34": 10.67639, + "35": 10.67622, + "36": 10.63402, + "37": 10.69312, + "38": 10.61026, + "39": 10.70232, + "40": 10.517, + "41": 10.54604, + "42": 10.57058, + "43": 10.32305, + "44": 10.39205, + "45": 10.28436, + "46": 10.27329, + "47": 10.4798, + "48": 10.25535, + "49": 10.01605, + "50": 10.27861, + "51": 10.21825, + "52": 10.1281, + "53": 10.35922, + "54": 10.25909, + "55": 10.20112, + "56": 9.9815, + "57": 9.84915, + "58": 10.12333, + "59": 9.90734, + "60": 9.83306, + "61": 9.97107, + "62": 10.22132, + "63": 9.6767, + "64": 10.01779, + "65": 9.26979, + "66": 9.9402, + "67": 9.62874, + "68": 9.9875, + "69": 9.98441, + "70": 9.92662, + "71": 9.80996, + "72": 9.79208, + "73": 9.68101, + "74": 9.18023, + "75": 9.61385, + "76": 9.28826, + "77": 10.19395, + "78": 9.87453, + "79": 9.52966, + "80": 9.56419, + "81": 9.63453, + "82": 9.82245, + "83": 9.47207, + "84": 9.54654, + "85": 9.74319, + "86": 9.2009, + "87": 9.70113, + "88": 9.86518, + "89": 9.7307, + "90": 9.92148, + "91": 9.4869, + "92": 9.47682, + "93": 9.2135, + "94": 8.94897, + "95": 9.6163, + "96": 9.63416, + "97": 9.41229, + "98": 9.77615, + "99": 9.00251, + "100": 9.5087 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1464319488.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1595257344.0, + "18": 1595257344.0, + "19": 1595257344.0, + "20": 1595257344.0, + "21": 1595257344.0, + "22": 1595257344.0, + "23": 1596305920.0, + "24": 1596305920.0, + "25": 1596305920.0, + "26": 1596305920.0, + "27": 1596305920.0, + "28": 1596305920.0, + "29": 1596305920.0, + "30": 1596305920.0, + "31": 1596305920.0, + "32": 1596305920.0, + "33": 1596305920.0, + "34": 1596305920.0, + "35": 1596305920.0, + "36": 1596305920.0, + "37": 1596305920.0, + "38": 1596305920.0, + "39": 1596305920.0, + "40": 1596305920.0, + "41": 1596305920.0, + "42": 1596305920.0, + "43": 1596305920.0, + "44": 1596305920.0, + "45": 1596305920.0, + "46": 1596305920.0, + "47": 1596305920.0, + "48": 1596305920.0, + "49": 1596305920.0, + "50": 1596305920.0, + "51": 1596305920.0, + "52": 1596305920.0, + "53": 1596305920.0, + "54": 1596305920.0, + "55": 1596305920.0, + "56": 1596305920.0, + "57": 1596305920.0, + "58": 1596305920.0, + "59": 1596305920.0, + "60": 1596305920.0, + "61": 1596305920.0, + "62": 1596305920.0, + "63": 1596305920.0, + "64": 1596305920.0, + "65": 1596305920.0, + "66": 1596305920.0, + "67": 1596305920.0, + "68": 1596305920.0, + "69": 1596305920.0, + "70": 1596305920.0, + "71": 1596305920.0, + "72": 1596305920.0, + "73": 1596305920.0, + "74": 1596305920.0, + "75": 1596305920.0, + "76": 1596305920.0, + "77": 1596305920.0, + "78": 1596305920.0, + "79": 1596305920.0, + "80": 1596305920.0, + "81": 1596305920.0, + "82": 1596305920.0, + "83": 1596305920.0, + "84": 1596305920.0, + "85": 1596305920.0, + "86": 1596305920.0, + "87": 1596305920.0, + "88": 1596305920.0, + "89": 1596305920.0, + "90": 1596305920.0, + "91": 1596305920.0, + "92": 1596305920.0, + "93": 1596305920.0, + "94": 1596305920.0, + "95": 1596305920.0, + "96": 1596305920.0, + "97": 1596305920.0, + "98": 1596305920.0, + "99": 1596305920.0, + "100": 1596305920.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.02223, + "2": 0.38061, + "3": 0.32373, + "4": 0.32033, + "5": 0.31913, + "6": 0.32369, + "7": 0.32104, + "8": 0.55134, + "9": 0.31907, + "10": 0.31445, + "11": 0.31681, + "12": 0.32078, + "13": 0.31316, + "14": 0.31705, + "15": 0.32367, + "16": 0.51605, + "17": 0.32163, + "18": 0.33141, + "19": 0.32965, + "20": 0.33483, + "21": 0.33262, + "22": 0.31555, + "23": 0.54296, + "24": 0.32628, + "25": 0.32494, + "26": 0.33072, + "27": 0.32494, + "28": 0.32501, + "29": 0.33418, + "30": 0.32445, + "31": 0.32469, + "32": 0.54347, + "33": 0.32433, + "34": 0.33133, + "35": 0.32861, + "36": 0.32508, + "37": 0.33059, + "38": 0.32933, + "39": 0.32486, + "40": 0.32922, + "41": 0.32822, + "42": 0.32589, + "43": 0.32604, + "44": 0.32857, + "45": 0.32472, + "46": 0.32696, + "47": 0.32915, + "48": 0.32449, + "49": 0.32476, + "50": 0.33417, + "51": 0.32622, + "52": 0.31932, + "53": 0.32288, + "54": 0.32664, + "55": 0.3199, + "56": 0.32098, + "57": 0.33106, + "58": 0.32428, + "59": 0.32012, + "60": 0.63225, + "61": 0.3217, + "62": 0.3235, + "63": 0.32372, + "64": 0.31863, + "65": 0.32545, + "66": 0.32518, + "67": 0.32024, + "68": 0.32648, + "69": 0.32388, + "70": 0.32115, + "71": 0.32798, + "72": 0.32445, + "73": 0.32219, + "74": 0.32407, + "75": 0.32414, + "76": 0.31907, + "77": 0.3226, + "78": 0.32339, + "79": 0.31992, + "80": 0.32293, + "81": 0.32579, + "82": 0.31876, + "83": 0.31946, + "84": 0.32957, + "85": 0.3196, + "86": 0.31988, + "87": 0.32978, + "88": 0.31888, + "89": 0.31848, + "90": 0.32475, + "91": 0.32291, + "92": 0.32112, + "93": 0.32728, + "94": 0.32274, + "95": 0.31869, + "96": 0.32364, + "97": 0.32247, + "98": 0.32012, + "99": 0.32377, + "100": 0.32291 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2261.0, + "17": "nan", + "18": 2424.0, + "19": 2800.0, + "20": 1777.0, + "21": 2046.0, + "22": "nan", + "23": 2489.0, + "24": 2136.0, + "25": 2124.0, + "26": 1902.0, + "27": 2006.0, + "28": 2337.0, + "29": 2425.0, + "30": 2262.0, + "31": 1584.0, + "32": 2470.0, + "33": 2074.0, + "34": 1679.0, + "35": 1763.0, + "36": 1918.0, + "37": 2542.0, + "38": 2195.0, + "39": 3045.0, + "40": 1875.0, + "41": 3199.0, + "42": 2508.0, + "43": 2563.0, + "44": 1898.0, + "45": 2434.0, + "46": 2065.0, + "47": 2739.0, + "48": 2291.0, + "49": 1821.0, + "50": 2634.0, + "51": 2172.0, + "52": 2278.0, + "53": 3531.0, + "54": 2662.0, + "55": 2383.0, + "56": 2480.0, + "57": 2136.0, + "58": 3305.0, + "59": 2485.0, + "60": 2832.0, + "61": 2847.0, + "62": 2841.0, + "63": 2867.0, + "64": 3107.0, + "65": 2223.0, + "66": 3682.0, + "67": 2533.0, + "68": 3137.0, + "69": 2650.0, + "70": 3836.0, + "71": 2945.0, + "72": 2727.0, + "73": 3322.0, + "74": 2186.0, + "75": 2913.0, + "76": 3553.0, + "77": 3629.0, + "78": 3871.0, + "79": 4097.0, + "80": 3398.0, + "81": 5006.0, + "82": 3345.0, + "83": 3174.0, + "84": 3718.0, + "85": 3618.0, + "86": 3181.0, + "87": 3995.0, + "88": 3634.0, + "89": 4250.0, + "90": 3676.0, + "91": 2926.0, + "92": 4446.0, + "93": 3780.0, + "94": 4430.0, + "95": 4082.0, + "96": 3952.0, + "97": 4117.0, + "98": 5049.0, + "99": 4122.0, + "100": 3502.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..dd9dc5f116a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86104, + "2": 10.85751, + "3": 10.86157, + "4": 10.84944, + "5": 10.88371, + "6": 10.88763, + "7": 10.86427, + "8": 10.87317, + "9": 10.86952, + "10": 10.84263, + "11": 10.88626, + "12": 10.88784, + "13": 10.89496, + "14": 10.90319, + "15": 10.87935, + "16": 10.88588, + "17": 10.86428, + "18": 10.88923, + "19": 10.88151, + "20": 10.87405, + "21": 10.88996, + "22": 10.83151, + "23": 10.89289, + "24": 10.85821, + "25": 10.82867, + "26": 10.82729, + "27": 10.85428, + "28": 10.84631, + "29": 10.85408, + "30": 10.77191, + "31": 10.67404, + "32": 10.78923, + "33": 10.7757, + "34": 10.67639, + "35": 10.67622, + "36": 10.63402, + "37": 10.69312, + "38": 10.61026, + "39": 10.70232, + "40": 10.517, + "41": 10.54604, + "42": 10.57058, + "43": 10.32305, + "44": 10.39205, + "45": 10.28436, + "46": 10.27329, + "47": 10.4798, + "48": 10.25535, + "49": 10.01605, + "50": 10.27861, + "51": 10.21825, + "52": 10.1281, + "53": 10.35922, + "54": 10.25909, + "55": 10.20112, + "56": 9.9815, + "57": 9.84915, + "58": 10.12333, + "59": 9.90734, + "60": 9.83306, + "61": 9.97107, + "62": 10.22132, + "63": 9.6767, + "64": 10.01779, + "65": 9.26979, + "66": 9.9402, + "67": 9.62874, + "68": 9.9875, + "69": 9.98441, + "70": 9.92662, + "71": 9.80996, + "72": 9.79208, + "73": 9.68101, + "74": 9.18023, + "75": 9.61385, + "76": 9.28826, + "77": 10.19395, + "78": 9.87453, + "79": 9.52966, + "80": 9.56419, + "81": 9.63453, + "82": 9.82245, + "83": 9.47207, + "84": 9.54654, + "85": 9.74319, + "86": 9.2009, + "87": 9.70113, + "88": 9.86518, + "89": 9.7307, + "90": 9.92148, + "91": 9.4869, + "92": 9.47682, + "93": 9.2135, + "94": 8.94897, + "95": 9.6163, + "96": 9.63416, + "97": 9.41229, + "98": 9.77615, + "99": 9.00251, + "100": 9.5087 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465367040.0, + "2": 1465367040.0, + "3": 1465367552.0, + "4": 1465367552.0, + "5": 1465367552.0, + "6": 1465367552.0, + "7": 1465367552.0, + "8": 1465367552.0, + "9": 1465367552.0, + "10": 1465367552.0, + "11": 1465367552.0, + "12": 1465367552.0, + "13": 1465368064.0, + "14": 1465368064.0, + "15": 1465368064.0, + "16": 1465368064.0, + "17": 1597091328.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.2197, + "2": 0.38153, + "3": 0.31292, + "4": 0.31213, + "5": 0.30805, + "6": 0.31347, + "7": 0.30766, + "8": 0.30913, + "9": 0.31477, + "10": 0.311, + "11": 0.65045, + "12": 0.30686, + "13": 0.49089, + "14": 0.47587, + "15": 0.30732, + "16": 0.44089, + "17": 0.30846, + "18": 0.31946, + "19": 0.34232, + "20": 0.31703, + "21": 0.31667, + "22": 0.6731, + "23": 0.3162, + "24": 0.31788, + "25": 0.31492, + "26": 0.31699, + "27": 0.31509, + "28": 0.31634, + "29": 0.55951, + "30": 0.31931, + "31": 0.54064, + "32": 0.32022, + "33": 0.31532, + "34": 0.31678, + "35": 0.31737, + "36": 0.31871, + "37": 0.31753, + "38": 0.31664, + "39": 0.32082, + "40": 0.31603, + "41": 0.31831, + "42": 0.32238, + "43": 0.31648, + "44": 0.31713, + "45": 0.32324, + "46": 0.31647, + "47": 0.31877, + "48": 0.32192, + "49": 0.31644, + "50": 0.31704, + "51": 0.31935, + "52": 0.31622, + "53": 0.32109, + "54": 0.31685, + "55": 0.31646, + "56": 0.32045, + "57": 0.31644, + "58": 0.31787, + "59": 0.32038, + "60": 0.31946, + "61": 0.31938, + "62": 0.31564, + "63": 0.32119, + "64": 0.31817, + "65": 0.31991, + "66": 0.32324, + "67": 0.31621, + "68": 0.31739, + "69": 0.32315, + "70": 0.31648, + "71": 0.31985, + "72": 0.32121, + "73": 0.31529, + "74": 0.31685, + "75": 0.32032, + "76": 0.31549, + "77": 0.31631, + "78": 0.32153, + "79": 0.31574, + "80": 0.32036, + "81": 0.31981, + "82": 0.31914, + "83": 0.31869, + "84": 0.31666, + "85": 0.32462, + "86": 0.31593, + "87": 0.31737, + "88": 0.32152, + "89": 0.31605, + "90": 0.31771, + "91": 0.32722, + "92": 0.31534, + "93": 0.31963, + "94": 0.32198, + "95": 0.31603, + "96": 0.31693, + "97": 0.32705, + "98": 0.31586, + "99": 0.31749, + "100": 0.32114 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2261.0, + "17": "nan", + "18": 2424.0, + "19": 2800.0, + "20": 1777.0, + "21": 2046.0, + "22": "nan", + "23": 2489.0, + "24": 2136.0, + "25": 2124.0, + "26": 1902.0, + "27": 2006.0, + "28": 2337.0, + "29": 2425.0, + "30": 2262.0, + "31": 1584.0, + "32": 2470.0, + "33": 2074.0, + "34": 1679.0, + "35": 1763.0, + "36": 1918.0, + "37": 2542.0, + "38": 2195.0, + "39": 3045.0, + "40": 1875.0, + "41": 3199.0, + "42": 2508.0, + "43": 2563.0, + "44": 1898.0, + "45": 2434.0, + "46": 2065.0, + "47": 2739.0, + "48": 2291.0, + "49": 1821.0, + "50": 2634.0, + "51": 2172.0, + "52": 2278.0, + "53": 3531.0, + "54": 2662.0, + "55": 2383.0, + "56": 2480.0, + "57": 2136.0, + "58": 3305.0, + "59": 2485.0, + "60": 2832.0, + "61": 2847.0, + "62": 2841.0, + "63": 2867.0, + "64": 3107.0, + "65": 2223.0, + "66": 3682.0, + "67": 2533.0, + "68": 3137.0, + "69": 2650.0, + "70": 3836.0, + "71": 2945.0, + "72": 2727.0, + "73": 3322.0, + "74": 2186.0, + "75": 2913.0, + "76": 3553.0, + "77": 3629.0, + "78": 3871.0, + "79": 4097.0, + "80": 3398.0, + "81": 5006.0, + "82": 3345.0, + "83": 3174.0, + "84": 3718.0, + "85": 3618.0, + "86": 3181.0, + "87": 3995.0, + "88": 3634.0, + "89": 4250.0, + "90": 3676.0, + "91": 2926.0, + "92": 4446.0, + "93": 3780.0, + "94": 4430.0, + "95": 4082.0, + "96": 3952.0, + "97": 4117.0, + "98": 5049.0, + "99": 4122.0, + "100": 3502.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json index ac72f0a511b..24b971e51f0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json @@ -2,140 +2,535 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, "100": 9.50415 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, "100": 416513536.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, "100": 1597092352.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 6.8101, - "5": 0.18701, - "10": 0.18541, - "15": 0.18521, - "20": 0.19609, - "25": 0.1951, - "30": 0.19333, - "35": 0.19677, - "40": 0.19632, - "45": 0.1936, - "50": 0.1942, - "55": 0.19155, - "60": 0.19561, - "65": 0.19204, - "70": 0.2011, - "75": 0.19962, - "80": 0.19865, - "85": 0.20072, - "90": 0.19885, - "95": 0.20622, - "100": 0.20088 + "1": 6.81983, + "2": 0.2794, + "3": 0.23686, + "4": 0.21148, + "5": 0.21241, + "6": 0.21432, + "7": 0.21203, + "8": 0.21066, + "9": 0.20958, + "10": 0.21304, + "11": 0.2134, + "12": 0.21369, + "13": 0.2107, + "14": 0.21366, + "15": 0.20862, + "16": 0.28561, + "17": 0.2165, + "18": 0.21953, + "19": 0.22122, + "20": 0.22177, + "21": 0.2229, + "22": 0.21407, + "23": 0.22275, + "24": 0.22407, + "25": 0.22273, + "26": 0.22637, + "27": 0.22313, + "28": 0.22384, + "29": 0.22193, + "30": 0.22359, + "31": 0.2209, + "32": 0.22301, + "33": 0.22023, + "34": 0.22191, + "35": 0.22291, + "36": 0.22174, + "37": 0.22136, + "38": 0.22212, + "39": 0.22108, + "40": 0.22197, + "41": 0.22185, + "42": 0.22093, + "43": 0.22393, + "44": 0.22166, + "45": 0.2211, + "46": 0.22759, + "47": 0.22278, + "48": 0.22181, + "49": 0.2205, + "50": 0.2208, + "51": 0.22217, + "52": 0.22209, + "53": 0.21851, + "54": 0.21953, + "55": 0.22284, + "56": 0.21873, + "57": 0.21994, + "58": 0.21738, + "59": 0.22216, + "60": 0.22091, + "61": 0.21912, + "62": 0.21916, + "63": 0.21618, + "64": 0.22037, + "65": 0.22084, + "66": 0.21741, + "67": 0.2191, + "68": 0.21708, + "69": 0.21714, + "70": 0.22023, + "71": 0.21802, + "72": 0.216, + "73": 0.22116, + "74": 0.22062, + "75": 0.23228, + "76": 0.22254, + "77": 0.21552, + "78": 0.2206, + "79": 0.2158, + "80": 0.22212, + "81": 0.22066, + "82": 0.22199, + "83": 0.21697, + "84": 0.21798, + "85": 0.21804, + "86": 0.21874, + "87": 0.21943, + "88": 0.21826, + "89": 0.21793, + "90": 0.21769, + "91": 0.21994, + "92": 0.21792, + "93": 0.22021, + "94": 0.21851, + "95": 0.21939, + "96": 0.21921, + "97": 0.22073, + "98": 0.21992, + "99": 0.21794, + "100": 0.21873 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, "100": 3390.0 } } diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..6d3fed6a4e1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, + "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, + "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, + "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, + "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, + "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, + "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, + "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, + "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, + "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, + "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, + "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, + "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, + "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, + "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, + "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, + "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, + "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, + "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, + "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, + "100": 9.50415 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.88808, + "2": 0.20981, + "3": 0.18464, + "4": 0.18146, + "5": 0.18139, + "6": 0.18232, + "7": 0.18139, + "8": 0.19305, + "9": 0.20922, + "10": 0.21649, + "11": 0.21725, + "12": 0.21609, + "13": 0.21598, + "14": 0.20547, + "15": 0.17989, + "16": 0.28174, + "17": 0.18387, + "18": 0.18953, + "19": 0.18846, + "20": 0.19189, + "21": 0.19314, + "22": 0.18064, + "23": 0.18755, + "24": 0.18827, + "25": 0.18887, + "26": 0.19031, + "27": 0.1885, + "28": 0.18793, + "29": 0.19305, + "30": 0.19416, + "31": 0.19643, + "32": 0.1951, + "33": 0.19776, + "34": 0.1938, + "35": 0.19081, + "36": 0.19042, + "37": 0.18859, + "38": 0.19216, + "39": 0.1926, + "40": 0.19911, + "41": 0.19456, + "42": 0.19355, + "43": 0.1903, + "44": 0.1948, + "45": 0.19482, + "46": 0.19503, + "47": 0.19164, + "48": 0.19046, + "49": 0.19133, + "50": 0.19304, + "51": 0.19406, + "52": 0.20215, + "53": 0.18888, + "54": 0.19054, + "55": 0.1901, + "56": 0.18974, + "57": 0.18817, + "58": 0.18992, + "59": 0.18977, + "60": 0.19074, + "61": 0.1885, + "62": 0.18892, + "63": 0.18809, + "64": 0.19043, + "65": 0.19082, + "66": 0.19034, + "67": 0.19393, + "68": 0.18998, + "69": 0.19445, + "70": 0.19067, + "71": 0.19176, + "72": 0.18979, + "73": 0.18866, + "74": 0.18912, + "75": 0.19329, + "76": 0.19148, + "77": 0.19217, + "78": 0.18942, + "79": 0.19141, + "80": 0.19297, + "81": 0.19247, + "82": 0.19228, + "83": 0.19275, + "84": 0.19196, + "85": 0.19648, + "86": 0.20088, + "87": 0.20172, + "88": 0.1985, + "89": 0.20262, + "90": 0.20618, + "91": 0.19394, + "92": 0.1911, + "93": 0.19148, + "94": 0.50543, + "95": 0.19162, + "96": 0.19339, + "97": 0.1931, + "98": 0.19152, + "99": 0.19182, + "100": 0.1939 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, + "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, + "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, + "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, + "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, + "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, + "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, + "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, + "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, + "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, + "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, + "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, + "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, + "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, + "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, + "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, + "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, + "100": 3390.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..cb0ad3fdb4b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84277, + "2": 10.85562, + "3": 10.84568, + "4": 10.84364, + "5": 10.85979, + "6": 10.86413, + "7": 10.85362, + "8": 10.85066, + "9": 10.8615, + "10": 10.82586, + "11": 10.86811, + "12": 10.85685, + "13": 10.87827, + "14": 10.86894, + "15": 10.85888, + "16": 10.8685, + "17": 10.85105, + "18": 10.85939, + "19": 10.85704, + "20": 10.84526, + "21": 10.85808, + "22": 10.83215, + "23": 10.86717, + "24": 10.83773, + "25": 10.82744, + "26": 10.83163, + "27": 10.83573, + "28": 10.82373, + "29": 10.81624, + "30": 10.76486, + "31": 10.69044, + "32": 10.76257, + "33": 10.75455, + "34": 10.67733, + "35": 10.66335, + "36": 10.63634, + "37": 10.66856, + "38": 10.5969, + "39": 10.67599, + "40": 10.50898, + "41": 10.53945, + "42": 10.55263, + "43": 10.35003, + "44": 10.40418, + "45": 10.32106, + "46": 10.27724, + "47": 10.45205, + "48": 10.28913, + "49": 10.05779, + "50": 10.27777, + "51": 10.23471, + "52": 10.13764, + "53": 10.34797, + "54": 10.26738, + "55": 10.20734, + "56": 9.99527, + "57": 9.89333, + "58": 10.13452, + "59": 9.92856, + "60": 9.8551, + "61": 9.98264, + "62": 10.20686, + "63": 9.70842, + "64": 10.01687, + "65": 9.30409, + "66": 9.93326, + "67": 9.62677, + "68": 9.98429, + "69": 9.9755, + "70": 9.93956, + "71": 9.81005, + "72": 9.798, + "73": 9.68454, + "74": 9.19951, + "75": 9.60518, + "76": 9.27791, + "77": 10.19437, + "78": 9.8671, + "79": 9.53341, + "80": 9.56341, + "81": 9.63047, + "82": 9.82819, + "83": 9.46388, + "84": 9.53736, + "85": 9.74561, + "86": 9.21332, + "87": 9.7014, + "88": 9.86621, + "89": 9.72242, + "90": 9.92089, + "91": 9.47178, + "92": 9.46996, + "93": 9.20589, + "94": 8.94772, + "95": 9.60815, + "96": 9.63635, + "97": 9.4138, + "98": 9.77274, + "99": 8.9958, + "100": 9.50415 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465368064.0, + "2": 1465368576.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.78805, + "2": 0.23224, + "3": 0.20783, + "4": 0.21971, + "5": 0.22246, + "6": 0.23346, + "7": 0.21626, + "8": 0.20597, + "9": 0.2043, + "10": 0.20681, + "11": 0.20511, + "12": 0.20484, + "13": 0.21351, + "14": 0.20446, + "15": 0.21063, + "16": 0.28338, + "17": 0.21017, + "18": 0.21577, + "19": 0.21852, + "20": 0.23072, + "21": 0.25974, + "22": 0.21717, + "23": 0.22548, + "24": 0.21878, + "25": 0.21448, + "26": 0.21416, + "27": 0.22357, + "28": 0.21645, + "29": 0.21325, + "30": 0.21465, + "31": 0.21452, + "32": 0.21608, + "33": 0.23531, + "34": 0.227, + "35": 0.2188, + "36": 0.21248, + "37": 0.21694, + "38": 0.21269, + "39": 0.22285, + "40": 0.21458, + "41": 0.2134, + "42": 0.21991, + "43": 0.21621, + "44": 0.21422, + "45": 0.21339, + "46": 0.21332, + "47": 0.21892, + "48": 0.21384, + "49": 0.21668, + "50": 0.21806, + "51": 0.21958, + "52": 0.2173, + "53": 0.21642, + "54": 0.22157, + "55": 0.21549, + "56": 0.21528, + "57": 0.21789, + "58": 0.21634, + "59": 0.21649, + "60": 0.2141, + "61": 0.21447, + "62": 0.21596, + "63": 0.21545, + "64": 0.22145, + "65": 0.21603, + "66": 0.21504, + "67": 0.21551, + "68": 0.21918, + "69": 0.21831, + "70": 0.21943, + "71": 0.21537, + "72": 0.21937, + "73": 0.21783, + "74": 0.2246, + "75": 0.22031, + "76": 0.23249, + "77": 0.21862, + "78": 0.21663, + "79": 0.21806, + "80": 0.21694, + "81": 0.21684, + "82": 0.21559, + "83": 0.21877, + "84": 0.2151, + "85": 0.21819, + "86": 0.2167, + "87": 0.21768, + "88": 0.21415, + "89": 0.21694, + "90": 0.21444, + "91": 0.21616, + "92": 0.21967, + "93": 0.21672, + "94": 0.21699, + "95": 0.21892, + "96": 0.21871, + "97": 0.21805, + "98": 0.21674, + "99": 0.21639, + "100": 0.21581 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2365.0, + "17": "nan", + "18": 2331.0, + "19": 2912.0, + "20": 1664.0, + "21": 2009.0, + "22": "nan", + "23": 2483.0, + "24": 2192.0, + "25": 2290.0, + "26": 1916.0, + "27": 2020.0, + "28": 2503.0, + "29": 2379.0, + "30": 2400.0, + "31": 1759.0, + "32": 2522.0, + "33": 2145.0, + "34": 1791.0, + "35": 1777.0, + "36": 2100.0, + "37": 2396.0, + "38": 2040.0, + "39": 2983.0, + "40": 1805.0, + "41": 3097.0, + "42": 2421.0, + "43": 2566.0, + "44": 1858.0, + "45": 2371.0, + "46": 2140.0, + "47": 2603.0, + "48": 2358.0, + "49": 1739.0, + "50": 2686.0, + "51": 2041.0, + "52": 2226.0, + "53": 3222.0, + "54": 2784.0, + "55": 2290.0, + "56": 2428.0, + "57": 2146.0, + "58": 3048.0, + "59": 2504.0, + "60": 2612.0, + "61": 2623.0, + "62": 3003.0, + "63": 2762.0, + "64": 2917.0, + "65": 2104.0, + "66": 3550.0, + "67": 2433.0, + "68": 3146.0, + "69": 2877.0, + "70": 3528.0, + "71": 2983.0, + "72": 2640.0, + "73": 3199.0, + "74": 2084.0, + "75": 2809.0, + "76": 3599.0, + "77": 3667.0, + "78": 3680.0, + "79": 3972.0, + "80": 3365.0, + "81": 5042.0, + "82": 3291.0, + "83": 3016.0, + "84": 3592.0, + "85": 3792.0, + "86": 3192.0, + "87": 4219.0, + "88": 3376.0, + "89": 4110.0, + "90": 3939.0, + "91": 2912.0, + "92": 4114.0, + "93": 3499.0, + "94": 4339.0, + "95": 3829.0, + "96": 3875.0, + "97": 4100.0, + "98": 4889.0, + "99": 3771.0, + "100": 3390.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..d7593924d14 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86104, + "2": 10.85751, + "3": 10.86157, + "4": 10.84944, + "5": 10.88371, + "6": 10.88763, + "7": 10.86427, + "8": 10.87317, + "9": 10.86952, + "10": 10.84263, + "11": 10.88626, + "12": 10.88784, + "13": 10.89496, + "14": 10.90319, + "15": 10.87935, + "16": 10.88588, + "17": 10.86428, + "18": 10.88923, + "19": 10.88151, + "20": 10.87405, + "21": 10.88996, + "22": 10.83151, + "23": 10.89289, + "24": 10.85821, + "25": 10.82867, + "26": 10.82729, + "27": 10.85428, + "28": 10.84631, + "29": 10.85408, + "30": 10.77191, + "31": 10.67404, + "32": 10.78923, + "33": 10.7757, + "34": 10.67639, + "35": 10.67622, + "36": 10.63402, + "37": 10.69312, + "38": 10.61026, + "39": 10.70232, + "40": 10.517, + "41": 10.54604, + "42": 10.57058, + "43": 10.32305, + "44": 10.39205, + "45": 10.28436, + "46": 10.27329, + "47": 10.4798, + "48": 10.25535, + "49": 10.01605, + "50": 10.27861, + "51": 10.21825, + "52": 10.1281, + "53": 10.35922, + "54": 10.25909, + "55": 10.20112, + "56": 9.9815, + "57": 9.84915, + "58": 10.12333, + "59": 9.90734, + "60": 9.83306, + "61": 9.97107, + "62": 10.22132, + "63": 9.6767, + "64": 10.01779, + "65": 9.26979, + "66": 9.9402, + "67": 9.62874, + "68": 9.9875, + "69": 9.98441, + "70": 9.92662, + "71": 9.80996, + "72": 9.79208, + "73": 9.68101, + "74": 9.18023, + "75": 9.61385, + "76": 9.28826, + "77": 10.19395, + "78": 9.87453, + "79": 9.52966, + "80": 9.56419, + "81": 9.63453, + "82": 9.82245, + "83": 9.47207, + "84": 9.54654, + "85": 9.74319, + "86": 9.2009, + "87": 9.70113, + "88": 9.86518, + "89": 9.7307, + "90": 9.92148, + "91": 9.4869, + "92": 9.47682, + "93": 9.2135, + "94": 8.94897, + "95": 9.6163, + "96": 9.63416, + "97": 9.41229, + "98": 9.77615, + "99": 9.00251, + "100": 9.5087 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1464319488.0, + "2": 1465368064.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597089792.0, + "18": 1597089792.0, + "19": 1597089792.0, + "20": 1597089792.0, + "21": 1597089792.0, + "22": 1597089792.0, + "23": 1597089792.0, + "24": 1597089792.0, + "25": 1597089792.0, + "26": 1597089792.0, + "27": 1597089792.0, + "28": 1597089792.0, + "29": 1597089792.0, + "30": 1597089792.0, + "31": 1597089792.0, + "32": 1597089792.0, + "33": 1597089792.0, + "34": 1597089792.0, + "35": 1597089792.0, + "36": 1597089792.0, + "37": 1597089792.0, + "38": 1597089792.0, + "39": 1597089792.0, + "40": 1597089792.0, + "41": 1597089792.0, + "42": 1597089792.0, + "43": 1597089792.0, + "44": 1597089792.0, + "45": 1597089792.0, + "46": 1597089792.0, + "47": 1597089792.0, + "48": 1597089792.0, + "49": 1597089792.0, + "50": 1597089792.0, + "51": 1597089792.0, + "52": 1597089792.0, + "53": 1597089792.0, + "54": 1597089792.0, + "55": 1597089792.0, + "56": 1597089792.0, + "57": 1597089792.0, + "58": 1597089792.0, + "59": 1597089792.0, + "60": 1597089792.0, + "61": 1597089792.0, + "62": 1597089792.0, + "63": 1597089792.0, + "64": 1597089792.0, + "65": 1597089792.0, + "66": 1597089792.0, + "67": 1597091328.0, + "68": 1597091328.0, + "69": 1597091328.0, + "70": 1597091328.0, + "71": 1597091328.0, + "72": 1597091328.0, + "73": 1597091328.0, + "74": 1597091328.0, + "75": 1597091328.0, + "76": 1597091328.0, + "77": 1597091328.0, + "78": 1597091328.0, + "79": 1597091328.0, + "80": 1597091328.0, + "81": 1597091328.0, + "82": 1597091328.0, + "83": 1597091328.0, + "84": 1597091328.0, + "85": 1597091328.0, + "86": 1597091328.0, + "87": 1597091328.0, + "88": 1597091328.0, + "89": 1597091840.0, + "90": 1597091840.0, + "91": 1597091840.0, + "92": 1597091840.0, + "93": 1597091840.0, + "94": 1597091840.0, + "95": 1597091840.0, + "96": 1597091840.0, + "97": 1597091840.0, + "98": 1597091840.0, + "99": 1597091840.0, + "100": 1597091840.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.86459, + "2": 0.35839, + "3": 0.5214, + "4": 0.31404, + "5": 0.31247, + "6": 0.30997, + "7": 0.30873, + "8": 0.49835, + "9": 0.30592, + "10": 0.30506, + "11": 0.30662, + "12": 0.30928, + "13": 0.30537, + "14": 0.30594, + "15": 0.30802, + "16": 0.43126, + "17": 0.30967, + "18": 0.53614, + "19": 0.64808, + "20": 0.31719, + "21": 0.31628, + "22": 0.30781, + "23": 0.32412, + "24": 0.31672, + "25": 0.32015, + "26": 0.31659, + "27": 0.31615, + "28": 0.3194, + "29": 0.32624, + "30": 0.31611, + "31": 0.32028, + "32": 0.33615, + "33": 0.31587, + "34": 0.31903, + "35": 0.33274, + "36": 0.3171, + "37": 0.31597, + "38": 0.32394, + "39": 0.316, + "40": 0.31757, + "41": 0.32645, + "42": 0.32417, + "43": 0.31631, + "44": 0.32431, + "45": 0.31726, + "46": 0.31727, + "47": 0.32304, + "48": 0.32395, + "49": 0.31889, + "50": 0.31989, + "51": 0.32325, + "52": 0.31611, + "53": 0.31629, + "54": 0.32342, + "55": 0.31477, + "56": 0.31566, + "57": 0.32276, + "58": 0.31546, + "59": 0.31489, + "60": 0.31909, + "61": 0.32058, + "62": 0.31567, + "63": 0.31971, + "64": 0.32041, + "65": 0.31499, + "66": 0.3179, + "67": 0.32106, + "68": 0.31511, + "69": 0.31464, + "70": 0.32289, + "71": 0.31535, + "72": 0.3155, + "73": 0.32255, + "74": 0.31506, + "75": 0.3148, + "76": 0.32238, + "77": 0.31466, + "78": 0.31532, + "79": 0.32059, + "80": 0.31659, + "81": 0.31482, + "82": 0.31978, + "83": 0.31945, + "84": 0.31576, + "85": 0.31726, + "86": 0.32066, + "87": 0.31517, + "88": 0.31517, + "89": 0.32561, + "90": 0.3153, + "91": 0.31485, + "92": 0.32199, + "93": 0.31486, + "94": 0.31701, + "95": 0.32449, + "96": 0.3188, + "97": 0.31788, + "98": 0.32439, + "99": 0.31804, + "100": 0.31798 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2261.0, + "17": "nan", + "18": 2424.0, + "19": 2800.0, + "20": 1777.0, + "21": 2046.0, + "22": "nan", + "23": 2489.0, + "24": 2136.0, + "25": 2124.0, + "26": 1902.0, + "27": 2006.0, + "28": 2337.0, + "29": 2425.0, + "30": 2262.0, + "31": 1584.0, + "32": 2470.0, + "33": 2074.0, + "34": 1679.0, + "35": 1763.0, + "36": 1918.0, + "37": 2542.0, + "38": 2195.0, + "39": 3045.0, + "40": 1875.0, + "41": 3199.0, + "42": 2508.0, + "43": 2563.0, + "44": 1898.0, + "45": 2434.0, + "46": 2065.0, + "47": 2739.0, + "48": 2291.0, + "49": 1821.0, + "50": 2634.0, + "51": 2172.0, + "52": 2278.0, + "53": 3531.0, + "54": 2662.0, + "55": 2383.0, + "56": 2480.0, + "57": 2136.0, + "58": 3305.0, + "59": 2485.0, + "60": 2832.0, + "61": 2847.0, + "62": 2841.0, + "63": 2867.0, + "64": 3107.0, + "65": 2223.0, + "66": 3682.0, + "67": 2533.0, + "68": 3137.0, + "69": 2650.0, + "70": 3836.0, + "71": 2945.0, + "72": 2727.0, + "73": 3322.0, + "74": 2186.0, + "75": 2913.0, + "76": 3553.0, + "77": 3629.0, + "78": 3871.0, + "79": 4097.0, + "80": 3398.0, + "81": 5006.0, + "82": 3345.0, + "83": 3174.0, + "84": 3718.0, + "85": 3618.0, + "86": 3181.0, + "87": 3995.0, + "88": 3634.0, + "89": 4250.0, + "90": 3676.0, + "91": 2926.0, + "92": 4446.0, + "93": 3780.0, + "94": 4430.0, + "95": 4082.0, + "96": 3952.0, + "97": 4117.0, + "98": 5049.0, + "99": 4122.0, + "100": 3502.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..7a89171c0cd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86104, + "2": 10.85751, + "3": 10.86157, + "4": 10.84944, + "5": 10.88371, + "6": 10.88763, + "7": 10.86427, + "8": 10.87317, + "9": 10.86952, + "10": 10.84263, + "11": 10.88626, + "12": 10.88784, + "13": 10.89496, + "14": 10.90319, + "15": 10.87935, + "16": 10.88588, + "17": 10.86428, + "18": 10.88923, + "19": 10.88151, + "20": 10.87405, + "21": 10.88996, + "22": 10.83151, + "23": 10.89289, + "24": 10.85821, + "25": 10.82867, + "26": 10.82729, + "27": 10.85428, + "28": 10.84631, + "29": 10.85408, + "30": 10.77191, + "31": 10.67404, + "32": 10.78923, + "33": 10.7757, + "34": 10.67639, + "35": 10.67622, + "36": 10.63402, + "37": 10.69312, + "38": 10.61026, + "39": 10.70232, + "40": 10.517, + "41": 10.54604, + "42": 10.57058, + "43": 10.32305, + "44": 10.39205, + "45": 10.28436, + "46": 10.27329, + "47": 10.4798, + "48": 10.25535, + "49": 10.01605, + "50": 10.27861, + "51": 10.21825, + "52": 10.1281, + "53": 10.35922, + "54": 10.25909, + "55": 10.20112, + "56": 9.9815, + "57": 9.84915, + "58": 10.12333, + "59": 9.90734, + "60": 9.83306, + "61": 9.97107, + "62": 10.22132, + "63": 9.6767, + "64": 10.01779, + "65": 9.26979, + "66": 9.9402, + "67": 9.62874, + "68": 9.9875, + "69": 9.98441, + "70": 9.92662, + "71": 9.80996, + "72": 9.79208, + "73": 9.68101, + "74": 9.18023, + "75": 9.61385, + "76": 9.28826, + "77": 10.19395, + "78": 9.87453, + "79": 9.52966, + "80": 9.56419, + "81": 9.63453, + "82": 9.82245, + "83": 9.47207, + "84": 9.54654, + "85": 9.74319, + "86": 9.2009, + "87": 9.70113, + "88": 9.86518, + "89": 9.7307, + "90": 9.92148, + "91": 9.4869, + "92": 9.47682, + "93": 9.2135, + "94": 8.94897, + "95": 9.6163, + "96": 9.63416, + "97": 9.41229, + "98": 9.77615, + "99": 9.00251, + "100": 9.5087 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 284527616.0, + "2": 284527616.0, + "3": 284527616.0, + "4": 284527616.0, + "5": 284527616.0, + "6": 284527616.0, + "7": 284527616.0, + "8": 284527616.0, + "9": 284527616.0, + "10": 284527616.0, + "11": 284527616.0, + "12": 284527616.0, + "13": 284527616.0, + "14": 284527616.0, + "15": 284527616.0, + "16": 416513536.0, + "17": 416513536.0, + "18": 416513536.0, + "19": 416513536.0, + "20": 416513536.0, + "21": 416513536.0, + "22": 416513536.0, + "23": 416513536.0, + "24": 416513536.0, + "25": 416513536.0, + "26": 416513536.0, + "27": 416513536.0, + "28": 416513536.0, + "29": 416513536.0, + "30": 416513536.0, + "31": 416513536.0, + "32": 416513536.0, + "33": 416513536.0, + "34": 416513536.0, + "35": 416513536.0, + "36": 416513536.0, + "37": 416513536.0, + "38": 416513536.0, + "39": 416513536.0, + "40": 416513536.0, + "41": 416513536.0, + "42": 416513536.0, + "43": 416513536.0, + "44": 416513536.0, + "45": 416513536.0, + "46": 416513536.0, + "47": 416513536.0, + "48": 416513536.0, + "49": 416513536.0, + "50": 416513536.0, + "51": 416513536.0, + "52": 416513536.0, + "53": 416513536.0, + "54": 416513536.0, + "55": 416513536.0, + "56": 416513536.0, + "57": 416513536.0, + "58": 416513536.0, + "59": 416513536.0, + "60": 416513536.0, + "61": 416513536.0, + "62": 416513536.0, + "63": 416513536.0, + "64": 416513536.0, + "65": 416513536.0, + "66": 416513536.0, + "67": 416513536.0, + "68": 416513536.0, + "69": 416513536.0, + "70": 416513536.0, + "71": 416513536.0, + "72": 416513536.0, + "73": 416513536.0, + "74": 416513536.0, + "75": 416513536.0, + "76": 416513536.0, + "77": 416513536.0, + "78": 416513536.0, + "79": 416513536.0, + "80": 416513536.0, + "81": 416513536.0, + "82": 416513536.0, + "83": 416513536.0, + "84": 416513536.0, + "85": 416513536.0, + "86": 416513536.0, + "87": 416513536.0, + "88": 416513536.0, + "89": 416513536.0, + "90": 416513536.0, + "91": 416513536.0, + "92": 416513536.0, + "93": 416513536.0, + "94": 416513536.0, + "95": 416513536.0, + "96": 416513536.0, + "97": 416513536.0, + "98": 416513536.0, + "99": 416513536.0, + "100": 416513536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1465368064.0, + "2": 1465368064.0, + "3": 1465368576.0, + "4": 1465368576.0, + "5": 1465368576.0, + "6": 1465368576.0, + "7": 1465368576.0, + "8": 1465368576.0, + "9": 1465368576.0, + "10": 1465368576.0, + "11": 1465368576.0, + "12": 1465368576.0, + "13": 1465368576.0, + "14": 1465368576.0, + "15": 1465368576.0, + "16": 1465368576.0, + "17": 1597092352.0, + "18": 1597092352.0, + "19": 1597092352.0, + "20": 1597092352.0, + "21": 1597092352.0, + "22": 1597092352.0, + "23": 1597092352.0, + "24": 1597092352.0, + "25": 1597092352.0, + "26": 1597092352.0, + "27": 1597092352.0, + "28": 1597092352.0, + "29": 1597092352.0, + "30": 1597092352.0, + "31": 1597092352.0, + "32": 1597092352.0, + "33": 1597092352.0, + "34": 1597092352.0, + "35": 1597092352.0, + "36": 1597092352.0, + "37": 1597092352.0, + "38": 1597092352.0, + "39": 1597092352.0, + "40": 1597092352.0, + "41": 1597092352.0, + "42": 1597092352.0, + "43": 1597092352.0, + "44": 1597092352.0, + "45": 1597092352.0, + "46": 1597092352.0, + "47": 1597092352.0, + "48": 1597092352.0, + "49": 1597092352.0, + "50": 1597092352.0, + "51": 1597092352.0, + "52": 1597092352.0, + "53": 1597092352.0, + "54": 1597092352.0, + "55": 1597092352.0, + "56": 1597092352.0, + "57": 1597092352.0, + "58": 1597092352.0, + "59": 1597092352.0, + "60": 1597092352.0, + "61": 1597092352.0, + "62": 1597092352.0, + "63": 1597092352.0, + "64": 1597092352.0, + "65": 1597092352.0, + "66": 1597092352.0, + "67": 1597092352.0, + "68": 1597092352.0, + "69": 1597092352.0, + "70": 1597092352.0, + "71": 1597092352.0, + "72": 1597092352.0, + "73": 1597092352.0, + "74": 1597092352.0, + "75": 1597092352.0, + "76": 1597092352.0, + "77": 1597092352.0, + "78": 1597092352.0, + "79": 1597092352.0, + "80": 1597092352.0, + "81": 1597092352.0, + "82": 1597092352.0, + "83": 1597092352.0, + "84": 1597092352.0, + "85": 1597092352.0, + "86": 1597092352.0, + "87": 1597092352.0, + "88": 1597092352.0, + "89": 1597092352.0, + "90": 1597092352.0, + "91": 1597092352.0, + "92": 1597092352.0, + "93": 1597092352.0, + "94": 1597092352.0, + "95": 1597092352.0, + "96": 1597092352.0, + "97": 1597092352.0, + "98": 1597092352.0, + "99": 1597092352.0, + "100": 1597092352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.35929, + "2": 0.34184, + "3": 0.31203, + "4": 0.30787, + "5": 0.30852, + "6": 0.30384, + "7": 0.30155, + "8": 0.30427, + "9": 0.51091, + "10": 0.30553, + "11": 0.30575, + "12": 0.61502, + "13": 0.30643, + "14": 0.57901, + "15": 0.52637, + "16": 0.42283, + "17": 0.30589, + "18": 0.32011, + "19": 0.31661, + "20": 0.31932, + "21": 0.32147, + "22": 0.48024, + "23": 0.32123, + "24": 0.32199, + "25": 0.6542, + "26": 0.65941, + "27": 0.31987, + "28": 0.32071, + "29": 0.31705, + "30": 0.3217, + "31": 0.32104, + "32": 0.31733, + "33": 0.31859, + "34": 0.32143, + "35": 0.31823, + "36": 0.31764, + "37": 0.32459, + "38": 0.31791, + "39": 0.31709, + "40": 0.3224, + "41": 0.32157, + "42": 0.31769, + "43": 0.32161, + "44": 0.32202, + "45": 0.31808, + "46": 0.32115, + "47": 0.32215, + "48": 0.31811, + "49": 0.32081, + "50": 0.3219, + "51": 0.32586, + "52": 0.32097, + "53": 0.32086, + "54": 0.31965, + "55": 0.32299, + "56": 0.32057, + "57": 0.31894, + "58": 0.3227, + "59": 0.31818, + "60": 0.31815, + "61": 0.32331, + "62": 0.31818, + "63": 0.31777, + "64": 0.32493, + "65": 0.31806, + "66": 0.31829, + "67": 0.32281, + "68": 0.31721, + "69": 0.31771, + "70": 0.323, + "71": 0.31739, + "72": 0.31848, + "73": 0.31915, + "74": 0.3218, + "75": 0.31772, + "76": 0.31789, + "77": 0.32187, + "78": 0.31771, + "79": 0.3183, + "80": 0.32385, + "81": 0.31791, + "82": 0.31794, + "83": 0.32606, + "84": 0.31846, + "85": 0.31748, + "86": 0.32559, + "87": 0.31829, + "88": 0.31805, + "89": 0.32163, + "90": 0.31834, + "91": 0.31753, + "92": 0.32249, + "93": 0.3175, + "94": 0.31731, + "95": 0.31891, + "96": 0.31986, + "97": 0.31789, + "98": 0.31909, + "99": 0.32353, + "100": 0.31768 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": "nan", + "2": "nan", + "3": "nan", + "4": "nan", + "5": "nan", + "6": "nan", + "7": "nan", + "8": "nan", + "9": "nan", + "10": "nan", + "11": "nan", + "12": "nan", + "13": "nan", + "14": "nan", + "15": "nan", + "16": 2261.0, + "17": "nan", + "18": 2424.0, + "19": 2800.0, + "20": 1777.0, + "21": 2046.0, + "22": "nan", + "23": 2489.0, + "24": 2136.0, + "25": 2124.0, + "26": 1902.0, + "27": 2006.0, + "28": 2337.0, + "29": 2425.0, + "30": 2262.0, + "31": 1584.0, + "32": 2470.0, + "33": 2074.0, + "34": 1679.0, + "35": 1763.0, + "36": 1918.0, + "37": 2542.0, + "38": 2195.0, + "39": 3045.0, + "40": 1875.0, + "41": 3199.0, + "42": 2508.0, + "43": 2563.0, + "44": 1898.0, + "45": 2434.0, + "46": 2065.0, + "47": 2739.0, + "48": 2291.0, + "49": 1821.0, + "50": 2634.0, + "51": 2172.0, + "52": 2278.0, + "53": 3531.0, + "54": 2662.0, + "55": 2383.0, + "56": 2480.0, + "57": 2136.0, + "58": 3305.0, + "59": 2485.0, + "60": 2832.0, + "61": 2847.0, + "62": 2841.0, + "63": 2867.0, + "64": 3107.0, + "65": 2223.0, + "66": 3682.0, + "67": 2533.0, + "68": 3137.0, + "69": 2650.0, + "70": 3836.0, + "71": 2945.0, + "72": 2727.0, + "73": 3322.0, + "74": 2186.0, + "75": 2913.0, + "76": 3553.0, + "77": 3629.0, + "78": 3871.0, + "79": 4097.0, + "80": 3398.0, + "81": 5006.0, + "82": 3345.0, + "83": 3174.0, + "84": 3718.0, + "85": 3618.0, + "86": 3181.0, + "87": 3995.0, + "88": 3634.0, + "89": 4250.0, + "90": 3676.0, + "91": 2926.0, + "92": 4446.0, + "93": 3780.0, + "94": 4430.0, + "95": 4082.0, + "96": 3952.0, + "97": 4117.0, + "98": 5049.0, + "99": 4122.0, + "100": 3502.0 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..3aad045fc8e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 12.58569, + "2": 12.58406, + "3": 12.58486, + "4": 12.58642, + "5": 12.58279, + "6": 12.57912, + "7": 12.56177, + "8": 12.52304, + "9": 12.4966, + "10": 12.4826, + "11": 12.31462, + "12": 12.272, + "13": 12.20924, + "14": 12.20094, + "15": 11.79651, + "16": 11.78035, + "17": 11.74188, + "18": 11.71656, + "19": 11.59074, + "20": 11.47672, + "21": 11.23784, + "22": 11.3586, + "23": 11.25768, + "24": 11.14081, + "25": 10.97989 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 521035392.0, + "2": 521662624.0, + "3": 520932992.0, + "4": 521225120.0, + "5": 520993600.0, + "6": 521369824.0, + "7": 521417344.0, + "8": 521054784.0, + "9": 521458592.0, + "10": 521175520.0, + "11": 522277376.0, + "12": 521435904.0, + "13": 521472640.0, + "14": 522442496.0, + "15": 521589568.0, + "16": 521414080.0, + "17": 521025696.0, + "18": 521279168.0, + "19": 521154400.0, + "20": 521132352.0, + "21": 522909696.0, + "22": 521591904.0, + "23": 521353504.0, + "24": 521426496.0, + "25": 523547008.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 24540168192.0, + "2": 24540168192.0, + "3": 24540168192.0, + "4": 24540168192.0, + "5": 24540168192.0, + "6": 24540168192.0, + "7": 24540168192.0, + "8": 24540168192.0, + "9": 24540168192.0, + "10": 24540168192.0, + "11": 24540168192.0, + "12": 24540168192.0, + "13": 24540168192.0, + "14": 24540168192.0, + "15": 24540168192.0, + "16": 24540168192.0, + "17": 24540168192.0, + "18": 24540168192.0, + "19": 24540168192.0, + "20": 24540168192.0, + "21": 24540168192.0, + "22": 24540168192.0, + "23": 24540168192.0, + "24": 24540168192.0, + "25": 24540168192.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 52730810368.0, + "2": 60518309888.0, + "3": 60518309888.0, + "4": 60518309888.0, + "5": 60518309888.0, + "6": 60518309888.0, + "7": 60518309888.0, + "8": 60518309888.0, + "9": 60518309888.0, + "10": 60518309888.0, + "11": 60518309888.0, + "12": 60518309888.0, + "13": 60518309888.0, + "14": 60518309888.0, + "15": 60518309888.0, + "16": 60518309888.0, + "17": 60518309888.0, + "18": 60518309888.0, + "19": 60518309888.0, + "20": 60518309888.0, + "21": 60518309888.0, + "22": 60518309888.0, + "23": 60518309888.0, + "24": 60518309888.0, + "25": 60518309888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": "nan", + "2": 9.35391, + "3": "nan", + "4": 1.17482, + "5": "nan", + "6": 1.17131, + "7": "nan", + "8": 1.17328, + "9": "nan", + "10": 1.17214, + "11": "nan", + "12": 1.17467, + "13": "nan", + "14": 1.17439, + "15": "nan", + "16": 1.17582, + "17": "nan", + "18": 1.1764, + "19": "nan", + "20": 1.17744, + "21": "nan", + "22": 1.17439, + "23": "nan", + "24": 1.17461, + "25": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..478f889b21c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 12.58569, + "2": 12.58406, + "3": 12.58486, + "4": 12.58642, + "5": 12.58279, + "6": 12.57912, + "7": 12.56177, + "8": 12.52304, + "9": 12.4966, + "10": 12.4826, + "11": 12.31462, + "12": 12.272, + "13": 12.20924, + "14": 12.20094, + "15": 11.79651, + "16": 11.78035, + "17": 11.74188, + "18": 11.71656, + "19": 11.59074, + "20": 11.47672, + "21": 11.23784, + "22": 11.3586, + "23": 11.25768, + "24": 11.14081, + "25": 10.97989 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 521035392.0, + "2": 521662624.0, + "3": 520932992.0, + "4": 521225120.0, + "5": 520993600.0, + "6": 521369824.0, + "7": 521417344.0, + "8": 521054784.0, + "9": 521458592.0, + "10": 521175520.0, + "11": 522277376.0, + "12": 521435904.0, + "13": 521472640.0, + "14": 522442496.0, + "15": 521589568.0, + "16": 521414080.0, + "17": 521025696.0, + "18": 521279168.0, + "19": 521154400.0, + "20": 521132352.0, + "21": 522909696.0, + "22": 521591904.0, + "23": 521353504.0, + "24": 521426496.0, + "25": 523547008.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 24540168192.0, + "2": 24540168192.0, + "3": 24540168192.0, + "4": 24540168192.0, + "5": 24540168192.0, + "6": 24540168192.0, + "7": 24540168192.0, + "8": 24540168192.0, + "9": 24540168192.0, + "10": 24540168192.0, + "11": 24540168192.0, + "12": 24540168192.0, + "13": 24540168192.0, + "14": 24540168192.0, + "15": 24540168192.0, + "16": 24540168192.0, + "17": 24540168192.0, + "18": 24540168192.0, + "19": 24540168192.0, + "20": 24540168192.0, + "21": 24540389376.0, + "22": 24540168192.0, + "23": 24540168192.0, + "24": 24540168192.0, + "25": 24540168192.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 52730810368.0, + "2": 60518424576.0, + "3": 60518424576.0, + "4": 60518424576.0, + "5": 60518424576.0, + "6": 60518424576.0, + "7": 60518424576.0, + "8": 60518424576.0, + "9": 60518424576.0, + "10": 60518424576.0, + "11": 60518424576.0, + "12": 60518424576.0, + "13": 60518424576.0, + "14": 60518424576.0, + "15": 60518424576.0, + "16": 60518424576.0, + "17": 60518424576.0, + "18": 60518424576.0, + "19": 60518424576.0, + "20": 60518424576.0, + "21": 60518424576.0, + "22": 60518424576.0, + "23": 60518424576.0, + "24": 60518424576.0, + "25": 60518424576.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": "nan", + "2": 10.03336, + "3": "nan", + "4": 1.18525, + "5": "nan", + "6": 1.18158, + "7": "nan", + "8": 1.18536, + "9": "nan", + "10": 1.18428, + "11": "nan", + "12": 1.18625, + "13": "nan", + "14": 1.18256, + "15": "nan", + "16": 1.18023, + "17": "nan", + "18": 1.18227, + "19": "nan", + "20": 1.18284, + "21": "nan", + "22": 1.18238, + "23": "nan", + "24": 1.18151, + "25": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..a059e81b488 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 12.59715, + "2": 12.60067, + "3": 12.59727, + "4": 12.60021, + "5": 12.59013, + "6": 12.58834, + "7": 12.57605, + "8": 12.5362, + "9": 12.50745, + "10": 12.49091, + "11": 12.32614, + "12": 12.29366, + "13": 12.22589, + "14": 12.23023, + "15": 11.82108, + "16": 11.80586, + "17": 11.77001, + "18": 11.74946, + "19": 11.62189, + "20": 11.51704, + "21": 11.27121, + "22": 11.38966, + "23": 11.29559, + "24": 11.16591, + "25": 11.00354 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 523037536.0, + "2": 523668064.0, + "3": 522933056.0, + "4": 523228480.0, + "5": 523009792.0, + "6": 523364320.0, + "7": 523427840.0, + "8": 523074688.0, + "9": 523459232.0, + "10": 523184992.0, + "11": 524288736.0, + "12": 523447712.0, + "13": 523490112.0, + "14": 524476096.0, + "15": 523630496.0, + "16": 523459232.0, + "17": 523075936.0, + "18": 523360192.0, + "19": 523206816.0, + "20": 523230848.0, + "21": 524941248.0, + "22": 523654464.0, + "23": 523420576.0, + "24": 523494720.0, + "25": 525638016.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 20663519232.0, + "2": 20663519232.0, + "3": 20663519232.0, + "4": 20663519232.0, + "5": 20663519232.0, + "6": 20663519232.0, + "7": 20663519232.0, + "8": 20663519232.0, + "9": 20663519232.0, + "10": 20663519232.0, + "11": 20663519232.0, + "12": 20663519232.0, + "13": 20663519232.0, + "14": 20663519232.0, + "15": 20663519232.0, + "16": 20663519232.0, + "17": 20663519232.0, + "18": 20663519232.0, + "19": 20663519232.0, + "20": 20663519232.0, + "21": 20663519232.0, + "22": 20663519232.0, + "23": 20663519232.0, + "24": 20663519232.0, + "25": 20663519232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 50289545216.0, + "2": 57143791616.0, + "3": 57143791616.0, + "4": 57143791616.0, + "5": 57143791616.0, + "6": 57143791616.0, + "7": 57143791616.0, + "8": 57143791616.0, + "9": 57143791616.0, + "10": 57143791616.0, + "11": 57143791616.0, + "12": 57143791616.0, + "13": 57143791616.0, + "14": 57143791616.0, + "15": 57143791616.0, + "16": 57143791616.0, + "17": 57143791616.0, + "18": 57143791616.0, + "19": 57143791616.0, + "20": 57143791616.0, + "21": 57143791616.0, + "22": 57143791616.0, + "23": 57143791616.0, + "24": 57143791616.0, + "25": 57143791616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": "nan", + "2": 6.55725, + "3": "nan", + "4": 1.12211, + "5": "nan", + "6": 1.11783, + "7": "nan", + "8": 1.11727, + "9": "nan", + "10": 1.1176, + "11": "nan", + "12": 1.11841, + "13": "nan", + "14": 1.11918, + "15": "nan", + "16": 1.12025, + "17": "nan", + "18": 1.11888, + "19": "nan", + "20": 1.12, + "21": "nan", + "22": 1.11939, + "23": "nan", + "24": 1.11949, + "25": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..0847af86737 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 12.59715, + "2": 12.60067, + "3": 12.59727, + "4": 12.60021, + "5": 12.59013, + "6": 12.58834, + "7": 12.57605, + "8": 12.5362, + "9": 12.50745, + "10": 12.49091, + "11": 12.32614, + "12": 12.29366, + "13": 12.22589, + "14": 12.23023, + "15": 11.82108, + "16": 11.80586, + "17": 11.77001, + "18": 11.74946, + "19": 11.62189, + "20": 11.51704, + "21": 11.27121, + "22": 11.38966, + "23": 11.29559, + "24": 11.16591, + "25": 11.00354 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 523037536.0, + "2": 523668064.0, + "3": 522933056.0, + "4": 523228480.0, + "5": 523009792.0, + "6": 523364320.0, + "7": 523427840.0, + "8": 523074688.0, + "9": 523459232.0, + "10": 523184992.0, + "11": 524288736.0, + "12": 523447712.0, + "13": 523490112.0, + "14": 524476096.0, + "15": 523630496.0, + "16": 523459232.0, + "17": 523075936.0, + "18": 523360192.0, + "19": 523206816.0, + "20": 523230848.0, + "21": 524941248.0, + "22": 523654464.0, + "23": 523420576.0, + "24": 523494720.0, + "25": 525638016.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 20663519232.0, + "2": 20663519232.0, + "3": 20663519232.0, + "4": 20663519232.0, + "5": 20663519232.0, + "6": 20663519232.0, + "7": 20663519232.0, + "8": 20663519232.0, + "9": 20663519232.0, + "10": 20663519232.0, + "11": 20663519232.0, + "12": 20663519232.0, + "13": 20663519232.0, + "14": 20663519232.0, + "15": 20663519232.0, + "16": 20663519232.0, + "17": 20663519232.0, + "18": 20663519232.0, + "19": 20663519232.0, + "20": 20663519232.0, + "21": 20663519232.0, + "22": 20663519232.0, + "23": 20663519232.0, + "24": 20663519232.0, + "25": 20663519232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 50289545216.0, + "2": 57143791616.0, + "3": 57143791616.0, + "4": 57143791616.0, + "5": 57143791616.0, + "6": 57143791616.0, + "7": 57143791616.0, + "8": 57143791616.0, + "9": 57143791616.0, + "10": 57143791616.0, + "11": 57143791616.0, + "12": 57143791616.0, + "13": 57143791616.0, + "14": 57143791616.0, + "15": 57143791616.0, + "16": 57143791616.0, + "17": 57143791616.0, + "18": 57143791616.0, + "19": 57143791616.0, + "20": 57143791616.0, + "21": 57143791616.0, + "22": 57143791616.0, + "23": 57143791616.0, + "24": 57143791616.0, + "25": 57143791616.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": "nan", + "2": 6.11084, + "3": "nan", + "4": 1.11678, + "5": "nan", + "6": 1.11532, + "7": "nan", + "8": 1.11539, + "9": "nan", + "10": 1.1161, + "11": "nan", + "12": 1.11723, + "13": "nan", + "14": 1.11756, + "15": "nan", + "16": 1.11596, + "17": "nan", + "18": 1.11605, + "19": "nan", + "20": 1.11783, + "21": "nan", + "22": 1.11636, + "23": "nan", + "24": 1.11585, + "25": "nan" + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index d3e2bdcb541..22254614510 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.89592, "5": 10.89131, "10": 10.88299, "15": 10.84786, "20": 10.74925, "25": 10.59226, "30": 10.41136, "35": 10.28136, "40": 10.09306, "45": 9.84149, "50": 9.91285}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1581.0, "5": 1962.0, "10": 1435.0, "15": 1944.0, "20": 1679.0, "25": 1645.0, "30": 1912.0, "35": 2023.0, "40": 2270.0, "45": 2152.0, "50": 2580.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 581488640.0, "5": 581488640.0, "10": 581488640.0, "15": 581488640.0, "20": 581488640.0, "25": 581488640.0, "30": 581488640.0, "35": 581488640.0, "40": 581488640.0, "45": 581488640.0, "50": 581488640.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4605813248.0, "5": 4702429696.0, "10": 4702429696.0, "15": 4702429696.0, "20": 4702429696.0, "25": 4702429696.0, "30": 4702429696.0, "35": 4702429696.0, "40": 4702429696.0, "45": 4702429696.0, "50": 4702429696.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.75074, "5": 0.05633, "10": 0.05789, "15": 0.05558, "20": 0.05703, "25": 0.05856, "30": 0.06132, "35": 0.05777, "40": 0.05818, "45": 0.05736, "50": 0.05735}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89592, + "2": 10.89514, + "3": 10.88761, + "4": 10.88903, + "5": 10.89131, + "6": 10.90004, + "7": 10.89143, + "8": 10.89938, + "9": 10.90231, + "10": 10.88299, + "11": 10.87827, + "12": 10.89318, + "13": 10.89818, + "14": 10.89188, + "15": 10.84786, + "16": 10.85369, + "17": 10.831, + "18": 10.83994, + "19": 10.82779, + "20": 10.74925, + "21": 10.73558, + "22": 10.61567, + "23": 10.72599, + "24": 10.63027, + "25": 10.59226, + "26": 10.63312, + "27": 10.63277, + "28": 10.58231, + "29": 10.58547, + "30": 10.41136, + "31": 10.15833, + "32": 10.48326, + "33": 10.46651, + "34": 10.23801, + "35": 10.28136, + "36": 10.24029, + "37": 10.3617, + "38": 10.20342, + "39": 10.404, + "40": 10.09306, + "41": 10.15805, + "42": 10.21903, + "43": 9.84274, + "44": 9.97219, + "45": 9.84149, + "46": 9.82007, + "47": 10.14934, + "48": 9.85997, + "49": 9.54155, + "50": 9.91285 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1581.0, + "2": 1674.0, + "3": 1724.0, + "4": 1803.0, + "5": 1962.0, + "6": 1846.0, + "7": 1864.0, + "8": 1792.0, + "9": 1848.0, + "10": 1435.0, + "11": 1868.0, + "12": 1782.0, + "13": 1874.0, + "14": 1783.0, + "15": 1944.0, + "16": 1933.0, + "17": 1807.0, + "18": 1737.0, + "19": 1822.0, + "20": 1679.0, + "21": 1808.0, + "22": 1806.0, + "23": 2077.0, + "24": 1663.0, + "25": 1645.0, + "26": 1719.0, + "27": 1925.0, + "28": 2030.0, + "29": 2042.0, + "30": 1912.0, + "31": 1603.0, + "32": 1938.0, + "33": 2158.0, + "34": 1896.0, + "35": 2023.0, + "36": 1910.0, + "37": 2330.0, + "38": 2298.0, + "39": 2498.0, + "40": 2270.0, + "41": 2464.0, + "42": 2296.0, + "43": 2042.0, + "44": 2138.0, + "45": 2152.0, + "46": 2282.0, + "47": 2529.0, + "48": 2454.0, + "49": 2358.0, + "50": 2580.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, + "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, + "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, + "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, + "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, + "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, + "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, + "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, + "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, + "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, + "50": 581488640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4605813248.0, + "2": 4702429696.0, + "3": 4702429696.0, + "4": 4702429696.0, + "5": 4702429696.0, + "6": 4702429696.0, + "7": 4702429696.0, + "8": 4702429696.0, + "9": 4702429696.0, + "10": 4702429696.0, + "11": 4702429696.0, + "12": 4702429696.0, + "13": 4702429696.0, + "14": 4702429696.0, + "15": 4702429696.0, + "16": 4702429696.0, + "17": 4702429696.0, + "18": 4702429696.0, + "19": 4702429696.0, + "20": 4702429696.0, + "21": 4702429696.0, + "22": 4702429696.0, + "23": 4702429696.0, + "24": 4702429696.0, + "25": 4702429696.0, + "26": 4702429696.0, + "27": 4702429696.0, + "28": 4702429696.0, + "29": 4702429696.0, + "30": 4702429696.0, + "31": 4702429696.0, + "32": 4702429696.0, + "33": 4702429696.0, + "34": 4702429696.0, + "35": 4702429696.0, + "36": 4702429696.0, + "37": 4702429696.0, + "38": 4702429696.0, + "39": 4702429696.0, + "40": 4702429696.0, + "41": 4702429696.0, + "42": 4702429696.0, + "43": 4702429696.0, + "44": 4702429696.0, + "45": 4702429696.0, + "46": 4702429696.0, + "47": 4702429696.0, + "48": 4702429696.0, + "49": 4702429696.0, + "50": 4702429696.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6.5684, + "2": 0.10503, + "3": 0.08759, + "4": 0.08854, + "5": 0.08902, + "6": 0.08493, + "7": 0.07755, + "8": 0.0738, + "9": 0.07491, + "10": 0.07437, + "11": 0.07546, + "12": 0.07621, + "13": 0.08298, + "14": 0.07518, + "15": 0.07632, + "16": 0.07439, + "17": 0.07556, + "18": 0.07572, + "19": 0.0773, + "20": 0.07632, + "21": 0.07507, + "22": 0.07379, + "23": 0.07514, + "24": 0.07634, + "25": 0.07537, + "26": 0.07376, + "27": 0.07568, + "28": 0.07436, + "29": 0.07588, + "30": 0.07446, + "31": 0.0821, + "32": 0.08812, + "33": 0.0891, + "34": 0.08234, + "35": 0.07539, + "36": 0.07468, + "37": 0.07649, + "38": 0.07542, + "39": 0.07476, + "40": 0.07444, + "41": 0.07481, + "42": 0.07343, + "43": 0.07666, + "44": 0.08426, + "45": 0.07584, + "46": 0.07674, + "47": 0.07463, + "48": 0.07387, + "49": 0.07347, + "50": 0.07545 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..8e0ed5db84f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89592, + "2": 10.89514, + "3": 10.88761, + "4": 10.88903, + "5": 10.89131, + "6": 10.90004, + "7": 10.89143, + "8": 10.89938, + "9": 10.90231, + "10": 10.88299, + "11": 10.87827, + "12": 10.89318, + "13": 10.89818, + "14": 10.89188, + "15": 10.84786, + "16": 10.85369, + "17": 10.831, + "18": 10.83994, + "19": 10.82779, + "20": 10.74925, + "21": 10.73558, + "22": 10.61567, + "23": 10.72599, + "24": 10.63027, + "25": 10.59226, + "26": 10.63312, + "27": 10.63277, + "28": 10.58231, + "29": 10.58547, + "30": 10.41136, + "31": 10.15833, + "32": 10.48326, + "33": 10.46651, + "34": 10.23801, + "35": 10.28136, + "36": 10.24029, + "37": 10.3617, + "38": 10.20342, + "39": 10.404, + "40": 10.09306, + "41": 10.15805, + "42": 10.21903, + "43": 9.84274, + "44": 9.97219, + "45": 9.84149, + "46": 9.82007, + "47": 10.14934, + "48": 9.85997, + "49": 9.54155, + "50": 9.91285 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1581.0, + "2": 1674.0, + "3": 1724.0, + "4": 1803.0, + "5": 1962.0, + "6": 1846.0, + "7": 1864.0, + "8": 1792.0, + "9": 1848.0, + "10": 1435.0, + "11": 1868.0, + "12": 1782.0, + "13": 1874.0, + "14": 1783.0, + "15": 1944.0, + "16": 1933.0, + "17": 1807.0, + "18": 1737.0, + "19": 1822.0, + "20": 1679.0, + "21": 1808.0, + "22": 1806.0, + "23": 2077.0, + "24": 1663.0, + "25": 1645.0, + "26": 1719.0, + "27": 1925.0, + "28": 2030.0, + "29": 2042.0, + "30": 1912.0, + "31": 1603.0, + "32": 1938.0, + "33": 2158.0, + "34": 1896.0, + "35": 2023.0, + "36": 1910.0, + "37": 2330.0, + "38": 2298.0, + "39": 2498.0, + "40": 2270.0, + "41": 2464.0, + "42": 2296.0, + "43": 2042.0, + "44": 2138.0, + "45": 2152.0, + "46": 2282.0, + "47": 2529.0, + "48": 2454.0, + "49": 2358.0, + "50": 2580.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, + "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, + "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, + "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, + "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, + "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, + "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, + "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, + "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, + "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, + "50": 581488640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4605813248.0, + "2": 4702429696.0, + "3": 4702429696.0, + "4": 4702429696.0, + "5": 4702429696.0, + "6": 4702429696.0, + "7": 4702429696.0, + "8": 4702429696.0, + "9": 4702429696.0, + "10": 4702429696.0, + "11": 4702429696.0, + "12": 4702429696.0, + "13": 4702429696.0, + "14": 4702429696.0, + "15": 4702429696.0, + "16": 4702429696.0, + "17": 4702429696.0, + "18": 4702429696.0, + "19": 4702429696.0, + "20": 4702429696.0, + "21": 4702429696.0, + "22": 4702429696.0, + "23": 4702429696.0, + "24": 4702429696.0, + "25": 4702429696.0, + "26": 4702429696.0, + "27": 4702429696.0, + "28": 4702429696.0, + "29": 4702429696.0, + "30": 4702429696.0, + "31": 4702429696.0, + "32": 4702429696.0, + "33": 4702429696.0, + "34": 4702429696.0, + "35": 4702429696.0, + "36": 4702429696.0, + "37": 4702429696.0, + "38": 4702429696.0, + "39": 4702429696.0, + "40": 4702429696.0, + "41": 4702429696.0, + "42": 4702429696.0, + "43": 4702429696.0, + "44": 4702429696.0, + "45": 4702429696.0, + "46": 4702429696.0, + "47": 4702429696.0, + "48": 4702429696.0, + "49": 4702429696.0, + "50": 4702429696.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6.83679, + "2": 0.10466, + "3": 0.07514, + "4": 0.07264, + "5": 0.06334, + "6": 0.06416, + "7": 0.06155, + "8": 0.06516, + "9": 0.06439, + "10": 0.06295, + "11": 0.06245, + "12": 0.06307, + "13": 0.06464, + "14": 0.06342, + "15": 0.06273, + "16": 0.0658, + "17": 0.06138, + "18": 0.06379, + "19": 0.06329, + "20": 0.06616, + "21": 0.06117, + "22": 0.06327, + "23": 0.06081, + "24": 0.06339, + "25": 0.06116, + "26": 0.06459, + "27": 0.06165, + "28": 0.06346, + "29": 0.06054, + "30": 0.06342, + "31": 0.06119, + "32": 0.06267, + "33": 0.06074, + "34": 0.0635, + "35": 0.06057, + "36": 0.06382, + "37": 0.06202, + "38": 0.06345, + "39": 0.06229, + "40": 0.06422, + "41": 0.06182, + "42": 0.06246, + "43": 0.06164, + "44": 0.06299, + "45": 0.06869, + "46": 0.06388, + "47": 0.06106, + "48": 0.06243, + "49": 0.06122, + "50": 0.06339 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..db410897813 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89592, + "2": 10.89514, + "3": 10.88761, + "4": 10.88903, + "5": 10.89131, + "6": 10.90004, + "7": 10.89143, + "8": 10.89938, + "9": 10.90231, + "10": 10.88299, + "11": 10.87827, + "12": 10.89318, + "13": 10.89818, + "14": 10.89188, + "15": 10.84786, + "16": 10.85369, + "17": 10.831, + "18": 10.83994, + "19": 10.82779, + "20": 10.74925, + "21": 10.73558, + "22": 10.61567, + "23": 10.72599, + "24": 10.63027, + "25": 10.59226, + "26": 10.63312, + "27": 10.63277, + "28": 10.58231, + "29": 10.58547, + "30": 10.41136, + "31": 10.15833, + "32": 10.48326, + "33": 10.46651, + "34": 10.23801, + "35": 10.28136, + "36": 10.24029, + "37": 10.3617, + "38": 10.20342, + "39": 10.404, + "40": 10.09306, + "41": 10.15805, + "42": 10.21903, + "43": 9.84274, + "44": 9.97219, + "45": 9.84149, + "46": 9.82007, + "47": 10.14934, + "48": 9.85997, + "49": 9.54155, + "50": 9.91285 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1581.0, + "2": 1674.0, + "3": 1724.0, + "4": 1803.0, + "5": 1962.0, + "6": 1846.0, + "7": 1864.0, + "8": 1792.0, + "9": 1848.0, + "10": 1435.0, + "11": 1868.0, + "12": 1782.0, + "13": 1874.0, + "14": 1783.0, + "15": 1944.0, + "16": 1933.0, + "17": 1807.0, + "18": 1737.0, + "19": 1822.0, + "20": 1679.0, + "21": 1808.0, + "22": 1806.0, + "23": 2077.0, + "24": 1663.0, + "25": 1645.0, + "26": 1719.0, + "27": 1925.0, + "28": 2030.0, + "29": 2042.0, + "30": 1912.0, + "31": 1603.0, + "32": 1938.0, + "33": 2158.0, + "34": 1896.0, + "35": 2023.0, + "36": 1910.0, + "37": 2330.0, + "38": 2298.0, + "39": 2498.0, + "40": 2270.0, + "41": 2464.0, + "42": 2296.0, + "43": 2042.0, + "44": 2138.0, + "45": 2152.0, + "46": 2282.0, + "47": 2529.0, + "48": 2454.0, + "49": 2358.0, + "50": 2580.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, + "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, + "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, + "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, + "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, + "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, + "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, + "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, + "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, + "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, + "50": 581488640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4605813248.0, + "2": 4702429696.0, + "3": 4702429696.0, + "4": 4702429696.0, + "5": 4702429696.0, + "6": 4702429696.0, + "7": 4702429696.0, + "8": 4702429696.0, + "9": 4702429696.0, + "10": 4702429696.0, + "11": 4702429696.0, + "12": 4702429696.0, + "13": 4702429696.0, + "14": 4702429696.0, + "15": 4702429696.0, + "16": 4702429696.0, + "17": 4702429696.0, + "18": 4702429696.0, + "19": 4702429696.0, + "20": 4702429696.0, + "21": 4702429696.0, + "22": 4702429696.0, + "23": 4702429696.0, + "24": 4702429696.0, + "25": 4702429696.0, + "26": 4702429696.0, + "27": 4702429696.0, + "28": 4702429696.0, + "29": 4702429696.0, + "30": 4702429696.0, + "31": 4702429696.0, + "32": 4702429696.0, + "33": 4702429696.0, + "34": 4702429696.0, + "35": 4702429696.0, + "36": 4702429696.0, + "37": 4702429696.0, + "38": 4702429696.0, + "39": 4702429696.0, + "40": 4702429696.0, + "41": 4702429696.0, + "42": 4702429696.0, + "43": 4702429696.0, + "44": 4702429696.0, + "45": 4702429696.0, + "46": 4702429696.0, + "47": 4702429696.0, + "48": 4702429696.0, + "49": 4702429696.0, + "50": 4702429696.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6.7331, + "2": 0.09599, + "3": 0.08799, + "4": 0.08582, + "5": 0.08478, + "6": 0.08513, + "7": 0.07688, + "8": 0.07429, + "9": 0.07778, + "10": 0.07515, + "11": 0.07987, + "12": 0.07525, + "13": 0.07727, + "14": 0.07535, + "15": 0.07896, + "16": 0.07509, + "17": 0.07751, + "18": 0.076, + "19": 0.07647, + "20": 0.07502, + "21": 0.07467, + "22": 0.07544, + "23": 0.0742, + "24": 0.07536, + "25": 0.07588, + "26": 0.07381, + "27": 0.07407, + "28": 0.075, + "29": 0.07424, + "30": 0.07454, + "31": 0.07482, + "32": 0.07526, + "33": 0.07493, + "34": 0.07437, + "35": 0.07447, + "36": 0.07482, + "37": 0.07454, + "38": 0.07501, + "39": 0.07495, + "40": 0.07481, + "41": 0.07433, + "42": 0.07467, + "43": 0.0754, + "44": 0.07543, + "45": 0.07498, + "46": 0.07457, + "47": 0.07378, + "48": 0.07477, + "49": 0.07465, + "50": 0.07444 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..f9dab22ab59 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.8403, + "9": 10.84359, + "10": 10.8134, + "11": 10.85025, + "12": 10.84316, + "13": 10.86605, + "14": 10.86315, + "15": 10.80276, + "16": 10.79643, + "17": 10.7763, + "18": 10.8015, + "19": 10.7939, + "20": 10.705, + "21": 10.68148, + "22": 10.56313, + "23": 10.70136, + "24": 10.57939, + "25": 10.53849, + "26": 10.60617, + "27": 10.59211, + "28": 10.56156, + "29": 10.57666, + "30": 10.35521, + "31": 10.12773, + "32": 10.46367, + "33": 10.45444, + "34": 10.22451, + "35": 10.27148, + "36": 10.22184, + "37": 10.33945, + "38": 10.18637, + "39": 10.39329, + "40": 10.08049, + "41": 10.13789, + "42": 10.20012, + "43": 9.83791, + "44": 9.94327, + "45": 9.8229, + "46": 9.82313, + "47": 10.13353, + "48": 9.8415, + "49": 9.52102, + "50": 9.90118 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1816.0, + "8": 1587.0, + "9": 1905.0, + "10": 1397.0, + "11": 1954.0, + "12": 1859.0, + "13": 1873.0, + "14": 1875.0, + "15": 1936.0, + "16": 1972.0, + "17": 1816.0, + "18": 1773.0, + "19": 1833.0, + "20": 1715.0, + "21": 1923.0, + "22": 1681.0, + "23": 2055.0, + "24": 1727.0, + "25": 1703.0, + "26": 1761.0, + "27": 1917.0, + "28": 1962.0, + "29": 2010.0, + "30": 1957.0, + "31": 1723.0, + "32": 1898.0, + "33": 2153.0, + "34": 1828.0, + "35": 1991.0, + "36": 1937.0, + "37": 2347.0, + "38": 2365.0, + "39": 2349.0, + "40": 2239.0, + "41": 2217.0, + "42": 2222.0, + "43": 2121.0, + "44": 2059.0, + "45": 2144.0, + "46": 2296.0, + "47": 2487.0, + "48": 2376.0, + "49": 2330.0, + "50": 2377.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552238592.0, + "2": 552238592.0, + "3": 552238592.0, + "4": 552238592.0, + "5": 552238592.0, + "6": 552238592.0, + "7": 552238592.0, + "8": 552238592.0, + "9": 552238592.0, + "10": 552238592.0, + "11": 552238592.0, + "12": 552238592.0, + "13": 552238592.0, + "14": 552238592.0, + "15": 552238592.0, + "16": 552238592.0, + "17": 552238592.0, + "18": 552238592.0, + "19": 552238592.0, + "20": 552238592.0, + "21": 552238592.0, + "22": 552238592.0, + "23": 552238592.0, + "24": 552238592.0, + "25": 552238592.0, + "26": 552238592.0, + "27": 552238592.0, + "28": 552238592.0, + "29": 552238592.0, + "30": 552238592.0, + "31": 552238592.0, + "32": 552238592.0, + "33": 552238592.0, + "34": 552238592.0, + "35": 552238592.0, + "36": 552238592.0, + "37": 552238592.0, + "38": 552238592.0, + "39": 552238592.0, + "40": 552238592.0, + "41": 552238592.0, + "42": 552238592.0, + "43": 552238592.0, + "44": 552238592.0, + "45": 552238592.0, + "46": 552238592.0, + "47": 552238592.0, + "48": 552238592.0, + "49": 552238592.0, + "50": 552238592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4576563200.0, + "2": 4673179648.0, + "3": 4673179648.0, + "4": 4673179648.0, + "5": 4673179648.0, + "6": 4673179648.0, + "7": 4673179648.0, + "8": 4673179648.0, + "9": 4673179648.0, + "10": 4673179648.0, + "11": 4673179648.0, + "12": 4673179648.0, + "13": 4673179648.0, + "14": 4673179648.0, + "15": 4673179648.0, + "16": 4673179648.0, + "17": 4673179648.0, + "18": 4673179648.0, + "19": 4673179648.0, + "20": 4673179648.0, + "21": 4673179648.0, + "22": 4673179648.0, + "23": 4673179648.0, + "24": 4673179648.0, + "25": 4673179648.0, + "26": 4673179648.0, + "27": 4673179648.0, + "28": 4673179648.0, + "29": 4673179648.0, + "30": 4673179648.0, + "31": 4673179648.0, + "32": 4673179648.0, + "33": 4673179648.0, + "34": 4673179648.0, + "35": 4673179648.0, + "36": 4673179648.0, + "37": 4673179648.0, + "38": 4673179648.0, + "39": 4673179648.0, + "40": 4673179648.0, + "41": 4673179648.0, + "42": 4673179648.0, + "43": 4673179648.0, + "44": 4673179648.0, + "45": 4673179648.0, + "46": 4673179648.0, + "47": 4673179648.0, + "48": 4673179648.0, + "49": 4673179648.0, + "50": 4673179648.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 8.45713, + "2": 0.13161, + "3": 0.11061, + "4": 0.12579, + "5": 0.13121, + "6": 0.13773, + "7": 0.13653, + "8": 0.46789, + "9": 0.12385, + "10": 0.12166, + "11": 0.1263, + "12": 0.13396, + "13": 0.12492, + "14": 0.12502, + "15": 0.11723, + "16": 0.15631, + "17": 0.3771, + "18": 0.12361, + "19": 0.11397, + "20": 0.11135, + "21": 0.10366, + "22": 0.10396, + "23": 0.10431, + "24": 0.10481, + "25": 0.10339, + "26": 0.1068, + "27": 0.10511, + "28": 0.36221, + "29": 0.1036, + "30": 0.10364, + "31": 0.10951, + "32": 0.11609, + "33": 0.11339, + "34": 0.1139, + "35": 0.11975, + "36": 0.11809, + "37": 0.10984, + "38": 0.10706, + "39": 0.10797, + "40": 0.11217, + "41": 0.11266, + "42": 0.10821, + "43": 0.1114, + "44": 0.10779, + "45": 0.1071, + "46": 0.11272, + "47": 0.1145, + "48": 0.10778, + "49": 0.10649, + "50": 0.10728 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..cc9bcd1b512 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.8403, + "9": 10.84359, + "10": 10.8134, + "11": 10.85025, + "12": 10.84316, + "13": 10.86605, + "14": 10.86315, + "15": 10.80276, + "16": 10.79643, + "17": 10.7763, + "18": 10.8015, + "19": 10.7939, + "20": 10.705, + "21": 10.68148, + "22": 10.56313, + "23": 10.70136, + "24": 10.57939, + "25": 10.53849, + "26": 10.60617, + "27": 10.59211, + "28": 10.56156, + "29": 10.57666, + "30": 10.35521, + "31": 10.12773, + "32": 10.46367, + "33": 10.45444, + "34": 10.22451, + "35": 10.27148, + "36": 10.22184, + "37": 10.33945, + "38": 10.18637, + "39": 10.39329, + "40": 10.08049, + "41": 10.13789, + "42": 10.20012, + "43": 9.83791, + "44": 9.94327, + "45": 9.8229, + "46": 9.82313, + "47": 10.13353, + "48": 9.8415, + "49": 9.52102, + "50": 9.90118 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1816.0, + "8": 1587.0, + "9": 1905.0, + "10": 1397.0, + "11": 1954.0, + "12": 1859.0, + "13": 1873.0, + "14": 1875.0, + "15": 1936.0, + "16": 1972.0, + "17": 1816.0, + "18": 1773.0, + "19": 1833.0, + "20": 1715.0, + "21": 1923.0, + "22": 1681.0, + "23": 2055.0, + "24": 1727.0, + "25": 1703.0, + "26": 1761.0, + "27": 1917.0, + "28": 1962.0, + "29": 2010.0, + "30": 1957.0, + "31": 1723.0, + "32": 1898.0, + "33": 2153.0, + "34": 1828.0, + "35": 1991.0, + "36": 1937.0, + "37": 2347.0, + "38": 2365.0, + "39": 2349.0, + "40": 2239.0, + "41": 2217.0, + "42": 2222.0, + "43": 2121.0, + "44": 2059.0, + "45": 2144.0, + "46": 2296.0, + "47": 2487.0, + "48": 2376.0, + "49": 2330.0, + "50": 2377.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552238592.0, + "2": 552238592.0, + "3": 552238592.0, + "4": 552238592.0, + "5": 552238592.0, + "6": 552238592.0, + "7": 552238592.0, + "8": 552238592.0, + "9": 552238592.0, + "10": 552238592.0, + "11": 552238592.0, + "12": 552238592.0, + "13": 552238592.0, + "14": 552238592.0, + "15": 552238592.0, + "16": 552238592.0, + "17": 552238592.0, + "18": 552238592.0, + "19": 552238592.0, + "20": 552238592.0, + "21": 552238592.0, + "22": 552238592.0, + "23": 552238592.0, + "24": 552238592.0, + "25": 552238592.0, + "26": 552238592.0, + "27": 552238592.0, + "28": 552238592.0, + "29": 552238592.0, + "30": 552238592.0, + "31": 552238592.0, + "32": 552238592.0, + "33": 552238592.0, + "34": 552238592.0, + "35": 552238592.0, + "36": 552238592.0, + "37": 552238592.0, + "38": 552238592.0, + "39": 552238592.0, + "40": 552238592.0, + "41": 552238592.0, + "42": 552238592.0, + "43": 552238592.0, + "44": 552238592.0, + "45": 552238592.0, + "46": 552238592.0, + "47": 552238592.0, + "48": 552238592.0, + "49": 552238592.0, + "50": 552238592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4576563200.0, + "2": 4673179648.0, + "3": 4673179648.0, + "4": 4673179648.0, + "5": 4673179648.0, + "6": 4673179648.0, + "7": 4673179648.0, + "8": 4673179648.0, + "9": 4673179648.0, + "10": 4673179648.0, + "11": 4673179648.0, + "12": 4673179648.0, + "13": 4673179648.0, + "14": 4673179648.0, + "15": 4673179648.0, + "16": 4673179648.0, + "17": 4673179648.0, + "18": 4673179648.0, + "19": 4673179648.0, + "20": 4673179648.0, + "21": 4673179648.0, + "22": 4673179648.0, + "23": 4673179648.0, + "24": 4673179648.0, + "25": 4673179648.0, + "26": 4673179648.0, + "27": 4673179648.0, + "28": 4673179648.0, + "29": 4673179648.0, + "30": 4673179648.0, + "31": 4673179648.0, + "32": 4673179648.0, + "33": 4673179648.0, + "34": 4673179648.0, + "35": 4673179648.0, + "36": 4673179648.0, + "37": 4673179648.0, + "38": 4673179648.0, + "39": 4673179648.0, + "40": 4673179648.0, + "41": 4673179648.0, + "42": 4673179648.0, + "43": 4673179648.0, + "44": 4673179648.0, + "45": 4673179648.0, + "46": 4673179648.0, + "47": 4673179648.0, + "48": 4673179648.0, + "49": 4673179648.0, + "50": 4673179648.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.01978, + "2": 0.13386, + "3": 0.10421, + "4": 0.10575, + "5": 0.10347, + "6": 0.10366, + "7": 0.10198, + "8": 0.10204, + "9": 0.10153, + "10": 0.10361, + "11": 0.10226, + "12": 0.31034, + "13": 0.36244, + "14": 0.32183, + "15": 0.09858, + "16": 0.10098, + "17": 0.10218, + "18": 0.09859, + "19": 0.09858, + "20": 0.0985, + "21": 0.09758, + "22": 0.0984, + "23": 0.09686, + "24": 0.09763, + "25": 0.09689, + "26": 0.0979, + "27": 0.09858, + "28": 0.09763, + "29": 0.09678, + "30": 0.09714, + "31": 0.10001, + "32": 0.09705, + "33": 0.09776, + "34": 0.09662, + "35": 0.09763, + "36": 0.10137, + "37": 0.10113, + "38": 0.09825, + "39": 0.09976, + "40": 0.09925, + "41": 0.09738, + "42": 0.09904, + "43": 0.10108, + "44": 0.09921, + "45": 0.09873, + "46": 0.10018, + "47": 0.09927, + "48": 0.09914, + "49": 0.09907, + "50": 0.09879 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 1f0d2e2e9a1..ca95ad65b3d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, "10": 10.88361, + "11": 10.87816, + "12": 10.89332, + "13": 10.89816, + "14": 10.89241, "15": 10.84798, + "16": 10.854, + "17": 10.83093, + "18": 10.83991, + "19": 10.82802, "20": 10.74822, + "21": 10.73494, + "22": 10.61719, + "23": 10.72621, + "24": 10.63177, "25": 10.5931, + "26": 10.63365, + "27": 10.63304, + "28": 10.58259, + "29": 10.58595, "30": 10.41201, + "31": 10.15907, + "32": 10.48362, + "33": 10.46704, + "34": 10.23815, "35": 10.28193, + "36": 10.24052, + "37": 10.36227, + "38": 10.20306, + "39": 10.40456, "40": 10.09271, + "41": 10.15831, + "42": 10.21934, + "43": 9.8436, + "44": 9.97299, "45": 9.84189, + "46": 9.82017, + "47": 10.14968, + "48": 9.86021, + "49": 9.54238, "50": 9.91347, + "51": 9.85447, + "52": 9.73936, + "53": 10.07426, + "54": 9.96915, "55": 9.88574, + "56": 9.62437, + "57": 9.4823, + "58": 9.83483, + "59": 9.58732, "60": 9.50245, + "61": 9.69343, + "62": 9.98806, + "63": 9.39103, + "64": 9.78021, "65": 8.94515, + "66": 9.70494, + "67": 9.37251, + "68": 9.78329, + "69": 9.79058, "70": 9.74454, + "71": 9.62301, + "72": 9.58458, + "73": 9.50513, + "74": 8.94312, "75": 9.42524, + "76": 9.07601, + "77": 10.06353, + "78": 9.72308, + "79": 9.37502, "80": 9.40453, + "81": 9.47794, + "82": 9.69667, + "83": 9.3072, + "84": 9.41526, "85": 9.61293, + "86": 9.07195, + "87": 9.5884, + "88": 9.74762, + "89": 9.59982, "90": 9.81672, + "91": 9.3379, + "92": 9.35605, + "93": 9.07425, + "94": 8.8351, "95": 9.5184, + "96": 9.52391, + "97": 9.30923, + "98": 9.66743, + "99": 8.88419, "100": 9.39924 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, "10": 1427.0, + "11": 1897.0, + "12": 1855.0, + "13": 1941.0, + "14": 1749.0, "15": 1901.0, + "16": 1813.0, + "17": 1710.0, + "18": 1707.0, + "19": 1819.0, "20": 1639.0, + "21": 1880.0, + "22": 1769.0, + "23": 2016.0, + "24": 1692.0, "25": 1672.0, + "26": 1778.0, + "27": 1861.0, + "28": 1964.0, + "29": 2021.0, "30": 1938.0, + "31": 1645.0, + "32": 1864.0, + "33": 2150.0, + "34": 1828.0, "35": 1982.0, + "36": 1864.0, + "37": 2355.0, + "38": 2358.0, + "39": 2385.0, "40": 2407.0, + "41": 2501.0, + "42": 2435.0, + "43": 2033.0, + "44": 2089.0, "45": 2210.0, + "46": 2351.0, + "47": 2502.0, + "48": 2444.0, + "49": 2302.0, "50": 2492.0, + "51": 2598.0, + "52": 2547.0, + "53": 2957.0, + "54": 2750.0, "55": 2372.0, + "56": 2569.0, + "57": 2395.0, + "58": 2901.0, + "59": 2741.0, "60": 2430.0, + "61": 2868.0, + "62": 2651.0, + "63": 2507.0, + "64": 3014.0, "65": 2683.0, + "66": 2935.0, + "67": 2783.0, + "68": 2725.0, + "69": 2788.0, "70": 3152.0, + "71": 3026.0, + "72": 2415.0, + "73": 3122.0, + "74": 1967.0, "75": 2581.0, + "76": 3010.0, + "77": 3294.0, + "78": 3166.0, + "79": 3150.0, "80": 3246.0, + "81": 3566.0, + "82": 3285.0, + "83": 2817.0, + "84": 3269.0, "85": 3425.0, + "86": 2819.0, + "87": 3577.0, + "88": 3004.0, + "89": 3323.0, "90": 3023.0, + "91": 2661.0, + "92": 3066.0, + "93": 2691.0, + "94": 3305.0, "95": 3403.0, + "96": 3377.0, + "97": 3242.0, + "98": 3697.0, + "99": 3112.0, "100": 3199.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, "50": 581488640.0, + "51": 581488640.0, + "52": 581488640.0, + "53": 581488640.0, + "54": 581488640.0, "55": 581488640.0, + "56": 581488640.0, + "57": 581488640.0, + "58": 581488640.0, + "59": 581488640.0, "60": 581488640.0, + "61": 581488640.0, + "62": 581488640.0, + "63": 581488640.0, + "64": 581488640.0, "65": 581488640.0, + "66": 581488640.0, + "67": 581488640.0, + "68": 581488640.0, + "69": 581488640.0, "70": 581488640.0, + "71": 581488640.0, + "72": 581488640.0, + "73": 581488640.0, + "74": 581488640.0, "75": 581488640.0, + "76": 581488640.0, + "77": 581488640.0, + "78": 581488640.0, + "79": 581488640.0, "80": 581488640.0, + "81": 581488640.0, + "82": 581488640.0, + "83": 581488640.0, + "84": 581488640.0, "85": 581488640.0, + "86": 581488640.0, + "87": 581488640.0, + "88": 581488640.0, + "89": 581488640.0, "90": 581488640.0, + "91": 581488640.0, + "92": 581488640.0, + "93": 581488640.0, + "94": 581488640.0, "95": 581488640.0, + "96": 581488640.0, + "97": 581488640.0, + "98": 581488640.0, + "99": 581488640.0, "100": 581488640.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2594126336.0, + "2": 2690742784.0, + "3": 2690742784.0, + "4": 2690742784.0, "5": 2690742784.0, + "6": 2690742784.0, + "7": 2690742784.0, + "8": 2690742784.0, + "9": 2690742784.0, "10": 2690742784.0, + "11": 2690742784.0, + "12": 2690742784.0, + "13": 2690742784.0, + "14": 2690742784.0, "15": 2690742784.0, + "16": 2690742784.0, + "17": 2690742784.0, + "18": 2690742784.0, + "19": 2690742784.0, "20": 2690742784.0, + "21": 2690742784.0, + "22": 2690742784.0, + "23": 2690742784.0, + "24": 2690742784.0, "25": 2690742784.0, + "26": 2690742784.0, + "27": 2690742784.0, + "28": 2690742784.0, + "29": 2690742784.0, "30": 2690742784.0, + "31": 2690742784.0, + "32": 2690742784.0, + "33": 2690742784.0, + "34": 2690742784.0, "35": 2690742784.0, + "36": 2690742784.0, + "37": 2690742784.0, + "38": 2690742784.0, + "39": 2690742784.0, "40": 2690742784.0, + "41": 2690742784.0, + "42": 2690742784.0, + "43": 2690742784.0, + "44": 2690742784.0, "45": 2690742784.0, + "46": 2690742784.0, + "47": 2690742784.0, + "48": 2690742784.0, + "49": 2690742784.0, "50": 2690742784.0, + "51": 2690742784.0, + "52": 2690742784.0, + "53": 2690742784.0, + "54": 2690742784.0, "55": 2690742784.0, + "56": 2690742784.0, + "57": 2690742784.0, + "58": 2690742784.0, + "59": 2690742784.0, "60": 2690742784.0, + "61": 2690742784.0, + "62": 2690742784.0, + "63": 2690742784.0, + "64": 2690742784.0, "65": 2690742784.0, + "66": 2690742784.0, + "67": 2690742784.0, + "68": 2690742784.0, + "69": 2690742784.0, "70": 2690742784.0, + "71": 2690742784.0, + "72": 2690742784.0, + "73": 2690742784.0, + "74": 2690742784.0, "75": 2690742784.0, + "76": 2690742784.0, + "77": 2690742784.0, + "78": 2690742784.0, + "79": 2690742784.0, "80": 2690742784.0, + "81": 2690742784.0, + "82": 2690742784.0, + "83": 2690742784.0, + "84": 2690742784.0, "85": 2690742784.0, + "86": 2690742784.0, + "87": 2690742784.0, + "88": 2690742784.0, + "89": 2690742784.0, "90": 2690742784.0, + "91": 2690742784.0, + "92": 2690742784.0, + "93": 2690742784.0, + "94": 2690742784.0, "95": 2690742784.0, + "96": 2690742784.0, + "97": 2690742784.0, + "98": 2690742784.0, + "99": 2690742784.0, "100": 2690742784.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 8.28181, - "5": 0.05617, - "10": 0.05714, - "15": 0.05541, - "20": 0.05475, - "25": 0.05518, - "30": 0.0563, - "35": 0.05638, - "40": 0.05543, - "45": 0.05574, - "50": 0.05563, - "55": 0.07246, - "60": 0.05657, - "65": 0.05621, - "70": 0.05607, - "75": 0.05605, - "80": 0.05618, - "85": 0.05509, - "90": 0.05962, - "95": 0.05777, - "100": 0.06336 + "1": 7.50382, + "2": 0.09494, + "3": 0.08499, + "4": 0.08516, + "5": 0.08574, + "6": 0.07205, + "7": 0.0678, + "8": 0.06716, + "9": 0.06722, + "10": 0.06806, + "11": 0.06825, + "12": 0.06735, + "13": 0.06795, + "14": 0.06749, + "15": 0.06675, + "16": 0.06707, + "17": 0.06697, + "18": 0.06753, + "19": 0.06817, + "20": 0.06848, + "21": 0.06619, + "22": 0.06841, + "23": 0.06785, + "24": 0.06849, + "25": 0.06774, + "26": 0.06776, + "27": 0.06722, + "28": 0.06759, + "29": 0.06651, + "30": 0.06707, + "31": 0.06654, + "32": 0.06698, + "33": 0.06699, + "34": 0.06679, + "35": 0.06871, + "36": 0.06753, + "37": 0.06724, + "38": 0.06699, + "39": 0.06694, + "40": 0.06736, + "41": 0.06719, + "42": 0.06704, + "43": 0.06772, + "44": 0.06769, + "45": 0.06718, + "46": 0.06687, + "47": 0.0666, + "48": 0.06791, + "49": 0.06768, + "50": 0.06799, + "51": 0.08137, + "52": 0.07388, + "53": 0.07162, + "54": 0.06825, + "55": 0.09073, + "56": 0.06514, + "57": 0.06572, + "58": 0.066, + "59": 0.06584, + "60": 0.06564, + "61": 0.06432, + "62": 0.06646, + "63": 0.06643, + "64": 0.06637, + "65": 0.06605, + "66": 0.06606, + "67": 0.06661, + "68": 0.06602, + "69": 0.06559, + "70": 0.06607, + "71": 0.06417, + "72": 0.06658, + "73": 0.06562, + "74": 0.06641, + "75": 0.0655, + "76": 0.06656, + "77": 0.065, + "78": 0.06615, + "79": 0.06666, + "80": 0.06535, + "81": 0.06679, + "82": 0.06885, + "83": 0.06577, + "84": 0.06461, + "85": 0.06689, + "86": 0.06445, + "87": 0.06546, + "88": 0.06624, + "89": 0.06635, + "90": 0.0643, + "91": 0.06631, + "92": 0.0655, + "93": 0.06522, + "94": 0.06652, + "95": 0.06592, + "96": 0.0658, + "97": 0.06642, + "98": 0.06519, + "99": 0.06466, + "100": 0.06561 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0f5131905ca --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, + "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, + "10": 10.88361, + "11": 10.87816, + "12": 10.89332, + "13": 10.89816, + "14": 10.89241, + "15": 10.84798, + "16": 10.854, + "17": 10.83093, + "18": 10.83991, + "19": 10.82802, + "20": 10.74822, + "21": 10.73494, + "22": 10.61719, + "23": 10.72621, + "24": 10.63177, + "25": 10.5931, + "26": 10.63365, + "27": 10.63304, + "28": 10.58259, + "29": 10.58595, + "30": 10.41201, + "31": 10.15907, + "32": 10.48362, + "33": 10.46704, + "34": 10.23815, + "35": 10.28193, + "36": 10.24052, + "37": 10.36227, + "38": 10.20306, + "39": 10.40456, + "40": 10.09271, + "41": 10.15831, + "42": 10.21934, + "43": 9.8436, + "44": 9.97299, + "45": 9.84189, + "46": 9.82017, + "47": 10.14968, + "48": 9.86021, + "49": 9.54238, + "50": 9.91347, + "51": 9.85447, + "52": 9.73936, + "53": 10.07426, + "54": 9.96915, + "55": 9.88574, + "56": 9.62437, + "57": 9.4823, + "58": 9.83483, + "59": 9.58732, + "60": 9.50245, + "61": 9.69343, + "62": 9.98806, + "63": 9.39103, + "64": 9.78021, + "65": 8.94515, + "66": 9.70494, + "67": 9.37251, + "68": 9.78329, + "69": 9.79058, + "70": 9.74454, + "71": 9.62301, + "72": 9.58458, + "73": 9.50513, + "74": 8.94312, + "75": 9.42524, + "76": 9.07601, + "77": 10.06353, + "78": 9.72308, + "79": 9.37502, + "80": 9.40453, + "81": 9.47794, + "82": 9.69667, + "83": 9.3072, + "84": 9.41526, + "85": 9.61293, + "86": 9.07195, + "87": 9.5884, + "88": 9.74762, + "89": 9.59982, + "90": 9.81672, + "91": 9.3379, + "92": 9.35605, + "93": 9.07425, + "94": 8.8351, + "95": 9.5184, + "96": 9.52391, + "97": 9.30923, + "98": 9.66743, + "99": 8.88419, + "100": 9.39924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, + "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, + "10": 1427.0, + "11": 1897.0, + "12": 1855.0, + "13": 1941.0, + "14": 1749.0, + "15": 1901.0, + "16": 1813.0, + "17": 1710.0, + "18": 1707.0, + "19": 1819.0, + "20": 1639.0, + "21": 1880.0, + "22": 1769.0, + "23": 2016.0, + "24": 1692.0, + "25": 1672.0, + "26": 1778.0, + "27": 1861.0, + "28": 1964.0, + "29": 2021.0, + "30": 1938.0, + "31": 1645.0, + "32": 1864.0, + "33": 2150.0, + "34": 1828.0, + "35": 1982.0, + "36": 1864.0, + "37": 2355.0, + "38": 2358.0, + "39": 2385.0, + "40": 2407.0, + "41": 2501.0, + "42": 2435.0, + "43": 2033.0, + "44": 2089.0, + "45": 2210.0, + "46": 2351.0, + "47": 2502.0, + "48": 2444.0, + "49": 2302.0, + "50": 2492.0, + "51": 2598.0, + "52": 2547.0, + "53": 2957.0, + "54": 2750.0, + "55": 2372.0, + "56": 2569.0, + "57": 2395.0, + "58": 2901.0, + "59": 2741.0, + "60": 2430.0, + "61": 2868.0, + "62": 2651.0, + "63": 2507.0, + "64": 3014.0, + "65": 2683.0, + "66": 2935.0, + "67": 2783.0, + "68": 2725.0, + "69": 2788.0, + "70": 3152.0, + "71": 3026.0, + "72": 2415.0, + "73": 3122.0, + "74": 1967.0, + "75": 2581.0, + "76": 3010.0, + "77": 3294.0, + "78": 3166.0, + "79": 3150.0, + "80": 3246.0, + "81": 3566.0, + "82": 3285.0, + "83": 2817.0, + "84": 3269.0, + "85": 3425.0, + "86": 2819.0, + "87": 3577.0, + "88": 3004.0, + "89": 3323.0, + "90": 3023.0, + "91": 2661.0, + "92": 3066.0, + "93": 2691.0, + "94": 3305.0, + "95": 3403.0, + "96": 3377.0, + "97": 3242.0, + "98": 3697.0, + "99": 3112.0, + "100": 3199.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, + "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, + "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, + "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, + "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, + "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, + "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, + "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, + "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, + "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, + "50": 581488640.0, + "51": 581488640.0, + "52": 581488640.0, + "53": 581488640.0, + "54": 581488640.0, + "55": 581488640.0, + "56": 581488640.0, + "57": 581488640.0, + "58": 581488640.0, + "59": 581488640.0, + "60": 581488640.0, + "61": 581488640.0, + "62": 581488640.0, + "63": 581488640.0, + "64": 581488640.0, + "65": 581488640.0, + "66": 581488640.0, + "67": 581488640.0, + "68": 581488640.0, + "69": 581488640.0, + "70": 581488640.0, + "71": 581488640.0, + "72": 581488640.0, + "73": 581488640.0, + "74": 581488640.0, + "75": 581488640.0, + "76": 581488640.0, + "77": 581488640.0, + "78": 581488640.0, + "79": 581488640.0, + "80": 581488640.0, + "81": 581488640.0, + "82": 581488640.0, + "83": 581488640.0, + "84": 581488640.0, + "85": 581488640.0, + "86": 581488640.0, + "87": 581488640.0, + "88": 581488640.0, + "89": 581488640.0, + "90": 581488640.0, + "91": 581488640.0, + "92": 581488640.0, + "93": 581488640.0, + "94": 581488640.0, + "95": 581488640.0, + "96": 581488640.0, + "97": 581488640.0, + "98": 581488640.0, + "99": 581488640.0, + "100": 581488640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2594126336.0, + "2": 2690742784.0, + "3": 2690742784.0, + "4": 2690742784.0, + "5": 2690742784.0, + "6": 2690742784.0, + "7": 2690742784.0, + "8": 2690742784.0, + "9": 2690742784.0, + "10": 2690742784.0, + "11": 2690742784.0, + "12": 2690742784.0, + "13": 2690742784.0, + "14": 2690742784.0, + "15": 2690742784.0, + "16": 2690742784.0, + "17": 2690742784.0, + "18": 2690742784.0, + "19": 2690742784.0, + "20": 2690742784.0, + "21": 2690742784.0, + "22": 2690742784.0, + "23": 2690742784.0, + "24": 2690742784.0, + "25": 2690742784.0, + "26": 2690742784.0, + "27": 2690742784.0, + "28": 2690742784.0, + "29": 2690742784.0, + "30": 2690742784.0, + "31": 2690742784.0, + "32": 2690742784.0, + "33": 2690742784.0, + "34": 2690742784.0, + "35": 2690742784.0, + "36": 2690742784.0, + "37": 2690742784.0, + "38": 2690742784.0, + "39": 2690742784.0, + "40": 2690742784.0, + "41": 2690742784.0, + "42": 2690742784.0, + "43": 2690742784.0, + "44": 2690742784.0, + "45": 2690742784.0, + "46": 2690742784.0, + "47": 2690742784.0, + "48": 2690742784.0, + "49": 2690742784.0, + "50": 2690742784.0, + "51": 2690742784.0, + "52": 2690742784.0, + "53": 2690742784.0, + "54": 2690742784.0, + "55": 2690742784.0, + "56": 2690742784.0, + "57": 2690742784.0, + "58": 2690742784.0, + "59": 2690742784.0, + "60": 2690742784.0, + "61": 2690742784.0, + "62": 2690742784.0, + "63": 2690742784.0, + "64": 2690742784.0, + "65": 2690742784.0, + "66": 2690742784.0, + "67": 2690742784.0, + "68": 2690742784.0, + "69": 2690742784.0, + "70": 2690742784.0, + "71": 2690742784.0, + "72": 2690742784.0, + "73": 2690742784.0, + "74": 2690742784.0, + "75": 2690742784.0, + "76": 2690742784.0, + "77": 2690742784.0, + "78": 2690742784.0, + "79": 2690742784.0, + "80": 2690742784.0, + "81": 2690742784.0, + "82": 2690742784.0, + "83": 2690742784.0, + "84": 2690742784.0, + "85": 2690742784.0, + "86": 2690742784.0, + "87": 2690742784.0, + "88": 2690742784.0, + "89": 2690742784.0, + "90": 2690742784.0, + "91": 2690742784.0, + "92": 2690742784.0, + "93": 2690742784.0, + "94": 2690742784.0, + "95": 2690742784.0, + "96": 2690742784.0, + "97": 2690742784.0, + "98": 2690742784.0, + "99": 2690742784.0, + "100": 2690742784.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.57521, + "2": 0.07593, + "3": 0.05387, + "4": 0.05352, + "5": 0.05602, + "6": 3.85308, + "7": 0.05787, + "8": 0.71621, + "9": 0.33662, + "10": 0.6136, + "11": 1.43071, + "12": 0.0585, + "13": 0.05762, + "14": 0.0573, + "15": 0.06754, + "16": 0.06151, + "17": 0.06798, + "18": 0.05523, + "19": 0.18762, + "20": 0.28771, + "21": 0.05854, + "22": 0.05692, + "23": 0.05871, + "24": 0.05788, + "25": 0.05853, + "26": 0.05723, + "27": 0.05911, + "28": 0.05718, + "29": 0.05914, + "30": 0.0562, + "31": 0.05914, + "32": 0.05683, + "33": 0.0585, + "34": 0.05641, + "35": 0.06095, + "36": 0.05706, + "37": 0.05915, + "38": 0.05666, + "39": 0.05887, + "40": 0.05689, + "41": 0.06354, + "42": 0.05728, + "43": 0.06056, + "44": 0.05698, + "45": 0.05866, + "46": 0.05782, + "47": 0.05864, + "48": 0.05766, + "49": 0.0593, + "50": 0.05709, + "51": 0.07764, + "52": 0.06534, + "53": 0.05923, + "54": 0.08052, + "55": 0.05743, + "56": 0.05803, + "57": 0.05961, + "58": 0.05679, + "59": 0.05691, + "60": 0.05989, + "61": 0.05604, + "62": 0.05739, + "63": 0.05673, + "64": 0.0572, + "65": 0.0573, + "66": 0.05797, + "67": 0.05694, + "68": 0.05763, + "69": 0.05765, + "70": 0.05718, + "71": 0.05666, + "72": 0.05782, + "73": 0.0577, + "74": 0.05704, + "75": 0.06457, + "76": 0.06526, + "77": 0.06461, + "78": 0.05996, + "79": 0.05701, + "80": 0.0582, + "81": 0.06253, + "82": 0.05976, + "83": 0.05924, + "84": 0.05851, + "85": 0.0593, + "86": 0.05994, + "87": 0.05913, + "88": 0.05723, + "89": 0.0581, + "90": 0.05828, + "91": 0.06035, + "92": 0.05762, + "93": 0.059, + "94": 0.05728, + "95": 0.05927, + "96": 0.05721, + "97": 0.05992, + "98": 0.05777, + "99": 0.05867, + "100": 0.0569 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..686e980d509 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, + "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, + "10": 10.88361, + "11": 10.87816, + "12": 10.89332, + "13": 10.89816, + "14": 10.89241, + "15": 10.84798, + "16": 10.854, + "17": 10.83093, + "18": 10.83991, + "19": 10.82802, + "20": 10.74822, + "21": 10.73494, + "22": 10.61719, + "23": 10.72621, + "24": 10.63177, + "25": 10.5931, + "26": 10.63365, + "27": 10.63304, + "28": 10.58259, + "29": 10.58595, + "30": 10.41201, + "31": 10.15907, + "32": 10.48362, + "33": 10.46704, + "34": 10.23815, + "35": 10.28193, + "36": 10.24052, + "37": 10.36227, + "38": 10.20306, + "39": 10.40456, + "40": 10.09271, + "41": 10.15831, + "42": 10.21934, + "43": 9.8436, + "44": 9.97299, + "45": 9.84189, + "46": 9.82017, + "47": 10.14968, + "48": 9.86021, + "49": 9.54238, + "50": 9.91347, + "51": 9.85447, + "52": 9.73936, + "53": 10.07426, + "54": 9.96915, + "55": 9.88574, + "56": 9.62437, + "57": 9.4823, + "58": 9.83483, + "59": 9.58732, + "60": 9.50245, + "61": 9.69343, + "62": 9.98806, + "63": 9.39103, + "64": 9.78021, + "65": 8.94515, + "66": 9.70494, + "67": 9.37251, + "68": 9.78329, + "69": 9.79058, + "70": 9.74454, + "71": 9.62301, + "72": 9.58458, + "73": 9.50513, + "74": 8.94312, + "75": 9.42524, + "76": 9.07601, + "77": 10.06353, + "78": 9.72308, + "79": 9.37502, + "80": 9.40453, + "81": 9.47794, + "82": 9.69667, + "83": 9.3072, + "84": 9.41526, + "85": 9.61293, + "86": 9.07195, + "87": 9.5884, + "88": 9.74762, + "89": 9.59982, + "90": 9.81672, + "91": 9.3379, + "92": 9.35605, + "93": 9.07425, + "94": 8.8351, + "95": 9.5184, + "96": 9.52391, + "97": 9.30923, + "98": 9.66743, + "99": 8.88419, + "100": 9.39924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, + "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, + "10": 1427.0, + "11": 1897.0, + "12": 1855.0, + "13": 1941.0, + "14": 1749.0, + "15": 1901.0, + "16": 1813.0, + "17": 1710.0, + "18": 1707.0, + "19": 1819.0, + "20": 1639.0, + "21": 1880.0, + "22": 1769.0, + "23": 2016.0, + "24": 1692.0, + "25": 1672.0, + "26": 1778.0, + "27": 1861.0, + "28": 1964.0, + "29": 2021.0, + "30": 1938.0, + "31": 1645.0, + "32": 1864.0, + "33": 2150.0, + "34": 1828.0, + "35": 1982.0, + "36": 1864.0, + "37": 2355.0, + "38": 2358.0, + "39": 2385.0, + "40": 2407.0, + "41": 2501.0, + "42": 2435.0, + "43": 2033.0, + "44": 2089.0, + "45": 2210.0, + "46": 2351.0, + "47": 2502.0, + "48": 2444.0, + "49": 2302.0, + "50": 2492.0, + "51": 2598.0, + "52": 2547.0, + "53": 2957.0, + "54": 2750.0, + "55": 2372.0, + "56": 2569.0, + "57": 2395.0, + "58": 2901.0, + "59": 2741.0, + "60": 2430.0, + "61": 2868.0, + "62": 2651.0, + "63": 2507.0, + "64": 3014.0, + "65": 2683.0, + "66": 2935.0, + "67": 2783.0, + "68": 2725.0, + "69": 2788.0, + "70": 3152.0, + "71": 3026.0, + "72": 2415.0, + "73": 3122.0, + "74": 1967.0, + "75": 2581.0, + "76": 3010.0, + "77": 3294.0, + "78": 3166.0, + "79": 3150.0, + "80": 3246.0, + "81": 3566.0, + "82": 3285.0, + "83": 2817.0, + "84": 3269.0, + "85": 3425.0, + "86": 2819.0, + "87": 3577.0, + "88": 3004.0, + "89": 3323.0, + "90": 3023.0, + "91": 2661.0, + "92": 3066.0, + "93": 2691.0, + "94": 3305.0, + "95": 3403.0, + "96": 3377.0, + "97": 3242.0, + "98": 3697.0, + "99": 3112.0, + "100": 3199.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 581488640.0, + "2": 581488640.0, + "3": 581488640.0, + "4": 581488640.0, + "5": 581488640.0, + "6": 581488640.0, + "7": 581488640.0, + "8": 581488640.0, + "9": 581488640.0, + "10": 581488640.0, + "11": 581488640.0, + "12": 581488640.0, + "13": 581488640.0, + "14": 581488640.0, + "15": 581488640.0, + "16": 581488640.0, + "17": 581488640.0, + "18": 581488640.0, + "19": 581488640.0, + "20": 581488640.0, + "21": 581488640.0, + "22": 581488640.0, + "23": 581488640.0, + "24": 581488640.0, + "25": 581488640.0, + "26": 581488640.0, + "27": 581488640.0, + "28": 581488640.0, + "29": 581488640.0, + "30": 581488640.0, + "31": 581488640.0, + "32": 581488640.0, + "33": 581488640.0, + "34": 581488640.0, + "35": 581488640.0, + "36": 581488640.0, + "37": 581488640.0, + "38": 581488640.0, + "39": 581488640.0, + "40": 581488640.0, + "41": 581488640.0, + "42": 581488640.0, + "43": 581488640.0, + "44": 581488640.0, + "45": 581488640.0, + "46": 581488640.0, + "47": 581488640.0, + "48": 581488640.0, + "49": 581488640.0, + "50": 581488640.0, + "51": 581488640.0, + "52": 581488640.0, + "53": 581488640.0, + "54": 581488640.0, + "55": 581488640.0, + "56": 581488640.0, + "57": 581488640.0, + "58": 581488640.0, + "59": 581488640.0, + "60": 581488640.0, + "61": 581488640.0, + "62": 581488640.0, + "63": 581488640.0, + "64": 581488640.0, + "65": 581488640.0, + "66": 581488640.0, + "67": 581488640.0, + "68": 581488640.0, + "69": 581488640.0, + "70": 581488640.0, + "71": 581488640.0, + "72": 581488640.0, + "73": 581488640.0, + "74": 581488640.0, + "75": 581488640.0, + "76": 581488640.0, + "77": 581488640.0, + "78": 581488640.0, + "79": 581488640.0, + "80": 581488640.0, + "81": 581488640.0, + "82": 581488640.0, + "83": 581488640.0, + "84": 581488640.0, + "85": 581488640.0, + "86": 581488640.0, + "87": 581488640.0, + "88": 581488640.0, + "89": 581488640.0, + "90": 581488640.0, + "91": 581488640.0, + "92": 581488640.0, + "93": 581488640.0, + "94": 581488640.0, + "95": 581488640.0, + "96": 581488640.0, + "97": 581488640.0, + "98": 581488640.0, + "99": 581488640.0, + "100": 581488640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2594126336.0, + "2": 2690742784.0, + "3": 2690742784.0, + "4": 2690742784.0, + "5": 2690742784.0, + "6": 2690742784.0, + "7": 2690742784.0, + "8": 2690742784.0, + "9": 2690742784.0, + "10": 2690742784.0, + "11": 2690742784.0, + "12": 2690742784.0, + "13": 2690742784.0, + "14": 2690742784.0, + "15": 2690742784.0, + "16": 2690742784.0, + "17": 2690742784.0, + "18": 2690742784.0, + "19": 2690742784.0, + "20": 2690742784.0, + "21": 2690742784.0, + "22": 2690742784.0, + "23": 2690742784.0, + "24": 2690742784.0, + "25": 2690742784.0, + "26": 2690742784.0, + "27": 2690742784.0, + "28": 2690742784.0, + "29": 2690742784.0, + "30": 2690742784.0, + "31": 2690742784.0, + "32": 2690742784.0, + "33": 2690742784.0, + "34": 2690742784.0, + "35": 2690742784.0, + "36": 2690742784.0, + "37": 2690742784.0, + "38": 2690742784.0, + "39": 2690742784.0, + "40": 2690742784.0, + "41": 2690742784.0, + "42": 2690742784.0, + "43": 2690742784.0, + "44": 2690742784.0, + "45": 2690742784.0, + "46": 2690742784.0, + "47": 2690742784.0, + "48": 2690742784.0, + "49": 2690742784.0, + "50": 2690742784.0, + "51": 2690742784.0, + "52": 2690742784.0, + "53": 2690742784.0, + "54": 2690742784.0, + "55": 2690742784.0, + "56": 2690742784.0, + "57": 2690742784.0, + "58": 2690742784.0, + "59": 2690742784.0, + "60": 2690742784.0, + "61": 2690742784.0, + "62": 2690742784.0, + "63": 2690742784.0, + "64": 2690742784.0, + "65": 2690742784.0, + "66": 2690742784.0, + "67": 2690742784.0, + "68": 2690742784.0, + "69": 2690742784.0, + "70": 2690742784.0, + "71": 2690742784.0, + "72": 2690742784.0, + "73": 2690742784.0, + "74": 2690742784.0, + "75": 2690742784.0, + "76": 2690742784.0, + "77": 2690742784.0, + "78": 2690742784.0, + "79": 2690742784.0, + "80": 2690742784.0, + "81": 2690742784.0, + "82": 2690742784.0, + "83": 2690742784.0, + "84": 2690742784.0, + "85": 2690742784.0, + "86": 2690742784.0, + "87": 2690742784.0, + "88": 2690742784.0, + "89": 2690742784.0, + "90": 2690742784.0, + "91": 2690742784.0, + "92": 2690742784.0, + "93": 2690742784.0, + "94": 2690742784.0, + "95": 2690742784.0, + "96": 2690742784.0, + "97": 2690742784.0, + "98": 2690742784.0, + "99": 2690742784.0, + "100": 2690742784.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.46673, + "2": 0.07879, + "3": 0.06822, + "4": 0.06744, + "5": 0.06664, + "6": 0.06786, + "7": 0.06766, + "8": 0.06659, + "9": 0.06797, + "10": 0.07184, + "11": 0.07288, + "12": 0.07188, + "13": 0.07026, + "14": 0.06821, + "15": 0.06667, + "16": 0.06656, + "17": 0.06764, + "18": 0.06816, + "19": 0.06695, + "20": 0.06832, + "21": 0.06808, + "22": 0.06822, + "23": 0.06838, + "24": 0.06731, + "25": 0.06857, + "26": 0.06706, + "27": 0.06819, + "28": 0.06784, + "29": 0.06785, + "30": 0.06735, + "31": 0.0685, + "32": 0.07005, + "33": 0.07122, + "34": 0.07241, + "35": 0.07067, + "36": 0.06981, + "37": 0.06934, + "38": 0.06771, + "39": 0.06805, + "40": 0.06824, + "41": 0.06831, + "42": 0.06733, + "43": 0.06819, + "44": 0.06816, + "45": 0.06847, + "46": 0.0674, + "47": 0.06856, + "48": 0.07158, + "49": 0.07079, + "50": 0.0717, + "51": 0.08179, + "52": 0.07272, + "53": 0.06939, + "54": 0.06631, + "55": 0.07046, + "56": 0.09852, + "57": 0.06464, + "58": 0.06466, + "59": 0.06537, + "60": 0.06301, + "61": 0.06361, + "62": 0.06551, + "63": 0.06563, + "64": 0.0749, + "65": 0.0748, + "66": 0.07507, + "67": 0.07552, + "68": 0.07573, + "69": 0.07066, + "70": 0.0658, + "71": 0.0647, + "72": 0.06444, + "73": 0.06462, + "74": 0.06543, + "75": 0.06609, + "76": 0.06503, + "77": 0.06499, + "78": 0.0644, + "79": 0.06439, + "80": 0.06417, + "81": 0.06401, + "82": 0.06575, + "83": 0.06494, + "84": 0.06442, + "85": 0.06396, + "86": 0.06422, + "87": 0.06484, + "88": 0.06512, + "89": 0.06426, + "90": 0.06481, + "91": 0.06476, + "92": 0.06383, + "93": 0.06456, + "94": 0.06292, + "95": 0.0638, + "96": 0.06392, + "97": 0.06356, + "98": 0.06355, + "99": 0.06439, + "100": 0.06428 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..42b005d7102 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.8403, + "9": 10.84359, + "10": 10.8134, + "11": 10.85025, + "12": 10.84316, + "13": 10.86605, + "14": 10.86315, + "15": 10.80276, + "16": 10.79643, + "17": 10.7763, + "18": 10.8015, + "19": 10.7939, + "20": 10.705, + "21": 10.68148, + "22": 10.56313, + "23": 10.70136, + "24": 10.57939, + "25": 10.53849, + "26": 10.60617, + "27": 10.59211, + "28": 10.56156, + "29": 10.57666, + "30": 10.35521, + "31": 10.12773, + "32": 10.46367, + "33": 10.45444, + "34": 10.22451, + "35": 10.27148, + "36": 10.22184, + "37": 10.33945, + "38": 10.18637, + "39": 10.39329, + "40": 10.08049, + "41": 10.13789, + "42": 10.20012, + "43": 9.83791, + "44": 9.94327, + "45": 9.8229, + "46": 9.82313, + "47": 10.13353, + "48": 9.8415, + "49": 9.52102, + "50": 9.90118, + "51": 9.83467, + "52": 9.73176, + "53": 10.04773, + "54": 9.93856, + "55": 9.86424, + "56": 9.61259, + "57": 9.46819, + "58": 9.81223, + "59": 9.57172, + "60": 9.4803, + "61": 9.67964, + "62": 9.96738, + "63": 9.35351, + "64": 9.7573, + "65": 8.93743, + "66": 9.68132, + "67": 9.35694, + "68": 9.7681, + "69": 9.77289, + "70": 9.71026, + "71": 9.60024, + "72": 9.56674, + "73": 9.47644, + "74": 8.93189, + "75": 9.4088, + "76": 9.06887, + "77": 10.04696, + "78": 9.70975, + "79": 9.35669, + "80": 9.39078, + "81": 9.46574, + "82": 9.68028, + "83": 9.29218, + "84": 9.40234, + "85": 9.59741, + "86": 9.06109, + "87": 9.57951, + "88": 9.73247, + "89": 9.58838, + "90": 9.80389, + "91": 9.32105, + "92": 9.35011, + "93": 9.06313, + "94": 8.82006, + "95": 9.50562, + "96": 9.51103, + "97": 9.29305, + "98": 9.65571, + "99": 8.87502, + "100": 9.38808 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1816.0, + "8": 1587.0, + "9": 1905.0, + "10": 1397.0, + "11": 1954.0, + "12": 1859.0, + "13": 1873.0, + "14": 1875.0, + "15": 1936.0, + "16": 1972.0, + "17": 1816.0, + "18": 1773.0, + "19": 1833.0, + "20": 1715.0, + "21": 1923.0, + "22": 1681.0, + "23": 2055.0, + "24": 1727.0, + "25": 1703.0, + "26": 1761.0, + "27": 1917.0, + "28": 1962.0, + "29": 2010.0, + "30": 1957.0, + "31": 1723.0, + "32": 1898.0, + "33": 2153.0, + "34": 1828.0, + "35": 1991.0, + "36": 1937.0, + "37": 2347.0, + "38": 2365.0, + "39": 2349.0, + "40": 2239.0, + "41": 2217.0, + "42": 2222.0, + "43": 2121.0, + "44": 2059.0, + "45": 2144.0, + "46": 2296.0, + "47": 2487.0, + "48": 2376.0, + "49": 2330.0, + "50": 2377.0, + "51": 2540.0, + "52": 2598.0, + "53": 2917.0, + "54": 2715.0, + "55": 2436.0, + "56": 2691.0, + "57": 2196.0, + "58": 2875.0, + "59": 2726.0, + "60": 2445.0, + "61": 3031.0, + "62": 2618.0, + "63": 2551.0, + "64": 2939.0, + "65": 2645.0, + "66": 3160.0, + "67": 2729.0, + "68": 2852.0, + "69": 2938.0, + "70": 3337.0, + "71": 3044.0, + "72": 2531.0, + "73": 2918.0, + "74": 1976.0, + "75": 2726.0, + "76": 3036.0, + "77": 3435.0, + "78": 3375.0, + "79": 3221.0, + "80": 3356.0, + "81": 3820.0, + "82": 3203.0, + "83": 2699.0, + "84": 3073.0, + "85": 3336.0, + "86": 2729.0, + "87": 3962.0, + "88": 3062.0, + "89": 3512.0, + "90": 3044.0, + "91": 2957.0, + "92": 3276.0, + "93": 2757.0, + "94": 3568.0, + "95": 3484.0, + "96": 3627.0, + "97": 3229.0, + "98": 3722.0, + "99": 3219.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 552238592.0, + "2": 552238592.0, + "3": 552238592.0, + "4": 552238592.0, + "5": 552238592.0, + "6": 552238592.0, + "7": 552238592.0, + "8": 552238592.0, + "9": 552238592.0, + "10": 552238592.0, + "11": 552238592.0, + "12": 552238592.0, + "13": 552238592.0, + "14": 552238592.0, + "15": 552238592.0, + "16": 552238592.0, + "17": 552238592.0, + "18": 552238592.0, + "19": 552238592.0, + "20": 552238592.0, + "21": 552238592.0, + "22": 552238592.0, + "23": 552238592.0, + "24": 552238592.0, + "25": 552238592.0, + "26": 552238592.0, + "27": 552238592.0, + "28": 552238592.0, + "29": 552238592.0, + "30": 552238592.0, + "31": 552238592.0, + "32": 552238592.0, + "33": 552238592.0, + "34": 552238592.0, + "35": 552238592.0, + "36": 552238592.0, + "37": 552238592.0, + "38": 552238592.0, + "39": 552238592.0, + "40": 552238592.0, + "41": 552238592.0, + "42": 552238592.0, + "43": 552238592.0, + "44": 552238592.0, + "45": 552238592.0, + "46": 552238592.0, + "47": 552238592.0, + "48": 552238592.0, + "49": 552238592.0, + "50": 552238592.0, + "51": 552238592.0, + "52": 552238592.0, + "53": 552238592.0, + "54": 552238592.0, + "55": 552238592.0, + "56": 552238592.0, + "57": 552238592.0, + "58": 552238592.0, + "59": 552238592.0, + "60": 552238592.0, + "61": 552238592.0, + "62": 552238592.0, + "63": 552238592.0, + "64": 552238592.0, + "65": 552238592.0, + "66": 552238592.0, + "67": 552238592.0, + "68": 552238592.0, + "69": 552238592.0, + "70": 552238592.0, + "71": 552238592.0, + "72": 552238592.0, + "73": 552238592.0, + "74": 552238592.0, + "75": 552238592.0, + "76": 552238592.0, + "77": 552238592.0, + "78": 552238592.0, + "79": 552238592.0, + "80": 552238592.0, + "81": 552238592.0, + "82": 552238592.0, + "83": 552238592.0, + "84": 552238592.0, + "85": 552238592.0, + "86": 552238592.0, + "87": 552238592.0, + "88": 552238592.0, + "89": 552238592.0, + "90": 552238592.0, + "91": 552238592.0, + "92": 552238592.0, + "93": 552238592.0, + "94": 552238592.0, + "95": 552238592.0, + "96": 552238592.0, + "97": 552238592.0, + "98": 552238592.0, + "99": 552238592.0, + "100": 552238592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4576563200.0, + "2": 4673179648.0, + "3": 4673179648.0, + "4": 4673179648.0, + "5": 4673179648.0, + "6": 4673179648.0, + "7": 4673179648.0, + "8": 4673179648.0, + "9": 4673179648.0, + "10": 4673179648.0, + "11": 4673179648.0, + "12": 4673179648.0, + "13": 4673179648.0, + "14": 4673179648.0, + "15": 4673179648.0, + "16": 4673179648.0, + "17": 4673179648.0, + "18": 4673179648.0, + "19": 4673179648.0, + "20": 4673179648.0, + "21": 4673179648.0, + "22": 4673179648.0, + "23": 4673179648.0, + "24": 4673179648.0, + "25": 4673179648.0, + "26": 4673179648.0, + "27": 4673179648.0, + "28": 4673179648.0, + "29": 4673179648.0, + "30": 4673179648.0, + "31": 4673179648.0, + "32": 4673179648.0, + "33": 4673179648.0, + "34": 4673179648.0, + "35": 4673179648.0, + "36": 4673179648.0, + "37": 4673179648.0, + "38": 4673179648.0, + "39": 4673179648.0, + "40": 4673179648.0, + "41": 4673179648.0, + "42": 4673179648.0, + "43": 4673179648.0, + "44": 4673179648.0, + "45": 4673179648.0, + "46": 4673179648.0, + "47": 4673179648.0, + "48": 4673179648.0, + "49": 4673179648.0, + "50": 4673179648.0, + "51": 4673179648.0, + "52": 4673179648.0, + "53": 4673179648.0, + "54": 4673179648.0, + "55": 4673179648.0, + "56": 4673179648.0, + "57": 4673179648.0, + "58": 4673179648.0, + "59": 4673179648.0, + "60": 4673179648.0, + "61": 4673179648.0, + "62": 4673179648.0, + "63": 4673179648.0, + "64": 4673179648.0, + "65": 4673179648.0, + "66": 4673179648.0, + "67": 4673179648.0, + "68": 4673179648.0, + "69": 4673179648.0, + "70": 4673179648.0, + "71": 4673179648.0, + "72": 4673179648.0, + "73": 4673179648.0, + "74": 4673179648.0, + "75": 4673179648.0, + "76": 4673179648.0, + "77": 4673179648.0, + "78": 4673179648.0, + "79": 4673179648.0, + "80": 4673179648.0, + "81": 4673179648.0, + "82": 4673179648.0, + "83": 4673179648.0, + "84": 4673179648.0, + "85": 4673179648.0, + "86": 4673179648.0, + "87": 4673179648.0, + "88": 4673179648.0, + "89": 4673179648.0, + "90": 4673179648.0, + "91": 4673179648.0, + "92": 4673179648.0, + "93": 4673179648.0, + "94": 4673179648.0, + "95": 4673179648.0, + "96": 4673179648.0, + "97": 4673179648.0, + "98": 4673179648.0, + "99": 4673179648.0, + "100": 4673179648.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.14508, + "2": 0.13504, + "3": 0.10484, + "4": 0.10489, + "5": 0.10473, + "6": 0.10497, + "7": 0.10413, + "8": 0.10536, + "9": 0.32726, + "10": 0.10707, + "11": 0.1004, + "12": 0.10131, + "13": 0.10126, + "14": 0.10152, + "15": 0.10011, + "16": 0.10055, + "17": 0.10006, + "18": 0.10008, + "19": 0.09902, + "20": 0.10043, + "21": 0.09943, + "22": 0.10108, + "23": 0.10016, + "24": 0.10055, + "25": 0.10767, + "26": 0.10062, + "27": 0.09965, + "28": 0.09956, + "29": 0.09902, + "30": 0.09994, + "31": 0.10043, + "32": 0.09913, + "33": 0.09934, + "34": 0.10116, + "35": 0.09881, + "36": 0.09921, + "37": 0.09882, + "38": 0.09871, + "39": 0.09864, + "40": 0.09965, + "41": 0.09923, + "42": 0.09939, + "43": 0.10071, + "44": 0.09983, + "45": 0.35882, + "46": 0.10188, + "47": 0.09992, + "48": 0.09983, + "49": 0.09848, + "50": 0.10049, + "51": 0.11806, + "52": 0.10549, + "53": 0.10158, + "54": 0.10548, + "55": 0.10224, + "56": 0.10244, + "57": 0.10391, + "58": 0.10383, + "59": 0.10417, + "60": 0.10737, + "61": 0.1029, + "62": 0.10202, + "63": 0.10011, + "64": 0.10594, + "65": 0.10093, + "66": 0.10168, + "67": 0.1008, + "68": 0.14562, + "69": 0.09913, + "70": 0.10262, + "71": 0.09958, + "72": 0.10173, + "73": 0.09928, + "74": 0.10376, + "75": 0.09944, + "76": 0.10143, + "77": 0.10005, + "78": 0.1033, + "79": 0.09996, + "80": 0.10114, + "81": 0.09988, + "82": 0.10093, + "83": 0.09908, + "84": 0.1014, + "85": 0.09925, + "86": 0.10175, + "87": 0.09965, + "88": 0.10189, + "89": 0.10015, + "90": 0.10099, + "91": 0.09925, + "92": 0.10123, + "93": 0.09879, + "94": 0.10599, + "95": 0.0991, + "96": 0.10147, + "97": 0.09941, + "98": 0.10245, + "99": 0.09902, + "100": 0.10071 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..2fd83504089 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.8403, + "9": 10.84359, + "10": 10.8134, + "11": 10.85025, + "12": 10.84316, + "13": 10.86605, + "14": 10.86315, + "15": 10.80276, + "16": 10.79643, + "17": 10.7763, + "18": 10.8015, + "19": 10.7939, + "20": 10.705, + "21": 10.68148, + "22": 10.56313, + "23": 10.70136, + "24": 10.57939, + "25": 10.53849, + "26": 10.60617, + "27": 10.59211, + "28": 10.56156, + "29": 10.57666, + "30": 10.35521, + "31": 10.12773, + "32": 10.46367, + "33": 10.45444, + "34": 10.22451, + "35": 10.27148, + "36": 10.22184, + "37": 10.33945, + "38": 10.18637, + "39": 10.39329, + "40": 10.08049, + "41": 10.13789, + "42": 10.20012, + "43": 9.83791, + "44": 9.94327, + "45": 9.8229, + "46": 9.82313, + "47": 10.13353, + "48": 9.8415, + "49": 9.52102, + "50": 9.90118, + "51": 9.83467, + "52": 9.73176, + "53": 10.04773, + "54": 9.93856, + "55": 9.86424, + "56": 9.61259, + "57": 9.46819, + "58": 9.81223, + "59": 9.57172, + "60": 9.4803, + "61": 9.67964, + "62": 9.96738, + "63": 9.35351, + "64": 9.7573, + "65": 8.93743, + "66": 9.68132, + "67": 9.35694, + "68": 9.7681, + "69": 9.77289, + "70": 9.71026, + "71": 9.60024, + "72": 9.56674, + "73": 9.47644, + "74": 8.93189, + "75": 9.4088, + "76": 9.06887, + "77": 10.04696, + "78": 9.70975, + "79": 9.35669, + "80": 9.39078, + "81": 9.46574, + "82": 9.68028, + "83": 9.29218, + "84": 9.40234, + "85": 9.59741, + "86": 9.06109, + "87": 9.57951, + "88": 9.73247, + "89": 9.58838, + "90": 9.80389, + "91": 9.32105, + "92": 9.35011, + "93": 9.06313, + "94": 8.82006, + "95": 9.50562, + "96": 9.51103, + "97": 9.29305, + "98": 9.65571, + "99": 8.87502, + "100": 9.38808 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1816.0, + "8": 1587.0, + "9": 1905.0, + "10": 1397.0, + "11": 1954.0, + "12": 1859.0, + "13": 1873.0, + "14": 1875.0, + "15": 1936.0, + "16": 1972.0, + "17": 1816.0, + "18": 1773.0, + "19": 1833.0, + "20": 1715.0, + "21": 1923.0, + "22": 1681.0, + "23": 2055.0, + "24": 1727.0, + "25": 1703.0, + "26": 1761.0, + "27": 1917.0, + "28": 1962.0, + "29": 2010.0, + "30": 1957.0, + "31": 1723.0, + "32": 1898.0, + "33": 2153.0, + "34": 1828.0, + "35": 1991.0, + "36": 1937.0, + "37": 2347.0, + "38": 2365.0, + "39": 2349.0, + "40": 2239.0, + "41": 2217.0, + "42": 2222.0, + "43": 2121.0, + "44": 2059.0, + "45": 2144.0, + "46": 2296.0, + "47": 2487.0, + "48": 2376.0, + "49": 2330.0, + "50": 2377.0, + "51": 2540.0, + "52": 2598.0, + "53": 2917.0, + "54": 2715.0, + "55": 2436.0, + "56": 2691.0, + "57": 2196.0, + "58": 2875.0, + "59": 2726.0, + "60": 2445.0, + "61": 3031.0, + "62": 2618.0, + "63": 2551.0, + "64": 2939.0, + "65": 2645.0, + "66": 3160.0, + "67": 2729.0, + "68": 2852.0, + "69": 2938.0, + "70": 3337.0, + "71": 3044.0, + "72": 2531.0, + "73": 2918.0, + "74": 1976.0, + "75": 2726.0, + "76": 3036.0, + "77": 3435.0, + "78": 3375.0, + "79": 3221.0, + "80": 3356.0, + "81": 3820.0, + "82": 3203.0, + "83": 2699.0, + "84": 3073.0, + "85": 3336.0, + "86": 2729.0, + "87": 3962.0, + "88": 3062.0, + "89": 3512.0, + "90": 3044.0, + "91": 2957.0, + "92": 3276.0, + "93": 2757.0, + "94": 3568.0, + "95": 3484.0, + "96": 3627.0, + "97": 3229.0, + "98": 3722.0, + "99": 3219.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 552238592.0, + "2": 552238592.0, + "3": 552238592.0, + "4": 552238592.0, + "5": 552238592.0, + "6": 552238592.0, + "7": 552238592.0, + "8": 552238592.0, + "9": 552238592.0, + "10": 552238592.0, + "11": 552238592.0, + "12": 552238592.0, + "13": 552238592.0, + "14": 552238592.0, + "15": 552238592.0, + "16": 552238592.0, + "17": 552238592.0, + "18": 552238592.0, + "19": 552238592.0, + "20": 552238592.0, + "21": 552238592.0, + "22": 552238592.0, + "23": 552238592.0, + "24": 552238592.0, + "25": 552238592.0, + "26": 552238592.0, + "27": 552238592.0, + "28": 552238592.0, + "29": 552238592.0, + "30": 552238592.0, + "31": 552238592.0, + "32": 552238592.0, + "33": 552238592.0, + "34": 552238592.0, + "35": 552238592.0, + "36": 552238592.0, + "37": 552238592.0, + "38": 552238592.0, + "39": 552238592.0, + "40": 552238592.0, + "41": 552238592.0, + "42": 552238592.0, + "43": 552238592.0, + "44": 552238592.0, + "45": 552238592.0, + "46": 552238592.0, + "47": 552238592.0, + "48": 552238592.0, + "49": 552238592.0, + "50": 552238592.0, + "51": 552238592.0, + "52": 552238592.0, + "53": 552238592.0, + "54": 552238592.0, + "55": 552238592.0, + "56": 552238592.0, + "57": 552238592.0, + "58": 552238592.0, + "59": 552238592.0, + "60": 552238592.0, + "61": 552238592.0, + "62": 552238592.0, + "63": 552238592.0, + "64": 552238592.0, + "65": 552238592.0, + "66": 552238592.0, + "67": 552238592.0, + "68": 552238592.0, + "69": 552238592.0, + "70": 552238592.0, + "71": 552238592.0, + "72": 552238592.0, + "73": 552238592.0, + "74": 552238592.0, + "75": 552238592.0, + "76": 552238592.0, + "77": 552238592.0, + "78": 552238592.0, + "79": 552238592.0, + "80": 552238592.0, + "81": 552238592.0, + "82": 552238592.0, + "83": 552238592.0, + "84": 552238592.0, + "85": 552238592.0, + "86": 552238592.0, + "87": 552238592.0, + "88": 552238592.0, + "89": 552238592.0, + "90": 552238592.0, + "91": 552238592.0, + "92": 552238592.0, + "93": 552238592.0, + "94": 552238592.0, + "95": 552238592.0, + "96": 552238592.0, + "97": 552238592.0, + "98": 552238592.0, + "99": 552238592.0, + "100": 552238592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4576563200.0, + "2": 4673179648.0, + "3": 4673179648.0, + "4": 4673179648.0, + "5": 4673179648.0, + "6": 4673179648.0, + "7": 4673179648.0, + "8": 4673179648.0, + "9": 4673179648.0, + "10": 4673179648.0, + "11": 4673179648.0, + "12": 4673179648.0, + "13": 4673179648.0, + "14": 4673179648.0, + "15": 4673179648.0, + "16": 4673179648.0, + "17": 4673179648.0, + "18": 4673179648.0, + "19": 4673179648.0, + "20": 4673179648.0, + "21": 4673179648.0, + "22": 4673179648.0, + "23": 4673179648.0, + "24": 4673179648.0, + "25": 4673179648.0, + "26": 4673179648.0, + "27": 4673179648.0, + "28": 4673179648.0, + "29": 4673179648.0, + "30": 4673179648.0, + "31": 4673179648.0, + "32": 4673179648.0, + "33": 4673179648.0, + "34": 4673179648.0, + "35": 4673179648.0, + "36": 4673179648.0, + "37": 4673179648.0, + "38": 4673179648.0, + "39": 4673179648.0, + "40": 4673179648.0, + "41": 4673179648.0, + "42": 4673179648.0, + "43": 4673179648.0, + "44": 4673179648.0, + "45": 4673179648.0, + "46": 4673179648.0, + "47": 4673179648.0, + "48": 4673179648.0, + "49": 4673179648.0, + "50": 4673179648.0, + "51": 4673179648.0, + "52": 4673179648.0, + "53": 4673179648.0, + "54": 4673179648.0, + "55": 4673179648.0, + "56": 4673179648.0, + "57": 4673179648.0, + "58": 4673179648.0, + "59": 4673179648.0, + "60": 4673179648.0, + "61": 4673179648.0, + "62": 4673179648.0, + "63": 4673179648.0, + "64": 4673179648.0, + "65": 4673179648.0, + "66": 4673179648.0, + "67": 4673179648.0, + "68": 4673179648.0, + "69": 4673179648.0, + "70": 4673179648.0, + "71": 4673179648.0, + "72": 4673179648.0, + "73": 4673179648.0, + "74": 4673179648.0, + "75": 4673179648.0, + "76": 4673179648.0, + "77": 4673179648.0, + "78": 4673179648.0, + "79": 4673179648.0, + "80": 4673179648.0, + "81": 4673179648.0, + "82": 4673179648.0, + "83": 4673179648.0, + "84": 4673179648.0, + "85": 4673179648.0, + "86": 4673179648.0, + "87": 4673179648.0, + "88": 4673179648.0, + "89": 4673179648.0, + "90": 4673179648.0, + "91": 4673179648.0, + "92": 4673179648.0, + "93": 4673179648.0, + "94": 4673179648.0, + "95": 4673179648.0, + "96": 4673179648.0, + "97": 4673179648.0, + "98": 4673179648.0, + "99": 4673179648.0, + "100": 4673179648.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.84608, + "2": 0.1383, + "3": 0.11074, + "4": 0.09988, + "5": 0.09832, + "6": 0.09852, + "7": 0.09942, + "8": 0.09887, + "9": 0.09982, + "10": 0.0999, + "11": 0.32507, + "12": 0.0997, + "13": 0.10073, + "14": 0.09862, + "15": 0.09903, + "16": 0.09813, + "17": 0.09854, + "18": 0.09827, + "19": 0.09818, + "20": 0.09782, + "21": 0.0976, + "22": 0.09763, + "23": 0.09742, + "24": 0.10007, + "25": 0.09709, + "26": 0.10028, + "27": 0.09967, + "28": 0.10005, + "29": 0.09819, + "30": 0.09782, + "31": 0.09728, + "32": 0.09707, + "33": 0.09712, + "34": 0.09768, + "35": 0.09779, + "36": 0.09761, + "37": 0.09958, + "38": 0.09866, + "39": 0.09784, + "40": 0.09877, + "41": 0.09772, + "42": 0.09833, + "43": 0.09811, + "44": 0.09781, + "45": 0.09781, + "46": 0.09827, + "47": 0.09771, + "48": 0.09763, + "49": 0.09768, + "50": 0.09899, + "51": 0.10947, + "52": 0.09886, + "53": 0.09597, + "54": 0.09838, + "55": 0.09729, + "56": 0.09695, + "57": 0.09961, + "58": 0.09847, + "59": 0.09888, + "60": 0.09635, + "61": 0.09692, + "62": 0.0979, + "63": 0.09738, + "64": 0.09561, + "65": 0.0984, + "66": 0.0969, + "67": 0.13611, + "68": 0.09631, + "69": 0.09564, + "70": 0.09538, + "71": 0.09557, + "72": 0.09548, + "73": 0.09581, + "74": 0.09593, + "75": 0.09489, + "76": 0.0959, + "77": 0.09486, + "78": 0.09568, + "79": 0.09634, + "80": 0.09468, + "81": 0.09589, + "82": 0.09598, + "83": 0.09489, + "84": 0.0954, + "85": 0.09413, + "86": 0.09499, + "87": 0.09424, + "88": 0.09411, + "89": 0.09598, + "90": 0.09549, + "91": 0.09452, + "92": 0.09467, + "93": 0.09619, + "94": 0.09523, + "95": 0.09445, + "96": 0.09426, + "97": 0.09435, + "98": 0.09523, + "99": 0.09534, + "100": 0.09547 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 5d9f1423ab0..dc66396ad6b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.89631, "5": 10.89154, "10": 10.88361, "15": 10.84803, "20": 10.74824, "25": 10.59309, "30": 10.41204, "35": 10.28189, "40": 10.09271, "45": 9.84194, "50": 9.91343, "55": 9.88574, "60": 9.50243, "65": 8.94516, "70": 9.74451, "75": 9.42524, "80": 9.40454, "85": 9.61295, "90": 9.81672, "95": 9.51841, "100": 9.39923}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1483.0, "5": 1903.0, "10": 1427.0, "15": 1980.0, "20": 1588.0, "25": 1649.0, "30": 1984.0, "35": 1921.0, "40": 2367.0, "45": 2184.0, "50": 2444.0, "55": 2503.0, "60": 2367.0, "65": 2605.0, "70": 3135.0, "75": 2556.0, "80": 3301.0, "85": 3380.0, "90": 3198.0, "95": 3431.0, "100": 3089.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1261848064.0, "5": 1261848064.0, "10": 1261848064.0, "15": 1261848064.0, "20": 1261848064.0, "25": 1261848064.0, "30": 1261848064.0, "35": 1261848064.0, "40": 1261848064.0, "45": 1261848064.0, "50": 1261848064.0, "55": 1261848064.0, "60": 1261848064.0, "65": 1261848064.0, "70": 1261848064.0, "75": 1261848064.0, "80": 1261848064.0, "85": 1261848064.0, "90": 1261848064.0, "95": 1261848064.0, "100": 1261848064.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2013852672.0, "5": 2563430400.0, "10": 2563430400.0, "15": 2563430400.0, "20": 2563430400.0, "25": 2563430400.0, "30": 2563430400.0, "35": 2563430400.0, "40": 2563430400.0, "45": 2563430400.0, "50": 2563430400.0, "55": 2563430400.0, "60": 2563430400.0, "65": 2563430400.0, "70": 2563430400.0, "75": 2563430400.0, "80": 2563430400.0, "85": 2563430400.0, "90": 2563430400.0, "95": 2563430400.0, "100": 2563430400.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.02119, "5": 0.07164, "10": 0.07403, "15": 0.07196, "20": 0.07295, "25": 0.07132, "30": 0.07266, "35": 0.07052, "40": 0.08274, "45": 0.07025, "50": 0.07178, "55": 0.0715, "60": 0.07114, "65": 0.07081, "70": 0.07243, "75": 0.07071, "80": 0.07039, "85": 0.07108, "90": 0.07278, "95": 0.07197, "100": 0.07038}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, + "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, + "10": 10.88361, + "11": 10.87817, + "12": 10.89334, + "13": 10.89814, + "14": 10.89242, + "15": 10.84803, + "16": 10.85398, + "17": 10.83097, + "18": 10.83991, + "19": 10.82801, + "20": 10.74824, + "21": 10.73496, + "22": 10.61719, + "23": 10.72621, + "24": 10.63178, + "25": 10.59309, + "26": 10.63369, + "27": 10.63304, + "28": 10.58264, + "29": 10.58594, + "30": 10.41204, + "31": 10.15899, + "32": 10.48366, + "33": 10.46706, + "34": 10.23811, + "35": 10.28189, + "36": 10.24056, + "37": 10.36219, + "38": 10.20309, + "39": 10.40454, + "40": 10.09271, + "41": 10.15835, + "42": 10.21933, + "43": 9.84358, + "44": 9.97303, + "45": 9.84194, + "46": 9.82017, + "47": 10.14969, + "48": 9.86023, + "49": 9.54235, + "50": 9.91343, + "51": 9.8545, + "52": 9.7393, + "53": 10.07426, + "54": 9.96913, + "55": 9.88574, + "56": 9.62438, + "57": 9.48229, + "58": 9.83484, + "59": 9.58731, + "60": 9.50243, + "61": 9.6934, + "62": 9.988, + "63": 9.39105, + "64": 9.78022, + "65": 8.94516, + "66": 9.70492, + "67": 9.37249, + "68": 9.78328, + "69": 9.79057, + "70": 9.74451, + "71": 9.62298, + "72": 9.58457, + "73": 9.50511, + "74": 8.94308, + "75": 9.42524, + "76": 9.07602, + "77": 10.06352, + "78": 9.72307, + "79": 9.37497, + "80": 9.40454, + "81": 9.4779, + "82": 9.69669, + "83": 9.30714, + "84": 9.41525, + "85": 9.61295, + "86": 9.07198, + "87": 9.58834, + "88": 9.7476, + "89": 9.59984, + "90": 9.81672, + "91": 9.33791, + "92": 9.35608, + "93": 9.07423, + "94": 8.83511, + "95": 9.51841, + "96": 9.52391, + "97": 9.30922, + "98": 9.66746, + "99": 8.88421, + "100": 9.39923 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, + "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, + "10": 1427.0, + "11": 1939.0, + "12": 1778.0, + "13": 1964.0, + "14": 1762.0, + "15": 1980.0, + "16": 1923.0, + "17": 1817.0, + "18": 1783.0, + "19": 1750.0, + "20": 1588.0, + "21": 1855.0, + "22": 1641.0, + "23": 2098.0, + "24": 1679.0, + "25": 1649.0, + "26": 1806.0, + "27": 1834.0, + "28": 2042.0, + "29": 2033.0, + "30": 1984.0, + "31": 1518.0, + "32": 1954.0, + "33": 2068.0, + "34": 1900.0, + "35": 1921.0, + "36": 1965.0, + "37": 2321.0, + "38": 2340.0, + "39": 2344.0, + "40": 2367.0, + "41": 2457.0, + "42": 2367.0, + "43": 2020.0, + "44": 2135.0, + "45": 2184.0, + "46": 2310.0, + "47": 2463.0, + "48": 2450.0, + "49": 2259.0, + "50": 2444.0, + "51": 2543.0, + "52": 2613.0, + "53": 2945.0, + "54": 2713.0, + "55": 2503.0, + "56": 2692.0, + "57": 2338.0, + "58": 2961.0, + "59": 2620.0, + "60": 2367.0, + "61": 2909.0, + "62": 2728.0, + "63": 2399.0, + "64": 2909.0, + "65": 2605.0, + "66": 2983.0, + "67": 2793.0, + "68": 2663.0, + "69": 2833.0, + "70": 3135.0, + "71": 2997.0, + "72": 2464.0, + "73": 3088.0, + "74": 1970.0, + "75": 2556.0, + "76": 3064.0, + "77": 3231.0, + "78": 3097.0, + "79": 3035.0, + "80": 3301.0, + "81": 3599.0, + "82": 3215.0, + "83": 2757.0, + "84": 3130.0, + "85": 3380.0, + "86": 2742.0, + "87": 3723.0, + "88": 3066.0, + "89": 3264.0, + "90": 3198.0, + "91": 2718.0, + "92": 3070.0, + "93": 2624.0, + "94": 3301.0, + "95": 3431.0, + "96": 3358.0, + "97": 3142.0, + "98": 3704.0, + "99": 3107.0, + "100": 3089.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1261848064.0, + "2": 1261848064.0, + "3": 1261848064.0, + "4": 1261848064.0, + "5": 1261848064.0, + "6": 1261848064.0, + "7": 1261848064.0, + "8": 1261848064.0, + "9": 1261848064.0, + "10": 1261848064.0, + "11": 1261848064.0, + "12": 1261848064.0, + "13": 1261848064.0, + "14": 1261848064.0, + "15": 1261848064.0, + "16": 1261848064.0, + "17": 1261848064.0, + "18": 1261848064.0, + "19": 1261848064.0, + "20": 1261848064.0, + "21": 1261848064.0, + "22": 1261848064.0, + "23": 1261848064.0, + "24": 1261848064.0, + "25": 1261848064.0, + "26": 1261848064.0, + "27": 1261848064.0, + "28": 1261848064.0, + "29": 1261848064.0, + "30": 1261848064.0, + "31": 1261848064.0, + "32": 1261848064.0, + "33": 1261848064.0, + "34": 1261848064.0, + "35": 1261848064.0, + "36": 1261848064.0, + "37": 1261848064.0, + "38": 1261848064.0, + "39": 1261848064.0, + "40": 1261848064.0, + "41": 1261848064.0, + "42": 1261848064.0, + "43": 1261848064.0, + "44": 1261848064.0, + "45": 1261848064.0, + "46": 1261848064.0, + "47": 1261848064.0, + "48": 1261848064.0, + "49": 1261848064.0, + "50": 1261848064.0, + "51": 1261848064.0, + "52": 1261848064.0, + "53": 1261848064.0, + "54": 1261848064.0, + "55": 1261848064.0, + "56": 1261848064.0, + "57": 1261848064.0, + "58": 1261848064.0, + "59": 1261848064.0, + "60": 1261848064.0, + "61": 1261848064.0, + "62": 1261848064.0, + "63": 1261848064.0, + "64": 1261848064.0, + "65": 1261848064.0, + "66": 1261848064.0, + "67": 1261848064.0, + "68": 1261848064.0, + "69": 1261848064.0, + "70": 1261848064.0, + "71": 1261848064.0, + "72": 1261848064.0, + "73": 1261848064.0, + "74": 1261848064.0, + "75": 1261848064.0, + "76": 1261848064.0, + "77": 1261848064.0, + "78": 1261848064.0, + "79": 1261848064.0, + "80": 1261848064.0, + "81": 1261848064.0, + "82": 1261848064.0, + "83": 1261848064.0, + "84": 1261848064.0, + "85": 1261848064.0, + "86": 1261848064.0, + "87": 1261848064.0, + "88": 1261848064.0, + "89": 1261848064.0, + "90": 1261848064.0, + "91": 1261848064.0, + "92": 1261848064.0, + "93": 1261848064.0, + "94": 1261848064.0, + "95": 1261848064.0, + "96": 1261848064.0, + "97": 1261848064.0, + "98": 1261848064.0, + "99": 1261848064.0, + "100": 1261848064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2013852672.0, + "2": 2563430400.0, + "3": 2563430400.0, + "4": 2563430400.0, + "5": 2563430400.0, + "6": 2563430400.0, + "7": 2563430400.0, + "8": 2563430400.0, + "9": 2563430400.0, + "10": 2563430400.0, + "11": 2563430400.0, + "12": 2563430400.0, + "13": 2563430400.0, + "14": 2563430400.0, + "15": 2563430400.0, + "16": 2563430400.0, + "17": 2563430400.0, + "18": 2563430400.0, + "19": 2563430400.0, + "20": 2563430400.0, + "21": 2563430400.0, + "22": 2563430400.0, + "23": 2563430400.0, + "24": 2563430400.0, + "25": 2563430400.0, + "26": 2563430400.0, + "27": 2563430400.0, + "28": 2563430400.0, + "29": 2563430400.0, + "30": 2563430400.0, + "31": 2563430400.0, + "32": 2563430400.0, + "33": 2563430400.0, + "34": 2563430400.0, + "35": 2563430400.0, + "36": 2563430400.0, + "37": 2563430400.0, + "38": 2563430400.0, + "39": 2563430400.0, + "40": 2563430400.0, + "41": 2563430400.0, + "42": 2563430400.0, + "43": 2563430400.0, + "44": 2563430400.0, + "45": 2563430400.0, + "46": 2563430400.0, + "47": 2563430400.0, + "48": 2563430400.0, + "49": 2563430400.0, + "50": 2563430400.0, + "51": 2563430400.0, + "52": 2563430400.0, + "53": 2563430400.0, + "54": 2563430400.0, + "55": 2563430400.0, + "56": 2563430400.0, + "57": 2563430400.0, + "58": 2563430400.0, + "59": 2563430400.0, + "60": 2563430400.0, + "61": 2563430400.0, + "62": 2563430400.0, + "63": 2563430400.0, + "64": 2563430400.0, + "65": 2563430400.0, + "66": 2563430400.0, + "67": 2563430400.0, + "68": 2563430400.0, + "69": 2563430400.0, + "70": 2563430400.0, + "71": 2563430400.0, + "72": 2563430400.0, + "73": 2563430400.0, + "74": 2563430400.0, + "75": 2563430400.0, + "76": 2563430400.0, + "77": 2563430400.0, + "78": 2563430400.0, + "79": 2563430400.0, + "80": 2563430400.0, + "81": 2563430400.0, + "82": 2563430400.0, + "83": 2563430400.0, + "84": 2563430400.0, + "85": 2563430400.0, + "86": 2563430400.0, + "87": 2563430400.0, + "88": 2563430400.0, + "89": 2563430400.0, + "90": 2563430400.0, + "91": 2563430400.0, + "92": 2563430400.0, + "93": 2563430400.0, + "94": 2563430400.0, + "95": 2563430400.0, + "96": 2563430400.0, + "97": 2563430400.0, + "98": 2563430400.0, + "99": 2563430400.0, + "100": 2563430400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.44856, + "2": 0.10562, + "3": 0.09824, + "4": 0.09657, + "5": 0.10604, + "6": 0.09627, + "7": 0.09681, + "8": 0.09299, + "9": 0.09413, + "10": 0.09401, + "11": 0.09341, + "12": 0.09223, + "13": 0.09373, + "14": 0.0936, + "15": 0.09439, + "16": 0.09285, + "17": 0.09422, + "18": 0.09511, + "19": 0.09966, + "20": 0.10107, + "21": 0.09445, + "22": 0.09548, + "23": 0.09554, + "24": 0.09478, + "25": 0.09465, + "26": 0.09292, + "27": 0.10339, + "28": 0.09562, + "29": 0.09593, + "30": 0.09298, + "31": 0.09573, + "32": 0.09264, + "33": 0.0942, + "34": 0.09203, + "35": 0.09537, + "36": 0.09222, + "37": 0.09501, + "38": 0.0938, + "39": 0.09662, + "40": 0.10355, + "41": 0.09832, + "42": 0.09636, + "43": 0.09409, + "44": 0.09306, + "45": 0.09367, + "46": 0.09321, + "47": 0.10415, + "48": 0.09382, + "49": 0.09322, + "50": 0.09238, + "51": 0.09596, + "52": 0.09089, + "53": 0.0918, + "54": 0.09088, + "55": 0.09144, + "56": 0.09049, + "57": 0.09241, + "58": 0.09222, + "59": 0.09415, + "60": 0.09271, + "61": 0.09208, + "62": 0.09152, + "63": 0.09266, + "64": 0.09085, + "65": 0.09196, + "66": 0.09181, + "67": 0.09397, + "68": 0.08963, + "69": 0.09222, + "70": 0.09229, + "71": 0.09614, + "72": 0.0904, + "73": 0.09323, + "74": 0.09152, + "75": 0.09189, + "76": 0.08973, + "77": 0.09202, + "78": 0.08991, + "79": 0.09241, + "80": 0.08986, + "81": 0.09353, + "82": 0.09206, + "83": 0.09177, + "84": 0.09067, + "85": 0.09271, + "86": 0.09133, + "87": 0.09239, + "88": 0.08972, + "89": 0.09242, + "90": 0.09005, + "91": 0.09389, + "92": 0.09396, + "93": 0.09776, + "94": 0.09824, + "95": 0.1008, + "96": 0.09732, + "97": 0.09819, + "98": 0.09221, + "99": 0.09502, + "100": 0.09143 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..b668a763f40 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, + "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, + "10": 10.88361, + "11": 10.87817, + "12": 10.89334, + "13": 10.89814, + "14": 10.89242, + "15": 10.84803, + "16": 10.85398, + "17": 10.83097, + "18": 10.83991, + "19": 10.82801, + "20": 10.74824, + "21": 10.73496, + "22": 10.61719, + "23": 10.72621, + "24": 10.63178, + "25": 10.59309, + "26": 10.63369, + "27": 10.63304, + "28": 10.58264, + "29": 10.58594, + "30": 10.41204, + "31": 10.15899, + "32": 10.48366, + "33": 10.46706, + "34": 10.23811, + "35": 10.28189, + "36": 10.24056, + "37": 10.36219, + "38": 10.20309, + "39": 10.40454, + "40": 10.09271, + "41": 10.15835, + "42": 10.21933, + "43": 9.84358, + "44": 9.97303, + "45": 9.84194, + "46": 9.82017, + "47": 10.14969, + "48": 9.86023, + "49": 9.54235, + "50": 9.91343, + "51": 9.8545, + "52": 9.7393, + "53": 10.07426, + "54": 9.96913, + "55": 9.88574, + "56": 9.62438, + "57": 9.48229, + "58": 9.83484, + "59": 9.58731, + "60": 9.50243, + "61": 9.6934, + "62": 9.988, + "63": 9.39105, + "64": 9.78022, + "65": 8.94516, + "66": 9.70492, + "67": 9.37249, + "68": 9.78328, + "69": 9.79057, + "70": 9.74451, + "71": 9.62298, + "72": 9.58457, + "73": 9.50511, + "74": 8.94308, + "75": 9.42524, + "76": 9.07602, + "77": 10.06352, + "78": 9.72307, + "79": 9.37497, + "80": 9.40454, + "81": 9.4779, + "82": 9.69669, + "83": 9.30714, + "84": 9.41525, + "85": 9.61295, + "86": 9.07198, + "87": 9.58834, + "88": 9.7476, + "89": 9.59984, + "90": 9.81672, + "91": 9.33791, + "92": 9.35608, + "93": 9.07423, + "94": 8.83511, + "95": 9.51841, + "96": 9.52391, + "97": 9.30922, + "98": 9.66746, + "99": 8.88421, + "100": 9.39923 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, + "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, + "10": 1427.0, + "11": 1939.0, + "12": 1778.0, + "13": 1964.0, + "14": 1762.0, + "15": 1980.0, + "16": 1923.0, + "17": 1817.0, + "18": 1783.0, + "19": 1750.0, + "20": 1588.0, + "21": 1855.0, + "22": 1641.0, + "23": 2098.0, + "24": 1679.0, + "25": 1649.0, + "26": 1806.0, + "27": 1834.0, + "28": 2042.0, + "29": 2033.0, + "30": 1984.0, + "31": 1518.0, + "32": 1954.0, + "33": 2068.0, + "34": 1900.0, + "35": 1921.0, + "36": 1965.0, + "37": 2321.0, + "38": 2340.0, + "39": 2344.0, + "40": 2367.0, + "41": 2457.0, + "42": 2367.0, + "43": 2020.0, + "44": 2135.0, + "45": 2184.0, + "46": 2310.0, + "47": 2463.0, + "48": 2450.0, + "49": 2259.0, + "50": 2444.0, + "51": 2543.0, + "52": 2613.0, + "53": 2945.0, + "54": 2713.0, + "55": 2503.0, + "56": 2692.0, + "57": 2338.0, + "58": 2961.0, + "59": 2620.0, + "60": 2367.0, + "61": 2909.0, + "62": 2728.0, + "63": 2399.0, + "64": 2909.0, + "65": 2605.0, + "66": 2983.0, + "67": 2793.0, + "68": 2663.0, + "69": 2833.0, + "70": 3135.0, + "71": 2997.0, + "72": 2464.0, + "73": 3088.0, + "74": 1970.0, + "75": 2556.0, + "76": 3064.0, + "77": 3231.0, + "78": 3097.0, + "79": 3035.0, + "80": 3301.0, + "81": 3599.0, + "82": 3215.0, + "83": 2757.0, + "84": 3130.0, + "85": 3380.0, + "86": 2742.0, + "87": 3723.0, + "88": 3066.0, + "89": 3264.0, + "90": 3198.0, + "91": 2718.0, + "92": 3070.0, + "93": 2624.0, + "94": 3301.0, + "95": 3431.0, + "96": 3358.0, + "97": 3142.0, + "98": 3704.0, + "99": 3107.0, + "100": 3089.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1261848064.0, + "2": 1261848064.0, + "3": 1261848064.0, + "4": 1261848064.0, + "5": 1261848064.0, + "6": 1261848064.0, + "7": 1261848064.0, + "8": 1261848064.0, + "9": 1261848064.0, + "10": 1261848064.0, + "11": 1261848064.0, + "12": 1261848064.0, + "13": 1261848064.0, + "14": 1261848064.0, + "15": 1261848064.0, + "16": 1261848064.0, + "17": 1261848064.0, + "18": 1261848064.0, + "19": 1261848064.0, + "20": 1261848064.0, + "21": 1261848064.0, + "22": 1261848064.0, + "23": 1261848064.0, + "24": 1261848064.0, + "25": 1261848064.0, + "26": 1261848064.0, + "27": 1261848064.0, + "28": 1261848064.0, + "29": 1261848064.0, + "30": 1261848064.0, + "31": 1261848064.0, + "32": 1261848064.0, + "33": 1261848064.0, + "34": 1261848064.0, + "35": 1261848064.0, + "36": 1261848064.0, + "37": 1261848064.0, + "38": 1261848064.0, + "39": 1261848064.0, + "40": 1261848064.0, + "41": 1261848064.0, + "42": 1261848064.0, + "43": 1261848064.0, + "44": 1261848064.0, + "45": 1261848064.0, + "46": 1261848064.0, + "47": 1261848064.0, + "48": 1261848064.0, + "49": 1261848064.0, + "50": 1261848064.0, + "51": 1261848064.0, + "52": 1261848064.0, + "53": 1261848064.0, + "54": 1261848064.0, + "55": 1261848064.0, + "56": 1261848064.0, + "57": 1261848064.0, + "58": 1261848064.0, + "59": 1261848064.0, + "60": 1261848064.0, + "61": 1261848064.0, + "62": 1261848064.0, + "63": 1261848064.0, + "64": 1261848064.0, + "65": 1261848064.0, + "66": 1261848064.0, + "67": 1261848064.0, + "68": 1261848064.0, + "69": 1261848064.0, + "70": 1261848064.0, + "71": 1261848064.0, + "72": 1261848064.0, + "73": 1261848064.0, + "74": 1261848064.0, + "75": 1261848064.0, + "76": 1261848064.0, + "77": 1261848064.0, + "78": 1261848064.0, + "79": 1261848064.0, + "80": 1261848064.0, + "81": 1261848064.0, + "82": 1261848064.0, + "83": 1261848064.0, + "84": 1261848064.0, + "85": 1261848064.0, + "86": 1261848064.0, + "87": 1261848064.0, + "88": 1261848064.0, + "89": 1261848064.0, + "90": 1261848064.0, + "91": 1261848064.0, + "92": 1261848064.0, + "93": 1261848064.0, + "94": 1261848064.0, + "95": 1261848064.0, + "96": 1261848064.0, + "97": 1261848064.0, + "98": 1261848064.0, + "99": 1261848064.0, + "100": 1261848064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2013852672.0, + "2": 2563430400.0, + "3": 2563430400.0, + "4": 2563430400.0, + "5": 2563430400.0, + "6": 2563430400.0, + "7": 2563430400.0, + "8": 2563430400.0, + "9": 2563430400.0, + "10": 2563430400.0, + "11": 2563430400.0, + "12": 2563430400.0, + "13": 2563430400.0, + "14": 2563430400.0, + "15": 2563430400.0, + "16": 2563430400.0, + "17": 2563430400.0, + "18": 2563430400.0, + "19": 2563430400.0, + "20": 2563430400.0, + "21": 2563430400.0, + "22": 2563430400.0, + "23": 2563430400.0, + "24": 2563430400.0, + "25": 2563430400.0, + "26": 2563430400.0, + "27": 2563430400.0, + "28": 2563430400.0, + "29": 2563430400.0, + "30": 2563430400.0, + "31": 2563430400.0, + "32": 2563430400.0, + "33": 2563430400.0, + "34": 2563430400.0, + "35": 2563430400.0, + "36": 2563430400.0, + "37": 2563430400.0, + "38": 2563430400.0, + "39": 2563430400.0, + "40": 2563430400.0, + "41": 2563430400.0, + "42": 2563430400.0, + "43": 2563430400.0, + "44": 2563430400.0, + "45": 2563430400.0, + "46": 2563430400.0, + "47": 2563430400.0, + "48": 2563430400.0, + "49": 2563430400.0, + "50": 2563430400.0, + "51": 2563430400.0, + "52": 2563430400.0, + "53": 2563430400.0, + "54": 2563430400.0, + "55": 2563430400.0, + "56": 2563430400.0, + "57": 2563430400.0, + "58": 2563430400.0, + "59": 2563430400.0, + "60": 2563430400.0, + "61": 2563430400.0, + "62": 2563430400.0, + "63": 2563430400.0, + "64": 2563430400.0, + "65": 2563430400.0, + "66": 2563430400.0, + "67": 2563430400.0, + "68": 2563430400.0, + "69": 2563430400.0, + "70": 2563430400.0, + "71": 2563430400.0, + "72": 2563430400.0, + "73": 2563430400.0, + "74": 2563430400.0, + "75": 2563430400.0, + "76": 2563430400.0, + "77": 2563430400.0, + "78": 2563430400.0, + "79": 2563430400.0, + "80": 2563430400.0, + "81": 2563430400.0, + "82": 2563430400.0, + "83": 2563430400.0, + "84": 2563430400.0, + "85": 2563430400.0, + "86": 2563430400.0, + "87": 2563430400.0, + "88": 2563430400.0, + "89": 2563430400.0, + "90": 2563430400.0, + "91": 2563430400.0, + "92": 2563430400.0, + "93": 2563430400.0, + "94": 2563430400.0, + "95": 2563430400.0, + "96": 2563430400.0, + "97": 2563430400.0, + "98": 2563430400.0, + "99": 2563430400.0, + "100": 2563430400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5.78359, + "2": 0.10731, + "3": 0.08283, + "4": 0.07992, + "5": 0.08439, + "6": 0.07969, + "7": 0.08163, + "8": 0.08089, + "9": 0.08141, + "10": 0.07975, + "11": 0.08161, + "12": 0.0805, + "13": 0.0818, + "14": 0.07991, + "15": 0.08157, + "16": 0.07954, + "17": 0.08164, + "18": 0.07926, + "19": 0.08125, + "20": 0.07966, + "21": 0.08124, + "22": 0.08103, + "23": 0.08196, + "24": 0.08021, + "25": 0.08231, + "26": 0.07972, + "27": 0.08528, + "28": 0.07953, + "29": 0.08123, + "30": 0.08056, + "31": 0.08212, + "32": 0.08047, + "33": 0.08698, + "34": 0.07962, + "35": 0.08139, + "36": 0.0794, + "37": 0.08328, + "38": 0.07999, + "39": 0.08718, + "40": 0.08108, + "41": 0.08156, + "42": 0.07929, + "43": 0.08201, + "44": 0.07973, + "45": 0.08159, + "46": 0.08471, + "47": 0.08541, + "48": 0.07975, + "49": 0.08192, + "50": 0.08031, + "51": 0.08385, + "52": 0.08324, + "53": 0.08018, + "54": 0.08375, + "55": 0.08221, + "56": 0.08137, + "57": 0.08577, + "58": 0.08166, + "59": 0.08204, + "60": 0.08143, + "61": 0.08073, + "62": 0.08115, + "63": 0.08107, + "64": 0.08084, + "65": 0.08278, + "66": 0.08197, + "67": 0.08122, + "68": 0.08061, + "69": 0.08097, + "70": 0.08354, + "71": 0.08073, + "72": 0.08394, + "73": 0.08209, + "74": 0.0827, + "75": 0.08731, + "76": 0.08195, + "77": 0.08148, + "78": 0.08314, + "79": 0.08109, + "80": 0.0807, + "81": 0.08051, + "82": 0.08191, + "83": 0.08724, + "84": 0.08176, + "85": 0.0832, + "86": 0.08166, + "87": 0.08365, + "88": 0.0816, + "89": 0.0817, + "90": 0.08103, + "91": 0.08096, + "92": 0.08046, + "93": 0.08298, + "94": 0.08019, + "95": 0.08128, + "96": 0.08237, + "97": 0.08167, + "98": 0.0806, + "99": 0.08319, + "100": 0.08202 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..df5117f4d8f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.89631, + "2": 10.89416, + "3": 10.88786, + "4": 10.8914, + "5": 10.89154, + "6": 10.90001, + "7": 10.89184, + "8": 10.89886, + "9": 10.90208, + "10": 10.88361, + "11": 10.87817, + "12": 10.89334, + "13": 10.89814, + "14": 10.89242, + "15": 10.84803, + "16": 10.85398, + "17": 10.83097, + "18": 10.83991, + "19": 10.82801, + "20": 10.74824, + "21": 10.73496, + "22": 10.61719, + "23": 10.72621, + "24": 10.63178, + "25": 10.59309, + "26": 10.63369, + "27": 10.63304, + "28": 10.58264, + "29": 10.58594, + "30": 10.41204, + "31": 10.15899, + "32": 10.48366, + "33": 10.46706, + "34": 10.23811, + "35": 10.28189, + "36": 10.24056, + "37": 10.36219, + "38": 10.20309, + "39": 10.40454, + "40": 10.09271, + "41": 10.15835, + "42": 10.21933, + "43": 9.84358, + "44": 9.97303, + "45": 9.84194, + "46": 9.82017, + "47": 10.14969, + "48": 9.86023, + "49": 9.54235, + "50": 9.91343, + "51": 9.8545, + "52": 9.7393, + "53": 10.07426, + "54": 9.96913, + "55": 9.88574, + "56": 9.62438, + "57": 9.48229, + "58": 9.83484, + "59": 9.58731, + "60": 9.50243, + "61": 9.6934, + "62": 9.988, + "63": 9.39105, + "64": 9.78022, + "65": 8.94516, + "66": 9.70492, + "67": 9.37249, + "68": 9.78328, + "69": 9.79057, + "70": 9.74451, + "71": 9.62298, + "72": 9.58457, + "73": 9.50511, + "74": 8.94308, + "75": 9.42524, + "76": 9.07602, + "77": 10.06352, + "78": 9.72307, + "79": 9.37497, + "80": 9.40454, + "81": 9.4779, + "82": 9.69669, + "83": 9.30714, + "84": 9.41525, + "85": 9.61295, + "86": 9.07198, + "87": 9.58834, + "88": 9.7476, + "89": 9.59984, + "90": 9.81672, + "91": 9.33791, + "92": 9.35608, + "93": 9.07423, + "94": 8.83511, + "95": 9.51841, + "96": 9.52391, + "97": 9.30922, + "98": 9.66746, + "99": 8.88421, + "100": 9.39923 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1483.0, + "2": 1650.0, + "3": 1681.0, + "4": 1767.0, + "5": 1903.0, + "6": 1952.0, + "7": 1967.0, + "8": 1651.0, + "9": 1886.0, + "10": 1427.0, + "11": 1939.0, + "12": 1778.0, + "13": 1964.0, + "14": 1762.0, + "15": 1980.0, + "16": 1923.0, + "17": 1817.0, + "18": 1783.0, + "19": 1750.0, + "20": 1588.0, + "21": 1855.0, + "22": 1641.0, + "23": 2098.0, + "24": 1679.0, + "25": 1649.0, + "26": 1806.0, + "27": 1834.0, + "28": 2042.0, + "29": 2033.0, + "30": 1984.0, + "31": 1518.0, + "32": 1954.0, + "33": 2068.0, + "34": 1900.0, + "35": 1921.0, + "36": 1965.0, + "37": 2321.0, + "38": 2340.0, + "39": 2344.0, + "40": 2367.0, + "41": 2457.0, + "42": 2367.0, + "43": 2020.0, + "44": 2135.0, + "45": 2184.0, + "46": 2310.0, + "47": 2463.0, + "48": 2450.0, + "49": 2259.0, + "50": 2444.0, + "51": 2543.0, + "52": 2613.0, + "53": 2945.0, + "54": 2713.0, + "55": 2503.0, + "56": 2692.0, + "57": 2338.0, + "58": 2961.0, + "59": 2620.0, + "60": 2367.0, + "61": 2909.0, + "62": 2728.0, + "63": 2399.0, + "64": 2909.0, + "65": 2605.0, + "66": 2983.0, + "67": 2793.0, + "68": 2663.0, + "69": 2833.0, + "70": 3135.0, + "71": 2997.0, + "72": 2464.0, + "73": 3088.0, + "74": 1970.0, + "75": 2556.0, + "76": 3064.0, + "77": 3231.0, + "78": 3097.0, + "79": 3035.0, + "80": 3301.0, + "81": 3599.0, + "82": 3215.0, + "83": 2757.0, + "84": 3130.0, + "85": 3380.0, + "86": 2742.0, + "87": 3723.0, + "88": 3066.0, + "89": 3264.0, + "90": 3198.0, + "91": 2718.0, + "92": 3070.0, + "93": 2624.0, + "94": 3301.0, + "95": 3431.0, + "96": 3358.0, + "97": 3142.0, + "98": 3704.0, + "99": 3107.0, + "100": 3089.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1261848064.0, + "2": 1261848064.0, + "3": 1261848064.0, + "4": 1261848064.0, + "5": 1261848064.0, + "6": 1261848064.0, + "7": 1261848064.0, + "8": 1261848064.0, + "9": 1261848064.0, + "10": 1261848064.0, + "11": 1261848064.0, + "12": 1261848064.0, + "13": 1261848064.0, + "14": 1261848064.0, + "15": 1261848064.0, + "16": 1261848064.0, + "17": 1261848064.0, + "18": 1261848064.0, + "19": 1261848064.0, + "20": 1261848064.0, + "21": 1261848064.0, + "22": 1261848064.0, + "23": 1261848064.0, + "24": 1261848064.0, + "25": 1261848064.0, + "26": 1261848064.0, + "27": 1261848064.0, + "28": 1261848064.0, + "29": 1261848064.0, + "30": 1261848064.0, + "31": 1261848064.0, + "32": 1261848064.0, + "33": 1261848064.0, + "34": 1261848064.0, + "35": 1261848064.0, + "36": 1261848064.0, + "37": 1261848064.0, + "38": 1261848064.0, + "39": 1261848064.0, + "40": 1261848064.0, + "41": 1261848064.0, + "42": 1261848064.0, + "43": 1261848064.0, + "44": 1261848064.0, + "45": 1261848064.0, + "46": 1261848064.0, + "47": 1261848064.0, + "48": 1261848064.0, + "49": 1261848064.0, + "50": 1261848064.0, + "51": 1261848064.0, + "52": 1261848064.0, + "53": 1261848064.0, + "54": 1261848064.0, + "55": 1261848064.0, + "56": 1261848064.0, + "57": 1261848064.0, + "58": 1261848064.0, + "59": 1261848064.0, + "60": 1261848064.0, + "61": 1261848064.0, + "62": 1261848064.0, + "63": 1261848064.0, + "64": 1261848064.0, + "65": 1261848064.0, + "66": 1261848064.0, + "67": 1261848064.0, + "68": 1261848064.0, + "69": 1261848064.0, + "70": 1261848064.0, + "71": 1261848064.0, + "72": 1261848064.0, + "73": 1261848064.0, + "74": 1261848064.0, + "75": 1261848064.0, + "76": 1261848064.0, + "77": 1261848064.0, + "78": 1261848064.0, + "79": 1261848064.0, + "80": 1261848064.0, + "81": 1261848064.0, + "82": 1261848064.0, + "83": 1261848064.0, + "84": 1261848064.0, + "85": 1261848064.0, + "86": 1261848064.0, + "87": 1261848064.0, + "88": 1261848064.0, + "89": 1261848064.0, + "90": 1261848064.0, + "91": 1261848064.0, + "92": 1261848064.0, + "93": 1261848064.0, + "94": 1261848064.0, + "95": 1261848064.0, + "96": 1261848064.0, + "97": 1261848064.0, + "98": 1261848064.0, + "99": 1261848064.0, + "100": 1261848064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2013852672.0, + "2": 2563430400.0, + "3": 2563430400.0, + "4": 2563430400.0, + "5": 2563430400.0, + "6": 2563430400.0, + "7": 2563430400.0, + "8": 2563430400.0, + "9": 2563430400.0, + "10": 2563430400.0, + "11": 2563430400.0, + "12": 2563430400.0, + "13": 2563430400.0, + "14": 2563430400.0, + "15": 2563430400.0, + "16": 2563430400.0, + "17": 2563430400.0, + "18": 2563430400.0, + "19": 2563430400.0, + "20": 2563430400.0, + "21": 2563430400.0, + "22": 2563430400.0, + "23": 2563430400.0, + "24": 2563430400.0, + "25": 2563430400.0, + "26": 2563430400.0, + "27": 2563430400.0, + "28": 2563430400.0, + "29": 2563430400.0, + "30": 2563430400.0, + "31": 2563430400.0, + "32": 2563430400.0, + "33": 2563430400.0, + "34": 2563430400.0, + "35": 2563430400.0, + "36": 2563430400.0, + "37": 2563430400.0, + "38": 2563430400.0, + "39": 2563430400.0, + "40": 2563430400.0, + "41": 2563430400.0, + "42": 2563430400.0, + "43": 2563430400.0, + "44": 2563430400.0, + "45": 2563430400.0, + "46": 2563430400.0, + "47": 2563430400.0, + "48": 2563430400.0, + "49": 2563430400.0, + "50": 2563430400.0, + "51": 2563430400.0, + "52": 2563430400.0, + "53": 2563430400.0, + "54": 2563430400.0, + "55": 2563430400.0, + "56": 2563430400.0, + "57": 2563430400.0, + "58": 2563430400.0, + "59": 2563430400.0, + "60": 2563430400.0, + "61": 2563430400.0, + "62": 2563430400.0, + "63": 2563430400.0, + "64": 2563430400.0, + "65": 2563430400.0, + "66": 2563430400.0, + "67": 2563430400.0, + "68": 2563430400.0, + "69": 2563430400.0, + "70": 2563430400.0, + "71": 2563430400.0, + "72": 2563430400.0, + "73": 2563430400.0, + "74": 2563430400.0, + "75": 2563430400.0, + "76": 2563430400.0, + "77": 2563430400.0, + "78": 2563430400.0, + "79": 2563430400.0, + "80": 2563430400.0, + "81": 2563430400.0, + "82": 2563430400.0, + "83": 2563430400.0, + "84": 2563430400.0, + "85": 2563430400.0, + "86": 2563430400.0, + "87": 2563430400.0, + "88": 2563430400.0, + "89": 2563430400.0, + "90": 2563430400.0, + "91": 2563430400.0, + "92": 2563430400.0, + "93": 2563430400.0, + "94": 2563430400.0, + "95": 2563430400.0, + "96": 2563430400.0, + "97": 2563430400.0, + "98": 2563430400.0, + "99": 2563430400.0, + "100": 2563430400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.36321, + "2": 0.1218, + "3": 0.11132, + "4": 0.10707, + "5": 0.0969, + "6": 0.09387, + "7": 0.09166, + "8": 0.09482, + "9": 0.09368, + "10": 0.09371, + "11": 0.0914, + "12": 0.09315, + "13": 0.09323, + "14": 0.09407, + "15": 0.09341, + "16": 0.09525, + "17": 0.09338, + "18": 0.09247, + "19": 0.09648, + "20": 0.09425, + "21": 0.09329, + "22": 0.09356, + "23": 0.09379, + "24": 0.09405, + "25": 0.0935, + "26": 0.09238, + "27": 0.09612, + "28": 0.09315, + "29": 0.09297, + "30": 0.09342, + "31": 0.09294, + "32": 0.09287, + "33": 0.09256, + "34": 0.09461, + "35": 0.09355, + "36": 0.09517, + "37": 0.09434, + "38": 0.0956, + "39": 0.09435, + "40": 0.09568, + "41": 0.09615, + "42": 0.09244, + "43": 0.09364, + "44": 0.09376, + "45": 0.09258, + "46": 0.09268, + "47": 0.09255, + "48": 0.09424, + "49": 0.09573, + "50": 0.09436, + "51": 0.0945, + "52": 0.09894, + "53": 0.09918, + "54": 0.09823, + "55": 0.09863, + "56": 0.09834, + "57": 0.09709, + "58": 0.09303, + "59": 0.09404, + "60": 0.09192, + "61": 0.09198, + "62": 0.09274, + "63": 0.09166, + "64": 0.09147, + "65": 0.09327, + "66": 0.11015, + "67": 0.09684, + "68": 0.09642, + "69": 0.09562, + "70": 0.0924, + "71": 0.09384, + "72": 0.09189, + "73": 0.09372, + "74": 0.09193, + "75": 0.09409, + "76": 0.09252, + "77": 0.09275, + "78": 0.09475, + "79": 0.0945, + "80": 0.10107, + "81": 0.09197, + "82": 0.09204, + "83": 0.09353, + "84": 0.09326, + "85": 0.09194, + "86": 0.1029, + "87": 0.09285, + "88": 0.09168, + "89": 0.09478, + "90": 0.09254, + "91": 0.0921, + "92": 0.09246, + "93": 0.09207, + "94": 0.09324, + "95": 0.09431, + "96": 0.09195, + "97": 0.09285, + "98": 0.09175, + "99": 0.09153, + "100": 0.11457 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..2fa4188369a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.84031, + "9": 10.84361, + "10": 10.81341, + "11": 10.85023, + "12": 10.84316, + "13": 10.86604, + "14": 10.86311, + "15": 10.80278, + "16": 10.79645, + "17": 10.77627, + "18": 10.80147, + "19": 10.79392, + "20": 10.70496, + "21": 10.68149, + "22": 10.56314, + "23": 10.70138, + "24": 10.57935, + "25": 10.53846, + "26": 10.60617, + "27": 10.5921, + "28": 10.56154, + "29": 10.57665, + "30": 10.35517, + "31": 10.1277, + "32": 10.46372, + "33": 10.45444, + "34": 10.22446, + "35": 10.27147, + "36": 10.22183, + "37": 10.33944, + "38": 10.18637, + "39": 10.39327, + "40": 10.08044, + "41": 10.13794, + "42": 10.20012, + "43": 9.8379, + "44": 9.9433, + "45": 9.82292, + "46": 9.8231, + "47": 10.13356, + "48": 9.84151, + "49": 9.52105, + "50": 9.90113, + "51": 9.83465, + "52": 9.73175, + "53": 10.04772, + "54": 9.93858, + "55": 9.86422, + "56": 9.61259, + "57": 9.46816, + "58": 9.81221, + "59": 9.57171, + "60": 9.48029, + "61": 9.67964, + "62": 9.96739, + "63": 9.35353, + "64": 9.75732, + "65": 8.93749, + "66": 9.68132, + "67": 9.357, + "68": 9.76807, + "69": 9.77288, + "70": 9.71025, + "71": 9.60021, + "72": 9.56674, + "73": 9.47644, + "74": 8.93192, + "75": 9.40879, + "76": 9.06885, + "77": 10.04691, + "78": 9.70976, + "79": 9.35666, + "80": 9.39077, + "81": 9.46573, + "82": 9.6803, + "83": 9.29215, + "84": 9.40239, + "85": 9.59743, + "86": 9.06112, + "87": 9.57954, + "88": 9.73247, + "89": 9.58838, + "90": 9.80386, + "91": 9.32104, + "92": 9.35012, + "93": 9.06314, + "94": 8.82007, + "95": 9.50565, + "96": 9.51099, + "97": 9.29311, + "98": 9.65573, + "99": 8.87504, + "100": 9.38812 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1815.0, + "8": 1592.0, + "9": 1968.0, + "10": 1436.0, + "11": 1923.0, + "12": 1867.0, + "13": 1888.0, + "14": 1807.0, + "15": 1918.0, + "16": 1922.0, + "17": 1774.0, + "18": 1735.0, + "19": 1886.0, + "20": 1786.0, + "21": 2020.0, + "22": 1685.0, + "23": 2112.0, + "24": 1657.0, + "25": 1610.0, + "26": 1815.0, + "27": 1880.0, + "28": 2025.0, + "29": 1975.0, + "30": 2039.0, + "31": 1713.0, + "32": 1926.0, + "33": 2163.0, + "34": 1894.0, + "35": 2001.0, + "36": 1963.0, + "37": 2401.0, + "38": 2324.0, + "39": 2351.0, + "40": 2321.0, + "41": 2266.0, + "42": 2317.0, + "43": 1999.0, + "44": 2133.0, + "45": 2205.0, + "46": 2324.0, + "47": 2463.0, + "48": 2447.0, + "49": 2237.0, + "50": 2365.0, + "51": 2534.0, + "52": 2604.0, + "53": 2995.0, + "54": 2699.0, + "55": 2489.0, + "56": 2680.0, + "57": 2285.0, + "58": 2976.0, + "59": 2816.0, + "60": 2508.0, + "61": 3075.0, + "62": 2710.0, + "63": 2574.0, + "64": 3027.0, + "65": 2719.0, + "66": 3182.0, + "67": 2770.0, + "68": 2875.0, + "69": 2961.0, + "70": 3241.0, + "71": 2859.0, + "72": 2495.0, + "73": 2972.0, + "74": 1989.0, + "75": 2643.0, + "76": 3012.0, + "77": 3398.0, + "78": 3413.0, + "79": 3272.0, + "80": 3368.0, + "81": 3656.0, + "82": 3228.0, + "83": 2772.0, + "84": 3146.0, + "85": 3336.0, + "86": 2738.0, + "87": 3886.0, + "88": 3044.0, + "89": 3429.0, + "90": 2961.0, + "91": 2952.0, + "92": 3239.0, + "93": 2791.0, + "94": 3583.0, + "95": 3533.0, + "96": 3530.0, + "97": 3241.0, + "98": 3680.0, + "99": 3320.0, + "100": 3432.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1230390272.0, + "2": 1230390272.0, + "3": 1230390272.0, + "4": 1230390272.0, + "5": 1230390272.0, + "6": 1230390272.0, + "7": 1230390272.0, + "8": 1230390272.0, + "9": 1230390272.0, + "10": 1230390272.0, + "11": 1230390272.0, + "12": 1230390272.0, + "13": 1230390272.0, + "14": 1230390272.0, + "15": 1230390272.0, + "16": 1230390272.0, + "17": 1230390272.0, + "18": 1230390272.0, + "19": 1230390272.0, + "20": 1230390272.0, + "21": 1230390272.0, + "22": 1230390272.0, + "23": 1230390272.0, + "24": 1230390272.0, + "25": 1230390272.0, + "26": 1230390272.0, + "27": 1230390272.0, + "28": 1230390272.0, + "29": 1230390272.0, + "30": 1230390272.0, + "31": 1230390272.0, + "32": 1230390272.0, + "33": 1230390272.0, + "34": 1230390272.0, + "35": 1230390272.0, + "36": 1230390272.0, + "37": 1230390272.0, + "38": 1230390272.0, + "39": 1230390272.0, + "40": 1230390272.0, + "41": 1230390272.0, + "42": 1230390272.0, + "43": 1230390272.0, + "44": 1230390272.0, + "45": 1230390272.0, + "46": 1230390272.0, + "47": 1230390272.0, + "48": 1230390272.0, + "49": 1230390272.0, + "50": 1230390272.0, + "51": 1230390272.0, + "52": 1230390272.0, + "53": 1230390272.0, + "54": 1230390272.0, + "55": 1230390272.0, + "56": 1230390272.0, + "57": 1230390272.0, + "58": 1230390272.0, + "59": 1230390272.0, + "60": 1230390272.0, + "61": 1230390272.0, + "62": 1230390272.0, + "63": 1230390272.0, + "64": 1230390272.0, + "65": 1230390272.0, + "66": 1230390272.0, + "67": 1230390272.0, + "68": 1230390272.0, + "69": 1230390272.0, + "70": 1230390272.0, + "71": 1230390272.0, + "72": 1230390272.0, + "73": 1230390272.0, + "74": 1230390272.0, + "75": 1230390272.0, + "76": 1230390272.0, + "77": 1230390272.0, + "78": 1230390272.0, + "79": 1230390272.0, + "80": 1230390272.0, + "81": 1230390272.0, + "82": 1230390272.0, + "83": 1230390272.0, + "84": 1230390272.0, + "85": 1230390272.0, + "86": 1230390272.0, + "87": 1230390272.0, + "88": 1230390272.0, + "89": 1230390272.0, + "90": 1230390272.0, + "91": 1230390272.0, + "92": 1230390272.0, + "93": 1230390272.0, + "94": 1230390272.0, + "95": 1230390272.0, + "96": 1230390272.0, + "97": 1230390272.0, + "98": 1230390272.0, + "99": 1230390272.0, + "100": 1230390272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1984492032.0, + "2": 2531972608.0, + "3": 2531972608.0, + "4": 2531972608.0, + "5": 2531972608.0, + "6": 2531972608.0, + "7": 2531972608.0, + "8": 2531972608.0, + "9": 2531972608.0, + "10": 2531972608.0, + "11": 2531972608.0, + "12": 2531972608.0, + "13": 2531972608.0, + "14": 2531972608.0, + "15": 2531972608.0, + "16": 2531972608.0, + "17": 2531972608.0, + "18": 2531972608.0, + "19": 2531972608.0, + "20": 2531972608.0, + "21": 2531972608.0, + "22": 2531972608.0, + "23": 2531972608.0, + "24": 2531972608.0, + "25": 2531972608.0, + "26": 2531972608.0, + "27": 2531972608.0, + "28": 2531972608.0, + "29": 2531972608.0, + "30": 2531972608.0, + "31": 2531972608.0, + "32": 2531972608.0, + "33": 2531972608.0, + "34": 2531972608.0, + "35": 2531972608.0, + "36": 2531972608.0, + "37": 2531972608.0, + "38": 2531972608.0, + "39": 2531972608.0, + "40": 2531972608.0, + "41": 2531972608.0, + "42": 2531972608.0, + "43": 2531972608.0, + "44": 2531972608.0, + "45": 2531972608.0, + "46": 2531972608.0, + "47": 2531972608.0, + "48": 2531972608.0, + "49": 2531972608.0, + "50": 2531972608.0, + "51": 2531972608.0, + "52": 2531972608.0, + "53": 2531972608.0, + "54": 2531972608.0, + "55": 2531972608.0, + "56": 2531972608.0, + "57": 2531972608.0, + "58": 2531972608.0, + "59": 2531972608.0, + "60": 2531972608.0, + "61": 2531972608.0, + "62": 2531972608.0, + "63": 2531972608.0, + "64": 2531972608.0, + "65": 2531972608.0, + "66": 2531972608.0, + "67": 2531972608.0, + "68": 2531972608.0, + "69": 2531972608.0, + "70": 2531972608.0, + "71": 2531972608.0, + "72": 2531972608.0, + "73": 2531972608.0, + "74": 2531972608.0, + "75": 2531972608.0, + "76": 2531972608.0, + "77": 2531972608.0, + "78": 2531972608.0, + "79": 2531972608.0, + "80": 2531972608.0, + "81": 2531972608.0, + "82": 2531972608.0, + "83": 2531972608.0, + "84": 2531972608.0, + "85": 2531972608.0, + "86": 2531972608.0, + "87": 2531972608.0, + "88": 2531972608.0, + "89": 2531972608.0, + "90": 2531972608.0, + "91": 2531972608.0, + "92": 2531972608.0, + "93": 2531972608.0, + "94": 2531972608.0, + "95": 2531972608.0, + "96": 2531972608.0, + "97": 2531972608.0, + "98": 2531972608.0, + "99": 2531972608.0, + "100": 2531972608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.66979, + "2": 0.15375, + "3": 0.13471, + "4": 0.1451, + "5": 0.13243, + "6": 0.13226, + "7": 0.14437, + "8": 0.13751, + "9": 0.1427, + "10": 0.14549, + "11": 0.14547, + "12": 0.14682, + "13": 0.40877, + "14": 0.1477, + "15": 0.15085, + "16": 0.14383, + "17": 0.15106, + "18": 0.14683, + "19": 0.14809, + "20": 0.1535, + "21": 0.14869, + "22": 0.14139, + "23": 0.16201, + "24": 0.15437, + "25": 0.14424, + "26": 0.15046, + "27": 0.14191, + "28": 0.14273, + "29": 0.14227, + "30": 0.14587, + "31": 0.14729, + "32": 0.14529, + "33": 0.14194, + "34": 0.14753, + "35": 0.14364, + "36": 0.15173, + "37": 0.15588, + "38": 0.17947, + "39": 0.16014, + "40": 0.16333, + "41": 0.15457, + "42": 0.17017, + "43": 0.13231, + "44": 0.13057, + "45": 0.13024, + "46": 0.1296, + "47": 0.13068, + "48": 0.12962, + "49": 0.13029, + "50": 0.13004, + "51": 0.13664, + "52": 0.1321, + "53": 0.13024, + "54": 0.16102, + "55": 0.15998, + "56": 0.16599, + "57": 0.1739, + "58": 0.1617, + "59": 0.16149, + "60": 0.15536, + "61": 0.19483, + "62": 0.18185, + "63": 0.17713, + "64": 0.20241, + "65": 0.2339, + "66": 0.19396, + "67": 0.18469, + "68": 0.13408, + "69": 0.13102, + "70": 0.13245, + "71": 0.1302, + "72": 0.13294, + "73": 0.13181, + "74": 0.13273, + "75": 0.13082, + "76": 0.13319, + "77": 0.13089, + "78": 0.13266, + "79": 0.13146, + "80": 0.13271, + "81": 0.13064, + "82": 0.133, + "83": 0.1325, + "84": 0.13269, + "85": 0.13105, + "86": 0.13314, + "87": 0.13059, + "88": 0.13244, + "89": 0.13183, + "90": 0.13294, + "91": 0.13281, + "92": 0.13352, + "93": 0.13201, + "94": 0.1343, + "95": 0.13224, + "96": 0.13339, + "97": 0.13189, + "98": 0.1351, + "99": 0.13191, + "100": 0.13277 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..a6e28752239 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8401, + "2": 10.83566, + "3": 10.82993, + "4": 10.8173, + "5": 10.84032, + "6": 10.87262, + "7": 10.83467, + "8": 10.84031, + "9": 10.84361, + "10": 10.81341, + "11": 10.85023, + "12": 10.84316, + "13": 10.86604, + "14": 10.86311, + "15": 10.80278, + "16": 10.79645, + "17": 10.77627, + "18": 10.80147, + "19": 10.79392, + "20": 10.70496, + "21": 10.68149, + "22": 10.56314, + "23": 10.70138, + "24": 10.57935, + "25": 10.53846, + "26": 10.60617, + "27": 10.5921, + "28": 10.56154, + "29": 10.57665, + "30": 10.35517, + "31": 10.1277, + "32": 10.46372, + "33": 10.45444, + "34": 10.22446, + "35": 10.27147, + "36": 10.22183, + "37": 10.33944, + "38": 10.18637, + "39": 10.39327, + "40": 10.08044, + "41": 10.13794, + "42": 10.20012, + "43": 9.8379, + "44": 9.9433, + "45": 9.82292, + "46": 9.8231, + "47": 10.13356, + "48": 9.84151, + "49": 9.52105, + "50": 9.90113, + "51": 9.83465, + "52": 9.73175, + "53": 10.04772, + "54": 9.93858, + "55": 9.86422, + "56": 9.61259, + "57": 9.46816, + "58": 9.81221, + "59": 9.57171, + "60": 9.48029, + "61": 9.67964, + "62": 9.96739, + "63": 9.35353, + "64": 9.75732, + "65": 8.93749, + "66": 9.68132, + "67": 9.357, + "68": 9.76807, + "69": 9.77288, + "70": 9.71025, + "71": 9.60021, + "72": 9.56674, + "73": 9.47644, + "74": 8.93192, + "75": 9.40879, + "76": 9.06885, + "77": 10.04691, + "78": 9.70976, + "79": 9.35666, + "80": 9.39077, + "81": 9.46573, + "82": 9.6803, + "83": 9.29215, + "84": 9.40239, + "85": 9.59743, + "86": 9.06112, + "87": 9.57954, + "88": 9.73247, + "89": 9.58838, + "90": 9.80386, + "91": 9.32104, + "92": 9.35012, + "93": 9.06314, + "94": 8.82007, + "95": 9.50565, + "96": 9.51099, + "97": 9.29311, + "98": 9.65573, + "99": 8.87504, + "100": 9.38812 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670.0, + "2": 1691.0, + "3": 1630.0, + "4": 1805.0, + "5": 1970.0, + "6": 1901.0, + "7": 1815.0, + "8": 1592.0, + "9": 1968.0, + "10": 1436.0, + "11": 1923.0, + "12": 1867.0, + "13": 1888.0, + "14": 1807.0, + "15": 1918.0, + "16": 1922.0, + "17": 1774.0, + "18": 1735.0, + "19": 1886.0, + "20": 1786.0, + "21": 2020.0, + "22": 1685.0, + "23": 2112.0, + "24": 1657.0, + "25": 1610.0, + "26": 1815.0, + "27": 1880.0, + "28": 2025.0, + "29": 1975.0, + "30": 2039.0, + "31": 1713.0, + "32": 1926.0, + "33": 2163.0, + "34": 1894.0, + "35": 2001.0, + "36": 1963.0, + "37": 2401.0, + "38": 2324.0, + "39": 2351.0, + "40": 2321.0, + "41": 2266.0, + "42": 2317.0, + "43": 1999.0, + "44": 2133.0, + "45": 2205.0, + "46": 2324.0, + "47": 2463.0, + "48": 2447.0, + "49": 2237.0, + "50": 2365.0, + "51": 2534.0, + "52": 2604.0, + "53": 2995.0, + "54": 2699.0, + "55": 2489.0, + "56": 2680.0, + "57": 2285.0, + "58": 2976.0, + "59": 2816.0, + "60": 2508.0, + "61": 3075.0, + "62": 2710.0, + "63": 2574.0, + "64": 3027.0, + "65": 2719.0, + "66": 3182.0, + "67": 2770.0, + "68": 2875.0, + "69": 2961.0, + "70": 3241.0, + "71": 2859.0, + "72": 2495.0, + "73": 2972.0, + "74": 1989.0, + "75": 2643.0, + "76": 3012.0, + "77": 3398.0, + "78": 3413.0, + "79": 3272.0, + "80": 3368.0, + "81": 3656.0, + "82": 3228.0, + "83": 2772.0, + "84": 3146.0, + "85": 3336.0, + "86": 2738.0, + "87": 3886.0, + "88": 3044.0, + "89": 3429.0, + "90": 2961.0, + "91": 2952.0, + "92": 3239.0, + "93": 2791.0, + "94": 3583.0, + "95": 3533.0, + "96": 3530.0, + "97": 3241.0, + "98": 3680.0, + "99": 3320.0, + "100": 3432.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1230390272.0, + "2": 1230390272.0, + "3": 1230390272.0, + "4": 1230390272.0, + "5": 1230390272.0, + "6": 1230390272.0, + "7": 1230390272.0, + "8": 1230390272.0, + "9": 1230390272.0, + "10": 1230390272.0, + "11": 1230390272.0, + "12": 1230390272.0, + "13": 1230390272.0, + "14": 1230390272.0, + "15": 1230390272.0, + "16": 1230390272.0, + "17": 1230390272.0, + "18": 1230390272.0, + "19": 1230390272.0, + "20": 1230390272.0, + "21": 1230390272.0, + "22": 1230390272.0, + "23": 1230390272.0, + "24": 1230390272.0, + "25": 1230390272.0, + "26": 1230390272.0, + "27": 1230390272.0, + "28": 1230390272.0, + "29": 1230390272.0, + "30": 1230390272.0, + "31": 1230390272.0, + "32": 1230390272.0, + "33": 1230390272.0, + "34": 1230390272.0, + "35": 1230390272.0, + "36": 1230390272.0, + "37": 1230390272.0, + "38": 1230390272.0, + "39": 1230390272.0, + "40": 1230390272.0, + "41": 1230390272.0, + "42": 1230390272.0, + "43": 1230390272.0, + "44": 1230390272.0, + "45": 1230390272.0, + "46": 1230390272.0, + "47": 1230390272.0, + "48": 1230390272.0, + "49": 1230390272.0, + "50": 1230390272.0, + "51": 1230390272.0, + "52": 1230390272.0, + "53": 1230390272.0, + "54": 1230390272.0, + "55": 1230390272.0, + "56": 1230390272.0, + "57": 1230390272.0, + "58": 1230390272.0, + "59": 1230390272.0, + "60": 1230390272.0, + "61": 1230390272.0, + "62": 1230390272.0, + "63": 1230390272.0, + "64": 1230390272.0, + "65": 1230390272.0, + "66": 1230390272.0, + "67": 1230390272.0, + "68": 1230390272.0, + "69": 1230390272.0, + "70": 1230390272.0, + "71": 1230390272.0, + "72": 1230390272.0, + "73": 1230390272.0, + "74": 1230390272.0, + "75": 1230390272.0, + "76": 1230390272.0, + "77": 1230390272.0, + "78": 1230390272.0, + "79": 1230390272.0, + "80": 1230390272.0, + "81": 1230390272.0, + "82": 1230390272.0, + "83": 1230390272.0, + "84": 1230390272.0, + "85": 1230390272.0, + "86": 1230390272.0, + "87": 1230390272.0, + "88": 1230390272.0, + "89": 1230390272.0, + "90": 1230390272.0, + "91": 1230390272.0, + "92": 1230390272.0, + "93": 1230390272.0, + "94": 1230390272.0, + "95": 1230390272.0, + "96": 1230390272.0, + "97": 1230390272.0, + "98": 1230390272.0, + "99": 1230390272.0, + "100": 1230390272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1984492032.0, + "2": 2531972608.0, + "3": 2531972608.0, + "4": 2531972608.0, + "5": 2531972608.0, + "6": 2531972608.0, + "7": 2531972608.0, + "8": 2531972608.0, + "9": 2531972608.0, + "10": 2531972608.0, + "11": 2531972608.0, + "12": 2531972608.0, + "13": 2531972608.0, + "14": 2531972608.0, + "15": 2531972608.0, + "16": 2531972608.0, + "17": 2531972608.0, + "18": 2531972608.0, + "19": 2531972608.0, + "20": 2531972608.0, + "21": 2531972608.0, + "22": 2531972608.0, + "23": 2531972608.0, + "24": 2531972608.0, + "25": 2531972608.0, + "26": 2531972608.0, + "27": 2531972608.0, + "28": 2531972608.0, + "29": 2531972608.0, + "30": 2531972608.0, + "31": 2531972608.0, + "32": 2531972608.0, + "33": 2531972608.0, + "34": 2531972608.0, + "35": 2531972608.0, + "36": 2531972608.0, + "37": 2531972608.0, + "38": 2531972608.0, + "39": 2531972608.0, + "40": 2531972608.0, + "41": 2531972608.0, + "42": 2531972608.0, + "43": 2531972608.0, + "44": 2531972608.0, + "45": 2531972608.0, + "46": 2531972608.0, + "47": 2531972608.0, + "48": 2531972608.0, + "49": 2531972608.0, + "50": 2531972608.0, + "51": 2531972608.0, + "52": 2531972608.0, + "53": 2531972608.0, + "54": 2531972608.0, + "55": 2531972608.0, + "56": 2531972608.0, + "57": 2531972608.0, + "58": 2531972608.0, + "59": 2531972608.0, + "60": 2531972608.0, + "61": 2531972608.0, + "62": 2531972608.0, + "63": 2531972608.0, + "64": 2531972608.0, + "65": 2531972608.0, + "66": 2531972608.0, + "67": 2531972608.0, + "68": 2531972608.0, + "69": 2531972608.0, + "70": 2531972608.0, + "71": 2531972608.0, + "72": 2531972608.0, + "73": 2531972608.0, + "74": 2531972608.0, + "75": 2531972608.0, + "76": 2531972608.0, + "77": 2531972608.0, + "78": 2531972608.0, + "79": 2531972608.0, + "80": 2531972608.0, + "81": 2531972608.0, + "82": 2531972608.0, + "83": 2531972608.0, + "84": 2531972608.0, + "85": 2531972608.0, + "86": 2531972608.0, + "87": 2531972608.0, + "88": 2531972608.0, + "89": 2531972608.0, + "90": 2531972608.0, + "91": 2531972608.0, + "92": 2531972608.0, + "93": 2531972608.0, + "94": 2531972608.0, + "95": 2531972608.0, + "96": 2531972608.0, + "97": 2531972608.0, + "98": 2531972608.0, + "99": 2531972608.0, + "100": 2531972608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.69156, + "2": 0.15851, + "3": 0.15939, + "4": 0.14587, + "5": 0.13996, + "6": 0.14246, + "7": 0.14168, + "8": 0.13947, + "9": 0.1406, + "10": 0.13629, + "11": 0.38438, + "12": 0.13502, + "13": 0.13606, + "14": 0.14033, + "15": 0.13443, + "16": 0.13179, + "17": 0.13378, + "18": 0.13167, + "19": 0.13416, + "20": 0.134, + "21": 0.13338, + "22": 0.13341, + "23": 0.13463, + "24": 0.13194, + "25": 0.13343, + "26": 0.13151, + "27": 0.13224, + "28": 0.13211, + "29": 0.13154, + "30": 0.13114, + "31": 0.13127, + "32": 0.13156, + "33": 0.13112, + "34": 0.13133, + "35": 0.13254, + "36": 0.1314, + "37": 0.13112, + "38": 0.13159, + "39": 0.13294, + "40": 0.1325, + "41": 0.1311, + "42": 0.13177, + "43": 0.13171, + "44": 0.13171, + "45": 0.1308, + "46": 0.13012, + "47": 0.13104, + "48": 0.13108, + "49": 0.13129, + "50": 0.13155, + "51": 0.15273, + "52": 0.1324, + "53": 0.13236, + "54": 0.13244, + "55": 0.13198, + "56": 0.1336, + "57": 0.13148, + "58": 0.13225, + "59": 0.13123, + "60": 0.13225, + "61": 0.13307, + "62": 0.13259, + "63": 0.13191, + "64": 0.13297, + "65": 0.13243, + "66": 0.13236, + "67": 0.1309, + "68": 0.13226, + "69": 0.13072, + "70": 0.13171, + "71": 0.13137, + "72": 0.13229, + "73": 0.13521, + "74": 0.13296, + "75": 0.13526, + "76": 0.13228, + "77": 0.13205, + "78": 0.13248, + "79": 0.13355, + "80": 0.13311, + "81": 0.13269, + "82": 0.13199, + "83": 0.13576, + "84": 0.13205, + "85": 0.13411, + "86": 0.13176, + "87": 0.13273, + "88": 0.13166, + "89": 0.13262, + "90": 0.13138, + "91": 0.13261, + "92": 0.13197, + "93": 0.13258, + "94": 0.13132, + "95": 0.13295, + "96": 0.1307, + "97": 0.13291, + "98": 0.13163, + "99": 0.13281, + "100": 0.13201 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 8f2ce322a3e..8056e7174f0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85787, + "2": 10.87336, + "3": 10.86821, + "4": 10.87255, "5": 10.87398, + "6": 10.89631, + "7": 10.86379, + "8": 10.87834, + "9": 10.87399, "10": 10.83714, + "11": 10.86988, + "12": 10.85947, + "13": 10.87777, + "14": 10.87924, "15": 10.81888, + "16": 10.83058, + "17": 10.78684, + "18": 10.80146, + "19": 10.79775, "20": 10.71155, + "21": 10.6865, + "22": 10.55277, + "23": 10.7014, + "24": 10.58527, "25": 10.52658, + "26": 10.58299, + "27": 10.59487, + "28": 10.54787, + "29": 10.55928, "30": 10.32818, + "31": 10.08272, + "32": 10.44699, + "33": 10.42755, + "34": 10.17932, "35": 10.24095, + "36": 10.18094, + "37": 10.32809, + "38": 10.16727, + "39": 10.37344, "40": 10.05079, + "41": 10.10728, + "42": 10.17799, + "43": 9.77846, + "44": 9.91207, "45": 9.77392, + "46": 9.75431, + "47": 10.09497, + "48": 9.79523, + "49": 9.46391, "50": 9.8673, + "51": 9.80381, + "52": 9.68202, + "53": 10.02345, + "54": 9.91634, "55": 9.82456, + "56": 9.56974, + "57": 9.42672, + "58": 9.78081, + "59": 9.53243, "60": 9.44593, + "61": 9.64254, + "62": 9.94293, + "63": 9.31764, + "64": 9.72548, "65": 8.88739, + "66": 9.65691, + "67": 9.31749, + "68": 9.73495, + "69": 9.74866, "70": 9.69625, + "71": 9.57689, + "72": 9.52422, + "73": 9.45595, + "74": 8.88269, "75": 9.37584, + "76": 9.01136, + "77": 10.02287, + "78": 9.67963, + "79": 9.33172, "80": 9.35826, + "81": 9.43394, + "82": 9.65054, + "83": 9.25503, + "84": 9.3714, "85": 9.5623, + "86": 9.03489, + "87": 9.54614, + "88": 9.69785, + "89": 9.54656, "90": 9.77624, + "91": 9.2884, + "92": 9.30662, + "93": 9.02647, + "94": 8.78837, "95": 9.48027, + "96": 9.47974, + "97": 9.25611, + "98": 9.61949, + "99": 8.83824, "100": 9.35135 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1858.0, + "2": 1854.0, + "3": 1803.0, + "4": 1955.0, "5": 2000.0, + "6": 2036.0, + "7": 1932.0, + "8": 1791.0, + "9": 1935.0, "10": 1654.0, + "11": 2080.0, + "12": 1881.0, + "13": 1977.0, + "14": 2080.0, "15": 1957.0, + "16": 1910.0, + "17": 1974.0, + "18": 1896.0, + "19": 1955.0, "20": 1816.0, + "21": 1906.0, + "22": 1972.0, + "23": 2062.0, + "24": 1897.0, "25": 1830.0, + "26": 1788.0, + "27": 1849.0, + "28": 2008.0, + "29": 2128.0, "30": 1969.0, + "31": 1630.0, + "32": 2057.0, + "33": 2171.0, + "34": 1947.0, "35": 2097.0, + "36": 1972.0, + "37": 2348.0, + "38": 2186.0, + "39": 2378.0, "40": 2181.0, + "41": 2326.0, + "42": 2334.0, + "43": 2219.0, + "44": 2234.0, "45": 2231.0, + "46": 2229.0, + "47": 2449.0, + "48": 2439.0, + "49": 2159.0, "50": 2290.0, + "51": 2514.0, + "52": 2513.0, + "53": 2894.0, + "54": 2656.0, "55": 2348.0, + "56": 2506.0, + "57": 2501.0, + "58": 2770.0, + "59": 2681.0, "60": 2434.0, + "61": 2776.0, + "62": 2596.0, + "63": 2617.0, + "64": 3012.0, "65": 2657.0, + "66": 2947.0, + "67": 3089.0, + "68": 2818.0, + "69": 2909.0, "70": 3025.0, + "71": 2924.0, + "72": 2702.0, + "73": 2947.0, + "74": 2306.0, "75": 2791.0, + "76": 3093.0, + "77": 3107.0, + "78": 3134.0, + "79": 3205.0, "80": 3123.0, + "81": 3290.0, + "82": 3172.0, + "83": 2719.0, + "84": 3328.0, "85": 3255.0, + "86": 2546.0, + "87": 3472.0, + "88": 3068.0, + "89": 2953.0, "90": 3300.0, + "91": 3154.0, + "92": 3061.0, + "93": 2889.0, + "94": 3535.0, "95": 3078.0, + "96": 3181.0, + "97": 3135.0, + "98": 3569.0, + "99": 3319.0, "100": 3223.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, "100": 921653248.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2237722624.0, + "2": 2600334336.0, + "3": 2600334336.0, + "4": 2600334336.0, "5": 2600334336.0, + "6": 2600334336.0, + "7": 2600334336.0, + "8": 2600334336.0, + "9": 2600334336.0, "10": 2600334336.0, + "11": 2600334336.0, + "12": 2600334336.0, + "13": 2600334336.0, + "14": 2600334336.0, "15": 2600334336.0, + "16": 2600334336.0, + "17": 2600334336.0, + "18": 2600334336.0, + "19": 2600334336.0, "20": 2600334336.0, + "21": 2600334336.0, + "22": 2600334336.0, + "23": 2600334336.0, + "24": 2600334336.0, "25": 2600334336.0, + "26": 2600334336.0, + "27": 2600334336.0, + "28": 2600334336.0, + "29": 2600334336.0, "30": 2600334336.0, + "31": 2600334336.0, + "32": 2600334336.0, + "33": 2600334336.0, + "34": 2600334336.0, "35": 2600334336.0, + "36": 2600334336.0, + "37": 2600334336.0, + "38": 2600334336.0, + "39": 2600334336.0, "40": 2600334336.0, + "41": 2600334336.0, + "42": 2600334336.0, + "43": 2600334336.0, + "44": 2600334336.0, "45": 2600334336.0, + "46": 2600334336.0, + "47": 2600334336.0, + "48": 2600334336.0, + "49": 2600334336.0, "50": 2600334336.0, + "51": 2600334336.0, + "52": 2600334336.0, + "53": 2600334336.0, + "54": 2600334336.0, "55": 2600334336.0, + "56": 2600334336.0, + "57": 2600334336.0, + "58": 2600334336.0, + "59": 2600334336.0, "60": 2600334336.0, + "61": 2600334336.0, + "62": 2600334336.0, + "63": 2600334336.0, + "64": 2600334336.0, "65": 2600334336.0, + "66": 2600334336.0, + "67": 2600334336.0, + "68": 2600334336.0, + "69": 2600334336.0, "70": 2600334336.0, + "71": 2600334336.0, + "72": 2600334336.0, + "73": 2600334336.0, + "74": 2600334336.0, "75": 2600334336.0, + "76": 2600334336.0, + "77": 2600334336.0, + "78": 2600334336.0, + "79": 2600334336.0, "80": 2600334336.0, + "81": 2600334336.0, + "82": 2600334336.0, + "83": 2600334336.0, + "84": 2600334336.0, "85": 2600334336.0, + "86": 2600334336.0, + "87": 2600334336.0, + "88": 2600334336.0, + "89": 2600334336.0, "90": 2600334336.0, + "91": 2600334336.0, + "92": 2600334336.0, + "93": 2600334336.0, + "94": 2600334336.0, "95": 2600334336.0, + "96": 2600334336.0, + "97": 2600334336.0, + "98": 2600334336.0, + "99": 2600334336.0, "100": 2600334336.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 10.274, - "5": 0.08611, - "10": 0.08856, - "15": 0.09559, - "20": 0.08781, - "25": 0.0877, - "30": 0.08743, - "35": 0.08748, - "40": 0.08658, - "45": 0.08701, - "50": 0.08685, - "55": 0.08673, - "60": 0.08608, - "65": 0.08606, - "70": 0.08638, - "75": 0.08694, - "80": 0.08748, - "85": 0.08937, - "90": 0.08844, - "95": 0.08722, - "100": 0.08813 + "1": 10.43555, + "2": 0.12658, + "3": 0.11069, + "4": 0.10147, + "5": 0.10118, + "6": 0.10108, + "7": 0.10059, + "8": 0.09885, + "9": 0.10197, + "10": 0.10148, + "11": 0.10092, + "12": 0.10046, + "13": 0.10111, + "14": 0.10211, + "15": 0.10226, + "16": 0.10138, + "17": 0.10161, + "18": 0.10294, + "19": 0.10161, + "20": 0.10231, + "21": 0.10295, + "22": 0.10337, + "23": 0.10219, + "24": 0.10301, + "25": 0.10137, + "26": 0.10266, + "27": 0.10223, + "28": 0.10298, + "29": 0.1033, + "30": 0.1033, + "31": 0.10269, + "32": 0.1022, + "33": 0.10279, + "34": 0.1017, + "35": 0.1017, + "36": 0.10155, + "37": 0.1018, + "38": 0.10278, + "39": 0.10226, + "40": 0.10208, + "41": 0.10264, + "42": 0.10119, + "43": 0.10372, + "44": 0.10116, + "45": 0.1015, + "46": 0.09996, + "47": 0.10089, + "48": 0.10148, + "49": 0.10042, + "50": 0.09948, + "51": 0.10234, + "52": 0.10011, + "53": 0.09939, + "54": 0.09905, + "55": 0.1003, + "56": 0.09964, + "57": 0.10028, + "58": 0.10099, + "59": 0.09982, + "60": 0.09923, + "61": 0.09876, + "62": 0.09945, + "63": 0.10026, + "64": 0.09913, + "65": 0.09908, + "66": 0.10039, + "67": 0.10115, + "68": 0.10055, + "69": 0.09942, + "70": 0.09949, + "71": 0.09986, + "72": 0.10015, + "73": 0.10084, + "74": 0.10077, + "75": 0.09933, + "76": 0.10121, + "77": 0.09959, + "78": 0.09938, + "79": 0.0991, + "80": 0.09802, + "81": 0.10115, + "82": 0.09939, + "83": 0.09963, + "84": 0.0992, + "85": 0.09904, + "86": 0.1026, + "87": 0.09983, + "88": 0.10128, + "89": 0.09897, + "90": 0.09918, + "91": 0.10029, + "92": 0.09877, + "93": 0.09988, + "94": 0.09933, + "95": 0.10109, + "96": 0.10013, + "97": 0.10103, + "98": 0.10004, + "99": 0.09987, + "100": 0.09979 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..482e2d753b9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85787, + "2": 10.87336, + "3": 10.86821, + "4": 10.87255, + "5": 10.87398, + "6": 10.89631, + "7": 10.86379, + "8": 10.87834, + "9": 10.87399, + "10": 10.83714, + "11": 10.86988, + "12": 10.85947, + "13": 10.87777, + "14": 10.87924, + "15": 10.81888, + "16": 10.83058, + "17": 10.78684, + "18": 10.80146, + "19": 10.79775, + "20": 10.71155, + "21": 10.6865, + "22": 10.55277, + "23": 10.7014, + "24": 10.58527, + "25": 10.52658, + "26": 10.58299, + "27": 10.59487, + "28": 10.54787, + "29": 10.55928, + "30": 10.32818, + "31": 10.08272, + "32": 10.44699, + "33": 10.42755, + "34": 10.17932, + "35": 10.24095, + "36": 10.18094, + "37": 10.32809, + "38": 10.16727, + "39": 10.37344, + "40": 10.05079, + "41": 10.10728, + "42": 10.17799, + "43": 9.77846, + "44": 9.91207, + "45": 9.77392, + "46": 9.75431, + "47": 10.09497, + "48": 9.79523, + "49": 9.46391, + "50": 9.8673, + "51": 9.80381, + "52": 9.68202, + "53": 10.02345, + "54": 9.91634, + "55": 9.82456, + "56": 9.56974, + "57": 9.42672, + "58": 9.78081, + "59": 9.53243, + "60": 9.44593, + "61": 9.64254, + "62": 9.94293, + "63": 9.31764, + "64": 9.72548, + "65": 8.88739, + "66": 9.65691, + "67": 9.31749, + "68": 9.73495, + "69": 9.74866, + "70": 9.69625, + "71": 9.57689, + "72": 9.52422, + "73": 9.45595, + "74": 8.88269, + "75": 9.37584, + "76": 9.01136, + "77": 10.02287, + "78": 9.67963, + "79": 9.33172, + "80": 9.35826, + "81": 9.43394, + "82": 9.65054, + "83": 9.25503, + "84": 9.3714, + "85": 9.5623, + "86": 9.03489, + "87": 9.54614, + "88": 9.69785, + "89": 9.54656, + "90": 9.77624, + "91": 9.2884, + "92": 9.30662, + "93": 9.02647, + "94": 8.78837, + "95": 9.48027, + "96": 9.47974, + "97": 9.25611, + "98": 9.61949, + "99": 8.83824, + "100": 9.35135 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1858.0, + "2": 1854.0, + "3": 1803.0, + "4": 1955.0, + "5": 2000.0, + "6": 2036.0, + "7": 1932.0, + "8": 1791.0, + "9": 1935.0, + "10": 1654.0, + "11": 2080.0, + "12": 1881.0, + "13": 1977.0, + "14": 2080.0, + "15": 1957.0, + "16": 1910.0, + "17": 1974.0, + "18": 1896.0, + "19": 1955.0, + "20": 1816.0, + "21": 1906.0, + "22": 1972.0, + "23": 2062.0, + "24": 1897.0, + "25": 1830.0, + "26": 1788.0, + "27": 1849.0, + "28": 2008.0, + "29": 2128.0, + "30": 1969.0, + "31": 1630.0, + "32": 2057.0, + "33": 2171.0, + "34": 1947.0, + "35": 2097.0, + "36": 1972.0, + "37": 2348.0, + "38": 2186.0, + "39": 2378.0, + "40": 2181.0, + "41": 2326.0, + "42": 2334.0, + "43": 2219.0, + "44": 2234.0, + "45": 2231.0, + "46": 2229.0, + "47": 2449.0, + "48": 2439.0, + "49": 2159.0, + "50": 2290.0, + "51": 2514.0, + "52": 2513.0, + "53": 2894.0, + "54": 2656.0, + "55": 2348.0, + "56": 2506.0, + "57": 2501.0, + "58": 2770.0, + "59": 2681.0, + "60": 2434.0, + "61": 2776.0, + "62": 2596.0, + "63": 2617.0, + "64": 3012.0, + "65": 2657.0, + "66": 2947.0, + "67": 3089.0, + "68": 2818.0, + "69": 2909.0, + "70": 3025.0, + "71": 2924.0, + "72": 2702.0, + "73": 2947.0, + "74": 2306.0, + "75": 2791.0, + "76": 3093.0, + "77": 3107.0, + "78": 3134.0, + "79": 3205.0, + "80": 3123.0, + "81": 3290.0, + "82": 3172.0, + "83": 2719.0, + "84": 3328.0, + "85": 3255.0, + "86": 2546.0, + "87": 3472.0, + "88": 3068.0, + "89": 2953.0, + "90": 3300.0, + "91": 3154.0, + "92": 3061.0, + "93": 2889.0, + "94": 3535.0, + "95": 3078.0, + "96": 3181.0, + "97": 3135.0, + "98": 3569.0, + "99": 3319.0, + "100": 3223.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, + "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, + "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, + "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, + "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, + "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, + "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, + "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, + "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, + "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, + "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, + "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, + "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, + "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, + "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, + "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, + "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, + "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, + "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, + "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, + "100": 921653248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2237722624.0, + "2": 2600334336.0, + "3": 2600334336.0, + "4": 2600334336.0, + "5": 2600334336.0, + "6": 2600334336.0, + "7": 2600334336.0, + "8": 2600334336.0, + "9": 2600334336.0, + "10": 2600334336.0, + "11": 2600334336.0, + "12": 2600334336.0, + "13": 2600334336.0, + "14": 2600334336.0, + "15": 2600334336.0, + "16": 2600334336.0, + "17": 2600334336.0, + "18": 2600334336.0, + "19": 2600334336.0, + "20": 2600334336.0, + "21": 2600334336.0, + "22": 2600334336.0, + "23": 2600334336.0, + "24": 2600334336.0, + "25": 2600334336.0, + "26": 2600334336.0, + "27": 2600334336.0, + "28": 2600334336.0, + "29": 2600334336.0, + "30": 2600334336.0, + "31": 2600334336.0, + "32": 2600334336.0, + "33": 2600334336.0, + "34": 2600334336.0, + "35": 2600334336.0, + "36": 2600334336.0, + "37": 2600334336.0, + "38": 2600334336.0, + "39": 2600334336.0, + "40": 2600334336.0, + "41": 2600334336.0, + "42": 2600334336.0, + "43": 2600334336.0, + "44": 2600334336.0, + "45": 2600334336.0, + "46": 2600334336.0, + "47": 2600334336.0, + "48": 2600334336.0, + "49": 2600334336.0, + "50": 2600334336.0, + "51": 2600334336.0, + "52": 2600334336.0, + "53": 2600334336.0, + "54": 2600334336.0, + "55": 2600334336.0, + "56": 2600334336.0, + "57": 2600334336.0, + "58": 2600334336.0, + "59": 2600334336.0, + "60": 2600334336.0, + "61": 2600334336.0, + "62": 2600334336.0, + "63": 2600334336.0, + "64": 2600334336.0, + "65": 2600334336.0, + "66": 2600334336.0, + "67": 2600334336.0, + "68": 2600334336.0, + "69": 2600334336.0, + "70": 2600334336.0, + "71": 2600334336.0, + "72": 2600334336.0, + "73": 2600334336.0, + "74": 2600334336.0, + "75": 2600334336.0, + "76": 2600334336.0, + "77": 2600334336.0, + "78": 2600334336.0, + "79": 2600334336.0, + "80": 2600334336.0, + "81": 2600334336.0, + "82": 2600334336.0, + "83": 2600334336.0, + "84": 2600334336.0, + "85": 2600334336.0, + "86": 2600334336.0, + "87": 2600334336.0, + "88": 2600334336.0, + "89": 2600334336.0, + "90": 2600334336.0, + "91": 2600334336.0, + "92": 2600334336.0, + "93": 2600334336.0, + "94": 2600334336.0, + "95": 2600334336.0, + "96": 2600334336.0, + "97": 2600334336.0, + "98": 2600334336.0, + "99": 2600334336.0, + "100": 2600334336.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.95491, + "2": 0.12886, + "3": 0.09196, + "4": 0.09036, + "5": 0.0891, + "6": 0.08806, + "7": 0.08916, + "8": 0.08903, + "9": 0.08912, + "10": 0.08738, + "11": 0.08775, + "12": 0.08738, + "13": 0.08675, + "14": 0.08535, + "15": 0.08586, + "16": 0.0851, + "17": 0.08505, + "18": 0.08481, + "19": 0.08648, + "20": 0.08679, + "21": 0.08735, + "22": 0.08776, + "23": 0.0857, + "24": 0.0851, + "25": 0.08801, + "26": 0.08761, + "27": 0.08685, + "28": 0.08721, + "29": 0.08807, + "30": 0.08783, + "31": 0.08825, + "32": 0.08805, + "33": 0.08749, + "34": 0.08564, + "35": 0.085, + "36": 0.08606, + "37": 0.08494, + "38": 0.08477, + "39": 0.08603, + "40": 0.08627, + "41": 0.08694, + "42": 0.08578, + "43": 0.08584, + "44": 0.08577, + "45": 0.08596, + "46": 0.08538, + "47": 0.0862, + "48": 0.08574, + "49": 0.08854, + "50": 0.08527, + "51": 0.09439, + "52": 0.08466, + "53": 0.08545, + "54": 0.08497, + "55": 0.08493, + "56": 0.08787, + "57": 0.08631, + "58": 0.08602, + "59": 0.08587, + "60": 0.0854, + "61": 0.08742, + "62": 0.0911, + "63": 0.09274, + "64": 0.08551, + "65": 0.08568, + "66": 0.0853, + "67": 0.08594, + "68": 0.08625, + "69": 0.08637, + "70": 0.08573, + "71": 0.08555, + "72": 0.0872, + "73": 0.08585, + "74": 0.08614, + "75": 0.08597, + "76": 0.08636, + "77": 0.08583, + "78": 0.08519, + "79": 0.0856, + "80": 0.08653, + "81": 0.08552, + "82": 0.08602, + "83": 0.08556, + "84": 0.08482, + "85": 0.08554, + "86": 0.08706, + "87": 0.08629, + "88": 0.08512, + "89": 0.08574, + "90": 0.08568, + "91": 0.08531, + "92": 0.08556, + "93": 0.08519, + "94": 0.08579, + "95": 0.0868, + "96": 0.08804, + "97": 0.08724, + "98": 0.08666, + "99": 0.08515, + "100": 0.08511 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b0474f2f8ec --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85787, + "2": 10.87336, + "3": 10.86821, + "4": 10.87255, + "5": 10.87398, + "6": 10.89631, + "7": 10.86379, + "8": 10.87834, + "9": 10.87399, + "10": 10.83714, + "11": 10.86988, + "12": 10.85947, + "13": 10.87777, + "14": 10.87924, + "15": 10.81888, + "16": 10.83058, + "17": 10.78684, + "18": 10.80146, + "19": 10.79775, + "20": 10.71155, + "21": 10.6865, + "22": 10.55277, + "23": 10.7014, + "24": 10.58527, + "25": 10.52658, + "26": 10.58299, + "27": 10.59487, + "28": 10.54787, + "29": 10.55928, + "30": 10.32818, + "31": 10.08272, + "32": 10.44699, + "33": 10.42755, + "34": 10.17932, + "35": 10.24095, + "36": 10.18094, + "37": 10.32809, + "38": 10.16727, + "39": 10.37344, + "40": 10.05079, + "41": 10.10728, + "42": 10.17799, + "43": 9.77846, + "44": 9.91207, + "45": 9.77392, + "46": 9.75431, + "47": 10.09497, + "48": 9.79523, + "49": 9.46391, + "50": 9.8673, + "51": 9.80381, + "52": 9.68202, + "53": 10.02345, + "54": 9.91634, + "55": 9.82456, + "56": 9.56974, + "57": 9.42672, + "58": 9.78081, + "59": 9.53243, + "60": 9.44593, + "61": 9.64254, + "62": 9.94293, + "63": 9.31764, + "64": 9.72548, + "65": 8.88739, + "66": 9.65691, + "67": 9.31749, + "68": 9.73495, + "69": 9.74866, + "70": 9.69625, + "71": 9.57689, + "72": 9.52422, + "73": 9.45595, + "74": 8.88269, + "75": 9.37584, + "76": 9.01136, + "77": 10.02287, + "78": 9.67963, + "79": 9.33172, + "80": 9.35826, + "81": 9.43394, + "82": 9.65054, + "83": 9.25503, + "84": 9.3714, + "85": 9.5623, + "86": 9.03489, + "87": 9.54614, + "88": 9.69785, + "89": 9.54656, + "90": 9.77624, + "91": 9.2884, + "92": 9.30662, + "93": 9.02647, + "94": 8.78837, + "95": 9.48027, + "96": 9.47974, + "97": 9.25611, + "98": 9.61949, + "99": 8.83824, + "100": 9.35135 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1858.0, + "2": 1854.0, + "3": 1803.0, + "4": 1955.0, + "5": 2000.0, + "6": 2036.0, + "7": 1932.0, + "8": 1791.0, + "9": 1935.0, + "10": 1654.0, + "11": 2080.0, + "12": 1881.0, + "13": 1977.0, + "14": 2080.0, + "15": 1957.0, + "16": 1910.0, + "17": 1974.0, + "18": 1896.0, + "19": 1955.0, + "20": 1816.0, + "21": 1906.0, + "22": 1972.0, + "23": 2062.0, + "24": 1897.0, + "25": 1830.0, + "26": 1788.0, + "27": 1849.0, + "28": 2008.0, + "29": 2128.0, + "30": 1969.0, + "31": 1630.0, + "32": 2057.0, + "33": 2171.0, + "34": 1947.0, + "35": 2097.0, + "36": 1972.0, + "37": 2348.0, + "38": 2186.0, + "39": 2378.0, + "40": 2181.0, + "41": 2326.0, + "42": 2334.0, + "43": 2219.0, + "44": 2234.0, + "45": 2231.0, + "46": 2229.0, + "47": 2449.0, + "48": 2439.0, + "49": 2159.0, + "50": 2290.0, + "51": 2514.0, + "52": 2513.0, + "53": 2894.0, + "54": 2656.0, + "55": 2348.0, + "56": 2506.0, + "57": 2501.0, + "58": 2770.0, + "59": 2681.0, + "60": 2434.0, + "61": 2776.0, + "62": 2596.0, + "63": 2617.0, + "64": 3012.0, + "65": 2657.0, + "66": 2947.0, + "67": 3089.0, + "68": 2818.0, + "69": 2909.0, + "70": 3025.0, + "71": 2924.0, + "72": 2702.0, + "73": 2947.0, + "74": 2306.0, + "75": 2791.0, + "76": 3093.0, + "77": 3107.0, + "78": 3134.0, + "79": 3205.0, + "80": 3123.0, + "81": 3290.0, + "82": 3172.0, + "83": 2719.0, + "84": 3328.0, + "85": 3255.0, + "86": 2546.0, + "87": 3472.0, + "88": 3068.0, + "89": 2953.0, + "90": 3300.0, + "91": 3154.0, + "92": 3061.0, + "93": 2889.0, + "94": 3535.0, + "95": 3078.0, + "96": 3181.0, + "97": 3135.0, + "98": 3569.0, + "99": 3319.0, + "100": 3223.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, + "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, + "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, + "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, + "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, + "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, + "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, + "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, + "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, + "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, + "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, + "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, + "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, + "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, + "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, + "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, + "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, + "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, + "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, + "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, + "100": 921653248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2237722624.0, + "2": 2600334336.0, + "3": 2600334336.0, + "4": 2600334336.0, + "5": 2600334336.0, + "6": 2600334336.0, + "7": 2600334336.0, + "8": 2600334336.0, + "9": 2600334336.0, + "10": 2600334336.0, + "11": 2600334336.0, + "12": 2600334336.0, + "13": 2600334336.0, + "14": 2600334336.0, + "15": 2600334336.0, + "16": 2600334336.0, + "17": 2600334336.0, + "18": 2600334336.0, + "19": 2600334336.0, + "20": 2600334336.0, + "21": 2600334336.0, + "22": 2600334336.0, + "23": 2600334336.0, + "24": 2600334336.0, + "25": 2600334336.0, + "26": 2600334336.0, + "27": 2600334336.0, + "28": 2600334336.0, + "29": 2600334336.0, + "30": 2600334336.0, + "31": 2600334336.0, + "32": 2600334336.0, + "33": 2600334336.0, + "34": 2600334336.0, + "35": 2600334336.0, + "36": 2600334336.0, + "37": 2600334336.0, + "38": 2600334336.0, + "39": 2600334336.0, + "40": 2600334336.0, + "41": 2600334336.0, + "42": 2600334336.0, + "43": 2600334336.0, + "44": 2600334336.0, + "45": 2600334336.0, + "46": 2600334336.0, + "47": 2600334336.0, + "48": 2600334336.0, + "49": 2600334336.0, + "50": 2600334336.0, + "51": 2600334336.0, + "52": 2600334336.0, + "53": 2600334336.0, + "54": 2600334336.0, + "55": 2600334336.0, + "56": 2600334336.0, + "57": 2600334336.0, + "58": 2600334336.0, + "59": 2600334336.0, + "60": 2600334336.0, + "61": 2600334336.0, + "62": 2600334336.0, + "63": 2600334336.0, + "64": 2600334336.0, + "65": 2600334336.0, + "66": 2600334336.0, + "67": 2600334336.0, + "68": 2600334336.0, + "69": 2600334336.0, + "70": 2600334336.0, + "71": 2600334336.0, + "72": 2600334336.0, + "73": 2600334336.0, + "74": 2600334336.0, + "75": 2600334336.0, + "76": 2600334336.0, + "77": 2600334336.0, + "78": 2600334336.0, + "79": 2600334336.0, + "80": 2600334336.0, + "81": 2600334336.0, + "82": 2600334336.0, + "83": 2600334336.0, + "84": 2600334336.0, + "85": 2600334336.0, + "86": 2600334336.0, + "87": 2600334336.0, + "88": 2600334336.0, + "89": 2600334336.0, + "90": 2600334336.0, + "91": 2600334336.0, + "92": 2600334336.0, + "93": 2600334336.0, + "94": 2600334336.0, + "95": 2600334336.0, + "96": 2600334336.0, + "97": 2600334336.0, + "98": 2600334336.0, + "99": 2600334336.0, + "100": 2600334336.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.39748, + "2": 0.11699, + "3": 0.10324, + "4": 0.10602, + "5": 0.10273, + "6": 0.10169, + "7": 0.10402, + "8": 0.10582, + "9": 0.10893, + "10": 0.10156, + "11": 0.10006, + "12": 0.10034, + "13": 0.10111, + "14": 0.10835, + "15": 0.10198, + "16": 0.10295, + "17": 0.10379, + "18": 0.10096, + "19": 0.10678, + "20": 0.10208, + "21": 0.10213, + "22": 0.10179, + "23": 0.10357, + "24": 0.10282, + "25": 0.09979, + "26": 0.10143, + "27": 0.10197, + "28": 0.10127, + "29": 0.10116, + "30": 0.10243, + "31": 0.10107, + "32": 0.10147, + "33": 0.10181, + "34": 0.1038, + "35": 0.10095, + "36": 0.09889, + "37": 0.09992, + "38": 0.10001, + "39": 0.10006, + "40": 0.10004, + "41": 0.09886, + "42": 0.09836, + "43": 0.09974, + "44": 0.10016, + "45": 0.10004, + "46": 0.09945, + "47": 0.0989, + "48": 0.09882, + "49": 0.09906, + "50": 0.09893, + "51": 0.10108, + "52": 0.10571, + "53": 0.10114, + "54": 0.09935, + "55": 0.09893, + "56": 0.09871, + "57": 0.10568, + "58": 0.09952, + "59": 0.10185, + "60": 0.09937, + "61": 0.09902, + "62": 0.10469, + "63": 0.10029, + "64": 0.09881, + "65": 0.09927, + "66": 0.09932, + "67": 0.10538, + "68": 0.09988, + "69": 0.10144, + "70": 0.09918, + "71": 0.10686, + "72": 0.09922, + "73": 0.09936, + "74": 0.09915, + "75": 0.09862, + "76": 0.1068, + "77": 0.09885, + "78": 0.09998, + "79": 0.1002, + "80": 0.09911, + "81": 0.10038, + "82": 0.09931, + "83": 0.09871, + "84": 0.09987, + "85": 0.09983, + "86": 0.10014, + "87": 0.0994, + "88": 0.09924, + "89": 0.10058, + "90": 0.10033, + "91": 0.10009, + "92": 0.10037, + "93": 0.09877, + "94": 0.09968, + "95": 0.10011, + "96": 0.09929, + "97": 0.09969, + "98": 0.09929, + "99": 0.10037, + "100": 0.10155 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 9cc113af90f..866cb310652 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85902, + "2": 10.87345, + "3": 10.86919, + "4": 10.87273, "5": 10.87389, + "6": 10.89658, + "7": 10.86387, + "8": 10.87869, + "9": 10.87439, "10": 10.83846, + "11": 10.87012, + "12": 10.86011, + "13": 10.87824, + "14": 10.87935, "15": 10.8191, + "16": 10.83109, + "17": 10.78722, + "18": 10.80215, + "19": 10.7983, "20": 10.71224, + "21": 10.68683, + "22": 10.55402, + "23": 10.70111, + "24": 10.58621, "25": 10.52673, + "26": 10.5837, + "27": 10.59499, + "28": 10.54816, + "29": 10.55965, "30": 10.32899, + "31": 10.08331, + "32": 10.44752, + "33": 10.4278, + "34": 10.1796, "35": 10.24121, + "36": 10.18155, + "37": 10.32827, + "38": 10.16792, + "39": 10.37357, "40": 10.05111, + "41": 10.10708, + "42": 10.17823, + "43": 9.77867, + "44": 9.91197, "45": 9.77404, + "46": 9.75415, + "47": 10.09501, + "48": 9.79531, + "49": 9.46422, "50": 9.86729, + "51": 9.80375, + "52": 9.68218, + "53": 10.02348, + "54": 9.91595, "55": 9.82442, + "56": 9.56994, + "57": 9.42628, + "58": 9.78075, + "59": 9.53254, "60": 9.44561, + "61": 9.64249, + "62": 9.94298, + "63": 9.31745, + "64": 9.7256, "65": 8.88735, + "66": 9.65711, + "67": 9.31747, + "68": 9.73506, + "69": 9.74863, "70": 9.69601, + "71": 9.57682, + "72": 9.52425, + "73": 9.4558, + "74": 8.8826, "75": 9.37563, + "76": 9.01106, + "77": 10.02278, + "78": 9.6796, + "79": 9.33171, "80": 9.35836, + "81": 9.43399, + "82": 9.65055, + "83": 9.2551, + "84": 9.37131, "85": 9.56237, + "86": 9.0351, + "87": 9.54617, + "88": 9.69806, + "89": 9.54657, "90": 9.77627, + "91": 9.28858, + "92": 9.30652, + "93": 9.02646, + "94": 8.7883, "95": 9.48041, + "96": 9.47962, + "97": 9.25545, + "98": 9.61947, + "99": 8.83854, "100": 9.35116 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1862.0, + "2": 1874.0, + "3": 1748.0, + "4": 1955.0, "5": 2050.0, + "6": 1997.0, + "7": 1967.0, + "8": 1853.0, + "9": 1965.0, "10": 1652.0, + "11": 2042.0, + "12": 1877.0, + "13": 2076.0, + "14": 1956.0, "15": 1953.0, + "16": 1915.0, + "17": 2045.0, + "18": 1965.0, + "19": 1988.0, "20": 1785.0, + "21": 1941.0, + "22": 1928.0, + "23": 2112.0, + "24": 1802.0, "25": 1933.0, + "26": 1786.0, + "27": 1945.0, + "28": 2037.0, + "29": 2119.0, "30": 2022.0, + "31": 1699.0, + "32": 2130.0, + "33": 2187.0, + "34": 1929.0, "35": 2092.0, + "36": 2109.0, + "37": 2362.0, + "38": 2211.0, + "39": 2383.0, "40": 2203.0, + "41": 2288.0, + "42": 2224.0, + "43": 2150.0, + "44": 2206.0, "45": 2187.0, + "46": 2181.0, + "47": 2260.0, + "48": 2341.0, + "49": 2210.0, "50": 2219.0, + "51": 2508.0, + "52": 2483.0, + "53": 2959.0, + "54": 2554.0, "55": 2408.0, + "56": 2452.0, + "57": 2528.0, + "58": 2594.0, + "59": 2750.0, "60": 2563.0, + "61": 2794.0, + "62": 2495.0, + "63": 2493.0, + "64": 2965.0, "65": 2569.0, + "66": 2877.0, + "67": 2969.0, + "68": 2803.0, + "69": 2944.0, "70": 3001.0, + "71": 2867.0, + "72": 2714.0, + "73": 3017.0, + "74": 2281.0, "75": 2774.0, + "76": 2983.0, + "77": 2955.0, + "78": 3148.0, + "79": 3076.0, "80": 2992.0, + "81": 3255.0, + "82": 3212.0, + "83": 2809.0, + "84": 3266.0, "85": 3188.0, + "86": 2616.0, + "87": 3492.0, + "88": 3130.0, + "89": 3020.0, "90": 3238.0, + "91": 3106.0, + "92": 3183.0, + "93": 2960.0, + "94": 3492.0, "95": 3112.0, + "96": 3256.0, + "97": 3055.0, + "98": 3558.0, + "99": 3196.0, "100": 3109.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, "100": 921653248.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2240868352.0, + "2": 2603480064.0, + "3": 2603480064.0, + "4": 2603480064.0, "5": 2603480064.0, + "6": 2603480064.0, + "7": 2603480064.0, + "8": 2603480064.0, + "9": 2603480064.0, "10": 2603480064.0, + "11": 2603480064.0, + "12": 2603480064.0, + "13": 2603480064.0, + "14": 2603480064.0, "15": 2603480064.0, + "16": 2603480064.0, + "17": 2603480064.0, + "18": 2603480064.0, + "19": 2603480064.0, "20": 2603480064.0, + "21": 2603480064.0, + "22": 2603480064.0, + "23": 2603480064.0, + "24": 2603480064.0, "25": 2603480064.0, + "26": 2603480064.0, + "27": 2603480064.0, + "28": 2603480064.0, + "29": 2603480064.0, "30": 2603480064.0, + "31": 2603480064.0, + "32": 2603480064.0, + "33": 2603480064.0, + "34": 2603480064.0, "35": 2603480064.0, + "36": 2603480064.0, + "37": 2603480064.0, + "38": 2603480064.0, + "39": 2603480064.0, "40": 2603480064.0, + "41": 2603480064.0, + "42": 2603480064.0, + "43": 2603480064.0, + "44": 2603480064.0, "45": 2603480064.0, + "46": 2603480064.0, + "47": 2603480064.0, + "48": 2603480064.0, + "49": 2603480064.0, "50": 2603480064.0, + "51": 2603480064.0, + "52": 2603480064.0, + "53": 2603480064.0, + "54": 2603480064.0, "55": 2603480064.0, + "56": 2603480064.0, + "57": 2603480064.0, + "58": 2603480064.0, + "59": 2603480064.0, "60": 2603480064.0, + "61": 2603480064.0, + "62": 2603480064.0, + "63": 2603480064.0, + "64": 2603480064.0, "65": 2603480064.0, + "66": 2603480064.0, + "67": 2603480064.0, + "68": 2603480064.0, + "69": 2603480064.0, "70": 2603480064.0, + "71": 2603480064.0, + "72": 2603480064.0, + "73": 2603480064.0, + "74": 2603480064.0, "75": 2603480064.0, + "76": 2603480064.0, + "77": 2603480064.0, + "78": 2603480064.0, + "79": 2603480064.0, "80": 2603480064.0, + "81": 2603480064.0, + "82": 2603480064.0, + "83": 2603480064.0, + "84": 2603480064.0, "85": 2603480064.0, + "86": 2603480064.0, + "87": 2603480064.0, + "88": 2603480064.0, + "89": 2603480064.0, "90": 2603480064.0, + "91": 2603480064.0, + "92": 2603480064.0, + "93": 2603480064.0, + "94": 2603480064.0, "95": 2603480064.0, + "96": 2603480064.0, + "97": 2603480064.0, + "98": 2603480064.0, + "99": 2603480064.0, "100": 2603480064.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 7.78165, - "5": 0.09513, - "10": 0.10651, - "15": 0.10345, - "20": 0.10578, - "25": 0.10549, - "30": 0.09676, - "35": 0.09698, - "40": 0.10038, - "45": 0.09627, - "50": 0.09595, - "55": 0.0993, - "60": 0.09556, - "65": 0.09917, - "70": 0.09623, - "75": 0.09539, - "80": 0.09584, - "85": 0.09887, - "90": 0.09565, - "95": 0.09717, - "100": 0.09806 + "1": 10.22635, + "2": 0.13443, + "3": 0.11453, + "4": 0.11544, + "5": 0.11529, + "6": 0.1139, + "7": 0.11696, + "8": 0.11432, + "9": 0.11422, + "10": 0.11467, + "11": 0.1115, + "12": 0.11137, + "13": 0.11192, + "14": 0.1124, + "15": 0.11313, + "16": 0.11436, + "17": 0.11212, + "18": 0.11209, + "19": 0.11518, + "20": 0.11167, + "21": 0.11083, + "22": 0.11186, + "23": 0.11362, + "24": 0.11218, + "25": 0.1144, + "26": 0.11178, + "27": 0.11153, + "28": 0.11303, + "29": 0.11052, + "30": 0.11214, + "31": 0.1141, + "32": 0.1126, + "33": 0.11238, + "34": 0.1134, + "35": 0.11232, + "36": 0.11052, + "37": 0.11225, + "38": 0.1121, + "39": 0.113, + "40": 0.11315, + "41": 0.11169, + "42": 0.11263, + "43": 0.11419, + "44": 0.11234, + "45": 0.11091, + "46": 0.11336, + "47": 0.11328, + "48": 0.11388, + "49": 0.11279, + "50": 0.11198, + "51": 0.13191, + "52": 0.11591, + "53": 0.11273, + "54": 0.11461, + "55": 0.11358, + "56": 0.11259, + "57": 0.11325, + "58": 0.1162, + "59": 0.11491, + "60": 0.11726, + "61": 0.11465, + "62": 0.11311, + "63": 0.11801, + "64": 0.11752, + "65": 0.11546, + "66": 0.11225, + "67": 0.11448, + "68": 0.11548, + "69": 0.11397, + "70": 0.11275, + "71": 0.11441, + "72": 0.11487, + "73": 0.11522, + "74": 0.11426, + "75": 0.11345, + "76": 0.11269, + "77": 0.1157, + "78": 0.11597, + "79": 0.11379, + "80": 0.11587, + "81": 0.11486, + "82": 0.11305, + "83": 0.1127, + "84": 0.11361, + "85": 0.11384, + "86": 0.11703, + "87": 0.11426, + "88": 0.11283, + "89": 0.1146, + "90": 0.11235, + "91": 0.11207, + "92": 0.11217, + "93": 0.11286, + "94": 0.11446, + "95": 0.11504, + "96": 0.11469, + "97": 0.11241, + "98": 0.11333, + "99": 0.11104, + "100": 0.1126 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..444ff2cd262 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85902, + "2": 10.87345, + "3": 10.86919, + "4": 10.87273, + "5": 10.87389, + "6": 10.89658, + "7": 10.86387, + "8": 10.87869, + "9": 10.87439, + "10": 10.83846, + "11": 10.87012, + "12": 10.86011, + "13": 10.87824, + "14": 10.87935, + "15": 10.8191, + "16": 10.83109, + "17": 10.78722, + "18": 10.80215, + "19": 10.7983, + "20": 10.71224, + "21": 10.68683, + "22": 10.55402, + "23": 10.70111, + "24": 10.58621, + "25": 10.52673, + "26": 10.5837, + "27": 10.59499, + "28": 10.54816, + "29": 10.55965, + "30": 10.32899, + "31": 10.08331, + "32": 10.44752, + "33": 10.4278, + "34": 10.1796, + "35": 10.24121, + "36": 10.18155, + "37": 10.32827, + "38": 10.16792, + "39": 10.37357, + "40": 10.05111, + "41": 10.10708, + "42": 10.17823, + "43": 9.77867, + "44": 9.91197, + "45": 9.77404, + "46": 9.75415, + "47": 10.09501, + "48": 9.79531, + "49": 9.46422, + "50": 9.86729, + "51": 9.80375, + "52": 9.68218, + "53": 10.02348, + "54": 9.91595, + "55": 9.82442, + "56": 9.56994, + "57": 9.42628, + "58": 9.78075, + "59": 9.53254, + "60": 9.44561, + "61": 9.64249, + "62": 9.94298, + "63": 9.31745, + "64": 9.7256, + "65": 8.88735, + "66": 9.65711, + "67": 9.31747, + "68": 9.73506, + "69": 9.74863, + "70": 9.69601, + "71": 9.57682, + "72": 9.52425, + "73": 9.4558, + "74": 8.8826, + "75": 9.37563, + "76": 9.01106, + "77": 10.02278, + "78": 9.6796, + "79": 9.33171, + "80": 9.35836, + "81": 9.43399, + "82": 9.65055, + "83": 9.2551, + "84": 9.37131, + "85": 9.56237, + "86": 9.0351, + "87": 9.54617, + "88": 9.69806, + "89": 9.54657, + "90": 9.77627, + "91": 9.28858, + "92": 9.30652, + "93": 9.02646, + "94": 8.7883, + "95": 9.48041, + "96": 9.47962, + "97": 9.25545, + "98": 9.61947, + "99": 8.83854, + "100": 9.35116 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1862.0, + "2": 1874.0, + "3": 1748.0, + "4": 1955.0, + "5": 2050.0, + "6": 1997.0, + "7": 1967.0, + "8": 1853.0, + "9": 1965.0, + "10": 1652.0, + "11": 2042.0, + "12": 1877.0, + "13": 2076.0, + "14": 1956.0, + "15": 1953.0, + "16": 1915.0, + "17": 2045.0, + "18": 1965.0, + "19": 1988.0, + "20": 1785.0, + "21": 1941.0, + "22": 1928.0, + "23": 2112.0, + "24": 1802.0, + "25": 1933.0, + "26": 1786.0, + "27": 1945.0, + "28": 2037.0, + "29": 2119.0, + "30": 2022.0, + "31": 1699.0, + "32": 2130.0, + "33": 2187.0, + "34": 1929.0, + "35": 2092.0, + "36": 2109.0, + "37": 2362.0, + "38": 2211.0, + "39": 2383.0, + "40": 2203.0, + "41": 2288.0, + "42": 2224.0, + "43": 2150.0, + "44": 2206.0, + "45": 2187.0, + "46": 2181.0, + "47": 2260.0, + "48": 2341.0, + "49": 2210.0, + "50": 2219.0, + "51": 2508.0, + "52": 2483.0, + "53": 2959.0, + "54": 2554.0, + "55": 2408.0, + "56": 2452.0, + "57": 2528.0, + "58": 2594.0, + "59": 2750.0, + "60": 2563.0, + "61": 2794.0, + "62": 2495.0, + "63": 2493.0, + "64": 2965.0, + "65": 2569.0, + "66": 2877.0, + "67": 2969.0, + "68": 2803.0, + "69": 2944.0, + "70": 3001.0, + "71": 2867.0, + "72": 2714.0, + "73": 3017.0, + "74": 2281.0, + "75": 2774.0, + "76": 2983.0, + "77": 2955.0, + "78": 3148.0, + "79": 3076.0, + "80": 2992.0, + "81": 3255.0, + "82": 3212.0, + "83": 2809.0, + "84": 3266.0, + "85": 3188.0, + "86": 2616.0, + "87": 3492.0, + "88": 3130.0, + "89": 3020.0, + "90": 3238.0, + "91": 3106.0, + "92": 3183.0, + "93": 2960.0, + "94": 3492.0, + "95": 3112.0, + "96": 3256.0, + "97": 3055.0, + "98": 3558.0, + "99": 3196.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, + "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, + "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, + "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, + "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, + "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, + "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, + "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, + "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, + "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, + "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, + "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, + "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, + "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, + "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, + "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, + "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, + "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, + "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, + "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, + "100": 921653248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2240868352.0, + "2": 2603480064.0, + "3": 2603480064.0, + "4": 2603480064.0, + "5": 2603480064.0, + "6": 2603480064.0, + "7": 2603480064.0, + "8": 2603480064.0, + "9": 2603480064.0, + "10": 2603480064.0, + "11": 2603480064.0, + "12": 2603480064.0, + "13": 2603480064.0, + "14": 2603480064.0, + "15": 2603480064.0, + "16": 2603480064.0, + "17": 2603480064.0, + "18": 2603480064.0, + "19": 2603480064.0, + "20": 2603480064.0, + "21": 2603480064.0, + "22": 2603480064.0, + "23": 2603480064.0, + "24": 2603480064.0, + "25": 2603480064.0, + "26": 2603480064.0, + "27": 2603480064.0, + "28": 2603480064.0, + "29": 2603480064.0, + "30": 2603480064.0, + "31": 2603480064.0, + "32": 2603480064.0, + "33": 2603480064.0, + "34": 2603480064.0, + "35": 2603480064.0, + "36": 2603480064.0, + "37": 2603480064.0, + "38": 2603480064.0, + "39": 2603480064.0, + "40": 2603480064.0, + "41": 2603480064.0, + "42": 2603480064.0, + "43": 2603480064.0, + "44": 2603480064.0, + "45": 2603480064.0, + "46": 2603480064.0, + "47": 2603480064.0, + "48": 2603480064.0, + "49": 2603480064.0, + "50": 2603480064.0, + "51": 2603480064.0, + "52": 2603480064.0, + "53": 2603480064.0, + "54": 2603480064.0, + "55": 2603480064.0, + "56": 2603480064.0, + "57": 2603480064.0, + "58": 2603480064.0, + "59": 2603480064.0, + "60": 2603480064.0, + "61": 2603480064.0, + "62": 2603480064.0, + "63": 2603480064.0, + "64": 2603480064.0, + "65": 2603480064.0, + "66": 2603480064.0, + "67": 2603480064.0, + "68": 2603480064.0, + "69": 2603480064.0, + "70": 2603480064.0, + "71": 2603480064.0, + "72": 2603480064.0, + "73": 2603480064.0, + "74": 2603480064.0, + "75": 2603480064.0, + "76": 2603480064.0, + "77": 2603480064.0, + "78": 2603480064.0, + "79": 2603480064.0, + "80": 2603480064.0, + "81": 2603480064.0, + "82": 2603480064.0, + "83": 2603480064.0, + "84": 2603480064.0, + "85": 2603480064.0, + "86": 2603480064.0, + "87": 2603480064.0, + "88": 2603480064.0, + "89": 2603480064.0, + "90": 2603480064.0, + "91": 2603480064.0, + "92": 2603480064.0, + "93": 2603480064.0, + "94": 2603480064.0, + "95": 2603480064.0, + "96": 2603480064.0, + "97": 2603480064.0, + "98": 2603480064.0, + "99": 2603480064.0, + "100": 2603480064.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.53967, + "2": 0.14008, + "3": 0.1043, + "4": 0.16652, + "5": 0.10343, + "6": 0.10275, + "7": 0.10316, + "8": 0.10367, + "9": 0.10405, + "10": 0.10359, + "11": 0.09939, + "12": 0.09913, + "13": 0.09947, + "14": 0.09988, + "15": 0.10308, + "16": 0.0992, + "17": 0.10106, + "18": 0.0992, + "19": 0.09921, + "20": 0.1056, + "21": 0.10004, + "22": 0.10135, + "23": 0.1021, + "24": 0.10492, + "25": 0.09982, + "26": 0.10268, + "27": 0.10169, + "28": 0.1028, + "29": 0.10458, + "30": 0.10225, + "31": 0.09971, + "32": 0.09988, + "33": 0.10453, + "34": 0.10059, + "35": 0.10094, + "36": 0.1008, + "37": 0.10217, + "38": 0.10611, + "39": 0.10301, + "40": 0.10034, + "41": 0.09987, + "42": 0.09958, + "43": 0.10624, + "44": 0.09987, + "45": 0.09978, + "46": 0.09969, + "47": 0.10044, + "48": 0.10951, + "49": 0.10288, + "50": 0.10274, + "51": 0.10908, + "52": 0.10956, + "53": 0.10353, + "54": 0.10291, + "55": 0.09986, + "56": 0.10048, + "57": 0.10053, + "58": 0.10032, + "59": 0.09989, + "60": 0.09972, + "61": 0.09968, + "62": 0.09979, + "63": 0.10038, + "64": 0.09948, + "65": 0.10028, + "66": 0.0996, + "67": 0.10025, + "68": 0.09985, + "69": 0.1, + "70": 0.10176, + "71": 0.10036, + "72": 0.09961, + "73": 0.09996, + "74": 0.10022, + "75": 0.10121, + "76": 0.1012, + "77": 0.10049, + "78": 0.10212, + "79": 0.10036, + "80": 0.10284, + "81": 0.10151, + "82": 0.10433, + "83": 0.10034, + "84": 0.09991, + "85": 0.10037, + "86": 0.10005, + "87": 0.10117, + "88": 0.10004, + "89": 0.10192, + "90": 0.09956, + "91": 0.09987, + "92": 0.0995, + "93": 0.10044, + "94": 0.10249, + "95": 0.10315, + "96": 0.10488, + "97": 0.10312, + "98": 0.10392, + "99": 0.10217, + "100": 0.10295 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..8655a61eb9b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85902, + "2": 10.87345, + "3": 10.86919, + "4": 10.87273, + "5": 10.87389, + "6": 10.89658, + "7": 10.86387, + "8": 10.87869, + "9": 10.87439, + "10": 10.83846, + "11": 10.87012, + "12": 10.86011, + "13": 10.87824, + "14": 10.87935, + "15": 10.8191, + "16": 10.83109, + "17": 10.78722, + "18": 10.80215, + "19": 10.7983, + "20": 10.71224, + "21": 10.68683, + "22": 10.55402, + "23": 10.70111, + "24": 10.58621, + "25": 10.52673, + "26": 10.5837, + "27": 10.59499, + "28": 10.54816, + "29": 10.55965, + "30": 10.32899, + "31": 10.08331, + "32": 10.44752, + "33": 10.4278, + "34": 10.1796, + "35": 10.24121, + "36": 10.18155, + "37": 10.32827, + "38": 10.16792, + "39": 10.37357, + "40": 10.05111, + "41": 10.10708, + "42": 10.17823, + "43": 9.77867, + "44": 9.91197, + "45": 9.77404, + "46": 9.75415, + "47": 10.09501, + "48": 9.79531, + "49": 9.46422, + "50": 9.86729, + "51": 9.80375, + "52": 9.68218, + "53": 10.02348, + "54": 9.91595, + "55": 9.82442, + "56": 9.56994, + "57": 9.42628, + "58": 9.78075, + "59": 9.53254, + "60": 9.44561, + "61": 9.64249, + "62": 9.94298, + "63": 9.31745, + "64": 9.7256, + "65": 8.88735, + "66": 9.65711, + "67": 9.31747, + "68": 9.73506, + "69": 9.74863, + "70": 9.69601, + "71": 9.57682, + "72": 9.52425, + "73": 9.4558, + "74": 8.8826, + "75": 9.37563, + "76": 9.01106, + "77": 10.02278, + "78": 9.6796, + "79": 9.33171, + "80": 9.35836, + "81": 9.43399, + "82": 9.65055, + "83": 9.2551, + "84": 9.37131, + "85": 9.56237, + "86": 9.0351, + "87": 9.54617, + "88": 9.69806, + "89": 9.54657, + "90": 9.77627, + "91": 9.28858, + "92": 9.30652, + "93": 9.02646, + "94": 8.7883, + "95": 9.48041, + "96": 9.47962, + "97": 9.25545, + "98": 9.61947, + "99": 8.83854, + "100": 9.35116 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1862.0, + "2": 1874.0, + "3": 1748.0, + "4": 1955.0, + "5": 2050.0, + "6": 1997.0, + "7": 1967.0, + "8": 1853.0, + "9": 1965.0, + "10": 1652.0, + "11": 2042.0, + "12": 1877.0, + "13": 2076.0, + "14": 1956.0, + "15": 1953.0, + "16": 1915.0, + "17": 2045.0, + "18": 1965.0, + "19": 1988.0, + "20": 1785.0, + "21": 1941.0, + "22": 1928.0, + "23": 2112.0, + "24": 1802.0, + "25": 1933.0, + "26": 1786.0, + "27": 1945.0, + "28": 2037.0, + "29": 2119.0, + "30": 2022.0, + "31": 1699.0, + "32": 2130.0, + "33": 2187.0, + "34": 1929.0, + "35": 2092.0, + "36": 2109.0, + "37": 2362.0, + "38": 2211.0, + "39": 2383.0, + "40": 2203.0, + "41": 2288.0, + "42": 2224.0, + "43": 2150.0, + "44": 2206.0, + "45": 2187.0, + "46": 2181.0, + "47": 2260.0, + "48": 2341.0, + "49": 2210.0, + "50": 2219.0, + "51": 2508.0, + "52": 2483.0, + "53": 2959.0, + "54": 2554.0, + "55": 2408.0, + "56": 2452.0, + "57": 2528.0, + "58": 2594.0, + "59": 2750.0, + "60": 2563.0, + "61": 2794.0, + "62": 2495.0, + "63": 2493.0, + "64": 2965.0, + "65": 2569.0, + "66": 2877.0, + "67": 2969.0, + "68": 2803.0, + "69": 2944.0, + "70": 3001.0, + "71": 2867.0, + "72": 2714.0, + "73": 3017.0, + "74": 2281.0, + "75": 2774.0, + "76": 2983.0, + "77": 2955.0, + "78": 3148.0, + "79": 3076.0, + "80": 2992.0, + "81": 3255.0, + "82": 3212.0, + "83": 2809.0, + "84": 3266.0, + "85": 3188.0, + "86": 2616.0, + "87": 3492.0, + "88": 3130.0, + "89": 3020.0, + "90": 3238.0, + "91": 3106.0, + "92": 3183.0, + "93": 2960.0, + "94": 3492.0, + "95": 3112.0, + "96": 3256.0, + "97": 3055.0, + "98": 3558.0, + "99": 3196.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 921653248.0, + "2": 921653248.0, + "3": 921653248.0, + "4": 921653248.0, + "5": 921653248.0, + "6": 921653248.0, + "7": 921653248.0, + "8": 921653248.0, + "9": 921653248.0, + "10": 921653248.0, + "11": 921653248.0, + "12": 921653248.0, + "13": 921653248.0, + "14": 921653248.0, + "15": 921653248.0, + "16": 921653248.0, + "17": 921653248.0, + "18": 921653248.0, + "19": 921653248.0, + "20": 921653248.0, + "21": 921653248.0, + "22": 921653248.0, + "23": 921653248.0, + "24": 921653248.0, + "25": 921653248.0, + "26": 921653248.0, + "27": 921653248.0, + "28": 921653248.0, + "29": 921653248.0, + "30": 921653248.0, + "31": 921653248.0, + "32": 921653248.0, + "33": 921653248.0, + "34": 921653248.0, + "35": 921653248.0, + "36": 921653248.0, + "37": 921653248.0, + "38": 921653248.0, + "39": 921653248.0, + "40": 921653248.0, + "41": 921653248.0, + "42": 921653248.0, + "43": 921653248.0, + "44": 921653248.0, + "45": 921653248.0, + "46": 921653248.0, + "47": 921653248.0, + "48": 921653248.0, + "49": 921653248.0, + "50": 921653248.0, + "51": 921653248.0, + "52": 921653248.0, + "53": 921653248.0, + "54": 921653248.0, + "55": 921653248.0, + "56": 921653248.0, + "57": 921653248.0, + "58": 921653248.0, + "59": 921653248.0, + "60": 921653248.0, + "61": 921653248.0, + "62": 921653248.0, + "63": 921653248.0, + "64": 921653248.0, + "65": 921653248.0, + "66": 921653248.0, + "67": 921653248.0, + "68": 921653248.0, + "69": 921653248.0, + "70": 921653248.0, + "71": 921653248.0, + "72": 921653248.0, + "73": 921653248.0, + "74": 921653248.0, + "75": 921653248.0, + "76": 921653248.0, + "77": 921653248.0, + "78": 921653248.0, + "79": 921653248.0, + "80": 921653248.0, + "81": 921653248.0, + "82": 921653248.0, + "83": 921653248.0, + "84": 921653248.0, + "85": 921653248.0, + "86": 921653248.0, + "87": 921653248.0, + "88": 921653248.0, + "89": 921653248.0, + "90": 921653248.0, + "91": 921653248.0, + "92": 921653248.0, + "93": 921653248.0, + "94": 921653248.0, + "95": 921653248.0, + "96": 921653248.0, + "97": 921653248.0, + "98": 921653248.0, + "99": 921653248.0, + "100": 921653248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2240868352.0, + "2": 2603480064.0, + "3": 2603480064.0, + "4": 2603480064.0, + "5": 2603480064.0, + "6": 2603480064.0, + "7": 2603480064.0, + "8": 2603480064.0, + "9": 2603480064.0, + "10": 2603480064.0, + "11": 2603480064.0, + "12": 2603480064.0, + "13": 2603480064.0, + "14": 2603480064.0, + "15": 2603480064.0, + "16": 2603480064.0, + "17": 2603480064.0, + "18": 2603480064.0, + "19": 2603480064.0, + "20": 2603480064.0, + "21": 2603480064.0, + "22": 2603480064.0, + "23": 2603480064.0, + "24": 2603480064.0, + "25": 2603480064.0, + "26": 2603480064.0, + "27": 2603480064.0, + "28": 2603480064.0, + "29": 2603480064.0, + "30": 2603480064.0, + "31": 2603480064.0, + "32": 2603480064.0, + "33": 2603480064.0, + "34": 2603480064.0, + "35": 2603480064.0, + "36": 2603480064.0, + "37": 2603480064.0, + "38": 2603480064.0, + "39": 2603480064.0, + "40": 2603480064.0, + "41": 2603480064.0, + "42": 2603480064.0, + "43": 2603480064.0, + "44": 2603480064.0, + "45": 2603480064.0, + "46": 2603480064.0, + "47": 2603480064.0, + "48": 2603480064.0, + "49": 2603480064.0, + "50": 2603480064.0, + "51": 2603480064.0, + "52": 2603480064.0, + "53": 2603480064.0, + "54": 2603480064.0, + "55": 2603480064.0, + "56": 2603480064.0, + "57": 2603480064.0, + "58": 2603480064.0, + "59": 2603480064.0, + "60": 2603480064.0, + "61": 2603480064.0, + "62": 2603480064.0, + "63": 2603480064.0, + "64": 2603480064.0, + "65": 2603480064.0, + "66": 2603480064.0, + "67": 2603480064.0, + "68": 2603480064.0, + "69": 2603480064.0, + "70": 2603480064.0, + "71": 2603480064.0, + "72": 2603480064.0, + "73": 2603480064.0, + "74": 2603480064.0, + "75": 2603480064.0, + "76": 2603480064.0, + "77": 2603480064.0, + "78": 2603480064.0, + "79": 2603480064.0, + "80": 2603480064.0, + "81": 2603480064.0, + "82": 2603480064.0, + "83": 2603480064.0, + "84": 2603480064.0, + "85": 2603480064.0, + "86": 2603480064.0, + "87": 2603480064.0, + "88": 2603480064.0, + "89": 2603480064.0, + "90": 2603480064.0, + "91": 2603480064.0, + "92": 2603480064.0, + "93": 2603480064.0, + "94": 2603480064.0, + "95": 2603480064.0, + "96": 2603480064.0, + "97": 2603480064.0, + "98": 2603480064.0, + "99": 2603480064.0, + "100": 2603480064.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.33977, + "2": 0.14663, + "3": 0.12463, + "4": 0.11901, + "5": 0.118, + "6": 0.11842, + "7": 0.11849, + "8": 0.11649, + "9": 0.11703, + "10": 0.11655, + "11": 0.11646, + "12": 0.11802, + "13": 0.11742, + "14": 0.1167, + "15": 0.11429, + "16": 0.11654, + "17": 0.11533, + "18": 0.11853, + "19": 0.1171, + "20": 0.11735, + "21": 0.11515, + "22": 0.11632, + "23": 0.11865, + "24": 0.11706, + "25": 0.11644, + "26": 0.11684, + "27": 0.11688, + "28": 0.11839, + "29": 0.11706, + "30": 0.11761, + "31": 0.11696, + "32": 0.11567, + "33": 0.1149, + "34": 0.11395, + "35": 0.11367, + "36": 0.11567, + "37": 0.11646, + "38": 0.11392, + "39": 0.11516, + "40": 0.11529, + "41": 0.11559, + "42": 0.11519, + "43": 0.11808, + "44": 0.11599, + "45": 0.11605, + "46": 0.11502, + "47": 0.11651, + "48": 0.11713, + "49": 0.11667, + "50": 0.11432, + "51": 0.12857, + "52": 0.12187, + "53": 0.11684, + "54": 0.11222, + "55": 0.11538, + "56": 0.11241, + "57": 0.11229, + "58": 0.11087, + "59": 0.11183, + "60": 0.11124, + "61": 0.11009, + "62": 0.11052, + "63": 0.11585, + "64": 0.11262, + "65": 0.11148, + "66": 0.11248, + "67": 0.11274, + "68": 0.11394, + "69": 0.11397, + "70": 0.11233, + "71": 0.11354, + "72": 0.11589, + "73": 0.11373, + "74": 0.11483, + "75": 0.11512, + "76": 0.11378, + "77": 0.11431, + "78": 0.11374, + "79": 0.11521, + "80": 0.11486, + "81": 0.11364, + "82": 0.11419, + "83": 0.11439, + "84": 0.11589, + "85": 0.11422, + "86": 0.11458, + "87": 0.11184, + "88": 0.11418, + "89": 0.11264, + "90": 0.11169, + "91": 0.11452, + "92": 0.11215, + "93": 0.11431, + "94": 0.11145, + "95": 0.11129, + "96": 0.11113, + "97": 0.11365, + "98": 0.11127, + "99": 0.11136, + "100": 0.11229 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..11db16901fd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84474, + "2": 10.84714, + "3": 10.84155, + "4": 10.82474, + "5": 10.86418, + "6": 10.87687, + "7": 10.86881, + "8": 10.85782, + "9": 10.86927, + "10": 10.82155, + "11": 10.90254, + "12": 10.87935, + "13": 10.88455, + "14": 10.89946, + "15": 10.81195, + "16": 10.81872, + "17": 10.8008, + "18": 10.82581, + "19": 10.82045, + "20": 10.71872, + "21": 10.67848, + "22": 10.5397, + "23": 10.71982, + "24": 10.57533, + "25": 10.53036, + "26": 10.60075, + "27": 10.61432, + "28": 10.57308, + "29": 10.58758, + "30": 10.3358, + "31": 10.06363, + "32": 10.46475, + "33": 10.43552, + "34": 10.17388, + "35": 10.24081, + "36": 10.19268, + "37": 10.3222, + "38": 10.15004, + "39": 10.37797, + "40": 10.05008, + "41": 10.11342, + "42": 10.17323, + "43": 9.76225, + "44": 9.89234, + "45": 9.76762, + "46": 9.75986, + "47": 10.09534, + "48": 9.78722, + "49": 9.45529, + "50": 9.85505, + "51": 9.79116, + "52": 9.68704, + "53": 10.02199, + "54": 9.90262, + "55": 9.82465, + "56": 9.56989, + "57": 9.40892, + "58": 9.77732, + "59": 9.52733, + "60": 9.44306, + "61": 9.64215, + "62": 9.94224, + "63": 9.31031, + "64": 9.72428, + "65": 8.89104, + "66": 9.65351, + "67": 9.31775, + "68": 9.73884, + "69": 9.7436, + "70": 9.67902, + "71": 9.56185, + "72": 9.53074, + "73": 9.44621, + "74": 8.88449, + "75": 9.36836, + "76": 9.02423, + "77": 10.0162, + "78": 9.68193, + "79": 9.327, + "80": 9.35799, + "81": 9.43376, + "82": 9.64749, + "83": 9.25646, + "84": 9.3666, + "85": 9.56032, + "86": 9.0356, + "87": 9.54626, + "88": 9.70003, + "89": 9.54986, + "90": 9.77055, + "91": 9.28744, + "92": 9.31156, + "93": 9.03212, + "94": 8.78135, + "95": 9.48101, + "96": 9.47679, + "97": 9.24913, + "98": 9.61711, + "99": 8.83684, + "100": 9.34997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1776.0, + "2": 1837.0, + "3": 1749.0, + "4": 1902.0, + "5": 2128.0, + "6": 2161.0, + "7": 1990.0, + "8": 1860.0, + "9": 1953.0, + "10": 1615.0, + "11": 2052.0, + "12": 1809.0, + "13": 2136.0, + "14": 1966.0, + "15": 2021.0, + "16": 1892.0, + "17": 1945.0, + "18": 1826.0, + "19": 1858.0, + "20": 1775.0, + "21": 1971.0, + "22": 1818.0, + "23": 2137.0, + "24": 1842.0, + "25": 1916.0, + "26": 1946.0, + "27": 1940.0, + "28": 2046.0, + "29": 2000.0, + "30": 2029.0, + "31": 1701.0, + "32": 2056.0, + "33": 2208.0, + "34": 2024.0, + "35": 2107.0, + "36": 1985.0, + "37": 2243.0, + "38": 2228.0, + "39": 2433.0, + "40": 2174.0, + "41": 2295.0, + "42": 2262.0, + "43": 2097.0, + "44": 2291.0, + "45": 2110.0, + "46": 2293.0, + "47": 2553.0, + "48": 2368.0, + "49": 2280.0, + "50": 2363.0, + "51": 2596.0, + "52": 2582.0, + "53": 2816.0, + "54": 2729.0, + "55": 2460.0, + "56": 2735.0, + "57": 2451.0, + "58": 2746.0, + "59": 2848.0, + "60": 2462.0, + "61": 2890.0, + "62": 2565.0, + "63": 2520.0, + "64": 2932.0, + "65": 2724.0, + "66": 3014.0, + "67": 2958.0, + "68": 2847.0, + "69": 2937.0, + "70": 2952.0, + "71": 2954.0, + "72": 2617.0, + "73": 3068.0, + "74": 2239.0, + "75": 2823.0, + "76": 3073.0, + "77": 3109.0, + "78": 3263.0, + "79": 3254.0, + "80": 3222.0, + "81": 3475.0, + "82": 3277.0, + "83": 2732.0, + "84": 3393.0, + "85": 3314.0, + "86": 2674.0, + "87": 3433.0, + "88": 3250.0, + "89": 3089.0, + "90": 3087.0, + "91": 3070.0, + "92": 3358.0, + "93": 2823.0, + "94": 3442.0, + "95": 3146.0, + "96": 3256.0, + "97": 3086.0, + "98": 3563.0, + "99": 3247.0, + "100": 3331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 888098304.0, + "2": 888098304.0, + "3": 888098304.0, + "4": 888098304.0, + "5": 888098304.0, + "6": 888098304.0, + "7": 888098304.0, + "8": 888098304.0, + "9": 888098304.0, + "10": 888098304.0, + "11": 888098304.0, + "12": 888098304.0, + "13": 888098304.0, + "14": 888098304.0, + "15": 888098304.0, + "16": 888098304.0, + "17": 888098304.0, + "18": 888098304.0, + "19": 888098304.0, + "20": 888098304.0, + "21": 888098304.0, + "22": 888098304.0, + "23": 888098304.0, + "24": 888098304.0, + "25": 888098304.0, + "26": 888098304.0, + "27": 888098304.0, + "28": 888098304.0, + "29": 888098304.0, + "30": 888098304.0, + "31": 888098304.0, + "32": 888098304.0, + "33": 888098304.0, + "34": 888098304.0, + "35": 888098304.0, + "36": 888098304.0, + "37": 888098304.0, + "38": 888098304.0, + "39": 888098304.0, + "40": 888098304.0, + "41": 888098304.0, + "42": 888098304.0, + "43": 888098304.0, + "44": 888098304.0, + "45": 888098304.0, + "46": 888098304.0, + "47": 888098304.0, + "48": 888098304.0, + "49": 888098304.0, + "50": 888098304.0, + "51": 888098304.0, + "52": 888098304.0, + "53": 888098304.0, + "54": 888098304.0, + "55": 888098304.0, + "56": 888098304.0, + "57": 888098304.0, + "58": 888098304.0, + "59": 888098304.0, + "60": 888098304.0, + "61": 888098304.0, + "62": 888098304.0, + "63": 888098304.0, + "64": 888098304.0, + "65": 888098304.0, + "66": 888098304.0, + "67": 888098304.0, + "68": 888098304.0, + "69": 888098304.0, + "70": 888098304.0, + "71": 888098304.0, + "72": 888098304.0, + "73": 888098304.0, + "74": 888098304.0, + "75": 888098304.0, + "76": 888098304.0, + "77": 888098304.0, + "78": 888098304.0, + "79": 888098304.0, + "80": 888098304.0, + "81": 888098304.0, + "82": 888098304.0, + "83": 888098304.0, + "84": 888098304.0, + "85": 888098304.0, + "86": 888098304.0, + "87": 888098304.0, + "88": 888098304.0, + "89": 888098304.0, + "90": 888098304.0, + "91": 888098304.0, + "92": 888098304.0, + "93": 888098304.0, + "94": 888098304.0, + "95": 888098304.0, + "96": 888098304.0, + "97": 888098304.0, + "98": 888098304.0, + "99": 888098304.0, + "100": 888098304.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3216302592.0, + "2": 3575768576.0, + "3": 3575768576.0, + "4": 3575768576.0, + "5": 3575768576.0, + "6": 3575768576.0, + "7": 3575768576.0, + "8": 3575768576.0, + "9": 3575768576.0, + "10": 3575768576.0, + "11": 3575768576.0, + "12": 3575768576.0, + "13": 3575768576.0, + "14": 3575768576.0, + "15": 3575768576.0, + "16": 3575768576.0, + "17": 3575768576.0, + "18": 3575768576.0, + "19": 3575768576.0, + "20": 3575768576.0, + "21": 3575768576.0, + "22": 3575768576.0, + "23": 3575768576.0, + "24": 3575768576.0, + "25": 3575768576.0, + "26": 3575768576.0, + "27": 3575768576.0, + "28": 3575768576.0, + "29": 3575768576.0, + "30": 3575768576.0, + "31": 3575768576.0, + "32": 3575768576.0, + "33": 3575768576.0, + "34": 3575768576.0, + "35": 3575768576.0, + "36": 3575768576.0, + "37": 3575768576.0, + "38": 3575768576.0, + "39": 3575768576.0, + "40": 3575768576.0, + "41": 3575768576.0, + "42": 3575768576.0, + "43": 3575768576.0, + "44": 3575768576.0, + "45": 3575768576.0, + "46": 3575768576.0, + "47": 3575768576.0, + "48": 3575768576.0, + "49": 3575768576.0, + "50": 3575768576.0, + "51": 3575768576.0, + "52": 3575768576.0, + "53": 3575768576.0, + "54": 3575768576.0, + "55": 3575768576.0, + "56": 3575768576.0, + "57": 3575768576.0, + "58": 3575768576.0, + "59": 3575768576.0, + "60": 3575768576.0, + "61": 3575768576.0, + "62": 3575768576.0, + "63": 3575768576.0, + "64": 3575768576.0, + "65": 3575768576.0, + "66": 3575768576.0, + "67": 3575768576.0, + "68": 3575768576.0, + "69": 3575768576.0, + "70": 3575768576.0, + "71": 3575768576.0, + "72": 3575768576.0, + "73": 3575768576.0, + "74": 3575768576.0, + "75": 3575768576.0, + "76": 3575768576.0, + "77": 3575768576.0, + "78": 3575768576.0, + "79": 3575768576.0, + "80": 3575768576.0, + "81": 3575768576.0, + "82": 3575768576.0, + "83": 3575768576.0, + "84": 3575768576.0, + "85": 3575768576.0, + "86": 3575768576.0, + "87": 3575768576.0, + "88": 3575768576.0, + "89": 3575768576.0, + "90": 3575768576.0, + "91": 3575768576.0, + "92": 3575768576.0, + "93": 3575768576.0, + "94": 3575768576.0, + "95": 3575768576.0, + "96": 3575768576.0, + "97": 3575768576.0, + "98": 3575768576.0, + "99": 3575768576.0, + "100": 3575768576.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.22961, + "2": 0.22748, + "3": 0.18391, + "4": 0.18331, + "5": 0.1874, + "6": 0.18206, + "7": 0.18807, + "8": 0.18736, + "9": 0.17626, + "10": 0.18332, + "11": 0.18368, + "12": 0.42125, + "13": 0.18444, + "14": 0.18305, + "15": 0.1848, + "16": 0.18368, + "17": 0.18426, + "18": 0.18316, + "19": 0.18444, + "20": 0.18426, + "21": 0.18455, + "22": 0.18314, + "23": 0.18337, + "24": 0.18472, + "25": 0.18337, + "26": 0.18358, + "27": 0.18264, + "28": 0.18257, + "29": 0.18324, + "30": 0.18335, + "31": 0.18284, + "32": 0.18259, + "33": 0.18301, + "34": 0.18387, + "35": 0.1854, + "36": 0.18356, + "37": 0.18347, + "38": 0.18279, + "39": 0.18388, + "40": 0.18293, + "41": 0.1825, + "42": 0.17397, + "43": 0.17567, + "44": 0.17489, + "45": 0.17541, + "46": 0.17602, + "47": 0.38172, + "48": 0.1751, + "49": 0.1743, + "50": 0.17335, + "51": 0.17566, + "52": 0.1679, + "53": 0.16794, + "54": 0.16866, + "55": 0.16905, + "56": 0.16842, + "57": 0.16848, + "58": 0.16761, + "59": 0.16753, + "60": 0.16801, + "61": 0.16865, + "62": 0.16798, + "63": 0.16843, + "64": 0.16707, + "65": 0.16694, + "66": 0.16951, + "67": 0.16784, + "68": 0.16521, + "69": 0.16496, + "70": 0.16411, + "71": 0.16368, + "72": 0.16388, + "73": 0.16443, + "74": 0.16404, + "75": 0.16491, + "76": 0.16453, + "77": 0.16357, + "78": 0.1639, + "79": 0.16482, + "80": 0.1642, + "81": 0.17333, + "82": 0.17353, + "83": 0.17251, + "84": 0.17307, + "85": 0.17382, + "86": 0.17698, + "87": 0.18538, + "88": 0.18078, + "89": 0.17207, + "90": 0.17225, + "91": 0.17489, + "92": 0.17401, + "93": 0.17299, + "94": 0.17352, + "95": 0.17399, + "96": 0.1736, + "97": 0.17413, + "98": 0.17369, + "99": 0.17278, + "100": 0.17242 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..9af18296737 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84474, + "2": 10.84714, + "3": 10.84155, + "4": 10.82474, + "5": 10.86418, + "6": 10.87687, + "7": 10.86881, + "8": 10.85782, + "9": 10.86927, + "10": 10.82155, + "11": 10.90254, + "12": 10.87935, + "13": 10.88455, + "14": 10.89946, + "15": 10.81195, + "16": 10.81872, + "17": 10.8008, + "18": 10.82581, + "19": 10.82045, + "20": 10.71872, + "21": 10.67848, + "22": 10.5397, + "23": 10.71982, + "24": 10.57533, + "25": 10.53036, + "26": 10.60075, + "27": 10.61432, + "28": 10.57308, + "29": 10.58758, + "30": 10.3358, + "31": 10.06363, + "32": 10.46475, + "33": 10.43552, + "34": 10.17388, + "35": 10.24081, + "36": 10.19268, + "37": 10.3222, + "38": 10.15004, + "39": 10.37797, + "40": 10.05008, + "41": 10.11342, + "42": 10.17323, + "43": 9.76225, + "44": 9.89234, + "45": 9.76762, + "46": 9.75986, + "47": 10.09534, + "48": 9.78722, + "49": 9.45529, + "50": 9.85505, + "51": 9.79116, + "52": 9.68704, + "53": 10.02199, + "54": 9.90262, + "55": 9.82465, + "56": 9.56989, + "57": 9.40892, + "58": 9.77732, + "59": 9.52733, + "60": 9.44306, + "61": 9.64215, + "62": 9.94224, + "63": 9.31031, + "64": 9.72428, + "65": 8.89104, + "66": 9.65351, + "67": 9.31775, + "68": 9.73884, + "69": 9.7436, + "70": 9.67902, + "71": 9.56185, + "72": 9.53074, + "73": 9.44621, + "74": 8.88449, + "75": 9.36836, + "76": 9.02423, + "77": 10.0162, + "78": 9.68193, + "79": 9.327, + "80": 9.35799, + "81": 9.43376, + "82": 9.64749, + "83": 9.25646, + "84": 9.3666, + "85": 9.56032, + "86": 9.0356, + "87": 9.54626, + "88": 9.70003, + "89": 9.54986, + "90": 9.77055, + "91": 9.28744, + "92": 9.31156, + "93": 9.03212, + "94": 8.78135, + "95": 9.48101, + "96": 9.47679, + "97": 9.24913, + "98": 9.61711, + "99": 8.83684, + "100": 9.34997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1776.0, + "2": 1837.0, + "3": 1749.0, + "4": 1902.0, + "5": 2128.0, + "6": 2161.0, + "7": 1990.0, + "8": 1860.0, + "9": 1953.0, + "10": 1615.0, + "11": 2052.0, + "12": 1809.0, + "13": 2136.0, + "14": 1966.0, + "15": 2021.0, + "16": 1892.0, + "17": 1945.0, + "18": 1826.0, + "19": 1858.0, + "20": 1775.0, + "21": 1971.0, + "22": 1818.0, + "23": 2137.0, + "24": 1842.0, + "25": 1916.0, + "26": 1946.0, + "27": 1940.0, + "28": 2046.0, + "29": 2000.0, + "30": 2029.0, + "31": 1701.0, + "32": 2056.0, + "33": 2208.0, + "34": 2024.0, + "35": 2107.0, + "36": 1985.0, + "37": 2243.0, + "38": 2228.0, + "39": 2433.0, + "40": 2174.0, + "41": 2295.0, + "42": 2262.0, + "43": 2097.0, + "44": 2291.0, + "45": 2110.0, + "46": 2293.0, + "47": 2553.0, + "48": 2368.0, + "49": 2280.0, + "50": 2363.0, + "51": 2596.0, + "52": 2582.0, + "53": 2816.0, + "54": 2729.0, + "55": 2460.0, + "56": 2735.0, + "57": 2451.0, + "58": 2746.0, + "59": 2848.0, + "60": 2462.0, + "61": 2890.0, + "62": 2565.0, + "63": 2520.0, + "64": 2932.0, + "65": 2724.0, + "66": 3014.0, + "67": 2958.0, + "68": 2847.0, + "69": 2937.0, + "70": 2952.0, + "71": 2954.0, + "72": 2617.0, + "73": 3068.0, + "74": 2239.0, + "75": 2823.0, + "76": 3073.0, + "77": 3109.0, + "78": 3263.0, + "79": 3254.0, + "80": 3222.0, + "81": 3475.0, + "82": 3277.0, + "83": 2732.0, + "84": 3393.0, + "85": 3314.0, + "86": 2674.0, + "87": 3433.0, + "88": 3250.0, + "89": 3089.0, + "90": 3087.0, + "91": 3070.0, + "92": 3358.0, + "93": 2823.0, + "94": 3442.0, + "95": 3146.0, + "96": 3256.0, + "97": 3086.0, + "98": 3563.0, + "99": 3247.0, + "100": 3331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 888098304.0, + "2": 888098304.0, + "3": 888098304.0, + "4": 888098304.0, + "5": 888098304.0, + "6": 888098304.0, + "7": 888098304.0, + "8": 888098304.0, + "9": 888098304.0, + "10": 888098304.0, + "11": 888098304.0, + "12": 888098304.0, + "13": 888098304.0, + "14": 888098304.0, + "15": 888098304.0, + "16": 888098304.0, + "17": 888098304.0, + "18": 888098304.0, + "19": 888098304.0, + "20": 888098304.0, + "21": 888098304.0, + "22": 888098304.0, + "23": 888098304.0, + "24": 888098304.0, + "25": 888098304.0, + "26": 888098304.0, + "27": 888098304.0, + "28": 888098304.0, + "29": 888098304.0, + "30": 888098304.0, + "31": 888098304.0, + "32": 888098304.0, + "33": 888098304.0, + "34": 888098304.0, + "35": 888098304.0, + "36": 888098304.0, + "37": 888098304.0, + "38": 888098304.0, + "39": 888098304.0, + "40": 888098304.0, + "41": 888098304.0, + "42": 888098304.0, + "43": 888098304.0, + "44": 888098304.0, + "45": 888098304.0, + "46": 888098304.0, + "47": 888098304.0, + "48": 888098304.0, + "49": 888098304.0, + "50": 888098304.0, + "51": 888098304.0, + "52": 888098304.0, + "53": 888098304.0, + "54": 888098304.0, + "55": 888098304.0, + "56": 888098304.0, + "57": 888098304.0, + "58": 888098304.0, + "59": 888098304.0, + "60": 888098304.0, + "61": 888098304.0, + "62": 888098304.0, + "63": 888098304.0, + "64": 888098304.0, + "65": 888098304.0, + "66": 888098304.0, + "67": 888098304.0, + "68": 888098304.0, + "69": 888098304.0, + "70": 888098304.0, + "71": 888098304.0, + "72": 888098304.0, + "73": 888098304.0, + "74": 888098304.0, + "75": 888098304.0, + "76": 888098304.0, + "77": 888098304.0, + "78": 888098304.0, + "79": 888098304.0, + "80": 888098304.0, + "81": 888098304.0, + "82": 888098304.0, + "83": 888098304.0, + "84": 888098304.0, + "85": 888098304.0, + "86": 888098304.0, + "87": 888098304.0, + "88": 888098304.0, + "89": 888098304.0, + "90": 888098304.0, + "91": 888098304.0, + "92": 888098304.0, + "93": 888098304.0, + "94": 888098304.0, + "95": 888098304.0, + "96": 888098304.0, + "97": 888098304.0, + "98": 888098304.0, + "99": 888098304.0, + "100": 888098304.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3216302592.0, + "2": 3575768576.0, + "3": 3575768576.0, + "4": 3575768576.0, + "5": 3575768576.0, + "6": 3575768576.0, + "7": 3575768576.0, + "8": 3575768576.0, + "9": 3575768576.0, + "10": 3575768576.0, + "11": 3575768576.0, + "12": 3575768576.0, + "13": 3575768576.0, + "14": 3575768576.0, + "15": 3575768576.0, + "16": 3575768576.0, + "17": 3575768576.0, + "18": 3575768576.0, + "19": 3575768576.0, + "20": 3575768576.0, + "21": 3575768576.0, + "22": 3575768576.0, + "23": 3575768576.0, + "24": 3575768576.0, + "25": 3575768576.0, + "26": 3575768576.0, + "27": 3575768576.0, + "28": 3575768576.0, + "29": 3575768576.0, + "30": 3575768576.0, + "31": 3575768576.0, + "32": 3575768576.0, + "33": 3575768576.0, + "34": 3575768576.0, + "35": 3575768576.0, + "36": 3575768576.0, + "37": 3575768576.0, + "38": 3575768576.0, + "39": 3575768576.0, + "40": 3575768576.0, + "41": 3575768576.0, + "42": 3575768576.0, + "43": 3575768576.0, + "44": 3575768576.0, + "45": 3575768576.0, + "46": 3575768576.0, + "47": 3575768576.0, + "48": 3575768576.0, + "49": 3575768576.0, + "50": 3575768576.0, + "51": 3575768576.0, + "52": 3575768576.0, + "53": 3575768576.0, + "54": 3575768576.0, + "55": 3575768576.0, + "56": 3575768576.0, + "57": 3575768576.0, + "58": 3575768576.0, + "59": 3575768576.0, + "60": 3575768576.0, + "61": 3575768576.0, + "62": 3575768576.0, + "63": 3575768576.0, + "64": 3575768576.0, + "65": 3575768576.0, + "66": 3575768576.0, + "67": 3575768576.0, + "68": 3575768576.0, + "69": 3575768576.0, + "70": 3575768576.0, + "71": 3575768576.0, + "72": 3575768576.0, + "73": 3575768576.0, + "74": 3575768576.0, + "75": 3575768576.0, + "76": 3575768576.0, + "77": 3575768576.0, + "78": 3575768576.0, + "79": 3575768576.0, + "80": 3575768576.0, + "81": 3575768576.0, + "82": 3575768576.0, + "83": 3575768576.0, + "84": 3575768576.0, + "85": 3575768576.0, + "86": 3575768576.0, + "87": 3575768576.0, + "88": 3575768576.0, + "89": 3575768576.0, + "90": 3575768576.0, + "91": 3575768576.0, + "92": 3575768576.0, + "93": 3575768576.0, + "94": 3575768576.0, + "95": 3575768576.0, + "96": 3575768576.0, + "97": 3575768576.0, + "98": 3575768576.0, + "99": 3575768576.0, + "100": 3575768576.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.66914, + "2": 0.21684, + "3": 0.17892, + "4": 0.17346, + "5": 0.17105, + "6": 0.17127, + "7": 0.17098, + "8": 0.17217, + "9": 0.17182, + "10": 0.17103, + "11": 0.17137, + "12": 0.17055, + "13": 0.17065, + "14": 0.17142, + "15": 0.17038, + "16": 0.16903, + "17": 0.16848, + "18": 0.16975, + "19": 0.16977, + "20": 0.17019, + "21": 0.16985, + "22": 0.16955, + "23": 0.16804, + "24": 0.16891, + "25": 0.16902, + "26": 0.16957, + "27": 0.16863, + "28": 0.16926, + "29": 0.16921, + "30": 0.168, + "31": 0.16922, + "32": 0.16856, + "33": 0.17245, + "34": 0.16964, + "35": 0.16929, + "36": 0.16825, + "37": 0.16872, + "38": 0.16843, + "39": 0.16954, + "40": 0.16969, + "41": 0.16937, + "42": 0.1686, + "43": 0.34614, + "44": 0.16943, + "45": 0.16912, + "46": 0.16957, + "47": 0.16789, + "48": 0.16768, + "49": 0.16897, + "50": 0.16779, + "51": 0.3373, + "52": 0.17048, + "53": 0.16638, + "54": 0.16813, + "55": 0.16767, + "56": 0.16807, + "57": 0.16799, + "58": 0.16657, + "59": 0.16804, + "60": 0.16874, + "61": 0.1679, + "62": 0.16609, + "63": 0.16577, + "64": 0.16659, + "65": 0.16778, + "66": 0.16673, + "67": 0.16832, + "68": 0.16874, + "69": 0.16895, + "70": 0.16685, + "71": 0.16724, + "72": 0.1677, + "73": 0.16716, + "74": 0.16899, + "75": 0.1687, + "76": 0.16719, + "77": 0.16812, + "78": 0.1671, + "79": 0.1671, + "80": 0.16726, + "81": 0.16712, + "82": 0.16866, + "83": 0.16717, + "84": 0.16749, + "85": 0.16759, + "86": 0.16853, + "87": 0.16786, + "88": 0.16717, + "89": 0.16661, + "90": 0.16719, + "91": 0.17397, + "92": 0.17387, + "93": 0.17474, + "94": 0.17341, + "95": 0.17473, + "96": 0.17386, + "97": 0.17453, + "98": 0.17503, + "99": 0.17293, + "100": 0.17243 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 17ee04cf0ae..63425028dd5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, "100": 9.39375 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, "100": 3109.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, "100": 746194432.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, "100": 2209851392.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 10.15333, - "5": 0.09518, - "10": 0.09562, - "15": 0.09503, - "20": 0.09503, - "25": 0.09461, - "30": 0.09547, - "35": 0.09528, - "40": 0.0967, - "45": 0.09344, - "50": 0.09511, - "55": 0.09515, - "60": 0.09496, - "65": 0.09478, - "70": 0.09504, - "75": 0.09415, - "80": 0.09367, - "85": 0.09449, - "90": 0.09786, - "95": 0.09592, - "100": 0.09477 + "1": 12.78916, + "2": 0.129, + "3": 0.1167, + "4": 0.11497, + "5": 0.10818, + "6": 0.10473, + "7": 0.10532, + "8": 0.10616, + "9": 0.10723, + "10": 0.10865, + "11": 0.10729, + "12": 0.10632, + "13": 0.10608, + "14": 0.1066, + "15": 0.10589, + "16": 0.10567, + "17": 0.10574, + "18": 0.10663, + "19": 0.10656, + "20": 0.10767, + "21": 0.10522, + "22": 0.10601, + "23": 0.10475, + "24": 0.10392, + "25": 0.10556, + "26": 0.10438, + "27": 0.10635, + "28": 0.10742, + "29": 0.10795, + "30": 0.10745, + "31": 0.10836, + "32": 0.10639, + "33": 0.10597, + "34": 0.1064, + "35": 0.10496, + "36": 0.10549, + "37": 0.10538, + "38": 0.107, + "39": 0.10567, + "40": 0.10655, + "41": 0.10552, + "42": 0.10527, + "43": 0.10546, + "44": 0.10643, + "45": 0.10624, + "46": 0.10787, + "47": 0.1068, + "48": 0.1075, + "49": 0.10525, + "50": 0.10727, + "51": 0.126, + "52": 0.1146, + "53": 0.11042, + "54": 0.12389, + "55": 0.10643, + "56": 0.10676, + "57": 0.10677, + "58": 0.10573, + "59": 0.10709, + "60": 0.10515, + "61": 0.10668, + "62": 0.10599, + "63": 0.10616, + "64": 0.10462, + "65": 0.10742, + "66": 0.10693, + "67": 0.10628, + "68": 0.10748, + "69": 0.10707, + "70": 0.10621, + "71": 0.105, + "72": 0.10801, + "73": 0.10662, + "74": 0.10641, + "75": 0.10562, + "76": 0.10643, + "77": 0.10629, + "78": 0.10538, + "79": 0.1047, + "80": 0.10541, + "81": 0.10526, + "82": 0.10753, + "83": 0.10562, + "84": 0.10631, + "85": 0.10586, + "86": 0.10685, + "87": 0.1065, + "88": 0.10696, + "89": 0.10619, + "90": 0.10588, + "91": 0.10452, + "92": 0.10667, + "93": 0.10546, + "94": 0.1036, + "95": 0.10483, + "96": 0.10512, + "97": 0.10433, + "98": 0.10471, + "99": 0.10514, + "100": 0.10516 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f5a45f2f146 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, + "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, + "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, + "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, + "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, + "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, + "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, + "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, + "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, + "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, + "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, + "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, + "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, + "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, + "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, + "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, + "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, + "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, + "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, + "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, + "100": 9.39375 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, + "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, + "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, + "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, + "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, + "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, + "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, + "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, + "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, + "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, + "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, + "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, + "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, + "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, + "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, + "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, + "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, + "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, + "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, + "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, + "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, + "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, + "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, + "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, + "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, + "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, + "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, + "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, + "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, + "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, + "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, + "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, + "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, + "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, + "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, + "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, + "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, + "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, + "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, + "100": 746194432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, + "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, + "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, + "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, + "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, + "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, + "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, + "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, + "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, + "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, + "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, + "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, + "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, + "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, + "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, + "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, + "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, + "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, + "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, + "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, + "100": 2209851392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.78981, + "2": 0.14641, + "3": 0.09823, + "4": 0.09626, + "5": 0.09543, + "6": 0.09563, + "7": 0.09569, + "8": 0.0947, + "9": 0.09571, + "10": 0.09565, + "11": 0.09526, + "12": 0.09451, + "13": 0.09577, + "14": 0.09578, + "15": 0.0954, + "16": 0.09495, + "17": 0.09576, + "18": 0.09506, + "19": 0.09526, + "20": 0.09508, + "21": 0.09525, + "22": 0.09601, + "23": 0.09712, + "24": 0.09956, + "25": 0.09858, + "26": 0.09859, + "27": 0.097, + "28": 0.0963, + "29": 0.09742, + "30": 0.09459, + "31": 0.09583, + "32": 0.09745, + "33": 0.09523, + "34": 0.09486, + "35": 0.09594, + "36": 0.09571, + "37": 0.09608, + "38": 0.09689, + "39": 0.09574, + "40": 0.09565, + "41": 0.0958, + "42": 0.09573, + "43": 0.0958, + "44": 0.09524, + "45": 0.09519, + "46": 0.0952, + "47": 0.09476, + "48": 0.09432, + "49": 0.09445, + "50": 0.09411, + "51": 0.11832, + "52": 0.10335, + "53": 0.10105, + "54": 0.11751, + "55": 0.09996, + "56": 0.09926, + "57": 0.1014, + "58": 0.10002, + "59": 0.10069, + "60": 0.09932, + "61": 0.09999, + "62": 0.10028, + "63": 0.09961, + "64": 0.09886, + "65": 0.10127, + "66": 0.09994, + "67": 0.09975, + "68": 0.10037, + "69": 0.09896, + "70": 0.09847, + "71": 0.09907, + "72": 0.09929, + "73": 0.09893, + "74": 0.09893, + "75": 0.09961, + "76": 0.09928, + "77": 0.0991, + "78": 0.10211, + "79": 0.09934, + "80": 0.10027, + "81": 0.0996, + "82": 0.09986, + "83": 0.09951, + "84": 0.09761, + "85": 0.09909, + "86": 0.099, + "87": 0.09903, + "88": 0.09905, + "89": 0.0999, + "90": 0.09942, + "91": 0.09983, + "92": 0.09886, + "93": 0.09982, + "94": 0.09894, + "95": 0.09946, + "96": 0.09983, + "97": 0.09904, + "98": 0.09902, + "99": 0.09961, + "100": 0.09808 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..72743900cff --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, + "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, + "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, + "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, + "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, + "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, + "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, + "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, + "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, + "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, + "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, + "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, + "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, + "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, + "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, + "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, + "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, + "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, + "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, + "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, + "100": 9.39375 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, + "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, + "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, + "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, + "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, + "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, + "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, + "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, + "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, + "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, + "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, + "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, + "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, + "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, + "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, + "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, + "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, + "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, + "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, + "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, + "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, + "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, + "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, + "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, + "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, + "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, + "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, + "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, + "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, + "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, + "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, + "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, + "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, + "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, + "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, + "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, + "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, + "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, + "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, + "100": 746194432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, + "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, + "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, + "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, + "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, + "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, + "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, + "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, + "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, + "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, + "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, + "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, + "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, + "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, + "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, + "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, + "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, + "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, + "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, + "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, + "100": 2209851392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.71973, + "2": 0.14026, + "3": 0.11862, + "4": 0.10675, + "5": 0.10706, + "6": 0.10639, + "7": 0.10733, + "8": 0.10668, + "9": 0.10876, + "10": 0.10818, + "11": 0.10917, + "12": 0.1083, + "13": 0.10781, + "14": 0.10774, + "15": 0.10649, + "16": 0.10734, + "17": 0.10691, + "18": 0.10561, + "19": 0.10658, + "20": 0.10698, + "21": 0.10786, + "22": 0.10799, + "23": 0.10759, + "24": 0.10883, + "25": 0.10795, + "26": 0.10754, + "27": 0.10823, + "28": 0.10763, + "29": 0.10845, + "30": 0.10831, + "31": 0.10745, + "32": 0.10718, + "33": 0.10787, + "34": 0.10797, + "35": 0.1082, + "36": 0.10752, + "37": 0.10829, + "38": 0.10875, + "39": 0.10866, + "40": 0.1088, + "41": 0.10879, + "42": 0.10749, + "43": 0.10899, + "44": 0.10725, + "45": 0.10697, + "46": 0.10761, + "47": 0.10683, + "48": 0.10976, + "49": 0.10965, + "50": 0.10766, + "51": 0.123, + "52": 0.11396, + "53": 0.10816, + "54": 0.10864, + "55": 0.12449, + "56": 0.1076, + "57": 0.10895, + "58": 0.10793, + "59": 0.10902, + "60": 0.10551, + "61": 0.10575, + "62": 0.10761, + "63": 0.10614, + "64": 0.10584, + "65": 0.10699, + "66": 0.1077, + "67": 0.10786, + "68": 0.10744, + "69": 0.10671, + "70": 0.10786, + "71": 0.10765, + "72": 0.10586, + "73": 0.10669, + "74": 0.10611, + "75": 0.10692, + "76": 0.10782, + "77": 0.10601, + "78": 0.10616, + "79": 0.10555, + "80": 0.10728, + "81": 0.10656, + "82": 0.10848, + "83": 0.10786, + "84": 0.10935, + "85": 0.11246, + "86": 0.11271, + "87": 0.10885, + "88": 0.10616, + "89": 0.10731, + "90": 0.10705, + "91": 0.10547, + "92": 0.10622, + "93": 0.10619, + "94": 0.10678, + "95": 0.10769, + "96": 0.10574, + "97": 0.10691, + "98": 0.10682, + "99": 0.10685, + "100": 0.10542 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index b6823bec847..2125b88c754 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, "100": 9.39375 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, "100": 3109.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, "100": 746194432.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, "100": 2209851392.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 11.76041, - "5": 0.0928, - "10": 0.09401, - "15": 0.09246, - "20": 0.09284, - "25": 0.09344, - "30": 0.09267, - "35": 0.09314, - "40": 0.0926, - "45": 0.09244, - "50": 0.0925, - "55": 0.09481, - "60": 0.09314, - "65": 0.09243, - "70": 0.09297, - "75": 0.09278, - "80": 0.0928, - "85": 0.09198, - "90": 0.09259, - "95": 0.09244, - "100": 0.09223 + "1": 12.82981, + "2": 0.12202, + "3": 0.10747, + "4": 0.10702, + "5": 0.10713, + "6": 0.10667, + "7": 0.10627, + "8": 0.10699, + "9": 0.10657, + "10": 0.10715, + "11": 0.10642, + "12": 0.10705, + "13": 0.10495, + "14": 0.10784, + "15": 0.1107, + "16": 0.1105, + "17": 0.11162, + "18": 0.11128, + "19": 0.11269, + "20": 0.10842, + "21": 0.10915, + "22": 0.10863, + "23": 0.10818, + "24": 0.10975, + "25": 0.10577, + "26": 0.10559, + "27": 0.10659, + "28": 0.10616, + "29": 0.10712, + "30": 0.10735, + "31": 0.1064, + "32": 0.10562, + "33": 0.10538, + "34": 0.10678, + "35": 0.10507, + "36": 0.10502, + "37": 0.10532, + "38": 0.10636, + "39": 0.10511, + "40": 0.10497, + "41": 0.10557, + "42": 0.10413, + "43": 0.10684, + "44": 0.10567, + "45": 0.10719, + "46": 0.10887, + "47": 0.11215, + "48": 0.11102, + "49": 0.10907, + "50": 0.10761, + "51": 0.12141, + "52": 0.13372, + "53": 0.10585, + "54": 0.10595, + "55": 0.10712, + "56": 0.10573, + "57": 0.10825, + "58": 0.10991, + "59": 0.10753, + "60": 0.10565, + "61": 0.10639, + "62": 0.11, + "63": 0.10465, + "64": 0.10596, + "65": 0.10785, + "66": 0.11597, + "67": 0.10697, + "68": 0.10722, + "69": 0.10693, + "70": 0.1079, + "71": 0.10852, + "72": 0.10729, + "73": 0.10617, + "74": 0.1046, + "75": 0.10476, + "76": 0.11096, + "77": 0.10553, + "78": 0.10593, + "79": 0.1069, + "80": 0.10615, + "81": 0.11416, + "82": 0.10544, + "83": 0.10562, + "84": 0.10576, + "85": 0.10568, + "86": 0.10984, + "87": 0.10814, + "88": 0.10556, + "89": 0.10524, + "90": 0.1051, + "91": 0.11373, + "92": 0.10616, + "93": 0.10743, + "94": 0.10695, + "95": 0.11373, + "96": 0.10777, + "97": 0.10685, + "98": 0.10614, + "99": 0.10571, + "100": 0.10707 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f5278baae82 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, + "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, + "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, + "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, + "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, + "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, + "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, + "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, + "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, + "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, + "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, + "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, + "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, + "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, + "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, + "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, + "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, + "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, + "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, + "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, + "100": 9.39375 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, + "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, + "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, + "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, + "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, + "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, + "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, + "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, + "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, + "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, + "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, + "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, + "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, + "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, + "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, + "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, + "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, + "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, + "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, + "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, + "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, + "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, + "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, + "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, + "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, + "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, + "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, + "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, + "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, + "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, + "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, + "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, + "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, + "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, + "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, + "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, + "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, + "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, + "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, + "100": 746194432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, + "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, + "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, + "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, + "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, + "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, + "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, + "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, + "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, + "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, + "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, + "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, + "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, + "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, + "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, + "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, + "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, + "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, + "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, + "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, + "100": 2209851392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.70442, + "2": 0.13019, + "3": 0.0979, + "4": 0.09686, + "5": 0.09768, + "6": 0.09685, + "7": 0.09593, + "8": 0.09527, + "9": 0.09564, + "10": 0.09666, + "11": 0.09434, + "12": 0.09507, + "13": 0.09515, + "14": 0.09479, + "15": 0.09471, + "16": 0.09457, + "17": 0.09471, + "18": 0.09471, + "19": 0.09425, + "20": 0.09404, + "21": 0.09478, + "22": 0.09431, + "23": 0.09582, + "24": 0.09629, + "25": 0.09606, + "26": 0.09601, + "27": 0.09669, + "28": 0.0955, + "29": 0.09877, + "30": 0.09681, + "31": 0.09783, + "32": 0.09679, + "33": 0.09636, + "34": 0.09497, + "35": 0.0955, + "36": 0.09533, + "37": 0.09488, + "38": 0.10172, + "39": 0.09491, + "40": 0.09435, + "41": 0.09527, + "42": 0.09493, + "43": 0.10246, + "44": 0.10248, + "45": 0.10163, + "46": 0.10184, + "47": 0.10193, + "48": 0.10237, + "49": 0.10206, + "50": 0.10141, + "51": 0.11047, + "52": 0.12328, + "53": 0.10274, + "54": 0.0969, + "55": 0.09666, + "56": 0.09655, + "57": 0.09837, + "58": 0.10123, + "59": 0.10037, + "60": 0.09607, + "61": 0.09522, + "62": 0.09645, + "63": 0.09756, + "64": 0.09502, + "65": 0.09541, + "66": 0.09681, + "67": 0.09707, + "68": 0.09483, + "69": 0.09531, + "70": 0.0962, + "71": 0.09572, + "72": 0.09677, + "73": 0.09704, + "74": 0.09624, + "75": 0.09474, + "76": 0.09532, + "77": 0.09678, + "78": 0.09534, + "79": 0.09817, + "80": 0.09669, + "81": 0.09724, + "82": 0.09754, + "83": 0.09837, + "84": 0.09528, + "85": 0.09597, + "86": 0.09653, + "87": 0.09565, + "88": 0.0961, + "89": 0.09685, + "90": 0.0967, + "91": 0.0944, + "92": 0.09565, + "93": 0.09526, + "94": 0.09573, + "95": 0.09396, + "96": 0.09557, + "97": 0.09618, + "98": 0.0957, + "99": 0.09558, + "100": 0.09514 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..50639a30816 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85599, + "2": 10.8648, + "3": 10.87042, + "4": 10.85288, + "5": 10.88397, + "6": 10.89184, + "7": 10.86732, + "8": 10.87057, + "9": 10.87432, + "10": 10.84185, + "11": 10.87989, + "12": 10.87417, + "13": 10.87884, + "14": 10.89184, + "15": 10.82659, + "16": 10.83027, + "17": 10.80933, + "18": 10.81431, + "19": 10.8167, + "20": 10.72165, + "21": 10.70557, + "22": 10.56881, + "23": 10.72025, + "24": 10.61194, + "25": 10.55765, + "26": 10.61149, + "27": 10.62635, + "28": 10.57155, + "29": 10.58212, + "30": 10.36267, + "31": 10.11682, + "32": 10.4682, + "33": 10.45411, + "34": 10.21121, + "35": 10.27207, + "36": 10.22246, + "37": 10.34079, + "38": 10.18964, + "39": 10.40228, + "40": 10.08758, + "41": 10.13714, + "42": 10.21175, + "43": 9.82878, + "44": 9.96255, + "45": 9.82846, + "46": 9.80952, + "47": 10.13734, + "48": 9.84349, + "49": 9.52888, + "50": 9.91046, + "51": 9.85075, + "52": 9.73181, + "53": 10.06388, + "54": 9.95432, + "55": 9.87204, + "56": 9.61823, + "57": 9.47467, + "58": 9.82802, + "59": 9.57962, + "60": 9.49074, + "61": 9.68473, + "62": 9.99245, + "63": 9.38364, + "64": 9.77766, + "65": 8.94008, + "66": 9.70099, + "67": 9.3605, + "68": 9.77766, + "69": 9.78865, + "70": 9.73813, + "71": 9.61811, + "72": 9.58068, + "73": 9.4964, + "74": 8.93812, + "75": 9.42081, + "76": 9.07416, + "77": 10.06077, + "78": 9.71952, + "79": 9.37088, + "80": 9.39874, + "81": 9.47802, + "82": 9.69299, + "83": 9.30276, + "84": 9.41548, + "85": 9.60883, + "86": 9.07461, + "87": 9.58826, + "88": 9.74392, + "89": 9.5951, + "90": 9.81217, + "91": 9.33796, + "92": 9.3534, + "93": 9.07315, + "94": 8.83127, + "95": 9.51524, + "96": 9.52183, + "97": 9.31012, + "98": 9.66532, + "99": 8.88179, + "100": 9.39375 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1640.0, + "2": 1738.0, + "3": 1638.0, + "4": 1810.0, + "5": 1755.0, + "6": 1681.0, + "7": 1781.0, + "8": 1502.0, + "9": 1817.0, + "10": 1394.0, + "11": 1927.0, + "12": 1691.0, + "13": 1901.0, + "14": 1631.0, + "15": 1765.0, + "16": 1864.0, + "17": 1704.0, + "18": 1771.0, + "19": 1817.0, + "20": 1831.0, + "21": 1813.0, + "22": 1673.0, + "23": 2005.0, + "24": 1553.0, + "25": 1577.0, + "26": 1656.0, + "27": 1734.0, + "28": 1896.0, + "29": 2051.0, + "30": 1897.0, + "31": 1452.0, + "32": 1785.0, + "33": 2061.0, + "34": 1857.0, + "35": 1920.0, + "36": 1990.0, + "37": 2191.0, + "38": 2142.0, + "39": 2215.0, + "40": 2166.0, + "41": 2154.0, + "42": 2148.0, + "43": 1881.0, + "44": 2066.0, + "45": 1952.0, + "46": 2217.0, + "47": 2513.0, + "48": 2356.0, + "49": 2294.0, + "50": 2140.0, + "51": 2509.0, + "52": 2528.0, + "53": 2851.0, + "54": 2747.0, + "55": 2333.0, + "56": 2724.0, + "57": 2315.0, + "58": 2754.0, + "59": 2774.0, + "60": 2336.0, + "61": 2912.0, + "62": 2415.0, + "63": 2341.0, + "64": 2837.0, + "65": 2661.0, + "66": 3000.0, + "67": 2779.0, + "68": 2691.0, + "69": 2793.0, + "70": 3183.0, + "71": 2962.0, + "72": 2393.0, + "73": 2997.0, + "74": 1935.0, + "75": 2463.0, + "76": 3065.0, + "77": 3184.0, + "78": 3154.0, + "79": 3127.0, + "80": 3286.0, + "81": 3386.0, + "82": 3128.0, + "83": 2608.0, + "84": 3079.0, + "85": 3260.0, + "86": 2687.0, + "87": 3591.0, + "88": 3035.0, + "89": 3165.0, + "90": 3166.0, + "91": 2690.0, + "92": 2897.0, + "93": 2630.0, + "94": 3348.0, + "95": 3349.0, + "96": 3288.0, + "97": 3055.0, + "98": 3516.0, + "99": 3035.0, + "100": 3109.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746194432.0, + "2": 746194432.0, + "3": 746194432.0, + "4": 746194432.0, + "5": 746194432.0, + "6": 746194432.0, + "7": 746194432.0, + "8": 746194432.0, + "9": 746194432.0, + "10": 746194432.0, + "11": 746194432.0, + "12": 746194432.0, + "13": 746194432.0, + "14": 746194432.0, + "15": 746194432.0, + "16": 746194432.0, + "17": 746194432.0, + "18": 746194432.0, + "19": 746194432.0, + "20": 746194432.0, + "21": 746194432.0, + "22": 746194432.0, + "23": 746194432.0, + "24": 746194432.0, + "25": 746194432.0, + "26": 746194432.0, + "27": 746194432.0, + "28": 746194432.0, + "29": 746194432.0, + "30": 746194432.0, + "31": 746194432.0, + "32": 746194432.0, + "33": 746194432.0, + "34": 746194432.0, + "35": 746194432.0, + "36": 746194432.0, + "37": 746194432.0, + "38": 746194432.0, + "39": 746194432.0, + "40": 746194432.0, + "41": 746194432.0, + "42": 746194432.0, + "43": 746194432.0, + "44": 746194432.0, + "45": 746194432.0, + "46": 746194432.0, + "47": 746194432.0, + "48": 746194432.0, + "49": 746194432.0, + "50": 746194432.0, + "51": 746194432.0, + "52": 746194432.0, + "53": 746194432.0, + "54": 746194432.0, + "55": 746194432.0, + "56": 746194432.0, + "57": 746194432.0, + "58": 746194432.0, + "59": 746194432.0, + "60": 746194432.0, + "61": 746194432.0, + "62": 746194432.0, + "63": 746194432.0, + "64": 746194432.0, + "65": 746194432.0, + "66": 746194432.0, + "67": 746194432.0, + "68": 746194432.0, + "69": 746194432.0, + "70": 746194432.0, + "71": 746194432.0, + "72": 746194432.0, + "73": 746194432.0, + "74": 746194432.0, + "75": 746194432.0, + "76": 746194432.0, + "77": 746194432.0, + "78": 746194432.0, + "79": 746194432.0, + "80": 746194432.0, + "81": 746194432.0, + "82": 746194432.0, + "83": 746194432.0, + "84": 746194432.0, + "85": 746194432.0, + "86": 746194432.0, + "87": 746194432.0, + "88": 746194432.0, + "89": 746194432.0, + "90": 746194432.0, + "91": 746194432.0, + "92": 746194432.0, + "93": 746194432.0, + "94": 746194432.0, + "95": 746194432.0, + "96": 746194432.0, + "97": 746194432.0, + "98": 746194432.0, + "99": 746194432.0, + "100": 746194432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926153216.0, + "2": 2209851392.0, + "3": 2209851392.0, + "4": 2209851392.0, + "5": 2209851392.0, + "6": 2209851392.0, + "7": 2209851392.0, + "8": 2209851392.0, + "9": 2209851392.0, + "10": 2209851392.0, + "11": 2209851392.0, + "12": 2209851392.0, + "13": 2209851392.0, + "14": 2209851392.0, + "15": 2209851392.0, + "16": 2209851392.0, + "17": 2209851392.0, + "18": 2209851392.0, + "19": 2209851392.0, + "20": 2209851392.0, + "21": 2209851392.0, + "22": 2209851392.0, + "23": 2209851392.0, + "24": 2209851392.0, + "25": 2209851392.0, + "26": 2209851392.0, + "27": 2209851392.0, + "28": 2209851392.0, + "29": 2209851392.0, + "30": 2209851392.0, + "31": 2209851392.0, + "32": 2209851392.0, + "33": 2209851392.0, + "34": 2209851392.0, + "35": 2209851392.0, + "36": 2209851392.0, + "37": 2209851392.0, + "38": 2209851392.0, + "39": 2209851392.0, + "40": 2209851392.0, + "41": 2209851392.0, + "42": 2209851392.0, + "43": 2209851392.0, + "44": 2209851392.0, + "45": 2209851392.0, + "46": 2209851392.0, + "47": 2209851392.0, + "48": 2209851392.0, + "49": 2209851392.0, + "50": 2209851392.0, + "51": 2209851392.0, + "52": 2209851392.0, + "53": 2209851392.0, + "54": 2209851392.0, + "55": 2209851392.0, + "56": 2209851392.0, + "57": 2209851392.0, + "58": 2209851392.0, + "59": 2209851392.0, + "60": 2209851392.0, + "61": 2209851392.0, + "62": 2209851392.0, + "63": 2209851392.0, + "64": 2209851392.0, + "65": 2209851392.0, + "66": 2209851392.0, + "67": 2209851392.0, + "68": 2209851392.0, + "69": 2209851392.0, + "70": 2209851392.0, + "71": 2209851392.0, + "72": 2209851392.0, + "73": 2209851392.0, + "74": 2209851392.0, + "75": 2209851392.0, + "76": 2209851392.0, + "77": 2209851392.0, + "78": 2209851392.0, + "79": 2209851392.0, + "80": 2209851392.0, + "81": 2209851392.0, + "82": 2209851392.0, + "83": 2209851392.0, + "84": 2209851392.0, + "85": 2209851392.0, + "86": 2209851392.0, + "87": 2209851392.0, + "88": 2209851392.0, + "89": 2209851392.0, + "90": 2209851392.0, + "91": 2209851392.0, + "92": 2209851392.0, + "93": 2209851392.0, + "94": 2209851392.0, + "95": 2209851392.0, + "96": 2209851392.0, + "97": 2209851392.0, + "98": 2209851392.0, + "99": 2209851392.0, + "100": 2209851392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.88983, + "2": 0.12288, + "3": 0.10944, + "4": 0.10822, + "5": 0.10919, + "6": 0.10835, + "7": 0.11035, + "8": 0.10879, + "9": 0.11001, + "10": 0.11009, + "11": 0.10945, + "12": 0.10868, + "13": 0.1086, + "14": 0.10899, + "15": 0.10852, + "16": 0.10822, + "17": 0.10818, + "18": 0.10877, + "19": 0.10888, + "20": 0.10828, + "21": 0.109, + "22": 0.108, + "23": 0.10722, + "24": 0.10731, + "25": 0.1075, + "26": 0.10744, + "27": 0.10843, + "28": 0.10831, + "29": 0.10841, + "30": 0.10718, + "31": 0.10837, + "32": 0.10773, + "33": 0.10792, + "34": 0.10698, + "35": 0.10976, + "36": 0.10758, + "37": 0.10825, + "38": 0.10781, + "39": 0.10912, + "40": 0.10847, + "41": 0.10786, + "42": 0.10767, + "43": 0.10761, + "44": 0.1076, + "45": 0.1078, + "46": 0.10992, + "47": 0.1061, + "48": 0.10654, + "49": 0.10566, + "50": 0.1066, + "51": 0.11234, + "52": 0.11065, + "53": 0.10795, + "54": 0.10668, + "55": 0.10678, + "56": 0.10889, + "57": 0.10802, + "58": 0.12482, + "59": 0.10666, + "60": 0.10637, + "61": 0.10776, + "62": 0.10743, + "63": 0.10782, + "64": 0.10634, + "65": 0.10744, + "66": 0.10859, + "67": 0.10949, + "68": 0.1075, + "69": 0.10803, + "70": 0.10688, + "71": 0.10797, + "72": 0.10752, + "73": 0.10816, + "74": 0.10734, + "75": 0.10832, + "76": 0.10815, + "77": 0.10868, + "78": 0.10839, + "79": 0.1074, + "80": 0.10866, + "81": 0.11122, + "82": 0.11035, + "83": 0.1101, + "84": 0.1122, + "85": 0.10866, + "86": 0.10915, + "87": 0.10842, + "88": 0.10723, + "89": 0.10849, + "90": 0.10814, + "91": 0.10833, + "92": 0.10719, + "93": 0.10725, + "94": 0.10754, + "95": 0.10758, + "96": 0.1082, + "97": 0.10768, + "98": 0.10708, + "99": 0.10785, + "100": 0.10841 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..5de8b526700 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79205, + "2": 10.80272, + "3": 10.80707, + "4": 10.77315, + "5": 10.84695, + "6": 10.86789, + "7": 10.82655, + "8": 10.81333, + "9": 10.83441, + "10": 10.77106, + "11": 10.89149, + "12": 10.84617, + "13": 10.85969, + "14": 10.8812, + "15": 10.79093, + "16": 10.78328, + "17": 10.75926, + "18": 10.79337, + "19": 10.797, + "20": 10.68042, + "21": 10.66126, + "22": 10.50248, + "23": 10.71375, + "24": 10.55253, + "25": 10.50715, + "26": 10.58275, + "27": 10.58672, + "28": 10.55873, + "29": 10.56101, + "30": 10.33325, + "31": 10.08467, + "32": 10.44744, + "33": 10.44372, + "34": 10.2003, + "35": 10.25545, + "36": 10.19448, + "37": 10.32113, + "38": 10.1659, + "39": 10.37726, + "40": 10.05544, + "41": 10.13785, + "42": 10.19159, + "43": 9.80956, + "44": 9.92967, + "45": 9.80575, + "46": 9.81454, + "47": 10.12933, + "48": 9.82644, + "49": 9.51395, + "50": 9.89082, + "51": 9.8397, + "52": 9.73412, + "53": 10.05515, + "54": 9.94093, + "55": 9.87063, + "56": 9.61009, + "57": 9.46055, + "58": 9.81541, + "59": 9.57905, + "60": 9.48478, + "61": 9.68485, + "62": 9.97574, + "63": 9.36483, + "64": 9.76838, + "65": 8.94022, + "66": 9.68864, + "67": 9.36647, + "68": 9.77611, + "69": 9.78404, + "70": 9.72243, + "71": 9.6082, + "72": 9.57758, + "73": 9.48936, + "74": 8.9399, + "75": 9.40907, + "76": 9.08135, + "77": 10.05639, + "78": 9.72293, + "79": 9.36509, + "80": 9.3976, + "81": 9.47445, + "82": 9.68843, + "83": 9.30263, + "84": 9.4102, + "85": 9.60746, + "86": 9.07122, + "87": 9.58742, + "88": 9.74129, + "89": 9.59922, + "90": 9.81041, + "91": 9.33141, + "92": 9.35529, + "93": 9.07461, + "94": 8.82759, + "95": 9.5116, + "96": 9.51899, + "97": 9.30162, + "98": 9.66741, + "99": 8.88218, + "100": 9.39722 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1580.0, + "2": 1686.0, + "3": 1726.0, + "4": 1795.0, + "5": 1901.0, + "6": 1778.0, + "7": 1963.0, + "8": 1704.0, + "9": 1811.0, + "10": 1346.0, + "11": 1849.0, + "12": 1683.0, + "13": 1888.0, + "14": 1711.0, + "15": 1926.0, + "16": 1841.0, + "17": 1931.0, + "18": 1716.0, + "19": 1765.0, + "20": 1643.0, + "21": 1884.0, + "22": 1626.0, + "23": 1954.0, + "24": 1715.0, + "25": 1683.0, + "26": 1679.0, + "27": 1817.0, + "28": 2019.0, + "29": 1946.0, + "30": 1867.0, + "31": 1544.0, + "32": 1832.0, + "33": 2119.0, + "34": 1921.0, + "35": 2020.0, + "36": 1953.0, + "37": 2350.0, + "38": 2210.0, + "39": 2319.0, + "40": 2252.0, + "41": 2449.0, + "42": 2364.0, + "43": 2089.0, + "44": 2094.0, + "45": 2243.0, + "46": 2335.0, + "47": 2406.0, + "48": 2410.0, + "49": 2341.0, + "50": 2459.0, + "51": 2611.0, + "52": 2427.0, + "53": 2838.0, + "54": 2632.0, + "55": 2291.0, + "56": 2663.0, + "57": 2276.0, + "58": 2777.0, + "59": 2601.0, + "60": 2404.0, + "61": 2985.0, + "62": 2595.0, + "63": 2454.0, + "64": 3101.0, + "65": 2474.0, + "66": 3006.0, + "67": 2671.0, + "68": 2874.0, + "69": 2956.0, + "70": 3102.0, + "71": 2891.0, + "72": 2543.0, + "73": 2860.0, + "74": 1888.0, + "75": 2603.0, + "76": 2813.0, + "77": 3361.0, + "78": 3252.0, + "79": 3007.0, + "80": 3420.0, + "81": 3624.0, + "82": 3184.0, + "83": 2708.0, + "84": 3138.0, + "85": 3388.0, + "86": 2619.0, + "87": 3682.0, + "88": 3074.0, + "89": 3260.0, + "90": 2904.0, + "91": 2634.0, + "92": 3097.0, + "93": 2745.0, + "94": 3484.0, + "95": 3333.0, + "96": 3292.0, + "97": 3141.0, + "98": 3550.0, + "99": 3170.0, + "100": 3347.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 714736640.0, + "2": 714736640.0, + "3": 714736640.0, + "4": 714736640.0, + "5": 714736640.0, + "6": 714736640.0, + "7": 714736640.0, + "8": 714736640.0, + "9": 714736640.0, + "10": 714736640.0, + "11": 714736640.0, + "12": 714736640.0, + "13": 714736640.0, + "14": 714736640.0, + "15": 714736640.0, + "16": 714736640.0, + "17": 714736640.0, + "18": 714736640.0, + "19": 714736640.0, + "20": 714736640.0, + "21": 714736640.0, + "22": 714736640.0, + "23": 714736640.0, + "24": 714736640.0, + "25": 714736640.0, + "26": 714736640.0, + "27": 714736640.0, + "28": 714736640.0, + "29": 714736640.0, + "30": 714736640.0, + "31": 714736640.0, + "32": 714736640.0, + "33": 714736640.0, + "34": 714736640.0, + "35": 714736640.0, + "36": 714736640.0, + "37": 714736640.0, + "38": 714736640.0, + "39": 714736640.0, + "40": 714736640.0, + "41": 714736640.0, + "42": 714736640.0, + "43": 714736640.0, + "44": 714736640.0, + "45": 714736640.0, + "46": 714736640.0, + "47": 714736640.0, + "48": 714736640.0, + "49": 714736640.0, + "50": 714736640.0, + "51": 714736640.0, + "52": 714736640.0, + "53": 714736640.0, + "54": 714736640.0, + "55": 714736640.0, + "56": 714736640.0, + "57": 714736640.0, + "58": 714736640.0, + "59": 714736640.0, + "60": 714736640.0, + "61": 714736640.0, + "62": 714736640.0, + "63": 714736640.0, + "64": 714736640.0, + "65": 714736640.0, + "66": 714736640.0, + "67": 714736640.0, + "68": 714736640.0, + "69": 714736640.0, + "70": 714736640.0, + "71": 714736640.0, + "72": 714736640.0, + "73": 714736640.0, + "74": 714736640.0, + "75": 714736640.0, + "76": 714736640.0, + "77": 714736640.0, + "78": 714736640.0, + "79": 714736640.0, + "80": 714736640.0, + "81": 714736640.0, + "82": 714736640.0, + "83": 714736640.0, + "84": 714736640.0, + "85": 714736640.0, + "86": 714736640.0, + "87": 714736640.0, + "88": 714736640.0, + "89": 714736640.0, + "90": 714736640.0, + "91": 714736640.0, + "92": 714736640.0, + "93": 714736640.0, + "94": 714736640.0, + "95": 714736640.0, + "96": 714736640.0, + "97": 714736640.0, + "98": 714736640.0, + "99": 714736640.0, + "100": 714736640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2399714304.0, + "2": 2681315328.0, + "3": 2681315328.0, + "4": 2681315328.0, + "5": 2681315328.0, + "6": 2681315328.0, + "7": 2681315328.0, + "8": 2681315328.0, + "9": 2681315328.0, + "10": 2681315328.0, + "11": 2681315328.0, + "12": 2681315328.0, + "13": 2681315328.0, + "14": 2681315328.0, + "15": 2681315328.0, + "16": 2681315328.0, + "17": 2681315328.0, + "18": 2681315328.0, + "19": 2681315328.0, + "20": 2681315328.0, + "21": 2681315328.0, + "22": 2681315328.0, + "23": 2681315328.0, + "24": 2681315328.0, + "25": 2681315328.0, + "26": 2681315328.0, + "27": 2681315328.0, + "28": 2681315328.0, + "29": 2681315328.0, + "30": 2681315328.0, + "31": 2681315328.0, + "32": 2681315328.0, + "33": 2681315328.0, + "34": 2681315328.0, + "35": 2681315328.0, + "36": 2681315328.0, + "37": 2681315328.0, + "38": 2681315328.0, + "39": 2681315328.0, + "40": 2681315328.0, + "41": 2681315328.0, + "42": 2681315328.0, + "43": 2681315328.0, + "44": 2681315328.0, + "45": 2681315328.0, + "46": 2681315328.0, + "47": 2681315328.0, + "48": 2681315328.0, + "49": 2681315328.0, + "50": 2681315328.0, + "51": 2681315328.0, + "52": 2681315328.0, + "53": 2681315328.0, + "54": 2681315328.0, + "55": 2681315328.0, + "56": 2681315328.0, + "57": 2681315328.0, + "58": 2681315328.0, + "59": 2681315328.0, + "60": 2681315328.0, + "61": 2681315328.0, + "62": 2681315328.0, + "63": 2681315328.0, + "64": 2681315328.0, + "65": 2681315328.0, + "66": 2681315328.0, + "67": 2681315328.0, + "68": 2681315328.0, + "69": 2681315328.0, + "70": 2681315328.0, + "71": 2681315328.0, + "72": 2681315328.0, + "73": 2681315328.0, + "74": 2681315328.0, + "75": 2681315328.0, + "76": 2681315328.0, + "77": 2681315328.0, + "78": 2681315328.0, + "79": 2681315328.0, + "80": 2681315328.0, + "81": 2681315328.0, + "82": 2681315328.0, + "83": 2681315328.0, + "84": 2681315328.0, + "85": 2681315328.0, + "86": 2681315328.0, + "87": 2681315328.0, + "88": 2681315328.0, + "89": 2681315328.0, + "90": 2681315328.0, + "91": 2681315328.0, + "92": 2681315328.0, + "93": 2681315328.0, + "94": 2681315328.0, + "95": 2681315328.0, + "96": 2681315328.0, + "97": 2681315328.0, + "98": 2681315328.0, + "99": 2681315328.0, + "100": 2681315328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.214, + "2": 0.2986, + "3": 0.17295, + "4": 0.16821, + "5": 0.16854, + "6": 0.16781, + "7": 0.16849, + "8": 0.16759, + "9": 0.16821, + "10": 0.16905, + "11": 0.16939, + "12": 0.16739, + "13": 0.16719, + "14": 0.16712, + "15": 0.16829, + "16": 0.1725, + "17": 0.16696, + "18": 0.16586, + "19": 0.16737, + "20": 0.16711, + "21": 0.16776, + "22": 0.16801, + "23": 0.16812, + "24": 0.16559, + "25": 0.16732, + "26": 0.16954, + "27": 0.16886, + "28": 0.1669, + "29": 0.16695, + "30": 0.16775, + "31": 0.16795, + "32": 0.16696, + "33": 0.16584, + "34": 0.16695, + "35": 0.16714, + "36": 0.16747, + "37": 0.16686, + "38": 0.16675, + "39": 0.16654, + "40": 0.18817, + "41": 0.16797, + "42": 0.16692, + "43": 0.16746, + "44": 0.16567, + "45": 0.1672, + "46": 0.1681, + "47": 0.16794, + "48": 0.17384, + "49": 0.17344, + "50": 0.17178, + "51": 0.17498, + "52": 0.16896, + "53": 0.2031, + "54": 0.16689, + "55": 0.16738, + "56": 0.1658, + "57": 0.16757, + "58": 0.16947, + "59": 0.16981, + "60": 0.16658, + "61": 0.16728, + "62": 0.16586, + "63": 0.16601, + "64": 0.16674, + "65": 0.16826, + "66": 0.16662, + "67": 0.16681, + "68": 0.1673, + "69": 0.16747, + "70": 0.16723, + "71": 0.16746, + "72": 0.16639, + "73": 0.16738, + "74": 0.16734, + "75": 0.16723, + "76": 0.16734, + "77": 0.16644, + "78": 0.16664, + "79": 0.16693, + "80": 0.16638, + "81": 0.16693, + "82": 0.16667, + "83": 0.1665, + "84": 0.16715, + "85": 0.16683, + "86": 0.16633, + "87": 0.16713, + "88": 0.16671, + "89": 0.16706, + "90": 0.16702, + "91": 0.16739, + "92": 0.16596, + "93": 0.1665, + "94": 0.16701, + "95": 0.16634, + "96": 0.16704, + "97": 0.16737, + "98": 0.16691, + "99": 0.16712, + "100": 0.16653 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..fba68f73b6e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.79205, + "2": 10.80272, + "3": 10.80707, + "4": 10.77315, + "5": 10.84695, + "6": 10.86789, + "7": 10.82655, + "8": 10.81333, + "9": 10.83441, + "10": 10.77106, + "11": 10.89149, + "12": 10.84617, + "13": 10.85969, + "14": 10.8812, + "15": 10.79093, + "16": 10.78328, + "17": 10.75926, + "18": 10.79337, + "19": 10.797, + "20": 10.68042, + "21": 10.66126, + "22": 10.50248, + "23": 10.71375, + "24": 10.55253, + "25": 10.50715, + "26": 10.58275, + "27": 10.58672, + "28": 10.55873, + "29": 10.56101, + "30": 10.33325, + "31": 10.08467, + "32": 10.44744, + "33": 10.44372, + "34": 10.2003, + "35": 10.25545, + "36": 10.19448, + "37": 10.32113, + "38": 10.1659, + "39": 10.37726, + "40": 10.05544, + "41": 10.13785, + "42": 10.19159, + "43": 9.80956, + "44": 9.92967, + "45": 9.80575, + "46": 9.81454, + "47": 10.12933, + "48": 9.82644, + "49": 9.51395, + "50": 9.89082, + "51": 9.8397, + "52": 9.73412, + "53": 10.05515, + "54": 9.94093, + "55": 9.87063, + "56": 9.61009, + "57": 9.46055, + "58": 9.81541, + "59": 9.57905, + "60": 9.48478, + "61": 9.68485, + "62": 9.97574, + "63": 9.36483, + "64": 9.76838, + "65": 8.94022, + "66": 9.68864, + "67": 9.36647, + "68": 9.77611, + "69": 9.78404, + "70": 9.72243, + "71": 9.6082, + "72": 9.57758, + "73": 9.48936, + "74": 8.9399, + "75": 9.40907, + "76": 9.08135, + "77": 10.05639, + "78": 9.72293, + "79": 9.36509, + "80": 9.3976, + "81": 9.47445, + "82": 9.68843, + "83": 9.30263, + "84": 9.4102, + "85": 9.60746, + "86": 9.07122, + "87": 9.58742, + "88": 9.74129, + "89": 9.59922, + "90": 9.81041, + "91": 9.33141, + "92": 9.35529, + "93": 9.07461, + "94": 8.82759, + "95": 9.5116, + "96": 9.51899, + "97": 9.30162, + "98": 9.66741, + "99": 8.88218, + "100": 9.39722 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1580.0, + "2": 1686.0, + "3": 1726.0, + "4": 1795.0, + "5": 1901.0, + "6": 1778.0, + "7": 1963.0, + "8": 1704.0, + "9": 1811.0, + "10": 1346.0, + "11": 1849.0, + "12": 1683.0, + "13": 1888.0, + "14": 1711.0, + "15": 1926.0, + "16": 1841.0, + "17": 1931.0, + "18": 1716.0, + "19": 1765.0, + "20": 1643.0, + "21": 1884.0, + "22": 1626.0, + "23": 1954.0, + "24": 1715.0, + "25": 1683.0, + "26": 1679.0, + "27": 1817.0, + "28": 2019.0, + "29": 1946.0, + "30": 1867.0, + "31": 1544.0, + "32": 1832.0, + "33": 2119.0, + "34": 1921.0, + "35": 2020.0, + "36": 1953.0, + "37": 2350.0, + "38": 2210.0, + "39": 2319.0, + "40": 2252.0, + "41": 2449.0, + "42": 2364.0, + "43": 2089.0, + "44": 2094.0, + "45": 2243.0, + "46": 2335.0, + "47": 2406.0, + "48": 2410.0, + "49": 2341.0, + "50": 2459.0, + "51": 2611.0, + "52": 2427.0, + "53": 2838.0, + "54": 2632.0, + "55": 2291.0, + "56": 2663.0, + "57": 2276.0, + "58": 2777.0, + "59": 2601.0, + "60": 2404.0, + "61": 2985.0, + "62": 2595.0, + "63": 2454.0, + "64": 3101.0, + "65": 2474.0, + "66": 3006.0, + "67": 2671.0, + "68": 2874.0, + "69": 2956.0, + "70": 3102.0, + "71": 2891.0, + "72": 2543.0, + "73": 2860.0, + "74": 1888.0, + "75": 2603.0, + "76": 2813.0, + "77": 3361.0, + "78": 3252.0, + "79": 3007.0, + "80": 3420.0, + "81": 3624.0, + "82": 3184.0, + "83": 2708.0, + "84": 3138.0, + "85": 3388.0, + "86": 2619.0, + "87": 3682.0, + "88": 3074.0, + "89": 3260.0, + "90": 2904.0, + "91": 2634.0, + "92": 3097.0, + "93": 2745.0, + "94": 3484.0, + "95": 3333.0, + "96": 3292.0, + "97": 3141.0, + "98": 3550.0, + "99": 3170.0, + "100": 3347.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 714736640.0, + "2": 714736640.0, + "3": 714736640.0, + "4": 714736640.0, + "5": 714736640.0, + "6": 714736640.0, + "7": 714736640.0, + "8": 714736640.0, + "9": 714736640.0, + "10": 714736640.0, + "11": 714736640.0, + "12": 714736640.0, + "13": 714736640.0, + "14": 714736640.0, + "15": 714736640.0, + "16": 714736640.0, + "17": 714736640.0, + "18": 714736640.0, + "19": 714736640.0, + "20": 714736640.0, + "21": 714736640.0, + "22": 714736640.0, + "23": 714736640.0, + "24": 714736640.0, + "25": 714736640.0, + "26": 714736640.0, + "27": 714736640.0, + "28": 714736640.0, + "29": 714736640.0, + "30": 714736640.0, + "31": 714736640.0, + "32": 714736640.0, + "33": 714736640.0, + "34": 714736640.0, + "35": 714736640.0, + "36": 714736640.0, + "37": 714736640.0, + "38": 714736640.0, + "39": 714736640.0, + "40": 714736640.0, + "41": 714736640.0, + "42": 714736640.0, + "43": 714736640.0, + "44": 714736640.0, + "45": 714736640.0, + "46": 714736640.0, + "47": 714736640.0, + "48": 714736640.0, + "49": 714736640.0, + "50": 714736640.0, + "51": 714736640.0, + "52": 714736640.0, + "53": 714736640.0, + "54": 714736640.0, + "55": 714736640.0, + "56": 714736640.0, + "57": 714736640.0, + "58": 714736640.0, + "59": 714736640.0, + "60": 714736640.0, + "61": 714736640.0, + "62": 714736640.0, + "63": 714736640.0, + "64": 714736640.0, + "65": 714736640.0, + "66": 714736640.0, + "67": 714736640.0, + "68": 714736640.0, + "69": 714736640.0, + "70": 714736640.0, + "71": 714736640.0, + "72": 714736640.0, + "73": 714736640.0, + "74": 714736640.0, + "75": 714736640.0, + "76": 714736640.0, + "77": 714736640.0, + "78": 714736640.0, + "79": 714736640.0, + "80": 714736640.0, + "81": 714736640.0, + "82": 714736640.0, + "83": 714736640.0, + "84": 714736640.0, + "85": 714736640.0, + "86": 714736640.0, + "87": 714736640.0, + "88": 714736640.0, + "89": 714736640.0, + "90": 714736640.0, + "91": 714736640.0, + "92": 714736640.0, + "93": 714736640.0, + "94": 714736640.0, + "95": 714736640.0, + "96": 714736640.0, + "97": 714736640.0, + "98": 714736640.0, + "99": 714736640.0, + "100": 714736640.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2399714304.0, + "2": 2681315328.0, + "3": 2681315328.0, + "4": 2681315328.0, + "5": 2681315328.0, + "6": 2681315328.0, + "7": 2681315328.0, + "8": 2681315328.0, + "9": 2681315328.0, + "10": 2681315328.0, + "11": 2681315328.0, + "12": 2681315328.0, + "13": 2681315328.0, + "14": 2681315328.0, + "15": 2681315328.0, + "16": 2681315328.0, + "17": 2681315328.0, + "18": 2681315328.0, + "19": 2681315328.0, + "20": 2681315328.0, + "21": 2681315328.0, + "22": 2681315328.0, + "23": 2681315328.0, + "24": 2681315328.0, + "25": 2681315328.0, + "26": 2681315328.0, + "27": 2681315328.0, + "28": 2681315328.0, + "29": 2681315328.0, + "30": 2681315328.0, + "31": 2681315328.0, + "32": 2681315328.0, + "33": 2681315328.0, + "34": 2681315328.0, + "35": 2681315328.0, + "36": 2681315328.0, + "37": 2681315328.0, + "38": 2681315328.0, + "39": 2681315328.0, + "40": 2681315328.0, + "41": 2681315328.0, + "42": 2681315328.0, + "43": 2681315328.0, + "44": 2681315328.0, + "45": 2681315328.0, + "46": 2681315328.0, + "47": 2681315328.0, + "48": 2681315328.0, + "49": 2681315328.0, + "50": 2681315328.0, + "51": 2681315328.0, + "52": 2681315328.0, + "53": 2681315328.0, + "54": 2681315328.0, + "55": 2681315328.0, + "56": 2681315328.0, + "57": 2681315328.0, + "58": 2681315328.0, + "59": 2681315328.0, + "60": 2681315328.0, + "61": 2681315328.0, + "62": 2681315328.0, + "63": 2681315328.0, + "64": 2681315328.0, + "65": 2681315328.0, + "66": 2681315328.0, + "67": 2681315328.0, + "68": 2681315328.0, + "69": 2681315328.0, + "70": 2681315328.0, + "71": 2681315328.0, + "72": 2681315328.0, + "73": 2681315328.0, + "74": 2681315328.0, + "75": 2681315328.0, + "76": 2681315328.0, + "77": 2681315328.0, + "78": 2681315328.0, + "79": 2681315328.0, + "80": 2681315328.0, + "81": 2681315328.0, + "82": 2681315328.0, + "83": 2681315328.0, + "84": 2681315328.0, + "85": 2681315328.0, + "86": 2681315328.0, + "87": 2681315328.0, + "88": 2681315328.0, + "89": 2681315328.0, + "90": 2681315328.0, + "91": 2681315328.0, + "92": 2681315328.0, + "93": 2681315328.0, + "94": 2681315328.0, + "95": 2681315328.0, + "96": 2681315328.0, + "97": 2681315328.0, + "98": 2681315328.0, + "99": 2681315328.0, + "100": 2681315328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.16871, + "2": 0.19825, + "3": 0.17764, + "4": 0.17796, + "5": 0.17192, + "6": 0.17224, + "7": 0.17188, + "8": 0.17172, + "9": 0.17327, + "10": 0.17337, + "11": 0.17262, + "12": 0.17206, + "13": 0.17211, + "14": 0.17318, + "15": 0.17218, + "16": 0.17375, + "17": 0.17267, + "18": 0.1736, + "19": 0.17211, + "20": 0.16903, + "21": 0.16941, + "22": 0.17049, + "23": 0.17119, + "24": 0.173, + "25": 0.16874, + "26": 0.16822, + "27": 0.16694, + "28": 0.16671, + "29": 0.16762, + "30": 0.16932, + "31": 0.17431, + "32": 0.16784, + "33": 0.16633, + "34": 0.16587, + "35": 0.16729, + "36": 0.16658, + "37": 0.16788, + "38": 0.1666, + "39": 0.16597, + "40": 0.16589, + "41": 0.16706, + "42": 0.16633, + "43": 0.16631, + "44": 0.16797, + "45": 0.16699, + "46": 0.16824, + "47": 0.167, + "48": 0.16653, + "49": 0.16587, + "50": 0.16635, + "51": 0.18233, + "52": 0.21141, + "53": 0.16986, + "54": 0.1702, + "55": 0.16952, + "56": 0.16978, + "57": 0.16872, + "58": 0.16891, + "59": 0.17005, + "60": 0.16948, + "61": 0.16922, + "62": 0.16913, + "63": 0.1694, + "64": 0.16954, + "65": 0.16972, + "66": 0.16677, + "67": 0.16621, + "68": 0.16658, + "69": 0.16617, + "70": 0.1656, + "71": 0.16718, + "72": 0.16666, + "73": 0.16987, + "74": 0.17045, + "75": 0.16726, + "76": 0.1671, + "77": 0.16753, + "78": 0.17072, + "79": 0.16826, + "80": 0.16784, + "81": 0.16717, + "82": 0.16591, + "83": 0.16729, + "84": 0.16631, + "85": 0.16697, + "86": 0.1677, + "87": 0.16577, + "88": 0.1676, + "89": 0.16708, + "90": 0.16577, + "91": 0.16637, + "92": 0.16659, + "93": 0.16604, + "94": 0.16681, + "95": 0.16705, + "96": 0.16588, + "97": 0.16674, + "98": 0.16703, + "99": 0.16605, + "100": 0.16691 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 0e382b4ce7b..732eb3335b2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.76985, + "2": 10.81791, + "3": 10.784, + "4": 10.788, "5": 10.81927, + "6": 10.84306, + "7": 10.83464, + "8": 10.8066, + "9": 10.83359, "10": 10.73562, + "11": 10.86814, + "12": 10.85075, + "13": 10.84505, + "14": 10.87136, "15": 10.8218, + "16": 10.80433, + "17": 10.76124, + "18": 10.80363, + "19": 10.80599, "20": 10.74747, + "21": 10.7254, + "22": 10.60597, + "23": 10.74387, + "24": 10.65549, "25": 10.58002, + "26": 10.64496, + "27": 10.67191, + "28": 10.66903, + "29": 10.66652, "30": 10.46947, + "31": 10.26264, + "32": 10.56932, + "33": 10.54232, + "34": 10.36113, "35": 10.39558, + "36": 10.36866, + "37": 10.47523, + "38": 10.33715, + "39": 10.49947, "40": 10.23019, + "41": 10.30905, + "42": 10.33124, + "43": 9.99091, + "44": 10.09605, "45": 10.00787, + "46": 9.96718, + "47": 10.27077, + "48": 10.01043, + "49": 9.73437, "50": 10.04737, + "51": 10.00084, + "52": 9.89672, + "53": 10.19876, + "54": 10.09066, "55": 10.00567, + "56": 9.77199, + "57": 9.64533, + "58": 9.98587, + "59": 9.72608, "60": 9.6777, + "61": 9.8157, + "62": 10.092, + "63": 9.54758, + "64": 9.90438, "65": 9.09492, + "66": 9.84068, + "67": 9.48471, + "68": 9.88996, + "69": 9.87691, "70": 9.85294, + "71": 9.73278, + "72": 9.72558, + "73": 9.63706, + "74": 9.12334, "75": 9.55335, + "76": 9.21765, + "77": 10.15202, + "78": 9.81465, + "79": 9.47558, "80": 9.52073, + "81": 9.5872, + "82": 9.79125, + "83": 9.44848, + "84": 9.49585, "85": 9.72189, + "86": 9.18037, + "87": 9.66127, + "88": 9.84359, + "89": 9.71651, "90": 9.88102, + "91": 9.48434, + "92": 9.4705, + "93": 9.20911, + "94": 8.95382, "95": 9.60554, + "96": 9.63976, + "97": 9.38762, + "98": 9.7573, + "99": 9.0159, "100": 9.49925 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2680.0, + "2": 2615.0, + "3": 2642.0, + "4": 2479.0, "5": 2971.0, + "6": 2822.0, + "7": 2833.0, + "8": 2508.0, + "9": 2922.0, "10": 2508.0, + "11": 2917.0, + "12": 2817.0, + "13": 2935.0, + "14": 2969.0, "15": 2679.0, + "16": 2976.0, + "17": 2609.0, + "18": 2868.0, + "19": 2790.0, "20": 2461.0, + "21": 2636.0, + "22": 2356.0, + "23": 2798.0, + "24": 2613.0, "25": 2640.0, + "26": 2701.0, + "27": 2761.0, + "28": 2801.0, + "29": 2971.0, "30": 2590.0, + "31": 2307.0, + "32": 2751.0, + "33": 2881.0, + "34": 2352.0, "35": 2480.0, + "36": 2443.0, + "37": 2748.0, + "38": 2692.0, + "39": 2709.0, "40": 2570.0, + "41": 2752.0, + "42": 2689.0, + "43": 2381.0, + "44": 2483.0, "45": 2397.0, + "46": 2281.0, + "47": 2684.0, + "48": 2330.0, + "49": 2293.0, "50": 2740.0, + "51": 2575.0, + "52": 2621.0, + "53": 2891.0, + "54": 2655.0, "55": 2559.0, + "56": 2566.0, + "57": 2471.0, + "58": 2767.0, + "59": 2529.0, "60": 2289.0, + "61": 2642.0, + "62": 2820.0, + "63": 2654.0, + "64": 3020.0, "65": 2687.0, + "66": 2884.0, + "67": 2666.0, + "68": 2720.0, + "69": 2738.0, "70": 3004.0, + "71": 2816.0, + "72": 2537.0, + "73": 2826.0, + "74": 2192.0, "75": 2647.0, + "76": 3048.0, + "77": 3019.0, + "78": 3134.0, + "79": 3092.0, "80": 3054.0, + "81": 3298.0, + "82": 3350.0, + "83": 2597.0, + "84": 3436.0, "85": 3350.0, + "86": 2993.0, + "87": 3509.0, + "88": 3403.0, + "89": 3490.0, "90": 3368.0, + "91": 2461.0, + "92": 2803.0, + "93": 2933.0, + "94": 2888.0, "95": 3138.0, + "96": 3047.0, + "97": 3016.0, + "98": 3382.0, + "99": 2995.0, "100": 2490.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 745731584.0, + "2": 745731584.0, + "3": 745731584.0, + "4": 745731584.0, "5": 745731584.0, + "6": 745731584.0, + "7": 745731584.0, + "8": 745731584.0, + "9": 745731584.0, "10": 745731584.0, + "11": 745731584.0, + "12": 745731584.0, + "13": 745731584.0, + "14": 745731584.0, "15": 745731584.0, + "16": 745731584.0, + "17": 745731584.0, + "18": 745731584.0, + "19": 745731584.0, "20": 745731584.0, + "21": 745731584.0, + "22": 745731584.0, + "23": 745731584.0, + "24": 745731584.0, "25": 745731584.0, + "26": 745731584.0, + "27": 745731584.0, + "28": 745731584.0, + "29": 745731584.0, "30": 745731584.0, + "31": 745731584.0, + "32": 745731584.0, + "33": 745731584.0, + "34": 745731584.0, "35": 745731584.0, + "36": 745731584.0, + "37": 745731584.0, + "38": 745731584.0, + "39": 745731584.0, "40": 745731584.0, + "41": 745731584.0, + "42": 745731584.0, + "43": 745731584.0, + "44": 745731584.0, "45": 745731584.0, + "46": 745731584.0, + "47": 745731584.0, + "48": 745731584.0, + "49": 745731584.0, "50": 745731584.0, + "51": 745731584.0, + "52": 745731584.0, + "53": 745731584.0, + "54": 745731584.0, "55": 745731584.0, + "56": 745731584.0, + "57": 745731584.0, + "58": 745731584.0, + "59": 745731584.0, "60": 745731584.0, + "61": 745731584.0, + "62": 745731584.0, + "63": 745731584.0, + "64": 745731584.0, "65": 745731584.0, + "66": 745731584.0, + "67": 745731584.0, + "68": 745731584.0, + "69": 745731584.0, "70": 745731584.0, + "71": 745731584.0, + "72": 745731584.0, + "73": 745731584.0, + "74": 745731584.0, "75": 745731584.0, + "76": 745731584.0, + "77": 745731584.0, + "78": 745731584.0, + "79": 745731584.0, "80": 745731584.0, + "81": 745731584.0, + "82": 745731584.0, + "83": 745731584.0, + "84": 745731584.0, "85": 745731584.0, + "86": 745731584.0, + "87": 745731584.0, + "88": 745731584.0, + "89": 745731584.0, "90": 745731584.0, + "91": 745731584.0, + "92": 745731584.0, + "93": 745731584.0, + "94": 745731584.0, "95": 745731584.0, + "96": 745731584.0, + "97": 745731584.0, + "98": 745731584.0, + "99": 745731584.0, "100": 745731584.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1928906752.0, + "2": 2210568192.0, + "3": 2210568192.0, + "4": 2210568192.0, "5": 2210568192.0, + "6": 2210568192.0, + "7": 2210568192.0, + "8": 2210568192.0, + "9": 2210568192.0, "10": 2210568192.0, + "11": 2210568192.0, + "12": 2210568192.0, + "13": 2210568192.0, + "14": 2210568192.0, "15": 2210568192.0, + "16": 2210568192.0, + "17": 2210568192.0, + "18": 2210568192.0, + "19": 2210568192.0, "20": 2210568192.0, + "21": 2210568192.0, + "22": 2210568192.0, + "23": 2210568192.0, + "24": 2210568192.0, "25": 2210568192.0, + "26": 2210568192.0, + "27": 2210568192.0, + "28": 2210568192.0, + "29": 2210568192.0, "30": 2210568192.0, + "31": 2210568192.0, + "32": 2210568192.0, + "33": 2210568192.0, + "34": 2210568192.0, "35": 2210568192.0, + "36": 2210568192.0, + "37": 2210568192.0, + "38": 2210568192.0, + "39": 2210568192.0, "40": 2210568192.0, + "41": 2210568192.0, + "42": 2210568192.0, + "43": 2210568192.0, + "44": 2210568192.0, "45": 2210568192.0, + "46": 2210568192.0, + "47": 2210568192.0, + "48": 2210568192.0, + "49": 2210568192.0, "50": 2210568192.0, + "51": 2210568192.0, + "52": 2210568192.0, + "53": 2210568192.0, + "54": 2210568192.0, "55": 2210568192.0, + "56": 2210568192.0, + "57": 2210568192.0, + "58": 2210568192.0, + "59": 2210568192.0, "60": 2210568192.0, + "61": 2210568192.0, + "62": 2210568192.0, + "63": 2210568192.0, + "64": 2210568192.0, "65": 2210568192.0, + "66": 2210568192.0, + "67": 2210568192.0, + "68": 2210568192.0, + "69": 2210568192.0, "70": 2210568192.0, + "71": 2210568192.0, + "72": 2210568192.0, + "73": 2210568192.0, + "74": 2210568192.0, "75": 2210568192.0, + "76": 2210568192.0, + "77": 2210568192.0, + "78": 2210568192.0, + "79": 2210568192.0, "80": 2210568192.0, + "81": 2210568192.0, + "82": 2210568192.0, + "83": 2210568192.0, + "84": 2210568192.0, "85": 2210568192.0, + "86": 2210568192.0, + "87": 2210568192.0, + "88": 2210568192.0, + "89": 2210568192.0, "90": 2210568192.0, + "91": 2210568192.0, + "92": 2210568192.0, + "93": 2210568192.0, + "94": 2210568192.0, "95": 2210568192.0, + "96": 2210568192.0, + "97": 2210568192.0, + "98": 2210568192.0, + "99": 2210568192.0, "100": 2210568192.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 13.90495, - "5": 0.1093, - "10": 0.10381, - "15": 0.10282, - "20": 0.10222, - "25": 0.10608, - "30": 0.10579, - "35": 0.113, - "40": 0.10704, - "45": 0.10527, - "50": 0.1051, - "55": 0.10242, - "60": 0.10626, - "65": 0.10167, - "70": 0.10143, - "75": 0.10116, - "80": 0.10173, - "85": 0.10268, - "90": 0.10062, - "95": 0.10308, - "100": 0.10193 + "1": 15.52736, + "2": 0.14752, + "3": 0.12429, + "4": 0.12037, + "5": 0.12096, + "6": 0.11965, + "7": 0.1198, + "8": 0.12021, + "9": 0.12041, + "10": 0.12377, + "11": 0.11828, + "12": 0.11903, + "13": 0.12052, + "14": 0.11683, + "15": 0.1179, + "16": 0.1185, + "17": 0.1178, + "18": 0.12085, + "19": 0.11844, + "20": 0.11779, + "21": 0.11689, + "22": 0.11623, + "23": 0.11674, + "24": 0.11908, + "25": 0.11762, + "26": 0.11952, + "27": 0.11831, + "28": 0.11712, + "29": 0.11898, + "30": 0.11914, + "31": 0.11719, + "32": 0.11849, + "33": 0.1193, + "34": 0.11601, + "35": 0.1215, + "36": 0.11653, + "37": 0.11596, + "38": 0.11751, + "39": 0.1194, + "40": 0.11662, + "41": 0.11896, + "42": 0.11624, + "43": 0.11775, + "44": 0.11757, + "45": 0.11618, + "46": 0.1194, + "47": 0.11754, + "48": 0.11775, + "49": 0.11637, + "50": 0.11524, + "51": 0.14043, + "52": 0.12567, + "53": 0.12158, + "54": 0.1217, + "55": 0.15002, + "56": 0.11858, + "57": 0.11887, + "58": 0.11705, + "59": 0.11599, + "60": 0.11585, + "61": 0.11429, + "62": 0.11598, + "63": 0.116, + "64": 0.11878, + "65": 0.11921, + "66": 0.11734, + "67": 0.11708, + "68": 0.11543, + "69": 0.11703, + "70": 0.11514, + "71": 0.1178, + "72": 0.1154, + "73": 0.12116, + "74": 0.12077, + "75": 0.1166, + "76": 0.11599, + "77": 0.11628, + "78": 0.11749, + "79": 0.11828, + "80": 0.12013, + "81": 0.11887, + "82": 0.1195, + "83": 0.11685, + "84": 0.11603, + "85": 0.11434, + "86": 0.11762, + "87": 0.11821, + "88": 0.12276, + "89": 0.12384, + "90": 0.11892, + "91": 0.11831, + "92": 0.11619, + "93": 0.11613, + "94": 0.11455, + "95": 0.1172, + "96": 0.11583, + "97": 0.11939, + "98": 0.11877, + "99": 0.11703, + "100": 0.12143 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..5147f8fd670 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.76985, + "2": 10.81791, + "3": 10.784, + "4": 10.788, + "5": 10.81927, + "6": 10.84306, + "7": 10.83464, + "8": 10.8066, + "9": 10.83359, + "10": 10.73562, + "11": 10.86814, + "12": 10.85075, + "13": 10.84505, + "14": 10.87136, + "15": 10.8218, + "16": 10.80433, + "17": 10.76124, + "18": 10.80363, + "19": 10.80599, + "20": 10.74747, + "21": 10.7254, + "22": 10.60597, + "23": 10.74387, + "24": 10.65549, + "25": 10.58002, + "26": 10.64496, + "27": 10.67191, + "28": 10.66903, + "29": 10.66652, + "30": 10.46947, + "31": 10.26264, + "32": 10.56932, + "33": 10.54232, + "34": 10.36113, + "35": 10.39558, + "36": 10.36866, + "37": 10.47523, + "38": 10.33715, + "39": 10.49947, + "40": 10.23019, + "41": 10.30905, + "42": 10.33124, + "43": 9.99091, + "44": 10.09605, + "45": 10.00787, + "46": 9.96718, + "47": 10.27077, + "48": 10.01043, + "49": 9.73437, + "50": 10.04737, + "51": 10.00084, + "52": 9.89672, + "53": 10.19876, + "54": 10.09066, + "55": 10.00567, + "56": 9.77199, + "57": 9.64533, + "58": 9.98587, + "59": 9.72608, + "60": 9.6777, + "61": 9.8157, + "62": 10.092, + "63": 9.54758, + "64": 9.90438, + "65": 9.09492, + "66": 9.84068, + "67": 9.48471, + "68": 9.88996, + "69": 9.87691, + "70": 9.85294, + "71": 9.73278, + "72": 9.72558, + "73": 9.63706, + "74": 9.12334, + "75": 9.55335, + "76": 9.21765, + "77": 10.15202, + "78": 9.81465, + "79": 9.47558, + "80": 9.52073, + "81": 9.5872, + "82": 9.79125, + "83": 9.44848, + "84": 9.49585, + "85": 9.72189, + "86": 9.18037, + "87": 9.66127, + "88": 9.84359, + "89": 9.71651, + "90": 9.88102, + "91": 9.48434, + "92": 9.4705, + "93": 9.20911, + "94": 8.95382, + "95": 9.60554, + "96": 9.63976, + "97": 9.38762, + "98": 9.7573, + "99": 9.0159, + "100": 9.49925 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2680.0, + "2": 2615.0, + "3": 2642.0, + "4": 2479.0, + "5": 2971.0, + "6": 2822.0, + "7": 2833.0, + "8": 2508.0, + "9": 2922.0, + "10": 2508.0, + "11": 2917.0, + "12": 2817.0, + "13": 2935.0, + "14": 2969.0, + "15": 2679.0, + "16": 2976.0, + "17": 2609.0, + "18": 2868.0, + "19": 2790.0, + "20": 2461.0, + "21": 2636.0, + "22": 2356.0, + "23": 2798.0, + "24": 2613.0, + "25": 2640.0, + "26": 2701.0, + "27": 2761.0, + "28": 2801.0, + "29": 2971.0, + "30": 2590.0, + "31": 2307.0, + "32": 2751.0, + "33": 2881.0, + "34": 2352.0, + "35": 2480.0, + "36": 2443.0, + "37": 2748.0, + "38": 2692.0, + "39": 2709.0, + "40": 2570.0, + "41": 2752.0, + "42": 2689.0, + "43": 2381.0, + "44": 2483.0, + "45": 2397.0, + "46": 2281.0, + "47": 2684.0, + "48": 2330.0, + "49": 2293.0, + "50": 2740.0, + "51": 2575.0, + "52": 2621.0, + "53": 2891.0, + "54": 2655.0, + "55": 2559.0, + "56": 2566.0, + "57": 2471.0, + "58": 2767.0, + "59": 2529.0, + "60": 2289.0, + "61": 2642.0, + "62": 2820.0, + "63": 2654.0, + "64": 3020.0, + "65": 2687.0, + "66": 2884.0, + "67": 2666.0, + "68": 2720.0, + "69": 2738.0, + "70": 3004.0, + "71": 2816.0, + "72": 2537.0, + "73": 2826.0, + "74": 2192.0, + "75": 2647.0, + "76": 3048.0, + "77": 3019.0, + "78": 3134.0, + "79": 3092.0, + "80": 3054.0, + "81": 3298.0, + "82": 3350.0, + "83": 2597.0, + "84": 3436.0, + "85": 3350.0, + "86": 2993.0, + "87": 3509.0, + "88": 3403.0, + "89": 3490.0, + "90": 3368.0, + "91": 2461.0, + "92": 2803.0, + "93": 2933.0, + "94": 2888.0, + "95": 3138.0, + "96": 3047.0, + "97": 3016.0, + "98": 3382.0, + "99": 2995.0, + "100": 2490.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 745731584.0, + "2": 745731584.0, + "3": 745731584.0, + "4": 745731584.0, + "5": 745731584.0, + "6": 745731584.0, + "7": 745731584.0, + "8": 745731584.0, + "9": 745731584.0, + "10": 745731584.0, + "11": 745731584.0, + "12": 745731584.0, + "13": 745731584.0, + "14": 745731584.0, + "15": 745731584.0, + "16": 745731584.0, + "17": 745731584.0, + "18": 745731584.0, + "19": 745731584.0, + "20": 745731584.0, + "21": 745731584.0, + "22": 745731584.0, + "23": 745731584.0, + "24": 745731584.0, + "25": 745731584.0, + "26": 745731584.0, + "27": 745731584.0, + "28": 745731584.0, + "29": 745731584.0, + "30": 745731584.0, + "31": 745731584.0, + "32": 745731584.0, + "33": 745731584.0, + "34": 745731584.0, + "35": 745731584.0, + "36": 745731584.0, + "37": 745731584.0, + "38": 745731584.0, + "39": 745731584.0, + "40": 745731584.0, + "41": 745731584.0, + "42": 745731584.0, + "43": 745731584.0, + "44": 745731584.0, + "45": 745731584.0, + "46": 745731584.0, + "47": 745731584.0, + "48": 745731584.0, + "49": 745731584.0, + "50": 745731584.0, + "51": 745731584.0, + "52": 745731584.0, + "53": 745731584.0, + "54": 745731584.0, + "55": 745731584.0, + "56": 745731584.0, + "57": 745731584.0, + "58": 745731584.0, + "59": 745731584.0, + "60": 745731584.0, + "61": 745731584.0, + "62": 745731584.0, + "63": 745731584.0, + "64": 745731584.0, + "65": 745731584.0, + "66": 745731584.0, + "67": 745731584.0, + "68": 745731584.0, + "69": 745731584.0, + "70": 745731584.0, + "71": 745731584.0, + "72": 745731584.0, + "73": 745731584.0, + "74": 745731584.0, + "75": 745731584.0, + "76": 745731584.0, + "77": 745731584.0, + "78": 745731584.0, + "79": 745731584.0, + "80": 745731584.0, + "81": 745731584.0, + "82": 745731584.0, + "83": 745731584.0, + "84": 745731584.0, + "85": 745731584.0, + "86": 745731584.0, + "87": 745731584.0, + "88": 745731584.0, + "89": 745731584.0, + "90": 745731584.0, + "91": 745731584.0, + "92": 745731584.0, + "93": 745731584.0, + "94": 745731584.0, + "95": 745731584.0, + "96": 745731584.0, + "97": 745731584.0, + "98": 745731584.0, + "99": 745731584.0, + "100": 745731584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1928906752.0, + "2": 2210568192.0, + "3": 2210568192.0, + "4": 2210568192.0, + "5": 2210568192.0, + "6": 2210568192.0, + "7": 2210568192.0, + "8": 2210568192.0, + "9": 2210568192.0, + "10": 2210568192.0, + "11": 2210568192.0, + "12": 2210568192.0, + "13": 2210568192.0, + "14": 2210568192.0, + "15": 2210568192.0, + "16": 2210568192.0, + "17": 2210568192.0, + "18": 2210568192.0, + "19": 2210568192.0, + "20": 2210568192.0, + "21": 2210568192.0, + "22": 2210568192.0, + "23": 2210568192.0, + "24": 2210568192.0, + "25": 2210568192.0, + "26": 2210568192.0, + "27": 2210568192.0, + "28": 2210568192.0, + "29": 2210568192.0, + "30": 2210568192.0, + "31": 2210568192.0, + "32": 2210568192.0, + "33": 2210568192.0, + "34": 2210568192.0, + "35": 2210568192.0, + "36": 2210568192.0, + "37": 2210568192.0, + "38": 2210568192.0, + "39": 2210568192.0, + "40": 2210568192.0, + "41": 2210568192.0, + "42": 2210568192.0, + "43": 2210568192.0, + "44": 2210568192.0, + "45": 2210568192.0, + "46": 2210568192.0, + "47": 2210568192.0, + "48": 2210568192.0, + "49": 2210568192.0, + "50": 2210568192.0, + "51": 2210568192.0, + "52": 2210568192.0, + "53": 2210568192.0, + "54": 2210568192.0, + "55": 2210568192.0, + "56": 2210568192.0, + "57": 2210568192.0, + "58": 2210568192.0, + "59": 2210568192.0, + "60": 2210568192.0, + "61": 2210568192.0, + "62": 2210568192.0, + "63": 2210568192.0, + "64": 2210568192.0, + "65": 2210568192.0, + "66": 2210568192.0, + "67": 2210568192.0, + "68": 2210568192.0, + "69": 2210568192.0, + "70": 2210568192.0, + "71": 2210568192.0, + "72": 2210568192.0, + "73": 2210568192.0, + "74": 2210568192.0, + "75": 2210568192.0, + "76": 2210568192.0, + "77": 2210568192.0, + "78": 2210568192.0, + "79": 2210568192.0, + "80": 2210568192.0, + "81": 2210568192.0, + "82": 2210568192.0, + "83": 2210568192.0, + "84": 2210568192.0, + "85": 2210568192.0, + "86": 2210568192.0, + "87": 2210568192.0, + "88": 2210568192.0, + "89": 2210568192.0, + "90": 2210568192.0, + "91": 2210568192.0, + "92": 2210568192.0, + "93": 2210568192.0, + "94": 2210568192.0, + "95": 2210568192.0, + "96": 2210568192.0, + "97": 2210568192.0, + "98": 2210568192.0, + "99": 2210568192.0, + "100": 2210568192.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.07236, + "2": 0.1439, + "3": 0.10617, + "4": 0.10423, + "5": 0.10661, + "6": 0.10547, + "7": 0.10337, + "8": 0.10254, + "9": 0.10285, + "10": 0.10538, + "11": 0.10211, + "12": 0.10209, + "13": 0.10172, + "14": 0.10352, + "15": 0.10417, + "16": 0.10185, + "17": 0.10199, + "18": 0.10179, + "19": 0.10297, + "20": 0.1054, + "21": 0.1025, + "22": 0.10172, + "23": 0.10344, + "24": 0.10371, + "25": 0.10166, + "26": 0.10183, + "27": 0.10449, + "28": 0.10545, + "29": 0.10167, + "30": 0.10337, + "31": 0.10277, + "32": 0.10385, + "33": 0.10255, + "34": 0.10441, + "35": 0.10202, + "36": 0.10215, + "37": 0.10277, + "38": 0.10448, + "39": 0.10501, + "40": 0.10325, + "41": 0.1085, + "42": 0.10236, + "43": 0.10413, + "44": 0.106, + "45": 0.10424, + "46": 0.10394, + "47": 0.1034, + "48": 0.10504, + "49": 0.10449, + "50": 0.10267, + "51": 0.12806, + "52": 0.11548, + "53": 0.11073, + "54": 0.1334, + "55": 0.10772, + "56": 0.11009, + "57": 0.10972, + "58": 0.1102, + "59": 0.11446, + "60": 0.11073, + "61": 0.10863, + "62": 0.10838, + "63": 0.10921, + "64": 0.10822, + "65": 0.11173, + "66": 0.1072, + "67": 0.10938, + "68": 0.1065, + "69": 0.10824, + "70": 0.10675, + "71": 0.10695, + "72": 0.10752, + "73": 0.10679, + "74": 0.10848, + "75": 0.1071, + "76": 0.10649, + "77": 0.1042, + "78": 0.10173, + "79": 0.10326, + "80": 0.10215, + "81": 0.10267, + "82": 0.10344, + "83": 0.10345, + "84": 0.10379, + "85": 0.10264, + "86": 0.1045, + "87": 0.10535, + "88": 0.10336, + "89": 0.1083, + "90": 0.10383, + "91": 0.10217, + "92": 0.10152, + "93": 0.10202, + "94": 0.10212, + "95": 0.10185, + "96": 0.10273, + "97": 0.10301, + "98": 0.10313, + "99": 0.10255, + "100": 0.1027 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..245c396be68 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.76985, + "2": 10.81791, + "3": 10.784, + "4": 10.788, + "5": 10.81927, + "6": 10.84306, + "7": 10.83464, + "8": 10.8066, + "9": 10.83359, + "10": 10.73562, + "11": 10.86814, + "12": 10.85075, + "13": 10.84505, + "14": 10.87136, + "15": 10.8218, + "16": 10.80433, + "17": 10.76124, + "18": 10.80363, + "19": 10.80599, + "20": 10.74747, + "21": 10.7254, + "22": 10.60597, + "23": 10.74387, + "24": 10.65549, + "25": 10.58002, + "26": 10.64496, + "27": 10.67191, + "28": 10.66903, + "29": 10.66652, + "30": 10.46947, + "31": 10.26264, + "32": 10.56932, + "33": 10.54232, + "34": 10.36113, + "35": 10.39558, + "36": 10.36866, + "37": 10.47523, + "38": 10.33715, + "39": 10.49947, + "40": 10.23019, + "41": 10.30905, + "42": 10.33124, + "43": 9.99091, + "44": 10.09605, + "45": 10.00787, + "46": 9.96718, + "47": 10.27077, + "48": 10.01043, + "49": 9.73437, + "50": 10.04737, + "51": 10.00084, + "52": 9.89672, + "53": 10.19876, + "54": 10.09066, + "55": 10.00567, + "56": 9.77199, + "57": 9.64533, + "58": 9.98587, + "59": 9.72608, + "60": 9.6777, + "61": 9.8157, + "62": 10.092, + "63": 9.54758, + "64": 9.90438, + "65": 9.09492, + "66": 9.84068, + "67": 9.48471, + "68": 9.88996, + "69": 9.87691, + "70": 9.85294, + "71": 9.73278, + "72": 9.72558, + "73": 9.63706, + "74": 9.12334, + "75": 9.55335, + "76": 9.21765, + "77": 10.15202, + "78": 9.81465, + "79": 9.47558, + "80": 9.52073, + "81": 9.5872, + "82": 9.79125, + "83": 9.44848, + "84": 9.49585, + "85": 9.72189, + "86": 9.18037, + "87": 9.66127, + "88": 9.84359, + "89": 9.71651, + "90": 9.88102, + "91": 9.48434, + "92": 9.4705, + "93": 9.20911, + "94": 8.95382, + "95": 9.60554, + "96": 9.63976, + "97": 9.38762, + "98": 9.7573, + "99": 9.0159, + "100": 9.49925 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2680.0, + "2": 2615.0, + "3": 2642.0, + "4": 2479.0, + "5": 2971.0, + "6": 2822.0, + "7": 2833.0, + "8": 2508.0, + "9": 2922.0, + "10": 2508.0, + "11": 2917.0, + "12": 2817.0, + "13": 2935.0, + "14": 2969.0, + "15": 2679.0, + "16": 2976.0, + "17": 2609.0, + "18": 2868.0, + "19": 2790.0, + "20": 2461.0, + "21": 2636.0, + "22": 2356.0, + "23": 2798.0, + "24": 2613.0, + "25": 2640.0, + "26": 2701.0, + "27": 2761.0, + "28": 2801.0, + "29": 2971.0, + "30": 2590.0, + "31": 2307.0, + "32": 2751.0, + "33": 2881.0, + "34": 2352.0, + "35": 2480.0, + "36": 2443.0, + "37": 2748.0, + "38": 2692.0, + "39": 2709.0, + "40": 2570.0, + "41": 2752.0, + "42": 2689.0, + "43": 2381.0, + "44": 2483.0, + "45": 2397.0, + "46": 2281.0, + "47": 2684.0, + "48": 2330.0, + "49": 2293.0, + "50": 2740.0, + "51": 2575.0, + "52": 2621.0, + "53": 2891.0, + "54": 2655.0, + "55": 2559.0, + "56": 2566.0, + "57": 2471.0, + "58": 2767.0, + "59": 2529.0, + "60": 2289.0, + "61": 2642.0, + "62": 2820.0, + "63": 2654.0, + "64": 3020.0, + "65": 2687.0, + "66": 2884.0, + "67": 2666.0, + "68": 2720.0, + "69": 2738.0, + "70": 3004.0, + "71": 2816.0, + "72": 2537.0, + "73": 2826.0, + "74": 2192.0, + "75": 2647.0, + "76": 3048.0, + "77": 3019.0, + "78": 3134.0, + "79": 3092.0, + "80": 3054.0, + "81": 3298.0, + "82": 3350.0, + "83": 2597.0, + "84": 3436.0, + "85": 3350.0, + "86": 2993.0, + "87": 3509.0, + "88": 3403.0, + "89": 3490.0, + "90": 3368.0, + "91": 2461.0, + "92": 2803.0, + "93": 2933.0, + "94": 2888.0, + "95": 3138.0, + "96": 3047.0, + "97": 3016.0, + "98": 3382.0, + "99": 2995.0, + "100": 2490.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 745731584.0, + "2": 745731584.0, + "3": 745731584.0, + "4": 745731584.0, + "5": 745731584.0, + "6": 745731584.0, + "7": 745731584.0, + "8": 745731584.0, + "9": 745731584.0, + "10": 745731584.0, + "11": 745731584.0, + "12": 745731584.0, + "13": 745731584.0, + "14": 745731584.0, + "15": 745731584.0, + "16": 745731584.0, + "17": 745731584.0, + "18": 745731584.0, + "19": 745731584.0, + "20": 745731584.0, + "21": 745731584.0, + "22": 745731584.0, + "23": 745731584.0, + "24": 745731584.0, + "25": 745731584.0, + "26": 745731584.0, + "27": 745731584.0, + "28": 745731584.0, + "29": 745731584.0, + "30": 745731584.0, + "31": 745731584.0, + "32": 745731584.0, + "33": 745731584.0, + "34": 745731584.0, + "35": 745731584.0, + "36": 745731584.0, + "37": 745731584.0, + "38": 745731584.0, + "39": 745731584.0, + "40": 745731584.0, + "41": 745731584.0, + "42": 745731584.0, + "43": 745731584.0, + "44": 745731584.0, + "45": 745731584.0, + "46": 745731584.0, + "47": 745731584.0, + "48": 745731584.0, + "49": 745731584.0, + "50": 745731584.0, + "51": 745731584.0, + "52": 745731584.0, + "53": 745731584.0, + "54": 745731584.0, + "55": 745731584.0, + "56": 745731584.0, + "57": 745731584.0, + "58": 745731584.0, + "59": 745731584.0, + "60": 745731584.0, + "61": 745731584.0, + "62": 745731584.0, + "63": 745731584.0, + "64": 745731584.0, + "65": 745731584.0, + "66": 745731584.0, + "67": 745731584.0, + "68": 745731584.0, + "69": 745731584.0, + "70": 745731584.0, + "71": 745731584.0, + "72": 745731584.0, + "73": 745731584.0, + "74": 745731584.0, + "75": 745731584.0, + "76": 745731584.0, + "77": 745731584.0, + "78": 745731584.0, + "79": 745731584.0, + "80": 745731584.0, + "81": 745731584.0, + "82": 745731584.0, + "83": 745731584.0, + "84": 745731584.0, + "85": 745731584.0, + "86": 745731584.0, + "87": 745731584.0, + "88": 745731584.0, + "89": 745731584.0, + "90": 745731584.0, + "91": 745731584.0, + "92": 745731584.0, + "93": 745731584.0, + "94": 745731584.0, + "95": 745731584.0, + "96": 745731584.0, + "97": 745731584.0, + "98": 745731584.0, + "99": 745731584.0, + "100": 745731584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1928906752.0, + "2": 2210568192.0, + "3": 2210568192.0, + "4": 2210568192.0, + "5": 2210568192.0, + "6": 2210568192.0, + "7": 2210568192.0, + "8": 2210568192.0, + "9": 2210568192.0, + "10": 2210568192.0, + "11": 2210568192.0, + "12": 2210568192.0, + "13": 2210568192.0, + "14": 2210568192.0, + "15": 2210568192.0, + "16": 2210568192.0, + "17": 2210568192.0, + "18": 2210568192.0, + "19": 2210568192.0, + "20": 2210568192.0, + "21": 2210568192.0, + "22": 2210568192.0, + "23": 2210568192.0, + "24": 2210568192.0, + "25": 2210568192.0, + "26": 2210568192.0, + "27": 2210568192.0, + "28": 2210568192.0, + "29": 2210568192.0, + "30": 2210568192.0, + "31": 2210568192.0, + "32": 2210568192.0, + "33": 2210568192.0, + "34": 2210568192.0, + "35": 2210568192.0, + "36": 2210568192.0, + "37": 2210568192.0, + "38": 2210568192.0, + "39": 2210568192.0, + "40": 2210568192.0, + "41": 2210568192.0, + "42": 2210568192.0, + "43": 2210568192.0, + "44": 2210568192.0, + "45": 2210568192.0, + "46": 2210568192.0, + "47": 2210568192.0, + "48": 2210568192.0, + "49": 2210568192.0, + "50": 2210568192.0, + "51": 2210568192.0, + "52": 2210568192.0, + "53": 2210568192.0, + "54": 2210568192.0, + "55": 2210568192.0, + "56": 2210568192.0, + "57": 2210568192.0, + "58": 2210568192.0, + "59": 2210568192.0, + "60": 2210568192.0, + "61": 2210568192.0, + "62": 2210568192.0, + "63": 2210568192.0, + "64": 2210568192.0, + "65": 2210568192.0, + "66": 2210568192.0, + "67": 2210568192.0, + "68": 2210568192.0, + "69": 2210568192.0, + "70": 2210568192.0, + "71": 2210568192.0, + "72": 2210568192.0, + "73": 2210568192.0, + "74": 2210568192.0, + "75": 2210568192.0, + "76": 2210568192.0, + "77": 2210568192.0, + "78": 2210568192.0, + "79": 2210568192.0, + "80": 2210568192.0, + "81": 2210568192.0, + "82": 2210568192.0, + "83": 2210568192.0, + "84": 2210568192.0, + "85": 2210568192.0, + "86": 2210568192.0, + "87": 2210568192.0, + "88": 2210568192.0, + "89": 2210568192.0, + "90": 2210568192.0, + "91": 2210568192.0, + "92": 2210568192.0, + "93": 2210568192.0, + "94": 2210568192.0, + "95": 2210568192.0, + "96": 2210568192.0, + "97": 2210568192.0, + "98": 2210568192.0, + "99": 2210568192.0, + "100": 2210568192.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.33061, + "2": 0.15156, + "3": 0.12174, + "4": 0.12197, + "5": 0.12023, + "6": 0.11997, + "7": 0.11882, + "8": 0.11859, + "9": 0.11967, + "10": 0.11724, + "11": 0.11735, + "12": 0.11593, + "13": 0.11661, + "14": 0.11794, + "15": 0.11649, + "16": 0.11682, + "17": 0.11623, + "18": 0.11719, + "19": 0.11753, + "20": 0.11581, + "21": 0.11757, + "22": 0.11628, + "23": 0.11692, + "24": 0.1163, + "25": 0.1167, + "26": 0.11646, + "27": 0.11803, + "28": 0.11984, + "29": 0.11941, + "30": 0.11857, + "31": 0.11687, + "32": 0.11515, + "33": 0.11754, + "34": 0.11591, + "35": 0.11819, + "36": 0.11754, + "37": 0.11694, + "38": 0.11726, + "39": 0.11761, + "40": 0.11745, + "41": 0.11768, + "42": 0.11775, + "43": 0.11661, + "44": 0.11724, + "45": 0.1189, + "46": 0.11964, + "47": 0.11985, + "48": 0.12086, + "49": 0.11855, + "50": 0.11941, + "51": 0.13155, + "52": 0.12627, + "53": 0.12132, + "54": 0.12027, + "55": 0.12076, + "56": 0.14178, + "57": 0.12294, + "58": 0.12155, + "59": 0.11843, + "60": 0.11687, + "61": 0.11827, + "62": 0.11957, + "63": 0.11945, + "64": 0.11781, + "65": 0.12041, + "66": 0.11949, + "67": 0.12059, + "68": 0.11821, + "69": 0.11858, + "70": 0.11799, + "71": 0.12009, + "72": 0.12095, + "73": 0.11845, + "74": 0.11834, + "75": 0.11893, + "76": 0.1214, + "77": 0.1195, + "78": 0.11933, + "79": 0.11885, + "80": 0.11948, + "81": 0.12097, + "82": 0.12, + "83": 0.11954, + "84": 0.11693, + "85": 0.1175, + "86": 0.11941, + "87": 0.11723, + "88": 0.11941, + "89": 0.11804, + "90": 0.11751, + "91": 0.11952, + "92": 0.11778, + "93": 0.11924, + "94": 0.11755, + "95": 0.11789, + "96": 0.11673, + "97": 0.11967, + "98": 0.11752, + "99": 0.11926, + "100": 0.11806 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index ac706ac960b..7b9a1722673 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.91349, + "2": 10.90719, + "3": 10.91328, + "4": 10.87838, "5": 10.91769, + "6": 10.93821, + "7": 10.90469, + "8": 10.90393, + "9": 10.90876, "10": 10.89645, + "11": 10.92562, + "12": 10.91891, + "13": 10.91537, + "14": 10.93343, "15": 10.86115, + "16": 10.85374, + "17": 10.82717, + "18": 10.86544, + "19": 10.86225, "20": 10.76737, + "21": 10.74634, + "22": 10.62228, + "23": 10.76122, + "24": 10.64732, "25": 10.59597, + "26": 10.66352, + "27": 10.6542, + "28": 10.6077, + "29": 10.62581, "30": 10.41591, + "31": 10.16855, + "32": 10.50267, + "33": 10.50304, + "34": 10.25481, "35": 10.31879, + "36": 10.27167, + "37": 10.37751, + "38": 10.22122, + "39": 10.44798, "40": 10.14166, + "41": 10.1771, + "42": 10.2426, + "43": 9.87148, + "44": 9.99875, "45": 9.88702, + "46": 9.86139, + "47": 10.18144, + "48": 9.87873, + "49": 9.58706, "50": 9.9542, + "51": 9.8866, + "52": 9.78429, + "53": 10.10842, + "54": 9.97368, "55": 9.89803, + "56": 9.65427, + "57": 9.52013, + "58": 9.87297, + "59": 9.6132, "60": 9.54967, + "61": 9.70681, + "62": 9.98533, + "63": 9.41357, + "64": 9.80966, "65": 8.97052, + "66": 9.72773, + "67": 9.39183, + "68": 9.8084, + "69": 9.82052, "70": 9.76655, + "71": 9.63414, + "72": 9.60485, + "73": 9.52299, + "74": 8.9718, "75": 9.42321, + "76": 9.10113, + "77": 10.0716, + "78": 9.74266, + "79": 9.40343, "80": 9.41333, + "81": 9.49931, + "82": 9.70236, + "83": 9.33436, + "84": 9.43774, "85": 9.63924, + "86": 9.07931, + "87": 9.60447, + "88": 9.7824, + "89": 9.62386, "90": 9.84241, + "91": 9.35506, + "92": 9.38398, + "93": 9.09747, + "94": 8.8471, "95": 9.5314, + "96": 9.54263, + "97": 9.32886, + "98": 9.6926, + "99": 8.89976, "100": 9.43124 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 22727424.0, + "2": 22925204.0, + "3": 22596900.0, + "4": 23219556.0, "5": 22714624.0, + "6": 23021776.0, + "7": 22771632.0, + "8": 22926560.0, + "9": 22842156.0, "10": 22918168.0, + "11": 22500688.0, + "12": 22459470.0, + "13": 22917228.0, + "14": 22387988.0, "15": 22821732.0, + "16": 22830306.0, + "17": 22819520.0, + "18": 22582628.0, + "19": 22618028.0, "20": 22693852.0, + "21": 22739344.0, + "22": 22799596.0, + "23": 22539016.0, + "24": 22770946.0, "25": 22819324.0, + "26": 22547928.0, + "27": 22468716.0, + "28": 22453820.0, + "29": 22529898.0, "30": 22631220.0, + "31": 22955420.0, + "32": 22585276.0, + "33": 22558602.0, + "34": 22835792.0, "35": 22788208.0, + "36": 22589796.0, + "37": 22496928.0, + "38": 22896192.0, + "39": 22801858.0, "40": 22657640.0, + "41": 22658982.0, + "42": 22667052.0, + "43": 22975816.0, + "44": 22747688.0, "45": 22674846.0, + "46": 22884684.0, + "47": 22633708.0, + "48": 22928466.0, + "49": 22728092.0, "50": 22905080.0, + "51": 22791108.0, + "52": 22748190.0, + "53": 22924900.0, + "54": 22840164.0, "55": 22518344.0, + "56": 22877680.0, + "57": 23113944.0, + "58": 22846268.0, + "59": 22716084.0, "60": 22742984.0, + "61": 22724584.0, + "62": 22672944.0, + "63": 22846388.0, + "64": 22823650.0, "65": 23061058.0, + "66": 22729266.0, + "67": 22908888.0, + "68": 22610020.0, + "69": 22583826.0, "70": 22829374.0, + "71": 22748240.0, + "72": 22654480.0, + "73": 22741180.0, + "74": 23047914.0, "75": 23054396.0, + "76": 22900788.0, + "77": 22271588.0, + "78": 22789024.0, + "79": 22743632.0, "80": 22706696.0, + "81": 22891372.0, + "82": 22777860.0, + "83": 22840532.0, + "84": 23010386.0, "85": 22711212.0, + "86": 23103006.0, + "87": 22734564.0, + "88": 22637848.0, + "89": 22497850.0, "90": 22972712.0, + "91": 22767188.0, + "92": 22808834.0, + "93": 22659304.0, + "94": 22911552.0, "95": 23047794.0, + "96": 22829386.0, + "97": 22608168.0, + "98": 22762756.0, + "99": 22905900.0, "100": 23015488.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 746443264.0, + "2": 746443264.0, + "3": 746443264.0, + "4": 746443264.0, "5": 746443264.0, + "6": 746443264.0, + "7": 746443264.0, + "8": 746443264.0, + "9": 746443264.0, "10": 746443264.0, + "11": 746443264.0, + "12": 746443264.0, + "13": 746443264.0, + "14": 746443264.0, "15": 746443264.0, + "16": 746443264.0, + "17": 746443264.0, + "18": 746443264.0, + "19": 746443264.0, "20": 746443264.0, + "21": 746443264.0, + "22": 746443264.0, + "23": 746443264.0, + "24": 746443264.0, "25": 746443264.0, + "26": 746443264.0, + "27": 746443264.0, + "28": 746443264.0, + "29": 746443264.0, "30": 746443264.0, + "31": 746443264.0, + "32": 746443264.0, + "33": 746443264.0, + "34": 746443264.0, "35": 746443264.0, + "36": 746443264.0, + "37": 746443264.0, + "38": 746443264.0, + "39": 746443264.0, "40": 746443264.0, + "41": 746443264.0, + "42": 746443264.0, + "43": 746443264.0, + "44": 746443264.0, "45": 746443264.0, + "46": 746443264.0, + "47": 746443264.0, + "48": 746443264.0, + "49": 746443264.0, "50": 746443264.0, + "51": 746443264.0, + "52": 746443264.0, + "53": 746443264.0, + "54": 746443264.0, "55": 746443264.0, + "56": 746443264.0, + "57": 746443264.0, + "58": 746443264.0, + "59": 746443264.0, "60": 746443264.0, + "61": 746443264.0, + "62": 746443264.0, + "63": 746443264.0, + "64": 746443264.0, "65": 746443264.0, + "66": 746443264.0, + "67": 746443264.0, + "68": 746443264.0, + "69": 746443264.0, "70": 746443264.0, + "71": 746443264.0, + "72": 746443264.0, + "73": 746443264.0, + "74": 746443264.0, "75": 746443264.0, + "76": 746443264.0, + "77": 746443264.0, + "78": 746443264.0, + "79": 746443264.0, "80": 746443264.0, + "81": 746443264.0, + "82": 746443264.0, + "83": 746443264.0, + "84": 746443264.0, "85": 746443264.0, + "86": 746443264.0, + "87": 746443264.0, + "88": 746443264.0, + "89": 746443264.0, "90": 746443264.0, + "91": 746443264.0, + "92": 746443264.0, + "93": 746443264.0, + "94": 746443264.0, "95": 746443264.0, + "96": 746443264.0, + "97": 746443264.0, + "98": 746443264.0, + "99": 746443264.0, "100": 746443264.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1926291456.0, + "2": 2210100224.0, + "3": 2210100224.0, + "4": 2210100224.0, "5": 2210100224.0, + "6": 2210100224.0, + "7": 2210100224.0, + "8": 2210100224.0, + "9": 2210100224.0, "10": 2210100224.0, + "11": 2210100224.0, + "12": 2210100224.0, + "13": 2210100224.0, + "14": 2210100224.0, "15": 2210100224.0, + "16": 2210100224.0, + "17": 2210100224.0, + "18": 2210100224.0, + "19": 2210100224.0, "20": 2210100224.0, + "21": 2210100224.0, + "22": 2210100224.0, + "23": 2210100224.0, + "24": 2210100224.0, "25": 2210100224.0, + "26": 2210100224.0, + "27": 2210100224.0, + "28": 2210100224.0, + "29": 2210100224.0, "30": 2210100224.0, + "31": 2210100224.0, + "32": 2210100224.0, + "33": 2210100224.0, + "34": 2210100224.0, "35": 2210100224.0, + "36": 2210100224.0, + "37": 2210100224.0, + "38": 2210100224.0, + "39": 2210100224.0, "40": 2210100224.0, + "41": 2210100224.0, + "42": 2210100224.0, + "43": 2210100224.0, + "44": 2210100224.0, "45": 2210100224.0, + "46": 2210100224.0, + "47": 2210100224.0, + "48": 2210100224.0, + "49": 2210100224.0, "50": 2210100224.0, + "51": 2210100224.0, + "52": 2210100224.0, + "53": 2210100224.0, + "54": 2210100224.0, "55": 2210100224.0, + "56": 2210100224.0, + "57": 2210100224.0, + "58": 2210100224.0, + "59": 2210100224.0, "60": 2210100224.0, + "61": 2210100224.0, + "62": 2210100224.0, + "63": 2210100224.0, + "64": 2210100224.0, "65": 2210100224.0, + "66": 2210100224.0, + "67": 2210100224.0, + "68": 2210100224.0, + "69": 2210100224.0, "70": 2210100224.0, + "71": 2210100224.0, + "72": 2210100224.0, + "73": 2210100224.0, + "74": 2210100224.0, "75": 2210100224.0, + "76": 2210100224.0, + "77": 2210100224.0, + "78": 2210100224.0, + "79": 2210100224.0, "80": 2210100224.0, + "81": 2210100224.0, + "82": 2210100224.0, + "83": 2210100224.0, + "84": 2210100224.0, "85": 2210100224.0, + "86": 2210100224.0, + "87": 2210100224.0, + "88": 2210100224.0, + "89": 2210100224.0, "90": 2210100224.0, + "91": 2210100224.0, + "92": 2210100224.0, + "93": 2210100224.0, + "94": 2210100224.0, "95": 2210100224.0, + "96": 2210100224.0, + "97": 2210100224.0, + "98": 2210100224.0, + "99": 2210100224.0, "100": 2210100224.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 12.51362, - "5": 0.10049, - "10": 0.10087, - "15": 0.09868, - "20": 0.09931, - "25": 0.09841, - "30": 0.09873, - "35": 0.09844, - "40": 0.09896, - "45": 0.09974, - "50": 0.09906, - "55": 0.10067, - "60": 0.09886, - "65": 0.0994, - "70": 0.09923, - "75": 0.09864, - "80": 0.09906, - "85": 0.09932, - "90": 0.09976, - "95": 0.09902, - "100": 0.09871 + "1": 14.52368, + "2": 0.12904, + "3": 0.11517, + "4": 0.11756, + "5": 0.11573, + "6": 0.11676, + "7": 0.11475, + "8": 0.11625, + "9": 0.11519, + "10": 0.12088, + "11": 0.11883, + "12": 0.11908, + "13": 0.11781, + "14": 0.11708, + "15": 0.11808, + "16": 0.11499, + "17": 0.11904, + "18": 0.11758, + "19": 0.11836, + "20": 0.11696, + "21": 0.11517, + "22": 0.11537, + "23": 0.11509, + "24": 0.11668, + "25": 0.11421, + "26": 0.11535, + "27": 0.1148, + "28": 0.11573, + "29": 0.11684, + "30": 0.11652, + "31": 0.11749, + "32": 0.11508, + "33": 0.11651, + "34": 0.11541, + "35": 0.11609, + "36": 0.11722, + "37": 0.11735, + "38": 0.11849, + "39": 0.11931, + "40": 0.11381, + "41": 0.11418, + "42": 0.11682, + "43": 0.1172, + "44": 0.11595, + "45": 0.1149, + "46": 0.11591, + "47": 0.11441, + "48": 0.11991, + "49": 0.11482, + "50": 0.11551, + "51": 0.12066, + "52": 0.11485, + "53": 0.11554, + "54": 0.11513, + "55": 0.11749, + "56": 0.11612, + "57": 0.11313, + "58": 0.1131, + "59": 0.11488, + "60": 0.11602, + "61": 0.11343, + "62": 0.11313, + "63": 0.11487, + "64": 0.11581, + "65": 0.11438, + "66": 0.11344, + "67": 0.11567, + "68": 0.11465, + "69": 0.11374, + "70": 0.11452, + "71": 0.11431, + "72": 0.1157, + "73": 0.11626, + "74": 0.11498, + "75": 0.11329, + "76": 0.11264, + "77": 0.11291, + "78": 0.11343, + "79": 0.11536, + "80": 0.11515, + "81": 0.11726, + "82": 0.11537, + "83": 0.11363, + "84": 0.11591, + "85": 0.11747, + "86": 0.11816, + "87": 0.11504, + "88": 0.11547, + "89": 0.11463, + "90": 0.11598, + "91": 0.11209, + "92": 0.11386, + "93": 0.11296, + "94": 0.11351, + "95": 0.11409, + "96": 0.11256, + "97": 0.11707, + "98": 0.1149, + "99": 0.11577, + "100": 0.1143 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..6e9f643a273 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.91349, + "2": 10.90719, + "3": 10.91328, + "4": 10.87838, + "5": 10.91769, + "6": 10.93821, + "7": 10.90469, + "8": 10.90393, + "9": 10.90876, + "10": 10.89645, + "11": 10.92562, + "12": 10.91891, + "13": 10.91537, + "14": 10.93343, + "15": 10.86115, + "16": 10.85374, + "17": 10.82717, + "18": 10.86544, + "19": 10.86225, + "20": 10.76737, + "21": 10.74634, + "22": 10.62228, + "23": 10.76122, + "24": 10.64732, + "25": 10.59597, + "26": 10.66352, + "27": 10.6542, + "28": 10.6077, + "29": 10.62581, + "30": 10.41591, + "31": 10.16855, + "32": 10.50267, + "33": 10.50304, + "34": 10.25481, + "35": 10.31879, + "36": 10.27167, + "37": 10.37751, + "38": 10.22122, + "39": 10.44798, + "40": 10.14166, + "41": 10.1771, + "42": 10.2426, + "43": 9.87148, + "44": 9.99875, + "45": 9.88702, + "46": 9.86139, + "47": 10.18144, + "48": 9.87873, + "49": 9.58706, + "50": 9.9542, + "51": 9.8866, + "52": 9.78429, + "53": 10.10842, + "54": 9.97368, + "55": 9.89803, + "56": 9.65427, + "57": 9.52013, + "58": 9.87297, + "59": 9.6132, + "60": 9.54967, + "61": 9.70681, + "62": 9.98533, + "63": 9.41357, + "64": 9.80966, + "65": 8.97052, + "66": 9.72773, + "67": 9.39183, + "68": 9.8084, + "69": 9.82052, + "70": 9.76655, + "71": 9.63414, + "72": 9.60485, + "73": 9.52299, + "74": 8.9718, + "75": 9.42321, + "76": 9.10113, + "77": 10.0716, + "78": 9.74266, + "79": 9.40343, + "80": 9.41333, + "81": 9.49931, + "82": 9.70236, + "83": 9.33436, + "84": 9.43774, + "85": 9.63924, + "86": 9.07931, + "87": 9.60447, + "88": 9.7824, + "89": 9.62386, + "90": 9.84241, + "91": 9.35506, + "92": 9.38398, + "93": 9.09747, + "94": 8.8471, + "95": 9.5314, + "96": 9.54263, + "97": 9.32886, + "98": 9.6926, + "99": 8.89976, + "100": 9.43124 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727424.0, + "2": 22925204.0, + "3": 22596900.0, + "4": 23219556.0, + "5": 22714624.0, + "6": 23021776.0, + "7": 22771632.0, + "8": 22926560.0, + "9": 22842156.0, + "10": 22918168.0, + "11": 22500688.0, + "12": 22459470.0, + "13": 22917228.0, + "14": 22387988.0, + "15": 22821732.0, + "16": 22830306.0, + "17": 22819520.0, + "18": 22582628.0, + "19": 22618028.0, + "20": 22693852.0, + "21": 22739344.0, + "22": 22799596.0, + "23": 22539016.0, + "24": 22770946.0, + "25": 22819324.0, + "26": 22547928.0, + "27": 22468716.0, + "28": 22453820.0, + "29": 22529898.0, + "30": 22631220.0, + "31": 22955420.0, + "32": 22585276.0, + "33": 22558602.0, + "34": 22835792.0, + "35": 22788208.0, + "36": 22589796.0, + "37": 22496928.0, + "38": 22896192.0, + "39": 22801858.0, + "40": 22657640.0, + "41": 22658982.0, + "42": 22667052.0, + "43": 22975816.0, + "44": 22747688.0, + "45": 22674846.0, + "46": 22884684.0, + "47": 22633708.0, + "48": 22928466.0, + "49": 22728092.0, + "50": 22905080.0, + "51": 22791108.0, + "52": 22748190.0, + "53": 22924900.0, + "54": 22840164.0, + "55": 22518344.0, + "56": 22877680.0, + "57": 23113944.0, + "58": 22846268.0, + "59": 22716084.0, + "60": 22742984.0, + "61": 22724584.0, + "62": 22672944.0, + "63": 22846388.0, + "64": 22823650.0, + "65": 23061058.0, + "66": 22729266.0, + "67": 22908888.0, + "68": 22610020.0, + "69": 22583826.0, + "70": 22829374.0, + "71": 22748240.0, + "72": 22654480.0, + "73": 22741180.0, + "74": 23047914.0, + "75": 23054396.0, + "76": 22900788.0, + "77": 22271588.0, + "78": 22789024.0, + "79": 22743632.0, + "80": 22706696.0, + "81": 22891372.0, + "82": 22777860.0, + "83": 22840532.0, + "84": 23010386.0, + "85": 22711212.0, + "86": 23103006.0, + "87": 22734564.0, + "88": 22637848.0, + "89": 22497850.0, + "90": 22972712.0, + "91": 22767188.0, + "92": 22808834.0, + "93": 22659304.0, + "94": 22911552.0, + "95": 23047794.0, + "96": 22829386.0, + "97": 22608168.0, + "98": 22762756.0, + "99": 22905900.0, + "100": 23015488.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746443264.0, + "2": 746443264.0, + "3": 746443264.0, + "4": 746443264.0, + "5": 746443264.0, + "6": 746443264.0, + "7": 746443264.0, + "8": 746443264.0, + "9": 746443264.0, + "10": 746443264.0, + "11": 746443264.0, + "12": 746443264.0, + "13": 746443264.0, + "14": 746443264.0, + "15": 746443264.0, + "16": 746443264.0, + "17": 746443264.0, + "18": 746443264.0, + "19": 746443264.0, + "20": 746443264.0, + "21": 746443264.0, + "22": 746443264.0, + "23": 746443264.0, + "24": 746443264.0, + "25": 746443264.0, + "26": 746443264.0, + "27": 746443264.0, + "28": 746443264.0, + "29": 746443264.0, + "30": 746443264.0, + "31": 746443264.0, + "32": 746443264.0, + "33": 746443264.0, + "34": 746443264.0, + "35": 746443264.0, + "36": 746443264.0, + "37": 746443264.0, + "38": 746443264.0, + "39": 746443264.0, + "40": 746443264.0, + "41": 746443264.0, + "42": 746443264.0, + "43": 746443264.0, + "44": 746443264.0, + "45": 746443264.0, + "46": 746443264.0, + "47": 746443264.0, + "48": 746443264.0, + "49": 746443264.0, + "50": 746443264.0, + "51": 746443264.0, + "52": 746443264.0, + "53": 746443264.0, + "54": 746443264.0, + "55": 746443264.0, + "56": 746443264.0, + "57": 746443264.0, + "58": 746443264.0, + "59": 746443264.0, + "60": 746443264.0, + "61": 746443264.0, + "62": 746443264.0, + "63": 746443264.0, + "64": 746443264.0, + "65": 746443264.0, + "66": 746443264.0, + "67": 746443264.0, + "68": 746443264.0, + "69": 746443264.0, + "70": 746443264.0, + "71": 746443264.0, + "72": 746443264.0, + "73": 746443264.0, + "74": 746443264.0, + "75": 746443264.0, + "76": 746443264.0, + "77": 746443264.0, + "78": 746443264.0, + "79": 746443264.0, + "80": 746443264.0, + "81": 746443264.0, + "82": 746443264.0, + "83": 746443264.0, + "84": 746443264.0, + "85": 746443264.0, + "86": 746443264.0, + "87": 746443264.0, + "88": 746443264.0, + "89": 746443264.0, + "90": 746443264.0, + "91": 746443264.0, + "92": 746443264.0, + "93": 746443264.0, + "94": 746443264.0, + "95": 746443264.0, + "96": 746443264.0, + "97": 746443264.0, + "98": 746443264.0, + "99": 746443264.0, + "100": 746443264.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926291456.0, + "2": 2210100224.0, + "3": 2210100224.0, + "4": 2210100224.0, + "5": 2210100224.0, + "6": 2210100224.0, + "7": 2210100224.0, + "8": 2210100224.0, + "9": 2210100224.0, + "10": 2210100224.0, + "11": 2210100224.0, + "12": 2210100224.0, + "13": 2210100224.0, + "14": 2210100224.0, + "15": 2210100224.0, + "16": 2210100224.0, + "17": 2210100224.0, + "18": 2210100224.0, + "19": 2210100224.0, + "20": 2210100224.0, + "21": 2210100224.0, + "22": 2210100224.0, + "23": 2210100224.0, + "24": 2210100224.0, + "25": 2210100224.0, + "26": 2210100224.0, + "27": 2210100224.0, + "28": 2210100224.0, + "29": 2210100224.0, + "30": 2210100224.0, + "31": 2210100224.0, + "32": 2210100224.0, + "33": 2210100224.0, + "34": 2210100224.0, + "35": 2210100224.0, + "36": 2210100224.0, + "37": 2210100224.0, + "38": 2210100224.0, + "39": 2210100224.0, + "40": 2210100224.0, + "41": 2210100224.0, + "42": 2210100224.0, + "43": 2210100224.0, + "44": 2210100224.0, + "45": 2210100224.0, + "46": 2210100224.0, + "47": 2210100224.0, + "48": 2210100224.0, + "49": 2210100224.0, + "50": 2210100224.0, + "51": 2210100224.0, + "52": 2210100224.0, + "53": 2210100224.0, + "54": 2210100224.0, + "55": 2210100224.0, + "56": 2210100224.0, + "57": 2210100224.0, + "58": 2210100224.0, + "59": 2210100224.0, + "60": 2210100224.0, + "61": 2210100224.0, + "62": 2210100224.0, + "63": 2210100224.0, + "64": 2210100224.0, + "65": 2210100224.0, + "66": 2210100224.0, + "67": 2210100224.0, + "68": 2210100224.0, + "69": 2210100224.0, + "70": 2210100224.0, + "71": 2210100224.0, + "72": 2210100224.0, + "73": 2210100224.0, + "74": 2210100224.0, + "75": 2210100224.0, + "76": 2210100224.0, + "77": 2210100224.0, + "78": 2210100224.0, + "79": 2210100224.0, + "80": 2210100224.0, + "81": 2210100224.0, + "82": 2210100224.0, + "83": 2210100224.0, + "84": 2210100224.0, + "85": 2210100224.0, + "86": 2210100224.0, + "87": 2210100224.0, + "88": 2210100224.0, + "89": 2210100224.0, + "90": 2210100224.0, + "91": 2210100224.0, + "92": 2210100224.0, + "93": 2210100224.0, + "94": 2210100224.0, + "95": 2210100224.0, + "96": 2210100224.0, + "97": 2210100224.0, + "98": 2210100224.0, + "99": 2210100224.0, + "100": 2210100224.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.93568, + "2": 0.13825, + "3": 0.10934, + "4": 0.10452, + "5": 0.10497, + "6": 0.104, + "7": 0.10328, + "8": 0.10258, + "9": 0.10234, + "10": 0.10351, + "11": 0.10272, + "12": 0.10199, + "13": 0.10258, + "14": 0.1027, + "15": 0.10293, + "16": 0.10182, + "17": 0.10316, + "18": 0.10197, + "19": 0.10305, + "20": 0.10272, + "21": 0.11174, + "22": 0.10459, + "23": 0.10481, + "24": 0.10575, + "25": 0.10937, + "26": 0.10268, + "27": 0.10583, + "28": 0.10249, + "29": 0.10137, + "30": 0.10307, + "31": 0.10524, + "32": 0.10586, + "33": 0.1041, + "34": 0.10278, + "35": 0.10412, + "36": 0.10185, + "37": 0.10244, + "38": 0.10111, + "39": 0.10231, + "40": 0.10346, + "41": 0.10527, + "42": 0.10187, + "43": 0.10283, + "44": 0.10242, + "45": 0.10465, + "46": 0.10208, + "47": 0.10316, + "48": 0.10189, + "49": 0.10524, + "50": 0.10242, + "51": 0.10733, + "52": 0.10211, + "53": 0.10215, + "54": 0.10143, + "55": 0.10092, + "56": 0.10225, + "57": 0.1029, + "58": 0.10504, + "59": 0.10464, + "60": 0.10364, + "61": 0.10221, + "62": 0.10154, + "63": 0.10225, + "64": 0.1013, + "65": 0.10347, + "66": 0.10142, + "67": 0.102, + "68": 0.10339, + "69": 0.10291, + "70": 0.10294, + "71": 0.10164, + "72": 0.1026, + "73": 0.10225, + "74": 0.10241, + "75": 0.10146, + "76": 0.10155, + "77": 0.10259, + "78": 0.10243, + "79": 0.10169, + "80": 0.10195, + "81": 0.10134, + "82": 0.10222, + "83": 0.10368, + "84": 0.10065, + "85": 0.10117, + "86": 0.10158, + "87": 0.10243, + "88": 0.10233, + "89": 0.10157, + "90": 0.10229, + "91": 0.10188, + "92": 0.10172, + "93": 0.1013, + "94": 0.1011, + "95": 0.10202, + "96": 0.10173, + "97": 0.10128, + "98": 0.10222, + "99": 0.10127, + "100": 0.10148 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d3d593b49c2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.91349, + "2": 10.90719, + "3": 10.91328, + "4": 10.87838, + "5": 10.91769, + "6": 10.93821, + "7": 10.90469, + "8": 10.90393, + "9": 10.90876, + "10": 10.89645, + "11": 10.92562, + "12": 10.91891, + "13": 10.91537, + "14": 10.93343, + "15": 10.86115, + "16": 10.85374, + "17": 10.82717, + "18": 10.86544, + "19": 10.86225, + "20": 10.76737, + "21": 10.74634, + "22": 10.62228, + "23": 10.76122, + "24": 10.64732, + "25": 10.59597, + "26": 10.66352, + "27": 10.6542, + "28": 10.6077, + "29": 10.62581, + "30": 10.41591, + "31": 10.16855, + "32": 10.50267, + "33": 10.50304, + "34": 10.25481, + "35": 10.31879, + "36": 10.27167, + "37": 10.37751, + "38": 10.22122, + "39": 10.44798, + "40": 10.14166, + "41": 10.1771, + "42": 10.2426, + "43": 9.87148, + "44": 9.99875, + "45": 9.88702, + "46": 9.86139, + "47": 10.18144, + "48": 9.87873, + "49": 9.58706, + "50": 9.9542, + "51": 9.8866, + "52": 9.78429, + "53": 10.10842, + "54": 9.97368, + "55": 9.89803, + "56": 9.65427, + "57": 9.52013, + "58": 9.87297, + "59": 9.6132, + "60": 9.54967, + "61": 9.70681, + "62": 9.98533, + "63": 9.41357, + "64": 9.80966, + "65": 8.97052, + "66": 9.72773, + "67": 9.39183, + "68": 9.8084, + "69": 9.82052, + "70": 9.76655, + "71": 9.63414, + "72": 9.60485, + "73": 9.52299, + "74": 8.9718, + "75": 9.42321, + "76": 9.10113, + "77": 10.0716, + "78": 9.74266, + "79": 9.40343, + "80": 9.41333, + "81": 9.49931, + "82": 9.70236, + "83": 9.33436, + "84": 9.43774, + "85": 9.63924, + "86": 9.07931, + "87": 9.60447, + "88": 9.7824, + "89": 9.62386, + "90": 9.84241, + "91": 9.35506, + "92": 9.38398, + "93": 9.09747, + "94": 8.8471, + "95": 9.5314, + "96": 9.54263, + "97": 9.32886, + "98": 9.6926, + "99": 8.89976, + "100": 9.43124 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727424.0, + "2": 22925204.0, + "3": 22596900.0, + "4": 23219556.0, + "5": 22714624.0, + "6": 23021776.0, + "7": 22771632.0, + "8": 22926560.0, + "9": 22842156.0, + "10": 22918168.0, + "11": 22500688.0, + "12": 22459470.0, + "13": 22917228.0, + "14": 22387988.0, + "15": 22821732.0, + "16": 22830306.0, + "17": 22819520.0, + "18": 22582628.0, + "19": 22618028.0, + "20": 22693852.0, + "21": 22739344.0, + "22": 22799596.0, + "23": 22539016.0, + "24": 22770946.0, + "25": 22819324.0, + "26": 22547928.0, + "27": 22468716.0, + "28": 22453820.0, + "29": 22529898.0, + "30": 22631220.0, + "31": 22955420.0, + "32": 22585276.0, + "33": 22558602.0, + "34": 22835792.0, + "35": 22788208.0, + "36": 22589796.0, + "37": 22496928.0, + "38": 22896192.0, + "39": 22801858.0, + "40": 22657640.0, + "41": 22658982.0, + "42": 22667052.0, + "43": 22975816.0, + "44": 22747688.0, + "45": 22674846.0, + "46": 22884684.0, + "47": 22633708.0, + "48": 22928466.0, + "49": 22728092.0, + "50": 22905080.0, + "51": 22791108.0, + "52": 22748190.0, + "53": 22924900.0, + "54": 22840164.0, + "55": 22518344.0, + "56": 22877680.0, + "57": 23113944.0, + "58": 22846268.0, + "59": 22716084.0, + "60": 22742984.0, + "61": 22724584.0, + "62": 22672944.0, + "63": 22846388.0, + "64": 22823650.0, + "65": 23061058.0, + "66": 22729266.0, + "67": 22908888.0, + "68": 22610020.0, + "69": 22583826.0, + "70": 22829374.0, + "71": 22748240.0, + "72": 22654480.0, + "73": 22741180.0, + "74": 23047914.0, + "75": 23054396.0, + "76": 22900788.0, + "77": 22271588.0, + "78": 22789024.0, + "79": 22743632.0, + "80": 22706696.0, + "81": 22891372.0, + "82": 22777860.0, + "83": 22840532.0, + "84": 23010386.0, + "85": 22711212.0, + "86": 23103006.0, + "87": 22734564.0, + "88": 22637848.0, + "89": 22497850.0, + "90": 22972712.0, + "91": 22767188.0, + "92": 22808834.0, + "93": 22659304.0, + "94": 22911552.0, + "95": 23047794.0, + "96": 22829386.0, + "97": 22608168.0, + "98": 22762756.0, + "99": 22905900.0, + "100": 23015488.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 746443264.0, + "2": 746443264.0, + "3": 746443264.0, + "4": 746443264.0, + "5": 746443264.0, + "6": 746443264.0, + "7": 746443264.0, + "8": 746443264.0, + "9": 746443264.0, + "10": 746443264.0, + "11": 746443264.0, + "12": 746443264.0, + "13": 746443264.0, + "14": 746443264.0, + "15": 746443264.0, + "16": 746443264.0, + "17": 746443264.0, + "18": 746443264.0, + "19": 746443264.0, + "20": 746443264.0, + "21": 746443264.0, + "22": 746443264.0, + "23": 746443264.0, + "24": 746443264.0, + "25": 746443264.0, + "26": 746443264.0, + "27": 746443264.0, + "28": 746443264.0, + "29": 746443264.0, + "30": 746443264.0, + "31": 746443264.0, + "32": 746443264.0, + "33": 746443264.0, + "34": 746443264.0, + "35": 746443264.0, + "36": 746443264.0, + "37": 746443264.0, + "38": 746443264.0, + "39": 746443264.0, + "40": 746443264.0, + "41": 746443264.0, + "42": 746443264.0, + "43": 746443264.0, + "44": 746443264.0, + "45": 746443264.0, + "46": 746443264.0, + "47": 746443264.0, + "48": 746443264.0, + "49": 746443264.0, + "50": 746443264.0, + "51": 746443264.0, + "52": 746443264.0, + "53": 746443264.0, + "54": 746443264.0, + "55": 746443264.0, + "56": 746443264.0, + "57": 746443264.0, + "58": 746443264.0, + "59": 746443264.0, + "60": 746443264.0, + "61": 746443264.0, + "62": 746443264.0, + "63": 746443264.0, + "64": 746443264.0, + "65": 746443264.0, + "66": 746443264.0, + "67": 746443264.0, + "68": 746443264.0, + "69": 746443264.0, + "70": 746443264.0, + "71": 746443264.0, + "72": 746443264.0, + "73": 746443264.0, + "74": 746443264.0, + "75": 746443264.0, + "76": 746443264.0, + "77": 746443264.0, + "78": 746443264.0, + "79": 746443264.0, + "80": 746443264.0, + "81": 746443264.0, + "82": 746443264.0, + "83": 746443264.0, + "84": 746443264.0, + "85": 746443264.0, + "86": 746443264.0, + "87": 746443264.0, + "88": 746443264.0, + "89": 746443264.0, + "90": 746443264.0, + "91": 746443264.0, + "92": 746443264.0, + "93": 746443264.0, + "94": 746443264.0, + "95": 746443264.0, + "96": 746443264.0, + "97": 746443264.0, + "98": 746443264.0, + "99": 746443264.0, + "100": 746443264.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1926291456.0, + "2": 2210100224.0, + "3": 2210100224.0, + "4": 2210100224.0, + "5": 2210100224.0, + "6": 2210100224.0, + "7": 2210100224.0, + "8": 2210100224.0, + "9": 2210100224.0, + "10": 2210100224.0, + "11": 2210100224.0, + "12": 2210100224.0, + "13": 2210100224.0, + "14": 2210100224.0, + "15": 2210100224.0, + "16": 2210100224.0, + "17": 2210100224.0, + "18": 2210100224.0, + "19": 2210100224.0, + "20": 2210100224.0, + "21": 2210100224.0, + "22": 2210100224.0, + "23": 2210100224.0, + "24": 2210100224.0, + "25": 2210100224.0, + "26": 2210100224.0, + "27": 2210100224.0, + "28": 2210100224.0, + "29": 2210100224.0, + "30": 2210100224.0, + "31": 2210100224.0, + "32": 2210100224.0, + "33": 2210100224.0, + "34": 2210100224.0, + "35": 2210100224.0, + "36": 2210100224.0, + "37": 2210100224.0, + "38": 2210100224.0, + "39": 2210100224.0, + "40": 2210100224.0, + "41": 2210100224.0, + "42": 2210100224.0, + "43": 2210100224.0, + "44": 2210100224.0, + "45": 2210100224.0, + "46": 2210100224.0, + "47": 2210100224.0, + "48": 2210100224.0, + "49": 2210100224.0, + "50": 2210100224.0, + "51": 2210100224.0, + "52": 2210100224.0, + "53": 2210100224.0, + "54": 2210100224.0, + "55": 2210100224.0, + "56": 2210100224.0, + "57": 2210100224.0, + "58": 2210100224.0, + "59": 2210100224.0, + "60": 2210100224.0, + "61": 2210100224.0, + "62": 2210100224.0, + "63": 2210100224.0, + "64": 2210100224.0, + "65": 2210100224.0, + "66": 2210100224.0, + "67": 2210100224.0, + "68": 2210100224.0, + "69": 2210100224.0, + "70": 2210100224.0, + "71": 2210100224.0, + "72": 2210100224.0, + "73": 2210100224.0, + "74": 2210100224.0, + "75": 2210100224.0, + "76": 2210100224.0, + "77": 2210100224.0, + "78": 2210100224.0, + "79": 2210100224.0, + "80": 2210100224.0, + "81": 2210100224.0, + "82": 2210100224.0, + "83": 2210100224.0, + "84": 2210100224.0, + "85": 2210100224.0, + "86": 2210100224.0, + "87": 2210100224.0, + "88": 2210100224.0, + "89": 2210100224.0, + "90": 2210100224.0, + "91": 2210100224.0, + "92": 2210100224.0, + "93": 2210100224.0, + "94": 2210100224.0, + "95": 2210100224.0, + "96": 2210100224.0, + "97": 2210100224.0, + "98": 2210100224.0, + "99": 2210100224.0, + "100": 2210100224.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.49723, + "2": 0.13917, + "3": 0.12323, + "4": 0.12243, + "5": 0.12247, + "6": 0.12126, + "7": 0.12098, + "8": 0.1227, + "9": 0.12232, + "10": 0.12216, + "11": 0.12203, + "12": 0.12472, + "13": 0.11919, + "14": 0.12363, + "15": 0.11934, + "16": 0.12078, + "17": 0.1214, + "18": 0.12382, + "19": 0.11938, + "20": 0.11818, + "21": 0.1195, + "22": 0.1193, + "23": 0.11729, + "24": 0.11671, + "25": 0.11812, + "26": 0.11788, + "27": 0.11835, + "28": 0.11687, + "29": 0.11683, + "30": 0.1185, + "31": 0.11738, + "32": 0.11696, + "33": 0.11541, + "34": 0.11482, + "35": 0.11307, + "36": 0.11445, + "37": 0.11503, + "38": 0.11448, + "39": 0.11562, + "40": 0.11468, + "41": 0.11341, + "42": 0.11368, + "43": 0.11604, + "44": 0.11649, + "45": 0.11581, + "46": 0.11637, + "47": 0.11699, + "48": 0.11661, + "49": 0.11522, + "50": 0.11451, + "51": 0.12299, + "52": 0.11449, + "53": 0.11137, + "54": 0.11274, + "55": 0.1121, + "56": 0.11212, + "57": 0.11573, + "58": 0.11206, + "59": 0.11388, + "60": 0.11369, + "61": 0.11208, + "62": 0.11287, + "63": 0.11238, + "64": 0.11193, + "65": 0.11205, + "66": 0.11482, + "67": 0.1131, + "68": 0.11433, + "69": 0.11257, + "70": 0.1116, + "71": 0.11365, + "72": 0.11214, + "73": 0.11376, + "74": 0.11389, + "75": 0.11397, + "76": 0.11359, + "77": 0.11346, + "78": 0.11235, + "79": 0.11282, + "80": 0.11301, + "81": 0.11347, + "82": 0.11356, + "83": 0.11321, + "84": 0.11412, + "85": 0.11256, + "86": 0.11555, + "87": 0.11224, + "88": 0.11344, + "89": 0.11351, + "90": 0.11218, + "91": 0.11235, + "92": 0.11417, + "93": 0.11691, + "94": 0.11326, + "95": 0.11519, + "96": 0.11321, + "97": 0.11272, + "98": 0.11268, + "99": 0.11187, + "100": 0.11371 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..0c4a176491d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.90105, + "2": 10.89262, + "3": 10.90042, + "4": 10.88139, + "5": 10.89686, + "6": 10.91104, + "7": 10.90071, + "8": 10.88372, + "9": 10.89705, + "10": 10.88269, + "11": 10.91638, + "12": 10.88862, + "13": 10.89506, + "14": 10.90397, + "15": 10.83975, + "16": 10.84821, + "17": 10.83519, + "18": 10.83782, + "19": 10.83204, + "20": 10.74037, + "21": 10.70726, + "22": 10.5989, + "23": 10.72135, + "24": 10.60586, + "25": 10.57931, + "26": 10.63021, + "27": 10.62207, + "28": 10.57267, + "29": 10.60724, + "30": 10.37738, + "31": 10.15237, + "32": 10.47733, + "33": 10.48045, + "34": 10.24256, + "35": 10.29033, + "36": 10.26052, + "37": 10.36236, + "38": 10.2143, + "39": 10.44546, + "40": 10.1156, + "41": 10.15998, + "42": 10.23373, + "43": 9.85188, + "44": 9.97725, + "45": 9.85639, + "46": 9.83161, + "47": 10.17999, + "48": 9.85771, + "49": 9.54486, + "50": 9.93378, + "51": 9.86811, + "52": 9.76315, + "53": 10.10886, + "54": 9.95631, + "55": 9.87553, + "56": 9.64641, + "57": 9.49014, + "58": 9.85454, + "59": 9.59336, + "60": 9.528, + "61": 9.69542, + "62": 10.01688, + "63": 9.38936, + "64": 9.80315, + "65": 8.95041, + "66": 9.72761, + "67": 9.37481, + "68": 9.80513, + "69": 9.81015, + "70": 9.76634, + "71": 9.63164, + "72": 9.57894, + "73": 9.52071, + "74": 8.94946, + "75": 9.4304, + "76": 9.0845, + "77": 10.08945, + "78": 9.72783, + "79": 9.37638, + "80": 9.40916, + "81": 9.4973, + "82": 9.71293, + "83": 9.33328, + "84": 9.44016, + "85": 9.63365, + "86": 9.07079, + "87": 9.61271, + "88": 9.78341, + "89": 9.60939, + "90": 9.8516, + "91": 9.34566, + "92": 9.38259, + "93": 9.07364, + "94": 8.81745, + "95": 9.51874, + "96": 9.54064, + "97": 9.3403, + "98": 9.7014, + "99": 8.88889, + "100": 9.43257 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727086.0, + "2": 22925536.0, + "3": 22597166.0, + "4": 23219856.0, + "5": 22714736.0, + "6": 23021732.0, + "7": 22770914.0, + "8": 22927056.0, + "9": 22842296.0, + "10": 22918912.0, + "11": 22500920.0, + "12": 22460280.0, + "13": 22917408.0, + "14": 22388720.0, + "15": 22821334.0, + "16": 22830758.0, + "17": 22818604.0, + "18": 22581868.0, + "19": 22618000.0, + "20": 22694008.0, + "21": 22739396.0, + "22": 22800094.0, + "23": 22540104.0, + "24": 22771496.0, + "25": 22818912.0, + "26": 22547352.0, + "27": 22469568.0, + "28": 22453522.0, + "29": 22530096.0, + "30": 22631266.0, + "31": 22955564.0, + "32": 22585980.0, + "33": 22558174.0, + "34": 22835734.0, + "35": 22787944.0, + "36": 22590020.0, + "37": 22497168.0, + "38": 22896692.0, + "39": 22801708.0, + "40": 22658196.0, + "41": 22659512.0, + "42": 22667920.0, + "43": 22975524.0, + "44": 22746310.0, + "45": 22675296.0, + "46": 22884630.0, + "47": 22633552.0, + "48": 22929508.0, + "49": 22727314.0, + "50": 22904808.0, + "51": 22791580.0, + "52": 22748196.0, + "53": 22926080.0, + "54": 22839468.0, + "55": 22518754.0, + "56": 22877424.0, + "57": 23112764.0, + "58": 22845208.0, + "59": 22716140.0, + "60": 22743504.0, + "61": 22724840.0, + "62": 22672332.0, + "63": 22846080.0, + "64": 22823362.0, + "65": 23060460.0, + "66": 22729572.0, + "67": 22907836.0, + "68": 22610520.0, + "69": 22584436.0, + "70": 22829772.0, + "71": 22749364.0, + "72": 22653792.0, + "73": 22740804.0, + "74": 23047852.0, + "75": 23054048.0, + "76": 22901336.0, + "77": 22271880.0, + "78": 22789702.0, + "79": 22743626.0, + "80": 22706308.0, + "81": 22891444.0, + "82": 22776950.0, + "83": 22839442.0, + "84": 23010112.0, + "85": 22712054.0, + "86": 23103248.0, + "87": 22735596.0, + "88": 22636964.0, + "89": 22499088.0, + "90": 22972128.0, + "91": 22767228.0, + "92": 22810212.0, + "93": 22659490.0, + "94": 22911654.0, + "95": 23048144.0, + "96": 22828752.0, + "97": 22608416.0, + "98": 22762932.0, + "99": 22906240.0, + "100": 23015824.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 717082624.0, + "2": 717082624.0, + "3": 717082624.0, + "4": 717082624.0, + "5": 717082624.0, + "6": 717082624.0, + "7": 717082624.0, + "8": 717082624.0, + "9": 717082624.0, + "10": 717082624.0, + "11": 717082624.0, + "12": 717082624.0, + "13": 717082624.0, + "14": 717082624.0, + "15": 717082624.0, + "16": 717082624.0, + "17": 717082624.0, + "18": 717082624.0, + "19": 717082624.0, + "20": 717082624.0, + "21": 717082624.0, + "22": 717082624.0, + "23": 717082624.0, + "24": 717082624.0, + "25": 717082624.0, + "26": 717082624.0, + "27": 717082624.0, + "28": 717082624.0, + "29": 717082624.0, + "30": 717082624.0, + "31": 717082624.0, + "32": 717082624.0, + "33": 717082624.0, + "34": 717082624.0, + "35": 717082624.0, + "36": 717082624.0, + "37": 717082624.0, + "38": 717082624.0, + "39": 717082624.0, + "40": 717082624.0, + "41": 717082624.0, + "42": 717082624.0, + "43": 717082624.0, + "44": 717082624.0, + "45": 717082624.0, + "46": 717082624.0, + "47": 717082624.0, + "48": 717082624.0, + "49": 717082624.0, + "50": 717082624.0, + "51": 717082624.0, + "52": 717082624.0, + "53": 717082624.0, + "54": 717082624.0, + "55": 717082624.0, + "56": 717082624.0, + "57": 717082624.0, + "58": 717082624.0, + "59": 717082624.0, + "60": 717082624.0, + "61": 717082624.0, + "62": 717082624.0, + "63": 717082624.0, + "64": 717082624.0, + "65": 717082624.0, + "66": 717082624.0, + "67": 717082624.0, + "68": 717082624.0, + "69": 717082624.0, + "70": 717082624.0, + "71": 717082624.0, + "72": 717082624.0, + "73": 717082624.0, + "74": 717082624.0, + "75": 717082624.0, + "76": 717082624.0, + "77": 717082624.0, + "78": 717082624.0, + "79": 717082624.0, + "80": 717082624.0, + "81": 717082624.0, + "82": 717082624.0, + "83": 717082624.0, + "84": 717082624.0, + "85": 717082624.0, + "86": 717082624.0, + "87": 717082624.0, + "88": 717082624.0, + "89": 717082624.0, + "90": 717082624.0, + "91": 717082624.0, + "92": 717082624.0, + "93": 717082624.0, + "94": 717082624.0, + "95": 717082624.0, + "96": 717082624.0, + "97": 717082624.0, + "98": 717082624.0, + "99": 717082624.0, + "100": 717082624.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2399852544.0, + "2": 2683661312.0, + "3": 2683661312.0, + "4": 2683661312.0, + "5": 2683661312.0, + "6": 2683661312.0, + "7": 2683661312.0, + "8": 2683661312.0, + "9": 2683661312.0, + "10": 2683661312.0, + "11": 2683661312.0, + "12": 2683661312.0, + "13": 2683661312.0, + "14": 2683661312.0, + "15": 2683661312.0, + "16": 2683661312.0, + "17": 2683661312.0, + "18": 2683661312.0, + "19": 2683661312.0, + "20": 2683661312.0, + "21": 2683661312.0, + "22": 2683661312.0, + "23": 2683661312.0, + "24": 2683661312.0, + "25": 2683661312.0, + "26": 2683661312.0, + "27": 2683661312.0, + "28": 2683661312.0, + "29": 2683661312.0, + "30": 2683661312.0, + "31": 2683661312.0, + "32": 2683661312.0, + "33": 2683661312.0, + "34": 2683661312.0, + "35": 2683661312.0, + "36": 2683661312.0, + "37": 2683661312.0, + "38": 2683661312.0, + "39": 2683661312.0, + "40": 2683661312.0, + "41": 2683661312.0, + "42": 2683661312.0, + "43": 2683661312.0, + "44": 2683661312.0, + "45": 2683661312.0, + "46": 2683661312.0, + "47": 2683661312.0, + "48": 2683661312.0, + "49": 2683661312.0, + "50": 2683661312.0, + "51": 2683661312.0, + "52": 2683661312.0, + "53": 2683661312.0, + "54": 2683661312.0, + "55": 2683661312.0, + "56": 2683661312.0, + "57": 2683661312.0, + "58": 2683661312.0, + "59": 2683661312.0, + "60": 2683661312.0, + "61": 2683661312.0, + "62": 2683661312.0, + "63": 2683661312.0, + "64": 2683661312.0, + "65": 2683661312.0, + "66": 2683661312.0, + "67": 2683661312.0, + "68": 2683661312.0, + "69": 2683661312.0, + "70": 2683661312.0, + "71": 2683661312.0, + "72": 2683661312.0, + "73": 2683661312.0, + "74": 2683661312.0, + "75": 2683661312.0, + "76": 2683661312.0, + "77": 2683661312.0, + "78": 2683661312.0, + "79": 2683661312.0, + "80": 2683661312.0, + "81": 2683661312.0, + "82": 2683661312.0, + "83": 2683661312.0, + "84": 2683661312.0, + "85": 2683661312.0, + "86": 2683661312.0, + "87": 2683661312.0, + "88": 2683661312.0, + "89": 2683661312.0, + "90": 2683661312.0, + "91": 2683661312.0, + "92": 2683661312.0, + "93": 2683661312.0, + "94": 2683661312.0, + "95": 2683661312.0, + "96": 2683661312.0, + "97": 2683661312.0, + "98": 2683661312.0, + "99": 2683661312.0, + "100": 2683661312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.63764, + "2": 0.21125, + "3": 0.18805, + "4": 0.18329, + "5": 0.1823, + "6": 0.18232, + "7": 0.18144, + "8": 0.18027, + "9": 0.17969, + "10": 0.18238, + "11": 0.18028, + "12": 0.36174, + "13": 0.18167, + "14": 0.1837, + "15": 0.18267, + "16": 0.18257, + "17": 0.18024, + "18": 0.18275, + "19": 0.1832, + "20": 0.17831, + "21": 0.18017, + "22": 0.18109, + "23": 0.17885, + "24": 0.18267, + "25": 0.18058, + "26": 0.1773, + "27": 0.1794, + "28": 0.17907, + "29": 0.18081, + "30": 0.17905, + "31": 0.17854, + "32": 0.17894, + "33": 0.17849, + "34": 0.17658, + "35": 0.17776, + "36": 0.17727, + "37": 0.17642, + "38": 0.17777, + "39": 0.17803, + "40": 0.17642, + "41": 0.17693, + "42": 0.17625, + "43": 0.17866, + "44": 0.17762, + "45": 0.17754, + "46": 0.17702, + "47": 0.17711, + "48": 0.17758, + "49": 0.17715, + "50": 0.17757, + "51": 0.18445, + "52": 0.1799, + "53": 0.18208, + "54": 0.17612, + "55": 0.17944, + "56": 0.17873, + "57": 0.18258, + "58": 0.17483, + "59": 0.17477, + "60": 0.17433, + "61": 0.17366, + "62": 0.44447, + "63": 0.17665, + "64": 0.17466, + "65": 0.17524, + "66": 0.17467, + "67": 0.17584, + "68": 0.17461, + "69": 0.17423, + "70": 0.1742, + "71": 0.1735, + "72": 0.17461, + "73": 0.17526, + "74": 0.17447, + "75": 0.17297, + "76": 0.17355, + "77": 0.17305, + "78": 0.17366, + "79": 0.17341, + "80": 0.17382, + "81": 0.17396, + "82": 0.17489, + "83": 0.17464, + "84": 0.17401, + "85": 0.17498, + "86": 0.17379, + "87": 0.1725, + "88": 0.17312, + "89": 0.17427, + "90": 0.17333, + "91": 0.1738, + "92": 0.1743, + "93": 0.1732, + "94": 0.1739, + "95": 0.17949, + "96": 0.17499, + "97": 0.17375, + "98": 0.17377, + "99": 0.17343, + "100": 0.17383 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..0fb0b846d53 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.90105, + "2": 10.89262, + "3": 10.90042, + "4": 10.88139, + "5": 10.89686, + "6": 10.91104, + "7": 10.90071, + "8": 10.88372, + "9": 10.89705, + "10": 10.88269, + "11": 10.91638, + "12": 10.88862, + "13": 10.89506, + "14": 10.90397, + "15": 10.83975, + "16": 10.84821, + "17": 10.83519, + "18": 10.83782, + "19": 10.83204, + "20": 10.74037, + "21": 10.70726, + "22": 10.5989, + "23": 10.72135, + "24": 10.60586, + "25": 10.57931, + "26": 10.63021, + "27": 10.62207, + "28": 10.57267, + "29": 10.60724, + "30": 10.37738, + "31": 10.15237, + "32": 10.47733, + "33": 10.48045, + "34": 10.24256, + "35": 10.29033, + "36": 10.26052, + "37": 10.36236, + "38": 10.2143, + "39": 10.44546, + "40": 10.1156, + "41": 10.15998, + "42": 10.23373, + "43": 9.85188, + "44": 9.97725, + "45": 9.85639, + "46": 9.83161, + "47": 10.17999, + "48": 9.85771, + "49": 9.54486, + "50": 9.93378, + "51": 9.86811, + "52": 9.76315, + "53": 10.10886, + "54": 9.95631, + "55": 9.87553, + "56": 9.64641, + "57": 9.49014, + "58": 9.85454, + "59": 9.59336, + "60": 9.528, + "61": 9.69542, + "62": 10.01688, + "63": 9.38936, + "64": 9.80315, + "65": 8.95041, + "66": 9.72761, + "67": 9.37481, + "68": 9.80513, + "69": 9.81015, + "70": 9.76634, + "71": 9.63164, + "72": 9.57894, + "73": 9.52071, + "74": 8.94946, + "75": 9.4304, + "76": 9.0845, + "77": 10.08945, + "78": 9.72783, + "79": 9.37638, + "80": 9.40916, + "81": 9.4973, + "82": 9.71293, + "83": 9.33328, + "84": 9.44016, + "85": 9.63365, + "86": 9.07079, + "87": 9.61271, + "88": 9.78341, + "89": 9.60939, + "90": 9.8516, + "91": 9.34566, + "92": 9.38259, + "93": 9.07364, + "94": 8.81745, + "95": 9.51874, + "96": 9.54064, + "97": 9.3403, + "98": 9.7014, + "99": 8.88889, + "100": 9.43257 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727086.0, + "2": 22925536.0, + "3": 22597166.0, + "4": 23219856.0, + "5": 22714736.0, + "6": 23021732.0, + "7": 22770914.0, + "8": 22927056.0, + "9": 22842296.0, + "10": 22918912.0, + "11": 22500920.0, + "12": 22460280.0, + "13": 22917408.0, + "14": 22388720.0, + "15": 22821334.0, + "16": 22830758.0, + "17": 22818604.0, + "18": 22581868.0, + "19": 22618000.0, + "20": 22694008.0, + "21": 22739396.0, + "22": 22800094.0, + "23": 22540104.0, + "24": 22771496.0, + "25": 22818912.0, + "26": 22547352.0, + "27": 22469568.0, + "28": 22453522.0, + "29": 22530096.0, + "30": 22631266.0, + "31": 22955564.0, + "32": 22585980.0, + "33": 22558174.0, + "34": 22835734.0, + "35": 22787944.0, + "36": 22590020.0, + "37": 22497168.0, + "38": 22896692.0, + "39": 22801708.0, + "40": 22658196.0, + "41": 22659512.0, + "42": 22667920.0, + "43": 22975524.0, + "44": 22746310.0, + "45": 22675296.0, + "46": 22884630.0, + "47": 22633552.0, + "48": 22929508.0, + "49": 22727314.0, + "50": 22904808.0, + "51": 22791580.0, + "52": 22748196.0, + "53": 22926080.0, + "54": 22839468.0, + "55": 22518754.0, + "56": 22877424.0, + "57": 23112764.0, + "58": 22845208.0, + "59": 22716140.0, + "60": 22743504.0, + "61": 22724840.0, + "62": 22672332.0, + "63": 22846080.0, + "64": 22823362.0, + "65": 23060460.0, + "66": 22729572.0, + "67": 22907836.0, + "68": 22610520.0, + "69": 22584436.0, + "70": 22829772.0, + "71": 22749364.0, + "72": 22653792.0, + "73": 22740804.0, + "74": 23047852.0, + "75": 23054048.0, + "76": 22901336.0, + "77": 22271880.0, + "78": 22789702.0, + "79": 22743626.0, + "80": 22706308.0, + "81": 22891444.0, + "82": 22776950.0, + "83": 22839442.0, + "84": 23010112.0, + "85": 22712054.0, + "86": 23103248.0, + "87": 22735596.0, + "88": 22636964.0, + "89": 22499088.0, + "90": 22972128.0, + "91": 22767228.0, + "92": 22810212.0, + "93": 22659490.0, + "94": 22911654.0, + "95": 23048144.0, + "96": 22828752.0, + "97": 22608416.0, + "98": 22762932.0, + "99": 22906240.0, + "100": 23015824.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 717082624.0, + "2": 717082624.0, + "3": 717082624.0, + "4": 717082624.0, + "5": 717082624.0, + "6": 717082624.0, + "7": 717082624.0, + "8": 717082624.0, + "9": 717082624.0, + "10": 717082624.0, + "11": 717082624.0, + "12": 717082624.0, + "13": 717082624.0, + "14": 717082624.0, + "15": 717082624.0, + "16": 717082624.0, + "17": 717082624.0, + "18": 717082624.0, + "19": 717082624.0, + "20": 717082624.0, + "21": 717082624.0, + "22": 717082624.0, + "23": 717082624.0, + "24": 717082624.0, + "25": 717082624.0, + "26": 717082624.0, + "27": 717082624.0, + "28": 717082624.0, + "29": 717082624.0, + "30": 717082624.0, + "31": 717082624.0, + "32": 717082624.0, + "33": 717082624.0, + "34": 717082624.0, + "35": 717082624.0, + "36": 717082624.0, + "37": 717082624.0, + "38": 717082624.0, + "39": 717082624.0, + "40": 717082624.0, + "41": 717082624.0, + "42": 717082624.0, + "43": 717082624.0, + "44": 717082624.0, + "45": 717082624.0, + "46": 717082624.0, + "47": 717082624.0, + "48": 717082624.0, + "49": 717082624.0, + "50": 717082624.0, + "51": 717082624.0, + "52": 717082624.0, + "53": 717082624.0, + "54": 717082624.0, + "55": 717082624.0, + "56": 717082624.0, + "57": 717082624.0, + "58": 717082624.0, + "59": 717082624.0, + "60": 717082624.0, + "61": 717082624.0, + "62": 717082624.0, + "63": 717082624.0, + "64": 717082624.0, + "65": 717082624.0, + "66": 717082624.0, + "67": 717082624.0, + "68": 717082624.0, + "69": 717082624.0, + "70": 717082624.0, + "71": 717082624.0, + "72": 717082624.0, + "73": 717082624.0, + "74": 717082624.0, + "75": 717082624.0, + "76": 717082624.0, + "77": 717082624.0, + "78": 717082624.0, + "79": 717082624.0, + "80": 717082624.0, + "81": 717082624.0, + "82": 717082624.0, + "83": 717082624.0, + "84": 717082624.0, + "85": 717082624.0, + "86": 717082624.0, + "87": 717082624.0, + "88": 717082624.0, + "89": 717082624.0, + "90": 717082624.0, + "91": 717082624.0, + "92": 717082624.0, + "93": 717082624.0, + "94": 717082624.0, + "95": 717082624.0, + "96": 717082624.0, + "97": 717082624.0, + "98": 717082624.0, + "99": 717082624.0, + "100": 717082624.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2399852544.0, + "2": 2683661312.0, + "3": 2683661312.0, + "4": 2683661312.0, + "5": 2683661312.0, + "6": 2683661312.0, + "7": 2683661312.0, + "8": 2683661312.0, + "9": 2683661312.0, + "10": 2683661312.0, + "11": 2683661312.0, + "12": 2683661312.0, + "13": 2683661312.0, + "14": 2683661312.0, + "15": 2683661312.0, + "16": 2683661312.0, + "17": 2683661312.0, + "18": 2683661312.0, + "19": 2683661312.0, + "20": 2683661312.0, + "21": 2683661312.0, + "22": 2683661312.0, + "23": 2683661312.0, + "24": 2683661312.0, + "25": 2683661312.0, + "26": 2683661312.0, + "27": 2683661312.0, + "28": 2683661312.0, + "29": 2683661312.0, + "30": 2683661312.0, + "31": 2683661312.0, + "32": 2683661312.0, + "33": 2683661312.0, + "34": 2683661312.0, + "35": 2683661312.0, + "36": 2683661312.0, + "37": 2683661312.0, + "38": 2683661312.0, + "39": 2683661312.0, + "40": 2683661312.0, + "41": 2683661312.0, + "42": 2683661312.0, + "43": 2683661312.0, + "44": 2683661312.0, + "45": 2683661312.0, + "46": 2683661312.0, + "47": 2683661312.0, + "48": 2683661312.0, + "49": 2683661312.0, + "50": 2683661312.0, + "51": 2683661312.0, + "52": 2683661312.0, + "53": 2683661312.0, + "54": 2683661312.0, + "55": 2683661312.0, + "56": 2683661312.0, + "57": 2683661312.0, + "58": 2683661312.0, + "59": 2683661312.0, + "60": 2683661312.0, + "61": 2683661312.0, + "62": 2683661312.0, + "63": 2683661312.0, + "64": 2683661312.0, + "65": 2683661312.0, + "66": 2683661312.0, + "67": 2683661312.0, + "68": 2683661312.0, + "69": 2683661312.0, + "70": 2683661312.0, + "71": 2683661312.0, + "72": 2683661312.0, + "73": 2683661312.0, + "74": 2683661312.0, + "75": 2683661312.0, + "76": 2683661312.0, + "77": 2683661312.0, + "78": 2683661312.0, + "79": 2683661312.0, + "80": 2683661312.0, + "81": 2683661312.0, + "82": 2683661312.0, + "83": 2683661312.0, + "84": 2683661312.0, + "85": 2683661312.0, + "86": 2683661312.0, + "87": 2683661312.0, + "88": 2683661312.0, + "89": 2683661312.0, + "90": 2683661312.0, + "91": 2683661312.0, + "92": 2683661312.0, + "93": 2683661312.0, + "94": 2683661312.0, + "95": 2683661312.0, + "96": 2683661312.0, + "97": 2683661312.0, + "98": 2683661312.0, + "99": 2683661312.0, + "100": 2683661312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 19.93377, + "2": 0.214, + "3": 0.18589, + "4": 0.17894, + "5": 0.1785, + "6": 0.17891, + "7": 0.18156, + "8": 0.18079, + "9": 0.17824, + "10": 0.17989, + "11": 0.17805, + "12": 0.17716, + "13": 0.17836, + "14": 0.17787, + "15": 0.17769, + "16": 0.17666, + "17": 0.17653, + "18": 0.1758, + "19": 0.17562, + "20": 0.1768, + "21": 0.1768, + "22": 0.17624, + "23": 0.17472, + "24": 0.17432, + "25": 0.1736, + "26": 0.1746, + "27": 0.17474, + "28": 0.17601, + "29": 0.17807, + "30": 0.17493, + "31": 0.17335, + "32": 0.17319, + "33": 0.17268, + "34": 0.17305, + "35": 0.17412, + "36": 0.17335, + "37": 0.17266, + "38": 0.17413, + "39": 0.17304, + "40": 0.17432, + "41": 0.17519, + "42": 0.17337, + "43": 0.17392, + "44": 0.17265, + "45": 0.17279, + "46": 0.17548, + "47": 0.17651, + "48": 0.17389, + "49": 0.17631, + "50": 0.17232, + "51": 0.18407, + "52": 0.17581, + "53": 0.37263, + "54": 0.17452, + "55": 0.17442, + "56": 0.1745, + "57": 0.17483, + "58": 0.17583, + "59": 0.17494, + "60": 0.17407, + "61": 0.17423, + "62": 0.17441, + "63": 0.17659, + "64": 0.17537, + "65": 0.17556, + "66": 0.3524, + "67": 0.17531, + "68": 0.17588, + "69": 0.17592, + "70": 0.17431, + "71": 0.17395, + "72": 0.17604, + "73": 0.17728, + "74": 0.17752, + "75": 0.1758, + "76": 0.17612, + "77": 0.17411, + "78": 0.17662, + "79": 0.17605, + "80": 0.17671, + "81": 0.17596, + "82": 0.1766, + "83": 0.17666, + "84": 0.17679, + "85": 0.17653, + "86": 0.17635, + "87": 0.17598, + "88": 0.17546, + "89": 0.17602, + "90": 0.17567, + "91": 0.17695, + "92": 0.17831, + "93": 0.17683, + "94": 0.17578, + "95": 0.17724, + "96": 0.17805, + "97": 0.17524, + "98": 0.17706, + "99": 0.1768, + "100": 0.17633 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 0568628b7b7..9ec4370d823 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84523, "5": 10.87428, "10": 10.82858, "15": 10.81926, "20": 10.72749, "25": 10.55195, "30": 10.36504, "35": 10.27845, "40": 10.09773, "45": 9.84203, "50": 9.91254}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1725.0, "5": 1834.0, "10": 1478.0, "15": 1891.0, "20": 1639.0, "25": 1623.0, "30": 1882.0, "35": 2043.0, "40": 2168.0, "45": 2159.0, "50": 2319.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3868255744.0, "5": 4152064512.0, "10": 4152064512.0, "15": 4152064512.0, "20": 4152064512.0, "25": 4152064512.0, "30": 4152064512.0, "35": 4152064512.0, "40": 4152064512.0, "45": 4152064512.0, "50": 4152064512.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.37152, "5": 0.10735, "10": 0.10615, "15": 0.10727, "20": 0.10475, "25": 0.10789, "30": 0.10639, "35": 0.1051, "40": 0.10657, "45": 0.10582, "50": 0.1069}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86232, + "9": 10.86355, + "10": 10.82858, + "11": 10.88772, + "12": 10.87148, + "13": 10.87939, + "14": 10.89122, + "15": 10.81926, + "16": 10.83064, + "17": 10.79873, + "18": 10.81769, + "19": 10.8196, + "20": 10.72749, + "21": 10.70555, + "22": 10.56395, + "23": 10.7282, + "24": 10.60841, + "25": 10.55195, + "26": 10.60869, + "27": 10.62878, + "28": 10.5827, + "29": 10.59984, + "30": 10.36504, + "31": 10.12095, + "32": 10.47626, + "33": 10.46908, + "34": 10.22325, + "35": 10.27845, + "36": 10.22879, + "37": 10.35946, + "38": 10.19333, + "39": 10.41585, + "40": 10.09773, + "41": 10.15714, + "42": 10.22441, + "43": 9.8328, + "44": 9.96934, + "45": 9.84203, + "46": 9.83023, + "47": 10.15603, + "48": 9.85506, + "49": 9.54051, + "50": 9.91254 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1744.0, + "9": 1770.0, + "10": 1478.0, + "11": 1879.0, + "12": 1696.0, + "13": 1952.0, + "14": 1732.0, + "15": 1891.0, + "16": 1872.0, + "17": 1737.0, + "18": 1744.0, + "19": 1843.0, + "20": 1639.0, + "21": 1817.0, + "22": 1615.0, + "23": 1960.0, + "24": 1646.0, + "25": 1623.0, + "26": 1671.0, + "27": 1841.0, + "28": 2009.0, + "29": 1956.0, + "30": 1882.0, + "31": 1597.0, + "32": 1921.0, + "33": 2114.0, + "34": 1828.0, + "35": 2043.0, + "36": 1947.0, + "37": 2338.0, + "38": 2227.0, + "39": 2346.0, + "40": 2168.0, + "41": 2204.0, + "42": 2247.0, + "43": 2078.0, + "44": 2064.0, + "45": 2159.0, + "46": 2489.0, + "47": 2497.0, + "48": 2305.0, + "49": 2272.0, + "50": 2319.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3866813952.0, + "2": 4148525568.0, + "3": 4148525568.0, + "4": 4148525568.0, + "5": 4148525568.0, + "6": 4148525568.0, + "7": 4148525568.0, + "8": 4148525568.0, + "9": 4148525568.0, + "10": 4148525568.0, + "11": 4148525568.0, + "12": 4148525568.0, + "13": 4148525568.0, + "14": 4148525568.0, + "15": 4148525568.0, + "16": 4148525568.0, + "17": 4148525568.0, + "18": 4148525568.0, + "19": 4148525568.0, + "20": 4148525568.0, + "21": 4148525568.0, + "22": 4148525568.0, + "23": 4148525568.0, + "24": 4148525568.0, + "25": 4148525568.0, + "26": 4148525568.0, + "27": 4148525568.0, + "28": 4148525568.0, + "29": 4148525568.0, + "30": 4148525568.0, + "31": 4148525568.0, + "32": 4148525568.0, + "33": 4148525568.0, + "34": 4148525568.0, + "35": 4148525568.0, + "36": 4148525568.0, + "37": 4148525568.0, + "38": 4148525568.0, + "39": 4148525568.0, + "40": 4148525568.0, + "41": 4148525568.0, + "42": 4148525568.0, + "43": 4148525568.0, + "44": 4148525568.0, + "45": 4148525568.0, + "46": 4148525568.0, + "47": 4148525568.0, + "48": 4148525568.0, + "49": 4148525568.0, + "50": 4148525568.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.73497, + "2": 0.13463, + "3": 0.12132, + "4": 0.12121, + "5": 0.12122, + "6": 0.11968, + "7": 0.12077, + "8": 0.12029, + "9": 0.12102, + "10": 0.12242, + "11": 0.12132, + "12": 0.11963, + "13": 0.11976, + "14": 0.12077, + "15": 0.12284, + "16": 0.12192, + "17": 0.12079, + "18": 0.12083, + "19": 0.12289, + "20": 0.12192, + "21": 0.12178, + "22": 0.1217, + "23": 0.1195, + "24": 0.12278, + "25": 0.12076, + "26": 0.11902, + "27": 0.12039, + "28": 0.12124, + "29": 0.12162, + "30": 0.12043, + "31": 0.12129, + "32": 0.11876, + "33": 0.12087, + "34": 0.12139, + "35": 0.11913, + "36": 0.12007, + "37": 0.11949, + "38": 0.12009, + "39": 0.12132, + "40": 0.1201, + "41": 0.12285, + "42": 0.12083, + "43": 0.12338, + "44": 0.12174, + "45": 0.12023, + "46": 0.11927, + "47": 0.11992, + "48": 0.12123, + "49": 0.12216, + "50": 0.11881 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..796e07451cc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86232, + "9": 10.86355, + "10": 10.82858, + "11": 10.88772, + "12": 10.87148, + "13": 10.87939, + "14": 10.89122, + "15": 10.81926, + "16": 10.83064, + "17": 10.79873, + "18": 10.81769, + "19": 10.8196, + "20": 10.72749, + "21": 10.70555, + "22": 10.56395, + "23": 10.7282, + "24": 10.60841, + "25": 10.55195, + "26": 10.60869, + "27": 10.62878, + "28": 10.5827, + "29": 10.59984, + "30": 10.36504, + "31": 10.12095, + "32": 10.47626, + "33": 10.46908, + "34": 10.22325, + "35": 10.27845, + "36": 10.22879, + "37": 10.35946, + "38": 10.19333, + "39": 10.41585, + "40": 10.09773, + "41": 10.15714, + "42": 10.22441, + "43": 9.8328, + "44": 9.96934, + "45": 9.84203, + "46": 9.83023, + "47": 10.15603, + "48": 9.85506, + "49": 9.54051, + "50": 9.91254 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1744.0, + "9": 1770.0, + "10": 1478.0, + "11": 1879.0, + "12": 1696.0, + "13": 1952.0, + "14": 1732.0, + "15": 1891.0, + "16": 1872.0, + "17": 1737.0, + "18": 1744.0, + "19": 1843.0, + "20": 1639.0, + "21": 1817.0, + "22": 1615.0, + "23": 1960.0, + "24": 1646.0, + "25": 1623.0, + "26": 1671.0, + "27": 1841.0, + "28": 2009.0, + "29": 1956.0, + "30": 1882.0, + "31": 1597.0, + "32": 1921.0, + "33": 2114.0, + "34": 1828.0, + "35": 2043.0, + "36": 1947.0, + "37": 2338.0, + "38": 2227.0, + "39": 2346.0, + "40": 2168.0, + "41": 2204.0, + "42": 2247.0, + "43": 2078.0, + "44": 2064.0, + "45": 2159.0, + "46": 2489.0, + "47": 2497.0, + "48": 2305.0, + "49": 2272.0, + "50": 2319.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3866813952.0, + "2": 4148525568.0, + "3": 4148525568.0, + "4": 4148525568.0, + "5": 4148525568.0, + "6": 4148525568.0, + "7": 4148525568.0, + "8": 4148525568.0, + "9": 4148525568.0, + "10": 4148525568.0, + "11": 4148525568.0, + "12": 4148525568.0, + "13": 4148525568.0, + "14": 4148525568.0, + "15": 4148525568.0, + "16": 4148525568.0, + "17": 4148525568.0, + "18": 4148525568.0, + "19": 4148525568.0, + "20": 4148525568.0, + "21": 4148525568.0, + "22": 4148525568.0, + "23": 4148525568.0, + "24": 4148525568.0, + "25": 4148525568.0, + "26": 4148525568.0, + "27": 4148525568.0, + "28": 4148525568.0, + "29": 4148525568.0, + "30": 4148525568.0, + "31": 4148525568.0, + "32": 4148525568.0, + "33": 4148525568.0, + "34": 4148525568.0, + "35": 4148525568.0, + "36": 4148525568.0, + "37": 4148525568.0, + "38": 4148525568.0, + "39": 4148525568.0, + "40": 4148525568.0, + "41": 4148525568.0, + "42": 4148525568.0, + "43": 4148525568.0, + "44": 4148525568.0, + "45": 4148525568.0, + "46": 4148525568.0, + "47": 4148525568.0, + "48": 4148525568.0, + "49": 4148525568.0, + "50": 4148525568.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.82235, + "2": 0.15582, + "3": 0.10905, + "4": 0.1073, + "5": 0.109, + "6": 0.10732, + "7": 0.10878, + "8": 0.11223, + "9": 0.10518, + "10": 0.10855, + "11": 0.11135, + "12": 0.10511, + "13": 0.1065, + "14": 0.10507, + "15": 0.10485, + "16": 0.10494, + "17": 0.10498, + "18": 0.10434, + "19": 0.10497, + "20": 0.10409, + "21": 0.10596, + "22": 0.10798, + "23": 0.10596, + "24": 0.10493, + "25": 0.10426, + "26": 0.10473, + "27": 0.10393, + "28": 0.10415, + "29": 0.10372, + "30": 0.10375, + "31": 0.10526, + "32": 0.10354, + "33": 0.10378, + "34": 0.10407, + "35": 0.10415, + "36": 0.10637, + "37": 0.10889, + "38": 0.10823, + "39": 0.10551, + "40": 0.10613, + "41": 0.10424, + "42": 0.10385, + "43": 0.10519, + "44": 0.1044, + "45": 0.10488, + "46": 0.10678, + "47": 0.10342, + "48": 0.10517, + "49": 0.10469, + "50": 0.10438 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b5d55ac433c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86232, + "9": 10.86355, + "10": 10.82858, + "11": 10.88772, + "12": 10.87148, + "13": 10.87939, + "14": 10.89122, + "15": 10.81926, + "16": 10.83064, + "17": 10.79873, + "18": 10.81769, + "19": 10.8196, + "20": 10.72749, + "21": 10.70555, + "22": 10.56395, + "23": 10.7282, + "24": 10.60841, + "25": 10.55195, + "26": 10.60869, + "27": 10.62878, + "28": 10.5827, + "29": 10.59984, + "30": 10.36504, + "31": 10.12095, + "32": 10.47626, + "33": 10.46908, + "34": 10.22325, + "35": 10.27845, + "36": 10.22879, + "37": 10.35946, + "38": 10.19333, + "39": 10.41585, + "40": 10.09773, + "41": 10.15714, + "42": 10.22441, + "43": 9.8328, + "44": 9.96934, + "45": 9.84203, + "46": 9.83023, + "47": 10.15603, + "48": 9.85506, + "49": 9.54051, + "50": 9.91254 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1744.0, + "9": 1770.0, + "10": 1478.0, + "11": 1879.0, + "12": 1696.0, + "13": 1952.0, + "14": 1732.0, + "15": 1891.0, + "16": 1872.0, + "17": 1737.0, + "18": 1744.0, + "19": 1843.0, + "20": 1639.0, + "21": 1817.0, + "22": 1615.0, + "23": 1960.0, + "24": 1646.0, + "25": 1623.0, + "26": 1671.0, + "27": 1841.0, + "28": 2009.0, + "29": 1956.0, + "30": 1882.0, + "31": 1597.0, + "32": 1921.0, + "33": 2114.0, + "34": 1828.0, + "35": 2043.0, + "36": 1947.0, + "37": 2338.0, + "38": 2227.0, + "39": 2346.0, + "40": 2168.0, + "41": 2204.0, + "42": 2247.0, + "43": 2078.0, + "44": 2064.0, + "45": 2159.0, + "46": 2489.0, + "47": 2497.0, + "48": 2305.0, + "49": 2272.0, + "50": 2319.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3866813952.0, + "2": 4148525568.0, + "3": 4148525568.0, + "4": 4148525568.0, + "5": 4148525568.0, + "6": 4148525568.0, + "7": 4148525568.0, + "8": 4148525568.0, + "9": 4148525568.0, + "10": 4148525568.0, + "11": 4148525568.0, + "12": 4148525568.0, + "13": 4148525568.0, + "14": 4148525568.0, + "15": 4148525568.0, + "16": 4148525568.0, + "17": 4148525568.0, + "18": 4148525568.0, + "19": 4148525568.0, + "20": 4148525568.0, + "21": 4148525568.0, + "22": 4148525568.0, + "23": 4148525568.0, + "24": 4148525568.0, + "25": 4148525568.0, + "26": 4148525568.0, + "27": 4148525568.0, + "28": 4148525568.0, + "29": 4148525568.0, + "30": 4148525568.0, + "31": 4148525568.0, + "32": 4148525568.0, + "33": 4148525568.0, + "34": 4148525568.0, + "35": 4148525568.0, + "36": 4148525568.0, + "37": 4148525568.0, + "38": 4148525568.0, + "39": 4148525568.0, + "40": 4148525568.0, + "41": 4148525568.0, + "42": 4148525568.0, + "43": 4148525568.0, + "44": 4148525568.0, + "45": 4148525568.0, + "46": 4148525568.0, + "47": 4148525568.0, + "48": 4148525568.0, + "49": 4148525568.0, + "50": 4148525568.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.80183, + "2": 0.14507, + "3": 0.13423, + "4": 0.12539, + "5": 0.12233, + "6": 0.12325, + "7": 0.12437, + "8": 0.12453, + "9": 0.12348, + "10": 0.12305, + "11": 0.12491, + "12": 0.12346, + "13": 0.1234, + "14": 0.12145, + "15": 0.12227, + "16": 0.12254, + "17": 0.12422, + "18": 0.12237, + "19": 0.12342, + "20": 0.1219, + "21": 0.1212, + "22": 0.12243, + "23": 0.11962, + "24": 0.1224, + "25": 0.12155, + "26": 0.12253, + "27": 0.12095, + "28": 0.12035, + "29": 0.12115, + "30": 0.11898, + "31": 0.12063, + "32": 0.1189, + "33": 0.12106, + "34": 0.11766, + "35": 0.11962, + "36": 0.12112, + "37": 0.11847, + "38": 0.11727, + "39": 0.11905, + "40": 0.11887, + "41": 0.11948, + "42": 0.11832, + "43": 0.11858, + "44": 0.1186, + "45": 0.12057, + "46": 0.1186, + "47": 0.12097, + "48": 0.11934, + "49": 0.11972, + "50": 0.12006 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..ed32255e786 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81005, + "19": 10.79719, + "20": 10.69211, + "21": 10.68164, + "22": 10.52085, + "23": 10.70893, + "24": 10.57599, + "25": 10.52412, + "26": 10.59517, + "27": 10.58426, + "28": 10.56233, + "29": 10.57013, + "30": 10.34552, + "31": 10.10049, + "32": 10.45378, + "33": 10.44627, + "34": 10.20606, + "35": 10.26239, + "36": 10.21239, + "37": 10.32522, + "38": 10.16777, + "39": 10.38334, + "40": 10.07241, + "41": 10.13863, + "42": 10.19814, + "43": 9.81073, + "44": 9.93244, + "45": 9.81101, + "46": 9.80877, + "47": 10.12608, + "48": 9.82108, + "49": 9.50625, + "50": 9.88422 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1752.0, + "18": 1669.0, + "19": 1722.0, + "20": 1601.0, + "21": 1900.0, + "22": 1662.0, + "23": 2006.0, + "24": 1597.0, + "25": 1635.0, + "26": 1709.0, + "27": 1931.0, + "28": 2043.0, + "29": 1888.0, + "30": 1936.0, + "31": 1550.0, + "32": 1913.0, + "33": 2135.0, + "34": 1703.0, + "35": 1908.0, + "36": 1953.0, + "37": 2291.0, + "38": 2210.0, + "39": 2334.0, + "40": 2100.0, + "41": 2300.0, + "42": 2236.0, + "43": 1897.0, + "44": 1993.0, + "45": 2098.0, + "46": 2298.0, + "47": 2504.0, + "48": 2356.0, + "49": 2268.0, + "50": 2333.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3837453312.0, + "2": 4119164928.0, + "3": 4119164928.0, + "4": 4119164928.0, + "5": 4119164928.0, + "6": 4119164928.0, + "7": 4119164928.0, + "8": 4119164928.0, + "9": 4119164928.0, + "10": 4119164928.0, + "11": 4119164928.0, + "12": 4119164928.0, + "13": 4119164928.0, + "14": 4119164928.0, + "15": 4119164928.0, + "16": 4119164928.0, + "17": 4119164928.0, + "18": 4119164928.0, + "19": 4119164928.0, + "20": 4119164928.0, + "21": 4119164928.0, + "22": 4119164928.0, + "23": 4119164928.0, + "24": 4119164928.0, + "25": 4119164928.0, + "26": 4119164928.0, + "27": 4119164928.0, + "28": 4119164928.0, + "29": 4119164928.0, + "30": 4119164928.0, + "31": 4119164928.0, + "32": 4119164928.0, + "33": 4119164928.0, + "34": 4119164928.0, + "35": 4119164928.0, + "36": 4119164928.0, + "37": 4119164928.0, + "38": 4119164928.0, + "39": 4119164928.0, + "40": 4119164928.0, + "41": 4119164928.0, + "42": 4119164928.0, + "43": 4119164928.0, + "44": 4119164928.0, + "45": 4119164928.0, + "46": 4119164928.0, + "47": 4119164928.0, + "48": 4119164928.0, + "49": 4119164928.0, + "50": 4119164928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21.82644, + "2": 0.19908, + "3": 0.17208, + "4": 0.17348, + "5": 0.40692, + "6": 0.17348, + "7": 0.17221, + "8": 0.17282, + "9": 0.17343, + "10": 0.17259, + "11": 0.44574, + "12": 0.17197, + "13": 0.17235, + "14": 0.17135, + "15": 0.17217, + "16": 0.17214, + "17": 0.17346, + "18": 0.17055, + "19": 0.17076, + "20": 0.17071, + "21": 0.17349, + "22": 0.17417, + "23": 0.16998, + "24": 0.17303, + "25": 0.17019, + "26": 0.16905, + "27": 0.16967, + "28": 0.17087, + "29": 0.16779, + "30": 0.16786, + "31": 0.1689, + "32": 0.16672, + "33": 0.1672, + "34": 0.16926, + "35": 0.16914, + "36": 0.16747, + "37": 0.16765, + "38": 0.16682, + "39": 0.1667, + "40": 0.16914, + "41": 0.16662, + "42": 0.16688, + "43": 0.16639, + "44": 0.16515, + "45": 0.16517, + "46": 0.16701, + "47": 0.16705, + "48": 0.16627, + "49": 0.16652, + "50": 0.16472 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..13f8dfbd7e8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81005, + "19": 10.79719, + "20": 10.69211, + "21": 10.68164, + "22": 10.52085, + "23": 10.70893, + "24": 10.57599, + "25": 10.52412, + "26": 10.59517, + "27": 10.58426, + "28": 10.56233, + "29": 10.57013, + "30": 10.34552, + "31": 10.10049, + "32": 10.45378, + "33": 10.44627, + "34": 10.20606, + "35": 10.26239, + "36": 10.21239, + "37": 10.32522, + "38": 10.16777, + "39": 10.38334, + "40": 10.07241, + "41": 10.13863, + "42": 10.19814, + "43": 9.81073, + "44": 9.93244, + "45": 9.81101, + "46": 9.80877, + "47": 10.12608, + "48": 9.82108, + "49": 9.50625, + "50": 9.88422 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1752.0, + "18": 1669.0, + "19": 1722.0, + "20": 1601.0, + "21": 1900.0, + "22": 1662.0, + "23": 2006.0, + "24": 1597.0, + "25": 1635.0, + "26": 1709.0, + "27": 1931.0, + "28": 2043.0, + "29": 1888.0, + "30": 1936.0, + "31": 1550.0, + "32": 1913.0, + "33": 2135.0, + "34": 1703.0, + "35": 1908.0, + "36": 1953.0, + "37": 2291.0, + "38": 2210.0, + "39": 2334.0, + "40": 2100.0, + "41": 2300.0, + "42": 2236.0, + "43": 1897.0, + "44": 1993.0, + "45": 2098.0, + "46": 2298.0, + "47": 2504.0, + "48": 2356.0, + "49": 2268.0, + "50": 2333.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3837453312.0, + "2": 4119164928.0, + "3": 4119164928.0, + "4": 4119164928.0, + "5": 4119164928.0, + "6": 4119164928.0, + "7": 4119164928.0, + "8": 4119164928.0, + "9": 4119164928.0, + "10": 4119164928.0, + "11": 4119164928.0, + "12": 4119164928.0, + "13": 4119164928.0, + "14": 4119164928.0, + "15": 4119164928.0, + "16": 4119164928.0, + "17": 4119164928.0, + "18": 4119164928.0, + "19": 4119164928.0, + "20": 4119164928.0, + "21": 4119164928.0, + "22": 4119164928.0, + "23": 4119164928.0, + "24": 4119164928.0, + "25": 4119164928.0, + "26": 4119164928.0, + "27": 4119164928.0, + "28": 4119164928.0, + "29": 4119164928.0, + "30": 4119164928.0, + "31": 4119164928.0, + "32": 4119164928.0, + "33": 4119164928.0, + "34": 4119164928.0, + "35": 4119164928.0, + "36": 4119164928.0, + "37": 4119164928.0, + "38": 4119164928.0, + "39": 4119164928.0, + "40": 4119164928.0, + "41": 4119164928.0, + "42": 4119164928.0, + "43": 4119164928.0, + "44": 4119164928.0, + "45": 4119164928.0, + "46": 4119164928.0, + "47": 4119164928.0, + "48": 4119164928.0, + "49": 4119164928.0, + "50": 4119164928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.01426, + "2": 0.19331, + "3": 0.17686, + "4": 0.17351, + "5": 0.17409, + "6": 0.39233, + "7": 0.17062, + "8": 0.17244, + "9": 0.1721, + "10": 0.1728, + "11": 0.16853, + "12": 0.16766, + "13": 0.45674, + "14": 0.17028, + "15": 0.16973, + "16": 0.16893, + "17": 0.16884, + "18": 0.17013, + "19": 0.16961, + "20": 0.17167, + "21": 0.1673, + "22": 0.16984, + "23": 0.17183, + "24": 0.17023, + "25": 0.16914, + "26": 0.16981, + "27": 0.1674, + "28": 0.16751, + "29": 0.16693, + "30": 0.16857, + "31": 0.16737, + "32": 0.16785, + "33": 0.16718, + "34": 0.16686, + "35": 0.16592, + "36": 0.16924, + "37": 0.16753, + "38": 0.16813, + "39": 0.16663, + "40": 0.22514, + "41": 0.16853, + "42": 0.17036, + "43": 0.16917, + "44": 0.167, + "45": 0.16766, + "46": 0.167, + "47": 0.16654, + "48": 0.16869, + "49": 0.16681, + "50": 0.16794 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index a8768535dbb..f88bc4dbaad 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84523, "5": 10.87428, "10": 10.82859, "15": 10.81927, "20": 10.72749, "25": 10.55198, "30": 10.36511, "35": 10.27848, "40": 10.09773, "45": 9.84205, "50": 9.91258}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1725.0, "5": 1834.0, "10": 1459.0, "15": 1886.0, "20": 1649.0, "25": 1647.0, "30": 1964.0, "35": 2017.0, "40": 2207.0, "45": 2164.0, "50": 2224.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 551155200.0, "5": 551155200.0, "10": 551155200.0, "15": 551155200.0, "20": 551155200.0, "25": 551155200.0, "30": 551155200.0, "35": 551155200.0, "40": 551155200.0, "45": 551155200.0, "50": 551155200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3798206976.0, "5": 3940916736.0, "10": 3940916736.0, "15": 3940916736.0, "20": 3940916736.0, "25": 3940916736.0, "30": 3940916736.0, "35": 3940916736.0, "40": 3940916736.0, "45": 3940916736.0, "50": 3940916736.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 17.08492, "5": 0.11359, "10": 0.11447, "15": 0.11042, "20": 0.1105, "25": 0.11485, "30": 0.11374, "35": 0.1115, "40": 0.10857, "45": 0.11114, "50": 0.10673}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86235, + "9": 10.86352, + "10": 10.82859, + "11": 10.88772, + "12": 10.87148, + "13": 10.87938, + "14": 10.89123, + "15": 10.81927, + "16": 10.83063, + "17": 10.79878, + "18": 10.81771, + "19": 10.81957, + "20": 10.72749, + "21": 10.70552, + "22": 10.56396, + "23": 10.72823, + "24": 10.60839, + "25": 10.55198, + "26": 10.60868, + "27": 10.62879, + "28": 10.58271, + "29": 10.59982, + "30": 10.36511, + "31": 10.12096, + "32": 10.47628, + "33": 10.46906, + "34": 10.22326, + "35": 10.27848, + "36": 10.22883, + "37": 10.35947, + "38": 10.19331, + "39": 10.41586, + "40": 10.09773, + "41": 10.15718, + "42": 10.22441, + "43": 9.83281, + "44": 9.96935, + "45": 9.84205, + "46": 9.83017, + "47": 10.15602, + "48": 9.85503, + "49": 9.54049, + "50": 9.91258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1737.0, + "9": 1779.0, + "10": 1459.0, + "11": 1898.0, + "12": 1661.0, + "13": 1860.0, + "14": 1764.0, + "15": 1886.0, + "16": 1916.0, + "17": 1773.0, + "18": 1702.0, + "19": 1742.0, + "20": 1649.0, + "21": 1899.0, + "22": 1631.0, + "23": 1960.0, + "24": 1570.0, + "25": 1647.0, + "26": 1649.0, + "27": 1811.0, + "28": 1930.0, + "29": 1910.0, + "30": 1964.0, + "31": 1536.0, + "32": 1873.0, + "33": 2191.0, + "34": 1838.0, + "35": 2017.0, + "36": 1916.0, + "37": 2345.0, + "38": 2247.0, + "39": 2374.0, + "40": 2207.0, + "41": 2246.0, + "42": 2291.0, + "43": 2027.0, + "44": 2147.0, + "45": 2164.0, + "46": 2300.0, + "47": 2418.0, + "48": 2467.0, + "49": 2255.0, + "50": 2224.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552054272.0, + "2": 552054272.0, + "3": 552054272.0, + "4": 552054272.0, + "5": 552054272.0, + "6": 552054272.0, + "7": 552054272.0, + "8": 552054272.0, + "9": 552054272.0, + "10": 552054272.0, + "11": 552054272.0, + "12": 552054272.0, + "13": 552054272.0, + "14": 552054272.0, + "15": 552054272.0, + "16": 552054272.0, + "17": 552054272.0, + "18": 552054272.0, + "19": 552054272.0, + "20": 552054272.0, + "21": 552054272.0, + "22": 552054272.0, + "23": 552054272.0, + "24": 552054272.0, + "25": 552054272.0, + "26": 552054272.0, + "27": 552054272.0, + "28": 552054272.0, + "29": 552054272.0, + "30": 552054272.0, + "31": 552054272.0, + "32": 552054272.0, + "33": 552054272.0, + "34": 552054272.0, + "35": 552054272.0, + "36": 552054272.0, + "37": 552054272.0, + "38": 552054272.0, + "39": 552054272.0, + "40": 552054272.0, + "41": 552054272.0, + "42": 552054272.0, + "43": 552054272.0, + "44": 552054272.0, + "45": 552054272.0, + "46": 552054272.0, + "47": 552054272.0, + "48": 552054272.0, + "49": 552054272.0, + "50": 552054272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3798206976.0, + "2": 3940899328.0, + "3": 3940899328.0, + "4": 3940899328.0, + "5": 3940899328.0, + "6": 3940899328.0, + "7": 3940899328.0, + "8": 3940899328.0, + "9": 3940899328.0, + "10": 3940899328.0, + "11": 3940899328.0, + "12": 3940899328.0, + "13": 3940899328.0, + "14": 3940899328.0, + "15": 3940899328.0, + "16": 3940899328.0, + "17": 3940899328.0, + "18": 3940899328.0, + "19": 3940899328.0, + "20": 3940899328.0, + "21": 3940899328.0, + "22": 3940899328.0, + "23": 3940899328.0, + "24": 3940899328.0, + "25": 3940899328.0, + "26": 3940899328.0, + "27": 3940899328.0, + "28": 3940899328.0, + "29": 3940899328.0, + "30": 3940899328.0, + "31": 3940899328.0, + "32": 3940899328.0, + "33": 3940899328.0, + "34": 3940899328.0, + "35": 3940899328.0, + "36": 3940899328.0, + "37": 3940899328.0, + "38": 3940899328.0, + "39": 3940899328.0, + "40": 3940899328.0, + "41": 3940899328.0, + "42": 3940899328.0, + "43": 3940899328.0, + "44": 3940899328.0, + "45": 3940899328.0, + "46": 3940899328.0, + "47": 3940899328.0, + "48": 3940899328.0, + "49": 3940899328.0, + "50": 3940899328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.77378, + "2": 0.15884, + "3": 0.14867, + "4": 0.12729, + "5": 0.12441, + "6": 0.12501, + "7": 0.12396, + "8": 0.12217, + "9": 0.12636, + "10": 0.12685, + "11": 0.28489, + "12": 0.1228, + "13": 0.12284, + "14": 0.12293, + "15": 0.12456, + "16": 0.12522, + "17": 0.12575, + "18": 0.12506, + "19": 0.12636, + "20": 0.12549, + "21": 0.28282, + "22": 0.12596, + "23": 0.12451, + "24": 0.12852, + "25": 0.12585, + "26": 0.1249, + "27": 0.12809, + "28": 0.12564, + "29": 0.12685, + "30": 0.12691, + "31": 0.29536, + "32": 0.12574, + "33": 0.12648, + "34": 0.12772, + "35": 0.12732, + "36": 0.12522, + "37": 0.12739, + "38": 0.12791, + "39": 0.12659, + "40": 0.12766, + "41": 0.28835, + "42": 0.12796, + "43": 0.12957, + "44": 0.12516, + "45": 0.12485, + "46": 0.12641, + "47": 0.12384, + "48": 0.12562, + "49": 0.12302, + "50": 0.12604 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..24a2e339e46 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86235, + "9": 10.86352, + "10": 10.82859, + "11": 10.88772, + "12": 10.87148, + "13": 10.87938, + "14": 10.89123, + "15": 10.81927, + "16": 10.83063, + "17": 10.79878, + "18": 10.81771, + "19": 10.81957, + "20": 10.72749, + "21": 10.70552, + "22": 10.56396, + "23": 10.72823, + "24": 10.60839, + "25": 10.55198, + "26": 10.60868, + "27": 10.62879, + "28": 10.58271, + "29": 10.59982, + "30": 10.36511, + "31": 10.12096, + "32": 10.47628, + "33": 10.46906, + "34": 10.22326, + "35": 10.27848, + "36": 10.22883, + "37": 10.35947, + "38": 10.19331, + "39": 10.41586, + "40": 10.09773, + "41": 10.15718, + "42": 10.22441, + "43": 9.83281, + "44": 9.96935, + "45": 9.84205, + "46": 9.83017, + "47": 10.15602, + "48": 9.85503, + "49": 9.54049, + "50": 9.91258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1737.0, + "9": 1779.0, + "10": 1459.0, + "11": 1898.0, + "12": 1661.0, + "13": 1860.0, + "14": 1764.0, + "15": 1886.0, + "16": 1916.0, + "17": 1773.0, + "18": 1702.0, + "19": 1742.0, + "20": 1649.0, + "21": 1899.0, + "22": 1631.0, + "23": 1960.0, + "24": 1570.0, + "25": 1647.0, + "26": 1649.0, + "27": 1811.0, + "28": 1930.0, + "29": 1910.0, + "30": 1964.0, + "31": 1536.0, + "32": 1873.0, + "33": 2191.0, + "34": 1838.0, + "35": 2017.0, + "36": 1916.0, + "37": 2345.0, + "38": 2247.0, + "39": 2374.0, + "40": 2207.0, + "41": 2246.0, + "42": 2291.0, + "43": 2027.0, + "44": 2147.0, + "45": 2164.0, + "46": 2300.0, + "47": 2418.0, + "48": 2467.0, + "49": 2255.0, + "50": 2224.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552054272.0, + "2": 552054272.0, + "3": 552054272.0, + "4": 552054272.0, + "5": 552054272.0, + "6": 552054272.0, + "7": 552054272.0, + "8": 552054272.0, + "9": 552054272.0, + "10": 552054272.0, + "11": 552054272.0, + "12": 552054272.0, + "13": 552054272.0, + "14": 552054272.0, + "15": 552054272.0, + "16": 552054272.0, + "17": 552054272.0, + "18": 552054272.0, + "19": 552054272.0, + "20": 552054272.0, + "21": 552054272.0, + "22": 552054272.0, + "23": 552054272.0, + "24": 552054272.0, + "25": 552054272.0, + "26": 552054272.0, + "27": 552054272.0, + "28": 552054272.0, + "29": 552054272.0, + "30": 552054272.0, + "31": 552054272.0, + "32": 552054272.0, + "33": 552054272.0, + "34": 552054272.0, + "35": 552054272.0, + "36": 552054272.0, + "37": 552054272.0, + "38": 552054272.0, + "39": 552054272.0, + "40": 552054272.0, + "41": 552054272.0, + "42": 552054272.0, + "43": 552054272.0, + "44": 552054272.0, + "45": 552054272.0, + "46": 552054272.0, + "47": 552054272.0, + "48": 552054272.0, + "49": 552054272.0, + "50": 552054272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3798206976.0, + "2": 3940899328.0, + "3": 3940899328.0, + "4": 3940899328.0, + "5": 3940899328.0, + "6": 3940899328.0, + "7": 3940899328.0, + "8": 3940899328.0, + "9": 3940899328.0, + "10": 3940899328.0, + "11": 3940899328.0, + "12": 3940899328.0, + "13": 3940899328.0, + "14": 3940899328.0, + "15": 3940899328.0, + "16": 3940899328.0, + "17": 3940899328.0, + "18": 3940899328.0, + "19": 3940899328.0, + "20": 3940899328.0, + "21": 3940899328.0, + "22": 3940899328.0, + "23": 3940899328.0, + "24": 3940899328.0, + "25": 3940899328.0, + "26": 3940899328.0, + "27": 3940899328.0, + "28": 3940899328.0, + "29": 3940899328.0, + "30": 3940899328.0, + "31": 3940899328.0, + "32": 3940899328.0, + "33": 3940899328.0, + "34": 3940899328.0, + "35": 3940899328.0, + "36": 3940899328.0, + "37": 3940899328.0, + "38": 3940899328.0, + "39": 3940899328.0, + "40": 3940899328.0, + "41": 3940899328.0, + "42": 3940899328.0, + "43": 3940899328.0, + "44": 3940899328.0, + "45": 3940899328.0, + "46": 3940899328.0, + "47": 3940899328.0, + "48": 3940899328.0, + "49": 3940899328.0, + "50": 3940899328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.59634, + "2": 0.14856, + "3": 0.11161, + "4": 0.11302, + "5": 0.11107, + "6": 0.1136, + "7": 0.11041, + "8": 0.10987, + "9": 0.10957, + "10": 0.11046, + "11": 0.24569, + "12": 0.11057, + "13": 0.11113, + "14": 0.10972, + "15": 0.10919, + "16": 0.10934, + "17": 0.11, + "18": 0.11335, + "19": 0.11254, + "20": 0.11141, + "21": 0.24662, + "22": 0.11244, + "23": 0.11141, + "24": 0.11252, + "25": 0.11118, + "26": 0.11137, + "27": 0.1105, + "28": 0.11086, + "29": 0.11045, + "30": 0.11129, + "31": 0.24072, + "32": 0.11093, + "33": 0.11087, + "34": 0.11452, + "35": 0.12015, + "36": 0.11133, + "37": 0.1109, + "38": 0.11245, + "39": 0.11262, + "40": 0.11211, + "41": 0.23988, + "42": 0.11163, + "43": 0.11285, + "44": 0.1115, + "45": 0.1137, + "46": 0.11213, + "47": 0.11057, + "48": 0.11163, + "49": 0.11229, + "50": 0.11164 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..5e069163f6c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86235, + "9": 10.86352, + "10": 10.82859, + "11": 10.88772, + "12": 10.87148, + "13": 10.87938, + "14": 10.89123, + "15": 10.81927, + "16": 10.83063, + "17": 10.79878, + "18": 10.81771, + "19": 10.81957, + "20": 10.72749, + "21": 10.70552, + "22": 10.56396, + "23": 10.72823, + "24": 10.60839, + "25": 10.55198, + "26": 10.60868, + "27": 10.62879, + "28": 10.58271, + "29": 10.59982, + "30": 10.36511, + "31": 10.12096, + "32": 10.47628, + "33": 10.46906, + "34": 10.22326, + "35": 10.27848, + "36": 10.22883, + "37": 10.35947, + "38": 10.19331, + "39": 10.41586, + "40": 10.09773, + "41": 10.15718, + "42": 10.22441, + "43": 9.83281, + "44": 9.96935, + "45": 9.84205, + "46": 9.83017, + "47": 10.15602, + "48": 9.85503, + "49": 9.54049, + "50": 9.91258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1737.0, + "9": 1779.0, + "10": 1459.0, + "11": 1898.0, + "12": 1661.0, + "13": 1860.0, + "14": 1764.0, + "15": 1886.0, + "16": 1916.0, + "17": 1773.0, + "18": 1702.0, + "19": 1742.0, + "20": 1649.0, + "21": 1899.0, + "22": 1631.0, + "23": 1960.0, + "24": 1570.0, + "25": 1647.0, + "26": 1649.0, + "27": 1811.0, + "28": 1930.0, + "29": 1910.0, + "30": 1964.0, + "31": 1536.0, + "32": 1873.0, + "33": 2191.0, + "34": 1838.0, + "35": 2017.0, + "36": 1916.0, + "37": 2345.0, + "38": 2247.0, + "39": 2374.0, + "40": 2207.0, + "41": 2246.0, + "42": 2291.0, + "43": 2027.0, + "44": 2147.0, + "45": 2164.0, + "46": 2300.0, + "47": 2418.0, + "48": 2467.0, + "49": 2255.0, + "50": 2224.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552054272.0, + "2": 552054272.0, + "3": 552054272.0, + "4": 552054272.0, + "5": 552054272.0, + "6": 552054272.0, + "7": 552054272.0, + "8": 552054272.0, + "9": 552054272.0, + "10": 552054272.0, + "11": 552054272.0, + "12": 552054272.0, + "13": 552054272.0, + "14": 552054272.0, + "15": 552054272.0, + "16": 552054272.0, + "17": 552054272.0, + "18": 552054272.0, + "19": 552054272.0, + "20": 552054272.0, + "21": 552054272.0, + "22": 552054272.0, + "23": 552054272.0, + "24": 552054272.0, + "25": 552054272.0, + "26": 552054272.0, + "27": 552054272.0, + "28": 552054272.0, + "29": 552054272.0, + "30": 552054272.0, + "31": 552054272.0, + "32": 552054272.0, + "33": 552054272.0, + "34": 552054272.0, + "35": 552054272.0, + "36": 552054272.0, + "37": 552054272.0, + "38": 552054272.0, + "39": 552054272.0, + "40": 552054272.0, + "41": 552054272.0, + "42": 552054272.0, + "43": 552054272.0, + "44": 552054272.0, + "45": 552054272.0, + "46": 552054272.0, + "47": 552054272.0, + "48": 552054272.0, + "49": 552054272.0, + "50": 552054272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3798206976.0, + "2": 3940899328.0, + "3": 3940899328.0, + "4": 3940899328.0, + "5": 3940899328.0, + "6": 3940899328.0, + "7": 3940899328.0, + "8": 3940899328.0, + "9": 3940899328.0, + "10": 3940899328.0, + "11": 3940899328.0, + "12": 3940899328.0, + "13": 3940899328.0, + "14": 3940899328.0, + "15": 3940899328.0, + "16": 3940899328.0, + "17": 3940899328.0, + "18": 3940899328.0, + "19": 3940899328.0, + "20": 3940899328.0, + "21": 3940899328.0, + "22": 3940899328.0, + "23": 3940899328.0, + "24": 3940899328.0, + "25": 3940899328.0, + "26": 3940899328.0, + "27": 3940899328.0, + "28": 3940899328.0, + "29": 3940899328.0, + "30": 3940899328.0, + "31": 3940899328.0, + "32": 3940899328.0, + "33": 3940899328.0, + "34": 3940899328.0, + "35": 3940899328.0, + "36": 3940899328.0, + "37": 3940899328.0, + "38": 3940899328.0, + "39": 3940899328.0, + "40": 3940899328.0, + "41": 3940899328.0, + "42": 3940899328.0, + "43": 3940899328.0, + "44": 3940899328.0, + "45": 3940899328.0, + "46": 3940899328.0, + "47": 3940899328.0, + "48": 3940899328.0, + "49": 3940899328.0, + "50": 3940899328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.65845, + "2": 0.14332, + "3": 0.12833, + "4": 0.12525, + "5": 0.12451, + "6": 0.12488, + "7": 0.12455, + "8": 0.12623, + "9": 0.1249, + "10": 0.127, + "11": 0.29256, + "12": 0.12446, + "13": 0.12388, + "14": 0.12448, + "15": 0.12475, + "16": 0.12507, + "17": 0.12682, + "18": 0.12473, + "19": 0.12569, + "20": 0.12441, + "21": 0.28384, + "22": 0.12554, + "23": 0.12552, + "24": 0.12663, + "25": 0.12441, + "26": 0.12547, + "27": 0.12485, + "28": 0.12492, + "29": 0.12419, + "30": 0.12518, + "31": 0.28416, + "32": 0.12399, + "33": 0.12692, + "34": 0.12606, + "35": 0.12537, + "36": 0.12614, + "37": 0.12484, + "38": 0.12464, + "39": 0.12396, + "40": 0.1239, + "41": 0.28831, + "42": 0.12609, + "43": 0.12537, + "44": 0.12484, + "45": 0.12567, + "46": 0.12791, + "47": 0.12281, + "48": 0.124, + "49": 0.12486, + "50": 0.12585 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..62be0bafcf5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.7923, + "16": 10.7951, + "17": 10.76773, + "18": 10.81002, + "19": 10.79715, + "20": 10.69213, + "21": 10.68165, + "22": 10.52083, + "23": 10.70895, + "24": 10.57597, + "25": 10.5241, + "26": 10.59512, + "27": 10.58424, + "28": 10.56231, + "29": 10.57009, + "30": 10.34556, + "31": 10.10048, + "32": 10.45377, + "33": 10.44632, + "34": 10.20606, + "35": 10.26241, + "36": 10.21241, + "37": 10.32522, + "38": 10.16779, + "39": 10.38327, + "40": 10.07237, + "41": 10.13863, + "42": 10.19814, + "43": 9.81079, + "44": 9.93246, + "45": 9.811, + "46": 9.8088, + "47": 10.12607, + "48": 9.82111, + "49": 9.50627, + "50": 9.88419 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1850.0, + "16": 1754.0, + "17": 1768.0, + "18": 1671.0, + "19": 1715.0, + "20": 1699.0, + "21": 1891.0, + "22": 1794.0, + "23": 1970.0, + "24": 1751.0, + "25": 1614.0, + "26": 1805.0, + "27": 1821.0, + "28": 2042.0, + "29": 2014.0, + "30": 1905.0, + "31": 1658.0, + "32": 1848.0, + "33": 2113.0, + "34": 1678.0, + "35": 1933.0, + "36": 1922.0, + "37": 2309.0, + "38": 2120.0, + "39": 2469.0, + "40": 2169.0, + "41": 2241.0, + "42": 2276.0, + "43": 1937.0, + "44": 2090.0, + "45": 2101.0, + "46": 2282.0, + "47": 2493.0, + "48": 2309.0, + "49": 2250.0, + "50": 2421.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 522346496.0, + "2": 522346496.0, + "3": 522346496.0, + "4": 522346496.0, + "5": 522346496.0, + "6": 522346496.0, + "7": 522346496.0, + "8": 522346496.0, + "9": 522346496.0, + "10": 522346496.0, + "11": 522346496.0, + "12": 522346496.0, + "13": 522346496.0, + "14": 522346496.0, + "15": 522346496.0, + "16": 522346496.0, + "17": 522346496.0, + "18": 522346496.0, + "19": 522346496.0, + "20": 522346496.0, + "21": 522346496.0, + "22": 522346496.0, + "23": 522346496.0, + "24": 522346496.0, + "25": 522346496.0, + "26": 522346496.0, + "27": 522346496.0, + "28": 522346496.0, + "29": 522346496.0, + "30": 522346496.0, + "31": 522346496.0, + "32": 522346496.0, + "33": 522346496.0, + "34": 522346496.0, + "35": 522346496.0, + "36": 522346496.0, + "37": 522346496.0, + "38": 522346496.0, + "39": 522346496.0, + "40": 522346496.0, + "41": 522346496.0, + "42": 522346496.0, + "43": 522346496.0, + "44": 522346496.0, + "45": 522346496.0, + "46": 522346496.0, + "47": 522346496.0, + "48": 522346496.0, + "49": 522346496.0, + "50": 522346496.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3769791488.0, + "2": 3912108032.0, + "3": 3912108032.0, + "4": 3912108032.0, + "5": 3912108032.0, + "6": 3912108032.0, + "7": 3912108032.0, + "8": 3912108032.0, + "9": 3912108032.0, + "10": 3912108032.0, + "11": 3912108032.0, + "12": 3912108032.0, + "13": 3912108032.0, + "14": 3912108032.0, + "15": 3912108032.0, + "16": 3912108032.0, + "17": 3912108032.0, + "18": 3912108032.0, + "19": 3912108032.0, + "20": 3912108032.0, + "21": 3912108032.0, + "22": 3912108032.0, + "23": 3912108032.0, + "24": 3912108032.0, + "25": 3912108032.0, + "26": 3912108032.0, + "27": 3912108032.0, + "28": 3912108032.0, + "29": 3912108032.0, + "30": 3912108032.0, + "31": 3912108032.0, + "32": 3912108032.0, + "33": 3912108032.0, + "34": 3912108032.0, + "35": 3912108032.0, + "36": 3912108032.0, + "37": 3912108032.0, + "38": 3912108032.0, + "39": 3912108032.0, + "40": 3912108032.0, + "41": 3912108032.0, + "42": 3912108032.0, + "43": 3912108032.0, + "44": 3912108032.0, + "45": 3912108032.0, + "46": 3912108032.0, + "47": 3912108032.0, + "48": 3912108032.0, + "49": 3912108032.0, + "50": 3912108032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22.86952, + "2": 0.20661, + "3": 0.18026, + "4": 0.17656, + "5": 0.17996, + "6": 0.17701, + "7": 0.17871, + "8": 0.17528, + "9": 0.17563, + "10": 0.17569, + "11": 0.74111, + "12": 0.17396, + "13": 0.17377, + "14": 0.1738, + "15": 0.17271, + "16": 0.17324, + "17": 0.17404, + "18": 0.17229, + "19": 0.17205, + "20": 0.17274, + "21": 0.30088, + "22": 0.17329, + "23": 0.17535, + "24": 0.17212, + "25": 0.17389, + "26": 0.19974, + "27": 0.19407, + "28": 0.17531, + "29": 0.17514, + "30": 0.17299, + "31": 0.30323, + "32": 0.17369, + "33": 0.17341, + "34": 0.1737, + "35": 0.17388, + "36": 0.17546, + "37": 0.17373, + "38": 0.17505, + "39": 0.17758, + "40": 0.17506, + "41": 0.3082, + "42": 0.17306, + "43": 0.17922, + "44": 0.17678, + "45": 0.17538, + "46": 0.17386, + "47": 0.17387, + "48": 0.17425, + "49": 0.1761, + "50": 0.17415 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..f7a81a7b3e4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.7923, + "16": 10.7951, + "17": 10.76773, + "18": 10.81002, + "19": 10.79715, + "20": 10.69213, + "21": 10.68165, + "22": 10.52083, + "23": 10.70895, + "24": 10.57597, + "25": 10.5241, + "26": 10.59512, + "27": 10.58424, + "28": 10.56231, + "29": 10.57009, + "30": 10.34556, + "31": 10.10048, + "32": 10.45377, + "33": 10.44632, + "34": 10.20606, + "35": 10.26241, + "36": 10.21241, + "37": 10.32522, + "38": 10.16779, + "39": 10.38327, + "40": 10.07237, + "41": 10.13863, + "42": 10.19814, + "43": 9.81079, + "44": 9.93246, + "45": 9.811, + "46": 9.8088, + "47": 10.12607, + "48": 9.82111, + "49": 9.50627, + "50": 9.88419 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1850.0, + "16": 1754.0, + "17": 1768.0, + "18": 1671.0, + "19": 1715.0, + "20": 1699.0, + "21": 1891.0, + "22": 1794.0, + "23": 1970.0, + "24": 1751.0, + "25": 1614.0, + "26": 1805.0, + "27": 1821.0, + "28": 2042.0, + "29": 2014.0, + "30": 1905.0, + "31": 1658.0, + "32": 1848.0, + "33": 2113.0, + "34": 1678.0, + "35": 1933.0, + "36": 1922.0, + "37": 2309.0, + "38": 2120.0, + "39": 2469.0, + "40": 2169.0, + "41": 2241.0, + "42": 2276.0, + "43": 1937.0, + "44": 2090.0, + "45": 2101.0, + "46": 2282.0, + "47": 2493.0, + "48": 2309.0, + "49": 2250.0, + "50": 2421.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 522346496.0, + "2": 522346496.0, + "3": 522346496.0, + "4": 522346496.0, + "5": 522346496.0, + "6": 522346496.0, + "7": 522346496.0, + "8": 522346496.0, + "9": 522346496.0, + "10": 522346496.0, + "11": 522346496.0, + "12": 522346496.0, + "13": 522346496.0, + "14": 522346496.0, + "15": 522346496.0, + "16": 522346496.0, + "17": 522346496.0, + "18": 522346496.0, + "19": 522346496.0, + "20": 522346496.0, + "21": 522346496.0, + "22": 522346496.0, + "23": 522346496.0, + "24": 522346496.0, + "25": 522346496.0, + "26": 522346496.0, + "27": 522346496.0, + "28": 522346496.0, + "29": 522346496.0, + "30": 522346496.0, + "31": 522346496.0, + "32": 522346496.0, + "33": 522346496.0, + "34": 522346496.0, + "35": 522346496.0, + "36": 522346496.0, + "37": 522346496.0, + "38": 522346496.0, + "39": 522346496.0, + "40": 522346496.0, + "41": 522346496.0, + "42": 522346496.0, + "43": 522346496.0, + "44": 522346496.0, + "45": 522346496.0, + "46": 522346496.0, + "47": 522346496.0, + "48": 522346496.0, + "49": 522346496.0, + "50": 522346496.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3769791488.0, + "2": 3912108032.0, + "3": 3912108032.0, + "4": 3912108032.0, + "5": 3912108032.0, + "6": 3912108032.0, + "7": 3912108032.0, + "8": 3912108032.0, + "9": 3912108032.0, + "10": 3912108032.0, + "11": 3912108032.0, + "12": 3912108032.0, + "13": 3912108032.0, + "14": 3912108032.0, + "15": 3912108032.0, + "16": 3912108032.0, + "17": 3912108032.0, + "18": 3912108032.0, + "19": 3912108032.0, + "20": 3912108032.0, + "21": 3912108032.0, + "22": 3912108032.0, + "23": 3912108032.0, + "24": 3912108032.0, + "25": 3912108032.0, + "26": 3912108032.0, + "27": 3912108032.0, + "28": 3912108032.0, + "29": 3912108032.0, + "30": 3912108032.0, + "31": 3912108032.0, + "32": 3912108032.0, + "33": 3912108032.0, + "34": 3912108032.0, + "35": 3912108032.0, + "36": 3912108032.0, + "37": 3912108032.0, + "38": 3912108032.0, + "39": 3912108032.0, + "40": 3912108032.0, + "41": 3912108032.0, + "42": 3912108032.0, + "43": 3912108032.0, + "44": 3912108032.0, + "45": 3912108032.0, + "46": 3912108032.0, + "47": 3912108032.0, + "48": 3912108032.0, + "49": 3912108032.0, + "50": 3912108032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 26.03973, + "2": 0.20991, + "3": 0.18001, + "4": 0.17535, + "5": 0.37487, + "6": 0.17569, + "7": 0.17538, + "8": 0.17644, + "9": 0.17601, + "10": 0.17454, + "11": 0.32086, + "12": 0.17452, + "13": 0.17725, + "14": 0.17806, + "15": 0.17968, + "16": 0.17731, + "17": 0.18214, + "18": 0.17979, + "19": 0.18197, + "20": 0.18282, + "21": 0.31872, + "22": 0.17621, + "23": 0.18154, + "24": 0.17536, + "25": 0.17248, + "26": 0.3922, + "27": 0.17401, + "28": 0.17258, + "29": 0.17486, + "30": 0.17468, + "31": 0.31294, + "32": 0.17218, + "33": 0.17311, + "34": 0.17553, + "35": 0.17239, + "36": 0.17742, + "37": 0.17354, + "38": 0.17694, + "39": 0.17551, + "40": 0.38673, + "41": 0.31702, + "42": 0.17359, + "43": 0.17781, + "44": 0.17499, + "45": 0.17326, + "46": 0.17496, + "47": 0.17486, + "48": 0.17727, + "49": 0.17954, + "50": 0.17661 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 3f5bf549afb..0c1982c8b78 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84517, "5": 10.87422, "10": 10.82907, "15": 10.81973, "20": 10.72685, "25": 10.55128, "30": 10.36566, "35": 10.2744, "40": 10.0956, "45": 9.83425, "50": 9.90532, "55": 9.87297, "60": 9.48861, "65": 8.93435, "70": 9.72364, "75": 9.40392, "80": 9.38215, "85": 9.5893, "90": 9.78202, "95": 9.47913, "100": 9.34982}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1655.0, "5": 1897.0, "10": 1441.0, "15": 1918.0, "20": 1610.0, "25": 1597.0, "30": 1875.0, "35": 2045.0, "40": 2184.0, "45": 2077.0, "50": 2196.0, "55": 2351.0, "60": 2359.0, "65": 2577.0, "70": 3151.0, "75": 2425.0, "80": 3254.0, "85": 3492.0, "90": 3160.0, "95": 3247.0, "100": 3076.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0, "55": 763220480.0, "60": 763220480.0, "65": 763220480.0, "70": 763220480.0, "75": 763220480.0, "80": 763220480.0, "85": 763220480.0, "90": 763220480.0, "95": 763220480.0, "100": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2359490560.0, "5": 2643299328.0, "10": 2643299328.0, "15": 2643299328.0, "20": 2643299328.0, "25": 2643299328.0, "30": 2643299328.0, "35": 2643299328.0, "40": 2643299328.0, "45": 2643299328.0, "50": 2643299328.0, "55": 2643299328.0, "60": 2643299328.0, "65": 2643299328.0, "70": 2643299328.0, "75": 2643299328.0, "80": 2643299328.0, "85": 2643299328.0, "90": 2643299328.0, "95": 2643299328.0, "100": 2643299328.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.65344, "5": 0.0984, "10": 0.10108, "15": 0.09929, "20": 0.10139, "25": 0.09855, "30": 0.10032, "35": 0.09726, "40": 0.09784, "45": 0.09917, "50": 0.09956, "55": 0.10014, "60": 0.10632, "65": 0.09944, "70": 0.09595, "75": 0.09574, "80": 0.09657, "85": 0.10004, "90": 0.0985, "95": 0.10078, "100": 0.09765}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83826, + "5": 10.87422, + "6": 10.89306, + "7": 10.85452, + "8": 10.8626, + "9": 10.86463, + "10": 10.82907, + "11": 10.88787, + "12": 10.87098, + "13": 10.87914, + "14": 10.89069, + "15": 10.81973, + "16": 10.83156, + "17": 10.79863, + "18": 10.81648, + "19": 10.8189, + "20": 10.72685, + "21": 10.70581, + "22": 10.56347, + "23": 10.72794, + "24": 10.60761, + "25": 10.55128, + "26": 10.60749, + "27": 10.6277, + "28": 10.58262, + "29": 10.59959, + "30": 10.36566, + "31": 10.11988, + "32": 10.4755, + "33": 10.46637, + "34": 10.22009, + "35": 10.2744, + "36": 10.22594, + "37": 10.35729, + "38": 10.19156, + "39": 10.41342, + "40": 10.0956, + "41": 10.15511, + "42": 10.22085, + "43": 9.82797, + "44": 9.96276, + "45": 9.83425, + "46": 9.82209, + "47": 10.14765, + "48": 9.84681, + "49": 9.53377, + "50": 9.90532, + "51": 9.85116, + "52": 9.73516, + "53": 10.05863, + "54": 9.94369, + "55": 9.87297, + "56": 9.61703, + "57": 9.4675, + "58": 9.82223, + "59": 9.57338, + "60": 9.48861, + "61": 9.67921, + "62": 9.97513, + "63": 9.37045, + "64": 9.76643, + "65": 8.93435, + "66": 9.69463, + "67": 9.35357, + "68": 9.76826, + "69": 9.77682, + "70": 9.72364, + "71": 9.59895, + "72": 9.56454, + "73": 9.48327, + "74": 8.92062, + "75": 9.40392, + "76": 9.05301, + "77": 10.04175, + "78": 9.69879, + "79": 9.35128, + "80": 9.38215, + "81": 9.45866, + "82": 9.67518, + "83": 9.28411, + "84": 9.39313, + "85": 9.5893, + "86": 9.05182, + "87": 9.56419, + "88": 9.71756, + "89": 9.57129, + "90": 9.78202, + "91": 9.3061, + "92": 9.32048, + "93": 9.03942, + "94": 8.79522, + "95": 9.47913, + "96": 9.48454, + "97": 9.2699, + "98": 9.62563, + "99": 8.84255, + "100": 9.34982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1773.0, + "4": 1782.0, + "5": 1897.0, + "6": 1802.0, + "7": 1874.0, + "8": 1653.0, + "9": 1814.0, + "10": 1441.0, + "11": 1909.0, + "12": 1645.0, + "13": 1931.0, + "14": 1678.0, + "15": 1918.0, + "16": 1961.0, + "17": 1711.0, + "18": 1658.0, + "19": 1791.0, + "20": 1610.0, + "21": 1815.0, + "22": 1677.0, + "23": 1952.0, + "24": 1612.0, + "25": 1597.0, + "26": 1657.0, + "27": 1850.0, + "28": 2013.0, + "29": 1966.0, + "30": 1875.0, + "31": 1585.0, + "32": 1941.0, + "33": 2085.0, + "34": 1837.0, + "35": 2045.0, + "36": 1898.0, + "37": 2333.0, + "38": 2247.0, + "39": 2266.0, + "40": 2184.0, + "41": 2209.0, + "42": 2164.0, + "43": 2076.0, + "44": 2169.0, + "45": 2077.0, + "46": 2325.0, + "47": 2505.0, + "48": 2442.0, + "49": 2205.0, + "50": 2196.0, + "51": 2500.0, + "52": 2572.0, + "53": 2905.0, + "54": 2794.0, + "55": 2351.0, + "56": 2606.0, + "57": 2388.0, + "58": 2864.0, + "59": 2726.0, + "60": 2359.0, + "61": 2915.0, + "62": 2610.0, + "63": 2397.0, + "64": 2886.0, + "65": 2577.0, + "66": 2913.0, + "67": 2715.0, + "68": 2646.0, + "69": 2805.0, + "70": 3151.0, + "71": 2917.0, + "72": 2403.0, + "73": 2948.0, + "74": 1994.0, + "75": 2425.0, + "76": 2898.0, + "77": 3085.0, + "78": 3228.0, + "79": 2981.0, + "80": 3254.0, + "81": 3499.0, + "82": 3121.0, + "83": 2711.0, + "84": 3105.0, + "85": 3492.0, + "86": 2693.0, + "87": 3602.0, + "88": 3052.0, + "89": 3230.0, + "90": 3160.0, + "91": 2647.0, + "92": 3160.0, + "93": 2650.0, + "94": 3430.0, + "95": 3247.0, + "96": 3353.0, + "97": 3064.0, + "98": 3486.0, + "99": 3190.0, + "100": 3076.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0, + "51": 759681536.0, + "52": 759681536.0, + "53": 759681536.0, + "54": 759681536.0, + "55": 759681536.0, + "56": 759681536.0, + "57": 759681536.0, + "58": 759681536.0, + "59": 759681536.0, + "60": 759681536.0, + "61": 759681536.0, + "62": 759681536.0, + "63": 759681536.0, + "64": 759681536.0, + "65": 759681536.0, + "66": 759681536.0, + "67": 759681536.0, + "68": 759681536.0, + "69": 759681536.0, + "70": 759681536.0, + "71": 759681536.0, + "72": 759681536.0, + "73": 759681536.0, + "74": 759681536.0, + "75": 759681536.0, + "76": 759681536.0, + "77": 759681536.0, + "78": 759681536.0, + "79": 759681536.0, + "80": 759681536.0, + "81": 759681536.0, + "82": 759681536.0, + "83": 759681536.0, + "84": 759681536.0, + "85": 759681536.0, + "86": 759681536.0, + "87": 759681536.0, + "88": 759681536.0, + "89": 759681536.0, + "90": 759681536.0, + "91": 759681536.0, + "92": 759681536.0, + "93": 759681536.0, + "94": 759681536.0, + "95": 759681536.0, + "96": 759681536.0, + "97": 759681536.0, + "98": 759681536.0, + "99": 759681536.0, + "100": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2358048768.0, + "2": 2639760384.0, + "3": 2639760384.0, + "4": 2639760384.0, + "5": 2639760384.0, + "6": 2639760384.0, + "7": 2639760384.0, + "8": 2639760384.0, + "9": 2639760384.0, + "10": 2639760384.0, + "11": 2639760384.0, + "12": 2639760384.0, + "13": 2639760384.0, + "14": 2639760384.0, + "15": 2639760384.0, + "16": 2639760384.0, + "17": 2639760384.0, + "18": 2639760384.0, + "19": 2639760384.0, + "20": 2639760384.0, + "21": 2639760384.0, + "22": 2639760384.0, + "23": 2639760384.0, + "24": 2639760384.0, + "25": 2639760384.0, + "26": 2639760384.0, + "27": 2639760384.0, + "28": 2639760384.0, + "29": 2639760384.0, + "30": 2639760384.0, + "31": 2639760384.0, + "32": 2639760384.0, + "33": 2639760384.0, + "34": 2639760384.0, + "35": 2639760384.0, + "36": 2639760384.0, + "37": 2639760384.0, + "38": 2639760384.0, + "39": 2639760384.0, + "40": 2639760384.0, + "41": 2639760384.0, + "42": 2639760384.0, + "43": 2639760384.0, + "44": 2639760384.0, + "45": 2639760384.0, + "46": 2639760384.0, + "47": 2639760384.0, + "48": 2639760384.0, + "49": 2639760384.0, + "50": 2639760384.0, + "51": 2639760384.0, + "52": 2639760384.0, + "53": 2639760384.0, + "54": 2639760384.0, + "55": 2639760384.0, + "56": 2639760384.0, + "57": 2639760384.0, + "58": 2639760384.0, + "59": 2639760384.0, + "60": 2639760384.0, + "61": 2639760384.0, + "62": 2639760384.0, + "63": 2639760384.0, + "64": 2639760384.0, + "65": 2639760384.0, + "66": 2639760384.0, + "67": 2639760384.0, + "68": 2639760384.0, + "69": 2639760384.0, + "70": 2639760384.0, + "71": 2639760384.0, + "72": 2639760384.0, + "73": 2639760384.0, + "74": 2639760384.0, + "75": 2639760384.0, + "76": 2639760384.0, + "77": 2639760384.0, + "78": 2639760384.0, + "79": 2639760384.0, + "80": 2639760384.0, + "81": 2639760384.0, + "82": 2639760384.0, + "83": 2639760384.0, + "84": 2639760384.0, + "85": 2639760384.0, + "86": 2639760384.0, + "87": 2639760384.0, + "88": 2639760384.0, + "89": 2639760384.0, + "90": 2639760384.0, + "91": 2639760384.0, + "92": 2639760384.0, + "93": 2639760384.0, + "94": 2639760384.0, + "95": 2639760384.0, + "96": 2639760384.0, + "97": 2639760384.0, + "98": 2639760384.0, + "99": 2639760384.0, + "100": 2639760384.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.75462, + "2": 0.12782, + "3": 0.11297, + "4": 0.11221, + "5": 0.11226, + "6": 0.11209, + "7": 0.11157, + "8": 0.11109, + "9": 0.11159, + "10": 0.11411, + "11": 0.11336, + "12": 0.10975, + "13": 0.11129, + "14": 0.11016, + "15": 0.11082, + "16": 0.11173, + "17": 0.1107, + "18": 0.113, + "19": 0.11419, + "20": 0.11333, + "21": 0.11169, + "22": 0.11202, + "23": 0.11053, + "24": 0.1123, + "25": 0.11015, + "26": 0.11042, + "27": 0.11289, + "28": 0.11429, + "29": 0.11129, + "30": 0.11046, + "31": 0.11122, + "32": 0.1104, + "33": 0.11073, + "34": 0.11003, + "35": 0.1113, + "36": 0.11176, + "37": 0.11321, + "38": 0.10946, + "39": 0.10923, + "40": 0.10989, + "41": 0.11025, + "42": 0.11059, + "43": 0.11079, + "44": 0.11083, + "45": 0.1125, + "46": 0.11427, + "47": 0.10872, + "48": 0.11101, + "49": 0.10925, + "50": 0.10952, + "51": 0.11025, + "52": 0.11105, + "53": 0.11002, + "54": 0.10971, + "55": 0.11074, + "56": 0.11019, + "57": 0.11283, + "58": 0.11172, + "59": 0.1132, + "60": 0.11512, + "61": 0.11318, + "62": 0.11088, + "63": 0.11201, + "64": 0.10971, + "65": 0.11109, + "66": 0.11046, + "67": 0.1107, + "68": 0.11123, + "69": 0.1121, + "70": 0.11129, + "71": 0.1106, + "72": 0.11162, + "73": 0.11219, + "74": 0.11285, + "75": 0.11259, + "76": 0.11452, + "77": 0.11103, + "78": 0.11112, + "79": 0.11137, + "80": 0.11228, + "81": 0.11061, + "82": 0.11185, + "83": 0.111, + "84": 0.11067, + "85": 0.11266, + "86": 0.11269, + "87": 0.11295, + "88": 0.10971, + "89": 0.11137, + "90": 0.11022, + "91": 0.11153, + "92": 0.10828, + "93": 0.1125, + "94": 0.11279, + "95": 0.11157, + "96": 0.11174, + "97": 0.10966, + "98": 0.11031, + "99": 0.11036, + "100": 0.10984 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..73ffbc48219 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83826, + "5": 10.87422, + "6": 10.89306, + "7": 10.85452, + "8": 10.8626, + "9": 10.86463, + "10": 10.82907, + "11": 10.88787, + "12": 10.87098, + "13": 10.87914, + "14": 10.89069, + "15": 10.81973, + "16": 10.83156, + "17": 10.79863, + "18": 10.81648, + "19": 10.8189, + "20": 10.72685, + "21": 10.70581, + "22": 10.56347, + "23": 10.72794, + "24": 10.60761, + "25": 10.55128, + "26": 10.60749, + "27": 10.6277, + "28": 10.58262, + "29": 10.59959, + "30": 10.36566, + "31": 10.11988, + "32": 10.4755, + "33": 10.46637, + "34": 10.22009, + "35": 10.2744, + "36": 10.22594, + "37": 10.35729, + "38": 10.19156, + "39": 10.41342, + "40": 10.0956, + "41": 10.15511, + "42": 10.22085, + "43": 9.82797, + "44": 9.96276, + "45": 9.83425, + "46": 9.82209, + "47": 10.14765, + "48": 9.84681, + "49": 9.53377, + "50": 9.90532, + "51": 9.85116, + "52": 9.73516, + "53": 10.05863, + "54": 9.94369, + "55": 9.87297, + "56": 9.61703, + "57": 9.4675, + "58": 9.82223, + "59": 9.57338, + "60": 9.48861, + "61": 9.67921, + "62": 9.97513, + "63": 9.37045, + "64": 9.76643, + "65": 8.93435, + "66": 9.69463, + "67": 9.35357, + "68": 9.76826, + "69": 9.77682, + "70": 9.72364, + "71": 9.59895, + "72": 9.56454, + "73": 9.48327, + "74": 8.92062, + "75": 9.40392, + "76": 9.05301, + "77": 10.04175, + "78": 9.69879, + "79": 9.35128, + "80": 9.38215, + "81": 9.45866, + "82": 9.67518, + "83": 9.28411, + "84": 9.39313, + "85": 9.5893, + "86": 9.05182, + "87": 9.56419, + "88": 9.71756, + "89": 9.57129, + "90": 9.78202, + "91": 9.3061, + "92": 9.32048, + "93": 9.03942, + "94": 8.79522, + "95": 9.47913, + "96": 9.48454, + "97": 9.2699, + "98": 9.62563, + "99": 8.84255, + "100": 9.34982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1773.0, + "4": 1782.0, + "5": 1897.0, + "6": 1802.0, + "7": 1874.0, + "8": 1653.0, + "9": 1814.0, + "10": 1441.0, + "11": 1909.0, + "12": 1645.0, + "13": 1931.0, + "14": 1678.0, + "15": 1918.0, + "16": 1961.0, + "17": 1711.0, + "18": 1658.0, + "19": 1791.0, + "20": 1610.0, + "21": 1815.0, + "22": 1677.0, + "23": 1952.0, + "24": 1612.0, + "25": 1597.0, + "26": 1657.0, + "27": 1850.0, + "28": 2013.0, + "29": 1966.0, + "30": 1875.0, + "31": 1585.0, + "32": 1941.0, + "33": 2085.0, + "34": 1837.0, + "35": 2045.0, + "36": 1898.0, + "37": 2333.0, + "38": 2247.0, + "39": 2266.0, + "40": 2184.0, + "41": 2209.0, + "42": 2164.0, + "43": 2076.0, + "44": 2169.0, + "45": 2077.0, + "46": 2325.0, + "47": 2505.0, + "48": 2442.0, + "49": 2205.0, + "50": 2196.0, + "51": 2500.0, + "52": 2572.0, + "53": 2905.0, + "54": 2794.0, + "55": 2351.0, + "56": 2606.0, + "57": 2388.0, + "58": 2864.0, + "59": 2726.0, + "60": 2359.0, + "61": 2915.0, + "62": 2610.0, + "63": 2397.0, + "64": 2886.0, + "65": 2577.0, + "66": 2913.0, + "67": 2715.0, + "68": 2646.0, + "69": 2805.0, + "70": 3151.0, + "71": 2917.0, + "72": 2403.0, + "73": 2948.0, + "74": 1994.0, + "75": 2425.0, + "76": 2898.0, + "77": 3085.0, + "78": 3228.0, + "79": 2981.0, + "80": 3254.0, + "81": 3499.0, + "82": 3121.0, + "83": 2711.0, + "84": 3105.0, + "85": 3492.0, + "86": 2693.0, + "87": 3602.0, + "88": 3052.0, + "89": 3230.0, + "90": 3160.0, + "91": 2647.0, + "92": 3160.0, + "93": 2650.0, + "94": 3430.0, + "95": 3247.0, + "96": 3353.0, + "97": 3064.0, + "98": 3486.0, + "99": 3190.0, + "100": 3076.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0, + "51": 759681536.0, + "52": 759681536.0, + "53": 759681536.0, + "54": 759681536.0, + "55": 759681536.0, + "56": 759681536.0, + "57": 759681536.0, + "58": 759681536.0, + "59": 759681536.0, + "60": 759681536.0, + "61": 759681536.0, + "62": 759681536.0, + "63": 759681536.0, + "64": 759681536.0, + "65": 759681536.0, + "66": 759681536.0, + "67": 759681536.0, + "68": 759681536.0, + "69": 759681536.0, + "70": 759681536.0, + "71": 759681536.0, + "72": 759681536.0, + "73": 759681536.0, + "74": 759681536.0, + "75": 759681536.0, + "76": 759681536.0, + "77": 759681536.0, + "78": 759681536.0, + "79": 759681536.0, + "80": 759681536.0, + "81": 759681536.0, + "82": 759681536.0, + "83": 759681536.0, + "84": 759681536.0, + "85": 759681536.0, + "86": 759681536.0, + "87": 759681536.0, + "88": 759681536.0, + "89": 759681536.0, + "90": 759681536.0, + "91": 759681536.0, + "92": 759681536.0, + "93": 759681536.0, + "94": 759681536.0, + "95": 759681536.0, + "96": 759681536.0, + "97": 759681536.0, + "98": 759681536.0, + "99": 759681536.0, + "100": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2358048768.0, + "2": 2639760384.0, + "3": 2639760384.0, + "4": 2639760384.0, + "5": 2639760384.0, + "6": 2639760384.0, + "7": 2639760384.0, + "8": 2639760384.0, + "9": 2639760384.0, + "10": 2639760384.0, + "11": 2639760384.0, + "12": 2639760384.0, + "13": 2639760384.0, + "14": 2639760384.0, + "15": 2639760384.0, + "16": 2639760384.0, + "17": 2639760384.0, + "18": 2639760384.0, + "19": 2639760384.0, + "20": 2639760384.0, + "21": 2639760384.0, + "22": 2639760384.0, + "23": 2639760384.0, + "24": 2639760384.0, + "25": 2639760384.0, + "26": 2639760384.0, + "27": 2639760384.0, + "28": 2639760384.0, + "29": 2639760384.0, + "30": 2639760384.0, + "31": 2639760384.0, + "32": 2639760384.0, + "33": 2639760384.0, + "34": 2639760384.0, + "35": 2639760384.0, + "36": 2639760384.0, + "37": 2639760384.0, + "38": 2639760384.0, + "39": 2639760384.0, + "40": 2639760384.0, + "41": 2639760384.0, + "42": 2639760384.0, + "43": 2639760384.0, + "44": 2639760384.0, + "45": 2639760384.0, + "46": 2639760384.0, + "47": 2639760384.0, + "48": 2639760384.0, + "49": 2639760384.0, + "50": 2639760384.0, + "51": 2639760384.0, + "52": 2639760384.0, + "53": 2639760384.0, + "54": 2639760384.0, + "55": 2639760384.0, + "56": 2639760384.0, + "57": 2639760384.0, + "58": 2639760384.0, + "59": 2639760384.0, + "60": 2639760384.0, + "61": 2639760384.0, + "62": 2639760384.0, + "63": 2639760384.0, + "64": 2639760384.0, + "65": 2639760384.0, + "66": 2639760384.0, + "67": 2639760384.0, + "68": 2639760384.0, + "69": 2639760384.0, + "70": 2639760384.0, + "71": 2639760384.0, + "72": 2639760384.0, + "73": 2639760384.0, + "74": 2639760384.0, + "75": 2639760384.0, + "76": 2639760384.0, + "77": 2639760384.0, + "78": 2639760384.0, + "79": 2639760384.0, + "80": 2639760384.0, + "81": 2639760384.0, + "82": 2639760384.0, + "83": 2639760384.0, + "84": 2639760384.0, + "85": 2639760384.0, + "86": 2639760384.0, + "87": 2639760384.0, + "88": 2639760384.0, + "89": 2639760384.0, + "90": 2639760384.0, + "91": 2639760384.0, + "92": 2639760384.0, + "93": 2639760384.0, + "94": 2639760384.0, + "95": 2639760384.0, + "96": 2639760384.0, + "97": 2639760384.0, + "98": 2639760384.0, + "99": 2639760384.0, + "100": 2639760384.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.25777, + "2": 0.13394, + "3": 0.09922, + "4": 0.09894, + "5": 0.09775, + "6": 0.09731, + "7": 0.09832, + "8": 0.09902, + "9": 0.0976, + "10": 0.09738, + "11": 0.09769, + "12": 0.09775, + "13": 0.0973, + "14": 0.09697, + "15": 0.09749, + "16": 0.09763, + "17": 0.09815, + "18": 0.09802, + "19": 0.09718, + "20": 0.09775, + "21": 0.09758, + "22": 0.09773, + "23": 0.09785, + "24": 0.09828, + "25": 0.09821, + "26": 0.09669, + "27": 0.09722, + "28": 0.09732, + "29": 0.09861, + "30": 0.09875, + "31": 0.09867, + "32": 0.09834, + "33": 0.0982, + "34": 0.09928, + "35": 0.09811, + "36": 0.09669, + "37": 0.09757, + "38": 0.09767, + "39": 0.09702, + "40": 0.09753, + "41": 0.09794, + "42": 0.09878, + "43": 0.09912, + "44": 0.09929, + "45": 0.09921, + "46": 0.09947, + "47": 0.10001, + "48": 0.09906, + "49": 0.09991, + "50": 0.0993, + "51": 0.10133, + "52": 0.09956, + "53": 0.09824, + "54": 0.09904, + "55": 0.09915, + "56": 0.09925, + "57": 0.09859, + "58": 0.09644, + "59": 0.09661, + "60": 0.09755, + "61": 0.09709, + "62": 0.09665, + "63": 0.09681, + "64": 0.09617, + "65": 0.09641, + "66": 0.09621, + "67": 0.09683, + "68": 0.09678, + "69": 0.09664, + "70": 0.09803, + "71": 0.09677, + "72": 0.09645, + "73": 0.09681, + "74": 0.09753, + "75": 0.09704, + "76": 0.09776, + "77": 0.09822, + "78": 0.09631, + "79": 0.09728, + "80": 0.09766, + "81": 0.09703, + "82": 0.0976, + "83": 0.09876, + "84": 0.09779, + "85": 0.0973, + "86": 0.09965, + "87": 0.09825, + "88": 0.09698, + "89": 0.09761, + "90": 0.09663, + "91": 0.09746, + "92": 0.09681, + "93": 0.09761, + "94": 0.09917, + "95": 0.09904, + "96": 0.09748, + "97": 0.09707, + "98": 0.09661, + "99": 0.09831, + "100": 0.09719 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..603dba4c2e5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83826, + "5": 10.87422, + "6": 10.89306, + "7": 10.85452, + "8": 10.8626, + "9": 10.86463, + "10": 10.82907, + "11": 10.88787, + "12": 10.87098, + "13": 10.87914, + "14": 10.89069, + "15": 10.81973, + "16": 10.83156, + "17": 10.79863, + "18": 10.81648, + "19": 10.8189, + "20": 10.72685, + "21": 10.70581, + "22": 10.56347, + "23": 10.72794, + "24": 10.60761, + "25": 10.55128, + "26": 10.60749, + "27": 10.6277, + "28": 10.58262, + "29": 10.59959, + "30": 10.36566, + "31": 10.11988, + "32": 10.4755, + "33": 10.46637, + "34": 10.22009, + "35": 10.2744, + "36": 10.22594, + "37": 10.35729, + "38": 10.19156, + "39": 10.41342, + "40": 10.0956, + "41": 10.15511, + "42": 10.22085, + "43": 9.82797, + "44": 9.96276, + "45": 9.83425, + "46": 9.82209, + "47": 10.14765, + "48": 9.84681, + "49": 9.53377, + "50": 9.90532, + "51": 9.85116, + "52": 9.73516, + "53": 10.05863, + "54": 9.94369, + "55": 9.87297, + "56": 9.61703, + "57": 9.4675, + "58": 9.82223, + "59": 9.57338, + "60": 9.48861, + "61": 9.67921, + "62": 9.97513, + "63": 9.37045, + "64": 9.76643, + "65": 8.93435, + "66": 9.69463, + "67": 9.35357, + "68": 9.76826, + "69": 9.77682, + "70": 9.72364, + "71": 9.59895, + "72": 9.56454, + "73": 9.48327, + "74": 8.92062, + "75": 9.40392, + "76": 9.05301, + "77": 10.04175, + "78": 9.69879, + "79": 9.35128, + "80": 9.38215, + "81": 9.45866, + "82": 9.67518, + "83": 9.28411, + "84": 9.39313, + "85": 9.5893, + "86": 9.05182, + "87": 9.56419, + "88": 9.71756, + "89": 9.57129, + "90": 9.78202, + "91": 9.3061, + "92": 9.32048, + "93": 9.03942, + "94": 8.79522, + "95": 9.47913, + "96": 9.48454, + "97": 9.2699, + "98": 9.62563, + "99": 8.84255, + "100": 9.34982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1773.0, + "4": 1782.0, + "5": 1897.0, + "6": 1802.0, + "7": 1874.0, + "8": 1653.0, + "9": 1814.0, + "10": 1441.0, + "11": 1909.0, + "12": 1645.0, + "13": 1931.0, + "14": 1678.0, + "15": 1918.0, + "16": 1961.0, + "17": 1711.0, + "18": 1658.0, + "19": 1791.0, + "20": 1610.0, + "21": 1815.0, + "22": 1677.0, + "23": 1952.0, + "24": 1612.0, + "25": 1597.0, + "26": 1657.0, + "27": 1850.0, + "28": 2013.0, + "29": 1966.0, + "30": 1875.0, + "31": 1585.0, + "32": 1941.0, + "33": 2085.0, + "34": 1837.0, + "35": 2045.0, + "36": 1898.0, + "37": 2333.0, + "38": 2247.0, + "39": 2266.0, + "40": 2184.0, + "41": 2209.0, + "42": 2164.0, + "43": 2076.0, + "44": 2169.0, + "45": 2077.0, + "46": 2325.0, + "47": 2505.0, + "48": 2442.0, + "49": 2205.0, + "50": 2196.0, + "51": 2500.0, + "52": 2572.0, + "53": 2905.0, + "54": 2794.0, + "55": 2351.0, + "56": 2606.0, + "57": 2388.0, + "58": 2864.0, + "59": 2726.0, + "60": 2359.0, + "61": 2915.0, + "62": 2610.0, + "63": 2397.0, + "64": 2886.0, + "65": 2577.0, + "66": 2913.0, + "67": 2715.0, + "68": 2646.0, + "69": 2805.0, + "70": 3151.0, + "71": 2917.0, + "72": 2403.0, + "73": 2948.0, + "74": 1994.0, + "75": 2425.0, + "76": 2898.0, + "77": 3085.0, + "78": 3228.0, + "79": 2981.0, + "80": 3254.0, + "81": 3499.0, + "82": 3121.0, + "83": 2711.0, + "84": 3105.0, + "85": 3492.0, + "86": 2693.0, + "87": 3602.0, + "88": 3052.0, + "89": 3230.0, + "90": 3160.0, + "91": 2647.0, + "92": 3160.0, + "93": 2650.0, + "94": 3430.0, + "95": 3247.0, + "96": 3353.0, + "97": 3064.0, + "98": 3486.0, + "99": 3190.0, + "100": 3076.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0, + "51": 759681536.0, + "52": 759681536.0, + "53": 759681536.0, + "54": 759681536.0, + "55": 759681536.0, + "56": 759681536.0, + "57": 759681536.0, + "58": 759681536.0, + "59": 759681536.0, + "60": 759681536.0, + "61": 759681536.0, + "62": 759681536.0, + "63": 759681536.0, + "64": 759681536.0, + "65": 759681536.0, + "66": 759681536.0, + "67": 759681536.0, + "68": 759681536.0, + "69": 759681536.0, + "70": 759681536.0, + "71": 759681536.0, + "72": 759681536.0, + "73": 759681536.0, + "74": 759681536.0, + "75": 759681536.0, + "76": 759681536.0, + "77": 759681536.0, + "78": 759681536.0, + "79": 759681536.0, + "80": 759681536.0, + "81": 759681536.0, + "82": 759681536.0, + "83": 759681536.0, + "84": 759681536.0, + "85": 759681536.0, + "86": 759681536.0, + "87": 759681536.0, + "88": 759681536.0, + "89": 759681536.0, + "90": 759681536.0, + "91": 759681536.0, + "92": 759681536.0, + "93": 759681536.0, + "94": 759681536.0, + "95": 759681536.0, + "96": 759681536.0, + "97": 759681536.0, + "98": 759681536.0, + "99": 759681536.0, + "100": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2358048768.0, + "2": 2639760384.0, + "3": 2639760384.0, + "4": 2639760384.0, + "5": 2639760384.0, + "6": 2639760384.0, + "7": 2639760384.0, + "8": 2639760384.0, + "9": 2639760384.0, + "10": 2639760384.0, + "11": 2639760384.0, + "12": 2639760384.0, + "13": 2639760384.0, + "14": 2639760384.0, + "15": 2639760384.0, + "16": 2639760384.0, + "17": 2639760384.0, + "18": 2639760384.0, + "19": 2639760384.0, + "20": 2639760384.0, + "21": 2639760384.0, + "22": 2639760384.0, + "23": 2639760384.0, + "24": 2639760384.0, + "25": 2639760384.0, + "26": 2639760384.0, + "27": 2639760384.0, + "28": 2639760384.0, + "29": 2639760384.0, + "30": 2639760384.0, + "31": 2639760384.0, + "32": 2639760384.0, + "33": 2639760384.0, + "34": 2639760384.0, + "35": 2639760384.0, + "36": 2639760384.0, + "37": 2639760384.0, + "38": 2639760384.0, + "39": 2639760384.0, + "40": 2639760384.0, + "41": 2639760384.0, + "42": 2639760384.0, + "43": 2639760384.0, + "44": 2639760384.0, + "45": 2639760384.0, + "46": 2639760384.0, + "47": 2639760384.0, + "48": 2639760384.0, + "49": 2639760384.0, + "50": 2639760384.0, + "51": 2639760384.0, + "52": 2639760384.0, + "53": 2639760384.0, + "54": 2639760384.0, + "55": 2639760384.0, + "56": 2639760384.0, + "57": 2639760384.0, + "58": 2639760384.0, + "59": 2639760384.0, + "60": 2639760384.0, + "61": 2639760384.0, + "62": 2639760384.0, + "63": 2639760384.0, + "64": 2639760384.0, + "65": 2639760384.0, + "66": 2639760384.0, + "67": 2639760384.0, + "68": 2639760384.0, + "69": 2639760384.0, + "70": 2639760384.0, + "71": 2639760384.0, + "72": 2639760384.0, + "73": 2639760384.0, + "74": 2639760384.0, + "75": 2639760384.0, + "76": 2639760384.0, + "77": 2639760384.0, + "78": 2639760384.0, + "79": 2639760384.0, + "80": 2639760384.0, + "81": 2639760384.0, + "82": 2639760384.0, + "83": 2639760384.0, + "84": 2639760384.0, + "85": 2639760384.0, + "86": 2639760384.0, + "87": 2639760384.0, + "88": 2639760384.0, + "89": 2639760384.0, + "90": 2639760384.0, + "91": 2639760384.0, + "92": 2639760384.0, + "93": 2639760384.0, + "94": 2639760384.0, + "95": 2639760384.0, + "96": 2639760384.0, + "97": 2639760384.0, + "98": 2639760384.0, + "99": 2639760384.0, + "100": 2639760384.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.0335, + "2": 0.14377, + "3": 0.129, + "4": 0.12162, + "5": 0.11612, + "6": 0.11324, + "7": 0.11415, + "8": 0.11274, + "9": 0.11392, + "10": 0.11729, + "11": 0.11228, + "12": 0.11141, + "13": 0.11245, + "14": 0.11042, + "15": 0.11174, + "16": 0.1114, + "17": 0.11204, + "18": 0.11241, + "19": 0.11298, + "20": 0.11272, + "21": 0.11169, + "22": 0.11228, + "23": 0.11255, + "24": 0.11124, + "25": 0.11188, + "26": 0.11351, + "27": 0.11159, + "28": 0.11318, + "29": 0.11016, + "30": 0.11051, + "31": 0.11184, + "32": 0.11116, + "33": 0.1106, + "34": 0.11105, + "35": 0.113, + "36": 0.11198, + "37": 0.1117, + "38": 0.11109, + "39": 0.1099, + "40": 0.11097, + "41": 0.11159, + "42": 0.11191, + "43": 0.11283, + "44": 0.11266, + "45": 0.111, + "46": 0.11347, + "47": 0.1099, + "48": 0.10973, + "49": 0.11225, + "50": 0.11231, + "51": 0.1122, + "52": 0.10985, + "53": 0.11147, + "54": 0.11064, + "55": 0.11101, + "56": 0.11356, + "57": 0.11368, + "58": 0.11185, + "59": 0.11193, + "60": 0.11205, + "61": 0.11176, + "62": 0.11293, + "63": 0.1127, + "64": 0.11343, + "65": 0.11282, + "66": 0.11245, + "67": 0.11385, + "68": 0.11071, + "69": 0.11079, + "70": 0.112, + "71": 0.1108, + "72": 0.11299, + "73": 0.11305, + "74": 0.11343, + "75": 0.11155, + "76": 0.11323, + "77": 0.11174, + "78": 0.11138, + "79": 0.11246, + "80": 0.11252, + "81": 0.11217, + "82": 0.11269, + "83": 0.11312, + "84": 0.11075, + "85": 0.11227, + "86": 0.11159, + "87": 0.11227, + "88": 0.11227, + "89": 0.11277, + "90": 0.11219, + "91": 0.11067, + "92": 0.10961, + "93": 0.10907, + "94": 0.11584, + "95": 0.1087, + "96": 0.11107, + "97": 0.11046, + "98": 0.10986, + "99": 0.11249, + "100": 0.1095 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..cf2c7b97468 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81397, + "4": 10.78498, + "5": 10.85285, + "6": 10.87448, + "7": 10.83201, + "8": 10.83296, + "9": 10.83936, + "10": 10.78449, + "11": 10.87794, + "12": 10.86113, + "13": 10.86438, + "14": 10.87595, + "15": 10.79226, + "16": 10.79507, + "17": 10.76764, + "18": 10.80977, + "19": 10.79693, + "20": 10.69196, + "21": 10.68154, + "22": 10.52072, + "23": 10.70881, + "24": 10.5753, + "25": 10.52318, + "26": 10.59411, + "27": 10.58357, + "28": 10.56188, + "29": 10.5696, + "30": 10.34505, + "31": 10.09986, + "32": 10.45209, + "33": 10.44378, + "34": 10.20285, + "35": 10.25888, + "36": 10.20951, + "37": 10.32305, + "38": 10.1656, + "39": 10.38115, + "40": 10.07032, + "41": 10.1364, + "42": 10.19467, + "43": 9.80541, + "44": 9.92556, + "45": 9.803, + "46": 9.80008, + "47": 10.11716, + "48": 9.81309, + "49": 9.49911, + "50": 9.87675, + "51": 9.82883, + "52": 9.71745, + "53": 10.03867, + "54": 9.92195, + "55": 9.85523, + "56": 9.5922, + "57": 9.44053, + "58": 9.79679, + "59": 9.5545, + "60": 9.46634, + "61": 9.66578, + "62": 9.95346, + "63": 9.33681, + "64": 9.74137, + "65": 8.91657, + "66": 9.66586, + "67": 9.34349, + "68": 9.75312, + "69": 9.75728, + "70": 9.69276, + "71": 9.58799, + "72": 9.55054, + "73": 9.46306, + "74": 8.90575, + "75": 9.37813, + "76": 9.04954, + "77": 10.02987, + "78": 9.69223, + "79": 9.33487, + "80": 9.368, + "81": 9.44383, + "82": 9.66162, + "83": 9.27183, + "84": 9.38074, + "85": 9.57598, + "86": 9.0429, + "87": 9.55787, + "88": 9.70459, + "89": 9.56609, + "90": 9.77247, + "91": 9.29341, + "92": 9.31916, + "93": 9.03465, + "94": 8.78492, + "95": 9.46912, + "96": 9.47453, + "97": 9.25689, + "98": 9.61859, + "99": 8.83266, + "100": 9.34574 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1686.0, + "4": 1707.0, + "5": 1915.0, + "6": 1734.0, + "7": 1735.0, + "8": 1584.0, + "9": 1810.0, + "10": 1361.0, + "11": 1884.0, + "12": 1714.0, + "13": 1923.0, + "14": 1736.0, + "15": 1831.0, + "16": 1684.0, + "17": 1787.0, + "18": 1707.0, + "19": 1680.0, + "20": 1695.0, + "21": 1815.0, + "22": 1711.0, + "23": 2079.0, + "24": 1677.0, + "25": 1650.0, + "26": 1714.0, + "27": 1813.0, + "28": 1998.0, + "29": 1931.0, + "30": 1861.0, + "31": 1573.0, + "32": 1934.0, + "33": 2063.0, + "34": 1891.0, + "35": 1916.0, + "36": 1939.0, + "37": 2299.0, + "38": 2235.0, + "39": 2352.0, + "40": 2109.0, + "41": 2286.0, + "42": 2232.0, + "43": 1919.0, + "44": 2032.0, + "45": 2098.0, + "46": 2287.0, + "47": 2513.0, + "48": 2360.0, + "49": 2126.0, + "50": 2424.0, + "51": 2433.0, + "52": 2566.0, + "53": 2902.0, + "54": 2589.0, + "55": 2309.0, + "56": 2761.0, + "57": 2265.0, + "58": 2876.0, + "59": 2821.0, + "60": 2432.0, + "61": 3073.0, + "62": 2638.0, + "63": 2426.0, + "64": 2913.0, + "65": 2660.0, + "66": 2985.0, + "67": 2723.0, + "68": 2790.0, + "69": 2997.0, + "70": 3132.0, + "71": 2837.0, + "72": 2291.0, + "73": 2780.0, + "74": 1936.0, + "75": 2555.0, + "76": 3028.0, + "77": 3175.0, + "78": 3109.0, + "79": 2994.0, + "80": 3370.0, + "81": 3552.0, + "82": 3308.0, + "83": 2898.0, + "84": 3285.0, + "85": 3434.0, + "86": 2573.0, + "87": 3858.0, + "88": 2920.0, + "89": 3217.0, + "90": 2868.0, + "91": 2784.0, + "92": 3011.0, + "93": 2700.0, + "94": 3372.0, + "95": 3273.0, + "96": 3557.0, + "97": 3145.0, + "98": 3635.0, + "99": 3308.0, + "100": 3359.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0, + "51": 730320896.0, + "52": 730320896.0, + "53": 730320896.0, + "54": 730320896.0, + "55": 730320896.0, + "56": 730320896.0, + "57": 730320896.0, + "58": 730320896.0, + "59": 730320896.0, + "60": 730320896.0, + "61": 730320896.0, + "62": 730320896.0, + "63": 730320896.0, + "64": 730320896.0, + "65": 730320896.0, + "66": 730320896.0, + "67": 730320896.0, + "68": 730320896.0, + "69": 730320896.0, + "70": 730320896.0, + "71": 730320896.0, + "72": 730320896.0, + "73": 730320896.0, + "74": 730320896.0, + "75": 730320896.0, + "76": 730320896.0, + "77": 730320896.0, + "78": 730320896.0, + "79": 730320896.0, + "80": 730320896.0, + "81": 730320896.0, + "82": 730320896.0, + "83": 730320896.0, + "84": 730320896.0, + "85": 730320896.0, + "86": 730320896.0, + "87": 730320896.0, + "88": 730320896.0, + "89": 730320896.0, + "90": 730320896.0, + "91": 730320896.0, + "92": 730320896.0, + "93": 730320896.0, + "94": 730320896.0, + "95": 730320896.0, + "96": 730320896.0, + "97": 730320896.0, + "98": 730320896.0, + "99": 730320896.0, + "100": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3837453312.0, + "2": 4119164928.0, + "3": 4119164928.0, + "4": 4119164928.0, + "5": 4119164928.0, + "6": 4119164928.0, + "7": 4119164928.0, + "8": 4119164928.0, + "9": 4119164928.0, + "10": 4119164928.0, + "11": 4119164928.0, + "12": 4119164928.0, + "13": 4119164928.0, + "14": 4119164928.0, + "15": 4119164928.0, + "16": 4119164928.0, + "17": 4119164928.0, + "18": 4119164928.0, + "19": 4119164928.0, + "20": 4119164928.0, + "21": 4119164928.0, + "22": 4119164928.0, + "23": 4119164928.0, + "24": 4119164928.0, + "25": 4119164928.0, + "26": 4119164928.0, + "27": 4119164928.0, + "28": 4119164928.0, + "29": 4119164928.0, + "30": 4119164928.0, + "31": 4119164928.0, + "32": 4119164928.0, + "33": 4119164928.0, + "34": 4119164928.0, + "35": 4119164928.0, + "36": 4119164928.0, + "37": 4119164928.0, + "38": 4119164928.0, + "39": 4119164928.0, + "40": 4119164928.0, + "41": 4119164928.0, + "42": 4119164928.0, + "43": 4119164928.0, + "44": 4119164928.0, + "45": 4119164928.0, + "46": 4119164928.0, + "47": 4119164928.0, + "48": 4119164928.0, + "49": 4119164928.0, + "50": 4119164928.0, + "51": 4119164928.0, + "52": 4119164928.0, + "53": 4119164928.0, + "54": 4119164928.0, + "55": 4119164928.0, + "56": 4119164928.0, + "57": 4119164928.0, + "58": 4119164928.0, + "59": 4119164928.0, + "60": 4119164928.0, + "61": 4119164928.0, + "62": 4119164928.0, + "63": 4119164928.0, + "64": 4119164928.0, + "65": 4119164928.0, + "66": 4119164928.0, + "67": 4119164928.0, + "68": 4119164928.0, + "69": 4119164928.0, + "70": 4119164928.0, + "71": 4119164928.0, + "72": 4119164928.0, + "73": 4119164928.0, + "74": 4119164928.0, + "75": 4119164928.0, + "76": 4119164928.0, + "77": 4119164928.0, + "78": 4119164928.0, + "79": 4119164928.0, + "80": 4119164928.0, + "81": 4119164928.0, + "82": 4119164928.0, + "83": 4119164928.0, + "84": 4119164928.0, + "85": 4119164928.0, + "86": 4119164928.0, + "87": 4119164928.0, + "88": 4119164928.0, + "89": 4119164928.0, + "90": 4119164928.0, + "91": 4119164928.0, + "92": 4119164928.0, + "93": 4119164928.0, + "94": 4119164928.0, + "95": 4119164928.0, + "96": 4119164928.0, + "97": 4119164928.0, + "98": 4119164928.0, + "99": 4119164928.0, + "100": 4119164928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.0062, + "2": 0.22515, + "3": 0.1977, + "4": 0.18911, + "5": 0.18615, + "6": 0.17034, + "7": 0.16978, + "8": 0.172, + "9": 0.17258, + "10": 0.17365, + "11": 0.17197, + "12": 0.17127, + "13": 0.16991, + "14": 0.16997, + "15": 0.16994, + "16": 0.17143, + "17": 0.17095, + "18": 0.17098, + "19": 0.16956, + "20": 0.1705, + "21": 0.17016, + "22": 0.1709, + "23": 0.18003, + "24": 0.1728, + "25": 0.17179, + "26": 0.17099, + "27": 0.1721, + "28": 0.17027, + "29": 0.17076, + "30": 0.17085, + "31": 0.17145, + "32": 0.17023, + "33": 0.17166, + "34": 0.17042, + "35": 0.17306, + "36": 0.17083, + "37": 0.17109, + "38": 0.17096, + "39": 0.17162, + "40": 0.1709, + "41": 0.17007, + "42": 0.17021, + "43": 0.1703, + "44": 0.1709, + "45": 0.17091, + "46": 0.1708, + "47": 0.17037, + "48": 0.17053, + "49": 0.17145, + "50": 0.17057, + "51": 0.17728, + "52": 0.17072, + "53": 0.17004, + "54": 0.17259, + "55": 0.17417, + "56": 0.17223, + "57": 0.1731, + "58": 0.172, + "59": 0.17128, + "60": 0.17384, + "61": 0.17393, + "62": 0.17367, + "63": 0.17427, + "64": 0.17235, + "65": 0.17484, + "66": 0.1728, + "67": 0.17351, + "68": 0.17401, + "69": 0.17395, + "70": 0.1725, + "71": 0.17219, + "72": 0.17187, + "73": 0.17393, + "74": 0.17345, + "75": 0.17421, + "76": 0.17406, + "77": 0.17155, + "78": 0.1728, + "79": 0.17462, + "80": 0.17582, + "81": 0.17113, + "82": 0.17105, + "83": 0.17061, + "84": 0.17127, + "85": 0.17361, + "86": 0.17294, + "87": 0.17183, + "88": 0.17162, + "89": 0.17105, + "90": 0.17179, + "91": 0.17278, + "92": 0.17216, + "93": 0.17178, + "94": 0.17267, + "95": 0.1706, + "96": 0.17363, + "97": 0.17455, + "98": 0.17149, + "99": 0.17187, + "100": 0.1711 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..f2fcc6e9139 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81397, + "4": 10.78498, + "5": 10.85285, + "6": 10.87448, + "7": 10.83201, + "8": 10.83296, + "9": 10.83936, + "10": 10.78449, + "11": 10.87794, + "12": 10.86113, + "13": 10.86438, + "14": 10.87595, + "15": 10.79226, + "16": 10.79507, + "17": 10.76764, + "18": 10.80977, + "19": 10.79693, + "20": 10.69196, + "21": 10.68154, + "22": 10.52072, + "23": 10.70881, + "24": 10.5753, + "25": 10.52318, + "26": 10.59411, + "27": 10.58357, + "28": 10.56188, + "29": 10.5696, + "30": 10.34505, + "31": 10.09986, + "32": 10.45209, + "33": 10.44378, + "34": 10.20285, + "35": 10.25888, + "36": 10.20951, + "37": 10.32305, + "38": 10.1656, + "39": 10.38115, + "40": 10.07032, + "41": 10.1364, + "42": 10.19467, + "43": 9.80541, + "44": 9.92556, + "45": 9.803, + "46": 9.80008, + "47": 10.11716, + "48": 9.81309, + "49": 9.49911, + "50": 9.87675, + "51": 9.82883, + "52": 9.71745, + "53": 10.03867, + "54": 9.92195, + "55": 9.85523, + "56": 9.5922, + "57": 9.44053, + "58": 9.79679, + "59": 9.5545, + "60": 9.46634, + "61": 9.66578, + "62": 9.95346, + "63": 9.33681, + "64": 9.74137, + "65": 8.91657, + "66": 9.66586, + "67": 9.34349, + "68": 9.75312, + "69": 9.75728, + "70": 9.69276, + "71": 9.58799, + "72": 9.55054, + "73": 9.46306, + "74": 8.90575, + "75": 9.37813, + "76": 9.04954, + "77": 10.02987, + "78": 9.69223, + "79": 9.33487, + "80": 9.368, + "81": 9.44383, + "82": 9.66162, + "83": 9.27183, + "84": 9.38074, + "85": 9.57598, + "86": 9.0429, + "87": 9.55787, + "88": 9.70459, + "89": 9.56609, + "90": 9.77247, + "91": 9.29341, + "92": 9.31916, + "93": 9.03465, + "94": 8.78492, + "95": 9.46912, + "96": 9.47453, + "97": 9.25689, + "98": 9.61859, + "99": 8.83266, + "100": 9.34574 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1686.0, + "4": 1707.0, + "5": 1915.0, + "6": 1734.0, + "7": 1735.0, + "8": 1584.0, + "9": 1810.0, + "10": 1361.0, + "11": 1884.0, + "12": 1714.0, + "13": 1923.0, + "14": 1736.0, + "15": 1831.0, + "16": 1684.0, + "17": 1787.0, + "18": 1707.0, + "19": 1680.0, + "20": 1695.0, + "21": 1815.0, + "22": 1711.0, + "23": 2079.0, + "24": 1677.0, + "25": 1650.0, + "26": 1714.0, + "27": 1813.0, + "28": 1998.0, + "29": 1931.0, + "30": 1861.0, + "31": 1573.0, + "32": 1934.0, + "33": 2063.0, + "34": 1891.0, + "35": 1916.0, + "36": 1939.0, + "37": 2299.0, + "38": 2235.0, + "39": 2352.0, + "40": 2109.0, + "41": 2286.0, + "42": 2232.0, + "43": 1919.0, + "44": 2032.0, + "45": 2098.0, + "46": 2287.0, + "47": 2513.0, + "48": 2360.0, + "49": 2126.0, + "50": 2424.0, + "51": 2433.0, + "52": 2566.0, + "53": 2902.0, + "54": 2589.0, + "55": 2309.0, + "56": 2761.0, + "57": 2265.0, + "58": 2876.0, + "59": 2821.0, + "60": 2432.0, + "61": 3073.0, + "62": 2638.0, + "63": 2426.0, + "64": 2913.0, + "65": 2660.0, + "66": 2985.0, + "67": 2723.0, + "68": 2790.0, + "69": 2997.0, + "70": 3132.0, + "71": 2837.0, + "72": 2291.0, + "73": 2780.0, + "74": 1936.0, + "75": 2555.0, + "76": 3028.0, + "77": 3175.0, + "78": 3109.0, + "79": 2994.0, + "80": 3370.0, + "81": 3552.0, + "82": 3308.0, + "83": 2898.0, + "84": 3285.0, + "85": 3434.0, + "86": 2573.0, + "87": 3858.0, + "88": 2920.0, + "89": 3217.0, + "90": 2868.0, + "91": 2784.0, + "92": 3011.0, + "93": 2700.0, + "94": 3372.0, + "95": 3273.0, + "96": 3557.0, + "97": 3145.0, + "98": 3635.0, + "99": 3308.0, + "100": 3359.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0, + "51": 730320896.0, + "52": 730320896.0, + "53": 730320896.0, + "54": 730320896.0, + "55": 730320896.0, + "56": 730320896.0, + "57": 730320896.0, + "58": 730320896.0, + "59": 730320896.0, + "60": 730320896.0, + "61": 730320896.0, + "62": 730320896.0, + "63": 730320896.0, + "64": 730320896.0, + "65": 730320896.0, + "66": 730320896.0, + "67": 730320896.0, + "68": 730320896.0, + "69": 730320896.0, + "70": 730320896.0, + "71": 730320896.0, + "72": 730320896.0, + "73": 730320896.0, + "74": 730320896.0, + "75": 730320896.0, + "76": 730320896.0, + "77": 730320896.0, + "78": 730320896.0, + "79": 730320896.0, + "80": 730320896.0, + "81": 730320896.0, + "82": 730320896.0, + "83": 730320896.0, + "84": 730320896.0, + "85": 730320896.0, + "86": 730320896.0, + "87": 730320896.0, + "88": 730320896.0, + "89": 730320896.0, + "90": 730320896.0, + "91": 730320896.0, + "92": 730320896.0, + "93": 730320896.0, + "94": 730320896.0, + "95": 730320896.0, + "96": 730320896.0, + "97": 730320896.0, + "98": 730320896.0, + "99": 730320896.0, + "100": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3837453312.0, + "2": 4119164928.0, + "3": 4119164928.0, + "4": 4119164928.0, + "5": 4119164928.0, + "6": 4119164928.0, + "7": 4119164928.0, + "8": 4119164928.0, + "9": 4119164928.0, + "10": 4119164928.0, + "11": 4119164928.0, + "12": 4119164928.0, + "13": 4119164928.0, + "14": 4119164928.0, + "15": 4119164928.0, + "16": 4119164928.0, + "17": 4119164928.0, + "18": 4119164928.0, + "19": 4119164928.0, + "20": 4119164928.0, + "21": 4119164928.0, + "22": 4119164928.0, + "23": 4119164928.0, + "24": 4119164928.0, + "25": 4119164928.0, + "26": 4119164928.0, + "27": 4119164928.0, + "28": 4119164928.0, + "29": 4119164928.0, + "30": 4119164928.0, + "31": 4119164928.0, + "32": 4119164928.0, + "33": 4119164928.0, + "34": 4119164928.0, + "35": 4119164928.0, + "36": 4119164928.0, + "37": 4119164928.0, + "38": 4119164928.0, + "39": 4119164928.0, + "40": 4119164928.0, + "41": 4119164928.0, + "42": 4119164928.0, + "43": 4119164928.0, + "44": 4119164928.0, + "45": 4119164928.0, + "46": 4119164928.0, + "47": 4119164928.0, + "48": 4119164928.0, + "49": 4119164928.0, + "50": 4119164928.0, + "51": 4119164928.0, + "52": 4119164928.0, + "53": 4119164928.0, + "54": 4119164928.0, + "55": 4119164928.0, + "56": 4119164928.0, + "57": 4119164928.0, + "58": 4119164928.0, + "59": 4119164928.0, + "60": 4119164928.0, + "61": 4119164928.0, + "62": 4119164928.0, + "63": 4119164928.0, + "64": 4119164928.0, + "65": 4119164928.0, + "66": 4119164928.0, + "67": 4119164928.0, + "68": 4119164928.0, + "69": 4119164928.0, + "70": 4119164928.0, + "71": 4119164928.0, + "72": 4119164928.0, + "73": 4119164928.0, + "74": 4119164928.0, + "75": 4119164928.0, + "76": 4119164928.0, + "77": 4119164928.0, + "78": 4119164928.0, + "79": 4119164928.0, + "80": 4119164928.0, + "81": 4119164928.0, + "82": 4119164928.0, + "83": 4119164928.0, + "84": 4119164928.0, + "85": 4119164928.0, + "86": 4119164928.0, + "87": 4119164928.0, + "88": 4119164928.0, + "89": 4119164928.0, + "90": 4119164928.0, + "91": 4119164928.0, + "92": 4119164928.0, + "93": 4119164928.0, + "94": 4119164928.0, + "95": 4119164928.0, + "96": 4119164928.0, + "97": 4119164928.0, + "98": 4119164928.0, + "99": 4119164928.0, + "100": 4119164928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.54847, + "2": 0.20654, + "3": 0.17899, + "4": 0.17609, + "5": 0.17607, + "6": 0.17545, + "7": 0.17582, + "8": 0.3981, + "9": 0.17427, + "10": 0.17111, + "11": 0.1706, + "12": 0.17427, + "13": 0.17652, + "14": 0.17107, + "15": 0.17191, + "16": 0.1696, + "17": 0.17104, + "18": 0.16925, + "19": 0.16894, + "20": 0.17181, + "21": 0.1703, + "22": 0.1722, + "23": 0.16959, + "24": 0.18369, + "25": 0.17058, + "26": 0.17105, + "27": 0.16942, + "28": 0.1691, + "29": 0.16894, + "30": 0.17, + "31": 0.17083, + "32": 0.17034, + "33": 0.16855, + "34": 0.16981, + "35": 0.1699, + "36": 0.16909, + "37": 0.16901, + "38": 0.16998, + "39": 0.16957, + "40": 0.17038, + "41": 0.16846, + "42": 0.16847, + "43": 0.16956, + "44": 0.16964, + "45": 0.16919, + "46": 0.16891, + "47": 0.16901, + "48": 0.16904, + "49": 0.16981, + "50": 0.17034, + "51": 0.17135, + "52": 0.16786, + "53": 0.1668, + "54": 0.1671, + "55": 0.16695, + "56": 0.16737, + "57": 0.1668, + "58": 0.16761, + "59": 0.16755, + "60": 0.16907, + "61": 0.16638, + "62": 0.16819, + "63": 0.16827, + "64": 0.17031, + "65": 0.167, + "66": 0.39277, + "67": 0.16989, + "68": 0.16709, + "69": 0.16761, + "70": 0.16602, + "71": 0.168, + "72": 0.16646, + "73": 0.16976, + "74": 0.16686, + "75": 0.16959, + "76": 0.16956, + "77": 0.1686, + "78": 0.16588, + "79": 0.16726, + "80": 0.16802, + "81": 0.16806, + "82": 0.1664, + "83": 0.16817, + "84": 0.16729, + "85": 0.1687, + "86": 0.16736, + "87": 0.1677, + "88": 0.16777, + "89": 0.16794, + "90": 0.16675, + "91": 0.1685, + "92": 0.1679, + "93": 0.16927, + "94": 0.16945, + "95": 0.171, + "96": 0.1671, + "97": 0.38537, + "98": 0.16869, + "99": 0.1704, + "100": 0.16709 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 4a4be7c6755..c681b5bd1b4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84517, "5": 10.87427, "10": 10.82907, "15": 10.81974, "20": 10.727, "25": 10.55217, "30": 10.36614, "35": 10.2778, "40": 10.0976, "45": 9.84196, "50": 9.9125, "55": 9.88096, "60": 9.50125, "65": 8.94761, "70": 9.7424, "75": 9.42532, "80": 9.40396, "85": 9.61405, "90": 9.81418, "95": 9.5173, "100": 9.39541}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1655.0, "5": 1803.0, "10": 1448.0, "15": 1879.0, "20": 1657.0, "25": 1625.0, "30": 1882.0, "35": 1954.0, "40": 2191.0, "45": 2091.0, "50": 2189.0, "55": 2325.0, "60": 2361.0, "65": 2673.0, "70": 3139.0, "75": 2519.0, "80": 3205.0, "85": 3209.0, "90": 3168.0, "95": 3261.0, "100": 3135.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0, "55": 763220480.0, "60": 763220480.0, "65": 763220480.0, "70": 763220480.0, "75": 763220480.0, "80": 763220480.0, "85": 763220480.0, "90": 763220480.0, "95": 763220480.0, "100": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2359490560.0, "5": 2643299328.0, "10": 2643299328.0, "15": 2643299328.0, "20": 2643299328.0, "25": 2643299328.0, "30": 2643299328.0, "35": 2643299328.0, "40": 2643299328.0, "45": 2643299328.0, "50": 2643299328.0, "55": 2643299328.0, "60": 2643299328.0, "65": 2643299328.0, "70": 2643299328.0, "75": 2643299328.0, "80": 2643299328.0, "85": 2643299328.0, "90": 2643299328.0, "95": 2643299328.0, "100": 2643299328.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.90194, "5": 0.09713, "10": 0.1002, "15": 0.09686, "20": 0.0971, "25": 0.09785, "30": 0.10076, "35": 0.09808, "40": 0.10148, "45": 0.10005, "50": 0.09728, "55": 0.09621, "60": 0.09718, "65": 0.10047, "70": 0.09897, "75": 0.10302, "80": 0.10138, "85": 0.10032, "90": 0.097, "95": 0.09743, "100": 0.09586}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86468, + "10": 10.82907, + "11": 10.88789, + "12": 10.87095, + "13": 10.87916, + "14": 10.89079, + "15": 10.81974, + "16": 10.83162, + "17": 10.79863, + "18": 10.81667, + "19": 10.81919, + "20": 10.727, + "21": 10.70594, + "22": 10.56364, + "23": 10.72802, + "24": 10.60832, + "25": 10.55217, + "26": 10.60845, + "27": 10.62847, + "28": 10.5831, + "29": 10.60012, + "30": 10.36614, + "31": 10.12044, + "32": 10.47684, + "33": 10.46873, + "34": 10.22319, + "35": 10.2778, + "36": 10.22892, + "37": 10.35949, + "38": 10.19371, + "39": 10.4155, + "40": 10.0976, + "41": 10.15737, + "42": 10.22396, + "43": 9.83286, + "44": 9.96916, + "45": 9.84196, + "46": 9.83045, + "47": 10.15628, + "48": 9.85484, + "49": 9.54086, + "50": 9.9125, + "51": 9.8587, + "52": 9.74287, + "53": 10.06647, + "54": 9.95168, + "55": 9.88096, + "56": 9.62625, + "57": 9.47766, + "58": 9.8335, + "59": 9.58522, + "60": 9.50125, + "61": 9.69186, + "62": 9.98858, + "63": 9.38478, + "64": 9.78027, + "65": 8.94761, + "66": 9.70857, + "67": 9.36847, + "68": 9.78438, + "69": 9.79407, + "70": 9.7424, + "71": 9.61808, + "72": 9.58427, + "73": 9.50347, + "74": 8.9422, + "75": 9.42532, + "76": 9.07407, + "77": 10.06351, + "78": 9.7208, + "79": 9.37296, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30711, + "84": 9.41712, + "85": 9.61405, + "86": 9.07618, + "87": 9.59088, + "88": 9.7464, + "89": 9.59987, + "90": 9.81418, + "91": 9.33775, + "92": 9.35372, + "93": 9.07397, + "94": 8.8317, + "95": 9.5173, + "96": 9.52412, + "97": 9.30995, + "98": 9.66807, + "99": 8.8859, + "100": 9.39541 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1678.0, + "9": 1828.0, + "10": 1448.0, + "11": 1890.0, + "12": 1657.0, + "13": 1852.0, + "14": 1717.0, + "15": 1879.0, + "16": 1921.0, + "17": 1666.0, + "18": 1729.0, + "19": 1767.0, + "20": 1657.0, + "21": 1827.0, + "22": 1594.0, + "23": 1918.0, + "24": 1622.0, + "25": 1625.0, + "26": 1649.0, + "27": 1788.0, + "28": 2030.0, + "29": 1980.0, + "30": 1882.0, + "31": 1564.0, + "32": 1918.0, + "33": 2045.0, + "34": 1884.0, + "35": 1954.0, + "36": 1910.0, + "37": 2267.0, + "38": 2195.0, + "39": 2346.0, + "40": 2191.0, + "41": 2171.0, + "42": 2246.0, + "43": 1997.0, + "44": 2156.0, + "45": 2091.0, + "46": 2439.0, + "47": 2539.0, + "48": 2418.0, + "49": 2207.0, + "50": 2189.0, + "51": 2608.0, + "52": 2444.0, + "53": 2898.0, + "54": 2664.0, + "55": 2325.0, + "56": 2614.0, + "57": 2394.0, + "58": 2812.0, + "59": 2771.0, + "60": 2361.0, + "61": 2855.0, + "62": 2675.0, + "63": 2393.0, + "64": 3014.0, + "65": 2673.0, + "66": 3051.0, + "67": 2657.0, + "68": 2662.0, + "69": 2736.0, + "70": 3139.0, + "71": 2943.0, + "72": 2293.0, + "73": 2908.0, + "74": 1887.0, + "75": 2519.0, + "76": 3060.0, + "77": 3191.0, + "78": 3211.0, + "79": 3081.0, + "80": 3205.0, + "81": 3563.0, + "82": 3201.0, + "83": 2614.0, + "84": 3162.0, + "85": 3209.0, + "86": 2660.0, + "87": 3729.0, + "88": 3002.0, + "89": 3160.0, + "90": 3168.0, + "91": 2753.0, + "92": 3258.0, + "93": 2617.0, + "94": 3341.0, + "95": 3261.0, + "96": 3370.0, + "97": 3163.0, + "98": 3566.0, + "99": 3179.0, + "100": 3135.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 763220480.0, + "2": 763220480.0, + "3": 763220480.0, + "4": 763220480.0, + "5": 763220480.0, + "6": 763220480.0, + "7": 763220480.0, + "8": 763220480.0, + "9": 763220480.0, + "10": 763220480.0, + "11": 763220480.0, + "12": 763220480.0, + "13": 763220480.0, + "14": 763220480.0, + "15": 763220480.0, + "16": 763220480.0, + "17": 763220480.0, + "18": 763220480.0, + "19": 763220480.0, + "20": 763220480.0, + "21": 763220480.0, + "22": 763220480.0, + "23": 763220480.0, + "24": 763220480.0, + "25": 763220480.0, + "26": 763220480.0, + "27": 763220480.0, + "28": 763220480.0, + "29": 763220480.0, + "30": 763220480.0, + "31": 763220480.0, + "32": 763220480.0, + "33": 763220480.0, + "34": 763220480.0, + "35": 763220480.0, + "36": 763220480.0, + "37": 763220480.0, + "38": 763220480.0, + "39": 763220480.0, + "40": 763220480.0, + "41": 763220480.0, + "42": 763220480.0, + "43": 763220480.0, + "44": 763220480.0, + "45": 763220480.0, + "46": 763220480.0, + "47": 763220480.0, + "48": 763220480.0, + "49": 763220480.0, + "50": 763220480.0, + "51": 763220480.0, + "52": 763220480.0, + "53": 763220480.0, + "54": 763220480.0, + "55": 763220480.0, + "56": 763220480.0, + "57": 763220480.0, + "58": 763220480.0, + "59": 763220480.0, + "60": 763220480.0, + "61": 763220480.0, + "62": 763220480.0, + "63": 763220480.0, + "64": 763220480.0, + "65": 763220480.0, + "66": 763220480.0, + "67": 763220480.0, + "68": 763220480.0, + "69": 763220480.0, + "70": 763220480.0, + "71": 763220480.0, + "72": 763220480.0, + "73": 763220480.0, + "74": 763220480.0, + "75": 763220480.0, + "76": 763220480.0, + "77": 763220480.0, + "78": 763220480.0, + "79": 763220480.0, + "80": 763220480.0, + "81": 763220480.0, + "82": 763220480.0, + "83": 763220480.0, + "84": 763220480.0, + "85": 763220480.0, + "86": 763220480.0, + "87": 763220480.0, + "88": 763220480.0, + "89": 763220480.0, + "90": 763220480.0, + "91": 763220480.0, + "92": 763220480.0, + "93": 763220480.0, + "94": 763220480.0, + "95": 763220480.0, + "96": 763220480.0, + "97": 763220480.0, + "98": 763220480.0, + "99": 763220480.0, + "100": 763220480.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2359490560.0, + "2": 2643299328.0, + "3": 2643299328.0, + "4": 2643299328.0, + "5": 2643299328.0, + "6": 2643299328.0, + "7": 2643299328.0, + "8": 2643299328.0, + "9": 2643299328.0, + "10": 2643299328.0, + "11": 2643299328.0, + "12": 2643299328.0, + "13": 2643299328.0, + "14": 2643299328.0, + "15": 2643299328.0, + "16": 2643299328.0, + "17": 2643299328.0, + "18": 2643299328.0, + "19": 2643299328.0, + "20": 2643299328.0, + "21": 2643299328.0, + "22": 2643299328.0, + "23": 2643299328.0, + "24": 2643299328.0, + "25": 2643299328.0, + "26": 2643299328.0, + "27": 2643299328.0, + "28": 2643299328.0, + "29": 2643299328.0, + "30": 2643299328.0, + "31": 2643299328.0, + "32": 2643299328.0, + "33": 2643299328.0, + "34": 2643299328.0, + "35": 2643299328.0, + "36": 2643299328.0, + "37": 2643299328.0, + "38": 2643299328.0, + "39": 2643299328.0, + "40": 2643299328.0, + "41": 2643299328.0, + "42": 2643299328.0, + "43": 2643299328.0, + "44": 2643299328.0, + "45": 2643299328.0, + "46": 2643299328.0, + "47": 2643299328.0, + "48": 2643299328.0, + "49": 2643299328.0, + "50": 2643299328.0, + "51": 2643299328.0, + "52": 2643299328.0, + "53": 2643299328.0, + "54": 2643299328.0, + "55": 2643299328.0, + "56": 2643299328.0, + "57": 2643299328.0, + "58": 2643299328.0, + "59": 2643299328.0, + "60": 2643299328.0, + "61": 2643299328.0, + "62": 2643299328.0, + "63": 2643299328.0, + "64": 2643299328.0, + "65": 2643299328.0, + "66": 2643299328.0, + "67": 2643299328.0, + "68": 2643299328.0, + "69": 2643299328.0, + "70": 2643299328.0, + "71": 2643299328.0, + "72": 2643299328.0, + "73": 2643299328.0, + "74": 2643299328.0, + "75": 2643299328.0, + "76": 2643299328.0, + "77": 2643299328.0, + "78": 2643299328.0, + "79": 2643299328.0, + "80": 2643299328.0, + "81": 2643299328.0, + "82": 2643299328.0, + "83": 2643299328.0, + "84": 2643299328.0, + "85": 2643299328.0, + "86": 2643299328.0, + "87": 2643299328.0, + "88": 2643299328.0, + "89": 2643299328.0, + "90": 2643299328.0, + "91": 2643299328.0, + "92": 2643299328.0, + "93": 2643299328.0, + "94": 2643299328.0, + "95": 2643299328.0, + "96": 2643299328.0, + "97": 2643299328.0, + "98": 2643299328.0, + "99": 2643299328.0, + "100": 2643299328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.55882, + "2": 0.13655, + "3": 0.11858, + "4": 0.11941, + "5": 0.11739, + "6": 0.11681, + "7": 0.11862, + "8": 0.11921, + "9": 0.11665, + "10": 0.11215, + "11": 0.11312, + "12": 0.1133, + "13": 0.11518, + "14": 0.11608, + "15": 0.11464, + "16": 0.11376, + "17": 0.11276, + "18": 0.11015, + "19": 0.11044, + "20": 0.11079, + "21": 0.11474, + "22": 0.11541, + "23": 0.11297, + "24": 0.11166, + "25": 0.11284, + "26": 0.11199, + "27": 0.11465, + "28": 0.11372, + "29": 0.10904, + "30": 0.10993, + "31": 0.1098, + "32": 0.10938, + "33": 0.10814, + "34": 0.11037, + "35": 0.11052, + "36": 0.1106, + "37": 0.11033, + "38": 0.10993, + "39": 0.11259, + "40": 0.11019, + "41": 0.11104, + "42": 0.10843, + "43": 0.10994, + "44": 0.10984, + "45": 0.11066, + "46": 0.11026, + "47": 0.11119, + "48": 0.11328, + "49": 0.11122, + "50": 0.11048, + "51": 0.11634, + "52": 0.10989, + "53": 0.10877, + "54": 0.10843, + "55": 0.1103, + "56": 0.11044, + "57": 0.11032, + "58": 0.10904, + "59": 0.1093, + "60": 0.10814, + "61": 0.10768, + "62": 0.10827, + "63": 0.11047, + "64": 0.10921, + "65": 0.11011, + "66": 0.11245, + "67": 0.10798, + "68": 0.11072, + "69": 0.10966, + "70": 0.10787, + "71": 0.10889, + "72": 0.10915, + "73": 0.10943, + "74": 0.11136, + "75": 0.11012, + "76": 0.11056, + "77": 0.1092, + "78": 0.11055, + "79": 0.11067, + "80": 0.11178, + "81": 0.11295, + "82": 0.11012, + "83": 0.11251, + "84": 0.11453, + "85": 0.11392, + "86": 0.1136, + "87": 0.10936, + "88": 0.10748, + "89": 0.109, + "90": 0.10971, + "91": 0.10877, + "92": 0.1101, + "93": 0.11367, + "94": 0.11157, + "95": 0.11149, + "96": 0.10884, + "97": 0.10884, + "98": 0.10766, + "99": 0.10924, + "100": 0.10913 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..14b95ca2ef5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86468, + "10": 10.82907, + "11": 10.88789, + "12": 10.87095, + "13": 10.87916, + "14": 10.89079, + "15": 10.81974, + "16": 10.83162, + "17": 10.79863, + "18": 10.81667, + "19": 10.81919, + "20": 10.727, + "21": 10.70594, + "22": 10.56364, + "23": 10.72802, + "24": 10.60832, + "25": 10.55217, + "26": 10.60845, + "27": 10.62847, + "28": 10.5831, + "29": 10.60012, + "30": 10.36614, + "31": 10.12044, + "32": 10.47684, + "33": 10.46873, + "34": 10.22319, + "35": 10.2778, + "36": 10.22892, + "37": 10.35949, + "38": 10.19371, + "39": 10.4155, + "40": 10.0976, + "41": 10.15737, + "42": 10.22396, + "43": 9.83286, + "44": 9.96916, + "45": 9.84196, + "46": 9.83045, + "47": 10.15628, + "48": 9.85484, + "49": 9.54086, + "50": 9.9125, + "51": 9.8587, + "52": 9.74287, + "53": 10.06647, + "54": 9.95168, + "55": 9.88096, + "56": 9.62625, + "57": 9.47766, + "58": 9.8335, + "59": 9.58522, + "60": 9.50125, + "61": 9.69186, + "62": 9.98858, + "63": 9.38478, + "64": 9.78027, + "65": 8.94761, + "66": 9.70857, + "67": 9.36847, + "68": 9.78438, + "69": 9.79407, + "70": 9.7424, + "71": 9.61808, + "72": 9.58427, + "73": 9.50347, + "74": 8.9422, + "75": 9.42532, + "76": 9.07407, + "77": 10.06351, + "78": 9.7208, + "79": 9.37296, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30711, + "84": 9.41712, + "85": 9.61405, + "86": 9.07618, + "87": 9.59088, + "88": 9.7464, + "89": 9.59987, + "90": 9.81418, + "91": 9.33775, + "92": 9.35372, + "93": 9.07397, + "94": 8.8317, + "95": 9.5173, + "96": 9.52412, + "97": 9.30995, + "98": 9.66807, + "99": 8.8859, + "100": 9.39541 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1678.0, + "9": 1828.0, + "10": 1448.0, + "11": 1890.0, + "12": 1657.0, + "13": 1852.0, + "14": 1717.0, + "15": 1879.0, + "16": 1921.0, + "17": 1666.0, + "18": 1729.0, + "19": 1767.0, + "20": 1657.0, + "21": 1827.0, + "22": 1594.0, + "23": 1918.0, + "24": 1622.0, + "25": 1625.0, + "26": 1649.0, + "27": 1788.0, + "28": 2030.0, + "29": 1980.0, + "30": 1882.0, + "31": 1564.0, + "32": 1918.0, + "33": 2045.0, + "34": 1884.0, + "35": 1954.0, + "36": 1910.0, + "37": 2267.0, + "38": 2195.0, + "39": 2346.0, + "40": 2191.0, + "41": 2171.0, + "42": 2246.0, + "43": 1997.0, + "44": 2156.0, + "45": 2091.0, + "46": 2439.0, + "47": 2539.0, + "48": 2418.0, + "49": 2207.0, + "50": 2189.0, + "51": 2608.0, + "52": 2444.0, + "53": 2898.0, + "54": 2664.0, + "55": 2325.0, + "56": 2614.0, + "57": 2394.0, + "58": 2812.0, + "59": 2771.0, + "60": 2361.0, + "61": 2855.0, + "62": 2675.0, + "63": 2393.0, + "64": 3014.0, + "65": 2673.0, + "66": 3051.0, + "67": 2657.0, + "68": 2662.0, + "69": 2736.0, + "70": 3139.0, + "71": 2943.0, + "72": 2293.0, + "73": 2908.0, + "74": 1887.0, + "75": 2519.0, + "76": 3060.0, + "77": 3191.0, + "78": 3211.0, + "79": 3081.0, + "80": 3205.0, + "81": 3563.0, + "82": 3201.0, + "83": 2614.0, + "84": 3162.0, + "85": 3209.0, + "86": 2660.0, + "87": 3729.0, + "88": 3002.0, + "89": 3160.0, + "90": 3168.0, + "91": 2753.0, + "92": 3258.0, + "93": 2617.0, + "94": 3341.0, + "95": 3261.0, + "96": 3370.0, + "97": 3163.0, + "98": 3566.0, + "99": 3179.0, + "100": 3135.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 763220480.0, + "2": 763220480.0, + "3": 763220480.0, + "4": 763220480.0, + "5": 763220480.0, + "6": 763220480.0, + "7": 763220480.0, + "8": 763220480.0, + "9": 763220480.0, + "10": 763220480.0, + "11": 763220480.0, + "12": 763220480.0, + "13": 763220480.0, + "14": 763220480.0, + "15": 763220480.0, + "16": 763220480.0, + "17": 763220480.0, + "18": 763220480.0, + "19": 763220480.0, + "20": 763220480.0, + "21": 763220480.0, + "22": 763220480.0, + "23": 763220480.0, + "24": 763220480.0, + "25": 763220480.0, + "26": 763220480.0, + "27": 763220480.0, + "28": 763220480.0, + "29": 763220480.0, + "30": 763220480.0, + "31": 763220480.0, + "32": 763220480.0, + "33": 763220480.0, + "34": 763220480.0, + "35": 763220480.0, + "36": 763220480.0, + "37": 763220480.0, + "38": 763220480.0, + "39": 763220480.0, + "40": 763220480.0, + "41": 763220480.0, + "42": 763220480.0, + "43": 763220480.0, + "44": 763220480.0, + "45": 763220480.0, + "46": 763220480.0, + "47": 763220480.0, + "48": 763220480.0, + "49": 763220480.0, + "50": 763220480.0, + "51": 763220480.0, + "52": 763220480.0, + "53": 763220480.0, + "54": 763220480.0, + "55": 763220480.0, + "56": 763220480.0, + "57": 763220480.0, + "58": 763220480.0, + "59": 763220480.0, + "60": 763220480.0, + "61": 763220480.0, + "62": 763220480.0, + "63": 763220480.0, + "64": 763220480.0, + "65": 763220480.0, + "66": 763220480.0, + "67": 763220480.0, + "68": 763220480.0, + "69": 763220480.0, + "70": 763220480.0, + "71": 763220480.0, + "72": 763220480.0, + "73": 763220480.0, + "74": 763220480.0, + "75": 763220480.0, + "76": 763220480.0, + "77": 763220480.0, + "78": 763220480.0, + "79": 763220480.0, + "80": 763220480.0, + "81": 763220480.0, + "82": 763220480.0, + "83": 763220480.0, + "84": 763220480.0, + "85": 763220480.0, + "86": 763220480.0, + "87": 763220480.0, + "88": 763220480.0, + "89": 763220480.0, + "90": 763220480.0, + "91": 763220480.0, + "92": 763220480.0, + "93": 763220480.0, + "94": 763220480.0, + "95": 763220480.0, + "96": 763220480.0, + "97": 763220480.0, + "98": 763220480.0, + "99": 763220480.0, + "100": 763220480.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2359490560.0, + "2": 2643299328.0, + "3": 2643299328.0, + "4": 2643299328.0, + "5": 2643299328.0, + "6": 2643299328.0, + "7": 2643299328.0, + "8": 2643299328.0, + "9": 2643299328.0, + "10": 2643299328.0, + "11": 2643299328.0, + "12": 2643299328.0, + "13": 2643299328.0, + "14": 2643299328.0, + "15": 2643299328.0, + "16": 2643299328.0, + "17": 2643299328.0, + "18": 2643299328.0, + "19": 2643299328.0, + "20": 2643299328.0, + "21": 2643299328.0, + "22": 2643299328.0, + "23": 2643299328.0, + "24": 2643299328.0, + "25": 2643299328.0, + "26": 2643299328.0, + "27": 2643299328.0, + "28": 2643299328.0, + "29": 2643299328.0, + "30": 2643299328.0, + "31": 2643299328.0, + "32": 2643299328.0, + "33": 2643299328.0, + "34": 2643299328.0, + "35": 2643299328.0, + "36": 2643299328.0, + "37": 2643299328.0, + "38": 2643299328.0, + "39": 2643299328.0, + "40": 2643299328.0, + "41": 2643299328.0, + "42": 2643299328.0, + "43": 2643299328.0, + "44": 2643299328.0, + "45": 2643299328.0, + "46": 2643299328.0, + "47": 2643299328.0, + "48": 2643299328.0, + "49": 2643299328.0, + "50": 2643299328.0, + "51": 2643299328.0, + "52": 2643299328.0, + "53": 2643299328.0, + "54": 2643299328.0, + "55": 2643299328.0, + "56": 2643299328.0, + "57": 2643299328.0, + "58": 2643299328.0, + "59": 2643299328.0, + "60": 2643299328.0, + "61": 2643299328.0, + "62": 2643299328.0, + "63": 2643299328.0, + "64": 2643299328.0, + "65": 2643299328.0, + "66": 2643299328.0, + "67": 2643299328.0, + "68": 2643299328.0, + "69": 2643299328.0, + "70": 2643299328.0, + "71": 2643299328.0, + "72": 2643299328.0, + "73": 2643299328.0, + "74": 2643299328.0, + "75": 2643299328.0, + "76": 2643299328.0, + "77": 2643299328.0, + "78": 2643299328.0, + "79": 2643299328.0, + "80": 2643299328.0, + "81": 2643299328.0, + "82": 2643299328.0, + "83": 2643299328.0, + "84": 2643299328.0, + "85": 2643299328.0, + "86": 2643299328.0, + "87": 2643299328.0, + "88": 2643299328.0, + "89": 2643299328.0, + "90": 2643299328.0, + "91": 2643299328.0, + "92": 2643299328.0, + "93": 2643299328.0, + "94": 2643299328.0, + "95": 2643299328.0, + "96": 2643299328.0, + "97": 2643299328.0, + "98": 2643299328.0, + "99": 2643299328.0, + "100": 2643299328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.57994, + "2": 0.13128, + "3": 0.10309, + "4": 0.10229, + "5": 0.10072, + "6": 0.09862, + "7": 0.10136, + "8": 0.10155, + "9": 0.10115, + "10": 0.09973, + "11": 0.10272, + "12": 0.10529, + "13": 0.10516, + "14": 0.10397, + "15": 0.10407, + "16": 0.10362, + "17": 0.10333, + "18": 0.10307, + "19": 0.10283, + "20": 0.09949, + "21": 0.09817, + "22": 0.1027, + "23": 0.10231, + "24": 0.10218, + "25": 0.10307, + "26": 0.10424, + "27": 0.10183, + "28": 0.10321, + "29": 0.10228, + "30": 0.10178, + "31": 0.10491, + "32": 0.10267, + "33": 0.10205, + "34": 0.10154, + "35": 0.10239, + "36": 0.10188, + "37": 0.10547, + "38": 0.10217, + "39": 0.10273, + "40": 0.09793, + "41": 0.09773, + "42": 0.09752, + "43": 0.09866, + "44": 0.0975, + "45": 0.09867, + "46": 0.09876, + "47": 0.09929, + "48": 0.09909, + "49": 0.101, + "50": 0.0978, + "51": 0.10715, + "52": 0.10113, + "53": 0.10133, + "54": 0.10021, + "55": 0.10053, + "56": 0.10041, + "57": 0.10033, + "58": 0.10121, + "59": 0.09846, + "60": 0.09725, + "61": 0.09803, + "62": 0.09772, + "63": 0.09712, + "64": 0.10005, + "65": 0.09924, + "66": 0.09828, + "67": 0.09806, + "68": 0.09771, + "69": 0.103, + "70": 0.10104, + "71": 0.10088, + "72": 0.1012, + "73": 0.10067, + "74": 0.1036, + "75": 0.09878, + "76": 0.10012, + "77": 0.09887, + "78": 0.09891, + "79": 0.09932, + "80": 0.09828, + "81": 0.1, + "82": 0.10177, + "83": 0.09881, + "84": 0.09963, + "85": 0.09854, + "86": 0.09886, + "87": 0.10179, + "88": 0.10085, + "89": 0.10134, + "90": 0.1035, + "91": 0.10105, + "92": 0.10027, + "93": 0.10157, + "94": 0.10164, + "95": 0.10203, + "96": 0.09929, + "97": 0.10135, + "98": 0.10191, + "99": 0.10128, + "100": 0.1009 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..f0d9be9be9d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86468, + "10": 10.82907, + "11": 10.88789, + "12": 10.87095, + "13": 10.87916, + "14": 10.89079, + "15": 10.81974, + "16": 10.83162, + "17": 10.79863, + "18": 10.81667, + "19": 10.81919, + "20": 10.727, + "21": 10.70594, + "22": 10.56364, + "23": 10.72802, + "24": 10.60832, + "25": 10.55217, + "26": 10.60845, + "27": 10.62847, + "28": 10.5831, + "29": 10.60012, + "30": 10.36614, + "31": 10.12044, + "32": 10.47684, + "33": 10.46873, + "34": 10.22319, + "35": 10.2778, + "36": 10.22892, + "37": 10.35949, + "38": 10.19371, + "39": 10.4155, + "40": 10.0976, + "41": 10.15737, + "42": 10.22396, + "43": 9.83286, + "44": 9.96916, + "45": 9.84196, + "46": 9.83045, + "47": 10.15628, + "48": 9.85484, + "49": 9.54086, + "50": 9.9125, + "51": 9.8587, + "52": 9.74287, + "53": 10.06647, + "54": 9.95168, + "55": 9.88096, + "56": 9.62625, + "57": 9.47766, + "58": 9.8335, + "59": 9.58522, + "60": 9.50125, + "61": 9.69186, + "62": 9.98858, + "63": 9.38478, + "64": 9.78027, + "65": 8.94761, + "66": 9.70857, + "67": 9.36847, + "68": 9.78438, + "69": 9.79407, + "70": 9.7424, + "71": 9.61808, + "72": 9.58427, + "73": 9.50347, + "74": 8.9422, + "75": 9.42532, + "76": 9.07407, + "77": 10.06351, + "78": 9.7208, + "79": 9.37296, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30711, + "84": 9.41712, + "85": 9.61405, + "86": 9.07618, + "87": 9.59088, + "88": 9.7464, + "89": 9.59987, + "90": 9.81418, + "91": 9.33775, + "92": 9.35372, + "93": 9.07397, + "94": 8.8317, + "95": 9.5173, + "96": 9.52412, + "97": 9.30995, + "98": 9.66807, + "99": 8.8859, + "100": 9.39541 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1678.0, + "9": 1828.0, + "10": 1448.0, + "11": 1890.0, + "12": 1657.0, + "13": 1852.0, + "14": 1717.0, + "15": 1879.0, + "16": 1921.0, + "17": 1666.0, + "18": 1729.0, + "19": 1767.0, + "20": 1657.0, + "21": 1827.0, + "22": 1594.0, + "23": 1918.0, + "24": 1622.0, + "25": 1625.0, + "26": 1649.0, + "27": 1788.0, + "28": 2030.0, + "29": 1980.0, + "30": 1882.0, + "31": 1564.0, + "32": 1918.0, + "33": 2045.0, + "34": 1884.0, + "35": 1954.0, + "36": 1910.0, + "37": 2267.0, + "38": 2195.0, + "39": 2346.0, + "40": 2191.0, + "41": 2171.0, + "42": 2246.0, + "43": 1997.0, + "44": 2156.0, + "45": 2091.0, + "46": 2439.0, + "47": 2539.0, + "48": 2418.0, + "49": 2207.0, + "50": 2189.0, + "51": 2608.0, + "52": 2444.0, + "53": 2898.0, + "54": 2664.0, + "55": 2325.0, + "56": 2614.0, + "57": 2394.0, + "58": 2812.0, + "59": 2771.0, + "60": 2361.0, + "61": 2855.0, + "62": 2675.0, + "63": 2393.0, + "64": 3014.0, + "65": 2673.0, + "66": 3051.0, + "67": 2657.0, + "68": 2662.0, + "69": 2736.0, + "70": 3139.0, + "71": 2943.0, + "72": 2293.0, + "73": 2908.0, + "74": 1887.0, + "75": 2519.0, + "76": 3060.0, + "77": 3191.0, + "78": 3211.0, + "79": 3081.0, + "80": 3205.0, + "81": 3563.0, + "82": 3201.0, + "83": 2614.0, + "84": 3162.0, + "85": 3209.0, + "86": 2660.0, + "87": 3729.0, + "88": 3002.0, + "89": 3160.0, + "90": 3168.0, + "91": 2753.0, + "92": 3258.0, + "93": 2617.0, + "94": 3341.0, + "95": 3261.0, + "96": 3370.0, + "97": 3163.0, + "98": 3566.0, + "99": 3179.0, + "100": 3135.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 763220480.0, + "2": 763220480.0, + "3": 763220480.0, + "4": 763220480.0, + "5": 763220480.0, + "6": 763220480.0, + "7": 763220480.0, + "8": 763220480.0, + "9": 763220480.0, + "10": 763220480.0, + "11": 763220480.0, + "12": 763220480.0, + "13": 763220480.0, + "14": 763220480.0, + "15": 763220480.0, + "16": 763220480.0, + "17": 763220480.0, + "18": 763220480.0, + "19": 763220480.0, + "20": 763220480.0, + "21": 763220480.0, + "22": 763220480.0, + "23": 763220480.0, + "24": 763220480.0, + "25": 763220480.0, + "26": 763220480.0, + "27": 763220480.0, + "28": 763220480.0, + "29": 763220480.0, + "30": 763220480.0, + "31": 763220480.0, + "32": 763220480.0, + "33": 763220480.0, + "34": 763220480.0, + "35": 763220480.0, + "36": 763220480.0, + "37": 763220480.0, + "38": 763220480.0, + "39": 763220480.0, + "40": 763220480.0, + "41": 763220480.0, + "42": 763220480.0, + "43": 763220480.0, + "44": 763220480.0, + "45": 763220480.0, + "46": 763220480.0, + "47": 763220480.0, + "48": 763220480.0, + "49": 763220480.0, + "50": 763220480.0, + "51": 763220480.0, + "52": 763220480.0, + "53": 763220480.0, + "54": 763220480.0, + "55": 763220480.0, + "56": 763220480.0, + "57": 763220480.0, + "58": 763220480.0, + "59": 763220480.0, + "60": 763220480.0, + "61": 763220480.0, + "62": 763220480.0, + "63": 763220480.0, + "64": 763220480.0, + "65": 763220480.0, + "66": 763220480.0, + "67": 763220480.0, + "68": 763220480.0, + "69": 763220480.0, + "70": 763220480.0, + "71": 763220480.0, + "72": 763220480.0, + "73": 763220480.0, + "74": 763220480.0, + "75": 763220480.0, + "76": 763220480.0, + "77": 763220480.0, + "78": 763220480.0, + "79": 763220480.0, + "80": 763220480.0, + "81": 763220480.0, + "82": 763220480.0, + "83": 763220480.0, + "84": 763220480.0, + "85": 763220480.0, + "86": 763220480.0, + "87": 763220480.0, + "88": 763220480.0, + "89": 763220480.0, + "90": 763220480.0, + "91": 763220480.0, + "92": 763220480.0, + "93": 763220480.0, + "94": 763220480.0, + "95": 763220480.0, + "96": 763220480.0, + "97": 763220480.0, + "98": 763220480.0, + "99": 763220480.0, + "100": 763220480.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2359490560.0, + "2": 2643299328.0, + "3": 2643299328.0, + "4": 2643299328.0, + "5": 2643299328.0, + "6": 2643299328.0, + "7": 2643299328.0, + "8": 2643299328.0, + "9": 2643299328.0, + "10": 2643299328.0, + "11": 2643299328.0, + "12": 2643299328.0, + "13": 2643299328.0, + "14": 2643299328.0, + "15": 2643299328.0, + "16": 2643299328.0, + "17": 2643299328.0, + "18": 2643299328.0, + "19": 2643299328.0, + "20": 2643299328.0, + "21": 2643299328.0, + "22": 2643299328.0, + "23": 2643299328.0, + "24": 2643299328.0, + "25": 2643299328.0, + "26": 2643299328.0, + "27": 2643299328.0, + "28": 2643299328.0, + "29": 2643299328.0, + "30": 2643299328.0, + "31": 2643299328.0, + "32": 2643299328.0, + "33": 2643299328.0, + "34": 2643299328.0, + "35": 2643299328.0, + "36": 2643299328.0, + "37": 2643299328.0, + "38": 2643299328.0, + "39": 2643299328.0, + "40": 2643299328.0, + "41": 2643299328.0, + "42": 2643299328.0, + "43": 2643299328.0, + "44": 2643299328.0, + "45": 2643299328.0, + "46": 2643299328.0, + "47": 2643299328.0, + "48": 2643299328.0, + "49": 2643299328.0, + "50": 2643299328.0, + "51": 2643299328.0, + "52": 2643299328.0, + "53": 2643299328.0, + "54": 2643299328.0, + "55": 2643299328.0, + "56": 2643299328.0, + "57": 2643299328.0, + "58": 2643299328.0, + "59": 2643299328.0, + "60": 2643299328.0, + "61": 2643299328.0, + "62": 2643299328.0, + "63": 2643299328.0, + "64": 2643299328.0, + "65": 2643299328.0, + "66": 2643299328.0, + "67": 2643299328.0, + "68": 2643299328.0, + "69": 2643299328.0, + "70": 2643299328.0, + "71": 2643299328.0, + "72": 2643299328.0, + "73": 2643299328.0, + "74": 2643299328.0, + "75": 2643299328.0, + "76": 2643299328.0, + "77": 2643299328.0, + "78": 2643299328.0, + "79": 2643299328.0, + "80": 2643299328.0, + "81": 2643299328.0, + "82": 2643299328.0, + "83": 2643299328.0, + "84": 2643299328.0, + "85": 2643299328.0, + "86": 2643299328.0, + "87": 2643299328.0, + "88": 2643299328.0, + "89": 2643299328.0, + "90": 2643299328.0, + "91": 2643299328.0, + "92": 2643299328.0, + "93": 2643299328.0, + "94": 2643299328.0, + "95": 2643299328.0, + "96": 2643299328.0, + "97": 2643299328.0, + "98": 2643299328.0, + "99": 2643299328.0, + "100": 2643299328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.57509, + "2": 0.1453, + "3": 0.11184, + "4": 0.11457, + "5": 0.12345, + "6": 0.12167, + "7": 0.12451, + "8": 0.11003, + "9": 0.11229, + "10": 0.11078, + "11": 0.11178, + "12": 0.11071, + "13": 0.11183, + "14": 0.1131, + "15": 0.11195, + "16": 0.11109, + "17": 0.11155, + "18": 0.11436, + "19": 0.11335, + "20": 0.11235, + "21": 0.11323, + "22": 0.11234, + "23": 0.1131, + "24": 0.11154, + "25": 0.11274, + "26": 0.11525, + "27": 0.11435, + "28": 0.11247, + "29": 0.11318, + "30": 0.11126, + "31": 0.11489, + "32": 0.11045, + "33": 0.1114, + "34": 0.11253, + "35": 0.11114, + "36": 0.114, + "37": 0.11201, + "38": 0.10979, + "39": 0.11069, + "40": 0.11078, + "41": 0.11142, + "42": 0.11091, + "43": 0.11324, + "44": 0.11151, + "45": 0.11295, + "46": 0.11174, + "47": 0.10954, + "48": 0.11083, + "49": 0.11195, + "50": 0.11251, + "51": 0.11627, + "52": 0.11199, + "53": 0.11127, + "54": 0.11464, + "55": 0.11072, + "56": 0.1136, + "57": 0.11119, + "58": 0.11025, + "59": 0.11083, + "60": 0.11126, + "61": 0.10968, + "62": 0.11104, + "63": 0.11515, + "64": 0.11136, + "65": 0.11454, + "66": 0.10994, + "67": 0.11003, + "68": 0.10997, + "69": 0.11155, + "70": 0.11002, + "71": 0.1121, + "72": 0.11334, + "73": 0.11221, + "74": 0.11542, + "75": 0.11082, + "76": 0.10997, + "77": 0.11087, + "78": 0.11222, + "79": 0.11343, + "80": 0.11462, + "81": 0.11272, + "82": 0.11293, + "83": 0.113, + "84": 0.11134, + "85": 0.11308, + "86": 0.11357, + "87": 0.11341, + "88": 0.11349, + "89": 0.11342, + "90": 0.11212, + "91": 0.11377, + "92": 0.11421, + "93": 0.1115, + "94": 0.11293, + "95": 0.11334, + "96": 0.11303, + "97": 0.11198, + "98": 0.11326, + "99": 0.11128, + "100": 0.1117 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..9bafb7796c5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81005, + "19": 10.79719, + "20": 10.69211, + "21": 10.68164, + "22": 10.52085, + "23": 10.70893, + "24": 10.57599, + "25": 10.52412, + "26": 10.59517, + "27": 10.58426, + "28": 10.56233, + "29": 10.57013, + "30": 10.34552, + "31": 10.10049, + "32": 10.45378, + "33": 10.44627, + "34": 10.20606, + "35": 10.26239, + "36": 10.21239, + "37": 10.32522, + "38": 10.16777, + "39": 10.38334, + "40": 10.07241, + "41": 10.13863, + "42": 10.19814, + "43": 9.81073, + "44": 9.93244, + "45": 9.81101, + "46": 9.80877, + "47": 10.12608, + "48": 9.82108, + "49": 9.50625, + "50": 9.88422, + "51": 9.83655, + "52": 9.72542, + "53": 10.04681, + "54": 9.93029, + "55": 9.86374, + "56": 9.60187, + "57": 9.4509, + "58": 9.80848, + "59": 9.56669, + "60": 9.47965, + "61": 9.67901, + "62": 9.96739, + "63": 9.35162, + "64": 9.75606, + "65": 8.93063, + "66": 9.68053, + "67": 9.35888, + "68": 9.76985, + "69": 9.77496, + "70": 9.71215, + "71": 9.60754, + "72": 9.57085, + "73": 9.48404, + "74": 8.92823, + "75": 9.40048, + "76": 9.07196, + "77": 10.05227, + "78": 9.71519, + "79": 9.35769, + "80": 9.39077, + "81": 9.46749, + "82": 9.68504, + "83": 9.29553, + "84": 9.40532, + "85": 9.60141, + "86": 9.06774, + "87": 9.585, + "88": 9.73363, + "89": 9.59519, + "90": 9.80501, + "91": 9.3255, + "92": 9.35331, + "93": 9.06981, + "94": 8.82231, + "95": 9.50816, + "96": 9.51534, + "97": 9.29772, + "98": 9.66202, + "99": 8.87692, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1752.0, + "18": 1669.0, + "19": 1722.0, + "20": 1601.0, + "21": 1900.0, + "22": 1662.0, + "23": 2006.0, + "24": 1597.0, + "25": 1635.0, + "26": 1709.0, + "27": 1931.0, + "28": 2043.0, + "29": 1888.0, + "30": 1936.0, + "31": 1550.0, + "32": 1913.0, + "33": 2135.0, + "34": 1703.0, + "35": 1908.0, + "36": 1953.0, + "37": 2291.0, + "38": 2210.0, + "39": 2334.0, + "40": 2100.0, + "41": 2300.0, + "42": 2236.0, + "43": 1897.0, + "44": 1993.0, + "45": 2098.0, + "46": 2298.0, + "47": 2504.0, + "48": 2356.0, + "49": 2268.0, + "50": 2333.0, + "51": 2487.0, + "52": 2422.0, + "53": 2969.0, + "54": 2698.0, + "55": 2260.0, + "56": 2773.0, + "57": 2153.0, + "58": 2903.0, + "59": 2750.0, + "60": 2399.0, + "61": 2943.0, + "62": 2646.0, + "63": 2470.0, + "64": 2952.0, + "65": 2656.0, + "66": 3077.0, + "67": 2683.0, + "68": 2841.0, + "69": 3047.0, + "70": 3077.0, + "71": 2947.0, + "72": 2446.0, + "73": 2719.0, + "74": 1886.0, + "75": 2547.0, + "76": 2983.0, + "77": 3150.0, + "78": 3223.0, + "79": 3085.0, + "80": 3315.0, + "81": 3695.0, + "82": 3285.0, + "83": 2818.0, + "84": 3328.0, + "85": 3371.0, + "86": 2574.0, + "87": 3733.0, + "88": 3046.0, + "89": 3195.0, + "90": 2943.0, + "91": 2825.0, + "92": 3086.0, + "93": 2711.0, + "94": 3416.0, + "95": 3457.0, + "96": 3408.0, + "97": 3161.0, + "98": 3616.0, + "99": 3374.0, + "100": 3292.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 733859840.0, + "2": 733859840.0, + "3": 733859840.0, + "4": 733859840.0, + "5": 733859840.0, + "6": 733859840.0, + "7": 733859840.0, + "8": 733859840.0, + "9": 733859840.0, + "10": 733859840.0, + "11": 733859840.0, + "12": 733859840.0, + "13": 733859840.0, + "14": 733859840.0, + "15": 733859840.0, + "16": 733859840.0, + "17": 733859840.0, + "18": 733859840.0, + "19": 733859840.0, + "20": 733859840.0, + "21": 733859840.0, + "22": 733859840.0, + "23": 733859840.0, + "24": 733859840.0, + "25": 733859840.0, + "26": 733859840.0, + "27": 733859840.0, + "28": 733859840.0, + "29": 733859840.0, + "30": 733859840.0, + "31": 733859840.0, + "32": 733859840.0, + "33": 733859840.0, + "34": 733859840.0, + "35": 733859840.0, + "36": 733859840.0, + "37": 733859840.0, + "38": 733859840.0, + "39": 733859840.0, + "40": 733859840.0, + "41": 733859840.0, + "42": 733859840.0, + "43": 733859840.0, + "44": 733859840.0, + "45": 733859840.0, + "46": 733859840.0, + "47": 733859840.0, + "48": 733859840.0, + "49": 733859840.0, + "50": 733859840.0, + "51": 733859840.0, + "52": 733859840.0, + "53": 733859840.0, + "54": 733859840.0, + "55": 733859840.0, + "56": 733859840.0, + "57": 733859840.0, + "58": 733859840.0, + "59": 733859840.0, + "60": 733859840.0, + "61": 733859840.0, + "62": 733859840.0, + "63": 733859840.0, + "64": 733859840.0, + "65": 733859840.0, + "66": 733859840.0, + "67": 733859840.0, + "68": 733859840.0, + "69": 733859840.0, + "70": 733859840.0, + "71": 733859840.0, + "72": 733859840.0, + "73": 733859840.0, + "74": 733859840.0, + "75": 733859840.0, + "76": 733859840.0, + "77": 733859840.0, + "78": 733859840.0, + "79": 733859840.0, + "80": 733859840.0, + "81": 733859840.0, + "82": 733859840.0, + "83": 733859840.0, + "84": 733859840.0, + "85": 733859840.0, + "86": 733859840.0, + "87": 733859840.0, + "88": 733859840.0, + "89": 733859840.0, + "90": 733859840.0, + "91": 733859840.0, + "92": 733859840.0, + "93": 733859840.0, + "94": 733859840.0, + "95": 733859840.0, + "96": 733859840.0, + "97": 733859840.0, + "98": 733859840.0, + "99": 733859840.0, + "100": 733859840.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3838895104.0, + "2": 4122703872.0, + "3": 4122703872.0, + "4": 4122703872.0, + "5": 4122703872.0, + "6": 4122703872.0, + "7": 4122703872.0, + "8": 4122703872.0, + "9": 4122703872.0, + "10": 4122703872.0, + "11": 4122703872.0, + "12": 4122703872.0, + "13": 4122703872.0, + "14": 4122703872.0, + "15": 4122703872.0, + "16": 4122703872.0, + "17": 4122703872.0, + "18": 4122703872.0, + "19": 4122703872.0, + "20": 4122703872.0, + "21": 4122703872.0, + "22": 4122703872.0, + "23": 4122703872.0, + "24": 4122703872.0, + "25": 4122703872.0, + "26": 4122703872.0, + "27": 4122703872.0, + "28": 4122703872.0, + "29": 4122703872.0, + "30": 4122703872.0, + "31": 4122703872.0, + "32": 4122703872.0, + "33": 4122703872.0, + "34": 4122703872.0, + "35": 4122703872.0, + "36": 4122703872.0, + "37": 4122703872.0, + "38": 4122703872.0, + "39": 4122703872.0, + "40": 4122703872.0, + "41": 4122703872.0, + "42": 4122703872.0, + "43": 4122703872.0, + "44": 4122703872.0, + "45": 4122703872.0, + "46": 4122703872.0, + "47": 4122703872.0, + "48": 4122703872.0, + "49": 4122703872.0, + "50": 4122703872.0, + "51": 4122703872.0, + "52": 4122703872.0, + "53": 4122703872.0, + "54": 4122703872.0, + "55": 4122703872.0, + "56": 4122703872.0, + "57": 4122703872.0, + "58": 4122703872.0, + "59": 4122703872.0, + "60": 4122703872.0, + "61": 4122703872.0, + "62": 4122703872.0, + "63": 4122703872.0, + "64": 4122703872.0, + "65": 4122703872.0, + "66": 4122703872.0, + "67": 4122703872.0, + "68": 4122703872.0, + "69": 4122703872.0, + "70": 4122703872.0, + "71": 4122703872.0, + "72": 4122703872.0, + "73": 4122703872.0, + "74": 4122703872.0, + "75": 4122703872.0, + "76": 4122703872.0, + "77": 4122703872.0, + "78": 4122703872.0, + "79": 4122703872.0, + "80": 4122703872.0, + "81": 4122703872.0, + "82": 4122703872.0, + "83": 4122703872.0, + "84": 4122703872.0, + "85": 4122703872.0, + "86": 4122703872.0, + "87": 4122703872.0, + "88": 4122703872.0, + "89": 4122703872.0, + "90": 4122703872.0, + "91": 4122703872.0, + "92": 4122703872.0, + "93": 4122703872.0, + "94": 4122703872.0, + "95": 4122703872.0, + "96": 4122703872.0, + "97": 4122703872.0, + "98": 4122703872.0, + "99": 4122703872.0, + "100": 4122703872.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.74392, + "2": 0.20458, + "3": 0.17337, + "4": 0.17372, + "5": 0.17406, + "6": 0.17407, + "7": 0.1701, + "8": 0.1709, + "9": 0.17096, + "10": 0.17284, + "11": 0.17356, + "12": 0.17143, + "13": 0.17133, + "14": 0.17078, + "15": 0.17163, + "16": 0.17206, + "17": 0.17227, + "18": 0.1714, + "19": 0.17121, + "20": 0.17143, + "21": 0.17086, + "22": 0.17241, + "23": 0.17251, + "24": 0.17165, + "25": 0.17082, + "26": 0.17042, + "27": 0.1695, + "28": 0.17064, + "29": 0.17259, + "30": 0.17056, + "31": 0.17093, + "32": 0.16764, + "33": 0.1668, + "34": 0.16801, + "35": 0.1684, + "36": 0.1676, + "37": 0.16666, + "38": 0.16729, + "39": 0.16578, + "40": 0.16707, + "41": 0.16873, + "42": 0.16705, + "43": 0.16817, + "44": 0.16766, + "45": 0.16793, + "46": 0.16745, + "47": 0.16825, + "48": 0.16561, + "49": 0.16693, + "50": 0.167, + "51": 0.17408, + "52": 0.17381, + "53": 0.17359, + "54": 0.17167, + "55": 0.17219, + "56": 0.17329, + "57": 0.17468, + "58": 0.17336, + "59": 0.17436, + "60": 0.17289, + "61": 0.17216, + "62": 0.17277, + "63": 0.17306, + "64": 0.17382, + "65": 0.17362, + "66": 0.1721, + "67": 0.17256, + "68": 0.17189, + "69": 0.17201, + "70": 0.17356, + "71": 0.1728, + "72": 0.17241, + "73": 0.17349, + "74": 0.17357, + "75": 0.17454, + "76": 0.17395, + "77": 0.17253, + "78": 0.17295, + "79": 0.17219, + "80": 0.1746, + "81": 0.17297, + "82": 0.1742, + "83": 0.17306, + "84": 0.17236, + "85": 0.17328, + "86": 0.17434, + "87": 0.17285, + "88": 0.17502, + "89": 0.17257, + "90": 0.1726, + "91": 0.17295, + "92": 0.17284, + "93": 0.17452, + "94": 0.17398, + "95": 0.17312, + "96": 0.1727, + "97": 0.17207, + "98": 0.17436, + "99": 0.17586, + "100": 0.17341 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..e0f27834c5c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81005, + "19": 10.79719, + "20": 10.69211, + "21": 10.68164, + "22": 10.52085, + "23": 10.70893, + "24": 10.57599, + "25": 10.52412, + "26": 10.59517, + "27": 10.58426, + "28": 10.56233, + "29": 10.57013, + "30": 10.34552, + "31": 10.10049, + "32": 10.45378, + "33": 10.44627, + "34": 10.20606, + "35": 10.26239, + "36": 10.21239, + "37": 10.32522, + "38": 10.16777, + "39": 10.38334, + "40": 10.07241, + "41": 10.13863, + "42": 10.19814, + "43": 9.81073, + "44": 9.93244, + "45": 9.81101, + "46": 9.80877, + "47": 10.12608, + "48": 9.82108, + "49": 9.50625, + "50": 9.88422, + "51": 9.83655, + "52": 9.72542, + "53": 10.04681, + "54": 9.93029, + "55": 9.86374, + "56": 9.60187, + "57": 9.4509, + "58": 9.80848, + "59": 9.56669, + "60": 9.47965, + "61": 9.67901, + "62": 9.96739, + "63": 9.35162, + "64": 9.75606, + "65": 8.93063, + "66": 9.68053, + "67": 9.35888, + "68": 9.76985, + "69": 9.77496, + "70": 9.71215, + "71": 9.60754, + "72": 9.57085, + "73": 9.48404, + "74": 8.92823, + "75": 9.40048, + "76": 9.07196, + "77": 10.05227, + "78": 9.71519, + "79": 9.35769, + "80": 9.39077, + "81": 9.46749, + "82": 9.68504, + "83": 9.29553, + "84": 9.40532, + "85": 9.60141, + "86": 9.06774, + "87": 9.585, + "88": 9.73363, + "89": 9.59519, + "90": 9.80501, + "91": 9.3255, + "92": 9.35331, + "93": 9.06981, + "94": 8.82231, + "95": 9.50816, + "96": 9.51534, + "97": 9.29772, + "98": 9.66202, + "99": 8.87692, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1752.0, + "18": 1669.0, + "19": 1722.0, + "20": 1601.0, + "21": 1900.0, + "22": 1662.0, + "23": 2006.0, + "24": 1597.0, + "25": 1635.0, + "26": 1709.0, + "27": 1931.0, + "28": 2043.0, + "29": 1888.0, + "30": 1936.0, + "31": 1550.0, + "32": 1913.0, + "33": 2135.0, + "34": 1703.0, + "35": 1908.0, + "36": 1953.0, + "37": 2291.0, + "38": 2210.0, + "39": 2334.0, + "40": 2100.0, + "41": 2300.0, + "42": 2236.0, + "43": 1897.0, + "44": 1993.0, + "45": 2098.0, + "46": 2298.0, + "47": 2504.0, + "48": 2356.0, + "49": 2268.0, + "50": 2333.0, + "51": 2487.0, + "52": 2422.0, + "53": 2969.0, + "54": 2698.0, + "55": 2260.0, + "56": 2773.0, + "57": 2153.0, + "58": 2903.0, + "59": 2750.0, + "60": 2399.0, + "61": 2943.0, + "62": 2646.0, + "63": 2470.0, + "64": 2952.0, + "65": 2656.0, + "66": 3077.0, + "67": 2683.0, + "68": 2841.0, + "69": 3047.0, + "70": 3077.0, + "71": 2947.0, + "72": 2446.0, + "73": 2719.0, + "74": 1886.0, + "75": 2547.0, + "76": 2983.0, + "77": 3150.0, + "78": 3223.0, + "79": 3085.0, + "80": 3315.0, + "81": 3695.0, + "82": 3285.0, + "83": 2818.0, + "84": 3328.0, + "85": 3371.0, + "86": 2574.0, + "87": 3733.0, + "88": 3046.0, + "89": 3195.0, + "90": 2943.0, + "91": 2825.0, + "92": 3086.0, + "93": 2711.0, + "94": 3416.0, + "95": 3457.0, + "96": 3408.0, + "97": 3161.0, + "98": 3616.0, + "99": 3374.0, + "100": 3292.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 733859840.0, + "2": 733859840.0, + "3": 733859840.0, + "4": 733859840.0, + "5": 733859840.0, + "6": 733859840.0, + "7": 733859840.0, + "8": 733859840.0, + "9": 733859840.0, + "10": 733859840.0, + "11": 733859840.0, + "12": 733859840.0, + "13": 733859840.0, + "14": 733859840.0, + "15": 733859840.0, + "16": 733859840.0, + "17": 733859840.0, + "18": 733859840.0, + "19": 733859840.0, + "20": 733859840.0, + "21": 733859840.0, + "22": 733859840.0, + "23": 733859840.0, + "24": 733859840.0, + "25": 733859840.0, + "26": 733859840.0, + "27": 733859840.0, + "28": 733859840.0, + "29": 733859840.0, + "30": 733859840.0, + "31": 733859840.0, + "32": 733859840.0, + "33": 733859840.0, + "34": 733859840.0, + "35": 733859840.0, + "36": 733859840.0, + "37": 733859840.0, + "38": 733859840.0, + "39": 733859840.0, + "40": 733859840.0, + "41": 733859840.0, + "42": 733859840.0, + "43": 733859840.0, + "44": 733859840.0, + "45": 733859840.0, + "46": 733859840.0, + "47": 733859840.0, + "48": 733859840.0, + "49": 733859840.0, + "50": 733859840.0, + "51": 733859840.0, + "52": 733859840.0, + "53": 733859840.0, + "54": 733859840.0, + "55": 733859840.0, + "56": 733859840.0, + "57": 733859840.0, + "58": 733859840.0, + "59": 733859840.0, + "60": 733859840.0, + "61": 733859840.0, + "62": 733859840.0, + "63": 733859840.0, + "64": 733859840.0, + "65": 733859840.0, + "66": 733859840.0, + "67": 733859840.0, + "68": 733859840.0, + "69": 733859840.0, + "70": 733859840.0, + "71": 733859840.0, + "72": 733859840.0, + "73": 733859840.0, + "74": 733859840.0, + "75": 733859840.0, + "76": 733859840.0, + "77": 733859840.0, + "78": 733859840.0, + "79": 733859840.0, + "80": 733859840.0, + "81": 733859840.0, + "82": 733859840.0, + "83": 733859840.0, + "84": 733859840.0, + "85": 733859840.0, + "86": 733859840.0, + "87": 733859840.0, + "88": 733859840.0, + "89": 733859840.0, + "90": 733859840.0, + "91": 733859840.0, + "92": 733859840.0, + "93": 733859840.0, + "94": 733859840.0, + "95": 733859840.0, + "96": 733859840.0, + "97": 733859840.0, + "98": 733859840.0, + "99": 733859840.0, + "100": 733859840.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3838895104.0, + "2": 4122703872.0, + "3": 4122703872.0, + "4": 4122703872.0, + "5": 4122703872.0, + "6": 4122703872.0, + "7": 4122703872.0, + "8": 4122703872.0, + "9": 4122703872.0, + "10": 4122703872.0, + "11": 4122703872.0, + "12": 4122703872.0, + "13": 4122703872.0, + "14": 4122703872.0, + "15": 4122703872.0, + "16": 4122703872.0, + "17": 4122703872.0, + "18": 4122703872.0, + "19": 4122703872.0, + "20": 4122703872.0, + "21": 4122703872.0, + "22": 4122703872.0, + "23": 4122703872.0, + "24": 4122703872.0, + "25": 4122703872.0, + "26": 4122703872.0, + "27": 4122703872.0, + "28": 4122703872.0, + "29": 4122703872.0, + "30": 4122703872.0, + "31": 4122703872.0, + "32": 4122703872.0, + "33": 4122703872.0, + "34": 4122703872.0, + "35": 4122703872.0, + "36": 4122703872.0, + "37": 4122703872.0, + "38": 4122703872.0, + "39": 4122703872.0, + "40": 4122703872.0, + "41": 4122703872.0, + "42": 4122703872.0, + "43": 4122703872.0, + "44": 4122703872.0, + "45": 4122703872.0, + "46": 4122703872.0, + "47": 4122703872.0, + "48": 4122703872.0, + "49": 4122703872.0, + "50": 4122703872.0, + "51": 4122703872.0, + "52": 4122703872.0, + "53": 4122703872.0, + "54": 4122703872.0, + "55": 4122703872.0, + "56": 4122703872.0, + "57": 4122703872.0, + "58": 4122703872.0, + "59": 4122703872.0, + "60": 4122703872.0, + "61": 4122703872.0, + "62": 4122703872.0, + "63": 4122703872.0, + "64": 4122703872.0, + "65": 4122703872.0, + "66": 4122703872.0, + "67": 4122703872.0, + "68": 4122703872.0, + "69": 4122703872.0, + "70": 4122703872.0, + "71": 4122703872.0, + "72": 4122703872.0, + "73": 4122703872.0, + "74": 4122703872.0, + "75": 4122703872.0, + "76": 4122703872.0, + "77": 4122703872.0, + "78": 4122703872.0, + "79": 4122703872.0, + "80": 4122703872.0, + "81": 4122703872.0, + "82": 4122703872.0, + "83": 4122703872.0, + "84": 4122703872.0, + "85": 4122703872.0, + "86": 4122703872.0, + "87": 4122703872.0, + "88": 4122703872.0, + "89": 4122703872.0, + "90": 4122703872.0, + "91": 4122703872.0, + "92": 4122703872.0, + "93": 4122703872.0, + "94": 4122703872.0, + "95": 4122703872.0, + "96": 4122703872.0, + "97": 4122703872.0, + "98": 4122703872.0, + "99": 4122703872.0, + "100": 4122703872.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 21.63875, + "2": 0.20787, + "3": 0.17721, + "4": 0.17658, + "5": 0.17528, + "6": 0.17173, + "7": 0.17222, + "8": 0.17098, + "9": 0.16832, + "10": 0.16824, + "11": 0.16991, + "12": 0.16843, + "13": 0.42886, + "14": 0.16771, + "15": 0.16923, + "16": 0.16925, + "17": 0.16721, + "18": 0.16835, + "19": 0.16585, + "20": 0.16956, + "21": 0.16767, + "22": 0.16714, + "23": 0.16974, + "24": 0.16792, + "25": 0.16824, + "26": 0.16516, + "27": 0.16767, + "28": 0.16689, + "29": 0.16698, + "30": 0.16729, + "31": 0.16513, + "32": 0.1676, + "33": 0.16825, + "34": 0.16806, + "35": 0.16705, + "36": 0.16629, + "37": 0.16592, + "38": 0.16499, + "39": 0.16482, + "40": 0.1659, + "41": 0.167, + "42": 0.16751, + "43": 0.16596, + "44": 0.16515, + "45": 0.1666, + "46": 0.17084, + "47": 0.16836, + "48": 0.16826, + "49": 0.16977, + "50": 0.16743, + "51": 0.17999, + "52": 0.17241, + "53": 0.17103, + "54": 0.17085, + "55": 0.17395, + "56": 0.17509, + "57": 0.17396, + "58": 0.1719, + "59": 0.171, + "60": 0.17345, + "61": 0.16946, + "62": 0.17066, + "63": 0.17284, + "64": 0.17167, + "65": 0.17007, + "66": 0.17279, + "67": 0.17225, + "68": 0.17054, + "69": 0.17013, + "70": 0.16853, + "71": 0.17021, + "72": 0.17001, + "73": 0.17136, + "74": 0.17139, + "75": 0.17396, + "76": 0.17179, + "77": 0.1705, + "78": 0.17116, + "79": 0.17303, + "80": 0.17196, + "81": 0.17269, + "82": 0.16795, + "83": 0.16966, + "84": 0.17044, + "85": 0.17085, + "86": 0.17338, + "87": 0.1704, + "88": 0.17066, + "89": 0.16954, + "90": 0.16994, + "91": 0.17172, + "92": 0.17222, + "93": 0.17163, + "94": 0.17173, + "95": 0.17012, + "96": 0.16985, + "97": 0.17078, + "98": 0.17262, + "99": 0.17354, + "100": 0.1683 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 191ec6ee23e..39c385529c2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84517, "5": 10.87427, "10": 10.82906, "15": 10.81976, "20": 10.72701, "25": 10.5522, "30": 10.36616, "35": 10.27781, "40": 10.09758, "45": 9.84191, "50": 9.91248, "55": 9.88096, "60": 9.50125, "65": 8.94762, "70": 9.74241, "75": 9.42529, "80": 9.40396, "85": 9.61407, "90": 9.8142, "95": 9.51734, "100": 9.39538}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1655.0, "5": 1803.0, "10": 1413.0, "15": 1951.0, "20": 1561.0, "25": 1665.0, "30": 1893.0, "35": 2010.0, "40": 2188.0, "45": 2126.0, "50": 2250.0, "55": 2351.0, "60": 2440.0, "65": 2602.0, "70": 3234.0, "75": 2388.0, "80": 3186.0, "85": 3262.0, "90": 3018.0, "95": 3426.0, "100": 3204.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 551288320.0, "5": 551288320.0, "10": 551288320.0, "15": 551288320.0, "20": 551288320.0, "25": 551288320.0, "30": 551288320.0, "35": 551288320.0, "40": 551288320.0, "45": 551288320.0, "50": 551288320.0, "55": 551288320.0, "60": 551288320.0, "65": 551288320.0, "70": 551288320.0, "75": 551288320.0, "80": 551288320.0, "85": 551288320.0, "90": 551288320.0, "95": 551288320.0, "100": 551288320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2289440768.0, "5": 2431367168.0, "10": 2431367168.0, "15": 2431367168.0, "20": 2431367168.0, "25": 2431367168.0, "30": 2431367168.0, "35": 2431367168.0, "40": 2431367168.0, "45": 2431367168.0, "50": 2431367168.0, "55": 2431367168.0, "60": 2431367168.0, "65": 2431367168.0, "70": 2431367168.0, "75": 2431367168.0, "80": 2431367168.0, "85": 2431367168.0, "90": 2431367168.0, "95": 2431367168.0, "100": 2431367168.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.78965, "5": 0.09699, "10": 0.09747, "15": 0.09725, "20": 0.09706, "25": 0.09768, "30": 0.09735, "35": 0.09599, "40": 0.09512, "45": 0.09648, "50": 0.09612, "55": 0.10241, "60": 0.09796, "65": 0.10117, "70": 0.09751, "75": 0.09884, "80": 0.10009, "85": 0.09677, "90": 0.09652, "95": 0.1026, "100": 0.09685}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86464, + "10": 10.82906, + "11": 10.88792, + "12": 10.87099, + "13": 10.87921, + "14": 10.89078, + "15": 10.81976, + "16": 10.83158, + "17": 10.79868, + "18": 10.81672, + "19": 10.81919, + "20": 10.72701, + "21": 10.70594, + "22": 10.56367, + "23": 10.72804, + "24": 10.60832, + "25": 10.5522, + "26": 10.60853, + "27": 10.62847, + "28": 10.58306, + "29": 10.60011, + "30": 10.36616, + "31": 10.12043, + "32": 10.47685, + "33": 10.46868, + "34": 10.22316, + "35": 10.27781, + "36": 10.22892, + "37": 10.35949, + "38": 10.19369, + "39": 10.41549, + "40": 10.09758, + "41": 10.1573, + "42": 10.22398, + "43": 9.83289, + "44": 9.96912, + "45": 9.84191, + "46": 9.83041, + "47": 10.15626, + "48": 9.85486, + "49": 9.54086, + "50": 9.91248, + "51": 9.85868, + "52": 9.74284, + "53": 10.06645, + "54": 9.95167, + "55": 9.88096, + "56": 9.62626, + "57": 9.47768, + "58": 9.83346, + "59": 9.58526, + "60": 9.50125, + "61": 9.69182, + "62": 9.98853, + "63": 9.38476, + "64": 9.7803, + "65": 8.94762, + "66": 9.70856, + "67": 9.36852, + "68": 9.78439, + "69": 9.79406, + "70": 9.74241, + "71": 9.61808, + "72": 9.58428, + "73": 9.5035, + "74": 8.94221, + "75": 9.42529, + "76": 9.07408, + "77": 10.06351, + "78": 9.7208, + "79": 9.37294, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30714, + "84": 9.41712, + "85": 9.61407, + "86": 9.07615, + "87": 9.59094, + "88": 9.74641, + "89": 9.59993, + "90": 9.8142, + "91": 9.33773, + "92": 9.35373, + "93": 9.07395, + "94": 8.83173, + "95": 9.51734, + "96": 9.52415, + "97": 9.30995, + "98": 9.66805, + "99": 8.88588, + "100": 9.39538 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1766.0, + "9": 1750.0, + "10": 1413.0, + "11": 1861.0, + "12": 1650.0, + "13": 1895.0, + "14": 1662.0, + "15": 1951.0, + "16": 1998.0, + "17": 1798.0, + "18": 1687.0, + "19": 1856.0, + "20": 1561.0, + "21": 1882.0, + "22": 1652.0, + "23": 2075.0, + "24": 1606.0, + "25": 1665.0, + "26": 1686.0, + "27": 1839.0, + "28": 2053.0, + "29": 1907.0, + "30": 1893.0, + "31": 1581.0, + "32": 1791.0, + "33": 2149.0, + "34": 1872.0, + "35": 2010.0, + "36": 1799.0, + "37": 2311.0, + "38": 2221.0, + "39": 2261.0, + "40": 2188.0, + "41": 2204.0, + "42": 2300.0, + "43": 2001.0, + "44": 2119.0, + "45": 2126.0, + "46": 2374.0, + "47": 2468.0, + "48": 2405.0, + "49": 2247.0, + "50": 2250.0, + "51": 2607.0, + "52": 2618.0, + "53": 2828.0, + "54": 2730.0, + "55": 2351.0, + "56": 2753.0, + "57": 2323.0, + "58": 2809.0, + "59": 2721.0, + "60": 2440.0, + "61": 2875.0, + "62": 2726.0, + "63": 2444.0, + "64": 3001.0, + "65": 2602.0, + "66": 2981.0, + "67": 2676.0, + "68": 2623.0, + "69": 2802.0, + "70": 3234.0, + "71": 2902.0, + "72": 2337.0, + "73": 2856.0, + "74": 1903.0, + "75": 2388.0, + "76": 3118.0, + "77": 3108.0, + "78": 3122.0, + "79": 2994.0, + "80": 3186.0, + "81": 3470.0, + "82": 3164.0, + "83": 2726.0, + "84": 3214.0, + "85": 3262.0, + "86": 2602.0, + "87": 3658.0, + "88": 2906.0, + "89": 3054.0, + "90": 3018.0, + "91": 2690.0, + "92": 3106.0, + "93": 2701.0, + "94": 3263.0, + "95": 3426.0, + "96": 3405.0, + "97": 3087.0, + "98": 3510.0, + "99": 3148.0, + "100": 3204.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 551269888.0, + "2": 551269888.0, + "3": 551269888.0, + "4": 552318464.0, + "5": 551269888.0, + "6": 551269888.0, + "7": 551269888.0, + "8": 551269888.0, + "9": 551269888.0, + "10": 551269888.0, + "11": 551269888.0, + "12": 551269888.0, + "13": 551269888.0, + "14": 551269888.0, + "15": 551269888.0, + "16": 551269888.0, + "17": 551269888.0, + "18": 551269888.0, + "19": 551269888.0, + "20": 551269888.0, + "21": 551269888.0, + "22": 551269888.0, + "23": 551269888.0, + "24": 551269888.0, + "25": 551269888.0, + "26": 551269888.0, + "27": 551269888.0, + "28": 551269888.0, + "29": 551269888.0, + "30": 551269888.0, + "31": 551269888.0, + "32": 551269888.0, + "33": 551269888.0, + "34": 551269888.0, + "35": 551269888.0, + "36": 551269888.0, + "37": 551269888.0, + "38": 551269888.0, + "39": 551269888.0, + "40": 551269888.0, + "41": 551269888.0, + "42": 551269888.0, + "43": 551269888.0, + "44": 551269888.0, + "45": 551269888.0, + "46": 551269888.0, + "47": 551269888.0, + "48": 551269888.0, + "49": 551269888.0, + "50": 551269888.0, + "51": 551269888.0, + "52": 551269888.0, + "53": 551269888.0, + "54": 551269888.0, + "55": 551269888.0, + "56": 551269888.0, + "57": 551269888.0, + "58": 551269888.0, + "59": 551269888.0, + "60": 551269888.0, + "61": 551269888.0, + "62": 551269888.0, + "63": 551269888.0, + "64": 551269888.0, + "65": 551269888.0, + "66": 551269888.0, + "67": 551269888.0, + "68": 551269888.0, + "69": 551269888.0, + "70": 551269888.0, + "71": 551269888.0, + "72": 551269888.0, + "73": 551269888.0, + "74": 551269888.0, + "75": 551269888.0, + "76": 551269888.0, + "77": 551269888.0, + "78": 551269888.0, + "79": 551269888.0, + "80": 551269888.0, + "81": 551269888.0, + "82": 551269888.0, + "83": 551269888.0, + "84": 551269888.0, + "85": 551269888.0, + "86": 551269888.0, + "87": 551269888.0, + "88": 551269888.0, + "89": 551269888.0, + "90": 551269888.0, + "91": 551269888.0, + "92": 551269888.0, + "93": 551269888.0, + "94": 551269888.0, + "95": 551269888.0, + "96": 551269888.0, + "97": 551269888.0, + "98": 551269888.0, + "99": 551269888.0, + "100": 551269888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2290489344.0, + "2": 2432397312.0, + "3": 2432397312.0, + "4": 2432397312.0, + "5": 2432397312.0, + "6": 2432397312.0, + "7": 2432397312.0, + "8": 2432397312.0, + "9": 2432397312.0, + "10": 2432397312.0, + "11": 2432397312.0, + "12": 2432397312.0, + "13": 2432397312.0, + "14": 2432397312.0, + "15": 2432397312.0, + "16": 2432397312.0, + "17": 2432397312.0, + "18": 2432397312.0, + "19": 2432397312.0, + "20": 2432397312.0, + "21": 2432397312.0, + "22": 2432397312.0, + "23": 2432397312.0, + "24": 2432397312.0, + "25": 2432397312.0, + "26": 2432397312.0, + "27": 2432397312.0, + "28": 2432397312.0, + "29": 2432397312.0, + "30": 2432397312.0, + "31": 2432397312.0, + "32": 2432397312.0, + "33": 2432397312.0, + "34": 2432397312.0, + "35": 2432397312.0, + "36": 2432397312.0, + "37": 2432397312.0, + "38": 2432397312.0, + "39": 2432397312.0, + "40": 2432397312.0, + "41": 2432397312.0, + "42": 2432397312.0, + "43": 2432397312.0, + "44": 2432397312.0, + "45": 2432397312.0, + "46": 2432397312.0, + "47": 2432397312.0, + "48": 2432397312.0, + "49": 2432397312.0, + "50": 2432397312.0, + "51": 2432397312.0, + "52": 2432397312.0, + "53": 2432397312.0, + "54": 2432397312.0, + "55": 2432397312.0, + "56": 2432397312.0, + "57": 2432397312.0, + "58": 2432397312.0, + "59": 2432397312.0, + "60": 2432397312.0, + "61": 2432397312.0, + "62": 2432397312.0, + "63": 2432397312.0, + "64": 2432397312.0, + "65": 2432397312.0, + "66": 2432397312.0, + "67": 2432397312.0, + "68": 2432397312.0, + "69": 2432397312.0, + "70": 2432397312.0, + "71": 2432397312.0, + "72": 2432397312.0, + "73": 2432397312.0, + "74": 2432397312.0, + "75": 2432397312.0, + "76": 2432397312.0, + "77": 2432397312.0, + "78": 2432397312.0, + "79": 2432397312.0, + "80": 2432397312.0, + "81": 2432397312.0, + "82": 2432397312.0, + "83": 2432397312.0, + "84": 2432397312.0, + "85": 2432397312.0, + "86": 2432397312.0, + "87": 2432397312.0, + "88": 2432397312.0, + "89": 2432397312.0, + "90": 2432397312.0, + "91": 2432397312.0, + "92": 2432397312.0, + "93": 2432397312.0, + "94": 2432397312.0, + "95": 2432397312.0, + "96": 2432397312.0, + "97": 2432397312.0, + "98": 2432397312.0, + "99": 2432397312.0, + "100": 2432397312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.54138, + "2": 0.13158, + "3": 0.11931, + "4": 0.11269, + "5": 0.1124, + "6": 0.11102, + "7": 0.11179, + "8": 0.11071, + "9": 0.11115, + "10": 0.11216, + "11": 0.11019, + "12": 0.10929, + "13": 0.10974, + "14": 0.11072, + "15": 0.11028, + "16": 0.10961, + "17": 0.1105, + "18": 0.1098, + "19": 0.11053, + "20": 0.11011, + "21": 0.10991, + "22": 0.10929, + "23": 0.11003, + "24": 0.10899, + "25": 0.10976, + "26": 0.10976, + "27": 0.11215, + "28": 0.11012, + "29": 0.11201, + "30": 0.11164, + "31": 0.10958, + "32": 0.10984, + "33": 0.10959, + "34": 0.10961, + "35": 0.11104, + "36": 0.11182, + "37": 0.11063, + "38": 0.11001, + "39": 0.10974, + "40": 0.10932, + "41": 0.10961, + "42": 0.1101, + "43": 0.11018, + "44": 0.11136, + "45": 0.1111, + "46": 0.11139, + "47": 0.1089, + "48": 0.10943, + "49": 0.10954, + "50": 0.10991, + "51": 0.11785, + "52": 0.11209, + "53": 0.11006, + "54": 0.11154, + "55": 0.11442, + "56": 0.11224, + "57": 0.11144, + "58": 0.11019, + "59": 0.11203, + "60": 0.11138, + "61": 0.11054, + "62": 0.10988, + "63": 0.11137, + "64": 0.11375, + "65": 0.11099, + "66": 0.11062, + "67": 0.11059, + "68": 0.1103, + "69": 0.11052, + "70": 0.11117, + "71": 0.11388, + "72": 0.1141, + "73": 0.11416, + "74": 0.11486, + "75": 0.11283, + "76": 0.1123, + "77": 0.11047, + "78": 0.11279, + "79": 0.11417, + "80": 0.11037, + "81": 0.11258, + "82": 0.1135, + "83": 0.11215, + "84": 0.11183, + "85": 0.1122, + "86": 0.11261, + "87": 0.1097, + "88": 0.1112, + "89": 0.11201, + "90": 0.11377, + "91": 0.11526, + "92": 0.11074, + "93": 0.11279, + "94": 0.11178, + "95": 0.11134, + "96": 0.11018, + "97": 0.11123, + "98": 0.11129, + "99": 0.11384, + "100": 0.11183 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..d31da6ac7cf --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86464, + "10": 10.82906, + "11": 10.88792, + "12": 10.87099, + "13": 10.87921, + "14": 10.89078, + "15": 10.81976, + "16": 10.83158, + "17": 10.79868, + "18": 10.81672, + "19": 10.81919, + "20": 10.72701, + "21": 10.70594, + "22": 10.56367, + "23": 10.72804, + "24": 10.60832, + "25": 10.5522, + "26": 10.60853, + "27": 10.62847, + "28": 10.58306, + "29": 10.60011, + "30": 10.36616, + "31": 10.12043, + "32": 10.47685, + "33": 10.46868, + "34": 10.22316, + "35": 10.27781, + "36": 10.22892, + "37": 10.35949, + "38": 10.19369, + "39": 10.41549, + "40": 10.09758, + "41": 10.1573, + "42": 10.22398, + "43": 9.83289, + "44": 9.96912, + "45": 9.84191, + "46": 9.83041, + "47": 10.15626, + "48": 9.85486, + "49": 9.54086, + "50": 9.91248, + "51": 9.85868, + "52": 9.74284, + "53": 10.06645, + "54": 9.95167, + "55": 9.88096, + "56": 9.62626, + "57": 9.47768, + "58": 9.83346, + "59": 9.58526, + "60": 9.50125, + "61": 9.69182, + "62": 9.98853, + "63": 9.38476, + "64": 9.7803, + "65": 8.94762, + "66": 9.70856, + "67": 9.36852, + "68": 9.78439, + "69": 9.79406, + "70": 9.74241, + "71": 9.61808, + "72": 9.58428, + "73": 9.5035, + "74": 8.94221, + "75": 9.42529, + "76": 9.07408, + "77": 10.06351, + "78": 9.7208, + "79": 9.37294, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30714, + "84": 9.41712, + "85": 9.61407, + "86": 9.07615, + "87": 9.59094, + "88": 9.74641, + "89": 9.59993, + "90": 9.8142, + "91": 9.33773, + "92": 9.35373, + "93": 9.07395, + "94": 8.83173, + "95": 9.51734, + "96": 9.52415, + "97": 9.30995, + "98": 9.66805, + "99": 8.88588, + "100": 9.39538 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1766.0, + "9": 1750.0, + "10": 1413.0, + "11": 1861.0, + "12": 1650.0, + "13": 1895.0, + "14": 1662.0, + "15": 1951.0, + "16": 1998.0, + "17": 1798.0, + "18": 1687.0, + "19": 1856.0, + "20": 1561.0, + "21": 1882.0, + "22": 1652.0, + "23": 2075.0, + "24": 1606.0, + "25": 1665.0, + "26": 1686.0, + "27": 1839.0, + "28": 2053.0, + "29": 1907.0, + "30": 1893.0, + "31": 1581.0, + "32": 1791.0, + "33": 2149.0, + "34": 1872.0, + "35": 2010.0, + "36": 1799.0, + "37": 2311.0, + "38": 2221.0, + "39": 2261.0, + "40": 2188.0, + "41": 2204.0, + "42": 2300.0, + "43": 2001.0, + "44": 2119.0, + "45": 2126.0, + "46": 2374.0, + "47": 2468.0, + "48": 2405.0, + "49": 2247.0, + "50": 2250.0, + "51": 2607.0, + "52": 2618.0, + "53": 2828.0, + "54": 2730.0, + "55": 2351.0, + "56": 2753.0, + "57": 2323.0, + "58": 2809.0, + "59": 2721.0, + "60": 2440.0, + "61": 2875.0, + "62": 2726.0, + "63": 2444.0, + "64": 3001.0, + "65": 2602.0, + "66": 2981.0, + "67": 2676.0, + "68": 2623.0, + "69": 2802.0, + "70": 3234.0, + "71": 2902.0, + "72": 2337.0, + "73": 2856.0, + "74": 1903.0, + "75": 2388.0, + "76": 3118.0, + "77": 3108.0, + "78": 3122.0, + "79": 2994.0, + "80": 3186.0, + "81": 3470.0, + "82": 3164.0, + "83": 2726.0, + "84": 3214.0, + "85": 3262.0, + "86": 2602.0, + "87": 3658.0, + "88": 2906.0, + "89": 3054.0, + "90": 3018.0, + "91": 2690.0, + "92": 3106.0, + "93": 2701.0, + "94": 3263.0, + "95": 3426.0, + "96": 3405.0, + "97": 3087.0, + "98": 3510.0, + "99": 3148.0, + "100": 3204.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 551269888.0, + "2": 551269888.0, + "3": 551269888.0, + "4": 552318464.0, + "5": 551269888.0, + "6": 551269888.0, + "7": 551269888.0, + "8": 551269888.0, + "9": 551269888.0, + "10": 551269888.0, + "11": 551269888.0, + "12": 551269888.0, + "13": 551269888.0, + "14": 551269888.0, + "15": 551269888.0, + "16": 551269888.0, + "17": 551269888.0, + "18": 551269888.0, + "19": 551269888.0, + "20": 551269888.0, + "21": 551269888.0, + "22": 551269888.0, + "23": 551269888.0, + "24": 551269888.0, + "25": 551269888.0, + "26": 551269888.0, + "27": 551269888.0, + "28": 551269888.0, + "29": 551269888.0, + "30": 551269888.0, + "31": 551269888.0, + "32": 551269888.0, + "33": 551269888.0, + "34": 551269888.0, + "35": 551269888.0, + "36": 551269888.0, + "37": 551269888.0, + "38": 551269888.0, + "39": 551269888.0, + "40": 551269888.0, + "41": 551269888.0, + "42": 551269888.0, + "43": 551269888.0, + "44": 551269888.0, + "45": 551269888.0, + "46": 551269888.0, + "47": 551269888.0, + "48": 551269888.0, + "49": 551269888.0, + "50": 551269888.0, + "51": 551269888.0, + "52": 551269888.0, + "53": 551269888.0, + "54": 551269888.0, + "55": 551269888.0, + "56": 551269888.0, + "57": 551269888.0, + "58": 551269888.0, + "59": 551269888.0, + "60": 551269888.0, + "61": 551269888.0, + "62": 551269888.0, + "63": 551269888.0, + "64": 551269888.0, + "65": 551269888.0, + "66": 551269888.0, + "67": 551269888.0, + "68": 551269888.0, + "69": 551269888.0, + "70": 551269888.0, + "71": 551269888.0, + "72": 551269888.0, + "73": 551269888.0, + "74": 551269888.0, + "75": 551269888.0, + "76": 551269888.0, + "77": 551269888.0, + "78": 551269888.0, + "79": 551269888.0, + "80": 551269888.0, + "81": 551269888.0, + "82": 551269888.0, + "83": 551269888.0, + "84": 551269888.0, + "85": 551269888.0, + "86": 551269888.0, + "87": 551269888.0, + "88": 551269888.0, + "89": 551269888.0, + "90": 551269888.0, + "91": 551269888.0, + "92": 551269888.0, + "93": 551269888.0, + "94": 551269888.0, + "95": 551269888.0, + "96": 551269888.0, + "97": 551269888.0, + "98": 551269888.0, + "99": 551269888.0, + "100": 551269888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2290489344.0, + "2": 2432397312.0, + "3": 2432397312.0, + "4": 2432397312.0, + "5": 2432397312.0, + "6": 2432397312.0, + "7": 2432397312.0, + "8": 2432397312.0, + "9": 2432397312.0, + "10": 2432397312.0, + "11": 2432397312.0, + "12": 2432397312.0, + "13": 2432397312.0, + "14": 2432397312.0, + "15": 2432397312.0, + "16": 2432397312.0, + "17": 2432397312.0, + "18": 2432397312.0, + "19": 2432397312.0, + "20": 2432397312.0, + "21": 2432397312.0, + "22": 2432397312.0, + "23": 2432397312.0, + "24": 2432397312.0, + "25": 2432397312.0, + "26": 2432397312.0, + "27": 2432397312.0, + "28": 2432397312.0, + "29": 2432397312.0, + "30": 2432397312.0, + "31": 2432397312.0, + "32": 2432397312.0, + "33": 2432397312.0, + "34": 2432397312.0, + "35": 2432397312.0, + "36": 2432397312.0, + "37": 2432397312.0, + "38": 2432397312.0, + "39": 2432397312.0, + "40": 2432397312.0, + "41": 2432397312.0, + "42": 2432397312.0, + "43": 2432397312.0, + "44": 2432397312.0, + "45": 2432397312.0, + "46": 2432397312.0, + "47": 2432397312.0, + "48": 2432397312.0, + "49": 2432397312.0, + "50": 2432397312.0, + "51": 2432397312.0, + "52": 2432397312.0, + "53": 2432397312.0, + "54": 2432397312.0, + "55": 2432397312.0, + "56": 2432397312.0, + "57": 2432397312.0, + "58": 2432397312.0, + "59": 2432397312.0, + "60": 2432397312.0, + "61": 2432397312.0, + "62": 2432397312.0, + "63": 2432397312.0, + "64": 2432397312.0, + "65": 2432397312.0, + "66": 2432397312.0, + "67": 2432397312.0, + "68": 2432397312.0, + "69": 2432397312.0, + "70": 2432397312.0, + "71": 2432397312.0, + "72": 2432397312.0, + "73": 2432397312.0, + "74": 2432397312.0, + "75": 2432397312.0, + "76": 2432397312.0, + "77": 2432397312.0, + "78": 2432397312.0, + "79": 2432397312.0, + "80": 2432397312.0, + "81": 2432397312.0, + "82": 2432397312.0, + "83": 2432397312.0, + "84": 2432397312.0, + "85": 2432397312.0, + "86": 2432397312.0, + "87": 2432397312.0, + "88": 2432397312.0, + "89": 2432397312.0, + "90": 2432397312.0, + "91": 2432397312.0, + "92": 2432397312.0, + "93": 2432397312.0, + "94": 2432397312.0, + "95": 2432397312.0, + "96": 2432397312.0, + "97": 2432397312.0, + "98": 2432397312.0, + "99": 2432397312.0, + "100": 2432397312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.46548, + "2": 0.12959, + "3": 0.10184, + "4": 0.09901, + "5": 0.09738, + "6": 0.09779, + "7": 0.09844, + "8": 0.09824, + "9": 0.0976, + "10": 0.0989, + "11": 0.09806, + "12": 0.09847, + "13": 0.09693, + "14": 0.0975, + "15": 0.09734, + "16": 0.09676, + "17": 0.09761, + "18": 0.10064, + "19": 0.10268, + "20": 0.10193, + "21": 0.09868, + "22": 0.10036, + "23": 0.10125, + "24": 0.10069, + "25": 0.09985, + "26": 0.09933, + "27": 0.10255, + "28": 0.09872, + "29": 0.09702, + "30": 0.09893, + "31": 0.10092, + "32": 0.10188, + "33": 0.09747, + "34": 0.09867, + "35": 0.09716, + "36": 0.09808, + "37": 0.09735, + "38": 0.09948, + "39": 0.10526, + "40": 0.10139, + "41": 0.09798, + "42": 0.10054, + "43": 0.09915, + "44": 0.09761, + "45": 0.09943, + "46": 0.09837, + "47": 0.10213, + "48": 0.0976, + "49": 0.09851, + "50": 0.09815, + "51": 0.10646, + "52": 0.10032, + "53": 0.10073, + "54": 0.10074, + "55": 0.10099, + "56": 0.09991, + "57": 0.10044, + "58": 0.10136, + "59": 0.10068, + "60": 0.10185, + "61": 0.10193, + "62": 0.10012, + "63": 0.09915, + "64": 0.09898, + "65": 0.10063, + "66": 0.10749, + "67": 0.09751, + "68": 0.10261, + "69": 0.10397, + "70": 0.10225, + "71": 0.10161, + "72": 0.09906, + "73": 0.09842, + "74": 0.10577, + "75": 0.1039, + "76": 0.10082, + "77": 0.09852, + "78": 0.09796, + "79": 0.10077, + "80": 0.10371, + "81": 0.10025, + "82": 0.10234, + "83": 0.10234, + "84": 0.10127, + "85": 0.10403, + "86": 0.10427, + "87": 0.10111, + "88": 0.10052, + "89": 0.10059, + "90": 0.10355, + "91": 0.10168, + "92": 0.1012, + "93": 0.10032, + "94": 0.10123, + "95": 0.10403, + "96": 0.10413, + "97": 0.10405, + "98": 0.11267, + "99": 0.11812, + "100": 0.11125 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..acadb81abbe --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84517, + "2": 10.85349, + "3": 10.8539, + "4": 10.83825, + "5": 10.87427, + "6": 10.89307, + "7": 10.85454, + "8": 10.8626, + "9": 10.86464, + "10": 10.82906, + "11": 10.88792, + "12": 10.87099, + "13": 10.87921, + "14": 10.89078, + "15": 10.81976, + "16": 10.83158, + "17": 10.79868, + "18": 10.81672, + "19": 10.81919, + "20": 10.72701, + "21": 10.70594, + "22": 10.56367, + "23": 10.72804, + "24": 10.60832, + "25": 10.5522, + "26": 10.60853, + "27": 10.62847, + "28": 10.58306, + "29": 10.60011, + "30": 10.36616, + "31": 10.12043, + "32": 10.47685, + "33": 10.46868, + "34": 10.22316, + "35": 10.27781, + "36": 10.22892, + "37": 10.35949, + "38": 10.19369, + "39": 10.41549, + "40": 10.09758, + "41": 10.1573, + "42": 10.22398, + "43": 9.83289, + "44": 9.96912, + "45": 9.84191, + "46": 9.83041, + "47": 10.15626, + "48": 9.85486, + "49": 9.54086, + "50": 9.91248, + "51": 9.85868, + "52": 9.74284, + "53": 10.06645, + "54": 9.95167, + "55": 9.88096, + "56": 9.62626, + "57": 9.47768, + "58": 9.83346, + "59": 9.58526, + "60": 9.50125, + "61": 9.69182, + "62": 9.98853, + "63": 9.38476, + "64": 9.7803, + "65": 8.94762, + "66": 9.70856, + "67": 9.36852, + "68": 9.78439, + "69": 9.79406, + "70": 9.74241, + "71": 9.61808, + "72": 9.58428, + "73": 9.5035, + "74": 8.94221, + "75": 9.42529, + "76": 9.07408, + "77": 10.06351, + "78": 9.7208, + "79": 9.37294, + "80": 9.40396, + "81": 9.48168, + "82": 9.69778, + "83": 9.30714, + "84": 9.41712, + "85": 9.61407, + "86": 9.07615, + "87": 9.59094, + "88": 9.74641, + "89": 9.59993, + "90": 9.8142, + "91": 9.33773, + "92": 9.35373, + "93": 9.07395, + "94": 8.83173, + "95": 9.51734, + "96": 9.52415, + "97": 9.30995, + "98": 9.66805, + "99": 8.88588, + "100": 9.39538 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1655.0, + "2": 1697.0, + "3": 1724.0, + "4": 1720.0, + "5": 1803.0, + "6": 1772.0, + "7": 1811.0, + "8": 1766.0, + "9": 1750.0, + "10": 1413.0, + "11": 1861.0, + "12": 1650.0, + "13": 1895.0, + "14": 1662.0, + "15": 1951.0, + "16": 1998.0, + "17": 1798.0, + "18": 1687.0, + "19": 1856.0, + "20": 1561.0, + "21": 1882.0, + "22": 1652.0, + "23": 2075.0, + "24": 1606.0, + "25": 1665.0, + "26": 1686.0, + "27": 1839.0, + "28": 2053.0, + "29": 1907.0, + "30": 1893.0, + "31": 1581.0, + "32": 1791.0, + "33": 2149.0, + "34": 1872.0, + "35": 2010.0, + "36": 1799.0, + "37": 2311.0, + "38": 2221.0, + "39": 2261.0, + "40": 2188.0, + "41": 2204.0, + "42": 2300.0, + "43": 2001.0, + "44": 2119.0, + "45": 2126.0, + "46": 2374.0, + "47": 2468.0, + "48": 2405.0, + "49": 2247.0, + "50": 2250.0, + "51": 2607.0, + "52": 2618.0, + "53": 2828.0, + "54": 2730.0, + "55": 2351.0, + "56": 2753.0, + "57": 2323.0, + "58": 2809.0, + "59": 2721.0, + "60": 2440.0, + "61": 2875.0, + "62": 2726.0, + "63": 2444.0, + "64": 3001.0, + "65": 2602.0, + "66": 2981.0, + "67": 2676.0, + "68": 2623.0, + "69": 2802.0, + "70": 3234.0, + "71": 2902.0, + "72": 2337.0, + "73": 2856.0, + "74": 1903.0, + "75": 2388.0, + "76": 3118.0, + "77": 3108.0, + "78": 3122.0, + "79": 2994.0, + "80": 3186.0, + "81": 3470.0, + "82": 3164.0, + "83": 2726.0, + "84": 3214.0, + "85": 3262.0, + "86": 2602.0, + "87": 3658.0, + "88": 2906.0, + "89": 3054.0, + "90": 3018.0, + "91": 2690.0, + "92": 3106.0, + "93": 2701.0, + "94": 3263.0, + "95": 3426.0, + "96": 3405.0, + "97": 3087.0, + "98": 3510.0, + "99": 3148.0, + "100": 3204.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 551269888.0, + "2": 551269888.0, + "3": 551269888.0, + "4": 552318464.0, + "5": 551269888.0, + "6": 551269888.0, + "7": 551269888.0, + "8": 551269888.0, + "9": 551269888.0, + "10": 551269888.0, + "11": 551269888.0, + "12": 551269888.0, + "13": 551269888.0, + "14": 551269888.0, + "15": 551269888.0, + "16": 551269888.0, + "17": 551269888.0, + "18": 551269888.0, + "19": 551269888.0, + "20": 551269888.0, + "21": 551269888.0, + "22": 551269888.0, + "23": 551269888.0, + "24": 551269888.0, + "25": 551269888.0, + "26": 551269888.0, + "27": 551269888.0, + "28": 551269888.0, + "29": 551269888.0, + "30": 551269888.0, + "31": 551269888.0, + "32": 551269888.0, + "33": 551269888.0, + "34": 551269888.0, + "35": 551269888.0, + "36": 551269888.0, + "37": 551269888.0, + "38": 551269888.0, + "39": 551269888.0, + "40": 551269888.0, + "41": 551269888.0, + "42": 551269888.0, + "43": 551269888.0, + "44": 551269888.0, + "45": 551269888.0, + "46": 551269888.0, + "47": 551269888.0, + "48": 551269888.0, + "49": 551269888.0, + "50": 551269888.0, + "51": 551269888.0, + "52": 551269888.0, + "53": 551269888.0, + "54": 551269888.0, + "55": 551269888.0, + "56": 551269888.0, + "57": 551269888.0, + "58": 551269888.0, + "59": 551269888.0, + "60": 551269888.0, + "61": 551269888.0, + "62": 551269888.0, + "63": 551269888.0, + "64": 551269888.0, + "65": 551269888.0, + "66": 551269888.0, + "67": 551269888.0, + "68": 551269888.0, + "69": 551269888.0, + "70": 551269888.0, + "71": 551269888.0, + "72": 551269888.0, + "73": 551269888.0, + "74": 551269888.0, + "75": 551269888.0, + "76": 551269888.0, + "77": 551269888.0, + "78": 551269888.0, + "79": 551269888.0, + "80": 551269888.0, + "81": 551269888.0, + "82": 551269888.0, + "83": 551269888.0, + "84": 551269888.0, + "85": 551269888.0, + "86": 551269888.0, + "87": 551269888.0, + "88": 551269888.0, + "89": 551269888.0, + "90": 551269888.0, + "91": 551269888.0, + "92": 551269888.0, + "93": 551269888.0, + "94": 551269888.0, + "95": 551269888.0, + "96": 551269888.0, + "97": 551269888.0, + "98": 551269888.0, + "99": 551269888.0, + "100": 551269888.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2290489344.0, + "2": 2432397312.0, + "3": 2432397312.0, + "4": 2432397312.0, + "5": 2432397312.0, + "6": 2432397312.0, + "7": 2432397312.0, + "8": 2432397312.0, + "9": 2432397312.0, + "10": 2432397312.0, + "11": 2432397312.0, + "12": 2432397312.0, + "13": 2432397312.0, + "14": 2432397312.0, + "15": 2432397312.0, + "16": 2432397312.0, + "17": 2432397312.0, + "18": 2432397312.0, + "19": 2432397312.0, + "20": 2432397312.0, + "21": 2432397312.0, + "22": 2432397312.0, + "23": 2432397312.0, + "24": 2432397312.0, + "25": 2432397312.0, + "26": 2432397312.0, + "27": 2432397312.0, + "28": 2432397312.0, + "29": 2432397312.0, + "30": 2432397312.0, + "31": 2432397312.0, + "32": 2432397312.0, + "33": 2432397312.0, + "34": 2432397312.0, + "35": 2432397312.0, + "36": 2432397312.0, + "37": 2432397312.0, + "38": 2432397312.0, + "39": 2432397312.0, + "40": 2432397312.0, + "41": 2432397312.0, + "42": 2432397312.0, + "43": 2432397312.0, + "44": 2432397312.0, + "45": 2432397312.0, + "46": 2432397312.0, + "47": 2432397312.0, + "48": 2432397312.0, + "49": 2432397312.0, + "50": 2432397312.0, + "51": 2432397312.0, + "52": 2432397312.0, + "53": 2432397312.0, + "54": 2432397312.0, + "55": 2432397312.0, + "56": 2432397312.0, + "57": 2432397312.0, + "58": 2432397312.0, + "59": 2432397312.0, + "60": 2432397312.0, + "61": 2432397312.0, + "62": 2432397312.0, + "63": 2432397312.0, + "64": 2432397312.0, + "65": 2432397312.0, + "66": 2432397312.0, + "67": 2432397312.0, + "68": 2432397312.0, + "69": 2432397312.0, + "70": 2432397312.0, + "71": 2432397312.0, + "72": 2432397312.0, + "73": 2432397312.0, + "74": 2432397312.0, + "75": 2432397312.0, + "76": 2432397312.0, + "77": 2432397312.0, + "78": 2432397312.0, + "79": 2432397312.0, + "80": 2432397312.0, + "81": 2432397312.0, + "82": 2432397312.0, + "83": 2432397312.0, + "84": 2432397312.0, + "85": 2432397312.0, + "86": 2432397312.0, + "87": 2432397312.0, + "88": 2432397312.0, + "89": 2432397312.0, + "90": 2432397312.0, + "91": 2432397312.0, + "92": 2432397312.0, + "93": 2432397312.0, + "94": 2432397312.0, + "95": 2432397312.0, + "96": 2432397312.0, + "97": 2432397312.0, + "98": 2432397312.0, + "99": 2432397312.0, + "100": 2432397312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.61957, + "2": 0.12347, + "3": 0.11094, + "4": 0.11482, + "5": 0.11141, + "6": 0.10928, + "7": 0.10905, + "8": 0.11026, + "9": 0.11003, + "10": 0.11095, + "11": 0.11002, + "12": 0.1122, + "13": 0.11472, + "14": 0.11511, + "15": 0.11073, + "16": 0.11228, + "17": 0.11342, + "18": 0.11197, + "19": 0.11062, + "20": 0.11097, + "21": 0.11081, + "22": 0.11379, + "23": 0.10968, + "24": 0.11083, + "25": 0.11649, + "26": 0.11043, + "27": 0.11175, + "28": 0.11122, + "29": 0.11218, + "30": 0.11261, + "31": 0.11314, + "32": 0.10971, + "33": 0.11028, + "34": 0.11149, + "35": 0.11122, + "36": 0.11079, + "37": 0.11188, + "38": 0.1115, + "39": 0.11238, + "40": 0.11528, + "41": 0.11165, + "42": 0.11137, + "43": 0.11139, + "44": 0.11074, + "45": 0.11141, + "46": 0.11158, + "47": 0.1105, + "48": 0.11128, + "49": 0.11164, + "50": 0.11572, + "51": 0.11625, + "52": 0.10969, + "53": 0.10904, + "54": 0.1098, + "55": 0.10896, + "56": 0.11225, + "57": 0.11301, + "58": 0.11047, + "59": 0.10959, + "60": 0.11005, + "61": 0.11018, + "62": 0.10831, + "63": 0.10997, + "64": 0.10896, + "65": 0.11116, + "66": 0.11148, + "67": 0.1092, + "68": 0.10947, + "69": 0.10933, + "70": 0.10869, + "71": 0.10873, + "72": 0.10849, + "73": 0.10872, + "74": 0.10951, + "75": 0.1119, + "76": 0.1109, + "77": 0.10896, + "78": 0.10963, + "79": 0.11057, + "80": 0.10858, + "81": 0.10732, + "82": 0.10824, + "83": 0.11006, + "84": 0.11062, + "85": 0.1096, + "86": 0.10933, + "87": 0.11001, + "88": 0.11053, + "89": 0.10899, + "90": 0.10989, + "91": 0.10903, + "92": 0.10959, + "93": 0.11185, + "94": 0.11166, + "95": 0.11067, + "96": 0.11183, + "97": 0.11136, + "98": 0.11022, + "99": 0.11091, + "100": 0.10951 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..b3879ab6045 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81006, + "19": 10.79716, + "20": 10.69212, + "21": 10.68168, + "22": 10.52085, + "23": 10.70898, + "24": 10.576, + "25": 10.52413, + "26": 10.59515, + "27": 10.58426, + "28": 10.56233, + "29": 10.57012, + "30": 10.34552, + "31": 10.10047, + "32": 10.45375, + "33": 10.44623, + "34": 10.20608, + "35": 10.26241, + "36": 10.2124, + "37": 10.3252, + "38": 10.16775, + "39": 10.38332, + "40": 10.07236, + "41": 10.13863, + "42": 10.19811, + "43": 9.81071, + "44": 9.93244, + "45": 9.81098, + "46": 9.80879, + "47": 10.1261, + "48": 9.82105, + "49": 9.50626, + "50": 9.88418, + "51": 9.8366, + "52": 9.7254, + "53": 10.04687, + "54": 9.93029, + "55": 9.86374, + "56": 9.60183, + "57": 9.4509, + "58": 9.80845, + "59": 9.56672, + "60": 9.47963, + "61": 9.67901, + "62": 9.96737, + "63": 9.3516, + "64": 9.75605, + "65": 8.93065, + "66": 9.68055, + "67": 9.3589, + "68": 9.76988, + "69": 9.77495, + "70": 9.71218, + "71": 9.60756, + "72": 9.57084, + "73": 9.48407, + "74": 8.92824, + "75": 9.4005, + "76": 9.07193, + "77": 10.05226, + "78": 9.71515, + "79": 9.35771, + "80": 9.39078, + "81": 9.46751, + "82": 9.68504, + "83": 9.29556, + "84": 9.4053, + "85": 9.60138, + "86": 9.06772, + "87": 9.58501, + "88": 9.73362, + "89": 9.59515, + "90": 9.80502, + "91": 9.3255, + "92": 9.35334, + "93": 9.06984, + "94": 8.8223, + "95": 9.50821, + "96": 9.51534, + "97": 9.29768, + "98": 9.66205, + "99": 8.87695, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1729.0, + "18": 1672.0, + "19": 1718.0, + "20": 1621.0, + "21": 1931.0, + "22": 1738.0, + "23": 1992.0, + "24": 1676.0, + "25": 1689.0, + "26": 1748.0, + "27": 1801.0, + "28": 1986.0, + "29": 2043.0, + "30": 1907.0, + "31": 1627.0, + "32": 1918.0, + "33": 2003.0, + "34": 1779.0, + "35": 1922.0, + "36": 1942.0, + "37": 2294.0, + "38": 2145.0, + "39": 2395.0, + "40": 2045.0, + "41": 2415.0, + "42": 2277.0, + "43": 1863.0, + "44": 2087.0, + "45": 2097.0, + "46": 2265.0, + "47": 2436.0, + "48": 2460.0, + "49": 2217.0, + "50": 2368.0, + "51": 2552.0, + "52": 2541.0, + "53": 2907.0, + "54": 2604.0, + "55": 2383.0, + "56": 2762.0, + "57": 2128.0, + "58": 3040.0, + "59": 2797.0, + "60": 2509.0, + "61": 3041.0, + "62": 2642.0, + "63": 2401.0, + "64": 2913.0, + "65": 2628.0, + "66": 2934.0, + "67": 2791.0, + "68": 2718.0, + "69": 3050.0, + "70": 3129.0, + "71": 3014.0, + "72": 2263.0, + "73": 2761.0, + "74": 1887.0, + "75": 2552.0, + "76": 3111.0, + "77": 3240.0, + "78": 3150.0, + "79": 3139.0, + "80": 3279.0, + "81": 3595.0, + "82": 3194.0, + "83": 2797.0, + "84": 3272.0, + "85": 3344.0, + "86": 2611.0, + "87": 3802.0, + "88": 3054.0, + "89": 3205.0, + "90": 2980.0, + "91": 2726.0, + "92": 3043.0, + "93": 2751.0, + "94": 3247.0, + "95": 3324.0, + "96": 3503.0, + "97": 3057.0, + "98": 3465.0, + "99": 3320.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 522345472.0, + "2": 522345472.0, + "3": 522345472.0, + "4": 522345472.0, + "5": 522345472.0, + "6": 522345472.0, + "7": 522345472.0, + "8": 522345472.0, + "9": 522345472.0, + "10": 522345472.0, + "11": 522345472.0, + "12": 522345472.0, + "13": 522345472.0, + "14": 522345472.0, + "15": 522345472.0, + "16": 522345472.0, + "17": 522345472.0, + "18": 522345472.0, + "19": 522345472.0, + "20": 522345472.0, + "21": 522345472.0, + "22": 522345472.0, + "23": 522345472.0, + "24": 522345472.0, + "25": 522345472.0, + "26": 522345472.0, + "27": 522345472.0, + "28": 522345472.0, + "29": 522345472.0, + "30": 522345472.0, + "31": 522345472.0, + "32": 522345472.0, + "33": 522345472.0, + "34": 522345472.0, + "35": 522345472.0, + "36": 522345472.0, + "37": 522345472.0, + "38": 522345472.0, + "39": 522345472.0, + "40": 522345472.0, + "41": 522345472.0, + "42": 522345472.0, + "43": 522345472.0, + "44": 522345472.0, + "45": 522345472.0, + "46": 522345472.0, + "47": 522345472.0, + "48": 522345472.0, + "49": 522345472.0, + "50": 522345472.0, + "51": 522345472.0, + "52": 522345472.0, + "53": 522345472.0, + "54": 522345472.0, + "55": 522345472.0, + "56": 522345472.0, + "57": 522345472.0, + "58": 522345472.0, + "59": 522345472.0, + "60": 522345472.0, + "61": 522345472.0, + "62": 522345472.0, + "63": 522345472.0, + "64": 522345472.0, + "65": 522345472.0, + "66": 522345472.0, + "67": 522345472.0, + "68": 522345472.0, + "69": 522345472.0, + "70": 522345472.0, + "71": 522345472.0, + "72": 522345472.0, + "73": 522345472.0, + "74": 522345472.0, + "75": 522345472.0, + "76": 522345472.0, + "77": 522345472.0, + "78": 522345472.0, + "79": 522345472.0, + "80": 522345472.0, + "81": 522345472.0, + "82": 522345472.0, + "83": 522345472.0, + "84": 522345472.0, + "85": 522345472.0, + "86": 522345472.0, + "87": 522345472.0, + "88": 522345472.0, + "89": 522345472.0, + "90": 522345472.0, + "91": 522345472.0, + "92": 522345472.0, + "93": 522345472.0, + "94": 522345472.0, + "95": 522345472.0, + "96": 522345472.0, + "97": 522345472.0, + "98": 522345472.0, + "99": 522345472.0, + "100": 522345472.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3769790464.0, + "2": 3912107008.0, + "3": 3912107008.0, + "4": 3912107008.0, + "5": 3912107008.0, + "6": 3912107008.0, + "7": 3912107008.0, + "8": 3912107008.0, + "9": 3912107008.0, + "10": 3912107008.0, + "11": 3912107008.0, + "12": 3912107008.0, + "13": 3912107008.0, + "14": 3912107008.0, + "15": 3912107008.0, + "16": 3912107008.0, + "17": 3912107008.0, + "18": 3912107008.0, + "19": 3912107008.0, + "20": 3912107008.0, + "21": 3912107008.0, + "22": 3912107008.0, + "23": 3912107008.0, + "24": 3912107008.0, + "25": 3912107008.0, + "26": 3912107008.0, + "27": 3912107008.0, + "28": 3912107008.0, + "29": 3912107008.0, + "30": 3912107008.0, + "31": 3912107008.0, + "32": 3912107008.0, + "33": 3912107008.0, + "34": 3912107008.0, + "35": 3912107008.0, + "36": 3912107008.0, + "37": 3912107008.0, + "38": 3912107008.0, + "39": 3912107008.0, + "40": 3912107008.0, + "41": 3912107008.0, + "42": 3912107008.0, + "43": 3912107008.0, + "44": 3912107008.0, + "45": 3912107008.0, + "46": 3912107008.0, + "47": 3912107008.0, + "48": 3912107008.0, + "49": 3912107008.0, + "50": 3912107008.0, + "51": 3912107008.0, + "52": 3912107008.0, + "53": 3912107008.0, + "54": 3912107008.0, + "55": 3912107008.0, + "56": 3912107008.0, + "57": 3912107008.0, + "58": 3912107008.0, + "59": 3912107008.0, + "60": 3912107008.0, + "61": 3912107008.0, + "62": 3912107008.0, + "63": 3912107008.0, + "64": 3912107008.0, + "65": 3912107008.0, + "66": 3912107008.0, + "67": 3912107008.0, + "68": 3912107008.0, + "69": 3912107008.0, + "70": 3912107008.0, + "71": 3912107008.0, + "72": 3912107008.0, + "73": 3912107008.0, + "74": 3912107008.0, + "75": 3912107008.0, + "76": 3912107008.0, + "77": 3912107008.0, + "78": 3912107008.0, + "79": 3912107008.0, + "80": 3912107008.0, + "81": 3912107008.0, + "82": 3912107008.0, + "83": 3912107008.0, + "84": 3912107008.0, + "85": 3912107008.0, + "86": 3912107008.0, + "87": 3912107008.0, + "88": 3912107008.0, + "89": 3912107008.0, + "90": 3912107008.0, + "91": 3912107008.0, + "92": 3912107008.0, + "93": 3912107008.0, + "94": 3912107008.0, + "95": 3912107008.0, + "96": 3912107008.0, + "97": 3912107008.0, + "98": 3912107008.0, + "99": 3912107008.0, + "100": 3912107008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22.15873, + "2": 0.19792, + "3": 0.1814, + "4": 0.17908, + "5": 0.17702, + "6": 0.17453, + "7": 0.17287, + "8": 0.17032, + "9": 0.17054, + "10": 0.44712, + "11": 0.17227, + "12": 0.17101, + "13": 0.17082, + "14": 0.17199, + "15": 0.17186, + "16": 0.17114, + "17": 0.1707, + "18": 0.17045, + "19": 0.17481, + "20": 0.17111, + "21": 0.17083, + "22": 0.17129, + "23": 0.17239, + "24": 0.17005, + "25": 0.17192, + "26": 0.1691, + "27": 0.17032, + "28": 0.16887, + "29": 0.16717, + "30": 0.16807, + "31": 0.17067, + "32": 0.16897, + "33": 0.17243, + "34": 0.17258, + "35": 0.17272, + "36": 0.17383, + "37": 0.17386, + "38": 0.17203, + "39": 0.17038, + "40": 0.17096, + "41": 0.1719, + "42": 0.1709, + "43": 0.17197, + "44": 0.17101, + "45": 0.17489, + "46": 0.17609, + "47": 0.16812, + "48": 0.16806, + "49": 0.16849, + "50": 0.1703, + "51": 0.17862, + "52": 0.41416, + "53": 0.1718, + "54": 0.17191, + "55": 0.41423, + "56": 0.47793, + "57": 0.17285, + "58": 0.17132, + "59": 0.17185, + "60": 0.17227, + "61": 0.17122, + "62": 0.17318, + "63": 0.17212, + "64": 0.17031, + "65": 0.17228, + "66": 0.17232, + "67": 0.17242, + "68": 0.17235, + "69": 0.17144, + "70": 0.17165, + "71": 0.17203, + "72": 0.17267, + "73": 0.17307, + "74": 0.17368, + "75": 0.17116, + "76": 0.17269, + "77": 0.17015, + "78": 0.17294, + "79": 0.17314, + "80": 0.17169, + "81": 0.1715, + "82": 0.17089, + "83": 0.17291, + "84": 0.17115, + "85": 0.17524, + "86": 0.17227, + "87": 0.17185, + "88": 0.17129, + "89": 0.17337, + "90": 0.17103, + "91": 0.17221, + "92": 0.17181, + "93": 0.17265, + "94": 0.17245, + "95": 0.17227, + "96": 0.17215, + "97": 0.17169, + "98": 0.17141, + "99": 0.17414, + "100": 0.17196 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..1d2aa1ec3ba --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81006, + "19": 10.79716, + "20": 10.69212, + "21": 10.68168, + "22": 10.52085, + "23": 10.70898, + "24": 10.576, + "25": 10.52413, + "26": 10.59515, + "27": 10.58426, + "28": 10.56233, + "29": 10.57012, + "30": 10.34552, + "31": 10.10047, + "32": 10.45375, + "33": 10.44623, + "34": 10.20608, + "35": 10.26241, + "36": 10.2124, + "37": 10.3252, + "38": 10.16775, + "39": 10.38332, + "40": 10.07236, + "41": 10.13863, + "42": 10.19811, + "43": 9.81071, + "44": 9.93244, + "45": 9.81098, + "46": 9.80879, + "47": 10.1261, + "48": 9.82105, + "49": 9.50626, + "50": 9.88418, + "51": 9.8366, + "52": 9.7254, + "53": 10.04687, + "54": 9.93029, + "55": 9.86374, + "56": 9.60183, + "57": 9.4509, + "58": 9.80845, + "59": 9.56672, + "60": 9.47963, + "61": 9.67901, + "62": 9.96737, + "63": 9.3516, + "64": 9.75605, + "65": 8.93065, + "66": 9.68055, + "67": 9.3589, + "68": 9.76988, + "69": 9.77495, + "70": 9.71218, + "71": 9.60756, + "72": 9.57084, + "73": 9.48407, + "74": 8.92824, + "75": 9.4005, + "76": 9.07193, + "77": 10.05226, + "78": 9.71515, + "79": 9.35771, + "80": 9.39078, + "81": 9.46751, + "82": 9.68504, + "83": 9.29556, + "84": 9.4053, + "85": 9.60138, + "86": 9.06772, + "87": 9.58501, + "88": 9.73362, + "89": 9.59515, + "90": 9.80502, + "91": 9.3255, + "92": 9.35334, + "93": 9.06984, + "94": 8.8223, + "95": 9.50821, + "96": 9.51534, + "97": 9.29768, + "98": 9.66205, + "99": 8.87695, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1729.0, + "18": 1672.0, + "19": 1718.0, + "20": 1621.0, + "21": 1931.0, + "22": 1738.0, + "23": 1992.0, + "24": 1676.0, + "25": 1689.0, + "26": 1748.0, + "27": 1801.0, + "28": 1986.0, + "29": 2043.0, + "30": 1907.0, + "31": 1627.0, + "32": 1918.0, + "33": 2003.0, + "34": 1779.0, + "35": 1922.0, + "36": 1942.0, + "37": 2294.0, + "38": 2145.0, + "39": 2395.0, + "40": 2045.0, + "41": 2415.0, + "42": 2277.0, + "43": 1863.0, + "44": 2087.0, + "45": 2097.0, + "46": 2265.0, + "47": 2436.0, + "48": 2460.0, + "49": 2217.0, + "50": 2368.0, + "51": 2552.0, + "52": 2541.0, + "53": 2907.0, + "54": 2604.0, + "55": 2383.0, + "56": 2762.0, + "57": 2128.0, + "58": 3040.0, + "59": 2797.0, + "60": 2509.0, + "61": 3041.0, + "62": 2642.0, + "63": 2401.0, + "64": 2913.0, + "65": 2628.0, + "66": 2934.0, + "67": 2791.0, + "68": 2718.0, + "69": 3050.0, + "70": 3129.0, + "71": 3014.0, + "72": 2263.0, + "73": 2761.0, + "74": 1887.0, + "75": 2552.0, + "76": 3111.0, + "77": 3240.0, + "78": 3150.0, + "79": 3139.0, + "80": 3279.0, + "81": 3595.0, + "82": 3194.0, + "83": 2797.0, + "84": 3272.0, + "85": 3344.0, + "86": 2611.0, + "87": 3802.0, + "88": 3054.0, + "89": 3205.0, + "90": 2980.0, + "91": 2726.0, + "92": 3043.0, + "93": 2751.0, + "94": 3247.0, + "95": 3324.0, + "96": 3503.0, + "97": 3057.0, + "98": 3465.0, + "99": 3320.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 522345472.0, + "2": 522345472.0, + "3": 522345472.0, + "4": 522345472.0, + "5": 522345472.0, + "6": 522345472.0, + "7": 522345472.0, + "8": 522345472.0, + "9": 522345472.0, + "10": 522345472.0, + "11": 522345472.0, + "12": 522345472.0, + "13": 522345472.0, + "14": 522345472.0, + "15": 522345472.0, + "16": 522345472.0, + "17": 522345472.0, + "18": 522345472.0, + "19": 522345472.0, + "20": 522345472.0, + "21": 522345472.0, + "22": 522345472.0, + "23": 522345472.0, + "24": 522345472.0, + "25": 522345472.0, + "26": 522345472.0, + "27": 522345472.0, + "28": 522345472.0, + "29": 522345472.0, + "30": 522345472.0, + "31": 522345472.0, + "32": 522345472.0, + "33": 522345472.0, + "34": 522345472.0, + "35": 522345472.0, + "36": 522345472.0, + "37": 522345472.0, + "38": 522345472.0, + "39": 522345472.0, + "40": 522345472.0, + "41": 522345472.0, + "42": 522345472.0, + "43": 522345472.0, + "44": 522345472.0, + "45": 522345472.0, + "46": 522345472.0, + "47": 522345472.0, + "48": 522345472.0, + "49": 522345472.0, + "50": 522345472.0, + "51": 522345472.0, + "52": 522345472.0, + "53": 522345472.0, + "54": 522345472.0, + "55": 522345472.0, + "56": 522345472.0, + "57": 522345472.0, + "58": 522345472.0, + "59": 522345472.0, + "60": 522345472.0, + "61": 522345472.0, + "62": 522345472.0, + "63": 522345472.0, + "64": 522345472.0, + "65": 522345472.0, + "66": 522345472.0, + "67": 522345472.0, + "68": 522345472.0, + "69": 522345472.0, + "70": 522345472.0, + "71": 522345472.0, + "72": 522345472.0, + "73": 522345472.0, + "74": 522345472.0, + "75": 522345472.0, + "76": 522345472.0, + "77": 522345472.0, + "78": 522345472.0, + "79": 522345472.0, + "80": 522345472.0, + "81": 522345472.0, + "82": 522345472.0, + "83": 522345472.0, + "84": 522345472.0, + "85": 522345472.0, + "86": 522345472.0, + "87": 522345472.0, + "88": 522345472.0, + "89": 522345472.0, + "90": 522345472.0, + "91": 522345472.0, + "92": 522345472.0, + "93": 522345472.0, + "94": 522345472.0, + "95": 522345472.0, + "96": 522345472.0, + "97": 522345472.0, + "98": 522345472.0, + "99": 522345472.0, + "100": 522345472.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3769790464.0, + "2": 3912107008.0, + "3": 3912107008.0, + "4": 3912107008.0, + "5": 3912107008.0, + "6": 3912107008.0, + "7": 3912107008.0, + "8": 3912107008.0, + "9": 3912107008.0, + "10": 3912107008.0, + "11": 3912107008.0, + "12": 3912107008.0, + "13": 3912107008.0, + "14": 3912107008.0, + "15": 3912107008.0, + "16": 3912107008.0, + "17": 3912107008.0, + "18": 3912107008.0, + "19": 3912107008.0, + "20": 3912107008.0, + "21": 3912107008.0, + "22": 3912107008.0, + "23": 3912107008.0, + "24": 3912107008.0, + "25": 3912107008.0, + "26": 3912107008.0, + "27": 3912107008.0, + "28": 3912107008.0, + "29": 3912107008.0, + "30": 3912107008.0, + "31": 3912107008.0, + "32": 3912107008.0, + "33": 3912107008.0, + "34": 3912107008.0, + "35": 3912107008.0, + "36": 3912107008.0, + "37": 3912107008.0, + "38": 3912107008.0, + "39": 3912107008.0, + "40": 3912107008.0, + "41": 3912107008.0, + "42": 3912107008.0, + "43": 3912107008.0, + "44": 3912107008.0, + "45": 3912107008.0, + "46": 3912107008.0, + "47": 3912107008.0, + "48": 3912107008.0, + "49": 3912107008.0, + "50": 3912107008.0, + "51": 3912107008.0, + "52": 3912107008.0, + "53": 3912107008.0, + "54": 3912107008.0, + "55": 3912107008.0, + "56": 3912107008.0, + "57": 3912107008.0, + "58": 3912107008.0, + "59": 3912107008.0, + "60": 3912107008.0, + "61": 3912107008.0, + "62": 3912107008.0, + "63": 3912107008.0, + "64": 3912107008.0, + "65": 3912107008.0, + "66": 3912107008.0, + "67": 3912107008.0, + "68": 3912107008.0, + "69": 3912107008.0, + "70": 3912107008.0, + "71": 3912107008.0, + "72": 3912107008.0, + "73": 3912107008.0, + "74": 3912107008.0, + "75": 3912107008.0, + "76": 3912107008.0, + "77": 3912107008.0, + "78": 3912107008.0, + "79": 3912107008.0, + "80": 3912107008.0, + "81": 3912107008.0, + "82": 3912107008.0, + "83": 3912107008.0, + "84": 3912107008.0, + "85": 3912107008.0, + "86": 3912107008.0, + "87": 3912107008.0, + "88": 3912107008.0, + "89": 3912107008.0, + "90": 3912107008.0, + "91": 3912107008.0, + "92": 3912107008.0, + "93": 3912107008.0, + "94": 3912107008.0, + "95": 3912107008.0, + "96": 3912107008.0, + "97": 3912107008.0, + "98": 3912107008.0, + "99": 3912107008.0, + "100": 3912107008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22.61328, + "2": 0.20632, + "3": 0.1825, + "4": 0.17425, + "5": 0.17426, + "6": 0.17288, + "7": 0.17611, + "8": 0.17588, + "9": 0.17544, + "10": 0.17232, + "11": 0.17362, + "12": 0.17368, + "13": 0.17578, + "14": 0.17305, + "15": 0.17514, + "16": 0.17367, + "17": 0.17474, + "18": 0.17196, + "19": 0.1737, + "20": 0.17359, + "21": 0.17277, + "22": 0.17502, + "23": 0.17321, + "24": 0.172, + "25": 0.17239, + "26": 0.17041, + "27": 0.17172, + "28": 0.17178, + "29": 0.17225, + "30": 0.17082, + "31": 0.17234, + "32": 0.17192, + "33": 0.17201, + "34": 0.17283, + "35": 0.17212, + "36": 0.17393, + "37": 0.17078, + "38": 0.17394, + "39": 0.17341, + "40": 0.17259, + "41": 0.17595, + "42": 0.17237, + "43": 0.17334, + "44": 0.17079, + "45": 0.17254, + "46": 0.17378, + "47": 0.17228, + "48": 0.17193, + "49": 0.17207, + "50": 0.17337, + "51": 0.18317, + "52": 0.44439, + "53": 0.17445, + "54": 0.1761, + "55": 0.17625, + "56": 0.17729, + "57": 0.17831, + "58": 0.17704, + "59": 0.17623, + "60": 0.17946, + "61": 0.17712, + "62": 0.17274, + "63": 0.17809, + "64": 0.17585, + "65": 0.179, + "66": 0.17777, + "67": 0.17718, + "68": 0.17654, + "69": 0.17491, + "70": 0.17913, + "71": 0.17578, + "72": 0.17669, + "73": 0.17735, + "74": 0.17979, + "75": 0.17759, + "76": 0.17852, + "77": 0.1802, + "78": 0.17531, + "79": 0.17834, + "80": 0.17782, + "81": 0.17526, + "82": 0.17347, + "83": 0.17511, + "84": 0.17403, + "85": 0.17634, + "86": 0.1725, + "87": 0.17606, + "88": 0.17534, + "89": 0.17477, + "90": 0.17578, + "91": 0.1753, + "92": 0.17582, + "93": 0.17671, + "94": 0.17621, + "95": 0.17573, + "96": 0.17511, + "97": 0.17469, + "98": 0.17498, + "99": 0.41864, + "100": 0.17148 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..c903b0c0464 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81006, + "19": 10.79716, + "20": 10.69212, + "21": 10.68168, + "22": 10.52085, + "23": 10.70898, + "24": 10.576, + "25": 10.52413, + "26": 10.59515, + "27": 10.58426, + "28": 10.56233, + "29": 10.57012, + "30": 10.34552, + "31": 10.10047, + "32": 10.45375, + "33": 10.44623, + "34": 10.20608, + "35": 10.26241, + "36": 10.2124, + "37": 10.3252, + "38": 10.16775, + "39": 10.38332, + "40": 10.07236, + "41": 10.13863, + "42": 10.19811, + "43": 9.81071, + "44": 9.93244, + "45": 9.81098, + "46": 9.80879, + "47": 10.1261, + "48": 9.82105, + "49": 9.50626, + "50": 9.88418, + "51": 9.8366, + "52": 9.7254, + "53": 10.04687, + "54": 9.93029, + "55": 9.86374, + "56": 9.60183, + "57": 9.4509, + "58": 9.80845, + "59": 9.56672, + "60": 9.47963, + "61": 9.67901, + "62": 9.96737, + "63": 9.3516, + "64": 9.75605, + "65": 8.93065, + "66": 9.68055, + "67": 9.3589, + "68": 9.76988, + "69": 9.77495, + "70": 9.71218, + "71": 9.60756, + "72": 9.57084, + "73": 9.48407, + "74": 8.92824, + "75": 9.4005, + "76": 9.07193, + "77": 10.05226, + "78": 9.71515, + "79": 9.35771, + "80": 9.39078, + "81": 9.46751, + "82": 9.68504, + "83": 9.29556, + "84": 9.4053, + "85": 9.60138, + "86": 9.06772, + "87": 9.58501, + "88": 9.73362, + "89": 9.59515, + "90": 9.80502, + "91": 9.3255, + "92": 9.35334, + "93": 9.06984, + "94": 8.8223, + "95": 9.50821, + "96": 9.51534, + "97": 9.29768, + "98": 9.66205, + "99": 8.87695, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1729.0, + "18": 1672.0, + "19": 1718.0, + "20": 1621.0, + "21": 1931.0, + "22": 1738.0, + "23": 1992.0, + "24": 1676.0, + "25": 1689.0, + "26": 1748.0, + "27": 1801.0, + "28": 1986.0, + "29": 2043.0, + "30": 1907.0, + "31": 1627.0, + "32": 1918.0, + "33": 2003.0, + "34": 1779.0, + "35": 1922.0, + "36": 1942.0, + "37": 2294.0, + "38": 2145.0, + "39": 2395.0, + "40": 2045.0, + "41": 2415.0, + "42": 2277.0, + "43": 1863.0, + "44": 2087.0, + "45": 2097.0, + "46": 2265.0, + "47": 2436.0, + "48": 2460.0, + "49": 2217.0, + "50": 2368.0, + "51": 2552.0, + "52": 2541.0, + "53": 2907.0, + "54": 2604.0, + "55": 2383.0, + "56": 2762.0, + "57": 2128.0, + "58": 3040.0, + "59": 2797.0, + "60": 2509.0, + "61": 3041.0, + "62": 2642.0, + "63": 2401.0, + "64": 2913.0, + "65": 2628.0, + "66": 2934.0, + "67": 2791.0, + "68": 2718.0, + "69": 3050.0, + "70": 3129.0, + "71": 3014.0, + "72": 2263.0, + "73": 2761.0, + "74": 1887.0, + "75": 2552.0, + "76": 3111.0, + "77": 3240.0, + "78": 3150.0, + "79": 3139.0, + "80": 3279.0, + "81": 3595.0, + "82": 3194.0, + "83": 2797.0, + "84": 3272.0, + "85": 3344.0, + "86": 2611.0, + "87": 3802.0, + "88": 3054.0, + "89": 3205.0, + "90": 2980.0, + "91": 2726.0, + "92": 3043.0, + "93": 2751.0, + "94": 3247.0, + "95": 3324.0, + "96": 3503.0, + "97": 3057.0, + "98": 3465.0, + "99": 3320.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 519065600.0, + "2": 519065600.0, + "3": 519065600.0, + "4": 519065600.0, + "5": 519065600.0, + "6": 519065600.0, + "7": 519065600.0, + "8": 519065600.0, + "9": 519065600.0, + "10": 519065600.0, + "11": 519065600.0, + "12": 519065600.0, + "13": 519065600.0, + "14": 519065600.0, + "15": 519065600.0, + "16": 519065600.0, + "17": 519065600.0, + "18": 519065600.0, + "19": 519065600.0, + "20": 519065600.0, + "21": 519065600.0, + "22": 519065600.0, + "23": 519065600.0, + "24": 519065600.0, + "25": 519065600.0, + "26": 519065600.0, + "27": 519065600.0, + "28": 519065600.0, + "29": 519065600.0, + "30": 519065600.0, + "31": 519065600.0, + "32": 519065600.0, + "33": 519065600.0, + "34": 519065600.0, + "35": 519065600.0, + "36": 519065600.0, + "37": 519065600.0, + "38": 519065600.0, + "39": 519065600.0, + "40": 519065600.0, + "41": 519065600.0, + "42": 519065600.0, + "43": 519065600.0, + "44": 519065600.0, + "45": 519065600.0, + "46": 519065600.0, + "47": 519065600.0, + "48": 519065600.0, + "49": 519065600.0, + "50": 519065600.0, + "51": 519065600.0, + "52": 519065600.0, + "53": 519065600.0, + "54": 519065600.0, + "55": 519065600.0, + "56": 519065600.0, + "57": 519065600.0, + "58": 519065600.0, + "59": 519065600.0, + "60": 519065600.0, + "61": 519065600.0, + "62": 519065600.0, + "63": 519065600.0, + "64": 519065600.0, + "65": 519065600.0, + "66": 519065600.0, + "67": 519065600.0, + "68": 519065600.0, + "69": 519065600.0, + "70": 519065600.0, + "71": 519065600.0, + "72": 519065600.0, + "73": 519065600.0, + "74": 519065600.0, + "75": 519065600.0, + "76": 519065600.0, + "77": 519065600.0, + "78": 519065600.0, + "79": 519065600.0, + "80": 519065600.0, + "81": 519065600.0, + "82": 519065600.0, + "83": 519065600.0, + "84": 519065600.0, + "85": 519065600.0, + "86": 519065600.0, + "87": 519065600.0, + "88": 519065600.0, + "89": 519065600.0, + "90": 519065600.0, + "91": 519065600.0, + "92": 519065600.0, + "93": 519065600.0, + "94": 519065600.0, + "95": 519065600.0, + "96": 519065600.0, + "97": 519065600.0, + "98": 519065600.0, + "99": 519065600.0, + "100": 519065600.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3767053312.0, + "2": 3907909632.0, + "3": 3907909632.0, + "4": 3907909632.0, + "5": 3907909632.0, + "6": 3907909632.0, + "7": 3907909632.0, + "8": 3907909632.0, + "9": 3907909632.0, + "10": 3907909632.0, + "11": 3907909632.0, + "12": 3907909632.0, + "13": 3907909632.0, + "14": 3907909632.0, + "15": 3907909632.0, + "16": 3907909632.0, + "17": 3907909632.0, + "18": 3907909632.0, + "19": 3907909632.0, + "20": 3907909632.0, + "21": 3907909632.0, + "22": 3907909632.0, + "23": 3907909632.0, + "24": 3907909632.0, + "25": 3907909632.0, + "26": 3907909632.0, + "27": 3907909632.0, + "28": 3907909632.0, + "29": 3907909632.0, + "30": 3907909632.0, + "31": 3907909632.0, + "32": 3907909632.0, + "33": 3907909632.0, + "34": 3907909632.0, + "35": 3907909632.0, + "36": 3907909632.0, + "37": 3907909632.0, + "38": 3907909632.0, + "39": 3907909632.0, + "40": 3907909632.0, + "41": 3907909632.0, + "42": 3907909632.0, + "43": 3907909632.0, + "44": 3907909632.0, + "45": 3907909632.0, + "46": 3907909632.0, + "47": 3907909632.0, + "48": 3907909632.0, + "49": 3907909632.0, + "50": 3907909632.0, + "51": 3907909632.0, + "52": 3907909632.0, + "53": 3907909632.0, + "54": 3907909632.0, + "55": 3907909632.0, + "56": 3907909632.0, + "57": 3907909632.0, + "58": 3907909632.0, + "59": 3907909632.0, + "60": 3907909632.0, + "61": 3907909632.0, + "62": 3907909632.0, + "63": 3907909632.0, + "64": 3907909632.0, + "65": 3907909632.0, + "66": 3907909632.0, + "67": 3907909632.0, + "68": 3907909632.0, + "69": 3907909632.0, + "70": 3907909632.0, + "71": 3907909632.0, + "72": 3907909632.0, + "73": 3907909632.0, + "74": 3907909632.0, + "75": 3907909632.0, + "76": 3907909632.0, + "77": 3907909632.0, + "78": 3907909632.0, + "79": 3907909632.0, + "80": 3907909632.0, + "81": 3907909632.0, + "82": 3907909632.0, + "83": 3907909632.0, + "84": 3907909632.0, + "85": 3907909632.0, + "86": 3907909632.0, + "87": 3907909632.0, + "88": 3907909632.0, + "89": 3907909632.0, + "90": 3907909632.0, + "91": 3907909632.0, + "92": 3907909632.0, + "93": 3907909632.0, + "94": 3907909632.0, + "95": 3907909632.0, + "96": 3907909632.0, + "97": 3907909632.0, + "98": 3907909632.0, + "99": 3907909632.0, + "100": 3907909632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 23.61626, + "2": 0.20825, + "3": 0.18598, + "4": 0.17768, + "5": 0.1774, + "6": 0.17565, + "7": 0.17554, + "8": 0.17574, + "9": 0.17822, + "10": 0.18542, + "11": 0.3344, + "12": 0.17809, + "13": 0.17774, + "14": 0.17628, + "15": 0.17758, + "16": 0.17752, + "17": 0.17677, + "18": 0.17866, + "19": 0.17775, + "20": 0.17503, + "21": 0.32873, + "22": 0.17696, + "23": 0.17781, + "24": 0.17815, + "25": 0.17477, + "26": 0.17422, + "27": 0.17425, + "28": 0.17474, + "29": 0.17648, + "30": 0.17377, + "31": 0.33173, + "32": 0.17366, + "33": 0.17393, + "34": 0.17333, + "35": 0.17469, + "36": 0.1737, + "37": 0.17376, + "38": 0.17511, + "39": 0.17374, + "40": 0.38462, + "41": 0.33019, + "42": 0.18095, + "43": 0.17639, + "44": 0.17398, + "45": 0.17539, + "46": 0.17369, + "47": 0.1733, + "48": 0.17495, + "49": 0.1737, + "50": 0.1733, + "51": 0.3281, + "52": 0.17681, + "53": 0.17706, + "54": 0.17883, + "55": 0.18057, + "56": 0.18194, + "57": 0.18281, + "58": 0.1833, + "59": 0.18471, + "60": 0.40872, + "61": 0.33723, + "62": 0.18166, + "63": 0.38808, + "64": 0.17968, + "65": 0.18147, + "66": 0.17961, + "67": 0.17851, + "68": 0.17748, + "69": 0.17797, + "70": 0.17994, + "71": 0.33627, + "72": 0.17952, + "73": 0.178, + "74": 0.17922, + "75": 0.17803, + "76": 0.18159, + "77": 0.17818, + "78": 0.17782, + "79": 0.36281, + "80": 0.18081, + "81": 0.33928, + "82": 0.17691, + "83": 0.17684, + "84": 0.17781, + "85": 0.18012, + "86": 0.17905, + "87": 0.17785, + "88": 0.17817, + "89": 0.17743, + "90": 0.17902, + "91": 0.33283, + "92": 0.17956, + "93": 0.17935, + "94": 0.18039, + "95": 0.17971, + "96": 0.18011, + "97": 0.18031, + "98": 0.1785, + "99": 0.18155, + "100": 0.17741 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..9d14156b3a0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82005, + "2": 10.81907, + "3": 10.81396, + "4": 10.78497, + "5": 10.85284, + "6": 10.87449, + "7": 10.83201, + "8": 10.83297, + "9": 10.83935, + "10": 10.78455, + "11": 10.87798, + "12": 10.86112, + "13": 10.86444, + "14": 10.87605, + "15": 10.79229, + "16": 10.79509, + "17": 10.76768, + "18": 10.81006, + "19": 10.79716, + "20": 10.69212, + "21": 10.68168, + "22": 10.52085, + "23": 10.70898, + "24": 10.576, + "25": 10.52413, + "26": 10.59515, + "27": 10.58426, + "28": 10.56233, + "29": 10.57012, + "30": 10.34552, + "31": 10.10047, + "32": 10.45375, + "33": 10.44623, + "34": 10.20608, + "35": 10.26241, + "36": 10.2124, + "37": 10.3252, + "38": 10.16775, + "39": 10.38332, + "40": 10.07236, + "41": 10.13863, + "42": 10.19811, + "43": 9.81071, + "44": 9.93244, + "45": 9.81098, + "46": 9.80879, + "47": 10.1261, + "48": 9.82105, + "49": 9.50626, + "50": 9.88418, + "51": 9.8366, + "52": 9.7254, + "53": 10.04687, + "54": 9.93029, + "55": 9.86374, + "56": 9.60183, + "57": 9.4509, + "58": 9.80845, + "59": 9.56672, + "60": 9.47963, + "61": 9.67901, + "62": 9.96737, + "63": 9.3516, + "64": 9.75605, + "65": 8.93065, + "66": 9.68055, + "67": 9.3589, + "68": 9.76988, + "69": 9.77495, + "70": 9.71218, + "71": 9.60756, + "72": 9.57084, + "73": 9.48407, + "74": 8.92824, + "75": 9.4005, + "76": 9.07193, + "77": 10.05226, + "78": 9.71515, + "79": 9.35771, + "80": 9.39078, + "81": 9.46751, + "82": 9.68504, + "83": 9.29556, + "84": 9.4053, + "85": 9.60138, + "86": 9.06772, + "87": 9.58501, + "88": 9.73362, + "89": 9.59515, + "90": 9.80502, + "91": 9.3255, + "92": 9.35334, + "93": 9.06984, + "94": 8.8223, + "95": 9.50821, + "96": 9.51534, + "97": 9.29768, + "98": 9.66205, + "99": 8.87695, + "100": 9.3924 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1559.0, + "2": 1591.0, + "3": 1727.0, + "4": 1835.0, + "5": 1840.0, + "6": 1719.0, + "7": 1740.0, + "8": 1591.0, + "9": 1839.0, + "10": 1380.0, + "11": 1856.0, + "12": 1693.0, + "13": 1906.0, + "14": 1757.0, + "15": 1848.0, + "16": 1791.0, + "17": 1729.0, + "18": 1672.0, + "19": 1718.0, + "20": 1621.0, + "21": 1931.0, + "22": 1738.0, + "23": 1992.0, + "24": 1676.0, + "25": 1689.0, + "26": 1748.0, + "27": 1801.0, + "28": 1986.0, + "29": 2043.0, + "30": 1907.0, + "31": 1627.0, + "32": 1918.0, + "33": 2003.0, + "34": 1779.0, + "35": 1922.0, + "36": 1942.0, + "37": 2294.0, + "38": 2145.0, + "39": 2395.0, + "40": 2045.0, + "41": 2415.0, + "42": 2277.0, + "43": 1863.0, + "44": 2087.0, + "45": 2097.0, + "46": 2265.0, + "47": 2436.0, + "48": 2460.0, + "49": 2217.0, + "50": 2368.0, + "51": 2552.0, + "52": 2541.0, + "53": 2907.0, + "54": 2604.0, + "55": 2383.0, + "56": 2762.0, + "57": 2128.0, + "58": 3040.0, + "59": 2797.0, + "60": 2509.0, + "61": 3041.0, + "62": 2642.0, + "63": 2401.0, + "64": 2913.0, + "65": 2628.0, + "66": 2934.0, + "67": 2791.0, + "68": 2718.0, + "69": 3050.0, + "70": 3129.0, + "71": 3014.0, + "72": 2263.0, + "73": 2761.0, + "74": 1887.0, + "75": 2552.0, + "76": 3111.0, + "77": 3240.0, + "78": 3150.0, + "79": 3139.0, + "80": 3279.0, + "81": 3595.0, + "82": 3194.0, + "83": 2797.0, + "84": 3272.0, + "85": 3344.0, + "86": 2611.0, + "87": 3802.0, + "88": 3054.0, + "89": 3205.0, + "90": 2980.0, + "91": 2726.0, + "92": 3043.0, + "93": 2751.0, + "94": 3247.0, + "95": 3324.0, + "96": 3503.0, + "97": 3057.0, + "98": 3465.0, + "99": 3320.0, + "100": 3467.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 519065600.0, + "2": 519065600.0, + "3": 519065600.0, + "4": 519065600.0, + "5": 519065600.0, + "6": 519065600.0, + "7": 519065600.0, + "8": 519065600.0, + "9": 519065600.0, + "10": 519065600.0, + "11": 519065600.0, + "12": 519065600.0, + "13": 519065600.0, + "14": 519065600.0, + "15": 519065600.0, + "16": 519065600.0, + "17": 519065600.0, + "18": 519065600.0, + "19": 519065600.0, + "20": 519065600.0, + "21": 519065600.0, + "22": 519065600.0, + "23": 519065600.0, + "24": 519065600.0, + "25": 519065600.0, + "26": 519065600.0, + "27": 519065600.0, + "28": 519065600.0, + "29": 519065600.0, + "30": 519065600.0, + "31": 519065600.0, + "32": 519065600.0, + "33": 519065600.0, + "34": 519065600.0, + "35": 519065600.0, + "36": 519065600.0, + "37": 519065600.0, + "38": 519065600.0, + "39": 519065600.0, + "40": 519065600.0, + "41": 519065600.0, + "42": 519065600.0, + "43": 519065600.0, + "44": 519065600.0, + "45": 519065600.0, + "46": 519065600.0, + "47": 519065600.0, + "48": 519065600.0, + "49": 519065600.0, + "50": 519065600.0, + "51": 519065600.0, + "52": 519065600.0, + "53": 519065600.0, + "54": 519065600.0, + "55": 519065600.0, + "56": 519065600.0, + "57": 519065600.0, + "58": 519065600.0, + "59": 519065600.0, + "60": 519065600.0, + "61": 519065600.0, + "62": 519065600.0, + "63": 519065600.0, + "64": 519065600.0, + "65": 519065600.0, + "66": 519065600.0, + "67": 519065600.0, + "68": 519065600.0, + "69": 519065600.0, + "70": 519065600.0, + "71": 519065600.0, + "72": 519065600.0, + "73": 519065600.0, + "74": 519065600.0, + "75": 519065600.0, + "76": 519065600.0, + "77": 519065600.0, + "78": 519065600.0, + "79": 519065600.0, + "80": 519065600.0, + "81": 519065600.0, + "82": 519065600.0, + "83": 519065600.0, + "84": 519065600.0, + "85": 519065600.0, + "86": 519065600.0, + "87": 519065600.0, + "88": 519065600.0, + "89": 519065600.0, + "90": 519065600.0, + "91": 519065600.0, + "92": 519065600.0, + "93": 519065600.0, + "94": 519065600.0, + "95": 519065600.0, + "96": 519065600.0, + "97": 519065600.0, + "98": 519065600.0, + "99": 519065600.0, + "100": 519065600.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3767053312.0, + "2": 3907909632.0, + "3": 3907909632.0, + "4": 3907909632.0, + "5": 3907909632.0, + "6": 3907909632.0, + "7": 3907909632.0, + "8": 3907909632.0, + "9": 3907909632.0, + "10": 3907909632.0, + "11": 3907909632.0, + "12": 3907909632.0, + "13": 3907909632.0, + "14": 3907909632.0, + "15": 3907909632.0, + "16": 3907909632.0, + "17": 3907909632.0, + "18": 3907909632.0, + "19": 3907909632.0, + "20": 3907909632.0, + "21": 3907909632.0, + "22": 3907909632.0, + "23": 3907909632.0, + "24": 3907909632.0, + "25": 3907909632.0, + "26": 3907909632.0, + "27": 3907909632.0, + "28": 3907909632.0, + "29": 3907909632.0, + "30": 3907909632.0, + "31": 3907909632.0, + "32": 3907909632.0, + "33": 3907909632.0, + "34": 3907909632.0, + "35": 3907909632.0, + "36": 3907909632.0, + "37": 3907909632.0, + "38": 3907909632.0, + "39": 3907909632.0, + "40": 3907909632.0, + "41": 3907909632.0, + "42": 3907909632.0, + "43": 3907909632.0, + "44": 3907909632.0, + "45": 3907909632.0, + "46": 3907909632.0, + "47": 3907909632.0, + "48": 3907909632.0, + "49": 3907909632.0, + "50": 3907909632.0, + "51": 3907909632.0, + "52": 3907909632.0, + "53": 3907909632.0, + "54": 3907909632.0, + "55": 3907909632.0, + "56": 3907909632.0, + "57": 3907909632.0, + "58": 3907909632.0, + "59": 3907909632.0, + "60": 3907909632.0, + "61": 3907909632.0, + "62": 3907909632.0, + "63": 3907909632.0, + "64": 3907909632.0, + "65": 3907909632.0, + "66": 3907909632.0, + "67": 3907909632.0, + "68": 3907909632.0, + "69": 3907909632.0, + "70": 3907909632.0, + "71": 3907909632.0, + "72": 3907909632.0, + "73": 3907909632.0, + "74": 3907909632.0, + "75": 3907909632.0, + "76": 3907909632.0, + "77": 3907909632.0, + "78": 3907909632.0, + "79": 3907909632.0, + "80": 3907909632.0, + "81": 3907909632.0, + "82": 3907909632.0, + "83": 3907909632.0, + "84": 3907909632.0, + "85": 3907909632.0, + "86": 3907909632.0, + "87": 3907909632.0, + "88": 3907909632.0, + "89": 3907909632.0, + "90": 3907909632.0, + "91": 3907909632.0, + "92": 3907909632.0, + "93": 3907909632.0, + "94": 3907909632.0, + "95": 3907909632.0, + "96": 3907909632.0, + "97": 3907909632.0, + "98": 3907909632.0, + "99": 3907909632.0, + "100": 3907909632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 21.99574, + "2": 0.195, + "3": 0.1744, + "4": 0.17427, + "5": 0.17308, + "6": 0.16861, + "7": 0.17429, + "8": 0.1716, + "9": 0.16924, + "10": 0.16858, + "11": 0.33896, + "12": 0.17029, + "13": 0.16981, + "14": 0.16723, + "15": 0.16853, + "16": 0.16865, + "17": 0.16777, + "18": 0.16879, + "19": 0.16785, + "20": 0.16886, + "21": 0.3357, + "22": 0.17081, + "23": 0.17048, + "24": 0.16879, + "25": 0.1687, + "26": 0.16713, + "27": 0.16939, + "28": 0.1692, + "29": 0.17134, + "30": 0.17092, + "31": 0.3812, + "32": 0.17397, + "33": 0.17588, + "34": 0.17999, + "35": 0.17703, + "36": 0.1801, + "37": 0.1707, + "38": 0.17289, + "39": 0.17016, + "40": 0.17112, + "41": 0.33944, + "42": 0.17206, + "43": 0.17137, + "44": 0.16906, + "45": 0.42618, + "46": 0.1703, + "47": 0.17243, + "48": 0.17004, + "49": 0.16966, + "50": 0.16756, + "51": 0.51274, + "52": 0.17278, + "53": 0.17206, + "54": 0.17409, + "55": 0.17339, + "56": 0.17492, + "57": 0.17254, + "58": 0.17691, + "59": 0.46979, + "60": 0.37194, + "61": 0.34378, + "62": 0.17598, + "63": 0.48505, + "64": 0.17494, + "65": 0.18089, + "66": 0.17632, + "67": 0.1754, + "68": 0.17476, + "69": 0.172, + "70": 0.1727, + "71": 0.33976, + "72": 0.17542, + "73": 0.17238, + "74": 0.17531, + "75": 0.1747, + "76": 0.17675, + "77": 0.17303, + "78": 0.17397, + "79": 0.17413, + "80": 0.17841, + "81": 0.34399, + "82": 0.17266, + "83": 0.17424, + "84": 0.17542, + "85": 0.17322, + "86": 0.17628, + "87": 0.17307, + "88": 0.17357, + "89": 0.17221, + "90": 0.17402, + "91": 0.34115, + "92": 0.17524, + "93": 0.21142, + "94": 0.18543, + "95": 0.19932, + "96": 0.20217, + "97": 0.21251, + "98": 0.20217, + "99": 0.19729, + "100": 0.19649 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..31d5de38121 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.9359, + "2": 10.92235, + "3": 10.92366, + "4": 10.90567, + "5": 10.93225, + "6": 10.93547, + "7": 10.92702, + "8": 10.92052, + "9": 10.9395, + "10": 10.91083, + "11": 10.94242, + "12": 10.93185, + "13": 10.92496, + "14": 10.94487, + "15": 10.85723, + "16": 10.88074, + "17": 10.87011, + "18": 10.88561, + "19": 10.87042, + "20": 10.77088, + "21": 10.7565, + "22": 10.62779, + "23": 10.77022, + "24": 10.65205, + "25": 10.60556, + "26": 10.66333, + "27": 10.66552, + "28": 10.60547, + "29": 10.6471, + "30": 10.40549, + "31": 10.16719, + "32": 10.51369, + "33": 10.5051, + "34": 10.27046, + "35": 10.31366, + "36": 10.27241, + "37": 10.38617, + "38": 10.23179, + "39": 10.45437, + "40": 10.12334, + "41": 10.19576, + "42": 10.25282, + "43": 9.86635, + "44": 9.99502, + "45": 9.87564, + "46": 9.86006, + "47": 10.19474, + "48": 9.87777, + "49": 9.56673, + "50": 9.94452, + "51": 9.89728, + "52": 9.7879, + "53": 10.1278, + "54": 9.98346, + "55": 9.90094, + "56": 9.66557, + "57": 9.50042, + "58": 9.87703, + "59": 9.61777, + "60": 9.55238, + "61": 9.71568, + "62": 10.03384, + "63": 9.41318, + "64": 9.8198, + "65": 8.96792, + "66": 9.74791, + "67": 9.39412, + "68": 9.82081, + "69": 9.82389, + "70": 9.77835, + "71": 9.64728, + "72": 9.59599, + "73": 9.53704, + "74": 8.96545, + "75": 9.44605, + "76": 9.10011, + "77": 10.09977, + "78": 9.7355, + "79": 9.38643, + "80": 9.42014, + "81": 9.50916, + "82": 9.72306, + "83": 9.3462, + "84": 9.44805, + "85": 9.64324, + "86": 9.07728, + "87": 9.61635, + "88": 9.79137, + "89": 9.61978, + "90": 9.85827, + "91": 9.35282, + "92": 9.38717, + "93": 9.08084, + "94": 8.82234, + "95": 9.52085, + "96": 9.54578, + "97": 9.34183, + "98": 9.70521, + "99": 8.89223, + "100": 9.43415 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727686.0, + "2": 22924976.0, + "3": 22597376.0, + "4": 23218740.0, + "5": 22715312.0, + "6": 23020980.0, + "7": 22770736.0, + "8": 22927078.0, + "9": 22841964.0, + "10": 22919060.0, + "11": 22501344.0, + "12": 22460424.0, + "13": 22916824.0, + "14": 22388904.0, + "15": 22821200.0, + "16": 22829956.0, + "17": 22819072.0, + "18": 22582680.0, + "19": 22618528.0, + "20": 22693840.0, + "21": 22739692.0, + "22": 22799900.0, + "23": 22538946.0, + "24": 22771530.0, + "25": 22819524.0, + "26": 22548320.0, + "27": 22468868.0, + "28": 22452892.0, + "29": 22530184.0, + "30": 22631232.0, + "31": 22955646.0, + "32": 22584920.0, + "33": 22558000.0, + "34": 22835968.0, + "35": 22787888.0, + "36": 22589844.0, + "37": 22497188.0, + "38": 22896516.0, + "39": 22801334.0, + "40": 22658144.0, + "41": 22659958.0, + "42": 22667478.0, + "43": 22975596.0, + "44": 22746734.0, + "45": 22674630.0, + "46": 22884436.0, + "47": 22633878.0, + "48": 22929042.0, + "49": 22727064.0, + "50": 22904452.0, + "51": 22791508.0, + "52": 22748880.0, + "53": 22925802.0, + "54": 22840006.0, + "55": 22519094.0, + "56": 22878426.0, + "57": 23113192.0, + "58": 22845340.0, + "59": 22716044.0, + "60": 22743052.0, + "61": 22724280.0, + "62": 22673222.0, + "63": 22845776.0, + "64": 22823900.0, + "65": 23061016.0, + "66": 22729616.0, + "67": 22907968.0, + "68": 22610332.0, + "69": 22584232.0, + "70": 22829332.0, + "71": 22748216.0, + "72": 22654286.0, + "73": 22740516.0, + "74": 23047704.0, + "75": 23054164.0, + "76": 22901462.0, + "77": 22272388.0, + "78": 22789468.0, + "79": 22744352.0, + "80": 22707344.0, + "81": 22890704.0, + "82": 22777178.0, + "83": 22839028.0, + "84": 23010036.0, + "85": 22712182.0, + "86": 23103124.0, + "87": 22735052.0, + "88": 22637176.0, + "89": 22499076.0, + "90": 22971846.0, + "91": 22767066.0, + "92": 22808462.0, + "93": 22659702.0, + "94": 22912288.0, + "95": 23047676.0, + "96": 22828984.0, + "97": 22608528.0, + "98": 22763476.0, + "99": 22905460.0, + "100": 23015938.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 519065600.0, + "2": 519065600.0, + "3": 519065600.0, + "4": 519065600.0, + "5": 519065600.0, + "6": 519065600.0, + "7": 519065600.0, + "8": 519065600.0, + "9": 519065600.0, + "10": 519065600.0, + "11": 519065600.0, + "12": 519065600.0, + "13": 519065600.0, + "14": 519065600.0, + "15": 519065600.0, + "16": 519065600.0, + "17": 519065600.0, + "18": 519065600.0, + "19": 519065600.0, + "20": 519065600.0, + "21": 519065600.0, + "22": 519065600.0, + "23": 519065600.0, + "24": 519065600.0, + "25": 519065600.0, + "26": 519065600.0, + "27": 519065600.0, + "28": 519065600.0, + "29": 519065600.0, + "30": 519065600.0, + "31": 519065600.0, + "32": 519065600.0, + "33": 519065600.0, + "34": 519065600.0, + "35": 519065600.0, + "36": 519065600.0, + "37": 519065600.0, + "38": 519065600.0, + "39": 519065600.0, + "40": 519065600.0, + "41": 519065600.0, + "42": 519065600.0, + "43": 519065600.0, + "44": 519065600.0, + "45": 519065600.0, + "46": 519065600.0, + "47": 519065600.0, + "48": 519065600.0, + "49": 519065600.0, + "50": 519065600.0, + "51": 519065600.0, + "52": 519065600.0, + "53": 519065600.0, + "54": 519065600.0, + "55": 519065600.0, + "56": 519065600.0, + "57": 519065600.0, + "58": 519065600.0, + "59": 519065600.0, + "60": 519065600.0, + "61": 519065600.0, + "62": 519065600.0, + "63": 519065600.0, + "64": 519065600.0, + "65": 519065600.0, + "66": 519065600.0, + "67": 519065600.0, + "68": 519065600.0, + "69": 519065600.0, + "70": 519065600.0, + "71": 519065600.0, + "72": 519065600.0, + "73": 519065600.0, + "74": 519065600.0, + "75": 519065600.0, + "76": 519065600.0, + "77": 519065600.0, + "78": 519065600.0, + "79": 519065600.0, + "80": 519065600.0, + "81": 519065600.0, + "82": 519065600.0, + "83": 519065600.0, + "84": 519065600.0, + "85": 519065600.0, + "86": 519065600.0, + "87": 519065600.0, + "88": 519065600.0, + "89": 519065600.0, + "90": 519065600.0, + "91": 519065600.0, + "92": 519065600.0, + "93": 519065600.0, + "94": 519065600.0, + "95": 519065600.0, + "96": 519065600.0, + "97": 519065600.0, + "98": 519065600.0, + "99": 519065600.0, + "100": 519065600.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3767053312.0, + "2": 3907909632.0, + "3": 3907909632.0, + "4": 3907909632.0, + "5": 3907909632.0, + "6": 3907909632.0, + "7": 3907909632.0, + "8": 3907909632.0, + "9": 3907909632.0, + "10": 3907909632.0, + "11": 3907909632.0, + "12": 3907909632.0, + "13": 3907909632.0, + "14": 3907909632.0, + "15": 3907909632.0, + "16": 3907909632.0, + "17": 3907909632.0, + "18": 3907909632.0, + "19": 3907909632.0, + "20": 3907909632.0, + "21": 3907909632.0, + "22": 3907909632.0, + "23": 3907909632.0, + "24": 3907909632.0, + "25": 3907909632.0, + "26": 3907909632.0, + "27": 3907909632.0, + "28": 3907909632.0, + "29": 3907909632.0, + "30": 3907909632.0, + "31": 3907909632.0, + "32": 3907909632.0, + "33": 3907909632.0, + "34": 3907909632.0, + "35": 3907909632.0, + "36": 3907909632.0, + "37": 3907909632.0, + "38": 3907909632.0, + "39": 3907909632.0, + "40": 3907909632.0, + "41": 3907909632.0, + "42": 3907909632.0, + "43": 3907909632.0, + "44": 3907909632.0, + "45": 3907909632.0, + "46": 3907909632.0, + "47": 3907909632.0, + "48": 3907909632.0, + "49": 3907909632.0, + "50": 3907909632.0, + "51": 3907909632.0, + "52": 3907909632.0, + "53": 3907909632.0, + "54": 3907909632.0, + "55": 3907909632.0, + "56": 3907909632.0, + "57": 3907909632.0, + "58": 3907909632.0, + "59": 3907909632.0, + "60": 3907909632.0, + "61": 3907909632.0, + "62": 3907909632.0, + "63": 3907909632.0, + "64": 3907909632.0, + "65": 3907909632.0, + "66": 3907909632.0, + "67": 3907909632.0, + "68": 3907909632.0, + "69": 3907909632.0, + "70": 3907909632.0, + "71": 3907909632.0, + "72": 3907909632.0, + "73": 3907909632.0, + "74": 3907909632.0, + "75": 3907909632.0, + "76": 3907909632.0, + "77": 3907909632.0, + "78": 3907909632.0, + "79": 3907909632.0, + "80": 3907909632.0, + "81": 3907909632.0, + "82": 3907909632.0, + "83": 3907909632.0, + "84": 3907909632.0, + "85": 3907909632.0, + "86": 3907909632.0, + "87": 3907909632.0, + "88": 3907909632.0, + "89": 3907909632.0, + "90": 3907909632.0, + "91": 3907909632.0, + "92": 3907909632.0, + "93": 3907909632.0, + "94": 3907909632.0, + "95": 3907909632.0, + "96": 3907909632.0, + "97": 3907909632.0, + "98": 3907909632.0, + "99": 3907909632.0, + "100": 3907909632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.87438, + "2": 0.21694, + "3": 0.17509, + "4": 0.17193, + "5": 0.17145, + "6": 0.17454, + "7": 0.1709, + "8": 0.1729, + "9": 0.17295, + "10": 0.17277, + "11": 0.17318, + "12": 0.17273, + "13": 0.171, + "14": 0.17232, + "15": 0.1722, + "16": 0.17261, + "17": 0.17438, + "18": 0.17353, + "19": 0.1731, + "20": 0.17122, + "21": 0.17049, + "22": 0.17348, + "23": 0.17169, + "24": 0.17293, + "25": 0.17364, + "26": 0.17003, + "27": 0.17011, + "28": 0.17126, + "29": 0.1722, + "30": 0.17039, + "31": 0.17016, + "32": 0.17105, + "33": 0.16994, + "34": 0.17076, + "35": 0.17327, + "36": 0.17175, + "37": 0.17048, + "38": 0.1719, + "39": 0.17008, + "40": 0.17063, + "41": 0.17257, + "42": 0.17094, + "43": 0.17115, + "44": 0.17118, + "45": 0.171, + "46": 0.17132, + "47": 0.16943, + "48": 0.17114, + "49": 0.17083, + "50": 0.16974, + "51": 0.17654, + "52": 0.17131, + "53": 0.35484, + "54": 0.16981, + "55": 0.16969, + "56": 0.17178, + "57": 0.16951, + "58": 0.16856, + "59": 0.17046, + "60": 0.45725, + "61": 0.17092, + "62": 0.171, + "63": 0.17125, + "64": 0.17131, + "65": 0.17462, + "66": 0.17192, + "67": 0.16865, + "68": 0.17104, + "69": 0.16936, + "70": 0.17219, + "71": 0.174, + "72": 0.17689, + "73": 0.17007, + "74": 0.16999, + "75": 0.16903, + "76": 0.17096, + "77": 0.16876, + "78": 0.17318, + "79": 0.17216, + "80": 0.17036, + "81": 0.16928, + "82": 0.17019, + "83": 0.17001, + "84": 0.17182, + "85": 0.16951, + "86": 0.4678, + "87": 0.16886, + "88": 0.1689, + "89": 0.16837, + "90": 0.16751, + "91": 0.168, + "92": 0.1724, + "93": 0.16907, + "94": 0.17236, + "95": 0.16852, + "96": 0.16884, + "97": 0.16823, + "98": 0.16821, + "99": 0.16981, + "100": 0.1715 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..0805966b94c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.9359, + "2": 10.92235, + "3": 10.92366, + "4": 10.90567, + "5": 10.93225, + "6": 10.93547, + "7": 10.92702, + "8": 10.92052, + "9": 10.9395, + "10": 10.91083, + "11": 10.94242, + "12": 10.93185, + "13": 10.92496, + "14": 10.94487, + "15": 10.85723, + "16": 10.88074, + "17": 10.87011, + "18": 10.88561, + "19": 10.87042, + "20": 10.77088, + "21": 10.7565, + "22": 10.62779, + "23": 10.77022, + "24": 10.65205, + "25": 10.60556, + "26": 10.66333, + "27": 10.66552, + "28": 10.60547, + "29": 10.6471, + "30": 10.40549, + "31": 10.16719, + "32": 10.51369, + "33": 10.5051, + "34": 10.27046, + "35": 10.31366, + "36": 10.27241, + "37": 10.38617, + "38": 10.23179, + "39": 10.45437, + "40": 10.12334, + "41": 10.19576, + "42": 10.25282, + "43": 9.86635, + "44": 9.99502, + "45": 9.87564, + "46": 9.86006, + "47": 10.19474, + "48": 9.87777, + "49": 9.56673, + "50": 9.94452, + "51": 9.89728, + "52": 9.7879, + "53": 10.1278, + "54": 9.98346, + "55": 9.90094, + "56": 9.66557, + "57": 9.50042, + "58": 9.87703, + "59": 9.61777, + "60": 9.55238, + "61": 9.71568, + "62": 10.03384, + "63": 9.41318, + "64": 9.8198, + "65": 8.96792, + "66": 9.74791, + "67": 9.39412, + "68": 9.82081, + "69": 9.82389, + "70": 9.77835, + "71": 9.64728, + "72": 9.59599, + "73": 9.53704, + "74": 8.96545, + "75": 9.44605, + "76": 9.10011, + "77": 10.09977, + "78": 9.7355, + "79": 9.38643, + "80": 9.42014, + "81": 9.50916, + "82": 9.72306, + "83": 9.3462, + "84": 9.44805, + "85": 9.64324, + "86": 9.07728, + "87": 9.61635, + "88": 9.79137, + "89": 9.61978, + "90": 9.85827, + "91": 9.35282, + "92": 9.38717, + "93": 9.08084, + "94": 8.82234, + "95": 9.52085, + "96": 9.54578, + "97": 9.34183, + "98": 9.70521, + "99": 8.89223, + "100": 9.43415 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22727686.0, + "2": 22924976.0, + "3": 22597376.0, + "4": 23218740.0, + "5": 22715312.0, + "6": 23020980.0, + "7": 22770736.0, + "8": 22927078.0, + "9": 22841964.0, + "10": 22919060.0, + "11": 22501344.0, + "12": 22460424.0, + "13": 22916824.0, + "14": 22388904.0, + "15": 22821200.0, + "16": 22829956.0, + "17": 22819072.0, + "18": 22582680.0, + "19": 22618528.0, + "20": 22693840.0, + "21": 22739692.0, + "22": 22799900.0, + "23": 22538946.0, + "24": 22771530.0, + "25": 22819524.0, + "26": 22548320.0, + "27": 22468868.0, + "28": 22452892.0, + "29": 22530184.0, + "30": 22631232.0, + "31": 22955646.0, + "32": 22584920.0, + "33": 22558000.0, + "34": 22835968.0, + "35": 22787888.0, + "36": 22589844.0, + "37": 22497188.0, + "38": 22896516.0, + "39": 22801334.0, + "40": 22658144.0, + "41": 22659958.0, + "42": 22667478.0, + "43": 22975596.0, + "44": 22746734.0, + "45": 22674630.0, + "46": 22884436.0, + "47": 22633878.0, + "48": 22929042.0, + "49": 22727064.0, + "50": 22904452.0, + "51": 22791508.0, + "52": 22748880.0, + "53": 22925802.0, + "54": 22840006.0, + "55": 22519094.0, + "56": 22878426.0, + "57": 23113192.0, + "58": 22845340.0, + "59": 22716044.0, + "60": 22743052.0, + "61": 22724280.0, + "62": 22673222.0, + "63": 22845776.0, + "64": 22823900.0, + "65": 23061016.0, + "66": 22729616.0, + "67": 22907968.0, + "68": 22610332.0, + "69": 22584232.0, + "70": 22829332.0, + "71": 22748216.0, + "72": 22654286.0, + "73": 22740516.0, + "74": 23047704.0, + "75": 23054164.0, + "76": 22901462.0, + "77": 22272388.0, + "78": 22789468.0, + "79": 22744352.0, + "80": 22707344.0, + "81": 22890704.0, + "82": 22777178.0, + "83": 22839028.0, + "84": 23010036.0, + "85": 22712182.0, + "86": 23103124.0, + "87": 22735052.0, + "88": 22637176.0, + "89": 22499076.0, + "90": 22971846.0, + "91": 22767066.0, + "92": 22808462.0, + "93": 22659702.0, + "94": 22912288.0, + "95": 23047676.0, + "96": 22828984.0, + "97": 22608528.0, + "98": 22763476.0, + "99": 22905460.0, + "100": 23015938.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 519065600.0, + "2": 519065600.0, + "3": 519065600.0, + "4": 519065600.0, + "5": 519065600.0, + "6": 519065600.0, + "7": 519065600.0, + "8": 519065600.0, + "9": 519065600.0, + "10": 519065600.0, + "11": 519065600.0, + "12": 519065600.0, + "13": 519065600.0, + "14": 519065600.0, + "15": 519065600.0, + "16": 519065600.0, + "17": 519065600.0, + "18": 519065600.0, + "19": 519065600.0, + "20": 519065600.0, + "21": 519065600.0, + "22": 519065600.0, + "23": 519065600.0, + "24": 519065600.0, + "25": 519065600.0, + "26": 519065600.0, + "27": 519065600.0, + "28": 519065600.0, + "29": 519065600.0, + "30": 519065600.0, + "31": 519065600.0, + "32": 519065600.0, + "33": 519065600.0, + "34": 519065600.0, + "35": 519065600.0, + "36": 519065600.0, + "37": 519065600.0, + "38": 519065600.0, + "39": 519065600.0, + "40": 519065600.0, + "41": 519065600.0, + "42": 519065600.0, + "43": 519065600.0, + "44": 519065600.0, + "45": 519065600.0, + "46": 519065600.0, + "47": 519065600.0, + "48": 519065600.0, + "49": 519065600.0, + "50": 519065600.0, + "51": 519065600.0, + "52": 519065600.0, + "53": 519065600.0, + "54": 519065600.0, + "55": 519065600.0, + "56": 519065600.0, + "57": 519065600.0, + "58": 519065600.0, + "59": 519065600.0, + "60": 519065600.0, + "61": 519065600.0, + "62": 519065600.0, + "63": 519065600.0, + "64": 519065600.0, + "65": 519065600.0, + "66": 519065600.0, + "67": 519065600.0, + "68": 519065600.0, + "69": 519065600.0, + "70": 519065600.0, + "71": 519065600.0, + "72": 519065600.0, + "73": 519065600.0, + "74": 519065600.0, + "75": 519065600.0, + "76": 519065600.0, + "77": 519065600.0, + "78": 519065600.0, + "79": 519065600.0, + "80": 519065600.0, + "81": 519065600.0, + "82": 519065600.0, + "83": 519065600.0, + "84": 519065600.0, + "85": 519065600.0, + "86": 519065600.0, + "87": 519065600.0, + "88": 519065600.0, + "89": 519065600.0, + "90": 519065600.0, + "91": 519065600.0, + "92": 519065600.0, + "93": 519065600.0, + "94": 519065600.0, + "95": 519065600.0, + "96": 519065600.0, + "97": 519065600.0, + "98": 519065600.0, + "99": 519065600.0, + "100": 519065600.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3767053312.0, + "2": 3907909632.0, + "3": 3907909632.0, + "4": 3907909632.0, + "5": 3907909632.0, + "6": 3907909632.0, + "7": 3907909632.0, + "8": 3907909632.0, + "9": 3907909632.0, + "10": 3907909632.0, + "11": 3907909632.0, + "12": 3907909632.0, + "13": 3907909632.0, + "14": 3907909632.0, + "15": 3907909632.0, + "16": 3907909632.0, + "17": 3907909632.0, + "18": 3907909632.0, + "19": 3907909632.0, + "20": 3907909632.0, + "21": 3907909632.0, + "22": 3907909632.0, + "23": 3907909632.0, + "24": 3907909632.0, + "25": 3907909632.0, + "26": 3907909632.0, + "27": 3907909632.0, + "28": 3907909632.0, + "29": 3907909632.0, + "30": 3907909632.0, + "31": 3907909632.0, + "32": 3907909632.0, + "33": 3907909632.0, + "34": 3907909632.0, + "35": 3907909632.0, + "36": 3907909632.0, + "37": 3907909632.0, + "38": 3907909632.0, + "39": 3907909632.0, + "40": 3907909632.0, + "41": 3907909632.0, + "42": 3907909632.0, + "43": 3907909632.0, + "44": 3907909632.0, + "45": 3907909632.0, + "46": 3907909632.0, + "47": 3907909632.0, + "48": 3907909632.0, + "49": 3907909632.0, + "50": 3907909632.0, + "51": 3907909632.0, + "52": 3907909632.0, + "53": 3907909632.0, + "54": 3907909632.0, + "55": 3907909632.0, + "56": 3907909632.0, + "57": 3907909632.0, + "58": 3907909632.0, + "59": 3907909632.0, + "60": 3907909632.0, + "61": 3907909632.0, + "62": 3907909632.0, + "63": 3907909632.0, + "64": 3907909632.0, + "65": 3907909632.0, + "66": 3907909632.0, + "67": 3907909632.0, + "68": 3907909632.0, + "69": 3907909632.0, + "70": 3907909632.0, + "71": 3907909632.0, + "72": 3907909632.0, + "73": 3907909632.0, + "74": 3907909632.0, + "75": 3907909632.0, + "76": 3907909632.0, + "77": 3907909632.0, + "78": 3907909632.0, + "79": 3907909632.0, + "80": 3907909632.0, + "81": 3907909632.0, + "82": 3907909632.0, + "83": 3907909632.0, + "84": 3907909632.0, + "85": 3907909632.0, + "86": 3907909632.0, + "87": 3907909632.0, + "88": 3907909632.0, + "89": 3907909632.0, + "90": 3907909632.0, + "91": 3907909632.0, + "92": 3907909632.0, + "93": 3907909632.0, + "94": 3907909632.0, + "95": 3907909632.0, + "96": 3907909632.0, + "97": 3907909632.0, + "98": 3907909632.0, + "99": 3907909632.0, + "100": 3907909632.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 23.45694, + "2": 0.20346, + "3": 0.36409, + "4": 0.17107, + "5": 0.17023, + "6": 0.17074, + "7": 0.38699, + "8": 0.17041, + "9": 0.16888, + "10": 0.16794, + "11": 0.16767, + "12": 0.16767, + "13": 0.16663, + "14": 0.16756, + "15": 0.16615, + "16": 0.16657, + "17": 0.16641, + "18": 0.16668, + "19": 0.16729, + "20": 0.16771, + "21": 0.16737, + "22": 0.17089, + "23": 0.16854, + "24": 0.16704, + "25": 0.16752, + "26": 0.16872, + "27": 0.16766, + "28": 0.16803, + "29": 0.16634, + "30": 0.16703, + "31": 0.17358, + "32": 0.16783, + "33": 0.1671, + "34": 0.16686, + "35": 0.16729, + "36": 0.16745, + "37": 0.16819, + "38": 0.16726, + "39": 0.16705, + "40": 0.16771, + "41": 0.16664, + "42": 0.1698, + "43": 0.16915, + "44": 0.16724, + "45": 0.16752, + "46": 0.16605, + "47": 0.16613, + "48": 0.16709, + "49": 0.17009, + "50": 0.1677, + "51": 0.17196, + "52": 0.16857, + "53": 0.16835, + "54": 0.16769, + "55": 0.16954, + "56": 0.16851, + "57": 0.17085, + "58": 0.16981, + "59": 0.17076, + "60": 0.45985, + "61": 0.1701, + "62": 0.16952, + "63": 0.16919, + "64": 0.16816, + "65": 0.16858, + "66": 0.16768, + "67": 0.16965, + "68": 0.16881, + "69": 0.16837, + "70": 0.16824, + "71": 0.16956, + "72": 0.16914, + "73": 0.17096, + "74": 0.16954, + "75": 0.16772, + "76": 0.16933, + "77": 0.16793, + "78": 0.16698, + "79": 0.17038, + "80": 0.16791, + "81": 0.16747, + "82": 0.16745, + "83": 0.16958, + "84": 0.16855, + "85": 0.16833, + "86": 0.16922, + "87": 0.16839, + "88": 0.16805, + "89": 0.16825, + "90": 0.16691, + "91": 0.16873, + "92": 0.16882, + "93": 0.16822, + "94": 0.16847, + "95": 0.16712, + "96": 0.16757, + "97": 0.16817, + "98": 0.168, + "99": 0.16812, + "100": 0.16722 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..796cf7943e2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81184, + "2": 10.80964, + "3": 10.8261, + "4": 10.83055, + "5": 10.85464, + "6": 10.84052, + "7": 10.83581, + "8": 10.80288, + "9": 10.87748, + "10": 10.88256, + "11": 10.87624, + "12": 10.82598, + "13": 10.84134, + "14": 10.81521, + "15": 10.80679, + "16": 10.79904, + "17": 10.76842, + "18": 10.77939, + "19": 10.75192, + "20": 10.63196, + "21": 10.68212, + "22": 10.63985, + "23": 10.75592, + "24": 10.60961, + "25": 10.47374, + "26": 10.59698, + "27": 10.54094, + "28": 10.44971, + "29": 10.39259, + "30": 10.39285, + "31": 10.49257, + "32": 10.31859, + "33": 10.27757, + "34": 10.44435, + "35": 9.96791, + "36": 10.11232, + "37": 10.02385, + "38": 10.37514, + "39": 9.78682, + "40": 10.1, + "41": 10.12396, + "42": 10.03, + "43": 10.19936, + "44": 10.0547, + "45": 9.68344, + "46": 9.98163, + "47": 9.92505, + "48": 9.6694, + "49": 9.91809, + "50": 9.92465, + "51": 9.79329, + "52": 9.32763, + "53": 9.64981, + "54": 9.86048, + "55": 9.98132, + "56": 9.81689, + "57": 9.74442, + "58": 9.83018, + "59": 9.32863, + "60": 9.3523, + "61": 9.45116, + "62": 10.19127, + "63": 9.35566, + "64": 9.62798, + "65": 9.70213, + "66": 9.52535, + "67": 9.66178, + "68": 9.58762, + "69": 9.38587, + "70": 9.73809, + "71": 9.87613, + "72": 9.69256, + "73": 9.39159, + "74": 9.44032, + "75": 8.95616, + "76": 9.56366, + "77": 9.61319, + "78": 9.39159, + "79": 9.52907, + "80": 9.31501, + "81": 9.70173, + "82": 9.90394, + "83": 9.31634, + "84": 9.47172, + "85": 8.97886, + "86": 9.6647, + "87": 9.43234, + "88": 9.58689, + "89": 9.52323, + "90": 9.55812, + "91": 9.62767, + "92": 9.13988, + "93": 9.42377, + "94": 9.54545, + "95": 9.13529, + "96": 8.75175, + "97": 9.58148, + "98": 9.78964, + "99": 9.37931, + "100": 9.21091 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1125.0, + "2": 1177.0, + "3": 1265.0, + "4": 1241.0, + "5": 1255.0, + "6": 1304.0, + "7": 1204.0, + "8": 998.0, + "9": 1236.0, + "10": 1367.0, + "11": 1252.0, + "12": 1281.0, + "13": 1254.0, + "14": 1148.0, + "15": 1127.0, + "16": 1102.0, + "17": 1193.0, + "18": 1248.0, + "19": 1072.0, + "20": 1082.0, + "21": 1201.0, + "22": 1302.0, + "23": 1336.0, + "24": 1317.0, + "25": 1114.0, + "26": 1200.0, + "27": 1255.0, + "28": 1323.0, + "29": 1288.0, + "30": 1558.0, + "31": 1489.0, + "32": 1390.0, + "33": 1413.0, + "34": 1518.0, + "35": 1292.0, + "36": 1395.0, + "37": 1487.0, + "38": 1573.0, + "39": 1376.0, + "40": 1433.0, + "41": 1677.0, + "42": 1728.0, + "43": 1669.0, + "44": 1607.0, + "45": 1564.0, + "46": 1874.0, + "47": 1660.0, + "48": 1554.0, + "49": 1781.0, + "50": 1749.0, + "51": 1747.0, + "52": 1656.0, + "53": 1912.0, + "54": 1870.0, + "55": 1718.0, + "56": 1972.0, + "57": 1917.0, + "58": 1686.0, + "59": 1542.0, + "60": 1872.0, + "61": 2198.0, + "62": 2145.0, + "63": 1975.0, + "64": 2111.0, + "65": 2464.0, + "66": 2160.0, + "67": 2311.0, + "68": 2259.0, + "69": 2255.0, + "70": 2564.0, + "71": 2402.0, + "72": 2424.0, + "73": 1990.0, + "74": 2221.0, + "75": 1884.0, + "76": 2375.0, + "77": 2394.0, + "78": 2450.0, + "79": 2674.0, + "80": 1924.0, + "81": 2394.0, + "82": 2612.0, + "83": 2579.0, + "84": 2243.0, + "85": 2150.0, + "86": 2358.0, + "87": 2678.0, + "88": 2260.0, + "89": 2556.0, + "90": 2319.0, + "91": 2452.0, + "92": 1952.0, + "93": 2189.0, + "94": 2451.0, + "95": 2518.0, + "96": 2182.0, + "97": 2162.0, + "98": 2332.0, + "99": 2331.0, + "100": 2071.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0, + "51": 730320896.0, + "52": 730320896.0, + "53": 730320896.0, + "54": 730320896.0, + "55": 730320896.0, + "56": 730320896.0, + "57": 730320896.0, + "58": 730320896.0, + "59": 730320896.0, + "60": 730320896.0, + "61": 730320896.0, + "62": 730320896.0, + "63": 730320896.0, + "64": 730320896.0, + "65": 730320896.0, + "66": 730320896.0, + "67": 730320896.0, + "68": 730320896.0, + "69": 730320896.0, + "70": 730320896.0, + "71": 730320896.0, + "72": 730320896.0, + "73": 730320896.0, + "74": 730320896.0, + "75": 730320896.0, + "76": 730320896.0, + "77": 730320896.0, + "78": 730320896.0, + "79": 730320896.0, + "80": 730320896.0, + "81": 730320896.0, + "82": 730320896.0, + "83": 730320896.0, + "84": 730320896.0, + "85": 730320896.0, + "86": 730320896.0, + "87": 730320896.0, + "88": 730320896.0, + "89": 730320896.0, + "90": 730320896.0, + "91": 730320896.0, + "92": 730320896.0, + "93": 730320896.0, + "94": 730320896.0, + "95": 730320896.0, + "96": 730320896.0, + "97": 730320896.0, + "98": 730320896.0, + "99": 730320896.0, + "100": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4311542272.0, + "2": 4593253888.0, + "3": 4593253888.0, + "4": 4593253888.0, + "5": 4593253888.0, + "6": 4593253888.0, + "7": 4593253888.0, + "8": 4593253888.0, + "9": 4593253888.0, + "10": 4593253888.0, + "11": 4593253888.0, + "12": 4593253888.0, + "13": 4593253888.0, + "14": 4593253888.0, + "15": 4593253888.0, + "16": 4593253888.0, + "17": 4593253888.0, + "18": 4593253888.0, + "19": 4593253888.0, + "20": 4593253888.0, + "21": 4593253888.0, + "22": 4593253888.0, + "23": 4593253888.0, + "24": 4593253888.0, + "25": 4593253888.0, + "26": 4593253888.0, + "27": 4593253888.0, + "28": 4593253888.0, + "29": 4593253888.0, + "30": 4593253888.0, + "31": 4593253888.0, + "32": 4593253888.0, + "33": 4593253888.0, + "34": 4593253888.0, + "35": 4593253888.0, + "36": 4593253888.0, + "37": 4593253888.0, + "38": 4593253888.0, + "39": 4593253888.0, + "40": 4593253888.0, + "41": 4593253888.0, + "42": 4593253888.0, + "43": 4593253888.0, + "44": 4593253888.0, + "45": 4593253888.0, + "46": 4593253888.0, + "47": 4593253888.0, + "48": 4593253888.0, + "49": 4593253888.0, + "50": 4593253888.0, + "51": 4593253888.0, + "52": 4593253888.0, + "53": 4593253888.0, + "54": 4593253888.0, + "55": 4593253888.0, + "56": 4593253888.0, + "57": 4593253888.0, + "58": 4593253888.0, + "59": 4593253888.0, + "60": 4593253888.0, + "61": 4593253888.0, + "62": 4593253888.0, + "63": 4593253888.0, + "64": 4593253888.0, + "65": 4593253888.0, + "66": 4593253888.0, + "67": 4593253888.0, + "68": 4593253888.0, + "69": 4593253888.0, + "70": 4593253888.0, + "71": 4593253888.0, + "72": 4593253888.0, + "73": 4593253888.0, + "74": 4593253888.0, + "75": 4593253888.0, + "76": 4593253888.0, + "77": 4593253888.0, + "78": 4593253888.0, + "79": 4593253888.0, + "80": 4593253888.0, + "81": 4593253888.0, + "82": 4593253888.0, + "83": 4593253888.0, + "84": 4593253888.0, + "85": 4593253888.0, + "86": 4593253888.0, + "87": 4593253888.0, + "88": 4593253888.0, + "89": 4593253888.0, + "90": 4593253888.0, + "91": 4593253888.0, + "92": 4593253888.0, + "93": 4593253888.0, + "94": 4593253888.0, + "95": 4593253888.0, + "96": 4593253888.0, + "97": 4593253888.0, + "98": 4593253888.0, + "99": 4593253888.0, + "100": 4593253888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.52326, + "2": 0.393, + "3": 0.36565, + "4": 0.55958, + "5": 0.59432, + "6": 0.36552, + "7": 0.3644, + "8": 0.36754, + "9": 0.36565, + "10": 0.36536, + "11": 0.36628, + "12": 0.36391, + "13": 0.36591, + "14": 0.3664, + "15": 0.36556, + "16": 0.3646, + "17": 0.36476, + "18": 0.36531, + "19": 0.36649, + "20": 0.36649, + "21": 0.36435, + "22": 0.3664, + "23": 0.36307, + "24": 0.36376, + "25": 0.36657, + "26": 0.36362, + "27": 0.36425, + "28": 0.36383, + "29": 0.36442, + "30": 0.36444, + "31": 0.3654, + "32": 0.36458, + "33": 0.36385, + "34": 0.36266, + "35": 0.36477, + "36": 0.36485, + "37": 0.36372, + "38": 0.36353, + "39": 0.36479, + "40": 0.36451, + "41": 0.36779, + "42": 0.36291, + "43": 0.36064, + "44": 0.36562, + "45": 0.36059, + "46": 0.36061, + "47": 0.36334, + "48": 0.35858, + "49": 0.36178, + "50": 0.36084, + "51": 0.36846, + "52": 0.36344, + "53": 0.36176, + "54": 0.36135, + "55": 0.36414, + "56": 0.36441, + "57": 0.36275, + "58": 0.36148, + "59": 0.36257, + "60": 0.36232, + "61": 0.36496, + "62": 0.36046, + "63": 0.36356, + "64": 0.36319, + "65": 0.3607, + "66": 0.36207, + "67": 0.36075, + "68": 0.35944, + "69": 0.36108, + "70": 0.35673, + "71": 0.36006, + "72": 0.3571, + "73": 0.36016, + "74": 0.36157, + "75": 0.36375, + "76": 0.35881, + "77": 0.36157, + "78": 0.35722, + "79": 0.35554, + "80": 0.35834, + "81": 0.35751, + "82": 0.35515, + "83": 0.35648, + "84": 0.5928, + "85": 0.35925, + "86": 0.3557, + "87": 0.3574, + "88": 0.35737, + "89": 0.4081, + "90": 0.56444, + "91": 0.35647, + "92": 0.35632, + "93": 0.35846, + "94": 0.35392, + "95": 0.35892, + "96": 0.36197, + "97": 0.36101, + "98": 0.35768, + "99": 0.36307, + "100": 0.35815 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..ec432ff7884 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81184, + "2": 10.80964, + "3": 10.8261, + "4": 10.83055, + "5": 10.85464, + "6": 10.84052, + "7": 10.83581, + "8": 10.80288, + "9": 10.87748, + "10": 10.88256, + "11": 10.87624, + "12": 10.82598, + "13": 10.84134, + "14": 10.81521, + "15": 10.80679, + "16": 10.79904, + "17": 10.76842, + "18": 10.77939, + "19": 10.75192, + "20": 10.63196, + "21": 10.68212, + "22": 10.63985, + "23": 10.75592, + "24": 10.60961, + "25": 10.47374, + "26": 10.59698, + "27": 10.54094, + "28": 10.44971, + "29": 10.39259, + "30": 10.39285, + "31": 10.49257, + "32": 10.31859, + "33": 10.27757, + "34": 10.44435, + "35": 9.96791, + "36": 10.11232, + "37": 10.02385, + "38": 10.37514, + "39": 9.78682, + "40": 10.1, + "41": 10.12396, + "42": 10.03, + "43": 10.19936, + "44": 10.0547, + "45": 9.68344, + "46": 9.98163, + "47": 9.92505, + "48": 9.6694, + "49": 9.91809, + "50": 9.92465, + "51": 9.79329, + "52": 9.32763, + "53": 9.64981, + "54": 9.86048, + "55": 9.98132, + "56": 9.81689, + "57": 9.74442, + "58": 9.83018, + "59": 9.32863, + "60": 9.3523, + "61": 9.45116, + "62": 10.19127, + "63": 9.35566, + "64": 9.62798, + "65": 9.70213, + "66": 9.52535, + "67": 9.66178, + "68": 9.58762, + "69": 9.38587, + "70": 9.73809, + "71": 9.87613, + "72": 9.69256, + "73": 9.39159, + "74": 9.44032, + "75": 8.95616, + "76": 9.56366, + "77": 9.61319, + "78": 9.39159, + "79": 9.52907, + "80": 9.31501, + "81": 9.70173, + "82": 9.90394, + "83": 9.31634, + "84": 9.47172, + "85": 8.97886, + "86": 9.6647, + "87": 9.43234, + "88": 9.58689, + "89": 9.52323, + "90": 9.55812, + "91": 9.62767, + "92": 9.13988, + "93": 9.42377, + "94": 9.54545, + "95": 9.13529, + "96": 8.75175, + "97": 9.58148, + "98": 9.78964, + "99": 9.37931, + "100": 9.21091 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1125.0, + "2": 1177.0, + "3": 1265.0, + "4": 1241.0, + "5": 1255.0, + "6": 1304.0, + "7": 1204.0, + "8": 998.0, + "9": 1236.0, + "10": 1367.0, + "11": 1252.0, + "12": 1281.0, + "13": 1254.0, + "14": 1148.0, + "15": 1127.0, + "16": 1102.0, + "17": 1193.0, + "18": 1248.0, + "19": 1072.0, + "20": 1082.0, + "21": 1201.0, + "22": 1302.0, + "23": 1336.0, + "24": 1317.0, + "25": 1114.0, + "26": 1200.0, + "27": 1255.0, + "28": 1323.0, + "29": 1288.0, + "30": 1558.0, + "31": 1489.0, + "32": 1390.0, + "33": 1413.0, + "34": 1518.0, + "35": 1292.0, + "36": 1395.0, + "37": 1487.0, + "38": 1573.0, + "39": 1376.0, + "40": 1433.0, + "41": 1677.0, + "42": 1728.0, + "43": 1669.0, + "44": 1607.0, + "45": 1564.0, + "46": 1874.0, + "47": 1660.0, + "48": 1554.0, + "49": 1781.0, + "50": 1749.0, + "51": 1747.0, + "52": 1656.0, + "53": 1912.0, + "54": 1870.0, + "55": 1718.0, + "56": 1972.0, + "57": 1917.0, + "58": 1686.0, + "59": 1542.0, + "60": 1872.0, + "61": 2198.0, + "62": 2145.0, + "63": 1975.0, + "64": 2111.0, + "65": 2464.0, + "66": 2160.0, + "67": 2311.0, + "68": 2259.0, + "69": 2255.0, + "70": 2564.0, + "71": 2402.0, + "72": 2424.0, + "73": 1990.0, + "74": 2221.0, + "75": 1884.0, + "76": 2375.0, + "77": 2394.0, + "78": 2450.0, + "79": 2674.0, + "80": 1924.0, + "81": 2394.0, + "82": 2612.0, + "83": 2579.0, + "84": 2243.0, + "85": 2150.0, + "86": 2358.0, + "87": 2678.0, + "88": 2260.0, + "89": 2556.0, + "90": 2319.0, + "91": 2452.0, + "92": 1952.0, + "93": 2189.0, + "94": 2451.0, + "95": 2518.0, + "96": 2182.0, + "97": 2162.0, + "98": 2332.0, + "99": 2331.0, + "100": 2071.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0, + "51": 730320896.0, + "52": 730320896.0, + "53": 730320896.0, + "54": 730320896.0, + "55": 730320896.0, + "56": 730320896.0, + "57": 730320896.0, + "58": 730320896.0, + "59": 730320896.0, + "60": 730320896.0, + "61": 730320896.0, + "62": 730320896.0, + "63": 730320896.0, + "64": 730320896.0, + "65": 730320896.0, + "66": 730320896.0, + "67": 730320896.0, + "68": 730320896.0, + "69": 730320896.0, + "70": 730320896.0, + "71": 730320896.0, + "72": 730320896.0, + "73": 730320896.0, + "74": 730320896.0, + "75": 730320896.0, + "76": 730320896.0, + "77": 730320896.0, + "78": 730320896.0, + "79": 730320896.0, + "80": 730320896.0, + "81": 730320896.0, + "82": 730320896.0, + "83": 730320896.0, + "84": 730320896.0, + "85": 730320896.0, + "86": 730320896.0, + "87": 730320896.0, + "88": 730320896.0, + "89": 730320896.0, + "90": 730320896.0, + "91": 730320896.0, + "92": 730320896.0, + "93": 730320896.0, + "94": 730320896.0, + "95": 730320896.0, + "96": 730320896.0, + "97": 730320896.0, + "98": 730320896.0, + "99": 730320896.0, + "100": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4311542272.0, + "2": 4593253888.0, + "3": 4593253888.0, + "4": 4593253888.0, + "5": 4593253888.0, + "6": 4593253888.0, + "7": 4593253888.0, + "8": 4593253888.0, + "9": 4593253888.0, + "10": 4593253888.0, + "11": 4593253888.0, + "12": 4593253888.0, + "13": 4593253888.0, + "14": 4593253888.0, + "15": 4593253888.0, + "16": 4593253888.0, + "17": 4593253888.0, + "18": 4593253888.0, + "19": 4593253888.0, + "20": 4593253888.0, + "21": 4593253888.0, + "22": 4593253888.0, + "23": 4593253888.0, + "24": 4593253888.0, + "25": 4593253888.0, + "26": 4593253888.0, + "27": 4593253888.0, + "28": 4593253888.0, + "29": 4593253888.0, + "30": 4593253888.0, + "31": 4593253888.0, + "32": 4593253888.0, + "33": 4593253888.0, + "34": 4593253888.0, + "35": 4593253888.0, + "36": 4593253888.0, + "37": 4593253888.0, + "38": 4593253888.0, + "39": 4593253888.0, + "40": 4593253888.0, + "41": 4593253888.0, + "42": 4593253888.0, + "43": 4593253888.0, + "44": 4593253888.0, + "45": 4593253888.0, + "46": 4593253888.0, + "47": 4593253888.0, + "48": 4593253888.0, + "49": 4593253888.0, + "50": 4593253888.0, + "51": 4593253888.0, + "52": 4593253888.0, + "53": 4593253888.0, + "54": 4593253888.0, + "55": 4593253888.0, + "56": 4593253888.0, + "57": 4593253888.0, + "58": 4593253888.0, + "59": 4593253888.0, + "60": 4593253888.0, + "61": 4593253888.0, + "62": 4593253888.0, + "63": 4593253888.0, + "64": 4593253888.0, + "65": 4593253888.0, + "66": 4593253888.0, + "67": 4593253888.0, + "68": 4593253888.0, + "69": 4593253888.0, + "70": 4593253888.0, + "71": 4593253888.0, + "72": 4593253888.0, + "73": 4593253888.0, + "74": 4593253888.0, + "75": 4593253888.0, + "76": 4593253888.0, + "77": 4593253888.0, + "78": 4593253888.0, + "79": 4593253888.0, + "80": 4593253888.0, + "81": 4593253888.0, + "82": 4593253888.0, + "83": 4593253888.0, + "84": 4593253888.0, + "85": 4593253888.0, + "86": 4593253888.0, + "87": 4593253888.0, + "88": 4593253888.0, + "89": 4593253888.0, + "90": 4593253888.0, + "91": 4593253888.0, + "92": 4593253888.0, + "93": 4593253888.0, + "94": 4593253888.0, + "95": 4593253888.0, + "96": 4593253888.0, + "97": 4593253888.0, + "98": 4593253888.0, + "99": 4593253888.0, + "100": 4593253888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 21.09115, + "2": 0.41164, + "3": 0.38182, + "4": 0.38049, + "5": 0.60969, + "6": 0.36583, + "7": 0.36416, + "8": 0.37604, + "9": 0.3679, + "10": 0.36785, + "11": 0.36954, + "12": 0.36975, + "13": 0.36874, + "14": 0.36917, + "15": 0.37218, + "16": 0.37039, + "17": 0.36749, + "18": 0.36956, + "19": 0.37349, + "20": 0.37202, + "21": 0.36788, + "22": 0.37092, + "23": 0.36616, + "24": 0.36575, + "25": 0.36576, + "26": 0.36657, + "27": 0.36754, + "28": 0.36677, + "29": 0.36466, + "30": 0.36792, + "31": 0.36536, + "32": 0.36562, + "33": 0.36872, + "34": 0.36339, + "35": 0.36568, + "36": 0.36568, + "37": 0.36366, + "38": 0.36485, + "39": 0.36421, + "40": 0.35995, + "41": 0.36131, + "42": 0.36351, + "43": 0.36398, + "44": 0.3645, + "45": 0.359, + "46": 0.3614, + "47": 0.35954, + "48": 0.36106, + "49": 0.36508, + "50": 0.36162, + "51": 0.36692, + "52": 0.36519, + "53": 0.3602, + "54": 0.36089, + "55": 0.36195, + "56": 0.35943, + "57": 0.36048, + "58": 0.36032, + "59": 0.36446, + "60": 0.36455, + "61": 0.36016, + "62": 0.36345, + "63": 0.3602, + "64": 0.36067, + "65": 0.36076, + "66": 0.36538, + "67": 0.57124, + "68": 0.36375, + "69": 0.36298, + "70": 0.3623, + "71": 0.36583, + "72": 0.36199, + "73": 0.36503, + "74": 0.3612, + "75": 0.36467, + "76": 0.36386, + "77": 0.36345, + "78": 0.36764, + "79": 0.36585, + "80": 0.36636, + "81": 0.36354, + "82": 0.36426, + "83": 0.36781, + "84": 0.58958, + "85": 0.36576, + "86": 0.36705, + "87": 0.36285, + "88": 0.3685, + "89": 0.36603, + "90": 0.36553, + "91": 0.36328, + "92": 0.36279, + "93": 0.36243, + "94": 0.3647, + "95": 0.3673, + "96": 0.36551, + "97": 0.36297, + "98": 0.36326, + "99": 0.3621, + "100": 0.36226 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 74df36b8e05..ef753336010 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.83936, "5": 10.87939, "10": 10.8926, "15": 10.83088, "20": 10.6635, "25": 10.50497, "30": 10.42916, "35": 9.99632, "40": 10.12495, "45": 9.71369, "50": 9.96042}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1026.0, "5": 1259.0, "10": 1319.0, "15": 1217.0, "20": 1019.0, "25": 1066.0, "30": 1532.0, "35": 1235.0, "40": 1513.0, "45": 1501.0, "50": 1639.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4342344704.0, "5": 4626153472.0, "10": 4626153472.0, "15": 4626153472.0, "20": 4626153472.0, "25": 4626153472.0, "30": 4626153472.0, "35": 4626153472.0, "40": 4626153472.0, "45": 4626153472.0, "50": 4626153472.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.4691, "5": 0.23174, "10": 0.22417, "15": 0.22833, "20": 0.22378, "25": 0.23805, "30": 0.22623, "35": 0.22839, "40": 0.22689, "45": 0.22807, "50": 0.22843}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.83936, + "2": 10.8442, + "3": 10.86813, + "4": 10.86022, + "5": 10.87939, + "6": 10.85969, + "7": 10.86386, + "8": 10.8444, + "9": 10.88995, + "10": 10.8926, + "11": 10.89136, + "12": 10.85312, + "13": 10.87319, + "14": 10.83805, + "15": 10.83088, + "16": 10.82011, + "17": 10.79138, + "18": 10.81055, + "19": 10.77977, + "20": 10.6635, + "21": 10.69765, + "22": 10.67421, + "23": 10.77344, + "24": 10.63919, + "25": 10.50497, + "26": 10.61911, + "27": 10.56921, + "28": 10.46859, + "29": 10.41119, + "30": 10.42916, + "31": 10.52553, + "32": 10.34942, + "33": 10.2967, + "34": 10.46909, + "35": 9.99632, + "36": 10.13945, + "37": 10.0434, + "38": 10.4139, + "39": 9.80941, + "40": 10.12495, + "41": 10.14883, + "42": 10.04042, + "43": 10.22142, + "44": 10.07348, + "45": 9.71369, + "46": 10.00449, + "47": 9.94758, + "48": 9.68856, + "49": 9.93637, + "50": 9.96042 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1026.0, + "2": 1184.0, + "3": 1226.0, + "4": 1248.0, + "5": 1259.0, + "6": 1421.0, + "7": 1182.0, + "8": 1036.0, + "9": 1293.0, + "10": 1319.0, + "11": 1212.0, + "12": 1373.0, + "13": 1327.0, + "14": 1121.0, + "15": 1217.0, + "16": 1163.0, + "17": 1246.0, + "18": 1280.0, + "19": 1128.0, + "20": 1019.0, + "21": 1147.0, + "22": 1156.0, + "23": 1341.0, + "24": 1312.0, + "25": 1066.0, + "26": 1138.0, + "27": 1270.0, + "28": 1260.0, + "29": 1292.0, + "30": 1532.0, + "31": 1477.0, + "32": 1460.0, + "33": 1537.0, + "34": 1513.0, + "35": 1235.0, + "36": 1316.0, + "37": 1466.0, + "38": 1564.0, + "39": 1380.0, + "40": 1513.0, + "41": 1633.0, + "42": 1509.0, + "43": 1731.0, + "44": 1636.0, + "45": 1501.0, + "46": 1884.0, + "47": 1567.0, + "48": 1631.0, + "49": 1825.0, + "50": 1639.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4340902912.0, + "2": 4622614528.0, + "3": 4622614528.0, + "4": 4622614528.0, + "5": 4622614528.0, + "6": 4622614528.0, + "7": 4622614528.0, + "8": 4622614528.0, + "9": 4622614528.0, + "10": 4622614528.0, + "11": 4622614528.0, + "12": 4622614528.0, + "13": 4622614528.0, + "14": 4622614528.0, + "15": 4622614528.0, + "16": 4622614528.0, + "17": 4622614528.0, + "18": 4622614528.0, + "19": 4622614528.0, + "20": 4622614528.0, + "21": 4622614528.0, + "22": 4622614528.0, + "23": 4622614528.0, + "24": 4622614528.0, + "25": 4622614528.0, + "26": 4622614528.0, + "27": 4622614528.0, + "28": 4622614528.0, + "29": 4622614528.0, + "30": 4622614528.0, + "31": 4622614528.0, + "32": 4622614528.0, + "33": 4622614528.0, + "34": 4622614528.0, + "35": 4622614528.0, + "36": 4622614528.0, + "37": 4622614528.0, + "38": 4622614528.0, + "39": 4622614528.0, + "40": 4622614528.0, + "41": 4622614528.0, + "42": 4622614528.0, + "43": 4622614528.0, + "44": 4622614528.0, + "45": 4622614528.0, + "46": 4622614528.0, + "47": 4622614528.0, + "48": 4622614528.0, + "49": 4622614528.0, + "50": 4622614528.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.91878, + "2": 0.30301, + "3": 0.26726, + "4": 0.26031, + "5": 0.25815, + "6": 0.26195, + "7": 0.26064, + "8": 0.26459, + "9": 0.25765, + "10": 0.26159, + "11": 0.25801, + "12": 0.2577, + "13": 0.25882, + "14": 0.25879, + "15": 0.25853, + "16": 0.25689, + "17": 0.25763, + "18": 0.26042, + "19": 0.25687, + "20": 0.25459, + "21": 0.25315, + "22": 0.2615, + "23": 0.25473, + "24": 0.2558, + "25": 0.25524, + "26": 0.25354, + "27": 0.25658, + "28": 0.25019, + "29": 0.2622, + "30": 0.25785, + "31": 0.25516, + "32": 0.25092, + "33": 0.25655, + "34": 0.25493, + "35": 0.2541, + "36": 0.25492, + "37": 0.25229, + "38": 0.25775, + "39": 0.25432, + "40": 0.25358, + "41": 0.25502, + "42": 0.25428, + "43": 0.25111, + "44": 0.25239, + "45": 0.25573, + "46": 0.25505, + "47": 0.25199, + "48": 0.25057, + "49": 0.25588, + "50": 0.2569 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..67c8ef8abff --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.83936, + "2": 10.8442, + "3": 10.86813, + "4": 10.86022, + "5": 10.87939, + "6": 10.85969, + "7": 10.86386, + "8": 10.8444, + "9": 10.88995, + "10": 10.8926, + "11": 10.89136, + "12": 10.85312, + "13": 10.87319, + "14": 10.83805, + "15": 10.83088, + "16": 10.82011, + "17": 10.79138, + "18": 10.81055, + "19": 10.77977, + "20": 10.6635, + "21": 10.69765, + "22": 10.67421, + "23": 10.77344, + "24": 10.63919, + "25": 10.50497, + "26": 10.61911, + "27": 10.56921, + "28": 10.46859, + "29": 10.41119, + "30": 10.42916, + "31": 10.52553, + "32": 10.34942, + "33": 10.2967, + "34": 10.46909, + "35": 9.99632, + "36": 10.13945, + "37": 10.0434, + "38": 10.4139, + "39": 9.80941, + "40": 10.12495, + "41": 10.14883, + "42": 10.04042, + "43": 10.22142, + "44": 10.07348, + "45": 9.71369, + "46": 10.00449, + "47": 9.94758, + "48": 9.68856, + "49": 9.93637, + "50": 9.96042 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1026.0, + "2": 1184.0, + "3": 1226.0, + "4": 1248.0, + "5": 1259.0, + "6": 1421.0, + "7": 1182.0, + "8": 1036.0, + "9": 1293.0, + "10": 1319.0, + "11": 1212.0, + "12": 1373.0, + "13": 1327.0, + "14": 1121.0, + "15": 1217.0, + "16": 1163.0, + "17": 1246.0, + "18": 1280.0, + "19": 1128.0, + "20": 1019.0, + "21": 1147.0, + "22": 1156.0, + "23": 1341.0, + "24": 1312.0, + "25": 1066.0, + "26": 1138.0, + "27": 1270.0, + "28": 1260.0, + "29": 1292.0, + "30": 1532.0, + "31": 1477.0, + "32": 1460.0, + "33": 1537.0, + "34": 1513.0, + "35": 1235.0, + "36": 1316.0, + "37": 1466.0, + "38": 1564.0, + "39": 1380.0, + "40": 1513.0, + "41": 1633.0, + "42": 1509.0, + "43": 1731.0, + "44": 1636.0, + "45": 1501.0, + "46": 1884.0, + "47": 1567.0, + "48": 1631.0, + "49": 1825.0, + "50": 1639.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4340902912.0, + "2": 4622614528.0, + "3": 4622614528.0, + "4": 4622614528.0, + "5": 4622614528.0, + "6": 4622614528.0, + "7": 4622614528.0, + "8": 4622614528.0, + "9": 4622614528.0, + "10": 4622614528.0, + "11": 4622614528.0, + "12": 4622614528.0, + "13": 4622614528.0, + "14": 4622614528.0, + "15": 4622614528.0, + "16": 4622614528.0, + "17": 4622614528.0, + "18": 4622614528.0, + "19": 4622614528.0, + "20": 4622614528.0, + "21": 4622614528.0, + "22": 4622614528.0, + "23": 4622614528.0, + "24": 4622614528.0, + "25": 4622614528.0, + "26": 4622614528.0, + "27": 4622614528.0, + "28": 4622614528.0, + "29": 4622614528.0, + "30": 4622614528.0, + "31": 4622614528.0, + "32": 4622614528.0, + "33": 4622614528.0, + "34": 4622614528.0, + "35": 4622614528.0, + "36": 4622614528.0, + "37": 4622614528.0, + "38": 4622614528.0, + "39": 4622614528.0, + "40": 4622614528.0, + "41": 4622614528.0, + "42": 4622614528.0, + "43": 4622614528.0, + "44": 4622614528.0, + "45": 4622614528.0, + "46": 4622614528.0, + "47": 4622614528.0, + "48": 4622614528.0, + "49": 4622614528.0, + "50": 4622614528.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.91724, + "2": 0.27573, + "3": 0.23467, + "4": 0.23594, + "5": 0.23302, + "6": 0.23216, + "7": 0.23399, + "8": 0.23423, + "9": 0.23365, + "10": 0.23211, + "11": 0.2332, + "12": 0.23283, + "13": 0.23445, + "14": 0.23405, + "15": 0.23349, + "16": 0.23298, + "17": 0.23305, + "18": 0.23251, + "19": 0.23322, + "20": 0.23348, + "21": 0.23189, + "22": 0.23316, + "23": 0.2316, + "24": 0.23233, + "25": 0.23512, + "26": 0.23232, + "27": 0.23306, + "28": 0.23244, + "29": 0.23331, + "30": 0.23258, + "31": 0.23311, + "32": 0.23326, + "33": 0.23418, + "34": 0.23411, + "35": 0.23489, + "36": 0.2317, + "37": 0.23483, + "38": 0.23235, + "39": 0.23511, + "40": 0.23413, + "41": 0.23395, + "42": 0.23405, + "43": 0.23331, + "44": 0.23297, + "45": 0.23473, + "46": 0.23192, + "47": 0.23377, + "48": 0.23322, + "49": 0.23042, + "50": 0.23263 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..5e0ca24c497 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.83936, + "2": 10.8442, + "3": 10.86813, + "4": 10.86022, + "5": 10.87939, + "6": 10.85969, + "7": 10.86386, + "8": 10.8444, + "9": 10.88995, + "10": 10.8926, + "11": 10.89136, + "12": 10.85312, + "13": 10.87319, + "14": 10.83805, + "15": 10.83088, + "16": 10.82011, + "17": 10.79138, + "18": 10.81055, + "19": 10.77977, + "20": 10.6635, + "21": 10.69765, + "22": 10.67421, + "23": 10.77344, + "24": 10.63919, + "25": 10.50497, + "26": 10.61911, + "27": 10.56921, + "28": 10.46859, + "29": 10.41119, + "30": 10.42916, + "31": 10.52553, + "32": 10.34942, + "33": 10.2967, + "34": 10.46909, + "35": 9.99632, + "36": 10.13945, + "37": 10.0434, + "38": 10.4139, + "39": 9.80941, + "40": 10.12495, + "41": 10.14883, + "42": 10.04042, + "43": 10.22142, + "44": 10.07348, + "45": 9.71369, + "46": 10.00449, + "47": 9.94758, + "48": 9.68856, + "49": 9.93637, + "50": 9.96042 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1026.0, + "2": 1184.0, + "3": 1226.0, + "4": 1248.0, + "5": 1259.0, + "6": 1421.0, + "7": 1182.0, + "8": 1036.0, + "9": 1293.0, + "10": 1319.0, + "11": 1212.0, + "12": 1373.0, + "13": 1327.0, + "14": 1121.0, + "15": 1217.0, + "16": 1163.0, + "17": 1246.0, + "18": 1280.0, + "19": 1128.0, + "20": 1019.0, + "21": 1147.0, + "22": 1156.0, + "23": 1341.0, + "24": 1312.0, + "25": 1066.0, + "26": 1138.0, + "27": 1270.0, + "28": 1260.0, + "29": 1292.0, + "30": 1532.0, + "31": 1477.0, + "32": 1460.0, + "33": 1537.0, + "34": 1513.0, + "35": 1235.0, + "36": 1316.0, + "37": 1466.0, + "38": 1564.0, + "39": 1380.0, + "40": 1513.0, + "41": 1633.0, + "42": 1509.0, + "43": 1731.0, + "44": 1636.0, + "45": 1501.0, + "46": 1884.0, + "47": 1567.0, + "48": 1631.0, + "49": 1825.0, + "50": 1639.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759681536.0, + "2": 759681536.0, + "3": 759681536.0, + "4": 759681536.0, + "5": 759681536.0, + "6": 759681536.0, + "7": 759681536.0, + "8": 759681536.0, + "9": 759681536.0, + "10": 759681536.0, + "11": 759681536.0, + "12": 759681536.0, + "13": 759681536.0, + "14": 759681536.0, + "15": 759681536.0, + "16": 759681536.0, + "17": 759681536.0, + "18": 759681536.0, + "19": 759681536.0, + "20": 759681536.0, + "21": 759681536.0, + "22": 759681536.0, + "23": 759681536.0, + "24": 759681536.0, + "25": 759681536.0, + "26": 759681536.0, + "27": 759681536.0, + "28": 759681536.0, + "29": 759681536.0, + "30": 759681536.0, + "31": 759681536.0, + "32": 759681536.0, + "33": 759681536.0, + "34": 759681536.0, + "35": 759681536.0, + "36": 759681536.0, + "37": 759681536.0, + "38": 759681536.0, + "39": 759681536.0, + "40": 759681536.0, + "41": 759681536.0, + "42": 759681536.0, + "43": 759681536.0, + "44": 759681536.0, + "45": 759681536.0, + "46": 759681536.0, + "47": 759681536.0, + "48": 759681536.0, + "49": 759681536.0, + "50": 759681536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4340902912.0, + "2": 4622614528.0, + "3": 4622614528.0, + "4": 4622614528.0, + "5": 4622614528.0, + "6": 4622614528.0, + "7": 4622614528.0, + "8": 4622614528.0, + "9": 4622614528.0, + "10": 4622614528.0, + "11": 4622614528.0, + "12": 4622614528.0, + "13": 4622614528.0, + "14": 4622614528.0, + "15": 4622614528.0, + "16": 4622614528.0, + "17": 4622614528.0, + "18": 4622614528.0, + "19": 4622614528.0, + "20": 4622614528.0, + "21": 4622614528.0, + "22": 4622614528.0, + "23": 4622614528.0, + "24": 4622614528.0, + "25": 4622614528.0, + "26": 4622614528.0, + "27": 4622614528.0, + "28": 4622614528.0, + "29": 4622614528.0, + "30": 4622614528.0, + "31": 4622614528.0, + "32": 4622614528.0, + "33": 4622614528.0, + "34": 4622614528.0, + "35": 4622614528.0, + "36": 4622614528.0, + "37": 4622614528.0, + "38": 4622614528.0, + "39": 4622614528.0, + "40": 4622614528.0, + "41": 4622614528.0, + "42": 4622614528.0, + "43": 4622614528.0, + "44": 4622614528.0, + "45": 4622614528.0, + "46": 4622614528.0, + "47": 4622614528.0, + "48": 4622614528.0, + "49": 4622614528.0, + "50": 4622614528.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.785, + "2": 0.28429, + "3": 0.25654, + "4": 0.25675, + "5": 0.25763, + "6": 0.25556, + "7": 0.25403, + "8": 0.25276, + "9": 0.25351, + "10": 0.25546, + "11": 0.25488, + "12": 0.25607, + "13": 0.25404, + "14": 0.25256, + "15": 0.25733, + "16": 0.25987, + "17": 0.25778, + "18": 0.25053, + "19": 0.25288, + "20": 0.258, + "21": 0.25606, + "22": 0.25231, + "23": 0.25223, + "24": 0.26464, + "25": 0.26469, + "26": 0.25015, + "27": 0.25378, + "28": 0.25459, + "29": 0.26134, + "30": 0.26129, + "31": 0.2595, + "32": 0.26444, + "33": 0.25568, + "34": 0.25514, + "35": 0.25087, + "36": 0.25275, + "37": 0.25383, + "38": 0.24953, + "39": 0.24996, + "40": 0.25393, + "41": 0.25556, + "42": 0.25158, + "43": 0.25124, + "44": 0.25, + "45": 0.25586, + "46": 0.26057, + "47": 0.25868, + "48": 0.26304, + "49": 0.2615, + "50": 0.26261 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..2685ca10966 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81184, + "2": 10.80964, + "3": 10.8261, + "4": 10.83055, + "5": 10.85464, + "6": 10.84052, + "7": 10.83581, + "8": 10.80288, + "9": 10.87748, + "10": 10.88256, + "11": 10.87624, + "12": 10.82598, + "13": 10.84134, + "14": 10.81521, + "15": 10.80679, + "16": 10.79904, + "17": 10.76842, + "18": 10.77939, + "19": 10.75192, + "20": 10.63196, + "21": 10.68212, + "22": 10.63985, + "23": 10.75592, + "24": 10.60961, + "25": 10.47374, + "26": 10.59698, + "27": 10.54094, + "28": 10.44971, + "29": 10.39259, + "30": 10.39285, + "31": 10.49257, + "32": 10.31859, + "33": 10.27757, + "34": 10.44435, + "35": 9.96791, + "36": 10.11232, + "37": 10.02385, + "38": 10.37514, + "39": 9.78682, + "40": 10.1, + "41": 10.12396, + "42": 10.03, + "43": 10.19936, + "44": 10.0547, + "45": 9.68344, + "46": 9.98163, + "47": 9.92505, + "48": 9.6694, + "49": 9.91809, + "50": 9.92465 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1125.0, + "2": 1177.0, + "3": 1265.0, + "4": 1241.0, + "5": 1255.0, + "6": 1304.0, + "7": 1204.0, + "8": 998.0, + "9": 1236.0, + "10": 1367.0, + "11": 1252.0, + "12": 1281.0, + "13": 1254.0, + "14": 1148.0, + "15": 1127.0, + "16": 1102.0, + "17": 1193.0, + "18": 1248.0, + "19": 1072.0, + "20": 1082.0, + "21": 1201.0, + "22": 1302.0, + "23": 1336.0, + "24": 1317.0, + "25": 1114.0, + "26": 1200.0, + "27": 1255.0, + "28": 1323.0, + "29": 1288.0, + "30": 1558.0, + "31": 1489.0, + "32": 1390.0, + "33": 1413.0, + "34": 1518.0, + "35": 1292.0, + "36": 1395.0, + "37": 1487.0, + "38": 1573.0, + "39": 1376.0, + "40": 1433.0, + "41": 1677.0, + "42": 1728.0, + "43": 1669.0, + "44": 1607.0, + "45": 1564.0, + "46": 1874.0, + "47": 1660.0, + "48": 1554.0, + "49": 1781.0, + "50": 1749.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4311542272.0, + "2": 4593253888.0, + "3": 4593253888.0, + "4": 4593253888.0, + "5": 4593253888.0, + "6": 4593253888.0, + "7": 4593253888.0, + "8": 4593253888.0, + "9": 4593253888.0, + "10": 4593253888.0, + "11": 4593253888.0, + "12": 4593253888.0, + "13": 4593253888.0, + "14": 4593253888.0, + "15": 4593253888.0, + "16": 4593253888.0, + "17": 4593253888.0, + "18": 4593253888.0, + "19": 4593253888.0, + "20": 4593253888.0, + "21": 4593253888.0, + "22": 4593253888.0, + "23": 4593253888.0, + "24": 4593253888.0, + "25": 4593253888.0, + "26": 4593253888.0, + "27": 4593253888.0, + "28": 4593253888.0, + "29": 4593253888.0, + "30": 4593253888.0, + "31": 4593253888.0, + "32": 4593253888.0, + "33": 4593253888.0, + "34": 4593253888.0, + "35": 4593253888.0, + "36": 4593253888.0, + "37": 4593253888.0, + "38": 4593253888.0, + "39": 4593253888.0, + "40": 4593253888.0, + "41": 4593253888.0, + "42": 4593253888.0, + "43": 4593253888.0, + "44": 4593253888.0, + "45": 4593253888.0, + "46": 4593253888.0, + "47": 4593253888.0, + "48": 4593253888.0, + "49": 4593253888.0, + "50": 4593253888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 23.63558, + "2": 0.38944, + "3": 0.36089, + "4": 0.36151, + "5": 0.5961, + "6": 0.35637, + "7": 0.35787, + "8": 0.35755, + "9": 0.35356, + "10": 0.35923, + "11": 0.35827, + "12": 0.35689, + "13": 0.97539, + "14": 0.35703, + "15": 0.35633, + "16": 0.35889, + "17": 0.35586, + "18": 0.35688, + "19": 0.35645, + "20": 0.35976, + "21": 0.35733, + "22": 0.35708, + "23": 0.35968, + "24": 0.35728, + "25": 0.35727, + "26": 0.35822, + "27": 0.35734, + "28": 0.35672, + "29": 0.35566, + "30": 0.35576, + "31": 0.35716, + "32": 0.35824, + "33": 0.35667, + "34": 0.35897, + "35": 0.35713, + "36": 0.35482, + "37": 0.35925, + "38": 0.35547, + "39": 0.35781, + "40": 0.35516, + "41": 0.35633, + "42": 0.35674, + "43": 0.35645, + "44": 0.35797, + "45": 0.35717, + "46": 0.35635, + "47": 0.35374, + "48": 0.35743, + "49": 0.35664, + "50": 0.35474 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..516c7e99194 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81184, + "2": 10.80964, + "3": 10.8261, + "4": 10.83055, + "5": 10.85464, + "6": 10.84052, + "7": 10.83581, + "8": 10.80288, + "9": 10.87748, + "10": 10.88256, + "11": 10.87624, + "12": 10.82598, + "13": 10.84134, + "14": 10.81521, + "15": 10.80679, + "16": 10.79904, + "17": 10.76842, + "18": 10.77939, + "19": 10.75192, + "20": 10.63196, + "21": 10.68212, + "22": 10.63985, + "23": 10.75592, + "24": 10.60961, + "25": 10.47374, + "26": 10.59698, + "27": 10.54094, + "28": 10.44971, + "29": 10.39259, + "30": 10.39285, + "31": 10.49257, + "32": 10.31859, + "33": 10.27757, + "34": 10.44435, + "35": 9.96791, + "36": 10.11232, + "37": 10.02385, + "38": 10.37514, + "39": 9.78682, + "40": 10.1, + "41": 10.12396, + "42": 10.03, + "43": 10.19936, + "44": 10.0547, + "45": 9.68344, + "46": 9.98163, + "47": 9.92505, + "48": 9.6694, + "49": 9.91809, + "50": 9.92465 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1125.0, + "2": 1177.0, + "3": 1265.0, + "4": 1241.0, + "5": 1255.0, + "6": 1304.0, + "7": 1204.0, + "8": 998.0, + "9": 1236.0, + "10": 1367.0, + "11": 1252.0, + "12": 1281.0, + "13": 1254.0, + "14": 1148.0, + "15": 1127.0, + "16": 1102.0, + "17": 1193.0, + "18": 1248.0, + "19": 1072.0, + "20": 1082.0, + "21": 1201.0, + "22": 1302.0, + "23": 1336.0, + "24": 1317.0, + "25": 1114.0, + "26": 1200.0, + "27": 1255.0, + "28": 1323.0, + "29": 1288.0, + "30": 1558.0, + "31": 1489.0, + "32": 1390.0, + "33": 1413.0, + "34": 1518.0, + "35": 1292.0, + "36": 1395.0, + "37": 1487.0, + "38": 1573.0, + "39": 1376.0, + "40": 1433.0, + "41": 1677.0, + "42": 1728.0, + "43": 1669.0, + "44": 1607.0, + "45": 1564.0, + "46": 1874.0, + "47": 1660.0, + "48": 1554.0, + "49": 1781.0, + "50": 1749.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 730320896.0, + "2": 730320896.0, + "3": 730320896.0, + "4": 730320896.0, + "5": 730320896.0, + "6": 730320896.0, + "7": 730320896.0, + "8": 730320896.0, + "9": 730320896.0, + "10": 730320896.0, + "11": 730320896.0, + "12": 730320896.0, + "13": 730320896.0, + "14": 730320896.0, + "15": 730320896.0, + "16": 730320896.0, + "17": 730320896.0, + "18": 730320896.0, + "19": 730320896.0, + "20": 730320896.0, + "21": 730320896.0, + "22": 730320896.0, + "23": 730320896.0, + "24": 730320896.0, + "25": 730320896.0, + "26": 730320896.0, + "27": 730320896.0, + "28": 730320896.0, + "29": 730320896.0, + "30": 730320896.0, + "31": 730320896.0, + "32": 730320896.0, + "33": 730320896.0, + "34": 730320896.0, + "35": 730320896.0, + "36": 730320896.0, + "37": 730320896.0, + "38": 730320896.0, + "39": 730320896.0, + "40": 730320896.0, + "41": 730320896.0, + "42": 730320896.0, + "43": 730320896.0, + "44": 730320896.0, + "45": 730320896.0, + "46": 730320896.0, + "47": 730320896.0, + "48": 730320896.0, + "49": 730320896.0, + "50": 730320896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4311542272.0, + "2": 4593253888.0, + "3": 4593253888.0, + "4": 4593253888.0, + "5": 4593253888.0, + "6": 4593253888.0, + "7": 4593253888.0, + "8": 4593253888.0, + "9": 4593253888.0, + "10": 4593253888.0, + "11": 4593253888.0, + "12": 4593253888.0, + "13": 4593253888.0, + "14": 4593253888.0, + "15": 4593253888.0, + "16": 4593253888.0, + "17": 4593253888.0, + "18": 4593253888.0, + "19": 4593253888.0, + "20": 4593253888.0, + "21": 4593253888.0, + "22": 4593253888.0, + "23": 4593253888.0, + "24": 4593253888.0, + "25": 4593253888.0, + "26": 4593253888.0, + "27": 4593253888.0, + "28": 4593253888.0, + "29": 4593253888.0, + "30": 4593253888.0, + "31": 4593253888.0, + "32": 4593253888.0, + "33": 4593253888.0, + "34": 4593253888.0, + "35": 4593253888.0, + "36": 4593253888.0, + "37": 4593253888.0, + "38": 4593253888.0, + "39": 4593253888.0, + "40": 4593253888.0, + "41": 4593253888.0, + "42": 4593253888.0, + "43": 4593253888.0, + "44": 4593253888.0, + "45": 4593253888.0, + "46": 4593253888.0, + "47": 4593253888.0, + "48": 4593253888.0, + "49": 4593253888.0, + "50": 4593253888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.94048, + "2": 0.39367, + "3": 0.37589, + "4": 0.37388, + "5": 0.66307, + "6": 0.36351, + "7": 0.3595, + "8": 0.36116, + "9": 0.36043, + "10": 0.35758, + "11": 0.36057, + "12": 0.35963, + "13": 0.36072, + "14": 0.35903, + "15": 0.35994, + "16": 0.35763, + "17": 0.36245, + "18": 0.35747, + "19": 0.35878, + "20": 0.35982, + "21": 0.35849, + "22": 0.35936, + "23": 0.35823, + "24": 0.35778, + "25": 0.3606, + "26": 0.35907, + "27": 0.35852, + "28": 0.35911, + "29": 0.35837, + "30": 0.35815, + "31": 0.35909, + "32": 0.35701, + "33": 0.3602, + "34": 0.35976, + "35": 0.36009, + "36": 0.35943, + "37": 0.35776, + "38": 0.35664, + "39": 0.36098, + "40": 0.35836, + "41": 0.35857, + "42": 0.35915, + "43": 0.3572, + "44": 0.35779, + "45": 0.36243, + "46": 0.35772, + "47": 0.35984, + "48": 0.35743, + "49": 0.35726, + "50": 0.35872 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index bdbd770075f..ecbd1bac9aa 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.88372, "5": 10.88547, "10": 10.86477, "15": 10.81334, "20": 10.71864, "25": 10.55396, "30": 10.36075, "35": 10.25855, "40": 10.0779, "45": 9.84493, "50": 9.89982}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22726932.0, "5": 22713776.0, "10": 22918608.0, "15": 22821768.0, "20": 22693536.0, "25": 22819092.0, "30": 22630868.0, "35": 22788568.0, "40": 22657832.0, "45": 22674860.0, "50": 22904840.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 688127488.0, "5": 688127488.0, "10": 688127488.0, "15": 688127488.0, "20": 688127488.0, "25": 688127488.0, "30": 688127488.0, "35": 688127488.0, "40": 688127488.0, "45": 688127488.0, "50": 688127488.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2159072768.0, "5": 2415565312.0, "10": 2415565312.0, "15": 2415565312.0, "20": 2415565312.0, "25": 2415565312.0, "30": 2415565312.0, "35": 2415565312.0, "40": 2415565312.0, "45": 2415565312.0, "50": 2415565312.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.21878, "5": 0.09761, "10": 0.10322, "15": 0.09934, "20": 0.09992, "25": 0.10002, "30": 0.09769, "35": 0.09817, "40": 0.09665, "45": 0.09737, "50": 0.09814}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.88372, + "2": 10.87208, + "3": 10.8784, + "4": 10.85806, + "5": 10.88547, + "6": 10.89556, + "7": 10.88051, + "8": 10.87687, + "9": 10.868, + "10": 10.86477, + "11": 10.87779, + "12": 10.8736, + "13": 10.8617, + "14": 10.88756, + "15": 10.81334, + "16": 10.8276, + "17": 10.80766, + "18": 10.81067, + "19": 10.81127, + "20": 10.71864, + "21": 10.69427, + "22": 10.58083, + "23": 10.69548, + "24": 10.60367, + "25": 10.55396, + "26": 10.61304, + "27": 10.59026, + "28": 10.54029, + "29": 10.55687, + "30": 10.36075, + "31": 10.13943, + "32": 10.44344, + "33": 10.44459, + "34": 10.21087, + "35": 10.25855, + "36": 10.22779, + "37": 10.32843, + "38": 10.18154, + "39": 10.37655, + "40": 10.0779, + "41": 10.12618, + "42": 10.19378, + "43": 9.85406, + "44": 9.94224, + "45": 9.84493, + "46": 9.831, + "47": 10.13553, + "48": 9.84455, + "49": 9.5571, + "50": 9.89982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22726932.0, + "2": 22924916.0, + "3": 22597332.0, + "4": 23219544.0, + "5": 22713776.0, + "6": 23021572.0, + "7": 22771346.0, + "8": 22926354.0, + "9": 22842338.0, + "10": 22918608.0, + "11": 22500808.0, + "12": 22460148.0, + "13": 22917564.0, + "14": 22389452.0, + "15": 22821768.0, + "16": 22831588.0, + "17": 22819586.0, + "18": 22582872.0, + "19": 22618426.0, + "20": 22693536.0, + "21": 22739728.0, + "22": 22800622.0, + "23": 22539616.0, + "24": 22771504.0, + "25": 22819092.0, + "26": 22547456.0, + "27": 22468726.0, + "28": 22453546.0, + "29": 22529680.0, + "30": 22630868.0, + "31": 22955432.0, + "32": 22585376.0, + "33": 22557692.0, + "34": 22835582.0, + "35": 22788568.0, + "36": 22588652.0, + "37": 22497950.0, + "38": 22895768.0, + "39": 22801524.0, + "40": 22657832.0, + "41": 22659668.0, + "42": 22667616.0, + "43": 22975828.0, + "44": 22746024.0, + "45": 22674860.0, + "46": 22884404.0, + "47": 22633804.0, + "48": 22928614.0, + "49": 22728000.0, + "50": 22904840.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 689176064.0, + "2": 689176064.0, + "3": 689176064.0, + "4": 689176064.0, + "5": 689176064.0, + "6": 689176064.0, + "7": 689176064.0, + "8": 689176064.0, + "9": 689176064.0, + "10": 689176064.0, + "11": 689176064.0, + "12": 689176064.0, + "13": 689176064.0, + "14": 689176064.0, + "15": 689176064.0, + "16": 689176064.0, + "17": 689176064.0, + "18": 689176064.0, + "19": 689176064.0, + "20": 689176064.0, + "21": 689176064.0, + "22": 689176064.0, + "23": 689176064.0, + "24": 689176064.0, + "25": 689176064.0, + "26": 689176064.0, + "27": 689176064.0, + "28": 689176064.0, + "29": 689176064.0, + "30": 689176064.0, + "31": 689176064.0, + "32": 689176064.0, + "33": 689176064.0, + "34": 689176064.0, + "35": 689176064.0, + "36": 689176064.0, + "37": 689176064.0, + "38": 689176064.0, + "39": 689176064.0, + "40": 689176064.0, + "41": 689176064.0, + "42": 689176064.0, + "43": 689176064.0, + "44": 689176064.0, + "45": 689176064.0, + "46": 689176064.0, + "47": 689176064.0, + "48": 689176064.0, + "49": 689176064.0, + "50": 689176064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2158024192.0, + "2": 2416613888.0, + "3": 2416613888.0, + "4": 2416613888.0, + "5": 2416613888.0, + "6": 2416613888.0, + "7": 2416613888.0, + "8": 2416613888.0, + "9": 2416613888.0, + "10": 2416613888.0, + "11": 2416613888.0, + "12": 2416613888.0, + "13": 2416613888.0, + "14": 2416613888.0, + "15": 2416613888.0, + "16": 2416613888.0, + "17": 2416613888.0, + "18": 2416613888.0, + "19": 2416613888.0, + "20": 2416613888.0, + "21": 2416613888.0, + "22": 2416613888.0, + "23": 2416613888.0, + "24": 2416613888.0, + "25": 2416613888.0, + "26": 2416613888.0, + "27": 2416613888.0, + "28": 2416613888.0, + "29": 2416613888.0, + "30": 2416613888.0, + "31": 2416613888.0, + "32": 2416613888.0, + "33": 2416613888.0, + "34": 2416613888.0, + "35": 2416613888.0, + "36": 2416613888.0, + "37": 2416613888.0, + "38": 2416613888.0, + "39": 2416613888.0, + "40": 2416613888.0, + "41": 2416613888.0, + "42": 2416613888.0, + "43": 2416613888.0, + "44": 2416613888.0, + "45": 2416613888.0, + "46": 2416613888.0, + "47": 2416613888.0, + "48": 2416613888.0, + "49": 2416613888.0, + "50": 2416613888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.59299, + "2": 0.13612, + "3": 0.11964, + "4": 0.11995, + "5": 0.12152, + "6": 0.121, + "7": 0.1191, + "8": 0.11751, + "9": 0.11711, + "10": 0.11878, + "11": 0.12221, + "12": 0.11956, + "13": 0.11737, + "14": 0.11954, + "15": 0.11916, + "16": 0.12038, + "17": 0.11939, + "18": 0.11747, + "19": 0.11879, + "20": 0.11955, + "21": 0.12128, + "22": 0.11892, + "23": 0.12306, + "24": 0.11834, + "25": 0.11924, + "26": 0.11961, + "27": 0.11912, + "28": 0.11913, + "29": 0.11896, + "30": 0.11897, + "31": 0.12121, + "32": 0.1215, + "33": 0.11867, + "34": 0.11783, + "35": 0.11835, + "36": 0.12172, + "37": 0.11939, + "38": 0.11963, + "39": 0.11846, + "40": 0.11889, + "41": 0.11897, + "42": 0.11775, + "43": 0.12004, + "44": 0.1201, + "45": 0.11742, + "46": 0.1204, + "47": 0.11915, + "48": 0.1208, + "49": 0.11898, + "50": 0.1165 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..19e0972675c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.88372, + "2": 10.87208, + "3": 10.8784, + "4": 10.85806, + "5": 10.88547, + "6": 10.89556, + "7": 10.88051, + "8": 10.87687, + "9": 10.868, + "10": 10.86477, + "11": 10.87779, + "12": 10.8736, + "13": 10.8617, + "14": 10.88756, + "15": 10.81334, + "16": 10.8276, + "17": 10.80766, + "18": 10.81067, + "19": 10.81127, + "20": 10.71864, + "21": 10.69427, + "22": 10.58083, + "23": 10.69548, + "24": 10.60367, + "25": 10.55396, + "26": 10.61304, + "27": 10.59026, + "28": 10.54029, + "29": 10.55687, + "30": 10.36075, + "31": 10.13943, + "32": 10.44344, + "33": 10.44459, + "34": 10.21087, + "35": 10.25855, + "36": 10.22779, + "37": 10.32843, + "38": 10.18154, + "39": 10.37655, + "40": 10.0779, + "41": 10.12618, + "42": 10.19378, + "43": 9.85406, + "44": 9.94224, + "45": 9.84493, + "46": 9.831, + "47": 10.13553, + "48": 9.84455, + "49": 9.5571, + "50": 9.89982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22726932.0, + "2": 22924916.0, + "3": 22597332.0, + "4": 23219544.0, + "5": 22713776.0, + "6": 23021572.0, + "7": 22771346.0, + "8": 22926354.0, + "9": 22842338.0, + "10": 22918608.0, + "11": 22500808.0, + "12": 22460148.0, + "13": 22917564.0, + "14": 22389452.0, + "15": 22821768.0, + "16": 22831588.0, + "17": 22819586.0, + "18": 22582872.0, + "19": 22618426.0, + "20": 22693536.0, + "21": 22739728.0, + "22": 22800622.0, + "23": 22539616.0, + "24": 22771504.0, + "25": 22819092.0, + "26": 22547456.0, + "27": 22468726.0, + "28": 22453546.0, + "29": 22529680.0, + "30": 22630868.0, + "31": 22955432.0, + "32": 22585376.0, + "33": 22557692.0, + "34": 22835582.0, + "35": 22788568.0, + "36": 22588652.0, + "37": 22497950.0, + "38": 22895768.0, + "39": 22801524.0, + "40": 22657832.0, + "41": 22659668.0, + "42": 22667616.0, + "43": 22975828.0, + "44": 22746024.0, + "45": 22674860.0, + "46": 22884404.0, + "47": 22633804.0, + "48": 22928614.0, + "49": 22728000.0, + "50": 22904840.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 689176064.0, + "2": 689176064.0, + "3": 689176064.0, + "4": 689176064.0, + "5": 689176064.0, + "6": 689176064.0, + "7": 689176064.0, + "8": 689176064.0, + "9": 689176064.0, + "10": 689176064.0, + "11": 689176064.0, + "12": 689176064.0, + "13": 689176064.0, + "14": 689176064.0, + "15": 689176064.0, + "16": 689176064.0, + "17": 689176064.0, + "18": 689176064.0, + "19": 689176064.0, + "20": 689176064.0, + "21": 689176064.0, + "22": 689176064.0, + "23": 689176064.0, + "24": 689176064.0, + "25": 689176064.0, + "26": 689176064.0, + "27": 689176064.0, + "28": 689176064.0, + "29": 689176064.0, + "30": 689176064.0, + "31": 689176064.0, + "32": 689176064.0, + "33": 689176064.0, + "34": 689176064.0, + "35": 689176064.0, + "36": 689176064.0, + "37": 689176064.0, + "38": 689176064.0, + "39": 689176064.0, + "40": 689176064.0, + "41": 689176064.0, + "42": 689176064.0, + "43": 689176064.0, + "44": 689176064.0, + "45": 689176064.0, + "46": 689176064.0, + "47": 689176064.0, + "48": 689176064.0, + "49": 689176064.0, + "50": 689176064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2158024192.0, + "2": 2416613888.0, + "3": 2416613888.0, + "4": 2416613888.0, + "5": 2416613888.0, + "6": 2416613888.0, + "7": 2416613888.0, + "8": 2416613888.0, + "9": 2416613888.0, + "10": 2416613888.0, + "11": 2416613888.0, + "12": 2416613888.0, + "13": 2416613888.0, + "14": 2416613888.0, + "15": 2416613888.0, + "16": 2416613888.0, + "17": 2416613888.0, + "18": 2416613888.0, + "19": 2416613888.0, + "20": 2416613888.0, + "21": 2416613888.0, + "22": 2416613888.0, + "23": 2416613888.0, + "24": 2416613888.0, + "25": 2416613888.0, + "26": 2416613888.0, + "27": 2416613888.0, + "28": 2416613888.0, + "29": 2416613888.0, + "30": 2416613888.0, + "31": 2416613888.0, + "32": 2416613888.0, + "33": 2416613888.0, + "34": 2416613888.0, + "35": 2416613888.0, + "36": 2416613888.0, + "37": 2416613888.0, + "38": 2416613888.0, + "39": 2416613888.0, + "40": 2416613888.0, + "41": 2416613888.0, + "42": 2416613888.0, + "43": 2416613888.0, + "44": 2416613888.0, + "45": 2416613888.0, + "46": 2416613888.0, + "47": 2416613888.0, + "48": 2416613888.0, + "49": 2416613888.0, + "50": 2416613888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.71503, + "2": 0.1487, + "3": 1.53681, + "4": 2.08776, + "5": 2.61238, + "6": 1.60198, + "7": 0.87803, + "8": 0.10645, + "9": 1.03031, + "10": 0.10629, + "11": 0.2821, + "12": 0.10863, + "13": 0.10328, + "14": 0.10854, + "15": 0.10326, + "16": 0.10341, + "17": 0.10778, + "18": 0.11121, + "19": 0.10959, + "20": 0.10422, + "21": 0.10422, + "22": 0.1042, + "23": 0.10422, + "24": 0.10385, + "25": 0.10416, + "26": 0.1052, + "27": 0.10423, + "28": 0.10355, + "29": 0.10327, + "30": 0.10455, + "31": 0.10463, + "32": 0.1045, + "33": 0.10325, + "34": 0.10331, + "35": 0.10475, + "36": 0.10327, + "37": 0.10355, + "38": 0.10433, + "39": 0.10353, + "40": 0.10394, + "41": 0.10379, + "42": 0.10774, + "43": 0.10625, + "44": 0.10346, + "45": 0.10532, + "46": 0.10766, + "47": 0.10537, + "48": 0.10462, + "49": 0.1051, + "50": 0.1039 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..ea2bd7effce --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.88372, + "2": 10.87208, + "3": 10.8784, + "4": 10.85806, + "5": 10.88547, + "6": 10.89556, + "7": 10.88051, + "8": 10.87687, + "9": 10.868, + "10": 10.86477, + "11": 10.87779, + "12": 10.8736, + "13": 10.8617, + "14": 10.88756, + "15": 10.81334, + "16": 10.8276, + "17": 10.80766, + "18": 10.81067, + "19": 10.81127, + "20": 10.71864, + "21": 10.69427, + "22": 10.58083, + "23": 10.69548, + "24": 10.60367, + "25": 10.55396, + "26": 10.61304, + "27": 10.59026, + "28": 10.54029, + "29": 10.55687, + "30": 10.36075, + "31": 10.13943, + "32": 10.44344, + "33": 10.44459, + "34": 10.21087, + "35": 10.25855, + "36": 10.22779, + "37": 10.32843, + "38": 10.18154, + "39": 10.37655, + "40": 10.0779, + "41": 10.12618, + "42": 10.19378, + "43": 9.85406, + "44": 9.94224, + "45": 9.84493, + "46": 9.831, + "47": 10.13553, + "48": 9.84455, + "49": 9.5571, + "50": 9.89982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22726932.0, + "2": 22924916.0, + "3": 22597332.0, + "4": 23219544.0, + "5": 22713776.0, + "6": 23021572.0, + "7": 22771346.0, + "8": 22926354.0, + "9": 22842338.0, + "10": 22918608.0, + "11": 22500808.0, + "12": 22460148.0, + "13": 22917564.0, + "14": 22389452.0, + "15": 22821768.0, + "16": 22831588.0, + "17": 22819586.0, + "18": 22582872.0, + "19": 22618426.0, + "20": 22693536.0, + "21": 22739728.0, + "22": 22800622.0, + "23": 22539616.0, + "24": 22771504.0, + "25": 22819092.0, + "26": 22547456.0, + "27": 22468726.0, + "28": 22453546.0, + "29": 22529680.0, + "30": 22630868.0, + "31": 22955432.0, + "32": 22585376.0, + "33": 22557692.0, + "34": 22835582.0, + "35": 22788568.0, + "36": 22588652.0, + "37": 22497950.0, + "38": 22895768.0, + "39": 22801524.0, + "40": 22657832.0, + "41": 22659668.0, + "42": 22667616.0, + "43": 22975828.0, + "44": 22746024.0, + "45": 22674860.0, + "46": 22884404.0, + "47": 22633804.0, + "48": 22928614.0, + "49": 22728000.0, + "50": 22904840.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 689176064.0, + "2": 689176064.0, + "3": 689176064.0, + "4": 689176064.0, + "5": 689176064.0, + "6": 689176064.0, + "7": 689176064.0, + "8": 689176064.0, + "9": 689176064.0, + "10": 689176064.0, + "11": 689176064.0, + "12": 689176064.0, + "13": 689176064.0, + "14": 689176064.0, + "15": 689176064.0, + "16": 689176064.0, + "17": 689176064.0, + "18": 689176064.0, + "19": 689176064.0, + "20": 689176064.0, + "21": 689176064.0, + "22": 689176064.0, + "23": 689176064.0, + "24": 689176064.0, + "25": 689176064.0, + "26": 689176064.0, + "27": 689176064.0, + "28": 689176064.0, + "29": 689176064.0, + "30": 689176064.0, + "31": 689176064.0, + "32": 689176064.0, + "33": 689176064.0, + "34": 689176064.0, + "35": 689176064.0, + "36": 689176064.0, + "37": 689176064.0, + "38": 689176064.0, + "39": 689176064.0, + "40": 689176064.0, + "41": 689176064.0, + "42": 689176064.0, + "43": 689176064.0, + "44": 689176064.0, + "45": 689176064.0, + "46": 689176064.0, + "47": 689176064.0, + "48": 689176064.0, + "49": 689176064.0, + "50": 689176064.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2158024192.0, + "2": 2416613888.0, + "3": 2416613888.0, + "4": 2416613888.0, + "5": 2416613888.0, + "6": 2416613888.0, + "7": 2416613888.0, + "8": 2416613888.0, + "9": 2416613888.0, + "10": 2416613888.0, + "11": 2416613888.0, + "12": 2416613888.0, + "13": 2416613888.0, + "14": 2416613888.0, + "15": 2416613888.0, + "16": 2416613888.0, + "17": 2416613888.0, + "18": 2416613888.0, + "19": 2416613888.0, + "20": 2416613888.0, + "21": 2416613888.0, + "22": 2416613888.0, + "23": 2416613888.0, + "24": 2416613888.0, + "25": 2416613888.0, + "26": 2416613888.0, + "27": 2416613888.0, + "28": 2416613888.0, + "29": 2416613888.0, + "30": 2416613888.0, + "31": 2416613888.0, + "32": 2416613888.0, + "33": 2416613888.0, + "34": 2416613888.0, + "35": 2416613888.0, + "36": 2416613888.0, + "37": 2416613888.0, + "38": 2416613888.0, + "39": 2416613888.0, + "40": 2416613888.0, + "41": 2416613888.0, + "42": 2416613888.0, + "43": 2416613888.0, + "44": 2416613888.0, + "45": 2416613888.0, + "46": 2416613888.0, + "47": 2416613888.0, + "48": 2416613888.0, + "49": 2416613888.0, + "50": 2416613888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.4694, + "2": 0.13977, + "3": 0.12731, + "4": 0.12879, + "5": 0.11865, + "6": 0.118, + "7": 0.11942, + "8": 0.11938, + "9": 0.11951, + "10": 0.11735, + "11": 0.11836, + "12": 0.11978, + "13": 0.11914, + "14": 0.11821, + "15": 0.11692, + "16": 0.11708, + "17": 0.11825, + "18": 0.11909, + "19": 0.11996, + "20": 0.11962, + "21": 0.12002, + "22": 0.11972, + "23": 0.11943, + "24": 0.11873, + "25": 0.11787, + "26": 0.1172, + "27": 0.11703, + "28": 0.12106, + "29": 0.11863, + "30": 0.11927, + "31": 0.11941, + "32": 0.11801, + "33": 0.11903, + "34": 0.1181, + "35": 0.11794, + "36": 0.11973, + "37": 0.11831, + "38": 0.11753, + "39": 0.11901, + "40": 0.11713, + "41": 0.11926, + "42": 0.11756, + "43": 0.1189, + "44": 0.11853, + "45": 0.12132, + "46": 0.11905, + "47": 0.11892, + "48": 0.11664, + "49": 0.11721, + "50": 0.11854 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..0f1e0462ded --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.9735, + "2": 10.96394, + "3": 10.96467, + "4": 10.96021, + "5": 10.95594, + "6": 10.96043, + "7": 10.95626, + "8": 10.96144, + "9": 10.965, + "10": 10.94989, + "11": 10.95576, + "12": 10.947, + "13": 10.94636, + "14": 10.95394, + "15": 10.9115, + "16": 10.91038, + "17": 10.88885, + "18": 10.89782, + "19": 10.89048, + "20": 10.80975, + "21": 10.78792, + "22": 10.69838, + "23": 10.79225, + "24": 10.69861, + "25": 10.6662, + "26": 10.71196, + "27": 10.68312, + "28": 10.62307, + "29": 10.65054, + "30": 10.45501, + "31": 10.22425, + "32": 10.52333, + "33": 10.52504, + "34": 10.29088, + "35": 10.33418, + "36": 10.28927, + "37": 10.39816, + "38": 10.25546, + "39": 10.44879, + "40": 10.14646, + "41": 10.19054, + "42": 10.24672, + "43": 9.89533, + "44": 10.00885, + "45": 9.89112, + "46": 9.86375, + "47": 10.165, + "48": 9.87995, + "49": 9.5695, + "50": 9.9526 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22727052.0, + "2": 22925412.0, + "3": 22596906.0, + "4": 23219222.0, + "5": 22714228.0, + "6": 23021930.0, + "7": 22770230.0, + "8": 22926370.0, + "9": 22841956.0, + "10": 22918376.0, + "11": 22501022.0, + "12": 22459784.0, + "13": 22916644.0, + "14": 22389748.0, + "15": 22820932.0, + "16": 22831208.0, + "17": 22819716.0, + "18": 22582820.0, + "19": 22618452.0, + "20": 22694228.0, + "21": 22740076.0, + "22": 22799292.0, + "23": 22539898.0, + "24": 22771252.0, + "25": 22819528.0, + "26": 22547832.0, + "27": 22468264.0, + "28": 22453304.0, + "29": 22529758.0, + "30": 22631178.0, + "31": 22955168.0, + "32": 22584982.0, + "33": 22558648.0, + "34": 22835982.0, + "35": 22787526.0, + "36": 22589358.0, + "37": 22496568.0, + "38": 22896700.0, + "39": 22801666.0, + "40": 22657932.0, + "41": 22658800.0, + "42": 22666830.0, + "43": 22975584.0, + "44": 22746628.0, + "45": 22674550.0, + "46": 22885018.0, + "47": 22633780.0, + "48": 22929278.0, + "49": 22728106.0, + "50": 22905400.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 657718272.0, + "2": 657718272.0, + "3": 657718272.0, + "4": 657718272.0, + "5": 657718272.0, + "6": 657718272.0, + "7": 657718272.0, + "8": 657718272.0, + "9": 657718272.0, + "10": 657718272.0, + "11": 657718272.0, + "12": 657718272.0, + "13": 657718272.0, + "14": 657718272.0, + "15": 657718272.0, + "16": 657718272.0, + "17": 657718272.0, + "18": 657718272.0, + "19": 657718272.0, + "20": 657718272.0, + "21": 657718272.0, + "22": 657718272.0, + "23": 657718272.0, + "24": 657718272.0, + "25": 657718272.0, + "26": 657718272.0, + "27": 657718272.0, + "28": 657718272.0, + "29": 657718272.0, + "30": 657718272.0, + "31": 657718272.0, + "32": 657718272.0, + "33": 657718272.0, + "34": 657718272.0, + "35": 657718272.0, + "36": 657718272.0, + "37": 657718272.0, + "38": 657718272.0, + "39": 657718272.0, + "40": 657718272.0, + "41": 657718272.0, + "42": 657718272.0, + "43": 657718272.0, + "44": 657718272.0, + "45": 657718272.0, + "46": 657718272.0, + "47": 657718272.0, + "48": 657718272.0, + "49": 657718272.0, + "50": 657718272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2129712128.0, + "2": 2385156096.0, + "3": 2385156096.0, + "4": 2385156096.0, + "5": 2385156096.0, + "6": 2385156096.0, + "7": 2385156096.0, + "8": 2385156096.0, + "9": 2385156096.0, + "10": 2385156096.0, + "11": 2385156096.0, + "12": 2385156096.0, + "13": 2385156096.0, + "14": 2385156096.0, + "15": 2385156096.0, + "16": 2385156096.0, + "17": 2385156096.0, + "18": 2385156096.0, + "19": 2385156096.0, + "20": 2385156096.0, + "21": 2385156096.0, + "22": 2385156096.0, + "23": 2385156096.0, + "24": 2385156096.0, + "25": 2385156096.0, + "26": 2385156096.0, + "27": 2385156096.0, + "28": 2385156096.0, + "29": 2385156096.0, + "30": 2385156096.0, + "31": 2385156096.0, + "32": 2385156096.0, + "33": 2385156096.0, + "34": 2385156096.0, + "35": 2385156096.0, + "36": 2385156096.0, + "37": 2385156096.0, + "38": 2385156096.0, + "39": 2385156096.0, + "40": 2385156096.0, + "41": 2385156096.0, + "42": 2385156096.0, + "43": 2385156096.0, + "44": 2385156096.0, + "45": 2385156096.0, + "46": 2385156096.0, + "47": 2385156096.0, + "48": 2385156096.0, + "49": 2385156096.0, + "50": 2385156096.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.63368, + "2": 0.20019, + "3": 0.17416, + "4": 0.17243, + "5": 0.17154, + "6": 0.17102, + "7": 0.17145, + "8": 0.17064, + "9": 0.17149, + "10": 0.17097, + "11": 0.1712, + "12": 0.17013, + "13": 0.17029, + "14": 0.17017, + "15": 0.4213, + "16": 0.44794, + "17": 0.16976, + "18": 0.16874, + "19": 0.16893, + "20": 0.16955, + "21": 0.16934, + "22": 0.16862, + "23": 0.16838, + "24": 0.16917, + "25": 0.16984, + "26": 0.16954, + "27": 0.16772, + "28": 0.16867, + "29": 0.16821, + "30": 0.16849, + "31": 0.1682, + "32": 0.16841, + "33": 0.16791, + "34": 0.16857, + "35": 0.16849, + "36": 0.16691, + "37": 0.16837, + "38": 0.16784, + "39": 0.1683, + "40": 0.16832, + "41": 0.16851, + "42": 0.16835, + "43": 0.16781, + "44": 0.16765, + "45": 0.16745, + "46": 0.1685, + "47": 0.168, + "48": 0.16906, + "49": 0.16772, + "50": 0.16771 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..5b8869bf6ef --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.9735, + "2": 10.96394, + "3": 10.96467, + "4": 10.96021, + "5": 10.95594, + "6": 10.96043, + "7": 10.95626, + "8": 10.96144, + "9": 10.965, + "10": 10.94989, + "11": 10.95576, + "12": 10.947, + "13": 10.94636, + "14": 10.95394, + "15": 10.9115, + "16": 10.91038, + "17": 10.88885, + "18": 10.89782, + "19": 10.89048, + "20": 10.80975, + "21": 10.78792, + "22": 10.69838, + "23": 10.79225, + "24": 10.69861, + "25": 10.6662, + "26": 10.71196, + "27": 10.68312, + "28": 10.62307, + "29": 10.65054, + "30": 10.45501, + "31": 10.22425, + "32": 10.52333, + "33": 10.52504, + "34": 10.29088, + "35": 10.33418, + "36": 10.28927, + "37": 10.39816, + "38": 10.25546, + "39": 10.44879, + "40": 10.14646, + "41": 10.19054, + "42": 10.24672, + "43": 9.89533, + "44": 10.00885, + "45": 9.89112, + "46": 9.86375, + "47": 10.165, + "48": 9.87995, + "49": 9.5695, + "50": 9.9526 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22727052.0, + "2": 22925412.0, + "3": 22596906.0, + "4": 23219222.0, + "5": 22714228.0, + "6": 23021930.0, + "7": 22770230.0, + "8": 22926370.0, + "9": 22841956.0, + "10": 22918376.0, + "11": 22501022.0, + "12": 22459784.0, + "13": 22916644.0, + "14": 22389748.0, + "15": 22820932.0, + "16": 22831208.0, + "17": 22819716.0, + "18": 22582820.0, + "19": 22618452.0, + "20": 22694228.0, + "21": 22740076.0, + "22": 22799292.0, + "23": 22539898.0, + "24": 22771252.0, + "25": 22819528.0, + "26": 22547832.0, + "27": 22468264.0, + "28": 22453304.0, + "29": 22529758.0, + "30": 22631178.0, + "31": 22955168.0, + "32": 22584982.0, + "33": 22558648.0, + "34": 22835982.0, + "35": 22787526.0, + "36": 22589358.0, + "37": 22496568.0, + "38": 22896700.0, + "39": 22801666.0, + "40": 22657932.0, + "41": 22658800.0, + "42": 22666830.0, + "43": 22975584.0, + "44": 22746628.0, + "45": 22674550.0, + "46": 22885018.0, + "47": 22633780.0, + "48": 22929278.0, + "49": 22728106.0, + "50": 22905400.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 657718272.0, + "2": 657718272.0, + "3": 657718272.0, + "4": 657718272.0, + "5": 657718272.0, + "6": 657718272.0, + "7": 657718272.0, + "8": 657718272.0, + "9": 657718272.0, + "10": 657718272.0, + "11": 657718272.0, + "12": 657718272.0, + "13": 657718272.0, + "14": 657718272.0, + "15": 657718272.0, + "16": 657718272.0, + "17": 657718272.0, + "18": 657718272.0, + "19": 657718272.0, + "20": 657718272.0, + "21": 657718272.0, + "22": 657718272.0, + "23": 657718272.0, + "24": 657718272.0, + "25": 657718272.0, + "26": 657718272.0, + "27": 657718272.0, + "28": 657718272.0, + "29": 657718272.0, + "30": 657718272.0, + "31": 657718272.0, + "32": 657718272.0, + "33": 657718272.0, + "34": 657718272.0, + "35": 657718272.0, + "36": 657718272.0, + "37": 657718272.0, + "38": 657718272.0, + "39": 657718272.0, + "40": 657718272.0, + "41": 657718272.0, + "42": 657718272.0, + "43": 657718272.0, + "44": 657718272.0, + "45": 657718272.0, + "46": 657718272.0, + "47": 657718272.0, + "48": 657718272.0, + "49": 657718272.0, + "50": 657718272.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2129712128.0, + "2": 2385156096.0, + "3": 2385156096.0, + "4": 2385156096.0, + "5": 2385156096.0, + "6": 2385156096.0, + "7": 2385156096.0, + "8": 2385156096.0, + "9": 2385156096.0, + "10": 2385156096.0, + "11": 2385156096.0, + "12": 2385156096.0, + "13": 2385156096.0, + "14": 2385156096.0, + "15": 2385156096.0, + "16": 2385156096.0, + "17": 2385156096.0, + "18": 2385156096.0, + "19": 2385156096.0, + "20": 2385156096.0, + "21": 2385156096.0, + "22": 2385156096.0, + "23": 2385156096.0, + "24": 2385156096.0, + "25": 2385156096.0, + "26": 2385156096.0, + "27": 2385156096.0, + "28": 2385156096.0, + "29": 2385156096.0, + "30": 2385156096.0, + "31": 2385156096.0, + "32": 2385156096.0, + "33": 2385156096.0, + "34": 2385156096.0, + "35": 2385156096.0, + "36": 2385156096.0, + "37": 2385156096.0, + "38": 2385156096.0, + "39": 2385156096.0, + "40": 2385156096.0, + "41": 2385156096.0, + "42": 2385156096.0, + "43": 2385156096.0, + "44": 2385156096.0, + "45": 2385156096.0, + "46": 2385156096.0, + "47": 2385156096.0, + "48": 2385156096.0, + "49": 2385156096.0, + "50": 2385156096.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.59745, + "2": 0.20599, + "3": 0.17301, + "4": 0.16858, + "5": 0.16742, + "6": 0.16685, + "7": 0.16812, + "8": 0.16712, + "9": 0.16761, + "10": 0.17297, + "11": 0.16947, + "12": 0.16929, + "13": 0.16969, + "14": 0.17093, + "15": 0.41089, + "16": 0.16958, + "17": 0.17028, + "18": 0.16804, + "19": 0.168, + "20": 0.16883, + "21": 0.16811, + "22": 0.16849, + "23": 0.17004, + "24": 0.16922, + "25": 0.16921, + "26": 0.16876, + "27": 0.16877, + "28": 0.16916, + "29": 0.16991, + "30": 0.16846, + "31": 0.16951, + "32": 0.16845, + "33": 0.1685, + "34": 0.16865, + "35": 0.16813, + "36": 0.16739, + "37": 0.16866, + "38": 0.16859, + "39": 0.16669, + "40": 0.16917, + "41": 0.16941, + "42": 0.1688, + "43": 0.1693, + "44": 0.16931, + "45": 0.16903, + "46": 0.16894, + "47": 0.16682, + "48": 0.16811, + "49": 0.1682, + "50": 0.16932 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f770cd4d016..10eb9e57910 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.89824, "5": 10.88993, "10": 10.88255, "15": 10.86969, "20": 10.84335, "25": 10.75377, "30": 10.62875, "35": 10.56066, "40": 10.36652, "45": 10.15385, "50": 10.18997}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22727178.0, "5": 22714208.0, "10": 22918036.0, "15": 22820856.0, "20": 22693674.0, "25": 22818024.0, "30": 22630720.0, "35": 22787216.0, "40": 22657316.0, "45": 22674868.0, "50": 22903748.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 641870336.0, "5": 641870336.0, "10": 641870336.0, "15": 641870336.0, "20": 641870336.0, "25": 641870336.0, "30": 641870336.0, "35": 641870336.0, "40": 641870336.0, "45": 641870336.0, "50": 641870336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2611572224.0, "5": 2843894272.0, "10": 2843894272.0, "15": 2843894272.0, "20": 2843894272.0, "25": 2843894272.0, "30": 2843894272.0, "35": 2843894272.0, "40": 2843894272.0, "45": 2843894272.0, "50": 2843894272.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.42997, "5": 0.07593, "10": 0.06948, "15": 0.07002, "20": 0.07394, "25": 0.07013, "30": 0.07189, "35": 0.07303, "40": 0.07285, "45": 0.0679, "50": 0.069}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89824, + "2": 10.90282, + "3": 10.89982, + "4": 10.86583, + "5": 10.88993, + "6": 10.9049, + "7": 10.89182, + "8": 10.90189, + "9": 10.88632, + "10": 10.88255, + "11": 10.91544, + "12": 10.90811, + "13": 10.91696, + "14": 10.92165, + "15": 10.86969, + "16": 10.8841, + "17": 10.87056, + "18": 10.88709, + "19": 10.87706, + "20": 10.84335, + "21": 10.83631, + "22": 10.76629, + "23": 10.83029, + "24": 10.79277, + "25": 10.75377, + "26": 10.78891, + "27": 10.79166, + "28": 10.74336, + "29": 10.75965, + "30": 10.62875, + "31": 10.45418, + "32": 10.68825, + "33": 10.68615, + "34": 10.52385, + "35": 10.56066, + "36": 10.53762, + "37": 10.60286, + "38": 10.46752, + "39": 10.60804, + "40": 10.36652, + "41": 10.38788, + "42": 10.45579, + "43": 10.15865, + "44": 10.24803, + "45": 10.15385, + "46": 10.13564, + "47": 10.39205, + "48": 10.1415, + "49": 9.88025, + "50": 10.18997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22727178.0, + "2": 22924812.0, + "3": 22596704.0, + "4": 23218766.0, + "5": 22714208.0, + "6": 23020316.0, + "7": 22771086.0, + "8": 22926440.0, + "9": 22842352.0, + "10": 22918036.0, + "11": 22500516.0, + "12": 22459304.0, + "13": 22916284.0, + "14": 22387532.0, + "15": 22820856.0, + "16": 22830090.0, + "17": 22818880.0, + "18": 22582012.0, + "19": 22616784.0, + "20": 22693674.0, + "21": 22739360.0, + "22": 22799250.0, + "23": 22538774.0, + "24": 22770954.0, + "25": 22818024.0, + "26": 22547278.0, + "27": 22468476.0, + "28": 22452228.0, + "29": 22527980.0, + "30": 22630720.0, + "31": 22954516.0, + "32": 22584820.0, + "33": 22557266.0, + "34": 22834728.0, + "35": 22787216.0, + "36": 22588668.0, + "37": 22496474.0, + "38": 22895320.0, + "39": 22800062.0, + "40": 22657316.0, + "41": 22658142.0, + "42": 22666692.0, + "43": 22974950.0, + "44": 22745468.0, + "45": 22674868.0, + "46": 22883238.0, + "47": 22632908.0, + "48": 22927884.0, + "49": 22727252.0, + "50": 22903748.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 638724608.0, + "2": 638724608.0, + "3": 638724608.0, + "4": 638724608.0, + "5": 638724608.0, + "6": 638724608.0, + "7": 638724608.0, + "8": 638724608.0, + "9": 638724608.0, + "10": 638724608.0, + "11": 638724608.0, + "12": 638724608.0, + "13": 638724608.0, + "14": 638724608.0, + "15": 638724608.0, + "16": 638724608.0, + "17": 638724608.0, + "18": 638724608.0, + "19": 638724608.0, + "20": 638724608.0, + "21": 638724608.0, + "22": 638724608.0, + "23": 638724608.0, + "24": 638724608.0, + "25": 638724608.0, + "26": 638724608.0, + "27": 638724608.0, + "28": 638724608.0, + "29": 638724608.0, + "30": 638724608.0, + "31": 638724608.0, + "32": 638724608.0, + "33": 638724608.0, + "34": 638724608.0, + "35": 638724608.0, + "36": 638724608.0, + "37": 638724608.0, + "38": 638724608.0, + "39": 638724608.0, + "40": 638724608.0, + "41": 638724608.0, + "42": 638724608.0, + "43": 638724608.0, + "44": 638724608.0, + "45": 638724608.0, + "46": 638724608.0, + "47": 638724608.0, + "48": 638724608.0, + "49": 638724608.0, + "50": 638724608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2610025984.0, + "2": 2840250880.0, + "3": 2840250880.0, + "4": 2840250880.0, + "5": 2840250880.0, + "6": 2840250880.0, + "7": 2840250880.0, + "8": 2840250880.0, + "9": 2840250880.0, + "10": 2840250880.0, + "11": 2840250880.0, + "12": 2840250880.0, + "13": 2840250880.0, + "14": 2840250880.0, + "15": 2840250880.0, + "16": 2840250880.0, + "17": 2840250880.0, + "18": 2840250880.0, + "19": 2840250880.0, + "20": 2840250880.0, + "21": 2840250880.0, + "22": 2840250880.0, + "23": 2840250880.0, + "24": 2840250880.0, + "25": 2840250880.0, + "26": 2840250880.0, + "27": 2840250880.0, + "28": 2840250880.0, + "29": 2840250880.0, + "30": 2840250880.0, + "31": 2840250880.0, + "32": 2840250880.0, + "33": 2840250880.0, + "34": 2840250880.0, + "35": 2840250880.0, + "36": 2840250880.0, + "37": 2840250880.0, + "38": 2840250880.0, + "39": 2840250880.0, + "40": 2840250880.0, + "41": 2840250880.0, + "42": 2840250880.0, + "43": 2840250880.0, + "44": 2840250880.0, + "45": 2840250880.0, + "46": 2840250880.0, + "47": 2840250880.0, + "48": 2840250880.0, + "49": 2840250880.0, + "50": 2840250880.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.82473, + "2": 0.09608, + "3": 0.08117, + "4": 0.08184, + "5": 0.08242, + "6": 0.07918, + "7": 0.07939, + "8": 0.07963, + "9": 0.07945, + "10": 0.081, + "11": 0.07867, + "12": 0.07897, + "13": 0.0828, + "14": 0.08361, + "15": 0.08417, + "16": 0.08323, + "17": 0.08405, + "18": 0.08256, + "19": 0.08229, + "20": 0.0827, + "21": 0.08446, + "22": 0.08314, + "23": 0.08296, + "24": 0.08234, + "25": 0.0813, + "26": 0.08393, + "27": 0.08424, + "28": 0.08312, + "29": 0.08286, + "30": 0.08113, + "31": 0.07871, + "32": 0.08259, + "33": 0.08088, + "34": 0.07808, + "35": 0.07855, + "36": 0.07792, + "37": 0.07877, + "38": 0.07813, + "39": 0.07792, + "40": 0.07826, + "41": 0.07872, + "42": 0.07977, + "43": 0.07875, + "44": 0.07847, + "45": 0.07879, + "46": 0.07965, + "47": 0.08085, + "48": 0.07886, + "49": 0.07904, + "50": 0.07778 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f1fd0f05b76 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89824, + "2": 10.90282, + "3": 10.89982, + "4": 10.86583, + "5": 10.88993, + "6": 10.9049, + "7": 10.89182, + "8": 10.90189, + "9": 10.88632, + "10": 10.88255, + "11": 10.91544, + "12": 10.90811, + "13": 10.91696, + "14": 10.92165, + "15": 10.86969, + "16": 10.8841, + "17": 10.87056, + "18": 10.88709, + "19": 10.87706, + "20": 10.84335, + "21": 10.83631, + "22": 10.76629, + "23": 10.83029, + "24": 10.79277, + "25": 10.75377, + "26": 10.78891, + "27": 10.79166, + "28": 10.74336, + "29": 10.75965, + "30": 10.62875, + "31": 10.45418, + "32": 10.68825, + "33": 10.68615, + "34": 10.52385, + "35": 10.56066, + "36": 10.53762, + "37": 10.60286, + "38": 10.46752, + "39": 10.60804, + "40": 10.36652, + "41": 10.38788, + "42": 10.45579, + "43": 10.15865, + "44": 10.24803, + "45": 10.15385, + "46": 10.13564, + "47": 10.39205, + "48": 10.1415, + "49": 9.88025, + "50": 10.18997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22727178.0, + "2": 22924812.0, + "3": 22596704.0, + "4": 23218766.0, + "5": 22714208.0, + "6": 23020316.0, + "7": 22771086.0, + "8": 22926440.0, + "9": 22842352.0, + "10": 22918036.0, + "11": 22500516.0, + "12": 22459304.0, + "13": 22916284.0, + "14": 22387532.0, + "15": 22820856.0, + "16": 22830090.0, + "17": 22818880.0, + "18": 22582012.0, + "19": 22616784.0, + "20": 22693674.0, + "21": 22739360.0, + "22": 22799250.0, + "23": 22538774.0, + "24": 22770954.0, + "25": 22818024.0, + "26": 22547278.0, + "27": 22468476.0, + "28": 22452228.0, + "29": 22527980.0, + "30": 22630720.0, + "31": 22954516.0, + "32": 22584820.0, + "33": 22557266.0, + "34": 22834728.0, + "35": 22787216.0, + "36": 22588668.0, + "37": 22496474.0, + "38": 22895320.0, + "39": 22800062.0, + "40": 22657316.0, + "41": 22658142.0, + "42": 22666692.0, + "43": 22974950.0, + "44": 22745468.0, + "45": 22674868.0, + "46": 22883238.0, + "47": 22632908.0, + "48": 22927884.0, + "49": 22727252.0, + "50": 22903748.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 638724608.0, + "2": 638724608.0, + "3": 638724608.0, + "4": 638724608.0, + "5": 638724608.0, + "6": 638724608.0, + "7": 638724608.0, + "8": 638724608.0, + "9": 638724608.0, + "10": 638724608.0, + "11": 638724608.0, + "12": 638724608.0, + "13": 638724608.0, + "14": 638724608.0, + "15": 638724608.0, + "16": 638724608.0, + "17": 638724608.0, + "18": 638724608.0, + "19": 638724608.0, + "20": 638724608.0, + "21": 638724608.0, + "22": 638724608.0, + "23": 638724608.0, + "24": 638724608.0, + "25": 638724608.0, + "26": 638724608.0, + "27": 638724608.0, + "28": 638724608.0, + "29": 638724608.0, + "30": 638724608.0, + "31": 638724608.0, + "32": 638724608.0, + "33": 638724608.0, + "34": 638724608.0, + "35": 638724608.0, + "36": 638724608.0, + "37": 638724608.0, + "38": 638724608.0, + "39": 638724608.0, + "40": 638724608.0, + "41": 638724608.0, + "42": 638724608.0, + "43": 638724608.0, + "44": 638724608.0, + "45": 638724608.0, + "46": 638724608.0, + "47": 638724608.0, + "48": 638724608.0, + "49": 638724608.0, + "50": 638724608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2610025984.0, + "2": 2840250880.0, + "3": 2840250880.0, + "4": 2840250880.0, + "5": 2840250880.0, + "6": 2840250880.0, + "7": 2840250880.0, + "8": 2840250880.0, + "9": 2840250880.0, + "10": 2840250880.0, + "11": 2840250880.0, + "12": 2840250880.0, + "13": 2840250880.0, + "14": 2840250880.0, + "15": 2840250880.0, + "16": 2840250880.0, + "17": 2840250880.0, + "18": 2840250880.0, + "19": 2840250880.0, + "20": 2840250880.0, + "21": 2840250880.0, + "22": 2840250880.0, + "23": 2840250880.0, + "24": 2840250880.0, + "25": 2840250880.0, + "26": 2840250880.0, + "27": 2840250880.0, + "28": 2840250880.0, + "29": 2840250880.0, + "30": 2840250880.0, + "31": 2840250880.0, + "32": 2840250880.0, + "33": 2840250880.0, + "34": 2840250880.0, + "35": 2840250880.0, + "36": 2840250880.0, + "37": 2840250880.0, + "38": 2840250880.0, + "39": 2840250880.0, + "40": 2840250880.0, + "41": 2840250880.0, + "42": 2840250880.0, + "43": 2840250880.0, + "44": 2840250880.0, + "45": 2840250880.0, + "46": 2840250880.0, + "47": 2840250880.0, + "48": 2840250880.0, + "49": 2840250880.0, + "50": 2840250880.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.66119, + "2": 0.10511, + "3": 0.07267, + "4": 0.07159, + "5": 0.07147, + "6": 0.07254, + "7": 0.07213, + "8": 0.07141, + "9": 0.07159, + "10": 0.07239, + "11": 0.07155, + "12": 0.0717, + "13": 0.07155, + "14": 0.07174, + "15": 0.07179, + "16": 0.07185, + "17": 0.0714, + "18": 0.07139, + "19": 0.0717, + "20": 0.07106, + "21": 0.0716, + "22": 0.07218, + "23": 0.07161, + "24": 0.07166, + "25": 0.07144, + "26": 0.07156, + "27": 0.0718, + "28": 0.07207, + "29": 0.07096, + "30": 0.07235, + "31": 0.07223, + "32": 0.07219, + "33": 0.07195, + "34": 0.07232, + "35": 0.07433, + "36": 0.07598, + "37": 0.07242, + "38": 0.07166, + "39": 0.07174, + "40": 0.07148, + "41": 0.0722, + "42": 0.07169, + "43": 0.07213, + "44": 0.07193, + "45": 0.07163, + "46": 0.07302, + "47": 0.07199, + "48": 0.07329, + "49": 0.07491, + "50": 0.07339 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..8f65ccec75e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89824, + "2": 10.90282, + "3": 10.89982, + "4": 10.86583, + "5": 10.88993, + "6": 10.9049, + "7": 10.89182, + "8": 10.90189, + "9": 10.88632, + "10": 10.88255, + "11": 10.91544, + "12": 10.90811, + "13": 10.91696, + "14": 10.92165, + "15": 10.86969, + "16": 10.8841, + "17": 10.87056, + "18": 10.88709, + "19": 10.87706, + "20": 10.84335, + "21": 10.83631, + "22": 10.76629, + "23": 10.83029, + "24": 10.79277, + "25": 10.75377, + "26": 10.78891, + "27": 10.79166, + "28": 10.74336, + "29": 10.75965, + "30": 10.62875, + "31": 10.45418, + "32": 10.68825, + "33": 10.68615, + "34": 10.52385, + "35": 10.56066, + "36": 10.53762, + "37": 10.60286, + "38": 10.46752, + "39": 10.60804, + "40": 10.36652, + "41": 10.38788, + "42": 10.45579, + "43": 10.15865, + "44": 10.24803, + "45": 10.15385, + "46": 10.13564, + "47": 10.39205, + "48": 10.1415, + "49": 9.88025, + "50": 10.18997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22727178.0, + "2": 22924812.0, + "3": 22596704.0, + "4": 23218766.0, + "5": 22714208.0, + "6": 23020316.0, + "7": 22771086.0, + "8": 22926440.0, + "9": 22842352.0, + "10": 22918036.0, + "11": 22500516.0, + "12": 22459304.0, + "13": 22916284.0, + "14": 22387532.0, + "15": 22820856.0, + "16": 22830090.0, + "17": 22818880.0, + "18": 22582012.0, + "19": 22616784.0, + "20": 22693674.0, + "21": 22739360.0, + "22": 22799250.0, + "23": 22538774.0, + "24": 22770954.0, + "25": 22818024.0, + "26": 22547278.0, + "27": 22468476.0, + "28": 22452228.0, + "29": 22527980.0, + "30": 22630720.0, + "31": 22954516.0, + "32": 22584820.0, + "33": 22557266.0, + "34": 22834728.0, + "35": 22787216.0, + "36": 22588668.0, + "37": 22496474.0, + "38": 22895320.0, + "39": 22800062.0, + "40": 22657316.0, + "41": 22658142.0, + "42": 22666692.0, + "43": 22974950.0, + "44": 22745468.0, + "45": 22674868.0, + "46": 22883238.0, + "47": 22632908.0, + "48": 22927884.0, + "49": 22727252.0, + "50": 22903748.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 638724608.0, + "2": 638724608.0, + "3": 638724608.0, + "4": 638724608.0, + "5": 638724608.0, + "6": 638724608.0, + "7": 638724608.0, + "8": 638724608.0, + "9": 638724608.0, + "10": 638724608.0, + "11": 638724608.0, + "12": 638724608.0, + "13": 638724608.0, + "14": 638724608.0, + "15": 638724608.0, + "16": 638724608.0, + "17": 638724608.0, + "18": 638724608.0, + "19": 638724608.0, + "20": 638724608.0, + "21": 638724608.0, + "22": 638724608.0, + "23": 638724608.0, + "24": 638724608.0, + "25": 638724608.0, + "26": 638724608.0, + "27": 638724608.0, + "28": 638724608.0, + "29": 638724608.0, + "30": 638724608.0, + "31": 638724608.0, + "32": 638724608.0, + "33": 638724608.0, + "34": 638724608.0, + "35": 638724608.0, + "36": 638724608.0, + "37": 638724608.0, + "38": 638724608.0, + "39": 638724608.0, + "40": 638724608.0, + "41": 638724608.0, + "42": 638724608.0, + "43": 638724608.0, + "44": 638724608.0, + "45": 638724608.0, + "46": 638724608.0, + "47": 638724608.0, + "48": 638724608.0, + "49": 638724608.0, + "50": 638724608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2610025984.0, + "2": 2840250880.0, + "3": 2840250880.0, + "4": 2840250880.0, + "5": 2840250880.0, + "6": 2840250880.0, + "7": 2840250880.0, + "8": 2840250880.0, + "9": 2840250880.0, + "10": 2840250880.0, + "11": 2840250880.0, + "12": 2840250880.0, + "13": 2840250880.0, + "14": 2840250880.0, + "15": 2840250880.0, + "16": 2840250880.0, + "17": 2840250880.0, + "18": 2840250880.0, + "19": 2840250880.0, + "20": 2840250880.0, + "21": 2840250880.0, + "22": 2840250880.0, + "23": 2840250880.0, + "24": 2840250880.0, + "25": 2840250880.0, + "26": 2840250880.0, + "27": 2840250880.0, + "28": 2840250880.0, + "29": 2840250880.0, + "30": 2840250880.0, + "31": 2840250880.0, + "32": 2840250880.0, + "33": 2840250880.0, + "34": 2840250880.0, + "35": 2840250880.0, + "36": 2840250880.0, + "37": 2840250880.0, + "38": 2840250880.0, + "39": 2840250880.0, + "40": 2840250880.0, + "41": 2840250880.0, + "42": 2840250880.0, + "43": 2840250880.0, + "44": 2840250880.0, + "45": 2840250880.0, + "46": 2840250880.0, + "47": 2840250880.0, + "48": 2840250880.0, + "49": 2840250880.0, + "50": 2840250880.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.45868, + "2": 0.10817, + "3": 0.08964, + "4": 0.08342, + "5": 0.08198, + "6": 0.08179, + "7": 0.08172, + "8": 0.08319, + "9": 0.07964, + "10": 0.07872, + "11": 0.07783, + "12": 0.07839, + "13": 0.07961, + "14": 0.07913, + "15": 0.08021, + "16": 0.07965, + "17": 0.07946, + "18": 0.07924, + "19": 0.0792, + "20": 0.07919, + "21": 0.07872, + "22": 0.07958, + "23": 0.07857, + "24": 0.0793, + "25": 0.07936, + "26": 0.07956, + "27": 0.07904, + "28": 0.07939, + "29": 0.08007, + "30": 0.07912, + "31": 0.07945, + "32": 0.07845, + "33": 0.07804, + "34": 0.07801, + "35": 0.07775, + "36": 0.07835, + "37": 0.0781, + "38": 0.07939, + "39": 0.07789, + "40": 0.07803, + "41": 0.07935, + "42": 0.07838, + "43": 0.07862, + "44": 0.07884, + "45": 0.07747, + "46": 0.07832, + "47": 0.07792, + "48": 0.07896, + "49": 0.07798, + "50": 0.0779 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..6c887e9458f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84528, + "2": 10.85311, + "3": 10.85731, + "4": 10.84281, + "5": 10.87387, + "6": 10.88121, + "7": 10.8659, + "8": 10.84699, + "9": 10.86717, + "10": 10.83535, + "11": 10.91365, + "12": 10.87413, + "13": 10.86738, + "14": 10.89179, + "15": 10.84228, + "16": 10.84293, + "17": 10.81858, + "18": 10.85434, + "19": 10.85509, + "20": 10.80167, + "21": 10.79018, + "22": 10.72544, + "23": 10.8153, + "24": 10.74295, + "25": 10.71149, + "26": 10.77065, + "27": 10.78549, + "28": 10.73165, + "29": 10.75732, + "30": 10.58467, + "31": 10.4336, + "32": 10.68109, + "33": 10.66825, + "34": 10.49989, + "35": 10.53287, + "36": 10.52052, + "37": 10.59723, + "38": 10.45735, + "39": 10.62122, + "40": 10.35652, + "41": 10.40323, + "42": 10.45573, + "43": 10.11522, + "44": 10.24355, + "45": 10.13839, + "46": 10.11493, + "47": 10.39794, + "48": 10.14359, + "49": 9.89174, + "50": 10.20005 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22726236.0, + "2": 22925004.0, + "3": 22596304.0, + "4": 23218272.0, + "5": 22714030.0, + "6": 23020852.0, + "7": 22770078.0, + "8": 22926044.0, + "9": 22841056.0, + "10": 22918036.0, + "11": 22500304.0, + "12": 22458314.0, + "13": 22916576.0, + "14": 22387996.0, + "15": 22821520.0, + "16": 22830056.0, + "17": 22819198.0, + "18": 22582774.0, + "19": 22617328.0, + "20": 22693656.0, + "21": 22739808.0, + "22": 22798880.0, + "23": 22539324.0, + "24": 22770360.0, + "25": 22819138.0, + "26": 22547248.0, + "27": 22468282.0, + "28": 22452480.0, + "29": 22528584.0, + "30": 22630790.0, + "31": 22954356.0, + "32": 22584864.0, + "33": 22557742.0, + "34": 22834464.0, + "35": 22787508.0, + "36": 22588878.0, + "37": 22496888.0, + "38": 22894876.0, + "39": 22800580.0, + "40": 22657590.0, + "41": 22658712.0, + "42": 22665704.0, + "43": 22975164.0, + "44": 22746238.0, + "45": 22674508.0, + "46": 22883428.0, + "47": 22632120.0, + "48": 22927616.0, + "49": 22726280.0, + "50": 22904058.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 609363968.0, + "2": 609363968.0, + "3": 609363968.0, + "4": 609363968.0, + "5": 609363968.0, + "6": 609363968.0, + "7": 609363968.0, + "8": 609363968.0, + "9": 609363968.0, + "10": 609363968.0, + "11": 609363968.0, + "12": 609363968.0, + "13": 609363968.0, + "14": 609363968.0, + "15": 609363968.0, + "16": 609363968.0, + "17": 609363968.0, + "18": 609363968.0, + "19": 609363968.0, + "20": 609363968.0, + "21": 609363968.0, + "22": 609363968.0, + "23": 609363968.0, + "24": 609363968.0, + "25": 609363968.0, + "26": 609363968.0, + "27": 609363968.0, + "28": 609363968.0, + "29": 609363968.0, + "30": 609363968.0, + "31": 609363968.0, + "32": 609363968.0, + "33": 609363968.0, + "34": 609363968.0, + "35": 609363968.0, + "36": 609363968.0, + "37": 609363968.0, + "38": 609363968.0, + "39": 609363968.0, + "40": 609363968.0, + "41": 609363968.0, + "42": 609363968.0, + "43": 609363968.0, + "44": 609363968.0, + "45": 609363968.0, + "46": 609363968.0, + "47": 609363968.0, + "48": 609363968.0, + "49": 609363968.0, + "50": 609363968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2580665344.0, + "2": 2810890240.0, + "3": 2810890240.0, + "4": 2810890240.0, + "5": 2810890240.0, + "6": 2810890240.0, + "7": 2810890240.0, + "8": 2810890240.0, + "9": 2810890240.0, + "10": 2810890240.0, + "11": 2810890240.0, + "12": 2810890240.0, + "13": 2810890240.0, + "14": 2810890240.0, + "15": 2810890240.0, + "16": 2810890240.0, + "17": 2810890240.0, + "18": 2810890240.0, + "19": 2810890240.0, + "20": 2810890240.0, + "21": 2810890240.0, + "22": 2810890240.0, + "23": 2810890240.0, + "24": 2810890240.0, + "25": 2810890240.0, + "26": 2810890240.0, + "27": 2810890240.0, + "28": 2810890240.0, + "29": 2810890240.0, + "30": 2810890240.0, + "31": 2810890240.0, + "32": 2810890240.0, + "33": 2810890240.0, + "34": 2810890240.0, + "35": 2810890240.0, + "36": 2810890240.0, + "37": 2810890240.0, + "38": 2810890240.0, + "39": 2810890240.0, + "40": 2810890240.0, + "41": 2810890240.0, + "42": 2810890240.0, + "43": 2810890240.0, + "44": 2810890240.0, + "45": 2810890240.0, + "46": 2810890240.0, + "47": 2810890240.0, + "48": 2810890240.0, + "49": 2810890240.0, + "50": 2810890240.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21.45212, + "2": 0.14782, + "3": 0.12419, + "4": 0.12287, + "5": 0.12472, + "6": 0.12792, + "7": 0.11932, + "8": 0.12137, + "9": 0.11933, + "10": 0.11994, + "11": 0.11962, + "12": 0.11989, + "13": 0.11879, + "14": 0.11883, + "15": 0.11974, + "16": 0.1189, + "17": 0.121, + "18": 0.12116, + "19": 0.12032, + "20": 0.1212, + "21": 0.11987, + "22": 0.1217, + "23": 0.12108, + "24": 0.12179, + "25": 0.12038, + "26": 0.11988, + "27": 0.12062, + "28": 0.12611, + "29": 0.11789, + "30": 0.11799, + "31": 0.11768, + "32": 0.11881, + "33": 0.11737, + "34": 0.11841, + "35": 0.11781, + "36": 0.11854, + "37": 0.1174, + "38": 0.11872, + "39": 0.11623, + "40": 0.1178, + "41": 0.11984, + "42": 0.11948, + "43": 0.12006, + "44": 0.11861, + "45": 0.11968, + "46": 0.12944, + "47": 0.11845, + "48": 0.12012, + "49": 0.11921, + "50": 0.11821 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..c213f354c2a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84528, + "2": 10.85311, + "3": 10.85731, + "4": 10.84281, + "5": 10.87387, + "6": 10.88121, + "7": 10.8659, + "8": 10.84699, + "9": 10.86717, + "10": 10.83535, + "11": 10.91365, + "12": 10.87413, + "13": 10.86738, + "14": 10.89179, + "15": 10.84228, + "16": 10.84293, + "17": 10.81858, + "18": 10.85434, + "19": 10.85509, + "20": 10.80167, + "21": 10.79018, + "22": 10.72544, + "23": 10.8153, + "24": 10.74295, + "25": 10.71149, + "26": 10.77065, + "27": 10.78549, + "28": 10.73165, + "29": 10.75732, + "30": 10.58467, + "31": 10.4336, + "32": 10.68109, + "33": 10.66825, + "34": 10.49989, + "35": 10.53287, + "36": 10.52052, + "37": 10.59723, + "38": 10.45735, + "39": 10.62122, + "40": 10.35652, + "41": 10.40323, + "42": 10.45573, + "43": 10.11522, + "44": 10.24355, + "45": 10.13839, + "46": 10.11493, + "47": 10.39794, + "48": 10.14359, + "49": 9.89174, + "50": 10.20005 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22726236.0, + "2": 22925004.0, + "3": 22596304.0, + "4": 23218272.0, + "5": 22714030.0, + "6": 23020852.0, + "7": 22770078.0, + "8": 22926044.0, + "9": 22841056.0, + "10": 22918036.0, + "11": 22500304.0, + "12": 22458314.0, + "13": 22916576.0, + "14": 22387996.0, + "15": 22821520.0, + "16": 22830056.0, + "17": 22819198.0, + "18": 22582774.0, + "19": 22617328.0, + "20": 22693656.0, + "21": 22739808.0, + "22": 22798880.0, + "23": 22539324.0, + "24": 22770360.0, + "25": 22819138.0, + "26": 22547248.0, + "27": 22468282.0, + "28": 22452480.0, + "29": 22528584.0, + "30": 22630790.0, + "31": 22954356.0, + "32": 22584864.0, + "33": 22557742.0, + "34": 22834464.0, + "35": 22787508.0, + "36": 22588878.0, + "37": 22496888.0, + "38": 22894876.0, + "39": 22800580.0, + "40": 22657590.0, + "41": 22658712.0, + "42": 22665704.0, + "43": 22975164.0, + "44": 22746238.0, + "45": 22674508.0, + "46": 22883428.0, + "47": 22632120.0, + "48": 22927616.0, + "49": 22726280.0, + "50": 22904058.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 609363968.0, + "2": 609363968.0, + "3": 609363968.0, + "4": 609363968.0, + "5": 609363968.0, + "6": 609363968.0, + "7": 609363968.0, + "8": 609363968.0, + "9": 609363968.0, + "10": 609363968.0, + "11": 609363968.0, + "12": 609363968.0, + "13": 609363968.0, + "14": 609363968.0, + "15": 609363968.0, + "16": 609363968.0, + "17": 609363968.0, + "18": 609363968.0, + "19": 609363968.0, + "20": 609363968.0, + "21": 609363968.0, + "22": 609363968.0, + "23": 609363968.0, + "24": 609363968.0, + "25": 609363968.0, + "26": 609363968.0, + "27": 609363968.0, + "28": 609363968.0, + "29": 609363968.0, + "30": 609363968.0, + "31": 609363968.0, + "32": 609363968.0, + "33": 609363968.0, + "34": 609363968.0, + "35": 609363968.0, + "36": 609363968.0, + "37": 609363968.0, + "38": 609363968.0, + "39": 609363968.0, + "40": 609363968.0, + "41": 609363968.0, + "42": 609363968.0, + "43": 609363968.0, + "44": 609363968.0, + "45": 609363968.0, + "46": 609363968.0, + "47": 609363968.0, + "48": 609363968.0, + "49": 609363968.0, + "50": 609363968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2580665344.0, + "2": 2810890240.0, + "3": 2810890240.0, + "4": 2810890240.0, + "5": 2810890240.0, + "6": 2810890240.0, + "7": 2810890240.0, + "8": 2810890240.0, + "9": 2810890240.0, + "10": 2810890240.0, + "11": 2810890240.0, + "12": 2810890240.0, + "13": 2810890240.0, + "14": 2810890240.0, + "15": 2810890240.0, + "16": 2810890240.0, + "17": 2810890240.0, + "18": 2810890240.0, + "19": 2810890240.0, + "20": 2810890240.0, + "21": 2810890240.0, + "22": 2810890240.0, + "23": 2810890240.0, + "24": 2810890240.0, + "25": 2810890240.0, + "26": 2810890240.0, + "27": 2810890240.0, + "28": 2810890240.0, + "29": 2810890240.0, + "30": 2810890240.0, + "31": 2810890240.0, + "32": 2810890240.0, + "33": 2810890240.0, + "34": 2810890240.0, + "35": 2810890240.0, + "36": 2810890240.0, + "37": 2810890240.0, + "38": 2810890240.0, + "39": 2810890240.0, + "40": 2810890240.0, + "41": 2810890240.0, + "42": 2810890240.0, + "43": 2810890240.0, + "44": 2810890240.0, + "45": 2810890240.0, + "46": 2810890240.0, + "47": 2810890240.0, + "48": 2810890240.0, + "49": 2810890240.0, + "50": 2810890240.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.94763, + "2": 0.1464, + "3": 0.12192, + "4": 0.12042, + "5": 0.12369, + "6": 0.1197, + "7": 0.12002, + "8": 0.12026, + "9": 0.11856, + "10": 0.11993, + "11": 0.11958, + "12": 0.11934, + "13": 0.11858, + "14": 0.11928, + "15": 0.11863, + "16": 0.11911, + "17": 0.11905, + "18": 0.12098, + "19": 0.11814, + "20": 0.11768, + "21": 0.11925, + "22": 0.11811, + "23": 0.11686, + "24": 0.11706, + "25": 0.11682, + "26": 0.11906, + "27": 0.11759, + "28": 0.11866, + "29": 0.11785, + "30": 0.11772, + "31": 0.11912, + "32": 0.118, + "33": 0.11808, + "34": 0.1174, + "35": 0.11853, + "36": 0.1174, + "37": 0.11808, + "38": 0.1194, + "39": 0.11749, + "40": 0.11871, + "41": 0.11887, + "42": 0.11731, + "43": 0.11929, + "44": 0.11811, + "45": 0.11913, + "46": 0.11806, + "47": 0.11686, + "48": 0.11726, + "49": 0.11729, + "50": 0.11729 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 3a679ee1d68..b668521f995 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85163, - "5": 10.8787, - "10": 10.80636, - "15": 10.81034, - "20": 10.68692, - "25": 10.49703, - "30": 10.32668, - "35": 10.2249, - "40": 10.04381, + "2": 10.85389, + "3": 10.83867, + "4": 10.84326, + "5": 10.87865, + "6": 10.87589, + "7": 10.86185, + "8": 10.84926, + "9": 10.84876, + "10": 10.80639, + "11": 10.88684, + "12": 10.85677, + "13": 10.86234, + "14": 10.87768, + "15": 10.81036, + "16": 10.81987, + "17": 10.78281, + "18": 10.80322, + "19": 10.78354, + "20": 10.6869, + "21": 10.66901, + "22": 10.5231, + "23": 10.68441, + "24": 10.56577, + "25": 10.49701, + "26": 10.5655, + "27": 10.58174, + "28": 10.52997, + "29": 10.55562, + "30": 10.32673, + "31": 10.07635, + "32": 10.43058, + "33": 10.42459, + "34": 10.16648, + "35": 10.22488, + "36": 10.1834, + "37": 10.29955, + "38": 10.145, + "39": 10.37068, + "40": 10.04384, + "41": 10.09449, + "42": 10.1738, + "43": 9.77535, + "44": 9.90309, "45": 9.77899, - "50": 9.85789, - "55": 9.83807, - "60": 9.44187, - "65": 8.88428, - "70": 9.70474, + "46": 9.76547, + "47": 10.1072, + "48": 9.80031, + "49": 9.47524, + "50": 9.85793, + "51": 9.80033, + "52": 9.69511, + "53": 10.02851, + "54": 9.91434, + "55": 9.83811, + "56": 9.57832, + "57": 9.42584, + "58": 9.79169, + "59": 9.53621, + "60": 9.44188, + "61": 9.65656, + "62": 9.9438, + "63": 9.32147, + "64": 9.73338, + "65": 8.88431, + "66": 9.65528, + "67": 9.32102, + "68": 9.75063, + "69": 9.76395, + "70": 9.70471, + "71": 9.56858, + "72": 9.53902, + "73": 9.45226, + "74": 8.87734, "75": 9.37931, - "80": 9.36592, - "85": 9.57422, - "90": 9.78804, - "95": 9.48833, - "100": 9.35873 + "76": 9.01864, + "77": 10.0352, + "78": 9.69265, + "79": 9.33457, + "80": 9.36591, + "81": 9.4392, + "82": 9.66576, + "83": 9.25445, + "84": 9.37801, + "85": 9.57423, + "86": 9.03279, + "87": 9.55778, + "88": 9.71526, + "89": 9.55706, + "90": 9.78807, + "91": 9.29512, + "92": 9.31513, + "93": 9.03245, + "94": 8.79084, + "95": 9.48837, + "96": 9.49575, + "97": 9.27132, + "98": 9.64072, + "99": 8.84738, + "100": 9.3587 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 619.0, - "5": 646.0, - "10": 582.0, - "15": 710.0, - "20": 672.0, - "25": 605.0, - "30": 745.0, - "35": 753.0, - "40": 797.0, - "45": 727.0, - "50": 852.0, - "55": 882.0, - "60": 892.0, - "65": 934.0, - "70": 1066.0, - "75": 928.0, - "80": 1058.0, - "85": 1127.0, - "90": 1130.0, - "95": 1034.0, - "100": 1064.0 + "1": 604.0, + "2": 601.0, + "3": 657.0, + "4": 631.0, + "5": 677.0, + "6": 630.0, + "7": 662.0, + "8": 607.0, + "9": 614.0, + "10": 588.0, + "11": 713.0, + "12": 679.0, + "13": 667.0, + "14": 649.0, + "15": 667.0, + "16": 659.0, + "17": 681.0, + "18": 674.0, + "19": 586.0, + "20": 668.0, + "21": 679.0, + "22": 646.0, + "23": 757.0, + "24": 633.0, + "25": 653.0, + "26": 662.0, + "27": 682.0, + "28": 746.0, + "29": 758.0, + "30": 711.0, + "31": 645.0, + "32": 705.0, + "33": 759.0, + "34": 667.0, + "35": 745.0, + "36": 744.0, + "37": 799.0, + "38": 781.0, + "39": 903.0, + "40": 806.0, + "41": 804.0, + "42": 853.0, + "43": 651.0, + "44": 817.0, + "45": 834.0, + "46": 842.0, + "47": 859.0, + "48": 846.0, + "49": 831.0, + "50": 774.0, + "51": 927.0, + "52": 907.0, + "53": 981.0, + "54": 884.0, + "55": 858.0, + "56": 950.0, + "57": 885.0, + "58": 961.0, + "59": 949.0, + "60": 837.0, + "61": 953.0, + "62": 907.0, + "63": 911.0, + "64": 1085.0, + "65": 964.0, + "66": 1054.0, + "67": 1008.0, + "68": 975.0, + "69": 1027.0, + "70": 1025.0, + "71": 1093.0, + "72": 882.0, + "73": 988.0, + "74": 685.0, + "75": 857.0, + "76": 1040.0, + "77": 1138.0, + "78": 1115.0, + "79": 1049.0, + "80": 1127.0, + "81": 1260.0, + "82": 1089.0, + "83": 1000.0, + "84": 1123.0, + "85": 1179.0, + "86": 927.0, + "87": 1264.0, + "88": 1041.0, + "89": 1165.0, + "90": 1105.0, + "91": 1136.0, + "92": 1151.0, + "93": 880.0, + "94": 1183.0, + "95": 1125.0, + "96": 1202.0, + "97": 1026.0, + "98": 1189.0, + "99": 1171.0, + "100": 1097.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 689356288.0, - "5": 689356288.0, - "10": 689356288.0, - "15": 689356288.0, - "20": 689356288.0, - "25": 689356288.0, - "30": 689356288.0, - "35": 689356288.0, - "40": 689356288.0, - "45": 689356288.0, - "50": 689356288.0, - "55": 689356288.0, - "60": 689356288.0, - "65": 689356288.0, - "70": 689356288.0, - "75": 689356288.0, - "80": 689356288.0, - "85": 689356288.0, - "90": 689356288.0, - "95": 689356288.0, - "100": 689356288.0 + "1": 689618432.0, + "2": 689618432.0, + "3": 689618432.0, + "4": 689618432.0, + "5": 689618432.0, + "6": 689618432.0, + "7": 689618432.0, + "8": 689618432.0, + "9": 689618432.0, + "10": 689618432.0, + "11": 689618432.0, + "12": 689618432.0, + "13": 689618432.0, + "14": 689618432.0, + "15": 689618432.0, + "16": 689618432.0, + "17": 689618432.0, + "18": 689618432.0, + "19": 689618432.0, + "20": 689618432.0, + "21": 689618432.0, + "22": 689618432.0, + "23": 689618432.0, + "24": 689618432.0, + "25": 689618432.0, + "26": 689618432.0, + "27": 689618432.0, + "28": 689618432.0, + "29": 689618432.0, + "30": 689618432.0, + "31": 689618432.0, + "32": 689618432.0, + "33": 689618432.0, + "34": 689618432.0, + "35": 689618432.0, + "36": 689618432.0, + "37": 689618432.0, + "38": 689618432.0, + "39": 689618432.0, + "40": 689618432.0, + "41": 689618432.0, + "42": 689618432.0, + "43": 689618432.0, + "44": 689618432.0, + "45": 689618432.0, + "46": 689618432.0, + "47": 689618432.0, + "48": 689618432.0, + "49": 689618432.0, + "50": 689618432.0, + "51": 689618432.0, + "52": 689618432.0, + "53": 689618432.0, + "54": 689618432.0, + "55": 689618432.0, + "56": 689618432.0, + "57": 689618432.0, + "58": 689618432.0, + "59": 689618432.0, + "60": 689618432.0, + "61": 689618432.0, + "62": 689618432.0, + "63": 689618432.0, + "64": 689618432.0, + "65": 689618432.0, + "66": 689618432.0, + "67": 689618432.0, + "68": 689618432.0, + "69": 689618432.0, + "70": 689618432.0, + "71": 689618432.0, + "72": 689618432.0, + "73": 689618432.0, + "74": 689618432.0, + "75": 689618432.0, + "76": 689618432.0, + "77": 689618432.0, + "78": 689618432.0, + "79": 689618432.0, + "80": 689618432.0, + "81": 689618432.0, + "82": 689618432.0, + "83": 689618432.0, + "84": 689618432.0, + "85": 689618432.0, + "86": 689618432.0, + "87": 689618432.0, + "88": 689618432.0, + "89": 689618432.0, + "90": 689618432.0, + "91": 689618432.0, + "92": 689618432.0, + "93": 689618432.0, + "94": 689618432.0, + "95": 689618432.0, + "96": 689618432.0, + "97": 689618432.0, + "98": 689618432.0, + "99": 689618432.0, + "100": 689618432.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 961750016.0, - "5": 1220176384.0, - "10": 1223321600.0, - "15": 1223321600.0, - "20": 1223321600.0, - "25": 1223321600.0, - "30": 1223321600.0, - "35": 1223321600.0, - "40": 1223321600.0, - "45": 1223321600.0, - "50": 1223321600.0, - "55": 1223321600.0, - "60": 1223321600.0, - "65": 1223321600.0, - "70": 1223321600.0, - "75": 1223321600.0, - "80": 1223321600.0, - "85": 1223321600.0, - "90": 1223321600.0, - "95": 1223321600.0, - "100": 1223321600.0 + "1": 959652864.0, + "2": 1220175872.0, + "3": 1221224448.0, + "4": 1221224448.0, + "5": 1221224448.0, + "6": 1221224448.0, + "7": 1221224448.0, + "8": 1221224448.0, + "9": 1221224448.0, + "10": 1221224448.0, + "11": 1221224448.0, + "12": 1221224448.0, + "13": 1221224448.0, + "14": 1221224448.0, + "15": 1221224448.0, + "16": 1221224448.0, + "17": 1221224448.0, + "18": 1221224448.0, + "19": 1221224448.0, + "20": 1221224448.0, + "21": 1221224448.0, + "22": 1221224448.0, + "23": 1221224448.0, + "24": 1221224448.0, + "25": 1221224448.0, + "26": 1221224448.0, + "27": 1221224448.0, + "28": 1221224448.0, + "29": 1221224448.0, + "30": 1221224448.0, + "31": 1221224448.0, + "32": 1221224448.0, + "33": 1221224448.0, + "34": 1221224448.0, + "35": 1221224448.0, + "36": 1221224448.0, + "37": 1221224448.0, + "38": 1221224448.0, + "39": 1221224448.0, + "40": 1221224448.0, + "41": 1221224448.0, + "42": 1221224448.0, + "43": 1221224448.0, + "44": 1221224448.0, + "45": 1221224448.0, + "46": 1221224448.0, + "47": 1221224448.0, + "48": 1221224448.0, + "49": 1221224448.0, + "50": 1221224448.0, + "51": 1221486080.0, + "52": 1221486080.0, + "53": 1221486080.0, + "54": 1221486080.0, + "55": 1221486080.0, + "56": 1221486080.0, + "57": 1221486080.0, + "58": 1221486080.0, + "59": 1221486080.0, + "60": 1221486080.0, + "61": 1221486080.0, + "62": 1221486080.0, + "63": 1221486080.0, + "64": 1221486080.0, + "65": 1221486080.0, + "66": 1221486080.0, + "67": 1221486080.0, + "68": 1221486080.0, + "69": 1221487104.0, + "70": 1221487104.0, + "71": 1221487104.0, + "72": 1221487104.0, + "73": 1221487104.0, + "74": 1221487104.0, + "75": 1221487104.0, + "76": 1221487104.0, + "77": 1221487104.0, + "78": 1221487104.0, + "79": 1221487104.0, + "80": 1221487104.0, + "81": 1221487104.0, + "82": 1221487104.0, + "83": 1221487104.0, + "84": 1221487104.0, + "85": 1221487104.0, + "86": 1221487104.0, + "87": 1221487104.0, + "88": 1221487104.0, + "89": 1221487104.0, + "90": 1221487104.0, + "91": 1221487104.0, + "92": 1221487104.0, + "93": 1221487104.0, + "94": 1221487104.0, + "95": 1221487104.0, + "96": 1221487104.0, + "97": 1221487104.0, + "98": 1221487104.0, + "99": 1221487104.0, + "100": 1221487104.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.33137, - "5": 0.24439, - "10": 0.24539, - "15": 0.24239, - "20": 0.24713, - "25": 0.24683, - "30": 0.24516, - "35": 0.24456, - "40": 0.25161, - "45": 0.24886, - "50": 0.24548, - "55": 0.25414, - "60": 0.24546, - "65": 0.25395, - "70": 0.24573, - "75": 0.24821, - "80": 0.25298, - "85": 0.2568, - "90": 0.24531, - "95": 0.24617, - "100": 0.25395 + "1": 10.63286, + "2": 0.29932, + "3": 0.28799, + "4": 0.28475, + "5": 0.28729, + "6": 0.28613, + "7": 0.28182, + "8": 0.28376, + "9": 0.28071, + "10": 0.28064, + "11": 0.28008, + "12": 0.27999, + "13": 0.27369, + "14": 0.27735, + "15": 0.27802, + "16": 0.27647, + "17": 0.28017, + "18": 0.27624, + "19": 0.27907, + "20": 0.28457, + "21": 0.28621, + "22": 0.27968, + "23": 0.2788, + "24": 0.27704, + "25": 0.27774, + "26": 0.27744, + "27": 0.27759, + "28": 0.27978, + "29": 0.28051, + "30": 0.28034, + "31": 0.27733, + "32": 0.27813, + "33": 0.27733, + "34": 0.28166, + "35": 0.27601, + "36": 0.27766, + "37": 0.27784, + "38": 0.27709, + "39": 0.2776, + "40": 0.27758, + "41": 0.27975, + "42": 0.27633, + "43": 0.27864, + "44": 0.27802, + "45": 0.27955, + "46": 0.27725, + "47": 0.27926, + "48": 0.28083, + "49": 0.2781, + "50": 0.27962, + "51": 0.30289, + "52": 0.2758, + "53": 0.27484, + "54": 0.29013, + "55": 0.28835, + "56": 0.274, + "57": 0.27512, + "58": 0.27238, + "59": 0.27429, + "60": 0.27435, + "61": 0.27493, + "62": 0.27237, + "63": 0.27125, + "64": 0.27873, + "65": 0.27559, + "66": 0.27509, + "67": 0.27136, + "68": 0.27248, + "69": 0.27308, + "70": 0.27367, + "71": 0.27224, + "72": 0.27404, + "73": 0.27347, + "74": 0.27274, + "75": 0.27659, + "76": 0.27508, + "77": 0.27421, + "78": 0.27262, + "79": 0.27496, + "80": 0.27635, + "81": 0.60573, + "82": 0.27646, + "83": 0.27511, + "84": 0.27432, + "85": 0.27697, + "86": 0.27845, + "87": 0.27696, + "88": 0.27613, + "89": 0.28436, + "90": 0.27824, + "91": 0.27389, + "92": 0.27309, + "93": 0.27377, + "94": 0.27986, + "95": 0.27303, + "96": 0.2751, + "97": 0.2752, + "98": 0.27677, + "99": 0.27534, + "100": 0.27167 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..3a7a72a10c2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85163, + "2": 10.85389, + "3": 10.83863, + "4": 10.84324, + "5": 10.87867, + "6": 10.87588, + "7": 10.86181, + "8": 10.84924, + "9": 10.84875, + "10": 10.80634, + "11": 10.8868, + "12": 10.8568, + "13": 10.86235, + "14": 10.87766, + "15": 10.81037, + "16": 10.8198, + "17": 10.7828, + "18": 10.80323, + "19": 10.78353, + "20": 10.6869, + "21": 10.66905, + "22": 10.52312, + "23": 10.68437, + "24": 10.56579, + "25": 10.49701, + "26": 10.56552, + "27": 10.58172, + "28": 10.52997, + "29": 10.55561, + "30": 10.32668, + "31": 10.07633, + "32": 10.43056, + "33": 10.42454, + "34": 10.16648, + "35": 10.22486, + "36": 10.18345, + "37": 10.29955, + "38": 10.14498, + "39": 10.37064, + "40": 10.04385, + "41": 10.09446, + "42": 10.1738, + "43": 9.77535, + "44": 9.9031, + "45": 9.779, + "46": 9.76548, + "47": 10.10718, + "48": 9.80028, + "49": 9.4752, + "50": 9.85787, + "51": 9.80034, + "52": 9.69507, + "53": 10.0285, + "54": 9.91432, + "55": 9.83807, + "56": 9.57827, + "57": 9.42584, + "58": 9.79171, + "59": 9.53621, + "60": 9.44186, + "61": 9.65655, + "62": 9.94377, + "63": 9.32146, + "64": 9.7334, + "65": 8.88429, + "66": 9.65527, + "67": 9.321, + "68": 9.75066, + "69": 9.76398, + "70": 9.70468, + "71": 9.56857, + "72": 9.53903, + "73": 9.45227, + "74": 8.87742, + "75": 9.37933, + "76": 9.0186, + "77": 10.03521, + "78": 9.69265, + "79": 9.33456, + "80": 9.36592, + "81": 9.4392, + "82": 9.66571, + "83": 9.25447, + "84": 9.378, + "85": 9.57419, + "86": 9.03278, + "87": 9.55776, + "88": 9.71523, + "89": 9.55706, + "90": 9.78804, + "91": 9.29518, + "92": 9.31513, + "93": 9.03243, + "94": 8.79087, + "95": 9.48835, + "96": 9.49572, + "97": 9.27133, + "98": 9.64071, + "99": 8.84737, + "100": 9.35871 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 627.0, + "2": 608.0, + "3": 673.0, + "4": 679.0, + "5": 640.0, + "6": 694.0, + "7": 628.0, + "8": 602.0, + "9": 653.0, + "10": 534.0, + "11": 712.0, + "12": 631.0, + "13": 674.0, + "14": 682.0, + "15": 711.0, + "16": 655.0, + "17": 720.0, + "18": 660.0, + "19": 641.0, + "20": 653.0, + "21": 651.0, + "22": 628.0, + "23": 722.0, + "24": 647.0, + "25": 682.0, + "26": 658.0, + "27": 655.0, + "28": 725.0, + "29": 794.0, + "30": 729.0, + "31": 632.0, + "32": 733.0, + "33": 803.0, + "34": 704.0, + "35": 728.0, + "36": 797.0, + "37": 839.0, + "38": 830.0, + "39": 885.0, + "40": 788.0, + "41": 878.0, + "42": 897.0, + "43": 770.0, + "44": 867.0, + "45": 735.0, + "46": 812.0, + "47": 884.0, + "48": 879.0, + "49": 828.0, + "50": 812.0, + "51": 896.0, + "52": 876.0, + "53": 976.0, + "54": 939.0, + "55": 875.0, + "56": 951.0, + "57": 865.0, + "58": 1011.0, + "59": 947.0, + "60": 786.0, + "61": 1059.0, + "62": 920.0, + "63": 917.0, + "64": 1022.0, + "65": 940.0, + "66": 1052.0, + "67": 994.0, + "68": 1024.0, + "69": 980.0, + "70": 1046.0, + "71": 1132.0, + "72": 911.0, + "73": 1006.0, + "74": 688.0, + "75": 889.0, + "76": 972.0, + "77": 1162.0, + "78": 1045.0, + "79": 1008.0, + "80": 1089.0, + "81": 1209.0, + "82": 1067.0, + "83": 999.0, + "84": 1135.0, + "85": 1194.0, + "86": 936.0, + "87": 1271.0, + "88": 1144.0, + "89": 1099.0, + "90": 1140.0, + "91": 1115.0, + "92": 1127.0, + "93": 961.0, + "94": 1203.0, + "95": 1140.0, + "96": 1177.0, + "97": 1055.0, + "98": 1335.0, + "99": 1164.0, + "100": 1093.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 689356288.0, + "2": 689356288.0, + "3": 689356288.0, + "4": 689356288.0, + "5": 689356288.0, + "6": 689356288.0, + "7": 689356288.0, + "8": 689356288.0, + "9": 689356288.0, + "10": 689356288.0, + "11": 689356288.0, + "12": 689356288.0, + "13": 689356288.0, + "14": 689356288.0, + "15": 689356288.0, + "16": 689356288.0, + "17": 689356288.0, + "18": 689356288.0, + "19": 689356288.0, + "20": 689356288.0, + "21": 689356288.0, + "22": 689356288.0, + "23": 689356288.0, + "24": 689356288.0, + "25": 689356288.0, + "26": 689356288.0, + "27": 689356288.0, + "28": 689356288.0, + "29": 689356288.0, + "30": 689356288.0, + "31": 689356288.0, + "32": 689356288.0, + "33": 689356288.0, + "34": 689356288.0, + "35": 689356288.0, + "36": 689356288.0, + "37": 689356288.0, + "38": 689356288.0, + "39": 689356288.0, + "40": 689356288.0, + "41": 689356288.0, + "42": 689356288.0, + "43": 689356288.0, + "44": 689356288.0, + "45": 689356288.0, + "46": 689356288.0, + "47": 689356288.0, + "48": 689356288.0, + "49": 689356288.0, + "50": 689356288.0, + "51": 689356288.0, + "52": 689356288.0, + "53": 689356288.0, + "54": 689356288.0, + "55": 689356288.0, + "56": 689356288.0, + "57": 689356288.0, + "58": 689356288.0, + "59": 689356288.0, + "60": 689356288.0, + "61": 689356288.0, + "62": 689356288.0, + "63": 689356288.0, + "64": 689356288.0, + "65": 689356288.0, + "66": 689356288.0, + "67": 689356288.0, + "68": 689356288.0, + "69": 689356288.0, + "70": 689356288.0, + "71": 689356288.0, + "72": 689356288.0, + "73": 689356288.0, + "74": 689356288.0, + "75": 689356288.0, + "76": 689356288.0, + "77": 689356288.0, + "78": 689356288.0, + "79": 689356288.0, + "80": 689356288.0, + "81": 689356288.0, + "82": 689356288.0, + "83": 689356288.0, + "84": 689356288.0, + "85": 689356288.0, + "86": 689356288.0, + "87": 689356288.0, + "88": 689356288.0, + "89": 689356288.0, + "90": 689356288.0, + "91": 689356288.0, + "92": 689356288.0, + "93": 689356288.0, + "94": 689356288.0, + "95": 689356288.0, + "96": 689356288.0, + "97": 689356288.0, + "98": 689356288.0, + "99": 689356288.0, + "100": 689356288.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 962798592.0, + "2": 1220175872.0, + "3": 1220175872.0, + "4": 1220175872.0, + "5": 1220175872.0, + "6": 1220175872.0, + "7": 1220175872.0, + "8": 1220175872.0, + "9": 1220175872.0, + "10": 1220175872.0, + "11": 1220175872.0, + "12": 1220175872.0, + "13": 1220175872.0, + "14": 1220175872.0, + "15": 1220175872.0, + "16": 1220175872.0, + "17": 1220175872.0, + "18": 1220175872.0, + "19": 1220175872.0, + "20": 1220175872.0, + "21": 1220175872.0, + "22": 1220175872.0, + "23": 1220175872.0, + "24": 1220175872.0, + "25": 1220175872.0, + "26": 1221224960.0, + "27": 1221224960.0, + "28": 1221224960.0, + "29": 1221224960.0, + "30": 1221224960.0, + "31": 1221224960.0, + "32": 1221224960.0, + "33": 1221224960.0, + "34": 1221224960.0, + "35": 1221224960.0, + "36": 1221224960.0, + "37": 1221224960.0, + "38": 1221224960.0, + "39": 1221224960.0, + "40": 1221224960.0, + "41": 1221224960.0, + "42": 1221224960.0, + "43": 1221224960.0, + "44": 1221224960.0, + "45": 1221224960.0, + "46": 1221224960.0, + "47": 1221224960.0, + "48": 1221224960.0, + "49": 1221224960.0, + "50": 1221224960.0, + "51": 1221224960.0, + "52": 1221224960.0, + "53": 1221224960.0, + "54": 1221224960.0, + "55": 1221224960.0, + "56": 1221224960.0, + "57": 1221224960.0, + "58": 1221224960.0, + "59": 1221224960.0, + "60": 1221224960.0, + "61": 1221224960.0, + "62": 1221224960.0, + "63": 1221224960.0, + "64": 1221224960.0, + "65": 1221224960.0, + "66": 1221224960.0, + "67": 1221224960.0, + "68": 1221224960.0, + "69": 1221224960.0, + "70": 1221224960.0, + "71": 1221224960.0, + "72": 1221224960.0, + "73": 1221224960.0, + "74": 1221224960.0, + "75": 1221224960.0, + "76": 1221224960.0, + "77": 1221224960.0, + "78": 1221224960.0, + "79": 1221224960.0, + "80": 1221224960.0, + "81": 1221224960.0, + "82": 1221224960.0, + "83": 1221224960.0, + "84": 1221224960.0, + "85": 1221224960.0, + "86": 1221224960.0, + "87": 1221224960.0, + "88": 1221224960.0, + "89": 1221224960.0, + "90": 1221224960.0, + "91": 1221224960.0, + "92": 1221224960.0, + "93": 1221224960.0, + "94": 1221224960.0, + "95": 1221224960.0, + "96": 1221224960.0, + "97": 1221224960.0, + "98": 1221224960.0, + "99": 1221224960.0, + "100": 1221224960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.57061, + "2": 0.29948, + "3": 0.25664, + "4": 0.25525, + "5": 0.25975, + "6": 0.25312, + "7": 0.25214, + "8": 0.25198, + "9": 0.25236, + "10": 0.25037, + "11": 0.2502, + "12": 0.26, + "13": 0.25174, + "14": 0.2554, + "15": 0.25351, + "16": 0.25165, + "17": 0.25076, + "18": 0.2547, + "19": 0.26231, + "20": 0.24779, + "21": 0.2545, + "22": 0.2531, + "23": 0.25207, + "24": 0.25132, + "25": 0.25306, + "26": 0.25309, + "27": 0.25693, + "28": 0.25352, + "29": 0.25148, + "30": 0.29402, + "31": 0.26128, + "32": 0.24916, + "33": 0.24618, + "34": 0.25663, + "35": 0.25422, + "36": 0.24893, + "37": 0.2479, + "38": 0.24866, + "39": 0.2519, + "40": 0.24703, + "41": 0.26177, + "42": 0.26238, + "43": 0.26445, + "44": 0.25941, + "45": 0.25966, + "46": 0.26213, + "47": 0.2596, + "48": 0.2599, + "49": 0.26099, + "50": 0.25831, + "51": 0.26468, + "52": 0.27616, + "53": 0.28242, + "54": 0.25962, + "55": 0.25746, + "56": 0.2557, + "57": 0.25914, + "58": 0.26888, + "59": 0.25926, + "60": 0.2602, + "61": 0.25903, + "62": 0.59856, + "63": 0.25221, + "64": 0.26626, + "65": 0.25583, + "66": 0.25184, + "67": 0.25017, + "68": 0.24797, + "69": 0.25276, + "70": 0.24957, + "71": 0.25739, + "72": 0.25804, + "73": 0.24807, + "74": 0.24833, + "75": 0.24684, + "76": 0.24858, + "77": 0.2483, + "78": 0.24799, + "79": 0.24873, + "80": 0.25713, + "81": 0.24828, + "82": 0.25747, + "83": 0.25481, + "84": 0.25333, + "85": 0.25368, + "86": 0.24984, + "87": 0.24993, + "88": 0.24848, + "89": 0.24598, + "90": 0.24825, + "91": 0.24841, + "92": 0.24485, + "93": 0.24192, + "94": 0.24464, + "95": 0.24499, + "96": 0.24711, + "97": 0.2469, + "98": 0.24804, + "99": 0.25199, + "100": 0.24705 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..e88d1fcb739 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85163, + "2": 10.85389, + "3": 10.83866, + "4": 10.84328, + "5": 10.8787, + "6": 10.87586, + "7": 10.86186, + "8": 10.84928, + "9": 10.84877, + "10": 10.80639, + "11": 10.88679, + "12": 10.85682, + "13": 10.86235, + "14": 10.87768, + "15": 10.81037, + "16": 10.81984, + "17": 10.7828, + "18": 10.80322, + "19": 10.78358, + "20": 10.68694, + "21": 10.66905, + "22": 10.52315, + "23": 10.68436, + "24": 10.56577, + "25": 10.49705, + "26": 10.56553, + "27": 10.58171, + "28": 10.52995, + "29": 10.55561, + "30": 10.32672, + "31": 10.07636, + "32": 10.43058, + "33": 10.42455, + "34": 10.16647, + "35": 10.22486, + "36": 10.18341, + "37": 10.29956, + "38": 10.14498, + "39": 10.37061, + "40": 10.04385, + "41": 10.0945, + "42": 10.17381, + "43": 9.77538, + "44": 9.90308, + "45": 9.779, + "46": 9.76548, + "47": 10.10723, + "48": 9.80029, + "49": 9.47526, + "50": 9.85792, + "51": 9.80039, + "52": 9.69506, + "53": 10.0285, + "54": 9.9143, + "55": 9.83807, + "56": 9.57833, + "57": 9.42582, + "58": 9.79172, + "59": 9.53617, + "60": 9.44186, + "61": 9.65656, + "62": 9.94377, + "63": 9.32151, + "64": 9.73339, + "65": 8.88427, + "66": 9.65533, + "67": 9.32106, + "68": 9.75064, + "69": 9.764, + "70": 9.70469, + "71": 9.56861, + "72": 9.53902, + "73": 9.45226, + "74": 8.87736, + "75": 9.37933, + "76": 9.01867, + "77": 10.03519, + "78": 9.69263, + "79": 9.33459, + "80": 9.36591, + "81": 9.43919, + "82": 9.66572, + "83": 9.25441, + "84": 9.378, + "85": 9.57422, + "86": 9.03277, + "87": 9.55775, + "88": 9.71521, + "89": 9.55703, + "90": 9.788, + "91": 9.29518, + "92": 9.31516, + "93": 9.03246, + "94": 8.79087, + "95": 9.48833, + "96": 9.49574, + "97": 9.2713, + "98": 9.64071, + "99": 8.84741, + "100": 9.35871 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 600.0, + "2": 574.0, + "3": 677.0, + "4": 617.0, + "5": 669.0, + "6": 650.0, + "7": 700.0, + "8": 624.0, + "9": 649.0, + "10": 562.0, + "11": 661.0, + "12": 622.0, + "13": 711.0, + "14": 656.0, + "15": 688.0, + "16": 667.0, + "17": 696.0, + "18": 660.0, + "19": 607.0, + "20": 649.0, + "21": 646.0, + "22": 653.0, + "23": 743.0, + "24": 678.0, + "25": 663.0, + "26": 661.0, + "27": 703.0, + "28": 769.0, + "29": 775.0, + "30": 767.0, + "31": 606.0, + "32": 755.0, + "33": 764.0, + "34": 676.0, + "35": 779.0, + "36": 768.0, + "37": 824.0, + "38": 808.0, + "39": 893.0, + "40": 795.0, + "41": 774.0, + "42": 895.0, + "43": 758.0, + "44": 770.0, + "45": 738.0, + "46": 856.0, + "47": 912.0, + "48": 843.0, + "49": 884.0, + "50": 782.0, + "51": 967.0, + "52": 940.0, + "53": 988.0, + "54": 937.0, + "55": 870.0, + "56": 981.0, + "57": 838.0, + "58": 909.0, + "59": 969.0, + "60": 821.0, + "61": 1016.0, + "62": 953.0, + "63": 895.0, + "64": 1137.0, + "65": 917.0, + "66": 1050.0, + "67": 946.0, + "68": 974.0, + "69": 1091.0, + "70": 1024.0, + "71": 1104.0, + "72": 888.0, + "73": 967.0, + "74": 657.0, + "75": 879.0, + "76": 977.0, + "77": 1172.0, + "78": 1085.0, + "79": 1107.0, + "80": 1178.0, + "81": 1236.0, + "82": 1103.0, + "83": 975.0, + "84": 1164.0, + "85": 1160.0, + "86": 879.0, + "87": 1184.0, + "88": 1102.0, + "89": 1105.0, + "90": 1122.0, + "91": 1065.0, + "92": 1090.0, + "93": 848.0, + "94": 1158.0, + "95": 1173.0, + "96": 1140.0, + "97": 1074.0, + "98": 1203.0, + "99": 1141.0, + "100": 1111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 689356288.0, + "2": 689356288.0, + "3": 689356288.0, + "4": 689356288.0, + "5": 689356288.0, + "6": 689356288.0, + "7": 689356288.0, + "8": 689356288.0, + "9": 689356288.0, + "10": 689356288.0, + "11": 689356288.0, + "12": 689356288.0, + "13": 689356288.0, + "14": 689356288.0, + "15": 689356288.0, + "16": 689356288.0, + "17": 689356288.0, + "18": 689356288.0, + "19": 689356288.0, + "20": 689356288.0, + "21": 689356288.0, + "22": 689356288.0, + "23": 689356288.0, + "24": 689356288.0, + "25": 689356288.0, + "26": 689356288.0, + "27": 689356288.0, + "28": 689356288.0, + "29": 689356288.0, + "30": 689356288.0, + "31": 689356288.0, + "32": 689356288.0, + "33": 689356288.0, + "34": 689356288.0, + "35": 689356288.0, + "36": 689356288.0, + "37": 689356288.0, + "38": 689356288.0, + "39": 689356288.0, + "40": 689356288.0, + "41": 689356288.0, + "42": 689356288.0, + "43": 689356288.0, + "44": 689356288.0, + "45": 689356288.0, + "46": 689356288.0, + "47": 689356288.0, + "48": 689356288.0, + "49": 689356288.0, + "50": 689356288.0, + "51": 689356288.0, + "52": 689356288.0, + "53": 689356288.0, + "54": 689356288.0, + "55": 689356288.0, + "56": 689356288.0, + "57": 689356288.0, + "58": 689356288.0, + "59": 689356288.0, + "60": 689356288.0, + "61": 689356288.0, + "62": 689356288.0, + "63": 689356288.0, + "64": 689356288.0, + "65": 689356288.0, + "66": 689356288.0, + "67": 689356288.0, + "68": 689356288.0, + "69": 689356288.0, + "70": 689356288.0, + "71": 689356288.0, + "72": 689356288.0, + "73": 689356288.0, + "74": 689356288.0, + "75": 689356288.0, + "76": 689356288.0, + "77": 689356288.0, + "78": 689356288.0, + "79": 689356288.0, + "80": 689356288.0, + "81": 689356288.0, + "82": 689356288.0, + "83": 689356288.0, + "84": 689356288.0, + "85": 689356288.0, + "86": 689356288.0, + "87": 689356288.0, + "88": 689356288.0, + "89": 689356288.0, + "90": 689356288.0, + "91": 689356288.0, + "92": 689356288.0, + "93": 689356288.0, + "94": 689356288.0, + "95": 689356288.0, + "96": 689356288.0, + "97": 689356288.0, + "98": 689356288.0, + "99": 689356288.0, + "100": 689356288.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 959652864.0, + "2": 1221223936.0, + "3": 1221224960.0, + "4": 1221224960.0, + "5": 1221224960.0, + "6": 1221224960.0, + "7": 1221224960.0, + "8": 1221224960.0, + "9": 1221224960.0, + "10": 1221224960.0, + "11": 1221224960.0, + "12": 1221224960.0, + "13": 1221224960.0, + "14": 1221224960.0, + "15": 1221224960.0, + "16": 1221224960.0, + "17": 1221224960.0, + "18": 1221224960.0, + "19": 1221224960.0, + "20": 1221224960.0, + "21": 1221224960.0, + "22": 1221224960.0, + "23": 1221224960.0, + "24": 1221224960.0, + "25": 1221224960.0, + "26": 1221224960.0, + "27": 1221224960.0, + "28": 1221224960.0, + "29": 1221224960.0, + "30": 1221224960.0, + "31": 1221224960.0, + "32": 1221224960.0, + "33": 1221224960.0, + "34": 1221224960.0, + "35": 1221224960.0, + "36": 1221224960.0, + "37": 1221224960.0, + "38": 1221224960.0, + "39": 1221224960.0, + "40": 1221224960.0, + "41": 1221224960.0, + "42": 1221224960.0, + "43": 1221224960.0, + "44": 1221224960.0, + "45": 1221224960.0, + "46": 1221224960.0, + "47": 1221224960.0, + "48": 1221224960.0, + "49": 1221224960.0, + "50": 1221224960.0, + "51": 1221224960.0, + "52": 1221224960.0, + "53": 1221224960.0, + "54": 1221224960.0, + "55": 1221224960.0, + "56": 1221224960.0, + "57": 1221224960.0, + "58": 1221224960.0, + "59": 1221224960.0, + "60": 1221224960.0, + "61": 1221224960.0, + "62": 1221224960.0, + "63": 1221224960.0, + "64": 1221224960.0, + "65": 1221224960.0, + "66": 1221224960.0, + "67": 1221224960.0, + "68": 1221224960.0, + "69": 1221224960.0, + "70": 1221224960.0, + "71": 1221224960.0, + "72": 1221224960.0, + "73": 1221224960.0, + "74": 1221224960.0, + "75": 1221224960.0, + "76": 1221224960.0, + "77": 1221224960.0, + "78": 1221224960.0, + "79": 1221224960.0, + "80": 1221224960.0, + "81": 1221224960.0, + "82": 1221224960.0, + "83": 1221224960.0, + "84": 1221224960.0, + "85": 1221224960.0, + "86": 1221224960.0, + "87": 1221224960.0, + "88": 1221224960.0, + "89": 1221224960.0, + "90": 1221224960.0, + "91": 1221224960.0, + "92": 1221224960.0, + "93": 1221224960.0, + "94": 1221224960.0, + "95": 1221224960.0, + "96": 1221224960.0, + "97": 1221224960.0, + "98": 1221224960.0, + "99": 1221224960.0, + "100": 1221224960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34397, + "2": 0.2989, + "3": 0.28701, + "4": 0.28299, + "5": 0.28509, + "6": 0.28378, + "7": 0.28776, + "8": 0.28423, + "9": 0.28722, + "10": 0.28077, + "11": 0.28936, + "12": 0.28752, + "13": 0.2827, + "14": 0.28574, + "15": 0.28467, + "16": 0.28217, + "17": 0.28486, + "18": 0.28581, + "19": 0.28155, + "20": 0.28509, + "21": 0.28251, + "22": 0.28381, + "23": 0.27876, + "24": 0.28748, + "25": 0.28028, + "26": 0.28778, + "27": 0.28262, + "28": 0.28332, + "29": 0.28115, + "30": 0.28178, + "31": 0.28495, + "32": 0.28165, + "33": 0.28663, + "34": 0.29207, + "35": 0.28688, + "36": 0.27656, + "37": 0.28363, + "38": 0.28429, + "39": 0.28629, + "40": 0.27969, + "41": 0.27978, + "42": 0.28454, + "43": 0.28022, + "44": 0.28402, + "45": 0.27645, + "46": 0.28795, + "47": 0.28097, + "48": 0.28395, + "49": 0.28183, + "50": 0.28615, + "51": 0.28373, + "52": 0.27449, + "53": 0.27345, + "54": 0.27869, + "55": 0.27079, + "56": 0.27901, + "57": 0.27662, + "58": 0.27749, + "59": 0.27681, + "60": 0.27639, + "61": 0.27275, + "62": 0.27644, + "63": 0.27655, + "64": 0.2741, + "65": 0.27749, + "66": 0.27321, + "67": 0.27962, + "68": 0.2759, + "69": 0.27771, + "70": 0.27472, + "71": 0.27602, + "72": 0.27221, + "73": 0.27682, + "74": 0.27563, + "75": 0.27287, + "76": 0.27345, + "77": 0.27491, + "78": 0.27512, + "79": 0.27463, + "80": 0.27721, + "81": 0.27482, + "82": 0.27638, + "83": 0.27219, + "84": 0.27519, + "85": 0.27727, + "86": 0.2756, + "87": 0.27351, + "88": 0.27369, + "89": 0.27604, + "90": 0.27461, + "91": 0.27436, + "92": 0.27679, + "93": 0.27705, + "94": 0.27348, + "95": 0.28014, + "96": 0.27482, + "97": 0.27546, + "98": 0.27381, + "99": 0.27767, + "100": 0.27505 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..27f7687927e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.88759, + "2": 10.90372, + "3": 10.87084, + "4": 10.8703, + "5": 10.9019, + "6": 10.90847, + "7": 10.88782, + "8": 10.87732, + "9": 10.88357, + "10": 10.8685, + "11": 10.881, + "12": 10.88499, + "13": 10.90361, + "14": 10.89973, + "15": 10.84836, + "16": 10.84523, + "17": 10.8009, + "18": 10.82612, + "19": 10.81899, + "20": 10.71771, + "21": 10.69282, + "22": 10.57372, + "23": 10.70806, + "24": 10.58164, + "25": 10.54272, + "26": 10.60193, + "27": 10.59774, + "28": 10.55016, + "29": 10.56339, + "30": 10.33644, + "31": 10.09546, + "32": 10.4367, + "33": 10.43049, + "34": 10.17724, + "35": 10.23973, + "36": 10.1824, + "37": 10.30496, + "38": 10.14903, + "39": 10.35864, + "40": 10.0326, + "41": 10.08767, + "42": 10.16354, + "43": 9.78196, + "44": 9.89592, + "45": 9.76817, + "46": 9.7675, + "47": 10.08837, + "48": 9.78334, + "49": 9.45719, + "50": 9.85325, + "51": 9.78848, + "52": 9.67834, + "53": 10.01957, + "54": 9.90016, + "55": 9.82267, + "56": 9.56373, + "57": 9.41789, + "58": 9.77443, + "59": 9.52365, + "60": 9.43758, + "61": 9.64823, + "62": 9.93687, + "63": 9.30556, + "64": 9.72235, + "65": 8.87846, + "66": 9.65137, + "67": 9.31592, + "68": 9.73885, + "69": 9.74593, + "70": 9.68162, + "71": 9.56047, + "72": 9.53909, + "73": 9.44523, + "74": 8.88643, + "75": 9.37197, + "76": 9.03136, + "77": 10.03086, + "78": 9.6894, + "79": 9.33246, + "80": 9.35658, + "81": 9.43622, + "82": 9.65385, + "83": 9.2576, + "84": 9.3653, + "85": 9.57144, + "86": 9.03654, + "87": 9.55861, + "88": 9.70775, + "89": 9.55527, + "90": 9.7773, + "91": 9.29751, + "92": 9.32182, + "93": 9.0299, + "94": 8.78447, + "95": 9.48561, + "96": 9.48707, + "97": 9.27002, + "98": 9.63516, + "99": 8.83979, + "100": 9.35905 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 568.0, + "2": 629.0, + "3": 632.0, + "4": 645.0, + "5": 701.0, + "6": 581.0, + "7": 683.0, + "8": 582.0, + "9": 635.0, + "10": 541.0, + "11": 670.0, + "12": 548.0, + "13": 678.0, + "14": 681.0, + "15": 687.0, + "16": 686.0, + "17": 698.0, + "18": 652.0, + "19": 625.0, + "20": 614.0, + "21": 657.0, + "22": 589.0, + "23": 691.0, + "24": 607.0, + "25": 633.0, + "26": 695.0, + "27": 697.0, + "28": 701.0, + "29": 744.0, + "30": 666.0, + "31": 582.0, + "32": 675.0, + "33": 703.0, + "34": 648.0, + "35": 699.0, + "36": 763.0, + "37": 803.0, + "38": 848.0, + "39": 846.0, + "40": 769.0, + "41": 806.0, + "42": 858.0, + "43": 708.0, + "44": 779.0, + "45": 854.0, + "46": 804.0, + "47": 892.0, + "48": 866.0, + "49": 827.0, + "50": 819.0, + "51": 913.0, + "52": 837.0, + "53": 1076.0, + "54": 934.0, + "55": 892.0, + "56": 945.0, + "57": 850.0, + "58": 1041.0, + "59": 994.0, + "60": 875.0, + "61": 996.0, + "62": 983.0, + "63": 909.0, + "64": 1115.0, + "65": 922.0, + "66": 1137.0, + "67": 958.0, + "68": 996.0, + "69": 1065.0, + "70": 1077.0, + "71": 1119.0, + "72": 837.0, + "73": 1022.0, + "74": 750.0, + "75": 904.0, + "76": 1058.0, + "77": 1193.0, + "78": 1146.0, + "79": 1023.0, + "80": 1111.0, + "81": 1212.0, + "82": 1045.0, + "83": 1022.0, + "84": 1202.0, + "85": 1159.0, + "86": 885.0, + "87": 1249.0, + "88": 1065.0, + "89": 1158.0, + "90": 1045.0, + "91": 1061.0, + "92": 1143.0, + "93": 908.0, + "94": 1118.0, + "95": 1071.0, + "96": 1147.0, + "97": 1091.0, + "98": 1214.0, + "99": 1103.0, + "100": 1140.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 610712576.0, + "2": 610712576.0, + "3": 610712576.0, + "4": 610712576.0, + "5": 610712576.0, + "6": 610712576.0, + "7": 610712576.0, + "8": 610712576.0, + "9": 610712576.0, + "10": 610712576.0, + "11": 610712576.0, + "12": 610712576.0, + "13": 610712576.0, + "14": 610712576.0, + "15": 610712576.0, + "16": 610712576.0, + "17": 610712576.0, + "18": 610712576.0, + "19": 610712576.0, + "20": 610712576.0, + "21": 610712576.0, + "22": 610712576.0, + "23": 610712576.0, + "24": 610712576.0, + "25": 610712576.0, + "26": 610712576.0, + "27": 610712576.0, + "28": 610712576.0, + "29": 610712576.0, + "30": 610712576.0, + "31": 610712576.0, + "32": 610712576.0, + "33": 610712576.0, + "34": 610712576.0, + "35": 610712576.0, + "36": 610712576.0, + "37": 610712576.0, + "38": 610712576.0, + "39": 610712576.0, + "40": 610712576.0, + "41": 610712576.0, + "42": 610712576.0, + "43": 610712576.0, + "44": 610712576.0, + "45": 610712576.0, + "46": 610712576.0, + "47": 610712576.0, + "48": 610712576.0, + "49": 610712576.0, + "50": 610712576.0, + "51": 610712576.0, + "52": 610712576.0, + "53": 610712576.0, + "54": 610712576.0, + "55": 610712576.0, + "56": 610712576.0, + "57": 610712576.0, + "58": 610712576.0, + "59": 610712576.0, + "60": 610712576.0, + "61": 610712576.0, + "62": 610712576.0, + "63": 610712576.0, + "64": 610712576.0, + "65": 610712576.0, + "66": 610712576.0, + "67": 610712576.0, + "68": 610712576.0, + "69": 610712576.0, + "70": 610712576.0, + "71": 610712576.0, + "72": 610712576.0, + "73": 610712576.0, + "74": 610712576.0, + "75": 610712576.0, + "76": 610712576.0, + "77": 610712576.0, + "78": 610712576.0, + "79": 610712576.0, + "80": 610712576.0, + "81": 610712576.0, + "82": 610712576.0, + "83": 610712576.0, + "84": 610712576.0, + "85": 610712576.0, + "86": 610712576.0, + "87": 610712576.0, + "88": 610712576.0, + "89": 610712576.0, + "90": 610712576.0, + "91": 610712576.0, + "92": 610712576.0, + "93": 610712576.0, + "94": 610712576.0, + "95": 610712576.0, + "96": 610712576.0, + "97": 610712576.0, + "98": 610712576.0, + "99": 610712576.0, + "100": 610712576.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 882344448.0, + "2": 1142590976.0, + "3": 1142590976.0, + "4": 1142590976.0, + "5": 1142590976.0, + "6": 1142590976.0, + "7": 1142590976.0, + "8": 1142590976.0, + "9": 1142590976.0, + "10": 1142590976.0, + "11": 1142590976.0, + "12": 1142590976.0, + "13": 1142590976.0, + "14": 1142590976.0, + "15": 1142605824.0, + "16": 1142605824.0, + "17": 1142605824.0, + "18": 1142605824.0, + "19": 1142605824.0, + "20": 1142605824.0, + "21": 1142605824.0, + "22": 1142605824.0, + "23": 1142605824.0, + "24": 1142605824.0, + "25": 1142605824.0, + "26": 1142605824.0, + "27": 1142605824.0, + "28": 1142605824.0, + "29": 1142605824.0, + "30": 1142605824.0, + "31": 1142605824.0, + "32": 1142605824.0, + "33": 1142605824.0, + "34": 1142605824.0, + "35": 1142605824.0, + "36": 1142605824.0, + "37": 1142605824.0, + "38": 1142605824.0, + "39": 1142605824.0, + "40": 1142605824.0, + "41": 1142605824.0, + "42": 1142605824.0, + "43": 1142605824.0, + "44": 1142605824.0, + "45": 1142605824.0, + "46": 1142605824.0, + "47": 1142605824.0, + "48": 1142605824.0, + "49": 1142605824.0, + "50": 1142605824.0, + "51": 1142605824.0, + "52": 1142605824.0, + "53": 1142605824.0, + "54": 1142605824.0, + "55": 1142605824.0, + "56": 1142605824.0, + "57": 1142605824.0, + "58": 1142605824.0, + "59": 1142605824.0, + "60": 1142605824.0, + "61": 1142605824.0, + "62": 1142605824.0, + "63": 1142605824.0, + "64": 1142605824.0, + "65": 1142605824.0, + "66": 1142605824.0, + "67": 1142605824.0, + "68": 1142605824.0, + "69": 1142605824.0, + "70": 1142605824.0, + "71": 1142605824.0, + "72": 1142605824.0, + "73": 1142605824.0, + "74": 1142605824.0, + "75": 1142605824.0, + "76": 1142605824.0, + "77": 1142605824.0, + "78": 1142605824.0, + "79": 1142605824.0, + "80": 1142605824.0, + "81": 1142605824.0, + "82": 1142605824.0, + "83": 1142605824.0, + "84": 1142605824.0, + "85": 1142605824.0, + "86": 1142605824.0, + "87": 1142605824.0, + "88": 1142605824.0, + "89": 1142605824.0, + "90": 1142605824.0, + "91": 1142605824.0, + "92": 1142605824.0, + "93": 1142605824.0, + "94": 1142605824.0, + "95": 1142605824.0, + "96": 1142605824.0, + "97": 1142605824.0, + "98": 1142605824.0, + "99": 1142605824.0, + "100": 1142605824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.61399, + "2": 0.3945, + "3": 0.34953, + "4": 0.35042, + "5": 0.35976, + "6": 0.34775, + "7": 0.34855, + "8": 0.3567, + "9": 0.57776, + "10": 0.35283, + "11": 0.34546, + "12": 0.66208, + "13": 0.3538, + "14": 0.33888, + "15": 0.34934, + "16": 0.3406, + "17": 0.34067, + "18": 0.34972, + "19": 0.33929, + "20": 0.57923, + "21": 0.33789, + "22": 0.63069, + "23": 0.33968, + "24": 0.3363, + "25": 0.35184, + "26": 0.33895, + "27": 0.33764, + "28": 0.36204, + "29": 0.33822, + "30": 0.3377, + "31": 0.35301, + "32": 0.33764, + "33": 0.33768, + "34": 0.35102, + "35": 0.33833, + "36": 0.33797, + "37": 0.35167, + "38": 0.33758, + "39": 0.33772, + "40": 0.34854, + "41": 0.33774, + "42": 0.33744, + "43": 0.35268, + "44": 0.33831, + "45": 0.34111, + "46": 0.36265, + "47": 0.33842, + "48": 0.33892, + "49": 0.35205, + "50": 0.33895, + "51": 0.35452, + "52": 0.3491, + "53": 0.34427, + "54": 0.3643, + "55": 0.34634, + "56": 0.34328, + "57": 0.35888, + "58": 0.34339, + "59": 0.3441, + "60": 0.35965, + "61": 0.34295, + "62": 0.3437, + "63": 0.35875, + "64": 0.34325, + "65": 0.34385, + "66": 0.35947, + "67": 0.34189, + "68": 0.34267, + "69": 0.35835, + "70": 0.3399, + "71": 0.34054, + "72": 0.36119, + "73": 0.3405, + "74": 0.34184, + "75": 0.36047, + "76": 0.34108, + "77": 0.35201, + "78": 0.3566, + "79": 0.34417, + "80": 0.36209, + "81": 0.3499, + "82": 0.34382, + "83": 0.35876, + "84": 0.34299, + "85": 0.34373, + "86": 0.3589, + "87": 0.3438, + "88": 0.3435, + "89": 0.35918, + "90": 0.34314, + "91": 0.34454, + "92": 0.3605, + "93": 0.35594, + "94": 0.34422, + "95": 0.36259, + "96": 0.34401, + "97": 0.34507, + "98": 0.3692, + "99": 0.34387, + "100": 0.35445 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..d39fc02d394 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.88759, + "2": 10.90372, + "3": 10.87084, + "4": 10.87028, + "5": 10.90194, + "6": 10.90848, + "7": 10.88784, + "8": 10.87729, + "9": 10.8836, + "10": 10.86849, + "11": 10.88103, + "12": 10.88497, + "13": 10.90361, + "14": 10.89973, + "15": 10.84833, + "16": 10.84522, + "17": 10.80087, + "18": 10.82613, + "19": 10.81897, + "20": 10.7177, + "21": 10.69285, + "22": 10.57376, + "23": 10.70805, + "24": 10.5816, + "25": 10.54269, + "26": 10.60192, + "27": 10.59777, + "28": 10.55013, + "29": 10.5634, + "30": 10.3364, + "31": 10.09543, + "32": 10.43669, + "33": 10.43049, + "34": 10.17722, + "35": 10.23976, + "36": 10.18239, + "37": 10.30493, + "38": 10.14901, + "39": 10.35864, + "40": 10.03267, + "41": 10.08765, + "42": 10.16354, + "43": 9.78194, + "44": 9.89592, + "45": 9.76819, + "46": 9.76746, + "47": 10.08836, + "48": 9.78334, + "49": 9.45723, + "50": 9.85323, + "51": 9.78852, + "52": 9.67832, + "53": 10.01958, + "54": 9.90021, + "55": 9.82267, + "56": 9.56373, + "57": 9.41792, + "58": 9.77442, + "59": 9.52363, + "60": 9.43757, + "61": 9.64824, + "62": 9.93692, + "63": 9.30557, + "64": 9.72235, + "65": 8.87843, + "66": 9.65136, + "67": 9.31594, + "68": 9.7388, + "69": 9.74596, + "70": 9.68161, + "71": 9.5605, + "72": 9.53909, + "73": 9.4452, + "74": 8.88639, + "75": 9.372, + "76": 9.03138, + "77": 10.03084, + "78": 9.68943, + "79": 9.33251, + "80": 9.35653, + "81": 9.4362, + "82": 9.65384, + "83": 9.2576, + "84": 9.36531, + "85": 9.57145, + "86": 9.0365, + "87": 9.55862, + "88": 9.70774, + "89": 9.55529, + "90": 9.7773, + "91": 9.29748, + "92": 9.32182, + "93": 9.02991, + "94": 8.78449, + "95": 9.48563, + "96": 9.48709, + "97": 9.27007, + "98": 9.63511, + "99": 8.83981, + "100": 9.35907 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 600.0, + "2": 622.0, + "3": 611.0, + "4": 564.0, + "5": 653.0, + "6": 733.0, + "7": 686.0, + "8": 617.0, + "9": 679.0, + "10": 535.0, + "11": 644.0, + "12": 616.0, + "13": 708.0, + "14": 646.0, + "15": 648.0, + "16": 648.0, + "17": 683.0, + "18": 638.0, + "19": 643.0, + "20": 587.0, + "21": 656.0, + "22": 578.0, + "23": 707.0, + "24": 640.0, + "25": 626.0, + "26": 675.0, + "27": 697.0, + "28": 740.0, + "29": 731.0, + "30": 656.0, + "31": 589.0, + "32": 704.0, + "33": 740.0, + "34": 711.0, + "35": 677.0, + "36": 723.0, + "37": 790.0, + "38": 759.0, + "39": 846.0, + "40": 797.0, + "41": 748.0, + "42": 817.0, + "43": 706.0, + "44": 809.0, + "45": 749.0, + "46": 812.0, + "47": 914.0, + "48": 890.0, + "49": 795.0, + "50": 864.0, + "51": 963.0, + "52": 907.0, + "53": 1040.0, + "54": 981.0, + "55": 836.0, + "56": 1022.0, + "57": 804.0, + "58": 964.0, + "59": 1012.0, + "60": 849.0, + "61": 996.0, + "62": 1016.0, + "63": 890.0, + "64": 1092.0, + "65": 1006.0, + "66": 1113.0, + "67": 916.0, + "68": 1065.0, + "69": 1073.0, + "70": 1156.0, + "71": 1034.0, + "72": 844.0, + "73": 1014.0, + "74": 748.0, + "75": 893.0, + "76": 1008.0, + "77": 1179.0, + "78": 1170.0, + "79": 1060.0, + "80": 1130.0, + "81": 1160.0, + "82": 1011.0, + "83": 964.0, + "84": 1205.0, + "85": 1082.0, + "86": 842.0, + "87": 1113.0, + "88": 1053.0, + "89": 1124.0, + "90": 1058.0, + "91": 1066.0, + "92": 1170.0, + "93": 894.0, + "94": 1207.0, + "95": 1104.0, + "96": 1196.0, + "97": 1081.0, + "98": 1247.0, + "99": 1088.0, + "100": 1138.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 610712576.0, + "2": 610712576.0, + "3": 610712576.0, + "4": 610712576.0, + "5": 610712576.0, + "6": 610712576.0, + "7": 610712576.0, + "8": 610712576.0, + "9": 610712576.0, + "10": 610712576.0, + "11": 610712576.0, + "12": 610712576.0, + "13": 610712576.0, + "14": 610712576.0, + "15": 610712576.0, + "16": 610712576.0, + "17": 610712576.0, + "18": 610712576.0, + "19": 610712576.0, + "20": 610712576.0, + "21": 610712576.0, + "22": 610712576.0, + "23": 610712576.0, + "24": 610712576.0, + "25": 610712576.0, + "26": 610712576.0, + "27": 610712576.0, + "28": 610712576.0, + "29": 610712576.0, + "30": 610712576.0, + "31": 610712576.0, + "32": 610712576.0, + "33": 610712576.0, + "34": 610712576.0, + "35": 610712576.0, + "36": 610712576.0, + "37": 610712576.0, + "38": 610712576.0, + "39": 610712576.0, + "40": 610712576.0, + "41": 610712576.0, + "42": 610712576.0, + "43": 610712576.0, + "44": 610712576.0, + "45": 610712576.0, + "46": 610712576.0, + "47": 610712576.0, + "48": 610712576.0, + "49": 610712576.0, + "50": 610712576.0, + "51": 610712576.0, + "52": 610712576.0, + "53": 610712576.0, + "54": 610712576.0, + "55": 610712576.0, + "56": 610712576.0, + "57": 610712576.0, + "58": 610712576.0, + "59": 610712576.0, + "60": 610712576.0, + "61": 610712576.0, + "62": 610712576.0, + "63": 610712576.0, + "64": 610712576.0, + "65": 610712576.0, + "66": 610712576.0, + "67": 610712576.0, + "68": 610712576.0, + "69": 610712576.0, + "70": 610712576.0, + "71": 610712576.0, + "72": 610712576.0, + "73": 610712576.0, + "74": 610712576.0, + "75": 610712576.0, + "76": 610712576.0, + "77": 610712576.0, + "78": 610712576.0, + "79": 610712576.0, + "80": 610712576.0, + "81": 610712576.0, + "82": 610712576.0, + "83": 610712576.0, + "84": 610712576.0, + "85": 610712576.0, + "86": 610712576.0, + "87": 610712576.0, + "88": 610712576.0, + "89": 610712576.0, + "90": 610712576.0, + "91": 610712576.0, + "92": 610712576.0, + "93": 610712576.0, + "94": 610712576.0, + "95": 610712576.0, + "96": 610712576.0, + "97": 610712576.0, + "98": 610712576.0, + "99": 610712576.0, + "100": 610712576.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 879199232.0, + "2": 1141542400.0, + "3": 1141557248.0, + "4": 1141557248.0, + "5": 1141557248.0, + "6": 1141557248.0, + "7": 1141557248.0, + "8": 1141557248.0, + "9": 1141557248.0, + "10": 1141557248.0, + "11": 1141557248.0, + "12": 1141557248.0, + "13": 1141557248.0, + "14": 1141557248.0, + "15": 1141557248.0, + "16": 1141557248.0, + "17": 1141557248.0, + "18": 1141557248.0, + "19": 1141557248.0, + "20": 1141557248.0, + "21": 1141557248.0, + "22": 1141557248.0, + "23": 1141557248.0, + "24": 1141557248.0, + "25": 1141557248.0, + "26": 1141557248.0, + "27": 1141557248.0, + "28": 1141557248.0, + "29": 1141557248.0, + "30": 1141557248.0, + "31": 1141557248.0, + "32": 1141557248.0, + "33": 1141557248.0, + "34": 1141557248.0, + "35": 1141557248.0, + "36": 1141557248.0, + "37": 1141557248.0, + "38": 1141557248.0, + "39": 1141557248.0, + "40": 1141557248.0, + "41": 1141557248.0, + "42": 1141557248.0, + "43": 1141557248.0, + "44": 1141557248.0, + "45": 1141557248.0, + "46": 1141557248.0, + "47": 1141557248.0, + "48": 1141557248.0, + "49": 1141557248.0, + "50": 1141557248.0, + "51": 1141557248.0, + "52": 1141557248.0, + "53": 1141557248.0, + "54": 1141557248.0, + "55": 1141557248.0, + "56": 1141557248.0, + "57": 1141557248.0, + "58": 1141557248.0, + "59": 1141557248.0, + "60": 1141557248.0, + "61": 1142604800.0, + "62": 1142604800.0, + "63": 1142604800.0, + "64": 1142604800.0, + "65": 1142604800.0, + "66": 1142605824.0, + "67": 1142605824.0, + "68": 1142605824.0, + "69": 1142605824.0, + "70": 1142605824.0, + "71": 1142605824.0, + "72": 1142605824.0, + "73": 1142605824.0, + "74": 1142605824.0, + "75": 1142605824.0, + "76": 1142605824.0, + "77": 1142605824.0, + "78": 1142605824.0, + "79": 1142605824.0, + "80": 1142605824.0, + "81": 1142605824.0, + "82": 1142605824.0, + "83": 1142605824.0, + "84": 1142605824.0, + "85": 1142605824.0, + "86": 1142605824.0, + "87": 1142605824.0, + "88": 1142605824.0, + "89": 1142605824.0, + "90": 1142605824.0, + "91": 1142605824.0, + "92": 1142605824.0, + "93": 1143639552.0, + "94": 1143639552.0, + "95": 1143639552.0, + "96": 1143639552.0, + "97": 1143639552.0, + "98": 1143639552.0, + "99": 1143639552.0, + "100": 1143639552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.52918, + "2": 0.38912, + "3": 0.35372, + "4": 0.34811, + "5": 0.35505, + "6": 0.35402, + "7": 0.55808, + "8": 0.3492, + "9": 0.34355, + "10": 0.82935, + "11": 0.34715, + "12": 0.34905, + "13": 0.55638, + "14": 0.35683, + "15": 0.34903, + "16": 0.34374, + "17": 0.35024, + "18": 0.35007, + "19": 0.34305, + "20": 0.35453, + "21": 0.3508, + "22": 0.35066, + "23": 0.34925, + "24": 0.35006, + "25": 0.34932, + "26": 0.66663, + "27": 0.34789, + "28": 0.34677, + "29": 0.34709, + "30": 0.35185, + "31": 0.34811, + "32": 0.35284, + "33": 0.35196, + "34": 0.35397, + "35": 0.34638, + "36": 0.35167, + "37": 0.35284, + "38": 0.34596, + "39": 0.35367, + "40": 0.35293, + "41": 0.34542, + "42": 0.35234, + "43": 0.35494, + "44": 0.34767, + "45": 0.35264, + "46": 0.35205, + "47": 0.35099, + "48": 0.34893, + "49": 0.34959, + "50": 0.34935, + "51": 0.35425, + "52": 0.34505, + "53": 0.34281, + "54": 0.35622, + "55": 0.3559, + "56": 0.34855, + "57": 0.34974, + "58": 0.34693, + "59": 0.34844, + "60": 0.34963, + "61": 0.34651, + "62": 0.349, + "63": 0.35001, + "64": 0.34701, + "65": 0.34907, + "66": 0.34895, + "67": 0.34615, + "68": 0.34859, + "69": 0.36095, + "70": 0.34112, + "71": 0.34777, + "72": 0.35188, + "73": 0.34151, + "74": 0.34797, + "75": 0.35077, + "76": 0.34341, + "77": 0.35012, + "78": 0.34839, + "79": 0.34146, + "80": 0.35541, + "81": 0.34764, + "82": 0.34184, + "83": 0.35606, + "84": 0.34949, + "85": 0.34885, + "86": 0.3509, + "87": 0.35235, + "88": 0.34695, + "89": 0.35078, + "90": 0.35066, + "91": 0.352, + "92": 0.34948, + "93": 0.35191, + "94": 0.35111, + "95": 0.35751, + "96": 0.3453, + "97": 0.3509, + "98": 0.35322, + "99": 0.34448, + "100": 0.35525 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 2632047f775..f1d9edf458f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.84466, + "2": 10.84794, + "3": 10.84923, + "4": 10.8433, "5": 10.88246, + "6": 10.8808, + "7": 10.86574, + "8": 10.85417, + "9": 10.85542, "10": 10.81812, + "11": 10.88726, + "12": 10.86329, + "13": 10.86656, + "14": 10.884, "15": 10.8231, + "16": 10.82809, + "17": 10.79467, + "18": 10.81466, + "19": 10.80122, "20": 10.71614, + "21": 10.69886, + "22": 10.56738, + "23": 10.71707, + "24": 10.60503, "25": 10.55053, + "26": 10.60941, + "27": 10.62543, + "28": 10.57767, + "29": 10.59725, "30": 10.38488, + "31": 10.15554, + "32": 10.48231, + "33": 10.4763, + "34": 10.2393, "35": 10.29064, + "36": 10.25146, + "37": 10.35662, + "38": 10.21142, + "39": 10.42144, "40": 10.11569, + "41": 10.16423, + "42": 10.23644, + "43": 9.86597, + "44": 9.98146, "45": 9.86983, + "46": 9.85349, + "47": 10.16995, + "48": 9.876, + "49": 9.57237, "50": 9.92525, + "51": 9.8709, + "52": 9.7737, + "53": 10.08149, + "54": 9.97376, "55": 9.90036, + "56": 9.64783, + "57": 9.50136, + "58": 9.85199, + "59": 9.6034, "60": 9.50993, + "61": 9.71315, + "62": 9.99373, + "63": 9.39358, + "64": 9.78904, "65": 8.96358, + "66": 9.71142, + "67": 9.38175, + "68": 9.79833, + "69": 9.80889, "70": 9.75039, + "71": 9.62004, + "72": 9.59387, + "73": 9.50631, + "74": 8.94916, "75": 9.43188, + "76": 9.08702, + "77": 10.06886, + "78": 9.73459, + "79": 9.38325, "80": 9.41272, + "81": 9.48499, + "82": 9.70672, + "83": 9.30939, + "84": 9.42428, "85": 9.61991, + "86": 9.07811, + "87": 9.59541, + "88": 9.75596, + "89": 9.60274, "90": 9.82165, + "91": 9.34268, + "92": 9.35878, + "93": 9.08116, + "94": 8.83791, "95": 9.5238, + "96": 9.53556, + "97": 9.31807, + "98": 9.68183, + "99": 8.89422, "100": 9.40138 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1713.0, + "2": 1750.0, + "3": 1744.0, + "4": 1895.0, "5": 1839.0, + "6": 1881.0, + "7": 1850.0, + "8": 1743.0, + "9": 1810.0, "10": 1452.0, + "11": 1886.0, + "12": 1752.0, + "13": 1834.0, + "14": 1774.0, "15": 1909.0, + "16": 1803.0, + "17": 1927.0, + "18": 1765.0, + "19": 1847.0, "20": 1707.0, + "21": 1950.0, + "22": 1794.0, + "23": 1974.0, + "24": 1676.0, "25": 1652.0, + "26": 1774.0, + "27": 1799.0, + "28": 2135.0, + "29": 2048.0, "30": 2032.0, + "31": 1599.0, + "32": 1929.0, + "33": 2143.0, + "34": 1874.0, "35": 1974.0, + "36": 2011.0, + "37": 2364.0, + "38": 2199.0, + "39": 2363.0, "40": 2239.0, + "41": 2269.0, + "42": 2228.0, + "43": 1972.0, + "44": 2070.0, "45": 2033.0, + "46": 2357.0, + "47": 2520.0, + "48": 2316.0, + "49": 2307.0, "50": 2302.0, + "51": 2514.0, + "52": 2430.0, + "53": 2840.0, + "54": 2677.0, "55": 2394.0, + "56": 2601.0, + "57": 2341.0, + "58": 2837.0, + "59": 2789.0, "60": 2425.0, + "61": 2923.0, + "62": 2591.0, + "63": 2416.0, + "64": 2937.0, "65": 2572.0, + "66": 3008.0, + "67": 2843.0, + "68": 2761.0, + "69": 2834.0, "70": 3108.0, + "71": 2989.0, + "72": 2316.0, + "73": 2950.0, + "74": 1899.0, "75": 2378.0, + "76": 2962.0, + "77": 3343.0, + "78": 3183.0, + "79": 2979.0, "80": 3209.0, + "81": 3583.0, + "82": 3160.0, + "83": 2776.0, + "84": 3242.0, "85": 3425.0, + "86": 2720.0, + "87": 3820.0, + "88": 3050.0, + "89": 3297.0, "90": 3069.0, + "91": 2685.0, + "92": 3061.0, + "93": 2584.0, + "94": 3338.0, "95": 3406.0, + "96": 3389.0, + "97": 3104.0, + "98": 3583.0, + "99": 3229.0, "100": 3225.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 490700288.0, + "2": 490700288.0, + "3": 490700288.0, + "4": 490700288.0, "5": 490700288.0, + "6": 490700288.0, + "7": 490700288.0, + "8": 490700288.0, + "9": 490700288.0, "10": 490700288.0, + "11": 490700288.0, + "12": 490700288.0, + "13": 490700288.0, + "14": 490700288.0, "15": 490700288.0, + "16": 490700288.0, + "17": 490700288.0, + "18": 490700288.0, + "19": 490700288.0, "20": 490700288.0, + "21": 490700288.0, + "22": 490700288.0, + "23": 490700288.0, + "24": 490700288.0, "25": 490700288.0, + "26": 490700288.0, + "27": 490700288.0, + "28": 490700288.0, + "29": 490700288.0, "30": 490700288.0, + "31": 490700288.0, + "32": 490700288.0, + "33": 490700288.0, + "34": 490700288.0, "35": 490700288.0, + "36": 490700288.0, + "37": 490700288.0, + "38": 490700288.0, + "39": 490700288.0, "40": 490700288.0, + "41": 490700288.0, + "42": 490700288.0, + "43": 490700288.0, + "44": 490700288.0, "45": 490700288.0, + "46": 490700288.0, + "47": 490700288.0, + "48": 490700288.0, + "49": 490700288.0, "50": 490700288.0, + "51": 490700288.0, + "52": 490700288.0, + "53": 490700288.0, + "54": 490700288.0, "55": 490700288.0, + "56": 490700288.0, + "57": 490700288.0, + "58": 490700288.0, + "59": 490700288.0, "60": 490700288.0, + "61": 490700288.0, + "62": 490700288.0, + "63": 490700288.0, + "64": 490700288.0, "65": 490700288.0, + "66": 490700288.0, + "67": 490700288.0, + "68": 490700288.0, + "69": 490700288.0, "70": 490700288.0, + "71": 490700288.0, + "72": 490700288.0, + "73": 490700288.0, + "74": 490700288.0, "75": 490700288.0, + "76": 490700288.0, + "77": 490700288.0, + "78": 490700288.0, + "79": 490700288.0, "80": 490700288.0, + "81": 490700288.0, + "82": 490700288.0, + "83": 490700288.0, + "84": 490700288.0, "85": 490700288.0, + "86": 490700288.0, + "87": 490700288.0, + "88": 490700288.0, + "89": 490700288.0, "90": 490700288.0, + "91": 490700288.0, + "92": 490700288.0, + "93": 490700288.0, + "94": 490700288.0, "95": 490700288.0, + "96": 490700288.0, + "97": 490700288.0, + "98": 490700288.0, + "99": 490700288.0, "100": 490700288.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1553275392.0, + "2": 1681702400.0, + "3": 1681702400.0, + "4": 1681702400.0, "5": 1681702400.0, + "6": 1681702400.0, + "7": 1681702400.0, + "8": 1681702400.0, + "9": 1681702400.0, "10": 1681702400.0, + "11": 1681702400.0, + "12": 1681702400.0, + "13": 1681702400.0, + "14": 1681702400.0, "15": 1681702400.0, + "16": 1681702400.0, + "17": 1681702400.0, + "18": 1681702400.0, + "19": 1681702400.0, "20": 1681702400.0, + "21": 1681702400.0, + "22": 1681702400.0, + "23": 1681702400.0, + "24": 1681702400.0, "25": 1681702400.0, + "26": 1681702400.0, + "27": 1681702400.0, + "28": 1681702400.0, + "29": 1681702400.0, "30": 1681702400.0, + "31": 1681702400.0, + "32": 1681702400.0, + "33": 1681702400.0, + "34": 1681702400.0, "35": 1681702400.0, + "36": 1681702400.0, + "37": 1681702400.0, + "38": 1681702400.0, + "39": 1681702400.0, "40": 1681702400.0, + "41": 1681702400.0, + "42": 1681702400.0, + "43": 1681702400.0, + "44": 1681702400.0, "45": 1681702400.0, + "46": 1681702400.0, + "47": 1681702400.0, + "48": 1681702400.0, + "49": 1681702400.0, "50": 1681702400.0, + "51": 1681702400.0, + "52": 1681702400.0, + "53": 1681702400.0, + "54": 1681702400.0, "55": 1681702400.0, + "56": 1681702400.0, + "57": 1681702400.0, + "58": 1681702400.0, + "59": 1681702400.0, "60": 1681702400.0, + "61": 1681702400.0, + "62": 1681702400.0, + "63": 1681702400.0, + "64": 1681702400.0, "65": 1681702400.0, + "66": 1681702400.0, + "67": 1681702400.0, + "68": 1681702400.0, + "69": 1681702400.0, "70": 1681702400.0, + "71": 1681702400.0, + "72": 1681702400.0, + "73": 1681702400.0, + "74": 1681702400.0, "75": 1681702400.0, + "76": 1681702400.0, + "77": 1681702400.0, + "78": 1681702400.0, + "79": 1681702400.0, "80": 1681702400.0, + "81": 1681702400.0, + "82": 1681702400.0, + "83": 1681702400.0, + "84": 1681702400.0, "85": 1681702400.0, + "86": 1681702400.0, + "87": 1681702400.0, + "88": 1681702400.0, + "89": 1681702400.0, "90": 1681702400.0, + "91": 1681702400.0, + "92": 1681702400.0, + "93": 1681702400.0, + "94": 1681702400.0, "95": 1681702400.0, + "96": 1681702400.0, + "97": 1681702400.0, + "98": 1681702400.0, + "99": 1681702400.0, "100": 1681702400.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.9076, - "5": 0.11074, - "10": 0.12173, - "15": 0.11269, - "20": 0.11096, - "25": 0.11356, - "30": 0.11295, - "35": 0.11469, - "40": 0.11165, - "45": 0.11166, - "50": 0.11293, - "55": 0.11499, - "60": 0.11319, - "65": 0.11468, - "70": 0.11141, - "75": 0.11225, - "80": 0.11302, - "85": 0.11225, - "90": 0.11321, - "95": 0.11254, - "100": 0.1116 + "1": 12.86117, + "2": 0.13933, + "3": 0.12865, + "4": 0.12909, + "5": 0.13086, + "6": 0.12937, + "7": 0.12955, + "8": 0.12832, + "9": 0.13012, + "10": 0.12917, + "11": 0.13042, + "12": 0.13029, + "13": 0.12973, + "14": 0.1288, + "15": 0.13228, + "16": 0.13052, + "17": 0.13054, + "18": 0.12967, + "19": 0.13242, + "20": 0.12969, + "21": 0.13088, + "22": 0.13019, + "23": 0.12965, + "24": 0.12899, + "25": 0.13258, + "26": 0.13001, + "27": 0.12913, + "28": 0.13084, + "29": 0.13114, + "30": 0.13032, + "31": 0.13065, + "32": 0.13047, + "33": 0.13027, + "34": 0.13197, + "35": 0.13065, + "36": 0.13067, + "37": 0.12989, + "38": 0.13114, + "39": 0.12933, + "40": 0.12861, + "41": 0.12817, + "42": 0.13081, + "43": 0.12928, + "44": 0.13005, + "45": 0.13082, + "46": 0.12995, + "47": 0.12857, + "48": 0.13137, + "49": 0.12979, + "50": 0.13191, + "51": 0.15409, + "52": 0.13157, + "53": 0.14032, + "54": 0.13375, + "55": 0.13825, + "56": 0.13176, + "57": 0.13198, + "58": 0.13061, + "59": 0.12937, + "60": 0.1313, + "61": 0.14432, + "62": 0.1338, + "63": 0.13267, + "64": 0.13096, + "65": 0.13182, + "66": 0.13165, + "67": 0.13147, + "68": 0.13711, + "69": 0.13191, + "70": 0.13223, + "71": 0.13057, + "72": 0.13123, + "73": 0.13196, + "74": 0.1341, + "75": 0.13029, + "76": 0.13292, + "77": 0.13191, + "78": 0.1325, + "79": 0.13167, + "80": 0.1322, + "81": 0.13122, + "82": 0.1304, + "83": 0.1321, + "84": 0.13338, + "85": 0.13207, + "86": 0.13126, + "87": 0.13079, + "88": 0.13219, + "89": 0.13079, + "90": 0.13174, + "91": 0.13224, + "92": 0.13121, + "93": 0.13434, + "94": 0.13083, + "95": 0.13012, + "96": 0.13136, + "97": 0.13212, + "98": 0.13196, + "99": 0.13215, + "100": 0.13279 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..48eca17dac7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84466, + "2": 10.84794, + "3": 10.84923, + "4": 10.8433, + "5": 10.88246, + "6": 10.8808, + "7": 10.86574, + "8": 10.85417, + "9": 10.85542, + "10": 10.81812, + "11": 10.88726, + "12": 10.86329, + "13": 10.86656, + "14": 10.884, + "15": 10.8231, + "16": 10.82809, + "17": 10.79467, + "18": 10.81466, + "19": 10.80122, + "20": 10.71614, + "21": 10.69886, + "22": 10.56738, + "23": 10.71707, + "24": 10.60503, + "25": 10.55053, + "26": 10.60941, + "27": 10.62543, + "28": 10.57767, + "29": 10.59725, + "30": 10.38488, + "31": 10.15554, + "32": 10.48231, + "33": 10.4763, + "34": 10.2393, + "35": 10.29064, + "36": 10.25146, + "37": 10.35662, + "38": 10.21142, + "39": 10.42144, + "40": 10.11569, + "41": 10.16423, + "42": 10.23644, + "43": 9.86597, + "44": 9.98146, + "45": 9.86983, + "46": 9.85349, + "47": 10.16995, + "48": 9.876, + "49": 9.57237, + "50": 9.92525, + "51": 9.8709, + "52": 9.7737, + "53": 10.08149, + "54": 9.97376, + "55": 9.90036, + "56": 9.64783, + "57": 9.50136, + "58": 9.85199, + "59": 9.6034, + "60": 9.50993, + "61": 9.71315, + "62": 9.99373, + "63": 9.39358, + "64": 9.78904, + "65": 8.96358, + "66": 9.71142, + "67": 9.38175, + "68": 9.79833, + "69": 9.80889, + "70": 9.75039, + "71": 9.62004, + "72": 9.59387, + "73": 9.50631, + "74": 8.94916, + "75": 9.43188, + "76": 9.08702, + "77": 10.06886, + "78": 9.73459, + "79": 9.38325, + "80": 9.41272, + "81": 9.48499, + "82": 9.70672, + "83": 9.30939, + "84": 9.42428, + "85": 9.61991, + "86": 9.07811, + "87": 9.59541, + "88": 9.75596, + "89": 9.60274, + "90": 9.82165, + "91": 9.34268, + "92": 9.35878, + "93": 9.08116, + "94": 8.83791, + "95": 9.5238, + "96": 9.53556, + "97": 9.31807, + "98": 9.68183, + "99": 8.89422, + "100": 9.40138 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1713.0, + "2": 1750.0, + "3": 1744.0, + "4": 1895.0, + "5": 1839.0, + "6": 1881.0, + "7": 1850.0, + "8": 1743.0, + "9": 1810.0, + "10": 1452.0, + "11": 1886.0, + "12": 1752.0, + "13": 1834.0, + "14": 1774.0, + "15": 1909.0, + "16": 1803.0, + "17": 1927.0, + "18": 1765.0, + "19": 1847.0, + "20": 1707.0, + "21": 1950.0, + "22": 1794.0, + "23": 1974.0, + "24": 1676.0, + "25": 1652.0, + "26": 1774.0, + "27": 1799.0, + "28": 2135.0, + "29": 2048.0, + "30": 2032.0, + "31": 1599.0, + "32": 1929.0, + "33": 2143.0, + "34": 1874.0, + "35": 1974.0, + "36": 2011.0, + "37": 2364.0, + "38": 2199.0, + "39": 2363.0, + "40": 2239.0, + "41": 2269.0, + "42": 2228.0, + "43": 1972.0, + "44": 2070.0, + "45": 2033.0, + "46": 2357.0, + "47": 2520.0, + "48": 2316.0, + "49": 2307.0, + "50": 2302.0, + "51": 2514.0, + "52": 2430.0, + "53": 2840.0, + "54": 2677.0, + "55": 2394.0, + "56": 2601.0, + "57": 2341.0, + "58": 2837.0, + "59": 2789.0, + "60": 2425.0, + "61": 2923.0, + "62": 2591.0, + "63": 2416.0, + "64": 2937.0, + "65": 2572.0, + "66": 3008.0, + "67": 2843.0, + "68": 2761.0, + "69": 2834.0, + "70": 3108.0, + "71": 2989.0, + "72": 2316.0, + "73": 2950.0, + "74": 1899.0, + "75": 2378.0, + "76": 2962.0, + "77": 3343.0, + "78": 3183.0, + "79": 2979.0, + "80": 3209.0, + "81": 3583.0, + "82": 3160.0, + "83": 2776.0, + "84": 3242.0, + "85": 3425.0, + "86": 2720.0, + "87": 3820.0, + "88": 3050.0, + "89": 3297.0, + "90": 3069.0, + "91": 2685.0, + "92": 3061.0, + "93": 2584.0, + "94": 3338.0, + "95": 3406.0, + "96": 3389.0, + "97": 3104.0, + "98": 3583.0, + "99": 3229.0, + "100": 3225.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 490700288.0, + "2": 490700288.0, + "3": 490700288.0, + "4": 490700288.0, + "5": 490700288.0, + "6": 490700288.0, + "7": 490700288.0, + "8": 490700288.0, + "9": 490700288.0, + "10": 490700288.0, + "11": 490700288.0, + "12": 490700288.0, + "13": 490700288.0, + "14": 490700288.0, + "15": 490700288.0, + "16": 490700288.0, + "17": 490700288.0, + "18": 490700288.0, + "19": 490700288.0, + "20": 490700288.0, + "21": 490700288.0, + "22": 490700288.0, + "23": 490700288.0, + "24": 490700288.0, + "25": 490700288.0, + "26": 490700288.0, + "27": 490700288.0, + "28": 490700288.0, + "29": 490700288.0, + "30": 490700288.0, + "31": 490700288.0, + "32": 490700288.0, + "33": 490700288.0, + "34": 490700288.0, + "35": 490700288.0, + "36": 490700288.0, + "37": 490700288.0, + "38": 490700288.0, + "39": 490700288.0, + "40": 490700288.0, + "41": 490700288.0, + "42": 490700288.0, + "43": 490700288.0, + "44": 490700288.0, + "45": 490700288.0, + "46": 490700288.0, + "47": 490700288.0, + "48": 490700288.0, + "49": 490700288.0, + "50": 490700288.0, + "51": 490700288.0, + "52": 490700288.0, + "53": 490700288.0, + "54": 490700288.0, + "55": 490700288.0, + "56": 490700288.0, + "57": 490700288.0, + "58": 490700288.0, + "59": 490700288.0, + "60": 490700288.0, + "61": 490700288.0, + "62": 490700288.0, + "63": 490700288.0, + "64": 490700288.0, + "65": 490700288.0, + "66": 490700288.0, + "67": 490700288.0, + "68": 490700288.0, + "69": 490700288.0, + "70": 490700288.0, + "71": 490700288.0, + "72": 490700288.0, + "73": 490700288.0, + "74": 490700288.0, + "75": 490700288.0, + "76": 490700288.0, + "77": 490700288.0, + "78": 490700288.0, + "79": 490700288.0, + "80": 490700288.0, + "81": 490700288.0, + "82": 490700288.0, + "83": 490700288.0, + "84": 490700288.0, + "85": 490700288.0, + "86": 490700288.0, + "87": 490700288.0, + "88": 490700288.0, + "89": 490700288.0, + "90": 490700288.0, + "91": 490700288.0, + "92": 490700288.0, + "93": 490700288.0, + "94": 490700288.0, + "95": 490700288.0, + "96": 490700288.0, + "97": 490700288.0, + "98": 490700288.0, + "99": 490700288.0, + "100": 490700288.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1553275392.0, + "2": 1681702400.0, + "3": 1681702400.0, + "4": 1681702400.0, + "5": 1681702400.0, + "6": 1681702400.0, + "7": 1681702400.0, + "8": 1681702400.0, + "9": 1681702400.0, + "10": 1681702400.0, + "11": 1681702400.0, + "12": 1681702400.0, + "13": 1681702400.0, + "14": 1681702400.0, + "15": 1681702400.0, + "16": 1681702400.0, + "17": 1681702400.0, + "18": 1681702400.0, + "19": 1681702400.0, + "20": 1681702400.0, + "21": 1681702400.0, + "22": 1681702400.0, + "23": 1681702400.0, + "24": 1681702400.0, + "25": 1681702400.0, + "26": 1681702400.0, + "27": 1681702400.0, + "28": 1681702400.0, + "29": 1681702400.0, + "30": 1681702400.0, + "31": 1681702400.0, + "32": 1681702400.0, + "33": 1681702400.0, + "34": 1681702400.0, + "35": 1681702400.0, + "36": 1681702400.0, + "37": 1681702400.0, + "38": 1681702400.0, + "39": 1681702400.0, + "40": 1681702400.0, + "41": 1681702400.0, + "42": 1681702400.0, + "43": 1681702400.0, + "44": 1681702400.0, + "45": 1681702400.0, + "46": 1681702400.0, + "47": 1681702400.0, + "48": 1681702400.0, + "49": 1681702400.0, + "50": 1681702400.0, + "51": 1681702400.0, + "52": 1681702400.0, + "53": 1681702400.0, + "54": 1681702400.0, + "55": 1681702400.0, + "56": 1681702400.0, + "57": 1681702400.0, + "58": 1681702400.0, + "59": 1681702400.0, + "60": 1681702400.0, + "61": 1681702400.0, + "62": 1681702400.0, + "63": 1681702400.0, + "64": 1681702400.0, + "65": 1681702400.0, + "66": 1681702400.0, + "67": 1681702400.0, + "68": 1681702400.0, + "69": 1681702400.0, + "70": 1681702400.0, + "71": 1681702400.0, + "72": 1681702400.0, + "73": 1681702400.0, + "74": 1681702400.0, + "75": 1681702400.0, + "76": 1681702400.0, + "77": 1681702400.0, + "78": 1681702400.0, + "79": 1681702400.0, + "80": 1681702400.0, + "81": 1681702400.0, + "82": 1681702400.0, + "83": 1681702400.0, + "84": 1681702400.0, + "85": 1681702400.0, + "86": 1681702400.0, + "87": 1681702400.0, + "88": 1681702400.0, + "89": 1681702400.0, + "90": 1681702400.0, + "91": 1681702400.0, + "92": 1681702400.0, + "93": 1681702400.0, + "94": 1681702400.0, + "95": 1681702400.0, + "96": 1681702400.0, + "97": 1681702400.0, + "98": 1681702400.0, + "99": 1681702400.0, + "100": 1681702400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.69891, + "2": 0.13291, + "3": 0.11069, + "4": 0.11005, + "5": 0.11137, + "6": 0.11181, + "7": 0.11024, + "8": 0.1118, + "9": 0.11019, + "10": 0.1115, + "11": 0.10932, + "12": 0.11102, + "13": 0.11122, + "14": 0.10885, + "15": 0.11063, + "16": 0.10921, + "17": 0.11073, + "18": 0.11138, + "19": 0.10984, + "20": 0.1097, + "21": 0.11067, + "22": 0.10976, + "23": 0.11182, + "24": 0.11128, + "25": 0.11361, + "26": 0.11246, + "27": 0.11156, + "28": 0.11079, + "29": 0.11109, + "30": 0.11063, + "31": 0.11335, + "32": 0.11146, + "33": 0.10977, + "34": 0.10982, + "35": 0.11082, + "36": 0.11114, + "37": 0.11175, + "38": 0.11066, + "39": 0.10976, + "40": 0.11142, + "41": 0.10972, + "42": 0.11235, + "43": 0.11078, + "44": 0.11209, + "45": 0.11117, + "46": 0.112, + "47": 0.11091, + "48": 0.11186, + "49": 0.1122, + "50": 0.11209, + "51": 0.11626, + "52": 0.1141, + "53": 0.11342, + "54": 0.11372, + "55": 0.1122, + "56": 0.11383, + "57": 0.1146, + "58": 0.1142, + "59": 0.11394, + "60": 0.1139, + "61": 0.11353, + "62": 0.11377, + "63": 0.11401, + "64": 0.11264, + "65": 0.11272, + "66": 0.11265, + "67": 0.11267, + "68": 0.11872, + "69": 0.1156, + "70": 0.11377, + "71": 0.11536, + "72": 0.11453, + "73": 0.11588, + "74": 0.11658, + "75": 0.11499, + "76": 0.11315, + "77": 0.11296, + "78": 0.11428, + "79": 0.11415, + "80": 0.11548, + "81": 0.11393, + "82": 0.11142, + "83": 0.11373, + "84": 0.1132, + "85": 0.11294, + "86": 0.11271, + "87": 0.11374, + "88": 0.11311, + "89": 0.11318, + "90": 0.1122, + "91": 0.11311, + "92": 0.11396, + "93": 0.11384, + "94": 0.11636, + "95": 0.11934, + "96": 0.12031, + "97": 0.11987, + "98": 0.11805, + "99": 0.12232, + "100": 0.12103 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..077c5e1317a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84466, + "2": 10.84794, + "3": 10.84923, + "4": 10.8433, + "5": 10.88246, + "6": 10.8808, + "7": 10.86574, + "8": 10.85417, + "9": 10.85542, + "10": 10.81812, + "11": 10.88726, + "12": 10.86329, + "13": 10.86656, + "14": 10.884, + "15": 10.8231, + "16": 10.82809, + "17": 10.79467, + "18": 10.81466, + "19": 10.80122, + "20": 10.71614, + "21": 10.69886, + "22": 10.56738, + "23": 10.71707, + "24": 10.60503, + "25": 10.55053, + "26": 10.60941, + "27": 10.62543, + "28": 10.57767, + "29": 10.59725, + "30": 10.38488, + "31": 10.15554, + "32": 10.48231, + "33": 10.4763, + "34": 10.2393, + "35": 10.29064, + "36": 10.25146, + "37": 10.35662, + "38": 10.21142, + "39": 10.42144, + "40": 10.11569, + "41": 10.16423, + "42": 10.23644, + "43": 9.86597, + "44": 9.98146, + "45": 9.86983, + "46": 9.85349, + "47": 10.16995, + "48": 9.876, + "49": 9.57237, + "50": 9.92525, + "51": 9.8709, + "52": 9.7737, + "53": 10.08149, + "54": 9.97376, + "55": 9.90036, + "56": 9.64783, + "57": 9.50136, + "58": 9.85199, + "59": 9.6034, + "60": 9.50993, + "61": 9.71315, + "62": 9.99373, + "63": 9.39358, + "64": 9.78904, + "65": 8.96358, + "66": 9.71142, + "67": 9.38175, + "68": 9.79833, + "69": 9.80889, + "70": 9.75039, + "71": 9.62004, + "72": 9.59387, + "73": 9.50631, + "74": 8.94916, + "75": 9.43188, + "76": 9.08702, + "77": 10.06886, + "78": 9.73459, + "79": 9.38325, + "80": 9.41272, + "81": 9.48499, + "82": 9.70672, + "83": 9.30939, + "84": 9.42428, + "85": 9.61991, + "86": 9.07811, + "87": 9.59541, + "88": 9.75596, + "89": 9.60274, + "90": 9.82165, + "91": 9.34268, + "92": 9.35878, + "93": 9.08116, + "94": 8.83791, + "95": 9.5238, + "96": 9.53556, + "97": 9.31807, + "98": 9.68183, + "99": 8.89422, + "100": 9.40138 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1713.0, + "2": 1750.0, + "3": 1744.0, + "4": 1895.0, + "5": 1839.0, + "6": 1881.0, + "7": 1850.0, + "8": 1743.0, + "9": 1810.0, + "10": 1452.0, + "11": 1886.0, + "12": 1752.0, + "13": 1834.0, + "14": 1774.0, + "15": 1909.0, + "16": 1803.0, + "17": 1927.0, + "18": 1765.0, + "19": 1847.0, + "20": 1707.0, + "21": 1950.0, + "22": 1794.0, + "23": 1974.0, + "24": 1676.0, + "25": 1652.0, + "26": 1774.0, + "27": 1799.0, + "28": 2135.0, + "29": 2048.0, + "30": 2032.0, + "31": 1599.0, + "32": 1929.0, + "33": 2143.0, + "34": 1874.0, + "35": 1974.0, + "36": 2011.0, + "37": 2364.0, + "38": 2199.0, + "39": 2363.0, + "40": 2239.0, + "41": 2269.0, + "42": 2228.0, + "43": 1972.0, + "44": 2070.0, + "45": 2033.0, + "46": 2357.0, + "47": 2520.0, + "48": 2316.0, + "49": 2307.0, + "50": 2302.0, + "51": 2514.0, + "52": 2430.0, + "53": 2840.0, + "54": 2677.0, + "55": 2394.0, + "56": 2601.0, + "57": 2341.0, + "58": 2837.0, + "59": 2789.0, + "60": 2425.0, + "61": 2923.0, + "62": 2591.0, + "63": 2416.0, + "64": 2937.0, + "65": 2572.0, + "66": 3008.0, + "67": 2843.0, + "68": 2761.0, + "69": 2834.0, + "70": 3108.0, + "71": 2989.0, + "72": 2316.0, + "73": 2950.0, + "74": 1899.0, + "75": 2378.0, + "76": 2962.0, + "77": 3343.0, + "78": 3183.0, + "79": 2979.0, + "80": 3209.0, + "81": 3583.0, + "82": 3160.0, + "83": 2776.0, + "84": 3242.0, + "85": 3425.0, + "86": 2720.0, + "87": 3820.0, + "88": 3050.0, + "89": 3297.0, + "90": 3069.0, + "91": 2685.0, + "92": 3061.0, + "93": 2584.0, + "94": 3338.0, + "95": 3406.0, + "96": 3389.0, + "97": 3104.0, + "98": 3583.0, + "99": 3229.0, + "100": 3225.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 490700288.0, + "2": 490700288.0, + "3": 490700288.0, + "4": 490700288.0, + "5": 490700288.0, + "6": 490700288.0, + "7": 490700288.0, + "8": 490700288.0, + "9": 490700288.0, + "10": 490700288.0, + "11": 490700288.0, + "12": 490700288.0, + "13": 490700288.0, + "14": 490700288.0, + "15": 490700288.0, + "16": 490700288.0, + "17": 490700288.0, + "18": 490700288.0, + "19": 490700288.0, + "20": 490700288.0, + "21": 490700288.0, + "22": 490700288.0, + "23": 490700288.0, + "24": 490700288.0, + "25": 490700288.0, + "26": 490700288.0, + "27": 490700288.0, + "28": 490700288.0, + "29": 490700288.0, + "30": 490700288.0, + "31": 490700288.0, + "32": 490700288.0, + "33": 490700288.0, + "34": 490700288.0, + "35": 490700288.0, + "36": 490700288.0, + "37": 490700288.0, + "38": 490700288.0, + "39": 490700288.0, + "40": 490700288.0, + "41": 490700288.0, + "42": 490700288.0, + "43": 490700288.0, + "44": 490700288.0, + "45": 490700288.0, + "46": 490700288.0, + "47": 490700288.0, + "48": 490700288.0, + "49": 490700288.0, + "50": 490700288.0, + "51": 490700288.0, + "52": 490700288.0, + "53": 490700288.0, + "54": 490700288.0, + "55": 490700288.0, + "56": 490700288.0, + "57": 490700288.0, + "58": 490700288.0, + "59": 490700288.0, + "60": 490700288.0, + "61": 490700288.0, + "62": 490700288.0, + "63": 490700288.0, + "64": 490700288.0, + "65": 490700288.0, + "66": 490700288.0, + "67": 490700288.0, + "68": 490700288.0, + "69": 490700288.0, + "70": 490700288.0, + "71": 490700288.0, + "72": 490700288.0, + "73": 490700288.0, + "74": 490700288.0, + "75": 490700288.0, + "76": 490700288.0, + "77": 490700288.0, + "78": 490700288.0, + "79": 490700288.0, + "80": 490700288.0, + "81": 490700288.0, + "82": 490700288.0, + "83": 490700288.0, + "84": 490700288.0, + "85": 490700288.0, + "86": 490700288.0, + "87": 490700288.0, + "88": 490700288.0, + "89": 490700288.0, + "90": 490700288.0, + "91": 490700288.0, + "92": 490700288.0, + "93": 490700288.0, + "94": 490700288.0, + "95": 490700288.0, + "96": 490700288.0, + "97": 490700288.0, + "98": 490700288.0, + "99": 490700288.0, + "100": 490700288.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1553275392.0, + "2": 1681702400.0, + "3": 1681702400.0, + "4": 1681702400.0, + "5": 1681702400.0, + "6": 1681702400.0, + "7": 1681702400.0, + "8": 1681702400.0, + "9": 1681702400.0, + "10": 1681702400.0, + "11": 1681702400.0, + "12": 1681702400.0, + "13": 1681702400.0, + "14": 1681702400.0, + "15": 1681702400.0, + "16": 1681702400.0, + "17": 1681702400.0, + "18": 1681702400.0, + "19": 1681702400.0, + "20": 1681702400.0, + "21": 1681702400.0, + "22": 1681702400.0, + "23": 1681702400.0, + "24": 1681702400.0, + "25": 1681702400.0, + "26": 1681702400.0, + "27": 1681702400.0, + "28": 1681702400.0, + "29": 1681702400.0, + "30": 1681702400.0, + "31": 1681702400.0, + "32": 1681702400.0, + "33": 1681702400.0, + "34": 1681702400.0, + "35": 1681702400.0, + "36": 1681702400.0, + "37": 1681702400.0, + "38": 1681702400.0, + "39": 1681702400.0, + "40": 1681702400.0, + "41": 1681702400.0, + "42": 1681702400.0, + "43": 1681702400.0, + "44": 1681702400.0, + "45": 1681702400.0, + "46": 1681702400.0, + "47": 1681702400.0, + "48": 1681702400.0, + "49": 1681702400.0, + "50": 1681702400.0, + "51": 1681702400.0, + "52": 1681702400.0, + "53": 1681702400.0, + "54": 1681702400.0, + "55": 1681702400.0, + "56": 1681702400.0, + "57": 1681702400.0, + "58": 1681702400.0, + "59": 1681702400.0, + "60": 1681702400.0, + "61": 1681702400.0, + "62": 1681702400.0, + "63": 1681702400.0, + "64": 1681702400.0, + "65": 1681702400.0, + "66": 1681702400.0, + "67": 1681702400.0, + "68": 1681702400.0, + "69": 1681702400.0, + "70": 1681702400.0, + "71": 1681702400.0, + "72": 1681702400.0, + "73": 1681702400.0, + "74": 1681702400.0, + "75": 1681702400.0, + "76": 1681702400.0, + "77": 1681702400.0, + "78": 1681702400.0, + "79": 1681702400.0, + "80": 1681702400.0, + "81": 1681702400.0, + "82": 1681702400.0, + "83": 1681702400.0, + "84": 1681702400.0, + "85": 1681702400.0, + "86": 1681702400.0, + "87": 1681702400.0, + "88": 1681702400.0, + "89": 1681702400.0, + "90": 1681702400.0, + "91": 1681702400.0, + "92": 1681702400.0, + "93": 1681702400.0, + "94": 1681702400.0, + "95": 1681702400.0, + "96": 1681702400.0, + "97": 1681702400.0, + "98": 1681702400.0, + "99": 1681702400.0, + "100": 1681702400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.96096, + "2": 0.14328, + "3": 0.13234, + "4": 0.12983, + "5": 0.1339, + "6": 0.13424, + "7": 0.13558, + "8": 0.13644, + "9": 0.13434, + "10": 0.13106, + "11": 0.13377, + "12": 0.13148, + "13": 0.13136, + "14": 0.13331, + "15": 0.13429, + "16": 0.13208, + "17": 0.1316, + "18": 0.13139, + "19": 0.1287, + "20": 0.13199, + "21": 0.1318, + "22": 0.13196, + "23": 0.13019, + "24": 0.1317, + "25": 0.13217, + "26": 0.12983, + "27": 0.12928, + "28": 0.13258, + "29": 0.13441, + "30": 0.13276, + "31": 0.13264, + "32": 0.13228, + "33": 0.13159, + "34": 0.13219, + "35": 0.133, + "36": 0.13166, + "37": 0.13174, + "38": 0.1304, + "39": 0.1314, + "40": 0.13029, + "41": 0.13074, + "42": 0.12839, + "43": 0.13136, + "44": 0.13209, + "45": 0.12923, + "46": 0.13318, + "47": 0.1319, + "48": 0.13259, + "49": 0.13079, + "50": 0.12933, + "51": 0.15172, + "52": 0.1333, + "53": 0.14462, + "54": 0.13216, + "55": 0.13399, + "56": 0.13553, + "57": 0.13325, + "58": 0.13361, + "59": 0.13333, + "60": 0.13354, + "61": 0.13207, + "62": 0.1338, + "63": 0.13105, + "64": 0.13392, + "65": 0.13319, + "66": 0.13384, + "67": 0.13217, + "68": 0.13367, + "69": 0.13229, + "70": 0.13221, + "71": 0.1335, + "72": 0.13557, + "73": 0.13385, + "74": 0.13485, + "75": 0.13327, + "76": 0.13288, + "77": 0.13329, + "78": 0.13402, + "79": 0.13416, + "80": 0.13423, + "81": 0.13316, + "82": 0.13278, + "83": 0.13364, + "84": 0.13264, + "85": 0.13203, + "86": 0.13235, + "87": 0.13381, + "88": 0.13365, + "89": 0.13338, + "90": 0.1334, + "91": 0.13418, + "92": 0.13669, + "93": 0.13477, + "94": 0.13244, + "95": 0.13237, + "96": 0.13182, + "97": 0.13149, + "98": 0.13223, + "99": 0.13163, + "100": 0.1326 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..b9b764a3fd2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.88734, + "2": 10.90383, + "3": 10.88081, + "4": 10.88371, + "5": 10.90948, + "6": 10.91613, + "7": 10.89451, + "8": 10.88622, + "9": 10.89544, + "10": 10.87763, + "11": 10.89061, + "12": 10.89565, + "13": 10.9078, + "14": 10.90725, + "15": 10.86371, + "16": 10.86172, + "17": 10.81949, + "18": 10.84638, + "19": 10.83804, + "20": 10.7509, + "21": 10.72756, + "22": 10.6229, + "23": 10.74449, + "24": 10.63231, + "25": 10.59917, + "26": 10.64491, + "27": 10.64672, + "28": 10.59686, + "29": 10.60675, + "30": 10.40104, + "31": 10.18011, + "32": 10.49048, + "33": 10.48347, + "34": 10.251, + "35": 10.30793, + "36": 10.25618, + "37": 10.36503, + "38": 10.2179, + "39": 10.41024, + "40": 10.10902, + "41": 10.16109, + "42": 10.22733, + "43": 9.87492, + "44": 9.97842, + "45": 9.85831, + "46": 9.85388, + "47": 10.15356, + "48": 9.86194, + "49": 9.55678, + "50": 9.92111, + "51": 9.86199, + "52": 9.75595, + "53": 10.07575, + "54": 9.96137, + "55": 9.88529, + "56": 9.63476, + "57": 9.49273, + "58": 9.83039, + "59": 9.59148, + "60": 9.50737, + "61": 9.70512, + "62": 9.98404, + "63": 9.37583, + "64": 9.77923, + "65": 8.95828, + "66": 9.70623, + "67": 9.37471, + "68": 9.78699, + "69": 9.78826, + "70": 9.72733, + "71": 9.61217, + "72": 9.5913, + "73": 9.49847, + "74": 8.95651, + "75": 9.42571, + "76": 9.09602, + "77": 10.06687, + "78": 9.73141, + "79": 9.37953, + "80": 9.40559, + "81": 9.48179, + "82": 9.694, + "83": 9.31183, + "84": 9.41312, + "85": 9.61572, + "86": 9.07774, + "87": 9.59695, + "88": 9.74877, + "89": 9.60255, + "90": 9.81277, + "91": 9.34555, + "92": 9.36555, + "93": 9.07714, + "94": 8.83102, + "95": 9.52119, + "96": 9.52503, + "97": 9.31354, + "98": 9.6769, + "99": 8.8896, + "100": 9.40111 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1614.0, + "2": 1820.0, + "3": 1724.0, + "4": 1889.0, + "5": 2021.0, + "6": 1920.0, + "7": 1930.0, + "8": 1736.0, + "9": 1989.0, + "10": 1399.0, + "11": 2051.0, + "12": 1859.0, + "13": 2007.0, + "14": 1830.0, + "15": 1872.0, + "16": 1877.0, + "17": 1960.0, + "18": 1747.0, + "19": 1815.0, + "20": 1692.0, + "21": 2039.0, + "22": 1713.0, + "23": 1963.0, + "24": 1743.0, + "25": 1784.0, + "26": 1793.0, + "27": 1860.0, + "28": 1956.0, + "29": 2152.0, + "30": 1900.0, + "31": 1685.0, + "32": 2000.0, + "33": 2085.0, + "34": 1867.0, + "35": 2081.0, + "36": 1975.0, + "37": 2341.0, + "38": 2316.0, + "39": 2438.0, + "40": 2233.0, + "41": 2306.0, + "42": 2319.0, + "43": 2082.0, + "44": 2158.0, + "45": 2144.0, + "46": 2227.0, + "47": 2675.0, + "48": 2473.0, + "49": 2231.0, + "50": 2513.0, + "51": 2611.0, + "52": 2560.0, + "53": 3169.0, + "54": 2698.0, + "55": 2493.0, + "56": 2791.0, + "57": 2298.0, + "58": 3182.0, + "59": 2851.0, + "60": 2440.0, + "61": 2909.0, + "62": 2834.0, + "63": 2389.0, + "64": 3187.0, + "65": 2763.0, + "66": 3321.0, + "67": 2818.0, + "68": 2835.0, + "69": 3037.0, + "70": 3219.0, + "71": 3046.0, + "72": 2359.0, + "73": 2939.0, + "74": 2061.0, + "75": 2601.0, + "76": 2971.0, + "77": 3400.0, + "78": 3295.0, + "79": 3211.0, + "80": 3341.0, + "81": 3756.0, + "82": 3240.0, + "83": 2851.0, + "84": 3378.0, + "85": 3433.0, + "86": 2818.0, + "87": 3852.0, + "88": 3000.0, + "89": 3574.0, + "90": 3019.0, + "91": 2624.0, + "92": 3179.0, + "93": 2831.0, + "94": 3483.0, + "95": 3417.0, + "96": 3492.0, + "97": 3114.0, + "98": 3675.0, + "99": 3172.0, + "100": 3372.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 462455808.0, + "2": 462455808.0, + "3": 462455808.0, + "4": 462455808.0, + "5": 462455808.0, + "6": 462455808.0, + "7": 462455808.0, + "8": 462455808.0, + "9": 462455808.0, + "10": 462455808.0, + "11": 462455808.0, + "12": 462455808.0, + "13": 462455808.0, + "14": 462455808.0, + "15": 462455808.0, + "16": 462455808.0, + "17": 462455808.0, + "18": 462455808.0, + "19": 462455808.0, + "20": 462455808.0, + "21": 462455808.0, + "22": 462455808.0, + "23": 462455808.0, + "24": 462455808.0, + "25": 462455808.0, + "26": 462455808.0, + "27": 462455808.0, + "28": 462455808.0, + "29": 462455808.0, + "30": 462455808.0, + "31": 462455808.0, + "32": 462455808.0, + "33": 462455808.0, + "34": 462455808.0, + "35": 462455808.0, + "36": 462455808.0, + "37": 462455808.0, + "38": 462455808.0, + "39": 462455808.0, + "40": 462455808.0, + "41": 462455808.0, + "42": 462455808.0, + "43": 462455808.0, + "44": 462455808.0, + "45": 462455808.0, + "46": 462455808.0, + "47": 462455808.0, + "48": 462455808.0, + "49": 462455808.0, + "50": 462455808.0, + "51": 462455808.0, + "52": 462455808.0, + "53": 462455808.0, + "54": 462455808.0, + "55": 462455808.0, + "56": 462455808.0, + "57": 462455808.0, + "58": 462455808.0, + "59": 462455808.0, + "60": 462455808.0, + "61": 462455808.0, + "62": 462455808.0, + "63": 462455808.0, + "64": 462455808.0, + "65": 462455808.0, + "66": 462455808.0, + "67": 462455808.0, + "68": 462455808.0, + "69": 462455808.0, + "70": 462455808.0, + "71": 462455808.0, + "72": 462455808.0, + "73": 462455808.0, + "74": 462455808.0, + "75": 462455808.0, + "76": 462455808.0, + "77": 462455808.0, + "78": 462455808.0, + "79": 462455808.0, + "80": 462455808.0, + "81": 462455808.0, + "82": 462455808.0, + "83": 462455808.0, + "84": 462455808.0, + "85": 462455808.0, + "86": 462455808.0, + "87": 462455808.0, + "88": 462455808.0, + "89": 462455808.0, + "90": 462455808.0, + "91": 462455808.0, + "92": 462455808.0, + "93": 462455808.0, + "94": 462455808.0, + "95": 462455808.0, + "96": 462455808.0, + "97": 462455808.0, + "98": 462455808.0, + "99": 462455808.0, + "100": 462455808.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2529822720.0, + "2": 2658249728.0, + "3": 2658249728.0, + "4": 2658249728.0, + "5": 2658249728.0, + "6": 2658249728.0, + "7": 2658249728.0, + "8": 2658249728.0, + "9": 2658249728.0, + "10": 2658249728.0, + "11": 2658249728.0, + "12": 2658249728.0, + "13": 2658249728.0, + "14": 2658249728.0, + "15": 2658249728.0, + "16": 2658249728.0, + "17": 2658249728.0, + "18": 2658249728.0, + "19": 2658249728.0, + "20": 2658249728.0, + "21": 2658249728.0, + "22": 2658249728.0, + "23": 2658249728.0, + "24": 2658249728.0, + "25": 2658249728.0, + "26": 2658249728.0, + "27": 2658249728.0, + "28": 2658249728.0, + "29": 2658249728.0, + "30": 2658249728.0, + "31": 2658249728.0, + "32": 2658249728.0, + "33": 2658249728.0, + "34": 2658249728.0, + "35": 2658249728.0, + "36": 2658249728.0, + "37": 2658249728.0, + "38": 2658249728.0, + "39": 2658249728.0, + "40": 2658249728.0, + "41": 2658249728.0, + "42": 2658249728.0, + "43": 2658249728.0, + "44": 2658249728.0, + "45": 2658249728.0, + "46": 2658249728.0, + "47": 2658249728.0, + "48": 2658249728.0, + "49": 2658249728.0, + "50": 2658249728.0, + "51": 2658249728.0, + "52": 2658249728.0, + "53": 2658249728.0, + "54": 2658249728.0, + "55": 2658249728.0, + "56": 2658249728.0, + "57": 2658249728.0, + "58": 2658249728.0, + "59": 2658249728.0, + "60": 2658249728.0, + "61": 2658249728.0, + "62": 2658249728.0, + "63": 2658249728.0, + "64": 2658249728.0, + "65": 2658249728.0, + "66": 2658249728.0, + "67": 2658249728.0, + "68": 2658249728.0, + "69": 2658249728.0, + "70": 2658249728.0, + "71": 2658249728.0, + "72": 2658249728.0, + "73": 2658249728.0, + "74": 2658249728.0, + "75": 2658249728.0, + "76": 2658249728.0, + "77": 2658249728.0, + "78": 2658249728.0, + "79": 2658249728.0, + "80": 2658249728.0, + "81": 2658249728.0, + "82": 2658249728.0, + "83": 2658249728.0, + "84": 2658249728.0, + "85": 2658249728.0, + "86": 2658249728.0, + "87": 2658249728.0, + "88": 2658249728.0, + "89": 2658249728.0, + "90": 2658249728.0, + "91": 2658249728.0, + "92": 2658249728.0, + "93": 2658249728.0, + "94": 2658249728.0, + "95": 2658249728.0, + "96": 2658249728.0, + "97": 2658249728.0, + "98": 2658249728.0, + "99": 2658249728.0, + "100": 2658249728.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.80127, + "2": 0.21048, + "3": 0.19424, + "4": 0.19406, + "5": 0.19305, + "6": 0.46258, + "7": 0.19395, + "8": 0.19336, + "9": 0.19347, + "10": 0.19469, + "11": 0.19315, + "12": 0.19201, + "13": 0.19467, + "14": 0.19268, + "15": 0.19342, + "16": 0.19454, + "17": 0.1928, + "18": 0.19024, + "19": 0.19035, + "20": 0.19633, + "21": 0.19068, + "22": 0.19007, + "23": 0.19089, + "24": 0.18966, + "25": 0.18965, + "26": 0.19703, + "27": 0.19046, + "28": 0.18906, + "29": 0.18887, + "30": 0.19, + "31": 0.19237, + "32": 0.19083, + "33": 0.18835, + "34": 0.18864, + "35": 0.18967, + "36": 0.19256, + "37": 0.18907, + "38": 0.18914, + "39": 0.18932, + "40": 0.18927, + "41": 0.18947, + "42": 0.19022, + "43": 0.18879, + "44": 0.1889, + "45": 0.19016, + "46": 0.18968, + "47": 0.19422, + "48": 0.19149, + "49": 0.19174, + "50": 0.18898, + "51": 0.19117, + "52": 0.18823, + "53": 0.42924, + "54": 0.18787, + "55": 0.18684, + "56": 0.19129, + "57": 0.18962, + "58": 0.18731, + "59": 0.18736, + "60": 0.18779, + "61": 0.19123, + "62": 0.1899, + "63": 0.18761, + "64": 0.24503, + "65": 0.2384, + "66": 0.24805, + "67": 0.23845, + "68": 0.23074, + "69": 0.23115, + "70": 0.23619, + "71": 0.23855, + "72": 0.24362, + "73": 0.28624, + "74": 0.30988, + "75": 0.31666, + "76": 0.25387, + "77": 0.2495, + "78": 0.1922, + "79": 0.18998, + "80": 0.18827, + "81": 0.18839, + "82": 0.18827, + "83": 0.19179, + "84": 0.18895, + "85": 0.18764, + "86": 0.18715, + "87": 0.18798, + "88": 0.19102, + "89": 0.18913, + "90": 0.18734, + "91": 0.18768, + "92": 0.1878, + "93": 0.19083, + "94": 0.19033, + "95": 0.18891, + "96": 0.18801, + "97": 0.1884, + "98": 0.18802, + "99": 0.1921, + "100": 0.1908 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..37b3ad50408 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.88734, + "2": 10.90383, + "3": 10.88081, + "4": 10.88371, + "5": 10.90948, + "6": 10.91613, + "7": 10.89451, + "8": 10.88622, + "9": 10.89544, + "10": 10.87763, + "11": 10.89061, + "12": 10.89565, + "13": 10.9078, + "14": 10.90725, + "15": 10.86371, + "16": 10.86172, + "17": 10.81949, + "18": 10.84638, + "19": 10.83804, + "20": 10.7509, + "21": 10.72756, + "22": 10.6229, + "23": 10.74449, + "24": 10.63231, + "25": 10.59917, + "26": 10.64491, + "27": 10.64672, + "28": 10.59686, + "29": 10.60675, + "30": 10.40104, + "31": 10.18011, + "32": 10.49048, + "33": 10.48347, + "34": 10.251, + "35": 10.30793, + "36": 10.25618, + "37": 10.36503, + "38": 10.2179, + "39": 10.41024, + "40": 10.10902, + "41": 10.16109, + "42": 10.22733, + "43": 9.87492, + "44": 9.97842, + "45": 9.85831, + "46": 9.85388, + "47": 10.15356, + "48": 9.86194, + "49": 9.55678, + "50": 9.92111, + "51": 9.86199, + "52": 9.75595, + "53": 10.07575, + "54": 9.96137, + "55": 9.88529, + "56": 9.63476, + "57": 9.49273, + "58": 9.83039, + "59": 9.59148, + "60": 9.50737, + "61": 9.70512, + "62": 9.98404, + "63": 9.37583, + "64": 9.77923, + "65": 8.95828, + "66": 9.70623, + "67": 9.37471, + "68": 9.78699, + "69": 9.78826, + "70": 9.72733, + "71": 9.61217, + "72": 9.5913, + "73": 9.49847, + "74": 8.95651, + "75": 9.42571, + "76": 9.09602, + "77": 10.06687, + "78": 9.73141, + "79": 9.37953, + "80": 9.40559, + "81": 9.48179, + "82": 9.694, + "83": 9.31183, + "84": 9.41312, + "85": 9.61572, + "86": 9.07774, + "87": 9.59695, + "88": 9.74877, + "89": 9.60255, + "90": 9.81277, + "91": 9.34555, + "92": 9.36555, + "93": 9.07714, + "94": 8.83102, + "95": 9.52119, + "96": 9.52503, + "97": 9.31354, + "98": 9.6769, + "99": 8.8896, + "100": 9.40111 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1614.0, + "2": 1820.0, + "3": 1724.0, + "4": 1889.0, + "5": 2021.0, + "6": 1920.0, + "7": 1930.0, + "8": 1736.0, + "9": 1989.0, + "10": 1399.0, + "11": 2051.0, + "12": 1859.0, + "13": 2007.0, + "14": 1830.0, + "15": 1872.0, + "16": 1877.0, + "17": 1960.0, + "18": 1747.0, + "19": 1815.0, + "20": 1692.0, + "21": 2039.0, + "22": 1713.0, + "23": 1963.0, + "24": 1743.0, + "25": 1784.0, + "26": 1793.0, + "27": 1860.0, + "28": 1956.0, + "29": 2152.0, + "30": 1900.0, + "31": 1685.0, + "32": 2000.0, + "33": 2085.0, + "34": 1867.0, + "35": 2081.0, + "36": 1975.0, + "37": 2341.0, + "38": 2316.0, + "39": 2438.0, + "40": 2233.0, + "41": 2306.0, + "42": 2319.0, + "43": 2082.0, + "44": 2158.0, + "45": 2144.0, + "46": 2227.0, + "47": 2675.0, + "48": 2473.0, + "49": 2231.0, + "50": 2513.0, + "51": 2611.0, + "52": 2560.0, + "53": 3169.0, + "54": 2698.0, + "55": 2493.0, + "56": 2791.0, + "57": 2298.0, + "58": 3182.0, + "59": 2851.0, + "60": 2440.0, + "61": 2909.0, + "62": 2834.0, + "63": 2389.0, + "64": 3187.0, + "65": 2763.0, + "66": 3321.0, + "67": 2818.0, + "68": 2835.0, + "69": 3037.0, + "70": 3219.0, + "71": 3046.0, + "72": 2359.0, + "73": 2939.0, + "74": 2061.0, + "75": 2601.0, + "76": 2971.0, + "77": 3400.0, + "78": 3295.0, + "79": 3211.0, + "80": 3341.0, + "81": 3756.0, + "82": 3240.0, + "83": 2851.0, + "84": 3378.0, + "85": 3433.0, + "86": 2818.0, + "87": 3852.0, + "88": 3000.0, + "89": 3574.0, + "90": 3019.0, + "91": 2624.0, + "92": 3179.0, + "93": 2831.0, + "94": 3483.0, + "95": 3417.0, + "96": 3492.0, + "97": 3114.0, + "98": 3675.0, + "99": 3172.0, + "100": 3372.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 462455808.0, + "2": 462455808.0, + "3": 462455808.0, + "4": 462455808.0, + "5": 462455808.0, + "6": 462455808.0, + "7": 462455808.0, + "8": 462455808.0, + "9": 462455808.0, + "10": 462455808.0, + "11": 462455808.0, + "12": 462455808.0, + "13": 462455808.0, + "14": 462455808.0, + "15": 462455808.0, + "16": 462455808.0, + "17": 462455808.0, + "18": 462455808.0, + "19": 462455808.0, + "20": 462455808.0, + "21": 462455808.0, + "22": 462455808.0, + "23": 462455808.0, + "24": 462455808.0, + "25": 462455808.0, + "26": 462455808.0, + "27": 462455808.0, + "28": 462455808.0, + "29": 462455808.0, + "30": 462455808.0, + "31": 462455808.0, + "32": 462455808.0, + "33": 462455808.0, + "34": 462455808.0, + "35": 462455808.0, + "36": 462455808.0, + "37": 462455808.0, + "38": 462455808.0, + "39": 462455808.0, + "40": 462455808.0, + "41": 462455808.0, + "42": 462455808.0, + "43": 462455808.0, + "44": 462455808.0, + "45": 462455808.0, + "46": 462455808.0, + "47": 462455808.0, + "48": 462455808.0, + "49": 462455808.0, + "50": 462455808.0, + "51": 462455808.0, + "52": 462455808.0, + "53": 462455808.0, + "54": 462455808.0, + "55": 462455808.0, + "56": 462455808.0, + "57": 462455808.0, + "58": 462455808.0, + "59": 462455808.0, + "60": 462455808.0, + "61": 462455808.0, + "62": 462455808.0, + "63": 462455808.0, + "64": 462455808.0, + "65": 462455808.0, + "66": 462455808.0, + "67": 462455808.0, + "68": 462455808.0, + "69": 462455808.0, + "70": 462455808.0, + "71": 462455808.0, + "72": 462455808.0, + "73": 462455808.0, + "74": 462455808.0, + "75": 462455808.0, + "76": 462455808.0, + "77": 462455808.0, + "78": 462455808.0, + "79": 462455808.0, + "80": 462455808.0, + "81": 462455808.0, + "82": 462455808.0, + "83": 462455808.0, + "84": 462455808.0, + "85": 462455808.0, + "86": 462455808.0, + "87": 462455808.0, + "88": 462455808.0, + "89": 462455808.0, + "90": 462455808.0, + "91": 462455808.0, + "92": 462455808.0, + "93": 462455808.0, + "94": 462455808.0, + "95": 462455808.0, + "96": 462455808.0, + "97": 462455808.0, + "98": 462455808.0, + "99": 462455808.0, + "100": 462455808.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2529822720.0, + "2": 2658249728.0, + "3": 2658249728.0, + "4": 2658249728.0, + "5": 2658249728.0, + "6": 2658249728.0, + "7": 2658249728.0, + "8": 2658249728.0, + "9": 2658249728.0, + "10": 2658249728.0, + "11": 2658249728.0, + "12": 2658249728.0, + "13": 2658249728.0, + "14": 2658249728.0, + "15": 2658249728.0, + "16": 2658249728.0, + "17": 2658249728.0, + "18": 2658249728.0, + "19": 2658249728.0, + "20": 2658249728.0, + "21": 2658249728.0, + "22": 2658249728.0, + "23": 2658249728.0, + "24": 2658249728.0, + "25": 2658249728.0, + "26": 2658249728.0, + "27": 2658249728.0, + "28": 2658249728.0, + "29": 2658249728.0, + "30": 2658249728.0, + "31": 2658249728.0, + "32": 2658249728.0, + "33": 2658249728.0, + "34": 2658249728.0, + "35": 2658249728.0, + "36": 2658249728.0, + "37": 2658249728.0, + "38": 2658249728.0, + "39": 2658249728.0, + "40": 2658249728.0, + "41": 2658249728.0, + "42": 2658249728.0, + "43": 2658249728.0, + "44": 2658249728.0, + "45": 2658249728.0, + "46": 2658249728.0, + "47": 2658249728.0, + "48": 2658249728.0, + "49": 2658249728.0, + "50": 2658249728.0, + "51": 2658249728.0, + "52": 2658249728.0, + "53": 2658249728.0, + "54": 2658249728.0, + "55": 2658249728.0, + "56": 2658249728.0, + "57": 2658249728.0, + "58": 2658249728.0, + "59": 2658249728.0, + "60": 2658249728.0, + "61": 2658249728.0, + "62": 2658249728.0, + "63": 2658249728.0, + "64": 2658249728.0, + "65": 2658249728.0, + "66": 2658249728.0, + "67": 2658249728.0, + "68": 2658249728.0, + "69": 2658249728.0, + "70": 2658249728.0, + "71": 2658249728.0, + "72": 2658249728.0, + "73": 2658249728.0, + "74": 2658249728.0, + "75": 2658249728.0, + "76": 2658249728.0, + "77": 2658249728.0, + "78": 2658249728.0, + "79": 2658249728.0, + "80": 2658249728.0, + "81": 2658249728.0, + "82": 2658249728.0, + "83": 2658249728.0, + "84": 2658249728.0, + "85": 2658249728.0, + "86": 2658249728.0, + "87": 2658249728.0, + "88": 2658249728.0, + "89": 2658249728.0, + "90": 2658249728.0, + "91": 2658249728.0, + "92": 2658249728.0, + "93": 2658249728.0, + "94": 2658249728.0, + "95": 2658249728.0, + "96": 2658249728.0, + "97": 2658249728.0, + "98": 2658249728.0, + "99": 2658249728.0, + "100": 2658249728.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.21979, + "2": 0.23993, + "3": 0.20666, + "4": 0.20438, + "5": 0.18758, + "6": 0.18742, + "7": 0.35545, + "8": 0.19091, + "9": 0.18666, + "10": 0.18676, + "11": 0.18722, + "12": 0.18603, + "13": 0.18977, + "14": 0.18646, + "15": 0.18634, + "16": 0.18662, + "17": 0.1894, + "18": 0.18693, + "19": 0.18807, + "20": 0.18641, + "21": 0.18648, + "22": 0.18729, + "23": 0.18572, + "24": 0.18999, + "25": 0.18548, + "26": 0.1861, + "27": 0.18884, + "28": 0.18544, + "29": 0.18916, + "30": 0.18587, + "31": 0.18557, + "32": 0.1855, + "33": 0.18841, + "34": 0.18606, + "35": 0.18832, + "36": 0.18518, + "37": 0.37059, + "38": 0.18603, + "39": 0.18695, + "40": 0.18575, + "41": 0.18563, + "42": 0.1854, + "43": 0.18938, + "44": 0.18881, + "45": 0.18598, + "46": 0.18518, + "47": 0.18498, + "48": 0.18591, + "49": 0.44149, + "50": 0.18979, + "51": 0.19055, + "52": 0.18685, + "53": 0.18664, + "54": 0.1883, + "55": 0.18876, + "56": 0.18804, + "57": 0.19098, + "58": 0.1906, + "59": 0.18982, + "60": 0.19201, + "61": 0.18888, + "62": 0.18984, + "63": 0.19266, + "64": 0.19293, + "65": 0.19379, + "66": 0.1901, + "67": 0.18841, + "68": 0.19003, + "69": 0.18922, + "70": 0.19267, + "71": 0.1883, + "72": 0.18753, + "73": 0.18871, + "74": 0.18988, + "75": 0.18979, + "76": 0.18974, + "77": 0.18868, + "78": 0.19111, + "79": 0.19033, + "80": 0.18892, + "81": 0.19389, + "82": 0.18863, + "83": 0.1889, + "84": 0.19203, + "85": 0.18938, + "86": 0.19151, + "87": 0.18754, + "88": 0.18794, + "89": 0.18964, + "90": 0.1881, + "91": 0.19389, + "92": 0.19072, + "93": 0.18826, + "94": 0.18909, + "95": 0.19026, + "96": 0.1894, + "97": 0.18891, + "98": 0.18715, + "99": 0.18688, + "100": 0.1904 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 88e3f568e8a..c8c73bdbafc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, "50": 9.8399 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, "50": 848.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 510165504.0, - "5": 510165504.0, - "10": 510165504.0, - "15": 510165504.0, - "20": 510165504.0, - "25": 510165504.0, - "30": 510165504.0, - "35": 510165504.0, - "40": 510165504.0, - "45": 510165504.0, - "50": 510165504.0 + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 755704320.0, - "5": 933680128.0, - "10": 933680640.0, - "15": 933680640.0, - "20": 933680640.0, - "25": 933680640.0, - "30": 933680640.0, - "35": 933680640.0, - "40": 933680640.0, - "45": 933680640.0, - "50": 933680640.0 + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.27411, - "5": 0.27049, - "10": 0.2735, - "15": 0.2699, - "20": 0.28311, - "25": 0.28368, - "30": 0.28623, - "35": 0.28201, - "40": 0.27349, - "45": 0.28, - "50": 0.28987 + "1": 16.50426, + "2": 0.36653, + "3": 0.34466, + "4": 0.34777, + "5": 0.33341, + "6": 0.3232, + "7": 0.32752, + "8": 0.32335, + "9": 0.32468, + "10": 0.32504, + "11": 0.32396, + "12": 0.32512, + "13": 0.32567, + "14": 0.32353, + "15": 0.31982, + "16": 0.3257, + "17": 0.32525, + "18": 0.32037, + "19": 0.32059, + "20": 0.32739, + "21": 0.32382, + "22": 0.32191, + "23": 0.3644, + "24": 0.35527, + "25": 0.32169, + "26": 0.3265, + "27": 0.3207, + "28": 0.31972, + "29": 0.32327, + "30": 0.31924, + "31": 0.32108, + "32": 0.32626, + "33": 0.31775, + "34": 0.31872, + "35": 0.32546, + "36": 0.317, + "37": 0.31972, + "38": 0.32263, + "39": 0.32037, + "40": 0.32326, + "41": 0.32505, + "42": 0.3215, + "43": 0.31898, + "44": 0.32895, + "45": 0.32343, + "46": 0.3229, + "47": 0.32813, + "48": 0.32454, + "49": 0.31943, + "50": 0.32434 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..88252ac05b0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759898624.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.48733, + "2": 0.32636, + "3": 0.28113, + "4": 0.28069, + "5": 0.28063, + "6": 0.28085, + "7": 0.27912, + "8": 0.27833, + "9": 0.27983, + "10": 0.28235, + "11": 0.28033, + "12": 0.27634, + "13": 0.27743, + "14": 0.27968, + "15": 0.27741, + "16": 0.27901, + "17": 0.27898, + "18": 0.28259, + "19": 0.27738, + "20": 0.27602, + "21": 0.27999, + "22": 0.27615, + "23": 0.27868, + "24": 0.27928, + "25": 0.27684, + "26": 0.27875, + "27": 0.27628, + "28": 0.28571, + "29": 0.27681, + "30": 0.28404, + "31": 0.28086, + "32": 0.28479, + "33": 0.28538, + "34": 0.28086, + "35": 0.28036, + "36": 0.28227, + "37": 0.28585, + "38": 0.28963, + "39": 0.28114, + "40": 0.28277, + "41": 0.28191, + "42": 0.28102, + "43": 0.29373, + "44": 0.2876, + "45": 0.27991, + "46": 0.27977, + "47": 0.28135, + "48": 0.28282, + "49": 0.28275, + "50": 0.28218 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..f2adbef4530 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759898624.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.72434, + "2": 0.40342, + "3": 0.32477, + "4": 0.32459, + "5": 0.32511, + "6": 0.32478, + "7": 0.32469, + "8": 0.32479, + "9": 0.32229, + "10": 0.32534, + "11": 0.32568, + "12": 0.32325, + "13": 0.3234, + "14": 0.32735, + "15": 0.32264, + "16": 0.32664, + "17": 0.32289, + "18": 0.32328, + "19": 0.32997, + "20": 0.32955, + "21": 0.32699, + "22": 0.3292, + "23": 0.32982, + "24": 0.32452, + "25": 0.32644, + "26": 0.32596, + "27": 0.32426, + "28": 0.32527, + "29": 0.32409, + "30": 0.32549, + "31": 0.32259, + "32": 0.32488, + "33": 0.32331, + "34": 0.3242, + "35": 0.3261, + "36": 0.32048, + "37": 0.32127, + "38": 0.32479, + "39": 0.32338, + "40": 0.32137, + "41": 0.32292, + "42": 0.32202, + "43": 0.32321, + "44": 0.32105, + "45": 0.32265, + "46": 0.32148, + "47": 0.32443, + "48": 0.32158, + "49": 0.32089, + "50": 0.32389 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 386d5fed474..67aa60490cf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, - "5": 10.87853, - "10": 10.82982, - "15": 10.82054, - "20": 10.704, - "25": 10.49417, - "30": 10.30549, - "35": 10.20186, - "40": 10.01901, - "45": 9.74963, - "50": 9.8399 + "2": 10.85873, + "3": 10.86284, + "4": 10.84007, + "5": 10.87855, + "6": 10.88852, + "7": 10.86534, + "8": 10.86018, + "9": 10.85988, + "10": 10.8298, + "11": 10.88947, + "12": 10.87509, + "13": 10.87426, + "14": 10.89675, + "15": 10.82058, + "16": 10.82501, + "17": 10.78981, + "18": 10.81029, + "19": 10.80531, + "20": 10.70396, + "21": 10.66991, + "22": 10.5064, + "23": 10.69006, + "24": 10.56312, + "25": 10.49419, + "26": 10.56627, + "27": 10.58024, + "28": 10.51573, + "29": 10.55298, + "30": 10.30548, + "31": 10.02248, + "32": 10.40615, + "33": 10.39876, + "34": 10.13771, + "35": 10.20187, + "36": 10.16047, + "37": 10.28972, + "38": 10.11475, + "39": 10.36102, + "40": 10.01904, + "41": 10.07293, + "42": 10.14696, + "43": 9.74687, + "44": 9.87765, + "45": 9.74966, + "46": 9.73379, + "47": 10.07533, + "48": 9.78071, + "49": 9.44786, + "50": 9.83991 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 565.0, - "5": 634.0, - "10": 570.0, - "15": 645.0, - "20": 616.0, - "25": 577.0, - "30": 747.0, - "35": 760.0, - "40": 765.0, - "45": 838.0, - "50": 895.0 + "1": 594.0, + "2": 641.0, + "3": 677.0, + "4": 648.0, + "5": 645.0, + "6": 681.0, + "7": 639.0, + "8": 590.0, + "9": 648.0, + "10": 519.0, + "11": 703.0, + "12": 589.0, + "13": 650.0, + "14": 706.0, + "15": 675.0, + "16": 652.0, + "17": 685.0, + "18": 596.0, + "19": 672.0, + "20": 667.0, + "21": 650.0, + "22": 656.0, + "23": 706.0, + "24": 595.0, + "25": 593.0, + "26": 595.0, + "27": 685.0, + "28": 756.0, + "29": 674.0, + "30": 743.0, + "31": 612.0, + "32": 723.0, + "33": 778.0, + "34": 695.0, + "35": 716.0, + "36": 683.0, + "37": 805.0, + "38": 756.0, + "39": 850.0, + "40": 822.0, + "41": 870.0, + "42": 767.0, + "43": 747.0, + "44": 798.0, + "45": 782.0, + "46": 891.0, + "47": 887.0, + "48": 898.0, + "49": 890.0, + "50": 881.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, "45": 933156352.0, - "50": 934202368.0 + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 18.05689, - "5": 0.28787, - "10": 0.2889, - "15": 0.28608, - "20": 0.28427, - "25": 0.29621, - "30": 0.28048, - "35": 0.2827, - "40": 0.28468, - "45": 0.27947, - "50": 0.30286 + "1": 16.5651, + "2": 0.34314, + "3": 0.32308, + "4": 0.32445, + "5": 0.33098, + "6": 0.32202, + "7": 0.32251, + "8": 0.32355, + "9": 0.32346, + "10": 0.31687, + "11": 0.32105, + "12": 0.32381, + "13": 0.32098, + "14": 0.32322, + "15": 0.31579, + "16": 0.31699, + "17": 0.32307, + "18": 0.32662, + "19": 0.33548, + "20": 0.32088, + "21": 0.32691, + "22": 0.32206, + "23": 0.32261, + "24": 0.32621, + "25": 0.32403, + "26": 0.32368, + "27": 0.32665, + "28": 0.32924, + "29": 0.32322, + "30": 0.32903, + "31": 0.32199, + "32": 0.32034, + "33": 0.32453, + "34": 0.32691, + "35": 0.32014, + "36": 0.3206, + "37": 0.31874, + "38": 0.32448, + "39": 0.32813, + "40": 0.32242, + "41": 0.32196, + "42": 0.32843, + "43": 0.32328, + "44": 0.32049, + "45": 0.3265, + "46": 0.31996, + "47": 0.32173, + "48": 0.323, + "49": 0.32398, + "50": 0.3329 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..303a87c0069 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86281, + "4": 10.8401, + "5": 10.87858, + "6": 10.88853, + "7": 10.86535, + "8": 10.86017, + "9": 10.8599, + "10": 10.82979, + "11": 10.88945, + "12": 10.87509, + "13": 10.87423, + "14": 10.89675, + "15": 10.8205, + "16": 10.825, + "17": 10.78982, + "18": 10.81028, + "19": 10.80532, + "20": 10.70394, + "21": 10.66988, + "22": 10.50642, + "23": 10.69005, + "24": 10.56311, + "25": 10.49417, + "26": 10.56628, + "27": 10.58023, + "28": 10.5157, + "29": 10.55296, + "30": 10.30548, + "31": 10.02248, + "32": 10.40617, + "33": 10.39875, + "34": 10.13774, + "35": 10.20186, + "36": 10.16048, + "37": 10.28974, + "38": 10.1148, + "39": 10.36104, + "40": 10.01904, + "41": 10.07288, + "42": 10.14695, + "43": 9.74684, + "44": 9.87761, + "45": 9.74967, + "46": 9.73383, + "47": 10.07539, + "48": 9.78069, + "49": 9.44781, + "50": 9.83988 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 593.0, + "2": 628.0, + "3": 611.0, + "4": 628.0, + "5": 651.0, + "6": 650.0, + "7": 630.0, + "8": 551.0, + "9": 708.0, + "10": 508.0, + "11": 656.0, + "12": 633.0, + "13": 683.0, + "14": 683.0, + "15": 633.0, + "16": 614.0, + "17": 628.0, + "18": 626.0, + "19": 574.0, + "20": 620.0, + "21": 684.0, + "22": 598.0, + "23": 752.0, + "24": 593.0, + "25": 549.0, + "26": 607.0, + "27": 661.0, + "28": 739.0, + "29": 699.0, + "30": 728.0, + "31": 571.0, + "32": 695.0, + "33": 761.0, + "34": 670.0, + "35": 708.0, + "36": 677.0, + "37": 861.0, + "38": 768.0, + "39": 836.0, + "40": 789.0, + "41": 818.0, + "42": 853.0, + "43": 774.0, + "44": 800.0, + "45": 743.0, + "46": 832.0, + "47": 902.0, + "48": 827.0, + "49": 914.0, + "50": 878.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.98198, + "2": 0.32508, + "3": 0.27859, + "4": 0.28973, + "5": 0.28871, + "6": 0.28743, + "7": 0.28586, + "8": 0.28626, + "9": 0.28734, + "10": 0.28834, + "11": 0.29037, + "12": 0.29031, + "13": 0.27847, + "14": 0.28002, + "15": 0.28617, + "16": 0.28603, + "17": 0.28309, + "18": 0.28753, + "19": 0.34589, + "20": 0.28022, + "21": 0.28261, + "22": 0.28865, + "23": 0.28869, + "24": 0.2851, + "25": 0.28458, + "26": 0.28706, + "27": 0.28515, + "28": 0.29088, + "29": 0.28891, + "30": 0.28446, + "31": 0.28444, + "32": 0.28347, + "33": 0.28941, + "34": 0.28783, + "35": 0.28386, + "36": 0.28238, + "37": 0.28325, + "38": 0.28579, + "39": 0.29406, + "40": 0.28819, + "41": 0.29033, + "42": 0.28815, + "43": 0.2919, + "44": 0.2895, + "45": 0.28613, + "46": 0.28704, + "47": 0.29081, + "48": 0.29057, + "49": 0.2897, + "50": 0.28865 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..a74ab8d8415 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86281, + "4": 10.84011, + "5": 10.87855, + "6": 10.88849, + "7": 10.86536, + "8": 10.86016, + "9": 10.85987, + "10": 10.82979, + "11": 10.88946, + "12": 10.87508, + "13": 10.87423, + "14": 10.89679, + "15": 10.82052, + "16": 10.825, + "17": 10.78984, + "18": 10.81026, + "19": 10.80535, + "20": 10.70395, + "21": 10.66988, + "22": 10.50641, + "23": 10.69004, + "24": 10.56309, + "25": 10.49417, + "26": 10.56626, + "27": 10.58024, + "28": 10.51572, + "29": 10.55294, + "30": 10.30552, + "31": 10.02243, + "32": 10.40616, + "33": 10.39875, + "34": 10.13772, + "35": 10.20189, + "36": 10.16048, + "37": 10.28972, + "38": 10.11479, + "39": 10.361, + "40": 10.01902, + "41": 10.07292, + "42": 10.14694, + "43": 9.74686, + "44": 9.87768, + "45": 9.74966, + "46": 9.7338, + "47": 10.07535, + "48": 9.7807, + "49": 9.44783, + "50": 9.83991 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 600.0, + "2": 620.0, + "3": 606.0, + "4": 684.0, + "5": 647.0, + "6": 679.0, + "7": 630.0, + "8": 568.0, + "9": 627.0, + "10": 519.0, + "11": 635.0, + "12": 640.0, + "13": 677.0, + "14": 631.0, + "15": 668.0, + "16": 666.0, + "17": 671.0, + "18": 623.0, + "19": 658.0, + "20": 639.0, + "21": 624.0, + "22": 614.0, + "23": 741.0, + "24": 607.0, + "25": 636.0, + "26": 639.0, + "27": 689.0, + "28": 751.0, + "29": 724.0, + "30": 771.0, + "31": 564.0, + "32": 750.0, + "33": 765.0, + "34": 693.0, + "35": 737.0, + "36": 754.0, + "37": 807.0, + "38": 786.0, + "39": 879.0, + "40": 737.0, + "41": 817.0, + "42": 857.0, + "43": 709.0, + "44": 808.0, + "45": 795.0, + "46": 837.0, + "47": 879.0, + "48": 899.0, + "49": 890.0, + "50": 860.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 934204928.0, + "25": 934204928.0, + "26": 934204928.0, + "27": 934204928.0, + "28": 934204928.0, + "29": 934204928.0, + "30": 934204928.0, + "31": 934204928.0, + "32": 934204928.0, + "33": 934204928.0, + "34": 934204928.0, + "35": 934204928.0, + "36": 934204928.0, + "37": 934204928.0, + "38": 934204928.0, + "39": 934204928.0, + "40": 934204928.0, + "41": 934204928.0, + "42": 934204928.0, + "43": 934204928.0, + "44": 934204928.0, + "45": 934204928.0, + "46": 934204928.0, + "47": 934204928.0, + "48": 934204928.0, + "49": 934204928.0, + "50": 934204928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.61636, + "2": 0.35255, + "3": 0.33784, + "4": 0.33448, + "5": 0.33388, + "6": 0.33362, + "7": 0.33399, + "8": 0.33377, + "9": 0.3345, + "10": 0.33436, + "11": 0.33616, + "12": 0.33216, + "13": 0.32717, + "14": 0.3285, + "15": 0.31893, + "16": 0.32207, + "17": 0.32068, + "18": 0.3232, + "19": 0.31799, + "20": 0.32295, + "21": 0.32148, + "22": 0.3312, + "23": 0.33388, + "24": 0.33493, + "25": 0.33793, + "26": 0.33838, + "27": 0.33827, + "28": 0.34, + "29": 0.33074, + "30": 0.32608, + "31": 0.32629, + "32": 0.3285, + "33": 0.32776, + "34": 0.32575, + "35": 0.32648, + "36": 0.3252, + "37": 0.32697, + "38": 0.33001, + "39": 0.3354, + "40": 0.33513, + "41": 0.33447, + "42": 0.3352, + "43": 0.33163, + "44": 0.32495, + "45": 0.32668, + "46": 0.32429, + "47": 0.32917, + "48": 0.32614, + "49": 0.32637, + "50": 0.32702 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..93a6863f9ba --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91343, + "4": 10.9032, + "5": 10.9297, + "6": 10.93654, + "7": 10.90278, + "8": 10.92115, + "9": 10.90703, + "10": 10.90474, + "11": 10.88784, + "12": 10.91739, + "13": 10.91191, + "14": 10.91502, + "15": 10.87124, + "16": 10.86128, + "17": 10.82695, + "18": 10.8568, + "19": 10.84056, + "20": 10.75, + "21": 10.71506, + "22": 10.58117, + "23": 10.72641, + "24": 10.60731, + "25": 10.53752, + "26": 10.61071, + "27": 10.5993, + "28": 10.54954, + "29": 10.56604, + "30": 10.32554, + "31": 10.06698, + "32": 10.43804, + "33": 10.42362, + "34": 10.16013, + "35": 10.22894, + "36": 10.17616, + "37": 10.29237, + "38": 10.13292, + "39": 10.34958, + "40": 10.01974, + "41": 10.07538, + "42": 10.15409, + "43": 9.76091, + "44": 9.88355, + "45": 9.75545, + "46": 9.74961, + "47": 10.07545, + "48": 9.77938, + "49": 9.43818, + "50": 9.84069 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 575.0, + "2": 559.0, + "3": 613.0, + "4": 620.0, + "5": 596.0, + "6": 632.0, + "7": 610.0, + "8": 563.0, + "9": 590.0, + "10": 556.0, + "11": 680.0, + "12": 555.0, + "13": 624.0, + "14": 619.0, + "15": 609.0, + "16": 656.0, + "17": 643.0, + "18": 621.0, + "19": 604.0, + "20": 628.0, + "21": 608.0, + "22": 623.0, + "23": 640.0, + "24": 607.0, + "25": 605.0, + "26": 644.0, + "27": 664.0, + "28": 703.0, + "29": 741.0, + "30": 670.0, + "31": 602.0, + "32": 687.0, + "33": 780.0, + "34": 661.0, + "35": 672.0, + "36": 726.0, + "37": 776.0, + "38": 756.0, + "39": 843.0, + "40": 832.0, + "41": 850.0, + "42": 793.0, + "43": 719.0, + "44": 800.0, + "45": 716.0, + "46": 811.0, + "47": 828.0, + "48": 865.0, + "49": 810.0, + "50": 875.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677333504.0, + "2": 855308800.0, + "3": 855308800.0, + "4": 855308800.0, + "5": 855308800.0, + "6": 855308800.0, + "7": 855308800.0, + "8": 855308800.0, + "9": 855308800.0, + "10": 855308800.0, + "11": 855308800.0, + "12": 855308800.0, + "13": 855308800.0, + "14": 855308800.0, + "15": 855308800.0, + "16": 855308800.0, + "17": 855308800.0, + "18": 855308800.0, + "19": 855310336.0, + "20": 855310336.0, + "21": 855310336.0, + "22": 855310336.0, + "23": 855310336.0, + "24": 855310336.0, + "25": 855310336.0, + "26": 855311360.0, + "27": 855311360.0, + "28": 855311360.0, + "29": 855311360.0, + "30": 855311360.0, + "31": 855311360.0, + "32": 855311360.0, + "33": 855311360.0, + "34": 855311360.0, + "35": 855311360.0, + "36": 855311360.0, + "37": 855311360.0, + "38": 855311360.0, + "39": 855311360.0, + "40": 855311360.0, + "41": 855311360.0, + "42": 855311360.0, + "43": 855311360.0, + "44": 855311360.0, + "45": 855311360.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.36326, + "2": 0.4559, + "3": 0.42105, + "4": 0.43438, + "5": 0.42464, + "6": 0.41381, + "7": 0.42997, + "8": 0.41256, + "9": 0.42034, + "10": 0.41575, + "11": 0.41092, + "12": 0.42374, + "13": 0.41123, + "14": 0.42677, + "15": 0.41074, + "16": 0.42059, + "17": 0.41911, + "18": 0.41172, + "19": 0.42617, + "20": 0.41085, + "21": 0.42288, + "22": 0.41567, + "23": 0.41045, + "24": 0.42041, + "25": 0.40891, + "26": 0.42104, + "27": 0.41476, + "28": 0.4134, + "29": 0.41023, + "30": 0.40616, + "31": 0.41979, + "32": 0.40666, + "33": 0.41352, + "34": 0.42345, + "35": 0.40886, + "36": 0.42443, + "37": 0.40786, + "38": 0.41631, + "39": 0.41181, + "40": 0.40693, + "41": 0.41652, + "42": 0.40701, + "43": 0.42407, + "44": 0.41181, + "45": 0.40787, + "46": 0.41861, + "47": 0.40384, + "48": 0.4279, + "49": 0.40721, + "50": 0.41192 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..fcf25e804f7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91343, + "4": 10.90318, + "5": 10.92969, + "6": 10.93655, + "7": 10.90278, + "8": 10.92114, + "9": 10.90705, + "10": 10.90476, + "11": 10.88784, + "12": 10.91738, + "13": 10.91192, + "14": 10.91507, + "15": 10.87121, + "16": 10.8613, + "17": 10.82698, + "18": 10.85677, + "19": 10.8406, + "20": 10.74995, + "21": 10.7151, + "22": 10.58115, + "23": 10.72643, + "24": 10.60731, + "25": 10.53752, + "26": 10.61065, + "27": 10.59933, + "28": 10.54956, + "29": 10.56604, + "30": 10.32551, + "31": 10.06702, + "32": 10.43808, + "33": 10.42361, + "34": 10.16018, + "35": 10.22893, + "36": 10.17618, + "37": 10.29235, + "38": 10.13293, + "39": 10.34955, + "40": 10.01975, + "41": 10.07537, + "42": 10.15408, + "43": 9.7609, + "44": 9.88355, + "45": 9.75548, + "46": 9.74966, + "47": 10.07548, + "48": 9.77939, + "49": 9.4382, + "50": 9.8407 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 584.0, + "2": 575.0, + "3": 637.0, + "4": 586.0, + "5": 643.0, + "6": 652.0, + "7": 636.0, + "8": 624.0, + "9": 699.0, + "10": 579.0, + "11": 684.0, + "12": 650.0, + "13": 645.0, + "14": 582.0, + "15": 623.0, + "16": 637.0, + "17": 675.0, + "18": 614.0, + "19": 579.0, + "20": 589.0, + "21": 643.0, + "22": 603.0, + "23": 709.0, + "24": 582.0, + "25": 632.0, + "26": 638.0, + "27": 662.0, + "28": 732.0, + "29": 705.0, + "30": 691.0, + "31": 539.0, + "32": 731.0, + "33": 809.0, + "34": 721.0, + "35": 680.0, + "36": 701.0, + "37": 779.0, + "38": 770.0, + "39": 816.0, + "40": 795.0, + "41": 793.0, + "42": 826.0, + "43": 747.0, + "44": 782.0, + "45": 724.0, + "46": 813.0, + "47": 858.0, + "48": 880.0, + "49": 822.0, + "50": 851.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677335040.0, + "2": 853214208.0, + "3": 854260224.0, + "4": 854260224.0, + "5": 854260224.0, + "6": 854260224.0, + "7": 854260224.0, + "8": 854260224.0, + "9": 854261760.0, + "10": 854261760.0, + "11": 854261760.0, + "12": 854261760.0, + "13": 854261760.0, + "14": 854261760.0, + "15": 854261760.0, + "16": 854261760.0, + "17": 854261760.0, + "18": 854261760.0, + "19": 854261760.0, + "20": 854261760.0, + "21": 854261760.0, + "22": 854261760.0, + "23": 854261760.0, + "24": 854262784.0, + "25": 854262784.0, + "26": 854262784.0, + "27": 854262784.0, + "28": 854262784.0, + "29": 854262784.0, + "30": 854262784.0, + "31": 854262784.0, + "32": 854262784.0, + "33": 854262784.0, + "34": 854262784.0, + "35": 854262784.0, + "36": 854262784.0, + "37": 854262784.0, + "38": 854262784.0, + "39": 854262784.0, + "40": 854262784.0, + "41": 854262784.0, + "42": 854262784.0, + "43": 854262784.0, + "44": 854262784.0, + "45": 854262784.0, + "46": 854262784.0, + "47": 854262784.0, + "48": 854262784.0, + "49": 854262784.0, + "50": 854262784.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.47386, + "2": 0.47756, + "3": 0.45149, + "4": 0.3974, + "5": 0.40219, + "6": 0.40118, + "7": 0.39646, + "8": 0.399, + "9": 0.40423, + "10": 0.39996, + "11": 0.40013, + "12": 0.39333, + "13": 0.40016, + "14": 0.40246, + "15": 0.39824, + "16": 0.39607, + "17": 0.38883, + "18": 0.39558, + "19": 0.40073, + "20": 0.39465, + "21": 0.39509, + "22": 0.39239, + "23": 0.39366, + "24": 0.39612, + "25": 0.39292, + "26": 0.39495, + "27": 0.39096, + "28": 0.39872, + "29": 0.39945, + "30": 0.38903, + "31": 0.40121, + "32": 0.3932, + "33": 0.39872, + "34": 0.4027, + "35": 0.38761, + "36": 0.39596, + "37": 0.40133, + "38": 0.39669, + "39": 0.39549, + "40": 0.39351, + "41": 0.39605, + "42": 0.39902, + "43": 0.39692, + "44": 0.39866, + "45": 0.38737, + "46": 0.40095, + "47": 0.40062, + "48": 0.39784, + "49": 0.39656, + "50": 0.39145 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 94d3531293f..db2baf5c599 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, "50": 9.8399 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, "50": 848.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 759895552.0, - "5": 934204928.0, - "10": 934204928.0, - "15": 934204928.0, - "20": 934204928.0, - "25": 934204928.0, - "30": 934204928.0, - "35": 934204928.0, - "40": 934204928.0, - "45": 934204928.0, - "50": 934204928.0 + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 16.20665, - "5": 0.29885, - "10": 0.28312, - "15": 0.28379, - "20": 0.29142, - "25": 0.28821, - "30": 0.28552, - "35": 0.29704, - "40": 0.29487, - "45": 0.28474, - "50": 0.29091 + "1": 16.00603, + "2": 0.37533, + "3": 0.32669, + "4": 0.33301, + "5": 0.33912, + "6": 0.32887, + "7": 0.32417, + "8": 0.32988, + "9": 0.33113, + "10": 0.32547, + "11": 0.32805, + "12": 0.328, + "13": 0.33007, + "14": 0.33264, + "15": 0.3341, + "16": 0.33744, + "17": 0.33776, + "18": 0.33727, + "19": 0.33724, + "20": 0.33333, + "21": 0.32884, + "22": 0.32956, + "23": 0.33051, + "24": 0.33032, + "25": 0.3332, + "26": 0.32905, + "27": 0.32375, + "28": 0.3404, + "29": 0.33196, + "30": 0.33981, + "31": 0.33813, + "32": 0.34997, + "33": 0.34437, + "34": 0.33045, + "35": 0.32839, + "36": 0.32738, + "37": 0.32817, + "38": 0.32837, + "39": 0.32923, + "40": 0.33033, + "41": 0.32725, + "42": 0.32793, + "43": 0.32998, + "44": 0.32897, + "45": 0.32784, + "46": 0.32856, + "47": 0.33025, + "48": 0.32747, + "49": 0.32752, + "50": 0.32926 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7b244eb8d53 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759898624.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.5499, + "2": 0.36629, + "3": 0.28373, + "4": 0.2889, + "5": 0.28714, + "6": 0.28308, + "7": 0.28631, + "8": 0.28716, + "9": 0.2827, + "10": 0.28014, + "11": 0.28458, + "12": 0.28337, + "13": 0.28673, + "14": 0.28763, + "15": 0.28453, + "16": 0.28536, + "17": 0.2915, + "18": 0.29241, + "19": 0.28738, + "20": 0.28157, + "21": 0.28725, + "22": 0.28594, + "23": 0.28463, + "24": 0.28697, + "25": 0.28822, + "26": 0.28636, + "27": 0.29484, + "28": 0.29612, + "29": 0.29284, + "30": 0.28832, + "31": 0.28707, + "32": 0.28946, + "33": 0.28737, + "34": 0.28546, + "35": 0.28437, + "36": 0.28751, + "37": 0.28834, + "38": 0.28784, + "39": 0.28871, + "40": 0.28919, + "41": 0.28543, + "42": 0.28646, + "43": 0.29593, + "44": 0.28978, + "45": 0.29038, + "46": 0.29126, + "47": 0.28667, + "48": 0.28881, + "49": 0.28809, + "50": 0.28744 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..02b4683ea0b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 757801472.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.78036, + "2": 0.34723, + "3": 0.33492, + "4": 0.3292, + "5": 0.33036, + "6": 0.34971, + "7": 0.33848, + "8": 0.33262, + "9": 0.34028, + "10": 0.3518, + "11": 0.34239, + "12": 0.33211, + "13": 0.32961, + "14": 0.33263, + "15": 0.32808, + "16": 0.33152, + "17": 0.33313, + "18": 0.329, + "19": 0.3317, + "20": 0.33143, + "21": 0.34166, + "22": 0.33873, + "23": 0.34817, + "24": 0.3415, + "25": 0.34495, + "26": 0.32592, + "27": 0.32935, + "28": 0.33233, + "29": 0.328, + "30": 0.32746, + "31": 0.3275, + "32": 0.327, + "33": 0.32765, + "34": 0.32542, + "35": 0.32703, + "36": 0.33052, + "37": 0.33413, + "38": 0.32701, + "39": 0.32816, + "40": 0.32555, + "41": 0.33676, + "42": 0.33367, + "43": 0.33748, + "44": 0.33125, + "45": 0.32793, + "46": 0.33387, + "47": 0.32628, + "48": 0.32993, + "49": 0.32747, + "50": 0.327 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 17f2535f7d8..91630133bbc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86535, "5": 10.87856, "10": 10.82981, "15": 10.82054, "20": 10.70398, "25": 10.4942, "30": 10.30549, "35": 10.20184, "40": 10.01903, "45": 9.74966, "50": 9.8399}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 653.0, "5": 635.0, "10": 522.0, "15": 640.0, "20": 579.0, "25": 591.0, "30": 752.0, "35": 741.0, "40": 814.0, "45": 777.0, "50": 848.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 934204416.0, "30": 934204416.0, "35": 934204416.0, "40": 934204416.0, "45": 934204416.0, "50": 934204416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 19.9057, "5": 0.26754, "10": 0.26496, "15": 0.26771, "20": 0.26791, "25": 0.26865, "30": 0.26668, "35": 0.2709, "40": 0.26908, "45": 0.26408, "50": 0.27511}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.74335, + "2": 0.3476, + "3": 0.32845, + "4": 0.34133, + "5": 0.34487, + "6": 0.34494, + "7": 0.33861, + "8": 0.33955, + "9": 0.34794, + "10": 0.32879, + "11": 0.32446, + "12": 0.3306, + "13": 0.32382, + "14": 0.33396, + "15": 0.32393, + "16": 0.32115, + "17": 0.32752, + "18": 0.32386, + "19": 0.32588, + "20": 0.32805, + "21": 0.32785, + "22": 0.32655, + "23": 0.32262, + "24": 0.32541, + "25": 0.32541, + "26": 0.32301, + "27": 0.32448, + "28": 0.32526, + "29": 0.32436, + "30": 0.32542, + "31": 0.32734, + "32": 0.32473, + "33": 0.32718, + "34": 0.32951, + "35": 0.33292, + "36": 0.34033, + "37": 0.34474, + "38": 0.34306, + "39": 0.34159, + "40": 0.32995, + "41": 0.33037, + "42": 0.33033, + "43": 0.33246, + "44": 0.33318, + "45": 0.33332, + "46": 0.32932, + "47": 0.33279, + "48": 0.33327, + "49": 0.33082, + "50": 0.33522 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..81f4d5c3832 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 934203392.0, + "28": 934203392.0, + "29": 934203392.0, + "30": 934203392.0, + "31": 934203392.0, + "32": 934203392.0, + "33": 934203392.0, + "34": 934203392.0, + "35": 934203392.0, + "36": 934203392.0, + "37": 934203392.0, + "38": 934203392.0, + "39": 934203392.0, + "40": 934203392.0, + "41": 934203392.0, + "42": 934203392.0, + "43": 934203392.0, + "44": 934203392.0, + "45": 934203392.0, + "46": 934203392.0, + "47": 934203392.0, + "48": 934203392.0, + "49": 934203392.0, + "50": 934203392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21.7688, + "2": 0.32156, + "3": 0.2747, + "4": 0.2768, + "5": 0.27883, + "6": 0.27703, + "7": 0.27847, + "8": 0.27539, + "9": 0.27303, + "10": 0.27375, + "11": 0.28033, + "12": 0.28202, + "13": 0.27965, + "14": 0.27594, + "15": 0.2733, + "16": 0.2734, + "17": 0.2761, + "18": 0.28051, + "19": 0.28074, + "20": 0.28674, + "21": 0.27278, + "22": 0.2765, + "23": 0.27317, + "24": 0.27474, + "25": 0.27496, + "26": 0.27426, + "27": 0.28705, + "28": 0.2814, + "29": 0.28559, + "30": 0.28098, + "31": 0.29666, + "32": 0.28302, + "33": 0.28642, + "34": 0.28282, + "35": 0.28457, + "36": 0.2843, + "37": 0.27728, + "38": 0.2746, + "39": 0.2774, + "40": 0.27644, + "41": 0.27658, + "42": 0.27835, + "43": 0.27776, + "44": 0.27654, + "45": 0.27705, + "46": 0.27383, + "47": 0.27806, + "48": 0.27418, + "49": 0.27617, + "50": 0.27185 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..f64661824cb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759898624.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.71096, + "2": 0.39649, + "3": 0.33228, + "4": 0.33042, + "5": 0.33036, + "6": 0.3326, + "7": 0.33962, + "8": 0.37041, + "9": 0.33077, + "10": 0.33179, + "11": 0.33053, + "12": 0.33332, + "13": 0.33149, + "14": 0.32928, + "15": 0.33252, + "16": 0.3321, + "17": 0.32661, + "18": 0.32933, + "19": 0.32718, + "20": 0.32982, + "21": 0.32827, + "22": 0.3313, + "23": 0.32836, + "24": 0.3287, + "25": 0.33025, + "26": 0.32605, + "27": 0.33501, + "28": 0.32889, + "29": 0.32971, + "30": 0.3318, + "31": 0.33458, + "32": 0.33222, + "33": 0.33434, + "34": 0.3337, + "35": 0.33221, + "36": 0.32984, + "37": 0.32779, + "38": 0.33131, + "39": 0.33056, + "40": 0.32941, + "41": 0.32351, + "42": 0.32946, + "43": 0.32913, + "44": 0.3283, + "45": 0.32845, + "46": 0.32474, + "47": 0.33097, + "48": 0.32791, + "49": 0.33143, + "50": 0.33005 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index e9d8d072b10..910068628d2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, - "5": 10.87856, - "10": 10.82981, - "15": 10.82051, + "2": 10.85873, + "3": 10.86283, + "4": 10.84011, + "5": 10.87855, + "6": 10.88851, + "7": 10.86537, + "8": 10.86017, + "9": 10.85989, + "10": 10.8298, + "11": 10.88947, + "12": 10.87508, + "13": 10.87426, + "14": 10.89677, + "15": 10.82053, + "16": 10.825, + "17": 10.78979, + "18": 10.81027, + "19": 10.80535, "20": 10.70395, + "21": 10.66991, + "22": 10.50641, + "23": 10.69004, + "24": 10.56305, "25": 10.49417, - "30": 10.30548, - "35": 10.20188, + "26": 10.56629, + "27": 10.58022, + "28": 10.51575, + "29": 10.55298, + "30": 10.30549, + "31": 10.02244, + "32": 10.40616, + "33": 10.39872, + "34": 10.1377, + "35": 10.20186, + "36": 10.16052, + "37": 10.28973, + "38": 10.11481, + "39": 10.36101, "40": 10.019, - "45": 9.7497, - "50": 9.83994 + "41": 10.07294, + "42": 10.14697, + "43": 9.74685, + "44": 9.87762, + "45": 9.74969, + "46": 9.73382, + "47": 10.07533, + "48": 9.78067, + "49": 9.44782, + "50": 9.83992 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 599.0, - "5": 640.0, - "10": 529.0, - "15": 691.0, - "20": 644.0, - "25": 573.0, - "30": 712.0, - "35": 736.0, - "40": 797.0, - "45": 764.0, - "50": 822.0 + "1": 601.0, + "2": 613.0, + "3": 655.0, + "4": 593.0, + "5": 678.0, + "6": 642.0, + "7": 620.0, + "8": 549.0, + "9": 640.0, + "10": 502.0, + "11": 660.0, + "12": 645.0, + "13": 615.0, + "14": 696.0, + "15": 670.0, + "16": 631.0, + "17": 648.0, + "18": 611.0, + "19": 605.0, + "20": 621.0, + "21": 673.0, + "22": 661.0, + "23": 715.0, + "24": 654.0, + "25": 594.0, + "26": 589.0, + "27": 648.0, + "28": 690.0, + "29": 755.0, + "30": 678.0, + "31": 584.0, + "32": 712.0, + "33": 793.0, + "34": 765.0, + "35": 738.0, + "36": 737.0, + "37": 868.0, + "38": 726.0, + "39": 868.0, + "40": 809.0, + "41": 833.0, + "42": 806.0, + "43": 783.0, + "44": 785.0, + "45": 800.0, + "46": 875.0, + "47": 903.0, + "48": 899.0, + "49": 878.0, + "50": 873.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 756752896.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 19.51044, - "5": 0.29555, - "10": 0.28638, - "15": 0.2812, - "20": 0.28547, - "25": 0.28087, - "30": 0.28444, - "35": 0.28059, - "40": 0.28626, - "45": 0.28541, - "50": 0.2861 + "1": 18.51483, + "2": 0.38305, + "3": 0.31916, + "4": 0.33028, + "5": 0.34426, + "6": 0.35623, + "7": 0.32503, + "8": 0.32084, + "9": 0.32047, + "10": 0.32595, + "11": 0.32652, + "12": 0.32296, + "13": 0.32617, + "14": 0.32833, + "15": 0.32492, + "16": 0.32302, + "17": 0.32458, + "18": 0.32598, + "19": 0.32565, + "20": 0.32747, + "21": 0.3272, + "22": 0.32863, + "23": 0.32847, + "24": 0.32664, + "25": 0.32485, + "26": 0.32858, + "27": 0.32665, + "28": 0.32434, + "29": 0.32998, + "30": 0.33789, + "31": 0.32692, + "32": 0.32521, + "33": 0.32521, + "34": 0.32786, + "35": 0.32813, + "36": 0.32665, + "37": 0.32466, + "38": 0.33006, + "39": 0.32341, + "40": 0.32787, + "41": 0.32762, + "42": 0.32448, + "43": 0.32181, + "44": 0.33035, + "45": 0.32497, + "46": 0.32334, + "47": 0.32904, + "48": 0.32458, + "49": 0.32391, + "50": 0.32652 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f0eb7547392 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86282, + "4": 10.84009, + "5": 10.87855, + "6": 10.88856, + "7": 10.86539, + "8": 10.86016, + "9": 10.85985, + "10": 10.82981, + "11": 10.8895, + "12": 10.87506, + "13": 10.87424, + "14": 10.89677, + "15": 10.82052, + "16": 10.825, + "17": 10.78983, + "18": 10.81027, + "19": 10.80534, + "20": 10.70395, + "21": 10.66987, + "22": 10.50641, + "23": 10.69005, + "24": 10.56316, + "25": 10.49414, + "26": 10.56627, + "27": 10.58026, + "28": 10.51573, + "29": 10.55295, + "30": 10.30554, + "31": 10.02245, + "32": 10.40617, + "33": 10.39881, + "34": 10.13768, + "35": 10.20187, + "36": 10.16048, + "37": 10.28976, + "38": 10.1148, + "39": 10.361, + "40": 10.019, + "41": 10.07292, + "42": 10.14692, + "43": 9.74685, + "44": 9.8776, + "45": 9.74967, + "46": 9.73383, + "47": 10.07533, + "48": 9.78069, + "49": 9.44781, + "50": 9.83988 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 615.0, + "2": 640.0, + "3": 586.0, + "4": 621.0, + "5": 619.0, + "6": 683.0, + "7": 667.0, + "8": 564.0, + "9": 646.0, + "10": 540.0, + "11": 654.0, + "12": 647.0, + "13": 656.0, + "14": 652.0, + "15": 658.0, + "16": 624.0, + "17": 657.0, + "18": 621.0, + "19": 555.0, + "20": 613.0, + "21": 643.0, + "22": 626.0, + "23": 749.0, + "24": 638.0, + "25": 562.0, + "26": 613.0, + "27": 653.0, + "28": 668.0, + "29": 780.0, + "30": 710.0, + "31": 577.0, + "32": 719.0, + "33": 821.0, + "34": 708.0, + "35": 690.0, + "36": 697.0, + "37": 878.0, + "38": 734.0, + "39": 867.0, + "40": 810.0, + "41": 837.0, + "42": 829.0, + "43": 687.0, + "44": 782.0, + "45": 761.0, + "46": 856.0, + "47": 896.0, + "48": 904.0, + "49": 841.0, + "50": 838.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 757799936.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.15382, + "2": 0.382, + "3": 0.2953, + "4": 0.30669, + "5": 0.2864, + "6": 0.28721, + "7": 0.28819, + "8": 0.28856, + "9": 0.3024, + "10": 0.29011, + "11": 0.29044, + "12": 0.28948, + "13": 0.29391, + "14": 0.29381, + "15": 0.29174, + "16": 0.29101, + "17": 0.29087, + "18": 0.30622, + "19": 0.28768, + "20": 0.29439, + "21": 0.28914, + "22": 0.28729, + "23": 0.28503, + "24": 0.28932, + "25": 0.28325, + "26": 0.2863, + "27": 0.28599, + "28": 0.28766, + "29": 0.28539, + "30": 0.28326, + "31": 0.2833, + "32": 0.28222, + "33": 0.28588, + "34": 0.28764, + "35": 0.28697, + "36": 0.28266, + "37": 0.2825, + "38": 0.28576, + "39": 0.28329, + "40": 0.28369, + "41": 0.28375, + "42": 0.28077, + "43": 0.28714, + "44": 0.28289, + "45": 0.28552, + "46": 0.28119, + "47": 0.28252, + "48": 0.28882, + "49": 0.30153, + "50": 0.299 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..cc1700ed493 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86285, + "4": 10.84007, + "5": 10.87854, + "6": 10.88852, + "7": 10.86537, + "8": 10.86015, + "9": 10.85985, + "10": 10.82982, + "11": 10.88949, + "12": 10.87509, + "13": 10.87426, + "14": 10.89674, + "15": 10.82054, + "16": 10.82501, + "17": 10.78985, + "18": 10.81032, + "19": 10.8053, + "20": 10.70397, + "21": 10.66986, + "22": 10.50641, + "23": 10.69001, + "24": 10.56317, + "25": 10.49421, + "26": 10.56628, + "27": 10.58022, + "28": 10.51574, + "29": 10.55292, + "30": 10.30549, + "31": 10.0225, + "32": 10.40617, + "33": 10.39874, + "34": 10.13772, + "35": 10.20187, + "36": 10.16045, + "37": 10.28977, + "38": 10.11478, + "39": 10.36101, + "40": 10.01903, + "41": 10.07294, + "42": 10.14691, + "43": 9.74683, + "44": 9.87762, + "45": 9.74966, + "46": 9.73384, + "47": 10.07535, + "48": 9.78069, + "49": 9.44783, + "50": 9.83992 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 607.0, + "2": 628.0, + "3": 600.0, + "4": 658.0, + "5": 657.0, + "6": 707.0, + "7": 637.0, + "8": 593.0, + "9": 632.0, + "10": 553.0, + "11": 641.0, + "12": 631.0, + "13": 676.0, + "14": 643.0, + "15": 623.0, + "16": 611.0, + "17": 687.0, + "18": 622.0, + "19": 581.0, + "20": 609.0, + "21": 652.0, + "22": 621.0, + "23": 800.0, + "24": 618.0, + "25": 623.0, + "26": 595.0, + "27": 679.0, + "28": 726.0, + "29": 719.0, + "30": 723.0, + "31": 624.0, + "32": 737.0, + "33": 776.0, + "34": 713.0, + "35": 696.0, + "36": 759.0, + "37": 829.0, + "38": 784.0, + "39": 798.0, + "40": 813.0, + "41": 814.0, + "42": 880.0, + "43": 780.0, + "44": 775.0, + "45": 759.0, + "46": 849.0, + "47": 938.0, + "48": 876.0, + "49": 886.0, + "50": 817.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.67374, + "2": 0.33434, + "3": 0.32862, + "4": 0.3312, + "5": 0.32463, + "6": 0.33221, + "7": 0.33167, + "8": 0.32476, + "9": 0.32742, + "10": 0.32327, + "11": 0.31599, + "12": 0.32511, + "13": 0.32273, + "14": 0.31956, + "15": 0.32777, + "16": 0.32745, + "17": 0.31743, + "18": 0.32418, + "19": 0.32759, + "20": 0.32696, + "21": 0.32321, + "22": 0.32923, + "23": 0.32125, + "24": 0.32088, + "25": 0.32288, + "26": 0.31739, + "27": 0.33667, + "28": 0.32586, + "29": 0.31738, + "30": 0.31392, + "31": 0.32116, + "32": 0.31637, + "33": 0.32029, + "34": 0.32057, + "35": 0.31739, + "36": 0.31341, + "37": 0.32121, + "38": 0.326, + "39": 0.31692, + "40": 0.31511, + "41": 0.32216, + "42": 0.31654, + "43": 0.32474, + "44": 0.32162, + "45": 0.31451, + "46": 0.31434, + "47": 0.32885, + "48": 0.31603, + "49": 0.31732, + "50": 0.3234 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..2ac9a4a8d47 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91348, + "4": 10.90322, + "5": 10.92969, + "6": 10.93655, + "7": 10.90277, + "8": 10.92116, + "9": 10.90706, + "10": 10.90473, + "11": 10.88783, + "12": 10.91738, + "13": 10.9119, + "14": 10.91506, + "15": 10.87123, + "16": 10.86131, + "17": 10.82698, + "18": 10.85674, + "19": 10.84055, + "20": 10.74998, + "21": 10.71508, + "22": 10.58112, + "23": 10.72642, + "24": 10.60722, + "25": 10.53752, + "26": 10.61072, + "27": 10.59927, + "28": 10.54955, + "29": 10.56605, + "30": 10.32547, + "31": 10.06698, + "32": 10.43807, + "33": 10.42361, + "34": 10.16018, + "35": 10.22893, + "36": 10.17616, + "37": 10.29235, + "38": 10.13293, + "39": 10.34957, + "40": 10.01973, + "41": 10.07533, + "42": 10.15408, + "43": 9.76085, + "44": 9.88357, + "45": 9.75546, + "46": 9.74963, + "47": 10.07546, + "48": 9.77937, + "49": 9.43813, + "50": 9.84068 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 568.0, + "2": 600.0, + "3": 624.0, + "4": 589.0, + "5": 692.0, + "6": 705.0, + "7": 662.0, + "8": 616.0, + "9": 679.0, + "10": 508.0, + "11": 703.0, + "12": 638.0, + "13": 678.0, + "14": 649.0, + "15": 659.0, + "16": 606.0, + "17": 663.0, + "18": 613.0, + "19": 615.0, + "20": 598.0, + "21": 639.0, + "22": 628.0, + "23": 675.0, + "24": 590.0, + "25": 595.0, + "26": 588.0, + "27": 678.0, + "28": 687.0, + "29": 688.0, + "30": 681.0, + "31": 618.0, + "32": 706.0, + "33": 758.0, + "34": 683.0, + "35": 741.0, + "36": 694.0, + "37": 819.0, + "38": 786.0, + "39": 866.0, + "40": 779.0, + "41": 838.0, + "42": 837.0, + "43": 695.0, + "44": 716.0, + "45": 738.0, + "46": 802.0, + "47": 926.0, + "48": 854.0, + "49": 811.0, + "50": 807.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 678383616.0, + "2": 854262272.0, + "3": 854262272.0, + "4": 854262272.0, + "5": 854262272.0, + "6": 854262272.0, + "7": 855309824.0, + "8": 855309824.0, + "9": 855309824.0, + "10": 855309824.0, + "11": 855309824.0, + "12": 855309824.0, + "13": 855309824.0, + "14": 855310848.0, + "15": 855310848.0, + "16": 855310848.0, + "17": 855310848.0, + "18": 855310848.0, + "19": 855310848.0, + "20": 855310848.0, + "21": 855310848.0, + "22": 855310848.0, + "23": 855310848.0, + "24": 855310848.0, + "25": 855310848.0, + "26": 855310848.0, + "27": 855310848.0, + "28": 855310848.0, + "29": 855310848.0, + "30": 855310848.0, + "31": 855310848.0, + "32": 855310848.0, + "33": 855310848.0, + "34": 855310848.0, + "35": 855310848.0, + "36": 855310848.0, + "37": 855310848.0, + "38": 855310848.0, + "39": 855310848.0, + "40": 855310848.0, + "41": 855310848.0, + "42": 855310848.0, + "43": 855310848.0, + "44": 855310848.0, + "45": 855310848.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 23.53527, + "2": 0.45843, + "3": 0.41722, + "4": 0.41343, + "5": 0.43098, + "6": 0.41032, + "7": 0.42789, + "8": 0.4109, + "9": 0.41334, + "10": 0.42277, + "11": 0.41109, + "12": 0.4255, + "13": 0.41083, + "14": 0.41498, + "15": 0.4158, + "16": 0.40724, + "17": 0.42608, + "18": 0.40815, + "19": 0.41361, + "20": 0.40774, + "21": 0.41448, + "22": 0.42245, + "23": 0.40681, + "24": 0.41744, + "25": 0.41008, + "26": 0.41229, + "27": 0.42006, + "28": 0.40569, + "29": 0.44026, + "30": 0.40835, + "31": 0.41007, + "32": 0.41186, + "33": 0.40618, + "34": 0.42247, + "35": 0.40587, + "36": 0.41189, + "37": 0.40876, + "38": 0.41309, + "39": 0.42068, + "40": 0.40576, + "41": 0.41665, + "42": 0.40588, + "43": 0.41519, + "44": 0.41465, + "45": 0.63205, + "46": 0.42162, + "47": 0.41448, + "48": 0.42206, + "49": 0.41268, + "50": 0.41606 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..1e9b2b8989e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91345, + "4": 10.90321, + "5": 10.92971, + "6": 10.93655, + "7": 10.90279, + "8": 10.92115, + "9": 10.90703, + "10": 10.90476, + "11": 10.88787, + "12": 10.91736, + "13": 10.91188, + "14": 10.91505, + "15": 10.87126, + "16": 10.86126, + "17": 10.82696, + "18": 10.85675, + "19": 10.8406, + "20": 10.74999, + "21": 10.71507, + "22": 10.58116, + "23": 10.72641, + "24": 10.60728, + "25": 10.53754, + "26": 10.61066, + "27": 10.59928, + "28": 10.54957, + "29": 10.56599, + "30": 10.32553, + "31": 10.06697, + "32": 10.43809, + "33": 10.42361, + "34": 10.16014, + "35": 10.22896, + "36": 10.17612, + "37": 10.29237, + "38": 10.13298, + "39": 10.34958, + "40": 10.01972, + "41": 10.07534, + "42": 10.1541, + "43": 9.76093, + "44": 9.8836, + "45": 9.75546, + "46": 9.74961, + "47": 10.07546, + "48": 9.77936, + "49": 9.43816, + "50": 9.84073 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 565.0, + "2": 625.0, + "3": 618.0, + "4": 618.0, + "5": 630.0, + "6": 653.0, + "7": 581.0, + "8": 630.0, + "9": 648.0, + "10": 502.0, + "11": 696.0, + "12": 653.0, + "13": 680.0, + "14": 629.0, + "15": 599.0, + "16": 670.0, + "17": 649.0, + "18": 580.0, + "19": 594.0, + "20": 578.0, + "21": 616.0, + "22": 609.0, + "23": 655.0, + "24": 611.0, + "25": 593.0, + "26": 595.0, + "27": 660.0, + "28": 756.0, + "29": 745.0, + "30": 691.0, + "31": 611.0, + "32": 676.0, + "33": 767.0, + "34": 669.0, + "35": 757.0, + "36": 794.0, + "37": 793.0, + "38": 778.0, + "39": 833.0, + "40": 785.0, + "41": 787.0, + "42": 769.0, + "43": 751.0, + "44": 714.0, + "45": 769.0, + "46": 835.0, + "47": 902.0, + "48": 853.0, + "49": 807.0, + "50": 823.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677335040.0, + "2": 854262784.0, + "3": 854262784.0, + "4": 854262784.0, + "5": 854262784.0, + "6": 854262784.0, + "7": 854262784.0, + "8": 854262784.0, + "9": 854262784.0, + "10": 854262784.0, + "11": 854262784.0, + "12": 854262784.0, + "13": 854262784.0, + "14": 854262784.0, + "15": 854262784.0, + "16": 854262784.0, + "17": 854262784.0, + "18": 854262784.0, + "19": 854262784.0, + "20": 854262784.0, + "21": 854262784.0, + "22": 854262784.0, + "23": 854262784.0, + "24": 854262784.0, + "25": 854262784.0, + "26": 854262784.0, + "27": 854262784.0, + "28": 854262784.0, + "29": 854262784.0, + "30": 854262784.0, + "31": 854262784.0, + "32": 854262784.0, + "33": 854262784.0, + "34": 854262784.0, + "35": 854262784.0, + "36": 855311360.0, + "37": 855311360.0, + "38": 855311360.0, + "39": 855311360.0, + "40": 855311360.0, + "41": 855311360.0, + "42": 855311360.0, + "43": 855311360.0, + "44": 855311360.0, + "45": 855311360.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.54291, + "2": 0.45304, + "3": 0.40799, + "4": 0.41533, + "5": 0.59635, + "6": 0.41138, + "7": 0.41402, + "8": 0.41118, + "9": 0.41133, + "10": 0.41277, + "11": 0.41021, + "12": 0.41466, + "13": 0.40958, + "14": 0.40717, + "15": 0.40964, + "16": 0.40616, + "17": 0.41407, + "18": 0.40562, + "19": 0.40279, + "20": 0.40656, + "21": 0.40188, + "22": 0.4164, + "23": 0.40487, + "24": 0.41094, + "25": 0.4165, + "26": 0.40755, + "27": 0.41769, + "28": 0.40789, + "29": 0.41516, + "30": 0.41364, + "31": 0.41649, + "32": 0.4104, + "33": 0.40992, + "34": 0.41619, + "35": 0.41207, + "36": 0.40835, + "37": 0.41126, + "38": 0.40711, + "39": 0.4143, + "40": 0.40503, + "41": 0.40421, + "42": 0.40304, + "43": 0.39915, + "44": 0.41215, + "45": 0.40298, + "46": 0.40298, + "47": 0.611, + "48": 0.39997, + "49": 0.40324, + "50": 0.40197 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index ecd9a58df01..5fd95d06800 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, "50": 9.8399 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, "50": 848.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 512262656.0, - "5": 512262656.0, - "10": 512262656.0, - "15": 512262656.0, - "20": 512262656.0, - "25": 512262656.0, - "30": 512262656.0, - "35": 512262656.0, - "40": 512262656.0, - "45": 512262656.0, - "50": 512262656.0 + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 755703296.0, - "5": 941019136.0, - "10": 941019136.0, - "15": 941020160.0, - "20": 941020160.0, - "25": 941020160.0, - "30": 941020160.0, - "35": 941020160.0, - "40": 941020160.0, - "45": 941020160.0, - "50": 941020160.0 + "1": 756752896.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 15.67966, - "5": 0.28203, - "10": 0.27605, - "15": 0.28683, - "20": 0.2914, - "25": 0.28469, - "30": 0.2918, - "35": 0.28556, - "40": 0.28361, - "45": 0.28565, - "50": 0.28831 + "1": 17.87202, + "2": 0.35495, + "3": 0.32873, + "4": 0.33459, + "5": 0.32873, + "6": 0.33081, + "7": 0.33232, + "8": 0.3289, + "9": 0.33298, + "10": 0.33358, + "11": 0.33283, + "12": 0.33379, + "13": 0.33111, + "14": 0.3333, + "15": 0.33177, + "16": 0.33147, + "17": 0.33096, + "18": 0.33187, + "19": 0.33163, + "20": 0.33051, + "21": 0.33361, + "22": 0.32835, + "23": 0.32736, + "24": 0.32984, + "25": 0.32922, + "26": 0.32419, + "27": 0.32825, + "28": 0.33117, + "29": 0.32926, + "30": 0.32943, + "31": 0.33565, + "32": 0.33382, + "33": 0.33313, + "34": 0.33602, + "35": 0.32634, + "36": 0.33173, + "37": 0.33173, + "38": 0.33145, + "39": 0.32666, + "40": 0.33039, + "41": 0.3278, + "42": 0.32774, + "43": 0.33361, + "44": 0.32996, + "45": 0.32769, + "46": 0.3288, + "47": 0.33016, + "48": 0.33102, + "49": 0.33052, + "50": 0.33008 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..3730bf58aa1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 934203392.0, + "39": 934203392.0, + "40": 934203392.0, + "41": 934203392.0, + "42": 934203392.0, + "43": 934203392.0, + "44": 934203392.0, + "45": 934203392.0, + "46": 934203392.0, + "47": 934203392.0, + "48": 934203392.0, + "49": 934203392.0, + "50": 934203392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.70462, + "2": 0.49178, + "3": 0.30373, + "4": 0.3001, + "5": 0.29469, + "6": 0.29224, + "7": 0.29428, + "8": 0.29177, + "9": 0.2949, + "10": 0.29498, + "11": 0.29024, + "12": 0.28647, + "13": 0.29815, + "14": 0.28835, + "15": 0.28856, + "16": 0.29348, + "17": 0.28749, + "18": 0.28567, + "19": 0.28368, + "20": 0.29149, + "21": 0.29096, + "22": 0.28857, + "23": 0.28606, + "24": 0.29136, + "25": 0.29054, + "26": 0.28694, + "27": 0.28152, + "28": 0.28851, + "29": 0.28838, + "30": 0.2819, + "31": 0.29168, + "32": 0.28475, + "33": 0.28928, + "34": 0.32279, + "35": 0.28586, + "36": 0.2887, + "37": 0.2901, + "38": 0.29895, + "39": 0.28981, + "40": 0.28651, + "41": 0.30755, + "42": 0.3078, + "43": 0.30107, + "44": 0.28402, + "45": 0.28696, + "46": 0.28819, + "47": 0.2889, + "48": 0.28688, + "49": 0.28638, + "50": 0.28429 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..cd45ff021d9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86284, + "4": 10.84009, + "5": 10.87856, + "6": 10.88856, + "7": 10.86532, + "8": 10.86017, + "9": 10.8599, + "10": 10.82981, + "11": 10.8895, + "12": 10.8751, + "13": 10.87423, + "14": 10.89675, + "15": 10.82054, + "16": 10.82504, + "17": 10.78983, + "18": 10.81029, + "19": 10.80535, + "20": 10.70398, + "21": 10.66993, + "22": 10.50643, + "23": 10.69004, + "24": 10.56314, + "25": 10.4942, + "26": 10.56628, + "27": 10.58025, + "28": 10.51571, + "29": 10.55299, + "30": 10.30549, + "31": 10.02245, + "32": 10.40614, + "33": 10.39874, + "34": 10.13771, + "35": 10.20184, + "36": 10.16052, + "37": 10.28973, + "38": 10.11474, + "39": 10.361, + "40": 10.01903, + "41": 10.07292, + "42": 10.14698, + "43": 9.74687, + "44": 9.87766, + "45": 9.74966, + "46": 9.73383, + "47": 10.07535, + "48": 9.78068, + "49": 9.44784, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 653.0, + "2": 642.0, + "3": 630.0, + "4": 585.0, + "5": 635.0, + "6": 687.0, + "7": 615.0, + "8": 601.0, + "9": 607.0, + "10": 522.0, + "11": 637.0, + "12": 675.0, + "13": 649.0, + "14": 648.0, + "15": 640.0, + "16": 602.0, + "17": 668.0, + "18": 634.0, + "19": 593.0, + "20": 579.0, + "21": 633.0, + "22": 597.0, + "23": 756.0, + "24": 612.0, + "25": 591.0, + "26": 620.0, + "27": 700.0, + "28": 705.0, + "29": 795.0, + "30": 752.0, + "31": 628.0, + "32": 712.0, + "33": 752.0, + "34": 737.0, + "35": 741.0, + "36": 770.0, + "37": 861.0, + "38": 823.0, + "39": 812.0, + "40": 814.0, + "41": 826.0, + "42": 801.0, + "43": 769.0, + "44": 822.0, + "45": 777.0, + "46": 828.0, + "47": 878.0, + "48": 915.0, + "49": 908.0, + "50": 848.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 934201856.0, + "34": 934201856.0, + "35": 934201856.0, + "36": 934201856.0, + "37": 934201856.0, + "38": 934201856.0, + "39": 934201856.0, + "40": 934201856.0, + "41": 934201856.0, + "42": 934201856.0, + "43": 934201856.0, + "44": 934201856.0, + "45": 934201856.0, + "46": 934201856.0, + "47": 934201856.0, + "48": 934201856.0, + "49": 934201856.0, + "50": 934201856.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.72917, + "2": 0.36269, + "3": 0.33585, + "4": 0.33878, + "5": 0.33758, + "6": 0.33453, + "7": 0.33628, + "8": 0.33416, + "9": 0.33309, + "10": 0.33521, + "11": 0.33536, + "12": 0.33148, + "13": 0.33565, + "14": 0.33401, + "15": 0.33029, + "16": 0.33788, + "17": 0.33302, + "18": 0.33337, + "19": 0.33761, + "20": 0.33672, + "21": 0.33256, + "22": 0.3374, + "23": 0.33652, + "24": 0.33672, + "25": 0.33982, + "26": 0.3335, + "27": 0.3328, + "28": 0.33835, + "29": 0.33338, + "30": 0.33371, + "31": 0.33991, + "32": 0.33259, + "33": 0.33537, + "34": 0.33777, + "35": 0.33494, + "36": 0.33504, + "37": 0.33915, + "38": 0.33462, + "39": 0.33387, + "40": 0.33791, + "41": 0.33426, + "42": 0.33834, + "43": 0.33785, + "44": 0.32761, + "45": 0.32857, + "46": 0.33205, + "47": 0.3355, + "48": 0.33535, + "49": 0.33792, + "50": 0.33613 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 7d91181b5b6..7f2dfc8b2bc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, + "2": 10.85873, + "3": 10.86283, + "4": 10.84007, "5": 10.87856, - "10": 10.82982, - "15": 10.82057, - "20": 10.70395, - "25": 10.49424, - "30": 10.30548, + "6": 10.88854, + "7": 10.86537, + "8": 10.86016, + "9": 10.85989, + "10": 10.82983, + "11": 10.88946, + "12": 10.8751, + "13": 10.87425, + "14": 10.89673, + "15": 10.82054, + "16": 10.82498, + "17": 10.78981, + "18": 10.81028, + "19": 10.80532, + "20": 10.70399, + "21": 10.66989, + "22": 10.50644, + "23": 10.69005, + "24": 10.56315, + "25": 10.49423, + "26": 10.56628, + "27": 10.58023, + "28": 10.51568, + "29": 10.55294, + "30": 10.30549, + "31": 10.02244, + "32": 10.40614, + "33": 10.39877, + "34": 10.13771, "35": 10.20187, - "40": 10.01905, - "45": 9.74965, - "50": 9.83993 + "36": 10.16047, + "37": 10.28971, + "38": 10.11478, + "39": 10.36106, + "40": 10.01903, + "41": 10.0729, + "42": 10.14696, + "43": 9.74682, + "44": 9.87762, + "45": 9.74966, + "46": 9.73383, + "47": 10.07536, + "48": 9.7807, + "49": 9.44779, + "50": 9.83987 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 628.0, - "5": 596.0, - "10": 550.0, - "15": 668.0, - "20": 597.0, - "25": 596.0, - "30": 721.0, - "35": 733.0, - "40": 770.0, - "45": 787.0, - "50": 834.0 + "1": 603.0, + "2": 644.0, + "3": 642.0, + "4": 665.0, + "5": 647.0, + "6": 668.0, + "7": 615.0, + "8": 545.0, + "9": 591.0, + "10": 540.0, + "11": 689.0, + "12": 629.0, + "13": 696.0, + "14": 658.0, + "15": 592.0, + "16": 672.0, + "17": 674.0, + "18": 623.0, + "19": 635.0, + "20": 573.0, + "21": 651.0, + "22": 625.0, + "23": 761.0, + "24": 631.0, + "25": 593.0, + "26": 614.0, + "27": 646.0, + "28": 744.0, + "29": 756.0, + "30": 699.0, + "31": 600.0, + "32": 686.0, + "33": 777.0, + "34": 734.0, + "35": 765.0, + "36": 763.0, + "37": 876.0, + "38": 802.0, + "39": 832.0, + "40": 788.0, + "41": 811.0, + "42": 850.0, + "43": 765.0, + "44": 854.0, + "45": 853.0, + "46": 878.0, + "47": 862.0, + "48": 881.0, + "49": 859.0, + "50": 919.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, "10": 933156352.0, - "15": 934202368.0, - "20": 934202368.0, - "25": 934202368.0, - "30": 934202368.0, - "35": 934202368.0, - "40": 934202368.0, - "45": 934202368.0, - "50": 934202368.0 + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 18.4128, - "5": 0.28948, - "10": 0.28908, - "15": 0.29449, - "20": 0.2915, - "25": 0.29014, - "30": 0.29089, - "35": 0.2912, - "40": 0.29097, - "45": 0.28976, - "50": 0.28881 + "1": 17.48669, + "2": 0.35686, + "3": 0.33796, + "4": 0.33709, + "5": 0.33802, + "6": 0.33381, + "7": 0.33842, + "8": 0.3348, + "9": 0.33686, + "10": 0.3401, + "11": 0.34206, + "12": 0.33741, + "13": 0.34235, + "14": 0.33743, + "15": 0.34813, + "16": 0.342, + "17": 0.33354, + "18": 0.33386, + "19": 0.32453, + "20": 0.31766, + "21": 0.31357, + "22": 0.3174, + "23": 0.31757, + "24": 0.31831, + "25": 0.3365, + "26": 0.33734, + "27": 0.33686, + "28": 0.32433, + "29": 0.3211, + "30": 0.31641, + "31": 0.32085, + "32": 0.32356, + "33": 0.31983, + "34": 0.31994, + "35": 0.32561, + "36": 0.3216, + "37": 0.31934, + "38": 0.31931, + "39": 0.32259, + "40": 0.31785, + "41": 0.321, + "42": 0.32432, + "43": 0.32102, + "44": 0.31762, + "45": 0.32401, + "46": 0.32061, + "47": 0.3186, + "48": 0.32263, + "49": 0.31974, + "50": 0.31888 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..5c64711360d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86283, + "4": 10.84006, + "5": 10.87853, + "6": 10.88852, + "7": 10.86537, + "8": 10.86018, + "9": 10.85991, + "10": 10.82984, + "11": 10.88948, + "12": 10.87506, + "13": 10.87427, + "14": 10.8968, + "15": 10.82052, + "16": 10.82498, + "17": 10.78984, + "18": 10.8103, + "19": 10.80531, + "20": 10.70396, + "21": 10.66991, + "22": 10.50642, + "23": 10.69005, + "24": 10.56311, + "25": 10.49418, + "26": 10.56624, + "27": 10.58025, + "28": 10.51574, + "29": 10.55295, + "30": 10.3055, + "31": 10.0225, + "32": 10.40617, + "33": 10.39874, + "34": 10.13767, + "35": 10.20188, + "36": 10.16051, + "37": 10.28971, + "38": 10.11484, + "39": 10.361, + "40": 10.01901, + "41": 10.07292, + "42": 10.14698, + "43": 9.74684, + "44": 9.87759, + "45": 9.74966, + "46": 9.73384, + "47": 10.07536, + "48": 9.78071, + "49": 9.44782, + "50": 9.83988 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 597.0, + "2": 639.0, + "3": 612.0, + "4": 595.0, + "5": 633.0, + "6": 679.0, + "7": 626.0, + "8": 555.0, + "9": 700.0, + "10": 529.0, + "11": 658.0, + "12": 622.0, + "13": 660.0, + "14": 622.0, + "15": 690.0, + "16": 639.0, + "17": 671.0, + "18": 653.0, + "19": 595.0, + "20": 584.0, + "21": 656.0, + "22": 560.0, + "23": 743.0, + "24": 616.0, + "25": 626.0, + "26": 623.0, + "27": 680.0, + "28": 680.0, + "29": 750.0, + "30": 690.0, + "31": 560.0, + "32": 794.0, + "33": 753.0, + "34": 693.0, + "35": 696.0, + "36": 760.0, + "37": 852.0, + "38": 792.0, + "39": 849.0, + "40": 773.0, + "41": 842.0, + "42": 798.0, + "43": 732.0, + "44": 751.0, + "45": 788.0, + "46": 834.0, + "47": 853.0, + "48": 888.0, + "49": 919.0, + "50": 813.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 934204928.0, + "11": 934204928.0, + "12": 934204928.0, + "13": 934204928.0, + "14": 934204928.0, + "15": 934204928.0, + "16": 934204928.0, + "17": 934204928.0, + "18": 934204928.0, + "19": 934204928.0, + "20": 934204928.0, + "21": 934204928.0, + "22": 934204928.0, + "23": 934204928.0, + "24": 934204928.0, + "25": 934204928.0, + "26": 934204928.0, + "27": 934204928.0, + "28": 934204928.0, + "29": 934204928.0, + "30": 934204928.0, + "31": 934204928.0, + "32": 934204928.0, + "33": 934204928.0, + "34": 934204928.0, + "35": 934204928.0, + "36": 934204928.0, + "37": 934204928.0, + "38": 934204928.0, + "39": 934204928.0, + "40": 934204928.0, + "41": 934204928.0, + "42": 934204928.0, + "43": 934204928.0, + "44": 934204928.0, + "45": 934204928.0, + "46": 934204928.0, + "47": 934204928.0, + "48": 934204928.0, + "49": 934204928.0, + "50": 934204928.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.56725, + "2": 0.36563, + "3": 0.29793, + "4": 0.29146, + "5": 0.29688, + "6": 0.29337, + "7": 0.29262, + "8": 0.28985, + "9": 0.29835, + "10": 0.32046, + "11": 0.28909, + "12": 0.29047, + "13": 0.29281, + "14": 0.29357, + "15": 0.29127, + "16": 0.29335, + "17": 0.29304, + "18": 0.29416, + "19": 0.29357, + "20": 0.29492, + "21": 0.28986, + "22": 0.29152, + "23": 0.29187, + "24": 0.29293, + "25": 0.28805, + "26": 0.28928, + "27": 0.28866, + "28": 0.29096, + "29": 0.28896, + "30": 0.2822, + "31": 0.31729, + "32": 0.28381, + "33": 0.28187, + "34": 0.28158, + "35": 0.28315, + "36": 0.28905, + "37": 0.28877, + "38": 0.29206, + "39": 0.28679, + "40": 0.28818, + "41": 0.28755, + "42": 0.28911, + "43": 0.28782, + "44": 0.28493, + "45": 0.28392, + "46": 0.28061, + "47": 0.29507, + "48": 0.28442, + "49": 0.28204, + "50": 0.28301 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..524007ed7d6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86283, + "4": 10.84007, + "5": 10.87854, + "6": 10.88853, + "7": 10.86532, + "8": 10.8602, + "9": 10.85991, + "10": 10.82981, + "11": 10.8895, + "12": 10.87507, + "13": 10.87426, + "14": 10.89678, + "15": 10.82054, + "16": 10.825, + "17": 10.7898, + "18": 10.8103, + "19": 10.80536, + "20": 10.70398, + "21": 10.66992, + "22": 10.50644, + "23": 10.69005, + "24": 10.5631, + "25": 10.49418, + "26": 10.56626, + "27": 10.58028, + "28": 10.51572, + "29": 10.55298, + "30": 10.30549, + "31": 10.02244, + "32": 10.40615, + "33": 10.3988, + "34": 10.13773, + "35": 10.20188, + "36": 10.1605, + "37": 10.28974, + "38": 10.11477, + "39": 10.36102, + "40": 10.01902, + "41": 10.07292, + "42": 10.14694, + "43": 9.74685, + "44": 9.87766, + "45": 9.74965, + "46": 9.73384, + "47": 10.07535, + "48": 9.7807, + "49": 9.44783, + "50": 9.83991 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 647.0, + "2": 614.0, + "3": 640.0, + "4": 603.0, + "5": 600.0, + "6": 683.0, + "7": 630.0, + "8": 565.0, + "9": 671.0, + "10": 531.0, + "11": 670.0, + "12": 643.0, + "13": 626.0, + "14": 635.0, + "15": 655.0, + "16": 643.0, + "17": 693.0, + "18": 634.0, + "19": 648.0, + "20": 644.0, + "21": 690.0, + "22": 606.0, + "23": 694.0, + "24": 565.0, + "25": 605.0, + "26": 636.0, + "27": 638.0, + "28": 721.0, + "29": 750.0, + "30": 760.0, + "31": 572.0, + "32": 705.0, + "33": 816.0, + "34": 737.0, + "35": 720.0, + "36": 710.0, + "37": 862.0, + "38": 763.0, + "39": 909.0, + "40": 795.0, + "41": 776.0, + "42": 858.0, + "43": 771.0, + "44": 858.0, + "45": 857.0, + "46": 864.0, + "47": 880.0, + "48": 923.0, + "49": 899.0, + "50": 868.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 757801472.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.58309, + "2": 0.34736, + "3": 0.32683, + "4": 0.3279, + "5": 0.32934, + "6": 0.33179, + "7": 0.3281, + "8": 0.3324, + "9": 0.32989, + "10": 0.32742, + "11": 0.33009, + "12": 0.3345, + "13": 0.33455, + "14": 0.3346, + "15": 0.33747, + "16": 0.33625, + "17": 0.3454, + "18": 0.33586, + "19": 0.33227, + "20": 0.33242, + "21": 0.33093, + "22": 0.33378, + "23": 0.33439, + "24": 0.33159, + "25": 0.32826, + "26": 0.33259, + "27": 0.33154, + "28": 0.32855, + "29": 0.32973, + "30": 0.33267, + "31": 0.33156, + "32": 0.32832, + "33": 0.33304, + "34": 0.32817, + "35": 0.32993, + "36": 0.33154, + "37": 0.32842, + "38": 0.32508, + "39": 0.33067, + "40": 0.33115, + "41": 0.32719, + "42": 0.33205, + "43": 0.3472, + "44": 0.33564, + "45": 0.33202, + "46": 0.33051, + "47": 0.32871, + "48": 0.33055, + "49": 0.33399, + "50": 0.33114 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..14cd1d474ea --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91347, + "4": 10.90321, + "5": 10.92968, + "6": 10.93655, + "7": 10.90282, + "8": 10.92114, + "9": 10.9071, + "10": 10.90475, + "11": 10.88788, + "12": 10.91736, + "13": 10.91189, + "14": 10.91506, + "15": 10.87125, + "16": 10.86126, + "17": 10.82696, + "18": 10.85678, + "19": 10.84055, + "20": 10.75, + "21": 10.71504, + "22": 10.58118, + "23": 10.72644, + "24": 10.60729, + "25": 10.53753, + "26": 10.61069, + "27": 10.5993, + "28": 10.54958, + "29": 10.56602, + "30": 10.32554, + "31": 10.06693, + "32": 10.4381, + "33": 10.42361, + "34": 10.16014, + "35": 10.22895, + "36": 10.17612, + "37": 10.29235, + "38": 10.13293, + "39": 10.34955, + "40": 10.01972, + "41": 10.07533, + "42": 10.1541, + "43": 9.76091, + "44": 9.88354, + "45": 9.75546, + "46": 9.7496, + "47": 10.07548, + "48": 9.77939, + "49": 9.43816, + "50": 9.84074 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 602.0, + "2": 601.0, + "3": 651.0, + "4": 566.0, + "5": 693.0, + "6": 637.0, + "7": 601.0, + "8": 628.0, + "9": 593.0, + "10": 579.0, + "11": 685.0, + "12": 630.0, + "13": 654.0, + "14": 624.0, + "15": 569.0, + "16": 630.0, + "17": 623.0, + "18": 588.0, + "19": 594.0, + "20": 599.0, + "21": 633.0, + "22": 585.0, + "23": 642.0, + "24": 613.0, + "25": 592.0, + "26": 662.0, + "27": 617.0, + "28": 709.0, + "29": 691.0, + "30": 693.0, + "31": 574.0, + "32": 708.0, + "33": 781.0, + "34": 693.0, + "35": 712.0, + "36": 777.0, + "37": 799.0, + "38": 765.0, + "39": 865.0, + "40": 811.0, + "41": 795.0, + "42": 818.0, + "43": 730.0, + "44": 730.0, + "45": 781.0, + "46": 788.0, + "47": 884.0, + "48": 833.0, + "49": 841.0, + "50": 839.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677335040.0, + "2": 853214208.0, + "3": 853214208.0, + "4": 853214208.0, + "5": 854262272.0, + "6": 854262272.0, + "7": 854262272.0, + "8": 854262272.0, + "9": 854262272.0, + "10": 854262272.0, + "11": 854262272.0, + "12": 854262272.0, + "13": 854262272.0, + "14": 854262272.0, + "15": 854262784.0, + "16": 854262784.0, + "17": 854262784.0, + "18": 854262784.0, + "19": 854262784.0, + "20": 854262784.0, + "21": 854262784.0, + "22": 855309824.0, + "23": 855309824.0, + "24": 855309824.0, + "25": 855309824.0, + "26": 855309824.0, + "27": 855309824.0, + "28": 855309824.0, + "29": 855309824.0, + "30": 855309824.0, + "31": 855309824.0, + "32": 855309824.0, + "33": 855309824.0, + "34": 855309824.0, + "35": 855309824.0, + "36": 855309824.0, + "37": 855309824.0, + "38": 855309824.0, + "39": 855309824.0, + "40": 855309824.0, + "41": 855309824.0, + "42": 855309824.0, + "43": 855309824.0, + "44": 855309824.0, + "45": 855309824.0, + "46": 855309824.0, + "47": 855309824.0, + "48": 855309824.0, + "49": 855309824.0, + "50": 855309824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.80821, + "2": 0.44808, + "3": 0.40988, + "4": 0.40164, + "5": 0.4125, + "6": 0.40088, + "7": 0.40048, + "8": 0.40898, + "9": 0.39981, + "10": 0.40981, + "11": 0.3988, + "12": 0.39912, + "13": 0.40567, + "14": 0.39849, + "15": 0.40867, + "16": 0.39758, + "17": 0.39933, + "18": 0.40941, + "19": 0.39811, + "20": 0.40972, + "21": 0.39879, + "22": 0.40217, + "23": 0.40454, + "24": 0.397, + "25": 0.4072, + "26": 0.39671, + "27": 0.3982, + "28": 0.40691, + "29": 0.39562, + "30": 0.40833, + "31": 0.39669, + "32": 0.39668, + "33": 0.40988, + "34": 0.39562, + "35": 0.41063, + "36": 0.39531, + "37": 0.39635, + "38": 0.41178, + "39": 0.39606, + "40": 0.41007, + "41": 0.39542, + "42": 0.39788, + "43": 0.41102, + "44": 0.3969, + "45": 0.41204, + "46": 0.39665, + "47": 0.39695, + "48": 0.41099, + "49": 0.39625, + "50": 0.4146 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..9c3dab558ec --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91345, + "4": 10.90321, + "5": 10.9297, + "6": 10.93657, + "7": 10.90281, + "8": 10.92116, + "9": 10.90702, + "10": 10.90475, + "11": 10.88789, + "12": 10.91738, + "13": 10.91188, + "14": 10.91509, + "15": 10.87126, + "16": 10.8613, + "17": 10.82702, + "18": 10.85677, + "19": 10.84056, + "20": 10.75001, + "21": 10.71508, + "22": 10.58113, + "23": 10.7264, + "24": 10.60734, + "25": 10.53754, + "26": 10.61068, + "27": 10.59932, + "28": 10.54956, + "29": 10.56601, + "30": 10.32552, + "31": 10.06698, + "32": 10.43809, + "33": 10.4236, + "34": 10.16018, + "35": 10.22896, + "36": 10.17616, + "37": 10.29237, + "38": 10.13292, + "39": 10.34956, + "40": 10.01975, + "41": 10.07535, + "42": 10.15409, + "43": 9.7609, + "44": 9.88356, + "45": 9.75543, + "46": 9.74958, + "47": 10.07545, + "48": 9.77939, + "49": 9.43818, + "50": 9.84071 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 618.0, + "2": 622.0, + "3": 667.0, + "4": 559.0, + "5": 671.0, + "6": 625.0, + "7": 656.0, + "8": 584.0, + "9": 654.0, + "10": 511.0, + "11": 690.0, + "12": 601.0, + "13": 628.0, + "14": 654.0, + "15": 604.0, + "16": 652.0, + "17": 646.0, + "18": 640.0, + "19": 579.0, + "20": 532.0, + "21": 644.0, + "22": 584.0, + "23": 649.0, + "24": 595.0, + "25": 614.0, + "26": 621.0, + "27": 648.0, + "28": 727.0, + "29": 683.0, + "30": 657.0, + "31": 553.0, + "32": 700.0, + "33": 776.0, + "34": 645.0, + "35": 729.0, + "36": 740.0, + "37": 733.0, + "38": 740.0, + "39": 816.0, + "40": 792.0, + "41": 769.0, + "42": 828.0, + "43": 740.0, + "44": 784.0, + "45": 761.0, + "46": 831.0, + "47": 833.0, + "48": 866.0, + "49": 819.0, + "50": 876.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 678382080.0, + "2": 854262784.0, + "3": 854262784.0, + "4": 855310848.0, + "5": 855310848.0, + "6": 855310848.0, + "7": 855310848.0, + "8": 855310848.0, + "9": 855310848.0, + "10": 855310848.0, + "11": 855310848.0, + "12": 855310848.0, + "13": 855310848.0, + "14": 855310848.0, + "15": 855310848.0, + "16": 855310848.0, + "17": 855310848.0, + "18": 855310848.0, + "19": 855310848.0, + "20": 855310848.0, + "21": 855310848.0, + "22": 855310848.0, + "23": 855310848.0, + "24": 855310848.0, + "25": 855310848.0, + "26": 855310848.0, + "27": 855310848.0, + "28": 855310848.0, + "29": 855310848.0, + "30": 855311360.0, + "31": 855311360.0, + "32": 855311360.0, + "33": 855311360.0, + "34": 855311360.0, + "35": 855311360.0, + "36": 855311360.0, + "37": 855311360.0, + "38": 855311360.0, + "39": 855311360.0, + "40": 855311360.0, + "41": 855311360.0, + "42": 855311360.0, + "43": 855311360.0, + "44": 855311360.0, + "45": 855311360.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.59672, + "2": 0.48034, + "3": 0.40738, + "4": 0.42161, + "5": 0.40858, + "6": 0.39543, + "7": 0.40287, + "8": 0.3966, + "9": 0.41138, + "10": 0.3986, + "11": 0.39331, + "12": 0.40756, + "13": 0.3935, + "14": 0.40339, + "15": 0.39322, + "16": 0.38875, + "17": 0.3989, + "18": 0.39441, + "19": 0.4034, + "20": 0.39017, + "21": 0.39088, + "22": 0.40266, + "23": 0.39396, + "24": 0.40055, + "25": 0.39308, + "26": 0.38936, + "27": 0.40304, + "28": 0.40539, + "29": 0.39709, + "30": 0.39502, + "31": 0.3928, + "32": 0.40816, + "33": 0.39533, + "34": 0.39686, + "35": 0.39825, + "36": 0.39554, + "37": 0.40729, + "38": 0.39634, + "39": 0.39853, + "40": 0.39904, + "41": 0.39615, + "42": 0.40732, + "43": 0.39538, + "44": 0.40115, + "45": 0.40237, + "46": 0.40262, + "47": 0.6094, + "48": 0.396, + "49": 0.40787, + "50": 0.3942 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 7bc5d3556fa..bb6bba8ed0e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, - "5": 10.87856, + "2": 10.85873, + "3": 10.8628, + "4": 10.84009, + "5": 10.87853, + "6": 10.88854, + "7": 10.86533, + "8": 10.86016, + "9": 10.85986, "10": 10.82978, - "15": 10.8205, - "20": 10.70397, - "25": 10.49419, - "30": 10.30553, - "35": 10.20189, - "40": 10.019, - "45": 9.74966, + "11": 10.88951, + "12": 10.8751, + "13": 10.87423, + "14": 10.89676, + "15": 10.82054, + "16": 10.82498, + "17": 10.78983, + "18": 10.8103, + "19": 10.80532, + "20": 10.70395, + "21": 10.66992, + "22": 10.50638, + "23": 10.69003, + "24": 10.5631, + "25": 10.4942, + "26": 10.56628, + "27": 10.58022, + "28": 10.51569, + "29": 10.55298, + "30": 10.30552, + "31": 10.02248, + "32": 10.40616, + "33": 10.39876, + "34": 10.13775, + "35": 10.20182, + "36": 10.16045, + "37": 10.28971, + "38": 10.11479, + "39": 10.36102, + "40": 10.01903, + "41": 10.07292, + "42": 10.14694, + "43": 9.74688, + "44": 9.87761, + "45": 9.74964, + "46": 9.73382, + "47": 10.07536, + "48": 9.78068, + "49": 9.44785, "50": 9.8399 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 592.0, - "5": 682.0, - "10": 528.0, - "15": 610.0, - "20": 610.0, - "25": 585.0, - "30": 690.0, - "35": 743.0, - "40": 769.0, - "45": 776.0, - "50": 793.0 + "1": 575.0, + "2": 661.0, + "3": 612.0, + "4": 601.0, + "5": 654.0, + "6": 680.0, + "7": 639.0, + "8": 567.0, + "9": 683.0, + "10": 559.0, + "11": 618.0, + "12": 620.0, + "13": 668.0, + "14": 681.0, + "15": 642.0, + "16": 637.0, + "17": 645.0, + "18": 610.0, + "19": 622.0, + "20": 611.0, + "21": 667.0, + "22": 590.0, + "23": 734.0, + "24": 615.0, + "25": 598.0, + "26": 634.0, + "27": 667.0, + "28": 675.0, + "29": 769.0, + "30": 715.0, + "31": 607.0, + "32": 763.0, + "33": 814.0, + "34": 694.0, + "35": 713.0, + "36": 780.0, + "37": 817.0, + "38": 759.0, + "39": 886.0, + "40": 790.0, + "41": 758.0, + "42": 895.0, + "43": 763.0, + "44": 846.0, + "45": 765.0, + "46": 822.0, + "47": 882.0, + "48": 890.0, + "49": 875.0, + "50": 829.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 759895552.0, - "5": 933156352.0, - "10": 933156352.0, - "15": 933156352.0, - "20": 933156352.0, - "25": 933156352.0, - "30": 933156352.0, - "35": 934204928.0, - "40": 934204928.0, - "45": 934204928.0, - "50": 934204928.0 + "2": 934203904.0, + "3": 934203904.0, + "4": 934203904.0, + "5": 934203904.0, + "6": 934203904.0, + "7": 934203904.0, + "8": 934203904.0, + "9": 934203904.0, + "10": 934203904.0, + "11": 934203904.0, + "12": 934203904.0, + "13": 934203904.0, + "14": 934203904.0, + "15": 934203904.0, + "16": 934203904.0, + "17": 934203904.0, + "18": 934203904.0, + "19": 934203904.0, + "20": 934203904.0, + "21": 934203904.0, + "22": 934203904.0, + "23": 934203904.0, + "24": 934203904.0, + "25": 934203904.0, + "26": 934203904.0, + "27": 934203904.0, + "28": 934203904.0, + "29": 934203904.0, + "30": 934203904.0, + "31": 934203904.0, + "32": 934203904.0, + "33": 934203904.0, + "34": 934203904.0, + "35": 934203904.0, + "36": 934203904.0, + "37": 934203904.0, + "38": 934203904.0, + "39": 934203904.0, + "40": 934203904.0, + "41": 934203904.0, + "42": 934203904.0, + "43": 934203904.0, + "44": 934203904.0, + "45": 934203904.0, + "46": 934203904.0, + "47": 934203904.0, + "48": 934203904.0, + "49": 934203904.0, + "50": 934203904.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 16.08421, - "5": 0.28702, - "10": 0.28776, - "15": 0.28313, - "20": 0.29045, - "25": 0.28998, - "30": 0.29456, - "35": 0.28602, - "40": 0.29367, - "45": 0.28709, - "50": 0.2778 + "1": 15.70977, + "2": 0.39393, + "3": 0.33447, + "4": 0.34165, + "5": 0.33487, + "6": 0.33525, + "7": 0.33869, + "8": 0.33407, + "9": 0.32508, + "10": 0.32918, + "11": 0.32205, + "12": 0.32514, + "13": 0.32309, + "14": 0.32866, + "15": 0.32578, + "16": 0.32709, + "17": 0.32494, + "18": 0.3252, + "19": 0.32806, + "20": 0.32441, + "21": 0.32296, + "22": 0.32925, + "23": 0.32839, + "24": 0.32762, + "25": 0.33125, + "26": 0.3356, + "27": 0.32827, + "28": 0.32644, + "29": 0.32972, + "30": 0.32228, + "31": 0.3298, + "32": 0.32343, + "33": 0.32498, + "34": 0.32618, + "35": 0.32714, + "36": 0.32467, + "37": 0.32506, + "38": 0.32635, + "39": 0.3247, + "40": 0.32635, + "41": 0.32613, + "42": 0.32304, + "43": 0.32555, + "44": 0.32911, + "45": 0.3247, + "46": 0.32199, + "47": 0.32475, + "48": 0.32466, + "49": 0.32582, + "50": 0.32505 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..8e79ecc164b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86283, + "4": 10.84004, + "5": 10.87856, + "6": 10.88851, + "7": 10.86535, + "8": 10.86016, + "9": 10.8599, + "10": 10.8298, + "11": 10.88949, + "12": 10.87507, + "13": 10.87424, + "14": 10.89675, + "15": 10.82057, + "16": 10.82503, + "17": 10.7898, + "18": 10.81025, + "19": 10.80535, + "20": 10.70398, + "21": 10.6699, + "22": 10.50643, + "23": 10.69004, + "24": 10.5631, + "25": 10.49418, + "26": 10.56626, + "27": 10.58022, + "28": 10.5157, + "29": 10.55297, + "30": 10.30551, + "31": 10.02249, + "32": 10.40617, + "33": 10.3988, + "34": 10.13771, + "35": 10.20187, + "36": 10.16052, + "37": 10.28969, + "38": 10.11482, + "39": 10.36105, + "40": 10.01899, + "41": 10.0729, + "42": 10.14695, + "43": 9.74686, + "44": 9.87766, + "45": 9.74967, + "46": 9.73385, + "47": 10.07539, + "48": 9.7807, + "49": 9.4478, + "50": 9.83992 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 594.0, + "2": 655.0, + "3": 626.0, + "4": 604.0, + "5": 612.0, + "6": 667.0, + "7": 653.0, + "8": 575.0, + "9": 673.0, + "10": 542.0, + "11": 672.0, + "12": 584.0, + "13": 616.0, + "14": 673.0, + "15": 695.0, + "16": 655.0, + "17": 640.0, + "18": 640.0, + "19": 637.0, + "20": 601.0, + "21": 680.0, + "22": 565.0, + "23": 706.0, + "24": 615.0, + "25": 603.0, + "26": 591.0, + "27": 653.0, + "28": 696.0, + "29": 781.0, + "30": 767.0, + "31": 608.0, + "32": 740.0, + "33": 839.0, + "34": 727.0, + "35": 729.0, + "36": 720.0, + "37": 821.0, + "38": 818.0, + "39": 826.0, + "40": 750.0, + "41": 855.0, + "42": 871.0, + "43": 719.0, + "44": 838.0, + "45": 761.0, + "46": 886.0, + "47": 852.0, + "48": 876.0, + "49": 905.0, + "50": 872.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 934202368.0, + "5": 934202368.0, + "6": 934202368.0, + "7": 934202368.0, + "8": 934202368.0, + "9": 934202368.0, + "10": 934202368.0, + "11": 934202368.0, + "12": 934202368.0, + "13": 934202368.0, + "14": 934202368.0, + "15": 934202368.0, + "16": 934202368.0, + "17": 934202368.0, + "18": 934202368.0, + "19": 934202368.0, + "20": 934202368.0, + "21": 934202368.0, + "22": 934202368.0, + "23": 934202368.0, + "24": 934202368.0, + "25": 934202368.0, + "26": 934202368.0, + "27": 934202368.0, + "28": 934202368.0, + "29": 934202368.0, + "30": 934202368.0, + "31": 934202368.0, + "32": 934202368.0, + "33": 934202368.0, + "34": 934202368.0, + "35": 934202368.0, + "36": 934202368.0, + "37": 934202368.0, + "38": 934202368.0, + "39": 934202368.0, + "40": 934202368.0, + "41": 934202368.0, + "42": 934202368.0, + "43": 934202368.0, + "44": 934202368.0, + "45": 934202368.0, + "46": 934202368.0, + "47": 934202368.0, + "48": 934202368.0, + "49": 934202368.0, + "50": 934202368.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.29804, + "2": 0.33247, + "3": 0.3002, + "4": 0.29387, + "5": 0.28202, + "6": 0.28144, + "7": 0.28667, + "8": 0.28202, + "9": 0.28668, + "10": 0.28475, + "11": 0.28037, + "12": 0.28061, + "13": 0.28479, + "14": 0.28709, + "15": 0.28259, + "16": 0.28648, + "17": 0.28752, + "18": 0.28427, + "19": 0.28253, + "20": 0.28216, + "21": 0.28394, + "22": 0.28202, + "23": 0.2842, + "24": 0.28848, + "25": 0.29137, + "26": 0.29314, + "27": 0.29412, + "28": 0.29477, + "29": 0.2847, + "30": 0.29036, + "31": 0.29596, + "32": 0.29187, + "33": 0.2913, + "34": 0.28636, + "35": 0.29547, + "36": 0.29476, + "37": 0.29213, + "38": 0.28835, + "39": 0.28597, + "40": 0.28573, + "41": 0.28673, + "42": 0.28864, + "43": 0.28774, + "44": 0.2871, + "45": 0.28744, + "46": 0.28594, + "47": 0.29182, + "48": 0.28838, + "49": 0.28221, + "50": 0.28369 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..fb8e93ed571 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86285, + "4": 10.84011, + "5": 10.87856, + "6": 10.88852, + "7": 10.86536, + "8": 10.86016, + "9": 10.85989, + "10": 10.82982, + "11": 10.88947, + "12": 10.8751, + "13": 10.87425, + "14": 10.89675, + "15": 10.82051, + "16": 10.82498, + "17": 10.78982, + "18": 10.81029, + "19": 10.80533, + "20": 10.70397, + "21": 10.66991, + "22": 10.50644, + "23": 10.69004, + "24": 10.56312, + "25": 10.49421, + "26": 10.56627, + "27": 10.58027, + "28": 10.51573, + "29": 10.553, + "30": 10.30549, + "31": 10.02248, + "32": 10.40616, + "33": 10.39874, + "34": 10.13771, + "35": 10.20187, + "36": 10.16049, + "37": 10.28975, + "38": 10.11483, + "39": 10.36101, + "40": 10.01902, + "41": 10.07289, + "42": 10.14695, + "43": 9.74689, + "44": 9.87763, + "45": 9.74967, + "46": 9.73381, + "47": 10.07535, + "48": 9.78068, + "49": 9.44781, + "50": 9.8399 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 625.0, + "2": 644.0, + "3": 614.0, + "4": 636.0, + "5": 605.0, + "6": 649.0, + "7": 606.0, + "8": 559.0, + "9": 658.0, + "10": 524.0, + "11": 693.0, + "12": 598.0, + "13": 702.0, + "14": 660.0, + "15": 638.0, + "16": 596.0, + "17": 662.0, + "18": 586.0, + "19": 594.0, + "20": 598.0, + "21": 656.0, + "22": 608.0, + "23": 706.0, + "24": 609.0, + "25": 610.0, + "26": 632.0, + "27": 664.0, + "28": 766.0, + "29": 765.0, + "30": 755.0, + "31": 606.0, + "32": 708.0, + "33": 775.0, + "34": 735.0, + "35": 729.0, + "36": 739.0, + "37": 840.0, + "38": 749.0, + "39": 911.0, + "40": 763.0, + "41": 830.0, + "42": 835.0, + "43": 755.0, + "44": 823.0, + "45": 799.0, + "46": 811.0, + "47": 869.0, + "48": 839.0, + "49": 897.0, + "50": 869.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759898624.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 934202368.0, + "5": 934202368.0, + "6": 934202368.0, + "7": 934202368.0, + "8": 934202368.0, + "9": 934202368.0, + "10": 934202368.0, + "11": 934202368.0, + "12": 934202368.0, + "13": 934202368.0, + "14": 934202368.0, + "15": 934202368.0, + "16": 934202368.0, + "17": 934202368.0, + "18": 934202368.0, + "19": 934202368.0, + "20": 934202368.0, + "21": 934202368.0, + "22": 934202368.0, + "23": 934202368.0, + "24": 934202368.0, + "25": 934202368.0, + "26": 934202368.0, + "27": 934202368.0, + "28": 934202368.0, + "29": 934202368.0, + "30": 934202368.0, + "31": 934202368.0, + "32": 934202368.0, + "33": 934202368.0, + "34": 934202368.0, + "35": 934202368.0, + "36": 934202368.0, + "37": 934202368.0, + "38": 934202368.0, + "39": 934202368.0, + "40": 934202368.0, + "41": 934202368.0, + "42": 934202368.0, + "43": 934202368.0, + "44": 934202368.0, + "45": 934202368.0, + "46": 934202368.0, + "47": 934202368.0, + "48": 934202368.0, + "49": 934202368.0, + "50": 934202368.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.91359, + "2": 0.40136, + "3": 0.32913, + "4": 0.33946, + "5": 0.32404, + "6": 0.31963, + "7": 0.32283, + "8": 0.32302, + "9": 0.32004, + "10": 0.32058, + "11": 0.33128, + "12": 0.32725, + "13": 0.3253, + "14": 0.32532, + "15": 0.32194, + "16": 0.32237, + "17": 0.31946, + "18": 0.31937, + "19": 0.3185, + "20": 0.3193, + "21": 0.32216, + "22": 0.328, + "23": 0.32251, + "24": 0.32294, + "25": 0.32205, + "26": 0.32393, + "27": 0.32132, + "28": 0.32221, + "29": 0.32269, + "30": 0.32422, + "31": 0.32527, + "32": 0.32866, + "33": 0.32346, + "34": 0.32064, + "35": 0.3199, + "36": 0.32198, + "37": 0.32252, + "38": 0.32103, + "39": 0.32486, + "40": 0.32573, + "41": 0.32643, + "42": 0.3234, + "43": 0.32778, + "44": 0.32302, + "45": 0.32434, + "46": 0.32532, + "47": 0.32115, + "48": 0.31979, + "49": 0.3233, + "50": 0.31776 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..ca10e306407 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91347, + "4": 10.90322, + "5": 10.92969, + "6": 10.93655, + "7": 10.90282, + "8": 10.92116, + "9": 10.90706, + "10": 10.90475, + "11": 10.8879, + "12": 10.91737, + "13": 10.9119, + "14": 10.91505, + "15": 10.87123, + "16": 10.86125, + "17": 10.82702, + "18": 10.85679, + "19": 10.84058, + "20": 10.75, + "21": 10.71511, + "22": 10.58115, + "23": 10.72641, + "24": 10.60726, + "25": 10.53753, + "26": 10.61066, + "27": 10.59933, + "28": 10.54955, + "29": 10.566, + "30": 10.32548, + "31": 10.06696, + "32": 10.4381, + "33": 10.4236, + "34": 10.16016, + "35": 10.22896, + "36": 10.17617, + "37": 10.29231, + "38": 10.13293, + "39": 10.34955, + "40": 10.01977, + "41": 10.07533, + "42": 10.1541, + "43": 9.7609, + "44": 9.88356, + "45": 9.75549, + "46": 9.74959, + "47": 10.07543, + "48": 9.7794, + "49": 9.4382, + "50": 9.84069 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 595.0, + "2": 593.0, + "3": 625.0, + "4": 603.0, + "5": 636.0, + "6": 612.0, + "7": 635.0, + "8": 619.0, + "9": 658.0, + "10": 526.0, + "11": 694.0, + "12": 570.0, + "13": 643.0, + "14": 639.0, + "15": 648.0, + "16": 647.0, + "17": 627.0, + "18": 586.0, + "19": 632.0, + "20": 663.0, + "21": 628.0, + "22": 545.0, + "23": 679.0, + "24": 624.0, + "25": 532.0, + "26": 623.0, + "27": 656.0, + "28": 719.0, + "29": 710.0, + "30": 707.0, + "31": 635.0, + "32": 710.0, + "33": 784.0, + "34": 679.0, + "35": 680.0, + "36": 695.0, + "37": 767.0, + "38": 782.0, + "39": 858.0, + "40": 746.0, + "41": 797.0, + "42": 774.0, + "43": 698.0, + "44": 748.0, + "45": 789.0, + "46": 819.0, + "47": 867.0, + "48": 871.0, + "49": 894.0, + "50": 868.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677334528.0, + "2": 854262272.0, + "3": 855309312.0, + "4": 855309312.0, + "5": 855309312.0, + "6": 855309312.0, + "7": 855309312.0, + "8": 855309312.0, + "9": 855309312.0, + "10": 855309312.0, + "11": 855309312.0, + "12": 855309312.0, + "13": 855309312.0, + "14": 855309312.0, + "15": 855309312.0, + "16": 855309312.0, + "17": 855309824.0, + "18": 855309824.0, + "19": 855309824.0, + "20": 855309824.0, + "21": 855309824.0, + "22": 855309824.0, + "23": 855309824.0, + "24": 855309824.0, + "25": 855309824.0, + "26": 855309824.0, + "27": 855309824.0, + "28": 855309824.0, + "29": 855309824.0, + "30": 855309824.0, + "31": 855310848.0, + "32": 855310848.0, + "33": 855310848.0, + "34": 855310848.0, + "35": 855310848.0, + "36": 855310848.0, + "37": 855310848.0, + "38": 855310848.0, + "39": 855310848.0, + "40": 855310848.0, + "41": 855310848.0, + "42": 855310848.0, + "43": 855310848.0, + "44": 855310848.0, + "45": 855310848.0, + "46": 855310848.0, + "47": 855310848.0, + "48": 855310848.0, + "49": 855310848.0, + "50": 855310848.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.39243, + "2": 0.47114, + "3": 0.4118, + "4": 0.4088, + "5": 0.41627, + "6": 0.40803, + "7": 0.41796, + "8": 0.40621, + "9": 0.40868, + "10": 0.41207, + "11": 0.40628, + "12": 0.41887, + "13": 0.40513, + "14": 0.41436, + "15": 0.40824, + "16": 0.40927, + "17": 0.41859, + "18": 0.40493, + "19": 0.41309, + "20": 0.4031, + "21": 0.40742, + "22": 0.41395, + "23": 0.40602, + "24": 0.41635, + "25": 0.40363, + "26": 0.40541, + "27": 0.41468, + "28": 0.40626, + "29": 0.41736, + "30": 0.41505, + "31": 0.42497, + "32": 0.42917, + "33": 0.41862, + "34": 0.40386, + "35": 0.39199, + "36": 0.39203, + "37": 0.4022, + "38": 0.39232, + "39": 0.40413, + "40": 0.39067, + "41": 0.39156, + "42": 0.40281, + "43": 0.3918, + "44": 0.40265, + "45": 0.39137, + "46": 0.39193, + "47": 0.4014, + "48": 0.3911, + "49": 0.40482, + "50": 0.38988 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..de27a6084a7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91344, + "4": 10.9032, + "5": 10.92965, + "6": 10.93658, + "7": 10.90279, + "8": 10.92116, + "9": 10.90707, + "10": 10.90476, + "11": 10.88785, + "12": 10.91736, + "13": 10.91188, + "14": 10.91506, + "15": 10.87121, + "16": 10.86128, + "17": 10.827, + "18": 10.85677, + "19": 10.84058, + "20": 10.74999, + "21": 10.71508, + "22": 10.58119, + "23": 10.72643, + "24": 10.60729, + "25": 10.53754, + "26": 10.61069, + "27": 10.59933, + "28": 10.54956, + "29": 10.56602, + "30": 10.32552, + "31": 10.06695, + "32": 10.43807, + "33": 10.42362, + "34": 10.16012, + "35": 10.22898, + "36": 10.17617, + "37": 10.29237, + "38": 10.13296, + "39": 10.34957, + "40": 10.01974, + "41": 10.07532, + "42": 10.15409, + "43": 9.76091, + "44": 9.88357, + "45": 9.75551, + "46": 9.74958, + "47": 10.07547, + "48": 9.77938, + "49": 9.43818, + "50": 9.84068 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 575.0, + "2": 590.0, + "3": 619.0, + "4": 585.0, + "5": 619.0, + "6": 641.0, + "7": 615.0, + "8": 599.0, + "9": 674.0, + "10": 511.0, + "11": 678.0, + "12": 632.0, + "13": 669.0, + "14": 614.0, + "15": 668.0, + "16": 647.0, + "17": 611.0, + "18": 625.0, + "19": 612.0, + "20": 548.0, + "21": 583.0, + "22": 599.0, + "23": 677.0, + "24": 570.0, + "25": 554.0, + "26": 661.0, + "27": 691.0, + "28": 745.0, + "29": 688.0, + "30": 770.0, + "31": 555.0, + "32": 712.0, + "33": 790.0, + "34": 637.0, + "35": 690.0, + "36": 736.0, + "37": 795.0, + "38": 728.0, + "39": 808.0, + "40": 740.0, + "41": 791.0, + "42": 800.0, + "43": 708.0, + "44": 730.0, + "45": 777.0, + "46": 786.0, + "47": 894.0, + "48": 897.0, + "49": 825.0, + "50": 850.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 677335040.0, + "2": 854262784.0, + "3": 854262784.0, + "4": 854262784.0, + "5": 854262784.0, + "6": 854262784.0, + "7": 854262784.0, + "8": 855310848.0, + "9": 855310848.0, + "10": 855310848.0, + "11": 855310848.0, + "12": 855310848.0, + "13": 855310848.0, + "14": 855310848.0, + "15": 855310848.0, + "16": 855310848.0, + "17": 855311360.0, + "18": 855311360.0, + "19": 855311360.0, + "20": 855311360.0, + "21": 855311360.0, + "22": 855311360.0, + "23": 855311360.0, + "24": 855311360.0, + "25": 855311360.0, + "26": 855311360.0, + "27": 855311360.0, + "28": 855311360.0, + "29": 855311360.0, + "30": 855311360.0, + "31": 855311360.0, + "32": 855311360.0, + "33": 855311360.0, + "34": 855311360.0, + "35": 855311360.0, + "36": 855311360.0, + "37": 855311360.0, + "38": 855311360.0, + "39": 855311360.0, + "40": 855311360.0, + "41": 855311360.0, + "42": 855311360.0, + "43": 855311360.0, + "44": 855311360.0, + "45": 855311360.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.21722, + "2": 0.44346, + "3": 0.4048, + "4": 0.4153, + "5": 0.40403, + "6": 0.40186, + "7": 0.40648, + "8": 0.39996, + "9": 0.41082, + "10": 0.39802, + "11": 0.40029, + "12": 0.4031, + "13": 0.39772, + "14": 0.40795, + "15": 0.39818, + "16": 0.39779, + "17": 0.40587, + "18": 0.3977, + "19": 0.40697, + "20": 0.39617, + "21": 0.39797, + "22": 0.40462, + "23": 0.39629, + "24": 0.41062, + "25": 0.396, + "26": 0.39789, + "27": 0.3983, + "28": 0.39459, + "29": 0.40633, + "30": 0.39484, + "31": 0.3948, + "32": 0.4047, + "33": 0.39655, + "34": 0.40817, + "35": 0.39452, + "36": 0.39485, + "37": 0.40608, + "38": 0.39482, + "39": 0.40667, + "40": 0.39484, + "41": 0.39476, + "42": 0.40733, + "43": 0.39462, + "44": 0.41255, + "45": 0.39333, + "46": 0.39499, + "47": 0.40452, + "48": 0.39484, + "49": 0.40745, + "50": 0.39497 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f2137d28953..2fa70eac521 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, "50": 9.90883 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, "50": 2407.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, "50": 516194816.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1670130688.0, + "2": 1840523776.0, + "3": 1840523776.0, + "4": 1840523776.0, "5": 1840523776.0, + "6": 1840523776.0, + "7": 1840523776.0, + "8": 1840523776.0, + "9": 1840523776.0, "10": 1840523776.0, - "15": 1841310208.0, - "20": 1841310208.0, - "25": 1841310208.0, - "30": 1841310208.0, - "35": 1841310208.0, - "40": 1841310208.0, - "45": 1841310208.0, - "50": 1841310208.0 + "11": 1840523776.0, + "12": 1840523776.0, + "13": 1840523776.0, + "14": 1840523776.0, + "15": 1840523776.0, + "16": 1840523776.0, + "17": 1840523776.0, + "18": 1840523776.0, + "19": 1840523776.0, + "20": 1840523776.0, + "21": 1840523776.0, + "22": 1840523776.0, + "23": 1840523776.0, + "24": 1840523776.0, + "25": 1840523776.0, + "26": 1840523776.0, + "27": 1840523776.0, + "28": 1840523776.0, + "29": 1840523776.0, + "30": 1840523776.0, + "31": 1840523776.0, + "32": 1840523776.0, + "33": 1840523776.0, + "34": 1840523776.0, + "35": 1840523776.0, + "36": 1840523776.0, + "37": 1840523776.0, + "38": 1840523776.0, + "39": 1840523776.0, + "40": 1840523776.0, + "41": 1840523776.0, + "42": 1840523776.0, + "43": 1840523776.0, + "44": 1840523776.0, + "45": 1840523776.0, + "46": 1840523776.0, + "47": 1840523776.0, + "48": 1840523776.0, + "49": 1840523776.0, + "50": 1840523776.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 15.15592, - "5": 0.12534, - "10": 0.11995, - "15": 0.12083, - "20": 0.11947, - "25": 0.11848, - "30": 0.11832, - "35": 0.11938, - "40": 0.12709, - "45": 0.11947, - "50": 0.11811 + "1": 15.46989, + "2": 0.15818, + "3": 0.14336, + "4": 0.14305, + "5": 0.14285, + "6": 0.14415, + "7": 0.14655, + "8": 0.14457, + "9": 0.14518, + "10": 0.14657, + "11": 0.14517, + "12": 0.14486, + "13": 0.14388, + "14": 0.14419, + "15": 0.14463, + "16": 0.146, + "17": 0.14212, + "18": 0.14726, + "19": 0.14464, + "20": 0.14514, + "21": 0.14341, + "22": 0.14454, + "23": 0.14327, + "24": 0.14354, + "25": 0.14453, + "26": 0.14409, + "27": 0.14547, + "28": 0.14291, + "29": 0.14484, + "30": 0.1444, + "31": 0.14388, + "32": 0.14651, + "33": 0.14385, + "34": 0.14057, + "35": 0.14021, + "36": 0.14028, + "37": 0.13912, + "38": 0.13925, + "39": 0.14191, + "40": 0.14024, + "41": 0.14034, + "42": 0.14027, + "43": 0.14125, + "44": 0.14142, + "45": 0.14126, + "46": 0.14404, + "47": 0.1403, + "48": 0.14011, + "49": 0.14086, + "50": 0.13902 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..9a1bfb0707b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, + "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, + "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, + "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, + "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, + "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, + "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, + "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, + "50": 9.90883 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, + "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, + "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, + "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, + "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, + "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, + "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, + "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, + "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, + "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, + "50": 2407.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, + "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, + "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, + "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, + "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, + "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, + "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, + "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, + "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, + "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, + "50": 516194816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1670130688.0, + "2": 1840523776.0, + "3": 1841310208.0, + "4": 1841310208.0, + "5": 1841310208.0, + "6": 1841310208.0, + "7": 1841310208.0, + "8": 1841310208.0, + "9": 1841310208.0, + "10": 1841310208.0, + "11": 1841310208.0, + "12": 1841310208.0, + "13": 1841310208.0, + "14": 1841310208.0, + "15": 1841310208.0, + "16": 1841310208.0, + "17": 1841310208.0, + "18": 1841310208.0, + "19": 1841310208.0, + "20": 1841310208.0, + "21": 1841310208.0, + "22": 1841310208.0, + "23": 1841310208.0, + "24": 1841310208.0, + "25": 1841310208.0, + "26": 1841310208.0, + "27": 1841310208.0, + "28": 1841310208.0, + "29": 1841310208.0, + "30": 1841310208.0, + "31": 1841310208.0, + "32": 1841310208.0, + "33": 1841310208.0, + "34": 1841310208.0, + "35": 1841310208.0, + "36": 1841310208.0, + "37": 1841310208.0, + "38": 1841310208.0, + "39": 1841310208.0, + "40": 1841310208.0, + "41": 1841310208.0, + "42": 1841310208.0, + "43": 1841310208.0, + "44": 1841310208.0, + "45": 1841310208.0, + "46": 1841310208.0, + "47": 1841310208.0, + "48": 1841310208.0, + "49": 1841310208.0, + "50": 1841310208.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.9332, + "2": 0.16326, + "3": 0.12463, + "4": 0.12744, + "5": 0.12912, + "6": 0.12823, + "7": 0.12454, + "8": 0.12362, + "9": 0.12458, + "10": 0.12419, + "11": 0.12352, + "12": 0.12552, + "13": 0.12365, + "14": 0.12466, + "15": 0.12255, + "16": 0.12286, + "17": 0.12294, + "18": 0.12246, + "19": 0.12292, + "20": 0.12533, + "21": 0.12268, + "22": 0.12434, + "23": 0.11979, + "24": 0.11976, + "25": 0.11744, + "26": 0.11555, + "27": 0.11746, + "28": 0.11709, + "29": 0.12764, + "30": 0.11818, + "31": 0.11917, + "32": 0.11662, + "33": 0.11909, + "34": 0.11844, + "35": 0.1167, + "36": 0.12045, + "37": 0.11624, + "38": 0.11602, + "39": 0.11985, + "40": 0.11702, + "41": 0.11671, + "42": 0.11663, + "43": 0.11741, + "44": 0.11703, + "45": 0.11752, + "46": 0.11604, + "47": 0.11836, + "48": 0.12278, + "49": 0.12884, + "50": 0.11659 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..379b1c16f29 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, + "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, + "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, + "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, + "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, + "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, + "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, + "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, + "50": 9.90883 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, + "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, + "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, + "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, + "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, + "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, + "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, + "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, + "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, + "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, + "50": 2407.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, + "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, + "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, + "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, + "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, + "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, + "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, + "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, + "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, + "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, + "50": 516194816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1670130688.0, + "2": 1840523776.0, + "3": 1840523776.0, + "4": 1840523776.0, + "5": 1840523776.0, + "6": 1840523776.0, + "7": 1840523776.0, + "8": 1840523776.0, + "9": 1840523776.0, + "10": 1840523776.0, + "11": 1840523776.0, + "12": 1840523776.0, + "13": 1840523776.0, + "14": 1840523776.0, + "15": 1840523776.0, + "16": 1840523776.0, + "17": 1840523776.0, + "18": 1840523776.0, + "19": 1840523776.0, + "20": 1840523776.0, + "21": 1840523776.0, + "22": 1840523776.0, + "23": 1840523776.0, + "24": 1840523776.0, + "25": 1840523776.0, + "26": 1840523776.0, + "27": 1840523776.0, + "28": 1840523776.0, + "29": 1840523776.0, + "30": 1840523776.0, + "31": 1840523776.0, + "32": 1840523776.0, + "33": 1840523776.0, + "34": 1840523776.0, + "35": 1840523776.0, + "36": 1840523776.0, + "37": 1840523776.0, + "38": 1840523776.0, + "39": 1840523776.0, + "40": 1840523776.0, + "41": 1840523776.0, + "42": 1840523776.0, + "43": 1840523776.0, + "44": 1840523776.0, + "45": 1840523776.0, + "46": 1840523776.0, + "47": 1840523776.0, + "48": 1840523776.0, + "49": 1840523776.0, + "50": 1840523776.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.2683, + "2": 0.15358, + "3": 0.13619, + "4": 0.13976, + "5": 0.13713, + "6": 0.13753, + "7": 0.13575, + "8": 0.13485, + "9": 0.13779, + "10": 0.13697, + "11": 0.14178, + "12": 0.1397, + "13": 0.13744, + "14": 0.14039, + "15": 0.13739, + "16": 0.1361, + "17": 0.13816, + "18": 0.13722, + "19": 0.15342, + "20": 0.14613, + "21": 0.14806, + "22": 0.14423, + "23": 0.14791, + "24": 0.14345, + "25": 0.14474, + "26": 0.14564, + "27": 0.14168, + "28": 0.14148, + "29": 0.13863, + "30": 0.13751, + "31": 0.14015, + "32": 0.13821, + "33": 0.14038, + "34": 0.13859, + "35": 0.14531, + "36": 0.14468, + "37": 0.13783, + "38": 0.13787, + "39": 0.13879, + "40": 0.14072, + "41": 0.14065, + "42": 0.13865, + "43": 0.13953, + "44": 0.13882, + "45": 0.13622, + "46": 0.14034, + "47": 0.13659, + "48": 0.14369, + "49": 0.13987, + "50": 0.13803 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..d381ff1bd8e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91514, + "4": 10.90899, + "5": 10.92719, + "6": 10.9356, + "7": 10.90644, + "8": 10.92124, + "9": 10.91072, + "10": 10.9079, + "11": 10.89279, + "12": 10.9243, + "13": 10.91492, + "14": 10.9214, + "15": 10.88295, + "16": 10.87305, + "17": 10.84065, + "18": 10.87298, + "19": 10.85634, + "20": 10.77595, + "21": 10.74894, + "22": 10.63082, + "23": 10.75618, + "24": 10.65648, + "25": 10.59261, + "26": 10.65439, + "27": 10.64911, + "28": 10.59499, + "29": 10.60946, + "30": 10.39175, + "31": 10.1572, + "32": 10.49109, + "33": 10.47964, + "34": 10.24073, + "35": 10.29696, + "36": 10.2467, + "37": 10.35242, + "38": 10.20484, + "39": 10.40504, + "40": 10.09662, + "41": 10.15197, + "42": 10.22064, + "43": 9.85509, + "44": 9.96162, + "45": 9.84469, + "46": 9.83833, + "47": 10.14003, + "48": 9.85758, + "49": 9.53744, + "50": 9.90944 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1594.0, + "2": 1834.0, + "3": 1682.0, + "4": 1736.0, + "5": 1923.0, + "6": 1815.0, + "7": 1879.0, + "8": 1755.0, + "9": 1905.0, + "10": 1370.0, + "11": 1981.0, + "12": 1780.0, + "13": 2007.0, + "14": 1848.0, + "15": 1887.0, + "16": 1753.0, + "17": 1859.0, + "18": 1752.0, + "19": 1820.0, + "20": 1591.0, + "21": 1835.0, + "22": 1655.0, + "23": 1972.0, + "24": 1667.0, + "25": 1655.0, + "26": 1798.0, + "27": 1853.0, + "28": 1993.0, + "29": 1998.0, + "30": 1946.0, + "31": 1613.0, + "32": 1954.0, + "33": 2212.0, + "34": 1965.0, + "35": 1940.0, + "36": 1954.0, + "37": 2289.0, + "38": 2173.0, + "39": 2478.0, + "40": 2097.0, + "41": 2342.0, + "42": 2362.0, + "43": 1952.0, + "44": 2105.0, + "45": 2063.0, + "46": 2234.0, + "47": 2444.0, + "48": 2395.0, + "49": 2316.0, + "50": 2445.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 438468608.0, + "2": 438468608.0, + "3": 438468608.0, + "4": 438468608.0, + "5": 438468608.0, + "6": 438468608.0, + "7": 438468608.0, + "8": 438468608.0, + "9": 438468608.0, + "10": 438468608.0, + "11": 438468608.0, + "12": 438468608.0, + "13": 438468608.0, + "14": 438468608.0, + "15": 438468608.0, + "16": 438468608.0, + "17": 438468608.0, + "18": 438468608.0, + "19": 438468608.0, + "20": 438468608.0, + "21": 438468608.0, + "22": 438468608.0, + "23": 438468608.0, + "24": 438468608.0, + "25": 438468608.0, + "26": 438468608.0, + "27": 438468608.0, + "28": 438468608.0, + "29": 438468608.0, + "30": 438468608.0, + "31": 438468608.0, + "32": 438468608.0, + "33": 438468608.0, + "34": 438468608.0, + "35": 438468608.0, + "36": 438468608.0, + "37": 438468608.0, + "38": 438468608.0, + "39": 438468608.0, + "40": 438468608.0, + "41": 438468608.0, + "42": 438468608.0, + "43": 438468608.0, + "44": 438468608.0, + "45": 438468608.0, + "46": 438468608.0, + "47": 438468608.0, + "48": 438468608.0, + "49": 438468608.0, + "50": 438468608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2658189824.0, + "2": 2658189824.0, + "3": 2658189824.0, + "4": 2658189824.0, + "5": 2658189824.0, + "6": 2658189824.0, + "7": 2658189824.0, + "8": 2658189824.0, + "9": 2658189824.0, + "10": 2658189824.0, + "11": 2658189824.0, + "12": 2658189824.0, + "13": 2658189824.0, + "14": 2658189824.0, + "15": 2658189824.0, + "16": 2658189824.0, + "17": 2658189824.0, + "18": 2658189824.0, + "19": 2658189824.0, + "20": 2658189824.0, + "21": 2658189824.0, + "22": 2658189824.0, + "23": 2658189824.0, + "24": 2658189824.0, + "25": 2658189824.0, + "26": 2658189824.0, + "27": 2658189824.0, + "28": 2658189824.0, + "29": 2658189824.0, + "30": 2658189824.0, + "31": 2658189824.0, + "32": 2658189824.0, + "33": 2658189824.0, + "34": 2658189824.0, + "35": 2658189824.0, + "36": 2658189824.0, + "37": 2658189824.0, + "38": 2658189824.0, + "39": 2658189824.0, + "40": 2658189824.0, + "41": 2658189824.0, + "42": 2658189824.0, + "43": 2658189824.0, + "44": 2658189824.0, + "45": 2658189824.0, + "46": 2658189824.0, + "47": 2658189824.0, + "48": 2658189824.0, + "49": 2658189824.0, + "50": 2658189824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.89692, + "2": 0.22636, + "3": 0.19282, + "4": 0.19102, + "5": 0.18966, + "6": 0.19089, + "7": 0.18785, + "8": 0.19603, + "9": 0.20181, + "10": 0.20496, + "11": 0.21259, + "12": 0.22807, + "13": 0.20894, + "14": 0.23285, + "15": 0.21589, + "16": 0.21307, + "17": 0.2066, + "18": 0.20281, + "19": 0.20035, + "20": 0.21165, + "21": 0.21499, + "22": 0.20787, + "23": 0.20796, + "24": 0.20107, + "25": 0.20655, + "26": 0.19066, + "27": 0.19278, + "28": 0.18972, + "29": 0.18934, + "30": 0.18911, + "31": 0.18621, + "32": 0.18488, + "33": 0.18787, + "34": 0.18483, + "35": 0.18634, + "36": 0.18614, + "37": 0.18598, + "38": 0.19035, + "39": 0.1965, + "40": 0.22208, + "41": 0.21118, + "42": 0.21696, + "43": 0.2487, + "44": 0.25093, + "45": 0.25052, + "46": 0.23122, + "47": 0.23444, + "48": 0.23094, + "49": 0.23714, + "50": 0.41655 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..7c826222075 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91514, + "4": 10.90898, + "5": 10.92718, + "6": 10.9356, + "7": 10.90644, + "8": 10.9212, + "9": 10.91072, + "10": 10.90791, + "11": 10.89277, + "12": 10.92427, + "13": 10.91491, + "14": 10.92144, + "15": 10.88294, + "16": 10.8731, + "17": 10.84065, + "18": 10.87301, + "19": 10.85632, + "20": 10.77595, + "21": 10.74892, + "22": 10.63083, + "23": 10.75616, + "24": 10.65644, + "25": 10.59263, + "26": 10.65439, + "27": 10.64917, + "28": 10.59496, + "29": 10.60945, + "30": 10.39175, + "31": 10.15721, + "32": 10.49112, + "33": 10.4796, + "34": 10.24073, + "35": 10.297, + "36": 10.24673, + "37": 10.35244, + "38": 10.20481, + "39": 10.40504, + "40": 10.09662, + "41": 10.15197, + "42": 10.22065, + "43": 9.85507, + "44": 9.96161, + "45": 9.84469, + "46": 9.83836, + "47": 10.14002, + "48": 9.85758, + "49": 9.53747, + "50": 9.90948 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1610.0, + "2": 1834.0, + "3": 1691.0, + "4": 1762.0, + "5": 1930.0, + "6": 1842.0, + "7": 1885.0, + "8": 1832.0, + "9": 1917.0, + "10": 1419.0, + "11": 1991.0, + "12": 1756.0, + "13": 2014.0, + "14": 1811.0, + "15": 1937.0, + "16": 1771.0, + "17": 1873.0, + "18": 1717.0, + "19": 1721.0, + "20": 1631.0, + "21": 1842.0, + "22": 1808.0, + "23": 1932.0, + "24": 1572.0, + "25": 1667.0, + "26": 1818.0, + "27": 1928.0, + "28": 2063.0, + "29": 2105.0, + "30": 1908.0, + "31": 1554.0, + "32": 1943.0, + "33": 2262.0, + "34": 1908.0, + "35": 1939.0, + "36": 2027.0, + "37": 2400.0, + "38": 2269.0, + "39": 2458.0, + "40": 2109.0, + "41": 2257.0, + "42": 2224.0, + "43": 2059.0, + "44": 2118.0, + "45": 2090.0, + "46": 2409.0, + "47": 2607.0, + "48": 2457.0, + "49": 2239.0, + "50": 2412.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 436764672.0, + "2": 436764672.0, + "3": 436764672.0, + "4": 436764672.0, + "5": 436764672.0, + "6": 436764672.0, + "7": 436764672.0, + "8": 436764672.0, + "9": 436764672.0, + "10": 436764672.0, + "11": 436764672.0, + "12": 436764672.0, + "13": 436764672.0, + "14": 436764672.0, + "15": 436764672.0, + "16": 436764672.0, + "17": 436764672.0, + "18": 436764672.0, + "19": 436764672.0, + "20": 436764672.0, + "21": 436764672.0, + "22": 436764672.0, + "23": 436764672.0, + "24": 436764672.0, + "25": 436764672.0, + "26": 436764672.0, + "27": 436764672.0, + "28": 436764672.0, + "29": 436764672.0, + "30": 436764672.0, + "31": 436764672.0, + "32": 436764672.0, + "33": 436764672.0, + "34": 436764672.0, + "35": 436764672.0, + "36": 436764672.0, + "37": 436764672.0, + "38": 436764672.0, + "39": 436764672.0, + "40": 436764672.0, + "41": 436764672.0, + "42": 436764672.0, + "43": 436764672.0, + "44": 436764672.0, + "45": 436764672.0, + "46": 436764672.0, + "47": 436764672.0, + "48": 436764672.0, + "49": 436764672.0, + "50": 436764672.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2658189824.0, + "2": 2658189824.0, + "3": 2658189824.0, + "4": 2658189824.0, + "5": 2658189824.0, + "6": 2658189824.0, + "7": 2658189824.0, + "8": 2658189824.0, + "9": 2658189824.0, + "10": 2658189824.0, + "11": 2658189824.0, + "12": 2658189824.0, + "13": 2658189824.0, + "14": 2658189824.0, + "15": 2658189824.0, + "16": 2658189824.0, + "17": 2658189824.0, + "18": 2658189824.0, + "19": 2658189824.0, + "20": 2658189824.0, + "21": 2658189824.0, + "22": 2658189824.0, + "23": 2658189824.0, + "24": 2658189824.0, + "25": 2658189824.0, + "26": 2658189824.0, + "27": 2658189824.0, + "28": 2658189824.0, + "29": 2658189824.0, + "30": 2658189824.0, + "31": 2658189824.0, + "32": 2658189824.0, + "33": 2658189824.0, + "34": 2658189824.0, + "35": 2658189824.0, + "36": 2658189824.0, + "37": 2658189824.0, + "38": 2658189824.0, + "39": 2658189824.0, + "40": 2658189824.0, + "41": 2658189824.0, + "42": 2658189824.0, + "43": 2658189824.0, + "44": 2658189824.0, + "45": 2658189824.0, + "46": 2658189824.0, + "47": 2658189824.0, + "48": 2658189824.0, + "49": 2658189824.0, + "50": 2658189824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.07715, + "2": 0.23504, + "3": 0.18606, + "4": 0.186, + "5": 0.18473, + "6": 0.18533, + "7": 0.18715, + "8": 0.18676, + "9": 0.18665, + "10": 0.18428, + "11": 0.18511, + "12": 0.18619, + "13": 0.18461, + "14": 0.18647, + "15": 0.18581, + "16": 0.18608, + "17": 0.18299, + "18": 0.18471, + "19": 0.18333, + "20": 0.18288, + "21": 0.18432, + "22": 0.1817, + "23": 0.18526, + "24": 0.18337, + "25": 0.18381, + "26": 0.18253, + "27": 0.18309, + "28": 0.18721, + "29": 0.18268, + "30": 0.1853, + "31": 0.18365, + "32": 0.18239, + "33": 0.18174, + "34": 0.1823, + "35": 0.18255, + "36": 0.18445, + "37": 0.18019, + "38": 0.18127, + "39": 0.18126, + "40": 0.18097, + "41": 0.18271, + "42": 0.18269, + "43": 0.182, + "44": 0.18282, + "45": 0.18347, + "46": 0.18363, + "47": 0.18571, + "48": 0.18216, + "49": 0.18221, + "50": 0.18026 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f96b534490d..bac5baf3a43 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.92337, + "2": 10.91811, + "3": 10.91506, + "4": 10.92436, "5": 10.92089, + "6": 10.92887, + "7": 10.92681, + "8": 10.91989, + "9": 10.92227, "10": 10.92192, + "11": 10.918, + "12": 10.9238, + "13": 10.92406, + "14": 10.90862, "15": 10.92351, + "16": 10.91807, + "17": 10.9154, + "18": 10.91265, + "19": 10.9091, "20": 10.90031, + "21": 10.8959, + "22": 10.8828, + "23": 10.89975, + "24": 10.88437, "25": 10.87827, + "26": 10.88155, + "27": 10.88649, + "28": 10.85679, + "29": 10.85657, "30": 10.81423, + "31": 10.76651, + "32": 10.83131, + "33": 10.83158, + "34": 10.78071, "35": 10.78865, + "36": 10.78003, + "37": 10.80446, + "38": 10.72434, + "39": 10.78066, "40": 10.65927, + "41": 10.69208, + "42": 10.70973, + "43": 10.56128, + "44": 10.61369, "45": 10.56875, + "46": 10.54455, + "47": 10.66751, + "48": 10.53792, + "49": 10.40861, "50": 10.55421 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 22791636.0, + "2": 22989424.0, + "3": 22661212.0, + "4": 23283558.0, "5": 22778528.0, + "6": 23085340.0, + "7": 22834596.0, + "8": 22990452.0, + "9": 22906466.0, "10": 22983232.0, + "11": 22564584.0, + "12": 22524010.0, + "13": 22981124.0, + "14": 22453096.0, "15": 22886400.0, + "16": 22895424.0, + "17": 22883736.0, + "18": 22647090.0, + "19": 22682526.0, "20": 22758358.0, + "21": 22804276.0, + "22": 22863814.0, + "23": 22603616.0, + "24": 22835172.0, "25": 22883742.0, + "26": 22611358.0, + "27": 22532968.0, + "28": 22517794.0, + "29": 22593448.0, "30": 22695256.0, + "31": 23019472.0, + "32": 22648896.0, + "33": 22622516.0, + "34": 22899620.0, "35": 22851572.0, + "36": 22653160.0, + "37": 22560476.0, + "38": 22960058.0, + "39": 22865476.0, "40": 22721680.0, + "41": 22723112.0, + "42": 22730726.0, + "43": 23039588.0, + "44": 22810020.0, "45": 22738904.0, + "46": 22948334.0, + "47": 22696668.0, + "48": 22992832.0, + "49": 22791208.0, "50": 22968272.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 387744256.0, + "2": 387744256.0, + "3": 387744256.0, + "4": 387744256.0, "5": 387744256.0, + "6": 387744256.0, + "7": 387744256.0, + "8": 387744256.0, + "9": 387744256.0, "10": 387744256.0, + "11": 387744256.0, + "12": 387744256.0, + "13": 387744256.0, + "14": 387744256.0, "15": 387744256.0, + "16": 387744256.0, + "17": 387744256.0, + "18": 387744256.0, + "19": 387744256.0, "20": 387744256.0, + "21": 387744256.0, + "22": 387744256.0, + "23": 387744256.0, + "24": 387744256.0, "25": 387744256.0, + "26": 387744256.0, + "27": 387744256.0, + "28": 387744256.0, + "29": 387744256.0, "30": 387744256.0, + "31": 387744256.0, + "32": 387744256.0, + "33": 387744256.0, + "34": 387744256.0, "35": 387744256.0, + "36": 387744256.0, + "37": 387744256.0, + "38": 387744256.0, + "39": 387744256.0, "40": 387744256.0, + "41": 387744256.0, + "42": 387744256.0, + "43": 387744256.0, + "44": 387744256.0, "45": 387744256.0, + "46": 387744256.0, + "47": 387744256.0, + "48": 387744256.0, + "49": 387744256.0, "50": 387744256.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1122646528.0, + "2": 1245896192.0, + "3": 1245896192.0, + "4": 1245896192.0, "5": 1245896192.0, + "6": 1245896192.0, + "7": 1245896192.0, + "8": 1245896192.0, + "9": 1245896192.0, "10": 1245896192.0, + "11": 1245896192.0, + "12": 1245896192.0, + "13": 1245896192.0, + "14": 1245896192.0, "15": 1245896192.0, + "16": 1245896192.0, + "17": 1245896192.0, + "18": 1245896192.0, + "19": 1245896192.0, "20": 1245896192.0, + "21": 1245896192.0, + "22": 1245896192.0, + "23": 1245896192.0, + "24": 1245896192.0, "25": 1245896192.0, + "26": 1245896192.0, + "27": 1245896192.0, + "28": 1245896192.0, + "29": 1245896192.0, "30": 1245896192.0, + "31": 1245896192.0, + "32": 1245896192.0, + "33": 1245896192.0, + "34": 1245896192.0, "35": 1245896192.0, + "36": 1245896192.0, + "37": 1245896192.0, + "38": 1245896192.0, + "39": 1245896192.0, "40": 1245896192.0, + "41": 1245896192.0, + "42": 1245896192.0, + "43": 1245896192.0, + "44": 1245896192.0, "45": 1245896192.0, + "46": 1245896192.0, + "47": 1245896192.0, + "48": 1245896192.0, + "49": 1245896192.0, "50": 1245896192.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.91153, - "5": 0.10105, - "10": 0.09991, - "15": 0.09967, - "20": 0.10034, - "25": 0.10389, - "30": 0.10155, - "35": 0.11161, - "40": 0.10351, - "45": 0.10165, - "50": 0.10213 + "1": 10.00615, + "2": 0.13355, + "3": 0.1156, + "4": 0.11748, + "5": 0.11709, + "6": 0.11516, + "7": 0.11746, + "8": 0.11799, + "9": 0.11829, + "10": 0.11844, + "11": 0.11847, + "12": 0.12334, + "13": 0.12621, + "14": 0.1244, + "15": 0.11572, + "16": 0.11683, + "17": 0.11639, + "18": 0.11916, + "19": 0.1174, + "20": 0.11558, + "21": 0.11518, + "22": 0.1165, + "23": 0.11972, + "24": 0.12052, + "25": 0.11938, + "26": 0.125, + "27": 0.11874, + "28": 0.11938, + "29": 0.11733, + "30": 0.11731, + "31": 0.11777, + "32": 0.11704, + "33": 0.121, + "34": 0.12101, + "35": 0.11619, + "36": 0.11824, + "37": 0.11821, + "38": 0.11953, + "39": 0.11906, + "40": 0.118, + "41": 0.11938, + "42": 0.11873, + "43": 0.11887, + "44": 0.11808, + "45": 0.11848, + "46": 0.12012, + "47": 0.11741, + "48": 0.11744, + "49": 0.11829, + "50": 0.11954 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..5f5b4095502 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.92337, + "2": 10.91811, + "3": 10.91506, + "4": 10.92436, + "5": 10.92089, + "6": 10.92887, + "7": 10.92681, + "8": 10.91989, + "9": 10.92227, + "10": 10.92192, + "11": 10.918, + "12": 10.9238, + "13": 10.92406, + "14": 10.90862, + "15": 10.92351, + "16": 10.91807, + "17": 10.9154, + "18": 10.91265, + "19": 10.9091, + "20": 10.90031, + "21": 10.8959, + "22": 10.8828, + "23": 10.89975, + "24": 10.88437, + "25": 10.87827, + "26": 10.88155, + "27": 10.88649, + "28": 10.85679, + "29": 10.85657, + "30": 10.81423, + "31": 10.76651, + "32": 10.83131, + "33": 10.83158, + "34": 10.78071, + "35": 10.78865, + "36": 10.78003, + "37": 10.80446, + "38": 10.72434, + "39": 10.78066, + "40": 10.65927, + "41": 10.69208, + "42": 10.70973, + "43": 10.56128, + "44": 10.61369, + "45": 10.56875, + "46": 10.54455, + "47": 10.66751, + "48": 10.53792, + "49": 10.40861, + "50": 10.55421 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22791636.0, + "2": 22989424.0, + "3": 22661212.0, + "4": 23283558.0, + "5": 22778528.0, + "6": 23085340.0, + "7": 22834596.0, + "8": 22990452.0, + "9": 22906466.0, + "10": 22983232.0, + "11": 22564584.0, + "12": 22524010.0, + "13": 22981124.0, + "14": 22453096.0, + "15": 22886400.0, + "16": 22895424.0, + "17": 22883736.0, + "18": 22647090.0, + "19": 22682526.0, + "20": 22758358.0, + "21": 22804276.0, + "22": 22863814.0, + "23": 22603616.0, + "24": 22835172.0, + "25": 22883742.0, + "26": 22611358.0, + "27": 22532968.0, + "28": 22517794.0, + "29": 22593448.0, + "30": 22695256.0, + "31": 23019472.0, + "32": 22648896.0, + "33": 22622516.0, + "34": 22899620.0, + "35": 22851572.0, + "36": 22653160.0, + "37": 22560476.0, + "38": 22960058.0, + "39": 22865476.0, + "40": 22721680.0, + "41": 22723112.0, + "42": 22730726.0, + "43": 23039588.0, + "44": 22810020.0, + "45": 22738904.0, + "46": 22948334.0, + "47": 22696668.0, + "48": 22992832.0, + "49": 22791208.0, + "50": 22968272.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 387744256.0, + "2": 387744256.0, + "3": 387744256.0, + "4": 387744256.0, + "5": 387744256.0, + "6": 387744256.0, + "7": 387744256.0, + "8": 387744256.0, + "9": 387744256.0, + "10": 387744256.0, + "11": 387744256.0, + "12": 387744256.0, + "13": 387744256.0, + "14": 387744256.0, + "15": 387744256.0, + "16": 387744256.0, + "17": 387744256.0, + "18": 387744256.0, + "19": 387744256.0, + "20": 387744256.0, + "21": 387744256.0, + "22": 387744256.0, + "23": 387744256.0, + "24": 387744256.0, + "25": 387744256.0, + "26": 387744256.0, + "27": 387744256.0, + "28": 387744256.0, + "29": 387744256.0, + "30": 387744256.0, + "31": 387744256.0, + "32": 387744256.0, + "33": 387744256.0, + "34": 387744256.0, + "35": 387744256.0, + "36": 387744256.0, + "37": 387744256.0, + "38": 387744256.0, + "39": 387744256.0, + "40": 387744256.0, + "41": 387744256.0, + "42": 387744256.0, + "43": 387744256.0, + "44": 387744256.0, + "45": 387744256.0, + "46": 387744256.0, + "47": 387744256.0, + "48": 387744256.0, + "49": 387744256.0, + "50": 387744256.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1122646528.0, + "2": 1245896192.0, + "3": 1245896192.0, + "4": 1245896192.0, + "5": 1245896192.0, + "6": 1245896192.0, + "7": 1245896192.0, + "8": 1245896192.0, + "9": 1245896192.0, + "10": 1245896192.0, + "11": 1245896192.0, + "12": 1245896192.0, + "13": 1245896192.0, + "14": 1245896192.0, + "15": 1245896192.0, + "16": 1245896192.0, + "17": 1245896192.0, + "18": 1245896192.0, + "19": 1245896192.0, + "20": 1245896192.0, + "21": 1245896192.0, + "22": 1245896192.0, + "23": 1245896192.0, + "24": 1245896192.0, + "25": 1245896192.0, + "26": 1245896192.0, + "27": 1245896192.0, + "28": 1245896192.0, + "29": 1245896192.0, + "30": 1245896192.0, + "31": 1245896192.0, + "32": 1245896192.0, + "33": 1245896192.0, + "34": 1245896192.0, + "35": 1245896192.0, + "36": 1245896192.0, + "37": 1245896192.0, + "38": 1245896192.0, + "39": 1245896192.0, + "40": 1245896192.0, + "41": 1245896192.0, + "42": 1245896192.0, + "43": 1245896192.0, + "44": 1245896192.0, + "45": 1245896192.0, + "46": 1245896192.0, + "47": 1245896192.0, + "48": 1245896192.0, + "49": 1245896192.0, + "50": 1245896192.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.48646, + "2": 0.13915, + "3": 0.11332, + "4": 0.11062, + "5": 0.10601, + "6": 0.10405, + "7": 0.10505, + "8": 0.10406, + "9": 0.10505, + "10": 0.10412, + "11": 0.1027, + "12": 0.10452, + "13": 0.10273, + "14": 0.10271, + "15": 0.10391, + "16": 0.10227, + "17": 0.10295, + "18": 0.10375, + "19": 0.10202, + "20": 0.10246, + "21": 0.10149, + "22": 0.1037, + "23": 0.10264, + "24": 0.10318, + "25": 0.10409, + "26": 0.11044, + "27": 0.10485, + "28": 0.10691, + "29": 0.10499, + "30": 0.10361, + "31": 0.10501, + "32": 0.10466, + "33": 0.1048, + "34": 0.10456, + "35": 0.10388, + "36": 0.10498, + "37": 0.10375, + "38": 0.10297, + "39": 0.10174, + "40": 0.10044, + "41": 0.10196, + "42": 0.10494, + "43": 0.10303, + "44": 0.10254, + "45": 0.10314, + "46": 0.10306, + "47": 0.10329, + "48": 0.10445, + "49": 0.10543, + "50": 0.1043 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d0103111a28 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.92337, + "2": 10.91811, + "3": 10.91506, + "4": 10.92436, + "5": 10.92089, + "6": 10.92887, + "7": 10.92681, + "8": 10.91989, + "9": 10.92227, + "10": 10.92192, + "11": 10.918, + "12": 10.9238, + "13": 10.92406, + "14": 10.90862, + "15": 10.92351, + "16": 10.91807, + "17": 10.9154, + "18": 10.91265, + "19": 10.9091, + "20": 10.90031, + "21": 10.8959, + "22": 10.8828, + "23": 10.89975, + "24": 10.88437, + "25": 10.87827, + "26": 10.88155, + "27": 10.88649, + "28": 10.85679, + "29": 10.85657, + "30": 10.81423, + "31": 10.76651, + "32": 10.83131, + "33": 10.83158, + "34": 10.78071, + "35": 10.78865, + "36": 10.78003, + "37": 10.80446, + "38": 10.72434, + "39": 10.78066, + "40": 10.65927, + "41": 10.69208, + "42": 10.70973, + "43": 10.56128, + "44": 10.61369, + "45": 10.56875, + "46": 10.54455, + "47": 10.66751, + "48": 10.53792, + "49": 10.40861, + "50": 10.55421 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22791636.0, + "2": 22989424.0, + "3": 22661212.0, + "4": 23283558.0, + "5": 22778528.0, + "6": 23085340.0, + "7": 22834596.0, + "8": 22990452.0, + "9": 22906466.0, + "10": 22983232.0, + "11": 22564584.0, + "12": 22524010.0, + "13": 22981124.0, + "14": 22453096.0, + "15": 22886400.0, + "16": 22895424.0, + "17": 22883736.0, + "18": 22647090.0, + "19": 22682526.0, + "20": 22758358.0, + "21": 22804276.0, + "22": 22863814.0, + "23": 22603616.0, + "24": 22835172.0, + "25": 22883742.0, + "26": 22611358.0, + "27": 22532968.0, + "28": 22517794.0, + "29": 22593448.0, + "30": 22695256.0, + "31": 23019472.0, + "32": 22648896.0, + "33": 22622516.0, + "34": 22899620.0, + "35": 22851572.0, + "36": 22653160.0, + "37": 22560476.0, + "38": 22960058.0, + "39": 22865476.0, + "40": 22721680.0, + "41": 22723112.0, + "42": 22730726.0, + "43": 23039588.0, + "44": 22810020.0, + "45": 22738904.0, + "46": 22948334.0, + "47": 22696668.0, + "48": 22992832.0, + "49": 22791208.0, + "50": 22968272.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 387744256.0, + "2": 387744256.0, + "3": 387744256.0, + "4": 387744256.0, + "5": 387744256.0, + "6": 387744256.0, + "7": 387744256.0, + "8": 387744256.0, + "9": 387744256.0, + "10": 387744256.0, + "11": 387744256.0, + "12": 387744256.0, + "13": 387744256.0, + "14": 387744256.0, + "15": 387744256.0, + "16": 387744256.0, + "17": 387744256.0, + "18": 387744256.0, + "19": 387744256.0, + "20": 387744256.0, + "21": 387744256.0, + "22": 387744256.0, + "23": 387744256.0, + "24": 387744256.0, + "25": 387744256.0, + "26": 387744256.0, + "27": 387744256.0, + "28": 387744256.0, + "29": 387744256.0, + "30": 387744256.0, + "31": 387744256.0, + "32": 387744256.0, + "33": 387744256.0, + "34": 387744256.0, + "35": 387744256.0, + "36": 387744256.0, + "37": 387744256.0, + "38": 387744256.0, + "39": 387744256.0, + "40": 387744256.0, + "41": 387744256.0, + "42": 387744256.0, + "43": 387744256.0, + "44": 387744256.0, + "45": 387744256.0, + "46": 387744256.0, + "47": 387744256.0, + "48": 387744256.0, + "49": 387744256.0, + "50": 387744256.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1122646528.0, + "2": 1245896192.0, + "3": 1245896192.0, + "4": 1245896192.0, + "5": 1245896192.0, + "6": 1245896192.0, + "7": 1245896192.0, + "8": 1245896192.0, + "9": 1245896192.0, + "10": 1245896192.0, + "11": 1245896192.0, + "12": 1245896192.0, + "13": 1245896192.0, + "14": 1245896192.0, + "15": 1245896192.0, + "16": 1245896192.0, + "17": 1245896192.0, + "18": 1245896192.0, + "19": 1245896192.0, + "20": 1245896192.0, + "21": 1245896192.0, + "22": 1245896192.0, + "23": 1245896192.0, + "24": 1245896192.0, + "25": 1245896192.0, + "26": 1245896192.0, + "27": 1245896192.0, + "28": 1245896192.0, + "29": 1245896192.0, + "30": 1245896192.0, + "31": 1245896192.0, + "32": 1245896192.0, + "33": 1245896192.0, + "34": 1245896192.0, + "35": 1245896192.0, + "36": 1245896192.0, + "37": 1245896192.0, + "38": 1245896192.0, + "39": 1245896192.0, + "40": 1245896192.0, + "41": 1245896192.0, + "42": 1245896192.0, + "43": 1245896192.0, + "44": 1245896192.0, + "45": 1245896192.0, + "46": 1245896192.0, + "47": 1245896192.0, + "48": 1245896192.0, + "49": 1245896192.0, + "50": 1245896192.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.86323, + "2": 0.13474, + "3": 0.1236, + "4": 0.12168, + "5": 0.12406, + "6": 0.12501, + "7": 0.12711, + "8": 0.12778, + "9": 0.12839, + "10": 0.12143, + "11": 0.12109, + "12": 0.12077, + "13": 0.11905, + "14": 0.12184, + "15": 0.12152, + "16": 0.11812, + "17": 0.11693, + "18": 0.11549, + "19": 0.11712, + "20": 0.11675, + "21": 0.11877, + "22": 0.11837, + "23": 0.11757, + "24": 0.11636, + "25": 0.11722, + "26": 0.12393, + "27": 0.11736, + "28": 0.11759, + "29": 0.11945, + "30": 0.11726, + "31": 0.12096, + "32": 0.12206, + "33": 0.11734, + "34": 0.11894, + "35": 0.11695, + "36": 0.11712, + "37": 0.11489, + "38": 0.11866, + "39": 0.11749, + "40": 0.11829, + "41": 0.11674, + "42": 0.1181, + "43": 0.11808, + "44": 0.11621, + "45": 0.11832, + "46": 0.12031, + "47": 0.12023, + "48": 0.11643, + "49": 0.11855, + "50": 0.11792 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 92e4f61f204..4fc4344a2e0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.86535, - "5": 10.87857, - "10": 10.8298, - "15": 10.82054, - "20": 10.70396, - "25": 10.49423, - "30": 10.30551, - "35": 10.20189, - "40": 10.01906, + "2": 10.85873, + "3": 10.86284, + "4": 10.84005, + "5": 10.87854, + "6": 10.8885, + "7": 10.86534, + "8": 10.86017, + "9": 10.85988, + "10": 10.82978, + "11": 10.88948, + "12": 10.8751, + "13": 10.87424, + "14": 10.89677, + "15": 10.82052, + "16": 10.82497, + "17": 10.78983, + "18": 10.81028, + "19": 10.80533, + "20": 10.70398, + "21": 10.66993, + "22": 10.50641, + "23": 10.69004, + "24": 10.56313, + "25": 10.49419, + "26": 10.56627, + "27": 10.58027, + "28": 10.51571, + "29": 10.55294, + "30": 10.3055, + "31": 10.02244, + "32": 10.40616, + "33": 10.39877, + "34": 10.13771, + "35": 10.20185, + "36": 10.16052, + "37": 10.28974, + "38": 10.11478, + "39": 10.36102, + "40": 10.01901, + "41": 10.07288, + "42": 10.14698, + "43": 9.74686, + "44": 9.87764, "45": 9.74965, - "50": 9.83991, - "55": 9.81661, - "60": 9.43542, - "65": 8.87157, + "46": 9.73383, + "47": 10.07534, + "48": 9.78068, + "49": 9.4478, + "50": 9.8399, + "51": 9.78024, + "52": 9.67265, + "53": 10.02013, + "54": 9.8979, + "55": 9.81663, + "56": 9.56041, + "57": 9.4118, + "58": 9.77417, + "59": 9.51799, + "60": 9.43538, + "61": 9.64483, + "62": 9.93002, + "63": 9.30912, + "64": 9.72066, + "65": 8.87152, + "66": 9.64433, + "67": 9.31332, + "68": 9.74069, + "69": 9.75327, "70": 9.70004, - "75": 9.37312, - "80": 9.36163, - "85": 9.5694, - "90": 9.78468, + "71": 9.56557, + "72": 9.53091, + "73": 9.44385, + "74": 8.8678, + "75": 9.37308, + "76": 9.01275, + "77": 10.02855, + "78": 9.68739, + "79": 9.32795, + "80": 9.36169, + "81": 9.43364, + "82": 9.66094, + "83": 9.25137, + "84": 9.37353, + "85": 9.56936, + "86": 9.03179, + "87": 9.55585, + "88": 9.71056, + "89": 9.55398, + "90": 9.78472, + "91": 9.29079, + "92": 9.31245, + "93": 9.03137, + "94": 8.78667, "95": 9.4873, + "96": 9.49052, + "97": 9.26686, + "98": 9.63648, + "99": 8.84331, "100": 9.3555 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 595.0, - "5": 623.0, - "10": 551.0, - "15": 632.0, - "20": 621.0, - "25": 581.0, - "30": 691.0, - "35": 739.0, - "40": 812.0, - "45": 829.0, - "50": 869.0, - "55": 909.0, - "60": 832.0, - "65": 936.0, - "70": 1050.0, - "75": 816.0, - "80": 1140.0, - "85": 1203.0, - "90": 1108.0, - "95": 1190.0, - "100": 1117.0 + "1": 603.0, + "2": 642.0, + "3": 648.0, + "4": 599.0, + "5": 644.0, + "6": 645.0, + "7": 625.0, + "8": 544.0, + "9": 657.0, + "10": 536.0, + "11": 673.0, + "12": 618.0, + "13": 646.0, + "14": 683.0, + "15": 639.0, + "16": 616.0, + "17": 656.0, + "18": 579.0, + "19": 637.0, + "20": 628.0, + "21": 672.0, + "22": 627.0, + "23": 744.0, + "24": 610.0, + "25": 578.0, + "26": 602.0, + "27": 633.0, + "28": 750.0, + "29": 709.0, + "30": 736.0, + "31": 626.0, + "32": 716.0, + "33": 754.0, + "34": 692.0, + "35": 707.0, + "36": 733.0, + "37": 797.0, + "38": 813.0, + "39": 878.0, + "40": 807.0, + "41": 808.0, + "42": 831.0, + "43": 703.0, + "44": 810.0, + "45": 768.0, + "46": 858.0, + "47": 879.0, + "48": 856.0, + "49": 814.0, + "50": 862.0, + "51": 928.0, + "52": 1001.0, + "53": 1019.0, + "54": 978.0, + "55": 917.0, + "56": 1023.0, + "57": 835.0, + "58": 1020.0, + "59": 1033.0, + "60": 900.0, + "61": 998.0, + "62": 966.0, + "63": 933.0, + "64": 1084.0, + "65": 960.0, + "66": 1081.0, + "67": 1043.0, + "68": 1032.0, + "69": 1029.0, + "70": 1108.0, + "71": 1123.0, + "72": 848.0, + "73": 991.0, + "74": 685.0, + "75": 878.0, + "76": 1149.0, + "77": 1198.0, + "78": 1087.0, + "79": 1095.0, + "80": 1114.0, + "81": 1229.0, + "82": 1048.0, + "83": 1002.0, + "84": 1115.0, + "85": 1228.0, + "86": 896.0, + "87": 1212.0, + "88": 1039.0, + "89": 1111.0, + "90": 1085.0, + "91": 1140.0, + "92": 1186.0, + "93": 896.0, + "94": 1148.0, + "95": 1102.0, + "96": 1113.0, + "97": 1002.0, + "98": 1267.0, + "99": 1178.0, + "100": 1179.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, "50": 510689792.0, + "51": 510689792.0, + "52": 510689792.0, + "53": 510689792.0, + "54": 510689792.0, "55": 510689792.0, + "56": 510689792.0, + "57": 510689792.0, + "58": 510689792.0, + "59": 510689792.0, "60": 510689792.0, + "61": 510689792.0, + "62": 510689792.0, + "63": 510689792.0, + "64": 510689792.0, "65": 510689792.0, + "66": 510689792.0, + "67": 510689792.0, + "68": 510689792.0, + "69": 510689792.0, "70": 510689792.0, + "71": 510689792.0, + "72": 510689792.0, + "73": 510689792.0, + "74": 510689792.0, "75": 510689792.0, + "76": 510689792.0, + "77": 510689792.0, + "78": 510689792.0, + "79": 510689792.0, "80": 510689792.0, + "81": 510689792.0, + "82": 510689792.0, + "83": 510689792.0, + "84": 510689792.0, "85": 510689792.0, + "86": 510689792.0, + "87": 510689792.0, + "88": 510689792.0, + "89": 510689792.0, "90": 510689792.0, + "91": 510689792.0, + "92": 510689792.0, + "93": 510689792.0, + "94": 510689792.0, "95": 510689792.0, + "96": 510689792.0, + "97": 510689792.0, + "98": 510689792.0, + "99": 510689792.0, "100": 510689792.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, "50": 933156352.0, + "51": 933156352.0, + "52": 933156352.0, + "53": 933156352.0, + "54": 933156352.0, "55": 933156352.0, + "56": 933156352.0, + "57": 933156352.0, + "58": 933156352.0, + "59": 933156352.0, "60": 933156352.0, + "61": 933156352.0, + "62": 933156352.0, + "63": 933156352.0, + "64": 933156352.0, "65": 933156352.0, + "66": 933156352.0, + "67": 933156352.0, + "68": 933156352.0, + "69": 933156352.0, "70": 933156352.0, + "71": 933156352.0, + "72": 933156352.0, + "73": 933156352.0, + "74": 933156352.0, "75": 933156352.0, + "76": 933156352.0, + "77": 933156352.0, + "78": 933156352.0, + "79": 933156352.0, "80": 933156352.0, + "81": 933156352.0, + "82": 933156352.0, + "83": 933156352.0, + "84": 933156352.0, "85": 933156352.0, + "86": 933156352.0, + "87": 933156352.0, + "88": 933156352.0, + "89": 933156352.0, "90": 933156352.0, + "91": 933156352.0, + "92": 933156352.0, + "93": 933156352.0, + "94": 933156352.0, "95": 933156352.0, + "96": 933156352.0, + "97": 933156352.0, + "98": 933156352.0, + "99": 933156352.0, "100": 933156352.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.34885, - "5": 0.28143, - "10": 0.28313, - "15": 0.27848, - "20": 0.28429, - "25": 0.28541, - "30": 0.28319, - "35": 0.28404, - "40": 0.28308, - "45": 0.27994, - "50": 0.28525, - "55": 0.2917, - "60": 0.29133, - "65": 0.28566, - "70": 0.29027, - "75": 0.28604, - "80": 0.29548, - "85": 0.28726, - "90": 0.28624, - "95": 0.2883, - "100": 0.29017 + "1": 16.11625, + "2": 0.36631, + "3": 0.34354, + "4": 0.34024, + "5": 0.33469, + "6": 0.3419, + "7": 0.33228, + "8": 0.32074, + "9": 0.32378, + "10": 0.32158, + "11": 0.32213, + "12": 0.32775, + "13": 0.32607, + "14": 0.32118, + "15": 0.3245, + "16": 0.3215, + "17": 0.32118, + "18": 0.32636, + "19": 0.32325, + "20": 0.32277, + "21": 0.32375, + "22": 0.32539, + "23": 0.32026, + "24": 0.32491, + "25": 0.32391, + "26": 0.32302, + "27": 0.32176, + "28": 0.32809, + "29": 0.32603, + "30": 0.3249, + "31": 0.33977, + "32": 0.34038, + "33": 0.34031, + "34": 0.32189, + "35": 0.32635, + "36": 0.32269, + "37": 0.32267, + "38": 0.3225, + "39": 0.32579, + "40": 0.32854, + "41": 0.32405, + "42": 0.32252, + "43": 0.3294, + "44": 0.32763, + "45": 0.32247, + "46": 0.32281, + "47": 0.32544, + "48": 0.32623, + "49": 0.32647, + "50": 0.32132, + "51": 0.32838, + "52": 0.32103, + "53": 0.32972, + "54": 0.32308, + "55": 0.3197, + "56": 0.32532, + "57": 0.33022, + "58": 0.32385, + "59": 0.3254, + "60": 0.33968, + "61": 0.334, + "62": 0.33471, + "63": 0.33468, + "64": 0.32025, + "65": 0.31712, + "66": 0.327, + "67": 0.3195, + "68": 0.32296, + "69": 0.32809, + "70": 0.321, + "71": 0.32464, + "72": 0.33034, + "73": 0.32003, + "74": 0.31593, + "75": 0.32867, + "76": 0.32348, + "77": 0.31767, + "78": 0.33054, + "79": 0.32363, + "80": 0.3218, + "81": 0.32884, + "82": 0.32228, + "83": 0.31938, + "84": 0.32519, + "85": 0.32022, + "86": 0.32099, + "87": 0.32558, + "88": 0.32258, + "89": 0.32117, + "90": 0.33145, + "91": 0.33173, + "92": 0.32613, + "93": 0.33404, + "94": 0.32862, + "95": 0.32897, + "96": 0.32817, + "97": 0.32958, + "98": 0.32759, + "99": 0.33061, + "100": 0.33344 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..af0dc8991a7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86286, + "4": 10.8401, + "5": 10.87854, + "6": 10.88851, + "7": 10.86534, + "8": 10.86016, + "9": 10.8599, + "10": 10.82977, + "11": 10.88949, + "12": 10.8751, + "13": 10.87423, + "14": 10.89677, + "15": 10.82052, + "16": 10.82497, + "17": 10.78983, + "18": 10.81028, + "19": 10.80533, + "20": 10.70396, + "21": 10.66992, + "22": 10.50642, + "23": 10.69003, + "24": 10.56316, + "25": 10.49422, + "26": 10.56629, + "27": 10.58024, + "28": 10.5157, + "29": 10.55294, + "30": 10.30549, + "31": 10.02246, + "32": 10.40618, + "33": 10.3988, + "34": 10.13772, + "35": 10.20188, + "36": 10.16051, + "37": 10.28976, + "38": 10.11481, + "39": 10.36103, + "40": 10.01902, + "41": 10.07292, + "42": 10.14693, + "43": 9.74685, + "44": 9.87763, + "45": 9.74968, + "46": 9.73387, + "47": 10.07535, + "48": 9.78069, + "49": 9.44782, + "50": 9.83989, + "51": 9.78023, + "52": 9.67265, + "53": 10.02014, + "54": 9.89792, + "55": 9.81667, + "56": 9.56045, + "57": 9.41178, + "58": 9.77416, + "59": 9.51797, + "60": 9.43536, + "61": 9.64484, + "62": 9.93004, + "63": 9.30908, + "64": 9.72064, + "65": 8.87155, + "66": 9.64428, + "67": 9.31328, + "68": 9.74066, + "69": 9.75332, + "70": 9.70004, + "71": 9.56561, + "72": 9.53094, + "73": 9.44384, + "74": 8.86782, + "75": 9.37311, + "76": 9.01276, + "77": 10.02852, + "78": 9.68739, + "79": 9.32796, + "80": 9.36168, + "81": 9.43368, + "82": 9.66094, + "83": 9.25138, + "84": 9.37354, + "85": 9.5694, + "86": 9.03176, + "87": 9.55582, + "88": 9.71055, + "89": 9.55397, + "90": 9.7847, + "91": 9.29075, + "92": 9.31241, + "93": 9.03141, + "94": 8.78668, + "95": 9.48729, + "96": 9.49051, + "97": 9.26682, + "98": 9.63648, + "99": 8.84335, + "100": 9.35548 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 602.0, + "2": 621.0, + "3": 616.0, + "4": 577.0, + "5": 617.0, + "6": 617.0, + "7": 645.0, + "8": 568.0, + "9": 673.0, + "10": 569.0, + "11": 637.0, + "12": 647.0, + "13": 676.0, + "14": 666.0, + "15": 706.0, + "16": 627.0, + "17": 640.0, + "18": 607.0, + "19": 623.0, + "20": 620.0, + "21": 654.0, + "22": 640.0, + "23": 775.0, + "24": 581.0, + "25": 629.0, + "26": 665.0, + "27": 689.0, + "28": 707.0, + "29": 722.0, + "30": 738.0, + "31": 640.0, + "32": 746.0, + "33": 831.0, + "34": 673.0, + "35": 746.0, + "36": 749.0, + "37": 826.0, + "38": 771.0, + "39": 852.0, + "40": 746.0, + "41": 834.0, + "42": 845.0, + "43": 709.0, + "44": 739.0, + "45": 808.0, + "46": 888.0, + "47": 849.0, + "48": 880.0, + "49": 879.0, + "50": 840.0, + "51": 915.0, + "52": 896.0, + "53": 1048.0, + "54": 1044.0, + "55": 954.0, + "56": 960.0, + "57": 849.0, + "58": 1035.0, + "59": 1036.0, + "60": 875.0, + "61": 1010.0, + "62": 973.0, + "63": 928.0, + "64": 1019.0, + "65": 928.0, + "66": 1115.0, + "67": 966.0, + "68": 954.0, + "69": 1094.0, + "70": 1039.0, + "71": 1034.0, + "72": 891.0, + "73": 1023.0, + "74": 764.0, + "75": 903.0, + "76": 1061.0, + "77": 1149.0, + "78": 1070.0, + "79": 1063.0, + "80": 1091.0, + "81": 1242.0, + "82": 1047.0, + "83": 1012.0, + "84": 1154.0, + "85": 1199.0, + "86": 930.0, + "87": 1297.0, + "88": 1049.0, + "89": 1103.0, + "90": 1021.0, + "91": 1134.0, + "92": 1187.0, + "93": 918.0, + "94": 1129.0, + "95": 1126.0, + "96": 1146.0, + "97": 1003.0, + "98": 1260.0, + "99": 1135.0, + "100": 1164.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0, + "51": 510689792.0, + "52": 510689792.0, + "53": 510689792.0, + "54": 510689792.0, + "55": 510689792.0, + "56": 510689792.0, + "57": 510689792.0, + "58": 510689792.0, + "59": 510689792.0, + "60": 510689792.0, + "61": 510689792.0, + "62": 510689792.0, + "63": 510689792.0, + "64": 510689792.0, + "65": 510689792.0, + "66": 510689792.0, + "67": 510689792.0, + "68": 510689792.0, + "69": 510689792.0, + "70": 510689792.0, + "71": 510689792.0, + "72": 510689792.0, + "73": 510689792.0, + "74": 510689792.0, + "75": 510689792.0, + "76": 510689792.0, + "77": 510689792.0, + "78": 510689792.0, + "79": 510689792.0, + "80": 510689792.0, + "81": 510689792.0, + "82": 510689792.0, + "83": 510689792.0, + "84": 510689792.0, + "85": 510689792.0, + "86": 510689792.0, + "87": 510689792.0, + "88": 510689792.0, + "89": 510689792.0, + "90": 510689792.0, + "91": 510689792.0, + "92": 510689792.0, + "93": 510689792.0, + "94": 510689792.0, + "95": 510689792.0, + "96": 510689792.0, + "97": 510689792.0, + "98": 510689792.0, + "99": 510689792.0, + "100": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 934203392.0, + "16": 934203392.0, + "17": 934203392.0, + "18": 934203392.0, + "19": 934203392.0, + "20": 934203392.0, + "21": 934203392.0, + "22": 934203392.0, + "23": 934203392.0, + "24": 934203392.0, + "25": 934203392.0, + "26": 934203392.0, + "27": 934203392.0, + "28": 934203392.0, + "29": 934203392.0, + "30": 934203392.0, + "31": 934203392.0, + "32": 934203392.0, + "33": 934203392.0, + "34": 934203392.0, + "35": 934203392.0, + "36": 934203392.0, + "37": 934203392.0, + "38": 934203392.0, + "39": 934203392.0, + "40": 934203392.0, + "41": 934203392.0, + "42": 934203392.0, + "43": 934203392.0, + "44": 934203392.0, + "45": 934203392.0, + "46": 934203392.0, + "47": 934203392.0, + "48": 934203392.0, + "49": 934203392.0, + "50": 934203392.0, + "51": 934203392.0, + "52": 934203392.0, + "53": 934203392.0, + "54": 934203392.0, + "55": 934203392.0, + "56": 934203392.0, + "57": 934203392.0, + "58": 934203392.0, + "59": 934203392.0, + "60": 934203392.0, + "61": 934203392.0, + "62": 934203392.0, + "63": 934203392.0, + "64": 934203392.0, + "65": 934203392.0, + "66": 934203392.0, + "67": 934203392.0, + "68": 934203392.0, + "69": 934203392.0, + "70": 934203392.0, + "71": 934203392.0, + "72": 934203392.0, + "73": 934203392.0, + "74": 934203392.0, + "75": 934203392.0, + "76": 934203392.0, + "77": 934203392.0, + "78": 934203392.0, + "79": 934203392.0, + "80": 934203392.0, + "81": 934203392.0, + "82": 934203392.0, + "83": 934203392.0, + "84": 934203392.0, + "85": 934203392.0, + "86": 934203392.0, + "87": 934203392.0, + "88": 934203392.0, + "89": 934203392.0, + "90": 934203392.0, + "91": 934203392.0, + "92": 934203392.0, + "93": 934203392.0, + "94": 934203392.0, + "95": 934203392.0, + "96": 934203392.0, + "97": 934203392.0, + "98": 934203392.0, + "99": 934203392.0, + "100": 934203392.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.126, + "2": 0.48552, + "3": 0.29604, + "4": 0.30321, + "5": 0.28764, + "6": 0.28618, + "7": 0.28577, + "8": 0.28879, + "9": 0.28726, + "10": 0.28646, + "11": 0.28506, + "12": 0.28217, + "13": 0.2868, + "14": 0.28787, + "15": 0.28549, + "16": 0.2862, + "17": 0.28698, + "18": 0.29086, + "19": 0.28554, + "20": 0.2857, + "21": 0.28549, + "22": 0.28641, + "23": 0.28608, + "24": 0.28569, + "25": 0.28652, + "26": 0.28468, + "27": 0.28942, + "28": 0.28949, + "29": 0.28879, + "30": 0.28796, + "31": 0.29103, + "32": 0.29073, + "33": 0.28732, + "34": 0.29616, + "35": 0.28855, + "36": 0.28828, + "37": 0.28466, + "38": 0.28953, + "39": 0.29333, + "40": 0.28768, + "41": 0.28231, + "42": 0.28695, + "43": 0.28583, + "44": 0.28905, + "45": 0.28528, + "46": 0.28715, + "47": 0.28626, + "48": 0.28831, + "49": 0.28647, + "50": 0.28555, + "51": 0.29483, + "52": 0.28779, + "53": 0.28678, + "54": 0.28789, + "55": 0.28871, + "56": 0.29987, + "57": 0.29343, + "58": 0.28823, + "59": 0.28887, + "60": 0.29468, + "61": 0.28773, + "62": 0.30025, + "63": 0.28844, + "64": 0.28597, + "65": 0.28565, + "66": 0.2875, + "67": 0.28661, + "68": 0.2859, + "69": 0.28584, + "70": 0.28606, + "71": 0.286, + "72": 0.2846, + "73": 0.29219, + "74": 0.28688, + "75": 0.28871, + "76": 0.28938, + "77": 0.28731, + "78": 0.28558, + "79": 0.28696, + "80": 0.28619, + "81": 0.28793, + "82": 0.28828, + "83": 0.28522, + "84": 0.29988, + "85": 0.29704, + "86": 0.28664, + "87": 0.2857, + "88": 0.28622, + "89": 0.28571, + "90": 0.2853, + "91": 0.29259, + "92": 0.28615, + "93": 0.285, + "94": 0.286, + "95": 0.28546, + "96": 0.28446, + "97": 0.28434, + "98": 0.28413, + "99": 0.2875, + "100": 0.28509 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..c677311f507 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86535, + "2": 10.85873, + "3": 10.86285, + "4": 10.84007, + "5": 10.87856, + "6": 10.88856, + "7": 10.86538, + "8": 10.86017, + "9": 10.85991, + "10": 10.8298, + "11": 10.88947, + "12": 10.87508, + "13": 10.87422, + "14": 10.89677, + "15": 10.8205, + "16": 10.82499, + "17": 10.78984, + "18": 10.81029, + "19": 10.80536, + "20": 10.70396, + "21": 10.6699, + "22": 10.50644, + "23": 10.69003, + "24": 10.5631, + "25": 10.49417, + "26": 10.56624, + "27": 10.58026, + "28": 10.51571, + "29": 10.553, + "30": 10.30552, + "31": 10.02249, + "32": 10.40613, + "33": 10.3988, + "34": 10.13771, + "35": 10.20186, + "36": 10.16052, + "37": 10.28975, + "38": 10.1148, + "39": 10.36102, + "40": 10.01904, + "41": 10.07292, + "42": 10.14696, + "43": 9.74683, + "44": 9.87763, + "45": 9.74966, + "46": 9.73387, + "47": 10.07534, + "48": 9.78069, + "49": 9.4478, + "50": 9.83991, + "51": 9.78025, + "52": 9.67263, + "53": 10.0201, + "54": 9.89789, + "55": 9.81664, + "56": 9.56044, + "57": 9.41178, + "58": 9.77419, + "59": 9.51794, + "60": 9.43538, + "61": 9.64484, + "62": 9.93004, + "63": 9.30911, + "64": 9.72068, + "65": 8.87154, + "66": 9.64427, + "67": 9.31328, + "68": 9.74067, + "69": 9.75334, + "70": 9.70004, + "71": 9.56556, + "72": 9.53094, + "73": 9.44386, + "74": 8.86782, + "75": 9.37314, + "76": 9.01274, + "77": 10.02855, + "78": 9.68739, + "79": 9.328, + "80": 9.36168, + "81": 9.43367, + "82": 9.66094, + "83": 9.25139, + "84": 9.37352, + "85": 9.56939, + "86": 9.03181, + "87": 9.55584, + "88": 9.71055, + "89": 9.55395, + "90": 9.78475, + "91": 9.29077, + "92": 9.31245, + "93": 9.03142, + "94": 8.78671, + "95": 9.4873, + "96": 9.49052, + "97": 9.26684, + "98": 9.63648, + "99": 8.84333, + "100": 9.35549 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 585.0, + "2": 648.0, + "3": 630.0, + "4": 656.0, + "5": 620.0, + "6": 637.0, + "7": 641.0, + "8": 581.0, + "9": 660.0, + "10": 504.0, + "11": 664.0, + "12": 639.0, + "13": 670.0, + "14": 666.0, + "15": 652.0, + "16": 624.0, + "17": 704.0, + "18": 579.0, + "19": 682.0, + "20": 623.0, + "21": 657.0, + "22": 561.0, + "23": 763.0, + "24": 593.0, + "25": 629.0, + "26": 669.0, + "27": 691.0, + "28": 738.0, + "29": 788.0, + "30": 744.0, + "31": 604.0, + "32": 736.0, + "33": 787.0, + "34": 706.0, + "35": 692.0, + "36": 714.0, + "37": 835.0, + "38": 768.0, + "39": 894.0, + "40": 764.0, + "41": 852.0, + "42": 878.0, + "43": 733.0, + "44": 827.0, + "45": 785.0, + "46": 877.0, + "47": 927.0, + "48": 873.0, + "49": 891.0, + "50": 869.0, + "51": 928.0, + "52": 968.0, + "53": 1089.0, + "54": 966.0, + "55": 913.0, + "56": 983.0, + "57": 889.0, + "58": 1063.0, + "59": 1005.0, + "60": 876.0, + "61": 1043.0, + "62": 897.0, + "63": 971.0, + "64": 1100.0, + "65": 911.0, + "66": 1107.0, + "67": 948.0, + "68": 1033.0, + "69": 1064.0, + "70": 1118.0, + "71": 1032.0, + "72": 854.0, + "73": 1007.0, + "74": 739.0, + "75": 877.0, + "76": 1075.0, + "77": 1108.0, + "78": 1103.0, + "79": 980.0, + "80": 1055.0, + "81": 1240.0, + "82": 1101.0, + "83": 1007.0, + "84": 1147.0, + "85": 1157.0, + "86": 897.0, + "87": 1247.0, + "88": 1015.0, + "89": 1155.0, + "90": 1138.0, + "91": 1141.0, + "92": 1142.0, + "93": 947.0, + "94": 1116.0, + "95": 1119.0, + "96": 1099.0, + "97": 997.0, + "98": 1188.0, + "99": 1141.0, + "100": 1102.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 510689792.0, + "2": 510689792.0, + "3": 510689792.0, + "4": 510689792.0, + "5": 510689792.0, + "6": 510689792.0, + "7": 510689792.0, + "8": 510689792.0, + "9": 510689792.0, + "10": 510689792.0, + "11": 510689792.0, + "12": 510689792.0, + "13": 510689792.0, + "14": 510689792.0, + "15": 510689792.0, + "16": 510689792.0, + "17": 510689792.0, + "18": 510689792.0, + "19": 510689792.0, + "20": 510689792.0, + "21": 510689792.0, + "22": 510689792.0, + "23": 510689792.0, + "24": 510689792.0, + "25": 510689792.0, + "26": 510689792.0, + "27": 510689792.0, + "28": 510689792.0, + "29": 510689792.0, + "30": 510689792.0, + "31": 510689792.0, + "32": 510689792.0, + "33": 510689792.0, + "34": 510689792.0, + "35": 510689792.0, + "36": 510689792.0, + "37": 510689792.0, + "38": 510689792.0, + "39": 510689792.0, + "40": 510689792.0, + "41": 510689792.0, + "42": 510689792.0, + "43": 510689792.0, + "44": 510689792.0, + "45": 510689792.0, + "46": 510689792.0, + "47": 510689792.0, + "48": 510689792.0, + "49": 510689792.0, + "50": 510689792.0, + "51": 510689792.0, + "52": 510689792.0, + "53": 510689792.0, + "54": 510689792.0, + "55": 510689792.0, + "56": 510689792.0, + "57": 510689792.0, + "58": 510689792.0, + "59": 510689792.0, + "60": 510689792.0, + "61": 510689792.0, + "62": 510689792.0, + "63": 510689792.0, + "64": 510689792.0, + "65": 510689792.0, + "66": 510689792.0, + "67": 510689792.0, + "68": 510689792.0, + "69": 510689792.0, + "70": 510689792.0, + "71": 510689792.0, + "72": 510689792.0, + "73": 510689792.0, + "74": 510689792.0, + "75": 510689792.0, + "76": 510689792.0, + "77": 510689792.0, + "78": 510689792.0, + "79": 510689792.0, + "80": 510689792.0, + "81": 510689792.0, + "82": 510689792.0, + "83": 510689792.0, + "84": 510689792.0, + "85": 510689792.0, + "86": 510689792.0, + "87": 510689792.0, + "88": 510689792.0, + "89": 510689792.0, + "90": 510689792.0, + "91": 510689792.0, + "92": 510689792.0, + "93": 510689792.0, + "94": 510689792.0, + "95": 510689792.0, + "96": 510689792.0, + "97": 510689792.0, + "98": 510689792.0, + "99": 510689792.0, + "100": 510689792.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 759895552.0, + "2": 933156352.0, + "3": 933156352.0, + "4": 933156352.0, + "5": 933156352.0, + "6": 933156352.0, + "7": 933156352.0, + "8": 933156352.0, + "9": 933156352.0, + "10": 933156352.0, + "11": 933156352.0, + "12": 933156352.0, + "13": 933156352.0, + "14": 933156352.0, + "15": 933156352.0, + "16": 933156352.0, + "17": 933156352.0, + "18": 933156352.0, + "19": 933156352.0, + "20": 933156352.0, + "21": 933156352.0, + "22": 933156352.0, + "23": 933156352.0, + "24": 933156352.0, + "25": 933156352.0, + "26": 933156352.0, + "27": 933156352.0, + "28": 933156352.0, + "29": 933156352.0, + "30": 933156352.0, + "31": 933156352.0, + "32": 933156352.0, + "33": 933156352.0, + "34": 933156352.0, + "35": 933156352.0, + "36": 933156352.0, + "37": 933156352.0, + "38": 933156352.0, + "39": 933156352.0, + "40": 933156352.0, + "41": 933156352.0, + "42": 933156352.0, + "43": 933156352.0, + "44": 933156352.0, + "45": 933156352.0, + "46": 933156352.0, + "47": 933156352.0, + "48": 933156352.0, + "49": 933156352.0, + "50": 933156352.0, + "51": 933156352.0, + "52": 933156352.0, + "53": 933156352.0, + "54": 933156352.0, + "55": 933156352.0, + "56": 933156352.0, + "57": 933156352.0, + "58": 933156352.0, + "59": 933156352.0, + "60": 933156352.0, + "61": 933156352.0, + "62": 933156352.0, + "63": 933156352.0, + "64": 933156352.0, + "65": 933156352.0, + "66": 933156352.0, + "67": 933156352.0, + "68": 933156352.0, + "69": 933156352.0, + "70": 933156352.0, + "71": 933156352.0, + "72": 933156352.0, + "73": 933156352.0, + "74": 933156352.0, + "75": 933156352.0, + "76": 933156352.0, + "77": 933156352.0, + "78": 933156352.0, + "79": 933156352.0, + "80": 933156352.0, + "81": 933156352.0, + "82": 933156352.0, + "83": 933156352.0, + "84": 933156352.0, + "85": 933156352.0, + "86": 933156352.0, + "87": 933156352.0, + "88": 933156352.0, + "89": 933156352.0, + "90": 933156352.0, + "91": 933156352.0, + "92": 933156352.0, + "93": 933156352.0, + "94": 933156352.0, + "95": 933156352.0, + "96": 933156352.0, + "97": 933156352.0, + "98": 933156352.0, + "99": 933156352.0, + "100": 933156352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.91944, + "2": 0.35854, + "3": 0.34422, + "4": 0.34655, + "5": 0.33791, + "6": 0.34327, + "7": 0.34394, + "8": 0.3383, + "9": 0.34058, + "10": 0.32396, + "11": 0.32631, + "12": 0.33064, + "13": 0.32832, + "14": 0.32645, + "15": 0.32686, + "16": 0.32351, + "17": 0.32796, + "18": 0.33094, + "19": 0.32865, + "20": 0.32722, + "21": 0.32666, + "22": 0.32679, + "23": 0.32717, + "24": 0.32824, + "25": 0.32793, + "26": 0.32517, + "27": 0.326, + "28": 0.32627, + "29": 0.32627, + "30": 0.32688, + "31": 0.32603, + "32": 0.32544, + "33": 0.32613, + "34": 0.32696, + "35": 0.32522, + "36": 0.32966, + "37": 0.32462, + "38": 0.32724, + "39": 0.32622, + "40": 0.32646, + "41": 0.32504, + "42": 0.32464, + "43": 0.3299, + "44": 0.32495, + "45": 0.32382, + "46": 0.32567, + "47": 0.32847, + "48": 0.32521, + "49": 0.32738, + "50": 0.32495, + "51": 0.33517, + "52": 0.33963, + "53": 0.33084, + "54": 0.3299, + "55": 0.33062, + "56": 0.32923, + "57": 0.32909, + "58": 0.331, + "59": 0.32595, + "60": 0.32446, + "61": 0.32961, + "62": 0.33126, + "63": 0.32393, + "64": 0.32986, + "65": 0.32836, + "66": 0.32921, + "67": 0.32945, + "68": 0.32848, + "69": 0.32625, + "70": 0.32898, + "71": 0.33227, + "72": 0.32403, + "73": 0.3284, + "74": 0.32761, + "75": 0.32791, + "76": 0.33223, + "77": 0.33113, + "78": 0.32546, + "79": 0.32925, + "80": 0.33175, + "81": 0.33071, + "82": 0.32698, + "83": 0.32738, + "84": 0.32835, + "85": 0.32729, + "86": 0.33228, + "87": 0.32668, + "88": 0.33091, + "89": 0.32825, + "90": 0.32752, + "91": 0.32814, + "92": 0.33195, + "93": 0.32686, + "94": 0.33172, + "95": 0.33336, + "96": 0.32938, + "97": 0.33024, + "98": 0.32939, + "99": 0.32654, + "100": 0.3311 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..ebf6c82ee54 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91345, + "4": 10.90324, + "5": 10.92968, + "6": 10.93656, + "7": 10.90276, + "8": 10.92117, + "9": 10.90704, + "10": 10.90472, + "11": 10.88787, + "12": 10.91738, + "13": 10.9119, + "14": 10.91507, + "15": 10.87126, + "16": 10.8613, + "17": 10.82697, + "18": 10.85679, + "19": 10.84054, + "20": 10.75001, + "21": 10.71507, + "22": 10.58114, + "23": 10.72644, + "24": 10.60727, + "25": 10.53752, + "26": 10.61066, + "27": 10.59932, + "28": 10.54958, + "29": 10.56604, + "30": 10.32552, + "31": 10.06696, + "32": 10.4381, + "33": 10.42364, + "34": 10.16013, + "35": 10.22893, + "36": 10.17617, + "37": 10.29237, + "38": 10.13294, + "39": 10.34957, + "40": 10.01977, + "41": 10.07538, + "42": 10.15409, + "43": 9.76086, + "44": 9.88355, + "45": 9.75547, + "46": 9.74959, + "47": 10.07548, + "48": 9.7794, + "49": 9.43816, + "50": 9.84069, + "51": 9.77753, + "52": 9.66527, + "53": 10.00737, + "54": 9.88876, + "55": 9.81447, + "56": 9.55926, + "57": 9.39917, + "58": 9.77268, + "59": 9.51592, + "60": 9.42444, + "61": 9.64312, + "62": 9.93506, + "63": 9.30274, + "64": 9.72153, + "65": 8.86712, + "66": 9.64652, + "67": 9.30859, + "68": 9.74064, + "69": 9.7415, + "70": 9.679, + "71": 9.55873, + "72": 9.53279, + "73": 9.43847, + "74": 8.88232, + "75": 9.36664, + "76": 9.02474, + "77": 10.02955, + "78": 9.68856, + "79": 9.32607, + "80": 9.35304, + "81": 9.43249, + "82": 9.65191, + "83": 9.25401, + "84": 9.36521, + "85": 9.56704, + "86": 9.03547, + "87": 9.55775, + "88": 9.70744, + "89": 9.55898, + "90": 9.77582, + "91": 9.29648, + "92": 9.32116, + "93": 9.02867, + "94": 8.78308, + "95": 9.48328, + "96": 9.48474, + "97": 9.26673, + "98": 9.63741, + "99": 8.83899, + "100": 9.35877 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 612.0, + "2": 654.0, + "3": 644.0, + "4": 624.0, + "5": 683.0, + "6": 610.0, + "7": 588.0, + "8": 594.0, + "9": 672.0, + "10": 520.0, + "11": 665.0, + "12": 621.0, + "13": 608.0, + "14": 635.0, + "15": 647.0, + "16": 630.0, + "17": 644.0, + "18": 624.0, + "19": 615.0, + "20": 606.0, + "21": 625.0, + "22": 608.0, + "23": 673.0, + "24": 575.0, + "25": 614.0, + "26": 607.0, + "27": 677.0, + "28": 722.0, + "29": 751.0, + "30": 740.0, + "31": 643.0, + "32": 722.0, + "33": 755.0, + "34": 656.0, + "35": 704.0, + "36": 719.0, + "37": 777.0, + "38": 788.0, + "39": 864.0, + "40": 783.0, + "41": 775.0, + "42": 842.0, + "43": 714.0, + "44": 725.0, + "45": 765.0, + "46": 880.0, + "47": 877.0, + "48": 813.0, + "49": 884.0, + "50": 806.0, + "51": 892.0, + "52": 949.0, + "53": 967.0, + "54": 953.0, + "55": 873.0, + "56": 949.0, + "57": 857.0, + "58": 1012.0, + "59": 993.0, + "60": 902.0, + "61": 986.0, + "62": 927.0, + "63": 856.0, + "64": 1097.0, + "65": 939.0, + "66": 1069.0, + "67": 932.0, + "68": 951.0, + "69": 1057.0, + "70": 1099.0, + "71": 1071.0, + "72": 884.0, + "73": 1024.0, + "74": 726.0, + "75": 895.0, + "76": 1038.0, + "77": 1116.0, + "78": 1129.0, + "79": 1060.0, + "80": 1169.0, + "81": 1199.0, + "82": 1064.0, + "83": 1024.0, + "84": 1124.0, + "85": 1134.0, + "86": 836.0, + "87": 1175.0, + "88": 1046.0, + "89": 1174.0, + "90": 1121.0, + "91": 1063.0, + "92": 1161.0, + "93": 925.0, + "94": 1129.0, + "95": 1168.0, + "96": 1212.0, + "97": 1019.0, + "98": 1216.0, + "99": 1131.0, + "100": 1070.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0, + "51": 431783936.0, + "52": 431783936.0, + "53": 431783936.0, + "54": 431783936.0, + "55": 431783936.0, + "56": 431783936.0, + "57": 431783936.0, + "58": 431783936.0, + "59": 431783936.0, + "60": 431783936.0, + "61": 431783936.0, + "62": 431783936.0, + "63": 431783936.0, + "64": 431783936.0, + "65": 431783936.0, + "66": 431783936.0, + "67": 431783936.0, + "68": 431783936.0, + "69": 431783936.0, + "70": 431783936.0, + "71": 431783936.0, + "72": 431783936.0, + "73": 431783936.0, + "74": 431783936.0, + "75": 431783936.0, + "76": 431783936.0, + "77": 431783936.0, + "78": 431783936.0, + "79": 431783936.0, + "80": 431783936.0, + "81": 431783936.0, + "82": 431783936.0, + "83": 431783936.0, + "84": 431783936.0, + "85": 431783936.0, + "86": 431783936.0, + "87": 431783936.0, + "88": 431783936.0, + "89": 431783936.0, + "90": 431783936.0, + "91": 431783936.0, + "92": 431783936.0, + "93": 431783936.0, + "94": 431783936.0, + "95": 431783936.0, + "96": 431783936.0, + "97": 431783936.0, + "98": 431783936.0, + "99": 431783936.0, + "100": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 677333504.0, + "2": 854262272.0, + "3": 854262272.0, + "4": 854262272.0, + "5": 854262272.0, + "6": 854262784.0, + "7": 854262784.0, + "8": 854262784.0, + "9": 854262784.0, + "10": 854262784.0, + "11": 854262784.0, + "12": 854262784.0, + "13": 854262784.0, + "14": 855309824.0, + "15": 855309824.0, + "16": 855309824.0, + "17": 855309824.0, + "18": 855309824.0, + "19": 855309824.0, + "20": 855309824.0, + "21": 855309824.0, + "22": 855309824.0, + "23": 855309824.0, + "24": 855310336.0, + "25": 855310336.0, + "26": 855310336.0, + "27": 855310336.0, + "28": 855310336.0, + "29": 855310336.0, + "30": 855310336.0, + "31": 855310336.0, + "32": 855310336.0, + "33": 855310336.0, + "34": 855310336.0, + "35": 855310336.0, + "36": 855310336.0, + "37": 855310336.0, + "38": 855310336.0, + "39": 855310848.0, + "40": 855310848.0, + "41": 855310848.0, + "42": 855310848.0, + "43": 855310848.0, + "44": 855310848.0, + "45": 855310848.0, + "46": 855310848.0, + "47": 855310848.0, + "48": 855310848.0, + "49": 855310848.0, + "50": 855310848.0, + "51": 855310848.0, + "52": 855311360.0, + "53": 855311360.0, + "54": 855311360.0, + "55": 855311360.0, + "56": 855311360.0, + "57": 855311360.0, + "58": 855311360.0, + "59": 855311360.0, + "60": 855311360.0, + "61": 855311360.0, + "62": 855311360.0, + "63": 855311360.0, + "64": 855311360.0, + "65": 855311360.0, + "66": 855311360.0, + "67": 855311360.0, + "68": 855311360.0, + "69": 855311360.0, + "70": 855311360.0, + "71": 855311360.0, + "72": 855311360.0, + "73": 855311360.0, + "74": 855311360.0, + "75": 855311360.0, + "76": 855311360.0, + "77": 855311360.0, + "78": 855311360.0, + "79": 855311360.0, + "80": 855311360.0, + "81": 855311360.0, + "82": 855311360.0, + "83": 855311360.0, + "84": 855311360.0, + "85": 855311360.0, + "86": 855311360.0, + "87": 855311360.0, + "88": 855311360.0, + "89": 855311360.0, + "90": 855311360.0, + "91": 855311360.0, + "92": 855311360.0, + "93": 855311360.0, + "94": 855311360.0, + "95": 855311360.0, + "96": 855311360.0, + "97": 855311360.0, + "98": 855311360.0, + "99": 855311360.0, + "100": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.64296, + "2": 0.44061, + "3": 0.39868, + "4": 0.40602, + "5": 0.39627, + "6": 0.40168, + "7": 0.40214, + "8": 0.39767, + "9": 0.41335, + "10": 0.39617, + "11": 0.40142, + "12": 0.40689, + "13": 0.39378, + "14": 0.4283, + "15": 0.39562, + "16": 0.40196, + "17": 0.40151, + "18": 0.3962, + "19": 0.40589, + "20": 0.39453, + "21": 0.3993, + "22": 0.40417, + "23": 0.39434, + "24": 0.40809, + "25": 0.39356, + "26": 0.3984, + "27": 0.39878, + "28": 0.39312, + "29": 0.40669, + "30": 0.39393, + "31": 0.40709, + "32": 0.39611, + "33": 0.3938, + "34": 0.40377, + "35": 0.39302, + "36": 0.40068, + "37": 0.40083, + "38": 0.39393, + "39": 0.40832, + "40": 0.39387, + "41": 0.4, + "42": 0.4025, + "43": 0.39558, + "44": 0.41322, + "45": 0.3943, + "46": 0.40231, + "47": 0.40377, + "48": 0.39613, + "49": 0.41098, + "50": 0.39556, + "51": 0.41526, + "52": 0.40592, + "53": 0.39522, + "54": 0.39643, + "55": 0.40606, + "56": 0.39472, + "57": 0.41022, + "58": 0.3949, + "59": 0.39351, + "60": 0.40774, + "61": 0.39377, + "62": 0.40683, + "63": 0.3959, + "64": 0.39778, + "65": 0.40721, + "66": 0.39636, + "67": 0.41074, + "68": 0.39529, + "69": 0.39586, + "70": 0.40972, + "71": 0.39753, + "72": 0.40958, + "73": 0.39662, + "74": 0.39837, + "75": 0.40947, + "76": 0.3973, + "77": 0.41202, + "78": 0.3967, + "79": 0.39826, + "80": 0.41197, + "81": 0.39832, + "82": 0.40955, + "83": 0.39814, + "84": 0.39694, + "85": 0.41004, + "86": 0.3965, + "87": 0.4108, + "88": 0.39649, + "89": 0.3978, + "90": 0.41151, + "91": 0.39705, + "92": 0.41097, + "93": 0.39242, + "94": 0.39997, + "95": 0.40901, + "96": 0.39359, + "97": 0.40554, + "98": 0.40278, + "99": 0.39673, + "100": 0.40583 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..73ae0926a59 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.93292, + "2": 10.93423, + "3": 10.91346, + "4": 10.90324, + "5": 10.92972, + "6": 10.93653, + "7": 10.90279, + "8": 10.92113, + "9": 10.90704, + "10": 10.90477, + "11": 10.88787, + "12": 10.91738, + "13": 10.91189, + "14": 10.91507, + "15": 10.87125, + "16": 10.86126, + "17": 10.82697, + "18": 10.85673, + "19": 10.84059, + "20": 10.74997, + "21": 10.71507, + "22": 10.58117, + "23": 10.72642, + "24": 10.60726, + "25": 10.53749, + "26": 10.61068, + "27": 10.59929, + "28": 10.5496, + "29": 10.56602, + "30": 10.32547, + "31": 10.06697, + "32": 10.43814, + "33": 10.42363, + "34": 10.16017, + "35": 10.22894, + "36": 10.1762, + "37": 10.29237, + "38": 10.13297, + "39": 10.34954, + "40": 10.01975, + "41": 10.07536, + "42": 10.1541, + "43": 9.76088, + "44": 9.88355, + "45": 9.75547, + "46": 9.74961, + "47": 10.07545, + "48": 9.7794, + "49": 9.43818, + "50": 9.84069, + "51": 9.77754, + "52": 9.66525, + "53": 10.00737, + "54": 9.88878, + "55": 9.81447, + "56": 9.55923, + "57": 9.39915, + "58": 9.77269, + "59": 9.51596, + "60": 9.42442, + "61": 9.64311, + "62": 9.93507, + "63": 9.30273, + "64": 9.72153, + "65": 8.86708, + "66": 9.64649, + "67": 9.30858, + "68": 9.74064, + "69": 9.7415, + "70": 9.67901, + "71": 9.55877, + "72": 9.53276, + "73": 9.43849, + "74": 8.88229, + "75": 9.36665, + "76": 9.02475, + "77": 10.02958, + "78": 9.68855, + "79": 9.32606, + "80": 9.35307, + "81": 9.43246, + "82": 9.65191, + "83": 9.25402, + "84": 9.36522, + "85": 9.56708, + "86": 9.03554, + "87": 9.55776, + "88": 9.70744, + "89": 9.55897, + "90": 9.77584, + "91": 9.2965, + "92": 9.32116, + "93": 9.0287, + "94": 8.78307, + "95": 9.48325, + "96": 9.48475, + "97": 9.26678, + "98": 9.63738, + "99": 8.83898, + "100": 9.35879 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 582.0, + "2": 593.0, + "3": 619.0, + "4": 627.0, + "5": 660.0, + "6": 625.0, + "7": 597.0, + "8": 616.0, + "9": 608.0, + "10": 529.0, + "11": 692.0, + "12": 629.0, + "13": 695.0, + "14": 694.0, + "15": 606.0, + "16": 604.0, + "17": 647.0, + "18": 576.0, + "19": 570.0, + "20": 541.0, + "21": 625.0, + "22": 629.0, + "23": 676.0, + "24": 567.0, + "25": 617.0, + "26": 674.0, + "27": 680.0, + "28": 703.0, + "29": 684.0, + "30": 692.0, + "31": 565.0, + "32": 741.0, + "33": 789.0, + "34": 704.0, + "35": 718.0, + "36": 688.0, + "37": 762.0, + "38": 777.0, + "39": 847.0, + "40": 735.0, + "41": 839.0, + "42": 789.0, + "43": 710.0, + "44": 756.0, + "45": 780.0, + "46": 819.0, + "47": 844.0, + "48": 885.0, + "49": 833.0, + "50": 791.0, + "51": 878.0, + "52": 894.0, + "53": 955.0, + "54": 966.0, + "55": 923.0, + "56": 973.0, + "57": 844.0, + "58": 964.0, + "59": 977.0, + "60": 868.0, + "61": 931.0, + "62": 972.0, + "63": 884.0, + "64": 1042.0, + "65": 895.0, + "66": 1085.0, + "67": 992.0, + "68": 962.0, + "69": 1045.0, + "70": 1078.0, + "71": 1075.0, + "72": 935.0, + "73": 1035.0, + "74": 737.0, + "75": 875.0, + "76": 1037.0, + "77": 1154.0, + "78": 1118.0, + "79": 1051.0, + "80": 1190.0, + "81": 1225.0, + "82": 1135.0, + "83": 999.0, + "84": 1125.0, + "85": 1106.0, + "86": 866.0, + "87": 1201.0, + "88": 1075.0, + "89": 1177.0, + "90": 1092.0, + "91": 1055.0, + "92": 1162.0, + "93": 917.0, + "94": 1083.0, + "95": 1040.0, + "96": 1178.0, + "97": 1096.0, + "98": 1281.0, + "99": 1184.0, + "100": 1106.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 431783936.0, + "2": 431783936.0, + "3": 431783936.0, + "4": 431783936.0, + "5": 431783936.0, + "6": 431783936.0, + "7": 431783936.0, + "8": 431783936.0, + "9": 431783936.0, + "10": 431783936.0, + "11": 431783936.0, + "12": 431783936.0, + "13": 431783936.0, + "14": 431783936.0, + "15": 431783936.0, + "16": 431783936.0, + "17": 431783936.0, + "18": 431783936.0, + "19": 431783936.0, + "20": 431783936.0, + "21": 431783936.0, + "22": 431783936.0, + "23": 431783936.0, + "24": 431783936.0, + "25": 431783936.0, + "26": 431783936.0, + "27": 431783936.0, + "28": 431783936.0, + "29": 431783936.0, + "30": 431783936.0, + "31": 431783936.0, + "32": 431783936.0, + "33": 431783936.0, + "34": 431783936.0, + "35": 431783936.0, + "36": 431783936.0, + "37": 431783936.0, + "38": 431783936.0, + "39": 431783936.0, + "40": 431783936.0, + "41": 431783936.0, + "42": 431783936.0, + "43": 431783936.0, + "44": 431783936.0, + "45": 431783936.0, + "46": 431783936.0, + "47": 431783936.0, + "48": 431783936.0, + "49": 431783936.0, + "50": 431783936.0, + "51": 431783936.0, + "52": 431783936.0, + "53": 431783936.0, + "54": 431783936.0, + "55": 431783936.0, + "56": 431783936.0, + "57": 431783936.0, + "58": 431783936.0, + "59": 431783936.0, + "60": 431783936.0, + "61": 431783936.0, + "62": 431783936.0, + "63": 431783936.0, + "64": 431783936.0, + "65": 431783936.0, + "66": 431783936.0, + "67": 431783936.0, + "68": 431783936.0, + "69": 431783936.0, + "70": 431783936.0, + "71": 431783936.0, + "72": 431783936.0, + "73": 431783936.0, + "74": 431783936.0, + "75": 431783936.0, + "76": 431783936.0, + "77": 431783936.0, + "78": 431783936.0, + "79": 431783936.0, + "80": 431783936.0, + "81": 431783936.0, + "82": 431783936.0, + "83": 431783936.0, + "84": 431783936.0, + "85": 431783936.0, + "86": 431783936.0, + "87": 431783936.0, + "88": 431783936.0, + "89": 431783936.0, + "90": 431783936.0, + "91": 431783936.0, + "92": 431783936.0, + "93": 431783936.0, + "94": 431783936.0, + "95": 431783936.0, + "96": 431783936.0, + "97": 431783936.0, + "98": 431783936.0, + "99": 431783936.0, + "100": 431783936.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 678382080.0, + "2": 855308800.0, + "3": 855308800.0, + "4": 855308800.0, + "5": 855308800.0, + "6": 855308800.0, + "7": 855308800.0, + "8": 855308800.0, + "9": 855308800.0, + "10": 855308800.0, + "11": 855308800.0, + "12": 855308800.0, + "13": 855308800.0, + "14": 855308800.0, + "15": 855308800.0, + "16": 855310848.0, + "17": 855310848.0, + "18": 855310848.0, + "19": 855310848.0, + "20": 855310848.0, + "21": 855310848.0, + "22": 855310848.0, + "23": 855310848.0, + "24": 855310848.0, + "25": 855310848.0, + "26": 855310848.0, + "27": 855310848.0, + "28": 855310848.0, + "29": 855310848.0, + "30": 855310848.0, + "31": 855311360.0, + "32": 855311360.0, + "33": 855311360.0, + "34": 855311360.0, + "35": 855311360.0, + "36": 855311360.0, + "37": 855311360.0, + "38": 855311360.0, + "39": 855311360.0, + "40": 855311360.0, + "41": 855311360.0, + "42": 855311360.0, + "43": 855311360.0, + "44": 855311360.0, + "45": 855311360.0, + "46": 855311360.0, + "47": 855311360.0, + "48": 855311360.0, + "49": 855311360.0, + "50": 855311360.0, + "51": 855311360.0, + "52": 855311360.0, + "53": 855311360.0, + "54": 855311360.0, + "55": 855311360.0, + "56": 855311360.0, + "57": 855311360.0, + "58": 855311360.0, + "59": 855311360.0, + "60": 855311360.0, + "61": 855311360.0, + "62": 855311360.0, + "63": 855311360.0, + "64": 855311360.0, + "65": 855311360.0, + "66": 855311360.0, + "67": 855311360.0, + "68": 855311360.0, + "69": 855311360.0, + "70": 855311360.0, + "71": 855311360.0, + "72": 855311360.0, + "73": 855311360.0, + "74": 855311360.0, + "75": 855311360.0, + "76": 855311360.0, + "77": 855311360.0, + "78": 855311360.0, + "79": 855311360.0, + "80": 855311360.0, + "81": 855311360.0, + "82": 855311360.0, + "83": 855311360.0, + "84": 855311360.0, + "85": 855311360.0, + "86": 855311360.0, + "87": 855311360.0, + "88": 855311360.0, + "89": 855311360.0, + "90": 855311360.0, + "91": 855311360.0, + "92": 855311360.0, + "93": 855311360.0, + "94": 855311360.0, + "95": 855311360.0, + "96": 855311360.0, + "97": 855311360.0, + "98": 855311360.0, + "99": 855311360.0, + "100": 855311360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.34843, + "2": 0.4496, + "3": 0.40575, + "4": 0.41925, + "5": 0.74795, + "6": 0.41468, + "7": 0.4068, + "8": 0.41689, + "9": 0.41436, + "10": 0.40801, + "11": 0.4195, + "12": 0.40914, + "13": 0.42647, + "14": 0.40668, + "15": 0.41793, + "16": 0.41417, + "17": 0.40751, + "18": 0.42901, + "19": 0.41369, + "20": 0.41147, + "21": 0.41666, + "22": 0.4069, + "23": 0.41601, + "24": 0.40503, + "25": 0.41667, + "26": 0.40986, + "27": 0.4062, + "28": 0.41374, + "29": 0.40694, + "30": 0.42156, + "31": 0.4086, + "32": 0.4087, + "33": 0.42034, + "34": 0.40632, + "35": 0.42126, + "36": 0.4059, + "37": 0.41875, + "38": 0.41448, + "39": 0.40473, + "40": 0.4248, + "41": 0.40265, + "42": 0.41245, + "43": 0.41222, + "44": 0.40565, + "45": 0.42043, + "46": 0.40713, + "47": 0.41725, + "48": 0.41199, + "49": 0.41368, + "50": 0.41468, + "51": 0.40417, + "52": 0.40097, + "53": 0.39853, + "54": 0.40708, + "55": 0.39518, + "56": 0.3992, + "57": 0.39785, + "58": 0.39681, + "59": 0.4057, + "60": 0.39395, + "61": 0.39896, + "62": 0.40375, + "63": 0.3954, + "64": 0.40498, + "65": 0.39366, + "66": 0.39924, + "67": 0.40424, + "68": 0.39447, + "69": 0.40703, + "70": 0.39461, + "71": 0.39881, + "72": 0.40382, + "73": 0.39319, + "74": 0.40889, + "75": 0.39321, + "76": 0.39854, + "77": 0.40156, + "78": 0.39432, + "79": 0.40811, + "80": 0.39353, + "81": 0.39894, + "82": 0.4043, + "83": 0.39208, + "84": 0.44003, + "85": 0.39225, + "86": 0.40107, + "87": 0.40581, + "88": 0.39601, + "89": 0.41177, + "90": 0.39396, + "91": 0.40039, + "92": 0.40383, + "93": 0.39686, + "94": 0.40986, + "95": 0.39506, + "96": 0.40327, + "97": 0.40327, + "98": 0.39659, + "99": 0.40763, + "100": 0.39858 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 588420ea5a1..2c78cced2a6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, "50": 9.90883, + "51": 9.85585, + "52": 9.75243, + "53": 10.07588, + "54": 9.95691, "55": 9.88207, + "56": 9.63139, + "57": 9.48649, + "58": 9.83116, + "59": 9.58907, "60": 9.50648, + "61": 9.70368, + "62": 9.98289, + "63": 9.38314, + "64": 9.7791, "65": 8.95182, + "66": 9.70161, + "67": 9.37209, + "68": 9.78856, + "69": 9.79856, "70": 9.74748, + "71": 9.6191, + "72": 9.585, + "73": 9.49728, + "74": 8.93928, "75": 9.42702, + "76": 9.08022, + "77": 10.06569, + "78": 9.72897, + "79": 9.37772, "80": 9.41001, + "81": 9.47977, + "82": 9.70183, + "83": 9.30621, + "84": 9.42098, "85": 9.61377, + "86": 9.07654, + "87": 9.59456, + "88": 9.75071, + "89": 9.60243, "90": 9.81899, + "91": 9.33898, + "92": 9.35718, + "93": 9.07884, + "94": 8.83509, "95": 9.52175, + "96": 9.53007, + "97": 9.31309, + "98": 9.67781, + "99": 8.89061, "100": 9.39729 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, "50": 2407.0, + "51": 2578.0, + "52": 2630.0, + "53": 2857.0, + "54": 2818.0, "55": 2368.0, + "56": 2757.0, + "57": 2423.0, + "58": 2776.0, + "59": 2742.0, "60": 2371.0, + "61": 2906.0, + "62": 2517.0, + "63": 2374.0, + "64": 2995.0, "65": 2634.0, + "66": 2995.0, + "67": 2884.0, + "68": 2840.0, + "69": 2766.0, "70": 3006.0, + "71": 3023.0, + "72": 2386.0, + "73": 2958.0, + "74": 1851.0, "75": 2585.0, + "76": 2973.0, + "77": 3244.0, + "78": 3142.0, + "79": 3185.0, "80": 3249.0, + "81": 3665.0, + "82": 3153.0, + "83": 2821.0, + "84": 3083.0, "85": 3247.0, + "86": 2734.0, + "87": 3759.0, + "88": 2968.0, + "89": 3282.0, "90": 3064.0, + "91": 2908.0, + "92": 2946.0, + "93": 2592.0, + "94": 3363.0, "95": 3423.0, + "96": 3259.0, + "97": 2976.0, + "98": 3683.0, + "99": 3173.0, "100": 3143.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, "50": 516194816.0, + "51": 516194816.0, + "52": 516194816.0, + "53": 516194816.0, + "54": 516194816.0, "55": 516194816.0, + "56": 516194816.0, + "57": 516194816.0, + "58": 516194816.0, + "59": 516194816.0, "60": 516194816.0, + "61": 516194816.0, + "62": 516194816.0, + "63": 516194816.0, + "64": 516194816.0, "65": 516194816.0, + "66": 516194816.0, + "67": 516194816.0, + "68": 516194816.0, + "69": 516194816.0, "70": 516194816.0, + "71": 516194816.0, + "72": 516194816.0, + "73": 516194816.0, + "74": 516194816.0, "75": 516194816.0, + "76": 516194816.0, + "77": 516194816.0, + "78": 516194816.0, + "79": 516194816.0, "80": 516194816.0, + "81": 516194816.0, + "82": 516194816.0, + "83": 516194816.0, + "84": 516194816.0, "85": 516194816.0, + "86": 516194816.0, + "87": 516194816.0, + "88": 516194816.0, + "89": 516194816.0, "90": 516194816.0, + "91": 516194816.0, + "92": 516194816.0, + "93": 516194816.0, + "94": 516194816.0, "95": 516194816.0, + "96": 516194816.0, + "97": 516194816.0, + "98": 516194816.0, + "99": 516194816.0, "100": 516194816.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1670130688.0, + "2": 1840523776.0, + "3": 1840523776.0, + "4": 1840523776.0, "5": 1840523776.0, + "6": 1840523776.0, + "7": 1840523776.0, + "8": 1840523776.0, + "9": 1840523776.0, "10": 1840523776.0, + "11": 1840523776.0, + "12": 1840523776.0, + "13": 1840523776.0, + "14": 1840523776.0, "15": 1840523776.0, - "20": 1841310208.0, - "25": 1841310208.0, - "30": 1841310208.0, - "35": 1841310208.0, - "40": 1841310208.0, - "45": 1841310208.0, - "50": 1841310208.0, - "55": 1841310208.0, - "60": 1841310208.0, - "65": 1841310208.0, - "70": 1841310208.0, - "75": 1841310208.0, - "80": 1841310208.0, - "85": 1841310208.0, - "90": 1841310208.0, - "95": 1841310208.0, - "100": 1841310208.0 + "16": 1840523776.0, + "17": 1840523776.0, + "18": 1840523776.0, + "19": 1840523776.0, + "20": 1840523776.0, + "21": 1840523776.0, + "22": 1840523776.0, + "23": 1840523776.0, + "24": 1840523776.0, + "25": 1840523776.0, + "26": 1840523776.0, + "27": 1840523776.0, + "28": 1840523776.0, + "29": 1840523776.0, + "30": 1840523776.0, + "31": 1840523776.0, + "32": 1840523776.0, + "33": 1840523776.0, + "34": 1840523776.0, + "35": 1840523776.0, + "36": 1840523776.0, + "37": 1840523776.0, + "38": 1840523776.0, + "39": 1840523776.0, + "40": 1840523776.0, + "41": 1840523776.0, + "42": 1840523776.0, + "43": 1840523776.0, + "44": 1840523776.0, + "45": 1840523776.0, + "46": 1840523776.0, + "47": 1840523776.0, + "48": 1840523776.0, + "49": 1840523776.0, + "50": 1840523776.0, + "51": 1840523776.0, + "52": 1840523776.0, + "53": 1840523776.0, + "54": 1840523776.0, + "55": 1840523776.0, + "56": 1840523776.0, + "57": 1840523776.0, + "58": 1840523776.0, + "59": 1840523776.0, + "60": 1840523776.0, + "61": 1840523776.0, + "62": 1840523776.0, + "63": 1840523776.0, + "64": 1840523776.0, + "65": 1840523776.0, + "66": 1840523776.0, + "67": 1840523776.0, + "68": 1840523776.0, + "69": 1840523776.0, + "70": 1840523776.0, + "71": 1840523776.0, + "72": 1840523776.0, + "73": 1840523776.0, + "74": 1840523776.0, + "75": 1840523776.0, + "76": 1840523776.0, + "77": 1840523776.0, + "78": 1840523776.0, + "79": 1840523776.0, + "80": 1840523776.0, + "81": 1840523776.0, + "82": 1840523776.0, + "83": 1840523776.0, + "84": 1840523776.0, + "85": 1840523776.0, + "86": 1840523776.0, + "87": 1840523776.0, + "88": 1840523776.0, + "89": 1840523776.0, + "90": 1840523776.0, + "91": 1840523776.0, + "92": 1840523776.0, + "93": 1840523776.0, + "94": 1840523776.0, + "95": 1840523776.0, + "96": 1840523776.0, + "97": 1840523776.0, + "98": 1840523776.0, + "99": 1840523776.0, + "100": 1840523776.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.69041, - "5": 0.12029, - "10": 0.12392, - "15": 0.12795, - "20": 0.12945, - "25": 0.11653, - "30": 0.11758, - "35": 0.12012, - "40": 0.11726, - "45": 0.11921, - "50": 0.12046, - "55": 0.11872, - "60": 0.11663, - "65": 0.11858, - "70": 0.11801, - "75": 0.11679, - "80": 0.11617, - "85": 0.11789, - "90": 0.11709, - "95": 0.11779, - "100": 0.11872 + "1": 15.10612, + "2": 0.1542, + "3": 0.13803, + "4": 0.14173, + "5": 0.13703, + "6": 0.13715, + "7": 0.13669, + "8": 0.13634, + "9": 0.13883, + "10": 0.13804, + "11": 0.13759, + "12": 0.1376, + "13": 0.1382, + "14": 0.13696, + "15": 0.13434, + "16": 0.13528, + "17": 0.13745, + "18": 0.13625, + "19": 0.13968, + "20": 0.13682, + "21": 0.13596, + "22": 0.13719, + "23": 0.13667, + "24": 0.13638, + "25": 0.13753, + "26": 0.13644, + "27": 0.13707, + "28": 0.13952, + "29": 0.1369, + "30": 0.13707, + "31": 0.13675, + "32": 0.13583, + "33": 0.1367, + "34": 0.13775, + "35": 0.13604, + "36": 0.13754, + "37": 0.13616, + "38": 0.13653, + "39": 0.13703, + "40": 0.13711, + "41": 0.13929, + "42": 0.1367, + "43": 0.13765, + "44": 0.1376, + "45": 0.13629, + "46": 0.13767, + "47": 0.13691, + "48": 0.13819, + "49": 0.13713, + "50": 0.13764, + "51": 0.14385, + "52": 0.13731, + "53": 0.13926, + "54": 0.13909, + "55": 0.13708, + "56": 0.13606, + "57": 0.1385, + "58": 0.13816, + "59": 0.13715, + "60": 0.13837, + "61": 0.13836, + "62": 0.13899, + "63": 0.13766, + "64": 0.13809, + "65": 0.1396, + "66": 0.13817, + "67": 0.13774, + "68": 0.13776, + "69": 0.13995, + "70": 0.14012, + "71": 0.13829, + "72": 0.14013, + "73": 0.13752, + "74": 0.13771, + "75": 0.13835, + "76": 0.13975, + "77": 0.13762, + "78": 0.13969, + "79": 0.14152, + "80": 0.13795, + "81": 0.13719, + "82": 0.13686, + "83": 0.13959, + "84": 0.13635, + "85": 0.13911, + "86": 0.13853, + "87": 0.13756, + "88": 0.13795, + "89": 0.13781, + "90": 0.13889, + "91": 0.1373, + "92": 0.14159, + "93": 0.13719, + "94": 0.13599, + "95": 0.13739, + "96": 0.13865, + "97": 0.13776, + "98": 0.14044, + "99": 0.13747, + "100": 0.13826 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..bb22d5373cc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, + "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, + "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, + "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, + "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, + "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, + "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, + "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, + "50": 9.90883, + "51": 9.85585, + "52": 9.75243, + "53": 10.07588, + "54": 9.95691, + "55": 9.88207, + "56": 9.63139, + "57": 9.48649, + "58": 9.83116, + "59": 9.58907, + "60": 9.50648, + "61": 9.70368, + "62": 9.98289, + "63": 9.38314, + "64": 9.7791, + "65": 8.95182, + "66": 9.70161, + "67": 9.37209, + "68": 9.78856, + "69": 9.79856, + "70": 9.74748, + "71": 9.6191, + "72": 9.585, + "73": 9.49728, + "74": 8.93928, + "75": 9.42702, + "76": 9.08022, + "77": 10.06569, + "78": 9.72897, + "79": 9.37772, + "80": 9.41001, + "81": 9.47977, + "82": 9.70183, + "83": 9.30621, + "84": 9.42098, + "85": 9.61377, + "86": 9.07654, + "87": 9.59456, + "88": 9.75071, + "89": 9.60243, + "90": 9.81899, + "91": 9.33898, + "92": 9.35718, + "93": 9.07884, + "94": 8.83509, + "95": 9.52175, + "96": 9.53007, + "97": 9.31309, + "98": 9.67781, + "99": 8.89061, + "100": 9.39729 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, + "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, + "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, + "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, + "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, + "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, + "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, + "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, + "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, + "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, + "50": 2407.0, + "51": 2578.0, + "52": 2630.0, + "53": 2857.0, + "54": 2818.0, + "55": 2368.0, + "56": 2757.0, + "57": 2423.0, + "58": 2776.0, + "59": 2742.0, + "60": 2371.0, + "61": 2906.0, + "62": 2517.0, + "63": 2374.0, + "64": 2995.0, + "65": 2634.0, + "66": 2995.0, + "67": 2884.0, + "68": 2840.0, + "69": 2766.0, + "70": 3006.0, + "71": 3023.0, + "72": 2386.0, + "73": 2958.0, + "74": 1851.0, + "75": 2585.0, + "76": 2973.0, + "77": 3244.0, + "78": 3142.0, + "79": 3185.0, + "80": 3249.0, + "81": 3665.0, + "82": 3153.0, + "83": 2821.0, + "84": 3083.0, + "85": 3247.0, + "86": 2734.0, + "87": 3759.0, + "88": 2968.0, + "89": 3282.0, + "90": 3064.0, + "91": 2908.0, + "92": 2946.0, + "93": 2592.0, + "94": 3363.0, + "95": 3423.0, + "96": 3259.0, + "97": 2976.0, + "98": 3683.0, + "99": 3173.0, + "100": 3143.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, + "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, + "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, + "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, + "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, + "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, + "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, + "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, + "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, + "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, + "50": 516194816.0, + "51": 516194816.0, + "52": 516194816.0, + "53": 516194816.0, + "54": 516194816.0, + "55": 516194816.0, + "56": 516194816.0, + "57": 516194816.0, + "58": 516194816.0, + "59": 516194816.0, + "60": 516194816.0, + "61": 516194816.0, + "62": 516194816.0, + "63": 516194816.0, + "64": 516194816.0, + "65": 516194816.0, + "66": 516194816.0, + "67": 516194816.0, + "68": 516194816.0, + "69": 516194816.0, + "70": 516194816.0, + "71": 516194816.0, + "72": 516194816.0, + "73": 516194816.0, + "74": 516194816.0, + "75": 516194816.0, + "76": 516194816.0, + "77": 516194816.0, + "78": 516194816.0, + "79": 516194816.0, + "80": 516194816.0, + "81": 516194816.0, + "82": 516194816.0, + "83": 516194816.0, + "84": 516194816.0, + "85": 516194816.0, + "86": 516194816.0, + "87": 516194816.0, + "88": 516194816.0, + "89": 516194816.0, + "90": 516194816.0, + "91": 516194816.0, + "92": 516194816.0, + "93": 516194816.0, + "94": 516194816.0, + "95": 516194816.0, + "96": 516194816.0, + "97": 516194816.0, + "98": 516194816.0, + "99": 516194816.0, + "100": 516194816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670130688.0, + "2": 1840523776.0, + "3": 1840523776.0, + "4": 1840523776.0, + "5": 1840523776.0, + "6": 1840523776.0, + "7": 1841310208.0, + "8": 1841310208.0, + "9": 1841310208.0, + "10": 1841310208.0, + "11": 1841310208.0, + "12": 1841310208.0, + "13": 1841310208.0, + "14": 1841310208.0, + "15": 1841310208.0, + "16": 1841310208.0, + "17": 1841310208.0, + "18": 1841310208.0, + "19": 1841310208.0, + "20": 1841310208.0, + "21": 1841310208.0, + "22": 1841310208.0, + "23": 1841310208.0, + "24": 1841310208.0, + "25": 1841310208.0, + "26": 1841310208.0, + "27": 1841310208.0, + "28": 1841310208.0, + "29": 1841310208.0, + "30": 1841310208.0, + "31": 1841310208.0, + "32": 1841310208.0, + "33": 1841310208.0, + "34": 1841310208.0, + "35": 1841310208.0, + "36": 1841310208.0, + "37": 1841310208.0, + "38": 1841310208.0, + "39": 1841310208.0, + "40": 1841310208.0, + "41": 1841310208.0, + "42": 1841310208.0, + "43": 1841310208.0, + "44": 1841310208.0, + "45": 1841310208.0, + "46": 1841310208.0, + "47": 1841310208.0, + "48": 1841310208.0, + "49": 1841310208.0, + "50": 1841310208.0, + "51": 1841310208.0, + "52": 1841310208.0, + "53": 1841310208.0, + "54": 1841310208.0, + "55": 1841310208.0, + "56": 1841310208.0, + "57": 1841310208.0, + "58": 1841310208.0, + "59": 1841310208.0, + "60": 1841310208.0, + "61": 1841310208.0, + "62": 1841310208.0, + "63": 1841310208.0, + "64": 1841310208.0, + "65": 1841310208.0, + "66": 1841310208.0, + "67": 1841310208.0, + "68": 1841310208.0, + "69": 1841310208.0, + "70": 1841310208.0, + "71": 1841310208.0, + "72": 1841310208.0, + "73": 1841310208.0, + "74": 1841310208.0, + "75": 1841310208.0, + "76": 1841310208.0, + "77": 1841310208.0, + "78": 1841310208.0, + "79": 1841310208.0, + "80": 1841310208.0, + "81": 1841310208.0, + "82": 1841310208.0, + "83": 1841310208.0, + "84": 1841310208.0, + "85": 1841310208.0, + "86": 1841310208.0, + "87": 1841310208.0, + "88": 1841310208.0, + "89": 1841310208.0, + "90": 1841310208.0, + "91": 1841310208.0, + "92": 1841310208.0, + "93": 1841310208.0, + "94": 1841310208.0, + "95": 1841310208.0, + "96": 1841310208.0, + "97": 1841310208.0, + "98": 1841310208.0, + "99": 1841310208.0, + "100": 1841310208.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.64403, + "2": 0.16797, + "3": 0.12497, + "4": 0.12885, + "5": 0.12618, + "6": 0.13062, + "7": 0.13213, + "8": 0.12464, + "9": 0.11932, + "10": 0.11974, + "11": 0.11909, + "12": 0.12055, + "13": 0.1201, + "14": 0.12035, + "15": 0.12245, + "16": 0.12189, + "17": 0.12194, + "18": 0.12112, + "19": 0.12294, + "20": 0.12528, + "21": 0.12355, + "22": 0.12627, + "23": 0.13006, + "24": 0.12885, + "25": 0.12289, + "26": 0.12586, + "27": 0.12347, + "28": 0.12378, + "29": 0.12521, + "30": 0.12152, + "31": 0.12233, + "32": 0.12264, + "33": 0.12293, + "34": 0.12188, + "35": 0.12305, + "36": 0.11979, + "37": 0.12011, + "38": 0.12066, + "39": 0.11933, + "40": 0.1218, + "41": 0.1229, + "42": 0.12279, + "43": 0.12218, + "44": 0.12191, + "45": 0.12293, + "46": 0.12168, + "47": 0.12842, + "48": 0.12658, + "49": 0.12505, + "50": 0.12387, + "51": 0.1324, + "52": 0.13379, + "53": 0.1261, + "54": 0.11854, + "55": 0.11853, + "56": 0.11881, + "57": 0.1209, + "58": 0.12111, + "59": 0.11838, + "60": 0.12687, + "61": 0.11751, + "62": 0.11883, + "63": 0.11928, + "64": 0.11974, + "65": 0.11845, + "66": 0.11894, + "67": 0.11846, + "68": 0.11858, + "69": 0.11994, + "70": 0.11764, + "71": 0.12093, + "72": 0.11968, + "73": 0.1186, + "74": 0.11964, + "75": 0.11783, + "76": 0.1194, + "77": 0.11791, + "78": 0.12113, + "79": 0.11779, + "80": 0.11874, + "81": 0.1199, + "82": 0.11927, + "83": 0.1179, + "84": 0.11758, + "85": 0.11656, + "86": 0.11748, + "87": 0.11919, + "88": 0.11702, + "89": 0.11924, + "90": 0.11761, + "91": 0.12024, + "92": 0.12008, + "93": 0.11955, + "94": 0.11864, + "95": 0.11843, + "96": 0.1186, + "97": 0.1208, + "98": 0.11919, + "99": 0.11935, + "100": 0.1196 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..eb0e5f82b03 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86548, + "4": 10.84554, + "5": 10.88344, + "6": 10.89429, + "7": 10.87068, + "8": 10.86983, + "9": 10.86919, + "10": 10.83883, + "11": 10.89435, + "12": 10.8798, + "13": 10.87987, + "14": 10.90317, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83025, + "19": 10.82262, + "20": 10.73192, + "21": 10.7075, + "22": 10.56005, + "23": 10.72406, + "24": 10.61116, + "25": 10.5481, + "26": 10.61334, + "27": 10.6305, + "28": 10.56645, + "29": 10.59672, + "30": 10.37136, + "31": 10.11721, + "32": 10.46127, + "33": 10.45247, + "34": 10.21687, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18842, + "39": 10.41042, + "40": 10.09426, + "41": 10.14711, + "42": 10.21247, + "43": 9.84106, + "44": 9.95919, + "45": 9.84082, + "46": 9.82482, + "47": 10.13882, + "48": 9.85839, + "49": 9.5472, + "50": 9.90883, + "51": 9.85585, + "52": 9.75243, + "53": 10.07588, + "54": 9.95691, + "55": 9.88207, + "56": 9.63139, + "57": 9.48649, + "58": 9.83116, + "59": 9.58907, + "60": 9.50648, + "61": 9.70368, + "62": 9.98289, + "63": 9.38314, + "64": 9.7791, + "65": 8.95182, + "66": 9.70161, + "67": 9.37209, + "68": 9.78856, + "69": 9.79856, + "70": 9.74748, + "71": 9.6191, + "72": 9.585, + "73": 9.49728, + "74": 8.93928, + "75": 9.42702, + "76": 9.08022, + "77": 10.06569, + "78": 9.72897, + "79": 9.37772, + "80": 9.41001, + "81": 9.47977, + "82": 9.70183, + "83": 9.30621, + "84": 9.42098, + "85": 9.61377, + "86": 9.07654, + "87": 9.59456, + "88": 9.75071, + "89": 9.60243, + "90": 9.81899, + "91": 9.33898, + "92": 9.35718, + "93": 9.07884, + "94": 8.83509, + "95": 9.52175, + "96": 9.53007, + "97": 9.31309, + "98": 9.67781, + "99": 8.89061, + "100": 9.39729 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1690.0, + "2": 1776.0, + "3": 1642.0, + "4": 1825.0, + "5": 1809.0, + "6": 1795.0, + "7": 1830.0, + "8": 1626.0, + "9": 1878.0, + "10": 1423.0, + "11": 1868.0, + "12": 1653.0, + "13": 1897.0, + "14": 1783.0, + "15": 1861.0, + "16": 1938.0, + "17": 1825.0, + "18": 1730.0, + "19": 1727.0, + "20": 1735.0, + "21": 1783.0, + "22": 1576.0, + "23": 1949.0, + "24": 1630.0, + "25": 1498.0, + "26": 1649.0, + "27": 1809.0, + "28": 2019.0, + "29": 2009.0, + "30": 1832.0, + "31": 1524.0, + "32": 1943.0, + "33": 2081.0, + "34": 1888.0, + "35": 1935.0, + "36": 1898.0, + "37": 2325.0, + "38": 2070.0, + "39": 2248.0, + "40": 2199.0, + "41": 2264.0, + "42": 2349.0, + "43": 2087.0, + "44": 2107.0, + "45": 2098.0, + "46": 2407.0, + "47": 2456.0, + "48": 2404.0, + "49": 2417.0, + "50": 2407.0, + "51": 2578.0, + "52": 2630.0, + "53": 2857.0, + "54": 2818.0, + "55": 2368.0, + "56": 2757.0, + "57": 2423.0, + "58": 2776.0, + "59": 2742.0, + "60": 2371.0, + "61": 2906.0, + "62": 2517.0, + "63": 2374.0, + "64": 2995.0, + "65": 2634.0, + "66": 2995.0, + "67": 2884.0, + "68": 2840.0, + "69": 2766.0, + "70": 3006.0, + "71": 3023.0, + "72": 2386.0, + "73": 2958.0, + "74": 1851.0, + "75": 2585.0, + "76": 2973.0, + "77": 3244.0, + "78": 3142.0, + "79": 3185.0, + "80": 3249.0, + "81": 3665.0, + "82": 3153.0, + "83": 2821.0, + "84": 3083.0, + "85": 3247.0, + "86": 2734.0, + "87": 3759.0, + "88": 2968.0, + "89": 3282.0, + "90": 3064.0, + "91": 2908.0, + "92": 2946.0, + "93": 2592.0, + "94": 3363.0, + "95": 3423.0, + "96": 3259.0, + "97": 2976.0, + "98": 3683.0, + "99": 3173.0, + "100": 3143.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 516194816.0, + "2": 516194816.0, + "3": 516194816.0, + "4": 516194816.0, + "5": 516194816.0, + "6": 516194816.0, + "7": 516194816.0, + "8": 516194816.0, + "9": 516194816.0, + "10": 516194816.0, + "11": 516194816.0, + "12": 516194816.0, + "13": 516194816.0, + "14": 516194816.0, + "15": 516194816.0, + "16": 516194816.0, + "17": 516194816.0, + "18": 516194816.0, + "19": 516194816.0, + "20": 516194816.0, + "21": 516194816.0, + "22": 516194816.0, + "23": 516194816.0, + "24": 516194816.0, + "25": 516194816.0, + "26": 516194816.0, + "27": 516194816.0, + "28": 516194816.0, + "29": 516194816.0, + "30": 516194816.0, + "31": 516194816.0, + "32": 516194816.0, + "33": 516194816.0, + "34": 516194816.0, + "35": 516194816.0, + "36": 516194816.0, + "37": 516194816.0, + "38": 516194816.0, + "39": 516194816.0, + "40": 516194816.0, + "41": 516194816.0, + "42": 516194816.0, + "43": 516194816.0, + "44": 516194816.0, + "45": 516194816.0, + "46": 516194816.0, + "47": 516194816.0, + "48": 516194816.0, + "49": 516194816.0, + "50": 516194816.0, + "51": 516194816.0, + "52": 516194816.0, + "53": 516194816.0, + "54": 516194816.0, + "55": 516194816.0, + "56": 516194816.0, + "57": 516194816.0, + "58": 516194816.0, + "59": 516194816.0, + "60": 516194816.0, + "61": 516194816.0, + "62": 516194816.0, + "63": 516194816.0, + "64": 516194816.0, + "65": 516194816.0, + "66": 516194816.0, + "67": 516194816.0, + "68": 516194816.0, + "69": 516194816.0, + "70": 516194816.0, + "71": 516194816.0, + "72": 516194816.0, + "73": 516194816.0, + "74": 516194816.0, + "75": 516194816.0, + "76": 516194816.0, + "77": 516194816.0, + "78": 516194816.0, + "79": 516194816.0, + "80": 516194816.0, + "81": 516194816.0, + "82": 516194816.0, + "83": 516194816.0, + "84": 516194816.0, + "85": 516194816.0, + "86": 516194816.0, + "87": 516194816.0, + "88": 516194816.0, + "89": 516194816.0, + "90": 516194816.0, + "91": 516194816.0, + "92": 516194816.0, + "93": 516194816.0, + "94": 516194816.0, + "95": 516194816.0, + "96": 516194816.0, + "97": 516194816.0, + "98": 516194816.0, + "99": 516194816.0, + "100": 516194816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1670130688.0, + "2": 1840523776.0, + "3": 1840523776.0, + "4": 1840523776.0, + "5": 1840523776.0, + "6": 1840523776.0, + "7": 1840523776.0, + "8": 1840523776.0, + "9": 1840523776.0, + "10": 1840523776.0, + "11": 1840523776.0, + "12": 1840523776.0, + "13": 1840523776.0, + "14": 1840523776.0, + "15": 1840523776.0, + "16": 1840523776.0, + "17": 1840523776.0, + "18": 1840523776.0, + "19": 1840523776.0, + "20": 1840523776.0, + "21": 1840523776.0, + "22": 1840523776.0, + "23": 1840523776.0, + "24": 1840523776.0, + "25": 1840523776.0, + "26": 1840523776.0, + "27": 1840523776.0, + "28": 1840523776.0, + "29": 1840523776.0, + "30": 1840523776.0, + "31": 1840523776.0, + "32": 1840523776.0, + "33": 1840523776.0, + "34": 1840523776.0, + "35": 1840523776.0, + "36": 1840523776.0, + "37": 1840523776.0, + "38": 1840523776.0, + "39": 1840523776.0, + "40": 1840523776.0, + "41": 1840523776.0, + "42": 1840523776.0, + "43": 1840523776.0, + "44": 1840523776.0, + "45": 1840523776.0, + "46": 1840523776.0, + "47": 1840523776.0, + "48": 1840523776.0, + "49": 1840523776.0, + "50": 1840523776.0, + "51": 1840523776.0, + "52": 1840523776.0, + "53": 1840523776.0, + "54": 1840523776.0, + "55": 1840523776.0, + "56": 1840523776.0, + "57": 1840523776.0, + "58": 1840523776.0, + "59": 1840523776.0, + "60": 1840523776.0, + "61": 1840523776.0, + "62": 1840523776.0, + "63": 1840523776.0, + "64": 1840523776.0, + "65": 1840523776.0, + "66": 1840523776.0, + "67": 1840523776.0, + "68": 1840523776.0, + "69": 1840523776.0, + "70": 1840523776.0, + "71": 1840523776.0, + "72": 1840523776.0, + "73": 1840523776.0, + "74": 1840523776.0, + "75": 1840523776.0, + "76": 1840523776.0, + "77": 1840523776.0, + "78": 1840523776.0, + "79": 1840523776.0, + "80": 1840523776.0, + "81": 1840523776.0, + "82": 1840523776.0, + "83": 1841310208.0, + "84": 1841310208.0, + "85": 1841310208.0, + "86": 1841310208.0, + "87": 1841310208.0, + "88": 1841310208.0, + "89": 1841310208.0, + "90": 1841310208.0, + "91": 1841310208.0, + "92": 1841310208.0, + "93": 1841310208.0, + "94": 1841310208.0, + "95": 1841310208.0, + "96": 1841310208.0, + "97": 1841310208.0, + "98": 1841310208.0, + "99": 1841310208.0, + "100": 1841310208.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.65402, + "2": 0.15533, + "3": 0.13713, + "4": 0.14193, + "5": 0.13861, + "6": 0.13948, + "7": 0.13637, + "8": 0.13619, + "9": 0.14162, + "10": 0.13725, + "11": 0.13988, + "12": 0.14179, + "13": 0.14346, + "14": 0.14488, + "15": 0.1468, + "16": 0.14288, + "17": 0.13708, + "18": 0.13765, + "19": 0.13957, + "20": 0.13778, + "21": 0.13931, + "22": 0.13758, + "23": 0.13751, + "24": 0.14023, + "25": 0.14508, + "26": 0.15744, + "27": 0.15391, + "28": 0.15519, + "29": 0.14118, + "30": 0.1391, + "31": 0.13604, + "32": 0.1366, + "33": 0.13813, + "34": 0.13786, + "35": 0.13728, + "36": 0.13981, + "37": 0.14024, + "38": 0.13688, + "39": 0.13391, + "40": 0.13738, + "41": 0.14059, + "42": 0.13512, + "43": 0.13775, + "44": 0.13641, + "45": 0.13686, + "46": 0.14053, + "47": 0.13951, + "48": 0.14166, + "49": 0.13555, + "50": 0.13577, + "51": 0.14328, + "52": 0.14201, + "53": 0.13861, + "54": 0.13965, + "55": 0.13807, + "56": 0.14044, + "57": 0.14358, + "58": 0.14042, + "59": 0.13858, + "60": 0.13959, + "61": 0.13788, + "62": 0.14032, + "63": 0.13843, + "64": 0.13942, + "65": 0.13742, + "66": 0.13948, + "67": 0.14263, + "68": 0.13848, + "69": 0.13944, + "70": 0.13874, + "71": 0.14302, + "72": 0.13748, + "73": 0.13837, + "74": 0.13911, + "75": 0.13965, + "76": 0.1466, + "77": 0.14259, + "78": 0.13635, + "79": 0.14025, + "80": 0.14725, + "81": 0.14592, + "82": 0.14832, + "83": 0.14727, + "84": 0.14437, + "85": 0.13721, + "86": 0.14235, + "87": 0.13812, + "88": 0.13937, + "89": 0.1389, + "90": 0.13661, + "91": 0.1432, + "92": 0.1389, + "93": 0.13881, + "94": 0.13803, + "95": 0.13815, + "96": 0.14203, + "97": 0.13816, + "98": 0.13963, + "99": 0.14236, + "100": 0.14371 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..b037a96c895 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91515, + "4": 10.90905, + "5": 10.92721, + "6": 10.93563, + "7": 10.90642, + "8": 10.92122, + "9": 10.91072, + "10": 10.9079, + "11": 10.89281, + "12": 10.92428, + "13": 10.91489, + "14": 10.92146, + "15": 10.88294, + "16": 10.87306, + "17": 10.84064, + "18": 10.87301, + "19": 10.85639, + "20": 10.77595, + "21": 10.74891, + "22": 10.63081, + "23": 10.75618, + "24": 10.65646, + "25": 10.59263, + "26": 10.65434, + "27": 10.64917, + "28": 10.59496, + "29": 10.60943, + "30": 10.39175, + "31": 10.15724, + "32": 10.49108, + "33": 10.47963, + "34": 10.24072, + "35": 10.29699, + "36": 10.24669, + "37": 10.35246, + "38": 10.2048, + "39": 10.40502, + "40": 10.09661, + "41": 10.15196, + "42": 10.22071, + "43": 9.85506, + "44": 9.96164, + "45": 9.84471, + "46": 9.83835, + "47": 10.14005, + "48": 9.85759, + "49": 9.53745, + "50": 9.90943, + "51": 9.84889, + "52": 9.74165, + "53": 10.0634, + "54": 9.94734, + "55": 9.87774, + "56": 9.62734, + "57": 9.47159, + "58": 9.82898, + "59": 9.58277, + "60": 9.49122, + "61": 9.69967, + "62": 9.97993, + "63": 9.37282, + "64": 9.77462, + "65": 8.94257, + "66": 9.69881, + "67": 9.36409, + "68": 9.78788, + "69": 9.78337, + "70": 9.72278, + "71": 9.6081, + "72": 9.5843, + "73": 9.48976, + "74": 8.9486, + "75": 9.41891, + "76": 9.08727, + "77": 10.06346, + "78": 9.72838, + "79": 9.37152, + "80": 9.40057, + "81": 9.47832, + "82": 9.69155, + "83": 9.30737, + "84": 9.41234, + "85": 9.61188, + "86": 9.07586, + "87": 9.59459, + "88": 9.74737, + "89": 9.60679, + "90": 9.81026, + "91": 9.34362, + "92": 9.36488, + "93": 9.07724, + "94": 8.83091, + "95": 9.5172, + "96": 9.52447, + "97": 9.31032, + "98": 9.67872, + "99": 8.88837, + "100": 9.40136 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1652.0, + "2": 1809.0, + "3": 1697.0, + "4": 1687.0, + "5": 1983.0, + "6": 1918.0, + "7": 1852.0, + "8": 1726.0, + "9": 1864.0, + "10": 1445.0, + "11": 1907.0, + "12": 1737.0, + "13": 1917.0, + "14": 1796.0, + "15": 1908.0, + "16": 1761.0, + "17": 1863.0, + "18": 1755.0, + "19": 1793.0, + "20": 1636.0, + "21": 1854.0, + "22": 1706.0, + "23": 1991.0, + "24": 1637.0, + "25": 1729.0, + "26": 1800.0, + "27": 1859.0, + "28": 2032.0, + "29": 2012.0, + "30": 1912.0, + "31": 1529.0, + "32": 1953.0, + "33": 2266.0, + "34": 1934.0, + "35": 1910.0, + "36": 1967.0, + "37": 2323.0, + "38": 2236.0, + "39": 2450.0, + "40": 2184.0, + "41": 2303.0, + "42": 2258.0, + "43": 2025.0, + "44": 2240.0, + "45": 2122.0, + "46": 2252.0, + "47": 2581.0, + "48": 2451.0, + "49": 2292.0, + "50": 2525.0, + "51": 2822.0, + "52": 2570.0, + "53": 2948.0, + "54": 2795.0, + "55": 2407.0, + "56": 2786.0, + "57": 2346.0, + "58": 3115.0, + "59": 2885.0, + "60": 2430.0, + "61": 2926.0, + "62": 2574.0, + "63": 2362.0, + "64": 2948.0, + "65": 2802.0, + "66": 3346.0, + "67": 2744.0, + "68": 2926.0, + "69": 2971.0, + "70": 3278.0, + "71": 2955.0, + "72": 2445.0, + "73": 3156.0, + "74": 1933.0, + "75": 2547.0, + "76": 3025.0, + "77": 3458.0, + "78": 3206.0, + "79": 3240.0, + "80": 3526.0, + "81": 3691.0, + "82": 3454.0, + "83": 2739.0, + "84": 3328.0, + "85": 3300.0, + "86": 2859.0, + "87": 3822.0, + "88": 3130.0, + "89": 3409.0, + "90": 3148.0, + "91": 2760.0, + "92": 3173.0, + "93": 2608.0, + "94": 3428.0, + "95": 3402.0, + "96": 3633.0, + "97": 3222.0, + "98": 3696.0, + "99": 3142.0, + "100": 3351.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 438468608.0, + "2": 438468608.0, + "3": 438468608.0, + "4": 438468608.0, + "5": 438468608.0, + "6": 438468608.0, + "7": 438468608.0, + "8": 438468608.0, + "9": 438468608.0, + "10": 438468608.0, + "11": 438468608.0, + "12": 438468608.0, + "13": 438468608.0, + "14": 438468608.0, + "15": 438468608.0, + "16": 438468608.0, + "17": 438468608.0, + "18": 438468608.0, + "19": 438468608.0, + "20": 438468608.0, + "21": 438468608.0, + "22": 438468608.0, + "23": 438468608.0, + "24": 438468608.0, + "25": 438468608.0, + "26": 438468608.0, + "27": 438468608.0, + "28": 438468608.0, + "29": 438468608.0, + "30": 438468608.0, + "31": 438468608.0, + "32": 438468608.0, + "33": 438468608.0, + "34": 438468608.0, + "35": 438468608.0, + "36": 438468608.0, + "37": 438468608.0, + "38": 438468608.0, + "39": 438468608.0, + "40": 438468608.0, + "41": 438468608.0, + "42": 438468608.0, + "43": 438468608.0, + "44": 438468608.0, + "45": 438468608.0, + "46": 438468608.0, + "47": 438468608.0, + "48": 438468608.0, + "49": 438468608.0, + "50": 438468608.0, + "51": 438468608.0, + "52": 438468608.0, + "53": 438468608.0, + "54": 438468608.0, + "55": 438468608.0, + "56": 438468608.0, + "57": 438468608.0, + "58": 438468608.0, + "59": 438468608.0, + "60": 438468608.0, + "61": 438468608.0, + "62": 438468608.0, + "63": 438468608.0, + "64": 438468608.0, + "65": 438468608.0, + "66": 438468608.0, + "67": 438468608.0, + "68": 438468608.0, + "69": 438468608.0, + "70": 438468608.0, + "71": 438468608.0, + "72": 438468608.0, + "73": 438468608.0, + "74": 438468608.0, + "75": 438468608.0, + "76": 438468608.0, + "77": 438468608.0, + "78": 438468608.0, + "79": 438468608.0, + "80": 438468608.0, + "81": 438468608.0, + "82": 438468608.0, + "83": 438468608.0, + "84": 438468608.0, + "85": 438468608.0, + "86": 438468608.0, + "87": 438468608.0, + "88": 438468608.0, + "89": 438468608.0, + "90": 438468608.0, + "91": 438468608.0, + "92": 438468608.0, + "93": 438468608.0, + "94": 438468608.0, + "95": 438468608.0, + "96": 438468608.0, + "97": 438468608.0, + "98": 438468608.0, + "99": 438468608.0, + "100": 438468608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2658189824.0, + "2": 2658189824.0, + "3": 2658189824.0, + "4": 2658189824.0, + "5": 2658189824.0, + "6": 2658189824.0, + "7": 2658189824.0, + "8": 2658189824.0, + "9": 2658189824.0, + "10": 2658189824.0, + "11": 2658189824.0, + "12": 2658189824.0, + "13": 2658189824.0, + "14": 2658189824.0, + "15": 2658189824.0, + "16": 2658189824.0, + "17": 2658189824.0, + "18": 2658189824.0, + "19": 2658189824.0, + "20": 2658189824.0, + "21": 2658189824.0, + "22": 2658189824.0, + "23": 2658189824.0, + "24": 2658189824.0, + "25": 2658189824.0, + "26": 2658189824.0, + "27": 2658189824.0, + "28": 2658189824.0, + "29": 2658189824.0, + "30": 2658189824.0, + "31": 2658189824.0, + "32": 2658189824.0, + "33": 2658189824.0, + "34": 2658189824.0, + "35": 2658189824.0, + "36": 2658189824.0, + "37": 2658189824.0, + "38": 2658189824.0, + "39": 2658189824.0, + "40": 2658189824.0, + "41": 2658189824.0, + "42": 2658189824.0, + "43": 2658189824.0, + "44": 2658189824.0, + "45": 2658189824.0, + "46": 2658189824.0, + "47": 2658189824.0, + "48": 2658189824.0, + "49": 2658189824.0, + "50": 2658189824.0, + "51": 2658189824.0, + "52": 2658189824.0, + "53": 2658189824.0, + "54": 2658189824.0, + "55": 2658189824.0, + "56": 2658189824.0, + "57": 2658189824.0, + "58": 2658189824.0, + "59": 2658189824.0, + "60": 2658189824.0, + "61": 2658189824.0, + "62": 2658189824.0, + "63": 2658189824.0, + "64": 2658189824.0, + "65": 2658189824.0, + "66": 2658189824.0, + "67": 2658189824.0, + "68": 2658189824.0, + "69": 2658189824.0, + "70": 2658189824.0, + "71": 2658189824.0, + "72": 2658189824.0, + "73": 2658189824.0, + "74": 2658189824.0, + "75": 2658189824.0, + "76": 2658189824.0, + "77": 2658189824.0, + "78": 2658189824.0, + "79": 2658189824.0, + "80": 2658189824.0, + "81": 2658189824.0, + "82": 2658189824.0, + "83": 2658189824.0, + "84": 2658189824.0, + "85": 2658189824.0, + "86": 2658189824.0, + "87": 2658189824.0, + "88": 2658189824.0, + "89": 2658189824.0, + "90": 2658189824.0, + "91": 2658189824.0, + "92": 2658189824.0, + "93": 2658189824.0, + "94": 2658189824.0, + "95": 2658189824.0, + "96": 2658189824.0, + "97": 2658189824.0, + "98": 2658189824.0, + "99": 2658189824.0, + "100": 2658189824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.21334, + "2": 0.23608, + "3": 0.19735, + "4": 0.19252, + "5": 0.19648, + "6": 0.19203, + "7": 0.19219, + "8": 0.18973, + "9": 0.18684, + "10": 0.19159, + "11": 0.18643, + "12": 0.18986, + "13": 0.19025, + "14": 0.19056, + "15": 0.19293, + "16": 0.44796, + "17": 0.19013, + "18": 0.18935, + "19": 0.19012, + "20": 0.19194, + "21": 0.44342, + "22": 0.18909, + "23": 0.19253, + "24": 0.18728, + "25": 0.18638, + "26": 0.18656, + "27": 0.1932, + "28": 0.18998, + "29": 0.18957, + "30": 0.18392, + "31": 0.18385, + "32": 0.18468, + "33": 0.18516, + "34": 0.18864, + "35": 0.18375, + "36": 0.18378, + "37": 0.18966, + "38": 0.18733, + "39": 0.18976, + "40": 0.18909, + "41": 0.18487, + "42": 0.18422, + "43": 0.1846, + "44": 0.18581, + "45": 0.18726, + "46": 0.18439, + "47": 0.1845, + "48": 0.18384, + "49": 0.18422, + "50": 0.18685, + "51": 0.39339, + "52": 0.19487, + "53": 0.19224, + "54": 0.18723, + "55": 0.18809, + "56": 0.18463, + "57": 0.18414, + "58": 0.18472, + "59": 0.18467, + "60": 0.19286, + "61": 0.18645, + "62": 0.18785, + "63": 0.18591, + "64": 0.18644, + "65": 0.1905, + "66": 0.18834, + "67": 0.18595, + "68": 0.1873, + "69": 0.1863, + "70": 0.19033, + "71": 0.19567, + "72": 0.18818, + "73": 0.18498, + "74": 0.18476, + "75": 0.18427, + "76": 0.19433, + "77": 0.18426, + "78": 0.18436, + "79": 0.18486, + "80": 0.18553, + "81": 0.18804, + "82": 0.18885, + "83": 0.18682, + "84": 0.18782, + "85": 0.18674, + "86": 0.18747, + "87": 0.19054, + "88": 0.18731, + "89": 0.18701, + "90": 0.18815, + "91": 0.1867, + "92": 0.19324, + "93": 0.1868, + "94": 0.18625, + "95": 0.18677, + "96": 0.18717, + "97": 0.1888, + "98": 0.19044, + "99": 0.19131, + "100": 0.18423 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..f917c6cc0e4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91514, + "4": 10.90903, + "5": 10.92718, + "6": 10.93557, + "7": 10.90644, + "8": 10.92122, + "9": 10.91072, + "10": 10.90789, + "11": 10.89278, + "12": 10.9243, + "13": 10.91485, + "14": 10.92142, + "15": 10.8829, + "16": 10.87307, + "17": 10.84066, + "18": 10.87298, + "19": 10.85633, + "20": 10.77594, + "21": 10.74895, + "22": 10.63081, + "23": 10.75621, + "24": 10.65644, + "25": 10.59266, + "26": 10.65438, + "27": 10.64909, + "28": 10.59497, + "29": 10.60943, + "30": 10.39176, + "31": 10.15724, + "32": 10.4911, + "33": 10.47963, + "34": 10.24068, + "35": 10.29701, + "36": 10.24669, + "37": 10.35242, + "38": 10.20484, + "39": 10.40506, + "40": 10.09662, + "41": 10.15193, + "42": 10.22066, + "43": 9.85508, + "44": 9.96165, + "45": 9.84471, + "46": 9.83836, + "47": 10.14003, + "48": 9.85764, + "49": 9.53744, + "50": 9.90947, + "51": 9.84892, + "52": 9.74166, + "53": 10.06337, + "54": 9.9473, + "55": 9.87771, + "56": 9.62738, + "57": 9.47161, + "58": 9.82894, + "59": 9.58274, + "60": 9.49123, + "61": 9.69974, + "62": 9.9799, + "63": 9.37281, + "64": 9.77461, + "65": 8.94257, + "66": 9.69883, + "67": 9.36406, + "68": 9.78786, + "69": 9.78336, + "70": 9.72276, + "71": 9.6081, + "72": 9.58428, + "73": 9.48979, + "74": 8.94855, + "75": 9.4189, + "76": 9.08727, + "77": 10.06346, + "78": 9.72838, + "79": 9.37156, + "80": 9.40056, + "81": 9.47827, + "82": 9.69154, + "83": 9.30739, + "84": 9.41237, + "85": 9.61189, + "86": 9.07589, + "87": 9.59464, + "88": 9.74734, + "89": 9.60676, + "90": 9.81027, + "91": 9.3436, + "92": 9.36495, + "93": 9.07727, + "94": 8.83093, + "95": 9.51724, + "96": 9.52445, + "97": 9.31032, + "98": 9.67873, + "99": 8.88838, + "100": 9.40135 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1664.0, + "2": 1830.0, + "3": 1679.0, + "4": 1739.0, + "5": 1978.0, + "6": 1893.0, + "7": 1836.0, + "8": 1709.0, + "9": 1941.0, + "10": 1440.0, + "11": 1916.0, + "12": 1781.0, + "13": 1978.0, + "14": 1867.0, + "15": 1997.0, + "16": 1704.0, + "17": 1822.0, + "18": 1610.0, + "19": 1700.0, + "20": 1580.0, + "21": 1805.0, + "22": 1706.0, + "23": 1968.0, + "24": 1619.0, + "25": 1717.0, + "26": 1852.0, + "27": 1944.0, + "28": 2087.0, + "29": 2009.0, + "30": 1915.0, + "31": 1560.0, + "32": 1963.0, + "33": 2161.0, + "34": 2003.0, + "35": 1941.0, + "36": 1977.0, + "37": 2353.0, + "38": 2193.0, + "39": 2425.0, + "40": 2125.0, + "41": 2239.0, + "42": 2203.0, + "43": 1988.0, + "44": 2154.0, + "45": 2037.0, + "46": 2222.0, + "47": 2644.0, + "48": 2428.0, + "49": 2272.0, + "50": 2482.0, + "51": 2746.0, + "52": 2634.0, + "53": 2927.0, + "54": 2689.0, + "55": 2476.0, + "56": 2694.0, + "57": 2382.0, + "58": 3021.0, + "59": 2806.0, + "60": 2510.0, + "61": 2886.0, + "62": 2639.0, + "63": 2314.0, + "64": 3075.0, + "65": 2677.0, + "66": 3260.0, + "67": 2866.0, + "68": 2797.0, + "69": 2920.0, + "70": 3298.0, + "71": 3074.0, + "72": 2433.0, + "73": 3082.0, + "74": 1986.0, + "75": 2706.0, + "76": 3045.0, + "77": 3450.0, + "78": 3299.0, + "79": 3366.0, + "80": 3348.0, + "81": 3827.0, + "82": 3410.0, + "83": 2855.0, + "84": 3427.0, + "85": 3226.0, + "86": 2724.0, + "87": 3790.0, + "88": 3083.0, + "89": 3503.0, + "90": 3119.0, + "91": 2684.0, + "92": 3159.0, + "93": 2689.0, + "94": 3478.0, + "95": 3464.0, + "96": 3584.0, + "97": 3223.0, + "98": 3723.0, + "99": 3220.0, + "100": 3335.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 438468608.0, + "2": 438468608.0, + "3": 438468608.0, + "4": 438468608.0, + "5": 438468608.0, + "6": 438468608.0, + "7": 438468608.0, + "8": 438468608.0, + "9": 438468608.0, + "10": 438468608.0, + "11": 438468608.0, + "12": 438468608.0, + "13": 438468608.0, + "14": 438468608.0, + "15": 438468608.0, + "16": 438468608.0, + "17": 438468608.0, + "18": 438468608.0, + "19": 438468608.0, + "20": 438468608.0, + "21": 438468608.0, + "22": 438468608.0, + "23": 438468608.0, + "24": 438468608.0, + "25": 438468608.0, + "26": 438468608.0, + "27": 438468608.0, + "28": 438468608.0, + "29": 438468608.0, + "30": 438468608.0, + "31": 438468608.0, + "32": 438468608.0, + "33": 438468608.0, + "34": 438468608.0, + "35": 438468608.0, + "36": 438468608.0, + "37": 438468608.0, + "38": 438468608.0, + "39": 438468608.0, + "40": 438468608.0, + "41": 438468608.0, + "42": 438468608.0, + "43": 438468608.0, + "44": 438468608.0, + "45": 438468608.0, + "46": 438468608.0, + "47": 438468608.0, + "48": 438468608.0, + "49": 438468608.0, + "50": 438468608.0, + "51": 438468608.0, + "52": 438468608.0, + "53": 438468608.0, + "54": 438468608.0, + "55": 438468608.0, + "56": 438468608.0, + "57": 438468608.0, + "58": 438468608.0, + "59": 438468608.0, + "60": 438468608.0, + "61": 438468608.0, + "62": 438468608.0, + "63": 438468608.0, + "64": 438468608.0, + "65": 438468608.0, + "66": 438468608.0, + "67": 438468608.0, + "68": 438468608.0, + "69": 438468608.0, + "70": 438468608.0, + "71": 438468608.0, + "72": 438468608.0, + "73": 438468608.0, + "74": 438468608.0, + "75": 438468608.0, + "76": 438468608.0, + "77": 438468608.0, + "78": 438468608.0, + "79": 438468608.0, + "80": 438468608.0, + "81": 438468608.0, + "82": 438468608.0, + "83": 438468608.0, + "84": 438468608.0, + "85": 438468608.0, + "86": 438468608.0, + "87": 438468608.0, + "88": 438468608.0, + "89": 438468608.0, + "90": 438468608.0, + "91": 438468608.0, + "92": 438468608.0, + "93": 438468608.0, + "94": 438468608.0, + "95": 438468608.0, + "96": 438468608.0, + "97": 438468608.0, + "98": 438468608.0, + "99": 438468608.0, + "100": 438468608.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2658189824.0, + "2": 2658189824.0, + "3": 2658189824.0, + "4": 2658189824.0, + "5": 2658189824.0, + "6": 2658189824.0, + "7": 2658189824.0, + "8": 2658189824.0, + "9": 2658189824.0, + "10": 2658189824.0, + "11": 2658189824.0, + "12": 2658189824.0, + "13": 2658189824.0, + "14": 2658189824.0, + "15": 2658189824.0, + "16": 2658189824.0, + "17": 2658189824.0, + "18": 2658189824.0, + "19": 2658189824.0, + "20": 2658189824.0, + "21": 2658189824.0, + "22": 2658189824.0, + "23": 2658189824.0, + "24": 2658189824.0, + "25": 2658189824.0, + "26": 2658189824.0, + "27": 2658189824.0, + "28": 2658189824.0, + "29": 2658189824.0, + "30": 2658189824.0, + "31": 2658189824.0, + "32": 2658189824.0, + "33": 2658189824.0, + "34": 2658189824.0, + "35": 2658189824.0, + "36": 2658189824.0, + "37": 2658189824.0, + "38": 2658189824.0, + "39": 2658189824.0, + "40": 2658189824.0, + "41": 2658189824.0, + "42": 2658189824.0, + "43": 2658189824.0, + "44": 2658189824.0, + "45": 2658189824.0, + "46": 2658189824.0, + "47": 2658189824.0, + "48": 2658189824.0, + "49": 2658189824.0, + "50": 2658189824.0, + "51": 2658189824.0, + "52": 2658189824.0, + "53": 2658189824.0, + "54": 2658189824.0, + "55": 2658189824.0, + "56": 2658189824.0, + "57": 2658189824.0, + "58": 2658189824.0, + "59": 2658189824.0, + "60": 2658189824.0, + "61": 2658189824.0, + "62": 2658189824.0, + "63": 2658189824.0, + "64": 2658189824.0, + "65": 2658189824.0, + "66": 2658189824.0, + "67": 2658189824.0, + "68": 2658189824.0, + "69": 2658189824.0, + "70": 2658189824.0, + "71": 2658189824.0, + "72": 2658189824.0, + "73": 2658189824.0, + "74": 2658189824.0, + "75": 2658189824.0, + "76": 2658189824.0, + "77": 2658189824.0, + "78": 2658189824.0, + "79": 2658189824.0, + "80": 2658189824.0, + "81": 2658189824.0, + "82": 2658189824.0, + "83": 2658189824.0, + "84": 2658189824.0, + "85": 2658189824.0, + "86": 2658189824.0, + "87": 2658189824.0, + "88": 2658189824.0, + "89": 2658189824.0, + "90": 2658189824.0, + "91": 2658189824.0, + "92": 2658189824.0, + "93": 2658189824.0, + "94": 2658189824.0, + "95": 2658189824.0, + "96": 2658189824.0, + "97": 2658189824.0, + "98": 2658189824.0, + "99": 2658189824.0, + "100": 2658189824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.23895, + "2": 0.20726, + "3": 0.17912, + "4": 0.18256, + "5": 0.18172, + "6": 0.18173, + "7": 0.18211, + "8": 0.18112, + "9": 0.18625, + "10": 0.18006, + "11": 0.18704, + "12": 0.17857, + "13": 0.17784, + "14": 0.18165, + "15": 0.1799, + "16": 0.17752, + "17": 0.17782, + "18": 0.1783, + "19": 0.17747, + "20": 0.18053, + "21": 0.17942, + "22": 0.17652, + "23": 0.17547, + "24": 0.17698, + "25": 0.17802, + "26": 0.17909, + "27": 0.1761, + "28": 0.17568, + "29": 0.17486, + "30": 0.17517, + "31": 0.18013, + "32": 0.18802, + "33": 0.18062, + "34": 0.18393, + "35": 0.18008, + "36": 0.18215, + "37": 0.18359, + "38": 0.18075, + "39": 0.17951, + "40": 0.17932, + "41": 0.18163, + "42": 0.18241, + "43": 0.18319, + "44": 0.18167, + "45": 0.18855, + "46": 0.18203, + "47": 0.17989, + "48": 0.18432, + "49": 0.18049, + "50": 0.18019, + "51": 0.1889, + "52": 0.18448, + "53": 0.18169, + "54": 0.1839, + "55": 0.18232, + "56": 0.18118, + "57": 0.18003, + "58": 0.37898, + "59": 0.18312, + "60": 0.17998, + "61": 0.17977, + "62": 0.18171, + "63": 0.181, + "64": 0.18283, + "65": 0.17995, + "66": 0.18199, + "67": 0.17999, + "68": 0.18052, + "69": 0.17988, + "70": 0.18409, + "71": 0.17919, + "72": 0.1808, + "73": 0.18072, + "74": 0.18009, + "75": 0.18701, + "76": 0.18172, + "77": 0.18079, + "78": 0.18125, + "79": 0.18109, + "80": 0.18217, + "81": 0.18459, + "82": 0.18212, + "83": 0.1828, + "84": 0.18156, + "85": 0.18308, + "86": 0.18586, + "87": 0.18076, + "88": 0.17994, + "89": 0.17997, + "90": 0.17982, + "91": 0.18361, + "92": 0.18438, + "93": 0.17977, + "94": 0.18014, + "95": 0.18079, + "96": 0.18168, + "97": 0.18546, + "98": 0.18181, + "99": 0.18024, + "100": 0.1811 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 953c7c07295..925cc0a5ec5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85949, "5": 10.88346, "10": 10.83886, "15": 10.84052, "20": 10.73193, "25": 10.54813, "30": 10.37137, "35": 10.27172, "40": 10.09425, "45": 9.84079, "50": 9.90875, "55": 9.88203, "60": 9.50643, "65": 8.95166, "70": 9.74737, "75": 9.42703, "80": 9.40982, "85": 9.61371, "90": 9.81898, "95": 9.52172, "100": 9.39725}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1688.0, "5": 1909.0, "10": 1457.0, "15": 1930.0, "20": 1596.0, "25": 1557.0, "30": 1860.0, "35": 1902.0, "40": 2207.0, "45": 2095.0, "50": 2416.0, "55": 2216.0, "60": 2457.0, "65": 2472.0, "70": 3057.0, "75": 2474.0, "80": 3338.0, "85": 3324.0, "90": 3096.0, "95": 3399.0, "100": 3128.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 517505536.0, "5": 517505536.0, "10": 517505536.0, "15": 517505536.0, "20": 517505536.0, "25": 517505536.0, "30": 517505536.0, "35": 517505536.0, "40": 517505536.0, "45": 517505536.0, "50": 517505536.0, "55": 517505536.0, "60": 517505536.0, "65": 517505536.0, "70": 517505536.0, "75": 517505536.0, "80": 517505536.0, "85": 517505536.0, "90": 517505536.0, "95": 517505536.0, "100": 517505536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1246524928.0, "5": 1428695552.0, "10": 1428695552.0, "15": 1428695552.0, "20": 1428695552.0, "25": 1428695552.0, "30": 1428695552.0, "35": 1428695552.0, "40": 1428695552.0, "45": 1428695552.0, "50": 1428695552.0, "55": 1428695552.0, "60": 1428695552.0, "65": 1428695552.0, "70": 1428695552.0, "75": 1428695552.0, "80": 1428695552.0, "85": 1428695552.0, "90": 1428695552.0, "95": 1428695552.0, "100": 1428695552.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.33109, "5": 0.12233, "10": 0.12087, "15": 0.12933, "20": 0.12038, "25": 0.12097, "30": 0.12085, "35": 0.12137, "40": 0.11996, "45": 0.12054, "50": 0.12218, "55": 0.12402, "60": 0.13274, "65": 0.12088, "70": 0.12039, "75": 0.12248, "80": 0.12305, "85": 0.12385, "90": 0.12202, "95": 0.1201, "100": 0.12049}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, + "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, + "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, + "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, + "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, + "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, + "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, + "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, + "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, + "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, + "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, + "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, + "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, + "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, + "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, + "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, + "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, + "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, + "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, + "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, + "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, + "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, + "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, + "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, + "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, + "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, + "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, + "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, + "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, + "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, + "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, + "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, + "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, + "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, + "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, + "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, + "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, + "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, + "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, + "100": 3128.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, + "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, + "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, + "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, + "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, + "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, + "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, + "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, + "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, + "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, + "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, + "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, + "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, + "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, + "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, + "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, + "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, + "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, + "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, + "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, + "100": 517505536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, + "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, + "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, + "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, + "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, + "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, + "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, + "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, + "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, + "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, + "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, + "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, + "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, + "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, + "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, + "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, + "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, + "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, + "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, + "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, + "100": 1428695552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.60342, + "2": 0.19062, + "3": 0.17106, + "4": 0.15064, + "5": 0.15065, + "6": 0.1494, + "7": 0.15215, + "8": 0.14914, + "9": 0.15232, + "10": 0.15441, + "11": 0.15247, + "12": 0.15046, + "13": 0.15058, + "14": 0.15219, + "15": 0.15133, + "16": 0.15023, + "17": 0.1509, + "18": 0.14938, + "19": 0.15103, + "20": 0.1515, + "21": 0.1522, + "22": 0.1489, + "23": 0.15182, + "24": 0.1502, + "25": 0.15153, + "26": 0.15174, + "27": 0.15257, + "28": 0.14921, + "29": 0.14989, + "30": 0.14944, + "31": 0.15201, + "32": 0.1504, + "33": 0.1493, + "34": 0.15189, + "35": 0.14934, + "36": 0.15042, + "37": 0.15128, + "38": 0.15671, + "39": 0.14985, + "40": 0.15139, + "41": 0.15056, + "42": 0.14937, + "43": 0.15027, + "44": 0.15158, + "45": 0.15159, + "46": 0.15106, + "47": 0.14958, + "48": 0.15078, + "49": 0.15171, + "50": 0.15469, + "51": 0.17266, + "52": 0.16844, + "53": 0.16496, + "54": 0.16828, + "55": 0.15512, + "56": 0.15061, + "57": 0.1542, + "58": 0.15315, + "59": 0.15262, + "60": 0.1507, + "61": 0.15164, + "62": 0.15223, + "63": 0.15172, + "64": 0.15124, + "65": 0.15315, + "66": 0.15108, + "67": 0.15238, + "68": 0.1491, + "69": 0.15112, + "70": 0.15218, + "71": 0.15542, + "72": 0.1514, + "73": 0.15306, + "74": 0.14963, + "75": 0.15272, + "76": 0.15, + "77": 0.15284, + "78": 0.15228, + "79": 0.15051, + "80": 0.15149, + "81": 0.15215, + "82": 0.15086, + "83": 0.1515, + "84": 0.15437, + "85": 0.15454, + "86": 0.15197, + "87": 0.15062, + "88": 0.14949, + "89": 0.15096, + "90": 0.15098, + "91": 0.15349, + "92": 0.15219, + "93": 0.15171, + "94": 0.15116, + "95": 0.15081, + "96": 0.15321, + "97": 0.15268, + "98": 0.15451, + "99": 0.1496, + "100": 0.15252 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..9d88acfb6cd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, + "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, + "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, + "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, + "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, + "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, + "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, + "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, + "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, + "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, + "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, + "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, + "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, + "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, + "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, + "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, + "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, + "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, + "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, + "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, + "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, + "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, + "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, + "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, + "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, + "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, + "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, + "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, + "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, + "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, + "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, + "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, + "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, + "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, + "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, + "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, + "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, + "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, + "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, + "100": 3128.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, + "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, + "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, + "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, + "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, + "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, + "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, + "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, + "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, + "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, + "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, + "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, + "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, + "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, + "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, + "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, + "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, + "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, + "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, + "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, + "100": 517505536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, + "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, + "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, + "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, + "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, + "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, + "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, + "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, + "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, + "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, + "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, + "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, + "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, + "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, + "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, + "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, + "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, + "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, + "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, + "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, + "100": 1428695552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.53219, + "2": 0.1684, + "3": 0.13213, + "4": 0.13603, + "5": 0.14526, + "6": 0.13427, + "7": 0.136, + "8": 0.13232, + "9": 0.13802, + "10": 0.13323, + "11": 0.13284, + "12": 0.1324, + "13": 0.13226, + "14": 0.13345, + "15": 0.13404, + "16": 0.13246, + "17": 0.13846, + "18": 0.14976, + "19": 0.15115, + "20": 0.1432, + "21": 0.14309, + "22": 0.14543, + "23": 0.1451, + "24": 0.14454, + "25": 0.14293, + "26": 0.14271, + "27": 0.14031, + "28": 0.13412, + "29": 0.13599, + "30": 0.13491, + "31": 0.13451, + "32": 0.1457, + "33": 0.13899, + "34": 0.14249, + "35": 0.13753, + "36": 0.13178, + "37": 0.13407, + "38": 0.13463, + "39": 0.13305, + "40": 0.13317, + "41": 0.13403, + "42": 0.1337, + "43": 0.13374, + "44": 0.13271, + "45": 0.13351, + "46": 0.1329, + "47": 0.13703, + "48": 0.1336, + "49": 0.13392, + "50": 0.13491, + "51": 0.15864, + "52": 0.14644, + "53": 0.13353, + "54": 0.13586, + "55": 0.1338, + "56": 0.13348, + "57": 0.13862, + "58": 0.13538, + "59": 0.13584, + "60": 0.13637, + "61": 0.1348, + "62": 0.13739, + "63": 0.13414, + "64": 0.13588, + "65": 0.13342, + "66": 0.13248, + "67": 0.13306, + "68": 0.13382, + "69": 0.13258, + "70": 0.1323, + "71": 0.13391, + "72": 0.13175, + "73": 0.13255, + "74": 0.13144, + "75": 0.13133, + "76": 0.13154, + "77": 0.13197, + "78": 0.13181, + "79": 0.13551, + "80": 0.13273, + "81": 0.13213, + "82": 0.13227, + "83": 0.13169, + "84": 0.13255, + "85": 0.13081, + "86": 0.13276, + "87": 0.13515, + "88": 0.13346, + "89": 0.13174, + "90": 0.13117, + "91": 0.13268, + "92": 0.131, + "93": 0.13188, + "94": 0.13089, + "95": 0.13284, + "96": 0.13247, + "97": 0.13153, + "98": 0.13147, + "99": 0.13253, + "100": 0.13209 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..e895f06a28a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, + "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, + "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, + "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, + "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, + "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, + "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, + "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, + "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, + "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, + "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, + "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, + "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, + "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, + "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, + "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, + "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, + "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, + "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, + "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, + "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, + "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, + "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, + "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, + "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, + "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, + "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, + "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, + "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, + "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, + "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, + "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, + "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, + "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, + "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, + "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, + "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, + "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, + "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, + "100": 3128.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, + "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, + "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, + "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, + "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, + "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, + "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, + "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, + "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, + "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, + "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, + "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, + "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, + "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, + "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, + "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, + "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, + "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, + "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, + "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, + "100": 517505536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, + "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, + "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, + "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, + "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, + "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, + "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, + "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, + "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, + "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, + "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, + "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, + "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, + "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, + "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, + "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, + "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, + "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, + "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, + "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, + "100": 1428695552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.77129, + "2": 0.18805, + "3": 0.15486, + "4": 0.15531, + "5": 0.15342, + "6": 0.15402, + "7": 0.15787, + "8": 0.15837, + "9": 0.15422, + "10": 0.1531, + "11": 0.1531, + "12": 0.1521, + "13": 0.15206, + "14": 0.15281, + "15": 0.15025, + "16": 0.15321, + "17": 0.15383, + "18": 0.15265, + "19": 0.15535, + "20": 0.15414, + "21": 0.15275, + "22": 0.152, + "23": 0.15456, + "24": 0.15209, + "25": 0.15358, + "26": 0.15228, + "27": 0.15217, + "28": 0.15204, + "29": 0.1526, + "30": 0.15259, + "31": 0.15237, + "32": 0.15885, + "33": 0.1577, + "34": 0.16029, + "35": 0.15618, + "36": 0.16006, + "37": 0.15686, + "38": 0.15897, + "39": 0.15985, + "40": 0.15818, + "41": 0.15734, + "42": 0.15623, + "43": 0.15982, + "44": 0.15844, + "45": 0.15965, + "46": 0.15995, + "47": 0.1576, + "48": 0.15787, + "49": 0.15857, + "50": 0.16598, + "51": 0.15831, + "52": 0.15281, + "53": 0.15278, + "54": 0.15155, + "55": 0.1544, + "56": 0.15102, + "57": 0.1505, + "58": 0.15177, + "59": 0.15275, + "60": 0.15179, + "61": 0.15138, + "62": 0.153, + "63": 0.14962, + "64": 0.15104, + "65": 0.15104, + "66": 0.1541, + "67": 0.15089, + "68": 0.15178, + "69": 0.15241, + "70": 0.1524, + "71": 0.14991, + "72": 0.15107, + "73": 0.15205, + "74": 0.15105, + "75": 0.14944, + "76": 0.15086, + "77": 0.15066, + "78": 0.15037, + "79": 0.1517, + "80": 0.1535, + "81": 0.15067, + "82": 0.15202, + "83": 0.1513, + "84": 0.15157, + "85": 0.15077, + "86": 0.15249, + "87": 0.15259, + "88": 0.15065, + "89": 0.15236, + "90": 0.15088, + "91": 0.15271, + "92": 0.15124, + "93": 0.15371, + "94": 0.14949, + "95": 0.15169, + "96": 0.15061, + "97": 0.15123, + "98": 0.15143, + "99": 0.15292, + "100": 0.15348 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..798f3341573 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 489193472.0, + "2": 489193472.0, + "3": 489193472.0, + "4": 489193472.0, + "5": 489193472.0, + "6": 489193472.0, + "7": 489193472.0, + "8": 489193472.0, + "9": 489193472.0, + "10": 489193472.0, + "11": 489193472.0, + "12": 489193472.0, + "13": 489193472.0, + "14": 489193472.0, + "15": 489193472.0, + "16": 489193472.0, + "17": 489193472.0, + "18": 489193472.0, + "19": 489193472.0, + "20": 489193472.0, + "21": 489193472.0, + "22": 489193472.0, + "23": 489193472.0, + "24": 489193472.0, + "25": 489193472.0, + "26": 489193472.0, + "27": 489193472.0, + "28": 489193472.0, + "29": 489193472.0, + "30": 489193472.0, + "31": 489193472.0, + "32": 489193472.0, + "33": 489193472.0, + "34": 489193472.0, + "35": 489193472.0, + "36": 489193472.0, + "37": 489193472.0, + "38": 489193472.0, + "39": 489193472.0, + "40": 489193472.0, + "41": 489193472.0, + "42": 489193472.0, + "43": 489193472.0, + "44": 489193472.0, + "45": 489193472.0, + "46": 489193472.0, + "47": 489193472.0, + "48": 489193472.0, + "49": 489193472.0, + "50": 489193472.0, + "51": 489193472.0, + "52": 489193472.0, + "53": 489193472.0, + "54": 489193472.0, + "55": 489193472.0, + "56": 489193472.0, + "57": 489193472.0, + "58": 489193472.0, + "59": 489193472.0, + "60": 489193472.0, + "61": 489193472.0, + "62": 489193472.0, + "63": 489193472.0, + "64": 489193472.0, + "65": 489193472.0, + "66": 489193472.0, + "67": 489193472.0, + "68": 489193472.0, + "69": 489193472.0, + "70": 489193472.0, + "71": 489193472.0, + "72": 489193472.0, + "73": 489193472.0, + "74": 489193472.0, + "75": 489193472.0, + "76": 489193472.0, + "77": 489193472.0, + "78": 489193472.0, + "79": 489193472.0, + "80": 489193472.0, + "81": 489193472.0, + "82": 489193472.0, + "83": 489193472.0, + "84": 489193472.0, + "85": 489193472.0, + "86": 489193472.0, + "87": 489193472.0, + "88": 489193472.0, + "89": 489193472.0, + "90": 489193472.0, + "91": 489193472.0, + "92": 489193472.0, + "93": 489193472.0, + "94": 489193472.0, + "95": 489193472.0, + "96": 489193472.0, + "97": 489193472.0, + "98": 489193472.0, + "99": 489193472.0, + "100": 489193472.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1720084480.0, + "2": 1902255104.0, + "3": 1902255104.0, + "4": 1902255104.0, + "5": 1902255104.0, + "6": 1902255104.0, + "7": 1902255104.0, + "8": 1902255104.0, + "9": 1902255104.0, + "10": 1902255104.0, + "11": 1902255104.0, + "12": 1902255104.0, + "13": 1902255104.0, + "14": 1902255104.0, + "15": 1902255104.0, + "16": 1902255104.0, + "17": 1902255104.0, + "18": 1902255104.0, + "19": 1902255104.0, + "20": 1902255104.0, + "21": 1902255104.0, + "22": 1902255104.0, + "23": 1902255104.0, + "24": 1902255104.0, + "25": 1902255104.0, + "26": 1902255104.0, + "27": 1902255104.0, + "28": 1902255104.0, + "29": 1902255104.0, + "30": 1902255104.0, + "31": 1902255104.0, + "32": 1902255104.0, + "33": 1902255104.0, + "34": 1902255104.0, + "35": 1902255104.0, + "36": 1902255104.0, + "37": 1902255104.0, + "38": 1902255104.0, + "39": 1902255104.0, + "40": 1902255104.0, + "41": 1902255104.0, + "42": 1902255104.0, + "43": 1902255104.0, + "44": 1902255104.0, + "45": 1902255104.0, + "46": 1902255104.0, + "47": 1902255104.0, + "48": 1902255104.0, + "49": 1902255104.0, + "50": 1902255104.0, + "51": 1902255104.0, + "52": 1902255104.0, + "53": 1902255104.0, + "54": 1902255104.0, + "55": 1902255104.0, + "56": 1902255104.0, + "57": 1902255104.0, + "58": 1902255104.0, + "59": 1902255104.0, + "60": 1902255104.0, + "61": 1902255104.0, + "62": 1902255104.0, + "63": 1902255104.0, + "64": 1902255104.0, + "65": 1902255104.0, + "66": 1902255104.0, + "67": 1902255104.0, + "68": 1902910464.0, + "69": 1902910464.0, + "70": 1902910464.0, + "71": 1902910464.0, + "72": 1902910464.0, + "73": 1902910464.0, + "74": 1902910464.0, + "75": 1902910464.0, + "76": 1902910464.0, + "77": 1902910464.0, + "78": 1902910464.0, + "79": 1902910464.0, + "80": 1902910464.0, + "81": 1902910464.0, + "82": 1902910464.0, + "83": 1902910464.0, + "84": 1902910464.0, + "85": 1902910464.0, + "86": 1902910464.0, + "87": 1902910464.0, + "88": 1902910464.0, + "89": 1902910464.0, + "90": 1902910464.0, + "91": 1902910464.0, + "92": 1902910464.0, + "93": 1902910464.0, + "94": 1902910464.0, + "95": 1902910464.0, + "96": 1902910464.0, + "97": 1902910464.0, + "98": 1902910464.0, + "99": 1902910464.0, + "100": 1902910464.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.34333, + "2": 0.56623, + "3": 0.22775, + "4": 0.22931, + "5": 0.22667, + "6": 0.22758, + "7": 0.23105, + "8": 0.22555, + "9": 0.22541, + "10": 0.22533, + "11": 0.26995, + "12": 0.22791, + "13": 0.22744, + "14": 0.2254, + "15": 0.22691, + "16": 0.22536, + "17": 0.22399, + "18": 0.224, + "19": 0.22435, + "20": 0.22788, + "21": 0.22441, + "22": 0.2236, + "23": 0.22313, + "24": 0.22481, + "25": 0.22503, + "26": 0.22356, + "27": 0.22387, + "28": 0.22422, + "29": 0.22896, + "30": 0.22362, + "31": 0.22424, + "32": 0.22361, + "33": 0.2255, + "34": 0.22376, + "35": 0.2227, + "36": 0.22202, + "37": 0.22249, + "38": 0.22911, + "39": 0.22157, + "40": 0.22231, + "41": 0.22166, + "42": 0.22525, + "43": 0.2221, + "44": 0.22185, + "45": 0.22126, + "46": 0.22185, + "47": 0.2264, + "48": 0.22191, + "49": 0.2212, + "50": 0.22178, + "51": 0.23228, + "52": 0.22482, + "53": 0.22431, + "54": 0.22641, + "55": 0.22437, + "56": 0.22665, + "57": 0.22617, + "58": 0.2284, + "59": 0.22644, + "60": 0.22523, + "61": 0.22532, + "62": 0.2282, + "63": 0.22526, + "64": 0.22535, + "65": 0.22523, + "66": 0.22567, + "67": 0.22948, + "68": 0.22527, + "69": 0.22591, + "70": 0.22514, + "71": 0.2281, + "72": 0.22718, + "73": 0.22617, + "74": 0.22559, + "75": 0.22567, + "76": 0.22848, + "77": 0.22459, + "78": 0.22571, + "79": 0.22534, + "80": 0.22962, + "81": 0.2301, + "82": 0.22809, + "83": 0.2285, + "84": 0.22921, + "85": 0.2309, + "86": 0.22744, + "87": 0.22777, + "88": 0.22831, + "89": 0.23199, + "90": 0.22761, + "91": 0.22896, + "92": 0.22814, + "93": 0.23065, + "94": 0.22829, + "95": 0.22767, + "96": 0.22866, + "97": 0.22828, + "98": 0.23227, + "99": 0.22772, + "100": 0.2283 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..1bd58f46aa2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 487096320.0, + "2": 487096320.0, + "3": 487096320.0, + "4": 487096320.0, + "5": 487096320.0, + "6": 487096320.0, + "7": 487096320.0, + "8": 487096320.0, + "9": 487096320.0, + "10": 487096320.0, + "11": 487096320.0, + "12": 487096320.0, + "13": 487096320.0, + "14": 487096320.0, + "15": 487096320.0, + "16": 487096320.0, + "17": 487096320.0, + "18": 487096320.0, + "19": 487096320.0, + "20": 487096320.0, + "21": 487096320.0, + "22": 487096320.0, + "23": 487096320.0, + "24": 487096320.0, + "25": 487096320.0, + "26": 487096320.0, + "27": 487096320.0, + "28": 487096320.0, + "29": 487096320.0, + "30": 487096320.0, + "31": 487096320.0, + "32": 487096320.0, + "33": 487096320.0, + "34": 487096320.0, + "35": 487096320.0, + "36": 487096320.0, + "37": 487096320.0, + "38": 487096320.0, + "39": 487096320.0, + "40": 487096320.0, + "41": 487096320.0, + "42": 487096320.0, + "43": 487096320.0, + "44": 487096320.0, + "45": 487096320.0, + "46": 487096320.0, + "47": 487096320.0, + "48": 487096320.0, + "49": 487096320.0, + "50": 487096320.0, + "51": 487096320.0, + "52": 487096320.0, + "53": 487096320.0, + "54": 487096320.0, + "55": 487096320.0, + "56": 487096320.0, + "57": 487096320.0, + "58": 487096320.0, + "59": 487096320.0, + "60": 487096320.0, + "61": 487096320.0, + "62": 487096320.0, + "63": 487096320.0, + "64": 487096320.0, + "65": 487096320.0, + "66": 487096320.0, + "67": 487096320.0, + "68": 487096320.0, + "69": 487096320.0, + "70": 487096320.0, + "71": 487096320.0, + "72": 487096320.0, + "73": 487096320.0, + "74": 487096320.0, + "75": 487096320.0, + "76": 487096320.0, + "77": 487096320.0, + "78": 487096320.0, + "79": 487096320.0, + "80": 487096320.0, + "81": 487096320.0, + "82": 487096320.0, + "83": 487096320.0, + "84": 487096320.0, + "85": 487096320.0, + "86": 487096320.0, + "87": 487096320.0, + "88": 487096320.0, + "89": 487096320.0, + "90": 487096320.0, + "91": 487096320.0, + "92": 487096320.0, + "93": 487096320.0, + "94": 487096320.0, + "95": 487096320.0, + "96": 487096320.0, + "97": 487096320.0, + "98": 487096320.0, + "99": 487096320.0, + "100": 487096320.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1720084480.0, + "2": 1900157952.0, + "3": 1901074432.0, + "4": 1901074432.0, + "5": 1901074432.0, + "6": 1901074432.0, + "7": 1901074432.0, + "8": 1901074432.0, + "9": 1901074432.0, + "10": 1901074432.0, + "11": 1901074432.0, + "12": 1901074432.0, + "13": 1901074432.0, + "14": 1901074432.0, + "15": 1901074432.0, + "16": 1901074432.0, + "17": 1901074432.0, + "18": 1901074432.0, + "19": 1901074432.0, + "20": 1901074432.0, + "21": 1901074432.0, + "22": 1901074432.0, + "23": 1901074432.0, + "24": 1901074432.0, + "25": 1901074432.0, + "26": 1901074432.0, + "27": 1901074432.0, + "28": 1901074432.0, + "29": 1901074432.0, + "30": 1901074432.0, + "31": 1901074432.0, + "32": 1901074432.0, + "33": 1901074432.0, + "34": 1901074432.0, + "35": 1901074432.0, + "36": 1901074432.0, + "37": 1901074432.0, + "38": 1901074432.0, + "39": 1901074432.0, + "40": 1901074432.0, + "41": 1901074432.0, + "42": 1901074432.0, + "43": 1901074432.0, + "44": 1901074432.0, + "45": 1901074432.0, + "46": 1901074432.0, + "47": 1901074432.0, + "48": 1901074432.0, + "49": 1901074432.0, + "50": 1901074432.0, + "51": 1901074432.0, + "52": 1901074432.0, + "53": 1901074432.0, + "54": 1901074432.0, + "55": 1901074432.0, + "56": 1901074432.0, + "57": 1901074432.0, + "58": 1901074432.0, + "59": 1901074432.0, + "60": 1901074432.0, + "61": 1901074432.0, + "62": 1901074432.0, + "63": 1901074432.0, + "64": 1901074432.0, + "65": 1901074432.0, + "66": 1901074432.0, + "67": 1901074432.0, + "68": 1901074432.0, + "69": 1901074432.0, + "70": 1901074432.0, + "71": 1901074432.0, + "72": 1901074432.0, + "73": 1901074432.0, + "74": 1901074432.0, + "75": 1901074432.0, + "76": 1901074432.0, + "77": 1901074432.0, + "78": 1901074432.0, + "79": 1901074432.0, + "80": 1901074432.0, + "81": 1901074432.0, + "82": 1901074432.0, + "83": 1901074432.0, + "84": 1901074432.0, + "85": 1901074432.0, + "86": 1901074432.0, + "87": 1901074432.0, + "88": 1901074432.0, + "89": 1901074432.0, + "90": 1901074432.0, + "91": 1901074432.0, + "92": 1901074432.0, + "93": 1901074432.0, + "94": 1901074432.0, + "95": 1901074432.0, + "96": 1901074432.0, + "97": 1901075456.0, + "98": 1901075456.0, + "99": 1901075456.0, + "100": 1901075456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.46737, + "2": 0.26476, + "3": 0.23109, + "4": 0.22854, + "5": 0.22879, + "6": 0.2287, + "7": 0.23086, + "8": 0.2297, + "9": 0.23098, + "10": 0.23075, + "11": 0.23448, + "12": 0.22804, + "13": 0.22739, + "14": 0.22761, + "15": 0.23146, + "16": 0.23026, + "17": 0.22798, + "18": 0.22761, + "19": 0.22857, + "20": 0.23372, + "21": 0.22829, + "22": 0.22692, + "23": 0.22737, + "24": 0.2331, + "25": 0.22606, + "26": 0.22294, + "27": 0.22159, + "28": 0.22628, + "29": 0.22561, + "30": 0.22244, + "31": 0.22214, + "32": 0.22237, + "33": 0.22509, + "34": 0.2221, + "35": 0.22109, + "36": 0.22181, + "37": 0.22344, + "38": 0.22457, + "39": 0.22467, + "40": 0.22286, + "41": 0.22296, + "42": 0.45657, + "43": 0.22367, + "44": 0.22117, + "45": 0.22234, + "46": 0.22174, + "47": 0.21959, + "48": 0.22089, + "49": 0.2205, + "50": 0.22426, + "51": 0.22836, + "52": 0.22291, + "53": 0.22086, + "54": 0.22358, + "55": 0.22346, + "56": 0.22218, + "57": 0.22243, + "58": 0.22521, + "59": 0.22456, + "60": 0.22259, + "61": 0.22057, + "62": 0.22205, + "63": 0.22691, + "64": 0.22417, + "65": 0.22198, + "66": 0.22355, + "67": 0.22656, + "68": 0.22317, + "69": 0.22524, + "70": 0.22257, + "71": 0.22136, + "72": 0.22488, + "73": 0.22888, + "74": 0.22324, + "75": 0.22323, + "76": 0.22142, + "77": 0.22393, + "78": 0.22004, + "79": 0.21926, + "80": 0.22221, + "81": 0.22531, + "82": 0.22283, + "83": 0.22227, + "84": 0.22148, + "85": 0.2249, + "86": 0.22229, + "87": 0.22163, + "88": 0.222, + "89": 0.22492, + "90": 0.23375, + "91": 0.22011, + "92": 0.21919, + "93": 0.2217, + "94": 0.22533, + "95": 0.22265, + "96": 0.22352, + "97": 0.2219, + "98": 0.22608, + "99": 0.23763, + "100": 0.22445 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 41c7d6f3fd5..f5b16bf0710 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85949, "5": 10.88343, "10": 10.83882, "15": 10.84047, "20": 10.73196, "25": 10.54812, "30": 10.37134, "35": 10.27171, "40": 10.09427, "45": 9.84081, "50": 9.90876, "55": 9.882, "60": 9.50647, "65": 8.95171, "70": 9.74738, "75": 9.42706, "80": 9.40987, "85": 9.61376, "90": 9.81895, "95": 9.52168, "100": 9.39725}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 71.0, "5": 55.0, "10": 65.0, "15": 71.0, "20": 61.0, "25": 66.0, "30": 71.0, "35": 69.0, "40": 81.0, "45": 85.0, "50": 80.0, "55": 58.0, "60": 84.0, "65": 81.0, "70": 88.0, "75": 70.0, "80": 90.0, "85": 89.0, "90": 72.0, "95": 70.0, "100": 75.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 520651264.0, "5": 520651264.0, "10": 520651264.0, "15": 520651264.0, "20": 520651264.0, "25": 520651264.0, "30": 520651264.0, "35": 520651264.0, "40": 520651264.0, "45": 520651264.0, "50": 520651264.0, "55": 520651264.0, "60": 520651264.0, "65": 520651264.0, "70": 520651264.0, "75": 520651264.0, "80": 520651264.0, "85": 520651264.0, "90": 520651264.0, "95": 520651264.0, "100": 520651264.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1687975424.0, "5": 1870146048.0, "10": 1870146048.0, "15": 1870146048.0, "20": 1870146048.0, "25": 1870146048.0, "30": 1870146048.0, "35": 1870146048.0, "40": 1870146048.0, "45": 1870146048.0, "50": 1870146048.0, "55": 1870146048.0, "60": 1870146048.0, "65": 1870146048.0, "70": 1870146048.0, "75": 1870146048.0, "80": 1870146048.0, "85": 1870146048.0, "90": 1870146048.0, "95": 1870146048.0, "100": 1870146048.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.28055, "5": 0.12657, "10": 0.12544, "15": 0.13519, "20": 0.12958, "25": 0.12817, "30": 0.1293, "35": 0.12396, "40": 0.1241, "45": 0.12562, "50": 0.1228, "55": 0.127, "60": 0.12853, "65": 0.12708, "70": 0.12816, "75": 0.12308, "80": 0.12181, "85": 0.12079, "90": 0.12388, "95": 0.1228, "100": 0.12387}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86544, + "4": 10.84555, + "5": 10.88343, + "6": 10.89431, + "7": 10.87069, + "8": 10.86982, + "9": 10.8692, + "10": 10.83882, + "11": 10.89437, + "12": 10.8798, + "13": 10.87986, + "14": 10.90316, + "15": 10.84047, + "16": 10.83785, + "17": 10.8067, + "18": 10.83027, + "19": 10.82265, + "20": 10.73196, + "21": 10.70751, + "22": 10.56001, + "23": 10.72404, + "24": 10.61114, + "25": 10.54812, + "26": 10.61333, + "27": 10.63051, + "28": 10.56645, + "29": 10.59672, + "30": 10.37134, + "31": 10.11723, + "32": 10.46131, + "33": 10.4525, + "34": 10.21689, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18839, + "39": 10.41045, + "40": 10.09427, + "41": 10.1471, + "42": 10.21241, + "43": 9.84107, + "44": 9.95919, + "45": 9.84081, + "46": 9.82483, + "47": 10.13877, + "48": 9.85832, + "49": 9.54703, + "50": 9.90876, + "51": 9.85581, + "52": 9.75235, + "53": 10.07582, + "54": 9.95687, + "55": 9.882, + "56": 9.63137, + "57": 9.48647, + "58": 9.83111, + "59": 9.58896, + "60": 9.50647, + "61": 9.70361, + "62": 9.98283, + "63": 9.38302, + "64": 9.77906, + "65": 8.95171, + "66": 9.70162, + "67": 9.372, + "68": 9.78849, + "69": 9.79851, + "70": 9.74738, + "71": 9.61908, + "72": 9.58496, + "73": 9.49723, + "74": 8.93927, + "75": 9.42706, + "76": 9.08018, + "77": 10.06566, + "78": 9.72889, + "79": 9.37757, + "80": 9.40987, + "81": 9.47974, + "82": 9.70177, + "83": 9.30611, + "84": 9.42088, + "85": 9.61376, + "86": 9.07651, + "87": 9.59452, + "88": 9.75067, + "89": 9.60239, + "90": 9.81895, + "91": 9.33895, + "92": 9.35712, + "93": 9.07879, + "94": 8.83504, + "95": 9.52168, + "96": 9.53002, + "97": 9.31306, + "98": 9.67783, + "99": 8.89053, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 71.0, + "2": 65.0, + "3": 68.0, + "4": 57.0, + "5": 55.0, + "6": 70.0, + "7": 73.0, + "8": 58.0, + "9": 66.0, + "10": 65.0, + "11": 58.0, + "12": 77.0, + "13": 50.0, + "14": 65.0, + "15": 71.0, + "16": 68.0, + "17": 58.0, + "18": 57.0, + "19": 68.0, + "20": 61.0, + "21": 65.0, + "22": 57.0, + "23": 83.0, + "24": 58.0, + "25": 66.0, + "26": 63.0, + "27": 80.0, + "28": 82.0, + "29": 72.0, + "30": 71.0, + "31": 68.0, + "32": 75.0, + "33": 85.0, + "34": 63.0, + "35": 69.0, + "36": 58.0, + "37": 83.0, + "38": 65.0, + "39": 68.0, + "40": 81.0, + "41": 72.0, + "42": 76.0, + "43": 84.0, + "44": 85.0, + "45": 85.0, + "46": 79.0, + "47": 81.0, + "48": 68.0, + "49": 89.0, + "50": 80.0, + "51": 70.0, + "52": 81.0, + "53": 95.0, + "54": 101.0, + "55": 58.0, + "56": 90.0, + "57": 83.0, + "58": 90.0, + "59": 79.0, + "60": 84.0, + "61": 92.0, + "62": 102.0, + "63": 78.0, + "64": 73.0, + "65": 81.0, + "66": 88.0, + "67": 54.0, + "68": 57.0, + "69": 72.0, + "70": 88.0, + "71": 82.0, + "72": 64.0, + "73": 78.0, + "74": 76.0, + "75": 70.0, + "76": 78.0, + "77": 67.0, + "78": 86.0, + "79": 76.0, + "80": 90.0, + "81": 92.0, + "82": 72.0, + "83": 61.0, + "84": 65.0, + "85": 89.0, + "86": 73.0, + "87": 89.0, + "88": 63.0, + "89": 83.0, + "90": 72.0, + "91": 55.0, + "92": 63.0, + "93": 47.0, + "94": 74.0, + "95": 70.0, + "96": 73.0, + "97": 80.0, + "98": 76.0, + "99": 68.0, + "100": 75.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 546472448.0, + "2": 546472448.0, + "3": 546472448.0, + "4": 546472448.0, + "5": 546472448.0, + "6": 546472448.0, + "7": 546472448.0, + "8": 546472448.0, + "9": 546472448.0, + "10": 546472448.0, + "11": 546472448.0, + "12": 546472448.0, + "13": 546472448.0, + "14": 546472448.0, + "15": 546472448.0, + "16": 546472448.0, + "17": 546472448.0, + "18": 546472448.0, + "19": 546472448.0, + "20": 546472448.0, + "21": 546472448.0, + "22": 546472448.0, + "23": 546472448.0, + "24": 546472448.0, + "25": 546472448.0, + "26": 546472448.0, + "27": 546472448.0, + "28": 546472448.0, + "29": 546472448.0, + "30": 546472448.0, + "31": 546472448.0, + "32": 546472448.0, + "33": 546472448.0, + "34": 546472448.0, + "35": 546472448.0, + "36": 546472448.0, + "37": 546472448.0, + "38": 546472448.0, + "39": 546472448.0, + "40": 546472448.0, + "41": 546472448.0, + "42": 546472448.0, + "43": 546472448.0, + "44": 546472448.0, + "45": 546472448.0, + "46": 546472448.0, + "47": 546472448.0, + "48": 546472448.0, + "49": 546472448.0, + "50": 546472448.0, + "51": 546472448.0, + "52": 546472448.0, + "53": 546472448.0, + "54": 546472448.0, + "55": 546472448.0, + "56": 546472448.0, + "57": 546472448.0, + "58": 546472448.0, + "59": 546472448.0, + "60": 546472448.0, + "61": 546472448.0, + "62": 546472448.0, + "63": 546472448.0, + "64": 546472448.0, + "65": 546472448.0, + "66": 546472448.0, + "67": 546472448.0, + "68": 546472448.0, + "69": 546472448.0, + "70": 546472448.0, + "71": 546472448.0, + "72": 546472448.0, + "73": 546472448.0, + "74": 546472448.0, + "75": 546472448.0, + "76": 546472448.0, + "77": 546472448.0, + "78": 546472448.0, + "79": 546472448.0, + "80": 546472448.0, + "81": 546472448.0, + "82": 546472448.0, + "83": 546472448.0, + "84": 546472448.0, + "85": 546472448.0, + "86": 546472448.0, + "87": 546472448.0, + "88": 546472448.0, + "89": 546472448.0, + "90": 546472448.0, + "91": 546472448.0, + "92": 546472448.0, + "93": 546472448.0, + "94": 546472448.0, + "95": 546472448.0, + "96": 546472448.0, + "97": 546472448.0, + "98": 546472448.0, + "99": 546472448.0, + "100": 546472448.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1713796608.0, + "2": 1895967232.0, + "3": 1895967232.0, + "4": 1895967232.0, + "5": 1895967232.0, + "6": 1895967232.0, + "7": 1895967232.0, + "8": 1895967232.0, + "9": 1895967232.0, + "10": 1895967232.0, + "11": 1895967232.0, + "12": 1895967232.0, + "13": 1895967232.0, + "14": 1895967232.0, + "15": 1895967232.0, + "16": 1895967232.0, + "17": 1895967232.0, + "18": 1895967232.0, + "19": 1895967232.0, + "20": 1895967232.0, + "21": 1895967232.0, + "22": 1895967232.0, + "23": 1895967232.0, + "24": 1895967232.0, + "25": 1895967232.0, + "26": 1895967232.0, + "27": 1895967232.0, + "28": 1895967232.0, + "29": 1895967232.0, + "30": 1895967232.0, + "31": 1895967232.0, + "32": 1895967232.0, + "33": 1895967232.0, + "34": 1895967232.0, + "35": 1895967232.0, + "36": 1895967232.0, + "37": 1895967232.0, + "38": 1895967232.0, + "39": 1895967232.0, + "40": 1895967232.0, + "41": 1895967232.0, + "42": 1895967232.0, + "43": 1895967232.0, + "44": 1895967232.0, + "45": 1895967232.0, + "46": 1895967232.0, + "47": 1895967232.0, + "48": 1895967232.0, + "49": 1895967232.0, + "50": 1895967232.0, + "51": 1895967232.0, + "52": 1895967232.0, + "53": 1895967232.0, + "54": 1895967232.0, + "55": 1895967232.0, + "56": 1895967232.0, + "57": 1895967232.0, + "58": 1895967232.0, + "59": 1895967232.0, + "60": 1895967232.0, + "61": 1895967232.0, + "62": 1895967232.0, + "63": 1895967232.0, + "64": 1895967232.0, + "65": 1895967232.0, + "66": 1895967232.0, + "67": 1895967232.0, + "68": 1895967232.0, + "69": 1895967232.0, + "70": 1895967232.0, + "71": 1895967232.0, + "72": 1895967232.0, + "73": 1895967232.0, + "74": 1895967232.0, + "75": 1895967232.0, + "76": 1895967232.0, + "77": 1895967232.0, + "78": 1895967232.0, + "79": 1895967232.0, + "80": 1895967232.0, + "81": 1895967232.0, + "82": 1895967232.0, + "83": 1895967232.0, + "84": 1895967232.0, + "85": 1895967232.0, + "86": 1895967232.0, + "87": 1895967232.0, + "88": 1895967232.0, + "89": 1895967232.0, + "90": 1895967232.0, + "91": 1895967232.0, + "92": 1895967232.0, + "93": 1895967232.0, + "94": 1895967232.0, + "95": 1895967232.0, + "96": 1895967232.0, + "97": 1895967232.0, + "98": 1895967232.0, + "99": 1895967232.0, + "100": 1895967232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.72275, + "2": 0.17301, + "3": 0.15386, + "4": 0.16174, + "5": 0.16281, + "6": 0.16123, + "7": 0.16321, + "8": 0.15614, + "9": 0.15485, + "10": 0.15403, + "11": 0.15407, + "12": 0.15562, + "13": 0.15964, + "14": 0.15764, + "15": 0.15375, + "16": 0.1559, + "17": 0.15118, + "18": 0.15439, + "19": 0.15335, + "20": 0.15351, + "21": 0.15162, + "22": 0.15323, + "23": 0.15304, + "24": 0.15257, + "25": 0.15184, + "26": 0.15337, + "27": 0.15366, + "28": 0.1533, + "29": 0.15626, + "30": 0.15279, + "31": 0.15396, + "32": 0.15273, + "33": 0.15868, + "34": 0.15298, + "35": 0.15363, + "36": 0.15504, + "37": 0.15404, + "38": 0.15509, + "39": 0.15421, + "40": 0.15591, + "41": 0.15488, + "42": 0.15491, + "43": 0.15536, + "44": 0.15405, + "45": 0.15301, + "46": 0.1564, + "47": 0.1538, + "48": 0.15496, + "49": 0.15554, + "50": 0.15377, + "51": 0.16069, + "52": 0.15674, + "53": 0.15488, + "54": 0.15626, + "55": 0.15428, + "56": 0.15332, + "57": 0.15575, + "58": 0.15337, + "59": 0.1573, + "60": 0.15494, + "61": 0.15582, + "62": 0.15444, + "63": 0.15451, + "64": 0.15468, + "65": 0.15421, + "66": 0.15605, + "67": 0.15502, + "68": 0.1555, + "69": 0.15365, + "70": 0.15482, + "71": 0.15668, + "72": 0.15572, + "73": 0.15504, + "74": 0.15493, + "75": 0.15395, + "76": 0.1543, + "77": 0.15616, + "78": 0.15412, + "79": 0.15658, + "80": 0.15263, + "81": 0.15632, + "82": 0.15472, + "83": 0.1556, + "84": 0.15407, + "85": 0.15567, + "86": 0.15631, + "87": 0.15367, + "88": 0.15509, + "89": 0.1539, + "90": 0.15608, + "91": 0.15432, + "92": 0.155, + "93": 0.1529, + "94": 0.1541, + "95": 0.15468, + "96": 0.15535, + "97": 0.15603, + "98": 0.15443, + "99": 0.1563, + "100": 0.15285 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..00af7ef1865 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86544, + "4": 10.84555, + "5": 10.88343, + "6": 10.89431, + "7": 10.87069, + "8": 10.86982, + "9": 10.8692, + "10": 10.83882, + "11": 10.89437, + "12": 10.8798, + "13": 10.87986, + "14": 10.90316, + "15": 10.84047, + "16": 10.83785, + "17": 10.8067, + "18": 10.83027, + "19": 10.82265, + "20": 10.73196, + "21": 10.70751, + "22": 10.56001, + "23": 10.72404, + "24": 10.61114, + "25": 10.54812, + "26": 10.61333, + "27": 10.63051, + "28": 10.56645, + "29": 10.59672, + "30": 10.37134, + "31": 10.11723, + "32": 10.46131, + "33": 10.4525, + "34": 10.21689, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18839, + "39": 10.41045, + "40": 10.09427, + "41": 10.1471, + "42": 10.21241, + "43": 9.84107, + "44": 9.95919, + "45": 9.84081, + "46": 9.82483, + "47": 10.13877, + "48": 9.85832, + "49": 9.54703, + "50": 9.90876, + "51": 9.85581, + "52": 9.75235, + "53": 10.07582, + "54": 9.95687, + "55": 9.882, + "56": 9.63137, + "57": 9.48647, + "58": 9.83111, + "59": 9.58896, + "60": 9.50647, + "61": 9.70361, + "62": 9.98283, + "63": 9.38302, + "64": 9.77906, + "65": 8.95171, + "66": 9.70162, + "67": 9.372, + "68": 9.78849, + "69": 9.79851, + "70": 9.74738, + "71": 9.61908, + "72": 9.58496, + "73": 9.49723, + "74": 8.93927, + "75": 9.42706, + "76": 9.08018, + "77": 10.06566, + "78": 9.72889, + "79": 9.37757, + "80": 9.40987, + "81": 9.47974, + "82": 9.70177, + "83": 9.30611, + "84": 9.42088, + "85": 9.61376, + "86": 9.07651, + "87": 9.59452, + "88": 9.75067, + "89": 9.60239, + "90": 9.81895, + "91": 9.33895, + "92": 9.35712, + "93": 9.07879, + "94": 8.83504, + "95": 9.52168, + "96": 9.53002, + "97": 9.31306, + "98": 9.67783, + "99": 8.89053, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 71.0, + "2": 65.0, + "3": 68.0, + "4": 57.0, + "5": 55.0, + "6": 70.0, + "7": 73.0, + "8": 58.0, + "9": 66.0, + "10": 65.0, + "11": 58.0, + "12": 77.0, + "13": 50.0, + "14": 65.0, + "15": 71.0, + "16": 68.0, + "17": 58.0, + "18": 57.0, + "19": 68.0, + "20": 61.0, + "21": 65.0, + "22": 57.0, + "23": 83.0, + "24": 58.0, + "25": 66.0, + "26": 63.0, + "27": 80.0, + "28": 82.0, + "29": 72.0, + "30": 71.0, + "31": 68.0, + "32": 75.0, + "33": 85.0, + "34": 63.0, + "35": 69.0, + "36": 58.0, + "37": 83.0, + "38": 65.0, + "39": 68.0, + "40": 81.0, + "41": 72.0, + "42": 76.0, + "43": 84.0, + "44": 85.0, + "45": 85.0, + "46": 79.0, + "47": 81.0, + "48": 68.0, + "49": 89.0, + "50": 80.0, + "51": 70.0, + "52": 81.0, + "53": 95.0, + "54": 101.0, + "55": 58.0, + "56": 90.0, + "57": 83.0, + "58": 90.0, + "59": 79.0, + "60": 84.0, + "61": 92.0, + "62": 102.0, + "63": 78.0, + "64": 73.0, + "65": 81.0, + "66": 88.0, + "67": 54.0, + "68": 57.0, + "69": 72.0, + "70": 88.0, + "71": 82.0, + "72": 64.0, + "73": 78.0, + "74": 76.0, + "75": 70.0, + "76": 78.0, + "77": 67.0, + "78": 86.0, + "79": 76.0, + "80": 90.0, + "81": 92.0, + "82": 72.0, + "83": 61.0, + "84": 65.0, + "85": 89.0, + "86": 73.0, + "87": 89.0, + "88": 63.0, + "89": 83.0, + "90": 72.0, + "91": 55.0, + "92": 63.0, + "93": 47.0, + "94": 74.0, + "95": 70.0, + "96": 73.0, + "97": 80.0, + "98": 76.0, + "99": 68.0, + "100": 75.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 546472448.0, + "2": 546472448.0, + "3": 546472448.0, + "4": 546472448.0, + "5": 546472448.0, + "6": 546472448.0, + "7": 546472448.0, + "8": 546472448.0, + "9": 546472448.0, + "10": 546472448.0, + "11": 546472448.0, + "12": 546472448.0, + "13": 546472448.0, + "14": 546472448.0, + "15": 546472448.0, + "16": 546472448.0, + "17": 546472448.0, + "18": 546472448.0, + "19": 546472448.0, + "20": 546472448.0, + "21": 546472448.0, + "22": 546472448.0, + "23": 546472448.0, + "24": 546472448.0, + "25": 546472448.0, + "26": 546472448.0, + "27": 546472448.0, + "28": 546472448.0, + "29": 546472448.0, + "30": 546472448.0, + "31": 546472448.0, + "32": 546472448.0, + "33": 546472448.0, + "34": 546472448.0, + "35": 546472448.0, + "36": 546472448.0, + "37": 546472448.0, + "38": 546472448.0, + "39": 546472448.0, + "40": 546472448.0, + "41": 546472448.0, + "42": 546472448.0, + "43": 546472448.0, + "44": 546472448.0, + "45": 546472448.0, + "46": 546472448.0, + "47": 546472448.0, + "48": 546472448.0, + "49": 546472448.0, + "50": 546472448.0, + "51": 546472448.0, + "52": 546472448.0, + "53": 546472448.0, + "54": 546472448.0, + "55": 546472448.0, + "56": 546472448.0, + "57": 546472448.0, + "58": 546472448.0, + "59": 546472448.0, + "60": 546472448.0, + "61": 546472448.0, + "62": 546472448.0, + "63": 546472448.0, + "64": 546472448.0, + "65": 546472448.0, + "66": 546472448.0, + "67": 546472448.0, + "68": 546472448.0, + "69": 546472448.0, + "70": 546472448.0, + "71": 546472448.0, + "72": 546472448.0, + "73": 546472448.0, + "74": 546472448.0, + "75": 546472448.0, + "76": 546472448.0, + "77": 546472448.0, + "78": 546472448.0, + "79": 546472448.0, + "80": 546472448.0, + "81": 546472448.0, + "82": 546472448.0, + "83": 546472448.0, + "84": 546472448.0, + "85": 546472448.0, + "86": 546472448.0, + "87": 546472448.0, + "88": 546472448.0, + "89": 546472448.0, + "90": 546472448.0, + "91": 546472448.0, + "92": 546472448.0, + "93": 546472448.0, + "94": 546472448.0, + "95": 546472448.0, + "96": 546472448.0, + "97": 546472448.0, + "98": 546472448.0, + "99": 546472448.0, + "100": 546472448.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1713796608.0, + "2": 1895967232.0, + "3": 1895967232.0, + "4": 1895967232.0, + "5": 1895967232.0, + "6": 1895967232.0, + "7": 1895967232.0, + "8": 1895967232.0, + "9": 1895967232.0, + "10": 1895967232.0, + "11": 1895967232.0, + "12": 1895967232.0, + "13": 1895967232.0, + "14": 1895967232.0, + "15": 1895967232.0, + "16": 1895967232.0, + "17": 1895967232.0, + "18": 1895967232.0, + "19": 1895967232.0, + "20": 1895967232.0, + "21": 1895967232.0, + "22": 1895967232.0, + "23": 1895967232.0, + "24": 1895967232.0, + "25": 1895967232.0, + "26": 1895967232.0, + "27": 1895967232.0, + "28": 1895967232.0, + "29": 1895967232.0, + "30": 1895967232.0, + "31": 1895967232.0, + "32": 1895967232.0, + "33": 1895967232.0, + "34": 1895967232.0, + "35": 1895967232.0, + "36": 1895967232.0, + "37": 1895967232.0, + "38": 1895967232.0, + "39": 1895967232.0, + "40": 1895967232.0, + "41": 1895967232.0, + "42": 1895967232.0, + "43": 1895967232.0, + "44": 1895967232.0, + "45": 1895967232.0, + "46": 1895967232.0, + "47": 1895967232.0, + "48": 1895967232.0, + "49": 1895967232.0, + "50": 1895967232.0, + "51": 1895967232.0, + "52": 1895967232.0, + "53": 1895967232.0, + "54": 1895967232.0, + "55": 1895967232.0, + "56": 1895967232.0, + "57": 1895967232.0, + "58": 1895967232.0, + "59": 1895967232.0, + "60": 1895967232.0, + "61": 1895967232.0, + "62": 1895967232.0, + "63": 1895967232.0, + "64": 1895967232.0, + "65": 1895967232.0, + "66": 1895967232.0, + "67": 1895967232.0, + "68": 1895967232.0, + "69": 1895967232.0, + "70": 1895967232.0, + "71": 1895967232.0, + "72": 1895967232.0, + "73": 1895967232.0, + "74": 1895967232.0, + "75": 1895967232.0, + "76": 1895967232.0, + "77": 1895967232.0, + "78": 1895967232.0, + "79": 1895967232.0, + "80": 1895967232.0, + "81": 1895967232.0, + "82": 1895967232.0, + "83": 1895967232.0, + "84": 1895967232.0, + "85": 1895967232.0, + "86": 1895967232.0, + "87": 1895967232.0, + "88": 1895967232.0, + "89": 1895967232.0, + "90": 1895967232.0, + "91": 1895967232.0, + "92": 1895967232.0, + "93": 1895967232.0, + "94": 1895967232.0, + "95": 1895967232.0, + "96": 1895967232.0, + "97": 1895967232.0, + "98": 1895967232.0, + "99": 1895967232.0, + "100": 1895967232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.30059, + "2": 0.17777, + "3": 0.13503, + "4": 0.13378, + "5": 0.1357, + "6": 0.13267, + "7": 0.13302, + "8": 0.13235, + "9": 0.13435, + "10": 0.13421, + "11": 0.13233, + "12": 0.13074, + "13": 0.12922, + "14": 0.13131, + "15": 0.13296, + "16": 0.13106, + "17": 0.13142, + "18": 0.13375, + "19": 0.13295, + "20": 0.13185, + "21": 0.13239, + "22": 0.13128, + "23": 0.13257, + "24": 0.13321, + "25": 0.13186, + "26": 0.13183, + "27": 0.13148, + "28": 0.13158, + "29": 0.13055, + "30": 0.13201, + "31": 0.1314, + "32": 0.13098, + "33": 0.13284, + "34": 0.13152, + "35": 0.13191, + "36": 0.13208, + "37": 0.13199, + "38": 0.13223, + "39": 0.13213, + "40": 0.13135, + "41": 0.13187, + "42": 0.13104, + "43": 0.13286, + "44": 0.13281, + "45": 0.13109, + "46": 0.13108, + "47": 0.13377, + "48": 0.13164, + "49": 0.13194, + "50": 0.1309, + "51": 0.14716, + "52": 0.14386, + "53": 0.133, + "54": 0.13142, + "55": 0.12988, + "56": 0.13391, + "57": 0.14548, + "58": 0.1475, + "59": 0.1326, + "60": 0.13058, + "61": 0.13075, + "62": 0.13206, + "63": 0.13128, + "64": 0.13303, + "65": 0.13059, + "66": 0.12969, + "67": 0.13108, + "68": 0.13125, + "69": 0.1294, + "70": 0.13035, + "71": 0.13528, + "72": 0.13186, + "73": 0.13078, + "74": 0.12997, + "75": 0.13033, + "76": 0.13134, + "77": 0.13127, + "78": 0.12885, + "79": 0.13057, + "80": 0.13054, + "81": 0.131, + "82": 0.13102, + "83": 0.13228, + "84": 0.13261, + "85": 0.1312, + "86": 0.1324, + "87": 0.13346, + "88": 0.13044, + "89": 0.13079, + "90": 0.13018, + "91": 0.13115, + "92": 0.13135, + "93": 0.13062, + "94": 0.13049, + "95": 0.13131, + "96": 0.13099, + "97": 0.13099, + "98": 0.1311, + "99": 0.13221, + "100": 0.13235 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..c1aaf21cf26 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86544, + "4": 10.84555, + "5": 10.88343, + "6": 10.89431, + "7": 10.87069, + "8": 10.86982, + "9": 10.8692, + "10": 10.83882, + "11": 10.89437, + "12": 10.8798, + "13": 10.87986, + "14": 10.90316, + "15": 10.84047, + "16": 10.83785, + "17": 10.8067, + "18": 10.83027, + "19": 10.82265, + "20": 10.73196, + "21": 10.70751, + "22": 10.56001, + "23": 10.72404, + "24": 10.61114, + "25": 10.54812, + "26": 10.61333, + "27": 10.63051, + "28": 10.56645, + "29": 10.59672, + "30": 10.37134, + "31": 10.11723, + "32": 10.46131, + "33": 10.4525, + "34": 10.21689, + "35": 10.27171, + "36": 10.2312, + "37": 10.34809, + "38": 10.18839, + "39": 10.41045, + "40": 10.09427, + "41": 10.1471, + "42": 10.21241, + "43": 9.84107, + "44": 9.95919, + "45": 9.84081, + "46": 9.82483, + "47": 10.13877, + "48": 9.85832, + "49": 9.54703, + "50": 9.90876, + "51": 9.85581, + "52": 9.75235, + "53": 10.07582, + "54": 9.95687, + "55": 9.882, + "56": 9.63137, + "57": 9.48647, + "58": 9.83111, + "59": 9.58896, + "60": 9.50647, + "61": 9.70361, + "62": 9.98283, + "63": 9.38302, + "64": 9.77906, + "65": 8.95171, + "66": 9.70162, + "67": 9.372, + "68": 9.78849, + "69": 9.79851, + "70": 9.74738, + "71": 9.61908, + "72": 9.58496, + "73": 9.49723, + "74": 8.93927, + "75": 9.42706, + "76": 9.08018, + "77": 10.06566, + "78": 9.72889, + "79": 9.37757, + "80": 9.40987, + "81": 9.47974, + "82": 9.70177, + "83": 9.30611, + "84": 9.42088, + "85": 9.61376, + "86": 9.07651, + "87": 9.59452, + "88": 9.75067, + "89": 9.60239, + "90": 9.81895, + "91": 9.33895, + "92": 9.35712, + "93": 9.07879, + "94": 8.83504, + "95": 9.52168, + "96": 9.53002, + "97": 9.31306, + "98": 9.67783, + "99": 8.89053, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 71.0, + "2": 65.0, + "3": 68.0, + "4": 57.0, + "5": 55.0, + "6": 70.0, + "7": 73.0, + "8": 58.0, + "9": 66.0, + "10": 65.0, + "11": 58.0, + "12": 77.0, + "13": 50.0, + "14": 65.0, + "15": 71.0, + "16": 68.0, + "17": 58.0, + "18": 57.0, + "19": 68.0, + "20": 61.0, + "21": 65.0, + "22": 57.0, + "23": 83.0, + "24": 58.0, + "25": 66.0, + "26": 63.0, + "27": 80.0, + "28": 82.0, + "29": 72.0, + "30": 71.0, + "31": 68.0, + "32": 75.0, + "33": 85.0, + "34": 63.0, + "35": 69.0, + "36": 58.0, + "37": 83.0, + "38": 65.0, + "39": 68.0, + "40": 81.0, + "41": 72.0, + "42": 76.0, + "43": 84.0, + "44": 85.0, + "45": 85.0, + "46": 79.0, + "47": 81.0, + "48": 68.0, + "49": 89.0, + "50": 80.0, + "51": 70.0, + "52": 81.0, + "53": 95.0, + "54": 101.0, + "55": 58.0, + "56": 90.0, + "57": 83.0, + "58": 90.0, + "59": 79.0, + "60": 84.0, + "61": 92.0, + "62": 102.0, + "63": 78.0, + "64": 73.0, + "65": 81.0, + "66": 88.0, + "67": 54.0, + "68": 57.0, + "69": 72.0, + "70": 88.0, + "71": 82.0, + "72": 64.0, + "73": 78.0, + "74": 76.0, + "75": 70.0, + "76": 78.0, + "77": 67.0, + "78": 86.0, + "79": 76.0, + "80": 90.0, + "81": 92.0, + "82": 72.0, + "83": 61.0, + "84": 65.0, + "85": 89.0, + "86": 73.0, + "87": 89.0, + "88": 63.0, + "89": 83.0, + "90": 72.0, + "91": 55.0, + "92": 63.0, + "93": 47.0, + "94": 74.0, + "95": 70.0, + "96": 73.0, + "97": 80.0, + "98": 76.0, + "99": 68.0, + "100": 75.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 546472448.0, + "2": 546472448.0, + "3": 546472448.0, + "4": 546472448.0, + "5": 546472448.0, + "6": 546472448.0, + "7": 546472448.0, + "8": 546472448.0, + "9": 546472448.0, + "10": 546472448.0, + "11": 546472448.0, + "12": 546472448.0, + "13": 546472448.0, + "14": 546472448.0, + "15": 546472448.0, + "16": 546472448.0, + "17": 546472448.0, + "18": 546472448.0, + "19": 546472448.0, + "20": 546472448.0, + "21": 546472448.0, + "22": 546472448.0, + "23": 546472448.0, + "24": 546472448.0, + "25": 546472448.0, + "26": 546472448.0, + "27": 546472448.0, + "28": 546472448.0, + "29": 546472448.0, + "30": 546472448.0, + "31": 546472448.0, + "32": 546472448.0, + "33": 546472448.0, + "34": 546472448.0, + "35": 546472448.0, + "36": 546472448.0, + "37": 546472448.0, + "38": 546472448.0, + "39": 546472448.0, + "40": 546472448.0, + "41": 546472448.0, + "42": 546472448.0, + "43": 546472448.0, + "44": 546472448.0, + "45": 546472448.0, + "46": 546472448.0, + "47": 546472448.0, + "48": 546472448.0, + "49": 546472448.0, + "50": 546472448.0, + "51": 546472448.0, + "52": 546472448.0, + "53": 546472448.0, + "54": 546472448.0, + "55": 546472448.0, + "56": 546472448.0, + "57": 546472448.0, + "58": 546472448.0, + "59": 546472448.0, + "60": 546472448.0, + "61": 546472448.0, + "62": 546472448.0, + "63": 546472448.0, + "64": 546472448.0, + "65": 546472448.0, + "66": 546472448.0, + "67": 546472448.0, + "68": 546472448.0, + "69": 546472448.0, + "70": 546472448.0, + "71": 546472448.0, + "72": 546472448.0, + "73": 546472448.0, + "74": 546472448.0, + "75": 546472448.0, + "76": 546472448.0, + "77": 546472448.0, + "78": 546472448.0, + "79": 546472448.0, + "80": 546472448.0, + "81": 546472448.0, + "82": 546472448.0, + "83": 546472448.0, + "84": 546472448.0, + "85": 546472448.0, + "86": 546472448.0, + "87": 546472448.0, + "88": 546472448.0, + "89": 546472448.0, + "90": 546472448.0, + "91": 546472448.0, + "92": 546472448.0, + "93": 546472448.0, + "94": 546472448.0, + "95": 546472448.0, + "96": 546472448.0, + "97": 546472448.0, + "98": 546472448.0, + "99": 546472448.0, + "100": 546472448.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1713796608.0, + "2": 1895967232.0, + "3": 1895967232.0, + "4": 1895967232.0, + "5": 1895967232.0, + "6": 1895967232.0, + "7": 1895967232.0, + "8": 1895967232.0, + "9": 1895967232.0, + "10": 1895967232.0, + "11": 1895967232.0, + "12": 1895967232.0, + "13": 1895967232.0, + "14": 1895967232.0, + "15": 1895967232.0, + "16": 1895967232.0, + "17": 1895967232.0, + "18": 1895967232.0, + "19": 1895967232.0, + "20": 1895967232.0, + "21": 1895967232.0, + "22": 1895967232.0, + "23": 1895967232.0, + "24": 1895967232.0, + "25": 1895967232.0, + "26": 1895967232.0, + "27": 1895967232.0, + "28": 1895967232.0, + "29": 1895967232.0, + "30": 1895967232.0, + "31": 1895967232.0, + "32": 1895967232.0, + "33": 1895967232.0, + "34": 1895967232.0, + "35": 1895967232.0, + "36": 1895967232.0, + "37": 1895967232.0, + "38": 1895967232.0, + "39": 1895967232.0, + "40": 1895967232.0, + "41": 1895967232.0, + "42": 1895967232.0, + "43": 1895967232.0, + "44": 1895967232.0, + "45": 1895967232.0, + "46": 1895967232.0, + "47": 1895967232.0, + "48": 1895967232.0, + "49": 1895967232.0, + "50": 1895967232.0, + "51": 1895967232.0, + "52": 1895967232.0, + "53": 1895967232.0, + "54": 1895967232.0, + "55": 1895967232.0, + "56": 1895967232.0, + "57": 1895967232.0, + "58": 1895967232.0, + "59": 1895967232.0, + "60": 1895967232.0, + "61": 1895967232.0, + "62": 1895967232.0, + "63": 1895967232.0, + "64": 1895967232.0, + "65": 1895967232.0, + "66": 1895967232.0, + "67": 1895967232.0, + "68": 1895967232.0, + "69": 1895967232.0, + "70": 1895967232.0, + "71": 1895967232.0, + "72": 1895967232.0, + "73": 1895967232.0, + "74": 1895967232.0, + "75": 1895967232.0, + "76": 1895967232.0, + "77": 1895967232.0, + "78": 1895967232.0, + "79": 1895967232.0, + "80": 1895967232.0, + "81": 1895967232.0, + "82": 1895967232.0, + "83": 1895967232.0, + "84": 1895967232.0, + "85": 1895967232.0, + "86": 1895967232.0, + "87": 1895967232.0, + "88": 1895967232.0, + "89": 1895967232.0, + "90": 1895967232.0, + "91": 1895967232.0, + "92": 1895967232.0, + "93": 1895967232.0, + "94": 1895967232.0, + "95": 1895967232.0, + "96": 1895967232.0, + "97": 1895967232.0, + "98": 1895967232.0, + "99": 1895967232.0, + "100": 1895967232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.81196, + "2": 0.17008, + "3": 0.15523, + "4": 0.15249, + "5": 0.15434, + "6": 0.15515, + "7": 0.15378, + "8": 0.1528, + "9": 0.15287, + "10": 0.15479, + "11": 0.15442, + "12": 0.15952, + "13": 0.15843, + "14": 0.15559, + "15": 0.15333, + "16": 0.15363, + "17": 0.15594, + "18": 0.153, + "19": 0.15542, + "20": 0.15304, + "21": 0.15492, + "22": 0.15277, + "23": 0.15803, + "24": 0.1545, + "25": 0.15639, + "26": 0.15419, + "27": 0.15381, + "28": 0.15423, + "29": 0.15354, + "30": 0.1554, + "31": 0.15389, + "32": 0.15608, + "33": 0.15361, + "34": 0.15437, + "35": 0.15233, + "36": 0.15499, + "37": 0.15114, + "38": 0.15259, + "39": 0.15269, + "40": 0.1516, + "41": 0.15052, + "42": 0.15122, + "43": 0.15389, + "44": 0.15261, + "45": 0.15376, + "46": 0.15091, + "47": 0.15197, + "48": 0.15131, + "49": 0.15083, + "50": 0.152, + "51": 0.15723, + "52": 0.15481, + "53": 0.15087, + "54": 0.15175, + "55": 0.15331, + "56": 0.15504, + "57": 0.15471, + "58": 0.1549, + "59": 0.15621, + "60": 0.1533, + "61": 0.15499, + "62": 0.15222, + "63": 0.15091, + "64": 0.1535, + "65": 0.15463, + "66": 0.15169, + "67": 0.15591, + "68": 0.15173, + "69": 0.1509, + "70": 0.15063, + "71": 0.15755, + "72": 0.1545, + "73": 0.15374, + "74": 0.15306, + "75": 0.15223, + "76": 0.15203, + "77": 0.15194, + "78": 0.15284, + "79": 0.15345, + "80": 0.15138, + "81": 0.15298, + "82": 0.15115, + "83": 0.15281, + "84": 0.1544, + "85": 0.15277, + "86": 0.15368, + "87": 0.15373, + "88": 0.15359, + "89": 0.15205, + "90": 0.1535, + "91": 0.15459, + "92": 0.15406, + "93": 0.15133, + "94": 0.1533, + "95": 0.15198, + "96": 0.15195, + "97": 0.1533, + "98": 0.15406, + "99": 0.1528, + "100": 0.15371 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..e4807dd3280 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91603, + "4": 10.9091, + "5": 10.92799, + "6": 10.93628, + "7": 10.90625, + "8": 10.92129, + "9": 10.90998, + "10": 10.90789, + "11": 10.89335, + "12": 10.92458, + "13": 10.91459, + "14": 10.92129, + "15": 10.88313, + "16": 10.87322, + "17": 10.84129, + "18": 10.87278, + "19": 10.85629, + "20": 10.77626, + "21": 10.7487, + "22": 10.63028, + "23": 10.75683, + "24": 10.65647, + "25": 10.59138, + "26": 10.65379, + "27": 10.6485, + "28": 10.59548, + "29": 10.60882, + "30": 10.39195, + "31": 10.15754, + "32": 10.49101, + "33": 10.47929, + "34": 10.24061, + "35": 10.29687, + "36": 10.2464, + "37": 10.35228, + "38": 10.20491, + "39": 10.4052, + "40": 10.0964, + "41": 10.15176, + "42": 10.22032, + "43": 9.85497, + "44": 9.96138, + "45": 9.84466, + "46": 9.83805, + "47": 10.13984, + "48": 9.85719, + "49": 9.53694, + "50": 9.9092, + "51": 9.84886, + "52": 9.74156, + "53": 10.06349, + "54": 9.94683, + "55": 9.87764, + "56": 9.6274, + "57": 9.47111, + "58": 9.8292, + "59": 9.58251, + "60": 9.49121, + "61": 9.69959, + "62": 9.97969, + "63": 9.37277, + "64": 9.77468, + "65": 8.94232, + "66": 9.69905, + "67": 9.3638, + "68": 9.78788, + "69": 9.78333, + "70": 9.72263, + "71": 9.60795, + "72": 9.5846, + "73": 9.48966, + "74": 8.9487, + "75": 9.41912, + "76": 9.08728, + "77": 10.06356, + "78": 9.72834, + "79": 9.37163, + "80": 9.40079, + "81": 9.47845, + "82": 9.69179, + "83": 9.30761, + "84": 9.41229, + "85": 9.61209, + "86": 9.07599, + "87": 9.5947, + "88": 9.74743, + "89": 9.60687, + "90": 9.81012, + "91": 9.3436, + "92": 9.36483, + "93": 9.0776, + "94": 8.83107, + "95": 9.51718, + "96": 9.5245, + "97": 9.31025, + "98": 9.67895, + "99": 8.88829, + "100": 9.40153 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 68.0, + "2": 52.0, + "3": 60.0, + "4": 54.0, + "5": 64.0, + "6": 64.0, + "7": 66.0, + "8": 69.0, + "9": 75.0, + "10": 61.0, + "11": 61.0, + "12": 71.0, + "13": 54.0, + "14": 61.0, + "15": 58.0, + "16": 58.0, + "17": 66.0, + "18": 56.0, + "19": 56.0, + "20": 64.0, + "21": 55.0, + "22": 55.0, + "23": 80.0, + "24": 69.0, + "25": 58.0, + "26": 85.0, + "27": 67.0, + "28": 64.0, + "29": 60.0, + "30": 85.0, + "31": 77.0, + "32": 76.0, + "33": 85.0, + "34": 69.0, + "35": 66.0, + "36": 68.0, + "37": 68.0, + "38": 79.0, + "39": 69.0, + "40": 85.0, + "41": 71.0, + "42": 86.0, + "43": 78.0, + "44": 73.0, + "45": 84.0, + "46": 84.0, + "47": 78.0, + "48": 77.0, + "49": 76.0, + "50": 85.0, + "51": 70.0, + "52": 79.0, + "53": 78.0, + "54": 83.0, + "55": 69.0, + "56": 74.0, + "57": 76.0, + "58": 85.0, + "59": 67.0, + "60": 67.0, + "61": 81.0, + "62": 88.0, + "63": 76.0, + "64": 86.0, + "65": 65.0, + "66": 85.0, + "67": 64.0, + "68": 78.0, + "69": 67.0, + "70": 92.0, + "71": 68.0, + "72": 65.0, + "73": 90.0, + "74": 59.0, + "75": 51.0, + "76": 71.0, + "77": 73.0, + "78": 95.0, + "79": 84.0, + "80": 98.0, + "81": 65.0, + "82": 78.0, + "83": 64.0, + "84": 76.0, + "85": 86.0, + "86": 68.0, + "87": 85.0, + "88": 88.0, + "89": 88.0, + "90": 83.0, + "91": 51.0, + "92": 84.0, + "93": 69.0, + "94": 82.0, + "95": 72.0, + "96": 66.0, + "97": 83.0, + "98": 83.0, + "99": 65.0, + "100": 73.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 487096320.0, + "2": 487096320.0, + "3": 487096320.0, + "4": 487096320.0, + "5": 487096320.0, + "6": 487096320.0, + "7": 487096320.0, + "8": 487096320.0, + "9": 487096320.0, + "10": 487096320.0, + "11": 487096320.0, + "12": 487096320.0, + "13": 487096320.0, + "14": 487096320.0, + "15": 487096320.0, + "16": 487096320.0, + "17": 487096320.0, + "18": 487096320.0, + "19": 487096320.0, + "20": 487096320.0, + "21": 487096320.0, + "22": 487096320.0, + "23": 487096320.0, + "24": 487096320.0, + "25": 487096320.0, + "26": 487096320.0, + "27": 487096320.0, + "28": 487096320.0, + "29": 487096320.0, + "30": 487096320.0, + "31": 487096320.0, + "32": 487096320.0, + "33": 487096320.0, + "34": 487096320.0, + "35": 487096320.0, + "36": 487096320.0, + "37": 487096320.0, + "38": 487096320.0, + "39": 487096320.0, + "40": 487096320.0, + "41": 487096320.0, + "42": 487096320.0, + "43": 487096320.0, + "44": 487096320.0, + "45": 487096320.0, + "46": 487096320.0, + "47": 487096320.0, + "48": 487096320.0, + "49": 487096320.0, + "50": 487096320.0, + "51": 487096320.0, + "52": 487096320.0, + "53": 487096320.0, + "54": 487096320.0, + "55": 487096320.0, + "56": 487096320.0, + "57": 487096320.0, + "58": 487096320.0, + "59": 487096320.0, + "60": 487096320.0, + "61": 487096320.0, + "62": 487096320.0, + "63": 487096320.0, + "64": 487096320.0, + "65": 487096320.0, + "66": 487096320.0, + "67": 487096320.0, + "68": 487096320.0, + "69": 487096320.0, + "70": 487096320.0, + "71": 487096320.0, + "72": 487096320.0, + "73": 487096320.0, + "74": 487096320.0, + "75": 487096320.0, + "76": 487096320.0, + "77": 487096320.0, + "78": 487096320.0, + "79": 487096320.0, + "80": 487096320.0, + "81": 487096320.0, + "82": 487096320.0, + "83": 487096320.0, + "84": 487096320.0, + "85": 487096320.0, + "86": 487096320.0, + "87": 487096320.0, + "88": 487096320.0, + "89": 487096320.0, + "90": 487096320.0, + "91": 487096320.0, + "92": 487096320.0, + "93": 487096320.0, + "94": 487096320.0, + "95": 487096320.0, + "96": 487096320.0, + "97": 487096320.0, + "98": 487096320.0, + "99": 487096320.0, + "100": 487096320.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2158389248.0, + "2": 2338462720.0, + "3": 2338462720.0, + "4": 2339380224.0, + "5": 2339380224.0, + "6": 2339380224.0, + "7": 2339380224.0, + "8": 2339380224.0, + "9": 2339380224.0, + "10": 2339380224.0, + "11": 2339380224.0, + "12": 2339380224.0, + "13": 2339380224.0, + "14": 2339380224.0, + "15": 2339380224.0, + "16": 2339380224.0, + "17": 2339380224.0, + "18": 2339380224.0, + "19": 2339380224.0, + "20": 2339380224.0, + "21": 2339380224.0, + "22": 2339380224.0, + "23": 2339380224.0, + "24": 2339380224.0, + "25": 2339380224.0, + "26": 2339380224.0, + "27": 2339380224.0, + "28": 2339380224.0, + "29": 2339380224.0, + "30": 2339380224.0, + "31": 2339380224.0, + "32": 2339380224.0, + "33": 2339380224.0, + "34": 2339380224.0, + "35": 2339380224.0, + "36": 2339380224.0, + "37": 2339380224.0, + "38": 2339380224.0, + "39": 2339380224.0, + "40": 2339380224.0, + "41": 2339380224.0, + "42": 2339380224.0, + "43": 2339380224.0, + "44": 2339380224.0, + "45": 2339380224.0, + "46": 2339380224.0, + "47": 2339380224.0, + "48": 2339380224.0, + "49": 2339380224.0, + "50": 2339380224.0, + "51": 2339380224.0, + "52": 2339380224.0, + "53": 2339380224.0, + "54": 2339380224.0, + "55": 2339380224.0, + "56": 2339380224.0, + "57": 2339380224.0, + "58": 2339380224.0, + "59": 2339380224.0, + "60": 2339380224.0, + "61": 2339380224.0, + "62": 2339380224.0, + "63": 2339380224.0, + "64": 2339380224.0, + "65": 2339380224.0, + "66": 2339380224.0, + "67": 2339380224.0, + "68": 2339380224.0, + "69": 2339380224.0, + "70": 2339380224.0, + "71": 2339380224.0, + "72": 2339380224.0, + "73": 2339380224.0, + "74": 2339380224.0, + "75": 2339380224.0, + "76": 2339380224.0, + "77": 2339380224.0, + "78": 2339380224.0, + "79": 2339380224.0, + "80": 2339380224.0, + "81": 2339380224.0, + "82": 2339380224.0, + "83": 2339380224.0, + "84": 2339380224.0, + "85": 2339380224.0, + "86": 2339380224.0, + "87": 2339380224.0, + "88": 2339380224.0, + "89": 2339380224.0, + "90": 2339380224.0, + "91": 2339380224.0, + "92": 2339380224.0, + "93": 2339380224.0, + "94": 2339380224.0, + "95": 2339380224.0, + "96": 2339380224.0, + "97": 2339380224.0, + "98": 2339380224.0, + "99": 2339380224.0, + "100": 2339380224.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.83126, + "2": 0.26341, + "3": 0.23434, + "4": 0.23414, + "5": 0.243, + "6": 0.23093, + "7": 0.2349, + "8": 0.23447, + "9": 0.23241, + "10": 0.23155, + "11": 0.23263, + "12": 0.23115, + "13": 0.23168, + "14": 0.23309, + "15": 0.23146, + "16": 0.23206, + "17": 0.23373, + "18": 0.23689, + "19": 0.23192, + "20": 0.23083, + "21": 0.23324, + "22": 0.23339, + "23": 0.2311, + "24": 0.23003, + "25": 0.23092, + "26": 0.23001, + "27": 0.23221, + "28": 0.22984, + "29": 0.23347, + "30": 0.23349, + "31": 0.44414, + "32": 0.22811, + "33": 0.22989, + "34": 0.22796, + "35": 0.22895, + "36": 0.22701, + "37": 0.22772, + "38": 0.22966, + "39": 0.22791, + "40": 0.22768, + "41": 0.22809, + "42": 0.23136, + "43": 0.22907, + "44": 0.22647, + "45": 0.22963, + "46": 0.23039, + "47": 0.22951, + "48": 0.2281, + "49": 0.22875, + "50": 0.22865, + "51": 0.22909, + "52": 0.22123, + "53": 0.22076, + "54": 0.22154, + "55": 0.2222, + "56": 0.39897, + "57": 0.22058, + "58": 0.22118, + "59": 0.22849, + "60": 0.22871, + "61": 0.2225, + "62": 0.22208, + "63": 0.22298, + "64": 0.22377, + "65": 0.22446, + "66": 0.22435, + "67": 0.22221, + "68": 0.22386, + "69": 0.22616, + "70": 0.2232, + "71": 0.22301, + "72": 0.42061, + "73": 0.22703, + "74": 0.22271, + "75": 0.22204, + "76": 0.22282, + "77": 0.22517, + "78": 0.22207, + "79": 0.24309, + "80": 0.24317, + "81": 0.25879, + "82": 0.22268, + "83": 0.22204, + "84": 0.2228, + "85": 0.22447, + "86": 0.22388, + "87": 0.22291, + "88": 0.22259, + "89": 0.22341, + "90": 0.22502, + "91": 0.22225, + "92": 0.2218, + "93": 0.22176, + "94": 0.22225, + "95": 0.22471, + "96": 0.22277, + "97": 0.22023, + "98": 0.22426, + "99": 0.22626, + "100": 0.22111 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..7a6cb6fa053 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91603, + "4": 10.9091, + "5": 10.92799, + "6": 10.93628, + "7": 10.90625, + "8": 10.92129, + "9": 10.90998, + "10": 10.90789, + "11": 10.89335, + "12": 10.92458, + "13": 10.91459, + "14": 10.92129, + "15": 10.88313, + "16": 10.87322, + "17": 10.84129, + "18": 10.87278, + "19": 10.85629, + "20": 10.77626, + "21": 10.7487, + "22": 10.63028, + "23": 10.75683, + "24": 10.65647, + "25": 10.59138, + "26": 10.65379, + "27": 10.6485, + "28": 10.59548, + "29": 10.60882, + "30": 10.39195, + "31": 10.15754, + "32": 10.49101, + "33": 10.47929, + "34": 10.24061, + "35": 10.29687, + "36": 10.2464, + "37": 10.35228, + "38": 10.20491, + "39": 10.4052, + "40": 10.0964, + "41": 10.15176, + "42": 10.22032, + "43": 9.85497, + "44": 9.96138, + "45": 9.84466, + "46": 9.83805, + "47": 10.13984, + "48": 9.85719, + "49": 9.53694, + "50": 9.9092, + "51": 9.84886, + "52": 9.74156, + "53": 10.06349, + "54": 9.94683, + "55": 9.87764, + "56": 9.6274, + "57": 9.47111, + "58": 9.8292, + "59": 9.58251, + "60": 9.49121, + "61": 9.69959, + "62": 9.97969, + "63": 9.37277, + "64": 9.77468, + "65": 8.94232, + "66": 9.69905, + "67": 9.3638, + "68": 9.78788, + "69": 9.78333, + "70": 9.72263, + "71": 9.60795, + "72": 9.5846, + "73": 9.48966, + "74": 8.9487, + "75": 9.41912, + "76": 9.08728, + "77": 10.06356, + "78": 9.72834, + "79": 9.37163, + "80": 9.40079, + "81": 9.47845, + "82": 9.69179, + "83": 9.30761, + "84": 9.41229, + "85": 9.61209, + "86": 9.07599, + "87": 9.5947, + "88": 9.74743, + "89": 9.60687, + "90": 9.81012, + "91": 9.3436, + "92": 9.36483, + "93": 9.0776, + "94": 8.83107, + "95": 9.51718, + "96": 9.5245, + "97": 9.31025, + "98": 9.67895, + "99": 8.88829, + "100": 9.40153 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 68.0, + "2": 52.0, + "3": 60.0, + "4": 54.0, + "5": 64.0, + "6": 64.0, + "7": 66.0, + "8": 69.0, + "9": 75.0, + "10": 61.0, + "11": 61.0, + "12": 71.0, + "13": 54.0, + "14": 61.0, + "15": 58.0, + "16": 58.0, + "17": 66.0, + "18": 56.0, + "19": 56.0, + "20": 64.0, + "21": 55.0, + "22": 55.0, + "23": 80.0, + "24": 69.0, + "25": 58.0, + "26": 85.0, + "27": 67.0, + "28": 64.0, + "29": 60.0, + "30": 85.0, + "31": 77.0, + "32": 76.0, + "33": 85.0, + "34": 69.0, + "35": 66.0, + "36": 68.0, + "37": 68.0, + "38": 79.0, + "39": 69.0, + "40": 85.0, + "41": 71.0, + "42": 86.0, + "43": 78.0, + "44": 73.0, + "45": 84.0, + "46": 84.0, + "47": 78.0, + "48": 77.0, + "49": 76.0, + "50": 85.0, + "51": 70.0, + "52": 79.0, + "53": 78.0, + "54": 83.0, + "55": 69.0, + "56": 74.0, + "57": 76.0, + "58": 85.0, + "59": 67.0, + "60": 67.0, + "61": 81.0, + "62": 88.0, + "63": 76.0, + "64": 86.0, + "65": 65.0, + "66": 85.0, + "67": 64.0, + "68": 78.0, + "69": 67.0, + "70": 92.0, + "71": 68.0, + "72": 65.0, + "73": 90.0, + "74": 59.0, + "75": 51.0, + "76": 71.0, + "77": 73.0, + "78": 95.0, + "79": 84.0, + "80": 98.0, + "81": 65.0, + "82": 78.0, + "83": 64.0, + "84": 76.0, + "85": 86.0, + "86": 68.0, + "87": 85.0, + "88": 88.0, + "89": 88.0, + "90": 83.0, + "91": 51.0, + "92": 84.0, + "93": 69.0, + "94": 82.0, + "95": 72.0, + "96": 66.0, + "97": 83.0, + "98": 83.0, + "99": 65.0, + "100": 73.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 488144896.0, + "2": 488144896.0, + "3": 488144896.0, + "4": 488144896.0, + "5": 488144896.0, + "6": 488144896.0, + "7": 488144896.0, + "8": 488144896.0, + "9": 488144896.0, + "10": 488144896.0, + "11": 488144896.0, + "12": 488144896.0, + "13": 488144896.0, + "14": 488144896.0, + "15": 488144896.0, + "16": 488144896.0, + "17": 488144896.0, + "18": 488144896.0, + "19": 488144896.0, + "20": 488144896.0, + "21": 488144896.0, + "22": 488144896.0, + "23": 488144896.0, + "24": 488144896.0, + "25": 488144896.0, + "26": 488144896.0, + "27": 488144896.0, + "28": 488144896.0, + "29": 488144896.0, + "30": 488144896.0, + "31": 488144896.0, + "32": 488144896.0, + "33": 488144896.0, + "34": 488144896.0, + "35": 488144896.0, + "36": 488144896.0, + "37": 488144896.0, + "38": 488144896.0, + "39": 488144896.0, + "40": 488144896.0, + "41": 488144896.0, + "42": 488144896.0, + "43": 488144896.0, + "44": 488144896.0, + "45": 488144896.0, + "46": 488144896.0, + "47": 488144896.0, + "48": 488144896.0, + "49": 488144896.0, + "50": 488144896.0, + "51": 488144896.0, + "52": 488144896.0, + "53": 488144896.0, + "54": 488144896.0, + "55": 488144896.0, + "56": 488144896.0, + "57": 488144896.0, + "58": 488144896.0, + "59": 488144896.0, + "60": 488144896.0, + "61": 488144896.0, + "62": 488144896.0, + "63": 488144896.0, + "64": 488144896.0, + "65": 488144896.0, + "66": 488144896.0, + "67": 488144896.0, + "68": 488144896.0, + "69": 488144896.0, + "70": 488144896.0, + "71": 488144896.0, + "72": 488144896.0, + "73": 488144896.0, + "74": 488144896.0, + "75": 488144896.0, + "76": 488144896.0, + "77": 488144896.0, + "78": 488144896.0, + "79": 488144896.0, + "80": 488144896.0, + "81": 488144896.0, + "82": 488144896.0, + "83": 488144896.0, + "84": 488144896.0, + "85": 488144896.0, + "86": 488144896.0, + "87": 488144896.0, + "88": 488144896.0, + "89": 488144896.0, + "90": 488144896.0, + "91": 488144896.0, + "92": 488144896.0, + "93": 488144896.0, + "94": 488144896.0, + "95": 488144896.0, + "96": 488144896.0, + "97": 488144896.0, + "98": 488144896.0, + "99": 488144896.0, + "100": 488144896.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2158389248.0, + "2": 2340559872.0, + "3": 2340559872.0, + "4": 2340559872.0, + "5": 2340559872.0, + "6": 2340559872.0, + "7": 2340559872.0, + "8": 2340559872.0, + "9": 2340559872.0, + "10": 2340559872.0, + "11": 2340559872.0, + "12": 2340559872.0, + "13": 2340559872.0, + "14": 2340559872.0, + "15": 2340559872.0, + "16": 2340559872.0, + "17": 2340559872.0, + "18": 2340559872.0, + "19": 2340559872.0, + "20": 2340559872.0, + "21": 2340559872.0, + "22": 2340559872.0, + "23": 2340559872.0, + "24": 2340559872.0, + "25": 2340559872.0, + "26": 2340559872.0, + "27": 2340559872.0, + "28": 2340559872.0, + "29": 2340559872.0, + "30": 2340559872.0, + "31": 2340559872.0, + "32": 2340559872.0, + "33": 2340559872.0, + "34": 2340559872.0, + "35": 2340559872.0, + "36": 2340559872.0, + "37": 2340559872.0, + "38": 2340559872.0, + "39": 2340559872.0, + "40": 2340559872.0, + "41": 2340559872.0, + "42": 2342132736.0, + "43": 2342132736.0, + "44": 2342132736.0, + "45": 2342132736.0, + "46": 2342132736.0, + "47": 2342132736.0, + "48": 2342132736.0, + "49": 2342132736.0, + "50": 2342132736.0, + "51": 2342132736.0, + "52": 2342132736.0, + "53": 2342132736.0, + "54": 2342132736.0, + "55": 2342132736.0, + "56": 2342132736.0, + "57": 2342132736.0, + "58": 2342132736.0, + "59": 2342132736.0, + "60": 2342132736.0, + "61": 2342132736.0, + "62": 2342132736.0, + "63": 2342132736.0, + "64": 2342132736.0, + "65": 2342132736.0, + "66": 2342132736.0, + "67": 2342132736.0, + "68": 2342132736.0, + "69": 2342132736.0, + "70": 2342132736.0, + "71": 2342132736.0, + "72": 2342132736.0, + "73": 2342132736.0, + "74": 2342132736.0, + "75": 2342132736.0, + "76": 2342132736.0, + "77": 2342132736.0, + "78": 2342132736.0, + "79": 2342132736.0, + "80": 2342132736.0, + "81": 2342132736.0, + "82": 2342132736.0, + "83": 2342132736.0, + "84": 2342132736.0, + "85": 2342132736.0, + "86": 2342132736.0, + "87": 2342132736.0, + "88": 2342132736.0, + "89": 2342132736.0, + "90": 2342132736.0, + "91": 2342132736.0, + "92": 2342132736.0, + "93": 2342132736.0, + "94": 2342132736.0, + "95": 2342132736.0, + "96": 2342132736.0, + "97": 2342132736.0, + "98": 2342132736.0, + "99": 2342132736.0, + "100": 2342132736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.5603, + "2": 0.27395, + "3": 0.25016, + "4": 0.23465, + "5": 0.23169, + "6": 0.22889, + "7": 0.23765, + "8": 0.22887, + "9": 0.23381, + "10": 0.2266, + "11": 0.23432, + "12": 0.22287, + "13": 0.23838, + "14": 0.22383, + "15": 0.22359, + "16": 0.22462, + "17": 0.22449, + "18": 0.22452, + "19": 0.22358, + "20": 0.22653, + "21": 0.23567, + "22": 0.22469, + "23": 0.22426, + "24": 0.22314, + "25": 0.22088, + "26": 0.22435, + "27": 0.22371, + "28": 0.22374, + "29": 0.22621, + "30": 0.22269, + "31": 0.22968, + "32": 0.22354, + "33": 0.21974, + "34": 0.21973, + "35": 0.22162, + "36": 0.21927, + "37": 0.21792, + "38": 0.22161, + "39": 0.218, + "40": 0.2218, + "41": 0.22011, + "42": 0.21906, + "43": 0.45489, + "44": 0.21843, + "45": 0.21693, + "46": 0.22243, + "47": 0.21818, + "48": 0.22186, + "49": 0.21947, + "50": 0.21913, + "51": 0.23038, + "52": 0.43735, + "53": 0.22226, + "54": 0.22253, + "55": 0.22038, + "56": 0.22255, + "57": 0.22026, + "58": 0.22445, + "59": 0.22812, + "60": 0.22248, + "61": 0.22206, + "62": 0.22823, + "63": 0.22874, + "64": 0.22255, + "65": 0.22446, + "66": 0.2261, + "67": 0.22601, + "68": 0.2276, + "69": 0.22081, + "70": 0.22481, + "71": 0.22176, + "72": 0.22629, + "73": 0.22287, + "74": 0.22171, + "75": 0.23035, + "76": 0.23044, + "77": 0.23294, + "78": 0.22982, + "79": 0.23205, + "80": 0.23206, + "81": 0.23504, + "82": 0.22297, + "83": 0.22323, + "84": 0.21927, + "85": 0.22167, + "86": 0.22409, + "87": 0.2216, + "88": 0.22052, + "89": 0.22173, + "90": 0.22337, + "91": 0.21893, + "92": 0.22093, + "93": 0.21931, + "94": 0.2206, + "95": 0.22306, + "96": 0.2207, + "97": 0.22191, + "98": 0.22163, + "99": 0.22443, + "100": 0.21867 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 17196f707fe..3a9edd7e4f6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, "100": 9.39725 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, "100": 3128.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, "100": 517505536.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, "100": 1428695552.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 12.11861, - "5": 0.13752, - "10": 0.1366, - "15": 0.13654, - "20": 0.13695, - "25": 0.13215, - "30": 0.13388, - "35": 0.13399, - "40": 0.13296, - "45": 0.1338, - "50": 0.1346, - "55": 0.14239, - "60": 0.13127, - "65": 0.1338, - "70": 0.1338, - "75": 0.13194, - "80": 0.13347, - "85": 0.13297, - "90": 0.13212, - "95": 0.13413, - "100": 0.14016 + "1": 11.73094, + "2": 0.19559, + "3": 0.1642, + "4": 0.1606, + "5": 0.15484, + "6": 0.15429, + "7": 0.15295, + "8": 0.15498, + "9": 0.15721, + "10": 0.1545, + "11": 0.15341, + "12": 0.15604, + "13": 0.15488, + "14": 0.15754, + "15": 0.15556, + "16": 0.15659, + "17": 0.15948, + "18": 0.15489, + "19": 0.15826, + "20": 0.15555, + "21": 0.15514, + "22": 0.15475, + "23": 0.15663, + "24": 0.15606, + "25": 0.15661, + "26": 0.15687, + "27": 0.15374, + "28": 0.15858, + "29": 0.15645, + "30": 0.15976, + "31": 0.1537, + "32": 0.15299, + "33": 0.1537, + "34": 0.15989, + "35": 0.16418, + "36": 0.16174, + "37": 0.15863, + "38": 0.15554, + "39": 0.14997, + "40": 0.15226, + "41": 0.14966, + "42": 0.15127, + "43": 0.15105, + "44": 0.15192, + "45": 0.15376, + "46": 0.15087, + "47": 0.15236, + "48": 0.15124, + "49": 0.15141, + "50": 0.15372, + "51": 0.17295, + "52": 0.16619, + "53": 0.16729, + "54": 0.15813, + "55": 0.15026, + "56": 0.15186, + "57": 0.1532, + "58": 0.1539, + "59": 0.153, + "60": 0.15346, + "61": 0.15406, + "62": 0.15229, + "63": 0.15251, + "64": 0.15279, + "65": 0.15341, + "66": 0.15398, + "67": 0.15765, + "68": 0.15411, + "69": 0.15465, + "70": 0.15275, + "71": 0.15486, + "72": 0.15324, + "73": 0.1548, + "74": 0.15612, + "75": 0.15592, + "76": 0.15644, + "77": 0.15832, + "78": 0.15223, + "79": 0.1545, + "80": 0.15466, + "81": 0.1518, + "82": 0.15396, + "83": 0.15168, + "84": 0.15232, + "85": 0.15293, + "86": 0.15384, + "87": 0.15453, + "88": 0.15446, + "89": 0.15333, + "90": 0.1576, + "91": 0.15805, + "92": 0.15474, + "93": 0.15345, + "94": 0.15146, + "95": 0.15371, + "96": 0.15549, + "97": 0.15452, + "98": 0.15437, + "99": 0.15398, + "100": 0.15413 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..39079566d74 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, + "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, + "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, + "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, + "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, + "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, + "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, + "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, + "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, + "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, + "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, + "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, + "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, + "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, + "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, + "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, + "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, + "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, + "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, + "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, + "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, + "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, + "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, + "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, + "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, + "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, + "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, + "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, + "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, + "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, + "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, + "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, + "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, + "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, + "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, + "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, + "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, + "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, + "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, + "100": 3128.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, + "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, + "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, + "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, + "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, + "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, + "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, + "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, + "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, + "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, + "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, + "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, + "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, + "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, + "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, + "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, + "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, + "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, + "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, + "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, + "100": 517505536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, + "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, + "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, + "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, + "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, + "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, + "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, + "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, + "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, + "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, + "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, + "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, + "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, + "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, + "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, + "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, + "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, + "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, + "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, + "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, + "100": 1428695552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.53934, + "2": 0.16774, + "3": 0.13459, + "4": 0.13439, + "5": 0.13482, + "6": 0.13444, + "7": 0.13371, + "8": 0.1345, + "9": 0.13658, + "10": 0.13405, + "11": 0.13498, + "12": 0.13346, + "13": 0.13373, + "14": 0.14049, + "15": 0.13447, + "16": 0.13314, + "17": 0.13441, + "18": 0.14264, + "19": 0.15581, + "20": 0.14614, + "21": 0.14655, + "22": 0.14484, + "23": 0.13377, + "24": 0.13618, + "25": 0.13595, + "26": 0.13394, + "27": 0.13248, + "28": 0.13405, + "29": 0.13411, + "30": 0.13464, + "31": 0.13321, + "32": 0.134, + "33": 0.13496, + "34": 0.13356, + "35": 0.13325, + "36": 0.13329, + "37": 0.13359, + "38": 0.13442, + "39": 0.13494, + "40": 0.13456, + "41": 0.1333, + "42": 0.1357, + "43": 0.13407, + "44": 0.13499, + "45": 0.13371, + "46": 0.13423, + "47": 0.13545, + "48": 0.1355, + "49": 0.13329, + "50": 0.1329, + "51": 0.13926, + "52": 0.13217, + "53": 0.13369, + "54": 0.13177, + "55": 0.13062, + "56": 0.25118, + "57": 0.13283, + "58": 0.1331, + "59": 0.1388, + "60": 0.13244, + "61": 0.13219, + "62": 0.13234, + "63": 0.13297, + "64": 0.13104, + "65": 0.1339, + "66": 0.13079, + "67": 0.13112, + "68": 0.1322, + "69": 0.13305, + "70": 0.13172, + "71": 0.13249, + "72": 0.13138, + "73": 0.13329, + "74": 0.13115, + "75": 0.13263, + "76": 0.13234, + "77": 0.13051, + "78": 0.13097, + "79": 0.13092, + "80": 0.13147, + "81": 0.13202, + "82": 0.13235, + "83": 0.13167, + "84": 0.13099, + "85": 0.13063, + "86": 0.13192, + "87": 0.13259, + "88": 0.13267, + "89": 0.13154, + "90": 0.13131, + "91": 0.13195, + "92": 0.13132, + "93": 0.13226, + "94": 0.13075, + "95": 0.13002, + "96": 0.13313, + "97": 0.13202, + "98": 0.13321, + "99": 0.1318, + "100": 0.13349 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..7c1078c0b3d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86543, + "4": 10.84553, + "5": 10.88346, + "6": 10.89431, + "7": 10.87067, + "8": 10.86979, + "9": 10.86918, + "10": 10.83886, + "11": 10.8943, + "12": 10.87983, + "13": 10.87985, + "14": 10.90321, + "15": 10.84052, + "16": 10.83787, + "17": 10.80669, + "18": 10.83026, + "19": 10.82261, + "20": 10.73193, + "21": 10.70748, + "22": 10.56005, + "23": 10.72399, + "24": 10.61114, + "25": 10.54813, + "26": 10.61329, + "27": 10.63053, + "28": 10.56646, + "29": 10.59668, + "30": 10.37137, + "31": 10.11725, + "32": 10.46127, + "33": 10.45249, + "34": 10.2169, + "35": 10.27172, + "36": 10.23119, + "37": 10.34809, + "38": 10.1884, + "39": 10.41044, + "40": 10.09425, + "41": 10.14707, + "42": 10.21242, + "43": 9.84105, + "44": 9.95918, + "45": 9.84079, + "46": 9.82479, + "47": 10.13878, + "48": 9.85831, + "49": 9.54705, + "50": 9.90875, + "51": 9.8558, + "52": 9.75237, + "53": 10.07589, + "54": 9.95688, + "55": 9.88203, + "56": 9.6313, + "57": 9.48649, + "58": 9.83109, + "59": 9.58897, + "60": 9.50643, + "61": 9.70363, + "62": 9.98286, + "63": 9.38302, + "64": 9.77901, + "65": 8.95166, + "66": 9.70158, + "67": 9.37203, + "68": 9.78849, + "69": 9.79851, + "70": 9.74737, + "71": 9.61908, + "72": 9.58502, + "73": 9.49721, + "74": 8.93927, + "75": 9.42703, + "76": 9.0802, + "77": 10.06567, + "78": 9.72893, + "79": 9.3776, + "80": 9.40982, + "81": 9.47976, + "82": 9.7018, + "83": 9.30612, + "84": 9.4209, + "85": 9.61371, + "86": 9.07649, + "87": 9.5945, + "88": 9.75068, + "89": 9.60238, + "90": 9.81898, + "91": 9.33894, + "92": 9.35716, + "93": 9.07879, + "94": 8.83503, + "95": 9.52172, + "96": 9.53003, + "97": 9.31306, + "98": 9.67783, + "99": 8.89058, + "100": 9.39725 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1688.0, + "2": 1806.0, + "3": 1675.0, + "4": 1842.0, + "5": 1909.0, + "6": 1908.0, + "7": 1783.0, + "8": 1611.0, + "9": 1753.0, + "10": 1457.0, + "11": 1880.0, + "12": 1683.0, + "13": 1907.0, + "14": 1733.0, + "15": 1930.0, + "16": 1840.0, + "17": 1892.0, + "18": 1650.0, + "19": 1790.0, + "20": 1596.0, + "21": 1765.0, + "22": 1616.0, + "23": 1974.0, + "24": 1621.0, + "25": 1557.0, + "26": 1745.0, + "27": 1722.0, + "28": 1976.0, + "29": 2068.0, + "30": 1860.0, + "31": 1536.0, + "32": 1883.0, + "33": 2071.0, + "34": 1894.0, + "35": 1902.0, + "36": 1885.0, + "37": 2231.0, + "38": 2129.0, + "39": 2333.0, + "40": 2207.0, + "41": 2193.0, + "42": 2322.0, + "43": 2015.0, + "44": 2089.0, + "45": 2095.0, + "46": 2392.0, + "47": 2430.0, + "48": 2414.0, + "49": 2340.0, + "50": 2416.0, + "51": 2613.0, + "52": 2538.0, + "53": 2792.0, + "54": 2801.0, + "55": 2216.0, + "56": 2858.0, + "57": 2381.0, + "58": 2854.0, + "59": 2787.0, + "60": 2457.0, + "61": 2941.0, + "62": 2543.0, + "63": 2408.0, + "64": 2968.0, + "65": 2472.0, + "66": 2977.0, + "67": 2839.0, + "68": 2775.0, + "69": 2832.0, + "70": 3057.0, + "71": 2909.0, + "72": 2421.0, + "73": 2982.0, + "74": 1922.0, + "75": 2474.0, + "76": 3059.0, + "77": 3177.0, + "78": 3067.0, + "79": 3052.0, + "80": 3338.0, + "81": 3644.0, + "82": 3234.0, + "83": 2798.0, + "84": 3196.0, + "85": 3324.0, + "86": 2855.0, + "87": 3820.0, + "88": 2962.0, + "89": 3379.0, + "90": 3096.0, + "91": 2857.0, + "92": 3077.0, + "93": 2693.0, + "94": 3312.0, + "95": 3399.0, + "96": 3378.0, + "97": 3030.0, + "98": 3619.0, + "99": 3160.0, + "100": 3128.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 517505536.0, + "2": 517505536.0, + "3": 517505536.0, + "4": 517505536.0, + "5": 517505536.0, + "6": 517505536.0, + "7": 517505536.0, + "8": 517505536.0, + "9": 517505536.0, + "10": 517505536.0, + "11": 517505536.0, + "12": 517505536.0, + "13": 517505536.0, + "14": 517505536.0, + "15": 517505536.0, + "16": 517505536.0, + "17": 517505536.0, + "18": 517505536.0, + "19": 517505536.0, + "20": 517505536.0, + "21": 517505536.0, + "22": 517505536.0, + "23": 517505536.0, + "24": 517505536.0, + "25": 517505536.0, + "26": 517505536.0, + "27": 517505536.0, + "28": 517505536.0, + "29": 517505536.0, + "30": 517505536.0, + "31": 517505536.0, + "32": 517505536.0, + "33": 517505536.0, + "34": 517505536.0, + "35": 517505536.0, + "36": 517505536.0, + "37": 517505536.0, + "38": 517505536.0, + "39": 517505536.0, + "40": 517505536.0, + "41": 517505536.0, + "42": 517505536.0, + "43": 517505536.0, + "44": 517505536.0, + "45": 517505536.0, + "46": 517505536.0, + "47": 517505536.0, + "48": 517505536.0, + "49": 517505536.0, + "50": 517505536.0, + "51": 517505536.0, + "52": 517505536.0, + "53": 517505536.0, + "54": 517505536.0, + "55": 517505536.0, + "56": 517505536.0, + "57": 517505536.0, + "58": 517505536.0, + "59": 517505536.0, + "60": 517505536.0, + "61": 517505536.0, + "62": 517505536.0, + "63": 517505536.0, + "64": 517505536.0, + "65": 517505536.0, + "66": 517505536.0, + "67": 517505536.0, + "68": 517505536.0, + "69": 517505536.0, + "70": 517505536.0, + "71": 517505536.0, + "72": 517505536.0, + "73": 517505536.0, + "74": 517505536.0, + "75": 517505536.0, + "76": 517505536.0, + "77": 517505536.0, + "78": 517505536.0, + "79": 517505536.0, + "80": 517505536.0, + "81": 517505536.0, + "82": 517505536.0, + "83": 517505536.0, + "84": 517505536.0, + "85": 517505536.0, + "86": 517505536.0, + "87": 517505536.0, + "88": 517505536.0, + "89": 517505536.0, + "90": 517505536.0, + "91": 517505536.0, + "92": 517505536.0, + "93": 517505536.0, + "94": 517505536.0, + "95": 517505536.0, + "96": 517505536.0, + "97": 517505536.0, + "98": 517505536.0, + "99": 517505536.0, + "100": 517505536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1246524928.0, + "2": 1428695552.0, + "3": 1428695552.0, + "4": 1428695552.0, + "5": 1428695552.0, + "6": 1428695552.0, + "7": 1428695552.0, + "8": 1428695552.0, + "9": 1428695552.0, + "10": 1428695552.0, + "11": 1428695552.0, + "12": 1428695552.0, + "13": 1428695552.0, + "14": 1428695552.0, + "15": 1428695552.0, + "16": 1428695552.0, + "17": 1428695552.0, + "18": 1428695552.0, + "19": 1428695552.0, + "20": 1428695552.0, + "21": 1428695552.0, + "22": 1428695552.0, + "23": 1428695552.0, + "24": 1428695552.0, + "25": 1428695552.0, + "26": 1428695552.0, + "27": 1428695552.0, + "28": 1428695552.0, + "29": 1428695552.0, + "30": 1428695552.0, + "31": 1428695552.0, + "32": 1428695552.0, + "33": 1428695552.0, + "34": 1428695552.0, + "35": 1428695552.0, + "36": 1428695552.0, + "37": 1428695552.0, + "38": 1428695552.0, + "39": 1428695552.0, + "40": 1428695552.0, + "41": 1428695552.0, + "42": 1428695552.0, + "43": 1428695552.0, + "44": 1428695552.0, + "45": 1428695552.0, + "46": 1428695552.0, + "47": 1428695552.0, + "48": 1428695552.0, + "49": 1428695552.0, + "50": 1428695552.0, + "51": 1428695552.0, + "52": 1428695552.0, + "53": 1428695552.0, + "54": 1428695552.0, + "55": 1428695552.0, + "56": 1428695552.0, + "57": 1428695552.0, + "58": 1428695552.0, + "59": 1428695552.0, + "60": 1428695552.0, + "61": 1428695552.0, + "62": 1428695552.0, + "63": 1428695552.0, + "64": 1428695552.0, + "65": 1428695552.0, + "66": 1428695552.0, + "67": 1428695552.0, + "68": 1428695552.0, + "69": 1428695552.0, + "70": 1428695552.0, + "71": 1428695552.0, + "72": 1428695552.0, + "73": 1428695552.0, + "74": 1428695552.0, + "75": 1428695552.0, + "76": 1428695552.0, + "77": 1428695552.0, + "78": 1428695552.0, + "79": 1428695552.0, + "80": 1428695552.0, + "81": 1428695552.0, + "82": 1428695552.0, + "83": 1428695552.0, + "84": 1428695552.0, + "85": 1428695552.0, + "86": 1428695552.0, + "87": 1428695552.0, + "88": 1428695552.0, + "89": 1428695552.0, + "90": 1428695552.0, + "91": 1428695552.0, + "92": 1428695552.0, + "93": 1428695552.0, + "94": 1428695552.0, + "95": 1428695552.0, + "96": 1428695552.0, + "97": 1428695552.0, + "98": 1428695552.0, + "99": 1428695552.0, + "100": 1428695552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.96359, + "2": 0.17007, + "3": 0.15511, + "4": 0.15439, + "5": 0.15477, + "6": 0.15459, + "7": 0.15427, + "8": 0.15173, + "9": 0.15484, + "10": 0.15363, + "11": 0.15353, + "12": 0.15567, + "13": 0.15258, + "14": 0.15438, + "15": 0.15305, + "16": 0.15314, + "17": 0.15342, + "18": 0.15282, + "19": 0.15336, + "20": 0.15333, + "21": 0.15174, + "22": 0.15412, + "23": 0.15337, + "24": 0.15464, + "25": 0.15638, + "26": 0.15618, + "27": 0.15599, + "28": 0.15616, + "29": 0.15792, + "30": 0.15422, + "31": 0.15441, + "32": 0.15356, + "33": 0.15622, + "34": 0.15397, + "35": 0.15443, + "36": 0.15392, + "37": 0.15454, + "38": 0.15581, + "39": 0.15513, + "40": 0.15813, + "41": 0.1595, + "42": 0.15604, + "43": 0.15809, + "44": 0.15585, + "45": 0.15659, + "46": 0.15599, + "47": 0.15378, + "48": 0.15475, + "49": 0.1544, + "50": 0.15569, + "51": 0.16391, + "52": 0.16196, + "53": 0.16029, + "54": 0.16138, + "55": 0.15673, + "56": 0.1503, + "57": 0.15071, + "58": 0.15268, + "59": 0.15095, + "60": 0.15189, + "61": 0.15199, + "62": 0.14938, + "63": 0.15046, + "64": 0.14924, + "65": 0.15129, + "66": 0.14938, + "67": 0.15233, + "68": 0.15028, + "69": 0.1525, + "70": 0.15334, + "71": 0.15152, + "72": 0.15138, + "73": 0.15304, + "74": 0.1515, + "75": 0.15282, + "76": 0.1518, + "77": 0.15193, + "78": 0.15262, + "79": 0.15274, + "80": 0.15251, + "81": 0.15108, + "82": 0.15199, + "83": 0.15046, + "84": 0.15298, + "85": 0.15063, + "86": 0.15132, + "87": 0.15257, + "88": 0.15109, + "89": 0.1502, + "90": 0.15259, + "91": 0.15063, + "92": 0.15237, + "93": 0.15096, + "94": 0.1517, + "95": 0.15049, + "96": 0.15002, + "97": 0.15011, + "98": 0.15349, + "99": 0.1565, + "100": 0.15223 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..fb6afd47964 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 487096320.0, + "2": 487096320.0, + "3": 487096320.0, + "4": 487096320.0, + "5": 487096320.0, + "6": 487096320.0, + "7": 487096320.0, + "8": 487096320.0, + "9": 487096320.0, + "10": 487096320.0, + "11": 487096320.0, + "12": 487096320.0, + "13": 487096320.0, + "14": 487096320.0, + "15": 487096320.0, + "16": 487096320.0, + "17": 487096320.0, + "18": 487096320.0, + "19": 487096320.0, + "20": 487096320.0, + "21": 487096320.0, + "22": 487096320.0, + "23": 487096320.0, + "24": 487096320.0, + "25": 487096320.0, + "26": 487096320.0, + "27": 487096320.0, + "28": 487096320.0, + "29": 487096320.0, + "30": 487096320.0, + "31": 487096320.0, + "32": 487096320.0, + "33": 487096320.0, + "34": 487096320.0, + "35": 487096320.0, + "36": 487096320.0, + "37": 487096320.0, + "38": 487096320.0, + "39": 487096320.0, + "40": 487096320.0, + "41": 487096320.0, + "42": 487096320.0, + "43": 487096320.0, + "44": 487096320.0, + "45": 487096320.0, + "46": 487096320.0, + "47": 487096320.0, + "48": 487096320.0, + "49": 487096320.0, + "50": 487096320.0, + "51": 487096320.0, + "52": 487096320.0, + "53": 487096320.0, + "54": 487096320.0, + "55": 487096320.0, + "56": 487096320.0, + "57": 487096320.0, + "58": 487096320.0, + "59": 487096320.0, + "60": 487096320.0, + "61": 487096320.0, + "62": 487096320.0, + "63": 487096320.0, + "64": 487096320.0, + "65": 487096320.0, + "66": 487096320.0, + "67": 487096320.0, + "68": 487096320.0, + "69": 487096320.0, + "70": 487096320.0, + "71": 487096320.0, + "72": 487096320.0, + "73": 487096320.0, + "74": 487096320.0, + "75": 487096320.0, + "76": 487096320.0, + "77": 487096320.0, + "78": 487096320.0, + "79": 487096320.0, + "80": 487096320.0, + "81": 487096320.0, + "82": 487096320.0, + "83": 487096320.0, + "84": 487096320.0, + "85": 487096320.0, + "86": 487096320.0, + "87": 487096320.0, + "88": 487096320.0, + "89": 487096320.0, + "90": 487096320.0, + "91": 487096320.0, + "92": 487096320.0, + "93": 487096320.0, + "94": 487096320.0, + "95": 487096320.0, + "96": 487096320.0, + "97": 487096320.0, + "98": 487096320.0, + "99": 487096320.0, + "100": 487096320.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1720084480.0, + "2": 1900157952.0, + "3": 1900157952.0, + "4": 1900157952.0, + "5": 1900157952.0, + "6": 1900157952.0, + "7": 1900157952.0, + "8": 1900157952.0, + "9": 1900157952.0, + "10": 1900157952.0, + "11": 1900157952.0, + "12": 1900157952.0, + "13": 1900157952.0, + "14": 1900157952.0, + "15": 1900157952.0, + "16": 1900157952.0, + "17": 1900157952.0, + "18": 1900157952.0, + "19": 1900157952.0, + "20": 1900157952.0, + "21": 1900157952.0, + "22": 1900157952.0, + "23": 1900157952.0, + "24": 1900157952.0, + "25": 1900157952.0, + "26": 1900157952.0, + "27": 1900157952.0, + "28": 1900157952.0, + "29": 1900157952.0, + "30": 1900157952.0, + "31": 1900157952.0, + "32": 1900157952.0, + "33": 1900157952.0, + "34": 1900157952.0, + "35": 1900157952.0, + "36": 1900157952.0, + "37": 1900157952.0, + "38": 1900157952.0, + "39": 1900157952.0, + "40": 1900157952.0, + "41": 1900157952.0, + "42": 1900157952.0, + "43": 1900157952.0, + "44": 1900157952.0, + "45": 1900157952.0, + "46": 1900157952.0, + "47": 1900157952.0, + "48": 1900157952.0, + "49": 1900157952.0, + "50": 1900157952.0, + "51": 1900157952.0, + "52": 1900157952.0, + "53": 1900157952.0, + "54": 1900157952.0, + "55": 1900157952.0, + "56": 1900157952.0, + "57": 1900157952.0, + "58": 1900157952.0, + "59": 1900157952.0, + "60": 1900157952.0, + "61": 1900157952.0, + "62": 1900157952.0, + "63": 1900157952.0, + "64": 1900157952.0, + "65": 1900157952.0, + "66": 1900157952.0, + "67": 1900157952.0, + "68": 1900157952.0, + "69": 1900157952.0, + "70": 1900157952.0, + "71": 1900157952.0, + "72": 1900157952.0, + "73": 1900157952.0, + "74": 1900157952.0, + "75": 1900157952.0, + "76": 1900157952.0, + "77": 1900157952.0, + "78": 1900157952.0, + "79": 1900157952.0, + "80": 1900157952.0, + "81": 1900157952.0, + "82": 1900157952.0, + "83": 1900157952.0, + "84": 1900157952.0, + "85": 1900157952.0, + "86": 1900157952.0, + "87": 1900157952.0, + "88": 1900157952.0, + "89": 1900157952.0, + "90": 1900157952.0, + "91": 1900157952.0, + "92": 1900157952.0, + "93": 1900157952.0, + "94": 1900157952.0, + "95": 1900157952.0, + "96": 1900157952.0, + "97": 1900157952.0, + "98": 1900157952.0, + "99": 1900157952.0, + "100": 1900157952.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.82235, + "2": 0.57043, + "3": 0.23395, + "4": 0.22773, + "5": 0.23061, + "6": 0.22681, + "7": 0.22898, + "8": 0.22777, + "9": 0.23178, + "10": 0.22844, + "11": 0.22696, + "12": 0.22691, + "13": 0.22689, + "14": 0.22608, + "15": 0.22509, + "16": 0.22608, + "17": 0.22957, + "18": 0.22818, + "19": 0.22555, + "20": 0.22522, + "21": 0.22614, + "22": 0.22905, + "23": 0.22671, + "24": 0.22771, + "25": 0.22415, + "26": 0.22381, + "27": 0.22625, + "28": 0.22438, + "29": 0.22389, + "30": 0.22364, + "31": 0.22738, + "32": 0.2239, + "33": 0.22369, + "34": 0.2237, + "35": 0.22477, + "36": 0.22703, + "37": 0.22298, + "38": 0.22346, + "39": 0.22306, + "40": 0.22845, + "41": 0.2224, + "42": 0.22168, + "43": 0.22358, + "44": 0.22055, + "45": 0.22285, + "46": 0.21986, + "47": 0.21973, + "48": 0.22077, + "49": 0.47346, + "50": 0.21958, + "51": 0.23099, + "52": 0.22467, + "53": 0.22654, + "54": 0.22546, + "55": 0.2396, + "56": 0.28734, + "57": 0.3188, + "58": 0.30845, + "59": 0.2927, + "60": 0.26475, + "61": 0.31496, + "62": 0.32446, + "63": 0.27846, + "64": 0.29143, + "65": 0.28739, + "66": 0.25616, + "67": 0.23629, + "68": 0.22554, + "69": 0.22096, + "70": 0.22295, + "71": 0.22447, + "72": 0.22432, + "73": 0.22303, + "74": 0.22272, + "75": 0.22429, + "76": 0.22195, + "77": 0.21956, + "78": 0.22046, + "79": 0.22253, + "80": 0.22346, + "81": 0.22141, + "82": 0.22072, + "83": 0.22211, + "84": 0.22335, + "85": 0.22188, + "86": 0.21998, + "87": 0.22058, + "88": 0.22605, + "89": 0.22132, + "90": 0.22322, + "91": 0.22195, + "92": 0.22145, + "93": 0.22388, + "94": 0.2227, + "95": 0.21996, + "96": 0.22067, + "97": 0.22039, + "98": 0.22287, + "99": 0.22626, + "100": 0.22164 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..de7286cfa2d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 487096320.0, + "2": 487096320.0, + "3": 487096320.0, + "4": 487096320.0, + "5": 487096320.0, + "6": 487096320.0, + "7": 487096320.0, + "8": 487096320.0, + "9": 487096320.0, + "10": 487096320.0, + "11": 487096320.0, + "12": 487096320.0, + "13": 487096320.0, + "14": 487096320.0, + "15": 487096320.0, + "16": 487096320.0, + "17": 487096320.0, + "18": 487096320.0, + "19": 487096320.0, + "20": 487096320.0, + "21": 487096320.0, + "22": 487096320.0, + "23": 487096320.0, + "24": 487096320.0, + "25": 487096320.0, + "26": 487096320.0, + "27": 487096320.0, + "28": 487096320.0, + "29": 487096320.0, + "30": 487096320.0, + "31": 487096320.0, + "32": 487096320.0, + "33": 487096320.0, + "34": 487096320.0, + "35": 487096320.0, + "36": 487096320.0, + "37": 487096320.0, + "38": 487096320.0, + "39": 487096320.0, + "40": 487096320.0, + "41": 487096320.0, + "42": 487096320.0, + "43": 487096320.0, + "44": 487096320.0, + "45": 487096320.0, + "46": 487096320.0, + "47": 487096320.0, + "48": 487096320.0, + "49": 487096320.0, + "50": 487096320.0, + "51": 487096320.0, + "52": 487096320.0, + "53": 487096320.0, + "54": 487096320.0, + "55": 487096320.0, + "56": 487096320.0, + "57": 487096320.0, + "58": 487096320.0, + "59": 487096320.0, + "60": 487096320.0, + "61": 487096320.0, + "62": 487096320.0, + "63": 487096320.0, + "64": 487096320.0, + "65": 487096320.0, + "66": 487096320.0, + "67": 487096320.0, + "68": 487096320.0, + "69": 487096320.0, + "70": 487096320.0, + "71": 487096320.0, + "72": 487096320.0, + "73": 487096320.0, + "74": 487096320.0, + "75": 487096320.0, + "76": 487096320.0, + "77": 487096320.0, + "78": 487096320.0, + "79": 487096320.0, + "80": 487096320.0, + "81": 487096320.0, + "82": 487096320.0, + "83": 487096320.0, + "84": 487096320.0, + "85": 487096320.0, + "86": 487096320.0, + "87": 487096320.0, + "88": 487096320.0, + "89": 487096320.0, + "90": 487096320.0, + "91": 487096320.0, + "92": 487096320.0, + "93": 487096320.0, + "94": 487096320.0, + "95": 487096320.0, + "96": 487096320.0, + "97": 487096320.0, + "98": 487096320.0, + "99": 487096320.0, + "100": 487096320.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1720084480.0, + "2": 1900157952.0, + "3": 1900157952.0, + "4": 1900157952.0, + "5": 1900157952.0, + "6": 1900157952.0, + "7": 1900157952.0, + "8": 1900157952.0, + "9": 1900157952.0, + "10": 1900157952.0, + "11": 1900157952.0, + "12": 1900157952.0, + "13": 1900157952.0, + "14": 1900157952.0, + "15": 1900157952.0, + "16": 1900157952.0, + "17": 1900157952.0, + "18": 1900157952.0, + "19": 1900157952.0, + "20": 1900157952.0, + "21": 1900157952.0, + "22": 1900157952.0, + "23": 1900157952.0, + "24": 1900157952.0, + "25": 1900157952.0, + "26": 1900157952.0, + "27": 1900157952.0, + "28": 1900157952.0, + "29": 1900157952.0, + "30": 1900157952.0, + "31": 1900157952.0, + "32": 1900157952.0, + "33": 1900157952.0, + "34": 1900157952.0, + "35": 1900157952.0, + "36": 1900157952.0, + "37": 1900157952.0, + "38": 1900157952.0, + "39": 1900157952.0, + "40": 1900157952.0, + "41": 1900157952.0, + "42": 1900157952.0, + "43": 1900157952.0, + "44": 1900157952.0, + "45": 1900157952.0, + "46": 1900157952.0, + "47": 1900157952.0, + "48": 1900157952.0, + "49": 1900157952.0, + "50": 1900157952.0, + "51": 1900157952.0, + "52": 1900157952.0, + "53": 1900157952.0, + "54": 1900157952.0, + "55": 1900157952.0, + "56": 1900157952.0, + "57": 1900157952.0, + "58": 1900157952.0, + "59": 1900157952.0, + "60": 1900157952.0, + "61": 1900157952.0, + "62": 1900157952.0, + "63": 1900157952.0, + "64": 1900157952.0, + "65": 1900157952.0, + "66": 1900157952.0, + "67": 1900157952.0, + "68": 1900157952.0, + "69": 1900157952.0, + "70": 1900157952.0, + "71": 1900157952.0, + "72": 1900157952.0, + "73": 1900157952.0, + "74": 1900157952.0, + "75": 1900157952.0, + "76": 1900157952.0, + "77": 1900157952.0, + "78": 1900157952.0, + "79": 1900157952.0, + "80": 1900157952.0, + "81": 1900157952.0, + "82": 1900157952.0, + "83": 1900157952.0, + "84": 1900157952.0, + "85": 1900157952.0, + "86": 1900157952.0, + "87": 1900157952.0, + "88": 1900157952.0, + "89": 1900157952.0, + "90": 1900157952.0, + "91": 1900157952.0, + "92": 1900157952.0, + "93": 1900157952.0, + "94": 1900157952.0, + "95": 1900157952.0, + "96": 1900157952.0, + "97": 1900157952.0, + "98": 1900157952.0, + "99": 1900157952.0, + "100": 1900157952.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.18635, + "2": 0.51143, + "3": 0.22467, + "4": 0.22383, + "5": 0.22656, + "6": 0.22198, + "7": 0.22714, + "8": 0.22548, + "9": 0.22693, + "10": 0.22495, + "11": 0.22373, + "12": 0.22603, + "13": 0.22383, + "14": 0.22775, + "15": 0.2246, + "16": 0.22631, + "17": 0.22428, + "18": 0.22651, + "19": 0.22468, + "20": 0.22662, + "21": 0.22656, + "22": 0.22412, + "23": 0.2244, + "24": 0.22387, + "25": 0.22714, + "26": 0.22328, + "27": 0.22509, + "28": 0.22418, + "29": 0.22427, + "30": 0.22512, + "31": 0.22375, + "32": 0.22369, + "33": 0.22403, + "34": 0.22748, + "35": 0.22797, + "36": 0.2259, + "37": 0.22337, + "38": 0.22614, + "39": 0.22328, + "40": 0.22898, + "41": 0.23448, + "42": 0.43469, + "43": 0.22427, + "44": 0.22708, + "45": 0.22289, + "46": 0.22786, + "47": 0.22274, + "48": 0.22383, + "49": 0.22317, + "50": 0.22534, + "51": 0.24991, + "52": 0.24511, + "53": 0.24212, + "54": 0.24477, + "55": 0.43963, + "56": 0.24504, + "57": 0.24214, + "58": 0.2444, + "59": 0.24255, + "60": 0.24252, + "61": 0.24317, + "62": 0.2455, + "63": 0.2441, + "64": 0.24309, + "65": 0.24205, + "66": 0.24822, + "67": 0.24294, + "68": 0.24294, + "69": 0.24265, + "70": 0.24445, + "71": 0.24281, + "72": 0.2431, + "73": 0.24193, + "74": 0.24487, + "75": 0.24331, + "76": 0.24509, + "77": 0.24318, + "78": 0.24248, + "79": 0.24489, + "80": 0.24557, + "81": 0.24722, + "82": 0.24377, + "83": 0.24576, + "84": 0.24463, + "85": 0.24362, + "86": 0.2432, + "87": 0.24588, + "88": 0.2452, + "89": 0.24361, + "90": 0.24371, + "91": 0.24472, + "92": 0.24381, + "93": 0.24279, + "94": 0.24377, + "95": 0.24609, + "96": 0.24562, + "97": 0.2436, + "98": 0.24534, + "99": 0.24537, + "100": 0.24419 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..4feab32a5b8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 482498560.0, + "2": 482498560.0, + "3": 482498560.0, + "4": 482498560.0, + "5": 482498560.0, + "6": 482498560.0, + "7": 482498560.0, + "8": 482498560.0, + "9": 482498560.0, + "10": 482498560.0, + "11": 482498560.0, + "12": 482498560.0, + "13": 482498560.0, + "14": 482498560.0, + "15": 482498560.0, + "16": 482498560.0, + "17": 482498560.0, + "18": 482498560.0, + "19": 482498560.0, + "20": 482498560.0, + "21": 482498560.0, + "22": 482498560.0, + "23": 482498560.0, + "24": 482498560.0, + "25": 482498560.0, + "26": 482498560.0, + "27": 482498560.0, + "28": 482498560.0, + "29": 482498560.0, + "30": 482498560.0, + "31": 482498560.0, + "32": 482498560.0, + "33": 482498560.0, + "34": 482498560.0, + "35": 482498560.0, + "36": 482498560.0, + "37": 482498560.0, + "38": 482498560.0, + "39": 482498560.0, + "40": 482498560.0, + "41": 482498560.0, + "42": 482498560.0, + "43": 482498560.0, + "44": 482498560.0, + "45": 482498560.0, + "46": 482498560.0, + "47": 482498560.0, + "48": 482498560.0, + "49": 482498560.0, + "50": 482498560.0, + "51": 482498560.0, + "52": 482498560.0, + "53": 482498560.0, + "54": 482498560.0, + "55": 482498560.0, + "56": 482498560.0, + "57": 482498560.0, + "58": 482498560.0, + "59": 482498560.0, + "60": 482498560.0, + "61": 482498560.0, + "62": 482498560.0, + "63": 482498560.0, + "64": 482498560.0, + "65": 482498560.0, + "66": 482498560.0, + "67": 482498560.0, + "68": 482498560.0, + "69": 482498560.0, + "70": 482498560.0, + "71": 482498560.0, + "72": 482498560.0, + "73": 482498560.0, + "74": 482498560.0, + "75": 482498560.0, + "76": 482498560.0, + "77": 482498560.0, + "78": 482498560.0, + "79": 482498560.0, + "80": 482498560.0, + "81": 482498560.0, + "82": 482498560.0, + "83": 482498560.0, + "84": 482498560.0, + "85": 482498560.0, + "86": 482498560.0, + "87": 482498560.0, + "88": 482498560.0, + "89": 482498560.0, + "90": 482498560.0, + "91": 482498560.0, + "92": 482498560.0, + "93": 482498560.0, + "94": 482498560.0, + "95": 482498560.0, + "96": 482498560.0, + "97": 482498560.0, + "98": 482498560.0, + "99": 482498560.0, + "100": 482498560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1712340992.0, + "2": 1891365888.0, + "3": 1891365888.0, + "4": 1891365888.0, + "5": 1891365888.0, + "6": 1891365888.0, + "7": 1891365888.0, + "8": 1891365888.0, + "9": 1891365888.0, + "10": 1891365888.0, + "11": 1891365888.0, + "12": 1891365888.0, + "13": 1891365888.0, + "14": 1891365888.0, + "15": 1891365888.0, + "16": 1891365888.0, + "17": 1891365888.0, + "18": 1891365888.0, + "19": 1891365888.0, + "20": 1891365888.0, + "21": 1891365888.0, + "22": 1891365888.0, + "23": 1891365888.0, + "24": 1891365888.0, + "25": 1891365888.0, + "26": 1891365888.0, + "27": 1891365888.0, + "28": 1891365888.0, + "29": 1891365888.0, + "30": 1891365888.0, + "31": 1891365888.0, + "32": 1891365888.0, + "33": 1891365888.0, + "34": 1891365888.0, + "35": 1891365888.0, + "36": 1891365888.0, + "37": 1891365888.0, + "38": 1891365888.0, + "39": 1891365888.0, + "40": 1891365888.0, + "41": 1891365888.0, + "42": 1891365888.0, + "43": 1891365888.0, + "44": 1891365888.0, + "45": 1891365888.0, + "46": 1891365888.0, + "47": 1891365888.0, + "48": 1891365888.0, + "49": 1891365888.0, + "50": 1891365888.0, + "51": 1891365888.0, + "52": 1891365888.0, + "53": 1891365888.0, + "54": 1891365888.0, + "55": 1891365888.0, + "56": 1891365888.0, + "57": 1891365888.0, + "58": 1891365888.0, + "59": 1891365888.0, + "60": 1891365888.0, + "61": 1891365888.0, + "62": 1891365888.0, + "63": 1891365888.0, + "64": 1891365888.0, + "65": 1891365888.0, + "66": 1891365888.0, + "67": 1891365888.0, + "68": 1891365888.0, + "69": 1891365888.0, + "70": 1891365888.0, + "71": 1891365888.0, + "72": 1891365888.0, + "73": 1891365888.0, + "74": 1891365888.0, + "75": 1891365888.0, + "76": 1891365888.0, + "77": 1891365888.0, + "78": 1891365888.0, + "79": 1891365888.0, + "80": 1891365888.0, + "81": 1891365888.0, + "82": 1891365888.0, + "83": 1891365888.0, + "84": 1891365888.0, + "85": 1891365888.0, + "86": 1891365888.0, + "87": 1891365888.0, + "88": 1891365888.0, + "89": 1891365888.0, + "90": 1891365888.0, + "91": 1891365888.0, + "92": 1891365888.0, + "93": 1891365888.0, + "94": 1891365888.0, + "95": 1891365888.0, + "96": 1891365888.0, + "97": 1891365888.0, + "98": 1891365888.0, + "99": 1891365888.0, + "100": 1891365888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.54319, + "2": 0.26722, + "3": 0.22179, + "4": 0.22153, + "5": 0.22721, + "6": 0.22318, + "7": 0.22305, + "8": 0.26638, + "9": 0.25699, + "10": 0.22617, + "11": 0.22964, + "12": 0.22917, + "13": 0.22422, + "14": 0.22513, + "15": 0.22324, + "16": 0.22185, + "17": 0.2209, + "18": 0.229, + "19": 0.22105, + "20": 0.22048, + "21": 0.22339, + "22": 0.22351, + "23": 0.22154, + "24": 0.22155, + "25": 0.22184, + "26": 0.22048, + "27": 0.22559, + "28": 0.22037, + "29": 0.22036, + "30": 0.2223, + "31": 0.22392, + "32": 0.22147, + "33": 0.22201, + "34": 0.21977, + "35": 0.22008, + "36": 0.22582, + "37": 0.21924, + "38": 0.22002, + "39": 0.22005, + "40": 0.22002, + "41": 0.22508, + "42": 0.21887, + "43": 0.21999, + "44": 0.21904, + "45": 0.22339, + "46": 0.21983, + "47": 0.21914, + "48": 0.21981, + "49": 0.22038, + "50": 0.22179, + "51": 0.44158, + "52": 0.22072, + "53": 0.2216, + "54": 0.21972, + "55": 0.2224, + "56": 0.21985, + "57": 0.21947, + "58": 0.22049, + "59": 0.22101, + "60": 0.41998, + "61": 0.22036, + "62": 0.22068, + "63": 0.223, + "64": 0.2206, + "65": 0.21966, + "66": 0.22032, + "67": 0.22009, + "68": 0.22359, + "69": 0.21962, + "70": 0.21951, + "71": 0.21979, + "72": 0.22305, + "73": 0.22044, + "74": 0.21963, + "75": 0.21954, + "76": 0.22086, + "77": 0.22567, + "78": 0.21994, + "79": 0.21942, + "80": 0.21927, + "81": 0.22743, + "82": 0.21995, + "83": 0.21975, + "84": 0.2199, + "85": 0.22001, + "86": 0.22586, + "87": 0.22037, + "88": 0.21916, + "89": 0.22024, + "90": 0.22297, + "91": 0.22249, + "92": 0.21959, + "93": 0.21938, + "94": 0.22092, + "95": 0.2253, + "96": 0.21981, + "97": 0.21968, + "98": 0.22037, + "99": 0.22237, + "100": 0.22281 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..8ac6c3744df --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92705, + "2": 10.92645, + "3": 10.91604, + "4": 10.90911, + "5": 10.92795, + "6": 10.93626, + "7": 10.90626, + "8": 10.92128, + "9": 10.90998, + "10": 10.90786, + "11": 10.89335, + "12": 10.92456, + "13": 10.9146, + "14": 10.9213, + "15": 10.88314, + "16": 10.87325, + "17": 10.84129, + "18": 10.87276, + "19": 10.8563, + "20": 10.77629, + "21": 10.74869, + "22": 10.63031, + "23": 10.75678, + "24": 10.65646, + "25": 10.59141, + "26": 10.65375, + "27": 10.6485, + "28": 10.59548, + "29": 10.6088, + "30": 10.39192, + "31": 10.15753, + "32": 10.49098, + "33": 10.4793, + "34": 10.24058, + "35": 10.29686, + "36": 10.24644, + "37": 10.35232, + "38": 10.20489, + "39": 10.4052, + "40": 10.0964, + "41": 10.15175, + "42": 10.22026, + "43": 9.85499, + "44": 9.96143, + "45": 9.84464, + "46": 9.83801, + "47": 10.13988, + "48": 9.85718, + "49": 9.53698, + "50": 9.90918, + "51": 9.84886, + "52": 9.74154, + "53": 10.06347, + "54": 9.94683, + "55": 9.87762, + "56": 9.6274, + "57": 9.47112, + "58": 9.82925, + "59": 9.58253, + "60": 9.49121, + "61": 9.69956, + "62": 9.97968, + "63": 9.37277, + "64": 9.77468, + "65": 8.94236, + "66": 9.6991, + "67": 9.36382, + "68": 9.78787, + "69": 9.78332, + "70": 9.72266, + "71": 9.60801, + "72": 9.58459, + "73": 9.48963, + "74": 8.94871, + "75": 9.41912, + "76": 9.08725, + "77": 10.06354, + "78": 9.72835, + "79": 9.37162, + "80": 9.40077, + "81": 9.47843, + "82": 9.69177, + "83": 9.3076, + "84": 9.41232, + "85": 9.61207, + "86": 9.07599, + "87": 9.59468, + "88": 9.74738, + "89": 9.60686, + "90": 9.81015, + "91": 9.34359, + "92": 9.36482, + "93": 9.07761, + "94": 8.83108, + "95": 9.51716, + "96": 9.52447, + "97": 9.31027, + "98": 9.67892, + "99": 8.88832, + "100": 9.4015 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1627.0, + "2": 1801.0, + "3": 1730.0, + "4": 1762.0, + "5": 2010.0, + "6": 1889.0, + "7": 1888.0, + "8": 1729.0, + "9": 1852.0, + "10": 1368.0, + "11": 1973.0, + "12": 1722.0, + "13": 1966.0, + "14": 1874.0, + "15": 1897.0, + "16": 1785.0, + "17": 1942.0, + "18": 1718.0, + "19": 1716.0, + "20": 1626.0, + "21": 1797.0, + "22": 1673.0, + "23": 1937.0, + "24": 1561.0, + "25": 1743.0, + "26": 1917.0, + "27": 1886.0, + "28": 1968.0, + "29": 2029.0, + "30": 1930.0, + "31": 1635.0, + "32": 1974.0, + "33": 2159.0, + "34": 2035.0, + "35": 1954.0, + "36": 1948.0, + "37": 2317.0, + "38": 2312.0, + "39": 2458.0, + "40": 2199.0, + "41": 2352.0, + "42": 2288.0, + "43": 2005.0, + "44": 2191.0, + "45": 2068.0, + "46": 2272.0, + "47": 2530.0, + "48": 2458.0, + "49": 2252.0, + "50": 2460.0, + "51": 2777.0, + "52": 2659.0, + "53": 2959.0, + "54": 2700.0, + "55": 2427.0, + "56": 2797.0, + "57": 2430.0, + "58": 3077.0, + "59": 2781.0, + "60": 2380.0, + "61": 2816.0, + "62": 2812.0, + "63": 2452.0, + "64": 2958.0, + "65": 2657.0, + "66": 3208.0, + "67": 2786.0, + "68": 2842.0, + "69": 2927.0, + "70": 3265.0, + "71": 3098.0, + "72": 2445.0, + "73": 3120.0, + "74": 1900.0, + "75": 2675.0, + "76": 3065.0, + "77": 3452.0, + "78": 3263.0, + "79": 3398.0, + "80": 3434.0, + "81": 3695.0, + "82": 3308.0, + "83": 2935.0, + "84": 3423.0, + "85": 3302.0, + "86": 2785.0, + "87": 3788.0, + "88": 3030.0, + "89": 3532.0, + "90": 3230.0, + "91": 2681.0, + "92": 3175.0, + "93": 2718.0, + "94": 3392.0, + "95": 3340.0, + "96": 3504.0, + "97": 3227.0, + "98": 3757.0, + "99": 3245.0, + "100": 3291.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 482498560.0, + "2": 482498560.0, + "3": 482498560.0, + "4": 482498560.0, + "5": 482498560.0, + "6": 482498560.0, + "7": 482498560.0, + "8": 482498560.0, + "9": 482498560.0, + "10": 482498560.0, + "11": 482498560.0, + "12": 482498560.0, + "13": 482498560.0, + "14": 482498560.0, + "15": 482498560.0, + "16": 482498560.0, + "17": 482498560.0, + "18": 482498560.0, + "19": 482498560.0, + "20": 482498560.0, + "21": 482498560.0, + "22": 482498560.0, + "23": 482498560.0, + "24": 482498560.0, + "25": 482498560.0, + "26": 482498560.0, + "27": 482498560.0, + "28": 482498560.0, + "29": 482498560.0, + "30": 482498560.0, + "31": 482498560.0, + "32": 482498560.0, + "33": 482498560.0, + "34": 482498560.0, + "35": 482498560.0, + "36": 482498560.0, + "37": 482498560.0, + "38": 482498560.0, + "39": 482498560.0, + "40": 482498560.0, + "41": 482498560.0, + "42": 482498560.0, + "43": 482498560.0, + "44": 482498560.0, + "45": 482498560.0, + "46": 482498560.0, + "47": 482498560.0, + "48": 482498560.0, + "49": 482498560.0, + "50": 482498560.0, + "51": 482498560.0, + "52": 482498560.0, + "53": 482498560.0, + "54": 482498560.0, + "55": 482498560.0, + "56": 482498560.0, + "57": 482498560.0, + "58": 482498560.0, + "59": 482498560.0, + "60": 482498560.0, + "61": 482498560.0, + "62": 482498560.0, + "63": 482498560.0, + "64": 482498560.0, + "65": 482498560.0, + "66": 482498560.0, + "67": 482498560.0, + "68": 482498560.0, + "69": 482498560.0, + "70": 482498560.0, + "71": 482498560.0, + "72": 482498560.0, + "73": 482498560.0, + "74": 482498560.0, + "75": 482498560.0, + "76": 482498560.0, + "77": 482498560.0, + "78": 482498560.0, + "79": 482498560.0, + "80": 482498560.0, + "81": 482498560.0, + "82": 482498560.0, + "83": 482498560.0, + "84": 482498560.0, + "85": 482498560.0, + "86": 482498560.0, + "87": 482498560.0, + "88": 482498560.0, + "89": 482498560.0, + "90": 482498560.0, + "91": 482498560.0, + "92": 482498560.0, + "93": 482498560.0, + "94": 482498560.0, + "95": 482498560.0, + "96": 482498560.0, + "97": 482498560.0, + "98": 482498560.0, + "99": 482498560.0, + "100": 482498560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1712340992.0, + "2": 1891365888.0, + "3": 1891365888.0, + "4": 1891365888.0, + "5": 1891365888.0, + "6": 1891365888.0, + "7": 1891365888.0, + "8": 1891365888.0, + "9": 1891365888.0, + "10": 1891365888.0, + "11": 1891365888.0, + "12": 1891365888.0, + "13": 1891365888.0, + "14": 1891365888.0, + "15": 1891365888.0, + "16": 1891365888.0, + "17": 1891365888.0, + "18": 1891365888.0, + "19": 1891365888.0, + "20": 1891365888.0, + "21": 1891365888.0, + "22": 1891365888.0, + "23": 1891365888.0, + "24": 1891365888.0, + "25": 1891365888.0, + "26": 1891365888.0, + "27": 1891365888.0, + "28": 1891365888.0, + "29": 1891365888.0, + "30": 1891365888.0, + "31": 1891365888.0, + "32": 1891365888.0, + "33": 1891365888.0, + "34": 1891365888.0, + "35": 1891365888.0, + "36": 1891365888.0, + "37": 1891365888.0, + "38": 1891365888.0, + "39": 1891365888.0, + "40": 1891365888.0, + "41": 1891365888.0, + "42": 1891365888.0, + "43": 1891365888.0, + "44": 1891365888.0, + "45": 1891365888.0, + "46": 1891365888.0, + "47": 1891365888.0, + "48": 1891365888.0, + "49": 1891365888.0, + "50": 1891365888.0, + "51": 1891365888.0, + "52": 1891365888.0, + "53": 1891365888.0, + "54": 1891365888.0, + "55": 1891365888.0, + "56": 1891365888.0, + "57": 1891365888.0, + "58": 1891365888.0, + "59": 1891365888.0, + "60": 1891365888.0, + "61": 1891365888.0, + "62": 1891365888.0, + "63": 1891365888.0, + "64": 1891365888.0, + "65": 1891365888.0, + "66": 1891365888.0, + "67": 1891365888.0, + "68": 1891365888.0, + "69": 1891365888.0, + "70": 1891365888.0, + "71": 1891365888.0, + "72": 1891365888.0, + "73": 1891365888.0, + "74": 1891365888.0, + "75": 1891365888.0, + "76": 1891365888.0, + "77": 1891365888.0, + "78": 1891365888.0, + "79": 1891365888.0, + "80": 1891365888.0, + "81": 1891365888.0, + "82": 1891365888.0, + "83": 1891365888.0, + "84": 1891365888.0, + "85": 1891365888.0, + "86": 1891365888.0, + "87": 1891365888.0, + "88": 1891365888.0, + "89": 1891365888.0, + "90": 1891365888.0, + "91": 1891365888.0, + "92": 1891365888.0, + "93": 1891365888.0, + "94": 1891365888.0, + "95": 1891365888.0, + "96": 1891365888.0, + "97": 1891365888.0, + "98": 1891365888.0, + "99": 1891365888.0, + "100": 1891365888.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.02291, + "2": 0.25698, + "3": 0.22494, + "4": 0.22549, + "5": 0.22123, + "6": 0.22199, + "7": 0.22201, + "8": 0.22481, + "9": 0.22513, + "10": 0.22241, + "11": 0.22332, + "12": 0.22223, + "13": 0.22628, + "14": 0.22248, + "15": 0.22165, + "16": 0.22121, + "17": 0.224, + "18": 0.22329, + "19": 0.22788, + "20": 0.22088, + "21": 0.22171, + "22": 0.2267, + "23": 0.2231, + "24": 0.22082, + "25": 0.22278, + "26": 0.22362, + "27": 0.22127, + "28": 0.22083, + "29": 0.22007, + "30": 0.22168, + "31": 0.22562, + "32": 0.22252, + "33": 0.22134, + "34": 0.22034, + "35": 0.22446, + "36": 0.22435, + "37": 0.21955, + "38": 0.22888, + "39": 0.22007, + "40": 0.22467, + "41": 0.22235, + "42": 0.22037, + "43": 0.21987, + "44": 0.22161, + "45": 0.22407, + "46": 0.21928, + "47": 0.21937, + "48": 0.22055, + "49": 0.22041, + "50": 0.21825, + "51": 0.23094, + "52": 0.22395, + "53": 0.22444, + "54": 0.22304, + "55": 0.22247, + "56": 0.22274, + "57": 0.22315, + "58": 0.22428, + "59": 0.22249, + "60": 0.22237, + "61": 0.22311, + "62": 0.2253, + "63": 0.22199, + "64": 0.22192, + "65": 0.22225, + "66": 0.22273, + "67": 0.22186, + "68": 0.22015, + "69": 0.22083, + "70": 0.22201, + "71": 0.22474, + "72": 0.22079, + "73": 0.22118, + "74": 0.22105, + "75": 0.22105, + "76": 0.22207, + "77": 0.22072, + "78": 0.22157, + "79": 0.22114, + "80": 0.22667, + "81": 0.22112, + "82": 0.22055, + "83": 0.22095, + "84": 0.22242, + "85": 0.22302, + "86": 0.22037, + "87": 0.22095, + "88": 0.22048, + "89": 0.22998, + "90": 0.22099, + "91": 0.22067, + "92": 0.2202, + "93": 0.22164, + "94": 0.22306, + "95": 0.22015, + "96": 0.22081, + "97": 0.22074, + "98": 0.22695, + "99": 0.22087, + "100": 0.22052 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index aa3c5f5d2a9..b052742de3f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.85949, + "2": 10.85553, + "3": 10.86546, + "4": 10.84554, "5": 10.88348, + "6": 10.89432, + "7": 10.87067, + "8": 10.86981, + "9": 10.86919, "10": 10.83887, + "11": 10.89435, + "12": 10.87982, + "13": 10.87988, + "14": 10.90314, "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83027, + "19": 10.82259, "20": 10.73192, + "21": 10.70753, + "22": 10.56005, + "23": 10.72402, + "24": 10.6111, "25": 10.54815, + "26": 10.61332, + "27": 10.63056, + "28": 10.56645, + "29": 10.59668, "30": 10.37137, + "31": 10.1172, + "32": 10.4613, + "33": 10.45249, + "34": 10.2169, "35": 10.27173, + "36": 10.23118, + "37": 10.34812, + "38": 10.1884, + "39": 10.41042, "40": 10.09426, + "41": 10.1471, + "42": 10.21243, + "43": 9.8411, + "44": 9.95916, "45": 9.84085, + "46": 9.8248, + "47": 10.1388, + "48": 9.8584, + "49": 9.5472, "50": 9.90878, + "51": 9.85583, + "52": 9.75242, + "53": 10.07589, + "54": 9.95688, "55": 9.88208, + "56": 9.63141, + "57": 9.48651, + "58": 9.83118, + "59": 9.58905, "60": 9.50651, + "61": 9.7037, + "62": 9.98291, + "63": 9.38315, + "64": 9.77906, "65": 8.95179, + "66": 9.7016, + "67": 9.37206, + "68": 9.78852, + "69": 9.79859, "70": 9.74746, + "71": 9.6191, + "72": 9.58502, + "73": 9.49725, + "74": 8.93933, "75": 9.42706, + "76": 9.08024, + "77": 10.06571, + "78": 9.72896, + "79": 9.37772, "80": 9.40999, + "81": 9.47983, + "82": 9.70184, + "83": 9.30625, + "84": 9.42095, "85": 9.61378, + "86": 9.07656, + "87": 9.59458, + "88": 9.75068, + "89": 9.60243, "90": 9.81901, + "91": 9.33899, + "92": 9.35717, + "93": 9.07883, + "94": 8.8351, "95": 9.52171, + "96": 9.53008, + "97": 9.31309, + "98": 9.67785, + "99": 8.89061, "100": 9.39726 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1638.0, + "2": 1766.0, + "3": 1620.0, + "4": 1810.0, "5": 1844.0, + "6": 1835.0, + "7": 1694.0, + "8": 1632.0, + "9": 1902.0, "10": 1427.0, + "11": 1932.0, + "12": 1705.0, + "13": 1834.0, + "14": 1807.0, "15": 1907.0, + "16": 1797.0, + "17": 1911.0, + "18": 1667.0, + "19": 1742.0, "20": 1662.0, + "21": 1853.0, + "22": 1621.0, + "23": 2010.0, + "24": 1546.0, "25": 1510.0, + "26": 1664.0, + "27": 1722.0, + "28": 1977.0, + "29": 2024.0, "30": 1873.0, + "31": 1494.0, + "32": 1890.0, + "33": 2067.0, + "34": 1802.0, "35": 1873.0, + "36": 1954.0, + "37": 2283.0, + "38": 2076.0, + "39": 2280.0, "40": 2111.0, + "41": 2318.0, + "42": 2206.0, + "43": 2040.0, + "44": 2088.0, "45": 2181.0, + "46": 2434.0, + "47": 2446.0, + "48": 2481.0, + "49": 2398.0, "50": 2410.0, + "51": 2528.0, + "52": 2535.0, + "53": 2875.0, + "54": 2862.0, "55": 2406.0, + "56": 2733.0, + "57": 2347.0, + "58": 2918.0, + "59": 2759.0, "60": 2404.0, + "61": 3022.0, + "62": 2494.0, + "63": 2452.0, + "64": 2838.0, "65": 2549.0, + "66": 3044.0, + "67": 2887.0, + "68": 2637.0, + "69": 2860.0, "70": 3034.0, + "71": 2989.0, + "72": 2355.0, + "73": 3034.0, + "74": 1904.0, "75": 2538.0, + "76": 3012.0, + "77": 3193.0, + "78": 2994.0, + "79": 3097.0, "80": 3254.0, + "81": 3671.0, + "82": 3299.0, + "83": 2793.0, + "84": 3146.0, "85": 3329.0, + "86": 2769.0, + "87": 3766.0, + "88": 3021.0, + "89": 3286.0, "90": 3029.0, + "91": 2772.0, + "92": 2955.0, + "93": 2852.0, + "94": 3411.0, "95": 3271.0, + "96": 3279.0, + "97": 3054.0, + "98": 3643.0, + "99": 3303.0, "100": 3142.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 518291968.0, + "2": 518291968.0, + "3": 518291968.0, + "4": 518291968.0, "5": 518291968.0, + "6": 518291968.0, + "7": 518291968.0, + "8": 518291968.0, + "9": 518291968.0, "10": 518291968.0, + "11": 518291968.0, + "12": 518291968.0, + "13": 518291968.0, + "14": 518291968.0, "15": 518291968.0, + "16": 518291968.0, + "17": 518291968.0, + "18": 518291968.0, + "19": 518291968.0, "20": 518291968.0, + "21": 518291968.0, + "22": 518291968.0, + "23": 518291968.0, + "24": 518291968.0, "25": 518291968.0, + "26": 518291968.0, + "27": 518291968.0, + "28": 518291968.0, + "29": 518291968.0, "30": 518291968.0, + "31": 518291968.0, + "32": 518291968.0, + "33": 518291968.0, + "34": 518291968.0, "35": 518291968.0, + "36": 518291968.0, + "37": 518291968.0, + "38": 518291968.0, + "39": 518291968.0, "40": 518291968.0, + "41": 518291968.0, + "42": 518291968.0, + "43": 518291968.0, + "44": 518291968.0, "45": 518291968.0, + "46": 518291968.0, + "47": 518291968.0, + "48": 518291968.0, + "49": 518291968.0, "50": 518291968.0, + "51": 518291968.0, + "52": 518291968.0, + "53": 518291968.0, + "54": 518291968.0, "55": 518291968.0, + "56": 518291968.0, + "57": 518291968.0, + "58": 518291968.0, + "59": 518291968.0, "60": 518291968.0, + "61": 518291968.0, + "62": 518291968.0, + "63": 518291968.0, + "64": 518291968.0, "65": 518291968.0, + "66": 518291968.0, + "67": 518291968.0, + "68": 518291968.0, + "69": 518291968.0, "70": 518291968.0, + "71": 518291968.0, + "72": 518291968.0, + "73": 518291968.0, + "74": 518291968.0, "75": 518291968.0, + "76": 518291968.0, + "77": 518291968.0, + "78": 518291968.0, + "79": 518291968.0, "80": 518291968.0, + "81": 518291968.0, + "82": 518291968.0, + "83": 518291968.0, + "84": 518291968.0, "85": 518291968.0, + "86": 518291968.0, + "87": 518291968.0, + "88": 518291968.0, + "89": 518291968.0, "90": 518291968.0, + "91": 518291968.0, + "92": 518291968.0, + "93": 518291968.0, + "94": 518291968.0, "95": 518291968.0, + "96": 518291968.0, + "97": 518291968.0, + "98": 518291968.0, + "99": 518291968.0, "100": 518291968.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1245476352.0, - "5": 1430268416.0, - "10": 1430268416.0, - "15": 1430268416.0, - "20": 1430268416.0, - "25": 1430268416.0, - "30": 1430268416.0, - "35": 1430268416.0, - "40": 1430268416.0, - "45": 1430268416.0, - "50": 1430268416.0, - "55": 1430268416.0, - "60": 1430268416.0, - "65": 1430268416.0, - "70": 1430268416.0, - "75": 1430268416.0, - "80": 1430268416.0, - "85": 1430268416.0, - "90": 1430268416.0, - "95": 1430268416.0, - "100": 1430268416.0 + "2": 1429481984.0, + "3": 1429481984.0, + "4": 1429481984.0, + "5": 1429481984.0, + "6": 1429481984.0, + "7": 1429481984.0, + "8": 1429481984.0, + "9": 1429481984.0, + "10": 1429481984.0, + "11": 1429481984.0, + "12": 1429481984.0, + "13": 1429481984.0, + "14": 1429481984.0, + "15": 1429481984.0, + "16": 1429481984.0, + "17": 1429481984.0, + "18": 1429481984.0, + "19": 1429481984.0, + "20": 1429481984.0, + "21": 1429481984.0, + "22": 1429481984.0, + "23": 1429481984.0, + "24": 1429481984.0, + "25": 1429481984.0, + "26": 1429481984.0, + "27": 1429481984.0, + "28": 1429481984.0, + "29": 1429481984.0, + "30": 1429481984.0, + "31": 1429481984.0, + "32": 1429481984.0, + "33": 1429481984.0, + "34": 1429481984.0, + "35": 1429481984.0, + "36": 1429481984.0, + "37": 1429481984.0, + "38": 1429481984.0, + "39": 1429481984.0, + "40": 1429481984.0, + "41": 1429481984.0, + "42": 1429481984.0, + "43": 1429481984.0, + "44": 1429481984.0, + "45": 1429481984.0, + "46": 1429481984.0, + "47": 1429481984.0, + "48": 1429481984.0, + "49": 1429481984.0, + "50": 1429481984.0, + "51": 1429481984.0, + "52": 1429481984.0, + "53": 1429481984.0, + "54": 1429481984.0, + "55": 1429481984.0, + "56": 1429481984.0, + "57": 1429481984.0, + "58": 1429481984.0, + "59": 1429481984.0, + "60": 1429481984.0, + "61": 1429481984.0, + "62": 1429481984.0, + "63": 1429481984.0, + "64": 1429481984.0, + "65": 1429481984.0, + "66": 1429481984.0, + "67": 1429481984.0, + "68": 1429481984.0, + "69": 1429481984.0, + "70": 1429481984.0, + "71": 1429481984.0, + "72": 1429481984.0, + "73": 1429481984.0, + "74": 1429481984.0, + "75": 1429481984.0, + "76": 1429481984.0, + "77": 1429481984.0, + "78": 1429481984.0, + "79": 1429481984.0, + "80": 1429481984.0, + "81": 1429481984.0, + "82": 1429481984.0, + "83": 1429481984.0, + "84": 1429481984.0, + "85": 1429481984.0, + "86": 1429481984.0, + "87": 1429481984.0, + "88": 1429481984.0, + "89": 1429481984.0, + "90": 1429481984.0, + "91": 1429481984.0, + "92": 1429481984.0, + "93": 1429481984.0, + "94": 1429481984.0, + "95": 1429481984.0, + "96": 1429481984.0, + "97": 1429481984.0, + "98": 1429481984.0, + "99": 1429481984.0, + "100": 1429481984.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 10.72639, - "5": 0.12756, - "10": 0.12238, - "15": 0.12066, - "20": 0.12159, - "25": 0.12133, - "30": 0.12407, - "35": 0.12311, - "40": 0.1259, - "45": 0.1216, - "50": 0.12187, - "55": 0.12903, - "60": 0.12481, - "65": 0.12314, - "70": 0.12347, - "75": 0.12591, - "80": 0.12073, - "85": 0.12081, - "90": 0.12092, - "95": 0.1218, - "100": 0.12338 + "1": 12.5643, + "2": 0.17332, + "3": 0.15504, + "4": 0.14953, + "5": 0.14296, + "6": 0.14226, + "7": 0.14346, + "8": 0.13938, + "9": 0.14124, + "10": 0.14047, + "11": 0.13835, + "12": 0.14091, + "13": 0.14198, + "14": 0.14069, + "15": 0.13974, + "16": 0.13801, + "17": 0.14306, + "18": 0.14074, + "19": 0.14027, + "20": 0.14158, + "21": 0.14008, + "22": 0.14191, + "23": 0.14006, + "24": 0.13998, + "25": 0.13889, + "26": 0.13978, + "27": 0.14315, + "28": 0.14416, + "29": 0.154, + "30": 0.14026, + "31": 0.14128, + "32": 0.14142, + "33": 0.14025, + "34": 0.14164, + "35": 0.14065, + "36": 0.14236, + "37": 0.13962, + "38": 0.14015, + "39": 0.1412, + "40": 0.14042, + "41": 0.14202, + "42": 0.14116, + "43": 0.1402, + "44": 0.14155, + "45": 0.13981, + "46": 0.14102, + "47": 0.13959, + "48": 0.14118, + "49": 0.14576, + "50": 0.14714, + "51": 0.14965, + "52": 0.14244, + "53": 0.14198, + "54": 0.14102, + "55": 0.1404, + "56": 0.14132, + "57": 0.14, + "58": 0.14143, + "59": 0.16106, + "60": 0.15695, + "61": 0.15431, + "62": 0.14815, + "63": 0.14032, + "64": 0.14044, + "65": 0.14332, + "66": 0.14167, + "67": 0.14533, + "68": 0.1417, + "69": 0.14266, + "70": 0.14095, + "71": 0.14063, + "72": 0.1428, + "73": 0.14351, + "74": 0.14269, + "75": 0.14075, + "76": 0.14214, + "77": 0.14239, + "78": 0.1408, + "79": 0.14254, + "80": 0.14178, + "81": 0.14443, + "82": 0.14301, + "83": 0.14097, + "84": 0.14255, + "85": 0.14113, + "86": 0.14391, + "87": 0.14098, + "88": 0.16001, + "89": 0.15765, + "90": 0.1598, + "91": 0.16005, + "92": 0.14828, + "93": 0.15228, + "94": 0.15292, + "95": 0.14998, + "96": 0.14946, + "97": 0.15122, + "98": 0.144, + "99": 0.14325, + "100": 0.14483 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..055edccd6a0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86546, + "4": 10.84554, + "5": 10.88348, + "6": 10.89432, + "7": 10.87067, + "8": 10.86981, + "9": 10.86919, + "10": 10.83887, + "11": 10.89435, + "12": 10.87982, + "13": 10.87988, + "14": 10.90314, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83027, + "19": 10.82259, + "20": 10.73192, + "21": 10.70753, + "22": 10.56005, + "23": 10.72402, + "24": 10.6111, + "25": 10.54815, + "26": 10.61332, + "27": 10.63056, + "28": 10.56645, + "29": 10.59668, + "30": 10.37137, + "31": 10.1172, + "32": 10.4613, + "33": 10.45249, + "34": 10.2169, + "35": 10.27173, + "36": 10.23118, + "37": 10.34812, + "38": 10.1884, + "39": 10.41042, + "40": 10.09426, + "41": 10.1471, + "42": 10.21243, + "43": 9.8411, + "44": 9.95916, + "45": 9.84085, + "46": 9.8248, + "47": 10.1388, + "48": 9.8584, + "49": 9.5472, + "50": 9.90878, + "51": 9.85583, + "52": 9.75242, + "53": 10.07589, + "54": 9.95688, + "55": 9.88208, + "56": 9.63141, + "57": 9.48651, + "58": 9.83118, + "59": 9.58905, + "60": 9.50651, + "61": 9.7037, + "62": 9.98291, + "63": 9.38315, + "64": 9.77906, + "65": 8.95179, + "66": 9.7016, + "67": 9.37206, + "68": 9.78852, + "69": 9.79859, + "70": 9.74746, + "71": 9.6191, + "72": 9.58502, + "73": 9.49725, + "74": 8.93933, + "75": 9.42706, + "76": 9.08024, + "77": 10.06571, + "78": 9.72896, + "79": 9.37772, + "80": 9.40999, + "81": 9.47983, + "82": 9.70184, + "83": 9.30625, + "84": 9.42095, + "85": 9.61378, + "86": 9.07656, + "87": 9.59458, + "88": 9.75068, + "89": 9.60243, + "90": 9.81901, + "91": 9.33899, + "92": 9.35717, + "93": 9.07883, + "94": 8.8351, + "95": 9.52171, + "96": 9.53008, + "97": 9.31309, + "98": 9.67785, + "99": 8.89061, + "100": 9.39726 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1638.0, + "2": 1766.0, + "3": 1620.0, + "4": 1810.0, + "5": 1844.0, + "6": 1835.0, + "7": 1694.0, + "8": 1632.0, + "9": 1902.0, + "10": 1427.0, + "11": 1932.0, + "12": 1705.0, + "13": 1834.0, + "14": 1807.0, + "15": 1907.0, + "16": 1797.0, + "17": 1911.0, + "18": 1667.0, + "19": 1742.0, + "20": 1662.0, + "21": 1853.0, + "22": 1621.0, + "23": 2010.0, + "24": 1546.0, + "25": 1510.0, + "26": 1664.0, + "27": 1722.0, + "28": 1977.0, + "29": 2024.0, + "30": 1873.0, + "31": 1494.0, + "32": 1890.0, + "33": 2067.0, + "34": 1802.0, + "35": 1873.0, + "36": 1954.0, + "37": 2283.0, + "38": 2076.0, + "39": 2280.0, + "40": 2111.0, + "41": 2318.0, + "42": 2206.0, + "43": 2040.0, + "44": 2088.0, + "45": 2181.0, + "46": 2434.0, + "47": 2446.0, + "48": 2481.0, + "49": 2398.0, + "50": 2410.0, + "51": 2528.0, + "52": 2535.0, + "53": 2875.0, + "54": 2862.0, + "55": 2406.0, + "56": 2733.0, + "57": 2347.0, + "58": 2918.0, + "59": 2759.0, + "60": 2404.0, + "61": 3022.0, + "62": 2494.0, + "63": 2452.0, + "64": 2838.0, + "65": 2549.0, + "66": 3044.0, + "67": 2887.0, + "68": 2637.0, + "69": 2860.0, + "70": 3034.0, + "71": 2989.0, + "72": 2355.0, + "73": 3034.0, + "74": 1904.0, + "75": 2538.0, + "76": 3012.0, + "77": 3193.0, + "78": 2994.0, + "79": 3097.0, + "80": 3254.0, + "81": 3671.0, + "82": 3299.0, + "83": 2793.0, + "84": 3146.0, + "85": 3329.0, + "86": 2769.0, + "87": 3766.0, + "88": 3021.0, + "89": 3286.0, + "90": 3029.0, + "91": 2772.0, + "92": 2955.0, + "93": 2852.0, + "94": 3411.0, + "95": 3271.0, + "96": 3279.0, + "97": 3054.0, + "98": 3643.0, + "99": 3303.0, + "100": 3142.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 518291968.0, + "2": 518291968.0, + "3": 518291968.0, + "4": 518291968.0, + "5": 518291968.0, + "6": 518291968.0, + "7": 518291968.0, + "8": 518291968.0, + "9": 518291968.0, + "10": 518291968.0, + "11": 518291968.0, + "12": 518291968.0, + "13": 518291968.0, + "14": 518291968.0, + "15": 518291968.0, + "16": 518291968.0, + "17": 518291968.0, + "18": 518291968.0, + "19": 518291968.0, + "20": 518291968.0, + "21": 518291968.0, + "22": 518291968.0, + "23": 518291968.0, + "24": 518291968.0, + "25": 518291968.0, + "26": 518291968.0, + "27": 518291968.0, + "28": 518291968.0, + "29": 518291968.0, + "30": 518291968.0, + "31": 518291968.0, + "32": 518291968.0, + "33": 518291968.0, + "34": 518291968.0, + "35": 518291968.0, + "36": 518291968.0, + "37": 518291968.0, + "38": 518291968.0, + "39": 518291968.0, + "40": 518291968.0, + "41": 518291968.0, + "42": 518291968.0, + "43": 518291968.0, + "44": 518291968.0, + "45": 518291968.0, + "46": 518291968.0, + "47": 518291968.0, + "48": 518291968.0, + "49": 518291968.0, + "50": 518291968.0, + "51": 518291968.0, + "52": 518291968.0, + "53": 518291968.0, + "54": 518291968.0, + "55": 518291968.0, + "56": 518291968.0, + "57": 518291968.0, + "58": 518291968.0, + "59": 518291968.0, + "60": 518291968.0, + "61": 518291968.0, + "62": 518291968.0, + "63": 518291968.0, + "64": 518291968.0, + "65": 518291968.0, + "66": 518291968.0, + "67": 518291968.0, + "68": 518291968.0, + "69": 518291968.0, + "70": 518291968.0, + "71": 518291968.0, + "72": 518291968.0, + "73": 518291968.0, + "74": 518291968.0, + "75": 518291968.0, + "76": 518291968.0, + "77": 518291968.0, + "78": 518291968.0, + "79": 518291968.0, + "80": 518291968.0, + "81": 518291968.0, + "82": 518291968.0, + "83": 518291968.0, + "84": 518291968.0, + "85": 518291968.0, + "86": 518291968.0, + "87": 518291968.0, + "88": 518291968.0, + "89": 518291968.0, + "90": 518291968.0, + "91": 518291968.0, + "92": 518291968.0, + "93": 518291968.0, + "94": 518291968.0, + "95": 518291968.0, + "96": 518291968.0, + "97": 518291968.0, + "98": 518291968.0, + "99": 518291968.0, + "100": 518291968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1245476352.0, + "2": 1429481984.0, + "3": 1429481984.0, + "4": 1429481984.0, + "5": 1429481984.0, + "6": 1429481984.0, + "7": 1429481984.0, + "8": 1429481984.0, + "9": 1429481984.0, + "10": 1429481984.0, + "11": 1429481984.0, + "12": 1429481984.0, + "13": 1429481984.0, + "14": 1429481984.0, + "15": 1429481984.0, + "16": 1429481984.0, + "17": 1429481984.0, + "18": 1429481984.0, + "19": 1429481984.0, + "20": 1429481984.0, + "21": 1429481984.0, + "22": 1429481984.0, + "23": 1429481984.0, + "24": 1429481984.0, + "25": 1429481984.0, + "26": 1429481984.0, + "27": 1429481984.0, + "28": 1429481984.0, + "29": 1429481984.0, + "30": 1429481984.0, + "31": 1429481984.0, + "32": 1429481984.0, + "33": 1429481984.0, + "34": 1429481984.0, + "35": 1429481984.0, + "36": 1429481984.0, + "37": 1429481984.0, + "38": 1429481984.0, + "39": 1429481984.0, + "40": 1429481984.0, + "41": 1429481984.0, + "42": 1429481984.0, + "43": 1429481984.0, + "44": 1429481984.0, + "45": 1429481984.0, + "46": 1429481984.0, + "47": 1430268416.0, + "48": 1430268416.0, + "49": 1430268416.0, + "50": 1430268416.0, + "51": 1430268416.0, + "52": 1430268416.0, + "53": 1430268416.0, + "54": 1430268416.0, + "55": 1430268416.0, + "56": 1430268416.0, + "57": 1430268416.0, + "58": 1430268416.0, + "59": 1430268416.0, + "60": 1430268416.0, + "61": 1430268416.0, + "62": 1430268416.0, + "63": 1430268416.0, + "64": 1430268416.0, + "65": 1430268416.0, + "66": 1430268416.0, + "67": 1430268416.0, + "68": 1430268416.0, + "69": 1430268416.0, + "70": 1430268416.0, + "71": 1430268416.0, + "72": 1430268416.0, + "73": 1430268416.0, + "74": 1430268416.0, + "75": 1430268416.0, + "76": 1430268416.0, + "77": 1430268416.0, + "78": 1430268416.0, + "79": 1430268416.0, + "80": 1430268416.0, + "81": 1430268416.0, + "82": 1430268416.0, + "83": 1430268416.0, + "84": 1430268416.0, + "85": 1430268416.0, + "86": 1430268416.0, + "87": 1430268416.0, + "88": 1430268416.0, + "89": 1430268416.0, + "90": 1430268416.0, + "91": 1430268416.0, + "92": 1430268416.0, + "93": 1430268416.0, + "94": 1430268416.0, + "95": 1430268416.0, + "96": 1430268416.0, + "97": 1430268416.0, + "98": 1430268416.0, + "99": 1430268416.0, + "100": 1430268416.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.14048, + "2": 0.15305, + "3": 0.12206, + "4": 0.12159, + "5": 0.12338, + "6": 0.12232, + "7": 0.12178, + "8": 0.12116, + "9": 0.12378, + "10": 0.1213, + "11": 0.12099, + "12": 0.12066, + "13": 0.12326, + "14": 0.12143, + "15": 0.12173, + "16": 0.12258, + "17": 0.12137, + "18": 0.12235, + "19": 0.12098, + "20": 0.12175, + "21": 0.12124, + "22": 0.12047, + "23": 0.12106, + "24": 0.12167, + "25": 0.12151, + "26": 0.12085, + "27": 0.12129, + "28": 0.1211, + "29": 0.12093, + "30": 0.12007, + "31": 0.12104, + "32": 0.12256, + "33": 0.12191, + "34": 0.12633, + "35": 0.13877, + "36": 0.13281, + "37": 0.12383, + "38": 0.12319, + "39": 0.12304, + "40": 0.12247, + "41": 0.1226, + "42": 0.12481, + "43": 0.12769, + "44": 0.12464, + "45": 0.12374, + "46": 0.12839, + "47": 0.12264, + "48": 0.13199, + "49": 0.12462, + "50": 0.12201, + "51": 0.125, + "52": 0.13707, + "53": 0.12341, + "54": 0.12318, + "55": 0.12261, + "56": 0.12283, + "57": 0.12341, + "58": 0.12301, + "59": 0.12419, + "60": 0.12361, + "61": 0.12424, + "62": 0.12437, + "63": 0.12354, + "64": 0.12246, + "65": 0.12204, + "66": 0.1235, + "67": 0.12315, + "68": 0.12287, + "69": 0.12129, + "70": 0.12211, + "71": 0.12216, + "72": 0.12316, + "73": 0.12246, + "74": 0.12156, + "75": 0.12321, + "76": 0.12274, + "77": 0.12488, + "78": 0.12309, + "79": 0.12392, + "80": 0.12291, + "81": 0.12432, + "82": 0.1239, + "83": 0.12342, + "84": 0.12131, + "85": 0.12225, + "86": 0.12172, + "87": 0.12084, + "88": 0.12493, + "89": 0.12176, + "90": 0.12578, + "91": 0.12256, + "92": 0.12137, + "93": 0.12208, + "94": 0.12379, + "95": 0.12088, + "96": 0.12458, + "97": 0.12217, + "98": 0.12238, + "99": 0.12101, + "100": 0.12165 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..80f6783f6f2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85949, + "2": 10.85553, + "3": 10.86546, + "4": 10.84554, + "5": 10.88348, + "6": 10.89432, + "7": 10.87067, + "8": 10.86981, + "9": 10.86919, + "10": 10.83887, + "11": 10.89435, + "12": 10.87982, + "13": 10.87988, + "14": 10.90314, + "15": 10.8405, + "16": 10.83786, + "17": 10.80668, + "18": 10.83027, + "19": 10.82259, + "20": 10.73192, + "21": 10.70753, + "22": 10.56005, + "23": 10.72402, + "24": 10.6111, + "25": 10.54815, + "26": 10.61332, + "27": 10.63056, + "28": 10.56645, + "29": 10.59668, + "30": 10.37137, + "31": 10.1172, + "32": 10.4613, + "33": 10.45249, + "34": 10.2169, + "35": 10.27173, + "36": 10.23118, + "37": 10.34812, + "38": 10.1884, + "39": 10.41042, + "40": 10.09426, + "41": 10.1471, + "42": 10.21243, + "43": 9.8411, + "44": 9.95916, + "45": 9.84085, + "46": 9.8248, + "47": 10.1388, + "48": 9.8584, + "49": 9.5472, + "50": 9.90878, + "51": 9.85583, + "52": 9.75242, + "53": 10.07589, + "54": 9.95688, + "55": 9.88208, + "56": 9.63141, + "57": 9.48651, + "58": 9.83118, + "59": 9.58905, + "60": 9.50651, + "61": 9.7037, + "62": 9.98291, + "63": 9.38315, + "64": 9.77906, + "65": 8.95179, + "66": 9.7016, + "67": 9.37206, + "68": 9.78852, + "69": 9.79859, + "70": 9.74746, + "71": 9.6191, + "72": 9.58502, + "73": 9.49725, + "74": 8.93933, + "75": 9.42706, + "76": 9.08024, + "77": 10.06571, + "78": 9.72896, + "79": 9.37772, + "80": 9.40999, + "81": 9.47983, + "82": 9.70184, + "83": 9.30625, + "84": 9.42095, + "85": 9.61378, + "86": 9.07656, + "87": 9.59458, + "88": 9.75068, + "89": 9.60243, + "90": 9.81901, + "91": 9.33899, + "92": 9.35717, + "93": 9.07883, + "94": 8.8351, + "95": 9.52171, + "96": 9.53008, + "97": 9.31309, + "98": 9.67785, + "99": 8.89061, + "100": 9.39726 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1638.0, + "2": 1766.0, + "3": 1620.0, + "4": 1810.0, + "5": 1844.0, + "6": 1835.0, + "7": 1694.0, + "8": 1632.0, + "9": 1902.0, + "10": 1427.0, + "11": 1932.0, + "12": 1705.0, + "13": 1834.0, + "14": 1807.0, + "15": 1907.0, + "16": 1797.0, + "17": 1911.0, + "18": 1667.0, + "19": 1742.0, + "20": 1662.0, + "21": 1853.0, + "22": 1621.0, + "23": 2010.0, + "24": 1546.0, + "25": 1510.0, + "26": 1664.0, + "27": 1722.0, + "28": 1977.0, + "29": 2024.0, + "30": 1873.0, + "31": 1494.0, + "32": 1890.0, + "33": 2067.0, + "34": 1802.0, + "35": 1873.0, + "36": 1954.0, + "37": 2283.0, + "38": 2076.0, + "39": 2280.0, + "40": 2111.0, + "41": 2318.0, + "42": 2206.0, + "43": 2040.0, + "44": 2088.0, + "45": 2181.0, + "46": 2434.0, + "47": 2446.0, + "48": 2481.0, + "49": 2398.0, + "50": 2410.0, + "51": 2528.0, + "52": 2535.0, + "53": 2875.0, + "54": 2862.0, + "55": 2406.0, + "56": 2733.0, + "57": 2347.0, + "58": 2918.0, + "59": 2759.0, + "60": 2404.0, + "61": 3022.0, + "62": 2494.0, + "63": 2452.0, + "64": 2838.0, + "65": 2549.0, + "66": 3044.0, + "67": 2887.0, + "68": 2637.0, + "69": 2860.0, + "70": 3034.0, + "71": 2989.0, + "72": 2355.0, + "73": 3034.0, + "74": 1904.0, + "75": 2538.0, + "76": 3012.0, + "77": 3193.0, + "78": 2994.0, + "79": 3097.0, + "80": 3254.0, + "81": 3671.0, + "82": 3299.0, + "83": 2793.0, + "84": 3146.0, + "85": 3329.0, + "86": 2769.0, + "87": 3766.0, + "88": 3021.0, + "89": 3286.0, + "90": 3029.0, + "91": 2772.0, + "92": 2955.0, + "93": 2852.0, + "94": 3411.0, + "95": 3271.0, + "96": 3279.0, + "97": 3054.0, + "98": 3643.0, + "99": 3303.0, + "100": 3142.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 518291968.0, + "2": 518291968.0, + "3": 518291968.0, + "4": 518291968.0, + "5": 518291968.0, + "6": 518291968.0, + "7": 518291968.0, + "8": 518291968.0, + "9": 518291968.0, + "10": 518291968.0, + "11": 518291968.0, + "12": 518291968.0, + "13": 518291968.0, + "14": 518291968.0, + "15": 518291968.0, + "16": 518291968.0, + "17": 518291968.0, + "18": 518291968.0, + "19": 518291968.0, + "20": 518291968.0, + "21": 518291968.0, + "22": 518291968.0, + "23": 518291968.0, + "24": 518291968.0, + "25": 518291968.0, + "26": 518291968.0, + "27": 518291968.0, + "28": 518291968.0, + "29": 518291968.0, + "30": 518291968.0, + "31": 518291968.0, + "32": 518291968.0, + "33": 518291968.0, + "34": 518291968.0, + "35": 518291968.0, + "36": 518291968.0, + "37": 518291968.0, + "38": 518291968.0, + "39": 518291968.0, + "40": 518291968.0, + "41": 518291968.0, + "42": 518291968.0, + "43": 518291968.0, + "44": 518291968.0, + "45": 518291968.0, + "46": 518291968.0, + "47": 518291968.0, + "48": 518291968.0, + "49": 518291968.0, + "50": 518291968.0, + "51": 518291968.0, + "52": 518291968.0, + "53": 518291968.0, + "54": 518291968.0, + "55": 518291968.0, + "56": 518291968.0, + "57": 518291968.0, + "58": 518291968.0, + "59": 518291968.0, + "60": 518291968.0, + "61": 518291968.0, + "62": 518291968.0, + "63": 518291968.0, + "64": 518291968.0, + "65": 518291968.0, + "66": 518291968.0, + "67": 518291968.0, + "68": 518291968.0, + "69": 518291968.0, + "70": 518291968.0, + "71": 518291968.0, + "72": 518291968.0, + "73": 518291968.0, + "74": 518291968.0, + "75": 518291968.0, + "76": 518291968.0, + "77": 518291968.0, + "78": 518291968.0, + "79": 518291968.0, + "80": 518291968.0, + "81": 518291968.0, + "82": 518291968.0, + "83": 518291968.0, + "84": 518291968.0, + "85": 518291968.0, + "86": 518291968.0, + "87": 518291968.0, + "88": 518291968.0, + "89": 518291968.0, + "90": 518291968.0, + "91": 518291968.0, + "92": 518291968.0, + "93": 518291968.0, + "94": 518291968.0, + "95": 518291968.0, + "96": 518291968.0, + "97": 518291968.0, + "98": 518291968.0, + "99": 518291968.0, + "100": 518291968.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1245476352.0, + "2": 1429481984.0, + "3": 1429481984.0, + "4": 1429481984.0, + "5": 1429481984.0, + "6": 1429481984.0, + "7": 1429481984.0, + "8": 1429481984.0, + "9": 1429481984.0, + "10": 1429481984.0, + "11": 1429481984.0, + "12": 1429481984.0, + "13": 1429481984.0, + "14": 1429481984.0, + "15": 1429481984.0, + "16": 1429481984.0, + "17": 1429481984.0, + "18": 1429481984.0, + "19": 1429481984.0, + "20": 1429481984.0, + "21": 1429481984.0, + "22": 1429481984.0, + "23": 1429481984.0, + "24": 1429481984.0, + "25": 1429481984.0, + "26": 1429481984.0, + "27": 1429481984.0, + "28": 1429481984.0, + "29": 1429481984.0, + "30": 1429481984.0, + "31": 1429481984.0, + "32": 1429481984.0, + "33": 1429481984.0, + "34": 1429481984.0, + "35": 1429481984.0, + "36": 1429481984.0, + "37": 1429481984.0, + "38": 1429481984.0, + "39": 1429481984.0, + "40": 1429481984.0, + "41": 1429481984.0, + "42": 1429481984.0, + "43": 1429481984.0, + "44": 1429481984.0, + "45": 1429481984.0, + "46": 1429481984.0, + "47": 1429481984.0, + "48": 1429481984.0, + "49": 1429481984.0, + "50": 1429481984.0, + "51": 1429481984.0, + "52": 1429481984.0, + "53": 1429481984.0, + "54": 1429481984.0, + "55": 1429481984.0, + "56": 1429481984.0, + "57": 1429481984.0, + "58": 1429481984.0, + "59": 1429481984.0, + "60": 1429481984.0, + "61": 1429481984.0, + "62": 1429481984.0, + "63": 1429481984.0, + "64": 1429481984.0, + "65": 1429481984.0, + "66": 1429481984.0, + "67": 1429481984.0, + "68": 1429481984.0, + "69": 1429481984.0, + "70": 1429481984.0, + "71": 1429481984.0, + "72": 1429481984.0, + "73": 1429481984.0, + "74": 1429481984.0, + "75": 1429481984.0, + "76": 1429481984.0, + "77": 1429481984.0, + "78": 1429481984.0, + "79": 1429481984.0, + "80": 1429481984.0, + "81": 1429481984.0, + "82": 1429481984.0, + "83": 1429481984.0, + "84": 1429481984.0, + "85": 1429481984.0, + "86": 1429481984.0, + "87": 1429481984.0, + "88": 1429481984.0, + "89": 1429481984.0, + "90": 1429481984.0, + "91": 1429481984.0, + "92": 1429481984.0, + "93": 1429481984.0, + "94": 1429481984.0, + "95": 1429481984.0, + "96": 1429481984.0, + "97": 1429481984.0, + "98": 1429481984.0, + "99": 1429481984.0, + "100": 1429481984.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.65353, + "2": 0.15729, + "3": 0.13911, + "4": 0.14117, + "5": 0.14172, + "6": 0.14091, + "7": 0.14103, + "8": 0.14008, + "9": 0.14444, + "10": 0.14215, + "11": 0.143, + "12": 0.14395, + "13": 0.14101, + "14": 0.14112, + "15": 0.14126, + "16": 0.14286, + "17": 0.14201, + "18": 0.14405, + "19": 0.14472, + "20": 0.14424, + "21": 0.14746, + "22": 0.14732, + "23": 0.14871, + "24": 0.14885, + "25": 0.14732, + "26": 0.14775, + "27": 0.14978, + "28": 0.14685, + "29": 0.15004, + "30": 0.14663, + "31": 0.14925, + "32": 0.14679, + "33": 0.14465, + "34": 0.14701, + "35": 0.14556, + "36": 0.14835, + "37": 0.14562, + "38": 0.14971, + "39": 0.14881, + "40": 0.14688, + "41": 0.14373, + "42": 0.14577, + "43": 0.14595, + "44": 0.1465, + "45": 0.14283, + "46": 0.14194, + "47": 0.14334, + "48": 0.14235, + "49": 0.14347, + "50": 0.14228, + "51": 0.14946, + "52": 0.14427, + "53": 0.14469, + "54": 0.14466, + "55": 0.14197, + "56": 0.14396, + "57": 0.14283, + "58": 0.14383, + "59": 0.14201, + "60": 0.14448, + "61": 0.14593, + "62": 0.14316, + "63": 0.14235, + "64": 0.14447, + "65": 0.14383, + "66": 0.14456, + "67": 0.14508, + "68": 0.1452, + "69": 0.14518, + "70": 0.1449, + "71": 0.14576, + "72": 0.14328, + "73": 0.14352, + "74": 0.1504, + "75": 0.15058, + "76": 0.14825, + "77": 0.14229, + "78": 0.14494, + "79": 0.14518, + "80": 0.14464, + "81": 0.1461, + "82": 0.14482, + "83": 0.14487, + "84": 0.14272, + "85": 0.14154, + "86": 0.14252, + "87": 0.1447, + "88": 0.14327, + "89": 0.1441, + "90": 0.14688, + "91": 0.14346, + "92": 0.14427, + "93": 0.14222, + "94": 0.14464, + "95": 0.14507, + "96": 0.14196, + "97": 0.1438, + "98": 0.14103, + "99": 0.14644, + "100": 0.14474 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..ef4b8c6d946 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91515, + "4": 10.909, + "5": 10.92721, + "6": 10.93563, + "7": 10.90643, + "8": 10.92118, + "9": 10.9107, + "10": 10.90795, + "11": 10.89277, + "12": 10.92431, + "13": 10.91489, + "14": 10.92148, + "15": 10.88292, + "16": 10.87302, + "17": 10.84069, + "18": 10.873, + "19": 10.85633, + "20": 10.77594, + "21": 10.74894, + "22": 10.63083, + "23": 10.75614, + "24": 10.65645, + "25": 10.59266, + "26": 10.6544, + "27": 10.64915, + "28": 10.59496, + "29": 10.60945, + "30": 10.3918, + "31": 10.15724, + "32": 10.49112, + "33": 10.4796, + "34": 10.24073, + "35": 10.297, + "36": 10.24677, + "37": 10.35242, + "38": 10.20481, + "39": 10.40506, + "40": 10.0966, + "41": 10.15195, + "42": 10.22065, + "43": 9.85507, + "44": 9.96164, + "45": 9.84468, + "46": 9.83835, + "47": 10.14, + "48": 9.85762, + "49": 9.53744, + "50": 9.90946, + "51": 9.84888, + "52": 9.74164, + "53": 10.0634, + "54": 9.94739, + "55": 9.87774, + "56": 9.62736, + "57": 9.47158, + "58": 9.82895, + "59": 9.58274, + "60": 9.4912, + "61": 9.69972, + "62": 9.97984, + "63": 9.37281, + "64": 9.77457, + "65": 8.94253, + "66": 9.69879, + "67": 9.3641, + "68": 9.78785, + "69": 9.78336, + "70": 9.72282, + "71": 9.60808, + "72": 9.58431, + "73": 9.4898, + "74": 8.94861, + "75": 9.4189, + "76": 9.08729, + "77": 10.06345, + "78": 9.72836, + "79": 9.37155, + "80": 9.40054, + "81": 9.47831, + "82": 9.69155, + "83": 9.30735, + "84": 9.41236, + "85": 9.61184, + "86": 9.0759, + "87": 9.59464, + "88": 9.74732, + "89": 9.60675, + "90": 9.81029, + "91": 9.34357, + "92": 9.36491, + "93": 9.07725, + "94": 8.83091, + "95": 9.51723, + "96": 9.52447, + "97": 9.31031, + "98": 9.67875, + "99": 8.88838, + "100": 9.40137 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1637.0, + "2": 1813.0, + "3": 1642.0, + "4": 1766.0, + "5": 1964.0, + "6": 1846.0, + "7": 1884.0, + "8": 1763.0, + "9": 1934.0, + "10": 1489.0, + "11": 2000.0, + "12": 1800.0, + "13": 1942.0, + "14": 1818.0, + "15": 1923.0, + "16": 1792.0, + "17": 1801.0, + "18": 1730.0, + "19": 1754.0, + "20": 1585.0, + "21": 1774.0, + "22": 1692.0, + "23": 1974.0, + "24": 1632.0, + "25": 1649.0, + "26": 1865.0, + "27": 1853.0, + "28": 2076.0, + "29": 2051.0, + "30": 1908.0, + "31": 1532.0, + "32": 1984.0, + "33": 2192.0, + "34": 1867.0, + "35": 1954.0, + "36": 1998.0, + "37": 2392.0, + "38": 2248.0, + "39": 2437.0, + "40": 2265.0, + "41": 2237.0, + "42": 2319.0, + "43": 2171.0, + "44": 2133.0, + "45": 2057.0, + "46": 2372.0, + "47": 2596.0, + "48": 2429.0, + "49": 2248.0, + "50": 2458.0, + "51": 2794.0, + "52": 2607.0, + "53": 2964.0, + "54": 2830.0, + "55": 2411.0, + "56": 2688.0, + "57": 2444.0, + "58": 3101.0, + "59": 2822.0, + "60": 2518.0, + "61": 2878.0, + "62": 2642.0, + "63": 2396.0, + "64": 2963.0, + "65": 2740.0, + "66": 3297.0, + "67": 2793.0, + "68": 2901.0, + "69": 3001.0, + "70": 3253.0, + "71": 3004.0, + "72": 2341.0, + "73": 3179.0, + "74": 1950.0, + "75": 2653.0, + "76": 3085.0, + "77": 3451.0, + "78": 3324.0, + "79": 3342.0, + "80": 3531.0, + "81": 3790.0, + "82": 3427.0, + "83": 2786.0, + "84": 3443.0, + "85": 3379.0, + "86": 2871.0, + "87": 3840.0, + "88": 3076.0, + "89": 3444.0, + "90": 2991.0, + "91": 2705.0, + "92": 3073.0, + "93": 2724.0, + "94": 3513.0, + "95": 3428.0, + "96": 3557.0, + "97": 3249.0, + "98": 3700.0, + "99": 3192.0, + "100": 3264.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 436764672.0, + "2": 436764672.0, + "3": 436764672.0, + "4": 436764672.0, + "5": 436764672.0, + "6": 436764672.0, + "7": 436764672.0, + "8": 436764672.0, + "9": 436764672.0, + "10": 436764672.0, + "11": 436764672.0, + "12": 436764672.0, + "13": 436764672.0, + "14": 436764672.0, + "15": 436764672.0, + "16": 436764672.0, + "17": 436764672.0, + "18": 436764672.0, + "19": 436764672.0, + "20": 436764672.0, + "21": 436764672.0, + "22": 436764672.0, + "23": 436764672.0, + "24": 436764672.0, + "25": 436764672.0, + "26": 436764672.0, + "27": 436764672.0, + "28": 436764672.0, + "29": 436764672.0, + "30": 436764672.0, + "31": 436764672.0, + "32": 436764672.0, + "33": 436764672.0, + "34": 436764672.0, + "35": 436764672.0, + "36": 436764672.0, + "37": 436764672.0, + "38": 436764672.0, + "39": 436764672.0, + "40": 436764672.0, + "41": 436764672.0, + "42": 436764672.0, + "43": 436764672.0, + "44": 436764672.0, + "45": 436764672.0, + "46": 436764672.0, + "47": 436764672.0, + "48": 436764672.0, + "49": 436764672.0, + "50": 436764672.0, + "51": 436764672.0, + "52": 436764672.0, + "53": 436764672.0, + "54": 436764672.0, + "55": 436764672.0, + "56": 436764672.0, + "57": 436764672.0, + "58": 436764672.0, + "59": 436764672.0, + "60": 436764672.0, + "61": 436764672.0, + "62": 436764672.0, + "63": 436764672.0, + "64": 436764672.0, + "65": 436764672.0, + "66": 436764672.0, + "67": 436764672.0, + "68": 436764672.0, + "69": 436764672.0, + "70": 436764672.0, + "71": 436764672.0, + "72": 436764672.0, + "73": 436764672.0, + "74": 436764672.0, + "75": 436764672.0, + "76": 436764672.0, + "77": 436764672.0, + "78": 436764672.0, + "79": 436764672.0, + "80": 436764672.0, + "81": 436764672.0, + "82": 436764672.0, + "83": 436764672.0, + "84": 436764672.0, + "85": 436764672.0, + "86": 436764672.0, + "87": 436764672.0, + "88": 436764672.0, + "89": 436764672.0, + "90": 436764672.0, + "91": 436764672.0, + "92": 436764672.0, + "93": 436764672.0, + "94": 436764672.0, + "95": 436764672.0, + "96": 436764672.0, + "97": 436764672.0, + "98": 436764672.0, + "99": 436764672.0, + "100": 436764672.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1178635264.0, + "2": 1359495168.0, + "3": 1359495168.0, + "4": 1359495168.0, + "5": 1359495168.0, + "6": 1359495168.0, + "7": 1359495168.0, + "8": 1359495168.0, + "9": 1359495168.0, + "10": 1359495168.0, + "11": 1359495168.0, + "12": 1359495168.0, + "13": 1359495168.0, + "14": 1359495168.0, + "15": 1359495168.0, + "16": 1359495168.0, + "17": 1359495168.0, + "18": 1359495168.0, + "19": 1359495168.0, + "20": 1359495168.0, + "21": 1359495168.0, + "22": 1359495168.0, + "23": 1359495168.0, + "24": 1359495168.0, + "25": 1359495168.0, + "26": 1359495168.0, + "27": 1359495168.0, + "28": 1359495168.0, + "29": 1359495168.0, + "30": 1359495168.0, + "31": 1359495168.0, + "32": 1359495168.0, + "33": 1359495168.0, + "34": 1359495168.0, + "35": 1359495168.0, + "36": 1359495168.0, + "37": 1359495168.0, + "38": 1359495168.0, + "39": 1359495168.0, + "40": 1359495168.0, + "41": 1359495168.0, + "42": 1359495168.0, + "43": 1359495168.0, + "44": 1359495168.0, + "45": 1359495168.0, + "46": 1359495168.0, + "47": 1359495168.0, + "48": 1359495168.0, + "49": 1359495168.0, + "50": 1359495168.0, + "51": 1359495168.0, + "52": 1359495168.0, + "53": 1359495168.0, + "54": 1359495168.0, + "55": 1359495168.0, + "56": 1359495168.0, + "57": 1359495168.0, + "58": 1359495168.0, + "59": 1359495168.0, + "60": 1359495168.0, + "61": 1359495168.0, + "62": 1359495168.0, + "63": 1359495168.0, + "64": 1359495168.0, + "65": 1359495168.0, + "66": 1359495168.0, + "67": 1359495168.0, + "68": 1359495168.0, + "69": 1359495168.0, + "70": 1359495168.0, + "71": 1359495168.0, + "72": 1359495168.0, + "73": 1359495168.0, + "74": 1359495168.0, + "75": 1359495168.0, + "76": 1359495168.0, + "77": 1359495168.0, + "78": 1359495168.0, + "79": 1359495168.0, + "80": 1359495168.0, + "81": 1359495168.0, + "82": 1359495168.0, + "83": 1359495168.0, + "84": 1359495168.0, + "85": 1359495168.0, + "86": 1359495168.0, + "87": 1359495168.0, + "88": 1359495168.0, + "89": 1359495168.0, + "90": 1359495168.0, + "91": 1359495168.0, + "92": 1359495168.0, + "93": 1359495168.0, + "94": 1359495168.0, + "95": 1359495168.0, + "96": 1359495168.0, + "97": 1359495168.0, + "98": 1359495168.0, + "99": 1359495168.0, + "100": 1359495168.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.71223, + "2": 0.2559, + "3": 0.20574, + "4": 0.19465, + "5": 0.19231, + "6": 0.19171, + "7": 0.19937, + "8": 0.19134, + "9": 0.19297, + "10": 0.56022, + "11": 0.19644, + "12": 0.1919, + "13": 0.18999, + "14": 0.19039, + "15": 0.19033, + "16": 0.19392, + "17": 0.1905, + "18": 0.19034, + "19": 0.19238, + "20": 0.18982, + "21": 0.19272, + "22": 0.18887, + "23": 0.18965, + "24": 0.18822, + "25": 0.18884, + "26": 0.19177, + "27": 0.19002, + "28": 0.19012, + "29": 0.18865, + "30": 0.18813, + "31": 0.18848, + "32": 0.19189, + "33": 0.18955, + "34": 0.18747, + "35": 0.18875, + "36": 0.18808, + "37": 0.19208, + "38": 0.18809, + "39": 0.18964, + "40": 0.18801, + "41": 0.18881, + "42": 0.18974, + "43": 0.18833, + "44": 0.19089, + "45": 0.18763, + "46": 0.18829, + "47": 0.18867, + "48": 0.19358, + "49": 0.19137, + "50": 0.18755, + "51": 0.40667, + "52": 0.20997, + "53": 0.20527, + "54": 0.20595, + "55": 0.20323, + "56": 0.20609, + "57": 0.20386, + "58": 0.20342, + "59": 0.20542, + "60": 0.20552, + "61": 0.20398, + "62": 0.20382, + "63": 0.20526, + "64": 0.20557, + "65": 0.20431, + "66": 0.20453, + "67": 0.20352, + "68": 0.20417, + "69": 0.2078, + "70": 0.20587, + "71": 0.20478, + "72": 0.20614, + "73": 0.20512, + "74": 0.20553, + "75": 0.20566, + "76": 0.20364, + "77": 0.20348, + "78": 0.20324, + "79": 0.20677, + "80": 0.20465, + "81": 0.2031, + "82": 0.20231, + "83": 0.20385, + "84": 0.20449, + "85": 0.20555, + "86": 0.2034, + "87": 0.20494, + "88": 0.2068, + "89": 0.20402, + "90": 0.20742, + "91": 0.20169, + "92": 0.20203, + "93": 0.20392, + "94": 0.2017, + "95": 0.20418, + "96": 0.20159, + "97": 0.20256, + "98": 0.20348, + "99": 0.20162, + "100": 0.20224 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..6c29141b1ab --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.92655, + "2": 10.92585, + "3": 10.91514, + "4": 10.909, + "5": 10.92715, + "6": 10.93558, + "7": 10.90643, + "8": 10.92116, + "9": 10.91068, + "10": 10.9079, + "11": 10.89281, + "12": 10.9243, + "13": 10.91489, + "14": 10.92142, + "15": 10.88293, + "16": 10.87308, + "17": 10.84069, + "18": 10.87299, + "19": 10.85635, + "20": 10.77597, + "21": 10.74899, + "22": 10.63079, + "23": 10.75618, + "24": 10.65646, + "25": 10.59264, + "26": 10.65436, + "27": 10.64916, + "28": 10.59497, + "29": 10.60952, + "30": 10.39177, + "31": 10.1573, + "32": 10.49109, + "33": 10.4796, + "34": 10.24074, + "35": 10.29698, + "36": 10.24672, + "37": 10.35242, + "38": 10.20483, + "39": 10.40503, + "40": 10.09663, + "41": 10.15197, + "42": 10.22069, + "43": 9.85509, + "44": 9.96162, + "45": 9.8447, + "46": 9.83835, + "47": 10.14006, + "48": 9.8576, + "49": 9.53743, + "50": 9.90948, + "51": 9.84887, + "52": 9.74166, + "53": 10.0634, + "54": 9.94738, + "55": 9.87771, + "56": 9.62738, + "57": 9.47156, + "58": 9.82893, + "59": 9.58275, + "60": 9.49123, + "61": 9.6997, + "62": 9.97993, + "63": 9.37281, + "64": 9.77461, + "65": 8.94258, + "66": 9.69883, + "67": 9.36407, + "68": 9.78787, + "69": 9.78335, + "70": 9.7228, + "71": 9.60807, + "72": 9.58432, + "73": 9.48978, + "74": 8.94859, + "75": 9.41891, + "76": 9.08727, + "77": 10.06346, + "78": 9.72836, + "79": 9.37154, + "80": 9.40055, + "81": 9.47831, + "82": 9.69156, + "83": 9.30737, + "84": 9.41236, + "85": 9.61183, + "86": 9.0759, + "87": 9.59459, + "88": 9.74736, + "89": 9.60675, + "90": 9.81024, + "91": 9.34359, + "92": 9.36491, + "93": 9.07724, + "94": 8.83091, + "95": 9.51724, + "96": 9.52446, + "97": 9.31031, + "98": 9.67875, + "99": 8.88841, + "100": 9.40137 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1669.0, + "2": 1803.0, + "3": 1710.0, + "4": 1820.0, + "5": 1976.0, + "6": 1885.0, + "7": 1871.0, + "8": 1764.0, + "9": 1859.0, + "10": 1373.0, + "11": 1990.0, + "12": 1788.0, + "13": 1897.0, + "14": 1734.0, + "15": 1894.0, + "16": 1713.0, + "17": 1842.0, + "18": 1666.0, + "19": 1744.0, + "20": 1653.0, + "21": 1882.0, + "22": 1706.0, + "23": 1954.0, + "24": 1640.0, + "25": 1696.0, + "26": 1871.0, + "27": 1921.0, + "28": 2037.0, + "29": 2016.0, + "30": 1883.0, + "31": 1596.0, + "32": 1913.0, + "33": 2205.0, + "34": 1860.0, + "35": 1980.0, + "36": 2029.0, + "37": 2339.0, + "38": 2176.0, + "39": 2352.0, + "40": 2111.0, + "41": 2308.0, + "42": 2334.0, + "43": 2067.0, + "44": 2193.0, + "45": 2124.0, + "46": 2336.0, + "47": 2584.0, + "48": 2349.0, + "49": 2276.0, + "50": 2539.0, + "51": 2656.0, + "52": 2542.0, + "53": 2863.0, + "54": 2741.0, + "55": 2376.0, + "56": 2790.0, + "57": 2497.0, + "58": 2939.0, + "59": 2877.0, + "60": 2326.0, + "61": 2871.0, + "62": 2654.0, + "63": 2428.0, + "64": 3017.0, + "65": 2721.0, + "66": 3212.0, + "67": 2706.0, + "68": 2877.0, + "69": 2929.0, + "70": 3147.0, + "71": 2970.0, + "72": 2362.0, + "73": 3092.0, + "74": 1964.0, + "75": 2648.0, + "76": 3014.0, + "77": 3562.0, + "78": 3371.0, + "79": 3369.0, + "80": 3457.0, + "81": 3675.0, + "82": 3516.0, + "83": 2891.0, + "84": 3362.0, + "85": 3249.0, + "86": 2711.0, + "87": 3770.0, + "88": 3008.0, + "89": 3409.0, + "90": 3052.0, + "91": 2694.0, + "92": 3142.0, + "93": 2631.0, + "94": 3394.0, + "95": 3371.0, + "96": 3517.0, + "97": 3190.0, + "98": 3808.0, + "99": 3258.0, + "100": 3248.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 436764672.0, + "2": 436764672.0, + "3": 436764672.0, + "4": 436764672.0, + "5": 436764672.0, + "6": 436764672.0, + "7": 436764672.0, + "8": 436764672.0, + "9": 436764672.0, + "10": 436764672.0, + "11": 436764672.0, + "12": 436764672.0, + "13": 436764672.0, + "14": 436764672.0, + "15": 436764672.0, + "16": 436764672.0, + "17": 436764672.0, + "18": 436764672.0, + "19": 436764672.0, + "20": 436764672.0, + "21": 436764672.0, + "22": 436764672.0, + "23": 436764672.0, + "24": 436764672.0, + "25": 436764672.0, + "26": 436764672.0, + "27": 436764672.0, + "28": 436764672.0, + "29": 436764672.0, + "30": 436764672.0, + "31": 436764672.0, + "32": 436764672.0, + "33": 436764672.0, + "34": 436764672.0, + "35": 436764672.0, + "36": 436764672.0, + "37": 436764672.0, + "38": 436764672.0, + "39": 436764672.0, + "40": 436764672.0, + "41": 436764672.0, + "42": 436764672.0, + "43": 436764672.0, + "44": 436764672.0, + "45": 436764672.0, + "46": 436764672.0, + "47": 436764672.0, + "48": 436764672.0, + "49": 436764672.0, + "50": 436764672.0, + "51": 436764672.0, + "52": 436764672.0, + "53": 436764672.0, + "54": 436764672.0, + "55": 436764672.0, + "56": 436764672.0, + "57": 436764672.0, + "58": 436764672.0, + "59": 436764672.0, + "60": 436764672.0, + "61": 436764672.0, + "62": 436764672.0, + "63": 436764672.0, + "64": 436764672.0, + "65": 436764672.0, + "66": 436764672.0, + "67": 436764672.0, + "68": 436764672.0, + "69": 436764672.0, + "70": 436764672.0, + "71": 436764672.0, + "72": 436764672.0, + "73": 436764672.0, + "74": 436764672.0, + "75": 436764672.0, + "76": 436764672.0, + "77": 436764672.0, + "78": 436764672.0, + "79": 436764672.0, + "80": 436764672.0, + "81": 436764672.0, + "82": 436764672.0, + "83": 436764672.0, + "84": 436764672.0, + "85": 436764672.0, + "86": 436764672.0, + "87": 436764672.0, + "88": 436764672.0, + "89": 436764672.0, + "90": 436764672.0, + "91": 436764672.0, + "92": 436764672.0, + "93": 436764672.0, + "94": 436764672.0, + "95": 436764672.0, + "96": 436764672.0, + "97": 436764672.0, + "98": 436764672.0, + "99": 436764672.0, + "100": 436764672.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1178635264.0, + "2": 1359495168.0, + "3": 1360411648.0, + "4": 1360411648.0, + "5": 1360411648.0, + "6": 1360411648.0, + "7": 1360411648.0, + "8": 1360411648.0, + "9": 1360411648.0, + "10": 1360411648.0, + "11": 1360411648.0, + "12": 1360411648.0, + "13": 1360411648.0, + "14": 1360411648.0, + "15": 1360411648.0, + "16": 1360411648.0, + "17": 1360411648.0, + "18": 1360411648.0, + "19": 1360411648.0, + "20": 1360411648.0, + "21": 1360411648.0, + "22": 1360411648.0, + "23": 1360411648.0, + "24": 1360411648.0, + "25": 1360411648.0, + "26": 1360411648.0, + "27": 1360411648.0, + "28": 1360411648.0, + "29": 1360411648.0, + "30": 1360411648.0, + "31": 1360411648.0, + "32": 1360411648.0, + "33": 1360411648.0, + "34": 1360411648.0, + "35": 1360411648.0, + "36": 1360411648.0, + "37": 1360411648.0, + "38": 1360411648.0, + "39": 1360411648.0, + "40": 1360411648.0, + "41": 1360411648.0, + "42": 1360411648.0, + "43": 1360411648.0, + "44": 1360411648.0, + "45": 1360411648.0, + "46": 1360411648.0, + "47": 1360411648.0, + "48": 1360411648.0, + "49": 1360411648.0, + "50": 1360411648.0, + "51": 1360411648.0, + "52": 1360411648.0, + "53": 1360411648.0, + "54": 1360411648.0, + "55": 1360411648.0, + "56": 1360411648.0, + "57": 1360411648.0, + "58": 1360411648.0, + "59": 1360411648.0, + "60": 1360411648.0, + "61": 1360411648.0, + "62": 1360411648.0, + "63": 1360411648.0, + "64": 1360411648.0, + "65": 1360411648.0, + "66": 1360411648.0, + "67": 1360411648.0, + "68": 1360411648.0, + "69": 1360411648.0, + "70": 1360411648.0, + "71": 1360411648.0, + "72": 1360411648.0, + "73": 1360411648.0, + "74": 1360411648.0, + "75": 1360411648.0, + "76": 1360411648.0, + "77": 1360411648.0, + "78": 1360411648.0, + "79": 1360411648.0, + "80": 1360411648.0, + "81": 1360411648.0, + "82": 1360411648.0, + "83": 1360411648.0, + "84": 1360411648.0, + "85": 1360411648.0, + "86": 1360411648.0, + "87": 1360411648.0, + "88": 1360411648.0, + "89": 1360411648.0, + "90": 1360411648.0, + "91": 1360411648.0, + "92": 1360411648.0, + "93": 1360411648.0, + "94": 1360411648.0, + "95": 1360411648.0, + "96": 1360411648.0, + "97": 1360411648.0, + "98": 1360411648.0, + "99": 1360411648.0, + "100": 1360411648.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.71622, + "2": 0.23087, + "3": 0.1951, + "4": 0.18861, + "5": 0.18812, + "6": 0.19385, + "7": 0.18893, + "8": 0.18851, + "9": 0.18797, + "10": 0.18883, + "11": 0.19316, + "12": 0.18894, + "13": 0.18809, + "14": 0.18851, + "15": 0.19062, + "16": 0.19113, + "17": 0.18987, + "18": 0.18872, + "19": 0.18621, + "20": 0.19006, + "21": 0.18925, + "22": 0.19544, + "23": 0.19322, + "24": 0.18957, + "25": 0.19074, + "26": 0.19316, + "27": 0.18825, + "28": 0.1874, + "29": 0.18747, + "30": 0.18693, + "31": 0.1865, + "32": 0.18917, + "33": 0.19083, + "34": 0.185, + "35": 0.18524, + "36": 0.18664, + "37": 0.18377, + "38": 0.18614, + "39": 0.18438, + "40": 0.18443, + "41": 0.18753, + "42": 0.1842, + "43": 0.18841, + "44": 0.18384, + "45": 0.18491, + "46": 0.18442, + "47": 0.18641, + "48": 0.18523, + "49": 0.18535, + "50": 0.18414, + "51": 0.19499, + "52": 0.18865, + "53": 0.18877, + "54": 0.18901, + "55": 0.18952, + "56": 0.18817, + "57": 0.18647, + "58": 0.19054, + "59": 0.18698, + "60": 0.19221, + "61": 0.1855, + "62": 0.18425, + "63": 0.18635, + "64": 0.18617, + "65": 0.18584, + "66": 0.18699, + "67": 0.18754, + "68": 0.18626, + "69": 0.18682, + "70": 0.37416, + "71": 0.18684, + "72": 0.18552, + "73": 0.18589, + "74": 0.18591, + "75": 0.19036, + "76": 0.18483, + "77": 0.18579, + "78": 0.18597, + "79": 0.1879, + "80": 0.18623, + "81": 0.18669, + "82": 0.18488, + "83": 0.18509, + "84": 0.18891, + "85": 0.18595, + "86": 0.18904, + "87": 0.18638, + "88": 0.18604, + "89": 0.18611, + "90": 0.18586, + "91": 0.18957, + "92": 0.18824, + "93": 0.18603, + "94": 0.18606, + "95": 0.18658, + "96": 0.18779, + "97": 0.18815, + "98": 0.18579, + "99": 0.186, + "100": 0.18722 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index e8a221fc47b..5ac3723f6cb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.84466, + "2": 10.84794, + "3": 10.84925, + "4": 10.84332, "5": 10.88244, + "6": 10.88079, + "7": 10.86575, + "8": 10.85546, + "9": 10.85543, "10": 10.81818, + "11": 10.88769, + "12": 10.8634, + "13": 10.86681, + "14": 10.88414, "15": 10.82464, + "16": 10.82854, + "17": 10.79491, + "18": 10.81492, + "19": 10.80133, "20": 10.7181, + "21": 10.69905, + "22": 10.56744, + "23": 10.717, + "24": 10.60443, "25": 10.55007, + "26": 10.60907, + "27": 10.62028, + "28": 10.5752, + "29": 10.59624, "30": 10.38327, + "31": 10.1537, + "32": 10.48026, + "33": 10.47378, + "34": 10.2366, "35": 10.28843, + "36": 10.24838, + "37": 10.35354, + "38": 10.20794, + "39": 10.41884, "40": 10.1122, + "41": 10.16092, + "42": 10.23301, + "43": 9.86118, + "44": 9.97698, "45": 9.86493, + "46": 9.84883, + "47": 10.16617, + "48": 9.87132, + "49": 9.56691, "50": 9.92114, + "51": 9.86695, + "52": 9.76956, + "53": 10.07809, + "54": 9.97027, "55": 9.89683, + "56": 9.64394, + "57": 9.49728, + "58": 9.84867, + "59": 9.59977, "60": 9.50631, + "61": 9.71011, + "62": 9.99101, + "63": 9.38968, + "64": 9.78595, "65": 8.95983, + "66": 9.70876, + "67": 9.37892, + "68": 9.79599, + "69": 9.80666, "70": 9.74795, + "71": 9.61779, + "72": 9.59127, + "73": 9.50398, + "74": 8.94624, "75": 9.42942, + "76": 9.08423, + "77": 10.06698, + "78": 9.73256, + "79": 9.38117, "80": 9.41061, + "81": 9.48289, + "82": 9.70492, + "83": 9.30713, + "84": 9.42241, "85": 9.61802, + "86": 9.07631, + "87": 9.59382, + "88": 9.75419, + "89": 9.60093, "90": 9.82013, + "91": 9.3407, + "92": 9.35717, + "93": 9.07927, + "94": 8.83613, "95": 9.5223, + "96": 9.53379, + "97": 9.31633, + "98": 9.68007, + "99": 8.89242, "100": 9.39964 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1770.0, + "2": 1809.0, + "3": 1782.0, + "4": 1916.0, "5": 1973.0, + "6": 1955.0, + "7": 2046.0, + "8": 1773.0, + "9": 1815.0, "10": 1432.0, + "11": 1961.0, + "12": 1828.0, + "13": 1967.0, + "14": 1825.0, "15": 1980.0, + "16": 1889.0, + "17": 1866.0, + "18": 1827.0, + "19": 1876.0, "20": 1715.0, + "21": 2046.0, + "22": 1872.0, + "23": 2168.0, + "24": 1814.0, "25": 1715.0, + "26": 1721.0, + "27": 1822.0, + "28": 2102.0, + "29": 2112.0, "30": 2020.0, + "31": 1569.0, + "32": 2022.0, + "33": 2256.0, + "34": 1884.0, "35": 2034.0, + "36": 2027.0, + "37": 2438.0, + "38": 2363.0, + "39": 2526.0, "40": 2254.0, + "41": 2328.0, + "42": 2409.0, + "43": 2126.0, + "44": 2166.0, "45": 2230.0, + "46": 2487.0, + "47": 2605.0, + "48": 2351.0, + "49": 2413.0, "50": 2274.0, + "51": 2579.0, + "52": 2508.0, + "53": 2879.0, + "54": 2744.0, "55": 2402.0, + "56": 2720.0, + "57": 2384.0, + "58": 3002.0, + "59": 2743.0, "60": 2457.0, + "61": 2976.0, + "62": 2631.0, + "63": 2349.0, + "64": 3077.0, "65": 2634.0, + "66": 3076.0, + "67": 2906.0, + "68": 2759.0, + "69": 2907.0, "70": 3045.0, + "71": 3159.0, + "72": 2506.0, + "73": 2956.0, + "74": 1945.0, "75": 2467.0, + "76": 2979.0, + "77": 3209.0, + "78": 3122.0, + "79": 3048.0, "80": 3389.0, + "81": 3799.0, + "82": 3272.0, + "83": 2962.0, + "84": 3328.0, "85": 3462.0, + "86": 3071.0, + "87": 3900.0, + "88": 3128.0, + "89": 3469.0, "90": 3095.0, + "91": 2769.0, + "92": 3168.0, + "93": 2713.0, + "94": 3416.0, "95": 3515.0, + "96": 3425.0, + "97": 3223.0, + "98": 3769.0, + "99": 3230.0, "100": 3219.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 246998528.0, + "2": 246998528.0, + "3": 246998528.0, + "4": 246998528.0, "5": 246998528.0, + "6": 246998528.0, + "7": 246998528.0, + "8": 246998528.0, + "9": 246998528.0, "10": 246998528.0, + "11": 246998528.0, + "12": 246998528.0, + "13": 246998528.0, + "14": 246998528.0, "15": 246998528.0, + "16": 246998528.0, + "17": 246998528.0, + "18": 246998528.0, + "19": 246998528.0, "20": 246998528.0, + "21": 246998528.0, + "22": 246998528.0, + "23": 246998528.0, + "24": 246998528.0, "25": 246998528.0, + "26": 246998528.0, + "27": 246998528.0, + "28": 246998528.0, + "29": 246998528.0, "30": 246998528.0, + "31": 246998528.0, + "32": 246998528.0, + "33": 246998528.0, + "34": 246998528.0, "35": 246998528.0, + "36": 246998528.0, + "37": 246998528.0, + "38": 246998528.0, + "39": 246998528.0, "40": 246998528.0, + "41": 246998528.0, + "42": 246998528.0, + "43": 246998528.0, + "44": 246998528.0, "45": 246998528.0, + "46": 246998528.0, + "47": 246998528.0, + "48": 246998528.0, + "49": 246998528.0, "50": 246998528.0, + "51": 246998528.0, + "52": 246998528.0, + "53": 246998528.0, + "54": 246998528.0, "55": 246998528.0, + "56": 246998528.0, + "57": 246998528.0, + "58": 246998528.0, + "59": 246998528.0, "60": 246998528.0, + "61": 246998528.0, + "62": 246998528.0, + "63": 246998528.0, + "64": 246998528.0, "65": 246998528.0, + "66": 246998528.0, + "67": 246998528.0, + "68": 246998528.0, + "69": 246998528.0, "70": 246998528.0, + "71": 246998528.0, + "72": 246998528.0, + "73": 246998528.0, + "74": 246998528.0, "75": 246998528.0, + "76": 246998528.0, + "77": 246998528.0, + "78": 246998528.0, + "79": 246998528.0, "80": 246998528.0, + "81": 246998528.0, + "82": 246998528.0, + "83": 246998528.0, + "84": 246998528.0, "85": 246998528.0, + "86": 246998528.0, + "87": 246998528.0, + "88": 246998528.0, + "89": 246998528.0, "90": 246998528.0, + "91": 246998528.0, + "92": 246998528.0, + "93": 246998528.0, + "94": 246998528.0, "95": 246998528.0, + "96": 246998528.0, + "97": 246998528.0, + "98": 246998528.0, + "99": 246998528.0, "100": 246998528.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1503207936.0, + "2": 1503208960.0, + "3": 1503208960.0, + "4": 1503208960.0, "5": 1503208960.0, + "6": 1503208960.0, + "7": 1503208960.0, + "8": 1503208960.0, + "9": 1503208960.0, "10": 1503208960.0, + "11": 1503208960.0, + "12": 1503208960.0, + "13": 1503208960.0, + "14": 1503208960.0, "15": 1503208960.0, + "16": 1503208960.0, + "17": 1503208960.0, + "18": 1503208960.0, + "19": 1503208960.0, "20": 1503208960.0, + "21": 1503208960.0, + "22": 1503208960.0, + "23": 1503208960.0, + "24": 1503208960.0, "25": 1503208960.0, + "26": 1503208960.0, + "27": 1503208960.0, + "28": 1503208960.0, + "29": 1503208960.0, "30": 1503208960.0, + "31": 1503208960.0, + "32": 1503208960.0, + "33": 1503208960.0, + "34": 1503208960.0, "35": 1503208960.0, + "36": 1503208960.0, + "37": 1503208960.0, + "38": 1503208960.0, + "39": 1503208960.0, "40": 1503208960.0, + "41": 1503208960.0, + "42": 1503208960.0, + "43": 1503208960.0, + "44": 1503208960.0, "45": 1503208960.0, + "46": 1503208960.0, + "47": 1503208960.0, + "48": 1503208960.0, + "49": 1503208960.0, "50": 1503208960.0, + "51": 1503208960.0, + "52": 1503208960.0, + "53": 1503208960.0, + "54": 1503208960.0, "55": 1503208960.0, + "56": 1503208960.0, + "57": 1503208960.0, + "58": 1503208960.0, + "59": 1503208960.0, "60": 1503208960.0, + "61": 1503208960.0, + "62": 1503208960.0, + "63": 1503208960.0, + "64": 1503208960.0, "65": 1503208960.0, + "66": 1503208960.0, + "67": 1503208960.0, + "68": 1503208960.0, + "69": 1503208960.0, "70": 1503208960.0, + "71": 1503208960.0, + "72": 1503208960.0, + "73": 1503208960.0, + "74": 1503208960.0, "75": 1503208960.0, + "76": 1503208960.0, + "77": 1503208960.0, + "78": 1503208960.0, + "79": 1503208960.0, "80": 1503208960.0, + "81": 1503208960.0, + "82": 1503208960.0, + "83": 1503208960.0, + "84": 1503208960.0, "85": 1503208960.0, + "86": 1503208960.0, + "87": 1503208960.0, + "88": 1503208960.0, + "89": 1503208960.0, "90": 1503208960.0, + "91": 1503208960.0, + "92": 1503208960.0, + "93": 1503208960.0, + "94": 1503208960.0, "95": 1503208960.0, + "96": 1503208960.0, + "97": 1503208960.0, + "98": 1503208960.0, + "99": 1503208960.0, "100": 1503208960.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 7.35335, - "5": 0.15349, - "10": 0.15437, - "15": 0.15387, - "20": 0.15054, - "25": 0.15011, - "30": 0.15223, - "35": 0.15279, - "40": 0.15254, - "45": 0.14885, - "50": 0.15116, - "55": 0.15076, - "60": 0.15109, - "65": 0.15214, - "70": 0.15048, - "75": 0.15013, - "80": 0.15119, - "85": 0.15129, - "90": 0.15233, - "95": 0.14802, - "100": 0.15191 + "1": 6.97838, + "2": 0.1863, + "3": 0.17806, + "4": 0.17695, + "5": 0.17974, + "6": 0.17764, + "7": 0.18024, + "8": 0.17572, + "9": 0.179, + "10": 0.17802, + "11": 0.17798, + "12": 0.18743, + "13": 0.18184, + "14": 0.18624, + "15": 0.1848, + "16": 0.18027, + "17": 0.17452, + "18": 0.17844, + "19": 0.17971, + "20": 0.17848, + "21": 0.17704, + "22": 0.17765, + "23": 0.17541, + "24": 0.17687, + "25": 0.1788, + "26": 0.17648, + "27": 0.17818, + "28": 0.17831, + "29": 0.17674, + "30": 0.17588, + "31": 0.17953, + "32": 0.17664, + "33": 0.17688, + "34": 0.17669, + "35": 0.1745, + "36": 0.1776, + "37": 0.17613, + "38": 0.17723, + "39": 0.17434, + "40": 0.17681, + "41": 0.17485, + "42": 0.17993, + "43": 0.174, + "44": 0.17741, + "45": 0.17457, + "46": 0.1789, + "47": 0.17735, + "48": 0.17895, + "49": 0.17421, + "50": 0.17774, + "51": 0.17494, + "52": 0.1787, + "53": 0.17718, + "54": 0.18021, + "55": 0.17484, + "56": 0.17693, + "57": 0.178, + "58": 0.17576, + "59": 0.17632, + "60": 0.17804, + "61": 0.17762, + "62": 0.1744, + "63": 0.17562, + "64": 0.17641, + "65": 0.1776, + "66": 0.18194, + "67": 0.17871, + "68": 0.17591, + "69": 0.17673, + "70": 0.17758, + "71": 0.17616, + "72": 0.17993, + "73": 0.17721, + "74": 0.17901, + "75": 0.1779, + "76": 0.17874, + "77": 0.17769, + "78": 0.17877, + "79": 0.17963, + "80": 0.1772, + "81": 0.18363, + "82": 0.175, + "83": 0.17819, + "84": 0.17813, + "85": 0.17602, + "86": 0.17627, + "87": 0.17621, + "88": 0.17721, + "89": 0.17686, + "90": 0.17595, + "91": 0.17984, + "92": 0.17771, + "93": 0.17526, + "94": 0.17818, + "95": 0.17734, + "96": 0.18252, + "97": 0.186, + "98": 0.1736, + "99": 0.17768, + "100": 0.17699 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..492a25fb45e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84466, + "2": 10.84794, + "3": 10.84925, + "4": 10.84332, + "5": 10.88244, + "6": 10.88079, + "7": 10.86575, + "8": 10.85546, + "9": 10.85543, + "10": 10.81818, + "11": 10.88769, + "12": 10.8634, + "13": 10.86681, + "14": 10.88414, + "15": 10.82464, + "16": 10.82854, + "17": 10.79491, + "18": 10.81492, + "19": 10.80133, + "20": 10.7181, + "21": 10.69905, + "22": 10.56744, + "23": 10.717, + "24": 10.60443, + "25": 10.55007, + "26": 10.60907, + "27": 10.62028, + "28": 10.5752, + "29": 10.59624, + "30": 10.38327, + "31": 10.1537, + "32": 10.48026, + "33": 10.47378, + "34": 10.2366, + "35": 10.28843, + "36": 10.24838, + "37": 10.35354, + "38": 10.20794, + "39": 10.41884, + "40": 10.1122, + "41": 10.16092, + "42": 10.23301, + "43": 9.86118, + "44": 9.97698, + "45": 9.86493, + "46": 9.84883, + "47": 10.16617, + "48": 9.87132, + "49": 9.56691, + "50": 9.92114, + "51": 9.86695, + "52": 9.76956, + "53": 10.07809, + "54": 9.97027, + "55": 9.89683, + "56": 9.64394, + "57": 9.49728, + "58": 9.84867, + "59": 9.59977, + "60": 9.50631, + "61": 9.71011, + "62": 9.99101, + "63": 9.38968, + "64": 9.78595, + "65": 8.95983, + "66": 9.70876, + "67": 9.37892, + "68": 9.79599, + "69": 9.80666, + "70": 9.74795, + "71": 9.61779, + "72": 9.59127, + "73": 9.50398, + "74": 8.94624, + "75": 9.42942, + "76": 9.08423, + "77": 10.06698, + "78": 9.73256, + "79": 9.38117, + "80": 9.41061, + "81": 9.48289, + "82": 9.70492, + "83": 9.30713, + "84": 9.42241, + "85": 9.61802, + "86": 9.07631, + "87": 9.59382, + "88": 9.75419, + "89": 9.60093, + "90": 9.82013, + "91": 9.3407, + "92": 9.35717, + "93": 9.07927, + "94": 8.83613, + "95": 9.5223, + "96": 9.53379, + "97": 9.31633, + "98": 9.68007, + "99": 8.89242, + "100": 9.39964 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1770.0, + "2": 1809.0, + "3": 1782.0, + "4": 1916.0, + "5": 1973.0, + "6": 1955.0, + "7": 2046.0, + "8": 1773.0, + "9": 1815.0, + "10": 1432.0, + "11": 1961.0, + "12": 1828.0, + "13": 1967.0, + "14": 1825.0, + "15": 1980.0, + "16": 1889.0, + "17": 1866.0, + "18": 1827.0, + "19": 1876.0, + "20": 1715.0, + "21": 2046.0, + "22": 1872.0, + "23": 2168.0, + "24": 1814.0, + "25": 1715.0, + "26": 1721.0, + "27": 1822.0, + "28": 2102.0, + "29": 2112.0, + "30": 2020.0, + "31": 1569.0, + "32": 2022.0, + "33": 2256.0, + "34": 1884.0, + "35": 2034.0, + "36": 2027.0, + "37": 2438.0, + "38": 2363.0, + "39": 2526.0, + "40": 2254.0, + "41": 2328.0, + "42": 2409.0, + "43": 2126.0, + "44": 2166.0, + "45": 2230.0, + "46": 2487.0, + "47": 2605.0, + "48": 2351.0, + "49": 2413.0, + "50": 2274.0, + "51": 2579.0, + "52": 2508.0, + "53": 2879.0, + "54": 2744.0, + "55": 2402.0, + "56": 2720.0, + "57": 2384.0, + "58": 3002.0, + "59": 2743.0, + "60": 2457.0, + "61": 2976.0, + "62": 2631.0, + "63": 2349.0, + "64": 3077.0, + "65": 2634.0, + "66": 3076.0, + "67": 2906.0, + "68": 2759.0, + "69": 2907.0, + "70": 3045.0, + "71": 3159.0, + "72": 2506.0, + "73": 2956.0, + "74": 1945.0, + "75": 2467.0, + "76": 2979.0, + "77": 3209.0, + "78": 3122.0, + "79": 3048.0, + "80": 3389.0, + "81": 3799.0, + "82": 3272.0, + "83": 2962.0, + "84": 3328.0, + "85": 3462.0, + "86": 3071.0, + "87": 3900.0, + "88": 3128.0, + "89": 3469.0, + "90": 3095.0, + "91": 2769.0, + "92": 3168.0, + "93": 2713.0, + "94": 3416.0, + "95": 3515.0, + "96": 3425.0, + "97": 3223.0, + "98": 3769.0, + "99": 3230.0, + "100": 3219.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 246998528.0, + "2": 246998528.0, + "3": 246998528.0, + "4": 246998528.0, + "5": 246998528.0, + "6": 246998528.0, + "7": 246998528.0, + "8": 246998528.0, + "9": 246998528.0, + "10": 246998528.0, + "11": 246998528.0, + "12": 246998528.0, + "13": 246998528.0, + "14": 246998528.0, + "15": 246998528.0, + "16": 246998528.0, + "17": 246998528.0, + "18": 246998528.0, + "19": 246998528.0, + "20": 246998528.0, + "21": 246998528.0, + "22": 246998528.0, + "23": 246998528.0, + "24": 246998528.0, + "25": 246998528.0, + "26": 246998528.0, + "27": 246998528.0, + "28": 246998528.0, + "29": 246998528.0, + "30": 246998528.0, + "31": 246998528.0, + "32": 246998528.0, + "33": 246998528.0, + "34": 246998528.0, + "35": 246998528.0, + "36": 246998528.0, + "37": 246998528.0, + "38": 246998528.0, + "39": 246998528.0, + "40": 246998528.0, + "41": 246998528.0, + "42": 246998528.0, + "43": 246998528.0, + "44": 246998528.0, + "45": 246998528.0, + "46": 246998528.0, + "47": 246998528.0, + "48": 246998528.0, + "49": 246998528.0, + "50": 246998528.0, + "51": 246998528.0, + "52": 246998528.0, + "53": 246998528.0, + "54": 246998528.0, + "55": 246998528.0, + "56": 246998528.0, + "57": 246998528.0, + "58": 246998528.0, + "59": 246998528.0, + "60": 246998528.0, + "61": 246998528.0, + "62": 246998528.0, + "63": 246998528.0, + "64": 246998528.0, + "65": 246998528.0, + "66": 246998528.0, + "67": 246998528.0, + "68": 246998528.0, + "69": 246998528.0, + "70": 246998528.0, + "71": 246998528.0, + "72": 246998528.0, + "73": 246998528.0, + "74": 246998528.0, + "75": 246998528.0, + "76": 246998528.0, + "77": 246998528.0, + "78": 246998528.0, + "79": 246998528.0, + "80": 246998528.0, + "81": 246998528.0, + "82": 246998528.0, + "83": 246998528.0, + "84": 246998528.0, + "85": 246998528.0, + "86": 246998528.0, + "87": 246998528.0, + "88": 246998528.0, + "89": 246998528.0, + "90": 246998528.0, + "91": 246998528.0, + "92": 246998528.0, + "93": 246998528.0, + "94": 246998528.0, + "95": 246998528.0, + "96": 246998528.0, + "97": 246998528.0, + "98": 246998528.0, + "99": 246998528.0, + "100": 246998528.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1503207936.0, + "2": 1503208960.0, + "3": 1503208960.0, + "4": 1503208960.0, + "5": 1503208960.0, + "6": 1503208960.0, + "7": 1503208960.0, + "8": 1503208960.0, + "9": 1503208960.0, + "10": 1503208960.0, + "11": 1503208960.0, + "12": 1503208960.0, + "13": 1503208960.0, + "14": 1503208960.0, + "15": 1503208960.0, + "16": 1503208960.0, + "17": 1503208960.0, + "18": 1503208960.0, + "19": 1503208960.0, + "20": 1503208960.0, + "21": 1503208960.0, + "22": 1503208960.0, + "23": 1503208960.0, + "24": 1503208960.0, + "25": 1503208960.0, + "26": 1503208960.0, + "27": 1503208960.0, + "28": 1503208960.0, + "29": 1503208960.0, + "30": 1503208960.0, + "31": 1503208960.0, + "32": 1503208960.0, + "33": 1503208960.0, + "34": 1503208960.0, + "35": 1503208960.0, + "36": 1503208960.0, + "37": 1503208960.0, + "38": 1503208960.0, + "39": 1503208960.0, + "40": 1503208960.0, + "41": 1503208960.0, + "42": 1503208960.0, + "43": 1503208960.0, + "44": 1503208960.0, + "45": 1503208960.0, + "46": 1503208960.0, + "47": 1503208960.0, + "48": 1503208960.0, + "49": 1503208960.0, + "50": 1503208960.0, + "51": 1503208960.0, + "52": 1503208960.0, + "53": 1503208960.0, + "54": 1503208960.0, + "55": 1503208960.0, + "56": 1503208960.0, + "57": 1503208960.0, + "58": 1503208960.0, + "59": 1503208960.0, + "60": 1503208960.0, + "61": 1503208960.0, + "62": 1503208960.0, + "63": 1503208960.0, + "64": 1503208960.0, + "65": 1503208960.0, + "66": 1503208960.0, + "67": 1503208960.0, + "68": 1503208960.0, + "69": 1503208960.0, + "70": 1503208960.0, + "71": 1503208960.0, + "72": 1503208960.0, + "73": 1503208960.0, + "74": 1503208960.0, + "75": 1503208960.0, + "76": 1503208960.0, + "77": 1503208960.0, + "78": 1503208960.0, + "79": 1503208960.0, + "80": 1503208960.0, + "81": 1503208960.0, + "82": 1503208960.0, + "83": 1503208960.0, + "84": 1503208960.0, + "85": 1503208960.0, + "86": 1503208960.0, + "87": 1503208960.0, + "88": 1503208960.0, + "89": 1503208960.0, + "90": 1503208960.0, + "91": 1503208960.0, + "92": 1503208960.0, + "93": 1503208960.0, + "94": 1503208960.0, + "95": 1503208960.0, + "96": 1503208960.0, + "97": 1503208960.0, + "98": 1503208960.0, + "99": 1503208960.0, + "100": 1503208960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.36893, + "2": 0.17749, + "3": 0.15483, + "4": 3.4076, + "5": 1.15474, + "6": 1.45655, + "7": 0.15757, + "8": 0.15389, + "9": 0.47498, + "10": 0.16518, + "11": 0.23414, + "12": 0.15815, + "13": 0.15818, + "14": 0.15719, + "15": 0.15462, + "16": 0.16906, + "17": 0.159, + "18": 0.1595, + "19": 0.15825, + "20": 0.15699, + "21": 0.17023, + "22": 0.15299, + "23": 0.15858, + "24": 0.15811, + "25": 0.16082, + "26": 0.15919, + "27": 0.17036, + "28": 0.15511, + "29": 0.15676, + "30": 0.15849, + "31": 0.15691, + "32": 0.1571, + "33": 0.16802, + "34": 0.154, + "35": 0.15309, + "36": 0.15721, + "37": 0.15869, + "38": 0.16016, + "39": 0.15701, + "40": 0.15638, + "41": 0.15569, + "42": 0.15701, + "43": 0.16024, + "44": 0.15954, + "45": 0.16076, + "46": 0.15945, + "47": 0.15824, + "48": 0.15782, + "49": 0.15911, + "50": 0.15934, + "51": 0.15705, + "52": 0.17206, + "53": 0.17271, + "54": 0.17349, + "55": 0.17496, + "56": 0.16409, + "57": 0.16373, + "58": 0.16199, + "59": 0.16729, + "60": 0.16491, + "61": 0.1652, + "62": 0.17265, + "63": 0.17309, + "64": 0.15548, + "65": 0.15692, + "66": 0.16524, + "67": 0.15305, + "68": 0.16651, + "69": 0.15491, + "70": 0.15396, + "71": 0.15455, + "72": 0.16248, + "73": 0.15552, + "74": 0.1536, + "75": 0.15797, + "76": 0.15557, + "77": 0.15511, + "78": 0.16464, + "79": 0.15523, + "80": 0.15671, + "81": 0.15374, + "82": 0.15657, + "83": 0.16295, + "84": 0.15794, + "85": 0.15777, + "86": 0.15529, + "87": 0.16089, + "88": 0.15599, + "89": 0.16869, + "90": 0.15607, + "91": 0.15589, + "92": 0.15613, + "93": 0.15487, + "94": 0.15658, + "95": 0.16587, + "96": 0.1565, + "97": 0.15642, + "98": 0.15538, + "99": 0.15622, + "100": 0.16269 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..dbfceceac77 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84466, + "2": 10.84794, + "3": 10.84925, + "4": 10.84332, + "5": 10.88244, + "6": 10.88079, + "7": 10.86575, + "8": 10.85546, + "9": 10.85543, + "10": 10.81818, + "11": 10.88769, + "12": 10.8634, + "13": 10.86681, + "14": 10.88414, + "15": 10.82464, + "16": 10.82854, + "17": 10.79491, + "18": 10.81492, + "19": 10.80133, + "20": 10.7181, + "21": 10.69905, + "22": 10.56744, + "23": 10.717, + "24": 10.60443, + "25": 10.55007, + "26": 10.60907, + "27": 10.62028, + "28": 10.5752, + "29": 10.59624, + "30": 10.38327, + "31": 10.1537, + "32": 10.48026, + "33": 10.47378, + "34": 10.2366, + "35": 10.28843, + "36": 10.24838, + "37": 10.35354, + "38": 10.20794, + "39": 10.41884, + "40": 10.1122, + "41": 10.16092, + "42": 10.23301, + "43": 9.86118, + "44": 9.97698, + "45": 9.86493, + "46": 9.84883, + "47": 10.16617, + "48": 9.87132, + "49": 9.56691, + "50": 9.92114, + "51": 9.86695, + "52": 9.76956, + "53": 10.07809, + "54": 9.97027, + "55": 9.89683, + "56": 9.64394, + "57": 9.49728, + "58": 9.84867, + "59": 9.59977, + "60": 9.50631, + "61": 9.71011, + "62": 9.99101, + "63": 9.38968, + "64": 9.78595, + "65": 8.95983, + "66": 9.70876, + "67": 9.37892, + "68": 9.79599, + "69": 9.80666, + "70": 9.74795, + "71": 9.61779, + "72": 9.59127, + "73": 9.50398, + "74": 8.94624, + "75": 9.42942, + "76": 9.08423, + "77": 10.06698, + "78": 9.73256, + "79": 9.38117, + "80": 9.41061, + "81": 9.48289, + "82": 9.70492, + "83": 9.30713, + "84": 9.42241, + "85": 9.61802, + "86": 9.07631, + "87": 9.59382, + "88": 9.75419, + "89": 9.60093, + "90": 9.82013, + "91": 9.3407, + "92": 9.35717, + "93": 9.07927, + "94": 8.83613, + "95": 9.5223, + "96": 9.53379, + "97": 9.31633, + "98": 9.68007, + "99": 8.89242, + "100": 9.39964 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1770.0, + "2": 1809.0, + "3": 1782.0, + "4": 1916.0, + "5": 1973.0, + "6": 1955.0, + "7": 2046.0, + "8": 1773.0, + "9": 1815.0, + "10": 1432.0, + "11": 1961.0, + "12": 1828.0, + "13": 1967.0, + "14": 1825.0, + "15": 1980.0, + "16": 1889.0, + "17": 1866.0, + "18": 1827.0, + "19": 1876.0, + "20": 1715.0, + "21": 2046.0, + "22": 1872.0, + "23": 2168.0, + "24": 1814.0, + "25": 1715.0, + "26": 1721.0, + "27": 1822.0, + "28": 2102.0, + "29": 2112.0, + "30": 2020.0, + "31": 1569.0, + "32": 2022.0, + "33": 2256.0, + "34": 1884.0, + "35": 2034.0, + "36": 2027.0, + "37": 2438.0, + "38": 2363.0, + "39": 2526.0, + "40": 2254.0, + "41": 2328.0, + "42": 2409.0, + "43": 2126.0, + "44": 2166.0, + "45": 2230.0, + "46": 2487.0, + "47": 2605.0, + "48": 2351.0, + "49": 2413.0, + "50": 2274.0, + "51": 2579.0, + "52": 2508.0, + "53": 2879.0, + "54": 2744.0, + "55": 2402.0, + "56": 2720.0, + "57": 2384.0, + "58": 3002.0, + "59": 2743.0, + "60": 2457.0, + "61": 2976.0, + "62": 2631.0, + "63": 2349.0, + "64": 3077.0, + "65": 2634.0, + "66": 3076.0, + "67": 2906.0, + "68": 2759.0, + "69": 2907.0, + "70": 3045.0, + "71": 3159.0, + "72": 2506.0, + "73": 2956.0, + "74": 1945.0, + "75": 2467.0, + "76": 2979.0, + "77": 3209.0, + "78": 3122.0, + "79": 3048.0, + "80": 3389.0, + "81": 3799.0, + "82": 3272.0, + "83": 2962.0, + "84": 3328.0, + "85": 3462.0, + "86": 3071.0, + "87": 3900.0, + "88": 3128.0, + "89": 3469.0, + "90": 3095.0, + "91": 2769.0, + "92": 3168.0, + "93": 2713.0, + "94": 3416.0, + "95": 3515.0, + "96": 3425.0, + "97": 3223.0, + "98": 3769.0, + "99": 3230.0, + "100": 3219.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 246998528.0, + "2": 246998528.0, + "3": 246998528.0, + "4": 246998528.0, + "5": 246998528.0, + "6": 246998528.0, + "7": 246998528.0, + "8": 246998528.0, + "9": 246998528.0, + "10": 246998528.0, + "11": 246998528.0, + "12": 246998528.0, + "13": 246998528.0, + "14": 246998528.0, + "15": 246998528.0, + "16": 246998528.0, + "17": 246998528.0, + "18": 246998528.0, + "19": 246998528.0, + "20": 246998528.0, + "21": 246998528.0, + "22": 246998528.0, + "23": 246998528.0, + "24": 246998528.0, + "25": 246998528.0, + "26": 246998528.0, + "27": 246998528.0, + "28": 246998528.0, + "29": 246998528.0, + "30": 246998528.0, + "31": 246998528.0, + "32": 246998528.0, + "33": 246998528.0, + "34": 246998528.0, + "35": 246998528.0, + "36": 246998528.0, + "37": 246998528.0, + "38": 246998528.0, + "39": 246998528.0, + "40": 246998528.0, + "41": 246998528.0, + "42": 246998528.0, + "43": 246998528.0, + "44": 246998528.0, + "45": 246998528.0, + "46": 246998528.0, + "47": 246998528.0, + "48": 246998528.0, + "49": 246998528.0, + "50": 246998528.0, + "51": 246998528.0, + "52": 246998528.0, + "53": 246998528.0, + "54": 246998528.0, + "55": 246998528.0, + "56": 246998528.0, + "57": 246998528.0, + "58": 246998528.0, + "59": 246998528.0, + "60": 246998528.0, + "61": 246998528.0, + "62": 246998528.0, + "63": 246998528.0, + "64": 246998528.0, + "65": 246998528.0, + "66": 246998528.0, + "67": 246998528.0, + "68": 246998528.0, + "69": 246998528.0, + "70": 246998528.0, + "71": 246998528.0, + "72": 246998528.0, + "73": 246998528.0, + "74": 246998528.0, + "75": 246998528.0, + "76": 246998528.0, + "77": 246998528.0, + "78": 246998528.0, + "79": 246998528.0, + "80": 246998528.0, + "81": 246998528.0, + "82": 246998528.0, + "83": 246998528.0, + "84": 246998528.0, + "85": 246998528.0, + "86": 246998528.0, + "87": 246998528.0, + "88": 246998528.0, + "89": 246998528.0, + "90": 246998528.0, + "91": 246998528.0, + "92": 246998528.0, + "93": 246998528.0, + "94": 246998528.0, + "95": 246998528.0, + "96": 246998528.0, + "97": 246998528.0, + "98": 246998528.0, + "99": 246998528.0, + "100": 246998528.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1503207936.0, + "2": 1503208960.0, + "3": 1503208960.0, + "4": 1503208960.0, + "5": 1503208960.0, + "6": 1503208960.0, + "7": 1503208960.0, + "8": 1503208960.0, + "9": 1503208960.0, + "10": 1503208960.0, + "11": 1503208960.0, + "12": 1503208960.0, + "13": 1503208960.0, + "14": 1503208960.0, + "15": 1503208960.0, + "16": 1503208960.0, + "17": 1503208960.0, + "18": 1503208960.0, + "19": 1503208960.0, + "20": 1503208960.0, + "21": 1503208960.0, + "22": 1503208960.0, + "23": 1503208960.0, + "24": 1503208960.0, + "25": 1503208960.0, + "26": 1503208960.0, + "27": 1503208960.0, + "28": 1503208960.0, + "29": 1503208960.0, + "30": 1503208960.0, + "31": 1503208960.0, + "32": 1503208960.0, + "33": 1503208960.0, + "34": 1503208960.0, + "35": 1503208960.0, + "36": 1503208960.0, + "37": 1503208960.0, + "38": 1503208960.0, + "39": 1503208960.0, + "40": 1503208960.0, + "41": 1503208960.0, + "42": 1503208960.0, + "43": 1503208960.0, + "44": 1503208960.0, + "45": 1503208960.0, + "46": 1503208960.0, + "47": 1503208960.0, + "48": 1503208960.0, + "49": 1503208960.0, + "50": 1503208960.0, + "51": 1503208960.0, + "52": 1503208960.0, + "53": 1503208960.0, + "54": 1503208960.0, + "55": 1503208960.0, + "56": 1503208960.0, + "57": 1503208960.0, + "58": 1503208960.0, + "59": 1503208960.0, + "60": 1503208960.0, + "61": 1503208960.0, + "62": 1503208960.0, + "63": 1503208960.0, + "64": 1503208960.0, + "65": 1503208960.0, + "66": 1503208960.0, + "67": 1503208960.0, + "68": 1503208960.0, + "69": 1503208960.0, + "70": 1503208960.0, + "71": 1503208960.0, + "72": 1503208960.0, + "73": 1503208960.0, + "74": 1503208960.0, + "75": 1503208960.0, + "76": 1503208960.0, + "77": 1503208960.0, + "78": 1503208960.0, + "79": 1503208960.0, + "80": 1503208960.0, + "81": 1503208960.0, + "82": 1503208960.0, + "83": 1503208960.0, + "84": 1503208960.0, + "85": 1503208960.0, + "86": 1503208960.0, + "87": 1503208960.0, + "88": 1503208960.0, + "89": 1503208960.0, + "90": 1503208960.0, + "91": 1503208960.0, + "92": 1503208960.0, + "93": 1503208960.0, + "94": 1503208960.0, + "95": 1503208960.0, + "96": 1503208960.0, + "97": 1503208960.0, + "98": 1503208960.0, + "99": 1503208960.0, + "100": 1503208960.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.90789, + "2": 0.23993, + "3": 0.20829, + "4": 0.18489, + "5": 0.18237, + "6": 0.17507, + "7": 0.17401, + "8": 0.17758, + "9": 0.17734, + "10": 0.17577, + "11": 0.17329, + "12": 0.17635, + "13": 0.17559, + "14": 0.17588, + "15": 0.17556, + "16": 0.17798, + "17": 0.17347, + "18": 0.17346, + "19": 0.17675, + "20": 0.17518, + "21": 0.17864, + "22": 0.17833, + "23": 0.1827, + "24": 0.1775, + "25": 0.17745, + "26": 0.1755, + "27": 0.17594, + "28": 0.18475, + "29": 0.17599, + "30": 0.17452, + "31": 0.17601, + "32": 0.17743, + "33": 0.17355, + "34": 0.18205, + "35": 0.17672, + "36": 0.17728, + "37": 0.17438, + "38": 0.17752, + "39": 0.18463, + "40": 0.17673, + "41": 0.17505, + "42": 0.17657, + "43": 0.1769, + "44": 0.19406, + "45": 0.20743, + "46": 0.18263, + "47": 0.16986, + "48": 0.17268, + "49": 0.17404, + "50": 0.17381, + "51": 0.1735, + "52": 0.1693, + "53": 0.17058, + "54": 0.17247, + "55": 0.1773, + "56": 0.17259, + "57": 0.17109, + "58": 0.17178, + "59": 0.17167, + "60": 0.17568, + "61": 0.17729, + "62": 0.16999, + "63": 0.17091, + "64": 0.17034, + "65": 0.17236, + "66": 0.17625, + "67": 0.17591, + "68": 0.17126, + "69": 0.17159, + "70": 0.17123, + "71": 0.17221, + "72": 0.17877, + "73": 0.17426, + "74": 0.17035, + "75": 0.1721, + "76": 0.17327, + "77": 0.17396, + "78": 0.17631, + "79": 0.17485, + "80": 0.17347, + "81": 0.17358, + "82": 0.17087, + "83": 0.17164, + "84": 0.17784, + "85": 0.17401, + "86": 0.18008, + "87": 0.17399, + "88": 0.17322, + "89": 0.17239, + "90": 0.17856, + "91": 0.17078, + "92": 0.18016, + "93": 0.18343, + "94": 0.18085, + "95": 0.175, + "96": 0.17786, + "97": 0.17064, + "98": 0.17229, + "99": 0.17164, + "100": 0.20496 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 7190006ec1c..e813675fa98 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84269, "5": 10.85859, "10": 10.8187, "15": 10.80947, "20": 10.70829, "25": 10.57071, "30": 10.39721, "35": 10.28311, "40": 10.09728, "45": 9.86184, "50": 9.91021}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1669.0, "5": 1956.0, "10": 1416.0, "15": 1958.0, "20": 1802.0, "25": 1767.0, "30": 1901.0, "35": 1938.0, "40": 2126.0, "45": 1927.0, "50": 2307.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 299203072.0, "5": 299203072.0, "10": 299203072.0, "15": 299203072.0, "20": 299203072.0, "25": 299203072.0, "30": 299203072.0, "35": 299203072.0, "40": 299203072.0, "45": 299203072.0, "50": 299203072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1477945856.0, "5": 1542891008.0, "10": 1542891008.0, "15": 1542891008.0, "20": 1542891008.0, "25": 1542891008.0, "30": 1542891008.0, "35": 1542891008.0, "40": 1542891008.0, "45": 1542891008.0, "50": 1542891008.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.64845, "5": 0.20884, "10": 0.20343, "15": 0.20612, "20": 0.22655, "25": 0.19884, "30": 0.20035, "35": 0.20606, "40": 0.19923, "45": 0.20257, "50": 0.20076}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84269, + "2": 10.85556, + "3": 10.84446, + "4": 10.84222, + "5": 10.85859, + "6": 10.86289, + "7": 10.85166, + "8": 10.84694, + "9": 10.85648, + "10": 10.8187, + "11": 10.85952, + "12": 10.8434, + "13": 10.86329, + "14": 10.85467, + "15": 10.80947, + "16": 10.81639, + "17": 10.7887, + "18": 10.79677, + "19": 10.79127, + "20": 10.70829, + "21": 10.69425, + "22": 10.58587, + "23": 10.70272, + "24": 10.60461, + "25": 10.57071, + "26": 10.62002, + "27": 10.61414, + "28": 10.56371, + "29": 10.56749, + "30": 10.39721, + "31": 10.16567, + "32": 10.45764, + "33": 10.45152, + "34": 10.23938, + "35": 10.28311, + "36": 10.24692, + "37": 10.34247, + "38": 10.2052, + "39": 10.39167, + "40": 10.09728, + "41": 10.15266, + "42": 10.21035, + "43": 9.87733, + "44": 9.98208, + "45": 9.86184, + "46": 9.83605, + "47": 10.13379, + "48": 9.87207, + "49": 9.56144, + "50": 9.91021 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1669.0, + "2": 1797.0, + "3": 1769.0, + "4": 1812.0, + "5": 1956.0, + "6": 1892.0, + "7": 1848.0, + "8": 1619.0, + "9": 1899.0, + "10": 1416.0, + "11": 1910.0, + "12": 1734.0, + "13": 1952.0, + "14": 1901.0, + "15": 1958.0, + "16": 1961.0, + "17": 1919.0, + "18": 1881.0, + "19": 1883.0, + "20": 1802.0, + "21": 1931.0, + "22": 1655.0, + "23": 1993.0, + "24": 1633.0, + "25": 1767.0, + "26": 1727.0, + "27": 1709.0, + "28": 1909.0, + "29": 2062.0, + "30": 1901.0, + "31": 1678.0, + "32": 1944.0, + "33": 2164.0, + "34": 1777.0, + "35": 1938.0, + "36": 1876.0, + "37": 2428.0, + "38": 2216.0, + "39": 2329.0, + "40": 2126.0, + "41": 2312.0, + "42": 2207.0, + "43": 1975.0, + "44": 2062.0, + "45": 1927.0, + "46": 2258.0, + "47": 2545.0, + "48": 2291.0, + "49": 2254.0, + "50": 2307.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1477945856.0, + "2": 1542891008.0, + "3": 1542891008.0, + "4": 1542891008.0, + "5": 1542891008.0, + "6": 1542891008.0, + "7": 1542891008.0, + "8": 1542891008.0, + "9": 1542891008.0, + "10": 1542891008.0, + "11": 1542891008.0, + "12": 1542891008.0, + "13": 1542891008.0, + "14": 1542891008.0, + "15": 1542891008.0, + "16": 1542891008.0, + "17": 1542891008.0, + "18": 1542891008.0, + "19": 1542891008.0, + "20": 1542891008.0, + "21": 1542891008.0, + "22": 1542891008.0, + "23": 1542891008.0, + "24": 1542891008.0, + "25": 1542891008.0, + "26": 1542891008.0, + "27": 1542891008.0, + "28": 1542891008.0, + "29": 1542891008.0, + "30": 1542891008.0, + "31": 1542891008.0, + "32": 1542891008.0, + "33": 1542891008.0, + "34": 1542891008.0, + "35": 1542891008.0, + "36": 1542891008.0, + "37": 1542891008.0, + "38": 1542891008.0, + "39": 1542891008.0, + "40": 1542891008.0, + "41": 1542891008.0, + "42": 1542891008.0, + "43": 1542891008.0, + "44": 1542891008.0, + "45": 1542891008.0, + "46": 1542891008.0, + "47": 1542891008.0, + "48": 1542891008.0, + "49": 1542891008.0, + "50": 1542891008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 8.77968, + "2": 0.26175, + "3": 0.24794, + "4": 0.24501, + "5": 0.24845, + "6": 0.2486, + "7": 0.24727, + "8": 0.24913, + "9": 0.25845, + "10": 0.25285, + "11": 0.24913, + "12": 0.24699, + "13": 0.2473, + "14": 0.25154, + "15": 0.24973, + "16": 0.24744, + "17": 0.24812, + "18": 0.25005, + "19": 0.24688, + "20": 0.2449, + "21": 0.24547, + "22": 0.24699, + "23": 0.24408, + "24": 0.24933, + "25": 0.24233, + "26": 0.2452, + "27": 0.24682, + "28": 0.24269, + "29": 0.24203, + "30": 0.2418, + "31": 0.25702, + "32": 0.24123, + "33": 0.24439, + "34": 0.24088, + "35": 0.24457, + "36": 0.24197, + "37": 0.24309, + "38": 0.24278, + "39": 0.24374, + "40": 0.2478, + "41": 0.2422, + "42": 0.24357, + "43": 0.24957, + "44": 0.24752, + "45": 0.24273, + "46": 0.24413, + "47": 0.24327, + "48": 0.24256, + "49": 0.24524, + "50": 0.24667 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0e9e1ac956f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84269, + "2": 10.85556, + "3": 10.84446, + "4": 10.84222, + "5": 10.85859, + "6": 10.86289, + "7": 10.85166, + "8": 10.84694, + "9": 10.85648, + "10": 10.8187, + "11": 10.85952, + "12": 10.8434, + "13": 10.86329, + "14": 10.85467, + "15": 10.80947, + "16": 10.81639, + "17": 10.7887, + "18": 10.79677, + "19": 10.79127, + "20": 10.70829, + "21": 10.69425, + "22": 10.58587, + "23": 10.70272, + "24": 10.60461, + "25": 10.57071, + "26": 10.62002, + "27": 10.61414, + "28": 10.56371, + "29": 10.56749, + "30": 10.39721, + "31": 10.16567, + "32": 10.45764, + "33": 10.45152, + "34": 10.23938, + "35": 10.28311, + "36": 10.24692, + "37": 10.34247, + "38": 10.2052, + "39": 10.39167, + "40": 10.09728, + "41": 10.15266, + "42": 10.21035, + "43": 9.87733, + "44": 9.98208, + "45": 9.86184, + "46": 9.83605, + "47": 10.13379, + "48": 9.87207, + "49": 9.56144, + "50": 9.91021 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1669.0, + "2": 1797.0, + "3": 1769.0, + "4": 1812.0, + "5": 1956.0, + "6": 1892.0, + "7": 1848.0, + "8": 1619.0, + "9": 1899.0, + "10": 1416.0, + "11": 1910.0, + "12": 1734.0, + "13": 1952.0, + "14": 1901.0, + "15": 1958.0, + "16": 1961.0, + "17": 1919.0, + "18": 1881.0, + "19": 1883.0, + "20": 1802.0, + "21": 1931.0, + "22": 1655.0, + "23": 1993.0, + "24": 1633.0, + "25": 1767.0, + "26": 1727.0, + "27": 1709.0, + "28": 1909.0, + "29": 2062.0, + "30": 1901.0, + "31": 1678.0, + "32": 1944.0, + "33": 2164.0, + "34": 1777.0, + "35": 1938.0, + "36": 1876.0, + "37": 2428.0, + "38": 2216.0, + "39": 2329.0, + "40": 2126.0, + "41": 2312.0, + "42": 2207.0, + "43": 1975.0, + "44": 2062.0, + "45": 1927.0, + "46": 2258.0, + "47": 2545.0, + "48": 2291.0, + "49": 2254.0, + "50": 2307.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1477945856.0, + "2": 1542891008.0, + "3": 1542891008.0, + "4": 1542891008.0, + "5": 1542891008.0, + "6": 1542891008.0, + "7": 1542891008.0, + "8": 1542891008.0, + "9": 1542891008.0, + "10": 1542891008.0, + "11": 1542891008.0, + "12": 1542891008.0, + "13": 1542891008.0, + "14": 1542891008.0, + "15": 1542891008.0, + "16": 1542891008.0, + "17": 1542891008.0, + "18": 1542891008.0, + "19": 1542891008.0, + "20": 1542891008.0, + "21": 1542891008.0, + "22": 1542891008.0, + "23": 1542891008.0, + "24": 1542891008.0, + "25": 1542891008.0, + "26": 1542891008.0, + "27": 1542891008.0, + "28": 1542891008.0, + "29": 1542891008.0, + "30": 1542891008.0, + "31": 1542891008.0, + "32": 1542891008.0, + "33": 1542891008.0, + "34": 1542891008.0, + "35": 1542891008.0, + "36": 1542891008.0, + "37": 1542891008.0, + "38": 1542891008.0, + "39": 1542891008.0, + "40": 1542891008.0, + "41": 1542891008.0, + "42": 1542891008.0, + "43": 1542891008.0, + "44": 1542891008.0, + "45": 1542891008.0, + "46": 1542891008.0, + "47": 1542891008.0, + "48": 1542891008.0, + "49": 1542891008.0, + "50": 1542891008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 8.85835, + "2": 0.24835, + "3": 0.21606, + "4": 0.2165, + "5": 0.2184, + "6": 0.21562, + "7": 0.21636, + "8": 0.21549, + "9": 0.21564, + "10": 0.21602, + "11": 0.21604, + "12": 0.21848, + "13": 0.22011, + "14": 0.21851, + "15": 0.21382, + "16": 0.21395, + "17": 0.21404, + "18": 0.21912, + "19": 0.21472, + "20": 0.21137, + "21": 0.2132, + "22": 0.21258, + "23": 0.21793, + "24": 0.22285, + "25": 0.21743, + "26": 0.21892, + "27": 0.21849, + "28": 0.2197, + "29": 0.21953, + "30": 0.21687, + "31": 0.21658, + "32": 0.2223, + "33": 0.22171, + "34": 0.21429, + "35": 0.21354, + "36": 0.21407, + "37": 0.21643, + "38": 0.21392, + "39": 0.21524, + "40": 0.21475, + "41": 0.2181, + "42": 0.21582, + "43": 0.21601, + "44": 0.21724, + "45": 0.21547, + "46": 0.21832, + "47": 0.21586, + "48": 0.21703, + "49": 0.21487, + "50": 0.21525 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..2bfd32d0721 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84269, + "2": 10.85556, + "3": 10.84446, + "4": 10.84222, + "5": 10.85859, + "6": 10.86289, + "7": 10.85166, + "8": 10.84694, + "9": 10.85648, + "10": 10.8187, + "11": 10.85952, + "12": 10.8434, + "13": 10.86329, + "14": 10.85467, + "15": 10.80947, + "16": 10.81639, + "17": 10.7887, + "18": 10.79677, + "19": 10.79127, + "20": 10.70829, + "21": 10.69425, + "22": 10.58587, + "23": 10.70272, + "24": 10.60461, + "25": 10.57071, + "26": 10.62002, + "27": 10.61414, + "28": 10.56371, + "29": 10.56749, + "30": 10.39721, + "31": 10.16567, + "32": 10.45764, + "33": 10.45152, + "34": 10.23938, + "35": 10.28311, + "36": 10.24692, + "37": 10.34247, + "38": 10.2052, + "39": 10.39167, + "40": 10.09728, + "41": 10.15266, + "42": 10.21035, + "43": 9.87733, + "44": 9.98208, + "45": 9.86184, + "46": 9.83605, + "47": 10.13379, + "48": 9.87207, + "49": 9.56144, + "50": 9.91021 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1669.0, + "2": 1797.0, + "3": 1769.0, + "4": 1812.0, + "5": 1956.0, + "6": 1892.0, + "7": 1848.0, + "8": 1619.0, + "9": 1899.0, + "10": 1416.0, + "11": 1910.0, + "12": 1734.0, + "13": 1952.0, + "14": 1901.0, + "15": 1958.0, + "16": 1961.0, + "17": 1919.0, + "18": 1881.0, + "19": 1883.0, + "20": 1802.0, + "21": 1931.0, + "22": 1655.0, + "23": 1993.0, + "24": 1633.0, + "25": 1767.0, + "26": 1727.0, + "27": 1709.0, + "28": 1909.0, + "29": 2062.0, + "30": 1901.0, + "31": 1678.0, + "32": 1944.0, + "33": 2164.0, + "34": 1777.0, + "35": 1938.0, + "36": 1876.0, + "37": 2428.0, + "38": 2216.0, + "39": 2329.0, + "40": 2126.0, + "41": 2312.0, + "42": 2207.0, + "43": 1975.0, + "44": 2062.0, + "45": 1927.0, + "46": 2258.0, + "47": 2545.0, + "48": 2291.0, + "49": 2254.0, + "50": 2307.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1477945856.0, + "2": 1542891008.0, + "3": 1542891008.0, + "4": 1542891008.0, + "5": 1542891008.0, + "6": 1542891008.0, + "7": 1542891008.0, + "8": 1542891008.0, + "9": 1542891008.0, + "10": 1542891008.0, + "11": 1542891008.0, + "12": 1542891008.0, + "13": 1542891008.0, + "14": 1542891008.0, + "15": 1542891008.0, + "16": 1542891008.0, + "17": 1542891008.0, + "18": 1542891008.0, + "19": 1542891008.0, + "20": 1542891008.0, + "21": 1542891008.0, + "22": 1542891008.0, + "23": 1542891008.0, + "24": 1542891008.0, + "25": 1542891008.0, + "26": 1542891008.0, + "27": 1542891008.0, + "28": 1542891008.0, + "29": 1542891008.0, + "30": 1542891008.0, + "31": 1542891008.0, + "32": 1542891008.0, + "33": 1542891008.0, + "34": 1542891008.0, + "35": 1542891008.0, + "36": 1542891008.0, + "37": 1542891008.0, + "38": 1542891008.0, + "39": 1542891008.0, + "40": 1542891008.0, + "41": 1542891008.0, + "42": 1542891008.0, + "43": 1542891008.0, + "44": 1542891008.0, + "45": 1542891008.0, + "46": 1542891008.0, + "47": 1542891008.0, + "48": 1542891008.0, + "49": 1542891008.0, + "50": 1542891008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 8.86827, + "2": 0.25581, + "3": 0.24685, + "4": 0.24528, + "5": 0.24786, + "6": 0.25055, + "7": 0.2473, + "8": 0.24843, + "9": 0.24646, + "10": 0.24448, + "11": 0.24595, + "12": 0.24375, + "13": 0.24607, + "14": 0.2438, + "15": 0.24496, + "16": 0.24469, + "17": 0.24672, + "18": 0.2472, + "19": 0.24412, + "20": 0.24734, + "21": 0.24525, + "22": 0.24726, + "23": 0.24425, + "24": 0.2467, + "25": 0.24589, + "26": 0.24521, + "27": 0.24972, + "28": 0.24969, + "29": 0.24951, + "30": 0.24819, + "31": 0.25039, + "32": 0.24983, + "33": 0.25363, + "34": 0.25237, + "35": 0.24992, + "36": 0.24811, + "37": 0.25001, + "38": 0.24929, + "39": 0.24928, + "40": 0.24894, + "41": 0.24934, + "42": 0.24889, + "43": 0.24734, + "44": 0.24821, + "45": 0.2492, + "46": 0.24867, + "47": 0.25083, + "48": 0.24933, + "49": 0.24988, + "50": 0.25012 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..7b27bf78e61 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1448634368.0, + "2": 1515674112.0, + "3": 1515674112.0, + "4": 1515674112.0, + "5": 1515674112.0, + "6": 1515674112.0, + "7": 1515674112.0, + "8": 1515674112.0, + "9": 1515674112.0, + "10": 1515674112.0, + "11": 1515674112.0, + "12": 1515674112.0, + "13": 1515674112.0, + "14": 1515674112.0, + "15": 1515674112.0, + "16": 1515674112.0, + "17": 1515674112.0, + "18": 1515674112.0, + "19": 1515674112.0, + "20": 1515674112.0, + "21": 1515674112.0, + "22": 1515674112.0, + "23": 1515674112.0, + "24": 1515674112.0, + "25": 1515674112.0, + "26": 1515674112.0, + "27": 1515674112.0, + "28": 1515674112.0, + "29": 1515674112.0, + "30": 1515674112.0, + "31": 1515674112.0, + "32": 1515674112.0, + "33": 1515674112.0, + "34": 1515674112.0, + "35": 1515674112.0, + "36": 1515676160.0, + "37": 1515676160.0, + "38": 1515676160.0, + "39": 1515676160.0, + "40": 1515676160.0, + "41": 1515676160.0, + "42": 1515676160.0, + "43": 1515676160.0, + "44": 1515676160.0, + "45": 1515676160.0, + "46": 1515676160.0, + "47": 1515676160.0, + "48": 1515676160.0, + "49": 1515676160.0, + "50": 1515676160.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.32442, + "2": 0.36793, + "3": 0.33232, + "4": 0.32917, + "5": 0.33097, + "6": 0.32866, + "7": 0.32256, + "8": 0.32486, + "9": 0.37982, + "10": 0.41476, + "11": 0.44694, + "12": 0.53248, + "13": 0.57146, + "14": 0.57246, + "15": 0.36094, + "16": 0.34892, + "17": 0.38022, + "18": 0.35319, + "19": 0.36887, + "20": 0.36416, + "21": 0.34563, + "22": 0.31882, + "23": 0.32147, + "24": 0.31667, + "25": 0.31696, + "26": 0.31902, + "27": 0.32164, + "28": 0.31663, + "29": 0.3158, + "30": 0.32265, + "31": 0.31608, + "32": 0.31574, + "33": 0.32267, + "34": 0.31719, + "35": 0.31721, + "36": 0.32191, + "37": 0.31699, + "38": 0.31788, + "39": 0.32413, + "40": 0.31691, + "41": 0.31767, + "42": 0.32282, + "43": 0.31846, + "44": 0.31976, + "45": 0.32052, + "46": 0.3223, + "47": 0.32037, + "48": 0.33259, + "49": 0.32455, + "50": 0.32849 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..2dea447618c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1449682944.0, + "2": 1514627584.0, + "3": 1514627584.0, + "4": 1514628096.0, + "5": 1514628096.0, + "6": 1515674112.0, + "7": 1515674112.0, + "8": 1515674112.0, + "9": 1515676160.0, + "10": 1515676160.0, + "11": 1515676160.0, + "12": 1515676160.0, + "13": 1515676160.0, + "14": 1515676160.0, + "15": 1515676672.0, + "16": 1515676672.0, + "17": 1515676672.0, + "18": 1515676672.0, + "19": 1515676672.0, + "20": 1515676672.0, + "21": 1515676672.0, + "22": 1515676672.0, + "23": 1515676672.0, + "24": 1515676672.0, + "25": 1515676672.0, + "26": 1515676672.0, + "27": 1515676672.0, + "28": 1515676672.0, + "29": 1515676672.0, + "30": 1515676672.0, + "31": 1515676672.0, + "32": 1515676672.0, + "33": 1515676672.0, + "34": 1515676672.0, + "35": 1515676672.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.6671, + "2": 0.83595, + "3": 0.32182, + "4": 0.325, + "5": 0.52703, + "6": 0.32134, + "7": 0.32449, + "8": 0.32437, + "9": 0.32282, + "10": 0.32149, + "11": 0.32428, + "12": 0.32191, + "13": 0.32586, + "14": 0.32086, + "15": 0.3225, + "16": 0.32112, + "17": 0.32105, + "18": 0.32408, + "19": 0.32353, + "20": 0.32273, + "21": 0.32558, + "22": 0.31978, + "23": 0.32165, + "24": 0.32145, + "25": 0.31914, + "26": 0.32323, + "27": 0.32298, + "28": 0.31906, + "29": 0.31806, + "30": 0.32112, + "31": 0.31802, + "32": 0.32203, + "33": 0.32813, + "34": 0.32256, + "35": 0.32108, + "36": 0.32976, + "37": 0.32104, + "38": 0.32185, + "39": 0.32826, + "40": 0.32693, + "41": 0.32396, + "42": 0.32632, + "43": 0.33312, + "44": 0.32745, + "45": 0.32655, + "46": 0.32577, + "47": 0.32382, + "48": 0.32447, + "49": 0.32891, + "50": 0.32257 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f479cea5f5f..39765124d93 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84163, "5": 10.85872, "10": 10.81849, "15": 10.81015, "20": 10.70819, "25": 10.57102, "30": 10.39695, "35": 10.28351, "40": 10.09767, "45": 9.86165, "50": 9.91045, "55": 9.88738, "60": 9.51376, "65": 8.9571, "70": 9.74676, "75": 9.42381, "80": 9.40721, "85": 9.61784, "90": 9.82256, "95": 9.51351, "100": 9.40106}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1736.0, "5": 1955.0, "10": 1441.0, "15": 1907.0, "20": 1700.0, "25": 1686.0, "30": 1941.0, "35": 1907.0, "40": 2224.0, "45": 1956.0, "50": 2232.0, "55": 2206.0, "60": 2157.0, "65": 2630.0, "70": 3040.0, "75": 2461.0, "80": 3104.0, "85": 3167.0, "90": 3069.0, "95": 3206.0, "100": 3111.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 299203072.0, "5": 299203072.0, "10": 299203072.0, "15": 299203072.0, "20": 299203072.0, "25": 299203072.0, "30": 299203072.0, "35": 299203072.0, "40": 299203072.0, "45": 299203072.0, "50": 299203072.0, "55": 299203072.0, "60": 299203072.0, "65": 299203072.0, "70": 299203072.0, "75": 299203072.0, "80": 299203072.0, "85": 299203072.0, "90": 299203072.0, "95": 299203072.0, "100": 299203072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 977125888.0, "5": 1042071040.0, "10": 1042071040.0, "15": 1042071040.0, "20": 1042071040.0, "25": 1042071040.0, "30": 1042071040.0, "35": 1042071040.0, "40": 1042071040.0, "45": 1042071040.0, "50": 1042071040.0, "55": 1042071040.0, "60": 1042071040.0, "65": 1042071040.0, "70": 1042071040.0, "75": 1042071040.0, "80": 1042071040.0, "85": 1042071040.0, "90": 1042071040.0, "95": 1042071040.0, "100": 1042071040.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.57084, "5": 0.17113, "10": 0.17286, "15": 0.16879, "20": 0.16991, "25": 0.16317, "30": 0.16767, "35": 0.16367, "40": 0.16455, "45": 0.17151, "50": 0.16431, "55": 0.17778, "60": 0.16619, "65": 0.16724, "70": 0.17675, "75": 0.17316, "80": 0.17654, "85": 0.18496, "90": 0.167, "95": 0.17008, "100": 0.16742}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, + "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, + "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, + "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, + "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, + "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, + "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, + "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, + "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, + "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, + "100": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, + "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, + "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, + "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, + "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, + "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, + "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, + "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, + "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, + "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, + "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, + "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, + "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, + "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, + "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, + "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, + "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, + "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, + "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, + "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, + "100": 1042071040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.64755, + "2": 0.22676, + "3": 0.21049, + "4": 0.21226, + "5": 0.21276, + "6": 0.21284, + "7": 0.21174, + "8": 0.21294, + "9": 0.21455, + "10": 0.21245, + "11": 0.21305, + "12": 0.21226, + "13": 0.21393, + "14": 0.21543, + "15": 0.21306, + "16": 0.21524, + "17": 0.21547, + "18": 0.21654, + "19": 0.21182, + "20": 0.21446, + "21": 0.2154, + "22": 0.2134, + "23": 0.21194, + "24": 0.21397, + "25": 0.21361, + "26": 0.21508, + "27": 0.21438, + "28": 0.21467, + "29": 0.21423, + "30": 0.21547, + "31": 0.2149, + "32": 0.21373, + "33": 0.21293, + "34": 0.21223, + "35": 0.21322, + "36": 0.21538, + "37": 0.2171, + "38": 0.21288, + "39": 0.214, + "40": 0.21613, + "41": 0.22561, + "42": 0.21996, + "43": 0.2231, + "44": 0.21366, + "45": 0.20946, + "46": 0.21036, + "47": 0.21159, + "48": 0.21259, + "49": 0.2162, + "50": 0.21326, + "51": 0.21621, + "52": 0.20977, + "53": 0.20911, + "54": 0.20812, + "55": 0.20849, + "56": 0.20718, + "57": 0.21288, + "58": 0.20817, + "59": 0.20767, + "60": 0.20713, + "61": 0.21035, + "62": 0.21063, + "63": 0.21186, + "64": 0.20447, + "65": 0.206, + "66": 0.2078, + "67": 0.21155, + "68": 0.21249, + "69": 0.20772, + "70": 0.2071, + "71": 0.20716, + "72": 0.20814, + "73": 0.20979, + "74": 0.21089, + "75": 0.20519, + "76": 0.20953, + "77": 0.20632, + "78": 0.21411, + "79": 0.20748, + "80": 0.20907, + "81": 0.20802, + "82": 0.20909, + "83": 0.21401, + "84": 0.21584, + "85": 0.20979, + "86": 0.20899, + "87": 0.20903, + "88": 0.21002, + "89": 0.20822, + "90": 0.20988, + "91": 0.2101, + "92": 0.20692, + "93": 0.21116, + "94": 0.20766, + "95": 0.2115, + "96": 0.20949, + "97": 0.20615, + "98": 0.20442, + "99": 0.2084, + "100": 0.20996 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0521ec92aee --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, + "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, + "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, + "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, + "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, + "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, + "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, + "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, + "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, + "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, + "100": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, + "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, + "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, + "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, + "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, + "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, + "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, + "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, + "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, + "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, + "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, + "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, + "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, + "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, + "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, + "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, + "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, + "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, + "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, + "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, + "100": 1042071040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.89047, + "2": 0.20763, + "3": 0.17962, + "4": 0.17996, + "5": 0.19517, + "6": 0.19097, + "7": 0.21371, + "8": 0.17946, + "9": 0.18028, + "10": 0.17811, + "11": 0.19549, + "12": 0.17995, + "13": 0.17967, + "14": 0.17747, + "15": 0.17854, + "16": 0.18132, + "17": 0.18068, + "18": 0.20382, + "19": 0.18932, + "20": 0.18279, + "21": 0.18143, + "22": 0.18461, + "23": 0.18263, + "24": 0.19677, + "25": 0.18399, + "26": 0.18138, + "27": 0.18309, + "28": 0.18505, + "29": 0.18571, + "30": 0.19268, + "31": 0.18694, + "32": 0.2033, + "33": 0.20046, + "34": 0.20101, + "35": 0.18537, + "36": 0.18526, + "37": 0.18418, + "38": 0.18481, + "39": 0.1813, + "40": 0.1837, + "41": 0.17918, + "42": 0.18044, + "43": 0.18093, + "44": 0.17996, + "45": 0.18187, + "46": 0.18178, + "47": 0.1859, + "48": 0.18306, + "49": 0.18442, + "50": 0.17901, + "51": 0.19352, + "52": 0.19143, + "53": 0.18977, + "54": 0.18373, + "55": 0.1848, + "56": 0.18899, + "57": 0.18927, + "58": 0.18981, + "59": 0.18717, + "60": 0.18468, + "61": 0.18658, + "62": 0.18885, + "63": 0.18928, + "64": 0.18734, + "65": 0.18347, + "66": 0.18338, + "67": 0.18495, + "68": 0.19141, + "69": 0.18134, + "70": 0.18277, + "71": 0.18011, + "72": 0.18334, + "73": 0.18723, + "74": 0.18857, + "75": 0.18474, + "76": 0.18198, + "77": 0.18177, + "78": 0.18552, + "79": 0.18363, + "80": 0.18411, + "81": 0.18648, + "82": 0.18145, + "83": 0.1831, + "84": 0.18203, + "85": 0.18466, + "86": 0.17969, + "87": 0.18127, + "88": 0.18208, + "89": 0.18448, + "90": 0.2123, + "91": 0.18681, + "92": 0.18312, + "93": 0.18238, + "94": 0.18152, + "95": 0.17994, + "96": 0.18524, + "97": 0.18522, + "98": 0.18434, + "99": 0.19103, + "100": 0.19147 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b61916ffd95 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, + "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, + "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, + "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, + "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, + "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, + "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, + "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, + "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, + "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, + "100": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, + "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, + "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, + "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, + "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, + "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, + "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, + "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, + "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, + "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, + "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, + "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, + "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, + "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, + "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, + "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, + "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, + "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, + "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, + "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, + "100": 1042071040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.40872, + "2": 0.25886, + "3": 0.22849, + "4": 0.21099, + "5": 0.21193, + "6": 0.20863, + "7": 0.20987, + "8": 0.21014, + "9": 0.21139, + "10": 0.21148, + "11": 0.21513, + "12": 0.21915, + "13": 0.21037, + "14": 0.20786, + "15": 0.20927, + "16": 0.20756, + "17": 0.21005, + "18": 0.21022, + "19": 0.21019, + "20": 0.21012, + "21": 0.20995, + "22": 0.21005, + "23": 0.21213, + "24": 0.20995, + "25": 0.20776, + "26": 0.21296, + "27": 0.20984, + "28": 0.21526, + "29": 0.21164, + "30": 0.21175, + "31": 0.21062, + "32": 0.21292, + "33": 0.20962, + "34": 0.21025, + "35": 0.20968, + "36": 0.21367, + "37": 0.20989, + "38": 0.21034, + "39": 0.20979, + "40": 0.21092, + "41": 0.21065, + "42": 0.20865, + "43": 0.20939, + "44": 0.21656, + "45": 0.21131, + "46": 0.21087, + "47": 0.23723, + "48": 0.21006, + "49": 0.21157, + "50": 0.20975, + "51": 0.21952, + "52": 0.21306, + "53": 0.21253, + "54": 0.21223, + "55": 0.21336, + "56": 0.21514, + "57": 0.21536, + "58": 0.21288, + "59": 0.21211, + "60": 0.21298, + "61": 0.21285, + "62": 0.21438, + "63": 0.21461, + "64": 0.21382, + "65": 0.22082, + "66": 0.21222, + "67": 0.21414, + "68": 0.21315, + "69": 0.2153, + "70": 0.2172, + "71": 0.21323, + "72": 0.21366, + "73": 0.21434, + "74": 0.21455, + "75": 0.21545, + "76": 0.21631, + "77": 0.21419, + "78": 0.21365, + "79": 0.21514, + "80": 0.21447, + "81": 0.21379, + "82": 0.21487, + "83": 0.21038, + "84": 0.21708, + "85": 0.21166, + "86": 0.2141, + "87": 0.21613, + "88": 0.21214, + "89": 0.21499, + "90": 0.21811, + "91": 0.21563, + "92": 0.2152, + "93": 0.21548, + "94": 0.21863, + "95": 0.21366, + "96": 0.21458, + "97": 0.21279, + "98": 0.21555, + "99": 0.213, + "100": 0.2112 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..81ace8a79cb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327, + "51": 9.8517, + "52": 9.74686, + "53": 10.07204, + "54": 9.95738, + "55": 9.87788, + "56": 9.62943, + "57": 9.48988, + "58": 9.83265, + "59": 9.58831, + "60": 9.50874, + "61": 9.69495, + "62": 9.99373, + "63": 9.377, + "64": 9.78004, + "65": 8.95103, + "66": 9.71392, + "67": 9.37884, + "68": 9.78831, + "69": 9.79096, + "70": 9.73167, + "71": 9.61776, + "72": 9.59099, + "73": 9.49436, + "74": 8.95001, + "75": 9.43681, + "76": 9.09852, + "77": 10.06447, + "78": 9.72944, + "79": 9.37805, + "80": 9.41156, + "81": 9.48537, + "82": 9.69592, + "83": 9.31981, + "84": 9.42306, + "85": 9.61613, + "86": 9.07185, + "87": 9.59282, + "88": 9.75055, + "89": 9.61194, + "90": 9.8217, + "91": 9.35308, + "92": 9.36305, + "93": 9.08788, + "94": 8.83439, + "95": 9.5191, + "96": 9.52647, + "97": 9.31412, + "98": 9.67541, + "99": 8.88941, + "100": 9.40588 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0, + "51": 2863.0, + "52": 2881.0, + "53": 3220.0, + "54": 2894.0, + "55": 2652.0, + "56": 3006.0, + "57": 2561.0, + "58": 3273.0, + "59": 3039.0, + "60": 2765.0, + "61": 3310.0, + "62": 2936.0, + "63": 2630.0, + "64": 3230.0, + "65": 2946.0, + "66": 3500.0, + "67": 2976.0, + "68": 2944.0, + "69": 3117.0, + "70": 3629.0, + "71": 3255.0, + "72": 2633.0, + "73": 3338.0, + "74": 2172.0, + "75": 2702.0, + "76": 3162.0, + "77": 3850.0, + "78": 3590.0, + "79": 3658.0, + "80": 3866.0, + "81": 3976.0, + "82": 3680.0, + "83": 3153.0, + "84": 3586.0, + "85": 3517.0, + "86": 3137.0, + "87": 4177.0, + "88": 3589.0, + "89": 3849.0, + "90": 3349.0, + "91": 2936.0, + "92": 3526.0, + "93": 2965.0, + "94": 3772.0, + "95": 3530.0, + "96": 3774.0, + "97": 3636.0, + "98": 4064.0, + "99": 3394.0, + "100": 3530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0, + "51": 269891584.0, + "52": 269891584.0, + "53": 269891584.0, + "54": 269891584.0, + "55": 269891584.0, + "56": 269891584.0, + "57": 269891584.0, + "58": 269891584.0, + "59": 269891584.0, + "60": 269891584.0, + "61": 269891584.0, + "62": 269891584.0, + "63": 269891584.0, + "64": 269891584.0, + "65": 269891584.0, + "66": 269891584.0, + "67": 269891584.0, + "68": 269891584.0, + "69": 269891584.0, + "70": 269891584.0, + "71": 269891584.0, + "72": 269891584.0, + "73": 269891584.0, + "74": 269891584.0, + "75": 269891584.0, + "76": 269891584.0, + "77": 269891584.0, + "78": 269891584.0, + "79": 269891584.0, + "80": 269891584.0, + "81": 269891584.0, + "82": 269891584.0, + "83": 269891584.0, + "84": 269891584.0, + "85": 269891584.0, + "86": 269891584.0, + "87": 269891584.0, + "88": 269891584.0, + "89": 269891584.0, + "90": 269891584.0, + "91": 269891584.0, + "92": 269891584.0, + "93": 269891584.0, + "94": 269891584.0, + "95": 269891584.0, + "96": 269891584.0, + "97": 269891584.0, + "98": 269891584.0, + "99": 269891584.0, + "100": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1450731008.0, + "2": 1515674112.0, + "3": 1515674112.0, + "4": 1515676672.0, + "5": 1515676672.0, + "6": 1515676672.0, + "7": 1515676672.0, + "8": 1515676672.0, + "9": 1515676672.0, + "10": 1515676672.0, + "11": 1515676672.0, + "12": 1515676672.0, + "13": 1515676672.0, + "14": 1515676672.0, + "15": 1515676672.0, + "16": 1515676672.0, + "17": 1515676672.0, + "18": 1515676672.0, + "19": 1515676672.0, + "20": 1515676672.0, + "21": 1515676672.0, + "22": 1515676672.0, + "23": 1515676672.0, + "24": 1515676672.0, + "25": 1515676672.0, + "26": 1515676672.0, + "27": 1515676672.0, + "28": 1515676672.0, + "29": 1515676672.0, + "30": 1515676672.0, + "31": 1515676672.0, + "32": 1515676672.0, + "33": 1515676672.0, + "34": 1515676672.0, + "35": 1515676672.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0, + "51": 1515676672.0, + "52": 1515676672.0, + "53": 1515676672.0, + "54": 1515676672.0, + "55": 1515676672.0, + "56": 1515676672.0, + "57": 1515676672.0, + "58": 1515676672.0, + "59": 1515676672.0, + "60": 1515676672.0, + "61": 1515676672.0, + "62": 1515676672.0, + "63": 1515676672.0, + "64": 1515676672.0, + "65": 1515676672.0, + "66": 1515676672.0, + "67": 1515676672.0, + "68": 1515676672.0, + "69": 1515676672.0, + "70": 1515676672.0, + "71": 1515676672.0, + "72": 1515676672.0, + "73": 1515676672.0, + "74": 1515676672.0, + "75": 1515676672.0, + "76": 1515676672.0, + "77": 1515676672.0, + "78": 1515676672.0, + "79": 1515676672.0, + "80": 1515676672.0, + "81": 1515676672.0, + "82": 1515676672.0, + "83": 1515676672.0, + "84": 1515676672.0, + "85": 1515676672.0, + "86": 1515676672.0, + "87": 1515676672.0, + "88": 1515676672.0, + "89": 1515676672.0, + "90": 1515676672.0, + "91": 1515676672.0, + "92": 1515676672.0, + "93": 1515676672.0, + "94": 1515676672.0, + "95": 1515676672.0, + "96": 1515676672.0, + "97": 1515676672.0, + "98": 1515676672.0, + "99": 1515676672.0, + "100": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.9602, + "2": 0.41251, + "3": 0.31981, + "4": 0.60672, + "5": 0.31803, + "6": 0.66653, + "7": 0.31576, + "8": 0.3144, + "9": 0.31826, + "10": 0.31784, + "11": 0.31454, + "12": 0.32345, + "13": 0.31961, + "14": 0.31476, + "15": 0.31408, + "16": 0.32159, + "17": 0.31403, + "18": 0.31562, + "19": 0.32035, + "20": 0.31437, + "21": 0.50323, + "22": 0.33172, + "23": 0.31117, + "24": 0.31643, + "25": 0.3168, + "26": 0.3138, + "27": 0.31191, + "28": 0.31811, + "29": 0.31647, + "30": 0.31136, + "31": 0.31853, + "32": 0.31298, + "33": 0.3122, + "34": 0.3186, + "35": 0.31452, + "36": 0.32563, + "37": 0.31553, + "38": 0.31645, + "39": 0.31114, + "40": 0.3168, + "41": 0.31551, + "42": 0.31104, + "43": 0.31222, + "44": 0.31802, + "45": 0.53643, + "46": 0.3183, + "47": 0.3153, + "48": 0.31286, + "49": 0.31479, + "50": 0.31499, + "51": 0.3247, + "52": 0.31654, + "53": 0.3232, + "54": 0.32124, + "55": 0.31559, + "56": 0.32351, + "57": 0.3268, + "58": 0.31694, + "59": 0.31819, + "60": 0.3242, + "61": 0.31589, + "62": 0.31803, + "63": 0.32889, + "64": 0.31711, + "65": 0.3785, + "66": 0.37396, + "67": 0.33125, + "68": 0.31565, + "69": 0.32166, + "70": 0.37482, + "71": 0.37713, + "72": 0.37561, + "73": 0.37465, + "74": 0.37751, + "75": 0.37312, + "76": 0.37068, + "77": 0.3832, + "78": 0.3167, + "79": 0.31782, + "80": 0.32031, + "81": 0.31714, + "82": 0.31525, + "83": 0.32517, + "84": 0.31649, + "85": 0.31435, + "86": 0.32096, + "87": 0.31842, + "88": 0.31539, + "89": 0.32202, + "90": 0.3206, + "91": 0.31482, + "92": 0.32002, + "93": 0.31779, + "94": 0.31471, + "95": 0.31708, + "96": 0.31884, + "97": 0.31586, + "98": 0.31494, + "99": 0.32657, + "100": 0.31839 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..d6b97c844a2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327, + "51": 9.8517, + "52": 9.74686, + "53": 10.07204, + "54": 9.95738, + "55": 9.87788, + "56": 9.62943, + "57": 9.48988, + "58": 9.83265, + "59": 9.58831, + "60": 9.50874, + "61": 9.69495, + "62": 9.99373, + "63": 9.377, + "64": 9.78004, + "65": 8.95103, + "66": 9.71392, + "67": 9.37884, + "68": 9.78831, + "69": 9.79096, + "70": 9.73167, + "71": 9.61776, + "72": 9.59099, + "73": 9.49436, + "74": 8.95001, + "75": 9.43681, + "76": 9.09852, + "77": 10.06447, + "78": 9.72944, + "79": 9.37805, + "80": 9.41156, + "81": 9.48537, + "82": 9.69592, + "83": 9.31981, + "84": 9.42306, + "85": 9.61613, + "86": 9.07185, + "87": 9.59282, + "88": 9.75055, + "89": 9.61194, + "90": 9.8217, + "91": 9.35308, + "92": 9.36305, + "93": 9.08788, + "94": 8.83439, + "95": 9.5191, + "96": 9.52647, + "97": 9.31412, + "98": 9.67541, + "99": 8.88941, + "100": 9.40588 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0, + "51": 2863.0, + "52": 2881.0, + "53": 3220.0, + "54": 2894.0, + "55": 2652.0, + "56": 3006.0, + "57": 2561.0, + "58": 3273.0, + "59": 3039.0, + "60": 2765.0, + "61": 3310.0, + "62": 2936.0, + "63": 2630.0, + "64": 3230.0, + "65": 2946.0, + "66": 3500.0, + "67": 2976.0, + "68": 2944.0, + "69": 3117.0, + "70": 3629.0, + "71": 3255.0, + "72": 2633.0, + "73": 3338.0, + "74": 2172.0, + "75": 2702.0, + "76": 3162.0, + "77": 3850.0, + "78": 3590.0, + "79": 3658.0, + "80": 3866.0, + "81": 3976.0, + "82": 3680.0, + "83": 3153.0, + "84": 3586.0, + "85": 3517.0, + "86": 3137.0, + "87": 4177.0, + "88": 3589.0, + "89": 3849.0, + "90": 3349.0, + "91": 2936.0, + "92": 3526.0, + "93": 2965.0, + "94": 3772.0, + "95": 3530.0, + "96": 3774.0, + "97": 3636.0, + "98": 4064.0, + "99": 3394.0, + "100": 3530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0, + "51": 269891584.0, + "52": 269891584.0, + "53": 269891584.0, + "54": 269891584.0, + "55": 269891584.0, + "56": 269891584.0, + "57": 269891584.0, + "58": 269891584.0, + "59": 269891584.0, + "60": 269891584.0, + "61": 269891584.0, + "62": 269891584.0, + "63": 269891584.0, + "64": 269891584.0, + "65": 269891584.0, + "66": 269891584.0, + "67": 269891584.0, + "68": 269891584.0, + "69": 269891584.0, + "70": 269891584.0, + "71": 269891584.0, + "72": 269891584.0, + "73": 269891584.0, + "74": 269891584.0, + "75": 269891584.0, + "76": 269891584.0, + "77": 269891584.0, + "78": 269891584.0, + "79": 269891584.0, + "80": 269891584.0, + "81": 269891584.0, + "82": 269891584.0, + "83": 269891584.0, + "84": 269891584.0, + "85": 269891584.0, + "86": 269891584.0, + "87": 269891584.0, + "88": 269891584.0, + "89": 269891584.0, + "90": 269891584.0, + "91": 269891584.0, + "92": 269891584.0, + "93": 269891584.0, + "94": 269891584.0, + "95": 269891584.0, + "96": 269891584.0, + "97": 269891584.0, + "98": 269891584.0, + "99": 269891584.0, + "100": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1448634368.0, + "2": 1515676160.0, + "3": 1515676672.0, + "4": 1515676672.0, + "5": 1515676672.0, + "6": 1515676672.0, + "7": 1515676672.0, + "8": 1515676672.0, + "9": 1515676672.0, + "10": 1515676672.0, + "11": 1515676672.0, + "12": 1515676672.0, + "13": 1515676672.0, + "14": 1515676672.0, + "15": 1515676672.0, + "16": 1515676672.0, + "17": 1515676672.0, + "18": 1515676672.0, + "19": 1515676672.0, + "20": 1515676672.0, + "21": 1515676672.0, + "22": 1515676672.0, + "23": 1515676672.0, + "24": 1515676672.0, + "25": 1515676672.0, + "26": 1515676672.0, + "27": 1515676672.0, + "28": 1515676672.0, + "29": 1515676672.0, + "30": 1515676672.0, + "31": 1515676672.0, + "32": 1515676672.0, + "33": 1515676672.0, + "34": 1515676672.0, + "35": 1515676672.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0, + "51": 1515676672.0, + "52": 1515676672.0, + "53": 1515676672.0, + "54": 1515676672.0, + "55": 1515676672.0, + "56": 1515676672.0, + "57": 1515676672.0, + "58": 1515676672.0, + "59": 1515676672.0, + "60": 1515676672.0, + "61": 1515676672.0, + "62": 1515676672.0, + "63": 1515676672.0, + "64": 1515676672.0, + "65": 1515676672.0, + "66": 1515676672.0, + "67": 1515676672.0, + "68": 1515676672.0, + "69": 1515676672.0, + "70": 1515676672.0, + "71": 1515676672.0, + "72": 1515676672.0, + "73": 1515676672.0, + "74": 1515676672.0, + "75": 1515676672.0, + "76": 1515676672.0, + "77": 1515676672.0, + "78": 1515676672.0, + "79": 1515676672.0, + "80": 1515676672.0, + "81": 1515676672.0, + "82": 1515676672.0, + "83": 1515676672.0, + "84": 1515676672.0, + "85": 1515676672.0, + "86": 1515676672.0, + "87": 1515676672.0, + "88": 1515676672.0, + "89": 1515676672.0, + "90": 1515676672.0, + "91": 1515676672.0, + "92": 1515676672.0, + "93": 1515676672.0, + "94": 1515676672.0, + "95": 1515676672.0, + "96": 1515676672.0, + "97": 1515676672.0, + "98": 1515676672.0, + "99": 1515676672.0, + "100": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.24087, + "2": 0.38421, + "3": 0.56749, + "4": 0.65933, + "5": 0.54431, + "6": 0.31357, + "7": 0.3132, + "8": 0.3209, + "9": 0.31313, + "10": 0.31289, + "11": 0.32184, + "12": 0.31161, + "13": 0.31148, + "14": 0.31861, + "15": 0.31107, + "16": 0.31197, + "17": 0.31486, + "18": 0.31483, + "19": 0.3123, + "20": 0.31575, + "21": 0.3191, + "22": 0.59133, + "23": 0.31699, + "24": 0.31207, + "25": 0.31265, + "26": 0.32043, + "27": 0.31399, + "28": 0.31217, + "29": 0.32071, + "30": 0.31121, + "31": 0.31193, + "32": 0.31757, + "33": 0.31731, + "34": 0.31154, + "35": 0.31452, + "36": 0.31823, + "37": 0.31136, + "38": 0.31179, + "39": 0.3179, + "40": 0.31084, + "41": 0.31144, + "42": 0.32061, + "43": 0.31112, + "44": 0.31208, + "45": 0.31884, + "46": 0.31114, + "47": 0.3115, + "48": 0.31509, + "49": 0.31746, + "50": 0.31201, + "51": 0.31606, + "52": 0.31175, + "53": 0.3173, + "54": 0.30985, + "55": 0.30955, + "56": 0.31445, + "57": 0.30938, + "58": 0.30971, + "59": 0.31705, + "60": 0.30877, + "61": 0.30909, + "62": 0.31179, + "63": 0.31576, + "64": 0.31125, + "65": 0.3109, + "66": 0.32501, + "67": 0.31051, + "68": 0.31016, + "69": 0.32083, + "70": 0.3086, + "71": 0.30949, + "72": 0.32156, + "73": 0.31102, + "74": 0.30938, + "75": 0.31802, + "76": 0.30998, + "77": 0.3092, + "78": 0.31341, + "79": 0.32109, + "80": 0.31014, + "81": 0.31196, + "82": 0.31938, + "83": 0.31078, + "84": 0.31077, + "85": 0.32048, + "86": 0.31124, + "87": 0.31023, + "88": 0.31956, + "89": 0.30978, + "90": 0.31199, + "91": 0.31731, + "92": 0.30981, + "93": 0.31067, + "94": 0.31383, + "95": 0.31976, + "96": 0.30998, + "97": 0.31195, + "98": 0.32159, + "99": 0.30804, + "100": 0.31193 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 6f422f501de..c387be284cf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, "100": 9.40106 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, "100": 3111.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, "100": 299203072.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, "100": 1042071040.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 8.52165, - "5": 0.20516, - "10": 0.19368, - "15": 0.19068, - "20": 0.19109, - "25": 0.19345, - "30": 0.19142, - "35": 0.19012, - "40": 0.18948, - "45": 0.1901, - "50": 0.19384, - "55": 0.20627, - "60": 0.18816, - "65": 0.19043, - "70": 0.23342, - "75": 0.19438, - "80": 0.19064, - "85": 0.19143, - "90": 0.19257, - "95": 0.19189, - "100": 0.19388 + "1": 9.66271, + "2": 0.23225, + "3": 0.21983, + "4": 0.21408, + "5": 0.21473, + "6": 0.21644, + "7": 0.21513, + "8": 0.21892, + "9": 0.21351, + "10": 0.21576, + "11": 0.21747, + "12": 0.21985, + "13": 0.21564, + "14": 0.2155, + "15": 0.21384, + "16": 0.2162, + "17": 0.21558, + "18": 0.21508, + "19": 0.21618, + "20": 0.21836, + "21": 0.21423, + "22": 0.21684, + "23": 0.21439, + "24": 0.21562, + "25": 0.21579, + "26": 0.21914, + "27": 0.21564, + "28": 0.21449, + "29": 0.22032, + "30": 0.22136, + "31": 0.22263, + "32": 0.21897, + "33": 0.21534, + "34": 0.21759, + "35": 0.21572, + "36": 0.21721, + "37": 0.21402, + "38": 0.21621, + "39": 0.21783, + "40": 0.21822, + "41": 0.21596, + "42": 0.21203, + "43": 0.21782, + "44": 0.21805, + "45": 0.2183, + "46": 0.21676, + "47": 0.21734, + "48": 0.2176, + "49": 0.21836, + "50": 0.21593, + "51": 0.22189, + "52": 0.21722, + "53": 0.22114, + "54": 0.21648, + "55": 0.21825, + "56": 0.21733, + "57": 0.21702, + "58": 0.21752, + "59": 0.21546, + "60": 0.2151, + "61": 0.21602, + "62": 0.22135, + "63": 0.21659, + "64": 0.21618, + "65": 0.21569, + "66": 0.21864, + "67": 0.22799, + "68": 0.21833, + "69": 0.21643, + "70": 0.21672, + "71": 0.21562, + "72": 0.21799, + "73": 0.21791, + "74": 0.21898, + "75": 0.2183, + "76": 0.22117, + "77": 0.22, + "78": 0.2188, + "79": 0.21888, + "80": 0.21768, + "81": 0.22547, + "82": 0.2175, + "83": 0.2222, + "84": 0.21749, + "85": 0.22304, + "86": 0.22141, + "87": 0.22658, + "88": 0.21977, + "89": 0.21928, + "90": 0.21911, + "91": 0.22126, + "92": 0.21903, + "93": 0.22164, + "94": 0.21864, + "95": 0.21968, + "96": 0.21892, + "97": 0.21956, + "98": 0.21795, + "99": 0.22313, + "100": 0.2196 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0a3544b2d93 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, + "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, + "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, + "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, + "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, + "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, + "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, + "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, + "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, + "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, + "100": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, + "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, + "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, + "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, + "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, + "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, + "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, + "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, + "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, + "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, + "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, + "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, + "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, + "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, + "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, + "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, + "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, + "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, + "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, + "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, + "100": 1042071040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.6125, + "2": 0.23356, + "3": 0.21314, + "4": 0.21148, + "5": 0.20775, + "6": 0.20509, + "7": 0.19583, + "8": 0.19566, + "9": 0.19148, + "10": 0.19484, + "11": 0.20705, + "12": 0.2015, + "13": 0.18887, + "14": 0.1904, + "15": 0.19036, + "16": 0.18983, + "17": 0.1895, + "18": 0.19146, + "19": 0.18958, + "20": 0.18946, + "21": 0.19061, + "22": 0.19252, + "23": 0.18928, + "24": 0.19105, + "25": 0.18924, + "26": 0.18957, + "27": 0.19008, + "28": 0.19134, + "29": 0.18909, + "30": 0.1922, + "31": 0.1908, + "32": 0.18951, + "33": 0.18928, + "34": 0.19468, + "35": 0.19052, + "36": 0.19049, + "37": 0.19173, + "38": 0.18825, + "39": 0.1911, + "40": 0.18942, + "41": 0.1919, + "42": 0.19303, + "43": 0.19325, + "44": 0.19049, + "45": 0.18935, + "46": 0.18861, + "47": 0.19155, + "48": 0.19149, + "49": 0.1913, + "50": 0.19586, + "51": 0.20004, + "52": 0.19367, + "53": 0.19138, + "54": 0.1927, + "55": 0.19196, + "56": 0.19084, + "57": 0.19081, + "58": 0.19132, + "59": 0.18829, + "60": 0.19212, + "61": 0.19275, + "62": 0.19577, + "63": 0.18781, + "64": 0.1893, + "65": 0.18899, + "66": 0.19016, + "67": 0.1858, + "68": 0.1931, + "69": 0.18841, + "70": 0.18896, + "71": 0.18966, + "72": 0.18842, + "73": 0.19129, + "74": 0.19147, + "75": 0.19408, + "76": 0.19017, + "77": 0.18501, + "78": 0.18992, + "79": 0.18844, + "80": 0.18811, + "81": 0.19097, + "82": 0.18879, + "83": 0.18908, + "84": 0.18763, + "85": 0.1877, + "86": 0.18953, + "87": 0.1893, + "88": 0.18802, + "89": 0.18961, + "90": 0.18878, + "91": 0.18927, + "92": 0.18915, + "93": 0.19047, + "94": 0.19, + "95": 0.19146, + "96": 0.19061, + "97": 0.1925, + "98": 0.18915, + "99": 0.18916, + "100": 0.19162 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..6937fb9bd55 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299203072.0, + "2": 299203072.0, + "3": 299203072.0, + "4": 299203072.0, + "5": 299203072.0, + "6": 299203072.0, + "7": 299203072.0, + "8": 299203072.0, + "9": 299203072.0, + "10": 299203072.0, + "11": 299203072.0, + "12": 299203072.0, + "13": 299203072.0, + "14": 299203072.0, + "15": 299203072.0, + "16": 299203072.0, + "17": 299203072.0, + "18": 299203072.0, + "19": 299203072.0, + "20": 299203072.0, + "21": 299203072.0, + "22": 299203072.0, + "23": 299203072.0, + "24": 299203072.0, + "25": 299203072.0, + "26": 299203072.0, + "27": 299203072.0, + "28": 299203072.0, + "29": 299203072.0, + "30": 299203072.0, + "31": 299203072.0, + "32": 299203072.0, + "33": 299203072.0, + "34": 299203072.0, + "35": 299203072.0, + "36": 299203072.0, + "37": 299203072.0, + "38": 299203072.0, + "39": 299203072.0, + "40": 299203072.0, + "41": 299203072.0, + "42": 299203072.0, + "43": 299203072.0, + "44": 299203072.0, + "45": 299203072.0, + "46": 299203072.0, + "47": 299203072.0, + "48": 299203072.0, + "49": 299203072.0, + "50": 299203072.0, + "51": 299203072.0, + "52": 299203072.0, + "53": 299203072.0, + "54": 299203072.0, + "55": 299203072.0, + "56": 299203072.0, + "57": 299203072.0, + "58": 299203072.0, + "59": 299203072.0, + "60": 299203072.0, + "61": 299203072.0, + "62": 299203072.0, + "63": 299203072.0, + "64": 299203072.0, + "65": 299203072.0, + "66": 299203072.0, + "67": 299203072.0, + "68": 299203072.0, + "69": 299203072.0, + "70": 299203072.0, + "71": 299203072.0, + "72": 299203072.0, + "73": 299203072.0, + "74": 299203072.0, + "75": 299203072.0, + "76": 299203072.0, + "77": 299203072.0, + "78": 299203072.0, + "79": 299203072.0, + "80": 299203072.0, + "81": 299203072.0, + "82": 299203072.0, + "83": 299203072.0, + "84": 299203072.0, + "85": 299203072.0, + "86": 299203072.0, + "87": 299203072.0, + "88": 299203072.0, + "89": 299203072.0, + "90": 299203072.0, + "91": 299203072.0, + "92": 299203072.0, + "93": 299203072.0, + "94": 299203072.0, + "95": 299203072.0, + "96": 299203072.0, + "97": 299203072.0, + "98": 299203072.0, + "99": 299203072.0, + "100": 299203072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977125888.0, + "2": 1042071040.0, + "3": 1042071040.0, + "4": 1042071040.0, + "5": 1042071040.0, + "6": 1042071040.0, + "7": 1042071040.0, + "8": 1042071040.0, + "9": 1042071040.0, + "10": 1042071040.0, + "11": 1042071040.0, + "12": 1042071040.0, + "13": 1042071040.0, + "14": 1042071040.0, + "15": 1042071040.0, + "16": 1042071040.0, + "17": 1042071040.0, + "18": 1042071040.0, + "19": 1042071040.0, + "20": 1042071040.0, + "21": 1042071040.0, + "22": 1042071040.0, + "23": 1042071040.0, + "24": 1042071040.0, + "25": 1042071040.0, + "26": 1042071040.0, + "27": 1042071040.0, + "28": 1042071040.0, + "29": 1042071040.0, + "30": 1042071040.0, + "31": 1042071040.0, + "32": 1042071040.0, + "33": 1042071040.0, + "34": 1042071040.0, + "35": 1042071040.0, + "36": 1042071040.0, + "37": 1042071040.0, + "38": 1042071040.0, + "39": 1042071040.0, + "40": 1042071040.0, + "41": 1042071040.0, + "42": 1042071040.0, + "43": 1042071040.0, + "44": 1042071040.0, + "45": 1042071040.0, + "46": 1042071040.0, + "47": 1042071040.0, + "48": 1042071040.0, + "49": 1042071040.0, + "50": 1042071040.0, + "51": 1042071040.0, + "52": 1042071040.0, + "53": 1042071040.0, + "54": 1042071040.0, + "55": 1042071040.0, + "56": 1042071040.0, + "57": 1042071040.0, + "58": 1042071040.0, + "59": 1042071040.0, + "60": 1042071040.0, + "61": 1042071040.0, + "62": 1042071040.0, + "63": 1042071040.0, + "64": 1042071040.0, + "65": 1042071040.0, + "66": 1042071040.0, + "67": 1042071040.0, + "68": 1042071040.0, + "69": 1042071040.0, + "70": 1042071040.0, + "71": 1042071040.0, + "72": 1042071040.0, + "73": 1042071040.0, + "74": 1042071040.0, + "75": 1042071040.0, + "76": 1042071040.0, + "77": 1042071040.0, + "78": 1042071040.0, + "79": 1042071040.0, + "80": 1042071040.0, + "81": 1042071040.0, + "82": 1042071040.0, + "83": 1042071040.0, + "84": 1042071040.0, + "85": 1042071040.0, + "86": 1042071040.0, + "87": 1042071040.0, + "88": 1042071040.0, + "89": 1042071040.0, + "90": 1042071040.0, + "91": 1042071040.0, + "92": 1042071040.0, + "93": 1042071040.0, + "94": 1042071040.0, + "95": 1042071040.0, + "96": 1042071040.0, + "97": 1042071040.0, + "98": 1042071040.0, + "99": 1042071040.0, + "100": 1042071040.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.71841, + "2": 0.23136, + "3": 0.22493, + "4": 0.22779, + "5": 0.22663, + "6": 0.22036, + "7": 0.23806, + "8": 0.23483, + "9": 0.21894, + "10": 0.22798, + "11": 0.22166, + "12": 0.22477, + "13": 0.21586, + "14": 0.2289, + "15": 0.21846, + "16": 0.22439, + "17": 0.22351, + "18": 0.21894, + "19": 0.22165, + "20": 0.23, + "21": 0.21688, + "22": 0.21901, + "23": 0.21714, + "24": 0.2185, + "25": 0.21681, + "26": 0.21775, + "27": 0.21816, + "28": 0.21837, + "29": 0.21776, + "30": 0.21739, + "31": 0.21725, + "32": 0.21929, + "33": 0.2156, + "34": 0.21959, + "35": 0.21865, + "36": 0.21696, + "37": 0.21952, + "38": 0.21797, + "39": 0.21568, + "40": 0.21803, + "41": 0.21756, + "42": 0.21877, + "43": 0.21676, + "44": 0.21677, + "45": 0.21721, + "46": 0.22075, + "47": 0.21856, + "48": 0.21933, + "49": 0.21808, + "50": 0.21813, + "51": 0.22296, + "52": 0.22336, + "53": 0.21692, + "54": 0.21796, + "55": 0.21788, + "56": 0.22002, + "57": 0.21845, + "58": 0.21989, + "59": 0.21686, + "60": 0.22032, + "61": 0.22127, + "62": 0.21716, + "63": 0.21811, + "64": 0.21821, + "65": 0.22368, + "66": 0.22001, + "67": 0.21796, + "68": 0.21889, + "69": 0.22034, + "70": 0.2227, + "71": 0.2211, + "72": 0.2167, + "73": 0.21687, + "74": 0.22416, + "75": 0.22056, + "76": 0.22116, + "77": 0.21759, + "78": 0.21843, + "79": 0.22272, + "80": 0.21922, + "81": 0.2196, + "82": 0.22739, + "83": 0.22344, + "84": 0.21981, + "85": 0.22041, + "86": 0.22015, + "87": 0.21885, + "88": 0.2239, + "89": 0.22975, + "90": 0.23365, + "91": 0.22476, + "92": 0.22336, + "93": 0.21913, + "94": 0.22057, + "95": 0.21711, + "96": 0.21724, + "97": 0.22153, + "98": 0.21996, + "99": 0.21866, + "100": 0.21935 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json index 0733919eefd..54bb3cbea8d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86122, "5": 10.88242, "10": 10.83506, "15": 10.82738, "20": 10.72743, "25": 10.55753, "30": 10.37895, "35": 10.28321, "40": 10.08785, "45": 9.82625, "50": 9.91327, "55": 9.87788, "60": 9.50874, "65": 8.95103, "70": 9.73167, "75": 9.43681, "80": 9.41156, "85": 9.61613, "90": 9.8217, "95": 9.5191, "100": 9.40588}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1778.0, "5": 2219.0, "10": 1530.0, "15": 2125.0, "20": 1867.0, "25": 1773.0, "30": 2129.0, "35": 2169.0, "40": 2486.0, "45": 2335.0, "50": 2687.0, "55": 2652.0, "60": 2765.0, "65": 2946.0, "70": 3629.0, "75": 2702.0, "80": 3866.0, "85": 3517.0, "90": 3349.0, "95": 3530.0, "100": 3530.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0, "55": 269891584.0, "60": 269891584.0, "65": 269891584.0, "70": 269891584.0, "75": 269891584.0, "80": 269891584.0, "85": 269891584.0, "90": 269891584.0, "95": 269891584.0, "100": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1448634368.0, "5": 1515676672.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0, "55": 1515676672.0, "60": 1515676672.0, "65": 1515676672.0, "70": 1515676672.0, "75": 1515676672.0, "80": 1515676672.0, "85": 1515676672.0, "90": 1515676672.0, "95": 1515676672.0, "100": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.50422, "5": 0.32491, "10": 0.31435, "15": 0.31821, "20": 0.31516, "25": 0.31746, "30": 0.31793, "35": 0.31313, "40": 0.321, "45": 0.31588, "50": 0.31619, "55": 0.31619, "60": 0.31976, "65": 0.31872, "70": 0.31488, "75": 0.32184, "80": 0.31524, "85": 0.31903, "90": 0.31743, "95": 0.31797, "100": 0.3198}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327, + "51": 9.8517, + "52": 9.74686, + "53": 10.07204, + "54": 9.95738, + "55": 9.87788, + "56": 9.62943, + "57": 9.48988, + "58": 9.83265, + "59": 9.58831, + "60": 9.50874, + "61": 9.69495, + "62": 9.99373, + "63": 9.377, + "64": 9.78004, + "65": 8.95103, + "66": 9.71392, + "67": 9.37884, + "68": 9.78831, + "69": 9.79096, + "70": 9.73167, + "71": 9.61776, + "72": 9.59099, + "73": 9.49436, + "74": 8.95001, + "75": 9.43681, + "76": 9.09852, + "77": 10.06447, + "78": 9.72944, + "79": 9.37805, + "80": 9.41156, + "81": 9.48537, + "82": 9.69592, + "83": 9.31981, + "84": 9.42306, + "85": 9.61613, + "86": 9.07185, + "87": 9.59282, + "88": 9.75055, + "89": 9.61194, + "90": 9.8217, + "91": 9.35308, + "92": 9.36305, + "93": 9.08788, + "94": 8.83439, + "95": 9.5191, + "96": 9.52647, + "97": 9.31412, + "98": 9.67541, + "99": 8.88941, + "100": 9.40588 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0, + "51": 2863.0, + "52": 2881.0, + "53": 3220.0, + "54": 2894.0, + "55": 2652.0, + "56": 3006.0, + "57": 2561.0, + "58": 3273.0, + "59": 3039.0, + "60": 2765.0, + "61": 3310.0, + "62": 2936.0, + "63": 2630.0, + "64": 3230.0, + "65": 2946.0, + "66": 3500.0, + "67": 2976.0, + "68": 2944.0, + "69": 3117.0, + "70": 3629.0, + "71": 3255.0, + "72": 2633.0, + "73": 3338.0, + "74": 2172.0, + "75": 2702.0, + "76": 3162.0, + "77": 3850.0, + "78": 3590.0, + "79": 3658.0, + "80": 3866.0, + "81": 3976.0, + "82": 3680.0, + "83": 3153.0, + "84": 3586.0, + "85": 3517.0, + "86": 3137.0, + "87": 4177.0, + "88": 3589.0, + "89": 3849.0, + "90": 3349.0, + "91": 2936.0, + "92": 3526.0, + "93": 2965.0, + "94": 3772.0, + "95": 3530.0, + "96": 3774.0, + "97": 3636.0, + "98": 4064.0, + "99": 3394.0, + "100": 3530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0, + "51": 269891584.0, + "52": 269891584.0, + "53": 269891584.0, + "54": 269891584.0, + "55": 269891584.0, + "56": 269891584.0, + "57": 269891584.0, + "58": 269891584.0, + "59": 269891584.0, + "60": 269891584.0, + "61": 269891584.0, + "62": 269891584.0, + "63": 269891584.0, + "64": 269891584.0, + "65": 269891584.0, + "66": 269891584.0, + "67": 269891584.0, + "68": 269891584.0, + "69": 269891584.0, + "70": 269891584.0, + "71": 269891584.0, + "72": 269891584.0, + "73": 269891584.0, + "74": 269891584.0, + "75": 269891584.0, + "76": 269891584.0, + "77": 269891584.0, + "78": 269891584.0, + "79": 269891584.0, + "80": 269891584.0, + "81": 269891584.0, + "82": 269891584.0, + "83": 269891584.0, + "84": 269891584.0, + "85": 269891584.0, + "86": 269891584.0, + "87": 269891584.0, + "88": 269891584.0, + "89": 269891584.0, + "90": 269891584.0, + "91": 269891584.0, + "92": 269891584.0, + "93": 269891584.0, + "94": 269891584.0, + "95": 269891584.0, + "96": 269891584.0, + "97": 269891584.0, + "98": 269891584.0, + "99": 269891584.0, + "100": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1448634368.0, + "2": 1515676672.0, + "3": 1515676672.0, + "4": 1515676672.0, + "5": 1515676672.0, + "6": 1515676672.0, + "7": 1515676672.0, + "8": 1515676672.0, + "9": 1515676672.0, + "10": 1515676672.0, + "11": 1515676672.0, + "12": 1515676672.0, + "13": 1515676672.0, + "14": 1515676672.0, + "15": 1515676672.0, + "16": 1515676672.0, + "17": 1515676672.0, + "18": 1515676672.0, + "19": 1515676672.0, + "20": 1515676672.0, + "21": 1515676672.0, + "22": 1515676672.0, + "23": 1515676672.0, + "24": 1515676672.0, + "25": 1515676672.0, + "26": 1515676672.0, + "27": 1515676672.0, + "28": 1515676672.0, + "29": 1515676672.0, + "30": 1515676672.0, + "31": 1515676672.0, + "32": 1515676672.0, + "33": 1515676672.0, + "34": 1515676672.0, + "35": 1515676672.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0, + "51": 1515676672.0, + "52": 1515676672.0, + "53": 1515676672.0, + "54": 1515676672.0, + "55": 1515676672.0, + "56": 1515676672.0, + "57": 1515676672.0, + "58": 1515676672.0, + "59": 1515676672.0, + "60": 1515676672.0, + "61": 1515676672.0, + "62": 1515676672.0, + "63": 1515676672.0, + "64": 1515676672.0, + "65": 1515676672.0, + "66": 1515676672.0, + "67": 1515676672.0, + "68": 1515676672.0, + "69": 1515676672.0, + "70": 1515676672.0, + "71": 1515676672.0, + "72": 1515676672.0, + "73": 1515676672.0, + "74": 1515676672.0, + "75": 1515676672.0, + "76": 1515676672.0, + "77": 1515676672.0, + "78": 1515676672.0, + "79": 1515676672.0, + "80": 1515676672.0, + "81": 1515676672.0, + "82": 1515676672.0, + "83": 1515676672.0, + "84": 1515676672.0, + "85": 1515676672.0, + "86": 1515676672.0, + "87": 1515676672.0, + "88": 1515676672.0, + "89": 1515676672.0, + "90": 1515676672.0, + "91": 1515676672.0, + "92": 1515676672.0, + "93": 1515676672.0, + "94": 1515676672.0, + "95": 1515676672.0, + "96": 1515676672.0, + "97": 1515676672.0, + "98": 1515676672.0, + "99": 1515676672.0, + "100": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.29271, + "2": 0.42506, + "3": 0.68343, + "4": 0.36852, + "5": 0.35945, + "6": 0.70082, + "7": 0.36184, + "8": 0.36666, + "9": 0.36956, + "10": 0.36948, + "11": 0.34035, + "12": 0.33106, + "13": 0.32678, + "14": 0.50153, + "15": 0.32624, + "16": 0.32544, + "17": 0.33191, + "18": 0.32618, + "19": 0.3263, + "20": 0.33069, + "21": 0.32595, + "22": 0.3257, + "23": 0.33264, + "24": 0.32517, + "25": 0.32475, + "26": 0.33346, + "27": 0.33354, + "28": 0.32383, + "29": 0.33025, + "30": 0.32292, + "31": 0.32259, + "32": 0.33133, + "33": 0.32233, + "34": 0.32205, + "35": 0.32577, + "36": 0.33027, + "37": 0.32369, + "38": 0.3231, + "39": 0.32941, + "40": 0.32272, + "41": 0.32419, + "42": 0.32862, + "43": 0.32341, + "44": 0.32437, + "45": 0.3291, + "46": 0.32245, + "47": 0.32412, + "48": 0.32928, + "49": 0.32252, + "50": 0.3232, + "51": 0.3288, + "52": 0.32267, + "53": 0.32323, + "54": 0.33682, + "55": 0.32632, + "56": 0.32697, + "57": 0.33895, + "58": 0.32618, + "59": 0.32589, + "60": 0.3322, + "61": 0.3251, + "62": 0.32521, + "63": 0.33036, + "64": 0.32444, + "65": 0.32508, + "66": 0.33114, + "67": 0.32315, + "68": 0.32508, + "69": 0.3303, + "70": 0.32701, + "71": 0.32493, + "72": 0.32932, + "73": 0.32763, + "74": 0.32474, + "75": 0.32636, + "76": 0.33103, + "77": 0.32433, + "78": 0.32583, + "79": 0.33332, + "80": 0.32445, + "81": 0.32512, + "82": 0.33846, + "83": 0.32647, + "84": 0.32584, + "85": 0.33063, + "86": 0.32531, + "87": 0.32597, + "88": 0.33536, + "89": 0.32529, + "90": 0.32619, + "91": 0.33191, + "92": 0.32549, + "93": 0.32565, + "94": 0.33549, + "95": 0.32239, + "96": 0.32249, + "97": 0.32967, + "98": 0.3225, + "99": 0.32206, + "100": 0.32856 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..fbfe9099b9a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327, + "51": 9.8517, + "52": 9.74686, + "53": 10.07204, + "54": 9.95738, + "55": 9.87788, + "56": 9.62943, + "57": 9.48988, + "58": 9.83265, + "59": 9.58831, + "60": 9.50874, + "61": 9.69495, + "62": 9.99373, + "63": 9.377, + "64": 9.78004, + "65": 8.95103, + "66": 9.71392, + "67": 9.37884, + "68": 9.78831, + "69": 9.79096, + "70": 9.73167, + "71": 9.61776, + "72": 9.59099, + "73": 9.49436, + "74": 8.95001, + "75": 9.43681, + "76": 9.09852, + "77": 10.06447, + "78": 9.72944, + "79": 9.37805, + "80": 9.41156, + "81": 9.48537, + "82": 9.69592, + "83": 9.31981, + "84": 9.42306, + "85": 9.61613, + "86": 9.07185, + "87": 9.59282, + "88": 9.75055, + "89": 9.61194, + "90": 9.8217, + "91": 9.35308, + "92": 9.36305, + "93": 9.08788, + "94": 8.83439, + "95": 9.5191, + "96": 9.52647, + "97": 9.31412, + "98": 9.67541, + "99": 8.88941, + "100": 9.40588 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0, + "51": 2863.0, + "52": 2881.0, + "53": 3220.0, + "54": 2894.0, + "55": 2652.0, + "56": 3006.0, + "57": 2561.0, + "58": 3273.0, + "59": 3039.0, + "60": 2765.0, + "61": 3310.0, + "62": 2936.0, + "63": 2630.0, + "64": 3230.0, + "65": 2946.0, + "66": 3500.0, + "67": 2976.0, + "68": 2944.0, + "69": 3117.0, + "70": 3629.0, + "71": 3255.0, + "72": 2633.0, + "73": 3338.0, + "74": 2172.0, + "75": 2702.0, + "76": 3162.0, + "77": 3850.0, + "78": 3590.0, + "79": 3658.0, + "80": 3866.0, + "81": 3976.0, + "82": 3680.0, + "83": 3153.0, + "84": 3586.0, + "85": 3517.0, + "86": 3137.0, + "87": 4177.0, + "88": 3589.0, + "89": 3849.0, + "90": 3349.0, + "91": 2936.0, + "92": 3526.0, + "93": 2965.0, + "94": 3772.0, + "95": 3530.0, + "96": 3774.0, + "97": 3636.0, + "98": 4064.0, + "99": 3394.0, + "100": 3530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0, + "51": 269891584.0, + "52": 269891584.0, + "53": 269891584.0, + "54": 269891584.0, + "55": 269891584.0, + "56": 269891584.0, + "57": 269891584.0, + "58": 269891584.0, + "59": 269891584.0, + "60": 269891584.0, + "61": 269891584.0, + "62": 269891584.0, + "63": 269891584.0, + "64": 269891584.0, + "65": 269891584.0, + "66": 269891584.0, + "67": 269891584.0, + "68": 269891584.0, + "69": 269891584.0, + "70": 269891584.0, + "71": 269891584.0, + "72": 269891584.0, + "73": 269891584.0, + "74": 269891584.0, + "75": 269891584.0, + "76": 269891584.0, + "77": 269891584.0, + "78": 269891584.0, + "79": 269891584.0, + "80": 269891584.0, + "81": 269891584.0, + "82": 269891584.0, + "83": 269891584.0, + "84": 269891584.0, + "85": 269891584.0, + "86": 269891584.0, + "87": 269891584.0, + "88": 269891584.0, + "89": 269891584.0, + "90": 269891584.0, + "91": 269891584.0, + "92": 269891584.0, + "93": 269891584.0, + "94": 269891584.0, + "95": 269891584.0, + "96": 269891584.0, + "97": 269891584.0, + "98": 269891584.0, + "99": 269891584.0, + "100": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1449682432.0, + "2": 1515676160.0, + "3": 1515676672.0, + "4": 1515676672.0, + "5": 1515676672.0, + "6": 1515676672.0, + "7": 1515676672.0, + "8": 1515676672.0, + "9": 1515676672.0, + "10": 1515676672.0, + "11": 1515676672.0, + "12": 1515676672.0, + "13": 1515676672.0, + "14": 1515676672.0, + "15": 1515676672.0, + "16": 1515676672.0, + "17": 1515676672.0, + "18": 1515676672.0, + "19": 1515676672.0, + "20": 1515676672.0, + "21": 1515676672.0, + "22": 1515676672.0, + "23": 1515676672.0, + "24": 1515676672.0, + "25": 1515676672.0, + "26": 1515676672.0, + "27": 1515676672.0, + "28": 1515676672.0, + "29": 1515676672.0, + "30": 1515676672.0, + "31": 1515676672.0, + "32": 1515676672.0, + "33": 1515676672.0, + "34": 1515676672.0, + "35": 1515676672.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0, + "51": 1515676672.0, + "52": 1515676672.0, + "53": 1515676672.0, + "54": 1515676672.0, + "55": 1515676672.0, + "56": 1515676672.0, + "57": 1515676672.0, + "58": 1515676672.0, + "59": 1515676672.0, + "60": 1515676672.0, + "61": 1515676672.0, + "62": 1515676672.0, + "63": 1515676672.0, + "64": 1515676672.0, + "65": 1515676672.0, + "66": 1515676672.0, + "67": 1515676672.0, + "68": 1515676672.0, + "69": 1515676672.0, + "70": 1515676672.0, + "71": 1515676672.0, + "72": 1515676672.0, + "73": 1515676672.0, + "74": 1515676672.0, + "75": 1515676672.0, + "76": 1515676672.0, + "77": 1515676672.0, + "78": 1515676672.0, + "79": 1515676672.0, + "80": 1515676672.0, + "81": 1515676672.0, + "82": 1515676672.0, + "83": 1515676672.0, + "84": 1515676672.0, + "85": 1515676672.0, + "86": 1515676672.0, + "87": 1515676672.0, + "88": 1515676672.0, + "89": 1515676672.0, + "90": 1515676672.0, + "91": 1515676672.0, + "92": 1515676672.0, + "93": 1515676672.0, + "94": 1515676672.0, + "95": 1515676672.0, + "96": 1515676672.0, + "97": 1515676672.0, + "98": 1515676672.0, + "99": 1515676672.0, + "100": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.32987, + "2": 0.44802, + "3": 0.3897, + "4": 0.56459, + "5": 0.32806, + "6": 0.32604, + "7": 0.3324, + "8": 0.32545, + "9": 0.32671, + "10": 0.32918, + "11": 0.32556, + "12": 0.32448, + "13": 0.33048, + "14": 0.32558, + "15": 0.32571, + "16": 0.32541, + "17": 0.32955, + "18": 0.32389, + "19": 0.32497, + "20": 0.32764, + "21": 0.32394, + "22": 0.32563, + "23": 0.32657, + "24": 0.32266, + "25": 0.32254, + "26": 0.3268, + "27": 0.32163, + "28": 0.32398, + "29": 0.32473, + "30": 0.32185, + "31": 0.32189, + "32": 0.32643, + "33": 0.32083, + "34": 0.56155, + "35": 0.31927, + "36": 0.31993, + "37": 0.32102, + "38": 0.32424, + "39": 0.31933, + "40": 0.32056, + "41": 0.32393, + "42": 0.31935, + "43": 0.32004, + "44": 0.32411, + "45": 0.31946, + "46": 0.32014, + "47": 0.32328, + "48": 0.32028, + "49": 0.32003, + "50": 0.32557, + "51": 0.32445, + "52": 0.31875, + "53": 0.32179, + "54": 0.31879, + "55": 0.31778, + "56": 0.32208, + "57": 0.32308, + "58": 0.34278, + "59": 0.321, + "60": 0.32449, + "61": 0.31868, + "62": 0.31968, + "63": 0.323, + "64": 0.31977, + "65": 0.3202, + "66": 0.32473, + "67": 0.3176, + "68": 0.32003, + "69": 0.32585, + "70": 0.31796, + "71": 0.32004, + "72": 0.32637, + "73": 0.31882, + "74": 0.31909, + "75": 0.32558, + "76": 0.31782, + "77": 0.31875, + "78": 0.3264, + "79": 0.31815, + "80": 0.32078, + "81": 0.32153, + "82": 0.31967, + "83": 0.31863, + "84": 0.32086, + "85": 0.3241, + "86": 0.31836, + "87": 0.31939, + "88": 0.32513, + "89": 0.31892, + "90": 0.31985, + "91": 0.32655, + "92": 0.31914, + "93": 0.32019, + "94": 0.3246, + "95": 0.31888, + "96": 0.31924, + "97": 0.32612, + "98": 0.35151, + "99": 0.32636, + "100": 0.32793 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..9480fee796c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86122, + "2": 10.85774, + "3": 10.86039, + "4": 10.84813, + "5": 10.88242, + "6": 10.88645, + "7": 10.86227, + "8": 10.86932, + "9": 10.86444, + "10": 10.83506, + "11": 10.87765, + "12": 10.87384, + "13": 10.87945, + "14": 10.88919, + "15": 10.82738, + "16": 10.83105, + "17": 10.79888, + "18": 10.82441, + "19": 10.81363, + "20": 10.72743, + "21": 10.71638, + "22": 10.57153, + "23": 10.7269, + "24": 10.61223, + "25": 10.55753, + "26": 10.60603, + "27": 10.61792, + "28": 10.57695, + "29": 10.59633, + "30": 10.37895, + "31": 10.13125, + "32": 10.47822, + "33": 10.46894, + "34": 10.22715, + "35": 10.28321, + "36": 10.22751, + "37": 10.35397, + "38": 10.20483, + "39": 10.40755, + "40": 10.08785, + "41": 10.1591, + "42": 10.21601, + "43": 9.84821, + "44": 9.9651, + "45": 9.82625, + "46": 9.83468, + "47": 10.15337, + "48": 9.84529, + "49": 9.52926, + "50": 9.91327, + "51": 9.8517, + "52": 9.74686, + "53": 10.07204, + "54": 9.95738, + "55": 9.87788, + "56": 9.62943, + "57": 9.48988, + "58": 9.83265, + "59": 9.58831, + "60": 9.50874, + "61": 9.69495, + "62": 9.99373, + "63": 9.377, + "64": 9.78004, + "65": 8.95103, + "66": 9.71392, + "67": 9.37884, + "68": 9.78831, + "69": 9.79096, + "70": 9.73167, + "71": 9.61776, + "72": 9.59099, + "73": 9.49436, + "74": 8.95001, + "75": 9.43681, + "76": 9.09852, + "77": 10.06447, + "78": 9.72944, + "79": 9.37805, + "80": 9.41156, + "81": 9.48537, + "82": 9.69592, + "83": 9.31981, + "84": 9.42306, + "85": 9.61613, + "86": 9.07185, + "87": 9.59282, + "88": 9.75055, + "89": 9.61194, + "90": 9.8217, + "91": 9.35308, + "92": 9.36305, + "93": 9.08788, + "94": 8.83439, + "95": 9.5191, + "96": 9.52647, + "97": 9.31412, + "98": 9.67541, + "99": 8.88941, + "100": 9.40588 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1778.0, + "2": 1875.0, + "3": 1879.0, + "4": 1912.0, + "5": 2219.0, + "6": 2163.0, + "7": 2113.0, + "8": 1747.0, + "9": 2049.0, + "10": 1530.0, + "11": 2113.0, + "12": 1959.0, + "13": 2134.0, + "14": 2055.0, + "15": 2125.0, + "16": 2139.0, + "17": 1988.0, + "18": 1892.0, + "19": 1991.0, + "20": 1867.0, + "21": 2023.0, + "22": 1865.0, + "23": 2185.0, + "24": 1774.0, + "25": 1773.0, + "26": 1990.0, + "27": 2061.0, + "28": 2215.0, + "29": 2186.0, + "30": 2129.0, + "31": 1794.0, + "32": 2109.0, + "33": 2422.0, + "34": 2135.0, + "35": 2169.0, + "36": 2127.0, + "37": 2432.0, + "38": 2490.0, + "39": 2495.0, + "40": 2486.0, + "41": 2465.0, + "42": 2535.0, + "43": 2216.0, + "44": 2407.0, + "45": 2335.0, + "46": 2617.0, + "47": 2830.0, + "48": 2480.0, + "49": 2492.0, + "50": 2687.0, + "51": 2863.0, + "52": 2881.0, + "53": 3220.0, + "54": 2894.0, + "55": 2652.0, + "56": 3006.0, + "57": 2561.0, + "58": 3273.0, + "59": 3039.0, + "60": 2765.0, + "61": 3310.0, + "62": 2936.0, + "63": 2630.0, + "64": 3230.0, + "65": 2946.0, + "66": 3500.0, + "67": 2976.0, + "68": 2944.0, + "69": 3117.0, + "70": 3629.0, + "71": 3255.0, + "72": 2633.0, + "73": 3338.0, + "74": 2172.0, + "75": 2702.0, + "76": 3162.0, + "77": 3850.0, + "78": 3590.0, + "79": 3658.0, + "80": 3866.0, + "81": 3976.0, + "82": 3680.0, + "83": 3153.0, + "84": 3586.0, + "85": 3517.0, + "86": 3137.0, + "87": 4177.0, + "88": 3589.0, + "89": 3849.0, + "90": 3349.0, + "91": 2936.0, + "92": 3526.0, + "93": 2965.0, + "94": 3772.0, + "95": 3530.0, + "96": 3774.0, + "97": 3636.0, + "98": 4064.0, + "99": 3394.0, + "100": 3530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 269891584.0, + "2": 269891584.0, + "3": 269891584.0, + "4": 269891584.0, + "5": 269891584.0, + "6": 269891584.0, + "7": 269891584.0, + "8": 269891584.0, + "9": 269891584.0, + "10": 269891584.0, + "11": 269891584.0, + "12": 269891584.0, + "13": 269891584.0, + "14": 269891584.0, + "15": 269891584.0, + "16": 269891584.0, + "17": 269891584.0, + "18": 269891584.0, + "19": 269891584.0, + "20": 269891584.0, + "21": 269891584.0, + "22": 269891584.0, + "23": 269891584.0, + "24": 269891584.0, + "25": 269891584.0, + "26": 269891584.0, + "27": 269891584.0, + "28": 269891584.0, + "29": 269891584.0, + "30": 269891584.0, + "31": 269891584.0, + "32": 269891584.0, + "33": 269891584.0, + "34": 269891584.0, + "35": 269891584.0, + "36": 269891584.0, + "37": 269891584.0, + "38": 269891584.0, + "39": 269891584.0, + "40": 269891584.0, + "41": 269891584.0, + "42": 269891584.0, + "43": 269891584.0, + "44": 269891584.0, + "45": 269891584.0, + "46": 269891584.0, + "47": 269891584.0, + "48": 269891584.0, + "49": 269891584.0, + "50": 269891584.0, + "51": 269891584.0, + "52": 269891584.0, + "53": 269891584.0, + "54": 269891584.0, + "55": 269891584.0, + "56": 269891584.0, + "57": 269891584.0, + "58": 269891584.0, + "59": 269891584.0, + "60": 269891584.0, + "61": 269891584.0, + "62": 269891584.0, + "63": 269891584.0, + "64": 269891584.0, + "65": 269891584.0, + "66": 269891584.0, + "67": 269891584.0, + "68": 269891584.0, + "69": 269891584.0, + "70": 269891584.0, + "71": 269891584.0, + "72": 269891584.0, + "73": 269891584.0, + "74": 269891584.0, + "75": 269891584.0, + "76": 269891584.0, + "77": 269891584.0, + "78": 269891584.0, + "79": 269891584.0, + "80": 269891584.0, + "81": 269891584.0, + "82": 269891584.0, + "83": 269891584.0, + "84": 269891584.0, + "85": 269891584.0, + "86": 269891584.0, + "87": 269891584.0, + "88": 269891584.0, + "89": 269891584.0, + "90": 269891584.0, + "91": 269891584.0, + "92": 269891584.0, + "93": 269891584.0, + "94": 269891584.0, + "95": 269891584.0, + "96": 269891584.0, + "97": 269891584.0, + "98": 269891584.0, + "99": 269891584.0, + "100": 269891584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1448633856.0, + "2": 1513579520.0, + "3": 1513579520.0, + "4": 1513579520.0, + "5": 1513579520.0, + "6": 1513579520.0, + "7": 1513579520.0, + "8": 1515676160.0, + "9": 1515676160.0, + "10": 1515676160.0, + "11": 1515676160.0, + "12": 1515676160.0, + "13": 1515676160.0, + "14": 1515676160.0, + "15": 1515676160.0, + "16": 1515676160.0, + "17": 1515676160.0, + "18": 1515676160.0, + "19": 1515676160.0, + "20": 1515676160.0, + "21": 1515676160.0, + "22": 1515676160.0, + "23": 1515676160.0, + "24": 1515676160.0, + "25": 1515676160.0, + "26": 1515676160.0, + "27": 1515676160.0, + "28": 1515676160.0, + "29": 1515676160.0, + "30": 1515676160.0, + "31": 1515676160.0, + "32": 1515676160.0, + "33": 1515676160.0, + "34": 1515676160.0, + "35": 1515676160.0, + "36": 1515676672.0, + "37": 1515676672.0, + "38": 1515676672.0, + "39": 1515676672.0, + "40": 1515676672.0, + "41": 1515676672.0, + "42": 1515676672.0, + "43": 1515676672.0, + "44": 1515676672.0, + "45": 1515676672.0, + "46": 1515676672.0, + "47": 1515676672.0, + "48": 1515676672.0, + "49": 1515676672.0, + "50": 1515676672.0, + "51": 1515676672.0, + "52": 1515676672.0, + "53": 1515676672.0, + "54": 1515676672.0, + "55": 1515676672.0, + "56": 1515676672.0, + "57": 1515676672.0, + "58": 1515676672.0, + "59": 1515676672.0, + "60": 1515676672.0, + "61": 1515676672.0, + "62": 1515676672.0, + "63": 1515676672.0, + "64": 1515676672.0, + "65": 1515676672.0, + "66": 1515676672.0, + "67": 1515676672.0, + "68": 1515676672.0, + "69": 1515676672.0, + "70": 1515676672.0, + "71": 1515676672.0, + "72": 1515676672.0, + "73": 1515676672.0, + "74": 1515676672.0, + "75": 1515676672.0, + "76": 1515676672.0, + "77": 1515676672.0, + "78": 1515676672.0, + "79": 1515676672.0, + "80": 1515676672.0, + "81": 1515676672.0, + "82": 1515676672.0, + "83": 1515676672.0, + "84": 1515676672.0, + "85": 1515676672.0, + "86": 1515676672.0, + "87": 1515676672.0, + "88": 1515676672.0, + "89": 1515676672.0, + "90": 1515676672.0, + "91": 1515676672.0, + "92": 1515676672.0, + "93": 1515676672.0, + "94": 1515676672.0, + "95": 1515676672.0, + "96": 1515676672.0, + "97": 1515676672.0, + "98": 1515676672.0, + "99": 1515676672.0, + "100": 1515676672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.43327, + "2": 0.37217, + "3": 0.69038, + "4": 0.33729, + "5": 0.33255, + "6": 0.3329, + "7": 0.34063, + "8": 0.55397, + "9": 0.33233, + "10": 0.33512, + "11": 0.33544, + "12": 0.33156, + "13": 0.33165, + "14": 0.33013, + "15": 0.32988, + "16": 0.32999, + "17": 0.32805, + "18": 0.32946, + "19": 0.33103, + "20": 0.32729, + "21": 0.32872, + "22": 0.3299, + "23": 0.33066, + "24": 0.3297, + "25": 0.32925, + "26": 0.33007, + "27": 0.32757, + "28": 0.32935, + "29": 0.32613, + "30": 0.33036, + "31": 0.32825, + "32": 0.32791, + "33": 0.32815, + "34": 0.32917, + "35": 0.32646, + "36": 0.33004, + "37": 0.3301, + "38": 0.32598, + "39": 0.32992, + "40": 0.33003, + "41": 0.32599, + "42": 0.32948, + "43": 0.3293, + "44": 0.326, + "45": 0.3277, + "46": 0.33009, + "47": 0.32567, + "48": 0.32635, + "49": 0.33059, + "50": 0.33062, + "51": 0.33004, + "52": 0.32318, + "53": 0.32666, + "54": 0.32944, + "55": 0.32431, + "56": 0.3255, + "57": 0.33385, + "58": 0.32385, + "59": 0.32365, + "60": 0.33444, + "61": 0.32406, + "62": 0.32323, + "63": 0.33128, + "64": 0.32416, + "65": 0.32428, + "66": 0.32909, + "67": 0.32519, + "68": 0.3235, + "69": 0.33075, + "70": 0.32636, + "71": 0.32447, + "72": 0.32921, + "73": 0.32654, + "74": 0.32367, + "75": 0.32884, + "76": 0.32668, + "77": 0.32544, + "78": 0.33087, + "79": 0.32596, + "80": 0.32366, + "81": 0.32924, + "82": 0.32879, + "83": 0.32405, + "84": 0.32977, + "85": 0.32708, + "86": 0.32429, + "87": 0.32954, + "88": 0.32748, + "89": 0.32359, + "90": 0.3286, + "91": 0.33163, + "92": 0.32398, + "93": 0.32839, + "94": 0.3316, + "95": 0.32702, + "96": 0.32902, + "97": 0.32869, + "98": 0.32786, + "99": 0.33283, + "100": 0.3296 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index c74efe95bb5..b194abf2755 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84194, "5": 10.85873, "10": 10.81845, "15": 10.81222, "20": 10.71072, "25": 10.57461, "30": 10.40091, "35": 10.28875, "40": 10.10167, "45": 9.86955, "50": 9.91374, "55": 9.89204, "60": 9.51573, "65": 8.95939, "70": 9.74555, "75": 9.41848, "80": 9.40261, "85": 9.61514, "90": 9.81999, "95": 9.51099, "100": 9.39984}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1664.0, "5": 2007.0, "10": 1469.0, "15": 1992.0, "20": 1767.0, "25": 1747.0, "30": 1936.0, "35": 1963.0, "40": 2274.0, "45": 2043.0, "50": 2278.0, "55": 2307.0, "60": 2287.0, "65": 2544.0, "70": 3049.0, "75": 2539.0, "80": 3101.0, "85": 3288.0, "90": 3168.0, "95": 3186.0, "100": 3212.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 397747712.0, "5": 397747712.0, "10": 397747712.0, "15": 397747712.0, "20": 397747712.0, "25": 397747712.0, "30": 397747712.0, "35": 397747712.0, "40": 397747712.0, "45": 397747712.0, "50": 397747712.0, "55": 397747712.0, "60": 397747712.0, "65": 397747712.0, "70": 397747712.0, "75": 397747712.0, "80": 397747712.0, "85": 397747712.0, "90": 397747712.0, "95": 397747712.0, "100": 397747712.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1044755968.0, "5": 1177840128.0, "10": 1177840128.0, "15": 1177840128.0, "20": 1177840128.0, "25": 1177840128.0, "30": 1177840128.0, "35": 1177840128.0, "40": 1177840128.0, "45": 1177840128.0, "50": 1177840128.0, "55": 1177840128.0, "60": 1177840128.0, "65": 1177840128.0, "70": 1177840128.0, "75": 1177840128.0, "80": 1177840128.0, "85": 1177840128.0, "90": 1177840128.0, "95": 1177840128.0, "100": 1177840128.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.05354, "5": 0.25457, "10": 0.23579, "15": 0.24024, "20": 0.23692, "25": 0.24276, "30": 0.24032, "35": 0.26057, "40": 0.23557, "45": 0.23278, "50": 0.23752, "55": 0.25569, "60": 0.23569, "65": 0.23452, "70": 0.2368, "75": 0.24765, "80": 0.24644, "85": 0.23632, "90": 0.23404, "95": 0.23761, "100": 0.24117}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84194, + "2": 10.85713, + "3": 10.84346, + "4": 10.84202, + "5": 10.85873, + "6": 10.86412, + "7": 10.851, + "8": 10.84731, + "9": 10.85736, + "10": 10.81845, + "11": 10.8595, + "12": 10.84335, + "13": 10.86446, + "14": 10.85336, + "15": 10.81222, + "16": 10.81549, + "17": 10.78956, + "18": 10.79784, + "19": 10.79279, + "20": 10.71072, + "21": 10.6971, + "22": 10.58894, + "23": 10.7072, + "24": 10.60764, + "25": 10.57461, + "26": 10.6238, + "27": 10.62036, + "28": 10.567, + "29": 10.57013, + "30": 10.40091, + "31": 10.17393, + "32": 10.46119, + "33": 10.45713, + "34": 10.24672, + "35": 10.28875, + "36": 10.25284, + "37": 10.3466, + "38": 10.20914, + "39": 10.39432, + "40": 10.10167, + "41": 10.159, + "42": 10.21413, + "43": 9.8848, + "44": 9.98809, + "45": 9.86955, + "46": 9.84366, + "47": 10.1377, + "48": 9.87973, + "49": 9.56916, + "50": 9.91374, + "51": 9.86379, + "52": 9.75652, + "53": 10.06157, + "54": 9.96418, + "55": 9.89204, + "56": 9.63681, + "57": 9.49807, + "58": 9.83504, + "59": 9.59701, + "60": 9.51573, + "61": 9.70155, + "62": 9.97973, + "63": 9.38914, + "64": 9.77552, + "65": 8.95939, + "66": 9.6978, + "67": 9.37174, + "68": 9.78449, + "69": 9.79058, + "70": 9.74555, + "71": 9.61867, + "72": 9.58317, + "73": 9.49175, + "74": 8.939, + "75": 9.41848, + "76": 9.07237, + "77": 10.06903, + "78": 9.72443, + "79": 9.3767, + "80": 9.40261, + "81": 9.47859, + "82": 9.6984, + "83": 9.30086, + "84": 9.41299, + "85": 9.61514, + "86": 9.07881, + "87": 9.59402, + "88": 9.74658, + "89": 9.60096, + "90": 9.81999, + "91": 9.32977, + "92": 9.35625, + "93": 9.07406, + "94": 8.82774, + "95": 9.51099, + "96": 9.52501, + "97": 9.3163, + "98": 9.67278, + "99": 8.88493, + "100": 9.39984 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1664.0, + "2": 1707.0, + "3": 1836.0, + "4": 1861.0, + "5": 2007.0, + "6": 1868.0, + "7": 1826.0, + "8": 1697.0, + "9": 1815.0, + "10": 1469.0, + "11": 1876.0, + "12": 1879.0, + "13": 1979.0, + "14": 1902.0, + "15": 1992.0, + "16": 1988.0, + "17": 1879.0, + "18": 1802.0, + "19": 1886.0, + "20": 1767.0, + "21": 1929.0, + "22": 1714.0, + "23": 2031.0, + "24": 1685.0, + "25": 1747.0, + "26": 1811.0, + "27": 1915.0, + "28": 1929.0, + "29": 2020.0, + "30": 1936.0, + "31": 1680.0, + "32": 1878.0, + "33": 2204.0, + "34": 1888.0, + "35": 1963.0, + "36": 1928.0, + "37": 2383.0, + "38": 2177.0, + "39": 2388.0, + "40": 2274.0, + "41": 2194.0, + "42": 2167.0, + "43": 1922.0, + "44": 1978.0, + "45": 2043.0, + "46": 2112.0, + "47": 2556.0, + "48": 2251.0, + "49": 2320.0, + "50": 2278.0, + "51": 2563.0, + "52": 2431.0, + "53": 2917.0, + "54": 2655.0, + "55": 2307.0, + "56": 2605.0, + "57": 2385.0, + "58": 2952.0, + "59": 2730.0, + "60": 2287.0, + "61": 2904.0, + "62": 2601.0, + "63": 2452.0, + "64": 2810.0, + "65": 2544.0, + "66": 2914.0, + "67": 2664.0, + "68": 2709.0, + "69": 2967.0, + "70": 3049.0, + "71": 2936.0, + "72": 2410.0, + "73": 2991.0, + "74": 1882.0, + "75": 2539.0, + "76": 3060.0, + "77": 3219.0, + "78": 3023.0, + "79": 3084.0, + "80": 3101.0, + "81": 3530.0, + "82": 3298.0, + "83": 2666.0, + "84": 3154.0, + "85": 3288.0, + "86": 2827.0, + "87": 3720.0, + "88": 3168.0, + "89": 3275.0, + "90": 3168.0, + "91": 2919.0, + "92": 3071.0, + "93": 2751.0, + "94": 3412.0, + "95": 3186.0, + "96": 3429.0, + "97": 3083.0, + "98": 3477.0, + "99": 3093.0, + "100": 3212.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 397747712.0, + "2": 397747712.0, + "3": 397747712.0, + "4": 397747712.0, + "5": 397747712.0, + "6": 397747712.0, + "7": 397747712.0, + "8": 397747712.0, + "9": 397747712.0, + "10": 397747712.0, + "11": 397747712.0, + "12": 397747712.0, + "13": 397747712.0, + "14": 397747712.0, + "15": 397747712.0, + "16": 397747712.0, + "17": 397747712.0, + "18": 397747712.0, + "19": 397747712.0, + "20": 397747712.0, + "21": 397747712.0, + "22": 397747712.0, + "23": 397747712.0, + "24": 397747712.0, + "25": 397747712.0, + "26": 397747712.0, + "27": 397747712.0, + "28": 397747712.0, + "29": 397747712.0, + "30": 397747712.0, + "31": 397747712.0, + "32": 397747712.0, + "33": 397747712.0, + "34": 397747712.0, + "35": 397747712.0, + "36": 397747712.0, + "37": 397747712.0, + "38": 397747712.0, + "39": 397747712.0, + "40": 397747712.0, + "41": 397747712.0, + "42": 397747712.0, + "43": 397747712.0, + "44": 397747712.0, + "45": 397747712.0, + "46": 397747712.0, + "47": 397747712.0, + "48": 397747712.0, + "49": 397747712.0, + "50": 397747712.0, + "51": 397747712.0, + "52": 397747712.0, + "53": 397747712.0, + "54": 397747712.0, + "55": 397747712.0, + "56": 397747712.0, + "57": 397747712.0, + "58": 397747712.0, + "59": 397747712.0, + "60": 397747712.0, + "61": 397747712.0, + "62": 397747712.0, + "63": 397747712.0, + "64": 397747712.0, + "65": 397747712.0, + "66": 397747712.0, + "67": 397747712.0, + "68": 397747712.0, + "69": 397747712.0, + "70": 397747712.0, + "71": 397747712.0, + "72": 397747712.0, + "73": 397747712.0, + "74": 397747712.0, + "75": 397747712.0, + "76": 397747712.0, + "77": 397747712.0, + "78": 397747712.0, + "79": 397747712.0, + "80": 397747712.0, + "81": 397747712.0, + "82": 397747712.0, + "83": 397747712.0, + "84": 397747712.0, + "85": 397747712.0, + "86": 397747712.0, + "87": 397747712.0, + "88": 397747712.0, + "89": 397747712.0, + "90": 397747712.0, + "91": 397747712.0, + "92": 397747712.0, + "93": 397747712.0, + "94": 397747712.0, + "95": 397747712.0, + "96": 397747712.0, + "97": 397747712.0, + "98": 397747712.0, + "99": 397747712.0, + "100": 397747712.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1044755968.0, + "2": 1177840128.0, + "3": 1177840128.0, + "4": 1177840128.0, + "5": 1177840128.0, + "6": 1177840128.0, + "7": 1177840128.0, + "8": 1177840128.0, + "9": 1177840128.0, + "10": 1177840128.0, + "11": 1177840128.0, + "12": 1177840128.0, + "13": 1177840128.0, + "14": 1177840128.0, + "15": 1177840128.0, + "16": 1177840128.0, + "17": 1177840128.0, + "18": 1177840128.0, + "19": 1177840128.0, + "20": 1177840128.0, + "21": 1177840128.0, + "22": 1177840128.0, + "23": 1177840128.0, + "24": 1177840128.0, + "25": 1177840128.0, + "26": 1177840128.0, + "27": 1177840128.0, + "28": 1177840128.0, + "29": 1177840128.0, + "30": 1177840128.0, + "31": 1177840128.0, + "32": 1177840128.0, + "33": 1177840128.0, + "34": 1177840128.0, + "35": 1177840128.0, + "36": 1177840128.0, + "37": 1177840128.0, + "38": 1177840128.0, + "39": 1177840128.0, + "40": 1177840128.0, + "41": 1177840128.0, + "42": 1177840128.0, + "43": 1177840128.0, + "44": 1177840128.0, + "45": 1177840128.0, + "46": 1177840128.0, + "47": 1177840128.0, + "48": 1177840128.0, + "49": 1177840128.0, + "50": 1177840128.0, + "51": 1177840128.0, + "52": 1177840128.0, + "53": 1177840128.0, + "54": 1177840128.0, + "55": 1177840128.0, + "56": 1177840128.0, + "57": 1177840128.0, + "58": 1177840128.0, + "59": 1177840128.0, + "60": 1177840128.0, + "61": 1177840128.0, + "62": 1177840128.0, + "63": 1177840128.0, + "64": 1177840128.0, + "65": 1177840128.0, + "66": 1177840128.0, + "67": 1177840128.0, + "68": 1177840128.0, + "69": 1177840128.0, + "70": 1177840128.0, + "71": 1177840128.0, + "72": 1177840128.0, + "73": 1177840128.0, + "74": 1177840128.0, + "75": 1177840128.0, + "76": 1177840128.0, + "77": 1177840128.0, + "78": 1177840128.0, + "79": 1177840128.0, + "80": 1177840128.0, + "81": 1177840128.0, + "82": 1177840128.0, + "83": 1177840128.0, + "84": 1177840128.0, + "85": 1177840128.0, + "86": 1177840128.0, + "87": 1177840128.0, + "88": 1177840128.0, + "89": 1177840128.0, + "90": 1177840128.0, + "91": 1177840128.0, + "92": 1177840128.0, + "93": 1177840128.0, + "94": 1177840128.0, + "95": 1177840128.0, + "96": 1177840128.0, + "97": 1177840128.0, + "98": 1177840128.0, + "99": 1177840128.0, + "100": 1177840128.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.98808, + "2": 0.31896, + "3": 0.2872, + "4": 0.28844, + "5": 0.29055, + "6": 0.28565, + "7": 0.29151, + "8": 0.2909, + "9": 0.28554, + "10": 0.28532, + "11": 0.28987, + "12": 0.29026, + "13": 0.28704, + "14": 0.28868, + "15": 0.29081, + "16": 0.29135, + "17": 0.29053, + "18": 0.29219, + "19": 0.28784, + "20": 0.29358, + "21": 0.30495, + "22": 0.29941, + "23": 0.29122, + "24": 0.29122, + "25": 0.29408, + "26": 0.29093, + "27": 0.2904, + "28": 0.29116, + "29": 0.29607, + "30": 0.29163, + "31": 0.29002, + "32": 0.29186, + "33": 0.28732, + "34": 0.28673, + "35": 0.29062, + "36": 0.2913, + "37": 0.28723, + "38": 0.28871, + "39": 0.29253, + "40": 0.2884, + "41": 0.28738, + "42": 0.28836, + "43": 0.28808, + "44": 0.28794, + "45": 0.29124, + "46": 0.29271, + "47": 0.28573, + "48": 0.28587, + "49": 0.28908, + "50": 0.28839, + "51": 0.30021, + "52": 0.30654, + "53": 0.3059, + "54": 0.29714, + "55": 0.28911, + "56": 0.29586, + "57": 0.29074, + "58": 0.28682, + "59": 0.29439, + "60": 0.28999, + "61": 0.29254, + "62": 0.28813, + "63": 0.29743, + "64": 0.28913, + "65": 0.29726, + "66": 0.29597, + "67": 0.28858, + "68": 0.29025, + "69": 0.29089, + "70": 0.29517, + "71": 0.28924, + "72": 0.29291, + "73": 0.29626, + "74": 0.29034, + "75": 0.28667, + "76": 0.29537, + "77": 0.29663, + "78": 0.29518, + "79": 0.29485, + "80": 0.29784, + "81": 0.2912, + "82": 0.29265, + "83": 0.29806, + "84": 0.29292, + "85": 0.29315, + "86": 0.31345, + "87": 0.31236, + "88": 0.29799, + "89": 0.2941, + "90": 0.29816, + "91": 0.29109, + "92": 0.2885, + "93": 0.29422, + "94": 0.29493, + "95": 0.28717, + "96": 0.29109, + "97": 0.29595, + "98": 0.29077, + "99": 0.29004, + "100": 0.29477 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..bd823394dd2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84194, + "2": 10.85713, + "3": 10.84346, + "4": 10.84202, + "5": 10.85873, + "6": 10.86412, + "7": 10.851, + "8": 10.84731, + "9": 10.85736, + "10": 10.81845, + "11": 10.8595, + "12": 10.84335, + "13": 10.86446, + "14": 10.85336, + "15": 10.81222, + "16": 10.81549, + "17": 10.78956, + "18": 10.79784, + "19": 10.79279, + "20": 10.71072, + "21": 10.6971, + "22": 10.58894, + "23": 10.7072, + "24": 10.60764, + "25": 10.57461, + "26": 10.6238, + "27": 10.62036, + "28": 10.567, + "29": 10.57013, + "30": 10.40091, + "31": 10.17393, + "32": 10.46119, + "33": 10.45713, + "34": 10.24672, + "35": 10.28875, + "36": 10.25284, + "37": 10.3466, + "38": 10.20914, + "39": 10.39432, + "40": 10.10167, + "41": 10.159, + "42": 10.21413, + "43": 9.8848, + "44": 9.98809, + "45": 9.86955, + "46": 9.84366, + "47": 10.1377, + "48": 9.87973, + "49": 9.56916, + "50": 9.91374, + "51": 9.86379, + "52": 9.75652, + "53": 10.06157, + "54": 9.96418, + "55": 9.89204, + "56": 9.63681, + "57": 9.49807, + "58": 9.83504, + "59": 9.59701, + "60": 9.51573, + "61": 9.70155, + "62": 9.97973, + "63": 9.38914, + "64": 9.77552, + "65": 8.95939, + "66": 9.6978, + "67": 9.37174, + "68": 9.78449, + "69": 9.79058, + "70": 9.74555, + "71": 9.61867, + "72": 9.58317, + "73": 9.49175, + "74": 8.939, + "75": 9.41848, + "76": 9.07237, + "77": 10.06903, + "78": 9.72443, + "79": 9.3767, + "80": 9.40261, + "81": 9.47859, + "82": 9.6984, + "83": 9.30086, + "84": 9.41299, + "85": 9.61514, + "86": 9.07881, + "87": 9.59402, + "88": 9.74658, + "89": 9.60096, + "90": 9.81999, + "91": 9.32977, + "92": 9.35625, + "93": 9.07406, + "94": 8.82774, + "95": 9.51099, + "96": 9.52501, + "97": 9.3163, + "98": 9.67278, + "99": 8.88493, + "100": 9.39984 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1664.0, + "2": 1707.0, + "3": 1836.0, + "4": 1861.0, + "5": 2007.0, + "6": 1868.0, + "7": 1826.0, + "8": 1697.0, + "9": 1815.0, + "10": 1469.0, + "11": 1876.0, + "12": 1879.0, + "13": 1979.0, + "14": 1902.0, + "15": 1992.0, + "16": 1988.0, + "17": 1879.0, + "18": 1802.0, + "19": 1886.0, + "20": 1767.0, + "21": 1929.0, + "22": 1714.0, + "23": 2031.0, + "24": 1685.0, + "25": 1747.0, + "26": 1811.0, + "27": 1915.0, + "28": 1929.0, + "29": 2020.0, + "30": 1936.0, + "31": 1680.0, + "32": 1878.0, + "33": 2204.0, + "34": 1888.0, + "35": 1963.0, + "36": 1928.0, + "37": 2383.0, + "38": 2177.0, + "39": 2388.0, + "40": 2274.0, + "41": 2194.0, + "42": 2167.0, + "43": 1922.0, + "44": 1978.0, + "45": 2043.0, + "46": 2112.0, + "47": 2556.0, + "48": 2251.0, + "49": 2320.0, + "50": 2278.0, + "51": 2563.0, + "52": 2431.0, + "53": 2917.0, + "54": 2655.0, + "55": 2307.0, + "56": 2605.0, + "57": 2385.0, + "58": 2952.0, + "59": 2730.0, + "60": 2287.0, + "61": 2904.0, + "62": 2601.0, + "63": 2452.0, + "64": 2810.0, + "65": 2544.0, + "66": 2914.0, + "67": 2664.0, + "68": 2709.0, + "69": 2967.0, + "70": 3049.0, + "71": 2936.0, + "72": 2410.0, + "73": 2991.0, + "74": 1882.0, + "75": 2539.0, + "76": 3060.0, + "77": 3219.0, + "78": 3023.0, + "79": 3084.0, + "80": 3101.0, + "81": 3530.0, + "82": 3298.0, + "83": 2666.0, + "84": 3154.0, + "85": 3288.0, + "86": 2827.0, + "87": 3720.0, + "88": 3168.0, + "89": 3275.0, + "90": 3168.0, + "91": 2919.0, + "92": 3071.0, + "93": 2751.0, + "94": 3412.0, + "95": 3186.0, + "96": 3429.0, + "97": 3083.0, + "98": 3477.0, + "99": 3093.0, + "100": 3212.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 397747712.0, + "2": 397747712.0, + "3": 397747712.0, + "4": 397747712.0, + "5": 397747712.0, + "6": 397747712.0, + "7": 397747712.0, + "8": 397747712.0, + "9": 397747712.0, + "10": 397747712.0, + "11": 397747712.0, + "12": 397747712.0, + "13": 397747712.0, + "14": 397747712.0, + "15": 397747712.0, + "16": 397747712.0, + "17": 397747712.0, + "18": 397747712.0, + "19": 397747712.0, + "20": 397747712.0, + "21": 397747712.0, + "22": 397747712.0, + "23": 397747712.0, + "24": 397747712.0, + "25": 397747712.0, + "26": 397747712.0, + "27": 397747712.0, + "28": 397747712.0, + "29": 397747712.0, + "30": 397747712.0, + "31": 397747712.0, + "32": 397747712.0, + "33": 397747712.0, + "34": 397747712.0, + "35": 397747712.0, + "36": 397747712.0, + "37": 397747712.0, + "38": 397747712.0, + "39": 397747712.0, + "40": 397747712.0, + "41": 397747712.0, + "42": 397747712.0, + "43": 397747712.0, + "44": 397747712.0, + "45": 397747712.0, + "46": 397747712.0, + "47": 397747712.0, + "48": 397747712.0, + "49": 397747712.0, + "50": 397747712.0, + "51": 397747712.0, + "52": 397747712.0, + "53": 397747712.0, + "54": 397747712.0, + "55": 397747712.0, + "56": 397747712.0, + "57": 397747712.0, + "58": 397747712.0, + "59": 397747712.0, + "60": 397747712.0, + "61": 397747712.0, + "62": 397747712.0, + "63": 397747712.0, + "64": 397747712.0, + "65": 397747712.0, + "66": 397747712.0, + "67": 397747712.0, + "68": 397747712.0, + "69": 397747712.0, + "70": 397747712.0, + "71": 397747712.0, + "72": 397747712.0, + "73": 397747712.0, + "74": 397747712.0, + "75": 397747712.0, + "76": 397747712.0, + "77": 397747712.0, + "78": 397747712.0, + "79": 397747712.0, + "80": 397747712.0, + "81": 397747712.0, + "82": 397747712.0, + "83": 397747712.0, + "84": 397747712.0, + "85": 397747712.0, + "86": 397747712.0, + "87": 397747712.0, + "88": 397747712.0, + "89": 397747712.0, + "90": 397747712.0, + "91": 397747712.0, + "92": 397747712.0, + "93": 397747712.0, + "94": 397747712.0, + "95": 397747712.0, + "96": 397747712.0, + "97": 397747712.0, + "98": 397747712.0, + "99": 397747712.0, + "100": 397747712.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1044755968.0, + "2": 1177840128.0, + "3": 1177840128.0, + "4": 1177840128.0, + "5": 1177840128.0, + "6": 1177840128.0, + "7": 1177840128.0, + "8": 1177840128.0, + "9": 1177840128.0, + "10": 1177840128.0, + "11": 1177840128.0, + "12": 1177840128.0, + "13": 1177840128.0, + "14": 1177840128.0, + "15": 1177840128.0, + "16": 1177840128.0, + "17": 1177840128.0, + "18": 1177840128.0, + "19": 1177840128.0, + "20": 1177840128.0, + "21": 1177840128.0, + "22": 1177840128.0, + "23": 1177840128.0, + "24": 1177840128.0, + "25": 1177840128.0, + "26": 1177840128.0, + "27": 1177840128.0, + "28": 1177840128.0, + "29": 1177840128.0, + "30": 1177840128.0, + "31": 1177840128.0, + "32": 1177840128.0, + "33": 1177840128.0, + "34": 1177840128.0, + "35": 1177840128.0, + "36": 1177840128.0, + "37": 1177840128.0, + "38": 1177840128.0, + "39": 1177840128.0, + "40": 1177840128.0, + "41": 1177840128.0, + "42": 1177840128.0, + "43": 1177840128.0, + "44": 1177840128.0, + "45": 1177840128.0, + "46": 1177840128.0, + "47": 1177840128.0, + "48": 1177840128.0, + "49": 1177840128.0, + "50": 1177840128.0, + "51": 1177840128.0, + "52": 1177840128.0, + "53": 1177840128.0, + "54": 1177840128.0, + "55": 1177840128.0, + "56": 1177840128.0, + "57": 1177840128.0, + "58": 1177840128.0, + "59": 1177840128.0, + "60": 1177840128.0, + "61": 1177840128.0, + "62": 1177840128.0, + "63": 1177840128.0, + "64": 1177840128.0, + "65": 1177840128.0, + "66": 1177840128.0, + "67": 1177840128.0, + "68": 1177840128.0, + "69": 1177840128.0, + "70": 1177840128.0, + "71": 1177840128.0, + "72": 1177840128.0, + "73": 1177840128.0, + "74": 1177840128.0, + "75": 1177840128.0, + "76": 1177840128.0, + "77": 1177840128.0, + "78": 1177840128.0, + "79": 1177840128.0, + "80": 1177840128.0, + "81": 1177840128.0, + "82": 1177840128.0, + "83": 1177840128.0, + "84": 1177840128.0, + "85": 1177840128.0, + "86": 1177840128.0, + "87": 1177840128.0, + "88": 1177840128.0, + "89": 1177840128.0, + "90": 1177840128.0, + "91": 1177840128.0, + "92": 1177840128.0, + "93": 1177840128.0, + "94": 1177840128.0, + "95": 1177840128.0, + "96": 1177840128.0, + "97": 1177840128.0, + "98": 1177840128.0, + "99": 1177840128.0, + "100": 1177840128.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.95666, + "2": 0.32924, + "3": 0.25226, + "4": 0.25106, + "5": 0.25493, + "6": 0.25253, + "7": 0.25357, + "8": 0.25271, + "9": 0.25432, + "10": 0.25385, + "11": 0.25308, + "12": 0.25347, + "13": 0.25055, + "14": 0.25356, + "15": 0.26243, + "16": 0.26195, + "17": 0.25653, + "18": 0.25321, + "19": 0.25683, + "20": 0.253, + "21": 0.26002, + "22": 0.25583, + "23": 0.2569, + "24": 0.25453, + "25": 0.25674, + "26": 0.28427, + "27": 0.26846, + "28": 0.25669, + "29": 0.25979, + "30": 0.25506, + "31": 0.25795, + "32": 0.25594, + "33": 0.25547, + "34": 0.25599, + "35": 0.2592, + "36": 0.25766, + "37": 0.25711, + "38": 0.25265, + "39": 0.25683, + "40": 0.25734, + "41": 0.25589, + "42": 0.25063, + "43": 0.25742, + "44": 0.25967, + "45": 0.25573, + "46": 0.25687, + "47": 0.26161, + "48": 0.25952, + "49": 0.25626, + "50": 0.25429, + "51": 0.26173, + "52": 0.27578, + "53": 0.2696, + "54": 0.26719, + "55": 0.26842, + "56": 0.27282, + "57": 0.27059, + "58": 0.26573, + "59": 0.27553, + "60": 0.26764, + "61": 0.25837, + "62": 0.25923, + "63": 0.27037, + "64": 0.26917, + "65": 0.26615, + "66": 0.57271, + "67": 0.26906, + "68": 0.26543, + "69": 0.26985, + "70": 0.27165, + "71": 0.26533, + "72": 0.27015, + "73": 0.26666, + "74": 0.26902, + "75": 0.26747, + "76": 0.26725, + "77": 0.269, + "78": 0.27067, + "79": 0.26982, + "80": 0.26617, + "81": 0.269, + "82": 0.26853, + "83": 0.26607, + "84": 0.26722, + "85": 0.27017, + "86": 0.2778, + "87": 0.27697, + "88": 0.27012, + "89": 0.27065, + "90": 0.26599, + "91": 0.26551, + "92": 0.27357, + "93": 0.27599, + "94": 0.26598, + "95": 0.27382, + "96": 0.27956, + "97": 0.26613, + "98": 0.26511, + "99": 0.26941, + "100": 0.27208 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d5d1de46cac --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84194, + "2": 10.85713, + "3": 10.84346, + "4": 10.84202, + "5": 10.85873, + "6": 10.86412, + "7": 10.851, + "8": 10.84731, + "9": 10.85736, + "10": 10.81845, + "11": 10.8595, + "12": 10.84335, + "13": 10.86446, + "14": 10.85336, + "15": 10.81222, + "16": 10.81549, + "17": 10.78956, + "18": 10.79784, + "19": 10.79279, + "20": 10.71072, + "21": 10.6971, + "22": 10.58894, + "23": 10.7072, + "24": 10.60764, + "25": 10.57461, + "26": 10.6238, + "27": 10.62036, + "28": 10.567, + "29": 10.57013, + "30": 10.40091, + "31": 10.17393, + "32": 10.46119, + "33": 10.45713, + "34": 10.24672, + "35": 10.28875, + "36": 10.25284, + "37": 10.3466, + "38": 10.20914, + "39": 10.39432, + "40": 10.10167, + "41": 10.159, + "42": 10.21413, + "43": 9.8848, + "44": 9.98809, + "45": 9.86955, + "46": 9.84366, + "47": 10.1377, + "48": 9.87973, + "49": 9.56916, + "50": 9.91374, + "51": 9.86379, + "52": 9.75652, + "53": 10.06157, + "54": 9.96418, + "55": 9.89204, + "56": 9.63681, + "57": 9.49807, + "58": 9.83504, + "59": 9.59701, + "60": 9.51573, + "61": 9.70155, + "62": 9.97973, + "63": 9.38914, + "64": 9.77552, + "65": 8.95939, + "66": 9.6978, + "67": 9.37174, + "68": 9.78449, + "69": 9.79058, + "70": 9.74555, + "71": 9.61867, + "72": 9.58317, + "73": 9.49175, + "74": 8.939, + "75": 9.41848, + "76": 9.07237, + "77": 10.06903, + "78": 9.72443, + "79": 9.3767, + "80": 9.40261, + "81": 9.47859, + "82": 9.6984, + "83": 9.30086, + "84": 9.41299, + "85": 9.61514, + "86": 9.07881, + "87": 9.59402, + "88": 9.74658, + "89": 9.60096, + "90": 9.81999, + "91": 9.32977, + "92": 9.35625, + "93": 9.07406, + "94": 8.82774, + "95": 9.51099, + "96": 9.52501, + "97": 9.3163, + "98": 9.67278, + "99": 8.88493, + "100": 9.39984 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1664.0, + "2": 1707.0, + "3": 1836.0, + "4": 1861.0, + "5": 2007.0, + "6": 1868.0, + "7": 1826.0, + "8": 1697.0, + "9": 1815.0, + "10": 1469.0, + "11": 1876.0, + "12": 1879.0, + "13": 1979.0, + "14": 1902.0, + "15": 1992.0, + "16": 1988.0, + "17": 1879.0, + "18": 1802.0, + "19": 1886.0, + "20": 1767.0, + "21": 1929.0, + "22": 1714.0, + "23": 2031.0, + "24": 1685.0, + "25": 1747.0, + "26": 1811.0, + "27": 1915.0, + "28": 1929.0, + "29": 2020.0, + "30": 1936.0, + "31": 1680.0, + "32": 1878.0, + "33": 2204.0, + "34": 1888.0, + "35": 1963.0, + "36": 1928.0, + "37": 2383.0, + "38": 2177.0, + "39": 2388.0, + "40": 2274.0, + "41": 2194.0, + "42": 2167.0, + "43": 1922.0, + "44": 1978.0, + "45": 2043.0, + "46": 2112.0, + "47": 2556.0, + "48": 2251.0, + "49": 2320.0, + "50": 2278.0, + "51": 2563.0, + "52": 2431.0, + "53": 2917.0, + "54": 2655.0, + "55": 2307.0, + "56": 2605.0, + "57": 2385.0, + "58": 2952.0, + "59": 2730.0, + "60": 2287.0, + "61": 2904.0, + "62": 2601.0, + "63": 2452.0, + "64": 2810.0, + "65": 2544.0, + "66": 2914.0, + "67": 2664.0, + "68": 2709.0, + "69": 2967.0, + "70": 3049.0, + "71": 2936.0, + "72": 2410.0, + "73": 2991.0, + "74": 1882.0, + "75": 2539.0, + "76": 3060.0, + "77": 3219.0, + "78": 3023.0, + "79": 3084.0, + "80": 3101.0, + "81": 3530.0, + "82": 3298.0, + "83": 2666.0, + "84": 3154.0, + "85": 3288.0, + "86": 2827.0, + "87": 3720.0, + "88": 3168.0, + "89": 3275.0, + "90": 3168.0, + "91": 2919.0, + "92": 3071.0, + "93": 2751.0, + "94": 3412.0, + "95": 3186.0, + "96": 3429.0, + "97": 3083.0, + "98": 3477.0, + "99": 3093.0, + "100": 3212.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 397747712.0, + "2": 397747712.0, + "3": 397747712.0, + "4": 397747712.0, + "5": 397747712.0, + "6": 397747712.0, + "7": 397747712.0, + "8": 397747712.0, + "9": 397747712.0, + "10": 397747712.0, + "11": 397747712.0, + "12": 397747712.0, + "13": 397747712.0, + "14": 397747712.0, + "15": 397747712.0, + "16": 397747712.0, + "17": 397747712.0, + "18": 397747712.0, + "19": 397747712.0, + "20": 397747712.0, + "21": 397747712.0, + "22": 397747712.0, + "23": 397747712.0, + "24": 397747712.0, + "25": 397747712.0, + "26": 397747712.0, + "27": 397747712.0, + "28": 397747712.0, + "29": 397747712.0, + "30": 397747712.0, + "31": 397747712.0, + "32": 397747712.0, + "33": 397747712.0, + "34": 397747712.0, + "35": 397747712.0, + "36": 397747712.0, + "37": 397747712.0, + "38": 397747712.0, + "39": 397747712.0, + "40": 397747712.0, + "41": 397747712.0, + "42": 397747712.0, + "43": 397747712.0, + "44": 397747712.0, + "45": 397747712.0, + "46": 397747712.0, + "47": 397747712.0, + "48": 397747712.0, + "49": 397747712.0, + "50": 397747712.0, + "51": 397747712.0, + "52": 397747712.0, + "53": 397747712.0, + "54": 397747712.0, + "55": 397747712.0, + "56": 397747712.0, + "57": 397747712.0, + "58": 397747712.0, + "59": 397747712.0, + "60": 397747712.0, + "61": 397747712.0, + "62": 397747712.0, + "63": 397747712.0, + "64": 397747712.0, + "65": 397747712.0, + "66": 397747712.0, + "67": 397747712.0, + "68": 397747712.0, + "69": 397747712.0, + "70": 397747712.0, + "71": 397747712.0, + "72": 397747712.0, + "73": 397747712.0, + "74": 397747712.0, + "75": 397747712.0, + "76": 397747712.0, + "77": 397747712.0, + "78": 397747712.0, + "79": 397747712.0, + "80": 397747712.0, + "81": 397747712.0, + "82": 397747712.0, + "83": 397747712.0, + "84": 397747712.0, + "85": 397747712.0, + "86": 397747712.0, + "87": 397747712.0, + "88": 397747712.0, + "89": 397747712.0, + "90": 397747712.0, + "91": 397747712.0, + "92": 397747712.0, + "93": 397747712.0, + "94": 397747712.0, + "95": 397747712.0, + "96": 397747712.0, + "97": 397747712.0, + "98": 397747712.0, + "99": 397747712.0, + "100": 397747712.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1044755968.0, + "2": 1177840128.0, + "3": 1177840128.0, + "4": 1177840128.0, + "5": 1177840128.0, + "6": 1177840128.0, + "7": 1177840128.0, + "8": 1177840128.0, + "9": 1177840128.0, + "10": 1177840128.0, + "11": 1177840128.0, + "12": 1177840128.0, + "13": 1177840128.0, + "14": 1177840128.0, + "15": 1177840128.0, + "16": 1177840128.0, + "17": 1177840128.0, + "18": 1177840128.0, + "19": 1177840128.0, + "20": 1177840128.0, + "21": 1177840128.0, + "22": 1177840128.0, + "23": 1177840128.0, + "24": 1177840128.0, + "25": 1177840128.0, + "26": 1177840128.0, + "27": 1177840128.0, + "28": 1177840128.0, + "29": 1177840128.0, + "30": 1177840128.0, + "31": 1177840128.0, + "32": 1177840128.0, + "33": 1177840128.0, + "34": 1177840128.0, + "35": 1177840128.0, + "36": 1177840128.0, + "37": 1177840128.0, + "38": 1177840128.0, + "39": 1177840128.0, + "40": 1177840128.0, + "41": 1177840128.0, + "42": 1177840128.0, + "43": 1177840128.0, + "44": 1177840128.0, + "45": 1177840128.0, + "46": 1177840128.0, + "47": 1177840128.0, + "48": 1177840128.0, + "49": 1177840128.0, + "50": 1177840128.0, + "51": 1177840128.0, + "52": 1177840128.0, + "53": 1177840128.0, + "54": 1177840128.0, + "55": 1177840128.0, + "56": 1177840128.0, + "57": 1177840128.0, + "58": 1177840128.0, + "59": 1177840128.0, + "60": 1177840128.0, + "61": 1177840128.0, + "62": 1177840128.0, + "63": 1177840128.0, + "64": 1177840128.0, + "65": 1177840128.0, + "66": 1177840128.0, + "67": 1177840128.0, + "68": 1177840128.0, + "69": 1177840128.0, + "70": 1177840128.0, + "71": 1177840128.0, + "72": 1177840128.0, + "73": 1177840128.0, + "74": 1177840128.0, + "75": 1177840128.0, + "76": 1177840128.0, + "77": 1177840128.0, + "78": 1177840128.0, + "79": 1177840128.0, + "80": 1177840128.0, + "81": 1177840128.0, + "82": 1177840128.0, + "83": 1177840128.0, + "84": 1177840128.0, + "85": 1177840128.0, + "86": 1177840128.0, + "87": 1177840128.0, + "88": 1177840128.0, + "89": 1177840128.0, + "90": 1177840128.0, + "91": 1177840128.0, + "92": 1177840128.0, + "93": 1177840128.0, + "94": 1177840128.0, + "95": 1177840128.0, + "96": 1177840128.0, + "97": 1177840128.0, + "98": 1177840128.0, + "99": 1177840128.0, + "100": 1177840128.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.61367, + "2": 0.31935, + "3": 0.29274, + "4": 0.28637, + "5": 0.2844, + "6": 0.29788, + "7": 0.2902, + "8": 0.28573, + "9": 0.29136, + "10": 0.29884, + "11": 0.29048, + "12": 0.2896, + "13": 0.29421, + "14": 0.29008, + "15": 0.2871, + "16": 0.28903, + "17": 0.2924, + "18": 0.28887, + "19": 0.28926, + "20": 0.30241, + "21": 0.29571, + "22": 0.28966, + "23": 0.29177, + "24": 0.29106, + "25": 0.28884, + "26": 0.28921, + "27": 0.29461, + "28": 0.28664, + "29": 0.28881, + "30": 0.29392, + "31": 0.29062, + "32": 0.28778, + "33": 0.29055, + "34": 0.29409, + "35": 0.29169, + "36": 0.29211, + "37": 0.29809, + "38": 0.29114, + "39": 0.29052, + "40": 0.2919, + "41": 0.2953, + "42": 0.28957, + "43": 0.29349, + "44": 0.30062, + "45": 0.28999, + "46": 0.29486, + "47": 0.29689, + "48": 0.29092, + "49": 0.29024, + "50": 0.28916, + "51": 0.30865, + "52": 0.29957, + "53": 0.28833, + "54": 0.29375, + "55": 0.29176, + "56": 0.29338, + "57": 0.28952, + "58": 0.29232, + "59": 0.29026, + "60": 0.28767, + "61": 0.29364, + "62": 0.2935, + "63": 0.29522, + "64": 0.29495, + "65": 0.29509, + "66": 0.29643, + "67": 0.29584, + "68": 0.29853, + "69": 0.29821, + "70": 0.29334, + "71": 0.29579, + "72": 0.29325, + "73": 0.29403, + "74": 0.29671, + "75": 0.63106, + "76": 0.29142, + "77": 0.29491, + "78": 0.29437, + "79": 0.29239, + "80": 0.29453, + "81": 0.29509, + "82": 0.29493, + "83": 0.2915, + "84": 0.30181, + "85": 0.29305, + "86": 0.28823, + "87": 0.29337, + "88": 0.29025, + "89": 0.28953, + "90": 0.29694, + "91": 0.29077, + "92": 0.29411, + "93": 0.28767, + "94": 0.29313, + "95": 0.29276, + "96": 0.29197, + "97": 0.29466, + "98": 0.29321, + "99": 0.29311, + "100": 0.29175 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..68686a287ae --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86209, + "2": 10.85806, + "3": 10.8598, + "4": 10.84984, + "5": 10.88253, + "6": 10.88646, + "7": 10.8626, + "8": 10.86997, + "9": 10.86483, + "10": 10.83642, + "11": 10.87862, + "12": 10.87482, + "13": 10.87957, + "14": 10.88968, + "15": 10.82909, + "16": 10.8329, + "17": 10.79973, + "18": 10.82619, + "19": 10.81484, + "20": 10.73237, + "21": 10.72029, + "22": 10.57776, + "23": 10.73009, + "24": 10.61704, + "25": 10.56392, + "26": 10.6109, + "27": 10.6244, + "28": 10.58233, + "29": 10.59936, + "30": 10.38484, + "31": 10.14179, + "32": 10.48065, + "33": 10.47405, + "34": 10.23471, + "35": 10.28951, + "36": 10.23434, + "37": 10.35826, + "38": 10.20825, + "39": 10.41154, + "40": 10.09133, + "41": 10.1661, + "42": 10.21968, + "43": 9.85861, + "44": 9.97128, + "45": 9.83487, + "46": 9.84446, + "47": 10.15847, + "48": 9.85182, + "49": 9.53839, + "50": 9.91604, + "51": 9.85736, + "52": 9.75252, + "53": 10.0755, + "54": 9.96042, + "55": 9.88232, + "56": 9.63204, + "57": 9.49336, + "58": 9.83436, + "59": 9.59208, + "60": 9.51376, + "61": 9.69806, + "62": 9.99169, + "63": 9.37379, + "64": 9.77832, + "65": 8.95392, + "66": 9.71066, + "67": 9.38186, + "68": 9.78754, + "69": 9.7933, + "70": 9.73094, + "71": 9.61728, + "72": 9.58467, + "73": 9.4898, + "74": 8.94127, + "75": 9.4313, + "76": 9.09097, + "77": 10.06237, + "78": 9.72645, + "79": 9.37428, + "80": 9.40597, + "81": 9.47979, + "82": 9.69227, + "83": 9.3124, + "84": 9.41987, + "85": 9.61137, + "86": 9.06834, + "87": 9.59084, + "88": 9.74523, + "89": 9.6065, + "90": 9.81743, + "91": 9.34257, + "92": 9.35903, + "93": 9.07904, + "94": 8.82791, + "95": 9.51571, + "96": 9.52139, + "97": 9.31116, + "98": 9.67194, + "99": 8.88688, + "100": 9.40429 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1780.0, + "2": 1990.0, + "3": 1911.0, + "4": 1881.0, + "5": 2137.0, + "6": 2167.0, + "7": 2095.0, + "8": 1824.0, + "9": 2072.0, + "10": 1588.0, + "11": 2120.0, + "12": 2042.0, + "13": 2228.0, + "14": 2143.0, + "15": 2083.0, + "16": 1988.0, + "17": 2055.0, + "18": 1945.0, + "19": 2015.0, + "20": 1816.0, + "21": 2133.0, + "22": 1909.0, + "23": 2404.0, + "24": 1868.0, + "25": 1862.0, + "26": 1978.0, + "27": 2095.0, + "28": 2298.0, + "29": 2242.0, + "30": 2045.0, + "31": 1805.0, + "32": 2205.0, + "33": 2426.0, + "34": 2176.0, + "35": 2205.0, + "36": 2185.0, + "37": 2605.0, + "38": 2508.0, + "39": 2524.0, + "40": 2629.0, + "41": 2531.0, + "42": 2594.0, + "43": 2335.0, + "44": 2316.0, + "45": 2441.0, + "46": 2665.0, + "47": 2694.0, + "48": 2587.0, + "49": 2538.0, + "50": 2734.0, + "51": 2906.0, + "52": 2829.0, + "53": 3163.0, + "54": 3001.0, + "55": 2662.0, + "56": 2967.0, + "57": 2540.0, + "58": 3326.0, + "59": 3105.0, + "60": 2726.0, + "61": 3284.0, + "62": 2957.0, + "63": 2690.0, + "64": 3247.0, + "65": 3011.0, + "66": 3409.0, + "67": 2852.0, + "68": 3048.0, + "69": 3229.0, + "70": 3737.0, + "71": 3186.0, + "72": 2634.0, + "73": 3390.0, + "74": 2125.0, + "75": 2771.0, + "76": 3235.0, + "77": 3605.0, + "78": 3672.0, + "79": 3633.0, + "80": 3804.0, + "81": 4084.0, + "82": 3675.0, + "83": 3138.0, + "84": 3636.0, + "85": 3588.0, + "86": 3171.0, + "87": 4250.0, + "88": 3592.0, + "89": 3775.0, + "90": 3384.0, + "91": 3074.0, + "92": 3533.0, + "93": 3067.0, + "94": 3730.0, + "95": 3590.0, + "96": 3888.0, + "97": 3580.0, + "98": 4012.0, + "99": 3315.0, + "100": 3454.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 368387072.0, + "2": 368387072.0, + "3": 368387072.0, + "4": 368387072.0, + "5": 368387072.0, + "6": 368387072.0, + "7": 368387072.0, + "8": 368387072.0, + "9": 368387072.0, + "10": 368387072.0, + "11": 368387072.0, + "12": 368387072.0, + "13": 368387072.0, + "14": 368387072.0, + "15": 368387072.0, + "16": 368387072.0, + "17": 368387072.0, + "18": 368387072.0, + "19": 368387072.0, + "20": 368387072.0, + "21": 368387072.0, + "22": 368387072.0, + "23": 368387072.0, + "24": 368387072.0, + "25": 368387072.0, + "26": 368387072.0, + "27": 368387072.0, + "28": 368387072.0, + "29": 368387072.0, + "30": 368387072.0, + "31": 368387072.0, + "32": 368387072.0, + "33": 368387072.0, + "34": 368387072.0, + "35": 368387072.0, + "36": 368387072.0, + "37": 368387072.0, + "38": 368387072.0, + "39": 368387072.0, + "40": 368387072.0, + "41": 368387072.0, + "42": 368387072.0, + "43": 368387072.0, + "44": 368387072.0, + "45": 368387072.0, + "46": 368387072.0, + "47": 368387072.0, + "48": 368387072.0, + "49": 368387072.0, + "50": 368387072.0, + "51": 368387072.0, + "52": 368387072.0, + "53": 368387072.0, + "54": 368387072.0, + "55": 368387072.0, + "56": 368387072.0, + "57": 368387072.0, + "58": 368387072.0, + "59": 368387072.0, + "60": 368387072.0, + "61": 368387072.0, + "62": 368387072.0, + "63": 368387072.0, + "64": 368387072.0, + "65": 368387072.0, + "66": 368387072.0, + "67": 368387072.0, + "68": 368387072.0, + "69": 368387072.0, + "70": 368387072.0, + "71": 368387072.0, + "72": 368387072.0, + "73": 368387072.0, + "74": 368387072.0, + "75": 368387072.0, + "76": 368387072.0, + "77": 368387072.0, + "78": 368387072.0, + "79": 368387072.0, + "80": 368387072.0, + "81": 368387072.0, + "82": 368387072.0, + "83": 368387072.0, + "84": 368387072.0, + "85": 368387072.0, + "86": 368387072.0, + "87": 368387072.0, + "88": 368387072.0, + "89": 368387072.0, + "90": 368387072.0, + "91": 368387072.0, + "92": 368387072.0, + "93": 368387072.0, + "94": 368387072.0, + "95": 368387072.0, + "96": 368387072.0, + "97": 368387072.0, + "98": 368387072.0, + "99": 368387072.0, + "100": 368387072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1510972416.0, + "2": 1643008000.0, + "3": 1643008000.0, + "4": 1643008000.0, + "5": 1643008000.0, + "6": 1645105152.0, + "7": 1645105152.0, + "8": 1645105152.0, + "9": 1645105152.0, + "10": 1645105152.0, + "11": 1647201280.0, + "12": 1647201280.0, + "13": 1647201280.0, + "14": 1647201280.0, + "15": 1647201280.0, + "16": 1647201280.0, + "17": 1647201280.0, + "18": 1647201280.0, + "19": 1647201280.0, + "20": 1647201280.0, + "21": 1647201280.0, + "22": 1647201280.0, + "23": 1647201280.0, + "24": 1647201280.0, + "25": 1647201280.0, + "26": 1647201280.0, + "27": 1647201280.0, + "28": 1647201280.0, + "29": 1647201280.0, + "30": 1647201280.0, + "31": 1647201280.0, + "32": 1647201280.0, + "33": 1647201280.0, + "34": 1647201280.0, + "35": 1647201280.0, + "36": 1647201280.0, + "37": 1647201280.0, + "38": 1649296896.0, + "39": 1649296896.0, + "40": 1649296896.0, + "41": 1649296896.0, + "42": 1649296896.0, + "43": 1649296896.0, + "44": 1649296896.0, + "45": 1649296896.0, + "46": 1649296896.0, + "47": 1649296896.0, + "48": 1649296896.0, + "49": 1649296896.0, + "50": 1649296896.0, + "51": 1649296896.0, + "52": 1649299456.0, + "53": 1649299456.0, + "54": 1649299456.0, + "55": 1649299456.0, + "56": 1649299456.0, + "57": 1649299456.0, + "58": 1649299456.0, + "59": 1649299456.0, + "60": 1649299456.0, + "61": 1649299456.0, + "62": 1649299456.0, + "63": 1649299456.0, + "64": 1649299456.0, + "65": 1649299456.0, + "66": 1649299456.0, + "67": 1649299456.0, + "68": 1649299456.0, + "69": 1649299456.0, + "70": 1649299456.0, + "71": 1649299456.0, + "72": 1649299456.0, + "73": 1649299456.0, + "74": 1649299456.0, + "75": 1649299456.0, + "76": 1649299456.0, + "77": 1649299456.0, + "78": 1649299456.0, + "79": 1649299456.0, + "80": 1649299456.0, + "81": 1649299456.0, + "82": 1649299456.0, + "83": 1649299456.0, + "84": 1649299456.0, + "85": 1649299456.0, + "86": 1649299456.0, + "87": 1649299456.0, + "88": 1649299456.0, + "89": 1649299456.0, + "90": 1649299456.0, + "91": 1649299456.0, + "92": 1649299456.0, + "93": 1649299456.0, + "94": 1649299456.0, + "95": 1649299456.0, + "96": 1649299456.0, + "97": 1649299456.0, + "98": 1649299456.0, + "99": 1649299456.0, + "100": 1649299456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.6334, + "2": 0.58887, + "3": 0.44885, + "4": 0.45823, + "5": 0.4541, + "6": 0.47222, + "7": 1.10638, + "8": 0.43653, + "9": 0.44329, + "10": 0.44399, + "11": 0.44344, + "12": 0.44343, + "13": 0.44305, + "14": 0.44198, + "15": 0.43185, + "16": 0.44065, + "17": 0.4397, + "18": 0.43652, + "19": 0.44411, + "20": 0.43298, + "21": 0.43948, + "22": 0.43139, + "23": 0.44927, + "24": 0.42704, + "25": 0.42868, + "26": 0.64107, + "27": 0.43117, + "28": 0.43201, + "29": 0.42798, + "30": 0.43481, + "31": 0.5935, + "32": 0.43533, + "33": 0.42675, + "34": 0.44082, + "35": 0.42648, + "36": 0.43241, + "37": 0.42804, + "38": 0.42825, + "39": 0.43697, + "40": 0.42755, + "41": 0.43914, + "42": 0.42638, + "43": 0.43891, + "44": 0.42856, + "45": 0.42888, + "46": 0.44513, + "47": 0.4274, + "48": 0.43414, + "49": 0.65463, + "50": 0.43047, + "51": 0.43747, + "52": 0.44679, + "53": 0.4308, + "54": 0.43283, + "55": 0.44288, + "56": 0.43291, + "57": 0.44077, + "58": 0.43033, + "59": 0.43703, + "60": 0.43023, + "61": 0.43081, + "62": 0.4427, + "63": 0.43029, + "64": 0.44385, + "65": 0.43137, + "66": 0.44438, + "67": 0.43134, + "68": 0.43364, + "69": 0.43286, + "70": 0.43126, + "71": 0.4347, + "72": 0.42922, + "73": 0.44303, + "74": 0.43105, + "75": 0.43275, + "76": 0.43316, + "77": 0.43097, + "78": 0.43941, + "79": 0.42984, + "80": 0.43662, + "81": 0.43019, + "82": 0.44076, + "83": 0.42994, + "84": 0.4329, + "85": 0.44259, + "86": 0.43023, + "87": 0.43581, + "88": 0.42929, + "89": 0.43896, + "90": 0.4306, + "91": 0.43406, + "92": 0.43524, + "93": 0.43032, + "94": 0.44318, + "95": 0.42838, + "96": 0.44267, + "97": 0.43005, + "98": 0.43788, + "99": 0.43526, + "100": 0.43277 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..48895a39167 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.86209, + "2": 10.85806, + "3": 10.8598, + "4": 10.84984, + "5": 10.88253, + "6": 10.88646, + "7": 10.8626, + "8": 10.86997, + "9": 10.86483, + "10": 10.83642, + "11": 10.87862, + "12": 10.87482, + "13": 10.87957, + "14": 10.88968, + "15": 10.82909, + "16": 10.8329, + "17": 10.79973, + "18": 10.82619, + "19": 10.81484, + "20": 10.73237, + "21": 10.72029, + "22": 10.57776, + "23": 10.73009, + "24": 10.61704, + "25": 10.56392, + "26": 10.6109, + "27": 10.6244, + "28": 10.58233, + "29": 10.59936, + "30": 10.38484, + "31": 10.14179, + "32": 10.48065, + "33": 10.47405, + "34": 10.23471, + "35": 10.28951, + "36": 10.23434, + "37": 10.35826, + "38": 10.20825, + "39": 10.41154, + "40": 10.09133, + "41": 10.1661, + "42": 10.21968, + "43": 9.85861, + "44": 9.97128, + "45": 9.83487, + "46": 9.84446, + "47": 10.15847, + "48": 9.85182, + "49": 9.53839, + "50": 9.91604, + "51": 9.85736, + "52": 9.75252, + "53": 10.0755, + "54": 9.96042, + "55": 9.88232, + "56": 9.63204, + "57": 9.49336, + "58": 9.83436, + "59": 9.59208, + "60": 9.51376, + "61": 9.69806, + "62": 9.99169, + "63": 9.37379, + "64": 9.77832, + "65": 8.95392, + "66": 9.71066, + "67": 9.38186, + "68": 9.78754, + "69": 9.7933, + "70": 9.73094, + "71": 9.61728, + "72": 9.58467, + "73": 9.4898, + "74": 8.94127, + "75": 9.4313, + "76": 9.09097, + "77": 10.06237, + "78": 9.72645, + "79": 9.37428, + "80": 9.40597, + "81": 9.47979, + "82": 9.69227, + "83": 9.3124, + "84": 9.41987, + "85": 9.61137, + "86": 9.06834, + "87": 9.59084, + "88": 9.74523, + "89": 9.6065, + "90": 9.81743, + "91": 9.34257, + "92": 9.35903, + "93": 9.07904, + "94": 8.82791, + "95": 9.51571, + "96": 9.52139, + "97": 9.31116, + "98": 9.67194, + "99": 8.88688, + "100": 9.40429 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1780.0, + "2": 1990.0, + "3": 1911.0, + "4": 1881.0, + "5": 2137.0, + "6": 2167.0, + "7": 2095.0, + "8": 1824.0, + "9": 2072.0, + "10": 1588.0, + "11": 2120.0, + "12": 2042.0, + "13": 2228.0, + "14": 2143.0, + "15": 2083.0, + "16": 1988.0, + "17": 2055.0, + "18": 1945.0, + "19": 2015.0, + "20": 1816.0, + "21": 2133.0, + "22": 1909.0, + "23": 2404.0, + "24": 1868.0, + "25": 1862.0, + "26": 1978.0, + "27": 2095.0, + "28": 2298.0, + "29": 2242.0, + "30": 2045.0, + "31": 1805.0, + "32": 2205.0, + "33": 2426.0, + "34": 2176.0, + "35": 2205.0, + "36": 2185.0, + "37": 2605.0, + "38": 2508.0, + "39": 2524.0, + "40": 2629.0, + "41": 2531.0, + "42": 2594.0, + "43": 2335.0, + "44": 2316.0, + "45": 2441.0, + "46": 2665.0, + "47": 2694.0, + "48": 2587.0, + "49": 2538.0, + "50": 2734.0, + "51": 2906.0, + "52": 2829.0, + "53": 3163.0, + "54": 3001.0, + "55": 2662.0, + "56": 2967.0, + "57": 2540.0, + "58": 3326.0, + "59": 3105.0, + "60": 2726.0, + "61": 3284.0, + "62": 2957.0, + "63": 2690.0, + "64": 3247.0, + "65": 3011.0, + "66": 3409.0, + "67": 2852.0, + "68": 3048.0, + "69": 3229.0, + "70": 3737.0, + "71": 3186.0, + "72": 2634.0, + "73": 3390.0, + "74": 2125.0, + "75": 2771.0, + "76": 3235.0, + "77": 3605.0, + "78": 3672.0, + "79": 3633.0, + "80": 3804.0, + "81": 4084.0, + "82": 3675.0, + "83": 3138.0, + "84": 3636.0, + "85": 3588.0, + "86": 3171.0, + "87": 4250.0, + "88": 3592.0, + "89": 3775.0, + "90": 3384.0, + "91": 3074.0, + "92": 3533.0, + "93": 3067.0, + "94": 3730.0, + "95": 3590.0, + "96": 3888.0, + "97": 3580.0, + "98": 4012.0, + "99": 3315.0, + "100": 3454.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 368387072.0, + "2": 368387072.0, + "3": 368387072.0, + "4": 368387072.0, + "5": 368387072.0, + "6": 368387072.0, + "7": 368387072.0, + "8": 368387072.0, + "9": 368387072.0, + "10": 368387072.0, + "11": 368387072.0, + "12": 368387072.0, + "13": 368387072.0, + "14": 368387072.0, + "15": 368387072.0, + "16": 368387072.0, + "17": 368387072.0, + "18": 368387072.0, + "19": 368387072.0, + "20": 368387072.0, + "21": 368387072.0, + "22": 368387072.0, + "23": 368387072.0, + "24": 368387072.0, + "25": 368387072.0, + "26": 368387072.0, + "27": 368387072.0, + "28": 368387072.0, + "29": 368387072.0, + "30": 368387072.0, + "31": 368387072.0, + "32": 368387072.0, + "33": 368387072.0, + "34": 368387072.0, + "35": 368387072.0, + "36": 368387072.0, + "37": 368387072.0, + "38": 368387072.0, + "39": 368387072.0, + "40": 368387072.0, + "41": 368387072.0, + "42": 368387072.0, + "43": 368387072.0, + "44": 368387072.0, + "45": 368387072.0, + "46": 368387072.0, + "47": 368387072.0, + "48": 368387072.0, + "49": 368387072.0, + "50": 368387072.0, + "51": 368387072.0, + "52": 368387072.0, + "53": 368387072.0, + "54": 368387072.0, + "55": 368387072.0, + "56": 368387072.0, + "57": 368387072.0, + "58": 368387072.0, + "59": 368387072.0, + "60": 368387072.0, + "61": 368387072.0, + "62": 368387072.0, + "63": 368387072.0, + "64": 368387072.0, + "65": 368387072.0, + "66": 368387072.0, + "67": 368387072.0, + "68": 368387072.0, + "69": 368387072.0, + "70": 368387072.0, + "71": 368387072.0, + "72": 368387072.0, + "73": 368387072.0, + "74": 368387072.0, + "75": 368387072.0, + "76": 368387072.0, + "77": 368387072.0, + "78": 368387072.0, + "79": 368387072.0, + "80": 368387072.0, + "81": 368387072.0, + "82": 368387072.0, + "83": 368387072.0, + "84": 368387072.0, + "85": 368387072.0, + "86": 368387072.0, + "87": 368387072.0, + "88": 368387072.0, + "89": 368387072.0, + "90": 368387072.0, + "91": 368387072.0, + "92": 368387072.0, + "93": 368387072.0, + "94": 368387072.0, + "95": 368387072.0, + "96": 368387072.0, + "97": 368387072.0, + "98": 368387072.0, + "99": 368387072.0, + "100": 368387072.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1510972416.0, + "2": 1645105152.0, + "3": 1645105152.0, + "4": 1645105152.0, + "5": 1645105152.0, + "6": 1645105152.0, + "7": 1645105152.0, + "8": 1645105152.0, + "9": 1645105152.0, + "10": 1645105152.0, + "11": 1645105152.0, + "12": 1645105152.0, + "13": 1645105152.0, + "14": 1645105152.0, + "15": 1645105152.0, + "16": 1645105152.0, + "17": 1645105152.0, + "18": 1645105152.0, + "19": 1645105152.0, + "20": 1645105152.0, + "21": 1645105152.0, + "22": 1645105152.0, + "23": 1645105152.0, + "24": 1645105152.0, + "25": 1645105152.0, + "26": 1645105152.0, + "27": 1645105152.0, + "28": 1645105152.0, + "29": 1645105152.0, + "30": 1645105152.0, + "31": 1645105152.0, + "32": 1645105152.0, + "33": 1645105152.0, + "34": 1645105152.0, + "35": 1645105152.0, + "36": 1645105152.0, + "37": 1645105152.0, + "38": 1645105152.0, + "39": 1645105152.0, + "40": 1645105152.0, + "41": 1645105152.0, + "42": 1645105152.0, + "43": 1645105152.0, + "44": 1645105152.0, + "45": 1645105152.0, + "46": 1645105152.0, + "47": 1645105152.0, + "48": 1645105152.0, + "49": 1645105152.0, + "50": 1645105152.0, + "51": 1645105152.0, + "52": 1645105152.0, + "53": 1645105152.0, + "54": 1645105152.0, + "55": 1645105152.0, + "56": 1645105152.0, + "57": 1645105152.0, + "58": 1645105152.0, + "59": 1645105152.0, + "60": 1645105152.0, + "61": 1645105152.0, + "62": 1645105152.0, + "63": 1645105152.0, + "64": 1645105152.0, + "65": 1645105152.0, + "66": 1645105152.0, + "67": 1645105152.0, + "68": 1645105152.0, + "69": 1645105152.0, + "70": 1645105152.0, + "71": 1645105152.0, + "72": 1645105152.0, + "73": 1645105152.0, + "74": 1645105152.0, + "75": 1645105152.0, + "76": 1645105152.0, + "77": 1645105152.0, + "78": 1645105152.0, + "79": 1645105152.0, + "80": 1645105152.0, + "81": 1645105152.0, + "82": 1645105152.0, + "83": 1645105152.0, + "84": 1645105152.0, + "85": 1645105152.0, + "86": 1645105152.0, + "87": 1645105152.0, + "88": 1645105152.0, + "89": 1645105152.0, + "90": 1645105152.0, + "91": 1645105152.0, + "92": 1645105152.0, + "93": 1645105152.0, + "94": 1645105152.0, + "95": 1645105152.0, + "96": 1645105152.0, + "97": 1645105152.0, + "98": 1645105152.0, + "99": 1645105152.0, + "100": 1645105152.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.66493, + "2": 0.8291, + "3": 0.43315, + "4": 0.42959, + "5": 0.43827, + "6": 0.4295, + "7": 0.62136, + "8": 0.42601, + "9": 0.43172, + "10": 0.42845, + "11": 0.42549, + "12": 0.43168, + "13": 0.42375, + "14": 0.43487, + "15": 0.423, + "16": 0.43317, + "17": 0.42357, + "18": 0.42563, + "19": 0.42895, + "20": 0.42417, + "21": 0.43668, + "22": 0.42565, + "23": 0.43595, + "24": 0.42585, + "25": 0.42377, + "26": 0.4332, + "27": 0.4241, + "28": 0.43439, + "29": 0.42272, + "30": 0.4344, + "31": 0.42586, + "32": 0.42451, + "33": 0.43418, + "34": 0.42702, + "35": 0.64991, + "36": 0.42577, + "37": 0.42879, + "38": 0.42484, + "39": 0.66025, + "40": 0.42623, + "41": 0.42852, + "42": 0.42402, + "43": 0.42999, + "44": 0.42936, + "45": 0.42525, + "46": 0.43377, + "47": 0.42553, + "48": 0.42913, + "49": 0.42482, + "50": 0.42788, + "51": 0.44478, + "52": 0.4318, + "53": 0.42325, + "54": 0.44021, + "55": 0.42487, + "56": 0.43393, + "57": 0.42758, + "58": 0.43308, + "59": 0.42523, + "60": 0.42483, + "61": 0.43409, + "62": 0.42537, + "63": 0.43014, + "64": 0.42235, + "65": 0.42951, + "66": 0.43017, + "67": 0.42364, + "68": 0.4377, + "69": 0.42513, + "70": 0.4337, + "71": 0.42291, + "72": 0.42699, + "73": 0.43249, + "74": 0.42472, + "75": 0.4344, + "76": 0.4261, + "77": 0.43235, + "78": 0.42569, + "79": 0.42813, + "80": 0.43557, + "81": 0.42479, + "82": 0.43423, + "83": 0.42304, + "84": 0.43758, + "85": 0.42397, + "86": 0.42467, + "87": 0.43641, + "88": 0.42214, + "89": 0.42765, + "90": 0.42554, + "91": 0.44244, + "92": 0.42237, + "93": 0.42384, + "94": 0.44073, + "95": 0.42184, + "96": 0.43075, + "97": 0.42217, + "98": 0.44245, + "99": 0.42259, + "100": 0.42671 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 76960796d04..2dfc5d0f6ae 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.90433, + "2": 10.90931, + "3": 10.90937, + "4": 10.90764, "5": 10.90709, + "6": 10.91174, + "7": 10.91413, + "8": 10.89808, + "9": 10.91252, "10": 10.87838, + "11": 10.90538, + "12": 10.89588, + "13": 10.91234, + "14": 10.90596, "15": 10.86278, + "16": 10.85987, + "17": 10.84211, + "18": 10.83508, + "19": 10.84021, "20": 10.74667, + "21": 10.72431, + "22": 10.6337, + "23": 10.74257, + "24": 10.63399, "25": 10.60185, + "26": 10.64659, + "27": 10.64193, + "28": 10.58695, + "29": 10.59421, "30": 10.394, + "31": 10.17174, + "32": 10.48573, + "33": 10.48042, + "34": 10.25002, "35": 10.29811, + "36": 10.25221, + "37": 10.36635, + "38": 10.22258, + "39": 10.42495, "40": 10.111, + "41": 10.17165, + "42": 10.22384, + "43": 9.86674, + "44": 9.99019, "45": 9.8622, + "46": 9.84813, + "47": 10.16079, + "48": 9.87303, + "49": 9.55987, "50": 9.92159, + "51": 9.8695, + "52": 9.76154, + "53": 10.08349, + "54": 9.97449, "55": 9.89437, + "56": 9.6424, + "57": 9.50352, + "58": 9.84153, + "59": 9.60017, "60": 9.51715, + "61": 9.70458, + "62": 9.98292, + "63": 9.39067, + "64": 9.7797, "65": 8.96053, + "66": 9.70288, + "67": 9.3734, + "68": 9.78805, + "69": 9.79828, "70": 9.74999, + "71": 9.62682, + "72": 9.59043, + "73": 9.49893, + "74": 8.94842, "75": 9.42922, + "76": 9.08268, + "77": 10.07413, + "78": 9.73322, + "79": 9.38352, "80": 9.40713, + "81": 9.48366, + "82": 9.70577, + "83": 9.3103, + "84": 9.41846, "85": 9.62053, + "86": 9.08533, + "87": 9.59962, + "88": 9.75141, + "89": 9.60594, "90": 9.8245, + "91": 9.33973, + "92": 9.36344, + "93": 9.08397, + "94": 8.83571, "95": 9.51936, + "96": 9.53001, + "97": 9.31995, + "98": 9.67709, + "99": 8.88909, "100": 9.40491 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1595.0, + "2": 1632.0, + "3": 1539.0, + "4": 1702.0, "5": 1827.0, + "6": 1718.0, + "7": 1810.0, + "8": 1634.0, + "9": 2007.0, "10": 1457.0, + "11": 1906.0, + "12": 1737.0, + "13": 1917.0, + "14": 1828.0, "15": 1866.0, + "16": 1826.0, + "17": 1762.0, + "18": 1761.0, + "19": 1803.0, "20": 1803.0, + "21": 1996.0, + "22": 1691.0, + "23": 2060.0, + "24": 1622.0, "25": 1595.0, + "26": 1608.0, + "27": 1890.0, + "28": 1913.0, + "29": 1987.0, "30": 1808.0, + "31": 1549.0, + "32": 1838.0, + "33": 2073.0, + "34": 1859.0, "35": 1870.0, + "36": 1870.0, + "37": 2300.0, + "38": 2186.0, + "39": 2368.0, "40": 2097.0, + "41": 2325.0, + "42": 2227.0, + "43": 2036.0, + "44": 2098.0, "45": 2055.0, + "46": 2146.0, + "47": 2453.0, + "48": 2273.0, + "49": 2244.0, "50": 2252.0, + "51": 2484.0, + "52": 2568.0, + "53": 2834.0, + "54": 2607.0, "55": 2149.0, + "56": 2683.0, + "57": 2283.0, + "58": 2764.0, + "59": 2623.0, "60": 2456.0, + "61": 2938.0, + "62": 2456.0, + "63": 2279.0, + "64": 3078.0, "65": 2504.0, + "66": 2881.0, + "67": 2683.0, + "68": 2657.0, + "69": 2832.0, "70": 3144.0, + "71": 2930.0, + "72": 2328.0, + "73": 2984.0, + "74": 1752.0, "75": 2451.0, + "76": 3040.0, + "77": 3213.0, + "78": 2936.0, + "79": 2941.0, "80": 3112.0, + "81": 3568.0, + "82": 3105.0, + "83": 2725.0, + "84": 3051.0, "85": 3170.0, + "86": 2645.0, + "87": 3586.0, + "88": 2902.0, + "89": 3371.0, "90": 2971.0, + "91": 2800.0, + "92": 3017.0, + "93": 2524.0, + "94": 3384.0, "95": 3147.0, + "96": 3388.0, + "97": 3031.0, + "98": 3619.0, + "99": 3004.0, "100": 3100.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 312352256.0, + "2": 312352256.0, + "3": 312352256.0, + "4": 312352256.0, "5": 312352256.0, + "6": 312352256.0, + "7": 312352256.0, + "8": 312352256.0, + "9": 312352256.0, "10": 312352256.0, + "11": 312352256.0, + "12": 312352256.0, + "13": 312352256.0, + "14": 312352256.0, "15": 312352256.0, + "16": 312352256.0, + "17": 312352256.0, + "18": 312352256.0, + "19": 312352256.0, "20": 312352256.0, + "21": 312352256.0, + "22": 312352256.0, + "23": 312352256.0, + "24": 312352256.0, "25": 312352256.0, + "26": 312352256.0, + "27": 312352256.0, + "28": 312352256.0, + "29": 312352256.0, "30": 312352256.0, + "31": 312352256.0, + "32": 312352256.0, + "33": 312352256.0, + "34": 312352256.0, "35": 312352256.0, + "36": 312352256.0, + "37": 312352256.0, + "38": 312352256.0, + "39": 312352256.0, "40": 312352256.0, + "41": 312352256.0, + "42": 312352256.0, + "43": 312352256.0, + "44": 312352256.0, "45": 312352256.0, + "46": 312352256.0, + "47": 312352256.0, + "48": 312352256.0, + "49": 312352256.0, "50": 312352256.0, + "51": 312352256.0, + "52": 312352256.0, + "53": 312352256.0, + "54": 312352256.0, "55": 312352256.0, + "56": 312352256.0, + "57": 312352256.0, + "58": 312352256.0, + "59": 312352256.0, "60": 312352256.0, + "61": 312352256.0, + "62": 312352256.0, + "63": 312352256.0, + "64": 312352256.0, "65": 312352256.0, + "66": 312352256.0, + "67": 312352256.0, + "68": 312352256.0, + "69": 312352256.0, "70": 312352256.0, + "71": 312352256.0, + "72": 312352256.0, + "73": 312352256.0, + "74": 312352256.0, "75": 312352256.0, + "76": 312352256.0, + "77": 312352256.0, + "78": 312352256.0, + "79": 312352256.0, "80": 312352256.0, + "81": 312352256.0, + "82": 312352256.0, + "83": 312352256.0, + "84": 312352256.0, "85": 312352256.0, + "86": 312352256.0, + "87": 312352256.0, + "88": 312352256.0, + "89": 312352256.0, "90": 312352256.0, + "91": 312352256.0, + "92": 312352256.0, + "93": 312352256.0, + "94": 312352256.0, "95": 312352256.0, + "96": 312352256.0, + "97": 312352256.0, + "98": 312352256.0, + "99": 312352256.0, "100": 312352256.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 754434560.0, + "2": 843763200.0, + "3": 843763200.0, + "4": 843763200.0, "5": 843763200.0, + "6": 843763200.0, + "7": 843763200.0, + "8": 843763200.0, + "9": 843763200.0, "10": 843763200.0, + "11": 843763200.0, + "12": 843763200.0, + "13": 843763200.0, + "14": 843763200.0, "15": 843763200.0, + "16": 843763200.0, + "17": 843763200.0, + "18": 843763200.0, + "19": 843763200.0, "20": 843763200.0, + "21": 843763200.0, + "22": 843763200.0, + "23": 843763200.0, + "24": 843763200.0, "25": 843763200.0, + "26": 843763200.0, + "27": 843763200.0, + "28": 843763200.0, + "29": 843763200.0, "30": 843763200.0, + "31": 843763200.0, + "32": 843763200.0, + "33": 843763200.0, + "34": 843763200.0, "35": 843763200.0, + "36": 843763200.0, + "37": 843763200.0, + "38": 843763200.0, + "39": 843763200.0, "40": 843763200.0, + "41": 843763200.0, + "42": 843763200.0, + "43": 843763200.0, + "44": 843763200.0, "45": 843763200.0, + "46": 843763200.0, + "47": 843763200.0, + "48": 843763200.0, + "49": 843763200.0, "50": 843763200.0, + "51": 843763200.0, + "52": 843763200.0, + "53": 843763200.0, + "54": 843763200.0, "55": 843763200.0, + "56": 843763200.0, + "57": 843763200.0, + "58": 843763200.0, + "59": 843763200.0, "60": 843763200.0, + "61": 843763200.0, + "62": 843763200.0, + "63": 843763200.0, + "64": 843763200.0, "65": 843763200.0, + "66": 843763200.0, + "67": 843763200.0, + "68": 843763200.0, + "69": 843763200.0, "70": 843763200.0, + "71": 843763200.0, + "72": 843763200.0, + "73": 843763200.0, + "74": 843763200.0, "75": 843763200.0, + "76": 843763200.0, + "77": 843763200.0, + "78": 843763200.0, + "79": 843763200.0, "80": 843763200.0, + "81": 843763200.0, + "82": 843763200.0, + "83": 843763200.0, + "84": 843763200.0, "85": 843763200.0, + "86": 843763200.0, + "87": 843763200.0, + "88": 843763200.0, + "89": 843763200.0, "90": 843763200.0, + "91": 843763200.0, + "92": 843763200.0, + "93": 843763200.0, + "94": 843763200.0, "95": 843763200.0, + "96": 843763200.0, + "97": 843763200.0, + "98": 843763200.0, + "99": 843763200.0, "100": 843763200.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 11.81829, - "5": 0.2055, - "10": 0.20555, - "15": 0.20599, - "20": 0.2077, - "25": 0.20625, - "30": 0.20513, - "35": 0.21379, - "40": 0.19974, - "45": 0.20183, - "50": 0.1983, - "55": 0.20325, - "60": 0.19919, - "65": 0.19434, - "70": 0.19633, - "75": 0.19415, - "80": 0.19631, - "85": 0.19412, - "90": 0.20079, - "95": 0.20108, - "100": 0.20109 + "1": 13.88965, + "2": 0.27451, + "3": 0.24975, + "4": 0.25072, + "5": 0.2432, + "6": 0.24332, + "7": 0.23789, + "8": 0.23936, + "9": 0.23192, + "10": 0.22503, + "11": 0.22584, + "12": 0.22831, + "13": 0.22937, + "14": 0.22514, + "15": 0.22707, + "16": 0.22601, + "17": 0.22754, + "18": 0.22863, + "19": 0.22776, + "20": 0.2264, + "21": 0.22812, + "22": 0.23837, + "23": 0.25872, + "24": 0.23186, + "25": 0.22533, + "26": 0.22641, + "27": 0.22648, + "28": 0.22569, + "29": 0.22721, + "30": 0.22446, + "31": 0.2299, + "32": 0.22776, + "33": 0.22874, + "34": 0.22685, + "35": 0.22809, + "36": 0.23141, + "37": 0.22676, + "38": 0.22629, + "39": 0.22929, + "40": 0.23118, + "41": 0.22744, + "42": 0.22706, + "43": 0.23097, + "44": 0.22844, + "45": 0.22948, + "46": 0.22632, + "47": 0.22989, + "48": 0.22849, + "49": 0.23116, + "50": 0.23165, + "51": 0.25535, + "52": 0.27151, + "53": 0.23628, + "54": 0.23553, + "55": 0.23112, + "56": 0.23386, + "57": 0.2314, + "58": 0.23297, + "59": 0.22916, + "60": 0.22848, + "61": 0.23048, + "62": 0.22881, + "63": 0.23036, + "64": 0.2284, + "65": 0.23027, + "66": 0.22734, + "67": 0.23011, + "68": 0.22993, + "69": 0.22771, + "70": 0.23247, + "71": 0.22785, + "72": 0.22934, + "73": 0.22755, + "74": 0.22901, + "75": 0.22825, + "76": 0.22722, + "77": 0.22986, + "78": 0.22763, + "79": 0.22994, + "80": 0.22933, + "81": 0.2282, + "82": 0.22957, + "83": 0.22817, + "84": 0.22948, + "85": 0.2273, + "86": 0.22834, + "87": 0.23316, + "88": 0.22928, + "89": 0.22663, + "90": 0.23145, + "91": 0.22771, + "92": 0.22915, + "93": 0.22882, + "94": 0.22769, + "95": 0.22918, + "96": 0.23296, + "97": 0.22901, + "98": 0.23028, + "99": 0.23035, + "100": 0.23349 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..ff73ed22db1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.90433, + "2": 10.90931, + "3": 10.90937, + "4": 10.90764, + "5": 10.90709, + "6": 10.91174, + "7": 10.91413, + "8": 10.89808, + "9": 10.91252, + "10": 10.87838, + "11": 10.90538, + "12": 10.89588, + "13": 10.91234, + "14": 10.90596, + "15": 10.86278, + "16": 10.85987, + "17": 10.84211, + "18": 10.83508, + "19": 10.84021, + "20": 10.74667, + "21": 10.72431, + "22": 10.6337, + "23": 10.74257, + "24": 10.63399, + "25": 10.60185, + "26": 10.64659, + "27": 10.64193, + "28": 10.58695, + "29": 10.59421, + "30": 10.394, + "31": 10.17174, + "32": 10.48573, + "33": 10.48042, + "34": 10.25002, + "35": 10.29811, + "36": 10.25221, + "37": 10.36635, + "38": 10.22258, + "39": 10.42495, + "40": 10.111, + "41": 10.17165, + "42": 10.22384, + "43": 9.86674, + "44": 9.99019, + "45": 9.8622, + "46": 9.84813, + "47": 10.16079, + "48": 9.87303, + "49": 9.55987, + "50": 9.92159, + "51": 9.8695, + "52": 9.76154, + "53": 10.08349, + "54": 9.97449, + "55": 9.89437, + "56": 9.6424, + "57": 9.50352, + "58": 9.84153, + "59": 9.60017, + "60": 9.51715, + "61": 9.70458, + "62": 9.98292, + "63": 9.39067, + "64": 9.7797, + "65": 8.96053, + "66": 9.70288, + "67": 9.3734, + "68": 9.78805, + "69": 9.79828, + "70": 9.74999, + "71": 9.62682, + "72": 9.59043, + "73": 9.49893, + "74": 8.94842, + "75": 9.42922, + "76": 9.08268, + "77": 10.07413, + "78": 9.73322, + "79": 9.38352, + "80": 9.40713, + "81": 9.48366, + "82": 9.70577, + "83": 9.3103, + "84": 9.41846, + "85": 9.62053, + "86": 9.08533, + "87": 9.59962, + "88": 9.75141, + "89": 9.60594, + "90": 9.8245, + "91": 9.33973, + "92": 9.36344, + "93": 9.08397, + "94": 8.83571, + "95": 9.51936, + "96": 9.53001, + "97": 9.31995, + "98": 9.67709, + "99": 8.88909, + "100": 9.40491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1595.0, + "2": 1632.0, + "3": 1539.0, + "4": 1702.0, + "5": 1827.0, + "6": 1718.0, + "7": 1810.0, + "8": 1634.0, + "9": 2007.0, + "10": 1457.0, + "11": 1906.0, + "12": 1737.0, + "13": 1917.0, + "14": 1828.0, + "15": 1866.0, + "16": 1826.0, + "17": 1762.0, + "18": 1761.0, + "19": 1803.0, + "20": 1803.0, + "21": 1996.0, + "22": 1691.0, + "23": 2060.0, + "24": 1622.0, + "25": 1595.0, + "26": 1608.0, + "27": 1890.0, + "28": 1913.0, + "29": 1987.0, + "30": 1808.0, + "31": 1549.0, + "32": 1838.0, + "33": 2073.0, + "34": 1859.0, + "35": 1870.0, + "36": 1870.0, + "37": 2300.0, + "38": 2186.0, + "39": 2368.0, + "40": 2097.0, + "41": 2325.0, + "42": 2227.0, + "43": 2036.0, + "44": 2098.0, + "45": 2055.0, + "46": 2146.0, + "47": 2453.0, + "48": 2273.0, + "49": 2244.0, + "50": 2252.0, + "51": 2484.0, + "52": 2568.0, + "53": 2834.0, + "54": 2607.0, + "55": 2149.0, + "56": 2683.0, + "57": 2283.0, + "58": 2764.0, + "59": 2623.0, + "60": 2456.0, + "61": 2938.0, + "62": 2456.0, + "63": 2279.0, + "64": 3078.0, + "65": 2504.0, + "66": 2881.0, + "67": 2683.0, + "68": 2657.0, + "69": 2832.0, + "70": 3144.0, + "71": 2930.0, + "72": 2328.0, + "73": 2984.0, + "74": 1752.0, + "75": 2451.0, + "76": 3040.0, + "77": 3213.0, + "78": 2936.0, + "79": 2941.0, + "80": 3112.0, + "81": 3568.0, + "82": 3105.0, + "83": 2725.0, + "84": 3051.0, + "85": 3170.0, + "86": 2645.0, + "87": 3586.0, + "88": 2902.0, + "89": 3371.0, + "90": 2971.0, + "91": 2800.0, + "92": 3017.0, + "93": 2524.0, + "94": 3384.0, + "95": 3147.0, + "96": 3388.0, + "97": 3031.0, + "98": 3619.0, + "99": 3004.0, + "100": 3100.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 312352256.0, + "2": 312352256.0, + "3": 312352256.0, + "4": 312352256.0, + "5": 312352256.0, + "6": 312352256.0, + "7": 312352256.0, + "8": 312352256.0, + "9": 312352256.0, + "10": 312352256.0, + "11": 312352256.0, + "12": 312352256.0, + "13": 312352256.0, + "14": 312352256.0, + "15": 312352256.0, + "16": 312352256.0, + "17": 312352256.0, + "18": 312352256.0, + "19": 312352256.0, + "20": 312352256.0, + "21": 312352256.0, + "22": 312352256.0, + "23": 312352256.0, + "24": 312352256.0, + "25": 312352256.0, + "26": 312352256.0, + "27": 312352256.0, + "28": 312352256.0, + "29": 312352256.0, + "30": 312352256.0, + "31": 312352256.0, + "32": 312352256.0, + "33": 312352256.0, + "34": 312352256.0, + "35": 312352256.0, + "36": 312352256.0, + "37": 312352256.0, + "38": 312352256.0, + "39": 312352256.0, + "40": 312352256.0, + "41": 312352256.0, + "42": 312352256.0, + "43": 312352256.0, + "44": 312352256.0, + "45": 312352256.0, + "46": 312352256.0, + "47": 312352256.0, + "48": 312352256.0, + "49": 312352256.0, + "50": 312352256.0, + "51": 312352256.0, + "52": 312352256.0, + "53": 312352256.0, + "54": 312352256.0, + "55": 312352256.0, + "56": 312352256.0, + "57": 312352256.0, + "58": 312352256.0, + "59": 312352256.0, + "60": 312352256.0, + "61": 312352256.0, + "62": 312352256.0, + "63": 312352256.0, + "64": 312352256.0, + "65": 312352256.0, + "66": 312352256.0, + "67": 312352256.0, + "68": 312352256.0, + "69": 312352256.0, + "70": 312352256.0, + "71": 312352256.0, + "72": 312352256.0, + "73": 312352256.0, + "74": 312352256.0, + "75": 312352256.0, + "76": 312352256.0, + "77": 312352256.0, + "78": 312352256.0, + "79": 312352256.0, + "80": 312352256.0, + "81": 312352256.0, + "82": 312352256.0, + "83": 312352256.0, + "84": 312352256.0, + "85": 312352256.0, + "86": 312352256.0, + "87": 312352256.0, + "88": 312352256.0, + "89": 312352256.0, + "90": 312352256.0, + "91": 312352256.0, + "92": 312352256.0, + "93": 312352256.0, + "94": 312352256.0, + "95": 312352256.0, + "96": 312352256.0, + "97": 312352256.0, + "98": 312352256.0, + "99": 312352256.0, + "100": 312352256.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 754434560.0, + "2": 843763200.0, + "3": 843763200.0, + "4": 843763200.0, + "5": 843763200.0, + "6": 843763200.0, + "7": 843763200.0, + "8": 843763200.0, + "9": 843763200.0, + "10": 843763200.0, + "11": 843763200.0, + "12": 843763200.0, + "13": 843763200.0, + "14": 843763200.0, + "15": 843763200.0, + "16": 843763200.0, + "17": 843763200.0, + "18": 843763200.0, + "19": 843763200.0, + "20": 843763200.0, + "21": 843763200.0, + "22": 843763200.0, + "23": 843763200.0, + "24": 843763200.0, + "25": 843763200.0, + "26": 843763200.0, + "27": 843763200.0, + "28": 843763200.0, + "29": 843763200.0, + "30": 843763200.0, + "31": 843763200.0, + "32": 843763200.0, + "33": 843763200.0, + "34": 843763200.0, + "35": 843763200.0, + "36": 843763200.0, + "37": 843763200.0, + "38": 843763200.0, + "39": 843763200.0, + "40": 843763200.0, + "41": 843763200.0, + "42": 843763200.0, + "43": 843763200.0, + "44": 843763200.0, + "45": 843763200.0, + "46": 843763200.0, + "47": 843763200.0, + "48": 843763200.0, + "49": 843763200.0, + "50": 843763200.0, + "51": 843763200.0, + "52": 843763200.0, + "53": 843763200.0, + "54": 843763200.0, + "55": 843763200.0, + "56": 843763200.0, + "57": 843763200.0, + "58": 843763200.0, + "59": 843763200.0, + "60": 843763200.0, + "61": 843763200.0, + "62": 843763200.0, + "63": 843763200.0, + "64": 843763200.0, + "65": 843763200.0, + "66": 843763200.0, + "67": 843763200.0, + "68": 843763200.0, + "69": 843763200.0, + "70": 843763200.0, + "71": 843763200.0, + "72": 843763200.0, + "73": 843763200.0, + "74": 843763200.0, + "75": 843763200.0, + "76": 843763200.0, + "77": 843763200.0, + "78": 843763200.0, + "79": 843763200.0, + "80": 843763200.0, + "81": 843763200.0, + "82": 843763200.0, + "83": 843763200.0, + "84": 843763200.0, + "85": 843763200.0, + "86": 843763200.0, + "87": 843763200.0, + "88": 843763200.0, + "89": 843763200.0, + "90": 843763200.0, + "91": 843763200.0, + "92": 843763200.0, + "93": 843763200.0, + "94": 843763200.0, + "95": 843763200.0, + "96": 843763200.0, + "97": 843763200.0, + "98": 843763200.0, + "99": 843763200.0, + "100": 843763200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.22764, + "2": 0.24357, + "3": 0.1983, + "4": 0.19798, + "5": 0.19753, + "6": 0.19867, + "7": 0.2023, + "8": 0.20916, + "9": 0.19896, + "10": 0.19379, + "11": 0.19485, + "12": 0.19576, + "13": 0.19787, + "14": 0.19429, + "15": 0.19302, + "16": 0.19471, + "17": 0.19504, + "18": 0.19198, + "19": 0.19495, + "20": 0.19263, + "21": 0.19416, + "22": 0.19641, + "23": 0.19469, + "24": 0.1929, + "25": 0.19216, + "26": 0.19363, + "27": 0.19398, + "28": 0.20085, + "29": 0.19636, + "30": 0.19368, + "31": 0.19607, + "32": 0.19525, + "33": 0.19664, + "34": 0.19678, + "35": 0.19781, + "36": 0.19903, + "37": 0.19855, + "38": 0.19741, + "39": 0.19904, + "40": 0.1946, + "41": 0.19866, + "42": 0.19875, + "43": 0.19854, + "44": 0.19999, + "45": 0.19615, + "46": 0.19571, + "47": 0.20067, + "48": 0.20086, + "49": 0.199, + "50": 0.20278, + "51": 0.22281, + "52": 0.23219, + "53": 0.1956, + "54": 0.20104, + "55": 0.19383, + "56": 0.19622, + "57": 0.1958, + "58": 0.19611, + "59": 0.20122, + "60": 0.19838, + "61": 0.19728, + "62": 0.19768, + "63": 0.19649, + "64": 0.19849, + "65": 0.19729, + "66": 0.20239, + "67": 0.1983, + "68": 0.19972, + "69": 0.19875, + "70": 0.19826, + "71": 0.199, + "72": 0.20079, + "73": 0.19629, + "74": 0.19463, + "75": 0.19309, + "76": 0.19531, + "77": 0.19866, + "78": 0.19554, + "79": 0.19894, + "80": 0.19644, + "81": 0.19444, + "82": 0.1982, + "83": 0.19564, + "84": 0.19462, + "85": 0.19336, + "86": 0.19393, + "87": 0.19166, + "88": 0.19067, + "89": 0.19389, + "90": 0.19317, + "91": 0.19001, + "92": 0.19028, + "93": 0.19093, + "94": 0.19224, + "95": 0.19066, + "96": 0.19224, + "97": 0.18966, + "98": 0.19044, + "99": 0.19273, + "100": 0.20509 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..5c404dad658 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.90433, + "2": 10.90931, + "3": 10.90937, + "4": 10.90764, + "5": 10.90709, + "6": 10.91174, + "7": 10.91413, + "8": 10.89808, + "9": 10.91252, + "10": 10.87838, + "11": 10.90538, + "12": 10.89588, + "13": 10.91234, + "14": 10.90596, + "15": 10.86278, + "16": 10.85987, + "17": 10.84211, + "18": 10.83508, + "19": 10.84021, + "20": 10.74667, + "21": 10.72431, + "22": 10.6337, + "23": 10.74257, + "24": 10.63399, + "25": 10.60185, + "26": 10.64659, + "27": 10.64193, + "28": 10.58695, + "29": 10.59421, + "30": 10.394, + "31": 10.17174, + "32": 10.48573, + "33": 10.48042, + "34": 10.25002, + "35": 10.29811, + "36": 10.25221, + "37": 10.36635, + "38": 10.22258, + "39": 10.42495, + "40": 10.111, + "41": 10.17165, + "42": 10.22384, + "43": 9.86674, + "44": 9.99019, + "45": 9.8622, + "46": 9.84813, + "47": 10.16079, + "48": 9.87303, + "49": 9.55987, + "50": 9.92159, + "51": 9.8695, + "52": 9.76154, + "53": 10.08349, + "54": 9.97449, + "55": 9.89437, + "56": 9.6424, + "57": 9.50352, + "58": 9.84153, + "59": 9.60017, + "60": 9.51715, + "61": 9.70458, + "62": 9.98292, + "63": 9.39067, + "64": 9.7797, + "65": 8.96053, + "66": 9.70288, + "67": 9.3734, + "68": 9.78805, + "69": 9.79828, + "70": 9.74999, + "71": 9.62682, + "72": 9.59043, + "73": 9.49893, + "74": 8.94842, + "75": 9.42922, + "76": 9.08268, + "77": 10.07413, + "78": 9.73322, + "79": 9.38352, + "80": 9.40713, + "81": 9.48366, + "82": 9.70577, + "83": 9.3103, + "84": 9.41846, + "85": 9.62053, + "86": 9.08533, + "87": 9.59962, + "88": 9.75141, + "89": 9.60594, + "90": 9.8245, + "91": 9.33973, + "92": 9.36344, + "93": 9.08397, + "94": 8.83571, + "95": 9.51936, + "96": 9.53001, + "97": 9.31995, + "98": 9.67709, + "99": 8.88909, + "100": 9.40491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1595.0, + "2": 1632.0, + "3": 1539.0, + "4": 1702.0, + "5": 1827.0, + "6": 1718.0, + "7": 1810.0, + "8": 1634.0, + "9": 2007.0, + "10": 1457.0, + "11": 1906.0, + "12": 1737.0, + "13": 1917.0, + "14": 1828.0, + "15": 1866.0, + "16": 1826.0, + "17": 1762.0, + "18": 1761.0, + "19": 1803.0, + "20": 1803.0, + "21": 1996.0, + "22": 1691.0, + "23": 2060.0, + "24": 1622.0, + "25": 1595.0, + "26": 1608.0, + "27": 1890.0, + "28": 1913.0, + "29": 1987.0, + "30": 1808.0, + "31": 1549.0, + "32": 1838.0, + "33": 2073.0, + "34": 1859.0, + "35": 1870.0, + "36": 1870.0, + "37": 2300.0, + "38": 2186.0, + "39": 2368.0, + "40": 2097.0, + "41": 2325.0, + "42": 2227.0, + "43": 2036.0, + "44": 2098.0, + "45": 2055.0, + "46": 2146.0, + "47": 2453.0, + "48": 2273.0, + "49": 2244.0, + "50": 2252.0, + "51": 2484.0, + "52": 2568.0, + "53": 2834.0, + "54": 2607.0, + "55": 2149.0, + "56": 2683.0, + "57": 2283.0, + "58": 2764.0, + "59": 2623.0, + "60": 2456.0, + "61": 2938.0, + "62": 2456.0, + "63": 2279.0, + "64": 3078.0, + "65": 2504.0, + "66": 2881.0, + "67": 2683.0, + "68": 2657.0, + "69": 2832.0, + "70": 3144.0, + "71": 2930.0, + "72": 2328.0, + "73": 2984.0, + "74": 1752.0, + "75": 2451.0, + "76": 3040.0, + "77": 3213.0, + "78": 2936.0, + "79": 2941.0, + "80": 3112.0, + "81": 3568.0, + "82": 3105.0, + "83": 2725.0, + "84": 3051.0, + "85": 3170.0, + "86": 2645.0, + "87": 3586.0, + "88": 2902.0, + "89": 3371.0, + "90": 2971.0, + "91": 2800.0, + "92": 3017.0, + "93": 2524.0, + "94": 3384.0, + "95": 3147.0, + "96": 3388.0, + "97": 3031.0, + "98": 3619.0, + "99": 3004.0, + "100": 3100.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 312352256.0, + "2": 312352256.0, + "3": 312352256.0, + "4": 312352256.0, + "5": 312352256.0, + "6": 312352256.0, + "7": 312352256.0, + "8": 312352256.0, + "9": 312352256.0, + "10": 312352256.0, + "11": 312352256.0, + "12": 312352256.0, + "13": 312352256.0, + "14": 312352256.0, + "15": 312352256.0, + "16": 312352256.0, + "17": 312352256.0, + "18": 312352256.0, + "19": 312352256.0, + "20": 312352256.0, + "21": 312352256.0, + "22": 312352256.0, + "23": 312352256.0, + "24": 312352256.0, + "25": 312352256.0, + "26": 312352256.0, + "27": 312352256.0, + "28": 312352256.0, + "29": 312352256.0, + "30": 312352256.0, + "31": 312352256.0, + "32": 312352256.0, + "33": 312352256.0, + "34": 312352256.0, + "35": 312352256.0, + "36": 312352256.0, + "37": 312352256.0, + "38": 312352256.0, + "39": 312352256.0, + "40": 312352256.0, + "41": 312352256.0, + "42": 312352256.0, + "43": 312352256.0, + "44": 312352256.0, + "45": 312352256.0, + "46": 312352256.0, + "47": 312352256.0, + "48": 312352256.0, + "49": 312352256.0, + "50": 312352256.0, + "51": 312352256.0, + "52": 312352256.0, + "53": 312352256.0, + "54": 312352256.0, + "55": 312352256.0, + "56": 312352256.0, + "57": 312352256.0, + "58": 312352256.0, + "59": 312352256.0, + "60": 312352256.0, + "61": 312352256.0, + "62": 312352256.0, + "63": 312352256.0, + "64": 312352256.0, + "65": 312352256.0, + "66": 312352256.0, + "67": 312352256.0, + "68": 312352256.0, + "69": 312352256.0, + "70": 312352256.0, + "71": 312352256.0, + "72": 312352256.0, + "73": 312352256.0, + "74": 312352256.0, + "75": 312352256.0, + "76": 312352256.0, + "77": 312352256.0, + "78": 312352256.0, + "79": 312352256.0, + "80": 312352256.0, + "81": 312352256.0, + "82": 312352256.0, + "83": 312352256.0, + "84": 312352256.0, + "85": 312352256.0, + "86": 312352256.0, + "87": 312352256.0, + "88": 312352256.0, + "89": 312352256.0, + "90": 312352256.0, + "91": 312352256.0, + "92": 312352256.0, + "93": 312352256.0, + "94": 312352256.0, + "95": 312352256.0, + "96": 312352256.0, + "97": 312352256.0, + "98": 312352256.0, + "99": 312352256.0, + "100": 312352256.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 754434560.0, + "2": 843763200.0, + "3": 843763200.0, + "4": 843763200.0, + "5": 843763200.0, + "6": 843763200.0, + "7": 843763200.0, + "8": 843763200.0, + "9": 843763200.0, + "10": 843763200.0, + "11": 843763200.0, + "12": 843763200.0, + "13": 843763200.0, + "14": 843763200.0, + "15": 843763200.0, + "16": 843763200.0, + "17": 843763200.0, + "18": 843763200.0, + "19": 843763200.0, + "20": 843763200.0, + "21": 843763200.0, + "22": 843763200.0, + "23": 843763200.0, + "24": 843763200.0, + "25": 843763200.0, + "26": 843763200.0, + "27": 843763200.0, + "28": 843763200.0, + "29": 843763200.0, + "30": 843763200.0, + "31": 843763200.0, + "32": 843763200.0, + "33": 843763200.0, + "34": 843763200.0, + "35": 843763200.0, + "36": 843763200.0, + "37": 843763200.0, + "38": 843763200.0, + "39": 843763200.0, + "40": 843763200.0, + "41": 843763200.0, + "42": 843763200.0, + "43": 843763200.0, + "44": 843763200.0, + "45": 843763200.0, + "46": 843763200.0, + "47": 843763200.0, + "48": 843763200.0, + "49": 843763200.0, + "50": 843763200.0, + "51": 843763200.0, + "52": 843763200.0, + "53": 843763200.0, + "54": 843763200.0, + "55": 843763200.0, + "56": 843763200.0, + "57": 843763200.0, + "58": 843763200.0, + "59": 843763200.0, + "60": 843763200.0, + "61": 843763200.0, + "62": 843763200.0, + "63": 843763200.0, + "64": 843763200.0, + "65": 843763200.0, + "66": 843763200.0, + "67": 843763200.0, + "68": 843763200.0, + "69": 843763200.0, + "70": 843763200.0, + "71": 843763200.0, + "72": 843763200.0, + "73": 843763200.0, + "74": 843763200.0, + "75": 843763200.0, + "76": 843763200.0, + "77": 843763200.0, + "78": 843763200.0, + "79": 843763200.0, + "80": 843763200.0, + "81": 843763200.0, + "82": 843763200.0, + "83": 843763200.0, + "84": 843763200.0, + "85": 843763200.0, + "86": 843763200.0, + "87": 843763200.0, + "88": 843763200.0, + "89": 843763200.0, + "90": 843763200.0, + "91": 843763200.0, + "92": 843763200.0, + "93": 843763200.0, + "94": 843763200.0, + "95": 843763200.0, + "96": 843763200.0, + "97": 843763200.0, + "98": 843763200.0, + "99": 843763200.0, + "100": 843763200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.61637, + "2": 0.24414, + "3": 0.22872, + "4": 0.22599, + "5": 0.22586, + "6": 0.22773, + "7": 0.22791, + "8": 0.22857, + "9": 0.2283, + "10": 0.22732, + "11": 0.22633, + "12": 0.22761, + "13": 0.22748, + "14": 0.23094, + "15": 0.22968, + "16": 0.22849, + "17": 0.22934, + "18": 0.22814, + "19": 0.22822, + "20": 0.22758, + "21": 0.22806, + "22": 0.25737, + "23": 0.24238, + "24": 0.23166, + "25": 0.22695, + "26": 0.22857, + "27": 0.23442, + "28": 0.22861, + "29": 0.2302, + "30": 0.2316, + "31": 0.23014, + "32": 0.22948, + "33": 0.23272, + "34": 0.23222, + "35": 0.23035, + "36": 0.23384, + "37": 0.23085, + "38": 0.23058, + "39": 0.23686, + "40": 0.23939, + "41": 0.23562, + "42": 0.23544, + "43": 0.23293, + "44": 0.22874, + "45": 0.234, + "46": 0.22942, + "47": 0.23036, + "48": 0.23404, + "49": 0.2686, + "50": 0.24831, + "51": 0.28415, + "52": 0.23699, + "53": 0.26129, + "54": 0.2273, + "55": 0.22639, + "56": 0.22691, + "57": 0.22504, + "58": 0.22822, + "59": 0.22913, + "60": 0.22577, + "61": 0.23097, + "62": 0.22702, + "63": 0.22579, + "64": 0.22717, + "65": 0.22986, + "66": 0.22481, + "67": 0.22676, + "68": 0.22643, + "69": 0.22933, + "70": 0.23566, + "71": 0.22795, + "72": 0.22654, + "73": 0.2256, + "74": 0.22941, + "75": 0.23701, + "76": 0.23527, + "77": 0.23476, + "78": 0.23472, + "79": 0.22599, + "80": 0.22758, + "81": 0.22717, + "82": 0.22657, + "83": 0.22688, + "84": 0.22827, + "85": 0.22612, + "86": 0.22871, + "87": 0.23133, + "88": 0.22934, + "89": 0.22859, + "90": 0.22635, + "91": 0.22606, + "92": 0.2297, + "93": 0.22713, + "94": 0.2261, + "95": 0.227, + "96": 0.23135, + "97": 0.22866, + "98": 0.22601, + "99": 0.2277, + "100": 0.2323 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json index 5d2d76e675b..cac9c570ec1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json @@ -1 +1,537 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8583, "5": 10.87283, "10": 10.83266, "15": 10.82103, "20": 10.71378, "25": 10.54764, "30": 10.36787, "35": 10.28458, "40": 10.08925, "45": 9.84558, "50": 9.91941, "55": 9.89198, "60": 9.50822, "65": 8.95947, "70": 9.73442, "75": 9.43116, "80": 9.41096, "85": 9.61514, "90": 9.82374, "95": 9.52259, "100": 9.40801}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1691.0, "5": 2042.0, "10": 1630.0, "15": 2001.0, "20": 1728.0, "25": 1763.0, "30": 2006.0, "35": 2193.0, "40": 2383.0, "45": 2296.0, "50": 2855.0, "55": 2533.0, "60": 2704.0, "65": 2913.0, "70": 3455.0, "75": 2863.0, "80": 3626.0, "85": 3507.0, "90": 3276.0, "95": 3746.0, "100": 3624.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 232422400.0, "5": 232422400.0, "10": 232422400.0, "15": 232422400.0, "20": 232422400.0, "25": 232422400.0, "30": 232422400.0, "35": 232422400.0, "40": 232422400.0, "45": 232422400.0, "50": 232422400.0, "55": 232422400.0, "60": 232422400.0, "65": 232422400.0, "70": 232422400.0, "75": 232422400.0, "80": 232422400.0, "85": 232422400.0, "90": 232422400.0, "95": 232422400.0, "100": 232422400.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 684471808.0, "5": 773274112.0, "10": 775372800.0, "15": 775372800.0, "20": 775372800.0, "25": 775372800.0, "30": 775372800.0, "35": 775372800.0, "40": 775372800.0, "45": 775372800.0, "50": 775372800.0, "55": 775372800.0, "60": 775373312.0, "65": 775373312.0, "70": 775373312.0, "75": 775373312.0, "80": 775373312.0, "85": 775373312.0, "90": 775373312.0, "95": 775373312.0, "100": 775373312.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 15.38884, "5": 0.30271, "10": 0.29872, "15": 0.29913, "20": 0.29673, "25": 0.29722, "30": 0.29513, "35": 0.29581, "40": 0.29346, "45": 0.31009, "50": 0.30584, "55": 0.30586, "60": 0.30392, "65": 0.29478, "70": 0.29561, "75": 0.2972, "80": 0.29542, "85": 0.29898, "90": 0.29519, "95": 0.29733, "100": 0.2954}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8583, + "2": 10.85411, + "3": 10.8543, + "4": 10.84407, + "5": 10.87282, + "6": 10.8793, + "7": 10.84658, + "8": 10.86139, + "9": 10.87078, + "10": 10.83266, + "11": 10.86332, + "12": 10.87295, + "13": 10.87798, + "14": 10.88588, + "15": 10.82104, + "16": 10.82759, + "17": 10.80303, + "18": 10.82092, + "19": 10.80032, + "20": 10.71379, + "21": 10.69818, + "22": 10.57542, + "23": 10.72119, + "24": 10.60091, + "25": 10.5476, + "26": 10.61127, + "27": 10.61393, + "28": 10.57777, + "29": 10.57888, + "30": 10.36791, + "31": 10.13451, + "32": 10.47063, + "33": 10.47371, + "34": 10.23442, + "35": 10.28457, + "36": 10.23595, + "37": 10.35351, + "38": 10.20695, + "39": 10.40581, + "40": 10.08924, + "41": 10.16388, + "42": 10.22671, + "43": 9.86336, + "44": 9.98189, + "45": 9.84555, + "46": 9.85753, + "47": 10.16884, + "48": 9.86474, + "49": 9.54712, + "50": 9.91942, + "51": 9.86179, + "52": 9.76162, + "53": 10.08383, + "54": 9.96743, + "55": 9.89199, + "56": 9.63777, + "57": 9.49339, + "58": 9.83897, + "59": 9.59641, + "60": 9.50823, + "61": 9.70513, + "62": 9.99499, + "63": 9.38054, + "64": 9.78296, + "65": 8.95946, + "66": 9.71045, + "67": 9.38075, + "68": 9.78884, + "69": 9.79451, + "70": 9.73441, + "71": 9.62146, + "72": 9.58792, + "73": 9.49657, + "74": 8.9434, + "75": 9.43112, + "76": 9.09716, + "77": 10.0681, + "78": 9.73005, + "79": 9.37764, + "80": 9.41097, + "81": 9.48622, + "82": 9.69669, + "83": 9.3163, + "84": 9.42182, + "85": 9.61516, + "86": 9.07553, + "87": 9.59851, + "88": 9.75046, + "89": 9.61112, + "90": 9.82373, + "91": 9.35278, + "92": 9.36495, + "93": 9.08811, + "94": 8.83656, + "95": 9.52256, + "96": 9.52793, + "97": 9.31634, + "98": 9.67876, + "99": 8.89321, + "100": 9.40801 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1708.0, + "2": 1804.0, + "3": 1725.0, + "4": 1881.0, + "5": 2019.0, + "6": 2015.0, + "7": 2086.0, + "8": 1730.0, + "9": 2024.0, + "10": 1515.0, + "11": 2162.0, + "12": 1847.0, + "13": 2125.0, + "14": 2050.0, + "15": 1946.0, + "16": 2000.0, + "17": 1996.0, + "18": 1874.0, + "19": 2011.0, + "20": 1771.0, + "21": 2099.0, + "22": 1892.0, + "23": 2171.0, + "24": 1834.0, + "25": 1790.0, + "26": 1803.0, + "27": 1998.0, + "28": 2211.0, + "29": 2129.0, + "30": 2147.0, + "31": 1623.0, + "32": 2174.0, + "33": 2364.0, + "34": 2035.0, + "35": 2089.0, + "36": 2202.0, + "37": 2603.0, + "38": 2468.0, + "39": 2623.0, + "40": 2383.0, + "41": 2519.0, + "42": 2522.0, + "43": 2235.0, + "44": 2275.0, + "45": 2319.0, + "46": 2632.0, + "47": 2675.0, + "48": 2697.0, + "49": 2551.0, + "50": 2814.0, + "51": 2767.0, + "52": 2804.0, + "53": 3231.0, + "54": 2905.0, + "55": 2575.0, + "56": 3077.0, + "57": 2587.0, + "58": 3346.0, + "59": 3056.0, + "60": 2695.0, + "61": 3191.0, + "62": 2637.0, + "63": 2649.0, + "64": 3176.0, + "65": 2756.0, + "66": 3481.0, + "67": 2905.0, + "68": 3114.0, + "69": 3133.0, + "70": 3533.0, + "71": 3225.0, + "72": 2621.0, + "73": 3297.0, + "74": 2145.0, + "75": 2799.0, + "76": 3354.0, + "77": 3466.0, + "78": 3485.0, + "79": 3464.0, + "80": 3614.0, + "81": 4011.0, + "82": 3694.0, + "83": 3201.0, + "84": 3655.0, + "85": 3597.0, + "86": 3096.0, + "87": 4103.0, + "88": 3306.0, + "89": 3839.0, + "90": 3352.0, + "91": 2980.0, + "92": 3452.0, + "93": 2967.0, + "94": 3773.0, + "95": 3589.0, + "96": 3800.0, + "97": 3412.0, + "98": 3998.0, + "99": 3483.0, + "100": 3651.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 232422400.0, + "2": 232422400.0, + "3": 232422400.0, + "4": 232422400.0, + "5": 232422400.0, + "6": 233470976.0, + "7": 232422400.0, + "8": 233470976.0, + "9": 232422400.0, + "10": 232422400.0, + "11": 232422400.0, + "12": 232422400.0, + "13": 232422400.0, + "14": 233470976.0, + "15": 232422400.0, + "16": 232422400.0, + "17": 232422400.0, + "18": 232422400.0, + "19": 232422400.0, + "20": 232422400.0, + "21": 232422400.0, + "22": 232422400.0, + "23": 232422400.0, + "24": 232422400.0, + "25": 232422400.0, + "26": 232422400.0, + "27": 232422400.0, + "28": 232422400.0, + "29": 232422400.0, + "30": 232422400.0, + "31": 232422400.0, + "32": 232422400.0, + "33": 232422400.0, + "34": 232422400.0, + "35": 232422400.0, + "36": 232422400.0, + "37": 232422400.0, + "38": 232422400.0, + "39": 232422400.0, + "40": 232422400.0, + "41": 232422400.0, + "42": 232422400.0, + "43": 232422400.0, + "44": 232422400.0, + "45": 232422400.0, + "46": 232422400.0, + "47": 232422400.0, + "48": 232422400.0, + "49": 233470976.0, + "50": 232422400.0, + "51": 232422400.0, + "52": 232422400.0, + "53": 232422400.0, + "54": 232422400.0, + "55": 233470976.0, + "56": 232422400.0, + "57": 233470976.0, + "58": 232422400.0, + "59": 232422400.0, + "60": 232422400.0, + "61": 232422400.0, + "62": 232422400.0, + "63": 232422400.0, + "64": 232422400.0, + "65": 232422400.0, + "66": 232422400.0, + "67": 232422400.0, + "68": 232422400.0, + "69": 232422400.0, + "70": 232422400.0, + "71": 232422400.0, + "72": 232422400.0, + "73": 232422400.0, + "74": 232422400.0, + "75": 232422400.0, + "76": 232422400.0, + "77": 232422400.0, + "78": 232422400.0, + "79": 232422400.0, + "80": 232422400.0, + "81": 232422400.0, + "82": 232422400.0, + "83": 232422400.0, + "84": 232422400.0, + "85": 232422400.0, + "86": 232422400.0, + "87": 232422400.0, + "88": 232422400.0, + "89": 232422400.0, + "90": 232422400.0, + "91": 232422400.0, + "92": 232422400.0, + "93": 232422400.0, + "94": 232422400.0, + "95": 232422400.0, + "96": 232422400.0, + "97": 232422400.0, + "98": 232422400.0, + "99": 233470976.0, + "100": 232422400.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 683423744.0, + "2": 773273600.0, + "3": 773276672.0, + "4": 773276672.0, + "5": 773276672.0, + "6": 773276672.0, + "7": 773276672.0, + "8": 773276672.0, + "9": 773276672.0, + "10": 773276672.0, + "11": 773276672.0, + "12": 773276672.0, + "13": 773276672.0, + "14": 773276672.0, + "15": 773276672.0, + "16": 773276672.0, + "17": 773276672.0, + "18": 773276672.0, + "19": 773276672.0, + "20": 773276672.0, + "21": 773276672.0, + "22": 773276672.0, + "23": 773276672.0, + "24": 773276672.0, + "25": 773276672.0, + "26": 773276672.0, + "27": 773276672.0, + "28": 773276672.0, + "29": 773276672.0, + "30": 773276672.0, + "31": 773276672.0, + "32": 773276672.0, + "33": 773276672.0, + "34": 773276672.0, + "35": 773276672.0, + "36": 773276672.0, + "37": 773276672.0, + "38": 773276672.0, + "39": 773276672.0, + "40": 773276672.0, + "41": 773276672.0, + "42": 773276672.0, + "43": 773276672.0, + "44": 773276672.0, + "45": 773276672.0, + "46": 773276672.0, + "47": 773276672.0, + "48": 773276672.0, + "49": 773276672.0, + "50": 775372800.0, + "51": 775372800.0, + "52": 775372800.0, + "53": 775372800.0, + "54": 775372800.0, + "55": 775372800.0, + "56": 775372800.0, + "57": 775372800.0, + "58": 775372800.0, + "59": 775372800.0, + "60": 775372800.0, + "61": 775372800.0, + "62": 775372800.0, + "63": 775372800.0, + "64": 775372800.0, + "65": 775372800.0, + "66": 775372800.0, + "67": 775372800.0, + "68": 775372800.0, + "69": 775372800.0, + "70": 775372800.0, + "71": 775372800.0, + "72": 775372800.0, + "73": 775372800.0, + "74": 775372800.0, + "75": 775372800.0, + "76": 775372800.0, + "77": 775372800.0, + "78": 775372800.0, + "79": 775372800.0, + "80": 775372800.0, + "81": 775372800.0, + "82": 775372800.0, + "83": 775372800.0, + "84": 775372800.0, + "85": 775372800.0, + "86": 775372800.0, + "87": 775372800.0, + "88": 775372800.0, + "89": 775372800.0, + "90": 775372800.0, + "91": 775372800.0, + "92": 775372800.0, + "93": 775372800.0, + "94": 775372800.0, + "95": 775372800.0, + "96": 775372800.0, + "97": 775372800.0, + "98": 775372800.0, + "99": 775373312.0, + "100": 775373312.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.23173, + "2": 0.48632, + "3": 0.3184, + "4": 0.31067, + "5": 0.31575, + "6": 0.3127, + "7": 0.3096, + "8": 0.31392, + "9": 0.31591, + "10": 0.30891, + "11": 0.31209, + "12": 0.31271, + "13": 0.30582, + "14": 0.31032, + "15": 0.30879, + "16": 0.3077, + "17": 0.30689, + "18": 0.30824, + "19": 0.30953, + "20": 0.30728, + "21": 0.31141, + "22": 0.31157, + "23": 0.30569, + "24": 0.30896, + "25": 0.30916, + "26": 0.30674, + "27": 0.31017, + "28": 0.30716, + "29": 0.30734, + "30": 0.30698, + "31": 0.30881, + "32": 0.3089, + "33": 0.30647, + "34": 0.3112, + "35": 0.311, + "36": 0.30632, + "37": 0.30856, + "38": 0.30986, + "39": 0.30502, + "40": 0.31035, + "41": 0.306, + "42": 0.30943, + "43": 0.30773, + "44": 0.30886, + "45": 0.30942, + "46": 0.30579, + "47": 0.31121, + "48": 0.31407, + "49": 0.30981, + "50": 0.30966, + "51": 0.3347, + "52": 0.35543, + "53": 0.31067, + "54": 0.30931, + "55": 0.31517, + "56": 0.30883, + "57": 0.30908, + "58": 0.31373, + "59": 0.30746, + "60": 0.31113, + "61": 0.31473, + "62": 0.30775, + "63": 0.31034, + "64": 0.31108, + "65": 0.3103, + "66": 0.3085, + "67": 0.31036, + "68": 0.31412, + "69": 0.30947, + "70": 0.30646, + "71": 0.31133, + "72": 0.30734, + "73": 0.31043, + "74": 0.31583, + "75": 0.3074, + "76": 0.30939, + "77": 0.3182, + "78": 0.30755, + "79": 0.30953, + "80": 0.3085, + "81": 0.31023, + "82": 0.30621, + "83": 0.30705, + "84": 0.31232, + "85": 0.30864, + "86": 0.31017, + "87": 0.3124, + "88": 0.30667, + "89": 0.31086, + "90": 0.31626, + "91": 0.30744, + "92": 0.30887, + "93": 0.31054, + "94": 0.31172, + "95": 0.31164, + "96": 0.31058, + "97": 0.31089, + "98": 0.30676, + "99": 0.3105, + "100": 0.31337 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..02ddabef653 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8583, + "2": 10.85411, + "3": 10.85433, + "4": 10.84406, + "5": 10.87281, + "6": 10.87934, + "7": 10.84661, + "8": 10.86143, + "9": 10.87077, + "10": 10.83262, + "11": 10.86331, + "12": 10.87296, + "13": 10.87796, + "14": 10.88589, + "15": 10.82104, + "16": 10.82761, + "17": 10.80298, + "18": 10.82097, + "19": 10.80031, + "20": 10.71378, + "21": 10.69817, + "22": 10.57538, + "23": 10.72117, + "24": 10.60092, + "25": 10.54764, + "26": 10.6113, + "27": 10.6139, + "28": 10.57775, + "29": 10.57891, + "30": 10.36785, + "31": 10.13451, + "32": 10.47059, + "33": 10.47377, + "34": 10.23444, + "35": 10.28458, + "36": 10.23593, + "37": 10.35352, + "38": 10.20691, + "39": 10.40581, + "40": 10.08924, + "41": 10.16388, + "42": 10.22671, + "43": 9.86337, + "44": 9.98192, + "45": 9.84553, + "46": 9.85754, + "47": 10.16883, + "48": 9.86475, + "49": 9.54709, + "50": 9.91942, + "51": 9.86179, + "52": 9.76168, + "53": 10.08382, + "54": 9.96739, + "55": 9.89194, + "56": 9.63776, + "57": 9.49339, + "58": 9.83896, + "59": 9.59641, + "60": 9.50823, + "61": 9.7051, + "62": 9.99501, + "63": 9.38054, + "64": 9.78299, + "65": 8.95951, + "66": 9.71042, + "67": 9.38071, + "68": 9.7888, + "69": 9.79448, + "70": 9.73441, + "71": 9.62148, + "72": 9.58793, + "73": 9.49658, + "74": 8.94341, + "75": 9.43114, + "76": 9.09713, + "77": 10.06806, + "78": 9.73005, + "79": 9.37765, + "80": 9.41099, + "81": 9.48618, + "82": 9.69673, + "83": 9.31631, + "84": 9.42185, + "85": 9.61516, + "86": 9.07552, + "87": 9.59852, + "88": 9.75045, + "89": 9.61111, + "90": 9.82372, + "91": 9.35276, + "92": 9.365, + "93": 9.08813, + "94": 8.83655, + "95": 9.52257, + "96": 9.52788, + "97": 9.31634, + "98": 9.67878, + "99": 8.89321, + "100": 9.408 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1763.0, + "2": 1819.0, + "3": 1753.0, + "4": 1839.0, + "5": 2031.0, + "6": 1952.0, + "7": 2030.0, + "8": 1821.0, + "9": 1978.0, + "10": 1514.0, + "11": 2190.0, + "12": 1980.0, + "13": 2061.0, + "14": 2005.0, + "15": 2039.0, + "16": 1942.0, + "17": 1958.0, + "18": 1872.0, + "19": 2009.0, + "20": 1786.0, + "21": 2024.0, + "22": 1927.0, + "23": 2112.0, + "24": 1797.0, + "25": 1786.0, + "26": 1847.0, + "27": 1928.0, + "28": 2178.0, + "29": 2193.0, + "30": 1995.0, + "31": 1717.0, + "32": 2149.0, + "33": 2307.0, + "34": 2027.0, + "35": 2102.0, + "36": 2075.0, + "37": 2656.0, + "38": 2499.0, + "39": 2642.0, + "40": 2331.0, + "41": 2426.0, + "42": 2542.0, + "43": 2149.0, + "44": 2238.0, + "45": 2333.0, + "46": 2656.0, + "47": 2731.0, + "48": 2697.0, + "49": 2593.0, + "50": 2736.0, + "51": 2763.0, + "52": 2904.0, + "53": 3209.0, + "54": 2987.0, + "55": 2624.0, + "56": 3069.0, + "57": 2544.0, + "58": 3248.0, + "59": 2958.0, + "60": 2691.0, + "61": 3226.0, + "62": 2712.0, + "63": 2643.0, + "64": 3019.0, + "65": 2812.0, + "66": 3479.0, + "67": 2963.0, + "68": 3241.0, + "69": 3301.0, + "70": 3423.0, + "71": 3263.0, + "72": 2524.0, + "73": 3240.0, + "74": 2175.0, + "75": 2801.0, + "76": 3300.0, + "77": 3556.0, + "78": 3435.0, + "79": 3546.0, + "80": 3676.0, + "81": 3912.0, + "82": 3694.0, + "83": 3221.0, + "84": 3559.0, + "85": 3548.0, + "86": 3164.0, + "87": 4228.0, + "88": 3325.0, + "89": 3804.0, + "90": 3382.0, + "91": 3001.0, + "92": 3415.0, + "93": 3050.0, + "94": 3856.0, + "95": 3636.0, + "96": 3973.0, + "97": 3386.0, + "98": 3934.0, + "99": 3571.0, + "100": 3660.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 232422400.0, + "2": 232422400.0, + "3": 232422400.0, + "4": 232422400.0, + "5": 232422400.0, + "6": 232422400.0, + "7": 232422400.0, + "8": 232422400.0, + "9": 232422400.0, + "10": 232422400.0, + "11": 232422400.0, + "12": 232422400.0, + "13": 232422400.0, + "14": 232422400.0, + "15": 232422400.0, + "16": 232422400.0, + "17": 232422400.0, + "18": 232422400.0, + "19": 232422400.0, + "20": 232422400.0, + "21": 232422400.0, + "22": 232422400.0, + "23": 233470976.0, + "24": 232422400.0, + "25": 232422400.0, + "26": 232422400.0, + "27": 232422400.0, + "28": 232422400.0, + "29": 232422400.0, + "30": 232422400.0, + "31": 232422400.0, + "32": 232422400.0, + "33": 232422400.0, + "34": 232422400.0, + "35": 232422400.0, + "36": 232422400.0, + "37": 232422400.0, + "38": 232422400.0, + "39": 232422400.0, + "40": 232422400.0, + "41": 232422400.0, + "42": 232422400.0, + "43": 232422400.0, + "44": 232422400.0, + "45": 232422400.0, + "46": 232422400.0, + "47": 232422400.0, + "48": 232422400.0, + "49": 232422400.0, + "50": 232422400.0, + "51": 232422400.0, + "52": 232422400.0, + "53": 232422400.0, + "54": 233470976.0, + "55": 232422400.0, + "56": 232422400.0, + "57": 232422400.0, + "58": 232422400.0, + "59": 232422400.0, + "60": 232422400.0, + "61": 232422400.0, + "62": 232422400.0, + "63": 232422400.0, + "64": 232422400.0, + "65": 232422400.0, + "66": 232422400.0, + "67": 232422400.0, + "68": 232422400.0, + "69": 232422400.0, + "70": 232422400.0, + "71": 232422400.0, + "72": 232422400.0, + "73": 232422400.0, + "74": 232422400.0, + "75": 232422400.0, + "76": 232422400.0, + "77": 232422400.0, + "78": 232422400.0, + "79": 232422400.0, + "80": 232422400.0, + "81": 232422400.0, + "82": 232422400.0, + "83": 232422400.0, + "84": 232422400.0, + "85": 232422400.0, + "86": 232422400.0, + "87": 232422400.0, + "88": 232422400.0, + "89": 232422400.0, + "90": 232422400.0, + "91": 232422400.0, + "92": 232422400.0, + "93": 232422400.0, + "94": 232422400.0, + "95": 232422400.0, + "96": 232422400.0, + "97": 232422400.0, + "98": 232422400.0, + "99": 232422400.0, + "100": 232422400.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 684472320.0, + "2": 771179520.0, + "3": 773275136.0, + "4": 773275136.0, + "5": 773275136.0, + "6": 773275136.0, + "7": 773276672.0, + "8": 773276672.0, + "9": 773276672.0, + "10": 773276672.0, + "11": 773276672.0, + "12": 773276672.0, + "13": 773276672.0, + "14": 773276672.0, + "15": 773276672.0, + "16": 773276672.0, + "17": 773276672.0, + "18": 773276672.0, + "19": 773276672.0, + "20": 773276672.0, + "21": 773276672.0, + "22": 773276672.0, + "23": 773276672.0, + "24": 773276672.0, + "25": 773276672.0, + "26": 773276672.0, + "27": 773276672.0, + "28": 773276672.0, + "29": 773276672.0, + "30": 773276672.0, + "31": 773276672.0, + "32": 773276672.0, + "33": 773276672.0, + "34": 773276672.0, + "35": 773276672.0, + "36": 773276672.0, + "37": 773276672.0, + "38": 773276672.0, + "39": 773276672.0, + "40": 773276672.0, + "41": 773276672.0, + "42": 773276672.0, + "43": 773276672.0, + "44": 773276672.0, + "45": 773276672.0, + "46": 773276672.0, + "47": 773276672.0, + "48": 773276672.0, + "49": 773276672.0, + "50": 773276672.0, + "51": 773276672.0, + "52": 773276672.0, + "53": 773276672.0, + "54": 773276672.0, + "55": 773276672.0, + "56": 773276672.0, + "57": 773276672.0, + "58": 775370752.0, + "59": 775370752.0, + "60": 775370752.0, + "61": 775370752.0, + "62": 775370752.0, + "63": 775370752.0, + "64": 775370752.0, + "65": 775370752.0, + "66": 775370752.0, + "67": 775370752.0, + "68": 775370752.0, + "69": 775370752.0, + "70": 775370752.0, + "71": 775370752.0, + "72": 775370752.0, + "73": 775370752.0, + "74": 775370752.0, + "75": 775370752.0, + "76": 775370752.0, + "77": 775370752.0, + "78": 775370752.0, + "79": 775370752.0, + "80": 775370752.0, + "81": 775370752.0, + "82": 775370752.0, + "83": 775370752.0, + "84": 775370752.0, + "85": 775370752.0, + "86": 775370752.0, + "87": 775370752.0, + "88": 775370752.0, + "89": 775370752.0, + "90": 775370752.0, + "91": 775370752.0, + "92": 775370752.0, + "93": 775370752.0, + "94": 775370752.0, + "95": 775370752.0, + "96": 775370752.0, + "97": 775370752.0, + "98": 775370752.0, + "99": 775370752.0, + "100": 775370752.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.23624, + "2": 0.45559, + "3": 0.34073, + "4": 0.34912, + "5": 0.33446, + "6": 0.33332, + "7": 0.33851, + "8": 0.33336, + "9": 0.32771, + "10": 0.33159, + "11": 0.34305, + "12": 0.32874, + "13": 0.33071, + "14": 0.32996, + "15": 0.32459, + "16": 0.32655, + "17": 0.33334, + "18": 0.32446, + "19": 0.3266, + "20": 0.32986, + "21": 0.32475, + "22": 0.3254, + "23": 0.33271, + "24": 0.32384, + "25": 0.32516, + "26": 0.33394, + "27": 0.32353, + "28": 0.32387, + "29": 0.33903, + "30": 0.32341, + "31": 0.32362, + "32": 0.33581, + "33": 0.32429, + "34": 0.32354, + "35": 0.34191, + "36": 0.32385, + "37": 0.31882, + "38": 0.33898, + "39": 0.30757, + "40": 0.31116, + "41": 0.31744, + "42": 0.30716, + "43": 0.30682, + "44": 0.31469, + "45": 0.31615, + "46": 0.30687, + "47": 0.30877, + "48": 0.31402, + "49": 0.30825, + "50": 0.30784, + "51": 0.34123, + "52": 0.30954, + "53": 0.56738, + "54": 0.30221, + "55": 0.31106, + "56": 0.30933, + "57": 0.31081, + "58": 0.30785, + "59": 0.30911, + "60": 0.3023, + "61": 0.62879, + "62": 0.30236, + "63": 0.30247, + "64": 0.30924, + "65": 0.30345, + "66": 0.29854, + "67": 0.30661, + "68": 0.30496, + "69": 0.29736, + "70": 0.30244, + "71": 0.30287, + "72": 0.29819, + "73": 0.29849, + "74": 0.30577, + "75": 0.30399, + "76": 0.30895, + "77": 0.30926, + "78": 0.30949, + "79": 0.30633, + "80": 0.31099, + "81": 0.30704, + "82": 0.30445, + "83": 0.31105, + "84": 0.30999, + "85": 0.30339, + "86": 0.30467, + "87": 0.30774, + "88": 0.30578, + "89": 0.30511, + "90": 0.31156, + "91": 0.30995, + "92": 0.30672, + "93": 0.31046, + "94": 0.3104, + "95": 0.30314, + "96": 0.30871, + "97": 0.30827, + "98": 0.30255, + "99": 0.30371, + "100": 0.30359 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..5e2ba569f87 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.8583, + "2": 10.85411, + "3": 10.85432, + "4": 10.84404, + "5": 10.87282, + "6": 10.87931, + "7": 10.84659, + "8": 10.86139, + "9": 10.87078, + "10": 10.83268, + "11": 10.86331, + "12": 10.87295, + "13": 10.87792, + "14": 10.8859, + "15": 10.821, + "16": 10.8276, + "17": 10.803, + "18": 10.82095, + "19": 10.80028, + "20": 10.71379, + "21": 10.69818, + "22": 10.57543, + "23": 10.72117, + "24": 10.60088, + "25": 10.54762, + "26": 10.61129, + "27": 10.61394, + "28": 10.57775, + "29": 10.5789, + "30": 10.36786, + "31": 10.13447, + "32": 10.47056, + "33": 10.47376, + "34": 10.23442, + "35": 10.28459, + "36": 10.23594, + "37": 10.35354, + "38": 10.2069, + "39": 10.40582, + "40": 10.08919, + "41": 10.16389, + "42": 10.22672, + "43": 9.86333, + "44": 9.98188, + "45": 9.84556, + "46": 9.85756, + "47": 10.16883, + "48": 9.86477, + "49": 9.54713, + "50": 9.91938, + "51": 9.86177, + "52": 9.76163, + "53": 10.08382, + "54": 9.96738, + "55": 9.89195, + "56": 9.63775, + "57": 9.49339, + "58": 9.83898, + "59": 9.5964, + "60": 9.50822, + "61": 9.70512, + "62": 9.99504, + "63": 9.38054, + "64": 9.78296, + "65": 8.95947, + "66": 9.71043, + "67": 9.38078, + "68": 9.78882, + "69": 9.79449, + "70": 9.73441, + "71": 9.6215, + "72": 9.58789, + "73": 9.49656, + "74": 8.94345, + "75": 9.43109, + "76": 9.09716, + "77": 10.06808, + "78": 9.73001, + "79": 9.37764, + "80": 9.411, + "81": 9.48621, + "82": 9.69667, + "83": 9.31631, + "84": 9.42182, + "85": 9.61518, + "86": 9.07555, + "87": 9.59851, + "88": 9.75045, + "89": 9.61114, + "90": 9.82372, + "91": 9.35275, + "92": 9.36497, + "93": 9.08809, + "94": 8.83652, + "95": 9.52259, + "96": 9.52792, + "97": 9.31634, + "98": 9.67876, + "99": 8.89323, + "100": 9.408 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1748.0, + "2": 1856.0, + "3": 1756.0, + "4": 1916.0, + "5": 2038.0, + "6": 2033.0, + "7": 1992.0, + "8": 1767.0, + "9": 2004.0, + "10": 1566.0, + "11": 2096.0, + "12": 1979.0, + "13": 2129.0, + "14": 1957.0, + "15": 1963.0, + "16": 1930.0, + "17": 1918.0, + "18": 1820.0, + "19": 2035.0, + "20": 1792.0, + "21": 2151.0, + "22": 1928.0, + "23": 2106.0, + "24": 1888.0, + "25": 1840.0, + "26": 1892.0, + "27": 1902.0, + "28": 2196.0, + "29": 2149.0, + "30": 1921.0, + "31": 1700.0, + "32": 2103.0, + "33": 2359.0, + "34": 1969.0, + "35": 2160.0, + "36": 2083.0, + "37": 2590.0, + "38": 2506.0, + "39": 2695.0, + "40": 2402.0, + "41": 2498.0, + "42": 2534.0, + "43": 2125.0, + "44": 2292.0, + "45": 2296.0, + "46": 2691.0, + "47": 2633.0, + "48": 2721.0, + "49": 2509.0, + "50": 2799.0, + "51": 2780.0, + "52": 2832.0, + "53": 3150.0, + "54": 2950.0, + "55": 2596.0, + "56": 2975.0, + "57": 2601.0, + "58": 3243.0, + "59": 2957.0, + "60": 2743.0, + "61": 3224.0, + "62": 2804.0, + "63": 2737.0, + "64": 3139.0, + "65": 2763.0, + "66": 3501.0, + "67": 2882.0, + "68": 3059.0, + "69": 3225.0, + "70": 3538.0, + "71": 3208.0, + "72": 2562.0, + "73": 3322.0, + "74": 2181.0, + "75": 2820.0, + "76": 3361.0, + "77": 3652.0, + "78": 3521.0, + "79": 3575.0, + "80": 3630.0, + "81": 3995.0, + "82": 3702.0, + "83": 3206.0, + "84": 3591.0, + "85": 3519.0, + "86": 3053.0, + "87": 4074.0, + "88": 3380.0, + "89": 3804.0, + "90": 3435.0, + "91": 3109.0, + "92": 3439.0, + "93": 2985.0, + "94": 3843.0, + "95": 3715.0, + "96": 3825.0, + "97": 3418.0, + "98": 3954.0, + "99": 3375.0, + "100": 3532.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 232422400.0, + "2": 232422400.0, + "3": 232422400.0, + "4": 232422400.0, + "5": 232422400.0, + "6": 232422400.0, + "7": 232422400.0, + "8": 232422400.0, + "9": 232422400.0, + "10": 232422400.0, + "11": 232422400.0, + "12": 232422400.0, + "13": 232422400.0, + "14": 232422400.0, + "15": 232422400.0, + "16": 232422400.0, + "17": 232422400.0, + "18": 232422400.0, + "19": 232422400.0, + "20": 232422400.0, + "21": 232422400.0, + "22": 232422400.0, + "23": 232422400.0, + "24": 232422400.0, + "25": 232422400.0, + "26": 232422400.0, + "27": 232422400.0, + "28": 232422400.0, + "29": 232422400.0, + "30": 232422400.0, + "31": 232422400.0, + "32": 232422400.0, + "33": 232422400.0, + "34": 232422400.0, + "35": 232422400.0, + "36": 232422400.0, + "37": 232422400.0, + "38": 232422400.0, + "39": 232422400.0, + "40": 232422400.0, + "41": 232422400.0, + "42": 232422400.0, + "43": 232422400.0, + "44": 232422400.0, + "45": 232422400.0, + "46": 232422400.0, + "47": 232422400.0, + "48": 232422400.0, + "49": 232422400.0, + "50": 232422400.0, + "51": 232422400.0, + "52": 232422400.0, + "53": 232422400.0, + "54": 232422400.0, + "55": 232422400.0, + "56": 232422400.0, + "57": 232422400.0, + "58": 232422400.0, + "59": 232422400.0, + "60": 232422400.0, + "61": 232422400.0, + "62": 232422400.0, + "63": 232422400.0, + "64": 232422400.0, + "65": 232422400.0, + "66": 232422400.0, + "67": 232422400.0, + "68": 232422400.0, + "69": 232422400.0, + "70": 232422400.0, + "71": 232422400.0, + "72": 232422400.0, + "73": 232422400.0, + "74": 232422400.0, + "75": 232422400.0, + "76": 232422400.0, + "77": 232422400.0, + "78": 232422400.0, + "79": 232422400.0, + "80": 232422400.0, + "81": 232422400.0, + "82": 232422400.0, + "83": 232422400.0, + "84": 232422400.0, + "85": 232422400.0, + "86": 232422400.0, + "87": 232422400.0, + "88": 232422400.0, + "89": 232422400.0, + "90": 232422400.0, + "91": 232422400.0, + "92": 232422400.0, + "93": 232422400.0, + "94": 232422400.0, + "95": 232422400.0, + "96": 232422400.0, + "97": 232422400.0, + "98": 232422400.0, + "99": 232422400.0, + "100": 232422400.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 686566400.0, + "2": 771176960.0, + "3": 771177472.0, + "4": 773273600.0, + "5": 773273600.0, + "6": 773273600.0, + "7": 773274624.0, + "8": 773274624.0, + "9": 773274624.0, + "10": 773274624.0, + "11": 773274624.0, + "12": 773274624.0, + "13": 773274624.0, + "14": 773276160.0, + "15": 773276160.0, + "16": 773276160.0, + "17": 773276160.0, + "18": 775372800.0, + "19": 775372800.0, + "20": 775372800.0, + "21": 775372800.0, + "22": 775372800.0, + "23": 775372800.0, + "24": 775372800.0, + "25": 775372800.0, + "26": 775372800.0, + "27": 775372800.0, + "28": 775372800.0, + "29": 775372800.0, + "30": 775372800.0, + "31": 775373312.0, + "32": 775373312.0, + "33": 775373312.0, + "34": 775373312.0, + "35": 775373312.0, + "36": 775373312.0, + "37": 775373312.0, + "38": 775373312.0, + "39": 775373312.0, + "40": 775373312.0, + "41": 775373312.0, + "42": 775373312.0, + "43": 775373824.0, + "44": 775373824.0, + "45": 775373824.0, + "46": 775373824.0, + "47": 775373824.0, + "48": 775373824.0, + "49": 775373824.0, + "50": 775373824.0, + "51": 775373824.0, + "52": 775373824.0, + "53": 775373824.0, + "54": 775373824.0, + "55": 775373824.0, + "56": 775373824.0, + "57": 775373824.0, + "58": 775373824.0, + "59": 775373824.0, + "60": 775373824.0, + "61": 775373824.0, + "62": 775373824.0, + "63": 775373824.0, + "64": 775373824.0, + "65": 775373824.0, + "66": 775373824.0, + "67": 775373824.0, + "68": 775373824.0, + "69": 775373824.0, + "70": 775373824.0, + "71": 775373824.0, + "72": 775373824.0, + "73": 775373824.0, + "74": 775373824.0, + "75": 775373824.0, + "76": 775373824.0, + "77": 775373824.0, + "78": 775373824.0, + "79": 775373824.0, + "80": 775373824.0, + "81": 775373824.0, + "82": 775373824.0, + "83": 775373824.0, + "84": 775373824.0, + "85": 775373824.0, + "86": 775373824.0, + "87": 775373824.0, + "88": 775373824.0, + "89": 775373824.0, + "90": 775373824.0, + "91": 775373824.0, + "92": 775373824.0, + "93": 775373824.0, + "94": 775373824.0, + "95": 775373824.0, + "96": 775373824.0, + "97": 775373824.0, + "98": 775373824.0, + "99": 775373824.0, + "100": 775373824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.28027, + "2": 0.43557, + "3": 0.31256, + "4": 0.52452, + "5": 0.30225, + "6": 0.30256, + "7": 0.30555, + "8": 0.30821, + "9": 0.30219, + "10": 0.30529, + "11": 0.30616, + "12": 0.30125, + "13": 0.30004, + "14": 0.30732, + "15": 0.30042, + "16": 0.29949, + "17": 0.30269, + "18": 0.30194, + "19": 0.29918, + "20": 0.30331, + "21": 0.30981, + "22": 0.30199, + "23": 0.30598, + "24": 0.30587, + "25": 0.30317, + "26": 0.30125, + "27": 0.30707, + "28": 0.30389, + "29": 0.302, + "30": 0.30486, + "31": 0.3068, + "32": 0.30229, + "33": 0.30311, + "34": 0.30869, + "35": 0.30157, + "36": 0.30236, + "37": 0.31062, + "38": 0.30491, + "39": 0.30805, + "40": 0.30378, + "41": 0.30681, + "42": 0.303, + "43": 0.30486, + "44": 0.30998, + "45": 0.30342, + "46": 0.3029, + "47": 0.30594, + "48": 0.30231, + "49": 0.30303, + "50": 0.30593, + "51": 0.34108, + "52": 0.3713, + "53": 0.30266, + "54": 0.30949, + "55": 0.30141, + "56": 0.30177, + "57": 0.31274, + "58": 0.30056, + "59": 0.30334, + "60": 0.30543, + "61": 0.31011, + "62": 0.30218, + "63": 0.30208, + "64": 0.30959, + "65": 0.30236, + "66": 0.3065, + "67": 0.31278, + "68": 0.30352, + "69": 0.30182, + "70": 0.30842, + "71": 0.50384, + "72": 0.30258, + "73": 0.30995, + "74": 0.30317, + "75": 0.30262, + "76": 0.31231, + "77": 0.3028, + "78": 0.29846, + "79": 0.30056, + "80": 0.3041, + "81": 0.29804, + "82": 0.29643, + "83": 0.30562, + "84": 0.2973, + "85": 0.29782, + "86": 0.30939, + "87": 0.29652, + "88": 0.2959, + "89": 0.29905, + "90": 0.30481, + "91": 0.29588, + "92": 0.29895, + "93": 0.30696, + "94": 0.29931, + "95": 0.30059, + "96": 0.31374, + "97": 0.30002, + "98": 0.29871, + "99": 0.30209, + "100": 0.30559 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..997f65d9fb4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Creative Commons Attribution-ShareAlike 4.0 International Public License\n\nBy exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License (\u201cPublic License\u201d).", "generated_text": " To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.\n\nA \u201cLicense Elements\u201d means the copyright and similar rights held by the Licensor that apply to the", "generated_tokens": [3870, 1278, 13820, 1593, 11227, 56484, 2188, 1402, 27539, 1435, 1261, 8633, 1044, 3213, 1584, 23369, 1278, 29960, 29520, 27868, 1294, 22666, 1307, 9825, 33868, 1307, 2576, 6856, 1321, 5481, 1044, 1321, 1278, 29960, 10648, 47506, 3213, 2516, 10741, 1294, 22666, 1307, 15021, 1278, 29960, 10648, 26510, 1562, 6187, 1278, 29960, 29520, 14736, 5178, 2425, 2576, 6856, 1321, 5481, 1338, 1065, 2129, 93552, 68175, 1414, 4938, 1278, 48896, 1321, 4510, 10741, 6452, 1536, 1278, 29960, 10648, 1455, 11145, 1317, 1278], "tpot": [2.720426321029663, 0.6659098267555237, 0.07840608060359955, 0.07743222266435623, 0.07455050200223923, 0.0731138214468956, 0.07045378535985947, 0.07106886059045792, 0.0719049945473671, 0.07009641081094742, 0.06961708515882492, 0.0693572461605072, 0.07076390087604523, 0.06894252449274063, 0.06956227123737335, 0.07301510870456696, 0.07005567848682404, 0.07221231609582901, 0.06963715702295303, 0.07077756524085999, 0.0693695992231369, 0.07059446722269058, 0.07056189328432083, 0.07043007761240005, 0.07100988924503326, 0.06954912096261978, 0.06932665407657623, 0.06911753863096237, 0.06943970918655396, 0.06930265575647354, 0.06936381012201309, 0.07106435298919678, 0.07099161297082901, 0.06973165273666382, 0.07030060887336731, 0.06937744468450546, 0.07144572585821152, 0.0705178901553154, 0.06963129341602325, 0.06951193511486053, 0.06903158873319626, 0.0701359361410141, 0.06920403242111206, 0.06966931372880936, 0.06947369128465652, 0.07044544070959091, 0.07153702527284622, 0.06970176100730896, 0.07077661156654358, 0.06910556554794312, 0.06982534378767014, 0.07268957048654556, 0.07182464003562927, 0.07119160890579224, 0.07311885058879852, 0.07156931608915329, 0.07464009523391724, 0.0744134783744812, 0.07528038322925568, 0.0751194879412651, 0.0736798420548439, 0.0735008642077446, 0.07334134727716446, 0.07211820781230927, 0.07172300666570663, 0.06956271827220917, 0.06994012743234634, 0.07024886459112167, 0.06890105456113815, 0.07088610529899597, 0.06935007870197296, 0.06854406744241714, 0.06991859525442123, 0.07241446524858475, 0.06963654607534409, 0.06925679743289948, 0.06985462456941605, 0.06919551640748978, 0.06986681371927261, 0.07047929614782333], "latency": 15.219947323203087, "logprobs": [-1.034429907798767, -2.2820096015930176, -1.1818207502365112, -0.005243122112005949, -1.3920068740844727, -0.0023506649304181337, -0.23362953960895538, -4.410646579344757e-05, -0.8059788346290588, -1.165771722793579, -0.005122631322592497, -0.01079292967915535, -0.31597569584846497, -4.845684051513672, -0.054925862699747086, -2.718410015106201, -5.851214408874512, -7.10594367980957, -1.8839404582977295, -6.603451728820801, -0.10522890836000443, -0.14382460713386536, -0.908831775188446, -0.011833587661385536, -0.08751995116472244, -0.031985729932785034, -0.03963988274335861, -1.1124131679534912, -0.005112550221383572, -0.0002406545972917229, -0.021998438984155655, -0.013275211676955223, -0.0030618475284427404, -0.007447692099958658, -0.059675432741642, -0.027009541168808937, -0.2265223264694214, -0.027810541912913322, -0.0022902467753738165, -0.007414560765028, -2.5149638652801514, -0.06250719726085663, -0.49305495619773865, -0.00015066919149830937, -0.10436679422855377, -0.002546284580603242, -0.0039064777083694935, -0.00010132275929208845, -0.03080633655190468, -0.0027381805703043938, -0.002457219874486327, -0.0022670540492981672, -0.06900941580533981, -0.015771063044667244, -0.0026065681595355272, -3.849259376525879, -0.949365496635437, -0.007241431158035994, -0.8718545436859131, -0.2303992360830307, -0.03798322752118111, -0.0003301552205812186, -0.03691234439611435, -0.08387894183397293, -0.00013851160474587232, -0.000623032043222338, -5.864924969500862e-05, -0.027150511741638184, -0.00028236693469807506, -4.279521817807108e-05, -0.0054723224602639675, -0.0008360228384844959, -0.17018567025661469, -0.0045921108685433865, -0.0020528212189674377, -5.245195097813848e-06, -0.16259293258190155, -0.001334729720838368, -3.45700973412022e-05, -0.0004881620698142797, -0.014900578185915947, -2.706014311115723e-05, -0.004492428619414568, -0.03925368934869766, -0.0006156456656754017, -0.1234944611787796, -0.007040690630674362, -0.0002475670480635017, -0.0005224770284257829, -0.005545470397919416, -4.255681051290594e-05, -0.03230837732553482, -0.07245421409606934, -0.11131127178668976, -0.007754461374133825, -5.6980417866725475e-05, -0.0030440206173807383, -0.006326647009700537, -4.875540980719961e-05, -0.0002343380037928, -3.0397906812140718e-05, -0.012009222991764545, -0.009006588719785213, -0.001957882894203067, -1.2040065485052764e-05, -0.0010608765296638012, -0.0002615109842736274, -0.06175156682729721, -0.01025254838168621, -0.0012381753185763955, -0.0038769098464399576, -0.03451932966709137, -6.270212179515511e-05, -0.12754283845424652, -0.1323665827512741, -0.0015854182420298457, -3.0397906812140718e-05, -0.00013350549852475524, -0.15308424830436707, -1.8770537376403809, -0.31804990768432617, -0.5516462922096252, -0.19552597403526306, -0.10012278705835342, -0.7691615223884583, -0.022935237735509872, -0.5969871282577515, -1.5031214952468872, -0.384999543428421, -0.0883193388581276, -0.9926105737686157, -0.389201283454895, -0.22737500071525574, -0.012403964065015316, -0.0001934579631779343, -0.5111952424049377, -0.24787002801895142, -0.0004378790326882154, -0.027225803583860397]}, "1": {"input_prompt": "GNU GENERAL PUBLIC LICENSE\nVersion 3, 29 June 2007\n\nPreamble\n\nThe GNU General Public License is a free, copyleft license for software and other kinds of works.", "generated_text": " The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies", "generated_tokens": [1531, 91700, 1394, 2725, 9314, 1321, 2147, 18107, 6113, 1584, 9543, 1317, 4069, 5109, 2143, 18613, 1317, 9730, 1321, 4036, 1278, 6113, 1046, 5652, 9033, 1044, 1278, 56703, 7487, 11227, 56484, 1395, 13650, 1317, 33152, 2143, 18613, 1317, 9730, 1321, 4036, 1747, 15628, 1307, 1261, 3467, 1742, 1611, 3180, 5257, 1494, 10714, 5370, 9314, 1394, 1747, 2246, 8616, 1046, 2837, 1044, 1278, 16611, 29494, 17364, 1044, 2210, 1278, 56703, 7487, 11227, 56484, 1394, 2725, 1307, 2948, 9314, 1059, 1494, 28735], "tpot": [0.7174983620643616, 0.07785984128713608, 0.0764852836728096, 0.07466614246368408, 0.0717785581946373, 0.07468675076961517, 0.07152419537305832, 0.06969526410102844, 0.07110752165317535, 0.06970572471618652, 0.06920454651117325, 0.06990531086921692, 0.07004140317440033, 0.0712602511048317, 0.06903129816055298, 0.07071229070425034, 0.07059088349342346, 0.06999795883893967, 0.06967964768409729, 0.07150192558765411, 0.06971721351146698, 0.06916943937540054, 0.06966301053762436, 0.06984022259712219, 0.069039486348629, 0.06911581009626389, 0.06958959996700287, 0.0706978514790535, 0.06978118419647217, 0.06945011019706726, 0.0694519653916359, 0.0701381117105484, 0.06995609402656555, 0.06912890076637268, 0.06973984092473984, 0.06986332684755325, 0.0694037452340126, 0.06932634115219116, 0.06928720325231552, 0.06932701170444489, 0.0689065232872963, 0.07238291203975677, 0.07131846249103546, 0.06996982544660568, 0.07046765089035034, 0.0726158395409584, 0.07259414345026016, 0.07020287960767746, 0.07142271846532822, 0.0708770900964737, 0.07033068686723709, 0.07027311623096466, 0.06996393948793411, 0.07049206644296646, 0.06900809705257416, 0.0699913278222084, 0.07210537791252136, 0.0702073872089386, 0.07132425904273987, 0.06975401192903519, 0.07038697600364685, 0.06933759897947311, 0.06984009593725204, 0.06967458873987198, 0.06888572871685028, 0.06986083090305328, 0.06940105557441711, 0.06956079602241516, 0.06917689740657806, 0.06920892745256424, 0.0712355226278305, 0.07001478224992752, 0.06936268508434296, 0.069720059633255, 0.07083427160978317, 0.0705321878194809, 0.06942963600158691, 0.06904758512973785, 0.06982547044754028, 0.07130048424005508], "latency": 15.219947323203087, "logprobs": [-7.482367992401123, -4.782957077026367, -0.15608751773834229, -0.05624598637223244, -0.0666063204407692, -0.000226472009671852, -0.002314390614628792, -0.7274855971336365, -2.047292470932007, -0.0029495328199118376, -0.8379128575325012, -0.00838379468768835, -0.0015731590101495385, -0.02502445876598358, -0.0011831672163680196, -0.0041245874017477036, -0.00022742546570952982, -0.0002157455455744639, -5.936446541454643e-05, -0.0004980515805073082, -0.0002698534226510674, -2.2059996128082275, -6.3529462814331055, -0.011952094733715057, -0.00010239553375868127, -0.3807244598865509, -0.20424246788024902, -0.41751813888549805, -0.005481095518916845, -1.1086402082582936e-05, -0.007466860581189394, -0.00838320329785347, -0.009201501496136189, -0.017721762880682945, -0.0024051330983638763, -0.00045718232286162674, -8.702239938429557e-06, -1.5139465176616795e-05, -0.0031880526803433895, -0.005352333653718233, -0.10581696778535843, -0.05035088211297989, -0.5795518755912781, -0.019671587273478508, -0.007066140417009592, -0.034393906593322754, -6.98299503326416, -0.46170496940612793, -0.04491615667939186, -0.030878927558660507, -0.0016607552533969283, -0.0006268443539738655, -0.00987135712057352, -6.496695277746767e-05, -0.8354158997535706, -0.007698154542595148, -0.0012696071062237024, -0.0004447901446837932, -0.0018221217906102538, -0.0014835315523669124, -0.001134824356995523, -0.034311436116695404, -0.014452068135142326, -0.0019802500028163195, -0.014066009782254696, -0.002191762439906597, -0.0013553252210840583, -0.015814948827028275, -0.007888473570346832, -0.01361841894686222, -0.0007306052139028907, -0.00019095504831057042, -0.0022776394616812468, -0.0008617501589469612, -0.000940476544201374, -0.0038709724321961403, -0.0038757221773266792, -0.004625573288649321, -0.0022389839868992567, -5.6503606174374e-05, -0.0039673917926847935, -0.007623270619660616, -0.0014759134501218796, -0.0002557904226705432, -0.000474936212413013, -0.00139246741309762, -0.001206504413858056, -0.00015853578224778175, -0.000545472139492631, -0.0014616292901337147, -0.002354232594370842, -9.703165414975956e-05, -0.00024399164249189198, -0.16811230778694153, -0.004927040543407202, -0.017750689759850502, -0.0001802282058633864, -0.0014571059728041291, -0.003566454164683819, -0.00021264675888232887, -0.01999940164387226, -0.0008441222598776221, -4.8636207793606445e-05, -0.0011026738211512566, -1.1801649634435307e-05, -0.1814543753862381, -0.016339080408215523, -0.014278624206781387, -0.0029024637769907713, -0.006082594860345125, -0.0016703951405361295, -0.0006364941946230829, -0.0010387268848717213, -0.002667442662641406, -0.0002610342635307461, -0.002438787603750825, -0.013884739950299263, -0.007366991601884365, -0.005141369998455048, -0.010307767428457737, -0.0009261847590096295, -0.0009263038518838584, -0.0068603926338255405, -0.0008634176338091493, -0.0006144542712718248, -2.2053474822314456e-05, -0.004078048747032881]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..bb6ee34ea21 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Creative Commons Attribution-ShareAlike 4.0 International Public License\n\nBy exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License (\u201cPublic License\u201d).", "generated_text": " To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.\n\nA \u201cLicense Elements\u201d means the copyright and similar rights held by the Licensor that apply to the", "generated_tokens": [3870, 1278, 13820, 1593, 11227, 56484, 2188, 1402, 27539, 1435, 1261, 8633, 1044, 3213, 1584, 23369, 1278, 29960, 29520, 27868, 1294, 22666, 1307, 9825, 33868, 1307, 2576, 6856, 1321, 5481, 1044, 1321, 1278, 29960, 10648, 47506, 3213, 2516, 10741, 1294, 22666, 1307, 15021, 1278, 29960, 10648, 26510, 1562, 6187, 1278, 29960, 29520, 14736, 5178, 2425, 2576, 6856, 1321, 5481, 1338, 1065, 2129, 93552, 68175, 1414, 4938, 1278, 48896, 1321, 4510, 10741, 6452, 1536, 1278, 29960, 10648, 1455, 11145, 1317, 1278], "tpot": [2.4923102855682373, 0.6759980320930481, 0.08269506692886353, 0.08119833469390869, 0.08115603029727936, 0.0800175741314888, 0.08051318675279617, 0.08278025686740875, 0.08045568317174911, 0.08009149134159088, 0.07951929420232773, 0.08059776574373245, 0.08038483560085297, 0.07992669194936752, 0.08057552576065063, 0.07977830618619919, 0.08127715438604355, 0.08072630316019058, 0.08037532866001129, 0.0804634839296341, 0.08137375861406326, 0.0813906267285347, 0.08126940578222275, 0.08076531440019608, 0.08090108633041382, 0.0793602243065834, 0.08094745874404907, 0.0810527354478836, 0.08107049763202667, 0.08040124177932739, 0.07976572960615158, 0.08069661259651184, 0.0826275497674942, 0.0810798704624176, 0.07998496294021606, 0.08005843311548233, 0.0805768370628357, 0.08088915050029755, 0.08113190531730652, 0.08077005296945572, 0.08062981814146042, 0.08078550547361374, 0.08168613910675049, 0.08143996447324753, 0.08142809569835663, 0.08187657594680786, 0.07972115278244019, 0.08118259161710739, 0.08142592012882233, 0.0806335061788559, 0.08064771443605423, 0.07944890111684799, 0.08106396347284317, 0.08158227801322937, 0.0814877450466156, 0.08077871799468994, 0.0795617327094078, 0.08221545070409775, 0.08131680637598038, 0.08039452880620956, 0.080450139939785, 0.07980994880199432, 0.08013289421796799, 0.08113926649093628, 0.08158918470144272, 0.08053535968065262, 0.08091792464256287, 0.07972493022680283, 0.08126131445169449, 0.08287584036588669, 0.0808253064751625, 0.08110111951828003, 0.07954514771699905, 0.08085116744041443, 0.0816071406006813, 0.08060210943222046, 0.08102639764547348, 0.07997968047857285, 0.08147360384464264, 0.08081503957509995], "latency": 16.56691719801165, "logprobs": [-1.034429907798767, -2.2820096015930176, -1.1818207502365112, -0.005243122112005949, -1.3920068740844727, -0.0023506649304181337, -0.23362953960895538, -4.410646579344757e-05, -0.8059788346290588, -1.165771722793579, -0.005122631322592497, -0.01079292967915535, -0.31597569584846497, -4.845684051513672, -0.054925862699747086, -2.718410015106201, -5.851214408874512, -7.10594367980957, -1.8839404582977295, -6.603451728820801, -0.10522890836000443, -0.14382460713386536, -0.908831775188446, -0.011833587661385536, -0.08751995116472244, -0.031985729932785034, -0.03963988274335861, -1.1124131679534912, -0.005112550221383572, -0.0002406545972917229, -0.021998438984155655, -0.013275211676955223, -0.0030618475284427404, -0.007447692099958658, -0.059675432741642, -0.027009541168808937, -0.2265223264694214, -0.027810541912913322, -0.0022902467753738165, -0.007414560765028, -2.5149638652801514, -0.06250719726085663, -0.49305495619773865, -0.00015066919149830937, -0.10436679422855377, -0.002546284580603242, -0.0039064777083694935, -0.00010132275929208845, -0.03080633655190468, -0.0027381805703043938, -0.002457219874486327, -0.0022670540492981672, -0.06900941580533981, -0.015771063044667244, -0.0026065681595355272, -3.849259376525879, -0.949365496635437, -0.007241431158035994, -0.8718545436859131, -0.2303992360830307, -0.03798322752118111, -0.0003301552205812186, -0.03691234439611435, -0.08387894183397293, -0.00013851160474587232, -0.000623032043222338, -5.864924969500862e-05, -0.027150511741638184, -0.00028236693469807506, -4.279521817807108e-05, -0.0054723224602639675, -0.0008360228384844959, -0.17018567025661469, -0.0045921108685433865, -0.0020528212189674377, -5.245195097813848e-06, -0.16259293258190155, -0.001334729720838368, -3.45700973412022e-05, -0.0004881620698142797, -0.014900578185915947, -2.706014311115723e-05, -0.004492428619414568, -0.03925368934869766, -0.0006156456656754017, -0.1234944611787796, -0.007040690630674362, -0.0002475670480635017, -0.0005224770284257829, -0.005545470397919416, -4.255681051290594e-05, -0.03230837732553482, -0.07245421409606934, -0.11131127178668976, -0.007754461374133825, -5.6980417866725475e-05, -0.0030440206173807383, -0.006326647009700537, -4.875540980719961e-05, -0.0002343380037928, -3.0397906812140718e-05, -0.012009222991764545, -0.009006588719785213, -0.001957882894203067, -1.2040065485052764e-05, -0.0010608765296638012, -0.0002615109842736274, -0.06175156682729721, -0.01025254838168621, -0.0012381753185763955, -0.0038769098464399576, -0.03451932966709137, -6.270212179515511e-05, -0.12754283845424652, -0.1323665827512741, -0.0015854182420298457, -3.0397906812140718e-05, -0.00013350549852475524, -0.15308424830436707, -1.8770537376403809, -0.31804990768432617, -0.5516462922096252, -0.19552597403526306, -0.10012278705835342, -0.7691615223884583, -0.022935237735509872, -0.5969871282577515, -1.5031214952468872, -0.384999543428421, -0.0883193388581276, -0.9926105737686157, -0.389201283454895, -0.22737500071525574, -0.012403964065015316, -0.0001934579631779343, -0.5111952424049377, -0.24787002801895142, -0.0004378790326882154, -0.027225803583860397]}, "1": {"input_prompt": "GNU GENERAL PUBLIC LICENSE\nVersion 3, 29 June 2007\n\nPreamble\n\nThe GNU General Public License is a free, copyleft license for software and other kinds of works.", "generated_text": " The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies", "generated_tokens": [1531, 91700, 1394, 2725, 9314, 1321, 2147, 18107, 6113, 1584, 9543, 1317, 4069, 5109, 2143, 18613, 1317, 9730, 1321, 4036, 1278, 6113, 1046, 5652, 9033, 1044, 1278, 56703, 7487, 11227, 56484, 1395, 13650, 1317, 33152, 2143, 18613, 1317, 9730, 1321, 4036, 1747, 15628, 1307, 1261, 3467, 1742, 1611, 3180, 5257, 1494, 10714, 5370, 9314, 1394, 1747, 2246, 8616, 1046, 2837, 1044, 1278, 16611, 29494, 17364, 1044, 2210, 1278, 56703, 7487, 11227, 56484, 1394, 2725, 1307, 2948, 9314, 1059, 1494, 28735], "tpot": [0.6688169836997986, 0.08646825700998306, 0.083538718521595, 0.08260326087474823, 0.08199965208768845, 0.08158879727125168, 0.0802709087729454, 0.08419913798570633, 0.07995779067277908, 0.08143891394138336, 0.08108057081699371, 0.08084486424922943, 0.08102915436029434, 0.07983194291591644, 0.08131516724824905, 0.0816650539636612, 0.08091884851455688, 0.08093494176864624, 0.08018704503774643, 0.08179347217082977, 0.08112754672765732, 0.08112083375453949, 0.0805734395980835, 0.08067212998867035, 0.08022300899028778, 0.08121798932552338, 0.08183427155017853, 0.0806741788983345, 0.08114969730377197, 0.07974809408187866, 0.080985888838768, 0.08140931278467178, 0.0831851214170456, 0.08096041530370712, 0.07966978847980499, 0.08085939288139343, 0.08112092316150665, 0.08085711300373077, 0.08063827455043793, 0.07968409359455109, 0.08139641582965851, 0.08102294057607651, 0.08102816343307495, 0.08071696013212204, 0.08157248049974442, 0.08005645126104355, 0.08118710666894913, 0.0810147151350975, 0.08026038110256195, 0.08055280148983002, 0.07966405898332596, 0.08168742060661316, 0.0816090852022171, 0.08039574325084686, 0.08089830726385117, 0.0794670432806015, 0.08368594944477081, 0.08118339627981186, 0.08051532506942749, 0.08080841600894928, 0.07947234809398651, 0.08114787191152573, 0.08128608018159866, 0.08138518780469894, 0.08067911118268967, 0.08099766820669174, 0.08047705888748169, 0.08083853125572205, 0.08097779005765915, 0.08190613985061646, 0.08038448542356491, 0.08032994717359543, 0.08100729435682297, 0.08379139006137848, 0.08242924511432648, 0.08085381984710693, 0.07933055609464645, 0.0811963826417923, 0.08024899661540985, 0.08009414374828339], "latency": 16.56691719801165, "logprobs": [-7.482367992401123, -4.782957077026367, -0.15608751773834229, -0.05624598637223244, -0.0666063204407692, -0.000226472009671852, -0.002314390614628792, -0.7274855971336365, -2.047292470932007, -0.0029495328199118376, -0.8379128575325012, -0.00838379468768835, -0.0015731590101495385, -0.02502445876598358, -0.0011831672163680196, -0.0041245874017477036, -0.00022742546570952982, -0.0002157455455744639, -5.936446541454643e-05, -0.0004980515805073082, -0.0002698534226510674, -2.2059996128082275, -6.3529462814331055, -0.011952094733715057, -0.00010239553375868127, -0.3807244598865509, -0.20424246788024902, -0.41751813888549805, -0.005481095518916845, -1.1086402082582936e-05, -0.007466860581189394, -0.00838320329785347, -0.009201501496136189, -0.017721762880682945, -0.0024051330983638763, -0.00045718232286162674, -8.702239938429557e-06, -1.5139465176616795e-05, -0.0031880526803433895, -0.005352333653718233, -0.10581696778535843, -0.05035088211297989, -0.5795518755912781, -0.019671587273478508, -0.007066140417009592, -0.034393906593322754, -6.98299503326416, -0.46170496940612793, -0.04491615667939186, -0.030878927558660507, -0.0016607552533969283, -0.0006268443539738655, -0.00987135712057352, -6.496695277746767e-05, -0.8354158997535706, -0.007698154542595148, -0.0012696071062237024, -0.0004447901446837932, -0.0018221217906102538, -0.0014835315523669124, -0.001134824356995523, -0.034311436116695404, -0.014452068135142326, -0.0019802500028163195, -0.014066009782254696, -0.002191762439906597, -0.0013553252210840583, -0.015814948827028275, -0.007888473570346832, -0.01361841894686222, -0.0007306052139028907, -0.00019095504831057042, -0.0022776394616812468, -0.0008617501589469612, -0.000940476544201374, -0.0038709724321961403, -0.0038757221773266792, -0.004625573288649321, -0.0022389839868992567, -5.6503606174374e-05, -0.0039673917926847935, -0.007623270619660616, -0.0014759134501218796, -0.0002557904226705432, -0.000474936212413013, -0.00139246741309762, -0.001206504413858056, -0.00015853578224778175, -0.000545472139492631, -0.0014616292901337147, -0.002354232594370842, -9.703165414975956e-05, -0.00024399164249189198, -0.16811230778694153, -0.004927040543407202, -0.017750689759850502, -0.0001802282058633864, -0.0014571059728041291, -0.003566454164683819, -0.00021264675888232887, -0.01999940164387226, -0.0008441222598776221, -4.8636207793606445e-05, -0.0011026738211512566, -1.1801649634435307e-05, -0.1814543753862381, -0.016339080408215523, -0.014278624206781387, -0.0029024637769907713, -0.006082594860345125, -0.0016703951405361295, -0.0006364941946230829, -0.0010387268848717213, -0.002667442662641406, -0.0002610342635307461, -0.002438787603750825, -0.013884739950299263, -0.007366991601884365, -0.005141369998455048, -0.010307767428457737, -0.0009261847590096295, -0.0009263038518838584, -0.0068603926338255405, -0.0008634176338091493, -0.0006144542712718248, -2.2053474822314456e-05, -0.004078048747032881]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..e7bab115f6e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [0.5686635971069336, 0.006066783796995878, 0.00542214373126626, 0.005529535934329033, 0.005290016066282988, 0.005014463793486357, 0.004941120278090239, 0.004862783942371607, 0.004948512185364962, 0.004847776144742966, 0.004972127731889486, 0.0052157118916511536, 0.005366367753595114, 0.0054197758436203, 0.005486688110977411, 0.005352096166461706, 0.005394879728555679, 0.005450463853776455, 0.005347424186766148, 0.005441728048026562, 0.0054066237062215805, 0.0052277762442827225, 0.005518496036529541, 0.005288544110953808, 0.005351583939045668, 0.005274975672364235, 0.0052535682916641235, 0.005358528345823288, 0.00528879975900054, 0.0052247364073991776], "latency": 0.7284151650965214, "logprobs": [-9.358616828918457, -2.7474308013916016, -4.628000259399414, -1.5015846490859985, -0.6537986993789673, -1.6720777750015259, -2.478705883026123, -2.0523874759674072, -2.4486241340637207, -6.257688522338867, -1.4695018529891968, -3.4444499015808105, -4.394474029541016, -3.875497817993164, -2.0133562088012695, -1.8832889795303345, -3.8004486560821533, -6.784910678863525, -0.2949134111404419, -0.9851954579353333, -6.626471519470215, -7.186152458190918, -12.800604820251465, -2.2686400413513184, -3.7816011905670166, -0.4978560209274292, -4.371628284454346, -0.0696188285946846, -0.09487748891115189, -3.2375073432922363, -10.075444221496582, -1.138173222541809, -5.97689151763916, -5.093283653259277, -3.874396324157715, -2.6073620319366455, -3.466899871826172, -5.642228126525879, -1.6154727935791016, -5.416567325592041, -12.158267974853516, -12.610607147216797, -0.09664110094308853, -2.5213418006896973, -1.3747841119766235, -2.8510401248931885, -1.1877963542938232, -0.006288621574640274, -3.382380962371826, -13.207911491394043, -4.477662086486816, -2.5299136638641357, -6.053747653961182, -0.7650555372238159, -0.04903985932469368, -1.5557448863983154, -1.1315535306930542, -5.610307216644287, -0.4059771001338959, -4.961302280426025, -0.5701270699501038, -0.7174267172813416, -2.4735305309295654, -13.610812187194824, -0.09192369878292084, -3.5248732566833496, -1.3797900676727295, -6.429551124572754, -0.541852593421936, -3.5403199195861816, -0.8477706909179688, -1.5764057636260986, -5.343497276306152, -17.19588851928711, -6.635483741760254, -0.8923014402389526, -4.114314556121826, -1.2193646430969238, -2.2128424644470215, -1.7673423290252686, -0.22567729651927948, -9.320298194885254, -0.1282224804162979, -7.3249101638793945, -2.511319875717163, -4.0696563720703125, -3.5427517890930176, -1.9300249814987183, -2.347038507461548, -1.5178614854812622, -2.366441249847412, -1.744020938873291, -1.1570327281951904, -3.0150983333587646, -0.5272141098976135, -0.4669455885887146, -1.7157398462295532, -0.8362292051315308, -0.41491177678108215, -0.9386503100395203, -1.5008316040039062, -0.4635278284549713, -1.6312834024429321, -0.5320357084274292, -1.2249717712402344, -1.1707526445388794, -0.0023814670275896788, -1.1655761003494263, -0.006950841750949621, -0.7309689521789551, -0.7428325414657593, -0.042878177016973495, -0.8572992086410522, -0.01948782242834568, -2.0537290573120117, -1.2817553281784058, -0.8235744833946228]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..18ce65a905f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [0.6098978519439697, 0.00587167963385582, 0.00553337624296546, 0.005388895981013775, 0.0052880640141665936, 0.005359936039894819, 0.00534518389031291, 0.005303360056132078, 0.0053532798774540424, 0.005232864059507847, 0.0053773121908307076, 0.005341055803000927, 0.0052644480019807816, 0.005387584213167429, 0.005375008098781109, 0.00524944020435214, 0.0053992001339793205, 0.005333151668310165, 0.0052451519295573235, 0.005348992068320513, 0.005396031774580479, 0.0052389120683074, 0.005332960281521082, 0.005230464041233063, 0.005353568121790886, 0.005343679804354906, 0.005257599987089634, 0.005404096096754074, 0.005395135842263699, 0.005260608159005642], "latency": 0.769633749499917, "logprobs": [-9.358616828918457, -2.7474308013916016, -4.628000259399414, -1.5015846490859985, -0.6537986993789673, -1.6720777750015259, -2.478705883026123, -2.0523874759674072, -2.4486241340637207, -6.257688522338867, -1.4695018529891968, -3.4444499015808105, -4.394474029541016, -3.875497817993164, -2.0133562088012695, -1.8832889795303345, -3.8004486560821533, -6.784910678863525, -0.2949134111404419, -0.9851954579353333, -6.626471519470215, -7.186152458190918, -12.800604820251465, -2.2686400413513184, -3.7816011905670166, -0.4978560209274292, -4.371628284454346, -0.0696188285946846, -0.09487748891115189, -3.2375073432922363, -10.075444221496582, -1.138173222541809, -5.97689151763916, -5.093283653259277, -3.874396324157715, -2.6073620319366455, -3.466899871826172, -5.642228126525879, -1.6154727935791016, -5.416567325592041, -12.158267974853516, -12.610607147216797, -0.09664110094308853, -2.5213418006896973, -1.3747841119766235, -2.8510401248931885, -1.1877963542938232, -0.006288621574640274, -3.382380962371826, -13.207911491394043, -4.477662086486816, -2.5299136638641357, -6.053747653961182, -0.7650555372238159, -0.04903985932469368, -1.5557448863983154, -1.1315535306930542, -5.610307216644287, -0.4059771001338959, -4.961302280426025, -0.5701270699501038, -0.7174267172813416, -2.4735305309295654, -13.610812187194824, -0.09192369878292084, -3.5248732566833496, -1.3797900676727295, -6.429551124572754, -0.541852593421936, -3.5403199195861816, -0.8477706909179688, -1.5764057636260986, -5.343497276306152, -17.19588851928711, -6.635483741760254, -0.8923014402389526, -4.114314556121826, -1.2193646430969238, -2.2128424644470215, -1.7673423290252686, -0.22567729651927948, -9.320298194885254, -0.1282224804162979, -7.3249101638793945, -2.511319875717163, -4.0696563720703125, -3.5427517890930176, -1.9300249814987183, -2.347038507461548, -1.5178614854812622, -2.366441249847412, -1.744020938873291, -1.1570327281951904, -3.0150983333587646, -0.5272141098976135, -0.4669455885887146, -1.7157398462295532, -0.8362292051315308, -0.41491177678108215, -0.9386503100395203, -1.5008316040039062, -0.4635278284549713, -1.6312834024429321, -0.5320357084274292, -1.2249717712402344, -1.1707526445388794, -0.0023814670275896788, -1.1655761003494263, -0.006950841750949621, -0.7309689521789551, -0.7428325414657593, -0.042878177016973495, -0.8572992086410522, -0.01948782242834568, -2.0537290573120117, -1.2817553281784058, -0.8235744833946228]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..05e16225cd4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [0.561271607875824, 0.010015103965997696, 0.008491167798638344, 0.007847008295357227, 0.007853696122765541, 0.007908639498054981, 0.0077699837274849415, 0.007929407991468906, 0.007948416285216808, 0.008069856092333794, 0.008628063835203648, 0.00827731192111969, 0.007847904227674007, 0.007874688133597374, 0.008285152725875378, 0.008413120172917843, 0.008548031561076641, 0.008463519625365734, 0.008221376687288284, 0.008037183433771133, 0.007799903862178326, 0.007931231521070004, 0.008392063900828362, 0.008282655850052834, 0.00781238405033946, 0.007775456178933382, 0.007549664005637169, 0.00783606432378292, 0.00781475193798542, 0.00798182375729084], "latency": 0.8031206205487251, "logprobs": [-9.362524032592773, -2.761181354522705, -4.53175163269043, -1.5617105960845947, -0.7528610229492188, -1.6253626346588135, -2.45941162109375, -2.1533684730529785, -2.346475124359131, -6.157411575317383, -1.3193804025650024, -3.5247979164123535, -4.488514423370361, -3.759702682495117, -2.022449493408203, -1.8945543766021729, -3.6219239234924316, -6.842351913452148, -0.3225390613079071, -0.8537865877151489, -6.520284652709961, -7.550463676452637, -12.595708847045898, -2.9504785537719727, -3.8068642616271973, -0.5890476107597351, -4.3587751388549805, -0.0665372759103775, -0.06955777853727341, -3.3523848056793213, -9.773153305053711, -1.0814638137817383, -6.204980850219727, -5.33505392074585, -3.9411606788635254, -2.7358486652374268, -3.2924106121063232, -6.0152740478515625, -1.8116782903671265, -6.243865013122559, -12.158185958862305, -12.65605354309082, -0.08688803017139435, -2.6079092025756836, -1.4071979522705078, -2.990557909011841, -1.2379846572875977, -0.006849618628621101, -3.4119930267333984, -13.05937671661377, -4.2840399742126465, -2.4802193641662598, -5.933547019958496, -0.9116124510765076, -0.060975510627031326, -1.5681536197662354, -1.0339949131011963, -5.617187023162842, -0.41873589158058167, -4.9402852058410645, -0.5690340995788574, -0.6301103830337524, -2.396580696105957, -13.29629898071289, -0.08181379735469818, -3.6629719734191895, -1.105454683303833, -6.127413749694824, -0.5906393527984619, -3.548814296722412, -0.9948520660400391, -1.5058085918426514, -5.211822509765625, -17.489606857299805, -6.8240861892700195, -0.9539748430252075, -4.2172040939331055, -1.1572864055633545, -2.3540186882019043, -1.798780918121338, -0.2533280849456787, -9.403679847717285, -0.1830129772424698, -7.440906524658203, -2.228740692138672, -4.196046352386475, -3.5180575847625732, -1.9530653953552246, -2.2825613021850586, -1.5544131994247437, -2.3991782665252686, -1.554469347000122, -1.290938377380371, -2.785543203353882, -0.6400948166847229, -0.48503541946411133, -1.432410478591919, -0.9366894960403442, -0.42669478058815, -0.9688448905944824, -1.4787911176681519, -0.43357178568840027, -1.8381303548812866, -0.6210520267486572, -1.0601571798324585, -1.1962573528289795, -0.002758747199550271, -1.2365548610687256, -0.008277395740151405, -0.7464911341667175, -0.8628943562507629, -0.0671280175447464, -0.953361988067627, -0.02595982328057289, -2.139401435852051, -1.1942673921585083, -0.7968283295631409]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..6a5ace35ec7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [0.6358857750892639, 0.009907487779855728, 0.010546143166720867, 0.009435135871171951, 0.010123520158231258, 0.009925439953804016, 0.008350367657840252, 0.008556703105568886, 0.008582624606788158, 0.00840403139591217, 0.008557791821658611, 0.008503519929945469, 0.008379808627068996, 0.009403808042407036, 0.009133151732385159, 0.008321152068674564, 0.008845727890729904, 0.008372415788471699, 0.008591103367507458, 0.009211359545588493, 0.009166751988232136, 0.009767616167664528, 0.008620256558060646, 0.009338144212961197, 0.010125535540282726, 0.010068127885460854, 0.009669983759522438, 0.010439807549118996, 0.010279008187353611, 0.0103340158239007], "latency": 0.9097336048725992, "logprobs": [-9.362524032592773, -2.761181354522705, -4.53175163269043, -1.5617105960845947, -0.7528610229492188, -1.6253626346588135, -2.45941162109375, -2.1533684730529785, -2.346475124359131, -6.157411575317383, -1.3193804025650024, -3.5247979164123535, -4.488514423370361, -3.759702682495117, -2.022449493408203, -1.8945543766021729, -3.6219239234924316, -6.842351913452148, -0.3225390613079071, -0.8537865877151489, -6.520284652709961, -7.550463676452637, -12.595708847045898, -2.9504785537719727, -3.8068642616271973, -0.5890476107597351, -4.3587751388549805, -0.0665372759103775, -0.06955777853727341, -3.3523848056793213, -9.773153305053711, -1.0814638137817383, -6.204980850219727, -5.33505392074585, -3.9411606788635254, -2.7358486652374268, -3.2924106121063232, -6.0152740478515625, -1.8116782903671265, -6.243865013122559, -12.158185958862305, -12.65605354309082, -0.08688803017139435, -2.6079092025756836, -1.4071979522705078, -2.990557909011841, -1.2379846572875977, -0.006849618628621101, -3.4119930267333984, -13.05937671661377, -4.2840399742126465, -2.4802193641662598, -5.933547019958496, -0.9116124510765076, -0.060975510627031326, -1.5681536197662354, -1.0339949131011963, -5.617187023162842, -0.41873589158058167, -4.9402852058410645, -0.5690340995788574, -0.6301103830337524, -2.396580696105957, -13.29629898071289, -0.08181379735469818, -3.6629719734191895, -1.105454683303833, -6.127413749694824, -0.5906393527984619, -3.548814296722412, -0.9948520660400391, -1.5058085918426514, -5.211822509765625, -17.489606857299805, -6.8240861892700195, -0.9539748430252075, -4.2172040939331055, -1.1572864055633545, -2.3540186882019043, -1.798780918121338, -0.2533280849456787, -9.403679847717285, -0.1830129772424698, -7.440906524658203, -2.228740692138672, -4.196046352386475, -3.5180575847625732, -1.9530653953552246, -2.2825613021850586, -1.5544131994247437, -2.3991782665252686, -1.554469347000122, -1.290938377380371, -2.785543203353882, -0.6400948166847229, -0.48503541946411133, -1.432410478591919, -0.9366894960403442, -0.42669478058815, -0.9688448905944824, -1.4787911176681519, -0.43357178568840027, -1.8381303548812866, -0.6210520267486572, -1.0601571798324585, -1.1962573528289795, -0.002758747199550271, -1.2365548610687256, -0.008277395740151405, -0.7464911341667175, -0.8628943562507629, -0.0671280175447464, -0.953361988067627, -0.02595982328057289, -2.139401435852051, -1.1942673921585083, -0.7968283295631409]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f37c35812e5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [2.1197516918182373, 0.3172459900379181, 0.016708193346858025, 0.015786752104759216, 0.015607455745339394, 0.015449312515556812, 0.015446463599801064, 0.015455200336873531, 0.015508351847529411, 0.016473280265927315, 0.015467967838048935, 0.015407584607601166, 0.015393920242786407, 0.015441760420799255, 0.015666943043470383, 0.015604863874614239, 0.015388128347694874, 0.015523936599493027, 0.015425760298967361, 0.016386207193136215, 0.016847264021635056, 0.016578560695052147, 0.016409022733569145, 0.016199840232729912, 0.015789279714226723, 0.015486880205571651, 0.01539977639913559, 0.016956929117441177, 0.016581375151872635, 0.01746956817805767], "latency": 2.903888032771647, "logprobs": [-9.358616828918457, -2.7474308013916016, -4.628000259399414, -1.5015846490859985, -0.6537986993789673, -1.6720777750015259, -2.478705883026123, -2.0523874759674072, -2.4486241340637207, -6.257688522338867, -1.4695018529891968, -3.4444499015808105, -4.394474029541016, -3.875497817993164, -2.0133562088012695, -1.8832889795303345, -3.8004486560821533, -6.784910678863525, -0.2949134111404419, -0.9851954579353333, -6.626471519470215, -7.186152458190918, -12.800604820251465, -2.2686400413513184, -3.7816011905670166, -0.4978560209274292, -4.371628284454346, -0.0696188285946846, -0.09487748891115189, -3.2375073432922363, -10.075444221496582, -1.138173222541809, -5.97689151763916, -5.093283653259277, -3.874396324157715, -2.6073620319366455, -3.466899871826172, -5.642228126525879, -1.6154727935791016, -5.416567325592041, -12.158267974853516, -12.610607147216797, -0.09664110094308853, -2.5213418006896973, -1.3747841119766235, -2.8510401248931885, -1.1877963542938232, -0.006288621574640274, -3.382380962371826, -13.207911491394043, -4.477662086486816, -2.5299136638641357, -6.053747653961182, -0.7650555372238159, -0.04903985932469368, -1.5557448863983154, -1.1315535306930542, -5.610307216644287, -0.4059771001338959, -4.961302280426025, -0.5701270699501038, -0.7174267172813416, -2.4735305309295654, -13.610812187194824, -0.09192369878292084, -3.5248732566833496, -1.3797900676727295, -6.429551124572754, -0.541852593421936, -3.5403199195861816, -0.8477706909179688, -1.5764057636260986, -5.343497276306152, -17.19588851928711, -6.635483741760254, -0.8923014402389526, -4.114314556121826, -1.2193646430969238, -2.2128424644470215, -1.7673423290252686, -0.22567729651927948, -9.320298194885254, -0.1282224804162979, -7.3249101638793945, -2.511319875717163, -4.0696563720703125, -3.5427517890930176, -1.9300249814987183, -2.347038507461548, -1.5178614854812622, -2.366441249847412, -1.744020938873291, -1.1570327281951904, -3.0150983333587646, -0.5272141098976135, -0.4669455885887146, -1.7157398462295532, -0.8362292051315308, -0.41491177678108215, -0.9386503100395203, -1.5008316040039062, -0.4635278284549713, -1.6312834024429321, -0.5320357084274292, -1.2249717712402344, -1.1707526445388794, -0.0023814670275896788, -1.1655761003494263, -0.006950841750949621, -0.7309689521789551, -0.7428325414657593, -0.042878177016973495, -0.8572992086410522, -0.01948782242834568, -2.0537290573120117, -1.2817553281784058, -0.8235744833946228]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..a4b870809ba --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", "generated_tokens": [3060, 2430, 1636, 2012, 1317, 1278, 2362, 1307, 1278, 16070, 1044, 1321, 1636, 23067, 1455, 1593, 1395, 1605, 3140, 5152, 1513, 1747, 1046, 2409, 1395, 3140, 5152, 1513, 1278, 2362], "tpot": [2.2565205097198486, 0.3516305685043335, 0.01722889579832554, 0.018507104367017746, 0.01656815968453884, 0.016881439834833145, 0.0166244488209486, 0.01648310385644436, 0.016350112855434418, 0.018141599372029305, 0.01638089492917061, 0.016720257699489594, 0.01646953634917736, 0.01641814410686493, 0.016365855932235718, 0.018089760094881058, 0.016283327713608742, 0.01690729521214962, 0.019018815830349922, 0.01721513643860817, 0.01676982268691063, 0.018497919663786888, 0.016406463459134102, 0.01895606331527233, 0.018566368147730827, 0.017292767763137817, 0.02004953660070896, 0.0188816636800766, 0.019935935735702515, 0.019367488101124763], "latency": 3.115501318126917, "logprobs": [-9.358616828918457, -2.7474308013916016, -4.628000259399414, -1.5015846490859985, -0.6537986993789673, -1.6720777750015259, -2.478705883026123, -2.0523874759674072, -2.4486241340637207, -6.257688522338867, -1.4695018529891968, -3.4444499015808105, -4.394474029541016, -3.875497817993164, -2.0133562088012695, -1.8832889795303345, -3.8004486560821533, -6.784910678863525, -0.2949134111404419, -0.9851954579353333, -6.626471519470215, -7.186152458190918, -12.800604820251465, -2.2686400413513184, -3.7816011905670166, -0.4978560209274292, -4.371628284454346, -0.0696188285946846, -0.09487748891115189, -3.2375073432922363, -10.075444221496582, -1.138173222541809, -5.97689151763916, -5.093283653259277, -3.874396324157715, -2.6073620319366455, -3.466899871826172, -5.642228126525879, -1.6154727935791016, -5.416567325592041, -12.158267974853516, -12.610607147216797, -0.09664110094308853, -2.5213418006896973, -1.3747841119766235, -2.8510401248931885, -1.1877963542938232, -0.006288621574640274, -3.382380962371826, -13.207911491394043, -4.477662086486816, -2.5299136638641357, -6.053747653961182, -0.7650555372238159, -0.04903985932469368, -1.5557448863983154, -1.1315535306930542, -5.610307216644287, -0.4059771001338959, -4.961302280426025, -0.5701270699501038, -0.7174267172813416, -2.4735305309295654, -13.610812187194824, -0.09192369878292084, -3.5248732566833496, -1.3797900676727295, -6.429551124572754, -0.541852593421936, -3.5403199195861816, -0.8477706909179688, -1.5764057636260986, -5.343497276306152, -17.19588851928711, -6.635483741760254, -0.8923014402389526, -4.114314556121826, -1.2193646430969238, -2.2128424644470215, -1.7673423290252686, -0.22567729651927948, -9.320298194885254, -0.1282224804162979, -7.3249101638793945, -2.511319875717163, -4.0696563720703125, -3.5427517890930176, -1.9300249814987183, -2.347038507461548, -1.5178614854812622, -2.366441249847412, -1.744020938873291, -1.1570327281951904, -3.0150983333587646, -0.5272141098976135, -0.4669455885887146, -1.7157398462295532, -0.8362292051315308, -0.41491177678108215, -0.9386503100395203, -1.5008316040039062, -0.4635278284549713, -1.6312834024429321, -0.5320357084274292, -1.2249717712402344, -1.1707526445388794, -0.0023814670275896788, -1.1655761003494263, -0.006950841750949621, -0.7309689521789551, -0.7428325414657593, -0.042878177016973495, -0.8572992086410522, -0.01948782242834568, -2.0537290573120117, -1.2817553281784058, -0.8235744833946228]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 22fca066f39..f9b98f41237 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.97439, "5": 11.00379, "10": 10.95244, "15": 10.85533, "20": 10.6403, "25": 10.25922, "30": 9.91482, "35": 9.70711, "40": 9.34219, "45": 9.00177, "50": 9.12586}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 21015.0, "5": 23387.0, "10": 19344.0, "15": 23461.0, "20": 21503.0, "25": 19506.0, "30": 20239.0, "35": 22142.0, "40": 24112.0, "45": 21801.0, "50": 27877.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3117478912.0, "5": 3117478912.0, "10": 3117478912.0, "15": 3117478912.0, "20": 3117478912.0, "25": 3117478912.0, "30": 3117478912.0, "35": 3117478912.0, "40": 3117478912.0, "45": 3117478912.0, "50": 3117478912.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9708208128.0, "5": 10145497088.0, "10": 10145497088.0, "15": 10145497088.0, "20": 10145497088.0, "25": 10145497088.0, "30": 10145497088.0, "35": 10145497088.0, "40": 10145497088.0, "45": 10145497088.0, "50": 10145497088.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 68.38039, "5": 0.15499, "10": 0.15766, "15": 0.15466, "20": 0.15575, "25": 0.15341, "30": 0.15715, "35": 0.16344, "40": 0.15691, "45": 0.18148, "50": 0.16344}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.97434, + "2": 10.976, + "3": 10.9787, + "4": 10.95784, + "5": 11.00373, + "6": 11.00618, + "7": 10.97996, + "8": 10.96861, + "9": 10.97919, + "10": 10.95244, + "11": 10.99935, + "12": 10.96821, + "13": 10.96591, + "14": 10.99543, + "15": 10.85545, + "16": 10.85544, + "17": 10.81736, + "18": 10.82741, + "19": 10.82166, + "20": 10.64041, + "21": 10.57938, + "22": 10.33552, + "23": 10.61311, + "24": 10.34969, + "25": 10.25934, + "26": 10.36367, + "27": 10.38735, + "28": 10.35703, + "29": 10.38231, + "30": 9.91506, + "31": 9.47491, + "32": 10.08956, + "33": 10.08418, + "34": 9.65437, + "35": 9.70727, + "36": 9.58843, + "37": 9.82211, + "38": 9.53615, + "39": 9.94103, + "40": 9.34234, + "41": 9.48854, + "42": 9.56996, + "43": 9.0355, + "44": 9.15623, + "45": 9.00188, + "46": 9.06394, + "47": 9.49292, + "48": 9.04259, + "49": 8.58802, + "50": 9.12597 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20919.0, + "2": 21891.0, + "3": 21096.0, + "4": 20712.0, + "5": 23549.0, + "6": 24113.0, + "7": 23323.0, + "8": 21849.0, + "9": 22954.0, + "10": 19196.0, + "11": 24647.0, + "12": 23707.0, + "13": 24320.0, + "14": 24596.0, + "15": 23689.0, + "16": 23647.0, + "17": 22594.0, + "18": 22957.0, + "19": 23469.0, + "20": 21794.0, + "21": 22831.0, + "22": 19274.0, + "23": 24548.0, + "24": 19712.0, + "25": 19775.0, + "26": 21249.0, + "27": 22519.0, + "28": 23834.0, + "29": 23280.0, + "30": 20509.0, + "31": 17408.0, + "32": 21974.0, + "33": 22884.0, + "34": 21870.0, + "35": 22283.0, + "36": 21004.0, + "37": 22759.0, + "38": 22719.0, + "39": 22051.0, + "40": 23748.0, + "41": 24092.0, + "42": 23517.0, + "43": 22267.0, + "44": 22001.0, + "45": 21520.0, + "46": 22824.0, + "47": 25650.0, + "48": 25468.0, + "49": 25463.0, + "50": 28240.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3117478912.0, + "2": 3117478912.0, + "3": 3117478912.0, + "4": 3117478912.0, + "5": 3117478912.0, + "6": 3117478912.0, + "7": 3117478912.0, + "8": 3117478912.0, + "9": 3117478912.0, + "10": 3117478912.0, + "11": 3117478912.0, + "12": 3117478912.0, + "13": 3117478912.0, + "14": 3117478912.0, + "15": 3117478912.0, + "16": 3117478912.0, + "17": 3117478912.0, + "18": 3117478912.0, + "19": 3117478912.0, + "20": 3117478912.0, + "21": 3117478912.0, + "22": 3117478912.0, + "23": 3117478912.0, + "24": 3117478912.0, + "25": 3117478912.0, + "26": 3117478912.0, + "27": 3117478912.0, + "28": 3117478912.0, + "29": 3117478912.0, + "30": 3117478912.0, + "31": 3117478912.0, + "32": 3117478912.0, + "33": 3117478912.0, + "34": 3117478912.0, + "35": 3117478912.0, + "36": 3117478912.0, + "37": 3117478912.0, + "38": 3117478912.0, + "39": 3117478912.0, + "40": 3117478912.0, + "41": 3117478912.0, + "42": 3117478912.0, + "43": 3117478912.0, + "44": 3117478912.0, + "45": 3117478912.0, + "46": 3117478912.0, + "47": 3117478912.0, + "48": 3117478912.0, + "49": 3117478912.0, + "50": 3117478912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9708208128.0, + "2": 10145497088.0, + "3": 10145497088.0, + "4": 10145497088.0, + "5": 10145497088.0, + "6": 10145497088.0, + "7": 10145497088.0, + "8": 10145497088.0, + "9": 10145497088.0, + "10": 10145497088.0, + "11": 10145497088.0, + "12": 10145497088.0, + "13": 10145497088.0, + "14": 10145497088.0, + "15": 10145497088.0, + "16": 10145497088.0, + "17": 10145497088.0, + "18": 10145497088.0, + "19": 10145497088.0, + "20": 10145497088.0, + "21": 10145497088.0, + "22": 10145497088.0, + "23": 10145497088.0, + "24": 10145497088.0, + "25": 10145497088.0, + "26": 10145497088.0, + "27": 10145497088.0, + "28": 10145497088.0, + "29": 10145497088.0, + "30": 10145497088.0, + "31": 10145497088.0, + "32": 10145497088.0, + "33": 10145497088.0, + "34": 10145497088.0, + "35": 10145497088.0, + "36": 10145497088.0, + "37": 10145497088.0, + "38": 10145497088.0, + "39": 10145497088.0, + "40": 10145497088.0, + "41": 10145497088.0, + "42": 10145497088.0, + "43": 10145497088.0, + "44": 10145497088.0, + "45": 10145497088.0, + "46": 10145497088.0, + "47": 10145497088.0, + "48": 10145497088.0, + "49": 10145497088.0, + "50": 10145497088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 75.46828, + "2": 0.20357, + "3": 0.19791, + "4": 0.20172, + "5": 0.17347, + "6": 0.17767, + "7": 0.18123, + "8": 0.18059, + "9": 0.18281, + "10": 0.17733, + "11": 1.43978, + "12": 0.16875, + "13": 0.17029, + "14": 0.16961, + "15": 0.16995, + "16": 0.16814, + "17": 0.16932, + "18": 0.16845, + "19": 0.16867, + "20": 0.1725, + "21": 1.37727, + "22": 0.16984, + "23": 0.16887, + "24": 0.17009, + "25": 0.17014, + "26": 0.16727, + "27": 0.16686, + "28": 0.16832, + "29": 0.16702, + "30": 0.17035, + "31": 1.37603, + "32": 0.17102, + "33": 0.16863, + "34": 0.17081, + "35": 0.17287, + "36": 0.1713, + "37": 0.17386, + "38": 0.16722, + "39": 0.17073, + "40": 0.17394, + "41": 1.39311, + "42": 0.17219, + "43": 0.1735, + "44": 0.18156, + "45": 0.17372, + "46": 0.17432, + "47": 0.17103, + "48": 0.172, + "49": 0.17515, + "50": 0.17623 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..5649c8c02c0 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.97433, + "2": 10.97599, + "3": 10.97873, + "4": 10.95776, + "5": 11.00374, + "6": 11.00622, + "7": 10.9799, + "8": 10.96858, + "9": 10.97924, + "10": 10.95251, + "11": 10.99936, + "12": 10.96824, + "13": 10.96591, + "14": 10.99554, + "15": 10.85561, + "16": 10.85538, + "17": 10.81726, + "18": 10.82754, + "19": 10.82158, + "20": 10.6404, + "21": 10.57926, + "22": 10.33548, + "23": 10.61314, + "24": 10.34966, + "25": 10.25929, + "26": 10.36381, + "27": 10.38733, + "28": 10.35697, + "29": 10.38233, + "30": 9.91499, + "31": 9.47474, + "32": 10.08958, + "33": 10.08413, + "34": 9.65424, + "35": 9.70719, + "36": 9.58835, + "37": 9.82205, + "38": 9.53609, + "39": 9.94086, + "40": 9.34225, + "41": 9.48846, + "42": 9.56986, + "43": 9.03547, + "44": 9.15612, + "45": 9.00184, + "46": 9.06401, + "47": 9.49282, + "48": 9.04255, + "49": 8.58799, + "50": 9.12592 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20988.0, + "2": 21880.0, + "3": 21325.0, + "4": 20724.0, + "5": 23551.0, + "6": 23815.0, + "7": 23302.0, + "8": 21521.0, + "9": 22934.0, + "10": 19185.0, + "11": 25126.0, + "12": 23590.0, + "13": 24504.0, + "14": 24677.0, + "15": 23380.0, + "16": 23738.0, + "17": 22330.0, + "18": 22602.0, + "19": 23748.0, + "20": 21759.0, + "21": 23060.0, + "22": 19355.0, + "23": 24789.0, + "24": 19586.0, + "25": 19683.0, + "26": 21141.0, + "27": 22031.0, + "28": 23567.0, + "29": 23130.0, + "30": 20321.0, + "31": 17223.0, + "32": 21718.0, + "33": 23067.0, + "34": 21566.0, + "35": 22023.0, + "36": 21047.0, + "37": 22678.0, + "38": 22771.0, + "39": 22336.0, + "40": 23698.0, + "41": 23997.0, + "42": 23556.0, + "43": 21934.0, + "44": 21967.0, + "45": 21610.0, + "46": 23283.0, + "47": 25289.0, + "48": 25472.0, + "49": 25458.0, + "50": 28167.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3117478912.0, + "2": 3117478912.0, + "3": 3117478912.0, + "4": 3117478912.0, + "5": 3117478912.0, + "6": 3117478912.0, + "7": 3117478912.0, + "8": 3117478912.0, + "9": 3117478912.0, + "10": 3117478912.0, + "11": 3117478912.0, + "12": 3117478912.0, + "13": 3117478912.0, + "14": 3117478912.0, + "15": 3117478912.0, + "16": 3117478912.0, + "17": 3117478912.0, + "18": 3117478912.0, + "19": 3117478912.0, + "20": 3117478912.0, + "21": 3117478912.0, + "22": 3117478912.0, + "23": 3117478912.0, + "24": 3117478912.0, + "25": 3117478912.0, + "26": 3117478912.0, + "27": 3117478912.0, + "28": 3117478912.0, + "29": 3117478912.0, + "30": 3117478912.0, + "31": 3117478912.0, + "32": 3117478912.0, + "33": 3117478912.0, + "34": 3117478912.0, + "35": 3117478912.0, + "36": 3117478912.0, + "37": 3117478912.0, + "38": 3117478912.0, + "39": 3117478912.0, + "40": 3117478912.0, + "41": 3117478912.0, + "42": 3117478912.0, + "43": 3117478912.0, + "44": 3117478912.0, + "45": 3117478912.0, + "46": 3117478912.0, + "47": 3117478912.0, + "48": 3117478912.0, + "49": 3117478912.0, + "50": 3117478912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9708208128.0, + "2": 10145497088.0, + "3": 10145497088.0, + "4": 10145497088.0, + "5": 10145497088.0, + "6": 10145497088.0, + "7": 10145497088.0, + "8": 10145497088.0, + "9": 10145497088.0, + "10": 10145497088.0, + "11": 10145497088.0, + "12": 10145497088.0, + "13": 10145497088.0, + "14": 10145497088.0, + "15": 10145497088.0, + "16": 10145497088.0, + "17": 10145497088.0, + "18": 10145497088.0, + "19": 10145497088.0, + "20": 10145497088.0, + "21": 10145497088.0, + "22": 10145497088.0, + "23": 10145497088.0, + "24": 10145497088.0, + "25": 10145497088.0, + "26": 10145497088.0, + "27": 10145497088.0, + "28": 10145497088.0, + "29": 10145497088.0, + "30": 10145497088.0, + "31": 10145497088.0, + "32": 10145497088.0, + "33": 10145497088.0, + "34": 10145497088.0, + "35": 10145497088.0, + "36": 10145497088.0, + "37": 10145497088.0, + "38": 10145497088.0, + "39": 10145497088.0, + "40": 10145497088.0, + "41": 10145497088.0, + "42": 10145497088.0, + "43": 10145497088.0, + "44": 10145497088.0, + "45": 10145497088.0, + "46": 10145497088.0, + "47": 10145497088.0, + "48": 10145497088.0, + "49": 10145497088.0, + "50": 10145497088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 71.98615, + "2": 0.17824, + "3": 0.15658, + "4": 0.15553, + "5": 0.15552, + "6": 0.15497, + "7": 0.15557, + "8": 0.1611, + "9": 0.15455, + "10": 0.15318, + "11": 1.21675, + "12": 0.15852, + "13": 0.15923, + "14": 0.15544, + "15": 0.15619, + "16": 0.15301, + "17": 0.15568, + "18": 0.15352, + "19": 0.15601, + "20": 0.15832, + "21": 1.19636, + "22": 0.15369, + "23": 0.16001, + "24": 0.49798, + "25": 0.1566, + "26": 0.15462, + "27": 0.15479, + "28": 0.15431, + "29": 0.15608, + "30": 0.15697, + "31": 1.19237, + "32": 0.18057, + "33": 0.1804, + "34": 0.63136, + "35": 0.15799, + "36": 0.1573, + "37": 0.15724, + "38": 0.15688, + "39": 0.15684, + "40": 0.15532, + "41": 1.20433, + "42": 0.1556, + "43": 0.15643, + "44": 0.47664, + "45": 0.15538, + "46": 0.15623, + "47": 0.15655, + "48": 0.15632, + "49": 0.15651, + "50": 0.15611 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..951506c1571 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.97443, + "2": 10.97602, + "3": 10.97873, + "4": 10.95791, + "5": 11.00372, + "6": 11.00622, + "7": 10.97989, + "8": 10.96858, + "9": 10.97927, + "10": 10.95244, + "11": 10.99932, + "12": 10.96821, + "13": 10.96575, + "14": 10.99547, + "15": 10.85548, + "16": 10.85544, + "17": 10.81733, + "18": 10.82754, + "19": 10.82177, + "20": 10.64038, + "21": 10.57929, + "22": 10.33542, + "23": 10.613, + "24": 10.3496, + "25": 10.2592, + "26": 10.36373, + "27": 10.38741, + "28": 10.35692, + "29": 10.38238, + "30": 9.91509, + "31": 9.47482, + "32": 10.0895, + "33": 10.08422, + "34": 9.65429, + "35": 9.70734, + "36": 9.58844, + "37": 9.82215, + "38": 9.53607, + "39": 9.94104, + "40": 9.3422, + "41": 9.48847, + "42": 9.56993, + "43": 9.03549, + "44": 9.15623, + "45": 9.00183, + "46": 9.06402, + "47": 9.49291, + "48": 9.04257, + "49": 8.58806, + "50": 9.12599 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21181.0, + "2": 22037.0, + "3": 21249.0, + "4": 20277.0, + "5": 23590.0, + "6": 24135.0, + "7": 23650.0, + "8": 21651.0, + "9": 22980.0, + "10": 19092.0, + "11": 25008.0, + "12": 23782.0, + "13": 24367.0, + "14": 24697.0, + "15": 23602.0, + "16": 23837.0, + "17": 22509.0, + "18": 22645.0, + "19": 23485.0, + "20": 21887.0, + "21": 22872.0, + "22": 19313.0, + "23": 24389.0, + "24": 19718.0, + "25": 19814.0, + "26": 21274.0, + "27": 22560.0, + "28": 23731.0, + "29": 23099.0, + "30": 19997.0, + "31": 17111.0, + "32": 22093.0, + "33": 23200.0, + "34": 21525.0, + "35": 21837.0, + "36": 21070.0, + "37": 22975.0, + "38": 22727.0, + "39": 22485.0, + "40": 23583.0, + "41": 24012.0, + "42": 23529.0, + "43": 22092.0, + "44": 21911.0, + "45": 21790.0, + "46": 23173.0, + "47": 25505.0, + "48": 25316.0, + "49": 25527.0, + "50": 28117.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3117478912.0, + "2": 3117478912.0, + "3": 3117478912.0, + "4": 3117478912.0, + "5": 3117478912.0, + "6": 3117478912.0, + "7": 3117478912.0, + "8": 3117478912.0, + "9": 3117478912.0, + "10": 3117478912.0, + "11": 3117478912.0, + "12": 3117478912.0, + "13": 3117478912.0, + "14": 3117478912.0, + "15": 3117478912.0, + "16": 3117478912.0, + "17": 3117478912.0, + "18": 3117478912.0, + "19": 3117478912.0, + "20": 3117478912.0, + "21": 3117478912.0, + "22": 3117478912.0, + "23": 3117478912.0, + "24": 3117478912.0, + "25": 3117478912.0, + "26": 3117478912.0, + "27": 3117478912.0, + "28": 3117478912.0, + "29": 3117478912.0, + "30": 3117478912.0, + "31": 3117478912.0, + "32": 3117478912.0, + "33": 3117478912.0, + "34": 3117478912.0, + "35": 3117478912.0, + "36": 3117478912.0, + "37": 3117478912.0, + "38": 3117478912.0, + "39": 3117478912.0, + "40": 3117478912.0, + "41": 3117478912.0, + "42": 3117478912.0, + "43": 3117478912.0, + "44": 3117478912.0, + "45": 3117478912.0, + "46": 3117478912.0, + "47": 3117478912.0, + "48": 3117478912.0, + "49": 3117478912.0, + "50": 3117478912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9708208128.0, + "2": 10145497088.0, + "3": 10145497088.0, + "4": 10145497088.0, + "5": 10145497088.0, + "6": 10145497088.0, + "7": 10145497088.0, + "8": 10145497088.0, + "9": 10145497088.0, + "10": 10145497088.0, + "11": 10145497088.0, + "12": 10145497088.0, + "13": 10145497088.0, + "14": 10145497088.0, + "15": 10145497088.0, + "16": 10145497088.0, + "17": 10145497088.0, + "18": 10145497088.0, + "19": 10145497088.0, + "20": 10145497088.0, + "21": 10145497088.0, + "22": 10145497088.0, + "23": 10145497088.0, + "24": 10145497088.0, + "25": 10145497088.0, + "26": 10145497088.0, + "27": 10145497088.0, + "28": 10145497088.0, + "29": 10145497088.0, + "30": 10145497088.0, + "31": 10145497088.0, + "32": 10145497088.0, + "33": 10145497088.0, + "34": 10145497088.0, + "35": 10145497088.0, + "36": 10145497088.0, + "37": 10145497088.0, + "38": 10145497088.0, + "39": 10145497088.0, + "40": 10145497088.0, + "41": 10145497088.0, + "42": 10145497088.0, + "43": 10145497088.0, + "44": 10145497088.0, + "45": 10145497088.0, + "46": 10145497088.0, + "47": 10145497088.0, + "48": 10145497088.0, + "49": 10145497088.0, + "50": 10145497088.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 74.91474, + "2": 0.1754, + "3": 0.17452, + "4": 0.16679, + "5": 0.16348, + "6": 0.16445, + "7": 0.16736, + "8": 0.16603, + "9": 0.16532, + "10": 0.16307, + "11": 1.37857, + "12": 0.16928, + "13": 0.53834, + "14": 0.57224, + "15": 0.16953, + "16": 0.16333, + "17": 0.16457, + "18": 0.16634, + "19": 0.51067, + "20": 0.16795, + "21": 1.3646, + "22": 0.16877, + "23": 0.16233, + "24": 0.16456, + "25": 0.16106, + "26": 0.16403, + "27": 0.16543, + "28": 0.52927, + "29": 0.16526, + "30": 0.16671, + "31": 1.34815, + "32": 0.1712, + "33": 0.16615, + "34": 0.16654, + "35": 0.16776, + "36": 0.16433, + "37": 0.16743, + "38": 0.5814, + "39": 0.17894, + "40": 0.16539, + "41": 1.61892, + "42": 0.1694, + "43": 0.16828, + "44": 0.16546, + "45": 0.16549, + "46": 0.16556, + "47": 0.51526, + "48": 0.16791, + "49": 0.16886, + "50": 0.16634 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 31600632301..66d5b70c4e7 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.98115, + "2": 10.98342, + "3": 10.97937, + "4": 10.95855, "5": 10.99632, - "10": 10.94823, - "15": 10.85384, - "20": 10.61864, - "25": 10.23212, - "30": 9.88866, - "35": 9.64741, - "40": 9.29934, - "45": 8.9649, - "50": 9.11107 + "6": 11.00381, + "7": 10.98294, + "8": 10.97489, + "9": 10.97741, + "10": 10.94819, + "11": 10.99293, + "12": 10.96683, + "13": 10.97205, + "14": 10.97917, + "15": 10.85381, + "16": 10.85123, + "17": 10.80904, + "18": 10.82571, + "19": 10.80813, + "20": 10.61863, + "21": 10.56868, + "22": 10.31924, + "23": 10.59307, + "24": 10.33426, + "25": 10.23213, + "26": 10.34313, + "27": 10.34586, + "28": 10.32458, + "29": 10.336, + "30": 9.88868, + "31": 9.42985, + "32": 10.0556, + "33": 10.04592, + "34": 9.60415, + "35": 9.64742, + "36": 9.5255, + "37": 9.7709, + "38": 9.49245, + "39": 9.87216, + "40": 9.29935, + "41": 9.44523, + "42": 9.52844, + "43": 9.015, + "44": 9.13046, + "45": 8.96483, + "46": 9.02876, + "47": 9.45483, + "48": 9.0228, + "49": 8.56611, + "50": 9.11105 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 21057.0, - "5": 23384.0, - "10": 18836.0, - "15": 23361.0, - "20": 21198.0, - "25": 19270.0, - "30": 19749.0, - "35": 21428.0, - "40": 23790.0, - "45": 22634.0, - "50": 27374.0 + "2": 22047.0, + "3": 21328.0, + "4": 20691.0, + "5": 23440.0, + "6": 23720.0, + "7": 23130.0, + "8": 21638.0, + "9": 22493.0, + "10": 18970.0, + "11": 24200.0, + "12": 23107.0, + "13": 24299.0, + "14": 24369.0, + "15": 23049.0, + "16": 23303.0, + "17": 21870.0, + "18": 22441.0, + "19": 23208.0, + "20": 21271.0, + "21": 22375.0, + "22": 19133.0, + "23": 23782.0, + "24": 19264.0, + "25": 19271.0, + "26": 20494.0, + "27": 21625.0, + "28": 23068.0, + "29": 22509.0, + "30": 19530.0, + "31": 16898.0, + "32": 21514.0, + "33": 22417.0, + "34": 21007.0, + "35": 21257.0, + "36": 20531.0, + "37": 23012.0, + "38": 22644.0, + "39": 22981.0, + "40": 23871.0, + "41": 23909.0, + "42": 23938.0, + "43": 22901.0, + "44": 22451.0, + "45": 22771.0, + "46": 23764.0, + "47": 25110.0, + "48": 26221.0, + "49": 26736.0, + "50": 27671.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1917381632.0, + "2": 1917381632.0, + "3": 1917381632.0, + "4": 1917381632.0, "5": 1917381632.0, + "6": 1917381632.0, + "7": 1917381632.0, + "8": 1917381632.0, + "9": 1917381632.0, "10": 1917381632.0, + "11": 1917381632.0, + "12": 1917381632.0, + "13": 1917381632.0, + "14": 1917381632.0, "15": 1917381632.0, + "16": 1917381632.0, + "17": 1917381632.0, + "18": 1917381632.0, + "19": 1917381632.0, "20": 1917381632.0, + "21": 1917381632.0, + "22": 1917381632.0, + "23": 1917381632.0, + "24": 1917381632.0, "25": 1917381632.0, + "26": 1917381632.0, + "27": 1917381632.0, + "28": 1917381632.0, + "29": 1917381632.0, "30": 1917381632.0, + "31": 1917381632.0, + "32": 1917381632.0, + "33": 1917381632.0, + "34": 1917381632.0, "35": 1917381632.0, + "36": 1917381632.0, + "37": 1917381632.0, + "38": 1917381632.0, + "39": 1917381632.0, "40": 1917381632.0, + "41": 1917381632.0, + "42": 1917381632.0, + "43": 1917381632.0, + "44": 1917381632.0, "45": 1917381632.0, + "46": 1917381632.0, + "47": 1917381632.0, + "48": 1917381632.0, + "49": 1917381632.0, "50": 1917381632.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5502737408.0, + "2": 5907581952.0, + "3": 5907581952.0, + "4": 5907581952.0, "5": 5907581952.0, + "6": 5907581952.0, + "7": 5907581952.0, + "8": 5907581952.0, + "9": 5907581952.0, "10": 5907581952.0, + "11": 5907581952.0, + "12": 5907581952.0, + "13": 5907581952.0, + "14": 5907581952.0, "15": 5907581952.0, + "16": 5907581952.0, + "17": 5907581952.0, + "18": 5907581952.0, + "19": 5907581952.0, "20": 5907581952.0, + "21": 5907581952.0, + "22": 5907581952.0, + "23": 5907581952.0, + "24": 5907581952.0, "25": 5907581952.0, + "26": 5907581952.0, + "27": 5907581952.0, + "28": 5907581952.0, + "29": 5907581952.0, "30": 5907581952.0, + "31": 5907581952.0, + "32": 5907581952.0, + "33": 5907581952.0, + "34": 5907581952.0, "35": 5907581952.0, + "36": 5907581952.0, + "37": 5907581952.0, + "38": 5907581952.0, + "39": 5907581952.0, "40": 5907581952.0, + "41": 5907581952.0, + "42": 5907581952.0, + "43": 5907581952.0, + "44": 5907581952.0, "45": 5907581952.0, + "46": 5907581952.0, + "47": 5907581952.0, + "48": 5907581952.0, + "49": 5907581952.0, "50": 5907581952.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 23.78025, - "5": 0.2726, - "10": 0.28342, - "15": 0.27548, - "20": 0.27217, - "25": 0.27174, - "30": 0.27238, - "35": 0.26859, - "40": 0.27106, - "45": 0.27295, - "50": 0.27446 + "1": 77.32153, + "2": 0.35381, + "3": 0.31954, + "4": 0.31994, + "5": 0.32133, + "6": 0.32343, + "7": 0.63691, + "8": 0.32502, + "9": 0.32218, + "10": 0.31839, + "11": 1.20693, + "12": 0.33292, + "13": 0.32979, + "14": 0.31793, + "15": 0.32907, + "16": 0.31632, + "17": 0.3213, + "18": 0.32431, + "19": 0.68468, + "20": 0.32501, + "21": 0.91375, + "22": 0.32148, + "23": 0.32164, + "24": 0.32358, + "25": 0.32444, + "26": 0.31929, + "27": 0.32159, + "28": 0.32567, + "29": 0.31799, + "30": 0.36795, + "31": 0.98526, + "32": 0.32231, + "33": 0.31619, + "34": 0.31784, + "35": 0.31943, + "36": 0.31897, + "37": 0.31509, + "38": 0.33279, + "39": 0.32732, + "40": 0.31631, + "41": 0.91813, + "42": 0.32108, + "43": 0.31789, + "44": 0.31862, + "45": 0.32451, + "46": 0.31705, + "47": 0.31711, + "48": 0.32216, + "49": 0.31997, + "50": 0.31833 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..5f9d24a49c3 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.98115, + "2": 10.98342, + "3": 10.97937, + "4": 10.95855, + "5": 10.99622, + "6": 11.00384, + "7": 10.98297, + "8": 10.97483, + "9": 10.97753, + "10": 10.94815, + "11": 10.99296, + "12": 10.9669, + "13": 10.97214, + "14": 10.97925, + "15": 10.85387, + "16": 10.85117, + "17": 10.80894, + "18": 10.82573, + "19": 10.80812, + "20": 10.61863, + "21": 10.56868, + "22": 10.31918, + "23": 10.59297, + "24": 10.33422, + "25": 10.23218, + "26": 10.34314, + "27": 10.34572, + "28": 10.32477, + "29": 10.33598, + "30": 9.88873, + "31": 9.42999, + "32": 10.05561, + "33": 10.04589, + "34": 9.60423, + "35": 9.64746, + "36": 9.52548, + "37": 9.77088, + "38": 9.49242, + "39": 9.87225, + "40": 9.29943, + "41": 9.44525, + "42": 9.5284, + "43": 9.01502, + "44": 9.13045, + "45": 8.96484, + "46": 9.02877, + "47": 9.45487, + "48": 9.02277, + "49": 8.56605, + "50": 9.11107 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21057.0, + "2": 22047.0, + "3": 21328.0, + "4": 20740.0, + "5": 23155.0, + "6": 23469.0, + "7": 22812.0, + "8": 21546.0, + "9": 22384.0, + "10": 18987.0, + "11": 24537.0, + "12": 23328.0, + "13": 24082.0, + "14": 24376.0, + "15": 23046.0, + "16": 23314.0, + "17": 21746.0, + "18": 22157.0, + "19": 23070.0, + "20": 21363.0, + "21": 22466.0, + "22": 18866.0, + "23": 24216.0, + "24": 19337.0, + "25": 19268.0, + "26": 20380.0, + "27": 21682.0, + "28": 23020.0, + "29": 22578.0, + "30": 20050.0, + "31": 16804.0, + "32": 21380.0, + "33": 22738.0, + "34": 20871.0, + "35": 21397.0, + "36": 20460.0, + "37": 22858.0, + "38": 22666.0, + "39": 22907.0, + "40": 23932.0, + "41": 23824.0, + "42": 23844.0, + "43": 22807.0, + "44": 22751.0, + "45": 22450.0, + "46": 23609.0, + "47": 25413.0, + "48": 26266.0, + "49": 26747.0, + "50": 27543.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1917381632.0, + "2": 1917381632.0, + "3": 1917381632.0, + "4": 1917381632.0, + "5": 1917381632.0, + "6": 1917381632.0, + "7": 1917381632.0, + "8": 1917381632.0, + "9": 1917381632.0, + "10": 1917381632.0, + "11": 1917381632.0, + "12": 1917381632.0, + "13": 1917381632.0, + "14": 1917381632.0, + "15": 1917381632.0, + "16": 1917381632.0, + "17": 1917381632.0, + "18": 1917381632.0, + "19": 1917381632.0, + "20": 1917381632.0, + "21": 1917381632.0, + "22": 1917381632.0, + "23": 1917381632.0, + "24": 1917381632.0, + "25": 1917381632.0, + "26": 1917381632.0, + "27": 1917381632.0, + "28": 1917381632.0, + "29": 1917381632.0, + "30": 1917381632.0, + "31": 1917381632.0, + "32": 1917381632.0, + "33": 1917381632.0, + "34": 1917381632.0, + "35": 1917381632.0, + "36": 1917381632.0, + "37": 1917381632.0, + "38": 1917381632.0, + "39": 1917381632.0, + "40": 1917381632.0, + "41": 1917381632.0, + "42": 1917381632.0, + "43": 1917381632.0, + "44": 1917381632.0, + "45": 1917381632.0, + "46": 1917381632.0, + "47": 1917381632.0, + "48": 1917381632.0, + "49": 1917381632.0, + "50": 1917381632.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5502737408.0, + "2": 5907581952.0, + "3": 5907581952.0, + "4": 5907581952.0, + "5": 5907581952.0, + "6": 5907581952.0, + "7": 5907581952.0, + "8": 5907581952.0, + "9": 5907581952.0, + "10": 5907581952.0, + "11": 5907581952.0, + "12": 5907581952.0, + "13": 5907581952.0, + "14": 5907581952.0, + "15": 5907581952.0, + "16": 5907581952.0, + "17": 5907581952.0, + "18": 5907581952.0, + "19": 5907581952.0, + "20": 5907581952.0, + "21": 5907581952.0, + "22": 5907581952.0, + "23": 5907581952.0, + "24": 5907581952.0, + "25": 5907581952.0, + "26": 5907581952.0, + "27": 5907581952.0, + "28": 5907581952.0, + "29": 5907581952.0, + "30": 5907581952.0, + "31": 5907581952.0, + "32": 5907581952.0, + "33": 5907581952.0, + "34": 5907581952.0, + "35": 5907581952.0, + "36": 5907581952.0, + "37": 5907581952.0, + "38": 5907581952.0, + "39": 5907581952.0, + "40": 5907581952.0, + "41": 5907581952.0, + "42": 5907581952.0, + "43": 5907581952.0, + "44": 5907581952.0, + "45": 5907581952.0, + "46": 5907581952.0, + "47": 5907581952.0, + "48": 5907581952.0, + "49": 5907581952.0, + "50": 5907581952.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 72.69145, + "2": 0.31162, + "3": 0.65164, + "4": 0.29871, + "5": 0.29932, + "6": 0.29668, + "7": 0.29179, + "8": 0.29409, + "9": 0.29759, + "10": 0.30183, + "11": 0.84375, + "12": 0.2964, + "13": 0.29589, + "14": 0.29688, + "15": 0.30127, + "16": 0.29716, + "17": 0.29351, + "18": 0.29429, + "19": 0.29751, + "20": 0.29471, + "21": 1.36793, + "22": 0.29834, + "23": 0.29442, + "24": 0.29321, + "25": 0.29912, + "26": 0.29631, + "27": 0.29343, + "28": 0.29975, + "29": 0.29701, + "30": 0.67685, + "31": 0.82445, + "32": 0.29588, + "33": 0.79672, + "34": 0.30556, + "35": 0.29842, + "36": 0.29717, + "37": 0.29457, + "38": 0.29527, + "39": 0.29757, + "40": 0.29426, + "41": 0.82657, + "42": 0.29634, + "43": 0.29423, + "44": 0.30131, + "45": 0.30554, + "46": 0.29682, + "47": 0.29317, + "48": 0.29446, + "49": 0.29791, + "50": 0.2949 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..f9118a22780 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.98115, + "2": 10.98342, + "3": 10.9794, + "4": 10.95853, + "5": 10.99622, + "6": 11.00371, + "7": 10.98299, + "8": 10.9748, + "9": 10.97742, + "10": 10.94806, + "11": 10.99306, + "12": 10.96672, + "13": 10.97199, + "14": 10.97915, + "15": 10.85402, + "16": 10.85122, + "17": 10.8089, + "18": 10.82572, + "19": 10.8081, + "20": 10.61854, + "21": 10.56862, + "22": 10.31926, + "23": 10.59295, + "24": 10.3343, + "25": 10.23216, + "26": 10.34315, + "27": 10.34581, + "28": 10.3247, + "29": 10.336, + "30": 9.88877, + "31": 9.42992, + "32": 10.05572, + "33": 10.0459, + "34": 9.6042, + "35": 9.64743, + "36": 9.52544, + "37": 9.77085, + "38": 9.49252, + "39": 9.87217, + "40": 9.29929, + "41": 9.44531, + "42": 9.52839, + "43": 9.01499, + "44": 9.13044, + "45": 8.96478, + "46": 9.02875, + "47": 9.45483, + "48": 9.02282, + "49": 8.56615, + "50": 9.11114 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 21211.0, + "2": 22047.0, + "3": 20892.0, + "4": 20624.0, + "5": 23413.0, + "6": 23493.0, + "7": 22797.0, + "8": 21401.0, + "9": 22665.0, + "10": 19047.0, + "11": 24508.0, + "12": 23266.0, + "13": 24271.0, + "14": 24293.0, + "15": 22782.0, + "16": 23282.0, + "17": 21824.0, + "18": 22133.0, + "19": 23099.0, + "20": 21505.0, + "21": 22490.0, + "22": 18675.0, + "23": 23908.0, + "24": 19148.0, + "25": 19388.0, + "26": 20532.0, + "27": 21766.0, + "28": 22571.0, + "29": 22352.0, + "30": 19883.0, + "31": 16703.0, + "32": 21084.0, + "33": 22377.0, + "34": 20576.0, + "35": 21216.0, + "36": 20603.0, + "37": 22812.0, + "38": 22830.0, + "39": 22708.0, + "40": 23830.0, + "41": 24061.0, + "42": 24003.0, + "43": 22790.0, + "44": 22703.0, + "45": 22360.0, + "46": 23642.0, + "47": 25112.0, + "48": 26185.0, + "49": 26666.0, + "50": 27765.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1917381632.0, + "2": 1917381632.0, + "3": 1917381632.0, + "4": 1917381632.0, + "5": 1917381632.0, + "6": 1917381632.0, + "7": 1917381632.0, + "8": 1917381632.0, + "9": 1917381632.0, + "10": 1917381632.0, + "11": 1917381632.0, + "12": 1917381632.0, + "13": 1917381632.0, + "14": 1917381632.0, + "15": 1917381632.0, + "16": 1917381632.0, + "17": 1917381632.0, + "18": 1917381632.0, + "19": 1917381632.0, + "20": 1917381632.0, + "21": 1917381632.0, + "22": 1917381632.0, + "23": 1917381632.0, + "24": 1917381632.0, + "25": 1917381632.0, + "26": 1917381632.0, + "27": 1917381632.0, + "28": 1917381632.0, + "29": 1917381632.0, + "30": 1917381632.0, + "31": 1917381632.0, + "32": 1917381632.0, + "33": 1917381632.0, + "34": 1917381632.0, + "35": 1917381632.0, + "36": 1917381632.0, + "37": 1917381632.0, + "38": 1917381632.0, + "39": 1917381632.0, + "40": 1917381632.0, + "41": 1917381632.0, + "42": 1917381632.0, + "43": 1917381632.0, + "44": 1917381632.0, + "45": 1917381632.0, + "46": 1917381632.0, + "47": 1917381632.0, + "48": 1917381632.0, + "49": 1917381632.0, + "50": 1917381632.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5502737408.0, + "2": 5907581952.0, + "3": 5907581952.0, + "4": 5907581952.0, + "5": 5907581952.0, + "6": 5907581952.0, + "7": 5907581952.0, + "8": 5907581952.0, + "9": 5907581952.0, + "10": 5907581952.0, + "11": 5907581952.0, + "12": 5907581952.0, + "13": 5907581952.0, + "14": 5907581952.0, + "15": 5907581952.0, + "16": 5907581952.0, + "17": 5907581952.0, + "18": 5907581952.0, + "19": 5907581952.0, + "20": 5907581952.0, + "21": 5907581952.0, + "22": 5907581952.0, + "23": 5907581952.0, + "24": 5907581952.0, + "25": 5907581952.0, + "26": 5907581952.0, + "27": 5907581952.0, + "28": 5907581952.0, + "29": 5907581952.0, + "30": 5907581952.0, + "31": 5907581952.0, + "32": 5907581952.0, + "33": 5907581952.0, + "34": 5907581952.0, + "35": 5907581952.0, + "36": 5907581952.0, + "37": 5907581952.0, + "38": 5907581952.0, + "39": 5907581952.0, + "40": 5907581952.0, + "41": 5907581952.0, + "42": 5907581952.0, + "43": 5907581952.0, + "44": 5907581952.0, + "45": 5907581952.0, + "46": 5907581952.0, + "47": 5907581952.0, + "48": 5907581952.0, + "49": 5907581952.0, + "50": 5907581952.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 76.70816, + "2": 0.44479, + "3": 0.37638, + "4": 0.32493, + "5": 0.32865, + "6": 0.3221, + "7": 0.33027, + "8": 0.32627, + "9": 0.69409, + "10": 0.66689, + "11": 0.94476, + "12": 0.6757, + "13": 0.32571, + "14": 0.3194, + "15": 0.31954, + "16": 0.32142, + "17": 0.32144, + "18": 0.3188, + "19": 0.32023, + "20": 0.70348, + "21": 1.36061, + "22": 0.32306, + "23": 0.32129, + "24": 0.31927, + "25": 0.32503, + "26": 0.322, + "27": 0.31994, + "28": 0.32043, + "29": 0.31651, + "30": 0.31907, + "31": 1.31856, + "32": 0.32016, + "33": 0.31758, + "34": 0.31966, + "35": 0.31765, + "36": 0.31717, + "37": 0.3191, + "38": 0.31591, + "39": 0.3156, + "40": 0.31599, + "41": 0.90957, + "42": 0.32017, + "43": 0.31902, + "44": 0.32013, + "45": 0.32183, + "46": 0.31561, + "47": 0.31628, + "48": 0.31911, + "49": 0.31753, + "50": 0.31636 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 5cd925750cf..42f6add1cac 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.98296, - "5": 10.99794, - "10": 10.94509, - "15": 10.85381, - "20": 10.6219, - "25": 10.23314, - "30": 9.8856, - "35": 9.64989, - "40": 9.30025, - "45": 8.96819, - "50": 9.10987 + "2": 10.98234, + "3": 10.98048, + "4": 10.96506, + "5": 10.99783, + "6": 11.00523, + "7": 10.98269, + "8": 10.97586, + "9": 10.97815, + "10": 10.9452, + "11": 10.9926, + "12": 10.96812, + "13": 10.97042, + "14": 10.98195, + "15": 10.85378, + "16": 10.85001, + "17": 10.80676, + "18": 10.82651, + "19": 10.81114, + "20": 10.62181, + "21": 10.56061, + "22": 10.32111, + "23": 10.59523, + "24": 10.32471, + "25": 10.23316, + "26": 10.33835, + "27": 10.34872, + "28": 10.32088, + "29": 10.33079, + "30": 9.88567, + "31": 9.43004, + "32": 10.05321, + "33": 10.0429, + "34": 9.60531, + "35": 9.64985, + "36": 9.52945, + "37": 9.76829, + "38": 9.48586, + "39": 9.87467, + "40": 9.30029, + "41": 9.44905, + "42": 9.52868, + "43": 9.01596, + "44": 9.12962, + "45": 8.96833, + "46": 9.03055, + "47": 9.45737, + "48": 9.02116, + "49": 8.569, + "50": 9.10992 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 3065.0, - "5": 3271.0, - "10": 2863.0, - "15": 3164.0, - "20": 3031.0, - "25": 2758.0, - "30": 2675.0, - "35": 2939.0, - "40": 3121.0, - "45": 2957.0, - "50": 3391.0 + "1": 2981.0, + "2": 3050.0, + "3": 3036.0, + "4": 2803.0, + "5": 3277.0, + "6": 3332.0, + "7": 3180.0, + "8": 3031.0, + "9": 3010.0, + "10": 2837.0, + "11": 3454.0, + "12": 3290.0, + "13": 3425.0, + "14": 3543.0, + "15": 3264.0, + "16": 3165.0, + "17": 3109.0, + "18": 3150.0, + "19": 3225.0, + "20": 3006.0, + "21": 3072.0, + "22": 2636.0, + "23": 3329.0, + "24": 2773.0, + "25": 2778.0, + "26": 2782.0, + "27": 3018.0, + "28": 3154.0, + "29": 3221.0, + "30": 2661.0, + "31": 2317.0, + "32": 3059.0, + "33": 3139.0, + "34": 2875.0, + "35": 2919.0, + "36": 2956.0, + "37": 3114.0, + "38": 3011.0, + "39": 3102.0, + "40": 3052.0, + "41": 3056.0, + "42": 3312.0, + "43": 2849.0, + "44": 2950.0, + "45": 2930.0, + "46": 2991.0, + "47": 3237.0, + "48": 3285.0, + "49": 3389.0, + "50": 3341.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1917251584.0, + "2": 1917251584.0, + "3": 1917251584.0, + "4": 1917251584.0, "5": 1917251584.0, + "6": 1917251584.0, + "7": 1917251584.0, + "8": 1917251584.0, + "9": 1917251584.0, "10": 1917251584.0, + "11": 1917251584.0, + "12": 1917251584.0, + "13": 1917251584.0, + "14": 1917251584.0, "15": 1917251584.0, + "16": 1917251584.0, + "17": 1917251584.0, + "18": 1917251584.0, + "19": 1917251584.0, "20": 1917251584.0, + "21": 1917251584.0, + "22": 1917251584.0, + "23": 1917251584.0, + "24": 1917251584.0, "25": 1917251584.0, + "26": 1917251584.0, + "27": 1917251584.0, + "28": 1917251584.0, + "29": 1917251584.0, "30": 1917251584.0, + "31": 1917251584.0, + "32": 1917251584.0, + "33": 1917251584.0, + "34": 1917251584.0, "35": 1917251584.0, + "36": 1917251584.0, + "37": 1917251584.0, + "38": 1917251584.0, + "39": 1917251584.0, "40": 1917251584.0, + "41": 1917251584.0, + "42": 1917251584.0, + "43": 1917251584.0, + "44": 1917251584.0, "45": 1917251584.0, + "46": 1917251584.0, + "47": 1917251584.0, + "48": 1917251584.0, + "49": 1917251584.0, "50": 1917251584.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2520653312.0, + "2": 2743788032.0, + "3": 2743788032.0, + "4": 2743788032.0, "5": 2743788032.0, + "6": 2743788032.0, + "7": 2743788032.0, + "8": 2743788032.0, + "9": 2743788032.0, "10": 2743788032.0, + "11": 2743788032.0, + "12": 2743788032.0, + "13": 2743788032.0, + "14": 2743788032.0, "15": 2743788032.0, + "16": 2743788032.0, + "17": 2743788032.0, + "18": 2743788032.0, + "19": 2743788032.0, "20": 2743788032.0, + "21": 2743788032.0, + "22": 2743788032.0, + "23": 2743788032.0, + "24": 2743788032.0, "25": 2743788032.0, + "26": 2743788032.0, + "27": 2743788032.0, + "28": 2743788032.0, + "29": 2743788032.0, "30": 2743788032.0, + "31": 2743788032.0, + "32": 2743788032.0, + "33": 2743788032.0, + "34": 2743788032.0, "35": 2743788032.0, + "36": 2743788032.0, + "37": 2743788032.0, + "38": 2743788032.0, + "39": 2743788032.0, "40": 2743788032.0, + "41": 2743788032.0, + "42": 2743788032.0, + "43": 2743788032.0, + "44": 2743788032.0, "45": 2743788032.0, + "46": 2743788032.0, + "47": 2743788032.0, + "48": 2743788032.0, + "49": 2743788032.0, "50": 2743788032.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 36.93776, - "5": 1.65475, - "10": 1.62769, - "15": 1.33667, - "20": 1.33944, - "25": 1.33881, - "30": 1.33786, - "35": 1.35864, - "40": 1.36521, - "45": 1.38143, - "50": 1.35158 + "1": 93.29155, + "2": 1.49946, + "3": 1.49367, + "4": 1.4955, + "5": 1.49263, + "6": 1.48524, + "7": 1.54794, + "8": 1.57222, + "9": 1.48844, + "10": 1.48601, + "11": 2.09056, + "12": 1.49068, + "13": 1.57264, + "14": 1.49736, + "15": 1.48278, + "16": 1.48267, + "17": 1.48508, + "18": 1.48364, + "19": 1.48751, + "20": 1.61513, + "21": 2.08969, + "22": 1.48879, + "23": 1.48515, + "24": 1.48483, + "25": 1.48865, + "26": 1.57806, + "27": 1.51158, + "28": 1.49095, + "29": 1.49422, + "30": 1.48732, + "31": 2.0932, + "32": 1.5259, + "33": 1.56274, + "34": 1.48919, + "35": 1.48483, + "36": 1.49146, + "37": 1.48123, + "38": 1.48759, + "39": 1.56751, + "40": 1.51104, + "41": 2.08583, + "42": 1.48897, + "43": 1.48816, + "44": 1.49366, + "45": 1.50945, + "46": 1.59565, + "47": 1.49573, + "48": 1.48593, + "49": 1.49004, + "50": 1.49426 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..c10a5cde1e8 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.98296, + "2": 10.98234, + "3": 10.98053, + "4": 10.96517, + "5": 10.9979, + "6": 11.00523, + "7": 10.98274, + "8": 10.97592, + "9": 10.97818, + "10": 10.94511, + "11": 10.99258, + "12": 10.96821, + "13": 10.97041, + "14": 10.98206, + "15": 10.85379, + "16": 10.84986, + "17": 10.8067, + "18": 10.82647, + "19": 10.81124, + "20": 10.62204, + "21": 10.56064, + "22": 10.32092, + "23": 10.59523, + "24": 10.32467, + "25": 10.2333, + "26": 10.33822, + "27": 10.34883, + "28": 10.32085, + "29": 10.33072, + "30": 9.88565, + "31": 9.43005, + "32": 10.05329, + "33": 10.04284, + "34": 9.60526, + "35": 9.64982, + "36": 9.52942, + "37": 9.7683, + "38": 9.48583, + "39": 9.87461, + "40": 9.30023, + "41": 9.44902, + "42": 9.52875, + "43": 9.01605, + "44": 9.12966, + "45": 8.96824, + "46": 9.03047, + "47": 9.45728, + "48": 9.02121, + "49": 8.56895, + "50": 9.1099 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2975.0, + "2": 3053.0, + "3": 3035.0, + "4": 2876.0, + "5": 3232.0, + "6": 3471.0, + "7": 3136.0, + "8": 3055.0, + "9": 3098.0, + "10": 2850.0, + "11": 3481.0, + "12": 3323.0, + "13": 3340.0, + "14": 3441.0, + "15": 3128.0, + "16": 3234.0, + "17": 2908.0, + "18": 3136.0, + "19": 3105.0, + "20": 2933.0, + "21": 3024.0, + "22": 2661.0, + "23": 3271.0, + "24": 2839.0, + "25": 2707.0, + "26": 2894.0, + "27": 3076.0, + "28": 3167.0, + "29": 3152.0, + "30": 2676.0, + "31": 2303.0, + "32": 3067.0, + "33": 3156.0, + "34": 2735.0, + "35": 2962.0, + "36": 2820.0, + "37": 3125.0, + "38": 2908.0, + "39": 3089.0, + "40": 3006.0, + "41": 3005.0, + "42": 3262.0, + "43": 2920.0, + "44": 2865.0, + "45": 2829.0, + "46": 3050.0, + "47": 3247.0, + "48": 3311.0, + "49": 3262.0, + "50": 3449.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1917251584.0, + "2": 1917251584.0, + "3": 1917251584.0, + "4": 1917251584.0, + "5": 1917251584.0, + "6": 1917251584.0, + "7": 1917251584.0, + "8": 1917251584.0, + "9": 1917251584.0, + "10": 1917251584.0, + "11": 1917251584.0, + "12": 1917251584.0, + "13": 1917251584.0, + "14": 1917251584.0, + "15": 1917251584.0, + "16": 1917251584.0, + "17": 1917251584.0, + "18": 1917251584.0, + "19": 1917251584.0, + "20": 1917251584.0, + "21": 1917251584.0, + "22": 1917251584.0, + "23": 1917251584.0, + "24": 1917251584.0, + "25": 1917251584.0, + "26": 1917251584.0, + "27": 1917251584.0, + "28": 1917251584.0, + "29": 1917251584.0, + "30": 1917251584.0, + "31": 1917251584.0, + "32": 1917251584.0, + "33": 1917251584.0, + "34": 1917251584.0, + "35": 1917251584.0, + "36": 1917251584.0, + "37": 1917251584.0, + "38": 1917251584.0, + "39": 1917251584.0, + "40": 1917251584.0, + "41": 1917251584.0, + "42": 1917251584.0, + "43": 1917251584.0, + "44": 1917251584.0, + "45": 1917251584.0, + "46": 1917251584.0, + "47": 1917251584.0, + "48": 1917251584.0, + "49": 1917251584.0, + "50": 1917251584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2520653312.0, + "2": 2743788032.0, + "3": 2743788032.0, + "4": 2743788032.0, + "5": 2743788032.0, + "6": 2743788032.0, + "7": 2743788032.0, + "8": 2743788032.0, + "9": 2743788032.0, + "10": 2743788032.0, + "11": 2743788032.0, + "12": 2743788032.0, + "13": 2743788032.0, + "14": 2743788032.0, + "15": 2743788032.0, + "16": 2743788032.0, + "17": 2743788032.0, + "18": 2743788032.0, + "19": 2743788032.0, + "20": 2743788032.0, + "21": 2743788032.0, + "22": 2743788032.0, + "23": 2743788032.0, + "24": 2743788032.0, + "25": 2743788032.0, + "26": 2743788032.0, + "27": 2743788032.0, + "28": 2743788032.0, + "29": 2743788032.0, + "30": 2743788032.0, + "31": 2743788032.0, + "32": 2743788032.0, + "33": 2743788032.0, + "34": 2743788032.0, + "35": 2743788032.0, + "36": 2743788032.0, + "37": 2743788032.0, + "38": 2743788032.0, + "39": 2743788032.0, + "40": 2743788032.0, + "41": 2743788032.0, + "42": 2743788032.0, + "43": 2743788032.0, + "44": 2743788032.0, + "45": 2743788032.0, + "46": 2743788032.0, + "47": 2743788032.0, + "48": 2743788032.0, + "49": 2743788032.0, + "50": 2743788032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 92.34219, + "2": 1.4515, + "3": 1.36887, + "4": 1.37341, + "5": 1.37602, + "6": 1.39004, + "7": 1.3836, + "8": 1.38196, + "9": 1.38896, + "10": 1.45857, + "11": 1.94935, + "12": 1.39106, + "13": 1.39805, + "14": 1.39033, + "15": 1.38482, + "16": 1.39457, + "17": 1.44864, + "18": 1.39068, + "19": 1.3833, + "20": 1.38815, + "21": 1.94703, + "22": 1.38309, + "23": 1.42093, + "24": 1.3998, + "25": 1.38693, + "26": 1.38436, + "27": 1.40235, + "28": 1.40751, + "29": 1.37396, + "30": 1.4111, + "31": 1.93813, + "32": 1.35926, + "33": 1.36462, + "34": 1.36782, + "35": 1.36782, + "36": 1.36568, + "37": 1.37148, + "38": 1.37963, + "39": 1.37862, + "40": 1.36625, + "41": 1.9063, + "42": 1.38764, + "43": 1.37219, + "44": 1.37186, + "45": 1.38575, + "46": 1.3857, + "47": 1.37676, + "48": 1.39862, + "49": 1.3615, + "50": 1.35892 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..baf1fa52671 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.98296, + "2": 10.98234, + "3": 10.98046, + "4": 10.96512, + "5": 10.99789, + "6": 11.00517, + "7": 10.98273, + "8": 10.97596, + "9": 10.9783, + "10": 10.9452, + "11": 10.99257, + "12": 10.96815, + "13": 10.9703, + "14": 10.98207, + "15": 10.85381, + "16": 10.85003, + "17": 10.80667, + "18": 10.82648, + "19": 10.81123, + "20": 10.62194, + "21": 10.56069, + "22": 10.32105, + "23": 10.59531, + "24": 10.32461, + "25": 10.23318, + "26": 10.33828, + "27": 10.34879, + "28": 10.32094, + "29": 10.33068, + "30": 9.8856, + "31": 9.42999, + "32": 10.05321, + "33": 10.0429, + "34": 9.6053, + "35": 9.64984, + "36": 9.52934, + "37": 9.76834, + "38": 9.48585, + "39": 9.87468, + "40": 9.30022, + "41": 9.44909, + "42": 9.52866, + "43": 9.01602, + "44": 9.12963, + "45": 8.96826, + "46": 9.03049, + "47": 9.45732, + "48": 9.02119, + "49": 8.56905, + "50": 9.10994 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2992.0, + "2": 2911.0, + "3": 2981.0, + "4": 2784.0, + "5": 3153.0, + "6": 3292.0, + "7": 3123.0, + "8": 3104.0, + "9": 3123.0, + "10": 2796.0, + "11": 3497.0, + "12": 3305.0, + "13": 3271.0, + "14": 3414.0, + "15": 3082.0, + "16": 3257.0, + "17": 3088.0, + "18": 3113.0, + "19": 3283.0, + "20": 2980.0, + "21": 3045.0, + "22": 2623.0, + "23": 3281.0, + "24": 2774.0, + "25": 2745.0, + "26": 2827.0, + "27": 3106.0, + "28": 3227.0, + "29": 3118.0, + "30": 2695.0, + "31": 2326.0, + "32": 3058.0, + "33": 3138.0, + "34": 2755.0, + "35": 2931.0, + "36": 2947.0, + "37": 3169.0, + "38": 3016.0, + "39": 3187.0, + "40": 3076.0, + "41": 3043.0, + "42": 3245.0, + "43": 2813.0, + "44": 2934.0, + "45": 2868.0, + "46": 3015.0, + "47": 3294.0, + "48": 3327.0, + "49": 3253.0, + "50": 3403.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1917251584.0, + "2": 1917251584.0, + "3": 1917251584.0, + "4": 1917251584.0, + "5": 1917251584.0, + "6": 1917251584.0, + "7": 1917251584.0, + "8": 1917251584.0, + "9": 1917251584.0, + "10": 1917251584.0, + "11": 1917251584.0, + "12": 1917251584.0, + "13": 1917251584.0, + "14": 1917251584.0, + "15": 1917251584.0, + "16": 1917251584.0, + "17": 1917251584.0, + "18": 1917251584.0, + "19": 1917251584.0, + "20": 1917251584.0, + "21": 1917251584.0, + "22": 1917251584.0, + "23": 1917251584.0, + "24": 1917251584.0, + "25": 1917251584.0, + "26": 1917251584.0, + "27": 1917251584.0, + "28": 1917251584.0, + "29": 1917251584.0, + "30": 1917251584.0, + "31": 1917251584.0, + "32": 1917251584.0, + "33": 1917251584.0, + "34": 1917251584.0, + "35": 1917251584.0, + "36": 1917251584.0, + "37": 1917251584.0, + "38": 1917251584.0, + "39": 1917251584.0, + "40": 1917251584.0, + "41": 1917251584.0, + "42": 1917251584.0, + "43": 1917251584.0, + "44": 1917251584.0, + "45": 1917251584.0, + "46": 1917251584.0, + "47": 1917251584.0, + "48": 1917251584.0, + "49": 1917251584.0, + "50": 1917251584.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2520653312.0, + "2": 2743788032.0, + "3": 2743788032.0, + "4": 2743788032.0, + "5": 2743788032.0, + "6": 2743788032.0, + "7": 2743788032.0, + "8": 2743788032.0, + "9": 2743788032.0, + "10": 2743788032.0, + "11": 2743788032.0, + "12": 2743788032.0, + "13": 2743788032.0, + "14": 2743788032.0, + "15": 2743788032.0, + "16": 2743788032.0, + "17": 2743788032.0, + "18": 2743788032.0, + "19": 2743788032.0, + "20": 2743788032.0, + "21": 2743788032.0, + "22": 2743788032.0, + "23": 2743788032.0, + "24": 2743788032.0, + "25": 2743788032.0, + "26": 2743788032.0, + "27": 2743788032.0, + "28": 2743788032.0, + "29": 2743788032.0, + "30": 2743788032.0, + "31": 2743788032.0, + "32": 2743788032.0, + "33": 2743788032.0, + "34": 2743788032.0, + "35": 2743788032.0, + "36": 2743788032.0, + "37": 2743788032.0, + "38": 2743788032.0, + "39": 2743788032.0, + "40": 2743788032.0, + "41": 2743788032.0, + "42": 2743788032.0, + "43": 2743788032.0, + "44": 2743788032.0, + "45": 2743788032.0, + "46": 2743788032.0, + "47": 2743788032.0, + "48": 2743788032.0, + "49": 2743788032.0, + "50": 2743788032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 92.52278, + "2": 1.52203, + "3": 1.50103, + "4": 1.51627, + "5": 1.49943, + "6": 1.61325, + "7": 1.5622, + "8": 1.50668, + "9": 1.50122, + "10": 1.50749, + "11": 2.12764, + "12": 1.51111, + "13": 1.50973, + "14": 1.51712, + "15": 1.50952, + "16": 1.51343, + "17": 1.50742, + "18": 1.52017, + "19": 1.50622, + "20": 1.51648, + "21": 2.13229, + "22": 1.50789, + "23": 1.52087, + "24": 1.50668, + "25": 1.51534, + "26": 1.5016, + "27": 1.50737, + "28": 1.49873, + "29": 1.50715, + "30": 1.49941, + "31": 2.11492, + "32": 1.50348, + "33": 1.50106, + "34": 1.50093, + "35": 1.50813, + "36": 1.4988, + "37": 1.49847, + "38": 1.49777, + "39": 1.49937, + "40": 1.50456, + "41": 2.11318, + "42": 1.50605, + "43": 1.50721, + "44": 1.51813, + "45": 1.50211, + "46": 1.51633, + "47": 1.5019, + "48": 1.52386, + "49": 1.49987, + "50": 1.50829 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f43841d5cbf --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Then, when you're ready, go home and watch the movie again.", "generated_tokens": [6830, 1044, 2200, 1636, 6185, 11831, 1044, 1974, 4590, 1321, 9951, 1278, 16070, 2790, 1046], "tpot": [0.5682204365730286, 0.00773027166724205, 0.006722208112478256, 0.0064345598220825195, 0.006336224265396595, 0.006343040149658918, 0.0063623362220823765, 0.0063252802938222885, 0.0067179519683122635, 0.006901599932461977, 0.006821152288466692, 0.006867455784231424, 0.006917183753103018, 0.006906943861395121, 0.006760320160537958], "latency": 0.6755752461031079, "logprobs": [-9.485179901123047, -3.7365002632141113, -3.0747694969177246, -1.744485855102539, -0.29669833183288574, -1.4020814895629883, -2.432681083679199, -1.7664837837219238, -1.4741225242614746, -6.42724084854126, -0.8153547048568726, -1.7931451797485352, -3.650665044784546, -3.698770046234131, -1.608336091041565, -1.6549599170684814, -2.8460211753845215, -6.670064926147461, -0.06550002098083496, -1.2442623376846313, -6.04405403137207, -9.507080078125, -10.461563110351562, -1.5952650308609009, -4.6770920753479, -0.745125412940979, -2.1571977138519287, -0.013643701560795307, -0.03557091951370239, -3.090214252471924, -8.740396499633789, -1.5405625104904175, -5.852315902709961, -3.09045672416687, -3.9833602905273438, -3.7632288932800293, -2.444291591644287, -2.273496627807617, -0.4683297276496887, -1.020460605621338, -5.3351545333862305, -8.249643325805664, -0.01584932766854763, -2.8506340980529785, -1.251563549041748, -3.7786898612976074, -1.0169645547866821, -0.002681709360331297, -3.0970988273620605, -11.113213539123535, -3.8127267360687256, -2.329777479171753, -4.672338485717773, -0.09791824221611023, -0.06286392360925674, -1.3320130109786987, -2.1521241664886475, -4.375304222106934, -0.43500134348869324, -3.9912281036376953, -0.5796594023704529, -0.26420092582702637, -2.811892509460449, -13.508228302001953, -0.10134205967187881, -3.5013256072998047, -0.8109210729598999, -5.298563480377197, -0.3272246718406677, -2.333836555480957, -0.5356347560882568, -1.288033366203308, -4.895185947418213, -15.548847198486328, -4.934615612030029, -0.22137367725372314, -6.583427429199219, -0.9010066986083984, -2.237170696258545, -1.8670732975006104, -0.20016230642795563, -5.921288013458252, -0.005614227149635553, -7.52609920501709, -3.284144878387451, -3.6920413970947266, -2.0169901847839355, -2.9249799251556396, -1.469851016998291, -2.4422709941864014, -1.2325081825256348, -1.964760184288025, -1.9597855806350708, -0.2527056932449341, -2.0347321033477783, -1.0436501502990723, -1.2124212980270386, -2.834301233291626, -1.6760799884796143, -2.205287218093872, -1.5265791416168213, -1.2453690767288208]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..36d52789f39 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"1": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Then, when you're ready, go home and watch the movie again.", "generated_tokens": [6830, 1044, 2200, 1636, 6185, 11831, 1044, 1974, 4590, 1321, 9951, 1278, 16070, 2790, 1046], "tpot": [0.5964657068252563, 0.0076944963075220585, 0.0069276802241802216, 0.006815008353441954, 0.007004896178841591, 0.007135615684092045, 0.007600544020533562, 0.00778160011395812, 0.008111871778964996, 0.008260959759354591, 0.008273440413177013, 0.008334367536008358, 0.008409472182393074, 0.008148159831762314, 0.008159839548170567], "latency": 0.7182000600732863, "logprobs": [-9.485179901123047, -3.7365002632141113, -3.0747694969177246, -1.744485855102539, -0.29669833183288574, -1.4020814895629883, -2.432681083679199, -1.7664837837219238, -1.4741225242614746, -6.42724084854126, -0.8153547048568726, -1.7931451797485352, -3.650665044784546, -3.698770046234131, -1.608336091041565, -1.6549599170684814, -2.8460211753845215, -6.670064926147461, -0.06550002098083496, -1.2442623376846313, -6.04405403137207, -9.507080078125, -10.461563110351562, -1.5952650308609009, -4.6770920753479, -0.745125412940979, -2.1571977138519287, -0.013643701560795307, -0.03557091951370239, -3.090214252471924, -8.740396499633789, -1.5405625104904175, -5.852315902709961, -3.09045672416687, -3.9833602905273438, -3.7632288932800293, -2.444291591644287, -2.273496627807617, -0.4683297276496887, -1.020460605621338, -5.3351545333862305, -8.249643325805664, -0.01584932766854763, -2.8506340980529785, -1.251563549041748, -3.7786898612976074, -1.0169645547866821, -0.002681709360331297, -3.0970988273620605, -11.113213539123535, -3.8127267360687256, -2.329777479171753, -4.672338485717773, -0.09791824221611023, -0.06286392360925674, -1.3320130109786987, -2.1521241664886475, -4.375304222106934, -0.43500134348869324, -3.9912281036376953, -0.5796594023704529, -0.26420092582702637, -2.811892509460449, -13.508228302001953, -0.10134205967187881, -3.5013256072998047, -0.8109210729598999, -5.298563480377197, -0.3272246718406677, -2.333836555480957, -0.5356347560882568, -1.288033366203308, -4.895185947418213, -15.548847198486328, -4.934615612030029, -0.22137367725372314, -6.583427429199219, -0.9010066986083984, -2.237170696258545, -1.8670732975006104, -0.20016230642795563, -5.921288013458252, -0.005614227149635553, -7.52609920501709, -3.284144878387451, -3.6920413970947266, -2.0169901847839355, -2.9249799251556396, -1.469851016998291, -2.4422709941864014, -1.2325081825256348, -1.964760184288025, -1.9597855806350708, -0.2527056932449341, -2.0347321033477783, -1.0436501502990723, -1.2124212980270386, -2.834301233291626, -1.6760799884796143, -2.205287218093872, -1.5265791416168213, -1.2453690767288208]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0c524fa4991 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Then, when you're ready, go home and watch the movie again.", "generated_tokens": [6830, 1044, 2200, 1636, 6185, 11831, 1044, 1974, 4590, 1321, 9951, 1278, 16070, 2790, 1046], "tpot": [22.176082611083984, 0.6151371598243713, 0.034286558628082275, 0.03372633829712868, 0.03291260823607445, 0.033486176282167435, 0.033701471984386444, 0.03326892852783203, 0.03287017345428467, 0.033419039100408554, 0.03316511958837509, 0.03274928033351898, 0.03266361728310585, 0.032435040920972824, 0.03254726529121399], "latency": 23.265353467315435, "logprobs": [-9.485179901123047, -3.7365002632141113, -3.0747694969177246, -1.744485855102539, -0.29669833183288574, -1.4020814895629883, -2.432681083679199, -1.7664837837219238, -1.4741225242614746, -6.42724084854126, -0.8153547048568726, -1.7931451797485352, -3.650665044784546, -3.698770046234131, -1.608336091041565, -1.6549599170684814, -2.8460211753845215, -6.670064926147461, -0.06550002098083496, -1.2442623376846313, -6.04405403137207, -9.507080078125, -10.461563110351562, -1.5952650308609009, -4.6770920753479, -0.745125412940979, -2.1571977138519287, -0.013643701560795307, -0.03557091951370239, -3.090214252471924, -8.740396499633789, -1.5405625104904175, -5.852315902709961, -3.09045672416687, -3.9833602905273438, -3.7632288932800293, -2.444291591644287, -2.273496627807617, -0.4683297276496887, -1.020460605621338, -5.3351545333862305, -8.249643325805664, -0.01584932766854763, -2.8506340980529785, -1.251563549041748, -3.7786898612976074, -1.0169645547866821, -0.002681709360331297, -3.0970988273620605, -11.113213539123535, -3.8127267360687256, -2.329777479171753, -4.672338485717773, -0.09791824221611023, -0.06286392360925674, -1.3320130109786987, -2.1521241664886475, -4.375304222106934, -0.43500134348869324, -3.9912281036376953, -0.5796594023704529, -0.26420092582702637, -2.811892509460449, -13.508228302001953, -0.10134205967187881, -3.5013256072998047, -0.8109210729598999, -5.298563480377197, -0.3272246718406677, -2.333836555480957, -0.5356347560882568, -1.288033366203308, -4.895185947418213, -15.548847198486328, -4.934615612030029, -0.22137367725372314, -6.583427429199219, -0.9010066986083984, -2.237170696258545, -1.8670732975006104, -0.20016230642795563, -5.921288013458252, -0.005614227149635553, -7.52609920501709, -3.284144878387451, -3.6920413970947266, -2.0169901847839355, -2.9249799251556396, -1.469851016998291, -2.4422709941864014, -1.2325081825256348, -1.964760184288025, -1.9597855806350708, -0.2527056932449341, -2.0347321033477783, -1.0436501502990723, -1.2124212980270386, -2.834301233291626, -1.6760799884796143, -2.205287218093872, -1.5265791416168213, -1.2453690767288208]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..1d887d9830c --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Then, when you're ready, go home and watch the movie again.", "generated_tokens": [6830, 1044, 2200, 1636, 6185, 11831, 1044, 1974, 4590, 1321, 9951, 1278, 16070, 2790, 1046], "tpot": [23.254732131958008, 0.9408637881278992, 0.034858111292123795, 0.03537708520889282, 0.03476342558860779, 0.03471830487251282, 0.03922403231263161, 0.03739152103662491, 0.03962313383817673, 0.04001171141862869, 0.03972022235393524, 0.040310338139534, 0.038479968905448914, 0.03562349081039429, 0.038027167320251465], "latency": 24.731004369910806, "logprobs": [-9.485179901123047, -3.7365002632141113, -3.0747694969177246, -1.744485855102539, -0.29669833183288574, -1.4020814895629883, -2.432681083679199, -1.7664837837219238, -1.4741225242614746, -6.42724084854126, -0.8153547048568726, -1.7931451797485352, -3.650665044784546, -3.698770046234131, -1.608336091041565, -1.6549599170684814, -2.8460211753845215, -6.670064926147461, -0.06550002098083496, -1.2442623376846313, -6.04405403137207, -9.507080078125, -10.461563110351562, -1.5952650308609009, -4.6770920753479, -0.745125412940979, -2.1571977138519287, -0.013643701560795307, -0.03557091951370239, -3.090214252471924, -8.740396499633789, -1.5405625104904175, -5.852315902709961, -3.09045672416687, -3.9833602905273438, -3.7632288932800293, -2.444291591644287, -2.273496627807617, -0.4683297276496887, -1.020460605621338, -5.3351545333862305, -8.249643325805664, -0.01584932766854763, -2.8506340980529785, -1.251563549041748, -3.7786898612976074, -1.0169645547866821, -0.002681709360331297, -3.0970988273620605, -11.113213539123535, -3.8127267360687256, -2.329777479171753, -4.672338485717773, -0.09791824221611023, -0.06286392360925674, -1.3320130109786987, -2.1521241664886475, -4.375304222106934, -0.43500134348869324, -3.9912281036376953, -0.5796594023704529, -0.26420092582702637, -2.811892509460449, -13.508228302001953, -0.10134205967187881, -3.5013256072998047, -0.8109210729598999, -5.298563480377197, -0.3272246718406677, -2.333836555480957, -0.5356347560882568, -1.288033366203308, -4.895185947418213, -15.548847198486328, -4.934615612030029, -0.22137367725372314, -6.583427429199219, -0.9010066986083984, -2.237170696258545, -1.8670732975006104, -0.20016230642795563, -5.921288013458252, -0.005614227149635553, -7.52609920501709, -3.284144878387451, -3.6920413970947266, -2.0169901847839355, -2.9249799251556396, -1.469851016998291, -2.4422709941864014, -1.2325081825256348, -1.964760184288025, -1.9597855806350708, -0.2527056932449341, -2.0347321033477783, -1.0436501502990723, -1.2124212980270386, -2.834301233291626, -1.6760799884796143, -2.205287218093872, -1.5265791416168213, -1.2453690767288208]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json index c9b7badd2f9..fd720368e7c 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.7999, - "5": 10.8256, - "10": 10.77408, - "15": 10.7823, - "20": 10.69976, - "25": 10.51847, - "30": 10.36472, - "35": 10.25433, - "40": 10.1024, - "45": 9.84248, - "50": 9.92572 + "2": 10.80046, + "3": 10.80856, + "4": 10.78236, + "5": 10.82529, + "6": 10.83582, + "7": 10.81653, + "8": 10.81185, + "9": 10.81091, + "10": 10.77387, + "11": 10.85526, + "12": 10.82697, + "13": 10.85098, + "14": 10.85469, + "15": 10.7827, + "16": 10.77374, + "17": 10.7504, + "18": 10.78334, + "19": 10.75924, + "20": 10.69944, + "21": 10.67297, + "22": 10.51442, + "23": 10.68096, + "24": 10.57187, + "25": 10.51823, + "26": 10.57662, + "27": 10.59187, + "28": 10.55398, + "29": 10.57092, + "30": 10.36453, + "31": 10.10911, + "32": 10.45339, + "33": 10.43673, + "34": 10.19971, + "35": 10.25406, + "36": 10.23349, + "37": 10.35406, + "38": 10.20448, + "39": 10.39919, + "40": 10.10198, + "41": 10.12753, + "42": 10.21106, + "43": 9.83709, + "44": 9.96212, + "45": 9.84265, + "46": 9.80647, + "47": 10.14286, + "48": 9.86668, + "49": 9.5387, + "50": 9.92563 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 4866.0, - "5": 5487.0, - "10": 4524.0, - "15": 5298.0, - "20": 4827.0, - "25": 5007.0, - "30": 5355.0, - "35": 5634.0, - "40": 5894.0, - "45": 5741.0, - "50": 6592.0 + "1": 4859.0, + "2": 4958.0, + "3": 5062.0, + "4": 4978.0, + "5": 5447.0, + "6": 5701.0, + "7": 5288.0, + "8": 5091.0, + "9": 5455.0, + "10": 4456.0, + "11": 5940.0, + "12": 5333.0, + "13": 5833.0, + "14": 5618.0, + "15": 5332.0, + "16": 5494.0, + "17": 5290.0, + "18": 5259.0, + "19": 5322.0, + "20": 4889.0, + "21": 5334.0, + "22": 4823.0, + "23": 5689.0, + "24": 5082.0, + "25": 4963.0, + "26": 5289.0, + "27": 5273.0, + "28": 5740.0, + "29": 6004.0, + "30": 5295.0, + "31": 4876.0, + "32": 5709.0, + "33": 6098.0, + "34": 5165.0, + "35": 5500.0, + "36": 5505.0, + "37": 6376.0, + "38": 5826.0, + "39": 6773.0, + "40": 5824.0, + "41": 5809.0, + "42": 6386.0, + "43": 5747.0, + "44": 5860.0, + "45": 5732.0, + "46": 5948.0, + "47": 6430.0, + "48": 6500.0, + "49": 6497.0, + "50": 6719.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1145716736.0, - "5": 1145715200.0, - "10": 1145719296.0, + "2": 1145714688.0, + "3": 1145715200.0, + "4": 1145714176.0, + "5": 1146209792.0, + "6": 1146210816.0, + "7": 1145717248.0, + "8": 1146209280.0, + "9": 1145714688.0, + "10": 1146214912.0, + "11": 1146209792.0, + "12": 1145714176.0, + "13": 1145713152.0, + "14": 1146209280.0, "15": 1145713152.0, + "16": 1146210816.0, + "17": 1145713664.0, + "18": 1146210304.0, + "19": 1145714176.0, "20": 1145715200.0, + "21": 1146210304.0, + "22": 1145715712.0, + "23": 1145715712.0, + "24": 1145713152.0, "25": 1145712128.0, + "26": 1145715200.0, + "27": 1146210304.0, + "28": 1145713664.0, + "29": 1145711104.0, "30": 1145714688.0, - "35": 1145717760.0, + "31": 1146213376.0, + "32": 1145713664.0, + "33": 1145714688.0, + "34": 1145715200.0, + "35": 1146212864.0, + "36": 1145713152.0, + "37": 1145712128.0, + "38": 1146207744.0, + "39": 1145715200.0, "40": 1146210816.0, + "41": 1145714688.0, + "42": 1145712128.0, + "43": 1145715712.0, + "44": 1145717760.0, "45": 1146210304.0, - "50": 1145715712.0 + "46": 1146214400.0, + "47": 1145714688.0, + "48": 1145717760.0, + "49": 1145719296.0, + "50": 1145716224.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1593775104.0, - "5": 2052787712.0, - "10": 2057007616.0, - "15": 2057007616.0, - "20": 2057007616.0, - "25": 2057007616.0, - "30": 2057007616.0, - "35": 2057007616.0, - "40": 2057007616.0, - "45": 2057007616.0, - "50": 2057007616.0 + "2": 2051463168.0, + "3": 2052978176.0, + "4": 2052978176.0, + "5": 2052978176.0, + "6": 2053324288.0, + "7": 2053986816.0, + "8": 2053986816.0, + "9": 2057060864.0, + "10": 2057060864.0, + "11": 2057060864.0, + "12": 2057060864.0, + "13": 2057060864.0, + "14": 2057060864.0, + "15": 2057060864.0, + "16": 2057060864.0, + "17": 2057060864.0, + "18": 2057060864.0, + "19": 2057060864.0, + "20": 2057060864.0, + "21": 2057060864.0, + "22": 2057060864.0, + "23": 2057060864.0, + "24": 2057060864.0, + "25": 2057060864.0, + "26": 2057060864.0, + "27": 2057060864.0, + "28": 2057060864.0, + "29": 2057060864.0, + "30": 2057060864.0, + "31": 2057060864.0, + "32": 2057060864.0, + "33": 2057060864.0, + "34": 2057060864.0, + "35": 2057060864.0, + "36": 2057060864.0, + "37": 2057060864.0, + "38": 2057060864.0, + "39": 2057060864.0, + "40": 2057060864.0, + "41": 2057060864.0, + "42": 2057060864.0, + "43": 2057060864.0, + "44": 2057060864.0, + "45": 2057060864.0, + "46": 2057060864.0, + "47": 2057060864.0, + "48": 2057060864.0, + "49": 2057060864.0, + "50": 2057060864.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 16.36205, - "5": 0.22567, - "10": 0.24367, - "15": 0.2361, - "20": 0.22731, - "25": 0.2551, - "30": 0.22323, - "35": 0.23009, - "40": 0.2213, - "45": 0.22842, - "50": 0.22548 + "1": 18.20596, + "2": 0.35903, + "3": 0.29783, + "4": 0.32647, + "5": 0.27756, + "6": 0.27374, + "7": 0.30378, + "8": 0.27695, + "9": 0.2803, + "10": 0.28715, + "11": 0.26455, + "12": 0.26231, + "13": 0.2664, + "14": 0.25756, + "15": 0.26997, + "16": 0.26004, + "17": 0.27036, + "18": 0.26235, + "19": 0.25926, + "20": 0.2633, + "21": 0.27365, + "22": 0.28244, + "23": 0.27106, + "24": 0.26252, + "25": 0.27913, + "26": 0.26128, + "27": 0.25745, + "28": 0.28971, + "29": 0.25557, + "30": 0.26227, + "31": 0.28393, + "32": 0.2742, + "33": 0.25918, + "34": 0.2839, + "35": 0.26183, + "36": 0.26351, + "37": 0.25935, + "38": 0.27055, + "39": 0.25969, + "40": 0.25776, + "41": 0.26414, + "42": 0.26164, + "43": 0.27671, + "44": 0.26781, + "45": 0.25691, + "46": 0.28709, + "47": 0.26291, + "48": 0.26119, + "49": 0.27305, + "50": 0.26323 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..83e9dd029de --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82721, + "2": 10.84035, + "3": 10.82693, + "4": 10.81925, + "5": 10.85729, + "6": 10.86987, + "7": 10.85126, + "8": 10.84503, + "9": 10.85262, + "10": 10.79218, + "11": 10.86541, + "12": 10.87056, + "13": 10.87103, + "14": 10.87907, + "15": 10.82509, + "16": 10.81245, + "17": 10.77498, + "18": 10.81067, + "19": 10.79628, + "20": 10.7226, + "21": 10.69703, + "22": 10.5511, + "23": 10.70525, + "24": 10.59039, + "25": 10.5437, + "26": 10.60015, + "27": 10.62026, + "28": 10.57443, + "29": 10.58672, + "30": 10.35727, + "31": 10.12151, + "32": 10.47011, + "33": 10.45715, + "34": 10.21596, + "35": 10.2716, + "36": 10.23548, + "37": 10.35256, + "38": 10.20575, + "39": 10.40073, + "40": 10.09692, + "41": 10.13841, + "42": 10.21761, + "43": 9.84436, + "44": 9.96211, + "45": 9.84091, + "46": 9.81936, + "47": 10.13901, + "48": 9.8515, + "49": 9.53555, + "50": 9.92434 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4632.0, + "2": 4867.0, + "3": 4905.0, + "4": 4933.0, + "5": 5426.0, + "6": 5441.0, + "7": 5134.0, + "8": 4724.0, + "9": 5268.0, + "10": 4406.0, + "11": 5633.0, + "12": 5144.0, + "13": 5458.0, + "14": 5522.0, + "15": 5171.0, + "16": 5326.0, + "17": 5191.0, + "18": 5103.0, + "19": 5320.0, + "20": 4861.0, + "21": 5369.0, + "22": 4926.0, + "23": 5811.0, + "24": 5036.0, + "25": 4912.0, + "26": 5138.0, + "27": 5254.0, + "28": 5688.0, + "29": 5906.0, + "30": 5493.0, + "31": 4766.0, + "32": 5805.0, + "33": 5992.0, + "34": 5140.0, + "35": 5663.0, + "36": 5599.0, + "37": 6398.0, + "38": 6036.0, + "39": 6612.0, + "40": 5946.0, + "41": 5919.0, + "42": 6480.0, + "43": 5819.0, + "44": 5690.0, + "45": 5761.0, + "46": 5974.0, + "47": 6514.0, + "48": 6268.0, + "49": 6290.0, + "50": 6671.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1116598784.0, + "2": 1116598272.0, + "3": 1116596224.0, + "4": 1116597760.0, + "5": 1116595712.0, + "6": 1116594688.0, + "7": 1116595712.0, + "8": 1116595200.0, + "9": 1116597760.0, + "10": 1116596224.0, + "11": 1116597248.0, + "12": 1116596224.0, + "13": 1116600320.0, + "14": 1116594688.0, + "15": 1116597760.0, + "16": 1116594688.0, + "17": 1116595200.0, + "18": 1116598272.0, + "19": 1116594176.0, + "20": 1116595712.0, + "21": 1116594176.0, + "22": 1116595712.0, + "23": 1116596736.0, + "24": 1116598272.0, + "25": 1116595712.0, + "26": 1116598784.0, + "27": 1116596224.0, + "28": 1116597248.0, + "29": 1116598272.0, + "30": 1116594688.0, + "31": 1116601344.0, + "32": 1116597760.0, + "33": 1116595712.0, + "34": 1116596224.0, + "35": 1116598784.0, + "36": 1116594176.0, + "37": 1116595712.0, + "38": 1116596736.0, + "39": 1116595200.0, + "40": 1116597760.0, + "41": 1116598784.0, + "42": 1116598784.0, + "43": 1116599296.0, + "44": 1116598272.0, + "45": 1116596736.0, + "46": 1116597248.0, + "47": 1116597248.0, + "48": 1116594688.0, + "49": 1116592640.0, + "50": 1116598784.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563067904.0, + "2": 2021656576.0, + "3": 2021656576.0, + "4": 2022763008.0, + "5": 2022763008.0, + "6": 2022763008.0, + "7": 2022763008.0, + "8": 2023145984.0, + "9": 2023145984.0, + "10": 2025749504.0, + "11": 2025749504.0, + "12": 2025749504.0, + "13": 2026550272.0, + "14": 2026550272.0, + "15": 2026550272.0, + "16": 2026550272.0, + "17": 2026550272.0, + "18": 2026550272.0, + "19": 2026550272.0, + "20": 2026550272.0, + "21": 2026550272.0, + "22": 2026550272.0, + "23": 2026550272.0, + "24": 2026550272.0, + "25": 2026550272.0, + "26": 2026550272.0, + "27": 2026550272.0, + "28": 2026550272.0, + "29": 2026550272.0, + "30": 2026550272.0, + "31": 2029278208.0, + "32": 2029278208.0, + "33": 2029278208.0, + "34": 2029278208.0, + "35": 2029278208.0, + "36": 2029278208.0, + "37": 2029278208.0, + "38": 2029278208.0, + "39": 2029278208.0, + "40": 2029278208.0, + "41": 2029278208.0, + "42": 2029278208.0, + "43": 2029278208.0, + "44": 2029278208.0, + "45": 2029278208.0, + "46": 2029278208.0, + "47": 2029278208.0, + "48": 2029278208.0, + "49": 2029278208.0, + "50": 2029278208.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.71534, + "2": 0.42823, + "3": 0.35479, + "4": 0.35129, + "5": 0.35492, + "6": 0.34734, + "7": 0.34252, + "8": 0.34249, + "9": 0.3404, + "10": 0.34249, + "11": 0.34006, + "12": 0.34343, + "13": 0.341, + "14": 0.33997, + "15": 0.34123, + "16": 0.34135, + "17": 0.34196, + "18": 0.34169, + "19": 0.34148, + "20": 0.34323, + "21": 0.34514, + "22": 0.34317, + "23": 0.34353, + "24": 0.341, + "25": 0.34149, + "26": 0.34555, + "27": 0.34102, + "28": 0.34068, + "29": 0.34243, + "30": 0.34248, + "31": 0.33982, + "32": 0.34184, + "33": 0.34279, + "34": 0.34274, + "35": 0.34238, + "36": 0.34027, + "37": 0.34377, + "38": 0.34332, + "39": 0.34223, + "40": 0.34254, + "41": 0.34097, + "42": 0.34043, + "43": 0.34447, + "44": 0.3405, + "45": 0.34009, + "46": 0.34121, + "47": 0.33815, + "48": 0.34039, + "49": 0.34174, + "50": 0.34062 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..c6c228253e0 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82721, + "2": 10.84035, + "3": 10.82723, + "4": 10.81924, + "5": 10.85677, + "6": 10.87001, + "7": 10.85158, + "8": 10.84472, + "9": 10.85255, + "10": 10.79194, + "11": 10.86558, + "12": 10.87116, + "13": 10.87097, + "14": 10.87861, + "15": 10.82571, + "16": 10.81234, + "17": 10.77447, + "18": 10.81055, + "19": 10.79638, + "20": 10.72194, + "21": 10.69672, + "22": 10.55073, + "23": 10.70511, + "24": 10.59025, + "25": 10.54429, + "26": 10.60007, + "27": 10.62018, + "28": 10.57431, + "29": 10.58678, + "30": 10.35759, + "31": 10.122, + "32": 10.47002, + "33": 10.45695, + "34": 10.21597, + "35": 10.27122, + "36": 10.23573, + "37": 10.35257, + "38": 10.20582, + "39": 10.40083, + "40": 10.09682, + "41": 10.1389, + "42": 10.21834, + "43": 9.84408, + "44": 9.96196, + "45": 9.84128, + "46": 9.8194, + "47": 10.13893, + "48": 9.85148, + "49": 9.5354, + "50": 9.9245 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4716.0, + "2": 4931.0, + "3": 4816.0, + "4": 4901.0, + "5": 5453.0, + "6": 5635.0, + "7": 5173.0, + "8": 4857.0, + "9": 5219.0, + "10": 4386.0, + "11": 5795.0, + "12": 5340.0, + "13": 5567.0, + "14": 5428.0, + "15": 5321.0, + "16": 5367.0, + "17": 5290.0, + "18": 5030.0, + "19": 5155.0, + "20": 4735.0, + "21": 5405.0, + "22": 4831.0, + "23": 5764.0, + "24": 5036.0, + "25": 4756.0, + "26": 5262.0, + "27": 5313.0, + "28": 5809.0, + "29": 5928.0, + "30": 5404.0, + "31": 4719.0, + "32": 5796.0, + "33": 6218.0, + "34": 5083.0, + "35": 5715.0, + "36": 5608.0, + "37": 6302.0, + "38": 6050.0, + "39": 6634.0, + "40": 5742.0, + "41": 5958.0, + "42": 6406.0, + "43": 5795.0, + "44": 5818.0, + "45": 5695.0, + "46": 5888.0, + "47": 6504.0, + "48": 6390.0, + "49": 6316.0, + "50": 6636.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114775040.0, + "2": 1114774528.0, + "3": 1114772480.0, + "4": 1114774016.0, + "5": 1114770944.0, + "6": 1114771456.0, + "7": 1114771968.0, + "8": 1114770432.0, + "9": 1114774016.0, + "10": 1114772480.0, + "11": 1114772480.0, + "12": 1114774016.0, + "13": 1114776576.0, + "14": 1114770944.0, + "15": 1114774016.0, + "16": 1114774016.0, + "17": 1114770432.0, + "18": 1114774016.0, + "19": 1114770432.0, + "20": 1114771968.0, + "21": 1114771456.0, + "22": 1114771968.0, + "23": 1114772992.0, + "24": 1114774528.0, + "25": 1114770944.0, + "26": 1114774528.0, + "27": 1114772480.0, + "28": 1114773504.0, + "29": 1114774528.0, + "30": 1114770944.0, + "31": 1114777600.0, + "32": 1114773504.0, + "33": 1114771968.0, + "34": 1114772480.0, + "35": 1114775040.0, + "36": 1114771456.0, + "37": 1114771968.0, + "38": 1114772992.0, + "39": 1114770432.0, + "40": 1114774016.0, + "41": 1114775040.0, + "42": 1114775040.0, + "43": 1114775552.0, + "44": 1114774016.0, + "45": 1114772480.0, + "46": 1114774016.0, + "47": 1114772480.0, + "48": 1114770432.0, + "49": 1114768896.0, + "50": 1114775040.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563110912.0, + "2": 2019832832.0, + "3": 2019832832.0, + "4": 2020832768.0, + "5": 2020832768.0, + "6": 2020832768.0, + "7": 2020832768.0, + "8": 2020832768.0, + "9": 2020832768.0, + "10": 2024514560.0, + "11": 2024514560.0, + "12": 2024514560.0, + "13": 2025236480.0, + "14": 2025236480.0, + "15": 2025236480.0, + "16": 2025236480.0, + "17": 2025236480.0, + "18": 2025236480.0, + "19": 2025236480.0, + "20": 2025236480.0, + "21": 2025236480.0, + "22": 2025236480.0, + "23": 2025236480.0, + "24": 2025236480.0, + "25": 2025236480.0, + "26": 2025236480.0, + "27": 2025236480.0, + "28": 2025236480.0, + "29": 2025236480.0, + "30": 2025236480.0, + "31": 2028140544.0, + "32": 2028140544.0, + "33": 2028140544.0, + "34": 2028140544.0, + "35": 2028140544.0, + "36": 2028140544.0, + "37": 2028140544.0, + "38": 2028140544.0, + "39": 2028140544.0, + "40": 2028140544.0, + "41": 2028140544.0, + "42": 2028140544.0, + "43": 2028140544.0, + "44": 2028140544.0, + "45": 2028140544.0, + "46": 2028140544.0, + "47": 2028140544.0, + "48": 2028140544.0, + "49": 2028140544.0, + "50": 2028140544.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.43783, + "2": 0.38321, + "3": 0.36811, + "4": 0.35154, + "5": 0.3506, + "6": 0.35246, + "7": 0.35049, + "8": 0.35172, + "9": 0.35056, + "10": 0.35222, + "11": 0.35146, + "12": 0.35099, + "13": 0.35097, + "14": 0.34999, + "15": 0.35178, + "16": 0.3507, + "17": 0.35085, + "18": 0.36269, + "19": 0.3628, + "20": 0.39629, + "21": 0.362, + "22": 0.34881, + "23": 0.34826, + "24": 0.34894, + "25": 0.34905, + "26": 0.34868, + "27": 0.34852, + "28": 0.35034, + "29": 0.3505, + "30": 0.34898, + "31": 0.34972, + "32": 0.34827, + "33": 0.34805, + "34": 0.34828, + "35": 0.3462, + "36": 0.34816, + "37": 0.34932, + "38": 0.3474, + "39": 0.34618, + "40": 0.34596, + "41": 0.34685, + "42": 0.34571, + "43": 0.34956, + "44": 0.34632, + "45": 0.34487, + "46": 0.34479, + "47": 0.34793, + "48": 0.34481, + "49": 0.34468, + "50": 0.34354 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..72d650fcb5a --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.7999, + "2": 10.80046, + "3": 10.80906, + "4": 10.78256, + "5": 10.82566, + "6": 10.83616, + "7": 10.81688, + "8": 10.81159, + "9": 10.81058, + "10": 10.77421, + "11": 10.8555, + "12": 10.82696, + "13": 10.85081, + "14": 10.85457, + "15": 10.78256, + "16": 10.77334, + "17": 10.75077, + "18": 10.78391, + "19": 10.75873, + "20": 10.70038, + "21": 10.67229, + "22": 10.51412, + "23": 10.68126, + "24": 10.57156, + "25": 10.51795, + "26": 10.57588, + "27": 10.59132, + "28": 10.55287, + "29": 10.57112, + "30": 10.36497, + "31": 10.10959, + "32": 10.45338, + "33": 10.43695, + "34": 10.20008, + "35": 10.25443, + "36": 10.23362, + "37": 10.35422, + "38": 10.20437, + "39": 10.39909, + "40": 10.10235, + "41": 10.12745, + "42": 10.21091, + "43": 9.83755, + "44": 9.96198, + "45": 9.8428, + "46": 9.80664, + "47": 10.14256, + "48": 9.86637, + "49": 9.53809, + "50": 9.92581 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4832.0, + "2": 4993.0, + "3": 5015.0, + "4": 5101.0, + "5": 5493.0, + "6": 5733.0, + "7": 5202.0, + "8": 5069.0, + "9": 5607.0, + "10": 4607.0, + "11": 5837.0, + "12": 5394.0, + "13": 5775.0, + "14": 5823.0, + "15": 5240.0, + "16": 5310.0, + "17": 5304.0, + "18": 5229.0, + "19": 5439.0, + "20": 4899.0, + "21": 5406.0, + "22": 4858.0, + "23": 5868.0, + "24": 5135.0, + "25": 4824.0, + "26": 5375.0, + "27": 5395.0, + "28": 5877.0, + "29": 5992.0, + "30": 5324.0, + "31": 4919.0, + "32": 5852.0, + "33": 6135.0, + "34": 5147.0, + "35": 5560.0, + "36": 5414.0, + "37": 6415.0, + "38": 5968.0, + "39": 6734.0, + "40": 5818.0, + "41": 5767.0, + "42": 6510.0, + "43": 5734.0, + "44": 5802.0, + "45": 5717.0, + "46": 5997.0, + "47": 6519.0, + "48": 6573.0, + "49": 6525.0, + "50": 6552.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1145716736.0, + "2": 1145714688.0, + "3": 1145715200.0, + "4": 1145713152.0, + "5": 1146210816.0, + "6": 1146210304.0, + "7": 1145716736.0, + "8": 1146209280.0, + "9": 1145714688.0, + "10": 1146214912.0, + "11": 1146210816.0, + "12": 1145713664.0, + "13": 1145713152.0, + "14": 1146210304.0, + "15": 1145713152.0, + "16": 1145714688.0, + "17": 1145713664.0, + "18": 1146212352.0, + "19": 1145714176.0, + "20": 1145715200.0, + "21": 1146210304.0, + "22": 1145715712.0, + "23": 1145715200.0, + "24": 1145713152.0, + "25": 1145712128.0, + "26": 1145715200.0, + "27": 1145715200.0, + "28": 1145713664.0, + "29": 1145711616.0, + "30": 1145714688.0, + "31": 1146213376.0, + "32": 1145713152.0, + "33": 1145714688.0, + "34": 1146210304.0, + "35": 1146212864.0, + "36": 1145713664.0, + "37": 1145712640.0, + "38": 1146207744.0, + "39": 1145715200.0, + "40": 1146210816.0, + "41": 1145715712.0, + "42": 1146207744.0, + "43": 1146211328.0, + "44": 1145716736.0, + "45": 1146210304.0, + "46": 1146214400.0, + "47": 1145714688.0, + "48": 1145717248.0, + "49": 1146215936.0, + "50": 1145716224.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1593775104.0, + "2": 2051463168.0, + "3": 2052791808.0, + "4": 2052791808.0, + "5": 2052791808.0, + "6": 2053601792.0, + "7": 2054070272.0, + "8": 2054225408.0, + "9": 2056797696.0, + "10": 2057079296.0, + "11": 2057079296.0, + "12": 2057079296.0, + "13": 2057079296.0, + "14": 2057079296.0, + "15": 2057079296.0, + "16": 2057079296.0, + "17": 2057079296.0, + "18": 2057079296.0, + "19": 2057079296.0, + "20": 2057079296.0, + "21": 2057079296.0, + "22": 2057079296.0, + "23": 2057079296.0, + "24": 2057079296.0, + "25": 2057079296.0, + "26": 2057079296.0, + "27": 2057079296.0, + "28": 2057079296.0, + "29": 2057079296.0, + "30": 2057079296.0, + "31": 2057079296.0, + "32": 2057079296.0, + "33": 2057079296.0, + "34": 2057079296.0, + "35": 2057079296.0, + "36": 2057079296.0, + "37": 2057079296.0, + "38": 2057079296.0, + "39": 2057079296.0, + "40": 2057079296.0, + "41": 2057079296.0, + "42": 2057079296.0, + "43": 2057079296.0, + "44": 2057079296.0, + "45": 2057079296.0, + "46": 2057079296.0, + "47": 2057079296.0, + "48": 2057079296.0, + "49": 2057079296.0, + "50": 2057079296.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.78346, + "2": 0.3309, + "3": 0.26692, + "4": 0.30511, + "5": 0.25944, + "6": 0.25055, + "7": 0.26908, + "8": 0.24453, + "9": 0.23731, + "10": 0.24901, + "11": 0.23286, + "12": 0.22911, + "13": 0.2292, + "14": 0.23339, + "15": 0.24721, + "16": 0.24166, + "17": 0.22756, + "18": 0.2223, + "19": 0.22427, + "20": 0.23111, + "21": 0.23175, + "22": 0.2573, + "23": 0.24989, + "24": 0.23707, + "25": 0.23317, + "26": 0.23062, + "27": 0.22667, + "28": 0.24009, + "29": 0.22295, + "30": 0.22987, + "31": 0.25103, + "32": 0.24353, + "33": 0.22584, + "34": 0.23541, + "35": 0.23768, + "36": 0.22699, + "37": 0.22446, + "38": 0.24288, + "39": 0.22484, + "40": 0.2277, + "41": 0.23059, + "42": 0.22349, + "43": 0.23202, + "44": 0.23787, + "45": 0.24589, + "46": 0.27096, + "47": 0.23921, + "48": 0.24334, + "49": 0.24986, + "50": 0.24759 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..e4e01388a15 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.7999, + "2": 10.80046, + "3": 10.8089, + "4": 10.78245, + "5": 10.82504, + "6": 10.83657, + "7": 10.81628, + "8": 10.81184, + "9": 10.8108, + "10": 10.7742, + "11": 10.85482, + "12": 10.82663, + "13": 10.85131, + "14": 10.85461, + "15": 10.78253, + "16": 10.77375, + "17": 10.74989, + "18": 10.78346, + "19": 10.75877, + "20": 10.69982, + "21": 10.67287, + "22": 10.5142, + "23": 10.68053, + "24": 10.57164, + "25": 10.51814, + "26": 10.57591, + "27": 10.59136, + "28": 10.55398, + "29": 10.57104, + "30": 10.36425, + "31": 10.10945, + "32": 10.45329, + "33": 10.43693, + "34": 10.20011, + "35": 10.25443, + "36": 10.23318, + "37": 10.3536, + "38": 10.20421, + "39": 10.3993, + "40": 10.10241, + "41": 10.12765, + "42": 10.21115, + "43": 9.83746, + "44": 9.96186, + "45": 9.84266, + "46": 9.80686, + "47": 10.14266, + "48": 9.86672, + "49": 9.53822, + "50": 9.92595 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4752.0, + "2": 5040.0, + "3": 5112.0, + "4": 5072.0, + "5": 5472.0, + "6": 5619.0, + "7": 5255.0, + "8": 5065.0, + "9": 5483.0, + "10": 4607.0, + "11": 5862.0, + "12": 5377.0, + "13": 5783.0, + "14": 5830.0, + "15": 5249.0, + "16": 5346.0, + "17": 5291.0, + "18": 5277.0, + "19": 5352.0, + "20": 4942.0, + "21": 5465.0, + "22": 4878.0, + "23": 5807.0, + "24": 5145.0, + "25": 4873.0, + "26": 5380.0, + "27": 5479.0, + "28": 5739.0, + "29": 5950.0, + "30": 5363.0, + "31": 4730.0, + "32": 5732.0, + "33": 5963.0, + "34": 5261.0, + "35": 5660.0, + "36": 5422.0, + "37": 6362.0, + "38": 6114.0, + "39": 6803.0, + "40": 5731.0, + "41": 5808.0, + "42": 6485.0, + "43": 5742.0, + "44": 5843.0, + "45": 5876.0, + "46": 6024.0, + "47": 6554.0, + "48": 6354.0, + "49": 6497.0, + "50": 6526.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1144115200.0, + "2": 1144113152.0, + "3": 1144113664.0, + "4": 1144112640.0, + "5": 1144113664.0, + "6": 1144113152.0, + "7": 1144115200.0, + "8": 1144112640.0, + "9": 1144113152.0, + "10": 1144118272.0, + "11": 1144112640.0, + "12": 1144112128.0, + "13": 1144110592.0, + "14": 1144112640.0, + "15": 1144111616.0, + "16": 1144112640.0, + "17": 1144112128.0, + "18": 1144113152.0, + "19": 1144112640.0, + "20": 1144113664.0, + "21": 1144113152.0, + "22": 1144114176.0, + "23": 1144113664.0, + "24": 1144111616.0, + "25": 1144110592.0, + "26": 1144113664.0, + "27": 1144113664.0, + "28": 1144112128.0, + "29": 1144110080.0, + "30": 1144113152.0, + "31": 1144116224.0, + "32": 1144112128.0, + "33": 1144113152.0, + "34": 1144113664.0, + "35": 1144115712.0, + "36": 1144111616.0, + "37": 1144111104.0, + "38": 1144110592.0, + "39": 1144113664.0, + "40": 1144113664.0, + "41": 1144114176.0, + "42": 1144109056.0, + "43": 1144114176.0, + "44": 1144115200.0, + "45": 1144113152.0, + "46": 1144117760.0, + "47": 1144113152.0, + "48": 1144115712.0, + "49": 1144117760.0, + "50": 1144114176.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1593775104.0, + "2": 2049587200.0, + "3": 2050487808.0, + "4": 2050487808.0, + "5": 2050487808.0, + "6": 2051877376.0, + "7": 2052037632.0, + "8": 2052037632.0, + "9": 2053219840.0, + "10": 2055123968.0, + "11": 2055123968.0, + "12": 2055123968.0, + "13": 2055123968.0, + "14": 2055123968.0, + "15": 2055123968.0, + "16": 2055123968.0, + "17": 2055123968.0, + "18": 2055123968.0, + "19": 2055123968.0, + "20": 2055123968.0, + "21": 2055123968.0, + "22": 2055123968.0, + "23": 2055123968.0, + "24": 2055123968.0, + "25": 2055123968.0, + "26": 2055123968.0, + "27": 2055123968.0, + "28": 2055123968.0, + "29": 2055123968.0, + "30": 2055123968.0, + "31": 2055123968.0, + "32": 2055123968.0, + "33": 2055123968.0, + "34": 2055123968.0, + "35": 2055123968.0, + "36": 2055123968.0, + "37": 2055123968.0, + "38": 2055123968.0, + "39": 2055123968.0, + "40": 2055123968.0, + "41": 2055123968.0, + "42": 2055123968.0, + "43": 2055123968.0, + "44": 2055123968.0, + "45": 2055123968.0, + "46": 2055123968.0, + "47": 2055123968.0, + "48": 2055123968.0, + "49": 2055123968.0, + "50": 2055123968.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.54696, + "2": 0.35381, + "3": 0.30805, + "4": 0.32999, + "5": 0.28074, + "6": 0.27713, + "7": 0.30692, + "8": 0.27076, + "9": 0.28178, + "10": 0.28798, + "11": 0.26657, + "12": 0.27288, + "13": 0.27118, + "14": 0.26505, + "15": 0.27307, + "16": 0.26745, + "17": 0.28092, + "18": 0.25951, + "19": 0.26123, + "20": 0.27117, + "21": 0.26705, + "22": 0.27657, + "23": 0.2785, + "24": 0.27138, + "25": 0.27542, + "26": 0.26549, + "27": 0.26436, + "28": 0.2817, + "29": 0.26002, + "30": 0.26437, + "31": 0.29073, + "32": 0.27239, + "33": 0.26215, + "34": 0.2748, + "35": 0.2623, + "36": 0.25929, + "37": 0.26086, + "38": 0.26996, + "39": 0.25721, + "40": 0.25938, + "41": 0.26959, + "42": 0.25657, + "43": 0.26426, + "44": 0.25689, + "45": 0.26206, + "46": 0.27753, + "47": 0.27998, + "48": 0.26838, + "49": 0.27354, + "50": 0.26097 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..e3d20b7e9f0 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8277, + "2": 10.84068, + "3": 10.82725, + "4": 10.81926, + "5": 10.85722, + "6": 10.86986, + "7": 10.85174, + "8": 10.84457, + "9": 10.85329, + "10": 10.79198, + "11": 10.86553, + "12": 10.87133, + "13": 10.87076, + "14": 10.87887, + "15": 10.82554, + "16": 10.81223, + "17": 10.77441, + "18": 10.81045, + "19": 10.79657, + "20": 10.72264, + "21": 10.69696, + "22": 10.55147, + "23": 10.7054, + "24": 10.59026, + "25": 10.54438, + "26": 10.60027, + "27": 10.61973, + "28": 10.5745, + "29": 10.58661, + "30": 10.35758, + "31": 10.12167, + "32": 10.46999, + "33": 10.45701, + "34": 10.21559, + "35": 10.27129, + "36": 10.23523, + "37": 10.35245, + "38": 10.20629, + "39": 10.40093, + "40": 10.09725, + "41": 10.13848, + "42": 10.21819, + "43": 9.84432, + "44": 9.9617, + "45": 9.84065, + "46": 9.8197, + "47": 10.13911, + "48": 9.85183, + "49": 9.53564, + "50": 9.92448 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4680.0, + "2": 4942.0, + "3": 4820.0, + "4": 4878.0, + "5": 5470.0, + "6": 5474.0, + "7": 5224.0, + "8": 4738.0, + "9": 5223.0, + "10": 4223.0, + "11": 5625.0, + "12": 5287.0, + "13": 5621.0, + "14": 5408.0, + "15": 5262.0, + "16": 5461.0, + "17": 5216.0, + "18": 5076.0, + "19": 5238.0, + "20": 4985.0, + "21": 5432.0, + "22": 4799.0, + "23": 5740.0, + "24": 5056.0, + "25": 4935.0, + "26": 5264.0, + "27": 5417.0, + "28": 5800.0, + "29": 5904.0, + "30": 5454.0, + "31": 4819.0, + "32": 5859.0, + "33": 6012.0, + "34": 5038.0, + "35": 5618.0, + "36": 5650.0, + "37": 6312.0, + "38": 6183.0, + "39": 6590.0, + "40": 5923.0, + "41": 5990.0, + "42": 6285.0, + "43": 5816.0, + "44": 5809.0, + "45": 5685.0, + "46": 5951.0, + "47": 6413.0, + "48": 6367.0, + "49": 6227.0, + "50": 6746.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114761216.0, + "2": 1114759680.0, + "3": 1114756608.0, + "4": 1114760192.0, + "5": 1114757120.0, + "6": 1114757632.0, + "7": 1114759168.0, + "8": 1114757632.0, + "9": 1114759680.0, + "10": 1114759168.0, + "11": 1114759168.0, + "12": 1114758144.0, + "13": 1114763264.0, + "14": 1114757120.0, + "15": 1114760192.0, + "16": 1114758144.0, + "17": 1114757120.0, + "18": 1114760192.0, + "19": 1114758144.0, + "20": 1114758656.0, + "21": 1114757120.0, + "22": 1114758144.0, + "23": 1114758144.0, + "24": 1114760704.0, + "25": 1114758144.0, + "26": 1114761216.0, + "27": 1114758656.0, + "28": 1114759680.0, + "29": 1114760704.0, + "30": 1114757120.0, + "31": 1114763776.0, + "32": 1114758656.0, + "33": 1114757120.0, + "34": 1114758656.0, + "35": 1114761216.0, + "36": 1114756608.0, + "37": 1114758144.0, + "38": 1114760192.0, + "39": 1114757632.0, + "40": 1114759680.0, + "41": 1114760192.0, + "42": 1114761216.0, + "43": 1114760704.0, + "44": 1114760192.0, + "45": 1114758656.0, + "46": 1114760192.0, + "47": 1114759680.0, + "48": 1114757120.0, + "49": 1114755072.0, + "50": 1114760704.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563573248.0, + "2": 2019811840.0, + "3": 2019811840.0, + "4": 2020643840.0, + "5": 2020643840.0, + "6": 2020643840.0, + "7": 2020643840.0, + "8": 2020643840.0, + "9": 2020643840.0, + "10": 2024514560.0, + "11": 2024514560.0, + "12": 2024514560.0, + "13": 2025665536.0, + "14": 2025665536.0, + "15": 2025665536.0, + "16": 2025665536.0, + "17": 2025665536.0, + "18": 2025665536.0, + "19": 2025665536.0, + "20": 2025665536.0, + "21": 2025665536.0, + "22": 2025665536.0, + "23": 2025665536.0, + "24": 2025665536.0, + "25": 2025665536.0, + "26": 2025665536.0, + "27": 2025665536.0, + "28": 2025665536.0, + "29": 2025665536.0, + "30": 2025665536.0, + "31": 2028067328.0, + "32": 2028067328.0, + "33": 2028067328.0, + "34": 2028067328.0, + "35": 2028067328.0, + "36": 2028067328.0, + "37": 2028067328.0, + "38": 2028067328.0, + "39": 2028067328.0, + "40": 2028067328.0, + "41": 2028067328.0, + "42": 2028067328.0, + "43": 2028067328.0, + "44": 2028067328.0, + "45": 2028067328.0, + "46": 2028067328.0, + "47": 2028067328.0, + "48": 2028067328.0, + "49": 2028067328.0, + "50": 2028067328.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.27277, + "2": 0.40327, + "3": 0.34567, + "4": 0.33458, + "5": 0.33204, + "6": 0.33309, + "7": 0.33268, + "8": 0.33286, + "9": 0.33475, + "10": 0.3322, + "11": 0.33002, + "12": 0.33139, + "13": 0.32988, + "14": 0.32847, + "15": 0.329, + "16": 0.33243, + "17": 0.32814, + "18": 0.32942, + "19": 0.33246, + "20": 0.32858, + "21": 0.32917, + "22": 0.34065, + "23": 0.32906, + "24": 0.33021, + "25": 0.33765, + "26": 0.32931, + "27": 0.32935, + "28": 0.33465, + "29": 0.32924, + "30": 0.32887, + "31": 0.33235, + "32": 0.32882, + "33": 0.33484, + "34": 0.33959, + "35": 0.33548, + "36": 0.33621, + "37": 0.33811, + "38": 0.33082, + "39": 0.33203, + "40": 0.33659, + "41": 0.33085, + "42": 0.33009, + "43": 0.33311, + "44": 0.32891, + "45": 0.32947, + "46": 0.33546, + "47": 0.32941, + "48": 0.32968, + "49": 0.33644, + "50": 0.3272 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..ba66ccd2c7b --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8277, + "2": 10.84068, + "3": 10.82724, + "4": 10.81923, + "5": 10.85713, + "6": 10.87014, + "7": 10.85172, + "8": 10.84521, + "9": 10.85279, + "10": 10.79234, + "11": 10.86534, + "12": 10.87114, + "13": 10.87049, + "14": 10.87874, + "15": 10.82545, + "16": 10.81195, + "17": 10.77413, + "18": 10.81121, + "19": 10.79683, + "20": 10.72265, + "21": 10.69712, + "22": 10.55129, + "23": 10.70543, + "24": 10.58987, + "25": 10.54438, + "26": 10.60004, + "27": 10.62008, + "28": 10.57416, + "29": 10.58628, + "30": 10.35718, + "31": 10.12186, + "32": 10.47004, + "33": 10.457, + "34": 10.21604, + "35": 10.27123, + "36": 10.23567, + "37": 10.35221, + "38": 10.20618, + "39": 10.40139, + "40": 10.09681, + "41": 10.13873, + "42": 10.21803, + "43": 9.84419, + "44": 9.96192, + "45": 9.84135, + "46": 9.81933, + "47": 10.13938, + "48": 9.85137, + "49": 9.53548, + "50": 9.92432 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4732.0, + "2": 4949.0, + "3": 4906.0, + "4": 4915.0, + "5": 5426.0, + "6": 5376.0, + "7": 5127.0, + "8": 4923.0, + "9": 5398.0, + "10": 4190.0, + "11": 5650.0, + "12": 5207.0, + "13": 5521.0, + "14": 5564.0, + "15": 5258.0, + "16": 5655.0, + "17": 5201.0, + "18": 5166.0, + "19": 5222.0, + "20": 4973.0, + "21": 5289.0, + "22": 4840.0, + "23": 5690.0, + "24": 4966.0, + "25": 4863.0, + "26": 5234.0, + "27": 5239.0, + "28": 5757.0, + "29": 5841.0, + "30": 5290.0, + "31": 4822.0, + "32": 5828.0, + "33": 6111.0, + "34": 5127.0, + "35": 5596.0, + "36": 5581.0, + "37": 6423.0, + "38": 6184.0, + "39": 6619.0, + "40": 5870.0, + "41": 6054.0, + "42": 6325.0, + "43": 5910.0, + "44": 5902.0, + "45": 5841.0, + "46": 6222.0, + "47": 6329.0, + "48": 6302.0, + "49": 6013.0, + "50": 6678.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114761216.0, + "2": 1114759680.0, + "3": 1114757632.0, + "4": 1114760192.0, + "5": 1114757120.0, + "6": 1114757632.0, + "7": 1114759680.0, + "8": 1114758144.0, + "9": 1114760192.0, + "10": 1114758656.0, + "11": 1114759168.0, + "12": 1114759168.0, + "13": 1114762752.0, + "14": 1114757120.0, + "15": 1114760192.0, + "16": 1114758144.0, + "17": 1114757120.0, + "18": 1114760192.0, + "19": 1114756608.0, + "20": 1114759168.0, + "21": 1114756608.0, + "22": 1114758144.0, + "23": 1114758144.0, + "24": 1114760704.0, + "25": 1114757120.0, + "26": 1114761216.0, + "27": 1114758656.0, + "28": 1114759680.0, + "29": 1114760704.0, + "30": 1114757632.0, + "31": 1114763776.0, + "32": 1114760192.0, + "33": 1114758144.0, + "34": 1114758656.0, + "35": 1114761216.0, + "36": 1114756608.0, + "37": 1114758144.0, + "38": 1114760192.0, + "39": 1114757632.0, + "40": 1114759168.0, + "41": 1114760192.0, + "42": 1114760192.0, + "43": 1114761728.0, + "44": 1114760192.0, + "45": 1114759680.0, + "46": 1114760192.0, + "47": 1114759680.0, + "48": 1114757120.0, + "49": 1114755072.0, + "50": 1114761216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563573248.0, + "2": 2019811840.0, + "3": 2019811840.0, + "4": 2020516864.0, + "5": 2020516864.0, + "6": 2020516864.0, + "7": 2020516864.0, + "8": 2020516864.0, + "9": 2020516864.0, + "10": 2023621120.0, + "11": 2023621120.0, + "12": 2023621120.0, + "13": 2025571840.0, + "14": 2025571840.0, + "15": 2025571840.0, + "16": 2025571840.0, + "17": 2025571840.0, + "18": 2025571840.0, + "19": 2025571840.0, + "20": 2025571840.0, + "21": 2025571840.0, + "22": 2025571840.0, + "23": 2025571840.0, + "24": 2025571840.0, + "25": 2025571840.0, + "26": 2025571840.0, + "27": 2025571840.0, + "28": 2025571840.0, + "29": 2025571840.0, + "30": 2025571840.0, + "31": 2027690496.0, + "32": 2027690496.0, + "33": 2027690496.0, + "34": 2027690496.0, + "35": 2027690496.0, + "36": 2027690496.0, + "37": 2027690496.0, + "38": 2027690496.0, + "39": 2027690496.0, + "40": 2027690496.0, + "41": 2027690496.0, + "42": 2027690496.0, + "43": 2027690496.0, + "44": 2027690496.0, + "45": 2027690496.0, + "46": 2027690496.0, + "47": 2027690496.0, + "48": 2027690496.0, + "49": 2027690496.0, + "50": 2027690496.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.26761, + "2": 0.46509, + "3": 0.33784, + "4": 0.32867, + "5": 0.32614, + "6": 0.3325, + "7": 0.32603, + "8": 0.32762, + "9": 0.33105, + "10": 0.3264, + "11": 0.32497, + "12": 0.33102, + "13": 0.32607, + "14": 0.32484, + "15": 0.32523, + "16": 0.33277, + "17": 0.33128, + "18": 0.32838, + "19": 0.32883, + "20": 0.32857, + "21": 0.32833, + "22": 0.32958, + "23": 0.32767, + "24": 0.32771, + "25": 0.32857, + "26": 0.32941, + "27": 0.33631, + "28": 0.3369, + "29": 0.32694, + "30": 0.32566, + "31": 0.32837, + "32": 0.32456, + "33": 0.32475, + "34": 0.33037, + "35": 0.32967, + "36": 0.33178, + "37": 0.32753, + "38": 0.324, + "39": 0.32398, + "40": 0.32822, + "41": 0.32419, + "42": 0.33155, + "43": 0.33488, + "44": 0.32987, + "45": 0.32872, + "46": 0.33575, + "47": 0.32897, + "48": 0.32935, + "49": 0.33172, + "50": 0.32626 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json index 0366fd2c402..d74ca1632d3 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.7999, - "5": 10.82494, - "10": 10.77362, - "15": 10.78226, - "20": 10.69951, - "25": 10.51731, + "2": 10.80046, + "3": 10.8086, + "4": 10.78211, + "5": 10.8253, + "6": 10.83613, + "7": 10.81656, + "8": 10.81172, + "9": 10.81127, + "10": 10.77365, + "11": 10.8551, + "12": 10.82716, + "13": 10.85093, + "14": 10.85516, + "15": 10.78294, + "16": 10.7735, + "17": 10.75018, + "18": 10.78378, + "19": 10.75892, + "20": 10.6994, + "21": 10.67278, + "22": 10.51458, + "23": 10.68081, + "24": 10.57159, + "25": 10.51778, + "26": 10.57633, + "27": 10.59163, + "28": 10.55359, + "29": 10.57084, "30": 10.3646, - "35": 10.25444, - "40": 10.10206, - "45": 9.84247, - "50": 9.92579 + "31": 10.1091, + "32": 10.45327, + "33": 10.43719, + "34": 10.20028, + "35": 10.25449, + "36": 10.23294, + "37": 10.35395, + "38": 10.20435, + "39": 10.3991, + "40": 10.10257, + "41": 10.12803, + "42": 10.21095, + "43": 9.83714, + "44": 9.96175, + "45": 9.84268, + "46": 9.80685, + "47": 10.14284, + "48": 9.86671, + "49": 9.53845, + "50": 9.92551 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 4776.0, - "5": 5514.0, - "10": 4403.0, - "15": 5180.0, - "20": 4969.0, - "25": 5011.0, - "30": 5227.0, - "35": 5579.0, - "40": 5764.0, - "45": 5881.0, - "50": 6673.0 + "1": 4814.0, + "2": 4952.0, + "3": 5040.0, + "4": 5015.0, + "5": 5519.0, + "6": 5551.0, + "7": 5268.0, + "8": 4810.0, + "9": 5397.0, + "10": 4501.0, + "11": 5891.0, + "12": 5339.0, + "13": 5837.0, + "14": 5809.0, + "15": 5355.0, + "16": 5453.0, + "17": 5423.0, + "18": 5110.0, + "19": 5401.0, + "20": 4905.0, + "21": 5349.0, + "22": 4914.0, + "23": 5700.0, + "24": 5043.0, + "25": 4863.0, + "26": 5343.0, + "27": 5411.0, + "28": 5792.0, + "29": 6026.0, + "30": 5282.0, + "31": 4823.0, + "32": 5676.0, + "33": 6043.0, + "34": 5245.0, + "35": 5629.0, + "36": 5372.0, + "37": 6399.0, + "38": 5915.0, + "39": 6572.0, + "40": 5759.0, + "41": 5969.0, + "42": 6425.0, + "43": 5757.0, + "44": 5808.0, + "45": 5780.0, + "46": 6040.0, + "47": 6533.0, + "48": 6375.0, + "49": 6343.0, + "50": 6648.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1145716736.0, - "5": 1146210304.0, - "10": 1146214400.0, - "15": 1145712640.0, + "2": 1145714688.0, + "3": 1146211840.0, + "4": 1145713152.0, + "5": 1146210816.0, + "6": 1145713664.0, + "7": 1145717248.0, + "8": 1145713664.0, + "9": 1145714688.0, + "10": 1146214912.0, + "11": 1145714176.0, + "12": 1145714176.0, + "13": 1146208768.0, + "14": 1146209280.0, + "15": 1145713152.0, + "16": 1146210304.0, + "17": 1145713664.0, + "18": 1146209280.0, + "19": 1145714176.0, "20": 1145715200.0, + "21": 1146210304.0, + "22": 1145715712.0, + "23": 1145715200.0, + "24": 1145713152.0, "25": 1145712128.0, + "26": 1145715200.0, + "27": 1145715200.0, + "28": 1145713664.0, + "29": 1145711616.0, "30": 1145714688.0, - "35": 1146213376.0, + "31": 1145717760.0, + "32": 1145713664.0, + "33": 1145714688.0, + "34": 1145715200.0, + "35": 1146212352.0, + "36": 1145713152.0, + "37": 1145712128.0, + "38": 1146208256.0, + "39": 1145715200.0, "40": 1146210816.0, - "45": 1146210304.0, - "50": 1146211328.0 + "41": 1145715712.0, + "42": 1145712640.0, + "43": 1146211840.0, + "44": 1145716736.0, + "45": 1146209280.0, + "46": 1146214400.0, + "47": 1145714688.0, + "48": 1145717760.0, + "49": 1146215424.0, + "50": 1145716224.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 1593766912.0, - "5": 2052878848.0, - "10": 2057082880.0, - "15": 2057082880.0, - "20": 2057082880.0, - "25": 2057082880.0, - "30": 2057082880.0, - "35": 2057082880.0, - "40": 2057082880.0, - "45": 2057082880.0, - "50": 2057082880.0 + "1": 1593775104.0, + "2": 2051463168.0, + "3": 2052884992.0, + "4": 2052884992.0, + "5": 2052884992.0, + "6": 2053490176.0, + "7": 2054021632.0, + "8": 2054517248.0, + "9": 2057131520.0, + "10": 2057131520.0, + "11": 2057131520.0, + "12": 2057131520.0, + "13": 2057131520.0, + "14": 2057131520.0, + "15": 2057131520.0, + "16": 2057131520.0, + "17": 2057131520.0, + "18": 2057131520.0, + "19": 2057131520.0, + "20": 2057131520.0, + "21": 2057131520.0, + "22": 2057131520.0, + "23": 2057131520.0, + "24": 2057131520.0, + "25": 2057131520.0, + "26": 2057131520.0, + "27": 2057131520.0, + "28": 2057131520.0, + "29": 2057131520.0, + "30": 2057131520.0, + "31": 2057131520.0, + "32": 2057131520.0, + "33": 2057131520.0, + "34": 2057131520.0, + "35": 2057131520.0, + "36": 2057131520.0, + "37": 2057131520.0, + "38": 2057131520.0, + "39": 2057131520.0, + "40": 2057131520.0, + "41": 2057131520.0, + "42": 2057131520.0, + "43": 2057131520.0, + "44": 2057131520.0, + "45": 2057131520.0, + "46": 2057131520.0, + "47": 2057131520.0, + "48": 2057131520.0, + "49": 2057131520.0, + "50": 2057131520.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 16.10299, - "5": 0.23726, - "10": 0.2493, - "15": 0.24042, - "20": 0.23243, - "25": 0.23678, - "30": 0.22651, - "35": 0.24325, - "40": 0.23894, - "45": 0.23878, - "50": 0.24489 + "1": 17.92077, + "2": 0.34824, + "3": 0.30032, + "4": 0.32972, + "5": 0.27324, + "6": 0.26945, + "7": 0.29877, + "8": 0.27354, + "9": 0.26617, + "10": 0.28282, + "11": 0.26525, + "12": 0.2586, + "13": 0.27078, + "14": 0.25807, + "15": 0.27244, + "16": 0.26017, + "17": 0.27564, + "18": 0.26003, + "19": 0.25894, + "20": 0.26689, + "21": 0.26403, + "22": 0.26923, + "23": 0.27423, + "24": 0.25699, + "25": 0.26351, + "26": 0.26238, + "27": 0.26331, + "28": 0.27004, + "29": 0.2532, + "30": 0.2563, + "31": 0.27893, + "32": 0.27696, + "33": 0.25765, + "34": 0.27112, + "35": 0.26525, + "36": 0.25555, + "37": 0.25575, + "38": 0.26372, + "39": 0.25643, + "40": 0.25561, + "41": 0.26327, + "42": 0.25857, + "43": 0.26139, + "44": 0.26205, + "45": 0.25417, + "46": 0.28594, + "47": 0.27128, + "48": 0.2658, + "49": 0.27152, + "50": 0.26917 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..d48956be89e --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82721, + "2": 10.84035, + "3": 10.82731, + "4": 10.8193, + "5": 10.85656, + "6": 10.86991, + "7": 10.85176, + "8": 10.84458, + "9": 10.85252, + "10": 10.79217, + "11": 10.86529, + "12": 10.87083, + "13": 10.87071, + "14": 10.87878, + "15": 10.8256, + "16": 10.81248, + "17": 10.77483, + "18": 10.81066, + "19": 10.79672, + "20": 10.72242, + "21": 10.69688, + "22": 10.55103, + "23": 10.70528, + "24": 10.58973, + "25": 10.54425, + "26": 10.60032, + "27": 10.61999, + "28": 10.57405, + "29": 10.58627, + "30": 10.35725, + "31": 10.12171, + "32": 10.46994, + "33": 10.45695, + "34": 10.21593, + "35": 10.27139, + "36": 10.23585, + "37": 10.35223, + "38": 10.2059, + "39": 10.40125, + "40": 10.09684, + "41": 10.13886, + "42": 10.21812, + "43": 9.844, + "44": 9.96181, + "45": 9.84089, + "46": 9.81931, + "47": 10.13885, + "48": 9.85137, + "49": 9.53541, + "50": 9.92461 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4670.0, + "2": 4925.0, + "3": 4817.0, + "4": 4835.0, + "5": 5226.0, + "6": 5495.0, + "7": 5198.0, + "8": 4891.0, + "9": 5214.0, + "10": 4166.0, + "11": 5633.0, + "12": 5315.0, + "13": 5554.0, + "14": 5559.0, + "15": 5192.0, + "16": 5394.0, + "17": 5248.0, + "18": 5006.0, + "19": 5237.0, + "20": 4719.0, + "21": 5259.0, + "22": 4964.0, + "23": 5678.0, + "24": 4965.0, + "25": 4888.0, + "26": 5299.0, + "27": 5130.0, + "28": 5735.0, + "29": 5988.0, + "30": 5407.0, + "31": 4663.0, + "32": 5678.0, + "33": 6177.0, + "34": 5149.0, + "35": 5654.0, + "36": 5646.0, + "37": 6416.0, + "38": 6119.0, + "39": 6544.0, + "40": 5933.0, + "41": 5933.0, + "42": 6358.0, + "43": 5750.0, + "44": 5789.0, + "45": 5877.0, + "46": 6198.0, + "47": 6488.0, + "48": 6231.0, + "49": 6062.0, + "50": 6752.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114775040.0, + "2": 1114774528.0, + "3": 1114772480.0, + "4": 1114774016.0, + "5": 1114770944.0, + "6": 1114771456.0, + "7": 1114772480.0, + "8": 1114771968.0, + "9": 1114774016.0, + "10": 1114771968.0, + "11": 1114773504.0, + "12": 1114772480.0, + "13": 1114776064.0, + "14": 1114770944.0, + "15": 1114774016.0, + "16": 1114771968.0, + "17": 1114770944.0, + "18": 1114774528.0, + "19": 1115379712.0, + "20": 1114772480.0, + "21": 1114772480.0, + "22": 1114771968.0, + "23": 1114771968.0, + "24": 1114775552.0, + "25": 1114771968.0, + "26": 1114774528.0, + "27": 1114772480.0, + "28": 1114773504.0, + "29": 1114774528.0, + "30": 1114770944.0, + "31": 1114777600.0, + "32": 1114773504.0, + "33": 1114770944.0, + "34": 1114772480.0, + "35": 1114775040.0, + "36": 1114770944.0, + "37": 1114771968.0, + "38": 1114772992.0, + "39": 1114771456.0, + "40": 1114774016.0, + "41": 1114774016.0, + "42": 1114775040.0, + "43": 1114775552.0, + "44": 1114774016.0, + "45": 1114772480.0, + "46": 1114774528.0, + "47": 1114772480.0, + "48": 1114770944.0, + "49": 1114768896.0, + "50": 1114774528.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563110912.0, + "2": 2019832832.0, + "3": 2019832832.0, + "4": 2020812288.0, + "5": 2020812288.0, + "6": 2020812288.0, + "7": 2020812288.0, + "8": 2020812288.0, + "9": 2020812288.0, + "10": 2024751616.0, + "11": 2024751616.0, + "12": 2024751616.0, + "13": 2026121728.0, + "14": 2026121728.0, + "15": 2026121728.0, + "16": 2026121728.0, + "17": 2026121728.0, + "18": 2026121728.0, + "19": 2026121728.0, + "20": 2026121728.0, + "21": 2026121728.0, + "22": 2026121728.0, + "23": 2026121728.0, + "24": 2026121728.0, + "25": 2026121728.0, + "26": 2026121728.0, + "27": 2026121728.0, + "28": 2026121728.0, + "29": 2026121728.0, + "30": 2026121728.0, + "31": 2028742656.0, + "32": 2028742656.0, + "33": 2028742656.0, + "34": 2028742656.0, + "35": 2028742656.0, + "36": 2028742656.0, + "37": 2028742656.0, + "38": 2028742656.0, + "39": 2028742656.0, + "40": 2028742656.0, + "41": 2028742656.0, + "42": 2028742656.0, + "43": 2028742656.0, + "44": 2028742656.0, + "45": 2028742656.0, + "46": 2028742656.0, + "47": 2028742656.0, + "48": 2028742656.0, + "49": 2028742656.0, + "50": 2028742656.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12.92068, + "2": 0.40425, + "3": 0.34949, + "4": 0.34585, + "5": 0.34357, + "6": 0.34307, + "7": 0.34349, + "8": 0.34363, + "9": 0.34455, + "10": 0.34336, + "11": 0.34249, + "12": 0.34279, + "13": 0.34314, + "14": 0.34376, + "15": 0.34119, + "16": 0.3408, + "17": 0.34177, + "18": 0.34009, + "19": 0.38762, + "20": 0.38864, + "21": 0.35834, + "22": 0.34233, + "23": 0.34258, + "24": 0.33896, + "25": 0.34661, + "26": 0.35239, + "27": 0.36394, + "28": 0.36314, + "29": 0.36104, + "30": 0.36054, + "31": 0.36036, + "32": 0.36349, + "33": 0.35945, + "34": 0.36271, + "35": 0.35678, + "36": 0.34046, + "37": 0.34187, + "38": 0.35806, + "39": 0.39525, + "40": 0.3435, + "41": 0.34593, + "42": 0.34164, + "43": 0.3405, + "44": 0.36624, + "45": 0.3662, + "46": 0.35554, + "47": 0.39304, + "48": 0.3749, + "49": 0.34201, + "50": 0.34231 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..bf890527985 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82721, + "2": 10.84035, + "3": 10.82731, + "4": 10.81928, + "5": 10.85683, + "6": 10.8698, + "7": 10.85147, + "8": 10.84484, + "9": 10.85252, + "10": 10.79142, + "11": 10.86555, + "12": 10.871, + "13": 10.87036, + "14": 10.87845, + "15": 10.82569, + "16": 10.81221, + "17": 10.7744, + "18": 10.81066, + "19": 10.79634, + "20": 10.7227, + "21": 10.6971, + "22": 10.55121, + "23": 10.70525, + "24": 10.59041, + "25": 10.54452, + "26": 10.60048, + "27": 10.62034, + "28": 10.57457, + "29": 10.58623, + "30": 10.35753, + "31": 10.12178, + "32": 10.46993, + "33": 10.45705, + "34": 10.21585, + "35": 10.27128, + "36": 10.23542, + "37": 10.35235, + "38": 10.20634, + "39": 10.40108, + "40": 10.09667, + "41": 10.1389, + "42": 10.21808, + "43": 9.8441, + "44": 9.96205, + "45": 9.84118, + "46": 9.81927, + "47": 10.13911, + "48": 9.85152, + "49": 9.53526, + "50": 9.92459 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4746.0, + "2": 4973.0, + "3": 4892.0, + "4": 4874.0, + "5": 5506.0, + "6": 5432.0, + "7": 5176.0, + "8": 4842.0, + "9": 5339.0, + "10": 4379.0, + "11": 5515.0, + "12": 5341.0, + "13": 5380.0, + "14": 5634.0, + "15": 5225.0, + "16": 5387.0, + "17": 5339.0, + "18": 5069.0, + "19": 5247.0, + "20": 4850.0, + "21": 5323.0, + "22": 4896.0, + "23": 5748.0, + "24": 5014.0, + "25": 4847.0, + "26": 5322.0, + "27": 5362.0, + "28": 5664.0, + "29": 6074.0, + "30": 5529.0, + "31": 4774.0, + "32": 5603.0, + "33": 5954.0, + "34": 5052.0, + "35": 5715.0, + "36": 5575.0, + "37": 6245.0, + "38": 6130.0, + "39": 6515.0, + "40": 5938.0, + "41": 5907.0, + "42": 6316.0, + "43": 5659.0, + "44": 5930.0, + "45": 5838.0, + "46": 6112.0, + "47": 6528.0, + "48": 6294.0, + "49": 6282.0, + "50": 6606.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114775040.0, + "2": 1114774528.0, + "3": 1114772480.0, + "4": 1114774016.0, + "5": 1114770944.0, + "6": 1114771456.0, + "7": 1114772992.0, + "8": 1114771456.0, + "9": 1114774016.0, + "10": 1114772992.0, + "11": 1114773504.0, + "12": 1114772992.0, + "13": 1114776576.0, + "14": 1114770944.0, + "15": 1114774016.0, + "16": 1114774016.0, + "17": 1114770432.0, + "18": 1114774528.0, + "19": 1114770432.0, + "20": 1114772480.0, + "21": 1114771456.0, + "22": 1114771968.0, + "23": 1114771968.0, + "24": 1114775040.0, + "25": 1114770944.0, + "26": 1114774528.0, + "27": 1114772992.0, + "28": 1114774016.0, + "29": 1114774528.0, + "30": 1114770944.0, + "31": 1114777600.0, + "32": 1114773504.0, + "33": 1114771968.0, + "34": 1114772480.0, + "35": 1114775040.0, + "36": 1114770432.0, + "37": 1114771968.0, + "38": 1114772992.0, + "39": 1114770432.0, + "40": 1114774016.0, + "41": 1114775040.0, + "42": 1114774016.0, + "43": 1114774528.0, + "44": 1114774016.0, + "45": 1114772480.0, + "46": 1114774528.0, + "47": 1114773504.0, + "48": 1114770432.0, + "49": 1114769920.0, + "50": 1114775040.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563110912.0, + "2": 2019832832.0, + "3": 2019832832.0, + "4": 2020751872.0, + "5": 2020751872.0, + "6": 2020751872.0, + "7": 2020751872.0, + "8": 2020751872.0, + "9": 2020751872.0, + "10": 2024683008.0, + "11": 2024683008.0, + "12": 2024683008.0, + "13": 2025170944.0, + "14": 2025170944.0, + "15": 2025170944.0, + "16": 2025170944.0, + "17": 2025170944.0, + "18": 2025170944.0, + "19": 2025170944.0, + "20": 2025170944.0, + "21": 2025170944.0, + "22": 2025170944.0, + "23": 2025170944.0, + "24": 2025170944.0, + "25": 2025170944.0, + "26": 2025170944.0, + "27": 2025170944.0, + "28": 2025170944.0, + "29": 2025170944.0, + "30": 2025170944.0, + "31": 2027281408.0, + "32": 2027281408.0, + "33": 2027281408.0, + "34": 2027281408.0, + "35": 2027281408.0, + "36": 2027281408.0, + "37": 2027281408.0, + "38": 2027281408.0, + "39": 2027281408.0, + "40": 2027281408.0, + "41": 2027281408.0, + "42": 2027281408.0, + "43": 2027281408.0, + "44": 2027281408.0, + "45": 2027281408.0, + "46": 2027281408.0, + "47": 2027281408.0, + "48": 2027281408.0, + "49": 2027281408.0, + "50": 2027281408.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.92351, + "2": 0.44162, + "3": 0.35291, + "4": 0.35092, + "5": 0.34453, + "6": 0.34539, + "7": 0.34192, + "8": 0.34196, + "9": 0.3428, + "10": 0.34254, + "11": 0.34053, + "12": 0.34338, + "13": 0.34149, + "14": 0.34237, + "15": 0.34549, + "16": 0.36487, + "17": 0.34819, + "18": 0.34282, + "19": 0.34387, + "20": 0.34346, + "21": 0.34257, + "22": 0.34498, + "23": 0.3426, + "24": 0.34129, + "25": 0.34497, + "26": 0.34552, + "27": 0.34229, + "28": 0.34963, + "29": 0.34554, + "30": 0.34365, + "31": 0.34384, + "32": 0.34359, + "33": 0.34344, + "34": 0.34432, + "35": 0.34398, + "36": 0.344, + "37": 0.34452, + "38": 0.34594, + "39": 0.34391, + "40": 0.34438, + "41": 0.34366, + "42": 0.34258, + "43": 0.34401, + "44": 0.34425, + "45": 0.34371, + "46": 0.34314, + "47": 0.34264, + "48": 0.34318, + "49": 0.34322, + "50": 0.34204 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0c2d8bc15ac --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.7999, + "2": 10.80046, + "3": 10.80882, + "4": 10.78271, + "5": 10.82527, + "6": 10.83559, + "7": 10.81654, + "8": 10.81189, + "9": 10.81027, + "10": 10.77395, + "11": 10.85546, + "12": 10.82687, + "13": 10.85063, + "14": 10.85519, + "15": 10.78219, + "16": 10.77344, + "17": 10.75025, + "18": 10.78337, + "19": 10.75865, + "20": 10.69949, + "21": 10.67201, + "22": 10.51454, + "23": 10.68053, + "24": 10.57151, + "25": 10.51842, + "26": 10.57602, + "27": 10.59131, + "28": 10.55338, + "29": 10.5705, + "30": 10.36499, + "31": 10.10913, + "32": 10.45347, + "33": 10.43732, + "34": 10.20004, + "35": 10.2548, + "36": 10.23345, + "37": 10.35402, + "38": 10.2041, + "39": 10.39978, + "40": 10.10252, + "41": 10.12783, + "42": 10.21103, + "43": 9.83757, + "44": 9.96217, + "45": 9.84252, + "46": 9.80674, + "47": 10.14274, + "48": 9.86654, + "49": 9.53815, + "50": 9.92567 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4865.0, + "2": 4889.0, + "3": 5053.0, + "4": 5128.0, + "5": 5538.0, + "6": 5637.0, + "7": 5195.0, + "8": 4942.0, + "9": 5569.0, + "10": 4503.0, + "11": 6001.0, + "12": 5343.0, + "13": 5607.0, + "14": 5820.0, + "15": 5246.0, + "16": 5419.0, + "17": 5489.0, + "18": 5301.0, + "19": 5323.0, + "20": 4805.0, + "21": 5272.0, + "22": 4832.0, + "23": 5649.0, + "24": 5122.0, + "25": 4835.0, + "26": 5369.0, + "27": 5430.0, + "28": 5771.0, + "29": 6155.0, + "30": 5193.0, + "31": 4946.0, + "32": 5822.0, + "33": 6136.0, + "34": 5157.0, + "35": 5508.0, + "36": 5439.0, + "37": 6566.0, + "38": 6146.0, + "39": 6504.0, + "40": 5752.0, + "41": 5973.0, + "42": 6371.0, + "43": 5634.0, + "44": 5975.0, + "45": 5779.0, + "46": 5939.0, + "47": 6534.0, + "48": 6362.0, + "49": 6390.0, + "50": 6421.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1144115200.0, + "2": 1144113152.0, + "3": 1144113664.0, + "4": 1144112640.0, + "5": 1144113664.0, + "6": 1144113664.0, + "7": 1144115200.0, + "8": 1144112128.0, + "9": 1144113152.0, + "10": 1144117248.0, + "11": 1144112640.0, + "12": 1144112640.0, + "13": 1144110592.0, + "14": 1144113664.0, + "15": 1144111616.0, + "16": 1144113152.0, + "17": 1144112128.0, + "18": 1144114176.0, + "19": 1144112640.0, + "20": 1144113664.0, + "21": 1144113152.0, + "22": 1144113664.0, + "23": 1144114176.0, + "24": 1144111616.0, + "25": 1144110592.0, + "26": 1144114688.0, + "27": 1144113664.0, + "28": 1144112128.0, + "29": 1144109568.0, + "30": 1144113152.0, + "31": 1144116224.0, + "32": 1144112128.0, + "33": 1144113152.0, + "34": 1144113664.0, + "35": 1144115712.0, + "36": 1144112128.0, + "37": 1144110592.0, + "38": 1144110592.0, + "39": 1144113664.0, + "40": 1144113664.0, + "41": 1144114176.0, + "42": 1144111104.0, + "43": 1144114176.0, + "44": 1144116224.0, + "45": 1144112640.0, + "46": 1144116736.0, + "47": 1144113152.0, + "48": 1144116224.0, + "49": 1144117760.0, + "50": 1144114688.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1593766912.0, + "2": 2049587200.0, + "3": 2050706944.0, + "4": 2050706944.0, + "5": 2050706944.0, + "6": 2051856896.0, + "7": 2052133888.0, + "8": 2052133888.0, + "9": 2053136896.0, + "10": 2054898688.0, + "11": 2054898688.0, + "12": 2054898688.0, + "13": 2054898688.0, + "14": 2054898688.0, + "15": 2054898688.0, + "16": 2054898688.0, + "17": 2054898688.0, + "18": 2054898688.0, + "19": 2054898688.0, + "20": 2054898688.0, + "21": 2054898688.0, + "22": 2054898688.0, + "23": 2054898688.0, + "24": 2054898688.0, + "25": 2054898688.0, + "26": 2054898688.0, + "27": 2054898688.0, + "28": 2054898688.0, + "29": 2054898688.0, + "30": 2054898688.0, + "31": 2054898688.0, + "32": 2054898688.0, + "33": 2054898688.0, + "34": 2054898688.0, + "35": 2054898688.0, + "36": 2054898688.0, + "37": 2054898688.0, + "38": 2054898688.0, + "39": 2054898688.0, + "40": 2054898688.0, + "41": 2054898688.0, + "42": 2054898688.0, + "43": 2054898688.0, + "44": 2054898688.0, + "45": 2054898688.0, + "46": 2054898688.0, + "47": 2054898688.0, + "48": 2054898688.0, + "49": 2054898688.0, + "50": 2054898688.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.95177, + "2": 0.34433, + "3": 0.26792, + "4": 0.28931, + "5": 0.24286, + "6": 0.23522, + "7": 0.26191, + "8": 0.24179, + "9": 0.23443, + "10": 0.2479, + "11": 0.22843, + "12": 0.23568, + "13": 0.22851, + "14": 0.22301, + "15": 0.23496, + "16": 0.22557, + "17": 0.23185, + "18": 0.22478, + "19": 0.21988, + "20": 0.22721, + "21": 0.22747, + "22": 0.25032, + "23": 0.23584, + "24": 0.22392, + "25": 0.24076, + "26": 0.22602, + "27": 0.21942, + "28": 0.25471, + "29": 0.22059, + "30": 0.22483, + "31": 0.24893, + "32": 0.23382, + "33": 0.2228, + "34": 0.24334, + "35": 0.22325, + "36": 0.22492, + "37": 0.22009, + "38": 0.22761, + "39": 0.22117, + "40": 0.22618, + "41": 0.23324, + "42": 0.23137, + "43": 0.23, + "44": 0.23628, + "45": 0.22927, + "46": 0.24977, + "47": 0.23757, + "48": 0.24069, + "49": 0.254, + "50": 0.23443 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d342471ff77 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.7999, + "2": 10.80046, + "3": 10.80877, + "4": 10.78226, + "5": 10.8254, + "6": 10.83596, + "7": 10.81676, + "8": 10.81163, + "9": 10.81106, + "10": 10.77366, + "11": 10.85495, + "12": 10.82711, + "13": 10.85109, + "14": 10.8546, + "15": 10.78267, + "16": 10.77358, + "17": 10.75036, + "18": 10.78319, + "19": 10.75876, + "20": 10.6992, + "21": 10.67244, + "22": 10.51382, + "23": 10.68112, + "24": 10.57174, + "25": 10.51756, + "26": 10.57624, + "27": 10.59185, + "28": 10.55401, + "29": 10.57113, + "30": 10.36465, + "31": 10.10866, + "32": 10.45338, + "33": 10.43764, + "34": 10.20033, + "35": 10.25433, + "36": 10.23362, + "37": 10.35369, + "38": 10.20443, + "39": 10.39917, + "40": 10.10245, + "41": 10.12765, + "42": 10.21106, + "43": 9.83722, + "44": 9.962, + "45": 9.84252, + "46": 9.80612, + "47": 10.14257, + "48": 9.86665, + "49": 9.5383, + "50": 9.92576 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4827.0, + "2": 4935.0, + "3": 5030.0, + "4": 4956.0, + "5": 5583.0, + "6": 5594.0, + "7": 5325.0, + "8": 5098.0, + "9": 5335.0, + "10": 4581.0, + "11": 5895.0, + "12": 5249.0, + "13": 5692.0, + "14": 5736.0, + "15": 5303.0, + "16": 5347.0, + "17": 5361.0, + "18": 5322.0, + "19": 5407.0, + "20": 4961.0, + "21": 5441.0, + "22": 4776.0, + "23": 5752.0, + "24": 5157.0, + "25": 4897.0, + "26": 5202.0, + "27": 5455.0, + "28": 5769.0, + "29": 5911.0, + "30": 5256.0, + "31": 4674.0, + "32": 5854.0, + "33": 6080.0, + "34": 5278.0, + "35": 5743.0, + "36": 5523.0, + "37": 6477.0, + "38": 5839.0, + "39": 6711.0, + "40": 5852.0, + "41": 6062.0, + "42": 6501.0, + "43": 5605.0, + "44": 5883.0, + "45": 5763.0, + "46": 6076.0, + "47": 6613.0, + "48": 6348.0, + "49": 6430.0, + "50": 6699.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1145716736.0, + "2": 1145714688.0, + "3": 1145715200.0, + "4": 1145714176.0, + "5": 1146210816.0, + "6": 1146210304.0, + "7": 1145716736.0, + "8": 1146209792.0, + "9": 1145714688.0, + "10": 1146214912.0, + "11": 1145714176.0, + "12": 1145713664.0, + "13": 1145712128.0, + "14": 1146209280.0, + "15": 1145713152.0, + "16": 1146210304.0, + "17": 1145713664.0, + "18": 1146210304.0, + "19": 1145714176.0, + "20": 1145715200.0, + "21": 1146210304.0, + "22": 1145715712.0, + "23": 1145716224.0, + "24": 1145713152.0, + "25": 1145712128.0, + "26": 1145715200.0, + "27": 1146210304.0, + "28": 1145713664.0, + "29": 1145711104.0, + "30": 1145714688.0, + "31": 1146213376.0, + "32": 1145713152.0, + "33": 1145714688.0, + "34": 1145714688.0, + "35": 1146213376.0, + "36": 1145713664.0, + "37": 1145712128.0, + "38": 1146207744.0, + "39": 1145715200.0, + "40": 1146210816.0, + "41": 1145714688.0, + "42": 1145711104.0, + "43": 1146211840.0, + "44": 1145717248.0, + "45": 1145714688.0, + "46": 1146214400.0, + "47": 1145714688.0, + "48": 1145717248.0, + "49": 1146214912.0, + "50": 1145716224.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1593766912.0, + "2": 2051463168.0, + "3": 2052584960.0, + "4": 2052584960.0, + "5": 2052584960.0, + "6": 2053404160.0, + "7": 2054199296.0, + "8": 2054199296.0, + "9": 2056971776.0, + "10": 2057138688.0, + "11": 2057138688.0, + "12": 2057138688.0, + "13": 2057138688.0, + "14": 2057138688.0, + "15": 2057138688.0, + "16": 2057138688.0, + "17": 2057138688.0, + "18": 2057138688.0, + "19": 2057138688.0, + "20": 2057138688.0, + "21": 2057138688.0, + "22": 2057138688.0, + "23": 2057138688.0, + "24": 2057138688.0, + "25": 2057138688.0, + "26": 2057138688.0, + "27": 2057138688.0, + "28": 2057138688.0, + "29": 2057138688.0, + "30": 2057138688.0, + "31": 2057138688.0, + "32": 2057138688.0, + "33": 2057138688.0, + "34": 2057138688.0, + "35": 2057138688.0, + "36": 2057138688.0, + "37": 2057138688.0, + "38": 2057138688.0, + "39": 2057138688.0, + "40": 2057138688.0, + "41": 2057138688.0, + "42": 2057138688.0, + "43": 2057138688.0, + "44": 2057138688.0, + "45": 2057138688.0, + "46": 2057138688.0, + "47": 2057138688.0, + "48": 2057138688.0, + "49": 2057138688.0, + "50": 2057138688.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.99317, + "2": 0.35408, + "3": 0.30455, + "4": 0.32631, + "5": 0.27174, + "6": 0.27168, + "7": 0.29847, + "8": 0.27152, + "9": 0.27606, + "10": 0.27991, + "11": 0.25875, + "12": 0.25854, + "13": 0.26351, + "14": 0.2599, + "15": 0.26827, + "16": 0.25734, + "17": 0.26876, + "18": 0.26302, + "19": 0.25791, + "20": 0.26587, + "21": 0.26207, + "22": 0.2718, + "23": 0.27036, + "24": 0.2557, + "25": 0.27098, + "26": 0.2562, + "27": 0.25663, + "28": 0.28209, + "29": 0.25678, + "30": 0.26198, + "31": 0.27896, + "32": 0.26879, + "33": 0.25449, + "34": 0.27377, + "35": 0.25725, + "36": 0.25349, + "37": 0.2537, + "38": 0.26246, + "39": 0.25527, + "40": 0.25676, + "41": 0.26427, + "42": 0.25718, + "43": 0.26206, + "44": 0.25615, + "45": 0.261, + "46": 0.28413, + "47": 0.27633, + "48": 0.26455, + "49": 0.2706, + "50": 0.25944 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json index 99b3ed41c91..4383c914d8e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.8277, - "5": 10.85649, - "10": 10.79211, - "15": 10.82563, - "20": 10.72221, - "25": 10.54409, - "30": 10.35728, - "35": 10.2714, - "40": 10.09718, - "45": 9.8411, - "50": 9.92428 + "2": 10.84068, + "3": 10.82705, + "4": 10.81913, + "5": 10.85673, + "6": 10.86984, + "7": 10.85119, + "8": 10.84465, + "9": 10.85269, + "10": 10.79157, + "11": 10.86571, + "12": 10.87169, + "13": 10.8708, + "14": 10.8787, + "15": 10.82554, + "16": 10.81251, + "17": 10.77478, + "18": 10.81068, + "19": 10.79632, + "20": 10.72175, + "21": 10.69765, + "22": 10.55138, + "23": 10.70555, + "24": 10.59005, + "25": 10.54425, + "26": 10.60036, + "27": 10.61973, + "28": 10.57442, + "29": 10.58656, + "30": 10.35754, + "31": 10.12169, + "32": 10.46987, + "33": 10.45722, + "34": 10.2158, + "35": 10.27086, + "36": 10.2354, + "37": 10.35246, + "38": 10.20574, + "39": 10.40061, + "40": 10.09681, + "41": 10.13869, + "42": 10.21829, + "43": 9.84428, + "44": 9.9614, + "45": 9.84116, + "46": 9.81955, + "47": 10.13927, + "48": 9.85138, + "49": 9.53518, + "50": 9.92455 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 4700.0, - "5": 5362.0, - "10": 4330.0, - "15": 5291.0, - "20": 4879.0, - "25": 4857.0, - "30": 5475.0, - "35": 5683.0, - "40": 5899.0, - "45": 5910.0, - "50": 6643.0 + "1": 4627.0, + "2": 4785.0, + "3": 4887.0, + "4": 5134.0, + "5": 5403.0, + "6": 5457.0, + "7": 5140.0, + "8": 4876.0, + "9": 5213.0, + "10": 4396.0, + "11": 5749.0, + "12": 5182.0, + "13": 5436.0, + "14": 5431.0, + "15": 5327.0, + "16": 5452.0, + "17": 5245.0, + "18": 5116.0, + "19": 5216.0, + "20": 4869.0, + "21": 5326.0, + "22": 4832.0, + "23": 5719.0, + "24": 5017.0, + "25": 4980.0, + "26": 5288.0, + "27": 5346.0, + "28": 5727.0, + "29": 5937.0, + "30": 5289.0, + "31": 4777.0, + "32": 5616.0, + "33": 6137.0, + "34": 5140.0, + "35": 5690.0, + "36": 5739.0, + "37": 6425.0, + "38": 5962.0, + "39": 6620.0, + "40": 5921.0, + "41": 5820.0, + "42": 6472.0, + "43": 5860.0, + "44": 5731.0, + "45": 5769.0, + "46": 6130.0, + "47": 6576.0, + "48": 6403.0, + "49": 6084.0, + "50": 6648.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 1116857344.0, - "5": 1116853248.0, - "10": 1116854784.0, - "15": 1116856320.0, - "20": 1116853760.0, - "25": 1116854272.0, - "30": 1116853248.0, - "35": 1116857344.0, - "40": 1116855808.0, - "45": 1116854784.0, - "50": 1116857856.0 + "1": 1115810816.0, + "2": 1115809280.0, + "3": 1115807232.0, + "4": 1115809792.0, + "5": 1115806720.0, + "6": 1115807232.0, + "7": 1115808768.0, + "8": 1115807744.0, + "9": 1115809792.0, + "10": 1115808768.0, + "11": 1115808768.0, + "12": 1115808256.0, + "13": 1115811840.0, + "14": 1115807232.0, + "15": 1115809792.0, + "16": 1115808768.0, + "17": 1115806720.0, + "18": 1115809792.0, + "19": 1115806208.0, + "20": 1115808256.0, + "21": 1115806208.0, + "22": 1115807744.0, + "23": 1115807744.0, + "24": 1115810304.0, + "25": 1115807744.0, + "26": 1115810304.0, + "27": 1115808256.0, + "28": 1115809280.0, + "29": 1115810304.0, + "30": 1115806720.0, + "31": 1115813376.0, + "32": 1115809792.0, + "33": 1115807744.0, + "34": 1115808256.0, + "35": 1115810816.0, + "36": 1115806208.0, + "37": 1115807744.0, + "38": 1115809792.0, + "39": 1115807232.0, + "40": 1115809792.0, + "41": 1115810816.0, + "42": 1115810816.0, + "43": 1115811328.0, + "44": 1115809792.0, + "45": 1115808768.0, + "46": 1115810304.0, + "47": 1115808256.0, + "48": 1115806208.0, + "49": 1115805184.0, + "50": 1115811328.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1562923008.0, - "5": 2023396352.0, - "10": 2024858112.0, - "15": 2026634240.0, - "20": 2026634240.0, - "25": 2026634240.0, - "30": 2026634240.0, - "35": 2029936128.0, - "40": 2029936128.0, - "45": 2029936128.0, - "50": 2029936128.0 + "2": 2021974528.0, + "3": 2021974528.0, + "4": 2023057408.0, + "5": 2023057408.0, + "6": 2023057408.0, + "7": 2023057408.0, + "8": 2023057408.0, + "9": 2023057408.0, + "10": 2026853376.0, + "11": 2026853376.0, + "12": 2026853376.0, + "13": 2026853376.0, + "14": 2026853376.0, + "15": 2026853376.0, + "16": 2026853376.0, + "17": 2026853376.0, + "18": 2026853376.0, + "19": 2026853376.0, + "20": 2026853376.0, + "21": 2026964992.0, + "22": 2026964992.0, + "23": 2026964992.0, + "24": 2026964992.0, + "25": 2026964992.0, + "26": 2026964992.0, + "27": 2026964992.0, + "28": 2026964992.0, + "29": 2026964992.0, + "30": 2026964992.0, + "31": 2030492160.0, + "32": 2030492160.0, + "33": 2030492160.0, + "34": 2030492160.0, + "35": 2030492160.0, + "36": 2030492160.0, + "37": 2030492160.0, + "38": 2030492160.0, + "39": 2030492160.0, + "40": 2030492160.0, + "41": 2030492160.0, + "42": 2030492160.0, + "43": 2030492160.0, + "44": 2030492160.0, + "45": 2030492160.0, + "46": 2030492160.0, + "47": 2030492160.0, + "48": 2030492160.0, + "49": 2030492160.0, + "50": 2030492160.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 21.72442, - "5": 0.36486, - "10": 0.36609, - "15": 0.36152, - "20": 0.36301, - "25": 0.36085, - "30": 0.36083, - "35": 0.36317, - "40": 0.35895, - "45": 0.35462, - "50": 0.34937 + "1": 18.3953, + "2": 0.37892, + "3": 0.34007, + "4": 0.3355, + "5": 0.33186, + "6": 0.33483, + "7": 0.3277, + "8": 0.32755, + "9": 0.32791, + "10": 0.32415, + "11": 0.32272, + "12": 0.32392, + "13": 0.33508, + "14": 0.31609, + "15": 0.31941, + "16": 0.3178, + "17": 0.31692, + "18": 0.31834, + "19": 0.32074, + "20": 0.31765, + "21": 0.31933, + "22": 0.32169, + "23": 0.32073, + "24": 0.31872, + "25": 0.32305, + "26": 0.32018, + "27": 0.32077, + "28": 0.32022, + "29": 0.31612, + "30": 0.31263, + "31": 0.31663, + "32": 0.31415, + "33": 0.31634, + "34": 0.31559, + "35": 0.31239, + "36": 0.31218, + "37": 0.31427, + "38": 0.31433, + "39": 0.31314, + "40": 0.313, + "41": 0.31331, + "42": 0.31314, + "43": 0.31359, + "44": 0.31884, + "45": 0.31165, + "46": 0.31278, + "47": 0.31273, + "48": 0.31668, + "49": 0.31177, + "50": 0.31472 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..4fcc118b15a --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8277, + "2": 10.84068, + "3": 10.82714, + "4": 10.81884, + "5": 10.85728, + "6": 10.86967, + "7": 10.85152, + "8": 10.84475, + "9": 10.85262, + "10": 10.79178, + "11": 10.86557, + "12": 10.87118, + "13": 10.87048, + "14": 10.87859, + "15": 10.82536, + "16": 10.81201, + "17": 10.77492, + "18": 10.81058, + "19": 10.79647, + "20": 10.72219, + "21": 10.69747, + "22": 10.55109, + "23": 10.70545, + "24": 10.59037, + "25": 10.54404, + "26": 10.60056, + "27": 10.6198, + "28": 10.57404, + "29": 10.5863, + "30": 10.35713, + "31": 10.12151, + "32": 10.47043, + "33": 10.45666, + "34": 10.21561, + "35": 10.2715, + "36": 10.23562, + "37": 10.35244, + "38": 10.20598, + "39": 10.40084, + "40": 10.09662, + "41": 10.13854, + "42": 10.21819, + "43": 9.84461, + "44": 9.96191, + "45": 9.84123, + "46": 9.81958, + "47": 10.13898, + "48": 9.85141, + "49": 9.53538, + "50": 9.92427 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4663.0, + "2": 4900.0, + "3": 4885.0, + "4": 4989.0, + "5": 5412.0, + "6": 5512.0, + "7": 5196.0, + "8": 4835.0, + "9": 5183.0, + "10": 4430.0, + "11": 5618.0, + "12": 5155.0, + "13": 5430.0, + "14": 5486.0, + "15": 5243.0, + "16": 5345.0, + "17": 5174.0, + "18": 5152.0, + "19": 5229.0, + "20": 4720.0, + "21": 5279.0, + "22": 4870.0, + "23": 5653.0, + "24": 4987.0, + "25": 4930.0, + "26": 5230.0, + "27": 5136.0, + "28": 5923.0, + "29": 5833.0, + "30": 5420.0, + "31": 4687.0, + "32": 5606.0, + "33": 6087.0, + "34": 5166.0, + "35": 5579.0, + "36": 5643.0, + "37": 6381.0, + "38": 6032.0, + "39": 6660.0, + "40": 5774.0, + "41": 5952.0, + "42": 6422.0, + "43": 5957.0, + "44": 5847.0, + "45": 5675.0, + "46": 6132.0, + "47": 6540.0, + "48": 6342.0, + "49": 6080.0, + "50": 6648.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114761216.0, + "2": 1114759680.0, + "3": 1114756608.0, + "4": 1114760192.0, + "5": 1114757120.0, + "6": 1114757632.0, + "7": 1114759680.0, + "8": 1114757632.0, + "9": 1114760192.0, + "10": 1114758656.0, + "11": 1114758656.0, + "12": 1114759168.0, + "13": 1114762752.0, + "14": 1114757120.0, + "15": 1114760192.0, + "16": 1114759168.0, + "17": 1114757632.0, + "18": 1114761728.0, + "19": 1114757632.0, + "20": 1114758656.0, + "21": 1114758656.0, + "22": 1114758144.0, + "23": 1114758144.0, + "24": 1114761216.0, + "25": 1114758144.0, + "26": 1114760704.0, + "27": 1114758656.0, + "28": 1114759680.0, + "29": 1114760704.0, + "30": 1114757120.0, + "31": 1114763776.0, + "32": 1114759680.0, + "33": 1114758144.0, + "34": 1114758656.0, + "35": 1114761216.0, + "36": 1114756608.0, + "37": 1114758144.0, + "38": 1114759168.0, + "39": 1114758144.0, + "40": 1114760192.0, + "41": 1114761728.0, + "42": 1114761216.0, + "43": 1114761728.0, + "44": 1114760192.0, + "45": 1114759168.0, + "46": 1114759168.0, + "47": 1114759680.0, + "48": 1114756608.0, + "49": 1114755072.0, + "50": 1114761216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563573248.0, + "2": 2019811840.0, + "3": 2019811840.0, + "4": 2020844544.0, + "5": 2020844544.0, + "6": 2020844544.0, + "7": 2020844544.0, + "8": 2020844544.0, + "9": 2020844544.0, + "10": 2022765056.0, + "11": 2022765056.0, + "12": 2022765056.0, + "13": 2025424384.0, + "14": 2025424384.0, + "15": 2025424384.0, + "16": 2025424384.0, + "17": 2025424384.0, + "18": 2025424384.0, + "19": 2025424384.0, + "20": 2025424384.0, + "21": 2025424384.0, + "22": 2025424384.0, + "23": 2025424384.0, + "24": 2025424384.0, + "25": 2025424384.0, + "26": 2025424384.0, + "27": 2025424384.0, + "28": 2025424384.0, + "29": 2025424384.0, + "30": 2025424384.0, + "31": 2027865600.0, + "32": 2027865600.0, + "33": 2027865600.0, + "34": 2027865600.0, + "35": 2027865600.0, + "36": 2027865600.0, + "37": 2027865600.0, + "38": 2027865600.0, + "39": 2027865600.0, + "40": 2027865600.0, + "41": 2027865600.0, + "42": 2027865600.0, + "43": 2027865600.0, + "44": 2027865600.0, + "45": 2027865600.0, + "46": 2027865600.0, + "47": 2027865600.0, + "48": 2027865600.0, + "49": 2027865600.0, + "50": 2027865600.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.35833, + "2": 0.41869, + "3": 0.36543, + "4": 0.34709, + "5": 0.33564, + "6": 0.33325, + "7": 0.33079, + "8": 0.32901, + "9": 0.32623, + "10": 0.32947, + "11": 0.32518, + "12": 0.32588, + "13": 0.32491, + "14": 0.32913, + "15": 0.32376, + "16": 0.32422, + "17": 0.32793, + "18": 0.32466, + "19": 0.32256, + "20": 0.32888, + "21": 0.32611, + "22": 0.32289, + "23": 0.32585, + "24": 0.32069, + "25": 0.31969, + "26": 0.32564, + "27": 0.32022, + "28": 0.32015, + "29": 0.33015, + "30": 0.32397, + "31": 0.33512, + "32": 0.35571, + "33": 0.35217, + "34": 0.35178, + "35": 0.3531, + "36": 0.35005, + "37": 0.35174, + "38": 0.35672, + "39": 0.3522, + "40": 0.35137, + "41": 0.3597, + "42": 0.3514, + "43": 0.34943, + "44": 0.3423, + "45": 0.34024, + "46": 0.34465, + "47": 0.34043, + "48": 0.34108, + "49": 0.34462, + "50": 0.33863 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..fa073cf9e82 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8277, + "2": 10.84068, + "3": 10.8272, + "4": 10.81929, + "5": 10.85696, + "6": 10.86987, + "7": 10.85171, + "8": 10.84459, + "9": 10.85256, + "10": 10.79201, + "11": 10.86562, + "12": 10.8711, + "13": 10.87024, + "14": 10.87806, + "15": 10.82518, + "16": 10.81192, + "17": 10.77419, + "18": 10.81073, + "19": 10.79667, + "20": 10.72258, + "21": 10.69715, + "22": 10.55066, + "23": 10.70497, + "24": 10.59057, + "25": 10.54424, + "26": 10.6002, + "27": 10.61999, + "28": 10.5741, + "29": 10.58671, + "30": 10.35729, + "31": 10.12229, + "32": 10.47057, + "33": 10.45683, + "34": 10.216, + "35": 10.27106, + "36": 10.23572, + "37": 10.35232, + "38": 10.20564, + "39": 10.40105, + "40": 10.09702, + "41": 10.13866, + "42": 10.21783, + "43": 9.84408, + "44": 9.96172, + "45": 9.84126, + "46": 9.81956, + "47": 10.13914, + "48": 9.85116, + "49": 9.53564, + "50": 9.92445 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4700.0, + "2": 4941.0, + "3": 4879.0, + "4": 5033.0, + "5": 5483.0, + "6": 5460.0, + "7": 5198.0, + "8": 4858.0, + "9": 5126.0, + "10": 4376.0, + "11": 5570.0, + "12": 5203.0, + "13": 5521.0, + "14": 5427.0, + "15": 5181.0, + "16": 5391.0, + "17": 5179.0, + "18": 5030.0, + "19": 5304.0, + "20": 4943.0, + "21": 5245.0, + "22": 4859.0, + "23": 5613.0, + "24": 5111.0, + "25": 4846.0, + "26": 5147.0, + "27": 5309.0, + "28": 5797.0, + "29": 5929.0, + "30": 5357.0, + "31": 4733.0, + "32": 5718.0, + "33": 6104.0, + "34": 5218.0, + "35": 5554.0, + "36": 5610.0, + "37": 6378.0, + "38": 6206.0, + "39": 6498.0, + "40": 5948.0, + "41": 6006.0, + "42": 6256.0, + "43": 5824.0, + "44": 5788.0, + "45": 5746.0, + "46": 6111.0, + "47": 6493.0, + "48": 6237.0, + "49": 6304.0, + "50": 6666.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1114761216.0, + "2": 1114759680.0, + "3": 1114756608.0, + "4": 1114760192.0, + "5": 1114758144.0, + "6": 1114757632.0, + "7": 1114759680.0, + "8": 1114757632.0, + "9": 1114760192.0, + "10": 1114758656.0, + "11": 1114759680.0, + "12": 1114758144.0, + "13": 1114762752.0, + "14": 1114757120.0, + "15": 1114760192.0, + "16": 1114757120.0, + "17": 1114757120.0, + "18": 1114760192.0, + "19": 1114757120.0, + "20": 1114758656.0, + "21": 1114757632.0, + "22": 1114758144.0, + "23": 1114758144.0, + "24": 1114760704.0, + "25": 1114758144.0, + "26": 1114760704.0, + "27": 1114758656.0, + "28": 1114760192.0, + "29": 1114760704.0, + "30": 1114757120.0, + "31": 1114763776.0, + "32": 1114760192.0, + "33": 1114757120.0, + "34": 1114758656.0, + "35": 1114761216.0, + "36": 1114756608.0, + "37": 1114758144.0, + "38": 1114759168.0, + "39": 1114757632.0, + "40": 1114759680.0, + "41": 1114761216.0, + "42": 1114760192.0, + "43": 1114761728.0, + "44": 1114760192.0, + "45": 1114758656.0, + "46": 1114760192.0, + "47": 1114758656.0, + "48": 1114757120.0, + "49": 1114755072.0, + "50": 1114760192.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1563573248.0, + "2": 2021675520.0, + "3": 2022020608.0, + "4": 2022438912.0, + "5": 2022438912.0, + "6": 2022438912.0, + "7": 2022438912.0, + "8": 2022438912.0, + "9": 2022438912.0, + "10": 2025787904.0, + "11": 2025787904.0, + "12": 2025787904.0, + "13": 2027309568.0, + "14": 2027309568.0, + "15": 2027309568.0, + "16": 2027309568.0, + "17": 2027309568.0, + "18": 2027309568.0, + "19": 2027309568.0, + "20": 2027309568.0, + "21": 2027309568.0, + "22": 2027309568.0, + "23": 2027309568.0, + "24": 2027309568.0, + "25": 2027309568.0, + "26": 2027309568.0, + "27": 2027309568.0, + "28": 2027309568.0, + "29": 2027309568.0, + "30": 2027309568.0, + "31": 2029440512.0, + "32": 2029440512.0, + "33": 2029440512.0, + "34": 2029440512.0, + "35": 2029440512.0, + "36": 2029440512.0, + "37": 2029440512.0, + "38": 2029440512.0, + "39": 2029440512.0, + "40": 2029440512.0, + "41": 2029440512.0, + "42": 2029440512.0, + "43": 2029440512.0, + "44": 2029440512.0, + "45": 2029440512.0, + "46": 2029440512.0, + "47": 2029440512.0, + "48": 2029440512.0, + "49": 2029440512.0, + "50": 2029440512.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.95291, + "2": 0.49442, + "3": 0.34431, + "4": 0.34311, + "5": 0.34183, + "6": 0.34138, + "7": 0.34639, + "8": 0.34265, + "9": 0.34183, + "10": 0.34356, + "11": 0.3425, + "12": 0.33847, + "13": 0.3416, + "14": 0.33396, + "15": 0.33683, + "16": 0.34022, + "17": 0.34114, + "18": 0.33741, + "19": 0.33884, + "20": 0.33846, + "21": 0.33805, + "22": 0.33745, + "23": 0.34007, + "24": 0.33732, + "25": 0.33138, + "26": 0.33193, + "27": 0.33201, + "28": 0.33221, + "29": 0.33258, + "30": 0.33151, + "31": 0.33323, + "32": 0.33272, + "33": 0.33137, + "34": 0.33328, + "35": 0.3321, + "36": 0.33173, + "37": 0.33275, + "38": 0.33386, + "39": 0.33182, + "40": 0.3331, + "41": 0.3318, + "42": 0.33143, + "43": 0.33272, + "44": 0.33166, + "45": 0.32995, + "46": 0.33258, + "47": 0.332, + "48": 0.33126, + "49": 0.33438, + "50": 0.32754 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json index ab35aab19fb..cd1596da3bc 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, "50": 9.98402 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, "50": 6668.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, "50": 627718144.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, "50": 1103012352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 16.6451, - "5": 0.44582, - "10": 0.44604, - "15": 0.45437, - "20": 0.44805, - "25": 0.44906, - "30": 0.44594, - "35": 0.44862, - "40": 0.45549, - "45": 0.44951, - "50": 0.44015 + "1": 18.1916, + "2": 0.59351, + "3": 0.53789, + "4": 0.55618, + "5": 0.51747, + "6": 0.51798, + "7": 0.53735, + "8": 0.51847, + "9": 0.51772, + "10": 0.51103, + "11": 0.51385, + "12": 0.50834, + "13": 0.51586, + "14": 0.50721, + "15": 0.53294, + "16": 0.51593, + "17": 0.51388, + "18": 0.51464, + "19": 0.50827, + "20": 0.50952, + "21": 0.50189, + "22": 0.50928, + "23": 0.50324, + "24": 0.50354, + "25": 0.50213, + "26": 0.49708, + "27": 0.49953, + "28": 0.50373, + "29": 0.50455, + "30": 0.50305, + "31": 0.50567, + "32": 0.50905, + "33": 0.50325, + "34": 0.51203, + "35": 0.52783, + "36": 0.51023, + "37": 0.50726, + "38": 0.52285, + "39": 0.50728, + "40": 0.52086, + "41": 0.51671, + "42": 0.51607, + "43": 0.51296, + "44": 0.51003, + "45": 0.51106, + "46": 0.53309, + "47": 0.52738, + "48": 0.5128, + "49": 0.53044, + "50": 0.50994 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..1a408849afc --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82196, + "2": 10.84132, + "3": 10.81128, + "4": 10.82231, + "5": 10.84518, + "6": 10.8626, + "7": 10.84391, + "8": 10.84701, + "9": 10.84948, + "10": 10.78921, + "11": 10.85726, + "12": 10.84459, + "13": 10.87146, + "14": 10.87456, + "15": 10.8336, + "16": 10.80914, + "17": 10.79111, + "18": 10.81065, + "19": 10.80588, + "20": 10.73505, + "21": 10.71444, + "22": 10.57729, + "23": 10.72656, + "24": 10.61835, + "25": 10.58138, + "26": 10.63781, + "27": 10.63741, + "28": 10.60575, + "29": 10.61061, + "30": 10.40958, + "31": 10.16916, + "32": 10.49914, + "33": 10.49662, + "34": 10.26146, + "35": 10.31467, + "36": 10.28534, + "37": 10.38868, + "38": 10.24742, + "39": 10.43812, + "40": 10.14618, + "41": 10.19703, + "42": 10.26135, + "43": 9.9103, + "44": 10.02321, + "45": 9.91713, + "46": 9.89492, + "47": 10.19337, + "48": 9.93091, + "49": 9.61227, + "50": 9.97428 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4960.0, + "2": 5301.0, + "3": 5425.0, + "4": 5034.0, + "5": 6022.0, + "6": 6072.0, + "7": 5250.0, + "8": 5157.0, + "9": 5645.0, + "10": 4813.0, + "11": 6049.0, + "12": 5580.0, + "13": 5963.0, + "14": 5902.0, + "15": 5586.0, + "16": 5890.0, + "17": 5611.0, + "18": 5514.0, + "19": 5628.0, + "20": 5068.0, + "21": 5603.0, + "22": 5087.0, + "23": 6008.0, + "24": 5364.0, + "25": 4868.0, + "26": 5594.0, + "27": 5626.0, + "28": 5973.0, + "29": 6225.0, + "30": 5528.0, + "31": 4650.0, + "32": 5940.0, + "33": 6315.0, + "34": 5284.0, + "35": 5700.0, + "36": 5633.0, + "37": 6648.0, + "38": 6194.0, + "39": 6933.0, + "40": 6137.0, + "41": 6314.0, + "42": 6416.0, + "43": 5714.0, + "44": 5892.0, + "45": 6030.0, + "46": 6086.0, + "47": 6881.0, + "48": 6386.0, + "49": 6242.0, + "50": 6652.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 598359040.0, + "2": 598359040.0, + "3": 598359040.0, + "4": 598359552.0, + "5": 598358016.0, + "6": 598358016.0, + "7": 598355456.0, + "8": 598359552.0, + "9": 598356480.0, + "10": 598356992.0, + "11": 598358016.0, + "12": 598359040.0, + "13": 598359040.0, + "14": 598358528.0, + "15": 598359040.0, + "16": 598358528.0, + "17": 598353408.0, + "18": 598358016.0, + "19": 598359040.0, + "20": 598357504.0, + "21": 598359040.0, + "22": 598354432.0, + "23": 598355968.0, + "24": 598356480.0, + "25": 598357504.0, + "26": 598356480.0, + "27": 598360064.0, + "28": 598358016.0, + "29": 598355456.0, + "30": 598358528.0, + "31": 598356480.0, + "32": 598356992.0, + "33": 598359552.0, + "34": 598358016.0, + "35": 598356480.0, + "36": 598358016.0, + "37": 598359040.0, + "38": 598358016.0, + "39": 598357504.0, + "40": 598357504.0, + "41": 598351872.0, + "42": 598358528.0, + "43": 598352896.0, + "44": 598354944.0, + "45": 598355968.0, + "46": 598351872.0, + "47": 598359040.0, + "48": 598354944.0, + "49": 598353408.0, + "50": 598358016.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 842904576.0, + "2": 1072649216.0, + "3": 1072649216.0, + "4": 1072649216.0, + "5": 1072649216.0, + "6": 1072649216.0, + "7": 1072649216.0, + "8": 1072649216.0, + "9": 1072649216.0, + "10": 1072649216.0, + "11": 1072649216.0, + "12": 1072649216.0, + "13": 1072649216.0, + "14": 1072709632.0, + "15": 1072709632.0, + "16": 1073532416.0, + "17": 1073532416.0, + "18": 1073532416.0, + "19": 1073532416.0, + "20": 1073532416.0, + "21": 1073532416.0, + "22": 1073532416.0, + "23": 1073532416.0, + "24": 1073532416.0, + "25": 1073532416.0, + "26": 1073532416.0, + "27": 1073532416.0, + "28": 1073532416.0, + "29": 1073532416.0, + "30": 1073532416.0, + "31": 1073532416.0, + "32": 1073532416.0, + "33": 1073532416.0, + "34": 1073532416.0, + "35": 1073532416.0, + "36": 1073532416.0, + "37": 1073532416.0, + "38": 1073532416.0, + "39": 1073532416.0, + "40": 1073532416.0, + "41": 1073532416.0, + "42": 1073532416.0, + "43": 1073532416.0, + "44": 1073532416.0, + "45": 1073532416.0, + "46": 1073532416.0, + "47": 1073532416.0, + "48": 1073532416.0, + "49": 1073532416.0, + "50": 1073532416.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.79929, + "2": 0.76107, + "3": 0.70012, + "4": 0.6957, + "5": 0.69356, + "6": 0.69449, + "7": 0.69404, + "8": 0.69622, + "9": 0.69268, + "10": 0.69289, + "11": 0.69397, + "12": 0.6939, + "13": 0.69543, + "14": 0.69343, + "15": 0.69367, + "16": 0.69313, + "17": 0.69312, + "18": 0.69243, + "19": 0.69103, + "20": 0.69247, + "21": 0.69344, + "22": 0.70018, + "23": 0.69201, + "24": 0.6925, + "25": 0.69194, + "26": 0.69263, + "27": 0.69615, + "28": 0.69387, + "29": 0.6943, + "30": 0.69451, + "31": 0.69337, + "32": 0.69257, + "33": 0.69262, + "34": 0.6935, + "35": 0.69273, + "36": 0.69514, + "37": 0.69327, + "38": 0.69244, + "39": 0.69222, + "40": 0.69263, + "41": 0.69355, + "42": 0.69577, + "43": 0.6959, + "44": 0.69514, + "45": 0.69357, + "46": 0.6948, + "47": 0.69457, + "48": 0.69365, + "49": 0.69508, + "50": 0.69782 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..6f16e0a8b0c --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82196, + "2": 10.84132, + "3": 10.81128, + "4": 10.82231, + "5": 10.84518, + "6": 10.8626, + "7": 10.84391, + "8": 10.84701, + "9": 10.84948, + "10": 10.78921, + "11": 10.85726, + "12": 10.84459, + "13": 10.87146, + "14": 10.87456, + "15": 10.8336, + "16": 10.80914, + "17": 10.79111, + "18": 10.81065, + "19": 10.80588, + "20": 10.73505, + "21": 10.71444, + "22": 10.57729, + "23": 10.72656, + "24": 10.61835, + "25": 10.58138, + "26": 10.63781, + "27": 10.63741, + "28": 10.60575, + "29": 10.61061, + "30": 10.40958, + "31": 10.16916, + "32": 10.49914, + "33": 10.49662, + "34": 10.26146, + "35": 10.31467, + "36": 10.28534, + "37": 10.38868, + "38": 10.24742, + "39": 10.43812, + "40": 10.14618, + "41": 10.19703, + "42": 10.26135, + "43": 9.9103, + "44": 10.02321, + "45": 9.91713, + "46": 9.89492, + "47": 10.19337, + "48": 9.93091, + "49": 9.61227, + "50": 9.97428 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4960.0, + "2": 5301.0, + "3": 5425.0, + "4": 5034.0, + "5": 6022.0, + "6": 6072.0, + "7": 5250.0, + "8": 5157.0, + "9": 5645.0, + "10": 4813.0, + "11": 6049.0, + "12": 5580.0, + "13": 5963.0, + "14": 5902.0, + "15": 5586.0, + "16": 5890.0, + "17": 5611.0, + "18": 5514.0, + "19": 5628.0, + "20": 5068.0, + "21": 5603.0, + "22": 5087.0, + "23": 6008.0, + "24": 5364.0, + "25": 4868.0, + "26": 5594.0, + "27": 5626.0, + "28": 5973.0, + "29": 6225.0, + "30": 5528.0, + "31": 4650.0, + "32": 5940.0, + "33": 6315.0, + "34": 5284.0, + "35": 5700.0, + "36": 5633.0, + "37": 6648.0, + "38": 6194.0, + "39": 6933.0, + "40": 6137.0, + "41": 6314.0, + "42": 6416.0, + "43": 5714.0, + "44": 5892.0, + "45": 6030.0, + "46": 6086.0, + "47": 6881.0, + "48": 6386.0, + "49": 6242.0, + "50": 6652.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 598359040.0, + "2": 598359040.0, + "3": 598359040.0, + "4": 598359552.0, + "5": 598358016.0, + "6": 598358016.0, + "7": 598355456.0, + "8": 598359552.0, + "9": 598356480.0, + "10": 598356992.0, + "11": 598358016.0, + "12": 598359040.0, + "13": 598359040.0, + "14": 598358528.0, + "15": 598359040.0, + "16": 598358528.0, + "17": 598353408.0, + "18": 598358016.0, + "19": 598359040.0, + "20": 598357504.0, + "21": 598359040.0, + "22": 598354432.0, + "23": 598355968.0, + "24": 598356480.0, + "25": 598357504.0, + "26": 598356480.0, + "27": 598360064.0, + "28": 598358016.0, + "29": 598355456.0, + "30": 598358528.0, + "31": 598356480.0, + "32": 598356992.0, + "33": 598359552.0, + "34": 598358016.0, + "35": 598356480.0, + "36": 598358016.0, + "37": 598359040.0, + "38": 598358016.0, + "39": 598357504.0, + "40": 598357504.0, + "41": 598351872.0, + "42": 598358528.0, + "43": 598352896.0, + "44": 598354944.0, + "45": 598355968.0, + "46": 598351872.0, + "47": 598359040.0, + "48": 598354944.0, + "49": 598353408.0, + "50": 598358016.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 842904576.0, + "2": 1072649216.0, + "3": 1072649216.0, + "4": 1072649216.0, + "5": 1072649216.0, + "6": 1072649216.0, + "7": 1072649216.0, + "8": 1072649216.0, + "9": 1072649216.0, + "10": 1072649216.0, + "11": 1072649216.0, + "12": 1072649216.0, + "13": 1072649216.0, + "14": 1072709632.0, + "15": 1072709632.0, + "16": 1073532416.0, + "17": 1073532416.0, + "18": 1073532416.0, + "19": 1073532416.0, + "20": 1073532416.0, + "21": 1073532416.0, + "22": 1073532416.0, + "23": 1073532416.0, + "24": 1073532416.0, + "25": 1073532416.0, + "26": 1073532416.0, + "27": 1073532416.0, + "28": 1073532416.0, + "29": 1073532416.0, + "30": 1073532416.0, + "31": 1073532416.0, + "32": 1073532416.0, + "33": 1073532416.0, + "34": 1073532416.0, + "35": 1073532416.0, + "36": 1073532416.0, + "37": 1073532416.0, + "38": 1073532416.0, + "39": 1073532416.0, + "40": 1073532416.0, + "41": 1073532416.0, + "42": 1073532416.0, + "43": 1073532416.0, + "44": 1073532416.0, + "45": 1073532416.0, + "46": 1073532416.0, + "47": 1073532416.0, + "48": 1073532416.0, + "49": 1073532416.0, + "50": 1073532416.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.38225, + "2": 0.74075, + "3": 0.6836, + "4": 0.67846, + "5": 0.68171, + "6": 0.67743, + "7": 0.67855, + "8": 0.68164, + "9": 0.69137, + "10": 0.69257, + "11": 0.689, + "12": 0.69315, + "13": 0.69937, + "14": 0.69826, + "15": 0.69347, + "16": 0.68684, + "17": 0.6817, + "18": 0.67679, + "19": 0.67788, + "20": 0.67815, + "21": 0.67996, + "22": 0.67681, + "23": 0.67695, + "24": 0.67767, + "25": 0.67667, + "26": 0.67717, + "27": 0.67767, + "28": 0.67494, + "29": 0.67632, + "30": 0.67695, + "31": 0.67773, + "32": 0.67605, + "33": 0.6777, + "34": 0.6774, + "35": 0.67665, + "36": 0.68036, + "37": 0.6799, + "38": 0.67884, + "39": 0.68014, + "40": 0.68029, + "41": 0.68109, + "42": 0.68033, + "43": 0.6916, + "44": 0.68689, + "45": 0.68826, + "46": 0.6873, + "47": 0.69625, + "48": 0.68895, + "49": 0.69108, + "50": 0.6864 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..c922ef3f273 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, + "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, + "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, + "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, + "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, + "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, + "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, + "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, + "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, + "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, + "50": 9.98402 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, + "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, + "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, + "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, + "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, + "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, + "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, + "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, + "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, + "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, + "50": 6668.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, + "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, + "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, + "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, + "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, + "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, + "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, + "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, + "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, + "50": 627718144.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, + "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, + "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, + "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, + "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, + "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, + "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, + "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, + "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, + "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, + "50": 1103012352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.31176, + "2": 0.54582, + "3": 0.4713, + "4": 0.49552, + "5": 0.45024, + "6": 0.44845, + "7": 0.46159, + "8": 0.44727, + "9": 0.45224, + "10": 0.44611, + "11": 0.44928, + "12": 0.4393, + "13": 0.44861, + "14": 0.43419, + "15": 0.46035, + "16": 0.44467, + "17": 0.44969, + "18": 0.45329, + "19": 0.45261, + "20": 0.47266, + "21": 0.44362, + "22": 0.44618, + "23": 0.44658, + "24": 0.44334, + "25": 0.45084, + "26": 0.4522, + "27": 0.44323, + "28": 0.44959, + "29": 0.44013, + "30": 0.44198, + "31": 0.44974, + "32": 0.44838, + "33": 0.4388, + "34": 0.46145, + "35": 0.4454, + "36": 0.43557, + "37": 0.43704, + "38": 0.45184, + "39": 0.43707, + "40": 0.43729, + "41": 0.44791, + "42": 0.44386, + "43": 0.44641, + "44": 0.43881, + "45": 0.45139, + "46": 0.46177, + "47": 0.46449, + "48": 0.44551, + "49": 0.47013, + "50": 0.44517 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..c9eee5d9463 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, + "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, + "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, + "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, + "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, + "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, + "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, + "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, + "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, + "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, + "50": 9.98402 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, + "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, + "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, + "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, + "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, + "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, + "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, + "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, + "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, + "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, + "50": 6668.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, + "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, + "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, + "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, + "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, + "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, + "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, + "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, + "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, + "50": 627718144.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, + "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, + "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, + "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, + "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, + "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, + "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, + "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, + "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, + "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, + "50": 1103012352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.75731, + "2": 0.59137, + "3": 0.52847, + "4": 0.55398, + "5": 0.51736, + "6": 0.51707, + "7": 0.52895, + "8": 0.51861, + "9": 0.5181, + "10": 0.51717, + "11": 0.51445, + "12": 0.51129, + "13": 0.51494, + "14": 0.51037, + "15": 0.51828, + "16": 0.50983, + "17": 0.51156, + "18": 0.51029, + "19": 0.51087, + "20": 0.51452, + "21": 0.5039, + "22": 0.51296, + "23": 0.50822, + "24": 0.51693, + "25": 0.51087, + "26": 0.51188, + "27": 0.51138, + "28": 0.51374, + "29": 0.50808, + "30": 0.50936, + "31": 0.51301, + "32": 0.5132, + "33": 0.51, + "34": 0.51133, + "35": 0.51556, + "36": 0.51397, + "37": 0.51183, + "38": 0.51721, + "39": 0.50468, + "40": 0.50915, + "41": 0.51802, + "42": 0.51064, + "43": 0.51335, + "44": 0.50717, + "45": 0.51189, + "46": 0.52735, + "47": 0.52015, + "48": 0.50421, + "49": 0.5285, + "50": 0.50368 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json index d8f66f8d26b..4918ee299d7 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, "50": 9.98402 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, "50": 6668.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, "50": 627718144.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, "50": 1103012352.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 20.08249, - "5": 0.4425, - "10": 0.44364, - "15": 0.4517, - "20": 0.45348, - "25": 0.44927, - "30": 0.44258, - "35": 0.45719, - "40": 0.44034, - "45": 0.45039, - "50": 0.45412 + "1": 17.91075, + "2": 0.58262, + "3": 0.51891, + "4": 0.5535, + "5": 0.50364, + "6": 0.50993, + "7": 0.51644, + "8": 0.5062, + "9": 0.50479, + "10": 0.50352, + "11": 0.50142, + "12": 0.50105, + "13": 0.50984, + "14": 0.49899, + "15": 0.5144, + "16": 0.49725, + "17": 0.50222, + "18": 0.50011, + "19": 0.50584, + "20": 0.502, + "21": 0.49935, + "22": 0.51276, + "23": 0.50351, + "24": 0.50235, + "25": 0.49997, + "26": 0.50146, + "27": 0.49644, + "28": 0.49951, + "29": 0.49788, + "30": 0.50224, + "31": 0.50481, + "32": 0.50353, + "33": 0.50198, + "34": 0.50088, + "35": 0.50994, + "36": 0.49922, + "37": 0.49884, + "38": 0.51305, + "39": 0.49951, + "40": 0.49857, + "41": 0.5133, + "42": 0.50758, + "43": 0.51002, + "44": 0.50205, + "45": 0.51091, + "46": 0.52453, + "47": 0.52953, + "48": 0.50437, + "49": 0.52951, + "50": 0.50206 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..7d5050e9ca8 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82196, + "2": 10.84132, + "3": 10.81128, + "4": 10.82231, + "5": 10.84518, + "6": 10.8626, + "7": 10.84391, + "8": 10.84701, + "9": 10.84948, + "10": 10.78921, + "11": 10.85726, + "12": 10.84459, + "13": 10.87146, + "14": 10.87456, + "15": 10.8336, + "16": 10.80914, + "17": 10.79111, + "18": 10.81065, + "19": 10.80588, + "20": 10.73505, + "21": 10.71444, + "22": 10.57729, + "23": 10.72656, + "24": 10.61835, + "25": 10.58138, + "26": 10.63781, + "27": 10.63741, + "28": 10.60575, + "29": 10.61061, + "30": 10.40958, + "31": 10.16916, + "32": 10.49914, + "33": 10.49662, + "34": 10.26146, + "35": 10.31467, + "36": 10.28534, + "37": 10.38868, + "38": 10.24742, + "39": 10.43812, + "40": 10.14618, + "41": 10.19703, + "42": 10.26135, + "43": 9.9103, + "44": 10.02321, + "45": 9.91713, + "46": 9.89492, + "47": 10.19337, + "48": 9.93091, + "49": 9.61227, + "50": 9.97428 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4960.0, + "2": 5301.0, + "3": 5425.0, + "4": 5034.0, + "5": 6022.0, + "6": 6072.0, + "7": 5250.0, + "8": 5157.0, + "9": 5645.0, + "10": 4813.0, + "11": 6049.0, + "12": 5580.0, + "13": 5963.0, + "14": 5902.0, + "15": 5586.0, + "16": 5890.0, + "17": 5611.0, + "18": 5514.0, + "19": 5628.0, + "20": 5068.0, + "21": 5603.0, + "22": 5087.0, + "23": 6008.0, + "24": 5364.0, + "25": 4868.0, + "26": 5594.0, + "27": 5626.0, + "28": 5973.0, + "29": 6225.0, + "30": 5528.0, + "31": 4650.0, + "32": 5940.0, + "33": 6315.0, + "34": 5284.0, + "35": 5700.0, + "36": 5633.0, + "37": 6648.0, + "38": 6194.0, + "39": 6933.0, + "40": 6137.0, + "41": 6314.0, + "42": 6416.0, + "43": 5714.0, + "44": 5892.0, + "45": 6030.0, + "46": 6086.0, + "47": 6881.0, + "48": 6386.0, + "49": 6242.0, + "50": 6652.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 598359040.0, + "2": 598359040.0, + "3": 598359040.0, + "4": 598359552.0, + "5": 598358016.0, + "6": 598358016.0, + "7": 598355456.0, + "8": 598359552.0, + "9": 598356480.0, + "10": 598356992.0, + "11": 598358016.0, + "12": 598359040.0, + "13": 598359040.0, + "14": 598358528.0, + "15": 598359040.0, + "16": 598358528.0, + "17": 598353408.0, + "18": 598358016.0, + "19": 598359040.0, + "20": 598357504.0, + "21": 598359040.0, + "22": 598354432.0, + "23": 598355968.0, + "24": 598356480.0, + "25": 598357504.0, + "26": 598356480.0, + "27": 598360064.0, + "28": 598358016.0, + "29": 598355456.0, + "30": 598358528.0, + "31": 598356480.0, + "32": 598356992.0, + "33": 598359552.0, + "34": 598358016.0, + "35": 598356480.0, + "36": 598358016.0, + "37": 598359040.0, + "38": 598358016.0, + "39": 598357504.0, + "40": 598357504.0, + "41": 598351872.0, + "42": 598358528.0, + "43": 598352896.0, + "44": 598354944.0, + "45": 598355968.0, + "46": 598351872.0, + "47": 598359040.0, + "48": 598354944.0, + "49": 598353408.0, + "50": 598358016.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 842904576.0, + "2": 1072649216.0, + "3": 1072649216.0, + "4": 1072649216.0, + "5": 1072649216.0, + "6": 1072649216.0, + "7": 1072649216.0, + "8": 1072649216.0, + "9": 1072649216.0, + "10": 1072649216.0, + "11": 1072649216.0, + "12": 1072649216.0, + "13": 1072649216.0, + "14": 1072709632.0, + "15": 1072709632.0, + "16": 1073532416.0, + "17": 1073532416.0, + "18": 1073532416.0, + "19": 1073532416.0, + "20": 1073532416.0, + "21": 1073532416.0, + "22": 1073532416.0, + "23": 1073532416.0, + "24": 1073532416.0, + "25": 1073532416.0, + "26": 1073532416.0, + "27": 1073532416.0, + "28": 1073532416.0, + "29": 1073532416.0, + "30": 1073532416.0, + "31": 1073532416.0, + "32": 1073532416.0, + "33": 1073532416.0, + "34": 1073532416.0, + "35": 1073532416.0, + "36": 1073532416.0, + "37": 1073532416.0, + "38": 1073532416.0, + "39": 1073532416.0, + "40": 1073532416.0, + "41": 1073532416.0, + "42": 1073532416.0, + "43": 1073532416.0, + "44": 1073532416.0, + "45": 1073532416.0, + "46": 1073532416.0, + "47": 1073532416.0, + "48": 1073532416.0, + "49": 1073532416.0, + "50": 1073532416.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.25563, + "2": 0.84048, + "3": 0.76934, + "4": 0.70267, + "5": 0.70067, + "6": 0.73137, + "7": 0.70039, + "8": 0.69557, + "9": 0.69658, + "10": 0.69913, + "11": 0.69847, + "12": 0.70123, + "13": 0.69803, + "14": 0.74546, + "15": 0.69706, + "16": 0.69684, + "17": 0.69413, + "18": 0.6926, + "19": 0.69376, + "20": 0.69387, + "21": 0.69326, + "22": 0.78586, + "23": 0.72599, + "24": 0.72235, + "25": 0.72284, + "26": 0.69513, + "27": 0.69273, + "28": 0.69235, + "29": 0.69264, + "30": 0.69356, + "31": 0.6931, + "32": 0.69432, + "33": 0.69145, + "34": 0.69259, + "35": 0.69173, + "36": 0.69116, + "37": 0.69404, + "38": 0.69316, + "39": 0.69303, + "40": 0.6953, + "41": 0.6947, + "42": 0.69578, + "43": 0.69462, + "44": 0.69287, + "45": 0.69391, + "46": 0.69672, + "47": 0.69316, + "48": 0.69498, + "49": 0.70272, + "50": 0.688 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..9b45d0fd625 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82196, + "2": 10.84132, + "3": 10.81128, + "4": 10.82231, + "5": 10.84518, + "6": 10.8626, + "7": 10.84391, + "8": 10.84701, + "9": 10.84948, + "10": 10.78921, + "11": 10.85726, + "12": 10.84459, + "13": 10.87146, + "14": 10.87456, + "15": 10.8336, + "16": 10.80914, + "17": 10.79111, + "18": 10.81065, + "19": 10.80588, + "20": 10.73505, + "21": 10.71444, + "22": 10.57729, + "23": 10.72656, + "24": 10.61835, + "25": 10.58138, + "26": 10.63781, + "27": 10.63741, + "28": 10.60575, + "29": 10.61061, + "30": 10.40958, + "31": 10.16916, + "32": 10.49914, + "33": 10.49662, + "34": 10.26146, + "35": 10.31467, + "36": 10.28534, + "37": 10.38868, + "38": 10.24742, + "39": 10.43812, + "40": 10.14618, + "41": 10.19703, + "42": 10.26135, + "43": 9.9103, + "44": 10.02321, + "45": 9.91713, + "46": 9.89492, + "47": 10.19337, + "48": 9.93091, + "49": 9.61227, + "50": 9.97428 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4960.0, + "2": 5301.0, + "3": 5425.0, + "4": 5034.0, + "5": 6022.0, + "6": 6072.0, + "7": 5250.0, + "8": 5157.0, + "9": 5645.0, + "10": 4813.0, + "11": 6049.0, + "12": 5580.0, + "13": 5963.0, + "14": 5902.0, + "15": 5586.0, + "16": 5890.0, + "17": 5611.0, + "18": 5514.0, + "19": 5628.0, + "20": 5068.0, + "21": 5603.0, + "22": 5087.0, + "23": 6008.0, + "24": 5364.0, + "25": 4868.0, + "26": 5594.0, + "27": 5626.0, + "28": 5973.0, + "29": 6225.0, + "30": 5528.0, + "31": 4650.0, + "32": 5940.0, + "33": 6315.0, + "34": 5284.0, + "35": 5700.0, + "36": 5633.0, + "37": 6648.0, + "38": 6194.0, + "39": 6933.0, + "40": 6137.0, + "41": 6314.0, + "42": 6416.0, + "43": 5714.0, + "44": 5892.0, + "45": 6030.0, + "46": 6086.0, + "47": 6881.0, + "48": 6386.0, + "49": 6242.0, + "50": 6652.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 598359040.0, + "2": 598359040.0, + "3": 598359040.0, + "4": 598359552.0, + "5": 598358016.0, + "6": 598358016.0, + "7": 598355456.0, + "8": 598359552.0, + "9": 598356480.0, + "10": 598356992.0, + "11": 598358016.0, + "12": 598359040.0, + "13": 598359040.0, + "14": 598358528.0, + "15": 598359040.0, + "16": 598358528.0, + "17": 598353408.0, + "18": 598358016.0, + "19": 598359040.0, + "20": 598357504.0, + "21": 598359040.0, + "22": 598354432.0, + "23": 598355968.0, + "24": 598356480.0, + "25": 598357504.0, + "26": 598356480.0, + "27": 598360064.0, + "28": 598358016.0, + "29": 598355456.0, + "30": 598358528.0, + "31": 598356480.0, + "32": 598356992.0, + "33": 598359552.0, + "34": 598358016.0, + "35": 598356480.0, + "36": 598358016.0, + "37": 598359040.0, + "38": 598358016.0, + "39": 598357504.0, + "40": 598357504.0, + "41": 598351872.0, + "42": 598358528.0, + "43": 598352896.0, + "44": 598354944.0, + "45": 598355968.0, + "46": 598351872.0, + "47": 598359040.0, + "48": 598354944.0, + "49": 598353408.0, + "50": 598358016.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 842904576.0, + "2": 1072649216.0, + "3": 1072649216.0, + "4": 1072649216.0, + "5": 1072649216.0, + "6": 1072649216.0, + "7": 1072649216.0, + "8": 1072649216.0, + "9": 1072649216.0, + "10": 1072649216.0, + "11": 1072649216.0, + "12": 1072649216.0, + "13": 1072649216.0, + "14": 1072709632.0, + "15": 1072709632.0, + "16": 1073532416.0, + "17": 1073532416.0, + "18": 1073532416.0, + "19": 1073532416.0, + "20": 1073532416.0, + "21": 1073532416.0, + "22": 1073532416.0, + "23": 1073532416.0, + "24": 1073532416.0, + "25": 1073532416.0, + "26": 1073532416.0, + "27": 1073532416.0, + "28": 1073532416.0, + "29": 1073532416.0, + "30": 1073532416.0, + "31": 1073532416.0, + "32": 1073532416.0, + "33": 1073532416.0, + "34": 1073532416.0, + "35": 1073532416.0, + "36": 1073532416.0, + "37": 1073532416.0, + "38": 1073532416.0, + "39": 1073532416.0, + "40": 1073532416.0, + "41": 1073532416.0, + "42": 1073532416.0, + "43": 1073532416.0, + "44": 1073532416.0, + "45": 1073532416.0, + "46": 1073532416.0, + "47": 1073532416.0, + "48": 1073532416.0, + "49": 1073532416.0, + "50": 1073532416.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.19224, + "2": 0.80625, + "3": 0.70873, + "4": 0.71373, + "5": 0.75099, + "6": 0.7011, + "7": 0.70052, + "8": 0.70566, + "9": 0.71562, + "10": 0.72846, + "11": 0.69613, + "12": 0.7157, + "13": 0.69994, + "14": 0.69612, + "15": 0.69543, + "16": 0.69411, + "17": 0.69454, + "18": 0.69705, + "19": 0.6969, + "20": 0.69948, + "21": 0.69454, + "22": 0.69425, + "23": 0.69428, + "24": 0.69194, + "25": 0.69013, + "26": 0.69277, + "27": 0.68916, + "28": 0.69161, + "29": 0.69773, + "30": 0.68894, + "31": 0.69363, + "32": 0.69912, + "33": 0.7057, + "34": 0.70009, + "35": 0.7044, + "36": 0.69831, + "37": 0.69777, + "38": 0.70193, + "39": 0.69786, + "40": 0.69142, + "41": 0.70011, + "42": 0.70081, + "43": 0.70081, + "44": 0.70437, + "45": 0.70168, + "46": 0.69713, + "47": 0.70166, + "48": 0.69823, + "49": 0.67973, + "50": 0.68287 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f80469c23a2 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, + "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, + "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, + "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, + "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, + "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, + "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, + "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, + "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, + "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, + "50": 9.98402 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, + "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, + "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, + "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, + "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, + "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, + "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, + "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, + "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, + "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, + "50": 6668.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, + "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, + "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, + "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, + "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, + "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, + "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, + "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, + "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, + "50": 627718144.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, + "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, + "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, + "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, + "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, + "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, + "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, + "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, + "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, + "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, + "50": 1103012352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19.72199, + "2": 0.55482, + "3": 0.46042, + "4": 0.48082, + "5": 0.43967, + "6": 0.44947, + "7": 0.44996, + "8": 0.44231, + "9": 0.44422, + "10": 0.44437, + "11": 0.44012, + "12": 0.43933, + "13": 0.44783, + "14": 0.43652, + "15": 0.44961, + "16": 0.43438, + "17": 0.44393, + "18": 0.43947, + "19": 0.44737, + "20": 0.44146, + "21": 0.43755, + "22": 0.44263, + "23": 0.43321, + "24": 0.43572, + "25": 0.43146, + "26": 0.43427, + "27": 0.43127, + "28": 0.43972, + "29": 0.43162, + "30": 0.51076, + "31": 0.4451, + "32": 0.4416, + "33": 0.45169, + "34": 0.43371, + "35": 0.44399, + "36": 0.42875, + "37": 0.44051, + "38": 0.45464, + "39": 0.43269, + "40": 0.43351, + "41": 0.4407, + "42": 0.4495, + "43": 0.44929, + "44": 0.44083, + "45": 0.45508, + "46": 0.46229, + "47": 0.4728, + "48": 0.43019, + "49": 0.45756, + "50": 0.43145 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..25a8b5ae572 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81746, + "2": 10.82149, + "3": 10.82234, + "4": 10.79883, + "5": 10.84067, + "6": 10.85636, + "7": 10.81775, + "8": 10.81498, + "9": 10.83664, + "10": 10.7822, + "11": 10.85151, + "12": 10.84335, + "13": 10.85001, + "14": 10.87346, + "15": 10.80974, + "16": 10.80359, + "17": 10.75702, + "18": 10.80691, + "19": 10.78689, + "20": 10.73095, + "21": 10.70872, + "22": 10.57886, + "23": 10.71772, + "24": 10.63253, + "25": 10.57332, + "26": 10.62323, + "27": 10.63892, + "28": 10.60509, + "29": 10.61796, + "30": 10.42067, + "31": 10.18074, + "32": 10.50619, + "33": 10.50937, + "34": 10.27626, + "35": 10.3249, + "36": 10.29423, + "37": 10.40006, + "38": 10.26099, + "39": 10.44197, + "40": 10.1644, + "41": 10.2004, + "42": 10.26981, + "43": 9.93054, + "44": 10.04184, + "45": 9.9288, + "46": 9.89638, + "47": 10.18471, + "48": 9.93119, + "49": 9.62763, + "50": 9.98402 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5082.0, + "2": 5274.0, + "3": 5447.0, + "4": 5269.0, + "5": 6020.0, + "6": 6160.0, + "7": 5592.0, + "8": 5309.0, + "9": 5743.0, + "10": 4800.0, + "11": 6186.0, + "12": 5648.0, + "13": 6106.0, + "14": 6126.0, + "15": 5600.0, + "16": 5819.0, + "17": 5669.0, + "18": 5547.0, + "19": 5711.0, + "20": 5380.0, + "21": 5677.0, + "22": 5023.0, + "23": 6080.0, + "24": 5403.0, + "25": 5120.0, + "26": 5431.0, + "27": 5866.0, + "28": 6035.0, + "29": 6154.0, + "30": 5456.0, + "31": 4832.0, + "32": 5956.0, + "33": 6301.0, + "34": 5366.0, + "35": 5900.0, + "36": 5703.0, + "37": 6744.0, + "38": 6098.0, + "39": 6737.0, + "40": 5994.0, + "41": 6144.0, + "42": 6542.0, + "43": 5751.0, + "44": 5876.0, + "45": 5795.0, + "46": 6162.0, + "47": 6736.0, + "48": 6331.0, + "49": 6235.0, + "50": 6668.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 627718656.0, + "2": 627719168.0, + "3": 627719168.0, + "4": 627720704.0, + "5": 627718656.0, + "6": 627718656.0, + "7": 627718144.0, + "8": 627718144.0, + "9": 627718144.0, + "10": 627719168.0, + "11": 627719680.0, + "12": 627719168.0, + "13": 627719680.0, + "14": 627717120.0, + "15": 627720192.0, + "16": 627717632.0, + "17": 627718144.0, + "18": 627719680.0, + "19": 627719168.0, + "20": 627717120.0, + "21": 627718144.0, + "22": 627720192.0, + "23": 627720192.0, + "24": 627718144.0, + "25": 627718656.0, + "26": 627718144.0, + "27": 627717120.0, + "28": 627718656.0, + "29": 627717120.0, + "30": 627720192.0, + "31": 627715072.0, + "32": 627720192.0, + "33": 627717632.0, + "34": 627719168.0, + "35": 627716608.0, + "36": 627719168.0, + "37": 627718144.0, + "38": 627718656.0, + "39": 627715584.0, + "40": 627717632.0, + "41": 627714560.0, + "42": 627718144.0, + "43": 627713536.0, + "44": 627714048.0, + "45": 627719168.0, + "46": 627716096.0, + "47": 627717120.0, + "48": 627716608.0, + "49": 627715072.0, + "50": 627718144.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 870138880.0, + "2": 1099332096.0, + "3": 1099950080.0, + "4": 1102007296.0, + "5": 1102007296.0, + "6": 1102007296.0, + "7": 1102007296.0, + "8": 1102007296.0, + "9": 1102007296.0, + "10": 1102007296.0, + "11": 1102007296.0, + "12": 1102007296.0, + "13": 1103012352.0, + "14": 1103012352.0, + "15": 1103012352.0, + "16": 1103012352.0, + "17": 1103012352.0, + "18": 1103012352.0, + "19": 1103012352.0, + "20": 1103012352.0, + "21": 1103012352.0, + "22": 1103012352.0, + "23": 1103012352.0, + "24": 1103012352.0, + "25": 1103012352.0, + "26": 1103012352.0, + "27": 1103012352.0, + "28": 1103012352.0, + "29": 1103012352.0, + "30": 1103012352.0, + "31": 1103012352.0, + "32": 1103012352.0, + "33": 1103012352.0, + "34": 1103012352.0, + "35": 1103012352.0, + "36": 1103012352.0, + "37": 1103012352.0, + "38": 1103012352.0, + "39": 1103012352.0, + "40": 1103012352.0, + "41": 1103012352.0, + "42": 1103012352.0, + "43": 1103012352.0, + "44": 1103012352.0, + "45": 1103012352.0, + "46": 1103012352.0, + "47": 1103012352.0, + "48": 1103012352.0, + "49": 1103012352.0, + "50": 1103012352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 17.91902, + "2": 0.59117, + "3": 0.52614, + "4": 0.54746, + "5": 0.5056, + "6": 0.50649, + "7": 0.52305, + "8": 0.50853, + "9": 0.50644, + "10": 0.50303, + "11": 0.50387, + "12": 0.50249, + "13": 0.51153, + "14": 0.49861, + "15": 0.51318, + "16": 0.50066, + "17": 0.50888, + "18": 0.50788, + "19": 0.51533, + "20": 0.51425, + "21": 0.51111, + "22": 0.5116, + "23": 0.50626, + "24": 0.5049, + "25": 0.51101, + "26": 0.50993, + "27": 0.5073, + "28": 0.50949, + "29": 0.50784, + "30": 0.50783, + "31": 0.51255, + "32": 0.51065, + "33": 0.50731, + "34": 0.50768, + "35": 0.51749, + "36": 0.50656, + "37": 0.51012, + "38": 0.51668, + "39": 0.50475, + "40": 0.50784, + "41": 0.51405, + "42": 0.51014, + "43": 0.51186, + "44": 0.50532, + "45": 0.51211, + "46": 0.52864, + "47": 0.52545, + "48": 0.50927, + "49": 0.52883, + "50": 0.50373 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json index 9010e3064a4..90c75c99e13 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, "50": 9.98565 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, "50": 7331.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, "50": 491765248.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, "50": 1213900288.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 15.3547, - "5": 0.36735, - "10": 0.37327, - "15": 0.36612, - "20": 0.37034, - "25": 0.36884, - "30": 0.37157, - "35": 0.38429, - "40": 0.38666, - "45": 0.39183, - "50": 0.38705 + "1": 13.19467, + "2": 0.48448, + "3": 0.44871, + "4": 0.46924, + "5": 0.42566, + "6": 0.43083, + "7": 0.43901, + "8": 0.42599, + "9": 0.42583, + "10": 0.42829, + "11": 0.4235, + "12": 0.42225, + "13": 0.4285, + "14": 0.42372, + "15": 0.43098, + "16": 0.4172, + "17": 0.43302, + "18": 0.41927, + "19": 0.4331, + "20": 0.43471, + "21": 0.41939, + "22": 0.43275, + "23": 0.41768, + "24": 0.42806, + "25": 0.42095, + "26": 0.42731, + "27": 0.42655, + "28": 0.42892, + "29": 0.42736, + "30": 0.42769, + "31": 0.43481, + "32": 0.4238, + "33": 0.42194, + "34": 0.43633, + "35": 0.43921, + "36": 0.43121, + "37": 0.42193, + "38": 0.42605, + "39": 0.42408, + "40": 0.42556, + "41": 0.43247, + "42": 0.42213, + "43": 0.44451, + "44": 0.42353, + "45": 0.42949, + "46": 0.46147, + "47": 0.44954, + "48": 0.44275, + "49": 0.44961, + "50": 0.4304 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..c47332e4152 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79175, + "2": 10.80907, + "3": 10.81011, + "4": 10.78146, + "5": 10.82288, + "6": 10.84057, + "7": 10.81192, + "8": 10.80005, + "9": 10.81667, + "10": 10.7688, + "11": 10.8618, + "12": 10.84042, + "13": 10.84452, + "14": 10.86421, + "15": 10.79157, + "16": 10.78199, + "17": 10.75122, + "18": 10.79446, + "19": 10.79523, + "20": 10.71001, + "21": 10.68811, + "22": 10.53736, + "23": 10.7066, + "24": 10.58865, + "25": 10.54662, + "26": 10.59492, + "27": 10.62142, + "28": 10.5969, + "29": 10.60036, + "30": 10.39407, + "31": 10.12951, + "32": 10.49684, + "33": 10.48779, + "34": 10.24347, + "35": 10.30461, + "36": 10.26056, + "37": 10.38859, + "38": 10.24848, + "39": 10.43799, + "40": 10.13303, + "41": 10.18651, + "42": 10.25823, + "43": 9.892, + "44": 10.02576, + "45": 9.90015, + "46": 9.88387, + "47": 10.19565, + "48": 9.91255, + "49": 9.60147, + "50": 9.97874 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5656.0, + "2": 6018.0, + "3": 5790.0, + "4": 5941.0, + "5": 6476.0, + "6": 6653.0, + "7": 6287.0, + "8": 5875.0, + "9": 6239.0, + "10": 5453.0, + "11": 6936.0, + "12": 6711.0, + "13": 6655.0, + "14": 6814.0, + "15": 6233.0, + "16": 6533.0, + "17": 6397.0, + "18": 6112.0, + "19": 6678.0, + "20": 5837.0, + "21": 6403.0, + "22": 5715.0, + "23": 6744.0, + "24": 6051.0, + "25": 5811.0, + "26": 6104.0, + "27": 6484.0, + "28": 6884.0, + "29": 7253.0, + "30": 6047.0, + "31": 5593.0, + "32": 6625.0, + "33": 7054.0, + "34": 6104.0, + "35": 6712.0, + "36": 6684.0, + "37": 7523.0, + "38": 7273.0, + "39": 7620.0, + "40": 7062.0, + "41": 6895.0, + "42": 7426.0, + "43": 6713.0, + "44": 6664.0, + "45": 6681.0, + "46": 6923.0, + "47": 7705.0, + "48": 7248.0, + "49": 7331.0, + "50": 7527.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 462408192.0, + "2": 462406144.0, + "3": 462409728.0, + "4": 462406144.0, + "5": 462407680.0, + "6": 462408192.0, + "7": 462410752.0, + "8": 462410752.0, + "9": 462407168.0, + "10": 462410240.0, + "11": 462408192.0, + "12": 462408192.0, + "13": 462408704.0, + "14": 462409728.0, + "15": 462409728.0, + "16": 462407168.0, + "17": 462408704.0, + "18": 462408704.0, + "19": 462408704.0, + "20": 462408704.0, + "21": 462406144.0, + "22": 462412800.0, + "23": 462409216.0, + "24": 462408704.0, + "25": 462406144.0, + "26": 462410240.0, + "27": 462405120.0, + "28": 462408192.0, + "29": 462407168.0, + "30": 462406144.0, + "31": 462413312.0, + "32": 462408704.0, + "33": 462409216.0, + "34": 462406144.0, + "35": 462410240.0, + "36": 462407168.0, + "37": 462409728.0, + "38": 462408192.0, + "39": 462408192.0, + "40": 462407680.0, + "41": 462411264.0, + "42": 462409728.0, + "43": 462411264.0, + "44": 462407680.0, + "45": 462408704.0, + "46": 462410752.0, + "47": 462407680.0, + "48": 462408192.0, + "49": 462409728.0, + "50": 462409216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1019807232.0, + "2": 1186372608.0, + "3": 1186372608.0, + "4": 1186372608.0, + "5": 1186372608.0, + "6": 1186372608.0, + "7": 1186372608.0, + "8": 1186372608.0, + "9": 1186372608.0, + "10": 1186372608.0, + "11": 1186372608.0, + "12": 1186372608.0, + "13": 1186372608.0, + "14": 1186372608.0, + "15": 1186372608.0, + "16": 1186372608.0, + "17": 1186372608.0, + "18": 1186372608.0, + "19": 1186372608.0, + "20": 1186372608.0, + "21": 1186372608.0, + "22": 1186372608.0, + "23": 1186372608.0, + "24": 1186372608.0, + "25": 1186372608.0, + "26": 1186372608.0, + "27": 1186372608.0, + "28": 1186372608.0, + "29": 1186372608.0, + "30": 1186372608.0, + "31": 1186372608.0, + "32": 1186372608.0, + "33": 1186372608.0, + "34": 1186372608.0, + "35": 1186372608.0, + "36": 1186372608.0, + "37": 1186372608.0, + "38": 1186372608.0, + "39": 1186372608.0, + "40": 1186372608.0, + "41": 1186372608.0, + "42": 1186372608.0, + "43": 1186372608.0, + "44": 1186372608.0, + "45": 1186372608.0, + "46": 1186372608.0, + "47": 1186372608.0, + "48": 1186372608.0, + "49": 1186372608.0, + "50": 1186372608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.69829, + "2": 0.75133, + "3": 0.68321, + "4": 0.68299, + "5": 0.61733, + "6": 0.57979, + "7": 0.57675, + "8": 0.57837, + "9": 0.58539, + "10": 0.58222, + "11": 0.58158, + "12": 0.58184, + "13": 0.58692, + "14": 0.58497, + "15": 0.59994, + "16": 0.59773, + "17": 0.57959, + "18": 0.57818, + "19": 0.57753, + "20": 0.57723, + "21": 0.57903, + "22": 0.57678, + "23": 0.58682, + "24": 0.57654, + "25": 0.57615, + "26": 0.57702, + "27": 0.57613, + "28": 0.57457, + "29": 0.57523, + "30": 0.57623, + "31": 0.57821, + "32": 0.57613, + "33": 0.57379, + "34": 0.57684, + "35": 0.57784, + "36": 0.57665, + "37": 0.57697, + "38": 0.57594, + "39": 0.57412, + "40": 0.57582, + "41": 0.57418, + "42": 0.57387, + "43": 0.57626, + "44": 0.57569, + "45": 0.57598, + "46": 0.57593, + "47": 0.57827, + "48": 0.57811, + "49": 0.57776, + "50": 0.57779 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..301ddfc5e91 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79175, + "2": 10.80907, + "3": 10.81011, + "4": 10.78146, + "5": 10.82288, + "6": 10.84057, + "7": 10.81192, + "8": 10.80005, + "9": 10.81667, + "10": 10.7688, + "11": 10.8618, + "12": 10.84042, + "13": 10.84452, + "14": 10.86421, + "15": 10.79157, + "16": 10.78199, + "17": 10.75122, + "18": 10.79446, + "19": 10.79523, + "20": 10.71001, + "21": 10.68811, + "22": 10.53736, + "23": 10.7066, + "24": 10.58865, + "25": 10.54662, + "26": 10.59492, + "27": 10.62142, + "28": 10.5969, + "29": 10.60036, + "30": 10.39407, + "31": 10.12951, + "32": 10.49684, + "33": 10.48779, + "34": 10.24347, + "35": 10.30461, + "36": 10.26056, + "37": 10.38859, + "38": 10.24848, + "39": 10.43799, + "40": 10.13303, + "41": 10.18651, + "42": 10.25823, + "43": 9.892, + "44": 10.02576, + "45": 9.90015, + "46": 9.88387, + "47": 10.19565, + "48": 9.91255, + "49": 9.60147, + "50": 9.97874 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5656.0, + "2": 6018.0, + "3": 5790.0, + "4": 5941.0, + "5": 6476.0, + "6": 6653.0, + "7": 6287.0, + "8": 5875.0, + "9": 6239.0, + "10": 5453.0, + "11": 6936.0, + "12": 6711.0, + "13": 6655.0, + "14": 6814.0, + "15": 6233.0, + "16": 6533.0, + "17": 6397.0, + "18": 6112.0, + "19": 6678.0, + "20": 5837.0, + "21": 6403.0, + "22": 5715.0, + "23": 6744.0, + "24": 6051.0, + "25": 5811.0, + "26": 6104.0, + "27": 6484.0, + "28": 6884.0, + "29": 7253.0, + "30": 6047.0, + "31": 5593.0, + "32": 6625.0, + "33": 7054.0, + "34": 6104.0, + "35": 6712.0, + "36": 6684.0, + "37": 7523.0, + "38": 7273.0, + "39": 7620.0, + "40": 7062.0, + "41": 6895.0, + "42": 7426.0, + "43": 6713.0, + "44": 6664.0, + "45": 6681.0, + "46": 6923.0, + "47": 7705.0, + "48": 7248.0, + "49": 7331.0, + "50": 7527.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 462408192.0, + "2": 462406144.0, + "3": 462409728.0, + "4": 462406144.0, + "5": 462407680.0, + "6": 462408192.0, + "7": 462410752.0, + "8": 462410752.0, + "9": 462407168.0, + "10": 462410240.0, + "11": 462408192.0, + "12": 462408192.0, + "13": 462408704.0, + "14": 462409728.0, + "15": 462409728.0, + "16": 462407168.0, + "17": 462408704.0, + "18": 462408704.0, + "19": 462408704.0, + "20": 462408704.0, + "21": 462406144.0, + "22": 462412800.0, + "23": 462409216.0, + "24": 462408704.0, + "25": 462406144.0, + "26": 462410240.0, + "27": 462405120.0, + "28": 462408192.0, + "29": 462407168.0, + "30": 462406144.0, + "31": 462413312.0, + "32": 462408704.0, + "33": 462409216.0, + "34": 462406144.0, + "35": 462410240.0, + "36": 462407168.0, + "37": 462409728.0, + "38": 462408192.0, + "39": 462408192.0, + "40": 462407680.0, + "41": 462411264.0, + "42": 462409728.0, + "43": 462411264.0, + "44": 462407680.0, + "45": 462408704.0, + "46": 462410752.0, + "47": 462407680.0, + "48": 462408192.0, + "49": 462409728.0, + "50": 462409216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1019807232.0, + "2": 1186372608.0, + "3": 1186372608.0, + "4": 1186372608.0, + "5": 1186372608.0, + "6": 1186372608.0, + "7": 1186372608.0, + "8": 1186372608.0, + "9": 1186372608.0, + "10": 1186372608.0, + "11": 1186372608.0, + "12": 1186372608.0, + "13": 1186372608.0, + "14": 1186372608.0, + "15": 1186372608.0, + "16": 1186372608.0, + "17": 1186372608.0, + "18": 1186372608.0, + "19": 1186372608.0, + "20": 1186372608.0, + "21": 1186372608.0, + "22": 1186372608.0, + "23": 1186372608.0, + "24": 1186372608.0, + "25": 1186372608.0, + "26": 1186372608.0, + "27": 1186372608.0, + "28": 1186372608.0, + "29": 1186372608.0, + "30": 1186372608.0, + "31": 1186372608.0, + "32": 1186372608.0, + "33": 1186372608.0, + "34": 1186372608.0, + "35": 1186372608.0, + "36": 1186372608.0, + "37": 1186372608.0, + "38": 1186372608.0, + "39": 1186372608.0, + "40": 1186372608.0, + "41": 1186372608.0, + "42": 1186372608.0, + "43": 1186372608.0, + "44": 1186372608.0, + "45": 1186372608.0, + "46": 1186372608.0, + "47": 1186372608.0, + "48": 1186372608.0, + "49": 1186372608.0, + "50": 1186372608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.87016, + "2": 0.65629, + "3": 0.56435, + "4": 0.5717, + "5": 0.56322, + "6": 0.56979, + "7": 0.56582, + "8": 0.56867, + "9": 0.57661, + "10": 0.56784, + "11": 0.57189, + "12": 0.57201, + "13": 0.57482, + "14": 0.57089, + "15": 0.57194, + "16": 0.56916, + "17": 0.57352, + "18": 0.56823, + "19": 0.56931, + "20": 0.56782, + "21": 0.56743, + "22": 0.5663, + "23": 0.56569, + "24": 0.56599, + "25": 0.56544, + "26": 0.56524, + "27": 0.56556, + "28": 0.56547, + "29": 0.56456, + "30": 0.56668, + "31": 0.57243, + "32": 0.56549, + "33": 0.56604, + "34": 0.5659, + "35": 0.56549, + "36": 0.56418, + "37": 0.56524, + "38": 0.56422, + "39": 0.56426, + "40": 0.56469, + "41": 0.56367, + "42": 0.56796, + "43": 0.57027, + "44": 0.57157, + "45": 0.56565, + "46": 0.56924, + "47": 0.57401, + "48": 0.57226, + "49": 0.56767, + "50": 0.56405 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..d9811bb579f --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, + "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, + "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, + "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, + "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, + "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, + "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, + "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, + "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, + "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, + "50": 9.98565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, + "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, + "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, + "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, + "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, + "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, + "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, + "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, + "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, + "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, + "50": 7331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, + "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, + "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, + "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, + "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, + "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, + "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, + "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, + "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, + "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, + "50": 491765248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, + "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, + "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, + "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, + "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, + "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, + "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, + "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, + "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, + "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, + "50": 1213900288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.81321, + "2": 0.47201, + "3": 0.40381, + "4": 0.41626, + "5": 0.37526, + "6": 0.39128, + "7": 0.38006, + "8": 0.38712, + "9": 0.37978, + "10": 0.36542, + "11": 0.37019, + "12": 0.3584, + "13": 0.37121, + "14": 0.37141, + "15": 0.37291, + "16": 0.36319, + "17": 0.3701, + "18": 0.35732, + "19": 0.36745, + "20": 0.36768, + "21": 0.36322, + "22": 0.36627, + "23": 0.36042, + "24": 0.36521, + "25": 0.36471, + "26": 0.36406, + "27": 0.35919, + "28": 0.37411, + "29": 0.35657, + "30": 0.36834, + "31": 0.37292, + "32": 0.35489, + "33": 0.36692, + "34": 0.37173, + "35": 0.37097, + "36": 0.36594, + "37": 0.36691, + "38": 0.36847, + "39": 0.36166, + "40": 0.36415, + "41": 0.36888, + "42": 0.36642, + "43": 0.37419, + "44": 0.37026, + "45": 0.36033, + "46": 0.39777, + "47": 0.37677, + "48": 0.36794, + "49": 0.3863, + "50": 0.36013 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b250bf7ac21 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, + "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, + "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, + "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, + "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, + "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, + "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, + "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, + "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, + "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, + "50": 9.98565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, + "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, + "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, + "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, + "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, + "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, + "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, + "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, + "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, + "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, + "50": 7331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, + "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, + "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, + "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, + "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, + "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, + "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, + "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, + "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, + "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, + "50": 491765248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, + "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, + "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, + "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, + "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, + "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, + "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, + "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, + "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, + "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, + "50": 1213900288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.26707, + "2": 0.52806, + "3": 0.46475, + "4": 0.47125, + "5": 0.42985, + "6": 0.42614, + "7": 0.43552, + "8": 0.42689, + "9": 0.42927, + "10": 0.42373, + "11": 0.42662, + "12": 0.42301, + "13": 0.42359, + "14": 0.4226, + "15": 0.42796, + "16": 0.42415, + "17": 0.4235, + "18": 0.41948, + "19": 0.42601, + "20": 0.42722, + "21": 0.4176, + "22": 0.41953, + "23": 0.42303, + "24": 0.4187, + "25": 0.42281, + "26": 0.42449, + "27": 0.41941, + "28": 0.42935, + "29": 0.417, + "30": 0.4261, + "31": 0.42904, + "32": 0.41844, + "33": 0.41687, + "34": 0.43419, + "35": 0.43727, + "36": 0.42315, + "37": 0.42179, + "38": 0.42403, + "39": 0.4179, + "40": 0.42443, + "41": 0.42169, + "42": 0.42155, + "43": 0.43942, + "44": 0.42209, + "45": 0.41972, + "46": 0.46515, + "47": 0.43911, + "48": 0.43693, + "49": 0.44745, + "50": 0.4198 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..8f5e5238362 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79288, + "2": 10.81339, + "3": 10.8144, + "4": 10.77657, + "5": 10.828, + "6": 10.84293, + "7": 10.81053, + "8": 10.80366, + "9": 10.81505, + "10": 10.76831, + "11": 10.86961, + "12": 10.83911, + "13": 10.85295, + "14": 10.86545, + "15": 10.79073, + "16": 10.78351, + "17": 10.7488, + "18": 10.79251, + "19": 10.78822, + "20": 10.7066, + "21": 10.68957, + "22": 10.53861, + "23": 10.70542, + "24": 10.59106, + "25": 10.54061, + "26": 10.59556, + "27": 10.61836, + "28": 10.59188, + "29": 10.6008, + "30": 10.39485, + "31": 10.12988, + "32": 10.49622, + "33": 10.48801, + "34": 10.24185, + "35": 10.30488, + "36": 10.25446, + "37": 10.38879, + "38": 10.24767, + "39": 10.43653, + "40": 10.13079, + "41": 10.18439, + "42": 10.25364, + "43": 9.89225, + "44": 10.0224, + "45": 9.90236, + "46": 9.88337, + "47": 10.1948, + "48": 9.91124, + "49": 9.59882, + "50": 9.97938 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5601.0, + "2": 5974.0, + "3": 5786.0, + "4": 5760.0, + "5": 6601.0, + "6": 6753.0, + "7": 6231.0, + "8": 5822.0, + "9": 6446.0, + "10": 5254.0, + "11": 6740.0, + "12": 6313.0, + "13": 6672.0, + "14": 6909.0, + "15": 6250.0, + "16": 6391.0, + "17": 6290.0, + "18": 6086.0, + "19": 6278.0, + "20": 5969.0, + "21": 6461.0, + "22": 5583.0, + "23": 6602.0, + "24": 5982.0, + "25": 5816.0, + "26": 6162.0, + "27": 6378.0, + "28": 6931.0, + "29": 7197.0, + "30": 6181.0, + "31": 5568.0, + "32": 6876.0, + "33": 6980.0, + "34": 6144.0, + "35": 6751.0, + "36": 6501.0, + "37": 7367.0, + "38": 7095.0, + "39": 7558.0, + "40": 6831.0, + "41": 6929.0, + "42": 7131.0, + "43": 6817.0, + "44": 6736.0, + "45": 6881.0, + "46": 7006.0, + "47": 7622.0, + "48": 7384.0, + "49": 7363.0, + "50": 7684.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 458211840.0, + "2": 458213376.0, + "3": 458214400.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458215424.0, + "7": 458212864.0, + "8": 458212864.0, + "9": 458214912.0, + "10": 458214912.0, + "11": 458214912.0, + "12": 458213888.0, + "13": 458214912.0, + "14": 458213376.0, + "15": 458215424.0, + "16": 458214400.0, + "17": 458214400.0, + "18": 458215424.0, + "19": 458209792.0, + "20": 458212864.0, + "21": 458211840.0, + "22": 458219520.0, + "23": 458213888.0, + "24": 458214912.0, + "25": 458215424.0, + "26": 458213376.0, + "27": 458213888.0, + "28": 458213888.0, + "29": 458212864.0, + "30": 458211840.0, + "31": 458218496.0, + "32": 458214912.0, + "33": 458212352.0, + "34": 458214400.0, + "35": 458214400.0, + "36": 458215424.0, + "37": 458213888.0, + "38": 458213888.0, + "39": 458213888.0, + "40": 458214912.0, + "41": 458216448.0, + "42": 458213888.0, + "43": 458217472.0, + "44": 458212864.0, + "45": 458213888.0, + "46": 458216960.0, + "47": 458214400.0, + "48": 458212352.0, + "49": 458215424.0, + "50": 458214912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1016394240.0, + "2": 1180904960.0, + "3": 1180904960.0, + "4": 1180904960.0, + "5": 1180904960.0, + "6": 1180904960.0, + "7": 1180904960.0, + "8": 1180904960.0, + "9": 1180934144.0, + "10": 1180934144.0, + "11": 1180934144.0, + "12": 1180934144.0, + "13": 1180934144.0, + "14": 1180934144.0, + "15": 1180990976.0, + "16": 1180990976.0, + "17": 1180990976.0, + "18": 1180990976.0, + "19": 1180990976.0, + "20": 1180990976.0, + "21": 1180990976.0, + "22": 1180990976.0, + "23": 1180990976.0, + "24": 1180990976.0, + "25": 1181222912.0, + "26": 1181222912.0, + "27": 1181222912.0, + "28": 1181222912.0, + "29": 1181222912.0, + "30": 1181222912.0, + "31": 1181222912.0, + "32": 1181222912.0, + "33": 1181222912.0, + "34": 1181222912.0, + "35": 1181468160.0, + "36": 1181468160.0, + "37": 1181468160.0, + "38": 1181468160.0, + "39": 1181468160.0, + "40": 1181468160.0, + "41": 1181468160.0, + "42": 1181468160.0, + "43": 1181468160.0, + "44": 1183467008.0, + "45": 1183467008.0, + "46": 1183467008.0, + "47": 1183467008.0, + "48": 1183467008.0, + "49": 1183467008.0, + "50": 1183467008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 16.05941, + "2": 0.66923, + "3": 0.61216, + "4": 0.58734, + "5": 0.60006, + "6": 0.58013, + "7": 0.60084, + "8": 0.59342, + "9": 0.59047, + "10": 0.60222, + "11": 0.58523, + "12": 0.60039, + "13": 0.58622, + "14": 0.59318, + "15": 0.59774, + "16": 0.58824, + "17": 0.60997, + "18": 0.58565, + "19": 0.596, + "20": 0.59978, + "21": 0.58617, + "22": 0.60156, + "23": 0.58205, + "24": 0.60247, + "25": 0.60354, + "26": 0.5839, + "27": 0.61043, + "28": 0.58334, + "29": 0.60152, + "30": 0.59973, + "31": 0.58621, + "32": 0.59768, + "33": 0.58349, + "34": 0.59991, + "35": 0.59183, + "36": 0.58804, + "37": 0.60327, + "38": 0.58347, + "39": 0.60102, + "40": 0.58409, + "41": 0.59493, + "42": 0.5989, + "43": 0.58752, + "44": 0.59927, + "45": 0.59465, + "46": 0.60409, + "47": 0.60265, + "48": 0.5887, + "49": 0.6087, + "50": 0.58454 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..19437ff4a78 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79288, + "2": 10.81339, + "3": 10.8144, + "4": 10.77657, + "5": 10.828, + "6": 10.84293, + "7": 10.81053, + "8": 10.80366, + "9": 10.81505, + "10": 10.76831, + "11": 10.86961, + "12": 10.83911, + "13": 10.85295, + "14": 10.86545, + "15": 10.79073, + "16": 10.78351, + "17": 10.7488, + "18": 10.79251, + "19": 10.78822, + "20": 10.7066, + "21": 10.68957, + "22": 10.53861, + "23": 10.70542, + "24": 10.59106, + "25": 10.54061, + "26": 10.59556, + "27": 10.61836, + "28": 10.59188, + "29": 10.6008, + "30": 10.39485, + "31": 10.12988, + "32": 10.49622, + "33": 10.48801, + "34": 10.24185, + "35": 10.30488, + "36": 10.25446, + "37": 10.38879, + "38": 10.24767, + "39": 10.43653, + "40": 10.13079, + "41": 10.18439, + "42": 10.25364, + "43": 9.89225, + "44": 10.0224, + "45": 9.90236, + "46": 9.88337, + "47": 10.1948, + "48": 9.91124, + "49": 9.59882, + "50": 9.97938 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5601.0, + "2": 5974.0, + "3": 5786.0, + "4": 5760.0, + "5": 6601.0, + "6": 6753.0, + "7": 6231.0, + "8": 5822.0, + "9": 6446.0, + "10": 5254.0, + "11": 6740.0, + "12": 6313.0, + "13": 6672.0, + "14": 6909.0, + "15": 6250.0, + "16": 6391.0, + "17": 6290.0, + "18": 6086.0, + "19": 6278.0, + "20": 5969.0, + "21": 6461.0, + "22": 5583.0, + "23": 6602.0, + "24": 5982.0, + "25": 5816.0, + "26": 6162.0, + "27": 6378.0, + "28": 6931.0, + "29": 7197.0, + "30": 6181.0, + "31": 5568.0, + "32": 6876.0, + "33": 6980.0, + "34": 6144.0, + "35": 6751.0, + "36": 6501.0, + "37": 7367.0, + "38": 7095.0, + "39": 7558.0, + "40": 6831.0, + "41": 6929.0, + "42": 7131.0, + "43": 6817.0, + "44": 6736.0, + "45": 6881.0, + "46": 7006.0, + "47": 7622.0, + "48": 7384.0, + "49": 7363.0, + "50": 7684.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 458211840.0, + "2": 458213376.0, + "3": 458214400.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458215424.0, + "7": 458212864.0, + "8": 458212864.0, + "9": 458214912.0, + "10": 458214912.0, + "11": 458214912.0, + "12": 458213888.0, + "13": 458214912.0, + "14": 458213376.0, + "15": 458215424.0, + "16": 458214400.0, + "17": 458214400.0, + "18": 458215424.0, + "19": 458209792.0, + "20": 458212864.0, + "21": 458211840.0, + "22": 458219520.0, + "23": 458213888.0, + "24": 458214912.0, + "25": 458215424.0, + "26": 458213376.0, + "27": 458213888.0, + "28": 458213888.0, + "29": 458212864.0, + "30": 458211840.0, + "31": 458218496.0, + "32": 458214912.0, + "33": 458212352.0, + "34": 458214400.0, + "35": 458214400.0, + "36": 458215424.0, + "37": 458213888.0, + "38": 458213888.0, + "39": 458213888.0, + "40": 458214912.0, + "41": 458216448.0, + "42": 458213888.0, + "43": 458217472.0, + "44": 458212864.0, + "45": 458213888.0, + "46": 458216960.0, + "47": 458214400.0, + "48": 458212352.0, + "49": 458215424.0, + "50": 458214912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1016394240.0, + "2": 1180904960.0, + "3": 1180904960.0, + "4": 1180904960.0, + "5": 1180904960.0, + "6": 1180904960.0, + "7": 1180904960.0, + "8": 1180904960.0, + "9": 1180934144.0, + "10": 1180934144.0, + "11": 1180934144.0, + "12": 1180934144.0, + "13": 1180934144.0, + "14": 1180934144.0, + "15": 1180990976.0, + "16": 1180990976.0, + "17": 1180990976.0, + "18": 1180990976.0, + "19": 1180990976.0, + "20": 1180990976.0, + "21": 1180990976.0, + "22": 1180990976.0, + "23": 1180990976.0, + "24": 1180990976.0, + "25": 1181222912.0, + "26": 1181222912.0, + "27": 1181222912.0, + "28": 1181222912.0, + "29": 1181222912.0, + "30": 1181222912.0, + "31": 1181222912.0, + "32": 1181222912.0, + "33": 1181222912.0, + "34": 1181222912.0, + "35": 1181468160.0, + "36": 1181468160.0, + "37": 1181468160.0, + "38": 1181468160.0, + "39": 1181468160.0, + "40": 1181468160.0, + "41": 1181468160.0, + "42": 1181468160.0, + "43": 1181468160.0, + "44": 1183467008.0, + "45": 1183467008.0, + "46": 1183467008.0, + "47": 1183467008.0, + "48": 1183467008.0, + "49": 1183467008.0, + "50": 1183467008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.92117, + "2": 0.78495, + "3": 0.65993, + "4": 0.60281, + "5": 0.62415, + "6": 0.59632, + "7": 0.61058, + "8": 0.60884, + "9": 0.61298, + "10": 0.60737, + "11": 0.59282, + "12": 0.62404, + "13": 0.59787, + "14": 0.5992, + "15": 0.60558, + "16": 0.58919, + "17": 0.60862, + "18": 0.58494, + "19": 0.59977, + "20": 0.59905, + "21": 0.58779, + "22": 0.60691, + "23": 0.58773, + "24": 0.59879, + "25": 0.59399, + "26": 0.58416, + "27": 0.59705, + "28": 0.58558, + "29": 0.60279, + "30": 0.59279, + "31": 0.59125, + "32": 0.60528, + "33": 0.58125, + "34": 0.59849, + "35": 0.5851, + "36": 0.59833, + "37": 0.59938, + "38": 0.58782, + "39": 0.59605, + "40": 0.58815, + "41": 0.59763, + "42": 0.60014, + "43": 0.58419, + "44": 0.59775, + "45": 0.58451, + "46": 0.60219, + "47": 0.59473, + "48": 0.58641, + "49": 0.6019, + "50": 0.58426 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json index 966de8bb1bb..eba1757fe35 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, "50": 9.98565 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, "50": 7331.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, "50": 491765248.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, "50": 1213900288.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.78242, - "5": 0.36146, - "10": 0.35831, - "15": 0.36317, - "20": 0.36704, - "25": 0.35673, - "30": 0.36236, - "35": 0.37486, - "40": 0.36477, - "45": 0.36076, - "50": 0.36594 + "1": 13.09447, + "2": 0.51607, + "3": 0.44405, + "4": 0.45969, + "5": 0.41888, + "6": 0.42393, + "7": 0.42442, + "8": 0.41943, + "9": 0.41271, + "10": 0.41462, + "11": 0.41487, + "12": 0.40591, + "13": 0.41444, + "14": 0.40303, + "15": 0.41598, + "16": 0.40637, + "17": 0.40922, + "18": 0.41209, + "19": 0.40964, + "20": 0.4238, + "21": 0.4078, + "22": 0.41408, + "23": 0.41657, + "24": 0.40953, + "25": 0.41984, + "26": 0.41935, + "27": 0.41845, + "28": 0.42267, + "29": 0.41439, + "30": 0.42344, + "31": 0.42201, + "32": 0.42025, + "33": 0.4143, + "34": 0.50551, + "35": 0.44065, + "36": 0.41296, + "37": 0.41985, + "38": 0.41541, + "39": 0.41687, + "40": 0.41757, + "41": 0.4181, + "42": 0.41983, + "43": 0.42929, + "44": 0.41833, + "45": 0.41337, + "46": 0.46022, + "47": 0.43427, + "48": 0.42794, + "49": 0.44841, + "50": 0.41311 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..aeb8f53adff --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79175, + "2": 10.80907, + "3": 10.81011, + "4": 10.78146, + "5": 10.82288, + "6": 10.84057, + "7": 10.81192, + "8": 10.80005, + "9": 10.81667, + "10": 10.7688, + "11": 10.8618, + "12": 10.84042, + "13": 10.84452, + "14": 10.86421, + "15": 10.79157, + "16": 10.78199, + "17": 10.75122, + "18": 10.79446, + "19": 10.79523, + "20": 10.71001, + "21": 10.68811, + "22": 10.53736, + "23": 10.7066, + "24": 10.58865, + "25": 10.54662, + "26": 10.59492, + "27": 10.62142, + "28": 10.5969, + "29": 10.60036, + "30": 10.39407, + "31": 10.12951, + "32": 10.49684, + "33": 10.48779, + "34": 10.24347, + "35": 10.30461, + "36": 10.26056, + "37": 10.38859, + "38": 10.24848, + "39": 10.43799, + "40": 10.13303, + "41": 10.18651, + "42": 10.25823, + "43": 9.892, + "44": 10.02576, + "45": 9.90015, + "46": 9.88387, + "47": 10.19565, + "48": 9.91255, + "49": 9.60147, + "50": 9.97874 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5656.0, + "2": 6018.0, + "3": 5790.0, + "4": 5941.0, + "5": 6476.0, + "6": 6653.0, + "7": 6287.0, + "8": 5875.0, + "9": 6239.0, + "10": 5453.0, + "11": 6936.0, + "12": 6711.0, + "13": 6655.0, + "14": 6814.0, + "15": 6233.0, + "16": 6533.0, + "17": 6397.0, + "18": 6112.0, + "19": 6678.0, + "20": 5837.0, + "21": 6403.0, + "22": 5715.0, + "23": 6744.0, + "24": 6051.0, + "25": 5811.0, + "26": 6104.0, + "27": 6484.0, + "28": 6884.0, + "29": 7253.0, + "30": 6047.0, + "31": 5593.0, + "32": 6625.0, + "33": 7054.0, + "34": 6104.0, + "35": 6712.0, + "36": 6684.0, + "37": 7523.0, + "38": 7273.0, + "39": 7620.0, + "40": 7062.0, + "41": 6895.0, + "42": 7426.0, + "43": 6713.0, + "44": 6664.0, + "45": 6681.0, + "46": 6923.0, + "47": 7705.0, + "48": 7248.0, + "49": 7331.0, + "50": 7527.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 462408192.0, + "2": 462406144.0, + "3": 462409728.0, + "4": 462406144.0, + "5": 462407680.0, + "6": 462408192.0, + "7": 462410752.0, + "8": 462410752.0, + "9": 462407168.0, + "10": 462410240.0, + "11": 462408192.0, + "12": 462408192.0, + "13": 462408704.0, + "14": 462409728.0, + "15": 462409728.0, + "16": 462407168.0, + "17": 462408704.0, + "18": 462408704.0, + "19": 462408704.0, + "20": 462408704.0, + "21": 462406144.0, + "22": 462412800.0, + "23": 462409216.0, + "24": 462408704.0, + "25": 462406144.0, + "26": 462410240.0, + "27": 462405120.0, + "28": 462408192.0, + "29": 462407168.0, + "30": 462406144.0, + "31": 462413312.0, + "32": 462408704.0, + "33": 462409216.0, + "34": 462406144.0, + "35": 462410240.0, + "36": 462407168.0, + "37": 462409728.0, + "38": 462408192.0, + "39": 462408192.0, + "40": 462407680.0, + "41": 462411264.0, + "42": 462409728.0, + "43": 462411264.0, + "44": 462407680.0, + "45": 462408704.0, + "46": 462410752.0, + "47": 462407680.0, + "48": 462408192.0, + "49": 462409728.0, + "50": 462409216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1019807232.0, + "2": 1186372608.0, + "3": 1186372608.0, + "4": 1186372608.0, + "5": 1186372608.0, + "6": 1186372608.0, + "7": 1186372608.0, + "8": 1186372608.0, + "9": 1186372608.0, + "10": 1186372608.0, + "11": 1186372608.0, + "12": 1186372608.0, + "13": 1186372608.0, + "14": 1186372608.0, + "15": 1186372608.0, + "16": 1186372608.0, + "17": 1186372608.0, + "18": 1186372608.0, + "19": 1186372608.0, + "20": 1186372608.0, + "21": 1186372608.0, + "22": 1186372608.0, + "23": 1186372608.0, + "24": 1186372608.0, + "25": 1186372608.0, + "26": 1186372608.0, + "27": 1186372608.0, + "28": 1186372608.0, + "29": 1186372608.0, + "30": 1186372608.0, + "31": 1186372608.0, + "32": 1186372608.0, + "33": 1186372608.0, + "34": 1186372608.0, + "35": 1186372608.0, + "36": 1186372608.0, + "37": 1186372608.0, + "38": 1186372608.0, + "39": 1186372608.0, + "40": 1186372608.0, + "41": 1186372608.0, + "42": 1186372608.0, + "43": 1186372608.0, + "44": 1186372608.0, + "45": 1186372608.0, + "46": 1186372608.0, + "47": 1186372608.0, + "48": 1186372608.0, + "49": 1186372608.0, + "50": 1186372608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7.63206, + "2": 0.65692, + "3": 0.5824, + "4": 0.58308, + "5": 0.58182, + "6": 0.57849, + "7": 0.57628, + "8": 0.57557, + "9": 0.57694, + "10": 0.57443, + "11": 0.57466, + "12": 0.57548, + "13": 0.57752, + "14": 0.58301, + "15": 0.57494, + "16": 0.5737, + "17": 0.57748, + "18": 0.57584, + "19": 0.57312, + "20": 0.57465, + "21": 0.57268, + "22": 0.57394, + "23": 0.57466, + "24": 0.57498, + "25": 0.57708, + "26": 0.57279, + "27": 0.57369, + "28": 0.57312, + "29": 0.57271, + "30": 0.57407, + "31": 0.5737, + "32": 0.57173, + "33": 0.57054, + "34": 0.5736, + "35": 0.57222, + "36": 0.57349, + "37": 0.57417, + "38": 0.57356, + "39": 0.57214, + "40": 0.57186, + "41": 0.57234, + "42": 0.57304, + "43": 0.5732, + "44": 0.5724, + "45": 0.5728, + "46": 0.57286, + "47": 0.57315, + "48": 0.57441, + "49": 0.57353, + "50": 0.57322 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..b4b3a0e2762 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79175, + "2": 10.80907, + "3": 10.81011, + "4": 10.78146, + "5": 10.82288, + "6": 10.84057, + "7": 10.81192, + "8": 10.80005, + "9": 10.81667, + "10": 10.7688, + "11": 10.8618, + "12": 10.84042, + "13": 10.84452, + "14": 10.86421, + "15": 10.79157, + "16": 10.78199, + "17": 10.75122, + "18": 10.79446, + "19": 10.79523, + "20": 10.71001, + "21": 10.68811, + "22": 10.53736, + "23": 10.7066, + "24": 10.58865, + "25": 10.54662, + "26": 10.59492, + "27": 10.62142, + "28": 10.5969, + "29": 10.60036, + "30": 10.39407, + "31": 10.12951, + "32": 10.49684, + "33": 10.48779, + "34": 10.24347, + "35": 10.30461, + "36": 10.26056, + "37": 10.38859, + "38": 10.24848, + "39": 10.43799, + "40": 10.13303, + "41": 10.18651, + "42": 10.25823, + "43": 9.892, + "44": 10.02576, + "45": 9.90015, + "46": 9.88387, + "47": 10.19565, + "48": 9.91255, + "49": 9.60147, + "50": 9.97874 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5656.0, + "2": 6018.0, + "3": 5790.0, + "4": 5941.0, + "5": 6476.0, + "6": 6653.0, + "7": 6287.0, + "8": 5875.0, + "9": 6239.0, + "10": 5453.0, + "11": 6936.0, + "12": 6711.0, + "13": 6655.0, + "14": 6814.0, + "15": 6233.0, + "16": 6533.0, + "17": 6397.0, + "18": 6112.0, + "19": 6678.0, + "20": 5837.0, + "21": 6403.0, + "22": 5715.0, + "23": 6744.0, + "24": 6051.0, + "25": 5811.0, + "26": 6104.0, + "27": 6484.0, + "28": 6884.0, + "29": 7253.0, + "30": 6047.0, + "31": 5593.0, + "32": 6625.0, + "33": 7054.0, + "34": 6104.0, + "35": 6712.0, + "36": 6684.0, + "37": 7523.0, + "38": 7273.0, + "39": 7620.0, + "40": 7062.0, + "41": 6895.0, + "42": 7426.0, + "43": 6713.0, + "44": 6664.0, + "45": 6681.0, + "46": 6923.0, + "47": 7705.0, + "48": 7248.0, + "49": 7331.0, + "50": 7527.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 462408192.0, + "2": 462406144.0, + "3": 462409728.0, + "4": 462406144.0, + "5": 462407680.0, + "6": 462408192.0, + "7": 462410752.0, + "8": 462410752.0, + "9": 462407168.0, + "10": 462410240.0, + "11": 462408192.0, + "12": 462408192.0, + "13": 462408704.0, + "14": 462409728.0, + "15": 462409728.0, + "16": 462407168.0, + "17": 462408704.0, + "18": 462408704.0, + "19": 462408704.0, + "20": 462408704.0, + "21": 462406144.0, + "22": 462412800.0, + "23": 462409216.0, + "24": 462408704.0, + "25": 462406144.0, + "26": 462410240.0, + "27": 462405120.0, + "28": 462408192.0, + "29": 462407168.0, + "30": 462406144.0, + "31": 462413312.0, + "32": 462408704.0, + "33": 462409216.0, + "34": 462406144.0, + "35": 462410240.0, + "36": 462407168.0, + "37": 462409728.0, + "38": 462408192.0, + "39": 462408192.0, + "40": 462407680.0, + "41": 462411264.0, + "42": 462409728.0, + "43": 462411264.0, + "44": 462407680.0, + "45": 462408704.0, + "46": 462410752.0, + "47": 462407680.0, + "48": 462408192.0, + "49": 462409728.0, + "50": 462409216.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1019807232.0, + "2": 1186372608.0, + "3": 1186372608.0, + "4": 1186372608.0, + "5": 1186372608.0, + "6": 1186372608.0, + "7": 1186372608.0, + "8": 1186372608.0, + "9": 1186372608.0, + "10": 1186372608.0, + "11": 1186372608.0, + "12": 1186372608.0, + "13": 1186372608.0, + "14": 1186372608.0, + "15": 1186372608.0, + "16": 1186372608.0, + "17": 1186372608.0, + "18": 1186372608.0, + "19": 1186372608.0, + "20": 1186372608.0, + "21": 1186372608.0, + "22": 1186372608.0, + "23": 1186372608.0, + "24": 1186372608.0, + "25": 1186372608.0, + "26": 1186372608.0, + "27": 1186372608.0, + "28": 1186372608.0, + "29": 1186372608.0, + "30": 1186372608.0, + "31": 1186372608.0, + "32": 1186372608.0, + "33": 1186372608.0, + "34": 1186372608.0, + "35": 1186372608.0, + "36": 1186372608.0, + "37": 1186372608.0, + "38": 1186372608.0, + "39": 1186372608.0, + "40": 1186372608.0, + "41": 1186372608.0, + "42": 1186372608.0, + "43": 1186372608.0, + "44": 1186372608.0, + "45": 1186372608.0, + "46": 1186372608.0, + "47": 1186372608.0, + "48": 1186372608.0, + "49": 1186372608.0, + "50": 1186372608.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7.40856, + "2": 0.64197, + "3": 0.58531, + "4": 0.58507, + "5": 0.57697, + "6": 0.5793, + "7": 0.5782, + "8": 0.58243, + "9": 0.58414, + "10": 0.58249, + "11": 0.58253, + "12": 0.58879, + "13": 0.58756, + "14": 0.5805, + "15": 0.57895, + "16": 0.58121, + "17": 0.58174, + "18": 0.58068, + "19": 0.58124, + "20": 0.58037, + "21": 0.58171, + "22": 0.58014, + "23": 0.5805, + "24": 0.5793, + "25": 0.58053, + "26": 0.58187, + "27": 0.57993, + "28": 0.57974, + "29": 0.58115, + "30": 0.58209, + "31": 0.58796, + "32": 0.58194, + "33": 0.58092, + "34": 0.58015, + "35": 0.5818, + "36": 0.58003, + "37": 0.58229, + "38": 0.58277, + "39": 0.57819, + "40": 0.57868, + "41": 0.57976, + "42": 0.57721, + "43": 0.57953, + "44": 0.58081, + "45": 0.57938, + "46": 0.58149, + "47": 0.58214, + "48": 0.58119, + "49": 0.58151, + "50": 0.57895 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..4fb97350a0f --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, + "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, + "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, + "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, + "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, + "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, + "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, + "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, + "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, + "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, + "50": 9.98565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, + "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, + "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, + "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, + "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, + "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, + "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, + "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, + "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, + "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, + "50": 7331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, + "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, + "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, + "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, + "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, + "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, + "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, + "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, + "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, + "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, + "50": 491765248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, + "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, + "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, + "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, + "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, + "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, + "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, + "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, + "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, + "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, + "50": 1213900288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.17122, + "2": 0.48582, + "3": 0.38154, + "4": 0.40574, + "5": 0.36399, + "6": 0.36563, + "7": 0.3696, + "8": 0.36586, + "9": 0.36758, + "10": 0.36149, + "11": 0.37339, + "12": 0.36971, + "13": 0.36807, + "14": 0.36325, + "15": 0.36851, + "16": 0.36056, + "17": 0.36306, + "18": 0.36443, + "19": 0.36656, + "20": 0.36899, + "21": 0.35832, + "22": 0.35751, + "23": 0.36137, + "24": 0.35806, + "25": 0.35888, + "26": 0.36389, + "27": 0.35895, + "28": 0.36593, + "29": 0.36043, + "30": 0.36535, + "31": 0.38123, + "32": 0.36798, + "33": 0.36325, + "34": 0.3734, + "35": 0.37508, + "36": 0.37043, + "37": 0.38008, + "38": 0.37006, + "39": 0.37268, + "40": 0.37049, + "41": 0.37086, + "42": 0.36713, + "43": 0.37942, + "44": 0.38971, + "45": 0.37293, + "46": 0.41366, + "47": 0.39088, + "48": 0.37854, + "49": 0.41143, + "50": 0.37319 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..eb4665ad7e2 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80475, + "2": 10.821, + "3": 10.8216, + "4": 10.79306, + "5": 10.84831, + "6": 10.85888, + "7": 10.83177, + "8": 10.82362, + "9": 10.83757, + "10": 10.78732, + "11": 10.86732, + "12": 10.85395, + "13": 10.86171, + "14": 10.88343, + "15": 10.79765, + "16": 10.79986, + "17": 10.76238, + "18": 10.80286, + "19": 10.7945, + "20": 10.71733, + "21": 10.70194, + "22": 10.55147, + "23": 10.72167, + "24": 10.60698, + "25": 10.54614, + "26": 10.6136, + "27": 10.63974, + "28": 10.60486, + "29": 10.62277, + "30": 10.41109, + "31": 10.1456, + "32": 10.51017, + "33": 10.50089, + "34": 10.25812, + "35": 10.3154, + "36": 10.27895, + "37": 10.41061, + "38": 10.25908, + "39": 10.45334, + "40": 10.1604, + "41": 10.20557, + "42": 10.26792, + "43": 9.90468, + "44": 10.03233, + "45": 9.91098, + "46": 9.87857, + "47": 10.20952, + "48": 9.93178, + "49": 9.61584, + "50": 9.98565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5474.0, + "2": 5853.0, + "3": 5875.0, + "4": 6041.0, + "5": 6601.0, + "6": 6654.0, + "7": 6135.0, + "8": 5761.0, + "9": 6505.0, + "10": 5497.0, + "11": 6994.0, + "12": 6523.0, + "13": 6807.0, + "14": 6969.0, + "15": 6154.0, + "16": 6667.0, + "17": 6368.0, + "18": 6298.0, + "19": 6353.0, + "20": 5998.0, + "21": 6264.0, + "22": 5628.0, + "23": 6620.0, + "24": 6063.0, + "25": 5649.0, + "26": 6226.0, + "27": 6409.0, + "28": 6790.0, + "29": 7055.0, + "30": 6430.0, + "31": 5565.0, + "32": 6615.0, + "33": 6969.0, + "34": 6107.0, + "35": 6538.0, + "36": 6486.0, + "37": 7272.0, + "38": 6923.0, + "39": 7497.0, + "40": 6997.0, + "41": 6747.0, + "42": 7228.0, + "43": 6629.0, + "44": 6752.0, + "45": 6557.0, + "46": 6904.0, + "47": 7474.0, + "48": 7165.0, + "49": 7244.0, + "50": 7331.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 491766784.0, + "2": 491767296.0, + "3": 491765760.0, + "4": 491767296.0, + "5": 491766784.0, + "6": 491767808.0, + "7": 491767296.0, + "8": 491768320.0, + "9": 491767808.0, + "10": 491767296.0, + "11": 491765248.0, + "12": 491764736.0, + "13": 491766272.0, + "14": 491767808.0, + "15": 491768832.0, + "16": 491769856.0, + "17": 491767296.0, + "18": 491765248.0, + "19": 491766272.0, + "20": 491766784.0, + "21": 491768320.0, + "22": 491768320.0, + "23": 491765760.0, + "24": 491766272.0, + "25": 491766272.0, + "26": 491767296.0, + "27": 491766784.0, + "28": 491767296.0, + "29": 491766272.0, + "30": 491766272.0, + "31": 491767808.0, + "32": 491765760.0, + "33": 491764736.0, + "34": 491768320.0, + "35": 491769344.0, + "36": 491765760.0, + "37": 491765248.0, + "38": 491766272.0, + "39": 491767808.0, + "40": 491765760.0, + "41": 491768320.0, + "42": 491766272.0, + "43": 491768832.0, + "44": 491768320.0, + "45": 491765248.0, + "46": 491768320.0, + "47": 491765760.0, + "48": 491766784.0, + "49": 491766784.0, + "50": 491765248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1047229440.0, + "2": 1213900288.0, + "3": 1213900288.0, + "4": 1213900288.0, + "5": 1213900288.0, + "6": 1213900288.0, + "7": 1213900288.0, + "8": 1213900288.0, + "9": 1213900288.0, + "10": 1213900288.0, + "11": 1213900288.0, + "12": 1213900288.0, + "13": 1213900288.0, + "14": 1213900288.0, + "15": 1213900288.0, + "16": 1213900288.0, + "17": 1213900288.0, + "18": 1213900288.0, + "19": 1213900288.0, + "20": 1213900288.0, + "21": 1213900288.0, + "22": 1213900288.0, + "23": 1213900288.0, + "24": 1213900288.0, + "25": 1213900288.0, + "26": 1213900288.0, + "27": 1213900288.0, + "28": 1213900288.0, + "29": 1213900288.0, + "30": 1213900288.0, + "31": 1213900288.0, + "32": 1213900288.0, + "33": 1213900288.0, + "34": 1213900288.0, + "35": 1213900288.0, + "36": 1213900288.0, + "37": 1213900288.0, + "38": 1213900288.0, + "39": 1213900288.0, + "40": 1213900288.0, + "41": 1213900288.0, + "42": 1213900288.0, + "43": 1213900288.0, + "44": 1213900288.0, + "45": 1213900288.0, + "46": 1213900288.0, + "47": 1213900288.0, + "48": 1213900288.0, + "49": 1213900288.0, + "50": 1213900288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.31352, + "2": 0.50754, + "3": 0.44486, + "4": 0.4668, + "5": 0.42238, + "6": 0.42115, + "7": 0.42604, + "8": 0.4217, + "9": 0.42265, + "10": 0.41522, + "11": 0.41976, + "12": 0.41287, + "13": 0.42113, + "14": 0.41948, + "15": 0.4211, + "16": 0.41519, + "17": 0.42043, + "18": 0.415, + "19": 0.42142, + "20": 0.42878, + "21": 0.4145, + "22": 0.42054, + "23": 0.41581, + "24": 0.42934, + "25": 0.43897, + "26": 0.42648, + "27": 0.42242, + "28": 0.42576, + "29": 0.42795, + "30": 0.42485, + "31": 0.43439, + "32": 0.42257, + "33": 0.41924, + "34": 0.43519, + "35": 0.43865, + "36": 0.42518, + "37": 0.42435, + "38": 0.42597, + "39": 0.42134, + "40": 0.42937, + "41": 0.42822, + "42": 0.42413, + "43": 0.44197, + "44": 0.42413, + "45": 0.42687, + "46": 0.46081, + "47": 0.45208, + "48": 0.43527, + "49": 0.44658, + "50": 0.41965 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..a0f445c56dc --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79288, + "2": 10.81339, + "3": 10.8144, + "4": 10.77657, + "5": 10.828, + "6": 10.84293, + "7": 10.81053, + "8": 10.80366, + "9": 10.81505, + "10": 10.76831, + "11": 10.86961, + "12": 10.83911, + "13": 10.85295, + "14": 10.86545, + "15": 10.79073, + "16": 10.78351, + "17": 10.7488, + "18": 10.79251, + "19": 10.78822, + "20": 10.7066, + "21": 10.68957, + "22": 10.53861, + "23": 10.70542, + "24": 10.59106, + "25": 10.54061, + "26": 10.59556, + "27": 10.61836, + "28": 10.59188, + "29": 10.6008, + "30": 10.39485, + "31": 10.12988, + "32": 10.49622, + "33": 10.48801, + "34": 10.24185, + "35": 10.30488, + "36": 10.25446, + "37": 10.38879, + "38": 10.24767, + "39": 10.43653, + "40": 10.13079, + "41": 10.18439, + "42": 10.25364, + "43": 9.89225, + "44": 10.0224, + "45": 9.90236, + "46": 9.88337, + "47": 10.1948, + "48": 9.91124, + "49": 9.59882, + "50": 9.97938 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5601.0, + "2": 5974.0, + "3": 5786.0, + "4": 5760.0, + "5": 6601.0, + "6": 6753.0, + "7": 6231.0, + "8": 5822.0, + "9": 6446.0, + "10": 5254.0, + "11": 6740.0, + "12": 6313.0, + "13": 6672.0, + "14": 6909.0, + "15": 6250.0, + "16": 6391.0, + "17": 6290.0, + "18": 6086.0, + "19": 6278.0, + "20": 5969.0, + "21": 6461.0, + "22": 5583.0, + "23": 6602.0, + "24": 5982.0, + "25": 5816.0, + "26": 6162.0, + "27": 6378.0, + "28": 6931.0, + "29": 7197.0, + "30": 6181.0, + "31": 5568.0, + "32": 6876.0, + "33": 6980.0, + "34": 6144.0, + "35": 6751.0, + "36": 6501.0, + "37": 7367.0, + "38": 7095.0, + "39": 7558.0, + "40": 6831.0, + "41": 6929.0, + "42": 7131.0, + "43": 6817.0, + "44": 6736.0, + "45": 6881.0, + "46": 7006.0, + "47": 7622.0, + "48": 7384.0, + "49": 7363.0, + "50": 7684.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 458211840.0, + "2": 458213376.0, + "3": 458214400.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458215424.0, + "7": 458212864.0, + "8": 458212864.0, + "9": 458214912.0, + "10": 458214912.0, + "11": 458214912.0, + "12": 458213888.0, + "13": 458214912.0, + "14": 458213376.0, + "15": 458215424.0, + "16": 458214400.0, + "17": 458214400.0, + "18": 458215424.0, + "19": 458209792.0, + "20": 458212864.0, + "21": 458211840.0, + "22": 458219520.0, + "23": 458213888.0, + "24": 458214912.0, + "25": 458215424.0, + "26": 458213376.0, + "27": 458213888.0, + "28": 458213888.0, + "29": 458212864.0, + "30": 458211840.0, + "31": 458218496.0, + "32": 458214912.0, + "33": 458212352.0, + "34": 458214400.0, + "35": 458214400.0, + "36": 458215424.0, + "37": 458213888.0, + "38": 458213888.0, + "39": 458213888.0, + "40": 458214912.0, + "41": 458216448.0, + "42": 458213888.0, + "43": 458217472.0, + "44": 458212864.0, + "45": 458213888.0, + "46": 458216960.0, + "47": 458214400.0, + "48": 458212352.0, + "49": 458215424.0, + "50": 458214912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1016394240.0, + "2": 1180904960.0, + "3": 1180904960.0, + "4": 1180904960.0, + "5": 1180904960.0, + "6": 1180904960.0, + "7": 1180904960.0, + "8": 1180904960.0, + "9": 1180934144.0, + "10": 1180934144.0, + "11": 1180934144.0, + "12": 1180934144.0, + "13": 1180934144.0, + "14": 1180934144.0, + "15": 1180990976.0, + "16": 1180990976.0, + "17": 1180990976.0, + "18": 1180990976.0, + "19": 1180990976.0, + "20": 1180990976.0, + "21": 1180990976.0, + "22": 1180990976.0, + "23": 1180990976.0, + "24": 1180990976.0, + "25": 1181222912.0, + "26": 1181222912.0, + "27": 1181222912.0, + "28": 1181222912.0, + "29": 1181222912.0, + "30": 1181222912.0, + "31": 1181222912.0, + "32": 1181222912.0, + "33": 1181222912.0, + "34": 1181222912.0, + "35": 1181468160.0, + "36": 1181468160.0, + "37": 1181468160.0, + "38": 1181468160.0, + "39": 1181468160.0, + "40": 1181468160.0, + "41": 1181468160.0, + "42": 1181468160.0, + "43": 1181468160.0, + "44": 1183467008.0, + "45": 1183467008.0, + "46": 1183467008.0, + "47": 1183467008.0, + "48": 1183467008.0, + "49": 1183467008.0, + "50": 1183467008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.04132, + "2": 0.66987, + "3": 0.59594, + "4": 0.61167, + "5": 0.59747, + "6": 0.59554, + "7": 0.59774, + "8": 0.59108, + "9": 0.5993, + "10": 0.58738, + "11": 0.60339, + "12": 0.58716, + "13": 0.58921, + "14": 0.59746, + "15": 0.5794, + "16": 0.59504, + "17": 0.58538, + "18": 0.58652, + "19": 0.59212, + "20": 0.58939, + "21": 0.59669, + "22": 0.58476, + "23": 0.58776, + "24": 0.58842, + "25": 0.58684, + "26": 0.59629, + "27": 0.58034, + "28": 0.59676, + "29": 0.58449, + "30": 0.59286, + "31": 0.59012, + "32": 0.58016, + "33": 0.59804, + "34": 0.58394, + "35": 0.67758, + "36": 0.87613, + "37": 0.81369, + "38": 0.83448, + "39": 0.86288, + "40": 0.58264, + "41": 0.59313, + "42": 0.57727, + "43": 0.58849, + "44": 0.57983, + "45": 0.58518, + "46": 0.58778, + "47": 0.58381, + "48": 0.59237, + "49": 0.58055, + "50": 0.59541 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..582aec1d02a --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.79288, + "2": 10.81339, + "3": 10.8144, + "4": 10.77657, + "5": 10.828, + "6": 10.84293, + "7": 10.81053, + "8": 10.80366, + "9": 10.81505, + "10": 10.76831, + "11": 10.86961, + "12": 10.83911, + "13": 10.85295, + "14": 10.86545, + "15": 10.79073, + "16": 10.78351, + "17": 10.7488, + "18": 10.79251, + "19": 10.78822, + "20": 10.7066, + "21": 10.68957, + "22": 10.53861, + "23": 10.70542, + "24": 10.59106, + "25": 10.54061, + "26": 10.59556, + "27": 10.61836, + "28": 10.59188, + "29": 10.6008, + "30": 10.39485, + "31": 10.12988, + "32": 10.49622, + "33": 10.48801, + "34": 10.24185, + "35": 10.30488, + "36": 10.25446, + "37": 10.38879, + "38": 10.24767, + "39": 10.43653, + "40": 10.13079, + "41": 10.18439, + "42": 10.25364, + "43": 9.89225, + "44": 10.0224, + "45": 9.90236, + "46": 9.88337, + "47": 10.1948, + "48": 9.91124, + "49": 9.59882, + "50": 9.97938 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5601.0, + "2": 5974.0, + "3": 5786.0, + "4": 5760.0, + "5": 6601.0, + "6": 6753.0, + "7": 6231.0, + "8": 5822.0, + "9": 6446.0, + "10": 5254.0, + "11": 6740.0, + "12": 6313.0, + "13": 6672.0, + "14": 6909.0, + "15": 6250.0, + "16": 6391.0, + "17": 6290.0, + "18": 6086.0, + "19": 6278.0, + "20": 5969.0, + "21": 6461.0, + "22": 5583.0, + "23": 6602.0, + "24": 5982.0, + "25": 5816.0, + "26": 6162.0, + "27": 6378.0, + "28": 6931.0, + "29": 7197.0, + "30": 6181.0, + "31": 5568.0, + "32": 6876.0, + "33": 6980.0, + "34": 6144.0, + "35": 6751.0, + "36": 6501.0, + "37": 7367.0, + "38": 7095.0, + "39": 7558.0, + "40": 6831.0, + "41": 6929.0, + "42": 7131.0, + "43": 6817.0, + "44": 6736.0, + "45": 6881.0, + "46": 7006.0, + "47": 7622.0, + "48": 7384.0, + "49": 7363.0, + "50": 7684.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 458211840.0, + "2": 458213376.0, + "3": 458214400.0, + "4": 458211840.0, + "5": 458212864.0, + "6": 458215424.0, + "7": 458212864.0, + "8": 458212864.0, + "9": 458214912.0, + "10": 458214912.0, + "11": 458214912.0, + "12": 458213888.0, + "13": 458214912.0, + "14": 458213376.0, + "15": 458215424.0, + "16": 458214400.0, + "17": 458214400.0, + "18": 458215424.0, + "19": 458209792.0, + "20": 458212864.0, + "21": 458211840.0, + "22": 458219520.0, + "23": 458213888.0, + "24": 458214912.0, + "25": 458215424.0, + "26": 458213376.0, + "27": 458213888.0, + "28": 458213888.0, + "29": 458212864.0, + "30": 458211840.0, + "31": 458218496.0, + "32": 458214912.0, + "33": 458212352.0, + "34": 458214400.0, + "35": 458214400.0, + "36": 458215424.0, + "37": 458213888.0, + "38": 458213888.0, + "39": 458213888.0, + "40": 458214912.0, + "41": 458216448.0, + "42": 458213888.0, + "43": 458217472.0, + "44": 458212864.0, + "45": 458213888.0, + "46": 458216960.0, + "47": 458214400.0, + "48": 458212352.0, + "49": 458215424.0, + "50": 458214912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1016394240.0, + "2": 1180904960.0, + "3": 1180904960.0, + "4": 1180904960.0, + "5": 1180904960.0, + "6": 1180904960.0, + "7": 1180904960.0, + "8": 1180904960.0, + "9": 1180934144.0, + "10": 1180934144.0, + "11": 1180934144.0, + "12": 1180934144.0, + "13": 1180934144.0, + "14": 1180934144.0, + "15": 1180990976.0, + "16": 1180990976.0, + "17": 1180990976.0, + "18": 1180990976.0, + "19": 1180990976.0, + "20": 1180990976.0, + "21": 1180990976.0, + "22": 1180990976.0, + "23": 1180990976.0, + "24": 1180990976.0, + "25": 1181222912.0, + "26": 1181222912.0, + "27": 1181222912.0, + "28": 1181222912.0, + "29": 1181222912.0, + "30": 1181222912.0, + "31": 1181222912.0, + "32": 1181222912.0, + "33": 1181222912.0, + "34": 1181222912.0, + "35": 1181468160.0, + "36": 1181468160.0, + "37": 1181468160.0, + "38": 1181468160.0, + "39": 1181468160.0, + "40": 1181468160.0, + "41": 1181468160.0, + "42": 1181468160.0, + "43": 1181468160.0, + "44": 1183467008.0, + "45": 1183467008.0, + "46": 1183467008.0, + "47": 1183467008.0, + "48": 1183467008.0, + "49": 1183467008.0, + "50": 1183467008.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.00855, + "2": 0.70527, + "3": 0.59745, + "4": 0.60744, + "5": 0.61261, + "6": 0.61644, + "7": 0.60659, + "8": 0.59978, + "9": 0.59747, + "10": 0.59353, + "11": 0.59787, + "12": 0.59073, + "13": 0.58796, + "14": 0.5969, + "15": 0.59327, + "16": 0.59709, + "17": 0.58809, + "18": 0.59153, + "19": 0.59156, + "20": 0.58419, + "21": 0.59403, + "22": 0.58324, + "23": 0.59332, + "24": 0.59867, + "25": 0.58715, + "26": 0.59642, + "27": 0.58832, + "28": 0.59214, + "29": 0.58522, + "30": 0.58573, + "31": 0.59427, + "32": 0.58249, + "33": 0.59123, + "34": 0.582, + "35": 0.59565, + "36": 0.59193, + "37": 0.58268, + "38": 0.59363, + "39": 0.58071, + "40": 0.58884, + "41": 0.58702, + "42": 0.58338, + "43": 0.58987, + "44": 0.58365, + "45": 0.59495, + "46": 0.58622, + "47": 0.58253, + "48": 0.59065, + "49": 0.58385, + "50": 0.59154 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json index 6cc67512418..daecd2a50e1 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81692, + "2": 10.82534, + "3": 10.82401, + "4": 10.79801, "5": 10.8415, + "6": 10.85912, + "7": 10.81927, + "8": 10.81789, + "9": 10.83554, "10": 10.78266, + "11": 10.85455, + "12": 10.84582, + "13": 10.84996, + "14": 10.87821, "15": 10.80684, + "16": 10.80662, + "17": 10.76305, + "18": 10.80188, + "19": 10.79303, "20": 10.73474, + "21": 10.71067, + "22": 10.57636, + "23": 10.7196, + "24": 10.63305, "25": 10.56916, + "26": 10.62589, + "27": 10.64466, + "28": 10.60792, + "29": 10.61761, "30": 10.42214, + "31": 10.17719, + "32": 10.50701, + "33": 10.50561, + "34": 10.27485, "35": 10.3276, + "36": 10.29275, + "37": 10.40262, + "38": 10.25679, + "39": 10.43615, "40": 10.16589, + "41": 10.20032, + "42": 10.27424, + "43": 9.93044, + "44": 10.04415, "45": 9.92936, + "46": 9.89984, + "47": 10.18573, + "48": 9.93082, + "49": 9.6257, "50": 9.98437 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 12899.0, + "2": 14592.0, + "3": 14243.0, + "4": 13886.0, "5": 15732.0, + "6": 16250.0, + "7": 15453.0, + "8": 13386.0, + "9": 15159.0, "10": 12804.0, + "11": 16441.0, + "12": 14951.0, + "13": 16151.0, + "14": 16330.0, "15": 15144.0, + "16": 15588.0, + "17": 15315.0, + "18": 14902.0, + "19": 15436.0, "20": 13814.0, + "21": 13977.0, + "22": 12814.0, + "23": 16615.0, + "24": 13785.0, "25": 13451.0, + "26": 14681.0, + "27": 15288.0, + "28": 16290.0, + "29": 16880.0, "30": 14583.0, + "31": 13272.0, + "32": 15972.0, + "33": 16904.0, + "34": 14406.0, "35": 14981.0, + "36": 15576.0, + "37": 17584.0, + "38": 16136.0, + "39": 17650.0, "40": 16506.0, + "41": 16391.0, + "42": 17008.0, + "43": 15459.0, + "44": 15097.0, "45": 16136.0, + "46": 16845.0, + "47": 19101.0, + "48": 16405.0, + "49": 16558.0, "50": 18439.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 659394560.0, + "2": 659346944.0, + "3": 659401728.0, + "4": 659351040.0, "5": 659623424.0, + "6": 659348480.0, + "7": 659508736.0, + "8": 659353088.0, + "9": 659383296.0, "10": 659347456.0, + "11": 659350016.0, + "12": 659437056.0, + "13": 659356160.0, + "14": 659702272.0, "15": 659658240.0, + "16": 659450880.0, + "17": 659438080.0, + "18": 659384320.0, + "19": 659492352.0, "20": 659372544.0, + "21": 659350016.0, + "22": 659347456.0, + "23": 659348992.0, + "24": 659430400.0, "25": 659347968.0, + "26": 659378176.0, + "27": 659353088.0, + "28": 659346944.0, + "29": 659440640.0, "30": 659732480.0, + "31": 659361792.0, + "32": 659345920.0, + "33": 659473920.0, + "34": 660008448.0, "35": 659819520.0, + "36": 659363840.0, + "37": 659418624.0, + "38": 659351040.0, + "39": 659449344.0, "40": 659586560.0, + "41": 659387392.0, + "42": 659476480.0, + "43": 659567104.0, + "44": 659344384.0, "45": 659346944.0, + "46": 659466752.0, + "47": 659345408.0, + "48": 659835392.0, + "49": 659494400.0, "50": 659346432.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1853294080.0, + "2": 2083995136.0, + "3": 2084402688.0, + "4": 2084433408.0, "5": 2084433408.0, + "6": 2084433408.0, + "7": 2085503488.0, + "8": 2085503488.0, + "9": 2085503488.0, "10": 2085503488.0, + "11": 2085503488.0, + "12": 2085503488.0, + "13": 2085503488.0, + "14": 2085503488.0, "15": 2085503488.0, + "16": 2085503488.0, + "17": 2085503488.0, + "18": 2085503488.0, + "19": 2085503488.0, "20": 2085503488.0, + "21": 2085503488.0, + "22": 2085503488.0, + "23": 2085503488.0, + "24": 2085503488.0, "25": 2085503488.0, + "26": 2085503488.0, + "27": 2085503488.0, + "28": 2085503488.0, + "29": 2085503488.0, "30": 2085503488.0, + "31": 2085503488.0, + "32": 2085503488.0, + "33": 2085503488.0, + "34": 2085503488.0, "35": 2085503488.0, + "36": 2085503488.0, + "37": 2085503488.0, + "38": 2085503488.0, + "39": 2085503488.0, "40": 2085503488.0, + "41": 2085503488.0, + "42": 2085503488.0, + "43": 2085503488.0, + "44": 2085503488.0, "45": 2085503488.0, + "46": 2085503488.0, + "47": 2085503488.0, + "48": 2085503488.0, + "49": 2085503488.0, "50": 2085503488.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 15.75879, - "5": 0.23618, - "10": 0.23433, - "15": 0.2393, - "20": 0.23468, - "25": 0.22203, - "30": 0.22111, - "35": 0.22708, - "40": 0.22283, - "45": 0.23253, - "50": 0.22333 + "1": 13.70163, + "2": 0.32995, + "3": 0.28329, + "4": 0.30327, + "5": 0.26887, + "6": 0.26248, + "7": 0.28317, + "8": 0.26472, + "9": 0.26858, + "10": 0.26512, + "11": 0.28434, + "12": 0.25515, + "13": 0.26048, + "14": 0.25624, + "15": 0.27581, + "16": 0.25102, + "17": 0.25664, + "18": 0.25657, + "19": 0.25806, + "20": 0.2591, + "21": 0.25054, + "22": 0.26613, + "23": 0.2877, + "24": 0.2503, + "25": 0.25227, + "26": 0.26224, + "27": 0.25269, + "28": 0.26737, + "29": 0.25139, + "30": 0.25065, + "31": 0.30552, + "32": 0.25136, + "33": 0.2573, + "34": 0.26376, + "35": 0.25668, + "36": 0.25566, + "37": 0.25143, + "38": 0.2666, + "39": 0.25121, + "40": 0.25249, + "41": 0.25912, + "42": 0.25442, + "43": 0.2721, + "44": 0.25368, + "45": 0.26494, + "46": 0.27206, + "47": 0.25676, + "48": 0.27981, + "49": 0.31376, + "50": 0.26619 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..075265941da --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82004, + "2": 10.8392, + "3": 10.81124, + "4": 10.81983, + "5": 10.84794, + "6": 10.8608, + "7": 10.84085, + "8": 10.84432, + "9": 10.8504, + "10": 10.79461, + "11": 10.85658, + "12": 10.84848, + "13": 10.86929, + "14": 10.8667, + "15": 10.82911, + "16": 10.81111, + "17": 10.79027, + "18": 10.80981, + "19": 10.81143, + "20": 10.73175, + "21": 10.71285, + "22": 10.58199, + "23": 10.72, + "24": 10.61704, + "25": 10.57964, + "26": 10.63372, + "27": 10.6365, + "28": 10.60641, + "29": 10.61561, + "30": 10.40859, + "31": 10.17068, + "32": 10.49958, + "33": 10.4963, + "34": 10.25574, + "35": 10.31503, + "36": 10.28536, + "37": 10.38742, + "38": 10.24676, + "39": 10.44249, + "40": 10.14367, + "41": 10.19116, + "42": 10.25654, + "43": 9.90671, + "44": 10.02653, + "45": 9.914, + "46": 9.89613, + "47": 10.18885, + "48": 9.92993, + "49": 9.61419, + "50": 9.97565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12826.0, + "2": 14613.0, + "3": 14549.0, + "4": 13422.0, + "5": 15951.0, + "6": 16055.0, + "7": 15208.0, + "8": 12944.0, + "9": 15110.0, + "10": 12611.0, + "11": 16586.0, + "12": 14954.0, + "13": 15925.0, + "14": 16182.0, + "15": 14834.0, + "16": 16023.0, + "17": 15486.0, + "18": 15116.0, + "19": 15584.0, + "20": 13675.0, + "21": 13873.0, + "22": 12917.0, + "23": 16766.0, + "24": 13924.0, + "25": 13129.0, + "26": 14794.0, + "27": 15169.0, + "28": 16393.0, + "29": 16719.0, + "30": 14652.0, + "31": 13126.0, + "32": 15987.0, + "33": 17372.0, + "34": 14206.0, + "35": 15183.0, + "36": 15837.0, + "37": 17507.0, + "38": 16617.0, + "39": 17712.0, + "40": 16971.0, + "41": 16795.0, + "42": 17304.0, + "43": 15578.0, + "44": 15564.0, + "45": 16188.0, + "46": 17443.0, + "47": 19238.0, + "48": 16575.0, + "49": 16273.0, + "50": 18998.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 629738496.0, + "2": 629741056.0, + "3": 629741056.0, + "4": 629737472.0, + "5": 629945856.0, + "6": 629820928.0, + "7": 629735936.0, + "8": 629741056.0, + "9": 629863424.0, + "10": 629771776.0, + "11": 629848064.0, + "12": 629767168.0, + "13": 629744128.0, + "14": 629783040.0, + "15": 629743616.0, + "16": 629762560.0, + "17": 629806592.0, + "18": 629742592.0, + "19": 629779456.0, + "20": 629873664.0, + "21": 629740032.0, + "22": 629789696.0, + "23": 629762560.0, + "24": 630001664.0, + "25": 629747712.0, + "26": 629774848.0, + "27": 629774848.0, + "28": 629755392.0, + "29": 629753856.0, + "30": 629757440.0, + "31": 629736448.0, + "32": 629881344.0, + "33": 629818880.0, + "34": 629858304.0, + "35": 629787136.0, + "36": 630003712.0, + "37": 629769216.0, + "38": 629809664.0, + "39": 629830144.0, + "40": 629740544.0, + "41": 629737984.0, + "42": 630415360.0, + "43": 629748224.0, + "44": 629811712.0, + "45": 629760000.0, + "46": 629824000.0, + "47": 629742080.0, + "48": 629881344.0, + "49": 630102528.0, + "50": 629818880.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1822359552.0, + "2": 2052654592.0, + "3": 2053963264.0, + "4": 2053963264.0, + "5": 2053963264.0, + "6": 2053963264.0, + "7": 2054027776.0, + "8": 2054027776.0, + "9": 2054027776.0, + "10": 2054027776.0, + "11": 2054060032.0, + "12": 2054060032.0, + "13": 2054418944.0, + "14": 2054418944.0, + "15": 2054439936.0, + "16": 2054439936.0, + "17": 2054439936.0, + "18": 2054439936.0, + "19": 2054439936.0, + "20": 2054439936.0, + "21": 2054439936.0, + "22": 2054439936.0, + "23": 2054439936.0, + "24": 2054439936.0, + "25": 2054439936.0, + "26": 2054439936.0, + "27": 2054439936.0, + "28": 2054439936.0, + "29": 2054439936.0, + "30": 2054439936.0, + "31": 2054439936.0, + "32": 2054439936.0, + "33": 2054439936.0, + "34": 2054439936.0, + "35": 2054439936.0, + "36": 2054439936.0, + "37": 2054439936.0, + "38": 2054439936.0, + "39": 2054439936.0, + "40": 2054439936.0, + "41": 2054439936.0, + "42": 2054439936.0, + "43": 2054439936.0, + "44": 2054439936.0, + "45": 2054439936.0, + "46": 2054439936.0, + "47": 2054769152.0, + "48": 2054769152.0, + "49": 2054769152.0, + "50": 2054769152.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7.01599, + "2": 0.45355, + "3": 0.36565, + "4": 0.36091, + "5": 0.35921, + "6": 0.35888, + "7": 0.35757, + "8": 0.35792, + "9": 0.35736, + "10": 0.3584, + "11": 0.359, + "12": 0.35941, + "13": 0.35718, + "14": 0.35719, + "15": 0.35705, + "16": 0.35632, + "17": 0.3593, + "18": 0.35903, + "19": 0.35833, + "20": 0.35817, + "21": 0.36067, + "22": 0.36054, + "23": 0.35773, + "24": 0.35639, + "25": 0.35602, + "26": 0.35542, + "27": 0.35615, + "28": 0.35911, + "29": 0.35797, + "30": 0.35947, + "31": 0.358, + "32": 0.35582, + "33": 0.35562, + "34": 0.35699, + "35": 0.35618, + "36": 0.35545, + "37": 0.35505, + "38": 0.35456, + "39": 0.35537, + "40": 0.3546, + "41": 0.35684, + "42": 0.35798, + "43": 0.35335, + "44": 0.3508, + "45": 0.35489, + "46": 0.35218, + "47": 0.35103, + "48": 0.3519, + "49": 0.35301, + "50": 0.34945 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..cd548b7f7bb --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82004, + "2": 10.8392, + "3": 10.81124, + "4": 10.81983, + "5": 10.84794, + "6": 10.8608, + "7": 10.84085, + "8": 10.84432, + "9": 10.8504, + "10": 10.79461, + "11": 10.85658, + "12": 10.84848, + "13": 10.86929, + "14": 10.8667, + "15": 10.82911, + "16": 10.81111, + "17": 10.79027, + "18": 10.80981, + "19": 10.81143, + "20": 10.73175, + "21": 10.71285, + "22": 10.58199, + "23": 10.72, + "24": 10.61704, + "25": 10.57964, + "26": 10.63372, + "27": 10.6365, + "28": 10.60641, + "29": 10.61561, + "30": 10.40859, + "31": 10.17068, + "32": 10.49958, + "33": 10.4963, + "34": 10.25574, + "35": 10.31503, + "36": 10.28536, + "37": 10.38742, + "38": 10.24676, + "39": 10.44249, + "40": 10.14367, + "41": 10.19116, + "42": 10.25654, + "43": 9.90671, + "44": 10.02653, + "45": 9.914, + "46": 9.89613, + "47": 10.18885, + "48": 9.92993, + "49": 9.61419, + "50": 9.97565 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12826.0, + "2": 14613.0, + "3": 14549.0, + "4": 13422.0, + "5": 15951.0, + "6": 16055.0, + "7": 15208.0, + "8": 12944.0, + "9": 15110.0, + "10": 12611.0, + "11": 16586.0, + "12": 14954.0, + "13": 15925.0, + "14": 16182.0, + "15": 14834.0, + "16": 16023.0, + "17": 15486.0, + "18": 15116.0, + "19": 15584.0, + "20": 13675.0, + "21": 13873.0, + "22": 12917.0, + "23": 16766.0, + "24": 13924.0, + "25": 13129.0, + "26": 14794.0, + "27": 15169.0, + "28": 16393.0, + "29": 16719.0, + "30": 14652.0, + "31": 13126.0, + "32": 15987.0, + "33": 17372.0, + "34": 14206.0, + "35": 15183.0, + "36": 15837.0, + "37": 17507.0, + "38": 16617.0, + "39": 17712.0, + "40": 16971.0, + "41": 16795.0, + "42": 17304.0, + "43": 15578.0, + "44": 15564.0, + "45": 16188.0, + "46": 17443.0, + "47": 19238.0, + "48": 16575.0, + "49": 16273.0, + "50": 18998.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 629738496.0, + "2": 629741056.0, + "3": 629741056.0, + "4": 629737472.0, + "5": 629945856.0, + "6": 629820928.0, + "7": 629735936.0, + "8": 629741056.0, + "9": 629863424.0, + "10": 629771776.0, + "11": 629848064.0, + "12": 629767168.0, + "13": 629744128.0, + "14": 629783040.0, + "15": 629743616.0, + "16": 629762560.0, + "17": 629806592.0, + "18": 629742592.0, + "19": 629779456.0, + "20": 629873664.0, + "21": 629740032.0, + "22": 629789696.0, + "23": 629762560.0, + "24": 630001664.0, + "25": 629747712.0, + "26": 629774848.0, + "27": 629774848.0, + "28": 629755392.0, + "29": 629753856.0, + "30": 629757440.0, + "31": 629736448.0, + "32": 629881344.0, + "33": 629818880.0, + "34": 629858304.0, + "35": 629787136.0, + "36": 630003712.0, + "37": 629769216.0, + "38": 629809664.0, + "39": 629830144.0, + "40": 629740544.0, + "41": 629737984.0, + "42": 630415360.0, + "43": 629748224.0, + "44": 629811712.0, + "45": 629760000.0, + "46": 629824000.0, + "47": 629742080.0, + "48": 629881344.0, + "49": 630102528.0, + "50": 629818880.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1822359552.0, + "2": 2052654592.0, + "3": 2053963264.0, + "4": 2053963264.0, + "5": 2053963264.0, + "6": 2053963264.0, + "7": 2054027776.0, + "8": 2054027776.0, + "9": 2054027776.0, + "10": 2054027776.0, + "11": 2054060032.0, + "12": 2054060032.0, + "13": 2054418944.0, + "14": 2054418944.0, + "15": 2054439936.0, + "16": 2054439936.0, + "17": 2054439936.0, + "18": 2054439936.0, + "19": 2054439936.0, + "20": 2054439936.0, + "21": 2054439936.0, + "22": 2054439936.0, + "23": 2054439936.0, + "24": 2054439936.0, + "25": 2054439936.0, + "26": 2054439936.0, + "27": 2054439936.0, + "28": 2054439936.0, + "29": 2054439936.0, + "30": 2054439936.0, + "31": 2054439936.0, + "32": 2054439936.0, + "33": 2054439936.0, + "34": 2054439936.0, + "35": 2054439936.0, + "36": 2054439936.0, + "37": 2054439936.0, + "38": 2054439936.0, + "39": 2054439936.0, + "40": 2054439936.0, + "41": 2054439936.0, + "42": 2054439936.0, + "43": 2054439936.0, + "44": 2054439936.0, + "45": 2054439936.0, + "46": 2054439936.0, + "47": 2054769152.0, + "48": 2054769152.0, + "49": 2054769152.0, + "50": 2054769152.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7.28409, + "2": 0.41637, + "3": 0.36538, + "4": 0.36475, + "5": 0.36291, + "6": 0.36269, + "7": 0.3621, + "8": 0.36618, + "9": 0.41513, + "10": 0.35991, + "11": 0.35833, + "12": 0.35938, + "13": 0.35969, + "14": 0.35865, + "15": 0.35898, + "16": 0.35973, + "17": 0.35887, + "18": 0.3593, + "19": 0.35818, + "20": 0.35872, + "21": 0.36111, + "22": 0.36267, + "23": 0.36505, + "24": 0.36152, + "25": 0.35943, + "26": 0.36139, + "27": 0.35871, + "28": 0.35976, + "29": 0.36014, + "30": 0.36074, + "31": 0.36299, + "32": 0.35944, + "33": 0.36216, + "34": 0.362, + "35": 0.36095, + "36": 0.36098, + "37": 0.3688, + "38": 0.36204, + "39": 0.35854, + "40": 0.3619, + "41": 0.35612, + "42": 0.35586, + "43": 0.35734, + "44": 0.35693, + "45": 0.35773, + "46": 0.35625, + "47": 0.35614, + "48": 0.35584, + "49": 0.35496, + "50": 0.35545 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..2906cfee84e --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81692, + "2": 10.82534, + "3": 10.82401, + "4": 10.79801, + "5": 10.8415, + "6": 10.85912, + "7": 10.81927, + "8": 10.81789, + "9": 10.83554, + "10": 10.78266, + "11": 10.85455, + "12": 10.84582, + "13": 10.84996, + "14": 10.87821, + "15": 10.80684, + "16": 10.80662, + "17": 10.76305, + "18": 10.80188, + "19": 10.79303, + "20": 10.73474, + "21": 10.71067, + "22": 10.57636, + "23": 10.7196, + "24": 10.63305, + "25": 10.56916, + "26": 10.62589, + "27": 10.64466, + "28": 10.60792, + "29": 10.61761, + "30": 10.42214, + "31": 10.17719, + "32": 10.50701, + "33": 10.50561, + "34": 10.27485, + "35": 10.3276, + "36": 10.29275, + "37": 10.40262, + "38": 10.25679, + "39": 10.43615, + "40": 10.16589, + "41": 10.20032, + "42": 10.27424, + "43": 9.93044, + "44": 10.04415, + "45": 9.92936, + "46": 9.89984, + "47": 10.18573, + "48": 9.93082, + "49": 9.6257, + "50": 9.98437 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12899.0, + "2": 14592.0, + "3": 14243.0, + "4": 13886.0, + "5": 15732.0, + "6": 16250.0, + "7": 15453.0, + "8": 13386.0, + "9": 15159.0, + "10": 12804.0, + "11": 16441.0, + "12": 14951.0, + "13": 16151.0, + "14": 16330.0, + "15": 15144.0, + "16": 15588.0, + "17": 15315.0, + "18": 14902.0, + "19": 15436.0, + "20": 13814.0, + "21": 13977.0, + "22": 12814.0, + "23": 16615.0, + "24": 13785.0, + "25": 13451.0, + "26": 14681.0, + "27": 15288.0, + "28": 16290.0, + "29": 16880.0, + "30": 14583.0, + "31": 13272.0, + "32": 15972.0, + "33": 16904.0, + "34": 14406.0, + "35": 14981.0, + "36": 15576.0, + "37": 17584.0, + "38": 16136.0, + "39": 17650.0, + "40": 16506.0, + "41": 16391.0, + "42": 17008.0, + "43": 15459.0, + "44": 15097.0, + "45": 16136.0, + "46": 16845.0, + "47": 19101.0, + "48": 16405.0, + "49": 16558.0, + "50": 18439.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 659394560.0, + "2": 659346944.0, + "3": 659401728.0, + "4": 659351040.0, + "5": 659623424.0, + "6": 659348480.0, + "7": 659508736.0, + "8": 659353088.0, + "9": 659383296.0, + "10": 659347456.0, + "11": 659350016.0, + "12": 659437056.0, + "13": 659356160.0, + "14": 659702272.0, + "15": 659658240.0, + "16": 659450880.0, + "17": 659438080.0, + "18": 659384320.0, + "19": 659492352.0, + "20": 659372544.0, + "21": 659350016.0, + "22": 659347456.0, + "23": 659348992.0, + "24": 659430400.0, + "25": 659347968.0, + "26": 659378176.0, + "27": 659353088.0, + "28": 659346944.0, + "29": 659440640.0, + "30": 659732480.0, + "31": 659361792.0, + "32": 659345920.0, + "33": 659473920.0, + "34": 660008448.0, + "35": 659819520.0, + "36": 659363840.0, + "37": 659418624.0, + "38": 659351040.0, + "39": 659449344.0, + "40": 659586560.0, + "41": 659387392.0, + "42": 659476480.0, + "43": 659567104.0, + "44": 659344384.0, + "45": 659346944.0, + "46": 659466752.0, + "47": 659345408.0, + "48": 659835392.0, + "49": 659494400.0, + "50": 659346432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1853294080.0, + "2": 2083995136.0, + "3": 2084402688.0, + "4": 2084433408.0, + "5": 2084433408.0, + "6": 2084433408.0, + "7": 2085503488.0, + "8": 2085503488.0, + "9": 2085503488.0, + "10": 2085503488.0, + "11": 2085503488.0, + "12": 2085503488.0, + "13": 2085503488.0, + "14": 2085503488.0, + "15": 2085503488.0, + "16": 2085503488.0, + "17": 2085503488.0, + "18": 2085503488.0, + "19": 2085503488.0, + "20": 2085503488.0, + "21": 2085503488.0, + "22": 2085503488.0, + "23": 2085503488.0, + "24": 2085503488.0, + "25": 2085503488.0, + "26": 2085503488.0, + "27": 2085503488.0, + "28": 2085503488.0, + "29": 2085503488.0, + "30": 2085503488.0, + "31": 2085503488.0, + "32": 2085503488.0, + "33": 2085503488.0, + "34": 2085503488.0, + "35": 2085503488.0, + "36": 2085503488.0, + "37": 2085503488.0, + "38": 2085503488.0, + "39": 2085503488.0, + "40": 2085503488.0, + "41": 2085503488.0, + "42": 2085503488.0, + "43": 2085503488.0, + "44": 2085503488.0, + "45": 2085503488.0, + "46": 2085503488.0, + "47": 2085503488.0, + "48": 2085503488.0, + "49": 2085503488.0, + "50": 2085503488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.33188, + "2": 0.39945, + "3": 0.26382, + "4": 0.2701, + "5": 0.24001, + "6": 0.23463, + "7": 0.24587, + "8": 0.23051, + "9": 0.23491, + "10": 0.23256, + "11": 0.2548, + "12": 0.23554, + "13": 0.24407, + "14": 0.23603, + "15": 0.24759, + "16": 0.23243, + "17": 0.23641, + "18": 0.23374, + "19": 0.22953, + "20": 0.23517, + "21": 0.22989, + "22": 0.2361, + "23": 0.24153, + "24": 0.23019, + "25": 0.22803, + "26": 0.23226, + "27": 0.22872, + "28": 0.23463, + "29": 0.23254, + "30": 0.22883, + "31": 0.27127, + "32": 0.22829, + "33": 0.24048, + "34": 0.26445, + "35": 0.2532, + "36": 0.24919, + "37": 0.22702, + "38": 0.22443, + "39": 0.22286, + "40": 0.21951, + "41": 0.22887, + "42": 0.22125, + "43": 0.23026, + "44": 0.22208, + "45": 0.23148, + "46": 0.24241, + "47": 0.22735, + "48": 0.22857, + "49": 0.27512, + "50": 0.22154 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..eb013c007ca --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.81692, + "2": 10.82534, + "3": 10.82401, + "4": 10.79801, + "5": 10.8415, + "6": 10.85912, + "7": 10.81927, + "8": 10.81789, + "9": 10.83554, + "10": 10.78266, + "11": 10.85455, + "12": 10.84582, + "13": 10.84996, + "14": 10.87821, + "15": 10.80684, + "16": 10.80662, + "17": 10.76305, + "18": 10.80188, + "19": 10.79303, + "20": 10.73474, + "21": 10.71067, + "22": 10.57636, + "23": 10.7196, + "24": 10.63305, + "25": 10.56916, + "26": 10.62589, + "27": 10.64466, + "28": 10.60792, + "29": 10.61761, + "30": 10.42214, + "31": 10.17719, + "32": 10.50701, + "33": 10.50561, + "34": 10.27485, + "35": 10.3276, + "36": 10.29275, + "37": 10.40262, + "38": 10.25679, + "39": 10.43615, + "40": 10.16589, + "41": 10.20032, + "42": 10.27424, + "43": 9.93044, + "44": 10.04415, + "45": 9.92936, + "46": 9.89984, + "47": 10.18573, + "48": 9.93082, + "49": 9.6257, + "50": 9.98437 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12899.0, + "2": 14592.0, + "3": 14243.0, + "4": 13886.0, + "5": 15732.0, + "6": 16250.0, + "7": 15453.0, + "8": 13386.0, + "9": 15159.0, + "10": 12804.0, + "11": 16441.0, + "12": 14951.0, + "13": 16151.0, + "14": 16330.0, + "15": 15144.0, + "16": 15588.0, + "17": 15315.0, + "18": 14902.0, + "19": 15436.0, + "20": 13814.0, + "21": 13977.0, + "22": 12814.0, + "23": 16615.0, + "24": 13785.0, + "25": 13451.0, + "26": 14681.0, + "27": 15288.0, + "28": 16290.0, + "29": 16880.0, + "30": 14583.0, + "31": 13272.0, + "32": 15972.0, + "33": 16904.0, + "34": 14406.0, + "35": 14981.0, + "36": 15576.0, + "37": 17584.0, + "38": 16136.0, + "39": 17650.0, + "40": 16506.0, + "41": 16391.0, + "42": 17008.0, + "43": 15459.0, + "44": 15097.0, + "45": 16136.0, + "46": 16845.0, + "47": 19101.0, + "48": 16405.0, + "49": 16558.0, + "50": 18439.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 659394560.0, + "2": 659346944.0, + "3": 659401728.0, + "4": 659351040.0, + "5": 659623424.0, + "6": 659348480.0, + "7": 659508736.0, + "8": 659353088.0, + "9": 659383296.0, + "10": 659347456.0, + "11": 659350016.0, + "12": 659437056.0, + "13": 659356160.0, + "14": 659702272.0, + "15": 659658240.0, + "16": 659450880.0, + "17": 659438080.0, + "18": 659384320.0, + "19": 659492352.0, + "20": 659372544.0, + "21": 659350016.0, + "22": 659347456.0, + "23": 659348992.0, + "24": 659430400.0, + "25": 659347968.0, + "26": 659378176.0, + "27": 659353088.0, + "28": 659346944.0, + "29": 659440640.0, + "30": 659732480.0, + "31": 659361792.0, + "32": 659345920.0, + "33": 659473920.0, + "34": 660008448.0, + "35": 659819520.0, + "36": 659363840.0, + "37": 659418624.0, + "38": 659351040.0, + "39": 659449344.0, + "40": 659586560.0, + "41": 659387392.0, + "42": 659476480.0, + "43": 659567104.0, + "44": 659344384.0, + "45": 659346944.0, + "46": 659466752.0, + "47": 659345408.0, + "48": 659835392.0, + "49": 659494400.0, + "50": 659346432.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1853294080.0, + "2": 2083995136.0, + "3": 2084402688.0, + "4": 2084433408.0, + "5": 2084433408.0, + "6": 2084433408.0, + "7": 2085503488.0, + "8": 2085503488.0, + "9": 2085503488.0, + "10": 2085503488.0, + "11": 2085503488.0, + "12": 2085503488.0, + "13": 2085503488.0, + "14": 2085503488.0, + "15": 2085503488.0, + "16": 2085503488.0, + "17": 2085503488.0, + "18": 2085503488.0, + "19": 2085503488.0, + "20": 2085503488.0, + "21": 2085503488.0, + "22": 2085503488.0, + "23": 2085503488.0, + "24": 2085503488.0, + "25": 2085503488.0, + "26": 2085503488.0, + "27": 2085503488.0, + "28": 2085503488.0, + "29": 2085503488.0, + "30": 2085503488.0, + "31": 2085503488.0, + "32": 2085503488.0, + "33": 2085503488.0, + "34": 2085503488.0, + "35": 2085503488.0, + "36": 2085503488.0, + "37": 2085503488.0, + "38": 2085503488.0, + "39": 2085503488.0, + "40": 2085503488.0, + "41": 2085503488.0, + "42": 2085503488.0, + "43": 2085503488.0, + "44": 2085503488.0, + "45": 2085503488.0, + "46": 2085503488.0, + "47": 2085503488.0, + "48": 2085503488.0, + "49": 2085503488.0, + "50": 2085503488.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.92506, + "2": 0.34079, + "3": 0.28891, + "4": 0.30652, + "5": 0.27326, + "6": 0.26908, + "7": 0.28337, + "8": 0.26429, + "9": 0.27048, + "10": 0.26866, + "11": 0.28689, + "12": 0.25961, + "13": 0.26511, + "14": 0.26065, + "15": 0.27834, + "16": 0.26398, + "17": 0.26064, + "18": 0.26661, + "19": 0.26487, + "20": 0.27686, + "21": 0.26249, + "22": 0.2677, + "23": 0.26859, + "24": 0.26049, + "25": 0.26086, + "26": 0.26279, + "27": 0.25983, + "28": 0.26561, + "29": 0.26345, + "30": 0.26142, + "31": 0.30613, + "32": 0.26049, + "33": 0.26142, + "34": 0.27278, + "35": 0.25691, + "36": 0.26151, + "37": 0.25654, + "38": 0.25753, + "39": 0.2576, + "40": 0.25839, + "41": 0.27219, + "42": 0.25851, + "43": 0.2668, + "44": 0.26229, + "45": 0.27182, + "46": 0.27691, + "47": 0.26299, + "48": 0.27152, + "49": 0.31513, + "50": 0.25813 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..af91e248c50 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82416, + "2": 10.83928, + "3": 10.81612, + "4": 10.8212, + "5": 10.84149, + "6": 10.86581, + "7": 10.84393, + "8": 10.84532, + "9": 10.85565, + "10": 10.79041, + "11": 10.85899, + "12": 10.84824, + "13": 10.86636, + "14": 10.86561, + "15": 10.8302, + "16": 10.80989, + "17": 10.79387, + "18": 10.80839, + "19": 10.8082, + "20": 10.73076, + "21": 10.71085, + "22": 10.57952, + "23": 10.71929, + "24": 10.61457, + "25": 10.57969, + "26": 10.64041, + "27": 10.63805, + "28": 10.61227, + "29": 10.61246, + "30": 10.41029, + "31": 10.16791, + "32": 10.49732, + "33": 10.49177, + "34": 10.25296, + "35": 10.31774, + "36": 10.28708, + "37": 10.38564, + "38": 10.24733, + "39": 10.43639, + "40": 10.14481, + "41": 10.19445, + "42": 10.25646, + "43": 9.91204, + "44": 10.02501, + "45": 9.91307, + "46": 9.89277, + "47": 10.1916, + "48": 9.928, + "49": 9.60925, + "50": 9.97569 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12709.0, + "2": 14371.0, + "3": 14612.0, + "4": 13787.0, + "5": 15699.0, + "6": 16096.0, + "7": 15514.0, + "8": 13078.0, + "9": 15208.0, + "10": 12503.0, + "11": 16317.0, + "12": 15023.0, + "13": 16173.0, + "14": 16307.0, + "15": 14756.0, + "16": 15746.0, + "17": 15339.0, + "18": 15071.0, + "19": 15163.0, + "20": 13658.0, + "21": 13822.0, + "22": 12883.0, + "23": 16852.0, + "24": 13629.0, + "25": 13295.0, + "26": 15055.0, + "27": 15392.0, + "28": 16101.0, + "29": 16813.0, + "30": 14801.0, + "31": 12991.0, + "32": 16054.0, + "33": 17242.0, + "34": 14599.0, + "35": 15233.0, + "36": 15992.0, + "37": 17624.0, + "38": 16275.0, + "39": 17931.0, + "40": 16737.0, + "41": 16765.0, + "42": 17162.0, + "43": 15421.0, + "44": 15537.0, + "45": 16130.0, + "46": 17720.0, + "47": 19461.0, + "48": 16585.0, + "49": 16329.0, + "50": 19242.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 625133056.0, + "2": 625139200.0, + "3": 625138176.0, + "4": 625133568.0, + "5": 625138176.0, + "6": 625136640.0, + "7": 625130496.0, + "8": 625135616.0, + "9": 625136640.0, + "10": 625133568.0, + "11": 625137152.0, + "12": 625138176.0, + "13": 625138176.0, + "14": 625134592.0, + "15": 625135616.0, + "16": 625138176.0, + "17": 625130496.0, + "18": 625137664.0, + "19": 625137152.0, + "20": 625137664.0, + "21": 625137152.0, + "22": 625134080.0, + "23": 625131520.0, + "24": 625134080.0, + "25": 625134080.0, + "26": 625136128.0, + "27": 625138688.0, + "28": 625166848.0, + "29": 625137152.0, + "30": 625135616.0, + "31": 625131008.0, + "32": 625134592.0, + "33": 625137152.0, + "34": 625134080.0, + "35": 625134592.0, + "36": 625135616.0, + "37": 625137664.0, + "38": 625136128.0, + "39": 625135104.0, + "40": 625138176.0, + "41": 625134080.0, + "42": 625139712.0, + "43": 625133056.0, + "44": 625133056.0, + "45": 625135616.0, + "46": 625127936.0, + "47": 625136128.0, + "48": 625126912.0, + "49": 625131520.0, + "50": 625137664.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1818507264.0, + "2": 2049025536.0, + "3": 2049341440.0, + "4": 2049341440.0, + "5": 2049341440.0, + "6": 2049341440.0, + "7": 2049341440.0, + "8": 2049549312.0, + "9": 2049549312.0, + "10": 2049549312.0, + "11": 2050059264.0, + "12": 2050059264.0, + "13": 2050059264.0, + "14": 2050059264.0, + "15": 2050059264.0, + "16": 2050059264.0, + "17": 2050059264.0, + "18": 2050059264.0, + "19": 2050059264.0, + "20": 2050059264.0, + "21": 2050059264.0, + "22": 2050059264.0, + "23": 2050059264.0, + "24": 2050059264.0, + "25": 2050059264.0, + "26": 2050059264.0, + "27": 2050059264.0, + "28": 2050059264.0, + "29": 2050059264.0, + "30": 2050059264.0, + "31": 2050059264.0, + "32": 2050059264.0, + "33": 2050059264.0, + "34": 2050059264.0, + "35": 2050059264.0, + "36": 2050059264.0, + "37": 2050059264.0, + "38": 2050059264.0, + "39": 2050059264.0, + "40": 2050059264.0, + "41": 2050059264.0, + "42": 2050059264.0, + "43": 2050059264.0, + "44": 2050059264.0, + "45": 2050059264.0, + "46": 2050059264.0, + "47": 2050059264.0, + "48": 2050059264.0, + "49": 2050059264.0, + "50": 2050148352.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.44804, + "2": 0.4545, + "3": 0.40145, + "4": 0.39962, + "5": 0.40214, + "6": 0.40788, + "7": 0.40992, + "8": 0.40872, + "9": 0.40355, + "10": 0.40545, + "11": 0.41454, + "12": 0.39604, + "13": 0.40021, + "14": 0.39269, + "15": 0.38202, + "16": 0.40653, + "17": 0.39389, + "18": 0.40314, + "19": 0.39215, + "20": 0.38662, + "21": 0.39822, + "22": 0.39482, + "23": 0.39892, + "24": 0.39111, + "25": 0.43645, + "26": 0.44712, + "27": 0.43121, + "28": 0.42413, + "29": 0.43447, + "30": 0.44716, + "31": 0.39545, + "32": 0.40817, + "33": 0.43535, + "34": 0.44181, + "35": 0.41776, + "36": 0.44963, + "37": 0.41369, + "38": 0.35924, + "39": 0.35768, + "40": 0.36975, + "41": 0.35836, + "42": 0.35907, + "43": 0.36834, + "44": 0.35722, + "45": 0.35442, + "46": 0.36721, + "47": 0.35342, + "48": 0.368, + "49": 0.35736, + "50": 0.35455 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..31b44874771 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82416, + "2": 10.83928, + "3": 10.81612, + "4": 10.8212, + "5": 10.84149, + "6": 10.86581, + "7": 10.84393, + "8": 10.84532, + "9": 10.85565, + "10": 10.79041, + "11": 10.85899, + "12": 10.84824, + "13": 10.86636, + "14": 10.86561, + "15": 10.8302, + "16": 10.80989, + "17": 10.79387, + "18": 10.80839, + "19": 10.8082, + "20": 10.73076, + "21": 10.71085, + "22": 10.57952, + "23": 10.71929, + "24": 10.61457, + "25": 10.57969, + "26": 10.64041, + "27": 10.63805, + "28": 10.61227, + "29": 10.61246, + "30": 10.41029, + "31": 10.16791, + "32": 10.49732, + "33": 10.49177, + "34": 10.25296, + "35": 10.31774, + "36": 10.28708, + "37": 10.38564, + "38": 10.24733, + "39": 10.43639, + "40": 10.14481, + "41": 10.19445, + "42": 10.25646, + "43": 9.91204, + "44": 10.02501, + "45": 9.91307, + "46": 9.89277, + "47": 10.1916, + "48": 9.928, + "49": 9.60925, + "50": 9.97569 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 12709.0, + "2": 14371.0, + "3": 14612.0, + "4": 13787.0, + "5": 15699.0, + "6": 16096.0, + "7": 15514.0, + "8": 13078.0, + "9": 15208.0, + "10": 12503.0, + "11": 16317.0, + "12": 15023.0, + "13": 16173.0, + "14": 16307.0, + "15": 14756.0, + "16": 15746.0, + "17": 15339.0, + "18": 15071.0, + "19": 15163.0, + "20": 13658.0, + "21": 13822.0, + "22": 12883.0, + "23": 16852.0, + "24": 13629.0, + "25": 13295.0, + "26": 15055.0, + "27": 15392.0, + "28": 16101.0, + "29": 16813.0, + "30": 14801.0, + "31": 12991.0, + "32": 16054.0, + "33": 17242.0, + "34": 14599.0, + "35": 15233.0, + "36": 15992.0, + "37": 17624.0, + "38": 16275.0, + "39": 17931.0, + "40": 16737.0, + "41": 16765.0, + "42": 17162.0, + "43": 15421.0, + "44": 15537.0, + "45": 16130.0, + "46": 17720.0, + "47": 19461.0, + "48": 16585.0, + "49": 16329.0, + "50": 19242.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 625759744.0, + "2": 625139200.0, + "3": 625138176.0, + "4": 625133568.0, + "5": 625138176.0, + "6": 625136640.0, + "7": 625130496.0, + "8": 625135616.0, + "9": 625136640.0, + "10": 625133568.0, + "11": 625137152.0, + "12": 625138176.0, + "13": 625138176.0, + "14": 625134592.0, + "15": 625135616.0, + "16": 625138176.0, + "17": 625130496.0, + "18": 625137664.0, + "19": 625137152.0, + "20": 625137664.0, + "21": 625137152.0, + "22": 625134080.0, + "23": 625131520.0, + "24": 625134080.0, + "25": 625134080.0, + "26": 625136128.0, + "27": 625138688.0, + "28": 625166848.0, + "29": 625137152.0, + "30": 625135616.0, + "31": 625131008.0, + "32": 625134592.0, + "33": 625137152.0, + "34": 625134080.0, + "35": 625134592.0, + "36": 625135616.0, + "37": 625137664.0, + "38": 625136128.0, + "39": 625135104.0, + "40": 625138176.0, + "41": 625134080.0, + "42": 625139712.0, + "43": 625133056.0, + "44": 625133056.0, + "45": 625135616.0, + "46": 625127936.0, + "47": 625136128.0, + "48": 625126912.0, + "49": 625131520.0, + "50": 625137664.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1819058176.0, + "2": 2049025536.0, + "3": 2049507328.0, + "4": 2049507328.0, + "5": 2049507328.0, + "6": 2049507328.0, + "7": 2049507328.0, + "8": 2049549312.0, + "9": 2049549312.0, + "10": 2049549312.0, + "11": 2050408448.0, + "12": 2050408448.0, + "13": 2050408448.0, + "14": 2050408448.0, + "15": 2050408448.0, + "16": 2050408448.0, + "17": 2050408448.0, + "18": 2050408448.0, + "19": 2050408448.0, + "20": 2050408448.0, + "21": 2050408448.0, + "22": 2050408448.0, + "23": 2050408448.0, + "24": 2050408448.0, + "25": 2050408448.0, + "26": 2050408448.0, + "27": 2050408448.0, + "28": 2050408448.0, + "29": 2050408448.0, + "30": 2050408448.0, + "31": 2050408448.0, + "32": 2050408448.0, + "33": 2050408448.0, + "34": 2050408448.0, + "35": 2050408448.0, + "36": 2050408448.0, + "37": 2050408448.0, + "38": 2050408448.0, + "39": 2050408448.0, + "40": 2050408448.0, + "41": 2050408448.0, + "42": 2050408448.0, + "43": 2050408448.0, + "44": 2050408448.0, + "45": 2050408448.0, + "46": 2050408448.0, + "47": 2050408448.0, + "48": 2050408448.0, + "49": 2050408448.0, + "50": 2050408448.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22.77068, + "2": 0.46494, + "3": 0.3723, + "4": 0.36903, + "5": 0.37035, + "6": 0.36273, + "7": 0.36764, + "8": 0.36608, + "9": 0.36149, + "10": 0.37099, + "11": 0.36751, + "12": 0.36086, + "13": 0.37084, + "14": 0.36048, + "15": 0.36546, + "16": 0.36953, + "17": 0.36319, + "18": 0.36789, + "19": 0.36444, + "20": 0.3601, + "21": 0.37091, + "22": 0.36503, + "23": 0.3598, + "24": 0.36881, + "25": 0.36119, + "26": 0.36751, + "27": 0.36776, + "28": 0.35964, + "29": 0.36504, + "30": 0.36585, + "31": 0.36136, + "32": 0.37411, + "33": 0.36177, + "34": 0.36157, + "35": 0.36662, + "36": 0.35886, + "37": 0.36442, + "38": 0.36579, + "39": 0.35855, + "40": 0.36631, + "41": 0.36531, + "42": 0.35897, + "43": 0.37205, + "44": 0.36369, + "45": 0.3598, + "46": 0.3686, + "47": 0.36017, + "48": 0.36176, + "49": 0.36902, + "50": 0.35813 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0f2637a9511 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04748, + "2": 11.03561, + "3": 9.58774, + "4": 9.25819, + "5": 9.53583, + "6": 9.8804, + "7": 9.48247, + "8": 8.93575, + "9": 8.65813, + "10": 9.0567, + "11": 8.49445, + "12": 8.52444, + "13": 8.45239, + "14": 7.97323, + "15": 8.0476, + "16": 8.07971, + "17": 8.09081, + "18": 7.76437, + "19": 8.14892, + "20": 7.89868, + "21": 7.59371, + "22": 7.54743, + "23": 7.43222, + "24": 7.4302, + "25": 7.67579, + "26": 7.06929, + "27": 7.62041, + "28": 7.32495, + "29": 7.49042, + "30": 7.64391, + "31": 7.39435, + "32": 7.58789, + "33": 7.64037, + "34": 7.69778, + "35": 7.20998, + "36": 7.08538, + "37": 7.42584, + "38": 7.18804, + "39": 7.55054, + "40": 7.54446, + "41": 7.49287, + "42": 7.24937, + "43": 7.23587, + "44": 7.41595, + "45": 7.18755, + "46": 6.89949, + "47": 7.29966, + "48": 7.14134, + "49": 7.58963, + "50": 7.03602 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802612.0, + "2": 38543592.0, + "3": 38739528.0, + "4": 279937824.0, + "5": 259189728.0, + "6": 271446400.0, + "7": 604773504.0, + "8": 768892544.0, + "9": 645824128.0, + "10": 744257088.0, + "11": 718888576.0, + "12": 746732544.0, + "13": 871990976.0, + "14": 821645632.0, + "15": 724250816.0, + "16": 932241472.0, + "17": 648958912.0, + "18": 649120000.0, + "19": 925992960.0, + "20": 989207936.0, + "21": 819324096.0, + "22": 736955072.0, + "23": 910497792.0, + "24": 876716672.0, + "25": 843170688.0, + "26": 809573824.0, + "27": 854086912.0, + "28": 802857664.0, + "29": 805523328.0, + "30": 775645184.0, + "31": 771754624.0, + "32": 749733696.0, + "33": 718385216.0, + "34": 724771200.0, + "35": 737655104.0, + "36": 690419968.0, + "37": 673203456.0, + "38": 627239552.0, + "39": 614047168.0, + "40": 607288512.0, + "41": 582590592.0, + "42": 548211200.0, + "43": 532740640.0, + "44": 554239168.0, + "45": 514790528.0, + "46": 350258560.0, + "47": 472420128.0, + "48": 453788736.0, + "49": 440597216.0, + "50": 303063296.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6637267456.0, + "2": 6637269504.0, + "3": 6637269504.0, + "4": 6637269504.0, + "5": 6637269504.0, + "6": 6637269504.0, + "7": 6637269504.0, + "8": 6637269504.0, + "9": 6637269504.0, + "10": 6637269504.0, + "11": 6637269504.0, + "12": 6637269504.0, + "13": 6637269504.0, + "14": 6637269504.0, + "15": 6637269504.0, + "16": 6637269504.0, + "17": 6637269504.0, + "18": 6637269504.0, + "19": 6637269504.0, + "20": 6637269504.0, + "21": 6637269504.0, + "22": 6637269504.0, + "23": 6637269504.0, + "24": 6637269504.0, + "25": 6637269504.0, + "26": 6637269504.0, + "27": 6637269504.0, + "28": 6637269504.0, + "29": 6637269504.0, + "30": 6637269504.0, + "31": 6637269504.0, + "32": 6637269504.0, + "33": 6637269504.0, + "34": 6637269504.0, + "35": 6637269504.0, + "36": 6637269504.0, + "37": 6637269504.0, + "38": 6637269504.0, + "39": 6637269504.0, + "40": 6637269504.0, + "41": 6637269504.0, + "42": 6637269504.0, + "43": 6637269504.0, + "44": 6637269504.0, + "45": 6637269504.0, + "46": 6637269504.0, + "47": 6637269504.0, + "48": 6637269504.0, + "49": 6637269504.0, + "50": 6637269504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 55055331328.0, + "2": 57809321984.0, + "3": 57918455808.0, + "4": 57918455808.0, + "5": 57918455808.0, + "6": 57918455808.0, + "7": 57918455808.0, + "8": 57918455808.0, + "9": 57918455808.0, + "10": 57918455808.0, + "11": 57918455808.0, + "12": 57918455808.0, + "13": 57931390976.0, + "14": 57931390976.0, + "15": 57931390976.0, + "16": 57931390976.0, + "17": 57931390976.0, + "18": 57931390976.0, + "19": 57931390976.0, + "20": 57931390976.0, + "21": 57931390976.0, + "22": 57931390976.0, + "23": 57931390976.0, + "24": 57931390976.0, + "25": 57931390976.0, + "26": 57931390976.0, + "27": 57931390976.0, + "28": 57931390976.0, + "29": 57931390976.0, + "30": 57931390976.0, + "31": 57931390976.0, + "32": 58003226624.0, + "33": 58003226624.0, + "34": 58003226624.0, + "35": 58003226624.0, + "36": 58003226624.0, + "37": 58003226624.0, + "38": 58003226624.0, + "39": 58003226624.0, + "40": 58003226624.0, + "41": 58003226624.0, + "42": 58003226624.0, + "43": 58003226624.0, + "44": 58183614464.0, + "45": 58234208256.0, + "46": 58555555840.0, + "47": 58555555840.0, + "48": 58555555840.0, + "49": 58555555840.0, + "50": 58780934144.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07654, + "2": 11.07406, + "3": 10.53881, + "4": 10.09803, + "5": 9.81154, + "6": 10.06236, + "7": 9.79762, + "8": 9.07117, + "9": 8.87049, + "10": 9.127, + "11": 8.49853, + "12": 8.53046, + "13": 8.42444, + "14": 7.847, + "15": 7.99077, + "16": 8.05015, + "17": 8.00064, + "18": 7.73104, + "19": 8.11087, + "20": 7.82933, + "21": 7.52501, + "22": 7.49916, + "23": 7.36982, + "24": 7.37235, + "25": 7.61578, + "26": 7.02029, + "27": 7.56014, + "28": 7.2681, + "29": 7.44399, + "30": 7.58618, + "31": 7.32468, + "32": 7.50596, + "33": 7.5715, + "34": 7.63581, + "35": 7.15224, + "36": 7.01784, + "37": 7.35163, + "38": 7.12551, + "39": 7.48656, + "40": 7.47408, + "41": 7.42096, + "42": 7.17595, + "43": 7.16059, + "44": 7.34289, + "45": 7.11969, + "46": 6.82753, + "47": 7.23525, + "48": 7.08042, + "49": 7.51043, + "50": 6.9735 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 69.29797, + "2": 1.7261, + "3": 1.40981, + "4": 2.16562, + "5": 1.7862, + "6": 1.7469, + "7": 1.96688, + "8": 1.97301, + "9": 1.74665, + "10": 1.69613, + "11": 1.02979, + "12": 1.02408, + "13": 1.03261, + "14": 1.02432, + "15": 1.0529, + "16": 1.04491, + "17": 1.03693, + "18": 1.03399, + "19": 1.03627, + "20": 1.02284, + "21": 1.01667, + "22": 1.02932, + "23": 1.03591, + "24": 1.03466, + "25": 1.03149, + "26": 1.03165, + "27": 1.02342, + "28": 1.03777, + "29": 1.04061, + "30": 1.05641, + "31": 1.02382, + "32": 1.01775, + "33": 1.03039, + "34": 1.03693, + "35": 1.03153, + "36": 1.02699, + "37": 1.02756, + "38": 1.02919, + "39": 1.01773, + "40": 1.03491, + "41": 1.03152, + "42": 1.03035, + "43": 1.0221, + "44": 1.05201, + "45": 1.02579, + "46": 1.02798, + "47": 1.03857, + "48": 1.02772, + "49": 1.0408, + "50": 1.03745 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b3668b31178 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04748, + "2": 11.03561, + "3": 9.58773, + "4": 9.25819, + "5": 9.52742, + "6": 9.87911, + "7": 9.48366, + "8": 8.93879, + "9": 8.6551, + "10": 9.10915, + "11": 8.51806, + "12": 8.54732, + "13": 8.48144, + "14": 8.05312, + "15": 8.10118, + "16": 8.10344, + "17": 8.08878, + "18": 7.78589, + "19": 8.15794, + "20": 7.88069, + "21": 7.58542, + "22": 7.54895, + "23": 7.4296, + "24": 7.41901, + "25": 7.67277, + "26": 7.07835, + "27": 7.61157, + "28": 7.31513, + "29": 7.49487, + "30": 7.64287, + "31": 7.39102, + "32": 7.59148, + "33": 7.6393, + "34": 7.70086, + "35": 7.2119, + "36": 7.08623, + "37": 7.43064, + "38": 7.18999, + "39": 7.5525, + "40": 7.54961, + "41": 7.49385, + "42": 7.25481, + "43": 7.24066, + "44": 7.42131, + "45": 7.19201, + "46": 6.90547, + "47": 7.30704, + "48": 7.15325, + "49": 7.60504, + "50": 7.04512 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802612.0, + "2": 38543592.0, + "3": 38739480.0, + "4": 279954336.0, + "5": 249745312.0, + "6": 268288496.0, + "7": 604756224.0, + "8": 781485184.0, + "9": 636362112.0, + "10": 653025216.0, + "11": 668551168.0, + "12": 765583616.0, + "13": 815362944.0, + "14": 834270656.0, + "15": 755756096.0, + "16": 995153536.0, + "17": 938291584.0, + "18": 721524928.0, + "19": 756173504.0, + "20": 901129600.0, + "21": 721816384.0, + "22": 831311872.0, + "23": 803536768.0, + "24": 628253248.0, + "25": 663895680.0, + "26": 847321664.0, + "27": 828927424.0, + "28": 777678976.0, + "29": 764628608.0, + "30": 781930112.0, + "31": 771767616.0, + "32": 771755392.0, + "33": 586323648.0, + "34": 734207552.0, + "35": 690468480.0, + "36": 485982688.0, + "37": 506506336.0, + "38": 642964160.0, + "39": 661240000.0, + "40": 645048768.0, + "41": 636072704.0, + "42": 491645856.0, + "43": 601942528.0, + "44": 623448960.0, + "45": 539959424.0, + "46": 532669088.0, + "47": 529039680.0, + "48": 504121984.0, + "49": 478344480.0, + "50": 331385728.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6637267456.0, + "2": 6637269504.0, + "3": 6637269504.0, + "4": 6637269504.0, + "5": 6637269504.0, + "6": 6637269504.0, + "7": 6637269504.0, + "8": 6637269504.0, + "9": 6637269504.0, + "10": 6637269504.0, + "11": 6637269504.0, + "12": 6637269504.0, + "13": 6637269504.0, + "14": 6637269504.0, + "15": 6637269504.0, + "16": 6637269504.0, + "17": 6637269504.0, + "18": 6637269504.0, + "19": 6637269504.0, + "20": 6637269504.0, + "21": 6637269504.0, + "22": 6637269504.0, + "23": 6637269504.0, + "24": 6637269504.0, + "25": 6637269504.0, + "26": 6637269504.0, + "27": 6637269504.0, + "28": 6637269504.0, + "29": 6637269504.0, + "30": 6637269504.0, + "31": 6637269504.0, + "32": 6637269504.0, + "33": 6637269504.0, + "34": 6637269504.0, + "35": 6637269504.0, + "36": 6637269504.0, + "37": 6637269504.0, + "38": 6637269504.0, + "39": 6637269504.0, + "40": 6637269504.0, + "41": 6637269504.0, + "42": 6637269504.0, + "43": 6637269504.0, + "44": 6637269504.0, + "45": 6637269504.0, + "46": 6637269504.0, + "47": 6637269504.0, + "48": 6637269504.0, + "49": 6637269504.0, + "50": 6637269504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 55055331328.0, + "2": 57809321984.0, + "3": 57919823872.0, + "4": 57919823872.0, + "5": 57919823872.0, + "6": 57919823872.0, + "7": 57919823872.0, + "8": 57919823872.0, + "9": 57919823872.0, + "10": 57919823872.0, + "11": 57919823872.0, + "12": 57919823872.0, + "13": 57932275712.0, + "14": 57932275712.0, + "15": 57932275712.0, + "16": 57932275712.0, + "17": 57932275712.0, + "18": 57932275712.0, + "19": 57932275712.0, + "20": 57932275712.0, + "21": 57932275712.0, + "22": 57932275712.0, + "23": 57932275712.0, + "24": 57932275712.0, + "25": 57932275712.0, + "26": 57932275712.0, + "27": 57932275712.0, + "28": 57932275712.0, + "29": 57932275712.0, + "30": 57932275712.0, + "31": 57932275712.0, + "32": 57932275712.0, + "33": 57932275712.0, + "34": 57932275712.0, + "35": 57932275712.0, + "36": 57932275712.0, + "37": 57932275712.0, + "38": 57932275712.0, + "39": 57932275712.0, + "40": 57932275712.0, + "41": 57932275712.0, + "42": 57932275712.0, + "43": 57932275712.0, + "44": 57932275712.0, + "45": 57932275712.0, + "46": 57932275712.0, + "47": 57932275712.0, + "48": 57932275712.0, + "49": 57932275712.0, + "50": 57932275712.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07654, + "2": 11.07406, + "3": 10.53883, + "4": 10.09801, + "5": 9.81156, + "6": 10.06025, + "7": 9.7962, + "8": 9.06987, + "9": 8.86879, + "10": 9.13393, + "11": 8.5017, + "12": 8.54094, + "13": 8.43678, + "14": 7.85637, + "15": 7.99846, + "16": 8.05889, + "17": 8.01134, + "18": 7.73929, + "19": 8.1188, + "20": 7.83458, + "21": 7.53103, + "22": 7.50125, + "23": 7.37135, + "24": 7.37419, + "25": 7.61596, + "26": 7.01586, + "27": 7.55739, + "28": 7.26274, + "29": 7.43991, + "30": 7.58436, + "31": 7.32289, + "32": 7.50362, + "33": 7.56884, + "34": 7.6339, + "35": 7.151, + "36": 7.01725, + "37": 7.35013, + "38": 7.12483, + "39": 7.48708, + "40": 7.47451, + "41": 7.4181, + "42": 7.17557, + "43": 7.15957, + "44": 7.34227, + "45": 7.12176, + "46": 6.82526, + "47": 7.23374, + "48": 7.07893, + "49": 7.5077, + "50": 6.97094 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 57.80279, + "2": 1.26321, + "3": 1.18918, + "4": 2.24643, + "5": 2.25191, + "6": 1.80757, + "7": 2.09086, + "8": 1.69153, + "9": 1.81279, + "10": 1.64882, + "11": 1.03476, + "12": 1.03593, + "13": 1.04348, + "14": 1.03841, + "15": 1.04432, + "16": 1.05281, + "17": 1.04826, + "18": 1.04981, + "19": 1.05351, + "20": 1.04668, + "21": 1.05254, + "22": 1.05391, + "23": 1.04635, + "24": 1.05503, + "25": 1.04226, + "26": 1.0684, + "27": 1.04985, + "28": 1.04233, + "29": 1.05036, + "30": 1.06219, + "31": 1.044, + "32": 1.05614, + "33": 1.05729, + "34": 1.05618, + "35": 1.06289, + "36": 1.05761, + "37": 1.05956, + "38": 1.06343, + "39": 1.06848, + "40": 1.06027, + "41": 1.05493, + "42": 1.05258, + "43": 1.04879, + "44": 1.04949, + "45": 1.05964, + "46": 1.04465, + "47": 1.0491, + "48": 1.05387, + "49": 1.05218, + "50": 1.05453 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json index 657f6cef025..a7b4d2b32ca 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81131, + "2": 10.83052, + "3": 10.82093, + "4": 10.81347, "5": 10.84338, + "6": 10.84743, + "7": 10.85254, + "8": 10.83482, + "9": 10.84276, "10": 10.77693, + "11": 10.8459, + "12": 10.85115, + "13": 10.84165, + "14": 10.8714, "15": 10.83613, + "16": 10.79815, + "17": 10.77288, + "18": 10.8075, + "19": 10.78773, "20": 10.73433, + "21": 10.69461, + "22": 10.56597, + "23": 10.71611, + "24": 10.61321, "25": 10.552, + "26": 10.61364, + "27": 10.62702, + "28": 10.59546, + "29": 10.59195, "30": 10.3916, + "31": 10.14615, + "32": 10.47399, + "33": 10.47051, + "34": 10.23435, "35": 10.29318, + "36": 10.26627, + "37": 10.37219, + "38": 10.2254, + "39": 10.42101, "40": 10.13002, + "41": 10.16265, + "42": 10.24278, + "43": 9.88237, + "44": 9.99105, "45": 9.87295, + "46": 9.85181, + "47": 10.15633, + "48": 9.8915, + "49": 9.58889, "50": 9.9543, + "51": 9.8849, + "52": 9.78004, + "53": 10.10188, + "54": 9.98715, "55": 9.9027, + "56": 9.66837, + "57": 9.53524, + "58": 9.89495, + "59": 9.62892, "60": 9.54308, + "61": 9.72727, + "62": 10.0332, + "63": 9.45215, + "64": 9.83179, "65": 8.99109, + "66": 9.76394, + "67": 9.40349, + "68": 9.83129, + "69": 9.81856, "70": 9.77262, + "71": 9.658, + "72": 9.64033, + "73": 9.55124, + "74": 9.02026, "75": 9.47695, + "76": 9.13586, + "77": 10.09787, + "78": 9.75274, + "79": 9.41697, "80": 9.45074, + "81": 9.52041, + "82": 9.73203, + "83": 9.36912, + "84": 9.45039, "85": 9.65229, + "86": 9.1123, + "87": 9.61119, + "88": 9.78708, + "89": 9.64625, "90": 9.83474, + "91": 9.39429, + "92": 9.39178, + "93": 9.12787, + "94": 8.86637, "95": 9.54352, + "96": 9.55716, + "97": 9.332, + "98": 9.69189, + "99": 8.92072, "100": 9.41916 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1216.0, + "2": 1361.0, + "3": 1221.0, + "4": 1222.0, "5": 1385.0, + "6": 1467.0, + "7": 1252.0, + "8": 1355.0, + "9": 1346.0, "10": 1335.0, + "11": 1278.0, + "12": 1185.0, + "13": 1203.0, + "14": 1385.0, "15": 1303.0, + "16": 1377.0, + "17": 1229.0, + "18": 1291.0, + "19": 1244.0, "20": 1183.0, + "21": 1262.0, + "22": 1122.0, + "23": 1301.0, + "24": 1066.0, "25": 1182.0, + "26": 1263.0, + "27": 1162.0, + "28": 1262.0, + "29": 1179.0, "30": 1168.0, + "31": 991.0, + "32": 1092.0, + "33": 1183.0, + "34": 1081.0, "35": 1146.0, + "36": 1076.0, + "37": 1252.0, + "38": 1176.0, + "39": 1225.0, "40": 1303.0, + "41": 1104.0, + "42": 1210.0, + "43": 1116.0, + "44": 1165.0, "45": 1097.0, + "46": 1308.0, + "47": 1165.0, + "48": 1134.0, + "49": 1272.0, "50": 1083.0, + "51": 1234.0, + "52": 1274.0, + "53": 1393.0, + "54": 1299.0, "55": 1186.0, + "56": 1267.0, + "57": 1161.0, + "58": 1326.0, + "59": 1403.0, "60": 1177.0, + "61": 1363.0, + "62": 1302.0, + "63": 1245.0, + "64": 1378.0, "65": 1330.0, + "66": 1363.0, + "67": 1286.0, + "68": 1313.0, + "69": 1295.0, "70": 1459.0, + "71": 1374.0, + "72": 1092.0, + "73": 1274.0, + "74": 943.0, "75": 1059.0, + "76": 1323.0, + "77": 1475.0, + "78": 1487.0, + "79": 1496.0, "80": 1382.0, + "81": 1470.0, + "82": 1417.0, + "83": 1177.0, + "84": 1506.0, "85": 1420.0, + "86": 1281.0, + "87": 1540.0, + "88": 1467.0, + "89": 1452.0, "90": 1350.0, + "91": 1010.0, + "92": 1324.0, + "93": 1349.0, + "94": 1197.0, "95": 2503.0, + "96": 2373.0, + "97": 1490.0, + "98": 2541.0, + "99": 1367.0, "100": 1122.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 788517888.0, + "2": 788488192.0, + "3": 788535296.0, + "4": 788513280.0, "5": 788537344.0, + "6": 788479488.0, + "7": 788502528.0, + "8": 788510208.0, + "9": 788526080.0, "10": 788538368.0, + "11": 788513280.0, + "12": 788484096.0, + "13": 788542464.0, + "14": 788451328.0, "15": 788503040.0, + "16": 788440576.0, + "17": 788558336.0, + "18": 788535296.0, + "19": 788542464.0, "20": 788470784.0, + "21": 788508672.0, + "22": 788594176.0, + "23": 788573696.0, + "24": 788513280.0, "25": 788655616.0, + "26": 788566016.0, + "27": 788630528.0, + "28": 788568576.0, + "29": 788610560.0, "30": 788587520.0, + "31": 788647424.0, + "32": 788602880.0, + "33": 788616704.0, + "34": 788577792.0, "35": 788616704.0, + "36": 788642304.0, + "37": 788597760.0, + "38": 788650496.0, + "39": 788663296.0, "40": 788550144.0, + "41": 788591616.0, + "42": 788575232.0, + "43": 788541952.0, + "44": 788623872.0, "45": 788491264.0, + "46": 788503552.0, + "47": 788572160.0, + "48": 788488704.0, + "49": 788461568.0, "50": 788487168.0, + "51": 788523008.0, + "52": 788483584.0, + "53": 788513792.0, + "54": 788503552.0, "55": 788499968.0, + "56": 788459008.0, + "57": 788456448.0, + "58": 788499968.0, + "59": 788503552.0, "60": 788491264.0, + "61": 788463616.0, + "62": 788497408.0, + "63": 788449792.0, + "64": 788465664.0, "65": 788408320.0, + "66": 788445696.0, + "67": 788445696.0, + "68": 788456448.0, + "69": 788473856.0, "70": 788497408.0, + "71": 788453888.0, + "72": 788413952.0, + "73": 788444160.0, + "74": 788419072.0, "75": 788441600.0, + "76": 788412928.0, + "77": 788471296.0, + "78": 788462592.0, + "79": 788419072.0, "80": 788411392.0, + "81": 788430848.0, + "82": 788439040.0, + "83": 788435456.0, + "84": 788471296.0, "85": 788461056.0, + "86": 788395008.0, + "87": 788490752.0, + "88": 788493312.0, + "89": 788501504.0, "90": 788531712.0, + "91": 788513792.0, + "92": 788516864.0, + "93": 788487168.0, + "94": 788506624.0, "95": 788543488.0, + "96": 788563456.0, + "97": 788579840.0, + "98": 788590592.0, + "99": 788514816.0, "100": 788570624.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3023035904.0, + "2": 3179259392.0, + "3": 3206071808.0, + "4": 3206071808.0, "5": 3206539776.0, + "6": 3206539776.0, + "7": 3206539776.0, + "8": 3206539776.0, + "9": 3206539776.0, "10": 3206539776.0, + "11": 3206539776.0, + "12": 3206539776.0, + "13": 3207718400.0, + "14": 3207718400.0, "15": 3207718400.0, + "16": 3207718400.0, + "17": 3219952640.0, + "18": 3219952640.0, + "19": 3219952640.0, "20": 3219952640.0, + "21": 3219952640.0, + "22": 3239834624.0, + "23": 3239834624.0, + "24": 3239834624.0, "25": 3276544000.0, + "26": 3276544000.0, + "27": 3276544000.0, + "28": 3276544000.0, + "29": 3276544000.0, "30": 3276544000.0, + "31": 3276544000.0, + "32": 3276544000.0, + "33": 3276544000.0, + "34": 3276544000.0, "35": 3276544000.0, + "36": 3276544000.0, + "37": 3276544000.0, + "38": 3276544000.0, + "39": 3281670656.0, "40": 3281670656.0, + "41": 3281670656.0, + "42": 3281670656.0, + "43": 3281670656.0, + "44": 3281670656.0, "45": 3281670656.0, + "46": 3281670656.0, + "47": 3281670656.0, + "48": 3281670656.0, + "49": 3281670656.0, "50": 3281670656.0, + "51": 3281670656.0, + "52": 3281670656.0, + "53": 3281670656.0, + "54": 3281670656.0, "55": 3281670656.0, + "56": 3281670656.0, + "57": 3281670656.0, + "58": 3281670656.0, + "59": 3281670656.0, "60": 3281670656.0, + "61": 3281670656.0, + "62": 3281670656.0, + "63": 3281670656.0, + "64": 3281670656.0, "65": 3281670656.0, + "66": 3281670656.0, + "67": 3281670656.0, + "68": 3281670656.0, + "69": 3281670656.0, "70": 3281670656.0, + "71": 3281670656.0, + "72": 3281670656.0, + "73": 3281670656.0, + "74": 3281670656.0, "75": 3281670656.0, + "76": 3281670656.0, + "77": 3281670656.0, + "78": 3281670656.0, + "79": 3281670656.0, "80": 3281670656.0, + "81": 3281670656.0, + "82": 3281670656.0, + "83": 3281670656.0, + "84": 3281670656.0, "85": 3281670656.0, + "86": 3281670656.0, + "87": 3281670656.0, + "88": 3281670656.0, + "89": 3281670656.0, "90": 3281670656.0, + "91": 3281670656.0, + "92": 3281670656.0, + "93": 3281670656.0, + "94": 3281670656.0, "95": 3281670656.0, + "96": 3281670656.0, + "97": 3281670656.0, + "98": 3281670656.0, + "99": 3281670656.0, "100": 3281670656.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 11.3696, - "5": 0.16522, - "10": 0.1423, - "15": 0.12936, - "20": 0.14324, - "25": 0.1364, - "30": 0.15701, - "35": 0.15051, - "40": 0.14884, - "45": 0.15496, - "50": 0.15176, - "55": 0.1467, - "60": 0.16277, - "65": 0.14457, - "70": 0.16001, - "75": 0.15317, - "80": 0.15169, - "85": 0.15317, - "90": 0.14836, - "95": 0.1485, - "100": 0.1485 + "1": 11.7037, + "2": 0.22491, + "3": 0.19533, + "4": 0.17539, + "5": 0.18483, + "6": 0.16647, + "7": 0.1641, + "8": 0.16288, + "9": 0.15397, + "10": 0.15258, + "11": 0.15812, + "12": 0.15338, + "13": 0.14727, + "14": 0.15276, + "15": 0.1431, + "16": 0.1553, + "17": 0.14923, + "18": 0.15041, + "19": 0.15216, + "20": 0.15811, + "21": 0.14566, + "22": 0.14796, + "23": 0.15503, + "24": 0.15065, + "25": 0.15039, + "26": 0.15548, + "27": 0.158, + "28": 0.16038, + "29": 0.16862, + "30": 0.16712, + "31": 0.16858, + "32": 0.16095, + "33": 0.163, + "34": 0.1624, + "35": 0.16519, + "36": 0.16981, + "37": 0.16271, + "38": 0.16155, + "39": 0.17014, + "40": 0.1593, + "41": 0.167, + "42": 0.16495, + "43": 0.1718, + "44": 0.16565, + "45": 0.16518, + "46": 0.16648, + "47": 0.16483, + "48": 0.16244, + "49": 0.16707, + "50": 0.16226, + "51": 0.1715, + "52": 0.16281, + "53": 0.16077, + "54": 0.15821, + "55": 0.15951, + "56": 0.16684, + "57": 0.16109, + "58": 0.16192, + "59": 0.16349, + "60": 0.16237, + "61": 0.15955, + "62": 0.15954, + "63": 0.15968, + "64": 0.16092, + "65": 0.1539, + "66": 0.16199, + "67": 0.15811, + "68": 0.1652, + "69": 0.16307, + "70": 0.17014, + "71": 0.15399, + "72": 0.16312, + "73": 0.15787, + "74": 0.16598, + "75": 0.16279, + "76": 0.15216, + "77": 0.16031, + "78": 0.15503, + "79": 0.16083, + "80": 0.16046, + "81": 0.15996, + "82": 0.15176, + "83": 0.16328, + "84": 0.16094, + "85": 0.16065, + "86": 0.1554, + "87": 0.15864, + "88": 0.16406, + "89": 0.15924, + "90": 0.15731, + "91": 0.15776, + "92": 0.16339, + "93": 0.15877, + "94": 0.15733, + "95": 0.15774, + "96": 0.15579, + "97": 0.16338, + "98": 0.15898, + "99": 0.16066, + "100": 0.15749 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..b4d227b10e3 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81131, + "2": 10.83052, + "3": 10.82093, + "4": 10.81347, + "5": 10.84338, + "6": 10.84743, + "7": 10.85254, + "8": 10.83482, + "9": 10.84276, + "10": 10.77693, + "11": 10.8459, + "12": 10.85115, + "13": 10.84165, + "14": 10.8714, + "15": 10.83613, + "16": 10.79815, + "17": 10.77288, + "18": 10.8075, + "19": 10.78773, + "20": 10.73433, + "21": 10.69461, + "22": 10.56597, + "23": 10.71611, + "24": 10.61321, + "25": 10.552, + "26": 10.61364, + "27": 10.62702, + "28": 10.59546, + "29": 10.59195, + "30": 10.3916, + "31": 10.14615, + "32": 10.47399, + "33": 10.47051, + "34": 10.23435, + "35": 10.29318, + "36": 10.26627, + "37": 10.37219, + "38": 10.2254, + "39": 10.42101, + "40": 10.13002, + "41": 10.16265, + "42": 10.24278, + "43": 9.88237, + "44": 9.99105, + "45": 9.87295, + "46": 9.85181, + "47": 10.15633, + "48": 9.8915, + "49": 9.58889, + "50": 9.9543, + "51": 9.8849, + "52": 9.78004, + "53": 10.10188, + "54": 9.98715, + "55": 9.9027, + "56": 9.66837, + "57": 9.53524, + "58": 9.89495, + "59": 9.62892, + "60": 9.54308, + "61": 9.72727, + "62": 10.0332, + "63": 9.45215, + "64": 9.83179, + "65": 8.99109, + "66": 9.76394, + "67": 9.40349, + "68": 9.83129, + "69": 9.81856, + "70": 9.77262, + "71": 9.658, + "72": 9.64033, + "73": 9.55124, + "74": 9.02026, + "75": 9.47695, + "76": 9.13586, + "77": 10.09787, + "78": 9.75274, + "79": 9.41697, + "80": 9.45074, + "81": 9.52041, + "82": 9.73203, + "83": 9.36912, + "84": 9.45039, + "85": 9.65229, + "86": 9.1123, + "87": 9.61119, + "88": 9.78708, + "89": 9.64625, + "90": 9.83474, + "91": 9.39429, + "92": 9.39178, + "93": 9.12787, + "94": 8.86637, + "95": 9.54352, + "96": 9.55716, + "97": 9.332, + "98": 9.69189, + "99": 8.92072, + "100": 9.41916 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1216.0, + "2": 1361.0, + "3": 1221.0, + "4": 1222.0, + "5": 1385.0, + "6": 1467.0, + "7": 1252.0, + "8": 1355.0, + "9": 1346.0, + "10": 1335.0, + "11": 1278.0, + "12": 1185.0, + "13": 1203.0, + "14": 1385.0, + "15": 1303.0, + "16": 1377.0, + "17": 1229.0, + "18": 1291.0, + "19": 1244.0, + "20": 1183.0, + "21": 1262.0, + "22": 1122.0, + "23": 1301.0, + "24": 1066.0, + "25": 1182.0, + "26": 1263.0, + "27": 1162.0, + "28": 1262.0, + "29": 1179.0, + "30": 1168.0, + "31": 991.0, + "32": 1092.0, + "33": 1183.0, + "34": 1081.0, + "35": 1146.0, + "36": 1076.0, + "37": 1252.0, + "38": 1176.0, + "39": 1225.0, + "40": 1303.0, + "41": 1104.0, + "42": 1210.0, + "43": 1116.0, + "44": 1165.0, + "45": 1097.0, + "46": 1308.0, + "47": 1165.0, + "48": 1134.0, + "49": 1272.0, + "50": 1083.0, + "51": 1234.0, + "52": 1274.0, + "53": 1393.0, + "54": 1299.0, + "55": 1186.0, + "56": 1267.0, + "57": 1161.0, + "58": 1326.0, + "59": 1403.0, + "60": 1177.0, + "61": 1363.0, + "62": 1302.0, + "63": 1245.0, + "64": 1378.0, + "65": 1330.0, + "66": 1363.0, + "67": 1286.0, + "68": 1313.0, + "69": 1295.0, + "70": 1459.0, + "71": 1374.0, + "72": 1092.0, + "73": 1274.0, + "74": 943.0, + "75": 1059.0, + "76": 1323.0, + "77": 1475.0, + "78": 1487.0, + "79": 1496.0, + "80": 1382.0, + "81": 1470.0, + "82": 1417.0, + "83": 1177.0, + "84": 1506.0, + "85": 1420.0, + "86": 1281.0, + "87": 1540.0, + "88": 1467.0, + "89": 1452.0, + "90": 1350.0, + "91": 1010.0, + "92": 1324.0, + "93": 1349.0, + "94": 1197.0, + "95": 2503.0, + "96": 2373.0, + "97": 1490.0, + "98": 2541.0, + "99": 1367.0, + "100": 1122.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 788517888.0, + "2": 788488192.0, + "3": 788535296.0, + "4": 788513280.0, + "5": 788537344.0, + "6": 788479488.0, + "7": 788502528.0, + "8": 788510208.0, + "9": 788526080.0, + "10": 788538368.0, + "11": 788513280.0, + "12": 788484096.0, + "13": 788542464.0, + "14": 788451328.0, + "15": 788503040.0, + "16": 788440576.0, + "17": 788558336.0, + "18": 788535296.0, + "19": 788542464.0, + "20": 788470784.0, + "21": 788508672.0, + "22": 788594176.0, + "23": 788573696.0, + "24": 788513280.0, + "25": 788655616.0, + "26": 788566016.0, + "27": 788630528.0, + "28": 788568576.0, + "29": 788610560.0, + "30": 788587520.0, + "31": 788647424.0, + "32": 788602880.0, + "33": 788616704.0, + "34": 788577792.0, + "35": 788616704.0, + "36": 788642304.0, + "37": 788597760.0, + "38": 788650496.0, + "39": 788663296.0, + "40": 788550144.0, + "41": 788591616.0, + "42": 788575232.0, + "43": 788541952.0, + "44": 788623872.0, + "45": 788491264.0, + "46": 788503552.0, + "47": 788572160.0, + "48": 788488704.0, + "49": 788461568.0, + "50": 788487168.0, + "51": 788523008.0, + "52": 788483584.0, + "53": 788513792.0, + "54": 788503552.0, + "55": 788499968.0, + "56": 788459008.0, + "57": 788456448.0, + "58": 788499968.0, + "59": 788503552.0, + "60": 788491264.0, + "61": 788463616.0, + "62": 788497408.0, + "63": 788449792.0, + "64": 788465664.0, + "65": 788408320.0, + "66": 788445696.0, + "67": 788445696.0, + "68": 788456448.0, + "69": 788473856.0, + "70": 788497408.0, + "71": 788453888.0, + "72": 788413952.0, + "73": 788444160.0, + "74": 788419072.0, + "75": 788441600.0, + "76": 788412928.0, + "77": 788471296.0, + "78": 788462592.0, + "79": 788419072.0, + "80": 788411392.0, + "81": 788430848.0, + "82": 788439040.0, + "83": 788435456.0, + "84": 788471296.0, + "85": 788461056.0, + "86": 788395008.0, + "87": 788490752.0, + "88": 788493312.0, + "89": 788501504.0, + "90": 788531712.0, + "91": 788513792.0, + "92": 788516864.0, + "93": 788487168.0, + "94": 788506624.0, + "95": 788543488.0, + "96": 788563456.0, + "97": 788579840.0, + "98": 788590592.0, + "99": 788514816.0, + "100": 788570624.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3023035904.0, + "2": 3179259392.0, + "3": 3206071808.0, + "4": 3206071808.0, + "5": 3206539776.0, + "6": 3206539776.0, + "7": 3206539776.0, + "8": 3206539776.0, + "9": 3206539776.0, + "10": 3206539776.0, + "11": 3206539776.0, + "12": 3206539776.0, + "13": 3207718400.0, + "14": 3207718400.0, + "15": 3207718400.0, + "16": 3207718400.0, + "17": 3219952640.0, + "18": 3219952640.0, + "19": 3219952640.0, + "20": 3219952640.0, + "21": 3219952640.0, + "22": 3239834624.0, + "23": 3239834624.0, + "24": 3239834624.0, + "25": 3276544000.0, + "26": 3276544000.0, + "27": 3276544000.0, + "28": 3276544000.0, + "29": 3276544000.0, + "30": 3276544000.0, + "31": 3276544000.0, + "32": 3276544000.0, + "33": 3276544000.0, + "34": 3276544000.0, + "35": 3276544000.0, + "36": 3276544000.0, + "37": 3276544000.0, + "38": 3276544000.0, + "39": 3281670656.0, + "40": 3281670656.0, + "41": 3281670656.0, + "42": 3281670656.0, + "43": 3281670656.0, + "44": 3281670656.0, + "45": 3281670656.0, + "46": 3281670656.0, + "47": 3281670656.0, + "48": 3281670656.0, + "49": 3281670656.0, + "50": 3281670656.0, + "51": 3281670656.0, + "52": 3281670656.0, + "53": 3281670656.0, + "54": 3281670656.0, + "55": 3281670656.0, + "56": 3281670656.0, + "57": 3281670656.0, + "58": 3281670656.0, + "59": 3281670656.0, + "60": 3281670656.0, + "61": 3281670656.0, + "62": 3281670656.0, + "63": 3281670656.0, + "64": 3281670656.0, + "65": 3281670656.0, + "66": 3281670656.0, + "67": 3281670656.0, + "68": 3281670656.0, + "69": 3281670656.0, + "70": 3281670656.0, + "71": 3281670656.0, + "72": 3281670656.0, + "73": 3281670656.0, + "74": 3281670656.0, + "75": 3281670656.0, + "76": 3281670656.0, + "77": 3281670656.0, + "78": 3281670656.0, + "79": 3281670656.0, + "80": 3281670656.0, + "81": 3281670656.0, + "82": 3281670656.0, + "83": 3281670656.0, + "84": 3281670656.0, + "85": 3281670656.0, + "86": 3281670656.0, + "87": 3281670656.0, + "88": 3281670656.0, + "89": 3281670656.0, + "90": 3281670656.0, + "91": 3281670656.0, + "92": 3281670656.0, + "93": 3281670656.0, + "94": 3281670656.0, + "95": 3281670656.0, + "96": 3281670656.0, + "97": 3281670656.0, + "98": 3281670656.0, + "99": 3281670656.0, + "100": 3281670656.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 13.21246, + "2": 0.19223, + "3": 0.15847, + "4": 0.14572, + "5": 0.16957, + "6": 0.15266, + "7": 0.1476, + "8": 0.14988, + "9": 0.13878, + "10": 0.14012, + "11": 0.14591, + "12": 0.13945, + "13": 0.13431, + "14": 0.13944, + "15": 0.12844, + "16": 0.14372, + "17": 0.13297, + "18": 0.13719, + "19": 0.13802, + "20": 0.14981, + "21": 0.14099, + "22": 0.12975, + "23": 0.13616, + "24": 0.13752, + "25": 0.13502, + "26": 0.14149, + "27": 0.14818, + "28": 0.14416, + "29": 0.15275, + "30": 0.15077, + "31": 0.15206, + "32": 0.14915, + "33": 0.14666, + "34": 0.1514, + "35": 0.15021, + "36": 0.15193, + "37": 0.14779, + "38": 0.14835, + "39": 0.15073, + "40": 0.14707, + "41": 0.15268, + "42": 0.14878, + "43": 0.15579, + "44": 0.15254, + "45": 0.14999, + "46": 0.20896, + "47": 0.15273, + "48": 0.1484, + "49": 0.15559, + "50": 0.15018, + "51": 0.16013, + "52": 0.15399, + "53": 0.15753, + "54": 0.14895, + "55": 0.14858, + "56": 0.16309, + "57": 0.15206, + "58": 0.15115, + "59": 0.15315, + "60": 0.15387, + "61": 0.14946, + "62": 0.15213, + "63": 0.14874, + "64": 0.15283, + "65": 0.14602, + "66": 0.15458, + "67": 0.15123, + "68": 0.1551, + "69": 0.15244, + "70": 0.16045, + "71": 0.14441, + "72": 0.15574, + "73": 0.15315, + "74": 0.15619, + "75": 0.15269, + "76": 0.14224, + "77": 0.15289, + "78": 0.14961, + "79": 0.153, + "80": 0.15606, + "81": 0.15226, + "82": 0.14364, + "83": 0.15261, + "84": 0.15146, + "85": 0.15268, + "86": 0.14691, + "87": 0.15346, + "88": 0.15373, + "89": 0.14793, + "90": 0.14784, + "91": 0.14748, + "92": 0.15356, + "93": 0.14881, + "94": 0.14846, + "95": 0.14747, + "96": 0.14823, + "97": 0.15527, + "98": 0.15043, + "99": 0.15066, + "100": 0.14841 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..2ffe6fcfe65 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81131, + "2": 10.83052, + "3": 10.82093, + "4": 10.81347, + "5": 10.84338, + "6": 10.84743, + "7": 10.85254, + "8": 10.83482, + "9": 10.84276, + "10": 10.77693, + "11": 10.8459, + "12": 10.85115, + "13": 10.84165, + "14": 10.8714, + "15": 10.83613, + "16": 10.79815, + "17": 10.77288, + "18": 10.8075, + "19": 10.78773, + "20": 10.73433, + "21": 10.69461, + "22": 10.56597, + "23": 10.71611, + "24": 10.61321, + "25": 10.552, + "26": 10.61364, + "27": 10.62702, + "28": 10.59546, + "29": 10.59195, + "30": 10.3916, + "31": 10.14615, + "32": 10.47399, + "33": 10.47051, + "34": 10.23435, + "35": 10.29318, + "36": 10.26627, + "37": 10.37219, + "38": 10.2254, + "39": 10.42101, + "40": 10.13002, + "41": 10.16265, + "42": 10.24278, + "43": 9.88237, + "44": 9.99105, + "45": 9.87295, + "46": 9.85181, + "47": 10.15633, + "48": 9.8915, + "49": 9.58889, + "50": 9.9543, + "51": 9.8849, + "52": 9.78004, + "53": 10.10188, + "54": 9.98715, + "55": 9.9027, + "56": 9.66837, + "57": 9.53524, + "58": 9.89495, + "59": 9.62892, + "60": 9.54308, + "61": 9.72727, + "62": 10.0332, + "63": 9.45215, + "64": 9.83179, + "65": 8.99109, + "66": 9.76394, + "67": 9.40349, + "68": 9.83129, + "69": 9.81856, + "70": 9.77262, + "71": 9.658, + "72": 9.64033, + "73": 9.55124, + "74": 9.02026, + "75": 9.47695, + "76": 9.13586, + "77": 10.09787, + "78": 9.75274, + "79": 9.41697, + "80": 9.45074, + "81": 9.52041, + "82": 9.73203, + "83": 9.36912, + "84": 9.45039, + "85": 9.65229, + "86": 9.1123, + "87": 9.61119, + "88": 9.78708, + "89": 9.64625, + "90": 9.83474, + "91": 9.39429, + "92": 9.39178, + "93": 9.12787, + "94": 8.86637, + "95": 9.54352, + "96": 9.55716, + "97": 9.332, + "98": 9.69189, + "99": 8.92072, + "100": 9.41916 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1216.0, + "2": 1361.0, + "3": 1221.0, + "4": 1222.0, + "5": 1385.0, + "6": 1467.0, + "7": 1252.0, + "8": 1355.0, + "9": 1346.0, + "10": 1335.0, + "11": 1278.0, + "12": 1185.0, + "13": 1203.0, + "14": 1385.0, + "15": 1303.0, + "16": 1377.0, + "17": 1229.0, + "18": 1291.0, + "19": 1244.0, + "20": 1183.0, + "21": 1262.0, + "22": 1122.0, + "23": 1301.0, + "24": 1066.0, + "25": 1182.0, + "26": 1263.0, + "27": 1162.0, + "28": 1262.0, + "29": 1179.0, + "30": 1168.0, + "31": 991.0, + "32": 1092.0, + "33": 1183.0, + "34": 1081.0, + "35": 1146.0, + "36": 1076.0, + "37": 1252.0, + "38": 1176.0, + "39": 1225.0, + "40": 1303.0, + "41": 1104.0, + "42": 1210.0, + "43": 1116.0, + "44": 1165.0, + "45": 1097.0, + "46": 1308.0, + "47": 1165.0, + "48": 1134.0, + "49": 1272.0, + "50": 1083.0, + "51": 1234.0, + "52": 1274.0, + "53": 1393.0, + "54": 1299.0, + "55": 1186.0, + "56": 1267.0, + "57": 1161.0, + "58": 1326.0, + "59": 1403.0, + "60": 1177.0, + "61": 1363.0, + "62": 1302.0, + "63": 1245.0, + "64": 1378.0, + "65": 1330.0, + "66": 1363.0, + "67": 1286.0, + "68": 1313.0, + "69": 1295.0, + "70": 1459.0, + "71": 1374.0, + "72": 1092.0, + "73": 1274.0, + "74": 943.0, + "75": 1059.0, + "76": 1323.0, + "77": 1475.0, + "78": 1487.0, + "79": 1496.0, + "80": 1382.0, + "81": 1470.0, + "82": 1417.0, + "83": 1177.0, + "84": 1506.0, + "85": 1420.0, + "86": 1281.0, + "87": 1540.0, + "88": 1467.0, + "89": 1452.0, + "90": 1350.0, + "91": 1010.0, + "92": 1324.0, + "93": 1349.0, + "94": 1197.0, + "95": 2503.0, + "96": 2373.0, + "97": 1490.0, + "98": 2541.0, + "99": 1367.0, + "100": 1122.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 788517888.0, + "2": 788488192.0, + "3": 788535296.0, + "4": 788513280.0, + "5": 788537344.0, + "6": 788479488.0, + "7": 788502528.0, + "8": 788510208.0, + "9": 788526080.0, + "10": 788538368.0, + "11": 788513280.0, + "12": 788484096.0, + "13": 788542464.0, + "14": 788451328.0, + "15": 788503040.0, + "16": 788440576.0, + "17": 788558336.0, + "18": 788535296.0, + "19": 788542464.0, + "20": 788470784.0, + "21": 788508672.0, + "22": 788594176.0, + "23": 788573696.0, + "24": 788513280.0, + "25": 788655616.0, + "26": 788566016.0, + "27": 788630528.0, + "28": 788568576.0, + "29": 788610560.0, + "30": 788587520.0, + "31": 788647424.0, + "32": 788602880.0, + "33": 788616704.0, + "34": 788577792.0, + "35": 788616704.0, + "36": 788642304.0, + "37": 788597760.0, + "38": 788650496.0, + "39": 788663296.0, + "40": 788550144.0, + "41": 788591616.0, + "42": 788575232.0, + "43": 788541952.0, + "44": 788623872.0, + "45": 788491264.0, + "46": 788503552.0, + "47": 788572160.0, + "48": 788488704.0, + "49": 788461568.0, + "50": 788487168.0, + "51": 788523008.0, + "52": 788483584.0, + "53": 788513792.0, + "54": 788503552.0, + "55": 788499968.0, + "56": 788459008.0, + "57": 788456448.0, + "58": 788499968.0, + "59": 788503552.0, + "60": 788491264.0, + "61": 788463616.0, + "62": 788497408.0, + "63": 788449792.0, + "64": 788465664.0, + "65": 788408320.0, + "66": 788445696.0, + "67": 788445696.0, + "68": 788456448.0, + "69": 788473856.0, + "70": 788497408.0, + "71": 788453888.0, + "72": 788413952.0, + "73": 788444160.0, + "74": 788419072.0, + "75": 788441600.0, + "76": 788412928.0, + "77": 788471296.0, + "78": 788462592.0, + "79": 788419072.0, + "80": 788411392.0, + "81": 788430848.0, + "82": 788439040.0, + "83": 788435456.0, + "84": 788471296.0, + "85": 788461056.0, + "86": 788395008.0, + "87": 788490752.0, + "88": 788493312.0, + "89": 788501504.0, + "90": 788531712.0, + "91": 788513792.0, + "92": 788516864.0, + "93": 788487168.0, + "94": 788506624.0, + "95": 788543488.0, + "96": 788563456.0, + "97": 788579840.0, + "98": 788590592.0, + "99": 788514816.0, + "100": 788570624.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 3023035904.0, + "2": 3179259392.0, + "3": 3206071808.0, + "4": 3206071808.0, + "5": 3206539776.0, + "6": 3206539776.0, + "7": 3206539776.0, + "8": 3206539776.0, + "9": 3206539776.0, + "10": 3206539776.0, + "11": 3206539776.0, + "12": 3206539776.0, + "13": 3207718400.0, + "14": 3207718400.0, + "15": 3207718400.0, + "16": 3207718400.0, + "17": 3219952640.0, + "18": 3219952640.0, + "19": 3219952640.0, + "20": 3219952640.0, + "21": 3219952640.0, + "22": 3239834624.0, + "23": 3239834624.0, + "24": 3239834624.0, + "25": 3276544000.0, + "26": 3276544000.0, + "27": 3276544000.0, + "28": 3276544000.0, + "29": 3276544000.0, + "30": 3276544000.0, + "31": 3276544000.0, + "32": 3276544000.0, + "33": 3276544000.0, + "34": 3276544000.0, + "35": 3276544000.0, + "36": 3276544000.0, + "37": 3276544000.0, + "38": 3276544000.0, + "39": 3281670656.0, + "40": 3281670656.0, + "41": 3281670656.0, + "42": 3281670656.0, + "43": 3281670656.0, + "44": 3281670656.0, + "45": 3281670656.0, + "46": 3281670656.0, + "47": 3281670656.0, + "48": 3281670656.0, + "49": 3281670656.0, + "50": 3281670656.0, + "51": 3281670656.0, + "52": 3281670656.0, + "53": 3281670656.0, + "54": 3281670656.0, + "55": 3281670656.0, + "56": 3281670656.0, + "57": 3281670656.0, + "58": 3281670656.0, + "59": 3281670656.0, + "60": 3281670656.0, + "61": 3281670656.0, + "62": 3281670656.0, + "63": 3281670656.0, + "64": 3281670656.0, + "65": 3281670656.0, + "66": 3281670656.0, + "67": 3281670656.0, + "68": 3281670656.0, + "69": 3281670656.0, + "70": 3281670656.0, + "71": 3281670656.0, + "72": 3281670656.0, + "73": 3281670656.0, + "74": 3281670656.0, + "75": 3281670656.0, + "76": 3281670656.0, + "77": 3281670656.0, + "78": 3281670656.0, + "79": 3281670656.0, + "80": 3281670656.0, + "81": 3281670656.0, + "82": 3281670656.0, + "83": 3281670656.0, + "84": 3281670656.0, + "85": 3281670656.0, + "86": 3281670656.0, + "87": 3281670656.0, + "88": 3281670656.0, + "89": 3281670656.0, + "90": 3281670656.0, + "91": 3281670656.0, + "92": 3281670656.0, + "93": 3281670656.0, + "94": 3281670656.0, + "95": 3281670656.0, + "96": 3281670656.0, + "97": 3281670656.0, + "98": 3281670656.0, + "99": 3281670656.0, + "100": 3281670656.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.84919, + "2": 0.21301, + "3": 0.1875, + "4": 0.18049, + "5": 0.18318, + "6": 0.16229, + "7": 0.16391, + "8": 0.16206, + "9": 0.1519, + "10": 0.15265, + "11": 0.15406, + "12": 0.15153, + "13": 0.14262, + "14": 0.15066, + "15": 0.1386, + "16": 0.15377, + "17": 0.14672, + "18": 0.15, + "19": 0.15031, + "20": 0.15363, + "21": 0.14157, + "22": 0.14022, + "23": 0.15031, + "24": 0.14784, + "25": 0.14617, + "26": 0.15072, + "27": 0.15826, + "28": 0.15989, + "29": 0.17285, + "30": 0.16368, + "31": 0.16977, + "32": 0.1612, + "33": 0.15985, + "34": 0.15796, + "35": 0.16549, + "36": 0.16888, + "37": 0.16396, + "38": 0.16275, + "39": 0.16316, + "40": 0.15731, + "41": 0.16488, + "42": 0.16446, + "43": 0.16827, + "44": 0.16392, + "45": 0.16192, + "46": 0.16633, + "47": 0.16308, + "48": 0.16007, + "49": 0.16464, + "50": 0.15794, + "51": 0.17113, + "52": 0.16522, + "53": 0.1626, + "54": 0.15774, + "55": 0.15957, + "56": 0.16666, + "57": 0.16407, + "58": 0.16282, + "59": 0.16402, + "60": 0.16235, + "61": 0.15906, + "62": 0.16273, + "63": 0.16172, + "64": 0.16219, + "65": 0.15545, + "66": 0.16335, + "67": 0.16169, + "68": 0.16503, + "69": 0.1641, + "70": 0.17009, + "71": 0.1546, + "72": 0.16631, + "73": 0.16013, + "74": 0.166, + "75": 0.1647, + "76": 0.15257, + "77": 0.16369, + "78": 0.156, + "79": 0.16228, + "80": 0.16107, + "81": 0.16212, + "82": 0.15365, + "83": 0.16258, + "84": 0.16459, + "85": 0.16137, + "86": 0.15549, + "87": 0.1627, + "88": 0.16309, + "89": 0.16008, + "90": 0.15864, + "91": 0.15894, + "92": 0.1647, + "93": 0.16045, + "94": 0.1601, + "95": 0.15909, + "96": 0.15624, + "97": 0.16592, + "98": 0.15827, + "99": 0.16214, + "100": 0.15589 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json index 34f7db22ade..ae1c2034cde 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81442, + "2": 10.81882, + "3": 10.81531, + "4": 10.80285, "5": 10.8513, + "6": 10.85015, + "7": 10.83865, + "8": 10.83952, + "9": 10.82187, "10": 10.77753, + "11": 10.86422, + "12": 10.83724, + "13": 10.85876, + "14": 10.86332, "15": 10.79795, + "16": 10.79507, + "17": 10.77121, + "18": 10.78932, + "19": 10.78375, "20": 10.71658, + "21": 10.68392, + "22": 10.53046, + "23": 10.69852, + "24": 10.58536, "25": 10.52392, + "26": 10.58331, + "27": 10.60949, + "28": 10.57165, + "29": 10.59009, "30": 10.35681, + "31": 10.09394, + "32": 10.45893, + "33": 10.45658, + "34": 10.20513, "35": 10.26714, + "36": 10.22334, + "37": 10.35301, + "38": 10.19469, + "39": 10.4172, "40": 10.08945, + "41": 10.12779, + "42": 10.21205, + "43": 9.83115, + "44": 9.9694, "45": 9.83605, + "46": 9.81694, + "47": 10.15399, + "48": 9.85315, + "49": 9.53452, "50": 9.91905, + "51": 9.85365, + "52": 9.74298, + "53": 10.07139, + "54": 9.96275, "55": 9.88234, + "56": 9.63465, + "57": 9.4865, + "58": 9.84855, + "59": 9.58914, "60": 9.5108, + "61": 9.70318, + "62": 9.99619, + "63": 9.40059, + "64": 9.78463, "65": 8.95371, + "66": 9.7179, + "67": 9.36926, + "68": 9.79814, + "69": 9.79668, "70": 9.74892, + "71": 9.63192, + "72": 9.59949, + "73": 9.50317, + "74": 8.9522, "75": 9.43106, + "76": 9.09064, + "77": 10.08076, + "78": 9.73534, + "79": 9.3887, "80": 9.41432, + "81": 9.48416, + "82": 9.7092, + "83": 9.31507, + "84": 9.41846, "85": 9.6224, + "86": 9.07938, + "87": 9.59206, + "88": 9.74951, + "89": 9.60449, "90": 9.82577, + "91": 9.34236, + "92": 9.35861, + "93": 9.07987, + "94": 8.82784, "95": 9.50868, + "96": 9.52112, + "97": 9.30601, + "98": 9.66582, + "99": 8.87718, "100": 9.38975 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 5476.0, + "2": 5726.0, + "3": 5820.0, + "4": 5738.0, "5": 6334.0, + "6": 6609.0, + "7": 5986.0, + "8": 5915.0, + "9": 6387.0, "10": 5090.0, + "11": 6596.0, + "12": 6165.0, + "13": 6559.0, + "14": 6568.0, "15": 6041.0, + "16": 6363.0, + "17": 6226.0, + "18": 5986.0, + "19": 6413.0, "20": 5738.0, + "21": 6248.0, + "22": 5765.0, + "23": 6895.0, + "24": 6096.0, "25": 5736.0, + "26": 6113.0, + "27": 6495.0, + "28": 6754.0, + "29": 7066.0, "30": 6254.0, + "31": 5809.0, + "32": 6893.0, + "33": 7278.0, + "34": 6486.0, "35": 6750.0, + "36": 6625.0, + "37": 7510.0, + "38": 7131.0, + "39": 7741.0, "40": 7222.0, + "41": 7096.0, + "42": 7656.0, + "43": 7205.0, + "44": 7138.0, "45": 7019.0, + "46": 7235.0, + "47": 7542.0, + "48": 7734.0, + "49": 7610.0, "50": 7710.0, + "51": 8076.0, + "52": 7867.0, + "53": 8874.0, + "54": 8747.0, "55": 7601.0, + "56": 7891.0, + "57": 7603.0, + "58": 8731.0, + "59": 8257.0, "60": 7964.0, + "61": 8450.0, + "62": 8632.0, + "63": 7806.0, + "64": 8923.0, "65": 8276.0, + "66": 9208.0, + "67": 8240.0, + "68": 8439.0, + "69": 8765.0, "70": 9578.0, + "71": 9145.0, + "72": 8894.0, + "73": 8946.0, + "74": 6930.0, "75": 7952.0, + "76": 8482.0, + "77": 12156.0, + "78": 9554.0, + "79": 12899.0, "80": 11642.0, + "81": 9977.0, + "82": 9786.0, + "83": 14238.0, + "84": 13757.0, "85": 46448.0, + "86": 9803.0, + "87": 14740.0, + "88": 9790.0, + "89": 10097.0, "90": 11246.0, + "91": 8938.0, + "92": 9088.0, + "93": 8203.0, + "94": 9445.0, "95": 9762.0, + "96": 47617.0, + "97": 8875.0, + "98": 11078.0, + "99": 15373.0, "100": 9275.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 628059136.0, + "2": 628060160.0, + "3": 628060160.0, + "4": 628060160.0, "5": 628060160.0, + "6": 628060160.0, + "7": 628060160.0, + "8": 628060160.0, + "9": 628060160.0, "10": 628060160.0, + "11": 628060160.0, + "12": 628060160.0, + "13": 628060160.0, + "14": 628060160.0, "15": 628060160.0, + "16": 628060160.0, + "17": 628060160.0, + "18": 628060160.0, + "19": 628060160.0, "20": 628060160.0, + "21": 628060160.0, + "22": 628060160.0, + "23": 628060160.0, + "24": 628060160.0, "25": 628060160.0, + "26": 628060160.0, + "27": 628060160.0, + "28": 628060160.0, + "29": 628060160.0, "30": 628060160.0, + "31": 628060160.0, + "32": 628060160.0, + "33": 628060160.0, + "34": 628060160.0, "35": 628060160.0, + "36": 628060160.0, + "37": 628060160.0, + "38": 628060160.0, + "39": 628060160.0, "40": 628060160.0, + "41": 628060160.0, + "42": 628060160.0, + "43": 628060160.0, + "44": 628060160.0, "45": 628060160.0, + "46": 628060160.0, + "47": 628060160.0, + "48": 628060160.0, + "49": 628060160.0, "50": 628060160.0, + "51": 628060160.0, + "52": 628060160.0, + "53": 628060160.0, + "54": 628060160.0, "55": 628060160.0, + "56": 628060160.0, + "57": 628060160.0, + "58": 628060160.0, + "59": 628060160.0, "60": 628060160.0, + "61": 628060160.0, + "62": 628060160.0, + "63": 628060160.0, + "64": 628060160.0, "65": 628060160.0, + "66": 628060160.0, + "67": 628060160.0, + "68": 628060160.0, + "69": 628060160.0, "70": 628060160.0, + "71": 628060160.0, + "72": 628060160.0, + "73": 628060160.0, + "74": 628060160.0, "75": 628060160.0, + "76": 628060160.0, + "77": 628060160.0, + "78": 628060160.0, + "79": 628060160.0, "80": 628060160.0, + "81": 628060160.0, + "82": 628060160.0, + "83": 628060160.0, + "84": 628060160.0, "85": 628060160.0, + "86": 628060160.0, + "87": 628060160.0, + "88": 628060160.0, + "89": 628060160.0, "90": 628060160.0, + "91": 628060160.0, + "92": 628060160.0, + "93": 628060160.0, + "94": 628060160.0, "95": 628060160.0, + "96": 628060160.0, + "97": 628060160.0, + "98": 628060160.0, + "99": 628060160.0, "100": 628060160.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 966226944.0, + "2": 1135178752.0, + "3": 1135178752.0, + "4": 1142161920.0, "5": 1142161920.0, + "6": 1142161920.0, + "7": 1142161920.0, + "8": 1142161920.0, + "9": 1142161920.0, "10": 1142161920.0, + "11": 1142161920.0, + "12": 1142161920.0, + "13": 1142161920.0, + "14": 1142161920.0, "15": 1142161920.0, + "16": 1142161920.0, + "17": 1142161920.0, + "18": 1142161920.0, + "19": 1142161920.0, "20": 1142161920.0, + "21": 1142161920.0, + "22": 1142161920.0, + "23": 1142161920.0, + "24": 1142161920.0, "25": 1142161920.0, + "26": 1142161920.0, + "27": 1142161920.0, + "28": 1142161920.0, + "29": 1142161920.0, "30": 1142161920.0, + "31": 1142161920.0, + "32": 1142161920.0, + "33": 1142161920.0, + "34": 1142161920.0, "35": 1142161920.0, + "36": 1142161920.0, + "37": 1142161920.0, + "38": 1142161920.0, + "39": 1142161920.0, "40": 1142161920.0, + "41": 1142161920.0, + "42": 1142161920.0, + "43": 1142161920.0, + "44": 1142161920.0, "45": 1142161920.0, + "46": 1142161920.0, + "47": 1142161920.0, + "48": 1142161920.0, + "49": 1142161920.0, "50": 1142161920.0, + "51": 1142161920.0, + "52": 1142161920.0, + "53": 1142161920.0, + "54": 1142161920.0, "55": 1142161920.0, + "56": 1142161920.0, + "57": 1142161920.0, + "58": 1142161920.0, + "59": 1142161920.0, "60": 1142161920.0, + "61": 1145419776.0, + "62": 1145419776.0, + "63": 1145419776.0, + "64": 1145419776.0, "65": 1145419776.0, + "66": 1145419776.0, + "67": 1145419776.0, + "68": 1145419776.0, + "69": 1145419776.0, "70": 1145419776.0, + "71": 1145419776.0, + "72": 1145419776.0, + "73": 1145419776.0, + "74": 1145419776.0, "75": 1145419776.0, + "76": 1149517312.0, + "77": 1149517312.0, + "78": 1149517312.0, + "79": 1149517312.0, "80": 1149517312.0, + "81": 1149517312.0, + "82": 1149517312.0, + "83": 1149517312.0, + "84": 1149517312.0, "85": 1149517312.0, + "86": 1149517312.0, + "87": 1149517312.0, + "88": 1149517312.0, + "89": 1149517312.0, "90": 1149517312.0, + "91": 1149517312.0, + "92": 1149517312.0, + "93": 1149517312.0, + "94": 1149517312.0, "95": 1149517312.0, + "96": 1149517312.0, + "97": 1149517312.0, + "98": 1149517312.0, + "99": 1149517312.0, "100": 1149517312.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 22.4417, - "5": 0.54127, - "10": 0.51699, - "15": 0.49577, - "20": 0.49101, - "25": 0.50704, - "30": 0.53551, - "35": 0.49875, - "40": 0.49003, - "45": 0.49309, - "50": 0.49843, - "55": 0.48281, - "60": 0.50246, - "65": 0.49261, - "70": 0.49745, - "75": 0.49851, - "80": 0.52914, - "85": 0.49531, - "90": 0.49632, - "95": 0.49182, - "100": 0.49317 + "1": 20.57901, + "2": 0.68043, + "3": 0.63562, + "4": 0.61398, + "5": 0.61337, + "6": 0.60234, + "7": 0.60862, + "8": 0.60734, + "9": 0.58969, + "10": 0.58747, + "11": 0.5811, + "12": 0.58339, + "13": 0.58104, + "14": 0.57128, + "15": 0.57144, + "16": 0.57507, + "17": 0.56755, + "18": 0.57095, + "19": 0.56394, + "20": 0.56491, + "21": 0.5641, + "22": 0.57257, + "23": 0.56993, + "24": 0.57313, + "25": 0.59644, + "26": 0.57728, + "27": 0.56326, + "28": 0.58965, + "29": 0.57459, + "30": 0.58292, + "31": 0.5611, + "32": 0.57216, + "33": 0.56117, + "34": 0.56648, + "35": 0.57301, + "36": 0.5682, + "37": 0.57344, + "38": 0.57412, + "39": 0.57266, + "40": 0.56976, + "41": 0.58248, + "42": 0.56977, + "43": 0.59296, + "44": 0.57825, + "45": 0.57205, + "46": 0.57416, + "47": 0.56382, + "48": 0.56705, + "49": 0.56054, + "50": 0.57803, + "51": 0.5794, + "52": 0.57311, + "53": 0.55689, + "54": 0.56928, + "55": 0.56498, + "56": 0.5793, + "57": 0.59551, + "58": 0.57445, + "59": 0.57266, + "60": 0.56772, + "61": 0.56341, + "62": 0.56683, + "63": 0.56161, + "64": 0.56821, + "65": 0.57696, + "66": 0.57433, + "67": 0.5584, + "68": 0.57566, + "69": 0.57071, + "70": 0.56326, + "71": 0.57066, + "72": 0.55601, + "73": 0.58093, + "74": 0.59092, + "75": 0.57258, + "76": 0.57145, + "77": 0.55748, + "78": 0.57398, + "79": 0.56823, + "80": 0.56858, + "81": 0.55889, + "82": 0.56474, + "83": 0.56681, + "84": 0.5624, + "85": 0.56593, + "86": 0.55528, + "87": 0.56493, + "88": 0.54955, + "89": 0.56961, + "90": 0.55961, + "91": 0.56585, + "92": 0.58153, + "93": 0.56914, + "94": 0.58194, + "95": 0.56106, + "96": 0.56571, + "97": 0.56072, + "98": 0.56686, + "99": 0.55834, + "100": 0.56357 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..fac0ec053dd --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81442, + "2": 10.81882, + "3": 10.81551, + "4": 10.80292, + "5": 10.85144, + "6": 10.85011, + "7": 10.83867, + "8": 10.83952, + "9": 10.82213, + "10": 10.77746, + "11": 10.86426, + "12": 10.83689, + "13": 10.85831, + "14": 10.86354, + "15": 10.79774, + "16": 10.79537, + "17": 10.77155, + "18": 10.78908, + "19": 10.78343, + "20": 10.71629, + "21": 10.6835, + "22": 10.53061, + "23": 10.69849, + "24": 10.58571, + "25": 10.52397, + "26": 10.58327, + "27": 10.60963, + "28": 10.57207, + "29": 10.59012, + "30": 10.35613, + "31": 10.09392, + "32": 10.45887, + "33": 10.45644, + "34": 10.20494, + "35": 10.26735, + "36": 10.22333, + "37": 10.35299, + "38": 10.19476, + "39": 10.41731, + "40": 10.08948, + "41": 10.12721, + "42": 10.21207, + "43": 9.8313, + "44": 9.96936, + "45": 9.83601, + "46": 9.81666, + "47": 10.1539, + "48": 9.85279, + "49": 9.53447, + "50": 9.91909, + "51": 9.85364, + "52": 9.74286, + "53": 10.07155, + "54": 9.96279, + "55": 9.88223, + "56": 9.63465, + "57": 9.48633, + "58": 9.84878, + "59": 9.58904, + "60": 9.51094, + "61": 9.7032, + "62": 9.99637, + "63": 9.40044, + "64": 9.78465, + "65": 8.95366, + "66": 9.71808, + "67": 9.36931, + "68": 9.79818, + "69": 9.79667, + "70": 9.74899, + "71": 9.63213, + "72": 9.59956, + "73": 9.50308, + "74": 8.95202, + "75": 9.43084, + "76": 9.09067, + "77": 10.08102, + "78": 9.73521, + "79": 9.38853, + "80": 9.41418, + "81": 9.48403, + "82": 9.70907, + "83": 9.3152, + "84": 9.41838, + "85": 9.62222, + "86": 9.07945, + "87": 9.59202, + "88": 9.74953, + "89": 9.60441, + "90": 9.82577, + "91": 9.34232, + "92": 9.35837, + "93": 9.07969, + "94": 8.82793, + "95": 9.50864, + "96": 9.52117, + "97": 9.30605, + "98": 9.6658, + "99": 8.87716, + "100": 9.38997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5488.0, + "2": 5704.0, + "3": 5788.0, + "4": 5853.0, + "5": 6401.0, + "6": 6686.0, + "7": 5949.0, + "8": 5811.0, + "9": 6280.0, + "10": 5192.0, + "11": 6645.0, + "12": 6193.0, + "13": 6525.0, + "14": 6487.0, + "15": 6258.0, + "16": 6261.0, + "17": 6080.0, + "18": 5901.0, + "19": 6228.0, + "20": 5713.0, + "21": 6265.0, + "22": 5788.0, + "23": 6618.0, + "24": 6159.0, + "25": 5674.0, + "26": 6218.0, + "27": 6180.0, + "28": 6802.0, + "29": 7006.0, + "30": 6195.0, + "31": 5847.0, + "32": 6680.0, + "33": 7327.0, + "34": 6433.0, + "35": 6593.0, + "36": 6717.0, + "37": 7545.0, + "38": 7130.0, + "39": 7928.0, + "40": 7233.0, + "41": 7093.0, + "42": 7653.0, + "43": 7136.0, + "44": 7113.0, + "45": 7167.0, + "46": 7435.0, + "47": 7501.0, + "48": 7648.0, + "49": 7520.0, + "50": 7701.0, + "51": 7847.0, + "52": 7828.0, + "53": 8765.0, + "54": 8799.0, + "55": 7683.0, + "56": 7972.0, + "57": 7642.0, + "58": 8419.0, + "59": 8276.0, + "60": 7917.0, + "61": 8598.0, + "62": 8394.0, + "63": 7896.0, + "64": 9047.0, + "65": 8280.0, + "66": 9315.0, + "67": 8277.0, + "68": 8341.0, + "69": 8737.0, + "70": 9764.0, + "71": 9050.0, + "72": 9036.0, + "73": 9076.0, + "74": 6969.0, + "75": 7833.0, + "76": 8450.0, + "77": 13505.0, + "78": 9634.0, + "79": 13982.0, + "80": 11548.0, + "81": 10035.0, + "82": 9732.0, + "83": 9037.0, + "84": 9522.0, + "85": 46479.0, + "86": 8626.0, + "87": 11964.0, + "88": 9637.0, + "89": 10273.0, + "90": 11256.0, + "91": 8811.0, + "92": 9218.0, + "93": 8281.0, + "94": 9390.0, + "95": 9376.0, + "96": 13248.0, + "97": 8945.0, + "98": 10682.0, + "99": 15485.0, + "100": 9101.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 628059136.0, + "2": 628060160.0, + "3": 628060160.0, + "4": 628060160.0, + "5": 628060160.0, + "6": 628060160.0, + "7": 628060160.0, + "8": 628060160.0, + "9": 628060160.0, + "10": 628060160.0, + "11": 628060160.0, + "12": 628060160.0, + "13": 628060160.0, + "14": 628060160.0, + "15": 628060160.0, + "16": 628060160.0, + "17": 628060160.0, + "18": 628060160.0, + "19": 628060160.0, + "20": 628060160.0, + "21": 628060160.0, + "22": 628060160.0, + "23": 628060160.0, + "24": 628060160.0, + "25": 628060160.0, + "26": 628060160.0, + "27": 628060160.0, + "28": 628060160.0, + "29": 628060160.0, + "30": 628060160.0, + "31": 628060160.0, + "32": 628060160.0, + "33": 628060160.0, + "34": 628060160.0, + "35": 628060160.0, + "36": 628060160.0, + "37": 628060160.0, + "38": 628060160.0, + "39": 628060160.0, + "40": 628060160.0, + "41": 628060160.0, + "42": 628060160.0, + "43": 628060160.0, + "44": 628060160.0, + "45": 628060160.0, + "46": 628060160.0, + "47": 628060160.0, + "48": 628060160.0, + "49": 628060160.0, + "50": 628060160.0, + "51": 628060160.0, + "52": 628060160.0, + "53": 628060160.0, + "54": 628060160.0, + "55": 628060160.0, + "56": 628060160.0, + "57": 628060160.0, + "58": 628060160.0, + "59": 628060160.0, + "60": 628060160.0, + "61": 628060160.0, + "62": 628060160.0, + "63": 628060160.0, + "64": 628060160.0, + "65": 628060160.0, + "66": 628060160.0, + "67": 628060160.0, + "68": 628060160.0, + "69": 628060160.0, + "70": 628060160.0, + "71": 628060160.0, + "72": 628060160.0, + "73": 628060160.0, + "74": 628060160.0, + "75": 628060160.0, + "76": 628060160.0, + "77": 628060160.0, + "78": 628060160.0, + "79": 628060160.0, + "80": 628060160.0, + "81": 628060160.0, + "82": 628060160.0, + "83": 628060160.0, + "84": 628060160.0, + "85": 628060160.0, + "86": 628060160.0, + "87": 628060160.0, + "88": 628060160.0, + "89": 628060160.0, + "90": 628060160.0, + "91": 628060160.0, + "92": 628060160.0, + "93": 628060160.0, + "94": 628060160.0, + "95": 628060160.0, + "96": 628060160.0, + "97": 628060160.0, + "98": 628060160.0, + "99": 628060160.0, + "100": 628060160.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 966226944.0, + "2": 1135178752.0, + "3": 1135178752.0, + "4": 1142154752.0, + "5": 1142154752.0, + "6": 1142154752.0, + "7": 1142154752.0, + "8": 1142154752.0, + "9": 1142154752.0, + "10": 1142154752.0, + "11": 1142154752.0, + "12": 1142154752.0, + "13": 1142154752.0, + "14": 1142154752.0, + "15": 1142154752.0, + "16": 1142154752.0, + "17": 1142154752.0, + "18": 1142154752.0, + "19": 1142154752.0, + "20": 1142154752.0, + "21": 1142154752.0, + "22": 1142154752.0, + "23": 1142154752.0, + "24": 1142154752.0, + "25": 1142154752.0, + "26": 1142154752.0, + "27": 1142154752.0, + "28": 1142154752.0, + "29": 1142154752.0, + "30": 1142154752.0, + "31": 1142154752.0, + "32": 1142154752.0, + "33": 1142154752.0, + "34": 1142154752.0, + "35": 1142154752.0, + "36": 1142154752.0, + "37": 1142154752.0, + "38": 1142154752.0, + "39": 1142154752.0, + "40": 1142154752.0, + "41": 1142154752.0, + "42": 1142154752.0, + "43": 1142154752.0, + "44": 1142154752.0, + "45": 1142154752.0, + "46": 1142154752.0, + "47": 1142154752.0, + "48": 1142154752.0, + "49": 1142154752.0, + "50": 1142154752.0, + "51": 1142154752.0, + "52": 1142154752.0, + "53": 1142154752.0, + "54": 1142154752.0, + "55": 1142154752.0, + "56": 1142154752.0, + "57": 1142154752.0, + "58": 1142154752.0, + "59": 1142154752.0, + "60": 1142154752.0, + "61": 1145444352.0, + "62": 1145444352.0, + "63": 1145444352.0, + "64": 1145444352.0, + "65": 1145444352.0, + "66": 1145444352.0, + "67": 1145444352.0, + "68": 1145444352.0, + "69": 1145444352.0, + "70": 1145444352.0, + "71": 1145444352.0, + "72": 1145444352.0, + "73": 1145444352.0, + "74": 1145444352.0, + "75": 1145444352.0, + "76": 1149560320.0, + "77": 1149560320.0, + "78": 1149560320.0, + "79": 1149560320.0, + "80": 1149560320.0, + "81": 1149560320.0, + "82": 1149560320.0, + "83": 1149560320.0, + "84": 1149560320.0, + "85": 1149560320.0, + "86": 1149560320.0, + "87": 1149560320.0, + "88": 1149560320.0, + "89": 1149560320.0, + "90": 1149560320.0, + "91": 1149560320.0, + "92": 1149560320.0, + "93": 1149560320.0, + "94": 1149560320.0, + "95": 1149560320.0, + "96": 1149560320.0, + "97": 1149560320.0, + "98": 1149560320.0, + "99": 1149560320.0, + "100": 1149560320.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 22.49159, + "2": 0.64465, + "3": 0.55144, + "4": 0.54612, + "5": 0.54224, + "6": 0.53272, + "7": 0.53156, + "8": 0.52769, + "9": 0.51643, + "10": 0.51904, + "11": 0.51365, + "12": 0.51064, + "13": 0.5046, + "14": 0.50595, + "15": 0.49656, + "16": 0.51295, + "17": 0.49558, + "18": 0.50544, + "19": 0.49807, + "20": 0.50213, + "21": 0.50583, + "22": 0.52086, + "23": 0.51086, + "24": 0.50937, + "25": 0.5124, + "26": 0.51291, + "27": 0.52068, + "28": 0.54211, + "29": 0.52886, + "30": 0.52175, + "31": 0.51586, + "32": 0.5142, + "33": 0.49143, + "34": 0.49103, + "35": 0.49405, + "36": 0.49048, + "37": 0.48575, + "38": 0.49941, + "39": 0.50795, + "40": 0.51375, + "41": 0.49293, + "42": 0.48855, + "43": 0.5029, + "44": 0.49021, + "45": 0.50044, + "46": 0.4959, + "47": 0.49439, + "48": 0.48796, + "49": 0.48244, + "50": 0.50689, + "51": 0.53388, + "52": 0.49313, + "53": 0.50127, + "54": 0.50696, + "55": 0.50505, + "56": 0.50751, + "57": 0.50921, + "58": 0.49608, + "59": 0.49342, + "60": 0.49604, + "61": 0.49149, + "62": 0.48784, + "63": 0.48712, + "64": 0.48464, + "65": 0.51125, + "66": 0.48673, + "67": 0.48738, + "68": 0.48812, + "69": 0.4924, + "70": 0.48944, + "71": 0.48906, + "72": 0.48542, + "73": 0.50073, + "74": 0.49165, + "75": 0.48855, + "76": 0.49114, + "77": 0.49358, + "78": 0.48743, + "79": 0.49072, + "80": 0.48515, + "81": 0.48089, + "82": 0.48965, + "83": 0.49061, + "84": 0.48204, + "85": 0.46988, + "86": 0.49418, + "87": 0.48287, + "88": 0.47854, + "89": 0.48256, + "90": 0.48294, + "91": 0.4982, + "92": 0.48423, + "93": 0.47976, + "94": 0.48336, + "95": 0.47914, + "96": 0.71379, + "97": 1.04054, + "98": 3.57564, + "99": 4.591, + "100": 0.98086 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..68b72267704 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81442, + "2": 10.81882, + "3": 10.81551, + "4": 10.80292, + "5": 10.85144, + "6": 10.85011, + "7": 10.83867, + "8": 10.83952, + "9": 10.82213, + "10": 10.77746, + "11": 10.86426, + "12": 10.83689, + "13": 10.85831, + "14": 10.86354, + "15": 10.79774, + "16": 10.79537, + "17": 10.77155, + "18": 10.78908, + "19": 10.78343, + "20": 10.71629, + "21": 10.6835, + "22": 10.53061, + "23": 10.69849, + "24": 10.58571, + "25": 10.52397, + "26": 10.58327, + "27": 10.60963, + "28": 10.57207, + "29": 10.59012, + "30": 10.35613, + "31": 10.09392, + "32": 10.45887, + "33": 10.45644, + "34": 10.20494, + "35": 10.26735, + "36": 10.22333, + "37": 10.35299, + "38": 10.19476, + "39": 10.41731, + "40": 10.08948, + "41": 10.12721, + "42": 10.21207, + "43": 9.8313, + "44": 9.96936, + "45": 9.83601, + "46": 9.81666, + "47": 10.1539, + "48": 9.85279, + "49": 9.53447, + "50": 9.91909, + "51": 9.85364, + "52": 9.74286, + "53": 10.07155, + "54": 9.96279, + "55": 9.88223, + "56": 9.63465, + "57": 9.48633, + "58": 9.84878, + "59": 9.58904, + "60": 9.51094, + "61": 9.7032, + "62": 9.99637, + "63": 9.40044, + "64": 9.78465, + "65": 8.95366, + "66": 9.71808, + "67": 9.36931, + "68": 9.79818, + "69": 9.79667, + "70": 9.74899, + "71": 9.63213, + "72": 9.59956, + "73": 9.50308, + "74": 8.95202, + "75": 9.43084, + "76": 9.09067, + "77": 10.08102, + "78": 9.73521, + "79": 9.38853, + "80": 9.41418, + "81": 9.48403, + "82": 9.70907, + "83": 9.3152, + "84": 9.41838, + "85": 9.62222, + "86": 9.07945, + "87": 9.59202, + "88": 9.74953, + "89": 9.60441, + "90": 9.82577, + "91": 9.34232, + "92": 9.35837, + "93": 9.07969, + "94": 8.82793, + "95": 9.50864, + "96": 9.52117, + "97": 9.30605, + "98": 9.6658, + "99": 8.87716, + "100": 9.38997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5488.0, + "2": 5704.0, + "3": 5788.0, + "4": 5853.0, + "5": 6401.0, + "6": 6686.0, + "7": 5949.0, + "8": 5811.0, + "9": 6280.0, + "10": 5192.0, + "11": 6645.0, + "12": 6193.0, + "13": 6525.0, + "14": 6487.0, + "15": 6258.0, + "16": 6261.0, + "17": 6080.0, + "18": 5901.0, + "19": 6228.0, + "20": 5713.0, + "21": 6265.0, + "22": 5788.0, + "23": 6618.0, + "24": 6159.0, + "25": 5674.0, + "26": 6218.0, + "27": 6180.0, + "28": 6802.0, + "29": 7006.0, + "30": 6195.0, + "31": 5847.0, + "32": 6680.0, + "33": 7327.0, + "34": 6433.0, + "35": 6593.0, + "36": 6717.0, + "37": 7545.0, + "38": 7130.0, + "39": 7928.0, + "40": 7233.0, + "41": 7093.0, + "42": 7653.0, + "43": 7136.0, + "44": 7113.0, + "45": 7167.0, + "46": 7435.0, + "47": 7501.0, + "48": 7648.0, + "49": 7520.0, + "50": 7701.0, + "51": 7847.0, + "52": 7828.0, + "53": 8765.0, + "54": 8799.0, + "55": 7683.0, + "56": 7972.0, + "57": 7642.0, + "58": 8419.0, + "59": 8276.0, + "60": 7917.0, + "61": 8598.0, + "62": 8394.0, + "63": 7896.0, + "64": 9047.0, + "65": 8280.0, + "66": 9315.0, + "67": 8277.0, + "68": 8341.0, + "69": 8737.0, + "70": 9764.0, + "71": 9050.0, + "72": 9036.0, + "73": 9076.0, + "74": 6969.0, + "75": 7833.0, + "76": 8450.0, + "77": 13505.0, + "78": 9634.0, + "79": 13982.0, + "80": 11548.0, + "81": 10035.0, + "82": 9732.0, + "83": 9037.0, + "84": 9522.0, + "85": 46479.0, + "86": 8626.0, + "87": 11964.0, + "88": 9637.0, + "89": 10273.0, + "90": 11256.0, + "91": 8811.0, + "92": 9218.0, + "93": 8281.0, + "94": 9390.0, + "95": 9376.0, + "96": 13248.0, + "97": 8945.0, + "98": 10682.0, + "99": 15485.0, + "100": 9101.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 628059136.0, + "2": 628060160.0, + "3": 628060160.0, + "4": 628060160.0, + "5": 628060160.0, + "6": 628060160.0, + "7": 628060160.0, + "8": 628060160.0, + "9": 628060160.0, + "10": 628060160.0, + "11": 628060160.0, + "12": 628060160.0, + "13": 628060160.0, + "14": 628060160.0, + "15": 628060160.0, + "16": 628060160.0, + "17": 628060160.0, + "18": 628060160.0, + "19": 628060160.0, + "20": 628060160.0, + "21": 628060160.0, + "22": 628060160.0, + "23": 628060160.0, + "24": 628060160.0, + "25": 628060160.0, + "26": 628060160.0, + "27": 628060160.0, + "28": 628060160.0, + "29": 628060160.0, + "30": 628060160.0, + "31": 628060160.0, + "32": 628060160.0, + "33": 628060160.0, + "34": 628060160.0, + "35": 628060160.0, + "36": 628060160.0, + "37": 628060160.0, + "38": 628060160.0, + "39": 628060160.0, + "40": 628060160.0, + "41": 628060160.0, + "42": 628060160.0, + "43": 628060160.0, + "44": 628060160.0, + "45": 628060160.0, + "46": 628060160.0, + "47": 628060160.0, + "48": 628060160.0, + "49": 628060160.0, + "50": 628060160.0, + "51": 628060160.0, + "52": 628060160.0, + "53": 628060160.0, + "54": 628060160.0, + "55": 628060160.0, + "56": 628060160.0, + "57": 628060160.0, + "58": 628060160.0, + "59": 628060160.0, + "60": 628060160.0, + "61": 628060160.0, + "62": 628060160.0, + "63": 628060160.0, + "64": 628060160.0, + "65": 628060160.0, + "66": 628060160.0, + "67": 628060160.0, + "68": 628060160.0, + "69": 628060160.0, + "70": 628060160.0, + "71": 628060160.0, + "72": 628060160.0, + "73": 628060160.0, + "74": 628060160.0, + "75": 628060160.0, + "76": 628060160.0, + "77": 628060160.0, + "78": 628060160.0, + "79": 628060160.0, + "80": 628060160.0, + "81": 628060160.0, + "82": 628060160.0, + "83": 628060160.0, + "84": 628060160.0, + "85": 628060160.0, + "86": 628060160.0, + "87": 628060160.0, + "88": 628060160.0, + "89": 628060160.0, + "90": 628060160.0, + "91": 628060160.0, + "92": 628060160.0, + "93": 628060160.0, + "94": 628060160.0, + "95": 628060160.0, + "96": 628060160.0, + "97": 628060160.0, + "98": 628060160.0, + "99": 628060160.0, + "100": 628060160.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 966226944.0, + "2": 1135178752.0, + "3": 1135178752.0, + "4": 1142154752.0, + "5": 1142154752.0, + "6": 1142154752.0, + "7": 1142154752.0, + "8": 1142154752.0, + "9": 1142154752.0, + "10": 1142154752.0, + "11": 1142154752.0, + "12": 1142154752.0, + "13": 1142154752.0, + "14": 1142154752.0, + "15": 1142154752.0, + "16": 1142154752.0, + "17": 1142154752.0, + "18": 1142154752.0, + "19": 1142154752.0, + "20": 1142154752.0, + "21": 1142154752.0, + "22": 1142154752.0, + "23": 1142154752.0, + "24": 1142154752.0, + "25": 1142154752.0, + "26": 1142154752.0, + "27": 1142154752.0, + "28": 1142154752.0, + "29": 1142154752.0, + "30": 1142154752.0, + "31": 1142154752.0, + "32": 1142154752.0, + "33": 1142154752.0, + "34": 1142154752.0, + "35": 1142154752.0, + "36": 1142154752.0, + "37": 1142154752.0, + "38": 1142154752.0, + "39": 1142154752.0, + "40": 1142154752.0, + "41": 1142154752.0, + "42": 1142154752.0, + "43": 1142154752.0, + "44": 1142154752.0, + "45": 1142154752.0, + "46": 1142154752.0, + "47": 1142154752.0, + "48": 1142154752.0, + "49": 1142154752.0, + "50": 1142154752.0, + "51": 1142154752.0, + "52": 1142154752.0, + "53": 1142154752.0, + "54": 1142154752.0, + "55": 1142154752.0, + "56": 1142154752.0, + "57": 1142154752.0, + "58": 1142154752.0, + "59": 1142154752.0, + "60": 1142154752.0, + "61": 1145444352.0, + "62": 1145444352.0, + "63": 1145444352.0, + "64": 1145444352.0, + "65": 1145444352.0, + "66": 1145444352.0, + "67": 1145444352.0, + "68": 1145444352.0, + "69": 1145444352.0, + "70": 1145444352.0, + "71": 1145444352.0, + "72": 1145444352.0, + "73": 1145444352.0, + "74": 1145444352.0, + "75": 1145444352.0, + "76": 1149560320.0, + "77": 1149560320.0, + "78": 1149560320.0, + "79": 1149560320.0, + "80": 1149560320.0, + "81": 1149560320.0, + "82": 1149560320.0, + "83": 1149560320.0, + "84": 1149560320.0, + "85": 1149560320.0, + "86": 1149560320.0, + "87": 1149560320.0, + "88": 1149560320.0, + "89": 1149560320.0, + "90": 1149560320.0, + "91": 1149560320.0, + "92": 1149560320.0, + "93": 1149560320.0, + "94": 1149560320.0, + "95": 1149560320.0, + "96": 1149560320.0, + "97": 1149560320.0, + "98": 1149560320.0, + "99": 1149560320.0, + "100": 1149560320.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 20.38736, + "2": 0.68138, + "3": 0.62881, + "4": 0.61692, + "5": 0.61365, + "6": 0.60735, + "7": 0.60006, + "8": 0.59897, + "9": 0.59763, + "10": 0.6122, + "11": 0.59106, + "12": 0.59749, + "13": 0.60001, + "14": 0.58446, + "15": 0.57929, + "16": 0.58508, + "17": 0.5725, + "18": 0.57386, + "19": 0.57617, + "20": 0.57081, + "21": 0.57614, + "22": 0.57046, + "23": 0.57731, + "24": 0.56893, + "25": 0.58004, + "26": 0.56911, + "27": 0.60575, + "28": 0.61474, + "29": 0.58874, + "30": 0.57969, + "31": 0.57737, + "32": 0.58556, + "33": 0.5704, + "34": 0.57592, + "35": 0.58241, + "36": 0.57697, + "37": 0.57978, + "38": 0.57647, + "39": 0.56977, + "40": 0.58017, + "41": 0.57153, + "42": 0.57267, + "43": 0.5881, + "44": 0.57211, + "45": 0.59552, + "46": 0.56308, + "47": 0.5736, + "48": 0.58403, + "49": 0.57693, + "50": 0.57016, + "51": 0.57233, + "52": 0.55871, + "53": 0.5593, + "54": 0.55755, + "55": 0.56057, + "56": 0.56649, + "57": 0.56057, + "58": 0.56658, + "59": 0.55825, + "60": 0.57038, + "61": 0.5563, + "62": 0.56031, + "63": 0.56901, + "64": 0.56097, + "65": 0.56153, + "66": 0.56761, + "67": 0.5785, + "68": 0.57341, + "69": 0.57139, + "70": 0.56231, + "71": 0.55874, + "72": 0.55834, + "73": 0.55824, + "74": 0.5552, + "75": 0.5593, + "76": 0.56038, + "77": 0.56527, + "78": 0.56728, + "79": 0.56424, + "80": 0.55564, + "81": 0.55955, + "82": 0.55867, + "83": 0.56254, + "84": 0.55754, + "85": 0.55409, + "86": 0.55901, + "87": 0.55904, + "88": 0.57097, + "89": 0.5735, + "90": 0.55808, + "91": 0.55819, + "92": 0.58224, + "93": 0.55845, + "94": 0.56512, + "95": 0.5709, + "96": 0.56099, + "97": 0.56779, + "98": 0.55446, + "99": 0.56053, + "100": 0.56338 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 2f5cb0af999..73cf979651d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.81565, - "5": 10.83826, - "10": 10.79021, - "15": 10.80531, - "20": 10.74643, - "25": 10.57512, - "30": 10.44697, - "35": 10.33173, - "40": 10.19856, - "45": 9.94354, - "50": 10.00316, - "55": 9.96304, - "60": 9.60428, - "65": 9.02427, - "70": 9.81034, - "75": 9.50548, - "80": 9.46755, - "85": 9.67934, - "90": 9.85571, - "95": 9.56508, - "100": 9.45426 + "2": 10.81048, + "3": 10.81233, + "4": 10.79117, + "5": 10.83746, + "6": 10.85118, + "7": 10.82091, + "8": 10.82093, + "9": 10.8306, + "10": 10.78973, + "11": 10.86282, + "12": 10.84288, + "13": 10.85757, + "14": 10.86228, + "15": 10.80658, + "16": 10.80321, + "17": 10.77911, + "18": 10.80744, + "19": 10.79401, + "20": 10.7468, + "21": 10.72178, + "22": 10.58777, + "23": 10.72976, + "24": 10.63294, + "25": 10.57502, + "26": 10.63703, + "27": 10.65005, + "28": 10.63549, + "29": 10.64376, + "30": 10.44681, + "31": 10.1944, + "32": 10.52431, + "33": 10.51785, + "34": 10.28836, + "35": 10.33178, + "36": 10.31279, + "37": 10.42677, + "38": 10.27938, + "39": 10.47551, + "40": 10.19739, + "41": 10.21538, + "42": 10.28746, + "43": 9.94274, + "44": 10.05688, + "45": 9.94329, + "46": 9.90894, + "47": 10.21235, + "48": 9.95052, + "49": 9.63658, + "50": 10.00313, + "51": 9.92286, + "52": 9.82764, + "53": 10.14637, + "54": 10.0431, + "55": 9.9628, + "56": 9.70471, + "57": 9.58557, + "58": 9.91688, + "59": 9.66027, + "60": 9.60417, + "61": 9.77863, + "62": 10.06255, + "63": 9.47237, + "64": 9.85394, + "65": 9.02479, + "66": 9.79388, + "67": 9.43332, + "68": 9.85348, + "69": 9.84692, + "70": 9.81038, + "71": 9.68427, + "72": 9.6602, + "73": 9.57277, + "74": 9.05997, + "75": 9.50545, + "76": 9.17937, + "77": 10.12733, + "78": 9.77455, + "79": 9.44211, + "80": 9.46753, + "81": 9.53839, + "82": 9.75754, + "83": 9.38711, + "84": 9.46669, + "85": 9.67912, + "86": 9.13537, + "87": 9.63456, + "88": 9.80822, + "89": 9.67886, + "90": 9.8558, + "91": 9.41297, + "92": 9.41787, + "93": 9.15369, + "94": 8.90217, + "95": 9.56536, + "96": 9.58437, + "97": 9.35832, + "98": 9.73042, + "99": 8.9586, + "100": 9.454 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 30837.0, - "5": 36033.0, - "10": 29790.0, - "15": 34550.0, - "20": 32683.0, - "25": 30957.0, - "30": 32603.0, - "35": 34043.0, - "40": 35657.0, - "45": 35490.0, - "50": 38984.0, - "55": 36972.0, - "60": 39721.0, - "65": 40930.0, - "70": 45588.0, - "75": 38781.0, - "80": 46737.0, - "85": 49087.0, - "90": 49441.0, - "95": 46735.0, - "100": 43962.0 + "1": 31083.0, + "2": 32874.0, + "3": 33614.0, + "4": 30796.0, + "5": 35950.0, + "6": 37383.0, + "7": 35302.0, + "8": 31308.0, + "9": 34522.0, + "10": 29757.0, + "11": 38942.0, + "12": 34991.0, + "13": 37045.0, + "14": 37494.0, + "15": 34692.0, + "16": 36080.0, + "17": 35060.0, + "18": 34989.0, + "19": 36144.0, + "20": 32462.0, + "21": 33369.0, + "22": 29795.0, + "23": 37622.0, + "24": 32511.0, + "25": 31055.0, + "26": 34301.0, + "27": 36030.0, + "28": 36741.0, + "29": 38257.0, + "30": 32928.0, + "31": 30048.0, + "32": 36406.0, + "33": 37595.0, + "34": 32918.0, + "35": 33986.0, + "36": 35154.0, + "37": 37803.0, + "38": 35542.0, + "39": 39006.0, + "40": 35753.0, + "41": 35748.0, + "42": 37390.0, + "43": 34087.0, + "44": 33554.0, + "45": 35464.0, + "46": 37091.0, + "47": 40542.0, + "48": 36522.0, + "49": 36534.0, + "50": 38785.0, + "51": 37126.0, + "52": 36939.0, + "53": 41763.0, + "54": 41138.0, + "55": 37048.0, + "56": 40483.0, + "57": 36998.0, + "58": 41877.0, + "59": 39208.0, + "60": 40087.0, + "61": 40325.0, + "62": 44268.0, + "63": 38629.0, + "64": 43656.0, + "65": 40940.0, + "66": 44302.0, + "67": 40075.0, + "68": 40632.0, + "69": 40527.0, + "70": 45260.0, + "71": 41111.0, + "72": 40161.0, + "73": 44972.0, + "74": 34095.0, + "75": 38490.0, + "76": 46162.0, + "77": 46055.0, + "78": 46750.0, + "79": 47560.0, + "80": 46440.0, + "81": 49629.0, + "82": 49227.0, + "83": 44834.0, + "84": 45877.0, + "85": 49064.0, + "86": 45232.0, + "87": 49124.0, + "88": 46347.0, + "89": 48837.0, + "90": 49499.0, + "91": 44289.0, + "92": 47277.0, + "93": 46847.0, + "94": 46311.0, + "95": 47245.0, + "96": 50336.0, + "97": 47016.0, + "98": 49606.0, + "99": 47799.0, + "100": 43700.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1016564224.0, - "5": 1016564224.0, - "10": 1016563712.0, + "2": 1016563712.0, + "3": 1016564224.0, + "4": 1016563712.0, + "5": 1016564736.0, + "6": 1016565248.0, + "7": 1016564736.0, + "8": 1016565248.0, + "9": 1016562688.0, + "10": 1016564736.0, + "11": 1016562176.0, + "12": 1016564224.0, + "13": 1016563200.0, + "14": 1016563712.0, "15": 1016564736.0, - "20": 1016563200.0, - "25": 1016564736.0, + "16": 1016562688.0, + "17": 1016565248.0, + "18": 1016564736.0, + "19": 1016563200.0, + "20": 1016563712.0, + "21": 1016564224.0, + "22": 1016564736.0, + "23": 1016564736.0, + "24": 1016563200.0, + "25": 1016565248.0, + "26": 1016562176.0, + "27": 1016562688.0, + "28": 1016562176.0, + "29": 1016562688.0, "30": 1016566784.0, + "31": 1016569344.0, + "32": 1016565248.0, + "33": 1016564736.0, + "34": 1016565248.0, "35": 1016565248.0, - "40": 1016564224.0, - "45": 1016565760.0, - "50": 1016565760.0, - "55": 1016569856.0, - "60": 1017439232.0, + "36": 1016565760.0, + "37": 1016564736.0, + "38": 1016564224.0, + "39": 1016562688.0, + "40": 1016945152.0, + "41": 1016567808.0, + "42": 1016564224.0, + "43": 1016568320.0, + "44": 1016565760.0, + "45": 1016565248.0, + "46": 1016569344.0, + "47": 1016564224.0, + "48": 1016569856.0, + "49": 1017010688.0, + "50": 1016567296.0, + "51": 1016566272.0, + "52": 1016575488.0, + "53": 1016568320.0, + "54": 1016567296.0, + "55": 1016569344.0, + "56": 1016565248.0, + "57": 1016575488.0, + "58": 1016569856.0, + "59": 1016574976.0, + "60": 1016571392.0, + "61": 1016567808.0, + "62": 1016566272.0, + "63": 1016576512.0, + "64": 1016572416.0, "65": 1016584192.0, + "66": 1016569344.0, + "67": 1016570368.0, + "68": 1016566272.0, + "69": 1016570880.0, "70": 1016569344.0, + "71": 1016566784.0, + "72": 1016915968.0, + "73": 1016572928.0, + "74": 1016577536.0, "75": 1016567296.0, - "80": 1016572416.0, - "85": 1016575488.0, - "90": 1016569344.0, - "95": 1016568320.0, - "100": 1016573440.0 + "76": 1016565760.0, + "77": 1016567296.0, + "78": 1016572928.0, + "79": 1016569344.0, + "80": 1016572928.0, + "81": 1016569856.0, + "82": 1016572416.0, + "83": 1016568832.0, + "84": 1016573440.0, + "85": 1016574976.0, + "86": 1016574976.0, + "87": 1016568832.0, + "88": 1016571904.0, + "89": 1016578048.0, + "90": 1016568832.0, + "91": 1016566784.0, + "92": 1016566784.0, + "93": 1016570368.0, + "94": 1016571904.0, + "95": 1016567808.0, + "96": 1016566784.0, + "97": 1016573440.0, + "98": 1016566272.0, + "99": 1016578048.0, + "100": 1016573952.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2560655872.0, - "5": 2828328960.0, - "10": 2829020160.0, - "15": 2829965312.0, - "20": 2830715392.0, - "25": 2830928896.0, - "30": 2831643648.0, - "35": 2836671488.0, - "40": 2836671488.0, - "45": 2836671488.0, - "50": 2838087680.0, - "55": 2843240960.0, - "60": 2844131328.0, - "65": 2859044864.0, - "70": 2859044864.0, - "75": 2859044864.0, - "80": 2859044864.0, - "85": 2859044864.0, - "90": 2859044864.0, - "95": 2859044864.0, - "100": 2859044864.0 + "2": 2827037696.0, + "3": 2827377152.0, + "4": 2827377152.0, + "5": 2827506688.0, + "6": 2827618816.0, + "7": 2828691456.0, + "8": 2828691456.0, + "9": 2828691456.0, + "10": 2828691456.0, + "11": 2828691456.0, + "12": 2828691456.0, + "13": 2828691456.0, + "14": 2828691456.0, + "15": 2829756416.0, + "16": 2829756416.0, + "17": 2830923264.0, + "18": 2830923264.0, + "19": 2830923264.0, + "20": 2830923264.0, + "21": 2830923264.0, + "22": 2830923264.0, + "23": 2830923264.0, + "24": 2830923264.0, + "25": 2830923264.0, + "26": 2830923264.0, + "27": 2830923264.0, + "28": 2830923264.0, + "29": 2830923264.0, + "30": 2833604608.0, + "31": 2833604608.0, + "32": 2833604608.0, + "33": 2833604608.0, + "34": 2833604608.0, + "35": 2833604608.0, + "36": 2833604608.0, + "37": 2833604608.0, + "38": 2833604608.0, + "39": 2833604608.0, + "40": 2833604608.0, + "41": 2835652608.0, + "42": 2835652608.0, + "43": 2835652608.0, + "44": 2835652608.0, + "45": 2835652608.0, + "46": 2836792832.0, + "47": 2836792832.0, + "48": 2837318656.0, + "49": 2837318656.0, + "50": 2837318656.0, + "51": 2837318656.0, + "52": 2841922048.0, + "53": 2841922048.0, + "54": 2841922048.0, + "55": 2841922048.0, + "56": 2844188672.0, + "57": 2847232512.0, + "58": 2847232512.0, + "59": 2847232512.0, + "60": 2847232512.0, + "61": 2847232512.0, + "62": 2847232512.0, + "63": 2847301120.0, + "64": 2847301120.0, + "65": 2858460160.0, + "66": 2858460160.0, + "67": 2858460160.0, + "68": 2858460160.0, + "69": 2858460160.0, + "70": 2858460160.0, + "71": 2858460160.0, + "72": 2858460160.0, + "73": 2858460160.0, + "74": 2858460160.0, + "75": 2858460160.0, + "76": 2858460160.0, + "77": 2858460160.0, + "78": 2858460160.0, + "79": 2858460160.0, + "80": 2858460160.0, + "81": 2858460160.0, + "82": 2858460160.0, + "83": 2858460160.0, + "84": 2858460160.0, + "85": 2858460160.0, + "86": 2858460160.0, + "87": 2858460160.0, + "88": 2858460160.0, + "89": 2858460160.0, + "90": 2858460160.0, + "91": 2858460160.0, + "92": 2858460160.0, + "93": 2858460160.0, + "94": 2858460160.0, + "95": 2858460160.0, + "96": 2858460160.0, + "97": 2858460160.0, + "98": 2858460160.0, + "99": 2858460160.0, + "100": 2858460160.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 18.86394, - "5": 0.16112, - "10": 0.15425, - "15": 0.15762, - "20": 0.14093, - "25": 0.14225, - "30": 0.14726, - "35": 0.14414, - "40": 0.15356, - "45": 0.14839, - "50": 0.15508, - "55": 0.15077, - "60": 0.17983, - "65": 0.2249, - "70": 0.15318, - "75": 0.15837, - "80": 0.17114, - "85": 0.14811, - "90": 0.14827, - "95": 0.15176, - "100": 0.14608 + "1": 17.04363, + "2": 0.27177, + "3": 0.19697, + "4": 0.20207, + "5": 0.17488, + "6": 0.1736, + "7": 0.18134, + "8": 0.17934, + "9": 0.17175, + "10": 0.16904, + "11": 0.17256, + "12": 0.16161, + "13": 0.166, + "14": 0.16567, + "15": 0.18106, + "16": 0.16499, + "17": 0.17792, + "18": 0.16846, + "19": 0.16132, + "20": 0.16075, + "21": 0.163, + "22": 0.17697, + "23": 0.16348, + "24": 0.16046, + "25": 0.16003, + "26": 0.16209, + "27": 0.16858, + "28": 0.16512, + "29": 0.15718, + "30": 0.17279, + "31": 0.20344, + "32": 0.17311, + "33": 0.1614, + "34": 0.18789, + "35": 0.16679, + "36": 0.16768, + "37": 0.15911, + "38": 0.16709, + "39": 0.16032, + "40": 0.18009, + "41": 0.16959, + "42": 0.16653, + "43": 0.17964, + "44": 0.1656, + "45": 0.16422, + "46": 0.18029, + "47": 0.16168, + "48": 0.19024, + "49": 0.22183, + "50": 0.16427, + "51": 0.17603, + "52": 0.17568, + "53": 0.16571, + "54": 0.16402, + "55": 0.17797, + "56": 0.22204, + "57": 0.17949, + "58": 0.1779, + "59": 0.18785, + "60": 0.1904, + "61": 0.1671, + "62": 0.17396, + "63": 0.17822, + "64": 0.17482, + "65": 0.24849, + "66": 0.17181, + "67": 0.23022, + "68": 0.19374, + "69": 0.17091, + "70": 0.17566, + "71": 0.19661, + "72": 0.17367, + "73": 0.21284, + "74": 0.19024, + "75": 0.18071, + "76": 0.20274, + "77": 0.17462, + "78": 0.18216, + "79": 0.18476, + "80": 0.18669, + "81": 0.17032, + "82": 0.16285, + "83": 0.17256, + "84": 0.19021, + "85": 0.16572, + "86": 0.20934, + "87": 0.17261, + "88": 0.16413, + "89": 0.17944, + "90": 0.1661, + "91": 0.19779, + "92": 0.17507, + "93": 0.18998, + "94": 0.20674, + "95": 0.16927, + "96": 0.16793, + "97": 0.17702, + "98": 0.16074, + "99": 0.17652, + "100": 0.17041 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..dca66d633f5 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81565, + "2": 10.81048, + "3": 10.81268, + "4": 10.79108, + "5": 10.83781, + "6": 10.85065, + "7": 10.82134, + "8": 10.8202, + "9": 10.83075, + "10": 10.79026, + "11": 10.86297, + "12": 10.84282, + "13": 10.85729, + "14": 10.86207, + "15": 10.80535, + "16": 10.80362, + "17": 10.77916, + "18": 10.80764, + "19": 10.79451, + "20": 10.74621, + "21": 10.72181, + "22": 10.58717, + "23": 10.72927, + "24": 10.63248, + "25": 10.57614, + "26": 10.63793, + "27": 10.64955, + "28": 10.63533, + "29": 10.64332, + "30": 10.44626, + "31": 10.19362, + "32": 10.52448, + "33": 10.51821, + "34": 10.28825, + "35": 10.33113, + "36": 10.31229, + "37": 10.42674, + "38": 10.279, + "39": 10.47591, + "40": 10.19781, + "41": 10.21483, + "42": 10.28721, + "43": 9.94225, + "44": 10.05777, + "45": 9.9434, + "46": 9.90939, + "47": 10.21227, + "48": 9.95, + "49": 9.63638, + "50": 10.00366, + "51": 9.92331, + "52": 9.8284, + "53": 10.14655, + "54": 10.04302, + "55": 9.9627, + "56": 9.70496, + "57": 9.58521, + "58": 9.91705, + "59": 9.66061, + "60": 9.60423, + "61": 9.77841, + "62": 10.06213, + "63": 9.47178, + "64": 9.85438, + "65": 9.02476, + "66": 9.79406, + "67": 9.43345, + "68": 9.8534, + "69": 9.847, + "70": 9.81051, + "71": 9.68406, + "72": 9.6601, + "73": 9.57296, + "74": 9.0603, + "75": 9.50552, + "76": 9.17947, + "77": 10.12779, + "78": 9.77444, + "79": 9.44215, + "80": 9.46725, + "81": 9.53865, + "82": 9.75696, + "83": 9.3874, + "84": 9.46663, + "85": 9.67947, + "86": 9.13533, + "87": 9.63433, + "88": 9.80834, + "89": 9.67888, + "90": 9.85563, + "91": 9.41308, + "92": 9.41812, + "93": 9.15371, + "94": 8.90222, + "95": 9.56497, + "96": 9.58428, + "97": 9.35825, + "98": 9.72999, + "99": 8.95886, + "100": 9.45414 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30853.0, + "2": 33000.0, + "3": 33775.0, + "4": 30857.0, + "5": 35956.0, + "6": 37573.0, + "7": 35446.0, + "8": 31027.0, + "9": 34894.0, + "10": 29923.0, + "11": 38736.0, + "12": 35245.0, + "13": 36983.0, + "14": 38078.0, + "15": 34560.0, + "16": 36096.0, + "17": 34585.0, + "18": 34936.0, + "19": 36301.0, + "20": 32788.0, + "21": 33385.0, + "22": 29942.0, + "23": 37625.0, + "24": 32018.0, + "25": 31043.0, + "26": 34310.0, + "27": 35942.0, + "28": 37348.0, + "29": 38027.0, + "30": 32865.0, + "31": 30072.0, + "32": 36198.0, + "33": 37604.0, + "34": 32768.0, + "35": 34129.0, + "36": 34811.0, + "37": 37917.0, + "38": 35861.0, + "39": 38592.0, + "40": 35652.0, + "41": 35428.0, + "42": 37701.0, + "43": 33967.0, + "44": 33425.0, + "45": 35778.0, + "46": 37279.0, + "47": 40356.0, + "48": 36144.0, + "49": 36492.0, + "50": 39148.0, + "51": 37394.0, + "52": 36918.0, + "53": 41574.0, + "54": 40654.0, + "55": 37274.0, + "56": 40316.0, + "57": 36713.0, + "58": 42042.0, + "59": 39264.0, + "60": 39816.0, + "61": 40579.0, + "62": 44097.0, + "63": 38397.0, + "64": 43253.0, + "65": 40953.0, + "66": 44326.0, + "67": 40344.0, + "68": 40398.0, + "69": 40614.0, + "70": 45248.0, + "71": 41445.0, + "72": 39901.0, + "73": 44369.0, + "74": 33925.0, + "75": 38833.0, + "76": 46358.0, + "77": 46064.0, + "78": 46904.0, + "79": 47560.0, + "80": 46979.0, + "81": 50283.0, + "82": 49634.0, + "83": 45153.0, + "84": 45874.0, + "85": 49161.0, + "86": 45106.0, + "87": 49057.0, + "88": 46592.0, + "89": 48712.0, + "90": 49552.0, + "91": 43836.0, + "92": 47360.0, + "93": 46675.0, + "94": 46653.0, + "95": 46726.0, + "96": 50152.0, + "97": 47102.0, + "98": 50317.0, + "99": 48088.0, + "100": 43362.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1016564224.0, + "2": 1016563712.0, + "3": 1016564224.0, + "4": 1016563200.0, + "5": 1016564736.0, + "6": 1016565248.0, + "7": 1016563712.0, + "8": 1016565248.0, + "9": 1016562688.0, + "10": 1016564736.0, + "11": 1016562688.0, + "12": 1016564224.0, + "13": 1016563200.0, + "14": 1016563712.0, + "15": 1017374720.0, + "16": 1016562176.0, + "17": 1016565248.0, + "18": 1016566272.0, + "19": 1016563712.0, + "20": 1016564224.0, + "21": 1016564224.0, + "22": 1016566272.0, + "23": 1016563712.0, + "24": 1016563200.0, + "25": 1016565248.0, + "26": 1016833024.0, + "27": 1016562688.0, + "28": 1016562176.0, + "29": 1016562688.0, + "30": 1016565760.0, + "31": 1016568832.0, + "32": 1016565248.0, + "33": 1016564736.0, + "34": 1016564736.0, + "35": 1016565248.0, + "36": 1016901120.0, + "37": 1016564736.0, + "38": 1016564224.0, + "39": 1016562688.0, + "40": 1016563712.0, + "41": 1016567296.0, + "42": 1016564736.0, + "43": 1016567808.0, + "44": 1016564736.0, + "45": 1016565760.0, + "46": 1016569856.0, + "47": 1016564224.0, + "48": 1016569856.0, + "49": 1016568320.0, + "50": 1017070592.0, + "51": 1016566272.0, + "52": 1016575488.0, + "53": 1016567808.0, + "54": 1016976896.0, + "55": 1016569856.0, + "56": 1016565248.0, + "57": 1016574976.0, + "58": 1017060352.0, + "59": 1016573952.0, + "60": 1016571904.0, + "61": 1016568320.0, + "62": 1016566784.0, + "63": 1016576512.0, + "64": 1016572416.0, + "65": 1016584192.0, + "66": 1016568832.0, + "67": 1016570368.0, + "68": 1016566272.0, + "69": 1016570880.0, + "70": 1016937984.0, + "71": 1016567296.0, + "72": 1016571904.0, + "73": 1016572416.0, + "74": 1016577024.0, + "75": 1016567296.0, + "76": 1016565248.0, + "77": 1016566272.0, + "78": 1016572928.0, + "79": 1016568320.0, + "80": 1016572416.0, + "81": 1016570368.0, + "82": 1016571392.0, + "83": 1016568320.0, + "84": 1016573440.0, + "85": 1016574976.0, + "86": 1016574976.0, + "87": 1016567808.0, + "88": 1016570880.0, + "89": 1016577024.0, + "90": 1016568320.0, + "91": 1016566784.0, + "92": 1016567808.0, + "93": 1016569856.0, + "94": 1016571904.0, + "95": 1016568320.0, + "96": 1016718336.0, + "97": 1016573440.0, + "98": 1016565248.0, + "99": 1016578560.0, + "100": 1016574464.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2560655872.0, + "2": 2827037696.0, + "3": 2827638272.0, + "4": 2827638272.0, + "5": 2827638272.0, + "6": 2828292608.0, + "7": 2829339648.0, + "8": 2829339648.0, + "9": 2829339648.0, + "10": 2831441920.0, + "11": 2831441920.0, + "12": 2831441920.0, + "13": 2831441920.0, + "14": 2831441920.0, + "15": 2831441920.0, + "16": 2831441920.0, + "17": 2831441920.0, + "18": 2831441920.0, + "19": 2831441920.0, + "20": 2831441920.0, + "21": 2831441920.0, + "22": 2831441920.0, + "23": 2831441920.0, + "24": 2831441920.0, + "25": 2831441920.0, + "26": 2831441920.0, + "27": 2831441920.0, + "28": 2831441920.0, + "29": 2831441920.0, + "30": 2831441920.0, + "31": 2836701184.0, + "32": 2836701184.0, + "33": 2836701184.0, + "34": 2836701184.0, + "35": 2836701184.0, + "36": 2836701184.0, + "37": 2836701184.0, + "38": 2836701184.0, + "39": 2836701184.0, + "40": 2836701184.0, + "41": 2836701184.0, + "42": 2836701184.0, + "43": 2836701184.0, + "44": 2836701184.0, + "45": 2836701184.0, + "46": 2836701184.0, + "47": 2836701184.0, + "48": 2836701184.0, + "49": 2836701184.0, + "50": 2836701184.0, + "51": 2836701184.0, + "52": 2842246656.0, + "53": 2842246656.0, + "54": 2842246656.0, + "55": 2842246656.0, + "56": 2843695104.0, + "57": 2848199680.0, + "58": 2848199680.0, + "59": 2848199680.0, + "60": 2848199680.0, + "61": 2848199680.0, + "62": 2848199680.0, + "63": 2848199680.0, + "64": 2848199680.0, + "65": 2859411456.0, + "66": 2859411456.0, + "67": 2859411456.0, + "68": 2859411456.0, + "69": 2859411456.0, + "70": 2859411456.0, + "71": 2859411456.0, + "72": 2859411456.0, + "73": 2859411456.0, + "74": 2859411456.0, + "75": 2859411456.0, + "76": 2859411456.0, + "77": 2859411456.0, + "78": 2859411456.0, + "79": 2859411456.0, + "80": 2859411456.0, + "81": 2859411456.0, + "82": 2859411456.0, + "83": 2859411456.0, + "84": 2859411456.0, + "85": 2859411456.0, + "86": 2859411456.0, + "87": 2859411456.0, + "88": 2859411456.0, + "89": 2859411456.0, + "90": 2859411456.0, + "91": 2859411456.0, + "92": 2859411456.0, + "93": 2859411456.0, + "94": 2859411456.0, + "95": 2859411456.0, + "96": 2859411456.0, + "97": 2859411456.0, + "98": 2859411456.0, + "99": 2859411456.0, + "100": 2859411456.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 18.49276, + "2": 0.25843, + "3": 0.17872, + "4": 0.17622, + "5": 0.16425, + "6": 0.15462, + "7": 0.16221, + "8": 0.15923, + "9": 0.1611, + "10": 0.1478, + "11": 0.15494, + "12": 0.14547, + "13": 0.14411, + "14": 0.14989, + "15": 0.16302, + "16": 0.14821, + "17": 0.16657, + "18": 0.14513, + "19": 0.15296, + "20": 0.14437, + "21": 0.14735, + "22": 0.17451, + "23": 0.16059, + "24": 0.152, + "25": 0.15395, + "26": 0.15115, + "27": 0.15887, + "28": 0.15234, + "29": 0.1421, + "30": 0.15091, + "31": 0.18973, + "32": 0.14778, + "33": 0.14785, + "34": 0.1727, + "35": 0.15646, + "36": 0.16437, + "37": 0.1441, + "38": 0.15823, + "39": 0.14495, + "40": 0.16334, + "41": 0.14314, + "42": 0.14405, + "43": 0.15348, + "44": 0.14397, + "45": 0.15389, + "46": 0.17277, + "47": 0.14442, + "48": 0.16289, + "49": 0.21224, + "50": 0.14457, + "51": 0.17927, + "52": 0.15446, + "53": 0.14459, + "54": 0.14896, + "55": 0.1558, + "56": 0.2105, + "57": 0.17156, + "58": 0.146, + "59": 0.15771, + "60": 0.162, + "61": 0.14241, + "62": 0.14184, + "63": 0.15693, + "64": 0.16199, + "65": 0.22761, + "66": 0.14583, + "67": 0.22988, + "68": 0.15495, + "69": 0.15509, + "70": 0.15156, + "71": 0.17782, + "72": 0.15675, + "73": 0.18088, + "74": 0.17013, + "75": 0.16039, + "76": 0.17974, + "77": 0.13903, + "78": 0.15719, + "79": 0.1635, + "80": 0.17904, + "81": 0.14997, + "82": 0.15986, + "83": 0.1669, + "84": 0.17349, + "85": 0.14723, + "86": 0.19019, + "87": 0.15235, + "88": 0.14689, + "89": 0.16952, + "90": 0.1487, + "91": 0.1826, + "92": 0.15727, + "93": 0.17286, + "94": 0.18554, + "95": 0.14872, + "96": 0.14426, + "97": 0.15953, + "98": 0.14361, + "99": 0.15897, + "100": 0.14814 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d869313b50f --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81565, + "2": 10.81048, + "3": 10.8127, + "4": 10.79089, + "5": 10.83784, + "6": 10.85116, + "7": 10.82036, + "8": 10.82117, + "9": 10.83043, + "10": 10.78955, + "11": 10.86357, + "12": 10.84268, + "13": 10.85799, + "14": 10.86268, + "15": 10.80594, + "16": 10.80356, + "17": 10.77851, + "18": 10.80762, + "19": 10.79465, + "20": 10.747, + "21": 10.72249, + "22": 10.58742, + "23": 10.72933, + "24": 10.63238, + "25": 10.575, + "26": 10.638, + "27": 10.64966, + "28": 10.63496, + "29": 10.64307, + "30": 10.44635, + "31": 10.19441, + "32": 10.52449, + "33": 10.51815, + "34": 10.28843, + "35": 10.33138, + "36": 10.3123, + "37": 10.4265, + "38": 10.27866, + "39": 10.47612, + "40": 10.19821, + "41": 10.21536, + "42": 10.28769, + "43": 9.94235, + "44": 10.05775, + "45": 9.94354, + "46": 9.90902, + "47": 10.21214, + "48": 9.94982, + "49": 9.63605, + "50": 10.00335, + "51": 9.92304, + "52": 9.82779, + "53": 10.14656, + "54": 10.04338, + "55": 9.96311, + "56": 9.70508, + "57": 9.58542, + "58": 9.91687, + "59": 9.66061, + "60": 9.60393, + "61": 9.77855, + "62": 10.0624, + "63": 9.47205, + "64": 9.85428, + "65": 9.02467, + "66": 9.79454, + "67": 9.43333, + "68": 9.85327, + "69": 9.847, + "70": 9.81072, + "71": 9.684, + "72": 9.66023, + "73": 9.57314, + "74": 9.05973, + "75": 9.50551, + "76": 9.17942, + "77": 10.12761, + "78": 9.77438, + "79": 9.44209, + "80": 9.46747, + "81": 9.53873, + "82": 9.75725, + "83": 9.38702, + "84": 9.46662, + "85": 9.67918, + "86": 9.13556, + "87": 9.63426, + "88": 9.80794, + "89": 9.67925, + "90": 9.85561, + "91": 9.41267, + "92": 9.41773, + "93": 9.15396, + "94": 8.90227, + "95": 9.56526, + "96": 9.58425, + "97": 9.35836, + "98": 9.7302, + "99": 8.95917, + "100": 9.45408 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30991.0, + "2": 32927.0, + "3": 33481.0, + "4": 30866.0, + "5": 36255.0, + "6": 37186.0, + "7": 35644.0, + "8": 31356.0, + "9": 34832.0, + "10": 29855.0, + "11": 38396.0, + "12": 35164.0, + "13": 37118.0, + "14": 38011.0, + "15": 34458.0, + "16": 35843.0, + "17": 34836.0, + "18": 35149.0, + "19": 36044.0, + "20": 32823.0, + "21": 33340.0, + "22": 30040.0, + "23": 37733.0, + "24": 31992.0, + "25": 31045.0, + "26": 34280.0, + "27": 36064.0, + "28": 36993.0, + "29": 38087.0, + "30": 32689.0, + "31": 30361.0, + "32": 36050.0, + "33": 37627.0, + "34": 33149.0, + "35": 34316.0, + "36": 35026.0, + "37": 37852.0, + "38": 35490.0, + "39": 38325.0, + "40": 35730.0, + "41": 35890.0, + "42": 37811.0, + "43": 34239.0, + "44": 33282.0, + "45": 35354.0, + "46": 37112.0, + "47": 40323.0, + "48": 36296.0, + "49": 36098.0, + "50": 38996.0, + "51": 37187.0, + "52": 36798.0, + "53": 41385.0, + "54": 41151.0, + "55": 36715.0, + "56": 40382.0, + "57": 36942.0, + "58": 42415.0, + "59": 39138.0, + "60": 39766.0, + "61": 40532.0, + "62": 43919.0, + "63": 38747.0, + "64": 43509.0, + "65": 40794.0, + "66": 44093.0, + "67": 40369.0, + "68": 40509.0, + "69": 40728.0, + "70": 45431.0, + "71": 41117.0, + "72": 39982.0, + "73": 44758.0, + "74": 34170.0, + "75": 38601.0, + "76": 46113.0, + "77": 45621.0, + "78": 47007.0, + "79": 47410.0, + "80": 46647.0, + "81": 50449.0, + "82": 49494.0, + "83": 45080.0, + "84": 46331.0, + "85": 48470.0, + "86": 45870.0, + "87": 49138.0, + "88": 46357.0, + "89": 48274.0, + "90": 50049.0, + "91": 43937.0, + "92": 47318.0, + "93": 46654.0, + "94": 46515.0, + "95": 47167.0, + "96": 50587.0, + "97": 46623.0, + "98": 49830.0, + "99": 48092.0, + "100": 43643.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1016564224.0, + "2": 1016563712.0, + "3": 1016564224.0, + "4": 1017172480.0, + "5": 1016564224.0, + "6": 1016565248.0, + "7": 1016564736.0, + "8": 1016565248.0, + "9": 1016562688.0, + "10": 1016564736.0, + "11": 1016562688.0, + "12": 1016565248.0, + "13": 1016564736.0, + "14": 1016564224.0, + "15": 1016564736.0, + "16": 1016562176.0, + "17": 1016564736.0, + "18": 1016565760.0, + "19": 1016563200.0, + "20": 1016563200.0, + "21": 1016564224.0, + "22": 1016566272.0, + "23": 1016564736.0, + "24": 1016564224.0, + "25": 1016564736.0, + "26": 1016562176.0, + "27": 1016563200.0, + "28": 1016562688.0, + "29": 1016562688.0, + "30": 1016566272.0, + "31": 1016569856.0, + "32": 1016564736.0, + "33": 1016564736.0, + "34": 1016565248.0, + "35": 1017459712.0, + "36": 1016565248.0, + "37": 1016565248.0, + "38": 1016564224.0, + "39": 1016562176.0, + "40": 1016565248.0, + "41": 1016567808.0, + "42": 1016564224.0, + "43": 1016568320.0, + "44": 1016565760.0, + "45": 1016565760.0, + "46": 1016570368.0, + "47": 1016565248.0, + "48": 1016569856.0, + "49": 1016568832.0, + "50": 1016565760.0, + "51": 1016566272.0, + "52": 1016574976.0, + "53": 1016567808.0, + "54": 1016566784.0, + "55": 1016569856.0, + "56": 1016565248.0, + "57": 1016574976.0, + "58": 1017110528.0, + "59": 1016574976.0, + "60": 1016571904.0, + "61": 1016567296.0, + "62": 1016565760.0, + "63": 1016576000.0, + "64": 1016572928.0, + "65": 1016585216.0, + "66": 1016568832.0, + "67": 1016569344.0, + "68": 1016566272.0, + "69": 1016569856.0, + "70": 1016569344.0, + "71": 1016566272.0, + "72": 1016571392.0, + "73": 1016572416.0, + "74": 1016577536.0, + "75": 1016567296.0, + "76": 1016565760.0, + "77": 1016566272.0, + "78": 1016572928.0, + "79": 1016568832.0, + "80": 1016572416.0, + "81": 1016570368.0, + "82": 1016571904.0, + "83": 1016568832.0, + "84": 1016573440.0, + "85": 1016575488.0, + "86": 1016574976.0, + "87": 1016568320.0, + "88": 1016816640.0, + "89": 1016577024.0, + "90": 1016569344.0, + "91": 1016566784.0, + "92": 1016566784.0, + "93": 1016569856.0, + "94": 1016571392.0, + "95": 1016567808.0, + "96": 1016566784.0, + "97": 1016573952.0, + "98": 1016565760.0, + "99": 1016577024.0, + "100": 1016574464.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2560655872.0, + "2": 2827037696.0, + "3": 2827771392.0, + "4": 2828163584.0, + "5": 2828163584.0, + "6": 2828163584.0, + "7": 2829373440.0, + "8": 2829373440.0, + "9": 2829373440.0, + "10": 2829925376.0, + "11": 2829925376.0, + "12": 2829925376.0, + "13": 2829925376.0, + "14": 2829925376.0, + "15": 2830320640.0, + "16": 2830320640.0, + "17": 2830320640.0, + "18": 2830320640.0, + "19": 2830320640.0, + "20": 2830320640.0, + "21": 2830320640.0, + "22": 2830406144.0, + "23": 2830406144.0, + "24": 2830406144.0, + "25": 2830406144.0, + "26": 2830406144.0, + "27": 2830406144.0, + "28": 2830406144.0, + "29": 2830406144.0, + "30": 2831433216.0, + "31": 2836904960.0, + "32": 2836904960.0, + "33": 2836904960.0, + "34": 2836904960.0, + "35": 2836904960.0, + "36": 2836904960.0, + "37": 2836904960.0, + "38": 2836904960.0, + "39": 2836904960.0, + "40": 2836904960.0, + "41": 2836904960.0, + "42": 2836904960.0, + "43": 2836904960.0, + "44": 2836904960.0, + "45": 2836904960.0, + "46": 2837527040.0, + "47": 2837527040.0, + "48": 2837527040.0, + "49": 2837527040.0, + "50": 2837527040.0, + "51": 2837527040.0, + "52": 2844526592.0, + "53": 2844526592.0, + "54": 2844526592.0, + "55": 2844526592.0, + "56": 2844526592.0, + "57": 2845833216.0, + "58": 2845833216.0, + "59": 2845833216.0, + "60": 2845833216.0, + "61": 2845833216.0, + "62": 2845833216.0, + "63": 2847350784.0, + "64": 2847350784.0, + "65": 2859365376.0, + "66": 2859365376.0, + "67": 2859365376.0, + "68": 2859365376.0, + "69": 2859365376.0, + "70": 2859365376.0, + "71": 2859365376.0, + "72": 2859365376.0, + "73": 2859365376.0, + "74": 2859365376.0, + "75": 2859365376.0, + "76": 2859365376.0, + "77": 2859365376.0, + "78": 2859365376.0, + "79": 2859365376.0, + "80": 2859365376.0, + "81": 2859365376.0, + "82": 2859365376.0, + "83": 2859365376.0, + "84": 2859365376.0, + "85": 2859365376.0, + "86": 2859365376.0, + "87": 2859365376.0, + "88": 2859365376.0, + "89": 2859365376.0, + "90": 2859365376.0, + "91": 2859365376.0, + "92": 2859365376.0, + "93": 2859365376.0, + "94": 2859365376.0, + "95": 2859365376.0, + "96": 2859365376.0, + "97": 2859365376.0, + "98": 2859365376.0, + "99": 2859365376.0, + "100": 2859365376.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 17.55161, + "2": 0.27584, + "3": 0.20906, + "4": 0.18821, + "5": 0.17883, + "6": 0.17484, + "7": 0.18214, + "8": 0.18025, + "9": 0.16785, + "10": 0.16718, + "11": 0.17122, + "12": 0.16341, + "13": 0.16356, + "14": 0.16447, + "15": 0.17469, + "16": 0.16231, + "17": 0.17002, + "18": 0.1621, + "19": 0.16543, + "20": 0.16097, + "21": 0.16113, + "22": 0.17866, + "23": 0.16939, + "24": 0.16784, + "25": 0.16322, + "26": 0.15752, + "27": 0.16042, + "28": 0.16296, + "29": 0.16022, + "30": 0.16569, + "31": 0.20634, + "32": 0.16627, + "33": 0.16203, + "34": 0.18965, + "35": 0.1656, + "36": 0.17227, + "37": 0.16394, + "38": 0.16364, + "39": 0.15966, + "40": 0.17482, + "41": 0.16992, + "42": 0.16079, + "43": 0.17541, + "44": 0.1626, + "45": 0.16436, + "46": 0.1838, + "47": 0.15773, + "48": 0.18504, + "49": 0.22116, + "50": 0.16497, + "51": 0.17193, + "52": 0.17228, + "53": 0.15999, + "54": 0.15946, + "55": 0.1611, + "56": 0.21983, + "57": 0.18423, + "58": 0.16229, + "59": 0.18268, + "60": 0.17406, + "61": 0.15956, + "62": 0.16172, + "63": 0.17465, + "64": 0.17307, + "65": 0.25477, + "66": 0.15926, + "67": 0.23477, + "68": 0.16872, + "69": 0.16094, + "70": 0.16631, + "71": 0.18552, + "72": 0.16728, + "73": 0.1889, + "74": 0.17586, + "75": 0.17577, + "76": 0.21503, + "77": 0.16576, + "78": 0.17284, + "79": 0.18166, + "80": 0.19235, + "81": 0.17347, + "82": 0.1597, + "83": 0.17024, + "84": 0.17843, + "85": 0.15917, + "86": 0.20315, + "87": 0.16523, + "88": 0.16367, + "89": 0.18499, + "90": 0.16286, + "91": 0.19025, + "92": 0.17186, + "93": 0.19123, + "94": 0.19378, + "95": 0.16849, + "96": 0.16781, + "97": 0.17705, + "98": 0.15729, + "99": 0.17119, + "100": 0.16 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index a13cf8b8c89..f763ccd7669 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, "5": 10.83259, + "6": 10.83704, + "7": 10.79793, + "8": 10.79364, + "9": 10.808, "10": 10.76116, + "11": 10.85297, + "12": 10.84152, + "13": 10.8247, + "14": 10.85822, "15": 10.78238, + "16": 10.77927, + "17": 10.74878, + "18": 10.7897, + "19": 10.7749, "20": 10.71704, + "21": 10.70811, + "22": 10.54787, + "23": 10.72978, + "24": 10.60324, "25": 10.55979, + "26": 10.61611, + "27": 10.6446, + "28": 10.62463, + "29": 10.63492, "30": 10.42362, + "31": 10.16499, + "32": 10.51313, + "33": 10.5094, + "34": 10.2668, "35": 10.32318, + "36": 10.28865, + "37": 10.41114, + "38": 10.26426, + "39": 10.45, "40": 10.17473, + "41": 10.20958, + "42": 10.27824, + "43": 9.91831, + "44": 10.03131, "45": 9.91995, + "46": 9.8862, + "47": 10.19255, + "48": 9.92803, + "49": 9.61616, "50": 9.98532, + "51": 9.90528, + "52": 9.80364, + "53": 10.12728, + "54": 10.00036, "55": 9.9362, + "56": 9.68506, + "57": 9.55805, + "58": 9.90514, + "59": 9.63857, "60": 9.57451, + "61": 9.76864, + "62": 10.03802, + "63": 9.44503, + "64": 9.82796, "65": 9.00712, + "66": 9.77422, + "67": 9.41277, + "68": 9.84111, + "69": 9.82784, "70": 9.79011, + "71": 9.66957, + "72": 9.62799, + "73": 9.5473, + "74": 9.03663, "75": 9.49153, + "76": 9.16783, + "77": 10.10857, + "78": 9.77081, + "79": 9.4383, "80": 9.45436, + "81": 9.52266, + "82": 9.7424, + "83": 9.37076, + "84": 9.45377, "85": 9.65832, + "86": 9.12522, + "87": 9.62697, + "88": 9.79619, + "89": 9.66054, "90": 9.85081, + "91": 9.39408, + "92": 9.40744, + "93": 9.13595, + "94": 8.89048, "95": 9.563, + "96": 9.5714, + "97": 9.34318, + "98": 9.73026, + "99": 8.95002, "100": 9.4424 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, "5": 36042.0, + "6": 37038.0, + "7": 34481.0, + "8": 31368.0, + "9": 33980.0, "10": 29532.0, + "11": 37852.0, + "12": 34972.0, + "13": 36994.0, + "14": 37789.0, "15": 34058.0, + "16": 36656.0, + "17": 34700.0, + "18": 34946.0, + "19": 35228.0, "20": 32392.0, + "21": 33247.0, + "22": 30040.0, + "23": 37891.0, + "24": 32099.0, "25": 30921.0, + "26": 34212.0, + "27": 34975.0, + "28": 36746.0, + "29": 37759.0, "30": 32786.0, + "31": 30423.0, + "32": 35992.0, + "33": 36915.0, + "34": 32293.0, "35": 33654.0, + "36": 34755.0, + "37": 37859.0, + "38": 36022.0, + "39": 38343.0, "40": 35963.0, + "41": 35882.0, + "42": 36774.0, + "43": 34186.0, + "44": 33572.0, "45": 35574.0, + "46": 37208.0, + "47": 40154.0, + "48": 36385.0, + "49": 36259.0, "50": 38861.0, + "51": 38061.0, + "52": 37025.0, + "53": 41802.0, + "54": 41253.0, "55": 37654.0, + "56": 41164.0, + "57": 37682.0, + "58": 41782.0, + "59": 39444.0, "60": 40691.0, + "61": 40876.0, + "62": 43113.0, + "63": 38389.0, + "64": 43217.0, "65": 41689.0, + "66": 45525.0, + "67": 41717.0, + "68": 40369.0, + "69": 41287.0, "70": 45545.0, + "71": 41651.0, + "72": 41881.0, + "73": 45139.0, + "74": 35747.0, "75": 39155.0, + "76": 44874.0, + "77": 45442.0, + "78": 46782.0, + "79": 48776.0, "80": 47161.0, + "81": 51277.0, + "82": 49953.0, + "83": 45334.0, + "84": 46096.0, "85": 49238.0, + "86": 46118.0, + "87": 49880.0, + "88": 47115.0, + "89": 48583.0, "90": 49057.0, + "91": 45950.0, + "92": 47820.0, + "93": 46437.0, + "94": 47530.0, "95": 48000.0, + "96": 50285.0, + "97": 46225.0, + "98": 49809.0, + "99": 47890.0, "100": 44636.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 892864512.0, + "2": 892868608.0, + "3": 892868608.0, + "4": 892864512.0, "5": 892865024.0, + "6": 892866560.0, + "7": 892866048.0, + "8": 892867584.0, + "9": 892865536.0, "10": 892867584.0, + "11": 892866048.0, + "12": 892865536.0, + "13": 892865536.0, + "14": 892868096.0, "15": 892867584.0, + "16": 892867072.0, + "17": 892867584.0, + "18": 892869632.0, + "19": 892868096.0, "20": 892866560.0, + "21": 892866560.0, + "22": 892863488.0, + "23": 892864512.0, + "24": 892867072.0, "25": 892863488.0, + "26": 892866560.0, + "27": 892867072.0, + "28": 892865536.0, + "29": 892866048.0, "30": 892863488.0, + "31": 892862464.0, + "32": 892861952.0, + "33": 892866048.0, + "34": 892865536.0, "35": 892865024.0, + "36": 892868608.0, + "37": 892867072.0, + "38": 892866560.0, + "39": 892866048.0, "40": 892867072.0, + "41": 892865536.0, + "42": 892867584.0, + "43": 892861440.0, + "44": 892862976.0, "45": 892865024.0, + "46": 892864512.0, + "47": 892865024.0, + "48": 892861440.0, + "49": 892863488.0, "50": 892867072.0, + "51": 892860416.0, + "52": 892858880.0, + "53": 892861440.0, + "54": 892861440.0, "55": 892862464.0, + "56": 892865024.0, + "57": 892857344.0, + "58": 892859392.0, + "59": 892858880.0, "60": 892859904.0, + "61": 892868608.0, + "62": 892865536.0, + "63": 892861952.0, + "64": 892863488.0, "65": 892851712.0, + "66": 892866048.0, + "67": 892861440.0, + "68": 892868608.0, + "69": 892864512.0, "70": 892866560.0, + "71": 892868608.0, + "72": 892860416.0, + "73": 892868096.0, + "74": 892858368.0, "75": 892867072.0, + "76": 892866560.0, + "77": 892867072.0, + "78": 892863488.0, + "79": 892864512.0, "80": 892864512.0, + "81": 892866048.0, + "82": 892864000.0, + "83": 892860928.0, + "84": 892861440.0, "85": 892861952.0, + "86": 892861440.0, + "87": 892870144.0, + "88": 892862464.0, + "89": 892864512.0, "90": 892866048.0, + "91": 892867072.0, + "92": 892865536.0, + "93": 892868608.0, + "94": 892864512.0, "95": 892865024.0, + "96": 892865024.0, + "97": 892862976.0, + "98": 892867584.0, + "99": 892859904.0, "100": 892861952.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1867566080.0, + "2": 2107252736.0, + "3": 2107252736.0, + "4": 2107252736.0, "5": 2107481600.0, + "6": 2107481600.0, + "7": 2107481600.0, + "8": 2107481600.0, + "9": 2107481600.0, "10": 2108814336.0, + "11": 2108814336.0, + "12": 2108814336.0, + "13": 2108814336.0, + "14": 2108814336.0, "15": 2108814336.0, + "16": 2109139456.0, + "17": 2109139456.0, + "18": 2109139456.0, + "19": 2109139456.0, "20": 2109139456.0, + "21": 2109139456.0, + "22": 2109139456.0, + "23": 2109139456.0, + "24": 2109139456.0, "25": 2109139456.0, + "26": 2109139456.0, + "27": 2109139456.0, + "28": 2109139456.0, + "29": 2109139456.0, "30": 2109139456.0, + "31": 2109139456.0, + "32": 2109139456.0, + "33": 2109139456.0, + "34": 2109139456.0, "35": 2109139456.0, + "36": 2109139456.0, + "37": 2109139456.0, + "38": 2109139456.0, + "39": 2109139456.0, "40": 2109139456.0, + "41": 2109139456.0, + "42": 2109139456.0, + "43": 2109139456.0, + "44": 2109139456.0, "45": 2109139456.0, + "46": 2109139456.0, + "47": 2109139456.0, + "48": 2109139456.0, + "49": 2109139456.0, "50": 2109139456.0, + "51": 2109139456.0, + "52": 2109139456.0, + "53": 2109139456.0, + "54": 2109139456.0, "55": 2109139456.0, + "56": 2109139456.0, + "57": 2109139456.0, + "58": 2109139456.0, + "59": 2109139456.0, "60": 2109139456.0, + "61": 2109139456.0, + "62": 2109139456.0, + "63": 2109139456.0, + "64": 2109139456.0, "65": 2109139456.0, + "66": 2109139456.0, + "67": 2109139456.0, + "68": 2109139456.0, + "69": 2109139456.0, "70": 2109139456.0, + "71": 2109139456.0, + "72": 2109139456.0, + "73": 2109139456.0, + "74": 2109139456.0, "75": 2109139456.0, + "76": 2109139456.0, + "77": 2109139456.0, + "78": 2109139456.0, + "79": 2109139456.0, "80": 2109139456.0, + "81": 2109139456.0, + "82": 2109139456.0, + "83": 2109139456.0, + "84": 2109139456.0, "85": 2109139456.0, + "86": 2109139456.0, + "87": 2109897728.0, + "88": 2109897728.0, + "89": 2109897728.0, "90": 2109897728.0, + "91": 2109897728.0, + "92": 2109897728.0, + "93": 2109897728.0, + "94": 2109897728.0, "95": 2109897728.0, + "96": 2109897728.0, + "97": 2109897728.0, + "98": 2109897728.0, + "99": 2109897728.0, "100": 2109897728.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 15.73372, - "5": 0.22156, - "10": 0.21766, - "15": 0.22279, - "20": 0.2043, - "25": 0.2023, - "30": 0.20179, - "35": 0.20654, - "40": 0.20904, - "45": 0.21995, - "50": 0.20076, - "55": 0.21849, - "60": 0.21439, - "65": 0.26977, - "70": 0.20736, - "75": 0.21282, - "80": 0.22233, - "85": 0.21095, - "90": 0.20403, - "95": 0.21788, - "100": 0.20993 + "1": 14.31194, + "2": 0.35602, + "3": 0.27118, + "4": 0.26003, + "5": 0.25566, + "6": 0.23955, + "7": 0.25733, + "8": 0.24144, + "9": 0.24541, + "10": 0.24933, + "11": 0.24384, + "12": 0.23671, + "13": 0.23911, + "14": 0.23582, + "15": 0.24799, + "16": 0.24336, + "17": 0.25026, + "18": 0.2284, + "19": 0.23348, + "20": 0.23732, + "21": 0.23466, + "22": 0.23579, + "23": 0.23473, + "24": 0.24834, + "25": 0.23298, + "26": 0.2337, + "27": 0.2322, + "28": 0.23129, + "29": 0.23719, + "30": 0.24475, + "31": 0.27609, + "32": 0.24141, + "33": 0.23534, + "34": 0.25714, + "35": 0.24161, + "36": 0.23358, + "37": 0.23063, + "38": 0.23854, + "39": 0.23304, + "40": 0.2404, + "41": 0.23771, + "42": 0.2345, + "43": 0.24255, + "44": 0.23514, + "45": 0.25421, + "46": 0.26534, + "47": 0.23362, + "48": 0.25382, + "49": 0.27095, + "50": 0.23751, + "51": 0.2738, + "52": 0.26505, + "53": 0.23078, + "54": 0.23459, + "55": 0.2529, + "56": 0.29375, + "57": 0.26697, + "58": 0.24903, + "59": 0.24384, + "60": 0.24359, + "61": 0.2298, + "62": 0.2365, + "63": 0.24866, + "64": 0.23579, + "65": 0.30261, + "66": 0.23489, + "67": 0.28661, + "68": 0.2497, + "69": 0.2358, + "70": 0.23664, + "71": 0.26035, + "72": 0.24553, + "73": 0.27252, + "74": 0.26037, + "75": 0.24806, + "76": 0.26257, + "77": 0.23946, + "78": 0.24328, + "79": 0.24753, + "80": 0.25383, + "81": 0.23677, + "82": 0.23361, + "83": 0.23998, + "84": 0.2503, + "85": 0.2394, + "86": 0.24786, + "87": 0.22954, + "88": 0.23347, + "89": 0.24991, + "90": 0.23017, + "91": 0.25015, + "92": 0.23807, + "93": 0.24597, + "94": 0.26925, + "95": 0.25645, + "96": 0.23369, + "97": 0.24492, + "98": 0.22834, + "99": 0.23921, + "100": 0.23446 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..12778ad6bb9 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, + "5": 10.83259, + "6": 10.83704, + "7": 10.79793, + "8": 10.79364, + "9": 10.808, + "10": 10.76116, + "11": 10.85297, + "12": 10.84152, + "13": 10.8247, + "14": 10.85822, + "15": 10.78238, + "16": 10.77927, + "17": 10.74878, + "18": 10.7897, + "19": 10.7749, + "20": 10.71704, + "21": 10.70811, + "22": 10.54787, + "23": 10.72978, + "24": 10.60324, + "25": 10.55979, + "26": 10.61611, + "27": 10.6446, + "28": 10.62463, + "29": 10.63492, + "30": 10.42362, + "31": 10.16499, + "32": 10.51313, + "33": 10.5094, + "34": 10.2668, + "35": 10.32318, + "36": 10.28865, + "37": 10.41114, + "38": 10.26426, + "39": 10.45, + "40": 10.17473, + "41": 10.20958, + "42": 10.27824, + "43": 9.91831, + "44": 10.03131, + "45": 9.91995, + "46": 9.8862, + "47": 10.19255, + "48": 9.92803, + "49": 9.61616, + "50": 9.98532, + "51": 9.90528, + "52": 9.80364, + "53": 10.12728, + "54": 10.00036, + "55": 9.9362, + "56": 9.68506, + "57": 9.55805, + "58": 9.90514, + "59": 9.63857, + "60": 9.57451, + "61": 9.76864, + "62": 10.03802, + "63": 9.44503, + "64": 9.82796, + "65": 9.00712, + "66": 9.77422, + "67": 9.41277, + "68": 9.84111, + "69": 9.82784, + "70": 9.79011, + "71": 9.66957, + "72": 9.62799, + "73": 9.5473, + "74": 9.03663, + "75": 9.49153, + "76": 9.16783, + "77": 10.10857, + "78": 9.77081, + "79": 9.4383, + "80": 9.45436, + "81": 9.52266, + "82": 9.7424, + "83": 9.37076, + "84": 9.45377, + "85": 9.65832, + "86": 9.12522, + "87": 9.62697, + "88": 9.79619, + "89": 9.66054, + "90": 9.85081, + "91": 9.39408, + "92": 9.40744, + "93": 9.13595, + "94": 8.89048, + "95": 9.563, + "96": 9.5714, + "97": 9.34318, + "98": 9.73026, + "99": 8.95002, + "100": 9.4424 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, + "5": 36042.0, + "6": 37038.0, + "7": 34481.0, + "8": 31368.0, + "9": 33980.0, + "10": 29532.0, + "11": 37852.0, + "12": 34972.0, + "13": 36994.0, + "14": 37789.0, + "15": 34058.0, + "16": 36656.0, + "17": 34700.0, + "18": 34946.0, + "19": 35228.0, + "20": 32392.0, + "21": 33247.0, + "22": 30040.0, + "23": 37891.0, + "24": 32099.0, + "25": 30921.0, + "26": 34212.0, + "27": 34975.0, + "28": 36746.0, + "29": 37759.0, + "30": 32786.0, + "31": 30423.0, + "32": 35992.0, + "33": 36915.0, + "34": 32293.0, + "35": 33654.0, + "36": 34755.0, + "37": 37859.0, + "38": 36022.0, + "39": 38343.0, + "40": 35963.0, + "41": 35882.0, + "42": 36774.0, + "43": 34186.0, + "44": 33572.0, + "45": 35574.0, + "46": 37208.0, + "47": 40154.0, + "48": 36385.0, + "49": 36259.0, + "50": 38861.0, + "51": 38061.0, + "52": 37025.0, + "53": 41802.0, + "54": 41253.0, + "55": 37654.0, + "56": 41164.0, + "57": 37682.0, + "58": 41782.0, + "59": 39444.0, + "60": 40691.0, + "61": 40876.0, + "62": 43113.0, + "63": 38389.0, + "64": 43217.0, + "65": 41689.0, + "66": 45525.0, + "67": 41717.0, + "68": 40369.0, + "69": 41287.0, + "70": 45545.0, + "71": 41651.0, + "72": 41881.0, + "73": 45139.0, + "74": 35747.0, + "75": 39155.0, + "76": 44874.0, + "77": 45442.0, + "78": 46782.0, + "79": 48776.0, + "80": 47161.0, + "81": 51277.0, + "82": 49953.0, + "83": 45334.0, + "84": 46096.0, + "85": 49238.0, + "86": 46118.0, + "87": 49880.0, + "88": 47115.0, + "89": 48583.0, + "90": 49057.0, + "91": 45950.0, + "92": 47820.0, + "93": 46437.0, + "94": 47530.0, + "95": 48000.0, + "96": 50285.0, + "97": 46225.0, + "98": 49809.0, + "99": 47890.0, + "100": 44636.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 892864512.0, + "2": 892868608.0, + "3": 892868608.0, + "4": 892864512.0, + "5": 892865024.0, + "6": 892866560.0, + "7": 892866048.0, + "8": 892867584.0, + "9": 892865536.0, + "10": 892867584.0, + "11": 892866048.0, + "12": 892865536.0, + "13": 892865536.0, + "14": 892868096.0, + "15": 892867584.0, + "16": 892867072.0, + "17": 892867584.0, + "18": 892869632.0, + "19": 892868096.0, + "20": 892866560.0, + "21": 892866560.0, + "22": 892863488.0, + "23": 892864512.0, + "24": 892867072.0, + "25": 892863488.0, + "26": 892866560.0, + "27": 892867072.0, + "28": 892865536.0, + "29": 892866048.0, + "30": 892863488.0, + "31": 892862464.0, + "32": 892861952.0, + "33": 892866048.0, + "34": 892865536.0, + "35": 892865024.0, + "36": 892868608.0, + "37": 892867072.0, + "38": 892866560.0, + "39": 892866048.0, + "40": 892867072.0, + "41": 892865536.0, + "42": 892867584.0, + "43": 892861440.0, + "44": 892862976.0, + "45": 892865024.0, + "46": 892864512.0, + "47": 892865024.0, + "48": 892861440.0, + "49": 892863488.0, + "50": 892867072.0, + "51": 892860416.0, + "52": 892858880.0, + "53": 892861440.0, + "54": 892861440.0, + "55": 892862464.0, + "56": 892865024.0, + "57": 892857344.0, + "58": 892859392.0, + "59": 892858880.0, + "60": 892859904.0, + "61": 892868608.0, + "62": 892865536.0, + "63": 892861952.0, + "64": 892863488.0, + "65": 892851712.0, + "66": 892866048.0, + "67": 892861440.0, + "68": 892868608.0, + "69": 892864512.0, + "70": 892866560.0, + "71": 892868608.0, + "72": 892860416.0, + "73": 892868096.0, + "74": 892858368.0, + "75": 892867072.0, + "76": 892866560.0, + "77": 892867072.0, + "78": 892863488.0, + "79": 892864512.0, + "80": 892864512.0, + "81": 892866048.0, + "82": 892864000.0, + "83": 892860928.0, + "84": 892861440.0, + "85": 892861952.0, + "86": 892861440.0, + "87": 892870144.0, + "88": 892862464.0, + "89": 892864512.0, + "90": 892866048.0, + "91": 892867072.0, + "92": 892865536.0, + "93": 892868608.0, + "94": 892864512.0, + "95": 892865024.0, + "96": 892865024.0, + "97": 892862976.0, + "98": 892867584.0, + "99": 892859904.0, + "100": 892861952.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1867566080.0, + "2": 2107252736.0, + "3": 2107252736.0, + "4": 2107252736.0, + "5": 2107481600.0, + "6": 2107481600.0, + "7": 2107481600.0, + "8": 2107481600.0, + "9": 2107481600.0, + "10": 2108814336.0, + "11": 2108814336.0, + "12": 2108814336.0, + "13": 2108814336.0, + "14": 2108814336.0, + "15": 2108814336.0, + "16": 2109139456.0, + "17": 2109139456.0, + "18": 2109139456.0, + "19": 2109139456.0, + "20": 2109139456.0, + "21": 2109139456.0, + "22": 2109139456.0, + "23": 2109139456.0, + "24": 2109139456.0, + "25": 2109139456.0, + "26": 2109139456.0, + "27": 2109139456.0, + "28": 2109139456.0, + "29": 2109139456.0, + "30": 2109139456.0, + "31": 2109139456.0, + "32": 2109139456.0, + "33": 2109139456.0, + "34": 2109139456.0, + "35": 2109139456.0, + "36": 2109139456.0, + "37": 2109139456.0, + "38": 2109139456.0, + "39": 2109139456.0, + "40": 2109139456.0, + "41": 2109139456.0, + "42": 2109139456.0, + "43": 2109139456.0, + "44": 2109139456.0, + "45": 2109139456.0, + "46": 2109139456.0, + "47": 2109139456.0, + "48": 2109139456.0, + "49": 2109139456.0, + "50": 2109139456.0, + "51": 2109139456.0, + "52": 2109139456.0, + "53": 2109139456.0, + "54": 2109139456.0, + "55": 2109139456.0, + "56": 2109139456.0, + "57": 2109139456.0, + "58": 2109139456.0, + "59": 2109139456.0, + "60": 2109139456.0, + "61": 2109139456.0, + "62": 2109139456.0, + "63": 2109139456.0, + "64": 2109139456.0, + "65": 2109139456.0, + "66": 2109139456.0, + "67": 2109139456.0, + "68": 2109139456.0, + "69": 2109139456.0, + "70": 2109139456.0, + "71": 2109139456.0, + "72": 2109139456.0, + "73": 2109139456.0, + "74": 2109139456.0, + "75": 2109139456.0, + "76": 2109139456.0, + "77": 2109139456.0, + "78": 2109139456.0, + "79": 2109139456.0, + "80": 2109139456.0, + "81": 2109139456.0, + "82": 2109139456.0, + "83": 2109139456.0, + "84": 2109139456.0, + "85": 2109139456.0, + "86": 2109139456.0, + "87": 2109897728.0, + "88": 2109897728.0, + "89": 2109897728.0, + "90": 2109897728.0, + "91": 2109897728.0, + "92": 2109897728.0, + "93": 2109897728.0, + "94": 2109897728.0, + "95": 2109897728.0, + "96": 2109897728.0, + "97": 2109897728.0, + "98": 2109897728.0, + "99": 2109897728.0, + "100": 2109897728.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.94584, + "2": 0.28148, + "3": 0.23092, + "4": 0.2272, + "5": 0.21174, + "6": 0.2052, + "7": 0.2177, + "8": 0.20762, + "9": 0.21011, + "10": 0.20762, + "11": 0.20739, + "12": 0.20558, + "13": 0.20293, + "14": 0.20366, + "15": 0.2151, + "16": 0.20336, + "17": 0.211, + "18": 0.20107, + "19": 0.19975, + "20": 0.19946, + "21": 0.20167, + "22": 0.20546, + "23": 0.2079, + "24": 0.21407, + "25": 0.20322, + "26": 0.20113, + "27": 0.2036, + "28": 0.20193, + "29": 0.20351, + "30": 0.20276, + "31": 0.24088, + "32": 0.20552, + "33": 0.2062, + "34": 0.22507, + "35": 0.21674, + "36": 0.20224, + "37": 0.2024, + "38": 0.20522, + "39": 0.20019, + "40": 0.20848, + "41": 0.20633, + "42": 0.20422, + "43": 0.22047, + "44": 0.21076, + "45": 0.22033, + "46": 0.23288, + "47": 0.20066, + "48": 0.2262, + "49": 0.25589, + "50": 0.2006, + "51": 0.21639, + "52": 0.23518, + "53": 0.20634, + "54": 0.20906, + "55": 0.22297, + "56": 0.2742, + "57": 0.23575, + "58": 0.21113, + "59": 0.21965, + "60": 0.21956, + "61": 0.20714, + "62": 0.20897, + "63": 0.21858, + "64": 0.21079, + "65": 0.26753, + "66": 0.2086, + "67": 0.2478, + "68": 0.22097, + "69": 0.20663, + "70": 0.20836, + "71": 0.22856, + "72": 0.21708, + "73": 0.24693, + "74": 0.23784, + "75": 0.21364, + "76": 0.23055, + "77": 0.20122, + "78": 0.21746, + "79": 0.21857, + "80": 0.22508, + "81": 0.21322, + "82": 0.21041, + "83": 0.24051, + "84": 0.26987, + "85": 0.27857, + "86": 0.28871, + "87": 0.24894, + "88": 0.21388, + "89": 0.22289, + "90": 0.20477, + "91": 0.22651, + "92": 0.21738, + "93": 0.22137, + "94": 0.23367, + "95": 0.21527, + "96": 0.20516, + "97": 0.22856, + "98": 0.20431, + "99": 0.21662, + "100": 0.2101 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..c598c8c5c86 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, + "5": 10.83259, + "6": 10.83704, + "7": 10.79793, + "8": 10.79364, + "9": 10.808, + "10": 10.76116, + "11": 10.85297, + "12": 10.84152, + "13": 10.8247, + "14": 10.85822, + "15": 10.78238, + "16": 10.77927, + "17": 10.74878, + "18": 10.7897, + "19": 10.7749, + "20": 10.71704, + "21": 10.70811, + "22": 10.54787, + "23": 10.72978, + "24": 10.60324, + "25": 10.55979, + "26": 10.61611, + "27": 10.6446, + "28": 10.62463, + "29": 10.63492, + "30": 10.42362, + "31": 10.16499, + "32": 10.51313, + "33": 10.5094, + "34": 10.2668, + "35": 10.32318, + "36": 10.28865, + "37": 10.41114, + "38": 10.26426, + "39": 10.45, + "40": 10.17473, + "41": 10.20958, + "42": 10.27824, + "43": 9.91831, + "44": 10.03131, + "45": 9.91995, + "46": 9.8862, + "47": 10.19255, + "48": 9.92803, + "49": 9.61616, + "50": 9.98532, + "51": 9.90528, + "52": 9.80364, + "53": 10.12728, + "54": 10.00036, + "55": 9.9362, + "56": 9.68506, + "57": 9.55805, + "58": 9.90514, + "59": 9.63857, + "60": 9.57451, + "61": 9.76864, + "62": 10.03802, + "63": 9.44503, + "64": 9.82796, + "65": 9.00712, + "66": 9.77422, + "67": 9.41277, + "68": 9.84111, + "69": 9.82784, + "70": 9.79011, + "71": 9.66957, + "72": 9.62799, + "73": 9.5473, + "74": 9.03663, + "75": 9.49153, + "76": 9.16783, + "77": 10.10857, + "78": 9.77081, + "79": 9.4383, + "80": 9.45436, + "81": 9.52266, + "82": 9.7424, + "83": 9.37076, + "84": 9.45377, + "85": 9.65832, + "86": 9.12522, + "87": 9.62697, + "88": 9.79619, + "89": 9.66054, + "90": 9.85081, + "91": 9.39408, + "92": 9.40744, + "93": 9.13595, + "94": 8.89048, + "95": 9.563, + "96": 9.5714, + "97": 9.34318, + "98": 9.73026, + "99": 8.95002, + "100": 9.4424 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, + "5": 36042.0, + "6": 37038.0, + "7": 34481.0, + "8": 31368.0, + "9": 33980.0, + "10": 29532.0, + "11": 37852.0, + "12": 34972.0, + "13": 36994.0, + "14": 37789.0, + "15": 34058.0, + "16": 36656.0, + "17": 34700.0, + "18": 34946.0, + "19": 35228.0, + "20": 32392.0, + "21": 33247.0, + "22": 30040.0, + "23": 37891.0, + "24": 32099.0, + "25": 30921.0, + "26": 34212.0, + "27": 34975.0, + "28": 36746.0, + "29": 37759.0, + "30": 32786.0, + "31": 30423.0, + "32": 35992.0, + "33": 36915.0, + "34": 32293.0, + "35": 33654.0, + "36": 34755.0, + "37": 37859.0, + "38": 36022.0, + "39": 38343.0, + "40": 35963.0, + "41": 35882.0, + "42": 36774.0, + "43": 34186.0, + "44": 33572.0, + "45": 35574.0, + "46": 37208.0, + "47": 40154.0, + "48": 36385.0, + "49": 36259.0, + "50": 38861.0, + "51": 38061.0, + "52": 37025.0, + "53": 41802.0, + "54": 41253.0, + "55": 37654.0, + "56": 41164.0, + "57": 37682.0, + "58": 41782.0, + "59": 39444.0, + "60": 40691.0, + "61": 40876.0, + "62": 43113.0, + "63": 38389.0, + "64": 43217.0, + "65": 41689.0, + "66": 45525.0, + "67": 41717.0, + "68": 40369.0, + "69": 41287.0, + "70": 45545.0, + "71": 41651.0, + "72": 41881.0, + "73": 45139.0, + "74": 35747.0, + "75": 39155.0, + "76": 44874.0, + "77": 45442.0, + "78": 46782.0, + "79": 48776.0, + "80": 47161.0, + "81": 51277.0, + "82": 49953.0, + "83": 45334.0, + "84": 46096.0, + "85": 49238.0, + "86": 46118.0, + "87": 49880.0, + "88": 47115.0, + "89": 48583.0, + "90": 49057.0, + "91": 45950.0, + "92": 47820.0, + "93": 46437.0, + "94": 47530.0, + "95": 48000.0, + "96": 50285.0, + "97": 46225.0, + "98": 49809.0, + "99": 47890.0, + "100": 44636.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 892864512.0, + "2": 892868608.0, + "3": 892868608.0, + "4": 892864512.0, + "5": 892865024.0, + "6": 892866560.0, + "7": 892866048.0, + "8": 892867584.0, + "9": 892865536.0, + "10": 892867584.0, + "11": 892866048.0, + "12": 892865536.0, + "13": 892865536.0, + "14": 892868096.0, + "15": 892867584.0, + "16": 892867072.0, + "17": 892867584.0, + "18": 892869632.0, + "19": 892868096.0, + "20": 892866560.0, + "21": 892866560.0, + "22": 892863488.0, + "23": 892864512.0, + "24": 892867072.0, + "25": 892863488.0, + "26": 892866560.0, + "27": 892867072.0, + "28": 892865536.0, + "29": 892866048.0, + "30": 892863488.0, + "31": 892862464.0, + "32": 892861952.0, + "33": 892866048.0, + "34": 892865536.0, + "35": 892865024.0, + "36": 892868608.0, + "37": 892867072.0, + "38": 892866560.0, + "39": 892866048.0, + "40": 892867072.0, + "41": 892865536.0, + "42": 892867584.0, + "43": 892861440.0, + "44": 892862976.0, + "45": 892865024.0, + "46": 892864512.0, + "47": 892865024.0, + "48": 892861440.0, + "49": 892863488.0, + "50": 892867072.0, + "51": 892860416.0, + "52": 892858880.0, + "53": 892861440.0, + "54": 892861440.0, + "55": 892862464.0, + "56": 892865024.0, + "57": 892857344.0, + "58": 892859392.0, + "59": 892858880.0, + "60": 892859904.0, + "61": 892868608.0, + "62": 892865536.0, + "63": 892861952.0, + "64": 892863488.0, + "65": 892851712.0, + "66": 892866048.0, + "67": 892861440.0, + "68": 892868608.0, + "69": 892864512.0, + "70": 892866560.0, + "71": 892868608.0, + "72": 892860416.0, + "73": 892868096.0, + "74": 892858368.0, + "75": 892867072.0, + "76": 892866560.0, + "77": 892867072.0, + "78": 892863488.0, + "79": 892864512.0, + "80": 892864512.0, + "81": 892866048.0, + "82": 892864000.0, + "83": 892860928.0, + "84": 892861440.0, + "85": 892861952.0, + "86": 892861440.0, + "87": 892870144.0, + "88": 892862464.0, + "89": 892864512.0, + "90": 892866048.0, + "91": 892867072.0, + "92": 892865536.0, + "93": 892868608.0, + "94": 892864512.0, + "95": 892865024.0, + "96": 892865024.0, + "97": 892862976.0, + "98": 892867584.0, + "99": 892859904.0, + "100": 892861952.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1867566080.0, + "2": 2107252736.0, + "3": 2107252736.0, + "4": 2107252736.0, + "5": 2107481600.0, + "6": 2107481600.0, + "7": 2107481600.0, + "8": 2107481600.0, + "9": 2107481600.0, + "10": 2108814336.0, + "11": 2108814336.0, + "12": 2108814336.0, + "13": 2108814336.0, + "14": 2108814336.0, + "15": 2108814336.0, + "16": 2109139456.0, + "17": 2109139456.0, + "18": 2109139456.0, + "19": 2109139456.0, + "20": 2109139456.0, + "21": 2109139456.0, + "22": 2109139456.0, + "23": 2109139456.0, + "24": 2109139456.0, + "25": 2109139456.0, + "26": 2109139456.0, + "27": 2109139456.0, + "28": 2109139456.0, + "29": 2109139456.0, + "30": 2109139456.0, + "31": 2109139456.0, + "32": 2109139456.0, + "33": 2109139456.0, + "34": 2109139456.0, + "35": 2109139456.0, + "36": 2109139456.0, + "37": 2109139456.0, + "38": 2109139456.0, + "39": 2109139456.0, + "40": 2109139456.0, + "41": 2109139456.0, + "42": 2109139456.0, + "43": 2109139456.0, + "44": 2109139456.0, + "45": 2109139456.0, + "46": 2109139456.0, + "47": 2109139456.0, + "48": 2109139456.0, + "49": 2109139456.0, + "50": 2109139456.0, + "51": 2109139456.0, + "52": 2109139456.0, + "53": 2109139456.0, + "54": 2109139456.0, + "55": 2109139456.0, + "56": 2109139456.0, + "57": 2109139456.0, + "58": 2109139456.0, + "59": 2109139456.0, + "60": 2109139456.0, + "61": 2109139456.0, + "62": 2109139456.0, + "63": 2109139456.0, + "64": 2109139456.0, + "65": 2109139456.0, + "66": 2109139456.0, + "67": 2109139456.0, + "68": 2109139456.0, + "69": 2109139456.0, + "70": 2109139456.0, + "71": 2109139456.0, + "72": 2109139456.0, + "73": 2109139456.0, + "74": 2109139456.0, + "75": 2109139456.0, + "76": 2109139456.0, + "77": 2109139456.0, + "78": 2109139456.0, + "79": 2109139456.0, + "80": 2109139456.0, + "81": 2109139456.0, + "82": 2109139456.0, + "83": 2109139456.0, + "84": 2109139456.0, + "85": 2109139456.0, + "86": 2109139456.0, + "87": 2109897728.0, + "88": 2109897728.0, + "89": 2109897728.0, + "90": 2109897728.0, + "91": 2109897728.0, + "92": 2109897728.0, + "93": 2109897728.0, + "94": 2109897728.0, + "95": 2109897728.0, + "96": 2109897728.0, + "97": 2109897728.0, + "98": 2109897728.0, + "99": 2109897728.0, + "100": 2109897728.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.1374, + "2": 0.29466, + "3": 0.26236, + "4": 0.26156, + "5": 0.24237, + "6": 0.23849, + "7": 0.252, + "8": 0.24427, + "9": 0.24029, + "10": 0.23618, + "11": 0.23659, + "12": 0.23342, + "13": 0.23316, + "14": 0.23233, + "15": 0.24856, + "16": 0.23522, + "17": 0.24126, + "18": 0.22751, + "19": 0.2299, + "20": 0.23346, + "21": 0.23441, + "22": 0.22921, + "23": 0.23376, + "24": 0.23927, + "25": 0.23185, + "26": 0.23099, + "27": 0.22756, + "28": 0.2284, + "29": 0.22889, + "30": 0.23032, + "31": 0.26621, + "32": 0.23553, + "33": 0.23683, + "34": 0.25808, + "35": 0.23912, + "36": 0.23198, + "37": 0.23086, + "38": 0.23515, + "39": 0.2291, + "40": 0.24108, + "41": 0.23663, + "42": 0.23631, + "43": 0.23891, + "44": 0.23205, + "45": 0.24801, + "46": 0.2689, + "47": 0.23258, + "48": 0.25079, + "49": 0.26858, + "50": 0.2361, + "51": 0.27052, + "52": 0.26801, + "53": 0.23804, + "54": 0.23998, + "55": 0.25008, + "56": 0.29894, + "57": 0.26807, + "58": 0.23939, + "59": 0.24845, + "60": 0.24835, + "61": 0.24071, + "62": 0.23697, + "63": 0.25187, + "64": 0.24293, + "65": 0.31273, + "66": 0.23771, + "67": 0.28851, + "68": 0.25834, + "69": 0.24387, + "70": 0.23624, + "71": 0.26612, + "72": 0.25067, + "73": 0.28048, + "74": 0.26617, + "75": 0.24822, + "76": 0.26459, + "77": 0.23429, + "78": 0.24496, + "79": 0.24741, + "80": 0.25523, + "81": 0.2433, + "82": 0.23696, + "83": 0.2421, + "84": 0.24973, + "85": 0.24316, + "86": 0.25585, + "87": 0.23448, + "88": 0.23245, + "89": 0.25191, + "90": 0.23373, + "91": 0.25927, + "92": 0.24203, + "93": 0.25124, + "94": 0.26498, + "95": 0.24482, + "96": 0.23378, + "97": 0.25053, + "98": 0.23165, + "99": 0.24761, + "100": 0.23858 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 0ff756ea400..0938c76ab04 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, "5": 10.83259, + "6": 10.83704, + "7": 10.79728, + "8": 10.79467, + "9": 10.80828, "10": 10.76154, + "11": 10.85384, + "12": 10.84189, + "13": 10.82465, + "14": 10.85824, "15": 10.78235, + "16": 10.77923, + "17": 10.7484, + "18": 10.78919, + "19": 10.77567, "20": 10.71707, + "21": 10.70767, + "22": 10.54782, + "23": 10.72977, + "24": 10.60346, "25": 10.55815, + "26": 10.61659, + "27": 10.6449, + "28": 10.62536, + "29": 10.6349, "30": 10.42303, + "31": 10.16459, + "32": 10.51284, + "33": 10.50836, + "34": 10.2667, "35": 10.32353, + "36": 10.2895, + "37": 10.41051, + "38": 10.26406, + "39": 10.44988, "40": 10.17537, + "41": 10.20908, + "42": 10.27843, + "43": 9.91808, + "44": 10.03128, "45": 9.92032, + "46": 9.88579, + "47": 10.19208, + "48": 9.92758, + "49": 9.61634, "50": 9.98512, + "51": 9.90532, + "52": 9.8039, + "53": 10.12749, + "54": 10.00016, "55": 9.93664, + "56": 9.68581, + "57": 9.55837, + "58": 9.90508, + "59": 9.63839, "60": 9.57464, + "61": 9.76841, + "62": 10.03826, + "63": 9.44553, + "64": 9.82755, "65": 9.00746, + "66": 9.77476, + "67": 9.41315, + "68": 9.84101, + "69": 9.8283, "70": 9.79049, + "71": 9.66947, + "72": 9.62799, + "73": 9.54696, + "74": 9.03684, "75": 9.49167, + "76": 9.16779, + "77": 10.1088, + "78": 9.77072, + "79": 9.43806, "80": 9.45438, + "81": 9.5225, + "82": 9.74228, + "83": 9.36999, + "84": 9.45397, "85": 9.65808, + "86": 9.12501, + "87": 9.62705, + "88": 9.79641, + "89": 9.66075, "90": 9.8512, + "91": 9.39414, + "92": 9.40741, + "93": 9.13573, + "94": 8.89066, "95": 9.56273, + "96": 9.5712, + "97": 9.34355, + "98": 9.73013, + "99": 8.95039, "100": 9.44212 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, "5": 36042.0, + "6": 36987.0, + "7": 34490.0, + "8": 31442.0, + "9": 33931.0, "10": 29993.0, + "11": 37681.0, + "12": 34978.0, + "13": 36675.0, + "14": 37601.0, "15": 34369.0, + "16": 36581.0, + "17": 34615.0, + "18": 34408.0, + "19": 35362.0, "20": 32532.0, + "21": 33181.0, + "22": 30426.0, + "23": 37807.0, + "24": 32299.0, "25": 30879.0, + "26": 33994.0, + "27": 34721.0, + "28": 36576.0, + "29": 37196.0, "30": 32443.0, + "31": 30177.0, + "32": 35948.0, + "33": 37549.0, + "34": 32243.0, "35": 33961.0, + "36": 34340.0, + "37": 37853.0, + "38": 35694.0, + "39": 38797.0, "40": 36317.0, + "41": 35380.0, + "42": 36704.0, + "43": 34045.0, + "44": 33691.0, "45": 35877.0, + "46": 36737.0, + "47": 40148.0, + "48": 36696.0, + "49": 36203.0, "50": 38688.0, + "51": 37791.0, + "52": 37021.0, + "53": 41944.0, + "54": 40947.0, "55": 37727.0, + "56": 40761.0, + "57": 37481.0, + "58": 41787.0, + "59": 39365.0, "60": 40922.0, + "61": 41100.0, + "62": 43388.0, + "63": 38269.0, + "64": 43526.0, "65": 41821.0, + "66": 44876.0, + "67": 42497.0, + "68": 39967.0, + "69": 41255.0, "70": 45781.0, + "71": 42348.0, + "72": 42151.0, + "73": 45043.0, + "74": 35705.0, "75": 39397.0, + "76": 45340.0, + "77": 45670.0, + "78": 46614.0, + "79": 49159.0, "80": 47317.0, + "81": 51048.0, + "82": 49312.0, + "83": 45257.0, + "84": 45494.0, "85": 49366.0, + "86": 45783.0, + "87": 50223.0, + "88": 47536.0, + "89": 48826.0, "90": 49499.0, + "91": 45726.0, + "92": 47926.0, + "93": 46433.0, + "94": 47675.0, "95": 47504.0, + "96": 50174.0, + "97": 46465.0, + "98": 49255.0, + "99": 48053.0, "100": 44507.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1254501376.0, + "2": 1254505472.0, + "3": 1254505472.0, + "4": 1254501376.0, "5": 1254501888.0, + "6": 1254503424.0, + "7": 1254503936.0, + "8": 1254503936.0, + "9": 1254501888.0, "10": 1254503424.0, + "11": 1254503936.0, + "12": 1254502912.0, + "13": 1254500864.0, + "14": 1254505472.0, "15": 1254504448.0, + "16": 1254503424.0, + "17": 1254504448.0, + "18": 1254502400.0, + "19": 1254503936.0, "20": 1254503424.0, + "21": 1254503424.0, + "22": 1254501376.0, + "23": 1254500864.0, + "24": 1254503424.0, "25": 1254500352.0, + "26": 1254502400.0, + "27": 1254501888.0, + "28": 1254502912.0, + "29": 1254505472.0, "30": 1254500352.0, + "31": 1254499328.0, + "32": 1254500352.0, + "33": 1254502912.0, + "34": 1254502912.0, "35": 1254501888.0, + "36": 1254505472.0, + "37": 1254503424.0, + "38": 1254503936.0, + "39": 1254502912.0, "40": 1254502912.0, + "41": 1254503424.0, + "42": 1254502912.0, + "43": 1254499840.0, + "44": 1254501376.0, "45": 1254502400.0, + "46": 1254500864.0, + "47": 1254503936.0, + "48": 1254499840.0, + "49": 1254500352.0, "50": 1254502912.0, + "51": 1254496768.0, + "52": 1254496256.0, + "53": 1254497792.0, + "54": 1254498304.0, "55": 1254500352.0, + "56": 1254501888.0, + "57": 1254493184.0, + "58": 1254498304.0, + "59": 1254495232.0, "60": 1254496768.0, + "61": 1254504960.0, + "62": 1254503936.0, + "63": 1254499328.0, + "64": 1254498816.0, "65": 1254488576.0, + "66": 1254502912.0, + "67": 1254498304.0, + "68": 1254505984.0, + "69": 1254501376.0, "70": 1254502912.0, + "71": 1254504960.0, + "72": 1254496256.0, + "73": 1254504448.0, + "74": 1254495232.0, "75": 1254504448.0, + "76": 1254503424.0, + "77": 1254503936.0, + "78": 1254500352.0, + "79": 1254500864.0, "80": 1254499840.0, + "81": 1254503424.0, + "82": 1254500352.0, + "83": 1254497792.0, + "84": 1254497280.0, "85": 1254499328.0, + "86": 1254498816.0, + "87": 1254505472.0, + "88": 1254499328.0, + "89": 1254500864.0, "90": 1254502912.0, + "91": 1254505472.0, + "92": 1254502912.0, + "93": 1254505472.0, + "94": 1254500352.0, "95": 1254501888.0, + "96": 1254501888.0, + "97": 1254499328.0, + "98": 1254507520.0, + "99": 1254497280.0, "100": 1254499840.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1987779584.0, + "2": 2468141568.0, + "3": 2468920320.0, + "4": 2468920320.0, "5": 2468920320.0, + "6": 2468920320.0, + "7": 2468920320.0, + "8": 2468920320.0, + "9": 2469234688.0, "10": 2469234688.0, + "11": 2469234688.0, + "12": 2469234688.0, + "13": 2469234688.0, + "14": 2469234688.0, "15": 2469234688.0, + "16": 2469234688.0, + "17": 2469234688.0, + "18": 2469234688.0, + "19": 2469234688.0, "20": 2469234688.0, + "21": 2469234688.0, + "22": 2469234688.0, + "23": 2469234688.0, + "24": 2469234688.0, "25": 2469234688.0, + "26": 2469234688.0, + "27": 2469234688.0, + "28": 2469234688.0, + "29": 2469234688.0, "30": 2469234688.0, + "31": 2469234688.0, + "32": 2469234688.0, + "33": 2469234688.0, + "34": 2469234688.0, "35": 2469234688.0, + "36": 2469234688.0, + "37": 2469234688.0, + "38": 2469234688.0, + "39": 2469234688.0, "40": 2469234688.0, + "41": 2469234688.0, + "42": 2469234688.0, + "43": 2469234688.0, + "44": 2469234688.0, "45": 2469234688.0, + "46": 2469234688.0, + "47": 2469234688.0, + "48": 2469234688.0, + "49": 2469234688.0, "50": 2469234688.0, + "51": 2469234688.0, + "52": 2469234688.0, + "53": 2469234688.0, + "54": 2469234688.0, "55": 2469234688.0, + "56": 2469234688.0, + "57": 2469234688.0, + "58": 2469234688.0, + "59": 2469234688.0, "60": 2469234688.0, + "61": 2469234688.0, + "62": 2469234688.0, + "63": 2469234688.0, + "64": 2469234688.0, "65": 2469234688.0, + "66": 2469234688.0, + "67": 2469234688.0, + "68": 2469234688.0, + "69": 2469234688.0, "70": 2469234688.0, + "71": 2469234688.0, + "72": 2469234688.0, + "73": 2469234688.0, + "74": 2469234688.0, "75": 2469234688.0, + "76": 2471084032.0, + "77": 2471084032.0, + "78": 2471084032.0, + "79": 2471084032.0, "80": 2471084032.0, + "81": 2471084032.0, + "82": 2471084032.0, + "83": 2471084032.0, + "84": 2471084032.0, "85": 2471084032.0, + "86": 2471084032.0, + "87": 2471084032.0, + "88": 2471084032.0, + "89": 2471084032.0, "90": 2471084032.0, + "91": 2471084032.0, + "92": 2471084032.0, + "93": 2471084032.0, + "94": 2471084032.0, "95": 2471084032.0, + "96": 2471084032.0, + "97": 2471084032.0, + "98": 2471084032.0, + "99": 2471084032.0, "100": 2471084032.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.69201, - "5": 0.22243, - "10": 0.21151, - "15": 0.23075, - "20": 0.20988, - "25": 0.20888, - "30": 0.20701, - "35": 0.21011, - "40": 0.23615, - "45": 0.23553, - "50": 0.21576, - "55": 0.22099, - "60": 0.21927, - "65": 0.27911, - "70": 0.2143, - "75": 0.22985, - "80": 0.22209, - "85": 0.21722, - "90": 0.21557, - "95": 0.22417, - "100": 0.21151 + "1": 16.45406, + "2": 0.30376, + "3": 0.27406, + "4": 0.26359, + "5": 0.25039, + "6": 0.25242, + "7": 0.26015, + "8": 0.2474, + "9": 0.25416, + "10": 0.2407, + "11": 0.24653, + "12": 0.23844, + "13": 0.2391, + "14": 0.2434, + "15": 0.25985, + "16": 0.24412, + "17": 0.25323, + "18": 0.24184, + "19": 0.23932, + "20": 0.23754, + "21": 0.23862, + "22": 0.24163, + "23": 0.24143, + "24": 0.23752, + "25": 0.23707, + "26": 0.24138, + "27": 0.23747, + "28": 0.2399, + "29": 0.2399, + "30": 0.24117, + "31": 0.28742, + "32": 0.24862, + "33": 0.24794, + "34": 0.28035, + "35": 0.24832, + "36": 0.24669, + "37": 0.23974, + "38": 0.25045, + "39": 0.239, + "40": 0.26253, + "41": 0.24423, + "42": 0.25718, + "43": 0.25559, + "44": 0.24336, + "45": 0.27381, + "46": 0.27372, + "47": 0.24664, + "48": 0.25954, + "49": 0.30788, + "50": 0.25811, + "51": 0.26735, + "52": 0.27368, + "53": 0.24833, + "54": 0.24973, + "55": 0.25579, + "56": 0.30268, + "57": 0.26237, + "58": 0.24805, + "59": 0.25916, + "60": 0.25631, + "61": 0.54796, + "62": 0.24754, + "63": 0.27021, + "64": 0.25819, + "65": 0.32296, + "66": 0.2505, + "67": 0.30141, + "68": 0.26641, + "69": 0.24765, + "70": 0.2537, + "71": 0.26961, + "72": 0.25601, + "73": 0.27973, + "74": 0.27306, + "75": 0.25761, + "76": 0.27858, + "77": 0.24804, + "78": 0.26307, + "79": 0.25987, + "80": 0.26126, + "81": 0.25077, + "82": 0.24475, + "83": 0.25581, + "84": 0.267, + "85": 0.25176, + "86": 0.2659, + "87": 0.24692, + "88": 0.24749, + "89": 0.26384, + "90": 0.24272, + "91": 0.26651, + "92": 0.25574, + "93": 0.26453, + "94": 0.27259, + "95": 0.25268, + "96": 0.24969, + "97": 0.2596, + "98": 0.24136, + "99": 0.25695, + "100": 0.25268 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..025cf16fd46 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, + "5": 10.83259, + "6": 10.83704, + "7": 10.79728, + "8": 10.79467, + "9": 10.80828, + "10": 10.76154, + "11": 10.85384, + "12": 10.84189, + "13": 10.82465, + "14": 10.85824, + "15": 10.78235, + "16": 10.77923, + "17": 10.7484, + "18": 10.78919, + "19": 10.77567, + "20": 10.71707, + "21": 10.70767, + "22": 10.54782, + "23": 10.72977, + "24": 10.60346, + "25": 10.55815, + "26": 10.61659, + "27": 10.6449, + "28": 10.62536, + "29": 10.6349, + "30": 10.42303, + "31": 10.16459, + "32": 10.51284, + "33": 10.50836, + "34": 10.2667, + "35": 10.32353, + "36": 10.2895, + "37": 10.41051, + "38": 10.26406, + "39": 10.44988, + "40": 10.17537, + "41": 10.20908, + "42": 10.27843, + "43": 9.91808, + "44": 10.03128, + "45": 9.92032, + "46": 9.88579, + "47": 10.19208, + "48": 9.92758, + "49": 9.61634, + "50": 9.98512, + "51": 9.90532, + "52": 9.8039, + "53": 10.12749, + "54": 10.00016, + "55": 9.93664, + "56": 9.68581, + "57": 9.55837, + "58": 9.90508, + "59": 9.63839, + "60": 9.57464, + "61": 9.76841, + "62": 10.03826, + "63": 9.44553, + "64": 9.82755, + "65": 9.00746, + "66": 9.77476, + "67": 9.41315, + "68": 9.84101, + "69": 9.8283, + "70": 9.79049, + "71": 9.66947, + "72": 9.62799, + "73": 9.54696, + "74": 9.03684, + "75": 9.49167, + "76": 9.16779, + "77": 10.1088, + "78": 9.77072, + "79": 9.43806, + "80": 9.45438, + "81": 9.5225, + "82": 9.74228, + "83": 9.36999, + "84": 9.45397, + "85": 9.65808, + "86": 9.12501, + "87": 9.62705, + "88": 9.79641, + "89": 9.66075, + "90": 9.8512, + "91": 9.39414, + "92": 9.40741, + "93": 9.13573, + "94": 8.89066, + "95": 9.56273, + "96": 9.5712, + "97": 9.34355, + "98": 9.73013, + "99": 8.95039, + "100": 9.44212 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, + "5": 36042.0, + "6": 36987.0, + "7": 34490.0, + "8": 31442.0, + "9": 33931.0, + "10": 29993.0, + "11": 37681.0, + "12": 34978.0, + "13": 36675.0, + "14": 37601.0, + "15": 34369.0, + "16": 36581.0, + "17": 34615.0, + "18": 34408.0, + "19": 35362.0, + "20": 32532.0, + "21": 33181.0, + "22": 30426.0, + "23": 37807.0, + "24": 32299.0, + "25": 30879.0, + "26": 33994.0, + "27": 34721.0, + "28": 36576.0, + "29": 37196.0, + "30": 32443.0, + "31": 30177.0, + "32": 35948.0, + "33": 37549.0, + "34": 32243.0, + "35": 33961.0, + "36": 34340.0, + "37": 37853.0, + "38": 35694.0, + "39": 38797.0, + "40": 36317.0, + "41": 35380.0, + "42": 36704.0, + "43": 34045.0, + "44": 33691.0, + "45": 35877.0, + "46": 36737.0, + "47": 40148.0, + "48": 36696.0, + "49": 36203.0, + "50": 38688.0, + "51": 37791.0, + "52": 37021.0, + "53": 41944.0, + "54": 40947.0, + "55": 37727.0, + "56": 40761.0, + "57": 37481.0, + "58": 41787.0, + "59": 39365.0, + "60": 40922.0, + "61": 41100.0, + "62": 43388.0, + "63": 38269.0, + "64": 43526.0, + "65": 41821.0, + "66": 44876.0, + "67": 42497.0, + "68": 39967.0, + "69": 41255.0, + "70": 45781.0, + "71": 42348.0, + "72": 42151.0, + "73": 45043.0, + "74": 35705.0, + "75": 39397.0, + "76": 45340.0, + "77": 45670.0, + "78": 46614.0, + "79": 49159.0, + "80": 47317.0, + "81": 51048.0, + "82": 49312.0, + "83": 45257.0, + "84": 45494.0, + "85": 49366.0, + "86": 45783.0, + "87": 50223.0, + "88": 47536.0, + "89": 48826.0, + "90": 49499.0, + "91": 45726.0, + "92": 47926.0, + "93": 46433.0, + "94": 47675.0, + "95": 47504.0, + "96": 50174.0, + "97": 46465.0, + "98": 49255.0, + "99": 48053.0, + "100": 44507.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1254501376.0, + "2": 1254505472.0, + "3": 1254505472.0, + "4": 1254501376.0, + "5": 1254501888.0, + "6": 1254503424.0, + "7": 1254503936.0, + "8": 1254503936.0, + "9": 1254501888.0, + "10": 1254503424.0, + "11": 1254503936.0, + "12": 1254502912.0, + "13": 1254500864.0, + "14": 1254505472.0, + "15": 1254504448.0, + "16": 1254503424.0, + "17": 1254504448.0, + "18": 1254502400.0, + "19": 1254503936.0, + "20": 1254503424.0, + "21": 1254503424.0, + "22": 1254501376.0, + "23": 1254500864.0, + "24": 1254503424.0, + "25": 1254500352.0, + "26": 1254502400.0, + "27": 1254501888.0, + "28": 1254502912.0, + "29": 1254505472.0, + "30": 1254500352.0, + "31": 1254499328.0, + "32": 1254500352.0, + "33": 1254502912.0, + "34": 1254502912.0, + "35": 1254501888.0, + "36": 1254505472.0, + "37": 1254503424.0, + "38": 1254503936.0, + "39": 1254502912.0, + "40": 1254502912.0, + "41": 1254503424.0, + "42": 1254502912.0, + "43": 1254499840.0, + "44": 1254501376.0, + "45": 1254502400.0, + "46": 1254500864.0, + "47": 1254503936.0, + "48": 1254499840.0, + "49": 1254500352.0, + "50": 1254502912.0, + "51": 1254496768.0, + "52": 1254496256.0, + "53": 1254497792.0, + "54": 1254498304.0, + "55": 1254500352.0, + "56": 1254501888.0, + "57": 1254493184.0, + "58": 1254498304.0, + "59": 1254495232.0, + "60": 1254496768.0, + "61": 1254504960.0, + "62": 1254503936.0, + "63": 1254499328.0, + "64": 1254498816.0, + "65": 1254488576.0, + "66": 1254502912.0, + "67": 1254498304.0, + "68": 1254505984.0, + "69": 1254501376.0, + "70": 1254502912.0, + "71": 1254504960.0, + "72": 1254496256.0, + "73": 1254504448.0, + "74": 1254495232.0, + "75": 1254504448.0, + "76": 1254503424.0, + "77": 1254503936.0, + "78": 1254500352.0, + "79": 1254500864.0, + "80": 1254499840.0, + "81": 1254503424.0, + "82": 1254500352.0, + "83": 1254497792.0, + "84": 1254497280.0, + "85": 1254499328.0, + "86": 1254498816.0, + "87": 1254505472.0, + "88": 1254499328.0, + "89": 1254500864.0, + "90": 1254502912.0, + "91": 1254505472.0, + "92": 1254502912.0, + "93": 1254505472.0, + "94": 1254500352.0, + "95": 1254501888.0, + "96": 1254501888.0, + "97": 1254499328.0, + "98": 1254507520.0, + "99": 1254497280.0, + "100": 1254499840.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1987779584.0, + "2": 2468141568.0, + "3": 2468920320.0, + "4": 2468920320.0, + "5": 2468920320.0, + "6": 2468920320.0, + "7": 2468920320.0, + "8": 2468920320.0, + "9": 2469234688.0, + "10": 2469234688.0, + "11": 2469234688.0, + "12": 2469234688.0, + "13": 2469234688.0, + "14": 2469234688.0, + "15": 2469234688.0, + "16": 2469234688.0, + "17": 2469234688.0, + "18": 2469234688.0, + "19": 2469234688.0, + "20": 2469234688.0, + "21": 2469234688.0, + "22": 2469234688.0, + "23": 2469234688.0, + "24": 2469234688.0, + "25": 2469234688.0, + "26": 2469234688.0, + "27": 2469234688.0, + "28": 2469234688.0, + "29": 2469234688.0, + "30": 2469234688.0, + "31": 2469234688.0, + "32": 2469234688.0, + "33": 2469234688.0, + "34": 2469234688.0, + "35": 2469234688.0, + "36": 2469234688.0, + "37": 2469234688.0, + "38": 2469234688.0, + "39": 2469234688.0, + "40": 2469234688.0, + "41": 2469234688.0, + "42": 2469234688.0, + "43": 2469234688.0, + "44": 2469234688.0, + "45": 2469234688.0, + "46": 2469234688.0, + "47": 2469234688.0, + "48": 2469234688.0, + "49": 2469234688.0, + "50": 2469234688.0, + "51": 2469234688.0, + "52": 2469234688.0, + "53": 2469234688.0, + "54": 2469234688.0, + "55": 2469234688.0, + "56": 2469234688.0, + "57": 2469234688.0, + "58": 2469234688.0, + "59": 2469234688.0, + "60": 2469234688.0, + "61": 2469234688.0, + "62": 2469234688.0, + "63": 2469234688.0, + "64": 2469234688.0, + "65": 2469234688.0, + "66": 2469234688.0, + "67": 2469234688.0, + "68": 2469234688.0, + "69": 2469234688.0, + "70": 2469234688.0, + "71": 2469234688.0, + "72": 2469234688.0, + "73": 2469234688.0, + "74": 2469234688.0, + "75": 2469234688.0, + "76": 2471084032.0, + "77": 2471084032.0, + "78": 2471084032.0, + "79": 2471084032.0, + "80": 2471084032.0, + "81": 2471084032.0, + "82": 2471084032.0, + "83": 2471084032.0, + "84": 2471084032.0, + "85": 2471084032.0, + "86": 2471084032.0, + "87": 2471084032.0, + "88": 2471084032.0, + "89": 2471084032.0, + "90": 2471084032.0, + "91": 2471084032.0, + "92": 2471084032.0, + "93": 2471084032.0, + "94": 2471084032.0, + "95": 2471084032.0, + "96": 2471084032.0, + "97": 2471084032.0, + "98": 2471084032.0, + "99": 2471084032.0, + "100": 2471084032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 15.17389, + "2": 0.29264, + "3": 0.24602, + "4": 0.24527, + "5": 0.22453, + "6": 0.22311, + "7": 0.23274, + "8": 0.2252, + "9": 0.22875, + "10": 0.21336, + "11": 0.21953, + "12": 0.21057, + "13": 0.21762, + "14": 0.22015, + "15": 0.22934, + "16": 0.21241, + "17": 0.22416, + "18": 0.21545, + "19": 0.21467, + "20": 0.21475, + "21": 0.21061, + "22": 0.21275, + "23": 0.21475, + "24": 0.21185, + "25": 0.21253, + "26": 0.2112, + "27": 0.21285, + "28": 0.2167, + "29": 0.20854, + "30": 0.21576, + "31": 0.23787, + "32": 0.21289, + "33": 0.22111, + "34": 0.23768, + "35": 0.2106, + "36": 0.22199, + "37": 0.21758, + "38": 0.21584, + "39": 0.21031, + "40": 0.2149, + "41": 0.21829, + "42": 0.2324, + "43": 0.21985, + "44": 0.21241, + "45": 0.23011, + "46": 0.23336, + "47": 0.21312, + "48": 0.2234, + "49": 0.24557, + "50": 0.21111, + "51": 0.25988, + "52": 0.23849, + "53": 0.21639, + "54": 0.21699, + "55": 0.22888, + "56": 0.30406, + "57": 0.23464, + "58": 0.23245, + "59": 0.22402, + "60": 0.22789, + "61": 0.21859, + "62": 0.21793, + "63": 0.25413, + "64": 0.23301, + "65": 0.2935, + "66": 0.22039, + "67": 0.3074, + "68": 0.2458, + "69": 0.21734, + "70": 0.21543, + "71": 0.23323, + "72": 0.22846, + "73": 0.25747, + "74": 0.23067, + "75": 0.21956, + "76": 0.24584, + "77": 0.222, + "78": 0.22595, + "79": 0.23137, + "80": 0.22335, + "81": 0.22154, + "82": 0.21547, + "83": 0.22443, + "84": 0.22286, + "85": 0.22074, + "86": 0.2341, + "87": 0.21707, + "88": 0.21529, + "89": 0.2232, + "90": 0.21712, + "91": 0.23519, + "92": 0.22408, + "93": 0.23443, + "94": 0.24578, + "95": 0.22228, + "96": 0.21797, + "97": 0.22197, + "98": 0.21363, + "99": 0.22332, + "100": 0.22233 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..1a09e73e300 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.78091, + "2": 10.80272, + "3": 10.8036, + "4": 10.77566, + "5": 10.83259, + "6": 10.83704, + "7": 10.79728, + "8": 10.79467, + "9": 10.80828, + "10": 10.76154, + "11": 10.85384, + "12": 10.84189, + "13": 10.82465, + "14": 10.85824, + "15": 10.78235, + "16": 10.77923, + "17": 10.7484, + "18": 10.78919, + "19": 10.77567, + "20": 10.71707, + "21": 10.70767, + "22": 10.54782, + "23": 10.72977, + "24": 10.60346, + "25": 10.55815, + "26": 10.61659, + "27": 10.6449, + "28": 10.62536, + "29": 10.6349, + "30": 10.42303, + "31": 10.16459, + "32": 10.51284, + "33": 10.50836, + "34": 10.2667, + "35": 10.32353, + "36": 10.2895, + "37": 10.41051, + "38": 10.26406, + "39": 10.44988, + "40": 10.17537, + "41": 10.20908, + "42": 10.27843, + "43": 9.91808, + "44": 10.03128, + "45": 9.92032, + "46": 9.88579, + "47": 10.19208, + "48": 9.92758, + "49": 9.61634, + "50": 9.98512, + "51": 9.90532, + "52": 9.8039, + "53": 10.12749, + "54": 10.00016, + "55": 9.93664, + "56": 9.68581, + "57": 9.55837, + "58": 9.90508, + "59": 9.63839, + "60": 9.57464, + "61": 9.76841, + "62": 10.03826, + "63": 9.44553, + "64": 9.82755, + "65": 9.00746, + "66": 9.77476, + "67": 9.41315, + "68": 9.84101, + "69": 9.8283, + "70": 9.79049, + "71": 9.66947, + "72": 9.62799, + "73": 9.54696, + "74": 9.03684, + "75": 9.49167, + "76": 9.16779, + "77": 10.1088, + "78": 9.77072, + "79": 9.43806, + "80": 9.45438, + "81": 9.5225, + "82": 9.74228, + "83": 9.36999, + "84": 9.45397, + "85": 9.65808, + "86": 9.12501, + "87": 9.62705, + "88": 9.79641, + "89": 9.66075, + "90": 9.8512, + "91": 9.39414, + "92": 9.40741, + "93": 9.13573, + "94": 8.89066, + "95": 9.56273, + "96": 9.5712, + "97": 9.34355, + "98": 9.73013, + "99": 8.95039, + "100": 9.44212 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 30994.0, + "2": 32962.0, + "3": 33026.0, + "4": 30732.0, + "5": 36042.0, + "6": 36987.0, + "7": 34490.0, + "8": 31442.0, + "9": 33931.0, + "10": 29993.0, + "11": 37681.0, + "12": 34978.0, + "13": 36675.0, + "14": 37601.0, + "15": 34369.0, + "16": 36581.0, + "17": 34615.0, + "18": 34408.0, + "19": 35362.0, + "20": 32532.0, + "21": 33181.0, + "22": 30426.0, + "23": 37807.0, + "24": 32299.0, + "25": 30879.0, + "26": 33994.0, + "27": 34721.0, + "28": 36576.0, + "29": 37196.0, + "30": 32443.0, + "31": 30177.0, + "32": 35948.0, + "33": 37549.0, + "34": 32243.0, + "35": 33961.0, + "36": 34340.0, + "37": 37853.0, + "38": 35694.0, + "39": 38797.0, + "40": 36317.0, + "41": 35380.0, + "42": 36704.0, + "43": 34045.0, + "44": 33691.0, + "45": 35877.0, + "46": 36737.0, + "47": 40148.0, + "48": 36696.0, + "49": 36203.0, + "50": 38688.0, + "51": 37791.0, + "52": 37021.0, + "53": 41944.0, + "54": 40947.0, + "55": 37727.0, + "56": 40761.0, + "57": 37481.0, + "58": 41787.0, + "59": 39365.0, + "60": 40922.0, + "61": 41100.0, + "62": 43388.0, + "63": 38269.0, + "64": 43526.0, + "65": 41821.0, + "66": 44876.0, + "67": 42497.0, + "68": 39967.0, + "69": 41255.0, + "70": 45781.0, + "71": 42348.0, + "72": 42151.0, + "73": 45043.0, + "74": 35705.0, + "75": 39397.0, + "76": 45340.0, + "77": 45670.0, + "78": 46614.0, + "79": 49159.0, + "80": 47317.0, + "81": 51048.0, + "82": 49312.0, + "83": 45257.0, + "84": 45494.0, + "85": 49366.0, + "86": 45783.0, + "87": 50223.0, + "88": 47536.0, + "89": 48826.0, + "90": 49499.0, + "91": 45726.0, + "92": 47926.0, + "93": 46433.0, + "94": 47675.0, + "95": 47504.0, + "96": 50174.0, + "97": 46465.0, + "98": 49255.0, + "99": 48053.0, + "100": 44507.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1254501376.0, + "2": 1254505472.0, + "3": 1254505472.0, + "4": 1254501376.0, + "5": 1254501888.0, + "6": 1254503424.0, + "7": 1254503936.0, + "8": 1254503936.0, + "9": 1254501888.0, + "10": 1254503424.0, + "11": 1254503936.0, + "12": 1254502912.0, + "13": 1254500864.0, + "14": 1254505472.0, + "15": 1254504448.0, + "16": 1254503424.0, + "17": 1254504448.0, + "18": 1254502400.0, + "19": 1254503936.0, + "20": 1254503424.0, + "21": 1254503424.0, + "22": 1254501376.0, + "23": 1254500864.0, + "24": 1254503424.0, + "25": 1254500352.0, + "26": 1254502400.0, + "27": 1254501888.0, + "28": 1254502912.0, + "29": 1254505472.0, + "30": 1254500352.0, + "31": 1254499328.0, + "32": 1254500352.0, + "33": 1254502912.0, + "34": 1254502912.0, + "35": 1254501888.0, + "36": 1254505472.0, + "37": 1254503424.0, + "38": 1254503936.0, + "39": 1254502912.0, + "40": 1254502912.0, + "41": 1254503424.0, + "42": 1254502912.0, + "43": 1254499840.0, + "44": 1254501376.0, + "45": 1254502400.0, + "46": 1254500864.0, + "47": 1254503936.0, + "48": 1254499840.0, + "49": 1254500352.0, + "50": 1254502912.0, + "51": 1254496768.0, + "52": 1254496256.0, + "53": 1254497792.0, + "54": 1254498304.0, + "55": 1254500352.0, + "56": 1254501888.0, + "57": 1254493184.0, + "58": 1254498304.0, + "59": 1254495232.0, + "60": 1254496768.0, + "61": 1254504960.0, + "62": 1254503936.0, + "63": 1254499328.0, + "64": 1254498816.0, + "65": 1254488576.0, + "66": 1254502912.0, + "67": 1254498304.0, + "68": 1254505984.0, + "69": 1254501376.0, + "70": 1254502912.0, + "71": 1254504960.0, + "72": 1254496256.0, + "73": 1254504448.0, + "74": 1254495232.0, + "75": 1254504448.0, + "76": 1254503424.0, + "77": 1254503936.0, + "78": 1254500352.0, + "79": 1254500864.0, + "80": 1254499840.0, + "81": 1254503424.0, + "82": 1254500352.0, + "83": 1254497792.0, + "84": 1254497280.0, + "85": 1254499328.0, + "86": 1254498816.0, + "87": 1254505472.0, + "88": 1254499328.0, + "89": 1254500864.0, + "90": 1254502912.0, + "91": 1254505472.0, + "92": 1254502912.0, + "93": 1254505472.0, + "94": 1254500352.0, + "95": 1254501888.0, + "96": 1254501888.0, + "97": 1254499328.0, + "98": 1254507520.0, + "99": 1254497280.0, + "100": 1254499840.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1987779584.0, + "2": 2468141568.0, + "3": 2468920320.0, + "4": 2468920320.0, + "5": 2468920320.0, + "6": 2468920320.0, + "7": 2468920320.0, + "8": 2468920320.0, + "9": 2469234688.0, + "10": 2469234688.0, + "11": 2469234688.0, + "12": 2469234688.0, + "13": 2469234688.0, + "14": 2469234688.0, + "15": 2469234688.0, + "16": 2469234688.0, + "17": 2469234688.0, + "18": 2469234688.0, + "19": 2469234688.0, + "20": 2469234688.0, + "21": 2469234688.0, + "22": 2469234688.0, + "23": 2469234688.0, + "24": 2469234688.0, + "25": 2469234688.0, + "26": 2469234688.0, + "27": 2469234688.0, + "28": 2469234688.0, + "29": 2469234688.0, + "30": 2469234688.0, + "31": 2469234688.0, + "32": 2469234688.0, + "33": 2469234688.0, + "34": 2469234688.0, + "35": 2469234688.0, + "36": 2469234688.0, + "37": 2469234688.0, + "38": 2469234688.0, + "39": 2469234688.0, + "40": 2469234688.0, + "41": 2469234688.0, + "42": 2469234688.0, + "43": 2469234688.0, + "44": 2469234688.0, + "45": 2469234688.0, + "46": 2469234688.0, + "47": 2469234688.0, + "48": 2469234688.0, + "49": 2469234688.0, + "50": 2469234688.0, + "51": 2469234688.0, + "52": 2469234688.0, + "53": 2469234688.0, + "54": 2469234688.0, + "55": 2469234688.0, + "56": 2469234688.0, + "57": 2469234688.0, + "58": 2469234688.0, + "59": 2469234688.0, + "60": 2469234688.0, + "61": 2469234688.0, + "62": 2469234688.0, + "63": 2469234688.0, + "64": 2469234688.0, + "65": 2469234688.0, + "66": 2469234688.0, + "67": 2469234688.0, + "68": 2469234688.0, + "69": 2469234688.0, + "70": 2469234688.0, + "71": 2469234688.0, + "72": 2469234688.0, + "73": 2469234688.0, + "74": 2469234688.0, + "75": 2469234688.0, + "76": 2471084032.0, + "77": 2471084032.0, + "78": 2471084032.0, + "79": 2471084032.0, + "80": 2471084032.0, + "81": 2471084032.0, + "82": 2471084032.0, + "83": 2471084032.0, + "84": 2471084032.0, + "85": 2471084032.0, + "86": 2471084032.0, + "87": 2471084032.0, + "88": 2471084032.0, + "89": 2471084032.0, + "90": 2471084032.0, + "91": 2471084032.0, + "92": 2471084032.0, + "93": 2471084032.0, + "94": 2471084032.0, + "95": 2471084032.0, + "96": 2471084032.0, + "97": 2471084032.0, + "98": 2471084032.0, + "99": 2471084032.0, + "100": 2471084032.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 16.55217, + "2": 0.35181, + "3": 0.30566, + "4": 0.27474, + "5": 0.25821, + "6": 0.24756, + "7": 0.26543, + "8": 0.25377, + "9": 0.25669, + "10": 0.24857, + "11": 0.25265, + "12": 0.25052, + "13": 0.25023, + "14": 0.24925, + "15": 0.26244, + "16": 0.25012, + "17": 0.26253, + "18": 0.24643, + "19": 0.24809, + "20": 0.24556, + "21": 0.24394, + "22": 0.251, + "23": 0.24828, + "24": 0.24669, + "25": 0.24387, + "26": 0.24678, + "27": 0.24651, + "28": 0.25139, + "29": 0.24752, + "30": 0.24424, + "31": 0.28311, + "32": 0.25225, + "33": 0.24909, + "34": 0.26885, + "35": 0.25395, + "36": 0.2523, + "37": 0.24797, + "38": 0.25223, + "39": 0.24992, + "40": 0.25852, + "41": 0.24878, + "42": 0.2538, + "43": 0.2597, + "44": 0.24622, + "45": 0.26158, + "46": 0.27295, + "47": 0.2509, + "48": 0.26644, + "49": 0.28407, + "50": 0.25557, + "51": 0.26677, + "52": 0.27657, + "53": 0.25511, + "54": 0.25626, + "55": 0.26088, + "56": 0.30712, + "57": 0.27149, + "58": 0.25315, + "59": 0.26247, + "60": 0.26163, + "61": 0.25105, + "62": 0.24787, + "63": 0.27859, + "64": 0.26395, + "65": 0.32678, + "66": 0.25441, + "67": 0.30841, + "68": 0.27583, + "69": 0.2474, + "70": 0.25895, + "71": 0.27463, + "72": 0.26044, + "73": 0.27953, + "74": 0.27908, + "75": 0.26127, + "76": 0.28492, + "77": 0.25287, + "78": 0.26927, + "79": 0.26632, + "80": 0.26465, + "81": 0.25418, + "82": 0.25, + "83": 0.26012, + "84": 0.27232, + "85": 0.25707, + "86": 0.26564, + "87": 0.25446, + "88": 0.24718, + "89": 0.26899, + "90": 0.24357, + "91": 0.27455, + "92": 0.25494, + "93": 0.26852, + "94": 0.27917, + "95": 0.258, + "96": 0.25134, + "97": 0.26377, + "98": 0.24669, + "99": 0.26096, + "100": 0.25411 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 36909804253..7688d6ec4ea 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, "50": 10.01296 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, "50": 38996.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, "50": 1027084800.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, "50": 3249972736.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 14.77721, - "5": 0.21434, - "10": 0.20442, - "15": 0.2258, - "20": 0.19737, - "25": 0.19707, - "30": 0.20038, - "35": 0.19865, - "40": 0.22651, - "45": 0.21953, - "50": 0.20317 + "1": 13.191, + "2": 0.30069, + "3": 0.25544, + "4": 0.25726, + "5": 0.25285, + "6": 0.23678, + "7": 0.24206, + "8": 0.23892, + "9": 0.23754, + "10": 0.23806, + "11": 0.22979, + "12": 0.23562, + "13": 0.24016, + "14": 0.22801, + "15": 0.25436, + "16": 0.23327, + "17": 0.24589, + "18": 0.23141, + "19": 0.23961, + "20": 0.23003, + "21": 0.22997, + "22": 0.23267, + "23": 0.22726, + "24": 0.22991, + "25": 0.22721, + "26": 0.23348, + "27": 0.23492, + "28": 0.22428, + "29": 0.23121, + "30": 0.23005, + "31": 0.27744, + "32": 0.22525, + "33": 0.22626, + "34": 0.26339, + "35": 0.23208, + "36": 0.24495, + "37": 0.22722, + "38": 0.23099, + "39": 0.22752, + "40": 0.25494, + "41": 0.24054, + "42": 0.22921, + "43": 0.249, + "44": 0.2389, + "45": 0.24525, + "46": 0.26032, + "47": 0.22841, + "48": 0.26262, + "49": 0.30096, + "50": 0.2341 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..275dd98287a --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, + "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, + "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, + "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, + "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, + "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, + "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, + "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, + "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, + "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, + "50": 10.01296 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, + "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, + "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, + "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, + "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, + "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, + "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, + "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, + "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, + "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, + "50": 38996.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, + "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, + "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, + "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, + "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, + "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, + "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, + "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, + "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, + "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, + "50": 1027084800.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, + "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, + "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, + "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, + "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, + "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, + "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, + "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, + "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, + "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, + "50": 3249972736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.71692, + "2": 0.26373, + "3": 0.22224, + "4": 0.22077, + "5": 0.21189, + "6": 0.20289, + "7": 0.21135, + "8": 0.20381, + "9": 0.19968, + "10": 0.20492, + "11": 0.19946, + "12": 0.20155, + "13": 0.20199, + "14": 0.19656, + "15": 0.22053, + "16": 0.20059, + "17": 0.21367, + "18": 0.19607, + "19": 0.20515, + "20": 0.19743, + "21": 0.19704, + "22": 0.20196, + "23": 0.19722, + "24": 0.20083, + "25": 0.19715, + "26": 0.19715, + "27": 0.19781, + "28": 0.19694, + "29": 0.20125, + "30": 0.19779, + "31": 0.23471, + "32": 0.19855, + "33": 0.19914, + "34": 0.22545, + "35": 0.19732, + "36": 0.21424, + "37": 0.19385, + "38": 0.2012, + "39": 0.19477, + "40": 0.21557, + "41": 0.20631, + "42": 0.20013, + "43": 0.20558, + "44": 0.2055, + "45": 0.2088, + "46": 0.21767, + "47": 0.19618, + "48": 0.22507, + "49": 0.24168, + "50": 0.19817 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..089545b6f4a --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, + "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, + "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, + "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, + "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, + "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, + "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, + "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, + "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, + "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, + "50": 10.01296 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, + "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, + "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, + "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, + "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, + "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, + "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, + "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, + "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, + "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, + "50": 38996.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, + "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, + "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, + "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, + "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, + "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, + "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, + "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, + "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, + "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, + "50": 1027084800.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, + "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, + "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, + "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, + "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, + "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, + "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, + "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, + "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, + "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, + "50": 3249972736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.20887, + "2": 0.29449, + "3": 0.26099, + "4": 0.25199, + "5": 0.24285, + "6": 0.23658, + "7": 0.24248, + "8": 0.23258, + "9": 0.22661, + "10": 0.23769, + "11": 0.22933, + "12": 0.23288, + "13": 0.23074, + "14": 0.22376, + "15": 0.25054, + "16": 0.22881, + "17": 0.23932, + "18": 0.22427, + "19": 0.23467, + "20": 0.22747, + "21": 0.22662, + "22": 0.22866, + "23": 0.22726, + "24": 0.22901, + "25": 0.22654, + "26": 0.22683, + "27": 0.22909, + "28": 0.2264, + "29": 0.23339, + "30": 0.23066, + "31": 0.27285, + "32": 0.22966, + "33": 0.23016, + "34": 0.24956, + "35": 0.23114, + "36": 0.24161, + "37": 0.22585, + "38": 0.23047, + "39": 0.22695, + "40": 0.24845, + "41": 0.23491, + "42": 0.22656, + "43": 0.23744, + "44": 0.23602, + "45": 0.24859, + "46": 0.25828, + "47": 0.2367, + "48": 0.2564, + "49": 0.27812, + "50": 0.23401 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..96602c602c1 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8028, + "2": 10.82515, + "3": 10.81853, + "4": 10.80334, + "5": 10.85317, + "6": 10.86077, + "7": 10.83004, + "8": 10.82041, + "9": 10.8343, + "10": 10.79253, + "11": 10.86874, + "12": 10.84623, + "13": 10.85032, + "14": 10.87276, + "15": 10.81762, + "16": 10.80827, + "17": 10.78057, + "18": 10.80212, + "19": 10.80623, + "20": 10.74263, + "21": 10.72129, + "22": 10.60064, + "23": 10.73585, + "24": 10.62773, + "25": 10.58726, + "26": 10.64479, + "27": 10.65744, + "28": 10.633, + "29": 10.64664, + "30": 10.43425, + "31": 10.20993, + "32": 10.52274, + "33": 10.5182, + "34": 10.30593, + "35": 10.35057, + "36": 10.32257, + "37": 10.42006, + "38": 10.28232, + "39": 10.47402, + "40": 10.18634, + "41": 10.22711, + "42": 10.29407, + "43": 9.96562, + "44": 10.07121, + "45": 9.95891, + "46": 9.92944, + "47": 10.23158, + "48": 9.96456, + "49": 9.6648, + "50": 10.0194 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31321.0, + "2": 33507.0, + "3": 33742.0, + "4": 31142.0, + "5": 36637.0, + "6": 37952.0, + "7": 35367.0, + "8": 31793.0, + "9": 34742.0, + "10": 30318.0, + "11": 38311.0, + "12": 35873.0, + "13": 37077.0, + "14": 38139.0, + "15": 35096.0, + "16": 36153.0, + "17": 34599.0, + "18": 35615.0, + "19": 36094.0, + "20": 33013.0, + "21": 33392.0, + "22": 30732.0, + "23": 37995.0, + "24": 32271.0, + "25": 30677.0, + "26": 34406.0, + "27": 35346.0, + "28": 37369.0, + "29": 38116.0, + "30": 32775.0, + "31": 30305.0, + "32": 36349.0, + "33": 38243.0, + "34": 33070.0, + "35": 34420.0, + "36": 34971.0, + "37": 38372.0, + "38": 36065.0, + "39": 38349.0, + "40": 36074.0, + "41": 36445.0, + "42": 37346.0, + "43": 33959.0, + "44": 33566.0, + "45": 35624.0, + "46": 36724.0, + "47": 40791.0, + "48": 35583.0, + "49": 34833.0, + "50": 39159.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 863505920.0, + "2": 863503872.0, + "3": 863507968.0, + "4": 863507968.0, + "5": 863502848.0, + "6": 863505408.0, + "7": 863508480.0, + "8": 863503872.0, + "9": 863506432.0, + "10": 863507456.0, + "11": 863503872.0, + "12": 863504896.0, + "13": 863506432.0, + "14": 863506432.0, + "15": 863503872.0, + "16": 863507456.0, + "17": 863511552.0, + "18": 863502848.0, + "19": 863505408.0, + "20": 863504896.0, + "21": 863508480.0, + "22": 863509504.0, + "23": 863507968.0, + "24": 863506944.0, + "25": 863506944.0, + "26": 863506944.0, + "27": 863504896.0, + "28": 863504896.0, + "29": 863505408.0, + "30": 863508992.0, + "31": 863515136.0, + "32": 863512064.0, + "33": 863506944.0, + "34": 863509504.0, + "35": 863511040.0, + "36": 863508992.0, + "37": 863505408.0, + "38": 863505920.0, + "39": 863507456.0, + "40": 863508480.0, + "41": 863513600.0, + "42": 863506432.0, + "43": 863510016.0, + "44": 863512576.0, + "45": 863503872.0, + "46": 863524352.0, + "47": 863503872.0, + "48": 863517696.0, + "49": 863512064.0, + "50": 863505920.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2741182976.0, + "2": 2981644288.0, + "3": 2982039040.0, + "4": 2984047104.0, + "5": 2984047104.0, + "6": 2984047104.0, + "7": 2984047104.0, + "8": 2984047104.0, + "9": 2984047104.0, + "10": 2984047104.0, + "11": 2984047104.0, + "12": 2984047104.0, + "13": 2984047104.0, + "14": 2984047104.0, + "15": 2984047104.0, + "16": 2984047104.0, + "17": 2985508864.0, + "18": 2985508864.0, + "19": 2985508864.0, + "20": 2985508864.0, + "21": 2985508864.0, + "22": 2985508864.0, + "23": 2985508864.0, + "24": 2985508864.0, + "25": 2985508864.0, + "26": 2985508864.0, + "27": 2985508864.0, + "28": 2985508864.0, + "29": 2985508864.0, + "30": 2985508864.0, + "31": 2986932736.0, + "32": 2986932736.0, + "33": 2986932736.0, + "34": 2986932736.0, + "35": 2986932736.0, + "36": 2986932736.0, + "37": 2986932736.0, + "38": 2986932736.0, + "39": 2986932736.0, + "40": 2988336640.0, + "41": 2988336640.0, + "42": 2988336640.0, + "43": 2988336640.0, + "44": 2988336640.0, + "45": 2988336640.0, + "46": 2990742016.0, + "47": 2990742016.0, + "48": 2990742016.0, + "49": 2990742016.0, + "50": 2990742016.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 28.65799, + "2": 0.38137, + "3": 0.29722, + "4": 0.29497, + "5": 0.29498, + "6": 0.29349, + "7": 0.28205, + "8": 0.28271, + "9": 0.28924, + "10": 0.28158, + "11": 0.28091, + "12": 0.28034, + "13": 0.28985, + "14": 0.28034, + "15": 0.28108, + "16": 0.28775, + "17": 0.28792, + "18": 0.28403, + "19": 0.28372, + "20": 0.2913, + "21": 0.28324, + "22": 0.28526, + "23": 0.28665, + "24": 0.28778, + "25": 0.28462, + "26": 0.28385, + "27": 0.29573, + "28": 0.28896, + "29": 0.28509, + "30": 0.28863, + "31": 0.28863, + "32": 0.28591, + "33": 0.28417, + "34": 0.2921, + "35": 0.28486, + "36": 0.28401, + "37": 0.28884, + "38": 0.28899, + "39": 0.28435, + "40": 0.28532, + "41": 0.29387, + "42": 0.28493, + "43": 0.28685, + "44": 0.28897, + "45": 0.28501, + "46": 0.28487, + "47": 0.28307, + "48": 0.29529, + "49": 0.28524, + "50": 0.28877 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..9dab947d0b7 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8028, + "2": 10.82515, + "3": 10.81853, + "4": 10.80334, + "5": 10.85317, + "6": 10.86077, + "7": 10.83004, + "8": 10.82041, + "9": 10.8343, + "10": 10.79253, + "11": 10.86874, + "12": 10.84623, + "13": 10.85032, + "14": 10.87276, + "15": 10.81762, + "16": 10.80827, + "17": 10.78057, + "18": 10.80212, + "19": 10.80623, + "20": 10.74263, + "21": 10.72129, + "22": 10.60064, + "23": 10.73585, + "24": 10.62773, + "25": 10.58726, + "26": 10.64479, + "27": 10.65744, + "28": 10.633, + "29": 10.64664, + "30": 10.43425, + "31": 10.20993, + "32": 10.52274, + "33": 10.5182, + "34": 10.30593, + "35": 10.35057, + "36": 10.32257, + "37": 10.42006, + "38": 10.28232, + "39": 10.47402, + "40": 10.18634, + "41": 10.22711, + "42": 10.29407, + "43": 9.96562, + "44": 10.07121, + "45": 9.95891, + "46": 9.92944, + "47": 10.23158, + "48": 9.96456, + "49": 9.6648, + "50": 10.0194 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31321.0, + "2": 33507.0, + "3": 33742.0, + "4": 31142.0, + "5": 36637.0, + "6": 37952.0, + "7": 35367.0, + "8": 31793.0, + "9": 34742.0, + "10": 30318.0, + "11": 38311.0, + "12": 35873.0, + "13": 37077.0, + "14": 38139.0, + "15": 35096.0, + "16": 36153.0, + "17": 34599.0, + "18": 35615.0, + "19": 36094.0, + "20": 33013.0, + "21": 33392.0, + "22": 30732.0, + "23": 37995.0, + "24": 32271.0, + "25": 30677.0, + "26": 34406.0, + "27": 35346.0, + "28": 37369.0, + "29": 38116.0, + "30": 32775.0, + "31": 30305.0, + "32": 36349.0, + "33": 38243.0, + "34": 33070.0, + "35": 34420.0, + "36": 34971.0, + "37": 38372.0, + "38": 36065.0, + "39": 38349.0, + "40": 36074.0, + "41": 36445.0, + "42": 37346.0, + "43": 33959.0, + "44": 33566.0, + "45": 35624.0, + "46": 36724.0, + "47": 40791.0, + "48": 35583.0, + "49": 34833.0, + "50": 39159.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 863505920.0, + "2": 863503872.0, + "3": 863507968.0, + "4": 863507968.0, + "5": 863502848.0, + "6": 863505408.0, + "7": 863508480.0, + "8": 863503872.0, + "9": 863506432.0, + "10": 863507456.0, + "11": 863503872.0, + "12": 863504896.0, + "13": 863506432.0, + "14": 863506432.0, + "15": 863503872.0, + "16": 863507456.0, + "17": 863511552.0, + "18": 863502848.0, + "19": 863505408.0, + "20": 863504896.0, + "21": 863508480.0, + "22": 863509504.0, + "23": 863507968.0, + "24": 863506944.0, + "25": 863506944.0, + "26": 863506944.0, + "27": 863504896.0, + "28": 863504896.0, + "29": 863505408.0, + "30": 863508992.0, + "31": 863515136.0, + "32": 863512064.0, + "33": 863506944.0, + "34": 863509504.0, + "35": 863511040.0, + "36": 863508992.0, + "37": 863505408.0, + "38": 863505920.0, + "39": 863507456.0, + "40": 863508480.0, + "41": 863513600.0, + "42": 863506432.0, + "43": 863510016.0, + "44": 863512576.0, + "45": 863503872.0, + "46": 863524352.0, + "47": 863503872.0, + "48": 863517696.0, + "49": 863512064.0, + "50": 863505920.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2741182976.0, + "2": 2981644288.0, + "3": 2982039040.0, + "4": 2984047104.0, + "5": 2984047104.0, + "6": 2984047104.0, + "7": 2984047104.0, + "8": 2984047104.0, + "9": 2984047104.0, + "10": 2984047104.0, + "11": 2984047104.0, + "12": 2984047104.0, + "13": 2984047104.0, + "14": 2984047104.0, + "15": 2984047104.0, + "16": 2984047104.0, + "17": 2985508864.0, + "18": 2985508864.0, + "19": 2985508864.0, + "20": 2985508864.0, + "21": 2985508864.0, + "22": 2985508864.0, + "23": 2985508864.0, + "24": 2985508864.0, + "25": 2985508864.0, + "26": 2985508864.0, + "27": 2985508864.0, + "28": 2985508864.0, + "29": 2985508864.0, + "30": 2985508864.0, + "31": 2986932736.0, + "32": 2986932736.0, + "33": 2986932736.0, + "34": 2986932736.0, + "35": 2986932736.0, + "36": 2986932736.0, + "37": 2986932736.0, + "38": 2986932736.0, + "39": 2986932736.0, + "40": 2988336640.0, + "41": 2988336640.0, + "42": 2988336640.0, + "43": 2988336640.0, + "44": 2988336640.0, + "45": 2988336640.0, + "46": 2990742016.0, + "47": 2990742016.0, + "48": 2990742016.0, + "49": 2990742016.0, + "50": 2990742016.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 26.77929, + "2": 0.35069, + "3": 0.29635, + "4": 0.29093, + "5": 0.29737, + "6": 0.28672, + "7": 0.287, + "8": 0.28763, + "9": 0.27837, + "10": 0.2836, + "11": 0.27718, + "12": 0.28544, + "13": 0.27594, + "14": 0.2837, + "15": 0.27575, + "16": 0.27871, + "17": 0.28446, + "18": 0.27545, + "19": 0.28584, + "20": 0.27829, + "21": 0.28615, + "22": 0.27646, + "23": 0.28898, + "24": 0.28121, + "25": 0.27681, + "26": 0.28221, + "27": 0.27678, + "28": 0.28281, + "29": 0.27538, + "30": 0.28558, + "31": 0.27818, + "32": 0.28487, + "33": 0.28365, + "34": 0.27627, + "35": 0.28667, + "36": 0.27506, + "37": 0.27898, + "38": 0.27579, + "39": 0.27983, + "40": 0.27537, + "41": 0.28267, + "42": 0.28389, + "43": 0.27833, + "44": 0.28559, + "45": 0.27679, + "46": 0.28352, + "47": 0.27541, + "48": 0.28696, + "49": 0.27685, + "50": 0.27938 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 114ac89edd7..5219c47c6db 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, "50": 10.01296 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, "50": 38996.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, "50": 1027084800.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, "50": 3249972736.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 11.83817, - "5": 0.24003, - "10": 0.21528, - "15": 0.22788, - "20": 0.20411, - "25": 0.20559, - "30": 0.20453, - "35": 0.20404, - "40": 0.21841, - "45": 0.2091, - "50": 0.20464 + "1": 13.23313, + "2": 0.31808, + "3": 0.27025, + "4": 0.253, + "5": 0.25938, + "6": 0.23222, + "7": 0.24127, + "8": 0.23468, + "9": 0.22881, + "10": 0.23244, + "11": 0.23056, + "12": 0.23078, + "13": 0.23301, + "14": 0.22477, + "15": 0.24897, + "16": 0.22593, + "17": 0.24178, + "18": 0.23034, + "19": 0.23887, + "20": 0.24186, + "21": 0.23006, + "22": 0.23215, + "23": 0.22763, + "24": 0.22889, + "25": 0.22662, + "26": 0.22794, + "27": 0.22851, + "28": 0.22653, + "29": 0.22859, + "30": 0.22789, + "31": 0.27081, + "32": 0.22893, + "33": 0.22575, + "34": 0.24635, + "35": 0.22739, + "36": 0.2416, + "37": 0.24045, + "38": 0.23118, + "39": 0.2275, + "40": 0.24632, + "41": 0.233, + "42": 0.22755, + "43": 0.25276, + "44": 0.2354, + "45": 0.2355, + "46": 0.25059, + "47": 0.22589, + "48": 0.25741, + "49": 0.27315, + "50": 0.22384 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..ad63e8c681e --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, + "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, + "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, + "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, + "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, + "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, + "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, + "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, + "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, + "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, + "50": 10.01296 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, + "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, + "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, + "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, + "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, + "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, + "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, + "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, + "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, + "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, + "50": 38996.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, + "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, + "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, + "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, + "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, + "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, + "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, + "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, + "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, + "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, + "50": 1027084800.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, + "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, + "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, + "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, + "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, + "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, + "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, + "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, + "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, + "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, + "50": 3249972736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.64212, + "2": 0.27662, + "3": 0.22726, + "4": 0.22741, + "5": 0.21976, + "6": 0.21005, + "7": 0.21904, + "8": 0.20701, + "9": 0.20029, + "10": 0.21109, + "11": 0.20188, + "12": 0.20386, + "13": 0.20452, + "14": 0.19789, + "15": 0.21511, + "16": 0.20036, + "17": 0.21345, + "18": 0.20466, + "19": 0.20569, + "20": 0.19783, + "21": 0.19857, + "22": 0.20281, + "23": 0.20165, + "24": 0.20398, + "25": 0.20864, + "26": 0.20632, + "27": 0.20092, + "28": 0.20357, + "29": 0.20116, + "30": 0.19889, + "31": 0.23444, + "32": 0.19868, + "33": 0.19728, + "34": 0.21322, + "35": 0.19907, + "36": 0.20947, + "37": 0.1964, + "38": 0.20026, + "39": 0.19448, + "40": 0.21304, + "41": 0.20077, + "42": 0.19863, + "43": 0.21502, + "44": 0.21008, + "45": 0.20452, + "46": 0.22473, + "47": 0.20011, + "48": 0.22634, + "49": 0.23823, + "50": 0.20221 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..c49c5a579c0 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.80815, + "2": 10.82612, + "3": 10.83032, + "4": 10.80963, + "5": 10.84127, + "6": 10.8581, + "7": 10.81967, + "8": 10.82506, + "9": 10.83749, + "10": 10.7783, + "11": 10.85781, + "12": 10.85539, + "13": 10.85233, + "14": 10.86699, + "15": 10.81253, + "16": 10.80292, + "17": 10.78098, + "18": 10.80788, + "19": 10.79276, + "20": 10.74548, + "21": 10.72785, + "22": 10.59608, + "23": 10.73999, + "24": 10.63509, + "25": 10.59832, + "26": 10.63517, + "27": 10.65744, + "28": 10.64536, + "29": 10.65122, + "30": 10.44144, + "31": 10.21465, + "32": 10.53342, + "33": 10.52518, + "34": 10.30171, + "35": 10.34871, + "36": 10.30843, + "37": 10.42353, + "38": 10.28859, + "39": 10.45514, + "40": 10.19363, + "41": 10.22791, + "42": 10.29725, + "43": 9.95871, + "44": 10.06717, + "45": 9.95955, + "46": 9.92614, + "47": 10.20607, + "48": 9.96021, + "49": 9.65854, + "50": 10.01296 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 31590.0, + "2": 32940.0, + "3": 33668.0, + "4": 31186.0, + "5": 36214.0, + "6": 37169.0, + "7": 34770.0, + "8": 31862.0, + "9": 34102.0, + "10": 30394.0, + "11": 38432.0, + "12": 35039.0, + "13": 37236.0, + "14": 37668.0, + "15": 34199.0, + "16": 36659.0, + "17": 34831.0, + "18": 35011.0, + "19": 35486.0, + "20": 33221.0, + "21": 33971.0, + "22": 30501.0, + "23": 38411.0, + "24": 32764.0, + "25": 31363.0, + "26": 34624.0, + "27": 36096.0, + "28": 37021.0, + "29": 37900.0, + "30": 33066.0, + "31": 29871.0, + "32": 36113.0, + "33": 38168.0, + "34": 33074.0, + "35": 34300.0, + "36": 35363.0, + "37": 38150.0, + "38": 35798.0, + "39": 38945.0, + "40": 35780.0, + "41": 35999.0, + "42": 36611.0, + "43": 33781.0, + "44": 34207.0, + "45": 35198.0, + "46": 36779.0, + "47": 40585.0, + "48": 36434.0, + "49": 35787.0, + "50": 38996.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1027085824.0, + "2": 1027085824.0, + "3": 1027086848.0, + "4": 1027086336.0, + "5": 1027086848.0, + "6": 1027085312.0, + "7": 1027081728.0, + "8": 1027082752.0, + "9": 1027089408.0, + "10": 1027083776.0, + "11": 1027084288.0, + "12": 1027084288.0, + "13": 1027086848.0, + "14": 1027083776.0, + "15": 1027085312.0, + "16": 1027086336.0, + "17": 1027084288.0, + "18": 1027088384.0, + "19": 1027086848.0, + "20": 1027089920.0, + "21": 1027083264.0, + "22": 1027086336.0, + "23": 1027086848.0, + "24": 1027085824.0, + "25": 1027084288.0, + "26": 1027085312.0, + "27": 1027085312.0, + "28": 1027082752.0, + "29": 1027083776.0, + "30": 1027082240.0, + "31": 1027074048.0, + "32": 1027077120.0, + "33": 1027086336.0, + "34": 1027083264.0, + "35": 1027085312.0, + "36": 1027083776.0, + "37": 1027084288.0, + "38": 1027085312.0, + "39": 1027080704.0, + "40": 1027081728.0, + "41": 1027083264.0, + "42": 1027086848.0, + "43": 1027079680.0, + "44": 1027082752.0, + "45": 1027082752.0, + "46": 1027073536.0, + "47": 1027082752.0, + "48": 1027081216.0, + "49": 1027077120.0, + "50": 1027084800.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3007080960.0, + "2": 3247499776.0, + "3": 3247499776.0, + "4": 3248093184.0, + "5": 3248476160.0, + "6": 3248476160.0, + "7": 3248476160.0, + "8": 3248476160.0, + "9": 3248476160.0, + "10": 3249142784.0, + "11": 3249142784.0, + "12": 3249142784.0, + "13": 3249142784.0, + "14": 3249142784.0, + "15": 3249142784.0, + "16": 3249142784.0, + "17": 3249142784.0, + "18": 3249142784.0, + "19": 3249142784.0, + "20": 3249142784.0, + "21": 3249142784.0, + "22": 3249860608.0, + "23": 3249860608.0, + "24": 3249972736.0, + "25": 3249972736.0, + "26": 3249972736.0, + "27": 3249972736.0, + "28": 3249972736.0, + "29": 3249972736.0, + "30": 3249972736.0, + "31": 3249972736.0, + "32": 3249972736.0, + "33": 3249972736.0, + "34": 3249972736.0, + "35": 3249972736.0, + "36": 3249972736.0, + "37": 3249972736.0, + "38": 3249972736.0, + "39": 3249972736.0, + "40": 3249972736.0, + "41": 3249972736.0, + "42": 3249972736.0, + "43": 3249972736.0, + "44": 3249972736.0, + "45": 3249972736.0, + "46": 3249972736.0, + "47": 3249972736.0, + "48": 3249972736.0, + "49": 3249972736.0, + "50": 3249972736.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.35552, + "2": 0.37785, + "3": 0.29632, + "4": 0.29599, + "5": 0.25057, + "6": 0.2376, + "7": 0.24788, + "8": 0.2386, + "9": 0.23567, + "10": 0.23981, + "11": 0.23457, + "12": 0.23608, + "13": 0.24093, + "14": 0.23076, + "15": 0.25524, + "16": 0.23573, + "17": 0.24636, + "18": 0.2348, + "19": 0.23922, + "20": 0.23445, + "21": 0.22924, + "22": 0.23872, + "23": 0.23172, + "24": 0.23116, + "25": 0.23103, + "26": 0.23556, + "27": 0.23228, + "28": 0.23323, + "29": 0.23495, + "30": 0.23011, + "31": 0.27652, + "32": 0.23015, + "33": 0.22902, + "34": 0.25666, + "35": 0.23045, + "36": 0.24626, + "37": 0.23146, + "38": 0.2344, + "39": 0.22864, + "40": 0.24642, + "41": 0.23788, + "42": 0.23274, + "43": 0.24326, + "44": 0.23733, + "45": 0.24263, + "46": 0.25392, + "47": 0.23328, + "48": 0.26156, + "49": 0.27837, + "50": 0.23303 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..171568354d3 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8174, + "2": 10.8238, + "3": 10.83034, + "4": 10.79848, + "5": 10.86097, + "6": 10.86968, + "7": 10.83041, + "8": 10.83047, + "9": 10.83634, + "10": 10.80463, + "11": 10.87361, + "12": 10.85679, + "13": 10.86371, + "14": 10.87941, + "15": 10.79539, + "16": 10.79946, + "17": 10.7712, + "18": 10.80138, + "19": 10.78756, + "20": 10.71135, + "21": 10.67535, + "22": 10.53788, + "23": 10.68977, + "24": 10.57497, + "25": 10.51962, + "26": 10.57943, + "27": 10.58547, + "28": 10.55147, + "29": 10.56806, + "30": 10.33346, + "31": 10.06567, + "32": 10.42406, + "33": 10.43002, + "34": 10.16343, + "35": 10.22683, + "36": 10.19343, + "37": 10.30857, + "38": 10.14766, + "39": 10.38079, + "40": 10.041, + "41": 10.08555, + "42": 10.17528, + "43": 9.76706, + "44": 9.91338, + "45": 9.7722, + "46": 9.75215, + "47": 10.11047, + "48": 9.79832, + "49": 9.4591, + "50": 9.86932 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 26535.0, + "2": 29510.0, + "3": 29143.0, + "4": 28253.0, + "5": 31546.0, + "6": 32394.0, + "7": 30992.0, + "8": 27483.0, + "9": 30277.0, + "10": 25541.0, + "11": 33316.0, + "12": 30322.0, + "13": 32492.0, + "14": 32959.0, + "15": 30463.0, + "16": 31824.0, + "17": 30856.0, + "18": 30543.0, + "19": 31088.0, + "20": 28331.0, + "21": 28793.0, + "22": 27857.0, + "23": 33708.0, + "24": 28428.0, + "25": 27263.0, + "26": 30930.0, + "27": 31082.0, + "28": 32928.0, + "29": 34437.0, + "30": 29642.0, + "31": 28293.0, + "32": 32660.0, + "33": 35555.0, + "34": 30589.0, + "35": 32022.0, + "36": 33586.0, + "37": 35917.0, + "38": 34614.0, + "39": 37197.0, + "40": 34911.0, + "41": 33219.0, + "42": 35534.0, + "43": 34573.0, + "44": 33331.0, + "45": 35017.0, + "46": 35205.0, + "47": 39557.0, + "48": 35883.0, + "49": 36444.0, + "50": 38975.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1356172288.0, + "2": 1356165120.0, + "3": 1356179968.0, + "4": 1356190208.0, + "5": 1356170240.0, + "6": 1356170752.0, + "7": 1356184064.0, + "8": 1356165632.0, + "9": 1356161536.0, + "10": 1356160000.0, + "11": 1356167168.0, + "12": 1356178944.0, + "13": 1356167168.0, + "14": 1356162560.0, + "15": 1356180480.0, + "16": 1356185088.0, + "17": 1356156416.0, + "18": 1356187136.0, + "19": 1356171264.0, + "20": 1356170240.0, + "21": 1356188160.0, + "22": 1356186112.0, + "23": 1356185600.0, + "24": 1356181504.0, + "25": 1356182528.0, + "26": 1356189696.0, + "27": 1356189696.0, + "28": 1356181504.0, + "29": 1356182528.0, + "30": 1356198400.0, + "31": 1356187136.0, + "32": 1356177408.0, + "33": 1356187648.0, + "34": 1356187648.0, + "35": 1356182016.0, + "36": 1356178432.0, + "37": 1356182528.0, + "38": 1356186112.0, + "39": 1356170240.0, + "40": 1356156416.0, + "41": 1356169728.0, + "42": 1356151808.0, + "43": 1356151808.0, + "44": 1356146688.0, + "45": 1356140544.0, + "46": 1356133888.0, + "47": 1356111872.0, + "48": 1356119552.0, + "49": 1356118528.0, + "50": 1356098560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3163797504.0, + "2": 3730006528.0, + "3": 3748878336.0, + "4": 3752917504.0, + "5": 3752917504.0, + "6": 3752917504.0, + "7": 3758158848.0, + "8": 3758158848.0, + "9": 3758158848.0, + "10": 3758158848.0, + "11": 3758158848.0, + "12": 3758158848.0, + "13": 3758158848.0, + "14": 3758158848.0, + "15": 3758158848.0, + "16": 3758158848.0, + "17": 3758158848.0, + "18": 3758158848.0, + "19": 3758158848.0, + "20": 3758158848.0, + "21": 3758158848.0, + "22": 3758158848.0, + "23": 3758158848.0, + "24": 3758158848.0, + "25": 3758158848.0, + "26": 3758158848.0, + "27": 3758158848.0, + "28": 3758158848.0, + "29": 3770054144.0, + "30": 3770054144.0, + "31": 3770054144.0, + "32": 3770054144.0, + "33": 3770054144.0, + "34": 3770054144.0, + "35": 3770054144.0, + "36": 3770054144.0, + "37": 3770054144.0, + "38": 3770054144.0, + "39": 3770054144.0, + "40": 3770054144.0, + "41": 3770054144.0, + "42": 3770054144.0, + "43": 3770054144.0, + "44": 3770054144.0, + "45": 3770054144.0, + "46": 3770054144.0, + "47": 3770054144.0, + "48": 3770054144.0, + "49": 3770054144.0, + "50": 3770054144.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 27.36516, + "2": 0.36408, + "3": 0.2993, + "4": 0.29949, + "5": 0.30461, + "6": 0.29574, + "7": 0.30041, + "8": 0.30848, + "9": 0.29849, + "10": 0.29846, + "11": 0.30503, + "12": 0.29885, + "13": 0.29495, + "14": 0.29657, + "15": 0.30665, + "16": 0.29545, + "17": 0.2982, + "18": 0.30792, + "19": 0.29588, + "20": 0.29657, + "21": 0.30198, + "22": 0.30357, + "23": 0.30049, + "24": 0.29959, + "25": 0.30994, + "26": 0.29865, + "27": 0.3002, + "28": 0.30774, + "29": 0.30125, + "30": 0.30366, + "31": 0.32063, + "32": 0.31461, + "33": 0.30383, + "34": 0.30388, + "35": 0.31199, + "36": 0.30381, + "37": 0.30412, + "38": 0.31439, + "39": 0.30499, + "40": 0.30779, + "41": 0.33024, + "42": 0.31735, + "43": 0.30791, + "44": 0.31609, + "45": 0.3076, + "46": 0.31885, + "47": 0.31309, + "48": 0.31902, + "49": 0.30799, + "50": 0.30894 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json new file mode 100644 index 00000000000..52e3e931ee9 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.8174, + "2": 10.8238, + "3": 10.83034, + "4": 10.79848, + "5": 10.86097, + "6": 10.86968, + "7": 10.83041, + "8": 10.83047, + "9": 10.83634, + "10": 10.80463, + "11": 10.87361, + "12": 10.85679, + "13": 10.86371, + "14": 10.87941, + "15": 10.79539, + "16": 10.79946, + "17": 10.7712, + "18": 10.80138, + "19": 10.78756, + "20": 10.71135, + "21": 10.67535, + "22": 10.53788, + "23": 10.68977, + "24": 10.57497, + "25": 10.51962, + "26": 10.57943, + "27": 10.58547, + "28": 10.55147, + "29": 10.56806, + "30": 10.33346, + "31": 10.06567, + "32": 10.42406, + "33": 10.43002, + "34": 10.16343, + "35": 10.22683, + "36": 10.19343, + "37": 10.30857, + "38": 10.14766, + "39": 10.38079, + "40": 10.041, + "41": 10.08555, + "42": 10.17528, + "43": 9.76706, + "44": 9.91338, + "45": 9.7722, + "46": 9.75215, + "47": 10.11047, + "48": 9.79832, + "49": 9.4591, + "50": 9.86932 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 26535.0, + "2": 29510.0, + "3": 29143.0, + "4": 28253.0, + "5": 31546.0, + "6": 32394.0, + "7": 30992.0, + "8": 27483.0, + "9": 30277.0, + "10": 25541.0, + "11": 33316.0, + "12": 30322.0, + "13": 32492.0, + "14": 32959.0, + "15": 30463.0, + "16": 31824.0, + "17": 30856.0, + "18": 30543.0, + "19": 31088.0, + "20": 28331.0, + "21": 28793.0, + "22": 27857.0, + "23": 33708.0, + "24": 28428.0, + "25": 27263.0, + "26": 30930.0, + "27": 31082.0, + "28": 32928.0, + "29": 34437.0, + "30": 29642.0, + "31": 28293.0, + "32": 32660.0, + "33": 35555.0, + "34": 30589.0, + "35": 32022.0, + "36": 33586.0, + "37": 35917.0, + "38": 34614.0, + "39": 37197.0, + "40": 34911.0, + "41": 33219.0, + "42": 35534.0, + "43": 34573.0, + "44": 33331.0, + "45": 35017.0, + "46": 35205.0, + "47": 39557.0, + "48": 35883.0, + "49": 36444.0, + "50": 38975.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1356172288.0, + "2": 1356165120.0, + "3": 1356179968.0, + "4": 1356190208.0, + "5": 1356170240.0, + "6": 1356170752.0, + "7": 1356184064.0, + "8": 1356165632.0, + "9": 1356161536.0, + "10": 1356160000.0, + "11": 1356167168.0, + "12": 1356178944.0, + "13": 1356167168.0, + "14": 1356162560.0, + "15": 1356180480.0, + "16": 1356185088.0, + "17": 1356156416.0, + "18": 1356187136.0, + "19": 1356171264.0, + "20": 1356170240.0, + "21": 1356188160.0, + "22": 1356186112.0, + "23": 1356185600.0, + "24": 1356181504.0, + "25": 1356182528.0, + "26": 1356189696.0, + "27": 1356189696.0, + "28": 1356181504.0, + "29": 1356182528.0, + "30": 1356198400.0, + "31": 1356187136.0, + "32": 1356177408.0, + "33": 1356187648.0, + "34": 1356187648.0, + "35": 1356182016.0, + "36": 1356178432.0, + "37": 1356182528.0, + "38": 1356186112.0, + "39": 1356170240.0, + "40": 1356156416.0, + "41": 1356169728.0, + "42": 1356151808.0, + "43": 1356151808.0, + "44": 1356146688.0, + "45": 1356140544.0, + "46": 1356133888.0, + "47": 1356111872.0, + "48": 1356119552.0, + "49": 1356118528.0, + "50": 1356098560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3163797504.0, + "2": 3730006528.0, + "3": 3748878336.0, + "4": 3752917504.0, + "5": 3752917504.0, + "6": 3752917504.0, + "7": 3758158848.0, + "8": 3758158848.0, + "9": 3758158848.0, + "10": 3758158848.0, + "11": 3758158848.0, + "12": 3758158848.0, + "13": 3758158848.0, + "14": 3758158848.0, + "15": 3758158848.0, + "16": 3758158848.0, + "17": 3758158848.0, + "18": 3758158848.0, + "19": 3758158848.0, + "20": 3758158848.0, + "21": 3758158848.0, + "22": 3758158848.0, + "23": 3758158848.0, + "24": 3758158848.0, + "25": 3758158848.0, + "26": 3758158848.0, + "27": 3758158848.0, + "28": 3758158848.0, + "29": 3770054144.0, + "30": 3770054144.0, + "31": 3770054144.0, + "32": 3770054144.0, + "33": 3770054144.0, + "34": 3770054144.0, + "35": 3770054144.0, + "36": 3770054144.0, + "37": 3770054144.0, + "38": 3770054144.0, + "39": 3770054144.0, + "40": 3770054144.0, + "41": 3770054144.0, + "42": 3770054144.0, + "43": 3770054144.0, + "44": 3770054144.0, + "45": 3770054144.0, + "46": 3770054144.0, + "47": 3770054144.0, + "48": 3770054144.0, + "49": 3770054144.0, + "50": 3770054144.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 29.25664, + "2": 0.52496, + "3": 0.31117, + "4": 0.3115, + "5": 0.30744, + "6": 0.3073, + "7": 0.30608, + "8": 0.30768, + "9": 0.30608, + "10": 0.30812, + "11": 0.30587, + "12": 0.30181, + "13": 0.30601, + "14": 0.30172, + "15": 0.2992, + "16": 0.30316, + "17": 0.29987, + "18": 0.30154, + "19": 0.30104, + "20": 0.30976, + "21": 0.3056, + "22": 0.29977, + "23": 0.30766, + "24": 0.30782, + "25": 0.3, + "26": 0.30831, + "27": 0.3064, + "28": 0.30211, + "29": 0.30977, + "30": 0.30627, + "31": 0.31683, + "32": 0.31896, + "33": 0.308, + "34": 0.31449, + "35": 0.30656, + "36": 0.31192, + "37": 0.31478, + "38": 0.30653, + "39": 0.31106, + "40": 0.31664, + "41": 0.32127, + "42": 0.32489, + "43": 0.31002, + "44": 0.31115, + "45": 0.3117, + "46": 0.32232, + "47": 0.31526, + "48": 0.31918, + "49": 0.35454, + "50": 0.31865 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index cc62903f69e..6e2a34b26f8 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,91 +2,286 @@ "lm loss": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.77518, + "2": 10.78038, + "3": 10.79302, + "4": 10.74107, "5": 10.82013, + "6": 10.82951, + "7": 10.7953, + "8": 10.78263, + "9": 10.79278, "10": 10.7446, + "11": 10.85147, + "12": 10.82613, + "13": 10.82825, + "14": 10.85504, "15": 10.75536, + "16": 10.75777, + "17": 10.72319, + "18": 10.76274, + "19": 10.75075, "20": 10.66587, + "21": 10.6419, + "22": 10.47523, + "23": 10.66959, + "24": 10.54157, "25": 10.4825, + "26": 10.55255, + "27": 10.57459, + "28": 10.55159, + "29": 10.5668, "30": 10.31134, + "31": 10.01921, + "32": 10.42655, + "33": 10.42294, + "34": 10.14739, "35": 10.21574, + "36": 10.15811, + "37": 10.30279, + "38": 10.14031, + "39": 10.36301, "40": 10.02669, + "41": 10.07635, + "42": 10.16156, + "43": 9.74374, + "44": 9.88962, "45": 9.75874, + "46": 9.73618, + "47": 10.0844, + "48": 9.78532, + "49": 9.45072, "50": 9.85634 } }, "num-zeros": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { "1": 27105.0, + "2": 28791.0, + "3": 29282.0, + "4": 27583.0, "5": 31595.0, + "6": 32831.0, + "7": 31023.0, + "8": 27107.0, + "9": 30780.0, "10": 25505.0, + "11": 33684.0, + "12": 30235.0, + "13": 32960.0, + "14": 32880.0, "15": 30405.0, + "16": 32455.0, + "17": 30933.0, + "18": 30623.0, + "19": 30803.0, "20": 28593.0, + "21": 29002.0, + "22": 27030.0, + "23": 34463.0, + "24": 29154.0, "25": 27827.0, + "26": 31119.0, + "27": 32108.0, + "28": 33412.0, + "29": 34737.0, "30": 30465.0, + "31": 28775.0, + "32": 33115.0, + "33": 34745.0, + "34": 30785.0, "35": 32116.0, + "36": 33968.0, + "37": 36757.0, + "38": 34150.0, + "39": 37240.0, "40": 35353.0, + "41": 34638.0, + "42": 36703.0, + "43": 34601.0, + "44": 33783.0, "45": 35388.0, + "46": 35484.0, + "47": 40591.0, + "48": 36671.0, + "49": 36174.0, "50": 38231.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 1562143232.0, - "5": 1562716672.0, - "10": 1564981248.0, - "15": 1565375488.0, - "20": 1564531200.0, - "25": 1564925952.0, - "30": 1563997184.0, - "35": 1563508224.0, - "40": 1564344832.0, - "45": 1566202880.0, - "50": 1563379712.0 + "1": 1563272704.0, + "2": 1562858496.0, + "3": 1564486144.0, + "4": 1564041216.0, + "5": 1561823232.0, + "6": 1563443712.0, + "7": 1564206592.0, + "8": 1563517952.0, + "9": 1562183680.0, + "10": 1565040640.0, + "11": 1562508800.0, + "12": 1561081344.0, + "13": 1562479616.0, + "14": 1562858496.0, + "15": 1563188736.0, + "16": 1562045440.0, + "17": 1564147712.0, + "18": 1564288512.0, + "19": 1562883584.0, + "20": 1562017792.0, + "21": 1562184704.0, + "22": 1562030080.0, + "23": 1562267136.0, + "24": 1561898496.0, + "25": 1563593728.0, + "26": 1563150336.0, + "27": 1564444160.0, + "28": 1562418176.0, + "29": 1562973184.0, + "30": 1563487744.0, + "31": 1563070976.0, + "32": 1563377664.0, + "33": 1564346368.0, + "34": 1561956352.0, + "35": 1563001344.0, + "36": 1563246080.0, + "37": 1564364800.0, + "38": 1562608640.0, + "39": 1564432896.0, + "40": 1563148288.0, + "41": 1563740160.0, + "42": 1565268480.0, + "43": 1565179392.0, + "44": 1562279936.0, + "45": 1564082176.0, + "46": 1563706368.0, + "47": 1561835008.0, + "48": 1561798144.0, + "49": 1562701824.0, + "50": 1565224960.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 3676670976.0, - "5": 4262092288.0, - "10": 4290165248.0, - "15": 4290165248.0, - "20": 4290165248.0, - "25": 4290165248.0, - "30": 4290165248.0, - "35": 4290165248.0, - "40": 4290165248.0, - "45": 4290165248.0, - "50": 4290165248.0 + "1": 3678389248.0, + "2": 4261802496.0, + "3": 4262688768.0, + "4": 4262688768.0, + "5": 4262688768.0, + "6": 4288888832.0, + "7": 4288888832.0, + "8": 4288888832.0, + "9": 4288888832.0, + "10": 4288888832.0, + "11": 4288888832.0, + "12": 4288888832.0, + "13": 4288888832.0, + "14": 4288888832.0, + "15": 4288888832.0, + "16": 4288888832.0, + "17": 4288888832.0, + "18": 4288888832.0, + "19": 4288888832.0, + "20": 4288888832.0, + "21": 4288888832.0, + "22": 4288888832.0, + "23": 4288888832.0, + "24": 4288888832.0, + "25": 4288888832.0, + "26": 4288888832.0, + "27": 4288888832.0, + "28": 4288888832.0, + "29": 4288888832.0, + "30": 4288888832.0, + "31": 4288888832.0, + "32": 4288888832.0, + "33": 4288888832.0, + "34": 4288888832.0, + "35": 4288888832.0, + "36": 4288888832.0, + "37": 4288888832.0, + "38": 4288888832.0, + "39": 4288888832.0, + "40": 4288888832.0, + "41": 4288888832.0, + "42": 4288888832.0, + "43": 4288888832.0, + "44": 4288888832.0, + "45": 4288888832.0, + "46": 4288888832.0, + "47": 4288888832.0, + "48": 4288888832.0, + "49": 4288888832.0, + "50": 4288888832.0 } }, "iteration-time": { "start_step": 1, "end_step": 50, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 17.03683, - "5": 0.33922, - "10": 0.30304, - "15": 0.29637, - "20": 0.2596, - "25": 0.25723, - "30": 0.27136, - "35": 0.26623, - "40": 0.26866, - "45": 0.25523, - "50": 0.25705 + "1": 19.14758, + "2": 0.49766, + "3": 0.44107, + "4": 0.37175, + "5": 0.37026, + "6": 0.33176, + "7": 0.32446, + "8": 0.31735, + "9": 0.3291, + "10": 0.32512, + "11": 0.30495, + "12": 0.31438, + "13": 0.29955, + "14": 0.30728, + "15": 0.31532, + "16": 0.29631, + "17": 0.30956, + "18": 0.30533, + "19": 0.30054, + "20": 0.30291, + "21": 0.30231, + "22": 0.32081, + "23": 0.29797, + "24": 0.3059, + "25": 0.3093, + "26": 0.30535, + "27": 0.30202, + "28": 0.31154, + "29": 0.30205, + "30": 0.3198, + "31": 0.36657, + "32": 0.30974, + "33": 0.34056, + "34": 0.32396, + "35": 0.34679, + "36": 0.30488, + "37": 0.31477, + "38": 0.31377, + "39": 0.31065, + "40": 0.30631, + "41": 0.30771, + "42": 0.3003, + "43": 0.30915, + "44": 0.31796, + "45": 0.2949, + "46": 0.30522, + "47": 0.30099, + "48": 0.30303, + "49": 0.30198, + "50": 0.29985 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..3c9a1238968 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.77518, + "2": 10.78038, + "3": 10.79302, + "4": 10.74107, + "5": 10.82013, + "6": 10.82951, + "7": 10.7953, + "8": 10.78263, + "9": 10.79278, + "10": 10.7446, + "11": 10.85147, + "12": 10.82613, + "13": 10.82825, + "14": 10.85504, + "15": 10.75536, + "16": 10.75777, + "17": 10.72319, + "18": 10.76274, + "19": 10.75075, + "20": 10.66587, + "21": 10.6419, + "22": 10.47523, + "23": 10.66959, + "24": 10.54157, + "25": 10.4825, + "26": 10.55255, + "27": 10.57459, + "28": 10.55159, + "29": 10.5668, + "30": 10.31134, + "31": 10.01921, + "32": 10.42655, + "33": 10.42294, + "34": 10.14739, + "35": 10.21574, + "36": 10.15811, + "37": 10.30279, + "38": 10.14031, + "39": 10.36301, + "40": 10.02669, + "41": 10.07635, + "42": 10.16156, + "43": 9.74374, + "44": 9.88962, + "45": 9.75874, + "46": 9.73618, + "47": 10.0844, + "48": 9.78532, + "49": 9.45072, + "50": 9.85634 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 27105.0, + "2": 28791.0, + "3": 29282.0, + "4": 27583.0, + "5": 31595.0, + "6": 32831.0, + "7": 31023.0, + "8": 27107.0, + "9": 30780.0, + "10": 25505.0, + "11": 33684.0, + "12": 30235.0, + "13": 32960.0, + "14": 32880.0, + "15": 30405.0, + "16": 32455.0, + "17": 30933.0, + "18": 30623.0, + "19": 30803.0, + "20": 28593.0, + "21": 29002.0, + "22": 27030.0, + "23": 34463.0, + "24": 29154.0, + "25": 27827.0, + "26": 31119.0, + "27": 32108.0, + "28": 33412.0, + "29": 34737.0, + "30": 30465.0, + "31": 28775.0, + "32": 33115.0, + "33": 34745.0, + "34": 30785.0, + "35": 32116.0, + "36": 33968.0, + "37": 36757.0, + "38": 34150.0, + "39": 37240.0, + "40": 35353.0, + "41": 34638.0, + "42": 36703.0, + "43": 34601.0, + "44": 33783.0, + "45": 35388.0, + "46": 35484.0, + "47": 40591.0, + "48": 36671.0, + "49": 36174.0, + "50": 38231.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1560780288.0, + "2": 1562661888.0, + "3": 1561168384.0, + "4": 1562873856.0, + "5": 1561988096.0, + "6": 1562931712.0, + "7": 1560774144.0, + "8": 1560396800.0, + "9": 1559476224.0, + "10": 1561237504.0, + "11": 1560092160.0, + "12": 1561073152.0, + "13": 1560844288.0, + "14": 1560660992.0, + "15": 1561358848.0, + "16": 1562046464.0, + "17": 1562270720.0, + "18": 1561111040.0, + "19": 1560918528.0, + "20": 1560393728.0, + "21": 1559810048.0, + "22": 1560937472.0, + "23": 1560980992.0, + "24": 1563885056.0, + "25": 1564661760.0, + "26": 1562321920.0, + "27": 1560262144.0, + "28": 1561913344.0, + "29": 1561421824.0, + "30": 1562089984.0, + "31": 1563574784.0, + "32": 1560473600.0, + "33": 1560724480.0, + "34": 1560988672.0, + "35": 1559951872.0, + "36": 1561882112.0, + "37": 1560333312.0, + "38": 1561226240.0, + "39": 1562092032.0, + "40": 1563557888.0, + "41": 1561459712.0, + "42": 1561729536.0, + "43": 1562591744.0, + "44": 1562273792.0, + "45": 1560520704.0, + "46": 1565477888.0, + "47": 1562011136.0, + "48": 1562666496.0, + "49": 1560133632.0, + "50": 1562494976.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3682551808.0, + "2": 4261875200.0, + "3": 4261875200.0, + "4": 4261875200.0, + "5": 4262492672.0, + "6": 4286960640.0, + "7": 4286960640.0, + "8": 4286960640.0, + "9": 4286960640.0, + "10": 4286960640.0, + "11": 4286960640.0, + "12": 4286960640.0, + "13": 4286960640.0, + "14": 4286960640.0, + "15": 4286960640.0, + "16": 4286960640.0, + "17": 4286960640.0, + "18": 4286960640.0, + "19": 4286960640.0, + "20": 4286960640.0, + "21": 4286960640.0, + "22": 4286960640.0, + "23": 4286960640.0, + "24": 4286960640.0, + "25": 4286960640.0, + "26": 4286960640.0, + "27": 4286960640.0, + "28": 4286960640.0, + "29": 4286960640.0, + "30": 4286960640.0, + "31": 4286960640.0, + "32": 4286960640.0, + "33": 4286960640.0, + "34": 4286960640.0, + "35": 4286960640.0, + "36": 4286960640.0, + "37": 4286960640.0, + "38": 4286960640.0, + "39": 4286960640.0, + "40": 4286960640.0, + "41": 4286960640.0, + "42": 4286960640.0, + "43": 4286960640.0, + "44": 4286960640.0, + "45": 4286960640.0, + "46": 4286960640.0, + "47": 4286960640.0, + "48": 4286960640.0, + "49": 4286960640.0, + "50": 4286960640.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 20.83226, + "2": 0.4277, + "3": 0.36235, + "4": 0.32018, + "5": 0.32467, + "6": 0.2866, + "7": 0.29271, + "8": 0.2778, + "9": 0.28029, + "10": 0.27681, + "11": 0.26073, + "12": 0.26966, + "13": 0.26171, + "14": 0.26964, + "15": 0.26556, + "16": 0.26142, + "17": 0.26797, + "18": 0.26832, + "19": 0.25503, + "20": 0.26854, + "21": 0.26028, + "22": 0.27376, + "23": 0.26433, + "24": 0.27688, + "25": 0.26452, + "26": 0.26581, + "27": 0.26181, + "28": 0.26407, + "29": 0.26847, + "30": 0.28514, + "31": 0.27185, + "32": 0.26438, + "33": 0.26828, + "34": 0.27142, + "35": 0.27204, + "36": 0.28491, + "37": 0.28927, + "38": 0.26843, + "39": 0.27153, + "40": 0.27149, + "41": 0.2612, + "42": 0.25803, + "43": 0.27298, + "44": 0.28995, + "45": 0.28088, + "46": 0.28702, + "47": 0.27506, + "48": 0.2642, + "49": 0.26659, + "50": 0.25965 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..acf98f05d31 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.77518, + "2": 10.78038, + "3": 10.79302, + "4": 10.74107, + "5": 10.82013, + "6": 10.82951, + "7": 10.7953, + "8": 10.78263, + "9": 10.79278, + "10": 10.7446, + "11": 10.85147, + "12": 10.82613, + "13": 10.82825, + "14": 10.85504, + "15": 10.75536, + "16": 10.75777, + "17": 10.72319, + "18": 10.76274, + "19": 10.75075, + "20": 10.66587, + "21": 10.6419, + "22": 10.47523, + "23": 10.66959, + "24": 10.54157, + "25": 10.4825, + "26": 10.55255, + "27": 10.57459, + "28": 10.55159, + "29": 10.5668, + "30": 10.31134, + "31": 10.01921, + "32": 10.42655, + "33": 10.42294, + "34": 10.14739, + "35": 10.21574, + "36": 10.15811, + "37": 10.30279, + "38": 10.14031, + "39": 10.36301, + "40": 10.02669, + "41": 10.07635, + "42": 10.16156, + "43": 9.74374, + "44": 9.88962, + "45": 9.75874, + "46": 9.73618, + "47": 10.0844, + "48": 9.78532, + "49": 9.45072, + "50": 9.85634 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 27105.0, + "2": 28791.0, + "3": 29282.0, + "4": 27583.0, + "5": 31595.0, + "6": 32831.0, + "7": 31023.0, + "8": 27107.0, + "9": 30780.0, + "10": 25505.0, + "11": 33684.0, + "12": 30235.0, + "13": 32960.0, + "14": 32880.0, + "15": 30405.0, + "16": 32455.0, + "17": 30933.0, + "18": 30623.0, + "19": 30803.0, + "20": 28593.0, + "21": 29002.0, + "22": 27030.0, + "23": 34463.0, + "24": 29154.0, + "25": 27827.0, + "26": 31119.0, + "27": 32108.0, + "28": 33412.0, + "29": 34737.0, + "30": 30465.0, + "31": 28775.0, + "32": 33115.0, + "33": 34745.0, + "34": 30785.0, + "35": 32116.0, + "36": 33968.0, + "37": 36757.0, + "38": 34150.0, + "39": 37240.0, + "40": 35353.0, + "41": 34638.0, + "42": 36703.0, + "43": 34601.0, + "44": 33783.0, + "45": 35388.0, + "46": 35484.0, + "47": 40591.0, + "48": 36671.0, + "49": 36174.0, + "50": 38231.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1561367040.0, + "2": 1560972288.0, + "3": 1561248256.0, + "4": 1560096768.0, + "5": 1559926784.0, + "6": 1561850368.0, + "7": 1560161792.0, + "8": 1560285184.0, + "9": 1560998912.0, + "10": 1561293824.0, + "11": 1560700416.0, + "12": 1562299904.0, + "13": 1560526848.0, + "14": 1561499648.0, + "15": 1559979520.0, + "16": 1561232384.0, + "17": 1561337856.0, + "18": 1560266240.0, + "19": 1561224704.0, + "20": 1560222720.0, + "21": 1561771008.0, + "22": 1559743488.0, + "23": 1560801792.0, + "24": 1561316864.0, + "25": 1560606720.0, + "26": 1562301440.0, + "27": 1560251904.0, + "28": 1559861248.0, + "29": 1559861248.0, + "30": 1560919552.0, + "31": 1561406976.0, + "32": 1565212672.0, + "33": 1560626176.0, + "34": 1561871360.0, + "35": 1560959488.0, + "36": 1561910784.0, + "37": 1559904256.0, + "38": 1560347648.0, + "39": 1562116608.0, + "40": 1562510336.0, + "41": 1562299392.0, + "42": 1561589248.0, + "43": 1560753664.0, + "44": 1561721856.0, + "45": 1561170944.0, + "46": 1561996288.0, + "47": 1560805888.0, + "48": 1561083392.0, + "49": 1560795136.0, + "50": 1561778176.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3680567296.0, + "2": 4256236032.0, + "3": 4260136960.0, + "4": 4260136960.0, + "5": 4261063168.0, + "6": 4289287168.0, + "7": 4289287168.0, + "8": 4289287168.0, + "9": 4289287168.0, + "10": 4289287168.0, + "11": 4289287168.0, + "12": 4289287168.0, + "13": 4289287168.0, + "14": 4289287168.0, + "15": 4289287168.0, + "16": 4289287168.0, + "17": 4289287168.0, + "18": 4289287168.0, + "19": 4289287168.0, + "20": 4289287168.0, + "21": 4289287168.0, + "22": 4289287168.0, + "23": 4289287168.0, + "24": 4289287168.0, + "25": 4289287168.0, + "26": 4289287168.0, + "27": 4289287168.0, + "28": 4289287168.0, + "29": 4289287168.0, + "30": 4289287168.0, + "31": 4289287168.0, + "32": 4289287168.0, + "33": 4289287168.0, + "34": 4289287168.0, + "35": 4289287168.0, + "36": 4289287168.0, + "37": 4289287168.0, + "38": 4289287168.0, + "39": 4289287168.0, + "40": 4289287168.0, + "41": 4289287168.0, + "42": 4289287168.0, + "43": 4289287168.0, + "44": 4289287168.0, + "45": 4289287168.0, + "46": 4289287168.0, + "47": 4289287168.0, + "48": 4289287168.0, + "49": 4289287168.0, + "50": 4289287168.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 18.57368, + "2": 0.50382, + "3": 0.41522, + "4": 0.37227, + "5": 0.37501, + "6": 0.33117, + "7": 0.32515, + "8": 0.31941, + "9": 0.32367, + "10": 0.32326, + "11": 0.30606, + "12": 0.30616, + "13": 0.29955, + "14": 0.30443, + "15": 0.30558, + "16": 0.29289, + "17": 0.30498, + "18": 0.29213, + "19": 0.29318, + "20": 0.29695, + "21": 0.29798, + "22": 0.31295, + "23": 0.29473, + "24": 0.29975, + "25": 0.29698, + "26": 0.30574, + "27": 0.29785, + "28": 0.30807, + "29": 0.29928, + "30": 0.3087, + "31": 0.30718, + "32": 0.30993, + "33": 0.30203, + "34": 0.31719, + "35": 0.30742, + "36": 0.30563, + "37": 0.31427, + "38": 0.31171, + "39": 0.31768, + "40": 0.30755, + "41": 0.30394, + "42": 0.29792, + "43": 0.30454, + "44": 0.31398, + "45": 0.29651, + "46": 0.31171, + "47": 0.29161, + "48": 0.3034, + "49": 0.2972, + "50": 0.29959 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..58eb3fc16cd --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.95004, + "2": 10.9521, + "3": 10.5115, + "4": 9.96454, + "5": 9.93941, + "6": 9.67273, + "7": 10.20975, + "8": 9.49716, + "9": 9.55902, + "10": 9.79742, + "11": 9.30109, + "12": 9.40483, + "13": 9.39546, + "14": 8.84681, + "15": 9.02444, + "16": 9.07121, + "17": 9.04574, + "18": 8.75678, + "19": 9.18159, + "20": 8.8595, + "21": 8.53503, + "22": 8.55182, + "23": 8.42441, + "24": 8.37608, + "25": 8.64304, + "26": 7.97393, + "27": 8.56806, + "28": 8.19764, + "29": 8.3928, + "30": 8.67283, + "31": 8.289, + "32": 8.43572, + "33": 8.5568, + "34": 8.66018, + "35": 8.07934, + "36": 7.94976, + "37": 8.29565, + "38": 7.98044, + "39": 8.39201, + "40": 8.35513, + "41": 8.31876, + "42": 8.0583, + "43": 8.03283, + "44": 8.24243, + "45": 8.10277, + "46": 7.61696, + "47": 8.15273, + "48": 8.00569, + "49": 8.38688, + "50": 7.81491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19403624.0, + "2": 19274194.0, + "3": 19372760.0, + "4": 86525248.0, + "5": 148575568.0, + "6": 145226704.0, + "7": 171879984.0, + "8": 195785248.0, + "9": 164124752.0, + "10": 167684736.0, + "11": 221077344.0, + "12": 200384224.0, + "13": 248872528.0, + "14": 211169424.0, + "15": 214304608.0, + "16": 216075632.0, + "17": 267845984.0, + "18": 170470336.0, + "19": 176865072.0, + "20": 187955392.0, + "21": 225750704.0, + "22": 247396816.0, + "23": 211643856.0, + "24": 205638464.0, + "25": 277022272.0, + "26": 291562304.0, + "27": 225789840.0, + "28": 288202368.0, + "29": 198390384.0, + "30": 213302208.0, + "31": 227204752.0, + "32": 271112416.0, + "33": 231840432.0, + "34": 203575536.0, + "35": 191152368.0, + "36": 222566928.0, + "37": 177810112.0, + "38": 228708544.0, + "39": 211168784.0, + "40": 215603968.0, + "41": 200089440.0, + "42": 228529888.0, + "43": 198782848.0, + "44": 141902272.0, + "45": 181922816.0, + "46": 115369856.0, + "47": 170214176.0, + "48": 137292832.0, + "49": 97654936.0, + "50": 160979632.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4883602432.0, + "2": 4885017088.0, + "3": 4882657792.0, + "4": 4883046912.0, + "5": 4883725824.0, + "6": 4883713536.0, + "7": 4883040768.0, + "8": 4883273216.0, + "9": 4882952704.0, + "10": 4885949952.0, + "11": 4883990016.0, + "12": 4887679488.0, + "13": 4884011520.0, + "14": 4882899456.0, + "15": 4883515904.0, + "16": 4883990016.0, + "17": 4883410432.0, + "18": 4883673600.0, + "19": 4882903552.0, + "20": 4884541952.0, + "21": 4883138048.0, + "22": 4883247616.0, + "23": 4883839488.0, + "24": 4885058048.0, + "25": 4882676224.0, + "26": 4884058624.0, + "27": 4884724224.0, + "28": 4884874752.0, + "29": 4883127808.0, + "30": 4883252736.0, + "31": 4882955776.0, + "32": 4885190144.0, + "33": 4883845632.0, + "34": 4884392448.0, + "35": 4883083776.0, + "36": 4883851776.0, + "37": 4885246464.0, + "38": 4882680320.0, + "39": 4884296192.0, + "40": 4884689408.0, + "41": 4882836992.0, + "42": 4883972608.0, + "43": 4884519424.0, + "44": 4883354112.0, + "45": 4883495424.0, + "46": 4882788864.0, + "47": 4883144192.0, + "48": 4883688960.0, + "49": 4884182528.0, + "50": 4885279232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 41210470400.0, + "2": 41210470400.0, + "3": 41210470400.0, + "4": 41210470400.0, + "5": 41210470400.0, + "6": 41210470400.0, + "7": 41210470400.0, + "8": 41210470400.0, + "9": 41210470400.0, + "10": 41210470400.0, + "11": 41210470400.0, + "12": 41210470400.0, + "13": 41210470400.0, + "14": 41210470400.0, + "15": 41210470400.0, + "16": 41210470400.0, + "17": 41210470400.0, + "18": 41210470400.0, + "19": 41210470400.0, + "20": 41210470400.0, + "21": 41210470400.0, + "22": 41210470400.0, + "23": 41210470400.0, + "24": 41210470400.0, + "25": 41210470400.0, + "26": 41210470400.0, + "27": 41210470400.0, + "28": 41210470400.0, + "29": 41210470400.0, + "30": 41210470400.0, + "31": 41210470400.0, + "32": 41210470400.0, + "33": 41210470400.0, + "34": 41210470400.0, + "35": 41210470400.0, + "36": 41210470400.0, + "37": 41210470400.0, + "38": 41210470400.0, + "39": 41210470400.0, + "40": 41210470400.0, + "41": 41210470400.0, + "42": 41210470400.0, + "43": 41210470400.0, + "44": 41210470400.0, + "45": 41210470400.0, + "46": 41210470400.0, + "47": 41210470400.0, + "48": 41210470400.0, + "49": 41210470400.0, + "50": 41210470400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 86.8085, + "2": 1.10913, + "3": 0.99097, + "4": 0.89412, + "5": 1.25997, + "6": 0.98162, + "7": 0.98318, + "8": 1.13296, + "9": 0.88126, + "10": 0.8633, + "11": 2.2744, + "12": 4.5393, + "13": 3.22763, + "14": 1.64923, + "15": 0.86595, + "16": 0.86575, + "17": 0.85272, + "18": 0.85454, + "19": 0.85281, + "20": 0.87018, + "21": 0.84654, + "22": 0.8494, + "23": 0.84882, + "24": 0.84482, + "25": 0.85311, + "26": 0.84678, + "27": 0.84096, + "28": 0.8412, + "29": 0.84156, + "30": 0.84475, + "31": 0.84747, + "32": 0.85058, + "33": 0.84977, + "34": 0.8479, + "35": 0.85234, + "36": 0.85012, + "37": 0.85087, + "38": 0.84594, + "39": 0.84558, + "40": 0.84807, + "41": 0.84183, + "42": 0.8439, + "43": 0.84221, + "44": 0.84248, + "45": 0.84257, + "46": 0.83922, + "47": 0.84311, + "48": 0.84159, + "49": 0.84011, + "50": 0.8353 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..daa04af43dd --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.95004, + "2": 10.9521, + "3": 10.5115, + "4": 9.96454, + "5": 9.93941, + "6": 9.67273, + "7": 10.20975, + "8": 9.49716, + "9": 9.55902, + "10": 9.79742, + "11": 9.30109, + "12": 9.40483, + "13": 9.39546, + "14": 8.84681, + "15": 9.02444, + "16": 9.07121, + "17": 9.04574, + "18": 8.75678, + "19": 9.18159, + "20": 8.8595, + "21": 8.53503, + "22": 8.55182, + "23": 8.42441, + "24": 8.37608, + "25": 8.64304, + "26": 7.97393, + "27": 8.56806, + "28": 8.19764, + "29": 8.3928, + "30": 8.67283, + "31": 8.289, + "32": 8.43572, + "33": 8.5568, + "34": 8.66018, + "35": 8.07934, + "36": 7.94976, + "37": 8.29565, + "38": 7.98044, + "39": 8.39201, + "40": 8.35513, + "41": 8.31876, + "42": 8.0583, + "43": 8.03283, + "44": 8.24243, + "45": 8.10277, + "46": 7.61696, + "47": 8.15273, + "48": 8.00569, + "49": 8.38688, + "50": 7.81491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 19403624.0, + "2": 19274194.0, + "3": 19372760.0, + "4": 86525248.0, + "5": 148575568.0, + "6": 145226704.0, + "7": 171879984.0, + "8": 195785248.0, + "9": 164124752.0, + "10": 167684736.0, + "11": 221077344.0, + "12": 200384224.0, + "13": 248872528.0, + "14": 211169424.0, + "15": 214304608.0, + "16": 216075632.0, + "17": 267845984.0, + "18": 170470336.0, + "19": 176865072.0, + "20": 187955392.0, + "21": 225750704.0, + "22": 247396816.0, + "23": 211643856.0, + "24": 205638464.0, + "25": 277022272.0, + "26": 291562304.0, + "27": 225789840.0, + "28": 288202368.0, + "29": 198390384.0, + "30": 213302208.0, + "31": 227204752.0, + "32": 271112416.0, + "33": 231840432.0, + "34": 203575536.0, + "35": 191152368.0, + "36": 222566928.0, + "37": 177810112.0, + "38": 228708544.0, + "39": 211168784.0, + "40": 215603968.0, + "41": 200089440.0, + "42": 228529888.0, + "43": 198782848.0, + "44": 141902272.0, + "45": 181922816.0, + "46": 115369856.0, + "47": 170214176.0, + "48": 137292832.0, + "49": 97654936.0, + "50": 160979632.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4882187264.0, + "2": 4881607168.0, + "3": 4882283008.0, + "4": 4881322496.0, + "5": 4882174464.0, + "6": 4883177984.0, + "7": 4883252736.0, + "8": 4881774080.0, + "9": 4881443328.0, + "10": 4884319744.0, + "11": 4882319872.0, + "12": 4881232384.0, + "13": 4880836096.0, + "14": 4882124288.0, + "15": 4882108928.0, + "16": 4883384832.0, + "17": 4880466432.0, + "18": 4881518080.0, + "19": 4881734144.0, + "20": 4883215872.0, + "21": 4883534336.0, + "22": 4882774528.0, + "23": 4881818112.0, + "24": 4882441728.0, + "25": 4880546304.0, + "26": 4882178560.0, + "27": 4881892864.0, + "28": 4881869312.0, + "29": 4882979328.0, + "30": 4882715136.0, + "31": 4883084800.0, + "32": 4881436160.0, + "33": 4881766912.0, + "34": 4881406464.0, + "35": 4881531392.0, + "36": 4881479168.0, + "37": 4882455040.0, + "38": 4882054656.0, + "39": 4882005504.0, + "40": 4882743808.0, + "41": 4881211904.0, + "42": 4881378816.0, + "43": 4882133504.0, + "44": 4881860096.0, + "45": 4883165696.0, + "46": 4882168320.0, + "47": 4881526272.0, + "48": 4882125312.0, + "49": 4881533440.0, + "50": 4881598976.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 41210470400.0, + "2": 41210470400.0, + "3": 41210470400.0, + "4": 41210470400.0, + "5": 41210470400.0, + "6": 41210470400.0, + "7": 41210470400.0, + "8": 41210470400.0, + "9": 41210470400.0, + "10": 41210470400.0, + "11": 41210470400.0, + "12": 41210470400.0, + "13": 41210470400.0, + "14": 41210470400.0, + "15": 41210470400.0, + "16": 41210470400.0, + "17": 41210470400.0, + "18": 41210470400.0, + "19": 41210470400.0, + "20": 41210470400.0, + "21": 41210470400.0, + "22": 41210470400.0, + "23": 41210470400.0, + "24": 41210470400.0, + "25": 41210470400.0, + "26": 41210470400.0, + "27": 41210470400.0, + "28": 41210470400.0, + "29": 41210470400.0, + "30": 41210470400.0, + "31": 41210470400.0, + "32": 41210470400.0, + "33": 41210470400.0, + "34": 41210470400.0, + "35": 41210470400.0, + "36": 41210470400.0, + "37": 41210470400.0, + "38": 41210470400.0, + "39": 41210470400.0, + "40": 41210470400.0, + "41": 41210470400.0, + "42": 41210470400.0, + "43": 41210470400.0, + "44": 41210470400.0, + "45": 41210470400.0, + "46": 41210470400.0, + "47": 41210470400.0, + "48": 41210470400.0, + "49": 41210470400.0, + "50": 41210470400.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 96.21947, + "2": 1.10023, + "3": 0.96399, + "4": 0.91113, + "5": 1.27509, + "6": 1.00484, + "7": 1.01236, + "8": 1.1739, + "9": 0.89406, + "10": 0.88836, + "11": 0.92033, + "12": 0.88331, + "13": 0.88179, + "14": 0.88307, + "15": 0.88648, + "16": 0.88425, + "17": 0.87155, + "18": 0.87556, + "19": 0.87374, + "20": 0.8744, + "21": 0.86757, + "22": 0.87217, + "23": 0.8736, + "24": 0.86646, + "25": 0.87328, + "26": 0.87121, + "27": 0.85886, + "28": 0.86392, + "29": 0.86385, + "30": 0.86425, + "31": 0.8631, + "32": 0.8617, + "33": 0.86069, + "34": 0.86829, + "35": 0.86837, + "36": 0.86776, + "37": 0.86686, + "38": 0.86359, + "39": 0.8677, + "40": 0.86441, + "41": 0.86179, + "42": 0.86079, + "43": 0.86149, + "44": 0.86222, + "45": 0.86336, + "46": 0.85875, + "47": 0.86219, + "48": 0.86026, + "49": 0.85894, + "50": 0.8544 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7de18dbf1e6 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,160 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"Ladies and gentlemen, we have a", + "generated_tokens": [ + 32844, + 1394, + 1278, + 4735, + 2200, + 1278, + 7146, + 30774, + 1044, + 1321, + 1278, + 26466, + 3930, + 2015, + 1044, + 1321, + 1278, + 30245, + 8223, + 1044, + 1429, + 1076, + 1332, + 1564, + 1321, + 94228, + 1044, + 1729, + 1736, + 1261 + ], + "latency": 14.808601379394531, + "cuda_graph_request_count_map": null, + "step_count": 30, + "logprobs": [ + -10.448518753051758, + -3.716420888900757, + -2.8180086612701416, + -1.243106484413147, + -0.24187560379505157, + -1.8551081418991089, + -2.376569986343384, + -1.9645118713378906, + -2.109349250793457, + -6.07373046875, + -0.813377320766449, + -2.4836418628692627, + -3.536860466003418, + -4.1545329093933105, + -1.963995099067688, + -1.7819465398788452, + -2.2402775287628174, + -7.124850273132324, + -0.04146730899810791, + -1.8956966400146484, + -5.0023088455200195, + -8.669596672058105, + -10.017587661743164, + -0.8539565801620483, + -4.7854743003845215, + -0.8601016402244568, + -2.2983784675598145, + -0.02070772275328636, + -0.03607859089970589, + -3.367767095565796, + -8.763198852539062, + -1.2790724039077759, + -6.628117084503174, + -3.8508503437042236, + -3.782344341278076, + -4.235904693603516, + -2.1973977088928223, + -1.0788196325302124, + -0.2020731270313263, + -0.7980086207389832, + -4.743488311767578, + -9.224939346313477, + -0.013240979053080082, + -3.2384161949157715, + -1.2238521575927734, + -3.97282338142395, + -0.7767954468727112, + -0.002251710742712021, + -2.981565475463867, + -10.490681648254395, + -2.981630802154541, + -1.152982234954834, + -4.928977012634277, + -0.20187364518642426, + -0.04966790974140167, + -1.3515344858169556, + -2.200042963027954, + -4.3557939529418945, + -0.36048629879951477, + -4.087867736816406, + -0.40046849846839905, + -0.13703589141368866, + -2.805037260055542, + -10.71006965637207, + -0.051668114960193634, + -3.277766704559326, + -0.8607810735702515, + -4.699098110198975, + -0.2629980444908142, + -2.686246633529663, + -0.8297598361968994, + -1.6083959341049194, + -5.793962478637695, + -16.94595718383789, + -2.966357707977295, + -0.11854737997055054, + -7.449464321136475, + -1.0872507095336914, + -2.057858943939209, + -1.5261168479919434, + -0.2606821358203888, + -5.62846565246582, + -0.006751700770109892, + -7.793324947357178, + -2.7264108657836914, + -2.9370150566101074, + -3.0170741081237793, + -2.344959020614624, + -0.3987772464752197, + -1.5143157243728638, + -2.3020801544189453, + -0.5609080791473389, + -1.3160275220870972, + -1.987931728363037, + -1.7064098119735718, + -0.7751765847206116, + -0.49781349301338196, + -1.2841160297393799, + -1.5651875734329224, + -0.9735848307609558, + -0.4030272364616394, + -0.4352472424507141, + -0.044518083333969116, + -1.274898648262024, + -2.1242129802703857, + -2.7193076610565186, + -0.7538051009178162, + -0.41235291957855225, + -2.793597459793091, + -0.005556969437748194, + -0.0006632988806813955, + -0.018600093200802803, + -0.2592391073703766, + -0.1656094491481781, + -1.9508270025253296, + -1.1184629201889038, + -0.39283478260040283 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..6da9de60910 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,160 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"Ladies and gentlemen, we have a", + "generated_tokens": [ + 32844, + 1394, + 1278, + 4735, + 2200, + 1278, + 7146, + 30774, + 1044, + 1321, + 1278, + 26466, + 3930, + 2015, + 1044, + 1321, + 1278, + 30245, + 8223, + 1044, + 1429, + 1076, + 1332, + 1564, + 1321, + 94228, + 1044, + 1729, + 1736, + 1261 + ], + "latency": 16.235759735107422, + "cuda_graph_request_count_map": null, + "step_count": 30, + "logprobs": [ + -10.448518753051758, + -3.716420888900757, + -2.8180086612701416, + -1.243106484413147, + -0.24187560379505157, + -1.8551081418991089, + -2.376569986343384, + -1.9645118713378906, + -2.109349250793457, + -6.07373046875, + -0.813377320766449, + -2.4836418628692627, + -3.536860466003418, + -4.1545329093933105, + -1.963995099067688, + -1.7819465398788452, + -2.2402775287628174, + -7.124850273132324, + -0.04146730899810791, + -1.8956966400146484, + -5.0023088455200195, + -8.669596672058105, + -10.017587661743164, + -0.8539565801620483, + -4.7854743003845215, + -0.8601016402244568, + -2.2983784675598145, + -0.02070772275328636, + -0.03607859089970589, + -3.367767095565796, + -8.763198852539062, + -1.2790724039077759, + -6.628117084503174, + -3.8508503437042236, + -3.782344341278076, + -4.235904693603516, + -2.1973977088928223, + -1.0788196325302124, + -0.2020731270313263, + -0.7980086207389832, + -4.743488311767578, + -9.224939346313477, + -0.013240979053080082, + -3.2384161949157715, + -1.2238521575927734, + -3.97282338142395, + -0.7767954468727112, + -0.002251710742712021, + -2.981565475463867, + -10.490681648254395, + -2.981630802154541, + -1.152982234954834, + -4.928977012634277, + -0.20187364518642426, + -0.04966790974140167, + -1.3515344858169556, + -2.200042963027954, + -4.3557939529418945, + -0.36048629879951477, + -4.087867736816406, + -0.40046849846839905, + -0.13703589141368866, + -2.805037260055542, + -10.71006965637207, + -0.051668114960193634, + -3.277766704559326, + -0.8607810735702515, + -4.699098110198975, + -0.2629980444908142, + -2.686246633529663, + -0.8297598361968994, + -1.6083959341049194, + -5.793962478637695, + -16.94595718383789, + -2.966357707977295, + -0.11854737997055054, + -7.449464321136475, + -1.0872507095336914, + -2.057858943939209, + -1.5261168479919434, + -0.2606821358203888, + -5.62846565246582, + -0.006751700770109892, + -7.793324947357178, + -2.7264108657836914, + -2.9370150566101074, + -3.0170741081237793, + -2.344959020614624, + -0.3987772464752197, + -1.5143157243728638, + -2.3020801544189453, + -0.5609080791473389, + -1.3160275220870972, + -1.987931728363037, + -1.7064098119735718, + -0.7751765847206116, + -0.49781349301338196, + -1.2841160297393799, + -1.5651875734329224, + -0.9735848307609558, + -0.4030272364616394, + -0.4352472424507141, + -0.044518083333969116, + -1.274898648262024, + -2.1242129802703857, + -2.7193076610565186, + -0.7538051009178162, + -0.41235291957855225, + -2.793597459793091, + -0.005556969437748194, + -0.0006632988806813955, + -0.018600093200802803, + -0.2592391073703766, + -0.1656094491481781, + -1.9508270025253296, + -1.1184629201889038, + -0.39283478260040283 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..c476959ff0d --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you", "generated_tokens": [32844, 1394, 1278, 4735, 2200, 1278, 7146, 30774, 1044, 1321, 1278, 26466, 3930, 2015, 1044, 1321, 1278, 30245, 8223, 1044, 1429, 1073, 4525, 4670, 1317, 3354, 1261, 6947, 1394, 1636], "tpot": [2.733039379119873, 0.6462976336479187, 0.07264169305562973, 0.0742710754275322, 0.07288099080324173, 0.07153938710689545, 0.07088476419448853, 0.07172102481126785, 0.07192070782184601, 0.07100768387317657, 0.07138767838478088, 0.07139590382575989, 0.07129142433404922, 0.07098300755023956, 0.07138735800981522, 0.07275772839784622, 0.07109101116657257, 0.07179228961467743, 0.07515615969896317, 0.07377561926841736, 0.07309594005346298, 0.07474038749933243, 0.07406358420848846, 0.07546690851449966, 0.07540509104728699, 0.0726393610239029, 0.07218870520591736, 0.07215183973312378, 0.07114642858505249, 0.07710829377174377], "latency": 5.430960623547435, "logprobs": [-9.965213775634766, -3.6972405910491943, -2.8163998126983643, -1.3259482383728027, -0.22894315421581268, -1.801922082901001, -2.380244493484497, -1.9902539253234863, -2.195096731185913, -6.201530456542969, -0.8732167482376099, -2.3890693187713623, -3.4655370712280273, -4.265195369720459, -1.9843286275863647, -1.8525164127349854, -2.247467517852783, -7.156258583068848, -0.04102461040019989, -1.9811111688613892, -5.029601097106934, -8.902811050415039, -9.822186470031738, -0.7156577706336975, -4.822559833526611, -0.830146074295044, -2.264935255050659, -0.02063065394759178, -0.0366678312420845, -3.4783172607421875, -8.650375366210938, -1.247912883758545, -6.612592697143555, -3.64731502532959, -3.6577675342559814, -4.237436771392822, -2.1768712997436523, -1.0792245864868164, -0.22580334544181824, -0.7873495221138, -4.81827974319458, -8.96638011932373, -0.01367227640002966, -3.1769614219665527, -1.3207263946533203, -3.995314121246338, -0.7868635654449463, -0.0021346656139940023, -2.9099419116973877, -10.611204147338867, -3.244929313659668, -1.103176474571228, -4.869075775146484, -0.2279863953590393, -0.06238075718283653, -1.2982008457183838, -2.208366632461548, -4.412147045135498, -0.3588172495365143, -4.0025200843811035, -0.3714170753955841, -0.14747798442840576, -2.7178127765655518, -10.553118705749512, -0.057451825588941574, -3.381279945373535, -0.8944476842880249, -4.724348068237305, -0.25962480902671814, -2.655942678451538, -0.8473785519599915, -1.5853822231292725, -5.768069267272949, -16.949235916137695, -2.675042152404785, -0.12979209423065186, -7.452098369598389, -1.1089909076690674, -2.0911808013916016, -1.5204540491104126, -0.29428866505622864, -5.85228157043457, -0.006600246299058199, -7.733879089355469, -2.7058277130126953, -2.9573605060577393, -3.0196847915649414, -2.450732469558716, -0.3994073271751404, -1.426312804222107, -2.2726848125457764, -0.6103246212005615, -1.3297024965286255, -1.936716914176941, -1.7187526226043701, -0.7779486775398254, -0.5053722858428955, -1.300978660583496, -1.588526964187622, -0.9849303960800171, -0.4031231701374054, -0.4341556429862976, -0.04193130508065224, -1.2715754508972168, -2.116468906402588, -2.6802122592926025, -0.8255553245544434, -0.42921727895736694, -2.904050350189209, -1.4616029262542725, -1.6294372081756592, -0.05650198459625244, -1.3804056644439697, -1.3228214979171753, -1.268000602722168, -1.2933895587921143, -0.5357464551925659]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b6c02c060a6 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you", "generated_tokens": [32844, 1394, 1278, 4735, 2200, 1278, 7146, 30774, 1044, 1321, 1278, 26466, 3930, 2015, 1044, 1321, 1278, 30245, 8223, 1044, 1429, 1073, 4525, 4670, 1317, 3354, 1261, 6947, 1394, 1636], "tpot": [2.503589630126953, 0.6800563931465149, 0.08277347683906555, 0.08122985810041428, 0.08191356807947159, 0.08138781040906906, 0.08179532736539841, 0.08161459118127823, 0.07997913658618927, 0.08174006640911102, 0.08130563050508499, 0.08119283616542816, 0.083525151014328, 0.0812133401632309, 0.08146921545267105, 0.0823666900396347, 0.0816216692328453, 0.08155478537082672, 0.08143103867769241, 0.08030703663825989, 0.08146540820598602, 0.0814284160733223, 0.0816466212272644, 0.08187752962112427, 0.0818835198879242, 0.08184870332479477, 0.08239014446735382, 0.08098144084215164, 0.08134105801582336, 0.0866490826010704], "latency": 5.490644988021813, "logprobs": [-9.965213775634766, -3.6972405910491943, -2.8163998126983643, -1.3259482383728027, -0.22894315421581268, -1.801922082901001, -2.380244493484497, -1.9902539253234863, -2.195096731185913, -6.201530456542969, -0.8732167482376099, -2.3890693187713623, -3.4655370712280273, -4.265195369720459, -1.9843286275863647, -1.8525164127349854, -2.247467517852783, -7.156258583068848, -0.04102461040019989, -1.9811111688613892, -5.029601097106934, -8.902811050415039, -9.822186470031738, -0.7156577706336975, -4.822559833526611, -0.830146074295044, -2.264935255050659, -0.02063065394759178, -0.0366678312420845, -3.4783172607421875, -8.650375366210938, -1.247912883758545, -6.612592697143555, -3.64731502532959, -3.6577675342559814, -4.237436771392822, -2.1768712997436523, -1.0792245864868164, -0.22580334544181824, -0.7873495221138, -4.81827974319458, -8.96638011932373, -0.01367227640002966, -3.1769614219665527, -1.3207263946533203, -3.995314121246338, -0.7868635654449463, -0.0021346656139940023, -2.9099419116973877, -10.611204147338867, -3.244929313659668, -1.103176474571228, -4.869075775146484, -0.2279863953590393, -0.06238075718283653, -1.2982008457183838, -2.208366632461548, -4.412147045135498, -0.3588172495365143, -4.0025200843811035, -0.3714170753955841, -0.14747798442840576, -2.7178127765655518, -10.553118705749512, -0.057451825588941574, -3.381279945373535, -0.8944476842880249, -4.724348068237305, -0.25962480902671814, -2.655942678451538, -0.8473785519599915, -1.5853822231292725, -5.768069267272949, -16.949235916137695, -2.675042152404785, -0.12979209423065186, -7.452098369598389, -1.1089909076690674, -2.0911808013916016, -1.5204540491104126, -0.29428866505622864, -5.85228157043457, -0.006600246299058199, -7.733879089355469, -2.7058277130126953, -2.9573605060577393, -3.0196847915649414, -2.450732469558716, -0.3994073271751404, -1.426312804222107, -2.2726848125457764, -0.6103246212005615, -1.3297024965286255, -1.936716914176941, -1.7187526226043701, -0.7779486775398254, -0.5053722858428955, -1.300978660583496, -1.588526964187622, -0.9849303960800171, -0.4031231701374054, -0.4341556429862976, -0.04193130508065224, -1.2715754508972168, -2.116468906402588, -2.6802122592926025, -0.8255553245544434, -0.42921727895736694, -2.904050350189209, -1.4616029262542725, -1.6294372081756592, -0.05650198459625244, -1.3804056644439697, -1.3228214979171753, -1.268000602722168, -1.2933895587921143, -0.5357464551925659]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..73fd0caaba6 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you", "generated_tokens": [32844, 1394, 1278, 4735, 2200, 1278, 7146, 30774, 1044, 1321, 1278, 26466, 3930, 2015, 1044, 1321, 1278, 30245, 8223, 1044, 1429, 1073, 4525, 4670, 1317, 3354, 1261, 6947, 1394, 1636], "tpot": [10.43424129486084, 0.7638993859291077, 0.09107974171638489, 0.08577366918325424, 0.08719602972269058, 0.083721823990345, 0.08272668719291687, 0.08146601915359497, 0.08189938962459564, 0.08049139380455017, 0.07883225381374359, 0.07785692811012268, 0.08183623105287552, 0.07833318412303925, 0.07873958349227905, 0.07888400554656982, 0.07648000121116638, 0.07849132269620895, 0.07743385434150696, 0.0782134085893631, 0.07679852843284607, 0.08008908480405807, 0.07658396661281586, 0.07823677361011505, 0.07748432457447052, 0.0787697285413742, 0.08206255733966827, 0.08375174552202225, 0.08225465565919876, 0.07925853133201599], "latency": 13.472718173637986, "logprobs": [-10.448518753051758, -3.693941593170166, -2.833103656768799, -1.2445695400238037, -0.23799529671669006, -1.7522815465927124, -2.378152370452881, -1.9484899044036865, -2.108924388885498, -6.127920150756836, -0.8197959661483765, -2.477976083755493, -3.492497444152832, -4.170319557189941, -1.9918553829193115, -1.8618279695510864, -2.2335567474365234, -7.071791172027588, -0.039936937391757965, -1.9948835372924805, -5.008172512054443, -8.708097457885742, -9.903486251831055, -0.851460337638855, -4.765171051025391, -0.8707393407821655, -2.219733238220215, -0.01853257417678833, -0.035978663712739944, -3.387631416320801, -8.754067420959473, -1.2686023712158203, -6.662981986999512, -3.7872395515441895, -3.6667354106903076, -4.171259880065918, -2.2128500938415527, -1.091404914855957, -0.22139909863471985, -0.8265669941902161, -4.746159553527832, -9.04170036315918, -0.013459297828376293, -3.17301607131958, -1.3139652013778687, -3.9821701049804688, -0.7707944512367249, -0.002040567807853222, -2.9162371158599854, -10.677328109741211, -3.1504364013671875, -1.1485933065414429, -4.871399402618408, -0.20786719024181366, -0.06325722485780716, -1.3587590456008911, -2.207646369934082, -4.407937049865723, -0.36253970861434937, -4.0189995765686035, -0.3988611698150635, -0.13855230808258057, -2.7199528217315674, -10.558171272277832, -0.04671315476298332, -3.5006980895996094, -0.9756439328193665, -4.673828125, -0.2634696066379547, -2.5747756958007812, -0.8531911969184875, -1.6041897535324097, -5.738401412963867, -16.978456497192383, -2.6206722259521484, -0.14098073542118073, -7.450814247131348, -1.076573371887207, -2.129807472229004, -1.5724716186523438, -0.29326727986335754, -5.609436511993408, -0.0065282415598630905, -7.79502010345459, -2.715085744857788, -3.0889575481414795, -3.0355961322784424, -2.4395439624786377, -0.3983170986175537, -1.5089631080627441, -2.276723861694336, -0.6004312038421631, -1.3054823875427246, -1.9454480409622192, -1.7226327657699585, -0.7742734551429749, -0.49186939001083374, -1.2962923049926758, -1.567298173904419, -1.0149078369140625, -0.40288272500038147, -0.4789682626724243, -0.04533138871192932, -1.2695876359939575, -2.223480224609375, -2.6703481674194336, -0.7677091956138611, -0.42749911546707153, -2.8563802242279053, -1.5350499153137207, -1.6456167697906494, -0.05149398744106293, -1.3739523887634277, -1.3543274402618408, -1.2655469179153442, -1.307403326034546, -0.497008740901947]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..6e6ce1505c0 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1 @@ +{"0": {"input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", "generated_text": " Wait for the moment when the music stops, and the lights come up, and the DJ says, \"I'm going to play a song for you", "generated_tokens": [32844, 1394, 1278, 4735, 2200, 1278, 7146, 30774, 1044, 1321, 1278, 26466, 3930, 2015, 1044, 1321, 1278, 30245, 8223, 1044, 1429, 1073, 4525, 4670, 1317, 3354, 1261, 6947, 1394, 1636], "tpot": [10.709007263183594, 0.669678270816803, 0.09804461151361465, 0.095348060131073, 0.09667164832353592, 0.09561737626791, 0.0947360023856163, 0.09328848123550415, 0.11012643575668335, 0.1598961353302002, 0.14693699777126312, 0.09262124449014664, 0.09305571019649506, 0.09258509427309036, 0.09176912158727646, 0.09196281433105469, 0.09245385974645615, 0.09135404974222183, 0.0905960276722908, 0.09146220982074738, 0.09172549843788147, 0.09252317249774933, 0.09408310800790787, 0.09566400200128555, 0.09318371117115021, 0.09361443668603897, 0.09234358370304108, 0.09181750565767288, 0.09110204875469208, 0.09121545404195786], "latency": 14.143519142875448, "logprobs": [-10.448518753051758, -3.693941593170166, -2.833103656768799, -1.2445695400238037, -0.23799529671669006, -1.7522815465927124, -2.378152370452881, -1.9484899044036865, -2.108924388885498, -6.127920150756836, -0.8197959661483765, -2.477976083755493, -3.492497444152832, -4.170319557189941, -1.9918553829193115, -1.8618279695510864, -2.2335567474365234, -7.071791172027588, -0.039936937391757965, -1.9948835372924805, -5.008172512054443, -8.708097457885742, -9.903486251831055, -0.851460337638855, -4.765171051025391, -0.8707393407821655, -2.219733238220215, -0.01853257417678833, -0.035978663712739944, -3.387631416320801, -8.754067420959473, -1.2686023712158203, -6.662981986999512, -3.7872395515441895, -3.6667354106903076, -4.171259880065918, -2.2128500938415527, -1.091404914855957, -0.22139909863471985, -0.8265669941902161, -4.746159553527832, -9.04170036315918, -0.013459297828376293, -3.17301607131958, -1.3139652013778687, -3.9821701049804688, -0.7707944512367249, -0.002040567807853222, -2.9162371158599854, -10.677328109741211, -3.1504364013671875, -1.1485933065414429, -4.871399402618408, -0.20786719024181366, -0.06325722485780716, -1.3587590456008911, -2.207646369934082, -4.407937049865723, -0.36253970861434937, -4.0189995765686035, -0.3988611698150635, -0.13855230808258057, -2.7199528217315674, -10.558171272277832, -0.04671315476298332, -3.5006980895996094, -0.9756439328193665, -4.673828125, -0.2634696066379547, -2.5747756958007812, -0.8531911969184875, -1.6041897535324097, -5.738401412963867, -16.978456497192383, -2.6206722259521484, -0.14098073542118073, -7.450814247131348, -1.076573371887207, -2.129807472229004, -1.5724716186523438, -0.29326727986335754, -5.609436511993408, -0.0065282415598630905, -7.79502010345459, -2.715085744857788, -3.0889575481414795, -3.0355961322784424, -2.4395439624786377, -0.3983170986175537, -1.5089631080627441, -2.276723861694336, -0.6004312038421631, -1.3054823875427246, -1.9454480409622192, -1.7226327657699585, -0.7742734551429749, -0.49186939001083374, -1.2962923049926758, -1.567298173904419, -1.0149078369140625, -0.40288272500038147, -0.4789682626724243, -0.04533138871192932, -1.2695876359939575, -2.223480224609375, -2.6703481674194336, -0.7677091956138611, -0.42749911546707153, -2.8563802242279053, -1.5350499153137207, -1.6456167697906494, -0.05149398744106293, -1.3739523887634277, -1.3543274402618408, -1.2655469179153442, -1.307403326034546, -0.497008740901947]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..c9c6ca750a5 --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.14877, + "2": 9.15171, + "3": 9.14691, + "4": 9.15346, + "5": 9.15057, + "6": 9.14683, + "7": 9.14378, + "8": 9.14363, + "9": 9.15069, + "10": 9.15231, + "11": 9.14609, + "12": 9.14125, + "13": 9.1414, + "14": 9.14248, + "15": 9.13419, + "16": 9.12601, + "17": 9.12407, + "18": 9.12053, + "19": 9.11789, + "20": 9.09777, + "21": 9.06948, + "22": 9.06985, + "23": 9.07079, + "24": 9.06043, + "25": 9.05505, + "26": 9.05713, + "27": 9.04089, + "28": 9.0186, + "29": 9.00353, + "30": 8.99697, + "31": 8.99484, + "32": 8.98416, + "33": 8.97763, + "34": 8.98617, + "35": 8.94993, + "36": 8.94557, + "37": 8.92133, + "38": 8.94104, + "39": 8.92482, + "40": 8.87122, + "41": 8.89627, + "42": 8.87601, + "43": 8.87414, + "44": 8.8411, + "45": 8.81228, + "46": 8.79564, + "47": 8.84576, + "48": 8.77191, + "49": 8.78047, + "50": 8.76196 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3477955.0, + "2": 3392302.0, + "3": 3630021.0, + "4": 3532452.0, + "5": 3783960.0, + "6": 3584449.0, + "7": 3478372.0, + "8": 3414330.0, + "9": 3511649.0, + "10": 3544311.0, + "11": 3475468.0, + "12": 3518965.0, + "13": 3591786.0, + "14": 3549396.0, + "15": 3421163.0, + "16": 3383319.0, + "17": 3424120.0, + "18": 3509184.0, + "19": 3426107.0, + "20": 3465915.0, + "21": 3700118.0, + "22": 3474397.0, + "23": 3693474.0, + "24": 3405657.0, + "25": 3457588.0, + "26": 3479130.0, + "27": 3555371.0, + "28": 3496999.0, + "29": 3561842.0, + "30": 3708011.0, + "31": 3397663.0, + "32": 3467970.0, + "33": 3515742.0, + "34": 3501589.0, + "35": 3432484.0, + "36": 3453953.0, + "37": 3958777.0, + "38": 3488640.0, + "39": 3409958.0, + "40": 3614258.0, + "41": 3425709.0, + "42": 3643603.0, + "43": 3473029.0, + "44": 3448331.0, + "45": 3452202.0, + "46": 3585738.0, + "47": 3467386.0, + "48": 3462962.0, + "49": 3529813.0, + "50": 3412019.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2431335424.0, + "2": 2431335424.0, + "3": 2431335424.0, + "4": 2431335424.0, + "5": 2431335424.0, + "6": 2431335424.0, + "7": 2431335424.0, + "8": 2431335424.0, + "9": 2431335424.0, + "10": 2431335424.0, + "11": 2431335424.0, + "12": 2431335424.0, + "13": 2431335424.0, + "14": 2431335424.0, + "15": 2431335424.0, + "16": 2431335424.0, + "17": 2431335424.0, + "18": 2431335424.0, + "19": 2431335424.0, + "20": 2431335424.0, + "21": 2431335424.0, + "22": 2431335424.0, + "23": 2431335424.0, + "24": 2431335424.0, + "25": 2431335424.0, + "26": 2431335424.0, + "27": 2431335424.0, + "28": 2431335424.0, + "29": 2431335424.0, + "30": 2431335424.0, + "31": 2431335424.0, + "32": 2431335424.0, + "33": 2431335424.0, + "34": 2431335424.0, + "35": 2431335424.0, + "36": 2431335424.0, + "37": 2431335424.0, + "38": 2431335424.0, + "39": 2431335424.0, + "40": 2431335424.0, + "41": 2431335424.0, + "42": 2431335424.0, + "43": 2431335424.0, + "44": 2431335424.0, + "45": 2431335424.0, + "46": 2431335424.0, + "47": 2431335424.0, + "48": 2431335424.0, + "49": 2431335424.0, + "50": 2431335424.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14740086784.0, + "2": 15773663232.0, + "3": 15773663232.0, + "4": 15773663232.0, + "5": 15773663232.0, + "6": 15773663232.0, + "7": 15773663232.0, + "8": 15773663232.0, + "9": 15773663232.0, + "10": 15773663232.0, + "11": 15773663232.0, + "12": 15773663232.0, + "13": 15773663232.0, + "14": 15773663232.0, + "15": 15773663232.0, + "16": 15773663232.0, + "17": 15773663232.0, + "18": 15773663232.0, + "19": 15773663232.0, + "20": 15773663232.0, + "21": 15773663232.0, + "22": 15773663232.0, + "23": 15773663232.0, + "24": 15773663232.0, + "25": 15773663232.0, + "26": 15773663232.0, + "27": 15773663232.0, + "28": 15773663232.0, + "29": 15773663232.0, + "30": 15773663232.0, + "31": 15773663232.0, + "32": 15773663232.0, + "33": 15773663232.0, + "34": 15773663232.0, + "35": 15773663232.0, + "36": 15773663232.0, + "37": 15773663232.0, + "38": 15773663232.0, + "39": 15773663232.0, + "40": 15773663232.0, + "41": 15773663232.0, + "42": 15773663232.0, + "43": 15773663232.0, + "44": 15773663232.0, + "45": 15773663232.0, + "46": 15773663232.0, + "47": 15773663232.0, + "48": 15773663232.0, + "49": 15773663232.0, + "50": 15773663232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6.39505, + "2": 0.21516, + "3": 0.18624, + "4": 0.175, + "5": 0.17379, + "6": 0.17879, + "7": 0.17408, + "8": 0.17518, + "9": 0.17364, + "10": 0.17554, + "11": 0.17315, + "12": 0.17503, + "13": 0.17414, + "14": 0.17548, + "15": 0.17545, + "16": 0.17826, + "17": 0.17718, + "18": 0.19728, + "19": 0.18692, + "20": 0.17494, + "21": 0.17798, + "22": 0.19601, + "23": 0.19365, + "24": 0.17678, + "25": 0.17574, + "26": 0.17806, + "27": 0.17921, + "28": 0.18107, + "29": 0.17587, + "30": 0.18109, + "31": 0.18577, + "32": 0.1776, + "33": 0.17358, + "34": 0.18514, + "35": 0.18404, + "36": 0.18319, + "37": 0.17375, + "38": 0.19861, + "39": 0.18522, + "40": 0.17986, + "41": 0.18196, + "42": 0.17906, + "43": 0.1816, + "44": 0.17873, + "45": 0.1842, + "46": 0.18193, + "47": 0.18207, + "48": 0.18599, + "49": 0.17271, + "50": 0.18388 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..7dbf0c3c806 --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.14877, + "2": 9.15171, + "3": 9.14691, + "4": 9.15346, + "5": 9.15057, + "6": 9.14683, + "7": 9.14378, + "8": 9.14363, + "9": 9.15069, + "10": 9.15231, + "11": 9.14609, + "12": 9.14125, + "13": 9.1414, + "14": 9.14248, + "15": 9.13419, + "16": 9.12601, + "17": 9.12407, + "18": 9.12053, + "19": 9.11789, + "20": 9.09777, + "21": 9.06948, + "22": 9.06985, + "23": 9.07079, + "24": 9.06043, + "25": 9.05505, + "26": 9.05713, + "27": 9.04089, + "28": 9.0186, + "29": 9.00353, + "30": 8.99697, + "31": 8.99484, + "32": 8.98416, + "33": 8.97763, + "34": 8.98617, + "35": 8.94993, + "36": 8.94557, + "37": 8.92133, + "38": 8.94104, + "39": 8.92482, + "40": 8.87122, + "41": 8.89627, + "42": 8.87601, + "43": 8.87414, + "44": 8.8411, + "45": 8.81228, + "46": 8.79564, + "47": 8.84576, + "48": 8.77191, + "49": 8.78047, + "50": 8.76196 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3477955.0, + "2": 3392302.0, + "3": 3630021.0, + "4": 3532452.0, + "5": 3783960.0, + "6": 3584449.0, + "7": 3478372.0, + "8": 3414330.0, + "9": 3511649.0, + "10": 3544311.0, + "11": 3475468.0, + "12": 3518965.0, + "13": 3591786.0, + "14": 3549396.0, + "15": 3421163.0, + "16": 3383319.0, + "17": 3424120.0, + "18": 3509184.0, + "19": 3426107.0, + "20": 3465915.0, + "21": 3700118.0, + "22": 3474397.0, + "23": 3693474.0, + "24": 3405657.0, + "25": 3457588.0, + "26": 3479130.0, + "27": 3555371.0, + "28": 3496999.0, + "29": 3561842.0, + "30": 3708011.0, + "31": 3397663.0, + "32": 3467970.0, + "33": 3515742.0, + "34": 3501589.0, + "35": 3432484.0, + "36": 3453953.0, + "37": 3958777.0, + "38": 3488640.0, + "39": 3409958.0, + "40": 3614258.0, + "41": 3425709.0, + "42": 3643603.0, + "43": 3473029.0, + "44": 3448331.0, + "45": 3452202.0, + "46": 3585738.0, + "47": 3467386.0, + "48": 3462962.0, + "49": 3529813.0, + "50": 3412019.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 2431335424.0, + "2": 2431335424.0, + "3": 2431335424.0, + "4": 2431335424.0, + "5": 2431335424.0, + "6": 2431335424.0, + "7": 2431335424.0, + "8": 2431335424.0, + "9": 2431335424.0, + "10": 2431335424.0, + "11": 2431335424.0, + "12": 2431335424.0, + "13": 2431335424.0, + "14": 2431335424.0, + "15": 2431335424.0, + "16": 2431335424.0, + "17": 2431335424.0, + "18": 2431335424.0, + "19": 2431335424.0, + "20": 2431335424.0, + "21": 2431335424.0, + "22": 2431335424.0, + "23": 2431335424.0, + "24": 2431335424.0, + "25": 2431335424.0, + "26": 2431335424.0, + "27": 2431335424.0, + "28": 2431335424.0, + "29": 2431335424.0, + "30": 2431335424.0, + "31": 2431335424.0, + "32": 2431335424.0, + "33": 2431335424.0, + "34": 2431335424.0, + "35": 2431335424.0, + "36": 2431335424.0, + "37": 2431335424.0, + "38": 2431335424.0, + "39": 2431335424.0, + "40": 2431335424.0, + "41": 2431335424.0, + "42": 2431335424.0, + "43": 2431335424.0, + "44": 2431335424.0, + "45": 2431335424.0, + "46": 2431335424.0, + "47": 2431335424.0, + "48": 2431335424.0, + "49": 2431335424.0, + "50": 2431335424.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14740086784.0, + "2": 15773663232.0, + "3": 15773663232.0, + "4": 15773663232.0, + "5": 15773663232.0, + "6": 15773663232.0, + "7": 15773663232.0, + "8": 15773663232.0, + "9": 15773663232.0, + "10": 15773663232.0, + "11": 15773663232.0, + "12": 15773663232.0, + "13": 15773663232.0, + "14": 15773663232.0, + "15": 15773663232.0, + "16": 15773663232.0, + "17": 15773663232.0, + "18": 15773663232.0, + "19": 15773663232.0, + "20": 15773663232.0, + "21": 15773663232.0, + "22": 15773663232.0, + "23": 15773663232.0, + "24": 15773663232.0, + "25": 15773663232.0, + "26": 15773663232.0, + "27": 15773663232.0, + "28": 15773663232.0, + "29": 15773663232.0, + "30": 15773663232.0, + "31": 15773663232.0, + "32": 15773663232.0, + "33": 15773663232.0, + "34": 15773663232.0, + "35": 15773663232.0, + "36": 15773663232.0, + "37": 15773663232.0, + "38": 15773663232.0, + "39": 15773663232.0, + "40": 15773663232.0, + "41": 15773663232.0, + "42": 15773663232.0, + "43": 15773663232.0, + "44": 15773663232.0, + "45": 15773663232.0, + "46": 15773663232.0, + "47": 15773663232.0, + "48": 15773663232.0, + "49": 15773663232.0, + "50": 15773663232.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5.97454, + "2": 0.19297, + "3": 0.18331, + "4": 0.18419, + "5": 0.18099, + "6": 0.18354, + "7": 0.18332, + "8": 0.18477, + "9": 0.18391, + "10": 0.18412, + "11": 0.18154, + "12": 0.18441, + "13": 0.18338, + "14": 0.1859, + "15": 0.18316, + "16": 0.18298, + "17": 0.18167, + "18": 0.18385, + "19": 0.18358, + "20": 0.18325, + "21": 0.18392, + "22": 0.1826, + "23": 0.18266, + "24": 0.18333, + "25": 0.18413, + "26": 0.185, + "27": 0.18218, + "28": 0.18361, + "29": 0.18161, + "30": 0.18366, + "31": 0.18238, + "32": 0.18355, + "33": 0.18274, + "34": 0.18399, + "35": 0.18232, + "36": 0.18405, + "37": 0.18325, + "38": 0.18367, + "39": 0.18313, + "40": 0.18319, + "41": 0.18244, + "42": 0.18305, + "43": 0.18287, + "44": 0.18263, + "45": 0.18326, + "46": 0.18213, + "47": 0.18261, + "48": 0.18333, + "49": 0.18287, + "50": 0.18284 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 7d734c87640..5e195fce69e 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,287 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.28651, "5": 9.27695, "10": 9.28293, "15": 9.25309, "20": 9.20817, "25": 9.1444, "30": 9.0783, "35": 8.95924, "40": 8.90642, "45": 8.81379, "50": 8.73494}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5959400.0, "5": 6498093.0, "10": 6529058.0, "15": 6530023.0, "20": 6527801.0, "25": 6993035.0, "30": 6468659.0, "35": 7065192.0, "40": 6555154.0, "45": 6680008.0, "50": 6238169.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1653820416.0, "5": 1653820416.0, "10": 1653820416.0, "15": 1653820416.0, "20": 1653820416.0, "25": 1653820416.0, "30": 1653820416.0, "35": 1653820416.0, "40": 1653820416.0, "45": 1653820416.0, "50": 1653820416.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1653824512.0, "5": 2142515200.0, "10": 2142515200.0, "15": 2142515200.0, "20": 2142515200.0, "25": 2142515200.0, "30": 2142515200.0, "35": 2142515200.0, "40": 2142515200.0, "45": 2142515200.0, "50": 2142515200.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 15.11206, "5": 0.77394, "10": 0.7922, "15": 0.78343, "20": 1.06047, "25": 0.81006, "30": 1.0155, "35": 0.81042, "40": 0.79935, "45": 0.79234, "50": 0.78227}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.28651, + "2": 9.28395, + "3": 9.28076, + "4": 9.28861, + "5": 9.27695, + "6": 9.28726, + "7": 9.27836, + "8": 9.28267, + "9": 9.28528, + "10": 9.28293, + "11": 9.28342, + "12": 9.27384, + "13": 9.27126, + "14": 9.27209, + "15": 9.25309, + "16": 9.24492, + "17": 9.24857, + "18": 9.22951, + "19": 9.23151, + "20": 9.20817, + "21": 9.17046, + "22": 9.15049, + "23": 9.16842, + "24": 9.15079, + "25": 9.1444, + "26": 9.14727, + "27": 9.12295, + "28": 9.09719, + "29": 9.09388, + "30": 9.0783, + "31": 8.97175, + "32": 9.03158, + "33": 9.02021, + "34": 8.98662, + "35": 8.95924, + "36": 8.97139, + "37": 8.91443, + "38": 8.88795, + "39": 8.88883, + "40": 8.90642, + "41": 8.81811, + "42": 8.87405, + "43": 8.85666, + "44": 8.81697, + "45": 8.81379, + "46": 8.84457, + "47": 8.73721, + "48": 8.66931, + "49": 8.70107, + "50": 8.73494 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5959400.0, + "2": 6553837.0, + "3": 7313493.0, + "4": 6377142.0, + "5": 6498093.0, + "6": 7151947.0, + "7": 6210401.0, + "8": 6334645.0, + "9": 6624584.0, + "10": 6529058.0, + "11": 7466715.0, + "12": 6471579.0, + "13": 6003497.0, + "14": 8071952.0, + "15": 6530023.0, + "16": 7526922.0, + "17": 6034909.0, + "18": 6289605.0, + "19": 6162573.0, + "20": 6527801.0, + "21": 6981914.0, + "22": 7132792.0, + "23": 5928465.0, + "24": 6210239.0, + "25": 6993035.0, + "26": 6471579.0, + "27": 6355357.0, + "28": 6877112.0, + "29": 6380110.0, + "30": 6468659.0, + "31": 8165130.0, + "32": 6765448.0, + "33": 6355561.0, + "34": 6662237.0, + "35": 7065192.0, + "36": 6076915.0, + "37": 7785518.0, + "38": 6727009.0, + "39": 7315902.0, + "40": 6555154.0, + "41": 7314617.0, + "42": 6591869.0, + "43": 6928017.0, + "44": 7274417.0, + "45": 6680008.0, + "46": 6232372.0, + "47": 6496696.0, + "48": 6809696.0, + "49": 6753491.0, + "50": 6238169.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653820416.0, + "2": 1653820416.0, + "3": 1653820416.0, + "4": 1653820416.0, + "5": 1653820416.0, + "6": 1653820416.0, + "7": 1653820416.0, + "8": 1653820416.0, + "9": 1653820416.0, + "10": 1653820416.0, + "11": 1653820416.0, + "12": 1653820416.0, + "13": 1653820416.0, + "14": 1653820416.0, + "15": 1653820416.0, + "16": 1653820416.0, + "17": 1653820416.0, + "18": 1653820416.0, + "19": 1653820416.0, + "20": 1653820416.0, + "21": 1653820416.0, + "22": 1653820416.0, + "23": 1653820416.0, + "24": 1653820416.0, + "25": 1653820416.0, + "26": 1653820416.0, + "27": 1653820416.0, + "28": 1653820416.0, + "29": 1653820416.0, + "30": 1653820416.0, + "31": 1653820416.0, + "32": 1653820416.0, + "33": 1653820416.0, + "34": 1653820416.0, + "35": 1653820416.0, + "36": 1653820416.0, + "37": 1653820416.0, + "38": 1653820416.0, + "39": 1653820416.0, + "40": 1653820416.0, + "41": 1653820416.0, + "42": 1653820416.0, + "43": 1653820416.0, + "44": 1653820416.0, + "45": 1653820416.0, + "46": 1653820416.0, + "47": 1653820416.0, + "48": 1653820416.0, + "49": 1653820416.0, + "50": 1653820416.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653824512.0, + "2": 2142515200.0, + "3": 2142515200.0, + "4": 2142515200.0, + "5": 2142515200.0, + "6": 2142515200.0, + "7": 2142515200.0, + "8": 2142515200.0, + "9": 2142515200.0, + "10": 2142515200.0, + "11": 2142515200.0, + "12": 2142515200.0, + "13": 2142515200.0, + "14": 2142515200.0, + "15": 2142515200.0, + "16": 2142515200.0, + "17": 2142515200.0, + "18": 2142515200.0, + "19": 2142515200.0, + "20": 2142515200.0, + "21": 2142515200.0, + "22": 2142515200.0, + "23": 2142515200.0, + "24": 2142515200.0, + "25": 2142515200.0, + "26": 2142515200.0, + "27": 2142515200.0, + "28": 2142515200.0, + "29": 2142515200.0, + "30": 2142515200.0, + "31": 2142515200.0, + "32": 2142515200.0, + "33": 2142515200.0, + "34": 2142515200.0, + "35": 2142515200.0, + "36": 2142515200.0, + "37": 2142515200.0, + "38": 2142515200.0, + "39": 2142515200.0, + "40": 2142515200.0, + "41": 2142515200.0, + "42": 2142515200.0, + "43": 2142515200.0, + "44": 2142515200.0, + "45": 2142515200.0, + "46": 2142515200.0, + "47": 2142515200.0, + "48": 2142515200.0, + "49": 2142515200.0, + "50": 2142515200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.37901, + "2": 1.00945, + "3": 0.97719, + "4": 1.00246, + "5": 0.95207, + "6": 0.95, + "7": 0.94753, + "8": 0.94707, + "9": 0.94823, + "10": 0.95034, + "11": 0.97925, + "12": 0.97702, + "13": 0.94374, + "14": 1.21224, + "15": 0.94966, + "16": 0.9451, + "17": 0.94563, + "18": 0.94303, + "19": 1.24824, + "20": 0.9452, + "21": 0.97627, + "22": 0.98348, + "23": 1.30411, + "24": 0.94959, + "25": 0.94296, + "26": 0.95158, + "27": 0.94465, + "28": 0.94877, + "29": 0.94644, + "30": 0.94814, + "31": 1.31598, + "32": 0.98424, + "33": 1.24311, + "34": 0.94977, + "35": 1.30685, + "36": 0.94683, + "37": 0.95372, + "38": 0.94948, + "39": 0.95294, + "40": 1.3288, + "41": 0.97347, + "42": 0.9497, + "43": 1.30833, + "44": 0.94555, + "45": 0.94659, + "46": 0.95663, + "47": 0.95211, + "48": 0.95051, + "49": 0.94741, + "50": 0.96304 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..40e463c4e4e --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.28651, + "2": 9.28395, + "3": 9.28076, + "4": 9.28861, + "5": 9.27695, + "6": 9.28726, + "7": 9.27836, + "8": 9.28267, + "9": 9.28528, + "10": 9.28293, + "11": 9.28342, + "12": 9.27384, + "13": 9.27126, + "14": 9.27209, + "15": 9.25309, + "16": 9.24492, + "17": 9.24857, + "18": 9.22951, + "19": 9.23151, + "20": 9.20817, + "21": 9.17046, + "22": 9.15049, + "23": 9.16842, + "24": 9.15079, + "25": 9.1444, + "26": 9.14727, + "27": 9.12295, + "28": 9.09719, + "29": 9.09388, + "30": 9.0783, + "31": 8.97175, + "32": 9.03158, + "33": 9.02021, + "34": 8.98662, + "35": 8.95924, + "36": 8.97139, + "37": 8.91443, + "38": 8.88795, + "39": 8.88883, + "40": 8.90642, + "41": 8.81811, + "42": 8.87405, + "43": 8.85666, + "44": 8.81697, + "45": 8.81379, + "46": 8.84457, + "47": 8.73721, + "48": 8.66931, + "49": 8.70107, + "50": 8.73494 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5959400.0, + "2": 6553837.0, + "3": 7313493.0, + "4": 6377142.0, + "5": 6498093.0, + "6": 7151947.0, + "7": 6210401.0, + "8": 6334645.0, + "9": 6624584.0, + "10": 6529058.0, + "11": 7466715.0, + "12": 6471579.0, + "13": 6003497.0, + "14": 8071952.0, + "15": 6530023.0, + "16": 7526922.0, + "17": 6034909.0, + "18": 6289605.0, + "19": 6162573.0, + "20": 6527801.0, + "21": 6981914.0, + "22": 7132792.0, + "23": 5928465.0, + "24": 6210239.0, + "25": 6993035.0, + "26": 6471579.0, + "27": 6355357.0, + "28": 6877112.0, + "29": 6380110.0, + "30": 6468659.0, + "31": 8165130.0, + "32": 6765448.0, + "33": 6355561.0, + "34": 6662237.0, + "35": 7065192.0, + "36": 6076915.0, + "37": 7785518.0, + "38": 6727009.0, + "39": 7315902.0, + "40": 6555154.0, + "41": 7314617.0, + "42": 6591869.0, + "43": 6928017.0, + "44": 7274417.0, + "45": 6680008.0, + "46": 6232372.0, + "47": 6496696.0, + "48": 6809696.0, + "49": 6753491.0, + "50": 6238169.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653820416.0, + "2": 1653820416.0, + "3": 1653820416.0, + "4": 1653820416.0, + "5": 1653820416.0, + "6": 1653820416.0, + "7": 1653820416.0, + "8": 1653820416.0, + "9": 1653820416.0, + "10": 1653820416.0, + "11": 1653820416.0, + "12": 1653820416.0, + "13": 1653820416.0, + "14": 1653820416.0, + "15": 1653820416.0, + "16": 1653820416.0, + "17": 1653820416.0, + "18": 1653820416.0, + "19": 1653820416.0, + "20": 1653820416.0, + "21": 1653820416.0, + "22": 1653820416.0, + "23": 1653820416.0, + "24": 1653820416.0, + "25": 1653820416.0, + "26": 1653820416.0, + "27": 1653820416.0, + "28": 1653820416.0, + "29": 1653820416.0, + "30": 1653820416.0, + "31": 1653820416.0, + "32": 1653820416.0, + "33": 1653820416.0, + "34": 1653820416.0, + "35": 1653820416.0, + "36": 1653820416.0, + "37": 1653820416.0, + "38": 1653820416.0, + "39": 1653820416.0, + "40": 1653820416.0, + "41": 1653820416.0, + "42": 1653820416.0, + "43": 1653820416.0, + "44": 1653820416.0, + "45": 1653820416.0, + "46": 1653820416.0, + "47": 1653820416.0, + "48": 1653820416.0, + "49": 1653820416.0, + "50": 1653820416.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653824512.0, + "2": 2142515200.0, + "3": 2142515200.0, + "4": 2142515200.0, + "5": 2142515200.0, + "6": 2142515200.0, + "7": 2142515200.0, + "8": 2142515200.0, + "9": 2142515200.0, + "10": 2142515200.0, + "11": 2142515200.0, + "12": 2142515200.0, + "13": 2142515200.0, + "14": 2142515200.0, + "15": 2142515200.0, + "16": 2142515200.0, + "17": 2142515200.0, + "18": 2142515200.0, + "19": 2142515200.0, + "20": 2142515200.0, + "21": 2142515200.0, + "22": 2142515200.0, + "23": 2142515200.0, + "24": 2142515200.0, + "25": 2142515200.0, + "26": 2142515200.0, + "27": 2142515200.0, + "28": 2142515200.0, + "29": 2142515200.0, + "30": 2142515200.0, + "31": 2142515200.0, + "32": 2142515200.0, + "33": 2142515200.0, + "34": 2142515200.0, + "35": 2142515200.0, + "36": 2142515200.0, + "37": 2142515200.0, + "38": 2142515200.0, + "39": 2142515200.0, + "40": 2142515200.0, + "41": 2142515200.0, + "42": 2142515200.0, + "43": 2142515200.0, + "44": 2142515200.0, + "45": 2142515200.0, + "46": 2142515200.0, + "47": 2142515200.0, + "48": 2142515200.0, + "49": 2142515200.0, + "50": 2142515200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 15.02389, + "2": 0.90938, + "3": 0.833, + "4": 0.83139, + "5": 0.87938, + "6": 0.8436, + "7": 0.84341, + "8": 0.84254, + "9": 0.83392, + "10": 0.8484, + "11": 0.84151, + "12": 0.84392, + "13": 0.84466, + "14": 0.85987, + "15": 0.85033, + "16": 0.84631, + "17": 0.86049, + "18": 0.84475, + "19": 1.16176, + "20": 0.84338, + "21": 0.8904, + "22": 0.85197, + "23": 1.15742, + "24": 0.84195, + "25": 0.84346, + "26": 0.84406, + "27": 0.84866, + "28": 0.87098, + "29": 0.83524, + "30": 1.14004, + "31": 1.16138, + "32": 0.8533, + "33": 0.84361, + "34": 0.84484, + "35": 0.84276, + "36": 0.83752, + "37": 0.84209, + "38": 0.84471, + "39": 0.8405, + "40": 1.1684, + "41": 0.84052, + "42": 0.83772, + "43": 1.16777, + "44": 1.14427, + "45": 0.84262, + "46": 1.19422, + "47": 0.84418, + "48": 0.85685, + "49": 0.84021, + "50": 0.84726 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..bf52c8e8fd4 --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 9.28651, + "2": 9.28395, + "3": 9.28076, + "4": 9.28861, + "5": 9.27695, + "6": 9.28726, + "7": 9.27836, + "8": 9.28267, + "9": 9.28528, + "10": 9.28293, + "11": 9.28342, + "12": 9.27384, + "13": 9.27126, + "14": 9.27209, + "15": 9.25309, + "16": 9.24492, + "17": 9.24857, + "18": 9.22951, + "19": 9.23151, + "20": 9.20817, + "21": 9.17046, + "22": 9.15049, + "23": 9.16842, + "24": 9.15079, + "25": 9.1444, + "26": 9.14727, + "27": 9.12295, + "28": 9.09719, + "29": 9.09388, + "30": 9.0783, + "31": 8.97175, + "32": 9.03158, + "33": 9.02021, + "34": 8.98662, + "35": 8.95924, + "36": 8.97139, + "37": 8.91443, + "38": 8.88795, + "39": 8.88883, + "40": 8.90642, + "41": 8.81811, + "42": 8.87405, + "43": 8.85666, + "44": 8.81697, + "45": 8.81379, + "46": 8.84457, + "47": 8.73721, + "48": 8.66931, + "49": 8.70107, + "50": 8.73494 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5959400.0, + "2": 6553837.0, + "3": 7313493.0, + "4": 6377142.0, + "5": 6498093.0, + "6": 7151947.0, + "7": 6210401.0, + "8": 6334645.0, + "9": 6624584.0, + "10": 6529058.0, + "11": 7466715.0, + "12": 6471579.0, + "13": 6003497.0, + "14": 8071952.0, + "15": 6530023.0, + "16": 7526922.0, + "17": 6034909.0, + "18": 6289605.0, + "19": 6162573.0, + "20": 6527801.0, + "21": 6981914.0, + "22": 7132792.0, + "23": 5928465.0, + "24": 6210239.0, + "25": 6993035.0, + "26": 6471579.0, + "27": 6355357.0, + "28": 6877112.0, + "29": 6380110.0, + "30": 6468659.0, + "31": 8165130.0, + "32": 6765448.0, + "33": 6355561.0, + "34": 6662237.0, + "35": 7065192.0, + "36": 6076915.0, + "37": 7785518.0, + "38": 6727009.0, + "39": 7315902.0, + "40": 6555154.0, + "41": 7314617.0, + "42": 6591869.0, + "43": 6928017.0, + "44": 7274417.0, + "45": 6680008.0, + "46": 6232372.0, + "47": 6496696.0, + "48": 6809696.0, + "49": 6753491.0, + "50": 6238169.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653820416.0, + "2": 1653820416.0, + "3": 1653820416.0, + "4": 1653820416.0, + "5": 1653820416.0, + "6": 1653820416.0, + "7": 1653820416.0, + "8": 1653820416.0, + "9": 1653820416.0, + "10": 1653820416.0, + "11": 1653820416.0, + "12": 1653820416.0, + "13": 1653820416.0, + "14": 1653820416.0, + "15": 1653820416.0, + "16": 1653820416.0, + "17": 1653820416.0, + "18": 1653820416.0, + "19": 1653820416.0, + "20": 1653820416.0, + "21": 1653820416.0, + "22": 1653820416.0, + "23": 1653820416.0, + "24": 1653820416.0, + "25": 1653820416.0, + "26": 1653820416.0, + "27": 1653820416.0, + "28": 1653820416.0, + "29": 1653820416.0, + "30": 1653820416.0, + "31": 1653820416.0, + "32": 1653820416.0, + "33": 1653820416.0, + "34": 1653820416.0, + "35": 1653820416.0, + "36": 1653820416.0, + "37": 1653820416.0, + "38": 1653820416.0, + "39": 1653820416.0, + "40": 1653820416.0, + "41": 1653820416.0, + "42": 1653820416.0, + "43": 1653820416.0, + "44": 1653820416.0, + "45": 1653820416.0, + "46": 1653820416.0, + "47": 1653820416.0, + "48": 1653820416.0, + "49": 1653820416.0, + "50": 1653820416.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1653824512.0, + "2": 2142515200.0, + "3": 2142515200.0, + "4": 2142515200.0, + "5": 2142515200.0, + "6": 2142515200.0, + "7": 2142515200.0, + "8": 2142515200.0, + "9": 2142515200.0, + "10": 2142515200.0, + "11": 2142515200.0, + "12": 2142515200.0, + "13": 2142515200.0, + "14": 2142515200.0, + "15": 2142515200.0, + "16": 2142515200.0, + "17": 2142515200.0, + "18": 2142515200.0, + "19": 2142515200.0, + "20": 2142515200.0, + "21": 2142515200.0, + "22": 2142515200.0, + "23": 2142515200.0, + "24": 2142515200.0, + "25": 2142515200.0, + "26": 2142515200.0, + "27": 2142515200.0, + "28": 2142515200.0, + "29": 2142515200.0, + "30": 2142515200.0, + "31": 2142515200.0, + "32": 2142515200.0, + "33": 2142515200.0, + "34": 2142515200.0, + "35": 2142515200.0, + "36": 2142515200.0, + "37": 2142515200.0, + "38": 2142515200.0, + "39": 2142515200.0, + "40": 2142515200.0, + "41": 2142515200.0, + "42": 2142515200.0, + "43": 2142515200.0, + "44": 2142515200.0, + "45": 2142515200.0, + "46": 2142515200.0, + "47": 2142515200.0, + "48": 2142515200.0, + "49": 2142515200.0, + "50": 2142515200.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.64684, + "2": 0.98193, + "3": 0.95861, + "4": 0.96167, + "5": 0.96222, + "6": 0.96444, + "7": 0.95334, + "8": 0.95675, + "9": 0.95004, + "10": 0.9526, + "11": 0.94782, + "12": 0.95256, + "13": 0.95466, + "14": 0.95046, + "15": 0.96366, + "16": 0.95156, + "17": 0.95425, + "18": 0.9544, + "19": 1.2298, + "20": 0.95303, + "21": 0.95634, + "22": 0.95632, + "23": 0.95424, + "24": 0.95464, + "25": 0.96269, + "26": 0.96616, + "27": 0.94874, + "28": 0.94988, + "29": 1.26385, + "30": 0.95465, + "31": 1.2033, + "32": 0.9571, + "33": 0.956, + "34": 0.95832, + "35": 1.32667, + "36": 0.95679, + "37": 0.95623, + "38": 0.96193, + "39": 0.96003, + "40": 1.25799, + "41": 0.95599, + "42": 0.95891, + "43": 1.55786, + "44": 0.96371, + "45": 0.96764, + "46": 0.95894, + "47": 0.96017, + "48": 0.95646, + "49": 0.961, + "50": 0.96278 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 165aa133737..0bff8d085b5 100644 --- a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -1 +1,162 @@ -{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 10.74903, "5": 11.07413, "10": 9.25112, "15": 8.79113, "20": 8.16452, "25": 7.78994}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 245867.0, "5": 251594.0, "10": 252461.0, "15": 261948.0, "20": 248292.0, "25": 237325.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40674893824.0, "5": 40674893824.0, "10": 40674893824.0, "15": 40674893824.0, "20": 40674893824.0, "25": 40674893824.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40674897920.0, "5": 44982894592.0, "10": 44982894592.0, "15": 44982894592.0, "20": 44982894592.0, "25": 44982894592.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 13.38447, "5": 0.36674, "10": 0.37116, "15": 0.6292, "20": 0.37325, "25": 0.37334}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 10.74903, + "2": 10.75924, + "3": 16.15622, + "4": 20.1728, + "5": 11.07413, + "6": 10.29087, + "7": 10.31369, + "8": 10.31557, + "9": 9.68992, + "10": 9.25112, + "11": 9.43376, + "12": 9.8267, + "13": 8.88334, + "14": 8.49023, + "15": 8.79113, + "16": 7.95739, + "17": 7.70005, + "18": 7.81826, + "19": 8.21562, + "20": 8.16452, + "21": 7.833, + "22": 7.71899, + "23": 7.88724, + "24": 7.70093, + "25": 7.78994 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 245867.0, + "2": 256817.0, + "3": 248438.0, + "4": 233541.0, + "5": 251594.0, + "6": 259588.0, + "7": 256938.0, + "8": 237612.0, + "9": 241154.0, + "10": 252461.0, + "11": 288146.0, + "12": 248712.0, + "13": 241371.0, + "14": 228365.0, + "15": 261948.0, + "16": 237032.0, + "17": 249760.0, + "18": 251590.0, + "19": 257104.0, + "20": 248292.0, + "21": 231805.0, + "22": 223805.0, + "23": 247959.0, + "24": 250798.0, + "25": 237325.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 40735711232.0, + "3": 40735711232.0, + "4": 40735711232.0, + "5": 40735711232.0, + "6": 40735711232.0, + "7": 40735711232.0, + "8": 40735711232.0, + "9": 40735711232.0, + "10": 40735711232.0, + "11": 40735711232.0, + "12": 40735711232.0, + "13": 40735711232.0, + "14": 40735711232.0, + "15": 40735711232.0, + "16": 40735711232.0, + "17": 40735711232.0, + "18": 40735711232.0, + "19": 40735711232.0, + "20": 40735711232.0, + "21": 40735711232.0, + "22": 40735711232.0, + "23": 40735711232.0, + "24": 40735711232.0, + "25": 40735711232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 44991991808.0, + "3": 44993564672.0, + "4": 44993564672.0, + "5": 44993564672.0, + "6": 44993564672.0, + "7": 44993564672.0, + "8": 44993564672.0, + "9": 44993564672.0, + "10": 44993564672.0, + "11": 44993564672.0, + "12": 44993564672.0, + "13": 44993564672.0, + "14": 44993564672.0, + "15": 44993564672.0, + "16": 44993564672.0, + "17": 44993564672.0, + "18": 44993564672.0, + "19": 44993564672.0, + "20": 44993564672.0, + "21": 44993564672.0, + "22": 44993564672.0, + "23": 44993564672.0, + "24": 44993564672.0, + "25": 44993564672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 10.24757, + "2": 0.4815, + "3": 0.41556, + "4": 0.40564, + "5": 0.40743, + "6": 0.40813, + "7": 0.42484, + "8": 0.41261, + "9": 0.40523, + "10": 0.41064, + "11": 0.40795, + "12": 0.409, + "13": 0.41219, + "14": 0.41524, + "15": 0.41267, + "16": 0.40783, + "17": 0.40886, + "18": 0.41321, + "19": 0.40795, + "20": 0.41032, + "21": 0.41828, + "22": 0.40867, + "23": 0.42317, + "24": 0.40771, + "25": 0.4176 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..a5fc1a5f4c5 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 10.74903, + "2": 10.75924, + "3": 16.15622, + "4": 20.1728, + "5": 11.07413, + "6": 10.29087, + "7": 10.31369, + "8": 10.31557, + "9": 9.68992, + "10": 9.25112, + "11": 9.43376, + "12": 9.8267, + "13": 8.88334, + "14": 8.49023, + "15": 8.79113, + "16": 7.95739, + "17": 7.70005, + "18": 7.81826, + "19": 8.21562, + "20": 8.16452, + "21": 7.833, + "22": 7.71899, + "23": 7.88724, + "24": 7.70093, + "25": 7.78994 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 245867.0, + "2": 256817.0, + "3": 248438.0, + "4": 233541.0, + "5": 251594.0, + "6": 259588.0, + "7": 256938.0, + "8": 237612.0, + "9": 241154.0, + "10": 252461.0, + "11": 288146.0, + "12": 248712.0, + "13": 241371.0, + "14": 228365.0, + "15": 261948.0, + "16": 237032.0, + "17": 249760.0, + "18": 251590.0, + "19": 257104.0, + "20": 248292.0, + "21": 231805.0, + "22": 223805.0, + "23": 247959.0, + "24": 250798.0, + "25": 237325.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 40735711232.0, + "3": 40735711232.0, + "4": 40735711232.0, + "5": 40735711232.0, + "6": 40735711232.0, + "7": 40735711232.0, + "8": 40735711232.0, + "9": 40735711232.0, + "10": 40735711232.0, + "11": 40735711232.0, + "12": 40735711232.0, + "13": 40735711232.0, + "14": 40735711232.0, + "15": 40735711232.0, + "16": 40735711232.0, + "17": 40735711232.0, + "18": 40735711232.0, + "19": 40735711232.0, + "20": 40735711232.0, + "21": 40735711232.0, + "22": 40735711232.0, + "23": 40735711232.0, + "24": 40735711232.0, + "25": 40735711232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 44991991808.0, + "3": 44993564672.0, + "4": 44993564672.0, + "5": 44993564672.0, + "6": 44993564672.0, + "7": 44993564672.0, + "8": 44993564672.0, + "9": 44993564672.0, + "10": 44993564672.0, + "11": 44993564672.0, + "12": 44993564672.0, + "13": 44993564672.0, + "14": 44993564672.0, + "15": 44993564672.0, + "16": 44993564672.0, + "17": 44993564672.0, + "18": 44993564672.0, + "19": 44993564672.0, + "20": 44993564672.0, + "21": 44993564672.0, + "22": 44993564672.0, + "23": 44993564672.0, + "24": 44993564672.0, + "25": 44993564672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 13.38163, + "2": 0.76932, + "3": 0.59621, + "4": 0.3807, + "5": 0.37959, + "6": 0.38757, + "7": 0.38242, + "8": 0.39662, + "9": 0.38425, + "10": 0.38671, + "11": 0.3878, + "12": 0.37911, + "13": 0.38138, + "14": 0.38215, + "15": 0.37904, + "16": 0.3847, + "17": 0.38241, + "18": 0.38681, + "19": 0.39003, + "20": 0.37797, + "21": 0.3854, + "22": 0.71416, + "23": 0.38609, + "24": 0.37862, + "25": 0.37919 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..45c06ac2f7e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,162 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 10.74903, + "2": 10.75924, + "3": 16.15622, + "4": 20.1728, + "5": 11.07413, + "6": 10.29087, + "7": 10.31369, + "8": 10.31557, + "9": 9.68992, + "10": 9.25112, + "11": 9.43376, + "12": 9.8267, + "13": 8.88334, + "14": 8.49023, + "15": 8.79113, + "16": 7.95739, + "17": 7.70005, + "18": 7.81826, + "19": 8.21562, + "20": 8.16452, + "21": 7.833, + "22": 7.71899, + "23": 7.88724, + "24": 7.70093, + "25": 7.78994 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 245867.0, + "2": 256817.0, + "3": 248438.0, + "4": 233541.0, + "5": 251594.0, + "6": 259588.0, + "7": 256938.0, + "8": 237612.0, + "9": 241154.0, + "10": 252461.0, + "11": 288146.0, + "12": 248712.0, + "13": 241371.0, + "14": 228365.0, + "15": 261948.0, + "16": 237032.0, + "17": 249760.0, + "18": 251590.0, + "19": 257104.0, + "20": 248292.0, + "21": 231805.0, + "22": 223805.0, + "23": 247959.0, + "24": 250798.0, + "25": 237325.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 40735711232.0, + "3": 40735711232.0, + "4": 40735711232.0, + "5": 40735711232.0, + "6": 40735711232.0, + "7": 40735711232.0, + "8": 40735711232.0, + "9": 40735711232.0, + "10": 40735711232.0, + "11": 40735711232.0, + "12": 40735711232.0, + "13": 40735711232.0, + "14": 40735711232.0, + "15": 40735711232.0, + "16": 40735711232.0, + "17": 40735711232.0, + "18": 40735711232.0, + "19": 40735711232.0, + "20": 40735711232.0, + "21": 40735711232.0, + "22": 40735711232.0, + "23": 40735711232.0, + "24": 40735711232.0, + "25": 40735711232.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 40735711232.0, + "2": 44991991808.0, + "3": 44993564672.0, + "4": 44993564672.0, + "5": 44993564672.0, + "6": 44993564672.0, + "7": 44993564672.0, + "8": 44993564672.0, + "9": 44993564672.0, + "10": 44993564672.0, + "11": 44993564672.0, + "12": 44993564672.0, + "13": 44993564672.0, + "14": 44993564672.0, + "15": 44993564672.0, + "16": 44993564672.0, + "17": 44993564672.0, + "18": 44993564672.0, + "19": 44993564672.0, + "20": 44993564672.0, + "21": 44993564672.0, + "22": 44993564672.0, + "23": 44993564672.0, + "24": 44993564672.0, + "25": 44993564672.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 25, + "step_interval": 1, + "values": { + "1": 12.25468, + "2": 0.47853, + "3": 0.41459, + "4": 0.41066, + "5": 0.4125, + "6": 0.42243, + "7": 0.40926, + "8": 0.41832, + "9": 0.4068, + "10": 0.41071, + "11": 0.41068, + "12": 0.41187, + "13": 0.42064, + "14": 0.4228, + "15": 0.41026, + "16": 0.81409, + "17": 0.41651, + "18": 0.41416, + "19": 0.41418, + "20": 0.41217, + "21": 0.42084, + "22": 0.4131, + "23": 0.41106, + "24": 0.41518, + "25": 0.41106 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index f3a09e92509..8284e160db8 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, "100": 6.96683 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, "100": 42458.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, "100": 1132053504.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, "100": 1864166912.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.92821, - "5": 0.84728, - "10": 0.55604, - "15": 0.56749, - "20": 0.88464, - "25": 0.56066, - "30": 0.56065, - "35": 0.55291, - "40": 0.56895, - "45": 0.55838, - "50": 0.56254, - "55": 0.55721, - "60": 0.55871, - "65": 0.55687, - "70": 0.55579, - "75": 0.55255, - "80": 0.83995, - "85": 0.55623, - "90": 0.56239, - "95": 0.56105, - "100": 0.5538 + "1": 9.73359, + "2": 0.67213, + "3": 0.64227, + "4": 0.63808, + "5": 0.64274, + "6": 0.67444, + "7": 0.656, + "8": 0.64304, + "9": 0.64801, + "10": 0.6494, + "11": 0.64362, + "12": 0.64541, + "13": 0.64198, + "14": 0.64063, + "15": 0.64548, + "16": 0.64104, + "17": 0.64359, + "18": 0.64166, + "19": 0.65505, + "20": 0.73426, + "21": 0.95714, + "22": 0.65, + "23": 0.63689, + "24": 0.6432, + "25": 0.96753, + "26": 1.01279, + "27": 0.6456, + "28": 0.64422, + "29": 0.64535, + "30": 1.02938, + "31": 0.64295, + "32": 0.64549, + "33": 1.10839, + "34": 0.66812, + "35": 0.64537, + "36": 0.64987, + "37": 0.64712, + "38": 0.6499, + "39": 0.64672, + "40": 0.64485, + "41": 0.64456, + "42": 0.64313, + "43": 0.64617, + "44": 0.64605, + "45": 0.64551, + "46": 0.64651, + "47": 0.70467, + "48": 0.67348, + "49": 0.65815, + "50": 0.65354, + "51": 0.64544, + "52": 0.6421, + "53": 0.64328, + "54": 0.64635, + "55": 0.6411, + "56": 0.64965, + "57": 0.64264, + "58": 0.64835, + "59": 0.64574, + "60": 0.64782, + "61": 0.64933, + "62": 0.65052, + "63": 0.64609, + "64": 0.68144, + "65": 0.64542, + "66": 0.64402, + "67": 0.64496, + "68": 0.64484, + "69": 0.64035, + "70": 0.64288, + "71": 0.64575, + "72": 0.69431, + "73": 0.64645, + "74": 0.64787, + "75": 0.65414, + "76": 0.64408, + "77": 0.64637, + "78": 0.64886, + "79": 0.66194, + "80": 0.65332, + "81": 0.65413, + "82": 0.65243, + "83": 0.64364, + "84": 0.64934, + "85": 0.6425, + "86": 0.96767, + "87": 0.92546, + "88": 0.6477, + "89": 0.64523, + "90": 0.64767, + "91": 0.65445, + "92": 0.64953, + "93": 0.65409, + "94": 0.69319, + "95": 0.65121, + "96": 0.64906, + "97": 0.65378, + "98": 0.6511, + "99": 0.65393, + "100": 0.65491 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..4d566ec6c1b --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, + "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, + "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, + "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, + "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, + "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, + "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, + "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, + "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, + "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, + "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, + "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, + "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, + "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, + "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, + "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, + "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, + "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, + "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, + "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, + "100": 6.96683 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, + "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, + "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, + "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, + "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, + "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, + "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, + "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, + "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, + "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, + "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, + "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, + "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, + "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, + "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, + "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, + "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, + "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, + "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, + "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, + "100": 42458.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, + "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, + "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, + "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, + "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, + "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, + "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, + "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, + "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, + "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, + "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, + "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, + "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, + "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, + "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, + "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, + "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, + "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, + "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, + "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, + "100": 1132053504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, + "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, + "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, + "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, + "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, + "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, + "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, + "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, + "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, + "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, + "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, + "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, + "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, + "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, + "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, + "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, + "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, + "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, + "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, + "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, + "100": 1864166912.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.29236, + "2": 0.67893, + "3": 0.58934, + "4": 0.59882, + "5": 0.5783, + "6": 0.57112, + "7": 0.5684, + "8": 0.55955, + "9": 0.5654, + "10": 0.56541, + "11": 0.57111, + "12": 0.57899, + "13": 0.56135, + "14": 0.56951, + "15": 0.56653, + "16": 0.56906, + "17": 0.5749, + "18": 0.56365, + "19": 0.56829, + "20": 0.93294, + "21": 0.56791, + "22": 0.56512, + "23": 0.57032, + "24": 0.56889, + "25": 0.56027, + "26": 0.87556, + "27": 0.56766, + "28": 0.88828, + "29": 0.56306, + "30": 0.56316, + "31": 0.88671, + "32": 1.03162, + "33": 0.90854, + "34": 0.88126, + "35": 0.56957, + "36": 0.56621, + "37": 0.56647, + "38": 0.56957, + "39": 0.56463, + "40": 0.5668, + "41": 0.56277, + "42": 0.58937, + "43": 0.56553, + "44": 0.5682, + "45": 0.56815, + "46": 0.56571, + "47": 0.57199, + "48": 0.57128, + "49": 0.59172, + "50": 0.56455, + "51": 0.56546, + "52": 0.56259, + "53": 0.56063, + "54": 0.56207, + "55": 0.55985, + "56": 0.57542, + "57": 0.56257, + "58": 0.55932, + "59": 0.56051, + "60": 0.56182, + "61": 0.58999, + "62": 0.55986, + "63": 0.56154, + "64": 0.56167, + "65": 0.56072, + "66": 0.57597, + "67": 0.56011, + "68": 0.55956, + "69": 0.56507, + "70": 0.58296, + "71": 0.56017, + "72": 0.56437, + "73": 0.56838, + "74": 0.56548, + "75": 0.57028, + "76": 0.56574, + "77": 0.56397, + "78": 0.56279, + "79": 0.56782, + "80": 0.56585, + "81": 0.56243, + "82": 0.5641, + "83": 0.56477, + "84": 0.5852, + "85": 0.56257, + "86": 0.84754, + "87": 0.56761, + "88": 0.56425, + "89": 0.57197, + "90": 0.85557, + "91": 0.56904, + "92": 0.57069, + "93": 0.56223, + "94": 0.56609, + "95": 0.565, + "96": 0.56747, + "97": 0.56431, + "98": 0.58797, + "99": 0.89814, + "100": 0.5783 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..2400879202c --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, + "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, + "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, + "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, + "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, + "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, + "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, + "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, + "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, + "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, + "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, + "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, + "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, + "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, + "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, + "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, + "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, + "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, + "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, + "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, + "100": 6.96683 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, + "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, + "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, + "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, + "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, + "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, + "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, + "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, + "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, + "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, + "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, + "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, + "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, + "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, + "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, + "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, + "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, + "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, + "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, + "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, + "100": 42458.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, + "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, + "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, + "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, + "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, + "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, + "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, + "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, + "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, + "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, + "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, + "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, + "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, + "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, + "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, + "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, + "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, + "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, + "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, + "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, + "100": 1132053504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, + "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, + "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, + "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, + "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, + "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, + "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, + "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, + "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, + "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, + "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, + "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, + "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, + "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, + "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, + "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, + "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, + "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, + "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, + "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, + "100": 1864166912.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.54009, + "2": 0.66845, + "3": 0.64084, + "4": 0.64526, + "5": 0.64331, + "6": 0.65463, + "7": 0.63991, + "8": 0.63854, + "9": 0.64034, + "10": 0.63886, + "11": 0.63968, + "12": 0.64441, + "13": 0.63828, + "14": 0.64647, + "15": 0.64199, + "16": 0.63783, + "17": 0.64359, + "18": 0.66439, + "19": 0.64718, + "20": 0.63999, + "21": 0.65677, + "22": 0.95191, + "23": 0.64765, + "24": 0.98317, + "25": 1.63221, + "26": 0.64915, + "27": 0.64318, + "28": 0.99238, + "29": 0.64655, + "30": 0.64693, + "31": 0.64241, + "32": 0.98967, + "33": 0.64928, + "34": 0.64294, + "35": 0.65629, + "36": 0.64358, + "37": 0.64814, + "38": 0.64325, + "39": 0.64509, + "40": 0.64733, + "41": 0.64693, + "42": 0.65392, + "43": 0.64721, + "44": 0.64487, + "45": 0.64766, + "46": 0.65872, + "47": 0.65402, + "48": 0.65486, + "49": 0.64433, + "50": 0.64917, + "51": 0.64197, + "52": 0.64647, + "53": 0.64656, + "54": 0.64815, + "55": 0.64573, + "56": 0.6539, + "57": 0.64582, + "58": 0.64668, + "59": 0.64431, + "60": 0.64957, + "61": 0.64703, + "62": 0.64671, + "63": 0.65979, + "64": 0.64599, + "65": 0.6466, + "66": 0.64754, + "67": 0.6471, + "68": 0.64756, + "69": 0.64621, + "70": 0.65906, + "71": 0.64587, + "72": 0.65969, + "73": 0.64476, + "74": 0.65304, + "75": 0.64786, + "76": 0.65077, + "77": 0.66405, + "78": 0.6472, + "79": 0.64431, + "80": 0.64472, + "81": 0.64407, + "82": 0.64326, + "83": 0.93161, + "84": 0.65573, + "85": 0.63999, + "86": 0.64393, + "87": 0.92064, + "88": 0.64399, + "89": 0.64306, + "90": 0.64439, + "91": 0.6414, + "92": 0.64504, + "93": 0.64858, + "94": 0.64041, + "95": 0.64497, + "96": 0.64493, + "97": 0.64508, + "98": 0.6444, + "99": 0.64587, + "100": 0.64886 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 81031669a61..899d650d38b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, "100": 6.96683 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, "100": 42458.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, "100": 1132053504.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, "100": 1864166912.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 8.71448, - "5": 0.56781, - "10": 0.56843, - "15": 0.57548, - "20": 0.88447, - "25": 0.87922, - "30": 0.58734, - "35": 0.58492, - "40": 0.57893, - "45": 0.58782, - "50": 0.57316, - "55": 0.5549, - "60": 0.55728, - "65": 0.55905, - "70": 0.5662, - "75": 0.56127, - "80": 0.55317, - "85": 0.5553, - "90": 0.55754, - "95": 0.5596, - "100": 0.91445 + "1": 9.67922, + "2": 0.68152, + "3": 0.65295, + "4": 0.64618, + "5": 0.65142, + "6": 0.64889, + "7": 0.65383, + "8": 0.6456, + "9": 0.66119, + "10": 0.65998, + "11": 0.6579, + "12": 0.65779, + "13": 0.6603, + "14": 0.65806, + "15": 1.0135, + "16": 0.65488, + "17": 0.931, + "18": 1.08662, + "19": 0.66372, + "20": 0.66034, + "21": 0.65544, + "22": 0.66308, + "23": 0.66077, + "24": 1.04108, + "25": 0.6666, + "26": 0.97428, + "27": 0.65856, + "28": 0.66326, + "29": 0.65747, + "30": 0.6582, + "31": 1.10061, + "32": 1.04733, + "33": 0.65682, + "34": 0.65788, + "35": 0.66349, + "36": 0.65804, + "37": 0.66396, + "38": 0.65876, + "39": 0.65606, + "40": 0.6586, + "41": 0.65742, + "42": 0.66367, + "43": 0.66411, + "44": 0.65879, + "45": 0.66227, + "46": 0.66361, + "47": 0.66004, + "48": 0.6614, + "49": 0.65707, + "50": 0.65748, + "51": 0.66048, + "52": 0.65517, + "53": 0.65236, + "54": 0.6505, + "55": 0.65061, + "56": 0.65419, + "57": 0.64612, + "58": 0.6508, + "59": 0.64828, + "60": 0.64805, + "61": 0.99903, + "62": 0.6529, + "63": 0.65264, + "64": 0.64941, + "65": 0.65259, + "66": 0.64896, + "67": 0.64907, + "68": 0.65692, + "69": 0.64922, + "70": 0.65143, + "71": 0.64786, + "72": 0.6595, + "73": 0.65025, + "74": 0.64993, + "75": 0.64539, + "76": 0.65147, + "77": 0.65111, + "78": 0.64894, + "79": 0.65192, + "80": 0.94887, + "81": 0.64772, + "82": 0.64406, + "83": 0.64869, + "84": 0.95425, + "85": 0.64926, + "86": 0.64526, + "87": 0.64401, + "88": 0.95609, + "89": 0.64807, + "90": 0.64544, + "91": 0.9603, + "92": 0.64218, + "93": 0.64853, + "94": 0.64394, + "95": 1.01268, + "96": 1.05755, + "97": 0.65312, + "98": 0.65341, + "99": 0.65751, + "100": 0.64782 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..47d23248800 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, + "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, + "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, + "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, + "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, + "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, + "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, + "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, + "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, + "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, + "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, + "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, + "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, + "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, + "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, + "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, + "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, + "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, + "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, + "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, + "100": 6.96683 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, + "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, + "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, + "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, + "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, + "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, + "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, + "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, + "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, + "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, + "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, + "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, + "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, + "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, + "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, + "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, + "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, + "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, + "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, + "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, + "100": 42458.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, + "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, + "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, + "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, + "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, + "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, + "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, + "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, + "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, + "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, + "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, + "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, + "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, + "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, + "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, + "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, + "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, + "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, + "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, + "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, + "100": 1132053504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, + "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, + "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, + "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, + "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, + "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, + "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, + "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, + "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, + "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, + "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, + "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, + "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, + "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, + "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, + "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, + "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, + "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, + "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, + "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, + "100": 1864166912.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.0714, + "2": 0.65344, + "3": 0.61776, + "4": 0.55941, + "5": 0.56517, + "6": 0.55953, + "7": 0.56488, + "8": 0.56168, + "9": 0.55963, + "10": 0.56502, + "11": 0.56812, + "12": 0.58499, + "13": 0.58777, + "14": 0.56659, + "15": 0.55908, + "16": 0.56702, + "17": 0.56652, + "18": 0.56368, + "19": 0.57588, + "20": 0.57328, + "21": 0.57961, + "22": 0.56693, + "23": 0.87697, + "24": 0.56276, + "25": 0.56409, + "26": 0.89777, + "27": 0.89041, + "28": 0.56631, + "29": 0.5637, + "30": 0.56457, + "31": 0.56285, + "32": 0.56729, + "33": 1.2087, + "34": 1.26391, + "35": 0.57364, + "36": 0.56616, + "37": 0.56143, + "38": 0.56332, + "39": 0.56267, + "40": 0.56706, + "41": 0.56887, + "42": 0.5604, + "43": 0.56419, + "44": 0.55389, + "45": 0.55665, + "46": 0.56256, + "47": 0.5757, + "48": 0.62949, + "49": 0.55714, + "50": 0.55326, + "51": 0.56303, + "52": 0.56765, + "53": 0.56019, + "54": 0.56447, + "55": 0.56674, + "56": 0.55563, + "57": 0.55623, + "58": 0.55651, + "59": 0.55616, + "60": 0.55374, + "61": 0.55657, + "62": 0.55473, + "63": 0.56052, + "64": 0.55785, + "65": 0.55653, + "66": 0.56406, + "67": 0.56415, + "68": 0.56582, + "69": 0.55566, + "70": 0.555, + "71": 0.55709, + "72": 0.56314, + "73": 0.55571, + "74": 0.55495, + "75": 0.56028, + "76": 0.88389, + "77": 0.56277, + "78": 0.56491, + "79": 0.57616, + "80": 0.58894, + "81": 0.56216, + "82": 0.56187, + "83": 0.56108, + "84": 0.56853, + "85": 0.55814, + "86": 0.56093, + "87": 0.56078, + "88": 0.913, + "89": 0.55681, + "90": 0.55754, + "91": 0.56679, + "92": 0.55927, + "93": 0.89203, + "94": 0.56272, + "95": 0.55822, + "96": 0.56068, + "97": 0.91075, + "98": 0.56624, + "99": 0.92145, + "100": 0.88359 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..11ef3fbd8c5 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34904, + "2": 10.34488, + "3": 9.79407, + "4": 9.59568, + "5": 9.42065, + "6": 9.41856, + "7": 9.28073, + "8": 9.18973, + "9": 9.06584, + "10": 9.00206, + "11": 8.81497, + "12": 8.78107, + "13": 8.82506, + "14": 8.6728, + "15": 8.6368, + "16": 8.51926, + "17": 8.45732, + "18": 8.37037, + "19": 8.36068, + "20": 8.25456, + "21": 8.24268, + "22": 8.13404, + "23": 8.06818, + "24": 8.11464, + "25": 7.95146, + "26": 8.08186, + "27": 7.86814, + "28": 7.94027, + "29": 7.77604, + "30": 7.84595, + "31": 7.81568, + "32": 7.65964, + "33": 7.77905, + "34": 7.53277, + "35": 7.6586, + "36": 7.51541, + "37": 7.44748, + "38": 7.4824, + "39": 7.46523, + "40": 7.49146, + "41": 7.40822, + "42": 7.35649, + "43": 7.43806, + "44": 7.35517, + "45": 7.35103, + "46": 7.27859, + "47": 7.44152, + "48": 7.2683, + "49": 7.32389, + "50": 7.14549, + "51": 7.36541, + "52": 7.12192, + "53": 7.09189, + "54": 7.22759, + "55": 7.13584, + "56": 7.20822, + "57": 7.31316, + "58": 6.99088, + "59": 7.09934, + "60": 7.12683, + "61": 7.1014, + "62": 7.23954, + "63": 7.14417, + "64": 7.06836, + "65": 6.98412, + "66": 7.03768, + "67": 7.02847, + "68": 7.1299, + "69": 7.01456, + "70": 7.04997, + "71": 6.89408, + "72": 6.98553, + "73": 6.96694, + "74": 6.90297, + "75": 7.0574, + "76": 6.9581, + "77": 7.06903, + "78": 7.02133, + "79": 6.8504, + "80": 6.91935, + "81": 6.95874, + "82": 7.04745, + "83": 6.98522, + "84": 6.99712, + "85": 6.83565, + "86": 7.04156, + "87": 6.96476, + "88": 6.89883, + "89": 6.80051, + "90": 7.22593, + "91": 6.70562, + "92": 7.0381, + "93": 6.88685, + "94": 7.03908, + "95": 6.84815, + "96": 6.95281, + "97": 6.94344, + "98": 6.86987, + "99": 6.99502, + "100": 6.96683 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43274.0, + "2": 44071.0, + "3": 44760.0, + "4": 42385.0, + "5": 45378.0, + "6": 40938.0, + "7": 43150.0, + "8": 45450.0, + "9": 42428.0, + "10": 45373.0, + "11": 43974.0, + "12": 44591.0, + "13": 43897.0, + "14": 46204.0, + "15": 43924.0, + "16": 41613.0, + "17": 43852.0, + "18": 44669.0, + "19": 42579.0, + "20": 44769.0, + "21": 44761.0, + "22": 41873.0, + "23": 45441.0, + "24": 43081.0, + "25": 42452.0, + "26": 43947.0, + "27": 46247.0, + "28": 46419.0, + "29": 46169.0, + "30": 44035.0, + "31": 41152.0, + "32": 43347.0, + "33": 45435.0, + "34": 43300.0, + "35": 43284.0, + "36": 42483.0, + "37": 40070.0, + "38": 42561.0, + "39": 44706.0, + "40": 43260.0, + "41": 44642.0, + "42": 43192.0, + "43": 45439.0, + "44": 44588.0, + "45": 43274.0, + "46": 43921.0, + "47": 42364.0, + "48": 44740.0, + "49": 43152.0, + "50": 43348.0, + "51": 41112.0, + "52": 43837.0, + "53": 43913.0, + "54": 41704.0, + "55": 43870.0, + "56": 43209.0, + "57": 42636.0, + "58": 43841.0, + "59": 44630.0, + "60": 41219.0, + "61": 39702.0, + "62": 44739.0, + "63": 44651.0, + "64": 45372.0, + "65": 44682.0, + "66": 45351.0, + "67": 43174.0, + "68": 42502.0, + "69": 43834.0, + "70": 45514.0, + "71": 43291.0, + "72": 44767.0, + "73": 45384.0, + "74": 42457.0, + "75": 44673.0, + "76": 43876.0, + "77": 42026.0, + "78": 40350.0, + "79": 38918.0, + "80": 41092.0, + "81": 45364.0, + "82": 43198.0, + "83": 38467.0, + "84": 42477.0, + "85": 43981.0, + "86": 45667.0, + "87": 40863.0, + "88": 41772.0, + "89": 41104.0, + "90": 44669.0, + "91": 46134.0, + "92": 41634.0, + "93": 43241.0, + "94": 39538.0, + "95": 43915.0, + "96": 44683.0, + "97": 45405.0, + "98": 41791.0, + "99": 45414.0, + "100": 42458.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1132053504.0, + "2": 1132053504.0, + "3": 1132053504.0, + "4": 1132053504.0, + "5": 1132053504.0, + "6": 1132053504.0, + "7": 1132053504.0, + "8": 1132053504.0, + "9": 1132053504.0, + "10": 1132053504.0, + "11": 1132053504.0, + "12": 1132053504.0, + "13": 1132053504.0, + "14": 1132053504.0, + "15": 1132053504.0, + "16": 1132053504.0, + "17": 1132053504.0, + "18": 1132053504.0, + "19": 1132053504.0, + "20": 1132053504.0, + "21": 1132053504.0, + "22": 1132053504.0, + "23": 1132053504.0, + "24": 1132053504.0, + "25": 1132053504.0, + "26": 1132053504.0, + "27": 1132053504.0, + "28": 1132053504.0, + "29": 1132053504.0, + "30": 1132053504.0, + "31": 1132053504.0, + "32": 1132053504.0, + "33": 1132053504.0, + "34": 1132053504.0, + "35": 1132053504.0, + "36": 1132053504.0, + "37": 1132053504.0, + "38": 1132053504.0, + "39": 1132053504.0, + "40": 1132053504.0, + "41": 1132053504.0, + "42": 1132053504.0, + "43": 1132053504.0, + "44": 1132053504.0, + "45": 1132053504.0, + "46": 1132053504.0, + "47": 1132053504.0, + "48": 1132053504.0, + "49": 1132053504.0, + "50": 1132053504.0, + "51": 1132053504.0, + "52": 1132053504.0, + "53": 1132053504.0, + "54": 1132053504.0, + "55": 1132053504.0, + "56": 1132053504.0, + "57": 1132053504.0, + "58": 1132053504.0, + "59": 1132053504.0, + "60": 1132053504.0, + "61": 1132053504.0, + "62": 1132053504.0, + "63": 1132053504.0, + "64": 1132053504.0, + "65": 1132053504.0, + "66": 1132053504.0, + "67": 1132053504.0, + "68": 1132053504.0, + "69": 1132053504.0, + "70": 1132053504.0, + "71": 1132053504.0, + "72": 1132053504.0, + "73": 1132053504.0, + "74": 1132053504.0, + "75": 1132053504.0, + "76": 1132053504.0, + "77": 1132053504.0, + "78": 1132053504.0, + "79": 1132053504.0, + "80": 1132053504.0, + "81": 1132053504.0, + "82": 1132053504.0, + "83": 1132053504.0, + "84": 1132053504.0, + "85": 1132053504.0, + "86": 1132053504.0, + "87": 1132053504.0, + "88": 1132053504.0, + "89": 1132053504.0, + "90": 1132053504.0, + "91": 1132053504.0, + "92": 1132053504.0, + "93": 1132053504.0, + "94": 1132053504.0, + "95": 1132053504.0, + "96": 1132053504.0, + "97": 1132053504.0, + "98": 1132053504.0, + "99": 1132053504.0, + "100": 1132053504.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1409266176.0, + "2": 1864166912.0, + "3": 1864166912.0, + "4": 1864166912.0, + "5": 1864166912.0, + "6": 1864166912.0, + "7": 1864166912.0, + "8": 1864166912.0, + "9": 1864166912.0, + "10": 1864166912.0, + "11": 1864166912.0, + "12": 1864166912.0, + "13": 1864166912.0, + "14": 1864166912.0, + "15": 1864166912.0, + "16": 1864166912.0, + "17": 1864166912.0, + "18": 1864166912.0, + "19": 1864166912.0, + "20": 1864166912.0, + "21": 1864166912.0, + "22": 1864166912.0, + "23": 1864166912.0, + "24": 1864166912.0, + "25": 1864166912.0, + "26": 1864166912.0, + "27": 1864166912.0, + "28": 1864166912.0, + "29": 1864166912.0, + "30": 1864166912.0, + "31": 1864166912.0, + "32": 1864166912.0, + "33": 1864166912.0, + "34": 1864166912.0, + "35": 1864166912.0, + "36": 1864166912.0, + "37": 1864166912.0, + "38": 1864166912.0, + "39": 1864166912.0, + "40": 1864166912.0, + "41": 1864166912.0, + "42": 1864166912.0, + "43": 1864166912.0, + "44": 1864166912.0, + "45": 1864166912.0, + "46": 1864166912.0, + "47": 1864166912.0, + "48": 1864166912.0, + "49": 1864166912.0, + "50": 1864166912.0, + "51": 1864166912.0, + "52": 1864166912.0, + "53": 1864166912.0, + "54": 1864166912.0, + "55": 1864166912.0, + "56": 1864166912.0, + "57": 1864166912.0, + "58": 1864166912.0, + "59": 1864166912.0, + "60": 1864166912.0, + "61": 1864166912.0, + "62": 1864166912.0, + "63": 1864166912.0, + "64": 1864166912.0, + "65": 1864166912.0, + "66": 1864166912.0, + "67": 1864166912.0, + "68": 1864166912.0, + "69": 1864166912.0, + "70": 1864166912.0, + "71": 1864166912.0, + "72": 1864166912.0, + "73": 1864166912.0, + "74": 1864166912.0, + "75": 1864166912.0, + "76": 1864166912.0, + "77": 1864166912.0, + "78": 1864166912.0, + "79": 1864166912.0, + "80": 1864166912.0, + "81": 1864166912.0, + "82": 1864166912.0, + "83": 1864166912.0, + "84": 1864166912.0, + "85": 1864166912.0, + "86": 1864166912.0, + "87": 1864166912.0, + "88": 1864166912.0, + "89": 1864166912.0, + "90": 1864166912.0, + "91": 1864166912.0, + "92": 1864166912.0, + "93": 1864166912.0, + "94": 1864166912.0, + "95": 1864166912.0, + "96": 1864166912.0, + "97": 1864166912.0, + "98": 1864166912.0, + "99": 1864166912.0, + "100": 1864166912.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.74091, + "2": 0.66943, + "3": 0.64954, + "4": 0.64695, + "5": 0.65419, + "6": 0.6513, + "7": 0.64556, + "8": 0.6385, + "9": 0.64307, + "10": 0.63679, + "11": 0.64386, + "12": 0.64012, + "13": 0.63889, + "14": 0.63958, + "15": 0.64024, + "16": 0.63721, + "17": 0.6492, + "18": 0.65247, + "19": 0.64523, + "20": 1.0041, + "21": 0.64739, + "22": 1.02158, + "23": 0.96313, + "24": 0.64631, + "25": 0.64337, + "26": 0.64702, + "27": 0.64516, + "28": 0.64748, + "29": 0.64657, + "30": 0.95958, + "31": 1.05772, + "32": 0.64319, + "33": 0.64455, + "34": 0.64044, + "35": 0.6445, + "36": 0.64649, + "37": 0.64593, + "38": 0.64912, + "39": 0.64665, + "40": 0.64585, + "41": 0.64603, + "42": 0.64765, + "43": 0.64548, + "44": 0.64732, + "45": 0.64996, + "46": 0.65909, + "47": 0.66335, + "48": 0.64625, + "49": 0.64641, + "50": 0.64822, + "51": 0.65982, + "52": 0.64882, + "53": 0.64892, + "54": 0.64636, + "55": 0.64591, + "56": 0.65232, + "57": 0.64591, + "58": 0.64572, + "59": 0.64949, + "60": 0.64277, + "61": 0.64766, + "62": 0.64726, + "63": 0.64637, + "64": 0.64901, + "65": 0.6476, + "66": 0.64458, + "67": 0.64951, + "68": 0.64438, + "69": 0.64854, + "70": 0.65268, + "71": 0.64762, + "72": 1.02587, + "73": 0.65274, + "74": 0.65942, + "75": 0.65091, + "76": 0.65181, + "77": 0.65582, + "78": 0.64434, + "79": 0.65116, + "80": 0.65073, + "81": 0.64645, + "82": 0.65405, + "83": 0.65107, + "84": 0.64883, + "85": 0.94272, + "86": 0.65641, + "87": 0.99204, + "88": 0.96199, + "89": 0.64856, + "90": 0.65165, + "91": 0.65163, + "92": 0.6506, + "93": 0.64828, + "94": 0.64682, + "95": 1.01586, + "96": 1.04151, + "97": 0.65481, + "98": 0.64703, + "99": 0.64964, + "100": 0.65343 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index fca23f6593f..702c35ca9af 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, "100": 6.96761 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, "100": 42457.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, "100": 1104069120.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, "100": 1833295360.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.42985, - "5": 0.45373, - "10": 0.45713, - "15": 0.47883, - "20": 0.47411, - "25": 0.4628, - "30": 0.47727, - "35": 0.46474, - "40": 0.46129, - "45": 0.49682, - "50": 0.47506, - "55": 0.47981, - "60": 0.47061, - "65": 0.46638, - "70": 0.46506, - "75": 0.47547, - "80": 0.46762, - "85": 0.47281, - "90": 0.46137, - "95": 0.47198, - "100": 0.46836 + "1": 9.42728, + "2": 0.63617, + "3": 0.52215, + "4": 0.51838, + "5": 0.5248, + "6": 0.52221, + "7": 0.53157, + "8": 0.52268, + "9": 0.51794, + "10": 0.52148, + "11": 0.51655, + "12": 0.52503, + "13": 0.5178, + "14": 0.52926, + "15": 0.52639, + "16": 0.53361, + "17": 0.52309, + "18": 0.52324, + "19": 0.51834, + "20": 0.54965, + "21": 0.5586, + "22": 0.53836, + "23": 0.5225, + "24": 0.51851, + "25": 0.5199, + "26": 0.51853, + "27": 0.51882, + "28": 0.52551, + "29": 0.52254, + "30": 0.5192, + "31": 0.52201, + "32": 0.521, + "33": 0.52114, + "34": 0.51459, + "35": 0.52645, + "36": 0.51875, + "37": 0.5214, + "38": 0.52019, + "39": 0.54698, + "40": 0.54492, + "41": 0.51667, + "42": 0.52631, + "43": 0.52495, + "44": 0.52655, + "45": 0.52461, + "46": 0.53027, + "47": 0.5196, + "48": 0.52577, + "49": 0.51681, + "50": 0.53016, + "51": 0.51782, + "52": 0.52245, + "53": 0.51733, + "54": 0.523, + "55": 0.51904, + "56": 0.53679, + "57": 0.52102, + "58": 0.55143, + "59": 0.55915, + "60": 0.5493, + "61": 0.525, + "62": 0.52356, + "63": 0.53373, + "64": 0.81727, + "65": 0.52459, + "66": 0.79536, + "67": 0.52103, + "68": 0.5317, + "69": 0.52528, + "70": 0.78794, + "71": 0.53084, + "72": 0.51933, + "73": 0.53233, + "74": 0.52693, + "75": 0.53508, + "76": 0.56134, + "77": 0.53435, + "78": 0.51717, + "79": 0.52701, + "80": 0.52068, + "81": 0.52531, + "82": 0.5217, + "83": 0.52326, + "84": 0.52412, + "85": 0.84182, + "86": 0.52908, + "87": 0.51925, + "88": 0.52315, + "89": 0.52102, + "90": 0.52827, + "91": 0.54314, + "92": 0.52504, + "93": 0.52556, + "94": 0.8296, + "95": 0.83995, + "96": 0.85045, + "97": 0.78149, + "98": 0.54296, + "99": 0.5427, + "100": 0.55085 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..9abfa38cf9f --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, + "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, + "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, + "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, + "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, + "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, + "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, + "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, + "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, + "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, + "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, + "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, + "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, + "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, + "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, + "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, + "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, + "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, + "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, + "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, + "100": 6.96761 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, + "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, + "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, + "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, + "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, + "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, + "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, + "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, + "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, + "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, + "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, + "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, + "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, + "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, + "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, + "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, + "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, + "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, + "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, + "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, + "100": 42457.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, + "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, + "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, + "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, + "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, + "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, + "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, + "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, + "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, + "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, + "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, + "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, + "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, + "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, + "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, + "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, + "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, + "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, + "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, + "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, + "100": 1104069120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, + "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, + "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, + "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, + "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, + "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, + "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, + "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, + "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, + "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, + "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, + "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, + "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, + "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, + "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, + "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, + "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, + "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, + "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, + "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, + "100": 1833295360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.97888, + "2": 0.55212, + "3": 0.46939, + "4": 0.48338, + "5": 0.4977, + "6": 0.48497, + "7": 0.48521, + "8": 0.48365, + "9": 0.47845, + "10": 0.48441, + "11": 0.48622, + "12": 0.49049, + "13": 0.49384, + "14": 0.48918, + "15": 0.48451, + "16": 0.49344, + "17": 0.49291, + "18": 0.49613, + "19": 0.49898, + "20": 0.49079, + "21": 0.48153, + "22": 0.48369, + "23": 0.4824, + "24": 0.4958, + "25": 0.48572, + "26": 0.50758, + "27": 0.48722, + "28": 0.47977, + "29": 0.5598, + "30": 0.47951, + "31": 1.06254, + "32": 0.7493, + "33": 1.59176, + "34": 0.85052, + "35": 2.25233, + "36": 1.66198, + "37": 0.68722, + "38": 0.4632, + "39": 0.46558, + "40": 0.52308, + "41": 0.47497, + "42": 0.46579, + "43": 0.46956, + "44": 0.46788, + "45": 0.47342, + "46": 0.53067, + "47": 0.48889, + "48": 0.47648, + "49": 0.47372, + "50": 0.46927, + "51": 0.46862, + "52": 0.47754, + "53": 0.47724, + "54": 0.47513, + "55": 0.46395, + "56": 0.46587, + "57": 0.78252, + "58": 0.46515, + "59": 0.46114, + "60": 0.46011, + "61": 0.45394, + "62": 0.45518, + "63": 0.48166, + "64": 0.47197, + "65": 0.97766, + "66": 0.45863, + "67": 0.45331, + "68": 0.45132, + "69": 0.4828, + "70": 0.45508, + "71": 0.45601, + "72": 1.14428, + "73": 0.45179, + "74": 0.4534, + "75": 0.46049, + "76": 0.46918, + "77": 0.45685, + "78": 0.45627, + "79": 0.46018, + "80": 0.46056, + "81": 0.46543, + "82": 0.45359, + "83": 0.78935, + "84": 0.46472, + "85": 0.45517, + "86": 0.46043, + "87": 0.45426, + "88": 0.45214, + "89": 0.45913, + "90": 0.45237, + "91": 0.46312, + "92": 0.79955, + "93": 0.45537, + "94": 0.45217, + "95": 0.45359, + "96": 0.45058, + "97": 0.45281, + "98": 0.46149, + "99": 0.45894, + "100": 0.46912 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..2e0ee7ee230 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, + "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, + "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, + "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, + "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, + "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, + "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, + "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, + "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, + "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, + "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, + "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, + "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, + "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, + "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, + "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, + "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, + "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, + "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, + "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, + "100": 6.96761 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, + "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, + "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, + "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, + "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, + "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, + "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, + "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, + "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, + "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, + "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, + "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, + "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, + "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, + "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, + "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, + "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, + "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, + "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, + "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, + "100": 42457.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, + "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, + "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, + "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, + "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, + "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, + "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, + "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, + "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, + "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, + "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, + "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, + "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, + "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, + "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, + "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, + "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, + "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, + "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, + "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, + "100": 1104069120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, + "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, + "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, + "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, + "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, + "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, + "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, + "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, + "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, + "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, + "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, + "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, + "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, + "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, + "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, + "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, + "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, + "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, + "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, + "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, + "100": 1833295360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.3446, + "2": 0.55186, + "3": 0.52074, + "4": 0.52226, + "5": 0.51961, + "6": 0.52672, + "7": 0.52451, + "8": 0.52369, + "9": 0.54507, + "10": 0.53931, + "11": 0.55505, + "12": 0.52851, + "13": 0.51692, + "14": 0.52026, + "15": 0.51979, + "16": 0.53317, + "17": 0.52489, + "18": 0.59625, + "19": 0.52238, + "20": 0.53197, + "21": 0.52211, + "22": 0.51979, + "23": 0.52551, + "24": 0.52413, + "25": 0.52676, + "26": 0.5192, + "27": 0.52336, + "28": 0.53671, + "29": 0.53561, + "30": 0.51609, + "31": 0.55983, + "32": 0.5166, + "33": 0.53721, + "34": 0.52158, + "35": 0.53727, + "36": 0.5279, + "37": 0.51655, + "38": 0.51986, + "39": 0.5223, + "40": 0.52388, + "41": 0.52083, + "42": 0.52801, + "43": 0.52136, + "44": 0.52414, + "45": 0.52048, + "46": 0.53415, + "47": 0.54831, + "48": 0.58827, + "49": 0.55044, + "50": 0.52682, + "51": 0.52339, + "52": 0.51726, + "53": 0.518, + "54": 0.51935, + "55": 0.52073, + "56": 0.52732, + "57": 0.51867, + "58": 0.51876, + "59": 0.5213, + "60": 0.51779, + "61": 0.52225, + "62": 0.52041, + "63": 0.51793, + "64": 0.5135, + "65": 0.51913, + "66": 0.86034, + "67": 0.51468, + "68": 0.90156, + "69": 0.51931, + "70": 0.53602, + "71": 0.51818, + "72": 0.51744, + "73": 0.54454, + "74": 0.51831, + "75": 0.521, + "76": 0.52894, + "77": 0.53227, + "78": 0.51806, + "79": 0.51818, + "80": 0.51632, + "81": 0.51704, + "82": 0.51542, + "83": 0.51861, + "84": 0.53204, + "85": 0.52011, + "86": 0.53043, + "87": 0.94359, + "88": 0.51776, + "89": 0.51799, + "90": 0.51773, + "91": 0.51828, + "92": 0.52318, + "93": 0.51688, + "94": 0.51939, + "95": 0.51554, + "96": 0.9, + "97": 0.96079, + "98": 0.52856, + "99": 0.51996, + "100": 0.52921 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json index 49f18d73ef1..791f5758ea5 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, "100": 6.96761 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, "100": 42457.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, "100": 1104069120.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, "100": 1833295360.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 8.34115, - "5": 0.45893, - "10": 0.45098, - "15": 0.46238, - "20": 0.44885, - "25": 0.4602, - "30": 0.44717, - "35": 0.45167, - "40": 0.46266, - "45": 0.44352, - "50": 0.78806, - "55": 0.46254, - "60": 0.45899, - "65": 0.47177, - "70": 0.44807, - "75": 0.44966, - "80": 0.44473, - "85": 0.45029, - "90": 0.48553, - "95": 0.4471, - "100": 0.46649 + "1": 9.43749, + "2": 0.56177, + "3": 0.54092, + "4": 0.53069, + "5": 0.54015, + "6": 0.52654, + "7": 0.52537, + "8": 0.529, + "9": 0.52024, + "10": 0.54001, + "11": 0.52228, + "12": 0.52764, + "13": 0.52112, + "14": 0.52842, + "15": 0.53159, + "16": 0.52768, + "17": 0.53602, + "18": 0.52711, + "19": 0.5217, + "20": 0.53787, + "21": 0.52947, + "22": 0.52812, + "23": 0.522, + "24": 0.525, + "25": 0.5262, + "26": 0.5262, + "27": 0.52831, + "28": 0.5236, + "29": 0.54456, + "30": 0.51906, + "31": 0.52674, + "32": 0.52164, + "33": 0.5315, + "34": 0.52077, + "35": 0.53196, + "36": 0.52142, + "37": 0.52841, + "38": 0.52733, + "39": 0.52595, + "40": 0.52329, + "41": 0.52463, + "42": 0.52373, + "43": 0.5242, + "44": 0.53002, + "45": 0.52375, + "46": 0.52927, + "47": 0.52485, + "48": 0.54174, + "49": 0.52535, + "50": 0.52504, + "51": 0.53766, + "52": 0.52768, + "53": 0.52759, + "54": 0.52754, + "55": 0.53938, + "56": 0.53362, + "57": 0.53077, + "58": 0.52676, + "59": 0.53132, + "60": 0.52333, + "61": 0.52796, + "62": 0.53758, + "63": 0.53371, + "64": 0.52937, + "65": 0.53002, + "66": 0.53001, + "67": 0.52768, + "68": 0.52999, + "69": 0.52873, + "70": 0.54329, + "71": 0.52577, + "72": 0.53281, + "73": 0.52373, + "74": 0.53896, + "75": 0.53536, + "76": 0.52444, + "77": 0.53551, + "78": 0.55804, + "79": 0.55697, + "80": 0.53175, + "81": 0.53929, + "82": 0.52759, + "83": 0.53135, + "84": 0.53043, + "85": 0.53678, + "86": 0.58197, + "87": 0.54322, + "88": 0.52771, + "89": 0.88532, + "90": 0.5352, + "91": 0.5432, + "92": 0.53256, + "93": 0.53, + "94": 0.53231, + "95": 0.53588, + "96": 0.5246, + "97": 0.53401, + "98": 0.53042, + "99": 0.53172, + "100": 0.52281 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7f620001acb --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, + "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, + "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, + "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, + "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, + "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, + "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, + "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, + "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, + "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, + "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, + "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, + "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, + "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, + "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, + "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, + "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, + "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, + "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, + "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, + "100": 6.96761 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, + "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, + "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, + "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, + "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, + "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, + "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, + "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, + "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, + "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, + "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, + "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, + "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, + "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, + "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, + "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, + "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, + "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, + "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, + "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, + "100": 42457.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, + "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, + "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, + "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, + "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, + "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, + "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, + "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, + "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, + "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, + "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, + "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, + "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, + "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, + "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, + "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, + "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, + "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, + "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, + "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, + "100": 1104069120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, + "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, + "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, + "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, + "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, + "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, + "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, + "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, + "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, + "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, + "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, + "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, + "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, + "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, + "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, + "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, + "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, + "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, + "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, + "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, + "100": 1833295360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.41131, + "2": 0.5911, + "3": 0.46668, + "4": 0.46572, + "5": 0.48182, + "6": 0.47419, + "7": 0.45962, + "8": 0.46076, + "9": 0.46022, + "10": 0.46056, + "11": 0.45992, + "12": 0.46724, + "13": 0.46712, + "14": 0.46827, + "15": 0.4727, + "16": 0.49253, + "17": 0.47082, + "18": 0.47424, + "19": 0.46849, + "20": 0.45979, + "21": 0.47104, + "22": 0.46485, + "23": 0.46326, + "24": 0.47218, + "25": 0.46353, + "26": 0.46063, + "27": 0.45609, + "28": 0.4748, + "29": 0.45917, + "30": 0.46344, + "31": 0.45858, + "32": 0.46504, + "33": 0.46109, + "34": 0.46003, + "35": 0.46415, + "36": 0.466, + "37": 0.46298, + "38": 0.46081, + "39": 0.46051, + "40": 0.46065, + "41": 0.46838, + "42": 0.49321, + "43": 0.47091, + "44": 0.46781, + "45": 0.45909, + "46": 0.4623, + "47": 0.46684, + "48": 0.46817, + "49": 0.47488, + "50": 0.46159, + "51": 0.4696, + "52": 0.46902, + "53": 0.46394, + "54": 0.46398, + "55": 0.48419, + "56": 0.48174, + "57": 0.46979, + "58": 0.46441, + "59": 0.46756, + "60": 0.45954, + "61": 0.46551, + "62": 0.46355, + "63": 0.4631, + "64": 0.46313, + "65": 0.47693, + "66": 0.46943, + "67": 0.45954, + "68": 0.46555, + "69": 0.46002, + "70": 0.47351, + "71": 0.46163, + "72": 0.46815, + "73": 0.46171, + "74": 0.46772, + "75": 0.75351, + "76": 0.46342, + "77": 0.47886, + "78": 0.47771, + "79": 0.47646, + "80": 0.47943, + "81": 0.47905, + "82": 0.47, + "83": 0.46092, + "84": 1.47835, + "85": 0.47794, + "86": 0.97054, + "87": 3.1063, + "88": 0.466, + "89": 1.9497, + "90": 0.4647, + "91": 0.47038, + "92": 0.46503, + "93": 0.47547, + "94": 0.48315, + "95": 0.48851, + "96": 0.50856, + "97": 0.49788, + "98": 0.48078, + "99": 0.5127, + "100": 0.46344 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..b9a799c779f --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34897, + "2": 10.34482, + "3": 9.79428, + "4": 9.59585, + "5": 9.42074, + "6": 9.41847, + "7": 9.28062, + "8": 9.18972, + "9": 9.06519, + "10": 9.00183, + "11": 8.81475, + "12": 8.7808, + "13": 8.82493, + "14": 8.67261, + "15": 8.6364, + "16": 8.51896, + "17": 8.45704, + "18": 8.37007, + "19": 8.36039, + "20": 8.25417, + "21": 8.2421, + "22": 8.13324, + "23": 8.06764, + "24": 8.1142, + "25": 7.95082, + "26": 8.08156, + "27": 7.86764, + "28": 7.93993, + "29": 7.77566, + "30": 7.84559, + "31": 7.8152, + "32": 7.65941, + "33": 7.77856, + "34": 7.53188, + "35": 7.65804, + "36": 7.51464, + "37": 7.44686, + "38": 7.48161, + "39": 7.46435, + "40": 7.49084, + "41": 7.40827, + "42": 7.35625, + "43": 7.43764, + "44": 7.35439, + "45": 7.35042, + "46": 7.27853, + "47": 7.4405, + "48": 7.26763, + "49": 7.32341, + "50": 7.14486, + "51": 7.36469, + "52": 7.12044, + "53": 7.09167, + "54": 7.22712, + "55": 7.13495, + "56": 7.20751, + "57": 7.31287, + "58": 6.99063, + "59": 7.09849, + "60": 7.12665, + "61": 7.10047, + "62": 7.23974, + "63": 7.14358, + "64": 7.06717, + "65": 6.98408, + "66": 7.03692, + "67": 7.02875, + "68": 7.12914, + "69": 7.01425, + "70": 7.04954, + "71": 6.89312, + "72": 6.98513, + "73": 6.96734, + "74": 6.90236, + "75": 7.05611, + "76": 6.95986, + "77": 7.06862, + "78": 7.0204, + "79": 6.8505, + "80": 6.92019, + "81": 6.95982, + "82": 7.04575, + "83": 6.98617, + "84": 6.99991, + "85": 6.83511, + "86": 7.04087, + "87": 6.96604, + "88": 6.90125, + "89": 6.80345, + "90": 7.22384, + "91": 6.70505, + "92": 7.03979, + "93": 6.8857, + "94": 7.04044, + "95": 6.84746, + "96": 6.9546, + "97": 6.94425, + "98": 6.86865, + "99": 6.9948, + "100": 6.96761 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43289.0, + "2": 44062.0, + "3": 44747.0, + "4": 42377.0, + "5": 45372.0, + "6": 40957.0, + "7": 43147.0, + "8": 45474.0, + "9": 42425.0, + "10": 45380.0, + "11": 43984.0, + "12": 44594.0, + "13": 43914.0, + "14": 46203.0, + "15": 43914.0, + "16": 41632.0, + "17": 43870.0, + "18": 44691.0, + "19": 42574.0, + "20": 44769.0, + "21": 44757.0, + "22": 41854.0, + "23": 45440.0, + "24": 43066.0, + "25": 42458.0, + "26": 43949.0, + "27": 46224.0, + "28": 46395.0, + "29": 46168.0, + "30": 44028.0, + "31": 41131.0, + "32": 43348.0, + "33": 45441.0, + "34": 43316.0, + "35": 43258.0, + "36": 42459.0, + "37": 40074.0, + "38": 42544.0, + "39": 44707.0, + "40": 43237.0, + "41": 44652.0, + "42": 43196.0, + "43": 45435.0, + "44": 44591.0, + "45": 43263.0, + "46": 43930.0, + "47": 42373.0, + "48": 44713.0, + "49": 43128.0, + "50": 43361.0, + "51": 41133.0, + "52": 43849.0, + "53": 43899.0, + "54": 41704.0, + "55": 43863.0, + "56": 43205.0, + "57": 42636.0, + "58": 43835.0, + "59": 44623.0, + "60": 41226.0, + "61": 39705.0, + "62": 44732.0, + "63": 44659.0, + "64": 45371.0, + "65": 44682.0, + "66": 45341.0, + "67": 43169.0, + "68": 42486.0, + "69": 43829.0, + "70": 45529.0, + "71": 43294.0, + "72": 44745.0, + "73": 45364.0, + "74": 42463.0, + "75": 44679.0, + "76": 43882.0, + "77": 42042.0, + "78": 40356.0, + "79": 38928.0, + "80": 41079.0, + "81": 45349.0, + "82": 43226.0, + "83": 38474.0, + "84": 42415.0, + "85": 43989.0, + "86": 45673.0, + "87": 40850.0, + "88": 41756.0, + "89": 41065.0, + "90": 44686.0, + "91": 46135.0, + "92": 41609.0, + "93": 43267.0, + "94": 39525.0, + "95": 43921.0, + "96": 44683.0, + "97": 45412.0, + "98": 41832.0, + "99": 45416.0, + "100": 42457.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1104069120.0, + "2": 1104069120.0, + "3": 1104069120.0, + "4": 1104069120.0, + "5": 1104069120.0, + "6": 1104069120.0, + "7": 1104069120.0, + "8": 1104069120.0, + "9": 1104069120.0, + "10": 1104069120.0, + "11": 1104069120.0, + "12": 1104069120.0, + "13": 1104069120.0, + "14": 1104069120.0, + "15": 1104069120.0, + "16": 1104069120.0, + "17": 1104069120.0, + "18": 1104069120.0, + "19": 1104069120.0, + "20": 1104069120.0, + "21": 1104069120.0, + "22": 1104069120.0, + "23": 1104069120.0, + "24": 1104069120.0, + "25": 1104069120.0, + "26": 1104069120.0, + "27": 1104069120.0, + "28": 1104069120.0, + "29": 1104069120.0, + "30": 1104069120.0, + "31": 1104069120.0, + "32": 1104069120.0, + "33": 1104069120.0, + "34": 1104069120.0, + "35": 1104069120.0, + "36": 1104069120.0, + "37": 1104069120.0, + "38": 1104069120.0, + "39": 1104069120.0, + "40": 1104069120.0, + "41": 1104069120.0, + "42": 1104069120.0, + "43": 1104069120.0, + "44": 1104069120.0, + "45": 1104069120.0, + "46": 1104069120.0, + "47": 1104069120.0, + "48": 1104069120.0, + "49": 1104069120.0, + "50": 1104069120.0, + "51": 1104069120.0, + "52": 1104069120.0, + "53": 1104069120.0, + "54": 1104069120.0, + "55": 1104069120.0, + "56": 1104069120.0, + "57": 1104069120.0, + "58": 1104069120.0, + "59": 1104069120.0, + "60": 1104069120.0, + "61": 1104069120.0, + "62": 1104069120.0, + "63": 1104069120.0, + "64": 1104069120.0, + "65": 1104069120.0, + "66": 1104069120.0, + "67": 1104069120.0, + "68": 1104069120.0, + "69": 1104069120.0, + "70": 1104069120.0, + "71": 1104069120.0, + "72": 1104069120.0, + "73": 1104069120.0, + "74": 1104069120.0, + "75": 1104069120.0, + "76": 1104069120.0, + "77": 1104069120.0, + "78": 1104069120.0, + "79": 1104069120.0, + "80": 1104069120.0, + "81": 1104069120.0, + "82": 1104069120.0, + "83": 1104069120.0, + "84": 1104069120.0, + "85": 1104069120.0, + "86": 1104069120.0, + "87": 1104069120.0, + "88": 1104069120.0, + "89": 1104069120.0, + "90": 1104069120.0, + "91": 1104069120.0, + "92": 1104069120.0, + "93": 1104069120.0, + "94": 1104069120.0, + "95": 1104069120.0, + "96": 1104069120.0, + "97": 1104069120.0, + "98": 1104069120.0, + "99": 1104069120.0, + "100": 1104069120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1368630784.0, + "2": 1833295360.0, + "3": 1833295360.0, + "4": 1833295360.0, + "5": 1833295360.0, + "6": 1833295360.0, + "7": 1833295360.0, + "8": 1833295360.0, + "9": 1833295360.0, + "10": 1833295360.0, + "11": 1833295360.0, + "12": 1833295360.0, + "13": 1833295360.0, + "14": 1833295360.0, + "15": 1833295360.0, + "16": 1833295360.0, + "17": 1833295360.0, + "18": 1833295360.0, + "19": 1833295360.0, + "20": 1833295360.0, + "21": 1833295360.0, + "22": 1833295360.0, + "23": 1833295360.0, + "24": 1833295360.0, + "25": 1833295360.0, + "26": 1833295360.0, + "27": 1833295360.0, + "28": 1833295360.0, + "29": 1833295360.0, + "30": 1833295360.0, + "31": 1833295360.0, + "32": 1833295360.0, + "33": 1833295360.0, + "34": 1833295360.0, + "35": 1833295360.0, + "36": 1833295360.0, + "37": 1833295360.0, + "38": 1833295360.0, + "39": 1833295360.0, + "40": 1833295360.0, + "41": 1833295360.0, + "42": 1833295360.0, + "43": 1833295360.0, + "44": 1833295360.0, + "45": 1833295360.0, + "46": 1833295360.0, + "47": 1833295360.0, + "48": 1833295360.0, + "49": 1833295360.0, + "50": 1833295360.0, + "51": 1833295360.0, + "52": 1833295360.0, + "53": 1833295360.0, + "54": 1833295360.0, + "55": 1833295360.0, + "56": 1833295360.0, + "57": 1833295360.0, + "58": 1833295360.0, + "59": 1833295360.0, + "60": 1833295360.0, + "61": 1833295360.0, + "62": 1833295360.0, + "63": 1833295360.0, + "64": 1833295360.0, + "65": 1833295360.0, + "66": 1833295360.0, + "67": 1833295360.0, + "68": 1833295360.0, + "69": 1833295360.0, + "70": 1833295360.0, + "71": 1833295360.0, + "72": 1833295360.0, + "73": 1833295360.0, + "74": 1833295360.0, + "75": 1833295360.0, + "76": 1833295360.0, + "77": 1833295360.0, + "78": 1833295360.0, + "79": 1833295360.0, + "80": 1833295360.0, + "81": 1833295360.0, + "82": 1833295360.0, + "83": 1833295360.0, + "84": 1833295360.0, + "85": 1833295360.0, + "86": 1833295360.0, + "87": 1833295360.0, + "88": 1833295360.0, + "89": 1833295360.0, + "90": 1833295360.0, + "91": 1833295360.0, + "92": 1833295360.0, + "93": 1833295360.0, + "94": 1833295360.0, + "95": 1833295360.0, + "96": 1833295360.0, + "97": 1833295360.0, + "98": 1833295360.0, + "99": 1833295360.0, + "100": 1833295360.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.38956, + "2": 0.54892, + "3": 0.53756, + "4": 0.52845, + "5": 0.52687, + "6": 0.51818, + "7": 0.52819, + "8": 0.52051, + "9": 0.52526, + "10": 0.52865, + "11": 0.52834, + "12": 0.52573, + "13": 0.52783, + "14": 0.52938, + "15": 0.51899, + "16": 0.53517, + "17": 0.52289, + "18": 0.5363, + "19": 0.5954, + "20": 0.55838, + "21": 0.52166, + "22": 0.54146, + "23": 0.53649, + "24": 0.52785, + "25": 0.52349, + "26": 0.52481, + "27": 0.52376, + "28": 0.52226, + "29": 0.5291, + "30": 0.52613, + "31": 0.52719, + "32": 0.52341, + "33": 0.52646, + "34": 0.52272, + "35": 0.53016, + "36": 0.51941, + "37": 0.52643, + "38": 0.51914, + "39": 0.53109, + "40": 0.52353, + "41": 0.55102, + "42": 0.52656, + "43": 0.53223, + "44": 0.53438, + "45": 0.53126, + "46": 0.53776, + "47": 0.52511, + "48": 0.53521, + "49": 0.52743, + "50": 0.52883, + "51": 0.54078, + "52": 0.52088, + "53": 0.53221, + "54": 0.52473, + "55": 0.54396, + "56": 0.52771, + "57": 0.52699, + "58": 0.53079, + "59": 0.52445, + "60": 0.53037, + "61": 0.52164, + "62": 0.532, + "63": 0.52392, + "64": 0.53062, + "65": 0.52269, + "66": 0.53306, + "67": 0.5173, + "68": 0.54063, + "69": 0.52464, + "70": 0.92233, + "71": 0.53301, + "72": 0.52584, + "73": 0.55029, + "74": 0.54931, + "75": 0.54907, + "76": 0.53191, + "77": 0.53522, + "78": 0.53487, + "79": 0.52543, + "80": 0.53474, + "81": 0.52635, + "82": 0.54801, + "83": 0.52605, + "84": 0.53393, + "85": 0.52523, + "86": 0.53947, + "87": 0.52933, + "88": 0.53447, + "89": 0.53, + "90": 0.5287, + "91": 0.53326, + "92": 0.54604, + "93": 0.53649, + "94": 0.5297, + "95": 0.54163, + "96": 0.52549, + "97": 0.53256, + "98": 0.53104, + "99": 0.54062, + "100": 0.52332 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json index 2a483ef0d3a..9a9cb7962ee 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.3313, + "2": 10.35273, + "3": 9.79594, + "4": 9.60954, "5": 9.42267, + "6": 9.45134, + "7": 9.34339, + "8": 9.27517, + "9": 9.09683, "10": 9.07209, + "11": 8.8835, + "12": 8.83706, + "13": 8.86832, + "14": 8.71037, "15": 8.68183, + "16": 8.56139, + "17": 8.52303, + "18": 8.43962, + "19": 8.40445, "20": 8.29516, + "21": 8.27051, + "22": 8.17907, + "23": 8.12669, + "24": 8.14854, "25": 7.99081, + "26": 8.12208, + "27": 7.90451, + "28": 7.98651, + "29": 7.80842, "30": 7.86913, + "31": 7.83557, + "32": 7.7216, + "33": 7.80364, + "34": 7.59209, "35": 7.68371, + "36": 7.53869, + "37": 7.47624, + "38": 7.51683, + "39": 7.49967, "40": 7.51717, + "41": 7.43167, + "42": 7.40089, + "43": 7.4492, + "44": 7.3892, "45": 7.3802, + "46": 7.29486, + "47": 7.44839, + "48": 7.282, + "49": 7.34647, "50": 7.17125, + "51": 7.37351, + "52": 7.13362, + "53": 7.11248, + "54": 7.23395, "55": 7.14784, + "56": 7.2278, + "57": 7.33273, + "58": 6.99464, + "59": 7.11597, "60": 7.13216, + "61": 7.10561, + "62": 7.26519, + "63": 7.14764, + "64": 7.08702, "65": 6.98658, + "66": 7.04733, + "67": 7.04745, + "68": 7.14076, + "69": 7.24347, "70": 7.05974, + "71": 6.89358, + "72": 6.99793, + "73": 6.97928, + "74": 6.91973, "75": 7.05295, + "76": 6.96054, + "77": 7.07939, + "78": 7.0137, + "79": 6.88344, "80": 6.93032, + "81": 6.96568, + "82": 7.05273, + "83": 6.98785, + "84": 7.00434, "85": 6.84596, + "86": 7.03651, + "87": 6.96347, + "88": 6.91343, + "89": 6.80657, "90": 7.23629, + "91": 6.70068, + "92": 7.05694, + "93": 6.89292, + "94": 7.05848, "95": 6.84802, + "96": 6.9679, + "97": 6.9429, + "98": 6.87432, + "99": 7.01828, "100": 6.98491 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43340.0, + "2": 44096.0, + "3": 44784.0, + "4": 42468.0, "5": 45416.0, + "6": 40967.0, + "7": 43183.0, + "8": 45463.0, + "9": 42562.0, "10": 45358.0, + "11": 44024.0, + "12": 44607.0, + "13": 43921.0, + "14": 46213.0, "15": 43945.0, + "16": 41749.0, + "17": 43868.0, + "18": 44723.0, + "19": 42609.0, "20": 44784.0, + "21": 44794.0, + "22": 41882.0, + "23": 45474.0, + "24": 43082.0, "25": 42696.0, + "26": 43952.0, + "27": 46262.0, + "28": 46418.0, + "29": 46154.0, "30": 44052.0, + "31": 41259.0, + "32": 43443.0, + "33": 45485.0, + "34": 43346.0, "35": 43279.0, + "36": 42498.0, + "37": 40653.0, + "38": 42538.0, + "39": 44772.0, "40": 43278.0, + "41": 44664.0, + "42": 43297.0, + "43": 45448.0, + "44": 44622.0, "45": 43354.0, + "46": 43931.0, + "47": 42505.0, + "48": 44726.0, + "49": 43168.0, "50": 43402.0, + "51": 41200.0, + "52": 43884.0, + "53": 43946.0, + "54": 41916.0, "55": 43925.0, + "56": 43252.0, + "57": 42636.0, + "58": 43941.0, + "59": 44619.0, "60": 41400.0, + "61": 39750.0, + "62": 44764.0, + "63": 44671.0, + "64": 45375.0, "65": 44753.0, + "66": 45404.0, + "67": 43154.0, + "68": 42551.0, + "69": 43844.0, "70": 45537.0, + "71": 43335.0, + "72": 44839.0, + "73": 45372.0, + "74": 42511.0, "75": 44712.0, + "76": 43930.0, + "77": 42073.0, + "78": 40535.0, + "79": 38992.0, "80": 41092.0, + "81": 45382.0, + "82": 43275.0, + "83": 38475.0, + "84": 42418.0, "85": 43979.0, + "86": 45691.0, + "87": 41145.0, + "88": 41782.0, + "89": 41042.0, "90": 44713.0, + "91": 46270.0, + "92": 41845.0, + "93": 43272.0, + "94": 39536.0, "95": 44085.0, + "96": 44689.0, + "97": 45411.0, + "98": 41858.0, + "99": 45575.0, "100": 42501.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4168870400.0, + "2": 4168870400.0, + "3": 4168870400.0, + "4": 4168870400.0, "5": 4168870400.0, + "6": 4168870400.0, + "7": 4168870400.0, + "8": 4168870400.0, + "9": 4168870400.0, "10": 4168870400.0, + "11": 4168870400.0, + "12": 4168870400.0, + "13": 4168870400.0, + "14": 4168870400.0, "15": 4168870400.0, + "16": 4168870400.0, + "17": 4168870400.0, + "18": 4168870400.0, + "19": 4168870400.0, "20": 4168870400.0, + "21": 4168870400.0, + "22": 4168870400.0, + "23": 4168870400.0, + "24": 4168870400.0, "25": 4168870400.0, + "26": 4168870400.0, + "27": 4168870400.0, + "28": 4168870400.0, + "29": 4168870400.0, "30": 4168870400.0, + "31": 4168870400.0, + "32": 4168870400.0, + "33": 4168870400.0, + "34": 4168870400.0, "35": 4168870400.0, + "36": 4168870400.0, + "37": 4168870400.0, + "38": 4168870400.0, + "39": 4168870400.0, "40": 4168870400.0, + "41": 4168870400.0, + "42": 4168870400.0, + "43": 4168870400.0, + "44": 4168870400.0, "45": 4168870400.0, + "46": 4168870400.0, + "47": 4168870400.0, + "48": 4168870400.0, + "49": 4168870400.0, "50": 4168870400.0, + "51": 4168870400.0, + "52": 4168870400.0, + "53": 4168870400.0, + "54": 4168870400.0, "55": 4168870400.0, + "56": 4168870400.0, + "57": 4168870400.0, + "58": 4168870400.0, + "59": 4168870400.0, "60": 4168870400.0, + "61": 4168870400.0, + "62": 4168870400.0, + "63": 4168870400.0, + "64": 4168870400.0, "65": 4168870400.0, + "66": 4168870400.0, + "67": 4168870400.0, + "68": 4168870400.0, + "69": 4168870400.0, "70": 4168870400.0, + "71": 4168870400.0, + "72": 4168870400.0, + "73": 4168870400.0, + "74": 4168870400.0, "75": 4168870400.0, + "76": 4168870400.0, + "77": 4168870400.0, + "78": 4168870400.0, + "79": 4168870400.0, "80": 4168870400.0, + "81": 4168870400.0, + "82": 4168870400.0, + "83": 4168870400.0, + "84": 4168870400.0, "85": 4168870400.0, + "86": 4168870400.0, + "87": 4168870400.0, + "88": 4168870400.0, + "89": 4168870400.0, "90": 4168870400.0, + "91": 4168870400.0, + "92": 4168870400.0, + "93": 4168870400.0, + "94": 4168870400.0, "95": 4168870400.0, + "96": 4168870400.0, + "97": 4168870400.0, + "98": 4168870400.0, + "99": 4168870400.0, "100": 4168870400.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4375071232.0, + "2": 6204402688.0, + "3": 6206499840.0, + "4": 6206499840.0, "5": 6206499840.0, + "6": 6206499840.0, + "7": 6206499840.0, + "8": 6206499840.0, + "9": 6206499840.0, "10": 6206499840.0, + "11": 6206499840.0, + "12": 6206499840.0, + "13": 6206499840.0, + "14": 6206499840.0, "15": 6206499840.0, + "16": 6206499840.0, + "17": 6206499840.0, + "18": 6206499840.0, + "19": 6206499840.0, "20": 6206499840.0, + "21": 6206499840.0, + "22": 6206499840.0, + "23": 6206499840.0, + "24": 6206499840.0, "25": 6206499840.0, + "26": 6206499840.0, + "27": 6206499840.0, + "28": 6206499840.0, + "29": 6206499840.0, "30": 6206499840.0, + "31": 6206499840.0, + "32": 6206499840.0, + "33": 6206499840.0, + "34": 6206499840.0, "35": 6206499840.0, + "36": 6206499840.0, + "37": 6206499840.0, + "38": 6206499840.0, + "39": 6206499840.0, "40": 6206499840.0, + "41": 6206499840.0, + "42": 6206499840.0, + "43": 6206499840.0, + "44": 6206499840.0, "45": 6206499840.0, + "46": 6206499840.0, + "47": 6206499840.0, + "48": 6206499840.0, + "49": 6206499840.0, "50": 6206499840.0, + "51": 6206499840.0, + "52": 6206499840.0, + "53": 6206499840.0, + "54": 6206499840.0, "55": 6206499840.0, + "56": 6206499840.0, + "57": 6206499840.0, + "58": 6206499840.0, + "59": 6206499840.0, "60": 6206499840.0, + "61": 6206499840.0, + "62": 6206499840.0, + "63": 6206499840.0, + "64": 6206499840.0, "65": 6206499840.0, + "66": 6206499840.0, + "67": 6206499840.0, + "68": 6206499840.0, + "69": 6206499840.0, "70": 6206499840.0, + "71": 6206499840.0, + "72": 6206499840.0, + "73": 6206499840.0, + "74": 6206499840.0, "75": 6206499840.0, + "76": 6206499840.0, + "77": 6206499840.0, + "78": 6206499840.0, + "79": 6206499840.0, "80": 6206499840.0, + "81": 6206499840.0, + "82": 6206499840.0, + "83": 6206499840.0, + "84": 6206499840.0, "85": 6206499840.0, + "86": 6206499840.0, + "87": 6206499840.0, + "88": 6206499840.0, + "89": 6206499840.0, "90": 6206499840.0, + "91": 6206499840.0, + "92": 6206499840.0, + "93": 6206499840.0, + "94": 6206499840.0, "95": 6206499840.0, + "96": 6206499840.0, + "97": 6206499840.0, + "98": 6206499840.0, + "99": 6206499840.0, "100": 6206499840.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 5.96824, - "5": 0.16199, - "10": 0.16035, - "15": 0.16138, - "20": 0.16464, - "25": 0.16244, - "30": 0.16034, - "35": 0.16315, - "40": 0.1629, - "45": 0.1679, - "50": 0.163, - "55": 0.16422, - "60": 0.16092, - "65": 0.17177, - "70": 0.16664, - "75": 0.16285, - "80": 0.15979, - "85": 0.16193, - "90": 0.16426, - "95": 0.16461, - "100": 0.49883 + "1": 7.18555, + "2": 0.22912, + "3": 0.19495, + "4": 0.19292, + "5": 0.1933, + "6": 0.20082, + "7": 0.1898, + "8": 0.19078, + "9": 0.19631, + "10": 0.18961, + "11": 0.19602, + "12": 0.19712, + "13": 0.19248, + "14": 0.19302, + "15": 0.19445, + "16": 0.19515, + "17": 0.19565, + "18": 0.18839, + "19": 0.19044, + "20": 0.1878, + "21": 0.19199, + "22": 0.19051, + "23": 0.19216, + "24": 0.19009, + "25": 0.18449, + "26": 0.19206, + "27": 0.19, + "28": 0.19154, + "29": 0.19019, + "30": 0.18961, + "31": 0.18739, + "32": 0.19441, + "33": 0.18956, + "34": 0.19188, + "35": 0.20225, + "36": 0.1956, + "37": 0.20085, + "38": 0.20338, + "39": 0.19512, + "40": 0.20945, + "41": 0.20775, + "42": 0.20695, + "43": 0.20502, + "44": 0.19536, + "45": 0.1972, + "46": 0.19693, + "47": 0.2056, + "48": 0.19367, + "49": 0.19288, + "50": 0.19187, + "51": 0.19233, + "52": 0.19557, + "53": 0.19068, + "54": 0.18458, + "55": 0.18565, + "56": 0.18636, + "57": 0.19313, + "58": 0.18633, + "59": 0.18858, + "60": 0.18486, + "61": 0.18799, + "62": 0.18531, + "63": 0.19385, + "64": 0.18893, + "65": 0.1968, + "66": 0.19472, + "67": 0.19267, + "68": 0.19586, + "69": 0.22272, + "70": 0.22071, + "71": 0.18794, + "72": 0.19924, + "73": 0.19888, + "74": 0.22693, + "75": 0.20741, + "76": 0.19831, + "77": 0.20398, + "78": 0.19269, + "79": 0.19066, + "80": 0.18543, + "81": 0.18666, + "82": 0.18559, + "83": 0.19153, + "84": 0.18527, + "85": 0.18623, + "86": 0.48843, + "87": 0.18991, + "88": 0.18251, + "89": 0.18473, + "90": 0.18511, + "91": 0.19021, + "92": 0.19055, + "93": 0.18545, + "94": 0.1853, + "95": 0.18396, + "96": 0.1848, + "97": 0.19407, + "98": 0.18533, + "99": 0.18593, + "100": 0.48771 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..72278130300 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34492, + "2": 10.36433, + "3": 9.73145, + "4": 9.57923, + "5": 9.3892, + "6": 9.41078, + "7": 9.30545, + "8": 9.24872, + "9": 9.09363, + "10": 9.01571, + "11": 8.86227, + "12": 8.79088, + "13": 8.80884, + "14": 8.67658, + "15": 8.64615, + "16": 8.53973, + "17": 8.47875, + "18": 8.38919, + "19": 8.36145, + "20": 8.26963, + "21": 8.26321, + "22": 8.15047, + "23": 8.08861, + "24": 8.12416, + "25": 7.99467, + "26": 8.08474, + "27": 7.87741, + "28": 7.95852, + "29": 7.79567, + "30": 7.87463, + "31": 7.83211, + "32": 7.69448, + "33": 7.78447, + "34": 7.55753, + "35": 7.65847, + "36": 7.52861, + "37": 7.44889, + "38": 7.50364, + "39": 7.48064, + "40": 7.50295, + "41": 7.3974, + "42": 7.37184, + "43": 7.44291, + "44": 7.38083, + "45": 7.36112, + "46": 7.29391, + "47": 7.475, + "48": 7.29535, + "49": 7.3607, + "50": 7.19186, + "51": 7.38728, + "52": 7.13728, + "53": 7.12477, + "54": 7.23618, + "55": 7.16789, + "56": 7.22866, + "57": 7.34625, + "58": 7.03082, + "59": 7.12273, + "60": 7.16511, + "61": 7.11656, + "62": 7.26779, + "63": 7.16695, + "64": 7.08275, + "65": 7.00051, + "66": 7.07139, + "67": 7.05884, + "68": 7.14563, + "69": 7.03993, + "70": 7.07139, + "71": 6.91636, + "72": 7.02022, + "73": 6.99002, + "74": 6.91408, + "75": 7.07586, + "76": 6.97032, + "77": 7.08431, + "78": 7.03516, + "79": 6.88312, + "80": 6.95246, + "81": 6.98441, + "82": 7.06806, + "83": 7.00882, + "84": 7.01789, + "85": 6.86372, + "86": 7.04924, + "87": 6.99288, + "88": 6.92333, + "89": 6.82337, + "90": 7.25405, + "91": 6.72212, + "92": 7.05344, + "93": 6.91633, + "94": 7.0654, + "95": 6.85964, + "96": 6.98723, + "97": 6.96749, + "98": 6.89904, + "99": 7.02746, + "100": 6.99698 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43313.0, + "2": 44075.0, + "3": 44779.0, + "4": 42461.0, + "5": 45406.0, + "6": 40995.0, + "7": 43185.0, + "8": 45480.0, + "9": 42555.0, + "10": 45370.0, + "11": 44017.0, + "12": 44619.0, + "13": 43939.0, + "14": 46223.0, + "15": 43950.0, + "16": 41732.0, + "17": 43869.0, + "18": 44696.0, + "19": 42631.0, + "20": 44806.0, + "21": 44813.0, + "22": 41897.0, + "23": 45483.0, + "24": 43099.0, + "25": 42740.0, + "26": 43950.0, + "27": 46249.0, + "28": 46424.0, + "29": 46206.0, + "30": 44052.0, + "31": 41268.0, + "32": 43408.0, + "33": 45487.0, + "34": 43390.0, + "35": 43279.0, + "36": 42533.0, + "37": 40700.0, + "38": 42585.0, + "39": 44772.0, + "40": 43242.0, + "41": 44698.0, + "42": 43271.0, + "43": 45502.0, + "44": 44648.0, + "45": 43344.0, + "46": 43923.0, + "47": 42519.0, + "48": 44691.0, + "49": 43190.0, + "50": 43411.0, + "51": 41175.0, + "52": 43901.0, + "53": 43967.0, + "54": 41964.0, + "55": 43968.0, + "56": 43280.0, + "57": 42566.0, + "58": 43903.0, + "59": 44657.0, + "60": 41346.0, + "61": 39760.0, + "62": 44779.0, + "63": 44680.0, + "64": 45395.0, + "65": 44726.0, + "66": 45386.0, + "67": 43197.0, + "68": 42570.0, + "69": 43834.0, + "70": 45545.0, + "71": 43402.0, + "72": 44828.0, + "73": 45410.0, + "74": 42508.0, + "75": 44680.0, + "76": 43936.0, + "77": 42111.0, + "78": 40541.0, + "79": 38950.0, + "80": 41138.0, + "81": 45397.0, + "82": 43256.0, + "83": 38500.0, + "84": 42533.0, + "85": 44039.0, + "86": 45756.0, + "87": 41125.0, + "88": 41799.0, + "89": 41088.0, + "90": 44735.0, + "91": 46292.0, + "92": 41852.0, + "93": 43234.0, + "94": 39581.0, + "95": 44094.0, + "96": 44736.0, + "97": 45487.0, + "98": 41852.0, + "99": 45522.0, + "100": 42475.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4138985984.0, + "2": 4138985984.0, + "3": 4138985984.0, + "4": 4138985984.0, + "5": 4138985984.0, + "6": 4138985984.0, + "7": 4138985984.0, + "8": 4138985984.0, + "9": 4138985984.0, + "10": 4138985984.0, + "11": 4138985984.0, + "12": 4138985984.0, + "13": 4138985984.0, + "14": 4138985984.0, + "15": 4138985984.0, + "16": 4138985984.0, + "17": 4138985984.0, + "18": 4138985984.0, + "19": 4138985984.0, + "20": 4138985984.0, + "21": 4138985984.0, + "22": 4138985984.0, + "23": 4138985984.0, + "24": 4138985984.0, + "25": 4138985984.0, + "26": 4138985984.0, + "27": 4138985984.0, + "28": 4138985984.0, + "29": 4138985984.0, + "30": 4138985984.0, + "31": 4138985984.0, + "32": 4138985984.0, + "33": 4138985984.0, + "34": 4138985984.0, + "35": 4138985984.0, + "36": 4138985984.0, + "37": 4138985984.0, + "38": 4138985984.0, + "39": 4138985984.0, + "40": 4138985984.0, + "41": 4138985984.0, + "42": 4138985984.0, + "43": 4138985984.0, + "44": 4138985984.0, + "45": 4138985984.0, + "46": 4138985984.0, + "47": 4138985984.0, + "48": 4138985984.0, + "49": 4138985984.0, + "50": 4138985984.0, + "51": 4138985984.0, + "52": 4138985984.0, + "53": 4138985984.0, + "54": 4138985984.0, + "55": 4138985984.0, + "56": 4138985984.0, + "57": 4138985984.0, + "58": 4138985984.0, + "59": 4138985984.0, + "60": 4138985984.0, + "61": 4138985984.0, + "62": 4138985984.0, + "63": 4138985984.0, + "64": 4138985984.0, + "65": 4138985984.0, + "66": 4138985984.0, + "67": 4138985984.0, + "68": 4138985984.0, + "69": 4138985984.0, + "70": 4138985984.0, + "71": 4138985984.0, + "72": 4138985984.0, + "73": 4138985984.0, + "74": 4138985984.0, + "75": 4138985984.0, + "76": 4138985984.0, + "77": 4138985984.0, + "78": 4138985984.0, + "79": 4138985984.0, + "80": 4138985984.0, + "81": 4138985984.0, + "82": 4138985984.0, + "83": 4138985984.0, + "84": 4138985984.0, + "85": 4138985984.0, + "86": 4138985984.0, + "87": 4138985984.0, + "88": 4138985984.0, + "89": 4138985984.0, + "90": 4138985984.0, + "91": 4138985984.0, + "92": 4138985984.0, + "93": 4138985984.0, + "94": 4138985984.0, + "95": 4138985984.0, + "96": 4138985984.0, + "97": 4138985984.0, + "98": 4138985984.0, + "99": 4138985984.0, + "100": 4138985984.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4345973248.0, + "2": 6174256128.0, + "3": 6177401856.0, + "4": 6177401856.0, + "5": 6177401856.0, + "6": 6177401856.0, + "7": 6177401856.0, + "8": 6177401856.0, + "9": 6177401856.0, + "10": 6177401856.0, + "11": 6177401856.0, + "12": 6177401856.0, + "13": 6177401856.0, + "14": 6177401856.0, + "15": 6177401856.0, + "16": 6177401856.0, + "17": 6177401856.0, + "18": 6177401856.0, + "19": 6177401856.0, + "20": 6177401856.0, + "21": 6177401856.0, + "22": 6177401856.0, + "23": 6177401856.0, + "24": 6177401856.0, + "25": 6177401856.0, + "26": 6177401856.0, + "27": 6177401856.0, + "28": 6177401856.0, + "29": 6177401856.0, + "30": 6177401856.0, + "31": 6177401856.0, + "32": 6177401856.0, + "33": 6177401856.0, + "34": 6177401856.0, + "35": 6177401856.0, + "36": 6177401856.0, + "37": 6177401856.0, + "38": 6177401856.0, + "39": 6177401856.0, + "40": 6177401856.0, + "41": 6177401856.0, + "42": 6177401856.0, + "43": 6177401856.0, + "44": 6177401856.0, + "45": 6177401856.0, + "46": 6177401856.0, + "47": 6177401856.0, + "48": 6177401856.0, + "49": 6177401856.0, + "50": 6177401856.0, + "51": 6177401856.0, + "52": 6177401856.0, + "53": 6177401856.0, + "54": 6177401856.0, + "55": 6177401856.0, + "56": 6177401856.0, + "57": 6177401856.0, + "58": 6177401856.0, + "59": 6177401856.0, + "60": 6177401856.0, + "61": 6177401856.0, + "62": 6177401856.0, + "63": 6177401856.0, + "64": 6177401856.0, + "65": 6177401856.0, + "66": 6177401856.0, + "67": 6177401856.0, + "68": 6177401856.0, + "69": 6177401856.0, + "70": 6177401856.0, + "71": 6177401856.0, + "72": 6177401856.0, + "73": 6177401856.0, + "74": 6177401856.0, + "75": 6177401856.0, + "76": 6177401856.0, + "77": 6177401856.0, + "78": 6177401856.0, + "79": 6177401856.0, + "80": 6177401856.0, + "81": 6177401856.0, + "82": 6177401856.0, + "83": 6177401856.0, + "84": 6177401856.0, + "85": 6177401856.0, + "86": 6177401856.0, + "87": 6177401856.0, + "88": 6177401856.0, + "89": 6177401856.0, + "90": 6177401856.0, + "91": 6177401856.0, + "92": 6177401856.0, + "93": 6177401856.0, + "94": 6177401856.0, + "95": 6177401856.0, + "96": 6177401856.0, + "97": 6177401856.0, + "98": 6177401856.0, + "99": 6177401856.0, + "100": 6177401856.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.37564, + "2": 0.47907, + "3": 0.26318, + "4": 0.26361, + "5": 0.26788, + "6": 0.26504, + "7": 0.26585, + "8": 0.26222, + "9": 0.26257, + "10": 0.26426, + "11": 0.26743, + "12": 0.26324, + "13": 0.2631, + "14": 0.26214, + "15": 0.26226, + "16": 0.26202, + "17": 0.26215, + "18": 0.26191, + "19": 0.26192, + "20": 0.26328, + "21": 0.28093, + "22": 0.26248, + "23": 0.26259, + "24": 0.26257, + "25": 0.26193, + "26": 0.26229, + "27": 0.26207, + "28": 0.26284, + "29": 0.26248, + "30": 0.26171, + "31": 0.26369, + "32": 0.26295, + "33": 0.26244, + "34": 0.26239, + "35": 0.26289, + "36": 0.26221, + "37": 0.26173, + "38": 0.26276, + "39": 0.26177, + "40": 0.26145, + "41": 0.72968, + "42": 0.26423, + "43": 0.26386, + "44": 0.26138, + "45": 0.26438, + "46": 0.26265, + "47": 0.26382, + "48": 0.26338, + "49": 0.2647, + "50": 0.26389, + "51": 0.27004, + "52": 0.28055, + "53": 0.26495, + "54": 0.26509, + "55": 0.60834, + "56": 0.26487, + "57": 0.26475, + "58": 0.26728, + "59": 0.27353, + "60": 0.2644, + "61": 0.26294, + "62": 0.27032, + "63": 0.26838, + "64": 0.26385, + "65": 0.26288, + "66": 0.74822, + "67": 0.26372, + "68": 0.72466, + "69": 0.26508, + "70": 0.76862, + "71": 0.26359, + "72": 0.26496, + "73": 0.26691, + "74": 0.26615, + "75": 0.26787, + "76": 0.26937, + "77": 0.26491, + "78": 0.26651, + "79": 0.26743, + "80": 0.26533, + "81": 0.2655, + "82": 0.26612, + "83": 0.26497, + "84": 0.26502, + "85": 0.2647, + "86": 0.26554, + "87": 0.26569, + "88": 0.26554, + "89": 0.26468, + "90": 0.26229, + "91": 0.26142, + "92": 0.26206, + "93": 0.26215, + "94": 0.26471, + "95": 0.26142, + "96": 0.65482, + "97": 0.26367, + "98": 0.26226, + "99": 0.26183, + "100": 0.26175 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..96fd81c74b6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34492, + "2": 10.36433, + "3": 9.73145, + "4": 9.57923, + "5": 9.3892, + "6": 9.41078, + "7": 9.30545, + "8": 9.24872, + "9": 9.09363, + "10": 9.01571, + "11": 8.86227, + "12": 8.79088, + "13": 8.80884, + "14": 8.67658, + "15": 8.64615, + "16": 8.53973, + "17": 8.47875, + "18": 8.38919, + "19": 8.36145, + "20": 8.26963, + "21": 8.26321, + "22": 8.15047, + "23": 8.08861, + "24": 8.12416, + "25": 7.99467, + "26": 8.08474, + "27": 7.87741, + "28": 7.95852, + "29": 7.79567, + "30": 7.87463, + "31": 7.83211, + "32": 7.69448, + "33": 7.78447, + "34": 7.55753, + "35": 7.65847, + "36": 7.52861, + "37": 7.44889, + "38": 7.50364, + "39": 7.48064, + "40": 7.50295, + "41": 7.3974, + "42": 7.37184, + "43": 7.44291, + "44": 7.38083, + "45": 7.36112, + "46": 7.29391, + "47": 7.475, + "48": 7.29535, + "49": 7.3607, + "50": 7.19186, + "51": 7.38728, + "52": 7.13728, + "53": 7.12477, + "54": 7.23618, + "55": 7.16789, + "56": 7.22866, + "57": 7.34625, + "58": 7.03082, + "59": 7.12273, + "60": 7.16511, + "61": 7.11656, + "62": 7.26779, + "63": 7.16695, + "64": 7.08275, + "65": 7.00051, + "66": 7.07139, + "67": 7.05884, + "68": 7.14563, + "69": 7.03993, + "70": 7.07139, + "71": 6.91636, + "72": 7.02022, + "73": 6.99002, + "74": 6.91408, + "75": 7.07586, + "76": 6.97032, + "77": 7.08431, + "78": 7.03516, + "79": 6.88312, + "80": 6.95246, + "81": 6.98441, + "82": 7.06806, + "83": 7.00882, + "84": 7.01789, + "85": 6.86372, + "86": 7.04924, + "87": 6.99288, + "88": 6.92333, + "89": 6.82337, + "90": 7.25405, + "91": 6.72212, + "92": 7.05344, + "93": 6.91633, + "94": 7.0654, + "95": 6.85964, + "96": 6.98723, + "97": 6.96749, + "98": 6.89904, + "99": 7.02746, + "100": 6.99698 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43313.0, + "2": 44075.0, + "3": 44779.0, + "4": 42461.0, + "5": 45406.0, + "6": 40995.0, + "7": 43185.0, + "8": 45480.0, + "9": 42555.0, + "10": 45370.0, + "11": 44017.0, + "12": 44619.0, + "13": 43939.0, + "14": 46223.0, + "15": 43950.0, + "16": 41732.0, + "17": 43869.0, + "18": 44696.0, + "19": 42631.0, + "20": 44806.0, + "21": 44813.0, + "22": 41897.0, + "23": 45483.0, + "24": 43099.0, + "25": 42740.0, + "26": 43950.0, + "27": 46249.0, + "28": 46424.0, + "29": 46206.0, + "30": 44052.0, + "31": 41268.0, + "32": 43408.0, + "33": 45487.0, + "34": 43390.0, + "35": 43279.0, + "36": 42533.0, + "37": 40700.0, + "38": 42585.0, + "39": 44772.0, + "40": 43242.0, + "41": 44698.0, + "42": 43271.0, + "43": 45502.0, + "44": 44648.0, + "45": 43344.0, + "46": 43923.0, + "47": 42519.0, + "48": 44691.0, + "49": 43190.0, + "50": 43411.0, + "51": 41175.0, + "52": 43901.0, + "53": 43967.0, + "54": 41964.0, + "55": 43968.0, + "56": 43280.0, + "57": 42566.0, + "58": 43903.0, + "59": 44657.0, + "60": 41346.0, + "61": 39760.0, + "62": 44779.0, + "63": 44680.0, + "64": 45395.0, + "65": 44726.0, + "66": 45386.0, + "67": 43197.0, + "68": 42570.0, + "69": 43834.0, + "70": 45545.0, + "71": 43402.0, + "72": 44828.0, + "73": 45410.0, + "74": 42508.0, + "75": 44680.0, + "76": 43936.0, + "77": 42111.0, + "78": 40541.0, + "79": 38950.0, + "80": 41138.0, + "81": 45397.0, + "82": 43256.0, + "83": 38500.0, + "84": 42533.0, + "85": 44039.0, + "86": 45756.0, + "87": 41125.0, + "88": 41799.0, + "89": 41088.0, + "90": 44735.0, + "91": 46292.0, + "92": 41852.0, + "93": 43234.0, + "94": 39581.0, + "95": 44094.0, + "96": 44736.0, + "97": 45487.0, + "98": 41852.0, + "99": 45522.0, + "100": 42475.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4138985984.0, + "2": 4138985984.0, + "3": 4138985984.0, + "4": 4138985984.0, + "5": 4138985984.0, + "6": 4138985984.0, + "7": 4138985984.0, + "8": 4138985984.0, + "9": 4138985984.0, + "10": 4138985984.0, + "11": 4138985984.0, + "12": 4138985984.0, + "13": 4138985984.0, + "14": 4138985984.0, + "15": 4138985984.0, + "16": 4138985984.0, + "17": 4138985984.0, + "18": 4138985984.0, + "19": 4138985984.0, + "20": 4138985984.0, + "21": 4138985984.0, + "22": 4138985984.0, + "23": 4138985984.0, + "24": 4138985984.0, + "25": 4138985984.0, + "26": 4138985984.0, + "27": 4138985984.0, + "28": 4138985984.0, + "29": 4138985984.0, + "30": 4138985984.0, + "31": 4138985984.0, + "32": 4138985984.0, + "33": 4138985984.0, + "34": 4138985984.0, + "35": 4138985984.0, + "36": 4138985984.0, + "37": 4138985984.0, + "38": 4138985984.0, + "39": 4138985984.0, + "40": 4138985984.0, + "41": 4138985984.0, + "42": 4138985984.0, + "43": 4138985984.0, + "44": 4138985984.0, + "45": 4138985984.0, + "46": 4138985984.0, + "47": 4138985984.0, + "48": 4138985984.0, + "49": 4138985984.0, + "50": 4138985984.0, + "51": 4138985984.0, + "52": 4138985984.0, + "53": 4138985984.0, + "54": 4138985984.0, + "55": 4138985984.0, + "56": 4138985984.0, + "57": 4138985984.0, + "58": 4138985984.0, + "59": 4138985984.0, + "60": 4138985984.0, + "61": 4138985984.0, + "62": 4138985984.0, + "63": 4138985984.0, + "64": 4138985984.0, + "65": 4138985984.0, + "66": 4138985984.0, + "67": 4138985984.0, + "68": 4138985984.0, + "69": 4138985984.0, + "70": 4138985984.0, + "71": 4138985984.0, + "72": 4138985984.0, + "73": 4138985984.0, + "74": 4138985984.0, + "75": 4138985984.0, + "76": 4138985984.0, + "77": 4138985984.0, + "78": 4138985984.0, + "79": 4138985984.0, + "80": 4138985984.0, + "81": 4138985984.0, + "82": 4138985984.0, + "83": 4138985984.0, + "84": 4138985984.0, + "85": 4138985984.0, + "86": 4138985984.0, + "87": 4138985984.0, + "88": 4138985984.0, + "89": 4138985984.0, + "90": 4138985984.0, + "91": 4138985984.0, + "92": 4138985984.0, + "93": 4138985984.0, + "94": 4138985984.0, + "95": 4138985984.0, + "96": 4138985984.0, + "97": 4138985984.0, + "98": 4138985984.0, + "99": 4138985984.0, + "100": 4138985984.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4345973248.0, + "2": 6174256128.0, + "3": 6177401856.0, + "4": 6177401856.0, + "5": 6177401856.0, + "6": 6177401856.0, + "7": 6177401856.0, + "8": 6177401856.0, + "9": 6177401856.0, + "10": 6177401856.0, + "11": 6177401856.0, + "12": 6177401856.0, + "13": 6177401856.0, + "14": 6177401856.0, + "15": 6177401856.0, + "16": 6177401856.0, + "17": 6177401856.0, + "18": 6177401856.0, + "19": 6177401856.0, + "20": 6177401856.0, + "21": 6177401856.0, + "22": 6177401856.0, + "23": 6177401856.0, + "24": 6177401856.0, + "25": 6177401856.0, + "26": 6177401856.0, + "27": 6177401856.0, + "28": 6177401856.0, + "29": 6177401856.0, + "30": 6177401856.0, + "31": 6177401856.0, + "32": 6177401856.0, + "33": 6177401856.0, + "34": 6177401856.0, + "35": 6177401856.0, + "36": 6177401856.0, + "37": 6177401856.0, + "38": 6177401856.0, + "39": 6177401856.0, + "40": 6177401856.0, + "41": 6177401856.0, + "42": 6177401856.0, + "43": 6177401856.0, + "44": 6177401856.0, + "45": 6177401856.0, + "46": 6177401856.0, + "47": 6177401856.0, + "48": 6177401856.0, + "49": 6177401856.0, + "50": 6177401856.0, + "51": 6177401856.0, + "52": 6177401856.0, + "53": 6177401856.0, + "54": 6177401856.0, + "55": 6177401856.0, + "56": 6177401856.0, + "57": 6177401856.0, + "58": 6177401856.0, + "59": 6177401856.0, + "60": 6177401856.0, + "61": 6177401856.0, + "62": 6177401856.0, + "63": 6177401856.0, + "64": 6177401856.0, + "65": 6177401856.0, + "66": 6177401856.0, + "67": 6177401856.0, + "68": 6177401856.0, + "69": 6177401856.0, + "70": 6177401856.0, + "71": 6177401856.0, + "72": 6177401856.0, + "73": 6177401856.0, + "74": 6177401856.0, + "75": 6177401856.0, + "76": 6177401856.0, + "77": 6177401856.0, + "78": 6177401856.0, + "79": 6177401856.0, + "80": 6177401856.0, + "81": 6177401856.0, + "82": 6177401856.0, + "83": 6177401856.0, + "84": 6177401856.0, + "85": 6177401856.0, + "86": 6177401856.0, + "87": 6177401856.0, + "88": 6177401856.0, + "89": 6177401856.0, + "90": 6177401856.0, + "91": 6177401856.0, + "92": 6177401856.0, + "93": 6177401856.0, + "94": 6177401856.0, + "95": 6177401856.0, + "96": 6177401856.0, + "97": 6177401856.0, + "98": 6177401856.0, + "99": 6177401856.0, + "100": 6177401856.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.07146, + "2": 0.76333, + "3": 0.25771, + "4": 0.25798, + "5": 0.26042, + "6": 0.26046, + "7": 0.25457, + "8": 0.25511, + "9": 0.2545, + "10": 0.25426, + "11": 0.25469, + "12": 0.25997, + "13": 0.25528, + "14": 0.25614, + "15": 0.25513, + "16": 0.25483, + "17": 0.25502, + "18": 0.2548, + "19": 0.25406, + "20": 0.25473, + "21": 0.25442, + "22": 0.25742, + "23": 0.25489, + "24": 0.25468, + "25": 0.25473, + "26": 0.25514, + "27": 0.25485, + "28": 0.25816, + "29": 0.7004, + "30": 0.25418, + "31": 0.25433, + "32": 0.25688, + "33": 0.25464, + "34": 0.25871, + "35": 0.2549, + "36": 0.25562, + "37": 0.25614, + "38": 0.26065, + "39": 0.25541, + "40": 0.25812, + "41": 0.25448, + "42": 0.25927, + "43": 0.25478, + "44": 0.25871, + "45": 0.25543, + "46": 0.25643, + "47": 0.25677, + "48": 0.25828, + "49": 0.2635, + "50": 0.26946, + "51": 0.29227, + "52": 0.28254, + "53": 0.28602, + "54": 0.25359, + "55": 0.2527, + "56": 0.25629, + "57": 0.26137, + "58": 0.25726, + "59": 0.25218, + "60": 0.25733, + "61": 0.25525, + "62": 0.25763, + "63": 0.25252, + "64": 0.26416, + "65": 0.25869, + "66": 0.25931, + "67": 0.26105, + "68": 0.26311, + "69": 0.25743, + "70": 0.25561, + "71": 0.2518, + "72": 0.25716, + "73": 0.26251, + "74": 0.27278, + "75": 0.25271, + "76": 0.25285, + "77": 0.25408, + "78": 0.70817, + "79": 0.25523, + "80": 0.26051, + "81": 0.26069, + "82": 0.25995, + "83": 0.25528, + "84": 0.25685, + "85": 0.25548, + "86": 0.74098, + "87": 0.25554, + "88": 0.27779, + "89": 0.28379, + "90": 0.28037, + "91": 0.28316, + "92": 0.2777, + "93": 0.25778, + "94": 0.25143, + "95": 0.25144, + "96": 0.25195, + "97": 0.25167, + "98": 0.25838, + "99": 0.25302, + "100": 0.25157 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..c1e5927389e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.3313, + "2": 10.35273, + "3": 9.79594, + "4": 9.60954, + "5": 9.42267, + "6": 9.45134, + "7": 9.34339, + "8": 9.27517, + "9": 9.09683, + "10": 9.07209, + "11": 8.8835, + "12": 8.83706, + "13": 8.86832, + "14": 8.71037, + "15": 8.68183, + "16": 8.56139, + "17": 8.52303, + "18": 8.43962, + "19": 8.40445, + "20": 8.29516, + "21": 8.27051, + "22": 8.17907, + "23": 8.12669, + "24": 8.14854, + "25": 7.99081, + "26": 8.12208, + "27": 7.90451, + "28": 7.98651, + "29": 7.80842, + "30": 7.86913, + "31": 7.83557, + "32": 7.7216, + "33": 7.80364, + "34": 7.59209, + "35": 7.68371, + "36": 7.53869, + "37": 7.47624, + "38": 7.51683, + "39": 7.49967, + "40": 7.51717, + "41": 7.43167, + "42": 7.40089, + "43": 7.4492, + "44": 7.3892, + "45": 7.3802, + "46": 7.29486, + "47": 7.44839, + "48": 7.282, + "49": 7.34647, + "50": 7.17125, + "51": 7.37351, + "52": 7.13362, + "53": 7.11248, + "54": 7.23395, + "55": 7.14784, + "56": 7.2278, + "57": 7.33273, + "58": 6.99464, + "59": 7.11597, + "60": 7.13216, + "61": 7.10561, + "62": 7.26519, + "63": 7.14764, + "64": 7.08702, + "65": 6.98658, + "66": 7.04733, + "67": 7.04745, + "68": 7.14076, + "69": 7.24347, + "70": 7.05974, + "71": 6.89358, + "72": 6.99793, + "73": 6.97928, + "74": 6.91973, + "75": 7.05295, + "76": 6.96054, + "77": 7.07939, + "78": 7.0137, + "79": 6.88344, + "80": 6.93032, + "81": 6.96568, + "82": 7.05273, + "83": 6.98785, + "84": 7.00434, + "85": 6.84596, + "86": 7.03651, + "87": 6.96347, + "88": 6.91343, + "89": 6.80657, + "90": 7.23629, + "91": 6.70068, + "92": 7.05694, + "93": 6.89292, + "94": 7.05848, + "95": 6.84802, + "96": 6.9679, + "97": 6.9429, + "98": 6.87432, + "99": 7.01828, + "100": 6.98491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43340.0, + "2": 44096.0, + "3": 44784.0, + "4": 42468.0, + "5": 45416.0, + "6": 40967.0, + "7": 43183.0, + "8": 45463.0, + "9": 42562.0, + "10": 45358.0, + "11": 44024.0, + "12": 44607.0, + "13": 43921.0, + "14": 46213.0, + "15": 43945.0, + "16": 41749.0, + "17": 43868.0, + "18": 44723.0, + "19": 42609.0, + "20": 44784.0, + "21": 44794.0, + "22": 41882.0, + "23": 45474.0, + "24": 43082.0, + "25": 42696.0, + "26": 43952.0, + "27": 46262.0, + "28": 46418.0, + "29": 46154.0, + "30": 44052.0, + "31": 41259.0, + "32": 43443.0, + "33": 45485.0, + "34": 43346.0, + "35": 43279.0, + "36": 42498.0, + "37": 40653.0, + "38": 42538.0, + "39": 44772.0, + "40": 43278.0, + "41": 44664.0, + "42": 43297.0, + "43": 45448.0, + "44": 44622.0, + "45": 43354.0, + "46": 43931.0, + "47": 42505.0, + "48": 44726.0, + "49": 43168.0, + "50": 43402.0, + "51": 41200.0, + "52": 43884.0, + "53": 43946.0, + "54": 41916.0, + "55": 43925.0, + "56": 43252.0, + "57": 42636.0, + "58": 43941.0, + "59": 44619.0, + "60": 41400.0, + "61": 39750.0, + "62": 44764.0, + "63": 44671.0, + "64": 45375.0, + "65": 44753.0, + "66": 45404.0, + "67": 43154.0, + "68": 42551.0, + "69": 43844.0, + "70": 45537.0, + "71": 43335.0, + "72": 44839.0, + "73": 45372.0, + "74": 42511.0, + "75": 44712.0, + "76": 43930.0, + "77": 42073.0, + "78": 40535.0, + "79": 38992.0, + "80": 41092.0, + "81": 45382.0, + "82": 43275.0, + "83": 38475.0, + "84": 42418.0, + "85": 43979.0, + "86": 45691.0, + "87": 41145.0, + "88": 41782.0, + "89": 41042.0, + "90": 44713.0, + "91": 46270.0, + "92": 41845.0, + "93": 43272.0, + "94": 39536.0, + "95": 44085.0, + "96": 44689.0, + "97": 45411.0, + "98": 41858.0, + "99": 45575.0, + "100": 42501.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4168870400.0, + "2": 4168870400.0, + "3": 4168870400.0, + "4": 4168870400.0, + "5": 4168870400.0, + "6": 4168870400.0, + "7": 4168870400.0, + "8": 4168870400.0, + "9": 4168870400.0, + "10": 4168870400.0, + "11": 4168870400.0, + "12": 4168870400.0, + "13": 4168870400.0, + "14": 4168870400.0, + "15": 4168870400.0, + "16": 4168870400.0, + "17": 4168870400.0, + "18": 4168870400.0, + "19": 4168870400.0, + "20": 4168870400.0, + "21": 4168870400.0, + "22": 4168870400.0, + "23": 4168870400.0, + "24": 4168870400.0, + "25": 4168870400.0, + "26": 4168870400.0, + "27": 4168870400.0, + "28": 4168870400.0, + "29": 4168870400.0, + "30": 4168870400.0, + "31": 4168870400.0, + "32": 4168870400.0, + "33": 4168870400.0, + "34": 4168870400.0, + "35": 4168870400.0, + "36": 4168870400.0, + "37": 4168870400.0, + "38": 4168870400.0, + "39": 4168870400.0, + "40": 4168870400.0, + "41": 4168870400.0, + "42": 4168870400.0, + "43": 4168870400.0, + "44": 4168870400.0, + "45": 4168870400.0, + "46": 4168870400.0, + "47": 4168870400.0, + "48": 4168870400.0, + "49": 4168870400.0, + "50": 4168870400.0, + "51": 4168870400.0, + "52": 4168870400.0, + "53": 4168870400.0, + "54": 4168870400.0, + "55": 4168870400.0, + "56": 4168870400.0, + "57": 4168870400.0, + "58": 4168870400.0, + "59": 4168870400.0, + "60": 4168870400.0, + "61": 4168870400.0, + "62": 4168870400.0, + "63": 4168870400.0, + "64": 4168870400.0, + "65": 4168870400.0, + "66": 4168870400.0, + "67": 4168870400.0, + "68": 4168870400.0, + "69": 4168870400.0, + "70": 4168870400.0, + "71": 4168870400.0, + "72": 4168870400.0, + "73": 4168870400.0, + "74": 4168870400.0, + "75": 4168870400.0, + "76": 4168870400.0, + "77": 4168870400.0, + "78": 4168870400.0, + "79": 4168870400.0, + "80": 4168870400.0, + "81": 4168870400.0, + "82": 4168870400.0, + "83": 4168870400.0, + "84": 4168870400.0, + "85": 4168870400.0, + "86": 4168870400.0, + "87": 4168870400.0, + "88": 4168870400.0, + "89": 4168870400.0, + "90": 4168870400.0, + "91": 4168870400.0, + "92": 4168870400.0, + "93": 4168870400.0, + "94": 4168870400.0, + "95": 4168870400.0, + "96": 4168870400.0, + "97": 4168870400.0, + "98": 4168870400.0, + "99": 4168870400.0, + "100": 4168870400.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4375071232.0, + "2": 6204402688.0, + "3": 6206499840.0, + "4": 6206499840.0, + "5": 6206499840.0, + "6": 6206499840.0, + "7": 6206499840.0, + "8": 6206499840.0, + "9": 6206499840.0, + "10": 6206499840.0, + "11": 6206499840.0, + "12": 6206499840.0, + "13": 6206499840.0, + "14": 6206499840.0, + "15": 6206499840.0, + "16": 6206499840.0, + "17": 6206499840.0, + "18": 6206499840.0, + "19": 6206499840.0, + "20": 6206499840.0, + "21": 6206499840.0, + "22": 6206499840.0, + "23": 6206499840.0, + "24": 6206499840.0, + "25": 6206499840.0, + "26": 6206499840.0, + "27": 6206499840.0, + "28": 6206499840.0, + "29": 6206499840.0, + "30": 6206499840.0, + "31": 6206499840.0, + "32": 6206499840.0, + "33": 6206499840.0, + "34": 6206499840.0, + "35": 6206499840.0, + "36": 6206499840.0, + "37": 6206499840.0, + "38": 6206499840.0, + "39": 6206499840.0, + "40": 6206499840.0, + "41": 6206499840.0, + "42": 6206499840.0, + "43": 6206499840.0, + "44": 6206499840.0, + "45": 6206499840.0, + "46": 6206499840.0, + "47": 6206499840.0, + "48": 6206499840.0, + "49": 6206499840.0, + "50": 6206499840.0, + "51": 6206499840.0, + "52": 6206499840.0, + "53": 6206499840.0, + "54": 6206499840.0, + "55": 6206499840.0, + "56": 6206499840.0, + "57": 6206499840.0, + "58": 6206499840.0, + "59": 6206499840.0, + "60": 6206499840.0, + "61": 6206499840.0, + "62": 6206499840.0, + "63": 6206499840.0, + "64": 6206499840.0, + "65": 6206499840.0, + "66": 6206499840.0, + "67": 6206499840.0, + "68": 6206499840.0, + "69": 6206499840.0, + "70": 6206499840.0, + "71": 6206499840.0, + "72": 6206499840.0, + "73": 6206499840.0, + "74": 6206499840.0, + "75": 6206499840.0, + "76": 6206499840.0, + "77": 6206499840.0, + "78": 6206499840.0, + "79": 6206499840.0, + "80": 6206499840.0, + "81": 6206499840.0, + "82": 6206499840.0, + "83": 6206499840.0, + "84": 6206499840.0, + "85": 6206499840.0, + "86": 6206499840.0, + "87": 6206499840.0, + "88": 6206499840.0, + "89": 6206499840.0, + "90": 6206499840.0, + "91": 6206499840.0, + "92": 6206499840.0, + "93": 6206499840.0, + "94": 6206499840.0, + "95": 6206499840.0, + "96": 6206499840.0, + "97": 6206499840.0, + "98": 6206499840.0, + "99": 6206499840.0, + "100": 6206499840.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.56951, + "2": 0.36564, + "3": 0.16506, + "4": 0.16216, + "5": 0.16401, + "6": 0.1643, + "7": 0.16404, + "8": 0.16401, + "9": 0.16504, + "10": 0.1617, + "11": 0.16576, + "12": 0.16229, + "13": 0.16499, + "14": 0.16561, + "15": 0.16438, + "16": 0.16356, + "17": 0.16261, + "18": 0.16022, + "19": 0.16185, + "20": 0.1635, + "21": 0.16599, + "22": 0.16234, + "23": 0.16167, + "24": 0.16807, + "25": 0.16164, + "26": 0.16553, + "27": 0.16403, + "28": 0.16811, + "29": 0.16239, + "30": 0.16649, + "31": 0.16267, + "32": 0.16749, + "33": 0.1637, + "34": 0.16943, + "35": 0.16268, + "36": 0.17031, + "37": 0.16717, + "38": 0.17077, + "39": 0.16691, + "40": 0.17033, + "41": 0.16714, + "42": 0.1713, + "43": 0.16706, + "44": 0.16889, + "45": 0.1679, + "46": 0.16944, + "47": 0.16158, + "48": 0.16604, + "49": 0.16504, + "50": 0.17162, + "51": 0.16897, + "52": 0.17155, + "53": 0.16436, + "54": 0.17087, + "55": 0.16555, + "56": 0.16962, + "57": 0.16191, + "58": 0.17048, + "59": 0.1671, + "60": 0.16952, + "61": 0.16638, + "62": 0.1732, + "63": 0.19062, + "64": 0.17721, + "65": 0.16282, + "66": 0.16924, + "67": 0.16252, + "68": 0.16523, + "69": 0.16729, + "70": 0.53751, + "71": 0.16521, + "72": 0.17116, + "73": 0.16408, + "74": 0.16918, + "75": 0.16612, + "76": 0.21043, + "77": 0.17541, + "78": 0.20915, + "79": 0.19264, + "80": 0.16783, + "81": 0.16133, + "82": 0.16441, + "83": 0.16468, + "84": 0.16274, + "85": 0.16617, + "86": 0.16466, + "87": 0.16539, + "88": 0.16381, + "89": 0.1685, + "90": 0.1636, + "91": 0.17069, + "92": 0.16636, + "93": 0.16881, + "94": 0.16448, + "95": 0.16838, + "96": 0.16612, + "97": 0.1674, + "98": 0.16485, + "99": 0.17249, + "100": 0.16394 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..8809a47cd54 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.3313, + "2": 10.35273, + "3": 9.79594, + "4": 9.60954, + "5": 9.42267, + "6": 9.45134, + "7": 9.34339, + "8": 9.27517, + "9": 9.09683, + "10": 9.07209, + "11": 8.8835, + "12": 8.83706, + "13": 8.86832, + "14": 8.71037, + "15": 8.68183, + "16": 8.56139, + "17": 8.52303, + "18": 8.43962, + "19": 8.40445, + "20": 8.29516, + "21": 8.27051, + "22": 8.17907, + "23": 8.12669, + "24": 8.14854, + "25": 7.99081, + "26": 8.12208, + "27": 7.90451, + "28": 7.98651, + "29": 7.80842, + "30": 7.86913, + "31": 7.83557, + "32": 7.7216, + "33": 7.80364, + "34": 7.59209, + "35": 7.68371, + "36": 7.53869, + "37": 7.47624, + "38": 7.51683, + "39": 7.49967, + "40": 7.51717, + "41": 7.43167, + "42": 7.40089, + "43": 7.4492, + "44": 7.3892, + "45": 7.3802, + "46": 7.29486, + "47": 7.44839, + "48": 7.282, + "49": 7.34647, + "50": 7.17125, + "51": 7.37351, + "52": 7.13362, + "53": 7.11248, + "54": 7.23395, + "55": 7.14784, + "56": 7.2278, + "57": 7.33273, + "58": 6.99464, + "59": 7.11597, + "60": 7.13216, + "61": 7.10561, + "62": 7.26519, + "63": 7.14764, + "64": 7.08702, + "65": 6.98658, + "66": 7.04733, + "67": 7.04745, + "68": 7.14076, + "69": 7.24347, + "70": 7.05974, + "71": 6.89358, + "72": 6.99793, + "73": 6.97928, + "74": 6.91973, + "75": 7.05295, + "76": 6.96054, + "77": 7.07939, + "78": 7.0137, + "79": 6.88344, + "80": 6.93032, + "81": 6.96568, + "82": 7.05273, + "83": 6.98785, + "84": 7.00434, + "85": 6.84596, + "86": 7.03651, + "87": 6.96347, + "88": 6.91343, + "89": 6.80657, + "90": 7.23629, + "91": 6.70068, + "92": 7.05694, + "93": 6.89292, + "94": 7.05848, + "95": 6.84802, + "96": 6.9679, + "97": 6.9429, + "98": 6.87432, + "99": 7.01828, + "100": 6.98491 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43340.0, + "2": 44096.0, + "3": 44784.0, + "4": 42468.0, + "5": 45416.0, + "6": 40967.0, + "7": 43183.0, + "8": 45463.0, + "9": 42562.0, + "10": 45358.0, + "11": 44024.0, + "12": 44607.0, + "13": 43921.0, + "14": 46213.0, + "15": 43945.0, + "16": 41749.0, + "17": 43868.0, + "18": 44723.0, + "19": 42609.0, + "20": 44784.0, + "21": 44794.0, + "22": 41882.0, + "23": 45474.0, + "24": 43082.0, + "25": 42696.0, + "26": 43952.0, + "27": 46262.0, + "28": 46418.0, + "29": 46154.0, + "30": 44052.0, + "31": 41259.0, + "32": 43443.0, + "33": 45485.0, + "34": 43346.0, + "35": 43279.0, + "36": 42498.0, + "37": 40653.0, + "38": 42538.0, + "39": 44772.0, + "40": 43278.0, + "41": 44664.0, + "42": 43297.0, + "43": 45448.0, + "44": 44622.0, + "45": 43354.0, + "46": 43931.0, + "47": 42505.0, + "48": 44726.0, + "49": 43168.0, + "50": 43402.0, + "51": 41200.0, + "52": 43884.0, + "53": 43946.0, + "54": 41916.0, + "55": 43925.0, + "56": 43252.0, + "57": 42636.0, + "58": 43941.0, + "59": 44619.0, + "60": 41400.0, + "61": 39750.0, + "62": 44764.0, + "63": 44671.0, + "64": 45375.0, + "65": 44753.0, + "66": 45404.0, + "67": 43154.0, + "68": 42551.0, + "69": 43844.0, + "70": 45537.0, + "71": 43335.0, + "72": 44839.0, + "73": 45372.0, + "74": 42511.0, + "75": 44712.0, + "76": 43930.0, + "77": 42073.0, + "78": 40535.0, + "79": 38992.0, + "80": 41092.0, + "81": 45382.0, + "82": 43275.0, + "83": 38475.0, + "84": 42418.0, + "85": 43979.0, + "86": 45691.0, + "87": 41145.0, + "88": 41782.0, + "89": 41042.0, + "90": 44713.0, + "91": 46270.0, + "92": 41845.0, + "93": 43272.0, + "94": 39536.0, + "95": 44085.0, + "96": 44689.0, + "97": 45411.0, + "98": 41858.0, + "99": 45575.0, + "100": 42501.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4168870400.0, + "2": 4168870400.0, + "3": 4168870400.0, + "4": 4168870400.0, + "5": 4168870400.0, + "6": 4168870400.0, + "7": 4168870400.0, + "8": 4168870400.0, + "9": 4168870400.0, + "10": 4168870400.0, + "11": 4168870400.0, + "12": 4168870400.0, + "13": 4168870400.0, + "14": 4168870400.0, + "15": 4168870400.0, + "16": 4168870400.0, + "17": 4168870400.0, + "18": 4168870400.0, + "19": 4168870400.0, + "20": 4168870400.0, + "21": 4168870400.0, + "22": 4168870400.0, + "23": 4168870400.0, + "24": 4168870400.0, + "25": 4168870400.0, + "26": 4168870400.0, + "27": 4168870400.0, + "28": 4168870400.0, + "29": 4168870400.0, + "30": 4168870400.0, + "31": 4168870400.0, + "32": 4168870400.0, + "33": 4168870400.0, + "34": 4168870400.0, + "35": 4168870400.0, + "36": 4168870400.0, + "37": 4168870400.0, + "38": 4168870400.0, + "39": 4168870400.0, + "40": 4168870400.0, + "41": 4168870400.0, + "42": 4168870400.0, + "43": 4168870400.0, + "44": 4168870400.0, + "45": 4168870400.0, + "46": 4168870400.0, + "47": 4168870400.0, + "48": 4168870400.0, + "49": 4168870400.0, + "50": 4168870400.0, + "51": 4168870400.0, + "52": 4168870400.0, + "53": 4168870400.0, + "54": 4168870400.0, + "55": 4168870400.0, + "56": 4168870400.0, + "57": 4168870400.0, + "58": 4168870400.0, + "59": 4168870400.0, + "60": 4168870400.0, + "61": 4168870400.0, + "62": 4168870400.0, + "63": 4168870400.0, + "64": 4168870400.0, + "65": 4168870400.0, + "66": 4168870400.0, + "67": 4168870400.0, + "68": 4168870400.0, + "69": 4168870400.0, + "70": 4168870400.0, + "71": 4168870400.0, + "72": 4168870400.0, + "73": 4168870400.0, + "74": 4168870400.0, + "75": 4168870400.0, + "76": 4168870400.0, + "77": 4168870400.0, + "78": 4168870400.0, + "79": 4168870400.0, + "80": 4168870400.0, + "81": 4168870400.0, + "82": 4168870400.0, + "83": 4168870400.0, + "84": 4168870400.0, + "85": 4168870400.0, + "86": 4168870400.0, + "87": 4168870400.0, + "88": 4168870400.0, + "89": 4168870400.0, + "90": 4168870400.0, + "91": 4168870400.0, + "92": 4168870400.0, + "93": 4168870400.0, + "94": 4168870400.0, + "95": 4168870400.0, + "96": 4168870400.0, + "97": 4168870400.0, + "98": 4168870400.0, + "99": 4168870400.0, + "100": 4168870400.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4375071232.0, + "2": 6204402688.0, + "3": 6206499840.0, + "4": 6206499840.0, + "5": 6206499840.0, + "6": 6206499840.0, + "7": 6206499840.0, + "8": 6206499840.0, + "9": 6206499840.0, + "10": 6206499840.0, + "11": 6206499840.0, + "12": 6206499840.0, + "13": 6206499840.0, + "14": 6206499840.0, + "15": 6206499840.0, + "16": 6206499840.0, + "17": 6206499840.0, + "18": 6206499840.0, + "19": 6206499840.0, + "20": 6206499840.0, + "21": 6206499840.0, + "22": 6206499840.0, + "23": 6206499840.0, + "24": 6206499840.0, + "25": 6206499840.0, + "26": 6206499840.0, + "27": 6206499840.0, + "28": 6206499840.0, + "29": 6206499840.0, + "30": 6206499840.0, + "31": 6206499840.0, + "32": 6206499840.0, + "33": 6206499840.0, + "34": 6206499840.0, + "35": 6206499840.0, + "36": 6206499840.0, + "37": 6206499840.0, + "38": 6206499840.0, + "39": 6206499840.0, + "40": 6206499840.0, + "41": 6206499840.0, + "42": 6206499840.0, + "43": 6206499840.0, + "44": 6206499840.0, + "45": 6206499840.0, + "46": 6206499840.0, + "47": 6206499840.0, + "48": 6206499840.0, + "49": 6206499840.0, + "50": 6206499840.0, + "51": 6206499840.0, + "52": 6206499840.0, + "53": 6206499840.0, + "54": 6206499840.0, + "55": 6206499840.0, + "56": 6206499840.0, + "57": 6206499840.0, + "58": 6206499840.0, + "59": 6206499840.0, + "60": 6206499840.0, + "61": 6206499840.0, + "62": 6206499840.0, + "63": 6206499840.0, + "64": 6206499840.0, + "65": 6206499840.0, + "66": 6206499840.0, + "67": 6206499840.0, + "68": 6206499840.0, + "69": 6206499840.0, + "70": 6206499840.0, + "71": 6206499840.0, + "72": 6206499840.0, + "73": 6206499840.0, + "74": 6206499840.0, + "75": 6206499840.0, + "76": 6206499840.0, + "77": 6206499840.0, + "78": 6206499840.0, + "79": 6206499840.0, + "80": 6206499840.0, + "81": 6206499840.0, + "82": 6206499840.0, + "83": 6206499840.0, + "84": 6206499840.0, + "85": 6206499840.0, + "86": 6206499840.0, + "87": 6206499840.0, + "88": 6206499840.0, + "89": 6206499840.0, + "90": 6206499840.0, + "91": 6206499840.0, + "92": 6206499840.0, + "93": 6206499840.0, + "94": 6206499840.0, + "95": 6206499840.0, + "96": 6206499840.0, + "97": 6206499840.0, + "98": 6206499840.0, + "99": 6206499840.0, + "100": 6206499840.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.22025, + "2": 0.31576, + "3": 0.19278, + "4": 0.19432, + "5": 0.18909, + "6": 0.19307, + "7": 0.18922, + "8": 0.19506, + "9": 0.18834, + "10": 0.19233, + "11": 0.18825, + "12": 0.19571, + "13": 0.19081, + "14": 0.19613, + "15": 0.18954, + "16": 0.18825, + "17": 0.18583, + "18": 0.18933, + "19": 0.1896, + "20": 0.19136, + "21": 0.18842, + "22": 0.19581, + "23": 0.18752, + "24": 0.19277, + "25": 0.18759, + "26": 0.19405, + "27": 0.18784, + "28": 0.18762, + "29": 0.19232, + "30": 0.18798, + "31": 0.18713, + "32": 0.18948, + "33": 0.18968, + "34": 0.19011, + "35": 0.18907, + "36": 0.18983, + "37": 0.18857, + "38": 0.18728, + "39": 0.18835, + "40": 0.18777, + "41": 0.188, + "42": 0.18818, + "43": 0.18602, + "44": 0.18972, + "45": 0.19276, + "46": 0.18816, + "47": 0.18794, + "48": 0.19299, + "49": 0.19241, + "50": 0.18805, + "51": 0.18895, + "52": 0.19459, + "53": 0.18821, + "54": 0.18597, + "55": 0.189, + "56": 0.18748, + "57": 0.18709, + "58": 0.19127, + "59": 0.19097, + "60": 0.18702, + "61": 0.18725, + "62": 0.18762, + "63": 0.19407, + "64": 0.19411, + "65": 0.20071, + "66": 0.19555, + "67": 0.22543, + "68": 0.21724, + "69": 0.22635, + "70": 0.52922, + "71": 0.19086, + "72": 0.19899, + "73": 0.51667, + "74": 0.20138, + "75": 0.19507, + "76": 0.24987, + "77": 0.22838, + "78": 0.51523, + "79": 0.19126, + "80": 0.18911, + "81": 0.19269, + "82": 0.18816, + "83": 0.18902, + "84": 0.18942, + "85": 0.19004, + "86": 0.50868, + "87": 0.19274, + "88": 0.18813, + "89": 0.19169, + "90": 0.50854, + "91": 0.1924, + "92": 0.18906, + "93": 0.19016, + "94": 0.1902, + "95": 0.19338, + "96": 0.51468, + "97": 0.19597, + "98": 0.19147, + "99": 0.19626, + "100": 0.18852 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json index 7a1c2a35b70..24fbb5008a6 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.37205, + "2": 10.36993, + "3": 9.85245, + "4": 9.61997, "5": 9.40867, + "6": 9.43219, + "7": 9.31484, + "8": 9.27336, + "9": 9.11412, "10": 9.03968, + "11": 8.87198, + "12": 8.80862, + "13": 8.83469, + "14": 8.69021, "15": 8.66221, + "16": 8.54816, + "17": 8.50088, + "18": 8.42516, + "19": 8.38808, "20": 8.28073, + "21": 8.26592, + "22": 8.15988, + "23": 8.11241, + "24": 8.14271, "25": 7.98425, + "26": 8.10594, + "27": 7.88954, + "28": 7.9705, + "29": 7.81272, "30": 7.87636, + "31": 7.82505, + "32": 7.70262, + "33": 7.80169, + "34": 7.56872, "35": 7.67373, + "36": 7.54686, + "37": 7.47401, + "38": 7.50726, + "39": 7.49794, "40": 7.51081, + "41": 7.41055, + "42": 7.37984, + "43": 7.44091, + "44": 7.39372, "45": 7.37241, + "46": 7.28404, + "47": 7.46627, + "48": 7.29038, + "49": 7.35015, "50": 7.17193, + "51": 7.37002, + "52": 7.14463, + "53": 7.12651, + "54": 7.23742, "55": 7.15579, + "56": 7.23152, + "57": 7.3354, + "58": 7.01365, + "59": 7.11427, "60": 7.15124, + "61": 7.1088, + "62": 7.26824, + "63": 7.15182, + "64": 7.08401, "65": 6.99127, + "66": 7.05305, + "67": 7.04353, + "68": 7.13973, + "69": 7.03243, "70": 7.05831, + "71": 6.90378, + "72": 6.99805, + "73": 6.97678, + "74": 6.91757, "75": 7.06665, + "76": 6.95719, + "77": 7.08701, + "78": 7.03266, + "79": 6.8532, "80": 6.93633, + "81": 6.97582, + "82": 7.0624, + "83": 6.98226, + "84": 7.00923, "85": 6.8507, + "86": 7.04663, + "87": 6.97947, + "88": 6.91093, + "89": 6.8168, "90": 7.24561, + "91": 6.7048, + "92": 7.05407, + "93": 6.89399, + "94": 7.0542, "95": 6.85047, + "96": 6.96463, + "97": 6.95624, + "98": 6.8829, + "99": 7.00419, "100": 6.98982 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43288.0, + "2": 44033.0, + "3": 44733.0, + "4": 42406.0, "5": 45371.0, + "6": 40945.0, + "7": 43173.0, + "8": 45430.0, + "9": 42421.0, "10": 45369.0, + "11": 43974.0, + "12": 44588.0, + "13": 43908.0, + "14": 46215.0, "15": 43901.0, + "16": 41603.0, + "17": 43832.0, + "18": 44695.0, + "19": 42547.0, "20": 44758.0, + "21": 44777.0, + "22": 41821.0, + "23": 45434.0, + "24": 43080.0, "25": 42439.0, + "26": 43936.0, + "27": 46214.0, + "28": 46342.0, + "29": 46135.0, "30": 43995.0, + "31": 41271.0, + "32": 43336.0, + "33": 45440.0, + "34": 43287.0, "35": 43240.0, + "36": 42490.0, + "37": 40078.0, + "38": 42510.0, + "39": 44722.0, "40": 43230.0, + "41": 44669.0, + "42": 43262.0, + "43": 45476.0, + "44": 44624.0, "45": 43326.0, + "46": 43945.0, + "47": 42395.0, + "48": 44675.0, + "49": 43169.0, "50": 43381.0, + "51": 41131.0, + "52": 43830.0, + "53": 43914.0, + "54": 42004.0, "55": 43871.0, + "56": 43227.0, + "57": 42550.0, + "58": 43816.0, + "59": 44631.0, "60": 41183.0, + "61": 39721.0, + "62": 44752.0, + "63": 44696.0, + "64": 45351.0, "65": 44694.0, + "66": 45350.0, + "67": 43132.0, + "68": 42535.0, + "69": 43829.0, "70": 45533.0, + "71": 43322.0, + "72": 44749.0, + "73": 45365.0, + "74": 42492.0, "75": 44655.0, + "76": 43920.0, + "77": 42080.0, + "78": 40298.0, + "79": 38909.0, "80": 41117.0, + "81": 45370.0, + "82": 43206.0, + "83": 38501.0, + "84": 42484.0, "85": 43986.0, + "86": 45704.0, + "87": 40839.0, + "88": 41828.0, + "89": 41074.0, "90": 44663.0, + "91": 46169.0, + "92": 41807.0, + "93": 43228.0, + "94": 39549.0, "95": 44090.0, + "96": 44711.0, + "97": 45390.0, + "98": 41799.0, + "99": 45426.0, "100": 42443.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2194357248.0, + "2": 2194357248.0, + "3": 2194357248.0, + "4": 2194357248.0, "5": 2194357248.0, + "6": 2194357248.0, + "7": 2194357248.0, + "8": 2194357248.0, + "9": 2194357248.0, "10": 2194357248.0, + "11": 2194357248.0, + "12": 2194357248.0, + "13": 2194357248.0, + "14": 2194357248.0, "15": 2194357248.0, + "16": 2194357248.0, + "17": 2194357248.0, + "18": 2194357248.0, + "19": 2194357248.0, "20": 2194357248.0, + "21": 2194357248.0, + "22": 2194357248.0, + "23": 2194357248.0, + "24": 2194357248.0, "25": 2194357248.0, + "26": 2194357248.0, + "27": 2194357248.0, + "28": 2194357248.0, + "29": 2194357248.0, "30": 2194357248.0, + "31": 2194357248.0, + "32": 2194357248.0, + "33": 2194357248.0, + "34": 2194357248.0, "35": 2194357248.0, + "36": 2194357248.0, + "37": 2194357248.0, + "38": 2194357248.0, + "39": 2194357248.0, "40": 2194357248.0, + "41": 2194357248.0, + "42": 2194357248.0, + "43": 2194357248.0, + "44": 2194357248.0, "45": 2194357248.0, + "46": 2194357248.0, + "47": 2194357248.0, + "48": 2194357248.0, + "49": 2194357248.0, "50": 2194357248.0, + "51": 2194357248.0, + "52": 2194357248.0, + "53": 2194357248.0, + "54": 2194357248.0, "55": 2194357248.0, + "56": 2194357248.0, + "57": 2194357248.0, + "58": 2194357248.0, + "59": 2194357248.0, "60": 2194357248.0, + "61": 2194357248.0, + "62": 2194357248.0, + "63": 2194357248.0, + "64": 2194357248.0, "65": 2194357248.0, + "66": 2194357248.0, + "67": 2194357248.0, + "68": 2194357248.0, + "69": 2194357248.0, "70": 2194357248.0, + "71": 2194357248.0, + "72": 2194357248.0, + "73": 2194357248.0, + "74": 2194357248.0, "75": 2194357248.0, + "76": 2194357248.0, + "77": 2194357248.0, + "78": 2194357248.0, + "79": 2194357248.0, "80": 2194357248.0, + "81": 2194357248.0, + "82": 2194357248.0, + "83": 2194357248.0, + "84": 2194357248.0, "85": 2194357248.0, + "86": 2194357248.0, + "87": 2194357248.0, + "88": 2194357248.0, + "89": 2194357248.0, "90": 2194357248.0, + "91": 2194357248.0, + "92": 2194357248.0, + "93": 2194357248.0, + "94": 2194357248.0, "95": 2194357248.0, + "96": 2194357248.0, + "97": 2194357248.0, + "98": 2194357248.0, + "99": 2194357248.0, "100": 2194357248.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2443624960.0, + "2": 3375193600.0, + "3": 3375193600.0, + "4": 3375193600.0, "5": 3375193600.0, + "6": 3375193600.0, + "7": 3375193600.0, + "8": 3375193600.0, + "9": 3375193600.0, "10": 3375193600.0, + "11": 3375193600.0, + "12": 3375193600.0, + "13": 3375193600.0, + "14": 3375193600.0, "15": 3375193600.0, + "16": 3375193600.0, + "17": 3375193600.0, + "18": 3375193600.0, + "19": 3375193600.0, "20": 3375193600.0, + "21": 3375193600.0, + "22": 3375193600.0, + "23": 3375193600.0, + "24": 3375193600.0, "25": 3375193600.0, + "26": 3375193600.0, + "27": 3375193600.0, + "28": 3375193600.0, + "29": 3375193600.0, "30": 3375193600.0, + "31": 3375193600.0, + "32": 3375193600.0, + "33": 3375193600.0, + "34": 3375193600.0, "35": 3375193600.0, + "36": 3375193600.0, + "37": 3375193600.0, + "38": 3375193600.0, + "39": 3375193600.0, "40": 3375193600.0, + "41": 3375193600.0, + "42": 3375193600.0, + "43": 3375193600.0, + "44": 3375193600.0, "45": 3375193600.0, + "46": 3375193600.0, + "47": 3375193600.0, + "48": 3375193600.0, + "49": 3375193600.0, "50": 3375193600.0, + "51": 3375193600.0, + "52": 3375193600.0, + "53": 3375193600.0, + "54": 3375193600.0, "55": 3375193600.0, + "56": 3375193600.0, + "57": 3375193600.0, + "58": 3375193600.0, + "59": 3375193600.0, "60": 3375193600.0, + "61": 3375193600.0, + "62": 3375193600.0, + "63": 3375193600.0, + "64": 3375193600.0, "65": 3375193600.0, + "66": 3375193600.0, + "67": 3375193600.0, + "68": 3375193600.0, + "69": 3375193600.0, "70": 3375193600.0, + "71": 3375193600.0, + "72": 3375193600.0, + "73": 3375193600.0, + "74": 3375193600.0, "75": 3375193600.0, + "76": 3375193600.0, + "77": 3375193600.0, + "78": 3375193600.0, + "79": 3375193600.0, "80": 3375193600.0, + "81": 3375193600.0, + "82": 3375193600.0, + "83": 3375193600.0, + "84": 3375193600.0, "85": 3375193600.0, + "86": 3375193600.0, + "87": 3375193600.0, + "88": 3375193600.0, + "89": 3375193600.0, "90": 3375193600.0, + "91": 3375193600.0, + "92": 3375193600.0, + "93": 3375193600.0, + "94": 3375193600.0, "95": 3375193600.0, + "96": 3375193600.0, + "97": 3375193600.0, + "98": 3375193600.0, + "99": 3375193600.0, "100": 3375193600.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 6.57343, - "5": 0.29301, - "10": 0.29182, - "15": 0.29668, - "20": 0.2961, - "25": 0.29961, - "30": 0.29549, - "35": 0.30714, - "40": 0.29592, - "45": 0.29418, - "50": 0.29188, - "55": 0.29019, - "60": 0.29199, - "65": 0.5931, - "70": 0.59584, - "75": 0.29011, - "80": 0.29788, - "85": 0.30993, - "90": 0.2992, - "95": 0.29538, - "100": 0.29811 + "1": 9.51792, + "2": 0.37696, + "3": 0.35384, + "4": 0.34824, + "5": 0.34677, + "6": 0.36735, + "7": 0.37639, + "8": 0.37373, + "9": 0.37798, + "10": 0.37384, + "11": 0.37808, + "12": 0.37762, + "13": 0.37479, + "14": 0.38389, + "15": 0.37511, + "16": 0.3766, + "17": 0.37666, + "18": 0.37513, + "19": 0.36239, + "20": 0.34482, + "21": 0.36935, + "22": 0.37904, + "23": 0.36041, + "24": 0.35765, + "25": 0.36227, + "26": 0.3603, + "27": 0.36061, + "28": 0.35888, + "29": 0.36254, + "30": 0.3638, + "31": 0.36821, + "32": 0.36371, + "33": 0.36426, + "34": 0.63693, + "35": 0.38755, + "36": 0.37078, + "37": 0.36346, + "38": 0.36485, + "39": 0.36467, + "40": 0.43549, + "41": 0.35057, + "42": 0.35472, + "43": 0.35255, + "44": 0.34681, + "45": 0.34612, + "46": 0.3502, + "47": 0.34647, + "48": 0.7097, + "49": 0.34958, + "50": 0.34947, + "51": 0.68193, + "52": 0.66437, + "53": 0.6483, + "54": 0.35744, + "55": 0.34501, + "56": 0.35464, + "57": 0.3506, + "58": 0.34648, + "59": 0.35134, + "60": 0.34883, + "61": 0.34803, + "62": 0.35208, + "63": 0.3458, + "64": 0.34919, + "65": 0.35351, + "66": 0.35034, + "67": 0.34776, + "68": 0.35303, + "69": 0.34862, + "70": 0.35025, + "71": 0.35221, + "72": 0.34546, + "73": 0.34844, + "74": 0.35311, + "75": 0.34698, + "76": 0.34803, + "77": 0.34856, + "78": 0.34471, + "79": 0.64787, + "80": 0.34702, + "81": 0.35417, + "82": 0.34815, + "83": 0.34811, + "84": 0.36328, + "85": 0.35053, + "86": 0.34968, + "87": 0.641, + "88": 0.35086, + "89": 0.35762, + "90": 0.34969, + "91": 0.35083, + "92": 0.36212, + "93": 0.35255, + "94": 0.35084, + "95": 0.35297, + "96": 0.34869, + "97": 0.3518, + "98": 0.3551, + "99": 0.35073, + "100": 0.35332 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..5c3d959191a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38869, + "2": 10.39385, + "3": 9.78084, + "4": 9.59727, + "5": 9.38084, + "6": 9.40579, + "7": 9.30788, + "8": 9.24106, + "9": 9.12192, + "10": 9.05709, + "11": 8.87331, + "12": 8.7937, + "13": 8.84028, + "14": 8.68508, + "15": 8.65595, + "16": 8.54356, + "17": 8.50088, + "18": 8.39002, + "19": 8.36442, + "20": 8.26189, + "21": 8.27089, + "22": 8.14388, + "23": 8.07456, + "24": 8.11903, + "25": 7.98194, + "26": 8.08775, + "27": 7.87135, + "28": 7.96498, + "29": 7.80253, + "30": 7.86925, + "31": 7.81724, + "32": 7.68778, + "33": 7.78042, + "34": 7.55486, + "35": 7.66275, + "36": 7.52238, + "37": 7.44446, + "38": 7.50242, + "39": 7.45039, + "40": 7.5007, + "41": 7.39051, + "42": 7.36065, + "43": 7.43329, + "44": 7.3762, + "45": 7.34875, + "46": 7.28162, + "47": 7.46112, + "48": 7.28762, + "49": 7.35376, + "50": 7.18139, + "51": 7.36575, + "52": 7.1333, + "53": 7.11549, + "54": 7.22921, + "55": 7.15407, + "56": 7.22241, + "57": 7.32951, + "58": 7.02329, + "59": 7.11369, + "60": 7.14724, + "61": 7.11415, + "62": 7.24749, + "63": 7.15673, + "64": 7.08408, + "65": 6.99707, + "66": 7.06064, + "67": 7.04874, + "68": 7.14167, + "69": 7.0346, + "70": 7.06003, + "71": 6.92549, + "72": 7.00408, + "73": 6.97962, + "74": 6.92272, + "75": 7.0608, + "76": 6.97256, + "77": 7.08183, + "78": 7.01864, + "79": 6.8552, + "80": 6.94288, + "81": 6.97634, + "82": 7.06647, + "83": 6.99975, + "84": 7.00894, + "85": 6.85973, + "86": 7.03631, + "87": 6.98045, + "88": 6.91491, + "89": 6.81048, + "90": 7.24972, + "91": 6.71004, + "92": 7.04898, + "93": 6.90555, + "94": 7.06456, + "95": 6.84835, + "96": 6.97647, + "97": 6.9631, + "98": 6.88688, + "99": 7.01307, + "100": 6.9828 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43318.0, + "2": 44050.0, + "3": 44756.0, + "4": 42391.0, + "5": 45385.0, + "6": 40966.0, + "7": 43182.0, + "8": 45459.0, + "9": 42453.0, + "10": 45371.0, + "11": 43978.0, + "12": 44598.0, + "13": 43892.0, + "14": 46190.0, + "15": 43897.0, + "16": 41608.0, + "17": 43825.0, + "18": 44703.0, + "19": 42550.0, + "20": 44769.0, + "21": 44793.0, + "22": 41844.0, + "23": 45444.0, + "24": 43071.0, + "25": 42476.0, + "26": 43926.0, + "27": 46218.0, + "28": 46430.0, + "29": 46178.0, + "30": 43985.0, + "31": 41281.0, + "32": 43347.0, + "33": 45448.0, + "34": 43305.0, + "35": 43264.0, + "36": 42485.0, + "37": 40077.0, + "38": 42514.0, + "39": 44723.0, + "40": 43230.0, + "41": 44653.0, + "42": 43269.0, + "43": 45446.0, + "44": 44588.0, + "45": 43278.0, + "46": 43896.0, + "47": 42369.0, + "48": 44704.0, + "49": 43172.0, + "50": 43381.0, + "51": 41175.0, + "52": 43812.0, + "53": 43934.0, + "54": 41932.0, + "55": 43857.0, + "56": 43277.0, + "57": 42576.0, + "58": 43835.0, + "59": 44629.0, + "60": 41225.0, + "61": 39716.0, + "62": 44773.0, + "63": 44717.0, + "64": 45367.0, + "65": 44683.0, + "66": 45367.0, + "67": 43136.0, + "68": 42523.0, + "69": 43828.0, + "70": 45534.0, + "71": 43316.0, + "72": 44750.0, + "73": 45364.0, + "74": 42445.0, + "75": 44679.0, + "76": 43875.0, + "77": 42100.0, + "78": 40289.0, + "79": 38949.0, + "80": 41115.0, + "81": 45362.0, + "82": 43205.0, + "83": 38475.0, + "84": 42459.0, + "85": 44010.0, + "86": 45731.0, + "87": 40860.0, + "88": 41793.0, + "89": 41068.0, + "90": 44673.0, + "91": 46149.0, + "92": 41798.0, + "93": 43246.0, + "94": 39583.0, + "95": 44064.0, + "96": 44715.0, + "97": 45390.0, + "98": 41808.0, + "99": 45436.0, + "100": 42520.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2164472832.0, + "2": 2164472832.0, + "3": 2164472832.0, + "4": 2164472832.0, + "5": 2164472832.0, + "6": 2164472832.0, + "7": 2164472832.0, + "8": 2164472832.0, + "9": 2164472832.0, + "10": 2164472832.0, + "11": 2164472832.0, + "12": 2164472832.0, + "13": 2164472832.0, + "14": 2164472832.0, + "15": 2164472832.0, + "16": 2164472832.0, + "17": 2164472832.0, + "18": 2164472832.0, + "19": 2164472832.0, + "20": 2164472832.0, + "21": 2164472832.0, + "22": 2164472832.0, + "23": 2164472832.0, + "24": 2164472832.0, + "25": 2164472832.0, + "26": 2164472832.0, + "27": 2164472832.0, + "28": 2164472832.0, + "29": 2164472832.0, + "30": 2164472832.0, + "31": 2164472832.0, + "32": 2164472832.0, + "33": 2164472832.0, + "34": 2164472832.0, + "35": 2164472832.0, + "36": 2164472832.0, + "37": 2164472832.0, + "38": 2164472832.0, + "39": 2164472832.0, + "40": 2164472832.0, + "41": 2164472832.0, + "42": 2164472832.0, + "43": 2164472832.0, + "44": 2164472832.0, + "45": 2164472832.0, + "46": 2164472832.0, + "47": 2164472832.0, + "48": 2164472832.0, + "49": 2164472832.0, + "50": 2164472832.0, + "51": 2164472832.0, + "52": 2164472832.0, + "53": 2164472832.0, + "54": 2164472832.0, + "55": 2164472832.0, + "56": 2164472832.0, + "57": 2164472832.0, + "58": 2164472832.0, + "59": 2164472832.0, + "60": 2164472832.0, + "61": 2164472832.0, + "62": 2164472832.0, + "63": 2164472832.0, + "64": 2164472832.0, + "65": 2164472832.0, + "66": 2164472832.0, + "67": 2164472832.0, + "68": 2164472832.0, + "69": 2164472832.0, + "70": 2164472832.0, + "71": 2164472832.0, + "72": 2164472832.0, + "73": 2164472832.0, + "74": 2164472832.0, + "75": 2164472832.0, + "76": 2164472832.0, + "77": 2164472832.0, + "78": 2164472832.0, + "79": 2164472832.0, + "80": 2164472832.0, + "81": 2164472832.0, + "82": 2164472832.0, + "83": 2164472832.0, + "84": 2164472832.0, + "85": 2164472832.0, + "86": 2164472832.0, + "87": 2164472832.0, + "88": 2164472832.0, + "89": 2164472832.0, + "90": 2164472832.0, + "91": 2164472832.0, + "92": 2164472832.0, + "93": 2164472832.0, + "94": 2164472832.0, + "95": 2164472832.0, + "96": 2164472832.0, + "97": 2164472832.0, + "98": 2164472832.0, + "99": 2164472832.0, + "100": 2164472832.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2413216256.0, + "2": 3345833472.0, + "3": 3345833472.0, + "4": 3345833472.0, + "5": 3345833472.0, + "6": 3345833472.0, + "7": 3345833472.0, + "8": 3345833472.0, + "9": 3345833472.0, + "10": 3345833472.0, + "11": 3345833472.0, + "12": 3345833472.0, + "13": 3345833472.0, + "14": 3345833472.0, + "15": 3345833472.0, + "16": 3345833472.0, + "17": 3345833472.0, + "18": 3345833472.0, + "19": 3345833472.0, + "20": 3345833472.0, + "21": 3345833472.0, + "22": 3345833472.0, + "23": 3345833472.0, + "24": 3345833472.0, + "25": 3345833472.0, + "26": 3345833472.0, + "27": 3345833472.0, + "28": 3345833472.0, + "29": 3345833472.0, + "30": 3345833472.0, + "31": 3345833472.0, + "32": 3345833472.0, + "33": 3345833472.0, + "34": 3345833472.0, + "35": 3345833472.0, + "36": 3345833472.0, + "37": 3345833472.0, + "38": 3345833472.0, + "39": 3345833472.0, + "40": 3345833472.0, + "41": 3345833472.0, + "42": 3345833472.0, + "43": 3345833472.0, + "44": 3345833472.0, + "45": 3345833472.0, + "46": 3345833472.0, + "47": 3345833472.0, + "48": 3345833472.0, + "49": 3345833472.0, + "50": 3345833472.0, + "51": 3345833472.0, + "52": 3345833472.0, + "53": 3345833472.0, + "54": 3345833472.0, + "55": 3345833472.0, + "56": 3345833472.0, + "57": 3345833472.0, + "58": 3345833472.0, + "59": 3345833472.0, + "60": 3345833472.0, + "61": 3345833472.0, + "62": 3345833472.0, + "63": 3345833472.0, + "64": 3345833472.0, + "65": 3345833472.0, + "66": 3345833472.0, + "67": 3345833472.0, + "68": 3345833472.0, + "69": 3345833472.0, + "70": 3345833472.0, + "71": 3345833472.0, + "72": 3345833472.0, + "73": 3345833472.0, + "74": 3345833472.0, + "75": 3345833472.0, + "76": 3345833472.0, + "77": 3345833472.0, + "78": 3345833472.0, + "79": 3345833472.0, + "80": 3345833472.0, + "81": 3345833472.0, + "82": 3345833472.0, + "83": 3345833472.0, + "84": 3345833472.0, + "85": 3345833472.0, + "86": 3345833472.0, + "87": 3345833472.0, + "88": 3345833472.0, + "89": 3345833472.0, + "90": 3345833472.0, + "91": 3345833472.0, + "92": 3345833472.0, + "93": 3345833472.0, + "94": 3345833472.0, + "95": 3345833472.0, + "96": 3345833472.0, + "97": 3345833472.0, + "98": 3345833472.0, + "99": 3345833472.0, + "100": 3345833472.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.41599, + "2": 0.55044, + "3": 0.4601, + "4": 0.46093, + "5": 0.45888, + "6": 0.46549, + "7": 0.46196, + "8": 0.46392, + "9": 0.46142, + "10": 0.46273, + "11": 0.46181, + "12": 0.53125, + "13": 0.51435, + "14": 0.47772, + "15": 0.47916, + "16": 0.47028, + "17": 0.46912, + "18": 0.47611, + "19": 0.48447, + "20": 0.47544, + "21": 0.47048, + "22": 0.47872, + "23": 0.47823, + "24": 0.48021, + "25": 0.46999, + "26": 0.4776, + "27": 0.47549, + "28": 0.47983, + "29": 0.47292, + "30": 0.47463, + "31": 0.82354, + "32": 0.9356, + "33": 0.47582, + "34": 0.47311, + "35": 0.4737, + "36": 0.49142, + "37": 0.4757, + "38": 0.46626, + "39": 0.48967, + "40": 0.46469, + "41": 0.8495, + "42": 0.46682, + "43": 0.46339, + "44": 0.464, + "45": 0.46339, + "46": 0.4651, + "47": 0.46486, + "48": 0.7679, + "49": 0.82614, + "50": 0.46574, + "51": 0.81746, + "52": 0.80226, + "53": 0.46381, + "54": 0.51852, + "55": 0.46533, + "56": 0.46349, + "57": 0.46462, + "58": 0.46325, + "59": 0.46221, + "60": 0.98653, + "61": 0.46476, + "62": 0.46489, + "63": 0.4641, + "64": 0.46387, + "65": 0.46447, + "66": 0.46497, + "67": 0.46419, + "68": 0.46372, + "69": 0.46378, + "70": 0.46549, + "71": 0.46682, + "72": 0.4674, + "73": 0.46459, + "74": 0.46681, + "75": 0.46573, + "76": 0.46408, + "77": 0.465, + "78": 0.46602, + "79": 0.49286, + "80": 0.46795, + "81": 0.46459, + "82": 0.46605, + "83": 0.46772, + "84": 0.4651, + "85": 0.4646, + "86": 0.46421, + "87": 0.46391, + "88": 0.46392, + "89": 0.4668, + "90": 0.46462, + "91": 0.46389, + "92": 0.46949, + "93": 0.46646, + "94": 0.46559, + "95": 0.46701, + "96": 0.46805, + "97": 0.46541, + "98": 0.46506, + "99": 0.46495, + "100": 0.46492 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..2482dd80c70 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38869, + "2": 10.39385, + "3": 9.78084, + "4": 9.59727, + "5": 9.38084, + "6": 9.40579, + "7": 9.30788, + "8": 9.24106, + "9": 9.12192, + "10": 9.05709, + "11": 8.87331, + "12": 8.7937, + "13": 8.84028, + "14": 8.68508, + "15": 8.65595, + "16": 8.54356, + "17": 8.50088, + "18": 8.39002, + "19": 8.36442, + "20": 8.26189, + "21": 8.27089, + "22": 8.14388, + "23": 8.07456, + "24": 8.11903, + "25": 7.98194, + "26": 8.08775, + "27": 7.87135, + "28": 7.96498, + "29": 7.80253, + "30": 7.86925, + "31": 7.81724, + "32": 7.68778, + "33": 7.78042, + "34": 7.55486, + "35": 7.66275, + "36": 7.52238, + "37": 7.44446, + "38": 7.50242, + "39": 7.45039, + "40": 7.5007, + "41": 7.39051, + "42": 7.36065, + "43": 7.43329, + "44": 7.3762, + "45": 7.34875, + "46": 7.28162, + "47": 7.46112, + "48": 7.28762, + "49": 7.35376, + "50": 7.18139, + "51": 7.36575, + "52": 7.1333, + "53": 7.11549, + "54": 7.22921, + "55": 7.15407, + "56": 7.22241, + "57": 7.32951, + "58": 7.02329, + "59": 7.11369, + "60": 7.14724, + "61": 7.11415, + "62": 7.24749, + "63": 7.15673, + "64": 7.08408, + "65": 6.99707, + "66": 7.06064, + "67": 7.04874, + "68": 7.14167, + "69": 7.0346, + "70": 7.06003, + "71": 6.92549, + "72": 7.00408, + "73": 6.97962, + "74": 6.92272, + "75": 7.0608, + "76": 6.97256, + "77": 7.08183, + "78": 7.01864, + "79": 6.8552, + "80": 6.94288, + "81": 6.97634, + "82": 7.06647, + "83": 6.99975, + "84": 7.00894, + "85": 6.85973, + "86": 7.03631, + "87": 6.98045, + "88": 6.91491, + "89": 6.81048, + "90": 7.24972, + "91": 6.71004, + "92": 7.04898, + "93": 6.90555, + "94": 7.06456, + "95": 6.84835, + "96": 6.97647, + "97": 6.9631, + "98": 6.88688, + "99": 7.01307, + "100": 6.9828 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43318.0, + "2": 44050.0, + "3": 44756.0, + "4": 42391.0, + "5": 45385.0, + "6": 40966.0, + "7": 43182.0, + "8": 45459.0, + "9": 42453.0, + "10": 45371.0, + "11": 43978.0, + "12": 44598.0, + "13": 43892.0, + "14": 46190.0, + "15": 43897.0, + "16": 41608.0, + "17": 43825.0, + "18": 44703.0, + "19": 42550.0, + "20": 44769.0, + "21": 44793.0, + "22": 41844.0, + "23": 45444.0, + "24": 43071.0, + "25": 42476.0, + "26": 43926.0, + "27": 46218.0, + "28": 46430.0, + "29": 46178.0, + "30": 43985.0, + "31": 41281.0, + "32": 43347.0, + "33": 45448.0, + "34": 43305.0, + "35": 43264.0, + "36": 42485.0, + "37": 40077.0, + "38": 42514.0, + "39": 44723.0, + "40": 43230.0, + "41": 44653.0, + "42": 43269.0, + "43": 45446.0, + "44": 44588.0, + "45": 43278.0, + "46": 43896.0, + "47": 42369.0, + "48": 44704.0, + "49": 43172.0, + "50": 43381.0, + "51": 41175.0, + "52": 43812.0, + "53": 43934.0, + "54": 41932.0, + "55": 43857.0, + "56": 43277.0, + "57": 42576.0, + "58": 43835.0, + "59": 44629.0, + "60": 41225.0, + "61": 39716.0, + "62": 44773.0, + "63": 44717.0, + "64": 45367.0, + "65": 44683.0, + "66": 45367.0, + "67": 43136.0, + "68": 42523.0, + "69": 43828.0, + "70": 45534.0, + "71": 43316.0, + "72": 44750.0, + "73": 45364.0, + "74": 42445.0, + "75": 44679.0, + "76": 43875.0, + "77": 42100.0, + "78": 40289.0, + "79": 38949.0, + "80": 41115.0, + "81": 45362.0, + "82": 43205.0, + "83": 38475.0, + "84": 42459.0, + "85": 44010.0, + "86": 45731.0, + "87": 40860.0, + "88": 41793.0, + "89": 41068.0, + "90": 44673.0, + "91": 46149.0, + "92": 41798.0, + "93": 43246.0, + "94": 39583.0, + "95": 44064.0, + "96": 44715.0, + "97": 45390.0, + "98": 41808.0, + "99": 45436.0, + "100": 42520.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2164472832.0, + "2": 2164472832.0, + "3": 2164472832.0, + "4": 2164472832.0, + "5": 2164472832.0, + "6": 2164472832.0, + "7": 2164472832.0, + "8": 2164472832.0, + "9": 2164472832.0, + "10": 2164472832.0, + "11": 2164472832.0, + "12": 2164472832.0, + "13": 2164472832.0, + "14": 2164472832.0, + "15": 2164472832.0, + "16": 2164472832.0, + "17": 2164472832.0, + "18": 2164472832.0, + "19": 2164472832.0, + "20": 2164472832.0, + "21": 2164472832.0, + "22": 2164472832.0, + "23": 2164472832.0, + "24": 2164472832.0, + "25": 2164472832.0, + "26": 2164472832.0, + "27": 2164472832.0, + "28": 2164472832.0, + "29": 2164472832.0, + "30": 2164472832.0, + "31": 2164472832.0, + "32": 2164472832.0, + "33": 2164472832.0, + "34": 2164472832.0, + "35": 2164472832.0, + "36": 2164472832.0, + "37": 2164472832.0, + "38": 2164472832.0, + "39": 2164472832.0, + "40": 2164472832.0, + "41": 2164472832.0, + "42": 2164472832.0, + "43": 2164472832.0, + "44": 2164472832.0, + "45": 2164472832.0, + "46": 2164472832.0, + "47": 2164472832.0, + "48": 2164472832.0, + "49": 2164472832.0, + "50": 2164472832.0, + "51": 2164472832.0, + "52": 2164472832.0, + "53": 2164472832.0, + "54": 2164472832.0, + "55": 2164472832.0, + "56": 2164472832.0, + "57": 2164472832.0, + "58": 2164472832.0, + "59": 2164472832.0, + "60": 2164472832.0, + "61": 2164472832.0, + "62": 2164472832.0, + "63": 2164472832.0, + "64": 2164472832.0, + "65": 2164472832.0, + "66": 2164472832.0, + "67": 2164472832.0, + "68": 2164472832.0, + "69": 2164472832.0, + "70": 2164472832.0, + "71": 2164472832.0, + "72": 2164472832.0, + "73": 2164472832.0, + "74": 2164472832.0, + "75": 2164472832.0, + "76": 2164472832.0, + "77": 2164472832.0, + "78": 2164472832.0, + "79": 2164472832.0, + "80": 2164472832.0, + "81": 2164472832.0, + "82": 2164472832.0, + "83": 2164472832.0, + "84": 2164472832.0, + "85": 2164472832.0, + "86": 2164472832.0, + "87": 2164472832.0, + "88": 2164472832.0, + "89": 2164472832.0, + "90": 2164472832.0, + "91": 2164472832.0, + "92": 2164472832.0, + "93": 2164472832.0, + "94": 2164472832.0, + "95": 2164472832.0, + "96": 2164472832.0, + "97": 2164472832.0, + "98": 2164472832.0, + "99": 2164472832.0, + "100": 2164472832.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2413216256.0, + "2": 3345833472.0, + "3": 3345833472.0, + "4": 3345833472.0, + "5": 3345833472.0, + "6": 3345833472.0, + "7": 3345833472.0, + "8": 3345833472.0, + "9": 3345833472.0, + "10": 3345833472.0, + "11": 3345833472.0, + "12": 3345833472.0, + "13": 3345833472.0, + "14": 3345833472.0, + "15": 3345833472.0, + "16": 3345833472.0, + "17": 3345833472.0, + "18": 3345833472.0, + "19": 3345833472.0, + "20": 3345833472.0, + "21": 3345833472.0, + "22": 3345833472.0, + "23": 3345833472.0, + "24": 3345833472.0, + "25": 3345833472.0, + "26": 3345833472.0, + "27": 3345833472.0, + "28": 3345833472.0, + "29": 3345833472.0, + "30": 3345833472.0, + "31": 3345833472.0, + "32": 3345833472.0, + "33": 3345833472.0, + "34": 3345833472.0, + "35": 3345833472.0, + "36": 3345833472.0, + "37": 3345833472.0, + "38": 3345833472.0, + "39": 3345833472.0, + "40": 3345833472.0, + "41": 3345833472.0, + "42": 3345833472.0, + "43": 3345833472.0, + "44": 3345833472.0, + "45": 3345833472.0, + "46": 3345833472.0, + "47": 3345833472.0, + "48": 3345833472.0, + "49": 3345833472.0, + "50": 3345833472.0, + "51": 3345833472.0, + "52": 3345833472.0, + "53": 3345833472.0, + "54": 3345833472.0, + "55": 3345833472.0, + "56": 3345833472.0, + "57": 3345833472.0, + "58": 3345833472.0, + "59": 3345833472.0, + "60": 3345833472.0, + "61": 3345833472.0, + "62": 3345833472.0, + "63": 3345833472.0, + "64": 3345833472.0, + "65": 3345833472.0, + "66": 3345833472.0, + "67": 3345833472.0, + "68": 3345833472.0, + "69": 3345833472.0, + "70": 3345833472.0, + "71": 3345833472.0, + "72": 3345833472.0, + "73": 3345833472.0, + "74": 3345833472.0, + "75": 3345833472.0, + "76": 3345833472.0, + "77": 3345833472.0, + "78": 3345833472.0, + "79": 3345833472.0, + "80": 3345833472.0, + "81": 3345833472.0, + "82": 3345833472.0, + "83": 3345833472.0, + "84": 3345833472.0, + "85": 3345833472.0, + "86": 3345833472.0, + "87": 3345833472.0, + "88": 3345833472.0, + "89": 3345833472.0, + "90": 3345833472.0, + "91": 3345833472.0, + "92": 3345833472.0, + "93": 3345833472.0, + "94": 3345833472.0, + "95": 3345833472.0, + "96": 3345833472.0, + "97": 3345833472.0, + "98": 3345833472.0, + "99": 3345833472.0, + "100": 3345833472.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.60644, + "2": 0.57986, + "3": 0.47823, + "4": 0.48281, + "5": 0.48093, + "6": 0.47347, + "7": 0.47326, + "8": 0.47378, + "9": 0.4723, + "10": 0.4709, + "11": 0.47371, + "12": 0.47257, + "13": 0.47211, + "14": 0.4725, + "15": 0.47332, + "16": 0.47413, + "17": 0.4746, + "18": 0.47281, + "19": 0.47707, + "20": 0.47306, + "21": 0.4732, + "22": 0.46995, + "23": 0.47593, + "24": 0.47349, + "25": 0.47467, + "26": 0.48697, + "27": 0.46764, + "28": 0.47083, + "29": 0.47011, + "30": 0.47001, + "31": 0.46787, + "32": 0.82338, + "33": 0.47926, + "34": 0.482, + "35": 0.46965, + "36": 0.4706, + "37": 0.93011, + "38": 0.80405, + "39": 0.47254, + "40": 0.47196, + "41": 0.82549, + "42": 0.47441, + "43": 0.47469, + "44": 0.47149, + "45": 0.47417, + "46": 0.47445, + "47": 0.47452, + "48": 0.47581, + "49": 0.47293, + "50": 0.47057, + "51": 0.94959, + "52": 0.47119, + "53": 0.4725, + "54": 0.47393, + "55": 0.47401, + "56": 0.47324, + "57": 0.47407, + "58": 0.4761, + "59": 0.47586, + "60": 0.47378, + "61": 0.4733, + "62": 0.4737, + "63": 0.47104, + "64": 0.47276, + "65": 0.47318, + "66": 0.89402, + "67": 0.47315, + "68": 0.4734, + "69": 0.4712, + "70": 0.47401, + "71": 0.47383, + "72": 0.47295, + "73": 0.47295, + "74": 0.47389, + "75": 0.47397, + "76": 0.47329, + "77": 0.47294, + "78": 0.47471, + "79": 0.47574, + "80": 0.4753, + "81": 0.47352, + "82": 0.47352, + "83": 0.47483, + "84": 0.78574, + "85": 0.47734, + "86": 0.48545, + "87": 0.4736, + "88": 1.03977, + "89": 0.47047, + "90": 0.47102, + "91": 0.47334, + "92": 0.47576, + "93": 0.4727, + "94": 0.47956, + "95": 0.47304, + "96": 0.47172, + "97": 0.47639, + "98": 0.47474, + "99": 0.47123, + "100": 0.47327 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..bda6217caaa --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.37205, + "2": 10.36993, + "3": 9.85245, + "4": 9.61997, + "5": 9.40867, + "6": 9.43219, + "7": 9.31484, + "8": 9.27336, + "9": 9.11412, + "10": 9.03968, + "11": 8.87198, + "12": 8.80862, + "13": 8.83469, + "14": 8.69021, + "15": 8.66221, + "16": 8.54816, + "17": 8.50088, + "18": 8.42516, + "19": 8.38808, + "20": 8.28073, + "21": 8.26592, + "22": 8.15988, + "23": 8.11241, + "24": 8.14271, + "25": 7.98425, + "26": 8.10594, + "27": 7.88954, + "28": 7.9705, + "29": 7.81272, + "30": 7.87636, + "31": 7.82505, + "32": 7.70262, + "33": 7.80169, + "34": 7.56872, + "35": 7.67373, + "36": 7.54686, + "37": 7.47401, + "38": 7.50726, + "39": 7.49794, + "40": 7.51081, + "41": 7.41055, + "42": 7.37984, + "43": 7.44091, + "44": 7.39372, + "45": 7.37241, + "46": 7.28404, + "47": 7.46627, + "48": 7.29038, + "49": 7.35015, + "50": 7.17193, + "51": 7.37002, + "52": 7.14463, + "53": 7.12651, + "54": 7.23742, + "55": 7.15579, + "56": 7.23152, + "57": 7.3354, + "58": 7.01365, + "59": 7.11427, + "60": 7.15124, + "61": 7.1088, + "62": 7.26824, + "63": 7.15182, + "64": 7.08401, + "65": 6.99127, + "66": 7.05305, + "67": 7.04353, + "68": 7.13973, + "69": 7.03243, + "70": 7.05831, + "71": 6.90378, + "72": 6.99805, + "73": 6.97678, + "74": 6.91757, + "75": 7.06665, + "76": 6.95719, + "77": 7.08701, + "78": 7.03266, + "79": 6.8532, + "80": 6.93633, + "81": 6.97582, + "82": 7.0624, + "83": 6.98226, + "84": 7.00923, + "85": 6.8507, + "86": 7.04663, + "87": 6.97947, + "88": 6.91093, + "89": 6.8168, + "90": 7.24561, + "91": 6.7048, + "92": 7.05407, + "93": 6.89399, + "94": 7.0542, + "95": 6.85047, + "96": 6.96463, + "97": 6.95624, + "98": 6.8829, + "99": 7.00419, + "100": 6.98982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43288.0, + "2": 44033.0, + "3": 44733.0, + "4": 42406.0, + "5": 45371.0, + "6": 40945.0, + "7": 43173.0, + "8": 45430.0, + "9": 42421.0, + "10": 45369.0, + "11": 43974.0, + "12": 44588.0, + "13": 43908.0, + "14": 46215.0, + "15": 43901.0, + "16": 41603.0, + "17": 43832.0, + "18": 44695.0, + "19": 42547.0, + "20": 44758.0, + "21": 44777.0, + "22": 41821.0, + "23": 45434.0, + "24": 43080.0, + "25": 42439.0, + "26": 43936.0, + "27": 46214.0, + "28": 46342.0, + "29": 46135.0, + "30": 43995.0, + "31": 41271.0, + "32": 43336.0, + "33": 45440.0, + "34": 43287.0, + "35": 43240.0, + "36": 42490.0, + "37": 40078.0, + "38": 42510.0, + "39": 44722.0, + "40": 43230.0, + "41": 44669.0, + "42": 43262.0, + "43": 45476.0, + "44": 44624.0, + "45": 43326.0, + "46": 43945.0, + "47": 42395.0, + "48": 44675.0, + "49": 43169.0, + "50": 43381.0, + "51": 41131.0, + "52": 43830.0, + "53": 43914.0, + "54": 42004.0, + "55": 43871.0, + "56": 43227.0, + "57": 42550.0, + "58": 43816.0, + "59": 44631.0, + "60": 41183.0, + "61": 39721.0, + "62": 44752.0, + "63": 44696.0, + "64": 45351.0, + "65": 44694.0, + "66": 45350.0, + "67": 43132.0, + "68": 42535.0, + "69": 43829.0, + "70": 45533.0, + "71": 43322.0, + "72": 44749.0, + "73": 45365.0, + "74": 42492.0, + "75": 44655.0, + "76": 43920.0, + "77": 42080.0, + "78": 40298.0, + "79": 38909.0, + "80": 41117.0, + "81": 45370.0, + "82": 43206.0, + "83": 38501.0, + "84": 42484.0, + "85": 43986.0, + "86": 45704.0, + "87": 40839.0, + "88": 41828.0, + "89": 41074.0, + "90": 44663.0, + "91": 46169.0, + "92": 41807.0, + "93": 43228.0, + "94": 39549.0, + "95": 44090.0, + "96": 44711.0, + "97": 45390.0, + "98": 41799.0, + "99": 45426.0, + "100": 42443.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2194357248.0, + "2": 2194357248.0, + "3": 2194357248.0, + "4": 2194357248.0, + "5": 2194357248.0, + "6": 2194357248.0, + "7": 2194357248.0, + "8": 2194357248.0, + "9": 2194357248.0, + "10": 2194357248.0, + "11": 2194357248.0, + "12": 2194357248.0, + "13": 2194357248.0, + "14": 2194357248.0, + "15": 2194357248.0, + "16": 2194357248.0, + "17": 2194357248.0, + "18": 2194357248.0, + "19": 2194357248.0, + "20": 2194357248.0, + "21": 2194357248.0, + "22": 2194357248.0, + "23": 2194357248.0, + "24": 2194357248.0, + "25": 2194357248.0, + "26": 2194357248.0, + "27": 2194357248.0, + "28": 2194357248.0, + "29": 2194357248.0, + "30": 2194357248.0, + "31": 2194357248.0, + "32": 2194357248.0, + "33": 2194357248.0, + "34": 2194357248.0, + "35": 2194357248.0, + "36": 2194357248.0, + "37": 2194357248.0, + "38": 2194357248.0, + "39": 2194357248.0, + "40": 2194357248.0, + "41": 2194357248.0, + "42": 2194357248.0, + "43": 2194357248.0, + "44": 2194357248.0, + "45": 2194357248.0, + "46": 2194357248.0, + "47": 2194357248.0, + "48": 2194357248.0, + "49": 2194357248.0, + "50": 2194357248.0, + "51": 2194357248.0, + "52": 2194357248.0, + "53": 2194357248.0, + "54": 2194357248.0, + "55": 2194357248.0, + "56": 2194357248.0, + "57": 2194357248.0, + "58": 2194357248.0, + "59": 2194357248.0, + "60": 2194357248.0, + "61": 2194357248.0, + "62": 2194357248.0, + "63": 2194357248.0, + "64": 2194357248.0, + "65": 2194357248.0, + "66": 2194357248.0, + "67": 2194357248.0, + "68": 2194357248.0, + "69": 2194357248.0, + "70": 2194357248.0, + "71": 2194357248.0, + "72": 2194357248.0, + "73": 2194357248.0, + "74": 2194357248.0, + "75": 2194357248.0, + "76": 2194357248.0, + "77": 2194357248.0, + "78": 2194357248.0, + "79": 2194357248.0, + "80": 2194357248.0, + "81": 2194357248.0, + "82": 2194357248.0, + "83": 2194357248.0, + "84": 2194357248.0, + "85": 2194357248.0, + "86": 2194357248.0, + "87": 2194357248.0, + "88": 2194357248.0, + "89": 2194357248.0, + "90": 2194357248.0, + "91": 2194357248.0, + "92": 2194357248.0, + "93": 2194357248.0, + "94": 2194357248.0, + "95": 2194357248.0, + "96": 2194357248.0, + "97": 2194357248.0, + "98": 2194357248.0, + "99": 2194357248.0, + "100": 2194357248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2443624960.0, + "2": 3375193600.0, + "3": 3375193600.0, + "4": 3375193600.0, + "5": 3375193600.0, + "6": 3375193600.0, + "7": 3375193600.0, + "8": 3375193600.0, + "9": 3375193600.0, + "10": 3375193600.0, + "11": 3375193600.0, + "12": 3375193600.0, + "13": 3375193600.0, + "14": 3375193600.0, + "15": 3375193600.0, + "16": 3375193600.0, + "17": 3375193600.0, + "18": 3375193600.0, + "19": 3375193600.0, + "20": 3375193600.0, + "21": 3375193600.0, + "22": 3375193600.0, + "23": 3375193600.0, + "24": 3375193600.0, + "25": 3375193600.0, + "26": 3375193600.0, + "27": 3375193600.0, + "28": 3375193600.0, + "29": 3375193600.0, + "30": 3375193600.0, + "31": 3375193600.0, + "32": 3375193600.0, + "33": 3375193600.0, + "34": 3375193600.0, + "35": 3375193600.0, + "36": 3375193600.0, + "37": 3375193600.0, + "38": 3375193600.0, + "39": 3375193600.0, + "40": 3375193600.0, + "41": 3375193600.0, + "42": 3375193600.0, + "43": 3375193600.0, + "44": 3375193600.0, + "45": 3375193600.0, + "46": 3375193600.0, + "47": 3375193600.0, + "48": 3375193600.0, + "49": 3375193600.0, + "50": 3375193600.0, + "51": 3375193600.0, + "52": 3375193600.0, + "53": 3375193600.0, + "54": 3375193600.0, + "55": 3375193600.0, + "56": 3375193600.0, + "57": 3375193600.0, + "58": 3375193600.0, + "59": 3375193600.0, + "60": 3375193600.0, + "61": 3375193600.0, + "62": 3375193600.0, + "63": 3375193600.0, + "64": 3375193600.0, + "65": 3375193600.0, + "66": 3375193600.0, + "67": 3375193600.0, + "68": 3375193600.0, + "69": 3375193600.0, + "70": 3375193600.0, + "71": 3375193600.0, + "72": 3375193600.0, + "73": 3375193600.0, + "74": 3375193600.0, + "75": 3375193600.0, + "76": 3375193600.0, + "77": 3375193600.0, + "78": 3375193600.0, + "79": 3375193600.0, + "80": 3375193600.0, + "81": 3375193600.0, + "82": 3375193600.0, + "83": 3375193600.0, + "84": 3375193600.0, + "85": 3375193600.0, + "86": 3375193600.0, + "87": 3375193600.0, + "88": 3375193600.0, + "89": 3375193600.0, + "90": 3375193600.0, + "91": 3375193600.0, + "92": 3375193600.0, + "93": 3375193600.0, + "94": 3375193600.0, + "95": 3375193600.0, + "96": 3375193600.0, + "97": 3375193600.0, + "98": 3375193600.0, + "99": 3375193600.0, + "100": 3375193600.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.22746, + "2": 0.38672, + "3": 0.30057, + "4": 0.29952, + "5": 0.29937, + "6": 0.29647, + "7": 0.29649, + "8": 0.29992, + "9": 0.29725, + "10": 0.29982, + "11": 0.29727, + "12": 0.3034, + "13": 0.29711, + "14": 0.29921, + "15": 0.2997, + "16": 0.29771, + "17": 0.29978, + "18": 0.30707, + "19": 0.30368, + "20": 0.30288, + "21": 0.30688, + "22": 0.30971, + "23": 0.29768, + "24": 0.30093, + "25": 0.30176, + "26": 0.30414, + "27": 0.29913, + "28": 0.29878, + "29": 0.29642, + "30": 0.3006, + "31": 0.30797, + "32": 0.30896, + "33": 0.30968, + "34": 0.3612, + "35": 0.30538, + "36": 0.30053, + "37": 0.59472, + "38": 0.30268, + "39": 0.306, + "40": 0.29983, + "41": 0.30255, + "42": 0.30761, + "43": 0.30015, + "44": 0.30214, + "45": 0.29904, + "46": 0.29871, + "47": 0.63098, + "48": 0.58973, + "49": 0.29989, + "50": 0.29759, + "51": 0.29699, + "52": 0.30117, + "53": 0.61374, + "54": 0.30194, + "55": 0.29408, + "56": 0.6341, + "57": 0.29608, + "58": 0.29787, + "59": 0.29707, + "60": 0.30154, + "61": 0.29779, + "62": 0.29855, + "63": 0.60825, + "64": 0.29897, + "65": 0.30635, + "66": 0.61882, + "67": 0.29871, + "68": 0.29693, + "69": 0.30148, + "70": 0.31212, + "71": 0.30211, + "72": 0.29679, + "73": 0.30078, + "74": 0.29883, + "75": 0.2978, + "76": 0.30303, + "77": 0.29772, + "78": 0.29776, + "79": 0.29689, + "80": 0.30425, + "81": 0.29967, + "82": 0.29825, + "83": 0.297, + "84": 0.30863, + "85": 0.30218, + "86": 0.30302, + "87": 0.30826, + "88": 0.30068, + "89": 0.29946, + "90": 0.60541, + "91": 0.30424, + "92": 0.30059, + "93": 0.30421, + "94": 0.30633, + "95": 0.29891, + "96": 0.35038, + "97": 0.29632, + "98": 0.29835, + "99": 0.29931, + "100": 0.30272 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..89582b25851 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.37205, + "2": 10.36993, + "3": 9.85245, + "4": 9.61997, + "5": 9.40867, + "6": 9.43219, + "7": 9.31484, + "8": 9.27336, + "9": 9.11412, + "10": 9.03968, + "11": 8.87198, + "12": 8.80862, + "13": 8.83469, + "14": 8.69021, + "15": 8.66221, + "16": 8.54816, + "17": 8.50088, + "18": 8.42516, + "19": 8.38808, + "20": 8.28073, + "21": 8.26592, + "22": 8.15988, + "23": 8.11241, + "24": 8.14271, + "25": 7.98425, + "26": 8.10594, + "27": 7.88954, + "28": 7.9705, + "29": 7.81272, + "30": 7.87636, + "31": 7.82505, + "32": 7.70262, + "33": 7.80169, + "34": 7.56872, + "35": 7.67373, + "36": 7.54686, + "37": 7.47401, + "38": 7.50726, + "39": 7.49794, + "40": 7.51081, + "41": 7.41055, + "42": 7.37984, + "43": 7.44091, + "44": 7.39372, + "45": 7.37241, + "46": 7.28404, + "47": 7.46627, + "48": 7.29038, + "49": 7.35015, + "50": 7.17193, + "51": 7.37002, + "52": 7.14463, + "53": 7.12651, + "54": 7.23742, + "55": 7.15579, + "56": 7.23152, + "57": 7.3354, + "58": 7.01365, + "59": 7.11427, + "60": 7.15124, + "61": 7.1088, + "62": 7.26824, + "63": 7.15182, + "64": 7.08401, + "65": 6.99127, + "66": 7.05305, + "67": 7.04353, + "68": 7.13973, + "69": 7.03243, + "70": 7.05831, + "71": 6.90378, + "72": 6.99805, + "73": 6.97678, + "74": 6.91757, + "75": 7.06665, + "76": 6.95719, + "77": 7.08701, + "78": 7.03266, + "79": 6.8532, + "80": 6.93633, + "81": 6.97582, + "82": 7.0624, + "83": 6.98226, + "84": 7.00923, + "85": 6.8507, + "86": 7.04663, + "87": 6.97947, + "88": 6.91093, + "89": 6.8168, + "90": 7.24561, + "91": 6.7048, + "92": 7.05407, + "93": 6.89399, + "94": 7.0542, + "95": 6.85047, + "96": 6.96463, + "97": 6.95624, + "98": 6.8829, + "99": 7.00419, + "100": 6.98982 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43288.0, + "2": 44033.0, + "3": 44733.0, + "4": 42406.0, + "5": 45371.0, + "6": 40945.0, + "7": 43173.0, + "8": 45430.0, + "9": 42421.0, + "10": 45369.0, + "11": 43974.0, + "12": 44588.0, + "13": 43908.0, + "14": 46215.0, + "15": 43901.0, + "16": 41603.0, + "17": 43832.0, + "18": 44695.0, + "19": 42547.0, + "20": 44758.0, + "21": 44777.0, + "22": 41821.0, + "23": 45434.0, + "24": 43080.0, + "25": 42439.0, + "26": 43936.0, + "27": 46214.0, + "28": 46342.0, + "29": 46135.0, + "30": 43995.0, + "31": 41271.0, + "32": 43336.0, + "33": 45440.0, + "34": 43287.0, + "35": 43240.0, + "36": 42490.0, + "37": 40078.0, + "38": 42510.0, + "39": 44722.0, + "40": 43230.0, + "41": 44669.0, + "42": 43262.0, + "43": 45476.0, + "44": 44624.0, + "45": 43326.0, + "46": 43945.0, + "47": 42395.0, + "48": 44675.0, + "49": 43169.0, + "50": 43381.0, + "51": 41131.0, + "52": 43830.0, + "53": 43914.0, + "54": 42004.0, + "55": 43871.0, + "56": 43227.0, + "57": 42550.0, + "58": 43816.0, + "59": 44631.0, + "60": 41183.0, + "61": 39721.0, + "62": 44752.0, + "63": 44696.0, + "64": 45351.0, + "65": 44694.0, + "66": 45350.0, + "67": 43132.0, + "68": 42535.0, + "69": 43829.0, + "70": 45533.0, + "71": 43322.0, + "72": 44749.0, + "73": 45365.0, + "74": 42492.0, + "75": 44655.0, + "76": 43920.0, + "77": 42080.0, + "78": 40298.0, + "79": 38909.0, + "80": 41117.0, + "81": 45370.0, + "82": 43206.0, + "83": 38501.0, + "84": 42484.0, + "85": 43986.0, + "86": 45704.0, + "87": 40839.0, + "88": 41828.0, + "89": 41074.0, + "90": 44663.0, + "91": 46169.0, + "92": 41807.0, + "93": 43228.0, + "94": 39549.0, + "95": 44090.0, + "96": 44711.0, + "97": 45390.0, + "98": 41799.0, + "99": 45426.0, + "100": 42443.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2194357248.0, + "2": 2194357248.0, + "3": 2194357248.0, + "4": 2194357248.0, + "5": 2194357248.0, + "6": 2194357248.0, + "7": 2194357248.0, + "8": 2194357248.0, + "9": 2194357248.0, + "10": 2194357248.0, + "11": 2194357248.0, + "12": 2194357248.0, + "13": 2194357248.0, + "14": 2194357248.0, + "15": 2194357248.0, + "16": 2194357248.0, + "17": 2194357248.0, + "18": 2194357248.0, + "19": 2194357248.0, + "20": 2194357248.0, + "21": 2194357248.0, + "22": 2194357248.0, + "23": 2194357248.0, + "24": 2194357248.0, + "25": 2194357248.0, + "26": 2194357248.0, + "27": 2194357248.0, + "28": 2194357248.0, + "29": 2194357248.0, + "30": 2194357248.0, + "31": 2194357248.0, + "32": 2194357248.0, + "33": 2194357248.0, + "34": 2194357248.0, + "35": 2194357248.0, + "36": 2194357248.0, + "37": 2194357248.0, + "38": 2194357248.0, + "39": 2194357248.0, + "40": 2194357248.0, + "41": 2194357248.0, + "42": 2194357248.0, + "43": 2194357248.0, + "44": 2194357248.0, + "45": 2194357248.0, + "46": 2194357248.0, + "47": 2194357248.0, + "48": 2194357248.0, + "49": 2194357248.0, + "50": 2194357248.0, + "51": 2194357248.0, + "52": 2194357248.0, + "53": 2194357248.0, + "54": 2194357248.0, + "55": 2194357248.0, + "56": 2194357248.0, + "57": 2194357248.0, + "58": 2194357248.0, + "59": 2194357248.0, + "60": 2194357248.0, + "61": 2194357248.0, + "62": 2194357248.0, + "63": 2194357248.0, + "64": 2194357248.0, + "65": 2194357248.0, + "66": 2194357248.0, + "67": 2194357248.0, + "68": 2194357248.0, + "69": 2194357248.0, + "70": 2194357248.0, + "71": 2194357248.0, + "72": 2194357248.0, + "73": 2194357248.0, + "74": 2194357248.0, + "75": 2194357248.0, + "76": 2194357248.0, + "77": 2194357248.0, + "78": 2194357248.0, + "79": 2194357248.0, + "80": 2194357248.0, + "81": 2194357248.0, + "82": 2194357248.0, + "83": 2194357248.0, + "84": 2194357248.0, + "85": 2194357248.0, + "86": 2194357248.0, + "87": 2194357248.0, + "88": 2194357248.0, + "89": 2194357248.0, + "90": 2194357248.0, + "91": 2194357248.0, + "92": 2194357248.0, + "93": 2194357248.0, + "94": 2194357248.0, + "95": 2194357248.0, + "96": 2194357248.0, + "97": 2194357248.0, + "98": 2194357248.0, + "99": 2194357248.0, + "100": 2194357248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2443624960.0, + "2": 3375193600.0, + "3": 3375193600.0, + "4": 3375193600.0, + "5": 3375193600.0, + "6": 3375193600.0, + "7": 3375193600.0, + "8": 3375193600.0, + "9": 3375193600.0, + "10": 3375193600.0, + "11": 3375193600.0, + "12": 3375193600.0, + "13": 3375193600.0, + "14": 3375193600.0, + "15": 3375193600.0, + "16": 3375193600.0, + "17": 3375193600.0, + "18": 3375193600.0, + "19": 3375193600.0, + "20": 3375193600.0, + "21": 3375193600.0, + "22": 3375193600.0, + "23": 3375193600.0, + "24": 3375193600.0, + "25": 3375193600.0, + "26": 3375193600.0, + "27": 3375193600.0, + "28": 3375193600.0, + "29": 3375193600.0, + "30": 3375193600.0, + "31": 3375193600.0, + "32": 3375193600.0, + "33": 3375193600.0, + "34": 3375193600.0, + "35": 3375193600.0, + "36": 3375193600.0, + "37": 3375193600.0, + "38": 3375193600.0, + "39": 3375193600.0, + "40": 3375193600.0, + "41": 3375193600.0, + "42": 3375193600.0, + "43": 3375193600.0, + "44": 3375193600.0, + "45": 3375193600.0, + "46": 3375193600.0, + "47": 3375193600.0, + "48": 3375193600.0, + "49": 3375193600.0, + "50": 3375193600.0, + "51": 3375193600.0, + "52": 3375193600.0, + "53": 3375193600.0, + "54": 3375193600.0, + "55": 3375193600.0, + "56": 3375193600.0, + "57": 3375193600.0, + "58": 3375193600.0, + "59": 3375193600.0, + "60": 3375193600.0, + "61": 3375193600.0, + "62": 3375193600.0, + "63": 3375193600.0, + "64": 3375193600.0, + "65": 3375193600.0, + "66": 3375193600.0, + "67": 3375193600.0, + "68": 3375193600.0, + "69": 3375193600.0, + "70": 3375193600.0, + "71": 3375193600.0, + "72": 3375193600.0, + "73": 3375193600.0, + "74": 3375193600.0, + "75": 3375193600.0, + "76": 3375193600.0, + "77": 3375193600.0, + "78": 3375193600.0, + "79": 3375193600.0, + "80": 3375193600.0, + "81": 3375193600.0, + "82": 3375193600.0, + "83": 3375193600.0, + "84": 3375193600.0, + "85": 3375193600.0, + "86": 3375193600.0, + "87": 3375193600.0, + "88": 3375193600.0, + "89": 3375193600.0, + "90": 3375193600.0, + "91": 3375193600.0, + "92": 3375193600.0, + "93": 3375193600.0, + "94": 3375193600.0, + "95": 3375193600.0, + "96": 3375193600.0, + "97": 3375193600.0, + "98": 3375193600.0, + "99": 3375193600.0, + "100": 3375193600.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.37156, + "2": 0.38887, + "3": 0.36602, + "4": 0.35866, + "5": 0.36165, + "6": 0.37465, + "7": 0.35731, + "8": 0.3641, + "9": 0.35988, + "10": 0.35622, + "11": 0.36397, + "12": 0.36059, + "13": 0.35322, + "14": 0.36378, + "15": 0.35044, + "16": 0.351, + "17": 0.3614, + "18": 0.3499, + "19": 0.3502, + "20": 0.35899, + "21": 0.34832, + "22": 0.35463, + "23": 0.36264, + "24": 0.3582, + "25": 0.68028, + "26": 0.35807, + "27": 0.36086, + "28": 0.3546, + "29": 0.35008, + "30": 0.36639, + "31": 0.35917, + "32": 0.35093, + "33": 0.42545, + "34": 0.36458, + "35": 0.36139, + "36": 0.66018, + "37": 0.36179, + "38": 0.35264, + "39": 0.35347, + "40": 0.35947, + "41": 0.65933, + "42": 0.36488, + "43": 0.35596, + "44": 0.35639, + "45": 0.35817, + "46": 0.35914, + "47": 0.65482, + "48": 0.35543, + "49": 0.3548, + "50": 0.36559, + "51": 0.3585, + "52": 0.35668, + "53": 0.3592, + "54": 0.35503, + "55": 0.36108, + "56": 0.74128, + "57": 0.36657, + "58": 0.36018, + "59": 0.35608, + "60": 0.36593, + "61": 0.35388, + "62": 0.35617, + "63": 0.63145, + "64": 0.35737, + "65": 0.36509, + "66": 0.35793, + "67": 0.36215, + "68": 0.35502, + "69": 0.35608, + "70": 0.36406, + "71": 0.35939, + "72": 0.36012, + "73": 0.36102, + "74": 0.35997, + "75": 0.35821, + "76": 0.36372, + "77": 0.36015, + "78": 0.36089, + "79": 0.3626, + "80": 0.36632, + "81": 0.36481, + "82": 0.38444, + "83": 0.36154, + "84": 0.37204, + "85": 0.35784, + "86": 0.35591, + "87": 0.36678, + "88": 0.73353, + "89": 0.36867, + "90": 0.36231, + "91": 0.36826, + "92": 0.35945, + "93": 0.36394, + "94": 0.43835, + "95": 0.36152, + "96": 0.36154, + "97": 0.35778, + "98": 0.35857, + "99": 0.36061, + "100": 0.35857 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json index d92e66d3e29..a2d102b7a2b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.36406, + "2": 10.37672, + "3": 9.84285, + "4": 9.61995, "5": 9.4049, + "6": 9.42891, + "7": 9.31288, + "8": 9.27047, + "9": 9.10629, "10": 9.03569, + "11": 8.86423, + "12": 8.80988, + "13": 8.8329, + "14": 8.69011, "15": 8.66187, + "16": 8.54768, + "17": 8.50183, + "18": 8.42362, + "19": 8.38674, "20": 8.27993, + "21": 8.26472, + "22": 8.15738, + "23": 8.11148, + "24": 8.14234, "25": 7.98343, + "26": 8.10636, + "27": 7.88853, + "28": 7.97024, + "29": 7.8121, "30": 7.87698, + "31": 7.82339, + "32": 7.70086, + "33": 7.80317, + "34": 7.56843, "35": 7.67276, + "36": 7.54942, + "37": 7.475, + "38": 7.51068, + "39": 7.49979, "40": 7.51131, + "41": 7.41252, + "42": 7.38333, + "43": 7.4414, + "44": 7.39857, "45": 7.37352, + "46": 7.28824, + "47": 7.4683, + "48": 7.29457, + "49": 7.35181, "50": 7.17223, + "51": 7.37216, + "52": 7.14588, + "53": 7.12384, + "54": 7.23984, "55": 7.15454, + "56": 7.23308, + "57": 7.33501, + "58": 7.01226, + "59": 7.12063, "60": 7.15043, + "61": 7.11076, + "62": 7.26458, + "63": 7.1544, + "64": 7.08651, "65": 6.99077, + "66": 7.05503, + "67": 7.04463, + "68": 7.136, + "69": 7.03404, "70": 7.05994, + "71": 6.90146, + "72": 6.99845, + "73": 6.97783, + "74": 6.92205, "75": 7.06268, + "76": 6.95612, + "77": 7.08838, + "78": 7.02608, + "79": 6.85354, "80": 6.93543, + "81": 6.97396, + "82": 7.05854, + "83": 6.98003, + "84": 7.00602, "85": 6.84771, + "86": 7.04197, + "87": 6.97366, + "88": 6.90817, + "89": 6.80902, "90": 7.23999, + "91": 6.70221, + "92": 7.0543, + "93": 6.89332, + "94": 7.05002, "95": 6.84547, + "96": 6.96202, + "97": 6.95355, + "98": 6.8731, + "99": 6.99831, "100": 6.98508 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43317.0, + "2": 44065.0, + "3": 44730.0, + "4": 42374.0, "5": 45387.0, + "6": 40937.0, + "7": 43166.0, + "8": 45433.0, + "9": 42439.0, "10": 45374.0, + "11": 43947.0, + "12": 44584.0, + "13": 43908.0, + "14": 46205.0, "15": 43901.0, + "16": 41607.0, + "17": 43831.0, + "18": 44698.0, + "19": 42543.0, "20": 44759.0, + "21": 44734.0, + "22": 41850.0, + "23": 45416.0, + "24": 43069.0, "25": 42442.0, + "26": 43923.0, + "27": 46212.0, + "28": 46362.0, + "29": 46133.0, "30": 43978.0, + "31": 41220.0, + "32": 43307.0, + "33": 45440.0, + "34": 43284.0, "35": 43248.0, + "36": 42437.0, + "37": 40066.0, + "38": 42483.0, + "39": 44702.0, "40": 43230.0, + "41": 44672.0, + "42": 43202.0, + "43": 45459.0, + "44": 44609.0, "45": 43265.0, + "46": 43915.0, + "47": 42366.0, + "48": 44650.0, + "49": 43139.0, "50": 43399.0, + "51": 41159.0, + "52": 43818.0, + "53": 43924.0, + "54": 41952.0, "55": 43866.0, + "56": 43239.0, + "57": 42540.0, + "58": 43856.0, + "59": 44589.0, "60": 41152.0, + "61": 39709.0, + "62": 44822.0, + "63": 44663.0, + "64": 45372.0, "65": 44676.0, + "66": 45345.0, + "67": 43130.0, + "68": 42567.0, + "69": 43812.0, "70": 45538.0, + "71": 43282.0, + "72": 44765.0, + "73": 45354.0, + "74": 42517.0, "75": 44666.0, + "76": 43904.0, + "77": 42041.0, + "78": 40320.0, + "79": 38914.0, "80": 41081.0, + "81": 45333.0, + "82": 43195.0, + "83": 38489.0, + "84": 42436.0, "85": 43978.0, + "86": 45680.0, + "87": 40832.0, + "88": 41797.0, + "89": 41083.0, "90": 44676.0, + "91": 46190.0, + "92": 41837.0, + "93": 43234.0, + "94": 39504.0, "95": 44067.0, + "96": 44684.0, + "97": 45419.0, + "98": 41854.0, + "99": 45431.0, "100": 42479.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2195405824.0, + "2": 2195405824.0, + "3": 2195405824.0, + "4": 2195405824.0, "5": 2195405824.0, + "6": 2195405824.0, + "7": 2195405824.0, + "8": 2195405824.0, + "9": 2195405824.0, "10": 2195405824.0, + "11": 2195405824.0, + "12": 2195405824.0, + "13": 2195405824.0, + "14": 2195405824.0, "15": 2195405824.0, + "16": 2195405824.0, + "17": 2195405824.0, + "18": 2195405824.0, + "19": 2195405824.0, "20": 2195405824.0, + "21": 2195405824.0, + "22": 2195405824.0, + "23": 2195405824.0, + "24": 2195405824.0, "25": 2195405824.0, + "26": 2195405824.0, + "27": 2195405824.0, + "28": 2195405824.0, + "29": 2195405824.0, "30": 2195405824.0, + "31": 2195405824.0, + "32": 2195405824.0, + "33": 2195405824.0, + "34": 2195405824.0, "35": 2195405824.0, + "36": 2195405824.0, + "37": 2195405824.0, + "38": 2195405824.0, + "39": 2195405824.0, "40": 2195405824.0, + "41": 2195405824.0, + "42": 2195405824.0, + "43": 2195405824.0, + "44": 2195405824.0, "45": 2195405824.0, + "46": 2195405824.0, + "47": 2195405824.0, + "48": 2195405824.0, + "49": 2195405824.0, "50": 2195405824.0, + "51": 2195405824.0, + "52": 2195405824.0, + "53": 2195405824.0, + "54": 2195405824.0, "55": 2195405824.0, + "56": 2195405824.0, + "57": 2195405824.0, + "58": 2195405824.0, + "59": 2195405824.0, "60": 2195405824.0, + "61": 2195405824.0, + "62": 2195405824.0, + "63": 2195405824.0, + "64": 2195405824.0, "65": 2195405824.0, + "66": 2195405824.0, + "67": 2195405824.0, + "68": 2195405824.0, + "69": 2195405824.0, "70": 2195405824.0, + "71": 2195405824.0, + "72": 2195405824.0, + "73": 2195405824.0, + "74": 2195405824.0, "75": 2195405824.0, + "76": 2195405824.0, + "77": 2195405824.0, + "78": 2195405824.0, + "79": 2195405824.0, "80": 2195405824.0, + "81": 2195405824.0, + "82": 2195405824.0, + "83": 2195405824.0, + "84": 2195405824.0, "85": 2195405824.0, + "86": 2195405824.0, + "87": 2195405824.0, + "88": 2195405824.0, + "89": 2195405824.0, "90": 2195405824.0, + "91": 2195405824.0, + "92": 2195405824.0, + "93": 2195405824.0, + "94": 2195405824.0, "95": 2195405824.0, + "96": 2195405824.0, + "97": 2195405824.0, + "98": 2195405824.0, + "99": 2195405824.0, "100": 2195405824.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2302114304.0, + "2": 3236697600.0, + "3": 3236697600.0, + "4": 3236697600.0, "5": 3236697600.0, + "6": 3236697600.0, + "7": 3236697600.0, + "8": 3236697600.0, + "9": 3236697600.0, "10": 3236697600.0, + "11": 3236697600.0, + "12": 3236697600.0, + "13": 3236697600.0, + "14": 3236697600.0, "15": 3236697600.0, + "16": 3236697600.0, + "17": 3236697600.0, + "18": 3236697600.0, + "19": 3236697600.0, "20": 3236697600.0, + "21": 3236697600.0, + "22": 3236697600.0, + "23": 3236697600.0, + "24": 3236697600.0, "25": 3236697600.0, + "26": 3236697600.0, + "27": 3236697600.0, + "28": 3236697600.0, + "29": 3236697600.0, "30": 3236697600.0, + "31": 3236697600.0, + "32": 3236697600.0, + "33": 3236697600.0, + "34": 3236697600.0, "35": 3236697600.0, + "36": 3236697600.0, + "37": 3236697600.0, + "38": 3236697600.0, + "39": 3236697600.0, "40": 3236697600.0, + "41": 3236697600.0, + "42": 3236697600.0, + "43": 3236697600.0, + "44": 3236697600.0, "45": 3236697600.0, + "46": 3236697600.0, + "47": 3236697600.0, + "48": 3236697600.0, + "49": 3236697600.0, "50": 3236697600.0, + "51": 3236697600.0, + "52": 3236697600.0, + "53": 3236697600.0, + "54": 3236697600.0, "55": 3236697600.0, + "56": 3236697600.0, + "57": 3236697600.0, + "58": 3236697600.0, + "59": 3236697600.0, "60": 3236697600.0, + "61": 3236697600.0, + "62": 3236697600.0, + "63": 3236697600.0, + "64": 3236697600.0, "65": 3236697600.0, + "66": 3236697600.0, + "67": 3236697600.0, + "68": 3236697600.0, + "69": 3236697600.0, "70": 3236697600.0, + "71": 3236697600.0, + "72": 3236697600.0, + "73": 3236697600.0, + "74": 3236697600.0, "75": 3236697600.0, + "76": 3236697600.0, + "77": 3236697600.0, + "78": 3236697600.0, + "79": 3236697600.0, "80": 3236697600.0, + "81": 3236697600.0, + "82": 3236697600.0, + "83": 3236697600.0, + "84": 3236697600.0, "85": 3236697600.0, + "86": 3236697600.0, + "87": 3236697600.0, + "88": 3236697600.0, + "89": 3236697600.0, "90": 3236697600.0, + "91": 3236697600.0, + "92": 3236697600.0, + "93": 3236697600.0, + "94": 3236697600.0, "95": 3236697600.0, + "96": 3236697600.0, + "97": 3236697600.0, + "98": 3236697600.0, + "99": 3236697600.0, "100": 3236697600.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 8.09413, - "5": 0.31937, - "10": 0.3209, - "15": 0.34398, - "20": 0.33703, - "25": 0.33879, - "30": 0.32402, - "35": 0.32278, - "40": 0.32002, - "45": 0.31746, - "50": 0.3177, - "55": 0.31702, - "60": 0.31688, - "65": 0.35512, - "70": 0.32025, - "75": 0.32573, - "80": 0.32598, - "85": 0.32473, - "90": 0.31989, - "95": 0.32153, - "100": 0.33062 + "1": 9.77057, + "2": 0.47803, + "3": 0.39521, + "4": 0.3896, + "5": 0.40677, + "6": 0.40092, + "7": 0.37896, + "8": 0.41825, + "9": 0.38419, + "10": 0.38253, + "11": 0.388, + "12": 0.37925, + "13": 0.38239, + "14": 0.38417, + "15": 0.38038, + "16": 0.38563, + "17": 0.37955, + "18": 0.37924, + "19": 0.38589, + "20": 0.38224, + "21": 0.38465, + "22": 0.39351, + "23": 0.39472, + "24": 0.41255, + "25": 0.37965, + "26": 0.38355, + "27": 0.38309, + "28": 0.38253, + "29": 0.38831, + "30": 0.39434, + "31": 0.38798, + "32": 0.39078, + "33": 0.38911, + "34": 0.39627, + "35": 0.39394, + "36": 0.38355, + "37": 0.39453, + "38": 0.39933, + "39": 0.77019, + "40": 0.39504, + "41": 0.39035, + "42": 0.38272, + "43": 0.69367, + "44": 0.38983, + "45": 0.38622, + "46": 0.39091, + "47": 0.38234, + "48": 0.40833, + "49": 0.39525, + "50": 0.39478, + "51": 0.38185, + "52": 0.72146, + "53": 0.71311, + "54": 0.39457, + "55": 0.38277, + "56": 0.38969, + "57": 0.38363, + "58": 0.39928, + "59": 0.38579, + "60": 0.74396, + "61": 0.38508, + "62": 0.70202, + "63": 0.38295, + "64": 0.38027, + "65": 0.38758, + "66": 0.38184, + "67": 0.38386, + "68": 0.39654, + "69": 0.4087, + "70": 0.38668, + "71": 0.38146, + "72": 0.3836, + "73": 0.38965, + "74": 0.38207, + "75": 0.39256, + "76": 0.38363, + "77": 0.38092, + "78": 0.39131, + "79": 0.38231, + "80": 0.38962, + "81": 0.39663, + "82": 0.3956, + "83": 0.38416, + "84": 0.38159, + "85": 0.40841, + "86": 0.40201, + "87": 0.37934, + "88": 0.38888, + "89": 0.38181, + "90": 0.38763, + "91": 0.38558, + "92": 0.3862, + "93": 0.39397, + "94": 0.39231, + "95": 0.38616, + "96": 0.39411, + "97": 0.39063, + "98": 0.39664, + "99": 0.39039, + "100": 0.38619 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..42f8893c04e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38736, + "2": 10.37971, + "3": 9.79428, + "4": 9.59941, + "5": 9.38281, + "6": 9.40765, + "7": 9.31116, + "8": 9.25004, + "9": 9.1304, + "10": 9.06783, + "11": 8.89519, + "12": 8.8149, + "13": 8.82749, + "14": 8.69768, + "15": 8.65706, + "16": 8.54479, + "17": 8.50168, + "18": 8.39069, + "19": 8.36692, + "20": 8.26603, + "21": 8.27533, + "22": 8.14757, + "23": 8.0735, + "24": 8.12127, + "25": 7.98158, + "26": 8.09181, + "27": 7.87361, + "28": 7.96832, + "29": 7.80579, + "30": 7.87182, + "31": 7.818, + "32": 7.69078, + "33": 7.7864, + "34": 7.55667, + "35": 7.66308, + "36": 7.52559, + "37": 7.44779, + "38": 7.50335, + "39": 7.45281, + "40": 7.50499, + "41": 7.38901, + "42": 7.36263, + "43": 7.43543, + "44": 7.37578, + "45": 7.3523, + "46": 7.2817, + "47": 7.46121, + "48": 7.29037, + "49": 7.35179, + "50": 7.17986, + "51": 7.36821, + "52": 7.13332, + "53": 7.11532, + "54": 7.23214, + "55": 7.15383, + "56": 7.22184, + "57": 7.33328, + "58": 7.02116, + "59": 7.11467, + "60": 7.14998, + "61": 7.1117, + "62": 7.25117, + "63": 7.15586, + "64": 7.08539, + "65": 6.99542, + "66": 7.05924, + "67": 7.04804, + "68": 7.13906, + "69": 7.03428, + "70": 7.0643, + "71": 6.9218, + "72": 7.00511, + "73": 6.97917, + "74": 6.92066, + "75": 7.06414, + "76": 6.97532, + "77": 7.0837, + "78": 7.01986, + "79": 6.86115, + "80": 6.94493, + "81": 6.97847, + "82": 7.06834, + "83": 6.99434, + "84": 7.01114, + "85": 6.8595, + "86": 7.04211, + "87": 6.98111, + "88": 6.91353, + "89": 6.81096, + "90": 7.25918, + "91": 6.71195, + "92": 7.05431, + "93": 6.91084, + "94": 7.06872, + "95": 6.84927, + "96": 6.98126, + "97": 6.96743, + "98": 6.89421, + "99": 7.0152, + "100": 6.99082 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43296.0, + "2": 44067.0, + "3": 44759.0, + "4": 42367.0, + "5": 45373.0, + "6": 40966.0, + "7": 43147.0, + "8": 45448.0, + "9": 42470.0, + "10": 45357.0, + "11": 43969.0, + "12": 44583.0, + "13": 43897.0, + "14": 46189.0, + "15": 43909.0, + "16": 41613.0, + "17": 43823.0, + "18": 44678.0, + "19": 42556.0, + "20": 44765.0, + "21": 44723.0, + "22": 41820.0, + "23": 45463.0, + "24": 43077.0, + "25": 42457.0, + "26": 43913.0, + "27": 46221.0, + "28": 46390.0, + "29": 46160.0, + "30": 43999.0, + "31": 41276.0, + "32": 43316.0, + "33": 45432.0, + "34": 43303.0, + "35": 43276.0, + "36": 42461.0, + "37": 40045.0, + "38": 42557.0, + "39": 44701.0, + "40": 43214.0, + "41": 44667.0, + "42": 43241.0, + "43": 45448.0, + "44": 44605.0, + "45": 43265.0, + "46": 43892.0, + "47": 42375.0, + "48": 44656.0, + "49": 43182.0, + "50": 43383.0, + "51": 41130.0, + "52": 43841.0, + "53": 43918.0, + "54": 41894.0, + "55": 43861.0, + "56": 43229.0, + "57": 42488.0, + "58": 43831.0, + "59": 44616.0, + "60": 41267.0, + "61": 39701.0, + "62": 44746.0, + "63": 44704.0, + "64": 45346.0, + "65": 44696.0, + "66": 45356.0, + "67": 43133.0, + "68": 42535.0, + "69": 43803.0, + "70": 45504.0, + "71": 43309.0, + "72": 44800.0, + "73": 45401.0, + "74": 42467.0, + "75": 44661.0, + "76": 43882.0, + "77": 42110.0, + "78": 40337.0, + "79": 38924.0, + "80": 41077.0, + "81": 45349.0, + "82": 43228.0, + "83": 38446.0, + "84": 42443.0, + "85": 43970.0, + "86": 45668.0, + "87": 40846.0, + "88": 41780.0, + "89": 41056.0, + "90": 44657.0, + "91": 46133.0, + "92": 41748.0, + "93": 43205.0, + "94": 39556.0, + "95": 44047.0, + "96": 44668.0, + "97": 45383.0, + "98": 41817.0, + "99": 45425.0, + "100": 42429.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2166438912.0, + "2": 2166438912.0, + "3": 2166438912.0, + "4": 2166438912.0, + "5": 2166438912.0, + "6": 2166438912.0, + "7": 2166438912.0, + "8": 2166438912.0, + "9": 2166438912.0, + "10": 2166438912.0, + "11": 2166438912.0, + "12": 2166438912.0, + "13": 2166438912.0, + "14": 2166438912.0, + "15": 2166438912.0, + "16": 2166438912.0, + "17": 2166438912.0, + "18": 2166438912.0, + "19": 2166438912.0, + "20": 2166438912.0, + "21": 2166438912.0, + "22": 2166438912.0, + "23": 2166438912.0, + "24": 2166438912.0, + "25": 2166438912.0, + "26": 2166438912.0, + "27": 2166438912.0, + "28": 2166438912.0, + "29": 2166438912.0, + "30": 2166438912.0, + "31": 2166438912.0, + "32": 2166438912.0, + "33": 2166438912.0, + "34": 2166438912.0, + "35": 2166438912.0, + "36": 2166438912.0, + "37": 2166438912.0, + "38": 2166438912.0, + "39": 2166438912.0, + "40": 2166438912.0, + "41": 2166438912.0, + "42": 2166438912.0, + "43": 2166438912.0, + "44": 2166438912.0, + "45": 2166438912.0, + "46": 2166438912.0, + "47": 2166438912.0, + "48": 2166438912.0, + "49": 2166438912.0, + "50": 2166438912.0, + "51": 2166438912.0, + "52": 2166438912.0, + "53": 2166438912.0, + "54": 2166438912.0, + "55": 2166438912.0, + "56": 2166438912.0, + "57": 2166438912.0, + "58": 2166438912.0, + "59": 2166438912.0, + "60": 2166438912.0, + "61": 2166438912.0, + "62": 2166438912.0, + "63": 2166438912.0, + "64": 2166438912.0, + "65": 2166438912.0, + "66": 2166438912.0, + "67": 2166438912.0, + "68": 2166438912.0, + "69": 2166438912.0, + "70": 2166438912.0, + "71": 2166438912.0, + "72": 2166438912.0, + "73": 2166438912.0, + "74": 2166438912.0, + "75": 2166438912.0, + "76": 2166438912.0, + "77": 2166438912.0, + "78": 2166438912.0, + "79": 2166438912.0, + "80": 2166438912.0, + "81": 2166438912.0, + "82": 2166438912.0, + "83": 2166438912.0, + "84": 2166438912.0, + "85": 2166438912.0, + "86": 2166438912.0, + "87": 2166438912.0, + "88": 2166438912.0, + "89": 2166438912.0, + "90": 2166438912.0, + "91": 2166438912.0, + "92": 2166438912.0, + "93": 2166438912.0, + "94": 2166438912.0, + "95": 2166438912.0, + "96": 2166438912.0, + "97": 2166438912.0, + "98": 2166438912.0, + "99": 2166438912.0, + "100": 2166438912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2274851328.0, + "2": 3206419968.0, + "3": 3206419968.0, + "4": 3206419968.0, + "5": 3206419968.0, + "6": 3206419968.0, + "7": 3206419968.0, + "8": 3206419968.0, + "9": 3206419968.0, + "10": 3206419968.0, + "11": 3206419968.0, + "12": 3206419968.0, + "13": 3206419968.0, + "14": 3206419968.0, + "15": 3206419968.0, + "16": 3206419968.0, + "17": 3206419968.0, + "18": 3206419968.0, + "19": 3206419968.0, + "20": 3206419968.0, + "21": 3206419968.0, + "22": 3206419968.0, + "23": 3206419968.0, + "24": 3206419968.0, + "25": 3206419968.0, + "26": 3206419968.0, + "27": 3206419968.0, + "28": 3206419968.0, + "29": 3206419968.0, + "30": 3206419968.0, + "31": 3206419968.0, + "32": 3206419968.0, + "33": 3206419968.0, + "34": 3206419968.0, + "35": 3206419968.0, + "36": 3206419968.0, + "37": 3206419968.0, + "38": 3206419968.0, + "39": 3206419968.0, + "40": 3206419968.0, + "41": 3206419968.0, + "42": 3206419968.0, + "43": 3206419968.0, + "44": 3206419968.0, + "45": 3206419968.0, + "46": 3206419968.0, + "47": 3206419968.0, + "48": 3206419968.0, + "49": 3206419968.0, + "50": 3206419968.0, + "51": 3206419968.0, + "52": 3206419968.0, + "53": 3206419968.0, + "54": 3206419968.0, + "55": 3206419968.0, + "56": 3206419968.0, + "57": 3206419968.0, + "58": 3206419968.0, + "59": 3206419968.0, + "60": 3206419968.0, + "61": 3206419968.0, + "62": 3206419968.0, + "63": 3206419968.0, + "64": 3206419968.0, + "65": 3206419968.0, + "66": 3206419968.0, + "67": 3206419968.0, + "68": 3206419968.0, + "69": 3206419968.0, + "70": 3206419968.0, + "71": 3206419968.0, + "72": 3206419968.0, + "73": 3206419968.0, + "74": 3206419968.0, + "75": 3206419968.0, + "76": 3206419968.0, + "77": 3206419968.0, + "78": 3206419968.0, + "79": 3206419968.0, + "80": 3206419968.0, + "81": 3206419968.0, + "82": 3206419968.0, + "83": 3206419968.0, + "84": 3206419968.0, + "85": 3206419968.0, + "86": 3206419968.0, + "87": 3206419968.0, + "88": 3206419968.0, + "89": 3206419968.0, + "90": 3206419968.0, + "91": 3206419968.0, + "92": 3206419968.0, + "93": 3206419968.0, + "94": 3206419968.0, + "95": 3206419968.0, + "96": 3206419968.0, + "97": 3206419968.0, + "98": 3206419968.0, + "99": 3206419968.0, + "100": 3206419968.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.79361, + "2": 0.67288, + "3": 0.52904, + "4": 0.52848, + "5": 0.52694, + "6": 0.52432, + "7": 0.52615, + "8": 0.52266, + "9": 0.52374, + "10": 0.5232, + "11": 0.52312, + "12": 0.52381, + "13": 0.52382, + "14": 0.52651, + "15": 0.52105, + "16": 0.52462, + "17": 0.52071, + "18": 0.52032, + "19": 0.52362, + "20": 0.54485, + "21": 0.52759, + "22": 0.52436, + "23": 0.52524, + "24": 0.52386, + "25": 0.52609, + "26": 0.98269, + "27": 0.52975, + "28": 0.52764, + "29": 0.5238, + "30": 0.90661, + "31": 0.52495, + "32": 0.52564, + "33": 0.55189, + "34": 0.52776, + "35": 0.52657, + "36": 0.94715, + "37": 0.52293, + "38": 0.51989, + "39": 0.52527, + "40": 1.00044, + "41": 0.51994, + "42": 0.52847, + "43": 0.52094, + "44": 0.52021, + "45": 0.83393, + "46": 0.52176, + "47": 0.52027, + "48": 0.52022, + "49": 0.92078, + "50": 0.52274, + "51": 0.52157, + "52": 0.51992, + "53": 0.52125, + "54": 0.52141, + "55": 0.52033, + "56": 0.52301, + "57": 0.52177, + "58": 0.52323, + "59": 0.52166, + "60": 1.02908, + "61": 0.52105, + "62": 0.84789, + "63": 0.52207, + "64": 0.52113, + "65": 0.52291, + "66": 0.52373, + "67": 0.5236, + "68": 0.52294, + "69": 0.52215, + "70": 0.5232, + "71": 0.5226, + "72": 0.52198, + "73": 0.52284, + "74": 0.52142, + "75": 0.52267, + "76": 0.52615, + "77": 0.51991, + "78": 0.52249, + "79": 0.52283, + "80": 0.522, + "81": 0.5205, + "82": 0.52145, + "83": 0.52129, + "84": 0.5242, + "85": 0.52276, + "86": 0.52121, + "87": 0.52263, + "88": 0.51919, + "89": 0.51905, + "90": 0.52153, + "91": 0.52154, + "92": 0.52132, + "93": 0.52497, + "94": 0.5276, + "95": 0.52062, + "96": 0.52743, + "97": 0.52114, + "98": 0.52333, + "99": 0.51967, + "100": 0.52209 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..47b085ccb06 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38736, + "2": 10.37971, + "3": 9.79428, + "4": 9.59941, + "5": 9.38281, + "6": 9.40765, + "7": 9.31116, + "8": 9.25004, + "9": 9.1304, + "10": 9.06783, + "11": 8.89519, + "12": 8.8149, + "13": 8.82749, + "14": 8.69768, + "15": 8.65706, + "16": 8.54479, + "17": 8.50168, + "18": 8.39069, + "19": 8.36692, + "20": 8.26603, + "21": 8.27533, + "22": 8.14757, + "23": 8.0735, + "24": 8.12127, + "25": 7.98158, + "26": 8.09181, + "27": 7.87361, + "28": 7.96832, + "29": 7.80579, + "30": 7.87182, + "31": 7.818, + "32": 7.69078, + "33": 7.7864, + "34": 7.55667, + "35": 7.66308, + "36": 7.52559, + "37": 7.44779, + "38": 7.50335, + "39": 7.45281, + "40": 7.50499, + "41": 7.38901, + "42": 7.36263, + "43": 7.43543, + "44": 7.37578, + "45": 7.3523, + "46": 7.2817, + "47": 7.46121, + "48": 7.29037, + "49": 7.35179, + "50": 7.17986, + "51": 7.36821, + "52": 7.13332, + "53": 7.11532, + "54": 7.23214, + "55": 7.15383, + "56": 7.22184, + "57": 7.33328, + "58": 7.02116, + "59": 7.11467, + "60": 7.14998, + "61": 7.1117, + "62": 7.25117, + "63": 7.15586, + "64": 7.08539, + "65": 6.99542, + "66": 7.05924, + "67": 7.04804, + "68": 7.13906, + "69": 7.03428, + "70": 7.0643, + "71": 6.9218, + "72": 7.00511, + "73": 6.97917, + "74": 6.92066, + "75": 7.06414, + "76": 6.97532, + "77": 7.0837, + "78": 7.01986, + "79": 6.86115, + "80": 6.94493, + "81": 6.97847, + "82": 7.06834, + "83": 6.99434, + "84": 7.01114, + "85": 6.8595, + "86": 7.04211, + "87": 6.98111, + "88": 6.91353, + "89": 6.81096, + "90": 7.25918, + "91": 6.71195, + "92": 7.05431, + "93": 6.91084, + "94": 7.06872, + "95": 6.84927, + "96": 6.98126, + "97": 6.96743, + "98": 6.89421, + "99": 7.0152, + "100": 6.99082 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43296.0, + "2": 44067.0, + "3": 44759.0, + "4": 42367.0, + "5": 45373.0, + "6": 40966.0, + "7": 43147.0, + "8": 45448.0, + "9": 42470.0, + "10": 45357.0, + "11": 43969.0, + "12": 44583.0, + "13": 43897.0, + "14": 46189.0, + "15": 43909.0, + "16": 41613.0, + "17": 43823.0, + "18": 44678.0, + "19": 42556.0, + "20": 44765.0, + "21": 44723.0, + "22": 41820.0, + "23": 45463.0, + "24": 43077.0, + "25": 42457.0, + "26": 43913.0, + "27": 46221.0, + "28": 46390.0, + "29": 46160.0, + "30": 43999.0, + "31": 41276.0, + "32": 43316.0, + "33": 45432.0, + "34": 43303.0, + "35": 43276.0, + "36": 42461.0, + "37": 40045.0, + "38": 42557.0, + "39": 44701.0, + "40": 43214.0, + "41": 44667.0, + "42": 43241.0, + "43": 45448.0, + "44": 44605.0, + "45": 43265.0, + "46": 43892.0, + "47": 42375.0, + "48": 44656.0, + "49": 43182.0, + "50": 43383.0, + "51": 41130.0, + "52": 43841.0, + "53": 43918.0, + "54": 41894.0, + "55": 43861.0, + "56": 43229.0, + "57": 42488.0, + "58": 43831.0, + "59": 44616.0, + "60": 41267.0, + "61": 39701.0, + "62": 44746.0, + "63": 44704.0, + "64": 45346.0, + "65": 44696.0, + "66": 45356.0, + "67": 43133.0, + "68": 42535.0, + "69": 43803.0, + "70": 45504.0, + "71": 43309.0, + "72": 44800.0, + "73": 45401.0, + "74": 42467.0, + "75": 44661.0, + "76": 43882.0, + "77": 42110.0, + "78": 40337.0, + "79": 38924.0, + "80": 41077.0, + "81": 45349.0, + "82": 43228.0, + "83": 38446.0, + "84": 42443.0, + "85": 43970.0, + "86": 45668.0, + "87": 40846.0, + "88": 41780.0, + "89": 41056.0, + "90": 44657.0, + "91": 46133.0, + "92": 41748.0, + "93": 43205.0, + "94": 39556.0, + "95": 44047.0, + "96": 44668.0, + "97": 45383.0, + "98": 41817.0, + "99": 45425.0, + "100": 42429.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2166438912.0, + "2": 2166438912.0, + "3": 2166438912.0, + "4": 2166438912.0, + "5": 2166438912.0, + "6": 2166438912.0, + "7": 2166438912.0, + "8": 2166438912.0, + "9": 2166438912.0, + "10": 2166438912.0, + "11": 2166438912.0, + "12": 2166438912.0, + "13": 2166438912.0, + "14": 2166438912.0, + "15": 2166438912.0, + "16": 2166438912.0, + "17": 2166438912.0, + "18": 2166438912.0, + "19": 2166438912.0, + "20": 2166438912.0, + "21": 2166438912.0, + "22": 2166438912.0, + "23": 2166438912.0, + "24": 2166438912.0, + "25": 2166438912.0, + "26": 2166438912.0, + "27": 2166438912.0, + "28": 2166438912.0, + "29": 2166438912.0, + "30": 2166438912.0, + "31": 2166438912.0, + "32": 2166438912.0, + "33": 2166438912.0, + "34": 2166438912.0, + "35": 2166438912.0, + "36": 2166438912.0, + "37": 2166438912.0, + "38": 2166438912.0, + "39": 2166438912.0, + "40": 2166438912.0, + "41": 2166438912.0, + "42": 2166438912.0, + "43": 2166438912.0, + "44": 2166438912.0, + "45": 2166438912.0, + "46": 2166438912.0, + "47": 2166438912.0, + "48": 2166438912.0, + "49": 2166438912.0, + "50": 2166438912.0, + "51": 2166438912.0, + "52": 2166438912.0, + "53": 2166438912.0, + "54": 2166438912.0, + "55": 2166438912.0, + "56": 2166438912.0, + "57": 2166438912.0, + "58": 2166438912.0, + "59": 2166438912.0, + "60": 2166438912.0, + "61": 2166438912.0, + "62": 2166438912.0, + "63": 2166438912.0, + "64": 2166438912.0, + "65": 2166438912.0, + "66": 2166438912.0, + "67": 2166438912.0, + "68": 2166438912.0, + "69": 2166438912.0, + "70": 2166438912.0, + "71": 2166438912.0, + "72": 2166438912.0, + "73": 2166438912.0, + "74": 2166438912.0, + "75": 2166438912.0, + "76": 2166438912.0, + "77": 2166438912.0, + "78": 2166438912.0, + "79": 2166438912.0, + "80": 2166438912.0, + "81": 2166438912.0, + "82": 2166438912.0, + "83": 2166438912.0, + "84": 2166438912.0, + "85": 2166438912.0, + "86": 2166438912.0, + "87": 2166438912.0, + "88": 2166438912.0, + "89": 2166438912.0, + "90": 2166438912.0, + "91": 2166438912.0, + "92": 2166438912.0, + "93": 2166438912.0, + "94": 2166438912.0, + "95": 2166438912.0, + "96": 2166438912.0, + "97": 2166438912.0, + "98": 2166438912.0, + "99": 2166438912.0, + "100": 2166438912.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2274851328.0, + "2": 3206419968.0, + "3": 3206419968.0, + "4": 3206419968.0, + "5": 3206419968.0, + "6": 3206419968.0, + "7": 3206419968.0, + "8": 3206419968.0, + "9": 3206419968.0, + "10": 3206419968.0, + "11": 3206419968.0, + "12": 3206419968.0, + "13": 3206419968.0, + "14": 3206419968.0, + "15": 3206419968.0, + "16": 3206419968.0, + "17": 3206419968.0, + "18": 3206419968.0, + "19": 3206419968.0, + "20": 3206419968.0, + "21": 3206419968.0, + "22": 3206419968.0, + "23": 3206419968.0, + "24": 3206419968.0, + "25": 3206419968.0, + "26": 3206419968.0, + "27": 3206419968.0, + "28": 3206419968.0, + "29": 3206419968.0, + "30": 3206419968.0, + "31": 3206419968.0, + "32": 3206419968.0, + "33": 3206419968.0, + "34": 3206419968.0, + "35": 3206419968.0, + "36": 3206419968.0, + "37": 3206419968.0, + "38": 3206419968.0, + "39": 3206419968.0, + "40": 3206419968.0, + "41": 3206419968.0, + "42": 3206419968.0, + "43": 3206419968.0, + "44": 3206419968.0, + "45": 3206419968.0, + "46": 3206419968.0, + "47": 3206419968.0, + "48": 3206419968.0, + "49": 3206419968.0, + "50": 3206419968.0, + "51": 3206419968.0, + "52": 3206419968.0, + "53": 3206419968.0, + "54": 3206419968.0, + "55": 3206419968.0, + "56": 3206419968.0, + "57": 3206419968.0, + "58": 3206419968.0, + "59": 3206419968.0, + "60": 3206419968.0, + "61": 3206419968.0, + "62": 3206419968.0, + "63": 3206419968.0, + "64": 3206419968.0, + "65": 3206419968.0, + "66": 3206419968.0, + "67": 3206419968.0, + "68": 3206419968.0, + "69": 3206419968.0, + "70": 3206419968.0, + "71": 3206419968.0, + "72": 3206419968.0, + "73": 3206419968.0, + "74": 3206419968.0, + "75": 3206419968.0, + "76": 3206419968.0, + "77": 3206419968.0, + "78": 3206419968.0, + "79": 3206419968.0, + "80": 3206419968.0, + "81": 3206419968.0, + "82": 3206419968.0, + "83": 3206419968.0, + "84": 3206419968.0, + "85": 3206419968.0, + "86": 3206419968.0, + "87": 3206419968.0, + "88": 3206419968.0, + "89": 3206419968.0, + "90": 3206419968.0, + "91": 3206419968.0, + "92": 3206419968.0, + "93": 3206419968.0, + "94": 3206419968.0, + "95": 3206419968.0, + "96": 3206419968.0, + "97": 3206419968.0, + "98": 3206419968.0, + "99": 3206419968.0, + "100": 3206419968.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.73376, + "2": 0.65941, + "3": 0.51203, + "4": 0.51525, + "5": 0.52038, + "6": 0.51334, + "7": 0.51752, + "8": 0.5127, + "9": 0.51252, + "10": 0.51101, + "11": 0.51366, + "12": 0.50297, + "13": 0.50253, + "14": 0.50965, + "15": 0.50415, + "16": 0.50379, + "17": 0.50831, + "18": 0.50394, + "19": 0.50529, + "20": 0.50608, + "21": 0.51227, + "22": 0.50603, + "23": 0.50603, + "24": 0.50551, + "25": 0.5064, + "26": 0.5045, + "27": 0.50456, + "28": 0.50408, + "29": 0.50983, + "30": 0.97806, + "31": 0.93746, + "32": 0.50302, + "33": 0.51581, + "34": 0.52445, + "35": 0.51009, + "36": 0.51001, + "37": 0.98759, + "38": 0.5072, + "39": 0.50626, + "40": 0.53153, + "41": 0.84585, + "42": 0.50894, + "43": 0.51171, + "44": 0.99354, + "45": 1.01626, + "46": 0.51162, + "47": 0.509, + "48": 0.51118, + "49": 0.5092, + "50": 0.50955, + "51": 0.5099, + "52": 0.88089, + "53": 0.92181, + "54": 0.50199, + "55": 0.50201, + "56": 0.5042, + "57": 0.50152, + "58": 0.50188, + "59": 0.50229, + "60": 0.5022, + "61": 0.50158, + "62": 0.50418, + "63": 0.50455, + "64": 0.50212, + "65": 0.50523, + "66": 0.50164, + "67": 0.50093, + "68": 0.49939, + "69": 0.49983, + "70": 0.50804, + "71": 0.51035, + "72": 0.51332, + "73": 0.49997, + "74": 0.50164, + "75": 0.51172, + "76": 0.50371, + "77": 0.50466, + "78": 0.50784, + "79": 0.51289, + "80": 0.50935, + "81": 0.50705, + "82": 0.50671, + "83": 0.50317, + "84": 0.50489, + "85": 0.52254, + "86": 0.50659, + "87": 0.50805, + "88": 0.50211, + "89": 0.50127, + "90": 0.50552, + "91": 0.5025, + "92": 0.50458, + "93": 0.50451, + "94": 0.50155, + "95": 0.50402, + "96": 0.50113, + "97": 0.50935, + "98": 0.50158, + "99": 0.50243, + "100": 0.50094 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..3be9df673c7 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.36406, + "2": 10.37672, + "3": 9.84285, + "4": 9.61995, + "5": 9.4049, + "6": 9.42891, + "7": 9.31288, + "8": 9.27047, + "9": 9.10629, + "10": 9.03569, + "11": 8.86423, + "12": 8.80988, + "13": 8.8329, + "14": 8.69011, + "15": 8.66187, + "16": 8.54768, + "17": 8.50183, + "18": 8.42362, + "19": 8.38674, + "20": 8.27993, + "21": 8.26472, + "22": 8.15738, + "23": 8.11148, + "24": 8.14234, + "25": 7.98343, + "26": 8.10636, + "27": 7.88853, + "28": 7.97024, + "29": 7.8121, + "30": 7.87698, + "31": 7.82339, + "32": 7.70086, + "33": 7.80317, + "34": 7.56843, + "35": 7.67276, + "36": 7.54942, + "37": 7.475, + "38": 7.51068, + "39": 7.49979, + "40": 7.51131, + "41": 7.41252, + "42": 7.38333, + "43": 7.4414, + "44": 7.39857, + "45": 7.37352, + "46": 7.28824, + "47": 7.4683, + "48": 7.29457, + "49": 7.35181, + "50": 7.17223, + "51": 7.37216, + "52": 7.14588, + "53": 7.12384, + "54": 7.23984, + "55": 7.15454, + "56": 7.23308, + "57": 7.33501, + "58": 7.01226, + "59": 7.12063, + "60": 7.15043, + "61": 7.11076, + "62": 7.26458, + "63": 7.1544, + "64": 7.08651, + "65": 6.99077, + "66": 7.05503, + "67": 7.04463, + "68": 7.136, + "69": 7.03404, + "70": 7.05994, + "71": 6.90146, + "72": 6.99845, + "73": 6.97783, + "74": 6.92205, + "75": 7.06268, + "76": 6.95612, + "77": 7.08838, + "78": 7.02608, + "79": 6.85354, + "80": 6.93543, + "81": 6.97396, + "82": 7.05854, + "83": 6.98003, + "84": 7.00602, + "85": 6.84771, + "86": 7.04197, + "87": 6.97366, + "88": 6.90817, + "89": 6.80902, + "90": 7.23999, + "91": 6.70221, + "92": 7.0543, + "93": 6.89332, + "94": 7.05002, + "95": 6.84547, + "96": 6.96202, + "97": 6.95355, + "98": 6.8731, + "99": 6.99831, + "100": 6.98508 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43317.0, + "2": 44065.0, + "3": 44730.0, + "4": 42374.0, + "5": 45387.0, + "6": 40937.0, + "7": 43166.0, + "8": 45433.0, + "9": 42439.0, + "10": 45374.0, + "11": 43947.0, + "12": 44584.0, + "13": 43908.0, + "14": 46205.0, + "15": 43901.0, + "16": 41607.0, + "17": 43831.0, + "18": 44698.0, + "19": 42543.0, + "20": 44759.0, + "21": 44734.0, + "22": 41850.0, + "23": 45416.0, + "24": 43069.0, + "25": 42442.0, + "26": 43923.0, + "27": 46212.0, + "28": 46362.0, + "29": 46133.0, + "30": 43978.0, + "31": 41220.0, + "32": 43307.0, + "33": 45440.0, + "34": 43284.0, + "35": 43248.0, + "36": 42437.0, + "37": 40066.0, + "38": 42483.0, + "39": 44702.0, + "40": 43230.0, + "41": 44672.0, + "42": 43202.0, + "43": 45459.0, + "44": 44609.0, + "45": 43265.0, + "46": 43915.0, + "47": 42366.0, + "48": 44650.0, + "49": 43139.0, + "50": 43399.0, + "51": 41159.0, + "52": 43818.0, + "53": 43924.0, + "54": 41952.0, + "55": 43866.0, + "56": 43239.0, + "57": 42540.0, + "58": 43856.0, + "59": 44589.0, + "60": 41152.0, + "61": 39709.0, + "62": 44822.0, + "63": 44663.0, + "64": 45372.0, + "65": 44676.0, + "66": 45345.0, + "67": 43130.0, + "68": 42567.0, + "69": 43812.0, + "70": 45538.0, + "71": 43282.0, + "72": 44765.0, + "73": 45354.0, + "74": 42517.0, + "75": 44666.0, + "76": 43904.0, + "77": 42041.0, + "78": 40320.0, + "79": 38914.0, + "80": 41081.0, + "81": 45333.0, + "82": 43195.0, + "83": 38489.0, + "84": 42436.0, + "85": 43978.0, + "86": 45680.0, + "87": 40832.0, + "88": 41797.0, + "89": 41083.0, + "90": 44676.0, + "91": 46190.0, + "92": 41837.0, + "93": 43234.0, + "94": 39504.0, + "95": 44067.0, + "96": 44684.0, + "97": 45419.0, + "98": 41854.0, + "99": 45431.0, + "100": 42479.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2195405824.0, + "2": 2195405824.0, + "3": 2195405824.0, + "4": 2195405824.0, + "5": 2195405824.0, + "6": 2195405824.0, + "7": 2195405824.0, + "8": 2195405824.0, + "9": 2195405824.0, + "10": 2195405824.0, + "11": 2195405824.0, + "12": 2195405824.0, + "13": 2195405824.0, + "14": 2195405824.0, + "15": 2195405824.0, + "16": 2195405824.0, + "17": 2195405824.0, + "18": 2195405824.0, + "19": 2195405824.0, + "20": 2195405824.0, + "21": 2195405824.0, + "22": 2195405824.0, + "23": 2195405824.0, + "24": 2195405824.0, + "25": 2195405824.0, + "26": 2195405824.0, + "27": 2195405824.0, + "28": 2195405824.0, + "29": 2195405824.0, + "30": 2195405824.0, + "31": 2195405824.0, + "32": 2195405824.0, + "33": 2195405824.0, + "34": 2195405824.0, + "35": 2195405824.0, + "36": 2195405824.0, + "37": 2195405824.0, + "38": 2195405824.0, + "39": 2195405824.0, + "40": 2195405824.0, + "41": 2195405824.0, + "42": 2195405824.0, + "43": 2195405824.0, + "44": 2195405824.0, + "45": 2195405824.0, + "46": 2195405824.0, + "47": 2195405824.0, + "48": 2195405824.0, + "49": 2195405824.0, + "50": 2195405824.0, + "51": 2195405824.0, + "52": 2195405824.0, + "53": 2195405824.0, + "54": 2195405824.0, + "55": 2195405824.0, + "56": 2195405824.0, + "57": 2195405824.0, + "58": 2195405824.0, + "59": 2195405824.0, + "60": 2195405824.0, + "61": 2195405824.0, + "62": 2195405824.0, + "63": 2195405824.0, + "64": 2195405824.0, + "65": 2195405824.0, + "66": 2195405824.0, + "67": 2195405824.0, + "68": 2195405824.0, + "69": 2195405824.0, + "70": 2195405824.0, + "71": 2195405824.0, + "72": 2195405824.0, + "73": 2195405824.0, + "74": 2195405824.0, + "75": 2195405824.0, + "76": 2195405824.0, + "77": 2195405824.0, + "78": 2195405824.0, + "79": 2195405824.0, + "80": 2195405824.0, + "81": 2195405824.0, + "82": 2195405824.0, + "83": 2195405824.0, + "84": 2195405824.0, + "85": 2195405824.0, + "86": 2195405824.0, + "87": 2195405824.0, + "88": 2195405824.0, + "89": 2195405824.0, + "90": 2195405824.0, + "91": 2195405824.0, + "92": 2195405824.0, + "93": 2195405824.0, + "94": 2195405824.0, + "95": 2195405824.0, + "96": 2195405824.0, + "97": 2195405824.0, + "98": 2195405824.0, + "99": 2195405824.0, + "100": 2195405824.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2302114304.0, + "2": 3236697600.0, + "3": 3236697600.0, + "4": 3236697600.0, + "5": 3236697600.0, + "6": 3236697600.0, + "7": 3236697600.0, + "8": 3236697600.0, + "9": 3236697600.0, + "10": 3236697600.0, + "11": 3236697600.0, + "12": 3236697600.0, + "13": 3236697600.0, + "14": 3236697600.0, + "15": 3236697600.0, + "16": 3236697600.0, + "17": 3236697600.0, + "18": 3236697600.0, + "19": 3236697600.0, + "20": 3236697600.0, + "21": 3236697600.0, + "22": 3236697600.0, + "23": 3236697600.0, + "24": 3236697600.0, + "25": 3236697600.0, + "26": 3236697600.0, + "27": 3236697600.0, + "28": 3236697600.0, + "29": 3236697600.0, + "30": 3236697600.0, + "31": 3236697600.0, + "32": 3236697600.0, + "33": 3236697600.0, + "34": 3236697600.0, + "35": 3236697600.0, + "36": 3236697600.0, + "37": 3236697600.0, + "38": 3236697600.0, + "39": 3236697600.0, + "40": 3236697600.0, + "41": 3236697600.0, + "42": 3236697600.0, + "43": 3236697600.0, + "44": 3236697600.0, + "45": 3236697600.0, + "46": 3236697600.0, + "47": 3236697600.0, + "48": 3236697600.0, + "49": 3236697600.0, + "50": 3236697600.0, + "51": 3236697600.0, + "52": 3236697600.0, + "53": 3236697600.0, + "54": 3236697600.0, + "55": 3236697600.0, + "56": 3236697600.0, + "57": 3236697600.0, + "58": 3236697600.0, + "59": 3236697600.0, + "60": 3236697600.0, + "61": 3236697600.0, + "62": 3236697600.0, + "63": 3236697600.0, + "64": 3236697600.0, + "65": 3236697600.0, + "66": 3236697600.0, + "67": 3236697600.0, + "68": 3236697600.0, + "69": 3236697600.0, + "70": 3236697600.0, + "71": 3236697600.0, + "72": 3236697600.0, + "73": 3236697600.0, + "74": 3236697600.0, + "75": 3236697600.0, + "76": 3236697600.0, + "77": 3236697600.0, + "78": 3236697600.0, + "79": 3236697600.0, + "80": 3236697600.0, + "81": 3236697600.0, + "82": 3236697600.0, + "83": 3236697600.0, + "84": 3236697600.0, + "85": 3236697600.0, + "86": 3236697600.0, + "87": 3236697600.0, + "88": 3236697600.0, + "89": 3236697600.0, + "90": 3236697600.0, + "91": 3236697600.0, + "92": 3236697600.0, + "93": 3236697600.0, + "94": 3236697600.0, + "95": 3236697600.0, + "96": 3236697600.0, + "97": 3236697600.0, + "98": 3236697600.0, + "99": 3236697600.0, + "100": 3236697600.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.39562, + "2": 0.44691, + "3": 0.3459, + "4": 0.34935, + "5": 0.34659, + "6": 0.35056, + "7": 0.3495, + "8": 0.35113, + "9": 0.34945, + "10": 0.35049, + "11": 0.35158, + "12": 0.34969, + "13": 0.34855, + "14": 0.35082, + "15": 0.35148, + "16": 0.35346, + "17": 0.35991, + "18": 0.35857, + "19": 0.35651, + "20": 0.35734, + "21": 0.36107, + "22": 0.35291, + "23": 0.34878, + "24": 0.34924, + "25": 0.34966, + "26": 0.35397, + "27": 0.35048, + "28": 0.39139, + "29": 0.35978, + "30": 0.35049, + "31": 0.35472, + "32": 0.34768, + "33": 0.3681, + "34": 0.37086, + "35": 0.35372, + "36": 0.35661, + "37": 0.96115, + "38": 0.69943, + "39": 0.35304, + "40": 0.39899, + "41": 0.3519, + "42": 0.35367, + "43": 0.35089, + "44": 0.35181, + "45": 0.85196, + "46": 0.353, + "47": 0.35065, + "48": 0.34986, + "49": 0.34987, + "50": 0.35017, + "51": 0.35243, + "52": 0.34764, + "53": 0.68786, + "54": 0.35071, + "55": 0.35502, + "56": 0.36533, + "57": 0.34855, + "58": 0.35098, + "59": 0.34751, + "60": 0.66551, + "61": 0.35376, + "62": 0.65487, + "63": 0.36102, + "64": 0.35122, + "65": 0.35654, + "66": 0.36028, + "67": 0.36743, + "68": 0.36013, + "69": 0.36151, + "70": 0.36618, + "71": 0.34619, + "72": 0.36448, + "73": 0.35934, + "74": 0.36235, + "75": 0.35742, + "76": 0.35529, + "77": 0.36633, + "78": 0.35551, + "79": 0.35185, + "80": 0.34938, + "81": 0.34965, + "82": 0.35454, + "83": 0.34716, + "84": 0.36305, + "85": 0.35771, + "86": 0.34829, + "87": 0.35483, + "88": 0.34874, + "89": 0.34898, + "90": 0.35072, + "91": 0.34969, + "92": 0.3539, + "93": 0.34627, + "94": 0.34706, + "95": 0.34587, + "96": 0.34804, + "97": 0.34773, + "98": 0.36076, + "99": 0.38382, + "100": 0.35651 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..30c495148f4 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.36406, + "2": 10.37672, + "3": 9.84285, + "4": 9.61995, + "5": 9.4049, + "6": 9.42891, + "7": 9.31288, + "8": 9.27047, + "9": 9.10629, + "10": 9.03569, + "11": 8.86423, + "12": 8.80988, + "13": 8.8329, + "14": 8.69011, + "15": 8.66187, + "16": 8.54768, + "17": 8.50183, + "18": 8.42362, + "19": 8.38674, + "20": 8.27993, + "21": 8.26472, + "22": 8.15738, + "23": 8.11148, + "24": 8.14234, + "25": 7.98343, + "26": 8.10636, + "27": 7.88853, + "28": 7.97024, + "29": 7.8121, + "30": 7.87698, + "31": 7.82339, + "32": 7.70086, + "33": 7.80317, + "34": 7.56843, + "35": 7.67276, + "36": 7.54942, + "37": 7.475, + "38": 7.51068, + "39": 7.49979, + "40": 7.51131, + "41": 7.41252, + "42": 7.38333, + "43": 7.4414, + "44": 7.39857, + "45": 7.37352, + "46": 7.28824, + "47": 7.4683, + "48": 7.29457, + "49": 7.35181, + "50": 7.17223, + "51": 7.37216, + "52": 7.14588, + "53": 7.12384, + "54": 7.23984, + "55": 7.15454, + "56": 7.23308, + "57": 7.33501, + "58": 7.01226, + "59": 7.12063, + "60": 7.15043, + "61": 7.11076, + "62": 7.26458, + "63": 7.1544, + "64": 7.08651, + "65": 6.99077, + "66": 7.05503, + "67": 7.04463, + "68": 7.136, + "69": 7.03404, + "70": 7.05994, + "71": 6.90146, + "72": 6.99845, + "73": 6.97783, + "74": 6.92205, + "75": 7.06268, + "76": 6.95612, + "77": 7.08838, + "78": 7.02608, + "79": 6.85354, + "80": 6.93543, + "81": 6.97396, + "82": 7.05854, + "83": 6.98003, + "84": 7.00602, + "85": 6.84771, + "86": 7.04197, + "87": 6.97366, + "88": 6.90817, + "89": 6.80902, + "90": 7.23999, + "91": 6.70221, + "92": 7.0543, + "93": 6.89332, + "94": 7.05002, + "95": 6.84547, + "96": 6.96202, + "97": 6.95355, + "98": 6.8731, + "99": 6.99831, + "100": 6.98508 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43317.0, + "2": 44065.0, + "3": 44730.0, + "4": 42374.0, + "5": 45387.0, + "6": 40937.0, + "7": 43166.0, + "8": 45433.0, + "9": 42439.0, + "10": 45374.0, + "11": 43947.0, + "12": 44584.0, + "13": 43908.0, + "14": 46205.0, + "15": 43901.0, + "16": 41607.0, + "17": 43831.0, + "18": 44698.0, + "19": 42543.0, + "20": 44759.0, + "21": 44734.0, + "22": 41850.0, + "23": 45416.0, + "24": 43069.0, + "25": 42442.0, + "26": 43923.0, + "27": 46212.0, + "28": 46362.0, + "29": 46133.0, + "30": 43978.0, + "31": 41220.0, + "32": 43307.0, + "33": 45440.0, + "34": 43284.0, + "35": 43248.0, + "36": 42437.0, + "37": 40066.0, + "38": 42483.0, + "39": 44702.0, + "40": 43230.0, + "41": 44672.0, + "42": 43202.0, + "43": 45459.0, + "44": 44609.0, + "45": 43265.0, + "46": 43915.0, + "47": 42366.0, + "48": 44650.0, + "49": 43139.0, + "50": 43399.0, + "51": 41159.0, + "52": 43818.0, + "53": 43924.0, + "54": 41952.0, + "55": 43866.0, + "56": 43239.0, + "57": 42540.0, + "58": 43856.0, + "59": 44589.0, + "60": 41152.0, + "61": 39709.0, + "62": 44822.0, + "63": 44663.0, + "64": 45372.0, + "65": 44676.0, + "66": 45345.0, + "67": 43130.0, + "68": 42567.0, + "69": 43812.0, + "70": 45538.0, + "71": 43282.0, + "72": 44765.0, + "73": 45354.0, + "74": 42517.0, + "75": 44666.0, + "76": 43904.0, + "77": 42041.0, + "78": 40320.0, + "79": 38914.0, + "80": 41081.0, + "81": 45333.0, + "82": 43195.0, + "83": 38489.0, + "84": 42436.0, + "85": 43978.0, + "86": 45680.0, + "87": 40832.0, + "88": 41797.0, + "89": 41083.0, + "90": 44676.0, + "91": 46190.0, + "92": 41837.0, + "93": 43234.0, + "94": 39504.0, + "95": 44067.0, + "96": 44684.0, + "97": 45419.0, + "98": 41854.0, + "99": 45431.0, + "100": 42479.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2195405824.0, + "2": 2195405824.0, + "3": 2195405824.0, + "4": 2195405824.0, + "5": 2195405824.0, + "6": 2195405824.0, + "7": 2195405824.0, + "8": 2195405824.0, + "9": 2195405824.0, + "10": 2195405824.0, + "11": 2195405824.0, + "12": 2195405824.0, + "13": 2195405824.0, + "14": 2195405824.0, + "15": 2195405824.0, + "16": 2195405824.0, + "17": 2195405824.0, + "18": 2195405824.0, + "19": 2195405824.0, + "20": 2195405824.0, + "21": 2195405824.0, + "22": 2195405824.0, + "23": 2195405824.0, + "24": 2195405824.0, + "25": 2195405824.0, + "26": 2195405824.0, + "27": 2195405824.0, + "28": 2195405824.0, + "29": 2195405824.0, + "30": 2195405824.0, + "31": 2195405824.0, + "32": 2195405824.0, + "33": 2195405824.0, + "34": 2195405824.0, + "35": 2195405824.0, + "36": 2195405824.0, + "37": 2195405824.0, + "38": 2195405824.0, + "39": 2195405824.0, + "40": 2195405824.0, + "41": 2195405824.0, + "42": 2195405824.0, + "43": 2195405824.0, + "44": 2195405824.0, + "45": 2195405824.0, + "46": 2195405824.0, + "47": 2195405824.0, + "48": 2195405824.0, + "49": 2195405824.0, + "50": 2195405824.0, + "51": 2195405824.0, + "52": 2195405824.0, + "53": 2195405824.0, + "54": 2195405824.0, + "55": 2195405824.0, + "56": 2195405824.0, + "57": 2195405824.0, + "58": 2195405824.0, + "59": 2195405824.0, + "60": 2195405824.0, + "61": 2195405824.0, + "62": 2195405824.0, + "63": 2195405824.0, + "64": 2195405824.0, + "65": 2195405824.0, + "66": 2195405824.0, + "67": 2195405824.0, + "68": 2195405824.0, + "69": 2195405824.0, + "70": 2195405824.0, + "71": 2195405824.0, + "72": 2195405824.0, + "73": 2195405824.0, + "74": 2195405824.0, + "75": 2195405824.0, + "76": 2195405824.0, + "77": 2195405824.0, + "78": 2195405824.0, + "79": 2195405824.0, + "80": 2195405824.0, + "81": 2195405824.0, + "82": 2195405824.0, + "83": 2195405824.0, + "84": 2195405824.0, + "85": 2195405824.0, + "86": 2195405824.0, + "87": 2195405824.0, + "88": 2195405824.0, + "89": 2195405824.0, + "90": 2195405824.0, + "91": 2195405824.0, + "92": 2195405824.0, + "93": 2195405824.0, + "94": 2195405824.0, + "95": 2195405824.0, + "96": 2195405824.0, + "97": 2195405824.0, + "98": 2195405824.0, + "99": 2195405824.0, + "100": 2195405824.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2302114304.0, + "2": 3236697600.0, + "3": 3236697600.0, + "4": 3236697600.0, + "5": 3236697600.0, + "6": 3236697600.0, + "7": 3236697600.0, + "8": 3236697600.0, + "9": 3236697600.0, + "10": 3236697600.0, + "11": 3236697600.0, + "12": 3236697600.0, + "13": 3236697600.0, + "14": 3236697600.0, + "15": 3236697600.0, + "16": 3236697600.0, + "17": 3236697600.0, + "18": 3236697600.0, + "19": 3236697600.0, + "20": 3236697600.0, + "21": 3236697600.0, + "22": 3236697600.0, + "23": 3236697600.0, + "24": 3236697600.0, + "25": 3236697600.0, + "26": 3236697600.0, + "27": 3236697600.0, + "28": 3236697600.0, + "29": 3236697600.0, + "30": 3236697600.0, + "31": 3236697600.0, + "32": 3236697600.0, + "33": 3236697600.0, + "34": 3236697600.0, + "35": 3236697600.0, + "36": 3236697600.0, + "37": 3236697600.0, + "38": 3236697600.0, + "39": 3236697600.0, + "40": 3236697600.0, + "41": 3236697600.0, + "42": 3236697600.0, + "43": 3236697600.0, + "44": 3236697600.0, + "45": 3236697600.0, + "46": 3236697600.0, + "47": 3236697600.0, + "48": 3236697600.0, + "49": 3236697600.0, + "50": 3236697600.0, + "51": 3236697600.0, + "52": 3236697600.0, + "53": 3236697600.0, + "54": 3236697600.0, + "55": 3236697600.0, + "56": 3236697600.0, + "57": 3236697600.0, + "58": 3236697600.0, + "59": 3236697600.0, + "60": 3236697600.0, + "61": 3236697600.0, + "62": 3236697600.0, + "63": 3236697600.0, + "64": 3236697600.0, + "65": 3236697600.0, + "66": 3236697600.0, + "67": 3236697600.0, + "68": 3236697600.0, + "69": 3236697600.0, + "70": 3236697600.0, + "71": 3236697600.0, + "72": 3236697600.0, + "73": 3236697600.0, + "74": 3236697600.0, + "75": 3236697600.0, + "76": 3236697600.0, + "77": 3236697600.0, + "78": 3236697600.0, + "79": 3236697600.0, + "80": 3236697600.0, + "81": 3236697600.0, + "82": 3236697600.0, + "83": 3236697600.0, + "84": 3236697600.0, + "85": 3236697600.0, + "86": 3236697600.0, + "87": 3236697600.0, + "88": 3236697600.0, + "89": 3236697600.0, + "90": 3236697600.0, + "91": 3236697600.0, + "92": 3236697600.0, + "93": 3236697600.0, + "94": 3236697600.0, + "95": 3236697600.0, + "96": 3236697600.0, + "97": 3236697600.0, + "98": 3236697600.0, + "99": 3236697600.0, + "100": 3236697600.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.46115, + "2": 0.46835, + "3": 0.38416, + "4": 0.37391, + "5": 0.37703, + "6": 0.38173, + "7": 0.37456, + "8": 0.37696, + "9": 0.37338, + "10": 0.37687, + "11": 0.38251, + "12": 0.38037, + "13": 0.37996, + "14": 0.38264, + "15": 0.37959, + "16": 0.38232, + "17": 0.37852, + "18": 0.37735, + "19": 0.3812, + "20": 0.37493, + "21": 0.38227, + "22": 0.38196, + "23": 0.37745, + "24": 0.3782, + "25": 0.37181, + "26": 0.37935, + "27": 0.38539, + "28": 0.38393, + "29": 0.3826, + "30": 0.37839, + "31": 0.38438, + "32": 0.64523, + "33": 0.37971, + "34": 0.38082, + "35": 0.74313, + "36": 0.3848, + "37": 0.38169, + "38": 0.38154, + "39": 0.40495, + "40": 0.40243, + "41": 0.37972, + "42": 0.37792, + "43": 0.38261, + "44": 0.37607, + "45": 0.37463, + "46": 0.37881, + "47": 0.37293, + "48": 0.37592, + "49": 0.659, + "50": 0.37783, + "51": 0.38158, + "52": 0.73901, + "53": 0.37684, + "54": 0.37707, + "55": 0.42405, + "56": 0.38184, + "57": 0.37936, + "58": 0.37539, + "59": 0.37591, + "60": 0.72267, + "61": 0.37815, + "62": 0.77277, + "63": 0.38815, + "64": 0.3807, + "65": 0.37848, + "66": 0.38143, + "67": 0.37999, + "68": 0.38158, + "69": 0.38427, + "70": 0.37479, + "71": 0.38252, + "72": 0.38036, + "73": 0.38116, + "74": 0.38336, + "75": 0.3771, + "76": 0.37876, + "77": 0.38102, + "78": 0.37864, + "79": 0.38095, + "80": 0.37954, + "81": 0.37575, + "82": 0.38084, + "83": 0.38192, + "84": 0.38267, + "85": 0.38765, + "86": 0.38467, + "87": 0.3817, + "88": 0.37395, + "89": 0.37751, + "90": 0.38076, + "91": 0.37565, + "92": 0.38237, + "93": 0.37738, + "94": 0.37726, + "95": 0.38237, + "96": 0.38018, + "97": 0.38525, + "98": 0.40815, + "99": 0.38117, + "100": 0.38201 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json index 3c05fe99417..438130bae1c 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, "100": 6.98466 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, "100": 42485.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, "100": 4158515200.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, "100": 6187556864.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 5.88206, - "5": 0.14455, - "10": 0.1392, - "15": 0.14565, - "20": 0.1396, - "25": 0.13933, - "30": 0.13875, - "35": 0.14498, - "40": 0.13976, - "45": 0.14331, - "50": 0.14852, - "55": 0.13993, - "60": 0.1429, - "65": 0.14345, - "70": 0.17591, - "75": 0.14145, - "80": 0.14297, - "85": 0.14009, - "90": 0.14121, - "95": 0.13997, - "100": 0.14256 + "1": 7.07395, + "2": 0.19501, + "3": 0.16284, + "4": 0.15592, + "5": 0.16485, + "6": 0.15452, + "7": 0.1627, + "8": 0.15835, + "9": 0.15975, + "10": 0.15881, + "11": 0.16294, + "12": 0.15929, + "13": 0.16216, + "14": 0.15673, + "15": 0.16042, + "16": 0.15452, + "17": 0.16802, + "18": 0.15623, + "19": 0.16501, + "20": 0.15961, + "21": 0.16269, + "22": 0.15556, + "23": 0.16412, + "24": 0.1564, + "25": 0.1614, + "26": 0.15776, + "27": 0.16056, + "28": 0.16086, + "29": 0.16026, + "30": 0.15782, + "31": 0.1619, + "32": 0.1567, + "33": 0.16353, + "34": 0.1553, + "35": 0.16202, + "36": 0.15695, + "37": 0.16347, + "38": 0.15703, + "39": 0.1638, + "40": 0.1549, + "41": 0.15808, + "42": 0.1603, + "43": 0.15931, + "44": 0.15772, + "45": 0.16421, + "46": 0.15573, + "47": 0.16133, + "48": 0.1567, + "49": 0.16354, + "50": 0.15698, + "51": 0.15998, + "52": 0.15347, + "53": 0.16223, + "54": 0.1565, + "55": 0.16429, + "56": 0.15654, + "57": 0.16548, + "58": 0.15761, + "59": 0.16437, + "60": 0.15677, + "61": 0.16238, + "62": 0.15845, + "63": 0.16393, + "64": 0.16321, + "65": 0.16208, + "66": 0.15975, + "67": 0.16831, + "68": 0.15965, + "69": 0.16375, + "70": 0.16321, + "71": 0.17306, + "72": 0.15973, + "73": 0.16591, + "74": 0.1637, + "75": 0.16984, + "76": 0.16123, + "77": 0.17281, + "78": 0.16826, + "79": 0.17136, + "80": 0.16673, + "81": 0.16135, + "82": 0.16815, + "83": 0.20097, + "84": 0.19663, + "85": 0.16475, + "86": 0.16782, + "87": 0.16163, + "88": 0.16356, + "89": 0.16018, + "90": 0.16416, + "91": 0.15961, + "92": 0.16129, + "93": 0.15562, + "94": 0.1646, + "95": 0.15685, + "96": 0.16321, + "97": 0.15621, + "98": 0.16585, + "99": 0.15667, + "100": 0.17074 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..cefa267841e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34494, + "2": 10.36431, + "3": 9.73158, + "4": 9.57928, + "5": 9.38931, + "6": 9.41074, + "7": 9.30545, + "8": 9.24868, + "9": 9.09349, + "10": 9.01569, + "11": 8.86286, + "12": 8.79096, + "13": 8.80892, + "14": 8.67669, + "15": 8.64631, + "16": 8.5398, + "17": 8.47895, + "18": 8.38945, + "19": 8.36156, + "20": 8.26966, + "21": 8.26333, + "22": 8.15066, + "23": 8.08893, + "24": 8.12421, + "25": 7.99493, + "26": 8.08494, + "27": 7.87755, + "28": 7.95863, + "29": 7.79585, + "30": 7.87492, + "31": 7.83245, + "32": 7.69489, + "33": 7.78469, + "34": 7.55767, + "35": 7.65834, + "36": 7.52881, + "37": 7.44912, + "38": 7.50398, + "39": 7.48056, + "40": 7.50302, + "41": 7.39767, + "42": 7.37206, + "43": 7.44301, + "44": 7.3811, + "45": 7.36143, + "46": 7.29415, + "47": 7.47498, + "48": 7.29564, + "49": 7.36092, + "50": 7.19205, + "51": 7.38769, + "52": 7.13773, + "53": 7.125, + "54": 7.23668, + "55": 7.16852, + "56": 7.22884, + "57": 7.34699, + "58": 7.03128, + "59": 7.1229, + "60": 7.16587, + "61": 7.1174, + "62": 7.26837, + "63": 7.16759, + "64": 7.08376, + "65": 7.00099, + "66": 7.07203, + "67": 7.05971, + "68": 7.14618, + "69": 7.03944, + "70": 7.07162, + "71": 6.91653, + "72": 7.02025, + "73": 6.9904, + "74": 6.9146, + "75": 7.07611, + "76": 6.97098, + "77": 7.08446, + "78": 7.03608, + "79": 6.88325, + "80": 6.95251, + "81": 6.985, + "82": 7.06843, + "83": 7.00882, + "84": 7.0181, + "85": 6.8641, + "86": 7.04979, + "87": 6.99342, + "88": 6.9238, + "89": 6.82406, + "90": 7.25457, + "91": 6.7226, + "92": 7.05372, + "93": 6.91688, + "94": 7.066, + "95": 6.8601, + "96": 6.98742, + "97": 6.96796, + "98": 6.89964, + "99": 7.02766, + "100": 6.99745 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43305.0, + "2": 44091.0, + "3": 44794.0, + "4": 42436.0, + "5": 45413.0, + "6": 40989.0, + "7": 43195.0, + "8": 45462.0, + "9": 42551.0, + "10": 45379.0, + "11": 44016.0, + "12": 44629.0, + "13": 43937.0, + "14": 46250.0, + "15": 43956.0, + "16": 41728.0, + "17": 43873.0, + "18": 44716.0, + "19": 42648.0, + "20": 44818.0, + "21": 44812.0, + "22": 41883.0, + "23": 45468.0, + "24": 43112.0, + "25": 42745.0, + "26": 43949.0, + "27": 46268.0, + "28": 46429.0, + "29": 46199.0, + "30": 44042.0, + "31": 41264.0, + "32": 43413.0, + "33": 45478.0, + "34": 43375.0, + "35": 43297.0, + "36": 42545.0, + "37": 40689.0, + "38": 42575.0, + "39": 44772.0, + "40": 43251.0, + "41": 44707.0, + "42": 43261.0, + "43": 45506.0, + "44": 44652.0, + "45": 43345.0, + "46": 43935.0, + "47": 42506.0, + "48": 44693.0, + "49": 43200.0, + "50": 43415.0, + "51": 41174.0, + "52": 43885.0, + "53": 43959.0, + "54": 41961.0, + "55": 43960.0, + "56": 43269.0, + "57": 42561.0, + "58": 43898.0, + "59": 44654.0, + "60": 41326.0, + "61": 39744.0, + "62": 44774.0, + "63": 44682.0, + "64": 45396.0, + "65": 44730.0, + "66": 45388.0, + "67": 43196.0, + "68": 42556.0, + "69": 43825.0, + "70": 45543.0, + "71": 43407.0, + "72": 44832.0, + "73": 45412.0, + "74": 42502.0, + "75": 44684.0, + "76": 43926.0, + "77": 42100.0, + "78": 40525.0, + "79": 38954.0, + "80": 41118.0, + "81": 45412.0, + "82": 43238.0, + "83": 38495.0, + "84": 42524.0, + "85": 44024.0, + "86": 45749.0, + "87": 41116.0, + "88": 41798.0, + "89": 41078.0, + "90": 44744.0, + "91": 46266.0, + "92": 41865.0, + "93": 43254.0, + "94": 39588.0, + "95": 44092.0, + "96": 44732.0, + "97": 45474.0, + "98": 41859.0, + "99": 45537.0, + "100": 42500.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.07631, + "2": 0.42115, + "3": 0.24529, + "4": 0.23719, + "5": 0.2516, + "6": 0.2477, + "7": 0.2382, + "8": 0.23994, + "9": 0.26017, + "10": 0.27742, + "11": 0.24722, + "12": 0.243, + "13": 0.23789, + "14": 0.24255, + "15": 0.24011, + "16": 0.23679, + "17": 0.24823, + "18": 0.24785, + "19": 0.2488, + "20": 0.24836, + "21": 0.25124, + "22": 0.26347, + "23": 0.25688, + "24": 0.25176, + "25": 0.25034, + "26": 0.24652, + "27": 0.25028, + "28": 0.24542, + "29": 0.24835, + "30": 0.25164, + "31": 0.24716, + "32": 0.23244, + "33": 0.24002, + "34": 0.23187, + "35": 0.2359, + "36": 0.23168, + "37": 0.23963, + "38": 0.23232, + "39": 0.23677, + "40": 0.23188, + "41": 0.23971, + "42": 0.23201, + "43": 0.24022, + "44": 0.2318, + "45": 0.24134, + "46": 0.23272, + "47": 0.24039, + "48": 0.23386, + "49": 0.23758, + "50": 0.23159, + "51": 0.25559, + "52": 0.28119, + "53": 0.27021, + "54": 0.24392, + "55": 0.23902, + "56": 0.23405, + "57": 0.24193, + "58": 0.23238, + "59": 0.2443, + "60": 0.232, + "61": 0.2448, + "62": 0.23419, + "63": 0.24179, + "64": 0.23763, + "65": 0.24278, + "66": 0.23814, + "67": 0.23636, + "68": 0.23943, + "69": 0.23382, + "70": 0.23642, + "71": 0.23981, + "72": 0.23228, + "73": 0.23188, + "74": 0.23232, + "75": 0.23217, + "76": 0.2324, + "77": 0.23204, + "78": 0.23241, + "79": 0.23249, + "80": 0.23152, + "81": 0.23163, + "82": 0.23217, + "83": 0.23187, + "84": 0.23224, + "85": 0.23215, + "86": 0.23155, + "87": 0.23144, + "88": 0.23215, + "89": 0.23207, + "90": 0.23116, + "91": 0.23213, + "92": 0.23203, + "93": 0.23167, + "94": 0.23097, + "95": 0.23272, + "96": 0.23147, + "97": 0.23203, + "98": 0.23135, + "99": 0.23167, + "100": 0.23206 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..10ef1405966 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34494, + "2": 10.36431, + "3": 9.73158, + "4": 9.57928, + "5": 9.38931, + "6": 9.41074, + "7": 9.30545, + "8": 9.24868, + "9": 9.09349, + "10": 9.01569, + "11": 8.86286, + "12": 8.79096, + "13": 8.80892, + "14": 8.67669, + "15": 8.64631, + "16": 8.5398, + "17": 8.47895, + "18": 8.38945, + "19": 8.36156, + "20": 8.26966, + "21": 8.26333, + "22": 8.15066, + "23": 8.08893, + "24": 8.12421, + "25": 7.99493, + "26": 8.08494, + "27": 7.87755, + "28": 7.95863, + "29": 7.79585, + "30": 7.87492, + "31": 7.83245, + "32": 7.69489, + "33": 7.78469, + "34": 7.55767, + "35": 7.65834, + "36": 7.52881, + "37": 7.44912, + "38": 7.50398, + "39": 7.48056, + "40": 7.50302, + "41": 7.39767, + "42": 7.37206, + "43": 7.44301, + "44": 7.3811, + "45": 7.36143, + "46": 7.29415, + "47": 7.47498, + "48": 7.29564, + "49": 7.36092, + "50": 7.19205, + "51": 7.38769, + "52": 7.13773, + "53": 7.125, + "54": 7.23668, + "55": 7.16852, + "56": 7.22884, + "57": 7.34699, + "58": 7.03128, + "59": 7.1229, + "60": 7.16587, + "61": 7.1174, + "62": 7.26837, + "63": 7.16759, + "64": 7.08376, + "65": 7.00099, + "66": 7.07203, + "67": 7.05971, + "68": 7.14618, + "69": 7.03944, + "70": 7.07162, + "71": 6.91653, + "72": 7.02025, + "73": 6.9904, + "74": 6.9146, + "75": 7.07611, + "76": 6.97098, + "77": 7.08446, + "78": 7.03608, + "79": 6.88325, + "80": 6.95251, + "81": 6.985, + "82": 7.06843, + "83": 7.00882, + "84": 7.0181, + "85": 6.8641, + "86": 7.04979, + "87": 6.99342, + "88": 6.9238, + "89": 6.82406, + "90": 7.25457, + "91": 6.7226, + "92": 7.05372, + "93": 6.91688, + "94": 7.066, + "95": 6.8601, + "96": 6.98742, + "97": 6.96796, + "98": 6.89964, + "99": 7.02766, + "100": 6.99745 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43305.0, + "2": 44091.0, + "3": 44794.0, + "4": 42436.0, + "5": 45413.0, + "6": 40989.0, + "7": 43195.0, + "8": 45462.0, + "9": 42551.0, + "10": 45379.0, + "11": 44016.0, + "12": 44629.0, + "13": 43937.0, + "14": 46250.0, + "15": 43956.0, + "16": 41728.0, + "17": 43873.0, + "18": 44716.0, + "19": 42648.0, + "20": 44818.0, + "21": 44812.0, + "22": 41883.0, + "23": 45468.0, + "24": 43112.0, + "25": 42745.0, + "26": 43949.0, + "27": 46268.0, + "28": 46429.0, + "29": 46199.0, + "30": 44042.0, + "31": 41264.0, + "32": 43413.0, + "33": 45478.0, + "34": 43375.0, + "35": 43297.0, + "36": 42545.0, + "37": 40689.0, + "38": 42575.0, + "39": 44772.0, + "40": 43251.0, + "41": 44707.0, + "42": 43261.0, + "43": 45506.0, + "44": 44652.0, + "45": 43345.0, + "46": 43935.0, + "47": 42506.0, + "48": 44693.0, + "49": 43200.0, + "50": 43415.0, + "51": 41174.0, + "52": 43885.0, + "53": 43959.0, + "54": 41961.0, + "55": 43960.0, + "56": 43269.0, + "57": 42561.0, + "58": 43898.0, + "59": 44654.0, + "60": 41326.0, + "61": 39744.0, + "62": 44774.0, + "63": 44682.0, + "64": 45396.0, + "65": 44730.0, + "66": 45388.0, + "67": 43196.0, + "68": 42556.0, + "69": 43825.0, + "70": 45543.0, + "71": 43407.0, + "72": 44832.0, + "73": 45412.0, + "74": 42502.0, + "75": 44684.0, + "76": 43926.0, + "77": 42100.0, + "78": 40525.0, + "79": 38954.0, + "80": 41118.0, + "81": 45412.0, + "82": 43238.0, + "83": 38495.0, + "84": 42524.0, + "85": 44024.0, + "86": 45749.0, + "87": 41116.0, + "88": 41798.0, + "89": 41078.0, + "90": 44744.0, + "91": 46266.0, + "92": 41865.0, + "93": 43254.0, + "94": 39588.0, + "95": 44092.0, + "96": 44732.0, + "97": 45474.0, + "98": 41859.0, + "99": 45537.0, + "100": 42500.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.56097, + "2": 0.5665, + "3": 0.23157, + "4": 0.23387, + "5": 0.24864, + "6": 0.23399, + "7": 0.23692, + "8": 0.23082, + "9": 0.23218, + "10": 0.23429, + "11": 0.22503, + "12": 0.23455, + "13": 0.22526, + "14": 0.23323, + "15": 0.23735, + "16": 0.236, + "17": 0.22678, + "18": 0.23575, + "19": 0.22315, + "20": 0.2333, + "21": 0.22422, + "22": 0.22407, + "23": 0.2339, + "24": 0.22414, + "25": 0.22406, + "26": 0.23317, + "27": 0.22305, + "28": 0.22383, + "29": 0.23323, + "30": 0.224, + "31": 0.22377, + "32": 0.22673, + "33": 0.23037, + "34": 0.22469, + "35": 0.22408, + "36": 0.22989, + "37": 0.2238, + "38": 0.22507, + "39": 0.22859, + "40": 0.24027, + "41": 0.23144, + "42": 0.23374, + "43": 0.22475, + "44": 0.22417, + "45": 0.23296, + "46": 0.22427, + "47": 0.22489, + "48": 0.23424, + "49": 0.22498, + "50": 0.22454, + "51": 0.23236, + "52": 0.22777, + "53": 0.22625, + "54": 0.23366, + "55": 0.22841, + "56": 0.23206, + "57": 0.23467, + "58": 0.2277, + "59": 0.23045, + "60": 0.23628, + "61": 0.22728, + "62": 0.22507, + "63": 0.23342, + "64": 0.22668, + "65": 0.22514, + "66": 0.23559, + "67": 0.2309, + "68": 0.25201, + "69": 0.23266, + "70": 0.2274, + "71": 0.23936, + "72": 0.23585, + "73": 0.24105, + "74": 0.23426, + "75": 0.23113, + "76": 0.23658, + "77": 0.22773, + "78": 0.22825, + "79": 0.23279, + "80": 0.22595, + "81": 0.22568, + "82": 0.22609, + "83": 0.22518, + "84": 0.22622, + "85": 0.2284, + "86": 0.22625, + "87": 0.22909, + "88": 0.22703, + "89": 0.22595, + "90": 0.6034, + "91": 0.22715, + "92": 0.22553, + "93": 0.22635, + "94": 0.22592, + "95": 0.22566, + "96": 0.22563, + "97": 0.22615, + "98": 0.22511, + "99": 0.23442, + "100": 0.22512 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..a044dd0e135 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, + "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, + "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, + "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, + "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, + "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, + "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, + "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, + "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, + "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, + "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, + "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, + "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, + "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, + "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, + "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, + "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, + "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, + "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, + "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, + "100": 6.98466 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, + "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, + "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, + "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, + "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, + "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, + "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, + "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, + "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, + "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, + "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, + "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, + "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, + "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, + "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, + "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, + "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, + "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, + "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, + "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, + "100": 42485.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.68377, + "2": 0.24636, + "3": 0.14697, + "4": 0.14068, + "5": 0.14575, + "6": 0.13961, + "7": 0.14621, + "8": 0.14223, + "9": 0.14582, + "10": 0.13865, + "11": 0.1453, + "12": 0.13885, + "13": 0.14702, + "14": 0.14162, + "15": 0.1468, + "16": 0.14692, + "17": 0.14326, + "18": 0.14146, + "19": 0.15015, + "20": 0.13999, + "21": 0.14878, + "22": 0.13993, + "23": 0.14535, + "24": 0.1378, + "25": 0.15024, + "26": 0.1375, + "27": 0.13991, + "28": 0.14118, + "29": 0.14057, + "30": 0.14015, + "31": 0.1384, + "32": 0.13865, + "33": 0.14194, + "34": 0.14009, + "35": 0.14432, + "36": 0.14051, + "37": 0.1489, + "38": 0.13976, + "39": 0.14433, + "40": 0.13889, + "41": 0.14744, + "42": 0.14045, + "43": 0.14474, + "44": 0.14195, + "45": 0.14259, + "46": 0.13761, + "47": 0.14569, + "48": 0.15734, + "49": 0.18844, + "50": 0.14153, + "51": 0.14057, + "52": 0.14132, + "53": 0.14241, + "54": 0.14306, + "55": 0.1436, + "56": 0.14347, + "57": 0.13981, + "58": 0.13906, + "59": 0.14322, + "60": 0.13735, + "61": 0.14083, + "62": 0.14416, + "63": 0.14191, + "64": 0.14246, + "65": 0.13711, + "66": 0.1364, + "67": 0.13655, + "68": 0.1365, + "69": 0.13935, + "70": 0.15757, + "71": 0.13997, + "72": 0.13995, + "73": 0.14045, + "74": 0.1419, + "75": 0.14171, + "76": 0.14479, + "77": 0.17363, + "78": 0.15289, + "79": 0.1416, + "80": 0.14577, + "81": 0.14478, + "82": 0.14716, + "83": 0.14872, + "84": 0.15369, + "85": 0.15016, + "86": 0.13782, + "87": 0.1585, + "88": 0.15072, + "89": 0.13834, + "90": 0.13681, + "91": 0.139, + "92": 0.13751, + "93": 0.13694, + "94": 0.13764, + "95": 0.13659, + "96": 0.13726, + "97": 0.13676, + "98": 0.13872, + "99": 0.13604, + "100": 0.13543 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..e788215b20a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, + "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, + "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, + "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, + "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, + "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, + "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, + "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, + "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, + "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, + "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, + "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, + "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, + "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, + "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, + "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, + "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, + "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, + "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, + "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, + "100": 6.98466 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, + "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, + "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, + "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, + "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, + "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, + "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, + "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, + "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, + "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, + "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, + "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, + "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, + "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, + "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, + "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, + "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, + "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, + "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, + "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, + "100": 42485.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.09171, + "2": 0.19937, + "3": 0.15739, + "4": 0.15626, + "5": 0.15726, + "6": 0.16596, + "7": 0.15866, + "8": 0.16018, + "9": 0.16342, + "10": 0.15848, + "11": 0.1563, + "12": 0.15949, + "13": 0.16471, + "14": 0.1653, + "15": 0.15904, + "16": 0.15673, + "17": 0.15845, + "18": 0.15591, + "19": 0.15809, + "20": 0.1593, + "21": 0.15934, + "22": 0.1588, + "23": 0.15615, + "24": 0.15816, + "25": 0.15513, + "26": 0.16623, + "27": 0.1635, + "28": 0.15796, + "29": 0.15745, + "30": 0.15659, + "31": 0.15757, + "32": 0.15805, + "33": 0.16121, + "34": 0.15918, + "35": 0.15628, + "36": 0.16015, + "37": 0.15954, + "38": 0.15711, + "39": 0.16207, + "40": 0.16543, + "41": 0.16329, + "42": 0.15895, + "43": 0.15771, + "44": 0.16372, + "45": 0.15827, + "46": 0.16205, + "47": 0.16175, + "48": 0.15754, + "49": 0.15916, + "50": 0.15618, + "51": 0.15693, + "52": 0.16151, + "53": 0.16143, + "54": 0.16281, + "55": 0.15891, + "56": 0.16235, + "57": 0.16248, + "58": 0.16949, + "59": 0.16264, + "60": 0.15666, + "61": 0.19456, + "62": 0.19414, + "63": 0.16346, + "64": 0.16675, + "65": 0.16803, + "66": 0.1748, + "67": 0.16431, + "68": 0.1587, + "69": 0.16219, + "70": 0.16457, + "71": 0.1716, + "72": 0.16546, + "73": 0.16711, + "74": 0.16142, + "75": 0.17042, + "76": 0.17092, + "77": 0.16596, + "78": 0.16577, + "79": 0.15743, + "80": 0.15851, + "81": 0.15791, + "82": 0.16001, + "83": 0.15783, + "84": 0.15788, + "85": 0.15665, + "86": 0.16107, + "87": 0.15608, + "88": 0.15928, + "89": 0.16138, + "90": 0.15621, + "91": 0.15886, + "92": 0.15808, + "93": 0.15911, + "94": 0.16777, + "95": 0.16017, + "96": 0.15821, + "97": 0.15642, + "98": 0.16061, + "99": 0.157, + "100": 0.15975 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json index a6e8f276b7b..522245541ce 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, "100": 6.98466 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, "100": 42485.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, "100": 4158515200.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, "100": 6187556864.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 6.70165, - "5": 0.14534, - "10": 0.14168, - "15": 0.17276, - "20": 0.14261, - "25": 0.13952, - "30": 0.14413, - "35": 0.14472, - "40": 0.14192, - "45": 0.14279, - "50": 0.14289, - "55": 0.14388, - "60": 0.14497, - "65": 0.14852, - "70": 0.14194, - "75": 0.1395, - "80": 0.14222, - "85": 0.13902, - "90": 0.1372, - "95": 0.13582, - "100": 0.13567 + "1": 7.04606, + "2": 0.19929, + "3": 0.2017, + "4": 0.19828, + "5": 0.15529, + "6": 0.15657, + "7": 0.1562, + "8": 0.15746, + "9": 0.15848, + "10": 0.1552, + "11": 0.15643, + "12": 0.15719, + "13": 0.15888, + "14": 0.15791, + "15": 0.15908, + "16": 0.15414, + "17": 0.1552, + "18": 0.15205, + "19": 0.18443, + "20": 0.19907, + "21": 0.16002, + "22": 0.1541, + "23": 0.1541, + "24": 0.15347, + "25": 0.15557, + "26": 0.15649, + "27": 0.16008, + "28": 0.15592, + "29": 0.15544, + "30": 0.15449, + "31": 0.15601, + "32": 0.15477, + "33": 0.159, + "34": 0.15733, + "35": 0.15695, + "36": 0.15477, + "37": 0.15376, + "38": 0.15585, + "39": 0.15472, + "40": 0.16007, + "41": 0.15379, + "42": 0.15522, + "43": 0.15668, + "44": 0.15453, + "45": 0.15571, + "46": 0.15742, + "47": 0.1588, + "48": 0.15282, + "49": 0.15611, + "50": 0.15733, + "51": 0.15969, + "52": 0.15894, + "53": 0.16067, + "54": 0.16019, + "55": 0.15633, + "56": 0.15774, + "57": 0.15905, + "58": 0.16207, + "59": 0.16104, + "60": 0.15837, + "61": 0.15701, + "62": 0.15604, + "63": 0.15894, + "64": 0.15836, + "65": 0.16179, + "66": 0.16196, + "67": 0.16049, + "68": 0.15825, + "69": 0.15755, + "70": 0.15963, + "71": 0.16471, + "72": 0.16654, + "73": 0.16164, + "74": 0.15823, + "75": 0.16142, + "76": 0.16113, + "77": 0.16286, + "78": 0.16729, + "79": 0.16051, + "80": 0.1567, + "81": 0.15597, + "82": 0.15346, + "83": 0.15578, + "84": 0.15723, + "85": 0.1555, + "86": 0.15702, + "87": 0.15866, + "88": 0.15938, + "89": 0.15659, + "90": 0.15777, + "91": 0.1688, + "92": 0.15804, + "93": 0.15347, + "94": 0.15467, + "95": 0.15963, + "96": 0.15485, + "97": 0.1585, + "98": 0.17109, + "99": 0.15645, + "100": 0.15472 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..46dc9be60a4 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34494, + "2": 10.36431, + "3": 9.73158, + "4": 9.57928, + "5": 9.38931, + "6": 9.41074, + "7": 9.30545, + "8": 9.24868, + "9": 9.09349, + "10": 9.01569, + "11": 8.86286, + "12": 8.79096, + "13": 8.80892, + "14": 8.67669, + "15": 8.64631, + "16": 8.5398, + "17": 8.47895, + "18": 8.38945, + "19": 8.36156, + "20": 8.26966, + "21": 8.26333, + "22": 8.15066, + "23": 8.08893, + "24": 8.12421, + "25": 7.99493, + "26": 8.08494, + "27": 7.87755, + "28": 7.95863, + "29": 7.79585, + "30": 7.87492, + "31": 7.83245, + "32": 7.69489, + "33": 7.78469, + "34": 7.55767, + "35": 7.65834, + "36": 7.52881, + "37": 7.44912, + "38": 7.50398, + "39": 7.48056, + "40": 7.50302, + "41": 7.39767, + "42": 7.37206, + "43": 7.44301, + "44": 7.3811, + "45": 7.36143, + "46": 7.29415, + "47": 7.47498, + "48": 7.29564, + "49": 7.36092, + "50": 7.19205, + "51": 7.38769, + "52": 7.13773, + "53": 7.125, + "54": 7.23668, + "55": 7.16852, + "56": 7.22884, + "57": 7.34699, + "58": 7.03128, + "59": 7.1229, + "60": 7.16587, + "61": 7.1174, + "62": 7.26837, + "63": 7.16759, + "64": 7.08376, + "65": 7.00099, + "66": 7.07203, + "67": 7.05971, + "68": 7.14618, + "69": 7.03944, + "70": 7.07162, + "71": 6.91653, + "72": 7.02025, + "73": 6.9904, + "74": 6.9146, + "75": 7.07611, + "76": 6.97098, + "77": 7.08446, + "78": 7.03608, + "79": 6.88325, + "80": 6.95251, + "81": 6.985, + "82": 7.06843, + "83": 7.00882, + "84": 7.0181, + "85": 6.8641, + "86": 7.04979, + "87": 6.99342, + "88": 6.9238, + "89": 6.82406, + "90": 7.25457, + "91": 6.7226, + "92": 7.05372, + "93": 6.91688, + "94": 7.066, + "95": 6.8601, + "96": 6.98742, + "97": 6.96796, + "98": 6.89964, + "99": 7.02766, + "100": 6.99745 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43305.0, + "2": 44091.0, + "3": 44794.0, + "4": 42436.0, + "5": 45413.0, + "6": 40989.0, + "7": 43195.0, + "8": 45462.0, + "9": 42551.0, + "10": 45379.0, + "11": 44016.0, + "12": 44629.0, + "13": 43937.0, + "14": 46250.0, + "15": 43956.0, + "16": 41728.0, + "17": 43873.0, + "18": 44716.0, + "19": 42648.0, + "20": 44818.0, + "21": 44812.0, + "22": 41883.0, + "23": 45468.0, + "24": 43112.0, + "25": 42745.0, + "26": 43949.0, + "27": 46268.0, + "28": 46429.0, + "29": 46199.0, + "30": 44042.0, + "31": 41264.0, + "32": 43413.0, + "33": 45478.0, + "34": 43375.0, + "35": 43297.0, + "36": 42545.0, + "37": 40689.0, + "38": 42575.0, + "39": 44772.0, + "40": 43251.0, + "41": 44707.0, + "42": 43261.0, + "43": 45506.0, + "44": 44652.0, + "45": 43345.0, + "46": 43935.0, + "47": 42506.0, + "48": 44693.0, + "49": 43200.0, + "50": 43415.0, + "51": 41174.0, + "52": 43885.0, + "53": 43959.0, + "54": 41961.0, + "55": 43960.0, + "56": 43269.0, + "57": 42561.0, + "58": 43898.0, + "59": 44654.0, + "60": 41326.0, + "61": 39744.0, + "62": 44774.0, + "63": 44682.0, + "64": 45396.0, + "65": 44730.0, + "66": 45388.0, + "67": 43196.0, + "68": 42556.0, + "69": 43825.0, + "70": 45543.0, + "71": 43407.0, + "72": 44832.0, + "73": 45412.0, + "74": 42502.0, + "75": 44684.0, + "76": 43926.0, + "77": 42100.0, + "78": 40525.0, + "79": 38954.0, + "80": 41118.0, + "81": 45412.0, + "82": 43238.0, + "83": 38495.0, + "84": 42524.0, + "85": 44024.0, + "86": 45749.0, + "87": 41116.0, + "88": 41798.0, + "89": 41078.0, + "90": 44744.0, + "91": 46266.0, + "92": 41865.0, + "93": 43254.0, + "94": 39588.0, + "95": 44092.0, + "96": 44732.0, + "97": 45474.0, + "98": 41859.0, + "99": 45537.0, + "100": 42500.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6186508288.0, + "4": 6186508288.0, + "5": 6186508288.0, + "6": 6186508288.0, + "7": 6186508288.0, + "8": 6186508288.0, + "9": 6186508288.0, + "10": 6186508288.0, + "11": 6186508288.0, + "12": 6186508288.0, + "13": 6186508288.0, + "14": 6186508288.0, + "15": 6186508288.0, + "16": 6186508288.0, + "17": 6186508288.0, + "18": 6186508288.0, + "19": 6186508288.0, + "20": 6186508288.0, + "21": 6186508288.0, + "22": 6186508288.0, + "23": 6186508288.0, + "24": 6186508288.0, + "25": 6186508288.0, + "26": 6186508288.0, + "27": 6186508288.0, + "28": 6186508288.0, + "29": 6186508288.0, + "30": 6186508288.0, + "31": 6186508288.0, + "32": 6186508288.0, + "33": 6186508288.0, + "34": 6186508288.0, + "35": 6186508288.0, + "36": 6186508288.0, + "37": 6186508288.0, + "38": 6186508288.0, + "39": 6186508288.0, + "40": 6186508288.0, + "41": 6186508288.0, + "42": 6186508288.0, + "43": 6186508288.0, + "44": 6186508288.0, + "45": 6186508288.0, + "46": 6186508288.0, + "47": 6186508288.0, + "48": 6186508288.0, + "49": 6186508288.0, + "50": 6186508288.0, + "51": 6186508288.0, + "52": 6186508288.0, + "53": 6186508288.0, + "54": 6186508288.0, + "55": 6186508288.0, + "56": 6186508288.0, + "57": 6186508288.0, + "58": 6186508288.0, + "59": 6186508288.0, + "60": 6186508288.0, + "61": 6186508288.0, + "62": 6186508288.0, + "63": 6186508288.0, + "64": 6186508288.0, + "65": 6186508288.0, + "66": 6186508288.0, + "67": 6186508288.0, + "68": 6186508288.0, + "69": 6186508288.0, + "70": 6186508288.0, + "71": 6186508288.0, + "72": 6186508288.0, + "73": 6186508288.0, + "74": 6186508288.0, + "75": 6186508288.0, + "76": 6186508288.0, + "77": 6186508288.0, + "78": 6186508288.0, + "79": 6186508288.0, + "80": 6186508288.0, + "81": 6186508288.0, + "82": 6186508288.0, + "83": 6186508288.0, + "84": 6186508288.0, + "85": 6186508288.0, + "86": 6186508288.0, + "87": 6186508288.0, + "88": 6186508288.0, + "89": 6186508288.0, + "90": 6186508288.0, + "91": 6186508288.0, + "92": 6186508288.0, + "93": 6186508288.0, + "94": 6186508288.0, + "95": 6186508288.0, + "96": 6186508288.0, + "97": 6186508288.0, + "98": 6186508288.0, + "99": 6186508288.0, + "100": 6186508288.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 8.21684, + "2": 0.37772, + "3": 0.23303, + "4": 0.23009, + "5": 0.22929, + "6": 0.22867, + "7": 0.22881, + "8": 0.22909, + "9": 0.22901, + "10": 0.22924, + "11": 0.23187, + "12": 0.22897, + "13": 0.23042, + "14": 0.2296, + "15": 0.22858, + "16": 0.22859, + "17": 0.22788, + "18": 0.22827, + "19": 0.22884, + "20": 0.23119, + "21": 0.23125, + "22": 0.22876, + "23": 0.22795, + "24": 0.22894, + "25": 0.22857, + "26": 0.22882, + "27": 0.22865, + "28": 0.22894, + "29": 0.22835, + "30": 0.23042, + "31": 0.22904, + "32": 0.23034, + "33": 0.22865, + "34": 0.22876, + "35": 0.22767, + "36": 0.23145, + "37": 0.22819, + "38": 0.22929, + "39": 0.23937, + "40": 0.23013, + "41": 0.23989, + "42": 0.25348, + "43": 0.23486, + "44": 0.23088, + "45": 0.23068, + "46": 0.22861, + "47": 0.22901, + "48": 0.23829, + "49": 0.23037, + "50": 0.23633, + "51": 0.23085, + "52": 0.22798, + "53": 0.22797, + "54": 0.22841, + "55": 0.23845, + "56": 0.2312, + "57": 0.23463, + "58": 0.23191, + "59": 0.23051, + "60": 0.23189, + "61": 0.23338, + "62": 0.2342, + "63": 0.24812, + "64": 0.23433, + "65": 0.23118, + "66": 0.23175, + "67": 0.2309, + "68": 0.23178, + "69": 0.23371, + "70": 0.24569, + "71": 0.23723, + "72": 0.23422, + "73": 0.23146, + "74": 0.23179, + "75": 0.23182, + "76": 0.23205, + "77": 0.23407, + "78": 0.23174, + "79": 0.23271, + "80": 0.23234, + "81": 0.23065, + "82": 0.23148, + "83": 0.23229, + "84": 0.23128, + "85": 0.23341, + "86": 0.23319, + "87": 0.23195, + "88": 0.23228, + "89": 0.23287, + "90": 0.2318, + "91": 0.23237, + "92": 0.23164, + "93": 0.2304, + "94": 0.23017, + "95": 0.23214, + "96": 0.23143, + "97": 0.23171, + "98": 0.23065, + "99": 0.23302, + "100": 0.23775 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..80c9681e5c3 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.34494, + "2": 10.36431, + "3": 9.73158, + "4": 9.57928, + "5": 9.38931, + "6": 9.41074, + "7": 9.30545, + "8": 9.24868, + "9": 9.09349, + "10": 9.01569, + "11": 8.86286, + "12": 8.79096, + "13": 8.80892, + "14": 8.67669, + "15": 8.64631, + "16": 8.5398, + "17": 8.47895, + "18": 8.38945, + "19": 8.36156, + "20": 8.26966, + "21": 8.26333, + "22": 8.15066, + "23": 8.08893, + "24": 8.12421, + "25": 7.99493, + "26": 8.08494, + "27": 7.87755, + "28": 7.95863, + "29": 7.79585, + "30": 7.87492, + "31": 7.83245, + "32": 7.69489, + "33": 7.78469, + "34": 7.55767, + "35": 7.65834, + "36": 7.52881, + "37": 7.44912, + "38": 7.50398, + "39": 7.48056, + "40": 7.50302, + "41": 7.39767, + "42": 7.37206, + "43": 7.44301, + "44": 7.3811, + "45": 7.36143, + "46": 7.29415, + "47": 7.47498, + "48": 7.29564, + "49": 7.36092, + "50": 7.19205, + "51": 7.38769, + "52": 7.13773, + "53": 7.125, + "54": 7.23668, + "55": 7.16852, + "56": 7.22884, + "57": 7.34699, + "58": 7.03128, + "59": 7.1229, + "60": 7.16587, + "61": 7.1174, + "62": 7.26837, + "63": 7.16759, + "64": 7.08376, + "65": 7.00099, + "66": 7.07203, + "67": 7.05971, + "68": 7.14618, + "69": 7.03944, + "70": 7.07162, + "71": 6.91653, + "72": 7.02025, + "73": 6.9904, + "74": 6.9146, + "75": 7.07611, + "76": 6.97098, + "77": 7.08446, + "78": 7.03608, + "79": 6.88325, + "80": 6.95251, + "81": 6.985, + "82": 7.06843, + "83": 7.00882, + "84": 7.0181, + "85": 6.8641, + "86": 7.04979, + "87": 6.99342, + "88": 6.9238, + "89": 6.82406, + "90": 7.25457, + "91": 6.7226, + "92": 7.05372, + "93": 6.91688, + "94": 7.066, + "95": 6.8601, + "96": 6.98742, + "97": 6.96796, + "98": 6.89964, + "99": 7.02766, + "100": 6.99745 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43305.0, + "2": 44091.0, + "3": 44794.0, + "4": 42436.0, + "5": 45413.0, + "6": 40989.0, + "7": 43195.0, + "8": 45462.0, + "9": 42551.0, + "10": 45379.0, + "11": 44016.0, + "12": 44629.0, + "13": 43937.0, + "14": 46250.0, + "15": 43956.0, + "16": 41728.0, + "17": 43873.0, + "18": 44716.0, + "19": 42648.0, + "20": 44818.0, + "21": 44812.0, + "22": 41883.0, + "23": 45468.0, + "24": 43112.0, + "25": 42745.0, + "26": 43949.0, + "27": 46268.0, + "28": 46429.0, + "29": 46199.0, + "30": 44042.0, + "31": 41264.0, + "32": 43413.0, + "33": 45478.0, + "34": 43375.0, + "35": 43297.0, + "36": 42545.0, + "37": 40689.0, + "38": 42575.0, + "39": 44772.0, + "40": 43251.0, + "41": 44707.0, + "42": 43261.0, + "43": 45506.0, + "44": 44652.0, + "45": 43345.0, + "46": 43935.0, + "47": 42506.0, + "48": 44693.0, + "49": 43200.0, + "50": 43415.0, + "51": 41174.0, + "52": 43885.0, + "53": 43959.0, + "54": 41961.0, + "55": 43960.0, + "56": 43269.0, + "57": 42561.0, + "58": 43898.0, + "59": 44654.0, + "60": 41326.0, + "61": 39744.0, + "62": 44774.0, + "63": 44682.0, + "64": 45396.0, + "65": 44730.0, + "66": 45388.0, + "67": 43196.0, + "68": 42556.0, + "69": 43825.0, + "70": 45543.0, + "71": 43407.0, + "72": 44832.0, + "73": 45412.0, + "74": 42502.0, + "75": 44684.0, + "76": 43926.0, + "77": 42100.0, + "78": 40525.0, + "79": 38954.0, + "80": 41118.0, + "81": 45412.0, + "82": 43238.0, + "83": 38495.0, + "84": 42524.0, + "85": 44024.0, + "86": 45749.0, + "87": 41116.0, + "88": 41798.0, + "89": 41078.0, + "90": 44744.0, + "91": 46266.0, + "92": 41865.0, + "93": 43254.0, + "94": 39588.0, + "95": 44092.0, + "96": 44732.0, + "97": 45474.0, + "98": 41859.0, + "99": 45537.0, + "100": 42500.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.32163, + "2": 0.38506, + "3": 0.23264, + "4": 0.23088, + "5": 0.23265, + "6": 0.23173, + "7": 0.23126, + "8": 0.23038, + "9": 0.23084, + "10": 0.23209, + "11": 0.23149, + "12": 0.23231, + "13": 0.23319, + "14": 0.22867, + "15": 0.22812, + "16": 0.22793, + "17": 0.22839, + "18": 0.22788, + "19": 0.22802, + "20": 0.22831, + "21": 0.22863, + "22": 0.22778, + "23": 0.22775, + "24": 0.2276, + "25": 0.22851, + "26": 0.22788, + "27": 0.22874, + "28": 0.22765, + "29": 0.2281, + "30": 0.2293, + "31": 0.22952, + "32": 0.22888, + "33": 0.22916, + "34": 0.22869, + "35": 0.22859, + "36": 0.22919, + "37": 0.22959, + "38": 0.22853, + "39": 0.22896, + "40": 0.22961, + "41": 0.22873, + "42": 0.22928, + "43": 0.22982, + "44": 0.22937, + "45": 0.22999, + "46": 0.22841, + "47": 0.23003, + "48": 0.22906, + "49": 0.23037, + "50": 0.22982, + "51": 0.23126, + "52": 0.22892, + "53": 0.23322, + "54": 0.22861, + "55": 0.23475, + "56": 0.22765, + "57": 0.23073, + "58": 0.22912, + "59": 0.23304, + "60": 0.23302, + "61": 0.23295, + "62": 0.23275, + "63": 0.23408, + "64": 0.234, + "65": 0.23292, + "66": 0.22871, + "67": 0.23056, + "68": 0.22829, + "69": 0.23494, + "70": 0.22853, + "71": 0.23538, + "72": 0.23311, + "73": 0.23976, + "74": 0.23226, + "75": 0.22923, + "76": 0.23951, + "77": 0.23749, + "78": 0.22838, + "79": 0.22723, + "80": 0.22612, + "81": 0.22628, + "82": 0.22606, + "83": 0.22681, + "84": 0.23292, + "85": 0.22707, + "86": 0.22686, + "87": 0.22866, + "88": 0.22831, + "89": 0.22841, + "90": 0.2279, + "91": 0.22948, + "92": 0.22866, + "93": 0.22908, + "94": 0.2282, + "95": 0.22949, + "96": 0.22803, + "97": 0.22905, + "98": 0.22804, + "99": 0.22947, + "100": 0.22895 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f1c0511f9d6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, + "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, + "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, + "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, + "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, + "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, + "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, + "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, + "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, + "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, + "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, + "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, + "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, + "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, + "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, + "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, + "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, + "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, + "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, + "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, + "100": 6.98466 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, + "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, + "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, + "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, + "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, + "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, + "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, + "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, + "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, + "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, + "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, + "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, + "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, + "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, + "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, + "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, + "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, + "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, + "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, + "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, + "100": 42485.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.44745, + "2": 0.28877, + "3": 0.13863, + "4": 0.13991, + "5": 0.1386, + "6": 0.1688, + "7": 0.13897, + "8": 0.14655, + "9": 0.14408, + "10": 0.14011, + "11": 0.14086, + "12": 0.13894, + "13": 0.13997, + "14": 0.15002, + "15": 0.14424, + "16": 0.14057, + "17": 0.13971, + "18": 0.14204, + "19": 0.13911, + "20": 0.13847, + "21": 0.1511, + "22": 0.1466, + "23": 0.13965, + "24": 0.13912, + "25": 0.1401, + "26": 0.13945, + "27": 0.13889, + "28": 0.14975, + "29": 0.14768, + "30": 0.14096, + "31": 0.1397, + "32": 0.13848, + "33": 0.14003, + "34": 0.13906, + "35": 0.15106, + "36": 0.14946, + "37": 0.13936, + "38": 0.13863, + "39": 0.13854, + "40": 0.13912, + "41": 0.13768, + "42": 0.16204, + "43": 0.14058, + "44": 0.14047, + "45": 0.14051, + "46": 0.13844, + "47": 0.14085, + "48": 0.14712, + "49": 0.14538, + "50": 0.14262, + "51": 0.14224, + "52": 0.14099, + "53": 0.14182, + "54": 0.14142, + "55": 0.14151, + "56": 0.17071, + "57": 0.16514, + "58": 0.14109, + "59": 0.14613, + "60": 0.13996, + "61": 0.1438, + "62": 0.1439, + "63": 0.1704, + "64": 0.17016, + "65": 0.14013, + "66": 0.1408, + "67": 0.14073, + "68": 0.14112, + "69": 0.14885, + "70": 0.15051, + "71": 0.1459, + "72": 0.14741, + "73": 0.14647, + "74": 0.14559, + "75": 0.14518, + "76": 0.14651, + "77": 0.18065, + "78": 0.17614, + "79": 0.14661, + "80": 0.14187, + "81": 0.14198, + "82": 0.13988, + "83": 0.14058, + "84": 0.14152, + "85": 0.14263, + "86": 0.14317, + "87": 0.14179, + "88": 0.14281, + "89": 0.13999, + "90": 0.14469, + "91": 0.142, + "92": 0.14198, + "93": 0.14441, + "94": 0.14544, + "95": 0.14559, + "96": 0.14352, + "97": 0.14163, + "98": 0.14642, + "99": 0.14323, + "100": 0.14598 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..e0a55371afb --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.33127, + "2": 10.35281, + "3": 9.79613, + "4": 9.60968, + "5": 9.42269, + "6": 9.45137, + "7": 9.34348, + "8": 9.27525, + "9": 9.09676, + "10": 9.0722, + "11": 8.8835, + "12": 8.83711, + "13": 8.86836, + "14": 8.71039, + "15": 8.68191, + "16": 8.56149, + "17": 8.52311, + "18": 8.43963, + "19": 8.40439, + "20": 8.29506, + "21": 8.27059, + "22": 8.17902, + "23": 8.12669, + "24": 8.14846, + "25": 7.9909, + "26": 8.12216, + "27": 7.90453, + "28": 7.98655, + "29": 7.80845, + "30": 7.86918, + "31": 7.83571, + "32": 7.72178, + "33": 7.80378, + "34": 7.59229, + "35": 7.68371, + "36": 7.53883, + "37": 7.47609, + "38": 7.5168, + "39": 7.49978, + "40": 7.51704, + "41": 7.43174, + "42": 7.40104, + "43": 7.44926, + "44": 7.38919, + "45": 7.38016, + "46": 7.29476, + "47": 7.44829, + "48": 7.28213, + "49": 7.34657, + "50": 7.17116, + "51": 7.37361, + "52": 7.13381, + "53": 7.11244, + "54": 7.23402, + "55": 7.14785, + "56": 7.22775, + "57": 7.33273, + "58": 6.99461, + "59": 7.11599, + "60": 7.13222, + "61": 7.1056, + "62": 7.26513, + "63": 7.14772, + "64": 7.08696, + "65": 6.98643, + "66": 7.04728, + "67": 7.04697, + "68": 7.14062, + "69": 7.2435, + "70": 7.05957, + "71": 6.89356, + "72": 6.99769, + "73": 6.97897, + "74": 6.91983, + "75": 7.05297, + "76": 6.96036, + "77": 7.0791, + "78": 7.01392, + "79": 6.88358, + "80": 6.93014, + "81": 6.96553, + "82": 7.05265, + "83": 6.98788, + "84": 7.00427, + "85": 6.84577, + "86": 7.03621, + "87": 6.96327, + "88": 6.9137, + "89": 6.80631, + "90": 7.23619, + "91": 6.70015, + "92": 7.05679, + "93": 6.89287, + "94": 7.05835, + "95": 6.84786, + "96": 6.96771, + "97": 6.94258, + "98": 6.87388, + "99": 7.01816, + "100": 6.98466 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43334.0, + "2": 44100.0, + "3": 44771.0, + "4": 42457.0, + "5": 45411.0, + "6": 40966.0, + "7": 43193.0, + "8": 45457.0, + "9": 42550.0, + "10": 45360.0, + "11": 44029.0, + "12": 44605.0, + "13": 43917.0, + "14": 46219.0, + "15": 43943.0, + "16": 41732.0, + "17": 43861.0, + "18": 44721.0, + "19": 42597.0, + "20": 44797.0, + "21": 44792.0, + "22": 41891.0, + "23": 45473.0, + "24": 43081.0, + "25": 42682.0, + "26": 43950.0, + "27": 46253.0, + "28": 46447.0, + "29": 46164.0, + "30": 44042.0, + "31": 41263.0, + "32": 43440.0, + "33": 45483.0, + "34": 43349.0, + "35": 43273.0, + "36": 42490.0, + "37": 40647.0, + "38": 42549.0, + "39": 44766.0, + "40": 43281.0, + "41": 44669.0, + "42": 43287.0, + "43": 45454.0, + "44": 44627.0, + "45": 43353.0, + "46": 43925.0, + "47": 42498.0, + "48": 44758.0, + "49": 43173.0, + "50": 43402.0, + "51": 41198.0, + "52": 43900.0, + "53": 43938.0, + "54": 41922.0, + "55": 43916.0, + "56": 43237.0, + "57": 42634.0, + "58": 43916.0, + "59": 44616.0, + "60": 41414.0, + "61": 39759.0, + "62": 44750.0, + "63": 44673.0, + "64": 45378.0, + "65": 44765.0, + "66": 45401.0, + "67": 43155.0, + "68": 42552.0, + "69": 43831.0, + "70": 45546.0, + "71": 43332.0, + "72": 44847.0, + "73": 45376.0, + "74": 42503.0, + "75": 44704.0, + "76": 43916.0, + "77": 42101.0, + "78": 40543.0, + "79": 38997.0, + "80": 41079.0, + "81": 45377.0, + "82": 43254.0, + "83": 38473.0, + "84": 42420.0, + "85": 43989.0, + "86": 45694.0, + "87": 41164.0, + "88": 41773.0, + "89": 41047.0, + "90": 44710.0, + "91": 46274.0, + "92": 41823.0, + "93": 43286.0, + "94": 39530.0, + "95": 44074.0, + "96": 44686.0, + "97": 45424.0, + "98": 41849.0, + "99": 45567.0, + "100": 42485.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4158515200.0, + "2": 4158515200.0, + "3": 4158515200.0, + "4": 4158515200.0, + "5": 4158515200.0, + "6": 4158515200.0, + "7": 4158515200.0, + "8": 4158515200.0, + "9": 4158515200.0, + "10": 4158515200.0, + "11": 4158515200.0, + "12": 4158515200.0, + "13": 4158515200.0, + "14": 4158515200.0, + "15": 4158515200.0, + "16": 4158515200.0, + "17": 4158515200.0, + "18": 4158515200.0, + "19": 4158515200.0, + "20": 4158515200.0, + "21": 4158515200.0, + "22": 4158515200.0, + "23": 4158515200.0, + "24": 4158515200.0, + "25": 4158515200.0, + "26": 4158515200.0, + "27": 4158515200.0, + "28": 4158515200.0, + "29": 4158515200.0, + "30": 4158515200.0, + "31": 4158515200.0, + "32": 4158515200.0, + "33": 4158515200.0, + "34": 4158515200.0, + "35": 4158515200.0, + "36": 4158515200.0, + "37": 4158515200.0, + "38": 4158515200.0, + "39": 4158515200.0, + "40": 4158515200.0, + "41": 4158515200.0, + "42": 4158515200.0, + "43": 4158515200.0, + "44": 4158515200.0, + "45": 4158515200.0, + "46": 4158515200.0, + "47": 4158515200.0, + "48": 4158515200.0, + "49": 4158515200.0, + "50": 4158515200.0, + "51": 4158515200.0, + "52": 4158515200.0, + "53": 4158515200.0, + "54": 4158515200.0, + "55": 4158515200.0, + "56": 4158515200.0, + "57": 4158515200.0, + "58": 4158515200.0, + "59": 4158515200.0, + "60": 4158515200.0, + "61": 4158515200.0, + "62": 4158515200.0, + "63": 4158515200.0, + "64": 4158515200.0, + "65": 4158515200.0, + "66": 4158515200.0, + "67": 4158515200.0, + "68": 4158515200.0, + "69": 4158515200.0, + "70": 4158515200.0, + "71": 4158515200.0, + "72": 4158515200.0, + "73": 4158515200.0, + "74": 4158515200.0, + "75": 4158515200.0, + "76": 4158515200.0, + "77": 4158515200.0, + "78": 4158515200.0, + "79": 4158515200.0, + "80": 4158515200.0, + "81": 4158515200.0, + "82": 4158515200.0, + "83": 4158515200.0, + "84": 4158515200.0, + "85": 4158515200.0, + "86": 4158515200.0, + "87": 4158515200.0, + "88": 4158515200.0, + "89": 4158515200.0, + "90": 4158515200.0, + "91": 4158515200.0, + "92": 4158515200.0, + "93": 4158515200.0, + "94": 4158515200.0, + "95": 4158515200.0, + "96": 4158515200.0, + "97": 4158515200.0, + "98": 4158515200.0, + "99": 4158515200.0, + "100": 4158515200.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 4349380608.0, + "2": 6185459712.0, + "3": 6187556864.0, + "4": 6187556864.0, + "5": 6187556864.0, + "6": 6187556864.0, + "7": 6187556864.0, + "8": 6187556864.0, + "9": 6187556864.0, + "10": 6187556864.0, + "11": 6187556864.0, + "12": 6187556864.0, + "13": 6187556864.0, + "14": 6187556864.0, + "15": 6187556864.0, + "16": 6187556864.0, + "17": 6187556864.0, + "18": 6187556864.0, + "19": 6187556864.0, + "20": 6187556864.0, + "21": 6187556864.0, + "22": 6187556864.0, + "23": 6187556864.0, + "24": 6187556864.0, + "25": 6187556864.0, + "26": 6187556864.0, + "27": 6187556864.0, + "28": 6187556864.0, + "29": 6187556864.0, + "30": 6187556864.0, + "31": 6187556864.0, + "32": 6187556864.0, + "33": 6187556864.0, + "34": 6187556864.0, + "35": 6187556864.0, + "36": 6187556864.0, + "37": 6187556864.0, + "38": 6187556864.0, + "39": 6187556864.0, + "40": 6187556864.0, + "41": 6187556864.0, + "42": 6187556864.0, + "43": 6187556864.0, + "44": 6187556864.0, + "45": 6187556864.0, + "46": 6187556864.0, + "47": 6187556864.0, + "48": 6187556864.0, + "49": 6187556864.0, + "50": 6187556864.0, + "51": 6187556864.0, + "52": 6187556864.0, + "53": 6187556864.0, + "54": 6187556864.0, + "55": 6187556864.0, + "56": 6187556864.0, + "57": 6187556864.0, + "58": 6187556864.0, + "59": 6187556864.0, + "60": 6187556864.0, + "61": 6187556864.0, + "62": 6187556864.0, + "63": 6187556864.0, + "64": 6187556864.0, + "65": 6187556864.0, + "66": 6187556864.0, + "67": 6187556864.0, + "68": 6187556864.0, + "69": 6187556864.0, + "70": 6187556864.0, + "71": 6187556864.0, + "72": 6187556864.0, + "73": 6187556864.0, + "74": 6187556864.0, + "75": 6187556864.0, + "76": 6187556864.0, + "77": 6187556864.0, + "78": 6187556864.0, + "79": 6187556864.0, + "80": 6187556864.0, + "81": 6187556864.0, + "82": 6187556864.0, + "83": 6187556864.0, + "84": 6187556864.0, + "85": 6187556864.0, + "86": 6187556864.0, + "87": 6187556864.0, + "88": 6187556864.0, + "89": 6187556864.0, + "90": 6187556864.0, + "91": 6187556864.0, + "92": 6187556864.0, + "93": 6187556864.0, + "94": 6187556864.0, + "95": 6187556864.0, + "96": 6187556864.0, + "97": 6187556864.0, + "98": 6187556864.0, + "99": 6187556864.0, + "100": 6187556864.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 6.98463, + "2": 0.19558, + "3": 0.15734, + "4": 0.15695, + "5": 0.15774, + "6": 0.15468, + "7": 0.15373, + "8": 0.15721, + "9": 0.15375, + "10": 0.15555, + "11": 0.15762, + "12": 0.15358, + "13": 0.15446, + "14": 0.15343, + "15": 0.15567, + "16": 0.15597, + "17": 0.19986, + "18": 0.19685, + "19": 0.15757, + "20": 0.16418, + "21": 0.1662, + "22": 0.1633, + "23": 0.15542, + "24": 0.16131, + "25": 0.15713, + "26": 0.16116, + "27": 0.15731, + "28": 0.16645, + "29": 0.1581, + "30": 0.16334, + "31": 0.15469, + "32": 0.1607, + "33": 0.15565, + "34": 0.16369, + "35": 0.15592, + "36": 0.16404, + "37": 0.15034, + "38": 0.15864, + "39": 0.15017, + "40": 0.1607, + "41": 0.15387, + "42": 0.17077, + "43": 0.15397, + "44": 0.1563, + "45": 0.15512, + "46": 0.16115, + "47": 0.15635, + "48": 0.16292, + "49": 0.15581, + "50": 0.16402, + "51": 0.15457, + "52": 0.16232, + "53": 0.156, + "54": 0.16433, + "55": 0.15283, + "56": 0.19434, + "57": 0.19273, + "58": 0.15955, + "59": 0.15405, + "60": 0.15503, + "61": 0.15418, + "62": 0.15446, + "63": 0.15778, + "64": 0.1578, + "65": 0.16024, + "66": 0.15656, + "67": 0.15524, + "68": 0.15394, + "69": 0.16041, + "70": 0.16082, + "71": 0.16503, + "72": 0.16142, + "73": 0.16242, + "74": 0.15995, + "75": 0.15816, + "76": 0.16199, + "77": 0.16827, + "78": 0.15987, + "79": 0.15797, + "80": 0.15617, + "81": 0.15308, + "82": 0.15484, + "83": 0.15382, + "84": 0.16856, + "85": 0.15976, + "86": 0.15794, + "87": 0.15409, + "88": 0.15333, + "89": 0.15511, + "90": 0.15333, + "91": 0.17162, + "92": 0.15418, + "93": 0.15421, + "94": 0.15169, + "95": 0.15479, + "96": 0.15268, + "97": 0.1552, + "98": 0.1575, + "99": 0.15403, + "100": 0.15379 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json index 3ab4415923d..b7f4830a0c8 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json @@ -2,141 +2,536 @@ "lm loss": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 10.372, + "2": 10.37006, + "3": 9.85232, + "4": 9.61996, "5": 9.40868, + "6": 9.43215, + "7": 9.31482, + "8": 9.27336, + "9": 9.1139, "10": 9.03962, + "11": 8.87218, + "12": 8.80873, + "13": 8.83468, + "14": 8.69011, "15": 8.66228, + "16": 8.54828, + "17": 8.50093, + "18": 8.42525, + "19": 8.3881, "20": 8.2807, + "21": 8.26609, + "22": 8.16003, + "23": 8.1124, + "24": 8.14262, "25": 7.98432, + "26": 8.10592, + "27": 7.88963, + "28": 7.97037, + "29": 7.81276, "30": 7.87638, + "31": 7.82516, + "32": 7.70248, + "33": 7.80198, + "34": 7.56872, "35": 7.67379, + "36": 7.54691, + "37": 7.47408, + "38": 7.50739, + "39": 7.49773, "40": 7.51091, + "41": 7.41065, + "42": 7.37995, + "43": 7.44078, + "44": 7.39393, "45": 7.37239, + "46": 7.28427, + "47": 7.46631, + "48": 7.2905, + "49": 7.35025, "50": 7.17204, + "51": 7.37012, + "52": 7.14467, + "53": 7.12652, + "54": 7.23751, "55": 7.15586, + "56": 7.23154, + "57": 7.33541, + "58": 7.01363, + "59": 7.11431, "60": 7.15121, + "61": 7.10904, + "62": 7.26834, + "63": 7.15176, + "64": 7.08415, "65": 6.99114, + "66": 7.05301, + "67": 7.04354, + "68": 7.1398, + "69": 7.03224, "70": 7.05832, + "71": 6.90372, + "72": 6.99794, + "73": 6.9769, + "74": 6.91759, "75": 7.06626, + "76": 6.95758, + "77": 7.0871, + "78": 7.03238, + "79": 6.85274, "80": 6.93633, + "81": 6.97617, + "82": 7.06196, + "83": 6.98213, + "84": 7.00931, "85": 6.85082, + "86": 7.04673, + "87": 6.97907, + "88": 6.91096, + "89": 6.81719, "90": 7.2459, + "91": 6.7046, + "92": 7.05377, + "93": 6.89397, + "94": 7.0542, "95": 6.85031, + "96": 6.96441, + "97": 6.95632, + "98": 6.88246, + "99": 7.00392, "100": 6.98993 } }, "num-zeros": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 43299.0, + "2": 44047.0, + "3": 44744.0, + "4": 42405.0, "5": 45385.0, + "6": 40946.0, + "7": 43183.0, + "8": 45446.0, + "9": 42445.0, "10": 45361.0, + "11": 43966.0, + "12": 44593.0, + "13": 43907.0, + "14": 46210.0, "15": 43904.0, + "16": 41614.0, + "17": 43840.0, + "18": 44687.0, + "19": 42536.0, "20": 44746.0, + "21": 44767.0, + "22": 41831.0, + "23": 45449.0, + "24": 43072.0, "25": 42457.0, + "26": 43921.0, + "27": 46208.0, + "28": 46361.0, + "29": 46146.0, "30": 43976.0, + "31": 41272.0, + "32": 43348.0, + "33": 45431.0, + "34": 43295.0, "35": 43264.0, + "36": 42493.0, + "37": 40075.0, + "38": 42518.0, + "39": 44713.0, "40": 43230.0, + "41": 44666.0, + "42": 43251.0, + "43": 45471.0, + "44": 44600.0, "45": 43330.0, + "46": 43932.0, + "47": 42400.0, + "48": 44673.0, + "49": 43149.0, "50": 43373.0, + "51": 41142.0, + "52": 43824.0, + "53": 43917.0, + "54": 42023.0, "55": 43883.0, + "56": 43235.0, + "57": 42536.0, + "58": 43829.0, + "59": 44648.0, "60": 41187.0, + "61": 39720.0, + "62": 44740.0, + "63": 44690.0, + "64": 45358.0, "65": 44695.0, + "66": 45364.0, + "67": 43138.0, + "68": 42538.0, + "69": 43820.0, "70": 45549.0, + "71": 43324.0, + "72": 44760.0, + "73": 45363.0, + "74": 42473.0, "75": 44666.0, + "76": 43903.0, + "77": 42082.0, + "78": 40295.0, + "79": 38890.0, "80": 41131.0, + "81": 45363.0, + "82": 43206.0, + "83": 38487.0, + "84": 42462.0, "85": 43985.0, + "86": 45695.0, + "87": 40826.0, + "88": 41822.0, + "89": 41069.0, "90": 44664.0, + "91": 46170.0, + "92": 41797.0, + "93": 43208.0, + "94": 39552.0, "95": 44106.0, + "96": 44697.0, + "97": 45398.0, + "98": 41792.0, + "99": 45429.0, "100": 42437.0 } }, "mem-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2171550208.0, + "2": 2171550208.0, + "3": 2171550208.0, + "4": 2171550208.0, "5": 2171550208.0, + "6": 2171550208.0, + "7": 2171550208.0, + "8": 2171550208.0, + "9": 2171550208.0, "10": 2171550208.0, + "11": 2171550208.0, + "12": 2171550208.0, + "13": 2171550208.0, + "14": 2171550208.0, "15": 2171550208.0, + "16": 2171550208.0, + "17": 2171550208.0, + "18": 2171550208.0, + "19": 2171550208.0, "20": 2171550208.0, + "21": 2171550208.0, + "22": 2171550208.0, + "23": 2171550208.0, + "24": 2171550208.0, "25": 2171550208.0, + "26": 2171550208.0, + "27": 2171550208.0, + "28": 2171550208.0, + "29": 2171550208.0, "30": 2171550208.0, + "31": 2171550208.0, + "32": 2171550208.0, + "33": 2171550208.0, + "34": 2171550208.0, "35": 2171550208.0, + "36": 2171550208.0, + "37": 2171550208.0, + "38": 2171550208.0, + "39": 2171550208.0, "40": 2171550208.0, + "41": 2171550208.0, + "42": 2171550208.0, + "43": 2171550208.0, + "44": 2171550208.0, "45": 2171550208.0, + "46": 2171550208.0, + "47": 2171550208.0, + "48": 2171550208.0, + "49": 2171550208.0, "50": 2171550208.0, + "51": 2171550208.0, + "52": 2171550208.0, + "53": 2171550208.0, + "54": 2171550208.0, "55": 2171550208.0, + "56": 2171550208.0, + "57": 2171550208.0, + "58": 2171550208.0, + "59": 2171550208.0, "60": 2171550208.0, + "61": 2171550208.0, + "62": 2171550208.0, + "63": 2171550208.0, + "64": 2171550208.0, "65": 2171550208.0, + "66": 2171550208.0, + "67": 2171550208.0, + "68": 2171550208.0, + "69": 2171550208.0, "70": 2171550208.0, + "71": 2171550208.0, + "72": 2171550208.0, + "73": 2171550208.0, + "74": 2171550208.0, "75": 2171550208.0, + "76": 2171550208.0, + "77": 2171550208.0, + "78": 2171550208.0, + "79": 2171550208.0, "80": 2171550208.0, + "81": 2171550208.0, + "82": 2171550208.0, + "83": 2171550208.0, + "84": 2171550208.0, "85": 2171550208.0, + "86": 2171550208.0, + "87": 2171550208.0, + "88": 2171550208.0, + "89": 2171550208.0, "90": 2171550208.0, + "91": 2171550208.0, + "92": 2171550208.0, + "93": 2171550208.0, + "94": 2171550208.0, "95": 2171550208.0, + "96": 2171550208.0, + "97": 2171550208.0, + "98": 2171550208.0, + "99": 2171550208.0, "100": 2171550208.0 } }, "mem-max-allocated-bytes": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { "1": 2407642624.0, + "2": 3336458752.0, + "3": 3336458752.0, + "4": 3336458752.0, "5": 3336458752.0, + "6": 3336458752.0, + "7": 3336458752.0, + "8": 3336458752.0, + "9": 3336458752.0, "10": 3336458752.0, + "11": 3336458752.0, + "12": 3336458752.0, + "13": 3336458752.0, + "14": 3336458752.0, "15": 3336458752.0, + "16": 3336458752.0, + "17": 3336458752.0, + "18": 3336458752.0, + "19": 3336458752.0, "20": 3336458752.0, + "21": 3336458752.0, + "22": 3336458752.0, + "23": 3336458752.0, + "24": 3336458752.0, "25": 3336458752.0, + "26": 3336458752.0, + "27": 3336458752.0, + "28": 3336458752.0, + "29": 3336458752.0, "30": 3336458752.0, + "31": 3336458752.0, + "32": 3336458752.0, + "33": 3336458752.0, + "34": 3336458752.0, "35": 3336458752.0, + "36": 3336458752.0, + "37": 3336458752.0, + "38": 3336458752.0, + "39": 3336458752.0, "40": 3336458752.0, + "41": 3336458752.0, + "42": 3336458752.0, + "43": 3336458752.0, + "44": 3336458752.0, "45": 3336458752.0, + "46": 3336458752.0, + "47": 3336458752.0, + "48": 3336458752.0, + "49": 3336458752.0, "50": 3336458752.0, + "51": 3336458752.0, + "52": 3336458752.0, + "53": 3336458752.0, + "54": 3336458752.0, "55": 3336458752.0, + "56": 3336458752.0, + "57": 3336458752.0, + "58": 3336458752.0, + "59": 3336458752.0, "60": 3336458752.0, + "61": 3336458752.0, + "62": 3336458752.0, + "63": 3336458752.0, + "64": 3336458752.0, "65": 3336458752.0, + "66": 3336458752.0, + "67": 3336458752.0, + "68": 3336458752.0, + "69": 3336458752.0, "70": 3336458752.0, + "71": 3336458752.0, + "72": 3336458752.0, + "73": 3336458752.0, + "74": 3336458752.0, "75": 3336458752.0, + "76": 3336458752.0, + "77": 3336458752.0, + "78": 3336458752.0, + "79": 3336458752.0, "80": 3336458752.0, + "81": 3336458752.0, + "82": 3336458752.0, + "83": 3336458752.0, + "84": 3336458752.0, "85": 3336458752.0, + "86": 3336458752.0, + "87": 3336458752.0, + "88": 3336458752.0, + "89": 3336458752.0, "90": 3336458752.0, + "91": 3336458752.0, + "92": 3336458752.0, + "93": 3336458752.0, + "94": 3336458752.0, "95": 3336458752.0, + "96": 3336458752.0, + "97": 3336458752.0, + "98": 3336458752.0, + "99": 3336458752.0, "100": 3336458752.0 } }, "iteration-time": { "start_step": 1, "end_step": 100, - "step_interval": 5, + "step_interval": 1, "values": { - "1": 9.63895, - "5": 0.26386, - "10": 0.26904, - "15": 0.26572, - "20": 0.2594, - "25": 0.25916, - "30": 0.25941, - "35": 0.34452, - "40": 0.26089, - "45": 0.26208, - "50": 0.25808, - "55": 0.26854, - "60": 0.25663, - "65": 0.25854, - "70": 0.25853, - "75": 0.25618, - "80": 0.25673, - "85": 0.25977, - "90": 0.25957, - "95": 0.26011, - "100": 0.25873 + "1": 9.03109, + "2": 0.35076, + "3": 0.33208, + "4": 0.30024, + "5": 0.29051, + "6": 0.29151, + "7": 0.2915, + "8": 0.29069, + "9": 0.28128, + "10": 0.28633, + "11": 0.28968, + "12": 0.29187, + "13": 0.28737, + "14": 0.28701, + "15": 0.29554, + "16": 0.28451, + "17": 0.28904, + "18": 0.28765, + "19": 0.2927, + "20": 0.29433, + "21": 0.28956, + "22": 0.28517, + "23": 0.29568, + "24": 0.29372, + "25": 0.28702, + "26": 0.27993, + "27": 0.28025, + "28": 0.28025, + "29": 0.28655, + "30": 0.28192, + "31": 0.28723, + "32": 0.29054, + "33": 0.29967, + "34": 0.28855, + "35": 0.31974, + "36": 0.32479, + "37": 0.28367, + "38": 0.29414, + "39": 0.30161, + "40": 0.29066, + "41": 0.2857, + "42": 0.29152, + "43": 0.28567, + "44": 0.28393, + "45": 0.29254, + "46": 0.28887, + "47": 0.29566, + "48": 0.2879, + "49": 0.28337, + "50": 0.28858, + "51": 0.28557, + "52": 0.28641, + "53": 0.28977, + "54": 0.28532, + "55": 0.28322, + "56": 0.2855, + "57": 0.29617, + "58": 0.28816, + "59": 0.28781, + "60": 0.28732, + "61": 0.28426, + "62": 0.29092, + "63": 0.29263, + "64": 0.28875, + "65": 0.28714, + "66": 0.29018, + "67": 0.28162, + "68": 0.28703, + "69": 0.29503, + "70": 0.29276, + "71": 0.2824, + "72": 0.29151, + "73": 0.29279, + "74": 0.28282, + "75": 0.28454, + "76": 0.28479, + "77": 0.28239, + "78": 0.28785, + "79": 0.29392, + "80": 0.28563, + "81": 0.282, + "82": 0.29276, + "83": 0.29502, + "84": 0.28441, + "85": 0.28063, + "86": 0.29172, + "87": 0.2867, + "88": 0.29629, + "89": 0.29585, + "90": 0.29326, + "91": 0.28326, + "92": 0.28263, + "93": 0.2913, + "94": 0.2943, + "95": 0.28216, + "96": 0.29001, + "97": 0.29031, + "98": 0.28912, + "99": 0.68367, + "100": 0.296 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json new file mode 100644 index 00000000000..a5713a081ad --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38854, + "2": 10.3937, + "3": 9.78105, + "4": 9.59731, + "5": 9.38095, + "6": 9.4057, + "7": 9.30785, + "8": 9.24107, + "9": 9.12192, + "10": 9.05714, + "11": 8.87325, + "12": 8.79368, + "13": 8.84026, + "14": 8.68518, + "15": 8.65603, + "16": 8.54372, + "17": 8.50113, + "18": 8.39001, + "19": 8.36443, + "20": 8.26193, + "21": 8.27097, + "22": 8.14406, + "23": 8.07467, + "24": 8.11915, + "25": 7.98192, + "26": 8.08777, + "27": 7.87148, + "28": 7.96511, + "29": 7.80258, + "30": 7.86937, + "31": 7.81742, + "32": 7.68788, + "33": 7.7805, + "34": 7.55497, + "35": 7.66279, + "36": 7.52257, + "37": 7.44455, + "38": 7.5026, + "39": 7.4504, + "40": 7.50083, + "41": 7.39053, + "42": 7.36073, + "43": 7.4333, + "44": 7.37641, + "45": 7.34894, + "46": 7.28171, + "47": 7.46122, + "48": 7.2877, + "49": 7.35375, + "50": 7.18147, + "51": 7.36608, + "52": 7.13343, + "53": 7.11575, + "54": 7.22932, + "55": 7.1542, + "56": 7.22261, + "57": 7.32969, + "58": 7.02356, + "59": 7.11377, + "60": 7.14734, + "61": 7.11404, + "62": 7.24755, + "63": 7.1568, + "64": 7.08414, + "65": 6.9972, + "66": 7.06074, + "67": 7.04881, + "68": 7.14167, + "69": 7.03482, + "70": 7.06009, + "71": 6.92578, + "72": 7.0043, + "73": 6.97965, + "74": 6.92276, + "75": 7.06086, + "76": 6.97271, + "77": 7.08186, + "78": 7.01883, + "79": 6.85524, + "80": 6.94306, + "81": 6.97637, + "82": 7.06676, + "83": 6.99984, + "84": 7.0089, + "85": 6.85989, + "86": 7.03607, + "87": 6.98072, + "88": 6.91508, + "89": 6.81068, + "90": 7.24967, + "91": 6.71006, + "92": 7.04916, + "93": 6.9057, + "94": 7.06458, + "95": 6.84836, + "96": 6.97667, + "97": 6.96312, + "98": 6.88704, + "99": 7.013, + "100": 6.98289 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43331.0, + "2": 44051.0, + "3": 44760.0, + "4": 42395.0, + "5": 45376.0, + "6": 40957.0, + "7": 43160.0, + "8": 45463.0, + "9": 42446.0, + "10": 45361.0, + "11": 43965.0, + "12": 44605.0, + "13": 43884.0, + "14": 46187.0, + "15": 43888.0, + "16": 41604.0, + "17": 43828.0, + "18": 44690.0, + "19": 42562.0, + "20": 44777.0, + "21": 44792.0, + "22": 41854.0, + "23": 45465.0, + "24": 43071.0, + "25": 42465.0, + "26": 43917.0, + "27": 46228.0, + "28": 46431.0, + "29": 46169.0, + "30": 43995.0, + "31": 41278.0, + "32": 43346.0, + "33": 45463.0, + "34": 43298.0, + "35": 43276.0, + "36": 42490.0, + "37": 40069.0, + "38": 42527.0, + "39": 44730.0, + "40": 43245.0, + "41": 44653.0, + "42": 43269.0, + "43": 45462.0, + "44": 44594.0, + "45": 43285.0, + "46": 43915.0, + "47": 42370.0, + "48": 44704.0, + "49": 43164.0, + "50": 43365.0, + "51": 41167.0, + "52": 43825.0, + "53": 43945.0, + "54": 41947.0, + "55": 43853.0, + "56": 43268.0, + "57": 42591.0, + "58": 43843.0, + "59": 44625.0, + "60": 41218.0, + "61": 39714.0, + "62": 44779.0, + "63": 44716.0, + "64": 45359.0, + "65": 44684.0, + "66": 45355.0, + "67": 43146.0, + "68": 42519.0, + "69": 43835.0, + "70": 45522.0, + "71": 43316.0, + "72": 44767.0, + "73": 45365.0, + "74": 42449.0, + "75": 44695.0, + "76": 43885.0, + "77": 42092.0, + "78": 40278.0, + "79": 38915.0, + "80": 41096.0, + "81": 45372.0, + "82": 43206.0, + "83": 38481.0, + "84": 42474.0, + "85": 43990.0, + "86": 45729.0, + "87": 40884.0, + "88": 41772.0, + "89": 41076.0, + "90": 44676.0, + "91": 46159.0, + "92": 41790.0, + "93": 43242.0, + "94": 39566.0, + "95": 44077.0, + "96": 44741.0, + "97": 45379.0, + "98": 41802.0, + "99": 45441.0, + "100": 42530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2171550208.0, + "2": 2171550208.0, + "3": 2171550208.0, + "4": 2171550208.0, + "5": 2171550208.0, + "6": 2171550208.0, + "7": 2171550208.0, + "8": 2171550208.0, + "9": 2171550208.0, + "10": 2171550208.0, + "11": 2171550208.0, + "12": 2171550208.0, + "13": 2171550208.0, + "14": 2171550208.0, + "15": 2171550208.0, + "16": 2171550208.0, + "17": 2171550208.0, + "18": 2171550208.0, + "19": 2171550208.0, + "20": 2171550208.0, + "21": 2171550208.0, + "22": 2171550208.0, + "23": 2171550208.0, + "24": 2171550208.0, + "25": 2171550208.0, + "26": 2171550208.0, + "27": 2171550208.0, + "28": 2171550208.0, + "29": 2171550208.0, + "30": 2171550208.0, + "31": 2171550208.0, + "32": 2171550208.0, + "33": 2171550208.0, + "34": 2171550208.0, + "35": 2171550208.0, + "36": 2171550208.0, + "37": 2171550208.0, + "38": 2171550208.0, + "39": 2171550208.0, + "40": 2171550208.0, + "41": 2171550208.0, + "42": 2171550208.0, + "43": 2171550208.0, + "44": 2171550208.0, + "45": 2171550208.0, + "46": 2171550208.0, + "47": 2171550208.0, + "48": 2171550208.0, + "49": 2171550208.0, + "50": 2171550208.0, + "51": 2171550208.0, + "52": 2171550208.0, + "53": 2171550208.0, + "54": 2171550208.0, + "55": 2171550208.0, + "56": 2171550208.0, + "57": 2171550208.0, + "58": 2171550208.0, + "59": 2171550208.0, + "60": 2171550208.0, + "61": 2171550208.0, + "62": 2171550208.0, + "63": 2171550208.0, + "64": 2171550208.0, + "65": 2171550208.0, + "66": 2171550208.0, + "67": 2171550208.0, + "68": 2171550208.0, + "69": 2171550208.0, + "70": 2171550208.0, + "71": 2171550208.0, + "72": 2171550208.0, + "73": 2171550208.0, + "74": 2171550208.0, + "75": 2171550208.0, + "76": 2171550208.0, + "77": 2171550208.0, + "78": 2171550208.0, + "79": 2171550208.0, + "80": 2171550208.0, + "81": 2171550208.0, + "82": 2171550208.0, + "83": 2171550208.0, + "84": 2171550208.0, + "85": 2171550208.0, + "86": 2171550208.0, + "87": 2171550208.0, + "88": 2171550208.0, + "89": 2171550208.0, + "90": 2171550208.0, + "91": 2171550208.0, + "92": 2171550208.0, + "93": 2171550208.0, + "94": 2171550208.0, + "95": 2171550208.0, + "96": 2171550208.0, + "97": 2171550208.0, + "98": 2171550208.0, + "99": 2171550208.0, + "100": 2171550208.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2407642624.0, + "2": 3336458752.0, + "3": 3336458752.0, + "4": 3336458752.0, + "5": 3336458752.0, + "6": 3336458752.0, + "7": 3336458752.0, + "8": 3336458752.0, + "9": 3336458752.0, + "10": 3336458752.0, + "11": 3336458752.0, + "12": 3336458752.0, + "13": 3336458752.0, + "14": 3336458752.0, + "15": 3336458752.0, + "16": 3336458752.0, + "17": 3336458752.0, + "18": 3336458752.0, + "19": 3336458752.0, + "20": 3336458752.0, + "21": 3336458752.0, + "22": 3336458752.0, + "23": 3336458752.0, + "24": 3336458752.0, + "25": 3336458752.0, + "26": 3336458752.0, + "27": 3336458752.0, + "28": 3336458752.0, + "29": 3336458752.0, + "30": 3336458752.0, + "31": 3336458752.0, + "32": 3336458752.0, + "33": 3336458752.0, + "34": 3336458752.0, + "35": 3336458752.0, + "36": 3336458752.0, + "37": 3336458752.0, + "38": 3336458752.0, + "39": 3336458752.0, + "40": 3336458752.0, + "41": 3336458752.0, + "42": 3336458752.0, + "43": 3336458752.0, + "44": 3336458752.0, + "45": 3336458752.0, + "46": 3336458752.0, + "47": 3336458752.0, + "48": 3336458752.0, + "49": 3336458752.0, + "50": 3336458752.0, + "51": 3336458752.0, + "52": 3336458752.0, + "53": 3336458752.0, + "54": 3336458752.0, + "55": 3336458752.0, + "56": 3336458752.0, + "57": 3336458752.0, + "58": 3336458752.0, + "59": 3336458752.0, + "60": 3336458752.0, + "61": 3336458752.0, + "62": 3336458752.0, + "63": 3336458752.0, + "64": 3336458752.0, + "65": 3336458752.0, + "66": 3336458752.0, + "67": 3336458752.0, + "68": 3336458752.0, + "69": 3336458752.0, + "70": 3336458752.0, + "71": 3336458752.0, + "72": 3336458752.0, + "73": 3336458752.0, + "74": 3336458752.0, + "75": 3336458752.0, + "76": 3336458752.0, + "77": 3336458752.0, + "78": 3336458752.0, + "79": 3336458752.0, + "80": 3336458752.0, + "81": 3336458752.0, + "82": 3336458752.0, + "83": 3336458752.0, + "84": 3336458752.0, + "85": 3336458752.0, + "86": 3336458752.0, + "87": 3336458752.0, + "88": 3336458752.0, + "89": 3336458752.0, + "90": 3336458752.0, + "91": 3336458752.0, + "92": 3336458752.0, + "93": 3336458752.0, + "94": 3336458752.0, + "95": 3336458752.0, + "96": 3336458752.0, + "97": 3336458752.0, + "98": 3336458752.0, + "99": 3336458752.0, + "100": 3336458752.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.42312, + "2": 0.62411, + "3": 0.40707, + "4": 0.42011, + "5": 0.41971, + "6": 0.41837, + "7": 0.42045, + "8": 0.41593, + "9": 0.41528, + "10": 0.41547, + "11": 0.41748, + "12": 0.41599, + "13": 0.41809, + "14": 0.41896, + "15": 0.41063, + "16": 0.41325, + "17": 0.41257, + "18": 0.41693, + "19": 0.40667, + "20": 0.40481, + "21": 0.40784, + "22": 0.40485, + "23": 0.40809, + "24": 0.41044, + "25": 0.40445, + "26": 0.40696, + "27": 0.40798, + "28": 0.40651, + "29": 0.40546, + "30": 0.40687, + "31": 0.4062, + "32": 0.40345, + "33": 0.40106, + "34": 0.40598, + "35": 0.4189, + "36": 0.40223, + "37": 0.39806, + "38": 0.39879, + "39": 0.40009, + "40": 0.39858, + "41": 0.39851, + "42": 0.39932, + "43": 0.39763, + "44": 0.39856, + "45": 0.39923, + "46": 0.39891, + "47": 0.39808, + "48": 0.39851, + "49": 0.39952, + "50": 0.39952, + "51": 0.39938, + "52": 0.39883, + "53": 0.39509, + "54": 0.39364, + "55": 0.39489, + "56": 0.39363, + "57": 0.39345, + "58": 0.39394, + "59": 0.39402, + "60": 0.39395, + "61": 0.39343, + "62": 0.39309, + "63": 0.39586, + "64": 0.39408, + "65": 0.40348, + "66": 0.39311, + "67": 0.39329, + "68": 0.39593, + "69": 0.39468, + "70": 0.39577, + "71": 0.39317, + "72": 0.39338, + "73": 0.39355, + "74": 0.39362, + "75": 0.39435, + "76": 0.39315, + "77": 0.39232, + "78": 0.39379, + "79": 0.39337, + "80": 0.39379, + "81": 0.3971, + "82": 0.39385, + "83": 0.39875, + "84": 0.39836, + "85": 0.39368, + "86": 0.39332, + "87": 0.3934, + "88": 0.40166, + "89": 0.3951, + "90": 0.39501, + "91": 0.39618, + "92": 0.39935, + "93": 0.39375, + "94": 0.39481, + "95": 0.39382, + "96": 0.3928, + "97": 0.39282, + "98": 0.39402, + "99": 0.39342, + "100": 0.39435 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json new file mode 100644 index 00000000000..87a5820cc8c --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.38854, + "2": 10.3937, + "3": 9.78105, + "4": 9.59731, + "5": 9.38095, + "6": 9.4057, + "7": 9.30785, + "8": 9.24107, + "9": 9.12192, + "10": 9.05714, + "11": 8.87325, + "12": 8.79368, + "13": 8.84026, + "14": 8.68518, + "15": 8.65603, + "16": 8.54372, + "17": 8.50113, + "18": 8.39001, + "19": 8.36443, + "20": 8.26193, + "21": 8.27097, + "22": 8.14406, + "23": 8.07467, + "24": 8.11915, + "25": 7.98192, + "26": 8.08777, + "27": 7.87148, + "28": 7.96511, + "29": 7.80258, + "30": 7.86937, + "31": 7.81742, + "32": 7.68788, + "33": 7.7805, + "34": 7.55497, + "35": 7.66279, + "36": 7.52257, + "37": 7.44455, + "38": 7.5026, + "39": 7.4504, + "40": 7.50083, + "41": 7.39053, + "42": 7.36073, + "43": 7.4333, + "44": 7.37641, + "45": 7.34894, + "46": 7.28171, + "47": 7.46122, + "48": 7.2877, + "49": 7.35375, + "50": 7.18147, + "51": 7.36608, + "52": 7.13343, + "53": 7.11575, + "54": 7.22932, + "55": 7.1542, + "56": 7.22261, + "57": 7.32969, + "58": 7.02356, + "59": 7.11377, + "60": 7.14734, + "61": 7.11404, + "62": 7.24755, + "63": 7.1568, + "64": 7.08414, + "65": 6.9972, + "66": 7.06074, + "67": 7.04881, + "68": 7.14167, + "69": 7.03482, + "70": 7.06009, + "71": 6.92578, + "72": 7.0043, + "73": 6.97965, + "74": 6.92276, + "75": 7.06086, + "76": 6.97271, + "77": 7.08186, + "78": 7.01883, + "79": 6.85524, + "80": 6.94306, + "81": 6.97637, + "82": 7.06676, + "83": 6.99984, + "84": 7.0089, + "85": 6.85989, + "86": 7.03607, + "87": 6.98072, + "88": 6.91508, + "89": 6.81068, + "90": 7.24967, + "91": 6.71006, + "92": 7.04916, + "93": 6.9057, + "94": 7.06458, + "95": 6.84836, + "96": 6.97667, + "97": 6.96312, + "98": 6.88704, + "99": 7.013, + "100": 6.98289 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43331.0, + "2": 44051.0, + "3": 44760.0, + "4": 42395.0, + "5": 45376.0, + "6": 40957.0, + "7": 43160.0, + "8": 45463.0, + "9": 42446.0, + "10": 45361.0, + "11": 43965.0, + "12": 44605.0, + "13": 43884.0, + "14": 46187.0, + "15": 43888.0, + "16": 41604.0, + "17": 43828.0, + "18": 44690.0, + "19": 42562.0, + "20": 44777.0, + "21": 44792.0, + "22": 41854.0, + "23": 45465.0, + "24": 43071.0, + "25": 42465.0, + "26": 43917.0, + "27": 46228.0, + "28": 46431.0, + "29": 46169.0, + "30": 43995.0, + "31": 41278.0, + "32": 43346.0, + "33": 45463.0, + "34": 43298.0, + "35": 43276.0, + "36": 42490.0, + "37": 40069.0, + "38": 42527.0, + "39": 44730.0, + "40": 43245.0, + "41": 44653.0, + "42": 43269.0, + "43": 45462.0, + "44": 44594.0, + "45": 43285.0, + "46": 43915.0, + "47": 42370.0, + "48": 44704.0, + "49": 43164.0, + "50": 43365.0, + "51": 41167.0, + "52": 43825.0, + "53": 43945.0, + "54": 41947.0, + "55": 43853.0, + "56": 43268.0, + "57": 42591.0, + "58": 43843.0, + "59": 44625.0, + "60": 41218.0, + "61": 39714.0, + "62": 44779.0, + "63": 44716.0, + "64": 45359.0, + "65": 44684.0, + "66": 45355.0, + "67": 43146.0, + "68": 42519.0, + "69": 43835.0, + "70": 45522.0, + "71": 43316.0, + "72": 44767.0, + "73": 45365.0, + "74": 42449.0, + "75": 44695.0, + "76": 43885.0, + "77": 42092.0, + "78": 40278.0, + "79": 38915.0, + "80": 41096.0, + "81": 45372.0, + "82": 43206.0, + "83": 38481.0, + "84": 42474.0, + "85": 43990.0, + "86": 45729.0, + "87": 40884.0, + "88": 41772.0, + "89": 41076.0, + "90": 44676.0, + "91": 46159.0, + "92": 41790.0, + "93": 43242.0, + "94": 39566.0, + "95": 44077.0, + "96": 44741.0, + "97": 45379.0, + "98": 41802.0, + "99": 45441.0, + "100": 42530.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2171550208.0, + "2": 2171550208.0, + "3": 2171550208.0, + "4": 2171550208.0, + "5": 2171550208.0, + "6": 2171550208.0, + "7": 2171550208.0, + "8": 2171550208.0, + "9": 2171550208.0, + "10": 2171550208.0, + "11": 2171550208.0, + "12": 2171550208.0, + "13": 2171550208.0, + "14": 2171550208.0, + "15": 2171550208.0, + "16": 2171550208.0, + "17": 2171550208.0, + "18": 2171550208.0, + "19": 2171550208.0, + "20": 2171550208.0, + "21": 2171550208.0, + "22": 2171550208.0, + "23": 2171550208.0, + "24": 2171550208.0, + "25": 2171550208.0, + "26": 2171550208.0, + "27": 2171550208.0, + "28": 2171550208.0, + "29": 2171550208.0, + "30": 2171550208.0, + "31": 2171550208.0, + "32": 2171550208.0, + "33": 2171550208.0, + "34": 2171550208.0, + "35": 2171550208.0, + "36": 2171550208.0, + "37": 2171550208.0, + "38": 2171550208.0, + "39": 2171550208.0, + "40": 2171550208.0, + "41": 2171550208.0, + "42": 2171550208.0, + "43": 2171550208.0, + "44": 2171550208.0, + "45": 2171550208.0, + "46": 2171550208.0, + "47": 2171550208.0, + "48": 2171550208.0, + "49": 2171550208.0, + "50": 2171550208.0, + "51": 2171550208.0, + "52": 2171550208.0, + "53": 2171550208.0, + "54": 2171550208.0, + "55": 2171550208.0, + "56": 2171550208.0, + "57": 2171550208.0, + "58": 2171550208.0, + "59": 2171550208.0, + "60": 2171550208.0, + "61": 2171550208.0, + "62": 2171550208.0, + "63": 2171550208.0, + "64": 2171550208.0, + "65": 2171550208.0, + "66": 2171550208.0, + "67": 2171550208.0, + "68": 2171550208.0, + "69": 2171550208.0, + "70": 2171550208.0, + "71": 2171550208.0, + "72": 2171550208.0, + "73": 2171550208.0, + "74": 2171550208.0, + "75": 2171550208.0, + "76": 2171550208.0, + "77": 2171550208.0, + "78": 2171550208.0, + "79": 2171550208.0, + "80": 2171550208.0, + "81": 2171550208.0, + "82": 2171550208.0, + "83": 2171550208.0, + "84": 2171550208.0, + "85": 2171550208.0, + "86": 2171550208.0, + "87": 2171550208.0, + "88": 2171550208.0, + "89": 2171550208.0, + "90": 2171550208.0, + "91": 2171550208.0, + "92": 2171550208.0, + "93": 2171550208.0, + "94": 2171550208.0, + "95": 2171550208.0, + "96": 2171550208.0, + "97": 2171550208.0, + "98": 2171550208.0, + "99": 2171550208.0, + "100": 2171550208.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2407642624.0, + "2": 3336458752.0, + "3": 3336458752.0, + "4": 3336458752.0, + "5": 3336458752.0, + "6": 3336458752.0, + "7": 3336458752.0, + "8": 3336458752.0, + "9": 3336458752.0, + "10": 3336458752.0, + "11": 3336458752.0, + "12": 3336458752.0, + "13": 3336458752.0, + "14": 3336458752.0, + "15": 3336458752.0, + "16": 3336458752.0, + "17": 3336458752.0, + "18": 3336458752.0, + "19": 3336458752.0, + "20": 3336458752.0, + "21": 3336458752.0, + "22": 3336458752.0, + "23": 3336458752.0, + "24": 3336458752.0, + "25": 3336458752.0, + "26": 3336458752.0, + "27": 3336458752.0, + "28": 3336458752.0, + "29": 3336458752.0, + "30": 3336458752.0, + "31": 3336458752.0, + "32": 3336458752.0, + "33": 3336458752.0, + "34": 3336458752.0, + "35": 3336458752.0, + "36": 3336458752.0, + "37": 3336458752.0, + "38": 3336458752.0, + "39": 3336458752.0, + "40": 3336458752.0, + "41": 3336458752.0, + "42": 3336458752.0, + "43": 3336458752.0, + "44": 3336458752.0, + "45": 3336458752.0, + "46": 3336458752.0, + "47": 3336458752.0, + "48": 3336458752.0, + "49": 3336458752.0, + "50": 3336458752.0, + "51": 3336458752.0, + "52": 3336458752.0, + "53": 3336458752.0, + "54": 3336458752.0, + "55": 3336458752.0, + "56": 3336458752.0, + "57": 3336458752.0, + "58": 3336458752.0, + "59": 3336458752.0, + "60": 3336458752.0, + "61": 3336458752.0, + "62": 3336458752.0, + "63": 3336458752.0, + "64": 3336458752.0, + "65": 3336458752.0, + "66": 3336458752.0, + "67": 3336458752.0, + "68": 3336458752.0, + "69": 3336458752.0, + "70": 3336458752.0, + "71": 3336458752.0, + "72": 3336458752.0, + "73": 3336458752.0, + "74": 3336458752.0, + "75": 3336458752.0, + "76": 3336458752.0, + "77": 3336458752.0, + "78": 3336458752.0, + "79": 3336458752.0, + "80": 3336458752.0, + "81": 3336458752.0, + "82": 3336458752.0, + "83": 3336458752.0, + "84": 3336458752.0, + "85": 3336458752.0, + "86": 3336458752.0, + "87": 3336458752.0, + "88": 3336458752.0, + "89": 3336458752.0, + "90": 3336458752.0, + "91": 3336458752.0, + "92": 3336458752.0, + "93": 3336458752.0, + "94": 3336458752.0, + "95": 3336458752.0, + "96": 3336458752.0, + "97": 3336458752.0, + "98": 3336458752.0, + "99": 3336458752.0, + "100": 3336458752.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.65153, + "2": 0.53984, + "3": 0.42661, + "4": 0.41593, + "5": 0.40702, + "6": 0.40818, + "7": 0.40561, + "8": 0.40327, + "9": 0.40232, + "10": 0.40905, + "11": 0.41597, + "12": 0.41177, + "13": 0.4131, + "14": 0.41425, + "15": 0.40979, + "16": 0.41034, + "17": 0.40766, + "18": 0.41324, + "19": 0.40983, + "20": 0.40973, + "21": 0.41258, + "22": 0.40882, + "23": 0.41161, + "24": 0.41499, + "25": 0.40883, + "26": 0.41065, + "27": 0.41442, + "28": 0.42182, + "29": 0.41133, + "30": 0.40692, + "31": 0.40463, + "32": 0.40734, + "33": 0.41503, + "34": 0.40436, + "35": 0.40604, + "36": 0.40609, + "37": 0.40425, + "38": 0.40616, + "39": 0.40517, + "40": 0.40457, + "41": 0.40404, + "42": 0.40366, + "43": 0.40482, + "44": 0.40536, + "45": 0.40416, + "46": 0.40309, + "47": 0.40454, + "48": 0.40394, + "49": 0.40592, + "50": 0.40575, + "51": 0.40587, + "52": 0.40615, + "53": 0.4075, + "54": 0.8929, + "55": 0.40675, + "56": 0.40691, + "57": 0.40758, + "58": 0.40852, + "59": 0.40647, + "60": 0.40547, + "61": 0.40637, + "62": 0.40696, + "63": 0.40776, + "64": 0.40276, + "65": 0.40178, + "66": 0.40265, + "67": 0.40328, + "68": 0.40315, + "69": 0.40883, + "70": 0.40216, + "71": 0.40455, + "72": 0.40323, + "73": 0.40261, + "74": 0.40269, + "75": 0.40043, + "76": 0.40039, + "77": 0.40035, + "78": 0.39953, + "79": 0.39986, + "80": 0.40626, + "81": 0.40677, + "82": 0.39929, + "83": 0.40058, + "84": 0.40833, + "85": 0.40235, + "86": 0.39878, + "87": 0.40207, + "88": 0.39947, + "89": 0.39981, + "90": 0.39896, + "91": 0.39963, + "92": 0.40003, + "93": 0.39864, + "94": 0.40427, + "95": 0.39942, + "96": 0.40168, + "97": 0.40276, + "98": 0.39869, + "99": 0.40201, + "100": 0.39949 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..f6481fb6aae --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.372, + "2": 10.37006, + "3": 9.85232, + "4": 9.61996, + "5": 9.40868, + "6": 9.43215, + "7": 9.31482, + "8": 9.27336, + "9": 9.1139, + "10": 9.03962, + "11": 8.87218, + "12": 8.80873, + "13": 8.83468, + "14": 8.69011, + "15": 8.66228, + "16": 8.54828, + "17": 8.50093, + "18": 8.42525, + "19": 8.3881, + "20": 8.2807, + "21": 8.26609, + "22": 8.16003, + "23": 8.1124, + "24": 8.14262, + "25": 7.98432, + "26": 8.10592, + "27": 7.88963, + "28": 7.97037, + "29": 7.81276, + "30": 7.87638, + "31": 7.82516, + "32": 7.70248, + "33": 7.80198, + "34": 7.56872, + "35": 7.67379, + "36": 7.54691, + "37": 7.47408, + "38": 7.50739, + "39": 7.49773, + "40": 7.51091, + "41": 7.41065, + "42": 7.37995, + "43": 7.44078, + "44": 7.39393, + "45": 7.37239, + "46": 7.28427, + "47": 7.46631, + "48": 7.2905, + "49": 7.35025, + "50": 7.17204, + "51": 7.37012, + "52": 7.14467, + "53": 7.12652, + "54": 7.23751, + "55": 7.15586, + "56": 7.23154, + "57": 7.33541, + "58": 7.01363, + "59": 7.11431, + "60": 7.15121, + "61": 7.10904, + "62": 7.26834, + "63": 7.15176, + "64": 7.08415, + "65": 6.99114, + "66": 7.05301, + "67": 7.04354, + "68": 7.1398, + "69": 7.03224, + "70": 7.05832, + "71": 6.90372, + "72": 6.99794, + "73": 6.9769, + "74": 6.91759, + "75": 7.06626, + "76": 6.95758, + "77": 7.0871, + "78": 7.03238, + "79": 6.85274, + "80": 6.93633, + "81": 6.97617, + "82": 7.06196, + "83": 6.98213, + "84": 7.00931, + "85": 6.85082, + "86": 7.04673, + "87": 6.97907, + "88": 6.91096, + "89": 6.81719, + "90": 7.2459, + "91": 6.7046, + "92": 7.05377, + "93": 6.89397, + "94": 7.0542, + "95": 6.85031, + "96": 6.96441, + "97": 6.95632, + "98": 6.88246, + "99": 7.00392, + "100": 6.98993 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43299.0, + "2": 44047.0, + "3": 44744.0, + "4": 42405.0, + "5": 45385.0, + "6": 40946.0, + "7": 43183.0, + "8": 45446.0, + "9": 42445.0, + "10": 45361.0, + "11": 43966.0, + "12": 44593.0, + "13": 43907.0, + "14": 46210.0, + "15": 43904.0, + "16": 41614.0, + "17": 43840.0, + "18": 44687.0, + "19": 42536.0, + "20": 44746.0, + "21": 44767.0, + "22": 41831.0, + "23": 45449.0, + "24": 43072.0, + "25": 42457.0, + "26": 43921.0, + "27": 46208.0, + "28": 46361.0, + "29": 46146.0, + "30": 43976.0, + "31": 41272.0, + "32": 43348.0, + "33": 45431.0, + "34": 43295.0, + "35": 43264.0, + "36": 42493.0, + "37": 40075.0, + "38": 42518.0, + "39": 44713.0, + "40": 43230.0, + "41": 44666.0, + "42": 43251.0, + "43": 45471.0, + "44": 44600.0, + "45": 43330.0, + "46": 43932.0, + "47": 42400.0, + "48": 44673.0, + "49": 43149.0, + "50": 43373.0, + "51": 41142.0, + "52": 43824.0, + "53": 43917.0, + "54": 42023.0, + "55": 43883.0, + "56": 43235.0, + "57": 42536.0, + "58": 43829.0, + "59": 44648.0, + "60": 41187.0, + "61": 39720.0, + "62": 44740.0, + "63": 44690.0, + "64": 45358.0, + "65": 44695.0, + "66": 45364.0, + "67": 43138.0, + "68": 42538.0, + "69": 43820.0, + "70": 45549.0, + "71": 43324.0, + "72": 44760.0, + "73": 45363.0, + "74": 42473.0, + "75": 44666.0, + "76": 43903.0, + "77": 42082.0, + "78": 40295.0, + "79": 38890.0, + "80": 41131.0, + "81": 45363.0, + "82": 43206.0, + "83": 38487.0, + "84": 42462.0, + "85": 43985.0, + "86": 45695.0, + "87": 40826.0, + "88": 41822.0, + "89": 41069.0, + "90": 44664.0, + "91": 46170.0, + "92": 41797.0, + "93": 43208.0, + "94": 39552.0, + "95": 44106.0, + "96": 44697.0, + "97": 45398.0, + "98": 41792.0, + "99": 45429.0, + "100": 42437.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2171550208.0, + "2": 2171550208.0, + "3": 2171550208.0, + "4": 2171550208.0, + "5": 2171550208.0, + "6": 2171550208.0, + "7": 2171550208.0, + "8": 2171550208.0, + "9": 2171550208.0, + "10": 2171550208.0, + "11": 2171550208.0, + "12": 2171550208.0, + "13": 2171550208.0, + "14": 2171550208.0, + "15": 2171550208.0, + "16": 2171550208.0, + "17": 2171550208.0, + "18": 2171550208.0, + "19": 2171550208.0, + "20": 2171550208.0, + "21": 2171550208.0, + "22": 2171550208.0, + "23": 2171550208.0, + "24": 2171550208.0, + "25": 2171550208.0, + "26": 2171550208.0, + "27": 2171550208.0, + "28": 2171550208.0, + "29": 2171550208.0, + "30": 2171550208.0, + "31": 2171550208.0, + "32": 2171550208.0, + "33": 2171550208.0, + "34": 2171550208.0, + "35": 2171550208.0, + "36": 2171550208.0, + "37": 2171550208.0, + "38": 2171550208.0, + "39": 2171550208.0, + "40": 2171550208.0, + "41": 2171550208.0, + "42": 2171550208.0, + "43": 2171550208.0, + "44": 2171550208.0, + "45": 2171550208.0, + "46": 2171550208.0, + "47": 2171550208.0, + "48": 2171550208.0, + "49": 2171550208.0, + "50": 2171550208.0, + "51": 2171550208.0, + "52": 2171550208.0, + "53": 2171550208.0, + "54": 2171550208.0, + "55": 2171550208.0, + "56": 2171550208.0, + "57": 2171550208.0, + "58": 2171550208.0, + "59": 2171550208.0, + "60": 2171550208.0, + "61": 2171550208.0, + "62": 2171550208.0, + "63": 2171550208.0, + "64": 2171550208.0, + "65": 2171550208.0, + "66": 2171550208.0, + "67": 2171550208.0, + "68": 2171550208.0, + "69": 2171550208.0, + "70": 2171550208.0, + "71": 2171550208.0, + "72": 2171550208.0, + "73": 2171550208.0, + "74": 2171550208.0, + "75": 2171550208.0, + "76": 2171550208.0, + "77": 2171550208.0, + "78": 2171550208.0, + "79": 2171550208.0, + "80": 2171550208.0, + "81": 2171550208.0, + "82": 2171550208.0, + "83": 2171550208.0, + "84": 2171550208.0, + "85": 2171550208.0, + "86": 2171550208.0, + "87": 2171550208.0, + "88": 2171550208.0, + "89": 2171550208.0, + "90": 2171550208.0, + "91": 2171550208.0, + "92": 2171550208.0, + "93": 2171550208.0, + "94": 2171550208.0, + "95": 2171550208.0, + "96": 2171550208.0, + "97": 2171550208.0, + "98": 2171550208.0, + "99": 2171550208.0, + "100": 2171550208.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2407642624.0, + "2": 3336458752.0, + "3": 3336458752.0, + "4": 3336458752.0, + "5": 3336458752.0, + "6": 3336458752.0, + "7": 3336458752.0, + "8": 3336458752.0, + "9": 3336458752.0, + "10": 3336458752.0, + "11": 3336458752.0, + "12": 3336458752.0, + "13": 3336458752.0, + "14": 3336458752.0, + "15": 3336458752.0, + "16": 3336458752.0, + "17": 3336458752.0, + "18": 3336458752.0, + "19": 3336458752.0, + "20": 3336458752.0, + "21": 3336458752.0, + "22": 3336458752.0, + "23": 3336458752.0, + "24": 3336458752.0, + "25": 3336458752.0, + "26": 3336458752.0, + "27": 3336458752.0, + "28": 3336458752.0, + "29": 3336458752.0, + "30": 3336458752.0, + "31": 3336458752.0, + "32": 3336458752.0, + "33": 3336458752.0, + "34": 3336458752.0, + "35": 3336458752.0, + "36": 3336458752.0, + "37": 3336458752.0, + "38": 3336458752.0, + "39": 3336458752.0, + "40": 3336458752.0, + "41": 3336458752.0, + "42": 3336458752.0, + "43": 3336458752.0, + "44": 3336458752.0, + "45": 3336458752.0, + "46": 3336458752.0, + "47": 3336458752.0, + "48": 3336458752.0, + "49": 3336458752.0, + "50": 3336458752.0, + "51": 3336458752.0, + "52": 3336458752.0, + "53": 3336458752.0, + "54": 3336458752.0, + "55": 3336458752.0, + "56": 3336458752.0, + "57": 3336458752.0, + "58": 3336458752.0, + "59": 3336458752.0, + "60": 3336458752.0, + "61": 3336458752.0, + "62": 3336458752.0, + "63": 3336458752.0, + "64": 3336458752.0, + "65": 3336458752.0, + "66": 3336458752.0, + "67": 3336458752.0, + "68": 3336458752.0, + "69": 3336458752.0, + "70": 3336458752.0, + "71": 3336458752.0, + "72": 3336458752.0, + "73": 3336458752.0, + "74": 3336458752.0, + "75": 3336458752.0, + "76": 3336458752.0, + "77": 3336458752.0, + "78": 3336458752.0, + "79": 3336458752.0, + "80": 3336458752.0, + "81": 3336458752.0, + "82": 3336458752.0, + "83": 3336458752.0, + "84": 3336458752.0, + "85": 3336458752.0, + "86": 3336458752.0, + "87": 3336458752.0, + "88": 3336458752.0, + "89": 3336458752.0, + "90": 3336458752.0, + "91": 3336458752.0, + "92": 3336458752.0, + "93": 3336458752.0, + "94": 3336458752.0, + "95": 3336458752.0, + "96": 3336458752.0, + "97": 3336458752.0, + "98": 3336458752.0, + "99": 3336458752.0, + "100": 3336458752.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.60166, + "2": 0.33673, + "3": 0.25171, + "4": 0.25375, + "5": 0.25753, + "6": 0.27787, + "7": 0.24971, + "8": 0.2503, + "9": 0.25048, + "10": 0.24978, + "11": 0.25041, + "12": 0.24978, + "13": 0.25194, + "14": 0.2514, + "15": 0.25318, + "16": 0.25109, + "17": 0.25362, + "18": 0.24882, + "19": 0.24704, + "20": 0.25004, + "21": 0.27982, + "22": 0.24826, + "23": 0.24772, + "24": 0.251, + "25": 0.24928, + "26": 0.24917, + "27": 0.25053, + "28": 0.25787, + "29": 0.24964, + "30": 0.24738, + "31": 0.24871, + "32": 0.24723, + "33": 0.25394, + "34": 0.24523, + "35": 0.26602, + "36": 0.25389, + "37": 0.25278, + "38": 0.24491, + "39": 0.2522, + "40": 0.25493, + "41": 0.25366, + "42": 0.27735, + "43": 0.2544, + "44": 0.25245, + "45": 0.25589, + "46": 0.24817, + "47": 0.24991, + "48": 0.2536, + "49": 0.27661, + "50": 0.25098, + "51": 0.252, + "52": 0.25923, + "53": 0.26278, + "54": 0.25083, + "55": 0.25065, + "56": 0.281, + "57": 0.25168, + "58": 0.25062, + "59": 0.24811, + "60": 0.25419, + "61": 0.2513, + "62": 0.24774, + "63": 0.24385, + "64": 0.24558, + "65": 0.24527, + "66": 0.24409, + "67": 0.24307, + "68": 0.24418, + "69": 0.24735, + "70": 0.26794, + "71": 0.24394, + "72": 0.24559, + "73": 0.24851, + "74": 0.24204, + "75": 0.24385, + "76": 0.24384, + "77": 0.2634, + "78": 0.24391, + "79": 0.24432, + "80": 0.24643, + "81": 0.24693, + "82": 0.2446, + "83": 0.24366, + "84": 0.24512, + "85": 0.25101, + "86": 0.24393, + "87": 0.24582, + "88": 0.24672, + "89": 0.24434, + "90": 0.24628, + "91": 0.24503, + "92": 0.24574, + "93": 0.25036, + "94": 0.25184, + "95": 0.254, + "96": 0.24924, + "97": 0.25063, + "98": 0.25449, + "99": 0.24818, + "100": 0.24724 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..81670d237ce --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.372, + "2": 10.37006, + "3": 9.85232, + "4": 9.61996, + "5": 9.40868, + "6": 9.43215, + "7": 9.31482, + "8": 9.27336, + "9": 9.1139, + "10": 9.03962, + "11": 8.87218, + "12": 8.80873, + "13": 8.83468, + "14": 8.69011, + "15": 8.66228, + "16": 8.54828, + "17": 8.50093, + "18": 8.42525, + "19": 8.3881, + "20": 8.2807, + "21": 8.26609, + "22": 8.16003, + "23": 8.1124, + "24": 8.14262, + "25": 7.98432, + "26": 8.10592, + "27": 7.88963, + "28": 7.97037, + "29": 7.81276, + "30": 7.87638, + "31": 7.82516, + "32": 7.70248, + "33": 7.80198, + "34": 7.56872, + "35": 7.67379, + "36": 7.54691, + "37": 7.47408, + "38": 7.50739, + "39": 7.49773, + "40": 7.51091, + "41": 7.41065, + "42": 7.37995, + "43": 7.44078, + "44": 7.39393, + "45": 7.37239, + "46": 7.28427, + "47": 7.46631, + "48": 7.2905, + "49": 7.35025, + "50": 7.17204, + "51": 7.37012, + "52": 7.14467, + "53": 7.12652, + "54": 7.23751, + "55": 7.15586, + "56": 7.23154, + "57": 7.33541, + "58": 7.01363, + "59": 7.11431, + "60": 7.15121, + "61": 7.10904, + "62": 7.26834, + "63": 7.15176, + "64": 7.08415, + "65": 6.99114, + "66": 7.05301, + "67": 7.04354, + "68": 7.1398, + "69": 7.03224, + "70": 7.05832, + "71": 6.90372, + "72": 6.99794, + "73": 6.9769, + "74": 6.91759, + "75": 7.06626, + "76": 6.95758, + "77": 7.0871, + "78": 7.03238, + "79": 6.85274, + "80": 6.93633, + "81": 6.97617, + "82": 7.06196, + "83": 6.98213, + "84": 7.00931, + "85": 6.85082, + "86": 7.04673, + "87": 6.97907, + "88": 6.91096, + "89": 6.81719, + "90": 7.2459, + "91": 6.7046, + "92": 7.05377, + "93": 6.89397, + "94": 7.0542, + "95": 6.85031, + "96": 6.96441, + "97": 6.95632, + "98": 6.88246, + "99": 7.00392, + "100": 6.98993 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 43299.0, + "2": 44047.0, + "3": 44744.0, + "4": 42405.0, + "5": 45385.0, + "6": 40946.0, + "7": 43183.0, + "8": 45446.0, + "9": 42445.0, + "10": 45361.0, + "11": 43966.0, + "12": 44593.0, + "13": 43907.0, + "14": 46210.0, + "15": 43904.0, + "16": 41614.0, + "17": 43840.0, + "18": 44687.0, + "19": 42536.0, + "20": 44746.0, + "21": 44767.0, + "22": 41831.0, + "23": 45449.0, + "24": 43072.0, + "25": 42457.0, + "26": 43921.0, + "27": 46208.0, + "28": 46361.0, + "29": 46146.0, + "30": 43976.0, + "31": 41272.0, + "32": 43348.0, + "33": 45431.0, + "34": 43295.0, + "35": 43264.0, + "36": 42493.0, + "37": 40075.0, + "38": 42518.0, + "39": 44713.0, + "40": 43230.0, + "41": 44666.0, + "42": 43251.0, + "43": 45471.0, + "44": 44600.0, + "45": 43330.0, + "46": 43932.0, + "47": 42400.0, + "48": 44673.0, + "49": 43149.0, + "50": 43373.0, + "51": 41142.0, + "52": 43824.0, + "53": 43917.0, + "54": 42023.0, + "55": 43883.0, + "56": 43235.0, + "57": 42536.0, + "58": 43829.0, + "59": 44648.0, + "60": 41187.0, + "61": 39720.0, + "62": 44740.0, + "63": 44690.0, + "64": 45358.0, + "65": 44695.0, + "66": 45364.0, + "67": 43138.0, + "68": 42538.0, + "69": 43820.0, + "70": 45549.0, + "71": 43324.0, + "72": 44760.0, + "73": 45363.0, + "74": 42473.0, + "75": 44666.0, + "76": 43903.0, + "77": 42082.0, + "78": 40295.0, + "79": 38890.0, + "80": 41131.0, + "81": 45363.0, + "82": 43206.0, + "83": 38487.0, + "84": 42462.0, + "85": 43985.0, + "86": 45695.0, + "87": 40826.0, + "88": 41822.0, + "89": 41069.0, + "90": 44664.0, + "91": 46170.0, + "92": 41797.0, + "93": 43208.0, + "94": 39552.0, + "95": 44106.0, + "96": 44697.0, + "97": 45398.0, + "98": 41792.0, + "99": 45429.0, + "100": 42437.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2171550208.0, + "2": 2171550208.0, + "3": 2171550208.0, + "4": 2171550208.0, + "5": 2171550208.0, + "6": 2171550208.0, + "7": 2171550208.0, + "8": 2171550208.0, + "9": 2171550208.0, + "10": 2171550208.0, + "11": 2171550208.0, + "12": 2171550208.0, + "13": 2171550208.0, + "14": 2171550208.0, + "15": 2171550208.0, + "16": 2171550208.0, + "17": 2171550208.0, + "18": 2171550208.0, + "19": 2171550208.0, + "20": 2171550208.0, + "21": 2171550208.0, + "22": 2171550208.0, + "23": 2171550208.0, + "24": 2171550208.0, + "25": 2171550208.0, + "26": 2171550208.0, + "27": 2171550208.0, + "28": 2171550208.0, + "29": 2171550208.0, + "30": 2171550208.0, + "31": 2171550208.0, + "32": 2171550208.0, + "33": 2171550208.0, + "34": 2171550208.0, + "35": 2171550208.0, + "36": 2171550208.0, + "37": 2171550208.0, + "38": 2171550208.0, + "39": 2171550208.0, + "40": 2171550208.0, + "41": 2171550208.0, + "42": 2171550208.0, + "43": 2171550208.0, + "44": 2171550208.0, + "45": 2171550208.0, + "46": 2171550208.0, + "47": 2171550208.0, + "48": 2171550208.0, + "49": 2171550208.0, + "50": 2171550208.0, + "51": 2171550208.0, + "52": 2171550208.0, + "53": 2171550208.0, + "54": 2171550208.0, + "55": 2171550208.0, + "56": 2171550208.0, + "57": 2171550208.0, + "58": 2171550208.0, + "59": 2171550208.0, + "60": 2171550208.0, + "61": 2171550208.0, + "62": 2171550208.0, + "63": 2171550208.0, + "64": 2171550208.0, + "65": 2171550208.0, + "66": 2171550208.0, + "67": 2171550208.0, + "68": 2171550208.0, + "69": 2171550208.0, + "70": 2171550208.0, + "71": 2171550208.0, + "72": 2171550208.0, + "73": 2171550208.0, + "74": 2171550208.0, + "75": 2171550208.0, + "76": 2171550208.0, + "77": 2171550208.0, + "78": 2171550208.0, + "79": 2171550208.0, + "80": 2171550208.0, + "81": 2171550208.0, + "82": 2171550208.0, + "83": 2171550208.0, + "84": 2171550208.0, + "85": 2171550208.0, + "86": 2171550208.0, + "87": 2171550208.0, + "88": 2171550208.0, + "89": 2171550208.0, + "90": 2171550208.0, + "91": 2171550208.0, + "92": 2171550208.0, + "93": 2171550208.0, + "94": 2171550208.0, + "95": 2171550208.0, + "96": 2171550208.0, + "97": 2171550208.0, + "98": 2171550208.0, + "99": 2171550208.0, + "100": 2171550208.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2407642624.0, + "2": 3336458752.0, + "3": 3336458752.0, + "4": 3336458752.0, + "5": 3336458752.0, + "6": 3336458752.0, + "7": 3336458752.0, + "8": 3336458752.0, + "9": 3336458752.0, + "10": 3336458752.0, + "11": 3336458752.0, + "12": 3336458752.0, + "13": 3336458752.0, + "14": 3336458752.0, + "15": 3336458752.0, + "16": 3336458752.0, + "17": 3336458752.0, + "18": 3336458752.0, + "19": 3336458752.0, + "20": 3336458752.0, + "21": 3336458752.0, + "22": 3336458752.0, + "23": 3336458752.0, + "24": 3336458752.0, + "25": 3336458752.0, + "26": 3336458752.0, + "27": 3336458752.0, + "28": 3336458752.0, + "29": 3336458752.0, + "30": 3336458752.0, + "31": 3336458752.0, + "32": 3336458752.0, + "33": 3336458752.0, + "34": 3336458752.0, + "35": 3336458752.0, + "36": 3336458752.0, + "37": 3336458752.0, + "38": 3336458752.0, + "39": 3336458752.0, + "40": 3336458752.0, + "41": 3336458752.0, + "42": 3336458752.0, + "43": 3336458752.0, + "44": 3336458752.0, + "45": 3336458752.0, + "46": 3336458752.0, + "47": 3336458752.0, + "48": 3336458752.0, + "49": 3336458752.0, + "50": 3336458752.0, + "51": 3336458752.0, + "52": 3336458752.0, + "53": 3336458752.0, + "54": 3336458752.0, + "55": 3336458752.0, + "56": 3336458752.0, + "57": 3336458752.0, + "58": 3336458752.0, + "59": 3336458752.0, + "60": 3336458752.0, + "61": 3336458752.0, + "62": 3336458752.0, + "63": 3336458752.0, + "64": 3336458752.0, + "65": 3336458752.0, + "66": 3336458752.0, + "67": 3336458752.0, + "68": 3336458752.0, + "69": 3336458752.0, + "70": 3336458752.0, + "71": 3336458752.0, + "72": 3336458752.0, + "73": 3336458752.0, + "74": 3336458752.0, + "75": 3336458752.0, + "76": 3336458752.0, + "77": 3336458752.0, + "78": 3336458752.0, + "79": 3336458752.0, + "80": 3336458752.0, + "81": 3336458752.0, + "82": 3336458752.0, + "83": 3336458752.0, + "84": 3336458752.0, + "85": 3336458752.0, + "86": 3336458752.0, + "87": 3336458752.0, + "88": 3336458752.0, + "89": 3336458752.0, + "90": 3336458752.0, + "91": 3336458752.0, + "92": 3336458752.0, + "93": 3336458752.0, + "94": 3336458752.0, + "95": 3336458752.0, + "96": 3336458752.0, + "97": 3336458752.0, + "98": 3336458752.0, + "99": 3336458752.0, + "100": 3336458752.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.16897, + "2": 0.35143, + "3": 0.28496, + "4": 0.28172, + "5": 0.28308, + "6": 0.2855, + "7": 0.28287, + "8": 0.28079, + "9": 0.2809, + "10": 0.28329, + "11": 0.28038, + "12": 0.28371, + "13": 0.28032, + "14": 0.28362, + "15": 0.28125, + "16": 0.28046, + "17": 0.28421, + "18": 0.28132, + "19": 0.2808, + "20": 0.28432, + "21": 0.28578, + "22": 0.28205, + "23": 0.28411, + "24": 0.28378, + "25": 0.28227, + "26": 0.28231, + "27": 0.28353, + "28": 0.28497, + "29": 0.29981, + "30": 0.28557, + "31": 0.28777, + "32": 0.28808, + "33": 0.28609, + "34": 0.32585, + "35": 0.341, + "36": 0.2886, + "37": 0.28157, + "38": 0.2916, + "39": 0.28501, + "40": 0.27952, + "41": 0.27767, + "42": 0.28062, + "43": 0.28781, + "44": 0.2839, + "45": 0.282, + "46": 0.27837, + "47": 0.27883, + "48": 0.27865, + "49": 0.28179, + "50": 0.27881, + "51": 0.27669, + "52": 0.28063, + "53": 0.27909, + "54": 0.27716, + "55": 0.27807, + "56": 0.2785, + "57": 0.27679, + "58": 0.28004, + "59": 0.27659, + "60": 0.27984, + "61": 0.2771, + "62": 0.27714, + "63": 0.2802, + "64": 0.2918, + "65": 0.27948, + "66": 0.27839, + "67": 0.28573, + "68": 0.27933, + "69": 0.27893, + "70": 0.27964, + "71": 0.2767, + "72": 0.27816, + "73": 0.28004, + "74": 0.27997, + "75": 0.28095, + "76": 0.27752, + "77": 0.27912, + "78": 0.28068, + "79": 0.27992, + "80": 0.28771, + "81": 0.28046, + "82": 0.28352, + "83": 0.28376, + "84": 0.28337, + "85": 0.28197, + "86": 0.27949, + "87": 0.27909, + "88": 0.28479, + "89": 0.28248, + "90": 0.27742, + "91": 0.27819, + "92": 0.2809, + "93": 0.28123, + "94": 0.27933, + "95": 0.28364, + "96": 0.28523, + "97": 0.28365, + "98": 0.27822, + "99": 0.28382, + "100": 0.28917 + } + } +} \ No newline at end of file diff --git a/tests/test_utils/python_scripts/download_golden_values.py b/tests/test_utils/python_scripts/download_golden_values.py index af0a58c3522..650867f231f 100644 --- a/tests/test_utils/python_scripts/download_golden_values.py +++ b/tests/test_utils/python_scripts/download_golden_values.py @@ -55,8 +55,8 @@ def main(pipeline_id: int, only_failing: bool): for functional_pipeline_job in functional_pipeline_jobs: job = project.jobs.get(functional_pipeline_job.id) logger.info("Starting with job %s", job.name) - if only_failing and job.status != "failed": - logger.info("Job %s is not failing. Skipping.", job.name) + if only_failing and job.status == "success": + logger.info("Job %s is successful. Skipping.", job.name) continue try: @@ -66,26 +66,44 @@ def main(pipeline_id: int, only_failing: bool): zip = zipfile.ZipFile(file_name) zip.extractall("tmp") logger.info("Downloaded artifacts of job %s", job.name) - except Exception: + except Exception as e: + logger.error("Failed to download artifacts of job %s due to %s", job.name, e) continue os.unlink(file_name) restart_dir = os.listdir(pathlib.Path("tmp") / "results" / "iteration=0")[-1] - golden_values_source = ( - pathlib.Path(ASSETS_DIR) - / f"{restart_dir}" - / "assets" - / "basic" - / f"{job.name.replace('_', '-').lower()}-{environment.replace('_', '-')}" - / f"golden_values_{environment}.json" + golden_values_sources = list( + ( + pathlib.Path(ASSETS_DIR) + / f"{restart_dir}" + / "assets" + / "basic" + / f"{job.name.replace('_', '-').lower()}-{environment.replace('_', '-')}" + ).glob("g*.json") ) + + if len(golden_values_sources) == 1: + golden_values_source = golden_values_sources[0] + else: + logger.info( + "Golden values for %s does not exist. Skip.", str(golden_values_sources) + ) + continue + + golden_values_source_name = golden_values_source.name + golden_values_source_name = golden_values_source_name.replace("_dgx_h100", "") + golden_values_source_name = golden_values_source_name.replace("_dgx_a100", "") + golden_values_source_name = golden_values_source_name.replace( + "generations", "golden_values" + ) + golden_values_target = ( pathlib.Path("tests") / "functional_tests" / 'test_cases' / job.stage / job.name - / f"golden_values_{environment}.json" + / golden_values_source_name ) if golden_values_source.exists(): diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index da0ddf9b93b..ec7e2d4a3ae 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -108,6 +108,7 @@ def launch_and_wait_for_completion( ), "HF_HUB_CACHE": "/lustre/fsw/coreai_dlalgo_mcore/hf_hub", "TRANSFORMERS_OFFLINE": "1", + "CLUSTER": cluster, } } } @@ -486,15 +487,17 @@ def main( ) if is_flaky_failure(concat_allranks_logs): - logger.error("Detected flaky failure, attempt restart.") + if n_attempts < 9: + logger.error("Detected flaky failure, attempt restart.") n_attempts += 1 continue if ( "FAILED tests/functional_tests/python_test_utils" in concat_mainrank_log ) and re.compile(r"\bEXIT_CODE=0\b").search(concat_mainrank_log) is not None: - logger.error("Non-determinism, let's try another node.") n_nondeterminism_attemps += 1 + if n_nondeterminism_attemps < 3: + logger.error("Non-determinism, let's try another node.") continue telemetrics_and_exit( diff --git a/tests/test_utils/recipes/bert.yaml b/tests/test_utils/recipes/bert.yaml index 66e870e66c6..f0be62e4701 100644 --- a/tests/test_utils/recipes/bert.yaml +++ b/tests/test_utils/recipes/bert.yaml @@ -50,7 +50,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" "TRAINING_SCRIPT_PATH=pretrain_bert.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index 2b7966bb04a..b276ac66d85 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -48,7 +48,7 @@ spec: "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" @@ -67,4 +67,3 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml index 9346c0c8123..757d3d2cd26 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml @@ -49,7 +49,7 @@ spec: "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" @@ -73,4 +73,3 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/gpt-grads.yaml index 6915a348598..ea569362311 100644 --- a/tests/test_utils/recipes/gpt-grads.yaml +++ b/tests/test_utils/recipes/gpt-grads.yaml @@ -55,7 +55,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT=1" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/gpt-nemo.yaml index fc57f54d7d7..848c1a56071 100644 --- a/tests/test_utils/recipes/gpt-nemo.yaml +++ b/tests/test_utils/recipes/gpt-nemo.yaml @@ -47,7 +47,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}" "TRAINING_SCRIPT_PATH=\"nemo llm pretrain -y --factory {nemo_model}\"" "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" ) diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/gpt-static-inference.yaml index 15385fc707a..424c424bbbf 100644 --- a/tests/test_utils/recipes/gpt-static-inference.yaml +++ b/tests/test_utils/recipes/gpt-static-inference.yaml @@ -48,9 +48,9 @@ spec: "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 83ac3a5d99a..b29fc21e877 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -61,7 +61,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index 3c4faf4ace7..f0e29999d43 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -45,9 +45,9 @@ spec: "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/mamba.yaml index f4dea805e65..7c1f9a3627f 100644 --- a/tests/test_utils/recipes/mamba.yaml +++ b/tests/test_utils/recipes/mamba.yaml @@ -48,7 +48,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}" "TRAINING_SCRIPT_PATH=pretrain_mamba.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/mimo.yaml b/tests/test_utils/recipes/mimo.yaml index 4abd34b7030..dfde82656dc 100644 --- a/tests/test_utils/recipes/mimo.yaml +++ b/tests/test_utils/recipes/mimo.yaml @@ -52,7 +52,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints" "TRAINING_SCRIPT_PATH=./examples/mimo/train.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" ) diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/moe-dynamic-inference.yaml index 516f7a390ff..3a48c2564a5 100644 --- a/tests/test_utils/recipes/moe-dynamic-inference.yaml +++ b/tests/test_utils/recipes/moe-dynamic-inference.yaml @@ -46,9 +46,9 @@ spec: "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" @@ -61,4 +61,4 @@ products: products: - environment: [dev] scope: [mr] - platforms: [dgx_h100] \ No newline at end of file + platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml index 0a86cffdf31..951820cb7ae 100644 --- a/tests/test_utils/recipes/moe-static-inference.yaml +++ b/tests/test_utils/recipes/moe-static-inference.yaml @@ -46,9 +46,9 @@ spec: "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations.json" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=false" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" @@ -66,4 +66,4 @@ products: products: - environment: [dev] scope: [mr] - platforms: [dgx_h100] \ No newline at end of file + platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 5cfa307c685..972288bd905 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -52,7 +52,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" @@ -213,9 +213,9 @@ products: platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - environment: [dev] - scope: [mr-slim] - platforms: [dgx_h100] + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - environment: [dev] + scope: [mr-slim] + platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml index d95fa186172..4de7f0a9c0f 100644 --- a/tests/test_utils/recipes/multimodal-llava.yaml +++ b/tests/test_utils/recipes/multimodal-llava.yaml @@ -49,7 +49,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints" "TRAINING_SCRIPT_PATH=pretrain_vlm.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/t5.yaml index 222ce2e9216..31a72e9b5a1 100644 --- a/tests/test_utils/recipes/t5.yaml +++ b/tests/test_utils/recipes/t5.yaml @@ -50,7 +50,7 @@ spec: "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" "TRAINING_SCRIPT_PATH=pretrain_t5.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" diff --git a/tests/unit_tests/test_muon_optimizer.py b/tests/unit_tests/test_muon_optimizer.py index d5dffcd0e19..97d78fe6c70 100644 --- a/tests/unit_tests/test_muon_optimizer.py +++ b/tests/unit_tests/test_muon_optimizer.py @@ -1,15 +1,14 @@ import os -import pytest - -from packaging.version import Version +import pytest import torch import torch.nn as nn import torch.nn.functional as F +from packaging.version import Version from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from megatron.core.optimizer.muon import get_megatron_muon_optimizer, TensorParallelMuon +from megatron.core.optimizer.muon import TensorParallelMuon, get_megatron_muon_optimizer from megatron.core.transformer import TransformerConfig from tests.unit_tests.test_utilities import Utils from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed From c7590d8c3733619efa87a1a0733ac4cceedc683a Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Mon, 13 Oct 2025 03:15:44 -0700 Subject: [PATCH 014/248] ADLR/megatron-lm!4070 - [DEV] Support Qwen3next --- gpt_builders.py | 4 +- megatron/core/models/gpt/gpt_layer_specs.py | 465 +++++++++------ .../gpt/linear_attention_module_specs.py | 39 ++ megatron/core/models/gpt/moe_module_specs.py | 6 +- megatron/core/ssm/gated_delta_net.py | 551 ++++++++++++++++++ megatron/core/transformer/attention.py | 95 ++- megatron/core/transformer/moe/moe_layer.py | 5 +- megatron/core/transformer/spec_utils.py | 1 + .../core/transformer/transformer_config.py | 85 +++ megatron/training/arguments.py | 65 ++- megatron/training/checkpointing.py | 24 +- megatron/training/training.py | 134 ++++- megatron/training/utils.py | 4 + pyproject.toml | 1 + tests/unit_tests/ssm/test_gated_delta_net.py | 319 ++++++++++ .../transformer/moe/test_shared_experts.py | 9 +- .../unit_tests/transformer/test_attention.py | 20 +- uv.lock | 221 ++++++- 18 files changed, 1792 insertions(+), 256 deletions(-) create mode 100644 megatron/core/models/gpt/linear_attention_module_specs.py create mode 100644 megatron/core/ssm/gated_delta_net.py create mode 100644 tests/unit_tests/ssm/test_gated_delta_net.py diff --git a/gpt_builders.py b/gpt_builders.py index 89b228815ff..591f74bb20c 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -41,7 +41,7 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): else: use_te = args.transformer_impl == "transformer_engine" - if args.num_experts: + if args.num_experts or (args.linear_attention_type is not None): # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( config, @@ -112,6 +112,7 @@ def _get_transformer_layer_spec(use_te, config): args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, + args.linear_attention_type, moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm, qk_l2_norm=args.qk_l2_norm, use_kitchen=config.use_kitchen, @@ -122,6 +123,7 @@ def _get_transformer_layer_spec(use_te, config): args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, + args.linear_attention_type, moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm, normalization=args.normalization, use_kitchen=config.use_kitchen, diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 68c1eb8c953..e3ef7f20141 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -5,6 +5,9 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider +from megatron.core.models.gpt.linear_attention_module_specs import ( + get_linear_attention_module_spec_for_backend, +) from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType, LayerType @@ -74,8 +77,10 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, + linear_attention_type: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, + normalization: Optional[str] = None, qk_l2_norm: Optional[bool] = False, use_te_op_fuser: Optional[bool] = False, use_kitchen: bool = False, @@ -88,10 +93,14 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. + linear_attention_type (str, optional): The type of linear attention. Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. + normalization (str, optional): The normalization to use. Defaults to None. qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. + use_kitchen (bool, optional): To use KitchenSpecProvider. Defaults to False. use_te_op_fuser (bool, optional): Use Transformer Engine's operation-based API, which may enable certain operation fusions. Defaults to False. @@ -115,8 +124,22 @@ def get_gpt_layer_with_transformer_engine_spec( else: backend = TESpecProvider() + sharded_state_dict_keys_map = {} + + attention = get_attention_module_spec_for_backend( + backend=backend, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + linear_attention_type=linear_attention_type, + qk_layernorm=qk_layernorm, + qk_l2_norm=qk_l2_norm, + multi_latent_attention=multi_latent_attention, + mla_down_proj_use_column_parallel=False, + normalization=normalization, + ) + mlp = get_mlp_module_spec_for_backend( backend=backend, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, @@ -124,77 +147,13 @@ def get_gpt_layer_with_transformer_engine_spec( use_te_activation_func=use_te_activation_func, ) - if multi_latent_attention: - assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - linear_q_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm - else backend.column_parallel_linear() - ) - linear_kv_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm - else backend.column_parallel_linear() - ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=backend.layer_norm(), - self_attention=ModuleSpec( - module=MLASelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=MLASelfAttentionSubmodules( - linear_q_proj=backend.column_parallel_linear(), - linear_q_down_proj=backend.linear(), - linear_q_up_proj=linear_q_up_proj, - linear_kv_down_proj=backend.linear(), - linear_kv_up_proj=linear_kv_up_proj, - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=IdentityOp, - kv_layernorm=IdentityOp, - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - ), - ) - else: - qk_norm = backend.layer_norm(for_qk=True) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=backend.column_parallel_layer_norm_linear(), - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - k_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", - "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", - "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", - "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", - "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", - "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", - }, - ), - ) + return get_transformer_layer_spec_for_backend( + backend=backend, + attention=attention, + mlp=mlp, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + normalization=normalization, + ) def get_gpt_layer_local_spec( @@ -202,6 +161,7 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, + linear_attention_type: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, normalization: Optional[str] = None, @@ -215,10 +175,14 @@ def get_gpt_layer_local_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. + linear_attention_type (str, optional): The type of linear attention. Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. + normalization (str, optional): The normalization to use. Defaults to None. qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. + use_kitchen (bool, optional): To use KitchenSpecProvider. Defaults to False. Returns: ModuleSpec: Module specification with Megatron-Core modules @@ -229,13 +193,6 @@ def get_gpt_layer_local_spec( backend = KitchenSpecProvider(fallback=LocalSpecProvider()) else: backend = LocalSpecProvider() - # Adjust for RMS norm. - if normalization == "RMSNorm": - layer_norm = backend.layer_norm(rms_norm=True, for_qk=False) - qk_norm = backend.layer_norm(rms_norm=True, for_qk=True) - else: - layer_norm = backend.layer_norm(rms_norm=False, for_qk=False) - qk_norm = backend.layer_norm(rms_norm=False, for_qk=True) if fp8 is not None: warnings.warn( @@ -243,6 +200,22 @@ def get_gpt_layer_local_spec( " and will be removed soon. Please update your code accordingly." ) + if linear_attention_type is not None: + raise NotImplementedError("Linear attention is not supported with local spec yet.") + + sharded_state_dict_keys_map = {} + + attention = get_attention_module_spec_for_backend( + backend=backend, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + linear_attention_type=linear_attention_type, + qk_layernorm=qk_layernorm, + qk_l2_norm=qk_l2_norm, + multi_latent_attention=multi_latent_attention, + mla_down_proj_use_column_parallel=True, + normalization=normalization, + ) + mlp = get_mlp_module_spec_for_backend( backend=backend, num_experts=num_experts, @@ -250,63 +223,162 @@ def get_gpt_layer_local_spec( moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) + return get_transformer_layer_spec_for_backend( + backend=backend, + attention=attention, + mlp=mlp, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + normalization=normalization, + ) + + +def get_transformer_layer_spec_for_backend( + backend: BackendSpecProvider, + attention: ModuleSpec, + mlp: ModuleSpec, + sharded_state_dict_keys_map: Optional[dict] = None, + normalization: Optional[str] = None, +) -> ModuleSpec: + """Helper function to get module spec for TransformerLayer""" + + rms_norm = normalization == "RMSNorm" + + input_layernorm = ( + IdentityOp + if attention.metainfo["fuse_input_layernorm"] + else backend.layer_norm(rms_norm=rms_norm, for_qk=False) + ) + pre_mlp_layernorm = ( + IdentityOp + if mlp.metainfo["fuse_pre_mlp_layernorm"] + else backend.layer_norm(rms_norm=rms_norm, for_qk=False) + ) + + transformer_layer = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=input_layernorm, + self_attention=attention, + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=pre_mlp_layernorm, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + ), + ) + return transformer_layer + + +def get_attention_module_spec_for_backend( + backend: BackendSpecProvider, + sharded_state_dict_keys_map: dict, + linear_attention_type: Optional[str] = None, + qk_layernorm: Optional[bool] = False, + qk_l2_norm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, + mla_down_proj_use_column_parallel: Optional[bool] = False, + normalization: Optional[str] = None, +) -> ModuleSpec: + """Helper function to get module spec for Attention""" + + if linear_attention_type is not None: + return get_linear_attention_module_spec_for_backend( + backend=backend, + linear_attention_type=linear_attention_type, + normalization=normalization, + ) + + # Adjust for RMS norm. + rms_norm = normalization == "RMSNorm" + qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) + if multi_latent_attention: assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=layer_norm, - self_attention=ModuleSpec( - module=MLASelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=MLASelfAttentionSubmodules( - linear_q_proj=backend.column_parallel_linear(), - linear_q_down_proj=backend.column_parallel_linear(), - linear_q_up_proj=backend.column_parallel_linear(), - linear_kv_down_proj=backend.column_parallel_linear(), - linear_kv_up_proj=backend.column_parallel_linear(), - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=qk_norm if qk_layernorm else IdentityOp, - kv_layernorm=qk_norm if qk_layernorm else IdentityOp, - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=layer_norm, - mlp=mlp, - mlp_bda=get_bias_dropout_add, + linear_q_down_proj = ( + backend.column_parallel_linear() + if mla_down_proj_use_column_parallel + else backend.linear() + ) + linear_kv_down_proj = ( + backend.column_parallel_linear() + if mla_down_proj_use_column_parallel + else backend.linear() + ) + linear_q_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm and backend.fuse_layernorm_and_linear() + else backend.column_parallel_linear() + ) + linear_kv_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm and backend.fuse_layernorm_and_linear() + else backend.column_parallel_linear() + ) + qk_norm = ( + backend.layer_norm(rms_norm=rms_norm, for_qk=True) + if qk_layernorm and not backend.fuse_layernorm_and_linear() + else IdentityOp + ) + attention = ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=backend.column_parallel_linear(), + linear_q_down_proj=linear_q_down_proj, + linear_q_up_proj=linear_q_up_proj, + linear_kv_down_proj=linear_kv_down_proj, + linear_kv_up_proj=linear_kv_up_proj, + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=qk_norm, + kv_layernorm=qk_norm, ), + metainfo={"fuse_input_layernorm": False}, ) else: - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=layer_norm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=backend.column_parallel_linear(), - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - k_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=layer_norm, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - "input_layernorm.": "self_attention.linear_qkv.layer_norm_", - "pre_mlp_layernorm.": "mlp.linear_fc1.layer_norm_", - }, + linear_qkv = ( + backend.column_parallel_layer_norm_linear() + if backend.fuse_layernorm_and_linear() + else backend.column_parallel_linear() + ) + if qk_l2_norm: + qk_norm = L2Norm + elif qk_layernorm: + qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) + else: + qk_norm = IdentityOp + attention = ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=linear_qkv, + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=qk_norm, + k_layernorm=qk_norm, ), + metainfo={"fuse_input_layernorm": backend.fuse_layernorm_and_linear()}, ) + if backend.fuse_layernorm_and_linear(): + sharded_state_dict_keys_map.update( + { + "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", + "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", + "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", + "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", + "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", + "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", + } + ) + else: + sharded_state_dict_keys_map.update( + { + "input_layernorm.": "self_attention.linear_qkv.layer_norm_", + "pre_mlp_layernorm.": "mlp.linear_fc1.layer_norm_", + } + ) + + return attention def _get_mlp_module_spec( @@ -365,6 +437,7 @@ def get_mlp_module_spec( def get_mlp_module_spec_for_backend( backend: BackendSpecProvider, + sharded_state_dict_keys_map: Optional[dict] = None, num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, moe_use_legacy_grouped_gemm: Optional[bool] = False, @@ -382,13 +455,16 @@ def get_mlp_module_spec_for_backend( if backend.fuse_layernorm_and_linear(): linear_fc1 = backend.column_parallel_layer_norm_linear() assert linear_fc1 is not None + fuse_pre_mlp_layernorm = True else: linear_fc1 = backend.column_parallel_linear() + fuse_pre_mlp_layernorm = False return ModuleSpec( module=module, submodules=MLPSubmodules( linear_fc1=linear_fc1, linear_fc2=linear_fc2, activation_func=activation_func ), + metainfo={"fuse_pre_mlp_layernorm": fuse_pre_mlp_layernorm}, ) else: # Mixture of experts with modules in megatron core. @@ -409,57 +485,62 @@ def get_gpt_decoder_block_spec( vp_stage: Optional[int] = None, pp_rank: Optional[int] = None, ) -> TransformerBlockSubmodules: - """GPT block spec.""" + """Helper function to get GPT block spec. + + Return a list of transformer layer spec of the current pipeline stage.""" + + get_layer_spec_kwargs = { + "qk_layernorm": config.qk_layernorm, + "moe_use_legacy_grouped_gemm": config.moe_use_legacy_grouped_gemm, + "qk_l2_norm": qk_l2_norm, + "use_kitchen": config.use_kitchen, + "normalization": normalization, + } if use_transformer_engine: layer_norm_impl = TENorm - dense_layer_spec = get_gpt_layer_with_transformer_engine_spec( - num_experts=None, - moe_grouped_gemm=False, - qk_layernorm=config.qk_layernorm, - multi_latent_attention=config.multi_latent_attention, - moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, - qk_l2_norm=qk_l2_norm, - use_kitchen=config.use_kitchen, - use_te_activation_func=config.use_te_activation_func, - ) - moe_layer_spec = get_gpt_layer_with_transformer_engine_spec( - num_experts=config.num_moe_experts, - moe_grouped_gemm=config.moe_grouped_gemm, - qk_layernorm=config.qk_layernorm, - multi_latent_attention=config.multi_latent_attention, - moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, - qk_l2_norm=qk_l2_norm, - use_kitchen=config.use_kitchen, - use_te_activation_func=config.use_te_activation_func, - ) + get_layer_spec_kwargs["use_te_activation_func"] = config.use_te_activation_func + get_layer_spec_fn = get_gpt_layer_with_transformer_engine_spec else: layer_norm_impl = LNImpl - dense_layer_spec = get_gpt_layer_local_spec( - num_experts=None, - moe_grouped_gemm=False, - qk_layernorm=config.qk_layernorm, - multi_latent_attention=config.multi_latent_attention, - moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, - normalization=normalization, - qk_l2_norm=qk_l2_norm, - use_kitchen=config.use_kitchen, - ) - moe_layer_spec = get_gpt_layer_local_spec( - num_experts=config.num_moe_experts, - moe_grouped_gemm=config.moe_grouped_gemm, - qk_layernorm=config.qk_layernorm, - multi_latent_attention=config.multi_latent_attention, - moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, - normalization=normalization, - qk_l2_norm=qk_l2_norm, - use_kitchen=config.use_kitchen, - ) + get_layer_spec_fn = get_gpt_layer_local_spec + + layer_spec_dict = {} + for mlp_type in ["dense", "moe"]: + for attention_type in ["softmax_attention", "linear_attention"]: + if mlp_type == "moe": + if config.moe_layer_freq is None: + # Skip if there is no MoE layer in the model. + continue + num_experts = config.num_moe_experts + moe_grouped_gemm = config.moe_grouped_gemm + else: + num_experts = None + moe_grouped_gemm = None + if attention_type == "linear_attention": + if config.linear_attention_type is None: + # Skip if there is no linear attention layer in the model. + continue + linear_attention_type = config.linear_attention_type + multi_latent_attention = None + else: + linear_attention_type = None + multi_latent_attention = config.multi_latent_attention + + layer_spec_key = f"{mlp_type}_{attention_type}" + layer_spec_dict[layer_spec_key] = get_layer_spec_fn( + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + multi_latent_attention=multi_latent_attention, + linear_attention_type=linear_attention_type, + **get_layer_spec_kwargs, + ) # Parse config.moe_layer_freq to determine the pattern of expert/dense layers. # 0 stands for dense layers, 1 stands for expert layers. # For integer N: Creates a pattern with one expert layer every N layers. # For string pattern: Evaluates the str directly (e.g. "[1,0,1]" for alternating expert/dense). if isinstance(config.moe_layer_freq, int): + # [1,0,0,...,0,1,0,0,...,0,...] moe_layer_pattern = [ 1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.num_layers) ] @@ -475,15 +556,49 @@ def get_gpt_decoder_block_spec( f"Invalid moe_layer_freq: {type(config.moe_layer_freq)}, {config.moe_layer_freq}" ) + # Parse config.linear_attention_freq to determine the pattern of expert/dense layers. + # 0 stands for SDPA layers, 1 stands for LA layers. + # For integer N: Creates a pattern with (N-1) LA layers and 1 SDPA layer every N layers. + # For string pattern: Evaluates the str directly (e.g. "[1,0,1]" for alternating LA/SDPA). + if isinstance(config.linear_attention_freq, int): + linear_attention_pattern = [ + # [1,1,...,1,0,1,1,...,1,0,...] + 0 if ((i + 1) % config.linear_attention_freq == 0) else 1 + for i in range(config.num_layers) + ] + elif isinstance(config.linear_attention_freq, list): + linear_attention_pattern = config.linear_attention_freq + assert len(linear_attention_pattern) == config.num_layers, ( + f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, " + f"expected {config.num_layers}, " + f"current linear attention pattern: {config.linear_attention_freq}" + ) + elif config.linear_attention_freq is None: + if config.linear_attention_type is None: + linear_attention_pattern = [0] * config.num_layers + else: + linear_attention_pattern = [1] * config.num_layers + warnings.warn( + "Linear attention type is specified but linear_attention_freq is None. " + "Setting linear_attention_pattern to [1] * config.num_layers as default." + ) + else: + raise ValueError( + f"Invalid linear_attention_freq: {type(config.linear_attention_freq)}," + f" {config.linear_attention_freq}" + ) + # Create the layer specs for the model. layer_specs = [] for layer_number in range(config.num_layers): - if moe_layer_pattern[layer_number] == 1: - layer_specs.append(moe_layer_spec) - elif moe_layer_pattern[layer_number] == 0: - layer_specs.append(dense_layer_spec) - else: - raise ValueError(f"Invalid layer pattern: {moe_layer_pattern}") + mlp_type = "moe" if moe_layer_pattern[layer_number] else "dense" + attention_type = ( + "linear_attention" if linear_attention_pattern[layer_number] else "softmax_attention" + ) + layer_spec_key = f"{mlp_type}_{attention_type}" + if layer_spec_key not in layer_spec_dict: + raise ValueError(f"Invalid layer spec key: {layer_spec_key}") + layer_specs.append(layer_spec_dict[layer_spec_key]) # Slice the layer specs to only include the layers that are built in this pipeline stage. # Note: MCore layer_number starts at 1 diff --git a/megatron/core/models/gpt/linear_attention_module_specs.py b/megatron/core/models/gpt/linear_attention_module_specs.py new file mode 100644 index 00000000000..af23b4b2c08 --- /dev/null +++ b/megatron/core/models/gpt/linear_attention_module_specs.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from typing import Optional + +from megatron.core.models.backends import BackendSpecProvider +from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec + + +def get_linear_attention_module_spec_for_backend( + backend: BackendSpecProvider, linear_attention_type: str, normalization: Optional[str] = None +) -> ModuleSpec: + """Helper function to get module spec for Linear Attention""" + rms_norm = normalization == "RMSNorm" + if linear_attention_type == "mamba": + attention = ( + ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=backend.column_parallel_layer_norm_linear(), + out_proj=backend.row_parallel_linear(), + ), + metainfo={"fuse_input_layernorm": True}, + ), + ) + elif linear_attention_type == "gated_delta_net": + attention = ModuleSpec( + module=GatedDeltaNet, + submodules=GatedDeltaNetSubmodules( + in_proj=backend.column_parallel_layer_norm_linear(), + out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False), + out_proj=backend.row_parallel_linear(), + ), + metainfo={"fuse_input_layernorm": True}, + ) + else: + raise ValueError(f"Invalid linear attention type: {linear_attention_type}") + return attention diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py index e1ea7c163e9..1de0f14efcd 100755 --- a/megatron/core/models/gpt/moe_module_specs.py +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -65,10 +65,12 @@ def get_moe_module_spec_for_backend( experts = ModuleSpec(module=expert_module, submodules=expert_submodule) # shared experts spec - shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp) + shared_experts = ModuleSpec(module=SharedExpertMLP, submodules=mlp) # MoE module spec moe_module_spec = ModuleSpec( - module=MoELayer, submodules=MoESubmodules(experts=experts, shared_experts=shared_experts) + module=MoELayer, + submodules=MoESubmodules(experts=experts, shared_experts=shared_experts), + metainfo={"fuse_pre_mlp_layernorm": False}, ) return moe_module_spec diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py new file mode 100644 index 00000000000..45588341a39 --- /dev/null +++ b/megatron/core/ssm/gated_delta_net.py @@ -0,0 +1,551 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, Songlin Yang, Jan Kautz, Ali Hatamizadeh. + +# Some of this code was adopted from https://github.com/huggingface/transformers +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass, replace +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ReplicaId, ShardedTensorFactory +from megatron.core.fp8_utils import get_fp8_align_size +from megatron.core.inference.contexts import BaseInferenceContext +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.utils import ( + make_sharded_tensors_for_checkpoint, + sharded_state_dict_default, +) +from megatron.core.utils import deprecate_inference_params, nvtx_range_pop, nvtx_range_push + +# TODO: Implement GatedDeltaNetContextParallel +# from .gated_delta_net_context_parallel import GatedDeltaNetContextParallel + +try: + from fla.modules.l2norm import l2norm + from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule + + HAVE_FLA = True +except ImportError: + chunk_gated_delta_rule = None + fused_recurrent_gated_delta_rule = None + + HAVE_FLA = False + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +except ImportError: + causal_conv1d_fn = None + causal_conv1d_update = None + + +logger = logging.getLogger(__name__) + + +@dataclass +class GatedDeltaNetSubmodules: + """ + Contains the module specs for the input linear, output norm, and output linear layers. + """ + + in_proj: Union[ModuleSpec, type] = IdentityOp + out_norm: Union[ModuleSpec, type] = IdentityOp + out_proj: Union[ModuleSpec, type] = IdentityOp + + +class GatedDeltaNet(MegatronModule): + """Gated Delta Net (GDN) layer class + + GDN layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: GatedDeltaNetSubmodules, + layer_number: int = None, + bias: bool = False, + conv_bias: bool = False, + conv_init: Optional[float] = None, + use_qk_l2norm: bool = True, + A_init_range: Tuple[float, float] = (1, 16), + pg_collection: ProcessGroupCollection = None, + ): + """ + Args: + config: The config of the model. + submodules: Contains the module specs for the input and output linear layers. + layer_number: The layer number of this GDN layer. + bias: Whether to use bias in the linear layers. + conv_bias: Whether to use bias in the causal convolution. + conv_init: The initialization range for the causal convolution weights. + use_qk_l2norm: Whether to use L2 normalization in the kernel of the gated delta rule. + A_init_range: The initialization range for the attention weights. + pg_collection: The required process groups to use for tensor model parallel and context + parallel. + """ + + if not HAVE_FLA: + raise ImportError("FLA is not installed. Please install it with `pip install fla`.") + + super().__init__(config) + + # Attributes from arguments + self.layer_number = layer_number + self.bias = bias + self.conv_bias = conv_bias + self.conv_init = conv_init + assert A_init_range[0] >= 0 and A_init_range[1] >= A_init_range[0] + self.A_init_range = A_init_range + self.use_qk_l2norm = use_qk_l2norm + assert pg_collection is not None, "pg_collection must be provided for GatedDeltaNet" + self.pg_collection = pg_collection + self.tp_size = self.pg_collection.tp.size() + self.sp_size = self.tp_size if config.sequence_parallel else 1 + + # Attributes from config + self.config = config + self.hidden_size = config.hidden_size + self.act_fn = config.activation_func + self.activation = self.act_fn.__name__ + self.conv_kernel_dim = config.linear_conv_kernel_dim + self.key_head_dim = config.linear_key_head_dim + self.value_head_dim = config.linear_value_head_dim + self.num_key_heads = config.linear_num_key_heads + self.num_value_heads = config.linear_num_value_heads + self.qk_dim = self.key_head_dim * self.num_key_heads + self.v_dim = self.value_head_dim * self.num_value_heads + + # Input projection (hidden_states -> q, k, v, gate, beta, alpha) + # TODO: for now, output gate is forced for GDN. + # We may remove this restriction in the future. + self.in_proj_dim = self.qk_dim * 2 + self.v_dim * 2 + self.num_value_heads * 2 + if self.config.fp8: + fp8_align_size = get_fp8_align_size(self.config.fp8_recipe) + assert self.in_proj_dim % fp8_align_size == 0, ( + "For FP8, the innermost dimension of the GDN layer " + "input projection output tensor must be a multiple of 16." + ) + self.in_proj = build_module( + submodules.in_proj, + self.hidden_size, + self.in_proj_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name="fc1", + tp_group=self.pg_collection.tp, + ) + + # Conv1d for QKV + self.conv_dim = self.qk_dim * 2 + self.v_dim + self.conv_dim_local_tp = self.conv_dim // self.tp_size + + # weight shape: [conv_dim, 1, d_conv] + # bias shape: [conv_dim] + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim_local_tp, + out_channels=self.conv_dim_local_tp, + bias=conv_bias, + kernel_size=self.conv_kernel_dim, + groups=self.conv_dim_local_tp, + padding=self.conv_kernel_dim - 1, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + setattr(self.conv1d.weight, "tensor_model_parallel", True) + if conv_bias: + setattr(self.conv1d.bias, "tensor_model_parallel", True) + + # Time step projection (discretization) + self.num_v_heads_local_tp = self.num_value_heads // self.tp_size + # dt_bias parameter + self.dt_bias = nn.Parameter( + torch.empty( + self.num_v_heads_local_tp, + dtype=config.params_dtype, + device=torch.cuda.current_device(), + ) + ) + setattr(self.dt_bias, "tensor_model_parallel", True) + # A_log parameter + self.A_log = nn.Parameter( + torch.empty( + self.num_v_heads_local_tp, + dtype=config.params_dtype, + device=torch.cuda.current_device(), + ) + ) + setattr(self.A_log, "tensor_model_parallel", True) + + # Output layernorm before projection + self.out_norm = build_module( + submodules.out_norm, + config=self.config, + hidden_size=self.value_head_dim, + eps=self.config.layernorm_epsilon, + ) + + self.out_proj = build_module( + submodules.out_proj, + self.v_dim, + self.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=bias, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name="fc2", + tp_group=self.pg_collection.tp, + ) + + # TODO: support CP + + self.reset_parameters() + + def reset_parameters(self): + """Reset the parameters.""" + if self.config.perform_initialization: + with get_cuda_rng_tracker().fork(): + # conv1d.weight + if self.conv_init is not None: + nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) + # dt_bias + torch.ones( + self.num_v_heads_local_tp, + out=self.dt_bias.data, + dtype=self.config.params_dtype, + device=torch.cuda.current_device(), + ) + # A_log + A = torch.empty( + self.num_v_heads_local_tp, + dtype=self.config.params_dtype, + device=torch.cuda.current_device(), + ).uniform_(*self.A_init_range) + self.A_log.data.copy_(A) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + key_value_states: Optional[Tensor] = None, + inference_context: Optional[BaseInferenceContext] = None, + rotary_pos_emb: Optional[Union[Tensor, Tuple[Tensor, Tensor]]] = None, + rotary_pos_cos: Optional[Tensor] = None, + rotary_pos_sin: Optional[Tensor] = None, + rotary_pos_cos_sin: Optional[Tensor] = None, + attention_bias: Optional[Tensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + sequence_len_offset: Optional[int] = None, + *, + inference_params: Optional[BaseInferenceContext] = None, + ): + """ + Perform a forward pass through the GDN module. + + Args: + hidden_states (Tensor): Hidden states. + attention_mask (Tensor): Attention mask. + key_value_states (Optional[Tensor]): Key/value states (for cross attention). + inference_context (Optional[BaseInferenceContext]): Inference context that manages + KV cache. + rotary_pos_emb (Optional[Union[Tensor, Tuple[Tensor, Tensor]]]): Rotary + embedding tensor(s). + rotary_pos_cos (Optional[Tensor]): Rotary embedding cosine. + rotary_pos_sin (Optional[Tensor]): Rotary embedding sine. + rotary_pos_cos_sin (Optional[Tensor]): Combined rotary embedding cosine and sine. + attention_bias (Optional[Tensor]): Attention bias. + packed_seq_params (Optional[PackedSeqparams]): Parameters used for THD format. + sequence_len_offset (Optional[int]): Sequence length offset used for + inference CUDA graphs. + + Return: + (Tuple[Tensor, Tensor]) GDN output and bias. + + """ + # TODO: Deal with attention_mask + + inference_context = deprecate_inference_params(inference_context, inference_params) + + seq_len, batch, _ = hidden_states.shape + seq_len = seq_len * self.sp_size + + if inference_context is not None: + assert ( + inference_context.is_static_batching() + ), "GDN does not currently support dynamic inference batching." + assert not self.config.sequence_parallel + # TODO: support inference + raise NotImplementedError("GDN does not support inference for now.") + + if packed_seq_params is not None: + # TODO: support packed sequence + raise NotImplementedError("GDN does not support packed sequence for now.") + + # Input projection + nvtx_range_push(suffix="in_proj") + qkvzba, _ = self.in_proj(hidden_states) + nvtx_range_pop(suffix="in_proj") + + # Transpose: s b x --> b s x + # From sbhd to bshd format + qkvzba = qkvzba.transpose(0, 1) + + # Split, reorder, and reshape the tensor into q, k, v, gate, beta, alpha + qkv, gate, beta, alpha = torch.split( + qkvzba, + [ + (self.qk_dim * 2 + self.v_dim) // self.tp_size, + self.v_dim // self.tp_size, + self.num_value_heads // self.tp_size, + self.num_value_heads // self.tp_size, + ], + dim=-1, + ) + gate = gate.reshape(batch, seq_len, -1, self.value_head_dim) + beta = beta.reshape(batch, seq_len, -1) + alpha = alpha.reshape(batch, seq_len, -1) + + # Convolution on qkv + qkv = qkv.transpose(1, 2).contiguous() # b, s, d -> b, d, s + nvtx_range_push(suffix="conv1d") + if causal_conv1d_fn is None: + qkv = self.act_fn(self.conv1d(qkv)[..., :seq_len]) + else: + assert self.activation in ["silu", "swish"] + qkv = causal_conv1d_fn( + x=qkv, + weight=self.conv1d.weight.squeeze(1), # d, 1, w -> d, w + bias=self.conv1d.bias, + activation=self.activation, + ) + nvtx_range_pop(suffix="conv1d") + # Split qkv into query, key, and value + qkv = qkv.transpose(1, 2) # b, d, s -> b, s, d + query, key, value = torch.split( + qkv, + [self.qk_dim // self.tp_size, self.qk_dim // self.tp_size, self.v_dim // self.tp_size], + dim=-1, + ) + query = query.reshape(batch, seq_len, -1, self.key_head_dim) + key = key.reshape(batch, seq_len, -1, self.key_head_dim) + value = value.reshape(batch, seq_len, -1, self.value_head_dim) + # Apply L2 norm to query and key + if self.use_qk_l2norm: + query = l2norm(query.contiguous()) + key = l2norm(key.contiguous()) + if self.num_value_heads // self.num_key_heads > 1: + query = query.repeat_interleave(self.num_value_heads // self.num_key_heads, dim=2) + key = key.repeat_interleave(self.num_value_heads // self.num_key_heads, dim=2) + + # Make contiguous + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + gate = gate.contiguous() + beta = beta.contiguous() + alpha = alpha.contiguous() + + # Calculate g and beta + nvtx_range_push(suffix="g_and_beta") + g = -self.A_log.exp() * F.softplus(alpha.float() + self.dt_bias) # In fp32 + beta = beta.sigmoid() + nvtx_range_pop(suffix="g_and_beta") + + nvtx_range_push(suffix="gated_delta_rule") + core_attn_out, last_recurrent_state = chunk_gated_delta_rule( + query, + key, + value, + g=g, + beta=beta, + initial_state=None, + output_final_state=False, + use_qk_l2norm_in_kernel=False, + ) + nvtx_range_pop(suffix="gated_delta_rule") + + # RMSNorm + nvtx_range_push(suffix="gated_norm") + norm_out = self._torch_compiled_gated_norm(core_attn_out, gate) + nvtx_range_pop(suffix="gated_norm") + + # Transpose: b s x --> s b x + # From bshd back to sbhd format + norm_out = norm_out.reshape(batch, seq_len, -1) + norm_out = norm_out.transpose(0, 1).contiguous() + + # Output projection + nvtx_range_push(suffix="out_proj") + out, out_bias = self.out_proj(norm_out) + nvtx_range_pop(suffix="out_proj") + + return out, out_bias + + @torch.compile + def _torch_compiled_gated_norm(self, x, gate): + # Output Norm + x_dtype = x.dtype + x = x.reshape(-1, x.shape[-1]) + y = self.out_norm(x) + # Output gate + gate = gate.reshape(-1, gate.shape[-1]) + y = y * self.act_fn(gate.float()) + y = y.to(x_dtype) + return y + + def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): + """Provide a sharded state dictionary for distributed checkpointing.""" + sharded_state_dict = {} + # Parameters + self._save_to_state_dict(sharded_state_dict, "", keep_vars=True) + sharded_state_dict = make_sharded_tensors_for_checkpoint( + sharded_state_dict, + prefix, + tensor_parallel_layers_axis_map={ + "A_log": 0, + "dt_bias": 0, + }, # parameters sharded across TP + sharded_offsets=sharded_offsets, + ) + # Submodules + for name, module in self.named_children(): + if name == "conv1d": + # Add TP sharding for Conv1d + module_sd = module.state_dict(prefix="", keep_vars=True) + tp_sharding_map = {f"weight": 0} + if self.conv_bias: + tp_sharding_map[f"bias"] = 0 + module_sharded_sd = make_sharded_tensors_for_checkpoint( + module_sd, f"{prefix}{name}.", tp_sharding_map, sharded_offsets + ) + else: + module_sharded_sd = sharded_state_dict_default( + module, f"{prefix}{name}.", sharded_offsets, metadata + ) + + sharded_state_dict.update(module_sharded_sd) + + # At this point the TP sharding is correctly defined for each tensor, but some of the + # tensors must be additionally split into separate parts + in_proj_dim_local_tp = self.in_proj_dim // self.tp_size + assert sharded_state_dict[f"{prefix}in_proj.weight"].data.size(0) == in_proj_dim_local_tp, ( + in_proj_dim_local_tp, + sharded_state_dict[f"{prefix}in_proj.weight"], + ) + + sharded_state_dict[f"{prefix}in_proj.weight"] = _split_tensor_factory( + sharded_state_dict[f"{prefix}in_proj.weight"], + [ + self.qk_dim // self.tp_size, + self.qk_dim // self.tp_size, + self.v_dim // self.tp_size, + self.v_dim // self.tp_size, + self.num_value_heads // self.tp_size, + self.num_value_heads // self.tp_size, + ], + ["query", "key", "value", "z", "beta", "alpha"], + 0, + ) + + conv_layer_name_list = ["conv1d.weight"] + assert ( + sharded_state_dict[f"{prefix}conv1d.weight"].data.size(0) == self.conv_dim_local_tp + ), (self.conv_dim_local_tp, sharded_state_dict[f"{prefix}conv1d.weight"]) + if self.conv_bias: + conv_layer_name_list.append("conv1d.bias") + assert ( + sharded_state_dict[f"{prefix}conv1d.bias"].data.size(0) == self.conv_dim_local_tp + ), (self.conv_dim_local_tp, sharded_state_dict[f"{prefix}conv1d.bias"]) + for conv_layer_name in conv_layer_name_list: + sharded_state_dict[f"{prefix}{conv_layer_name}"] = _split_tensor_factory( + sharded_state_dict[f"{prefix}{conv_layer_name}"], + [ + self.qk_dim // self.tp_size, + self.qk_dim // self.tp_size, + self.v_dim // self.tp_size, + ], + ["query", "key", "value"], + 0, + ) + + return sharded_state_dict + + +def _split_tensor_factory( + orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int +) -> ShardedTensorFactory: + """Builds a factory that splits a given ShardedTensor into several independent chunks.""" + assert isinstance(orig_sh_ten, ShardedTensor), type(orig_sh_ten) + orig_sh_ten_no_data = orig_sh_ten.without_data() # remove `data` reference + + if sum(split_sections) != orig_sh_ten_no_data.local_shape[split_dim]: + raise ValueError( + f"Split sections must cover the whole dimension size, " + f"got {split_sections=} vs dimensions size " + f"{orig_sh_ten_no_data.local_shape[split_dim]}" + ) + + assert not isinstance( + split_sections, int + ), "Splitting into predefined section sizes is supported (`split_sections` must be a list)" + assert len(split_sections) == len(split_names), (len(split_sections), len(split_names)) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice] + ): + factory_sh_ten = replace( + orig_sh_ten_no_data, + key=key, + data=t, + dtype=t.dtype, + replica_id=replica_id, + flattened_range=flattened_range, + ) + + chunk_sh_tens = [] + split_start = 0 + for split_size, split_name in zip(split_sections, split_names): + split_chunks = factory_sh_ten.narrow(split_dim, split_start, split_size) + for sh_ten in split_chunks: + sh_ten.key = f"{sh_ten.key}.{split_name}" + chunk_sh_tens.extend(split_chunks) + split_start += split_size + + assert split_start == orig_sh_ten_no_data.local_shape[split_dim], ( + split_start, + orig_sh_ten_no_data.local_shape[split_dim], + ) + assert sum(sh_ten.data.numel() for sh_ten in chunk_sh_tens) == t.numel(), ( + chunk_sh_tens, + t.shape, + ) + return chunk_sh_tens + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict): + return torch.cat(sub_state_dict) + + return ShardedTensorFactory( + orig_sh_ten.key, orig_sh_ten.data, sh_ten_build_fn, sh_ten_merge_fn, orig_sh_ten.replica_id + ) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 74d30477e5c..518d82a0332 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -434,7 +434,7 @@ def _adjust_key_value_for_inference( return query, key, value, rotary_pos_emb, attn_mask_type, block_table @abstractmethod - def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=True): + def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate, split_qkv=True): """ This method needs to be implemented based on whether the derived class is "self-attn" or "cross-attn". @@ -718,19 +718,25 @@ def forward( self.k_layernorm is None or isinstance(self.k_layernorm, IdentityOp), ] ) + output_gate = self.config.attention_output_gate # Check if fused_single_qkv_rope is requested but either unavailable or not # supported for the current use case. if self.attention_type != "cross": assert not ( self.config.fused_single_qkv_rope and split_qkv ), "fused_single_qkv_rope requested but not available/supported for the config." + if output_gate: + assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor." qkv_output = self.get_query_key_value_tensors( - hidden_states, key_value_states, split_qkv=split_qkv + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv ) attn_mask_type = self.attn_mask_type block_table = None - if split_qkv: + gate = None + if output_gate and split_qkv: + query, key, value, gate = qkv_output + elif split_qkv: query, key, value = qkv_output else: mixed_qkv, qkv_split_arg_list = qkv_output @@ -912,6 +918,12 @@ def forward( core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) nvtx_range_pop(suffix="core_attention") + # Output gate + if gate is not None: + nvtx_range_push(suffix="output_gate") + core_attn_out = self._torch_compiled_output_gate(core_attn_out, gate) + nvtx_range_pop(suffix="output_gate") + # ================= # Output. [sq, b, h] # ================= @@ -922,6 +934,15 @@ def forward( return output, bias + @torch.compile + def _torch_compiled_output_gate(self, x, gate): + x_dtype = x.dtype + gate = gate.contiguous() + gate = gate.view(*x.shape) + x = x * torch.sigmoid(gate.float()) + x = x.to(x_dtype) + return x + def set_for_recompute_input_layernorm(self): """Set the attention layer for recompute input_layernorm. Only needed for fp8.""" raise NotImplementedError("set_for_recompute_input_layernorm is not implemented.") @@ -953,10 +974,13 @@ def __init__( pg_collection=pg_collection, ) + self.linear_qkv_out_dim = self.query_projection_size + 2 * self.kv_projection_size + if self.config.attention_output_gate: + self.linear_qkv_out_dim += self.config.kv_channels * self.config.num_attention_heads self.linear_qkv = build_module( submodules.linear_qkv, self.config.hidden_size, - self.query_projection_size + 2 * self.kv_projection_size, + self.linear_qkv_out_dim, config=self.config, init_method=self.config.init_method, gather_output=False, @@ -1058,30 +1082,44 @@ def _compare(srcs, tgts, names, parallelism): "TP", ) - def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=True): + def get_query_key_value_tensors( + self, hidden_states, + key_value_states=None, + output_gate=False, + split_qkv=True + ): """ - Derives `query`, `key` and `value` tensors from `hidden_states`. If `split_qkv=False`, then - the unsplit mixed_qkv tensor is returned. + Derives `query`, `key`, `value` tensors from `hidden_states`. + If `output_gate` is True, then also derives `gate` tensor. + If `split_qkv=False`, then the unsplit mixed_qkv tensor is returned. """ - # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + # If no output gate: Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + # If have output gate: Attention heads [sq, b, h] --> [sq, b, ng * (2 * np/ng + 2) * hn)] mixed_qkv, _ = self.linear_qkv(hidden_states) + num_query_heads_per_group = ( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition + ) + if output_gate: + num_qkv_heads_per_group = 2 * num_query_heads_per_group + 2 + else: + num_qkv_heads_per_group = num_query_heads_per_group + 2 - # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + # If no output gate: [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + # If have output gate: [sq, b, hp] --> [sq, b, ng, (2 * np/ng + 2) * hn] new_tensor_shape = mixed_qkv.size()[:-1] + ( self.num_query_groups_per_partition, - ( - (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) - * self.hidden_size_per_attention_head - ), + num_qkv_heads_per_group * self.hidden_size_per_attention_head, ) mixed_qkv = mixed_qkv.view(*new_tensor_shape) + # Split the tensor into query, gate, key, and value. + # If no output gate: [sq, b, ng, (np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], None, [sq, b, ng, hn], [sq, b, ng, hn] + # If have output gate: [sq, b, ng, (2 * np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] split_arg_list = [ - ( - self.num_attention_heads_per_partition - // self.num_query_groups_per_partition - * self.hidden_size_per_attention_head - ), + num_query_heads_per_group * self.hidden_size_per_attention_head, + num_query_heads_per_group * self.hidden_size_per_attention_head if output_gate else 0, self.hidden_size_per_attention_head, self.hidden_size_per_attention_head, ] @@ -1091,18 +1129,15 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None, spli return mixed_qkv, split_arg_list if SplitAlongDim is not None: - - # [sq, b, ng, (np/ng + 2) * hn] - # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] - (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) + (query, gate, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) else: + (query, gate, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) - # [sq, b, ng, (np/ng + 2) * hn] - # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] - (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) - - # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + # Query [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + if output_gate: + # Gate [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + gate = gate.reshape(gate.size(0), gate.size(1), -1, self.hidden_size_per_attention_head) if self.q_layernorm is not None: query = self.q_layernorm(query) @@ -1113,6 +1148,8 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None, spli if self.config.test_mode: self.run_realtime_tests() + if output_gate: + return query, key, value, gate return query, key, value def backward_dw(self) -> NoReturn: @@ -1189,11 +1226,13 @@ def __init__( is_expert=False, ) - def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=True): + def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=False, split_qkv=True): """ Derives `query` tensor from `hidden_states`, and `key`/`value` tensors from `key_value_states`. """ + assert not output_gate, "Output gate is not supported in cross attention for now." + assert split_qkv, "split_qkv must be True for CrossAttention" # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv, _ = self.linear_kv(key_value_states) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index bbb5fce4e33..2e6fb68e444 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -161,7 +161,10 @@ def __init__( # Initialize shared experts if self.use_shared_expert: self.shared_experts = build_module( - self.submodules.shared_experts, config=self.config, pg_collection=pg_collection + self.submodules.shared_experts, + config=self.config, + pg_collection=pg_collection, + gate=self.config.moe_shared_expert_gate, ) if self.shared_expert_overlap: self.token_dispatcher.set_shared_experts(self.shared_experts) diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py index b3de8541734..897d88d2aa3 100644 --- a/megatron/core/transformer/spec_utils.py +++ b/megatron/core/transformer/spec_utils.py @@ -25,6 +25,7 @@ class ModuleSpec: module: Union[Tuple, type] params: dict = field(default_factory=lambda: {}) submodules: type = None + metainfo: dict = field(default_factory=lambda: {}) def import_module(module_path: Tuple[str]): diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 88da736415e..dc11239836f 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -192,6 +192,9 @@ class TransformerConfig(ModelParallelConfig): qk_layernorm: bool = False """Whether to apply `normalization` type of normalization to the query and key embeddings.""" + attention_output_gate: bool = False + """Whether to apply output gate to the attention layers.""" + test_mode: bool = False """Whether to run real-time tests.""" @@ -212,6 +215,34 @@ class TransformerConfig(ModelParallelConfig): moe_deepep_num_sms: int = 20 """Number of SMs to use for DeepEP.""" + #################### + # linear attention + #################### + linear_attention_type: Optional[str] = None + """Type of linear attention to use. Currently support gated_delta_net.""" + + linear_attention_freq: Optional[Union[int, List[int]]] = None + """Frequency between LA (linear attention) layers + and SDPA (scaled dot-product attention) layers. + Accepts either: + - An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer + - A list that defines a custom pattern, e.g.: [1,1,1,0,1,1,1,0,1,1,1,0]""" + + linear_conv_kernel_dim: Optional[int] = None + """Conv kernel dimension for the gated delta net.""" + + linear_key_head_dim: Optional[int] = None + """Query and key head dimension for the gated delta net.""" + + linear_value_head_dim: Optional[int] = None + """Value and gate head dimension for the gated delta net.""" + + linear_num_key_heads: Optional[int] = None + """Number of query and key heads for the gated delta net.""" + + linear_num_value_heads: Optional[int] = None + """Number of value and gate heads for the gated delta net.""" + #################### # initialization #################### @@ -429,6 +460,9 @@ class TransformerConfig(ModelParallelConfig): there are multiple shared experts. None means no shared expert.""" + moe_shared_expert_gate: bool = False + """Enable gate for shared expert.""" + moe_shared_expert_overlap: bool = False """Enable overlapping between shared expert computations and dispatcher communications. Without this, the shared epxerts execute after the routed experts.""" @@ -744,6 +778,54 @@ def __post_init__(self): f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." ) + if self.linear_attention_type is not None: + supported_la_types = ["gated_delta_net", "mamba"] + assert self.linear_attention_type in supported_la_types, ( + f"linear_attention_type ({self.linear_attention_type}) only support" + f" one of {supported_la_types}." + ) + assert ( + self.linear_attention_freq is not None + ), f"linear_attention_freq must be set for linear attention." + + if self.linear_attention_type == "gated_delta_net": + # Check required parameters + assert ( + self.linear_conv_kernel_dim is not None + ), "linear_conv_kernel_dim must be set for gated delta net." + assert ( + self.linear_key_head_dim is not None + ), "linear_key_head_dim must be set for gated delta net." + assert ( + self.linear_value_head_dim is not None + ), "linear_value_head_dim must be set for gated delta net." + assert ( + self.linear_num_key_heads is not None + ), "linear_num_key_heads must be set for gated delta net." + assert ( + self.linear_num_value_heads is not None + ), "linear_num_value_heads must be set for gated delta net." + assert self.linear_num_value_heads % self.linear_num_key_heads == 0, ( + f"linear_num_value_heads ({self.linear_num_value_heads}) must be a multiple of " + f"linear_num_key_heads ({self.linear_num_key_heads})." + ) + + # Check tensor parallelism compatibility + assert ( + self.linear_num_key_heads % self.tensor_model_parallel_size == 0 + ), "linear_num_key_heads must be a multiple of tensor_model_parallel_size." + assert ( + self.linear_num_value_heads % self.tensor_model_parallel_size == 0 + ), "linear_num_value_heads must be a multiple of tensor_model_parallel_size." + + # Do not support yet, but coming soon. + assert self.context_parallel_size == 1, ( + f"Gated delta net does not support context parallel for now," + f" but got {self.context_parallel_size=}." + ) + elif self.linear_attention_type == "mamba": + raise NotImplementedError("Mamba is not supported yet.") + if self.fp8: # cannot support first last layer bf16 with delayed scaling if self.first_last_layers_bf16 and self.fp8_recipe == Fp8Recipe.delayed: @@ -1553,6 +1635,9 @@ def __post_init__(self): if self.multi_latent_attention and self.apply_rope_fusion and self.rope_type != "yarn": raise ValueError("apply_rope_fusion for MLA only works with YARN RoPE.") + if self.attention_output_gate: + raise NotImplementedError("Output gate is not supported for MLA yet.") + if self.cache_mla_latents: assert ( self.apply_rope_fusion is False diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index dc33a639e8d..29db36ca6e0 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -69,6 +69,7 @@ def add_megatron_arguments(parser: argparse.ArgumentParser): parser = _add_vision_args(parser) parser = _add_moe_args(parser) parser = _add_mla_args(parser) + parser = _add_linear_attention_args(parser) parser = _add_heterogeneous_args(parser) parser = _add_logging_args(parser) parser = _add_straggler_detector_args(parser) @@ -319,7 +320,7 @@ def moe_freq_type(x): This allows defining arbitrary patterns of expert and dense layers. The pattern length must match the total number of transformer layers. Examples: - "([0]+[1]*23)": 1 dense layer followed by 23 experts layers + "([0]+[1]*23)": 1 dense layer followed by 23 expert layers "([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice. """ if isinstance(x, int): @@ -332,6 +333,31 @@ def moe_freq_type(x): # it's a single int but in str return int(x) +def la_freq_type(x): + """Frequency between LA (linear attention) layers and SDPA (scaled dot-product attention) layers. + + Accepts either: + - An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer + - A string "N": Same as above, but provided as a string + - A string containing a Python list expression that defines a custom pattern, e.g.: + "([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] + where 1 indicates an LA layer and 0 indicates a SDPA layer. + This allows defining arbitrary patterns of LA and SDPA layers. + The pattern length must match the total number of transformer layers. + Examples: + "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers + "([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice. + """ + if x is None or isinstance(x, int): + return x + assert isinstance(x, str) + if '[' in x: + # it's a custom pattern + return _eval_pattern(x) + else: + # it's a single int but in str + return int(x) + def tuple_type(x): """ Convert a string to a tuple of integers. @@ -1542,6 +1568,8 @@ def _add_network_size_args(parser): group.add_argument('--group-query-attention', action='store_true', help='Use group-query attention.') group.add_argument('--num-query-groups', type=int, default=1) + group.add_argument('--attention-output-gate', action='store_true', + help='Whether to apply output gate to the attention.') group.add_argument('--softmax-type', type=str, default='vanilla', choices=['learnable', 'vanilla', 'off-by-one'], help='Type of softmax to use for the attention. Supports both a fixed offset and ' @@ -1860,6 +1888,12 @@ def _add_regularization_args(parser): group.add_argument('--weight-decay-incr-style', type=str, default='constant', choices=['constant', 'linear', 'cosine'], help='Weight decay increment function.') + group.add_argument('--no-weight-decay-cond-type', type=str, choices=['qwen3_next'], + help='Type of no weight decay condition. Choices: ' + 'None (default): param no weight decay if and only if it is 1D; or it is bias; ' + 'or it is embedding and embedding_init_method_std is not None. ' + '"qwen3_next": In addition to the default rules, ' + 'apply weight decay to qk layernorm as a special case.') group.add_argument('--clip-grad', type=float, default=1.0, help='Gradient clipping based on global L2 norm.') group.add_argument('--adam-beta1', type=float, default=0.9, @@ -3028,7 +3062,7 @@ def _add_moe_args(parser): '- A string containing a Python list expression that defines a custom pattern, e.g.: ' '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] ' 'where 1 indicates an expert layer and 0 indicates a dense layer. ' - 'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 experts layers, ' + 'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 expert layers, ' '"([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.') group.add_argument('--moe-ffn-hidden-size', type=int, default=None, help='The hidden size of each expert\'s feed-forward network (ffn). ' @@ -3037,6 +3071,8 @@ def _add_moe_args(parser): help='Shared expert total ffn hidden size. ' 'It should be equal to "num_shared_experts * ffn_size_of_each_shared_expert" if there are multiple shared experts. ' 'None means no shared expert.') + group.add_argument('--moe-shared-expert-gate', action='store_true', + help='Enable gate for shared expert. Only effective when moe-shared-expert-intermediate-size is set.') group.add_argument('--moe-shared-expert-overlap', action='store_true', help='Enable overlapping between shared expert computations and dispatcher communications. ' 'Without this, the shared epxerts execute after the routed experts. ' @@ -3161,6 +3197,31 @@ def _add_mla_args(parser): return parser +def _add_linear_attention_args(parser): + group = parser.add_argument_group(title="la") + group.add_argument('--linear-attention-type', default=None, choices=['gated_delta_net', 'mamba'], type=str, + help='Type of linear attention to use. Currently support gated_delta_net and mamba.') + group.add_argument('--linear-attention-freq', type=la_freq_type, default=None, + help='Frequency between LA (linear attention) layers and' + ' SDPA (scaled dot-product attention) layers. Accepts either: ' + '- An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer ' + '- A string containing a Python list expression that defines a custom pattern, e.g.: ' + '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] ' + 'where 1 indicates an LA layer and 0 indicates a SDPA layer. ' + 'Examples: "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers, ' + '"([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice.') + group.add_argument('--linear-conv-kernel-dim', default=4, type=int, + help='Conv kernel dimension for the gated delta net.') + group.add_argument('--linear-key-head-dim', default=128, type=int, + help='Query and key head dimension for the gated delta net.') + group.add_argument('--linear-value-head-dim', default=128, type=int, + help='Value and gate head dimension for the gated delta net.') + group.add_argument('--linear-num-key-heads', default=16, type=int, + help='Number of query and key heads for the gated delta net.') + group.add_argument('--linear-num-value-heads', default=32, type=int, + help='Number of value and gate heads for the gated delta net.') + return parser + def _add_heterogeneous_args(parser): """ Heterogeneous models refer to transformer architectures where individual layers can differ diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index deff728aa23..e0dc794d38a 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -8,6 +8,7 @@ import shutil import sys import threading +import types from argparse import Namespace from enum import Enum, auto from logging import getLogger @@ -1424,18 +1425,27 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', ignore_rng_state = False ignore_rerun_state = True if ckpt_format == "torch_dist": + state_dict_args = ( + state_dict.get('args', types.SimpleNamespace()) + if state_dict is not None + else types.SimpleNamespace() + ) + if not hasattr(state_dict_args, 'tensor_model_parallel_size'): + print_rank_0('WARNING: does not find TP size in checkpoint args, using 1 as default.') + if not hasattr(state_dict_args, 'pipeline_model_parallel_size'): + print_rank_0('WARNING: does not find PP size in checkpoint args, using 1 as default.') ckpt_tp_pp = ( - state_dict['args'].tensor_model_parallel_size, - state_dict['args'].pipeline_model_parallel_size, + getattr(state_dict_args, 'tensor_model_parallel_size', 1), + getattr(state_dict_args, 'pipeline_model_parallel_size', 1), ) run_tp_pp = ( args.tensor_model_parallel_size, args.pipeline_model_parallel_size, ) - ckpt_world_size = getattr(state_dict['args'], 'world_size', 0) + ckpt_world_size = getattr(state_dict_args, 'world_size', 0) run_world_size = getattr(args, 'world_size', 0) - ckpt_dp = getattr(state_dict['args'], 'data_parallel_size', 0) + ckpt_dp = getattr(state_dict_args, 'data_parallel_size', 0) run_dp = getattr(args, 'data_parallel_size', 0) mismatch_msg = "(TP, PP) mismatch after resume ({} vs {} from checkpoint)".format( run_tp_pp, ckpt_tp_pp @@ -1443,7 +1453,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', # Determine if RNG state will be loaded if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng - and not getattr(state_dict['args'], 'no_save_rng', False)): + and not getattr(state_dict_args, 'no_save_rng', False)): gen_sd_rng_state = get_rng_state(args.ckpt_format) # we can load the rng state else: ignore_rng_state = True @@ -1458,7 +1468,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', print_rank_0(f'sharded_state_dict metadata loaded from the checkpoint: {sharded_sd_metadata}') # Determine if optimizer state will be loaded if (not release and not args.finetune and not args.no_load_optim - and not getattr(state_dict['args'], 'no_save_optim', False)): + and not getattr(state_dict_args, 'no_save_optim', False)): gen_sd_optim = optimizer gen_sd_opt_param_scheduler = opt_param_scheduler @@ -1469,7 +1479,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', # (for MCore v0.13+ checkpoints `sharded_sd_metadata is not None`) sharded_sd_metadata = { 'distrib_optim_sharding_type': ('fully_sharded_model_space' - if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) + if getattr(state_dict_args, 'ckpt_fully_parallel_save', False) else 'dp_zero_gather_scatter'), } if ( diff --git a/megatron/training/training.py b/megatron/training/training.py index bc5fefa86ba..3b354581760 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -237,9 +237,6 @@ def hybrid_flops(batch_size, seq_len, hidden_size, def transformer_flops(): """Calculate FLOPs for a standard Transformer model.""" # TODO(helenn/dnarayanan): Refactor this to reuse the helper methods. - # Attention projection size. - query_projection_size = args.kv_channels * args.num_attention_heads - query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size # Group Query Attention. if not args.group_query_attention: args.num_query_groups = args.num_attention_heads @@ -330,10 +327,9 @@ def transformer_flops(): + args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim) + 1 ) - self_attn_term = ( + standard_self_attn_term = ( 3 * 2 # fwd(1) + bwd(2) *FMA - * num_layers * ( ## q lora + rope + q norm q_term @@ -350,29 +346,98 @@ def transformer_flops(): ## core attn + args.seq_length * (args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim)) - / 2 + / 2 # causal mask (only half of the mask is non-zero) + args.seq_length * args.num_attention_heads * args.v_head_dim / 2 ) ) else: ## MHA or GQA - self_attn_term = ( - expansion_factor - * num_layers - * args.hidden_size - * args.hidden_size + query_projection_size = args.kv_channels * args.num_attention_heads + key_projection_size = args.kv_channels * args.num_query_groups + value_projection_size = args.kv_channels * args.num_query_groups + standard_self_attn_term = ( + 3 + * 2 # fwd(1) + bwd(2) *FMA * ( - ( - 1 - + (args.num_query_groups / args.num_attention_heads) - # # Only half of the attention matrix is non-zero and needs to be multiplied with V. - + (args.seq_length / args.hidden_size / 2) - ) - * query_projection_to_hidden_size_ratio + ## qkv proj + args.hidden_size + * (query_projection_size + key_projection_size + value_projection_size) + ## core attention + + query_projection_size + * args.seq_length + / 2 # causal mask (only half of the mask is non-zero) + * 2 # QK^T and (QK^T)V + ## out proj + + query_projection_size + * args.hidden_size ) ) + if args.linear_attention_type is not None: + # Calculate number of dense and MoE Transformer MLPs. + if isinstance(args.linear_attention_freq, int): + linear_attention_pattern = [ + # [1,1,...,1,0,1,1,...,1,0,...] + 0 if ((i + 1) % args.linear_attention_freq == 0) + else 1 for i in range(num_layers) + ] + elif isinstance(args.linear_attention_freq, list): + linear_attention_pattern = args.linear_attention_freq + assert len(linear_attention_pattern) == num_layers, ( + f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, " + f"expected {num_layers}, " + f"current linear attention pattern: {args.linear_attention_freq}" + ) + elif args.linear_attention_freq is None: + linear_attention_pattern = [1] * num_layers + else: + raise ValueError( + f"Invalid linear_attention_freq: {type(args.linear_attention_freq)}," + f" {args.linear_attention_freq}" + ) + num_linear_attention_layers = sum(linear_attention_pattern) + num_standard_attention_layers = num_layers - num_linear_attention_layers + + if args.linear_attention_type == "gated_delta_net": + # Calculate the FLOPs for the gated delta net attention. + qk_head_dim = args.linear_key_head_dim + v_head_dim = args.linear_value_head_dim + num_qk_heads = args.linear_num_key_heads + num_v_heads = args.linear_num_value_heads + qk_dim = qk_head_dim * num_qk_heads + v_dim = v_head_dim * num_v_heads + linear_self_attn_term = ( + 3 + * 2 # fwd(1) + bwd(2) *FMA + * ( + ## in proj + args.hidden_size + * (2 * qk_dim + 2 * v_dim + 2 * num_v_heads) + ## conv1d + + args.linear_conv_kernel_dim + * (2 * qk_dim + v_dim) + ## gated delta rule + + num_v_heads + * (v_head_dim ** 2) + * 4 # KK^T, VK^T, S(a(I-bKK^T)), and SQ + ## out proj + + args.hidden_size + * v_dim + ) + ) + else: + raise ValueError(f"Invalid linear_attention_type: {args.linear_attention_type}") + else: + num_linear_attention_layers = 0 + linear_self_attn_term = 0 + num_standard_attention_layers = num_layers + + self_attn_term = ( + linear_self_attn_term * num_linear_attention_layers + + standard_self_attn_term * num_standard_attention_layers + ) + total_floating_point_operations = ( batch_size * args.seq_length @@ -528,6 +593,30 @@ def reorder_inner_param_groups(optimizer_state_dict): return preprocessed_common_state_dict +def get_no_wd_decay_cond(no_wd_decay_cond_type, default_skip_embedding_weight_decay): + """Get the no weight decay condition function.""" + + # Default case: no_wd_decay_cond_type is None + no_wd_decay_cond_fn = None + + if no_wd_decay_cond_type == 'qwen3_next': + # Qwen3-Next applies weight decay to qk layernorm as a special case + def qwen3_next_no_wd_decay_cond(name, param): + if "q_layernorm" in name or "k_layernorm" in name: + no_wd = False + else: + no_wd = ( + name.endswith(".bias") + or len(param.shape) == 1 + or (default_skip_embedding_weight_decay and "embedding" in name) + ) + return no_wd + no_wd_decay_cond_fn = qwen3_next_no_wd_decay_cond + elif no_wd_decay_cond_type is not None: + raise ValueError(f"Invalid no_wd_decay_cond_type: {no_wd_decay_cond_type}") + + return no_wd_decay_cond_fn + def pretrain( train_valid_test_dataset_provider, model_provider, @@ -664,8 +753,15 @@ def pretrain( # Model, optimizer, and learning rate. timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + no_wd_decay_cond = get_no_wd_decay_cond( + args.no_weight_decay_cond_type, + default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, + ) model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, model_type, checkpointing_context=checkpointing_context + model_provider, + model_type, + checkpointing_context=checkpointing_context, + no_wd_decay_cond=no_wd_decay_cond, ) timers('model-and-optimizer-setup').stop() diff --git a/megatron/training/utils.py b/megatron/training/utils.py index cef71160791..ee46991bce5 100644 --- a/megatron/training/utils.py +++ b/megatron/training/utils.py @@ -38,6 +38,7 @@ from megatron.core.utils import ( get_batch_on_this_cp_rank, get_data_parallel_group_if_dtensor, + is_torch_min_version, to_local_if_dtensor, unwrap_model, ) @@ -271,6 +272,9 @@ def report_memory(name): string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() / mega_bytes) string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes) string += ' | max reserved: {}'.format(torch.cuda.max_memory_reserved() / mega_bytes) + if is_torch_min_version("2.6.0"): + # device usage is not supported in torch < 2.6.0 + string += ' | device usage: {}'.format(torch.cuda.device_memory_used() / mega_bytes) if mpu.get_data_parallel_rank() == 0: print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True) diff --git a/pyproject.toml b/pyproject.toml index 3362a0181c1..0a0fb9993f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ dev = [ "setuptools<80.0.0", "mamba-ssm~=2.2", "causal-conv1d~=1.5", + "flash-linear-attention~=0.3.2", "nv-grouped-gemm~=1.1", "transformer-engine[pytorch]>=2.6.0a0,<2.8.0", "nvidia-resiliency-ext>=0.4.0a0,<0.5.0", diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py new file mode 100644 index 00000000000..dbf8d203634 --- /dev/null +++ b/tests/unit_tests/ssm/test_gated_delta_net.py @@ -0,0 +1,319 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from functools import partial +from unittest import mock + +import pytest +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import ( + get_pos_emb_on_this_cp_rank as get_tensor_on_this_cp_rank, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.ssm.gated_delta_net import GatedDeltaNet +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.training.arguments import parse_args +from megatron.training.checkpointing import load_checkpoint, save_checkpoint +from megatron.training.global_vars import set_args +from megatron.training.training import get_model +from megatron.training.utils import unwrap_model +from tests.unit_tests.dist_checkpointing import ( + TempNamedDir, + init_basic_mock_args, + init_checkpointing_mock_args, +) +from tests.unit_tests.test_utilities import Utils + +try: + import fla + + HAVE_FLA = True +except ImportError: + HAVE_FLA = False + + +@pytest.mark.parametrize( + ("tp_size", "sp", "cp_size"), + [ + (1, False, 1), + (2, False, 1), + (2, True, 1), + # GDN does not support CP for now. Leave it for future work. + ], +) +@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.") +@pytest.mark.internal +class TestGatedDeltaNet: + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self, tp_size, sp, cp_size): + # Initialize parallel and random seed + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=1, + context_parallel_size=cp_size, + ) + model_parallel_cuda_manual_seed(123) + self.tp_size = tp_size + self.cp_size = cp_size + self.sp_size = tp_size if sp else 1 + + # Get TP and CP process groups from device mesh + tp_group = parallel_state.get_tensor_model_parallel_group() + cp_group = parallel_state.get_context_parallel_group() + pg_collection = ProcessGroupCollection(tp=tp_group, cp=cp_group) + + # Initialize model + self.transformer_config = TransformerConfig( + hidden_size=256, + linear_conv_kernel_dim=2, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=4, + linear_num_value_heads=8, + num_layers=1, + normalization="RMSNorm", + use_cpu_initialization=True, + layernorm_zero_centered_gamma=True, + num_attention_heads=8, + activation_func=F.silu, + bf16=True, + tensor_model_parallel_size=tp_size, + sequence_parallel=sp, + context_parallel_size=cp_size, + ) + gdn_submodules = get_gpt_layer_with_transformer_engine_spec( + linear_attention_type="gated_delta_net", normalization="RMSNorm" + ).submodules.self_attention.submodules + + self.gdn = GatedDeltaNet( + self.transformer_config, + submodules=gdn_submodules, + layer_number=1, + bias=False, + conv_bias=False, + conv_init=1.0, + use_qk_l2norm=True, + A_init_range=(1, 16), + pg_collection=pg_collection, + ) + self.gdn = self.gdn.cuda().bfloat16() + + def teardown_method(self): + Utils.destroy_model_parallel() + + def test_gpu_forward(self): + gdn = self.gdn + + micro_batch_size = 2 + seq_length = 64 + hidden_states = torch.ones( + (seq_length // self.sp_size // self.cp_size, micro_batch_size, gdn.config.hidden_size), + device=torch.cuda.current_device(), + dtype=torch.bfloat16, + ) + attention_mask = None + + output, bias = gdn(hidden_states, attention_mask) + + assert output.dim() == 3, f"Output too many dimensions ({output.shape=})" + assert output.shape[0] == seq_length // self.sp_size // self.cp_size, ( + f"Output shape {output.shape[0]=} mismatch with " + f" {seq_length=} // {self.sp_size=} // {self.cp_size=}." + ) + assert ( + output.shape[1] == micro_batch_size + ), f"Output shape {output.shape[1]=} mismatch with {micro_batch_size=}" + assert ( + output.shape[2] == gdn.config.hidden_size + ), f"Output shape {output.shape[2]=} mismatch with {gdn.config.hidden_size=}" + assert ( + output.dtype == hidden_states.dtype + ), f"Output dtype {output.dtype=} mismatch with {hidden_states.dtype=}" + + +@pytest.mark.parametrize( + ("tp", "sp", "cp"), + [ + (4, False, 1), # TP w/o SP + (4, True, 1), # TP w/ SP + # CP does not support GDN for now. Add it once it is supported. + ], +) +@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.") +def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp): + # Constants + seed = 123 + sequence_length = 256 + micro_batch_size = 4 + hidden_size = 128 + normalization = "RMSNorm" + + # Model initialization function + def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=None): + layer_spec = get_gpt_layer_with_transformer_engine_spec( + linear_attention_type="gated_delta_net", normalization=normalization + ) + gpt_model = GPTModel( + config=config, + transformer_layer_spec=layer_spec, + vocab_size=128, + max_sequence_length=sequence_length, + pre_process=pre_process, + post_process=post_process, + vp_stage=vp_stage, + ) + return gpt_model + + # Initialize baseline parallel state + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1 + ) + + # Initialize input hidden states + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + input_hidden_states = ( + torch.rand((sequence_length, micro_batch_size, hidden_size)) + .cuda() + .bfloat16() + .requires_grad_(True) + ) + + # Initialize transformer config + transformer_config = TransformerConfig( + hidden_size=128, + linear_conv_kernel_dim=2, + linear_key_head_dim=32, + linear_value_head_dim=32, + linear_num_key_heads=4, + linear_num_value_heads=8, + num_layers=1, + normalization=normalization, + use_cpu_initialization=True, + layernorm_zero_centered_gamma=True, + num_attention_heads=8, + activation_func=F.silu, + bf16=True, + ) + + with TempNamedDir(tmp_path_dist_ckpt / 'test_parallel_gdn', sync=True) as ckpt_dir: + # Set argument + mock_args = parse_args(ignore_unknown_args=True) + set_args(mock_args) + + # Initialize baseline model + init_basic_mock_args(mock_args, 1, 1, bf16=True) + mock_args.context_parallel_size = 1 + mock_args.sequence_parallel = 1 + gpt_model = unwrap_model( + get_model(partial(initialize_gpt_model, config=transformer_config)) + ) + + # Initialize args and save checkpoint + init_checkpointing_mock_args(mock_args, ckpt_dir, False) + mock_args.no_save_optim = True + mock_args.no_save_rng = True + mock_args.no_load_optim = True + mock_args.no_load_rng = True + save_checkpoint(10, gpt_model, None, None, 0) + + # Calculate baseline output + attention = gpt_model[0].decoder.layers[0].self_attention + output_hidden_states_baseline, bias_hidden_states_baseline = attention( + input_hidden_states, attention_mask=None + ) + output_hidden_states_baseline.sum().backward() + + # Save baseline output + input_grad_baseline = input_hidden_states.grad.detach() + output_hidden_states_baseline = output_hidden_states_baseline.detach() + + # Initialize parallel model + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp, pipeline_model_parallel_size=1, context_parallel_size=cp + ) + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + transformer_config.context_parallel_size = cp + transformer_config.tensor_model_parallel_size = tp + transformer_config.sequence_parallel = sp + init_basic_mock_args(mock_args, tp, 1, bf16=True) + mock_args.context_parallel_size = cp + mock_args.sequence_parallel = sp + gpt_model = unwrap_model( + get_model(partial(initialize_gpt_model, config=transformer_config)) + ) + with mock.patch('megatron.training.checkpointing.check_checkpoint_args'): + with mock.patch('megatron.training.checkpointing.update_num_microbatches'): + load_checkpoint(gpt_model, None, None) + + # Function to get tensor on this tp and cp rank + cp_group = parallel_state.get_context_parallel_group() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + def get_tensor_on_this_rank(tensor): + if cp > 1: + tensor = get_tensor_on_this_cp_rank(tensor, 0, cp_group) + if tp > 1 and sp: + sp_seg = sequence_length // tp // cp + tensor = tensor[tp_rank * sp_seg : (tp_rank + 1) * sp_seg] + return tensor + + # Calculate parallel model output + input_hidden_states = get_tensor_on_this_rank(input_hidden_states) + input_hidden_states = input_hidden_states.detach().requires_grad_(True) + parallel_attention = gpt_model[0].decoder.layers[0].self_attention + output_hidden_states_parallel, bias_hidden_states_parallel = parallel_attention( + input_hidden_states, attention_mask=None + ) + output_hidden_states_parallel.sum().backward() + input_grad_parallel = input_hidden_states.grad.detach() + + # Check if the output is the same + if cp: + atol, rtol = 5e-3, 5e-3 + else: + atol, rtol = 5e-4, 5e-4 + output_hidden_states_baseline = get_tensor_on_this_rank(output_hidden_states_baseline) + input_grad_baseline = get_tensor_on_this_rank(input_grad_baseline) + + assert torch.all( + ~torch.isnan(output_hidden_states_baseline) + ), "output_hidden_states_baseline contains nan" + assert torch.all( + ~torch.isinf(output_hidden_states_baseline) + ), "output_hidden_states_baseline contains inf" + assert torch.all(~torch.isnan(input_grad_baseline)), "input_grad_baseline contains nan" + assert torch.all(~torch.isinf(input_grad_baseline)), "input_grad_baseline contains inf" + assert torch.all( + ~torch.isnan(output_hidden_states_parallel) + ), "output_hidden_states_parallel contains nan" + assert torch.all( + ~torch.isinf(output_hidden_states_parallel) + ), "output_hidden_states_parallel contains inf" + assert torch.all(~torch.isnan(input_grad_parallel)), "input_grad_parallel contains nan" + assert torch.all(~torch.isinf(input_grad_parallel)), "input_grad_parallel contains inf" + + torch.testing.assert_close( + output_hidden_states_baseline, + output_hidden_states_parallel, + atol=atol, + rtol=rtol, + msg=lambda msg: f"Mismatch in output_hidden_states: {msg}", + ) + torch.testing.assert_close( + input_grad_baseline, + input_grad_parallel, + atol=atol, + rtol=rtol, + msg=lambda msg: f"Mismatch in input_grad: {msg}", + ) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_shared_experts.py b/tests/unit_tests/transformer/moe/test_shared_experts.py index f721c482937..6df4d2fd369 100644 --- a/tests/unit_tests/transformer/moe/test_shared_experts.py +++ b/tests/unit_tests/transformer/moe/test_shared_experts.py @@ -20,7 +20,8 @@ def teardown_method(self, method): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.internal - def test_gpu_forward(self): + @pytest.mark.parametrize("shared_expert_gate", [False, True]) + def test_gpu_forward(self, shared_expert_gate): Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) print("done intializing") @@ -38,6 +39,7 @@ def test_gpu_forward(self): moe_router_load_balancing_type="sinkhorn", moe_router_topk=1, add_bias_linear=False, + moe_shared_expert_gate=shared_expert_gate, ) transformer_layer_spec = get_gpt_layer_local_spec( num_experts=num_moe_experts, moe_grouped_gemm=False @@ -49,7 +51,10 @@ def test_gpu_forward(self): assert isinstance(self.moe_layer, MoELayer) num_weights = sum([p.numel() for p in self.moe_layer.parameters()]) - assert num_weights == 3480 + 1152 + if shared_expert_gate: + assert num_weights == 3480 + 1152 + 12 # 12 is the weight of the gate + else: + assert num_weights == 3480 + 1152 assert self.moe_layer.shared_experts is not None assert self.moe_layer.shared_experts.stream is None assert self.moe_layer.token_dispatcher.shared_experts is None diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py index 7e0e8c55807..419fc17ca0a 100644 --- a/tests/unit_tests/transformer/test_attention.py +++ b/tests/unit_tests/transformer/test_attention.py @@ -25,9 +25,11 @@ HAVE_FUSED_QKV_ROPE = False +@pytest.mark.parametrize("output_gate", [False, True]) class TestParallelAttention: - def setup_method(self, method): + @pytest.fixture(scope='function', autouse=True) + def setup_method(self, output_gate): Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) self.transformer_config = TransformerConfig( @@ -37,6 +39,7 @@ def setup_method(self, method): use_cpu_initialization=True, bf16=True, params_dtype=torch.bfloat16, + attention_output_gate=output_gate, ) self.parallel_attention = SelfAttention( self.transformer_config, @@ -44,7 +47,7 @@ def setup_method(self, method): layer_number=1, ) - def teardown_method(self, method): + def teardown_method(self): Utils.destroy_model_parallel() def test_constructor(self): @@ -52,7 +55,10 @@ def test_constructor(self): assert self.parallel_attention.layer_number == 1 num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) - assert num_weights == 66304 + if self.transformer_config.attention_output_gate: + assert num_weights == 82816 + else: + assert num_weights == 66304 def test_cpu_forward(self): # we can't currently do this because the global memory buffer is on GPU @@ -157,12 +163,15 @@ def test_checkpointed_gpu_forward(self): assert bias.shape[0] == config.hidden_size +@pytest.mark.parametrize("output_gate", [False, True]) class TestSelfAttention: - def setup_method(self, method): + @pytest.fixture(scope='function', autouse=True) + def setup_method(self, output_gate): + self.output_gate = output_gate Utils.destroy_model_parallel() - def teardown_method(self, method): + def teardown_method(self): Utils.destroy_model_parallel() def run_self_attention(self, pg_collection): @@ -171,6 +180,7 @@ def run_self_attention(self, pg_collection): num_layers=2, hidden_size=128, num_attention_heads=4, + attention_output_gate=self.output_gate, tensor_model_parallel_size=tensor_model_parallel_size, use_cpu_initialization=False, ) diff --git a/uv.lock b/uv.lock index 84da2bd685a..9634d2cbf88 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", @@ -631,7 +631,7 @@ name = "cffi" version = "2.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy'" }, + { name = "pycparser", marker = "(python_full_version < '3.12' and implementation_name != 'PyPy') or (python_full_version == '3.12.*' and implementation_name != 'PyPy' and extra == 'extra-13-megatron-core-dev') or (python_full_version == '3.12.*' and implementation_name != 'PyPy' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.13' and implementation_name != 'PyPy' and extra == 'extra-13-megatron-core-dev') or (implementation_name != 'PyPy' and platform_python_implementation != 'PyPy') or (implementation_name == 'PyPy' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } wheels = [ @@ -777,7 +777,7 @@ name = "click" version = "8.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } wheels = [ @@ -1080,6 +1080,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/24/f7351052cf9db771fe4f32fca47fd66e6d9b53d8613b17faf7d130a9d553/cython-3.1.4-py3-none-any.whl", hash = "sha256:d194d95e4fa029a3f6c7d46bdd16d973808c7ea4797586911fdb67cb98b1a2c6", size = 1227541, upload-time = "2025-09-16T07:20:29.595Z" }, ] +[[package]] +name = "datasets" +version = "4.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"], marker = "extra == 'extra-13-megatron-core-dev'" }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/a4/73f8e6ef52c535e1d20d5b2ca83bfe6de399d8b8b8a61ccc8d63d60735aa/datasets-4.1.1.tar.gz", hash = "sha256:7d8d5ba8b12861d2c44bfff9c83484ebfafff1ff553371e5901a8d3aab5450e2", size = 579324, upload-time = "2025-09-18T13:14:27.108Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/c8/09012ac195a0aab58755800d2efdc0e7d5905053509f12cb5d136c911cda/datasets-4.1.1-py3-none-any.whl", hash = "sha256:62e4f6899a36be9ec74a7e759a6951253cc85b3fcfa0a759b0efa8353b149dac", size = 503623, upload-time = "2025-09-18T13:14:25.111Z" }, +] + [[package]] name = "decorator" version = "5.2.1" @@ -1274,6 +1298,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, ] +[[package]] +name = "fla-core" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "torch", marker = "sys_platform == 'never'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/67/c6/10a1149b07e6bab45b2cb2d07f6b827716c2baf5f3404161753f25c6389b/fla_core-0.3.2.tar.gz", hash = "sha256:d38db16bc4e1c6fa8c04df442f246da1e6926a209426bc6ef703d41bfbc37c92", size = 296725, upload-time = "2025-09-10T07:43:40.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/74947b33c07682280e65adbdf17c4ee94b30232df2f728bafecf13d1d820/fla_core-0.3.2-py3-none-any.whl", hash = "sha256:e751d5a41e33eee721a6fb6588bd857f6f36e0d14719a23b1ebdbd617d307209", size = 413594, upload-time = "2025-09-10T07:43:37.786Z" }, +] + [[package]] name = "flake8" version = "7.1.0" @@ -1288,6 +1325,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/43/d5147aadaa52558e94e024811f2f9543b4bd7203b3a9659eeb5dff9c61b3/flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a", size = 57569, upload-time = "2024-06-15T21:37:05.342Z" }, ] +[[package]] +name = "flash-linear-attention" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "datasets" }, + { name = "fla-core" }, + { name = "pytest" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/84/f6/e62c1e562a288557eba7f06f168a7615813d1a227327b8beb8ba426da2c5/flash_linear_attention-0.3.2.tar.gz", hash = "sha256:9147747316c2951fed4ebeb4fa87977c05d807dc70c93b46250b68a6eb1183e2", size = 150880, upload-time = "2025-09-10T07:43:41.37Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/d0/35ce9eac5f52c72005095aaa12a393d2656ed7ffedf925b2381a6b76d10c/flash_linear_attention-0.3.2-py3-none-any.whl", hash = "sha256:604e73361437ba786420ab195e2caa3fd19280503761e703fa353c5ce5c65376", size = 274592, upload-time = "2025-09-10T07:43:39.107Z" }, +] + [[package]] name = "flash-mla" version = "1.0.0+9edee0c" @@ -1474,6 +1526,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/71/70db47e4f6ce3e5c37a607355f80da8860a33226be640226ac52cb05ef2e/fsspec-2025.9.0-py3-none-any.whl", hash = "sha256:530dc2a2af60a414a832059574df4a6e10cce927f6f4a78209390fe38955cfb7", size = 199289, upload-time = "2025-09-02T19:10:47.708Z" }, ] +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + [[package]] name = "gitdb" version = "4.0.12" @@ -1671,7 +1728,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pyyaml" }, { name = "requests" }, @@ -2176,6 +2233,7 @@ dev = [ { name = "av" }, { name = "causal-conv1d" }, { name = "einops" }, + { name = "flash-linear-attention" }, { name = "flashinfer-python" }, { name = "mamba-ssm" }, { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-dev'" }, @@ -2272,6 +2330,7 @@ requires-dist = [ { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" }, { name = "einops", marker = "extra == 'lts'" }, + { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" }, { name = "flashinfer-python", marker = "extra == 'dev'" }, { name = "flask-restful", marker = "extra == 'mlm'" }, { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" }, @@ -2659,6 +2718,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, ] +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980, upload-time = "2024-01-28T18:52:15.731Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", size = 134982, upload-time = "2024-01-28T18:52:17.783Z" }, + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628, upload-time = "2024-01-28T18:52:30.853Z" }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, +] + [[package]] name = "mypy-extensions" version = "1.1.0" @@ -3575,6 +3652,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/6c/64cafaceea3f99927e84b38a362ec6a8f24f33061c90bda77dfe1cd4c3c6/pulp-3.3.0-py3-none-any.whl", hash = "sha256:dd6ad2d63f196d1254eddf9dcff5cd224912c1f046120cb7c143c5b0eda63fae", size = 16387700, upload-time = "2025-09-18T08:14:53.368Z" }, ] +[[package]] +name = "pyarrow" +version = "21.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/d9/110de31880016e2afc52d8580b397dbe47615defbf09ca8cf55f56c62165/pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26", size = 31196837, upload-time = "2025-07-18T00:54:34.755Z" }, + { url = "https://files.pythonhosted.org/packages/df/5f/c1c1997613abf24fceb087e79432d24c19bc6f7259cab57c2c8e5e545fab/pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79", size = 32659470, upload-time = "2025-07-18T00:54:38.329Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ed/b1589a777816ee33ba123ba1e4f8f02243a844fed0deec97bde9fb21a5cf/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:7be45519b830f7c24b21d630a31d48bcebfd5d4d7f9d3bdb49da9cdf6d764edb", size = 41055619, upload-time = "2025-07-18T00:54:42.172Z" }, + { url = "https://files.pythonhosted.org/packages/44/28/b6672962639e85dc0ac36f71ab3a8f5f38e01b51343d7aa372a6b56fa3f3/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26bfd95f6bff443ceae63c65dc7e048670b7e98bc892210acba7e4995d3d4b51", size = 42733488, upload-time = "2025-07-18T00:54:47.132Z" }, + { url = "https://files.pythonhosted.org/packages/f8/cc/de02c3614874b9089c94eac093f90ca5dfa6d5afe45de3ba847fd950fdf1/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bd04ec08f7f8bd113c55868bd3fc442a9db67c27af098c5f814a3091e71cc61a", size = 43329159, upload-time = "2025-07-18T00:54:51.686Z" }, + { url = "https://files.pythonhosted.org/packages/a6/3e/99473332ac40278f196e105ce30b79ab8affab12f6194802f2593d6b0be2/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9b0b14b49ac10654332a805aedfc0147fb3469cbf8ea951b3d040dab12372594", size = 45050567, upload-time = "2025-07-18T00:54:56.679Z" }, + { url = "https://files.pythonhosted.org/packages/7b/f5/c372ef60593d713e8bfbb7e0c743501605f0ad00719146dc075faf11172b/pyarrow-21.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9d9f8bcb4c3be7738add259738abdeddc363de1b80e3310e04067aa1ca596634", size = 26217959, upload-time = "2025-07-18T00:55:00.482Z" }, + { url = "https://files.pythonhosted.org/packages/94/dc/80564a3071a57c20b7c32575e4a0120e8a330ef487c319b122942d665960/pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b", size = 31243234, upload-time = "2025-07-18T00:55:03.812Z" }, + { url = "https://files.pythonhosted.org/packages/ea/cc/3b51cb2db26fe535d14f74cab4c79b191ed9a8cd4cbba45e2379b5ca2746/pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10", size = 32714370, upload-time = "2025-07-18T00:55:07.495Z" }, + { url = "https://files.pythonhosted.org/packages/24/11/a4431f36d5ad7d83b87146f515c063e4d07ef0b7240876ddb885e6b44f2e/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e", size = 41135424, upload-time = "2025-07-18T00:55:11.461Z" }, + { url = "https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569", size = 42823810, upload-time = "2025-07-18T00:55:16.301Z" }, + { url = "https://files.pythonhosted.org/packages/2e/3b/89fced102448a9e3e0d4dded1f37fa3ce4700f02cdb8665457fcc8015f5b/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e", size = 43391538, upload-time = "2025-07-18T00:55:23.82Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bb/ea7f1bd08978d39debd3b23611c293f64a642557e8141c80635d501e6d53/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c", size = 45120056, upload-time = "2025-07-18T00:55:28.231Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6", size = 26220568, upload-time = "2025-07-18T00:55:32.122Z" }, + { url = "https://files.pythonhosted.org/packages/ca/d4/d4f817b21aacc30195cf6a46ba041dd1be827efa4a623cc8bf39a1c2a0c0/pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd", size = 31160305, upload-time = "2025-07-18T00:55:35.373Z" }, + { url = "https://files.pythonhosted.org/packages/a2/9c/dcd38ce6e4b4d9a19e1d36914cb8e2b1da4e6003dd075474c4cfcdfe0601/pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876", size = 32684264, upload-time = "2025-07-18T00:55:39.303Z" }, + { url = "https://files.pythonhosted.org/packages/4f/74/2a2d9f8d7a59b639523454bec12dba35ae3d0a07d8ab529dc0809f74b23c/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d", size = 41108099, upload-time = "2025-07-18T00:55:42.889Z" }, + { url = "https://files.pythonhosted.org/packages/ad/90/2660332eeb31303c13b653ea566a9918484b6e4d6b9d2d46879a33ab0622/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e", size = 42829529, upload-time = "2025-07-18T00:55:47.069Z" }, + { url = "https://files.pythonhosted.org/packages/33/27/1a93a25c92717f6aa0fca06eb4700860577d016cd3ae51aad0e0488ac899/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82", size = 43367883, upload-time = "2025-07-18T00:55:53.069Z" }, + { url = "https://files.pythonhosted.org/packages/05/d9/4d09d919f35d599bc05c6950095e358c3e15148ead26292dfca1fb659b0c/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623", size = 45133802, upload-time = "2025-07-18T00:55:57.714Z" }, + { url = "https://files.pythonhosted.org/packages/71/30/f3795b6e192c3ab881325ffe172e526499eb3780e306a15103a2764916a2/pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18", size = 26203175, upload-time = "2025-07-18T00:56:01.364Z" }, + { url = "https://files.pythonhosted.org/packages/16/ca/c7eaa8e62db8fb37ce942b1ea0c6d7abfe3786ca193957afa25e71b81b66/pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a", size = 31154306, upload-time = "2025-07-18T00:56:04.42Z" }, + { url = "https://files.pythonhosted.org/packages/ce/e8/e87d9e3b2489302b3a1aea709aaca4b781c5252fcb812a17ab6275a9a484/pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe", size = 32680622, upload-time = "2025-07-18T00:56:07.505Z" }, + { url = "https://files.pythonhosted.org/packages/84/52/79095d73a742aa0aba370c7942b1b655f598069489ab387fe47261a849e1/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd", size = 41104094, upload-time = "2025-07-18T00:56:10.994Z" }, + { url = "https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61", size = 42825576, upload-time = "2025-07-18T00:56:15.569Z" }, + { url = "https://files.pythonhosted.org/packages/b3/62/0f29de6e0a1e33518dec92c65be0351d32d7ca351e51ec5f4f837a9aab91/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d", size = 43368342, upload-time = "2025-07-18T00:56:19.531Z" }, + { url = "https://files.pythonhosted.org/packages/90/c7/0fa1f3f29cf75f339768cc698c8ad4ddd2481c1742e9741459911c9ac477/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99", size = 45131218, upload-time = "2025-07-18T00:56:23.347Z" }, + { url = "https://files.pythonhosted.org/packages/01/63/581f2076465e67b23bc5a37d4a2abff8362d389d29d8105832e82c9c811c/pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636", size = 26087551, upload-time = "2025-07-18T00:56:26.758Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ab/357d0d9648bb8241ee7348e564f2479d206ebe6e1c47ac5027c2e31ecd39/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da", size = 31290064, upload-time = "2025-07-18T00:56:30.214Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8a/5685d62a990e4cac2043fc76b4661bf38d06efed55cf45a334b455bd2759/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7", size = 32727837, upload-time = "2025-07-18T00:56:33.935Z" }, + { url = "https://files.pythonhosted.org/packages/fc/de/c0828ee09525c2bafefd3e736a248ebe764d07d0fd762d4f0929dbc516c9/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6", size = 41014158, upload-time = "2025-07-18T00:56:37.528Z" }, + { url = "https://files.pythonhosted.org/packages/6e/26/a2865c420c50b7a3748320b614f3484bfcde8347b2639b2b903b21ce6a72/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8", size = 42667885, upload-time = "2025-07-18T00:56:41.483Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, + { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, + { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, +] + [[package]] name = "pybind11" version = "3.0.1" @@ -5061,7 +5181,7 @@ name = "sympy" version = "1.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mpmath", marker = "sys_platform != 'linux'" }, + { name = "mpmath", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } wheels = [ @@ -5310,15 +5430,15 @@ name = "torch" version = "2.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "sys_platform != 'linux'" }, - { name = "fsspec", marker = "sys_platform != 'linux'" }, - { name = "jinja2", marker = "sys_platform != 'linux'" }, + { name = "filelock", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "fsspec", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "jinja2", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux'" }, - { name = "sympy", marker = "sys_platform != 'linux'" }, - { name = "triton", marker = "sys_platform == 'never'" }, - { name = "typing-extensions", marker = "sys_platform != 'linux'" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform != 'linux') or (python_full_version < '3.12' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "sympy", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "triton", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/63/28/110f7274254f1b8476c561dada127173f994afa2b1ffc044efb773c15650/torch-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:0be92c08b44009d4131d1ff7a8060d10bafdb7ddcb7359ef8d8c5169007ea905", size = 102052793, upload-time = "2025-08-06T14:53:15.852Z" }, @@ -5415,7 +5535,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } wheels = [ @@ -5490,7 +5610,7 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "sys_platform != 'linux'" }, + { name = "setuptools", marker = "sys_platform != 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/62/ee/0ee5f64a87eeda19bbad9bc54ae5ca5b98186ed00055281fd40fb4beb10e/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128", size = 155430069, upload-time = "2025-07-30T19:58:21.715Z" }, @@ -5961,6 +6081,79 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/03/75a399549e82b6a20ff84d71ee9e777caf6bc687e8004d8b3699565a6aad/xattr-1.2.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb669f01627962ce2bc556f19d421162247bc2cad0d4625d6ea5eb32af4cf29b", size = 17908, upload-time = "2025-07-14T03:15:32.335Z" }, ] +[[package]] +name = "xxhash" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload-time = "2024-08-17T09:20:38.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/8a/0e9feca390d512d293afd844d31670e25608c4a901e10202aa98785eab09/xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212", size = 31970, upload-time = "2024-08-17T09:17:35.675Z" }, + { url = "https://files.pythonhosted.org/packages/16/e6/be5aa49580cd064a18200ab78e29b88b1127e1a8c7955eb8ecf81f2626eb/xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520", size = 30801, upload-time = "2024-08-17T09:17:37.353Z" }, + { url = "https://files.pythonhosted.org/packages/20/ee/b8a99ebbc6d1113b3a3f09e747fa318c3cde5b04bd9c197688fadf0eeae8/xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680", size = 220927, upload-time = "2024-08-17T09:17:38.835Z" }, + { url = "https://files.pythonhosted.org/packages/58/62/15d10582ef159283a5c2b47f6d799fc3303fe3911d5bb0bcc820e1ef7ff4/xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da", size = 200360, upload-time = "2024-08-17T09:17:40.851Z" }, + { url = "https://files.pythonhosted.org/packages/23/41/61202663ea9b1bd8e53673b8ec9e2619989353dba8cfb68e59a9cbd9ffe3/xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23", size = 428528, upload-time = "2024-08-17T09:17:42.545Z" }, + { url = "https://files.pythonhosted.org/packages/f2/07/d9a3059f702dec5b3b703737afb6dda32f304f6e9da181a229dafd052c29/xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196", size = 194149, upload-time = "2024-08-17T09:17:44.361Z" }, + { url = "https://files.pythonhosted.org/packages/eb/58/27caadf78226ecf1d62dbd0c01d152ed381c14c1ee4ad01f0d460fc40eac/xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c", size = 207703, upload-time = "2024-08-17T09:17:46.656Z" }, + { url = "https://files.pythonhosted.org/packages/b1/08/32d558ce23e1e068453c39aed7b3c1cdc690c177873ec0ca3a90d5808765/xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482", size = 216255, upload-time = "2024-08-17T09:17:48.031Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d4/2b971e2d2b0a61045f842b622ef11e94096cf1f12cd448b6fd426e80e0e2/xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296", size = 202744, upload-time = "2024-08-17T09:17:50.045Z" }, + { url = "https://files.pythonhosted.org/packages/19/ae/6a6438864a8c4c39915d7b65effd85392ebe22710412902487e51769146d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415", size = 210115, upload-time = "2024-08-17T09:17:51.834Z" }, + { url = "https://files.pythonhosted.org/packages/48/7d/b3c27c27d1fc868094d02fe4498ccce8cec9fcc591825c01d6bcb0b4fc49/xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198", size = 414247, upload-time = "2024-08-17T09:17:53.094Z" }, + { url = "https://files.pythonhosted.org/packages/a1/05/918f9e7d2fbbd334b829997045d341d6239b563c44e683b9a7ef8fe50f5d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442", size = 191419, upload-time = "2024-08-17T09:17:54.906Z" }, + { url = "https://files.pythonhosted.org/packages/08/29/dfe393805b2f86bfc47c290b275f0b7c189dc2f4e136fd4754f32eb18a8d/xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da", size = 30114, upload-time = "2024-08-17T09:17:56.566Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d7/aa0b22c4ebb7c3ccb993d4c565132abc641cd11164f8952d89eb6a501909/xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9", size = 30003, upload-time = "2024-08-17T09:17:57.596Z" }, + { url = "https://files.pythonhosted.org/packages/69/12/f969b81541ee91b55f1ce469d7ab55079593c80d04fd01691b550e535000/xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6", size = 26773, upload-time = "2024-08-17T09:17:59.169Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload-time = "2024-08-17T09:18:00.852Z" }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload-time = "2024-08-17T09:18:01.863Z" }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload-time = "2024-08-17T09:18:03.461Z" }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload-time = "2024-08-17T09:18:05.616Z" }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload-time = "2024-08-17T09:18:06.957Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload-time = "2024-08-17T09:18:08.331Z" }, + { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload-time = "2024-08-17T09:18:10.332Z" }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload-time = "2024-08-17T09:18:11.707Z" }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload-time = "2024-08-17T09:18:13.799Z" }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload-time = "2024-08-17T09:18:15.824Z" }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, upload-time = "2024-08-17T09:18:17.142Z" }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload-time = "2024-08-17T09:18:18.779Z" }, + { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload-time = "2024-08-17T09:18:20.009Z" }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload-time = "2024-08-17T09:18:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload-time = "2024-08-17T09:18:22.809Z" }, + { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969, upload-time = "2024-08-17T09:18:24.025Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787, upload-time = "2024-08-17T09:18:25.318Z" }, + { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959, upload-time = "2024-08-17T09:18:26.518Z" }, + { url = "https://files.pythonhosted.org/packages/fe/86/51258d3e8a8545ff26468c977101964c14d56a8a37f5835bc0082426c672/xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793", size = 200006, upload-time = "2024-08-17T09:18:27.905Z" }, + { url = "https://files.pythonhosted.org/packages/02/0a/96973bd325412feccf23cf3680fd2246aebf4b789122f938d5557c54a6b2/xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be", size = 428326, upload-time = "2024-08-17T09:18:29.335Z" }, + { url = "https://files.pythonhosted.org/packages/11/a7/81dba5010f7e733de88af9555725146fc133be97ce36533867f4c7e75066/xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6", size = 194380, upload-time = "2024-08-17T09:18:30.706Z" }, + { url = "https://files.pythonhosted.org/packages/fb/7d/f29006ab398a173f4501c0e4977ba288f1c621d878ec217b4ff516810c04/xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90", size = 207934, upload-time = "2024-08-17T09:18:32.133Z" }, + { url = "https://files.pythonhosted.org/packages/8a/6e/6e88b8f24612510e73d4d70d9b0c7dff62a2e78451b9f0d042a5462c8d03/xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27", size = 216301, upload-time = "2024-08-17T09:18:33.474Z" }, + { url = "https://files.pythonhosted.org/packages/af/51/7862f4fa4b75a25c3b4163c8a873f070532fe5f2d3f9b3fc869c8337a398/xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2", size = 203351, upload-time = "2024-08-17T09:18:34.889Z" }, + { url = "https://files.pythonhosted.org/packages/22/61/8d6a40f288f791cf79ed5bb113159abf0c81d6efb86e734334f698eb4c59/xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d", size = 210294, upload-time = "2024-08-17T09:18:36.355Z" }, + { url = "https://files.pythonhosted.org/packages/17/02/215c4698955762d45a8158117190261b2dbefe9ae7e5b906768c09d8bc74/xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab", size = 414674, upload-time = "2024-08-17T09:18:38.536Z" }, + { url = "https://files.pythonhosted.org/packages/31/5c/b7a8db8a3237cff3d535261325d95de509f6a8ae439a5a7a4ffcff478189/xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e", size = 192022, upload-time = "2024-08-17T09:18:40.138Z" }, + { url = "https://files.pythonhosted.org/packages/78/e3/dd76659b2811b3fd06892a8beb850e1996b63e9235af5a86ea348f053e9e/xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8", size = 30170, upload-time = "2024-08-17T09:18:42.163Z" }, + { url = "https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e", size = 30040, upload-time = "2024-08-17T09:18:43.699Z" }, + { url = "https://files.pythonhosted.org/packages/0f/eb/04405305f290173acc0350eba6d2f1a794b57925df0398861a20fbafa415/xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2", size = 26796, upload-time = "2024-08-17T09:18:45.29Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b8/e4b3ad92d249be5c83fa72916c9091b0965cb0faeff05d9a0a3870ae6bff/xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6", size = 31795, upload-time = "2024-08-17T09:18:46.813Z" }, + { url = "https://files.pythonhosted.org/packages/fc/d8/b3627a0aebfbfa4c12a41e22af3742cf08c8ea84f5cc3367b5de2d039cce/xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5", size = 30792, upload-time = "2024-08-17T09:18:47.862Z" }, + { url = "https://files.pythonhosted.org/packages/c3/cc/762312960691da989c7cd0545cb120ba2a4148741c6ba458aa723c00a3f8/xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc", size = 220950, upload-time = "2024-08-17T09:18:49.06Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e9/cc266f1042c3c13750e86a535496b58beb12bf8c50a915c336136f6168dc/xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3", size = 199980, upload-time = "2024-08-17T09:18:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/bf/85/a836cd0dc5cc20376de26b346858d0ac9656f8f730998ca4324921a010b9/xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c", size = 428324, upload-time = "2024-08-17T09:18:51.988Z" }, + { url = "https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb", size = 194370, upload-time = "2024-08-17T09:18:54.164Z" }, + { url = "https://files.pythonhosted.org/packages/87/a1/b028bb02636dfdc190da01951d0703b3d904301ed0ef6094d948983bef0e/xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f", size = 207911, upload-time = "2024-08-17T09:18:55.509Z" }, + { url = "https://files.pythonhosted.org/packages/80/d5/73c73b03fc0ac73dacf069fdf6036c9abad82de0a47549e9912c955ab449/xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7", size = 216352, upload-time = "2024-08-17T09:18:57.073Z" }, + { url = "https://files.pythonhosted.org/packages/b6/2a/5043dba5ddbe35b4fe6ea0a111280ad9c3d4ba477dd0f2d1fe1129bda9d0/xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326", size = 203410, upload-time = "2024-08-17T09:18:58.54Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b2/9a8ded888b7b190aed75b484eb5c853ddd48aa2896e7b59bbfbce442f0a1/xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf", size = 210322, upload-time = "2024-08-17T09:18:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/98/62/440083fafbc917bf3e4b67c2ade621920dd905517e85631c10aac955c1d2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7", size = 414725, upload-time = "2024-08-17T09:19:01.332Z" }, + { url = "https://files.pythonhosted.org/packages/75/db/009206f7076ad60a517e016bb0058381d96a007ce3f79fa91d3010f49cc2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c", size = 192070, upload-time = "2024-08-17T09:19:03.007Z" }, + { url = "https://files.pythonhosted.org/packages/1f/6d/c61e0668943a034abc3a569cdc5aeae37d686d9da7e39cf2ed621d533e36/xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637", size = 30172, upload-time = "2024-08-17T09:19:04.355Z" }, + { url = "https://files.pythonhosted.org/packages/96/14/8416dce965f35e3d24722cdf79361ae154fa23e2ab730e5323aa98d7919e/xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43", size = 30041, upload-time = "2024-08-17T09:19:05.435Z" }, + { url = "https://files.pythonhosted.org/packages/27/ee/518b72faa2073f5aa8e3262408d284892cb79cf2754ba0c3a5870645ef73/xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b", size = 26801, upload-time = "2024-08-17T09:19:06.547Z" }, + { url = "https://files.pythonhosted.org/packages/ab/9a/233606bada5bd6f50b2b72c45de3d9868ad551e83893d2ac86dc7bb8553a/xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c", size = 29732, upload-time = "2024-08-17T09:20:11.175Z" }, + { url = "https://files.pythonhosted.org/packages/0c/67/f75276ca39e2c6604e3bee6c84e9db8a56a4973fde9bf35989787cf6e8aa/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986", size = 36214, upload-time = "2024-08-17T09:20:12.335Z" }, + { url = "https://files.pythonhosted.org/packages/0f/f8/f6c61fd794229cc3848d144f73754a0c107854372d7261419dcbbd286299/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6", size = 32020, upload-time = "2024-08-17T09:20:13.537Z" }, + { url = "https://files.pythonhosted.org/packages/79/d3/c029c99801526f859e6b38d34ab87c08993bf3dcea34b11275775001638a/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b", size = 40515, upload-time = "2024-08-17T09:20:14.669Z" }, + { url = "https://files.pythonhosted.org/packages/62/e3/bef7b82c1997579c94de9ac5ea7626d01ae5858aa22bf4fcb38bf220cb3e/xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da", size = 30064, upload-time = "2024-08-17T09:20:15.925Z" }, +] + [[package]] name = "yarl" version = "1.21.0" From c7dee4ba612e3989d9b5a1ed1fb0a1487e00a24c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 14 Oct 2025 09:21:48 +0000 Subject: [PATCH 015/248] !4236 - [Dev] Formatting dev branch code to avoid linting pipeline failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../core/optimizer/layer_wise_optimizer.py | 42 +++++++++++-------- megatron/core/ssm/gated_delta_net.py | 6 +-- megatron/core/transformer/attention.py | 13 +++--- .../core/transformer/moe/token_dispatcher.py | 6 ++- megatron/core/transformer/spec_utils.py | 9 +++- 5 files changed, 47 insertions(+), 29 deletions(-) diff --git a/megatron/core/optimizer/layer_wise_optimizer.py b/megatron/core/optimizer/layer_wise_optimizer.py index 6c77be48e30..2bf4e5e613b 100644 --- a/megatron/core/optimizer/layer_wise_optimizer.py +++ b/megatron/core/optimizer/layer_wise_optimizer.py @@ -1,13 +1,13 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import List, Optional import torch from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.utils import get_pg_rank, get_pg_size -from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 +from .clip_grads import count_zeros_fp32, get_grad_norm_fp32 from .optimizer import ChainedOptimizer, Float16OptimizerWithFloat16Params, MegatronOptimizer from .optimizer_config import OptimizerConfig @@ -15,16 +15,19 @@ class LayerWiseDistributedOptimizer(ChainedOptimizer): """Layer-wise distributed optimizer for Megatron-core models. - This is a experimental distributed optimizer wrapper that distributes weight to DP ranks by full layer. - Implemented as ChainedOptimizer to support different weights use different optimizers (e.g. muon+adam) - When using, keep all megatron distributed optimizer related options OFF. + This is a experimental distributed optimizer wrapper that distributes weight to DP ranks + by full layer. Implemented as ChainedOptimizer to support different weights use different + optimizers (e.g. muon+adam). When using, keep all megatron distributed optimizer related + options OFF. How LayerWiseDistributedOptimizer work: 1. weights are splited into lists and each rank only keep its shard in its optimizer - 2. Megatron DDP handle allreduce grad for all params, note that each rank have full model and grad + 2. Megatron DDP handle allreduce grad for all params, note that each rank have full model + and grad. 3. optimizer is already modified so only param belong to this DP rank is updated 3. grad_norm and zero counting will reduce metrics globally in step function - 4. Do regular update with chained optimizers, optimizer is already modified so partial update happens + 4. Do regular update with chained optimizers, optimizer is already modified so partial update + happens. 5. allgather updated params to every rank(currently through broadcast loop) """ @@ -37,7 +40,8 @@ def __init__( self.pg_collection = pg_collection self.shard_params(optimizers) # wrap optimizer after sharding to avoid unnecessary master weight creation - # TODO(deyuf): check if underlying optimizer.config need to fixed and if so can use that instead of passing + # TODO(deyuf): check if underlying optimizer.config need to fixed and if so can use + # that instead of passing if config.bf16: if isinstance(optimizers[0], Float16OptimizerWithFloat16Params): raise TypeError('LayerWiseDistributedOptimizer received Float16 optimizer already.') @@ -47,17 +51,20 @@ def __init__( super().__init__(optimizers) # TODO(kunlun, deyuf): potential future perf optimization - # since allreduce is unchanged and handled by megatron DDP, they're already in contiguous gbuf - # so instead of shard param by layer randomly, we can still shard by buf range but keep some "extras" - # to keep boundary weight not sharded. This way each rank do some duplicated work but we can call - # single allgather later and all current distopt optimization can be applied + # since allreduce is unchanged and handled by megatron DDP, they're already in contiguous + # gbuf, so instead of shard param by layer randomly, we can still shard by buf range but + # keep some "extras" to keep boundary weight not sharded. This way each rank do some + # duplicated work but we can call single allgather later and all current distopt + # optimization can be applied. def shard_params(self, optimizers): """Shard all params into lists by rank.""" - # We'll optimize sharding later if there is perf issue. should be ok since linear are grouped already - # Key is to create separate sharding for dp/expt parallel, saved in dp_cp_params_list, expt_dp_params_list - # example of 4 dp rank and 10 non-expert parameters p0-p9, then dp_cp_params_list will look like - # [[p0, p4, p8], [p1, p5, p9], [p2, p6], [p3, p7]] + # We'll optimize sharding later if there is perf issue. should be ok since linear are + # grouped already. + # Key is to create separate sharding for dp/expt parallel, saved in dp_cp_params_list, + # expt_dp_params_list. + # Example of 4 dp rank and 10 non-expert parameters p0-p9, then dp_cp_params_list will + # look like: [[p0, p4, p8], [p1, p5, p9], [p2, p6], [p3, p7]] # simplify when dp_cp group size is 1 if get_pg_size(self.pg_collection.dp_cp) == 1: @@ -70,7 +77,8 @@ def shard_params(self, optimizers): expt_dp_size = get_pg_size(self.pg_collection.expt_dp) self.dp_cp_params_list = [[] for _ in range(dp_cp_size)] self.expt_dp_params_list = [[] for _ in range(expt_dp_size)] - # get all param groups, this is called before init so cannot rely on Chained optimizer method + # get all param groups, this is called before init so cannot rely on + # Chained optimizer method param_groups = [] for optimizer in optimizers: param_groups += optimizer.param_groups diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py index 45588341a39..e12dfd68062 100644 --- a/megatron/core/ssm/gated_delta_net.py +++ b/megatron/core/ssm/gated_delta_net.py @@ -36,20 +36,18 @@ try: from fla.modules.l2norm import l2norm - from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule + from fla.ops.gated_delta_rule import chunk_gated_delta_rule HAVE_FLA = True except ImportError: chunk_gated_delta_rule = None - fused_recurrent_gated_delta_rule = None HAVE_FLA = False try: - from causal_conv1d import causal_conv1d_fn, causal_conv1d_update + from causal_conv1d import causal_conv1d_fn except ImportError: causal_conv1d_fn = None - causal_conv1d_update = None logger = logging.getLogger(__name__) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 518d82a0332..870b8ad1c40 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -434,7 +434,9 @@ def _adjust_key_value_for_inference( return query, key, value, rotary_pos_emb, attn_mask_type, block_table @abstractmethod - def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate, split_qkv=True): + def get_query_key_value_tensors( + self, hidden_states, key_value_states, output_gate, split_qkv=True + ): """ This method needs to be implemented based on whether the derived class is "self-attn" or "cross-attn". @@ -1083,10 +1085,7 @@ def _compare(srcs, tgts, names, parallelism): ) def get_query_key_value_tensors( - self, hidden_states, - key_value_states=None, - output_gate=False, - split_qkv=True + self, hidden_states, key_value_states=None, output_gate=False, split_qkv=True ): """ Derives `query`, `key`, `value` tensors from `hidden_states`. @@ -1226,7 +1225,9 @@ def __init__( is_expert=False, ) - def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=False, split_qkv=True): + def get_query_key_value_tensors( + self, hidden_states, key_value_states, output_gate=False, split_qkv=True + ): """ Derives `query` tensor from `hidden_states`, and `key`/`value` tensors from `key_value_states`. diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 82fb7b00583..ec64d1887a1 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -33,6 +33,8 @@ from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.transformer_config import TransformerConfig +logger = logging.getLogger(__name__) + """ We use the following notation throughout this file: H: hidden size B: micro batch size @@ -989,7 +991,9 @@ def dispatch( # DeepEP only supports float32 probs if self.token_probs.dtype != torch.float32: if self.token_probs.dtype in [torch.bfloat16, torch.float16]: - print("DeepEP only supports float32 probs, please set --moe-router-dtype=fp32") + logger.info( + "DeepEP only supports float32 probs, please set --moe-router-dtype=fp32" + ) self.token_probs = self.token_probs.float() # downcast or upcast hidden_states, dispatched_indices, dispatched_probs, num_tokens_per_expert, handle = ( fused_dispatch( diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py index 897d88d2aa3..24df1add0eb 100644 --- a/megatron/core/transformer/spec_utils.py +++ b/megatron/core/transformer/spec_utils.py @@ -1,9 +1,12 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import logging import types from dataclasses import dataclass, field from typing import Tuple, Union +logger = logging.getLogger(__name__) + @dataclass class ModuleSpec: @@ -38,12 +41,15 @@ def import_module(module_path: Tuple[str]): try: module = __import__(base_path, globals(), locals(), [name]) except ImportError as e: - print(f"couldn't import module due to {e}") + logger.error(f"couldn't import module due to {e}") return None return vars(module)[name] def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs): + """Retrieve the module class or function specified by a ModuleSpec or + return it as is if already provided. + """ # If a module clas is already provided return it as is if isinstance(spec_or_module, (type, types.FunctionType)): return spec_or_module @@ -57,6 +63,7 @@ def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs): def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs): + """Build a module from a ModuleSpec or return it as is if already provided.""" # If the passed `spec_or_module` is # a `Function`, then return it as it is # NOTE: to support an already initialized module add the following condition From 4c3a1be68cfac256e31a230722fbce439b66aa32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 14 Oct 2025 09:23:13 +0000 Subject: [PATCH 016/248] !4211 - ci(fix): Cherrypicking from forks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/00.pre.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index c91ffc80995..c912d5297d2 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -176,14 +176,16 @@ pre:maybe_cherry_pick_to_main: TITLE=$(echo -E $MR | jq '.title' | tr -d '"') MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" + git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_PATH.git" + git remote add mr-origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH.git" + git config --global user.email "mcore-bot@nvidia.com" git config --global user.name "Mcore Bot" git fetch origin dev - git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME + git fetch mr-origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME - START_COMMIT=$(git merge-base origin/dev origin/$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME) + START_COMMIT=$(git merge-base origin/dev mr-origin/$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME) END_COMMIT=$(git rev-parse HEAD) git fetch origin main From 7c350f5af0a13ef9ee01da4a5fb3e7376956972d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 14 Oct 2025 09:23:33 +0000 Subject: [PATCH 017/248] !4239 - ci: Check out dev for formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- tools/autoformat.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 71f49f55055..513fe430c21 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -289,7 +289,7 @@ test:linting_formatting: - git fetch origin main:main - | if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then - bash tools/autoformat.sh + BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" bash tools/autoformat.sh set -e git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME diff --git a/tools/autoformat.sh b/tools/autoformat.sh index 6c3e76b3eaa..85d1d19c7cb 100755 --- a/tools/autoformat.sh +++ b/tools/autoformat.sh @@ -15,6 +15,8 @@ CHECK_ONLY=${CHECK_ONLY:-false} SKIP_DOCS=${SKIP_DOCS:-false} BASE_REF=${BASE_REF:-main} +git remote set-url origin "https://${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" +git fetch origin ${BASE_REF} CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) ADDITIONAL_ARGS="" ADDITIONAL_BLACK_ARGS="" From 46687cdd8586aaa561d169a843db7848edf7e86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 14 Oct 2025 17:56:25 +0000 Subject: [PATCH 018/248] ci: Fix formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 513fe430c21..34418612b92 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -287,6 +287,8 @@ test:linting_formatting: fi - set +e - git fetch origin main:main + - echo -e "machine gitlab-master.nvidia.com\n login gitlab-ci-token\n password $CI_JOB_TOKEN" >~/.netrc + - chmod 600 ~/.netrc" - | if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" bash tools/autoformat.sh From 50ed5eb1021a65b6de7b636ae84acd176e8319a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 14 Oct 2025 18:15:02 +0000 Subject: [PATCH 019/248] ci: Fix linting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 34418612b92..358ad740e01 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -288,7 +288,7 @@ test:linting_formatting: - set +e - git fetch origin main:main - echo -e "machine gitlab-master.nvidia.com\n login gitlab-ci-token\n password $CI_JOB_TOKEN" >~/.netrc - - chmod 600 ~/.netrc" + - chmod 600 ~/.netrc - | if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" bash tools/autoformat.sh From b01ad5b0361082e447dbe7b9b9764dc3411e059a Mon Sep 17 00:00:00 2001 From: Li Tao Date: Tue, 14 Oct 2025 11:15:08 -0700 Subject: [PATCH 020/248] ADLR/megatron-lm!4225 - [Dev][NVFP4][MOE] Proper NVFP4 Zero Padding for MOE Co-authored-by: Zhongbo Zhu --- megatron/core/fp4_utils.py | 14 ++++++ megatron/core/transformer/moe/README.md | 4 +- megatron/core/transformer/moe/experts.py | 47 ++++++++++++------- .../core/transformer/moe/token_dispatcher.py | 27 +++++++++-- .../core/transformer/transformer_config.py | 26 +++++++--- megatron/training/arguments.py | 11 +++-- .../transformer/moe/test_token_dispatcher.py | 12 ++--- 7 files changed, 100 insertions(+), 41 deletions(-) diff --git a/megatron/core/fp4_utils.py b/megatron/core/fp4_utils.py index eae4bf91de6..eb02a4796b0 100644 --- a/megatron/core/fp4_utils.py +++ b/megatron/core/fp4_utils.py @@ -47,6 +47,20 @@ def is_nvfp4tensor(tensor: torch.Tensor) -> bool: return HAVE_TE_FP4_TENSOR_CLASS and isinstance(tensor, FP4_TENSOR_CLASS) +def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int: + """ + Get the alignment size required for FP4 GEMM. + FP4 GEMM requires Blackwell and later architectures. + + The value 32 is a hardware requirement: TMA (Tensor Memory Accelerator) requires + a 16-byte aligned address for efficient memory access. Since FP4 uses 4 bits per value, + 16 bytes (128 bits) corresponds to 32 FP4 values. Therefore, the alignment size for FP4 + is 32. With this alignment, NVFP4 GEMM can be performed efficiently. + """ + # pylint: disable=unused-argument + return 32 + + def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor: """Dequantize a fp4 tensor to a higher precision tensor.""" if is_te_min_version("2.7.0.dev0"): diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index c7c22201404..56be6fc2463 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -235,7 +235,7 @@ Enable A2A overlap across different batches inspired by the DSv3 DualPipe implme | --moe-router-fusion | Enable fusion for MoE TopK routing and aux-loss computation. This is only supported in TransformerEngine 2.7.0 and above. | | --moe-router-bias-update-rate | The expert bias is updated based on the number of assigned tokens to each expert in a global batch, where the bias is increased for experts with less assigned tokens and decreased for experts with more assigned tokens. Default is 1e-3 same as that used in DeepSeekV3. | | --moe-router-force-load-balancing | (Experimental) Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only! | -| --moe-router-padding-for-fp8 | Pad the routing_map to make sure the number of tokens each expert received is a multiple of 16/32 for FP8 precision. It is suggested to enable this for dropless training with FP8 precision when num_local_experts > 1. This is a more efficient way to pad for FP8 which eliminates the explicit padding in the GroupedMLP layer. | +| --moe-router-padding-for-quantization | Pad the routing_map to make sure the number of tokens each expert received is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for dropless training with FP8 precision when num_local_experts > 1. This is a more efficient way to pad for FP8 which eliminates the explicit padding in the GroupedMLP layer. | | --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. | | --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. | | --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. | @@ -464,7 +464,7 @@ Therefore, there are two recommended ways during the first 200 steps to avoid th **FP8 Training Best Practice** - Using latest version of [TransformerEngine](https://github.com/NVIDIA/TransformerEngine). -- Enable router padding with `--moe-router-padding-for-fp8` to reduce padding overhead. +- Enable router padding with `--moe-router-padding-for-quantization` to reduce padding overhead. - Enable native FP8 weights with `--fp8-param-gather` to reduce weights memory cost. ### Reference Best Parallel Mapping diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index d8dd3d03f02..e73864a50fa 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -21,6 +21,7 @@ ShardedTensorFactory, ) from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.fp4_utils import get_fp4_align_size from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.fusions.fused_bias_geglu import quick_gelu, weighted_bias_quick_geglu_impl from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl @@ -134,8 +135,10 @@ def glu(x): self.config.recompute_granularity == 'selective' and "moe_act" in self.config.recompute_modules ) - if self.activation_recompute and self.config.fp8: - raise ValueError("moe_act recompute for fp8 cannot work with the legacy GroupedMLP.") + if self.activation_recompute and (self.config.fp8 or self.config.fp4): + raise ValueError( + "moe_act recompute for fp8 or fp4 cannot work with the legacy GroupedMLP." + ) @jit_fuser def activation_func_with_probs(x, probs): @@ -809,15 +812,15 @@ def __init__( self.config.recompute_granularity == 'selective' and "moe_act" in self.config.recompute_modules ) - if self.activation_recompute and self.config.fp8: + if self.activation_recompute and (self.config.fp8 or self.config.fp4): from megatron.core.extensions.transformer_engine import set_save_original_input set_save_original_input(self.linear_fc2) - if self.config.fp8: - assert HAVE_TE, "FP8 requires TE." - self.fp8_padding = Fp8Padding(self.num_local_experts) - self.fp8_unpadding = Fp8Unpadding(self.num_local_experts) + if self.config.fp8 or self.config.fp4: + assert HAVE_TE, "FP8 and FP4 requires TE." + self.quantization_padding = Fp8Padding(self.num_local_experts) + self.quantization_unpadding = Fp8Unpadding(self.num_local_experts) @staticmethod def _apply_bias(intermediate_parallel, bias_parallel, tokens_per_expert, permuted_probs): @@ -857,12 +860,12 @@ def forward( output (torch.Tensor): The output of the local experts. """ tokens_per_expert = tokens_per_expert.tolist() - if self.config.fp8: + if self.config.fp8 or self.config.fp4: actual_tokens_per_expert = tokens_per_expert - permuted_local_hidden_states, tokens_per_expert = self.fp8_padding( + permuted_local_hidden_states, tokens_per_expert = self.quantization_padding( permuted_local_hidden_states, tokens_per_expert ) - permuted_probs, _ = self.fp8_padding( + permuted_probs, _ = self.quantization_padding( permuted_probs.unsqueeze(-1), actual_tokens_per_expert ) else: @@ -954,8 +957,8 @@ def glu(x): output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) # upad and concat the output - if self.config.fp8: - output = self.fp8_unpadding(output, actual_tokens_per_expert) + if self.config.fp8 or self.config.fp4: + output = self.quantization_unpadding(output, actual_tokens_per_expert) output = self._apply_bias(output, output_bias, tokens_per_expert, permuted_probs) output_bias = None @@ -1051,10 +1054,18 @@ def __init__( ) self.local_experts.append(expert) - def _pad_tensor_for_fp8(self, hidden, probs): + def _get_align_size_for_quantization(self): + """Get the alignment size for quantization.""" + if self.config.fp8: + return get_fp8_align_size(self.config.fp8_recipe) + elif self.config.fp4: + return get_fp4_align_size(self.config.fp4_recipe) + return 16 + + def _pad_tensor_for_quantization(self, hidden, probs): """Padding tensor shape to multiples of 16/32.""" actual_num_tokens = hidden.shape[0] - divisor = get_fp8_align_size(self.config.fp8_recipe) + divisor = self._get_align_size_for_quantization() padded_num_tokens = ceil(actual_num_tokens / divisor) * divisor - actual_num_tokens if padded_num_tokens > 0: pad_tensor = torch.zeros( @@ -1086,8 +1097,8 @@ def forward( permuted_probs = torch.ones_like(permuted_probs) if self.num_local_experts == 1: - if self.config.fp8: - hidden, probs = self._pad_tensor_for_fp8( + if self.config.fp8 or self.config.fp4: + hidden, probs = self._pad_tensor_for_quantization( permuted_local_hidden_states, permuted_probs ) output, output_bias = self.local_experts[0](hidden, probs) @@ -1106,8 +1117,8 @@ def forward( output_local_list = [] for expert, tokens, probs in zip(self.local_experts, tokens_list, probs_list): - if self.config.fp8: - hidden, probs = self._pad_tensor_for_fp8(tokens, probs) + if self.config.fp8 or self.config.fp4: + hidden, probs = self._pad_tensor_for_quantization(tokens, probs) output, output_bias = expert(hidden, probs) output = output[: tokens.shape[0]] else: diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index ec64d1887a1..142aa74a19e 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -8,6 +8,7 @@ from megatron.core import utils from megatron.core.config import is_experimental_enabled +from megatron.core.fp4_utils import get_fp4_align_size from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.fusions.fused_indices_converter import fused_indices_to_multihot from megatron.core.fusions.fused_pad_routing_map import fused_pad_routing_map @@ -195,6 +196,14 @@ def set_shared_experts(self, shared_experts): assert self.config.moe_shared_expert_overlap self.shared_experts = shared_experts + def get_align_size_for_quantization(self): + """Get the alignment size for quantization.""" + if self.config.fp8: + return get_fp8_align_size(self.config.fp8_recipe) + elif self.config.fp4: + return get_fp4_align_size(self.config.fp4_recipe) + return 16 + class MoEAllGatherTokenDispatcher(MoETokenDispatcher): """ @@ -474,7 +483,7 @@ def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor: if ( self.config.moe_expert_capacity_factor is not None - or self.config.moe_router_padding_for_fp8 + or self.config.moe_router_padding_for_quantization ): # When using token dropping or router padding, output size is dynamic. # Need to sync output size GPU->CPU before allocating output buffer @@ -576,8 +585,8 @@ def dispatch_preprocess( assert routing_map.dtype == torch.bool, "Expected bool tensor for mask" hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) - if self.config.moe_router_padding_for_fp8: - pad_multiple = get_fp8_align_size(self.config.fp8_recipe) + if self.config.moe_router_padding_for_quantization: + pad_multiple = self.get_align_size_for_quantization() if is_experimental_enabled() and self.config.moe_permute_fusion: self.routing_map = fused_pad_routing_map(self.routing_map, pad_multiple) else: @@ -1075,7 +1084,7 @@ def _pad_routing_map( """ Pad the routing map to the nearest multiple of the pad_multiple. """ - pad_multiple = get_fp8_align_size(self.config.fp8_recipe) + pad_multiple = self.get_align_size_for_quantization() num_input_tokens = routing_map.shape[0] target_tokens_per_expert = ( @@ -1110,7 +1119,7 @@ def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> self.dispatched_routing_map, self.dispatched_probs = self._indices_to_multihot( self.dispatched_indices, self.dispatched_probs ) - if self.config.moe_router_padding_for_fp8: + if self.config.moe_router_padding_for_quantization: self.dispatched_routing_map, self.tokens_per_expert = self._pad_routing_map( self.dispatched_routing_map, self.tokens_per_expert ) @@ -1138,6 +1147,14 @@ def get_restored_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> ) return hidden_states + def get_align_size_for_quantization(self): + """Get the alignment size for quantization.""" + if self.config.fp8: + return get_fp8_align_size(self.config.fp8_recipe) + elif self.config.fp4: + return get_fp4_align_size(self.config.fp4_recipe) + return 16 + class MoEFlexTokenDispatcher(MoETokenDispatcher): """A flexible token dispatcher that abstracts the underlying tensor and expert diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index dc11239836f..8b36425ca2a 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -497,10 +497,14 @@ class TransformerConfig(ModelParallelConfig): DEPRECATED and replaced by moe_router_num_groups and moe_router_group_topk. """ + moe_router_padding_for_quantization: Optional[bool] = False + """Whether to pad the routing_map to make sure the number of tokens each expert receives + is a multiple of 16/32 for quantized precision (e.g., FP8, FP4). This can remove the explicit + padding in the GroupedMLP layer.""" + moe_router_padding_for_fp8: Optional[bool] = False - """Whether to pad the routing_map to make sure the number of tokens each expert received - is a multiple of 16/32 for FP8 precision. This can remove the explicit padding in the - GroupedMLP layer.""" + """[Compatibility alias for moe_router_padding_for_quantization] + Enabling this will also enable moe_router_padding_for_quantization.""" moe_router_num_groups: Optional[int] = None """Number of groups to divide experts into for group-limited routing. @@ -1389,13 +1393,23 @@ def __post_init__(self): ) if self.moe_router_padding_for_fp8: - if self.fp8 is None: - raise ValueError("fp8 must be specified when moe_router_padding_for_fp8 is True.") + # enable moe_router_padding_for_quantization + warnings.warn( + "--moe-router-padding-for-fp8 is going to be deprecated. " + "Use --moe-router-padding-for-quantization instead." + ) + self.moe_router_padding_for_quantization = True + + if self.moe_router_padding_for_quantization: + if self.fp8 is None and self.fp4 is None: + raise ValueError( + "fp8/fp4 must be specified when moe_router_padding_for_quantization is True." + ) if self.moe_token_dispatcher_type in ["allgather", "alltoall_seq"]: raise ValueError( "allgather and alltoall_seq dispatcher does not support " - "moe_router_padding_for_fp8." + "moe_router_padding_for_quantization." ) if ( diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 29db36ca6e0..905538ffc9e 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -3129,12 +3129,15 @@ def _add_moe_args(parser): 'The default value 1e-3 is same as that used in DeepSeekV3.') group.add_argument('--moe-router-force-load-balancing', action='store_true', help='[Experimental] Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only!') - group.add_argument('--moe-router-padding-for-fp8', action='store_true', + group.add_argument('--moe-router-padding-for-quantization', action='store_true', help='Pad the routing_map to make sure the number of tokens each expert received ' - 'is a multiple of 16/32 for FP8 precision. It is suggested to enable this for ' - 'dropless training with FP8 precision when num_local_experts > 1. This is a more ' - 'efficient way to pad for FP8 which eliminates the explicit padding in the ' + 'is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for ' + 'dropless training with FP8/FP4 precision when num_local_experts > 1. This is a more ' + 'efficient way to pad for FP8/FP4 which eliminates the explicit padding in the ' 'GroupedMLP layer.') + group.add_argument('--moe-router-padding-for-fp8', action='store_true', + help='[Compatibility alias for --moe-router-padding-for-quantization] ' + 'Enabling this will also enable --moe-router-padding-for-quantization.') group.add_argument('--moe-aux-loss-coeff', type=float, nargs='+', default=0.0, help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.') group.add_argument('--moe-z-loss-coeff', type=float, default=None, diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 328b8837790..82138bc637d 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -280,15 +280,15 @@ def dispatcher_router_padding_for_fp8_test(self): """Test if the routing map is padded correctly for FP8 training. The test runs the forward flow twice: - 1. First with moe_router_padding_for_fp8=False - 2. Then with moe_router_padding_for_fp8=True + 1. First with moe_router_padding_for_quantization=False + 2. Then with moe_router_padding_for_quantization=True We verify that: 1. The results are the same in both cases 2. The number of tokens received by each expert is padded to a multiple of 16 """ - # First run with moe_router_padding_for_fp8 = False - moe_layer = self.new_moe_layer(moe_router_padding_for_fp8=False) + # First run with moe_router_padding_for_quantization = False + moe_layer = self.new_moe_layer(moe_router_padding_for_quantization=False) num_tokens = 32 hidden_states = torch.randn( @@ -309,8 +309,8 @@ def dispatcher_router_padding_for_fp8_test(self): grad_1 = hidden_states.grad.clone() hidden_states.grad = None - # Run with moe_router_padding_for_fp8 = True - moe_layer_2 = self.new_moe_layer(moe_router_padding_for_fp8=True, fp8="hybrid") + # Run with moe_router_padding_for_quantization = True + moe_layer_2 = self.new_moe_layer(moe_router_padding_for_quantization=True, fp8="hybrid") moe_layer_2.load_state_dict(moe_layer.state_dict()) probs_2, indices_2 = moe_layer_2.router(hidden_states) From 061bc3765ab6132f9caa0203c7fe7227bc4f5c48 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Tue, 14 Oct 2025 18:48:17 -0700 Subject: [PATCH 021/248] ADLR/megatron-lm!4248 - ADLR/megatron-lm!4159 - Fix ProcessGroupCollection missing initialization --- megatron/core/optimizer/__init__.py | 11 +- megatron/core/parallel_state.py | 22 ++-- megatron/core/process_groups_config.py | 103 ++++++++++++++---- tests/unit_tests/test_optimizer.py | 17 ++- .../unit_tests/test_process_groups_config.py | 33 ++++++ .../test_transformer_block_custom_pgs.py | 11 +- 6 files changed, 161 insertions(+), 36 deletions(-) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index c644160cda7..307538fad22 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -281,6 +281,7 @@ def _get_megatron_optimizer_based_on_param_groups( data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, data_parallel_group_gloo: Optional[torch.distributed.ProcessGroup] = None, data_parallel_group_idx: Optional[int] = None, + intra_dist_opt_group: Optional[torch.distributed.ProcessGroup] = None, distributed_optimizer_instance_id: Optional[int] = 0, ) -> MegatronOptimizer: """Get Megatron optimizer based on parameter groups. @@ -459,11 +460,7 @@ def init_state_fn(opt, config=None): # This is needed for case where num_distributed_optimizer_instances > 1. In this case, # weight gradients are all-reduced across optimizer instances, so each instance has # the duplicated weight gradients, need to reduce gradient stats inside each instance. - setattr( - optimizer, - 'grad_stats_parallel_group', - parallel_state.get_intra_distributed_optimizer_instance_group(), - ) + setattr(optimizer, 'grad_stats_parallel_group', intra_dist_opt_group) else: optimizer = Float16OptimizerWithFloat16Params(*optimizer_args) setattr(optimizer, 'grad_stats_parallel_group', model_parallel_group) @@ -532,6 +529,7 @@ def get_megatron_optimizer( expt_tp_pp_group = process_groups['expt_tp_pp_group'] intra_dp_cp_group_gloo = process_groups['intra_dp_cp_group_gloo'] intra_expt_dp_group_gloo = process_groups['intra_expt_dp_group_gloo'] + intra_dist_opt_group = process_groups['intra_dist_opt_group'] model_parallel_rank = get_pg_rank(mp_group) @@ -570,6 +568,7 @@ def get_megatron_optimizer( data_parallel_group=dp_cp_group, data_parallel_group_gloo=intra_dp_cp_group_gloo, data_parallel_group_idx=model_parallel_rank, + intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, ) ) @@ -610,6 +609,7 @@ def get_megatron_optimizer( data_parallel_group=intra_dp_cp_group, data_parallel_group_gloo=intra_dp_cp_group_gloo, data_parallel_group_idx=model_parallel_rank, + intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, ) ) @@ -643,6 +643,7 @@ def get_megatron_optimizer( data_parallel_group=intra_expt_dp_group, data_parallel_group_gloo=expt_data_parallel_group_gloo, data_parallel_group_idx=expt_model_parallel_rank, + intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, ) ) diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index be7eaf27ce4..1e41bf9d8c2 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -13,6 +13,8 @@ from .utils import GlobalMemoryBuffer, is_torch_min_version +logger = logging.getLogger(__name__) + try: import einops @@ -1892,23 +1894,25 @@ def get_expert_data_parallel_world_size(partial_expert_data_parallel=False): return 0 -def get_intra_distributed_optimizer_instance_group(): +def get_intra_distributed_optimizer_instance_group(check_initialized=True): """Get the group of all GPUs in a distributed optimizer instance.""" - assert ( - _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP is not None - ), "Intra distributed optimizer instance group is not initialized" + if check_initialized: + assert ( + _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP is not None + ), "Intra distributed optimizer instance group is not initialized" return _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP -def get_inter_distributed_optimizer_instance_group(): +def get_inter_distributed_optimizer_instance_group(check_initialized=True): """Get the group spanning the different distributed optimizer instances. Attention and MLP/Expert share same inter-instance group, so only built inter_partial_expert_data_parallel_group, and return it at here. """ - assert _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is not None, ( - "Attention and MLP/Expert share same inter distributed optimize instance group, " - "which has not been initialized" - ) + if check_initialized: + assert _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP is not None, ( + "Attention and MLP/Expert share same inter distributed optimize instance group, " + "which has not been initialized" + ) return _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP diff --git a/megatron/core/process_groups_config.py b/megatron/core/process_groups_config.py index 989a31b6f33..07c922ea685 100644 --- a/megatron/core/process_groups_config.py +++ b/megatron/core/process_groups_config.py @@ -127,9 +127,12 @@ class ProcessGroupCollection: # _INTRA_EXPERT_DATA_PARALLEL_GROUP intra_expt_dp: torch.distributed.ProcessGroup = field(init=False) - # _INTER_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP + # _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP inter_dist_opt: torch.distributed.ProcessGroup = field(init=False) + # _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP + intra_dist_opt: torch.distributed.ProcessGroup = field(init=False) + def __init__(self, **kwargs): for key in kwargs: if key in [field.name for field in fields(self)]: @@ -161,29 +164,71 @@ def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None): # Mapping of attribute names to their initialization functions pg_to_func = { - 'tp': parallel_state.get_tensor_model_parallel_group, - 'pp': parallel_state.get_pipeline_model_parallel_group, - 'mp': parallel_state.get_model_parallel_group, - 'cp': parallel_state.get_context_parallel_group, - 'tp_cp': parallel_state.get_tensor_and_context_parallel_group, - 'hcp': parallel_state.get_hierarchical_context_parallel_groups, - 'ep': parallel_state.get_expert_model_parallel_group, - 'expt_tp': parallel_state.get_expert_tensor_parallel_group, - 'tp_ep': parallel_state.get_expert_tensor_and_model_parallel_group, - 'tp_ep_pp': parallel_state.get_expert_tensor_model_pipeline_parallel_group, - 'embd': parallel_state.get_embedding_group, - 'pos_embd': parallel_state.get_position_embedding_group, + 'tp': partial(parallel_state.get_tensor_model_parallel_group, check_initialized=False), + 'pp': partial( + parallel_state.get_pipeline_model_parallel_group, check_initialized=False + ), + 'mp': partial(parallel_state.get_model_parallel_group, check_initialized=False), + 'cp': partial(parallel_state.get_context_parallel_group, check_initialized=False), + 'tp_cp': partial( + parallel_state.get_tensor_and_context_parallel_group, check_initialized=False + ), + 'hcp': partial( + parallel_state.get_hierarchical_context_parallel_groups, check_initialized=False + ), + 'ep': partial(parallel_state.get_expert_model_parallel_group, check_initialized=False), + 'expt_tp': partial( + parallel_state.get_expert_tensor_parallel_group, check_initialized=False + ), + 'tp_ep': partial( + parallel_state.get_expert_tensor_and_model_parallel_group, check_initialized=False + ), + 'tp_ep_pp': partial( + parallel_state.get_expert_tensor_model_pipeline_parallel_group, + check_initialized=False, + ), + 'embd': partial(parallel_state.get_embedding_group, check_initialized=False), + 'pos_embd': partial( + parallel_state.get_position_embedding_group, check_initialized=False + ), + 'dp': parallel_state.get_data_parallel_group, + 'dp_cp': partial(parallel_state.get_data_parallel_group, with_context_parallel=True), + 'intra_dp_cp': partial( + parallel_state.get_data_parallel_group, + with_context_parallel=True, + partial_data_parallel=True, + ), + 'intra_expt_dp': partial( + parallel_state.get_expert_data_parallel_group, + check_initialized=False, + partial_expert_data_parallel=True, + ), + 'inter_dist_opt': partial( + parallel_state.get_inter_distributed_optimizer_instance_group, + check_initialized=False, + ), + 'intra_dist_opt': partial( + parallel_state.get_intra_distributed_optimizer_instance_group, + check_initialized=False, + ), # TODO (Hepteract): remove this once distributed checkpoint is refactored - 'expt_dp': parallel_state.get_expert_data_parallel_group, + 'expt_dp': partial( + parallel_state.get_expert_data_parallel_group, check_initialized=False + ), 'tp_dp_cp': partial( - parallel_state.get_tensor_and_data_parallel_group, with_context_parallel=True + parallel_state.get_tensor_and_data_parallel_group, + check_initialized=False, + with_context_parallel=True, ), } + assert all( + pg in pg_to_func for pg in required_pgs + ), f"Initialization function for process group not defined for all \ + ProcessGroupCollection fields" + # Build initialization dict by calling appropriate parallel_state get_foo_group - init_dict = { - pg: pg_to_func[pg](check_initialized=False) for pg in required_pgs if pg in pg_to_func - } + init_dict = {pg: pg_to_func[pg]() for pg in required_pgs} return cls(**init_dict) @@ -212,6 +257,7 @@ def setup_process_groups_for_optimizer( - mp_group: Model parallel group - expt_tp_pp_group: Expert tensor-model-pipeline parallel group - inter_dist_opt_group: Inter distributed optimizer group (may be None) + - intra_dist_opt_group: Intra distributed optimizer group (may be None) - intra_dp_cp_group_gloo: Gloo version of intra_dp_cp_group (may be None) - intra_expt_dp_group_gloo: Gloo version of intra_expt_dp_group (may be None) """ @@ -233,6 +279,7 @@ def setup_process_groups_for_optimizer( intra_expt_dp_group = parallel_state.get_expert_data_parallel_group( partial_expert_data_parallel=True ) + intra_dist_opt_group = parallel_state.get_intra_distributed_optimizer_instance_group() # Gloo groups if use_gloo_process_groups: @@ -310,20 +357,32 @@ def setup_process_groups_for_optimizer( hasattr(pg_collection, 'intra_dp_cp') and hasattr(pg_collection, 'intra_expt_dp') and hasattr(pg_collection, 'inter_dist_opt') + and hasattr(pg_collection, 'intra_dist_opt') ): raise ValueError( - "intra_dp_cp, intra_expt_dp, and inter_dist_opt " + "intra_dp_cp, intra_expt_dp, inter_dist_opt, and intra_dist_opt " "process groups are required when using multiple optimizer " "instances (>1) but not provided in pg_collection" ) intra_dp_cp_group = pg_collection.intra_dp_cp intra_expt_dp_group = pg_collection.intra_expt_dp inter_dist_opt_group = pg_collection.inter_dist_opt + + if ddp_config.use_distributed_optimizer: + if not hasattr(pg_collection, 'intra_dist_opt'): + raise ValueError( + "intra_dist_opt process group is required but not provided in " + "pg_collection. Please explicitly set it to None if you don't need it." + ) + intra_dist_opt_group = pg_collection.intra_dist_opt + else: + intra_dist_opt_group = None else: # No ddp_config available - use simple fallback intra_dp_cp_group = dp_cp_group intra_expt_dp_group = expt_dp_group inter_dist_opt_group = None + intra_dist_opt_group = None # 5. Model communication groups if not hasattr(pg_collection, 'mp'): @@ -359,6 +418,7 @@ def setup_process_groups_for_optimizer( 'mp_group': mp_group, 'expt_tp_pp_group': expt_tp_pp_group, 'inter_dist_opt_group': inter_dist_opt_group, + 'intra_dist_opt_group': intra_dist_opt_group, 'intra_dp_cp_group_gloo': intra_dp_cp_group_gloo, 'intra_expt_dp_group_gloo': intra_expt_dp_group_gloo, } @@ -411,6 +471,11 @@ def setup_process_groups_for_ddp( if ddp_config.num_distributed_optimizer_instances > 1 else None ), + 'intra_dist_opt_group': ( + parallel_state.get_intra_distributed_optimizer_instance_group() + if ddp_config.use_distributed_optimizer + else None + ), } else: # Use provided process group collection with validation and fallbacks diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py index 35969565a18..d8f6e3a2eeb 100644 --- a/tests/unit_tests/test_optimizer.py +++ b/tests/unit_tests/test_optimizer.py @@ -420,10 +420,16 @@ def test_get_megatron_optimizer_with_custom_process_groups(world_size, tp_size, mp_mesh = device_mesh["pp", "tp"] mp_group = mp_mesh._flatten().get_group() + # Create intra_dist_opt group + # It has the same ranks as dp_cp group when num_distributed_optimizer_instances is not > 1 + intra_dist_opt_mesh = device_mesh["dp", "cp"] + intra_dist_opt_group = intra_dist_opt_mesh._flatten().get_group() + # Create process group configurations pg_collection = ProcessGroupCollection() pg_collection.dp = dp_group pg_collection.dp_cp = dp_cp_group + pg_collection.intra_dist_opt = intra_dist_opt_group pg_collection.expt_dp = None # Not using expert parallelism in this test pg_collection.tp = tp_group @@ -547,12 +553,19 @@ def test_get_megatron_optimizer_custom_process_groups_validation(): pg_collection=pg_collection_no_expt_dp, ) - # Test 4: Missing mp attribute in pg_collection + # Test 4: Missing intra_dist_opt and mp attribute in pg_collection pg_collection_complete = ProcessGroupCollection() pg_collection_complete.dp = torch.distributed.new_group() pg_collection_complete.expt_dp = None # Explicitly set to None as allowed - # Missing required 'mp' attribute + # Missing required 'intra_dist_opt' attribute + with pytest.raises(ValueError, match="intra_dist_opt process group is required"): + get_megatron_optimizer( + config=optimizer_config, model_chunks=model_chunks, pg_collection=pg_collection_complete + ) + + pg_collection_complete.intra_dist_opt = None # Explicitly set to None as allowed + # Missing required 'mp' attribute with pytest.raises(ValueError, match="mp process group is required"): get_megatron_optimizer( config=optimizer_config, model_chunks=model_chunks, pg_collection=pg_collection_complete diff --git a/tests/unit_tests/test_process_groups_config.py b/tests/unit_tests/test_process_groups_config.py index 0b7e886d61a..032de47e951 100644 --- a/tests/unit_tests/test_process_groups_config.py +++ b/tests/unit_tests/test_process_groups_config.py @@ -1,8 +1,10 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import pytest import torch.distributed as dist from megatron.core.process_groups_config import ProcessGroupCollection +from tests.unit_tests.test_utilities import Utils class TestProcessGroupsConfig: @@ -64,3 +66,34 @@ def test_hierarchical_context_parallel_groups(self, mocker): assert len(model_pgs.hcp) == 2 assert model_pgs.hcp[0] == mock_pg1 assert model_pgs.hcp[1] == mock_pg2 + + +class TestPGConfigDefaultInitialization: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_default_initialization(self): + """Test default initialization of ProcessGroupCollection.""" + # Create instance + model_pgs = ProcessGroupCollection.use_mpu_process_groups() + + # Test that instance was created successfully + assert hasattr(model_pgs, 'tp') + assert hasattr(model_pgs, 'pp') + assert hasattr(model_pgs, 'dp') + assert hasattr(model_pgs, 'dp_cp') + + # Test that only required process groups were initialized + model_pgs = ProcessGroupCollection.use_mpu_process_groups(['tp', 'pp', 'cp']) + assert hasattr(model_pgs, 'tp') + assert hasattr(model_pgs, 'pp') + assert hasattr(model_pgs, 'cp') + assert not hasattr(model_pgs, 'dp') + + # Test that an error is raised if an invalid process group is requested + with pytest.raises(ValueError, match=r"Invalid process groups requested"): + model_pgs = ProcessGroupCollection.use_mpu_process_groups(['tp', 'pp', 'foo']) diff --git a/tests/unit_tests/transformer/test_transformer_block_custom_pgs.py b/tests/unit_tests/transformer/test_transformer_block_custom_pgs.py index e8d708db8aa..bb64efe7449 100644 --- a/tests/unit_tests/transformer/test_transformer_block_custom_pgs.py +++ b/tests/unit_tests/transformer/test_transformer_block_custom_pgs.py @@ -422,10 +422,19 @@ def test_fwd_bwd_pass_non_uniform_transformer_block( attn_pg_collection = ProcessGroupCollection(tp=attn_tp_group, cp=attn_cp_group) mlp_pg_collection = ProcessGroupCollection(tp=mlp_tp_group) + default_pg_collection = ProcessGroupCollection.use_mpu_process_groups( + required_pgs=['tp', 'pp', 'cp'] + ) # Get the layer spec with different process groups for attention and mlp hetro_layer_spec = _gpt_te_layer_spec_with_hetro_pgs(attn_pg_collection, mlp_pg_collection) - custom_block = TransformerBlock(transformer_config, hetro_layer_spec).cuda().bfloat16() + custom_block = ( + TransformerBlock( + transformer_config, hetro_layer_spec, pg_collection=default_pg_collection + ) + .cuda() + .bfloat16() + ) sequence_length = 4096 micro_batch_size = 2 From b007b91525b4f08ac25dc1dcc5a27d3f9854009a Mon Sep 17 00:00:00 2001 From: Dennis Liu Date: Tue, 14 Oct 2025 20:23:09 -0700 Subject: [PATCH 022/248] ADLR/megatron-lm!4207 - Refactor dev functional tests. --- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../golden_values_dev_dgx_h100.json | 110 ++-- .../golden_values_dev_dgxh100_coreweave.json | 500 +++++++++--------- .../model_config.yaml | 5 +- .../model_config.yaml.tmp | 132 +++++ .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../golden_values_dev_dgxh100_coreweave.json | 344 ++++++++++++ .../model_config.yaml | 5 +- .../golden_values_dev_dgxh100_coreweave.json | 498 ++++++++--------- .../model_config.yaml | 9 +- .../golden_values_dev_dgxh100_coreweave.json | 344 ++++++++++++ .../model_config.yaml | 5 +- tests/test_utils/recipes/bert.yaml | 101 ---- tests/test_utils/recipes/moe.yaml | 70 +-- tests/test_utils/recipes/t5.yaml | 116 ---- 17 files changed, 1400 insertions(+), 851 deletions(-) create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json delete mode 100644 tests/test_utils/recipes/bert.yaml delete mode 100644 tests/test_utils/recipes/t5.yaml diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml index 2354ecd7fd9..041d35cab11 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml @@ -57,4 +57,4 @@ MODEL_ARGS: --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true --use-tp-pp-dp-mapping: true -TEST_TYPE: regular +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml index 7c0a103200a..7f9613ba222 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml @@ -56,4 +56,4 @@ MODEL_ARGS: --disable-bias-linear: true --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true -TEST_TYPE: regular +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json index 9816ef27d80..5f29261761b 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json @@ -4,17 +4,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 11.04748, - "5": 9.53183, - "10": 9.0582, - "15": 8.04864, - "20": 7.90062, - "25": 7.67495, - "30": 7.64523, - "35": 7.21226, - "40": 7.54531, - "45": 7.1859, - "50": 7.03421 + "1": 11.04737, + "5": 9.52647, + "10": 9.05826, + "15": 8.04442, + "20": 7.89153, + "25": 7.67197, + "30": 7.64284, + "35": 7.2114, + "40": 7.54179, + "45": 7.18472, + "50": 7.03329 } }, "num-zeros": { @@ -22,17 +22,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 38802624.0, - "5": 256032528.0, - "10": 734802368.0, - "15": 733708032.0, - "20": 964047040.0, - "25": 827440640.0, - "30": 753621760.0, - "35": 721925632.0, - "40": 585270144.0, - "45": 511642912.0, - "50": 447736576.0 + "1": 38802604.0, + "5": 252879712.0, + "10": 728514944.0, + "15": 711699968.0, + "20": 992357632.0, + "25": 884068160.0, + "30": 794514496.0, + "35": 712491648.0, + "40": 588410624.0, + "45": 521081920.0, + "50": 432013312.0 } }, "mem-allocated-bytes": { @@ -58,17 +58,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 54207885312.0, - "5": 57055031296.0, - "10": 57055031296.0, - "15": 57055031296.0, - "20": 57055031296.0, - "25": 57055031296.0, - "30": 57055031296.0, - "35": 57055031296.0, - "40": 57055031296.0, - "45": 57055031296.0, - "50": 57221648384.0 + "1": 22860046336.0, + "5": 25729300480.0, + "10": 25729300480.0, + "15": 25888860160.0, + "20": 25888860160.0, + "25": 25888860160.0, + "30": 25888860160.0, + "35": 25888860160.0, + "40": 26620856320.0, + "45": 26620856320.0, + "50": 26620856320.0 } }, "mtp_1 loss": { @@ -76,17 +76,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 11.07654, - "5": 9.81153, - "10": 9.12699, - "15": 7.99246, - "20": 7.83056, - "25": 7.61672, - "30": 7.58819, - "35": 7.15342, - "40": 7.47463, - "45": 7.12042, - "50": 6.97381 + "1": 11.07644, + "5": 9.81173, + "10": 9.12712, + "15": 7.99147, + "20": 7.82967, + "25": 7.61319, + "30": 7.58479, + "35": 7.15178, + "40": 7.47349, + "45": 7.12034, + "50": 6.97212 } }, "iteration-time": { @@ -94,17 +94,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 50.25533, - "5": 2.27026, - "10": 1.07136, - "15": 1.14652, - "20": 1.0723, - "25": 1.07693, - "30": 1.05572, - "35": 1.06285, - "40": 1.06142, - "45": 1.07083, - "50": 1.07307 + "1": 59.91943, + "5": 2.44769, + "10": 1.07968, + "15": 1.04699, + "20": 0.93032, + "25": 0.92301, + "30": 0.92916, + "35": 0.94157, + "40": 0.95917, + "45": 0.94382, + "50": 0.94866 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json index 0f2637a9511..17dce39fb21 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04748, - "2": 11.03561, - "3": 9.58774, - "4": 9.25819, - "5": 9.53583, - "6": 9.8804, - "7": 9.48247, - "8": 8.93575, - "9": 8.65813, - "10": 9.0567, - "11": 8.49445, - "12": 8.52444, - "13": 8.45239, - "14": 7.97323, - "15": 8.0476, - "16": 8.07971, - "17": 8.09081, - "18": 7.76437, - "19": 8.14892, - "20": 7.89868, - "21": 7.59371, - "22": 7.54743, - "23": 7.43222, - "24": 7.4302, - "25": 7.67579, - "26": 7.06929, - "27": 7.62041, - "28": 7.32495, - "29": 7.49042, - "30": 7.64391, - "31": 7.39435, - "32": 7.58789, - "33": 7.64037, - "34": 7.69778, - "35": 7.20998, - "36": 7.08538, - "37": 7.42584, - "38": 7.18804, - "39": 7.55054, - "40": 7.54446, - "41": 7.49287, - "42": 7.24937, - "43": 7.23587, - "44": 7.41595, - "45": 7.18755, - "46": 6.89949, - "47": 7.29966, - "48": 7.14134, - "49": 7.58963, - "50": 7.03602 + "1": 11.04737, + "2": 11.03581, + "3": 9.58839, + "4": 9.258, + "5": 9.52647, + "6": 9.907, + "7": 9.48764, + "8": 8.94128, + "9": 8.65518, + "10": 9.05826, + "11": 8.49585, + "12": 8.52509, + "13": 8.4535, + "14": 7.97148, + "15": 8.04442, + "16": 8.08093, + "17": 8.08585, + "18": 7.76263, + "19": 8.14979, + "20": 7.89153, + "21": 7.57836, + "22": 7.54353, + "23": 7.43311, + "24": 7.42342, + "25": 7.67197, + "26": 7.07162, + "27": 7.6134, + "28": 7.31484, + "29": 7.48975, + "30": 7.64284, + "31": 7.39141, + "32": 7.58528, + "33": 7.6358, + "34": 7.69534, + "35": 7.2114, + "36": 7.08322, + "37": 7.42539, + "38": 7.18849, + "39": 7.5489, + "40": 7.54179, + "41": 7.48887, + "42": 7.24738, + "43": 7.2341, + "44": 7.41462, + "45": 7.18472, + "46": 6.89672, + "47": 7.30005, + "48": 7.14262, + "49": 7.58803, + "50": 7.03329 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802612.0, - "2": 38543592.0, - "3": 38739528.0, - "4": 279937824.0, - "5": 259189728.0, - "6": 271446400.0, - "7": 604773504.0, - "8": 768892544.0, - "9": 645824128.0, - "10": 744257088.0, - "11": 718888576.0, - "12": 746732544.0, - "13": 871990976.0, - "14": 821645632.0, - "15": 724250816.0, - "16": 932241472.0, - "17": 648958912.0, - "18": 649120000.0, - "19": 925992960.0, - "20": 989207936.0, - "21": 819324096.0, - "22": 736955072.0, - "23": 910497792.0, - "24": 876716672.0, - "25": 843170688.0, - "26": 809573824.0, - "27": 854086912.0, - "28": 802857664.0, - "29": 805523328.0, - "30": 775645184.0, - "31": 771754624.0, - "32": 749733696.0, - "33": 718385216.0, - "34": 724771200.0, - "35": 737655104.0, - "36": 690419968.0, - "37": 673203456.0, - "38": 627239552.0, - "39": 614047168.0, - "40": 607288512.0, - "41": 582590592.0, - "42": 548211200.0, - "43": 532740640.0, - "44": 554239168.0, - "45": 514790528.0, - "46": 350258560.0, - "47": 472420128.0, - "48": 453788736.0, - "49": 440597216.0, - "50": 303063296.0 + "1": 38802604.0, + "2": 38543572.0, + "3": 38739364.0, + "4": 283087744.0, + "5": 252879712.0, + "6": 261986800.0, + "7": 595325120.0, + "8": 778328192.0, + "9": 667827904.0, + "10": 728514944.0, + "11": 718857664.0, + "12": 778200448.0, + "13": 884592256.0, + "14": 846830080.0, + "15": 711699968.0, + "16": 929099456.0, + "17": 718131072.0, + "18": 690071360.0, + "19": 944853824.0, + "20": 992357632.0, + "21": 794133440.0, + "22": 909975808.0, + "23": 919936064.0, + "24": 895588736.0, + "25": 884068160.0, + "26": 869339392.0, + "27": 857232640.0, + "28": 846888320.0, + "29": 821245440.0, + "30": 794514496.0, + "31": 756025600.0, + "32": 762315264.0, + "33": 759280512.0, + "34": 759373696.0, + "35": 712491648.0, + "36": 677834240.0, + "37": 632307392.0, + "38": 614655616.0, + "39": 607761664.0, + "40": 588410624.0, + "41": 582593792.0, + "42": 573377664.0, + "43": 579927552.0, + "44": 579405952.0, + "45": 521081920.0, + "46": 488627232.0, + "47": 478708544.0, + "48": 475807040.0, + "49": 450025824.0, + "50": 432013312.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55055331328.0, - "2": 57809321984.0, - "3": 57918455808.0, - "4": 57918455808.0, - "5": 57918455808.0, - "6": 57918455808.0, - "7": 57918455808.0, - "8": 57918455808.0, - "9": 57918455808.0, - "10": 57918455808.0, - "11": 57918455808.0, - "12": 57918455808.0, - "13": 57931390976.0, - "14": 57931390976.0, - "15": 57931390976.0, - "16": 57931390976.0, - "17": 57931390976.0, - "18": 57931390976.0, - "19": 57931390976.0, - "20": 57931390976.0, - "21": 57931390976.0, - "22": 57931390976.0, - "23": 57931390976.0, - "24": 57931390976.0, - "25": 57931390976.0, - "26": 57931390976.0, - "27": 57931390976.0, - "28": 57931390976.0, - "29": 57931390976.0, - "30": 57931390976.0, - "31": 57931390976.0, - "32": 58003226624.0, - "33": 58003226624.0, - "34": 58003226624.0, - "35": 58003226624.0, - "36": 58003226624.0, - "37": 58003226624.0, - "38": 58003226624.0, - "39": 58003226624.0, - "40": 58003226624.0, - "41": 58003226624.0, - "42": 58003226624.0, - "43": 58003226624.0, - "44": 58183614464.0, - "45": 58234208256.0, - "46": 58555555840.0, - "47": 58555555840.0, - "48": 58555555840.0, - "49": 58555555840.0, - "50": 58780934144.0 + "1": 22860046336.0, + "2": 25612713984.0, + "3": 25729300480.0, + "4": 25729300480.0, + "5": 25729300480.0, + "6": 25729300480.0, + "7": 25729300480.0, + "8": 25729300480.0, + "9": 25729300480.0, + "10": 25729300480.0, + "11": 25729300480.0, + "12": 25729300480.0, + "13": 25888860160.0, + "14": 25888860160.0, + "15": 25888860160.0, + "16": 25888860160.0, + "17": 25888860160.0, + "18": 25888860160.0, + "19": 25888860160.0, + "20": 25888860160.0, + "21": 25888860160.0, + "22": 25888860160.0, + "23": 25888860160.0, + "24": 25888860160.0, + "25": 25888860160.0, + "26": 25888860160.0, + "27": 25888860160.0, + "28": 25888860160.0, + "29": 25888860160.0, + "30": 25888860160.0, + "31": 25888860160.0, + "32": 25888860160.0, + "33": 25888860160.0, + "34": 25888860160.0, + "35": 25888860160.0, + "36": 25888860160.0, + "37": 25888860160.0, + "38": 26026612736.0, + "39": 26610898944.0, + "40": 26620856320.0, + "41": 26620856320.0, + "42": 26620856320.0, + "43": 26620856320.0, + "44": 26620856320.0, + "45": 26620856320.0, + "46": 26620856320.0, + "47": 26620856320.0, + "48": 26620856320.0, + "49": 26620856320.0, + "50": 26620856320.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07654, - "2": 11.07406, - "3": 10.53881, - "4": 10.09803, - "5": 9.81154, - "6": 10.06236, - "7": 9.79762, - "8": 9.07117, - "9": 8.87049, - "10": 9.127, - "11": 8.49853, - "12": 8.53046, - "13": 8.42444, - "14": 7.847, - "15": 7.99077, - "16": 8.05015, - "17": 8.00064, - "18": 7.73104, - "19": 8.11087, - "20": 7.82933, - "21": 7.52501, - "22": 7.49916, - "23": 7.36982, - "24": 7.37235, - "25": 7.61578, - "26": 7.02029, - "27": 7.56014, - "28": 7.2681, - "29": 7.44399, - "30": 7.58618, - "31": 7.32468, - "32": 7.50596, - "33": 7.5715, - "34": 7.63581, - "35": 7.15224, - "36": 7.01784, - "37": 7.35163, - "38": 7.12551, - "39": 7.48656, - "40": 7.47408, - "41": 7.42096, - "42": 7.17595, - "43": 7.16059, - "44": 7.34289, - "45": 7.11969, - "46": 6.82753, - "47": 7.23525, - "48": 7.08042, - "49": 7.51043, - "50": 6.9735 + "1": 11.07644, + "2": 11.07413, + "3": 10.53865, + "4": 10.09826, + "5": 9.81173, + "6": 10.07241, + "7": 9.79857, + "8": 9.07114, + "9": 8.86995, + "10": 9.12712, + "11": 8.49873, + "12": 8.53173, + "13": 8.426, + "14": 7.84827, + "15": 7.99147, + "16": 8.05097, + "17": 8.00164, + "18": 7.73164, + "19": 8.11121, + "20": 7.82967, + "21": 7.52376, + "22": 7.49787, + "23": 7.3697, + "24": 7.37154, + "25": 7.61319, + "26": 7.02025, + "27": 7.559, + "28": 7.26735, + "29": 7.44367, + "30": 7.58479, + "31": 7.32416, + "32": 7.50469, + "33": 7.56964, + "34": 7.63474, + "35": 7.15178, + "36": 7.01748, + "37": 7.34976, + "38": 7.12419, + "39": 7.4868, + "40": 7.47349, + "41": 7.42217, + "42": 7.17743, + "43": 7.16238, + "44": 7.34394, + "45": 7.12034, + "46": 6.82708, + "47": 7.235, + "48": 7.07985, + "49": 7.51123, + "50": 6.97212 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 69.29797, - "2": 1.7261, - "3": 1.40981, - "4": 2.16562, - "5": 1.7862, - "6": 1.7469, - "7": 1.96688, - "8": 1.97301, - "9": 1.74665, - "10": 1.69613, - "11": 1.02979, - "12": 1.02408, - "13": 1.03261, - "14": 1.02432, - "15": 1.0529, - "16": 1.04491, - "17": 1.03693, - "18": 1.03399, - "19": 1.03627, - "20": 1.02284, - "21": 1.01667, - "22": 1.02932, - "23": 1.03591, - "24": 1.03466, - "25": 1.03149, - "26": 1.03165, - "27": 1.02342, - "28": 1.03777, - "29": 1.04061, - "30": 1.05641, - "31": 1.02382, - "32": 1.01775, - "33": 1.03039, - "34": 1.03693, - "35": 1.03153, - "36": 1.02699, - "37": 1.02756, - "38": 1.02919, - "39": 1.01773, - "40": 1.03491, - "41": 1.03152, - "42": 1.03035, - "43": 1.0221, - "44": 1.05201, - "45": 1.02579, - "46": 1.02798, - "47": 1.03857, - "48": 1.02772, - "49": 1.0408, - "50": 1.03745 + "1": 63.23561, + "2": 1.12406, + "3": 0.92471, + "4": 1.95991, + "5": 1.98896, + "6": 1.40765, + "7": 1.83926, + "8": 1.3919, + "9": 1.58886, + "10": 0.76479, + "11": 0.74358, + "12": 0.74438, + "13": 0.75457, + "14": 0.74884, + "15": 0.7437, + "16": 0.81872, + "17": 0.74739, + "18": 0.75196, + "19": 0.76647, + "20": 0.74522, + "21": 0.73871, + "22": 0.73978, + "23": 0.73654, + "24": 0.73919, + "25": 0.73709, + "26": 0.78913, + "27": 0.75434, + "28": 0.7477, + "29": 0.73673, + "30": 0.74952, + "31": 0.75513, + "32": 0.74212, + "33": 0.74433, + "34": 0.74812, + "35": 0.7512, + "36": 0.74822, + "37": 0.74176, + "38": 0.7553, + "39": 0.77677, + "40": 0.76693, + "41": 0.76205, + "42": 0.76182, + "43": 0.76665, + "44": 0.76169, + "45": 0.74735, + "46": 0.74195, + "47": 0.75025, + "48": 0.74129, + "49": 0.74367, + "50": 0.74308 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml index 5390afcd09b..0cce9b4edb6 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml @@ -17,8 +17,7 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix + --attention-backend: fused # Training args --use-mcore-models: true --sequence-parallel: true @@ -123,7 +122,7 @@ MODEL_ARGS: # Add mixed precision args --bf16: true --exit-interval: 50 -TEST_TYPE: regular +TEST_TYPE: ckpt-resume METRICS: - "iteration-time" - "lm loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp new file mode 100644 index 00000000000..e36d590170d --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp @@ -0,0 +1,132 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --num-virtual-stages-per-pipeline-rank: 4 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --attention-backend: fused + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --manual-gc: true + --manual-gc-interval: 100 + # Transformer Engine args + --transformer-impl: transformer_engine + # Data args + --seq-length: 4096 + --data-cache-path: /workspace/data/cache + --data-path: /workspace/data/gpt3_data/my-gpt3_00_text_document + --vocab-file: /workspace/data/gpt3_data/bpe/vocab.json + --merge-file: /workspace/data/gpt3_data/bpe/merges.txt + --split: 949,50,1 + # Add network size args + --num-layers: 16 + --hidden-size: 1024 + --ffn-hidden-size: 4096 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + # Comment out the following MTP args to disable MTP + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add MoE args + --num-experts: 32 + --moe-layer-freq: ([0]*1+[1]*15) + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-token-dispatcher-type: alltoall + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add checkpointing args + --save: /opt/megatron-lm/runs/82c8dc72-e955-4033-a246-b61784f57fa7/checkpoints + --load: /tmp/checkpoints/ + --save-interval: 25 + # Add initialization args + --init-method-std: 0.02 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: /opt/megatron-lm/runs/82c8dc72-e955-4033-a246-b61784f57fa7/tensorboard + # Add mixed precision args + --bf16: true + --exit-interval: 50 +TEST_TYPE: regular +METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 19a8b4fc639..4e553f2f9ed 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 10000 + --save-interval: 25 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine @@ -61,4 +61,4 @@ MODEL_ARGS: --attention-backend: unfused --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true -TEST_TYPE: regular +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml index f27db4a8021..7ba366f1d1b 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml @@ -34,7 +34,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 10000 + --save-interval: 25 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine @@ -63,4 +63,4 @@ MODEL_ARGS: --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true --exit-interval: 50 -TEST_TYPE: regular +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..cdd69820131 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04747, + "2": 11.03489, + "3": 9.59197, + "4": 9.2607, + "5": 9.25316, + "6": 9.70587, + "7": 9.46635, + "8": 9.01114, + "9": 8.72173, + "10": 9.06704, + "11": 8.59397, + "12": 8.5643, + "13": 8.44846, + "14": 7.97921, + "15": 8.04905, + "16": 8.09886, + "17": 8.04172, + "18": 7.76126, + "19": 8.14014, + "20": 7.86027, + "21": 7.54995, + "22": 7.53872, + "23": 7.40693, + "24": 7.40435, + "25": 7.66065, + "26": 7.05772, + "27": 7.59552, + "28": 7.30627, + "29": 7.48007, + "30": 7.63012, + "31": 7.38325, + "32": 7.57843, + "33": 7.62828, + "34": 7.68919, + "35": 7.20168, + "36": 7.07506, + "37": 7.41935, + "38": 7.17961, + "39": 7.54005, + "40": 7.53821, + "41": 7.47888, + "42": 7.24055, + "43": 7.2256, + "44": 7.40803, + "45": 7.1775, + "46": 6.88877, + "47": 7.29436, + "48": 7.13581, + "49": 7.58407, + "50": 7.02865 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802648.0, + "2": 38543564.0, + "3": 38740428.0, + "4": 264349216.0, + "5": 224711328.0, + "6": 359592256.0, + "7": 683584064.0, + "8": 850747136.0, + "9": 781151872.0, + "10": 863934336.0, + "11": 784956928.0, + "12": 787741824.0, + "13": 906642432.0, + "14": 793413952.0, + "15": 724351360.0, + "16": 929182656.0, + "17": 728944832.0, + "18": 715233856.0, + "19": 894586752.0, + "20": 942182208.0, + "21": 712310464.0, + "22": 903670336.0, + "23": 882199552.0, + "24": 867334400.0, + "25": 874751488.0, + "26": 844191104.0, + "27": 813243648.0, + "28": 626785920.0, + "29": 808773120.0, + "30": 602759296.0, + "31": 793783168.0, + "32": 768613888.0, + "33": 721639040.0, + "34": 734472448.0, + "35": 734570880.0, + "36": 703058560.0, + "37": 692109824.0, + "38": 649260992.0, + "39": 620422656.0, + "40": 604143616.0, + "41": 598320448.0, + "42": 573424384.0, + "43": 576846912.0, + "44": 570038144.0, + "45": 540081024.0, + "46": 501251008.0, + "47": 497637664.0, + "48": 494691072.0, + "49": 490977312.0, + "50": 463542304.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7321331200.0, + "2": 7321333248.0, + "3": 7321333248.0, + "4": 7321333248.0, + "5": 7321333248.0, + "6": 7321333248.0, + "7": 7321333248.0, + "8": 7321333248.0, + "9": 7321333248.0, + "10": 7321333248.0, + "11": 7321333248.0, + "12": 7321333248.0, + "13": 7321333248.0, + "14": 7321333248.0, + "15": 7321333248.0, + "16": 7321333248.0, + "17": 7321333248.0, + "18": 7321333248.0, + "19": 7321333248.0, + "20": 7321333248.0, + "21": 7321333248.0, + "22": 7321333248.0, + "23": 7321333248.0, + "24": 7321333248.0, + "25": 7321333248.0, + "26": 7321333248.0, + "27": 7321333248.0, + "28": 7321333248.0, + "29": 7321333248.0, + "30": 7321333248.0, + "31": 7321333248.0, + "32": 7321333248.0, + "33": 7321333248.0, + "34": 7321333248.0, + "35": 7321333248.0, + "36": 7321333248.0, + "37": 7321333248.0, + "38": 7321333248.0, + "39": 7321333248.0, + "40": 7321333248.0, + "41": 7321333248.0, + "42": 7321333248.0, + "43": 7321333248.0, + "44": 7321333248.0, + "45": 7321333248.0, + "46": 7321333248.0, + "47": 7321333248.0, + "48": 7321333248.0, + "49": 7321333248.0, + "50": 7321333248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22198937600.0, + "2": 24950007808.0, + "3": 24950007808.0, + "4": 24950007808.0, + "5": 24950007808.0, + "6": 24950007808.0, + "7": 24950007808.0, + "8": 24950007808.0, + "9": 24950007808.0, + "10": 24950007808.0, + "11": 24950007808.0, + "12": 24950007808.0, + "13": 24950007808.0, + "14": 24950007808.0, + "15": 24950007808.0, + "16": 24950007808.0, + "17": 24950007808.0, + "18": 24950007808.0, + "19": 24950007808.0, + "20": 24950007808.0, + "21": 24950007808.0, + "22": 24950007808.0, + "23": 24950007808.0, + "24": 24950007808.0, + "25": 24950007808.0, + "26": 24950007808.0, + "27": 25072799744.0, + "28": 25343600640.0, + "29": 25625788416.0, + "30": 25625788416.0, + "31": 25628155904.0, + "32": 25707937792.0, + "33": 25707937792.0, + "34": 25707937792.0, + "35": 25707937792.0, + "36": 25707937792.0, + "37": 25707937792.0, + "38": 25707937792.0, + "39": 25707937792.0, + "40": 25707937792.0, + "41": 25707937792.0, + "42": 25707937792.0, + "43": 25707937792.0, + "44": 25707937792.0, + "45": 25707937792.0, + "46": 25707937792.0, + "47": 25707937792.0, + "48": 25707937792.0, + "49": 25707937792.0, + "50": 25707937792.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07742, + "2": 11.07559, + "3": 10.5272, + "4": 10.08877, + "5": 9.81119, + "6": 9.88673, + "7": 9.70278, + "8": 8.9944, + "9": 8.79002, + "10": 9.07171, + "11": 8.44594, + "12": 8.50226, + "13": 8.40983, + "14": 7.83955, + "15": 7.97902, + "16": 8.03361, + "17": 7.99642, + "18": 7.71928, + "19": 8.10116, + "20": 7.82113, + "21": 7.51112, + "22": 7.48906, + "23": 7.35335, + "24": 7.35884, + "25": 7.60836, + "26": 7.01391, + "27": 7.54721, + "28": 7.25644, + "29": 7.43129, + "30": 7.57524, + "31": 7.321, + "32": 7.50218, + "33": 7.56009, + "34": 7.62505, + "35": 7.14234, + "36": 7.0092, + "37": 7.34655, + "38": 7.11926, + "39": 7.4822, + "40": 7.46808, + "41": 7.41272, + "42": 7.1698, + "43": 7.15213, + "44": 7.33728, + "45": 7.11437, + "46": 6.81846, + "47": 7.2282, + "48": 7.07339, + "49": 7.50345, + "50": 6.96783 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 71.2429, + "2": 1.39205, + "3": 1.3521, + "4": 1.31895, + "5": 0.86745, + "6": 0.86249, + "7": 1.0949, + "8": 1.03022, + "9": 0.80778, + "10": 0.82011, + "11": 0.81426, + "12": 0.8098, + "13": 0.81209, + "14": 0.81361, + "15": 0.80969, + "16": 0.81315, + "17": 0.85127, + "18": 0.80813, + "19": 0.81928, + "20": 0.81012, + "21": 0.8101, + "22": 0.81064, + "23": 0.80537, + "24": 0.81149, + "25": 0.81261, + "26": 0.81877, + "27": 0.80314, + "28": 0.80383, + "29": 0.83563, + "30": 0.80254, + "31": 0.80006, + "32": 0.80658, + "33": 0.81426, + "34": 0.81824, + "35": 0.81124, + "36": 0.80978, + "37": 0.80679, + "38": 0.80838, + "39": 0.81028, + "40": 0.81044, + "41": 0.81268, + "42": 0.81318, + "43": 0.79311, + "44": 0.80471, + "45": 0.80526, + "46": 0.79795, + "47": 0.80592, + "48": 0.80158, + "49": 0.80635, + "50": 0.79969 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml index 7ebd9f0d1af..c920037f0f2 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml @@ -17,8 +17,7 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix + --attention-backend: fused # Training args --use-mcore-models: true --sequence-parallel: true @@ -126,7 +125,7 @@ MODEL_ARGS: --fp8-format: hybrid --fp8-recipe: tensorwise --exit-interval: 50 -TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json index 58eb3fc16cd..7c3cd772f4f 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.95004, - "2": 10.9521, - "3": 10.5115, - "4": 9.96454, - "5": 9.93941, + "1": 10.94947, + "2": 10.95236, + "3": 10.50817, + "4": 9.96373, + "5": 9.93907, "6": 9.67273, - "7": 10.20975, - "8": 9.49716, - "9": 9.55902, - "10": 9.79742, - "11": 9.30109, - "12": 9.40483, - "13": 9.39546, - "14": 8.84681, - "15": 9.02444, - "16": 9.07121, - "17": 9.04574, - "18": 8.75678, - "19": 9.18159, - "20": 8.8595, - "21": 8.53503, - "22": 8.55182, - "23": 8.42441, - "24": 8.37608, - "25": 8.64304, - "26": 7.97393, - "27": 8.56806, - "28": 8.19764, - "29": 8.3928, - "30": 8.67283, - "31": 8.289, - "32": 8.43572, - "33": 8.5568, - "34": 8.66018, - "35": 8.07934, - "36": 7.94976, - "37": 8.29565, - "38": 7.98044, - "39": 8.39201, - "40": 8.35513, - "41": 8.31876, - "42": 8.0583, - "43": 8.03283, - "44": 8.24243, - "45": 8.10277, - "46": 7.61696, - "47": 8.15273, - "48": 8.00569, - "49": 8.38688, - "50": 7.81491 + "7": 10.2137, + "8": 9.4963, + "9": 9.56483, + "10": 9.7979, + "11": 9.30107, + "12": 9.40465, + "13": 9.39581, + "14": 8.84796, + "15": 9.02503, + "16": 9.07162, + "17": 9.04638, + "18": 8.75696, + "19": 9.18152, + "20": 8.86295, + "21": 8.5361, + "22": 8.55339, + "23": 8.42711, + "24": 8.37747, + "25": 8.64415, + "26": 7.97441, + "27": 8.56675, + "28": 8.19618, + "29": 8.39325, + "30": 8.67137, + "31": 8.28979, + "32": 8.43623, + "33": 8.55717, + "34": 8.6598, + "35": 8.07929, + "36": 7.94958, + "37": 8.29465, + "38": 7.9784, + "39": 8.39172, + "40": 8.35622, + "41": 8.31635, + "42": 8.06507, + "43": 8.03396, + "44": 8.24146, + "45": 8.1039, + "46": 7.61771, + "47": 8.15375, + "48": 8.00818, + "49": 8.38737, + "50": 7.81612 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403624.0, - "2": 19274194.0, - "3": 19372760.0, - "4": 86525248.0, - "5": 148575568.0, - "6": 145226704.0, - "7": 171879984.0, - "8": 195785248.0, - "9": 164124752.0, - "10": 167684736.0, - "11": 221077344.0, - "12": 200384224.0, - "13": 248872528.0, - "14": 211169424.0, - "15": 214304608.0, - "16": 216075632.0, - "17": 267845984.0, - "18": 170470336.0, - "19": 176865072.0, - "20": 187955392.0, - "21": 225750704.0, - "22": 247396816.0, - "23": 211643856.0, - "24": 205638464.0, - "25": 277022272.0, - "26": 291562304.0, - "27": 225789840.0, - "28": 288202368.0, - "29": 198390384.0, - "30": 213302208.0, - "31": 227204752.0, - "32": 271112416.0, - "33": 231840432.0, - "34": 203575536.0, - "35": 191152368.0, - "36": 222566928.0, - "37": 177810112.0, - "38": 228708544.0, - "39": 211168784.0, - "40": 215603968.0, - "41": 200089440.0, - "42": 228529888.0, - "43": 198782848.0, - "44": 141902272.0, - "45": 181922816.0, - "46": 115369856.0, - "47": 170214176.0, - "48": 137292832.0, - "49": 97654936.0, - "50": 160979632.0 + "1": 19403784.0, + "2": 19274252.0, + "3": 19373794.0, + "4": 89687600.0, + "5": 139124400.0, + "6": 138949920.0, + "7": 170316512.0, + "8": 192665728.0, + "9": 168817872.0, + "10": 156652864.0, + "11": 217935232.0, + "12": 213007792.0, + "13": 228424704.0, + "14": 217442256.0, + "15": 237921408.0, + "16": 225523072.0, + "17": 225458384.0, + "18": 164166928.0, + "19": 164457904.0, + "20": 180124848.0, + "21": 230463232.0, + "22": 230096384.0, + "23": 210054656.0, + "24": 200985472.0, + "25": 248708512.0, + "26": 301000896.0, + "27": 205364384.0, + "28": 270886048.0, + "29": 259695952.0, + "30": 224280720.0, + "31": 244360992.0, + "32": 189382672.0, + "33": 231930816.0, + "34": 206712432.0, + "35": 194319616.0, + "36": 246163408.0, + "37": 193561968.0, + "38": 228822688.0, + "39": 226941728.0, + "40": 196742032.0, + "41": 200179904.0, + "42": 219112640.0, + "43": 186235920.0, + "44": 138763920.0, + "45": 148907984.0, + "46": 109115896.0, + "47": 167015728.0, + "48": 156135104.0, + "49": 91378480.0, + "50": 164099648.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4883602432.0, - "2": 4885017088.0, - "3": 4882657792.0, - "4": 4883046912.0, - "5": 4883725824.0, - "6": 4883713536.0, - "7": 4883040768.0, - "8": 4883273216.0, - "9": 4882952704.0, - "10": 4885949952.0, - "11": 4883990016.0, - "12": 4887679488.0, - "13": 4884011520.0, - "14": 4882899456.0, - "15": 4883515904.0, - "16": 4883990016.0, - "17": 4883410432.0, - "18": 4883673600.0, - "19": 4882903552.0, - "20": 4884541952.0, - "21": 4883138048.0, - "22": 4883247616.0, - "23": 4883839488.0, - "24": 4885058048.0, - "25": 4882676224.0, - "26": 4884058624.0, - "27": 4884724224.0, - "28": 4884874752.0, - "29": 4883127808.0, - "30": 4883252736.0, - "31": 4882955776.0, - "32": 4885190144.0, - "33": 4883845632.0, - "34": 4884392448.0, - "35": 4883083776.0, - "36": 4883851776.0, - "37": 4885246464.0, - "38": 4882680320.0, - "39": 4884296192.0, - "40": 4884689408.0, - "41": 4882836992.0, - "42": 4883972608.0, - "43": 4884519424.0, - "44": 4883354112.0, - "45": 4883495424.0, - "46": 4882788864.0, - "47": 4883144192.0, - "48": 4883688960.0, - "49": 4884182528.0, - "50": 4885279232.0 + "1": 4751680512.0, + "2": 4752032256.0, + "3": 4751058432.0, + "4": 4751692288.0, + "5": 4750785024.0, + "6": 4750721536.0, + "7": 4750738944.0, + "8": 4750471680.0, + "9": 4750078464.0, + "10": 4750671360.0, + "11": 4750662144.0, + "12": 4750013952.0, + "13": 4750343680.0, + "14": 4750866944.0, + "15": 4751114752.0, + "16": 4754016768.0, + "17": 4751645184.0, + "18": 4749773312.0, + "19": 4751623680.0, + "20": 4749661696.0, + "21": 4751997440.0, + "22": 4751115776.0, + "23": 4750557696.0, + "24": 4751779328.0, + "25": 4750678528.0, + "26": 4749646336.0, + "27": 4750984704.0, + "28": 4752366080.0, + "29": 4750876160.0, + "30": 4750423552.0, + "31": 4750733824.0, + "32": 4751212032.0, + "33": 4750073344.0, + "34": 4751521280.0, + "35": 4750867968.0, + "36": 4750440960.0, + "37": 4750258688.0, + "38": 4751287808.0, + "39": 4749742592.0, + "40": 4750831104.0, + "41": 4750516736.0, + "42": 4750870016.0, + "43": 4750633472.0, + "44": 4750676480.0, + "45": 4750337536.0, + "46": 4751146496.0, + "47": 4750629376.0, + "48": 4750627328.0, + "49": 4751527424.0, + "50": 4750583296.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41210470400.0, - "2": 41210470400.0, - "3": 41210470400.0, - "4": 41210470400.0, - "5": 41210470400.0, - "6": 41210470400.0, - "7": 41210470400.0, - "8": 41210470400.0, - "9": 41210470400.0, - "10": 41210470400.0, - "11": 41210470400.0, - "12": 41210470400.0, - "13": 41210470400.0, - "14": 41210470400.0, - "15": 41210470400.0, - "16": 41210470400.0, - "17": 41210470400.0, - "18": 41210470400.0, - "19": 41210470400.0, - "20": 41210470400.0, - "21": 41210470400.0, - "22": 41210470400.0, - "23": 41210470400.0, - "24": 41210470400.0, - "25": 41210470400.0, - "26": 41210470400.0, - "27": 41210470400.0, - "28": 41210470400.0, - "29": 41210470400.0, - "30": 41210470400.0, - "31": 41210470400.0, - "32": 41210470400.0, - "33": 41210470400.0, - "34": 41210470400.0, - "35": 41210470400.0, - "36": 41210470400.0, - "37": 41210470400.0, - "38": 41210470400.0, - "39": 41210470400.0, - "40": 41210470400.0, - "41": 41210470400.0, - "42": 41210470400.0, - "43": 41210470400.0, - "44": 41210470400.0, - "45": 41210470400.0, - "46": 41210470400.0, - "47": 41210470400.0, - "48": 41210470400.0, - "49": 41210470400.0, - "50": 41210470400.0 + "1": 11458484224.0, + "2": 12450223104.0, + "3": 12450223104.0, + "4": 12450223104.0, + "5": 12450223104.0, + "6": 12572350464.0, + "7": 12815280128.0, + "8": 12815280128.0, + "9": 13430808576.0, + "10": 13558942720.0, + "11": 13558942720.0, + "12": 13558942720.0, + "13": 13558942720.0, + "14": 13558942720.0, + "15": 13558942720.0, + "16": 13558942720.0, + "17": 13558942720.0, + "18": 13558942720.0, + "19": 13558942720.0, + "20": 13558942720.0, + "21": 13764741120.0, + "22": 13887232000.0, + "23": 13887232000.0, + "24": 13887232000.0, + "25": 13887232000.0, + "26": 13887232000.0, + "27": 13887232000.0, + "28": 13887232000.0, + "29": 13887232000.0, + "30": 13887232000.0, + "31": 13887232000.0, + "32": 13887232000.0, + "33": 13887232000.0, + "34": 13887232000.0, + "35": 13887232000.0, + "36": 13887232000.0, + "37": 13887232000.0, + "38": 13887232000.0, + "39": 13887232000.0, + "40": 13887232000.0, + "41": 13887232000.0, + "42": 13887232000.0, + "43": 13887232000.0, + "44": 13887232000.0, + "45": 13887232000.0, + "46": 13887232000.0, + "47": 13887232000.0, + "48": 13887232000.0, + "49": 13887232000.0, + "50": 13887232000.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 86.8085, - "2": 1.10913, - "3": 0.99097, - "4": 0.89412, - "5": 1.25997, - "6": 0.98162, - "7": 0.98318, - "8": 1.13296, - "9": 0.88126, - "10": 0.8633, - "11": 2.2744, - "12": 4.5393, - "13": 3.22763, - "14": 1.64923, - "15": 0.86595, - "16": 0.86575, - "17": 0.85272, - "18": 0.85454, - "19": 0.85281, - "20": 0.87018, - "21": 0.84654, - "22": 0.8494, - "23": 0.84882, - "24": 0.84482, - "25": 0.85311, - "26": 0.84678, - "27": 0.84096, - "28": 0.8412, - "29": 0.84156, - "30": 0.84475, - "31": 0.84747, - "32": 0.85058, - "33": 0.84977, - "34": 0.8479, - "35": 0.85234, - "36": 0.85012, - "37": 0.85087, - "38": 0.84594, - "39": 0.84558, - "40": 0.84807, - "41": 0.84183, - "42": 0.8439, - "43": 0.84221, - "44": 0.84248, - "45": 0.84257, - "46": 0.83922, - "47": 0.84311, - "48": 0.84159, - "49": 0.84011, - "50": 0.8353 + "1": 83.38985, + "2": 0.80022, + "3": 0.71751, + "4": 0.65556, + "5": 0.98544, + "6": 0.76766, + "7": 0.73114, + "8": 0.76226, + "9": 0.62791, + "10": 0.62224, + "11": 0.69873, + "12": 0.62401, + "13": 0.62467, + "14": 0.62054, + "15": 0.6218, + "16": 0.61653, + "17": 0.6184, + "18": 0.63217, + "19": 0.61609, + "20": 0.62413, + "21": 0.60966, + "22": 0.60967, + "23": 0.60674, + "24": 0.60595, + "25": 0.60063, + "26": 0.60502, + "27": 0.60923, + "28": 0.60939, + "29": 0.61217, + "30": 0.60702, + "31": 0.61517, + "32": 0.60803, + "33": 0.60624, + "34": 0.6123, + "35": 0.61133, + "36": 0.60971, + "37": 0.61215, + "38": 0.61014, + "39": 0.62694, + "40": 0.60532, + "41": 0.60477, + "42": 0.60297, + "43": 0.60073, + "44": 0.59786, + "45": 0.60582, + "46": 0.60848, + "47": 0.60019, + "48": 0.60064, + "49": 0.60304, + "50": 0.58276 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml index 23842f00384..9fdcb460cf3 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml @@ -17,8 +17,7 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix + --attention-backend: fused # Training args --use-mcore-models: true --sequence-parallel: true @@ -128,10 +127,10 @@ MODEL_ARGS: --fp8-format: hybrid --fp8-recipe: tensorwise --exit-interval: 50 -TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" - "num-zeros" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" + # - "mem-allocated-bytes" + # - "mem-max-allocated-bytes" # Disable for now since resume training has more memory cost. To be investigated. diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..8c4f243d4c2 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.0637, + "2": 11.03838, + "3": 9.79196, + "4": 14.17309, + "5": 9.48263, + "6": 9.30356, + "7": 9.27632, + "8": 8.75189, + "9": 8.70462, + "10": 9.04035, + "11": 8.41109, + "12": 8.53109, + "13": 8.43144, + "14": 7.93673, + "15": 8.00837, + "16": 8.08212, + "17": 8.06887, + "18": 7.75236, + "19": 8.13737, + "20": 7.88364, + "21": 7.56605, + "22": 7.55552, + "23": 7.42862, + "24": 7.41252, + "25": 7.67597, + "26": 7.08176, + "27": 7.62221, + "28": 7.32629, + "29": 7.49894, + "30": 7.63447, + "31": 7.3983, + "32": 7.59785, + "33": 7.64396, + "34": 7.70726, + "35": 7.21393, + "36": 7.08985, + "37": 7.42971, + "38": 7.19273, + "39": 7.56041, + "40": 7.55564, + "41": 7.49928, + "42": 7.25988, + "43": 7.24878, + "44": 7.42783, + "45": 7.21045, + "46": 6.91669, + "47": 7.31999, + "48": 7.16939, + "49": 7.62783, + "50": 7.05439 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802064.0, + "2": 38543200.0, + "3": 38744220.0, + "4": 166695072.0, + "5": 394456256.0, + "6": 441303136.0, + "7": 538731776.0, + "8": 680781184.0, + "9": 564001216.0, + "10": 571185472.0, + "11": 624455360.0, + "12": 680622208.0, + "13": 777548288.0, + "14": 717772992.0, + "15": 699100416.0, + "16": 677486208.0, + "17": 645761024.0, + "18": 671155776.0, + "19": 674320512.0, + "20": 891692160.0, + "21": 658833920.0, + "22": 802998016.0, + "23": 756352768.0, + "24": 772904192.0, + "25": 748799104.0, + "26": 771817792.0, + "27": 772312064.0, + "28": 655008000.0, + "29": 783495808.0, + "30": 794511296.0, + "31": 756035712.0, + "32": 535862592.0, + "33": 680633984.0, + "34": 482597312.0, + "35": 671593792.0, + "36": 658959488.0, + "37": 626012736.0, + "38": 614650240.0, + "39": 595183872.0, + "40": 421718816.0, + "41": 557433600.0, + "42": 545065344.0, + "43": 539024064.0, + "44": 544803840.0, + "45": 517934176.0, + "46": 504352736.0, + "47": 497582464.0, + "48": 500981632.0, + "49": 490922656.0, + "50": 472902496.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6294696448.0, + "2": 6295491072.0, + "3": 6296283648.0, + "4": 6297076224.0, + "5": 6297868800.0, + "6": 6298661376.0, + "7": 6294104064.0, + "8": 6294896640.0, + "9": 6295689216.0, + "10": 6296481792.0, + "11": 6294500352.0, + "12": 6295292928.0, + "13": 6296085504.0, + "14": 6296878080.0, + "15": 6297670656.0, + "16": 6298463232.0, + "17": 6299255808.0, + "18": 6300048384.0, + "19": 6300840960.0, + "20": 6301633536.0, + "21": 6302426112.0, + "22": 6303218688.0, + "23": 6304011264.0, + "24": 6304803840.0, + "25": 6305596416.0, + "26": 6306388992.0, + "27": 6307181568.0, + "28": 6307974144.0, + "29": 6308766720.0, + "30": 6309559296.0, + "31": 6310351872.0, + "32": 6311144448.0, + "33": 6311937024.0, + "34": 6312729600.0, + "35": 6313522176.0, + "36": 6314314752.0, + "37": 6315107328.0, + "38": 6315899904.0, + "39": 6316692480.0, + "40": 6317485056.0, + "41": 6318277632.0, + "42": 6319070208.0, + "43": 6319862784.0, + "44": 6320655360.0, + "45": 6321447936.0, + "46": 6322240512.0, + "47": 6323033088.0, + "48": 6323825664.0, + "49": 6324618240.0, + "50": 6325410816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 46771978240.0, + "2": 49466654720.0, + "3": 51157819392.0, + "4": 51157819392.0, + "5": 51157819392.0, + "6": 51157819392.0, + "7": 51157819392.0, + "8": 51157819392.0, + "9": 51157819392.0, + "10": 51157819392.0, + "11": 51157819392.0, + "12": 51157819392.0, + "13": 51157819392.0, + "14": 51157819392.0, + "15": 51157819392.0, + "16": 51157819392.0, + "17": 51157819392.0, + "18": 51157819392.0, + "19": 51157819392.0, + "20": 51157819392.0, + "21": 51157819392.0, + "22": 51157819392.0, + "23": 51157819392.0, + "24": 51157819392.0, + "25": 51157819392.0, + "26": 51157819392.0, + "27": 51157819392.0, + "28": 51157819392.0, + "29": 51157819392.0, + "30": 51157819392.0, + "31": 51157819392.0, + "32": 51157819392.0, + "33": 51157819392.0, + "34": 51157819392.0, + "35": 51157819392.0, + "36": 51157819392.0, + "37": 51157819392.0, + "38": 51157819392.0, + "39": 51157819392.0, + "40": 51157819392.0, + "41": 51157819392.0, + "42": 51157819392.0, + "43": 51157819392.0, + "44": 51157819392.0, + "45": 51157819392.0, + "46": 51157819392.0, + "47": 51157819392.0, + "48": 51157819392.0, + "49": 51157819392.0, + "50": 51157819392.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04508, + "2": 11.05397, + "3": 10.54505, + "4": 9.99194, + "5": 9.76285, + "6": 9.45507, + "7": 9.54431, + "8": 8.91725, + "9": 8.74784, + "10": 9.04997, + "11": 8.40193, + "12": 8.48288, + "13": 8.36926, + "14": 7.81448, + "15": 7.93865, + "16": 8.02231, + "17": 7.96741, + "18": 7.70552, + "19": 8.09012, + "20": 7.79984, + "21": 7.48241, + "22": 7.49502, + "23": 7.35415, + "24": 7.34793, + "25": 7.60324, + "26": 7.01638, + "27": 7.55495, + "28": 7.24721, + "29": 7.43133, + "30": 7.56633, + "31": 7.31391, + "32": 7.50445, + "33": 7.55658, + "34": 7.62234, + "35": 7.13802, + "36": 7.00593, + "37": 7.33916, + "38": 7.1095, + "39": 7.4736, + "40": 7.45784, + "41": 7.40514, + "42": 7.15986, + "43": 7.14965, + "44": 7.32758, + "45": 7.11892, + "46": 6.81056, + "47": 7.2234, + "48": 7.06789, + "49": 7.503, + "50": 6.9559 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 71.51538, + "2": 1.72071, + "3": 1.31657, + "4": 1.18423, + "5": 3.82179, + "6": 2.3037, + "7": 3.15765, + "8": 1.26325, + "9": 1.04414, + "10": 1.05643, + "11": 2.7525, + "12": 1.03473, + "13": 1.05477, + "14": 1.05184, + "15": 1.06441, + "16": 1.1362, + "17": 1.05355, + "18": 1.05093, + "19": 1.04209, + "20": 1.03871, + "21": 1.04773, + "22": 1.05492, + "23": 1.02882, + "24": 1.05172, + "25": 1.03632, + "26": 1.04229, + "27": 1.04662, + "28": 1.05014, + "29": 1.03047, + "30": 1.0813, + "31": 1.06319, + "32": 1.02842, + "33": 1.041, + "34": 1.02275, + "35": 1.03563, + "36": 1.0411, + "37": 1.02865, + "38": 1.03454, + "39": 1.05619, + "40": 1.04996, + "41": 1.02719, + "42": 1.05309, + "43": 1.03532, + "44": 1.05042, + "45": 1.03343, + "46": 1.04769, + "47": 1.03458, + "48": 1.04744, + "49": 1.04302, + "50": 1.0386 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml index 0a37ee08498..4036686e888 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml @@ -16,8 +16,7 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix + --attention-backend: unfused # Training args --use-mcore-models: true --sequence-parallel: true @@ -126,7 +125,7 @@ MODEL_ARGS: --bf16: true --exit-interval: 50 --overlap-moe-expert-parallel-comm: true -TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" diff --git a/tests/test_utils/recipes/bert.yaml b/tests/test_utils/recipes/bert.yaml deleted file mode 100644 index f0be62e4701..00000000000 --- a/tests/test_utils/recipes/bert.yaml +++ /dev/null @@ -1,101 +0,0 @@ -type: basic -format_version: 1 -maintainers: [mcore] -loggers: [stdout] -spec: - name: "{test_case}_{environment}_{platforms}" - model: bert - nodes: 1 - build: mcore-pyt-{environment} - gpus: 8 - platforms: dgx_a100 - time_limit: - n_repeat: - artifacts: - /workspace/data/bert_data: text/the_pile/bert_shard00 - /workspace/checkpoints/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev: model/mcore_bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_{platforms}_1N8G_dev/28359448 - script_setup: | - unset https_proxy - echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc - - # Checkout latest - cd /opt - rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm - git init - git remote add origin $MCORE_REPO - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' - git fetch origin $MCORE_MR_COMMIT - git checkout $MCORE_MR_COMMIT - git rev-parse HEAD - - # Checkout backwards-ref - cd /opt - rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy - git init - git remote add origin $MCORE_REPO - git fetch origin $MCORE_BACKWARDS_COMMIT - git checkout $MCORE_BACKWARDS_COMMIT - git rev-parse HEAD - rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ - script: |- - ls - cd /opt/megatron-lm - NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') - ARGUMENTS=( - "DATA_PATH=/workspace/data/bert_data" - "DATA_CACHE_PATH=/workspace/data/cache" - "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/tensorboard" - "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" - "TRAINING_SCRIPT_PATH=pretrain_bert.py" - "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" - "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" - "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" - ) - - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} - -products: - - test_case: [bert_mr_mcore_tp2_pp2_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_h100] - - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_h100] - - test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_h100] - # - test_case: [bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G] - # products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 972288bd905..af4b4203803 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -64,18 +64,6 @@ products: ####################################################################### # Nightly tests: Run both DEV and LTS unless something is flaky # ####################################################################### - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - environment: [lts] - scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel] products: - environment: [dev] @@ -83,32 +71,11 @@ products: platforms: [dgx_a100, dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - environment: [lts] - scope: [nightly] - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last] products: - environment: [dev] scope: [nightly] platforms: [dgx_a100, dgx_h100] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - environment: [lts] - scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - environment: [lts] - scope: [nightly] # - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts] # products: # non-determinism: #478 # - environment: [dev, lts] @@ -125,43 +92,21 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] - # products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] # hang: #513 - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # hang: #513 - # - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] - # products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] # hang: #513 - - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] products: - environment: [dev] scope: [mr] - platforms: [dgx_h100] + platforms: [dgx_h100] # hang: #513 - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G] - products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] # hang: #513 - - environment: [lts] - scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G] products: - environment: [dev] @@ -187,6 +132,11 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] + - test_case: [gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### @@ -203,7 +153,7 @@ products: ########################### # Merge train tests # ########################### - - test_case: [gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] + - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] scope: [mr] @@ -211,11 +161,11 @@ products: - environment: [dev] scope: [mr-slim] platforms: [dgx_h100] - - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] products: - environment: [dev] scope: [mr] - platforms: [dgx_h100] + platforms: [dgx_h100] # hang: #513 - environment: [dev] scope: [mr-slim] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/t5.yaml deleted file mode 100644 index 31a72e9b5a1..00000000000 --- a/tests/test_utils/recipes/t5.yaml +++ /dev/null @@ -1,116 +0,0 @@ -type: basic -format_version: 1 -maintainers: [mcore] -loggers: [stdout] -spec: - name: "{test_case}_{environment}_{platforms}" - model: t5 - build: mcore-pyt-{environment} - nodes: 1 - gpus: 8 - platforms: dgx_a100 - artifacts: - /workspace/data/t5_data: text/the_pile/t5_shard00 - /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_{platforms}_1N8G_dev/28359448 - script_setup: | - unset https_proxy - echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc - - # Checkout latest - cd /opt - rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm - git init - git remote add origin $MCORE_REPO - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' - git fetch origin $MCORE_MR_COMMIT - git checkout $MCORE_MR_COMMIT - git rev-parse HEAD - - # Checkout backwards-ref - cd /opt - rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy - git init - git remote add origin $MCORE_REPO - git fetch origin $MCORE_BACKWARDS_COMMIT - git checkout $MCORE_BACKWARDS_COMMIT - git rev-parse HEAD - rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ - script: |- - ls - cd /opt/megatron-lm - - NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') - - ARGUMENTS=( - "DATA_PATH=/workspace/data/t5_data" - "DATA_CACHE_PATH=/workspace/data/cache" - "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/tensorboard" - "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" - "TRAINING_SCRIPT_PATH=pretrain_t5.py" - "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" - "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" - "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" - ) - - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} - -products: - - test_case: [t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] - - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1] - products: - - environment: [dev] - scope: [nightly] - platforms: [dgx_a100, dgx_h100] From 32e9518b0e6a91049e9c0ae3b1c471a0d3fd348a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 15 Oct 2025 06:36:13 +0000 Subject: [PATCH 023/248] ci: No batch short anymore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 358ad740e01..af972c8d0cf 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -76,7 +76,7 @@ test:unit_tests_configure: "--test-cases all" "--cluster dgxh100_coreweave" "--platform dgx_h100" - "--partition batch_short,batch" + "--partition batch" "--container-image ${UTILITY_IMAGE}" "--container-tag ${CI_PIPELINE_ID}" "--dependent-job test:unit_tests_configure" From bbc762d5c62f28f31944507f8628719d2f84a6db Mon Sep 17 00:00:00 2001 From: Li Tao Date: Wed, 15 Oct 2025 06:07:18 -0700 Subject: [PATCH 024/248] ADLR/megatron-lm!4231 - [Dev] fix(dataset): fix the divergence when using dsv3 tokenizer after !3646; Have datasets account for tokenizers which incorrectly define PAD; Co-authored-by: Teodor-Dumitru Ene --- megatron/core/datasets/bert_dataset.py | 10 ++-- .../blended_megatron_dataset_config.py | 11 ++++ megatron/core/datasets/gpt_dataset.py | 8 --- megatron/core/datasets/megatron_dataset.py | 47 +++++++++++++++++ megatron/core/datasets/t5_dataset.py | 4 +- megatron/training/arguments.py | 14 +++++ megatron/training/tokenizer/tokenizer.py | 52 +++++++++++++------ pretrain_bert.py | 1 + pretrain_gpt.py | 1 + pretrain_mamba.py | 1 + pretrain_retro.py | 1 + pretrain_t5.py | 1 + pretrain_vlm.py | 1 + 13 files changed, 120 insertions(+), 32 deletions(-) diff --git a/megatron/core/datasets/bert_dataset.py b/megatron/core/datasets/bert_dataset.py index 314efb46cd6..6772a4e6644 100644 --- a/megatron/core/datasets/bert_dataset.py +++ b/megatron/core/datasets/bert_dataset.py @@ -139,18 +139,14 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: assert length_pads >= 0 tokens = numpy.array(tokens, dtype=numpy.int64) - tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad) + tokens = numpy.pad(tokens, (0, length_pads), constant_values=self._pad_token_id) assignments = numpy.array(assignments, dtype=numpy.int64) - assignments = numpy.pad( - assignments, (0, length_pads), constant_values=self.config.tokenizer.pad - ) + assignments = numpy.pad(assignments, (0, length_pads), constant_values=self._pad_token_id) # Get the padding mask mask_pads = numpy.ones(length_toks, dtype=numpy.int64) - mask_pads = numpy.pad( - mask_pads, (0, length_pads), constant_values=self.config.tokenizer.pad - ) + mask_pads = numpy.pad(mask_pads, (0, length_pads), constant_values=self._pad_token_id) # Mask the labels labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1 diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py index 3222ece836f..fd7132acc0f 100644 --- a/megatron/core/datasets/blended_megatron_dataset_config.py +++ b/megatron/core/datasets/blended_megatron_dataset_config.py @@ -77,6 +77,17 @@ class BlendedMegatronDatasetConfig: datasets(s). """ + allow_ambiguous_pad_tokens: Optional[bool] = False + """Whether to prevent pad tokens already present in the dataset from being masked out + when the pad token incorrectly shares the same id with other special tokens. + Treating such tokens as pad tokens results in training instability and divergence. + Such a scenario is best resolved by fixing the tokenizer, but leaving this option as False + provides a workaround. + This argument will have no effect if the tokenizer is correct. However, should the user + desire to train on a dataset that intentionally contains pad tokens - while also using an + incorrect tokenizer - this option may be set to True. This is typically not recommended. + """ + def __post_init__(self) -> None: """Do asserts and set fields post init""" if self.blend_per_split is not None and any(self.blend_per_split): diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py index 7ea63df8051..c96fed08065 100644 --- a/megatron/core/datasets/gpt_dataset.py +++ b/megatron/core/datasets/gpt_dataset.py @@ -20,9 +20,6 @@ logger = logging.getLogger(__name__) -_PAD_TOKEN_ID = -1 - - @dataclass class GPTDatasetConfig(BlendedMegatronDatasetConfig): """Configuration object for Megatron Core GPT datasets""" @@ -105,11 +102,6 @@ def __init__( self.cached_loss_mask = None self.cached_position_ids = None - try: - self._pad_token_id = self.config.tokenizer.pad - except Exception: - self._pad_token_id = _PAD_TOKEN_ID - (self.document_index, self.sample_index, self.shuffle_index) = ( self._build_document_sample_shuffle_indices() ) diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py index 0980ef92d36..185a87e1ac5 100644 --- a/megatron/core/datasets/megatron_dataset.py +++ b/megatron/core/datasets/megatron_dataset.py @@ -2,6 +2,7 @@ import hashlib import json +import warnings from abc import ABC, abstractmethod from collections import OrderedDict from typing import Dict, Iterable, List, Optional, Union @@ -16,6 +17,9 @@ LowLevelDataset = Union[IndexedDataset, Iterable] +_PAD_TOKEN_ID = -1 + + class MegatronDataset(ABC, torch.utils.data.Dataset): """The highest level wrapper class from which all dataset classes should inherit @@ -66,6 +70,49 @@ def __init__( self.unique_description.encode("utf-8"), usedforsecurity=False ).hexdigest() + # Handle pad token id provided by the tokenizer + try: + self._pad_token_id = self.config.tokenizer.pad + except Exception: + self._pad_token_id = _PAD_TOKEN_ID + + # Check if pad token id collides with any other special tokens + try: + _special_tokens_list = [ + v for k, v in self.config.tokenizer.special_tokens_dict.items() if k != "pad_token" + ] + except (AttributeError, IndexError, ValueError): + _special_tokens_list = [] + # If the tokenizer does not have a special_tokens_dict attribute, at least check eos and eod + if not _special_tokens_list: + try: + _special_tokens_list.append(self.config.tokenizer.eos) + except AttributeError: + pass + try: + _special_tokens_list.append(self.config.tokenizer.eod) + except AttributeError: + pass + + if self._pad_token_id in _special_tokens_list: + if self.config.allow_ambiguous_pad_tokens: + # This will break training, but users must explicitly opt-in to this behavior. + warnings.warn( + "The pad token id in the tokenizer collides with another special token id. " + "This may cause instability and lack of covergence during training. " + "Do not ignore this warning if you do not understand the implications. " + ) + else: + # Reset the pad token id to a value which is guaranteed not to be in the dataset. + self._pad_token_id = _PAD_TOKEN_ID + warnings.warn( + "The pad token id in the tokenizer collides with another special token id. " + "This may cause instability and lack of covergence during training. " + "As such, the training flow will avoid masking out any pad tokens already " + "present in the dataset. If you would like to disable this behavior, " + "please provide a tokenizer with a uniquely-defined pad token id." + ) + @staticmethod def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: """Return the number of elements in the underlying low level dataset for the purpose of diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py index 85da1480e10..8e3531b1e86 100644 --- a/megatron/core/datasets/t5_dataset.py +++ b/megatron/core/datasets/t5_dataset.py @@ -286,12 +286,12 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: encoder_input = numpy.array(encoder_input, dtype=numpy.int64) encoder_input = numpy.pad( - encoder_input, (0, length_pads_encoder), constant_values=self.config.tokenizer.pad + encoder_input, (0, length_pads_encoder), constant_values=self._pad_token_id ) decoder_input = numpy.array(decoder_input, dtype=numpy.int64) decoder_input = numpy.pad( - decoder_input, (0, length_pads_decoder), constant_values=self.config.tokenizer.pad + decoder_input, (0, length_pads_decoder), constant_values=self._pad_token_id ) # Create attention and history masks diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 905538ffc9e..fa9a0f6d751 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2902,6 +2902,20 @@ def _add_data_args(parser): help='Path to cache index files when using s3 or msc dataloader') group.add_argument('--mid-level-dataset-surplus', type=float, default=0.005, help='The sample surplus to build for the mid-level datasets(s)') + group.add_argument('--allow-ambiguous-pad-tokens', action='store_true', + help='Whether to prevent pad tokens already present in the dataset ' + 'from being masked out when the pad token incorrectly shares the same id ' + 'with other special tokens in the tokenizer. Note that this argument has ' + 'no effect when the tokenizer correctly provides a unique id for the pad. ' + 'Masking out such ambiguous pad tokens results in training instability. ' + 'Such a scenario is best resolved by fixing the tokenizer; leaving this ' + 'option as False provides a workaround. ' + 'When left to the default of False, any token ids that collide with the ' + 'pad token id - as provided by the tokenizer - will not be masked out of ' + 'the loss calculation: it cannot be determined whether they are truly pad. ' + 'If instead this argument is set, the training flow will treat all tokens ' + 'that share the same id as the pad token as true pad tokens, potentially ' + 'causing severe training instability.') return parser diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index b1aad6819b4..13b7526ca07 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -48,7 +48,7 @@ def build_tokenizer(args, **kwargs): tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) elif args.tokenizer_type == 'HuggingFaceTokenizer': tokenizer = _HuggingFaceTokenizer( - args.tokenizer_model, trust_remote_code = args.trust_remote_code, **kwargs, + args.tokenizer_model, trust_remote_code=args.trust_remote_code, **kwargs ) elif args.tokenizer_type == 'Llama2Tokenizer': assert args.tokenizer_model is not None @@ -78,11 +78,7 @@ def build_tokenizer(args, **kwargs): kwargs = dict() if args.tokenizer_prompt_format == "nvlm-yi-34b": - kwargs = { - "from_slow": True, - "legacy": False, - "add_bos_token": True, - } + kwargs = {"from_slow": True, "legacy": False, "add_bos_token": True} # Currently, only HuggingFace tokenizers are supported. underlying_tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -97,10 +93,7 @@ def build_tokenizer(args, **kwargs): args.force_system_message, ) elif args.tokenizer_type == "SFTTokenizer": - tokenizer = SFTTokenizer( - args.tokenizer_model, - args.sft_tokenizer_prompt_format, - ) + tokenizer = SFTTokenizer(args.tokenizer_model, args.sft_tokenizer_prompt_format) elif args.tokenizer_type == 'NullMultimodalTokenizer': assert args.vocab_size is not None tokenizer = _NullMultimodalTokenizer(args.vocab_size) @@ -144,7 +137,7 @@ def __init__(self, pretrained_model_name_or_path, trust_remote_code=False, **kwa self._tokenizer = transformers.AutoTokenizer.from_pretrained( pretrained_model_name_or_path=pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - **kwargs + **kwargs, ) self._vocab = self._tokenizer.get_vocab() self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} @@ -367,6 +360,10 @@ def detokenize(self, token_ids): def eod(self): return self.eod_id + @property + def eos(self): + return self.eod_id + class _SentencePieceTokenizer(MegatronLegacyTokenizer): """SentencePieceTokenizer-Megatron wrapper""" @@ -573,6 +570,10 @@ def mask(self): def eod(self): return self._eos_id + @property + def eos(self): + return self._eos_id + @property def additional_special_tokens_ids(self): return None @@ -623,6 +624,10 @@ def mask(self): def eod(self): return self.eos_id + @property + def eos(self): + return self.eos_id + @property def additional_special_tokens_ids(self): return None @@ -747,7 +752,7 @@ def bos(self) -> int: @property def eos(self) -> int: return self._eos_id - + @property def pad(self) -> int: return self._pad_id @@ -858,19 +863,30 @@ def mask(self): def eod(self): return self._eod_id + @property + def eos(self): + return self._eod_id + @property def additional_special_tokens_ids(self): return None + class _NullMultimodalTokenizer(MegatronLegacyTokenizer): def __init__(self, vocab_size, image_token=None, image_token_id=None): super().__init__(None, vocab_size=vocab_size) self._vocab_size_without_eod = int(vocab_size) self._eod_id = self._vocab_size_without_eod - from megatron.core.models.multimodal.llava_model import DEFAULT_IMAGE_TOKEN_INDEX, IMAGE_TOKEN + from megatron.core.models.multimodal.llava_model import ( + DEFAULT_IMAGE_TOKEN_INDEX, + IMAGE_TOKEN, + ) + self._image_token = image_token if image_token is not None else IMAGE_TOKEN - self._image_token_id = image_token_id if image_token_id is not None else DEFAULT_IMAGE_TOKEN_INDEX + self._image_token_id = ( + image_token_id if image_token_id is not None else DEFAULT_IMAGE_TOKEN_INDEX + ) def tokenize(self, text): return [int(x) for x in text.split(' ')] @@ -887,7 +903,9 @@ def offsets(self, ids: list[int], text: str) -> list[int]: return offsets def convert_tokens_to_ids(self, tokens): - ids = [(int(t) if t != self._image_token else self._image_token_id) for t in tokens.split(' ')] + ids = [ + (int(t) if t != self._image_token else self._image_token_id) for t in tokens.split(' ') + ] return ids if len(ids) > 1 else ids[0] @property @@ -918,6 +936,10 @@ def mask(self): def eod(self): return self._eod_id + @property + def eos(self): + return self._eod_id + @property def additional_special_tokens_ids(self): return None diff --git a/pretrain_bert.py b/pretrain_bert.py index a5e2728db89..401c32b4cb9 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -172,6 +172,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None masking_use_geometric_distribution=False, classification_head=args.bert_binary_head, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) print_rank_0('> building train, validation, and test datasets ' diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 3b785077664..0c1fd016593 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -190,6 +190,7 @@ def core_gpt_dataset_config_from_args(args): create_attention_mask=args.create_attention_mask_in_dataloader, object_storage_cache_path=args.object_storage_cache_path, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) diff --git a/pretrain_mamba.py b/pretrain_mamba.py index eaf78f7ba9a..8717af11810 100644 --- a/pretrain_mamba.py +++ b/pretrain_mamba.py @@ -186,6 +186,7 @@ def core_gpt_dataset_config_from_args(args): create_attention_mask=args.create_attention_mask_in_dataloader, object_storage_cache_path=args.object_storage_cache_path, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) diff --git a/pretrain_retro.py b/pretrain_retro.py index 100cf605657..63abbac5e39 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -210,6 +210,7 @@ def train_valid_test_datasets_provider(train_valid_test_num_samples): reset_attention_mask=args.reset_attention_mask, eod_mask_loss=args.eod_mask_loss, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) # GPT datasets. diff --git a/pretrain_t5.py b/pretrain_t5.py index 6e6d9ad2c06..e74e7d8809e 100644 --- a/pretrain_t5.py +++ b/pretrain_t5.py @@ -233,6 +233,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples: int): masking_use_longer_ngrams=False, masking_use_geometric_distribution=True, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) print_rank_0('> building train, validation, and test datasets for T5 ...') diff --git a/pretrain_vlm.py b/pretrain_vlm.py index ce1a5102444..524931d2727 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -224,6 +224,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): image_w=args.img_w, preprocess_func=_preprocess_data_for_llava, mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, ) print_rank_0("> building train, validation, and test datasets for multimodal ...") From df41a69aa0a08f4044f7f07fa22f62021b092813 Mon Sep 17 00:00:00 2001 From: Dennis Liu Date: Thu, 16 Oct 2025 02:38:22 -0700 Subject: [PATCH 025/248] ADLR/megatron-lm!4254 - [Dev] Fix dev nightly functional tests. --- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../golden_values_dev_dgxh100_eos.json | 500 +++++++++--------- .../model_config.yaml.tmp | 132 ----- .../golden_values_dev_dgxh100_eos.json | 344 ++++++++++++ .../golden_values_dev_dgxh100_eos.json | 498 ++++++++--------- .../golden_values_dev_dgxh100_eos.json | 344 ++++++++++++ tests/test_utils/recipes/moe.yaml | 4 - 8 files changed, 1189 insertions(+), 637 deletions(-) delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml index 041d35cab11..dc19a6c7698 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 10000 + --save-interval: 25 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml index 7f9613ba222..30c921c6feb 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 10000 + --save-interval: 25 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: local diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json index b3668b31178..f95a91d4ff2 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04748, - "2": 11.03561, - "3": 9.58773, - "4": 9.25819, - "5": 9.52742, - "6": 9.87911, - "7": 9.48366, - "8": 8.93879, - "9": 8.6551, - "10": 9.10915, - "11": 8.51806, - "12": 8.54732, - "13": 8.48144, - "14": 8.05312, - "15": 8.10118, - "16": 8.10344, - "17": 8.08878, - "18": 7.78589, - "19": 8.15794, - "20": 7.88069, - "21": 7.58542, - "22": 7.54895, - "23": 7.4296, - "24": 7.41901, - "25": 7.67277, - "26": 7.07835, - "27": 7.61157, - "28": 7.31513, - "29": 7.49487, - "30": 7.64287, - "31": 7.39102, - "32": 7.59148, - "33": 7.6393, - "34": 7.70086, - "35": 7.2119, - "36": 7.08623, - "37": 7.43064, - "38": 7.18999, - "39": 7.5525, - "40": 7.54961, - "41": 7.49385, - "42": 7.25481, - "43": 7.24066, - "44": 7.42131, - "45": 7.19201, - "46": 6.90547, - "47": 7.30704, - "48": 7.15325, - "49": 7.60504, - "50": 7.04512 + "1": 11.04737, + "2": 11.03581, + "3": 9.58845, + "4": 9.25804, + "5": 9.54964, + "6": 9.8667, + "7": 9.47894, + "8": 8.92828, + "9": 8.66752, + "10": 9.05851, + "11": 8.49951, + "12": 8.52674, + "13": 8.45287, + "14": 7.99202, + "15": 8.05428, + "16": 8.08384, + "17": 8.09398, + "18": 7.76937, + "19": 8.14784, + "20": 7.88774, + "21": 7.58582, + "22": 7.5453, + "23": 7.4272, + "24": 7.42741, + "25": 7.67702, + "26": 7.06883, + "27": 7.61756, + "28": 7.33112, + "29": 7.49469, + "30": 7.6427, + "31": 7.39392, + "32": 7.58751, + "33": 7.64167, + "34": 7.70181, + "35": 7.21084, + "36": 7.08821, + "37": 7.42759, + "38": 7.19136, + "39": 7.55273, + "40": 7.54649, + "41": 7.49652, + "42": 7.25161, + "43": 7.2371, + "44": 7.41599, + "45": 7.19163, + "46": 6.90225, + "47": 7.30109, + "48": 7.14398, + "49": 7.59284, + "50": 7.03691 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802612.0, - "2": 38543592.0, - "3": 38739480.0, - "4": 279954336.0, - "5": 249745312.0, - "6": 268288496.0, - "7": 604756224.0, - "8": 781485184.0, - "9": 636362112.0, - "10": 653025216.0, - "11": 668551168.0, - "12": 765583616.0, - "13": 815362944.0, - "14": 834270656.0, - "15": 755756096.0, - "16": 995153536.0, - "17": 938291584.0, - "18": 721524928.0, - "19": 756173504.0, - "20": 901129600.0, - "21": 721816384.0, - "22": 831311872.0, - "23": 803536768.0, - "24": 628253248.0, - "25": 663895680.0, - "26": 847321664.0, - "27": 828927424.0, - "28": 777678976.0, - "29": 764628608.0, - "30": 781930112.0, - "31": 771767616.0, - "32": 771755392.0, - "33": 586323648.0, - "34": 734207552.0, - "35": 690468480.0, - "36": 485982688.0, - "37": 506506336.0, - "38": 642964160.0, - "39": 661240000.0, - "40": 645048768.0, - "41": 636072704.0, - "42": 491645856.0, - "43": 601942528.0, - "44": 623448960.0, - "45": 539959424.0, - "46": 532669088.0, - "47": 529039680.0, - "48": 504121984.0, - "49": 478344480.0, - "50": 331385728.0 + "1": 38802620.0, + "2": 38543572.0, + "3": 38741428.0, + "4": 283089696.0, + "5": 256049008.0, + "6": 261995024.0, + "7": 601623744.0, + "8": 775170304.0, + "9": 645831808.0, + "10": 728519104.0, + "11": 740861312.0, + "12": 743565504.0, + "13": 893967040.0, + "14": 963173120.0, + "15": 746290304.0, + "16": 938543360.0, + "17": 730738816.0, + "18": 671172416.0, + "19": 922829888.0, + "20": 948314368.0, + "21": 778417216.0, + "22": 938284544.0, + "23": 926223744.0, + "24": 917606784.0, + "25": 918668992.0, + "26": 866192768.0, + "27": 866673856.0, + "28": 856325760.0, + "29": 836978240.0, + "30": 800803136.0, + "31": 790628096.0, + "32": 756030016.0, + "33": 734117312.0, + "34": 734209792.0, + "35": 731364736.0, + "36": 690416960.0, + "37": 679491584.0, + "38": 639823360.0, + "39": 632918272.0, + "40": 610431680.0, + "41": 598315904.0, + "42": 576523840.0, + "43": 406952768.0, + "44": 569968896.0, + "45": 539956736.0, + "46": 365988928.0, + "47": 503877472.0, + "48": 500972512.0, + "49": 478340480.0, + "50": 457181248.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55055331328.0, - "2": 57809321984.0, - "3": 57919823872.0, - "4": 57919823872.0, - "5": 57919823872.0, - "6": 57919823872.0, - "7": 57919823872.0, - "8": 57919823872.0, - "9": 57919823872.0, - "10": 57919823872.0, - "11": 57919823872.0, - "12": 57919823872.0, - "13": 57932275712.0, - "14": 57932275712.0, - "15": 57932275712.0, - "16": 57932275712.0, - "17": 57932275712.0, - "18": 57932275712.0, - "19": 57932275712.0, - "20": 57932275712.0, - "21": 57932275712.0, - "22": 57932275712.0, - "23": 57932275712.0, - "24": 57932275712.0, - "25": 57932275712.0, - "26": 57932275712.0, - "27": 57932275712.0, - "28": 57932275712.0, - "29": 57932275712.0, - "30": 57932275712.0, - "31": 57932275712.0, - "32": 57932275712.0, - "33": 57932275712.0, - "34": 57932275712.0, - "35": 57932275712.0, - "36": 57932275712.0, - "37": 57932275712.0, - "38": 57932275712.0, - "39": 57932275712.0, - "40": 57932275712.0, - "41": 57932275712.0, - "42": 57932275712.0, - "43": 57932275712.0, - "44": 57932275712.0, - "45": 57932275712.0, - "46": 57932275712.0, - "47": 57932275712.0, - "48": 57932275712.0, - "49": 57932275712.0, - "50": 57932275712.0 + "1": 22860046336.0, + "2": 25612713984.0, + "3": 25730244608.0, + "4": 25730244608.0, + "5": 25730244608.0, + "6": 25730244608.0, + "7": 25730244608.0, + "8": 25730244608.0, + "9": 25730244608.0, + "10": 25730244608.0, + "11": 25730244608.0, + "12": 25730244608.0, + "13": 26180298752.0, + "14": 26180298752.0, + "15": 26180298752.0, + "16": 26180298752.0, + "17": 26180298752.0, + "18": 26180298752.0, + "19": 26180298752.0, + "20": 26180298752.0, + "21": 26180298752.0, + "22": 26180298752.0, + "23": 26180298752.0, + "24": 26180298752.0, + "25": 26180298752.0, + "26": 26180298752.0, + "27": 26180298752.0, + "28": 26180298752.0, + "29": 26180298752.0, + "30": 26180298752.0, + "31": 26180298752.0, + "32": 26180298752.0, + "33": 26180298752.0, + "34": 26180298752.0, + "35": 26180298752.0, + "36": 26180298752.0, + "37": 26180298752.0, + "38": 26180298752.0, + "39": 26180298752.0, + "40": 26180298752.0, + "41": 26180298752.0, + "42": 26180298752.0, + "43": 26180298752.0, + "44": 26180298752.0, + "45": 26180298752.0, + "46": 26180298752.0, + "47": 26180298752.0, + "48": 26180298752.0, + "49": 26180298752.0, + "50": 26180298752.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07654, - "2": 11.07406, - "3": 10.53883, - "4": 10.09801, - "5": 9.81156, - "6": 10.06025, - "7": 9.7962, - "8": 9.06987, - "9": 8.86879, - "10": 9.13393, - "11": 8.5017, - "12": 8.54094, - "13": 8.43678, - "14": 7.85637, - "15": 7.99846, - "16": 8.05889, - "17": 8.01134, - "18": 7.73929, - "19": 8.1188, - "20": 7.83458, - "21": 7.53103, - "22": 7.50125, - "23": 7.37135, - "24": 7.37419, - "25": 7.61596, - "26": 7.01586, - "27": 7.55739, - "28": 7.26274, - "29": 7.43991, - "30": 7.58436, - "31": 7.32289, - "32": 7.50362, - "33": 7.56884, - "34": 7.6339, - "35": 7.151, - "36": 7.01725, - "37": 7.35013, - "38": 7.12483, - "39": 7.48708, - "40": 7.47451, - "41": 7.4181, - "42": 7.17557, - "43": 7.15957, - "44": 7.34227, - "45": 7.12176, - "46": 6.82526, - "47": 7.23374, - "48": 7.07893, - "49": 7.5077, - "50": 6.97094 + "1": 11.07644, + "2": 11.07413, + "3": 10.53858, + "4": 10.0983, + "5": 9.8117, + "6": 10.05948, + "7": 9.79869, + "8": 9.0727, + "9": 8.87366, + "10": 9.12893, + "11": 8.49884, + "12": 8.52992, + "13": 8.42414, + "14": 7.84688, + "15": 7.99135, + "16": 8.05047, + "17": 8.0004, + "18": 7.73069, + "19": 8.11023, + "20": 7.82948, + "21": 7.51921, + "22": 7.49606, + "23": 7.37196, + "24": 7.37047, + "25": 7.61349, + "26": 7.01867, + "27": 7.5586, + "28": 7.26599, + "29": 7.44466, + "30": 7.58701, + "31": 7.32783, + "32": 7.50657, + "33": 7.56866, + "34": 7.63344, + "35": 7.15071, + "36": 7.01674, + "37": 7.34958, + "38": 7.12576, + "39": 7.48596, + "40": 7.47304, + "41": 7.41897, + "42": 7.17558, + "43": 7.16122, + "44": 7.34251, + "45": 7.12147, + "46": 6.82911, + "47": 7.23414, + "48": 7.07998, + "49": 7.51108, + "50": 6.9741 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 57.80279, - "2": 1.26321, - "3": 1.18918, - "4": 2.24643, - "5": 2.25191, - "6": 1.80757, - "7": 2.09086, - "8": 1.69153, - "9": 1.81279, - "10": 1.64882, - "11": 1.03476, - "12": 1.03593, - "13": 1.04348, - "14": 1.03841, - "15": 1.04432, - "16": 1.05281, - "17": 1.04826, - "18": 1.04981, - "19": 1.05351, - "20": 1.04668, - "21": 1.05254, - "22": 1.05391, - "23": 1.04635, - "24": 1.05503, - "25": 1.04226, - "26": 1.0684, - "27": 1.04985, - "28": 1.04233, - "29": 1.05036, - "30": 1.06219, - "31": 1.044, - "32": 1.05614, - "33": 1.05729, - "34": 1.05618, - "35": 1.06289, - "36": 1.05761, - "37": 1.05956, - "38": 1.06343, - "39": 1.06848, - "40": 1.06027, - "41": 1.05493, - "42": 1.05258, - "43": 1.04879, - "44": 1.04949, - "45": 1.05964, - "46": 1.04465, - "47": 1.0491, - "48": 1.05387, - "49": 1.05218, - "50": 1.05453 + "1": 57.89597, + "2": 1.02226, + "3": 0.91676, + "4": 1.99588, + "5": 2.00486, + "6": 1.51451, + "7": 1.1193, + "8": 1.44004, + "9": 1.59872, + "10": 0.77647, + "11": 0.76373, + "12": 0.78131, + "13": 0.77869, + "14": 0.76703, + "15": 1.37612, + "16": 0.78402, + "17": 0.78337, + "18": 0.78947, + "19": 0.77286, + "20": 0.76873, + "21": 0.76722, + "22": 0.76847, + "23": 0.77301, + "24": 0.77475, + "25": 0.78165, + "26": 0.81166, + "27": 1.50584, + "28": 0.78435, + "29": 0.79046, + "30": 0.77828, + "31": 0.77039, + "32": 0.78392, + "33": 0.77294, + "34": 0.77717, + "35": 0.78379, + "36": 0.76722, + "37": 0.78405, + "38": 0.78584, + "39": 0.77423, + "40": 0.77729, + "41": 0.78273, + "42": 0.78119, + "43": 0.77474, + "44": 0.79851, + "45": 0.7826, + "46": 0.78586, + "47": 0.77961, + "48": 0.77947, + "49": 0.77944, + "50": 0.77976 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp deleted file mode 100644 index e36d590170d..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml.tmp +++ /dev/null @@ -1,132 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True - NCCL_NVLS_ENABLE: 0 - PYTHONWARNINGS: ignore - NCCL_DEBUG: VERSION -MODEL_ARGS: - # Distributed args - --distributed-timeout-minutes: 60 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --num-virtual-stages-per-pipeline-rank: 4 - --expert-model-parallel-size: 4 - --context-parallel-size: 1 - --expert-tensor-parallel-size: 1 - --use-distributed-optimizer: true - --overlap-grad-reduce: true - --overlap-param-gather: true - --attention-backend: fused - # Training args - --use-mcore-models: true - --sequence-parallel: true - --disable-bias-linear: true - --micro-batch-size: 4 - --global-batch-size: 32 - --train-iters: 50 - --exit-duration-in-mins: 230 - --no-check-for-nan-in-loss-and-grad: true - --no-rope-fusion: true - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native - --manual-gc: true - --manual-gc-interval: 100 - # Transformer Engine args - --transformer-impl: transformer_engine - # Data args - --seq-length: 4096 - --data-cache-path: /workspace/data/cache - --data-path: /workspace/data/gpt3_data/my-gpt3_00_text_document - --vocab-file: /workspace/data/gpt3_data/bpe/vocab.json - --merge-file: /workspace/data/gpt3_data/bpe/merges.txt - --split: 949,50,1 - # Add network size args - --num-layers: 16 - --hidden-size: 1024 - --ffn-hidden-size: 4096 - --num-attention-heads: 32 - --kv-channels: 128 - --max-position-embeddings: 4096 - --position-embedding-type: rope - --rotary-base: 10000 - --make-vocab-size-divisible-by: 3232 - --normalization: RMSNorm - --norm-epsilon: 1e-6 - --swiglu: true - --untie-embeddings-and-output-weights: true - --multi-latent-attention: true - # Comment out the following MTP args to disable MTP - --mtp-num-layers: 1 - --mtp-loss-scaling-factor: 0.1 - # Add regularization args - --attention-dropout: 0.0 - --hidden-dropout: 0.0 - --clip-grad: 1.0 - --weight-decay: 0.1 - --qk-layernorm: true - # Add learning rate args - --lr-warmup-fraction: .01 - --lr: 0.00015 - --min-lr: 1.0e-5 - --lr-decay-style: cosine - --adam-beta1: 0.9 - --adam-beta2: 0.95 - # Add MoE args - --num-experts: 32 - --moe-layer-freq: ([0]*1+[1]*15) - --moe-ffn-hidden-size: 1024 - --moe-shared-expert-intermediate-size: 1024 - --moe-router-load-balancing-type: seq_aux_loss - --moe-router-topk: 4 - --moe-token-dispatcher-type: alltoall - --moe-router-pre-softmax: true - --moe-grouped-gemm: true - --moe-aux-loss-coeff: 1e-4 - --moe-router-group-topk: 2 - --moe-router-num-groups: 4 - --moe-router-topk-scaling-factor: 2.0 - --moe-router-score-function: sigmoid - --moe-router-enable-expert-bias: true - --moe-router-bias-update-rate: 1e-3 - --moe-router-dtype: fp32 - --moe-permute-fusion: true - # Add MLA args - --q-lora-rank: 1536 - --kv-lora-rank: 512 - --qk-head-dim: 128 - --qk-pos-emb-head-dim: 64 - --v-head-dim: 128 - --rotary-scaling-factor: 40 - --mscale: 1.0 - --mscale-all-dim: 1.0 - # Add validation args - --eval-iters: 32 - --eval-interval: 200 - # Add checkpointing args - --save: /opt/megatron-lm/runs/82c8dc72-e955-4033-a246-b61784f57fa7/checkpoints - --load: /tmp/checkpoints/ - --save-interval: 25 - # Add initialization args - --init-method-std: 0.02 - # Add logging args - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-params-norm: true - --log-validation-ppl-to-tensorboard: true - --log-throughput: true - --log-interval: 1 - --logging-level: 40 - --tensorboard-dir: /opt/megatron-lm/runs/82c8dc72-e955-4033-a246-b61784f57fa7/tensorboard - # Add mixed precision args - --bf16: true - --exit-interval: 50 -TEST_TYPE: regular -METRICS: - - "iteration-time" - - "lm loss" - - "num-zeros" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" - - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..d4aa4cb5ee9 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04747, + "2": 11.03489, + "3": 9.59197, + "4": 9.2607, + "5": 9.25316, + "6": 9.70587, + "7": 9.46635, + "8": 9.01114, + "9": 8.72173, + "10": 9.06704, + "11": 8.59397, + "12": 8.5643, + "13": 8.44846, + "14": 7.97921, + "15": 8.04905, + "16": 8.09886, + "17": 8.04172, + "18": 7.76126, + "19": 8.14014, + "20": 7.86027, + "21": 7.54995, + "22": 7.53872, + "23": 7.40693, + "24": 7.40435, + "25": 7.66065, + "26": 7.05772, + "27": 7.59552, + "28": 7.30627, + "29": 7.48007, + "30": 7.63012, + "31": 7.38325, + "32": 7.57843, + "33": 7.62828, + "34": 7.68919, + "35": 7.20168, + "36": 7.07506, + "37": 7.41935, + "38": 7.17961, + "39": 7.54005, + "40": 7.53821, + "41": 7.47888, + "42": 7.24055, + "43": 7.2256, + "44": 7.40803, + "45": 7.1775, + "46": 6.88877, + "47": 7.29436, + "48": 7.13581, + "49": 7.58407, + "50": 7.02865 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802648.0, + "2": 38543564.0, + "3": 38740428.0, + "4": 264349216.0, + "5": 224711328.0, + "6": 359592256.0, + "7": 683584064.0, + "8": 850747136.0, + "9": 781151872.0, + "10": 863934336.0, + "11": 784956928.0, + "12": 787741824.0, + "13": 906642432.0, + "14": 793413952.0, + "15": 724351360.0, + "16": 929182656.0, + "17": 728944832.0, + "18": 715233856.0, + "19": 894586752.0, + "20": 942182208.0, + "21": 712310464.0, + "22": 903670336.0, + "23": 882199552.0, + "24": 867334400.0, + "25": 874751488.0, + "26": 844191104.0, + "27": 813243648.0, + "28": 626785920.0, + "29": 808773120.0, + "30": 602759296.0, + "31": 793783168.0, + "32": 768613888.0, + "33": 721639040.0, + "34": 734472448.0, + "35": 734570880.0, + "36": 703058560.0, + "37": 692109824.0, + "38": 649260992.0, + "39": 620422656.0, + "40": 604143616.0, + "41": 598320448.0, + "42": 573424384.0, + "43": 576846912.0, + "44": 570038144.0, + "45": 540081024.0, + "46": 501251008.0, + "47": 497637664.0, + "48": 494691072.0, + "49": 490977312.0, + "50": 463542304.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 7321331200.0, + "2": 7321333248.0, + "3": 7321333248.0, + "4": 7321333248.0, + "5": 7321333248.0, + "6": 7321333248.0, + "7": 7321333248.0, + "8": 7321333248.0, + "9": 7321333248.0, + "10": 7321333248.0, + "11": 7321333248.0, + "12": 7321333248.0, + "13": 7321333248.0, + "14": 7321333248.0, + "15": 7321333248.0, + "16": 7321333248.0, + "17": 7321333248.0, + "18": 7321333248.0, + "19": 7321333248.0, + "20": 7321333248.0, + "21": 7321333248.0, + "22": 7321333248.0, + "23": 7321333248.0, + "24": 7321333248.0, + "25": 7321333248.0, + "26": 7321333248.0, + "27": 7321333248.0, + "28": 7321333248.0, + "29": 7321333248.0, + "30": 7321333248.0, + "31": 7321333248.0, + "32": 7321333248.0, + "33": 7321333248.0, + "34": 7321333248.0, + "35": 7321333248.0, + "36": 7321333248.0, + "37": 7321333248.0, + "38": 7321333248.0, + "39": 7321333248.0, + "40": 7321333248.0, + "41": 7321333248.0, + "42": 7321333248.0, + "43": 7321333248.0, + "44": 7321333248.0, + "45": 7321333248.0, + "46": 7321333248.0, + "47": 7321333248.0, + "48": 7321333248.0, + "49": 7321333248.0, + "50": 7321333248.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 22198937600.0, + "2": 24950007808.0, + "3": 24950007808.0, + "4": 24950007808.0, + "5": 24950007808.0, + "6": 24950007808.0, + "7": 24950007808.0, + "8": 24950007808.0, + "9": 24950007808.0, + "10": 24950007808.0, + "11": 24950007808.0, + "12": 24950007808.0, + "13": 24950007808.0, + "14": 24950007808.0, + "15": 24950007808.0, + "16": 24950007808.0, + "17": 24950007808.0, + "18": 24950007808.0, + "19": 24950007808.0, + "20": 24950007808.0, + "21": 24950007808.0, + "22": 24950007808.0, + "23": 24950007808.0, + "24": 24950007808.0, + "25": 24950007808.0, + "26": 24950007808.0, + "27": 25072799744.0, + "28": 25343600640.0, + "29": 25625788416.0, + "30": 25625788416.0, + "31": 25628155904.0, + "32": 25707937792.0, + "33": 25707937792.0, + "34": 25707937792.0, + "35": 25707937792.0, + "36": 25707937792.0, + "37": 25707937792.0, + "38": 25707937792.0, + "39": 25707937792.0, + "40": 25707937792.0, + "41": 25707937792.0, + "42": 25707937792.0, + "43": 25707937792.0, + "44": 25707937792.0, + "45": 25707937792.0, + "46": 25707937792.0, + "47": 25707937792.0, + "48": 25707937792.0, + "49": 25707937792.0, + "50": 25707937792.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07742, + "2": 11.07559, + "3": 10.5272, + "4": 10.08877, + "5": 9.81119, + "6": 9.88673, + "7": 9.70278, + "8": 8.9944, + "9": 8.79002, + "10": 9.07171, + "11": 8.44594, + "12": 8.50226, + "13": 8.40983, + "14": 7.83955, + "15": 7.97902, + "16": 8.03361, + "17": 7.99642, + "18": 7.71928, + "19": 8.10116, + "20": 7.82113, + "21": 7.51112, + "22": 7.48906, + "23": 7.35335, + "24": 7.35884, + "25": 7.60836, + "26": 7.01391, + "27": 7.54721, + "28": 7.25644, + "29": 7.43129, + "30": 7.57524, + "31": 7.321, + "32": 7.50218, + "33": 7.56009, + "34": 7.62505, + "35": 7.14234, + "36": 7.0092, + "37": 7.34655, + "38": 7.11926, + "39": 7.4822, + "40": 7.46808, + "41": 7.41272, + "42": 7.1698, + "43": 7.15213, + "44": 7.33728, + "45": 7.11437, + "46": 6.81846, + "47": 7.2282, + "48": 7.07339, + "49": 7.50345, + "50": 6.96783 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 66.41406, + "2": 1.09711, + "3": 0.98871, + "4": 1.29382, + "5": 0.90133, + "6": 0.89235, + "7": 1.14675, + "8": 1.06393, + "9": 0.87141, + "10": 0.88489, + "11": 0.87653, + "12": 0.86844, + "13": 0.87292, + "14": 0.88542, + "15": 0.87413, + "16": 0.8658, + "17": 0.86683, + "18": 0.85604, + "19": 0.87144, + "20": 0.8739, + "21": 0.87412, + "22": 0.8842, + "23": 0.87866, + "24": 0.87817, + "25": 0.87219, + "26": 0.88191, + "27": 0.86283, + "28": 0.85644, + "29": 0.85444, + "30": 0.86821, + "31": 0.8659, + "32": 0.86683, + "33": 0.86547, + "34": 0.86171, + "35": 0.84405, + "36": 0.84744, + "37": 0.84896, + "38": 0.85314, + "39": 0.85693, + "40": 0.83956, + "41": 0.844, + "42": 0.84413, + "43": 0.83996, + "44": 0.84204, + "45": 0.84489, + "46": 0.83423, + "47": 0.83738, + "48": 0.85356, + "49": 0.86096, + "50": 0.85603 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json index daa04af43dd..9ba3e686ab8 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.95004, - "2": 10.9521, - "3": 10.5115, - "4": 9.96454, - "5": 9.93941, + "1": 10.94947, + "2": 10.95236, + "3": 10.50817, + "4": 9.96373, + "5": 9.93907, "6": 9.67273, - "7": 10.20975, - "8": 9.49716, - "9": 9.55902, - "10": 9.79742, - "11": 9.30109, - "12": 9.40483, - "13": 9.39546, - "14": 8.84681, - "15": 9.02444, - "16": 9.07121, - "17": 9.04574, - "18": 8.75678, - "19": 9.18159, - "20": 8.8595, - "21": 8.53503, - "22": 8.55182, - "23": 8.42441, - "24": 8.37608, - "25": 8.64304, - "26": 7.97393, - "27": 8.56806, - "28": 8.19764, - "29": 8.3928, - "30": 8.67283, - "31": 8.289, - "32": 8.43572, - "33": 8.5568, - "34": 8.66018, - "35": 8.07934, - "36": 7.94976, - "37": 8.29565, - "38": 7.98044, - "39": 8.39201, - "40": 8.35513, - "41": 8.31876, - "42": 8.0583, - "43": 8.03283, - "44": 8.24243, - "45": 8.10277, - "46": 7.61696, - "47": 8.15273, - "48": 8.00569, - "49": 8.38688, - "50": 7.81491 + "7": 10.2137, + "8": 9.4963, + "9": 9.56483, + "10": 9.7979, + "11": 9.30107, + "12": 9.40465, + "13": 9.39581, + "14": 8.84796, + "15": 9.02503, + "16": 9.07162, + "17": 9.04638, + "18": 8.75696, + "19": 9.18152, + "20": 8.86295, + "21": 8.5361, + "22": 8.55339, + "23": 8.42711, + "24": 8.37747, + "25": 8.64415, + "26": 7.97441, + "27": 8.56675, + "28": 8.19618, + "29": 8.39325, + "30": 8.67137, + "31": 8.28979, + "32": 8.43623, + "33": 8.55717, + "34": 8.6598, + "35": 8.07929, + "36": 7.94958, + "37": 8.29465, + "38": 7.9784, + "39": 8.39172, + "40": 8.35622, + "41": 8.31635, + "42": 8.06507, + "43": 8.03396, + "44": 8.24146, + "45": 8.1039, + "46": 7.61771, + "47": 8.15375, + "48": 8.00818, + "49": 8.38737, + "50": 7.81612 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403624.0, - "2": 19274194.0, - "3": 19372760.0, - "4": 86525248.0, - "5": 148575568.0, - "6": 145226704.0, - "7": 171879984.0, - "8": 195785248.0, - "9": 164124752.0, - "10": 167684736.0, - "11": 221077344.0, - "12": 200384224.0, - "13": 248872528.0, - "14": 211169424.0, - "15": 214304608.0, - "16": 216075632.0, - "17": 267845984.0, - "18": 170470336.0, - "19": 176865072.0, - "20": 187955392.0, - "21": 225750704.0, - "22": 247396816.0, - "23": 211643856.0, - "24": 205638464.0, - "25": 277022272.0, - "26": 291562304.0, - "27": 225789840.0, - "28": 288202368.0, - "29": 198390384.0, - "30": 213302208.0, - "31": 227204752.0, - "32": 271112416.0, - "33": 231840432.0, - "34": 203575536.0, - "35": 191152368.0, - "36": 222566928.0, - "37": 177810112.0, - "38": 228708544.0, - "39": 211168784.0, - "40": 215603968.0, - "41": 200089440.0, - "42": 228529888.0, - "43": 198782848.0, - "44": 141902272.0, - "45": 181922816.0, - "46": 115369856.0, - "47": 170214176.0, - "48": 137292832.0, - "49": 97654936.0, - "50": 160979632.0 + "1": 19403784.0, + "2": 19274252.0, + "3": 19373794.0, + "4": 89687600.0, + "5": 139124400.0, + "6": 138949920.0, + "7": 170316512.0, + "8": 192665728.0, + "9": 168817872.0, + "10": 156652864.0, + "11": 217935232.0, + "12": 213007792.0, + "13": 228424704.0, + "14": 217442256.0, + "15": 237921408.0, + "16": 225523072.0, + "17": 225458384.0, + "18": 164166928.0, + "19": 164457904.0, + "20": 180124848.0, + "21": 230463232.0, + "22": 230096384.0, + "23": 210054656.0, + "24": 200985472.0, + "25": 248708512.0, + "26": 301000896.0, + "27": 205364384.0, + "28": 270886048.0, + "29": 259695952.0, + "30": 224280720.0, + "31": 244360992.0, + "32": 189382672.0, + "33": 231930816.0, + "34": 206712432.0, + "35": 194319616.0, + "36": 246163408.0, + "37": 193561968.0, + "38": 228822688.0, + "39": 226941728.0, + "40": 196742032.0, + "41": 200179904.0, + "42": 219112640.0, + "43": 186235920.0, + "44": 138763920.0, + "45": 148907984.0, + "46": 109115896.0, + "47": 167015728.0, + "48": 156135104.0, + "49": 91378480.0, + "50": 164099648.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4882187264.0, - "2": 4881607168.0, - "3": 4882283008.0, - "4": 4881322496.0, - "5": 4882174464.0, - "6": 4883177984.0, - "7": 4883252736.0, - "8": 4881774080.0, - "9": 4881443328.0, - "10": 4884319744.0, - "11": 4882319872.0, - "12": 4881232384.0, - "13": 4880836096.0, - "14": 4882124288.0, - "15": 4882108928.0, - "16": 4883384832.0, - "17": 4880466432.0, - "18": 4881518080.0, - "19": 4881734144.0, - "20": 4883215872.0, - "21": 4883534336.0, - "22": 4882774528.0, - "23": 4881818112.0, - "24": 4882441728.0, - "25": 4880546304.0, - "26": 4882178560.0, - "27": 4881892864.0, - "28": 4881869312.0, - "29": 4882979328.0, - "30": 4882715136.0, - "31": 4883084800.0, - "32": 4881436160.0, - "33": 4881766912.0, - "34": 4881406464.0, - "35": 4881531392.0, - "36": 4881479168.0, - "37": 4882455040.0, - "38": 4882054656.0, - "39": 4882005504.0, - "40": 4882743808.0, - "41": 4881211904.0, - "42": 4881378816.0, - "43": 4882133504.0, - "44": 4881860096.0, - "45": 4883165696.0, - "46": 4882168320.0, - "47": 4881526272.0, - "48": 4882125312.0, - "49": 4881533440.0, - "50": 4881598976.0 + "1": 4749337600.0, + "2": 4748343808.0, + "3": 4747997696.0, + "4": 4747469312.0, + "5": 4745943552.0, + "6": 4746412544.0, + "7": 4749017600.0, + "8": 4746762752.0, + "9": 4746394112.0, + "10": 4748286464.0, + "11": 4747621888.0, + "12": 4747802112.0, + "13": 4746905088.0, + "14": 4746850816.0, + "15": 4745785856.0, + "16": 4746166784.0, + "17": 4745583104.0, + "18": 4746839552.0, + "19": 4746510848.0, + "20": 4748375552.0, + "21": 4746974720.0, + "22": 4747533824.0, + "23": 4746271232.0, + "24": 4747352576.0, + "25": 4746148352.0, + "26": 4746516992.0, + "27": 4748668416.0, + "28": 4746871296.0, + "29": 4747913728.0, + "30": 4746131968.0, + "31": 4747437568.0, + "32": 4748567040.0, + "33": 4746713600.0, + "34": 4747983360.0, + "35": 4747450880.0, + "36": 4748372480.0, + "37": 4747075072.0, + "38": 4748749312.0, + "39": 4747972096.0, + "40": 4746372608.0, + "41": 4747513344.0, + "42": 4747912704.0, + "43": 4746867200.0, + "44": 4747612672.0, + "45": 4748287488.0, + "46": 4746935808.0, + "47": 4748032512.0, + "48": 4747668992.0, + "49": 4747238912.0, + "50": 4749120000.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41210470400.0, - "2": 41210470400.0, - "3": 41210470400.0, - "4": 41210470400.0, - "5": 41210470400.0, - "6": 41210470400.0, - "7": 41210470400.0, - "8": 41210470400.0, - "9": 41210470400.0, - "10": 41210470400.0, - "11": 41210470400.0, - "12": 41210470400.0, - "13": 41210470400.0, - "14": 41210470400.0, - "15": 41210470400.0, - "16": 41210470400.0, - "17": 41210470400.0, - "18": 41210470400.0, - "19": 41210470400.0, - "20": 41210470400.0, - "21": 41210470400.0, - "22": 41210470400.0, - "23": 41210470400.0, - "24": 41210470400.0, - "25": 41210470400.0, - "26": 41210470400.0, - "27": 41210470400.0, - "28": 41210470400.0, - "29": 41210470400.0, - "30": 41210470400.0, - "31": 41210470400.0, - "32": 41210470400.0, - "33": 41210470400.0, - "34": 41210470400.0, - "35": 41210470400.0, - "36": 41210470400.0, - "37": 41210470400.0, - "38": 41210470400.0, - "39": 41210470400.0, - "40": 41210470400.0, - "41": 41210470400.0, - "42": 41210470400.0, - "43": 41210470400.0, - "44": 41210470400.0, - "45": 41210470400.0, - "46": 41210470400.0, - "47": 41210470400.0, - "48": 41210470400.0, - "49": 41210470400.0, - "50": 41210470400.0 + "1": 11455561728.0, + "2": 12440659968.0, + "3": 12440659968.0, + "4": 12440659968.0, + "5": 12440659968.0, + "6": 12576563200.0, + "7": 12813101056.0, + "8": 12813101056.0, + "9": 13424891904.0, + "10": 13556338688.0, + "11": 13556338688.0, + "12": 13556338688.0, + "13": 13556338688.0, + "14": 13556338688.0, + "15": 13556338688.0, + "16": 13556338688.0, + "17": 13556338688.0, + "18": 13556338688.0, + "19": 13556338688.0, + "20": 13556338688.0, + "21": 13758310400.0, + "22": 13883041792.0, + "23": 13883041792.0, + "24": 13883041792.0, + "25": 13883041792.0, + "26": 13883041792.0, + "27": 13883041792.0, + "28": 13883041792.0, + "29": 13883041792.0, + "30": 13883041792.0, + "31": 13883041792.0, + "32": 13883041792.0, + "33": 13883041792.0, + "34": 13883041792.0, + "35": 13883041792.0, + "36": 13883041792.0, + "37": 13883041792.0, + "38": 13883041792.0, + "39": 13883041792.0, + "40": 13883041792.0, + "41": 13883041792.0, + "42": 13883041792.0, + "43": 13883041792.0, + "44": 13883041792.0, + "45": 13883041792.0, + "46": 13883041792.0, + "47": 13883041792.0, + "48": 13883041792.0, + "49": 13883041792.0, + "50": 13883041792.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 96.21947, - "2": 1.10023, - "3": 0.96399, - "4": 0.91113, - "5": 1.27509, - "6": 1.00484, - "7": 1.01236, - "8": 1.1739, - "9": 0.89406, - "10": 0.88836, - "11": 0.92033, - "12": 0.88331, - "13": 0.88179, - "14": 0.88307, - "15": 0.88648, - "16": 0.88425, - "17": 0.87155, - "18": 0.87556, - "19": 0.87374, - "20": 0.8744, - "21": 0.86757, - "22": 0.87217, - "23": 0.8736, - "24": 0.86646, - "25": 0.87328, - "26": 0.87121, - "27": 0.85886, - "28": 0.86392, - "29": 0.86385, - "30": 0.86425, - "31": 0.8631, - "32": 0.8617, - "33": 0.86069, - "34": 0.86829, - "35": 0.86837, - "36": 0.86776, - "37": 0.86686, - "38": 0.86359, - "39": 0.8677, - "40": 0.86441, - "41": 0.86179, - "42": 0.86079, - "43": 0.86149, - "44": 0.86222, - "45": 0.86336, - "46": 0.85875, - "47": 0.86219, - "48": 0.86026, - "49": 0.85894, - "50": 0.8544 + "1": 99.19363, + "2": 0.87925, + "3": 0.76355, + "4": 0.70351, + "5": 1.06855, + "6": 0.8083, + "7": 0.79282, + "8": 0.81872, + "9": 0.67053, + "10": 0.64913, + "11": 0.72935, + "12": 0.64945, + "13": 0.64181, + "14": 0.63807, + "15": 0.65651, + "16": 0.66428, + "17": 0.65744, + "18": 0.65362, + "19": 0.65862, + "20": 0.6544, + "21": 0.64288, + "22": 0.64951, + "23": 0.64322, + "24": 0.64447, + "25": 0.63601, + "26": 0.62955, + "27": 0.6244, + "28": 0.62697, + "29": 0.62787, + "30": 0.6295, + "31": 0.63726, + "32": 0.62178, + "33": 0.62521, + "34": 0.62615, + "35": 0.61895, + "36": 0.62424, + "37": 0.62219, + "38": 0.62548, + "39": 0.62127, + "40": 0.62356, + "41": 0.6165, + "42": 0.61786, + "43": 0.61742, + "44": 0.61943, + "45": 0.61884, + "46": 0.62012, + "47": 0.61656, + "48": 0.6143, + "49": 0.61232, + "50": 0.6085 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..29b1b467978 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.0637, + "2": 11.03838, + "3": 9.79196, + "4": 14.17309, + "5": 9.48263, + "6": 9.30356, + "7": 9.27632, + "8": 8.75189, + "9": 8.70462, + "10": 9.04035, + "11": 8.41109, + "12": 8.53109, + "13": 8.43144, + "14": 7.93673, + "15": 8.00837, + "16": 8.08212, + "17": 8.06887, + "18": 7.75236, + "19": 8.13737, + "20": 7.88364, + "21": 7.56605, + "22": 7.55552, + "23": 7.42862, + "24": 7.41252, + "25": 7.67597, + "26": 7.08176, + "27": 7.62221, + "28": 7.32629, + "29": 7.49894, + "30": 7.63447, + "31": 7.3983, + "32": 7.59785, + "33": 7.64396, + "34": 7.70726, + "35": 7.21393, + "36": 7.08985, + "37": 7.42971, + "38": 7.19273, + "39": 7.56041, + "40": 7.55564, + "41": 7.49928, + "42": 7.25988, + "43": 7.24878, + "44": 7.42783, + "45": 7.21045, + "46": 6.91669, + "47": 7.31999, + "48": 7.16939, + "49": 7.62783, + "50": 7.05439 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802064.0, + "2": 38543200.0, + "3": 38744220.0, + "4": 166695072.0, + "5": 394456256.0, + "6": 441303136.0, + "7": 538731776.0, + "8": 680781184.0, + "9": 564001216.0, + "10": 571185472.0, + "11": 624455360.0, + "12": 680622208.0, + "13": 777548288.0, + "14": 717772992.0, + "15": 699100416.0, + "16": 677486208.0, + "17": 645761024.0, + "18": 671155776.0, + "19": 674320512.0, + "20": 891692160.0, + "21": 658833920.0, + "22": 802998016.0, + "23": 756352768.0, + "24": 772904192.0, + "25": 748799104.0, + "26": 771817792.0, + "27": 772312064.0, + "28": 655008000.0, + "29": 783495808.0, + "30": 794511296.0, + "31": 756035712.0, + "32": 535862592.0, + "33": 680633984.0, + "34": 482597312.0, + "35": 671593792.0, + "36": 658959488.0, + "37": 626012736.0, + "38": 614650240.0, + "39": 595183872.0, + "40": 421718816.0, + "41": 557433600.0, + "42": 545065344.0, + "43": 539024064.0, + "44": 544803840.0, + "45": 517934176.0, + "46": 504352736.0, + "47": 497582464.0, + "48": 500981632.0, + "49": 490922656.0, + "50": 472902496.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6294696448.0, + "2": 6295491072.0, + "3": 6296283648.0, + "4": 6297076224.0, + "5": 6297868800.0, + "6": 6298661376.0, + "7": 6294104064.0, + "8": 6294896640.0, + "9": 6295689216.0, + "10": 6296481792.0, + "11": 6294500352.0, + "12": 6295292928.0, + "13": 6296085504.0, + "14": 6296878080.0, + "15": 6297670656.0, + "16": 6298463232.0, + "17": 6299255808.0, + "18": 6300048384.0, + "19": 6300840960.0, + "20": 6301633536.0, + "21": 6302426112.0, + "22": 6303218688.0, + "23": 6304011264.0, + "24": 6304803840.0, + "25": 6305596416.0, + "26": 6306388992.0, + "27": 6307181568.0, + "28": 6307974144.0, + "29": 6308766720.0, + "30": 6309559296.0, + "31": 6310351872.0, + "32": 6311144448.0, + "33": 6311937024.0, + "34": 6312729600.0, + "35": 6313522176.0, + "36": 6314314752.0, + "37": 6315107328.0, + "38": 6315899904.0, + "39": 6316692480.0, + "40": 6317485056.0, + "41": 6318277632.0, + "42": 6319070208.0, + "43": 6319862784.0, + "44": 6320655360.0, + "45": 6321447936.0, + "46": 6322240512.0, + "47": 6323033088.0, + "48": 6323825664.0, + "49": 6324618240.0, + "50": 6325410816.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 46771978240.0, + "2": 49466654720.0, + "3": 51157819392.0, + "4": 51157819392.0, + "5": 51157819392.0, + "6": 51157819392.0, + "7": 51157819392.0, + "8": 51157819392.0, + "9": 51157819392.0, + "10": 51157819392.0, + "11": 51157819392.0, + "12": 51157819392.0, + "13": 51157819392.0, + "14": 51157819392.0, + "15": 51157819392.0, + "16": 51157819392.0, + "17": 51157819392.0, + "18": 51157819392.0, + "19": 51157819392.0, + "20": 51157819392.0, + "21": 51157819392.0, + "22": 51157819392.0, + "23": 51157819392.0, + "24": 51157819392.0, + "25": 51157819392.0, + "26": 51157819392.0, + "27": 51157819392.0, + "28": 51157819392.0, + "29": 51157819392.0, + "30": 51157819392.0, + "31": 51157819392.0, + "32": 51157819392.0, + "33": 51157819392.0, + "34": 51157819392.0, + "35": 51157819392.0, + "36": 51157819392.0, + "37": 51157819392.0, + "38": 51157819392.0, + "39": 51157819392.0, + "40": 51157819392.0, + "41": 51157819392.0, + "42": 51157819392.0, + "43": 51157819392.0, + "44": 51157819392.0, + "45": 51157819392.0, + "46": 51157819392.0, + "47": 51157819392.0, + "48": 51157819392.0, + "49": 51157819392.0, + "50": 51157819392.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04508, + "2": 11.05397, + "3": 10.54505, + "4": 9.99194, + "5": 9.76285, + "6": 9.45507, + "7": 9.54431, + "8": 8.91725, + "9": 8.74784, + "10": 9.04997, + "11": 8.40193, + "12": 8.48288, + "13": 8.36926, + "14": 7.81448, + "15": 7.93865, + "16": 8.02231, + "17": 7.96741, + "18": 7.70552, + "19": 8.09012, + "20": 7.79984, + "21": 7.48241, + "22": 7.49502, + "23": 7.35415, + "24": 7.34793, + "25": 7.60324, + "26": 7.01638, + "27": 7.55495, + "28": 7.24721, + "29": 7.43133, + "30": 7.56633, + "31": 7.31391, + "32": 7.50445, + "33": 7.55658, + "34": 7.62234, + "35": 7.13802, + "36": 7.00593, + "37": 7.33916, + "38": 7.1095, + "39": 7.4736, + "40": 7.45784, + "41": 7.40514, + "42": 7.15986, + "43": 7.14965, + "44": 7.32758, + "45": 7.11892, + "46": 6.81056, + "47": 7.2234, + "48": 7.06789, + "49": 7.503, + "50": 6.9559 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 58.25602, + "2": 1.30671, + "3": 1.18374, + "4": 1.08853, + "5": 3.28347, + "6": 2.13071, + "7": 2.96694, + "8": 1.2675, + "9": 1.07672, + "10": 1.07909, + "11": 2.90834, + "12": 1.06176, + "13": 1.06257, + "14": 1.06668, + "15": 1.08083, + "16": 1.08186, + "17": 1.06861, + "18": 1.07223, + "19": 1.06661, + "20": 1.07354, + "21": 1.07863, + "22": 1.08557, + "23": 1.06174, + "24": 1.07533, + "25": 1.06172, + "26": 1.06344, + "27": 1.05522, + "28": 1.05011, + "29": 1.04098, + "30": 1.04622, + "31": 1.0423, + "32": 1.04292, + "33": 1.06328, + "34": 1.03657, + "35": 1.04963, + "36": 1.05103, + "37": 1.04147, + "38": 1.04912, + "39": 1.04838, + "40": 1.04559, + "41": 1.05462, + "42": 1.05103, + "43": 1.04965, + "44": 1.05296, + "45": 1.05039, + "46": 1.05609, + "47": 1.0476, + "48": 1.053, + "49": 1.04626, + "50": 1.05911 + } + } +} \ No newline at end of file diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index af4b4203803..638ee1a89a3 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -69,8 +69,6 @@ products: - environment: [dev] scope: [nightly] platforms: [dgx_a100, dgx_h100] - - environment: [lts] - scope: [nightly] - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last] products: - environment: [dev] @@ -125,8 +123,6 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - environment: [lts] - scope: [nightly] - test_case: [gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: - environment: [dev] From bf1a5035f1f776b0bded8bffa0a36eeb573a7a8e Mon Sep 17 00:00:00 2001 From: Deyu Fu Date: Thu, 16 Oct 2025 18:44:00 -0700 Subject: [PATCH 026/248] ADLR/megatron-lm!4232 - [DEV] improve muon and layer-wise dist opt unit tests --- .../core/optimizer/layer_wise_optimizer.py | 2 +- tests/unit_tests/test_layer_wise_optimizer.py | 394 +++++++++++ tests/unit_tests/test_muon_optimizer.py | 653 +++++++++++++++--- 3 files changed, 934 insertions(+), 115 deletions(-) create mode 100644 tests/unit_tests/test_layer_wise_optimizer.py diff --git a/megatron/core/optimizer/layer_wise_optimizer.py b/megatron/core/optimizer/layer_wise_optimizer.py index 2bf4e5e613b..620b1a1994e 100644 --- a/megatron/core/optimizer/layer_wise_optimizer.py +++ b/megatron/core/optimizer/layer_wise_optimizer.py @@ -84,7 +84,7 @@ def shard_params(self, optimizers): param_groups += optimizer.param_groups for group in param_groups: params_this_rank = [] - if group["is_expert_parallel"]: + if group.get("is_expert_parallel", False): for p in group["params"]: if expt_dp_idx == get_pg_rank(self.pg_collection.expt_dp): params_this_rank.append(p) diff --git a/tests/unit_tests/test_layer_wise_optimizer.py b/tests/unit_tests/test_layer_wise_optimizer.py new file mode 100644 index 00000000000..3993e217734 --- /dev/null +++ b/tests/unit_tests/test_layer_wise_optimizer.py @@ -0,0 +1,394 @@ +import os +import tempfile + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging.version import Version + +from megatron.core import parallel_state +from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.optimizer.layer_wise_optimizer import LayerWiseDistributedOptimizer +from megatron.core.optimizer.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer import TransformerConfig +from megatron.core.utils import get_pg_size +from tests.unit_tests.test_utilities import Utils + +# Skip all tests in this file for LTS versions +pytestmark = pytest.mark.skipif( + Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), + reason="Skip layer-wise optimizer for LTS test", +) + + +class SimpleModel(nn.Module): + """Simple model for testing LayerWiseDistributedOptimizer. + + Model with 5 layers to ensure more than 8 parameters (10 total: 5 weights + 5 biases). + """ + + def __init__(self, input_size=80, hidden_size=48, output_size=10): + super().__init__() + self.fc1 = nn.Linear(input_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, 32) + self.fc3 = nn.Linear(32, 24) + self.fc4 = nn.Linear(24, 16) + self.fc5 = nn.Linear(16, output_size) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = F.relu(self.fc4(x)) + x = self.fc5(x) + return x + + +class TinyModel(nn.Module): + """Tiny model with only 1 layer (2 parameters: weight and bias).""" + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(10, 5) + + def forward(self, x): + return self.fc1(x) + + +@pytest.mark.skipif( + int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1" +) +class TestLayerWiseOptimizer: + """Test class for LayerWiseDistributedOptimizer with common setup code.""" + + @pytest.fixture(autouse=True) + def setup_and_teardown(self): + """Setup and teardown for each test.""" + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + Utils.initialize_model_parallel() + yield + Utils.destroy_model_parallel() + + def create_model_and_optimizer( + self, + model_class=SimpleModel, + clip_grad=1.0, + model_kwargs=None, + use_layer_wise=True, + copy_from=None, + ): + """Create model, DDP wrapper, and optimizer. + + Args: + model_class: Model class to instantiate + clip_grad: Optional gradient clipping value + model_kwargs: Optional kwargs for model initialization + use_layer_wise: If True, wrap optimizer in LayerWiseDistributedOptimizer; + if False, use get_megatron_optimizer instead (for reference) + + Returns: + tuple: (model, optimizer, pg_collection) + """ + if model_kwargs is None: + model_kwargs = {} + + model = model_class(**model_kwargs).bfloat16().cuda() + model.requires_grad_(True) + + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + if copy_from: + model.module.load_state_dict(copy_from.module.state_dict()) + else: + model.broadcast_params() + + optimizer_config = OptimizerConfig( + optimizer='adam', + lr=0.01, + weight_decay=0.01, + bf16=not use_layer_wise, + use_distributed_optimizer=False, + clip_grad=clip_grad, + ) + + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True) + pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group() + + optimizer = get_megatron_optimizer(optimizer_config, [model]) + if use_layer_wise: + optimizer_config.bf16 = True + optimizer = LayerWiseDistributedOptimizer( + optimizer.chained_optimizers, optimizer_config, pg_collection + ) + return model, optimizer, pg_collection + + def create_reference_model(self, model): + """Create a reference model by cloning the current model.""" + reference_model = type(model.module)().bfloat16().cuda() + reference_model.load_state_dict(model.module.state_dict()) + return reference_model + + def test_basic(self): + """Test basic LayerWiseDistributedOptimizer initialization and step with bf16.""" + model, optimizer, pg_collection = self.create_model_and_optimizer() + + # Verify basic properties + assert optimizer is not None, "Optimizer should not be None" + assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer" + + reference_model = self.create_reference_model(model) + + input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + update_successful, grad_norm, num_zeros = optimizer.step() + + assert update_successful, "Optimizer step should be successful" + + # Verify parameters were updated + params_updated = 0 + for param, ref_param in zip(model.parameters(), reference_model.parameters()): + if not torch.equal(param.data, ref_param.data): + params_updated += 1 + + assert params_updated > 0, "At least some parameters should be updated" + + # Verify all ranks have the same updated parameters (test allgather) + dp_size = get_pg_size(pg_collection.dp_cp) + + if dp_size > 1: + for name, param in model.named_parameters(): + # Gather parameters from all ranks + param_list = [torch.zeros_like(param.data) for _ in range(dp_size)] + torch.distributed.all_gather(param_list, param.data, group=pg_collection.dp_cp) + + # Verify all ranks have the same parameter values + for i in range(1, dp_size): + try: + torch.testing.assert_close(param_list[0], param_list[i]) + except AssertionError as e: + # Append additional context without overwriting the default message + raise AssertionError( + f"Parameter {name} differs between rank 0 and rank {i}. {str(e)}" + ) from None + + def test_get_grad_norm(self): + """Test LayerWiseDistributedOptimizer gradient norm computation.""" + model, optimizer, pg_collection = self.create_model_and_optimizer() + reference_model, reference_optimizer, _ = self.create_model_and_optimizer( + use_layer_wise=False + ) + + # Set same gradients on both models + # note that model is different at this point but we're only testing grad norm here + for param, ref_param in zip(model.parameters(), reference_model.parameters()): + grad_value = torch.randn_like(param) + torch.distributed.broadcast(grad_value, src=0, group=pg_collection.dp_cp) + param.main_grad = grad_value.float().detach() + ref_param.main_grad = grad_value.float().detach() + + # Test get_grad_norm on both optimizers + optimizer.prepare_grads() + grad_norm = optimizer.get_grad_norm() + + reference_optimizer.prepare_grads() + reference_grad_norm = reference_optimizer.get_grad_norm() + + assert grad_norm is not None, "Grad norm should not be None" + assert grad_norm >= 0, "Grad norm should be non-negative" + + # Compare with reference optimizer grad norm + torch.testing.assert_close(grad_norm, reference_grad_norm, rtol=1e-5, atol=1e-5) + + def test_state_dict(self): + """Test LayerWiseDistributedOptimizer state dict save and load.""" + model, optimizer, pg_collection = self.create_model_and_optimizer() + + for param in model.parameters(): + param.grad = torch.randn_like(param) + optimizer.step() + + # Test state_dict + state_dict = optimizer.state_dict() + + # Test load_state_dict + # TODO(deyuf): fix this. not going through get() will cause missing keys like wd_mult + # optimizer.load_state_dict(state_dict) + + def test_save_load_file(self): + """Test LayerWiseDistributedOptimizer save and load state dict to/from file.""" + model, optimizer, pg_collection = self.create_model_and_optimizer() + + for param in model.parameters(): + param.grad = torch.randn_like(param) + optimizer.step() + + # Test save to file + with tempfile.NamedTemporaryFile(delete=False, suffix='.pt') as tmp_file: + temp_filename = tmp_file.name + + try: + optimizer.save_state_dict_to_file(temp_filename) + assert os.path.exists(temp_filename), "State dict file should be created" + + # Test load from file + # TODO(deyuf): fix this. not going through get() will cause missing keys like wd_mult + # optimizer.load_state_dict_from_file(temp_filename) + finally: + # Clean up temporary file + if os.path.exists(temp_filename): + os.remove(temp_filename) + + def test_multiple_optimizers(self): + """Test LayerWiseDistributedOptimizer with multiple chained optimizers. + + This test properly tests allgather functionality with multiple ranks. + """ + model = SimpleModel().bfloat16().cuda() + model.requires_grad_(True) + + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + optimizer_config = OptimizerConfig( + optimizer='adam', lr=0.01, bf16=True, use_distributed_optimizer=False + ) + + # Split parameters into two groups for testing multiple optimizers + params = list(model.parameters()) + mid_point = len(params) // 2 + param_groups_1 = [{'params': params[:mid_point]}] + param_groups_2 = [{'params': params[mid_point:]}] + + # Create two separate base optimizers + base_optimizer_1 = torch.optim.Adam(param_groups_1, lr=optimizer_config.lr) + base_optimizer_2 = torch.optim.Adam(param_groups_2, lr=optimizer_config.lr) + + wrapped_optimizer_1 = FP32Optimizer(base_optimizer_1, optimizer_config, None) + wrapped_optimizer_2 = FP32Optimizer(base_optimizer_2, optimizer_config, None) + + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True) + pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group() + + optimizer = LayerWiseDistributedOptimizer( + [wrapped_optimizer_1, wrapped_optimizer_2], optimizer_config, pg_collection + ) + + assert len(optimizer.chained_optimizers) == 2, "Should have two chained optimizers" + + # Set gradients and test optimizer step - this will trigger allgather + for param in model.parameters(): + param.grad = torch.randn_like(param) + + update_successful, grad_norm, num_zeros = optimizer.step() + + assert update_successful, "Optimizer step should be successful" + + def test_bf16_wrapping(self): + """Test LayerWiseDistributedOptimizer automatically wraps optimizer with bf16.""" + model, optimizer, pg_collection = self.create_model_and_optimizer() + + # Verify bf16 wrapping happened + assert isinstance( + optimizer.chained_optimizers[0], Float16OptimizerWithFloat16Params + ), "Optimizer should be wrapped in Float16OptimizerWithFloat16Params" + + for param in model.parameters(): + param.grad = torch.randn_like(param) + + update_successful, grad_norm, num_zeros = optimizer.step() + + assert update_successful, "Optimizer step should be successful" + + def test_bf16_error(self): + """Test LayerWiseDistributedOptimizer raises error when receiving pre-wrapped Float16 optimizer.""" + model = SimpleModel().bfloat16().cuda() + model.requires_grad_(True) + + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + optimizer_config = OptimizerConfig( + optimizer='adam', lr=0.01, bf16=True, use_distributed_optimizer=False + ) + + # Create base optimizer and manually wrap in Float16 optimizer + param_groups = [{'params': list(model.parameters())}] + base_optimizer = torch.optim.Adam(param_groups, lr=optimizer_config.lr) + wrapped_optimizer = Float16OptimizerWithFloat16Params( + base_optimizer, optimizer_config, None, None + ) + + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True) + pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group() + + # Should raise TypeError when receiving already-wrapped Float16 optimizer + with pytest.raises( + TypeError, match='LayerWiseDistributedOptimizer received Float16 optimizer already' + ): + LayerWiseDistributedOptimizer([wrapped_optimizer], optimizer_config, pg_collection) + + def _run_parameter_update_test(self, model_class=SimpleModel): + """Helper method to test parameter updates with a given model class. + + Args: + model_class: Model class to use for testing + """ + model, optimizer, pg_collection = self.create_model_and_optimizer(model_class=model_class) + + # Create reference model and optimizer using the same function + reference_model, reference_optimizer, _ = self.create_model_and_optimizer( + model_class=model_class, use_layer_wise=False, copy_from=model + ) + + # Set same gradients on both models + for param, ref_param in zip(model.parameters(), reference_model.parameters()): + assert torch.equal(param.data, ref_param.data) + torch.testing.assert_close(param.data, ref_param.data, rtol=1e-5, atol=1e-5) + grad_value = torch.randn_like(param) + torch.distributed.broadcast(grad_value, src=0, group=pg_collection.dp_cp) + param.main_grad = grad_value.clone().detach() + ref_param.main_grad = grad_value.clone().detach() + + optimizer.step() + + # Verify at least some parameters were updated + params_updated = 0 + for param, ref_param in zip(model.parameters(), reference_model.parameters()): + if not torch.equal(param.data, ref_param.data): + params_updated += 1 + + assert params_updated > 0, "At least some parameters should be updated" + + reference_optimizer.step() + + # Verify updated values match reference optimizer + for param, ref_param in zip(model.parameters(), reference_model.parameters()): + torch.testing.assert_close(param.data, ref_param.data, rtol=1e-5, atol=1e-5) + + def test_parameter_updates(self): + """Test LayerWiseDistributedOptimizer actually updates model parameters.""" + self._run_parameter_update_test() + + def test_parameter_updates_insufficient_parameters(self): + """Test LayerWiseDistributedOptimizer when there are insufficient parameters for all ranks. + + Uses a tiny model with only 1 layer (2 parameters: weight and bias). + This will be insufficient when world size > 2. + """ + self._run_parameter_update_test(model_class=TinyModel) diff --git a/tests/unit_tests/test_muon_optimizer.py b/tests/unit_tests/test_muon_optimizer.py index 97d78fe6c70..71d77dc6ecc 100644 --- a/tests/unit_tests/test_muon_optimizer.py +++ b/tests/unit_tests/test_muon_optimizer.py @@ -6,30 +6,39 @@ import torch.nn.functional as F from packaging.version import Version +from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig from megatron.core.optimizer.muon import TensorParallelMuon, get_megatron_muon_optimizer +from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer import TransformerConfig from tests.unit_tests.test_utilities import Utils -from tests.unit_tests.test_utils import _deinit_distributed, _init_distributed + +# Skip all tests in this file for LTS versions +pytestmark = pytest.mark.skipif( + Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), + reason="Skip muon optimizer for LTS test", +) class Net(nn.Module): def __init__(self): super().__init__() self.fc1 = nn.Linear(80, 48) - self.fc2 = nn.Linear(48, 10) + self.fc2 = nn.Linear(48, 32) + self.fc3 = nn.Linear(32, 24) + self.fc4 = nn.Linear(24, 16) + self.fc5 = nn.Linear(16, 10) def forward(self, x): x = F.relu(self.fc1(x)) - x = self.fc2(x) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = F.relu(self.fc4(x)) + x = self.fc5(x) return x -@pytest.mark.skipif( - Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), - reason="Skip muon optimizer for LTS test", -) def test_muon_optimizer_smoke(): """Smoke test for TensorParallelMuon optimizer.""" # Create a simple linear model for testing @@ -92,153 +101,569 @@ def test_muon_optimizer_smoke(): @pytest.mark.skipif( - Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), - reason="Skip muon optimizer for LTS test", + int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1" ) -def test_get_megatron_muon_optimizer_smoke(): - """Smoke test for get_megatron_muon_optimizer function.""" - world = int(os.getenv('WORLD_SIZE', '1')) - rank = int(os.getenv('RANK', '0')) - - # Setup: distributed, model - _init_distributed(world, rank) - Utils.initialize_model_parallel() +class TestMuonOptimizerMultiRank: + """Test class for Muon optimizer with multi-rank setup.""" + + @pytest.fixture(autouse=True) + def setup_and_teardown(self): + """Setup and teardown for each test.""" + Utils.initialize_model_parallel() + yield + Utils.destroy_model_parallel() + + def create_ddp_model(self, model): + """Wrap model in DDP. + + Args: + model: Model to wrap + + Returns: + DDP-wrapped model + """ + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) + return DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + def test_get_megatron_muon_optimizer_smoke(self): + """Smoke test for get_megatron_muon_optimizer function.""" + model = Net().bfloat16().cuda() + model.requires_grad_(True) + model = self.create_ddp_model(model) + + # Ensure all parameters require gradients + for param in model.parameters(): + assert param.requires_grad, "All parameters should require gradients" + + # Create optimizer config for Muon + optimizer_config = OptimizerConfig( + optimizer='muon', # This will be changed internally to 'adam' for non-linear params + lr=0.01, + weight_decay=0.01, + bf16=True, + use_distributed_optimizer=False, # Muon doesn't support distributed optimizer + muon_momentum=0.95, + muon_use_nesterov=True, + muon_fp32_matmul_prec="medium", + muon_num_ns_steps=5, + muon_scale_mode="spectral", + muon_tp_mode="duplicated", + ) + + # Test creating the optimizer + optimizer = get_megatron_muon_optimizer( + config=optimizer_config, + model_chunks=[model], + use_gloo_process_groups=True, + layer_wise_distributed_optimizer=False, + ) + + # Test basic properties + assert optimizer is not None, "Optimizer should not be None" + assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups" + assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer" + assert len(optimizer.chained_optimizers) >= 1, "Should have at least one chained optimizer" + + # Test forward and backward pass + input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + # Store original parameters + original_params = {} + for name, param in model.named_parameters(): + original_params[name] = param.data.clone() + + # Test optimizer step + optimizer.step() + + # Verify at least some parameters were updated + params_updated = 0 + for name, param in model.named_parameters(): + if not torch.equal(param.data, original_params[name]): + params_updated += 1 + + assert params_updated > 0, "At least some parameters should be updated after optimizer step" + + # Test zero_grad + optimizer.zero_grad() + for param in model.parameters(): + assert param.grad is None or torch.all( + param.grad == 0 + ), f"Gradients should be zeroed for all parameters" + + # Test state_dict and load_state_dict + state_dict = optimizer.state_dict() + assert isinstance(state_dict, list), "State dict should be a list" + + # Load state dict should not raise error + optimizer.load_state_dict(state_dict) + + def test_get_megatron_muon_optimizer_validation(self): + """Test validation logic for get_megatron_muon_optimizer.""" + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.bfloat16, device='cuda') + model.requires_grad_(True) + model = self.create_ddp_model(model) + + # Test 1: Distributed optimizer should raise exception + optimizer_config_dist = OptimizerConfig( + optimizer='muon', + lr=0.01, + bf16=True, + use_distributed_optimizer=True, # This should cause an exception + ) + + with pytest.raises(Exception, match='muon with dist optimizer is not supported'): + get_megatron_muon_optimizer(config=optimizer_config_dist, model_chunks=[model]) + + # Test 2: FP16 should raise exception + optimizer_config_fp16 = OptimizerConfig( + optimizer='muon', + lr=0.01, + fp16=True, # This should cause an exception + use_distributed_optimizer=False, + ) + + with pytest.raises(Exception, match='muon with fp16 is not supported'): + get_megatron_muon_optimizer(config=optimizer_config_fp16, model_chunks=[model]) + + # Test 3: Invalid num_ns_steps should raise exception + optimizer_config_invalid_ns = OptimizerConfig( + optimizer='muon', + lr=0.01, + bf16=True, + use_distributed_optimizer=False, + muon_num_ns_steps=0, # This should cause an exception + ) + + with pytest.raises(ValueError, match='num_ns_steps must be at least 1'): + get_megatron_muon_optimizer(config=optimizer_config_invalid_ns, model_chunks=[model]) + + def test_get_megatron_muon_optimizer_layer_wise(self): + """Test get_megatron_muon_optimizer with layer-wise distributed optimizer.""" + model = Net().bfloat16().cuda() + model.requires_grad_(True) + model = self.create_ddp_model(model) + + optimizer_config = OptimizerConfig( + optimizer='muon', + lr=0.01, + weight_decay=0.01, + bf16=True, + use_distributed_optimizer=False, + muon_momentum=0.95, + muon_use_nesterov=True, + muon_fp32_matmul_prec="medium", + muon_num_ns_steps=5, + muon_scale_mode="spectral", + muon_tp_mode="duplicated", + ) + + # Test with layer_wise_distributed_optimizer=True + optimizer = get_megatron_muon_optimizer( + config=optimizer_config, + model_chunks=[model], + use_gloo_process_groups=True, + layer_wise_distributed_optimizer=True, + ) + + # Verify it's a LayerWiseDistributedOptimizer + from megatron.core.optimizer.layer_wise_optimizer import LayerWiseDistributedOptimizer + + assert isinstance( + optimizer, LayerWiseDistributedOptimizer + ), "Should return LayerWiseDistributedOptimizer" + + # Test forward and backward pass + input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + # Test optimizer step + update_successful, grad_norm, num_zeros = optimizer.step() + + assert update_successful, "Optimizer step should be successful" + assert grad_norm is not None or grad_norm is None, "Grad norm should be returned" + + +@pytest.mark.parametrize("mode", ["duplicated", "blockwise", "distributed"]) +def test_muon_optimizer_different_modes_single_rank(mode): + """Test TensorParallelMuon optimizer with different modes on single rank. + + When TP size is 1, all modes should produce the same result. + """ + # Set random seed for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) - # Create a model with both linear and non-linear parameters - model = Net().bfloat16().cuda() + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda') model.requires_grad_(True) + model.weight.data.normal_(0, 0.02) - # Wrap in DDP (required for Megatron optimizer) - ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) - model = DistributedDataParallel( - TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + momentum_beta=0.95, + weight_decay=0.0, # Disable weight decay for deterministic comparison + num_ns_steps=5, + pg_collection=None, + mode=mode, ) - # Ensure all parameters require gradients - for param in model.parameters(): - assert param.requires_grad, "All parameters should require gradients" + # Use fixed input for deterministic results + torch.manual_seed(42) + input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda') + + output = model(input_tensor) + loss = output.sum() + loss.backward() - # Create optimizer config for Muon - optimizer_config = OptimizerConfig( - optimizer='muon', # This will be changed internally to 'adam' for non-linear params + original_weight = model.weight.data.clone() + optimizer.step() + + # Verify weight was updated + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with mode={mode}" + + +@pytest.mark.skipif( + int(os.getenv('WORLD_SIZE', '1')) == 1, reason="Multi-rank test requires WORLD_SIZE > 1" +) +class TestMuonOptimizerMultiRankTP: + """Test class for Muon optimizer with multi-rank and tensor parallel setup.""" + + @pytest.fixture(autouse=True) + def setup_and_teardown(self): + """Setup and teardown for each test with tensor parallel.""" + world = int(os.getenv('WORLD_SIZE', '1')) + Utils.initialize_model_parallel(tensor_model_parallel_size=min(world, 2)) + yield + Utils.destroy_model_parallel() + + def create_tp_model_and_optimizer(self, mode): + """Create model with TP and optimizer. + + Args: + mode: Muon optimizer mode + + Returns: + tuple: (model, optimizer, pg_collection) + """ + rank = int(os.getenv('RANK', '0')) + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + + # Create model with partition_dim for TP + torch.manual_seed(42 + rank) + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.normal_(0, 0.02) + model.weight.partition_dim = 0 # Set partition dimension for TP + + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + momentum_beta=0.95, + weight_decay=0.0, + num_ns_steps=5, + pg_collection=pg_collection, + mode=mode, + ) + + return model, optimizer + + @pytest.mark.parametrize("mode", ["duplicated", "distributed"]) + def test_muon_optimizer_modes_multirank_same_result(self, mode): + """Test that duplicated and distributed modes produce same results with TP > 1.""" + model, optimizer = self.create_tp_model_and_optimizer(mode) + + # Use fixed input for deterministic results + torch.manual_seed(42) + input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda') + + output = model(input_tensor) + loss = output.sum() + loss.backward() + + original_weight = model.weight.data.clone() + optimizer.step() + + # Verify weight was updated + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with mode={mode}" + + def test_muon_optimizer_blockwise_mode_different_result(self): + """Test that blockwise mode produces different results than duplicated/distributed with TP > 1.""" + model, optimizer = self.create_tp_model_and_optimizer("blockwise") + + # Use fixed input for deterministic results + torch.manual_seed(42) + input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda') + + output = model(input_tensor) + loss = output.sum() + loss.backward() + + original_weight = model.weight.data.clone() + optimizer.step() + + # Verify weight was updated + assert not torch.equal( + model.weight.data, original_weight + ), "Weight should be updated with mode=blockwise" + + +@pytest.mark.parametrize( + "coefficient_type_and_steps", [("simple", 3), ("quintic", 5), ("polar_express", 8)] +) +def test_muon_optimizer_coefficient_types(coefficient_type_and_steps): + """Test TensorParallelMuon optimizer with different coefficient types.""" + model = torch.nn.Linear(80, 40, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + optimizer = TensorParallelMuon( + params=[model.weight], lr=0.01, - weight_decay=0.01, - bf16=True, - use_distributed_optimizer=False, # Muon doesn't support distributed optimizer - muon_momentum=0.95, - muon_use_nesterov=True, - muon_fp32_matmul_prec="medium", - muon_num_ns_steps=5, - muon_scale_mode="spectral", - muon_tp_mode="duplicated", + coefficient_type=coefficient_type_and_steps[0], + num_ns_steps=coefficient_type_and_steps[1], + pg_collection=None, + mode="duplicated", ) - # Test creating the optimizer - optimizer = get_megatron_muon_optimizer( - config=optimizer_config, - model_chunks=[model], - use_gloo_process_groups=True, - layer_wise_distributed_optimizer=False, - ) + input_tensor = torch.randn(16, 80, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() - # Test basic properties - assert optimizer is not None, "Optimizer should not be None" - assert hasattr(optimizer, 'param_groups'), "Optimizer should have param_groups" - assert hasattr(optimizer, 'chained_optimizers'), "Should be a ChainedOptimizer" - assert len(optimizer.chained_optimizers) >= 1, "Should have at least one chained optimizer" + original_weight = model.weight.data.clone() + optimizer.step() - # Test forward and backward pass - input_tensor = torch.randn(16, 80, dtype=torch.bfloat16, device='cuda') + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with coefficient_type={coefficient_type_and_steps[0]} and num_ns_steps={coefficient_type_and_steps[1]}" + + +@pytest.mark.parametrize("scale_mode", ["spectral", "unit_rms_norm", "shape_scaling"]) +def test_muon_optimizer_scale_modes(scale_mode): + """Test TensorParallelMuon optimizer with different scale modes.""" + model = torch.nn.Linear(60, 30, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + scale_mode=scale_mode, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", + ) + + input_tensor = torch.randn(16, 60, dtype=torch.float32, device='cuda') output = model(input_tensor) loss = output.sum() loss.backward() - # Store original parameters - original_params = {} - for name, param in model.named_parameters(): - original_params[name] = param.data.clone() + original_weight = model.weight.data.clone() + optimizer.step() - # Test optimizer step + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with scale_mode={scale_mode}" + + +@pytest.mark.parametrize("use_nesterov", [True, False]) +def test_muon_optimizer_nesterov(use_nesterov): + """Test TensorParallelMuon optimizer with and without Nesterov momentum.""" + model = torch.nn.Linear(50, 25, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + momentum_beta=0.9, + use_nesterov=use_nesterov, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", + ) + + input_tensor = torch.randn(16, 50, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + original_weight = model.weight.data.clone() optimizer.step() - # Verify at least some parameters were updated - params_updated = 0 - for name, param in model.named_parameters(): - if not torch.equal(param.data, original_params[name]): - params_updated += 1 + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with use_nesterov={use_nesterov}" - assert params_updated > 0, "At least some parameters should be updated after optimizer step" - # Test zero_grad - optimizer.zero_grad() - for param in model.parameters(): - assert param.grad is None or torch.all( - param.grad == 0 - ), f"Gradients should be zeroed for all parameters" +def test_muon_optimizer_multiple_steps(): + """Test TensorParallelMuon optimizer across multiple optimization steps.""" + model = torch.nn.Linear(100, 50, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) - # Test state_dict and load_state_dict - state_dict = optimizer.state_dict() - assert isinstance(state_dict, list), "State dict should be a list" + optimizer = TensorParallelMuon( + params=[model.weight], + lr=0.01, + momentum_beta=0.95, + weight_decay=0.01, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", + ) - # Load state dict should not raise error - optimizer.load_state_dict(state_dict) + weights_history = [model.weight.data.clone()] - _deinit_distributed() + for i in range(3): + input_tensor = torch.randn(32, 100, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + optimizer.step() + optimizer.zero_grad() + weights_history.append(model.weight.data.clone()) -@pytest.mark.skipif( - Version(os.getenv('NVIDIA_PYTORCH_VERSION', "24.01")) <= Version("25.05"), - reason="Skip muon optimizer for LTS test", -) -def test_get_megatron_muon_optimizer_validation(): - """Test validation logic for get_megatron_muon_optimizer.""" - world = int(os.getenv('WORLD_SIZE', '1')) - rank = int(os.getenv('RANK', '0')) + # Verify weights changed at each step + for i in range(len(weights_history) - 1): + assert not torch.equal( + weights_history[i], weights_history[i + 1] + ), f"Weight should change at step {i}" - # Setup: distributed, model - _init_distributed(world, rank) - Utils.initialize_model_parallel() - # Create a simple model - model = torch.nn.Linear(100, 50, bias=False, dtype=torch.bfloat16, device='cuda') +@pytest.mark.skip(reason="split qkv is not implemented yet") +def test_muon_optimizer_qkv_split(): + """Test TensorParallelMuon optimizer with QKV splitting.""" + # Create a model with QKV-like parameter + qkv_size = 3 * 64 * 16 # Combined Q, K, V dimensions, 16 heads x 64 per head + hidden_size = 1024 + model = torch.nn.Linear(hidden_size, qkv_size, bias=False, dtype=torch.float32, device='cuda') model.requires_grad_(True) - ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=False) - model = DistributedDataParallel( - TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + model.weight.data.fill_(1.0) + + # Mark parameter as QKV + model.weight.is_qkv = True + + # QKV split shapes: [Q_size, K_size, V_size] + qkv_split_shapes = (64, 64, 64) + + # Test with split_qkv=True + optimizer_split = TensorParallelMuon( + params=[model.weight], + lr=0.01, + split_qkv=True, + is_qkv_fn=lambda p: getattr(p, 'is_qkv', False), + qkv_split_shapes=qkv_split_shapes, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", ) - # Test 1: Distributed optimizer should raise exception - optimizer_config_dist = OptimizerConfig( - optimizer='muon', + input_tensor = torch.randn(16, hidden_size, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + original_weight = model.weight.data.clone() + optimizer_split.step() + weight_with_split = model.weight.data.clone() + + assert not torch.equal( + weight_with_split, original_weight + ), "QKV weight should be updated with split_qkv=True" + + # Reset model and test with split_qkv=False + model.weight.data.fill_(1.0) + optimizer_no_split = TensorParallelMuon( + params=[model.weight], lr=0.01, - bf16=True, - use_distributed_optimizer=True, # This should cause an exception + split_qkv=False, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", ) - with pytest.raises(Exception, match='muon with dist optimizer is not supported'): - get_megatron_muon_optimizer(config=optimizer_config_dist, model_chunks=[model]) + output = model(input_tensor) + loss = output.sum() + loss.backward() + + optimizer_no_split.step() + weight_without_split = model.weight.data.clone() + + assert not torch.equal( + weight_without_split, original_weight + ), "QKV weight should be updated with split_qkv=False" + + # Ensure the two results are different + assert not torch.equal( + weight_with_split, weight_without_split + ), "Weights should be different between split_qkv=True and split_qkv=False" + - # Test 2: FP16 should raise exception - optimizer_config_fp16 = OptimizerConfig( - optimizer='muon', +def test_muon_optimizer_extra_scale_factor(): + """Test TensorParallelMuon optimizer with different extra_scale_factor values.""" + model = torch.nn.Linear(80, 40, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + optimizer = TensorParallelMuon( + params=[model.weight], lr=0.01, - fp16=True, # This should cause an exception - use_distributed_optimizer=False, + extra_scale_factor=2.0, + num_ns_steps=5, + pg_collection=None, + mode="duplicated", ) - with pytest.raises(Exception, match='muon with fp16 is not supported'): - get_megatron_muon_optimizer(config=optimizer_config_fp16, model_chunks=[model]) + input_tensor = torch.randn(16, 80, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() + + original_weight = model.weight.data.clone() + optimizer.step() + + assert not torch.equal( + model.weight.data, original_weight + ), "Weight should be updated with extra_scale_factor" - # Test 3: Invalid num_ns_steps should raise exception - optimizer_config_invalid_ns = OptimizerConfig( - optimizer='muon', + +@pytest.mark.parametrize("num_ns_steps", [5, 15, 25]) +def test_muon_optimizer_num_ns_steps(num_ns_steps): + """Test TensorParallelMuon optimizer with different numbers of Newton-Schulz steps.""" + model = torch.nn.Linear(60, 30, bias=False, dtype=torch.float32, device='cuda') + model.requires_grad_(True) + model.weight.data.fill_(1.0) + + optimizer = TensorParallelMuon( + params=[model.weight], lr=0.01, - bf16=True, - use_distributed_optimizer=False, - muon_num_ns_steps=0, # This should cause an exception + coefficient_type="quintic", + num_ns_steps=num_ns_steps, + pg_collection=None, + mode="duplicated", ) - with pytest.raises(ValueError, match='num_ns_steps must be at least 1'): - get_megatron_muon_optimizer(config=optimizer_config_invalid_ns, model_chunks=[model]) + input_tensor = torch.randn(16, 60, dtype=torch.float32, device='cuda') + output = model(input_tensor) + loss = output.sum() + loss.backward() - _deinit_distributed() + original_weight = model.weight.data.clone() + optimizer.step() + + assert not torch.equal( + model.weight.data, original_weight + ), f"Weight should be updated with num_ns_steps={num_ns_steps}" From 6802bec8c8a704dccbddc87e32b20a1476b37869 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 20 Oct 2025 10:35:39 -0700 Subject: [PATCH 027/248] ADLR/megatron-lm!4296 - [DEV] fix(MoE): Fix parameter initialization --- megatron/core/transformer/dot_product_attention.py | 2 ++ megatron/core/transformer/moe/router.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py index 2a958722e46..2a6ac65a685 100644 --- a/megatron/core/transformer/dot_product_attention.py +++ b/megatron/core/transformer/dot_product_attention.py @@ -126,6 +126,8 @@ def __init__( ) ), ) + if config.perform_initialization: + self.softmax_offset = config.init_method(self.softmax_offset) else: raise ValueError("Softmax type not supported") diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 068d680c798..7fa4692ef2f 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -66,6 +66,8 @@ def reset_parameters(self): """Reset the router parameters.""" if self.config.perform_initialization: self.config.init_method(self.weight) + if self.bias is not None: + self.config.init_method(self.bias) self.weight.data = self.weight.data.to(dtype=self.config.params_dtype) setattr(self.weight, 'sequence_parallel', self.config.sequence_parallel) if self.bias is not None: From a6ca591e61acefc904d00793f7fb8c34c8fbb206 Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Tue, 21 Oct 2025 06:37:25 +0000 Subject: [PATCH 028/248] [Dev] Fix attention output gate for TE2.8 --- megatron/core/transformer/attention.py | 60 ++++++++++++------- .../core/transformer/transformer_config.py | 4 ++ .../unit_tests/transformer/test_attention.py | 2 + 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 870b8ad1c40..655955d8ed0 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -1098,10 +1098,9 @@ def get_query_key_value_tensors( num_query_heads_per_group = ( self.num_attention_heads_per_partition // self.num_query_groups_per_partition ) + num_qkv_heads_per_group = num_query_heads_per_group + 2 if output_gate: - num_qkv_heads_per_group = 2 * num_query_heads_per_group + 2 - else: - num_qkv_heads_per_group = num_query_heads_per_group + 2 + num_qkv_heads_per_group += num_query_heads_per_group # If no output gate: [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] # If have output gate: [sq, b, hp] --> [sq, b, ng, (2 * np/ng + 2) * hn] @@ -1112,31 +1111,43 @@ def get_query_key_value_tensors( mixed_qkv = mixed_qkv.view(*new_tensor_shape) # Split the tensor into query, gate, key, and value. - # If no output gate: [sq, b, ng, (np/ng + 2) * hn] - # --> [sq, b, ng, np/ng * hn], None, [sq, b, ng, hn], [sq, b, ng, hn] - # If have output gate: [sq, b, ng, (2 * np/ng + 2) * hn] - # --> [sq, b, ng, np/ng * hn], [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] - split_arg_list = [ - num_query_heads_per_group * self.hidden_size_per_attention_head, - num_query_heads_per_group * self.hidden_size_per_attention_head if output_gate else 0, - self.hidden_size_per_attention_head, - self.hidden_size_per_attention_head, - ] - - # Return unsplit mixed_qkv and split_arg_list - if not split_qkv: - return mixed_qkv, split_arg_list + if output_gate: + if not split_qkv: + raise ValueError("split_qkv not supported for gated attention yet.") + # If have output gate: [sq, b, ng, (2 * np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], [sq, b, ng, np/ng * hn], + # [sq, b, ng, hn], [sq, b, ng, hn] + split_arg_list = [ + num_query_heads_per_group * self.hidden_size_per_attention_head, + num_query_heads_per_group * self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] - if SplitAlongDim is not None: - (query, gate, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) + if SplitAlongDim is not None: + (query, gate, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) + else: + (query, gate, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) else: - (query, gate, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) + # If no output gate: [sq, b, ng, (np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], None, [sq, b, ng, hn], [sq, b, ng, hn] + split_arg_list = [ + num_query_heads_per_group * self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + # Return unsplit mixed_qkv and split_arg_list + if not split_qkv: + return mixed_qkv, split_arg_list + + if SplitAlongDim is not None: + (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) + else: + (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) # Query [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) - if output_gate: - # Gate [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] - gate = gate.reshape(gate.size(0), gate.size(1), -1, self.hidden_size_per_attention_head) if self.q_layernorm is not None: query = self.q_layernorm(query) @@ -1148,7 +1159,10 @@ def get_query_key_value_tensors( self.run_realtime_tests() if output_gate: + # Gate [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + gate = gate.reshape(*gate.shape[:2], -1, self.hidden_size_per_attention_head) return query, key, value, gate + return query, key, value def backward_dw(self) -> NoReturn: diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 8b36425ca2a..89fbcb36f5a 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1337,6 +1337,10 @@ def __post_init__(self): "apply_rope_fusion is not available. Please install TE >= 1.4." ) + if self.fused_single_qkv_rope: + if self.attention_output_gate: + raise ValueError("fused_single_qkv_rope does not support gated attention for now.") + if self.multi_latent_attention and self.rotary_interleaved: raise ValueError("rotary_interleaved does not work with multi_latent_attention.") diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py index 419fc17ca0a..23858937c72 100644 --- a/tests/unit_tests/transformer/test_attention.py +++ b/tests/unit_tests/transformer/test_attention.py @@ -96,6 +96,8 @@ def test_fused_rope_gpu_forward(self, rotary_interleaved, fused_qkv_rope): self.parallel_attention.config.apply_rope_fusion = True if rotary_interleaved and not is_te_min_version("2.3.0"): pytest.skip("Only TE >= 2.3.0 supports interleaved fused RoPE.") + if fused_qkv_rope and self.parallel_attention.config.attention_output_gate: + pytest.skip("Fused QKV RoPE does not support gated attention for now.") if fused_qkv_rope and not HAVE_FUSED_QKV_ROPE: pytest.skip("Fused QKV RoPE not available.") self.parallel_attention.config.rotary_interleaved = rotary_interleaved From 78433248157486b881af7b359af7cb649728ef92 Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Tue, 21 Oct 2025 07:27:39 +0000 Subject: [PATCH 029/248] Cleanup UT and toml --- docker/Dockerfile.ci.dev | 12 +++++++++--- pyproject.toml | 17 +++++++++-------- .../transformer/test_multi_token_prediction.py | 7 +++---- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index b3295697f31..1357dc5219d 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -31,8 +31,10 @@ COPY megatron/core/__init__.py /workspace/megatron/core/ COPY megatron/core/package_info.py /workspace/megatron/core/ RUN --mount=type=cache,target=/root/.cache/uv \ bash -ex <<"EOF" + export NVTE_CUDA_ARCHS="80;90;100" uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages - uv sync --extra dev --extra mlm --link-mode copy --locked --all-groups \ + uv sync --only-group build + uv sync --extra dev --extra mlm --link-mode copy --locked \ --no-install-package torch \ --no-install-package torchvision \ --no-install-package triton \ @@ -51,15 +53,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \ EOF # Install DeepEP +COPY docker/patches/deepep.patch /workspace/deepep.patch RUN bash -ex <<"EOF" cd /workspace - uv pip install nvidia-nvshmem-cu12 + uv pip install nvidia-nvshmem-cu13 pushd /opt/venv/lib/python3.12/site-packages/nvidia/nvshmem/lib/ ln -s libnvshmem_host.so.3 libnvshmem_host.so popd git clone --branch v1.2.1 https://github.com/deepseek-ai/DeepEP.git - TORCH_CUDA_ARCH_LIST="9.0" uv pip install --no-build-isolation -v DeepEP/. + pushd DeepEP + patch -p1 < /workspace/deepep.patch + popd + TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. rm -rf DeepEP EOF diff --git a/pyproject.toml b/pyproject.toml index 0a0fb9993f5..91d66de7efe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. [build-system] -requires = ["setuptools<80.0.0", "pybind11"] +requires = ["setuptools>=80.0.0", "pybind11", "packaging>=24.2"] build-backend = "setuptools.build_meta" [tool.setuptools] @@ -76,9 +76,8 @@ dev = [ "setuptools<80.0.0", "mamba-ssm~=2.2", "causal-conv1d~=1.5", - "flash-linear-attention~=0.3.2", "nv-grouped-gemm~=1.1", - "transformer-engine[pytorch]>=2.6.0a0,<2.8.0", + "transformer-engine[pytorch]>=2.7.0a0,<2.9.0", "nvidia-resiliency-ext>=0.4.0a0,<0.5.0", "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'", "megatron-energon[av_decode]~=6.0", @@ -86,6 +85,8 @@ dev = [ "flashinfer-python", "wget", "onnxscript", + "flash-linear-attention~=0.3.2", + "emerging_optimizers" ] lts = [ @@ -130,6 +131,7 @@ build = [ "pybind11", "Cython>=3.0.0", "torch", + "nvidia-mathdx", # for TE ] linting = [ "ruff~=0.9.0", @@ -140,17 +142,16 @@ linting = [ ] ci = ["python-gitlab", "slack-sdk", "pandas"] flash_mla = ["flash_mla"] -emerging_optimizers = ["emerging_optimizers"] [tool.uv] default-groups = ["linting", "build", "test"] no-build-isolation-package = [ - "transformer-engine", - "transformer-engine-torch", - "mamba-ssm", "causal-conv1d", "nv-grouped-gemm", "flash_mla", + "mamba-ssm", + "transformer-engine", + "transformer-engine-torch", ] link-mode = "copy" conflicts = [[{ extra = "lts" }, { extra = "dev" }]] @@ -167,8 +168,8 @@ override-dependencies = [ flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] +transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.8" } # on `release_v2.8` -# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "0289e76380088358a584d809faf69effab1a7cda" } # on `release_v2.7 emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev= "fb1add873e7851ec34b48581ea1b15761b73d189"} [tool.isort] diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py index 65e58eaede4..9b9d2c67881 100644 --- a/tests/unit_tests/transformer/test_multi_token_prediction.py +++ b/tests/unit_tests/transformer/test_multi_token_prediction.py @@ -101,7 +101,7 @@ def test_constructor_local(self, tp): assert num_weights == 15216 * config.mtp_num_layers @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available") - @pytest.mark.parametrize(('tp', 'cp'), [(1, 1), (1, 2), (2, 1), (2, 2)]) + @pytest.mark.parametrize(('tp', 'cp'), [(1, 1), (2, 1), (2, 2)]) def test_constructor_ues_te(self, tp, cp): """Test basic construction of MTP module.""" torch.manual_seed(_SEED) @@ -249,7 +249,7 @@ def get_batch(self, seq_length, micro_batch_size): not HAVE_TE or not is_te_min_version("2.1.0"), reason="grouped_gemm requires TransformerEngine >= 2.1.0", ) - @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (1, 2), (2, 1), (2, 2)]) + @pytest.mark.parametrize(("tp", "cp"), [(2, 1), (2, 2)]) def test_sharded_state_dict(self, tp, cp): """Test MTP with different tensor parallel sizes.""" args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size) @@ -268,9 +268,8 @@ def test_sharded_state_dict(self, tp, cp): not HAVE_TE or not is_te_min_version("2.1.0"), reason="grouped_gemm requires TransformerEngine >= 2.1.0", ) - @pytest.mark.parametrize("full_recompute", [False, True]) @pytest.mark.parametrize( - ("tp", "cp"), [(1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (2, 4), (4, 1), (4, 2)] + ("tp", "cp", "full_recompute"), [(1, 1, False), (1, 4, False), (2, 4, False), (4, 1, True)] ) def test_forward_backward(self, tmp_path_dist_ckpt, tp, cp, full_recompute): """Test MTP forward and backward with gptmodel.""" From a48a416c14760bbe606b45e88f9798fd8b288654 Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Tue, 21 Oct 2025 08:26:25 +0000 Subject: [PATCH 030/248] Clean up functional test --- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../golden_values_dev_dgx_h100.json | 110 ++-- .../golden_values_dev_dgxh100_coreweave.json | 500 +++++++++--------- .../golden_values_dev_dgxh100_eos.json | 500 +++++++++--------- .../model_config.yaml | 5 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 5 +- .../golden_values_dev_dgxh100_coreweave.json | 498 ++++++++--------- .../golden_values_dev_dgxh100_eos.json | 498 ++++++++--------- .../model_config.yaml | 11 +- .../golden_values_dev_dgxh100_coreweave.json | 344 ------------ .../golden_values_dev_dgxh100_eos.json | 344 ------------ .../model_config.yaml | 5 +- tests/test_utils/recipes/moe.yaml | 23 +- 16 files changed, 1089 insertions(+), 1770 deletions(-) delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml index dc19a6c7698..2354ecd7fd9 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 25 + --save-interval: 10000 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine @@ -57,4 +57,4 @@ MODEL_ARGS: --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true --use-tp-pp-dp-mapping: true -TEST_TYPE: ckpt-resume +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml index 30c921c6feb..7c0a103200a 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 25 + --save-interval: 10000 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: local @@ -56,4 +56,4 @@ MODEL_ARGS: --disable-bias-linear: true --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true -TEST_TYPE: ckpt-resume +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json index 5f29261761b..d06b2b1d235 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json @@ -4,17 +4,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 11.04737, - "5": 9.52647, - "10": 9.05826, - "15": 8.04442, - "20": 7.89153, - "25": 7.67197, - "30": 7.64284, - "35": 7.2114, - "40": 7.54179, - "45": 7.18472, - "50": 7.03329 + "1": 11.04748, + "5": 9.53583, + "10": 9.0567, + "15": 8.0476, + "20": 7.89868, + "25": 7.67579, + "30": 7.64391, + "35": 7.20998, + "40": 7.54446, + "45": 7.18755, + "50": 7.03602 } }, "num-zeros": { @@ -22,17 +22,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 38802604.0, - "5": 252879712.0, - "10": 728514944.0, - "15": 711699968.0, - "20": 992357632.0, - "25": 884068160.0, - "30": 794514496.0, - "35": 712491648.0, - "40": 588410624.0, - "45": 521081920.0, - "50": 432013312.0 + "1": 38802612.0, + "5": 259189728.0, + "10": 744257088.0, + "15": 724250816.0, + "20": 989207936.0, + "25": 843170688.0, + "30": 775645184.0, + "35": 737655104.0, + "40": 607288512.0, + "45": 514790528.0, + "50": 303063296.0 } }, "mem-allocated-bytes": { @@ -58,17 +58,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 22860046336.0, - "5": 25729300480.0, - "10": 25729300480.0, - "15": 25888860160.0, - "20": 25888860160.0, - "25": 25888860160.0, - "30": 25888860160.0, - "35": 25888860160.0, - "40": 26620856320.0, - "45": 26620856320.0, - "50": 26620856320.0 + "1": 55055331328.0, + "5": 57918455808.0, + "10": 57918455808.0, + "15": 57931390976.0, + "20": 57931390976.0, + "25": 57931390976.0, + "30": 57931390976.0, + "35": 58003226624.0, + "40": 58003226624.0, + "45": 58234208256.0, + "50": 58780934144.0 } }, "mtp_1 loss": { @@ -76,17 +76,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 11.07644, - "5": 9.81173, - "10": 9.12712, - "15": 7.99147, - "20": 7.82967, - "25": 7.61319, - "30": 7.58479, - "35": 7.15178, - "40": 7.47349, - "45": 7.12034, - "50": 6.97212 + "1": 11.07654, + "5": 9.81154, + "10": 9.127, + "15": 7.99077, + "20": 7.82933, + "25": 7.61578, + "30": 7.58618, + "35": 7.15224, + "40": 7.47408, + "45": 7.11969, + "50": 6.9735 } }, "iteration-time": { @@ -94,17 +94,17 @@ "end_step": 50, "step_interval": 5, "values": { - "1": 59.91943, - "5": 2.44769, - "10": 1.07968, - "15": 1.04699, - "20": 0.93032, - "25": 0.92301, - "30": 0.92916, - "35": 0.94157, - "40": 0.95917, - "45": 0.94382, - "50": 0.94866 + "1": 71.27032, + "5": 2.09978, + "10": 1.95997, + "15": 1.137, + "20": 1.13455, + "25": 1.13415, + "30": 1.15078, + "35": 1.15064, + "40": 1.13889, + "45": 1.124, + "50": 1.13608 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json index 17dce39fb21..0f2637a9511 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04737, - "2": 11.03581, - "3": 9.58839, - "4": 9.258, - "5": 9.52647, - "6": 9.907, - "7": 9.48764, - "8": 8.94128, - "9": 8.65518, - "10": 9.05826, - "11": 8.49585, - "12": 8.52509, - "13": 8.4535, - "14": 7.97148, - "15": 8.04442, - "16": 8.08093, - "17": 8.08585, - "18": 7.76263, - "19": 8.14979, - "20": 7.89153, - "21": 7.57836, - "22": 7.54353, - "23": 7.43311, - "24": 7.42342, - "25": 7.67197, - "26": 7.07162, - "27": 7.6134, - "28": 7.31484, - "29": 7.48975, - "30": 7.64284, - "31": 7.39141, - "32": 7.58528, - "33": 7.6358, - "34": 7.69534, - "35": 7.2114, - "36": 7.08322, - "37": 7.42539, - "38": 7.18849, - "39": 7.5489, - "40": 7.54179, - "41": 7.48887, - "42": 7.24738, - "43": 7.2341, - "44": 7.41462, - "45": 7.18472, - "46": 6.89672, - "47": 7.30005, - "48": 7.14262, - "49": 7.58803, - "50": 7.03329 + "1": 11.04748, + "2": 11.03561, + "3": 9.58774, + "4": 9.25819, + "5": 9.53583, + "6": 9.8804, + "7": 9.48247, + "8": 8.93575, + "9": 8.65813, + "10": 9.0567, + "11": 8.49445, + "12": 8.52444, + "13": 8.45239, + "14": 7.97323, + "15": 8.0476, + "16": 8.07971, + "17": 8.09081, + "18": 7.76437, + "19": 8.14892, + "20": 7.89868, + "21": 7.59371, + "22": 7.54743, + "23": 7.43222, + "24": 7.4302, + "25": 7.67579, + "26": 7.06929, + "27": 7.62041, + "28": 7.32495, + "29": 7.49042, + "30": 7.64391, + "31": 7.39435, + "32": 7.58789, + "33": 7.64037, + "34": 7.69778, + "35": 7.20998, + "36": 7.08538, + "37": 7.42584, + "38": 7.18804, + "39": 7.55054, + "40": 7.54446, + "41": 7.49287, + "42": 7.24937, + "43": 7.23587, + "44": 7.41595, + "45": 7.18755, + "46": 6.89949, + "47": 7.29966, + "48": 7.14134, + "49": 7.58963, + "50": 7.03602 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802604.0, - "2": 38543572.0, - "3": 38739364.0, - "4": 283087744.0, - "5": 252879712.0, - "6": 261986800.0, - "7": 595325120.0, - "8": 778328192.0, - "9": 667827904.0, - "10": 728514944.0, - "11": 718857664.0, - "12": 778200448.0, - "13": 884592256.0, - "14": 846830080.0, - "15": 711699968.0, - "16": 929099456.0, - "17": 718131072.0, - "18": 690071360.0, - "19": 944853824.0, - "20": 992357632.0, - "21": 794133440.0, - "22": 909975808.0, - "23": 919936064.0, - "24": 895588736.0, - "25": 884068160.0, - "26": 869339392.0, - "27": 857232640.0, - "28": 846888320.0, - "29": 821245440.0, - "30": 794514496.0, - "31": 756025600.0, - "32": 762315264.0, - "33": 759280512.0, - "34": 759373696.0, - "35": 712491648.0, - "36": 677834240.0, - "37": 632307392.0, - "38": 614655616.0, - "39": 607761664.0, - "40": 588410624.0, - "41": 582593792.0, - "42": 573377664.0, - "43": 579927552.0, - "44": 579405952.0, - "45": 521081920.0, - "46": 488627232.0, - "47": 478708544.0, - "48": 475807040.0, - "49": 450025824.0, - "50": 432013312.0 + "1": 38802612.0, + "2": 38543592.0, + "3": 38739528.0, + "4": 279937824.0, + "5": 259189728.0, + "6": 271446400.0, + "7": 604773504.0, + "8": 768892544.0, + "9": 645824128.0, + "10": 744257088.0, + "11": 718888576.0, + "12": 746732544.0, + "13": 871990976.0, + "14": 821645632.0, + "15": 724250816.0, + "16": 932241472.0, + "17": 648958912.0, + "18": 649120000.0, + "19": 925992960.0, + "20": 989207936.0, + "21": 819324096.0, + "22": 736955072.0, + "23": 910497792.0, + "24": 876716672.0, + "25": 843170688.0, + "26": 809573824.0, + "27": 854086912.0, + "28": 802857664.0, + "29": 805523328.0, + "30": 775645184.0, + "31": 771754624.0, + "32": 749733696.0, + "33": 718385216.0, + "34": 724771200.0, + "35": 737655104.0, + "36": 690419968.0, + "37": 673203456.0, + "38": 627239552.0, + "39": 614047168.0, + "40": 607288512.0, + "41": 582590592.0, + "42": 548211200.0, + "43": 532740640.0, + "44": 554239168.0, + "45": 514790528.0, + "46": 350258560.0, + "47": 472420128.0, + "48": 453788736.0, + "49": 440597216.0, + "50": 303063296.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 22860046336.0, - "2": 25612713984.0, - "3": 25729300480.0, - "4": 25729300480.0, - "5": 25729300480.0, - "6": 25729300480.0, - "7": 25729300480.0, - "8": 25729300480.0, - "9": 25729300480.0, - "10": 25729300480.0, - "11": 25729300480.0, - "12": 25729300480.0, - "13": 25888860160.0, - "14": 25888860160.0, - "15": 25888860160.0, - "16": 25888860160.0, - "17": 25888860160.0, - "18": 25888860160.0, - "19": 25888860160.0, - "20": 25888860160.0, - "21": 25888860160.0, - "22": 25888860160.0, - "23": 25888860160.0, - "24": 25888860160.0, - "25": 25888860160.0, - "26": 25888860160.0, - "27": 25888860160.0, - "28": 25888860160.0, - "29": 25888860160.0, - "30": 25888860160.0, - "31": 25888860160.0, - "32": 25888860160.0, - "33": 25888860160.0, - "34": 25888860160.0, - "35": 25888860160.0, - "36": 25888860160.0, - "37": 25888860160.0, - "38": 26026612736.0, - "39": 26610898944.0, - "40": 26620856320.0, - "41": 26620856320.0, - "42": 26620856320.0, - "43": 26620856320.0, - "44": 26620856320.0, - "45": 26620856320.0, - "46": 26620856320.0, - "47": 26620856320.0, - "48": 26620856320.0, - "49": 26620856320.0, - "50": 26620856320.0 + "1": 55055331328.0, + "2": 57809321984.0, + "3": 57918455808.0, + "4": 57918455808.0, + "5": 57918455808.0, + "6": 57918455808.0, + "7": 57918455808.0, + "8": 57918455808.0, + "9": 57918455808.0, + "10": 57918455808.0, + "11": 57918455808.0, + "12": 57918455808.0, + "13": 57931390976.0, + "14": 57931390976.0, + "15": 57931390976.0, + "16": 57931390976.0, + "17": 57931390976.0, + "18": 57931390976.0, + "19": 57931390976.0, + "20": 57931390976.0, + "21": 57931390976.0, + "22": 57931390976.0, + "23": 57931390976.0, + "24": 57931390976.0, + "25": 57931390976.0, + "26": 57931390976.0, + "27": 57931390976.0, + "28": 57931390976.0, + "29": 57931390976.0, + "30": 57931390976.0, + "31": 57931390976.0, + "32": 58003226624.0, + "33": 58003226624.0, + "34": 58003226624.0, + "35": 58003226624.0, + "36": 58003226624.0, + "37": 58003226624.0, + "38": 58003226624.0, + "39": 58003226624.0, + "40": 58003226624.0, + "41": 58003226624.0, + "42": 58003226624.0, + "43": 58003226624.0, + "44": 58183614464.0, + "45": 58234208256.0, + "46": 58555555840.0, + "47": 58555555840.0, + "48": 58555555840.0, + "49": 58555555840.0, + "50": 58780934144.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07644, - "2": 11.07413, - "3": 10.53865, - "4": 10.09826, - "5": 9.81173, - "6": 10.07241, - "7": 9.79857, - "8": 9.07114, - "9": 8.86995, - "10": 9.12712, - "11": 8.49873, - "12": 8.53173, - "13": 8.426, - "14": 7.84827, - "15": 7.99147, - "16": 8.05097, - "17": 8.00164, - "18": 7.73164, - "19": 8.11121, - "20": 7.82967, - "21": 7.52376, - "22": 7.49787, - "23": 7.3697, - "24": 7.37154, - "25": 7.61319, - "26": 7.02025, - "27": 7.559, - "28": 7.26735, - "29": 7.44367, - "30": 7.58479, - "31": 7.32416, - "32": 7.50469, - "33": 7.56964, - "34": 7.63474, - "35": 7.15178, - "36": 7.01748, - "37": 7.34976, - "38": 7.12419, - "39": 7.4868, - "40": 7.47349, - "41": 7.42217, - "42": 7.17743, - "43": 7.16238, - "44": 7.34394, - "45": 7.12034, - "46": 6.82708, - "47": 7.235, - "48": 7.07985, - "49": 7.51123, - "50": 6.97212 + "1": 11.07654, + "2": 11.07406, + "3": 10.53881, + "4": 10.09803, + "5": 9.81154, + "6": 10.06236, + "7": 9.79762, + "8": 9.07117, + "9": 8.87049, + "10": 9.127, + "11": 8.49853, + "12": 8.53046, + "13": 8.42444, + "14": 7.847, + "15": 7.99077, + "16": 8.05015, + "17": 8.00064, + "18": 7.73104, + "19": 8.11087, + "20": 7.82933, + "21": 7.52501, + "22": 7.49916, + "23": 7.36982, + "24": 7.37235, + "25": 7.61578, + "26": 7.02029, + "27": 7.56014, + "28": 7.2681, + "29": 7.44399, + "30": 7.58618, + "31": 7.32468, + "32": 7.50596, + "33": 7.5715, + "34": 7.63581, + "35": 7.15224, + "36": 7.01784, + "37": 7.35163, + "38": 7.12551, + "39": 7.48656, + "40": 7.47408, + "41": 7.42096, + "42": 7.17595, + "43": 7.16059, + "44": 7.34289, + "45": 7.11969, + "46": 6.82753, + "47": 7.23525, + "48": 7.08042, + "49": 7.51043, + "50": 6.9735 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 63.23561, - "2": 1.12406, - "3": 0.92471, - "4": 1.95991, - "5": 1.98896, - "6": 1.40765, - "7": 1.83926, - "8": 1.3919, - "9": 1.58886, - "10": 0.76479, - "11": 0.74358, - "12": 0.74438, - "13": 0.75457, - "14": 0.74884, - "15": 0.7437, - "16": 0.81872, - "17": 0.74739, - "18": 0.75196, - "19": 0.76647, - "20": 0.74522, - "21": 0.73871, - "22": 0.73978, - "23": 0.73654, - "24": 0.73919, - "25": 0.73709, - "26": 0.78913, - "27": 0.75434, - "28": 0.7477, - "29": 0.73673, - "30": 0.74952, - "31": 0.75513, - "32": 0.74212, - "33": 0.74433, - "34": 0.74812, - "35": 0.7512, - "36": 0.74822, - "37": 0.74176, - "38": 0.7553, - "39": 0.77677, - "40": 0.76693, - "41": 0.76205, - "42": 0.76182, - "43": 0.76665, - "44": 0.76169, - "45": 0.74735, - "46": 0.74195, - "47": 0.75025, - "48": 0.74129, - "49": 0.74367, - "50": 0.74308 + "1": 69.29797, + "2": 1.7261, + "3": 1.40981, + "4": 2.16562, + "5": 1.7862, + "6": 1.7469, + "7": 1.96688, + "8": 1.97301, + "9": 1.74665, + "10": 1.69613, + "11": 1.02979, + "12": 1.02408, + "13": 1.03261, + "14": 1.02432, + "15": 1.0529, + "16": 1.04491, + "17": 1.03693, + "18": 1.03399, + "19": 1.03627, + "20": 1.02284, + "21": 1.01667, + "22": 1.02932, + "23": 1.03591, + "24": 1.03466, + "25": 1.03149, + "26": 1.03165, + "27": 1.02342, + "28": 1.03777, + "29": 1.04061, + "30": 1.05641, + "31": 1.02382, + "32": 1.01775, + "33": 1.03039, + "34": 1.03693, + "35": 1.03153, + "36": 1.02699, + "37": 1.02756, + "38": 1.02919, + "39": 1.01773, + "40": 1.03491, + "41": 1.03152, + "42": 1.03035, + "43": 1.0221, + "44": 1.05201, + "45": 1.02579, + "46": 1.02798, + "47": 1.03857, + "48": 1.02772, + "49": 1.0408, + "50": 1.03745 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json index f95a91d4ff2..b3668b31178 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04737, - "2": 11.03581, - "3": 9.58845, - "4": 9.25804, - "5": 9.54964, - "6": 9.8667, - "7": 9.47894, - "8": 8.92828, - "9": 8.66752, - "10": 9.05851, - "11": 8.49951, - "12": 8.52674, - "13": 8.45287, - "14": 7.99202, - "15": 8.05428, - "16": 8.08384, - "17": 8.09398, - "18": 7.76937, - "19": 8.14784, - "20": 7.88774, - "21": 7.58582, - "22": 7.5453, - "23": 7.4272, - "24": 7.42741, - "25": 7.67702, - "26": 7.06883, - "27": 7.61756, - "28": 7.33112, - "29": 7.49469, - "30": 7.6427, - "31": 7.39392, - "32": 7.58751, - "33": 7.64167, - "34": 7.70181, - "35": 7.21084, - "36": 7.08821, - "37": 7.42759, - "38": 7.19136, - "39": 7.55273, - "40": 7.54649, - "41": 7.49652, - "42": 7.25161, - "43": 7.2371, - "44": 7.41599, - "45": 7.19163, - "46": 6.90225, - "47": 7.30109, - "48": 7.14398, - "49": 7.59284, - "50": 7.03691 + "1": 11.04748, + "2": 11.03561, + "3": 9.58773, + "4": 9.25819, + "5": 9.52742, + "6": 9.87911, + "7": 9.48366, + "8": 8.93879, + "9": 8.6551, + "10": 9.10915, + "11": 8.51806, + "12": 8.54732, + "13": 8.48144, + "14": 8.05312, + "15": 8.10118, + "16": 8.10344, + "17": 8.08878, + "18": 7.78589, + "19": 8.15794, + "20": 7.88069, + "21": 7.58542, + "22": 7.54895, + "23": 7.4296, + "24": 7.41901, + "25": 7.67277, + "26": 7.07835, + "27": 7.61157, + "28": 7.31513, + "29": 7.49487, + "30": 7.64287, + "31": 7.39102, + "32": 7.59148, + "33": 7.6393, + "34": 7.70086, + "35": 7.2119, + "36": 7.08623, + "37": 7.43064, + "38": 7.18999, + "39": 7.5525, + "40": 7.54961, + "41": 7.49385, + "42": 7.25481, + "43": 7.24066, + "44": 7.42131, + "45": 7.19201, + "46": 6.90547, + "47": 7.30704, + "48": 7.15325, + "49": 7.60504, + "50": 7.04512 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802620.0, - "2": 38543572.0, - "3": 38741428.0, - "4": 283089696.0, - "5": 256049008.0, - "6": 261995024.0, - "7": 601623744.0, - "8": 775170304.0, - "9": 645831808.0, - "10": 728519104.0, - "11": 740861312.0, - "12": 743565504.0, - "13": 893967040.0, - "14": 963173120.0, - "15": 746290304.0, - "16": 938543360.0, - "17": 730738816.0, - "18": 671172416.0, - "19": 922829888.0, - "20": 948314368.0, - "21": 778417216.0, - "22": 938284544.0, - "23": 926223744.0, - "24": 917606784.0, - "25": 918668992.0, - "26": 866192768.0, - "27": 866673856.0, - "28": 856325760.0, - "29": 836978240.0, - "30": 800803136.0, - "31": 790628096.0, - "32": 756030016.0, - "33": 734117312.0, - "34": 734209792.0, - "35": 731364736.0, - "36": 690416960.0, - "37": 679491584.0, - "38": 639823360.0, - "39": 632918272.0, - "40": 610431680.0, - "41": 598315904.0, - "42": 576523840.0, - "43": 406952768.0, - "44": 569968896.0, - "45": 539956736.0, - "46": 365988928.0, - "47": 503877472.0, - "48": 500972512.0, - "49": 478340480.0, - "50": 457181248.0 + "1": 38802612.0, + "2": 38543592.0, + "3": 38739480.0, + "4": 279954336.0, + "5": 249745312.0, + "6": 268288496.0, + "7": 604756224.0, + "8": 781485184.0, + "9": 636362112.0, + "10": 653025216.0, + "11": 668551168.0, + "12": 765583616.0, + "13": 815362944.0, + "14": 834270656.0, + "15": 755756096.0, + "16": 995153536.0, + "17": 938291584.0, + "18": 721524928.0, + "19": 756173504.0, + "20": 901129600.0, + "21": 721816384.0, + "22": 831311872.0, + "23": 803536768.0, + "24": 628253248.0, + "25": 663895680.0, + "26": 847321664.0, + "27": 828927424.0, + "28": 777678976.0, + "29": 764628608.0, + "30": 781930112.0, + "31": 771767616.0, + "32": 771755392.0, + "33": 586323648.0, + "34": 734207552.0, + "35": 690468480.0, + "36": 485982688.0, + "37": 506506336.0, + "38": 642964160.0, + "39": 661240000.0, + "40": 645048768.0, + "41": 636072704.0, + "42": 491645856.0, + "43": 601942528.0, + "44": 623448960.0, + "45": 539959424.0, + "46": 532669088.0, + "47": 529039680.0, + "48": 504121984.0, + "49": 478344480.0, + "50": 331385728.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 22860046336.0, - "2": 25612713984.0, - "3": 25730244608.0, - "4": 25730244608.0, - "5": 25730244608.0, - "6": 25730244608.0, - "7": 25730244608.0, - "8": 25730244608.0, - "9": 25730244608.0, - "10": 25730244608.0, - "11": 25730244608.0, - "12": 25730244608.0, - "13": 26180298752.0, - "14": 26180298752.0, - "15": 26180298752.0, - "16": 26180298752.0, - "17": 26180298752.0, - "18": 26180298752.0, - "19": 26180298752.0, - "20": 26180298752.0, - "21": 26180298752.0, - "22": 26180298752.0, - "23": 26180298752.0, - "24": 26180298752.0, - "25": 26180298752.0, - "26": 26180298752.0, - "27": 26180298752.0, - "28": 26180298752.0, - "29": 26180298752.0, - "30": 26180298752.0, - "31": 26180298752.0, - "32": 26180298752.0, - "33": 26180298752.0, - "34": 26180298752.0, - "35": 26180298752.0, - "36": 26180298752.0, - "37": 26180298752.0, - "38": 26180298752.0, - "39": 26180298752.0, - "40": 26180298752.0, - "41": 26180298752.0, - "42": 26180298752.0, - "43": 26180298752.0, - "44": 26180298752.0, - "45": 26180298752.0, - "46": 26180298752.0, - "47": 26180298752.0, - "48": 26180298752.0, - "49": 26180298752.0, - "50": 26180298752.0 + "1": 55055331328.0, + "2": 57809321984.0, + "3": 57919823872.0, + "4": 57919823872.0, + "5": 57919823872.0, + "6": 57919823872.0, + "7": 57919823872.0, + "8": 57919823872.0, + "9": 57919823872.0, + "10": 57919823872.0, + "11": 57919823872.0, + "12": 57919823872.0, + "13": 57932275712.0, + "14": 57932275712.0, + "15": 57932275712.0, + "16": 57932275712.0, + "17": 57932275712.0, + "18": 57932275712.0, + "19": 57932275712.0, + "20": 57932275712.0, + "21": 57932275712.0, + "22": 57932275712.0, + "23": 57932275712.0, + "24": 57932275712.0, + "25": 57932275712.0, + "26": 57932275712.0, + "27": 57932275712.0, + "28": 57932275712.0, + "29": 57932275712.0, + "30": 57932275712.0, + "31": 57932275712.0, + "32": 57932275712.0, + "33": 57932275712.0, + "34": 57932275712.0, + "35": 57932275712.0, + "36": 57932275712.0, + "37": 57932275712.0, + "38": 57932275712.0, + "39": 57932275712.0, + "40": 57932275712.0, + "41": 57932275712.0, + "42": 57932275712.0, + "43": 57932275712.0, + "44": 57932275712.0, + "45": 57932275712.0, + "46": 57932275712.0, + "47": 57932275712.0, + "48": 57932275712.0, + "49": 57932275712.0, + "50": 57932275712.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07644, - "2": 11.07413, - "3": 10.53858, - "4": 10.0983, - "5": 9.8117, - "6": 10.05948, - "7": 9.79869, - "8": 9.0727, - "9": 8.87366, - "10": 9.12893, - "11": 8.49884, - "12": 8.52992, - "13": 8.42414, - "14": 7.84688, - "15": 7.99135, - "16": 8.05047, - "17": 8.0004, - "18": 7.73069, - "19": 8.11023, - "20": 7.82948, - "21": 7.51921, - "22": 7.49606, - "23": 7.37196, - "24": 7.37047, - "25": 7.61349, - "26": 7.01867, - "27": 7.5586, - "28": 7.26599, - "29": 7.44466, - "30": 7.58701, - "31": 7.32783, - "32": 7.50657, - "33": 7.56866, - "34": 7.63344, - "35": 7.15071, - "36": 7.01674, - "37": 7.34958, - "38": 7.12576, - "39": 7.48596, - "40": 7.47304, - "41": 7.41897, - "42": 7.17558, - "43": 7.16122, - "44": 7.34251, - "45": 7.12147, - "46": 6.82911, - "47": 7.23414, - "48": 7.07998, - "49": 7.51108, - "50": 6.9741 + "1": 11.07654, + "2": 11.07406, + "3": 10.53883, + "4": 10.09801, + "5": 9.81156, + "6": 10.06025, + "7": 9.7962, + "8": 9.06987, + "9": 8.86879, + "10": 9.13393, + "11": 8.5017, + "12": 8.54094, + "13": 8.43678, + "14": 7.85637, + "15": 7.99846, + "16": 8.05889, + "17": 8.01134, + "18": 7.73929, + "19": 8.1188, + "20": 7.83458, + "21": 7.53103, + "22": 7.50125, + "23": 7.37135, + "24": 7.37419, + "25": 7.61596, + "26": 7.01586, + "27": 7.55739, + "28": 7.26274, + "29": 7.43991, + "30": 7.58436, + "31": 7.32289, + "32": 7.50362, + "33": 7.56884, + "34": 7.6339, + "35": 7.151, + "36": 7.01725, + "37": 7.35013, + "38": 7.12483, + "39": 7.48708, + "40": 7.47451, + "41": 7.4181, + "42": 7.17557, + "43": 7.15957, + "44": 7.34227, + "45": 7.12176, + "46": 6.82526, + "47": 7.23374, + "48": 7.07893, + "49": 7.5077, + "50": 6.97094 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 57.89597, - "2": 1.02226, - "3": 0.91676, - "4": 1.99588, - "5": 2.00486, - "6": 1.51451, - "7": 1.1193, - "8": 1.44004, - "9": 1.59872, - "10": 0.77647, - "11": 0.76373, - "12": 0.78131, - "13": 0.77869, - "14": 0.76703, - "15": 1.37612, - "16": 0.78402, - "17": 0.78337, - "18": 0.78947, - "19": 0.77286, - "20": 0.76873, - "21": 0.76722, - "22": 0.76847, - "23": 0.77301, - "24": 0.77475, - "25": 0.78165, - "26": 0.81166, - "27": 1.50584, - "28": 0.78435, - "29": 0.79046, - "30": 0.77828, - "31": 0.77039, - "32": 0.78392, - "33": 0.77294, - "34": 0.77717, - "35": 0.78379, - "36": 0.76722, - "37": 0.78405, - "38": 0.78584, - "39": 0.77423, - "40": 0.77729, - "41": 0.78273, - "42": 0.78119, - "43": 0.77474, - "44": 0.79851, - "45": 0.7826, - "46": 0.78586, - "47": 0.77961, - "48": 0.77947, - "49": 0.77944, - "50": 0.77976 + "1": 57.80279, + "2": 1.26321, + "3": 1.18918, + "4": 2.24643, + "5": 2.25191, + "6": 1.80757, + "7": 2.09086, + "8": 1.69153, + "9": 1.81279, + "10": 1.64882, + "11": 1.03476, + "12": 1.03593, + "13": 1.04348, + "14": 1.03841, + "15": 1.04432, + "16": 1.05281, + "17": 1.04826, + "18": 1.04981, + "19": 1.05351, + "20": 1.04668, + "21": 1.05254, + "22": 1.05391, + "23": 1.04635, + "24": 1.05503, + "25": 1.04226, + "26": 1.0684, + "27": 1.04985, + "28": 1.04233, + "29": 1.05036, + "30": 1.06219, + "31": 1.044, + "32": 1.05614, + "33": 1.05729, + "34": 1.05618, + "35": 1.06289, + "36": 1.05761, + "37": 1.05956, + "38": 1.06343, + "39": 1.06848, + "40": 1.06027, + "41": 1.05493, + "42": 1.05258, + "43": 1.04879, + "44": 1.04949, + "45": 1.05964, + "46": 1.04465, + "47": 1.0491, + "48": 1.05387, + "49": 1.05218, + "50": 1.05453 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml index 0cce9b4edb6..5390afcd09b 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml @@ -17,7 +17,8 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - --attention-backend: fused + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix # Training args --use-mcore-models: true --sequence-parallel: true @@ -122,7 +123,7 @@ MODEL_ARGS: # Add mixed precision args --bf16: true --exit-interval: 50 -TEST_TYPE: ckpt-resume +TEST_TYPE: regular METRICS: - "iteration-time" - "lm loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 4e553f2f9ed..19a8b4fc639 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 25 + --save-interval: 10000 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine @@ -61,4 +61,4 @@ MODEL_ARGS: --attention-backend: unfused --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true -TEST_TYPE: ckpt-resume +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml index 7ba366f1d1b..f27db4a8021 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml @@ -34,7 +34,7 @@ MODEL_ARGS: --clip-grad: 1.0 --lr-warmup-fraction: .01 --log-interval: 1 - --save-interval: 25 + --save-interval: 10000 --eval-interval: 1000 --eval-iters: 10 --transformer-impl: transformer_engine @@ -63,4 +63,4 @@ MODEL_ARGS: --no-bias-gelu-fusion: true --log-memory-to-tensorboard: true --exit-interval: 50 -TEST_TYPE: ckpt-resume +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml index c920037f0f2..7ebd9f0d1af 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml @@ -17,7 +17,8 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - --attention-backend: fused + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix # Training args --use-mcore-models: true --sequence-parallel: true @@ -125,7 +126,7 @@ MODEL_ARGS: --fp8-format: hybrid --fp8-recipe: tensorwise --exit-interval: 50 -TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json index 7c3cd772f4f..58eb3fc16cd 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.94947, - "2": 10.95236, - "3": 10.50817, - "4": 9.96373, - "5": 9.93907, + "1": 10.95004, + "2": 10.9521, + "3": 10.5115, + "4": 9.96454, + "5": 9.93941, "6": 9.67273, - "7": 10.2137, - "8": 9.4963, - "9": 9.56483, - "10": 9.7979, - "11": 9.30107, - "12": 9.40465, - "13": 9.39581, - "14": 8.84796, - "15": 9.02503, - "16": 9.07162, - "17": 9.04638, - "18": 8.75696, - "19": 9.18152, - "20": 8.86295, - "21": 8.5361, - "22": 8.55339, - "23": 8.42711, - "24": 8.37747, - "25": 8.64415, - "26": 7.97441, - "27": 8.56675, - "28": 8.19618, - "29": 8.39325, - "30": 8.67137, - "31": 8.28979, - "32": 8.43623, - "33": 8.55717, - "34": 8.6598, - "35": 8.07929, - "36": 7.94958, - "37": 8.29465, - "38": 7.9784, - "39": 8.39172, - "40": 8.35622, - "41": 8.31635, - "42": 8.06507, - "43": 8.03396, - "44": 8.24146, - "45": 8.1039, - "46": 7.61771, - "47": 8.15375, - "48": 8.00818, - "49": 8.38737, - "50": 7.81612 + "7": 10.20975, + "8": 9.49716, + "9": 9.55902, + "10": 9.79742, + "11": 9.30109, + "12": 9.40483, + "13": 9.39546, + "14": 8.84681, + "15": 9.02444, + "16": 9.07121, + "17": 9.04574, + "18": 8.75678, + "19": 9.18159, + "20": 8.8595, + "21": 8.53503, + "22": 8.55182, + "23": 8.42441, + "24": 8.37608, + "25": 8.64304, + "26": 7.97393, + "27": 8.56806, + "28": 8.19764, + "29": 8.3928, + "30": 8.67283, + "31": 8.289, + "32": 8.43572, + "33": 8.5568, + "34": 8.66018, + "35": 8.07934, + "36": 7.94976, + "37": 8.29565, + "38": 7.98044, + "39": 8.39201, + "40": 8.35513, + "41": 8.31876, + "42": 8.0583, + "43": 8.03283, + "44": 8.24243, + "45": 8.10277, + "46": 7.61696, + "47": 8.15273, + "48": 8.00569, + "49": 8.38688, + "50": 7.81491 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403784.0, - "2": 19274252.0, - "3": 19373794.0, - "4": 89687600.0, - "5": 139124400.0, - "6": 138949920.0, - "7": 170316512.0, - "8": 192665728.0, - "9": 168817872.0, - "10": 156652864.0, - "11": 217935232.0, - "12": 213007792.0, - "13": 228424704.0, - "14": 217442256.0, - "15": 237921408.0, - "16": 225523072.0, - "17": 225458384.0, - "18": 164166928.0, - "19": 164457904.0, - "20": 180124848.0, - "21": 230463232.0, - "22": 230096384.0, - "23": 210054656.0, - "24": 200985472.0, - "25": 248708512.0, - "26": 301000896.0, - "27": 205364384.0, - "28": 270886048.0, - "29": 259695952.0, - "30": 224280720.0, - "31": 244360992.0, - "32": 189382672.0, - "33": 231930816.0, - "34": 206712432.0, - "35": 194319616.0, - "36": 246163408.0, - "37": 193561968.0, - "38": 228822688.0, - "39": 226941728.0, - "40": 196742032.0, - "41": 200179904.0, - "42": 219112640.0, - "43": 186235920.0, - "44": 138763920.0, - "45": 148907984.0, - "46": 109115896.0, - "47": 167015728.0, - "48": 156135104.0, - "49": 91378480.0, - "50": 164099648.0 + "1": 19403624.0, + "2": 19274194.0, + "3": 19372760.0, + "4": 86525248.0, + "5": 148575568.0, + "6": 145226704.0, + "7": 171879984.0, + "8": 195785248.0, + "9": 164124752.0, + "10": 167684736.0, + "11": 221077344.0, + "12": 200384224.0, + "13": 248872528.0, + "14": 211169424.0, + "15": 214304608.0, + "16": 216075632.0, + "17": 267845984.0, + "18": 170470336.0, + "19": 176865072.0, + "20": 187955392.0, + "21": 225750704.0, + "22": 247396816.0, + "23": 211643856.0, + "24": 205638464.0, + "25": 277022272.0, + "26": 291562304.0, + "27": 225789840.0, + "28": 288202368.0, + "29": 198390384.0, + "30": 213302208.0, + "31": 227204752.0, + "32": 271112416.0, + "33": 231840432.0, + "34": 203575536.0, + "35": 191152368.0, + "36": 222566928.0, + "37": 177810112.0, + "38": 228708544.0, + "39": 211168784.0, + "40": 215603968.0, + "41": 200089440.0, + "42": 228529888.0, + "43": 198782848.0, + "44": 141902272.0, + "45": 181922816.0, + "46": 115369856.0, + "47": 170214176.0, + "48": 137292832.0, + "49": 97654936.0, + "50": 160979632.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4751680512.0, - "2": 4752032256.0, - "3": 4751058432.0, - "4": 4751692288.0, - "5": 4750785024.0, - "6": 4750721536.0, - "7": 4750738944.0, - "8": 4750471680.0, - "9": 4750078464.0, - "10": 4750671360.0, - "11": 4750662144.0, - "12": 4750013952.0, - "13": 4750343680.0, - "14": 4750866944.0, - "15": 4751114752.0, - "16": 4754016768.0, - "17": 4751645184.0, - "18": 4749773312.0, - "19": 4751623680.0, - "20": 4749661696.0, - "21": 4751997440.0, - "22": 4751115776.0, - "23": 4750557696.0, - "24": 4751779328.0, - "25": 4750678528.0, - "26": 4749646336.0, - "27": 4750984704.0, - "28": 4752366080.0, - "29": 4750876160.0, - "30": 4750423552.0, - "31": 4750733824.0, - "32": 4751212032.0, - "33": 4750073344.0, - "34": 4751521280.0, - "35": 4750867968.0, - "36": 4750440960.0, - "37": 4750258688.0, - "38": 4751287808.0, - "39": 4749742592.0, - "40": 4750831104.0, - "41": 4750516736.0, - "42": 4750870016.0, - "43": 4750633472.0, - "44": 4750676480.0, - "45": 4750337536.0, - "46": 4751146496.0, - "47": 4750629376.0, - "48": 4750627328.0, - "49": 4751527424.0, - "50": 4750583296.0 + "1": 4883602432.0, + "2": 4885017088.0, + "3": 4882657792.0, + "4": 4883046912.0, + "5": 4883725824.0, + "6": 4883713536.0, + "7": 4883040768.0, + "8": 4883273216.0, + "9": 4882952704.0, + "10": 4885949952.0, + "11": 4883990016.0, + "12": 4887679488.0, + "13": 4884011520.0, + "14": 4882899456.0, + "15": 4883515904.0, + "16": 4883990016.0, + "17": 4883410432.0, + "18": 4883673600.0, + "19": 4882903552.0, + "20": 4884541952.0, + "21": 4883138048.0, + "22": 4883247616.0, + "23": 4883839488.0, + "24": 4885058048.0, + "25": 4882676224.0, + "26": 4884058624.0, + "27": 4884724224.0, + "28": 4884874752.0, + "29": 4883127808.0, + "30": 4883252736.0, + "31": 4882955776.0, + "32": 4885190144.0, + "33": 4883845632.0, + "34": 4884392448.0, + "35": 4883083776.0, + "36": 4883851776.0, + "37": 4885246464.0, + "38": 4882680320.0, + "39": 4884296192.0, + "40": 4884689408.0, + "41": 4882836992.0, + "42": 4883972608.0, + "43": 4884519424.0, + "44": 4883354112.0, + "45": 4883495424.0, + "46": 4882788864.0, + "47": 4883144192.0, + "48": 4883688960.0, + "49": 4884182528.0, + "50": 4885279232.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11458484224.0, - "2": 12450223104.0, - "3": 12450223104.0, - "4": 12450223104.0, - "5": 12450223104.0, - "6": 12572350464.0, - "7": 12815280128.0, - "8": 12815280128.0, - "9": 13430808576.0, - "10": 13558942720.0, - "11": 13558942720.0, - "12": 13558942720.0, - "13": 13558942720.0, - "14": 13558942720.0, - "15": 13558942720.0, - "16": 13558942720.0, - "17": 13558942720.0, - "18": 13558942720.0, - "19": 13558942720.0, - "20": 13558942720.0, - "21": 13764741120.0, - "22": 13887232000.0, - "23": 13887232000.0, - "24": 13887232000.0, - "25": 13887232000.0, - "26": 13887232000.0, - "27": 13887232000.0, - "28": 13887232000.0, - "29": 13887232000.0, - "30": 13887232000.0, - "31": 13887232000.0, - "32": 13887232000.0, - "33": 13887232000.0, - "34": 13887232000.0, - "35": 13887232000.0, - "36": 13887232000.0, - "37": 13887232000.0, - "38": 13887232000.0, - "39": 13887232000.0, - "40": 13887232000.0, - "41": 13887232000.0, - "42": 13887232000.0, - "43": 13887232000.0, - "44": 13887232000.0, - "45": 13887232000.0, - "46": 13887232000.0, - "47": 13887232000.0, - "48": 13887232000.0, - "49": 13887232000.0, - "50": 13887232000.0 + "1": 41210470400.0, + "2": 41210470400.0, + "3": 41210470400.0, + "4": 41210470400.0, + "5": 41210470400.0, + "6": 41210470400.0, + "7": 41210470400.0, + "8": 41210470400.0, + "9": 41210470400.0, + "10": 41210470400.0, + "11": 41210470400.0, + "12": 41210470400.0, + "13": 41210470400.0, + "14": 41210470400.0, + "15": 41210470400.0, + "16": 41210470400.0, + "17": 41210470400.0, + "18": 41210470400.0, + "19": 41210470400.0, + "20": 41210470400.0, + "21": 41210470400.0, + "22": 41210470400.0, + "23": 41210470400.0, + "24": 41210470400.0, + "25": 41210470400.0, + "26": 41210470400.0, + "27": 41210470400.0, + "28": 41210470400.0, + "29": 41210470400.0, + "30": 41210470400.0, + "31": 41210470400.0, + "32": 41210470400.0, + "33": 41210470400.0, + "34": 41210470400.0, + "35": 41210470400.0, + "36": 41210470400.0, + "37": 41210470400.0, + "38": 41210470400.0, + "39": 41210470400.0, + "40": 41210470400.0, + "41": 41210470400.0, + "42": 41210470400.0, + "43": 41210470400.0, + "44": 41210470400.0, + "45": 41210470400.0, + "46": 41210470400.0, + "47": 41210470400.0, + "48": 41210470400.0, + "49": 41210470400.0, + "50": 41210470400.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 83.38985, - "2": 0.80022, - "3": 0.71751, - "4": 0.65556, - "5": 0.98544, - "6": 0.76766, - "7": 0.73114, - "8": 0.76226, - "9": 0.62791, - "10": 0.62224, - "11": 0.69873, - "12": 0.62401, - "13": 0.62467, - "14": 0.62054, - "15": 0.6218, - "16": 0.61653, - "17": 0.6184, - "18": 0.63217, - "19": 0.61609, - "20": 0.62413, - "21": 0.60966, - "22": 0.60967, - "23": 0.60674, - "24": 0.60595, - "25": 0.60063, - "26": 0.60502, - "27": 0.60923, - "28": 0.60939, - "29": 0.61217, - "30": 0.60702, - "31": 0.61517, - "32": 0.60803, - "33": 0.60624, - "34": 0.6123, - "35": 0.61133, - "36": 0.60971, - "37": 0.61215, - "38": 0.61014, - "39": 0.62694, - "40": 0.60532, - "41": 0.60477, - "42": 0.60297, - "43": 0.60073, - "44": 0.59786, - "45": 0.60582, - "46": 0.60848, - "47": 0.60019, - "48": 0.60064, - "49": 0.60304, - "50": 0.58276 + "1": 86.8085, + "2": 1.10913, + "3": 0.99097, + "4": 0.89412, + "5": 1.25997, + "6": 0.98162, + "7": 0.98318, + "8": 1.13296, + "9": 0.88126, + "10": 0.8633, + "11": 2.2744, + "12": 4.5393, + "13": 3.22763, + "14": 1.64923, + "15": 0.86595, + "16": 0.86575, + "17": 0.85272, + "18": 0.85454, + "19": 0.85281, + "20": 0.87018, + "21": 0.84654, + "22": 0.8494, + "23": 0.84882, + "24": 0.84482, + "25": 0.85311, + "26": 0.84678, + "27": 0.84096, + "28": 0.8412, + "29": 0.84156, + "30": 0.84475, + "31": 0.84747, + "32": 0.85058, + "33": 0.84977, + "34": 0.8479, + "35": 0.85234, + "36": 0.85012, + "37": 0.85087, + "38": 0.84594, + "39": 0.84558, + "40": 0.84807, + "41": 0.84183, + "42": 0.8439, + "43": 0.84221, + "44": 0.84248, + "45": 0.84257, + "46": 0.83922, + "47": 0.84311, + "48": 0.84159, + "49": 0.84011, + "50": 0.8353 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json index 9ba3e686ab8..daa04af43dd 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.94947, - "2": 10.95236, - "3": 10.50817, - "4": 9.96373, - "5": 9.93907, + "1": 10.95004, + "2": 10.9521, + "3": 10.5115, + "4": 9.96454, + "5": 9.93941, "6": 9.67273, - "7": 10.2137, - "8": 9.4963, - "9": 9.56483, - "10": 9.7979, - "11": 9.30107, - "12": 9.40465, - "13": 9.39581, - "14": 8.84796, - "15": 9.02503, - "16": 9.07162, - "17": 9.04638, - "18": 8.75696, - "19": 9.18152, - "20": 8.86295, - "21": 8.5361, - "22": 8.55339, - "23": 8.42711, - "24": 8.37747, - "25": 8.64415, - "26": 7.97441, - "27": 8.56675, - "28": 8.19618, - "29": 8.39325, - "30": 8.67137, - "31": 8.28979, - "32": 8.43623, - "33": 8.55717, - "34": 8.6598, - "35": 8.07929, - "36": 7.94958, - "37": 8.29465, - "38": 7.9784, - "39": 8.39172, - "40": 8.35622, - "41": 8.31635, - "42": 8.06507, - "43": 8.03396, - "44": 8.24146, - "45": 8.1039, - "46": 7.61771, - "47": 8.15375, - "48": 8.00818, - "49": 8.38737, - "50": 7.81612 + "7": 10.20975, + "8": 9.49716, + "9": 9.55902, + "10": 9.79742, + "11": 9.30109, + "12": 9.40483, + "13": 9.39546, + "14": 8.84681, + "15": 9.02444, + "16": 9.07121, + "17": 9.04574, + "18": 8.75678, + "19": 9.18159, + "20": 8.8595, + "21": 8.53503, + "22": 8.55182, + "23": 8.42441, + "24": 8.37608, + "25": 8.64304, + "26": 7.97393, + "27": 8.56806, + "28": 8.19764, + "29": 8.3928, + "30": 8.67283, + "31": 8.289, + "32": 8.43572, + "33": 8.5568, + "34": 8.66018, + "35": 8.07934, + "36": 7.94976, + "37": 8.29565, + "38": 7.98044, + "39": 8.39201, + "40": 8.35513, + "41": 8.31876, + "42": 8.0583, + "43": 8.03283, + "44": 8.24243, + "45": 8.10277, + "46": 7.61696, + "47": 8.15273, + "48": 8.00569, + "49": 8.38688, + "50": 7.81491 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403784.0, - "2": 19274252.0, - "3": 19373794.0, - "4": 89687600.0, - "5": 139124400.0, - "6": 138949920.0, - "7": 170316512.0, - "8": 192665728.0, - "9": 168817872.0, - "10": 156652864.0, - "11": 217935232.0, - "12": 213007792.0, - "13": 228424704.0, - "14": 217442256.0, - "15": 237921408.0, - "16": 225523072.0, - "17": 225458384.0, - "18": 164166928.0, - "19": 164457904.0, - "20": 180124848.0, - "21": 230463232.0, - "22": 230096384.0, - "23": 210054656.0, - "24": 200985472.0, - "25": 248708512.0, - "26": 301000896.0, - "27": 205364384.0, - "28": 270886048.0, - "29": 259695952.0, - "30": 224280720.0, - "31": 244360992.0, - "32": 189382672.0, - "33": 231930816.0, - "34": 206712432.0, - "35": 194319616.0, - "36": 246163408.0, - "37": 193561968.0, - "38": 228822688.0, - "39": 226941728.0, - "40": 196742032.0, - "41": 200179904.0, - "42": 219112640.0, - "43": 186235920.0, - "44": 138763920.0, - "45": 148907984.0, - "46": 109115896.0, - "47": 167015728.0, - "48": 156135104.0, - "49": 91378480.0, - "50": 164099648.0 + "1": 19403624.0, + "2": 19274194.0, + "3": 19372760.0, + "4": 86525248.0, + "5": 148575568.0, + "6": 145226704.0, + "7": 171879984.0, + "8": 195785248.0, + "9": 164124752.0, + "10": 167684736.0, + "11": 221077344.0, + "12": 200384224.0, + "13": 248872528.0, + "14": 211169424.0, + "15": 214304608.0, + "16": 216075632.0, + "17": 267845984.0, + "18": 170470336.0, + "19": 176865072.0, + "20": 187955392.0, + "21": 225750704.0, + "22": 247396816.0, + "23": 211643856.0, + "24": 205638464.0, + "25": 277022272.0, + "26": 291562304.0, + "27": 225789840.0, + "28": 288202368.0, + "29": 198390384.0, + "30": 213302208.0, + "31": 227204752.0, + "32": 271112416.0, + "33": 231840432.0, + "34": 203575536.0, + "35": 191152368.0, + "36": 222566928.0, + "37": 177810112.0, + "38": 228708544.0, + "39": 211168784.0, + "40": 215603968.0, + "41": 200089440.0, + "42": 228529888.0, + "43": 198782848.0, + "44": 141902272.0, + "45": 181922816.0, + "46": 115369856.0, + "47": 170214176.0, + "48": 137292832.0, + "49": 97654936.0, + "50": 160979632.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4749337600.0, - "2": 4748343808.0, - "3": 4747997696.0, - "4": 4747469312.0, - "5": 4745943552.0, - "6": 4746412544.0, - "7": 4749017600.0, - "8": 4746762752.0, - "9": 4746394112.0, - "10": 4748286464.0, - "11": 4747621888.0, - "12": 4747802112.0, - "13": 4746905088.0, - "14": 4746850816.0, - "15": 4745785856.0, - "16": 4746166784.0, - "17": 4745583104.0, - "18": 4746839552.0, - "19": 4746510848.0, - "20": 4748375552.0, - "21": 4746974720.0, - "22": 4747533824.0, - "23": 4746271232.0, - "24": 4747352576.0, - "25": 4746148352.0, - "26": 4746516992.0, - "27": 4748668416.0, - "28": 4746871296.0, - "29": 4747913728.0, - "30": 4746131968.0, - "31": 4747437568.0, - "32": 4748567040.0, - "33": 4746713600.0, - "34": 4747983360.0, - "35": 4747450880.0, - "36": 4748372480.0, - "37": 4747075072.0, - "38": 4748749312.0, - "39": 4747972096.0, - "40": 4746372608.0, - "41": 4747513344.0, - "42": 4747912704.0, - "43": 4746867200.0, - "44": 4747612672.0, - "45": 4748287488.0, - "46": 4746935808.0, - "47": 4748032512.0, - "48": 4747668992.0, - "49": 4747238912.0, - "50": 4749120000.0 + "1": 4882187264.0, + "2": 4881607168.0, + "3": 4882283008.0, + "4": 4881322496.0, + "5": 4882174464.0, + "6": 4883177984.0, + "7": 4883252736.0, + "8": 4881774080.0, + "9": 4881443328.0, + "10": 4884319744.0, + "11": 4882319872.0, + "12": 4881232384.0, + "13": 4880836096.0, + "14": 4882124288.0, + "15": 4882108928.0, + "16": 4883384832.0, + "17": 4880466432.0, + "18": 4881518080.0, + "19": 4881734144.0, + "20": 4883215872.0, + "21": 4883534336.0, + "22": 4882774528.0, + "23": 4881818112.0, + "24": 4882441728.0, + "25": 4880546304.0, + "26": 4882178560.0, + "27": 4881892864.0, + "28": 4881869312.0, + "29": 4882979328.0, + "30": 4882715136.0, + "31": 4883084800.0, + "32": 4881436160.0, + "33": 4881766912.0, + "34": 4881406464.0, + "35": 4881531392.0, + "36": 4881479168.0, + "37": 4882455040.0, + "38": 4882054656.0, + "39": 4882005504.0, + "40": 4882743808.0, + "41": 4881211904.0, + "42": 4881378816.0, + "43": 4882133504.0, + "44": 4881860096.0, + "45": 4883165696.0, + "46": 4882168320.0, + "47": 4881526272.0, + "48": 4882125312.0, + "49": 4881533440.0, + "50": 4881598976.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11455561728.0, - "2": 12440659968.0, - "3": 12440659968.0, - "4": 12440659968.0, - "5": 12440659968.0, - "6": 12576563200.0, - "7": 12813101056.0, - "8": 12813101056.0, - "9": 13424891904.0, - "10": 13556338688.0, - "11": 13556338688.0, - "12": 13556338688.0, - "13": 13556338688.0, - "14": 13556338688.0, - "15": 13556338688.0, - "16": 13556338688.0, - "17": 13556338688.0, - "18": 13556338688.0, - "19": 13556338688.0, - "20": 13556338688.0, - "21": 13758310400.0, - "22": 13883041792.0, - "23": 13883041792.0, - "24": 13883041792.0, - "25": 13883041792.0, - "26": 13883041792.0, - "27": 13883041792.0, - "28": 13883041792.0, - "29": 13883041792.0, - "30": 13883041792.0, - "31": 13883041792.0, - "32": 13883041792.0, - "33": 13883041792.0, - "34": 13883041792.0, - "35": 13883041792.0, - "36": 13883041792.0, - "37": 13883041792.0, - "38": 13883041792.0, - "39": 13883041792.0, - "40": 13883041792.0, - "41": 13883041792.0, - "42": 13883041792.0, - "43": 13883041792.0, - "44": 13883041792.0, - "45": 13883041792.0, - "46": 13883041792.0, - "47": 13883041792.0, - "48": 13883041792.0, - "49": 13883041792.0, - "50": 13883041792.0 + "1": 41210470400.0, + "2": 41210470400.0, + "3": 41210470400.0, + "4": 41210470400.0, + "5": 41210470400.0, + "6": 41210470400.0, + "7": 41210470400.0, + "8": 41210470400.0, + "9": 41210470400.0, + "10": 41210470400.0, + "11": 41210470400.0, + "12": 41210470400.0, + "13": 41210470400.0, + "14": 41210470400.0, + "15": 41210470400.0, + "16": 41210470400.0, + "17": 41210470400.0, + "18": 41210470400.0, + "19": 41210470400.0, + "20": 41210470400.0, + "21": 41210470400.0, + "22": 41210470400.0, + "23": 41210470400.0, + "24": 41210470400.0, + "25": 41210470400.0, + "26": 41210470400.0, + "27": 41210470400.0, + "28": 41210470400.0, + "29": 41210470400.0, + "30": 41210470400.0, + "31": 41210470400.0, + "32": 41210470400.0, + "33": 41210470400.0, + "34": 41210470400.0, + "35": 41210470400.0, + "36": 41210470400.0, + "37": 41210470400.0, + "38": 41210470400.0, + "39": 41210470400.0, + "40": 41210470400.0, + "41": 41210470400.0, + "42": 41210470400.0, + "43": 41210470400.0, + "44": 41210470400.0, + "45": 41210470400.0, + "46": 41210470400.0, + "47": 41210470400.0, + "48": 41210470400.0, + "49": 41210470400.0, + "50": 41210470400.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 99.19363, - "2": 0.87925, - "3": 0.76355, - "4": 0.70351, - "5": 1.06855, - "6": 0.8083, - "7": 0.79282, - "8": 0.81872, - "9": 0.67053, - "10": 0.64913, - "11": 0.72935, - "12": 0.64945, - "13": 0.64181, - "14": 0.63807, - "15": 0.65651, - "16": 0.66428, - "17": 0.65744, - "18": 0.65362, - "19": 0.65862, - "20": 0.6544, - "21": 0.64288, - "22": 0.64951, - "23": 0.64322, - "24": 0.64447, - "25": 0.63601, - "26": 0.62955, - "27": 0.6244, - "28": 0.62697, - "29": 0.62787, - "30": 0.6295, - "31": 0.63726, - "32": 0.62178, - "33": 0.62521, - "34": 0.62615, - "35": 0.61895, - "36": 0.62424, - "37": 0.62219, - "38": 0.62548, - "39": 0.62127, - "40": 0.62356, - "41": 0.6165, - "42": 0.61786, - "43": 0.61742, - "44": 0.61943, - "45": 0.61884, - "46": 0.62012, - "47": 0.61656, - "48": 0.6143, - "49": 0.61232, - "50": 0.6085 + "1": 96.21947, + "2": 1.10023, + "3": 0.96399, + "4": 0.91113, + "5": 1.27509, + "6": 1.00484, + "7": 1.01236, + "8": 1.1739, + "9": 0.89406, + "10": 0.88836, + "11": 0.92033, + "12": 0.88331, + "13": 0.88179, + "14": 0.88307, + "15": 0.88648, + "16": 0.88425, + "17": 0.87155, + "18": 0.87556, + "19": 0.87374, + "20": 0.8744, + "21": 0.86757, + "22": 0.87217, + "23": 0.8736, + "24": 0.86646, + "25": 0.87328, + "26": 0.87121, + "27": 0.85886, + "28": 0.86392, + "29": 0.86385, + "30": 0.86425, + "31": 0.8631, + "32": 0.8617, + "33": 0.86069, + "34": 0.86829, + "35": 0.86837, + "36": 0.86776, + "37": 0.86686, + "38": 0.86359, + "39": 0.8677, + "40": 0.86441, + "41": 0.86179, + "42": 0.86079, + "43": 0.86149, + "44": 0.86222, + "45": 0.86336, + "46": 0.85875, + "47": 0.86219, + "48": 0.86026, + "49": 0.85894, + "50": 0.8544 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml index 9fdcb460cf3..11d62eb1490 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml @@ -17,7 +17,8 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - --attention-backend: fused + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix # Training args --use-mcore-models: true --sequence-parallel: true @@ -118,7 +119,7 @@ MODEL_ARGS: --logging-level: 40 --tensorboard-dir: ${TENSORBOARD_PATH} # CUDA Graph args - --external-cuda-graph: true + --cuda-graph-impl: transformer_engine --cuda-graph-scope: attn --cuda-graph-warmup-steps: 0 --te-rng-tracker: true @@ -127,10 +128,10 @@ MODEL_ARGS: --fp8-format: hybrid --fp8-recipe: tensorwise --exit-interval: 50 -TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" - "num-zeros" - # - "mem-allocated-bytes" - # - "mem-max-allocated-bytes" # Disable for now since resume training has more memory cost. To be investigated. + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json deleted file mode 100644 index 8c4f243d4c2..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_coreweave.json +++ /dev/null @@ -1,344 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.0637, - "2": 11.03838, - "3": 9.79196, - "4": 14.17309, - "5": 9.48263, - "6": 9.30356, - "7": 9.27632, - "8": 8.75189, - "9": 8.70462, - "10": 9.04035, - "11": 8.41109, - "12": 8.53109, - "13": 8.43144, - "14": 7.93673, - "15": 8.00837, - "16": 8.08212, - "17": 8.06887, - "18": 7.75236, - "19": 8.13737, - "20": 7.88364, - "21": 7.56605, - "22": 7.55552, - "23": 7.42862, - "24": 7.41252, - "25": 7.67597, - "26": 7.08176, - "27": 7.62221, - "28": 7.32629, - "29": 7.49894, - "30": 7.63447, - "31": 7.3983, - "32": 7.59785, - "33": 7.64396, - "34": 7.70726, - "35": 7.21393, - "36": 7.08985, - "37": 7.42971, - "38": 7.19273, - "39": 7.56041, - "40": 7.55564, - "41": 7.49928, - "42": 7.25988, - "43": 7.24878, - "44": 7.42783, - "45": 7.21045, - "46": 6.91669, - "47": 7.31999, - "48": 7.16939, - "49": 7.62783, - "50": 7.05439 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 38802064.0, - "2": 38543200.0, - "3": 38744220.0, - "4": 166695072.0, - "5": 394456256.0, - "6": 441303136.0, - "7": 538731776.0, - "8": 680781184.0, - "9": 564001216.0, - "10": 571185472.0, - "11": 624455360.0, - "12": 680622208.0, - "13": 777548288.0, - "14": 717772992.0, - "15": 699100416.0, - "16": 677486208.0, - "17": 645761024.0, - "18": 671155776.0, - "19": 674320512.0, - "20": 891692160.0, - "21": 658833920.0, - "22": 802998016.0, - "23": 756352768.0, - "24": 772904192.0, - "25": 748799104.0, - "26": 771817792.0, - "27": 772312064.0, - "28": 655008000.0, - "29": 783495808.0, - "30": 794511296.0, - "31": 756035712.0, - "32": 535862592.0, - "33": 680633984.0, - "34": 482597312.0, - "35": 671593792.0, - "36": 658959488.0, - "37": 626012736.0, - "38": 614650240.0, - "39": 595183872.0, - "40": 421718816.0, - "41": 557433600.0, - "42": 545065344.0, - "43": 539024064.0, - "44": 544803840.0, - "45": 517934176.0, - "46": 504352736.0, - "47": 497582464.0, - "48": 500981632.0, - "49": 490922656.0, - "50": 472902496.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 6294696448.0, - "2": 6295491072.0, - "3": 6296283648.0, - "4": 6297076224.0, - "5": 6297868800.0, - "6": 6298661376.0, - "7": 6294104064.0, - "8": 6294896640.0, - "9": 6295689216.0, - "10": 6296481792.0, - "11": 6294500352.0, - "12": 6295292928.0, - "13": 6296085504.0, - "14": 6296878080.0, - "15": 6297670656.0, - "16": 6298463232.0, - "17": 6299255808.0, - "18": 6300048384.0, - "19": 6300840960.0, - "20": 6301633536.0, - "21": 6302426112.0, - "22": 6303218688.0, - "23": 6304011264.0, - "24": 6304803840.0, - "25": 6305596416.0, - "26": 6306388992.0, - "27": 6307181568.0, - "28": 6307974144.0, - "29": 6308766720.0, - "30": 6309559296.0, - "31": 6310351872.0, - "32": 6311144448.0, - "33": 6311937024.0, - "34": 6312729600.0, - "35": 6313522176.0, - "36": 6314314752.0, - "37": 6315107328.0, - "38": 6315899904.0, - "39": 6316692480.0, - "40": 6317485056.0, - "41": 6318277632.0, - "42": 6319070208.0, - "43": 6319862784.0, - "44": 6320655360.0, - "45": 6321447936.0, - "46": 6322240512.0, - "47": 6323033088.0, - "48": 6323825664.0, - "49": 6324618240.0, - "50": 6325410816.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 46771978240.0, - "2": 49466654720.0, - "3": 51157819392.0, - "4": 51157819392.0, - "5": 51157819392.0, - "6": 51157819392.0, - "7": 51157819392.0, - "8": 51157819392.0, - "9": 51157819392.0, - "10": 51157819392.0, - "11": 51157819392.0, - "12": 51157819392.0, - "13": 51157819392.0, - "14": 51157819392.0, - "15": 51157819392.0, - "16": 51157819392.0, - "17": 51157819392.0, - "18": 51157819392.0, - "19": 51157819392.0, - "20": 51157819392.0, - "21": 51157819392.0, - "22": 51157819392.0, - "23": 51157819392.0, - "24": 51157819392.0, - "25": 51157819392.0, - "26": 51157819392.0, - "27": 51157819392.0, - "28": 51157819392.0, - "29": 51157819392.0, - "30": 51157819392.0, - "31": 51157819392.0, - "32": 51157819392.0, - "33": 51157819392.0, - "34": 51157819392.0, - "35": 51157819392.0, - "36": 51157819392.0, - "37": 51157819392.0, - "38": 51157819392.0, - "39": 51157819392.0, - "40": 51157819392.0, - "41": 51157819392.0, - "42": 51157819392.0, - "43": 51157819392.0, - "44": 51157819392.0, - "45": 51157819392.0, - "46": 51157819392.0, - "47": 51157819392.0, - "48": 51157819392.0, - "49": 51157819392.0, - "50": 51157819392.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.04508, - "2": 11.05397, - "3": 10.54505, - "4": 9.99194, - "5": 9.76285, - "6": 9.45507, - "7": 9.54431, - "8": 8.91725, - "9": 8.74784, - "10": 9.04997, - "11": 8.40193, - "12": 8.48288, - "13": 8.36926, - "14": 7.81448, - "15": 7.93865, - "16": 8.02231, - "17": 7.96741, - "18": 7.70552, - "19": 8.09012, - "20": 7.79984, - "21": 7.48241, - "22": 7.49502, - "23": 7.35415, - "24": 7.34793, - "25": 7.60324, - "26": 7.01638, - "27": 7.55495, - "28": 7.24721, - "29": 7.43133, - "30": 7.56633, - "31": 7.31391, - "32": 7.50445, - "33": 7.55658, - "34": 7.62234, - "35": 7.13802, - "36": 7.00593, - "37": 7.33916, - "38": 7.1095, - "39": 7.4736, - "40": 7.45784, - "41": 7.40514, - "42": 7.15986, - "43": 7.14965, - "44": 7.32758, - "45": 7.11892, - "46": 6.81056, - "47": 7.2234, - "48": 7.06789, - "49": 7.503, - "50": 6.9559 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 71.51538, - "2": 1.72071, - "3": 1.31657, - "4": 1.18423, - "5": 3.82179, - "6": 2.3037, - "7": 3.15765, - "8": 1.26325, - "9": 1.04414, - "10": 1.05643, - "11": 2.7525, - "12": 1.03473, - "13": 1.05477, - "14": 1.05184, - "15": 1.06441, - "16": 1.1362, - "17": 1.05355, - "18": 1.05093, - "19": 1.04209, - "20": 1.03871, - "21": 1.04773, - "22": 1.05492, - "23": 1.02882, - "24": 1.05172, - "25": 1.03632, - "26": 1.04229, - "27": 1.04662, - "28": 1.05014, - "29": 1.03047, - "30": 1.0813, - "31": 1.06319, - "32": 1.02842, - "33": 1.041, - "34": 1.02275, - "35": 1.03563, - "36": 1.0411, - "37": 1.02865, - "38": 1.03454, - "39": 1.05619, - "40": 1.04996, - "41": 1.02719, - "42": 1.05309, - "43": 1.03532, - "44": 1.05042, - "45": 1.03343, - "46": 1.04769, - "47": 1.03458, - "48": 1.04744, - "49": 1.04302, - "50": 1.0386 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json deleted file mode 100644 index 29b1b467978..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgxh100_eos.json +++ /dev/null @@ -1,344 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.0637, - "2": 11.03838, - "3": 9.79196, - "4": 14.17309, - "5": 9.48263, - "6": 9.30356, - "7": 9.27632, - "8": 8.75189, - "9": 8.70462, - "10": 9.04035, - "11": 8.41109, - "12": 8.53109, - "13": 8.43144, - "14": 7.93673, - "15": 8.00837, - "16": 8.08212, - "17": 8.06887, - "18": 7.75236, - "19": 8.13737, - "20": 7.88364, - "21": 7.56605, - "22": 7.55552, - "23": 7.42862, - "24": 7.41252, - "25": 7.67597, - "26": 7.08176, - "27": 7.62221, - "28": 7.32629, - "29": 7.49894, - "30": 7.63447, - "31": 7.3983, - "32": 7.59785, - "33": 7.64396, - "34": 7.70726, - "35": 7.21393, - "36": 7.08985, - "37": 7.42971, - "38": 7.19273, - "39": 7.56041, - "40": 7.55564, - "41": 7.49928, - "42": 7.25988, - "43": 7.24878, - "44": 7.42783, - "45": 7.21045, - "46": 6.91669, - "47": 7.31999, - "48": 7.16939, - "49": 7.62783, - "50": 7.05439 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 38802064.0, - "2": 38543200.0, - "3": 38744220.0, - "4": 166695072.0, - "5": 394456256.0, - "6": 441303136.0, - "7": 538731776.0, - "8": 680781184.0, - "9": 564001216.0, - "10": 571185472.0, - "11": 624455360.0, - "12": 680622208.0, - "13": 777548288.0, - "14": 717772992.0, - "15": 699100416.0, - "16": 677486208.0, - "17": 645761024.0, - "18": 671155776.0, - "19": 674320512.0, - "20": 891692160.0, - "21": 658833920.0, - "22": 802998016.0, - "23": 756352768.0, - "24": 772904192.0, - "25": 748799104.0, - "26": 771817792.0, - "27": 772312064.0, - "28": 655008000.0, - "29": 783495808.0, - "30": 794511296.0, - "31": 756035712.0, - "32": 535862592.0, - "33": 680633984.0, - "34": 482597312.0, - "35": 671593792.0, - "36": 658959488.0, - "37": 626012736.0, - "38": 614650240.0, - "39": 595183872.0, - "40": 421718816.0, - "41": 557433600.0, - "42": 545065344.0, - "43": 539024064.0, - "44": 544803840.0, - "45": 517934176.0, - "46": 504352736.0, - "47": 497582464.0, - "48": 500981632.0, - "49": 490922656.0, - "50": 472902496.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 6294696448.0, - "2": 6295491072.0, - "3": 6296283648.0, - "4": 6297076224.0, - "5": 6297868800.0, - "6": 6298661376.0, - "7": 6294104064.0, - "8": 6294896640.0, - "9": 6295689216.0, - "10": 6296481792.0, - "11": 6294500352.0, - "12": 6295292928.0, - "13": 6296085504.0, - "14": 6296878080.0, - "15": 6297670656.0, - "16": 6298463232.0, - "17": 6299255808.0, - "18": 6300048384.0, - "19": 6300840960.0, - "20": 6301633536.0, - "21": 6302426112.0, - "22": 6303218688.0, - "23": 6304011264.0, - "24": 6304803840.0, - "25": 6305596416.0, - "26": 6306388992.0, - "27": 6307181568.0, - "28": 6307974144.0, - "29": 6308766720.0, - "30": 6309559296.0, - "31": 6310351872.0, - "32": 6311144448.0, - "33": 6311937024.0, - "34": 6312729600.0, - "35": 6313522176.0, - "36": 6314314752.0, - "37": 6315107328.0, - "38": 6315899904.0, - "39": 6316692480.0, - "40": 6317485056.0, - "41": 6318277632.0, - "42": 6319070208.0, - "43": 6319862784.0, - "44": 6320655360.0, - "45": 6321447936.0, - "46": 6322240512.0, - "47": 6323033088.0, - "48": 6323825664.0, - "49": 6324618240.0, - "50": 6325410816.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 46771978240.0, - "2": 49466654720.0, - "3": 51157819392.0, - "4": 51157819392.0, - "5": 51157819392.0, - "6": 51157819392.0, - "7": 51157819392.0, - "8": 51157819392.0, - "9": 51157819392.0, - "10": 51157819392.0, - "11": 51157819392.0, - "12": 51157819392.0, - "13": 51157819392.0, - "14": 51157819392.0, - "15": 51157819392.0, - "16": 51157819392.0, - "17": 51157819392.0, - "18": 51157819392.0, - "19": 51157819392.0, - "20": 51157819392.0, - "21": 51157819392.0, - "22": 51157819392.0, - "23": 51157819392.0, - "24": 51157819392.0, - "25": 51157819392.0, - "26": 51157819392.0, - "27": 51157819392.0, - "28": 51157819392.0, - "29": 51157819392.0, - "30": 51157819392.0, - "31": 51157819392.0, - "32": 51157819392.0, - "33": 51157819392.0, - "34": 51157819392.0, - "35": 51157819392.0, - "36": 51157819392.0, - "37": 51157819392.0, - "38": 51157819392.0, - "39": 51157819392.0, - "40": 51157819392.0, - "41": 51157819392.0, - "42": 51157819392.0, - "43": 51157819392.0, - "44": 51157819392.0, - "45": 51157819392.0, - "46": 51157819392.0, - "47": 51157819392.0, - "48": 51157819392.0, - "49": 51157819392.0, - "50": 51157819392.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.04508, - "2": 11.05397, - "3": 10.54505, - "4": 9.99194, - "5": 9.76285, - "6": 9.45507, - "7": 9.54431, - "8": 8.91725, - "9": 8.74784, - "10": 9.04997, - "11": 8.40193, - "12": 8.48288, - "13": 8.36926, - "14": 7.81448, - "15": 7.93865, - "16": 8.02231, - "17": 7.96741, - "18": 7.70552, - "19": 8.09012, - "20": 7.79984, - "21": 7.48241, - "22": 7.49502, - "23": 7.35415, - "24": 7.34793, - "25": 7.60324, - "26": 7.01638, - "27": 7.55495, - "28": 7.24721, - "29": 7.43133, - "30": 7.56633, - "31": 7.31391, - "32": 7.50445, - "33": 7.55658, - "34": 7.62234, - "35": 7.13802, - "36": 7.00593, - "37": 7.33916, - "38": 7.1095, - "39": 7.4736, - "40": 7.45784, - "41": 7.40514, - "42": 7.15986, - "43": 7.14965, - "44": 7.32758, - "45": 7.11892, - "46": 6.81056, - "47": 7.2234, - "48": 7.06789, - "49": 7.503, - "50": 6.9559 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 58.25602, - "2": 1.30671, - "3": 1.18374, - "4": 1.08853, - "5": 3.28347, - "6": 2.13071, - "7": 2.96694, - "8": 1.2675, - "9": 1.07672, - "10": 1.07909, - "11": 2.90834, - "12": 1.06176, - "13": 1.06257, - "14": 1.06668, - "15": 1.08083, - "16": 1.08186, - "17": 1.06861, - "18": 1.07223, - "19": 1.06661, - "20": 1.07354, - "21": 1.07863, - "22": 1.08557, - "23": 1.06174, - "24": 1.07533, - "25": 1.06172, - "26": 1.06344, - "27": 1.05522, - "28": 1.05011, - "29": 1.04098, - "30": 1.04622, - "31": 1.0423, - "32": 1.04292, - "33": 1.06328, - "34": 1.03657, - "35": 1.04963, - "36": 1.05103, - "37": 1.04147, - "38": 1.04912, - "39": 1.04838, - "40": 1.04559, - "41": 1.05462, - "42": 1.05103, - "43": 1.04965, - "44": 1.05296, - "45": 1.05039, - "46": 1.05609, - "47": 1.0476, - "48": 1.053, - "49": 1.04626, - "50": 1.05911 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml index 4036686e888..0a37ee08498 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml @@ -16,7 +16,8 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - --attention-backend: unfused + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix # Training args --use-mcore-models: true --sequence-parallel: true @@ -125,7 +126,7 @@ MODEL_ARGS: --bf16: true --exit-interval: 50 --overlap-moe-expert-parallel-comm: true -TEST_TYPE: ckpt-resume # Usually ckpt-resume, but as a WAR to #513 set to regular +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: - "iteration-time" - "lm loss" diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 638ee1a89a3..ddfb8d1980b 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -95,11 +95,11 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # hang: #513 - - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] # hang: #513 + # - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_h100] # hang: #513 - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G] products: - environment: [dev] @@ -128,11 +128,6 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### @@ -149,6 +144,14 @@ products: ########################### # Merge train tests # ########################### + - test_case: [gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - environment: [dev] + scope: [mr-slim] + platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] From 12839ed0d8b2da8c97fe0eaa0fd73c497f1ff1f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 21 Oct 2025 23:40:26 +0000 Subject: [PATCH 031/248] build: Fix jet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- docker/Dockerfile.ci.dev | 2 +- docker/Dockerfile.ci.lts | 2 +- docker/Dockerfile.ci.nemo | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index b3295697f31..92d7a129d0b 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -83,6 +83,6 @@ RUN --mount=type=secret,id=JET_INDEX_URLS \ LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL) uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger" uv pip install --no-cache-dir --upgrade "setuptools<80.0.0" - uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=2.0" + uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=3.0" EOF ### diff --git a/docker/Dockerfile.ci.lts b/docker/Dockerfile.ci.lts index 8889760cfc8..7da27a03f1d 100644 --- a/docker/Dockerfile.ci.lts +++ b/docker/Dockerfile.ci.lts @@ -93,6 +93,6 @@ RUN --mount=type=secret,id=JET_INDEX_URLS \ LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL) uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger" uv pip install --no-cache-dir --upgrade "setuptools<80.0.0" - uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=2.0" + uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=3.0" EOF ### \ No newline at end of file diff --git a/docker/Dockerfile.ci.nemo b/docker/Dockerfile.ci.nemo index 0452976a8c7..2369602f54d 100644 --- a/docker/Dockerfile.ci.nemo +++ b/docker/Dockerfile.ci.nemo @@ -14,7 +14,7 @@ FROM main as jet ARG JET_API_VERSION RUN --mount=type=secret,id=JET_INDEX_URLS \ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ - pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=2.0" --upgrade $JET_INDEX_URLS + pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=3.0" --upgrade $JET_INDEX_URLS ENV PATH="$PATH:/opt/jet/bin" ### From a8bad4b441127242ab60d9bf79e1a52c2b361d34 Mon Sep 17 00:00:00 2001 From: Yu Yao Date: Tue, 21 Oct 2025 16:48:20 -0700 Subject: [PATCH 032/248] ADLR/megatron-lm!4312 - [dev] Set tensor-parallel attributes irrespective of perform_initialization Co-authored-by: Mcore Bot Co-authored-by: yaoyu-33 --- megatron/core/tensor_parallel/layers.py | 24 +++++ megatron/core/transformer/moe/experts.py | 17 ++++ .../test_tp_attrs_without_init.py | 87 +++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 tests/unit_tests/tensor_parallel/test_tp_attrs_without_init.py diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 773c61597bc..5ca290ff680 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -248,6 +248,10 @@ def __init__( rank=get_pg_rank(self.tp_group), world_size=get_pg_size(self.tp_group), ) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=0, stride=1 + ) else: self.weight = Parameter( torch.empty( @@ -259,6 +263,10 @@ def __init__( ) if config.perform_initialization: _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=0, stride=1 + ) def forward(self, input_): """Forward. @@ -858,6 +866,10 @@ def __init__( rank=rank, world_size=world_size, ) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=0, stride=stride + ) else: self.weight = Parameter( torch.empty( @@ -875,6 +887,10 @@ def __init__( stride=stride, is_expert=self.is_expert, ) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=0, stride=stride + ) setattr(self.weight, "allreduce", not (self.is_expert and self.expert_parallel)) else: @@ -1170,6 +1186,10 @@ def __init__( rank=rank, world_size=world_size, ) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=1, stride=stride + ) else: self.weight = Parameter( torch.empty( @@ -1187,6 +1207,10 @@ def __init__( stride=stride, is_expert=self.is_expert, ) + else: + set_tensor_model_parallel_attributes( + tensor=self.weight, is_parallel=True, dim=1, stride=stride + ) setattr(self.weight, "allreduce", not (self.is_expert and self.expert_parallel)) if bias: diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index e73864a50fa..d0ac20a7536 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -30,6 +30,7 @@ from megatron.core.tensor_parallel.layers import ( _initialize_affine_weight_cpu, _initialize_affine_weight_gpu, + set_tensor_model_parallel_attributes, ) from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.mlp import MLP, MLPSubmodules, apply_swiglu_sharded_factory @@ -208,6 +209,14 @@ def activation_func_with_probs(x, probs): rank=tp_rank, world_size=tp_size, ) + else: + # Ensure TP attrs are set even when not initializing + set_tensor_model_parallel_attributes( + tensor=self.weight1, is_parallel=True, dim=1, stride=1 + ) + set_tensor_model_parallel_attributes( + tensor=self.weight2, is_parallel=True, dim=0, stride=1 + ) else: self.weight1 = Parameter( torch.empty( @@ -232,6 +241,14 @@ def activation_func_with_probs(x, probs): _initialize_affine_weight_gpu( self.weight2, config.output_layer_init_method, partition_dim=0, is_expert=True ) + else: + # Ensure TP attrs are set even when not initializing + set_tensor_model_parallel_attributes( + tensor=self.weight1, is_parallel=True, dim=1, stride=1 + ) + set_tensor_model_parallel_attributes( + tensor=self.weight2, is_parallel=True, dim=0, stride=1 + ) setattr(self.weight1, 'allreduce', not self.expert_parallel) setattr(self.weight2, 'allreduce', not self.expert_parallel) diff --git a/tests/unit_tests/tensor_parallel/test_tp_attrs_without_init.py b/tests/unit_tests/tensor_parallel/test_tp_attrs_without_init.py new file mode 100644 index 00000000000..f7a518e8e88 --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_tp_attrs_without_init.py @@ -0,0 +1,87 @@ +import pytest +import torch + +from megatron.core.tensor_parallel.layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelEmbedding, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestTPAttributesWithoutInitialization: + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("use_cpu_init", [True, False]) + def test_vocab_parallel_embedding_tp_attrs_no_init(self, use_cpu_init): + Utils.initialize_model_parallel(tensor_model_parallel_size=2) + cfg = TransformerConfig( + num_layers=1, + hidden_size=8, + num_attention_heads=4, + use_cpu_initialization=use_cpu_init, + perform_initialization=False, + ) + + emb = VocabParallelEmbedding( + num_embeddings=16, embedding_dim=8, init_method=cfg.init_method, config=cfg + ) + w = emb.weight + assert hasattr(w, "tensor_model_parallel") and w.tensor_model_parallel is True + assert hasattr(w, "partition_dim") and w.partition_dim == 0 + assert hasattr(w, "partition_stride") and w.partition_stride == 1 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("use_cpu_init", [True, False]) + def test_column_parallel_linear_tp_attrs_no_init(self, use_cpu_init): + Utils.initialize_model_parallel(tensor_model_parallel_size=2) + cfg = TransformerConfig( + num_layers=1, + hidden_size=8, + num_attention_heads=4, + use_cpu_initialization=use_cpu_init, + perform_initialization=False, + ) + + layer = ColumnParallelLinear( + input_size=8, + output_size=8, + init_method=cfg.init_method, + bias=True, + config=cfg, + skip_bias_add=False, + ) + w = layer.weight + assert hasattr(w, "tensor_model_parallel") and w.tensor_model_parallel is True + assert hasattr(w, "partition_dim") and w.partition_dim == 0 + assert hasattr(w, "partition_stride") and w.partition_stride == 1 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("use_cpu_init", [True, False]) + def test_row_parallel_linear_tp_attrs_no_init(self, use_cpu_init): + Utils.initialize_model_parallel(tensor_model_parallel_size=2) + cfg = TransformerConfig( + num_layers=1, + hidden_size=8, + num_attention_heads=4, + use_cpu_initialization=use_cpu_init, + perform_initialization=False, + ) + + layer = RowParallelLinear( + input_size=8, + output_size=8, + init_method=cfg.init_method, + bias=True, + input_is_parallel=True, + config=cfg, + skip_bias_add=False, + ) + w = layer.weight + assert hasattr(w, "tensor_model_parallel") and w.tensor_model_parallel is True + assert hasattr(w, "partition_dim") and w.partition_dim == 1 + assert hasattr(w, "partition_stride") and w.partition_stride == 1 From d9153a50ce14f5e4802a079526552dfbc476149f Mon Sep 17 00:00:00 2001 From: "Tong Liu (Engrg-Hardware 1)" Date: Tue, 21 Oct 2025 23:10:16 -0700 Subject: [PATCH 033/248] ADLR/megatron-lm!4237 - [Dev] perf(MoE): Add the Hybrid-EP backend to the Flex Dispatcher --- docker/Dockerfile.ci.dev | 7 +- .../common/model_chunk_schedule_plan.py | 6 +- .../core/models/gpt/fine_grained_callables.py | 5 +- megatron/core/transformer/moe/README.md | 13 +- megatron/core/transformer/moe/fused_a2a.py | 270 ++++++++++++++++++ .../core/transformer/moe/token_dispatcher.py | 201 +++++++++++-- .../core/transformer/transformer_config.py | 20 +- megatron/training/arguments.py | 8 +- .../a2a_overlap/test_schedule_chunk_1f1b.py | 2 +- .../a2a_overlap/test_schedule_layer_1f1b.py | 4 +- .../transformer/moe/test_token_dispatcher.py | 52 +++- .../transformer/test_submodule_callables.py | 2 +- 12 files changed, 542 insertions(+), 48 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 92d7a129d0b..1ad8d76324b 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -58,8 +58,11 @@ RUN bash -ex <<"EOF" ln -s libnvshmem_host.so.3 libnvshmem_host.so popd - git clone --branch v1.2.1 https://github.com/deepseek-ai/DeepEP.git - TORCH_CUDA_ARCH_LIST="9.0" uv pip install --no-build-isolation -v DeepEP/. + git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git + cd DeepEP + git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 + TORCH_CUDA_ARCH_LIST="9.0" uv pip install --no-build-isolation -v . + cd .. rm -rf DeepEP EOF diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 6a411ccdcf6..d501c11a0a9 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -107,7 +107,11 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): if is_mtp else isinstance(self.layer.mlp, MoELayer) ) - enable_deepep = self.layer.config.moe_enable_deepep + + enable_deepep = ( + self.layer.config.moe_token_dispatcher_type == "flex" + and self.layer.config.moe_flex_dispatcher_backend == "deepep" + ) extra_args["enable_deepep"] = enable_deepep extra_args["is_moe"] = is_moe extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index fbecc047682..36298fed66b 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -325,7 +325,10 @@ def build_transformer_layer_callables(layer: TransformerLayer): """ is_moe = isinstance(layer.mlp, MoELayer) - enable_deepep = layer.config.moe_enable_deepep + enable_deepep = ( + layer.config.moe_token_dispatcher_type == "flex" + and layer.config.moe_flex_dispatcher_backend == "deepep" + ) def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): """ diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index 56be6fc2463..1ab325a939b 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -13,6 +13,7 @@ Megatron-Core MoE provides comprehensive parallelism strategies, seamlessly inte - Support Multi-Token Prediction (MTP) - Batch-level overlapping to hide EP-A2A communication - **Support DeepSeek's DeepEP for efficient token dispatching and combining** +- Support HybridEP for efficient token dispatching and combining within intra-node and MNNVL scenarios. - Add fusion for token permutation and unpermutation - Support Uneven virtual pipeline parallel split - Support output-discarding checkpointing on some submodules @@ -172,7 +173,13 @@ Note: The MoE model structure is defined through script arguments. All MoE-relat ### Leverage DeepSeek's DeepEP for High-Performance Cross-Node Token Dispatching - [DeepSeek-DeepEP](https://github.com/deepseek-ai/deepep) provides a highly optimized implementation for MoE token dispatching and combining operations, specifically designed for large-scale MoE training scenarios. - DeepEP is particularly recommended for training large-scale, fine-grained MoE architectures such as DeepSeek-V3 and other advanced MoE models. -- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-enable-deepep` in your command line arguments. +- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-flex-dispatcher-backend=deepep` in your command line arguments. + +### Integrate HybridEP for High-Performance Intra-Node Token Dispatching +- [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is developed by NVIDIA as an optimized solution for large-scale MoE (Mixture of Experts) all-to-all communication. It is designed to leverage NVIDIA GPU hardware capabilities, significantly reducing Streaming Multiprocessor (SM) resource usage. +- HybridEP currently supports intra-node and multi-node NVLink scenarios. +- To enable HybridEP, set `--moe-token-dispatcher-type=flex` and + `--moe-flex-dispatcher-backend=hybridep` in your command line arguments. ### CUDA Graph Support CUDA Graph functionality can be enabled through two options: @@ -240,7 +247,7 @@ Enable A2A overlap across different batches inspired by the DSv3 DualPipe implme | --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. | | --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. | | --moe-token-dispatcher-type | Determines the token dispatcher type. Choices are "allgather", "alltoall". Default is "allgather". We recommend using 'alltoall' if expert parallelism is applied. We have upgraded the "alltoall" dispatcher in place during MCore v0.9, while the original implementation renamed as "alltoall_seq" is retained until MCore v0.13.| -| --moe-enable-deepep | (Experimental) Enable DeepSeek/DeepEP for efficient token dispatching and combine in MoE models. Only works with flex token dispatcher by setting --moe-token-dispatcher-type=flex. | +| --moe-flex-dispatcher-backend | (Experimental) Select the backend for the flex token dispatcher. Supported options: "deepep", "hybridep". Enables efficient token dispatching and combining for MoE models. | | --moe-per-layer-logging | Enable per-layer logging for MoE, currently supports auxiliary loss and z loss. | | --moe-expert-capacity-factor | The capacity factor for each expert, None means no token will be dropped. Default is None. | | --moe-pad-expert-input-to-capacity | Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set. | @@ -441,7 +448,7 @@ By setting `--expert-tensor-parallel-size`, we can set MoE-specific TP size. - Token Dispatcher sends tokens to the designated expert, involves tensor rearangement and communications. - Dispatcher `allgather` is the default option. It achieves better performance and efficiency when only tensor parallelism is used or when the Top-k value is very large. - Dispatcher `alltoall` is recommended if expert parallelism is applied. -- Dispatcher `flex` is a new dispatcher decouples communication group from model parallelism. Currently, only the DeepEP backend is supported for by setting `--moe-enable-deepep`. +- Dispatcher `flex` is a new dispatcher decouples communication group from model parallelism. It supports two backends(DeepEP and HybridEP) selectable via `--moe-flex-dispatcher-backend`. **Enable Communication Overlap** - Enable `--overlap-param-gather` and `--overlap-grad-reduce` with distributed optimizer. diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 00a840f2b7f..60b0b11a32c 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -262,3 +262,273 @@ def set_deepep_num_sms(num_sms): fused_dispatch = None fused_combine = None set_deepep_num_sms = None + + +try: + from deep_ep import HybridEPBuffer + + HAVE_HYBRIDEP = True +except ImportError: + HAVE_HYBRIDEP = False + +_hybrid_ep_buffer = None + + +def init_hybrid_ep_buffer( + group: torch.distributed.ProcessGroup, + hidden_dim: int, + seq_len: int, + num_local_experts: int, + num_sms_dispatch_api: int, + num_sms_combine_api: int, + fp8_dispatch: bool, +) -> None: + ''' + Initialize the HybridEP buffer, including buffer allocation and metadata + initialization. + + If a runtime dispatch/combine requires a larger buffer than the one + initialized, the buffer will be reallocated at runtime, + incuring extra run-time overhead. + + Args: + group (torch.distributed.ProcessGroup): + Process group for HybridEP all-to-all communication. + hidden_dim (int): + Hidden dimension of the input tensor. + seq_len (int): + Maximum sequence length of the input tensor. + num_local_experts (int): + Number of local experts. + num_sms_dispatch_api (int): + Number of SMs used by the dispatch API. + num_sms_combine_api (int): + Number of SMs used by the combine API. + fp8_dispatch (bool): + Whether to use FP8 communication during the dispatch phase. + ''' + assert not fp8_dispatch, "HybridEP dispatcher does not support fp8 dispatch now" + global _hybrid_ep_buffer + _hybrid_ep_buffer = HybridEPBuffer( + group=group, + hidden_dim=hidden_dim, + max_num_of_tokens_per_rank=seq_len, + num_local_experts=num_local_experts, + use_fp8=fp8_dispatch, + num_sms_dispatch_api=num_sms_dispatch_api, + num_sms_combine_api=num_sms_combine_api, + ) + + +class HybridEPDispatch(torch.autograd.Function): + ''' + Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend + ''' + + @staticmethod + def forward( + ctx, + x, + routing_map, + probs, + group, + num_local_experts, + num_sms_dispatch_api=24, + num_sms_combine_api=24, + num_dispatched_tokens=None, + num_permuted_tokens=None, + pad_multiple=None, + ): + ''' + Forward pass of fused dispatch of the HybridEP backend + ''' + if _hybrid_ep_buffer is None: + seq_len, hidden_dim = x.shape[-2:] + fp8_dispatch = False # Currently, we do not support fp8 dispatch + init_hybrid_ep_buffer( + group, + hidden_dim, + seq_len, + num_local_experts, + num_sms_dispatch_api, + num_sms_combine_api, + fp8_dispatch, + ) + # Defaultly, the output token_per_expert and num_dispatched_tokens_tensor + # will be put on the CPU to avoid the potential sync in combine/backward pass, + # but if we provide the num_dispatched_tokens and num_permuted_tokens on CPU, + # we do not need to the D2H here. + use_host_meta = num_dispatched_tokens is None or num_permuted_tokens is None + # Process the dispatch + ( + dispatched_hidden, + dispatched_probs, + dispatched_scaling_factor, + tokens_per_expert, + handle, + ) = _hybrid_ep_buffer.dispatch_with_permute( + hidden=x, + routing_map=routing_map, + probs=probs, + scaling_factor=None, + num_of_experts_per_rank=num_local_experts, + pad_multiple=pad_multiple, + num_dispatched_tokens=num_dispatched_tokens, + num_permuted_tokens=num_permuted_tokens, + use_host_meta=use_host_meta, + ) + + ctx.handle = handle + ctx.pad_multiple = pad_multiple + ctx.num_dispatched_tokens = num_dispatched_tokens + return ( + dispatched_hidden, + dispatched_probs, + dispatched_scaling_factor, + tokens_per_expert, + handle, + ) + + @staticmethod + def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_expert, grad_handle): + ''' + Backward pass of fused dispatch of the HybridEP backend + ''' + handle = ctx.handle + combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute( + hidden=grad_x, + probs=grad_probs, + handle=handle, + pad_multiple=ctx.pad_multiple, + num_dispatched_tokens=ctx.num_dispatched_tokens, + ) + return combined_hidden, None, combined_probs, None, None, None, None, None, None, None + + +class HybridEPCombine(torch.autograd.Function): + ''' + Fused combine operation for permute + combine a2a + permute using the HybridEP backend + ''' + + @staticmethod + def forward( + ctx, x, handle, num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None + ): + ''' + Forward pass of fused combine of the HybridEP backend + ''' + combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute( + hidden=x, + handle=handle, + pad_multiple=pad_multiple, + num_dispatched_tokens=num_dispatched_tokens, + ) + ctx.handle = handle + ctx.pad_multiple = pad_multiple + ctx.num_dispatched_tokens = num_dispatched_tokens + ctx.num_permuted_tokens = num_permuted_tokens + return combined_hidden + + @staticmethod + def backward(ctx, grad_x): + ''' + Backward pass of fused combine of the HybridEP backend + ''' + handle = ctx.handle + dispatched_hidden, _, _, _, _ = _hybrid_ep_buffer.dispatch_with_permute( + hidden=grad_x, + scaling_factor=None, + handle=handle, + pad_multiple=ctx.pad_multiple, + num_dispatched_tokens=ctx.num_dispatched_tokens, + num_permuted_tokens=ctx.num_permuted_tokens, + ) + return dispatched_hidden, None, None, None, None + + +if HAVE_HYBRIDEP: + + def hybrid_ep_dispatch( + x, + routing_map, + probs, + group, + num_local_experts, + num_sms_dispatch_api=24, + num_sms_combine_api=24, + num_dispatched_tokens=None, + num_permuted_tokens=None, + pad_multiple=None, + ): + ''' + Perform fused dispatch for "permute + dispatch a2a + permute" using the + HybridEP backend. + + Args: + x (torch.Tensor): + Input hidden states to dispatch. + routing_map (torch.Tensor): + Map indicating which expert each token is routed to. + probs (torch.Tensor): + Routing probabilities for each token-expert pair. + group (torch.distributed.ProcessGroup): + Process group used for communication. + num_local_experts (int): + Number of local experts. + num_sms_dispatch_api (int): + Number of SMs used by the dispatch API. + num_sms_combine_api (int): + Number of SMs used by the combine API. + num_dispatched_tokens (int): + Number of tokens after dispatch but before permute. HybridEP uses this + to allocate buffers. If not provided, HybridEP obtains the size from + a GPU tensor, which causes a D2H synchronization. + num_permuted_tokens (int): + Number of tokens after permute. HybridEP uses this to allocate buffers. + If not provided, HybridEP obtains the size from a GPU tensor, + which causes a D2H synchronization. + pad_multiple (int): + Alignment multiple required for FP8 GEMM. If not provided, no padding + is performed. + ''' + return HybridEPDispatch.apply( + x, + routing_map, + probs, + group, + num_local_experts, + num_sms_dispatch_api, + num_sms_combine_api, + num_dispatched_tokens, + num_permuted_tokens, + pad_multiple, + ) + + def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple): + ''' + Perform fused combine operation for unpermute + combine a2a + unpermute + using the HybridEP backend + + args: + x (torch.Tensor): + Input hidden states to combine + handle (EventHandle): + Communication handle from dispatch operation + num_dispatched_tokens (int): + The number of tokens after unpermute but before combine. HybridEP uses this + to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, + which causes a D2H synchronization. + num_permuted_tokens (int): The number of tokens before unpermute. HybridEP uses this + to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, + which causes a D2H synchronization. + pad_multiple (int): + The alignment multiple required for FP8 GEMM. If not provided, no padding + is performed. + ''' + return HybridEPCombine.apply( + x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple + ) + +else: + hybrid_ep_dispatch = None + hybrid_ep_combine = None diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 142aa74a19e..46f94ebe79a 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -20,6 +20,8 @@ from megatron.core.transformer.moe.fused_a2a import ( fused_combine, fused_dispatch, + hybrid_ep_combine, + hybrid_ep_dispatch, set_deepep_num_sms, ) from megatron.core.transformer.moe.moe_utils import ( @@ -899,11 +901,6 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: """Combine the hidden_states after expert processing.""" pass - @abstractmethod - def get_dispached_metadata(self) -> torch.Tensor: - """Get the metadata of the dispatched hidden_states.""" - pass - @abstractmethod def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: """Get the permuted hidden states by instances.""" @@ -915,6 +912,161 @@ def get_restored_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> pass +class _HybridEPManager(_DispatchManager): + """ + A manager class to handle fused all-to-all communication processes for MoE models using + HybridEP backend. See https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep for more details. + + The workflow of the HybridEP dispatcher is: + (1) setup_metadata(): Process routing map and probabilities to prepare dispatch metadata + (2) dispatch(): + - Permute tokens for communication, perform all-to-all communication, + and permute tokens for experts in single step + (3) combine(): + - Unpermute tokens for communication, perform all-to-all communication, + and unpermute tokens for attention in single step + """ + + def __init__( + self, + group: torch.distributed.ProcessGroup, + num_local_experts: int, + num_experts: int, + config: TransformerConfig, + ): + """ + Initialize the HybridEP dispatcher. + + Args: + group (torch.distributed.ProcessGroup): The process group to use for communication. + This should be the ETPxEP group. + num_local_experts (int): The number of local experts. + num_experts (int): The total number of experts in the group. + config (TransformerConfig): The configuration for the transformer model. + """ + self.group = group + self.num_local_experts = num_local_experts + self.num_experts = num_experts + self.config = config + self.permute_fusion = config.moe_permute_fusion + self.capacity_factor = config.moe_expert_capacity_factor + # Drop and pad the input to capacity. + self.drop_and_pad = self.config.moe_pad_expert_input_to_capacity + if self.drop_and_pad: + assert self.capacity_factor is not None + self.capacity = None + # The up-bound for the number of tokens after dispatch op, -1 means no up-bound, + # which will cause a CPU sync + self.num_dispatched_tokens = None + # Actually the sum of tokens_per_expert, the up-bound for the number of tokens + # after permute op, -1 means no up-bound, will cause a CPU sync + self.num_permuted_tokens = None + + # Metadata + self.token_probs: Optional[torch.Tensor] = None + # Handle used for combine operation + self.handle = None + # Used for padding the output for each expert + self.pad_multiple = None + + if hybrid_ep_dispatch is None: + raise ImportError( + "HybridEP is not installed. Please install HybridEP package from " + "https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep." + ) + + def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): + num_tokens = routing_map.shape[0] + self.routing_map = routing_map.reshape(num_tokens, self.num_experts) + self.token_probs = probs.reshape(num_tokens, self.num_experts) + # Compute the capacity for each expert at the drop_and_pad mode + if self.drop_and_pad: + num_out_tokens = num_tokens * self.config.moe_router_topk + # Drop and pad the input to capacity. + self.capacity = get_capacity( + num_tokens=num_out_tokens, + num_experts=self.num_experts, + capacity_factor=self.capacity_factor, + ) + # We cannot predict the actual number of tokens after the dispatch op, + # so we set it to the worst case in drop_and_pad mode + self.num_dispatched_tokens = self.capacity * self.group.size() * self.num_local_experts + # In drop_and_pad mode, the number of tokens after the permute op + # can be computed on the CPU + self.num_permuted_tokens = self.num_dispatched_tokens + self.tokens_per_expert = torch.full( + (self.num_local_experts,), self.capacity * self.group.size(), dtype=torch.long + ) + + def dispatch( + self, + hidden_states: torch.Tensor, + async_finish: bool = True, + allocate_on_comm_stream: bool = True, + ) -> torch.Tensor: + # HybridEP only supports float32 probs + if self.token_probs.dtype != torch.float32: + if self.token_probs.dtype in [torch.bfloat16, torch.float16]: + logger.warning( + "HybridEP only supports float32 probs, please set --moe-router-dtype=fp32" + ) + self.token_probs = self.token_probs.float() # downcast or upcast + if self.config.fp8: + self.pad_multiple = get_fp8_align_size(self.config.fp8_recipe) + dispatched_hidden, self.dispatched_probs, _, tokens_per_expert, self.handle = ( + hybrid_ep_dispatch( + x=hidden_states, + routing_map=self.routing_map, + probs=self.token_probs, + group=self.group, + num_local_experts=self.num_local_experts, + num_sms_dispatch_api=self.config.moe_hybridep_num_sms, + num_sms_combine_api=self.config.moe_hybridep_num_sms, + num_dispatched_tokens=self.num_dispatched_tokens, + num_permuted_tokens=self.num_permuted_tokens, + pad_multiple=self.pad_multiple, + ) + ) + + if not self.drop_and_pad: + self.tokens_per_expert = tokens_per_expert + # self.num_permuted_tokens is necessary to allocate the output tensor for permute + self.num_permuted_tokens = self.tokens_per_expert.sum() + + return dispatched_hidden + + def combine( + self, + hidden_states: torch.Tensor, + async_finish: bool = True, + allocate_on_comm_stream: bool = True, + ) -> torch.Tensor: + hidden_states = hybrid_ep_combine( + x=hidden_states, + handle=self.handle, + num_dispatched_tokens=self.num_dispatched_tokens, + num_permuted_tokens=self.num_permuted_tokens, + pad_multiple=self.pad_multiple, + ) + # Release the used handle/num_permuted_tokens which could change in each iteration + self.handle = None + self.num_permuted_tokens = None + self.num_dispatched_tokens = None + return hidden_states + + def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: + return hidden_states, self.dispatched_probs + + def get_restored_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: + return hidden_states + + def get_number_of_tokens_per_expert(self) -> torch.Tensor: + ''' + Get the number of tokens per expert. + ''' + return self.tokens_per_expert + + class _DeepepManager(_DispatchManager): """ A manager class to handle fused all-to-all communication processes for MoE models using @@ -1000,7 +1152,7 @@ def dispatch( # DeepEP only supports float32 probs if self.token_probs.dtype != torch.float32: if self.token_probs.dtype in [torch.bfloat16, torch.float16]: - logger.info( + logger.warning( "DeepEP only supports float32 probs, please set --moe-router-dtype=fp32" ) self.token_probs = self.token_probs.float() # downcast or upcast @@ -1052,9 +1204,6 @@ def _indices_to_multihot(self, indices, probs): multihot_probs[row_indices, valid_indices] = probs[mask] return multihot_routing_map.bool(), multihot_probs - def get_dispached_metadata(self) -> torch.Tensor: - return self.dispatched_indices, self.dispatched_probs - def get_number_of_tokens_per_expert(self) -> torch.Tensor: """ Get the number of tokens per expert. @@ -1183,19 +1332,27 @@ def __init__( self.num_local_experts = num_local_experts self.local_expert_indices = local_expert_indices assert self.tp_size * self.ep_size > 1, "Flex token dispatcher requires TPxEP > 1" - assert ( - self.config.moe_enable_deepep - ), "DeepEP is not enabled. Please set --moe-enable-deepep to use DeepEP backend." - assert ( - self.config.moe_pad_expert_input_to_capacity is False - ), "Flex token dispatcher does not support --moe-pad-expert-input-to-capacity" - self._comm_manager = _DeepepManager( - group=self.tp_ep_group, - num_local_experts=self.num_local_experts, - router_topk=self.tp_size * self.config.moe_router_topk, - num_experts=self.tp_size * self.config.num_moe_experts, - config=self.config, - ) + if self.config.moe_flex_dispatcher_backend == "deepep": + self._comm_manager = _DeepepManager( + group=self.tp_ep_group, + num_local_experts=self.num_local_experts, + router_topk=self.tp_size * self.config.moe_router_topk, + num_experts=self.tp_size * self.config.num_moe_experts, + config=self.config, + ) + elif self.config.moe_flex_dispatcher_backend == "hybridep": + self._comm_manager = _HybridEPManager( + group=self.tp_ep_group, + num_local_experts=self.num_local_experts, + num_experts=self.tp_size * self.config.num_moe_experts, + config=self.config, + ) + else: + raise ValueError( + f"Invalid backend: {self.config.moe_flex_dispatcher_backend}" + "Please set --moe-flex-dispatcher-backend=deepep or " + "--moe-flex-dispatcher-backend=hybridep" + ) def set_shared_experts(self, shared_experts): raise NotImplementedError( diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 8b36425ca2a..a597470e6dc 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -215,6 +215,10 @@ class TransformerConfig(ModelParallelConfig): moe_deepep_num_sms: int = 20 """Number of SMs to use for DeepEP.""" + moe_hybridep_num_sms: int = 16 + """Number of SMs to use for HybridEP. In pure NVL scenarios, + 16 SMs can generally achieve good bandwidth.""" + #################### # linear attention #################### @@ -590,6 +594,11 @@ class TransformerConfig(ModelParallelConfig): moe_enable_deepep: bool = False """[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models.""" + moe_flex_dispatcher_backend: str = "deepep" + """[Experimental] The backend to use for flex token dispatcher. The default is "deepep". + Options are "deepep" and "hybridep". Currently only "hybridep" backend supports + the MNNVL case.""" + moe_per_layer_logging: bool = False """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" @@ -892,11 +901,18 @@ def __post_init__(self): if self.moe_enable_deepep: if self.moe_token_dispatcher_type != "flex": raise ValueError("DeepEP backend is only supported with flex token dispatcher.") + logging.warning( + "moe_enable_deepep is deprecated." + "Please use --moe-flex-dispatcher-backend=deepep instead." + ) if self.moe_token_dispatcher_type == "flex": - if self.moe_pad_expert_input_to_capacity: + if self.moe_pad_expert_input_to_capacity and ( + self.moe_enable_deepep or self.moe_flex_dispatcher_backend == "deepep" + ): raise ValueError( - "Flex token dispatcher does not support moe_pad_expert_input_to_capacity" + "Flex token dispatcher with deepep backend does not support " + "moe_pad_expert_input_to_capacity" ) if self.moe_shared_expert_intermediate_size is not None: diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index fa9a0f6d751..fe7add9bd21 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -3166,9 +3166,15 @@ def _add_moe_args(parser): default='allgather', help="The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather', 'alltoall'. We recommend using 'alltoall' when applying expert parallelism. For more information, please refer to the documentation in core/moe/README.") group.add_argument('--moe-enable-deepep', action='store_true', - help='[Experimental] Enable DeepSeek/DeepEP for efficient token dispatching and combine in MoE models. Only works with flex token dispatcher by setting --moe-token-dispatcher-type=flex.') + help='DEPRECATED: Please use --moe-flex-dispatcher-backend=deepep instead.') + group.add_argument('--moe-flex-dispatcher-backend', type=str, + choices=['deepep', 'hybridep'], + default='deepep', + help='The backend to use for flex token dispatcher. The default is "deepep". Options are "deepep" and "hybridep".') group.add_argument('--moe-deepep-num-sms', type=int, default=20, help='Number of SMs to use for DeepEP.') + group.add_argument('--moe-hybridep-num-sms', type=int, default=16, + help='Number of SMs to use for HybridEP.') group.add_argument('--moe-permute-fusion', action='store_true', help='Fuse token rearrangement ops during token dispatching.') # Token dropping arguments diff --git a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py index 2dd0f20fe2c..81e61a3404a 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py @@ -96,7 +96,7 @@ def test_1f1b_schedule_model_chunk(self, mtp_layers, dispatcher_type, fp8_flag, # create TransformerConfig extra_kwargs = {"moe_token_dispatcher_type": dispatcher_type} if dispatcher_type == "flex": - extra_kwargs["moe_enable_deepep"] = True + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" extra_kwargs["moe_router_dtype"] = "fp32" if fp8_flag is not None: extra_kwargs["fp8"] = fp8_flag[0] diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 729a6e0f2f5..f39a10c5bf3 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -362,7 +362,7 @@ def test_transformer_layer_overlap(self, dispatcher_type, fp8_flag): extra_kwargs = {"moe_token_dispatcher_type": dispatcher_type} if dispatcher_type == "flex": - extra_kwargs["moe_enable_deepep"] = True + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" extra_kwargs["moe_router_dtype"] = "fp32" if fp8_flag is not None: extra_kwargs["fp8"] = fp8_flag[0] @@ -415,7 +415,7 @@ def test_mtp_layer_overlap(self, dispatcher_type, fp8_flag): "mtp_loss_scaling_factor": 1.1, } if dispatcher_type == "flex": - extra_kwargs["moe_enable_deepep"] = True + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" extra_kwargs["moe_router_dtype"] = "fp32" if fp8_flag is not None: extra_kwargs["fp8_recipe"] = fp8_flag[1] diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 82138bc637d..c2462ef73ad 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -91,7 +91,7 @@ def __init__( sequence_parallel=tp_size > 1, add_bias_linear=kwargs.get("add_bias_linear", False), moe_permute_fusion=kwargs.get("moe_permute_fusion", False), - moe_enable_deepep=kwargs.get("moe_enable_deepep", False), + moe_flex_dispatcher_backend=kwargs.get("moe_flex_dispatcher_backend", None), ) # init moe layer @@ -411,7 +411,16 @@ def is_deep_ep_available(): return HAVE_DEEP_EP -@pytest.mark.skipif(not is_deep_ep_available(), reason="Deep EP is not available") +def is_hybrid_ep_available(): + from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP + + return HAVE_HYBRIDEP + + +@pytest.mark.skipif( + not is_deep_ep_available() and not is_hybrid_ep_available(), + reason="Deep EP and Hybrid EP are not available", +) class TestFlexDispatcher: def setup_method(self, method): pass @@ -421,9 +430,14 @@ def teardown_method(self, method): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.internal - @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4)]) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)]) @pytest.mark.parametrize("permute_fusion", permute_fusion_params) - def test_forward_backward(self, tp_size, ep_size, permute_fusion): + @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"]) + def test_forward_backward(self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend): + if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available(): + pytest.skip("Deep EP is not available") + if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available(): + pytest.skip("Hybrid EP is not available") if permute_fusion: config.ENABLE_EXPERIMENTAL = True container = MoEModelTestContainer( @@ -435,8 +449,8 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion): moe_router_load_balancing_type="aux_loss", moe_token_dispatcher_type="flex", moe_permute_fusion=permute_fusion, - hidden_size=32, - moe_enable_deepep=True, + hidden_size=1024, + moe_flex_dispatcher_backend=moe_flex_dispatcher_backend, test_dtype=torch.bfloat16, ) container.dispatcher_dropless_test() @@ -448,7 +462,14 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion): @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)]) @pytest.mark.parametrize("permute_fusion", permute_fusion_params) - def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion): + @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"]) + def test_capacity_forward_backward( + self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend + ): + if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available(): + pytest.skip("Deep EP is not available") + if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available(): + pytest.skip("Hybrid EP is not available") if permute_fusion: config.ENABLE_EXPERIMENTAL = True container = MoEModelTestContainer( @@ -463,8 +484,8 @@ def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion): moe_expert_capacity_factor=0.5, moe_pad_expert_input_to_capacity=False, moe_permute_fusion=permute_fusion, - hidden_size=32, - moe_enable_deepep=True, + hidden_size=1024, + moe_flex_dispatcher_backend=moe_flex_dispatcher_backend, test_dtype=torch.bfloat16, ) container.dispatcher_capacity_test() @@ -478,7 +499,14 @@ def test_capacity_forward_backward(self, tp_size, ep_size, permute_fusion): @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)]) @pytest.mark.parametrize("permute_fusion", [True]) - def test_router_padding_for_fp8_forward_backward(self, tp_size, ep_size, permute_fusion): + @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"]) + def test_router_padding_for_fp8_forward_backward( + self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend + ): + if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available(): + pytest.skip("Deep EP is not available") + if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available(): + pytest.skip("Hybrid EP is not available") if permute_fusion: config.ENABLE_EXPERIMENTAL = True container = MoEModelTestContainer( @@ -491,8 +519,8 @@ def test_router_padding_for_fp8_forward_backward(self, tp_size, ep_size, permute moe_token_dispatcher_type="flex", moe_pad_expert_input_to_capacity=False, moe_permute_fusion=permute_fusion, - hidden_size=32, - moe_enable_deepep=True, + hidden_size=1024, + moe_flex_dispatcher_backend=moe_flex_dispatcher_backend, test_dtype=torch.bfloat16, ) container.dispatcher_router_padding_for_fp8_test() diff --git a/tests/unit_tests/transformer/test_submodule_callables.py b/tests/unit_tests/transformer/test_submodule_callables.py index d0f5ad12d3f..141982a17cf 100644 --- a/tests/unit_tests/transformer/test_submodule_callables.py +++ b/tests/unit_tests/transformer/test_submodule_callables.py @@ -137,7 +137,7 @@ def test_1f1b_overlap(self, dispatcher_type, grouped_gemm, permute_fusion): "moe_permute_fusion": permute_fusion, } if dispatcher_type == "flex": - extra_kwargs["moe_enable_deepep"] = True + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" extra_kwargs["moe_router_dtype"] = "fp32" config = get_test_config(extra_kwargs=extra_kwargs, moe_grouped_gemm=grouped_gemm) microbatches = 4 From 2e565067015c92f58c217c5a6c2cc54271ce3a2e Mon Sep 17 00:00:00 2001 From: Shifang Xu Date: Wed, 22 Oct 2025 05:09:18 -0700 Subject: [PATCH 034/248] ADLR/megatron-lm!4235 - [dev] Support multimodule communication Co-authored-by: Mcore Bot --- .../pipeline_parallel/bridge_communicator.py | 3 - .../multimodule_communicator.py | 523 ++++++++++++ .../test_multimodule_communicator.py | 780 ++++++++++++++++++ 3 files changed, 1303 insertions(+), 3 deletions(-) create mode 100644 megatron/core/pipeline_parallel/multimodule_communicator.py create mode 100644 tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py diff --git a/megatron/core/pipeline_parallel/bridge_communicator.py b/megatron/core/pipeline_parallel/bridge_communicator.py index a67ded6bf08..f1e74a2f16d 100644 --- a/megatron/core/pipeline_parallel/bridge_communicator.py +++ b/megatron/core/pipeline_parallel/bridge_communicator.py @@ -628,9 +628,6 @@ def send_forward_recv_backward( dist.broadcast( shape_tensor, src=self.current_rank, group=self.src_grid_broadcast_pg ) - dist.broadcast( - shape_tensor, src=self.current_rank, group=self.src_grid_broadcast_pg - ) # Broadcast the tensors to all ranks in the group dist.broadcast( diff --git a/megatron/core/pipeline_parallel/multimodule_communicator.py b/megatron/core/pipeline_parallel/multimodule_communicator.py new file mode 100644 index 00000000000..dfda270ef76 --- /dev/null +++ b/megatron/core/pipeline_parallel/multimodule_communicator.py @@ -0,0 +1,523 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.distributed as dist + +from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.pipeline_parallel.bridge_communicator import BridgeCommunicator +from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator + +# Types +Shape = Union[List[int], torch.Size] + + +@dataclass +class RankModuleInfo: + """Information about a rank in a module.""" + + # the stage of the current rank in the current module's pipeline. + pp_rank: int # the stage of the current rank in the current module's pipeline + pp_size: int # the number of ranks in the current module's pipeline + p2p_communicator: Optional[P2PCommunicator] + # key is either the src or dst module name connected to the current module + # one module may have multiple bridge communicators if it has multiple + # incoming or outgoing connections. + bridge_comms_as_src_module: Optional[List[BridgeCommunicator]] + bridge_comms_as_dest_module: Optional[List[BridgeCommunicator]] + # the absolute first stage in the overall model + # no incoming connections + is_source_stage: Optional[bool] = True + # the absolute last stage in the overall model + # no outgoing connections + is_terminal_stage: Optional[bool] = True + + +class MultiModulePipelineCommunicator: + """Communicator for a multi-module pipeline.""" + + def __init__( + self, + module_to_grid_map: Dict[str, HyperCommGrid], + topology: Dict[str, List[str]], + config: ModelParallelConfig, + dim_mapping: Dict[str, List[int]] = None, + ): + """ + Initialize the MultiModulePipelineCommunicator. + + Args: + module_to_grid_map (dict): A dictionary mapping module names to HyperCommGrids. + Example: + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid + } + topology (dict): A dictionary mapping module names to lists of outgoing modules. + Example: + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [] + } + config (ModelParallelConfig): A ModelParallelConfig object. + dim_mapping (Dict[str, List[int]]): Dimension mapping for sequence, batch, hidden. + Example: + dim_mapping = {'s': 0, 'h': 2, 'b': 1} + Default: None + """ + self.module_to_grid_map = module_to_grid_map + self.topology = topology + self.config = config + self.dim_mapping = dim_mapping + self.current_rank = dist.get_rank() + + # Build bridge communicators for all modules + self.bridge_comms = [] + self._build_bridge_comms() + + self.rank_module_map = {} + self._build_rank_module_info_map() + + def _build_bridge_comms(self): + """Construct and store BridgeCommunicator objects that describe the outgoing + communication relationships for all of the modules. + """ + for src_module_name, src_grid in self.module_to_grid_map.items(): + for dest_module_name in self.topology[src_module_name]: + dest_grid = self.module_to_grid_map[dest_module_name] + bridge_comm = BridgeCommunicator( + src_grid=src_grid, + dest_grid=dest_grid, + dim_mapping=self.dim_mapping, + comm_dtype=self.config.pipeline_dtype, + src_module_name=src_module_name, + dest_module_name=dest_module_name, + ) + self.bridge_comms.append(bridge_comm) + + @property + def is_pp_first_stage(self): + """Return True if the current rank has the absolute first stage in the overall model. + + The absolute first stage is defined as: + 1. The current rank must be in the first PP stage (pp_rank == 0) of some module + 2. That module must be a source module (no incoming connections in topology) + """ + for module_name, rank_module_info in self.rank_module_map.items(): + # Check if this rank is at the first PP stage of this module + if rank_module_info.pp_rank == 0: + # Check if this module is a source module (no incoming connections) + if self._is_source_module(module_name): + return True + return False + + @property + def is_pp_last_stage(self): + """Return True if the current rank has the absolute last stage in the overall model. + + The absolute last stage is defined as: + 1. The current rank must be in the last PP stage of some module + 2. That module must be a sink module (no outgoing connections in topology) + """ + for module_name, rank_module_info in self.rank_module_map.items(): + # Check if this rank is at the last PP stage of this module + if rank_module_info.pp_rank == rank_module_info.pp_size - 1: + # Check if this module is a sink module (no outgoing connections) + if self._is_sink_module(module_name): + return True + return False + + def _is_source_module(self, module_name: str) -> bool: + """Check if a module is a source module (has no incoming connections).""" + # A module is a source if no other module lists it as a destination + for src_module, dest_modules in self.topology.items(): + if module_name in dest_modules: + return False + return True + + def _is_sink_module(self, module_name: str) -> bool: + """Check if a module is a sink module (has no outgoing connections).""" + return len(self.topology.get(module_name, [])) == 0 + + def is_current_rank_in_grid(self, grid: HyperCommGrid) -> bool: + """Check if the current rank is in the grid.""" + return grid.rank_offset <= self.current_rank < grid.rank_offset + grid.size + + @property + def num_warmup_microbatches(self): + """Calculate the number of warmup microbatches for the current rank. + + Uses the same simple logic as P2PCommunicator: + total_pipeline_stages - current_rank_stage - 1 + + Returns: + int: Number of warmup microbatches for this rank + """ + # Get total pipeline depth across all modules + total_stages = self.compute_total_pipeline_stages(self.topology, self.module_to_grid_map) + + # Get current rank's position in the overall pipeline (0-indexed) + # Use compute_total_pipeline_stages with current rank to get cumulative position + if self.rank_module_map: + # Take the first module this rank belongs to + # TODO: ykarnati - improve this logic. + module_name = next(iter(self.rank_module_map.keys())) + current_stage = ( + self.compute_total_pipeline_stages( + self.topology, + self.module_to_grid_map, + rank=self.current_rank, + module_name=module_name, + ) + - 1 + ) # Convert from 1-indexed to 0-indexed + else: + current_stage = 0 + + assert ( + current_stage <= total_stages + ), f"current_stage: {current_stage} is greater than total_stages: {total_stages}" + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"current_stage: {current_stage} total_stages: {total_stages} " + f"num_warmup_microbatches: {total_stages - current_stage - 1}" + ) + return total_stages - current_stage - 1 + + def _build_rank_module_info_map(self): + """For each module in the current rank, initialize the P2P communicator + and build the bridge communicator info for the module. + Each rank may hold multiple modules when colocated. + """ + for module_name, module_grid in self.module_to_grid_map.items(): + if self.is_current_rank_in_grid(module_grid): + # Initialize P2P communicator + pp_group = module_grid.get_pg('pp') + p2p_comm = P2PCommunicator(pp_group, self.config) + pp_size = dist.get_world_size(pp_group) + rank_in_pp_group = dist.get_group_rank(pp_group, self.current_rank) + pp_rank = rank_in_pp_group % pp_size + + bridge_comms_as_dest_module = [] + bridge_comms_as_src_module = [] + # If first stage, check if the module has any incoming modules + # If so, initialize bridge communicator + if pp_rank == 0: + for bridge_comm in self.bridge_comms: + if ( + bridge_comm.is_current_rank_in_grid(bridge_comm.dest_grid) + and bridge_comm.dest_module_name == module_name + ): + bridge_comms_as_dest_module.append(bridge_comm) + # If last stage, check if the module has any outgoing modules + # If so, initialize bridge communicator + if pp_rank == pp_size - 1: + for bridge_comm in self.bridge_comms: + if ( + bridge_comm.is_current_rank_in_grid(bridge_comm.src_grid) + and bridge_comm.src_module_name == module_name + ): + bridge_comms_as_src_module.append(bridge_comm) + # Build RankModuleInfo for the module + rank_module_info = RankModuleInfo( + pp_rank=pp_rank, + pp_size=pp_size, + p2p_communicator=p2p_comm, + bridge_comms_as_dest_module=bridge_comms_as_dest_module, + bridge_comms_as_src_module=bridge_comms_as_src_module, + ) + self.rank_module_map[module_name] = rank_module_info + + def recv_forward( + self, tensor_shape: Optional[Shape] = None, is_first_stage: bool = False + ) -> Dict[str, torch.Tensor]: + """Receive forward activation tensor. + + Args: + tensor_shape: Expected activation tensor shape + + Returns: + A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[receive_forward] tensors_shape: {tensor_shape}, is_first_stage: {is_first_stage}" + ) + input_dict = {} + for module_name, rank_module_info in self.rank_module_map.items(): + + if rank_module_info.pp_rank == 0: + # If first stage, and has incoming modules, receive forward activation + # from incoming modules. + for bridge_comm in rank_module_info.bridge_comms_as_dest_module: + input_dict[bridge_comm.src_module_name] = bridge_comm.recv_forward() + else: + # If not first stage, receive forward activation tensor from P2P communicator. + input_dict[module_name] = rank_module_info.p2p_communicator.recv_forward( + tensor_shapes=tensor_shape, is_first_stage=False + ) + return input_dict + + def send_forward(self, output_dict: Dict[str, torch.Tensor], is_last_stage: bool = False): + """Send forward activation tensor. + + Args: + output_dict: A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[send_forward] output_dict keys: {output_dict.keys()}, is_last_stage: {is_last_stage}" + ) + for module_name, rank_module_info in self.rank_module_map.items(): + if rank_module_info.pp_rank == rank_module_info.pp_size - 1: + # If last stage, and has outgoing modules, send forward activation + # by using bridge communicator. + for bridge_comm in rank_module_info.bridge_comms_as_src_module: + bridge_comm.send_forward(output_dict[module_name]) + else: + # If not last stage, send forward activation by using P2P communicator. + rank_module_info.p2p_communicator.send_forward( + output_dict[module_name], is_last_stage=False + ) + + def send_forward_recv_backward( + self, + output_dict: Dict[str, torch.Tensor], + tensor_shape: Optional[Shape] = None, + is_last_stage: bool = False, + ) -> Dict[str, torch.Tensor]: + """Send forward activation tensor and receive backward activation tensor. + + Args: + output_dict: A dictionary mapping module names to tensors. + tensor_shape: Expected gradient tensor shape + + Returns: + A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[send_forward_recv_backward] output_dict keys: {output_dict.keys()}, " + f"tensor_shape: {tensor_shape}, is_last_stage: {is_last_stage}" + ) + grad_dict = {} + for module_name, rank_module_info in self.rank_module_map.items(): + if rank_module_info.pp_rank == rank_module_info.pp_size - 1: + # If last stage, and has outgoing modules, send forward activation and + # receive backward gradient by using bridge communicator. + for bridge_comm in rank_module_info.bridge_comms_as_src_module: + grad_dict[bridge_comm.src_module_name] = bridge_comm.send_forward_recv_backward( + output_dict[module_name] + ) + else: + # If not last stage, send forward activation and receive backward gradient + # by using P2P communicator. + grad_dict[module_name] = ( + rank_module_info.p2p_communicator.send_forward_recv_backward( + output_dict[module_name], tensor_shapes=tensor_shape, is_last_stage=False + ) + ) + return grad_dict + + def send_backward_recv_forward( + self, + grad_dict: Dict[str, torch.Tensor], + tensor_shape: Optional[Shape] = None, + is_first_stage: bool = False, + ) -> Dict[str, torch.Tensor]: + """Send backward activation tensor and receive forward activation tensor. + + Args: + grad_dict: A dictionary mapping module names to tensors. + tensor_shape: Expected gradient tensor shape + + Returns: + A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[send_backward_recv_forward] grad_dict keys: {grad_dict.keys()}, " + f"tensor_shape: {tensor_shape}, is_first_stage: {is_first_stage}" + ) + input_dict = {} + for module_name, rank_module_info in self.rank_module_map.items(): + if rank_module_info.pp_rank == 0: + for bridge_comm in rank_module_info.bridge_comms_as_dest_module: + # If first stage, and has incoming modules, send backward gradient and + # receive forward activation by using bridge communicator. + input_dict[bridge_comm.src_module_name] = ( + bridge_comm.send_backward_recv_forward( + grad_dict[bridge_comm.src_module_name] + ) + ) + else: + # If not first stage, send backward gradient and receive forward activation + # by using P2P communicator. + input_dict[module_name] = ( + rank_module_info.p2p_communicator.send_backward_recv_forward( + grad_dict[module_name], tensor_shapes=tensor_shape, is_first_stage=False + ) + ) + return input_dict + + def recv_backward( + self, tensor_shape: Optional[Shape] = None, is_last_stage: bool = False + ) -> Dict[str, torch.Tensor]: + """Receive backward activation tensor. + + Args: + tensor_shape: Expected gradient tensor shape + + Returns: + A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[recv_backward] tensor_shape: {tensor_shape}, is_last_stage: {is_last_stage}" + ) + grad_dict = {} + for module_name, rank_module_info in self.rank_module_map.items(): + if rank_module_info.pp_rank == rank_module_info.pp_size - 1: + # If last stage, and has incoming modules, receive backward gradient + # by using bridge communicator. + for bridge_comm in rank_module_info.bridge_comms_as_src_module: + grad_dict[bridge_comm.src_module_name] = bridge_comm.recv_backward() + else: + # If not last stage, receive backward gradient by using P2P communicator. + grad_dict[module_name] = rank_module_info.p2p_communicator.recv_backward( + tensor_shapes=tensor_shape, is_last_stage=False + ) + return grad_dict + + def send_backward(self, grad_dict: Dict[str, torch.Tensor], is_first_stage: bool = False): + """Send backward activation tensor. + + Args: + grad_dict: A dictionary mapping module names to tensors. + """ + logging.debug( + f"[Rank {dist.get_rank()} ][MultiModulePipelineCommunicator] " + f"[send_backward] grad_dict keys: {grad_dict.keys()}, is_first_stage: {is_first_stage}" + ) + for module_name, rank_module_info in self.rank_module_map.items(): + if rank_module_info.pp_rank == 0: + # If first stage, and has incoming modules, send backward activation + # by using bridge communicator. + for bridge_comm in rank_module_info.bridge_comms_as_dest_module: + bridge_comm.send_backward(grad_dict[bridge_comm.src_module_name]) + else: + # If not first stage, send backward activation by using P2P communicator. + rank_module_info.p2p_communicator.send_backward( + grad_dict[module_name], is_first_stage=False + ) + + @staticmethod + def compute_total_pipeline_stages( + topology: Dict[str, List[str]], + module_to_grid_map: Dict[str, HyperCommGrid], + rank: Optional[int] = None, + module_name: Optional[str] = None, + ) -> int: + """Compute the total number of pipeline stages across a multi-module chain. + + Interprets ``topology`` as a directed acyclic graph (DAG) where nodes are modules + and edges indicate forward data flow from source to destination modules. Each node + is assigned a weight equal to its pipeline parallel size (number of PP stages). + + The total number of stages is defined as the length of the longest path in this DAG + under node weights. + + If ``rank`` is None (default), returns the maximum over all terminal (sink) modules of + the sum of PP sizes along a path ending at that terminal. For example, given: + + image_encoder ->\ + -> llm -> generator + audio_encoder ->/ + + the total is: max(pp(image_encoder), pp(audio_encoder)) + pp(llm) + pp(generator). + + If ``rank`` is provided, the result is the total number of pipeline stages up to (and + including) the PP stage that ``rank`` occupies inside its module. In this case, the + weight of the target module equals (pp_rank_index(rank) + 1) instead of the module's + full PP size; other modules still contribute their full PP sizes. If the rank belongs to + multiple modules (colocation), pass ``module_name`` to disambiguate; otherwise the + maximum across all candidate modules containing the rank is returned. + + Args: + topology: Mapping from a module to its list of outgoing modules. + module_to_grid_map: Mapping from module name to its ``HyperCommGrid``. + + Returns: + The total number of pipeline stages along the longest path given the constraints. + + Raises: + ValueError: If the topology contains cycles; or has no terminal nodes when + ``rank`` is None + """ + nodes = set(module_to_grid_map.keys()) + # Build adjacency and reverse-adjacency (predecessors). + adj: Dict[str, List[str]] = {node: list(topology.get(node, [])) for node in nodes} + preds: Dict[str, List[str]] = {node: [] for node in nodes} + for src, outs in adj.items(): + for dst in outs: + preds[dst].append(src) + + # Identify terminal nodes (no outgoing edges) for the rank=None case. + sinks = [node for node, outs in adj.items() if not outs] + if rank is None and not sinks: + raise ValueError( + "Topology must be a DAG with at least one terminal (no outgoing) module." + ) + + def pp_size(name: str) -> int: + grid = module_to_grid_map[name] + pp_dim_index = grid.dim_names.index('pp') + return grid.shape[pp_dim_index] + + def partial_weight_for_target(target: str) -> Optional[int]: + if rank is None: + return None + grid = module_to_grid_map.get(target) + rank_groups = grid._gen_rank_enum(['pp']) + stage_index: Optional[int] = None + for group in rank_groups: + if rank in group: + stage_index = group.index(rank) + break + return stage_index + 1 + + def longest_path_to(target: str) -> int: + visiting = set() + partial = partial_weight_for_target(target) + + def weight(name: str) -> int: + if partial is not None and name == target: + return partial + return pp_size(name) + + def dfs(node: str) -> int: + if node in visiting: + raise ValueError("Topology contains cycles; expected a DAG.") + visiting.add(node) + best = 0 + for p in preds.get(node, []): + val = dfs(p) + if val > best: + best = val + visiting.remove(node) + return weight(node) + best + + return dfs(target) + + if rank is None: + return max(longest_path_to(sink) for sink in sinks) + + return longest_path_to(module_name) diff --git a/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py b/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py new file mode 100644 index 00000000000..73739859f42 --- /dev/null +++ b/tests/unit_tests/pipeline_parallel/test_multimodule_communicator.py @@ -0,0 +1,780 @@ +import logging +import os +import sys + +import pytest +import torch +import torch.distributed as dist +from packaging import version + +from megatron.core import parallel_state +from megatron.core.hyper_comm_grid import HyperCommGrid +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.pipeline_parallel.multimodule_communicator import MultiModulePipelineCommunicator +from tests.unit_tests.pipeline_parallel.test_bridge_communicator import ( + _avg_params, + _create_transformer_block, + _get_pg_collection_from_grid, + create_hypercomm_grid, + get_transformer_block_and_grid, +) +from tests.unit_tests.test_utilities import Utils + + +class TestMultiModulePipelineCommunicator: + + @classmethod + def setup_class(cls): + """Set up distributed environment for the entire test class.""" + if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + if torch.cuda.is_available(): + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + + world_size = dist.get_world_size() + if world_size != 8: + pytest.skip( + f"These tests require 8 GPUs, but only {world_size} are available.", + allow_module_level=True, + ) + + def teardown_class(cls): + Utils.destroy_model_parallel() + + def test_multimodule_communicator_init(self): + """Test MultiModulePipelineCommunicator initialization.""" + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1) + generator_grid = create_hypercomm_grid(offset=6, tp=2, cp=1, pp=1, dp=1) + + # Define module-grid mapping + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + # Define module computation topology + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [], + } + config = ModelParallelConfig(bf16=True) + # Initialize communicator + mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config) + # Test attributes match expectations + assert mllm_comm.module_to_grid_map == module_to_grid_map + assert mllm_comm.topology == topology + assert mllm_comm.config == config + assert mllm_comm.current_rank == dist.get_rank() + + def test_compute_total_pipeline_stages(self): + """Test compute_total_pipeline_stages for overall chain and until specific ranks.""" + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1) + generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2) + + # Define module-grid mapping and topology + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [], + } + + # Overall total pipeline stages: max(1,1) + 2 + 1 = 4 + total = MultiModulePipelineCommunicator.compute_total_pipeline_stages( + topology, module_to_grid_map + ) + assert total == 4 + + llm_pp_rank = MultiModulePipelineCommunicator.compute_total_pipeline_stages( + topology, module_to_grid_map, rank=2, module_name='llm' + ) + assert llm_pp_rank == 2 + + def test_send_forward_recv_forward(self): + """Test send_forward and recv_forward operations.""" + if not dist.is_initialized(): + pytest.skip("Distributed not initialized") + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1) + generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2) + + # Set up module-grid mapping and topology + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [], + } + config = ModelParallelConfig(pipeline_dtype=torch.float) + mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config) + + # Simulate forward communication for each module + if mllm_comm.is_current_rank_in_grid(image_encoder_grid): + # Image encoder sends output forward + output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()} + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(audio_encoder_grid): + # Audio encoder sends output forward + output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()} + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(llm_grid): + output_dict = {'llm': torch.randn(2, 32, 128).cuda()} + if dist.get_rank() == 2 or dist.get_rank() == 3: + # LLM stage receives both image and audio outputs + input_dict = mllm_comm.recv_forward() + assert input_dict['image_encoder'].shape == (2, 8, 128) + assert input_dict['audio_encoder'].shape == (2, 16, 128) + mllm_comm.send_forward(output_dict) + else: + # LLM stage receives concatenated LLM outputs + input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128)) + assert input_dict['llm'].shape == (2, 32, 128) + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(generator_grid): + # Generator module receives final LLM output + input_dict = mllm_comm.recv_forward() + assert input_dict['llm'].shape == (1, 32, 128) + + def test_send_forward_recv_forward_with_different_pp_size(self): + """Test for the case when pp(image_encoder) != pp(audio_encoder).""" + if not dist.is_initialized(): + pytest.skip("Distributed not initialized") + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=2, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=4, tp=1, cp=1, pp=4, dp=1) + + # Set up module-grid mapping and topology + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + } + topology = {'image_encoder': ['llm'], 'audio_encoder': ['llm'], 'llm': []} + config = ModelParallelConfig(pipeline_dtype=torch.float) + mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config) + + # Simulate forward communication for each module + if mllm_comm.is_current_rank_in_grid(image_encoder_grid): + output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()} + if dist.get_rank() == 0: + # Image encoder sends output forward + mllm_comm.send_forward(output_dict) + else: + # Image stage receives image outputs + input_dict = mllm_comm.recv_forward(tensor_shape=(2, 8, 128)) + assert input_dict['image_encoder'].shape == (2, 8, 128) + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(audio_encoder_grid): + # Audio encoder sends output forward + output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()} + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(llm_grid): + output_dict = {'llm': torch.randn(2, 32, 128).cuda()} + if dist.get_rank() == 4: + # LLM stage receives both image and audio outputs + input_dict = mllm_comm.recv_forward() + assert input_dict['image_encoder'].shape == (2, 8, 128) + assert input_dict['audio_encoder'].shape == (2, 16, 128) + mllm_comm.send_forward(output_dict) + elif dist.get_rank() == 5 or dist.get_rank() == 6: + # LLM stage receives concatenated LLM outputs + input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128)) + assert input_dict['llm'].shape == (2, 32, 128) + mllm_comm.send_forward(output_dict) + elif dist.get_rank() == 7: + # LLM stage receives concatenated LLM outputs + input_dict = mllm_comm.recv_forward(tensor_shape=(2, 32, 128)) + assert input_dict['llm'].shape == (2, 32, 128) + + def test_send_backward_recv_backward(self): + """Test send_backward and recv_backward operations.""" + if not dist.is_initialized(): + pytest.skip("Distributed not initialized") + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1) + generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2) + + # Set up module-grid mapping and topology + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [], + } + config = ModelParallelConfig(pipeline_dtype=torch.float) + mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config) + + # Simulate backward communication for each module + if mllm_comm.is_current_rank_in_grid(generator_grid): + # Generator sends gradient backward + grad_dict = {'llm': torch.randn(1, 32, 128).cuda()} + mllm_comm.send_backward(grad_dict) + if mllm_comm.is_current_rank_in_grid(llm_grid): + if dist.get_rank() == 4 or dist.get_rank() == 5: + # LLM receives expanded gradient and sends backward + received_grad = mllm_comm.recv_backward() + assert received_grad['llm'].shape == (2, 32, 128) + grad_dict = {'llm': torch.randn(2, 32, 128).cuda()} + mllm_comm.send_backward(grad_dict) + else: + # LLM receives gradient and sends backward to both image/audio encoders + received_grad = mllm_comm.recv_backward(tensor_shape=(2, 32, 128)) + assert received_grad['llm'].shape == (2, 32, 128) + grad_dict = { + 'image_encoder': torch.randn(2, 8, 128).cuda(), + 'audio_encoder': torch.randn(2, 16, 128).cuda(), + } + mllm_comm.send_backward(grad_dict) + if mllm_comm.is_current_rank_in_grid(image_encoder_grid): + # Image encoder receives its gradient + received_grad = mllm_comm.recv_backward() + assert received_grad['image_encoder'].shape == (2, 8, 128) + if mllm_comm.is_current_rank_in_grid(audio_encoder_grid): + # Audio encoder receives its gradient + received_grad = mllm_comm.recv_backward() + assert received_grad['audio_encoder'].shape == (2, 16, 128) + + @pytest.mark.skipif( + version.parse(torch.__version__) < version.parse('2.3.0'), + reason="Feature requires PyTorch 2.3 or later", + ) + def test_send_forward_recv_backward_send_backward_recv_forward(self): + """Test send_forward_recv_backward and send_backward_recv_forward operations.""" + if not dist.is_initialized(): + pytest.skip("Distributed not initialized") + + # Create process group grids for each module + image_encoder_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=1) + audio_encoder_grid = create_hypercomm_grid(offset=1, tp=1, cp=1, pp=1, dp=1) + llm_grid = create_hypercomm_grid(offset=2, tp=2, cp=1, pp=2, dp=1) + generator_grid = create_hypercomm_grid(offset=6, tp=1, cp=1, pp=1, dp=2) + + # Set up module-grid mapping and topology + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + topology = { + 'image_encoder': ['llm'], + 'audio_encoder': ['llm'], + 'llm': ['generator'], + 'generator': [], + } + config = ModelParallelConfig(pipeline_dtype=torch.float) + mllm_comm = MultiModulePipelineCommunicator(module_to_grid_map, topology, config) + + # Simulate bidirectional send/recv for forward and backward in pipeline + + # Encoder stages send forward to the first stage of LLM, and receive backward from the first stage of LLM + if mllm_comm.is_current_rank_in_grid(image_encoder_grid): + output_dict = {'image_encoder': torch.randn(2, 8, 128).cuda()} + received_grad = mllm_comm.send_forward_recv_backward(output_dict) + assert received_grad['image_encoder'].shape == (2, 8, 128) + if mllm_comm.is_current_rank_in_grid(audio_encoder_grid): + output_dict = {'audio_encoder': torch.randn(2, 16, 128).cuda()} + received_grad = mllm_comm.send_forward_recv_backward(output_dict) + assert received_grad['audio_encoder'].shape == (2, 16, 128) + if mllm_comm.is_current_rank_in_grid(llm_grid): + if dist.get_rank() == 2 or dist.get_rank() == 3: + grad_dict = { + 'image_encoder': torch.randn(2, 8, 128).cuda(), + 'audio_encoder': torch.randn(2, 16, 128).cuda(), + } + input_dict = mllm_comm.send_backward_recv_forward(grad_dict) + assert input_dict['image_encoder'].shape == (2, 8, 128) + assert input_dict['audio_encoder'].shape == (2, 16, 128) + + # First stage of LLM sends forward to the second stage of LLM, and receive backward from the second stage of LLM + if mllm_comm.is_current_rank_in_grid(llm_grid): + if dist.get_rank() == 2 or dist.get_rank() == 3: + output_dict = {'llm': torch.randn(2, 32, 128).cuda()} + received_grad = mllm_comm.send_forward_recv_backward( + output_dict, tensor_shape=(2, 32, 128) + ) + assert received_grad['llm'].shape == (2, 32, 128) + if dist.get_rank() == 4 or dist.get_rank() == 5: + grad_dict = {'llm': torch.randn(2, 32, 128).cuda()} + input_dict = mllm_comm.send_backward_recv_forward( + grad_dict, tensor_shape=(2, 32, 128) + ) + assert input_dict['llm'].shape == (2, 32, 128) + + # Second stage of LLM sends forward to generator, and receive backward from generator + if mllm_comm.is_current_rank_in_grid(llm_grid): + if dist.get_rank() == 4 or dist.get_rank() == 5: + output_dict = {'llm': torch.randn(2, 32, 128).cuda()} + received_grad = mllm_comm.send_forward_recv_backward(output_dict) + assert received_grad['llm'].shape == (2, 32, 128) + if mllm_comm.is_current_rank_in_grid(generator_grid): + grad_dict = {'llm': torch.randn(1, 32, 128).cuda()} + input_dict = mllm_comm.send_backward_recv_forward(grad_dict) + assert input_dict['llm'].shape == (1, 32, 128) + + @pytest.mark.skipif( + version.parse(torch.__version__) < version.parse('2.3.0'), + reason="Feature requires PyTorch 2.3 or later", + ) + def test_send_forward_recv_forward_with_transformer_blocks(self): + """Test send_forward and recv_forward operations.""" + + # Set model/test dimensions for easier debugging and output comparison + hidden_size = 16 + sequence_length = 2 + micro_batch_size = 2 + + # For reproducibility, set a fixed seed + torch.manual_seed(12345) + dtype = torch.float32 + + # Create random input hidden states tensor + hidden_states = torch.randn( + (sequence_length, micro_batch_size, hidden_size), device="cuda" + ).to(dtype) + current_rank = dist.get_rank() + + # ========== Initialize tensor model-parallel environment ========== + parallel_state_tp = 2 + Utils.initialize_model_parallel(tensor_model_parallel_size=2) + + # ========== Build reference 1D grid and transformer block for weight sharing ========== + ref_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=8) + ref_pg_collection = _get_pg_collection_from_grid(ref_grid) + ref_block = _create_transformer_block( + dtype=dtype, hidden_size=hidden_size, pg_collection=ref_pg_collection + ) + _avg_params( + ref_block, ref_grid.get_pg("dp") + ) # Ensure parameters are averaged across data parallel (DP) + + # ========== Create different transformer blocks for each model stage ========== + # Image encoder + image_encoder_block, image_encoder_grid = get_transformer_block_and_grid( + ref_block, + tp_size=1, + cp_size=1, + pp_size=1, + dp_size=1, + grid_offset=0, + hidden_size=hidden_size, + dtype=dtype, + ) + # Audio encoder + audio_encoder_block, audio_encoder_grid = get_transformer_block_and_grid( + ref_block, + tp_size=1, + cp_size=1, + pp_size=1, + dp_size=1, + grid_offset=1, + hidden_size=hidden_size, + dtype=dtype, + ) + # LLM (Large Language Model) block with tensor & pipeline parallelism + llm_block, llm_grid = get_transformer_block_and_grid( + ref_block, + tp_size=2, + cp_size=1, + pp_size=2, + dp_size=1, + grid_offset=2, + hidden_size=hidden_size, + dtype=dtype, + ) + # Generator block (final stage) with DP=2 + generator_block, generator_grid = get_transformer_block_and_grid( + ref_block, + tp_size=1, + cp_size=1, + pp_size=1, + dp_size=2, + grid_offset=6, + hidden_size=hidden_size, + dtype=dtype, + ) + + # ========== Define module-to-grid correspondence and pipeline topology ========== + module_to_grid_map = { + 'image_encoder': image_encoder_grid, + 'audio_encoder': audio_encoder_grid, + 'llm': llm_grid, + 'generator': generator_grid, + } + topology = { + 'image_encoder': ['llm'], # image_encoder sends output to llm + 'audio_encoder': ['llm'], # audio_encoder sends output to llm + 'llm': ['generator'], # llm sends output to generator + 'generator': [], # generator is the final module + } + config = ModelParallelConfig(pipeline_dtype=torch.float) + # Define dimension mapping for sequence, batch, hidden + dim_mapping = {'s': 0, 'h': 2, 'b': 1} + seq_dim = dim_mapping['s'] + + # Communication handler for multi-module pipeline (send/recv abstraction) + mllm_comm = MultiModulePipelineCommunicator( + module_to_grid_map, topology, config, dim_mapping=dim_mapping + ) + + # ========== Run actual distributed pipeline blocks (per process, depending on role) ========== + if mllm_comm.is_current_rank_in_grid(image_encoder_grid): + # Image encoder rank: run forward and send output + image_encoder_output = image_encoder_block( + hidden_states=hidden_states, attention_mask=None + ) + output_dict = {'image_encoder': image_encoder_output} + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(audio_encoder_grid): + # Audio encoder rank: run forward and send output + audio_encoder_output = audio_encoder_block( + hidden_states=hidden_states, attention_mask=None + ) + output_dict = {'audio_encoder': audio_encoder_output} + mllm_comm.send_forward(output_dict) + if mllm_comm.is_current_rank_in_grid(llm_grid): + if dist.get_rank() == 2 or dist.get_rank() == 3: + # LLM stage 0 (receives both image and audio, concatenates along seq_dim) + input_dict = mllm_comm.recv_forward() + llm_output = llm_block( + hidden_states=torch.cat( + [input_dict['image_encoder'], input_dict['audio_encoder']], dim=seq_dim + ), + attention_mask=None, + ) + output_dict = {'llm': llm_output} + mllm_comm.send_forward(output_dict) + else: + # LLM stage 1 (receives output of previous LLM stage) + input_dict = mllm_comm.recv_forward( + tensor_shape=(sequence_length * 2, micro_batch_size, hidden_size) + ) + llm_output = llm_block(hidden_states=input_dict['llm'], attention_mask=None) + output_dict = {'llm': llm_output} + mllm_comm.send_forward(output_dict) + + if mllm_comm.is_current_rank_in_grid(generator_grid): + # Generator block: only receives from llm and runs forward + input_dict = mllm_comm.recv_forward() + generator_output = generator_block(hidden_states=input_dict['llm'], attention_mask=None) + + # ========== Build a reference (serial/global) pipeline for correctness checking ========== + global_image_encoder_block, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + global_audio_encoder_block, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + global_llm_block_pp_rank_0, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + global_llm_block_pp_rank_1, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + global_generator_block, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + + # Run each stage sequentially as a global pipeline (for truth) + global_image_encoder_output = global_image_encoder_block( + hidden_states=hidden_states, attention_mask=None + ) + global_audio_encoder_output = global_audio_encoder_block( + hidden_states=hidden_states, attention_mask=None + ) + # Compare output between global and distributed blocks for image/audio stage + if current_rank == 0: + torch.testing.assert_close( + global_image_encoder_output, image_encoder_output, rtol=1e-3, atol=1e-3 + ) + if current_rank == 1: + torch.testing.assert_close( + global_audio_encoder_output, audio_encoder_output, rtol=1e-3, atol=1e-3 + ) + + # Feed outputs to LLM stages (emulate pipeline cut with concatenation) + global_llm_input = torch.cat( + [global_image_encoder_output, global_audio_encoder_output], dim=seq_dim + ) + global_llm_pp_rank_0_output = global_llm_block_pp_rank_0( + hidden_states=global_llm_input, attention_mask=None + ) + if current_rank == 2 or current_rank == 3: + torch.testing.assert_close( + global_llm_pp_rank_0_output, llm_output, rtol=1e-3, atol=1e-3 + ) + global_llm_pp_rank_1_output = global_llm_block_pp_rank_1( + hidden_states=global_llm_pp_rank_0_output, attention_mask=None + ) + if current_rank == 4 or current_rank == 5: + torch.testing.assert_close( + global_llm_pp_rank_1_output, llm_output, rtol=1e-3, atol=1e-3 + ) + + # Generator output and comparison to distributed output (for each DP chunk) + global_generator_block_output = global_generator_block( + hidden_states=global_llm_pp_rank_1_output, attention_mask=None + ) + global_generator_block_chunks = torch.split( + global_generator_block_output, global_generator_block_output.shape[1] // 2, dim=1 + ) + if current_rank == 6: + torch.testing.assert_close( + global_generator_block_chunks[0], generator_output, rtol=1e-3, atol=1e-3 + ) + if current_rank == 7: + torch.testing.assert_close( + global_generator_block_chunks[1], generator_output, rtol=1e-3, atol=1e-3 + ) + + @pytest.mark.skipif( + version.parse(torch.__version__) < version.parse('2.3.0'), + reason="Feature requires PyTorch 2.3 or later", + ) + @pytest.mark.parametrize( + "grid1_tp, grid1_pp, grid1_dp, grid2_tp, grid2_pp, grid2_dp, parallel_state_tp", + [ + (2, 1, 1, 2, 1, 1, 2), # TP2PP1DP1 to TP2PP1DP1 + (2, 1, 1, 2, 2, 1, 2), # TP2PP1DP1 to TP2PP2DP1 + (2, 2, 1, 2, 2, 1, 2), # TP2PP2DP1 to TP2PP2DP1 + (4, 1, 1, 4, 1, 1, 4), # TP4DP1 to TP4DP1 + (2, 1, 2, 4, 1, 1, 2), # TP2DP2 to TP4DP1 + (4, 1, 1, 2, 1, 2, 2), # TP4DP1 to TP2DP2 + (2, 1, 2, 1, 1, 4, 2), # TP2DP2 to TP1DP4 + ], + ) + def test_send_forward_recv_forward_with_transformer_blocks_and_different_parallelisms( + self, grid1_tp, grid1_pp, grid1_dp, grid2_tp, grid2_pp, grid2_dp, parallel_state_tp + ): + """Test bridge communicator with two transformer blocks having different process group configurations.""" + # Model and input configuration + hidden_size = 16 + sequence_length = 2 + micro_batch_size = 8 + torch.manual_seed(12345) + dtype = torch.float32 + + # Create random input tensor on CUDA + hidden_states = torch.randn( + (sequence_length, micro_batch_size, hidden_size), device="cuda" + ).to(dtype) + hidden_states_ref = hidden_states.clone() + current_rank = dist.get_rank() + + # Initialize model parallel with desired TP + Utils.initialize_model_parallel(tensor_model_parallel_size=parallel_state_tp) + + # Build a reference grid and block for parameter sharing & DP averaging + ref_grid = create_hypercomm_grid(offset=0, tp=1, cp=1, pp=1, dp=8) + ref_pg_collection = _get_pg_collection_from_grid(ref_grid) + ref_block = _create_transformer_block( + dtype=dtype, hidden_size=hidden_size, pg_collection=ref_pg_collection + ) + _avg_params( + ref_block, ref_grid.get_pg("dp") + ) # Synchronize parameters across DP for reproducibility + + # ====== Create two transformer block+grid pairs with different TP/DP settings ====== + block_grid_1, grid_1 = get_transformer_block_and_grid( + ref_block, + tp_size=grid1_tp, + pp_size=grid1_pp, + dp_size=grid1_dp, + grid_offset=0, + hidden_size=hidden_size, + dtype=dtype, + ) + + block_grid_2, grid_2 = get_transformer_block_and_grid( + ref_block, + tp_size=grid2_tp, + pp_size=grid2_pp, + dp_size=grid2_dp, + grid_offset=grid_1.size, + hidden_size=hidden_size, + dtype=dtype, + ) + + dist.barrier() # Synchronize ranks before communication + + # Module-grid map and pipeline communication topology + module_to_grid_map = {'image_encoder': grid_1, 'llm': grid_2} + topology = { + 'image_encoder': ['llm'], # image_encoder sends forward results to llm + 'llm': [], # llm is the last stage here + } + config = ModelParallelConfig(pipeline_dtype=torch.float) + mllm_comm = MultiModulePipelineCommunicator( + module_to_grid_map, topology, config, dim_mapping={'s': 0, 'h': 2, 'b': 1} + ) + + output_grid_2 = None + # If current rank is in the first grid, run first block and send output + if grid_1 is not None and mllm_comm.is_current_rank_in_grid(grid_1): + rank_module_info = mllm_comm.rank_module_map['image_encoder'] + if rank_module_info.pp_rank == 0: + hidden_states = block_grid_1(hidden_states=hidden_states, attention_mask=None) + mllm_comm.send_forward({'image_encoder': hidden_states}) + else: + input_dict = mllm_comm.recv_forward( + tensor_shape=(sequence_length, micro_batch_size, hidden_size) + ) + hidden_states = input_dict['image_encoder'] + hidden_states = block_grid_1(hidden_states=hidden_states, attention_mask=None) + mllm_comm.send_forward({'image_encoder': hidden_states}) + + # If current rank is in second grid, receive and run the second block + if grid_2 is not None and mllm_comm.is_current_rank_in_grid(grid_2): + rank_module_info = mllm_comm.rank_module_map['llm'] + if rank_module_info.pp_rank == 0: + input_dict = mllm_comm.recv_forward() + hidden_states = input_dict['image_encoder'] + hidden_states = block_grid_2(hidden_states=hidden_states, attention_mask=None) + if rank_module_info.pp_rank == rank_module_info.pp_size - 1: + output_grid_2 = hidden_states + else: + mllm_comm.send_forward({'llm': hidden_states}) + elif rank_module_info.pp_rank < rank_module_info.pp_size - 1: + input_dict = mllm_comm.recv_forward( + tensor_shape=( + sequence_length, + (grid1_dp * micro_batch_size) // grid2_dp, + hidden_size, + ) + ) + hidden_states = input_dict['llm'] + hidden_states = block_grid_2(hidden_states=hidden_states, attention_mask=None) + mllm_comm.send_forward({'llm': hidden_states}) + else: + input_dict = mllm_comm.recv_forward( + tensor_shape=( + sequence_length, + (grid1_dp * micro_batch_size) // grid2_dp, + hidden_size, + ) + ) + hidden_states = input_dict['llm'] + output_grid_2 = block_grid_2(hidden_states=hidden_states, attention_mask=None) + + # Compute expected output shape based on change in DP size (chunk/expand batch dimension appropriately) + factor = max(grid1_dp, grid2_dp) // min(grid1_dp, grid2_dp) + expected_output_shape = ( + sequence_length, + ( + micro_batch_size * factor + if grid1_dp > grid2_dp + else micro_batch_size // factor + ), + hidden_size, + ) + assert ( + output_grid_2.shape == expected_output_shape + ), f"Output2 shape mismatch: {output_grid_2.shape}" + + # ====== Reference: global (replicated) pipeline forward for correctness checking ====== + global_block_1, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + global_block_2, _ = get_transformer_block_and_grid( + ref_block, + tp_size=parallel_state_tp, + use_global_parallel_state=True, + hidden_size=hidden_size, + dtype=dtype, + ) + + for i in range(grid1_pp): + hidden_states_ref = global_block_1(hidden_states=hidden_states_ref, attention_mask=None) + + for i in range(grid2_pp): + hidden_states_ref = global_block_2(hidden_states=hidden_states_ref, attention_mask=None) + + # Output comparison under different DP compositions between grids + if ( + grid_2 is not None + and mllm_comm.is_current_rank_in_grid(grid_2) + and rank_module_info.pp_rank == rank_module_info.pp_size - 1 + ): + if grid1_dp == grid2_dp: + # DP size matches: all outputs directly compared + torch.testing.assert_close(hidden_states_ref, output_grid_2, rtol=1e-3, atol=1e-3) + elif grid1_dp < grid2_dp: + # If grid2 expands DP: each output_grid_2 chunk corresponds to a split of the reference output + grid2_dp_ranks = grid_2._gen_rank_enum([x for x in grid_2.dim_names if x != "dp"]) + global_block_2_chunks = torch.split( + hidden_states_ref, hidden_states_ref.shape[1] // (grid2_dp // grid1_dp), dim=1 + ) + relevant_chunk = None + for i, dp_ranks in enumerate(grid2_dp_ranks): + if current_rank in dp_ranks: + relevant_chunk = global_block_2_chunks[i % len(global_block_2_chunks)] + torch.testing.assert_close(relevant_chunk, output_grid_2, rtol=1e-3, atol=1e-3) + else: + # If DP shrinks (grid1_dp > grid2_dp): just compare the relevant first chunk + output_grid_2_first_chunk = torch.chunk(output_grid_2, grid1_dp // grid2_dp, dim=1)[ + 0 + ] + torch.testing.assert_close( + hidden_states_ref, output_grid_2_first_chunk, rtol=1e-3, atol=1e-3 + ) From 97ef777c4277eb4d8ad4b2e2f0a8513c5e08caaa Mon Sep 17 00:00:00 2001 From: Zijie Yan Date: Thu, 23 Oct 2025 02:26:12 +0000 Subject: [PATCH 035/248] chore: Update golden values. --- docker/Dockerfile.ci.dev | 10 +- .../golden_values_dev_dgxh100_coreweave.json | 600 +++++++++--------- .../golden_values_dev_dgxh100_eos.json | 600 +++++++++--------- uv.lock | 595 +++++++++-------- 4 files changed, 896 insertions(+), 909 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index caa2b9e1b86..f5da7afada9 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -61,11 +61,11 @@ RUN bash -ex <<"EOF" ln -s libnvshmem_host.so.3 libnvshmem_host.so popd - git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git - cd DeepEP - git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 - TORCH_CUDA_ARCH_LIST="9.0" uv pip install --no-build-isolation -v . - cd .. + git clone --branch v1.2.1 https://github.com/deepseek-ai/DeepEP.git + pushd DeepEP + patch -p1 < /workspace/deepep.patch + popd + TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. rm -rf DeepEP EOF diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json index cdd69820131..0af1bff480e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04747, - "2": 11.03489, - "3": 9.59197, - "4": 9.2607, - "5": 9.25316, - "6": 9.70587, - "7": 9.46635, - "8": 9.01114, - "9": 8.72173, - "10": 9.06704, - "11": 8.59397, - "12": 8.5643, - "13": 8.44846, - "14": 7.97921, - "15": 8.04905, - "16": 8.09886, - "17": 8.04172, - "18": 7.76126, - "19": 8.14014, - "20": 7.86027, - "21": 7.54995, - "22": 7.53872, - "23": 7.40693, - "24": 7.40435, - "25": 7.66065, - "26": 7.05772, - "27": 7.59552, - "28": 7.30627, - "29": 7.48007, - "30": 7.63012, - "31": 7.38325, - "32": 7.57843, - "33": 7.62828, - "34": 7.68919, - "35": 7.20168, - "36": 7.07506, - "37": 7.41935, - "38": 7.17961, - "39": 7.54005, - "40": 7.53821, - "41": 7.47888, - "42": 7.24055, - "43": 7.2256, - "44": 7.40803, - "45": 7.1775, - "46": 6.88877, - "47": 7.29436, - "48": 7.13581, - "49": 7.58407, - "50": 7.02865 + "1": 11.04624, + "2": 11.03476, + "3": 9.59903, + "4": 9.26301, + "5": 9.36373, + "6": 9.59608, + "7": 9.45214, + "8": 8.95198, + "9": 8.65952, + "10": 9.17778, + "11": 9.21306, + "12": 8.68184, + "13": 8.6038, + "14": 8.01576, + "15": 8.13595, + "16": 8.20124, + "17": 8.13602, + "18": 7.83369, + "19": 8.22974, + "20": 7.9452, + "21": 7.62338, + "22": 7.60791, + "23": 7.48374, + "24": 7.46559, + "25": 7.71274, + "26": 7.12081, + "27": 7.64626, + "28": 7.35234, + "29": 7.52084, + "30": 7.67784, + "31": 7.42246, + "32": 7.6137, + "33": 7.66159, + "34": 7.72817, + "35": 7.23134, + "36": 7.10612, + "37": 7.44953, + "38": 7.20946, + "39": 7.57073, + "40": 7.56124, + "41": 7.51119, + "42": 7.27048, + "43": 7.25633, + "44": 7.43634, + "45": 7.21132, + "46": 6.91913, + "47": 7.32211, + "48": 7.16551, + "49": 7.6155, + "50": 7.05648 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802648.0, - "2": 38543564.0, - "3": 38740428.0, - "4": 264349216.0, - "5": 224711328.0, - "6": 359592256.0, - "7": 683584064.0, - "8": 850747136.0, - "9": 781151872.0, - "10": 863934336.0, - "11": 784956928.0, - "12": 787741824.0, - "13": 906642432.0, - "14": 793413952.0, - "15": 724351360.0, - "16": 929182656.0, - "17": 728944832.0, - "18": 715233856.0, - "19": 894586752.0, - "20": 942182208.0, - "21": 712310464.0, - "22": 903670336.0, - "23": 882199552.0, - "24": 867334400.0, - "25": 874751488.0, - "26": 844191104.0, - "27": 813243648.0, - "28": 626785920.0, - "29": 808773120.0, - "30": 602759296.0, - "31": 793783168.0, - "32": 768613888.0, - "33": 721639040.0, - "34": 734472448.0, - "35": 734570880.0, - "36": 703058560.0, - "37": 692109824.0, - "38": 649260992.0, - "39": 620422656.0, - "40": 604143616.0, - "41": 598320448.0, - "42": 573424384.0, - "43": 576846912.0, - "44": 570038144.0, - "45": 540081024.0, - "46": 501251008.0, - "47": 497637664.0, - "48": 494691072.0, - "49": 490977312.0, - "50": 463542304.0 + "1": 38802568, + "2": 38543544, + "3": 41886704, + "4": 264367872, + "5": 224737792, + "6": 302994528, + "7": 645808768, + "8": 775291136, + "9": 765475328, + "10": 675259904, + "11": 615098624, + "12": 702764352, + "13": 934951360, + "14": 1060699008, + "15": 802967296, + "16": 1026771392, + "17": 756706880, + "18": 715253696, + "19": 929126208, + "20": 875969472, + "21": 665188032, + "22": 903854976, + "23": 747044352, + "24": 920777856, + "25": 733230528, + "26": 863183104, + "27": 879318336, + "28": 916219136, + "29": 909384256, + "30": 879622720, + "31": 866425152, + "32": 819074560, + "33": 589493056, + "34": 772011648, + "35": 778655488, + "36": 759651584, + "37": 761302144, + "38": 463804224, + "39": 543038400, + "40": 497278720, + "41": 658241792, + "42": 661600512, + "43": 495713632, + "44": 673788672, + "45": 470873536, + "46": 614455040, + "47": 554219584, + "48": 570200064, + "49": 557109312, + "50": 347212736 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7321331200.0, - "2": 7321333248.0, - "3": 7321333248.0, - "4": 7321333248.0, - "5": 7321333248.0, - "6": 7321333248.0, - "7": 7321333248.0, - "8": 7321333248.0, - "9": 7321333248.0, - "10": 7321333248.0, - "11": 7321333248.0, - "12": 7321333248.0, - "13": 7321333248.0, - "14": 7321333248.0, - "15": 7321333248.0, - "16": 7321333248.0, - "17": 7321333248.0, - "18": 7321333248.0, - "19": 7321333248.0, - "20": 7321333248.0, - "21": 7321333248.0, - "22": 7321333248.0, - "23": 7321333248.0, - "24": 7321333248.0, - "25": 7321333248.0, - "26": 7321333248.0, - "27": 7321333248.0, - "28": 7321333248.0, - "29": 7321333248.0, - "30": 7321333248.0, - "31": 7321333248.0, - "32": 7321333248.0, - "33": 7321333248.0, - "34": 7321333248.0, - "35": 7321333248.0, - "36": 7321333248.0, - "37": 7321333248.0, - "38": 7321333248.0, - "39": 7321333248.0, - "40": 7321333248.0, - "41": 7321333248.0, - "42": 7321333248.0, - "43": 7321333248.0, - "44": 7321333248.0, - "45": 7321333248.0, - "46": 7321333248.0, - "47": 7321333248.0, - "48": 7321333248.0, - "49": 7321333248.0, - "50": 7321333248.0 + "1": 7321308672, + "2": 7321310720, + "3": 7321310720, + "4": 7321310720, + "5": 7321310720, + "6": 7321310720, + "7": 7321310720, + "8": 7321310720, + "9": 7321310720, + "10": 7321310720, + "11": 7321310720, + "12": 7321310720, + "13": 7321310720, + "14": 7321310720, + "15": 7321310720, + "16": 7321310720, + "17": 7321310720, + "18": 7321310720, + "19": 7321310720, + "20": 7321310720, + "21": 7321310720, + "22": 7321310720, + "23": 7321310720, + "24": 7321310720, + "25": 7321310720, + "26": 7321310720, + "27": 7321310720, + "28": 7321310720, + "29": 7321310720, + "30": 7321310720, + "31": 7321310720, + "32": 7321310720, + "33": 7321310720, + "34": 7321310720, + "35": 7321310720, + "36": 7321310720, + "37": 7321310720, + "38": 7321310720, + "39": 7321310720, + "40": 7321310720, + "41": 7321310720, + "42": 7321310720, + "43": 7321310720, + "44": 7321310720, + "45": 7321310720, + "46": 7321310720, + "47": 7321310720, + "48": 7321310720, + "49": 7321310720, + "50": 7321310720 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 22198937600.0, - "2": 24950007808.0, - "3": 24950007808.0, - "4": 24950007808.0, - "5": 24950007808.0, - "6": 24950007808.0, - "7": 24950007808.0, - "8": 24950007808.0, - "9": 24950007808.0, - "10": 24950007808.0, - "11": 24950007808.0, - "12": 24950007808.0, - "13": 24950007808.0, - "14": 24950007808.0, - "15": 24950007808.0, - "16": 24950007808.0, - "17": 24950007808.0, - "18": 24950007808.0, - "19": 24950007808.0, - "20": 24950007808.0, - "21": 24950007808.0, - "22": 24950007808.0, - "23": 24950007808.0, - "24": 24950007808.0, - "25": 24950007808.0, - "26": 24950007808.0, - "27": 25072799744.0, - "28": 25343600640.0, - "29": 25625788416.0, - "30": 25625788416.0, - "31": 25628155904.0, - "32": 25707937792.0, - "33": 25707937792.0, - "34": 25707937792.0, - "35": 25707937792.0, - "36": 25707937792.0, - "37": 25707937792.0, - "38": 25707937792.0, - "39": 25707937792.0, - "40": 25707937792.0, - "41": 25707937792.0, - "42": 25707937792.0, - "43": 25707937792.0, - "44": 25707937792.0, - "45": 25707937792.0, - "46": 25707937792.0, - "47": 25707937792.0, - "48": 25707937792.0, - "49": 25707937792.0, - "50": 25707937792.0 + "1": 54396813312, + "2": 57149165568, + "3": 57165475840, + "4": 57165475840, + "5": 57165475840, + "6": 57165475840, + "7": 57165475840, + "8": 57165475840, + "9": 57165475840, + "10": 57165475840, + "11": 57165475840, + "12": 57165475840, + "13": 57165475840, + "14": 57165475840, + "15": 57165475840, + "16": 57165475840, + "17": 57165475840, + "18": 57165475840, + "19": 57165475840, + "20": 57165475840, + "21": 57165475840, + "22": 57165475840, + "23": 57165475840, + "24": 57165475840, + "25": 57165475840, + "26": 57165475840, + "27": 57165475840, + "28": 57165475840, + "29": 57165475840, + "30": 57165475840, + "31": 57165475840, + "32": 57165475840, + "33": 57165475840, + "34": 57165475840, + "35": 57165475840, + "36": 57165475840, + "37": 57165475840, + "38": 57165475840, + "39": 57165475840, + "40": 57295986688, + "41": 57295986688, + "42": 57331482624, + "43": 57360437248, + "44": 57561960448, + "45": 57561960448, + "46": 57561960448, + "47": 57585307648, + "48": 57602347008, + "49": 57823961088, + "50": 57823961088 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07742, - "2": 11.07559, - "3": 10.5272, - "4": 10.08877, - "5": 9.81119, - "6": 9.88673, - "7": 9.70278, - "8": 8.9944, - "9": 8.79002, - "10": 9.07171, - "11": 8.44594, - "12": 8.50226, - "13": 8.40983, - "14": 7.83955, - "15": 7.97902, - "16": 8.03361, - "17": 7.99642, - "18": 7.71928, - "19": 8.10116, - "20": 7.82113, - "21": 7.51112, - "22": 7.48906, - "23": 7.35335, - "24": 7.35884, - "25": 7.60836, - "26": 7.01391, - "27": 7.54721, - "28": 7.25644, - "29": 7.43129, - "30": 7.57524, - "31": 7.321, - "32": 7.50218, - "33": 7.56009, - "34": 7.62505, - "35": 7.14234, - "36": 7.0092, - "37": 7.34655, - "38": 7.11926, - "39": 7.4822, - "40": 7.46808, - "41": 7.41272, - "42": 7.1698, - "43": 7.15213, - "44": 7.33728, - "45": 7.11437, - "46": 6.81846, - "47": 7.2282, - "48": 7.07339, - "49": 7.50345, - "50": 6.96783 + "1": 11.07779, + "2": 11.07564, + "3": 10.52904, + "4": 10.08924, + "5": 9.81101, + "6": 9.88786, + "7": 9.72987, + "8": 9.02044, + "9": 8.8145, + "10": 9.09362, + "11": 8.77612, + "12": 8.56714, + "13": 8.54777, + "14": 8.04338, + "15": 8.10946, + "16": 8.13231, + "17": 8.0853, + "18": 7.83475, + "19": 8.21923, + "20": 7.91097, + "21": 7.58489, + "22": 7.56231, + "23": 7.44204, + "24": 7.44303, + "25": 7.67594, + "26": 7.07138, + "27": 7.60696, + "28": 7.30925, + "29": 7.48219, + "30": 7.62699, + "31": 7.3655, + "32": 7.54203, + "33": 7.60199, + "34": 7.66716, + "35": 7.18385, + "36": 7.05252, + "37": 7.38377, + "38": 7.15521, + "39": 7.51639, + "40": 7.4929, + "41": 7.44762, + "42": 7.20298, + "43": 7.18681, + "44": 7.36683, + "45": 7.15506, + "46": 6.85064, + "47": 7.26072, + "48": 7.10489, + "49": 7.53477, + "50": 6.99715 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 71.2429, - "2": 1.39205, - "3": 1.3521, - "4": 1.31895, - "5": 0.86745, - "6": 0.86249, - "7": 1.0949, - "8": 1.03022, - "9": 0.80778, - "10": 0.82011, - "11": 0.81426, - "12": 0.8098, - "13": 0.81209, - "14": 0.81361, - "15": 0.80969, - "16": 0.81315, - "17": 0.85127, - "18": 0.80813, - "19": 0.81928, - "20": 0.81012, - "21": 0.8101, - "22": 0.81064, - "23": 0.80537, - "24": 0.81149, - "25": 0.81261, - "26": 0.81877, - "27": 0.80314, - "28": 0.80383, - "29": 0.83563, - "30": 0.80254, - "31": 0.80006, - "32": 0.80658, - "33": 0.81426, - "34": 0.81824, - "35": 0.81124, - "36": 0.80978, - "37": 0.80679, - "38": 0.80838, - "39": 0.81028, - "40": 0.81044, - "41": 0.81268, - "42": 0.81318, - "43": 0.79311, - "44": 0.80471, - "45": 0.80526, - "46": 0.79795, - "47": 0.80592, - "48": 0.80158, - "49": 0.80635, - "50": 0.79969 + "1": 98.46571, + "2": 1.63304, + "3": 1.32772, + "4": 1.63453, + "5": 1.11673, + "6": 1.14377, + "7": 1.33213, + "8": 1.32699, + "9": 1.07499, + "10": 1.12938, + "11": 1.07438, + "12": 1.11078, + "13": 1.06958, + "14": 1.08718, + "15": 1.10547, + "16": 1.07557, + "17": 1.08606, + "18": 1.0832, + "19": 1.08226, + "20": 1.126, + "21": 1.08645, + "22": 1.07978, + "23": 1.07859, + "24": 1.08221, + "25": 1.08192, + "26": 1.09185, + "27": 1.0923, + "28": 1.09562, + "29": 1.10486, + "30": 1.10038, + "31": 1.09094, + "32": 1.08693, + "33": 1.0883, + "34": 1.08169, + "35": 1.08611, + "36": 1.07758, + "37": 1.07933, + "38": 1.08289, + "39": 1.07885, + "40": 1.08075, + "41": 1.0781, + "42": 1.08028, + "43": 1.08035, + "44": 1.08973, + "45": 1.08944, + "46": 1.07483, + "47": 1.08306, + "48": 1.07701, + "49": 1.0768, + "50": 1.07022 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json index d4aa4cb5ee9..585139e83c9 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04747, - "2": 11.03489, - "3": 9.59197, - "4": 9.2607, - "5": 9.25316, - "6": 9.70587, - "7": 9.46635, - "8": 9.01114, - "9": 8.72173, - "10": 9.06704, - "11": 8.59397, - "12": 8.5643, - "13": 8.44846, - "14": 7.97921, - "15": 8.04905, - "16": 8.09886, - "17": 8.04172, - "18": 7.76126, - "19": 8.14014, - "20": 7.86027, - "21": 7.54995, - "22": 7.53872, - "23": 7.40693, - "24": 7.40435, - "25": 7.66065, - "26": 7.05772, - "27": 7.59552, - "28": 7.30627, - "29": 7.48007, - "30": 7.63012, - "31": 7.38325, - "32": 7.57843, - "33": 7.62828, - "34": 7.68919, - "35": 7.20168, - "36": 7.07506, - "37": 7.41935, - "38": 7.17961, - "39": 7.54005, - "40": 7.53821, - "41": 7.47888, - "42": 7.24055, - "43": 7.2256, - "44": 7.40803, - "45": 7.1775, - "46": 6.88877, - "47": 7.29436, - "48": 7.13581, - "49": 7.58407, - "50": 7.02865 + "1": 11.04624, + "2": 11.03476, + "3": 9.59903, + "4": 9.26301, + "5": 9.36373, + "6": 9.59608, + "7": 9.45214, + "8": 8.95198, + "9": 8.65952, + "10": 9.17778, + "11": 9.21306, + "12": 8.68184, + "13": 8.6038, + "14": 8.01576, + "15": 8.13595, + "16": 8.20124, + "17": 8.13602, + "18": 7.83369, + "19": 8.22974, + "20": 7.9452, + "21": 7.62338, + "22": 7.60791, + "23": 7.48374, + "24": 7.46559, + "25": 7.71274, + "26": 7.12081, + "27": 7.64626, + "28": 7.35234, + "29": 7.52084, + "30": 7.67784, + "31": 7.42246, + "32": 7.6137, + "33": 7.66159, + "34": 7.72817, + "35": 7.23134, + "36": 7.10612, + "37": 7.44953, + "38": 7.20946, + "39": 7.57073, + "40": 7.56124, + "41": 7.51119, + "42": 7.27048, + "43": 7.25633, + "44": 7.43634, + "45": 7.21132, + "46": 6.91913, + "47": 7.32211, + "48": 7.16551, + "49": 7.6155, + "50": 7.05648 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802648.0, - "2": 38543564.0, - "3": 38740428.0, - "4": 264349216.0, - "5": 224711328.0, - "6": 359592256.0, - "7": 683584064.0, - "8": 850747136.0, - "9": 781151872.0, - "10": 863934336.0, - "11": 784956928.0, - "12": 787741824.0, - "13": 906642432.0, - "14": 793413952.0, - "15": 724351360.0, - "16": 929182656.0, - "17": 728944832.0, - "18": 715233856.0, - "19": 894586752.0, - "20": 942182208.0, - "21": 712310464.0, - "22": 903670336.0, - "23": 882199552.0, - "24": 867334400.0, - "25": 874751488.0, - "26": 844191104.0, - "27": 813243648.0, - "28": 626785920.0, - "29": 808773120.0, - "30": 602759296.0, - "31": 793783168.0, - "32": 768613888.0, - "33": 721639040.0, - "34": 734472448.0, - "35": 734570880.0, - "36": 703058560.0, - "37": 692109824.0, - "38": 649260992.0, - "39": 620422656.0, - "40": 604143616.0, - "41": 598320448.0, - "42": 573424384.0, - "43": 576846912.0, - "44": 570038144.0, - "45": 540081024.0, - "46": 501251008.0, - "47": 497637664.0, - "48": 494691072.0, - "49": 490977312.0, - "50": 463542304.0 + "1": 38802568, + "2": 38543544, + "3": 41886704, + "4": 264367872, + "5": 224737792, + "6": 302994528, + "7": 645808768, + "8": 775291136, + "9": 765475328, + "10": 675259904, + "11": 615098624, + "12": 702764352, + "13": 934951360, + "14": 1060699008, + "15": 802967296, + "16": 1026771392, + "17": 756706880, + "18": 715253696, + "19": 929126208, + "20": 875969472, + "21": 665188032, + "22": 903854976, + "23": 747044352, + "24": 920777856, + "25": 733230528, + "26": 863183104, + "27": 879318336, + "28": 916219136, + "29": 909384256, + "30": 879622720, + "31": 866425152, + "32": 819074560, + "33": 589493056, + "34": 772011648, + "35": 778655488, + "36": 759651584, + "37": 761302144, + "38": 463804224, + "39": 543038400, + "40": 497278720, + "41": 658241792, + "42": 661600512, + "43": 495713632, + "44": 673788672, + "45": 470873536, + "46": 614455040, + "47": 554219584, + "48": 570200064, + "49": 557109312, + "50": 347212736 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7321331200.0, - "2": 7321333248.0, - "3": 7321333248.0, - "4": 7321333248.0, - "5": 7321333248.0, - "6": 7321333248.0, - "7": 7321333248.0, - "8": 7321333248.0, - "9": 7321333248.0, - "10": 7321333248.0, - "11": 7321333248.0, - "12": 7321333248.0, - "13": 7321333248.0, - "14": 7321333248.0, - "15": 7321333248.0, - "16": 7321333248.0, - "17": 7321333248.0, - "18": 7321333248.0, - "19": 7321333248.0, - "20": 7321333248.0, - "21": 7321333248.0, - "22": 7321333248.0, - "23": 7321333248.0, - "24": 7321333248.0, - "25": 7321333248.0, - "26": 7321333248.0, - "27": 7321333248.0, - "28": 7321333248.0, - "29": 7321333248.0, - "30": 7321333248.0, - "31": 7321333248.0, - "32": 7321333248.0, - "33": 7321333248.0, - "34": 7321333248.0, - "35": 7321333248.0, - "36": 7321333248.0, - "37": 7321333248.0, - "38": 7321333248.0, - "39": 7321333248.0, - "40": 7321333248.0, - "41": 7321333248.0, - "42": 7321333248.0, - "43": 7321333248.0, - "44": 7321333248.0, - "45": 7321333248.0, - "46": 7321333248.0, - "47": 7321333248.0, - "48": 7321333248.0, - "49": 7321333248.0, - "50": 7321333248.0 + "1": 7321308672, + "2": 7321310720, + "3": 7321310720, + "4": 7321310720, + "5": 7321310720, + "6": 7321310720, + "7": 7321310720, + "8": 7321310720, + "9": 7321310720, + "10": 7321310720, + "11": 7321310720, + "12": 7321310720, + "13": 7321310720, + "14": 7321310720, + "15": 7321310720, + "16": 7321310720, + "17": 7321310720, + "18": 7321310720, + "19": 7321310720, + "20": 7321310720, + "21": 7321310720, + "22": 7321310720, + "23": 7321310720, + "24": 7321310720, + "25": 7321310720, + "26": 7321310720, + "27": 7321310720, + "28": 7321310720, + "29": 7321310720, + "30": 7321310720, + "31": 7321310720, + "32": 7321310720, + "33": 7321310720, + "34": 7321310720, + "35": 7321310720, + "36": 7321310720, + "37": 7321310720, + "38": 7321310720, + "39": 7321310720, + "40": 7321310720, + "41": 7321310720, + "42": 7321310720, + "43": 7321310720, + "44": 7321310720, + "45": 7321310720, + "46": 7321310720, + "47": 7321310720, + "48": 7321310720, + "49": 7321310720, + "50": 7321310720 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 22198937600.0, - "2": 24950007808.0, - "3": 24950007808.0, - "4": 24950007808.0, - "5": 24950007808.0, - "6": 24950007808.0, - "7": 24950007808.0, - "8": 24950007808.0, - "9": 24950007808.0, - "10": 24950007808.0, - "11": 24950007808.0, - "12": 24950007808.0, - "13": 24950007808.0, - "14": 24950007808.0, - "15": 24950007808.0, - "16": 24950007808.0, - "17": 24950007808.0, - "18": 24950007808.0, - "19": 24950007808.0, - "20": 24950007808.0, - "21": 24950007808.0, - "22": 24950007808.0, - "23": 24950007808.0, - "24": 24950007808.0, - "25": 24950007808.0, - "26": 24950007808.0, - "27": 25072799744.0, - "28": 25343600640.0, - "29": 25625788416.0, - "30": 25625788416.0, - "31": 25628155904.0, - "32": 25707937792.0, - "33": 25707937792.0, - "34": 25707937792.0, - "35": 25707937792.0, - "36": 25707937792.0, - "37": 25707937792.0, - "38": 25707937792.0, - "39": 25707937792.0, - "40": 25707937792.0, - "41": 25707937792.0, - "42": 25707937792.0, - "43": 25707937792.0, - "44": 25707937792.0, - "45": 25707937792.0, - "46": 25707937792.0, - "47": 25707937792.0, - "48": 25707937792.0, - "49": 25707937792.0, - "50": 25707937792.0 + "1": 54396813312, + "2": 57149165568, + "3": 57165475840, + "4": 57165475840, + "5": 57165475840, + "6": 57165475840, + "7": 57165475840, + "8": 57165475840, + "9": 57165475840, + "10": 57165475840, + "11": 57165475840, + "12": 57165475840, + "13": 57165475840, + "14": 57165475840, + "15": 57165475840, + "16": 57165475840, + "17": 57165475840, + "18": 57165475840, + "19": 57165475840, + "20": 57165475840, + "21": 57165475840, + "22": 57165475840, + "23": 57165475840, + "24": 57165475840, + "25": 57165475840, + "26": 57165475840, + "27": 57165475840, + "28": 57165475840, + "29": 57165475840, + "30": 57165475840, + "31": 57165475840, + "32": 57165475840, + "33": 57165475840, + "34": 57165475840, + "35": 57165475840, + "36": 57165475840, + "37": 57165475840, + "38": 57165475840, + "39": 57165475840, + "40": 57295986688, + "41": 57295986688, + "42": 57331482624, + "43": 57360437248, + "44": 57561960448, + "45": 57561960448, + "46": 57561960448, + "47": 57585307648, + "48": 57602347008, + "49": 57823961088, + "50": 57823961088 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07742, - "2": 11.07559, - "3": 10.5272, - "4": 10.08877, - "5": 9.81119, - "6": 9.88673, - "7": 9.70278, - "8": 8.9944, - "9": 8.79002, - "10": 9.07171, - "11": 8.44594, - "12": 8.50226, - "13": 8.40983, - "14": 7.83955, - "15": 7.97902, - "16": 8.03361, - "17": 7.99642, - "18": 7.71928, - "19": 8.10116, - "20": 7.82113, - "21": 7.51112, - "22": 7.48906, - "23": 7.35335, - "24": 7.35884, - "25": 7.60836, - "26": 7.01391, - "27": 7.54721, - "28": 7.25644, - "29": 7.43129, - "30": 7.57524, - "31": 7.321, - "32": 7.50218, - "33": 7.56009, - "34": 7.62505, - "35": 7.14234, - "36": 7.0092, - "37": 7.34655, - "38": 7.11926, - "39": 7.4822, - "40": 7.46808, - "41": 7.41272, - "42": 7.1698, - "43": 7.15213, - "44": 7.33728, - "45": 7.11437, - "46": 6.81846, - "47": 7.2282, - "48": 7.07339, - "49": 7.50345, - "50": 6.96783 + "1": 11.07779, + "2": 11.07564, + "3": 10.52904, + "4": 10.08924, + "5": 9.81101, + "6": 9.88786, + "7": 9.72987, + "8": 9.02044, + "9": 8.8145, + "10": 9.09362, + "11": 8.77612, + "12": 8.56714, + "13": 8.54777, + "14": 8.04338, + "15": 8.10946, + "16": 8.13231, + "17": 8.0853, + "18": 7.83475, + "19": 8.21923, + "20": 7.91097, + "21": 7.58489, + "22": 7.56231, + "23": 7.44204, + "24": 7.44303, + "25": 7.67594, + "26": 7.07138, + "27": 7.60696, + "28": 7.30925, + "29": 7.48219, + "30": 7.62699, + "31": 7.3655, + "32": 7.54203, + "33": 7.60199, + "34": 7.66716, + "35": 7.18385, + "36": 7.05252, + "37": 7.38377, + "38": 7.15521, + "39": 7.51639, + "40": 7.4929, + "41": 7.44762, + "42": 7.20298, + "43": 7.18681, + "44": 7.36683, + "45": 7.15506, + "46": 6.85064, + "47": 7.26072, + "48": 7.10489, + "49": 7.53477, + "50": 6.99715 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 66.41406, - "2": 1.09711, - "3": 0.98871, - "4": 1.29382, - "5": 0.90133, - "6": 0.89235, - "7": 1.14675, - "8": 1.06393, - "9": 0.87141, - "10": 0.88489, - "11": 0.87653, - "12": 0.86844, - "13": 0.87292, - "14": 0.88542, - "15": 0.87413, - "16": 0.8658, - "17": 0.86683, - "18": 0.85604, - "19": 0.87144, - "20": 0.8739, - "21": 0.87412, - "22": 0.8842, - "23": 0.87866, - "24": 0.87817, - "25": 0.87219, - "26": 0.88191, - "27": 0.86283, - "28": 0.85644, - "29": 0.85444, - "30": 0.86821, - "31": 0.8659, - "32": 0.86683, - "33": 0.86547, - "34": 0.86171, - "35": 0.84405, - "36": 0.84744, - "37": 0.84896, - "38": 0.85314, - "39": 0.85693, - "40": 0.83956, - "41": 0.844, - "42": 0.84413, - "43": 0.83996, - "44": 0.84204, - "45": 0.84489, - "46": 0.83423, - "47": 0.83738, - "48": 0.85356, - "49": 0.86096, - "50": 0.85603 + "1": 89.12995, + "2": 1.33749, + "3": 1.24205, + "4": 1.63759, + "5": 1.13139, + "6": 1.12938, + "7": 1.37914, + "8": 1.3886, + "9": 1.10046, + "10": 1.11649, + "11": 1.11259, + "12": 1.10822, + "13": 1.10532, + "14": 1.11189, + "15": 1.1132, + "16": 1.10539, + "17": 1.11434, + "18": 1.11836, + "19": 1.11073, + "20": 1.11278, + "21": 1.11212, + "22": 1.10671, + "23": 1.11034, + "24": 1.11107, + "25": 1.11085, + "26": 1.10756, + "27": 1.10109, + "28": 1.1069, + "29": 1.11354, + "30": 1.11254, + "31": 1.10893, + "32": 1.11311, + "33": 1.10722, + "34": 1.10243, + "35": 1.10358, + "36": 1.09746, + "37": 1.09875, + "38": 1.10151, + "39": 1.10188, + "40": 1.10069, + "41": 1.10545, + "42": 1.10709, + "43": 1.1028, + "44": 1.10723, + "45": 1.10614, + "46": 1.09997, + "47": 1.1053, + "48": 1.10274, + "49": 1.09986, + "50": 1.10191 } } } \ No newline at end of file diff --git a/uv.lock b/uv.lock index 2d2e178241f..1046481f7ec 100644 --- a/uv.lock +++ b/uv.lock @@ -1637,63 +1637,63 @@ wheels = [ [[package]] name = "grpcio" -version = "1.75.1" +version = "1.76.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9d/f7/8963848164c7604efb3a3e6ee457fdb3a469653e19002bd24742473254f8/grpcio-1.75.1.tar.gz", hash = "sha256:3e81d89ece99b9ace23a6916880baca613c03a799925afb2857887efa8b1b3d2", size = 12731327, upload-time = "2025-09-26T09:03:36.887Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/57/89fd829fb00a6d0bee3fbcb2c8a7aa0252d908949b6ab58bfae99d39d77e/grpcio-1.75.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:1712b5890b22547dd29f3215c5788d8fc759ce6dd0b85a6ba6e2731f2d04c088", size = 5705534, upload-time = "2025-09-26T09:00:52.225Z" }, - { url = "https://files.pythonhosted.org/packages/76/dd/2f8536e092551cf804e96bcda79ecfbc51560b214a0f5b7ebc253f0d4664/grpcio-1.75.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8d04e101bba4b55cea9954e4aa71c24153ba6182481b487ff376da28d4ba46cf", size = 11484103, upload-time = "2025-09-26T09:00:59.457Z" }, - { url = "https://files.pythonhosted.org/packages/9a/3d/affe2fb897804c98d56361138e73786af8f4dd876b9d9851cfe6342b53c8/grpcio-1.75.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:683cfc70be0c1383449097cba637317e4737a357cfc185d887fd984206380403", size = 6289953, upload-time = "2025-09-26T09:01:03.699Z" }, - { url = "https://files.pythonhosted.org/packages/87/aa/0f40b7f47a0ff10d7e482bc3af22dac767c7ff27205915f08962d5ca87a2/grpcio-1.75.1-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:491444c081a54dcd5e6ada57314321ae526377f498d4aa09d975c3241c5b9e1c", size = 6949785, upload-time = "2025-09-26T09:01:07.504Z" }, - { url = "https://files.pythonhosted.org/packages/a5/45/b04407e44050781821c84f26df71b3f7bc469923f92f9f8bc27f1406dbcc/grpcio-1.75.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce08d4e112d0d38487c2b631ec8723deac9bc404e9c7b1011426af50a79999e4", size = 6465708, upload-time = "2025-09-26T09:01:11.028Z" }, - { url = "https://files.pythonhosted.org/packages/09/3e/4ae3ec0a4d20dcaafbb6e597defcde06399ccdc5b342f607323f3b47f0a3/grpcio-1.75.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5a2acda37fc926ccc4547977ac3e56b1df48fe200de968e8c8421f6e3093df6c", size = 7100912, upload-time = "2025-09-26T09:01:14.393Z" }, - { url = "https://files.pythonhosted.org/packages/34/3f/a9085dab5c313bb0cb853f222d095e2477b9b8490a03634cdd8d19daa5c3/grpcio-1.75.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:745c5fe6bf05df6a04bf2d11552c7d867a2690759e7ab6b05c318a772739bd75", size = 8042497, upload-time = "2025-09-26T09:01:17.759Z" }, - { url = "https://files.pythonhosted.org/packages/c3/87/ea54eba931ab9ed3f999ba95f5d8d01a20221b664725bab2fe93e3dee848/grpcio-1.75.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:259526a7159d39e2db40d566fe3e8f8e034d0fb2db5bf9c00e09aace655a4c2b", size = 7493284, upload-time = "2025-09-26T09:01:20.896Z" }, - { url = "https://files.pythonhosted.org/packages/b7/5e/287f1bf1a998f4ac46ef45d518de3b5da08b4e86c7cb5e1108cee30b0282/grpcio-1.75.1-cp310-cp310-win32.whl", hash = "sha256:f4b29b9aabe33fed5df0a85e5f13b09ff25e2c05bd5946d25270a8bd5682dac9", size = 3950809, upload-time = "2025-09-26T09:01:23.695Z" }, - { url = "https://files.pythonhosted.org/packages/a4/a2/3cbfc06a4ec160dc77403b29ecb5cf76ae329eb63204fea6a7c715f1dfdb/grpcio-1.75.1-cp310-cp310-win_amd64.whl", hash = "sha256:cf2e760978dcce7ff7d465cbc7e276c3157eedc4c27aa6de7b594c7a295d3d61", size = 4644704, upload-time = "2025-09-26T09:01:25.763Z" }, - { url = "https://files.pythonhosted.org/packages/0c/3c/35ca9747473a306bfad0cee04504953f7098527cd112a4ab55c55af9e7bd/grpcio-1.75.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:573855ca2e58e35032aff30bfbd1ee103fbcf4472e4b28d4010757700918e326", size = 5709761, upload-time = "2025-09-26T09:01:28.528Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2c/ecbcb4241e4edbe85ac2663f885726fea0e947767401288b50d8fdcb9200/grpcio-1.75.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:6a4996a2c8accc37976dc142d5991adf60733e223e5c9a2219e157dc6a8fd3a2", size = 11496691, upload-time = "2025-09-26T09:01:31.214Z" }, - { url = "https://files.pythonhosted.org/packages/81/40/bc07aee2911f0d426fa53fe636216100c31a8ea65a400894f280274cb023/grpcio-1.75.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b1ea1bbe77ecbc1be00af2769f4ae4a88ce93be57a4f3eebd91087898ed749f9", size = 6296084, upload-time = "2025-09-26T09:01:34.596Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d1/10c067f6c67396cbf46448b80f27583b5e8c4b46cdfbe18a2a02c2c2f290/grpcio-1.75.1-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:e5b425aee54cc5e3e3c58f00731e8a33f5567965d478d516d35ef99fd648ab68", size = 6950403, upload-time = "2025-09-26T09:01:36.736Z" }, - { url = "https://files.pythonhosted.org/packages/3f/42/5f628abe360b84dfe8dd8f32be6b0606dc31dc04d3358eef27db791ea4d5/grpcio-1.75.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0049a7bf547dafaeeb1db17079ce79596c298bfe308fc084d023c8907a845b9a", size = 6470166, upload-time = "2025-09-26T09:01:39.474Z" }, - { url = "https://files.pythonhosted.org/packages/c3/93/a24035080251324019882ee2265cfde642d6476c0cf8eb207fc693fcebdc/grpcio-1.75.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b8ea230c7f77c0a1a3208a04a1eda164633fb0767b4cefd65a01079b65e5b1f", size = 7107828, upload-time = "2025-09-26T09:01:41.782Z" }, - { url = "https://files.pythonhosted.org/packages/e4/f8/d18b984c1c9ba0318e3628dbbeb6af77a5007f02abc378c845070f2d3edd/grpcio-1.75.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:36990d629c3c9fb41e546414e5af52d0a7af37ce7113d9682c46d7e2919e4cca", size = 8045421, upload-time = "2025-09-26T09:01:45.835Z" }, - { url = "https://files.pythonhosted.org/packages/7e/b6/4bf9aacff45deca5eac5562547ed212556b831064da77971a4e632917da3/grpcio-1.75.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b10ad908118d38c2453ade7ff790e5bce36580c3742919007a2a78e3a1e521ca", size = 7503290, upload-time = "2025-09-26T09:01:49.28Z" }, - { url = "https://files.pythonhosted.org/packages/3b/15/d8d69d10223cb54c887a2180bd29fe5fa2aec1d4995c8821f7aa6eaf72e4/grpcio-1.75.1-cp311-cp311-win32.whl", hash = "sha256:d6be2b5ee7bea656c954dcf6aa8093c6f0e6a3ef9945c99d99fcbfc88c5c0bfe", size = 3950631, upload-time = "2025-09-26T09:01:51.23Z" }, - { url = "https://files.pythonhosted.org/packages/8a/40/7b8642d45fff6f83300c24eaac0380a840e5e7fe0e8d80afd31b99d7134e/grpcio-1.75.1-cp311-cp311-win_amd64.whl", hash = "sha256:61c692fb05956b17dd6d1ab480f7f10ad0536dba3bc8fd4e3c7263dc244ed772", size = 4646131, upload-time = "2025-09-26T09:01:53.266Z" }, - { url = "https://files.pythonhosted.org/packages/3a/81/42be79e73a50aaa20af66731c2defeb0e8c9008d9935a64dd8ea8e8c44eb/grpcio-1.75.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:7b888b33cd14085d86176b1628ad2fcbff94cfbbe7809465097aa0132e58b018", size = 5668314, upload-time = "2025-09-26T09:01:55.424Z" }, - { url = "https://files.pythonhosted.org/packages/c5/a7/3686ed15822fedc58c22f82b3a7403d9faf38d7c33de46d4de6f06e49426/grpcio-1.75.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:8775036efe4ad2085975531d221535329f5dac99b6c2a854a995456098f99546", size = 11476125, upload-time = "2025-09-26T09:01:57.927Z" }, - { url = "https://files.pythonhosted.org/packages/14/85/21c71d674f03345ab183c634ecd889d3330177e27baea8d5d247a89b6442/grpcio-1.75.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb658f703468d7fbb5dcc4037c65391b7dc34f808ac46ed9136c24fc5eeb041d", size = 6246335, upload-time = "2025-09-26T09:02:00.76Z" }, - { url = "https://files.pythonhosted.org/packages/fd/db/3beb661bc56a385ae4fa6b0e70f6b91ac99d47afb726fe76aaff87ebb116/grpcio-1.75.1-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4b7177a1cdb3c51b02b0c0a256b0a72fdab719600a693e0e9037949efffb200b", size = 6916309, upload-time = "2025-09-26T09:02:02.894Z" }, - { url = "https://files.pythonhosted.org/packages/1e/9c/eda9fe57f2b84343d44c1b66cf3831c973ba29b078b16a27d4587a1fdd47/grpcio-1.75.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7d4fa6ccc3ec2e68a04f7b883d354d7fea22a34c44ce535a2f0c0049cf626ddf", size = 6435419, upload-time = "2025-09-26T09:02:05.055Z" }, - { url = "https://files.pythonhosted.org/packages/c3/b8/090c98983e0a9d602e3f919a6e2d4e470a8b489452905f9a0fa472cac059/grpcio-1.75.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d86880ecaeb5b2f0a8afa63824de93adb8ebe4e49d0e51442532f4e08add7d6", size = 7064893, upload-time = "2025-09-26T09:02:07.275Z" }, - { url = "https://files.pythonhosted.org/packages/ec/c0/6d53d4dbbd00f8bd81571f5478d8a95528b716e0eddb4217cc7cb45aae5f/grpcio-1.75.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a8041d2f9e8a742aeae96f4b047ee44e73619f4f9d24565e84d5446c623673b6", size = 8011922, upload-time = "2025-09-26T09:02:09.527Z" }, - { url = "https://files.pythonhosted.org/packages/f2/7c/48455b2d0c5949678d6982c3e31ea4d89df4e16131b03f7d5c590811cbe9/grpcio-1.75.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3652516048bf4c314ce12be37423c79829f46efffb390ad64149a10c6071e8de", size = 7466181, upload-time = "2025-09-26T09:02:12.279Z" }, - { url = "https://files.pythonhosted.org/packages/fd/12/04a0e79081e3170b6124f8cba9b6275871276be06c156ef981033f691880/grpcio-1.75.1-cp312-cp312-win32.whl", hash = "sha256:44b62345d8403975513af88da2f3d5cc76f73ca538ba46596f92a127c2aea945", size = 3938543, upload-time = "2025-09-26T09:02:14.77Z" }, - { url = "https://files.pythonhosted.org/packages/5f/d7/11350d9d7fb5adc73d2b0ebf6ac1cc70135577701e607407fe6739a90021/grpcio-1.75.1-cp312-cp312-win_amd64.whl", hash = "sha256:b1e191c5c465fa777d4cafbaacf0c01e0d5278022082c0abbd2ee1d6454ed94d", size = 4641938, upload-time = "2025-09-26T09:02:16.927Z" }, - { url = "https://files.pythonhosted.org/packages/46/74/bac4ab9f7722164afdf263ae31ba97b8174c667153510322a5eba4194c32/grpcio-1.75.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:3bed22e750d91d53d9e31e0af35a7b0b51367e974e14a4ff229db5b207647884", size = 5672779, upload-time = "2025-09-26T09:02:19.11Z" }, - { url = "https://files.pythonhosted.org/packages/a6/52/d0483cfa667cddaa294e3ab88fd2c2a6e9dc1a1928c0e5911e2e54bd5b50/grpcio-1.75.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:5b8f381eadcd6ecaa143a21e9e80a26424c76a0a9b3d546febe6648f3a36a5ac", size = 11470623, upload-time = "2025-09-26T09:02:22.117Z" }, - { url = "https://files.pythonhosted.org/packages/cf/e4/d1954dce2972e32384db6a30273275e8c8ea5a44b80347f9055589333b3f/grpcio-1.75.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5bf4001d3293e3414d0cf99ff9b1139106e57c3a66dfff0c5f60b2a6286ec133", size = 6248838, upload-time = "2025-09-26T09:02:26.426Z" }, - { url = "https://files.pythonhosted.org/packages/06/43/073363bf63826ba8077c335d797a8d026f129dc0912b69c42feaf8f0cd26/grpcio-1.75.1-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f82ff474103e26351dacfe8d50214e7c9322960d8d07ba7fa1d05ff981c8b2d", size = 6922663, upload-time = "2025-09-26T09:02:28.724Z" }, - { url = "https://files.pythonhosted.org/packages/c2/6f/076ac0df6c359117676cacfa8a377e2abcecec6a6599a15a672d331f6680/grpcio-1.75.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ee119f4f88d9f75414217823d21d75bfe0e6ed40135b0cbbfc6376bc9f7757d", size = 6436149, upload-time = "2025-09-26T09:02:30.971Z" }, - { url = "https://files.pythonhosted.org/packages/6b/27/1d08824f1d573fcb1fa35ede40d6020e68a04391709939e1c6f4193b445f/grpcio-1.75.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:664eecc3abe6d916fa6cf8dd6b778e62fb264a70f3430a3180995bf2da935446", size = 7067989, upload-time = "2025-09-26T09:02:33.233Z" }, - { url = "https://files.pythonhosted.org/packages/c6/98/98594cf97b8713feb06a8cb04eeef60b4757e3e2fb91aa0d9161da769843/grpcio-1.75.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c32193fa08b2fbebf08fe08e84f8a0aad32d87c3ad42999c65e9449871b1c66e", size = 8010717, upload-time = "2025-09-26T09:02:36.011Z" }, - { url = "https://files.pythonhosted.org/packages/8c/7e/bb80b1bba03c12158f9254762cdf5cced4a9bc2e8ed51ed335915a5a06ef/grpcio-1.75.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5cebe13088b9254f6e615bcf1da9131d46cfa4e88039454aca9cb65f639bd3bc", size = 7463822, upload-time = "2025-09-26T09:02:38.26Z" }, - { url = "https://files.pythonhosted.org/packages/23/1c/1ea57fdc06927eb5640f6750c697f596f26183573069189eeaf6ef86ba2d/grpcio-1.75.1-cp313-cp313-win32.whl", hash = "sha256:4b4c678e7ed50f8ae8b8dbad15a865ee73ce12668b6aaf411bf3258b5bc3f970", size = 3938490, upload-time = "2025-09-26T09:02:40.268Z" }, - { url = "https://files.pythonhosted.org/packages/4b/24/fbb8ff1ccadfbf78ad2401c41aceaf02b0d782c084530d8871ddd69a2d49/grpcio-1.75.1-cp313-cp313-win_amd64.whl", hash = "sha256:5573f51e3f296a1bcf71e7a690c092845fb223072120f4bdb7a5b48e111def66", size = 4642538, upload-time = "2025-09-26T09:02:42.519Z" }, - { url = "https://files.pythonhosted.org/packages/f2/1b/9a0a5cecd24302b9fdbcd55d15ed6267e5f3d5b898ff9ac8cbe17ee76129/grpcio-1.75.1-cp314-cp314-linux_armv7l.whl", hash = "sha256:c05da79068dd96723793bffc8d0e64c45f316248417515f28d22204d9dae51c7", size = 5673319, upload-time = "2025-09-26T09:02:44.742Z" }, - { url = "https://files.pythonhosted.org/packages/c6/ec/9d6959429a83fbf5df8549c591a8a52bb313976f6646b79852c4884e3225/grpcio-1.75.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06373a94fd16ec287116a825161dca179a0402d0c60674ceeec8c9fba344fe66", size = 11480347, upload-time = "2025-09-26T09:02:47.539Z" }, - { url = "https://files.pythonhosted.org/packages/09/7a/26da709e42c4565c3d7bf999a9569da96243ce34a8271a968dee810a7cf1/grpcio-1.75.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4484f4b7287bdaa7a5b3980f3c7224c3c622669405d20f69549f5fb956ad0421", size = 6254706, upload-time = "2025-09-26T09:02:50.4Z" }, - { url = "https://files.pythonhosted.org/packages/f1/08/dcb26a319d3725f199c97e671d904d84ee5680de57d74c566a991cfab632/grpcio-1.75.1-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:2720c239c1180eee69f7883c1d4c83fc1a495a2535b5fa322887c70bf02b16e8", size = 6922501, upload-time = "2025-09-26T09:02:52.711Z" }, - { url = "https://files.pythonhosted.org/packages/78/66/044d412c98408a5e23cb348845979a2d17a2e2b6c3c34c1ec91b920f49d0/grpcio-1.75.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:07a554fa31c668cf0e7a188678ceeca3cb8fead29bbe455352e712ec33ca701c", size = 6437492, upload-time = "2025-09-26T09:02:55.542Z" }, - { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" }, - { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" }, - { url = "https://files.pythonhosted.org/packages/b7/97/2d90652b213863b2cf466d9c1260ca7e7b67a16780431b3eb1d0420e3d5b/grpcio-1.75.1-cp314-cp314-win32.whl", hash = "sha256:62ce42d9994446b307649cb2a23335fa8e927f7ab2cbf5fcb844d6acb4d85f9c", size = 4012672, upload-time = "2025-09-26T09:03:05.477Z" }, - { url = "https://files.pythonhosted.org/packages/f9/df/e2e6e9fc1c985cd1a59e6996a05647c720fe8a03b92f5ec2d60d366c531e/grpcio-1.75.1-cp314-cp314-win_amd64.whl", hash = "sha256:f86e92275710bea3000cb79feca1762dc0ad3b27830dd1a74e82ab321d4ee464", size = 4772475, upload-time = "2025-09-26T09:03:07.661Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/17/ff4795dc9a34b6aee6ec379f1b66438a3789cd1315aac0cbab60d92f74b3/grpcio-1.76.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc", size = 5840037, upload-time = "2025-10-21T16:20:25.069Z" }, + { url = "https://files.pythonhosted.org/packages/4e/ff/35f9b96e3fa2f12e1dcd58a4513a2e2294a001d64dec81677361b7040c9a/grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde", size = 11836482, upload-time = "2025-10-21T16:20:30.113Z" }, + { url = "https://files.pythonhosted.org/packages/3e/1c/8374990f9545e99462caacea5413ed783014b3b66ace49e35c533f07507b/grpcio-1.76.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:035d90bc79eaa4bed83f524331d55e35820725c9fbb00ffa1904d5550ed7ede3", size = 6407178, upload-time = "2025-10-21T16:20:32.733Z" }, + { url = "https://files.pythonhosted.org/packages/1e/77/36fd7d7c75a6c12542c90a6d647a27935a1ecaad03e0ffdb7c42db6b04d2/grpcio-1.76.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4215d3a102bd95e2e11b5395c78562967959824156af11fa93d18fdd18050990", size = 7075684, upload-time = "2025-10-21T16:20:35.435Z" }, + { url = "https://files.pythonhosted.org/packages/38/f7/e3cdb252492278e004722306c5a8935eae91e64ea11f0af3437a7de2e2b7/grpcio-1.76.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49ce47231818806067aea3324d4bf13825b658ad662d3b25fada0bdad9b8a6af", size = 6611133, upload-time = "2025-10-21T16:20:37.541Z" }, + { url = "https://files.pythonhosted.org/packages/7e/20/340db7af162ccd20a0893b5f3c4a5d676af7b71105517e62279b5b61d95a/grpcio-1.76.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8cc3309d8e08fd79089e13ed4819d0af72aa935dd8f435a195fd152796752ff2", size = 7195507, upload-time = "2025-10-21T16:20:39.643Z" }, + { url = "https://files.pythonhosted.org/packages/10/f0/b2160addc1487bd8fa4810857a27132fb4ce35c1b330c2f3ac45d697b106/grpcio-1.76.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:971fd5a1d6e62e00d945423a567e42eb1fa678ba89072832185ca836a94daaa6", size = 8160651, upload-time = "2025-10-21T16:20:42.492Z" }, + { url = "https://files.pythonhosted.org/packages/2c/2c/ac6f98aa113c6ef111b3f347854e99ebb7fb9d8f7bb3af1491d438f62af4/grpcio-1.76.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d9adda641db7207e800a7f089068f6f645959f2df27e870ee81d44701dd9db3", size = 7620568, upload-time = "2025-10-21T16:20:45.995Z" }, + { url = "https://files.pythonhosted.org/packages/90/84/7852f7e087285e3ac17a2703bc4129fafee52d77c6c82af97d905566857e/grpcio-1.76.0-cp310-cp310-win32.whl", hash = "sha256:063065249d9e7e0782d03d2bca50787f53bd0fb89a67de9a7b521c4a01f1989b", size = 3998879, upload-time = "2025-10-21T16:20:48.592Z" }, + { url = "https://files.pythonhosted.org/packages/10/30/d3d2adcbb6dd3ff59d6ac3df6ef830e02b437fb5c90990429fd180e52f30/grpcio-1.76.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6ae758eb08088d36812dd5d9af7a9859c05b1e0f714470ea243694b49278e7b", size = 4706892, upload-time = "2025-10-21T16:20:50.697Z" }, + { url = "https://files.pythonhosted.org/packages/a0/00/8163a1beeb6971f66b4bbe6ac9457b97948beba8dd2fc8e1281dce7f79ec/grpcio-1.76.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a", size = 5843567, upload-time = "2025-10-21T16:20:52.829Z" }, + { url = "https://files.pythonhosted.org/packages/10/c1/934202f5cf335e6d852530ce14ddb0fef21be612ba9ecbbcbd4d748ca32d/grpcio-1.76.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c", size = 11848017, upload-time = "2025-10-21T16:20:56.705Z" }, + { url = "https://files.pythonhosted.org/packages/11/0b/8dec16b1863d74af6eb3543928600ec2195af49ca58b16334972f6775663/grpcio-1.76.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465", size = 6412027, upload-time = "2025-10-21T16:20:59.3Z" }, + { url = "https://files.pythonhosted.org/packages/d7/64/7b9e6e7ab910bea9d46f2c090380bab274a0b91fb0a2fe9b0cd399fffa12/grpcio-1.76.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48", size = 7075913, upload-time = "2025-10-21T16:21:01.645Z" }, + { url = "https://files.pythonhosted.org/packages/68/86/093c46e9546073cefa789bd76d44c5cb2abc824ca62af0c18be590ff13ba/grpcio-1.76.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da", size = 6615417, upload-time = "2025-10-21T16:21:03.844Z" }, + { url = "https://files.pythonhosted.org/packages/f7/b6/5709a3a68500a9c03da6fb71740dcdd5ef245e39266461a03f31a57036d8/grpcio-1.76.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397", size = 7199683, upload-time = "2025-10-21T16:21:06.195Z" }, + { url = "https://files.pythonhosted.org/packages/91/d3/4b1f2bf16ed52ce0b508161df3a2d186e4935379a159a834cb4a7d687429/grpcio-1.76.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749", size = 8163109, upload-time = "2025-10-21T16:21:08.498Z" }, + { url = "https://files.pythonhosted.org/packages/5c/61/d9043f95f5f4cf085ac5dd6137b469d41befb04bd80280952ffa2a4c3f12/grpcio-1.76.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00", size = 7626676, upload-time = "2025-10-21T16:21:10.693Z" }, + { url = "https://files.pythonhosted.org/packages/36/95/fd9a5152ca02d8881e4dd419cdd790e11805979f499a2e5b96488b85cf27/grpcio-1.76.0-cp311-cp311-win32.whl", hash = "sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054", size = 3997688, upload-time = "2025-10-21T16:21:12.746Z" }, + { url = "https://files.pythonhosted.org/packages/60/9c/5c359c8d4c9176cfa3c61ecd4efe5affe1f38d9bae81e81ac7186b4c9cc8/grpcio-1.76.0-cp311-cp311-win_amd64.whl", hash = "sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d", size = 4709315, upload-time = "2025-10-21T16:21:15.26Z" }, + { url = "https://files.pythonhosted.org/packages/bf/05/8e29121994b8d959ffa0afd28996d452f291b48cfc0875619de0bde2c50c/grpcio-1.76.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8", size = 5799718, upload-time = "2025-10-21T16:21:17.939Z" }, + { url = "https://files.pythonhosted.org/packages/d9/75/11d0e66b3cdf998c996489581bdad8900db79ebd83513e45c19548f1cba4/grpcio-1.76.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280", size = 11825627, upload-time = "2025-10-21T16:21:20.466Z" }, + { url = "https://files.pythonhosted.org/packages/28/50/2f0aa0498bc188048f5d9504dcc5c2c24f2eb1a9337cd0fa09a61a2e75f0/grpcio-1.76.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4", size = 6359167, upload-time = "2025-10-21T16:21:23.122Z" }, + { url = "https://files.pythonhosted.org/packages/66/e5/bbf0bb97d29ede1d59d6588af40018cfc345b17ce979b7b45424628dc8bb/grpcio-1.76.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11", size = 7044267, upload-time = "2025-10-21T16:21:25.995Z" }, + { url = "https://files.pythonhosted.org/packages/f5/86/f6ec2164f743d9609691115ae8ece098c76b894ebe4f7c94a655c6b03e98/grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6", size = 6573963, upload-time = "2025-10-21T16:21:28.631Z" }, + { url = "https://files.pythonhosted.org/packages/60/bc/8d9d0d8505feccfdf38a766d262c71e73639c165b311c9457208b56d92ae/grpcio-1.76.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8", size = 7164484, upload-time = "2025-10-21T16:21:30.837Z" }, + { url = "https://files.pythonhosted.org/packages/67/e6/5d6c2fc10b95edf6df9b8f19cf10a34263b7fd48493936fffd5085521292/grpcio-1.76.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980", size = 8127777, upload-time = "2025-10-21T16:21:33.577Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c8/dce8ff21c86abe025efe304d9e31fdb0deaaa3b502b6a78141080f206da0/grpcio-1.76.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882", size = 7594014, upload-time = "2025-10-21T16:21:41.882Z" }, + { url = "https://files.pythonhosted.org/packages/e0/42/ad28191ebf983a5d0ecef90bab66baa5a6b18f2bfdef9d0a63b1973d9f75/grpcio-1.76.0-cp312-cp312-win32.whl", hash = "sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958", size = 3984750, upload-time = "2025-10-21T16:21:44.006Z" }, + { url = "https://files.pythonhosted.org/packages/9e/00/7bd478cbb851c04a48baccaa49b75abaa8e4122f7d86da797500cccdd771/grpcio-1.76.0-cp312-cp312-win_amd64.whl", hash = "sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347", size = 4704003, upload-time = "2025-10-21T16:21:46.244Z" }, + { url = "https://files.pythonhosted.org/packages/fc/ed/71467ab770effc9e8cef5f2e7388beb2be26ed642d567697bb103a790c72/grpcio-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2", size = 5807716, upload-time = "2025-10-21T16:21:48.475Z" }, + { url = "https://files.pythonhosted.org/packages/2c/85/c6ed56f9817fab03fa8a111ca91469941fb514e3e3ce6d793cb8f1e1347b/grpcio-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468", size = 11821522, upload-time = "2025-10-21T16:21:51.142Z" }, + { url = "https://files.pythonhosted.org/packages/ac/31/2b8a235ab40c39cbc141ef647f8a6eb7b0028f023015a4842933bc0d6831/grpcio-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3", size = 6362558, upload-time = "2025-10-21T16:21:54.213Z" }, + { url = "https://files.pythonhosted.org/packages/bd/64/9784eab483358e08847498ee56faf8ff6ea8e0a4592568d9f68edc97e9e9/grpcio-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb", size = 7049990, upload-time = "2025-10-21T16:21:56.476Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/8c12319a6369434e7a184b987e8e9f3b49a114c489b8315f029e24de4837/grpcio-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae", size = 6575387, upload-time = "2025-10-21T16:21:59.051Z" }, + { url = "https://files.pythonhosted.org/packages/15/0f/f12c32b03f731f4a6242f771f63039df182c8b8e2cf8075b245b409259d4/grpcio-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77", size = 7166668, upload-time = "2025-10-21T16:22:02.049Z" }, + { url = "https://files.pythonhosted.org/packages/ff/2d/3ec9ce0c2b1d92dd59d1c3264aaec9f0f7c817d6e8ac683b97198a36ed5a/grpcio-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03", size = 8124928, upload-time = "2025-10-21T16:22:04.984Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/fd3317be5672f4856bcdd1a9e7b5e17554692d3db9a3b273879dc02d657d/grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42", size = 7589983, upload-time = "2025-10-21T16:22:07.881Z" }, + { url = "https://files.pythonhosted.org/packages/45/bb/ca038cf420f405971f19821c8c15bcbc875505f6ffadafe9ffd77871dc4c/grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f", size = 3984727, upload-time = "2025-10-21T16:22:10.032Z" }, + { url = "https://files.pythonhosted.org/packages/41/80/84087dc56437ced7cdd4b13d7875e7439a52a261e3ab4e06488ba6173b0a/grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8", size = 4702799, upload-time = "2025-10-21T16:22:12.709Z" }, + { url = "https://files.pythonhosted.org/packages/b4/46/39adac80de49d678e6e073b70204091e76631e03e94928b9ea4ecf0f6e0e/grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62", size = 5808417, upload-time = "2025-10-21T16:22:15.02Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f5/a4531f7fb8b4e2a60b94e39d5d924469b7a6988176b3422487be61fe2998/grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd", size = 11828219, upload-time = "2025-10-21T16:22:17.954Z" }, + { url = "https://files.pythonhosted.org/packages/4b/1c/de55d868ed7a8bd6acc6b1d6ddc4aa36d07a9f31d33c912c804adb1b971b/grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc", size = 6367826, upload-time = "2025-10-21T16:22:20.721Z" }, + { url = "https://files.pythonhosted.org/packages/59/64/99e44c02b5adb0ad13ab3adc89cb33cb54bfa90c74770f2607eea629b86f/grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a", size = 7049550, upload-time = "2025-10-21T16:22:23.637Z" }, + { url = "https://files.pythonhosted.org/packages/43/28/40a5be3f9a86949b83e7d6a2ad6011d993cbe9b6bd27bea881f61c7788b6/grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba", size = 6575564, upload-time = "2025-10-21T16:22:26.016Z" }, + { url = "https://files.pythonhosted.org/packages/4b/a9/1be18e6055b64467440208a8559afac243c66a8b904213af6f392dc2212f/grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09", size = 7176236, upload-time = "2025-10-21T16:22:28.362Z" }, + { url = "https://files.pythonhosted.org/packages/0f/55/dba05d3fcc151ce6e81327541d2cc8394f442f6b350fead67401661bf041/grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc", size = 8125795, upload-time = "2025-10-21T16:22:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/4a/45/122df922d05655f63930cf42c9e3f72ba20aadb26c100ee105cad4ce4257/grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc", size = 7592214, upload-time = "2025-10-21T16:22:33.831Z" }, + { url = "https://files.pythonhosted.org/packages/4a/6e/0b899b7f6b66e5af39e377055fb4a6675c9ee28431df5708139df2e93233/grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e", size = 4062961, upload-time = "2025-10-21T16:22:36.468Z" }, + { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" }, ] [[package]] @@ -2666,7 +2666,7 @@ wheels = [ [[package]] name = "multi-storage-client" -version = "0.32.0" +version = "0.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2683,22 +2683,22 @@ dependencies = [ { name = "xattr" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/a4/c5294930789d50ac9745d0f04a22c925278b9593add0d4c28c0633cc21d6/multi_storage_client-0.32.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c02be32131ea5d5dedf537a5985aaf318aafe8c361cf58796850eac9219f0966", size = 5274899, upload-time = "2025-10-10T21:36:42.846Z" }, - { url = "https://files.pythonhosted.org/packages/e5/2f/d09abbf037e87943de338bb578091125779fc3b3b4a5a58fd7d4b02bdd63/multi_storage_client-0.32.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:bbfd9a5bdff5337b7698755876bdb1ff1ea906a5c299c7ebb33f2e92cc23d55d", size = 5395977, upload-time = "2025-10-10T21:36:17.875Z" }, - { url = "https://files.pythonhosted.org/packages/62/89/3508d9cc0985da78d11e897e69296d5b88a7e6d59d5bfeee0ecdad2a1ee3/multi_storage_client-0.32.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc5206c6f86a43499bdebfcc1d21617d4263fc7b49fa14afc531098f956f7998", size = 3171327, upload-time = "2025-10-10T21:43:15.737Z" }, - { url = "https://files.pythonhosted.org/packages/ea/a9/e958250c52254e9a2a9944a3fd92521bc3a521a3ade4f36742ff61a8bb64/multi_storage_client-0.32.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6598970ea0b7355185aa92dca79e8dd01669c60060106d4ff60b5cfb183bf7e4", size = 3343998, upload-time = "2025-10-10T21:40:55.721Z" }, - { url = "https://files.pythonhosted.org/packages/13/6c/cbaa0bc8464e3b7c5ab826c008b60930733ebd4e7aa3f258d6d6ee989b65/multi_storage_client-0.32.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8cca798a817cee747d957176eeb716208dbe4cd4c66b4a4d4a24abb73dde6cd2", size = 5274417, upload-time = "2025-10-10T21:39:45.954Z" }, - { url = "https://files.pythonhosted.org/packages/3d/92/fa6cfdc40b39b1f7e92bbbc654d3d1c9882806b561a8e0498c17b5771375/multi_storage_client-0.32.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:2d25c8e42f289bce788606db3cebabe41ab35840a35fce0349c660d214dc3a00", size = 5396247, upload-time = "2025-10-10T21:41:42.428Z" }, - { url = "https://files.pythonhosted.org/packages/2c/4d/a6140ea6a2b1d2d180adeb424305fd97682975bbd0eb52d7ba841eb477d9/multi_storage_client-0.32.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52f6e592a7532b986d46181f42952aeb334c781b83f0b6175c3efe998d01a646", size = 3172948, upload-time = "2025-10-10T21:41:18.508Z" }, - { url = "https://files.pythonhosted.org/packages/83/18/2c68bbcf1bedc943e51fc279cee70e474dab8cc42fef12ce0a4cb80d11df/multi_storage_client-0.32.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35cd768a19e24246dc8207e6812f23a688933a9a1f1dbced0ec7d0f25c0f086f", size = 3344283, upload-time = "2025-10-10T21:44:02.03Z" }, - { url = "https://files.pythonhosted.org/packages/30/fc/ab252dc0f9080706ec5cdce0ea17e76825885b163b4dd52c5b9909e8adf6/multi_storage_client-0.32.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7cdd9af98981430594c4a47a5283b4dac51d6cad7c983b00dd0fec9daaa0061e", size = 5266870, upload-time = "2025-10-10T21:37:53.421Z" }, - { url = "https://files.pythonhosted.org/packages/9d/c4/2ff90f2bc3bc9318b9158640e8cf92d57e96f1daa8c4222f2ff587615211/multi_storage_client-0.32.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:86b0a319cecefa3d9130a0f0976b5059b0234a4a9c01467151fa364350e6679e", size = 5393630, upload-time = "2025-10-10T21:35:30.693Z" }, - { url = "https://files.pythonhosted.org/packages/20/c0/385ab374dddaaad9588ab6eef3dd200bfa6adac4148b674dfae10bfdc1af/multi_storage_client-0.32.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5e481509ca3d09289069c68c519a09eef2c82684e6e50ba2628e043a611de5b", size = 3175520, upload-time = "2025-10-10T21:35:54.182Z" }, - { url = "https://files.pythonhosted.org/packages/15/fe/40663eb2fcca12a22523f39cb03eb00791cd198dbf3d5cd5e9279e354915/multi_storage_client-0.32.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33cfa3f50e54b0318c1488736e1cf8896a292a72e8282aa7793487fe78e8745a", size = 3344998, upload-time = "2025-10-10T21:42:05.781Z" }, - { url = "https://files.pythonhosted.org/packages/02/9f/071749072958d5ed00f728d5287e08a8bd46aadebbb60fcf63a84cdb908c/multi_storage_client-0.32.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c690e2f701bf00e2dc117f7c9b89f88ca7aa86f8335e293597bdada6adec11fc", size = 5265048, upload-time = "2025-10-10T21:44:25.477Z" }, - { url = "https://files.pythonhosted.org/packages/2e/eb/76abc34996a960c7c23f61e9d07b2861ed96047ba0f768aa74e279fab76a/multi_storage_client-0.32.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:14b1bdc765d060b250335b495c9fca5bcc0957625244b1bc4803029b2755c7b4", size = 5392366, upload-time = "2025-10-10T21:40:32.831Z" }, - { url = "https://files.pythonhosted.org/packages/e9/35/56255ad4247d877d13accf35dde3e0ec8f2087290def6adbe787ddc952d9/multi_storage_client-0.32.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c1f139337d7320af3f15d725aee172893386ade01d89af0ae5aab19d501b354", size = 3174684, upload-time = "2025-10-10T21:40:09.993Z" }, - { url = "https://files.pythonhosted.org/packages/3d/a4/98761f87f30ec7f1afb730a648e58b386067c00c2d8736b18cf543fff57b/multi_storage_client-0.32.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:643bcf247be3bbaea0004c2e003af0aa8ae79258087ed2360670e685499698ed", size = 3344163, upload-time = "2025-10-10T21:43:39.164Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c4/6279fb7d4b8b0a7af060047d592f00f8d49c547adfebe50bcd8d0d2dc8a5/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:df52b3040ef5698c6388fa589bd63812ae0d2f967d358a792abcad5638686590", size = 5282006, upload-time = "2025-10-23T03:45:37.761Z" }, + { url = "https://files.pythonhosted.org/packages/22/3b/23d8beccd73b887c4552bf884275611255b5028388fa3317365cd56c2a93/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:370da04b1e56a601ba505a29d42fcabc19b583e10d725a37bc0c11ba3573d211", size = 5403083, upload-time = "2025-10-23T03:53:11.998Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ad/dc355d05fd369da0d800e5f7de24da0393f542c5a6f775f6bcee7edcacb1/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c57749a28ec5d49440f465fd73e4e2feaab18ece9b6e57c73395308b41950f66", size = 3178432, upload-time = "2025-10-23T04:07:00.543Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ad/97b54419d8a58f696b85504568391a627641152f80650d7d2697fc2702ed/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7d95f5fe094aab00a240bf6aa11dfe85bec293b76b3688ec3a9c33d86c751d2", size = 3351102, upload-time = "2025-10-23T03:47:47.622Z" }, + { url = "https://files.pythonhosted.org/packages/52/28/1038a68b9df1b179a61967ce9f7d2e80b9954cdb289801afecde5f7660db/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4b5a0f5a0b7684835be20ae6782070884982a86665e9bab317375a56a20294d1", size = 5281523, upload-time = "2025-10-23T04:06:36.671Z" }, + { url = "https://files.pythonhosted.org/packages/6c/c5/e18de5e2a2671efdc0a12383b8d63f523044ca453525725b3450d0179c0e/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:0db694311f90f44ee8f6f7734a14a0857738a467f2ae201649218a3ecf1f6ab2", size = 5403353, upload-time = "2025-10-23T04:07:25.941Z" }, + { url = "https://files.pythonhosted.org/packages/7e/c9/d9f65eb2370151dbbb06925f4216ee017e6cdbf7657263fd98e60944e52b/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cbe3a0b856f0b968f9fc693670a521b5a995b625351241ca008f866fdfff62a", size = 3180052, upload-time = "2025-10-23T03:57:32.797Z" }, + { url = "https://files.pythonhosted.org/packages/e7/38/08b9d84c93b19ae87caf542ae77f17dfa44a85281ba09de660ffcf3a7718/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:018e7e82255feeff973ff02563f11a30f5e507e4cbc87a2167a9568740144ef2", size = 3351389, upload-time = "2025-10-23T04:02:07.348Z" }, + { url = "https://files.pythonhosted.org/packages/6a/31/c95634a27723b5ba9d2d74158444cc5e40b151b51ae59ca196fc9993f039/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:030b3a592c6352605e9ebdb8d9303dd42daf5d171ffa684f3283d4a5c6e2edfe", size = 5273976, upload-time = "2025-10-23T04:04:35.99Z" }, + { url = "https://files.pythonhosted.org/packages/8c/cf/82d1778d73c3baaec331da4ae8d01fa7934bcd73336aa88a08d86d080347/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:14dc0ace16d3830917427d6376d14ef62bd053fb2509f893998555ca1e9c4dcb", size = 5400735, upload-time = "2025-10-23T03:58:37.149Z" }, + { url = "https://files.pythonhosted.org/packages/fc/34/a6194ec725ef80c02de58b5ed3520bb1711807df75a27f7214effd22df34/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2821765d5c6de365b5b1dcdc7cf2ebba719ff4061fd02975639629f8aa319f6", size = 3182623, upload-time = "2025-10-23T04:03:29.551Z" }, + { url = "https://files.pythonhosted.org/packages/8f/36/7ec85178fd1dd69c278407a82acaccfb806449deda13f3dbd41f653d73bd/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f92f89480c58067fa53c178785b86e7650e16f277a61a732a8a7019173b16129", size = 3352104, upload-time = "2025-10-23T04:08:51.005Z" }, + { url = "https://files.pythonhosted.org/packages/88/ef/f2eb2efefb0e0588b29ed573b8354ecd72c38e6143da7ed5ecf53e859bf8/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed9af7e77e3cbac1f614816062b36975dcbc610bd3f8c86741d48aa18c718781", size = 5272154, upload-time = "2025-10-23T04:07:49.572Z" }, + { url = "https://files.pythonhosted.org/packages/1e/49/050aa4fccb2579d2ef5bd0d27169ec98fe85c92bba7a2c31154c491a4f75/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:c9d75e95a266ee858cf20c88ed255021552de67a40af9c8884d2fc22037dcd2b", size = 5399474, upload-time = "2025-10-23T04:09:14.545Z" }, + { url = "https://files.pythonhosted.org/packages/f6/4b/70c2df3b60c28360f185188d351e9c3958b702614963a09ffb1dc251c1ca/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48195a2ab9e6e9a2763bde17184cad2bdef82684353e210d0d325f20cea18869", size = 3181788, upload-time = "2025-10-23T04:03:10.404Z" }, + { url = "https://files.pythonhosted.org/packages/9b/96/5008852677fdad10eb9d8dd08a6ea58c6f7e820199a3b2c56607186ac6d5/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd64403efdcee2a6efcf7bfdb01422dd174c146014563b09f44590346fd835e6", size = 3351269, upload-time = "2025-10-23T04:00:34.714Z" }, ] [[package]] @@ -4679,109 +4679,109 @@ wheels = [ [[package]] name = "regex" -version = "2025.10.22" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/90/f2/97d95db85e11cc85f97581cfc8b4a0405c7fb6099003c23ffaaa0cb4f31d/regex-2025.10.22.tar.gz", hash = "sha256:cc50db098b9d678ace33176a3ab4099616726ae4680fee6ac292302e8950fc4c", size = 400985, upload-time = "2025-10-21T00:48:37.365Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/42/2904bb22aaaebaa8348673cfbacd704dba2160d847bf17cc6209349a8b7d/regex-2025.10.22-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:afa5307263ef2883cff3c1055a58239d97c28a888b813489b04ff063f64610d6", size = 487959, upload-time = "2025-10-21T00:45:00.385Z" }, - { url = "https://files.pythonhosted.org/packages/28/87/ecc953aec36f3c79585d40d2ce3a90ae28aed434c681cfcbed19ce9b4bba/regex-2025.10.22-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cfd87258e5879cec2f02907a043d69d72c864723209565ae8cd905a823b94976", size = 290421, upload-time = "2025-10-21T00:45:02.122Z" }, - { url = "https://files.pythonhosted.org/packages/e5/81/aca223093854fb1e385580f6e7ef48fc895ecfe2a8d66133850b8cc12d49/regex-2025.10.22-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:53a184fa09354b02f18fe3c50de3b809386dbc1bbfa8e51598e300342cde5a11", size = 288284, upload-time = "2025-10-21T00:45:03.587Z" }, - { url = "https://files.pythonhosted.org/packages/42/36/08e03e31cc9dbf5951012a2188d5fd8c79ddc10c2e12849bf434158a1ae3/regex-2025.10.22-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:924a79f8e248271713bc0e1fdd7e48b4632a61152f448e446b8fd724f0715ae8", size = 781457, upload-time = "2025-10-21T00:45:05.105Z" }, - { url = "https://files.pythonhosted.org/packages/af/28/a1e08f43b850948044b3ab3169472c62e0d59be3e47049a27817a8b3c694/regex-2025.10.22-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:84cd327fd1f245e74a6fe0827e2775cd1de83c4a8cbce1da1627d07c233c5f58", size = 850605, upload-time = "2025-10-21T00:45:06.647Z" }, - { url = "https://files.pythonhosted.org/packages/5f/65/d864a9a4a3e0ba4ff3f8798481cc9bdc7304a337c999b69e148d0ad320ff/regex-2025.10.22-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:28c4fcf105ae1a09769110669280a3dfe84b291d856368c8b4d77ccf4345434e", size = 898563, upload-time = "2025-10-21T00:45:08.618Z" }, - { url = "https://files.pythonhosted.org/packages/cc/95/6ae15342e49b9fc1cd8aef350675b3b53446599114c190b3b9df5f4e0bce/regex-2025.10.22-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e32f91f414442d0d6fc6e0b7b58e05afd4deed92c852796f3122822f646fc42e", size = 791535, upload-time = "2025-10-21T00:45:09.888Z" }, - { url = "https://files.pythonhosted.org/packages/ff/f9/b557590b7ed1f5b8d2452ba8eda8959c4acacbad4ddd764df32438e74f2d/regex-2025.10.22-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:11d2a65fd118c1e409e27dab9aa0a65ebbcab1b836ed441e6e4f78dccc4bd6ef", size = 782461, upload-time = "2025-10-21T00:45:11.636Z" }, - { url = "https://files.pythonhosted.org/packages/94/dd/1cf6bb815f96137f500282ff209c4cfddfaebfe52cf7eb52ce183d389b41/regex-2025.10.22-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7ebde462d55fbbc96d888dad35bd413c8a3d53e3423aa23cc8f01c3398f39148", size = 774582, upload-time = "2025-10-21T00:45:14.192Z" }, - { url = "https://files.pythonhosted.org/packages/03/17/5d6777c93df720c755e4a3b85badaaece51dfe8161cbd1cf70b5a6522a5c/regex-2025.10.22-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1093a856ed0afdcfc89f65c97a143b1593538827701cc6519c6bc0f1c150e5f6", size = 845647, upload-time = "2025-10-21T00:45:15.486Z" }, - { url = "https://files.pythonhosted.org/packages/dd/65/431ae5c24c4db5a26b9d5a4c927381b351c6eaa031b61c91e2ed17857135/regex-2025.10.22-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:716a35741a61333c16d29e544685f3dbfa1df48593ad07e92f77b4a831b4c271", size = 836036, upload-time = "2025-10-21T00:45:16.869Z" }, - { url = "https://files.pythonhosted.org/packages/2f/0e/12c4dce8880364dfb0f31a46ee8dc896805fc6cef473b7491879f30ebd33/regex-2025.10.22-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4782376eb8dbeacaa69b34498e280e8e95947532f8938081e916bbce871bfbab", size = 779705, upload-time = "2025-10-21T00:45:18.472Z" }, - { url = "https://files.pythonhosted.org/packages/1d/6b/cd053d41840fd1e4a2cce4abab07248d4ca70c52ed6555490b56e077920c/regex-2025.10.22-cp310-cp310-win32.whl", hash = "sha256:086cc892b1f8e1d8fe7a060012268a21b96ec25b87b4618c12a853564261f63e", size = 265664, upload-time = "2025-10-21T00:45:20.163Z" }, - { url = "https://files.pythonhosted.org/packages/22/66/557b06253b10ea57198362fb4f6df8860f9d84ee25fcf9a7ca065c9c9984/regex-2025.10.22-cp310-cp310-win_amd64.whl", hash = "sha256:e25f9fb71b775a6d97096cb6c2ac26c675e8c99219afac7f9321f2f4daa46227", size = 277587, upload-time = "2025-10-21T00:45:21.579Z" }, - { url = "https://files.pythonhosted.org/packages/32/44/37a7cbcac47804b4ed34ffb03da494db7eef3992d42d4eb4fa4e0e840a11/regex-2025.10.22-cp310-cp310-win_arm64.whl", hash = "sha256:d0ecea4950b363a9bb1d01c35cff73c0bc762ebdf91109c806ca33a0cbc9ff03", size = 269980, upload-time = "2025-10-21T00:45:22.889Z" }, - { url = "https://files.pythonhosted.org/packages/4e/88/739a7c7dc641976fa3d66c0770f6bb2c6ef5cc3f6b44e039f58bffcfbff3/regex-2025.10.22-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e6b0c007a8b6a9500354eeab8478b18b1cca6ac3fd500f6c3ae017ed617de497", size = 487951, upload-time = "2025-10-21T00:45:24.675Z" }, - { url = "https://files.pythonhosted.org/packages/8d/6f/7157a845b79bfc68560f17268e8b6c2cd5757b5ca396608118a8209c3489/regex-2025.10.22-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:51170deaffec87e48004f9dab53ff0c4db8d10e2ff7630a78467ccd50f656328", size = 290421, upload-time = "2025-10-21T00:45:26.281Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e4/a73127c12d6ed1ee97b81aed80b3a63499e409fe947cfcc491197312ebf0/regex-2025.10.22-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:333afc5e00f43598080ff1d00d5948462905ea514343fbdc5a889e7c3d7c23b6", size = 288282, upload-time = "2025-10-21T00:45:27.988Z" }, - { url = "https://files.pythonhosted.org/packages/67/69/10f1d84cd43ce52257cbc8b4af0e1a7b1b61988ee22e494eda7419702884/regex-2025.10.22-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31221a2a095173e3121842c9f864a5902703dc5ff0d3298c0fe08f9a8a1d80b1", size = 793289, upload-time = "2025-10-21T00:45:30.192Z" }, - { url = "https://files.pythonhosted.org/packages/dd/30/cb4dd079787a76c96acddb15465bc1895ef67a02c4de60890b7b073328ad/regex-2025.10.22-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5de5505e5aac808e2a97515e1d74db99da23259da9dfaf833c1a10f8972d2096", size = 860320, upload-time = "2025-10-21T00:45:31.587Z" }, - { url = "https://files.pythonhosted.org/packages/ea/6f/25fd36431739dce27bdecb7c6a7e215a545a40577e683fc2708fa6235639/regex-2025.10.22-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:809c6f74840f18574da0ce8365d8635f0f1568552363b9a54adf0b41039a4406", size = 907011, upload-time = "2025-10-21T00:45:33.214Z" }, - { url = "https://files.pythonhosted.org/packages/0d/96/67fc321360de627c5406aed97be803240227770a29d09117157d56899c4d/regex-2025.10.22-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bd26a33cad0f24c045fe2d84e70a75f8bd82cb79121382c0ed6c035d247854c", size = 800313, upload-time = "2025-10-21T00:45:34.943Z" }, - { url = "https://files.pythonhosted.org/packages/17/e9/eff1e7cebb027130242b70b2c81a07d9a2d98414c67ea81fac5e32cda8d2/regex-2025.10.22-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:330b0cd6922f93cc0322002467f347b605555a4d64997f3598c06cf8c1303a7f", size = 782837, upload-time = "2025-10-21T00:45:36.335Z" }, - { url = "https://files.pythonhosted.org/packages/a5/64/d9eab04a6f3c043ef5d9cabc94d2d6b522c2bc57e68de8e6f88b080ff66a/regex-2025.10.22-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6763d77bcca503aa1c24b675d05d44c764149f222b7eb6bb3423cebea5eec6e9", size = 854270, upload-time = "2025-10-21T00:45:43.158Z" }, - { url = "https://files.pythonhosted.org/packages/84/8f/a354bf4b41bfa157d731d3628ba677aff7f0c33603939459bba5ba2e4204/regex-2025.10.22-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1eba7681913574c0a8025d435bbc6d10855b273d8f8c0e2d2fc9a981cd05704f", size = 845770, upload-time = "2025-10-21T00:45:44.776Z" }, - { url = "https://files.pythonhosted.org/packages/e7/9e/40a95cc48771d29a55e36d98e34be4f6a8d965fef99dff9056003e32273d/regex-2025.10.22-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:25b80a2ea85f6e06cecf5a3d3a51adb62d19072055bf39d9cabcb29462fffd1d", size = 788777, upload-time = "2025-10-21T00:45:46.551Z" }, - { url = "https://files.pythonhosted.org/packages/68/87/c9d542090675d014d36bece68d48c314a733ad59d3f4999103813a7bb020/regex-2025.10.22-cp311-cp311-win32.whl", hash = "sha256:c4d655be922039bb4ff8fd8363c71bc8da439f7c7260045e4ff10c774e80606b", size = 265667, upload-time = "2025-10-21T00:45:48.211Z" }, - { url = "https://files.pythonhosted.org/packages/47/89/98075b8c5a30b70f156af5caa833f57d0967cb0385fbcc1df37a9a0ca702/regex-2025.10.22-cp311-cp311-win_amd64.whl", hash = "sha256:b7ec554c0ed3aa93e0fb91c436b69654c11ab84a701ae3918dbe8fcd1b73984a", size = 277601, upload-time = "2025-10-21T00:45:49.844Z" }, - { url = "https://files.pythonhosted.org/packages/1f/b7/6664611fc6bdd38e8bf773e135954d10c0ee4326099114b0d00a52c85c96/regex-2025.10.22-cp311-cp311-win_arm64.whl", hash = "sha256:c4347ab5146bdd8b27fdb831f8cf882ec0238c7fdb6baddda1344d07ea8245b2", size = 269973, upload-time = "2025-10-21T00:45:51.535Z" }, - { url = "https://files.pythonhosted.org/packages/95/a8/3380a8cb20c255878a9f1165b33c4d6a31d8f5417650c22b73bdcaadd281/regex-2025.10.22-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8b66971471306def7e6baf18ead3f416347d56eb5e295f8a75014d13be92e9fd", size = 489185, upload-time = "2025-10-21T00:45:52.929Z" }, - { url = "https://files.pythonhosted.org/packages/b0/1c/e1eb33fc1f3a7851cc0f53b588790e14edeeb618e80fd5fd7ea987f9957d/regex-2025.10.22-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8c93b179960f4f2f517fe47da9984848d8342a6903b4d24649f4ee9bd22ccd3c", size = 291124, upload-time = "2025-10-21T00:45:54.934Z" }, - { url = "https://files.pythonhosted.org/packages/1b/21/6cc0fe9d4ebd7d6e19c08e77f41082103d52c671eb7eb01cc032e9bccbd4/regex-2025.10.22-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9b4fa8d221b5db3226029978c8c3f66f2e4c6d871e94b726bcd357e746b7a63", size = 288796, upload-time = "2025-10-21T00:45:56.248Z" }, - { url = "https://files.pythonhosted.org/packages/23/b0/d74069acbcc60b54977e693dd673099352b024f7f037cec201b0d96b7d99/regex-2025.10.22-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2a0d4e5f63c8de13fbab94d4a25cc6b02f1007b84e2d4c74f48c242eacb06f1", size = 798441, upload-time = "2025-10-21T00:45:57.896Z" }, - { url = "https://files.pythonhosted.org/packages/2c/f3/69cd09c226ce0fc6a5cf48b5dea716c0139abed41d02fa81fa774e56e713/regex-2025.10.22-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d8df6c82c544eed8314667a1fb8f705a9a802a9d6368045354319588ff56708d", size = 864038, upload-time = "2025-10-21T00:46:00.298Z" }, - { url = "https://files.pythonhosted.org/packages/8e/b0/77bd0e6838f579cc5a02b9e18bc0a759d0ed85b9a8d4d44ad6d3478a40ec/regex-2025.10.22-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a114c2735369334a755a844abd15d5a12716635cc4677fb4e6d793ce369310f6", size = 912054, upload-time = "2025-10-21T00:46:02.358Z" }, - { url = "https://files.pythonhosted.org/packages/2d/41/c320c3408050eefa516d352d9e05fd4d6af5da7ec0daea56d1e68bb9096c/regex-2025.10.22-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5d53115edada199723b831a49c7e1585ddda7940fb2ba7a78d12bf22e92f23e2", size = 803374, upload-time = "2025-10-21T00:46:03.837Z" }, - { url = "https://files.pythonhosted.org/packages/88/ed/0942c27223ce6bff95087f4859991634d995d6e186807e038fd1c2c3759c/regex-2025.10.22-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b4a7d813fdffe99ae0ecc17c80f652c8946c05a6a090eb2560719d02dfdb4b0", size = 787714, upload-time = "2025-10-21T00:46:05.934Z" }, - { url = "https://files.pythonhosted.org/packages/1c/40/10e2657ed24966742efd68eeb566e26af1eea3925dfe761ce14260a69161/regex-2025.10.22-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:81fb24976e3f71d765edec8a3175abb10359918d8997ca6a756fd68dd3c051f6", size = 858392, upload-time = "2025-10-21T00:46:07.801Z" }, - { url = "https://files.pythonhosted.org/packages/f3/48/bd382281e2f3bcfc2f355b5283ef16d8175b6df4cb6ed532529b715baf07/regex-2025.10.22-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d881e96a443528a83f46ab69714befeb35f4d0caf359c43a606b82cb717a5df9", size = 850482, upload-time = "2025-10-21T00:46:09.893Z" }, - { url = "https://files.pythonhosted.org/packages/2e/5c/fdc0ac5eb3f21a6f19158cce3150e57a65d9770709b8521e09fe9febe813/regex-2025.10.22-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:42abc81ee54e06bef4dbc8e7b8394a57882c718ed3c6aabfea47e429feb94ee9", size = 789633, upload-time = "2025-10-21T00:46:11.687Z" }, - { url = "https://files.pythonhosted.org/packages/a2/ef/c2e63968c9130a17d79431ba8aa98ada02962435436ef506fb4cef139760/regex-2025.10.22-cp312-cp312-win32.whl", hash = "sha256:db30ab87b3d745b7e95e69099e1c4bf544c3f3800b9376b935943e86f650705a", size = 266060, upload-time = "2025-10-21T00:46:13.577Z" }, - { url = "https://files.pythonhosted.org/packages/5d/9d/57bc04978add42a62391f8082e94ec3a8c3448d49e349ede8c2c66ca0a55/regex-2025.10.22-cp312-cp312-win_amd64.whl", hash = "sha256:64190fa0432ed254416898ff3b687648e025445bfa357988f20f1332f651f650", size = 276928, upload-time = "2025-10-21T00:46:15.18Z" }, - { url = "https://files.pythonhosted.org/packages/89/50/760700909a618de1c2405f3a0557a3ec9b4eba516a261aa85fe973d3a354/regex-2025.10.22-cp312-cp312-win_arm64.whl", hash = "sha256:cdfc74d0af9b0cb9bd442619489582b32efc348db651a44967ba5fb71b8d3dee", size = 270103, upload-time = "2025-10-21T00:46:16.903Z" }, - { url = "https://files.pythonhosted.org/packages/c9/25/4c056f41ae981b41e316e44e0ba76efe0b3655c8a070580c3c069765d4e8/regex-2025.10.22-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d49aebe7cb99d80680ff55ff9475bf122c6e3e8a34aec7496aefc90196ac350", size = 488944, upload-time = "2025-10-21T00:46:18.67Z" }, - { url = "https://files.pythonhosted.org/packages/b5/4e/79e7882d35a613517a63d574d80e68c2e8e2d4c67aeaa0c564025cb9e3d6/regex-2025.10.22-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:45367f329e32988d33e5ebdb69b7fb9eb3fc1d9b789b00724e5ddabb75647064", size = 290995, upload-time = "2025-10-21T00:46:20.089Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ed/228d94f8af1da578100822d7a3e8a82dc4f0ffbf07c626293deb0b0aff86/regex-2025.10.22-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fb449bc9d0f379c1064986621e6088a8d28cf628074700c18bd151855f4c9e2f", size = 288686, upload-time = "2025-10-21T00:46:21.769Z" }, - { url = "https://files.pythonhosted.org/packages/be/e9/203bff375a555b79d36fc707ad99584dc8847b4ef5182656a6e156946395/regex-2025.10.22-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:154919a381798a7ff07371bff86c6ca4cd9cee6110d163867ff12311ad18d7ac", size = 798465, upload-time = "2025-10-21T00:46:23.55Z" }, - { url = "https://files.pythonhosted.org/packages/fd/31/0660d5bbefcc0ecb0e4f654f69a28a47253da7997ae64fc24e86aff27971/regex-2025.10.22-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:29b4f447d8a514021011d24a50979d5aa1e7d2a99b150eea979221849bd9c77a", size = 863995, upload-time = "2025-10-21T00:46:25.129Z" }, - { url = "https://files.pythonhosted.org/packages/c8/45/a9e1b6fc5b91976ef5b7f456213da52fb4ce24a7846de7d8777a1c305ac5/regex-2025.10.22-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c0bd5398ca8b3f9c1f0d09719c195124e955c4677b55b9d5a728eca5f407eb03", size = 912144, upload-time = "2025-10-21T00:46:26.747Z" }, - { url = "https://files.pythonhosted.org/packages/6b/86/98813e259d8b791891b27c2a6e7ce4fc23bc4222fb46e55f473683ae586e/regex-2025.10.22-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecb0fbbd37ae701d12b90bacb03ad36c89b0d2d67eab02b5862ab3e1a50ea49e", size = 803370, upload-time = "2025-10-21T00:46:28.314Z" }, - { url = "https://files.pythonhosted.org/packages/fc/8e/53f27f735368896d777603cf76124b74949ce89123c2c99006834ee29924/regex-2025.10.22-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:419c5fff30240ed10ee55f2d7dd3b54dcc02502568e94be4522b54be63d59aff", size = 787763, upload-time = "2025-10-21T00:46:30.378Z" }, - { url = "https://files.pythonhosted.org/packages/c5/83/2759cdcdff775205871e10db4d1bf09afa7fbb55af850c5cfb0e9e699090/regex-2025.10.22-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b71b5c4a00467304ebfae0235b763129af2de074b02e78e959d8990c553c0a6e", size = 858336, upload-time = "2025-10-21T00:46:32.287Z" }, - { url = "https://files.pythonhosted.org/packages/6f/b5/6fe37d832e1e2cb4e82c444844e1eca88de9171d766f2f9cbe308409a2d8/regex-2025.10.22-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa800228137127de4cce1875f0ddeb4ce19d33fd0ac6450c3b00b942866748e7", size = 850401, upload-time = "2025-10-21T00:46:34.275Z" }, - { url = "https://files.pythonhosted.org/packages/30/57/b9c2b316a87dad82a8845b1854be743441ef375774497f11f13658d016b7/regex-2025.10.22-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:44c8c46b7160260e0cd8b0f7c20ff6269976278d8187646d3e741d8dfe5fcdbc", size = 789738, upload-time = "2025-10-21T00:46:36.421Z" }, - { url = "https://files.pythonhosted.org/packages/d1/5f/e8bb23662647d519d1ea24f9b30d19c291237aea721662b3d563af6326df/regex-2025.10.22-cp313-cp313-win32.whl", hash = "sha256:701c53e8cb0c73c39d72dc4be71ee88478904b4066bd31f95e2b6fdfac49102e", size = 266055, upload-time = "2025-10-21T00:46:38.062Z" }, - { url = "https://files.pythonhosted.org/packages/d9/12/035e5c09d1c5e64a640b3c0b2e4b01580e8a36cf0abb99d978422601158d/regex-2025.10.22-cp313-cp313-win_amd64.whl", hash = "sha256:4a3a6320015223d0a14fdc2706e65ca64e7e3d97016acef1349a39c3a0bbbd81", size = 276919, upload-time = "2025-10-21T00:46:39.636Z" }, - { url = "https://files.pythonhosted.org/packages/be/d3/44dfed03966d26942c53597951035cece3ecf4cb56945ee0bf15014ff092/regex-2025.10.22-cp313-cp313-win_arm64.whl", hash = "sha256:dbb3eb2433ad2158e9719369ea2184329145f50ffae2e6328985fc0de6a71984", size = 270104, upload-time = "2025-10-21T00:46:41.349Z" }, - { url = "https://files.pythonhosted.org/packages/9c/b9/ccd603c3ad0eead387eaa79203eca0c6846e065e10cb30a717ce2813a878/regex-2025.10.22-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:3fcce0c2b0b7a8f4a029154d7ae9040d2ff5bed77085cd3bf9a56b61a8cda009", size = 491846, upload-time = "2025-10-21T00:46:43.097Z" }, - { url = "https://files.pythonhosted.org/packages/06/f4/e96216c9faf36fbf42474702afe6efdaecf5b9e5fbce0a77ead5f00191d8/regex-2025.10.22-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:46338f1390c9ddf6c163949cd53558a89ab7c7edbb4713b9d2b7cdf71c87a75a", size = 292541, upload-time = "2025-10-21T00:46:44.996Z" }, - { url = "https://files.pythonhosted.org/packages/08/19/26b9fbd2daac8e783d3f008e5e18e99c9f31c880c9ba644511e3107e2f86/regex-2025.10.22-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ca58844dc33b4297ae24505db9528be6862a8b2b961f60f6acc0869ea1291d1a", size = 290899, upload-time = "2025-10-21T00:46:46.564Z" }, - { url = "https://files.pythonhosted.org/packages/9b/43/cd1512382caedfdb2f663948485ab001cb073631a0d94706db524385eaf5/regex-2025.10.22-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c4d54ae939c325b8027277f998cc7dd175447745bd12d6a93c09ebebda1226a", size = 807309, upload-time = "2025-10-21T00:46:48.408Z" }, - { url = "https://files.pythonhosted.org/packages/13/69/6aaa805ed5b53a1a3d6115691745cfd20370f3dddc027f4fcdb8cb050251/regex-2025.10.22-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8c311ee233a59483d6e3b78d669981f387ca2ce162b029895bddb74cbc37e53", size = 873241, upload-time = "2025-10-21T00:46:50.056Z" }, - { url = "https://files.pythonhosted.org/packages/75/21/224fe5b25fff1c6ac921246e51603785e688fc8e0d23dabc77d7e62b1b6b/regex-2025.10.22-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64fc5557f8798a6ac439cabb80ea28c97e509e03ed1a1b23e16f6f7f95ee53fc", size = 914793, upload-time = "2025-10-21T00:46:51.648Z" }, - { url = "https://files.pythonhosted.org/packages/15/56/9349b5a283b3b05387ecd147962880ef1532827c073d5caf0d291048aaea/regex-2025.10.22-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7957cab18a1148752372bd6acf23ecc54785d13439ef14024134d37e51e9b77", size = 812580, upload-time = "2025-10-21T00:46:53.585Z" }, - { url = "https://files.pythonhosted.org/packages/39/71/450cb85d91bc3c6e01589caa6de4b28445ae77fb8915895d9427996926d7/regex-2025.10.22-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9adaf0a0cefd826192045946bb8922e19d321934fa661efa3744d0aea130b667", size = 795344, upload-time = "2025-10-21T00:46:55.312Z" }, - { url = "https://files.pythonhosted.org/packages/75/b3/f8e6f2651a22662b00005f0b26f53438b89b33159469e8a279a07b9d951a/regex-2025.10.22-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:61e564ff5eb999e2ccf8311d7cb61ecb24c502ee5116b181b0348b4d882de480", size = 868213, upload-time = "2025-10-21T00:46:57.255Z" }, - { url = "https://files.pythonhosted.org/packages/37/aa/9dfa760dd368f2a9bc01d1a50edbc838b5ce330ca4142149420acde6d13d/regex-2025.10.22-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:1aa9a1ec0ab3f10210626795bcfe84b0ac20490d085ea4d7628fe381a98592be", size = 854538, upload-time = "2025-10-21T00:46:58.992Z" }, - { url = "https://files.pythonhosted.org/packages/55/62/e3ef2330f1b2e63fb1e096a53d3335a2dea5e77364cf8a17341e8acb24f1/regex-2025.10.22-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ffe59e0b0d93cf4999565236b5a36a7d22b10f5f7fed59f423bd5f7542453832", size = 799346, upload-time = "2025-10-21T00:47:00.738Z" }, - { url = "https://files.pythonhosted.org/packages/45/7e/ae3de5c8a26394be05ad1e2b252dd82425ab72ff7f4e79b03f8a431ecbfa/regex-2025.10.22-cp313-cp313t-win32.whl", hash = "sha256:36ba31e30b9c74a536a08635ca12cb0588ce39298b2cd7904194c2227c284d88", size = 268657, upload-time = "2025-10-21T00:47:02.958Z" }, - { url = "https://files.pythonhosted.org/packages/4e/1a/d6673cb4f28a368d51316b67c1067a246651731c8fbff50e99060b8ed483/regex-2025.10.22-cp313-cp313t-win_amd64.whl", hash = "sha256:d7d9992c44a5186c6539f9717b6a6e639d4f57f919d238e660f4ce42a22f0ced", size = 280076, upload-time = "2025-10-21T00:47:04.973Z" }, - { url = "https://files.pythonhosted.org/packages/26/40/30702d35b888a6cc1a290ec6b244109f827eddedb61af77b42c6c5f63928/regex-2025.10.22-cp313-cp313t-win_arm64.whl", hash = "sha256:28ce6c33b836c63ef0a4ec137fd0f136627b71075a5cfffb8c5aaef8ce4535b6", size = 271219, upload-time = "2025-10-21T00:47:06.678Z" }, - { url = "https://files.pythonhosted.org/packages/93/f2/9977dcdf246c79d906a0286b440a9cd40df04848044b7a269e9b4dcaf2dd/regex-2025.10.22-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:6f8d3d623d1bd4a8eb6eecc86e9ec80a130f071232f8e3d9d907693ca63ab5b6", size = 488962, upload-time = "2025-10-21T00:47:08.288Z" }, - { url = "https://files.pythonhosted.org/packages/b4/f0/1eff0e3a1d71cb81556b36320295f2970555de0b7d1378760aeb2deed132/regex-2025.10.22-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:f6d9cff7fc70884e3938ea0887dc06ee588647df9ce4b943a3f95b18f8479a58", size = 290936, upload-time = "2025-10-21T00:47:10.191Z" }, - { url = "https://files.pythonhosted.org/packages/37/fe/ca2f6f955f897ace6539ada97c9419d01b254686b24317c26d738dc641bd/regex-2025.10.22-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6442d1cd67645854d04ba26ba47f697200b77fb6a11a43dccf38406113515c4f", size = 288767, upload-time = "2025-10-21T00:47:11.939Z" }, - { url = "https://files.pythonhosted.org/packages/9a/07/a10e2d7cca7b714d1be61cae05aaf3a44517f29b933e8113d490a1c5e908/regex-2025.10.22-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4385761deae1f5082f308267482530b9c286e005627d3afca80eb0bc6de97e70", size = 798885, upload-time = "2025-10-21T00:47:13.713Z" }, - { url = "https://files.pythonhosted.org/packages/ae/ba/e5f89ed297ab495c1545600ca3d67133e0a008bdea17af1f78e6ab0b8a2e/regex-2025.10.22-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c859b07e2ee607881e6ce7e9b99a02730408cfc3f7e9f5d407c015eb79dcb60b", size = 864767, upload-time = "2025-10-21T00:47:15.542Z" }, - { url = "https://files.pythonhosted.org/packages/6e/2e/2a4c50a4216c155dbb98b0243e6b918cfa4f19c293eff381363db657e5f0/regex-2025.10.22-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c4b2eeb15be534fd2499eab59696fada35a5cb2e45606e381d6a35f5dedc8fcf", size = 911393, upload-time = "2025-10-21T00:47:17.327Z" }, - { url = "https://files.pythonhosted.org/packages/2b/67/38d6f87b2fdef338fb6d1531abfeac61be5b14178ce0467fd87ca75bc7de/regex-2025.10.22-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d79c066145e1229c5733e4d774d17cbc20899681a9086f2a9f943eb4df18d8ec", size = 803144, upload-time = "2025-10-21T00:47:19.095Z" }, - { url = "https://files.pythonhosted.org/packages/3d/cd/24aa1da7beab4f98e637b56b5eac8aede966e27ac184e8d8462fc038ed01/regex-2025.10.22-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8ab1d067208191540ca9f38e9e7ae002da1b1fc31d1b21b818d1bd7a944a673e", size = 787831, upload-time = "2025-10-21T00:47:20.845Z" }, - { url = "https://files.pythonhosted.org/packages/bf/94/e46d13ec3cd6a0bce252b74a71ed711b6767c815967a16ce64b50db66a2b/regex-2025.10.22-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:8f9c02832afb85e4eccde6a098da7e61942ddd9f2220406fd9c5efbbf0d774e8", size = 859160, upload-time = "2025-10-21T00:47:22.862Z" }, - { url = "https://files.pythonhosted.org/packages/f1/bd/5231cba2089e8be74d62907bea593b5c92b011890ee98d7a00bf02dd6174/regex-2025.10.22-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:a99dbe41ee88b9a1338ebd39eaf41dc33800265a44db7e2b2558bb416378cd04", size = 849897, upload-time = "2025-10-21T00:47:24.635Z" }, - { url = "https://files.pythonhosted.org/packages/cc/2b/38efccb6763321dfb3ca700d487dc897fc56f6d480c5f5f7bf28dc203820/regex-2025.10.22-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7aad963cffe1967ff78f37550b961146b59c3db1d06e70471e6a35767ffa2ddd", size = 789371, upload-time = "2025-10-21T00:47:26.627Z" }, - { url = "https://files.pythonhosted.org/packages/39/bb/37ca05e146ebf1da46a85aaed11bbece5990b9e889afde8d256139c8fc88/regex-2025.10.22-cp314-cp314-win32.whl", hash = "sha256:8fcea7bf64460d3a8dd7e8626f04cc93149f62367015fecbf72ed8a71e91ee60", size = 271452, upload-time = "2025-10-21T00:47:28.727Z" }, - { url = "https://files.pythonhosted.org/packages/bc/4d/a899b6ec14d7f174f6ed557223644d50b89331f36b2aa324b603f8289a05/regex-2025.10.22-cp314-cp314-win_amd64.whl", hash = "sha256:01a2679bb0286075b0488129b35fc2b1de88538d17f14dc15dd53ecbaaa7548a", size = 280173, upload-time = "2025-10-21T00:47:30.499Z" }, - { url = "https://files.pythonhosted.org/packages/94/9a/21496131abac3d68cc54d4d99bf97ff0385f66c63a1028172f2f6730ddd0/regex-2025.10.22-cp314-cp314-win_arm64.whl", hash = "sha256:6c79ee40c56db2f9090d3ba2cd730488184e522ccd53da6563f45e826fae03d0", size = 273203, upload-time = "2025-10-21T00:47:32.657Z" }, - { url = "https://files.pythonhosted.org/packages/28/40/2e5c9dab10e262f36bc0e1a8f7a9c4318618e9fcf7e7fa1d42f348ed43c9/regex-2025.10.22-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:fe200435c5f40efbfbc0591256f96c31e3709704906edc88817f631571682af6", size = 491858, upload-time = "2025-10-21T00:47:34.424Z" }, - { url = "https://files.pythonhosted.org/packages/40/af/9f4ed3a4ecd3a2bdb58e4190268fdcac934afe32898b9e091fe20f5f97ee/regex-2025.10.22-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:21b6eb4d8a1402aa6a05b98c0a5c353ee68cecfea6eca24542aa992aa2537405", size = 292535, upload-time = "2025-10-21T00:47:36.129Z" }, - { url = "https://files.pythonhosted.org/packages/8f/14/4025dd4cf7bf278d061de8ec8f8bb1105a9235294fb3d8437f0f38f498c7/regex-2025.10.22-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f811bb96131be670a59572caeebf2a94e60cd028f2fc2844e38bdb96f5bbbb14", size = 290907, upload-time = "2025-10-21T00:47:37.963Z" }, - { url = "https://files.pythonhosted.org/packages/a2/7b/a9675643093f800903e1617c3cb651d8684557607ace4af8a023d0fedb28/regex-2025.10.22-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:261a10c8d0dc918fdb3ba83b960f9745de07992696439a2d9b442bf48093b619", size = 807546, upload-time = "2025-10-21T00:47:40.075Z" }, - { url = "https://files.pythonhosted.org/packages/c7/ca/e8d0d9048676efcbd9f946dd03f5bdbd48040cc31d5a36048c7af8cfe076/regex-2025.10.22-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:18d073751341b9a9152d11ae92b468ffe1a1b16caa974a307c1beb117af6a478", size = 873323, upload-time = "2025-10-21T00:47:42.273Z" }, - { url = "https://files.pythonhosted.org/packages/b8/63/39d8352ca76cbb15affe6a48ddef3c6471adebe50cb0c6be626bb69d87a1/regex-2025.10.22-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:36878ced03cfe8e80d22af09fb564e2dddb736bf7c323d4467ff0d52fe6629fd", size = 914854, upload-time = "2025-10-21T00:47:44.379Z" }, - { url = "https://files.pythonhosted.org/packages/ab/fa/47d54acf73907018f92403414014d0728d31dbacaa86d39fdd7ddeffcb08/regex-2025.10.22-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e76167ff542770dd2ffab2b869ef43ebbfc3a683a504e5c259ab64f13e6a17df", size = 812723, upload-time = "2025-10-21T00:47:46.368Z" }, - { url = "https://files.pythonhosted.org/packages/ff/a2/f814b9f762d4713fb55b4f9abc733c368b4f5b6d08dbda58bd72c4062ce4/regex-2025.10.22-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9bf8f164cdd1f1f9c9244eaf5f55573ddabb7bdc89541fcd0b9e931b37a46f87", size = 795438, upload-time = "2025-10-21T00:47:48.355Z" }, - { url = "https://files.pythonhosted.org/packages/89/82/5a78e32780e89eed8b64d8af06e654363131456b7121863072aea509a358/regex-2025.10.22-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:48361da216575aeffdff05fe902b4025f790f492336c33c455846960d151555e", size = 868337, upload-time = "2025-10-21T00:47:50.613Z" }, - { url = "https://files.pythonhosted.org/packages/c1/06/d533134280c1ee9ef40d586ce7f4b0fe598c284d8feef0c1c82e777df4fc/regex-2025.10.22-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:68afe6a9a856f48282df47301452654144e9be74f23cdce9e3d000b7f3050a07", size = 854565, upload-time = "2025-10-21T00:47:52.905Z" }, - { url = "https://files.pythonhosted.org/packages/12/c1/0954b6ae0d5da6a3362148bca5e80ce67281beca1b064fb06d3b05c0f19d/regex-2025.10.22-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:60a0251d6618d19c51799308511d7b6a63265bc425c7217a1b809eca927624a5", size = 799565, upload-time = "2025-10-21T00:47:55.127Z" }, - { url = "https://files.pythonhosted.org/packages/d3/d9/fbef87ba02d3668678b7a71b2d79a2ca092089dc530d83c609d83a82c9f8/regex-2025.10.22-cp314-cp314t-win32.whl", hash = "sha256:20ad0f712ff769003d90b442175779ad8ce7028e2640e10e0878b8a24e6373d1", size = 274427, upload-time = "2025-10-21T00:47:57.097Z" }, - { url = "https://files.pythonhosted.org/packages/db/df/58fd290ae0b5e223f42e25f1b3a1f445ceeee7d56016b615ab0207fd6552/regex-2025.10.22-cp314-cp314t-win_amd64.whl", hash = "sha256:94485cf318cd628f61dede6e1f9ab1956818ee7dcc59fb51d82e589c1c1a8f03", size = 284141, upload-time = "2025-10-21T00:47:59.661Z" }, - { url = "https://files.pythonhosted.org/packages/31/f2/01599f68ca68ded192f04209effb8630be4ff261b51b888000aea6f5a752/regex-2025.10.22-cp314-cp314t-win_arm64.whl", hash = "sha256:76bc9875244f1cf27e2e75dd9c8faf2c6dc8c9ff33afa98cf55e94969bea6fdd", size = 274499, upload-time = "2025-10-21T00:48:01.985Z" }, +version = "2025.10.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/c8/1d2160d36b11fbe0a61acb7c3c81ab032d9ec8ad888ac9e0a61b85ab99dd/regex-2025.10.23.tar.gz", hash = "sha256:8cbaf8ceb88f96ae2356d01b9adf5e6306fa42fa6f7eab6b97794e37c959ac26", size = 401266, upload-time = "2025-10-21T15:58:20.23Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/11/849d5d23633a77047465eaae4cc0cbf24ded7aa496c02e8b9710e28b1687/regex-2025.10.23-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17bbcde374bef1c5fad9b131f0e28a6a24856dd90368d8c0201e2b5a69533daa", size = 487957, upload-time = "2025-10-21T15:54:26.151Z" }, + { url = "https://files.pythonhosted.org/packages/87/12/5985386e7e3200a0d6a6417026d2c758d783a932428a5efc0a42ca1ddf74/regex-2025.10.23-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b4e10434279cc8567f99ca6e018e9025d14f2fded2a603380b6be2090f476426", size = 290419, upload-time = "2025-10-21T15:54:28.804Z" }, + { url = "https://files.pythonhosted.org/packages/67/cf/a8615923f962f8fdc41a3a6093a48726955e8b1993f4614b26a41d249f9b/regex-2025.10.23-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c9bb421cbe7012c744a5a56cf4d6c80829c72edb1a2991677299c988d6339c8", size = 288285, upload-time = "2025-10-21T15:54:30.47Z" }, + { url = "https://files.pythonhosted.org/packages/4e/3d/6a3a1e12c86354cd0b3cbf8c3dd6acbe853609ee3b39d47ecd3ce95caf84/regex-2025.10.23-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275cd1c2ed8c4a78ebfa489618d7aee762e8b4732da73573c3e38236ec5f65de", size = 781458, upload-time = "2025-10-21T15:54:31.978Z" }, + { url = "https://files.pythonhosted.org/packages/46/47/76a8da004489f2700361754859e373b87a53d043de8c47f4d1583fd39d78/regex-2025.10.23-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7b426ae7952f3dc1e73a86056d520bd4e5f021397484a6835902fc5648bcacce", size = 850605, upload-time = "2025-10-21T15:54:33.753Z" }, + { url = "https://files.pythonhosted.org/packages/67/05/fa886461f97d45a6f4b209699cb994dc6d6212d6e219d29444dac5005775/regex-2025.10.23-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c5cdaf5b6d37c7da1967dbe729d819461aab6a98a072feef65bbcff0a6e60649", size = 898563, upload-time = "2025-10-21T15:54:35.431Z" }, + { url = "https://files.pythonhosted.org/packages/2d/db/3ddd8d01455f23cabad7499f4199de0df92f5e96d39633203ff9d0b592dc/regex-2025.10.23-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bfeff0b08f296ab28b4332a7e03ca31c437ee78b541ebc874bbf540e5932f8d", size = 791535, upload-time = "2025-10-21T15:54:37.269Z" }, + { url = "https://files.pythonhosted.org/packages/7c/ae/0fa5cbf41ca92b6ec3370222fcb6c68b240d68ab10e803d086c03a19fd9e/regex-2025.10.23-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f97236a67307b775f30a74ef722b64b38b7ab7ba3bb4a2508518a5de545459c", size = 782461, upload-time = "2025-10-21T15:54:39.187Z" }, + { url = "https://files.pythonhosted.org/packages/d4/23/70af22a016df11af4def27870eb175c2c7235b72d411ecf75a4b4a422cb6/regex-2025.10.23-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:be19e7de499940cd72475fb8e46ab2ecb1cf5906bebdd18a89f9329afb1df82f", size = 774583, upload-time = "2025-10-21T15:54:41.018Z" }, + { url = "https://files.pythonhosted.org/packages/7a/ee/a54a6851f6905f33d3c4ed64e8737b1d85ed01b5724712530ddc0f9abdb1/regex-2025.10.23-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:883df76ee42d9ecb82b37ff8d01caea5895b3f49630a64d21111078bbf8ef64c", size = 845649, upload-time = "2025-10-21T15:54:42.615Z" }, + { url = "https://files.pythonhosted.org/packages/80/7d/c3ec1cae14e01fab00e38c41ed35f47a853359e95e9c023e9a4381bb122c/regex-2025.10.23-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2e9117d1d35fc2addae6281019ecc70dc21c30014b0004f657558b91c6a8f1a7", size = 836037, upload-time = "2025-10-21T15:54:44.63Z" }, + { url = "https://files.pythonhosted.org/packages/15/ae/45771140dd43c4d67c87b54d3728078ed6a96599d9fc7ba6825086236782/regex-2025.10.23-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ff1307f531a5d8cf5c20ea517254551ff0a8dc722193aab66c656c5a900ea68", size = 779705, upload-time = "2025-10-21T15:54:46.08Z" }, + { url = "https://files.pythonhosted.org/packages/b8/95/074e2581760eafce7c816a352b7d3a322536e5b68c346d1a8bacd895545c/regex-2025.10.23-cp310-cp310-win32.whl", hash = "sha256:7888475787cbfee4a7cd32998eeffe9a28129fa44ae0f691b96cb3939183ef41", size = 265663, upload-time = "2025-10-21T15:54:47.854Z" }, + { url = "https://files.pythonhosted.org/packages/f7/c7/a25f56a718847e34d3f1608c72eadeb67653bff1a0411da023dd8f4c647b/regex-2025.10.23-cp310-cp310-win_amd64.whl", hash = "sha256:ec41a905908496ce4906dab20fb103c814558db1d69afc12c2f384549c17936a", size = 277587, upload-time = "2025-10-21T15:54:49.571Z" }, + { url = "https://files.pythonhosted.org/packages/d3/e5/63eb17c6b5deaefd93c2bbb1feae7c0a8d2157da25883a6ca2569cf7a663/regex-2025.10.23-cp310-cp310-win_arm64.whl", hash = "sha256:b2b7f19a764d5e966d5a62bf2c28a8b4093cc864c6734510bdb4aeb840aec5e6", size = 269979, upload-time = "2025-10-21T15:54:51.375Z" }, + { url = "https://files.pythonhosted.org/packages/82/e5/74b7cd5cd76b4171f9793042045bb1726f7856dd56e582fc3e058a7a8a5e/regex-2025.10.23-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6c531155bf9179345e85032052a1e5fe1a696a6abf9cea54b97e8baefff970fd", size = 487960, upload-time = "2025-10-21T15:54:53.253Z" }, + { url = "https://files.pythonhosted.org/packages/b9/08/854fa4b3b20471d1df1c71e831b6a1aa480281e37791e52a2df9641ec5c6/regex-2025.10.23-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:912e9df4e89d383681268d38ad8f5780d7cccd94ba0e9aa09ca7ab7ab4f8e7eb", size = 290425, upload-time = "2025-10-21T15:54:55.21Z" }, + { url = "https://files.pythonhosted.org/packages/ab/d3/6272b1dd3ca1271661e168762b234ad3e00dbdf4ef0c7b9b72d2d159efa7/regex-2025.10.23-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4f375c61bfc3138b13e762fe0ae76e3bdca92497816936534a0177201666f44f", size = 288278, upload-time = "2025-10-21T15:54:56.862Z" }, + { url = "https://files.pythonhosted.org/packages/14/8f/c7b365dd9d9bc0a36e018cb96f2ffb60d2ba8deb589a712b437f67de2920/regex-2025.10.23-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e248cc9446081119128ed002a3801f8031e0c219b5d3c64d3cc627da29ac0a33", size = 793289, upload-time = "2025-10-21T15:54:58.352Z" }, + { url = "https://files.pythonhosted.org/packages/d4/fb/b8fbe9aa16cf0c21f45ec5a6c74b4cecbf1a1c0deb7089d4a6f83a9c1caa/regex-2025.10.23-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b52bf9282fdf401e4f4e721f0f61fc4b159b1307244517789702407dd74e38ca", size = 860321, upload-time = "2025-10-21T15:54:59.813Z" }, + { url = "https://files.pythonhosted.org/packages/b0/81/bf41405c772324926a9bd8a640dedaa42da0e929241834dfce0733070437/regex-2025.10.23-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5c084889ab2c59765a0d5ac602fd1c3c244f9b3fcc9a65fdc7ba6b74c5287490", size = 907011, upload-time = "2025-10-21T15:55:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/a4/fb/5ad6a8b92d3f88f3797b51bb4ef47499acc2d0b53d2fbe4487a892f37a73/regex-2025.10.23-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80e8eb79009bdb0936658c44ca06e2fbbca67792013e3818eea3f5f228971c2", size = 800312, upload-time = "2025-10-21T15:55:04.15Z" }, + { url = "https://files.pythonhosted.org/packages/42/48/b4efba0168a2b57f944205d823f8e8a3a1ae6211a34508f014ec2c712f4f/regex-2025.10.23-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6f259118ba87b814a8ec475380aee5f5ae97a75852a3507cf31d055b01b5b40", size = 782839, upload-time = "2025-10-21T15:55:05.641Z" }, + { url = "https://files.pythonhosted.org/packages/13/2a/c9efb4c6c535b0559c1fa8e431e0574d229707c9ca718600366fcfef6801/regex-2025.10.23-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9b8c72a242683dcc72d37595c4f1278dfd7642b769e46700a8df11eab19dfd82", size = 854270, upload-time = "2025-10-21T15:55:07.27Z" }, + { url = "https://files.pythonhosted.org/packages/34/2d/68eecc1bdaee020e8ba549502291c9450d90d8590d0552247c9b543ebf7b/regex-2025.10.23-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a8d7b7a0a3df9952f9965342159e0c1f05384c0f056a47ce8b61034f8cecbe83", size = 845771, upload-time = "2025-10-21T15:55:09.477Z" }, + { url = "https://files.pythonhosted.org/packages/a5/cd/a1ae499cf9b87afb47a67316bbf1037a7c681ffe447c510ed98c0aa2c01c/regex-2025.10.23-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:413bfea20a484c524858125e92b9ce6ffdd0a4b97d4ff96b5859aa119b0f1bdd", size = 788778, upload-time = "2025-10-21T15:55:11.396Z" }, + { url = "https://files.pythonhosted.org/packages/38/f9/70765e63f5ea7d43b2b6cd4ee9d3323f16267e530fb2a420d92d991cf0fc/regex-2025.10.23-cp311-cp311-win32.whl", hash = "sha256:f76deef1f1019a17dad98f408b8f7afc4bd007cbe835ae77b737e8c7f19ae575", size = 265666, upload-time = "2025-10-21T15:55:13.306Z" }, + { url = "https://files.pythonhosted.org/packages/9c/1a/18e9476ee1b63aaec3844d8e1cb21842dc19272c7e86d879bfc0dcc60db3/regex-2025.10.23-cp311-cp311-win_amd64.whl", hash = "sha256:59bba9f7125536f23fdab5deeea08da0c287a64c1d3acc1c7e99515809824de8", size = 277600, upload-time = "2025-10-21T15:55:15.087Z" }, + { url = "https://files.pythonhosted.org/packages/1d/1b/c019167b1f7a8ec77251457e3ff0339ed74ca8bce1ea13138dc98309c923/regex-2025.10.23-cp311-cp311-win_arm64.whl", hash = "sha256:b103a752b6f1632ca420225718d6ed83f6a6ced3016dd0a4ab9a6825312de566", size = 269974, upload-time = "2025-10-21T15:55:16.841Z" }, + { url = "https://files.pythonhosted.org/packages/f6/57/eeb274d83ab189d02d778851b1ac478477522a92b52edfa6e2ae9ff84679/regex-2025.10.23-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7a44d9c00f7a0a02d3b777429281376370f3d13d2c75ae74eb94e11ebcf4a7fc", size = 489187, upload-time = "2025-10-21T15:55:18.322Z" }, + { url = "https://files.pythonhosted.org/packages/55/5c/7dad43a9b6ea88bf77e0b8b7729a4c36978e1043165034212fd2702880c6/regex-2025.10.23-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b83601f84fde939ae3478bb32a3aef36f61b58c3208d825c7e8ce1a735f143f2", size = 291122, upload-time = "2025-10-21T15:55:20.2Z" }, + { url = "https://files.pythonhosted.org/packages/66/21/38b71e6f2818f0f4b281c8fba8d9d57cfca7b032a648fa59696e0a54376a/regex-2025.10.23-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec13647907bb9d15fd192bbfe89ff06612e098a5709e7d6ecabbdd8f7908fc45", size = 288797, upload-time = "2025-10-21T15:55:21.932Z" }, + { url = "https://files.pythonhosted.org/packages/be/95/888f069c89e7729732a6d7cca37f76b44bfb53a1e35dda8a2c7b65c1b992/regex-2025.10.23-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78d76dd2957d62501084e7012ddafc5fcd406dd982b7a9ca1ea76e8eaaf73e7e", size = 798442, upload-time = "2025-10-21T15:55:23.747Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/4f903c608faf786627a8ee17c06e0067b5acade473678b69c8094b248705/regex-2025.10.23-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8668e5f067e31a47699ebb354f43aeb9c0ef136f915bd864243098524482ac43", size = 864039, upload-time = "2025-10-21T15:55:25.656Z" }, + { url = "https://files.pythonhosted.org/packages/62/19/2df67b526bf25756c7f447dde554fc10a220fd839cc642f50857d01e4a7b/regex-2025.10.23-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a32433fe3deb4b2d8eda88790d2808fed0dc097e84f5e683b4cd4f42edef6cca", size = 912057, upload-time = "2025-10-21T15:55:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/99/14/9a39b7c9e007968411bc3c843cc14cf15437510c0a9991f080cab654fd16/regex-2025.10.23-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d97d73818c642c938db14c0668167f8d39520ca9d983604575ade3fda193afcc", size = 803374, upload-time = "2025-10-21T15:55:28.9Z" }, + { url = "https://files.pythonhosted.org/packages/d4/f7/3495151dd3ca79949599b6d069b72a61a2c5e24fc441dccc79dcaf708fe6/regex-2025.10.23-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bca7feecc72ee33579e9f6ddf8babbe473045717a0e7dbc347099530f96e8b9a", size = 787714, upload-time = "2025-10-21T15:55:30.628Z" }, + { url = "https://files.pythonhosted.org/packages/28/65/ee882455e051131869957ee8597faea45188c9a98c0dad724cfb302d4580/regex-2025.10.23-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7e24af51e907d7457cc4a72691ec458320b9ae67dc492f63209f01eecb09de32", size = 858392, upload-time = "2025-10-21T15:55:32.322Z" }, + { url = "https://files.pythonhosted.org/packages/53/25/9287fef5be97529ebd3ac79d256159cb709a07eb58d4be780d1ca3885da8/regex-2025.10.23-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d10bcde58bbdf18146f3a69ec46dd03233b94a4a5632af97aa5378da3a47d288", size = 850484, upload-time = "2025-10-21T15:55:34.037Z" }, + { url = "https://files.pythonhosted.org/packages/f3/b4/b49b88b4fea2f14dc73e5b5842755e782fc2e52f74423d6f4adc130d5880/regex-2025.10.23-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:44383bc0c933388516c2692c9a7503e1f4a67e982f20b9a29d2fb70c6494f147", size = 789634, upload-time = "2025-10-21T15:55:35.958Z" }, + { url = "https://files.pythonhosted.org/packages/b6/3c/2f8d199d0e84e78bcd6bdc2be9b62410624f6b796e2893d1837ae738b160/regex-2025.10.23-cp312-cp312-win32.whl", hash = "sha256:6040a86f95438a0114bba16e51dfe27f1bc004fd29fe725f54a586f6d522b079", size = 266060, upload-time = "2025-10-21T15:55:37.902Z" }, + { url = "https://files.pythonhosted.org/packages/d7/67/c35e80969f6ded306ad70b0698863310bdf36aca57ad792f45ddc0e2271f/regex-2025.10.23-cp312-cp312-win_amd64.whl", hash = "sha256:436b4c4352fe0762e3bfa34a5567079baa2ef22aa9c37cf4d128979ccfcad842", size = 276931, upload-time = "2025-10-21T15:55:39.502Z" }, + { url = "https://files.pythonhosted.org/packages/f5/a1/4ed147de7d2b60174f758412c87fa51ada15cd3296a0ff047f4280aaa7ca/regex-2025.10.23-cp312-cp312-win_arm64.whl", hash = "sha256:f4b1b1991617055b46aff6f6db24888c1f05f4db9801349d23f09ed0714a9335", size = 270103, upload-time = "2025-10-21T15:55:41.24Z" }, + { url = "https://files.pythonhosted.org/packages/28/c6/195a6217a43719d5a6a12cc192a22d12c40290cecfa577f00f4fb822f07d/regex-2025.10.23-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b7690f95404a1293923a296981fd943cca12c31a41af9c21ba3edd06398fc193", size = 488956, upload-time = "2025-10-21T15:55:42.887Z" }, + { url = "https://files.pythonhosted.org/packages/4c/93/181070cd1aa2fa541ff2d3afcf763ceecd4937b34c615fa92765020a6c90/regex-2025.10.23-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1a32d77aeaea58a13230100dd8797ac1a84c457f3af2fdf0d81ea689d5a9105b", size = 290997, upload-time = "2025-10-21T15:55:44.53Z" }, + { url = "https://files.pythonhosted.org/packages/b6/c5/9d37fbe3a40ed8dda78c23e1263002497540c0d1522ed75482ef6c2000f0/regex-2025.10.23-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b24b29402f264f70a3c81f45974323b41764ff7159655360543b7cabb73e7d2f", size = 288686, upload-time = "2025-10-21T15:55:46.186Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e7/db610ff9f10c2921f9b6ac0c8d8be4681b28ddd40fc0549429366967e61f/regex-2025.10.23-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:563824a08c7c03d96856d84b46fdb3bbb7cfbdf79da7ef68725cda2ce169c72a", size = 798466, upload-time = "2025-10-21T15:55:48.24Z" }, + { url = "https://files.pythonhosted.org/packages/90/10/aab883e1fa7fe2feb15ac663026e70ca0ae1411efa0c7a4a0342d9545015/regex-2025.10.23-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0ec8bdd88d2e2659c3518087ee34b37e20bd169419ffead4240a7004e8ed03b", size = 863996, upload-time = "2025-10-21T15:55:50.478Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b0/8f686dd97a51f3b37d0238cd00a6d0f9ccabe701f05b56de1918571d0d61/regex-2025.10.23-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b577601bfe1d33913fcd9276d7607bbac827c4798d9e14d04bf37d417a6c41cb", size = 912145, upload-time = "2025-10-21T15:55:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/a3/ca/639f8cd5b08797bca38fc5e7e07f76641a428cf8c7fca05894caf045aa32/regex-2025.10.23-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c9f2c68ac6cb3de94eea08a437a75eaa2bd33f9e97c84836ca0b610a5804368", size = 803370, upload-time = "2025-10-21T15:55:53.944Z" }, + { url = "https://files.pythonhosted.org/packages/0d/1e/a40725bb76959eddf8abc42a967bed6f4851b39f5ac4f20e9794d7832aa5/regex-2025.10.23-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89f8b9ea3830c79468e26b0e21c3585f69f105157c2154a36f6b7839f8afb351", size = 787767, upload-time = "2025-10-21T15:55:56.004Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d8/8ee9858062936b0f99656dce390aa667c6e7fb0c357b1b9bf76fb5e2e708/regex-2025.10.23-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:98fd84c4e4ea185b3bb5bf065261ab45867d8875032f358a435647285c722673", size = 858335, upload-time = "2025-10-21T15:55:58.185Z" }, + { url = "https://files.pythonhosted.org/packages/d8/0a/ed5faaa63fa8e3064ab670e08061fbf09e3a10235b19630cf0cbb9e48c0a/regex-2025.10.23-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:1e11d3e5887b8b096f96b4154dfb902f29c723a9556639586cd140e77e28b313", size = 850402, upload-time = "2025-10-21T15:56:00.023Z" }, + { url = "https://files.pythonhosted.org/packages/79/14/d05f617342f4b2b4a23561da500ca2beab062bfcc408d60680e77ecaf04d/regex-2025.10.23-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f13450328a6634348d47a88367e06b64c9d84980ef6a748f717b13f8ce64e87", size = 789739, upload-time = "2025-10-21T15:56:01.967Z" }, + { url = "https://files.pythonhosted.org/packages/f9/7b/e8ce8eef42a15f2c3461f8b3e6e924bbc86e9605cb534a393aadc8d3aff8/regex-2025.10.23-cp313-cp313-win32.whl", hash = "sha256:37be9296598a30c6a20236248cb8b2c07ffd54d095b75d3a2a2ee5babdc51df1", size = 266054, upload-time = "2025-10-21T15:56:05.291Z" }, + { url = "https://files.pythonhosted.org/packages/71/2d/55184ed6be6473187868d2f2e6a0708195fc58270e62a22cbf26028f2570/regex-2025.10.23-cp313-cp313-win_amd64.whl", hash = "sha256:ea7a3c283ce0f06fe789365841e9174ba05f8db16e2fd6ae00a02df9572c04c0", size = 276917, upload-time = "2025-10-21T15:56:07.303Z" }, + { url = "https://files.pythonhosted.org/packages/9c/d4/927eced0e2bd45c45839e556f987f8c8f8683268dd3c00ad327deb3b0172/regex-2025.10.23-cp313-cp313-win_arm64.whl", hash = "sha256:d9a4953575f300a7bab71afa4cd4ac061c7697c89590a2902b536783eeb49a4f", size = 270105, upload-time = "2025-10-21T15:56:09.857Z" }, + { url = "https://files.pythonhosted.org/packages/3e/b3/95b310605285573341fc062d1d30b19a54f857530e86c805f942c4ff7941/regex-2025.10.23-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:7d6606524fa77b3912c9ef52a42ef63c6cfbfc1077e9dc6296cd5da0da286044", size = 491850, upload-time = "2025-10-21T15:56:11.685Z" }, + { url = "https://files.pythonhosted.org/packages/a4/8f/207c2cec01e34e56db1eff606eef46644a60cf1739ecd474627db90ad90b/regex-2025.10.23-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:c037aadf4d64bdc38af7db3dbd34877a057ce6524eefcb2914d6d41c56f968cc", size = 292537, upload-time = "2025-10-21T15:56:13.963Z" }, + { url = "https://files.pythonhosted.org/packages/98/3b/025240af4ada1dc0b5f10d73f3e5122d04ce7f8908ab8881e5d82b9d61b6/regex-2025.10.23-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:99018c331fb2529084a0c9b4c713dfa49fafb47c7712422e49467c13a636c656", size = 290904, upload-time = "2025-10-21T15:56:16.016Z" }, + { url = "https://files.pythonhosted.org/packages/81/8e/104ac14e2d3450c43db18ec03e1b96b445a94ae510b60138f00ce2cb7ca1/regex-2025.10.23-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fd8aba965604d70306eb90a35528f776e59112a7114a5162824d43b76fa27f58", size = 807311, upload-time = "2025-10-21T15:56:17.818Z" }, + { url = "https://files.pythonhosted.org/packages/19/63/78aef90141b7ce0be8a18e1782f764f6997ad09de0e05251f0d2503a914a/regex-2025.10.23-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:238e67264b4013e74136c49f883734f68656adf8257bfa13b515626b31b20f8e", size = 873241, upload-time = "2025-10-21T15:56:19.941Z" }, + { url = "https://files.pythonhosted.org/packages/b3/a8/80eb1201bb49ae4dba68a1b284b4211ed9daa8e74dc600018a10a90399fb/regex-2025.10.23-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b2eb48bd9848d66fd04826382f5e8491ae633de3233a3d64d58ceb4ecfa2113a", size = 914794, upload-time = "2025-10-21T15:56:22.488Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d5/1984b6ee93281f360a119a5ca1af6a8ca7d8417861671388bf750becc29b/regex-2025.10.23-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d36591ce06d047d0c0fe2fc5f14bfbd5b4525d08a7b6a279379085e13f0e3d0e", size = 812581, upload-time = "2025-10-21T15:56:24.319Z" }, + { url = "https://files.pythonhosted.org/packages/c4/39/11ebdc6d9927172a64ae237d16763145db6bd45ebb4055c17b88edab72a7/regex-2025.10.23-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b5d4ece8628d6e364302006366cea3ee887db397faebacc5dacf8ef19e064cf8", size = 795346, upload-time = "2025-10-21T15:56:26.232Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b4/89a591bcc08b5e436af43315284bd233ba77daf0cf20e098d7af12f006c1/regex-2025.10.23-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:39a7e8083959cb1c4ff74e483eecb5a65d3b3e1d821b256e54baf61782c906c6", size = 868214, upload-time = "2025-10-21T15:56:28.597Z" }, + { url = "https://files.pythonhosted.org/packages/3d/ff/58ba98409c1dbc8316cdb20dafbc63ed267380a07780cafecaf5012dabc9/regex-2025.10.23-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:842d449a8fefe546f311656cf8c0d6729b08c09a185f1cad94c756210286d6a8", size = 854540, upload-time = "2025-10-21T15:56:30.875Z" }, + { url = "https://files.pythonhosted.org/packages/9a/f2/4a9e9338d67626e2071b643f828a482712ad15889d7268e11e9a63d6f7e9/regex-2025.10.23-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d614986dc68506be8f00474f4f6960e03e4ca9883f7df47744800e7d7c08a494", size = 799346, upload-time = "2025-10-21T15:56:32.725Z" }, + { url = "https://files.pythonhosted.org/packages/63/be/543d35c46bebf6f7bf2be538cca74d6585f25714700c36f37f01b92df551/regex-2025.10.23-cp313-cp313t-win32.whl", hash = "sha256:a5b7a26b51a9df473ec16a1934d117443a775ceb7b39b78670b2e21893c330c9", size = 268657, upload-time = "2025-10-21T15:56:34.577Z" }, + { url = "https://files.pythonhosted.org/packages/14/9f/4dd6b7b612037158bb2c9bcaa710e6fb3c40ad54af441b9c53b3a137a9f1/regex-2025.10.23-cp313-cp313t-win_amd64.whl", hash = "sha256:ce81c5544a5453f61cb6f548ed358cfb111e3b23f3cd42d250a4077a6be2a7b6", size = 280075, upload-time = "2025-10-21T15:56:36.767Z" }, + { url = "https://files.pythonhosted.org/packages/81/7a/5bd0672aa65d38c8da6747c17c8b441bdb53d816c569e3261013af8e83cf/regex-2025.10.23-cp313-cp313t-win_arm64.whl", hash = "sha256:e9bf7f6699f490e4e43c44757aa179dab24d1960999c84ab5c3d5377714ed473", size = 271219, upload-time = "2025-10-21T15:56:39.033Z" }, + { url = "https://files.pythonhosted.org/packages/73/f6/0caf29fec943f201fbc8822879c99d31e59c1d51a983d9843ee5cf398539/regex-2025.10.23-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5b5cb5b6344c4c4c24b2dc87b0bfee78202b07ef7633385df70da7fcf6f7cec6", size = 488960, upload-time = "2025-10-21T15:56:40.849Z" }, + { url = "https://files.pythonhosted.org/packages/8e/7d/ebb7085b8fa31c24ce0355107cea2b92229d9050552a01c5d291c42aecea/regex-2025.10.23-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a6ce7973384c37bdf0f371a843f95a6e6f4e1489e10e0cf57330198df72959c5", size = 290932, upload-time = "2025-10-21T15:56:42.875Z" }, + { url = "https://files.pythonhosted.org/packages/27/41/43906867287cbb5ca4cee671c3cc8081e15deef86a8189c3aad9ac9f6b4d/regex-2025.10.23-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2ee3663f2c334959016b56e3bd0dd187cbc73f948e3a3af14c3caaa0c3035d10", size = 288766, upload-time = "2025-10-21T15:56:44.894Z" }, + { url = "https://files.pythonhosted.org/packages/ab/9e/ea66132776700fc77a39b1056e7a5f1308032fead94507e208dc6716b7cd/regex-2025.10.23-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2003cc82a579107e70d013482acce8ba773293f2db534fb532738395c557ff34", size = 798884, upload-time = "2025-10-21T15:56:47.178Z" }, + { url = "https://files.pythonhosted.org/packages/d5/99/aed1453687ab63819a443930770db972c5c8064421f0d9f5da9ad029f26b/regex-2025.10.23-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:182c452279365a93a9f45874f7f191ec1c51e1f1eb41bf2b16563f1a40c1da3a", size = 864768, upload-time = "2025-10-21T15:56:49.793Z" }, + { url = "https://files.pythonhosted.org/packages/99/5d/732fe747a1304805eb3853ce6337eea16b169f7105a0d0dd9c6a5ffa9948/regex-2025.10.23-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b1249e9ff581c5b658c8f0437f883b01f1edcf424a16388591e7c05e5e9e8b0c", size = 911394, upload-time = "2025-10-21T15:56:52.186Z" }, + { url = "https://files.pythonhosted.org/packages/5e/48/58a1f6623466522352a6efa153b9a3714fc559d9f930e9bc947b4a88a2c3/regex-2025.10.23-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b841698f93db3ccc36caa1900d2a3be281d9539b822dc012f08fc80b46a3224", size = 803145, upload-time = "2025-10-21T15:56:55.142Z" }, + { url = "https://files.pythonhosted.org/packages/ea/f6/7dea79be2681a5574ab3fc237aa53b2c1dfd6bd2b44d4640b6c76f33f4c1/regex-2025.10.23-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:956d89e0c92d471e8f7eee73f73fdff5ed345886378c45a43175a77538a1ffe4", size = 787831, upload-time = "2025-10-21T15:56:57.203Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ad/07b76950fbbe65f88120ca2d8d845047c401450f607c99ed38862904671d/regex-2025.10.23-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5c259cb363299a0d90d63b5c0d7568ee98419861618a95ee9d91a41cb9954462", size = 859162, upload-time = "2025-10-21T15:56:59.195Z" }, + { url = "https://files.pythonhosted.org/packages/41/87/374f3b2021b22aa6a4fc0b750d63f9721e53d1631a238f7a1c343c1cd288/regex-2025.10.23-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:185d2b18c062820b3a40d8fefa223a83f10b20a674bf6e8c4a432e8dfd844627", size = 849899, upload-time = "2025-10-21T15:57:01.747Z" }, + { url = "https://files.pythonhosted.org/packages/12/4a/7f7bb17c5a5a9747249807210e348450dab9212a46ae6d23ebce86ba6a2b/regex-2025.10.23-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:281d87fa790049c2b7c1b4253121edd80b392b19b5a3d28dc2a77579cb2a58ec", size = 789372, upload-time = "2025-10-21T15:57:04.018Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/9c7728ff544fea09bbc8635e4c9e7c423b11c24f1a7a14e6ac4831466709/regex-2025.10.23-cp314-cp314-win32.whl", hash = "sha256:63b81eef3656072e4ca87c58084c7a9c2b81d41a300b157be635a8a675aacfb8", size = 271451, upload-time = "2025-10-21T15:57:06.266Z" }, + { url = "https://files.pythonhosted.org/packages/48/f8/ef7837ff858eb74079c4804c10b0403c0b740762e6eedba41062225f7117/regex-2025.10.23-cp314-cp314-win_amd64.whl", hash = "sha256:0967c5b86f274800a34a4ed862dfab56928144d03cb18821c5153f8777947796", size = 280173, upload-time = "2025-10-21T15:57:08.206Z" }, + { url = "https://files.pythonhosted.org/packages/8e/d0/d576e1dbd9885bfcd83d0e90762beea48d9373a6f7ed39170f44ed22e336/regex-2025.10.23-cp314-cp314-win_arm64.whl", hash = "sha256:c70dfe58b0a00b36aa04cdb0f798bf3e0adc31747641f69e191109fd8572c9a9", size = 273206, upload-time = "2025-10-21T15:57:10.367Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d0/2025268315e8b2b7b660039824cb7765a41623e97d4cd421510925400487/regex-2025.10.23-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1f5799ea1787aa6de6c150377d11afad39a38afd033f0c5247aecb997978c422", size = 491854, upload-time = "2025-10-21T15:57:12.526Z" }, + { url = "https://files.pythonhosted.org/packages/44/35/5681c2fec5e8b33454390af209c4353dfc44606bf06d714b0b8bd0454ffe/regex-2025.10.23-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:a9639ab7540cfea45ef57d16dcbea2e22de351998d614c3ad2f9778fa3bdd788", size = 292542, upload-time = "2025-10-21T15:57:15.158Z" }, + { url = "https://files.pythonhosted.org/packages/5d/17/184eed05543b724132e4a18149e900f5189001fcfe2d64edaae4fbaf36b4/regex-2025.10.23-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:08f52122c352eb44c3421dab78b9b73a8a77a282cc8314ae576fcaa92b780d10", size = 290903, upload-time = "2025-10-21T15:57:17.108Z" }, + { url = "https://files.pythonhosted.org/packages/25/d0/5e3347aa0db0de382dddfa133a7b0ae72f24b4344f3989398980b44a3924/regex-2025.10.23-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ebf1baebef1c4088ad5a5623decec6b52950f0e4d7a0ae4d48f0a99f8c9cb7d7", size = 807546, upload-time = "2025-10-21T15:57:19.179Z" }, + { url = "https://files.pythonhosted.org/packages/d2/bb/40c589bbdce1be0c55e9f8159789d58d47a22014f2f820cf2b517a5cd193/regex-2025.10.23-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:16b0f1c2e2d566c562d5c384c2b492646be0a19798532fdc1fdedacc66e3223f", size = 873322, upload-time = "2025-10-21T15:57:21.36Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a7e40c01575ac93360e606278d359f91829781a9f7fb6e5aa435039edbda/regex-2025.10.23-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7ada5d9dceafaab92646aa00c10a9efd9b09942dd9b0d7c5a4b73db92cc7e61", size = 914855, upload-time = "2025-10-21T15:57:24.044Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4b/d55587b192763db3163c3f508b3b67b31bb6f5e7a0e08b83013d0a59500a/regex-2025.10.23-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a36b4005770044bf08edecc798f0e41a75795b9e7c9c12fe29da8d792ef870c", size = 812724, upload-time = "2025-10-21T15:57:26.123Z" }, + { url = "https://files.pythonhosted.org/packages/33/20/18bac334955fbe99d17229f4f8e98d05e4a501ac03a442be8facbb37c304/regex-2025.10.23-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:af7b2661dcc032da1fae82069b5ebf2ac1dfcd5359ef8b35e1367bfc92181432", size = 795439, upload-time = "2025-10-21T15:57:28.497Z" }, + { url = "https://files.pythonhosted.org/packages/67/46/c57266be9df8549c7d85deb4cb82280cb0019e46fff677534c5fa1badfa4/regex-2025.10.23-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:1cb976810ac1416a67562c2e5ba0accf6f928932320fef302e08100ed681b38e", size = 868336, upload-time = "2025-10-21T15:57:30.867Z" }, + { url = "https://files.pythonhosted.org/packages/b8/f3/bd5879e41ef8187fec5e678e94b526a93f99e7bbe0437b0f2b47f9101694/regex-2025.10.23-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:1a56a54be3897d62f54290190fbcd754bff6932934529fbf5b29933da28fcd43", size = 854567, upload-time = "2025-10-21T15:57:33.062Z" }, + { url = "https://files.pythonhosted.org/packages/e6/57/2b6bbdbd2f24dfed5b028033aa17ad8f7d86bb28f1a892cac8b3bc89d059/regex-2025.10.23-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8f3e6d202fb52c2153f532043bbcf618fd177df47b0b306741eb9b60ba96edc3", size = 799565, upload-time = "2025-10-21T15:57:35.153Z" }, + { url = "https://files.pythonhosted.org/packages/c7/ba/a6168f542ba73b151ed81237adf6b869c7b2f7f8d51618111296674e20ee/regex-2025.10.23-cp314-cp314t-win32.whl", hash = "sha256:1fa1186966b2621b1769fd467c7b22e317e6ba2d2cdcecc42ea3089ef04a8521", size = 274428, upload-time = "2025-10-21T15:57:37.996Z" }, + { url = "https://files.pythonhosted.org/packages/ef/a0/c84475e14a2829e9b0864ebf77c3f7da909df9d8acfe2bb540ff0072047c/regex-2025.10.23-cp314-cp314t-win_amd64.whl", hash = "sha256:08a15d40ce28362eac3e78e83d75475147869c1ff86bc93285f43b4f4431a741", size = 284140, upload-time = "2025-10-21T15:57:40.027Z" }, + { url = "https://files.pythonhosted.org/packages/51/33/6a08ade0eee5b8ba79386869fa6f77afeb835b60510f3525db987e2fffc4/regex-2025.10.23-cp314-cp314t-win_arm64.whl", hash = "sha256:a93e97338e1c8ea2649e130dcfbe8cd69bba5e1e163834752ab64dcb4de6d5ed", size = 274497, upload-time = "2025-10-21T15:57:42.389Z" }, ] [[package]] @@ -4835,137 +4835,124 @@ wheels = [ [[package]] name = "rpds-py" -version = "0.27.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e9/dd/2c0cbe774744272b0ae725f44032c77bdcab6e8bcf544bffa3b6e70c8dba/rpds_py-0.27.1.tar.gz", hash = "sha256:26a1c73171d10b7acccbded82bf6a586ab8203601e565badc74bbbf8bc5a10f8", size = 27479, upload-time = "2025-08-27T12:16:36.024Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/ed/3aef893e2dd30e77e35d20d4ddb45ca459db59cead748cad9796ad479411/rpds_py-0.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:68afeec26d42ab3b47e541b272166a0b4400313946871cba3ed3a4fc0cab1cef", size = 371606, upload-time = "2025-08-27T12:12:25.189Z" }, - { url = "https://files.pythonhosted.org/packages/6d/82/9818b443e5d3eb4c83c3994561387f116aae9833b35c484474769c4a8faf/rpds_py-0.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74e5b2f7bb6fa38b1b10546d27acbacf2a022a8b5543efb06cfebc72a59c85be", size = 353452, upload-time = "2025-08-27T12:12:27.433Z" }, - { url = "https://files.pythonhosted.org/packages/99/c7/d2a110ffaaa397fc6793a83c7bd3545d9ab22658b7cdff05a24a4535cc45/rpds_py-0.27.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9024de74731df54546fab0bfbcdb49fae19159ecaecfc8f37c18d2c7e2c0bd61", size = 381519, upload-time = "2025-08-27T12:12:28.719Z" }, - { url = "https://files.pythonhosted.org/packages/5a/bc/e89581d1f9d1be7d0247eaef602566869fdc0d084008ba139e27e775366c/rpds_py-0.27.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:31d3ebadefcd73b73928ed0b2fd696f7fefda8629229f81929ac9c1854d0cffb", size = 394424, upload-time = "2025-08-27T12:12:30.207Z" }, - { url = "https://files.pythonhosted.org/packages/ac/2e/36a6861f797530e74bb6ed53495f8741f1ef95939eed01d761e73d559067/rpds_py-0.27.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2e7f8f169d775dd9092a1743768d771f1d1300453ddfe6325ae3ab5332b4657", size = 523467, upload-time = "2025-08-27T12:12:31.808Z" }, - { url = "https://files.pythonhosted.org/packages/c4/59/c1bc2be32564fa499f988f0a5c6505c2f4746ef96e58e4d7de5cf923d77e/rpds_py-0.27.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d905d16f77eb6ab2e324e09bfa277b4c8e5e6b8a78a3e7ff8f3cdf773b4c013", size = 402660, upload-time = "2025-08-27T12:12:33.444Z" }, - { url = "https://files.pythonhosted.org/packages/0a/ec/ef8bf895f0628dd0a59e54d81caed6891663cb9c54a0f4bb7da918cb88cf/rpds_py-0.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50c946f048209e6362e22576baea09193809f87687a95a8db24e5fbdb307b93a", size = 384062, upload-time = "2025-08-27T12:12:34.857Z" }, - { url = "https://files.pythonhosted.org/packages/69/f7/f47ff154be8d9a5e691c083a920bba89cef88d5247c241c10b9898f595a1/rpds_py-0.27.1-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:3deab27804d65cd8289eb814c2c0e807c4b9d9916c9225e363cb0cf875eb67c1", size = 401289, upload-time = "2025-08-27T12:12:36.085Z" }, - { url = "https://files.pythonhosted.org/packages/3b/d9/ca410363efd0615814ae579f6829cafb39225cd63e5ea5ed1404cb345293/rpds_py-0.27.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8b61097f7488de4be8244c89915da8ed212832ccf1e7c7753a25a394bf9b1f10", size = 417718, upload-time = "2025-08-27T12:12:37.401Z" }, - { url = "https://files.pythonhosted.org/packages/e3/a0/8cb5c2ff38340f221cc067cc093d1270e10658ba4e8d263df923daa18e86/rpds_py-0.27.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a3f29aba6e2d7d90528d3c792555a93497fe6538aa65eb675b44505be747808", size = 558333, upload-time = "2025-08-27T12:12:38.672Z" }, - { url = "https://files.pythonhosted.org/packages/6f/8c/1b0de79177c5d5103843774ce12b84caa7164dfc6cd66378768d37db11bf/rpds_py-0.27.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd6cd0485b7d347304067153a6dc1d73f7d4fd995a396ef32a24d24b8ac63ac8", size = 589127, upload-time = "2025-08-27T12:12:41.48Z" }, - { url = "https://files.pythonhosted.org/packages/c8/5e/26abb098d5e01266b0f3a2488d299d19ccc26849735d9d2b95c39397e945/rpds_py-0.27.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6f4461bf931108c9fa226ffb0e257c1b18dc2d44cd72b125bec50ee0ab1248a9", size = 554899, upload-time = "2025-08-27T12:12:42.925Z" }, - { url = "https://files.pythonhosted.org/packages/de/41/905cc90ced13550db017f8f20c6d8e8470066c5738ba480d7ba63e3d136b/rpds_py-0.27.1-cp310-cp310-win32.whl", hash = "sha256:ee5422d7fb21f6a00c1901bf6559c49fee13a5159d0288320737bbf6585bd3e4", size = 217450, upload-time = "2025-08-27T12:12:44.813Z" }, - { url = "https://files.pythonhosted.org/packages/75/3d/6bef47b0e253616ccdf67c283e25f2d16e18ccddd38f92af81d5a3420206/rpds_py-0.27.1-cp310-cp310-win_amd64.whl", hash = "sha256:3e039aabf6d5f83c745d5f9a0a381d031e9ed871967c0a5c38d201aca41f3ba1", size = 228447, upload-time = "2025-08-27T12:12:46.204Z" }, - { url = "https://files.pythonhosted.org/packages/b5/c1/7907329fbef97cbd49db6f7303893bd1dd5a4a3eae415839ffdfb0762cae/rpds_py-0.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:be898f271f851f68b318872ce6ebebbc62f303b654e43bf72683dbdc25b7c881", size = 371063, upload-time = "2025-08-27T12:12:47.856Z" }, - { url = "https://files.pythonhosted.org/packages/11/94/2aab4bc86228bcf7c48760990273653a4900de89c7537ffe1b0d6097ed39/rpds_py-0.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:62ac3d4e3e07b58ee0ddecd71d6ce3b1637de2d373501412df395a0ec5f9beb5", size = 353210, upload-time = "2025-08-27T12:12:49.187Z" }, - { url = "https://files.pythonhosted.org/packages/3a/57/f5eb3ecf434342f4f1a46009530e93fd201a0b5b83379034ebdb1d7c1a58/rpds_py-0.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4708c5c0ceb2d034f9991623631d3d23cb16e65c83736ea020cdbe28d57c0a0e", size = 381636, upload-time = "2025-08-27T12:12:50.492Z" }, - { url = "https://files.pythonhosted.org/packages/ae/f4/ef95c5945e2ceb5119571b184dd5a1cc4b8541bbdf67461998cfeac9cb1e/rpds_py-0.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abfa1171a9952d2e0002aba2ad3780820b00cc3d9c98c6630f2e93271501f66c", size = 394341, upload-time = "2025-08-27T12:12:52.024Z" }, - { url = "https://files.pythonhosted.org/packages/5a/7e/4bd610754bf492d398b61725eb9598ddd5eb86b07d7d9483dbcd810e20bc/rpds_py-0.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b507d19f817ebaca79574b16eb2ae412e5c0835542c93fe9983f1e432aca195", size = 523428, upload-time = "2025-08-27T12:12:53.779Z" }, - { url = "https://files.pythonhosted.org/packages/9f/e5/059b9f65a8c9149361a8b75094864ab83b94718344db511fd6117936ed2a/rpds_py-0.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168b025f8fd8d8d10957405f3fdcef3dc20f5982d398f90851f4abc58c566c52", size = 402923, upload-time = "2025-08-27T12:12:55.15Z" }, - { url = "https://files.pythonhosted.org/packages/f5/48/64cabb7daced2968dd08e8a1b7988bf358d7bd5bcd5dc89a652f4668543c/rpds_py-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb56c6210ef77caa58e16e8c17d35c63fe3f5b60fd9ba9d424470c3400bcf9ed", size = 384094, upload-time = "2025-08-27T12:12:57.194Z" }, - { url = "https://files.pythonhosted.org/packages/ae/e1/dc9094d6ff566bff87add8a510c89b9e158ad2ecd97ee26e677da29a9e1b/rpds_py-0.27.1-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:d252f2d8ca0195faa707f8eb9368955760880b2b42a8ee16d382bf5dd807f89a", size = 401093, upload-time = "2025-08-27T12:12:58.985Z" }, - { url = "https://files.pythonhosted.org/packages/37/8e/ac8577e3ecdd5593e283d46907d7011618994e1d7ab992711ae0f78b9937/rpds_py-0.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6e5e54da1e74b91dbc7996b56640f79b195d5925c2b78efaa8c5d53e1d88edde", size = 417969, upload-time = "2025-08-27T12:13:00.367Z" }, - { url = "https://files.pythonhosted.org/packages/66/6d/87507430a8f74a93556fe55c6485ba9c259949a853ce407b1e23fea5ba31/rpds_py-0.27.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ffce0481cc6e95e5b3f0a47ee17ffbd234399e6d532f394c8dce320c3b089c21", size = 558302, upload-time = "2025-08-27T12:13:01.737Z" }, - { url = "https://files.pythonhosted.org/packages/3a/bb/1db4781ce1dda3eecc735e3152659a27b90a02ca62bfeea17aee45cc0fbc/rpds_py-0.27.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a205fdfe55c90c2cd8e540ca9ceba65cbe6629b443bc05db1f590a3db8189ff9", size = 589259, upload-time = "2025-08-27T12:13:03.127Z" }, - { url = "https://files.pythonhosted.org/packages/7b/0e/ae1c8943d11a814d01b482e1f8da903f88047a962dff9bbdadf3bd6e6fd1/rpds_py-0.27.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:689fb5200a749db0415b092972e8eba85847c23885c8543a8b0f5c009b1a5948", size = 554983, upload-time = "2025-08-27T12:13:04.516Z" }, - { url = "https://files.pythonhosted.org/packages/b2/d5/0b2a55415931db4f112bdab072443ff76131b5ac4f4dc98d10d2d357eb03/rpds_py-0.27.1-cp311-cp311-win32.whl", hash = "sha256:3182af66048c00a075010bc7f4860f33913528a4b6fc09094a6e7598e462fe39", size = 217154, upload-time = "2025-08-27T12:13:06.278Z" }, - { url = "https://files.pythonhosted.org/packages/24/75/3b7ffe0d50dc86a6a964af0d1cc3a4a2cdf437cb7b099a4747bbb96d1819/rpds_py-0.27.1-cp311-cp311-win_amd64.whl", hash = "sha256:b4938466c6b257b2f5c4ff98acd8128ec36b5059e5c8f8372d79316b1c36bb15", size = 228627, upload-time = "2025-08-27T12:13:07.625Z" }, - { url = "https://files.pythonhosted.org/packages/8d/3f/4fd04c32abc02c710f09a72a30c9a55ea3cc154ef8099078fd50a0596f8e/rpds_py-0.27.1-cp311-cp311-win_arm64.whl", hash = "sha256:2f57af9b4d0793e53266ee4325535a31ba48e2f875da81a9177c9926dfa60746", size = 220998, upload-time = "2025-08-27T12:13:08.972Z" }, - { url = "https://files.pythonhosted.org/packages/bd/fe/38de28dee5df58b8198c743fe2bea0c785c6d40941b9950bac4cdb71a014/rpds_py-0.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ae2775c1973e3c30316892737b91f9283f9908e3cc7625b9331271eaaed7dc90", size = 361887, upload-time = "2025-08-27T12:13:10.233Z" }, - { url = "https://files.pythonhosted.org/packages/7c/9a/4b6c7eedc7dd90986bf0fab6ea2a091ec11c01b15f8ba0a14d3f80450468/rpds_py-0.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2643400120f55c8a96f7c9d858f7be0c88d383cd4653ae2cf0d0c88f668073e5", size = 345795, upload-time = "2025-08-27T12:13:11.65Z" }, - { url = "https://files.pythonhosted.org/packages/6f/0e/e650e1b81922847a09cca820237b0edee69416a01268b7754d506ade11ad/rpds_py-0.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16323f674c089b0360674a4abd28d5042947d54ba620f72514d69be4ff64845e", size = 385121, upload-time = "2025-08-27T12:13:13.008Z" }, - { url = "https://files.pythonhosted.org/packages/1b/ea/b306067a712988e2bff00dcc7c8f31d26c29b6d5931b461aa4b60a013e33/rpds_py-0.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a1f4814b65eacac94a00fc9a526e3fdafd78e439469644032032d0d63de4881", size = 398976, upload-time = "2025-08-27T12:13:14.368Z" }, - { url = "https://files.pythonhosted.org/packages/2c/0a/26dc43c8840cb8fe239fe12dbc8d8de40f2365e838f3d395835dde72f0e5/rpds_py-0.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ba32c16b064267b22f1850a34051121d423b6f7338a12b9459550eb2096e7ec", size = 525953, upload-time = "2025-08-27T12:13:15.774Z" }, - { url = "https://files.pythonhosted.org/packages/22/14/c85e8127b573aaf3a0cbd7fbb8c9c99e735a4a02180c84da2a463b766e9e/rpds_py-0.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5c20f33fd10485b80f65e800bbe5f6785af510b9f4056c5a3c612ebc83ba6cb", size = 407915, upload-time = "2025-08-27T12:13:17.379Z" }, - { url = "https://files.pythonhosted.org/packages/ed/7b/8f4fee9ba1fb5ec856eb22d725a4efa3deb47f769597c809e03578b0f9d9/rpds_py-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:466bfe65bd932da36ff279ddd92de56b042f2266d752719beb97b08526268ec5", size = 386883, upload-time = "2025-08-27T12:13:18.704Z" }, - { url = "https://files.pythonhosted.org/packages/86/47/28fa6d60f8b74fcdceba81b272f8d9836ac0340570f68f5df6b41838547b/rpds_py-0.27.1-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:41e532bbdcb57c92ba3be62c42e9f096431b4cf478da9bc3bc6ce5c38ab7ba7a", size = 405699, upload-time = "2025-08-27T12:13:20.089Z" }, - { url = "https://files.pythonhosted.org/packages/d0/fd/c5987b5e054548df56953a21fe2ebed51fc1ec7c8f24fd41c067b68c4a0a/rpds_py-0.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f149826d742b406579466283769a8ea448eed82a789af0ed17b0cd5770433444", size = 423713, upload-time = "2025-08-27T12:13:21.436Z" }, - { url = "https://files.pythonhosted.org/packages/ac/ba/3c4978b54a73ed19a7d74531be37a8bcc542d917c770e14d372b8daea186/rpds_py-0.27.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:80c60cfb5310677bd67cb1e85a1e8eb52e12529545441b43e6f14d90b878775a", size = 562324, upload-time = "2025-08-27T12:13:22.789Z" }, - { url = "https://files.pythonhosted.org/packages/b5/6c/6943a91768fec16db09a42b08644b960cff540c66aab89b74be6d4a144ba/rpds_py-0.27.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7ee6521b9baf06085f62ba9c7a3e5becffbc32480d2f1b351559c001c38ce4c1", size = 593646, upload-time = "2025-08-27T12:13:24.122Z" }, - { url = "https://files.pythonhosted.org/packages/11/73/9d7a8f4be5f4396f011a6bb7a19fe26303a0dac9064462f5651ced2f572f/rpds_py-0.27.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a512c8263249a9d68cac08b05dd59d2b3f2061d99b322813cbcc14c3c7421998", size = 558137, upload-time = "2025-08-27T12:13:25.557Z" }, - { url = "https://files.pythonhosted.org/packages/6e/96/6772cbfa0e2485bcceef8071de7821f81aeac8bb45fbfd5542a3e8108165/rpds_py-0.27.1-cp312-cp312-win32.whl", hash = "sha256:819064fa048ba01b6dadc5116f3ac48610435ac9a0058bbde98e569f9e785c39", size = 221343, upload-time = "2025-08-27T12:13:26.967Z" }, - { url = "https://files.pythonhosted.org/packages/67/b6/c82f0faa9af1c6a64669f73a17ee0eeef25aff30bb9a1c318509efe45d84/rpds_py-0.27.1-cp312-cp312-win_amd64.whl", hash = "sha256:d9199717881f13c32c4046a15f024971a3b78ad4ea029e8da6b86e5aa9cf4594", size = 232497, upload-time = "2025-08-27T12:13:28.326Z" }, - { url = "https://files.pythonhosted.org/packages/e1/96/2817b44bd2ed11aebacc9251da03689d56109b9aba5e311297b6902136e2/rpds_py-0.27.1-cp312-cp312-win_arm64.whl", hash = "sha256:33aa65b97826a0e885ef6e278fbd934e98cdcfed80b63946025f01e2f5b29502", size = 222790, upload-time = "2025-08-27T12:13:29.71Z" }, - { url = "https://files.pythonhosted.org/packages/cc/77/610aeee8d41e39080c7e14afa5387138e3c9fa9756ab893d09d99e7d8e98/rpds_py-0.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e4b9fcfbc021633863a37e92571d6f91851fa656f0180246e84cbd8b3f6b329b", size = 361741, upload-time = "2025-08-27T12:13:31.039Z" }, - { url = "https://files.pythonhosted.org/packages/3a/fc/c43765f201c6a1c60be2043cbdb664013def52460a4c7adace89d6682bf4/rpds_py-0.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1441811a96eadca93c517d08df75de45e5ffe68aa3089924f963c782c4b898cf", size = 345574, upload-time = "2025-08-27T12:13:32.902Z" }, - { url = "https://files.pythonhosted.org/packages/20/42/ee2b2ca114294cd9847d0ef9c26d2b0851b2e7e00bf14cc4c0b581df0fc3/rpds_py-0.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55266dafa22e672f5a4f65019015f90336ed31c6383bd53f5e7826d21a0e0b83", size = 385051, upload-time = "2025-08-27T12:13:34.228Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e8/1e430fe311e4799e02e2d1af7c765f024e95e17d651612425b226705f910/rpds_py-0.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d78827d7ac08627ea2c8e02c9e5b41180ea5ea1f747e9db0915e3adf36b62dcf", size = 398395, upload-time = "2025-08-27T12:13:36.132Z" }, - { url = "https://files.pythonhosted.org/packages/82/95/9dc227d441ff2670651c27a739acb2535ccaf8b351a88d78c088965e5996/rpds_py-0.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae92443798a40a92dc5f0b01d8a7c93adde0c4dc965310a29ae7c64d72b9fad2", size = 524334, upload-time = "2025-08-27T12:13:37.562Z" }, - { url = "https://files.pythonhosted.org/packages/87/01/a670c232f401d9ad461d9a332aa4080cd3cb1d1df18213dbd0d2a6a7ab51/rpds_py-0.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c46c9dd2403b66a2a3b9720ec4b74d4ab49d4fabf9f03dfdce2d42af913fe8d0", size = 407691, upload-time = "2025-08-27T12:13:38.94Z" }, - { url = "https://files.pythonhosted.org/packages/03/36/0a14aebbaa26fe7fab4780c76f2239e76cc95a0090bdb25e31d95c492fcd/rpds_py-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2efe4eb1d01b7f5f1939f4ef30ecea6c6b3521eec451fb93191bf84b2a522418", size = 386868, upload-time = "2025-08-27T12:13:40.192Z" }, - { url = "https://files.pythonhosted.org/packages/3b/03/8c897fb8b5347ff6c1cc31239b9611c5bf79d78c984430887a353e1409a1/rpds_py-0.27.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:15d3b4d83582d10c601f481eca29c3f138d44c92187d197aff663a269197c02d", size = 405469, upload-time = "2025-08-27T12:13:41.496Z" }, - { url = "https://files.pythonhosted.org/packages/da/07/88c60edc2df74850d496d78a1fdcdc7b54360a7f610a4d50008309d41b94/rpds_py-0.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4ed2e16abbc982a169d30d1a420274a709949e2cbdef119fe2ec9d870b42f274", size = 422125, upload-time = "2025-08-27T12:13:42.802Z" }, - { url = "https://files.pythonhosted.org/packages/6b/86/5f4c707603e41b05f191a749984f390dabcbc467cf833769b47bf14ba04f/rpds_py-0.27.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a75f305c9b013289121ec0f1181931975df78738cdf650093e6b86d74aa7d8dd", size = 562341, upload-time = "2025-08-27T12:13:44.472Z" }, - { url = "https://files.pythonhosted.org/packages/b2/92/3c0cb2492094e3cd9baf9e49bbb7befeceb584ea0c1a8b5939dca4da12e5/rpds_py-0.27.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:67ce7620704745881a3d4b0ada80ab4d99df390838839921f99e63c474f82cf2", size = 592511, upload-time = "2025-08-27T12:13:45.898Z" }, - { url = "https://files.pythonhosted.org/packages/10/bb/82e64fbb0047c46a168faa28d0d45a7851cd0582f850b966811d30f67ad8/rpds_py-0.27.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9d992ac10eb86d9b6f369647b6a3f412fc0075cfd5d799530e84d335e440a002", size = 557736, upload-time = "2025-08-27T12:13:47.408Z" }, - { url = "https://files.pythonhosted.org/packages/00/95/3c863973d409210da7fb41958172c6b7dbe7fc34e04d3cc1f10bb85e979f/rpds_py-0.27.1-cp313-cp313-win32.whl", hash = "sha256:4f75e4bd8ab8db624e02c8e2fc4063021b58becdbe6df793a8111d9343aec1e3", size = 221462, upload-time = "2025-08-27T12:13:48.742Z" }, - { url = "https://files.pythonhosted.org/packages/ce/2c/5867b14a81dc217b56d95a9f2a40fdbc56a1ab0181b80132beeecbd4b2d6/rpds_py-0.27.1-cp313-cp313-win_amd64.whl", hash = "sha256:f9025faafc62ed0b75a53e541895ca272815bec18abe2249ff6501c8f2e12b83", size = 232034, upload-time = "2025-08-27T12:13:50.11Z" }, - { url = "https://files.pythonhosted.org/packages/c7/78/3958f3f018c01923823f1e47f1cc338e398814b92d83cd278364446fac66/rpds_py-0.27.1-cp313-cp313-win_arm64.whl", hash = "sha256:ed10dc32829e7d222b7d3b93136d25a406ba9788f6a7ebf6809092da1f4d279d", size = 222392, upload-time = "2025-08-27T12:13:52.587Z" }, - { url = "https://files.pythonhosted.org/packages/01/76/1cdf1f91aed5c3a7bf2eba1f1c4e4d6f57832d73003919a20118870ea659/rpds_py-0.27.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:92022bbbad0d4426e616815b16bc4127f83c9a74940e1ccf3cfe0b387aba0228", size = 358355, upload-time = "2025-08-27T12:13:54.012Z" }, - { url = "https://files.pythonhosted.org/packages/c3/6f/bf142541229374287604caf3bb2a4ae17f0a580798fd72d3b009b532db4e/rpds_py-0.27.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:47162fdab9407ec3f160805ac3e154df042e577dd53341745fc7fb3f625e6d92", size = 342138, upload-time = "2025-08-27T12:13:55.791Z" }, - { url = "https://files.pythonhosted.org/packages/1a/77/355b1c041d6be40886c44ff5e798b4e2769e497b790f0f7fd1e78d17e9a8/rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb89bec23fddc489e5d78b550a7b773557c9ab58b7946154a10a6f7a214a48b2", size = 380247, upload-time = "2025-08-27T12:13:57.683Z" }, - { url = "https://files.pythonhosted.org/packages/d6/a4/d9cef5c3946ea271ce2243c51481971cd6e34f21925af2783dd17b26e815/rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e48af21883ded2b3e9eb48cb7880ad8598b31ab752ff3be6457001d78f416723", size = 390699, upload-time = "2025-08-27T12:13:59.137Z" }, - { url = "https://files.pythonhosted.org/packages/3a/06/005106a7b8c6c1a7e91b73169e49870f4af5256119d34a361ae5240a0c1d/rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6f5b7bd8e219ed50299e58551a410b64daafb5017d54bbe822e003856f06a802", size = 521852, upload-time = "2025-08-27T12:14:00.583Z" }, - { url = "https://files.pythonhosted.org/packages/e5/3e/50fb1dac0948e17a02eb05c24510a8fe12d5ce8561c6b7b7d1339ab7ab9c/rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08f1e20bccf73b08d12d804d6e1c22ca5530e71659e6673bce31a6bb71c1e73f", size = 402582, upload-time = "2025-08-27T12:14:02.034Z" }, - { url = "https://files.pythonhosted.org/packages/cb/b0/f4e224090dc5b0ec15f31a02d746ab24101dd430847c4d99123798661bfc/rpds_py-0.27.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dc5dceeaefcc96dc192e3a80bbe1d6c410c469e97bdd47494a7d930987f18b2", size = 384126, upload-time = "2025-08-27T12:14:03.437Z" }, - { url = "https://files.pythonhosted.org/packages/54/77/ac339d5f82b6afff1df8f0fe0d2145cc827992cb5f8eeb90fc9f31ef7a63/rpds_py-0.27.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d76f9cc8665acdc0c9177043746775aa7babbf479b5520b78ae4002d889f5c21", size = 399486, upload-time = "2025-08-27T12:14:05.443Z" }, - { url = "https://files.pythonhosted.org/packages/d6/29/3e1c255eee6ac358c056a57d6d6869baa00a62fa32eea5ee0632039c50a3/rpds_py-0.27.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:134fae0e36022edad8290a6661edf40c023562964efea0cc0ec7f5d392d2aaef", size = 414832, upload-time = "2025-08-27T12:14:06.902Z" }, - { url = "https://files.pythonhosted.org/packages/3f/db/6d498b844342deb3fa1d030598db93937a9964fcf5cb4da4feb5f17be34b/rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb11a4f1b2b63337cfd3b4d110af778a59aae51c81d195768e353d8b52f88081", size = 557249, upload-time = "2025-08-27T12:14:08.37Z" }, - { url = "https://files.pythonhosted.org/packages/60/f3/690dd38e2310b6f68858a331399b4d6dbb9132c3e8ef8b4333b96caf403d/rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:13e608ac9f50a0ed4faec0e90ece76ae33b34c0e8656e3dceb9a7db994c692cd", size = 587356, upload-time = "2025-08-27T12:14:10.034Z" }, - { url = "https://files.pythonhosted.org/packages/86/e3/84507781cccd0145f35b1dc32c72675200c5ce8d5b30f813e49424ef68fc/rpds_py-0.27.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dd2135527aa40f061350c3f8f89da2644de26cd73e4de458e79606384f4f68e7", size = 555300, upload-time = "2025-08-27T12:14:11.783Z" }, - { url = "https://files.pythonhosted.org/packages/e5/ee/375469849e6b429b3516206b4580a79e9ef3eb12920ddbd4492b56eaacbe/rpds_py-0.27.1-cp313-cp313t-win32.whl", hash = "sha256:3020724ade63fe320a972e2ffd93b5623227e684315adce194941167fee02688", size = 216714, upload-time = "2025-08-27T12:14:13.629Z" }, - { url = "https://files.pythonhosted.org/packages/21/87/3fc94e47c9bd0742660e84706c311a860dcae4374cf4a03c477e23ce605a/rpds_py-0.27.1-cp313-cp313t-win_amd64.whl", hash = "sha256:8ee50c3e41739886606388ba3ab3ee2aae9f35fb23f833091833255a31740797", size = 228943, upload-time = "2025-08-27T12:14:14.937Z" }, - { url = "https://files.pythonhosted.org/packages/70/36/b6e6066520a07cf029d385de869729a895917b411e777ab1cde878100a1d/rpds_py-0.27.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:acb9aafccaae278f449d9c713b64a9e68662e7799dbd5859e2c6b3c67b56d334", size = 362472, upload-time = "2025-08-27T12:14:16.333Z" }, - { url = "https://files.pythonhosted.org/packages/af/07/b4646032e0dcec0df9c73a3bd52f63bc6c5f9cda992f06bd0e73fe3fbebd/rpds_py-0.27.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b7fb801aa7f845ddf601c49630deeeccde7ce10065561d92729bfe81bd21fb33", size = 345676, upload-time = "2025-08-27T12:14:17.764Z" }, - { url = "https://files.pythonhosted.org/packages/b0/16/2f1003ee5d0af4bcb13c0cf894957984c32a6751ed7206db2aee7379a55e/rpds_py-0.27.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0dd05afb46597b9a2e11c351e5e4283c741237e7f617ffb3252780cca9336a", size = 385313, upload-time = "2025-08-27T12:14:19.829Z" }, - { url = "https://files.pythonhosted.org/packages/05/cd/7eb6dd7b232e7f2654d03fa07f1414d7dfc980e82ba71e40a7c46fd95484/rpds_py-0.27.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b6dfb0e058adb12d8b1d1b25f686e94ffa65d9995a5157afe99743bf7369d62b", size = 399080, upload-time = "2025-08-27T12:14:21.531Z" }, - { url = "https://files.pythonhosted.org/packages/20/51/5829afd5000ec1cb60f304711f02572d619040aa3ec033d8226817d1e571/rpds_py-0.27.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed090ccd235f6fa8bb5861684567f0a83e04f52dfc2e5c05f2e4b1309fcf85e7", size = 523868, upload-time = "2025-08-27T12:14:23.485Z" }, - { url = "https://files.pythonhosted.org/packages/05/2c/30eebca20d5db95720ab4d2faec1b5e4c1025c473f703738c371241476a2/rpds_py-0.27.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf876e79763eecf3e7356f157540d6a093cef395b65514f17a356f62af6cc136", size = 408750, upload-time = "2025-08-27T12:14:24.924Z" }, - { url = "https://files.pythonhosted.org/packages/90/1a/cdb5083f043597c4d4276eae4e4c70c55ab5accec078da8611f24575a367/rpds_py-0.27.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12ed005216a51b1d6e2b02a7bd31885fe317e45897de81d86dcce7d74618ffff", size = 387688, upload-time = "2025-08-27T12:14:27.537Z" }, - { url = "https://files.pythonhosted.org/packages/7c/92/cf786a15320e173f945d205ab31585cc43969743bb1a48b6888f7a2b0a2d/rpds_py-0.27.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:ee4308f409a40e50593c7e3bb8cbe0b4d4c66d1674a316324f0c2f5383b486f9", size = 407225, upload-time = "2025-08-27T12:14:28.981Z" }, - { url = "https://files.pythonhosted.org/packages/33/5c/85ee16df5b65063ef26017bef33096557a4c83fbe56218ac7cd8c235f16d/rpds_py-0.27.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0b08d152555acf1f455154d498ca855618c1378ec810646fcd7c76416ac6dc60", size = 423361, upload-time = "2025-08-27T12:14:30.469Z" }, - { url = "https://files.pythonhosted.org/packages/4b/8e/1c2741307fcabd1a334ecf008e92c4f47bb6f848712cf15c923becfe82bb/rpds_py-0.27.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:dce51c828941973a5684d458214d3a36fcd28da3e1875d659388f4f9f12cc33e", size = 562493, upload-time = "2025-08-27T12:14:31.987Z" }, - { url = "https://files.pythonhosted.org/packages/04/03/5159321baae9b2222442a70c1f988cbbd66b9be0675dd3936461269be360/rpds_py-0.27.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c1476d6f29eb81aa4151c9a31219b03f1f798dc43d8af1250a870735516a1212", size = 592623, upload-time = "2025-08-27T12:14:33.543Z" }, - { url = "https://files.pythonhosted.org/packages/ff/39/c09fd1ad28b85bc1d4554a8710233c9f4cefd03d7717a1b8fbfd171d1167/rpds_py-0.27.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3ce0cac322b0d69b63c9cdb895ee1b65805ec9ffad37639f291dd79467bee675", size = 558800, upload-time = "2025-08-27T12:14:35.436Z" }, - { url = "https://files.pythonhosted.org/packages/c5/d6/99228e6bbcf4baa764b18258f519a9035131d91b538d4e0e294313462a98/rpds_py-0.27.1-cp314-cp314-win32.whl", hash = "sha256:dfbfac137d2a3d0725758cd141f878bf4329ba25e34979797c89474a89a8a3a3", size = 221943, upload-time = "2025-08-27T12:14:36.898Z" }, - { url = "https://files.pythonhosted.org/packages/be/07/c802bc6b8e95be83b79bdf23d1aa61d68324cb1006e245d6c58e959e314d/rpds_py-0.27.1-cp314-cp314-win_amd64.whl", hash = "sha256:a6e57b0abfe7cc513450fcf529eb486b6e4d3f8aee83e92eb5f1ef848218d456", size = 233739, upload-time = "2025-08-27T12:14:38.386Z" }, - { url = "https://files.pythonhosted.org/packages/c8/89/3e1b1c16d4c2d547c5717377a8df99aee8099ff050f87c45cb4d5fa70891/rpds_py-0.27.1-cp314-cp314-win_arm64.whl", hash = "sha256:faf8d146f3d476abfee026c4ae3bdd9ca14236ae4e4c310cbd1cf75ba33d24a3", size = 223120, upload-time = "2025-08-27T12:14:39.82Z" }, - { url = "https://files.pythonhosted.org/packages/62/7e/dc7931dc2fa4a6e46b2a4fa744a9fe5c548efd70e0ba74f40b39fa4a8c10/rpds_py-0.27.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:ba81d2b56b6d4911ce735aad0a1d4495e808b8ee4dc58715998741a26874e7c2", size = 358944, upload-time = "2025-08-27T12:14:41.199Z" }, - { url = "https://files.pythonhosted.org/packages/e6/22/4af76ac4e9f336bfb1a5f240d18a33c6b2fcaadb7472ac7680576512b49a/rpds_py-0.27.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:84f7d509870098de0e864cad0102711c1e24e9b1a50ee713b65928adb22269e4", size = 342283, upload-time = "2025-08-27T12:14:42.699Z" }, - { url = "https://files.pythonhosted.org/packages/1c/15/2a7c619b3c2272ea9feb9ade67a45c40b3eeb500d503ad4c28c395dc51b4/rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e960fc78fecd1100539f14132425e1d5fe44ecb9239f8f27f079962021523e", size = 380320, upload-time = "2025-08-27T12:14:44.157Z" }, - { url = "https://files.pythonhosted.org/packages/a2/7d/4c6d243ba4a3057e994bb5bedd01b5c963c12fe38dde707a52acdb3849e7/rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62f85b665cedab1a503747617393573995dac4600ff51869d69ad2f39eb5e817", size = 391760, upload-time = "2025-08-27T12:14:45.845Z" }, - { url = "https://files.pythonhosted.org/packages/b4/71/b19401a909b83bcd67f90221330bc1ef11bc486fe4e04c24388d28a618ae/rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fed467af29776f6556250c9ed85ea5a4dd121ab56a5f8b206e3e7a4c551e48ec", size = 522476, upload-time = "2025-08-27T12:14:47.364Z" }, - { url = "https://files.pythonhosted.org/packages/e4/44/1a3b9715c0455d2e2f0f6df5ee6d6f5afdc423d0773a8a682ed2b43c566c/rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2729615f9d430af0ae6b36cf042cb55c0936408d543fb691e1a9e36648fd35a", size = 403418, upload-time = "2025-08-27T12:14:49.991Z" }, - { url = "https://files.pythonhosted.org/packages/1c/4b/fb6c4f14984eb56673bc868a66536f53417ddb13ed44b391998100a06a96/rpds_py-0.27.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b207d881a9aef7ba753d69c123a35d96ca7cb808056998f6b9e8747321f03b8", size = 384771, upload-time = "2025-08-27T12:14:52.159Z" }, - { url = "https://files.pythonhosted.org/packages/c0/56/d5265d2d28b7420d7b4d4d85cad8ef891760f5135102e60d5c970b976e41/rpds_py-0.27.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:639fd5efec029f99b79ae47e5d7e00ad8a773da899b6309f6786ecaf22948c48", size = 400022, upload-time = "2025-08-27T12:14:53.859Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e9/9f5fc70164a569bdd6ed9046486c3568d6926e3a49bdefeeccfb18655875/rpds_py-0.27.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fecc80cb2a90e28af8a9b366edacf33d7a91cbfe4c2c4544ea1246e949cfebeb", size = 416787, upload-time = "2025-08-27T12:14:55.673Z" }, - { url = "https://files.pythonhosted.org/packages/d4/64/56dd03430ba491db943a81dcdef115a985aac5f44f565cd39a00c766d45c/rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42a89282d711711d0a62d6f57d81aa43a1368686c45bc1c46b7f079d55692734", size = 557538, upload-time = "2025-08-27T12:14:57.245Z" }, - { url = "https://files.pythonhosted.org/packages/3f/36/92cc885a3129993b1d963a2a42ecf64e6a8e129d2c7cc980dbeba84e55fb/rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:cf9931f14223de59551ab9d38ed18d92f14f055a5f78c1d8ad6493f735021bbb", size = 588512, upload-time = "2025-08-27T12:14:58.728Z" }, - { url = "https://files.pythonhosted.org/packages/dd/10/6b283707780a81919f71625351182b4f98932ac89a09023cb61865136244/rpds_py-0.27.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f39f58a27cc6e59f432b568ed8429c7e1641324fbe38131de852cd77b2d534b0", size = 555813, upload-time = "2025-08-27T12:15:00.334Z" }, - { url = "https://files.pythonhosted.org/packages/04/2e/30b5ea18c01379da6272a92825dd7e53dc9d15c88a19e97932d35d430ef7/rpds_py-0.27.1-cp314-cp314t-win32.whl", hash = "sha256:d5fa0ee122dc09e23607a28e6d7b150da16c662e66409bbe85230e4c85bb528a", size = 217385, upload-time = "2025-08-27T12:15:01.937Z" }, - { url = "https://files.pythonhosted.org/packages/32/7d/97119da51cb1dd3f2f3c0805f155a3aa4a95fa44fe7d78ae15e69edf4f34/rpds_py-0.27.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6567d2bb951e21232c2f660c24cf3470bb96de56cdcb3f071a83feeaff8a2772", size = 230097, upload-time = "2025-08-27T12:15:03.961Z" }, - { url = "https://files.pythonhosted.org/packages/d5/63/b7cc415c345625d5e62f694ea356c58fb964861409008118f1245f8c3347/rpds_py-0.27.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7ba22cb9693df986033b91ae1d7a979bc399237d45fccf875b76f62bb9e52ddf", size = 371360, upload-time = "2025-08-27T12:15:29.218Z" }, - { url = "https://files.pythonhosted.org/packages/e5/8c/12e1b24b560cf378b8ffbdb9dc73abd529e1adcfcf82727dfd29c4a7b88d/rpds_py-0.27.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b640501be9288c77738b5492b3fd3abc4ba95c50c2e41273c8a1459f08298d3", size = 353933, upload-time = "2025-08-27T12:15:30.837Z" }, - { url = "https://files.pythonhosted.org/packages/9b/85/1bb2210c1f7a1b99e91fea486b9f0f894aa5da3a5ec7097cbad7dec6d40f/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb08b65b93e0c6dd70aac7f7890a9c0938d5ec71d5cb32d45cf844fb8ae47636", size = 382962, upload-time = "2025-08-27T12:15:32.348Z" }, - { url = "https://files.pythonhosted.org/packages/cc/c9/a839b9f219cf80ed65f27a7f5ddbb2809c1b85c966020ae2dff490e0b18e/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7ff07d696a7a38152ebdb8212ca9e5baab56656749f3d6004b34ab726b550b8", size = 394412, upload-time = "2025-08-27T12:15:33.839Z" }, - { url = "https://files.pythonhosted.org/packages/02/2d/b1d7f928b0b1f4fc2e0133e8051d199b01d7384875adc63b6ddadf3de7e5/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb7c72262deae25366e3b6c0c0ba46007967aea15d1eea746e44ddba8ec58dcc", size = 523972, upload-time = "2025-08-27T12:15:35.377Z" }, - { url = "https://files.pythonhosted.org/packages/a9/af/2cbf56edd2d07716df1aec8a726b3159deb47cb5c27e1e42b71d705a7c2f/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b002cab05d6339716b03a4a3a2ce26737f6231d7b523f339fa061d53368c9d8", size = 403273, upload-time = "2025-08-27T12:15:37.051Z" }, - { url = "https://files.pythonhosted.org/packages/c0/93/425e32200158d44ff01da5d9612c3b6711fe69f606f06e3895511f17473b/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f6b69d1c26c4704fec01311963a41d7de3ee0570a84ebde4d544e5a1859ffc", size = 385278, upload-time = "2025-08-27T12:15:38.571Z" }, - { url = "https://files.pythonhosted.org/packages/eb/1a/1a04a915ecd0551bfa9e77b7672d1937b4b72a0fc204a17deef76001cfb2/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:530064db9146b247351f2a0250b8f00b289accea4596a033e94be2389977de71", size = 402084, upload-time = "2025-08-27T12:15:40.529Z" }, - { url = "https://files.pythonhosted.org/packages/51/f7/66585c0fe5714368b62951d2513b684e5215beaceab2c6629549ddb15036/rpds_py-0.27.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7b90b0496570bd6b0321724a330d8b545827c4df2034b6ddfc5f5275f55da2ad", size = 419041, upload-time = "2025-08-27T12:15:42.191Z" }, - { url = "https://files.pythonhosted.org/packages/8e/7e/83a508f6b8e219bba2d4af077c35ba0e0cdd35a751a3be6a7cba5a55ad71/rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:879b0e14a2da6a1102a3fc8af580fc1ead37e6d6692a781bd8c83da37429b5ab", size = 560084, upload-time = "2025-08-27T12:15:43.839Z" }, - { url = "https://files.pythonhosted.org/packages/66/66/bb945683b958a1b19eb0fe715594630d0f36396ebdef4d9b89c2fa09aa56/rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:0d807710df3b5faa66c731afa162ea29717ab3be17bdc15f90f2d9f183da4059", size = 590115, upload-time = "2025-08-27T12:15:46.647Z" }, - { url = "https://files.pythonhosted.org/packages/12/00/ccfaafaf7db7e7adace915e5c2f2c2410e16402561801e9c7f96683002d3/rpds_py-0.27.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:3adc388fc3afb6540aec081fa59e6e0d3908722771aa1e37ffe22b220a436f0b", size = 556561, upload-time = "2025-08-27T12:15:48.219Z" }, - { url = "https://files.pythonhosted.org/packages/e1/b7/92b6ed9aad103bfe1c45df98453dfae40969eef2cb6c6239c58d7e96f1b3/rpds_py-0.27.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c796c0c1cc68cb08b0284db4229f5af76168172670c74908fdbd4b7d7f515819", size = 229125, upload-time = "2025-08-27T12:15:49.956Z" }, - { url = "https://files.pythonhosted.org/packages/0c/ed/e1fba02de17f4f76318b834425257c8ea297e415e12c68b4361f63e8ae92/rpds_py-0.27.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cdfe4bb2f9fe7458b7453ad3c33e726d6d1c7c0a72960bcc23800d77384e42df", size = 371402, upload-time = "2025-08-27T12:15:51.561Z" }, - { url = "https://files.pythonhosted.org/packages/af/7c/e16b959b316048b55585a697e94add55a4ae0d984434d279ea83442e460d/rpds_py-0.27.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:8fabb8fd848a5f75a2324e4a84501ee3a5e3c78d8603f83475441866e60b94a3", size = 354084, upload-time = "2025-08-27T12:15:53.219Z" }, - { url = "https://files.pythonhosted.org/packages/de/c1/ade645f55de76799fdd08682d51ae6724cb46f318573f18be49b1e040428/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda8719d598f2f7f3e0f885cba8646644b55a187762bec091fa14a2b819746a9", size = 383090, upload-time = "2025-08-27T12:15:55.158Z" }, - { url = "https://files.pythonhosted.org/packages/1f/27/89070ca9b856e52960da1472efcb6c20ba27cfe902f4f23ed095b9cfc61d/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c64d07e95606ec402a0a1c511fe003873fa6af630bda59bac77fac8b4318ebc", size = 394519, upload-time = "2025-08-27T12:15:57.238Z" }, - { url = "https://files.pythonhosted.org/packages/b3/28/be120586874ef906aa5aeeae95ae8df4184bc757e5b6bd1c729ccff45ed5/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93a2ed40de81bcff59aabebb626562d48332f3d028ca2036f1d23cbb52750be4", size = 523817, upload-time = "2025-08-27T12:15:59.237Z" }, - { url = "https://files.pythonhosted.org/packages/a8/ef/70cc197bc11cfcde02a86f36ac1eed15c56667c2ebddbdb76a47e90306da/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:387ce8c44ae94e0ec50532d9cb0edce17311024c9794eb196b90e1058aadeb66", size = 403240, upload-time = "2025-08-27T12:16:00.923Z" }, - { url = "https://files.pythonhosted.org/packages/cf/35/46936cca449f7f518f2f4996e0e8344db4b57e2081e752441154089d2a5f/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaf94f812c95b5e60ebaf8bfb1898a7d7cb9c1af5744d4a67fa47796e0465d4e", size = 385194, upload-time = "2025-08-27T12:16:02.802Z" }, - { url = "https://files.pythonhosted.org/packages/e1/62/29c0d3e5125c3270b51415af7cbff1ec587379c84f55a5761cc9efa8cd06/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4848ca84d6ded9b58e474dfdbad4b8bfb450344c0551ddc8d958bf4b36aa837c", size = 402086, upload-time = "2025-08-27T12:16:04.806Z" }, - { url = "https://files.pythonhosted.org/packages/8f/66/03e1087679227785474466fdd04157fb793b3b76e3fcf01cbf4c693c1949/rpds_py-0.27.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2bde09cbcf2248b73c7c323be49b280180ff39fadcfe04e7b6f54a678d02a7cf", size = 419272, upload-time = "2025-08-27T12:16:06.471Z" }, - { url = "https://files.pythonhosted.org/packages/6a/24/e3e72d265121e00b063aef3e3501e5b2473cf1b23511d56e529531acf01e/rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:94c44ee01fd21c9058f124d2d4f0c9dc7634bec93cd4b38eefc385dabe71acbf", size = 560003, upload-time = "2025-08-27T12:16:08.06Z" }, - { url = "https://files.pythonhosted.org/packages/26/ca/f5a344c534214cc2d41118c0699fffbdc2c1bc7046f2a2b9609765ab9c92/rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:df8b74962e35c9249425d90144e721eed198e6555a0e22a563d29fe4486b51f6", size = 590482, upload-time = "2025-08-27T12:16:10.137Z" }, - { url = "https://files.pythonhosted.org/packages/ce/08/4349bdd5c64d9d193c360aa9db89adeee6f6682ab8825dca0a3f535f434f/rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:dc23e6820e3b40847e2f4a7726462ba0cf53089512abe9ee16318c366494c17a", size = 556523, upload-time = "2025-08-27T12:16:12.188Z" }, +version = "0.28.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/48/dc/95f074d43452b3ef5d06276696ece4b3b5d696e7c9ad7173c54b1390cd70/rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea", size = 27419, upload-time = "2025-10-22T22:24:29.327Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/f8/13bb772dc7cbf2c3c5b816febc34fa0cb2c64a08e0569869585684ce6631/rpds_py-0.28.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7b6013db815417eeb56b2d9d7324e64fcd4fa289caeee6e7a78b2e11fc9b438a", size = 362820, upload-time = "2025-10-22T22:21:15.074Z" }, + { url = "https://files.pythonhosted.org/packages/84/91/6acce964aab32469c3dbe792cb041a752d64739c534e9c493c701ef0c032/rpds_py-0.28.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a4c6b05c685c0c03f80dabaeb73e74218c49deea965ca63f76a752807397207", size = 348499, upload-time = "2025-10-22T22:21:17.658Z" }, + { url = "https://files.pythonhosted.org/packages/f1/93/c05bb1f4f5e0234db7c4917cb8dd5e2e0a9a7b26dc74b1b7bee3c9cfd477/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4794c6c3fbe8f9ac87699b131a1f26e7b4abcf6d828da46a3a52648c7930eba", size = 379356, upload-time = "2025-10-22T22:21:19.847Z" }, + { url = "https://files.pythonhosted.org/packages/5c/37/e292da436f0773e319753c567263427cdf6c645d30b44f09463ff8216cda/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e8456b6ee5527112ff2354dd9087b030e3429e43a74f480d4a5ca79d269fd85", size = 390151, upload-time = "2025-10-22T22:21:21.569Z" }, + { url = "https://files.pythonhosted.org/packages/76/87/a4e3267131616e8faf10486dc00eaedf09bd61c87f01e5ef98e782ee06c9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:beb880a9ca0a117415f241f66d56025c02037f7c4efc6fe59b5b8454f1eaa50d", size = 524831, upload-time = "2025-10-22T22:21:23.394Z" }, + { url = "https://files.pythonhosted.org/packages/e1/c8/4a4ca76f0befae9515da3fad11038f0fce44f6bb60b21fe9d9364dd51fb0/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6897bebb118c44b38c9cb62a178e09f1593c949391b9a1a6fe777ccab5934ee7", size = 404687, upload-time = "2025-10-22T22:21:25.201Z" }, + { url = "https://files.pythonhosted.org/packages/6a/65/118afe854424456beafbbebc6b34dcf6d72eae3a08b4632bc4220f8240d9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b553dd06e875249fd43efd727785efb57a53180e0fde321468222eabbeaafa", size = 382683, upload-time = "2025-10-22T22:21:26.536Z" }, + { url = "https://files.pythonhosted.org/packages/f7/bc/0625064041fb3a0c77ecc8878c0e8341b0ae27ad0f00cf8f2b57337a1e63/rpds_py-0.28.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:f0b2044fdddeea5b05df832e50d2a06fe61023acb44d76978e1b060206a8a476", size = 398927, upload-time = "2025-10-22T22:21:27.864Z" }, + { url = "https://files.pythonhosted.org/packages/5d/1a/fed7cf2f1ee8a5e4778f2054153f2cfcf517748875e2f5b21cf8907cd77d/rpds_py-0.28.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05cf1e74900e8da73fa08cc76c74a03345e5a3e37691d07cfe2092d7d8e27b04", size = 411590, upload-time = "2025-10-22T22:21:29.474Z" }, + { url = "https://files.pythonhosted.org/packages/c1/64/a8e0f67fa374a6c472dbb0afdaf1ef744724f165abb6899f20e2f1563137/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:efd489fec7c311dae25e94fe7eeda4b3d06be71c68f2cf2e8ef990ffcd2cd7e8", size = 559843, upload-time = "2025-10-22T22:21:30.917Z" }, + { url = "https://files.pythonhosted.org/packages/a9/ea/e10353f6d7c105be09b8135b72787a65919971ae0330ad97d87e4e199880/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ada7754a10faacd4f26067e62de52d6af93b6d9542f0df73c57b9771eb3ba9c4", size = 584188, upload-time = "2025-10-22T22:21:32.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/b0/a19743e0763caf0c89f6fc6ba6fbd9a353b24ffb4256a492420c5517da5a/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c2a34fd26588949e1e7977cfcbb17a9a42c948c100cab890c6d8d823f0586457", size = 550052, upload-time = "2025-10-22T22:21:34.702Z" }, + { url = "https://files.pythonhosted.org/packages/de/bc/ec2c004f6c7d6ab1e25dae875cdb1aee087c3ebed5b73712ed3000e3851a/rpds_py-0.28.0-cp310-cp310-win32.whl", hash = "sha256:f9174471d6920cbc5e82a7822de8dfd4dcea86eb828b04fc8c6519a77b0ee51e", size = 215110, upload-time = "2025-10-22T22:21:36.645Z" }, + { url = "https://files.pythonhosted.org/packages/6c/de/4ce8abf59674e17187023933547d2018363e8fc76ada4f1d4d22871ccb6e/rpds_py-0.28.0-cp310-cp310-win_amd64.whl", hash = "sha256:6e32dd207e2c4f8475257a3540ab8a93eff997abfa0a3fdb287cae0d6cd874b8", size = 223850, upload-time = "2025-10-22T22:21:38.006Z" }, + { url = "https://files.pythonhosted.org/packages/a6/34/058d0db5471c6be7bef82487ad5021ff8d1d1d27794be8730aad938649cf/rpds_py-0.28.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:03065002fd2e287725d95fbc69688e0c6daf6c6314ba38bdbaa3895418e09296", size = 362344, upload-time = "2025-10-22T22:21:39.713Z" }, + { url = "https://files.pythonhosted.org/packages/5d/67/9503f0ec8c055a0782880f300c50a2b8e5e72eb1f94dfc2053da527444dd/rpds_py-0.28.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28ea02215f262b6d078daec0b45344c89e161eab9526b0d898221d96fdda5f27", size = 348440, upload-time = "2025-10-22T22:21:41.056Z" }, + { url = "https://files.pythonhosted.org/packages/68/2e/94223ee9b32332a41d75b6f94b37b4ce3e93878a556fc5f152cbd856a81f/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dbade8fbf30bcc551cb352376c0ad64b067e4fc56f90e22ba70c3ce205988c", size = 379068, upload-time = "2025-10-22T22:21:42.593Z" }, + { url = "https://files.pythonhosted.org/packages/b4/25/54fd48f9f680cfc44e6a7f39a5fadf1d4a4a1fd0848076af4a43e79f998c/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c03002f54cc855860bfdc3442928ffdca9081e73b5b382ed0b9e8efe6e5e205", size = 390518, upload-time = "2025-10-22T22:21:43.998Z" }, + { url = "https://files.pythonhosted.org/packages/1b/85/ac258c9c27f2ccb1bd5d0697e53a82ebcf8088e3186d5d2bf8498ee7ed44/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9699fa7990368b22032baf2b2dce1f634388e4ffc03dfefaaac79f4695edc95", size = 525319, upload-time = "2025-10-22T22:21:45.645Z" }, + { url = "https://files.pythonhosted.org/packages/40/cb/c6734774789566d46775f193964b76627cd5f42ecf246d257ce84d1912ed/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9b06fe1a75e05e0713f06ea0c89ecb6452210fd60e2f1b6ddc1067b990e08d9", size = 404896, upload-time = "2025-10-22T22:21:47.544Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/14e37ce83202c632c89b0691185dca9532288ff9d390eacae3d2ff771bae/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9f83e7b326a3f9ec3ef84cda98fb0a74c7159f33e692032233046e7fd15da2", size = 382862, upload-time = "2025-10-22T22:21:49.176Z" }, + { url = "https://files.pythonhosted.org/packages/6a/83/f3642483ca971a54d60caa4449f9d6d4dbb56a53e0072d0deff51b38af74/rpds_py-0.28.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:0d3259ea9ad8743a75a43eb7819324cdab393263c91be86e2d1901ee65c314e0", size = 398848, upload-time = "2025-10-22T22:21:51.024Z" }, + { url = "https://files.pythonhosted.org/packages/44/09/2d9c8b2f88e399b4cfe86efdf2935feaf0394e4f14ab30c6c5945d60af7d/rpds_py-0.28.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a7548b345f66f6695943b4ef6afe33ccd3f1b638bd9afd0f730dd255c249c9e", size = 412030, upload-time = "2025-10-22T22:21:52.665Z" }, + { url = "https://files.pythonhosted.org/packages/dd/f5/e1cec473d4bde6df1fd3738be8e82d64dd0600868e76e92dfeaebbc2d18f/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9a40040aa388b037eb39416710fbcce9443498d2eaab0b9b45ae988b53f5c67", size = 559700, upload-time = "2025-10-22T22:21:54.123Z" }, + { url = "https://files.pythonhosted.org/packages/8d/be/73bb241c1649edbf14e98e9e78899c2c5e52bbe47cb64811f44d2cc11808/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f60c7ea34e78c199acd0d3cda37a99be2c861dd2b8cf67399784f70c9f8e57d", size = 584581, upload-time = "2025-10-22T22:21:56.102Z" }, + { url = "https://files.pythonhosted.org/packages/9c/9c/ffc6e9218cd1eb5c2c7dbd276c87cd10e8c2232c456b554169eb363381df/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1571ae4292649100d743b26d5f9c63503bb1fedf538a8f29a98dce2d5ba6b4e6", size = 549981, upload-time = "2025-10-22T22:21:58.253Z" }, + { url = "https://files.pythonhosted.org/packages/5f/50/da8b6d33803a94df0149345ee33e5d91ed4d25fc6517de6a25587eae4133/rpds_py-0.28.0-cp311-cp311-win32.whl", hash = "sha256:5cfa9af45e7c1140af7321fa0bef25b386ee9faa8928c80dc3a5360971a29e8c", size = 214729, upload-time = "2025-10-22T22:21:59.625Z" }, + { url = "https://files.pythonhosted.org/packages/12/fd/b0f48c4c320ee24c8c20df8b44acffb7353991ddf688af01eef5f93d7018/rpds_py-0.28.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd8d86b5d29d1b74100982424ba53e56033dc47720a6de9ba0259cf81d7cecaa", size = 223977, upload-time = "2025-10-22T22:22:01.092Z" }, + { url = "https://files.pythonhosted.org/packages/b4/21/c8e77a2ac66e2ec4e21f18a04b4e9a0417ecf8e61b5eaeaa9360a91713b4/rpds_py-0.28.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e27d3a5709cc2b3e013bf93679a849213c79ae0573f9b894b284b55e729e120", size = 217326, upload-time = "2025-10-22T22:22:02.944Z" }, + { url = "https://files.pythonhosted.org/packages/b8/5c/6c3936495003875fe7b14f90ea812841a08fca50ab26bd840e924097d9c8/rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f", size = 366439, upload-time = "2025-10-22T22:22:04.525Z" }, + { url = "https://files.pythonhosted.org/packages/56/f9/a0f1ca194c50aa29895b442771f036a25b6c41a35e4f35b1a0ea713bedae/rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424", size = 348170, upload-time = "2025-10-22T22:22:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/18/ea/42d243d3a586beb72c77fa5def0487daf827210069a95f36328e869599ea/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628", size = 378838, upload-time = "2025-10-22T22:22:07.932Z" }, + { url = "https://files.pythonhosted.org/packages/e7/78/3de32e18a94791af8f33601402d9d4f39613136398658412a4e0b3047327/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd", size = 393299, upload-time = "2025-10-22T22:22:09.435Z" }, + { url = "https://files.pythonhosted.org/packages/13/7e/4bdb435afb18acea2eb8a25ad56b956f28de7c59f8a1d32827effa0d4514/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e", size = 518000, upload-time = "2025-10-22T22:22:11.326Z" }, + { url = "https://files.pythonhosted.org/packages/31/d0/5f52a656875cdc60498ab035a7a0ac8f399890cc1ee73ebd567bac4e39ae/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a", size = 408746, upload-time = "2025-10-22T22:22:13.143Z" }, + { url = "https://files.pythonhosted.org/packages/3e/cd/49ce51767b879cde77e7ad9fae164ea15dce3616fe591d9ea1df51152706/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84", size = 386379, upload-time = "2025-10-22T22:22:14.602Z" }, + { url = "https://files.pythonhosted.org/packages/6a/99/e4e1e1ee93a98f72fc450e36c0e4d99c35370220e815288e3ecd2ec36a2a/rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66", size = 401280, upload-time = "2025-10-22T22:22:16.063Z" }, + { url = "https://files.pythonhosted.org/packages/61/35/e0c6a57488392a8b319d2200d03dad2b29c0db9996f5662c3b02d0b86c02/rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28", size = 412365, upload-time = "2025-10-22T22:22:17.504Z" }, + { url = "https://files.pythonhosted.org/packages/ff/6a/841337980ea253ec797eb084665436007a1aad0faac1ba097fb906c5f69c/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a", size = 559573, upload-time = "2025-10-22T22:22:19.108Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5e/64826ec58afd4c489731f8b00729c5f6afdb86f1df1df60bfede55d650bb/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5", size = 583973, upload-time = "2025-10-22T22:22:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/b6/ee/44d024b4843f8386a4eeaa4c171b3d31d55f7177c415545fd1a24c249b5d/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c", size = 553800, upload-time = "2025-10-22T22:22:22.25Z" }, + { url = "https://files.pythonhosted.org/packages/7d/89/33e675dccff11a06d4d85dbb4d1865f878d5020cbb69b2c1e7b2d3f82562/rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08", size = 216954, upload-time = "2025-10-22T22:22:24.105Z" }, + { url = "https://files.pythonhosted.org/packages/af/36/45f6ebb3210887e8ee6dbf1bc710ae8400bb417ce165aaf3024b8360d999/rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c", size = 227844, upload-time = "2025-10-22T22:22:25.551Z" }, + { url = "https://files.pythonhosted.org/packages/57/91/f3fb250d7e73de71080f9a221d19bd6a1c1eb0d12a1ea26513f6c1052ad6/rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd", size = 217624, upload-time = "2025-10-22T22:22:26.914Z" }, + { url = "https://files.pythonhosted.org/packages/d3/03/ce566d92611dfac0085c2f4b048cd53ed7c274a5c05974b882a908d540a2/rpds_py-0.28.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e9e184408a0297086f880556b6168fa927d677716f83d3472ea333b42171ee3b", size = 366235, upload-time = "2025-10-22T22:22:28.397Z" }, + { url = "https://files.pythonhosted.org/packages/00/34/1c61da1b25592b86fd285bd7bd8422f4c9d748a7373b46126f9ae792a004/rpds_py-0.28.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:edd267266a9b0448f33dc465a97cfc5d467594b600fe28e7fa2f36450e03053a", size = 348241, upload-time = "2025-10-22T22:22:30.171Z" }, + { url = "https://files.pythonhosted.org/packages/fc/00/ed1e28616848c61c493a067779633ebf4b569eccaacf9ccbdc0e7cba2b9d/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85beb8b3f45e4e32f6802fb6cd6b17f615ef6c6a52f265371fb916fae02814aa", size = 378079, upload-time = "2025-10-22T22:22:31.644Z" }, + { url = "https://files.pythonhosted.org/packages/11/b2/ccb30333a16a470091b6e50289adb4d3ec656fd9951ba8c5e3aaa0746a67/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2412be8d00a1b895f8ad827cc2116455196e20ed994bb704bf138fe91a42724", size = 393151, upload-time = "2025-10-22T22:22:33.453Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d0/73e2217c3ee486d555cb84920597480627d8c0240ff3062005c6cc47773e/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf128350d384b777da0e68796afdcebc2e9f63f0e9f242217754e647f6d32491", size = 517520, upload-time = "2025-10-22T22:22:34.949Z" }, + { url = "https://files.pythonhosted.org/packages/c4/91/23efe81c700427d0841a4ae7ea23e305654381831e6029499fe80be8a071/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2036d09b363aa36695d1cc1a97b36865597f4478470b0697b5ee9403f4fe399", size = 408699, upload-time = "2025-10-22T22:22:36.584Z" }, + { url = "https://files.pythonhosted.org/packages/ca/ee/a324d3198da151820a326c1f988caaa4f37fc27955148a76fff7a2d787a9/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8e1e9be4fa6305a16be628959188e4fd5cd6f1b0e724d63c6d8b2a8adf74ea6", size = 385720, upload-time = "2025-10-22T22:22:38.014Z" }, + { url = "https://files.pythonhosted.org/packages/19/ad/e68120dc05af8b7cab4a789fccd8cdcf0fe7e6581461038cc5c164cd97d2/rpds_py-0.28.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0a403460c9dd91a7f23fc3188de6d8977f1d9603a351d5db6cf20aaea95b538d", size = 401096, upload-time = "2025-10-22T22:22:39.869Z" }, + { url = "https://files.pythonhosted.org/packages/99/90/c1e070620042459d60df6356b666bb1f62198a89d68881816a7ed121595a/rpds_py-0.28.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7366b6553cdc805abcc512b849a519167db8f5e5c3472010cd1228b224265cb", size = 411465, upload-time = "2025-10-22T22:22:41.395Z" }, + { url = "https://files.pythonhosted.org/packages/68/61/7c195b30d57f1b8d5970f600efee72a4fad79ec829057972e13a0370fd24/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b43c6a3726efd50f18d8120ec0551241c38785b68952d240c45ea553912ac41", size = 558832, upload-time = "2025-10-22T22:22:42.871Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3d/06f3a718864773f69941d4deccdf18e5e47dd298b4628062f004c10f3b34/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0cb7203c7bc69d7c1585ebb33a2e6074492d2fc21ad28a7b9d40457ac2a51ab7", size = 583230, upload-time = "2025-10-22T22:22:44.877Z" }, + { url = "https://files.pythonhosted.org/packages/66/df/62fc783781a121e77fee9a21ead0a926f1b652280a33f5956a5e7833ed30/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a52a5169c664dfb495882adc75c304ae1d50df552fbd68e100fdc719dee4ff9", size = 553268, upload-time = "2025-10-22T22:22:46.441Z" }, + { url = "https://files.pythonhosted.org/packages/84/85/d34366e335140a4837902d3dea89b51f087bd6a63c993ebdff59e93ee61d/rpds_py-0.28.0-cp313-cp313-win32.whl", hash = "sha256:2e42456917b6687215b3e606ab46aa6bca040c77af7df9a08a6dcfe8a4d10ca5", size = 217100, upload-time = "2025-10-22T22:22:48.342Z" }, + { url = "https://files.pythonhosted.org/packages/3c/1c/f25a3f3752ad7601476e3eff395fe075e0f7813fbb9862bd67c82440e880/rpds_py-0.28.0-cp313-cp313-win_amd64.whl", hash = "sha256:e0a0311caedc8069d68fc2bf4c9019b58a2d5ce3cd7cb656c845f1615b577e1e", size = 227759, upload-time = "2025-10-22T22:22:50.219Z" }, + { url = "https://files.pythonhosted.org/packages/e0/d6/5f39b42b99615b5bc2f36ab90423ea404830bdfee1c706820943e9a645eb/rpds_py-0.28.0-cp313-cp313-win_arm64.whl", hash = "sha256:04c1b207ab8b581108801528d59ad80aa83bb170b35b0ddffb29c20e411acdc1", size = 217326, upload-time = "2025-10-22T22:22:51.647Z" }, + { url = "https://files.pythonhosted.org/packages/5c/8b/0c69b72d1cee20a63db534be0df271effe715ef6c744fdf1ff23bb2b0b1c/rpds_py-0.28.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f296ea3054e11fc58ad42e850e8b75c62d9a93a9f981ad04b2e5ae7d2186ff9c", size = 355736, upload-time = "2025-10-22T22:22:53.211Z" }, + { url = "https://files.pythonhosted.org/packages/f7/6d/0c2ee773cfb55c31a8514d2cece856dd299170a49babd50dcffb15ddc749/rpds_py-0.28.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5a7306c19b19005ad98468fcefeb7100b19c79fc23a5f24a12e06d91181193fa", size = 342677, upload-time = "2025-10-22T22:22:54.723Z" }, + { url = "https://files.pythonhosted.org/packages/e2/1c/22513ab25a27ea205144414724743e305e8153e6abe81833b5e678650f5a/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d9b86aa501fed9862a443c5c3116f6ead8bc9296185f369277c42542bd646b", size = 371847, upload-time = "2025-10-22T22:22:56.295Z" }, + { url = "https://files.pythonhosted.org/packages/60/07/68e6ccdb4b05115ffe61d31afc94adef1833d3a72f76c9632d4d90d67954/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e5bbc701eff140ba0e872691d573b3d5d30059ea26e5785acba9132d10c8c31d", size = 381800, upload-time = "2025-10-22T22:22:57.808Z" }, + { url = "https://files.pythonhosted.org/packages/73/bf/6d6d15df80781d7f9f368e7c1a00caf764436518c4877fb28b029c4624af/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5690671cd672a45aa8616d7374fdf334a1b9c04a0cac3c854b1136e92374fe", size = 518827, upload-time = "2025-10-22T22:22:59.826Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d3/2decbb2976cc452cbf12a2b0aaac5f1b9dc5dd9d1f7e2509a3ee00421249/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f1d92ecea4fa12f978a367c32a5375a1982834649cdb96539dcdc12e609ab1a", size = 399471, upload-time = "2025-10-22T22:23:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/b1/2c/f30892f9e54bd02e5faca3f6a26d6933c51055e67d54818af90abed9748e/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d252db6b1a78d0a3928b6190156042d54c93660ce4d98290d7b16b5296fb7cc", size = 377578, upload-time = "2025-10-22T22:23:03.52Z" }, + { url = "https://files.pythonhosted.org/packages/f0/5d/3bce97e5534157318f29ac06bf2d279dae2674ec12f7cb9c12739cee64d8/rpds_py-0.28.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d61b355c3275acb825f8777d6c4505f42b5007e357af500939d4a35b19177259", size = 390482, upload-time = "2025-10-22T22:23:05.391Z" }, + { url = "https://files.pythonhosted.org/packages/e3/f0/886bd515ed457b5bd93b166175edb80a0b21a210c10e993392127f1e3931/rpds_py-0.28.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:acbe5e8b1026c0c580d0321c8aae4b0a1e1676861d48d6e8c6586625055b606a", size = 402447, upload-time = "2025-10-22T22:23:06.93Z" }, + { url = "https://files.pythonhosted.org/packages/42/b5/71e8777ac55e6af1f4f1c05b47542a1eaa6c33c1cf0d300dca6a1c6e159a/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8aa23b6f0fc59b85b4c7d89ba2965af274346f738e8d9fc2455763602e62fd5f", size = 552385, upload-time = "2025-10-22T22:23:08.557Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cb/6ca2d70cbda5a8e36605e7788c4aa3bea7c17d71d213465a5a675079b98d/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7b14b0c680286958817c22d76fcbca4800ddacef6f678f3a7c79a1fe7067fe37", size = 575642, upload-time = "2025-10-22T22:23:10.348Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d4/407ad9960ca7856d7b25c96dcbe019270b5ffdd83a561787bc682c797086/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bcf1d210dfee61a6c86551d67ee1031899c0fdbae88b2d44a569995d43797712", size = 544507, upload-time = "2025-10-22T22:23:12.434Z" }, + { url = "https://files.pythonhosted.org/packages/51/31/2f46fe0efcac23fbf5797c6b6b7e1c76f7d60773e525cb65fcbc582ee0f2/rpds_py-0.28.0-cp313-cp313t-win32.whl", hash = "sha256:3aa4dc0fdab4a7029ac63959a3ccf4ed605fee048ba67ce89ca3168da34a1342", size = 205376, upload-time = "2025-10-22T22:23:13.979Z" }, + { url = "https://files.pythonhosted.org/packages/92/e4/15947bda33cbedfc134490a41841ab8870a72a867a03d4969d886f6594a2/rpds_py-0.28.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7b7d9d83c942855e4fdcfa75d4f96f6b9e272d42fffcb72cd4bb2577db2e2907", size = 215907, upload-time = "2025-10-22T22:23:15.5Z" }, + { url = "https://files.pythonhosted.org/packages/08/47/ffe8cd7a6a02833b10623bf765fbb57ce977e9a4318ca0e8cf97e9c3d2b3/rpds_py-0.28.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:dcdcb890b3ada98a03f9f2bb108489cdc7580176cb73b4f2d789e9a1dac1d472", size = 353830, upload-time = "2025-10-22T22:23:17.03Z" }, + { url = "https://files.pythonhosted.org/packages/f9/9f/890f36cbd83a58491d0d91ae0db1702639edb33fb48eeb356f80ecc6b000/rpds_py-0.28.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f274f56a926ba2dc02976ca5b11c32855cbd5925534e57cfe1fda64e04d1add2", size = 341819, upload-time = "2025-10-22T22:23:18.57Z" }, + { url = "https://files.pythonhosted.org/packages/09/e3/921eb109f682aa24fb76207698fbbcf9418738f35a40c21652c29053f23d/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fe0438ac4a29a520ea94c8c7f1754cdd8feb1bc490dfda1bfd990072363d527", size = 373127, upload-time = "2025-10-22T22:23:20.216Z" }, + { url = "https://files.pythonhosted.org/packages/23/13/bce4384d9f8f4989f1a9599c71b7a2d877462e5fd7175e1f69b398f729f4/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a358a32dd3ae50e933347889b6af9a1bdf207ba5d1a3f34e1a38cd3540e6733", size = 382767, upload-time = "2025-10-22T22:23:21.787Z" }, + { url = "https://files.pythonhosted.org/packages/23/e1/579512b2d89a77c64ccef5a0bc46a6ef7f72ae0cf03d4b26dcd52e57ee0a/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e80848a71c78aa328fefaba9c244d588a342c8e03bda518447b624ea64d1ff56", size = 517585, upload-time = "2025-10-22T22:23:23.699Z" }, + { url = "https://files.pythonhosted.org/packages/62/3c/ca704b8d324a2591b0b0adcfcaadf9c862375b11f2f667ac03c61b4fd0a6/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f586db2e209d54fe177e58e0bc4946bea5fb0102f150b1b2f13de03e1f0976f8", size = 399828, upload-time = "2025-10-22T22:23:25.713Z" }, + { url = "https://files.pythonhosted.org/packages/da/37/e84283b9e897e3adc46b4c88bb3f6ec92a43bd4d2f7ef5b13459963b2e9c/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae8ee156d6b586e4292491e885d41483136ab994e719a13458055bec14cf370", size = 375509, upload-time = "2025-10-22T22:23:27.32Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c2/a980beab869d86258bf76ec42dec778ba98151f253a952b02fe36d72b29c/rpds_py-0.28.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:a805e9b3973f7e27f7cab63a6b4f61d90f2e5557cff73b6e97cd5b8540276d3d", size = 392014, upload-time = "2025-10-22T22:23:29.332Z" }, + { url = "https://files.pythonhosted.org/packages/da/b5/b1d3c5f9d3fa5aeef74265f9c64de3c34a0d6d5cd3c81c8b17d5c8f10ed4/rpds_py-0.28.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5d3fd16b6dc89c73a4da0b4ac8b12a7ecc75b2864b95c9e5afed8003cb50a728", size = 402410, upload-time = "2025-10-22T22:23:31.14Z" }, + { url = "https://files.pythonhosted.org/packages/74/ae/cab05ff08dfcc052afc73dcb38cbc765ffc86f94e966f3924cd17492293c/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6796079e5d24fdaba6d49bda28e2c47347e89834678f2bc2c1b4fc1489c0fb01", size = 553593, upload-time = "2025-10-22T22:23:32.834Z" }, + { url = "https://files.pythonhosted.org/packages/70/80/50d5706ea2a9bfc9e9c5f401d91879e7c790c619969369800cde202da214/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:76500820c2af232435cbe215e3324c75b950a027134e044423f59f5b9a1ba515", size = 576925, upload-time = "2025-10-22T22:23:34.47Z" }, + { url = "https://files.pythonhosted.org/packages/ab/12/85a57d7a5855a3b188d024b099fd09c90db55d32a03626d0ed16352413ff/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bbdc5640900a7dbf9dd707fe6388972f5bbd883633eb68b76591044cfe346f7e", size = 542444, upload-time = "2025-10-22T22:23:36.093Z" }, + { url = "https://files.pythonhosted.org/packages/6c/65/10643fb50179509150eb94d558e8837c57ca8b9adc04bd07b98e57b48f8c/rpds_py-0.28.0-cp314-cp314-win32.whl", hash = "sha256:adc8aa88486857d2b35d75f0640b949759f79dc105f50aa2c27816b2e0dd749f", size = 207968, upload-time = "2025-10-22T22:23:37.638Z" }, + { url = "https://files.pythonhosted.org/packages/b4/84/0c11fe4d9aaea784ff4652499e365963222481ac647bcd0251c88af646eb/rpds_py-0.28.0-cp314-cp314-win_amd64.whl", hash = "sha256:66e6fa8e075b58946e76a78e69e1a124a21d9a48a5b4766d15ba5b06869d1fa1", size = 218876, upload-time = "2025-10-22T22:23:39.179Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e0/3ab3b86ded7bb18478392dc3e835f7b754cd446f62f3fc96f4fe2aca78f6/rpds_py-0.28.0-cp314-cp314-win_arm64.whl", hash = "sha256:a6fe887c2c5c59413353b7c0caff25d0e566623501ccfff88957fa438a69377d", size = 212506, upload-time = "2025-10-22T22:23:40.755Z" }, + { url = "https://files.pythonhosted.org/packages/51/ec/d5681bb425226c3501eab50fc30e9d275de20c131869322c8a1729c7b61c/rpds_py-0.28.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7a69df082db13c7070f7b8b1f155fa9e687f1d6aefb7b0e3f7231653b79a067b", size = 355433, upload-time = "2025-10-22T22:23:42.259Z" }, + { url = "https://files.pythonhosted.org/packages/be/ec/568c5e689e1cfb1ea8b875cffea3649260955f677fdd7ddc6176902d04cd/rpds_py-0.28.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b1cde22f2c30ebb049a9e74c5374994157b9b70a16147d332f89c99c5960737a", size = 342601, upload-time = "2025-10-22T22:23:44.372Z" }, + { url = "https://files.pythonhosted.org/packages/32/fe/51ada84d1d2a1d9d8f2c902cfddd0133b4a5eb543196ab5161d1c07ed2ad/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5338742f6ba7a51012ea470bd4dc600a8c713c0c72adaa0977a1b1f4327d6592", size = 372039, upload-time = "2025-10-22T22:23:46.025Z" }, + { url = "https://files.pythonhosted.org/packages/07/c1/60144a2f2620abade1a78e0d91b298ac2d9b91bc08864493fa00451ef06e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1460ebde1bcf6d496d80b191d854adedcc619f84ff17dc1c6d550f58c9efbba", size = 382407, upload-time = "2025-10-22T22:23:48.098Z" }, + { url = "https://files.pythonhosted.org/packages/45/ed/091a7bbdcf4038a60a461df50bc4c82a7ed6d5d5e27649aab61771c17585/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e3eb248f2feba84c692579257a043a7699e28a77d86c77b032c1d9fbb3f0219c", size = 518172, upload-time = "2025-10-22T22:23:50.16Z" }, + { url = "https://files.pythonhosted.org/packages/54/dd/02cc90c2fd9c2ef8016fd7813bfacd1c3a1325633ec8f244c47b449fc868/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3bbba5def70b16cd1c1d7255666aad3b290fbf8d0fe7f9f91abafb73611a91", size = 399020, upload-time = "2025-10-22T22:23:51.81Z" }, + { url = "https://files.pythonhosted.org/packages/ab/81/5d98cc0329bbb911ccecd0b9e19fbf7f3a5de8094b4cda5e71013b2dd77e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3114f4db69ac5a1f32e7e4d1cbbe7c8f9cf8217f78e6e002cedf2d54c2a548ed", size = 377451, upload-time = "2025-10-22T22:23:53.711Z" }, + { url = "https://files.pythonhosted.org/packages/b4/07/4d5bcd49e3dfed2d38e2dcb49ab6615f2ceb9f89f5a372c46dbdebb4e028/rpds_py-0.28.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4b0cb8a906b1a0196b863d460c0222fb8ad0f34041568da5620f9799b83ccf0b", size = 390355, upload-time = "2025-10-22T22:23:55.299Z" }, + { url = "https://files.pythonhosted.org/packages/3f/79/9f14ba9010fee74e4f40bf578735cfcbb91d2e642ffd1abe429bb0b96364/rpds_py-0.28.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf681ac76a60b667106141e11a92a3330890257e6f559ca995fbb5265160b56e", size = 403146, upload-time = "2025-10-22T22:23:56.929Z" }, + { url = "https://files.pythonhosted.org/packages/39/4c/f08283a82ac141331a83a40652830edd3a4a92c34e07e2bbe00baaea2f5f/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1e8ee6413cfc677ce8898d9cde18cc3a60fc2ba756b0dec5b71eb6eb21c49fa1", size = 552656, upload-time = "2025-10-22T22:23:58.62Z" }, + { url = "https://files.pythonhosted.org/packages/61/47/d922fc0666f0dd8e40c33990d055f4cc6ecff6f502c2d01569dbed830f9b/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b3072b16904d0b5572a15eb9d31c1954e0d3227a585fc1351aa9878729099d6c", size = 576782, upload-time = "2025-10-22T22:24:00.312Z" }, + { url = "https://files.pythonhosted.org/packages/d3/0c/5bafdd8ccf6aa9d3bfc630cfece457ff5b581af24f46a9f3590f790e3df2/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b670c30fd87a6aec281c3c9896d3bae4b205fd75d79d06dc87c2503717e46092", size = 544671, upload-time = "2025-10-22T22:24:02.297Z" }, + { url = "https://files.pythonhosted.org/packages/2c/37/dcc5d8397caa924988693519069d0beea077a866128719351a4ad95e82fc/rpds_py-0.28.0-cp314-cp314t-win32.whl", hash = "sha256:8014045a15b4d2b3476f0a287fcc93d4f823472d7d1308d47884ecac9e612be3", size = 205749, upload-time = "2025-10-22T22:24:03.848Z" }, + { url = "https://files.pythonhosted.org/packages/d7/69/64d43b21a10d72b45939a28961216baeb721cc2a430f5f7c3bfa21659a53/rpds_py-0.28.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a4e59c90d9c27c561eb3160323634a9ff50b04e4f7820600a2beb0ac90db578", size = 216233, upload-time = "2025-10-22T22:24:05.471Z" }, + { url = "https://files.pythonhosted.org/packages/ae/bc/b43f2ea505f28119bd551ae75f70be0c803d2dbcd37c1b3734909e40620b/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f5e7101145427087e493b9c9b959da68d357c28c562792300dd21a095118ed16", size = 363913, upload-time = "2025-10-22T22:24:07.129Z" }, + { url = "https://files.pythonhosted.org/packages/28/f2/db318195d324c89a2c57dc5195058cbadd71b20d220685c5bd1da79ee7fe/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:31eb671150b9c62409a888850aaa8e6533635704fe2b78335f9aaf7ff81eec4d", size = 350452, upload-time = "2025-10-22T22:24:08.754Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f2/1391c819b8573a4898cedd6b6c5ec5bc370ce59e5d6bdcebe3c9c1db4588/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b55c1f64482f7d8bd39942f376bfdf2f6aec637ee8c805b5041e14eeb771db", size = 380957, upload-time = "2025-10-22T22:24:10.826Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5c/e5de68ee7eb7248fce93269833d1b329a196d736aefb1a7481d1e99d1222/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24743a7b372e9a76171f6b69c01aedf927e8ac3e16c474d9fe20d552a8cb45c7", size = 391919, upload-time = "2025-10-22T22:24:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/fb/4f/2376336112cbfeb122fd435d608ad8d5041b3aed176f85a3cb32c262eb80/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:389c29045ee8bbb1627ea190b4976a310a295559eaf9f1464a1a6f2bf84dde78", size = 528541, upload-time = "2025-10-22T22:24:14.197Z" }, + { url = "https://files.pythonhosted.org/packages/68/53/5ae232e795853dd20da7225c5dd13a09c0a905b1a655e92bdf8d78a99fd9/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23690b5827e643150cf7b49569679ec13fe9a610a15949ed48b85eb7f98f34ec", size = 405629, upload-time = "2025-10-22T22:24:16.001Z" }, + { url = "https://files.pythonhosted.org/packages/b9/2d/351a3b852b683ca9b6b8b38ed9efb2347596973849ba6c3a0e99877c10aa/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0c9266c26580e7243ad0d72fc3e01d6b33866cfab5084a6da7576bcf1c4f72", size = 384123, upload-time = "2025-10-22T22:24:17.585Z" }, + { url = "https://files.pythonhosted.org/packages/e0/15/870804daa00202728cc91cb8e2385fa9f1f4eb49857c49cfce89e304eae6/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4c6c4db5d73d179746951486df97fd25e92396be07fc29ee8ff9a8f5afbdfb27", size = 400923, upload-time = "2025-10-22T22:24:19.512Z" }, + { url = "https://files.pythonhosted.org/packages/53/25/3706b83c125fa2a0bccceac951de3f76631f6bd0ee4d02a0ed780712ef1b/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3b695a8fa799dd2cfdb4804b37096c5f6dba1ac7f48a7fbf6d0485bcd060316", size = 413767, upload-time = "2025-10-22T22:24:21.316Z" }, + { url = "https://files.pythonhosted.org/packages/ef/f9/ce43dbe62767432273ed2584cef71fef8411bddfb64125d4c19128015018/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:6aa1bfce3f83baf00d9c5fcdbba93a3ab79958b4c7d7d1f55e7fe68c20e63912", size = 561530, upload-time = "2025-10-22T22:24:22.958Z" }, + { url = "https://files.pythonhosted.org/packages/46/c9/ffe77999ed8f81e30713dd38fd9ecaa161f28ec48bb80fa1cd9118399c27/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b0f9dceb221792b3ee6acb5438eb1f02b0cb2c247796a72b016dcc92c6de829", size = 585453, upload-time = "2025-10-22T22:24:24.779Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d2/4a73b18821fd4669762c855fd1f4e80ceb66fb72d71162d14da58444a763/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f", size = 552199, upload-time = "2025-10-22T22:24:26.54Z" }, ] [[package]] From 3bf9b874c32ebbbaa6f895be988e04a19fdce7ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 12:41:25 +0000 Subject: [PATCH 036/248] cp: !4298 - ci: Refactor testsytem - Removal of JET Artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 212 +++++------------- .github/workflows/cicd-main.yml | 128 +++++++++-- pyproject.toml | 6 +- .../shell_test_utils/run_ci_test.sh | 8 +- .../shell_test_utils/start_interactive_job.sh | 50 +---- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 4 +- .../bert/bert_release/model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 1 + .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 3 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 5 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../model_config.yaml | 4 +- .../t5/t5_release/model_config.yaml | 2 +- .../generate_jet_trigger_job.py | 6 +- .../python_scripts/generate_local_jobs.py | 6 +- .../python_scripts/launch_jet_workload.py | 8 +- .../launch_nemo_run_workload.py | 52 +++-- tests/test_utils/python_scripts/notify.py | 11 - .../{common.py => recipe_parser.py} | 39 +++- .../{common.yaml => ckpt_converter.yaml} | 0 .../gpt-dynamic-inference-cuda-graphs.yaml | 5 +- ...pt-dynamic-inference-with-coordinator.yaml | 11 +- .../recipes/gpt-dynamic-inference.yaml | 18 +- tests/test_utils/recipes/gpt-grads.yaml | 11 +- tests/test_utils/recipes/gpt-nemo.yaml | 14 +- .../recipes/gpt-static-inference.yaml | 21 +- tests/test_utils/recipes/gpt.yaml | 59 ++--- .../recipes/mamba-static-inference.yaml | 10 +- tests/test_utils/recipes/mamba.yaml | 12 +- tests/test_utils/recipes/mimo.yaml | 8 +- .../recipes/moe-dynamic-inference.yaml | 11 +- .../recipes/moe-static-inference.yaml | 8 +- tests/test_utils/recipes/moe.yaml | 14 +- .../test_utils/recipes/multimodal-llava.yaml | 6 +- uv.lock | 68 +++--- 252 files changed, 698 insertions(+), 751 deletions(-) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => bert_mcore_tp1_pp2}/model_config.yaml (88%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 => bert_mcore_tp1_pp4_vp2}/model_config.yaml (89%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_dgx_a100_1N8G => bert_mcore_tp2_pp2}/model_config.yaml (88%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_frozen_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_frozen_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_frozen_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_frozen_resume_torch_dist}/model_config.yaml (89%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_local_spec}/model_config.yaml (88%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist}/model_config.yaml (84%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G => bert_mcore_tp2_pp2_resume_torch_dist_local_spec}/model_config.yaml (84%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/bert/{bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => bert_mcore_tp4_pp1}/model_config.yaml (88%) rename tests/test_utils/python_scripts/{common.py => recipe_parser.py} (89%) rename tests/test_utils/recipes/{common.yaml => ckpt_converter.yaml} (100%) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index b9a02e1e3f5..8b7fd373a98 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -15,6 +15,9 @@ name: "Test Template" description: "Template for running NeMo tests in a containerized environment" inputs: + container-image: + description: "Container image to use for test" + required: true timeout: description: "Max runtime of test in minutes" required: false @@ -46,83 +49,44 @@ inputs: runs: using: "composite" steps: - - name: Copy data - shell: bash - if: inputs.is_unit_test == 'false' - env: - SOURCE_DIR: /mnt/datadrive/TestData/megatron-lm/artifacts - TARGET_DIR: /home/runner/_work/TestData/megatron-lm/artifacts - MODEL: ${{ inputs.model }} - run: | - mkdir -p $TARGET_DIR/text/data/ - - if [[ "$MODEL" == "bert" ]]; then - mkdir -p $TARGET_DIR/text/the_pile/bert_shard00/ - cp -a $SOURCE_DIR/text/the_pile/bert_shard00/. $TARGET_DIR/text/data/ - elif [[ "$MODEL" == "gpt" ]] || [[ "$MODEL" == "moe" ]]; then - cp -a $SOURCE_DIR/text/the_pile/shard00/. $TARGET_DIR/text/data/ - fi - - - name: Install curl, sudo - shell: bash - run: | - sudo apt-get update - sudo apt-get install -y curl uuid-runtime - - name: Checkout repository uses: actions/checkout@v2 - with: - path: ${{ github.workspace }}/Megatron-LM - - - name: Cache uv - uses: actions/cache@v4 - id: cache - with: - path: cache-mount - key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }} - restore-keys: | - ${{ runner.os }}-uv- - - name: Restore Docker cache mounts - uses: reproducible-containers/buildkit-cache-dance@5b81f4d29dc8397a7d341dba3aeecc7ec54d6361 - with: - cache-dir: cache-mount - dockerfile: docker/Dockerfile.ci.dev - skip-extraction: ${{ steps.cache.outputs.cache-hit }} + - name: Change ownership of /home/runner/ + shell: bash + run: sudo chown -R $(whoami) /home/runner/ - name: Setup python uses: actions/setup-python@v5 with: python-version: 3.12 - - name: Download test data - shell: bash - env: - GH_TOKEN: ${{ inputs.PAT }} - TIMEOUT: ${{ inputs.timeout }} - IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }} + - name: Install uuidgen + shell: bash -x -e -u -o pipefail {0} run: | - echo "::group::Download test data" - pip install --no-cache-dir pygithub click - python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets - echo "::endgroup::" + apt-get update + apt-get install -y uuid-runtime - name: Create run-script (unit test) - shell: bash + shell: bash -x -e -u -o pipefail {0} if: inputs.is_unit_test == 'true' run: | echo "::group::Create run-script" cmd=$(cat <<'RUN_TEST_EOF' #!/bin/bash - docker exec -t test_container_${{ github.run_id }} bash -c ' - set -e - bash /opt/megatron-lm/tests/unit_tests/run_ci_test.sh \ - --tag ${{ inputs.tag }} \ - --environment dev \ - --bucket '\''${{ inputs.test_case }}'\'' \ - --log-dir /opt/megatron-lm/outputs/logs - ' + export PYTHONPATH=$(pwd) + export NEMORUN_HOME=$(pwd) + pip install --no-cache-dir uv + uv sync --only-group test + uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ + --scope unit-tests \ + --model unit-tests \ + --test-case '${{ inputs.test_case }}' \ + --environment dev \ + --platform dgx_h100 \ + --tag ${{ inputs.tag }} \ + --container-image ${{ inputs.container-image }} RUN_TEST_EOF ) @@ -130,7 +94,7 @@ runs: echo "::endgroup::" - name: Create run-script (e2e test) - shell: bash + shell: bash -x -e -u -o pipefail {0} if: inputs.is_unit_test == 'false' env: MODEL: ${{ inputs.model }} @@ -138,118 +102,64 @@ runs: echo "::group::Create run-script" cmd=$(cat <<'RUN_TEST_EOF' #!/bin/bash - - - - docker exec -t test_container_${{ github.run_id }} bash -c ' - - set -e - ls -al /workspace/data - - if [[ "${{ inputs.model }}" == "bert" ]]; then - TRAINING_SCRIPT_PATH=pretrain_bert.py - elif [[ "${{ inputs.model }}" == "gpt" ]] || [[ "${{ inputs.model }}" == "moe" ]]; then - TRAINING_SCRIPT_PATH=pretrain_gpt.py - fi - - ARGUMENTS=( - "DATA_PATH=/workspace/data" - "DATA_CACHE_PATH=/workspace/data/cache" - "OUTPUT_PATH=$(pwd)/outputs/" - "TENSORBOARD_PATH=$(pwd)/tensorboard" - "CHECKPOINT_SAVE_PATH=$(pwd)/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" - "TRAINING_SCRIPT_PATH=$TRAINING_SCRIPT_PATH" - "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/golden_values_dev_dgx_h100.json" - "N_REPEAT=5" - "ENABLE_LIGHTWEIGHT_MODE=false" - "RECORD_CHECKPOINTS=false" - ) - - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]} - ' + set -euxo pipefail + + export PYTHONPATH=$(pwd) + export NEMORUN_HOME=$(pwd) + pip install --no-cache-dir uv + uv sync --only-group test + uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ + --scope mr \ + --model ${{ inputs.model }} \ + --test-case ${{ inputs.test_case }} \ + --environment dev \ + --platform dgx_h100 \ + --container-image ${{ inputs.container-image }} \ + --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts RUN_TEST_EOF ) echo "$cmd" | tee "job.sh" echo "::endgroup::" - - name: Build container - shell: bash - env: - GH_TOKEN: ${{ inputs.PAT }} - run: | - echo "::group::Build test container" - docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core . - echo "::endgroup::" - - - name: Start container - shell: bash - run: | - echo "::group::Start test container" - set -x - - cmd=$(cat < functional-tests.json + + echo "functional-tests=$(cat functional-tests.json)" | tee -a "$GITHUB_OUTPUT" cicd-functional-tests-latest: strategy: fail-fast: false matrix: - include: - - model: "gpt" - test_case: "gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G" - - model: "gpt" - test_case: "gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G" - - model: "moe" - test_case: "gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer" - - model: "moe" - test_case: "gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed" + include: ${{ fromJson(needs.cicd-parse-functional-tests.outputs.functional-tests) }} needs: - pre-flight - cicd-wait-in-queue - - cicd-unit-tests-latest + - cicd-parse-functional-tests + # - cicd-unit-tests-latest runs-on: nvidia-ci-aws-gpu-x8 name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" environment: nemo-ci @@ -149,7 +246,7 @@ jobs: || needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.force_run_all == 'true' ) - && !needs.pre-flight.outputs.is_merge_group == 'true' + && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: - name: Checkout @@ -163,6 +260,7 @@ jobs: timeout: ${{ matrix.timeout || 30 }} is_unit_test: "false" PAT: ${{ secrets.PAT }} + container-image: 766267172432.dkr.ecr.us-east-1.amazonaws.com/megatron-lm:1864 # ${{ github.sha }} Nemo_CICD_Test: needs: @@ -243,7 +341,7 @@ jobs: && !cancelled() strategy: matrix: - flag: [unit-test, e2e] + flag: [unit-test] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 91d66de7efe..aaabab3875c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ dev = [ "wget", "onnxscript", "flash-linear-attention~=0.3.2", - "emerging_optimizers" + "emerging_optimizers", ] lts = [ @@ -170,8 +170,8 @@ flash_mla = [ ] transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.8" } # on `release_v2.8` -emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev= "fb1add873e7851ec34b48581ea1b15761b73d189"} - +emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "fb1add873e7851ec34b48581ea1b15761b73d189" } +nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" } [tool.isort] profile = "black" # black-compatible line_length = 100 # should match black parameters diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index b24423773e5..75cb4e619e7 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -48,6 +48,8 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do fi done +set -exo pipefail + # Extract settings from params file TEST_TYPE=$(cat $TRAINING_PARAMS_PATH | /usr/local/bin/yq '.TEST_TYPE') @@ -64,7 +66,7 @@ else fi mkdir -p $CHECKPOINT_SAVE_PATH -mkdir -p $CHECKPOINT_LOAD_PATH +mkdir -p $CHECKPOINT_LOAD_PATH || true _CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH _CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH @@ -103,6 +105,10 @@ if [[ "$MODE" == "pretraining" && "$TEST_TYPE" != "release" ]]; then TRAIN_ITERS=$(cat $TRAINING_PARAMS_PATH | /usr/local/bin/yq '.MODEL_ARGS."--exit-interval" // "100"') fi +elif [[ "$MODE" == "inference" && "$TEST_TYPE" != "release" ]]; then + if [[ "$ENABLE_LIGHTWEIGHT_MODE" == "true" && "$IS_NEMO_TEST" == "false" ]]; then + /usr/local/bin/yq -i '.ENV_VARS."SKIP_PYTEST" = 1' $TRAINING_PARAMS_PATH + fi fi if [[ "$MODE" == "pretraining" && "$TEST_TYPE" = "release" ]]; then diff --git a/tests/functional_tests/shell_test_utils/start_interactive_job.sh b/tests/functional_tests/shell_test_utils/start_interactive_job.sh index d3b6055e55b..0b30fc01283 100644 --- a/tests/functional_tests/shell_test_utils/start_interactive_job.sh +++ b/tests/functional_tests/shell_test_utils/start_interactive_job.sh @@ -78,56 +78,8 @@ if [ -z "$PARTITION" ] || [ -z "$SLURM_ACCOUNT" ] || [ -z "$IMAGE" ] || [ -z "$D exit 1 fi -# Check if recipes directory exists -if [ ! -d "$RECIPES_DIR" ]; then - echo "Error: Recipes directory '$RECIPES_DIR' does not exist" - exit 1 -fi - -# Create copy of recipes with interpolated artifacts -python -m tests.test_utils.python_scripts.common --recipes-dir $RECIPES_DIR --output-dir $RECIPES_DIR/interpolated - # Add current directory to container mounts -CONTAINER_MOUNTS="$(pwd):/opt/megatron-lm" - -# Process each YAML file in the recipes directory -if [ ! -f "$YAML_FILE" ]; then - continue -fi - -echo "Processing $(basename "$YAML_FILE")..." -YAML_FILE=workflows.yaml -# Extract artifacts from YAML file -while IFS=: read -r value key; do - # Skip empty or malformed entries - if [ -z "$value" ] || [ -z "$key" ] || [ "$value" = "/data/" ] || [ "$key" = "/data/" ]; then - continue - fi - - # Skip entries that don't start with a forward slash - if [[ ! "$key" =~ ^/ ]]; then - continue - fi - - # Create the mount string - mount="${DATASET_DIR}/${value}:${key}" - - # Skip if we've seen this mount before - if [ "${seen_mounts[$mount]}" = "1" ]; then - echo "Skipping duplicate mount: $mount" - continue - fi - - # Mark this mount as seen - seen_mounts[$mount]=1 - - if [ -z "$CONTAINER_MOUNTS" ]; then - CONTAINER_MOUNTS="$mount" - else - CONTAINER_MOUNTS="${CONTAINER_MOUNTS},$mount" - fi -done < <(yq eval '.[].spec.artifacts | to_entries | .[] | "\(.value):\(.key)"' "$YAML_FILE") -rm $YAML_FILE +CONTAINER_MOUNTS="$DATASET_DIR:/mnt/artifacts,$(pwd):/opt/megatron-lm" # Build the final srun command SRUN_CMD="srun \ diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml similarity index 88% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml index 7ccfd215dcc..ede505eb2f4 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml index b4c5decf82e..e606d04a88c 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml similarity index 88% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml index 11909062fb8..e7bb67a9ed8 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml index 09864ee106a..6f38457cdd0 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml similarity index 88% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml index 7eeac331ad3..def6878c889 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml similarity index 84% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml index 94d9cbfd83f..8b993bfaec3 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 @@ -42,6 +42,6 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --attention-backend: unfused TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml similarity index 84% rename from tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml index c496f84f196..05a3d0730c8 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 @@ -40,7 +40,7 @@ MODEL_ARGS: --use-checkpoint-args: true --use-checkpoint-opt_param-scheduler: true --no-gradient-accumulation-fusion: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml similarity index 88% rename from tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml index 59607ba28d4..777be078e4d 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml @@ -22,8 +22,8 @@ MODEL_ARGS: --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.0001 diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml index c4b80767c63..68cbb230996 100644 --- a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml @@ -27,7 +27,7 @@ MODEL_ARGS: --pipeline-model-parallel-size: 8 # Data args --data-path: ${DATA_BLEND} - --vocab-file: ${DATA_PATH}/vocab.txt + --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt --split: 949,50,1 --data-cache-path: ${DATA_CACHE_PATH} # EVAL_AND_LOGGING_ARGS diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index c2d14870924..208827c9aea 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml index 3b8c3563f41..15fbeb4f986 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 4c7132e2d1c..573cddceff0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index 0be73f09e67..f897d2b9a8e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml index eac35eeb2ab..7345237d672 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -43,7 +43,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml index d5960cff7ac..e15844bafb7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml index ee577dda37a..c7dfcfe48e3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml index 60bf33c7e78..e829340190e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 33da65bd2b7..863cf9cac25 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -43,7 +43,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml index b57638bcd80..fcb9fa2884f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 6070ad5e039..0e32dbd913a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml index 387f03d450d..246fb33da57 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml index 967567958f0..196492f1ec7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 1b5de4373f6..665388ce7a1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml index ccff1cf44fd..f4cbb87d27d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 7fe999b2a6a..80218da886d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -20,7 +20,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml index 0e243b61138..96b4a6c0ccc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml index 453c506742b..c46be1c819b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -42,7 +42,7 @@ MODEL_ARGS: --deterministic-mode: true --no-gradient-accumulation-fusion: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml index 8211c7f40f6..c151135828d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --fp8-amax-compute-algo: max --attention-softmax-in-fp32: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml index cf4fe01721c..40dea9779c9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml index 51475b1a653..fb47009a77d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml index 02db21e9477..32dd88dfb72 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml index 3f650edfa8a..21c6ac25e83 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --fp8-amax-compute-algo: max --attention-softmax-in-fp32: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml index 95e4fd5b48e..59707f588c0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -51,7 +51,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml index a38d289752f..0e62673a628 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml @@ -20,7 +20,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index bbbcf96b674..4361bf233cd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 01736c68999..ed56bc7cfad 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml index 9bd15f98877..fe4a6575953 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --fp8-amax-compute-algo: max --attention-softmax-in-fp32: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml index 48cf5e1cfac..c2a26a070fb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml @@ -58,6 +58,7 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true + --exit-interval: 25 TEST_TYPE: regular METRICS: - "iteration-time" diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml index 9b641b68d75..14d585d84a7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml @@ -63,7 +63,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml index d18a37d7823..df91f9a95eb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml @@ -62,7 +62,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml index 3258e398b1e..849df09f27f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml @@ -63,7 +63,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml index 5fd21f6175a..3316142031f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml @@ -62,7 +62,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml index 65bdc723480..4b8d6a47b9c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml @@ -63,7 +63,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml index fd313d7a959..43937abe664 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml @@ -62,7 +62,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml index 476d0e08cf1..e9c35d0e86d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml @@ -59,7 +59,6 @@ BASE_MODEL_ARGS: &BASE_MODEL_ARGS --num-query-groups: 8 --seq-length: 512 --kv-channels: 128 - --ffn-hidden-size: 8192 --group-query-attention: true --normalization: RMSNorm --swiglu: true @@ -90,7 +89,7 @@ BASE_MODEL_ARGS: &BASE_MODEL_ARGS --load: ${CHECKPOINT_LOAD_PATH} # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml index 48d188d81c7..5021a029d3b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml @@ -64,7 +64,7 @@ MODEL_ARGS: --exit-interval: 4 # data settings --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt # logging settings diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index fd43e992119..8031bf55d8d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 1e11b3ff94a..5ed4553ad1d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml index efe469636e9..6eac7d0da72 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index a0785630f36..750986482c7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index ff347789ff1..f34c980d821 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index e09ac1ce49e..7c880daf577 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml index af2f93042ea..7f0958f94f2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml index 3f6379f90ff..7271fe996d6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index c49288bf939..7c5a764ccb9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index ef2d6010e6f..2491fd02e96 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index 4f3560b8c35..58d4628f72d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index cb4e11e3d3c..5fcf15a2c3e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 388afdaed4a..6b66183c1dc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml index 4defebeac39..089fd7808ff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 47ec5c2bddf..3d8843214a3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 89ff19ad1e8..4dc43353c9f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 58554cc1121..7133af75b8f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml index a63a24f6aa0..1e29b79848b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml index 7281f21ce90..27d8203d307 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index b6527f0f7c7..bc0da950ac8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml index f7822d5c5dc..962e08d5e73 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml index d4fb79b2bea..8942fa94b55 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index ac8332843f7..7f6ae92394d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 2a13801a9d1..65ea19f9bd8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml index 4a235266b14..99a04b44fe3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index 3dece98a527..aa041fec6de 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index fbb85c1a7d2..a1150d0db09 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index b0fd77bb767..907c86da3b1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -52,7 +52,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml index 170c1397ba1..503e702c4f5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 9473172d43a..c8d15bbf005 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml index e64e70ae046..8db3c6529df 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index e28ce4aea78..243a52e84bd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index efbe0f3d7cc..699ca43cc7b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 835e017ccce..b3a950dcb5e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index f9b74000068..0e71ea6c268 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 13a92a6133d..6aa5a991e90 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml index 89d3d84146e..4907dfb7f4c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml index 4fba5fca3a8..b894bf3bd20 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml index 9e8d9b87466..cfdbe747764 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --decoder-first-pipeline-num-layers: 2 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml index dd5d83e0603..f9f58db94f9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --account-for-embedding-in-pipeline-split: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 325268c5a9d..db560c8aac5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 85ff6feb92d..c6a2379b571 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 0ef2b566008..1ad10c02caa 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml index b267aa17fd2..364a41d2fe1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml @@ -37,7 +37,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -62,7 +62,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml index 5f76e8f8b18..ac70eb6bd1e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index c03a621f91d..585aea5c26e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml index d853b772bb9..f8f7bded190 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -50,7 +50,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 8af4e996340..6234292f5ff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml index a168bf941f9..d510bd15c0f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml index c28625ec1f0..ccc411e5879 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml index 3a1f90a9273..5a9f0ea8a89 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml index d2e2e266ff6..920ad6832d8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml index 683a855ab88..78e7e3a45ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml index f35f4f3d99f..36a000292f5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 5a5d023dbf5..ddbc04621a6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 98fca77b1b8..31e5bb16ad5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -42,7 +42,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index f68e6657c26..76cfaf020af 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index e800a1bb0e3..3488b4d1585 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index e97bc5217c8..3a9b912ed0c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml index 8fa925d715d..586f90f1cf6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml @@ -27,7 +27,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index b0aa1f66235..dd928979546 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 8d7abbe27d4..bf6520edcd6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index b31c1bc3ef9..f7c1c7ee725 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index aac3d65eb87..deaadae81a3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -44,7 +44,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index a12763a2117..fbbe2255a82 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index 9d8400459f1..383ec818661 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index fcc217aa470..14cefe1e409 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -45,7 +45,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index b9d5f466afc..3cf39c93e9c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 5d78d653aae..4fd3ccba030 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml index b19f7ffcb9c..e8f7fee1215 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -43,7 +43,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml index ac68729bd5e..d6a183799fd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 6fee9172272..8df2e496bb1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index dea5ced0081..7cd304fc880 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index 1c2e8ff6304..72f029c9044 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 73f311df459..75a0ffc2adc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -47,7 +47,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 83a671b2c26..de4164176bb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index eee1bb896f2..2ee48e8111c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -48,7 +48,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml index 1c83796b116..8f09dae5fec 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml index 8543a37af49..1ac8ec45c24 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --log-memory-to-tensorboard: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml index 46dfa985920..37fb8b1cccd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -46,7 +46,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml index 6f776fc09b1..1406468fadf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -25,7 +25,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml index 363f31519f9..2ec2c402230 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index c0b563c663b..13e56a13c85 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine @@ -49,7 +49,7 @@ MODEL_ARGS: --inference-ckpt-non-strict: true # To handle the extra_state errors --output-path: ${TENSORBOARD_PATH} --output-every-n-results: 32 - --prompt-file: ${DATA_PATH}/sharegpt/filtered-benchmark/processed.jsonl + --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl --prompt-file-num-truncate: 128 # originally 1024 --num-tokens-to-generate: 128 # originally 512 --incoming-requests-per-step: 32 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml index 024d2ede3da..b99100f65eb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine @@ -51,7 +51,7 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-step: 32 --use-flashinfer-fused-rope: true - + METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml index f2d3dee3904..7a2cc9b0c78 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml index 5fe1ecf5c8f..0b31d16af75 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml index 90e93dfdcd8..3b10336138d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml @@ -12,8 +12,8 @@ MODEL_ARGS: --log-memory-to-tensorboard: true --timing-log-level: 2 # See the mount paths defined in the top level tests/test_utils/recipes/gpt-static-inference.yaml - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml index 18fe5beff99..04e6caa3303 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml index d03c69f8325..9aa1a6e1c96 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index d78c45e380c..b3564f8226a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -19,7 +19,7 @@ MODEL_ARGS: --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml index 3de471e8f8b..4350c4a6f50 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml @@ -26,7 +26,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml index 21fa690e66d..b571dca2dd0 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml @@ -26,7 +26,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml index f3942d7ae4a..941d3f6f829 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml @@ -26,7 +26,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml index 76891deaa85..588cfe3e80a 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml @@ -26,7 +26,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml index 4e55935511c..75e4d3123bd 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml @@ -11,8 +11,8 @@ MODEL_ARGS: --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mamba_hybrid_2b/checkpoint - --tokenizer-model: ${DATA_PATH}/mamba_hybrid_2b/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml index 2af1fa222c1..301b68e7382 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml @@ -11,8 +11,8 @@ MODEL_ARGS: --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mamba_hybrid_2b/checkpoint - --tokenizer-model: ${DATA_PATH}/mamba_hybrid_2b/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml b/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml index 447b5a094e8..ced98a352b1 100644 --- a/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml +++ b/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 - ARTIFACTS_ROOT: /workspace/checkpoints MODEL_ARGS: --num-layers: 32 --hidden-size: 4096 @@ -48,8 +47,8 @@ MODEL_ARGS: --deterministic-mode: true --log-memory-to-tensorboard: true --dataloader-type: external - --data-path: ${DATA_PATH} - --language-model-checkpoint: ${ARTIFACTS_ROOT}/vicuna_7b_pyt/dcp/mcore-v1.5_fp32/weights + --data-path: ${DATA_PATH}/mixed/mcore_mimo_vlm/llava_pretrain_energon + --language-model-checkpoint: ${CHECKPOINT_LOAD_PATH}/model/vicuna_7b_pyt/dcp/mcore-v1.5_fp32/weights --auto-detect-ckpt-format: true --accumulate-allreduce-grads-in-fp32: true --position-embedding-type: rope diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml index f955dbf17a7..6bdb19e1001 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml index f5014a23b5c..97db543f73c 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 7cb050257a9..45ae64df053 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -50,7 +50,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml index 2354ecd7fd9..bb3f5df251d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: flash diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml index 7c0a103200a..5ce2939b05d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --no-gradient-accumulation-fusion: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml index a01439c83cc..60652f0ded9 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -50,7 +50,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml index 984e8bd51f3..8411f00055e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --disable-bias-linear: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 617d2a70b58..ac03efd36a5 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml index 34070006ad7..989a24acaf7 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -49,7 +49,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml index 5390afcd09b..52eb433afd5 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml @@ -38,7 +38,7 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml index 8dcf744be8f..b95d5c04a1a 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml index 2dd0fda1c25..5268bf68b33 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 8e98f65315b..8f4f022345a 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -50,7 +50,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --disable-bias-linear: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 27b2db92ca9..aa83c79ceb2 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index e5dd41580d0..758f7af8f0f 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index f78250b86e2..2ef041c07af 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -53,7 +53,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --ckpt-assume-constant-structure: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index e970e1e0209..29a63c7d148 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml index be2a2cb6a6f..a15bbf77196 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -52,7 +52,7 @@ MODEL_ARGS: --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --ckpt-assume-constant-structure: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 0888531f330..a7e85122831 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -54,7 +54,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --no-bias-gelu-fusion: true diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 19a8b4fc639..a5f390a463d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 12c43095c41..7ffcd448b37 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -51,7 +51,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index a88a8b74b97..e7aa73ba6b1 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -56,7 +56,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-save-pre-mcore-014: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index b22cd9ba9ba..3806ae26529 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -52,7 +52,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-mcore-models: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml index 91a908a4fcd..4820a43bf3f 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -55,7 +55,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --attention-backend: unfused diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml index f27db4a8021..488b8ad92d2 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml @@ -22,7 +22,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml index 7ebd9f0d1af..e8c45375110 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml @@ -38,7 +38,7 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml index 11d62eb1490..c7f0bde3e82 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml @@ -38,7 +38,7 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml index 0a37ee08498..bf1c5a45cc9 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml @@ -39,7 +39,7 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index e46fc9246b7..e593e94f5ac 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document --vocab-file: ${DATA_PATH}/bpe/vocab.json --merge-file: ${DATA_PATH}/bpe/merges.txt --split: 949,50,1 @@ -58,7 +58,7 @@ MODEL_ARGS: --use-mcore-models: true --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --no-bias-gelu-fusion: true diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index df6ca00d00e..d94b06f5ac8 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -11,8 +11,8 @@ MODEL_ARGS: --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml index 3f09b79d8e7..a9171008b7c 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml @@ -11,8 +11,8 @@ MODEL_ARGS: --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 26a9f7afc1e..116992b2d7f 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -11,8 +11,8 @@ MODEL_ARGS: --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml index e9556f5f36e..234236c7d26 100644 --- a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml index 48f79ab9977..54ad28a8e8a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 @@ -50,7 +50,7 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --attention-backend: unfused --log-memory-to-tensorboard: true TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 941f616134e..9cc675a35f6 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 @@ -50,7 +50,7 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --attention-backend: unfused --log-memory-to-tensorboard: true TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml index 4a1f05c07ab..5dc3478de12 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: local - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 @@ -50,6 +50,6 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --log-memory-to-tensorboard: true TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 9bd3c8b887e..1bf1e028390 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: local - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 @@ -50,6 +50,6 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --dist-ckpt-strictness: log_all # backward compatibility for TE changes --log-memory-to-tensorboard: true TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml index ae465aecc67..76afded197d 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml index 4df31e32ed9..2ab4e9730d7 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml index 6a5a701a776..37085e01771 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: transformer_engine - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml index 268cd275db5..46e7209823f 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: local - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml index 8d871796477..0b11a3c137c 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: local - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml index d315b91295e..c305e4a86dd 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml @@ -29,8 +29,8 @@ MODEL_ARGS: --vocab-extra-ids: 100 --init-method-std: 0.015 --transformer-impl: local - --data-path: ${DATA_PATH}/my-t5_00_text_document - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --data-path: ${DATA_PATH}/text/the_pile/t5_shard00/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --calculate-per-token-loss: true --split: 99982,9,9 diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index de1e2d982ec..d30207b5b51 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -37,7 +37,7 @@ MODEL_ARGS: --pipeline-model-parallel-size: 1 # Data args --data-path: ${DATA_BLEND} - --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --vocab-file: ${DATA_PATH}/text/the_pile/t5_shard00/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --split: 99982,9,9 --data-cache-path: ${DATA_CACHE_PATH} diff --git a/tests/test_utils/python_scripts/generate_jet_trigger_job.py b/tests/test_utils/python_scripts/generate_jet_trigger_job.py index 9c6edc05657..50d8598ae66 100644 --- a/tests/test_utils/python_scripts/generate_jet_trigger_job.py +++ b/tests/test_utils/python_scripts/generate_jet_trigger_job.py @@ -4,7 +4,7 @@ import click import yaml -from tests.test_utils.python_scripts import common +from tests.test_utils.python_scripts import recipe_parser BASE_PATH = pathlib.Path(__file__).parent.resolve() @@ -81,7 +81,7 @@ def main( ): list_of_test_cases = [ test_case - for test_case in common.load_workloads( + for test_case in recipe_parser.load_workloads( scope=scope, container_tag=container_tag, environment=environment, @@ -158,7 +158,7 @@ def main( for test_idx, test_case in enumerate(list_of_test_cases): job_tags = list(tags) - job_tags.append(f"cluster/{common.resolve_cluster_config(cluster)}") + job_tags.append(f"cluster/{recipe_parser.resolve_cluster_config(cluster)}") script = [ "export PYTHONPATH=$(pwd); " diff --git a/tests/test_utils/python_scripts/generate_local_jobs.py b/tests/test_utils/python_scripts/generate_local_jobs.py index 6a16af24a30..4a7cf2d7c13 100644 --- a/tests/test_utils/python_scripts/generate_local_jobs.py +++ b/tests/test_utils/python_scripts/generate_local_jobs.py @@ -11,7 +11,7 @@ import click import yaml -from tests.test_utils.python_scripts import common +from tests.test_utils.python_scripts import recipe_parser def load_script(config_path: str) -> str: @@ -68,7 +68,7 @@ def main( enable_lightweight_mode: bool = False, record_checkpoints: bool = False, ): - workloads = common.load_workloads( + workloads = recipe_parser.load_workloads( container_image="none", scope=scope, model=model, @@ -77,6 +77,8 @@ def main( container_tag="none", ) + print(workloads) + for workload in workloads: if workload.type == "build": continue diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index 254f522c6fb..0e3ed179f4a 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -17,7 +17,7 @@ from jetclient.facades.objects import log as jet_log from jetclient.services.dtos.pipeline import PipelineStatus -from tests.test_utils.python_scripts import common +from tests.test_utils.python_scripts import recipe_parser BASE_PATH = pathlib.Path(__file__).parent.resolve() DASHBOARD_ENDPOINT = os.getenv("DASHBOARD_ENDPOINT") @@ -70,7 +70,7 @@ def launch_and_wait_for_completion( ).workloads.submit( workloads=[ jetclient.JETWorkloadManifest(**workload) - for workload in common.load_workloads( + for workload in recipe_parser.load_workloads( test_case=test_case, n_repeat=n_repeat, time_limit=(1200 if enable_lightweight_mode else time_limit), @@ -83,7 +83,7 @@ def launch_and_wait_for_completion( record_checkpoints=record_checkpoints, ) ], - config_id=f"mcore/{common.resolve_cluster_config(cluster)}", + config_id=f"mcore/{recipe_parser.resolve_cluster_config(cluster)}", custom_config={ "launchers": {cluster: cluster_config}, "executors": { @@ -116,7 +116,7 @@ def launch_and_wait_for_completion( }, "outputs": { "enabled": True, - "artifacts_storages": [common.resolve_artifact_config(cluster)], + "artifacts_storages": [recipe_parser.resolve_artifact_config(cluster)], }, }, wait_for_validation=True, diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index d0ba6c4fe85..1aa1c560052 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -1,10 +1,16 @@ +import logging import os import pathlib +import sys +from typing import Optional import click import nemo_run as run -from tests.test_utils.python_scripts import common +from tests.test_utils.python_scripts import recipe_parser + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) @click.command() @@ -13,8 +19,20 @@ @click.option("--test-case", required=True, type=str, help="Test case of the workload") @click.option("--environment", required=True, type=str, help="Environment of the workload") @click.option("--platform", required=True, type=str, help="Platform of the workload") -def main(scope, model, test_case, environment, platform): - workloads = common.load_workloads( +@click.option("--container-image", required=True, type=str, help="Container image of the workload") +@click.option("--data-dir", required=False, type=str, help="Data directory of the workload") +@click.option("--tag", required=False, type=str, help="Tag of the workload") +def main( + scope, + model, + test_case, + environment, + platform, + container_image, + data_dir: Optional[str] = None, + tag: Optional[str] = None, +): + workloads = recipe_parser.load_workloads( container_image="none", scope=scope, model=model, @@ -22,17 +40,17 @@ def main(scope, model, test_case, environment, platform): environment=environment, container_tag="none", platform=platform, + tag=tag, ) workloads = [workload for workload in workloads if workload.type != "build"] - print(workloads) assert len(workloads) == 1, f"Expected exactly one workload, got {len(workloads)}" workload = workloads[0] magic_values = dict(workload.spec) - magic_values["assets_dir"] = "$OUTPUT_PATH" - magic_values["artifacts_dir"] = "$OUTPUT_PATH" + magic_values["assets_dir"] = "/opt/megatron-lm/assets_dir" + magic_values["artifacts_dir"] = "/opt/megatron-lm/artifacts_dir" magic_values["environment"] = environment magic_values["test_case"] = workload.spec["test_case"] magic_values["name"] = workload.spec["name"].format(**magic_values) @@ -40,17 +58,13 @@ def main(scope, model, test_case, environment, platform): inline_script = run.Script(inline=workload.spec["script"]) - artifacts = [ - "{host_path}:{mount_path}".format( - mount_path=mount_path, host_path=str(pathlib.Path("/root") / host_path) - ) - for mount_path, host_path in workload.spec["artifacts"].items() - ] + artifacts = [] artifacts.append(f"{os.getcwd()}:/opt/megatron-lm") - print(artifacts) + if data_dir: + artifacts.append(f"{pathlib.Path(data_dir)}:/mnt/artifacts") executor = run.DockerExecutor( - container_image="megatron-core", + container_image=container_image, num_gpus=-1, runtime="nvidia", ipc_mode="host", @@ -59,15 +73,23 @@ def main(scope, model, test_case, environment, platform): "PYTHONUNBUFFERED": "1", "OUTPUT_PATH": os.getcwd(), "ENABLE_LIGHTWEIGHT_MODE": "true", + "N_REPEAT": "1", }, packager=run.Packager(), volumes=artifacts, ) - with run.Experiment("docker-experiment", executor=executor, log_level="INFO") as exp: + with run.Experiment("mcore-ci-test", executor=executor, log_level="INFO") as exp: _ = exp.add([inline_script], tail_logs=False, name="task-1") + exp.dryrun(log=True) exp.run(detach=False, tail_logs=True, sequential=False) + result_dict = exp.status(return_dict=True) + _, job_dict = list(result_dict.items())[0] + + logger.info(f"Job status: {job_dict["status"]}") + sys.exit(0 if str(job_dict["status"]) == "SUCCEEDED" else 1) + if __name__ == "__main__": main() diff --git a/tests/test_utils/python_scripts/notify.py b/tests/test_utils/python_scripts/notify.py index 4cff0db7f6e..7da00dc401a 100644 --- a/tests/test_utils/python_scripts/notify.py +++ b/tests/test_utils/python_scripts/notify.py @@ -22,17 +22,6 @@ def get_gitlab_handle(): return gitlab.Gitlab(f"https://{GITLAB_ENDPOINT}", private_token=os.getenv("RO_API_TOKEN")) -def extract_surrounding_text(text, keyword="error", context=400, fallback_length=800): - index = text.rfind(keyword) # Find the last occurrence - if index == -1: - return text[-fallback_length:] # Return last 800 chars if keyword is not found - - start = max(0, index - context) # Ensure we don't go below 0 - end = min(len(text), index + len(keyword)) # Ensure we don't exceed the text length - - return text[start:end] - - def get_jobs_per_bridge(pipeline_id: int, type_of_job: str): bridge = {} for pipeline_bridge in ( diff --git a/tests/test_utils/python_scripts/common.py b/tests/test_utils/python_scripts/recipe_parser.py similarity index 89% rename from tests/test_utils/python_scripts/common.py rename to tests/test_utils/python_scripts/recipe_parser.py index 23c191cc399..e26d04d6f20 100644 --- a/tests/test_utils/python_scripts/common.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -1,12 +1,16 @@ import copy import itertools +import logging import pathlib from typing import List, Optional +import click import yaml BASE_PATH = pathlib.Path(__file__).parent.resolve() +logger = logging.getLogger(__name__) + class dotdict(dict): """dot.notation access to dictionary attributes""" @@ -25,6 +29,8 @@ def resolve_cluster_config(cluster: str) -> str: return "draco-oci-ord" if cluster == "dgxh100_coreweave": return "coreweave" + if cluster == "ghci": + return "ghci" raise ValueError(f"Unknown cluster {cluster} provided.") @@ -95,15 +101,15 @@ def filter_by_test_case(workload_manifests: List[dotdict], test_case: str) -> Op workload_manifests = list( workload_manifest for workload_manifest in workload_manifests - if workload_manifest.spec["test_case"] == test_case + if workload_manifest["spec"]["test_case"] == test_case ) if len(workload_manifests) > 1: - print("Duplicate test_case found!") + logger.info("Duplicate test_case found!") return None if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return None return workload_manifests[0] @@ -118,7 +124,7 @@ def filter_by_scope(workload_manifests: List[dotdict], scope: str) -> List[dotdi ) if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests @@ -136,7 +142,7 @@ def filter_by_environment(workload_manifests: List[dotdict], environment: str) - ) if len(workload_manifests_copy) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests_copy @@ -153,7 +159,7 @@ def filter_by_platform(workload_manifests: List[dotdict], platform: str) -> List ) if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests @@ -168,7 +174,7 @@ def filter_by_model(workload_manifests: List[dotdict], model: str) -> List[dotdi ) if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests @@ -184,7 +190,7 @@ def filter_by_tag(workload_manifests: List[dotdict], tag: str) -> List[dotdict]: ) if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests @@ -200,7 +206,7 @@ def filter_by_test_cases(workload_manifests: List[dotdict], test_cases: str) -> ) if len(workload_manifests) == 0: - print("No test_case found!") + logger.info("No test_case found!") return [] return workload_manifests @@ -269,7 +275,9 @@ def load_workloads( workload.spec["artifacts"] = { key: value.replace(r"{platforms}", workload.spec["platforms"]) for key, value in ( - workload.spec["artifacts"].items() if "artifacts" in workload.spec else {} + workload.spec["artifacts"].items() + if "artifacts" in workload.spec and workload.spec["artifacts"] is not None + else {} ) } @@ -288,9 +296,16 @@ def load_workloads( return workloads -if __name__ == "__main__": - workflows = load_workloads(container_tag="main") +@click.command() +@click.option("--model", required=False, type=str, default=None, help="Model to select") +@click.option("--test-case", required=False, type=str, default=None, help="Test case to select") +def main(model: Optional[str], test_case: Optional[str]): + workflows = load_workloads(container_tag="main", model=model, test_case=test_case) # Save workflows to YAML file output_file = "workflows.yaml" with open(output_file, "w") as f: yaml.dump([dict(workflow) for workflow in workflows], f) + + +if __name__ == "__main__": + main() diff --git a/tests/test_utils/recipes/common.yaml b/tests/test_utils/recipes/ckpt_converter.yaml similarity index 100% rename from tests/test_utils/recipes/common.yaml rename to tests/test_utils/recipes/ckpt_converter.yaml diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml index e96bcaa4ee7..dd90bc38e88 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml @@ -11,8 +11,7 @@ spec: n_repeat: 1 platforms: dgx_a100 artifacts: - /workspace/data/mcore_mistral/model: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/mcore_mistral/tokenizer: model/mcore_mistral/nemo_minitron-0.5b/v1 + /workspace/data/model/mcore_mistral: model/mcore_mistral/nemo_minitron-0.5b/v1 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -44,7 +43,7 @@ spec: --tee "0:3,7:3" \ --redirects "3" \ --nproc_per_node 1 \ - tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.py --checkpoint-dir /workspace/data/mcore_mistral/model --tokenizer-model /workspace/data/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.py --checkpoint-dir /workspace/data/model/mcore_mistral --tokenizer-model /workspace/data/model/mcore_mistral/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json products: - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation] diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index b276ac66d85..56ecdabcded 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -11,8 +11,7 @@ spec: n_repeat: 1 platforms: dgx_a100 artifacts: - /workspace/data/mcore_mistral/model: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/mcore_mistral/tokenizer: model/mcore_mistral/nemo_minitron-0.5b/v1 + /workspace/data/model/mcore_mistral: model/mcore_mistral/nemo_minitron-0.5b/v1 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -40,9 +39,9 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" + "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -50,7 +49,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) @@ -65,5 +64,5 @@ products: - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml index cd7bfd3fbec..914d3c0a757 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml @@ -10,10 +10,6 @@ spec: gpus: 1 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/mcore_mistral/model: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/mcore_mistral/tokenizer: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/sharegpt/filtered-benchmark: text/sharegpt-vicuna/filtered script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -41,17 +37,17 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" - "DATA_CACHE_PATH=/workspace/data/cache" + "DATA_PATH=/mnt/artifacts/" + "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) @@ -61,17 +57,17 @@ products: - test_case: [gpt_dynamic_inference_tp1_pp1_583m_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp8_pp1_583m_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only] products: diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/gpt-grads.yaml index ea569362311..205985d5e13 100644 --- a/tests/test_utils/recipes/gpt-grads.yaml +++ b/tests/test_utils/recipes/gpt-grads.yaml @@ -11,10 +11,7 @@ spec: n_repeat: 1 platforms: dgx_h100 artifacts: - /workspace/data/gpt3_data: text/the_pile/shard00 - /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_check_grads_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_check_grads_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt_teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher + /mnt/artifacts/text/the_pile/shard00: text/the_pile/shard00 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -47,12 +44,12 @@ spec: # Note: This test is very expensive, so we hardcode N_REPEAT=1 ARGUMENTS=( - "DATA_PATH=/workspace/data/gpt3_data" + "DATA_PATH=/mnt/artifacts" "DATA_CACHE_PATH=/workspace/data/cache" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -67,5 +64,5 @@ products: - test_case: [gpt3_mr_mcore_reruns_resume_check_grads] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/gpt-nemo.yaml index 848c1a56071..14c2106ed31 100644 --- a/tests/test_utils/recipes/gpt-nemo.yaml +++ b/tests/test_utils/recipes/gpt-nemo.yaml @@ -44,7 +44,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/model/{name}" "TRAINING_SCRIPT_PATH=\"nemo llm pretrain -y --factory {nemo_model}\"" "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -57,36 +57,36 @@ products: - test_case: [llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [llama3_8b] - test_case: [llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [llama3_8b] - test_case: [mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [mixtral_8x7b] - test_case: [gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [gemma2_2b] - test_case: [bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [bert_340m] - test_case: [t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G] products: - environment: [dev] - scope: [mr] + scope: [deprecated] platforms: [dgx_h100] nemo_model: [t5_220m] diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/gpt-static-inference.yaml index 424c424bbbf..9ed7f6c09f9 100644 --- a/tests/test_utils/recipes/gpt-static-inference.yaml +++ b/tests/test_utils/recipes/gpt-static-inference.yaml @@ -10,11 +10,6 @@ spec: gpus: 1 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/mcore_mistral/model: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/mcore_mistral/tokenizer: model/mcore_mistral/nemo_minitron-0.5b/v1 - /workspace/data/deepseek_16b_pyt/model: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 - /workspace/data/deepseek_16b_pyt/tokenizer: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -42,17 +37,17 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" - "DATA_CACHE_PATH=/workspace/data/cache" + "DATA_PATH=null" + "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) @@ -62,20 +57,20 @@ products: - test_case: [gpt_static_inference_tp1_pp1_583m_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_583m_cudagraphs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index b29fc21e877..5eb29ac2605 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -10,19 +10,6 @@ spec: gpus: 8 n_repeat: 5 platforms: dgx_a100 - artifacts: - /workspace/data/gpt3_data: text/the_pile/shard00 - /workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_{platforms}_1N8G_dev/24475828 - /workspace/checkpoints/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_{platforms}_1N8G_dev/28359448 - /workspace/checkpoints/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_{platforms}_1N8G_dev/28359448 - /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_{platforms}_1N8G_dev/28359448 - /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_reshard_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_reshard_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_persistent_2_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-persistent_v2 - /workspace/checkpoints/gpt3_mr_mcore_reruns_persistent_2_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-persistent_v2 - /workspace/checkpoints/gpt_teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -53,12 +40,12 @@ spec: NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') ARGUMENTS=( - "DATA_PATH=/workspace/data/gpt3_data" + "DATA_PATH=/mnt/artifacts" "DATA_CACHE_PATH=/workspace/data/cache" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -192,7 +179,7 @@ products: - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -248,21 +235,21 @@ products: - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -283,55 +270,55 @@ products: - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -353,7 +340,7 @@ products: - test_case: [gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G] products: @@ -407,21 +394,21 @@ products: - test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 @@ -451,7 +438,7 @@ products: - test_case: [gpt3_mr_mcore_reruns_persistent_1] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] @@ -467,7 +454,7 @@ products: - environment: [lts] scope: [mr] - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [dev] scope: [mr-slim] @@ -475,7 +462,7 @@ products: - test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [mr] @@ -484,7 +471,7 @@ products: - environment: [lts] scope: [mr] - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [dev] scope: [mr-slim] diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index f0e29999d43..a4eaecaa53e 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -39,9 +39,9 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" + "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -49,7 +49,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) @@ -59,10 +59,10 @@ products: - test_case: [hybrid_static_inference_tp1_pp1_2B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs] products: - environment: [dev] scope: [mr] - platforms: [dgx_h100] + platforms: [dg x_h100] diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/mamba.yaml index 7c1f9a3627f..0f8a4085ea5 100644 --- a/tests/test_utils/recipes/mamba.yaml +++ b/tests/test_utils/recipes/mamba.yaml @@ -10,8 +10,6 @@ spec: gpus: 8 n_repeat: 5 platforms: dgx_a100 - artifacts: - /workspace/data/gpt3_data: text/the_pile/shard00 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -40,12 +38,12 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "DATA_PATH=/workspace/data/gpt3_data" + "DATA_PATH=/mnt/artifacts" "DATA_CACHE_PATH=/workspace/data/cache" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/model/{name}" "TRAINING_SCRIPT_PATH=pretrain_mamba.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -60,7 +58,7 @@ products: - test_case: [hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] @@ -76,7 +74,7 @@ products: - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] @@ -84,7 +82,7 @@ products: - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] diff --git a/tests/test_utils/recipes/mimo.yaml b/tests/test_utils/recipes/mimo.yaml index dfde82656dc..41e735776f9 100644 --- a/tests/test_utils/recipes/mimo.yaml +++ b/tests/test_utils/recipes/mimo.yaml @@ -11,7 +11,7 @@ spec: platforms: dgx_h100 artifacts: /workspace/data/llava_pretrain_energon: mixed/mcore_mimo_vlm/llava_pretrain_energon - /workspace/checkpoints/vicuna_7b_pyt/dcp/mcore-v1.5_fp32: model/vicuna_7b_pyt/dcp/mcore-v1.5_fp32 + /mnt/artifacts/model/vicuna_7b_pyt/dcp/mcore-v1.5_fp32: model/vicuna_7b_pyt/dcp/mcore-v1.5_fp32 time_limit: n_repeat: test_case: @@ -44,12 +44,12 @@ spec: cd /opt/megatron-lm NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') ARGUMENTS=( - "DATA_PATH='/workspace/data/llava_pretrain_energon/'" - "DATA_CACHE_PATH='-'" + "DATA_PATH=/mnt/artifacts" + "DATA_CACHE_PATH=/workspace/data/cache" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "TRAINING_SCRIPT_PATH=./examples/mimo/train.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/moe-dynamic-inference.yaml index 36d09cb36c4..c9d1be57add 100644 --- a/tests/test_utils/recipes/moe-dynamic-inference.yaml +++ b/tests/test_utils/recipes/moe-dynamic-inference.yaml @@ -10,9 +10,6 @@ spec: gpus: 8 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/deepseek_16b_pyt/model: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 - /workspace/data/deepseek_16b_pyt/tokenizer: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -40,9 +37,9 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" + "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -50,7 +47,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) @@ -60,7 +57,7 @@ products: - test_case: [gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml index c1411283ad9..f2f98fbc146 100644 --- a/tests/test_utils/recipes/moe-static-inference.yaml +++ b/tests/test_utils/recipes/moe-static-inference.yaml @@ -11,8 +11,6 @@ spec: n_repeat: 1 platforms: dgx_a100 artifacts: - /workspace/data/deepseek_16b_pyt/model: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 - /workspace/data/deepseek_16b_pyt/tokenizer: model/deepseek_16b_pyt/dcp/mcore-v1_bf16 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -40,9 +38,9 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/workspace/data" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/workspace/data" + "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -50,7 +48,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=false" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" ) diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index ddfb8d1980b..fd8f00c242f 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -10,10 +10,6 @@ spec: gpus: 8 n_repeat: 5 platforms: dgx_a100 - artifacts: - /workspace/data/gpt3_data: text/the_pile/shard00 - /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_{platforms}_1N8G_dev/28359448 - /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_{platforms}_1N8G_dev/28359448 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -44,12 +40,12 @@ spec: NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') ARGUMENTS=( - "DATA_PATH=/workspace/data/gpt3_data" + "DATA_PATH=/mnt/artifacts" "DATA_CACHE_PATH=/workspace/data/cache" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -108,7 +104,7 @@ products: - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # TODO: The migration of custom fsdp causes EP + FSDP to be temporarily unavailable, which will be fixed in a subsequent MR. # - test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G] @@ -121,7 +117,7 @@ products: - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: @@ -155,7 +151,7 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [dev] scope: [mr-slim] diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml index 4de7f0a9c0f..65393f14f50 100644 --- a/tests/test_utils/recipes/multimodal-llava.yaml +++ b/tests/test_utils/recipes/multimodal-llava.yaml @@ -46,7 +46,7 @@ spec: "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" - "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/model/{name}/checkpoints" "TRAINING_SCRIPT_PATH=pretrain_vlm.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" @@ -61,10 +61,10 @@ products: - test_case: [multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/uv.lock b/uv.lock index 1046481f7ec..28110f38852 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", @@ -281,10 +281,10 @@ name = "anyio" version = "4.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, { name = "sniffio" }, - { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" } wheels = [ @@ -668,7 +668,7 @@ name = "cffi" version = "2.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } wheels = [ @@ -839,7 +839,7 @@ name = "click" version = "8.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } wheels = [ @@ -1291,7 +1291,7 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -1799,7 +1799,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "requests" }, @@ -2469,7 +2469,7 @@ linting = [ ] test = [ { name = "coverage" }, - { name = "nemo-run" }, + { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" }, { name = "nltk" }, { name = "pydantic" }, { name = "pygithub" }, @@ -2886,8 +2886,8 @@ wheels = [ [[package]] name = "nemo-run" -version = "0.6.0" -source = { registry = "https://pypi.org/simple" } +version = "0.7.0rc0.dev0" +source = { git = "https://github.com/NVIDIA-NeMo/Run.git?rev=8ca8f7952a597f944985f1f1368a7acb9aa3a6c2#8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" } dependencies = [ { name = "catalogue" }, { name = "cryptography" }, @@ -2905,10 +2905,6 @@ dependencies = [ { name = "torchx" }, { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8b/0a/161c5f9534946f096d7ba16e40874cf9ebbff17d57c1f88173b4b32cf067/nemo_run-0.6.0.tar.gz", hash = "sha256:8c2ec0a87a0e4df799ee527422fd2df366926cdc4cc8e0b666df98b550cd9bb7", size = 2284395, upload-time = "2025-10-09T16:07:25.718Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/2e/56750d75ec35a692e9eb0ac0f780da9f12c8e599b8273b9eabc33ae0ca30/nemo_run-0.6.0-py3-none-any.whl", hash = "sha256:7b6473aded379e9c793b7f1f64c7f44ce3ef70b4ea27dad95fd84523531ac403", size = 235439, upload-time = "2025-10-09T16:07:24.46Z" }, -] [[package]] name = "networkx" @@ -4410,12 +4406,12 @@ name = "pytest" version = "8.3.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "iniconfig" }, { name = "packaging" }, { name = "pluggy" }, - { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } wheels = [ @@ -4670,7 +4666,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, - { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } wheels = [ @@ -5890,24 +5886,24 @@ dependencies = [ { name = "jinja2" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-curand-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "setuptools", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "triton", marker = "sys_platform == 'never'" }, { name = "typing-extensions" }, ] wheels = [ @@ -6021,7 +6017,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } wheels = [ From 265f4ee482a0b60a59b088a59e4eaed35e26ffef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 15:28:18 +0000 Subject: [PATCH 037/248] ci: Add copyright-checker for GitHub MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 26 +- .github/workflows/cicd-main.yml | 158 +++++- .github/workflows/copyright-check.yml | 19 +- megatron/core/config_logger.py | 14 +- .../golden_values_dev_dgxh100_dgxc.json | 287 ++++++++++ .../golden_values_dev_dgxh100_dgxc.json | 537 ++++++++++++++++++ .../golden_values_dev_dgxh100_dgxc.json | 344 +++++++++++ .../golden_values_dev_dgxh100_dgxc.json | 537 ++++++++++++++++++ .../launch_nemo_run_workload.py | 13 +- 9 files changed, 1916 insertions(+), 19 deletions(-) create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_dgxc.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 8b7fd373a98..d726fcabc9f 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -93,6 +93,27 @@ runs: echo "$cmd" | tee "job.sh" echo "::endgroup::" + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Install GH CLI + shell: bash -x -e -u -o pipefail {0} + run: | + apt-get update + apt-get install -y gh + + - name: Has Run tests label + shell: bash -x -e -u -o pipefail {0} + id: has-run-tests-label + env: + GH_TOKEN: ${{ github.token }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') + echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Create run-script (e2e test) shell: bash -x -e -u -o pipefail {0} if: inputs.is_unit_test == 'false' @@ -115,7 +136,8 @@ runs: --environment dev \ --platform dgx_h100 \ --container-image ${{ inputs.container-image }} \ - --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts + --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \ + --enable-lightweight-mode RUN_TEST_EOF ) @@ -200,5 +222,5 @@ runs: uses: actions/upload-artifact@v4 with: name: ${{ steps.check.outputs.logs_report }} - path: logs + path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }} include-hidden-files: true diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7f030bfb641..a56afb74c71 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -32,6 +32,113 @@ permissions: contents: read jobs: + is-not-external-contributor: + runs-on: ubuntu-latest + environment: nemo-ci + outputs: + is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} + permissions: + issues: write + pull-requests: write + env: + GITHUB_TOKEN: ${{ secrets.PAT }} + REPO: ${{ github.repository }} + SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ env.GITHUB_TOKEN }} + + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Check membership + id: check-membership + run: | + PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} + + if [ "${{ env.SCHEDULED_JOB }}" == "true" ]; then + echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT + exit 0 + fi + + echo "Checking if $PR_AUTHOR is a repo collaborator..." + API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" + REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." + API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" + ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." + API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" + ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then + echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT + else + echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT + fi + + - name: Find Comment + uses: peter-evans/find-comment@v4 + if: startsWith(github.ref, 'refs/heads/pull-request/') + id: fc + with: + issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + repository: ${{ github.repository }} + body-includes: "" + + - name: Delete comment + uses: actions/github-script@v7 + if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.fc.outputs.comment-id != '' + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + await github.rest.issues.deleteComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: ${{ steps.fc.outputs.comment-id }} + }) + + - name: Write pull request comment + if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.check-membership.outputs.is_maintainer == 'false' + uses: peter-evans/create-or-update-comment@v5 + with: + issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + repository: ${{ github.repository }} + body: | + + + Thank you for your contribution! + + NVIDIA Megatron-LM is currently transitioning to development on Github. We will aim to review your PR after we complete our transition and stabilize our Github development process. + + Thank you for your understanding. + + - name: exit + run: | + if [ "${{ steps.check-membership.outputs.is_maintainer }}" == "true" ]; then + exit 0 + else + exit 1 + fi + pre-flight: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.0 @@ -188,7 +295,7 @@ jobs: PAT: ${{ secrets.PAT }} container-image: 766267172432.dkr.ecr.us-east-1.amazonaws.com/megatron-lm:1864 #${{ github.sha }} - cicd-parse-functional-tests: + cicd-parse-integration-tests: runs-on: ubuntu-latest needs: - pre-flight @@ -196,17 +303,44 @@ jobs: # - cicd-container-build # - cicd-unit-tests-latest outputs: - functional-tests: ${{ steps.main.outputs.functional-tests }} + integration-tests: ${{ steps.main.outputs.integration-tests }} steps: - name: Checkout uses: actions/checkout@v4 + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Has Run tests label + id: has-run-tests-label + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') + echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Parse functional tests id: main + env: + HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.HAS_RUN_TESTS_LABEL }} run: | export PYTHONPATH=$(pwd) + + if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then + ARGS=( + --scope mr + --enable-lightweight-mode + ) + else + ARGS=( + --scope mr-slim + ) + fi + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ - --scope mr \ --n-repeat 5 \ --time-limit 2700 \ --test-cases all \ @@ -218,24 +352,24 @@ jobs: --no-enable-warmup \ --environment dev \ --platform dgx_h100 \ - --enable-lightweight-mode \ --cluster ghci \ - --output-path functional-tests.yaml + ${ARGS[@]} \ + --output-path integration-tests.yaml - cat functional-tests.yaml | \ - yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key})' | jq -c > functional-tests.json + cat integration-tests.yaml | \ + yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests.json - echo "functional-tests=$(cat functional-tests.json)" | tee -a "$GITHUB_OUTPUT" + echo "integration-tests=$(cat integration-tests.json)" | tee -a "$GITHUB_OUTPUT" - cicd-functional-tests-latest: + cicd-integration-tests-latest: strategy: fail-fast: false matrix: - include: ${{ fromJson(needs.cicd-parse-functional-tests.outputs.functional-tests) }} + include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }} needs: - pre-flight - cicd-wait-in-queue - - cicd-parse-functional-tests + - cicd-parse-integration-tests # - cicd-unit-tests-latest runs-on: nvidia-ci-aws-gpu-x8 name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" @@ -266,7 +400,7 @@ jobs: needs: - pre-flight - cicd-unit-tests-latest - - cicd-functional-tests-latest + - cicd-integration-tests-latest if: | ( needs.pre-flight.outputs.docs_only == 'true' diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index c65bb402a26..8b075448833 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -10,7 +10,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License.. name: Copyright check @@ -30,7 +30,9 @@ jobs: if: | !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.2.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.2 + with: + from-year: 2019 copyright-check-summary: needs: [pre-flight, copyright-check] @@ -44,4 +46,15 @@ jobs: runs-on: ubuntu-latest steps: - name: Result - run: echo Copyright check successful + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 + + if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + # Show which jobs failed + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' + exit 1 + fi diff --git a/megatron/core/config_logger.py b/megatron/core/config_logger.py index 4e666bb274e..bee2be09205 100644 --- a/megatron/core/config_logger.py +++ b/megatron/core/config_logger.py @@ -1,4 +1,16 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import dataclasses import json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json new file mode 100644 index 00000000000..737ecfb1b9d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.84523, + "2": 10.85412, + "3": 10.85365, + "4": 10.83867, + "5": 10.87428, + "6": 10.89334, + "7": 10.8541, + "8": 10.86235, + "9": 10.86352, + "10": 10.82859, + "11": 10.88772, + "12": 10.87148, + "13": 10.87938, + "14": 10.89123, + "15": 10.81927, + "16": 10.83063, + "17": 10.79878, + "18": 10.81771, + "19": 10.81957, + "20": 10.72749, + "21": 10.70552, + "22": 10.56396, + "23": 10.72823, + "24": 10.60839, + "25": 10.55198, + "26": 10.60868, + "27": 10.62879, + "28": 10.58271, + "29": 10.59982, + "30": 10.36511, + "31": 10.12096, + "32": 10.47628, + "33": 10.46906, + "34": 10.22326, + "35": 10.27848, + "36": 10.22883, + "37": 10.35947, + "38": 10.19331, + "39": 10.41586, + "40": 10.09773, + "41": 10.15718, + "42": 10.22441, + "43": 9.83281, + "44": 9.96935, + "45": 9.84205, + "46": 9.83017, + "47": 10.15602, + "48": 9.85503, + "49": 9.54049, + "50": 9.91258 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1725.0, + "2": 1664.0, + "3": 1710.0, + "4": 1712.0, + "5": 1834.0, + "6": 1743.0, + "7": 1803.0, + "8": 1737.0, + "9": 1779.0, + "10": 1459.0, + "11": 1898.0, + "12": 1661.0, + "13": 1860.0, + "14": 1764.0, + "15": 1886.0, + "16": 1916.0, + "17": 1773.0, + "18": 1702.0, + "19": 1742.0, + "20": 1649.0, + "21": 1899.0, + "22": 1631.0, + "23": 1960.0, + "24": 1570.0, + "25": 1647.0, + "26": 1649.0, + "27": 1811.0, + "28": 1930.0, + "29": 1910.0, + "30": 1964.0, + "31": 1536.0, + "32": 1873.0, + "33": 2191.0, + "34": 1838.0, + "35": 2017.0, + "36": 1916.0, + "37": 2345.0, + "38": 2247.0, + "39": 2374.0, + "40": 2207.0, + "41": 2246.0, + "42": 2291.0, + "43": 2027.0, + "44": 2147.0, + "45": 2164.0, + "46": 2300.0, + "47": 2418.0, + "48": 2467.0, + "49": 2255.0, + "50": 2224.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552328704.0, + "2": 552328704.0, + "3": 552328704.0, + "4": 552328704.0, + "5": 552328704.0, + "6": 552328704.0, + "7": 552328704.0, + "8": 552328704.0, + "9": 552328704.0, + "10": 552328704.0, + "11": 552328704.0, + "12": 552328704.0, + "13": 552328704.0, + "14": 552328704.0, + "15": 552328704.0, + "16": 552328704.0, + "17": 552328704.0, + "18": 552328704.0, + "19": 552328704.0, + "20": 552328704.0, + "21": 552328704.0, + "22": 552328704.0, + "23": 552328704.0, + "24": 552328704.0, + "25": 552328704.0, + "26": 552328704.0, + "27": 552328704.0, + "28": 552328704.0, + "29": 552328704.0, + "30": 552328704.0, + "31": 552328704.0, + "32": 552328704.0, + "33": 552328704.0, + "34": 552328704.0, + "35": 552328704.0, + "36": 552328704.0, + "37": 552328704.0, + "38": 552328704.0, + "39": 552328704.0, + "40": 552328704.0, + "41": 552328704.0, + "42": 552328704.0, + "43": 552328704.0, + "44": 552328704.0, + "45": 552328704.0, + "46": 552328704.0, + "47": 552328704.0, + "48": 552328704.0, + "49": 552328704.0, + "50": 552328704.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3798208000.0, + "2": 3943007744.0, + "3": 3943007744.0, + "4": 3943007744.0, + "5": 3943007744.0, + "6": 3943007744.0, + "7": 3943007744.0, + "8": 3943007744.0, + "9": 3943007744.0, + "10": 3943007744.0, + "11": 3943007744.0, + "12": 3943007744.0, + "13": 3943007744.0, + "14": 3943007744.0, + "15": 3943007744.0, + "16": 3943007744.0, + "17": 3943007744.0, + "18": 3943007744.0, + "19": 3943007744.0, + "20": 3943007744.0, + "21": 3943007744.0, + "22": 3943007744.0, + "23": 3943007744.0, + "24": 3943007744.0, + "25": 3943007744.0, + "26": 3943007744.0, + "27": 3943007744.0, + "28": 3943007744.0, + "29": 3943007744.0, + "30": 3943007744.0, + "31": 3943007744.0, + "32": 3943007744.0, + "33": 3943007744.0, + "34": 3943007744.0, + "35": 3943007744.0, + "36": 3943007744.0, + "37": 3943007744.0, + "38": 3943007744.0, + "39": 3943007744.0, + "40": 3943007744.0, + "41": 3943007744.0, + "42": 3943007744.0, + "43": 3943007744.0, + "44": 3943007744.0, + "45": 3943007744.0, + "46": 3943007744.0, + "47": 3943007744.0, + "48": 3943007744.0, + "49": 3943007744.0, + "50": 3943007744.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.33022, + "2": 0.14078, + "3": 0.13198, + "4": 0.12852, + "5": 0.13083, + "6": 0.13237, + "7": 0.13228, + "8": 0.1313, + "9": 0.12811, + "10": 0.1288, + "11": 0.33424, + "12": 0.13269, + "13": 0.12918, + "14": 0.12679, + "15": 0.12826, + "16": 0.12904, + "17": 0.12886, + "18": 0.12955, + "19": 0.1304, + "20": 0.13345, + "21": 0.33748, + "22": 0.12668, + "23": 0.13016, + "24": 0.13048, + "25": 0.13063, + "26": 0.12607, + "27": 0.12969, + "28": 0.12911, + "29": 0.12982, + "30": 0.12875, + "31": 0.33159, + "32": 0.13001, + "33": 0.12965, + "34": 0.12637, + "35": 0.12796, + "36": 0.12613, + "37": 0.13026, + "38": 0.1296, + "39": 0.12924, + "40": 0.12739, + "41": 0.33311, + "42": 0.12916, + "43": 0.12923, + "44": 0.12827, + "45": 0.12448, + "46": 0.12337, + "47": 0.12316, + "48": 0.12962, + "49": 0.12832, + "50": 0.12865 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json new file mode 100644 index 00000000000..8bf73ebcf59 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.84163, + "2": 10.85598, + "3": 10.84413, + "4": 10.84124, + "5": 10.85872, + "6": 10.86316, + "7": 10.85184, + "8": 10.84645, + "9": 10.85647, + "10": 10.81849, + "11": 10.85923, + "12": 10.84285, + "13": 10.86432, + "14": 10.85423, + "15": 10.81015, + "16": 10.81588, + "17": 10.78949, + "18": 10.79683, + "19": 10.79073, + "20": 10.70819, + "21": 10.69322, + "22": 10.58504, + "23": 10.70217, + "24": 10.60546, + "25": 10.57102, + "26": 10.61967, + "27": 10.61501, + "28": 10.56369, + "29": 10.56725, + "30": 10.39695, + "31": 10.16591, + "32": 10.4573, + "33": 10.45199, + "34": 10.2392, + "35": 10.28351, + "36": 10.24677, + "37": 10.3427, + "38": 10.20546, + "39": 10.39187, + "40": 10.09767, + "41": 10.1526, + "42": 10.21051, + "43": 9.87726, + "44": 9.98291, + "45": 9.86165, + "46": 9.83587, + "47": 10.13369, + "48": 9.87212, + "49": 9.56121, + "50": 9.91045, + "51": 9.85839, + "52": 9.7506, + "53": 10.05817, + "54": 9.96076, + "55": 9.88738, + "56": 9.6344, + "57": 9.4967, + "58": 9.83343, + "59": 9.59391, + "60": 9.51376, + "61": 9.69928, + "62": 9.98089, + "63": 9.39065, + "64": 9.77599, + "65": 8.9571, + "66": 9.70054, + "67": 9.37, + "68": 9.78529, + "69": 9.78966, + "70": 9.74676, + "71": 9.61906, + "72": 9.58963, + "73": 9.49629, + "74": 8.94963, + "75": 9.42381, + "76": 9.07799, + "77": 10.07105, + "78": 9.72632, + "79": 9.37966, + "80": 9.40721, + "81": 9.48238, + "82": 9.70152, + "83": 9.30657, + "84": 9.41464, + "85": 9.61784, + "86": 9.08212, + "87": 9.59511, + "88": 9.75008, + "89": 9.60356, + "90": 9.82256, + "91": 9.33721, + "92": 9.35861, + "93": 9.07956, + "94": 8.83268, + "95": 9.51351, + "96": 9.52947, + "97": 9.31813, + "98": 9.67451, + "99": 8.88607, + "100": 9.40106 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1736.0, + "2": 1692.0, + "3": 1695.0, + "4": 1761.0, + "5": 1955.0, + "6": 1791.0, + "7": 1943.0, + "8": 1681.0, + "9": 1884.0, + "10": 1441.0, + "11": 1942.0, + "12": 1786.0, + "13": 1940.0, + "14": 1862.0, + "15": 1907.0, + "16": 1947.0, + "17": 1827.0, + "18": 1907.0, + "19": 1818.0, + "20": 1700.0, + "21": 1911.0, + "22": 1720.0, + "23": 1938.0, + "24": 1707.0, + "25": 1686.0, + "26": 1792.0, + "27": 1891.0, + "28": 1976.0, + "29": 1958.0, + "30": 1941.0, + "31": 1622.0, + "32": 1970.0, + "33": 2129.0, + "34": 1830.0, + "35": 1907.0, + "36": 1892.0, + "37": 2395.0, + "38": 2161.0, + "39": 2493.0, + "40": 2224.0, + "41": 2201.0, + "42": 2175.0, + "43": 1920.0, + "44": 1955.0, + "45": 1956.0, + "46": 2166.0, + "47": 2517.0, + "48": 2272.0, + "49": 2211.0, + "50": 2232.0, + "51": 2621.0, + "52": 2597.0, + "53": 2926.0, + "54": 2633.0, + "55": 2206.0, + "56": 2627.0, + "57": 2328.0, + "58": 2886.0, + "59": 2639.0, + "60": 2157.0, + "61": 2736.0, + "62": 2544.0, + "63": 2332.0, + "64": 2948.0, + "65": 2630.0, + "66": 2931.0, + "67": 2717.0, + "68": 2643.0, + "69": 2955.0, + "70": 3040.0, + "71": 2882.0, + "72": 2390.0, + "73": 2812.0, + "74": 1844.0, + "75": 2461.0, + "76": 3067.0, + "77": 3152.0, + "78": 3018.0, + "79": 3008.0, + "80": 3104.0, + "81": 3589.0, + "82": 3218.0, + "83": 2748.0, + "84": 3217.0, + "85": 3167.0, + "86": 2876.0, + "87": 3604.0, + "88": 3017.0, + "89": 3249.0, + "90": 3069.0, + "91": 2865.0, + "92": 3074.0, + "93": 2680.0, + "94": 3392.0, + "95": 3206.0, + "96": 3401.0, + "97": 3107.0, + "98": 3624.0, + "99": 3007.0, + "100": 3111.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 299204096.0, + "2": 299204096.0, + "3": 299204096.0, + "4": 299204096.0, + "5": 299204096.0, + "6": 299204096.0, + "7": 299204096.0, + "8": 299204096.0, + "9": 299204096.0, + "10": 299204096.0, + "11": 299204096.0, + "12": 299204096.0, + "13": 299204096.0, + "14": 299204096.0, + "15": 299204096.0, + "16": 299204096.0, + "17": 299204096.0, + "18": 299204096.0, + "19": 299204096.0, + "20": 299204096.0, + "21": 299204096.0, + "22": 299204096.0, + "23": 299204096.0, + "24": 299204096.0, + "25": 299204096.0, + "26": 299204096.0, + "27": 299204096.0, + "28": 299204096.0, + "29": 299204096.0, + "30": 299204096.0, + "31": 299204096.0, + "32": 299204096.0, + "33": 299204096.0, + "34": 299204096.0, + "35": 299204096.0, + "36": 299204096.0, + "37": 299204096.0, + "38": 299204096.0, + "39": 299204096.0, + "40": 299204096.0, + "41": 299204096.0, + "42": 299204096.0, + "43": 299204096.0, + "44": 299204096.0, + "45": 299204096.0, + "46": 299204096.0, + "47": 299204096.0, + "48": 299204096.0, + "49": 299204096.0, + "50": 299204096.0, + "51": 299204096.0, + "52": 299204096.0, + "53": 299204096.0, + "54": 299204096.0, + "55": 299204096.0, + "56": 299204096.0, + "57": 299204096.0, + "58": 299204096.0, + "59": 299204096.0, + "60": 299204096.0, + "61": 299204096.0, + "62": 299204096.0, + "63": 299204096.0, + "64": 299204096.0, + "65": 299204096.0, + "66": 299204096.0, + "67": 299204096.0, + "68": 299204096.0, + "69": 299204096.0, + "70": 299204096.0, + "71": 299204096.0, + "72": 299204096.0, + "73": 299204096.0, + "74": 299204096.0, + "75": 299204096.0, + "76": 299204096.0, + "77": 299204096.0, + "78": 299204096.0, + "79": 299204096.0, + "80": 299204096.0, + "81": 299204096.0, + "82": 299204096.0, + "83": 299204096.0, + "84": 299204096.0, + "85": 299204096.0, + "86": 299204096.0, + "87": 299204096.0, + "88": 299204096.0, + "89": 299204096.0, + "90": 299204096.0, + "91": 299204096.0, + "92": 299204096.0, + "93": 299204096.0, + "94": 299204096.0, + "95": 299204096.0, + "96": 299204096.0, + "97": 299204096.0, + "98": 299204096.0, + "99": 299204096.0, + "100": 299204096.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 977520128.0, + "2": 1042465280.0, + "3": 1042465280.0, + "4": 1042465280.0, + "5": 1042465280.0, + "6": 1042465280.0, + "7": 1042465280.0, + "8": 1042465280.0, + "9": 1042465280.0, + "10": 1042465280.0, + "11": 1042465280.0, + "12": 1042465280.0, + "13": 1042465280.0, + "14": 1042465280.0, + "15": 1042465280.0, + "16": 1042465280.0, + "17": 1042465280.0, + "18": 1042465280.0, + "19": 1042465280.0, + "20": 1042465280.0, + "21": 1042465280.0, + "22": 1042465280.0, + "23": 1042465280.0, + "24": 1042465280.0, + "25": 1042465280.0, + "26": 1042465280.0, + "27": 1042465280.0, + "28": 1042465280.0, + "29": 1042465280.0, + "30": 1042465280.0, + "31": 1042465280.0, + "32": 1042465280.0, + "33": 1042465280.0, + "34": 1042465280.0, + "35": 1042465280.0, + "36": 1042465280.0, + "37": 1042465280.0, + "38": 1042465280.0, + "39": 1042465280.0, + "40": 1042465280.0, + "41": 1042465280.0, + "42": 1042465280.0, + "43": 1042465280.0, + "44": 1042465280.0, + "45": 1042465280.0, + "46": 1042465280.0, + "47": 1042465280.0, + "48": 1042465280.0, + "49": 1042465280.0, + "50": 1042465280.0, + "51": 1042465280.0, + "52": 1042465280.0, + "53": 1042465280.0, + "54": 1042465280.0, + "55": 1042465280.0, + "56": 1042465280.0, + "57": 1042465280.0, + "58": 1042465280.0, + "59": 1042465280.0, + "60": 1042465280.0, + "61": 1042465280.0, + "62": 1042465280.0, + "63": 1042465280.0, + "64": 1042465280.0, + "65": 1042465280.0, + "66": 1042465280.0, + "67": 1042465280.0, + "68": 1042465280.0, + "69": 1042465280.0, + "70": 1042465280.0, + "71": 1042465280.0, + "72": 1042465280.0, + "73": 1042465280.0, + "74": 1042465280.0, + "75": 1042465280.0, + "76": 1042465280.0, + "77": 1042465280.0, + "78": 1042465280.0, + "79": 1042465280.0, + "80": 1042465280.0, + "81": 1042465280.0, + "82": 1042465280.0, + "83": 1042465280.0, + "84": 1042465280.0, + "85": 1042465280.0, + "86": 1042465280.0, + "87": 1042465280.0, + "88": 1042465280.0, + "89": 1042465280.0, + "90": 1042465280.0, + "91": 1042465280.0, + "92": 1042465280.0, + "93": 1042465280.0, + "94": 1042465280.0, + "95": 1042465280.0, + "96": 1042465280.0, + "97": 1042465280.0, + "98": 1042465280.0, + "99": 1042465280.0, + "100": 1042465280.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 7.4943, + "2": 0.30777, + "3": 0.28744, + "4": 0.28478, + "5": 0.28355, + "6": 0.28205, + "7": 0.28648, + "8": 0.28145, + "9": 0.28384, + "10": 0.28181, + "11": 0.28279, + "12": 0.29109, + "13": 0.28824, + "14": 0.28545, + "15": 0.28902, + "16": 0.28736, + "17": 0.28857, + "18": 0.28805, + "19": 0.28819, + "20": 0.28484, + "21": 0.28898, + "22": 0.28201, + "23": 0.29011, + "24": 0.28393, + "25": 0.29706, + "26": 0.30988, + "27": 0.2925, + "28": 0.28946, + "29": 0.29323, + "30": 0.29381, + "31": 0.29538, + "32": 0.28808, + "33": 0.30043, + "34": 0.29302, + "35": 0.2845, + "36": 0.28795, + "37": 0.28827, + "38": 0.2899, + "39": 0.29094, + "40": 0.28938, + "41": 0.28856, + "42": 0.29185, + "43": 0.28692, + "44": 0.28562, + "45": 0.28753, + "46": 0.29142, + "47": 0.29037, + "48": 0.28879, + "49": 0.28294, + "50": 0.28321, + "51": 0.30977, + "52": 8.12602, + "53": 5.69198, + "54": 4.43736, + "55": 5.06277, + "56": 5.45623, + "57": 5.46825, + "58": 7.06638, + "59": 4.24603, + "60": 8.21666, + "61": 4.4828, + "62": 6.62355, + "63": 5.55937, + "64": 3.34027, + "65": 5.0081, + "66": 4.41115, + "67": 4.97292, + "68": 4.81, + "69": 5.36112, + "70": 5.8305, + "71": 3.63336, + "72": 8.33029, + "73": 3.31876, + "74": 4.77939, + "75": 5.56427, + "76": 6.70233, + "77": 4.87125, + "78": 3.17949, + "79": 4.79331, + "80": 5.00405, + "81": 4.17384, + "82": 5.59422, + "83": 6.29678, + "84": 3.92285, + "85": 4.83815, + "86": 3.89693, + "87": 3.12272, + "88": 4.27964, + "89": 4.13974, + "90": 3.51718, + "91": 3.66628, + "92": 4.80546, + "93": 4.94171, + "94": 2.69087, + "95": 4.90083, + "96": 5.10401, + "97": 4.90487, + "98": 3.9353, + "99": 3.9083, + "100": 3.6134 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_dgxc.json new file mode 100644 index 00000000000..13b71c1d7f0 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_dgxc.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04722, + "2": 11.03572, + "3": 9.58802, + "4": 9.25807, + "5": 9.46595, + "6": 9.99646, + "7": 9.50952, + "8": 8.97596, + "9": 8.64768, + "10": 9.40103, + "11": 8.86556, + "12": 8.63563, + "13": 8.52125, + "14": 8.08824, + "15": 8.1958, + "16": 8.22112, + "17": 8.14098, + "18": 7.8386, + "19": 8.23438, + "20": 7.95361, + "21": 7.62549, + "22": 7.60352, + "23": 7.47957, + "24": 7.46573, + "25": 7.70343, + "26": 7.10719, + "27": 7.64313, + "28": 7.34582, + "29": 7.5169, + "30": 7.67511, + "31": 7.41799, + "32": 7.61213, + "33": 7.66582, + "34": 7.73101, + "35": 7.23081, + "36": 7.10765, + "37": 7.4476, + "38": 7.21053, + "39": 7.57508, + "40": 7.5662, + "41": 7.51605, + "42": 7.27243, + "43": 7.25706, + "44": 7.44, + "45": 7.21244, + "46": 6.92421, + "47": 7.32604, + "48": 7.17147, + "49": 7.62154, + "50": 7.0624 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38802612.0, + "2": 38543656.0, + "3": 38739356.0, + "4": 273649600.0, + "5": 252887040.0, + "6": 255692384.0, + "7": 598483264.0, + "8": 787737984.0, + "9": 696133120.0, + "10": 505146368.0, + "11": 718888640.0, + "12": 872597184.0, + "13": 947495104.0, + "14": 1076398976.0, + "15": 856390592.0, + "16": 1048635648.0, + "17": 831370688.0, + "18": 963679552.0, + "19": 970018240.0, + "20": 935737344.0, + "21": 904189312.0, + "22": 887937280.0, + "23": 894777856.0, + "24": 703744192.0, + "25": 909232512.0, + "26": 875633216.0, + "27": 894981376.0, + "28": 919242816.0, + "29": 931351552.0, + "30": 929784768.0, + "31": 941621376.0, + "32": 885000768.0, + "33": 828484096.0, + "34": 822284800.0, + "35": 832032128.0, + "36": 787939392.0, + "37": 770719808.0, + "38": 561204672.0, + "39": 617201536.0, + "40": 695374592.0, + "41": 698978816.0, + "42": 692913728.0, + "43": 668003776.0, + "44": 673780992.0, + "45": 631182912.0, + "46": 444613312.0, + "47": 591957824.0, + "48": 617363968.0, + "49": 585295808.0, + "50": 570423872.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6637272576.0, + "2": 6637274624.0, + "3": 6637274624.0, + "4": 6637274624.0, + "5": 6637274624.0, + "6": 6637274624.0, + "7": 6637274624.0, + "8": 6637274624.0, + "9": 6637274624.0, + "10": 6637274624.0, + "11": 6637274624.0, + "12": 6637274624.0, + "13": 6637274624.0, + "14": 6637274624.0, + "15": 6637274624.0, + "16": 6637274624.0, + "17": 6637274624.0, + "18": 6637274624.0, + "19": 6637274624.0, + "20": 6637274624.0, + "21": 6637274624.0, + "22": 6637274624.0, + "23": 6637274624.0, + "24": 6637274624.0, + "25": 6637274624.0, + "26": 6637274624.0, + "27": 6637274624.0, + "28": 6637274624.0, + "29": 6637274624.0, + "30": 6637274624.0, + "31": 6637274624.0, + "32": 6637274624.0, + "33": 6637274624.0, + "34": 6637274624.0, + "35": 6637274624.0, + "36": 6637274624.0, + "37": 6637274624.0, + "38": 6637274624.0, + "39": 6637274624.0, + "40": 6637274624.0, + "41": 6637274624.0, + "42": 6637274624.0, + "43": 6637274624.0, + "44": 6637274624.0, + "45": 6637274624.0, + "46": 6637274624.0, + "47": 6637274624.0, + "48": 6637274624.0, + "49": 6637274624.0, + "50": 6637274624.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 55056003072.0, + "2": 57810763776.0, + "3": 57920647168.0, + "4": 57920647168.0, + "5": 57920647168.0, + "6": 57920647168.0, + "7": 57920647168.0, + "8": 57920647168.0, + "9": 57920647168.0, + "10": 57920647168.0, + "11": 57920647168.0, + "12": 57920647168.0, + "13": 57920647168.0, + "14": 57920647168.0, + "15": 57920647168.0, + "16": 57920647168.0, + "17": 57920647168.0, + "18": 57920647168.0, + "19": 57920647168.0, + "20": 57920647168.0, + "21": 57920647168.0, + "22": 57920647168.0, + "23": 57920647168.0, + "24": 57920647168.0, + "25": 57920647168.0, + "26": 57920647168.0, + "27": 57920647168.0, + "28": 57920647168.0, + "29": 57920647168.0, + "30": 57920647168.0, + "31": 57920647168.0, + "32": 57920647168.0, + "33": 57920647168.0, + "34": 57961472000.0, + "35": 57961472000.0, + "36": 57961472000.0, + "37": 57961472000.0, + "38": 57961472000.0, + "39": 57961472000.0, + "40": 57961472000.0, + "41": 57961472000.0, + "42": 57961472000.0, + "43": 57961472000.0, + "44": 57961472000.0, + "45": 57961472000.0, + "46": 57961472000.0, + "47": 57961472000.0, + "48": 57961472000.0, + "49": 57961472000.0, + "50": 57961472000.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07648, + "2": 11.07404, + "3": 10.53854, + "4": 10.09813, + "5": 9.81166, + "6": 10.09741, + "7": 9.79481, + "8": 9.0642, + "9": 8.86016, + "10": 9.34039, + "11": 8.51318, + "12": 8.59467, + "13": 8.5292, + "14": 7.95757, + "15": 8.06962, + "16": 8.11802, + "17": 8.06993, + "18": 7.80587, + "19": 8.19192, + "20": 7.8906, + "21": 7.57063, + "22": 7.55091, + "23": 7.41606, + "24": 7.42454, + "25": 7.65274, + "26": 7.05583, + "27": 7.59747, + "28": 7.29984, + "29": 7.472, + "30": 7.61908, + "31": 7.35179, + "32": 7.52979, + "33": 7.59161, + "34": 7.66287, + "35": 7.17383, + "36": 7.04133, + "37": 7.37081, + "38": 7.1443, + "39": 7.50879, + "40": 7.48921, + "41": 7.43802, + "42": 7.19405, + "43": 7.17581, + "44": 7.35785, + "45": 7.13985, + "46": 6.84014, + "47": 7.25094, + "48": 7.09407, + "49": 7.52321, + "50": 6.98987 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 86.39826, + "2": 1.422, + "3": 10.22559, + "4": 14.42033, + "5": 8.84175, + "6": 7.82667, + "7": 11.39742, + "8": 6.95883, + "9": 9.03821, + "10": 10.04724, + "11": 6.73176, + "12": 10.40096, + "13": 8.09212, + "14": 12.48417, + "15": 10.47434, + "16": 5.38933, + "17": 9.91136, + "18": 12.5031, + "19": 3.69959, + "20": 6.47676, + "21": 8.9867, + "22": 6.26614, + "23": 14.73195, + "24": 5.95294, + "25": 7.82357, + "26": 1.13211, + "27": 10.86033, + "28": 5.6863, + "29": 8.4589, + "30": 11.41315, + "31": 8.85024, + "32": 4.72753, + "33": 8.44604, + "34": 10.74723, + "35": 6.95053, + "36": 6.82478, + "37": 7.84389, + "38": 9.46014, + "39": 8.6244, + "40": 5.78378, + "41": 6.9593, + "42": 5.09864, + "43": 8.81575, + "44": 6.08546, + "45": 10.08201, + "46": 6.04881, + "47": 7.73914, + "48": 7.18802, + "49": 7.82111, + "50": 7.94794 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json new file mode 100644 index 00000000000..3f2294f2670 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.81442, + "2": 10.81882, + "3": 10.81551, + "4": 10.80292, + "5": 10.85144, + "6": 10.85011, + "7": 10.83867, + "8": 10.83952, + "9": 10.82213, + "10": 10.77746, + "11": 10.86426, + "12": 10.83689, + "13": 10.85831, + "14": 10.86354, + "15": 10.79774, + "16": 10.79537, + "17": 10.77155, + "18": 10.78908, + "19": 10.78343, + "20": 10.71629, + "21": 10.6835, + "22": 10.53061, + "23": 10.69849, + "24": 10.58571, + "25": 10.52397, + "26": 10.58327, + "27": 10.60963, + "28": 10.57207, + "29": 10.59012, + "30": 10.35613, + "31": 10.09392, + "32": 10.45887, + "33": 10.45644, + "34": 10.20494, + "35": 10.26735, + "36": 10.22333, + "37": 10.35299, + "38": 10.19476, + "39": 10.41731, + "40": 10.08948, + "41": 10.12721, + "42": 10.21207, + "43": 9.8313, + "44": 9.96936, + "45": 9.83601, + "46": 9.81666, + "47": 10.1539, + "48": 9.85279, + "49": 9.53447, + "50": 9.91909, + "51": 9.85364, + "52": 9.74286, + "53": 10.07155, + "54": 9.96279, + "55": 9.88223, + "56": 9.63465, + "57": 9.48633, + "58": 9.84878, + "59": 9.58904, + "60": 9.51094, + "61": 9.7032, + "62": 9.99637, + "63": 9.40044, + "64": 9.78465, + "65": 8.95366, + "66": 9.71808, + "67": 9.36931, + "68": 9.79818, + "69": 9.79667, + "70": 9.74899, + "71": 9.63213, + "72": 9.59956, + "73": 9.50308, + "74": 8.95202, + "75": 9.43084, + "76": 9.09067, + "77": 10.08102, + "78": 9.73521, + "79": 9.38853, + "80": 9.41418, + "81": 9.48403, + "82": 9.70907, + "83": 9.3152, + "84": 9.41838, + "85": 9.62222, + "86": 9.07945, + "87": 9.59202, + "88": 9.74953, + "89": 9.60441, + "90": 9.82577, + "91": 9.34232, + "92": 9.35837, + "93": 9.07969, + "94": 8.82793, + "95": 9.50864, + "96": 9.52117, + "97": 9.30605, + "98": 9.6658, + "99": 8.87716, + "100": 9.38997 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 5488.0, + "2": 5704.0, + "3": 5788.0, + "4": 5853.0, + "5": 6401.0, + "6": 6686.0, + "7": 5949.0, + "8": 5811.0, + "9": 6280.0, + "10": 5192.0, + "11": 6645.0, + "12": 6193.0, + "13": 6525.0, + "14": 6487.0, + "15": 6258.0, + "16": 6261.0, + "17": 6080.0, + "18": 5901.0, + "19": 6228.0, + "20": 5713.0, + "21": 6265.0, + "22": 5788.0, + "23": 6618.0, + "24": 6159.0, + "25": 5674.0, + "26": 6218.0, + "27": 6180.0, + "28": 6802.0, + "29": 7006.0, + "30": 6195.0, + "31": 5847.0, + "32": 6680.0, + "33": 7327.0, + "34": 6433.0, + "35": 6593.0, + "36": 6717.0, + "37": 7545.0, + "38": 7130.0, + "39": 7928.0, + "40": 7233.0, + "41": 7093.0, + "42": 7653.0, + "43": 7136.0, + "44": 7113.0, + "45": 7167.0, + "46": 7435.0, + "47": 7501.0, + "48": 7648.0, + "49": 7520.0, + "50": 7701.0, + "51": 7847.0, + "52": 7828.0, + "53": 8765.0, + "54": 8799.0, + "55": 7683.0, + "56": 7972.0, + "57": 7642.0, + "58": 8419.0, + "59": 8276.0, + "60": 7917.0, + "61": 8598.0, + "62": 8394.0, + "63": 7896.0, + "64": 9047.0, + "65": 8280.0, + "66": 9315.0, + "67": 8277.0, + "68": 8341.0, + "69": 8737.0, + "70": 9764.0, + "71": 9050.0, + "72": 9036.0, + "73": 9076.0, + "74": 6969.0, + "75": 7833.0, + "76": 8450.0, + "77": 13505.0, + "78": 9634.0, + "79": 13982.0, + "80": 11548.0, + "81": 10035.0, + "82": 9732.0, + "83": 9037.0, + "84": 9522.0, + "85": 46479.0, + "86": 8626.0, + "87": 11964.0, + "88": 9637.0, + "89": 10273.0, + "90": 11256.0, + "91": 8811.0, + "92": 9218.0, + "93": 8281.0, + "94": 9390.0, + "95": 9376.0, + "96": 13248.0, + "97": 8945.0, + "98": 10682.0, + "99": 15485.0, + "100": 9101.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 628643840.0, + "2": 628644864.0, + "3": 628644864.0, + "4": 628644864.0, + "5": 628644864.0, + "6": 628644864.0, + "7": 628644864.0, + "8": 628644864.0, + "9": 628644864.0, + "10": 628644864.0, + "11": 628644864.0, + "12": 628644864.0, + "13": 628644864.0, + "14": 628644864.0, + "15": 628644864.0, + "16": 628644864.0, + "17": 628644864.0, + "18": 628644864.0, + "19": 628644864.0, + "20": 628644864.0, + "21": 628644864.0, + "22": 628644864.0, + "23": 628644864.0, + "24": 628644864.0, + "25": 628644864.0, + "26": 628644864.0, + "27": 628644864.0, + "28": 628644864.0, + "29": 628644864.0, + "30": 628644864.0, + "31": 628644864.0, + "32": 628644864.0, + "33": 628644864.0, + "34": 628644864.0, + "35": 628644864.0, + "36": 628644864.0, + "37": 628644864.0, + "38": 628644864.0, + "39": 628644864.0, + "40": 628644864.0, + "41": 628644864.0, + "42": 628644864.0, + "43": 628644864.0, + "44": 628644864.0, + "45": 628644864.0, + "46": 628644864.0, + "47": 628644864.0, + "48": 628644864.0, + "49": 628644864.0, + "50": 628644864.0, + "51": 628644864.0, + "52": 628644864.0, + "53": 628644864.0, + "54": 628644864.0, + "55": 628644864.0, + "56": 628644864.0, + "57": 628644864.0, + "58": 628644864.0, + "59": 628644864.0, + "60": 628644864.0, + "61": 628644864.0, + "62": 628644864.0, + "63": 628644864.0, + "64": 628644864.0, + "65": 628644864.0, + "66": 628644864.0, + "67": 628644864.0, + "68": 628644864.0, + "69": 628644864.0, + "70": 628644864.0, + "71": 628644864.0, + "72": 628644864.0, + "73": 628644864.0, + "74": 628644864.0, + "75": 628644864.0, + "76": 628644864.0, + "77": 628644864.0, + "78": 628644864.0, + "79": 628644864.0, + "80": 628644864.0, + "81": 628644864.0, + "82": 628644864.0, + "83": 628644864.0, + "84": 628644864.0, + "85": 628644864.0, + "86": 628644864.0, + "87": 628644864.0, + "88": 628644864.0, + "89": 628644864.0, + "90": 628644864.0, + "91": 628644864.0, + "92": 628644864.0, + "93": 628644864.0, + "94": 628644864.0, + "95": 628644864.0, + "96": 628644864.0, + "97": 628644864.0, + "98": 628644864.0, + "99": 628644864.0, + "100": 628644864.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 982153216.0, + "2": 1149395968.0, + "3": 1149395968.0, + "4": 1155440128.0, + "5": 1155440128.0, + "6": 1155440128.0, + "7": 1155440128.0, + "8": 1155440128.0, + "9": 1155440128.0, + "10": 1155440128.0, + "11": 1155440128.0, + "12": 1155440128.0, + "13": 1155440128.0, + "14": 1155440128.0, + "15": 1155440128.0, + "16": 1155440128.0, + "17": 1155440128.0, + "18": 1155440128.0, + "19": 1155440128.0, + "20": 1155440128.0, + "21": 1155440128.0, + "22": 1155440128.0, + "23": 1155440128.0, + "24": 1155440128.0, + "25": 1155440128.0, + "26": 1155440128.0, + "27": 1155440128.0, + "28": 1155440128.0, + "29": 1155440128.0, + "30": 1155440128.0, + "31": 1155440128.0, + "32": 1155440128.0, + "33": 1155440128.0, + "34": 1155440128.0, + "35": 1155440128.0, + "36": 1155440128.0, + "37": 1155440128.0, + "38": 1155440128.0, + "39": 1155440128.0, + "40": 1155440128.0, + "41": 1155440128.0, + "42": 1155440128.0, + "43": 1155440128.0, + "44": 1155440128.0, + "45": 1155440128.0, + "46": 1155440128.0, + "47": 1155440128.0, + "48": 1155440128.0, + "49": 1155440128.0, + "50": 1155440128.0, + "51": 1155440128.0, + "52": 1155440128.0, + "53": 1155440128.0, + "54": 1155440128.0, + "55": 1155440128.0, + "56": 1155440128.0, + "57": 1155440128.0, + "58": 1155440128.0, + "59": 1155440128.0, + "60": 1155999232.0, + "61": 1159285760.0, + "62": 1159285760.0, + "63": 1159285760.0, + "64": 1159285760.0, + "65": 1159285760.0, + "66": 1159285760.0, + "67": 1159285760.0, + "68": 1159285760.0, + "69": 1159285760.0, + "70": 1159285760.0, + "71": 1159285760.0, + "72": 1159285760.0, + "73": 1159285760.0, + "74": 1159285760.0, + "75": 1159285760.0, + "76": 1164709376.0, + "77": 1164709376.0, + "78": 1164709376.0, + "79": 1164709376.0, + "80": 1164709376.0, + "81": 1164709376.0, + "82": 1164709376.0, + "83": 1164709376.0, + "84": 1164709376.0, + "85": 1164709376.0, + "86": 1164709376.0, + "87": 1164709376.0, + "88": 1164709376.0, + "89": 1164709376.0, + "90": 1164709376.0, + "91": 1164709376.0, + "92": 1164709376.0, + "93": 1164709376.0, + "94": 1164709376.0, + "95": 1164709376.0, + "96": 1164709376.0, + "97": 1164709376.0, + "98": 1164709376.0, + "99": 1164709376.0, + "100": 1164709376.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 14.68355, + "2": 0.90574, + "3": 0.83204, + "4": 0.80726, + "5": 0.81604, + "6": 0.80698, + "7": 0.79149, + "8": 0.78879, + "9": 5.49279, + "10": 7.0174, + "11": 5.92452, + "12": 5.81078, + "13": 7.25845, + "14": 4.93946, + "15": 4.83531, + "16": 4.47779, + "17": 6.18406, + "18": 6.12945, + "19": 10.25032, + "20": 7.44996, + "21": 6.16308, + "22": 9.83266, + "23": 6.97961, + "24": 8.73643, + "25": 7.58409, + "26": 6.5798, + "27": 9.71829, + "28": 7.38708, + "29": 8.61355, + "30": 7.20245, + "31": 7.15976, + "32": 10.8435, + "33": 7.30066, + "34": 4.75923, + "35": 7.80722, + "36": 7.65565, + "37": 8.21042, + "38": 7.29325, + "39": 8.35765, + "40": 9.13683, + "41": 9.17388, + "42": 8.76786, + "43": 6.60222, + "44": 9.37932, + "45": 8.70546, + "46": 7.2996, + "47": 7.24309, + "48": 8.69252, + "49": 6.05433, + "50": 8.17077, + "51": 5.49966, + "52": 8.23075, + "53": 7.32789, + "54": 8.08693, + "55": 6.13482, + "56": 7.89454, + "57": 6.91153, + "58": 10.68201, + "59": 4.20298, + "60": 10.28771, + "61": 9.10505, + "62": 8.665, + "63": 7.47889, + "64": 6.00947, + "65": 6.44144, + "66": 7.43135, + "67": 6.56432, + "68": 8.03943, + "69": 7.40803, + "70": 8.51347, + "71": 7.69153, + "72": 8.48279, + "73": 5.96062, + "74": 6.63264, + "75": 8.55139, + "76": 8.45504, + "77": 6.34534, + "78": 4.89292, + "79": 8.63417, + "80": 6.91863, + "81": 6.90981, + "82": 9.79368, + "83": 10.43847, + "84": 6.26228, + "85": 5.61723, + "86": 6.31752, + "87": 5.27251, + "88": 7.88452, + "89": 6.17004, + "90": 7.59884, + "91": 8.09035, + "92": 5.87686, + "93": 6.89489, + "94": 4.69639, + "95": 6.85708, + "96": 7.35569, + "97": 6.66015, + "98": 7.07087, + "99": 6.85994, + "100": 5.88721 + } + } +} \ No newline at end of file diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index 1aa1c560052..b3032eb15c4 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -22,6 +22,15 @@ @click.option("--container-image", required=True, type=str, help="Container image of the workload") @click.option("--data-dir", required=False, type=str, help="Data directory of the workload") @click.option("--tag", required=False, type=str, help="Tag of the workload") +@click.option( + "--enable-lightweight-mode", + is_flag=True, + show_default=True, + required=False, + type=bool, + default=False, + help="To enable lightweight mode", +) def main( scope, model, @@ -31,6 +40,7 @@ def main( container_image, data_dir: Optional[str] = None, tag: Optional[str] = None, + enable_lightweight_mode: Optional[bool] = False, ): workloads = recipe_parser.load_workloads( container_image="none", @@ -72,8 +82,9 @@ def main( env_vars={ "PYTHONUNBUFFERED": "1", "OUTPUT_PATH": os.getcwd(), - "ENABLE_LIGHTWEIGHT_MODE": "true", + "ENABLE_LIGHTWEIGHT_MODE": str(enable_lightweight_mode).lower(), "N_REPEAT": "1", + "CLUSTER": "dgxh100_dgxc", }, packager=run.Packager(), volumes=artifacts, From bec65822072a298c89937de67a778e1b76b54015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 16:04:53 +0000 Subject: [PATCH 038/248] ADLR/megatron-lm!4298 - ci: Refactor testsytem - Removal of JET Artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 4 +- .../workflows/build-test-publish-wheel.yml | 1 + .github/workflows/cicd-main.yml | 66 +- .github/workflows/copyright-check.yml | 3 +- .github/workflows/install-test.yml | 1 + .gitlab/stages/04.functional-tests.yml | 2 +- pyproject.toml | 1 - .../python_test_utils/common.py | 22 +- .../test_pretraining_regular_pipeline.py | 37 +- .../shell_test_utils/_run_training.sh | 6 +- .../shell_test_utils/run_ci_test.sh | 8 +- .../bert/bert_mcore_tp1_pp2/model_config.yaml | 2 +- .../bert_mcore_tp1_pp4_vp2/model_config.yaml | 2 +- .../bert/bert_mcore_tp2_pp2/model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../bert/bert_mcore_tp4_pp1/model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 0 .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../README.md | 0 .../model_config.yaml | 8 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_dgxc.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 287 + .../golden_values_dev_dgxh100_eos.json | 287 + .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 8 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_dgxc.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 8 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgxh100_coreweave.json | 287 - .../golden_values_dev_dgxh100_eos.json | 287 - .../model_config.yaml | 6 +- .../tp_comm_overlap_cfg.yaml | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgxh100_coreweave.json | 0 .../model_config.yaml | 6 +- .../tp_comm_overlap_cfg.yaml | 0 .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgxh100_coreweave.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 342 +- .../model_config.yaml | 2 +- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 5398 +++++++++-------- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 6 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 4 +- .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_h100.json | 0 .../model_config.yaml | 4 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 ...olden_values_lts_dgxa100_dracooci-ord.json | 0 .../golden_values_lts_dgxa100_dracooci.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_dgxc.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 6 +- .../model_config.yaml | 8 +- .../model_config.yaml | 2 +- .../model_config.yaml | 6 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 ...olden_values_dev_dgxa100_dracooci-ord.json | 0 .../golden_values_dev_dgxa100_dracooci.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../golden_values_dev_dgx_a100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgxh100_coreweave.json | 0 .../golden_values_dev_dgxh100_eos.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../model_config.yaml | 2 +- .../t5/t5_release/model_config.yaml | 2 +- .../golden_values_lts_dgx_a100.json | 0 .../golden_values_lts_dgx_a100.json | 0 .../python_scripts/launch_jet_workload.py | 1 + .../launch_nemo_run_workload.py | 64 +- tests/test_utils/recipes/ckpt_converter.yaml | 1 + .../gpt-dynamic-inference-cuda-graphs.yaml | 2 - ...pt-dynamic-inference-with-coordinator.yaml | 2 - tests/test_utils/recipes/gpt-grads.yaml | 4 +- tests/test_utils/recipes/gpt.yaml | 156 +- .../recipes/mamba-static-inference.yaml | 2 - .../recipes/moe-static-inference.yaml | 1 - tests/test_utils/recipes/moe.yaml | 30 +- .../test_utils/recipes/multimodal-llava.yaml | 4 +- uv.lock | 116 +- 870 files changed, 4255 insertions(+), 4159 deletions(-) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed => gpt3_7b_tp1_pp4_memory_speed}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed => gpt3_7b_tp1_pp4_memory_speed}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed => gpt3_7b_tp1_pp4_memory_speed}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed => gpt3_7b_tp1_pp4_memory_speed}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed => gpt3_7b_tp1_pp4_memory_speed}/model_config.yaml (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed => gpt3_7b_tp4_pp1_memory_speed}/model_config.yaml (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_disable => gpt3_mcore_reruns_disable}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_enable => gpt3_mcore_reruns_enable}/model_config.yaml (93%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_persistent_1 => gpt3_mcore_reruns_persistent_1}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_persistent_2 => gpt3_mcore_reruns_persistent_2}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_reshard => gpt3_mcore_reruns_reshard}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_resume => gpt3_mcore_reruns_resume}/model_config.yaml (93%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_resume_check_grads => gpt3_mcore_reruns_resume_check_grads}/README.md (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_resume_check_grads => gpt3_mcore_reruns_resume_check_grads}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_reruns_transient => gpt3_mcore_reruns_transient}/model_config.yaml (94%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_uniform_full_recompute}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_uniform_full_recompute}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp1_uniform_full_recompute}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_disable_bias_linear}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_disable_bias_linear}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_disable_bias_linear}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_sequence_parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_sequence_parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_sequence_parallel}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_swiglu}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_swiglu}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_swiglu}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_dev_dgxh100_dgxc.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (92%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/golden_values_dev_dgx_h100.json (100%) create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_eos.json rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_cp2_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_cp2_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_cp2_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume => gpt3_mcore_te_tp2_pp1_modelopt_distill_resume}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume => gpt3_mcore_te_tp2_pp1_modelopt_distill_resume}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume => gpt3_mcore_te_tp2_pp1_modelopt_distill_resume}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cp2_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion}/model_config.yaml (89%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_ddp_average_in_collective}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_ddp_average_in_collective}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_ddp_average_in_collective}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_mla}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_mla}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_mla}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_mla}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_mla}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_mmap_bin_files}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_mmap_bin_files}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_no_mmap_bin_files}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_dgxc.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G => gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te => gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather => gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 => gpt3_mcore_tp1_pp2}/model_config.yaml (89%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16 => gpt3_mcore_tp1_pp2_fp16}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16 => gpt3_mcore_tp1_pp2_fp16}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16 => gpt3_mcore_tp1_pp2_fp16}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist => gpt3_mcore_tp1_pp2_resume_torch_dist}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 => gpt3_mcore_tp1_pp4}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist => gpt3_mcore_tp1_pp4_resume_torch_dist}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_uninstall_te}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_uninstall_te}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G => gpt3_mcore_tp2_pp2_uninstall_te}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => gpt3_mcore_tp4_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => gpt3_mcore_tp4_pp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 => gpt3_mcore_tp4_pp1}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch => gpt3_mcore_tp4_pp1_resume_torch}/model_config.yaml (89%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist => gpt3_mcore_tp4_pp1_resume_torch_dist}/model_config.yaml (90%) delete mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap => gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap => gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap}/tp_comm_overlap_cfg.yaml (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp => gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp => gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline => gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline => gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline => gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline}/model_config.yaml (89%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel => gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp => gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp => gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp => gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap}/model_config.yaml (92%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap}/tp_comm_overlap_cfg.yaml (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp}/model_config.yaml (90%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp => gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp => gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap => gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap}/model_config.yaml (91%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/gpt/{gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp => gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp}/model_config.yaml (90%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last => gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G => gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8 => gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8 => gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8 => gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8 => gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8}/model_config.yaml (96%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph => gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph => gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph => gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph => gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph}/model_config.yaml (96%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental => gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental => gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental}/model_config.yaml (96%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G => gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last => gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_lts_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/golden_values_lts_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel => gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts => gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts}/golden_values_dev.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts => gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts}/golden_values_lts.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts => gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts}/model_config.yaml (91%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer}/model_config.yaml (92%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_dgxc.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/moe/{gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer => gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer}/model_config.yaml (92%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G => multimodal_llava_mcore_te_tp1_pp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/multimodal-llava/{multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G => multimodal_llava_mcore_te_tp4_sp_cp2}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_11b_mcore_tp4_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_11b_mcore_tp4_pp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_11b_mcore_tp4_pp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_11b_mcore_tp4_pp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_11b_mcore_tp4_pp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch => t5_mcore_te_tp1_pp1_vp1_resume_torch}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_mcore_te_tp2_pp1_vp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_mcore_te_tp2_pp1_vp1_sequence_parallel}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G => t5_mcore_te_tp4_pp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_te_tp4_pp1_resume_torch_dist}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 => t5_mcore_tp1_pp1_vp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch => t5_mcore_tp1_pp1_vp1_resume_torch}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgxa100_dracooci-ord.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgxa100_dracooci.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 => t5_mcore_tp2_pp1_vp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G => t5_mcore_tp4_pp1}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgx_h100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_coreweave.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/golden_values_dev_dgxh100_eos.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G => t5_mcore_tp4_pp1_resume_torch_dist}/model_config.yaml (98%) rename tests/functional_tests/test_cases/t5/{t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 => t5_weekly_mcore_te_tp2_pp1_vp1}/golden_values_lts_dgx_a100.json (100%) rename tests/functional_tests/test_cases/t5/{t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel => t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel}/golden_values_lts_dgx_a100.json (100%) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index d726fcabc9f..d2f43599182 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -173,6 +173,7 @@ runs: - name: Check result id: check shell: bash -x -e -u -o pipefail {0} + if: always() env: IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }} run: | @@ -210,7 +211,7 @@ runs: - name: Upload coverage uses: actions/upload-artifact@v4 - if: ${{ steps.check.outputs.coverage_report != 'none' }} + if: ${{ always() && steps.check.outputs.coverage_report != 'none' }} with: name: ${{ steps.check.outputs.coverage_report }} path: | @@ -220,6 +221,7 @@ runs: - name: Upload logs uses: actions/upload-artifact@v4 + if: always() with: name: ${{ steps.check.outputs.logs_report }} path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }} diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index 95795e67f60..0b6cdd7efdb 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -17,6 +17,7 @@ name: Build, test, and publish a PyPi wheel (to testpypi). on: push: branches: + - dev - main - "pull-request/[0-9]+" - "deploy-release/*" diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a56afb74c71..94d486f2fb5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -17,6 +17,7 @@ on: - cron: "0 */2 * * *" push: branches: + - dev - main - "pull-request/[0-9]+" - "deploy-release/*" @@ -31,6 +32,9 @@ permissions: id-token: write contents: read +env: + container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com + jobs: is-not-external-contributor: runs-on: ubuntu-latest @@ -140,6 +144,7 @@ jobs: fi pre-flight: + needs: [is-not-external-contributor] uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.0 linting: @@ -177,6 +182,8 @@ jobs: - name: Run linting run: | export PATH=".venv/bin:$PATH" + export GITLAB_ENDPOINT=github.com + export CI_PROJECT_NAMESPACE=NVIDIA export BASE_REF="${{ startsWith(github.ref, 'refs/heads/pull-request/') && fromJSON(steps.get-pr-info.outputs.pr-info).base.ref || 'HEAD~1' }}" export CHECK_ONLY=true export SKIP_DOCS=false @@ -232,10 +239,38 @@ jobs: python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets echo "::endgroup::" + - name: Install GH CLI + shell: bash + run: | + apt-get update + apt-get install -y gh + - name: Pull cache run: | - docker pull 766267172432.dkr.ecr.us-east-1.amazonaws.com/megatron-lm:main || true - docker pull 766267172432.dkr.ecr.us-east-1.amazonaws.com/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} || true + docker pull ${{ env.container-registry }}/megatron-lm:main || true + docker pull ${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} || true + + - name: Get last merged PR + id: cache_from + env: + GH_TOKEN: ${{ github.token }} + run: | + LAST_PRS=$(gh api graphql -f query=' + query { + repository(owner: "NVIDIA", name: "Megatron-LM") { + pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { + nodes { + number + } + } + } + }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do + echo "${{ env.container-registry }}/megatron-lm:$number" + done) + + echo "LAST_PRS< Callable: - def _func(y_pred: List[Union[float, int]], y_true: List[Union[float, int]]): - - return np.mean([np.mean(y_pred), np.mean(y_true)]) * rtol - - return _func - - class TypeOfTestResult(enum.Enum): APPROXIMATE = 1 DETERMINISTIC = 2 @@ -46,7 +38,6 @@ class NotDeterminsticError(Exception): class ApproximateTest(Test): atol: Union[int, float] = 0 - atol_func: Optional[Callable] = None rtol: float = 1e-5 @property @@ -58,16 +49,14 @@ def error_message(self, metric_name: str) -> NotApproximateError: class DeterministicTest(Test): - @property - def atol(self) -> Union[int, float]: - return 0 - - atol_func: Optional[Callable] = None - @property def rtol(self) -> float: return 0.0 + @property + def atol(self) -> Union[int, float]: + return 0 + @property def type_of_test_result(self) -> TypeOfTestResult: return TypeOfTestResult.DETERMINISTIC @@ -235,8 +224,7 @@ def pipeline( golden = np.array(golden_value_list) # Tolerance check - rtol = 0 if test.type_of_test_result == TypeOfTestResult.DETERMINISTIC else 0.10 - is_close = np.isclose(actual, golden, rtol=rtol, atol=0) + is_close = np.isclose(actual, golden, rtol=test.rtol, atol=test.atol) num_failing_steps_allowed = min(max(total_steps_evaluated // 100, 1), 50) passing = np.mean(is_close) >= (num_failing_steps_allowed / total_steps_evaluated) diff --git a/tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py b/tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py index a38016d1e50..db03d30f65a 100644 --- a/tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_pretraining_regular_pipeline.py @@ -9,35 +9,14 @@ logger = logging.getLogger(__name__) CHECK_THRESHOLDS = { - "iteration-time": [ - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0) - ], - "mem-allocated-bytes": [ - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0) - ], - "mem-max-allocated-bytes": [ - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0) - ], - "lm loss": [ - common.DeterministicTest(), - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0), - ], - "mtp_1 loss": [ - common.DeterministicTest(), - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0), - ], - "num-zeros": [ - common.DeterministicTest(), - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.20), rtol=0), - ], - "generated_tokens": [ - common.DeterministicTest(), - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0), - ], - "logprobs": [ - common.DeterministicTest(), - common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0), - ], + "iteration-time": [common.ApproximateTest(atol=0, rtol=0.25)], + "mem-allocated-bytes": [common.ApproximateTest(atol=0, rtol=0.05)], + "mem-max-allocated-bytes": [common.ApproximateTest(atol=0, rtol=0.05)], + "lm loss": [common.DeterministicTest(), common.ApproximateTest(atol=0, rtol=0.05)], + "mtp_1 loss": [common.DeterministicTest(), common.ApproximateTest(atol=0, rtol=0.05)], + "num-zeros": [common.DeterministicTest(), common.ApproximateTest(atol=0, rtol=0.05)], + "generated_tokens": [common.DeterministicTest(), common.ApproximateTest(atol=0, rtol=0.05)], + "logprobs": [common.DeterministicTest(), common.ApproximateTest(atol=0, rtol=0.05)], } diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index 5179c02c3b5..1d0e77a3477 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -8,7 +8,7 @@ set -euxo pipefail -echo "------ARGUMENTS LIST --------" +set +x for ARGUMENT in "$@"; do KEY=$(echo $ARGUMENT | cut -f1 -d=) @@ -18,7 +18,7 @@ for ARGUMENT in "$@"; do export "$KEY"="$VALUE" echo "$KEY=$VALUE" done -echo "---------------------------------" +set -x # Check that mandatory vars are set MANDATORY_VARS=( @@ -39,9 +39,11 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do fi done +set +x # Envsubst model_params cat $TRAINING_PARAMS_PATH | envsubst "$(env | cut -d= -f1 | sed -e 's/^/$/')" >$TRAINING_PARAMS_PATH.tmp TRAINING_PARAMS_PATH="$TRAINING_PARAMS_PATH.tmp" +set -x # Pull env vars to export ENV_VARS=$(/usr/local/bin/yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' "$TRAINING_PARAMS_PATH") diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 75cb4e619e7..5a6ea64f42d 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -8,9 +8,7 @@ ulimit -Sn $(ulimit -Hn) # Increase soft limit for number of processes to match hard limit ulimit -Su $(ulimit -Hu) -echo "------ARGUMENTS LIST --------" -# Use eval to properly handle quoted arguments -eval "set -- $@" +set +x for ARGUMENT in "$@"; do # Split on first = only, preserving any subsequent = signs in the value KEY="${ARGUMENT%%=*}" @@ -26,7 +24,7 @@ for ARGUMENT in "$@"; do export "$KEY"="$(eval echo $VALUE)" echo "$KEY=$VALUE" done -echo "---------------------------------" +set -x # Check that mandatory vars are set MANDATORY_VARS=( @@ -306,7 +304,7 @@ for i in $(seq 1 $N_REPEAT); do fi # For inference jobs - if [[ "$MODE" == "inference" ]]; then + if [[ "$MODE" == "inference" && ("$TRAINING_EXIT_CODE" -eq 0 || "$TEST_TYPE" == "release") ]]; then if [[ "$TEST_TYPE" == "frozen-start" ]]; then uv run --no-sync pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_inference_regular_pipeline.py \ --golden-values-path $GOLDEN_VALUES_PATH \ diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml index ede505eb2f4..60537ce8776 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml index e606d04a88c..0e908381456 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml index e7bb67a9ed8..f965ee1d9ef 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml index 6f38457cdd0..fc4c836c98a 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml index def6878c889..8974bc1ea24 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml index 8b993bfaec3..49135684124 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml index 05a3d0730c8..6c0dc550515 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml index 777be078e4d..e001ea4dc08 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --seq-length: 512 --max-position-embeddings: 512 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 990000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp1_pp4_memory_speed/model_config.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/model_config.yaml similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_7b_tp4_pp1_memory_speed/model_config.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_disable/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_disable/model_config.yaml index 14d585d84a7..2026f11ade2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_disable/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_disable/model_config.yaml @@ -64,11 +64,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_enable/model_config.yaml similarity index 93% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_enable/model_config.yaml index df91f9a95eb..41cb6561429 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_enable/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_enable/model_config.yaml @@ -63,11 +63,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml index 849df09f27f..9cd921e9833 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_1/model_config.yaml @@ -64,11 +64,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_2/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_2/model_config.yaml index 3316142031f..f902393d049 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_persistent_2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_persistent_2/model_config.yaml @@ -63,11 +63,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_reshard/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_reshard/model_config.yaml index 4b8d6a47b9c..2e82cad10a8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_reshard/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_reshard/model_config.yaml @@ -64,11 +64,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume/model_config.yaml similarity index 93% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume/model_config.yaml index 43937abe664..0abd4db698e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume/model_config.yaml @@ -63,11 +63,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/README.md b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/README.md similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/README.md rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/README.md diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml index e9c35d0e86d..582c9523f73 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_resume_check_grads/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_resume_check_grads/model_config.yaml @@ -86,15 +86,15 @@ BASE_MODEL_ARGS: &BASE_MODEL_ARGS --ckpt-format: torch_dist --dist-ckpt-strictness: log_all # backward compatibility for TE changes --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2 # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml similarity index 94% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml index 5021a029d3b..59a57e2212b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_reruns_transient/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_reruns_transient/model_config.yaml @@ -65,11 +65,11 @@ MODEL_ARGS: # data settings --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt # logging settings --tensorboard-dir: ${TENSORBOARD_PATH} - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/model_config.yaml index 8031bf55d8d..2d5e340fa6d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml index 5ed4553ad1d..c7b46ff9b8d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/model_config.yaml index 6eac7d0da72..82506115963 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/model_config.yaml index 750986482c7..4a5bf3d8fc7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml index f34c980d821..bb0708b11ef 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/model_config.yaml index 7c880daf577..a5dbe2157e5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_uniform_full_recompute/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml index 7f0958f94f2..4aeea406fb9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/model_config.yaml index 7271fe996d6..6d6bf2b5b94 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/model_config.yaml index 7c5a764ccb9..5e4131a43ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml index 2491fd02e96..c75d099790f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml index 58d4628f72d..ffabf9583f6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/model_config.yaml index 5fcf15a2c3e..b391387f9ff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/model_config.yaml index 6b66183c1dc..5415e3de96d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_disable_bias_linear/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/model_config.yaml index 089fd7808ff..8d372f5539d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml index 3d8843214a3..d91e9be4f54 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/model_config.yaml index 4dc43353c9f..7d069ce9ec6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/model_config.yaml index 7133af75b8f..ea882318c7e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/model_config.yaml index 1e29b79848b..d67dd6af765 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/model_config.yaml index 27d8203d307..1e25f4bd4e1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/model_config.yaml index bc0da950ac8..2d734908089 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/model_config.yaml index 962e08d5e73..319164782fc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_sequence_parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml index 8942fa94b55..a3a1a458739 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_swiglu/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/model_config.yaml index 7f6ae92394d..ea8f4bb71d0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/model_config.yaml index aa041fec6de..ea869246a7c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml index 65ea19f9bd8..767283cf2a1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/model_config.yaml index 99a04b44fe3..46ff13cb9a8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/model_config.yaml index a1150d0db09..5a1b1ce289d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 907c86da3b1..31ffc9c8111 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_dgxc.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_dgxc.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/model_config.yaml index 503e702c4f5..0bd25e79735 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml index c8d15bbf005..778e7d361b3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/model_config.yaml index 8db3c6529df..d502c3e1fef 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/model_config.yaml index 699ca43cc7b..edc9bc1ff2a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/model_config.yaml index 243a52e84bd..1b9c96b3f7d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml index b3a950dcb5e..fed75814df5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 0e71ea6c268..af06fe06903 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml index 6aa5a991e90..035549f8fb6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/model_config.yaml index 4907dfb7f4c..ef758e5639f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..0c1ce6fced4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.83936, + "2": 10.8442, + "3": 10.86813, + "4": 10.86022, + "5": 10.87939, + "6": 10.85969, + "7": 10.86386, + "8": 10.8444, + "9": 10.88995, + "10": 10.8926, + "11": 10.89136, + "12": 10.85312, + "13": 10.87319, + "14": 10.83805, + "15": 10.83088, + "16": 10.82011, + "17": 10.79138, + "18": 10.81055, + "19": 10.77977, + "20": 10.6635, + "21": 10.69765, + "22": 10.67421, + "23": 10.77344, + "24": 10.63919, + "25": 10.50497, + "26": 10.61911, + "27": 10.56921, + "28": 10.46859, + "29": 10.41119, + "30": 10.42916, + "31": 10.52553, + "32": 10.34942, + "33": 10.2967, + "34": 10.46909, + "35": 9.99632, + "36": 10.13945, + "37": 10.0434, + "38": 10.4139, + "39": 9.80941, + "40": 10.12495, + "41": 10.14883, + "42": 10.04042, + "43": 10.22142, + "44": 10.07348, + "45": 9.71369, + "46": 10.00449, + "47": 9.94758, + "48": 9.68856, + "49": 9.93637, + "50": 9.96042 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1026.0, + "2": 1184.0, + "3": 1226.0, + "4": 1248.0, + "5": 1259.0, + "6": 1421.0, + "7": 1182.0, + "8": 1036.0, + "9": 1293.0, + "10": 1319.0, + "11": 1212.0, + "12": 1373.0, + "13": 1327.0, + "14": 1121.0, + "15": 1217.0, + "16": 1163.0, + "17": 1246.0, + "18": 1280.0, + "19": 1128.0, + "20": 1019.0, + "21": 1147.0, + "22": 1156.0, + "23": 1341.0, + "24": 1312.0, + "25": 1066.0, + "26": 1138.0, + "27": 1270.0, + "28": 1260.0, + "29": 1292.0, + "30": 1532.0, + "31": 1477.0, + "32": 1460.0, + "33": 1537.0, + "34": 1513.0, + "35": 1235.0, + "36": 1316.0, + "37": 1466.0, + "38": 1564.0, + "39": 1380.0, + "40": 1513.0, + "41": 1633.0, + "42": 1509.0, + "43": 1731.0, + "44": 1636.0, + "45": 1501.0, + "46": 1884.0, + "47": 1567.0, + "48": 1631.0, + "49": 1825.0, + "50": 1639.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759682560.0, + "2": 759682560.0, + "3": 759682560.0, + "4": 759682560.0, + "5": 759682560.0, + "6": 759682560.0, + "7": 759682560.0, + "8": 759682560.0, + "9": 759682560.0, + "10": 759682560.0, + "11": 759682560.0, + "12": 759682560.0, + "13": 759682560.0, + "14": 759682560.0, + "15": 759682560.0, + "16": 759682560.0, + "17": 759682560.0, + "18": 759682560.0, + "19": 759682560.0, + "20": 759682560.0, + "21": 759682560.0, + "22": 759682560.0, + "23": 759682560.0, + "24": 759682560.0, + "25": 759682560.0, + "26": 759682560.0, + "27": 759682560.0, + "28": 759682560.0, + "29": 759682560.0, + "30": 759682560.0, + "31": 759682560.0, + "32": 759682560.0, + "33": 759682560.0, + "34": 759682560.0, + "35": 759682560.0, + "36": 759682560.0, + "37": 759682560.0, + "38": 759682560.0, + "39": 759682560.0, + "40": 759682560.0, + "41": 759682560.0, + "42": 759682560.0, + "43": 759682560.0, + "44": 759682560.0, + "45": 759682560.0, + "46": 759682560.0, + "47": 759682560.0, + "48": 759682560.0, + "49": 759682560.0, + "50": 759682560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4340903936.0, + "2": 4622615552.0, + "3": 4622615552.0, + "4": 4622615552.0, + "5": 4622615552.0, + "6": 4622615552.0, + "7": 4622615552.0, + "8": 4622615552.0, + "9": 4622615552.0, + "10": 4622615552.0, + "11": 4622615552.0, + "12": 4622615552.0, + "13": 4622615552.0, + "14": 4622615552.0, + "15": 4622615552.0, + "16": 4622615552.0, + "17": 4622615552.0, + "18": 4622615552.0, + "19": 4622615552.0, + "20": 4622615552.0, + "21": 4622615552.0, + "22": 4622615552.0, + "23": 4622615552.0, + "24": 4622615552.0, + "25": 4622615552.0, + "26": 4622615552.0, + "27": 4622615552.0, + "28": 4622615552.0, + "29": 4622615552.0, + "30": 4622615552.0, + "31": 4622615552.0, + "32": 4622615552.0, + "33": 4622615552.0, + "34": 4622615552.0, + "35": 4622615552.0, + "36": 4622615552.0, + "37": 4622615552.0, + "38": 4622615552.0, + "39": 4622615552.0, + "40": 4622615552.0, + "41": 4622615552.0, + "42": 4622615552.0, + "43": 4622615552.0, + "44": 4622615552.0, + "45": 4622615552.0, + "46": 4622615552.0, + "47": 4622615552.0, + "48": 4622615552.0, + "49": 4622615552.0, + "50": 4622615552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 14.98171, + "2": 0.13344, + "3": 0.10755, + "4": 0.10562, + "5": 0.10638, + "6": 0.10549, + "7": 0.10612, + "8": 0.10814, + "9": 0.10654, + "10": 0.10633, + "11": 0.10725, + "12": 0.10667, + "13": 0.10769, + "14": 0.10593, + "15": 0.10694, + "16": 0.10715, + "17": 0.1064, + "18": 0.10706, + "19": 0.10964, + "20": 0.1054, + "21": 0.10752, + "22": 0.10979, + "23": 0.10834, + "24": 0.10667, + "25": 0.10762, + "26": 0.10605, + "27": 0.10756, + "28": 0.1059, + "29": 0.10662, + "30": 0.10738, + "31": 0.1065, + "32": 0.1074, + "33": 0.10712, + "34": 0.10631, + "35": 0.10672, + "36": 0.10785, + "37": 0.10664, + "38": 0.1064, + "39": 0.10666, + "40": 0.10518, + "41": 0.10655, + "42": 0.10605, + "43": 0.10563, + "44": 0.1064, + "45": 0.10629, + "46": 0.10691, + "47": 0.10711, + "48": 0.10618, + "49": 0.10991, + "50": 0.10529 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..2a87d7e4de5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.83936, + "2": 10.8442, + "3": 10.86813, + "4": 10.86022, + "5": 10.87939, + "6": 10.85969, + "7": 10.86386, + "8": 10.8444, + "9": 10.88995, + "10": 10.8926, + "11": 10.89136, + "12": 10.85312, + "13": 10.87319, + "14": 10.83805, + "15": 10.83088, + "16": 10.82011, + "17": 10.79138, + "18": 10.81055, + "19": 10.77977, + "20": 10.6635, + "21": 10.69765, + "22": 10.67421, + "23": 10.77344, + "24": 10.63919, + "25": 10.50497, + "26": 10.61911, + "27": 10.56921, + "28": 10.46859, + "29": 10.41119, + "30": 10.42916, + "31": 10.52553, + "32": 10.34942, + "33": 10.2967, + "34": 10.46909, + "35": 9.99632, + "36": 10.13945, + "37": 10.0434, + "38": 10.4139, + "39": 9.80941, + "40": 10.12495, + "41": 10.14883, + "42": 10.04042, + "43": 10.22142, + "44": 10.07348, + "45": 9.71369, + "46": 10.00449, + "47": 9.94758, + "48": 9.68856, + "49": 9.93637, + "50": 9.96042 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1026.0, + "2": 1184.0, + "3": 1226.0, + "4": 1248.0, + "5": 1259.0, + "6": 1421.0, + "7": 1182.0, + "8": 1036.0, + "9": 1293.0, + "10": 1319.0, + "11": 1212.0, + "12": 1373.0, + "13": 1327.0, + "14": 1121.0, + "15": 1217.0, + "16": 1163.0, + "17": 1246.0, + "18": 1280.0, + "19": 1128.0, + "20": 1019.0, + "21": 1147.0, + "22": 1156.0, + "23": 1341.0, + "24": 1312.0, + "25": 1066.0, + "26": 1138.0, + "27": 1270.0, + "28": 1260.0, + "29": 1292.0, + "30": 1532.0, + "31": 1477.0, + "32": 1460.0, + "33": 1537.0, + "34": 1513.0, + "35": 1235.0, + "36": 1316.0, + "37": 1466.0, + "38": 1564.0, + "39": 1380.0, + "40": 1513.0, + "41": 1633.0, + "42": 1509.0, + "43": 1731.0, + "44": 1636.0, + "45": 1501.0, + "46": 1884.0, + "47": 1567.0, + "48": 1631.0, + "49": 1825.0, + "50": 1639.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 759682560.0, + "2": 759682560.0, + "3": 759682560.0, + "4": 759682560.0, + "5": 759682560.0, + "6": 759682560.0, + "7": 759682560.0, + "8": 759682560.0, + "9": 759682560.0, + "10": 759682560.0, + "11": 759682560.0, + "12": 759682560.0, + "13": 759682560.0, + "14": 759682560.0, + "15": 759682560.0, + "16": 759682560.0, + "17": 759682560.0, + "18": 759682560.0, + "19": 759682560.0, + "20": 759682560.0, + "21": 759682560.0, + "22": 759682560.0, + "23": 759682560.0, + "24": 759682560.0, + "25": 759682560.0, + "26": 759682560.0, + "27": 759682560.0, + "28": 759682560.0, + "29": 759682560.0, + "30": 759682560.0, + "31": 759682560.0, + "32": 759682560.0, + "33": 759682560.0, + "34": 759682560.0, + "35": 759682560.0, + "36": 759682560.0, + "37": 759682560.0, + "38": 759682560.0, + "39": 759682560.0, + "40": 759682560.0, + "41": 759682560.0, + "42": 759682560.0, + "43": 759682560.0, + "44": 759682560.0, + "45": 759682560.0, + "46": 759682560.0, + "47": 759682560.0, + "48": 759682560.0, + "49": 759682560.0, + "50": 759682560.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4340903936.0, + "2": 4622615552.0, + "3": 4622615552.0, + "4": 4622615552.0, + "5": 4622615552.0, + "6": 4622615552.0, + "7": 4622615552.0, + "8": 4622615552.0, + "9": 4622615552.0, + "10": 4622615552.0, + "11": 4622615552.0, + "12": 4622615552.0, + "13": 4622615552.0, + "14": 4622615552.0, + "15": 4622615552.0, + "16": 4622615552.0, + "17": 4622615552.0, + "18": 4622615552.0, + "19": 4622615552.0, + "20": 4622615552.0, + "21": 4622615552.0, + "22": 4622615552.0, + "23": 4622615552.0, + "24": 4622615552.0, + "25": 4622615552.0, + "26": 4622615552.0, + "27": 4622615552.0, + "28": 4622615552.0, + "29": 4622615552.0, + "30": 4622615552.0, + "31": 4622615552.0, + "32": 4622615552.0, + "33": 4622615552.0, + "34": 4622615552.0, + "35": 4622615552.0, + "36": 4622615552.0, + "37": 4622615552.0, + "38": 4622615552.0, + "39": 4622615552.0, + "40": 4622615552.0, + "41": 4622615552.0, + "42": 4622615552.0, + "43": 4622615552.0, + "44": 4622615552.0, + "45": 4622615552.0, + "46": 4622615552.0, + "47": 4622615552.0, + "48": 4622615552.0, + "49": 4622615552.0, + "50": 4622615552.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 13.61511, + "2": 0.1778, + "3": 0.1277, + "4": 0.12936, + "5": 0.13227, + "6": 0.12879, + "7": 0.12864, + "8": 0.12608, + "9": 0.12256, + "10": 0.12099, + "11": 0.12182, + "12": 0.12459, + "13": 0.12256, + "14": 0.12133, + "15": 0.12193, + "16": 0.12162, + "17": 0.12333, + "18": 0.12123, + "19": 0.1213, + "20": 0.12425, + "21": 0.12132, + "22": 0.12275, + "23": 0.12087, + "24": 0.12024, + "25": 0.12097, + "26": 0.12149, + "27": 0.1222, + "28": 0.1211, + "29": 0.12079, + "30": 0.12068, + "31": 0.12272, + "32": 0.12225, + "33": 0.12154, + "34": 0.11969, + "35": 0.12134, + "36": 0.12208, + "37": 0.12324, + "38": 0.13559, + "39": 0.13696, + "40": 0.12255, + "41": 0.12095, + "42": 0.12133, + "43": 0.12263, + "44": 0.1226, + "45": 0.12131, + "46": 0.12049, + "47": 0.12042, + "48": 0.12231, + "49": 0.12137, + "50": 0.12131 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/model_config.yaml index b894bf3bd20..06545179645 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/model_config.yaml index cfdbe747764..8710e92a138 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml index f9f58db94f9..dea9b4aad98 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/model_config.yaml index db560c8aac5..5394f9d0070 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp2_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/model_config.yaml index c6a2379b571..4bd321b43da 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/model_config.yaml index 1ad10c02caa..1229288b9be 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml index 364a41d2fe1..556fcfbcf11 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml @@ -10,7 +10,7 @@ BEFORE_SCRIPT: | mkdir -p ${DATA_CACHE_PATH}/distill && echo $DISTILL_CONFIG | yq -P > ${DATA_CACHE_PATH}/distill/distill_config.yaml MODEL_ARGS: --export-te-mcore-model: true - --export-kd-teacher-load: ${ARTIFACTS_ROOT}/gpt_teacher + --export-kd-teacher-load: ${DATA_PATH}/model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher --export-kd-cfg: ${DATA_CACHE_PATH}/distill/distill_config.yaml --auto-detect-ckpt-format: true --num-layers: 12 @@ -33,13 +33,13 @@ MODEL_ARGS: --untie-embeddings-and-output-weights: true --disable-bias-linear: true --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml index ac70eb6bd1e..3175a07cc88 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/model_config.yaml index 585aea5c26e..3f427a04f9d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/model_config.yaml index f8f7bded190..d3446e92c2e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/model_config.yaml index 3a9b912ed0c..05b166f0a7b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/model_config.yaml index ccc411e5879..70155c2ff81 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/model_config.yaml index 6234292f5ff..92f4bfb1cdf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml index d510bd15c0f..b4d63762604 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/model_config.yaml index 5a9f0ea8a89..880d7fc7ce0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/model_config.yaml index 920ad6832d8..013569c5882 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml index 78e7e3a45ca..6aad7304c19 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/model_config.yaml index 36a000292f5..8866fa67175 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/model_config.yaml index ddbc04621a6..f4649e2d303 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/model_config.yaml index 31e5bb16ad5..a77cd637800 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/model_config.yaml index 76cfaf020af..9f416e74884 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_ddp_average_in_collective/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/model_config.yaml index 3488b4d1585..2622612205a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/model_config.yaml index 586f90f1cf6..00f01d3bac0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mla/model_config.yaml @@ -23,13 +23,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml index dd928979546..3d1b350ced0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/model_config.yaml index bf6520edcd6..d4939a8c2cf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_no_mmap_bin_files/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/model_config.yaml index 14cefe1e409..af4aa0bf4fc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/model_config.yaml index f7c1c7ee725..9fbe95431e0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/model_config.yaml index deaadae81a3..54d49da6c14 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/model_config.yaml index fbbe2255a82..f906e5f8439 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml index 383ec818661..7e2261ae518 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/model_config.yaml index 3cf39c93e9c..ea5523e1d2a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/model_config.yaml index 4fd3ccba030..afbc17a0301 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/model_config.yaml index e8f7fee1215..bcbfdad6616 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/model_config.yaml index d6a183799fd..ecc62315f9f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml index 8df2e496bb1..89c6943100e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 7cd304fc880..9d8de380f83 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/model_config.yaml index 72f029c9044..18a7195b436 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml index 75a0ffc2adc..fe8e0f493d1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_dgxc.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_dgxc.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_dgxc.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index de4164176bb..136c696ef2f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml index 2ee48e8111c..0f842738f62 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/model_config.yaml index 8f09dae5fec..4aa0b36a84b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/model_config.yaml index 1ac8ec45c24..620eeaeff46 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 208827c9aea..b8a79c7a083 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml index 15fbeb4f986..4febeeb3aca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 573cddceff0..8793230c3c9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml index f897d2b9a8e..84da70b66c7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml @@ -1,4 +1,4 @@ -ENV_VARS: +s`ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/model_config.yaml index 7345237d672..f4c058fb0a0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_fp16/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/model_config.yaml index e15844bafb7..e2a0f1f1f69 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/model_config.yaml index c7dfcfe48e3..b9b786ee247 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/model_config.yaml index e829340190e..b4991e3621e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/model_config.yaml index 37fb8b1cccd..cc6a76a97d9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/model_config.yaml index 1406468fadf..7601d0188ae 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_uninstall_te/model_config.yaml @@ -21,13 +21,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/model_config.yaml index 863cf9cac25..a365aae9089 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/model_config.yaml index fcb9fa2884f..c9473f99f96 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 0e32dbd913a..23b58cdc782 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json deleted file mode 100644 index 67c8ef8abff..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.83936, - "2": 10.8442, - "3": 10.86813, - "4": 10.86022, - "5": 10.87939, - "6": 10.85969, - "7": 10.86386, - "8": 10.8444, - "9": 10.88995, - "10": 10.8926, - "11": 10.89136, - "12": 10.85312, - "13": 10.87319, - "14": 10.83805, - "15": 10.83088, - "16": 10.82011, - "17": 10.79138, - "18": 10.81055, - "19": 10.77977, - "20": 10.6635, - "21": 10.69765, - "22": 10.67421, - "23": 10.77344, - "24": 10.63919, - "25": 10.50497, - "26": 10.61911, - "27": 10.56921, - "28": 10.46859, - "29": 10.41119, - "30": 10.42916, - "31": 10.52553, - "32": 10.34942, - "33": 10.2967, - "34": 10.46909, - "35": 9.99632, - "36": 10.13945, - "37": 10.0434, - "38": 10.4139, - "39": 9.80941, - "40": 10.12495, - "41": 10.14883, - "42": 10.04042, - "43": 10.22142, - "44": 10.07348, - "45": 9.71369, - "46": 10.00449, - "47": 9.94758, - "48": 9.68856, - "49": 9.93637, - "50": 9.96042 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1026.0, - "2": 1184.0, - "3": 1226.0, - "4": 1248.0, - "5": 1259.0, - "6": 1421.0, - "7": 1182.0, - "8": 1036.0, - "9": 1293.0, - "10": 1319.0, - "11": 1212.0, - "12": 1373.0, - "13": 1327.0, - "14": 1121.0, - "15": 1217.0, - "16": 1163.0, - "17": 1246.0, - "18": 1280.0, - "19": 1128.0, - "20": 1019.0, - "21": 1147.0, - "22": 1156.0, - "23": 1341.0, - "24": 1312.0, - "25": 1066.0, - "26": 1138.0, - "27": 1270.0, - "28": 1260.0, - "29": 1292.0, - "30": 1532.0, - "31": 1477.0, - "32": 1460.0, - "33": 1537.0, - "34": 1513.0, - "35": 1235.0, - "36": 1316.0, - "37": 1466.0, - "38": 1564.0, - "39": 1380.0, - "40": 1513.0, - "41": 1633.0, - "42": 1509.0, - "43": 1731.0, - "44": 1636.0, - "45": 1501.0, - "46": 1884.0, - "47": 1567.0, - "48": 1631.0, - "49": 1825.0, - "50": 1639.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 759681536.0, - "2": 759681536.0, - "3": 759681536.0, - "4": 759681536.0, - "5": 759681536.0, - "6": 759681536.0, - "7": 759681536.0, - "8": 759681536.0, - "9": 759681536.0, - "10": 759681536.0, - "11": 759681536.0, - "12": 759681536.0, - "13": 759681536.0, - "14": 759681536.0, - "15": 759681536.0, - "16": 759681536.0, - "17": 759681536.0, - "18": 759681536.0, - "19": 759681536.0, - "20": 759681536.0, - "21": 759681536.0, - "22": 759681536.0, - "23": 759681536.0, - "24": 759681536.0, - "25": 759681536.0, - "26": 759681536.0, - "27": 759681536.0, - "28": 759681536.0, - "29": 759681536.0, - "30": 759681536.0, - "31": 759681536.0, - "32": 759681536.0, - "33": 759681536.0, - "34": 759681536.0, - "35": 759681536.0, - "36": 759681536.0, - "37": 759681536.0, - "38": 759681536.0, - "39": 759681536.0, - "40": 759681536.0, - "41": 759681536.0, - "42": 759681536.0, - "43": 759681536.0, - "44": 759681536.0, - "45": 759681536.0, - "46": 759681536.0, - "47": 759681536.0, - "48": 759681536.0, - "49": 759681536.0, - "50": 759681536.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 4340902912.0, - "2": 4622614528.0, - "3": 4622614528.0, - "4": 4622614528.0, - "5": 4622614528.0, - "6": 4622614528.0, - "7": 4622614528.0, - "8": 4622614528.0, - "9": 4622614528.0, - "10": 4622614528.0, - "11": 4622614528.0, - "12": 4622614528.0, - "13": 4622614528.0, - "14": 4622614528.0, - "15": 4622614528.0, - "16": 4622614528.0, - "17": 4622614528.0, - "18": 4622614528.0, - "19": 4622614528.0, - "20": 4622614528.0, - "21": 4622614528.0, - "22": 4622614528.0, - "23": 4622614528.0, - "24": 4622614528.0, - "25": 4622614528.0, - "26": 4622614528.0, - "27": 4622614528.0, - "28": 4622614528.0, - "29": 4622614528.0, - "30": 4622614528.0, - "31": 4622614528.0, - "32": 4622614528.0, - "33": 4622614528.0, - "34": 4622614528.0, - "35": 4622614528.0, - "36": 4622614528.0, - "37": 4622614528.0, - "38": 4622614528.0, - "39": 4622614528.0, - "40": 4622614528.0, - "41": 4622614528.0, - "42": 4622614528.0, - "43": 4622614528.0, - "44": 4622614528.0, - "45": 4622614528.0, - "46": 4622614528.0, - "47": 4622614528.0, - "48": 4622614528.0, - "49": 4622614528.0, - "50": 4622614528.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 13.91724, - "2": 0.27573, - "3": 0.23467, - "4": 0.23594, - "5": 0.23302, - "6": 0.23216, - "7": 0.23399, - "8": 0.23423, - "9": 0.23365, - "10": 0.23211, - "11": 0.2332, - "12": 0.23283, - "13": 0.23445, - "14": 0.23405, - "15": 0.23349, - "16": 0.23298, - "17": 0.23305, - "18": 0.23251, - "19": 0.23322, - "20": 0.23348, - "21": 0.23189, - "22": 0.23316, - "23": 0.2316, - "24": 0.23233, - "25": 0.23512, - "26": 0.23232, - "27": 0.23306, - "28": 0.23244, - "29": 0.23331, - "30": 0.23258, - "31": 0.23311, - "32": 0.23326, - "33": 0.23418, - "34": 0.23411, - "35": 0.23489, - "36": 0.2317, - "37": 0.23483, - "38": 0.23235, - "39": 0.23511, - "40": 0.23413, - "41": 0.23395, - "42": 0.23405, - "43": 0.23331, - "44": 0.23297, - "45": 0.23473, - "46": 0.23192, - "47": 0.23377, - "48": 0.23322, - "49": 0.23042, - "50": 0.23263 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json deleted file mode 100644 index 5e0ca24c497..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.83936, - "2": 10.8442, - "3": 10.86813, - "4": 10.86022, - "5": 10.87939, - "6": 10.85969, - "7": 10.86386, - "8": 10.8444, - "9": 10.88995, - "10": 10.8926, - "11": 10.89136, - "12": 10.85312, - "13": 10.87319, - "14": 10.83805, - "15": 10.83088, - "16": 10.82011, - "17": 10.79138, - "18": 10.81055, - "19": 10.77977, - "20": 10.6635, - "21": 10.69765, - "22": 10.67421, - "23": 10.77344, - "24": 10.63919, - "25": 10.50497, - "26": 10.61911, - "27": 10.56921, - "28": 10.46859, - "29": 10.41119, - "30": 10.42916, - "31": 10.52553, - "32": 10.34942, - "33": 10.2967, - "34": 10.46909, - "35": 9.99632, - "36": 10.13945, - "37": 10.0434, - "38": 10.4139, - "39": 9.80941, - "40": 10.12495, - "41": 10.14883, - "42": 10.04042, - "43": 10.22142, - "44": 10.07348, - "45": 9.71369, - "46": 10.00449, - "47": 9.94758, - "48": 9.68856, - "49": 9.93637, - "50": 9.96042 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1026.0, - "2": 1184.0, - "3": 1226.0, - "4": 1248.0, - "5": 1259.0, - "6": 1421.0, - "7": 1182.0, - "8": 1036.0, - "9": 1293.0, - "10": 1319.0, - "11": 1212.0, - "12": 1373.0, - "13": 1327.0, - "14": 1121.0, - "15": 1217.0, - "16": 1163.0, - "17": 1246.0, - "18": 1280.0, - "19": 1128.0, - "20": 1019.0, - "21": 1147.0, - "22": 1156.0, - "23": 1341.0, - "24": 1312.0, - "25": 1066.0, - "26": 1138.0, - "27": 1270.0, - "28": 1260.0, - "29": 1292.0, - "30": 1532.0, - "31": 1477.0, - "32": 1460.0, - "33": 1537.0, - "34": 1513.0, - "35": 1235.0, - "36": 1316.0, - "37": 1466.0, - "38": 1564.0, - "39": 1380.0, - "40": 1513.0, - "41": 1633.0, - "42": 1509.0, - "43": 1731.0, - "44": 1636.0, - "45": 1501.0, - "46": 1884.0, - "47": 1567.0, - "48": 1631.0, - "49": 1825.0, - "50": 1639.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 759681536.0, - "2": 759681536.0, - "3": 759681536.0, - "4": 759681536.0, - "5": 759681536.0, - "6": 759681536.0, - "7": 759681536.0, - "8": 759681536.0, - "9": 759681536.0, - "10": 759681536.0, - "11": 759681536.0, - "12": 759681536.0, - "13": 759681536.0, - "14": 759681536.0, - "15": 759681536.0, - "16": 759681536.0, - "17": 759681536.0, - "18": 759681536.0, - "19": 759681536.0, - "20": 759681536.0, - "21": 759681536.0, - "22": 759681536.0, - "23": 759681536.0, - "24": 759681536.0, - "25": 759681536.0, - "26": 759681536.0, - "27": 759681536.0, - "28": 759681536.0, - "29": 759681536.0, - "30": 759681536.0, - "31": 759681536.0, - "32": 759681536.0, - "33": 759681536.0, - "34": 759681536.0, - "35": 759681536.0, - "36": 759681536.0, - "37": 759681536.0, - "38": 759681536.0, - "39": 759681536.0, - "40": 759681536.0, - "41": 759681536.0, - "42": 759681536.0, - "43": 759681536.0, - "44": 759681536.0, - "45": 759681536.0, - "46": 759681536.0, - "47": 759681536.0, - "48": 759681536.0, - "49": 759681536.0, - "50": 759681536.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 4340902912.0, - "2": 4622614528.0, - "3": 4622614528.0, - "4": 4622614528.0, - "5": 4622614528.0, - "6": 4622614528.0, - "7": 4622614528.0, - "8": 4622614528.0, - "9": 4622614528.0, - "10": 4622614528.0, - "11": 4622614528.0, - "12": 4622614528.0, - "13": 4622614528.0, - "14": 4622614528.0, - "15": 4622614528.0, - "16": 4622614528.0, - "17": 4622614528.0, - "18": 4622614528.0, - "19": 4622614528.0, - "20": 4622614528.0, - "21": 4622614528.0, - "22": 4622614528.0, - "23": 4622614528.0, - "24": 4622614528.0, - "25": 4622614528.0, - "26": 4622614528.0, - "27": 4622614528.0, - "28": 4622614528.0, - "29": 4622614528.0, - "30": 4622614528.0, - "31": 4622614528.0, - "32": 4622614528.0, - "33": 4622614528.0, - "34": 4622614528.0, - "35": 4622614528.0, - "36": 4622614528.0, - "37": 4622614528.0, - "38": 4622614528.0, - "39": 4622614528.0, - "40": 4622614528.0, - "41": 4622614528.0, - "42": 4622614528.0, - "43": 4622614528.0, - "44": 4622614528.0, - "45": 4622614528.0, - "46": 4622614528.0, - "47": 4622614528.0, - "48": 4622614528.0, - "49": 4622614528.0, - "50": 4622614528.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 12.785, - "2": 0.28429, - "3": 0.25654, - "4": 0.25675, - "5": 0.25763, - "6": 0.25556, - "7": 0.25403, - "8": 0.25276, - "9": 0.25351, - "10": 0.25546, - "11": 0.25488, - "12": 0.25607, - "13": 0.25404, - "14": 0.25256, - "15": 0.25733, - "16": 0.25987, - "17": 0.25778, - "18": 0.25053, - "19": 0.25288, - "20": 0.258, - "21": 0.25606, - "22": 0.25231, - "23": 0.25223, - "24": 0.26464, - "25": 0.26469, - "26": 0.25015, - "27": 0.25378, - "28": 0.25459, - "29": 0.26134, - "30": 0.26129, - "31": 0.2595, - "32": 0.26444, - "33": 0.25568, - "34": 0.25514, - "35": 0.25087, - "36": 0.25275, - "37": 0.25383, - "38": 0.24953, - "39": 0.24996, - "40": 0.25393, - "41": 0.25556, - "42": 0.25158, - "43": 0.25124, - "44": 0.25, - "45": 0.25586, - "46": 0.26057, - "47": 0.25868, - "48": 0.26304, - "49": 0.2615, - "50": 0.26261 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml index 246fb33da57..f6892ae5c24 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml index 196492f1ec7..9c23cb7938f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 665388ce7a1..4727007ffe2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml index f4cbb87d27d..bba1f1ad19e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 80218da886d..5ac9b7b4701 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -16,13 +16,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml index 96b4a6c0ccc..0e70965cb2b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/model_config.yaml similarity index 89% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/model_config.yaml index c46be1c819b..db5dea3ae6e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_bf16_baseline/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml index c151135828d..12063418adf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/model_config.yaml index 40dea9779c9..51a2f6cfc7c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp1_pp2_fp8_pp/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml index fb47009a77d..5668a7575e2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml index 32dd88dfb72..66c9f171be5 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml index 21c6ac25e83..54237309642 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml index 59707f588c0..a5903e51b6c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml index 0e62673a628..ac5ff6cfbbf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp/model_config.yaml @@ -16,13 +16,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index 4361bf233cd..3963a359ea9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml index ed56bc7cfad..ddb34ad850b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml similarity index 90% rename from tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml rename to tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml index fe4a6575953..cf9f6b6ceb8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 2000 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json index b8f1a38fa0f..8776674df82 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json @@ -1,159 +1,187 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 2.491823673248291, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.331681251525879, - -1.5606917142868042, - -2.454296588897705, - -1.5334703922271729, - -1.2631131410598755, - -2.657367706298828, - -0.6480202078819275, - -0.4550393521785736, - -1.3625166416168213, - -0.8142069578170776, - -0.4496593475341797, - -0.9312890768051147, - -1.732723355293274, - -0.44613128900527954, - -1.6895122528076172, - -0.6082233190536499, - -1.0978344678878784, - -1.1122435331344604, - -0.002520838286727667, - -1.4072327613830566, - -0.007462364621460438, - -0.7548662424087524, - -0.9937503337860107, - -0.0675487294793129, - -0.9595617055892944, - -0.029961343854665756, - -2.205785036087036, - -1.2615025043487549, - -0.7878209352493286 - ] - }, - "throughput": 109.3571928299837 + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.290968656539917, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": [ + 2.3393335747358535, + 102.34586197079994, + 103.58898028807208, + 104.45258510126983, + 103.72620640365217, + 104.56994550823111, + 105.82297004422847, + 102.44643771631509 + ] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml index 2ec2c402230..0675b047464 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml index 67c9de20806..2ba9050ceaf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml @@ -9,7 +9,7 @@ MODEL_ARGS: --tiktoken-pattern: v2 --use-mcore-models: true --tokenizer-type: TikTokenizer - --tokenizer-model: ${DATA_PATH}/mcore_mistral/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --auto-detect-ckpt-format: true --max-tokens-to-oom: 3600000 --inference-max-seq-length: 4096 @@ -18,8 +18,8 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/mcore_mistral/model + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 --distributed-backend: nccl --log-interval: 1 --transformer-impl: transformer_engine diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json index d76a889a3fa..31b66789d94 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json @@ -1,2699 +1,2703 @@ { - "0": { - "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", - "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", - "generated_tokens": [ - 1659, - 1395, - 1261, - 1036, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1036, - 1049, - 1044, - 1636, - 1010, - 1036, - 1659, - 1036, - 1659, - 1010, - 1036, - 1659, - 1045, - 1659, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1659, - 1036, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1063, - 1063, - 1063, - 1063, - 1063, - 1063, - 1063, - 1044, - 1659, - 1010, - 1045, - 1049, - 1010, - 1036, - 1010, - 1049, - 1046, - 1053, - 1046, - 1010, - 1036, - 1010, - 1036, - 1044, - 1636, - 1010, - 1036, - 1046, - 1010, - 1036, - 1010, - 1049, - 1044, - 1049, - 1046, - 1049, - 1010, - 1073, - 1010, - 1036, - 1046, - 1010, - 1073, - 1010, - 1010, - 1010, - 7801, - 1010, - 1036, - 1044, - 1044, - 1044, - 1048, - 1044, - 1049, - 1044, - 1048, - 1044, - 1048, - 1046, - 1048, - 1010, - 1785, - 1010, - 1784, - 1010, - 1784, - 1010, - 1784, - 1010 - ], - "latency": 6.757228374481201, - "cuda_graph_request_count_map": null, - "step_count": 2048, - "logprobs": [ - -7.7319135665893555, - -2.188307285308838, - -0.7547445297241211, - -0.7294313311576843, - -10.238386154174805, - -3.3775341510772705, - -6.394498825073242, - -7.354557037353516, - -9.018157958984375, - -3.012073040008545, - -3.2584073543548584, - -5.220732688903809, - -4.620487213134766, - -2.5078930854797363, - -3.752683162689209, - -0.13360372185707092, - -0.05705544352531433, - -0.41462242603302, - -1.585279941558838, - -1.6438164710998535, - -1.9557222127914429, - -0.3989897072315216, - -0.0365302674472332, - -6.368816375732422, - -0.8731719255447388, - -0.022585075348615646, - -0.2775891423225403, - -0.0027362785767763853, - -0.0006812873762100935, - -1.581446647644043, - -0.008688976056873798, - -0.3532317280769348, - -6.071163177490234, - -9.162371635437012, - -9.965556144714355, - -2.400461196899414, - -2.9898362159729004, - -2.9803032875061035, - -2.12601900100708, - -3.500912666320801, - -7.015069007873535, - -2.278961420059204, - -0.46380555629730225, - -4.078739166259766, - -1.9430254697799683, - -3.5642244815826416, - -3.689701795578003, - -6.201474189758301, - -6.580833911895752, - -2.3081111907958984, - -5.42717170715332, - -1.1886008977890015, - -1.172760248184204, - -1.3571951389312744, - -1.3551844358444214, - -3.376784324645996, - -0.05118789151310921, - -4.064360618591309, - -2.575554847717285, - -0.6994737386703491, - -2.56724214553833, - -2.1888976097106934, - -0.4816131591796875, - -4.070178985595703, - -2.0060782432556152, - -6.858033180236816, - -0.059200502932071686, - -3.214278221130371, - -0.9671833515167236, - -0.823198676109314, - -1.0130078792572021, - -4.595561981201172, - -0.012724989093840122, - -5.214311599731445, - -8.246870040893555, - -3.1476030349731445, - -3.299684524536133, - -4.218191146850586, - -7.318399429321289, - -0.8580498695373535, - -3.0894036293029785, - -1.886361002922058, - -7.217658996582031, - -3.271679639816284, - -3.9717154502868652, - -1.8835484981536865, - -10.034332275390625, - -11.382490158081055, - -5.417011260986328, - -7.505967140197754, - -2.33837890625, - -0.07904055714607239, - -3.294971227645874, - -7.813640594482422, - -1.7646901607513428, - -4.025320053100586, - -3.5977325439453125, - -4.390352249145508, - -9.147806167602539, - -0.5303041934967041, - -7.721246242523193, - -0.6311959028244019, - -0.8119025230407715, - -0.7227814197540283, - -1.8369406461715698, - -0.20933297276496887, - -1.5395950078964233, - -4.424448490142822, - -4.084965705871582, - -3.355497360229492, - -1.0475609302520752, - -6.479413986206055, - -0.7810530662536621, - -2.132437229156494, - -6.648703098297119, - -2.9522438049316406, - -1.2485712766647339, - -4.040503025054932, - -2.3415768146514893, - -5.358206748962402, - -1.6258506774902344, - -3.956300973892212, - -0.732298731803894, - -7.441117286682129, - -1.5242161750793457, - -2.4555861949920654, - -4.295163154602051, - -9.687600135803223, - -0.8213484883308411, - -1.2446978092193604, - -0.01942702941596508, - -4.619411468505859, - -3.3297007083892822, - -2.2139487266540527, - -3.691431999206543, - -2.6574106216430664, - -6.075929641723633, - -0.6123450994491577, - -1.2942559719085693, - -0.6262839436531067, - -7.398006439208984, - -4.4869890213012695, - -4.202048301696777, - -4.982994079589844, - -0.637227475643158, - -3.061023235321045, - -10.117584228515625, - -3.8567495346069336, - -4.0480828285217285, - -2.472019672393799, - -4.246374607086182, - -1.3939155340194702, - -7.132441520690918, - -0.20108745992183685, - -4.986658573150635, - -4.387957572937012, - -0.01108358334749937, - -4.209756851196289, - -7.271108627319336, - -4.047314643859863, - -2.6497321128845215, - -1.4763175249099731, - -0.28365400433540344, - -3.5247769355773926, - -1.4226995706558228, - -4.327237129211426, - -2.0407187938690186, - -6.1437907218933105, - -1.5190880298614502, - -2.5511486530303955, - -7.504094123840332, - -2.152172565460205, - -6.708334922790527, - -6.913146495819092, - -3.6959621906280518, - -6.752341270446777, - -0.63083815574646, - -0.12433214485645294, - -5.0525641441345215, - -4.435934066772461, - -0.45601028203964233, - -6.3459577560424805, - -9.882917404174805, - -3.1422882080078125, - -2.550520658493042, - -3.2099051475524902, - -6.278127193450928, - -0.07764133810997009, - -3.155696153640747, - -1.933587670326233, - -9.61027717590332, - -6.211391925811768, - -4.664543151855469, - -6.783782005310059, - -5.676271438598633, - -8.605900764465332, - -0.0824289619922638, - -3.5463995933532715, - -13.374168395996094, - -1.2401021718978882, - -1.8734056949615479, - -3.4154422283172607, - -1.6733763217926025, - -17.633970260620117, - -9.345113754272461, - -0.6277351975440979, - -2.9617538452148438, - -2.5565333366394043, - -10.10580825805664, - -7.130337715148926, - -7.36820125579834, - -4.098911285400391, - -5.747079372406006, - -2.945054769515991, - -0.7887389063835144, - -1.6583149433135986, - -1.0165244340896606, - -6.581666946411133, - -5.926386833190918, - -5.845194339752197, - -0.9657630920410156, - -7.868755340576172, - -1.3244551420211792, - -0.2657390236854553, - -0.06403665244579315, - -2.983020782470703, - -5.943899631500244, - -7.877285957336426, - -3.593116283416748, - -3.819509506225586, - -7.226177215576172, - -2.5206997394561768, - -3.385587215423584, - -0.37499159574508667, - -1.4698283672332764, - -3.1460342407226562, - -0.0077166082337498665, - -4.350916862487793, - -3.2183218002319336, - -0.6242184638977051, - -1.4782464504241943, - -2.8054311275482178, - -3.0831401348114014, - -12.17662525177002, - -2.113419532775879, - -1.6448111534118652, - -2.1834323406219482, - -0.7630388140678406, - -10.1896390914917, - -6.234405517578125, - -11.46288776397705, - -1.003785490989685, - -4.211658477783203, - -1.5010679960250854, - -5.859302043914795, - -2.0465080738067627, - -3.7468819618225098, - -4.684195518493652, - -4.318704128265381, - -2.7234389781951904, - -9.00437068939209, - -3.043811321258545, - -3.1384406089782715, - -2.713779926300049, - -2.095993995666504, - -2.1484954357147217, - -10.274479866027832, - -0.682350754737854, - -0.25973302125930786, - -3.6964316368103027, - -13.434456825256348, - -2.3368239402770996, - -5.382724761962891, - -1.9073458909988403, - -5.905669212341309, - -0.032165709882974625, - -1.6530004739761353, - -2.728893280029297, - -1.640552043914795, - -1.1391171216964722, - -1.4353511333465576, - -4.003787994384766, - -0.3450564742088318, - -0.7168521285057068, - -0.34650325775146484, - -0.3616408705711365, - -7.062709331512451, - -1.2851682901382446, - -2.299129009246826, - -8.800156593322754, - -5.208735466003418, - -4.780910491943359, - -2.78342342376709, - -4.469717979431152, - -6.909726619720459, - -2.5114197731018066, - -0.659822404384613, - -0.6915416121482849, - -3.2363741397857666, - -0.5283617377281189, - -0.10473938286304474, - -6.215325832366943, - -7.283237934112549, - -1.6797031164169312, - -11.50100040435791, - -7.5822978019714355, - -3.387317657470703, - -11.407575607299805, - -5.441976547241211, - -3.3264851570129395, - -0.7265786528587341, - -1.382750153541565, - -7.841699600219727, - -8.105277061462402, - -3.9569506645202637, - -4.963083267211914, - -0.5492897629737854, - -4.6081390380859375, - -5.870400905609131, - -3.957930088043213, - -5.275494575500488, - -4.105091094970703, - -2.15435528755188, - -2.8472700119018555, - -1.1278448104858398, - -8.226571083068848, - -0.40629008412361145, - -9.916461944580078, - -4.616743087768555, - -1.691868543624878, - -0.6639478802680969, - -2.5716753005981445, - -6.676954746246338, - -6.535329818725586, - -0.4170510768890381, - -1.443942904472351, - -3.145481824874878, - -1.440589427947998, - -0.26935356855392456, - -0.9647155404090881, - -4.335958957672119, - -1.5647850036621094, - -5.890466690063477, - -3.01654052734375, - -1.9168468713760376, - -3.7365682125091553, - -8.001864433288574, - -10.680083274841309, - -4.489352226257324, - -4.6058149337768555, - -7.69011116027832, - -3.6247005462646484, - -1.5600426197052002, - -10.2160062789917, - -5.004643440246582, - -0.19602319598197937, - -3.375545024871826, - -2.669325590133667, - -1.3932737112045288, - -1.6410658359527588, - -6.847603797912598, - -6.744344711303711, - -0.5215591192245483, - -0.25840020179748535, - -1.1448237895965576, - -5.57253885269165, - -7.251138687133789, - -4.221924781799316, - -0.7688062787055969, - -2.504502534866333, - -3.146519660949707, - -2.206653356552124, - -1.4295082092285156, - -7.96943998336792, - -4.332189083099365, - -2.5750505924224854, - -1.7102608680725098, - -5.311381816864014, - -8.897522926330566, - -2.994919538497925, - -3.3397974967956543, - -2.1794328689575195, - -2.437566041946411, - -0.3181810975074768, - -0.27412793040275574, - -0.7914466857910156, - -2.3470635414123535, - -2.4099245071411133, - -2.491870880126953, - -3.024170160293579, - -1.9719040393829346, - -11.373910903930664, - -1.4279751777648926, - -0.14573107659816742, - -2.055763006210327, - -6.366893291473389, - -4.24091911315918, - -0.00709194503724575, - -2.0199716091156006, - -2.524750232696533, - -1.4272525310516357, - -0.5185190439224243, - -2.927150011062622, - -2.7070627212524414, - -3.365638017654419, - -4.318085193634033, - -7.773144721984863, - -1.7947180271148682, - -7.657534599304199, - -8.767786026000977, - -14.74280071258545, - -1.8042558431625366, - -3.2712037563323975, - -1.4002125263214111, - -4.887944221496582, - -1.4821010828018188, - -1.5255622863769531, - -5.879070281982422, - -4.463839530944824, - -5.1955976486206055, - -5.665647506713867, - -0.3775045573711395, - -5.9350481033325195, - -2.800539255142212, - -0.13162286579608917, - -3.034379720687866, - -4.729524612426758, - -4.6252641677856445, - -3.850942611694336, - -2.4760568141937256, - -6.059760093688965, - -10.12075138092041, - -0.9469369649887085, - -11.595907211303711, - -6.875324726104736, - -4.268826007843018, - -2.835529088973999, - -3.8626279830932617, - -4.876199245452881, - -0.013071090914309025, - -4.964417934417725, - -0.7445687055587769, - -5.707155227661133, - -6.10660457611084, - -4.317755699157715, - -4.440443992614746, - -2.9202542304992676, - -4.743522644042969, - -1.2569392919540405, - -2.8675737380981445, - -2.3151841163635254, - -4.318130970001221, - -1.9054772853851318, - -1.1808521747589111, - -0.765956461429596, - -2.768916606903076, - -6.237791061401367, - -1.7224305868148804, - -7.137521743774414, - -4.512486457824707, - -1.9069950580596924, - -4.145983695983887, - -5.365190505981445, - -0.059828490018844604, - -2.273892879486084, - -3.4013004302978516, - -5.035730361938477, - -6.501443386077881, - -9.903446197509766, - -1.6332892179489136, - -2.1572084426879883, - -1.6149548292160034, - -1.4698481559753418, - -6.01010799407959, - -2.2243528366088867, - -6.900836944580078, - -6.0930986404418945, - -2.974020481109619, - -3.225423574447632, - -8.423272132873535, - -1.3423724174499512, - -3.626147508621216, - -0.4862469434738159, - -6.860866546630859, - -3.8910953998565674, - -2.33319354057312, - -1.7229185104370117, - -2.215972423553467, - -8.99046516418457, - -4.099084854125977, - -2.4191012382507324, - -8.288970947265625, - -2.9641928672790527, - -1.5036451816558838, - -3.0544614791870117, - -0.0715634673833847, - -2.444031238555908, - -4.520998954772949, - -3.972568988800049, - -0.4985870122909546, - -2.1651363372802734, - -3.4427435398101807, - -1.730639100074768, - -0.9458961486816406, - -7.740211009979248, - -9.39163875579834, - -3.895984172821045, - -1.7523534297943115, - -5.41331672668457, - -8.910720825195312, - -12.971094131469727, - -3.0455880165100098, - -10.501265525817871, - -3.3864927291870117, - -4.842309951782227, - -3.9964733123779297, - -7.3046793937683105, - -2.6607093811035156, - -1.3541781902313232, - -5.003270626068115, - -3.944551944732666, - -0.11356143653392792, - -5.174440383911133, - -9.628616333007812, - -8.654989242553711, - -8.980416297912598, - -6.670101642608643, - -5.488286018371582, - -5.943419933319092, - -2.126483201980591, - -8.054739952087402, - -7.458671569824219, - -2.5267202854156494, - -6.455472946166992, - -8.655346870422363, - -7.903901100158691, - -6.221062660217285, - -7.129237174987793, - -4.2345380783081055, - -2.5375306606292725, - -7.697700500488281, - -1.567080020904541, - -2.084331750869751, - -0.25020831823349, - -1.5145041942596436, - -4.619244575500488, - -0.2970108985900879, - -0.4977554678916931, - -6.197869300842285, - -4.030620098114014, - -7.232107639312744, - -0.21076253056526184, - -1.563366174697876, - -1.133756160736084, - -2.708237648010254, - -4.080535888671875, - -0.6818401217460632, - -0.1864331066608429, - -0.49012088775634766, - -8.732468605041504, - -11.945040702819824, - -5.243098735809326, - -1.5294703245162964, - -0.8935543298721313, - -0.6174070835113525, - -1.5068217515945435, - -3.5766501426696777, - -5.393096923828125, - -4.202867031097412, - -14.765748023986816, - -5.2513813972473145, - -0.7597705721855164, - -0.2502063810825348, - -1.7403976917266846, - -2.8000779151916504, - -1.9808133840560913, - -2.1654744148254395, - -1.8629226684570312, - -3.222038745880127, - -0.040942225605249405, - -2.3384013175964355, - -10.210381507873535, - -4.5859761238098145, - -0.5805734395980835, - -3.7019288539886475, - -2.001936674118042, - -2.7876083850860596, - -2.9799084663391113, - -4.349887371063232, - -0.0792960673570633, - -1.4366114139556885, - -1.0813264846801758, - -1.3510822057724, - -6.7060699462890625, - -5.436615943908691, - -3.978389263153076, - -6.785447597503662, - -6.147171497344971, - -3.97414231300354, - -4.332991600036621, - -0.9269428253173828, - -5.1237101554870605, - -4.486598968505859, - -0.04678357392549515, - -1.0307552814483643, - -1.4249452352523804, - -4.517682075500488, - -3.561821699142456, - -2.0815205574035645, - -0.6041194200515747, - -5.992964744567871, - -7.092092514038086, - -0.48916709423065186, - -2.6405677795410156, - -4.3345723152160645, - -3.533582925796509, - -3.1233346462249756, - -3.107872486114502, - -1.9901115894317627, - -3.1052846908569336, - -1.8440347909927368, - -6.21368408203125, - -1.8796799182891846, - -2.705214738845825, - -0.2987763583660126, - -4.070865154266357, - -1.6675832271575928, - -1.3896636962890625, - -1.5731089115142822, - -3.526170015335083, - -2.5088443756103516, - -1.208929419517517, - -3.673125743865967, - -2.501532554626465, - -6.875064373016357, - -8.512459754943848, - -1.042314052581787, - -3.657850980758667, - -7.0950798988342285, - -4.974049091339111, - -8.14085578918457, - -3.529888153076172, - -1.9389504194259644, - -7.0902204513549805, - -2.409292459487915, - -2.9428021907806396, - -1.688283085823059, - -3.622368335723877, - -2.0903351306915283, - -4.160663604736328, - -3.1683764457702637, - -1.2135626077651978, - -7.566033363342285, - -3.1186251640319824, - -5.899919509887695, - -0.9518840312957764, - -2.656729221343994, - -2.2994377613067627, - -6.806836128234863, - -1.280236840248108, - -2.838846206665039, - -1.3598848581314087, - -11.707776069641113, - -3.134333372116089, - -0.6230669617652893, - -8.219222068786621, - -7.562507152557373, - -7.489459037780762, - -1.5368008613586426, - -7.149652481079102, - -5.749268054962158, - -3.162869691848755, - -2.7235195636749268, - -6.128931999206543, - -1.1934199333190918, - -3.986410617828369, - -3.76609468460083, - -1.712721586227417, - -3.195504903793335, - -8.397743225097656, - -3.1260581016540527, - -9.792022705078125, - -4.217884540557861, - -11.583260536193848, - -5.987588882446289, - -5.178754806518555, - -6.994749069213867, - -5.167606353759766, - -7.124668121337891, - -6.201416015625, - -10.203682899475098, - -6.858526229858398, - -2.733592987060547, - -5.078882217407227, - -9.003358840942383, - -4.704894542694092, - -3.9085562229156494, - -7.247268199920654, - -7.091092109680176, - -4.4150166511535645, - -7.56699275970459, - -9.485116004943848, - -1.9977033138275146, - -6.65272331237793, - -2.236643075942993, - -7.518955707550049, - -5.525973320007324, - -4.67877721786499, - -6.608670234680176, - -5.536133766174316, - -10.772479057312012, - -10.8853178024292, - -3.6156129837036133, - -6.751470565795898, - -6.4537434577941895, - -3.4220399856567383, - -8.251005172729492, - -3.2146153450012207, - -6.330069541931152, - -1.5551663637161255, - -6.520583629608154, - -10.450878143310547, - -5.8788957595825195, - -3.7398200035095215, - -3.9084208011627197, - -0.3640081584453583, - -6.961522102355957, - -6.066243648529053, - -7.270624160766602, - -5.098455429077148, - -2.7642822265625, - -5.460171699523926, - -7.362828731536865, - -2.558631658554077, - -2.186410427093506, - -2.5309929847717285, - -2.46756649017334, - -2.0306026935577393, - -1.8713470697402954, - -2.108008623123169, - -1.2698389291763306, - -2.1712756156921387, - -2.4432802200317383, - -1.1477653980255127, - -1.8417484760284424, - -2.5971946716308594, - -1.8250831365585327, - -2.103092670440674, - -2.5183165073394775, - -2.9367291927337646, - -1.9412965774536133, - -1.7692793607711792, - -2.864521026611328, - -3.1332175731658936, - -1.098311185836792, - -2.946441173553467, - -2.2800471782684326, - -3.1929852962493896, - -2.754260778427124, - -3.485616445541382, - -3.3010287284851074, - -2.5537776947021484, - -2.6752865314483643, - -3.1617612838745117, - -2.4571690559387207, - -2.060081958770752, - -2.425969362258911, - -2.212725877761841, - -2.4232254028320312, - -3.0587053298950195, - -2.4074010848999023, - -2.457937479019165, - -2.319617986679077, - -2.6340954303741455, - -2.599524736404419, - -2.5302212238311768, - -1.6849274635314941, - -2.2609786987304688, - -2.039928674697876, - -1.9474098682403564, - -2.3550753593444824, - -1.718749761581421, - -2.413884162902832, - -1.6247628927230835, - -2.4784040451049805, - -1.828325629234314, - -1.3880831003189087, - -1.4448199272155762, - -1.1477117538452148, - -1.1669728755950928, - -1.8787822723388672, - -1.5565840005874634, - -1.6666553020477295, - -1.747725248336792, - -1.959598422050476, - -2.0376486778259277, - -2.345367431640625, - -2.055098533630371, - -1.3940613269805908, - -3.4385242462158203, - -2.7489635944366455, - -3.2590157985687256, - -3.1128957271575928, - -1.7070379257202148, - -3.9010369777679443, - -3.21574068069458, - -3.3850393295288086, - -1.8778185844421387, - -2.698211908340454, - -1.8060741424560547, - -2.0845324993133545, - -3.4797585010528564, - -2.263254404067993, - -3.083108901977539, - -1.6589758396148682, - -2.687279224395752, - -1.77505624294281, - -2.6142921447753906, - -1.934045672416687, - -1.8834377527236938, - -2.8038980960845947, - -1.550542950630188, - -3.4054152965545654, - -1.724036693572998, - -2.3146564960479736, - -1.5134503841400146, - -2.9289023876190186, - -1.5285141468048096, - -3.421035051345825, - -1.3757282495498657, - -3.441431760787964, - -1.5286564826965332, - -3.4372904300689697, - -3.173043966293335, - -1.1313854455947876, - -1.486415147781372, - -2.506413221359253, - -3.5165903568267822, - -1.4112176895141602, - -3.7175354957580566, - -2.2170844078063965, - -0.704839289188385, - -0.6626103520393372, - -2.5483946800231934, - -0.949668288230896, - -3.1339564323425293, - -1.3326977491378784, - -3.5493476390838623, - -0.6785370111465454, - -3.236161470413208, - -2.1347084045410156, - -3.802447557449341, - -2.585503339767456, - -3.9361765384674072, - -2.290905714035034, - -3.714280605316162, - -1.793616533279419, - -3.7252492904663086, - -1.450188159942627, - -3.11938738822937, - -1.25174880027771 - ] - }, - "32": { - "input_prompt": "create a conversational article", - "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", - "generated_tokens": [ - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046 - ], - "latency": 20.777740478515625, - "cuda_graph_request_count_map": null, - "step_count": 2048, - "logprobs": [ - -4.4165568351745605, - -11.358176231384277, - -0.0701780766248703, - -7.797665119171143, - -2.6805992126464844, - -1.4707680940628052, - -3.0390255451202393, - -1.6902849674224854, - -1.270594835281372, - -1.1936196088790894, - -1.2523558139801025, - -2.7270259857177734, - -1.2371309995651245, - -0.9618493318557739, - -0.4379909038543701, - -1.3917063474655151, - -1.1055524349212646, - -0.9122569561004639, - -0.9911308288574219, - -0.08436793833971024, - -0.5424078106880188, - -0.9181017279624939, - -0.5873759388923645, - -0.19014373421669006, - -0.06655456870794296, - -0.15252672135829926, - -0.09415211528539658, - -0.009787309914827347, - -0.013910251669585705, - -0.005296128336340189, - -0.005677408073097467, - -0.02013739012181759, - -0.21594694256782532, - -0.07153760641813278, - -0.0066444179974496365, - -0.010198505595326424, - -0.011980246752500534, - -0.003686776151880622, - -0.0037619550712406635, - -0.0022467151284217834, - -0.004088377580046654, - -0.021828632801771164, - -0.0012669878778979182, - -0.09768074005842209, - -0.02652405947446823, - -0.0019286142196506262, - -0.002283824374899268, - -0.0032225127797573805, - -0.0009741804678924382, - -0.0009415484382770956, - -0.001211624126881361, - -0.001135300612077117, - -0.002340436913073063, - -0.0010846928926184773, - -0.0509282611310482, - -0.03832047060132027, - -0.00257422705180943, - -0.0022806129418313503, - -0.00262785074301064, - -0.0008195855189114809, - -0.0010239601833745837, - -0.0013777059502899647, - -0.0009899006690829992, - -0.0018756669014692307, - -0.0015304292319342494, - -0.08506463468074799, - -0.01893703266978264, - -0.0013797297142446041, - -0.0014461545506492257, - -0.0013971101725474, - -0.0005869334563612938, - -0.0005212855176068842, - -0.000876757490914315, - -0.0005256939912214875, - -0.0012863941956311464, - -0.0015691122971475124, - -0.051276568323373795, - -0.00973513163626194, - -0.0010469438275322318, - -0.0011531615164130926, - -0.0009969270322471857, - -0.00038342276820912957, - -0.0004032037395518273, - -0.000730247818864882, - -0.0003275334893260151, - -0.0008700875914655626, - -0.0017572689102962613, - -0.044393111020326614, - -0.013102858327329159, - -0.0011463745031505823, - -0.0012070996453985572, - -0.0012325793504714966, - -0.0005048430757597089, - -0.0004876854654867202, - -0.0007901645149104297, - -0.00041500062798149884, - -0.0009869233472272754, - -0.0018687656847760081, - -0.03943866863846779, - -0.014425630681216717, - -0.0014756753807887435, - -0.001423775334842503, - -0.001209719106554985, - -0.0005046047735959291, - -0.00042167355422861874, - -0.0007688426994718611, - -0.0002699726028367877, - -0.0006598440813831985, - -0.0017849955474957824, - -0.038999658077955246, - -0.012665312737226486, - -0.0014427024871110916, - -0.0014492495683953166, - -0.001016576774418354, - -0.00042083943844772875, - -0.00033241944038309157, - -0.0006403064471669495, - -0.00022373080719262362, - -0.0007053509471006691, - -0.0018597226589918137, - -0.030997740104794502, - -0.011259939521551132, - -0.0012655591126531363, - -0.00134151556994766, - -0.0008106521563604474, - -0.0003513672563713044, - -0.0002964295563288033, - -0.0006368515896610916, - -0.00020180096908006817, - -0.0005779979983344674, - -0.0016014858847483993, - -0.0271126888692379 - ] - }, - "64": { - "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", - "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", - "generated_tokens": [ - 2, - 1784, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048 - ], - "latency": 35.2662193775177, - "cuda_graph_request_count_map": null, - "step_count": 2048, - "logprobs": [ - -12.107745170593262, - -2.9727728366851807, - -3.720092535018921, - -5.592433929443359, - -10.964235305786133, - -3.654498338699341, - -9.33439826965332, - -4.833785057067871, - -5.187321662902832, - -2.6944785118103027, - -6.9262237548828125, - -0.654232919216156, - -0.5550781488418579, - -0.21346639096736908, - -0.0134271876886487, - -0.010840100236237049, - -1.3878544569015503, - -0.6296291351318359, - -7.9766011238098145, - -0.4393192231655121, - -5.639142036437988, - -3.277270793914795, - -1.0206468105316162, - -11.703084945678711, - -0.7100943922996521, - -0.2809169888496399, - -2.771284818649292, - -7.190817832946777, - -4.048691749572754, - -0.012056218460202217, - -3.3802318572998047, - -0.6807184815406799, - -3.4844107627868652, - -3.312331199645996, - -0.5001641511917114, - -2.61255145072937, - -4.243694305419922, - -4.333778381347656, - -6.0625810623168945, - -0.011777156963944435, - -0.37577226758003235, - -0.9490834474563599, - -3.5450198650360107, - -2.1778035163879395, - -0.45957911014556885, - -3.00771164894104, - -1.7600425481796265, - -0.09766030311584473, - -2.467618942260742, - -1.329679012298584, - -0.8384320735931396, - -1.1864604949951172, - -3.628342866897583, - -0.2470003068447113, - -1.8938640356063843, - -5.168431282043457, - -0.05005566030740738, - -2.258014678955078, - -2.449028968811035, - -0.0034086955711245537, - -3.9485883712768555, - -1.6201664209365845, - -5.139942646026611, - -4.859354496002197, - -0.23686674237251282, - -0.5541543364524841, - -2.5826025009155273, - -6.114635467529297, - -4.3380208015441895, - -0.7412900924682617, - -0.3221715986728668, - -0.13805493712425232, - -4.1797332763671875, - -7.3456268310546875, - -0.13762745261192322, - -2.0905232429504395, - -1.0178627967834473, - -4.108260631561279, - -0.6007124185562134, - -1.0410642623901367, - -4.122039794921875, - -0.35905471444129944, - -1.4274661540985107, - -4.139932155609131, - -0.4237431585788727, - -1.6294409036636353, - -0.9811424016952515, - -4.132790565490723, - -1.1318120956420898, - -6.8258256912231445, - -1.5455098152160645, - -0.6984409093856812, - -13.664215087890625, - -0.1166313961148262, - -1.6347849369049072, - -0.28875046968460083, - -0.03130083531141281, - -1.5293006896972656, - -1.6488375663757324, - -4.224111557006836, - -4.760683059692383, - -1.9758747816085815, - -1.5828256607055664, - -2.8463857173919678, - -0.2620386481285095, - -1.7243889570236206, - -1.7945923805236816, - -0.8884308338165283, - -0.3766394555568695, - -0.34033581614494324, - -9.05566692352295, - -0.22754782438278198, - -0.033802058547735214, - -0.34108465909957886, - -0.5644669532775879, - -2.0925779342651367, - -4.547505855560303, - -10.870464324951172, - -1.1072022914886475, - -5.503787994384766, - -3.259672164916992, - -0.007964519783854485, - -3.0111639499664307, - -4.246737480163574, - -0.7813188433647156, - -3.331031322479248, - -4.485962867736816, - -0.9492117166519165, - -2.6757047176361084, - -1.1591349840164185, - -1.122117519378662, - -2.629878044128418, - -5.986321926116943, - -0.2146703153848648, - -0.002392764901742339, - -7.372479438781738, - -0.007077385671436787, - -0.06599216908216476, - -0.0970711037516594, - -3.2874932289123535, - -0.0019583588000386953, - -0.9122000336647034, - -4.930907249450684, - -0.019508399069309235, - -0.308611661195755, - -0.07778516411781311, - -3.8497893810272217, - -0.46124517917633057, - -0.38821348547935486, - -2.668412208557129, - -1.845987319946289, - -0.06470083445310593, - -0.006619549356400967, - -1.2610487937927246, - -0.13015533983707428, - -3.365312099456787, - -0.0014690094394609332, - -1.6789823770523071, - -1.2499005794525146, - -3.3992111682891846, - -5.563300132751465, - -0.823418140411377, - -4.24124813079834, - -1.6597849130630493, - -0.6941139698028564, - -1.5637556314468384, - -0.5482053756713867, - -0.9507225751876831, - -3.764758586883545, - -0.0006518622976727784, - -0.7540555000305176, - -5.058262825012207, - -0.3302401602268219, - -2.8130555152893066, - -0.17079885303974152, - -2.871047019958496, - -0.3991694450378418, - -3.1476998329162598, - -0.3488404452800751, - -2.0545666217803955, - -4.201597690582275, - -5.164614677429199, - -0.0271432027220726, - -0.0009785869624465704, - -3.3444161415100098, - -1.3117046356201172, - -6.375423431396484, - -0.05535568296909332, - -0.3919340968132019, - -0.060594215989112854, - -6.507473468780518, - -0.0023910999298095703, - -2.143423318862915, - -3.335618257522583, - -2.953970432281494, - -0.0013383012264966965, - -0.8080525398254395, - -0.29526084661483765, - -0.04036511853337288, - -3.231475353240967, - -1.0585589408874512, - -6.136373043060303, - -0.006182829383760691, - -0.035548023879528046, - -5.509808540344238, - -1.8490750789642334, - -9.83314037322998, - -0.07037576287984848, - -3.1621387004852295, - -6.762360095977783, - -1.3490527868270874, - -3.601043462753296, - -1.176393985748291, - -0.4342959523200989, - -0.06266004592180252, - -5.464046001434326, - -0.017946599051356316, - -1.0416009426116943, - -1.6117159128189087, - -12.289417266845703, - -1.5004339218139648, - -5.76563835144043, - -4.038386821746826, - -0.20812086760997772, - -3.6306562423706055, - -1.3901070356369019, - -1.087137222290039, - -2.423213243484497, - -4.503086090087891, - -0.0008031480247154832, - -0.03627370297908783, - -0.1653430461883545, - -7.958648681640625, - -1.1018548011779785, - -1.290948748588562, - -3.8049263954162598, - -1.8253734111785889, - -0.059022851288318634, - -0.0013984196120873094, - -4.698851585388184, - -2.5421664714813232, - -0.024493809789419174, - -4.828659534454346, - -3.0295286178588867, - -3.550312042236328, - -0.1185273677110672, - -0.22595760226249695, - -0.10782183706760406, - -1.4033282995224, - -0.4485701024532318, - -0.2889708876609802, - -0.05471855774521828, - -0.007632025051862001, - -2.1156554222106934, - -0.6249589323997498, - -4.198577404022217, - -0.14178156852722168, - -4.284021377563477, - -2.227515935897827, - -3.5022120475769043, - -0.19575819373130798, - -15.964509963989258, - -4.055960655212402, - -11.125024795532227, - -0.7681724429130554, - -3.0436902046203613, - -7.030262470245361, - -4.376729488372803, - -5.476145267486572, - -0.4219042658805847, - -3.7689766883850098, - -0.060010604560375214, - -0.8134393692016602, - -0.11386934667825699, - -0.025473715737462044, - -0.09736856073141098, - -4.357361793518066, - -0.3670865297317505, - -0.08063744008541107, - -0.1311480849981308, - -1.0903867483139038, - -1.2705107927322388, - -1.5076212882995605, - -4.295275688171387, - -0.04185756668448448, - -0.19810955226421356, - -1.9645220041275024, - -0.9597910642623901, - -0.13429655134677887, - -0.002283110748976469, - -7.066074371337891, - -3.639211654663086, - -1.0263917446136475, - -8.124760627746582, - -1.132537841796875, - -0.09160765260457993, - -0.08996370434761047, - -10.165366172790527, - -3.501585006713867, - -0.0019847711082547903, - -0.05309417471289635, - -0.31209683418273926, - -0.15089339017868042, - -1.23564875125885, - -1.2685208320617676, - -7.832758903503418, - -0.19271136820316315, - -0.014305183663964272, - -0.0007532381569035351, - -0.44688940048217773, - -2.6239724159240723, - -1.738666296005249, - -1.6480977535247803, - -0.46753185987472534, - -8.656959533691406, - -3.79868483543396, - -0.9281394481658936, - -2.2381181716918945, - -1.7654449939727783, - -0.4948798418045044, - -0.025028761476278305, - -1.5435361862182617, - -1.6390818357467651, - -1.4962153434753418, - -0.3425217270851135, - -0.013077914714813232, - -0.038474079221487045, - -5.3364362716674805, - -0.42365288734436035, - -1.884093999862671, - -3.510357618331909, - -6.198029518127441, - -0.44375038146972656, - -0.0008789013954810798, - -3.6025230884552, - -1.419615626335144, - -2.6723289489746094, - -5.775190830230713, - -1.1380761861801147, - -2.6683366298675537, - -0.43395891785621643, - -0.003145867260172963, - -8.63144302368164, - -1.646262764930725, - -1.732487678527832, - -4.561546802520752, - -0.5277953147888184, - -0.07333153486251831, - -0.5624169707298279, - -0.12201295047998428, - -2.6561455726623535, - -1.1071691513061523, - -2.6895060539245605, - -0.040864069014787674, - -0.04126371443271637, - -1.8294739723205566, - -0.09022177755832672, - -0.3154001832008362, - -0.46215569972991943, - -2.2462844848632812, - -0.30149081349372864, - -0.52588951587677, - -8.288043975830078, - -0.0002057340752799064, - -0.8021711707115173, - -4.4546098709106445, - -0.0001565095444675535, - -0.0015961299650371075, - -0.15216240286827087, - -0.3677564561367035, - -5.018707275390625, - -0.7850045561790466, - -1.9582659006118774, - -1.0046892166137695, - -10.0401029586792, - -0.16878114640712738, - -5.944240570068359, - -1.5523078441619873, - -5.7253522872924805, - -0.47948503494262695, - -0.44009655714035034, - -5.671053886413574, - -0.003280022880062461, - -0.7937742471694946, - -0.9639376401901245, - -0.00030048147891648114, - -1.0747740268707275, - -0.8839919567108154, - -3.416811466217041, - -1.6602673530578613, - -0.2706959843635559, - -0.0024333172477781773, - -4.478696823120117, - -6.20179557800293, - -0.11359559744596481, - -0.202009916305542, - -0.022310219705104828, - -2.367263078689575, - -1.0405994653701782, - -5.984308242797852, - -2.105138063430786, - -9.583202362060547, - -0.0004957877099514008, - -3.0655455589294434, - -0.0669412910938263, - -0.8977450728416443, - -2.2271294593811035, - -2.6617536544799805, - -1.8184051513671875, - -0.8291114568710327, - -0.4864235818386078, - -0.7993525862693787, - -3.51106858253479, - -2.1530935764312744, - -0.257144957780838, - -1.3934082984924316, - -1.3137131929397583, - -0.3384077548980713, - -0.1697217971086502, - -2.353395938873291, - -0.03406282886862755, - -0.39059701561927795, - -3.422821044921875, - -1.7117210626602173, - -0.7018465399742126, - -1.5995906591415405, - -3.6218395233154297, - -0.12497704476118088, - -0.16966234147548676, - -0.7313685417175293, - -0.4956285357475281, - -1.0840849876403809, - -5.042126655578613, - -0.00031704644788987935, - -7.683258056640625, - -0.9210801720619202, - -4.687852382659912, - -0.0028814247343689203, - -0.043382611125707626, - -4.1948652267456055, - -2.66593337059021, - -0.06153333932161331, - -0.0023110604379326105, - -6.729236602783203, - -5.777127742767334, - -0.08932067453861237, - -0.09890018403530121, - -0.009886111132800579, - -3.1145148277282715, - -3.725565195083618, - -0.0021998509764671326, - -3.9927196502685547, - -2.753793239593506, - -1.6037236452102661, - -0.17461130023002625, - -4.804804801940918, - -0.2311229705810547, - -0.30256444215774536, - -2.235363006591797, - -0.006614102050662041, - -0.34757524728775024, - -1.4946835041046143, - -1.222062587738037, - -3.658839225769043, - -1.356170892715454, - -0.5371109843254089, - -3.7580835819244385, - -4.54621696472168, - -0.31577637791633606, - -3.677156925201416, - -2.7181396484375, - -7.4674882888793945, - -0.00019369633810129017, - -2.3798398971557617, - -2.5452184677124023, - -0.2858496308326721, - -4.315659523010254, - -0.025835415348410606, - -0.000603493710514158, - -0.2546294331550598, - -0.12032663822174072, - -2.006908655166626, - -5.990736961364746, - -7.146596908569336, - -0.23356498777866364, - -0.2201036810874939, - -0.01235415879637003, - -0.011248741298913956, - -1.4155778884887695, - -0.40242519974708557, - -5.877886772155762, - -0.7865053415298462, - -0.03231288120150566, - -0.004864405374974012, - -0.0050629740580916405, - -2.7049152851104736, - -6.822089195251465, - -0.39252761006355286, - -1.2290617227554321, - -0.007630132604390383, - -3.485461711883545, - -0.47985684871673584, - -6.1813530921936035, - -0.03757825121283531, - -0.37834712862968445, - -0.22192610800266266, - -1.165318489074707, - -0.5220151543617249, - -0.1289423257112503, - -3.216222047805786, - -1.0787583589553833, - -3.0716826915740967, - -0.6023419499397278, - -2.558605194091797, - -0.927433431148529, - -0.00364841241389513, - -0.14910078048706055, - -0.7318926453590393, - -6.159773826599121, - -0.0015301911626011133, - -1.8908276557922363, - -1.9641315937042236, - -0.021651331335306168, - -2.1648828983306885, - -2.2700207233428955, - -7.833290100097656, - -0.03397307172417641, - -0.8344621658325195, - -0.02225659228861332, - -0.06639260798692703, - -2.3780317306518555, - -3.180129051208496, - -0.09030630439519882, - -2.4138312339782715, - -1.3445552587509155, - -1.848326325416565, - -0.9726964831352234, - -2.851792335510254, - -0.0630769282579422, - -0.0011394681641831994, - -0.05843213573098183, - -2.6616668701171875, - -1.575437068939209, - -0.180197611451149, - -5.552371501922607, - -0.26108410954475403, - -2.529611587524414, - -0.37780019640922546, - -5.141795635223389, - -0.5921107530593872, - -0.2474975287914276, - -0.10687454044818878, - -4.891775131225586, - -0.25011152029037476, - -2.4100728034973145, - -1.358667016029358, - -2.790961503982544, - -3.8654675483703613, - -1.0076243877410889, - -0.7456949949264526, - -1.5575554370880127, - -2.05328631401062, - -1.6538066864013672, - -0.0558217354118824, - -0.0001817776501411572, - -0.0011643542675301433, - -0.038359593600034714, - -1.4208931922912598, - -0.542127251625061, - -0.3162364959716797, - -0.3966117799282074, - -1.1765563488006592, - -1.7920958995819092, - -0.18425509333610535, - -0.1092008650302887, - -0.46676987409591675, - -0.24977745115756989, - -1.0375996828079224, - -0.5268858671188354, - -0.008942908607423306, - -0.6404479146003723, - -0.0033111530356109142, - -5.3165931603871286e-05, - -0.5154370665550232, - -0.39286962151527405, - -1.401839256286621, - -0.6232213973999023, - -0.02168831042945385, - -0.004282470792531967, - -0.005199837032705545, - -0.09748794883489609, - -0.040823787450790405, - -0.00014852374442853034, - -0.0005832401220686734, - -0.005303124897181988, - -0.6537013053894043, - -0.38026049733161926, - -0.04189129173755646, - -0.010385753586888313, - -0.008756335824728012, - -0.013362848199903965, - -0.000504723924677819, - -0.002797620603814721, - -0.0014512732159346342, - -0.0013321106089279056, - -0.010883613489568233, - -0.005159396678209305, - -0.004701037425547838, - -0.01591104455292225, - -0.001474246964789927, - -1.2278481335670222e-05, - -0.010548785328865051, - -0.08341525495052338, - -0.03858809545636177, - -0.056062061339616776, - -0.0009532198309898376, - -0.0005789510905742645, - -0.0008986725588329136, - -0.00710969977080822, - -0.0006561510381288826, - -1.4781842764932662e-05, - -5.578839045483619e-05, - -0.0006398299592547119, - -0.0028786908369511366, - -0.0034092895220965147, - -0.008268529549241066, - -0.006602259818464518, - -0.004517706111073494, - -0.02233586646616459, - -0.0006323245470412076, - -0.009195122867822647, - -0.0029284947086125612, - -0.004457537550479174, - -0.017873765900731087, - -0.008801711723208427, - -0.0036383166443556547, - -0.08078611642122269, - -0.006347495596855879, - -0.0002177716523874551, - -0.04688572511076927, - -0.2718890309333801, - -0.07996802777051926, - -0.04674842208623886, - -0.009984076954424381, - -0.010000954382121563, - -0.050126753747463226, - -0.5864179730415344, - -0.2915390133857727, - -0.008090462535619736, - -0.032190412282943726, - -0.03461571782827377, - -0.2785419523715973, - -0.05830562859773636, - -0.02893950417637825, - -0.12241066247224808, - -0.02711048536002636, - -0.16450686752796173, - -0.09852994978427887, - -0.2651047706604004, - -0.35559725761413574, - -0.12606258690357208, - -0.32793670892715454, - -0.20878805220127106, - -0.7587923407554626, - -0.4730657637119293, - -1.496794581413269, - -0.2486363798379898, - -0.20723387598991394, - -0.1872958242893219, - -0.19151091575622559, - -0.3350580036640167, - -1.3085839748382568, - -0.6109102964401245, - -0.2947172224521637, - -0.37130236625671387, - -0.19888469576835632, - -0.18297068774700165, - -0.9978674054145813, - -0.5471905469894409, - -0.4379975199699402, - -0.407988041639328, - -0.17325688898563385, - -0.16938896477222443 - ] - }, - "96": { - "input_prompt": "what is the concept of double materiality in sustainability?", - "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", - "generated_tokens": [ - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318 - ], - "latency": 50.88405132293701, - "cuda_graph_request_count_map": null, - "step_count": 2048, - "logprobs": [ - -4.917365074157715, - -0.9960631132125854, - -7.875392913818359, - -0.2993181347846985, - -7.760880470275879, - -10.308395385742188, - -2.1807961463928223, - -1.6412583589553833, - -9.521512985229492, - -1.627489447593689, - -1.8410861492156982, - -0.9285702705383301, - -0.2576955556869507, - -0.9641067981719971, - -0.02314644306898117, - -0.6696561574935913, - -0.07035009562969208, - -0.004622488282620907, - -0.025748632848262787, - -0.06276137381792068, - -0.17385317385196686, - -0.3285445272922516, - -0.0592009499669075, - -0.007940039038658142, - -0.22664028406143188, - -0.0017957051750272512, - -0.022929180413484573, - -0.005733947269618511, - -0.0012996093137189746, - -0.006419987417757511, - -0.02376849390566349, - -0.27800270915031433, - -0.4650723934173584, - -0.04936715215444565, - -0.003972141072154045, - -0.01477995328605175, - -0.0012044801842421293, - -0.014891182072460651, - -0.002709767082706094, - -0.0009939497103914618, - -0.0028436246793717146, - -0.006759870797395706, - -0.15416178107261658, - -0.20121537148952484, - -0.016414370387792587, - -0.0015769677702337503, - -0.008138825185596943, - -0.0007713441736996174, - -0.013819841668009758, - -0.003826678032055497, - -0.0005918181850574911, - -0.0014938872773200274, - -0.00485716899856925, - -0.081083282828331, - -0.09642580896615982, - -0.009630884043872356, - -0.0010948146227747202, - -0.007085552904754877, - -0.0006310140597634017, - -0.013073914684355259, - -0.0039152647368609905, - -0.000364713923772797, - -0.001292108790948987, - -0.004158303141593933, - -0.044283974915742874, - -0.05722038820385933, - -0.006369172595441341, - -0.0007976687629707158, - -0.005993015132844448, - -0.0004935238393954933, - -0.011310506612062454, - -0.002951553324237466, - -0.000387831823900342, - -0.000977038755081594, - -0.0036971091758459806, - -0.030511993914842606, - -0.04246694967150688, - -0.004863100592046976, - -0.0006927236099727452, - -0.005206122528761625, - -0.0005129451747052372, - -0.00894621666520834, - -0.0028565814718604088, - -0.00041333239641971886, - -0.0009002208826132119, - -0.0033131728414446115, - -0.021188799291849136, - -0.03330245241522789, - -0.0038543473929166794, - -0.0006504327175207436, - -0.004474864806979895, - -0.00048029806930571795, - -0.009718249551951885, - -0.0030443770810961723, - -0.0003743662964552641, - -0.0009439303539693356, - -0.003729770192876458, - -0.016505014151334763, - -0.0290373582392931, - -0.003315192647278309, - -0.0005821678787469864, - -0.004148805979639292, - -0.00042489083716645837, - -0.006856840569525957, - -0.0028660909738391638, - -0.00032574593205936253, - -0.0006986799417063594, - -0.003671098267659545, - -0.012792548164725304, - -0.02553274855017662, - -0.002730690874159336, - -0.0005067494930699468, - -0.0036923582665622234, - -0.0004451475979294628, - -0.007225453853607178, - -0.002746859099715948, - -0.0003323002893012017, - -0.0008517451351508498, - -0.003630714723840356, - -0.011284693144261837, - -0.02186688780784607, - -0.0025819556321948767, - -0.0004931663861498237, - -0.0031721293926239014, - -0.00040725519647821784, - -0.0062383925542235374, - -0.00238693761639297, - -0.00040749352774582803, - -0.0007970731821842492, - -0.003604583442211151, - -0.010075542144477367, - -0.022386692464351654, - -0.0025295186787843704, - -0.00046302087139338255, - -0.003138143103569746, - -0.0004233417857903987, - -0.006178564392030239, - -0.0021037368569523096 - ] - }, - "127": { - "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", - "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", - "generated_tokens": [ - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505 - ], - "latency": 62.613630533218384, - "cuda_graph_request_count_map": null, - "step_count": 2048, - "logprobs": [ - -7.793755531311035, - -4.224940776824951, - -5.446587562561035, - -0.267395555973053, - -6.118141174316406, - -11.105525970458984, - -6.67517614364624, - -6.902530670166016, - -3.2218151092529297, - -5.99644660949707, - -8.431673049926758, - -8.04836368560791, - -3.992594003677368, - -7.550079345703125, - -2.873685836791992, - -3.3877997398376465, - -7.9306745529174805, - -7.005484580993652, - -1.0481306314468384, - -2.686237335205078, - -6.131283760070801, - -6.2994704246521, - -7.931419372558594, - -11.3147554397583, - -8.544670104980469, - -12.01729679107666, - -3.89847469329834, - -1.7964364290237427, - -2.952878952026367, - -1.9217232465744019, - -2.272329330444336, - -0.37552154064178467, - -1.667820692062378, - -7.510344505310059, - -3.498040199279785, - -7.980632305145264, - -7.672002792358398, - -4.4999470710754395, - -7.155375003814697, - -2.4486124515533447, - -4.785946846008301, - -1.153855800628662, - -2.3994438648223877, - -4.250652313232422, - -12.24446964263916, - -8.344388008117676, - -2.608186721801758, - -5.200589179992676, - -8.25888442993164, - -3.6245617866516113, - -7.689338207244873, - -7.345355033874512, - -1.2661759853363037, - -7.265620231628418, - -1.9884108304977417, - -6.269482612609863, - -2.41705584526062, - -1.8929681777954102, - -1.8259913921356201, - -2.0997350215911865, - -2.323200225830078, - -1.3998825550079346, - -0.8789899945259094, - -1.082053542137146, - -1.1831339597702026, - -1.4462857246398926, - -1.6481035947799683, - -1.4408715963363647, - -1.2603964805603027, - -1.5267670154571533, - -1.6345772743225098, - -1.3796477317810059, - -0.7609691023826599, - -0.3548354506492615, - -0.7552334666252136, - -0.44776833057403564, - -1.1078286170959473, - -1.3036658763885498, - -0.5214896202087402, - -0.8486822843551636, - -0.22470997273921967, - -0.4705755412578583, - -0.5639711022377014, - -0.5388108491897583, - -0.6052999496459961, - -0.1002030223608017, - -0.286334365606308, - -0.45798981189727783, - -1.0107953548431396, - -0.11875647306442261, - -0.6969441771507263, - -0.4609107971191406, - -0.07614769786596298, - -0.5035472512245178, - -0.1682187020778656, - -0.10476160794496536, - -0.6586751341819763, - -0.35806939005851746, - -1.5364394187927246, - -2.4093759059906006, - -1.977368950843811, - -1.6216907501220703, - -0.27647316455841064, - -0.2991848587989807, - -0.2783535420894623, - -0.05913994088768959, - -0.03023873083293438, - -0.043339803814888, - -0.7320341467857361, - -0.0030677898321300745, - -0.0332595594227314, - -0.012804670259356499, - -0.004041599575430155, - -0.0014899593079462647, - -0.001948602613992989, - -0.0029070996679365635, - -0.040939707309007645, - -0.013942227698862553, - -0.04897322878241539, - -0.011005887761712074, - -0.0044113704934716225, - -0.0013179434463381767, - -0.003658389439806342, - -0.009758152067661285, - -0.0014104428701102734, - -0.0016671819612383842, - -0.000771939754486084, - -0.0015519729349762201, - -0.003720743814483285, - -0.004249115474522114, - -0.00485657574608922, - -0.005053604021668434, - -0.002336274366825819, - -0.0009155849111266434, - -0.0004978132783435285, - -0.0005953923100605607, - -0.0011395872570574284, - -0.001485078944824636, - -0.3072909712791443, - -1.7295066118240356, - -0.4807289242744446, - -0.1245415136218071, - -0.011858444660902023, - -0.020613837987184525, - -0.011020978912711143, - -0.003106294432654977, - -0.0009966888464987278, - -0.0019349202048033476, - -0.037407051771879196, - -0.0003496989083942026, - -0.005922981072217226, - -0.007394562941044569, - -0.0006037319544702768, - -0.0008836655179038644, - -0.0002884448622353375, - -0.00047600860125385225, - -0.0024947968777269125, - -0.00442774873226881, - -0.004059052560478449, - -0.0018594847060739994, - -0.0006179092451930046, - -0.00022635281493421644, - -0.0006730675231665373, - -0.003022746881470084, - -0.0002343380037928, - -0.00047791501856409013, - -9.440929716220126e-05, - -0.00021550717065110803, - -0.0013523490633815527, - -0.0032202552538365126, - -0.001157686347141862, - -0.004449942149221897, - -0.0016590891173109412, - -0.00101062236353755, - -0.0003079893649555743, - -0.00048375347978435457, - -0.0021734442561864853, - -0.00423036003485322, - -0.11514264345169067, - -0.8658493757247925, - -0.084366075694561, - -0.02140468917787075, - -0.0060798698104918, - -0.008638513274490833, - -0.003212531330063939, - -0.0009598892065696418, - -0.00032085992279462516 - ] - }, - "throughput": 120.8737525217505 + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 9.833553552627563, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + -3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, + -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + -1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + -6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + -5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 49.05716586112976, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + -0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 87.92628955841064, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + -6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + -2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, + -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + -1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + -0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 126.90091466903687, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + -0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 156.19056010246277, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + -2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": [ + 92.14086318169623, + 104.14077061259405, + 104.70701879377005 + ] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index 13e56a13c85..96ada2bf1e9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml index b99100f65eb..a4f47d3705f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml index 7a2cc9b0c78..59186f8d532 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml index 0b31d16af75..612e621534d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml index 3b10336138d..cb06eae2e7e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 # See the mount paths defined in the top level tests/test_utils/recipes/gpt-static-inference.yaml --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml index 04e6caa3303..c080cd5f5a7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_cudagraphs/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml index 9aa1a6e1c96..e3a4d695ead 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml index b3564f8226a..90a1836347e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --micro-batch-size: 1 --no-load-optim: true --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml index 4350c4a6f50..199cf809ba2 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G/model_config.yaml @@ -22,13 +22,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml index b571dca2dd0..0983337becc 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G/model_config.yaml @@ -22,13 +22,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml index 941d3f6f829..7f7aac5d78b 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G/model_config.yaml @@ -22,13 +22,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml index 588cfe3e80a..93418f580fc 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G/model_config.yaml @@ -22,13 +22,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml index 75e4d3123bd..7702274db5f 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_cudagraphs/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml index 301b68e7382..9a7769eb432 100644 --- a/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/hybrid/hybrid_static_inference_tp1_pp1_2B_logitsmatch/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer diff --git a/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml b/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml index ced98a352b1..2daf74b89a7 100644 --- a/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml +++ b/tests/functional_tests/test_cases/mimo/mimo_vlm_pretrain_convergence_tp1_pp1_cp1_dp8/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 2200 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml index 6bdb19e1001..cdabc4b6225 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml index 97db543f73c..731ff82d8d4 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/model_config.yaml index 8f4f022345a..f7fd8b2963d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer/model_config.yaml @@ -15,13 +15,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml index aa83c79ceb2..61b5c9339ba 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml index 758f7af8f0f..a3995df9627 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml index 2ef041c07af..8672163186c 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml index 29a63c7d148..4ed0bb89001 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/model_config.yaml index a15bbf77196..8e267b178b4 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml index a7e85122831..9490d832f7d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/model_config.yaml index a5f390a463d..b84bf45b890 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/model_config.yaml index 7ffcd448b37..b5c774d4d3c 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml index e7aa73ba6b1..d02951177b0 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/model_config.yaml index 3806ae26529..8c75b0a2e76 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/model_config.yaml index 4820a43bf3f..978babb72ff 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/model_config.yaml index 488b8ad92d2..b6a7c223acc 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml index 52eb433afd5..4c991767ca3 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/model_config.yaml @@ -39,8 +39,8 @@ MODEL_ARGS: --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 16 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml similarity index 96% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml index e8c45375110..a1a5219ecb4 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/model_config.yaml @@ -39,8 +39,8 @@ MODEL_ARGS: --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 16 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml similarity index 96% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml index c7f0bde3e82..bd565830970 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/model_config.yaml @@ -39,8 +39,8 @@ MODEL_ARGS: --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 16 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml similarity index 96% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml index bf1c5a45cc9..efb1fedf93c 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental/model_config.yaml @@ -40,8 +40,8 @@ MODEL_ARGS: --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 16 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml index e593e94f5ac..3ecd68b9841 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 45ae64df053..c147b689e71 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml index bb3f5df251d..f77c2a41f68 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml index 5ce2939b05d..12e6698a5f4 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml index 60652f0ded9..c714e058651 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml index 8411f00055e..86a05a93562 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml index ac03efd36a5..5020d9d9397 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_dev.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_dev.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_dev.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_dev.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_lts.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_lts.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_lts.json rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/golden_values_lts.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml similarity index 91% rename from tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml index 989a24acaf7..d763069b566 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -17,13 +17,13 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml index b95d5c04a1a..cd7656d240f 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --max-position-embeddings: 1024 --disable-bias-linear: true --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_dgxc.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml similarity index 92% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml index 5268bf68b33..fb438f0edda 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/model_config.yaml @@ -18,13 +18,13 @@ MODEL_ARGS: --max-position-embeddings: 1024 --disable-bias-linear: true --train-iters: 100 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 --distributed-backend: nccl --lr: 0.00015 diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index c6e7916ea72..0e1f9110793 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -10,9 +10,9 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl @@ -82,4 +82,4 @@ MODEL_ARGS: --inference-repeat-n: 8 METRICS: - "generated_tokens" - - "logprobs" \ No newline at end of file + - "logprobs" diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index d94b06f5ac8..1b9eaaf1f65 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 2289078dd5b..3ba12056190 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -10,9 +10,9 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 - --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints - --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer --tiktoken-pattern: v2 --distributed-backend: nccl diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml index a9171008b7c..569eb969d72 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer diff --git a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 116992b2d7f..366d2f23575 100644 --- a/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -10,7 +10,7 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/checkpoints --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/deepseek_16b_pyt/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json --tokenizer-type: TikTokenizer diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/model_config.yaml index 4b59ffaca86..2898070f957 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp1_pp1/model_config.yaml @@ -19,7 +19,7 @@ MODEL_ARGS: --seq-length: 1024 --max-position-embeddings: 1024 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/model_config.yaml index a13b09397eb..23bdaac5010 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mcore_te_tp4_sp_cp2/model_config.yaml @@ -21,7 +21,7 @@ MODEL_ARGS: --disable-vision-class-token: true --max-position-embeddings: 4096 --train-iters: 50 - --timing-log-level: 2 + --timing-log-level: 0 --lr-decay-iters: 320000 --save: ${CHECKPOINT_SAVE_PATH} --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/model_config.yaml index 234236c7d26..c2798ecf6af 100644 --- a/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_11b_mcore_tp4_pp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml index 76afded197d..aa0f67ff311 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 50 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/model_config.yaml index 2ab4e9730d7..59c1d0f280f 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml index 37085e01771..80a84a26e0c 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/model_config.yaml index 54ad28a8e8a..047280dec39 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/model_config.yaml index 9cc675a35f6..1611c02251b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_te_tp4_pp1_resume_torch_dist/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 50 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/model_config.yaml index 46e7209823f..12ccecb5883 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml index 0b11a3c137c..8559fd587d1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 50 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci-ord.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxa100_dracooci.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/model_config.yaml index c305e4a86dd..9c6a835571c 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp2_pp1_vp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/model_config.yaml index 5dc3478de12..dd3896ad88a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 10000 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/model_config.yaml similarity index 98% rename from tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml rename to tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 1bf1e028390..4c955dd5441 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -41,7 +41,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-validation-ppl-to-tensorboard: true --log-timers-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --log-interval: 1 --save-interval: 50 --eval-interval: 1000 diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index d30207b5b51..964acdba5cf 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -55,7 +55,7 @@ MODEL_ARGS: --log-num-zeros-in-grad: true --log-params-norm: true --log-validation-ppl-to-tensorboard: true - --timing-log-level: 2 + --timing-log-level: 0 --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} METRICS: diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_weekly_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_weekly_mcore_te_tp2_pp1_vp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/t5/t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/t5/t5_weekly_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts_dgx_a100.json diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index 0e3ed179f4a..7dc4a7357a7 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -288,6 +288,7 @@ def is_flaky_failure(concat_allranks_logs: str) -> bool: or "Unpack failed: incomplete input" in concat_allranks_logs or "unspecified launch failure" in concat_allranks_logs or "free(): corrupted unsorted chunks" in concat_allranks_logs + or "Segfault encountered" in concat_allranks_logs ) diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index b3032eb15c4..648ac28d19a 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -13,6 +13,34 @@ logger = logging.getLogger(__name__) +def is_flaky_failure(concat_allranks_logs: str) -> bool: + """Assumes that certain keywords hint towards intermittent failures""" + + return ( + "The server socket has failed to listen on any local network address." + in concat_allranks_logs + or "Some NCCL operations have failed or timed out." in concat_allranks_logs + or "uncorrectable ECC error encountered" in concat_allranks_logs + or "illegal memory access" in concat_allranks_logs + or "illegal instruction" in concat_allranks_logs + or "torch.distributed.DistNetworkError" in concat_allranks_logs + or "Segmentation fault" in concat_allranks_logs + or "found NaN in" in concat_allranks_logs + or "For debugging consider passing CUDA_LAUNCH_BLOCKING=1" in concat_allranks_logs + or "double free or corruption" in concat_allranks_logs + or "Call to CUDA function failed." in concat_allranks_logs + or "Connection reset by peer" in concat_allranks_logs + or "invalid pointer" in concat_allranks_logs + or "malloc(): unaligned tcache chunk detected" in concat_allranks_logs + or "zmq.error.ZMQError: Address already in use" in concat_allranks_logs + or "We couldn't connect to 'https://huggingface.co'" in concat_allranks_logs + or "Unpack failed: incomplete input" in concat_allranks_logs + or "unspecified launch failure" in concat_allranks_logs + or "free(): corrupted unsorted chunks" in concat_allranks_logs + or "Segfault encountered" in concat_allranks_logs + ) + + @click.command() @click.option("--scope", required=True, type=str, help="Scope of the workload") @click.option("--model", required=True, type=str, help="Model of the workload") @@ -89,11 +117,39 @@ def main( packager=run.Packager(), volumes=artifacts, ) - with run.Experiment("mcore-ci-test", executor=executor, log_level="INFO") as exp: - _ = exp.add([inline_script], tail_logs=False, name="task-1") - exp.dryrun(log=True) - exp.run(detach=False, tail_logs=True, sequential=False) + n_attempts = 0 + while n_attempts < 3: + with run.Experiment("mcore-ci-test", executor=executor, log_level="INFO") as exp: + _ = exp.add([inline_script], tail_logs=False, name="task-1") + + exp.dryrun(log=True) + exp.run(detach=False, tail_logs=True, sequential=False) + + result_dict = exp.status(return_dict=True) + _, job_dict = list(result_dict.items())[0] + succeeded = str(job_dict["status"]) == "SUCCEEDED" + + if succeeded: + logger.info(f"Job succeeded with status: {job_dict["status"]}") + sys.exit(0) + + logger.error(f"Job failed with status: {job_dict["status"]}") + log_file_paths = pathlib.Path(os.getcwd()).glob("assets_dir/logs/*/*/attempt_0/*/std*.log") + all_ranks_all_logs = [] + for log_file_path in log_file_paths: + with open(log_file_path, "r") as f: + all_logs = f.readlines() + all_ranks_all_logs.extend(all_logs) + all_ranks_all_logs_string = "\n".join(all_ranks_all_logs) + if is_flaky_failure(all_ranks_all_logs_string): + logger.warning("Detected flaky failure, attempt restart.") + n_attempts += 1 + continue + + sys.exit(1) + + sys.exit(1) result_dict = exp.status(return_dict=True) _, job_dict = list(result_dict.items())[0] diff --git a/tests/test_utils/recipes/ckpt_converter.yaml b/tests/test_utils/recipes/ckpt_converter.yaml index 5d705869958..f78f184a326 100644 --- a/tests/test_utils/recipes/ckpt_converter.yaml +++ b/tests/test_utils/recipes/ckpt_converter.yaml @@ -34,6 +34,7 @@ spec: rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ script: |- ls + cd /opt/megatron-lm torchrun \ diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml index dd90bc38e88..47b8d346150 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml @@ -10,8 +10,6 @@ spec: gpus: 1 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/model/mcore_mistral: model/mcore_mistral/nemo_minitron-0.5b/v1 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index 56ecdabcded..dd8cf6b945d 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -10,8 +10,6 @@ spec: gpus: 1 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/model/mcore_mistral: model/mcore_mistral/nemo_minitron-0.5b/v1 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/gpt-grads.yaml index 205985d5e13..cdd3a050ff2 100644 --- a/tests/test_utils/recipes/gpt-grads.yaml +++ b/tests/test_utils/recipes/gpt-grads.yaml @@ -10,8 +10,6 @@ spec: gpus: 8 n_repeat: 1 platforms: dgx_h100 - artifacts: - /mnt/artifacts/text/the_pile/shard00: text/the_pile/shard00 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc @@ -61,7 +59,7 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - test_case: [gpt3_mr_mcore_reruns_resume_check_grads] + - test_case: [gpt3_mcore_reruns_resume_check_grads] products: - environment: [dev] scope: [mr, mr-github] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 5eb29ac2605..0dafb8685c2 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -60,43 +60,43 @@ products: ####################################################################### # Nightly tests: Run both DEV and LTS unless something is flaky # ####################################################################### - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] + - test_case: [gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] scope: [nightly] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2] + - test_case: [gpt3_mcore_tp1_pp2] products: - environment: [lts] scope: [nightly] - environment: [dev] scope: [nightly] platforms: [dgx_h100] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist] + - test_case: [gpt3_mcore_tp1_pp2_resume_torch_dist] products: - environment: [dev, lts] scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4] + - test_case: [gpt3_mcore_tp1_pp4] products: - environment: [lts] scope: [nightly] - environment: [dev] scope: [nightly] platforms: [dgx_h100] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist] + - test_case: [gpt3_mcore_tp1_pp4_resume_torch_dist] products: - environment: [dev, lts] scope: [nightly] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch] + - test_case: [gpt3_mcore_tp4_pp1_resume_torch] products: - environment: [lts] scope: [nightly] - environment: [dev] scope: [nightly] platforms: [dgx_h100] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist] + - test_case: [gpt3_mcore_tp4_pp1_resume_torch_dist] products: - environment: [lts] scope: [nightly] @@ -107,215 +107,215 @@ products: # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for # # some very important tests. # ####################################################################### - - test_case: [gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - # - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic] # products: # - environment: [dev] # scope: [mr] # - environment: [lts] # scope: [nightly] # Non-deterministic: #487 - - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # outdated TE: #501 - - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #436 - - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #437 - - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - # - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] # Hangs: #513 # - environment: [lts] # scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] # Hangs: #513 - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied] products: # - environment: [dev] # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap] products: # - environment: [dev] # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion] products: - environment: [dev] scope: [mr, mr-github] @@ -323,110 +323,110 @@ products: - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_mla] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader] products: # - environment: [dev] # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G] + - test_case: [gpt3_mcore_tp2_pp2_uninstall_te] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed] + - test_case: [gpt3_7b_tp1_pp4_memory_speed] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 - - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed] + - test_case: [gpt3_7b_tp4_pp1_memory_speed] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 - - test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_fsdp_dtensor_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume] + - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # Outdated: #502 - # - test_case: [gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist] # products: # - environment: [dev] # scope: [mr] # Broken: #484 @@ -435,21 +435,21 @@ products: ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### - - test_case: [gpt3_mr_mcore_reruns_persistent_1] + - test_case: [gpt3_mcore_reruns_persistent_1] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - # - test_case: [gpt3_mr_mcore_reruns_persistent_2] + # - test_case: [gpt3_mcore_reruns_persistent_2] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [lts] scope: [mr] @@ -459,14 +459,14 @@ products: - environment: [dev] scope: [mr-slim] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [mr] - - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [lts] scope: [mr] @@ -476,79 +476,79 @@ products: - environment: [dev] scope: [mr-slim] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [mr] - # - test_case: [gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_a100, dgx_h100] - # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_b200] - # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_b200] - # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_b200] - # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_mxfp8_tp_sp_cp] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_b200] - # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] + # - test_case: [gpt3_weekly_dgx_b200_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_b200] - - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] + - test_case: [gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] products: - environment: [dev] scope: [weekly] platforms: [dgx_h100] - # - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] + # - test_case: [gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_h100] - # - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] + # - test_case: [gpt3_weekly_dgx_h100_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap] # products: # - environment: [dev] # scope: [weekly] # platforms: [dgx_h100] - - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] + - test_case: [gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap] products: - environment: [dev] scope: [weekly] platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te] # products: # - environment: [dev, lts] # scope: [mr] # Non-deterministic: #483 diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index a4eaecaa53e..e727c4db5ee 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -10,8 +10,6 @@ spec: gpus: 1 n_repeat: 1 platforms: dgx_a100 - artifacts: - /workspace/data/mamba_hybrid_2b: model/mamba_hybrid_2b/dcp/mcore-v1_bf16 script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml index f2f98fbc146..c11cd294592 100644 --- a/tests/test_utils/recipes/moe-static-inference.yaml +++ b/tests/test_utils/recipes/moe-static-inference.yaml @@ -10,7 +10,6 @@ spec: gpus: 8 n_repeat: 1 platforms: dgx_a100 - artifacts: script_setup: | unset https_proxy echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index fd8f00c242f..8164ca37df8 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -60,17 +60,17 @@ products: ####################################################################### # Nightly tests: Run both DEV and LTS unless something is flaky # ####################################################################### - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel] + - test_case: [gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel] products: - environment: [dev] scope: [nightly] platforms: [dgx_a100, dgx_h100] - - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last] + - test_case: [gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last] products: - environment: [dev] scope: [nightly] platforms: [dgx_a100, dgx_h100] - # - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts] + # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts] # products: # non-determinism: #478 # - environment: [dev, lts] # scope: [nightly] @@ -81,45 +81,45 @@ products: # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for # # some very important tests. # ####################################################################### - - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] + - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] # hang: #513 - # - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] + # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] # hang: #513 - - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] # TODO: The migration of custom fsdp causes EP + FSDP to be temporarily unavailable, which will be fixed in a subsequent MR. - # - test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G] + - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: - environment: [dev] scope: [mr] @@ -127,12 +127,12 @@ products: ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### - # - test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer] # products: # - environment: [dev] # scope: [mr] # platforms: [dgx_h100] - # - test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G] + # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM] # products: # - environment: [dev] # scope: [mr] @@ -140,7 +140,7 @@ products: ########################### # Merge train tests # ########################### - - test_case: [gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] + - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] products: - environment: [dev] scope: [mr] @@ -156,7 +156,7 @@ products: - environment: [dev] scope: [mr-slim] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] + - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] products: - environment: [dev] scope: [mr] diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml index 65393f14f50..80a30f050bc 100644 --- a/tests/test_utils/recipes/multimodal-llava.yaml +++ b/tests/test_utils/recipes/multimodal-llava.yaml @@ -58,12 +58,12 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - test_case: [multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G] + - test_case: [multimodal_llava_mcore_te_tp1_pp1] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G] + - test_case: [multimodal_llava_mcore_te_tp4_sp_cp2] products: - environment: [dev] scope: [mr, mr-github] diff --git a/uv.lock b/uv.lock index 28110f38852..f7c8916166b 100644 --- a/uv.lock +++ b/uv.lock @@ -1093,61 +1093,61 @@ wheels = [ [[package]] name = "cython" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4d/ab/4e980fbfbc894f95854aabff68a029dd6044a9550c480a1049a65263c72b/cython-3.1.5.tar.gz", hash = "sha256:7e73c7e6da755a8dffb9e0e5c4398e364e37671778624188444f1ff0d9458112", size = 3192050, upload-time = "2025-10-20T06:06:51.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/9f/677707b1734285632a71a3b644b36e77801ce36a7a34af2e64f516b451f0/cython-3.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d27f08ea53099f0101a0c582f1000fcae51cae177bbd4f6f95adfd8adb7a5271", size = 2993670, upload-time = "2025-10-20T06:08:47.301Z" }, - { url = "https://files.pythonhosted.org/packages/40/28/6fa54e679b33eb8640f1fe0a222096c5f8080d25035a923f444d56ea3046/cython-3.1.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:68cf7d059fd673adf3486e34950612069ec0c235e3ae8455424dfb6fdf85cffd", size = 2918339, upload-time = "2025-10-20T06:08:49.029Z" }, - { url = "https://files.pythonhosted.org/packages/78/7e/f3a5979b16efa916a3494986bb234b2ae66ba81ab2e4e358a0b991eaa288/cython-3.1.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8e9e35cad5ae781abef944ce8a8395e098d6e042e5269cc4bcbc1fc177b1e3e3", size = 3511124, upload-time = "2025-10-20T06:08:51.353Z" }, - { url = "https://files.pythonhosted.org/packages/0c/15/a44cc4b6e2482e5453b2eaac00a52b79d2dd71a5fe8c2000dfc7f06c4d32/cython-3.1.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51798e2a76559dff79faee263c971006ce5ae2ee6ecd2fbf108fce3cc0acbac7", size = 3265544, upload-time = "2025-10-20T06:08:53.564Z" }, - { url = "https://files.pythonhosted.org/packages/13/d0/8fe7ad4115f5b4f9b2643a2efd22bfb301e81b6be618fdbc7d560a5edb7c/cython-3.1.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d4d6054f65626d4bb1846da686370394ee83e66a8a752fad7ca362ed8de1cf8c", size = 3427201, upload-time = "2025-10-20T06:08:55.455Z" }, - { url = "https://files.pythonhosted.org/packages/1a/24/b00761f82f323a4c0a2fc0877c5a4ceeb0f9dbc1626b3aed124593edc7c9/cython-3.1.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e9744f8c701365bc8081946c68de2f106c5aa70b08c3b989f482d469b9d6fd77", size = 3280702, upload-time = "2025-10-20T06:08:57.669Z" }, - { url = "https://files.pythonhosted.org/packages/e5/d1/c4b151f8ac86a7444a9a73693f51e36956fb106b55358f809870e49f66e0/cython-3.1.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:8396663f6c38fa392de2fb5ea7efd7749334d5bb6b95cd58f9d1bd566924a593", size = 3525363, upload-time = "2025-10-20T06:08:59.873Z" }, - { url = "https://files.pythonhosted.org/packages/a9/2f/e8158f27b34b121975f87db2a7ea7d0e8091a30be5602a5a36f28b7c1944/cython-3.1.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e069c5af8f646faaacca1a693f74fb27254f7d8ddec2045301d39a8df552c777", size = 3441442, upload-time = "2025-10-20T06:09:01.649Z" }, - { url = "https://files.pythonhosted.org/packages/27/65/9c74b2bd719b563732a0fc5b0162db2d4eac5289bc3452e15b2534dda5d4/cython-3.1.5-cp310-cp310-win32.whl", hash = "sha256:ed0dfaad3a5ca8bf6f3546d40a55f3b879d1f835ca19382d8ca582318de09d49", size = 2484767, upload-time = "2025-10-20T06:09:03.447Z" }, - { url = "https://files.pythonhosted.org/packages/f9/f3/147d524a623f9a1c3269ece074c5a6b9ded38994fddbe57cb4f77d8d3be3/cython-3.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:7af877689440cda31e455003d6f615e0ffca658c7f7dcbf17573bfb469848cdf", size = 2709618, upload-time = "2025-10-20T06:09:05.471Z" }, - { url = "https://files.pythonhosted.org/packages/4b/f3/fcd5a3c43db19884dfafe7794b463728c70147aa1876223f431916d44984/cython-3.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1aad56376c6ff10deee50f3a9ff5a1fddbe24c6debad7041b86cc618f127836a", size = 3026477, upload-time = "2025-10-20T06:09:07.712Z" }, - { url = "https://files.pythonhosted.org/packages/3d/19/81fa80bdeca5cee456ac52728c993e62eaf58407d19232db55536cf66c4b/cython-3.1.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ef1df5201bf6eef6224e04584b0032874bd1e10e9f4e5701bfa502fca2f301bb", size = 2956078, upload-time = "2025-10-20T06:09:09.781Z" }, - { url = "https://files.pythonhosted.org/packages/a1/40/002d72dc5914a8043dc9fed9b05b10fb4d365c5182733af3e0768a388cb7/cython-3.1.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dce715a5b4279b82354855609d96e49a1bdc8a23499fb03d707df3865df3c565", size = 3412101, upload-time = "2025-10-20T06:09:11.762Z" }, - { url = "https://files.pythonhosted.org/packages/ab/3f/8913ffad4f025446a3fa1662675277e340aef3ddb583704b5569698c28dc/cython-3.1.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b185ac9ff4170a584decffb6616457208f5a4148c78613f3118f70603b3759c", size = 3191171, upload-time = "2025-10-20T06:09:16.924Z" }, - { url = "https://files.pythonhosted.org/packages/63/fb/66e72c2e4b88f7f221d6226ab7ada1c572924bd73c3c66f899313c4e33d3/cython-3.1.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e3f86927811923958af0a4c68c6b978438cec0070b56dd68f968b2a070e4dc4d", size = 3313920, upload-time = "2025-10-20T06:09:18.856Z" }, - { url = "https://files.pythonhosted.org/packages/bb/40/0858cb88f7cd8b7d1627cefff67fcc0d50c3bd9303a3687f4dbc5d2790cf/cython-3.1.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:61b19977f4af6632413cf89e9126fc9935b33d3d42699ee4370e74ac0ad38fc8", size = 3205839, upload-time = "2025-10-20T06:09:21.473Z" }, - { url = "https://files.pythonhosted.org/packages/d7/e4/8edaf492b365720a553a83d5a1289f4f3198ae2ffd7333142f1b175b3012/cython-3.1.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:44ae7765f5d1082efd7a6cc9beedc7499a70e3cac528faad6cfca9d68b879253", size = 3428501, upload-time = "2025-10-20T06:09:23.756Z" }, - { url = "https://files.pythonhosted.org/packages/22/8c/db66aeba98f0374cc18f6311679d1fa984852e0c737815b35df37ffd5be6/cython-3.1.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d5e7a836c18638d7c383e438306c36acd7ea3f5feb78d32796efab506626567a", size = 3330574, upload-time = "2025-10-20T06:09:25.827Z" }, - { url = "https://files.pythonhosted.org/packages/83/4b/5e01ab06d625496e0d0c5cd34d8b1793833fafb4ebde439595fb289bf77e/cython-3.1.5-cp311-cp311-win32.whl", hash = "sha256:f7991ef8da0132962c4a79636e01792cc96e0ede333d8b5d772be8bf218f6549", size = 2482452, upload-time = "2025-10-20T06:09:27.455Z" }, - { url = "https://files.pythonhosted.org/packages/2c/67/71d858413f1753399b303bec74b4322001e1af8215edf7cc34e6e6d7e3ff/cython-3.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:d31861678d88a7c6e69e022e37ed2a7d378fdd6b7843d63f3a2e97fc3fc88d63", size = 2713943, upload-time = "2025-10-20T06:09:29.571Z" }, - { url = "https://files.pythonhosted.org/packages/54/3c/beb8bd4b94ae08cc9b90aac152e917e2fcab1d3189fb5143bc5f1622dc59/cython-3.1.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:38bf7bbe29e8508645d2c3d6313f7fb6872c22f54980f68819422d0812c95f69", size = 3063044, upload-time = "2025-10-20T06:09:32.361Z" }, - { url = "https://files.pythonhosted.org/packages/3b/88/1e0df92588704503a863230fed61d95fc6e38c0db2537eaf6e5c140e5055/cython-3.1.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:61c42f881320a2b34a88806ddee6b424b3caa6fa193b008123704a2896b5bc37", size = 2970800, upload-time = "2025-10-20T06:09:34.58Z" }, - { url = "https://files.pythonhosted.org/packages/5c/27/51854d64c058265ea216cf04239d5818ffb72e200875273acae77e96821f/cython-3.1.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dde94e825ed23d0189a43c7714143de6ab35c7d6ca6dca4b2b2fcd2db418400d", size = 3387292, upload-time = "2025-10-20T06:09:36.218Z" }, - { url = "https://files.pythonhosted.org/packages/86/03/37274f84d775e19234c8ba3b7b9ffee55d038d39312446e1123f9f9e8167/cython-3.1.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51e8f773a90a61179ebf5eb2f0f711607a39d7c87ba254d9a7693b8dc62b5c8c", size = 3168510, upload-time = "2025-10-20T06:09:38.312Z" }, - { url = "https://files.pythonhosted.org/packages/d2/d2/52bf6d5b18d6faa9c3655c2c2854dd4cc3630e0af7ff89e415fbba713c37/cython-3.1.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:326633ca2aa0233098e75198f955b5836c2dc12b19e1b1aa10877e96b9aee37d", size = 3319825, upload-time = "2025-10-20T06:09:40.229Z" }, - { url = "https://files.pythonhosted.org/packages/93/05/4935c5aff6bc95155168b59990ce364877ae3d97b7cc58b20e93be9c0803/cython-3.1.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7002d9ae1c8863f089195b539c72c927e0f41cc4787e8e369db6e8f22e12b7b8", size = 3181070, upload-time = "2025-10-20T06:09:42.481Z" }, - { url = "https://files.pythonhosted.org/packages/10/c8/65650a07facc6e7aeec9e94358715a1a0f18960f8c5a30f60291c5e911b5/cython-3.1.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6a0905a967bc4eaf6186837efbd023061bc25b5f80599203bad5db858527d9da", size = 3400149, upload-time = "2025-10-20T06:09:47.86Z" }, - { url = "https://files.pythonhosted.org/packages/f7/78/ac690c772d2942ae16498d7cc182f056d3cf42788153685334b78904b087/cython-3.1.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:531e431e23bbd3e658b41a1240d641131a11d5b5689062e9b811a6b4eab4ecf7", size = 3330840, upload-time = "2025-10-20T06:09:49.574Z" }, - { url = "https://files.pythonhosted.org/packages/ac/53/ea4aaf1a80c537b53c8cad6f99980ea7cf80e1be2a3c7db790c58af34b42/cython-3.1.5-cp312-cp312-win32.whl", hash = "sha256:920e2579858b3b47aa9026667d7adbd22a6cccf1e8da1bf3ea01a1c451a4ef0f", size = 2487776, upload-time = "2025-10-20T06:09:51.437Z" }, - { url = "https://files.pythonhosted.org/packages/2a/89/195d56054f8936b38c046fab904aaec4d7e221db2a45b4016d11e909cf2e/cython-3.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:b230b4ef06752c186ebd071989aac6ea60c79078a5430d3d33712cec0dc19ffd", size = 2705869, upload-time = "2025-10-20T06:09:53.08Z" }, - { url = "https://files.pythonhosted.org/packages/89/7e/9b4e099076e6a56939ef7def0ebf7f31f204fc2383be57f31fd0d8c91659/cython-3.1.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3c9b6d424f8b4f621b2d08ee5c344970311df0dac5c259667786b21b77657460", size = 3051579, upload-time = "2025-10-20T06:09:54.733Z" }, - { url = "https://files.pythonhosted.org/packages/a4/4d/4f5d2ab95ed507f8c510bf8044d9d07b44ad1e0a684b3b8796c9003e39ef/cython-3.1.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:08e998a4d5049ea75932674701fa283397477330d1583bc9f63b693a380a38c6", size = 2958963, upload-time = "2025-10-20T06:09:56.45Z" }, - { url = "https://files.pythonhosted.org/packages/f7/0c/c5eb8d2a2f1bbf7b23656609fb4cfc34a0812fca969614c5fbf011bcf122/cython-3.1.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a89cba730a2fd93eb057f0d1f0e0f1d5377f263333ae34038e31df561f77a923", size = 3359452, upload-time = "2025-10-20T06:09:58.617Z" }, - { url = "https://files.pythonhosted.org/packages/b4/b1/8b02f05928e5e5beadafbf6d8c34117f3fb9d5532fd266a9ad80749b50ef/cython-3.1.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f7994fd7486020cb3a4022121534489d984a42aac773a2eeada1b2e1f057cf9", size = 3154975, upload-time = "2025-10-20T06:10:00.827Z" }, - { url = "https://files.pythonhosted.org/packages/8e/53/a8018e50b64207847ac1de0aa007ca1a3a775ca388f265e85f5d70bcb754/cython-3.1.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b92ed80e3be2b35f594587389d9f7399860c8f17d9e4f23b7046f022f254b10b", size = 3307804, upload-time = "2025-10-20T06:10:02.559Z" }, - { url = "https://files.pythonhosted.org/packages/32/c5/c761968122169696648a5a8a4c228a34e6de2a62b98d27c18c57235f8303/cython-3.1.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ada0c4eb7a98948a2a45444062a07995c8d3fa6fc5bc5a14a0e57ef793d0d8b7", size = 3170533, upload-time = "2025-10-20T06:10:04.952Z" }, - { url = "https://files.pythonhosted.org/packages/47/af/c6e585912d19360bf02408368322a6c458dc1c0e867f75baa8b4f0f6bcdc/cython-3.1.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5a3b6e75c8ffa5a06824be6e3858990ed1e88d432dcfc4ec865d419c44eaa29d", size = 3372608, upload-time = "2025-10-20T06:10:06.622Z" }, - { url = "https://files.pythonhosted.org/packages/95/0f/34aa595446a485333b09398de8a769a9f80e58c2b07918b6268cba5ebe71/cython-3.1.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:834378e535e524168f9e54ae6bb4bbd3e414bbc7e4532945b715bd867a2be0ce", size = 3319976, upload-time = "2025-10-20T06:10:08.303Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e3/620258785bd382c19283f37c65bcaa5d6b2437247b4bb4b40128ca96638a/cython-3.1.5-cp313-cp313-win32.whl", hash = "sha256:18e6049138f4ad45fa3947437fe74126c5d932a36cdb93cb3a70715712021c2d", size = 2481579, upload-time = "2025-10-20T06:10:10.159Z" }, - { url = "https://files.pythonhosted.org/packages/71/98/bd2cd37ee7f2420e73d21082e137ba949186e293044f24c0954a9595d018/cython-3.1.5-cp313-cp313-win_amd64.whl", hash = "sha256:fcebc7112872828f8815eb73e0c1572975f982af8febc56cfa369aa996e24142", size = 2703469, upload-time = "2025-10-20T06:10:11.799Z" }, - { url = "https://files.pythonhosted.org/packages/7c/52/a44f5b3e7988ef3a55ea297cd5b56204ff5d0caaf7df048bcb78efe595ab/cython-3.1.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:888bf3f12aadfb2dc2c41e83932f40fc2ac519933c809aae16e901c4413d6966", size = 3046849, upload-time = "2025-10-20T06:10:14.087Z" }, - { url = "https://files.pythonhosted.org/packages/d2/a8/fb84d9b6cc933b65f4e3cedc4e69a1baa7987f6dfb5165f89298521c2073/cython-3.1.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:85ffc5aa27d2e175bab4c649299aa4ae2b4c559040a5bf50b0ad141e76e17032", size = 2967186, upload-time = "2025-10-20T06:10:16.286Z" }, - { url = "https://files.pythonhosted.org/packages/74/ee/a5aba9d36dacbda936335186a6ee3195bf780fd8a8a98e1a6e17351ca9a4/cython-3.1.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e4d7f37e4217e1e93c944a175865deffbf16c9901eaba48fc35473afbfb658d4", size = 3359989, upload-time = "2025-10-20T06:10:18.384Z" }, - { url = "https://files.pythonhosted.org/packages/08/64/1a058f052c71390b4440c8e1dc93bc09cdf04ec4d49e9fde0524b38e0678/cython-3.1.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5503aa6eec0faeba03428058a4911994cdf1f668baf84c87fad8c862415c5f3d", size = 3193017, upload-time = "2025-10-20T06:10:20.3Z" }, - { url = "https://files.pythonhosted.org/packages/31/fd/de9461718977b59560630bd0ad07dcb77209df7f4e7774ef0ec8f787433d/cython-3.1.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99943633ea61dfb53093e16827cc66c376b1513fb37f5ce8e052e49f4852ae85", size = 3312092, upload-time = "2025-10-20T06:10:21.998Z" }, - { url = "https://files.pythonhosted.org/packages/c0/e3/5b57fa9a72b24b80ba23225d53886d07b714920e6bb19fc83a09977799b6/cython-3.1.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a82183bbbc8591de7ca902f2a22e2ffc82e31fd1a66f1180931f522050db5eb2", size = 3209437, upload-time = "2025-10-20T06:10:23.784Z" }, - { url = "https://files.pythonhosted.org/packages/fd/14/ebe6d9172d0ed6bca68bb21c384694922d7a8eef6dcf8d4c843be7128f0a/cython-3.1.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9daa08ff24ef526ae2aa5560430a3121f1584b406945a17d7e0bbf9c18bf161a", size = 3375201, upload-time = "2025-10-20T06:10:25.703Z" }, - { url = "https://files.pythonhosted.org/packages/25/30/9e28256ceb70511636f5e5340dfa36a4310a41bc0e190734b62b75a7993b/cython-3.1.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d6d13320e01e719cf9668daa88ccd9f84bae74f26ac1a3779b4ec32bc40feaeb", size = 3323425, upload-time = "2025-10-20T06:10:27.484Z" }, - { url = "https://files.pythonhosted.org/packages/13/ff/0f4dc479c6d4fec80a48613141c8ce8de98d75dc549d01cc87364057c4de/cython-3.1.5-cp314-cp314-win32.whl", hash = "sha256:51a7ef5688d3d37d762ee6df83a567b0a67bde7528a467e9dc82df9d9fc23c46", size = 2503714, upload-time = "2025-10-20T06:10:29.144Z" }, - { url = "https://files.pythonhosted.org/packages/19/75/0cd7a00833496aa4c5eb76e6fa118fc51faf92947e090af799fa6ff30c16/cython-3.1.5-cp314-cp314-win_amd64.whl", hash = "sha256:8ac9324feb0694a941794222444600536f9c44b120b5745e1aa7042504281aa1", size = 2735084, upload-time = "2025-10-20T06:10:30.921Z" }, - { url = "https://files.pythonhosted.org/packages/1b/33/8af1a1d424176a5f8710b687b84dd2f403e41b87b0e0acf569d39723f257/cython-3.1.5-py3-none-any.whl", hash = "sha256:1bef4a168f4f650d17d67b43792ed045829b570f1e4108c6c37a56fe268aa728", size = 1227619, upload-time = "2025-10-20T06:06:48.387Z" }, +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/58/6a8321cc0791876dc2509d7a22fc75535a1a7aa770b3496772f58b0a53a4/cython-3.1.6.tar.gz", hash = "sha256:ff4ccffcf98f30ab5723fc45a39c0548a3f6ab14f01d73930c5bfaea455ff01c", size = 3192329, upload-time = "2025-10-23T12:38:20.786Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/bb/23e917f1d2a11834730ff07cdb7e7c87ab72c16090b3d61b86477a38cc68/cython-3.1.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c4027b4d1bf7781fdfb2dbe1c1d81ccac9b910831511747e2c9fc8452fb3ea6b", size = 2989648, upload-time = "2025-10-23T12:38:38.272Z" }, + { url = "https://files.pythonhosted.org/packages/cd/72/9ec7797714c65bf45d11fb33361fd5cb522556d8a2a2e808f17db6a3aaf6/cython-3.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:141dea9df09f9c711af3b95510bd417c58b2abd33676eef1cb61f25581f7090a", size = 2914302, upload-time = "2025-10-23T12:38:39.888Z" }, + { url = "https://files.pythonhosted.org/packages/30/cd/63d551eb65273e144e9ee84bf697190586201dd02d2fd719b68e7da724e2/cython-3.1.6-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:486376a988268408b7e8ea7b4cccffb914aa497c498b41589fb4a862ba47e050", size = 3507159, upload-time = "2025-10-23T12:38:41.988Z" }, + { url = "https://files.pythonhosted.org/packages/44/bd/c451e15cd89ee98fa5207689505f9a211f79cdb4d18f2f96a7c9c6e7f3f6/cython-3.1.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdc6e63a04ead11812752a5198b85b7fc079688c76712348d072403f18fdeb49", size = 3261427, upload-time = "2025-10-23T12:38:43.838Z" }, + { url = "https://files.pythonhosted.org/packages/5d/dc/a4102de1a15a2ef56fc46e4486da112a8701b63ff98077d0ebaa39792e44/cython-3.1.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47e79f0bfbf403a5d6008bc9e7214e81e647794ca95cae6716399ba21abcc706", size = 3423208, upload-time = "2025-10-23T12:38:45.953Z" }, + { url = "https://files.pythonhosted.org/packages/e0/d6/dff399500588611e2bf189f191cc03bc985c80aaa263242c3abcd93122f7/cython-3.1.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2379f729f1d5a445adb4621f279f7c23aeb6245f036f96cce14b5b2fd1f5ff0a", size = 3276605, upload-time = "2025-10-23T12:38:47.825Z" }, + { url = "https://files.pythonhosted.org/packages/09/b1/af3d75e6b4363abd8efbe18cf90709b7dee38108846f3c7377ee50b8adcb/cython-3.1.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1089e18d938b6e742f077e398d52e1701080213c4f203755afde6f1b33d9e051", size = 3521386, upload-time = "2025-10-23T12:38:49.929Z" }, + { url = "https://files.pythonhosted.org/packages/0c/58/6fc30fba52c9cf35bb5d02effc7b16cdc9aa3d3aa56b07e47429c59ee657/cython-3.1.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73576246abbc62397db85cbdde74d2e5d73dabfdb7e593fdbb3671275ffb50ce", size = 3437394, upload-time = "2025-10-23T12:38:52.145Z" }, + { url = "https://files.pythonhosted.org/packages/69/c9/10bde13a679d1dc90b86bba754d94b126637686f4bba7637e14a923b8962/cython-3.1.6-cp310-cp310-win32.whl", hash = "sha256:f48eae3275b3352ba7eb550fc5321b0fb1ba8d916fa9985fb2f02ce42ae69ddd", size = 2480812, upload-time = "2025-10-23T12:38:54.126Z" }, + { url = "https://files.pythonhosted.org/packages/c9/60/c5dd9af41c9ec6ee406b423458065d2d3427422e0eb1bb91794c8ab3b787/cython-3.1.6-cp310-cp310-win_amd64.whl", hash = "sha256:4066908ee24a18572880966de1d0865d178f5ab9828a9249faa97e1ffdfbed9f", size = 2705655, upload-time = "2025-10-23T12:38:56.064Z" }, + { url = "https://files.pythonhosted.org/packages/a7/44/631939fd36577fccf0c47c9cd14fdc3d8125cde166ed2b2f1abdf9a505cc/cython-3.1.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a1aedd8990f470d108b76ca768d9f1766d6610cf2546b73075dbe1e523daebe", size = 3022464, upload-time = "2025-10-23T12:38:57.677Z" }, + { url = "https://files.pythonhosted.org/packages/ec/68/700aef24fcf73f77940fec7efa27c18da68f6a5446dfce5e3a253ab707e3/cython-3.1.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f75c33e83e224737b1a68b2868bc08bddaabc6f04aef74864ff6069fe2e68341", size = 2952046, upload-time = "2025-10-23T12:38:59.684Z" }, + { url = "https://files.pythonhosted.org/packages/fd/9e/5dba03cc21190bd6756bb4717038a16cc87930ef32399c6d0e6bbbe538b3/cython-3.1.6-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:91b8fb3e961b3344bf257b851f2ce679727f44857fec94d643bcc458601dab54", size = 3408110, upload-time = "2025-10-23T12:39:01.442Z" }, + { url = "https://files.pythonhosted.org/packages/cb/45/81897d8802666d10086639b0f70702d2f9d03bb5358b012bb109b08b4dd1/cython-3.1.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cfeb04d43464f5ff8398b499ba46c6eef22093da0e74b25f972576e768880e7", size = 3187425, upload-time = "2025-10-23T12:39:03.661Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ed/1a1e93703edf37ee822c03013246d2b4c05a8ea689105051205150dadf07/cython-3.1.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f32366c198ac663a540ff4fa6ed55801d113183616c51100f4cc533568d2c4cf", size = 3309991, upload-time = "2025-10-23T12:39:05.801Z" }, + { url = "https://files.pythonhosted.org/packages/6e/11/147aefe4bdc5aa4f273283ea62949001d877808f4ad8a3b4774baf05f0ac/cython-3.1.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9856e8cd7f7a95a3f10a8f15fef4d17e5a4a57fb5185fe3482cec4adb0536635", size = 3202048, upload-time = "2025-10-23T12:39:07.52Z" }, + { url = "https://files.pythonhosted.org/packages/ef/82/6a43a68a1c9e22bef7476eb5a4fd8987812972b6746991b7b16b599aa872/cython-3.1.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6966f4d4ee13eceade2d952dc63bdf313f413c0c3f165aef0d6f62e6f27dab02", size = 3424512, upload-time = "2025-10-23T12:39:09.241Z" }, + { url = "https://files.pythonhosted.org/packages/2e/d1/40dfa6c02bde72669525a2666aff5b0c75b0ec6f9d965b4beb1582ad4b6c/cython-3.1.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dffb14bc986626be50003f4edc614a2c0a56cbaaf87259f6c763a6d21da14921", size = 3326637, upload-time = "2025-10-23T12:39:11.376Z" }, + { url = "https://files.pythonhosted.org/packages/58/7c/c8dab163f2c9f8e3c4972aee31a45307f2b96733f799aa036ba05292efa8/cython-3.1.6-cp311-cp311-win32.whl", hash = "sha256:cde4748d37483b6c91df9f4327768e2828b1e374cb61bcee06d618958de59b7b", size = 2478500, upload-time = "2025-10-23T12:39:12.958Z" }, + { url = "https://files.pythonhosted.org/packages/e0/34/895cda4ac7e93460cedb28f609a7c056f09c1db5694ed38058f680c56386/cython-3.1.6-cp311-cp311-win_amd64.whl", hash = "sha256:29d6141b0c9697dfcaf5940eceb06353bec76f51f0579658964c0d29418000df", size = 2709986, upload-time = "2025-10-23T12:39:15.042Z" }, + { url = "https://files.pythonhosted.org/packages/70/cd/6e7bb9ef074d35c1b62af91c9f92126fae992d5a8fb6b47fdd1ade67bf56/cython-3.1.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0d2c32e8f6c65854e8203b381ff7ab540820763756b7c326e2c8dc18c9bbb44e", size = 3059014, upload-time = "2025-10-23T12:39:16.823Z" }, + { url = "https://files.pythonhosted.org/packages/13/04/a1b4fe2a4c72eb8fdcdf6b680908328f920f813caeb72f1b5d2cea40e45c/cython-3.1.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be24fcde7300a81712af279467ebc79baafc8483eb4dfa4daebf8ee90a826d39", size = 2966746, upload-time = "2025-10-23T12:39:18.56Z" }, + { url = "https://files.pythonhosted.org/packages/57/44/347f48b0ccfaa8233860a64b88a9df851138058ea923583e68625528710f/cython-3.1.6-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5012025af433bd7188fe1f7705df1c4a67e7add80c71658f6c6bc35ea876cc68", size = 3383297, upload-time = "2025-10-23T12:39:20.231Z" }, + { url = "https://files.pythonhosted.org/packages/98/80/e065d0725614ce9ff43624ae1d9f81647c5fd2d88ecffc2614dde703482d/cython-3.1.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b3520e2d4484f927c3ec00d32ffda75ec72cfd6a2ee07adac721cce339fa26f", size = 3164391, upload-time = "2025-10-23T12:39:22.036Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/3f86f321ff6bfd31310a5478f5ac56eaac3ea0743f6b76543ff5fbcb2b4e/cython-3.1.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c8a01d241d775319bcd7adb4144b070e1c4b01cdf841a62032492f07fad9efdc", size = 3316085, upload-time = "2025-10-23T12:39:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/94/b5/677a2f4faa1c036cedbb715edc933b09de3e235891f1fcdaa82f8c3fdc85/cython-3.1.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fd88799fa7bb177182423e0745c9197c50938c6839ebfbe6fd01539582ed488e", size = 3176911, upload-time = "2025-10-23T12:39:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e4/21117a7768ab19fcd766f2dd81f0a61d2d24e7a3649eff306349c2ab99a8/cython-3.1.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f809bae2e00b79c01ff5daf9a260df7c1bc9fda087b9d625592fa28c1a2248a9", size = 3396231, upload-time = "2025-10-23T12:39:28.168Z" }, + { url = "https://files.pythonhosted.org/packages/b5/4e/1152e9bfa0357d2237449fad94673c273f72c011a54c7227bb1291dd4423/cython-3.1.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f657e7a4b2242d159de603f280928d8e458dfba48144714774ad76c08f5a530", size = 3327101, upload-time = "2025-10-23T12:39:30.361Z" }, + { url = "https://files.pythonhosted.org/packages/39/fe/b7f9dc5ba8ce221aa7d40587d1d7175871b2ea61917c7fa4d5e85a7c042f/cython-3.1.6-cp312-cp312-win32.whl", hash = "sha256:6502f3e58db0ab3e2c983bec2c8c9e45d602e2c7ff921a5a8515b0008d918102", size = 2483823, upload-time = "2025-10-23T12:39:31.986Z" }, + { url = "https://files.pythonhosted.org/packages/40/d5/60261f023b0bdb28f0b9e8f00690b8bdbef692995184bc57f33811f8a936/cython-3.1.6-cp312-cp312-win_amd64.whl", hash = "sha256:71d099d8d6094c5de63a32e67b29964565aed889a218e8d16a94083f4239b904", size = 2701846, upload-time = "2025-10-23T12:39:33.769Z" }, + { url = "https://files.pythonhosted.org/packages/cc/96/22b43125180d9b2814da4271d9450a5cc4623a6c6439b6b1d8faa7675c81/cython-3.1.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:14f0d6b9f803eacf48e9e80ea12a03f54e5f5ac48914341b0a6b81554b3b3154", size = 3047517, upload-time = "2025-10-23T12:39:35.641Z" }, + { url = "https://files.pythonhosted.org/packages/db/09/8abf6ccb13d1e2589e60320423f861952cf4c4ec092cd8536e1beb018e9c/cython-3.1.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ec79615d0e29fa29fd4283bc7a2ed9c3d00532086a0031532d64b724db8c3e8e", size = 2954975, upload-time = "2025-10-23T12:39:37.568Z" }, + { url = "https://files.pythonhosted.org/packages/a6/4d/c3455fb738f52d536e7a113749c0a2242943251ce2d0dfac0e42ebba2fc0/cython-3.1.6-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:037d457738cf4fc12260946c6524b745f488cf413428099f2a064af7612d181f", size = 3355462, upload-time = "2025-10-23T12:39:39.462Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b4/923f4d7ca7d987573aa2df0ca48fa9a103a48ddf1aec9cd8fcef9618b787/cython-3.1.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b036cb4ed7abcbc89cc04311832b22ad386c532fdd1fe690e1364aa992a54c7", size = 3150852, upload-time = "2025-10-23T12:39:41.416Z" }, + { url = "https://files.pythonhosted.org/packages/f0/2c/985dd11b6cc3ac2e460c5e0b59030aebca66a85f9423db90e5186e8e9087/cython-3.1.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e0fb2694327834c5bda7c5a07605f76437354d0ff76bb8739e77b479d176cf52", size = 3304059, upload-time = "2025-10-23T12:39:43.154Z" }, + { url = "https://files.pythonhosted.org/packages/69/af/b3af74d1d10a0f6d4d9fcdd836959ae54dabb36f84f316b09ccb84dbd8e0/cython-3.1.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92eb7a39e60426165a5b2a219af181e5695c4dedd598e317a7a4d9086bd66b91", size = 3166353, upload-time = "2025-10-23T12:39:45.146Z" }, + { url = "https://files.pythonhosted.org/packages/f1/2d/48130ecef876f141aaded34a961f32be45d2f36aa285de08d2e81aa5fec3/cython-3.1.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c475018b28f4e7111148bd02b600595090e0aac6cc49615c4586bb4e7f164a22", size = 3368659, upload-time = "2025-10-23T12:39:46.908Z" }, + { url = "https://files.pythonhosted.org/packages/2f/b2/0cd9ff5be3f0d224bc139eea8a8e83066d61ad424cf7fd0f43c3c4b791d4/cython-3.1.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1b4bb661103cb95c6ca70daf5d39992b2d89fd260b02a54d92e365095ed37eb", size = 3316247, upload-time = "2025-10-23T12:39:48.699Z" }, + { url = "https://files.pythonhosted.org/packages/f9/0f/55f95e166c591fb8fd8caeb1f2c86cf86ef6f7f929a56094615ab757dc11/cython-3.1.6-cp313-cp313-win32.whl", hash = "sha256:69b1bea23b51628b8c9f14c3e0bb4c7dd5be63781bfbaa581b1c683b473c728a", size = 2477610, upload-time = "2025-10-23T12:39:51.014Z" }, + { url = "https://files.pythonhosted.org/packages/2e/07/23aa4577513a5e918c0deaf8a2ab8a9a5e6703e3fe554e3bc2c3bda1ef58/cython-3.1.6-cp313-cp313-win_amd64.whl", hash = "sha256:c844004712a9fe2a6f2ed4d6fe02aabb2e0e34f88c150724aad1afec7caff37a", size = 2699460, upload-time = "2025-10-23T12:39:54.146Z" }, + { url = "https://files.pythonhosted.org/packages/5b/16/e399f6fd33912116aba8bcdfeadd6093ff14996d7b5b72212fe4301e9f96/cython-3.1.6-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:8893619efa77fc83934c1255c619d522711a5cf5933cef0d5c2b9755e8e5fabc", size = 3042822, upload-time = "2025-10-23T12:39:56.081Z" }, + { url = "https://files.pythonhosted.org/packages/94/aa/5500ff58f8972431c0e74783546b8cdc39511493aa44b74a7fde1ec4e654/cython-3.1.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bb49c74220af0b098f406701f0b87876b1c7614716d39786306986b9feea774b", size = 2963154, upload-time = "2025-10-23T12:39:57.933Z" }, + { url = "https://files.pythonhosted.org/packages/cb/04/caa7893a4259e4bdb333a40a2105d58b53294445d9d2cf948eac9f0346b5/cython-3.1.6-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:defbf9571fca78e8a6e21b93d35c0a491d6af77a8e6180a0146da1b3c8eb8ce6", size = 3356015, upload-time = "2025-10-23T12:39:59.856Z" }, + { url = "https://files.pythonhosted.org/packages/df/da/6736caaf38a4d9f09db4b8dd76d0c8f7937820c2eef4d899f80259566298/cython-3.1.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cd7ea8c6ce0adf52d142bf37c4d54b8d0356818144a4584a24f2a0b9cdae6b8", size = 3188923, upload-time = "2025-10-23T12:40:01.926Z" }, + { url = "https://files.pythonhosted.org/packages/e8/ba/5dbee7f80c11c57a68b1e26d285e106ab259e7cf50536369b28f952b5809/cython-3.1.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c47fcc47553214e0a139fd33199d825c5d13970cd6c1039d2594af855ffb338", size = 3308343, upload-time = "2025-10-23T12:40:03.673Z" }, + { url = "https://files.pythonhosted.org/packages/81/c0/2759f4e2ec2f10ac941b2963de217f0ee6c0f6b2767ddcbaeba799c77dec/cython-3.1.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:92489385bca6d1935913540e35701a979618fdfeed4dbec6cad1be924fb487bf", size = 3205352, upload-time = "2025-10-23T12:40:05.431Z" }, + { url = "https://files.pythonhosted.org/packages/c7/fc/077b0084300d42bc69f4c9468c1946882884db859daa48b2b98b8f194fad/cython-3.1.6-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:926a3efd9b7012cdb3df0d1886e6f0e32e0b72a5d311ac2d3f48c0716fd91c6d", size = 3371256, upload-time = "2025-10-23T12:40:07.174Z" }, + { url = "https://files.pythonhosted.org/packages/60/71/4461521017e51b66a2d8dd443a596d636c87149e2d6ae95d664cbfdb1303/cython-3.1.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e35118eedfa0138154a43fb6b14e83703dae93193ba9940c747c170ed845cca7", size = 3319689, upload-time = "2025-10-23T12:40:09.181Z" }, + { url = "https://files.pythonhosted.org/packages/5b/53/f8dfff20e06dd3a6a39ed7b5ba784a9797eb206ec7df56f35c0e0ca31a49/cython-3.1.6-cp314-cp314-win32.whl", hash = "sha256:27f2b26442737d6e080900284883e078aae0276dfd7715a49b338f1a9481f7b9", size = 2499779, upload-time = "2025-10-23T12:40:11.306Z" }, + { url = "https://files.pythonhosted.org/packages/0a/cd/fef529bcc8eb6b55caf8bda524ee6194593137579fdc4ee616ff2a40dd2a/cython-3.1.6-cp314-cp314-win_amd64.whl", hash = "sha256:7f75ead2a7cad5ee719427b915711c70e40a114f045b2a9b5bd983484a0b83a7", size = 2731204, upload-time = "2025-10-23T12:40:13.878Z" }, + { url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" }, ] [[package]] @@ -1794,7 +1794,7 @@ http2 = [ [[package]] name = "huggingface-hub" -version = "0.35.3" +version = "0.36.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -1806,9 +1806,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } +sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/a0/651f93d154cb72323358bf2bbae3e642bdb5d2f1bfc874d096f7cb159fa0/huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba", size = 564262, upload-time = "2025-09-29T14:29:55.813Z" }, + { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" }, ] [[package]] From 855aa49176f6ff44f24b8f389ad66fcddc54217c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 17:44:55 +0200 Subject: [PATCH 039/248] ci: Fix copyright checker (#1889) (#1890) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 94d486f2fb5..047926a188e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -61,10 +61,12 @@ jobs: - name: Check membership id: check-membership + env: + IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - if [ "${{ env.SCHEDULED_JOB }}" == "true" ]; then + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi From 6068fa9edf1bbb4cbdf6b33d3a4753642af18fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 15:50:15 +0000 Subject: [PATCH 040/248] ci: Run on dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 047926a188e..94d486f2fb5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -61,12 +61,10 @@ jobs: - name: Check membership id: check-membership - env: - IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ]; then + if [ "${{ env.SCHEDULED_JOB }}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi From 885679487613c17360c30dcec2b6d802dc5d9cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 18:18:55 +0200 Subject: [PATCH 041/248] ci: Bump copyright header (#1894) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/copyright-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index 0584b2692c2..ff135c6c958 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -31,7 +31,7 @@ jobs: if: | !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.3 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.4 with: from-year: 2019 From beceec698486a033262116199f90318ec3df865e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 16:23:49 +0000 Subject: [PATCH 042/248] ci: Allow runs on dev branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 94d486f2fb5..f21fdfed446 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -61,10 +61,13 @@ jobs: - name: Check membership id: check-membership + env: + IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} + IS_DEV_BRANCH: ${{ github.ref == 'refs/heads/dev' }} run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - if [ "${{ env.SCHEDULED_JOB }}" == "true" ]; then + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi From 13b6a3675b86a395174d7cca4cc8b636a7cb2704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 16:29:20 +0000 Subject: [PATCH 043/248] ci: Linting on push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index f21fdfed446..e70677e19fb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -187,7 +187,7 @@ jobs: export PATH=".venv/bin:$PATH" export GITLAB_ENDPOINT=github.com export CI_PROJECT_NAMESPACE=NVIDIA - export BASE_REF="${{ startsWith(github.ref, 'refs/heads/pull-request/') && fromJSON(steps.get-pr-info.outputs.pr-info).base.ref || 'HEAD~1' }}" + export BASE_REF="${{ startsWith(github.ref, 'refs/heads/pull-request/') && fromJSON(steps.get-pr-info.outputs.pr-info).base.ref || github.sha }}" export CHECK_ONLY=true export SKIP_DOCS=false bash tools/autoformat.sh From 14a0a23f4a0e8b2b3c67051c2bd9fbdd4775b62e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 16:31:21 +0000 Subject: [PATCH 044/248] ci: Run linting only on PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e70677e19fb..89d33506082 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -183,11 +183,12 @@ jobs: uses: nv-gha-runners/get-pr-info@main - name: Run linting + if: startsWith(github.ref, 'refs/heads/pull-request/') run: | export PATH=".venv/bin:$PATH" export GITLAB_ENDPOINT=github.com export CI_PROJECT_NAMESPACE=NVIDIA - export BASE_REF="${{ startsWith(github.ref, 'refs/heads/pull-request/') && fromJSON(steps.get-pr-info.outputs.pr-info).base.ref || github.sha }}" + export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" export CHECK_ONLY=true export SKIP_DOCS=false bash tools/autoformat.sh From 8e035496979cd6eb37595975ab725d93c69a8143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 17:09:16 +0000 Subject: [PATCH 045/248] ci(fix): HAS_RUN_TESTS_LABEL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 2 +- .github/workflows/cicd-main.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index d2f43599182..831f840d22b 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -111,7 +111,7 @@ runs: GH_TOKEN: ${{ github.token }} run: | PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} - HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') + HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false" echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT - name: Create run-script (e2e test) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 89d33506082..38739c07b1f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -363,7 +363,7 @@ jobs: GH_TOKEN: ${{ secrets.PAT }} run: | PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} - HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') + HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false" echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT - name: Parse functional tests From da842988caa8fcf68ff6e153f446244f06eb629e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 17:50:57 +0000 Subject: [PATCH 046/248] ci: Fix linting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 18 +------- tools/check_copyright.py | 94 ++++++++++++++++++++++++++++++++++++++ tools/copyright.sh | 50 +++++++++----------- 3 files changed, 118 insertions(+), 44 deletions(-) create mode 100644 tools/check_copyright.py diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index af972c8d0cf..db10271da15 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -286,23 +286,8 @@ test:linting_formatting: exit 0 fi - set +e - - git fetch origin main:main - - echo -e "machine gitlab-master.nvidia.com\n login gitlab-ci-token\n password $CI_JOB_TOKEN" >~/.netrc - - chmod 600 ~/.netrc - - | - if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" bash tools/autoformat.sh - set -e - git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME - git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME - git config --global user.email "mcore-bot@nvidia.com" - git config --global user.name "Mcore Bot" - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" - git add -A . - git commit -m "chore: Format files" || true - git push -u origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME - fi - env + - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh test:linting_copyright: @@ -318,6 +303,7 @@ test:linting_copyright: needs: [test:build_image] script: - git fetch origin main + - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - bash tools/copyright.sh # Override from template diff --git a/tools/check_copyright.py b/tools/check_copyright.py new file mode 100644 index 00000000000..a62334d2421 --- /dev/null +++ b/tools/check_copyright.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +Script to check and optionally add NVIDIA copyright headers to files. +""" + +import sys +import argparse +from pathlib import Path +from datetime import datetime + +EXPECTED_HEADER = """# Copyright (c) {}-{}, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + + +def has_correct_header(file_path, from_year: int): + """Check if file has the correct copyright header.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Check if the expected header is at the start of the file + return content.startswith(EXPECTED_HEADER.format(from_year, str(datetime.now().year))) + except Exception as e: + print(f"Error reading {file_path}: {e}") + return False + + +def main(): + parser = argparse.ArgumentParser( + description='Check and add NVIDIA copyright headers to files.' + ) + parser.add_argument( + 'files', + nargs='+', + help='Files to check/modify' + ) + parser.add_argument( + '--from-year', + type=int, + required=True, + help='Project creation year' + ) + + args = parser.parse_args() + + missing_headers = [] + + for file_path in args.files: + path = Path(file_path) + + if not path.exists(): + print(f"File not found: {file_path}") + continue + + if not path.is_file(): + print(f"Not a file: {file_path}") + continue + + if has_correct_header(path, args.from_year): + print(f"✓ Header present: {file_path}") + else: + print(f"✗ Header missing: {file_path}") + missing_headers.append(path) + + # Exit with error code if headers are missing and not added + if missing_headers: + print(f"\n{len(missing_headers)} file(s) missing copyright header.") + print("\n") + print("Add or replace the header in those files with the following content:") + print(EXPECTED_HEADER) + print("\n") + print( + "Disclaimer: This must done irrespective of the magnitude of the change " + "or whether your are the file/module author." + ) + sys.exit(1) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/tools/copyright.sh b/tools/copyright.sh index 66098f84d2b..3223733647e 100644 --- a/tools/copyright.sh +++ b/tools/copyright.sh @@ -1,34 +1,28 @@ #!/bin/bash +set -euox pipefail -# Files ending with .py should have Copyright notice in the first line. -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +GIT_VERSION=$(git version | awk '{print $3}') +GIT_MAJOR=$(echo $GIT_VERSION | awk -F. '{print $1}') +GIT_MINOR=$(echo $GIT_VERSION | awk -F. '{print $2}') -# Move to the project root -cd $SCRIPT_DIR/.. -find_files_with_missing_copyright() { -find ./megatron/ -type f -name '*.py' | while read path; do - echo -en $path"\t" - head -2 $path | grep -iv 'coding=' | head -1 -done \ - | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \ - | grep -iv 'BSD 3-Clause License' \ - | grep -iv 'Copyright.*Microsoft' \ - | grep -iv 'Copyright.*The Open AI Team' \ - | grep -iv 'Copyright.*The Google AI' \ - | grep -iv 'Copyright.*Facebook' | while read line; do - echo $line | cut -d' ' -f1 - done -} +if [[ $GIT_MAJOR -eq 2 && $GIT_MINOR -lt 31 ]]; then + echo "Git version must be at least 2.31.0. Found $GIT_VERSION" + exit 1 +fi +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -declare RESULT=($(find_files_with_missing_copyright)) # (..) = array +BASE_REF=${BASE_REF:-main} +git remote set-url origin "https://${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" +git fetch origin ${BASE_REF} +CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) -if [ "${#RESULT[@]}" -gt 0 ]; then - echo "Error: Found files with missing copyright:" - for (( i=0; i<"${#RESULT[@]}"; i++ )); do - echo "path= ${RESULT[$i]}" - done - exit 1; -else - echo "Ok: All files start with copyright notice" -fi +if [[ -n "$CHANGED_FILES" ]]; then + CMD="python ${SCRIPT_DIR}/check_copyright.py" + + # Add the files + CMD="$CMD --from-year 2019 $CHANGED_FILES" + + # Run the check + eval $CMD +fi \ No newline at end of file From 38166a61514d121bac99341763238fe2c984d969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 20:46:29 +0200 Subject: [PATCH 047/248] ci: Add codeowners to dev branch (#1898) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/CODEOWNERS | 15 +++++++++++++++ .gitlab/stages/02.test.yml | 24 ++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000000..cc3cb0dbc58 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,15 @@ +megatron/core @NVIDIA/core-nemo @NVIDIA/core-devtech + +.gitlab/ @NVIDIA/ci +.github/ @NVIDIA/ci +.gitlab-ci.yml @NVIDIA/ci +docker/ @NVIDIA/ci +tests/unit_tests/run_ci_test.sh @NVIDIA/ci +tests/test_utils/python_scripts/ +tests/functional_tests/python_test_utils/ @NVIDIA/ci +tests/functional_tests/shell_test_utils/ @NVIDIA/ci +megatron/core/transformer/transformer_block.py @NVIDIA/ci +megatron/core/transformer/transformer_layer.py @NVIDIA/ci +tests/functional_tests/test_cases/ @NVIDIA/ci +tests/functional_tests/recipes/ @NVIDIA/ci +tests/unit_tests/ @NVIDIA/ci diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index db10271da15..f4f06fbca9d 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -11,8 +11,10 @@ include: wait_for_resources: extends: [.test_rules] needs: - - test:linting_formatting - - test:linting_copyright + - job: test:linting_formatting + optional: true + - job: test:linting_copyright + optional: true - job: test:linting_secret_detection optional: true - test:build_image @@ -127,8 +129,10 @@ test:unit_tests_configure: .unit_tests_run: needs: - - test:linting_formatting - - test:linting_copyright + - job: test:linting_formatting + optional: true + - job: test:linting_copyright + optional: true - job: test:linting_secret_detection optional: true - test:unit_tests_configure @@ -280,6 +284,12 @@ test:linting_formatting: needs: [test:build_image] variables: GIT_STRATEGY: "clone" + rules: + - if: $PUBLISH == "yes" + when: never + - if: $CI_PIPELINE_SOURCE == 'push' + when: never + - when: on_success script: - | if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then @@ -301,6 +311,12 @@ test:linting_copyright: - team/megatron image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} needs: [test:build_image] + rules: + - if: $PUBLISH == "yes" + when: never + - if: $CI_PIPELINE_SOURCE == 'push' + when: never + - when: on_success script: - git fetch origin main - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} From 620826b0f7d7e2c588d0584f3e491c4b04fc7694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 19:58:29 +0000 Subject: [PATCH 048/248] ci(fix): dynamic inference tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/test_utils/recipes/gpt-dynamic-inference.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml index 914d3c0a757..748e4734a6d 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml @@ -43,7 +43,7 @@ spec: "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_$CLUSTER.json" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/generations_{environment}_$CLUSTER.json" "N_REPEAT={n_repeat}" @@ -74,4 +74,3 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - From 829ae2fa40d4e68c22eb4338cbd7bfc4216ac007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 23 Oct 2025 21:01:12 +0000 Subject: [PATCH 049/248] ci(fix): No copyright on push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index f4f06fbca9d..98bcaeefc7d 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -287,7 +287,7 @@ test:linting_formatting: rules: - if: $PUBLISH == "yes" when: never - - if: $CI_PIPELINE_SOURCE == 'push' + - if: $CI_PIPELINE_SOURCE == 'push' || $CI_PIPELINE_SOURCE == 'schedule' when: never - when: on_success script: @@ -318,6 +318,10 @@ test:linting_copyright: when: never - when: on_success script: + - | + if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then + exit 0 + fi - git fetch origin main - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - bash tools/copyright.sh From f73769735d423a1adcdceb2aa81f3ce71febc65e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 05:46:02 +0200 Subject: [PATCH 050/248] ci: Move test optimizer into its own bucket (#1909) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 2 +- .github/workflows/cicd-approve-test-queue.yml | 45 +++++++++--- .github/workflows/cicd-main.yml | 40 ++++++++--- .github/workflows/copyright-check.yml | 6 +- .gitlab/stages/02.test.yml | 1 + tests/test_utils/recipes/unit-tests.yaml | 28 ++++++-- tests/unit_tests/find_test_cases.py | 70 +++++++++++++++++++ tests/unit_tests/run_ci_test.sh | 27 ++----- tools/check_copyright.py | 29 ++------ 9 files changed, 170 insertions(+), 78 deletions(-) create mode 100644 tests/unit_tests/find_test_cases.py diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 831f840d22b..157cb8ec5d1 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -82,7 +82,7 @@ runs: uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ --scope unit-tests \ --model unit-tests \ - --test-case '${{ inputs.test_case }}' \ + --test-case "${{ inputs.test_case }}" \ --environment dev \ --platform dgx_h100 \ --tag ${{ inputs.tag }} \ diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index 3e8052c6777..bd87e1d725d 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -41,8 +41,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }} + shell: python run: | - python - <= MAX_CONCURRENCY: print("Maximum concurrency reached, stopping approvals") @@ -113,7 +138,9 @@ jobs: workflow_id = workflow["id"] workflow_name = workflow["display_title"] - print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") + pr_info = workflow.get("pull_requests", [{}])[0] + pr_number = pr_info.get("number", "unknown") + print(f"Approving workflow {workflow_name} (PR #{pr_number}) with Run Id: {workflow_id}") deployment_url = f"actions/runs/{workflow_id}/pending_deployments" deployment = make_request(deployment_url)[0] @@ -132,8 +159,6 @@ jobs: else: print(f"Failed to approve deployment {deployment['id']}") exit(1) - - EOF notify: if: failure() runs-on: ubuntu-latest diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 38739c07b1f..4a1ae76b081 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -296,21 +296,41 @@ jobs: secrets: | GH_TOKEN=${{ secrets.PAT }} + cicd-parse-unit-tests: + runs-on: ubuntu-latest + outputs: + unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }} + needs: + - pre-flight + - cicd-wait-in-queue + - cicd-container-build + if: | + ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + ) + && needs.pre-flight.outputs.is_merge_group == 'false' + && !cancelled() + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Parse unit tests + id: parse-unit-tests + run: | + cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}]' | jq -c > unit-tests.json + echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT + cicd-unit-tests-latest: strategy: fail-fast: false matrix: - include: - - bucket: "unit_tests" - - bucket: "unit_tests/data/" - - bucket: "unit_tests/dist_checkpointing/*.py" - - bucket: "unit_tests/dist_checkpointing/models/" - - bucket: "unit_tests/transformer/*.py" - - bucket: "unit_tests/transformer/moe" + include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }} needs: - pre-flight - cicd-wait-in-queue - cicd-container-build + - cicd-parse-unit-tests runs-on: nvidia-ci-aws-gpu-x8 name: "${{ matrix.bucket }} - latest" environment: nemo-ci @@ -332,12 +352,12 @@ jobs: - name: main uses: ./.github/actions with: - test_case: tests/${{ matrix.bucket }} + test_case: ${{ matrix.bucket }} tag: latest timeout: ${{ matrix.timeout || 30 }} is_unit_test: "true" PAT: ${{ secrets.PAT }} - container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} + container-image: ${{ env.container-registry }}/megatron-lm:1909 #${{ github.sha }} cicd-parse-integration-tests: runs-on: ubuntu-latest @@ -414,7 +434,7 @@ jobs: - pre-flight - cicd-wait-in-queue - cicd-parse-integration-tests - # - cicd-unit-tests-latest + - cicd-unit-tests-latest runs-on: nvidia-ci-aws-gpu-x8 name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" environment: nemo-ci diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index ff135c6c958..0463e1dd962 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -10,7 +10,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.. +# limitations under the License. name: Copyright check @@ -31,9 +31,7 @@ jobs: if: | !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.4 - with: - from-year: 2019 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.9 copyright-check-summary: needs: [pre-flight, copyright-check] diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 98bcaeefc7d..699bef68181 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -323,6 +323,7 @@ test:linting_copyright: exit 0 fi - git fetch origin main + - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - bash tools/copyright.sh diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/unit-tests.yaml index aef67781168..d84e507c6df 100644 --- a/tests/test_utils/recipes/unit-tests.yaml +++ b/tests/test_utils/recipes/unit-tests.yaml @@ -59,49 +59,63 @@ spec: cp coverage.xml {assets_dir} products: - - test_case: [tests/unit_tests/data/] + - test_case: [tests/unit_tests/data/**/*.py] products: - environment: [lts, dev] tag: [latest, legacy] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests/dist_checkpointing/*.py] + - test_case: [tests/unit_tests/dist_checkpointing/test_optimizer.py] products: - environment: [lts, dev] tag: [latest, legacy] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests/dist_checkpointing/models/] + - test_case: [tests/unit_tests/dist_checkpointing/**/*.py] products: - environment: [lts, dev] tag: [latest, legacy] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests/transformer/*.py] + - test_case: [tests/unit_tests/dist_checkpointing/models/**/*.py] products: - environment: [lts, dev] tag: [latest, legacy] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests/transformer/moe] + - test_case: [tests/unit_tests/dist_checkpointing/models/test_moe_experts.py] products: - environment: [lts, dev] tag: [latest, legacy] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests/distributed/fsdp] + - test_case: [tests/unit_tests/transformer/**/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/transformer/moe/**/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/distributed/fsdp/**/*.py] products: - environment: [lts, dev] tag: [latest] scope: [unit-tests] n_repeat: [1] time_limit: [1800] - - test_case: [tests/unit_tests] + - test_case: [tests/unit_tests/**/*.py] products: - environment: [lts, dev] tag: [latest, legacy] diff --git a/tests/unit_tests/find_test_cases.py b/tests/unit_tests/find_test_cases.py new file mode 100644 index 00000000000..2e9f5515b7d --- /dev/null +++ b/tests/unit_tests/find_test_cases.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +import subprocess +import sys +from pathlib import Path + + +def get_test_cases(yaml_file): + result = subprocess.run( + ['yq', 'eval', '.products[].test_case[]', yaml_file], + capture_output=True, + text=True, + check=True, + ) + return [line.strip() for line in result.stdout.strip().split('\n') if line.strip()] + + +def get_base_path(pattern): + if '**' in pattern: + return pattern.split('/**')[0] + elif '*' in pattern: + return pattern.rsplit('/', 1)[0] + return pattern.rstrip('/') + + +def is_child_of_bucket(test_case, bucket): + test_base = get_base_path(test_case) + bucket_base = get_base_path(bucket) + return test_base.startswith(bucket_base + '/') + + +def expand_pattern(pattern): + if '**' in pattern: + parts = pattern.split('/**/') + if len(parts) == 2: + base_dir, file_pattern = parts + else: + # Handle case like 'dir/**' + base_dir = pattern.split('/**')[0] + file_pattern = '*.py' + return [str(f) for f in Path(base_dir).rglob(file_pattern) if f.is_file()] + elif '*' in pattern: + base_dir, file_pattern = pattern.rsplit('/', 1) + return [str(f) for f in Path(base_dir).glob(file_pattern) if f.is_file()] + elif Path(pattern).is_file(): + return [pattern] + return [] + + +def main(): + BUCKET = sys.argv[1] + YAML_FILE = 'tests/test_utils/recipes/unit-tests.yaml' + + all_test_cases = get_test_cases(YAML_FILE) + bucket_files = set(expand_pattern(BUCKET)) + + # Collect files from child test cases to ignore + files_to_ignore = set() + for test_case in all_test_cases: + if test_case != BUCKET and is_child_of_bucket(test_case, BUCKET): + files_to_ignore.update(expand_pattern(test_case)) + + # Output files to ignore + for file in sorted(files_to_ignore & bucket_files): + print(f"--ignore={file}") + + +if __name__ == '__main__': + main() diff --git a/tests/unit_tests/run_ci_test.sh b/tests/unit_tests/run_ci_test.sh index 7e12ebbab1e..81dd3ae2a14 100755 --- a/tests/unit_tests/run_ci_test.sh +++ b/tests/unit_tests/run_ci_test.sh @@ -114,27 +114,10 @@ for element in "${MARKER[@]:1}"; do done export BUCKET -IGNORE_TEST_CASES=$( - cat $SCRIPT_PATH/../test_utils/recipes/unit-tests.yaml | - yq eval ' - with(.products[].test_case; del(.[] | select(. == env(BUCKET)))) - | .products[].test_case[] - ' | - tr " " "\n" -) - IGNORE_ARGS=() -while IFS= read -r test_case; do - if [[ $test_case == *\** ]]; then - FILES=($(ls $test_case)) - echo ${FILES[@]} - for file in "${FILES[@]}"; do - IGNORE_ARGS+=("--ignore='$file'") - done - else - IGNORE_ARGS+=("--ignore=$test_case") - fi -done <<<"$IGNORE_TEST_CASES" +while IFS= read -r line; do + [[ -n "$line" ]] && IGNORE_ARGS+=("$line") +done < <(python tests/unit_tests/find_test_cases.py "$BUCKET") echo "------ARGUMENTS for SLURM ---" MASTER_ADDR=${MASTER_ADDR:-localhost} @@ -167,7 +150,7 @@ for i in $(seq $UNIT_TEST_REPEAT); do -m pytest \ -xvs \ ${IGNORE_ARGS[@]} \ - -m "'not experimental and ${MARKER_ARG}'" $BUCKET) + -m "'not experimental and ${MARKER_ARG}'" $(echo "$BUCKET" | sed 's|/\*\*/\*\.py$||')) eval "$CMD" if [[ "$TAG" == "latest" ]]; then @@ -175,7 +158,7 @@ for i in $(seq $UNIT_TEST_REPEAT); do -xvs \ --experimental \ ${IGNORE_ARGS[@]} \ - -m "'experimental and ${MARKER_ARG}'" $BUCKET) + -m "'experimental and ${MARKER_ARG}'" $(echo "$BUCKET" | sed 's|/\*\*/\*\.py$||')) eval "$CMD" fi diff --git a/tools/check_copyright.py b/tools/check_copyright.py index a62334d2421..d63cd906eab 100644 --- a/tools/check_copyright.py +++ b/tools/check_copyright.py @@ -8,30 +8,17 @@ from pathlib import Path from datetime import datetime -EXPECTED_HEADER = """# Copyright (c) {}-{}, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" +EXPECTED_HEADER = """# Copyright (c) {} NVIDIA CORPORATION & AFFILIATES. All rights reserved.""" -def has_correct_header(file_path, from_year: int): +def has_correct_header(file_path): """Check if file has the correct copyright header.""" try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Check if the expected header is at the start of the file - return content.startswith(EXPECTED_HEADER.format(from_year, str(datetime.now().year))) + return content.startswith(EXPECTED_HEADER.format(str(datetime.now().year))) except Exception as e: print(f"Error reading {file_path}: {e}") return False @@ -46,12 +33,6 @@ def main(): nargs='+', help='Files to check/modify' ) - parser.add_argument( - '--from-year', - type=int, - required=True, - help='Project creation year' - ) args = parser.parse_args() @@ -68,7 +49,7 @@ def main(): print(f"Not a file: {file_path}") continue - if has_correct_header(path, args.from_year): + if has_correct_header(path): print(f"✓ Header present: {file_path}") else: print(f"✗ Header missing: {file_path}") @@ -79,7 +60,7 @@ def main(): print(f"\n{len(missing_headers)} file(s) missing copyright header.") print("\n") print("Add or replace the header in those files with the following content:") - print(EXPECTED_HEADER) + print(EXPECTED_HEADER.format(str(datetime.now().year))) print("\n") print( "Disclaimer: This must done irrespective of the magnitude of the change " From 176a2ed5787819cbf6da4ee0a549d2108fd59b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 10:49:13 +0200 Subject: [PATCH 051/248] ci: Update container image tags to use github.sha --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4a1ae76b081..9c2f8ae6f5f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -357,7 +357,7 @@ jobs: timeout: ${{ matrix.timeout || 30 }} is_unit_test: "true" PAT: ${{ secrets.PAT }} - container-image: ${{ env.container-registry }}/megatron-lm:1909 #${{ github.sha }} + container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} cicd-parse-integration-tests: runs-on: ubuntu-latest @@ -462,7 +462,7 @@ jobs: timeout: ${{ matrix.timeout || 30 }} is_unit_test: "false" PAT: ${{ secrets.PAT }} - container-image: ${{ env.container-registry }}/megatron-lm:1864 # ${{ github.sha }} + container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} Nemo_CICD_Test: needs: From d3d204881762dcf25186a9d0a88df8fd91ef46ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 20:35:05 +0200 Subject: [PATCH 052/248] Ko3n1g/chore/merge main into dev (#1903) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig Co-authored-by: James Shen Co-authored-by: Chen-Han Yu Co-authored-by: Shanmugam Ramasamy Co-authored-by: Shanmugam Ramasamy Co-authored-by: Mcore Bot Co-authored-by: Shanmugam Ramasamy Co-authored-by: Siddharth Singh Co-authored-by: Shanmugam Ramasamy Co-authored-by: Youngeun Kwon Co-authored-by: Shunjia Ding Co-authored-by: Maanu Grover Co-authored-by: Jack Chang Co-authored-by: jianbinc Co-authored-by: xuwenc --- .../workflows/build-test-publish-wheel.yml | 6 +- .github/workflows/cicd-approve-test-queue.yml | 60 +++++++++++-------- .github/workflows/cicd-main.yml | 8 +-- .github/workflows/install-test.yml | 6 +- 4 files changed, 50 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index 0b6cdd7efdb..1ff9f53202b 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -21,6 +21,8 @@ on: - main - "pull-request/[0-9]+" - "deploy-release/*" + merge_group: + types: [checks_requested] defaults: run: @@ -32,12 +34,13 @@ permissions: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 build-test-publish-wheel: needs: [pre-flight] if: | !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.63.1 with: @@ -61,6 +64,7 @@ jobs: if: | ( needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || always() ) diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index bd87e1d725d..1f23905d5d8 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -23,6 +23,9 @@ jobs: approve-queue: runs-on: ubuntu-latest environment: main + strategy: + matrix: + branch: [main, dev] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -45,13 +48,13 @@ jobs: run: | import os import requests - + import re # GitHub API configuration GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] REPO = os.environ["GITHUB_REPOSITORY"] - MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) - API_BASE = f"https://api.github.com/repos/{REPO}" + MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2 + API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM" # Headers for GitHub API headers = { @@ -76,22 +79,30 @@ jobs: print(f"Response: {e.response.text}") return None - def is_pr_targeting_main(workflow_run): - """Check if a workflow run belongs to a PR targeting main branch.""" - # Check if it's a pull_request event - if workflow_run.get("event") != "pull_request": - return False + def is_pr_targeting_branch(workflow_run, target_branch): + """ + Check if a workflow run belongs to a PR targeting the given branch. + Extract PR number from head branch like 'pull-request/1913' and verify base branch. + """ + print(workflow_run.get("head_branch", "")) + head_branch = workflow_run.get("head_branch", "") + match = re.match(r"pull-request/(\d+)", head_branch) + if not match: + return False # Not a PR branch pattern + + pr_number = int(match.group(1)) - # Get the head branch and base branch from pull_requests - pull_requests = workflow_run.get("pull_requests", []) - if not pull_requests: + # Fetch PR info from GitHub API + pr_info = make_request(f"pulls/{pr_number}") + if not pr_info: + print(f"Failed to fetch PR #{pr_number}") return False - - # Check if any PR is targeting main - for pr in pull_requests: - if pr.get("base", {}).get("ref") == "main": - return True - + + base_branch = pr_info.get("base", {}).get("ref") + if base_branch == target_branch: + print(f"PR #{pr_number} targets {target_branch}") + return True + return False # Get current running and queued workflows @@ -99,19 +110,19 @@ jobs: queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", []) in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", []) - # Filter for workflows belonging to PRs targeting main + # Filter for workflows belonging to PRs targeting ${{ matrix.branch }} queued_workflow_runs = [run for run in queued_workflow_runs - if run["name"] == "CICD Megatron-LM" and is_pr_targeting_main(run)] + if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")] in_progress_workflow_runs = [run for run in in_progress_workflow_runs - if run["name"] == "CICD Megatron-LM" and is_pr_targeting_main(run)] + if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")] # Count running and queued workflows queued_workflows = len(queued_workflow_runs) in_progress_workflows = len(in_progress_workflow_runs) total_workflows = queued_workflows + in_progress_workflows - print(f"Current queued workflows (PRs targeting main): {queued_workflows}") - print(f"Current running workflows (PRs targeting main): {in_progress_workflows}") + print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}): {queued_workflows}") + print(f"Current running workflows (PRs targeting ${{ matrix.branch }}): {in_progress_workflows}") print(f"Total workflows: {total_workflows}") print(f"Max concurrency: {MAX_CONCURRENCY}") @@ -122,8 +133,9 @@ jobs: # Get waiting CI workflows for test environment print("Fetching deployments...") pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", []) + print("Pending workflows:", len(pending_workflows)) pending_workflows = [run for run in pending_workflows - if run["name"] == "CICD Megatron-LM" and is_pr_targeting_main(run)] + if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")] # Sort deployments by creation date (oldest first) print("Sorting workflows...") @@ -140,7 +152,7 @@ jobs: workflow_name = workflow["display_title"] pr_info = workflow.get("pull_requests", [{}])[0] pr_number = pr_info.get("number", "unknown") - print(f"Approving workflow {workflow_name} (PR #{pr_number}) with Run Id: {workflow_id}") + print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") deployment_url = f"actions/runs/{workflow_id}/pending_deployments" deployment = make_request(deployment_url)[0] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 9c2f8ae6f5f..88be3d5bcc3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -47,7 +47,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} REPO: ${{ github.repository }} - SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} steps: - name: Checkout repository uses: actions/checkout@v4 @@ -63,11 +62,12 @@ jobs: id: check-membership env: IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} - IS_DEV_BRANCH: ${{ github.ref == 'refs/heads/dev' }} + IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ]; then + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi @@ -148,7 +148,7 @@ jobs: pre-flight: needs: [is-not-external-contributor] - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 linting: runs-on: ubuntu-latest diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml index 8e409ef2207..419202dbc2c 100644 --- a/.github/workflows/install-test.yml +++ b/.github/workflows/install-test.yml @@ -24,15 +24,18 @@ on: - main - "pull-request/[0-9]+" - "deploy-release/*" + merge_group: + types: [checks_requested] jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 pip-test-pytorch: needs: [pre-flight] if: | !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') runs-on: linux-amd64-cpu16 name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch @@ -77,6 +80,7 @@ jobs: needs: [pre-flight] if: | !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') runs-on: linux-amd64-cpu16 name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch From 1ef95d9cc965be5b2373a490eee4f6badda30a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 18:43:37 +0000 Subject: [PATCH 053/248] ci: Fix approval bot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/00.pre.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 35ebef1ecb8..5c74073ff14 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -379,10 +379,10 @@ pre:approve_merge_gate: gh api "repos/$REPO/actions/runs?status=waiting" --jq '.workflow_runs[].id' \ | while read run_id; do - HEAD_BRANCH=$(gh api "repos/$REPO/actions/runs/$run_id" --jq '.head_branch') + HEAD_BRANCH=$(gh api "repos/$REPO/actions/runs/$run_id" --jq '.head_branch' 2>/dev/null) || continue PR_NUMBER="${HEAD_BRANCH##*/}" if [ -n "$PR_NUMBER" ]; then - PR_BASE=$(gh api "repos/$REPO/pulls/$PR_NUMBER" --jq '.base.ref') + PR_BASE=$(gh api "repos/$REPO/pulls/$PR_NUMBER" --jq '.base.ref' 2>/dev/null) || continue if [ "$PR_BASE" = "$TARGET_BRANCH" ]; then gh api \ --method POST "repos/$REPO/actions/runs/$run_id/pending_deployments" \ From 9b8d7033349d38d57b40dff8aeb4deeb5230d6b8 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Fri, 24 Oct 2025 14:28:40 -0500 Subject: [PATCH 054/248] ci: Fix dev branch CI (#1922) Fix dev branch CI For some reason, on the dev branch, the call to `energy_monitor.pause()` fails in the training script. It does not seem to be related to the dependencies because this still fails when using the same docker image with same pyproject.toml and uv.lock file. I recommend we merge this to unblock the dev branch and allow us more time to dig deeper into the root cause. --------- Signed-off-by: Charlie Truong Co-authored-by: Oliver Koenig --- megatron/training/training.py | 17 ++++++++++------- .../gpt/gpt3_mcore_tp1_pp2/model_config.yaml | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/megatron/training/training.py b/megatron/training/training.py index fec4c1a3dc7..f805dab0f15 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1241,7 +1241,7 @@ def setup_model_and_optimizer( # set dense model related args in to global args before getting dense model args.num_experts = None args.expert_model_parallel_size = 1 - args.ffn_hidden_size = moe_ffn_hidden_size * args.moe_upcycling_granularity + args.ffn_hidden_size = moe_ffn_hidden_size * args.moe_upcycling_granularity # get dense model dense_model_for_upcycling = get_model(model_provider_func, model_type) @@ -1838,7 +1838,8 @@ def save_checkpoint_and_time( # Stop timer to get accurate train interval time and exclude checkpointing duration timers('interval-time').stop() - energy_monitor.pause() + if args.log_energy: + energy_monitor.pause() # Extra barrier is added to make sure all ranks report the max time. timer_key = 'save-checkpoint-non-persistent' if non_persistent_ckpt else 'save-checkpoint' @@ -1880,7 +1881,9 @@ def save_checkpoint_and_time( ) # Recover timing - energy_monitor.resume() + if args.log_energy: + energy_monitor.resume() + timers('interval-time', log_level=0).start(barrier=True) @@ -2791,7 +2794,7 @@ def evaluate_and_print_results( eval_iters = [args.eval_iters] else: eval_iters = args.eval_iters - + if args.full_validation: assert len(eval_iters) == len(data_iterators) @@ -2807,7 +2810,7 @@ def evaluate_and_print_results( eval_iters = [args.eval_iters] else: eval_iters = args.eval_iters - + for index, (iterator, iterations) in enumerate(zip(data_iterators, eval_iters)): suffix = "" if args.multiple_validation_sets: @@ -2925,7 +2928,7 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider build_train_valid_test_datasets_provider, (1, 1, 1) if getattr(args, 'perform_rl_step', False) else None ) valid_ds = [valid_ds] if not isinstance(valid_ds, list) else valid_ds - + # Build dataloders. train_dataloader = build_pretraining_data_loader(train_ds, args.consumed_train_samples) @@ -3000,7 +3003,7 @@ def _get_iterator(dataloader_type, dataloader): if valid_dataloaders is not None: # when using full validation, we need to override eval iters with the correct - # number of iterations on tp rank 0 so that it can be distributed to the other + # number of iterations on tp rank 0 so that it can be distributed to the other # ranks later if args.full_validation: if args.multiple_validation_sets: diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml index 84da70b66c7..4cc6e53b8c8 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_tp1_pp2/model_config.yaml @@ -1,4 +1,4 @@ -s`ENV_VARS: +ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring From 10d280ada1df76241435f47a24b37869354f65ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 21:47:25 +0200 Subject: [PATCH 055/248] Ko3n1g/ci/cherrypick automation dev (#1926) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cherry-pick-release-commit.yml | 5 ++++- .github/workflows/cicd-main.yml | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 0fc1da80015..9cf8ed98660 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -17,10 +17,13 @@ on: push: branches: - main + - dev jobs: cherry-pick: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.31.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9 + with: + target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+' secrets: PAT: ${{ secrets.PAT }} SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 88be3d5bcc3..f5a999858dd 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -62,6 +62,7 @@ jobs: id: check-membership env: IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} + IS_DEV_BRANCH: ${{ github.ref == 'refs/heads/dev' }} IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} run: | From 017c7b3a3c1f31d25f687b419930e11e46b09d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 24 Oct 2025 19:52:49 +0000 Subject: [PATCH 056/248] ci: Fix dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index f5a999858dd..96deabcf9f3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -68,7 +68,7 @@ jobs: run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi From 598d41f2b987ffe2f9f9598d2e41e5ef99e4e4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 25 Oct 2025 16:20:19 +0200 Subject: [PATCH 057/248] Ko3n1g/chore/merge main into dev20251025 (#1943) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/pull_request_template.md | 64 + .github/workflows/cicd-main.yml | 17 +- .github/workflows/community-bot.yml | 3 +- .gitlab-ci.yml | 4 + .gitlab/stages/00.pre.yml | 51 +- .gitlab/stages/01.build.yml | 2 + .gitlab/stages/02.test.yml | 104 +- .gitlab/stages/03.integration-tests.yml | 2 + .gitlab/stages/04.functional-tests.yml | 2 + .gitlab/stages/05.publish.yml | 48 +- .../golden_values_dev_dgxh100_eos.json | 178 ++ .../golden_values_dev_dgxh100_eos.json | 178 ++ .../golden_values_dev_dgxh100_eos.json | 2699 +++++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 161 + .../python_scripts/approve_merge_gate.py | 117 + tests/test_utils/recipes/unit-tests.yaml | 21 + 16 files changed, 3491 insertions(+), 160 deletions(-) create mode 100644 .github/pull_request_template.md create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json create mode 100644 tests/test_utils/python_scripts/approve_merge_gate.py diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000000..7f7dedd27ad --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,64 @@ +# What does this PR do ? + + +:warning: For major changes (either in lines of code or in its impact), please make sure to first share discuss a design-doc with the team. + +## Contribution process + +```mermaid +flowchart LR + A[Pre-checks] --> B[PR Tests] + subgraph Code Review/Approval + C1[Expert Review] --> C2[Final Review] + end + B --> C1 + C2 --> D[Merge] +``` + +### Pre-checks + +- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`) +- [ ] I have added relevant unit tests +- [ ] I have added relevant functional tests +- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html) +- [ ] I have added relevant documentation +- [ ] I have run the [autoformatter.sh](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/autoformat.sh) on my PR + +### Code review + +The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team. + +
+For MRs into `main` branch + +#### (Step 1): Add PR label `Expert Review` + +#### (Step 2): Collect the expert reviewers reviews + +1. Attach the `Expert Review` label when your PR is ready for review. +2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon. + +:warning: Only proceed to the next step once all reviewers have approved, merge-conflict are resolved and the CI is passing. +Final Review might get declined if these requirements are not fulfilled. + +#### (Step 3): Final Review + +1. Add `Final Review` label +2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon. + +#### (Optional Step 4): Cherry-pick into release branch + +If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch. + +
+ +
+For MRs into `dev` branch +The proposed review process for `dev` branch is under active discussion. + +MRs are mergable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`. +
+ +### Merging your PR + +Any member of [core-adlr](https://github.com/orgs/teams/NVIDIA/core-adlr) and [`core-nemo`](https://github.com/orgs/teams/NVIDIA/core-nemo) will be able to merge your PR. diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 96deabcf9f3..d1e411be98f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -14,7 +14,7 @@ name: CICD Megatron-LM on: schedule: - - cron: "0 */2 * * *" + - cron: 0 0 * * * push: branches: - dev @@ -23,6 +23,7 @@ on: - "deploy-release/*" merge_group: types: [checks_requested] + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} @@ -149,7 +150,7 @@ jobs: pre-flight: needs: [is-not-external-contributor] - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10 linting: runs-on: ubuntu-latest @@ -319,7 +320,7 @@ jobs: - name: Parse unit tests id: parse-unit-tests run: | - cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}]' | jq -c > unit-tests.json + cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT cicd-unit-tests-latest: @@ -367,6 +368,14 @@ jobs: - cicd-wait-in-queue - cicd-container-build - cicd-unit-tests-latest + if: | + ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + ) + && needs.pre-flight.outputs.is_merge_group == 'false' + && !cancelled() outputs: integration-tests: ${{ steps.main.outputs.integration-tests }} steps: @@ -491,7 +500,7 @@ jobs: env: GH_TOKEN: ${{ github.token }} RUN_ID: ${{ github.run_id }} - SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }} + SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} run: | FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure")] | length') || echo 0 SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped")] | length') || echo 0 diff --git a/.github/workflows/community-bot.yml b/.github/workflows/community-bot.yml index 57d482afa34..9f939510ed1 100644 --- a/.github/workflows/community-bot.yml +++ b/.github/workflows/community-bot.yml @@ -21,6 +21,7 @@ on: jobs: community-bot: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.49.1 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10 secrets: GH_TOKEN: ${{ secrets.PAT }} + environment: main diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6b46d92aacb..5ddf5f094c2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,8 @@ workflow: - if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm") when: never + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') + # ci-branches only for schedule - if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule" when: never @@ -154,6 +156,8 @@ default: when: runner_system_failure variables: + BUILD: + value: "yes" UNIT_TEST: value: "yes" options: diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 5c74073ff14..dca3a7b47ae 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -8,6 +8,7 @@ include: when: always - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' when: always + - when: never stage: .pre @@ -348,53 +349,3 @@ pre:check_status_of_main: - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' when: always - when: never - -pre:approve_merge_gate: - extends: [.pre_rules] - image: maniator/gh - tags: - - arch/amd64 - - env/prod - - origin/jet-fleet - - owner/jet-core - - purpose/utility - - team/megatron - script: - - | - set -eoux pipefail - EXIT_CODE=0 - python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_COMMIT_BRANCH" --once || EXIT_CODE=$? - - export GH_TOKEN=$GH_TOKEN - export REPO=NVIDIA/Megatron-LM - export TARGET_BRANCH="$CI_COMMIT_BRANCH" - - if [[ $EXIT_CODE -eq 0 ]]; then - STATUS="approved" - COMMENT="Main is healthy. Submitting PR." - else - STATUS="rejected" - COMMENT="Main is not healthy. An automation engineer is investigating. No need to take any action." - fi - - gh api "repos/$REPO/actions/runs?status=waiting" --jq '.workflow_runs[].id' \ - | while read run_id; do - HEAD_BRANCH=$(gh api "repos/$REPO/actions/runs/$run_id" --jq '.head_branch' 2>/dev/null) || continue - PR_NUMBER="${HEAD_BRANCH##*/}" - if [ -n "$PR_NUMBER" ]; then - PR_BASE=$(gh api "repos/$REPO/pulls/$PR_NUMBER" --jq '.base.ref' 2>/dev/null) || continue - if [ "$PR_BASE" = "$TARGET_BRANCH" ]; then - gh api \ - --method POST "repos/$REPO/actions/runs/$run_id/pending_deployments" \ - -F "environment_ids[]=$(gh api "repos/$REPO/environments" --jq '.environments[] | select(.name=="merge-gate") | .id')" \ - -f state="$STATUS" \ - -f comment="$COMMENT"; - fi - fi - done - retry: - max: 2 - rules: - - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') - when: always - - when: never diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index 2fd9e1f32e6..0658daaa9ec 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -1,5 +1,7 @@ .build_rules: rules: + - if: $BUILD == "no" + when: never - when: on_success stage: test diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 699bef68181..2f018f94e66 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -2,6 +2,8 @@ rules: - if: $PUBLISH == "yes" when: never + - if: $BUILD == "no" + when: never - when: on_success stage: test @@ -11,10 +13,6 @@ include: wait_for_resources: extends: [.test_rules] needs: - - job: test:linting_formatting - optional: true - - job: test:linting_copyright - optional: true - job: test:linting_secret_detection optional: true - test:build_image @@ -76,7 +74,7 @@ test:unit_tests_configure: "--n-repeat ${UNIT_TEST_REPEAT}" "--time-limit $(( UNIT_TEST_TIMEOUT * 60 ))" "--test-cases all" - "--cluster dgxh100_coreweave" + "--cluster $H100_CLUSTER" "--platform dgx_h100" "--partition batch" "--container-image ${UTILITY_IMAGE}" @@ -161,46 +159,6 @@ test:unit_tests_configure: - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' when: on_success -test:unit_tests_pyt(DEV)_mcore(legacy): - extends: [.unit_tests_run] - variables: - ENVIRONMENT: dev - TAG: legacy - rules: - - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' - when: never - - if: $CI_COMMIT_BRANCH == 'ci-dev-unit-test-extended' - when: never - - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ - when: never - - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' - when: never - - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" - allow_failure: true - when: on_success - - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' - when: on_success - -test:unit_tests_pyt(LTS)_mcore(legacy): - extends: [.unit_tests_run] - variables: - ENVIRONMENT: lts - TAG: legacy - rules: - - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' - when: never - - if: $CI_COMMIT_BRANCH == 'ci-dev-unit-test-extended' - when: never - - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^core_r/ - when: never - - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' - when: never - - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" - allow_failure: true - when: on_success - - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' - when: on_success - test:unit_tests_pyt(DEV)_mcore(latest): extends: [.unit_tests_run] variables: @@ -271,62 +229,6 @@ test:linting_docs_build: - cd documentation/ - ./repo docs -test:linting_formatting: - extends: [.test_rules] - image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} - tags: - - arch/amd64 - - env/prod - - origin/jet-fleet - - owner/jet-core - - purpose/utility - - team/megatron - needs: [test:build_image] - variables: - GIT_STRATEGY: "clone" - rules: - - if: $PUBLISH == "yes" - when: never - - if: $CI_PIPELINE_SOURCE == 'push' || $CI_PIPELINE_SOURCE == 'schedule' - when: never - - when: on_success - script: - - | - if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then - exit 0 - fi - - set +e - - env - - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh - -test:linting_copyright: - extends: [.test_rules] - tags: - - arch/amd64 - - env/prod - - origin/jet-fleet - - owner/jet-core - - purpose/utility - - team/megatron - image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} - needs: [test:build_image] - rules: - - if: $PUBLISH == "yes" - when: never - - if: $CI_PIPELINE_SOURCE == 'push' - when: never - - when: on_success - script: - - | - if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then - exit 0 - fi - - git fetch origin main - - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" - - export GITLAB_ENDPOINT=gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT} - - bash tools/copyright.sh - # Override from template secret_detection: rules: diff --git a/.gitlab/stages/03.integration-tests.yml b/.gitlab/stages/03.integration-tests.yml index df4d84234bb..824721b9fb1 100644 --- a/.gitlab/stages/03.integration-tests.yml +++ b/.gitlab/stages/03.integration-tests.yml @@ -1,6 +1,8 @@ .integration_tests_rules: stage: integration_tests rules: + - if: $BUILD == "no" + when: never - if: $INTEGRATION_TEST == "yes" when: on_success - when: never diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index ea2f1bcef8c..dbdef4484f2 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -1,6 +1,8 @@ .functional_tests_rules: stage: functional_tests rules: + - if: $BUILD == "no" + when: never - if: $FUNCTIONAL_TEST == "yes" when: on_success - when: never diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index 695479179c5..20495434f6b 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -1,6 +1,8 @@ .publish_common_freeze: stage: publish rules: + - if: $BUILD == "no" + when: never - if: ($CI_COMMIT_BRANCH == "main") && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze" when: manual - when: never @@ -538,10 +540,6 @@ publish:upload_statistics: stage: publish image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} needs: - - job: test:unit_tests_pyt(DEV)_mcore(legacy) - optional: true - - job: test:unit_tests_pyt(LTS)_mcore(legacy) - optional: true - job: test:unit_tests_pyt(DEV)_mcore(latest) - job: test:unit_tests_pyt(LTS)_mcore(latest) - job: functional:run_lts_dgx_a100 @@ -749,3 +747,45 @@ publish:merge_into_dev: - if: $CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push" allow_failure: true - when: never + +publish:approve_merge_gate: + stage: publish + image: maniator/gh + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - | + set -eoux pipefail + EXIT_CODE=0 + apk add python3 + python -m venv .venv + source .venv/bin/activate + pip install --no-cache-dir python-gitlab click pygithub + export GITLAB_ENDPOINT + export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_COMMIT_BRANCH" --once || EXIT_CODE=$? + + export GH_TOKEN=$GH_TOKEN + export REPO=NVIDIA/Megatron-LM + export TARGET_BRANCH="$CI_COMMIT_BRANCH" + + if [[ $EXIT_CODE -eq 0 ]]; then + export STATUS="approved" + export COMMENT="Main is healthy. Submitting PR." + else + export STATUS="rejected" + export COMMENT="Main is not healthy. An automation engineer is investigating. No need to take any action." + fi + + python tests/test_utils/python_scripts/approve_merge_gate.py + retry: + max: 2 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" || ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') + when: always + - when: never diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..5decbad6a1a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,178 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.2756917476654053, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": 105.62266013491053 +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..20da149d1f1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,178 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.3700687885284424, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": 79.31454807788677 +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..ad16c16b924 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,2699 @@ +{ + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 10.056535482406616, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + -3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, + -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + -1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + -6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + -5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 48.19877076148987, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + -0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 86.85381531715393, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + -6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + -2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, + -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + -1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + -0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 125.58511328697205, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + -0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 154.75680470466614, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + -2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": 105.49771806099545 +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..1491284644d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,161 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 2.2049803733825684, + "cuda_graph_request_count_map": null, + "step_count": 240, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + }, + "throughput": 13.337338555385374 +} \ No newline at end of file diff --git a/tests/test_utils/python_scripts/approve_merge_gate.py b/tests/test_utils/python_scripts/approve_merge_gate.py new file mode 100644 index 00000000000..dbd4ef99b44 --- /dev/null +++ b/tests/test_utils/python_scripts/approve_merge_gate.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 +""" +Approve pending deployments for workflow runs from PRs targeting a specific branch. + +Requirements: + pip install PyGithub + +Usage: + export GH_TOKEN="ghp_..." + export REPO="NVIDIA/Megatron-LM" + export TARGET_BRANCH="main" + export STATUS="approved" + export COMMENT="Auto-approved by CI" + + python approve_pending_deployments.py +""" + +import logging +import os +import re +import sys + +from github import Github, GithubException + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + # Get environment variables + github_token = os.environ.get("GH_TOKEN") + repo_name = os.environ.get("REPO") + target_branch = os.environ.get("TARGET_BRANCH") + status = os.environ.get("STATUS") + comment = os.environ.get("COMMENT", "") + + if not all([github_token, repo_name, target_branch, status]): + logger.error( + "Error: GITHUB_TOKEN, REPO, TARGET_BRANCH, and STATUS environment variables must be set" + ) + sys.exit(1) + + # Initialize GitHub client + g = Github(github_token) + + try: + repo = g.get_repo(repo_name) + except GithubException as e: + logger.error(f"Error accessing repository: {e}") + sys.exit(1) + + # Get merge-gate environment ID + env_id = None + try: + # Note: PyGithub doesn't have direct environment support yet, + # so we use the underlying requester + response = repo._requester.requestJsonAndCheck("GET", f"{repo.url}/environments") + for env in response[1].get("environments", []): + if env.get("name") == "merge-gate": + env_id = env.get("id") + break + + if not env_id: + logger.error("Error: merge-gate environment not found") + sys.exit(1) + except GithubException as e: + logger.error(f"Error fetching environments: {e}") + sys.exit(1) + + logger.info(f"merge-gate environment ID: {env_id}") + + # Get waiting workflow runs + try: + workflow_runs = repo.get_workflow_runs(status="waiting") + except GithubException as e: + logger.error(f"Error fetching workflow runs: {e}") + sys.exit(1) + + logger.info(f"Found {workflow_runs.totalCount} waiting workflow runs") + + # Process each workflow run + for run in workflow_runs: + head_branch = run.head_branch + + # Extract PR number from branch pattern pull-request/(\d+) + match = re.search(r"gh-readonly-queue/([^/]+)/pr-(\d+)-", head_branch) + if not match: + logger.info(f"Skipping Run #{run.id} on {head_branch}: not a PR branch") + continue + + branch_name = match.group(1) + pr_number = int(match.group(2)) + logger.info(f"Processing PR #{pr_number} from run {run.id}") + + if branch_name != target_branch: + logger.info(f"Skipping run {run.id}: targets {branch_name}, not {target_branch}") + continue + + logger.info(f"Processing PR #{pr_number} from run {run.id} (branch: {branch_name})") + + # Approve pending deployment + try: + # PyGithub doesn't have direct support for pending deployments API + # Use the underlying requester + repo._requester.requestJsonAndCheck( + "POST", + f"{repo.url}/actions/runs/{run.id}/pending_deployments", + input={"environment_ids": [env_id], "state": status, "comment": comment}, + ) + logger.info(f"✓ Successfully updated deployment for run {run.id} (PR #{pr_number})") + except GithubException as e: + logger.info(f"✗ Failed to update deployment for run {run.id}: {e}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/unit-tests.yaml index d84e507c6df..5d2243a94af 100644 --- a/tests/test_utils/recipes/unit-tests.yaml +++ b/tests/test_utils/recipes/unit-tests.yaml @@ -59,6 +59,27 @@ spec: cp coverage.xml {assets_dir} products: + - test_case: [tests/unit_tests/test_fp8_param.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/pipeline_parallel/**/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] + - test_case: [tests/unit_tests/models/**/*.py] + products: + - environment: [lts, dev] + tag: [latest, legacy] + scope: [unit-tests] + n_repeat: [1] + time_limit: [1800] - test_case: [tests/unit_tests/data/**/*.py] products: - environment: [lts, dev] From 4fc8520d913fc63de37320c2c142f4d8462bdcbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 25 Oct 2025 16:29:53 +0200 Subject: [PATCH 058/248] ci: Fix branch of approval bot (#1945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index 20495434f6b..f4f1c153ad3 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -772,7 +772,11 @@ publish:approve_merge_gate: export GH_TOKEN=$GH_TOKEN export REPO=NVIDIA/Megatron-LM - export TARGET_BRANCH="$CI_COMMIT_BRANCH" + if [[ "$CI_COMMIT_BRANCH" == *main* ]]; then + export TARGET_BRANCH="main" + elif [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then + export TARGET_BRANCH="dev" + fi if [[ $EXIT_CODE -eq 0 ]]; then export STATUS="approved" From 574a0095b44d0a2a3e87ad85d0477fd3618bed24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 08:47:16 +0000 Subject: [PATCH 059/248] ci(fix): Approval gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index f4f1c153ad3..d97f457621a 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -768,6 +768,12 @@ publish:approve_merge_gate: pip install --no-cache-dir python-gitlab click pygithub export GITLAB_ENDPOINT export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + if [[ "$CI_COMMIT_BRANCH" == *main* ]]; then + export TARGET_BRANCH="main" + elif [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then + export TARGET_BRANCH="dev" + fi + python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_COMMIT_BRANCH" --once || EXIT_CODE=$? export GH_TOKEN=$GH_TOKEN From 8243834d39bbd641db38581b0e335a127b808743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 08:56:37 +0000 Subject: [PATCH 060/248] ci: Approval gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index d97f457621a..cf561727a49 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -778,11 +778,6 @@ publish:approve_merge_gate: export GH_TOKEN=$GH_TOKEN export REPO=NVIDIA/Megatron-LM - if [[ "$CI_COMMIT_BRANCH" == *main* ]]; then - export TARGET_BRANCH="main" - elif [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then - export TARGET_BRANCH="dev" - fi if [[ $EXIT_CODE -eq 0 ]]; then export STATUS="approved" From 106516c91ad9229e66417ad5193c98970cd33275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 08:58:47 +0000 Subject: [PATCH 061/248] ci: Approval gate rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index cf561727a49..f2d229f1cc5 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -791,6 +791,6 @@ publish:approve_merge_gate: retry: max: 2 rules: - - if: $CI_PIPELINE_SOURCE == "schedule" || ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') when: always - when: never From ef48a1309f2b8889373823a5346e0fbad74ea94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 09:01:44 +0000 Subject: [PATCH 062/248] ci: Update golden values dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_lts_dgxa100_dracooci.json | 200 +++++++++--------- .../golden_values_lts_dgxa100_dracooci.json | 100 ++++----- 2 files changed, 150 insertions(+), 150 deletions(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json index ec432ff7884..56a53cbf6ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 21.09115, - "2": 0.41164, - "3": 0.38182, - "4": 0.38049, - "5": 0.60969, - "6": 0.36583, - "7": 0.36416, - "8": 0.37604, - "9": 0.3679, - "10": 0.36785, - "11": 0.36954, - "12": 0.36975, - "13": 0.36874, - "14": 0.36917, - "15": 0.37218, - "16": 0.37039, - "17": 0.36749, - "18": 0.36956, - "19": 0.37349, - "20": 0.37202, - "21": 0.36788, - "22": 0.37092, - "23": 0.36616, - "24": 0.36575, - "25": 0.36576, - "26": 0.36657, - "27": 0.36754, - "28": 0.36677, - "29": 0.36466, - "30": 0.36792, - "31": 0.36536, - "32": 0.36562, - "33": 0.36872, - "34": 0.36339, - "35": 0.36568, - "36": 0.36568, - "37": 0.36366, - "38": 0.36485, - "39": 0.36421, - "40": 0.35995, - "41": 0.36131, - "42": 0.36351, - "43": 0.36398, - "44": 0.3645, - "45": 0.359, - "46": 0.3614, - "47": 0.35954, - "48": 0.36106, - "49": 0.36508, - "50": 0.36162, - "51": 0.36692, - "52": 0.36519, - "53": 0.3602, - "54": 0.36089, - "55": 0.36195, - "56": 0.35943, - "57": 0.36048, - "58": 0.36032, - "59": 0.36446, - "60": 0.36455, - "61": 0.36016, - "62": 0.36345, - "63": 0.3602, - "64": 0.36067, - "65": 0.36076, - "66": 0.36538, - "67": 0.57124, - "68": 0.36375, - "69": 0.36298, - "70": 0.3623, - "71": 0.36583, - "72": 0.36199, - "73": 0.36503, - "74": 0.3612, - "75": 0.36467, - "76": 0.36386, - "77": 0.36345, - "78": 0.36764, - "79": 0.36585, - "80": 0.36636, - "81": 0.36354, - "82": 0.36426, - "83": 0.36781, - "84": 0.58958, - "85": 0.36576, - "86": 0.36705, - "87": 0.36285, - "88": 0.3685, - "89": 0.36603, - "90": 0.36553, - "91": 0.36328, - "92": 0.36279, - "93": 0.36243, - "94": 0.3647, - "95": 0.3673, - "96": 0.36551, - "97": 0.36297, - "98": 0.36326, - "99": 0.3621, - "100": 0.36226 + "1": 20.13148, + "2": 0.19658, + "3": 0.16932, + "4": 0.16925, + "5": 0.16695, + "6": 0.16969, + "7": 0.4281, + "8": 0.16351, + "9": 0.16208, + "10": 0.37746, + "11": 0.16397, + "12": 0.16616, + "13": 0.16752, + "14": 0.16658, + "15": 0.16626, + "16": 0.16687, + "17": 0.16684, + "18": 0.16721, + "19": 0.16647, + "20": 0.16786, + "21": 0.16027, + "22": 0.16375, + "23": 0.15995, + "24": 0.16197, + "25": 0.16052, + "26": 0.16097, + "27": 0.16002, + "28": 0.16159, + "29": 0.15911, + "30": 0.16097, + "31": 0.15974, + "32": 0.162, + "33": 0.15978, + "34": 0.16068, + "35": 0.16093, + "36": 0.16084, + "37": 0.16071, + "38": 0.16241, + "39": 0.15964, + "40": 0.16151, + "41": 0.16012, + "42": 0.16055, + "43": 0.15998, + "44": 0.16159, + "45": 0.16019, + "46": 0.16043, + "47": 0.16108, + "48": 0.16025, + "49": 0.15985, + "50": 0.16067, + "51": 0.17029, + "52": 0.16714, + "53": 0.16748, + "54": 0.16511, + "55": 0.1671, + "56": 0.1665, + "57": 0.16873, + "58": 0.16673, + "59": 0.16609, + "60": 0.16583, + "61": 0.1659, + "62": 0.16564, + "63": 0.16874, + "64": 0.16698, + "65": 0.1663, + "66": 0.16574, + "67": 0.16591, + "68": 0.16649, + "69": 0.16691, + "70": 0.16656, + "71": 0.16678, + "72": 0.16455, + "73": 0.16685, + "74": 0.16559, + "75": 0.1703, + "76": 0.1649, + "77": 0.16778, + "78": 0.16543, + "79": 0.16601, + "80": 0.1648, + "81": 0.17029, + "82": 0.16906, + "83": 0.17088, + "84": 0.16504, + "85": 0.16803, + "86": 0.16513, + "87": 0.16682, + "88": 0.16712, + "89": 0.16999, + "90": 0.1666, + "91": 0.16704, + "92": 0.16919, + "93": 0.17188, + "94": 0.17115, + "95": 0.16965, + "96": 0.1671, + "97": 0.16712, + "98": 0.17096, + "99": 0.16958, + "100": 0.16893 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json index 516c7e99194..9e89b4bc3ee 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19.94048, - "2": 0.39367, - "3": 0.37589, - "4": 0.37388, - "5": 0.66307, - "6": 0.36351, - "7": 0.3595, - "8": 0.36116, - "9": 0.36043, - "10": 0.35758, - "11": 0.36057, - "12": 0.35963, - "13": 0.36072, - "14": 0.35903, - "15": 0.35994, - "16": 0.35763, - "17": 0.36245, - "18": 0.35747, - "19": 0.35878, - "20": 0.35982, - "21": 0.35849, - "22": 0.35936, - "23": 0.35823, - "24": 0.35778, - "25": 0.3606, - "26": 0.35907, - "27": 0.35852, - "28": 0.35911, - "29": 0.35837, - "30": 0.35815, - "31": 0.35909, - "32": 0.35701, - "33": 0.3602, - "34": 0.35976, - "35": 0.36009, - "36": 0.35943, - "37": 0.35776, - "38": 0.35664, - "39": 0.36098, - "40": 0.35836, - "41": 0.35857, - "42": 0.35915, - "43": 0.3572, - "44": 0.35779, - "45": 0.36243, - "46": 0.35772, - "47": 0.35984, - "48": 0.35743, - "49": 0.35726, - "50": 0.35872 + "1": 19.01834, + "2": 0.19131, + "3": 0.16463, + "4": 0.17624, + "5": 0.16919, + "6": 0.16375, + "7": 0.16414, + "8": 0.16407, + "9": 0.16499, + "10": 0.16212, + "11": 0.16324, + "12": 0.16316, + "13": 0.16134, + "14": 0.16068, + "15": 0.16212, + "16": 0.16071, + "17": 0.1623, + "18": 0.16066, + "19": 0.16307, + "20": 0.16502, + "21": 0.16536, + "22": 0.16572, + "23": 0.16545, + "24": 0.16393, + "25": 0.16484, + "26": 0.16386, + "27": 0.16204, + "28": 0.16264, + "29": 0.16076, + "30": 0.16134, + "31": 0.15999, + "32": 0.1604, + "33": 0.16121, + "34": 0.16175, + "35": 0.16122, + "36": 0.16498, + "37": 0.16166, + "38": 0.1626, + "39": 0.16244, + "40": 0.16183, + "41": 0.16437, + "42": 0.16175, + "43": 0.1628, + "44": 0.16269, + "45": 0.16111, + "46": 0.16288, + "47": 0.16257, + "48": 0.16123, + "49": 0.15922, + "50": 0.16035 } } } \ No newline at end of file From 3281c010a2f7829d8274d13abade26632edda13c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 10:51:02 +0000 Subject: [PATCH 063/248] ci: Approval gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index f2d229f1cc5..68388419a6e 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -774,7 +774,7 @@ publish:approve_merge_gate: export TARGET_BRANCH="dev" fi - python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_COMMIT_BRANCH" --once || EXIT_CODE=$? + python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$TARGET_BRANCH" --once || EXIT_CODE=$? export GH_TOKEN=$GH_TOKEN export REPO=NVIDIA/Megatron-LM From 8fe0c3a563a1b1d76f92914bf7242c5f5529e90b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 12:19:29 +0000 Subject: [PATCH 064/248] ci: Approval bot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/05.publish.yml | 10 ++++-- .../python_scripts/check_status_of_main.py | 32 ++++++++++++------- tools/autoformat.sh | 2 +- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index 68388419a6e..024ec2aa490 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -782,12 +782,18 @@ publish:approve_merge_gate: if [[ $EXIT_CODE -eq 0 ]]; then export STATUS="approved" export COMMENT="Main is healthy. Submitting PR." - else + elif [[ $EXIT_CODE -eq 1 ]]; then export STATUS="rejected" export COMMENT="Main is not healthy. An automation engineer is investigating. No need to take any action." + elif [[ $EXIT_CODE -eq 2 ]]; then + echo "Main is running. We won't cancel the deployment." + exit 0 + fi + + if [[ $EXIT_CODE -lt 2 ]]; then + python tests/test_utils/python_scripts/approve_merge_gate.py fi - python tests/test_utils/python_scripts/approve_merge_gate.py retry: max: 2 rules: diff --git a/tests/test_utils/python_scripts/check_status_of_main.py b/tests/test_utils/python_scripts/check_status_of_main.py index 16f80e6dcf6..a1cae393bfb 100644 --- a/tests/test_utils/python_scripts/check_status_of_main.py +++ b/tests/test_utils/python_scripts/check_status_of_main.py @@ -43,22 +43,26 @@ def most_recent_pipeline(target_branch: str): def is_pending(target_branch: str): pipeline = most_recent_pipeline(target_branch) - is_pending = ( - pipeline.attributes['status'] == 'pending' or pipeline.attributes['status'] == 'running' - ) - is_canceled = pipeline.attributes['status'] == 'canceled' + PENDING_STATUSES = [ + "created", + "waiting_for_resource", + "preparing", + "pending", + "running", + "canceled", + "skipped", + "manual", + "scheduled", + ] + + is_pending = pipeline.attributes['status'] in PENDING_STATUSES if not is_pending: logger.info( f"Main pipeline {pipeline.id} finished with status {pipeline.attributes['status']}" ) - return is_pending or is_canceled - - -def is_sucess(target_branch: str): - pipeline = most_recent_pipeline(target_branch) - return pipeline.attributes['status'] == 'success' + return is_pending @click.command() @@ -71,12 +75,18 @@ def main(target_branch: str, continuous: bool): break time.sleep(60) - if not is_sucess(target_branch=target_branch): + pipeline = most_recent_pipeline(target_branch) + + if pipeline.attributes['status'] == 'failed': logger.error( "Main is broken, we're therefore blocking your merge. Please wait until main is fixed again by checking the repo's front page. If the status is green again, you can re-attempt the merge. Feel free to ping the team if you have any questions." ) sys.exit(1) + if pipeline.attributes['status'] == 'running': + logger.info("Main is running, we won't cancel the deployment.") + sys.exit(2) + sys.exit(0) diff --git a/tools/autoformat.sh b/tools/autoformat.sh index 85d1d19c7cb..fffc7725eb4 100755 --- a/tools/autoformat.sh +++ b/tools/autoformat.sh @@ -15,7 +15,7 @@ CHECK_ONLY=${CHECK_ONLY:-false} SKIP_DOCS=${SKIP_DOCS:-false} BASE_REF=${BASE_REF:-main} -git remote set-url origin "https://${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" +git remote set-url origin "https://github.com/NVIDIA/Megatron-LM.git" git fetch origin ${BASE_REF} CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) ADDITIONAL_ARGS="" From a33936d0b169c72f27e2b66680c869ae83d48015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 14:51:06 +0000 Subject: [PATCH 065/248] ci: Increase time limit for main tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ddf5f094c2..6523c4a1973 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,7 +41,7 @@ workflow: FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" - FUNCTIONAL_TEST_TIME_LIMIT: 2700 + FUNCTIONAL_TEST_TIME_LIMIT: 3600 CLUSTER_A100: "" CLUSTER_H100: "" PUBLISH: "no" From 51768236aad5e2dccbdbae68ef2032bc8ae44604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 14:04:08 +0100 Subject: [PATCH 066/248] ci: Auto-assign milestone (#1952) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/auto-assign-milestone.yml | 74 +++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/auto-assign-milestone.yml diff --git a/.github/workflows/auto-assign-milestone.yml b/.github/workflows/auto-assign-milestone.yml new file mode 100644 index 00000000000..7eae6838332 --- /dev/null +++ b/.github/workflows/auto-assign-milestone.yml @@ -0,0 +1,74 @@ +name: Auto-assign Milestone to PR + +on: + push: + branches: + - "pull-request/[0-9]+" + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + assign-milestone: + runs-on: ubuntu-latest + environment: nemo-ci + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Check if PR has milestone + id: check_milestone + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + MILESTONE=$(gh pr view ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ + --repo ${{ github.repository }} \ + --json milestone \ + --jq '.milestone.title') + + if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then + echo "has_milestone=false" >> $GITHUB_OUTPUT + else + echo "has_milestone=true" >> $GITHUB_OUTPUT + echo "PR already has milestone: $MILESTONE" + fi + + - name: Get most recent open milestone + if: steps.check_milestone.outputs.has_milestone == 'false' + id: get_milestone + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + # Get the most recent open milestone (sorted by due date, then by creation date) + MILESTONE_NUMBER=$(gh api \ + "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ + --jq '.[0].number') + + MILESTONE_TITLE=$(gh api \ + "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ + --jq '.[0].title') + + if [ -z "$MILESTONE_NUMBER" ] || [ "$MILESTONE_NUMBER" = "null" ]; then + echo "No open milestones found" + echo "milestone_found=false" >> $GITHUB_OUTPUT + else + echo "milestone_found=true" >> $GITHUB_OUTPUT + echo "milestone_number=$MILESTONE_NUMBER" >> $GITHUB_OUTPUT + echo "milestone_title=$MILESTONE_TITLE" >> $GITHUB_OUTPUT + echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)" + fi + + - name: Assign milestone to PR + if: steps.check_milestone.outputs.has_milestone == 'false' && steps.get_milestone.outputs.milestone_found == 'true' + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + gh pr edit ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ + --repo ${{ github.repository }} \ + --milestone "${{ steps.get_milestone.outputs.milestone_title }}" + + echo "✅ Assigned milestone '${{ steps.get_milestone.outputs.milestone_title }}' to PR #${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}" From 4b6ba6019a677f3f806c4f2eb4de3ea46fc83de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 26 Oct 2025 19:01:05 +0100 Subject: [PATCH 067/248] ci: Run on push to release branch (#1960) (#1962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6523c4a1973..53574fdea22 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -33,7 +33,7 @@ workflow: - if: $CI_PIPELINE_SOURCE == "web" # For push to main - - if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") + - if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/) variables: UNIT_TEST: "no" INTEGRATION_TEST: "no" From 221747d02b827ff71858e69c687665198b45debc Mon Sep 17 00:00:00 2001 From: Deyu Fu Date: Mon, 27 Oct 2025 12:20:00 +0800 Subject: [PATCH 068/248] [DEV] support split qkv in muon (#1915) --- megatron/core/optimizer/muon.py | 118 +++++++++++--------- megatron/core/optimizer/optimizer_config.py | 7 +- megatron/core/tensor_parallel/layers.py | 3 +- megatron/training/arguments.py | 10 +- pyproject.toml | 2 +- tests/unit_tests/test_muon_optimizer.py | 3 +- tests/unit_tests/test_optimizer.py | 19 +--- uv.lock | 4 +- 8 files changed, 91 insertions(+), 75 deletions(-) diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py index d2dc7533bf9..700ad17e630 100644 --- a/megatron/core/optimizer/muon.py +++ b/megatron/core/optimizer/muon.py @@ -1,10 +1,9 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. """Megatron muon optimizer wrapper to handle tensor-parallel.""" import logging -from functools import partial -from typing import Callable, List, Literal, Optional +from typing import Any, Callable, List, Literal, Optional import torch from torch.optim.optimizer import ParamsT @@ -65,35 +64,36 @@ def __init__( if num_ns_steps < 1: raise ValueError(f"num_ns_steps must be at least 1, got {num_ns_steps}") - orthogonalize_fn = partial( - newton_schulz_tp, - steps=num_ns_steps, - coefficient_type=coefficient_type, - mode="duplicated" if mode == "blockwise" else mode, - ) - scale_factor_fn = partial( - get_muon_scale_factor, mode=scale_mode, extra_scale_factor=extra_scale_factor - ) - - def orthogonalize_fn_tp( - x: torch.Tensor, + def scaled_orthogonalize_fn( + grad: torch.Tensor, tp_group: torch.distributed.ProcessGroup, partition_dim: int | None = None, ) -> torch.Tensor: - return orthogonalize_fn(x, tp_group=tp_group, partition_dim=partition_dim) - - def scale_factor_fn_tp( - size_out: int, size_in: int, partition_dim: int | None = None - ) -> float: - if partition_dim is None: - return scale_factor_fn(size_out, size_in) - - size = [size_out, size_in] - size[partition_dim] *= get_pg_size(pg_collection.tp) if pg_collection else 1 - return scale_factor_fn(*size) + log_single_rank( + logger, + logging.DEBUG, + f'Orthogonalizing grad with {num_ns_steps} steps, {coefficient_type} coefficient, ' + f'{scale_mode} scale mode, extra_scale_factor={extra_scale_factor}', + ) + size = [grad.size(-2), grad.size(-1)] + if partition_dim: + size[partition_dim] *= get_pg_size(tp_group) + orth_grad = newton_schulz_tp( + grad, + steps=num_ns_steps, + coefficient_type=coefficient_type, + tp_group=tp_group, + partition_dim=partition_dim, + mode="duplicated" if mode == "blockwise" else mode, + ) + scale_factor = get_muon_scale_factor(size[0], size[1], mode=scale_mode) + return orth_grad * scale_factor * extra_scale_factor self.pg_collection = pg_collection self.mode = mode + self.split_qkv = split_qkv + self.is_qkv_fn = is_qkv_fn + self.qkv_split_shapes = qkv_split_shapes super().__init__( params, @@ -102,15 +102,11 @@ def scale_factor_fn_tp( use_nesterov, weight_decay, use_decoupled_weight_decay, - split_qkv, - is_qkv_fn, - qkv_split_shapes, fp32_matmul_prec, - orthogonalize_fn_tp, - scale_factor_fn_tp, + scaled_orthogonalize_fn, ) - def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor: + def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> torch.Tensor: """Orthogonalize the momentum. Args: @@ -122,6 +118,7 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor: Returns: The orthogonalized gradient tensor. """ + # TODO(deyuf): switch to group if self.pg_collection: tp_group = ( self.pg_collection.expt_tp @@ -135,27 +132,33 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor: # llm-shower use different default value for partition_dim than TE. # Because -1 is a valid index for ndarray, we decided to not overload it. partition_dim = None + if self.split_qkv and self.is_qkv_fn(p): # type: ignore[misc] # split grouped attention parameters (e.g., QKV, GQA, etc.) - qkv_grads = torch.split(grad, self.qkv_split_shapes, dim=0) + grad_shape = grad.shape + log_single_rank( + logger, + logging.DEBUG, + f'qkv split grad shape {grad_shape}, split shapes {self.qkv_split_shapes}', + ) + num_query_groups = grad_shape[0] // sum(self.qkv_split_shapes) + qkv_grads = torch.split( + grad.view(num_query_groups, sum(self.qkv_split_shapes), -1), + self.qkv_split_shapes, + dim=1, + ) + qkv_grads = [g.reshape(-1, grad_shape[-1]) for g in qkv_grads] - # Apply Newton-Schulz to each component - qkv_whitened = [ - self.orthogonalize_fn(g, tp_group=tp_group, partition_dim=partition_dim) + # Apply Newton-Schulz and scales to each component, concat back + qkv_grads = [ + self.scaled_orthogonalize_fn(g, tp_group, partition_dim).view( + num_query_groups, -1, grad_shape[-1] + ) for g in qkv_grads ] - qkv_scales = [ - self.scale_factor_fn(g.size(0), g.size(1), partition_dim) for g in qkv_grads - ] - - # Apply individual scales to each component and concatenate - grad = torch.cat( - [whitened * scale for whitened, scale in zip(qkv_whitened, qkv_scales)] - ) + grad = torch.cat(qkv_grads, dim=1).view(grad_shape) else: - grad = self.orthogonalize_fn( - grad, tp_group=tp_group, partition_dim=partition_dim - ) * self.scale_factor_fn(grad.size(0), grad.size(1), partition_dim) + grad = self.scaled_orthogonalize_fn(grad, tp_group, partition_dim) return grad @@ -206,7 +209,18 @@ def get_megatron_muon_optimizer( # record list of non/linear params linear_params = [] nonlinear_params = [] + for model_chunk in model_chunks: + # use config to determine qkv split shapes. + # no need to check tp since tp splits by head and this is per head(group) dimension + num_attention_heads = model_chunk.config.num_attention_heads + num_query_groups = model_chunk.config.num_query_groups + kv_channels = model_chunk.config.kv_channels + qkv_split_shapes = [ + num_attention_heads // num_query_groups * kv_channels, + kv_channels, + kv_channels, + ] for name, param in model_chunk.named_parameters(): if not param.requires_grad: continue @@ -215,6 +229,10 @@ def get_megatron_muon_optimizer( # change in optimizer if 'experts' in name and 'shared' not in name: param.expert_tp = True + # add flag for qkv parameter + # TODO(deyuf): support MLA + if 'linear_qkv.weight' in name and len(param.shape) == 2: + param.is_qkv = True # TODO(deyuf): might not be sufficient for future algorithm. revisit this conditioning if not getattr(param, 'is_embedding_or_output_parameter', False) and not ( len(param.shape) == 1 @@ -238,7 +256,6 @@ def get_megatron_muon_optimizer( decoupled_min_lr=config.decoupled_min_lr, ) - # TODO(deyuf): support qkv split optimizer = TensorParallelMuon( linear_param_groups, lr=config.lr, @@ -248,8 +265,9 @@ def get_megatron_muon_optimizer( fp32_matmul_prec=config.muon_fp32_matmul_prec, num_ns_steps=config.muon_num_ns_steps, scale_mode=config.muon_scale_mode, - split_qkv=False, - qkv_split_shapes=None, + split_qkv=config.muon_split_qkv, + is_qkv_fn=lambda p: getattr(p, 'is_qkv', False), + qkv_split_shapes=qkv_split_shapes, extra_scale_factor=config.muon_extra_scale_factor, pg_collection=pg_collection, mode=config.muon_tp_mode, diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index ced3845804f..8692d1e9b52 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from dataclasses import dataclass from typing import Callable, Optional @@ -128,7 +128,10 @@ class OptimizerConfig: muon_momentum: float = 0.95 """The momentum used by the internal SGD.""" - muon_use_nesterov: bool = True + muon_split_qkv: bool = True + """Whether to split QKV parameters for Muon optimizer.""" + + muon_use_nesterov: bool = False """Whether to use Nesterov-style momentum in the internal SGD.""" muon_scale_mode: str = "spectral" diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 5ca290ff680..e79d55b9fa3 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch @@ -57,6 +57,7 @@ _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { "expert_tp": False, + "is_qkv": False, "tensor_model_parallel": False, "partition_dim": -1, "partition_stride": 1, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index d1e062edd02..bdf915a8ae1 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. """Megatron arguments.""" @@ -1940,10 +1940,12 @@ def _add_regularization_args(parser): 'numerical stability') group.add_argument('--sgd-momentum', type=float, default=0.9, help='Momentum factor for sgd') - group.add_argument('--muon-momentum', type=float, default=0.95, + group.add_argument('--muon-momentum', type=float, default=0.9, help='Momentum factor for Muon optimizer') - group.add_argument('--muon-no-use-nesterov', action='store_false', default=True, - dest='muon_use_nesterov', + group.add_argument('--muon-no-split-qkv', action='store_false', default=True, + dest='muon_split_qkv', + help='Whether to split QKV parameters for Muon optimizer') + group.add_argument('--muon-use-nesterov', action='store_true', help='Whether to use Nesterov-style momentum in the internal SGD') group.add_argument('--muon-scale-mode', type=str, default='spectral', choices=['spectral', 'unit_rms_norm', 'shape_scaling'], diff --git a/pyproject.toml b/pyproject.toml index d02b873d1d9..db91ce393e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,7 +169,7 @@ flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.8" } # on `release_v2.8` -emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "fb1add873e7851ec34b48581ea1b15761b73d189" } +emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "cf9909b777ffac18e05b67a6708282cadc000942" } nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" } [tool.isort] profile = "black" # black-compatible diff --git a/tests/unit_tests/test_muon_optimizer.py b/tests/unit_tests/test_muon_optimizer.py index 71d77dc6ecc..cc99f7a16e6 100644 --- a/tests/unit_tests/test_muon_optimizer.py +++ b/tests/unit_tests/test_muon_optimizer.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import os import pytest @@ -543,7 +545,6 @@ def test_muon_optimizer_multiple_steps(): ), f"Weight should change at step {i}" -@pytest.mark.skip(reason="split qkv is not implemented yet") def test_muon_optimizer_qkv_split(): """Test TensorParallelMuon optimizer with QKV splitting.""" # Create a model with QKV-like parameter diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py index d8f6e3a2eeb..3d6b4b3c15e 100644 --- a/tests/unit_tests/test_optimizer.py +++ b/tests/unit_tests/test_optimizer.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import os import pytest @@ -244,24 +246,13 @@ def run_model(model, input, optim, fp8_recipe, fp8_recipe_settings): test_model, input, test_optim, fp8_recipe, fp8_recipe_settings ) - rtol = 1e-3 # relative tolerance - atol = 1e-5 # absolute tolerance + rtol, atol = 1.6e-2, 1e-5 # Compare grad norms - allow small difference due to precision - rel_diff = abs(test_grad_norm - baseline_grad_norm) / ( - abs(baseline_grad_norm) + 1e-7 # avoid div by 0 - ) - abs_diff = abs(test_grad_norm - baseline_grad_norm) - assert ( - rel_diff <= rtol or abs_diff <= atol - ), f"Grad norm mismatch: baseline={baseline_grad_norm}, test={test_grad_norm}, rel_diff={rel_diff}, abs_diff={abs_diff}" + torch.testing.assert_close(test_grad_norm, baseline_grad_norm, atol=atol, rtol=rtol) # Compare losses - allow small difference due to precision - loss_rel_diff = abs(test_loss - baseline_loss) / (abs(baseline_loss) + 1e-7) - loss_abs_diff = abs(test_loss - baseline_loss) - assert ( - loss_rel_diff <= rtol or loss_abs_diff <= atol - ), f"Loss mismatch: baseline={baseline_loss}, test={test_loss}, rel_diff={loss_rel_diff}, abs_diff={loss_abs_diff}" + torch.testing.assert_close(test_loss, baseline_loss, atol=atol, rtol=rtol) # Save and reload state dict for the test model state_dict = test_optim.state_dict() diff --git a/uv.lock b/uv.lock index f7c8916166b..c20d3f55dfe 100644 --- a/uv.lock +++ b/uv.lock @@ -1279,7 +1279,7 @@ wheels = [ [[package]] name = "emerging-optimizers" version = "0.1.0" -source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189#fb1add873e7851ec34b48581ea1b15761b73d189" } +source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942#cf9909b777ffac18e05b67a6708282cadc000942" } dependencies = [ { name = "absl-py" }, { name = "torch", marker = "sys_platform == 'never'" }, @@ -2401,7 +2401,7 @@ requires-dist = [ { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" }, { name = "einops", marker = "extra == 'lts'" }, - { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=fb1add873e7851ec34b48581ea1b15761b73d189" }, + { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942" }, { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" }, { name = "flashinfer-python", marker = "extra == 'dev'" }, { name = "flask-restful", marker = "extra == 'mlm'" }, From a0a1866ff56fa079aa6fe9cbb2775bbab58170b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 09:27:23 +0000 Subject: [PATCH 069/248] ci: Add golden values for gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_dev_dgxh100_coreweave.json | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..ddc6cacf3a8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,187 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.3733036518096924, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": [ + 14.167753773233736, + 78.68224606460956, + 79.61636072923858, + 79.54665108975186, + 79.62008872611396, + 79.57034369848175, + 79.0717192987748, + 79.63717144611178 + ] +} \ No newline at end of file From c9fb78b85e291e4869df2809e6ee99d257af4fa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 09:29:47 +0000 Subject: [PATCH 070/248] ci: Add more golden values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_dev_dgxh100_coreweave.json | 187 ++ .../golden_values_dev_dgxh100_coreweave.json | 2703 +++++++++++++++++ .../golden_values_dev_dgxh100_coreweave.json | 170 ++ 3 files changed, 3060 insertions(+) create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..8076bdc9a25 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,187 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.2859375476837158, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": [ + 4.17304871546938, + 103.09983375107234, + 103.84588149949121, + 103.54772132523577, + 103.90874002236247, + 103.06242433872661, + 103.53792289114989, + 103.82591647661074 + ] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7184e0e35c1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,2703 @@ +{ + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 9.77891230583191, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + -3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, + -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + -1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + -6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + -5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 48.63822364807129, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + -0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 87.50420427322388, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + -6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + -2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, + -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + -1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + -0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 126.4328100681305, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + -0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 155.6906189918518, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + -2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": [ + 93.24123994187065, + 104.94118337233992, + 105.03843789693171 + ] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..066995bd666 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,170 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 2.1998238563537598, + "cuda_graph_request_count_map": null, + "step_count": 240, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + }, + "throughput": [ + 0.7170174223459943, + 12.998776662244524, + 13.163004282426089, + 13.581765270525981, + 13.619124445335821, + 13.655332144429561, + 13.608264815678803, + 13.614656540485411 + ] +} \ No newline at end of file From 6f5128440a5cd80c073a1b6804f908cf53c2523e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 10:23:00 +0000 Subject: [PATCH 071/248] ci: Aggregate throughput MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_dev_dgxh100_coreweave.json | 361 +- .../golden_values_dev_dgxh100_coreweave.json | 361 +- .../golden_values_dev_dgxh100_coreweave.json | 5398 ++++++++--------- .../golden_values_dev_dgxh100_coreweave.json | 327 +- 4 files changed, 3208 insertions(+), 3239 deletions(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json index 8076bdc9a25..0e953af50e7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -1,187 +1,178 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.2859375476837158, - "cuda_graph_request_count_map": { - "372": 0, - "360": 0, - "336": 0, - "312": 0, - "288": 0, - "264": 0, - "240": 0, - "216": 0, - "192": 0, - "168": 0, - "144": 0, - "120": 0, - "96": 0, - "72": 0, - "48": 0, - "24": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.331681251525879, - -1.5606917142868042, - -2.454296588897705, - -1.5334703922271729, - -1.2631131410598755, - -2.657367706298828, - -0.6480202078819275, - -0.4550393521785736, - -1.3625166416168213, - -0.8142069578170776, - -0.4496593475341797, - -0.9312890768051147, - -1.732723355293274, - -0.44613128900527954, - -1.6895122528076172, - -0.6082233190536499, - -1.0978344678878784, - -1.1122435331344604, - -0.002520838286727667, - -1.4072327613830566, - -0.007462364621460438, - -0.7548662424087524, - -0.9937503337860107, - -0.0675487294793129, - -0.9595617055892944, - -0.029961343854665756, - -2.205785036087036, - -1.2615025043487549, - -0.7878209352493286 - ] - }, - "throughput": [ - 4.17304871546938, - 103.09983375107234, - 103.84588149949121, - 103.54772132523577, - 103.90874002236247, - 103.06242433872661, - 103.53792289114989, - 103.82591647661074 - ] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.2859375476837158, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": 103.54772132523577 } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json index ddc6cacf3a8..771d0c18307 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgxh100_coreweave.json @@ -1,187 +1,178 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.3733036518096924, - "cuda_graph_request_count_map": { - "372": 0, - "360": 0, - "336": 0, - "312": 0, - "288": 0, - "264": 0, - "240": 0, - "216": 0, - "192": 0, - "168": 0, - "144": 0, - "120": 0, - "96": 0, - "72": 0, - "48": 0, - "24": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.331681251525879, - -1.5606917142868042, - -2.454296588897705, - -1.5334703922271729, - -1.2631131410598755, - -2.657367706298828, - -0.6480202078819275, - -0.4550393521785736, - -1.3625166416168213, - -0.8142069578170776, - -0.4496593475341797, - -0.9312890768051147, - -1.732723355293274, - -0.44613128900527954, - -1.6895122528076172, - -0.6082233190536499, - -1.0978344678878784, - -1.1122435331344604, - -0.002520838286727667, - -1.4072327613830566, - -0.007462364621460438, - -0.7548662424087524, - -0.9937503337860107, - -0.0675487294793129, - -0.9595617055892944, - -0.029961343854665756, - -2.205785036087036, - -1.2615025043487549, - -0.7878209352493286 - ] - }, - "throughput": [ - 14.167753773233736, - 78.68224606460956, - 79.61636072923858, - 79.54665108975186, - 79.62008872611396, - 79.57034369848175, - 79.0717192987748, - 79.63717144611178 - ] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.3733036518096924, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": 79.54665108975186 } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json index 7184e0e35c1..a76d4f44413 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -1,2703 +1,2699 @@ { - "0": { - "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", - "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", - "generated_tokens": [ - 1659, - 1395, - 1261, - 1036, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1395, - 1261, - 1659, - 1036, - 1049, - 1044, - 1636, - 1010, - 1036, - 1659, - 1036, - 1659, - 1010, - 1036, - 1659, - 1045, - 1659, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1659, - 1036, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1010, - 1036, - 1063, - 1063, - 1063, - 1063, - 1063, - 1063, - 1063, - 1044, - 1659, - 1010, - 1045, - 1049, - 1010, - 1036, - 1010, - 1049, - 1046, - 1053, - 1046, - 1010, - 1036, - 1010, - 1036, - 1044, - 1636, - 1010, - 1036, - 1046, - 1010, - 1036, - 1010, - 1049, - 1044, - 1049, - 1046, - 1049, - 1010, - 1073, - 1010, - 1036, - 1046, - 1010, - 1073, - 1010, - 1010, - 1010, - 7801, - 1010, - 1036, - 1044, - 1044, - 1044, - 1048, - 1044, - 1049, - 1044, - 1048, - 1044, - 1048, - 1046, - 1048, - 1010, - 1785, - 1010, - 1784, - 1010, - 1784, - 1010, - 1784, - 1010 - ], - "latency": 9.77891230583191, - "cuda_graph_request_count_map": null, - "step_count": 6144, - "logprobs": [ - -7.7319135665893555, - -2.188307285308838, - -0.7547445297241211, - -0.7294313311576843, - -10.238386154174805, - -3.3775341510772705, - -6.394498825073242, - -7.354557037353516, - -9.018157958984375, - -3.012073040008545, - -3.2584073543548584, - -5.220732688903809, - -4.620487213134766, - -2.5078930854797363, - -3.752683162689209, - -0.13360372185707092, - -0.05705544352531433, - -0.41462242603302, - -1.585279941558838, - -1.6438164710998535, - -1.9557222127914429, - -0.3989897072315216, - -0.0365302674472332, - -6.368816375732422, - -0.8731719255447388, - -0.022585075348615646, - -0.2775891423225403, - -0.0027362785767763853, - -0.0006812873762100935, - -1.581446647644043, - -0.008688976056873798, - -0.3532317280769348, - -6.071163177490234, - -9.162371635437012, - -9.965556144714355, - -2.400461196899414, - -2.9898362159729004, - -2.9803032875061035, - -2.12601900100708, - -3.500912666320801, - -7.015069007873535, - -2.278961420059204, - -0.46380555629730225, - -4.078739166259766, - -1.9430254697799683, - -3.5642244815826416, - -3.689701795578003, - -6.201474189758301, - -6.580833911895752, - -2.3081111907958984, - -5.42717170715332, - -1.1886008977890015, - -1.172760248184204, - -1.3571951389312744, - -1.3551844358444214, - -3.376784324645996, - -0.05118789151310921, - -4.064360618591309, - -2.575554847717285, - -0.6994737386703491, - -2.56724214553833, - -2.1888976097106934, - -0.4816131591796875, - -4.070178985595703, - -2.0060782432556152, - -6.858033180236816, - -0.059200502932071686, - -3.214278221130371, - -0.9671833515167236, - -0.823198676109314, - -1.0130078792572021, - -4.595561981201172, - -0.012724989093840122, - -5.214311599731445, - -8.246870040893555, - -3.1476030349731445, - -3.299684524536133, - -4.218191146850586, - -7.318399429321289, - -0.8580498695373535, - -3.0894036293029785, - -1.886361002922058, - -7.217658996582031, - -3.271679639816284, - -3.9717154502868652, - -1.8835484981536865, - -10.034332275390625, - -11.382490158081055, - -5.417011260986328, - -7.505967140197754, - -2.33837890625, - -0.07904055714607239, - -3.294971227645874, - -7.813640594482422, - -1.7646901607513428, - -4.025320053100586, - -3.5977325439453125, - -4.390352249145508, - -9.147806167602539, - -0.5303041934967041, - -7.721246242523193, - -0.6311959028244019, - -0.8119025230407715, - -0.7227814197540283, - -1.8369406461715698, - -0.20933297276496887, - -1.5395950078964233, - -4.424448490142822, - -4.084965705871582, - -3.355497360229492, - -1.0475609302520752, - -6.479413986206055, - -0.7810530662536621, - -2.132437229156494, - -6.648703098297119, - -2.9522438049316406, - -1.2485712766647339, - -4.040503025054932, - -2.3415768146514893, - -5.358206748962402, - -1.6258506774902344, - -3.956300973892212, - -0.732298731803894, - -7.441117286682129, - -1.5242161750793457, - -2.4555861949920654, - -4.295163154602051, - -9.687600135803223, - -0.8213484883308411, - -1.2446978092193604, - -0.01942702941596508, - -4.619411468505859, - -3.3297007083892822, - -2.2139487266540527, - -3.691431999206543, - -2.6574106216430664, - -6.075929641723633, - -0.6123450994491577, - -1.2942559719085693, - -0.6262839436531067, - -7.398006439208984, - -4.4869890213012695, - -4.202048301696777, - -4.982994079589844, - -0.637227475643158, - -3.061023235321045, - -10.117584228515625, - -3.8567495346069336, - -4.0480828285217285, - -2.472019672393799, - -4.246374607086182, - -1.3939155340194702, - -7.132441520690918, - -0.20108745992183685, - -4.986658573150635, - -4.387957572937012, - -0.01108358334749937, - -4.209756851196289, - -7.271108627319336, - -4.047314643859863, - -2.6497321128845215, - -1.4763175249099731, - -0.28365400433540344, - -3.5247769355773926, - -1.4226995706558228, - -4.327237129211426, - -2.0407187938690186, - -6.1437907218933105, - -1.5190880298614502, - -2.5511486530303955, - -7.504094123840332, - -2.152172565460205, - -6.708334922790527, - -6.913146495819092, - -3.6959621906280518, - -6.752341270446777, - -0.63083815574646, - -0.12433214485645294, - -5.0525641441345215, - -4.435934066772461, - -0.45601028203964233, - -6.3459577560424805, - -9.882917404174805, - -3.1422882080078125, - -2.550520658493042, - -3.2099051475524902, - -6.278127193450928, - -0.07764133810997009, - -3.155696153640747, - -1.933587670326233, - -9.61027717590332, - -6.211391925811768, - -4.664543151855469, - -6.783782005310059, - -5.676271438598633, - -8.605900764465332, - -0.0824289619922638, - -3.5463995933532715, - -13.374168395996094, - -1.2401021718978882, - -1.8734056949615479, - -3.4154422283172607, - -1.6733763217926025, - -17.633970260620117, - -9.345113754272461, - -0.6277351975440979, - -2.9617538452148438, - -2.5565333366394043, - -10.10580825805664, - -7.130337715148926, - -7.36820125579834, - -4.098911285400391, - -5.747079372406006, - -2.945054769515991, - -0.7887389063835144, - -1.6583149433135986, - -1.0165244340896606, - -6.581666946411133, - -5.926386833190918, - -5.845194339752197, - -0.9657630920410156, - -7.868755340576172, - -1.3244551420211792, - -0.2657390236854553, - -0.06403665244579315, - -2.983020782470703, - -5.943899631500244, - -7.877285957336426, - -3.593116283416748, - -3.819509506225586, - -7.226177215576172, - -2.5206997394561768, - -3.385587215423584, - -0.37499159574508667, - -1.4698283672332764, - -3.1460342407226562, - -0.0077166082337498665, - -4.350916862487793, - -3.2183218002319336, - -0.6242184638977051, - -1.4782464504241943, - -2.8054311275482178, - -3.0831401348114014, - -12.17662525177002, - -2.113419532775879, - -1.6448111534118652, - -2.1834323406219482, - -0.7630388140678406, - -10.1896390914917, - -6.234405517578125, - -11.46288776397705, - -1.003785490989685, - -4.211658477783203, - -1.5010679960250854, - -5.859302043914795, - -2.0465080738067627, - -3.7468819618225098, - -4.684195518493652, - -4.318704128265381, - -2.7234389781951904, - -9.00437068939209, - -3.043811321258545, - -3.1384406089782715, - -2.713779926300049, - -2.095993995666504, - -2.1484954357147217, - -10.274479866027832, - -0.682350754737854, - -0.25973302125930786, - -3.6964316368103027, - -13.434456825256348, - -2.3368239402770996, - -5.382724761962891, - -1.9073458909988403, - -5.905669212341309, - -0.032165709882974625, - -1.6530004739761353, - -2.728893280029297, - -1.640552043914795, - -1.1391171216964722, - -1.4353511333465576, - -4.003787994384766, - -0.3450564742088318, - -0.7168521285057068, - -0.34650325775146484, - -0.3616408705711365, - -7.062709331512451, - -1.2851682901382446, - -2.299129009246826, - -8.800156593322754, - -5.208735466003418, - -4.780910491943359, - -2.78342342376709, - -4.469717979431152, - -6.909726619720459, - -2.5114197731018066, - -0.659822404384613, - -0.6915416121482849, - -3.2363741397857666, - -0.5283617377281189, - -0.10473938286304474, - -6.215325832366943, - -7.283237934112549, - -1.6797031164169312, - -11.50100040435791, - -7.5822978019714355, - -3.387317657470703, - -11.407575607299805, - -5.441976547241211, - -3.3264851570129395, - -0.7265786528587341, - -1.382750153541565, - -7.841699600219727, - -8.105277061462402, - -3.9569506645202637, - -4.963083267211914, - -0.5492897629737854, - -4.6081390380859375, - -5.870400905609131, - -3.957930088043213, - -5.275494575500488, - -4.105091094970703, - -2.15435528755188, - -2.8472700119018555, - -1.1278448104858398, - -8.226571083068848, - -0.40629008412361145, - -9.916461944580078, - -4.616743087768555, - -1.691868543624878, - -0.6639478802680969, - -2.5716753005981445, - -6.676954746246338, - -6.535329818725586, - -0.4170510768890381, - -1.443942904472351, - -3.145481824874878, - -1.440589427947998, - -0.26935356855392456, - -0.9647155404090881, - -4.335958957672119, - -1.5647850036621094, - -5.890466690063477, - -3.01654052734375, - -1.9168468713760376, - -3.7365682125091553, - -8.001864433288574, - -10.680083274841309, - -4.489352226257324, - -4.6058149337768555, - -7.69011116027832, - -3.6247005462646484, - -1.5600426197052002, - -10.2160062789917, - -5.004643440246582, - -0.19602319598197937, - -3.375545024871826, - -2.669325590133667, - -1.3932737112045288, - -1.6410658359527588, - -6.847603797912598, - -6.744344711303711, - -0.5215591192245483, - -0.25840020179748535, - -1.1448237895965576, - -5.57253885269165, - -7.251138687133789, - -4.221924781799316, - -0.7688062787055969, - -2.504502534866333, - -3.146519660949707, - -2.206653356552124, - -1.4295082092285156, - -7.96943998336792, - -4.332189083099365, - -2.5750505924224854, - -1.7102608680725098, - -5.311381816864014, - -8.897522926330566, - -2.994919538497925, - -3.3397974967956543, - -2.1794328689575195, - -2.437566041946411, - -0.3181810975074768, - -0.27412793040275574, - -0.7914466857910156, - -2.3470635414123535, - -2.4099245071411133, - -2.491870880126953, - -3.024170160293579, - -1.9719040393829346, - -11.373910903930664, - -1.4279751777648926, - -0.14573107659816742, - -2.055763006210327, - -6.366893291473389, - -4.24091911315918, - -0.00709194503724575, - -2.0199716091156006, - -2.524750232696533, - -1.4272525310516357, - -0.5185190439224243, - -2.927150011062622, - -2.7070627212524414, - -3.365638017654419, - -4.318085193634033, - -7.773144721984863, - -1.7947180271148682, - -7.657534599304199, - -8.767786026000977, - -14.74280071258545, - -1.8042558431625366, - -3.2712037563323975, - -1.4002125263214111, - -4.887944221496582, - -1.4821010828018188, - -1.5255622863769531, - -5.879070281982422, - -4.463839530944824, - -5.1955976486206055, - -5.665647506713867, - -0.3775045573711395, - -5.9350481033325195, - -2.800539255142212, - -0.13162286579608917, - -3.034379720687866, - -4.729524612426758, - -4.6252641677856445, - -3.850942611694336, - -2.4760568141937256, - -6.059760093688965, - -10.12075138092041, - -0.9469369649887085, - -11.595907211303711, - -6.875324726104736, - -4.268826007843018, - -2.835529088973999, - -3.8626279830932617, - -4.876199245452881, - -0.013071090914309025, - -4.964417934417725, - -0.7445687055587769, - -5.707155227661133, - -6.10660457611084, - -4.317755699157715, - -4.440443992614746, - -2.9202542304992676, - -4.743522644042969, - -1.2569392919540405, - -2.8675737380981445, - -2.3151841163635254, - -4.318130970001221, - -1.9054772853851318, - -1.1808521747589111, - -0.765956461429596, - -2.768916606903076, - -6.237791061401367, - -1.7224305868148804, - -7.137521743774414, - -4.512486457824707, - -1.9069950580596924, - -4.145983695983887, - -5.365190505981445, - -0.059828490018844604, - -2.273892879486084, - -3.4013004302978516, - -5.035730361938477, - -6.501443386077881, - -9.903446197509766, - -1.6332892179489136, - -2.1572084426879883, - -1.6149548292160034, - -1.4698481559753418, - -6.01010799407959, - -2.2243528366088867, - -6.900836944580078, - -6.0930986404418945, - -2.974020481109619, - -3.225423574447632, - -8.423272132873535, - -1.3423724174499512, - -3.626147508621216, - -0.4862469434738159, - -6.860866546630859, - -3.8910953998565674, - -2.33319354057312, - -1.7229185104370117, - -2.215972423553467, - -8.99046516418457, - -4.099084854125977, - -2.4191012382507324, - -8.288970947265625, - -2.9641928672790527, - -1.5036451816558838, - -3.0544614791870117, - -0.0715634673833847, - -2.444031238555908, - -4.520998954772949, - -3.972568988800049, - -0.4985870122909546, - -2.1651363372802734, - -3.4427435398101807, - -1.730639100074768, - -0.9458961486816406, - -7.740211009979248, - -9.39163875579834, - -3.895984172821045, - -1.7523534297943115, - -5.41331672668457, - -8.910720825195312, - -12.971094131469727, - -3.0455880165100098, - -10.501265525817871, - -3.3864927291870117, - -4.842309951782227, - -3.9964733123779297, - -7.3046793937683105, - -2.6607093811035156, - -1.3541781902313232, - -5.003270626068115, - -3.944551944732666, - -0.11356143653392792, - -5.174440383911133, - -9.628616333007812, - -8.654989242553711, - -8.980416297912598, - -6.670101642608643, - -5.488286018371582, - -5.943419933319092, - -2.126483201980591, - -8.054739952087402, - -7.458671569824219, - -2.5267202854156494, - -6.455472946166992, - -8.655346870422363, - -7.903901100158691, - -6.221062660217285, - -7.129237174987793, - -4.2345380783081055, - -2.5375306606292725, - -7.697700500488281, - -1.567080020904541, - -2.084331750869751, - -0.25020831823349, - -1.5145041942596436, - -4.619244575500488, - -0.2970108985900879, - -0.4977554678916931, - -6.197869300842285, - -4.030620098114014, - -7.232107639312744, - -0.21076253056526184, - -1.563366174697876, - -1.133756160736084, - -2.708237648010254, - -4.080535888671875, - -0.6818401217460632, - -0.1864331066608429, - -0.49012088775634766, - -8.732468605041504, - -11.945040702819824, - -5.243098735809326, - -1.5294703245162964, - -0.8935543298721313, - -0.6174070835113525, - -1.5068217515945435, - -3.5766501426696777, - -5.393096923828125, - -4.202867031097412, - -14.765748023986816, - -5.2513813972473145, - -0.7597705721855164, - -0.2502063810825348, - -1.7403976917266846, - -2.8000779151916504, - -1.9808133840560913, - -2.1654744148254395, - -1.8629226684570312, - -3.222038745880127, - -0.040942225605249405, - -2.3384013175964355, - -10.210381507873535, - -4.5859761238098145, - -0.5805734395980835, - -3.7019288539886475, - -2.001936674118042, - -2.7876083850860596, - -2.9799084663391113, - -4.349887371063232, - -0.0792960673570633, - -1.4366114139556885, - -1.0813264846801758, - -1.3510822057724, - -6.7060699462890625, - -5.436615943908691, - -3.978389263153076, - -6.785447597503662, - -6.147171497344971, - -3.97414231300354, - -4.332991600036621, - -0.9269428253173828, - -5.1237101554870605, - -4.486598968505859, - -0.04678357392549515, - -1.0307552814483643, - -1.4249452352523804, - -4.517682075500488, - -3.561821699142456, - -2.0815205574035645, - -0.6041194200515747, - -5.992964744567871, - -7.092092514038086, - -0.48916709423065186, - -2.6405677795410156, - -4.3345723152160645, - -3.533582925796509, - -3.1233346462249756, - -3.107872486114502, - -1.9901115894317627, - -3.1052846908569336, - -1.8440347909927368, - -6.21368408203125, - -1.8796799182891846, - -2.705214738845825, - -0.2987763583660126, - -4.070865154266357, - -1.6675832271575928, - -1.3896636962890625, - -1.5731089115142822, - -3.526170015335083, - -2.5088443756103516, - -1.208929419517517, - -3.673125743865967, - -2.501532554626465, - -6.875064373016357, - -8.512459754943848, - -1.042314052581787, - -3.657850980758667, - -7.0950798988342285, - -4.974049091339111, - -8.14085578918457, - -3.529888153076172, - -1.9389504194259644, - -7.0902204513549805, - -2.409292459487915, - -2.9428021907806396, - -1.688283085823059, - -3.622368335723877, - -2.0903351306915283, - -4.160663604736328, - -3.1683764457702637, - -1.2135626077651978, - -7.566033363342285, - -3.1186251640319824, - -5.899919509887695, - -0.9518840312957764, - -2.656729221343994, - -2.2994377613067627, - -6.806836128234863, - -1.280236840248108, - -2.838846206665039, - -1.3598848581314087, - -11.707776069641113, - -3.134333372116089, - -0.6230669617652893, - -8.219222068786621, - -7.562507152557373, - -7.489459037780762, - -1.5368008613586426, - -7.149652481079102, - -5.749268054962158, - -3.162869691848755, - -2.7235195636749268, - -6.128931999206543, - -1.1934199333190918, - -3.986410617828369, - -3.76609468460083, - -1.712721586227417, - -3.195504903793335, - -8.397743225097656, - -3.1260581016540527, - -9.792022705078125, - -4.217884540557861, - -11.583260536193848, - -5.987588882446289, - -5.178754806518555, - -6.994749069213867, - -5.167606353759766, - -7.124668121337891, - -6.201416015625, - -10.203682899475098, - -6.858526229858398, - -2.733592987060547, - -5.078882217407227, - -9.003358840942383, - -4.704894542694092, - -3.9085562229156494, - -7.247268199920654, - -7.091092109680176, - -4.4150166511535645, - -7.56699275970459, - -9.485116004943848, - -1.9977033138275146, - -6.65272331237793, - -2.236643075942993, - -7.518955707550049, - -5.525973320007324, - -4.67877721786499, - -6.608670234680176, - -5.536133766174316, - -10.772479057312012, - -10.8853178024292, - -3.6156129837036133, - -6.751470565795898, - -6.4537434577941895, - -3.4220399856567383, - -8.251005172729492, - -3.2146153450012207, - -6.330069541931152, - -1.5551663637161255, - -6.520583629608154, - -10.450878143310547, - -5.8788957595825195, - -3.7398200035095215, - -3.9084208011627197, - -0.3640081584453583, - -6.961522102355957, - -6.066243648529053, - -7.270624160766602, - -5.098455429077148, - -2.7642822265625, - -5.460171699523926, - -7.362828731536865, - -2.558631658554077, - -2.186410427093506, - -2.5309929847717285, - -2.46756649017334, - -2.0306026935577393, - -1.8713470697402954, - -2.108008623123169, - -1.2698389291763306, - -2.1712756156921387, - -2.4432802200317383, - -1.1477653980255127, - -1.8417484760284424, - -2.5971946716308594, - -1.8250831365585327, - -2.103092670440674, - -2.5183165073394775, - -2.9367291927337646, - -1.9412965774536133, - -1.7692793607711792, - -2.864521026611328, - -3.1332175731658936, - -1.098311185836792, - -2.946441173553467, - -2.2800471782684326, - -3.1929852962493896, - -2.754260778427124, - -3.485616445541382, - -3.3010287284851074, - -2.5537776947021484, - -2.6752865314483643, - -3.1617612838745117, - -2.4571690559387207, - -2.060081958770752, - -2.425969362258911, - -2.212725877761841, - -2.4232254028320312, - -3.0587053298950195, - -2.4074010848999023, - -2.457937479019165, - -2.319617986679077, - -2.6340954303741455, - -2.599524736404419, - -2.5302212238311768, - -1.6849274635314941, - -2.2609786987304688, - -2.039928674697876, - -1.9474098682403564, - -2.3550753593444824, - -1.718749761581421, - -2.413884162902832, - -1.6247628927230835, - -2.4784040451049805, - -1.828325629234314, - -1.3880831003189087, - -1.4448199272155762, - -1.1477117538452148, - -1.1669728755950928, - -1.8787822723388672, - -1.5565840005874634, - -1.6666553020477295, - -1.747725248336792, - -1.959598422050476, - -2.0376486778259277, - -2.345367431640625, - -2.055098533630371, - -1.3940613269805908, - -3.4385242462158203, - -2.7489635944366455, - -3.2590157985687256, - -3.1128957271575928, - -1.7070379257202148, - -3.9010369777679443, - -3.21574068069458, - -3.3850393295288086, - -1.8778185844421387, - -2.698211908340454, - -1.8060741424560547, - -2.0845324993133545, - -3.4797585010528564, - -2.263254404067993, - -3.083108901977539, - -1.6589758396148682, - -2.687279224395752, - -1.77505624294281, - -2.6142921447753906, - -1.934045672416687, - -1.8834377527236938, - -2.8038980960845947, - -1.550542950630188, - -3.4054152965545654, - -1.724036693572998, - -2.3146564960479736, - -1.5134503841400146, - -2.9289023876190186, - -1.5285141468048096, - -3.421035051345825, - -1.3757282495498657, - -3.441431760787964, - -1.5286564826965332, - -3.4372904300689697, - -3.173043966293335, - -1.1313854455947876, - -1.486415147781372, - -2.506413221359253, - -3.5165903568267822, - -1.4112176895141602, - -3.7175354957580566, - -2.2170844078063965, - -0.704839289188385, - -0.6626103520393372, - -2.5483946800231934, - -0.949668288230896, - -3.1339564323425293, - -1.3326977491378784, - -3.5493476390838623, - -0.6785370111465454, - -3.236161470413208, - -2.1347084045410156, - -3.802447557449341, - -2.585503339767456, - -3.9361765384674072, - -2.290905714035034, - -3.714280605316162, - -1.793616533279419, - -3.7252492904663086, - -1.450188159942627, - -3.11938738822937, - -1.25174880027771 - ] - }, - "32": { - "input_prompt": "create a conversational article", - "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", - "generated_tokens": [ - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046, - 1531, - 9369, - 2715, - 1402, - 2314, - 1278, - 17915, - 1307, - 1278, - 9369, - 1046 - ], - "latency": 48.63822364807129, - "cuda_graph_request_count_map": null, - "step_count": 6144, - "logprobs": [ - -4.4165568351745605, - -11.358176231384277, - -0.0701780766248703, - -7.797665119171143, - -2.6805992126464844, - -1.4707680940628052, - -3.0390255451202393, - -1.6902849674224854, - -1.270594835281372, - -1.1936196088790894, - -1.2523558139801025, - -2.7270259857177734, - -1.2371309995651245, - -0.9618493318557739, - -0.4379909038543701, - -1.3917063474655151, - -1.1055524349212646, - -0.9122569561004639, - -0.9911308288574219, - -0.08436793833971024, - -0.5424078106880188, - -0.9181017279624939, - -0.5873759388923645, - -0.19014373421669006, - -0.06655456870794296, - -0.15252672135829926, - -0.09415211528539658, - -0.009787309914827347, - -0.013910251669585705, - -0.005296128336340189, - -0.005677408073097467, - -0.02013739012181759, - -0.21594694256782532, - -0.07153760641813278, - -0.0066444179974496365, - -0.010198505595326424, - -0.011980246752500534, - -0.003686776151880622, - -0.0037619550712406635, - -0.0022467151284217834, - -0.004088377580046654, - -0.021828632801771164, - -0.0012669878778979182, - -0.09768074005842209, - -0.02652405947446823, - -0.0019286142196506262, - -0.002283824374899268, - -0.0032225127797573805, - -0.0009741804678924382, - -0.0009415484382770956, - -0.001211624126881361, - -0.001135300612077117, - -0.002340436913073063, - -0.0010846928926184773, - -0.0509282611310482, - -0.03832047060132027, - -0.00257422705180943, - -0.0022806129418313503, - -0.00262785074301064, - -0.0008195855189114809, - -0.0010239601833745837, - -0.0013777059502899647, - -0.0009899006690829992, - -0.0018756669014692307, - -0.0015304292319342494, - -0.08506463468074799, - -0.01893703266978264, - -0.0013797297142446041, - -0.0014461545506492257, - -0.0013971101725474, - -0.0005869334563612938, - -0.0005212855176068842, - -0.000876757490914315, - -0.0005256939912214875, - -0.0012863941956311464, - -0.0015691122971475124, - -0.051276568323373795, - -0.00973513163626194, - -0.0010469438275322318, - -0.0011531615164130926, - -0.0009969270322471857, - -0.00038342276820912957, - -0.0004032037395518273, - -0.000730247818864882, - -0.0003275334893260151, - -0.0008700875914655626, - -0.0017572689102962613, - -0.044393111020326614, - -0.013102858327329159, - -0.0011463745031505823, - -0.0012070996453985572, - -0.0012325793504714966, - -0.0005048430757597089, - -0.0004876854654867202, - -0.0007901645149104297, - -0.00041500062798149884, - -0.0009869233472272754, - -0.0018687656847760081, - -0.03943866863846779, - -0.014425630681216717, - -0.0014756753807887435, - -0.001423775334842503, - -0.001209719106554985, - -0.0005046047735959291, - -0.00042167355422861874, - -0.0007688426994718611, - -0.0002699726028367877, - -0.0006598440813831985, - -0.0017849955474957824, - -0.038999658077955246, - -0.012665312737226486, - -0.0014427024871110916, - -0.0014492495683953166, - -0.001016576774418354, - -0.00042083943844772875, - -0.00033241944038309157, - -0.0006403064471669495, - -0.00022373080719262362, - -0.0007053509471006691, - -0.0018597226589918137, - -0.030997740104794502, - -0.011259939521551132, - -0.0012655591126531363, - -0.00134151556994766, - -0.0008106521563604474, - -0.0003513672563713044, - -0.0002964295563288033, - -0.0006368515896610916, - -0.00020180096908006817, - -0.0005779979983344674, - -0.0016014858847483993, - -0.0271126888692379 - ] - }, - "64": { - "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", - "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", - "generated_tokens": [ - 2, - 1784, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1307, - 1278, - 5805, - 46982, - 8525, - 1319, - 19644, - 1076, - 1577, - 1531, - 1032, - 1049, - 1057, - 1057, - 1057, - 1882, - 1050, - 1048, - 1048, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 1411, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048, - 5526, - 1486, - 1278, - 1032, - 1049, - 1048 - ], - "latency": 87.50420427322388, - "cuda_graph_request_count_map": null, - "step_count": 6144, - "logprobs": [ - -12.107745170593262, - -2.9727728366851807, - -3.720092535018921, - -5.592433929443359, - -10.964235305786133, - -3.654498338699341, - -9.33439826965332, - -4.833785057067871, - -5.187321662902832, - -2.6944785118103027, - -6.9262237548828125, - -0.654232919216156, - -0.5550781488418579, - -0.21346639096736908, - -0.0134271876886487, - -0.010840100236237049, - -1.3878544569015503, - -0.6296291351318359, - -7.9766011238098145, - -0.4393192231655121, - -5.639142036437988, - -3.277270793914795, - -1.0206468105316162, - -11.703084945678711, - -0.7100943922996521, - -0.2809169888496399, - -2.771284818649292, - -7.190817832946777, - -4.048691749572754, - -0.012056218460202217, - -3.3802318572998047, - -0.6807184815406799, - -3.4844107627868652, - -3.312331199645996, - -0.5001641511917114, - -2.61255145072937, - -4.243694305419922, - -4.333778381347656, - -6.0625810623168945, - -0.011777156963944435, - -0.37577226758003235, - -0.9490834474563599, - -3.5450198650360107, - -2.1778035163879395, - -0.45957911014556885, - -3.00771164894104, - -1.7600425481796265, - -0.09766030311584473, - -2.467618942260742, - -1.329679012298584, - -0.8384320735931396, - -1.1864604949951172, - -3.628342866897583, - -0.2470003068447113, - -1.8938640356063843, - -5.168431282043457, - -0.05005566030740738, - -2.258014678955078, - -2.449028968811035, - -0.0034086955711245537, - -3.9485883712768555, - -1.6201664209365845, - -5.139942646026611, - -4.859354496002197, - -0.23686674237251282, - -0.5541543364524841, - -2.5826025009155273, - -6.114635467529297, - -4.3380208015441895, - -0.7412900924682617, - -0.3221715986728668, - -0.13805493712425232, - -4.1797332763671875, - -7.3456268310546875, - -0.13762745261192322, - -2.0905232429504395, - -1.0178627967834473, - -4.108260631561279, - -0.6007124185562134, - -1.0410642623901367, - -4.122039794921875, - -0.35905471444129944, - -1.4274661540985107, - -4.139932155609131, - -0.4237431585788727, - -1.6294409036636353, - -0.9811424016952515, - -4.132790565490723, - -1.1318120956420898, - -6.8258256912231445, - -1.5455098152160645, - -0.6984409093856812, - -13.664215087890625, - -0.1166313961148262, - -1.6347849369049072, - -0.28875046968460083, - -0.03130083531141281, - -1.5293006896972656, - -1.6488375663757324, - -4.224111557006836, - -4.760683059692383, - -1.9758747816085815, - -1.5828256607055664, - -2.8463857173919678, - -0.2620386481285095, - -1.7243889570236206, - -1.7945923805236816, - -0.8884308338165283, - -0.3766394555568695, - -0.34033581614494324, - -9.05566692352295, - -0.22754782438278198, - -0.033802058547735214, - -0.34108465909957886, - -0.5644669532775879, - -2.0925779342651367, - -4.547505855560303, - -10.870464324951172, - -1.1072022914886475, - -5.503787994384766, - -3.259672164916992, - -0.007964519783854485, - -3.0111639499664307, - -4.246737480163574, - -0.7813188433647156, - -3.331031322479248, - -4.485962867736816, - -0.9492117166519165, - -2.6757047176361084, - -1.1591349840164185, - -1.122117519378662, - -2.629878044128418, - -5.986321926116943, - -0.2146703153848648, - -0.002392764901742339, - -7.372479438781738, - -0.007077385671436787, - -0.06599216908216476, - -0.0970711037516594, - -3.2874932289123535, - -0.0019583588000386953, - -0.9122000336647034, - -4.930907249450684, - -0.019508399069309235, - -0.308611661195755, - -0.07778516411781311, - -3.8497893810272217, - -0.46124517917633057, - -0.38821348547935486, - -2.668412208557129, - -1.845987319946289, - -0.06470083445310593, - -0.006619549356400967, - -1.2610487937927246, - -0.13015533983707428, - -3.365312099456787, - -0.0014690094394609332, - -1.6789823770523071, - -1.2499005794525146, - -3.3992111682891846, - -5.563300132751465, - -0.823418140411377, - -4.24124813079834, - -1.6597849130630493, - -0.6941139698028564, - -1.5637556314468384, - -0.5482053756713867, - -0.9507225751876831, - -3.764758586883545, - -0.0006518622976727784, - -0.7540555000305176, - -5.058262825012207, - -0.3302401602268219, - -2.8130555152893066, - -0.17079885303974152, - -2.871047019958496, - -0.3991694450378418, - -3.1476998329162598, - -0.3488404452800751, - -2.0545666217803955, - -4.201597690582275, - -5.164614677429199, - -0.0271432027220726, - -0.0009785869624465704, - -3.3444161415100098, - -1.3117046356201172, - -6.375423431396484, - -0.05535568296909332, - -0.3919340968132019, - -0.060594215989112854, - -6.507473468780518, - -0.0023910999298095703, - -2.143423318862915, - -3.335618257522583, - -2.953970432281494, - -0.0013383012264966965, - -0.8080525398254395, - -0.29526084661483765, - -0.04036511853337288, - -3.231475353240967, - -1.0585589408874512, - -6.136373043060303, - -0.006182829383760691, - -0.035548023879528046, - -5.509808540344238, - -1.8490750789642334, - -9.83314037322998, - -0.07037576287984848, - -3.1621387004852295, - -6.762360095977783, - -1.3490527868270874, - -3.601043462753296, - -1.176393985748291, - -0.4342959523200989, - -0.06266004592180252, - -5.464046001434326, - -0.017946599051356316, - -1.0416009426116943, - -1.6117159128189087, - -12.289417266845703, - -1.5004339218139648, - -5.76563835144043, - -4.038386821746826, - -0.20812086760997772, - -3.6306562423706055, - -1.3901070356369019, - -1.087137222290039, - -2.423213243484497, - -4.503086090087891, - -0.0008031480247154832, - -0.03627370297908783, - -0.1653430461883545, - -7.958648681640625, - -1.1018548011779785, - -1.290948748588562, - -3.8049263954162598, - -1.8253734111785889, - -0.059022851288318634, - -0.0013984196120873094, - -4.698851585388184, - -2.5421664714813232, - -0.024493809789419174, - -4.828659534454346, - -3.0295286178588867, - -3.550312042236328, - -0.1185273677110672, - -0.22595760226249695, - -0.10782183706760406, - -1.4033282995224, - -0.4485701024532318, - -0.2889708876609802, - -0.05471855774521828, - -0.007632025051862001, - -2.1156554222106934, - -0.6249589323997498, - -4.198577404022217, - -0.14178156852722168, - -4.284021377563477, - -2.227515935897827, - -3.5022120475769043, - -0.19575819373130798, - -15.964509963989258, - -4.055960655212402, - -11.125024795532227, - -0.7681724429130554, - -3.0436902046203613, - -7.030262470245361, - -4.376729488372803, - -5.476145267486572, - -0.4219042658805847, - -3.7689766883850098, - -0.060010604560375214, - -0.8134393692016602, - -0.11386934667825699, - -0.025473715737462044, - -0.09736856073141098, - -4.357361793518066, - -0.3670865297317505, - -0.08063744008541107, - -0.1311480849981308, - -1.0903867483139038, - -1.2705107927322388, - -1.5076212882995605, - -4.295275688171387, - -0.04185756668448448, - -0.19810955226421356, - -1.9645220041275024, - -0.9597910642623901, - -0.13429655134677887, - -0.002283110748976469, - -7.066074371337891, - -3.639211654663086, - -1.0263917446136475, - -8.124760627746582, - -1.132537841796875, - -0.09160765260457993, - -0.08996370434761047, - -10.165366172790527, - -3.501585006713867, - -0.0019847711082547903, - -0.05309417471289635, - -0.31209683418273926, - -0.15089339017868042, - -1.23564875125885, - -1.2685208320617676, - -7.832758903503418, - -0.19271136820316315, - -0.014305183663964272, - -0.0007532381569035351, - -0.44688940048217773, - -2.6239724159240723, - -1.738666296005249, - -1.6480977535247803, - -0.46753185987472534, - -8.656959533691406, - -3.79868483543396, - -0.9281394481658936, - -2.2381181716918945, - -1.7654449939727783, - -0.4948798418045044, - -0.025028761476278305, - -1.5435361862182617, - -1.6390818357467651, - -1.4962153434753418, - -0.3425217270851135, - -0.013077914714813232, - -0.038474079221487045, - -5.3364362716674805, - -0.42365288734436035, - -1.884093999862671, - -3.510357618331909, - -6.198029518127441, - -0.44375038146972656, - -0.0008789013954810798, - -3.6025230884552, - -1.419615626335144, - -2.6723289489746094, - -5.775190830230713, - -1.1380761861801147, - -2.6683366298675537, - -0.43395891785621643, - -0.003145867260172963, - -8.63144302368164, - -1.646262764930725, - -1.732487678527832, - -4.561546802520752, - -0.5277953147888184, - -0.07333153486251831, - -0.5624169707298279, - -0.12201295047998428, - -2.6561455726623535, - -1.1071691513061523, - -2.6895060539245605, - -0.040864069014787674, - -0.04126371443271637, - -1.8294739723205566, - -0.09022177755832672, - -0.3154001832008362, - -0.46215569972991943, - -2.2462844848632812, - -0.30149081349372864, - -0.52588951587677, - -8.288043975830078, - -0.0002057340752799064, - -0.8021711707115173, - -4.4546098709106445, - -0.0001565095444675535, - -0.0015961299650371075, - -0.15216240286827087, - -0.3677564561367035, - -5.018707275390625, - -0.7850045561790466, - -1.9582659006118774, - -1.0046892166137695, - -10.0401029586792, - -0.16878114640712738, - -5.944240570068359, - -1.5523078441619873, - -5.7253522872924805, - -0.47948503494262695, - -0.44009655714035034, - -5.671053886413574, - -0.003280022880062461, - -0.7937742471694946, - -0.9639376401901245, - -0.00030048147891648114, - -1.0747740268707275, - -0.8839919567108154, - -3.416811466217041, - -1.6602673530578613, - -0.2706959843635559, - -0.0024333172477781773, - -4.478696823120117, - -6.20179557800293, - -0.11359559744596481, - -0.202009916305542, - -0.022310219705104828, - -2.367263078689575, - -1.0405994653701782, - -5.984308242797852, - -2.105138063430786, - -9.583202362060547, - -0.0004957877099514008, - -3.0655455589294434, - -0.0669412910938263, - -0.8977450728416443, - -2.2271294593811035, - -2.6617536544799805, - -1.8184051513671875, - -0.8291114568710327, - -0.4864235818386078, - -0.7993525862693787, - -3.51106858253479, - -2.1530935764312744, - -0.257144957780838, - -1.3934082984924316, - -1.3137131929397583, - -0.3384077548980713, - -0.1697217971086502, - -2.353395938873291, - -0.03406282886862755, - -0.39059701561927795, - -3.422821044921875, - -1.7117210626602173, - -0.7018465399742126, - -1.5995906591415405, - -3.6218395233154297, - -0.12497704476118088, - -0.16966234147548676, - -0.7313685417175293, - -0.4956285357475281, - -1.0840849876403809, - -5.042126655578613, - -0.00031704644788987935, - -7.683258056640625, - -0.9210801720619202, - -4.687852382659912, - -0.0028814247343689203, - -0.043382611125707626, - -4.1948652267456055, - -2.66593337059021, - -0.06153333932161331, - -0.0023110604379326105, - -6.729236602783203, - -5.777127742767334, - -0.08932067453861237, - -0.09890018403530121, - -0.009886111132800579, - -3.1145148277282715, - -3.725565195083618, - -0.0021998509764671326, - -3.9927196502685547, - -2.753793239593506, - -1.6037236452102661, - -0.17461130023002625, - -4.804804801940918, - -0.2311229705810547, - -0.30256444215774536, - -2.235363006591797, - -0.006614102050662041, - -0.34757524728775024, - -1.4946835041046143, - -1.222062587738037, - -3.658839225769043, - -1.356170892715454, - -0.5371109843254089, - -3.7580835819244385, - -4.54621696472168, - -0.31577637791633606, - -3.677156925201416, - -2.7181396484375, - -7.4674882888793945, - -0.00019369633810129017, - -2.3798398971557617, - -2.5452184677124023, - -0.2858496308326721, - -4.315659523010254, - -0.025835415348410606, - -0.000603493710514158, - -0.2546294331550598, - -0.12032663822174072, - -2.006908655166626, - -5.990736961364746, - -7.146596908569336, - -0.23356498777866364, - -0.2201036810874939, - -0.01235415879637003, - -0.011248741298913956, - -1.4155778884887695, - -0.40242519974708557, - -5.877886772155762, - -0.7865053415298462, - -0.03231288120150566, - -0.004864405374974012, - -0.0050629740580916405, - -2.7049152851104736, - -6.822089195251465, - -0.39252761006355286, - -1.2290617227554321, - -0.007630132604390383, - -3.485461711883545, - -0.47985684871673584, - -6.1813530921936035, - -0.03757825121283531, - -0.37834712862968445, - -0.22192610800266266, - -1.165318489074707, - -0.5220151543617249, - -0.1289423257112503, - -3.216222047805786, - -1.0787583589553833, - -3.0716826915740967, - -0.6023419499397278, - -2.558605194091797, - -0.927433431148529, - -0.00364841241389513, - -0.14910078048706055, - -0.7318926453590393, - -6.159773826599121, - -0.0015301911626011133, - -1.8908276557922363, - -1.9641315937042236, - -0.021651331335306168, - -2.1648828983306885, - -2.2700207233428955, - -7.833290100097656, - -0.03397307172417641, - -0.8344621658325195, - -0.02225659228861332, - -0.06639260798692703, - -2.3780317306518555, - -3.180129051208496, - -0.09030630439519882, - -2.4138312339782715, - -1.3445552587509155, - -1.848326325416565, - -0.9726964831352234, - -2.851792335510254, - -0.0630769282579422, - -0.0011394681641831994, - -0.05843213573098183, - -2.6616668701171875, - -1.575437068939209, - -0.180197611451149, - -5.552371501922607, - -0.26108410954475403, - -2.529611587524414, - -0.37780019640922546, - -5.141795635223389, - -0.5921107530593872, - -0.2474975287914276, - -0.10687454044818878, - -4.891775131225586, - -0.25011152029037476, - -2.4100728034973145, - -1.358667016029358, - -2.790961503982544, - -3.8654675483703613, - -1.0076243877410889, - -0.7456949949264526, - -1.5575554370880127, - -2.05328631401062, - -1.6538066864013672, - -0.0558217354118824, - -0.0001817776501411572, - -0.0011643542675301433, - -0.038359593600034714, - -1.4208931922912598, - -0.542127251625061, - -0.3162364959716797, - -0.3966117799282074, - -1.1765563488006592, - -1.7920958995819092, - -0.18425509333610535, - -0.1092008650302887, - -0.46676987409591675, - -0.24977745115756989, - -1.0375996828079224, - -0.5268858671188354, - -0.008942908607423306, - -0.6404479146003723, - -0.0033111530356109142, - -5.3165931603871286e-05, - -0.5154370665550232, - -0.39286962151527405, - -1.401839256286621, - -0.6232213973999023, - -0.02168831042945385, - -0.004282470792531967, - -0.005199837032705545, - -0.09748794883489609, - -0.040823787450790405, - -0.00014852374442853034, - -0.0005832401220686734, - -0.005303124897181988, - -0.6537013053894043, - -0.38026049733161926, - -0.04189129173755646, - -0.010385753586888313, - -0.008756335824728012, - -0.013362848199903965, - -0.000504723924677819, - -0.002797620603814721, - -0.0014512732159346342, - -0.0013321106089279056, - -0.010883613489568233, - -0.005159396678209305, - -0.004701037425547838, - -0.01591104455292225, - -0.001474246964789927, - -1.2278481335670222e-05, - -0.010548785328865051, - -0.08341525495052338, - -0.03858809545636177, - -0.056062061339616776, - -0.0009532198309898376, - -0.0005789510905742645, - -0.0008986725588329136, - -0.00710969977080822, - -0.0006561510381288826, - -1.4781842764932662e-05, - -5.578839045483619e-05, - -0.0006398299592547119, - -0.0028786908369511366, - -0.0034092895220965147, - -0.008268529549241066, - -0.006602259818464518, - -0.004517706111073494, - -0.02233586646616459, - -0.0006323245470412076, - -0.009195122867822647, - -0.0029284947086125612, - -0.004457537550479174, - -0.017873765900731087, - -0.008801711723208427, - -0.0036383166443556547, - -0.08078611642122269, - -0.006347495596855879, - -0.0002177716523874551, - -0.04688572511076927, - -0.2718890309333801, - -0.07996802777051926, - -0.04674842208623886, - -0.009984076954424381, - -0.010000954382121563, - -0.050126753747463226, - -0.5864179730415344, - -0.2915390133857727, - -0.008090462535619736, - -0.032190412282943726, - -0.03461571782827377, - -0.2785419523715973, - -0.05830562859773636, - -0.02893950417637825, - -0.12241066247224808, - -0.02711048536002636, - -0.16450686752796173, - -0.09852994978427887, - -0.2651047706604004, - -0.35559725761413574, - -0.12606258690357208, - -0.32793670892715454, - -0.20878805220127106, - -0.7587923407554626, - -0.4730657637119293, - -1.496794581413269, - -0.2486363798379898, - -0.20723387598991394, - -0.1872958242893219, - -0.19151091575622559, - -0.3350580036640167, - -1.3085839748382568, - -0.6109102964401245, - -0.2947172224521637, - -0.37130236625671387, - -0.19888469576835632, - -0.18297068774700165, - -0.9978674054145813, - -0.5471905469894409, - -0.4379975199699402, - -0.407988041639328, - -0.17325688898563385, - -0.16938896477222443 - ] - }, - "96": { - "input_prompt": "what is the concept of double materiality in sustainability?", - "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", - "generated_tokens": [ - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318, - 1532, - 1294, - 60187, - 1063, - 5675, - 1395, - 1278, - 7401, - 1307, - 5558, - 4318 - ], - "latency": 126.4328100681305, - "cuda_graph_request_count_map": null, - "step_count": 6144, - "logprobs": [ - -4.917365074157715, - -0.9960631132125854, - -7.875392913818359, - -0.2993181347846985, - -7.760880470275879, - -10.308395385742188, - -2.1807961463928223, - -1.6412583589553833, - -9.521512985229492, - -1.627489447593689, - -1.8410861492156982, - -0.9285702705383301, - -0.2576955556869507, - -0.9641067981719971, - -0.02314644306898117, - -0.6696561574935913, - -0.07035009562969208, - -0.004622488282620907, - -0.025748632848262787, - -0.06276137381792068, - -0.17385317385196686, - -0.3285445272922516, - -0.0592009499669075, - -0.007940039038658142, - -0.22664028406143188, - -0.0017957051750272512, - -0.022929180413484573, - -0.005733947269618511, - -0.0012996093137189746, - -0.006419987417757511, - -0.02376849390566349, - -0.27800270915031433, - -0.4650723934173584, - -0.04936715215444565, - -0.003972141072154045, - -0.01477995328605175, - -0.0012044801842421293, - -0.014891182072460651, - -0.002709767082706094, - -0.0009939497103914618, - -0.0028436246793717146, - -0.006759870797395706, - -0.15416178107261658, - -0.20121537148952484, - -0.016414370387792587, - -0.0015769677702337503, - -0.008138825185596943, - -0.0007713441736996174, - -0.013819841668009758, - -0.003826678032055497, - -0.0005918181850574911, - -0.0014938872773200274, - -0.00485716899856925, - -0.081083282828331, - -0.09642580896615982, - -0.009630884043872356, - -0.0010948146227747202, - -0.007085552904754877, - -0.0006310140597634017, - -0.013073914684355259, - -0.0039152647368609905, - -0.000364713923772797, - -0.001292108790948987, - -0.004158303141593933, - -0.044283974915742874, - -0.05722038820385933, - -0.006369172595441341, - -0.0007976687629707158, - -0.005993015132844448, - -0.0004935238393954933, - -0.011310506612062454, - -0.002951553324237466, - -0.000387831823900342, - -0.000977038755081594, - -0.0036971091758459806, - -0.030511993914842606, - -0.04246694967150688, - -0.004863100592046976, - -0.0006927236099727452, - -0.005206122528761625, - -0.0005129451747052372, - -0.00894621666520834, - -0.0028565814718604088, - -0.00041333239641971886, - -0.0009002208826132119, - -0.0033131728414446115, - -0.021188799291849136, - -0.03330245241522789, - -0.0038543473929166794, - -0.0006504327175207436, - -0.004474864806979895, - -0.00048029806930571795, - -0.009718249551951885, - -0.0030443770810961723, - -0.0003743662964552641, - -0.0009439303539693356, - -0.003729770192876458, - -0.016505014151334763, - -0.0290373582392931, - -0.003315192647278309, - -0.0005821678787469864, - -0.004148805979639292, - -0.00042489083716645837, - -0.006856840569525957, - -0.0028660909738391638, - -0.00032574593205936253, - -0.0006986799417063594, - -0.003671098267659545, - -0.012792548164725304, - -0.02553274855017662, - -0.002730690874159336, - -0.0005067494930699468, - -0.0036923582665622234, - -0.0004451475979294628, - -0.007225453853607178, - -0.002746859099715948, - -0.0003323002893012017, - -0.0008517451351508498, - -0.003630714723840356, - -0.011284693144261837, - -0.02186688780784607, - -0.0025819556321948767, - -0.0004931663861498237, - -0.0031721293926239014, - -0.00040725519647821784, - -0.0062383925542235374, - -0.00238693761639297, - -0.00040749352774582803, - -0.0007970731821842492, - -0.003604583442211151, - -0.010075542144477367, - -0.022386692464351654, - -0.0025295186787843704, - -0.00046302087139338255, - -0.003138143103569746, - -0.0004233417857903987, - -0.006178564392030239, - -0.0021037368569523096 - ] - }, - "127": { - "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", - "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", - "generated_tokens": [ - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 8462, - 22692, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1809, - 1623, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 17047, - 1505, - 1693, - 1623, - 1855, - 2505, - 1278, - 22692, - 73751, - 1338, - 1073, - 1855, - 1605, - 5257, - 1693, - 1623, - 1855, - 2505 - ], - "latency": 155.6906189918518, - "cuda_graph_request_count_map": null, - "step_count": 6144, - "logprobs": [ - -7.793755531311035, - -4.224940776824951, - -5.446587562561035, - -0.267395555973053, - -6.118141174316406, - -11.105525970458984, - -6.67517614364624, - -6.902530670166016, - -3.2218151092529297, - -5.99644660949707, - -8.431673049926758, - -8.04836368560791, - -3.992594003677368, - -7.550079345703125, - -2.873685836791992, - -3.3877997398376465, - -7.9306745529174805, - -7.005484580993652, - -1.0481306314468384, - -2.686237335205078, - -6.131283760070801, - -6.2994704246521, - -7.931419372558594, - -11.3147554397583, - -8.544670104980469, - -12.01729679107666, - -3.89847469329834, - -1.7964364290237427, - -2.952878952026367, - -1.9217232465744019, - -2.272329330444336, - -0.37552154064178467, - -1.667820692062378, - -7.510344505310059, - -3.498040199279785, - -7.980632305145264, - -7.672002792358398, - -4.4999470710754395, - -7.155375003814697, - -2.4486124515533447, - -4.785946846008301, - -1.153855800628662, - -2.3994438648223877, - -4.250652313232422, - -12.24446964263916, - -8.344388008117676, - -2.608186721801758, - -5.200589179992676, - -8.25888442993164, - -3.6245617866516113, - -7.689338207244873, - -7.345355033874512, - -1.2661759853363037, - -7.265620231628418, - -1.9884108304977417, - -6.269482612609863, - -2.41705584526062, - -1.8929681777954102, - -1.8259913921356201, - -2.0997350215911865, - -2.323200225830078, - -1.3998825550079346, - -0.8789899945259094, - -1.082053542137146, - -1.1831339597702026, - -1.4462857246398926, - -1.6481035947799683, - -1.4408715963363647, - -1.2603964805603027, - -1.5267670154571533, - -1.6345772743225098, - -1.3796477317810059, - -0.7609691023826599, - -0.3548354506492615, - -0.7552334666252136, - -0.44776833057403564, - -1.1078286170959473, - -1.3036658763885498, - -0.5214896202087402, - -0.8486822843551636, - -0.22470997273921967, - -0.4705755412578583, - -0.5639711022377014, - -0.5388108491897583, - -0.6052999496459961, - -0.1002030223608017, - -0.286334365606308, - -0.45798981189727783, - -1.0107953548431396, - -0.11875647306442261, - -0.6969441771507263, - -0.4609107971191406, - -0.07614769786596298, - -0.5035472512245178, - -0.1682187020778656, - -0.10476160794496536, - -0.6586751341819763, - -0.35806939005851746, - -1.5364394187927246, - -2.4093759059906006, - -1.977368950843811, - -1.6216907501220703, - -0.27647316455841064, - -0.2991848587989807, - -0.2783535420894623, - -0.05913994088768959, - -0.03023873083293438, - -0.043339803814888, - -0.7320341467857361, - -0.0030677898321300745, - -0.0332595594227314, - -0.012804670259356499, - -0.004041599575430155, - -0.0014899593079462647, - -0.001948602613992989, - -0.0029070996679365635, - -0.040939707309007645, - -0.013942227698862553, - -0.04897322878241539, - -0.011005887761712074, - -0.0044113704934716225, - -0.0013179434463381767, - -0.003658389439806342, - -0.009758152067661285, - -0.0014104428701102734, - -0.0016671819612383842, - -0.000771939754486084, - -0.0015519729349762201, - -0.003720743814483285, - -0.004249115474522114, - -0.00485657574608922, - -0.005053604021668434, - -0.002336274366825819, - -0.0009155849111266434, - -0.0004978132783435285, - -0.0005953923100605607, - -0.0011395872570574284, - -0.001485078944824636, - -0.3072909712791443, - -1.7295066118240356, - -0.4807289242744446, - -0.1245415136218071, - -0.011858444660902023, - -0.020613837987184525, - -0.011020978912711143, - -0.003106294432654977, - -0.0009966888464987278, - -0.0019349202048033476, - -0.037407051771879196, - -0.0003496989083942026, - -0.005922981072217226, - -0.007394562941044569, - -0.0006037319544702768, - -0.0008836655179038644, - -0.0002884448622353375, - -0.00047600860125385225, - -0.0024947968777269125, - -0.00442774873226881, - -0.004059052560478449, - -0.0018594847060739994, - -0.0006179092451930046, - -0.00022635281493421644, - -0.0006730675231665373, - -0.003022746881470084, - -0.0002343380037928, - -0.00047791501856409013, - -9.440929716220126e-05, - -0.00021550717065110803, - -0.0013523490633815527, - -0.0032202552538365126, - -0.001157686347141862, - -0.004449942149221897, - -0.0016590891173109412, - -0.00101062236353755, - -0.0003079893649555743, - -0.00048375347978435457, - -0.0021734442561864853, - -0.00423036003485322, - -0.11514264345169067, - -0.8658493757247925, - -0.084366075694561, - -0.02140468917787075, - -0.0060798698104918, - -0.008638513274490833, - -0.003212531330063939, - -0.0009598892065696418, - -0.00032085992279462516 - ] - }, - "throughput": [ - 93.24123994187065, - 104.94118337233992, - 105.03843789693171 - ] + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 9.77891230583191, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + -3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, + -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + -1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + -6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + -5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 48.63822364807129, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + -0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 87.50420427322388, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + -6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + -2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, + -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + -1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + -0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 126.4328100681305, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + -0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 155.6906189918518, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + -2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": 104.94118337233992 } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json index 066995bd666..e1cada771ca 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgxh100_coreweave.json @@ -1,170 +1,161 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", - "generated_tokens": [ - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710, - 1402, - 14019, - 1044, - 1321, - 1402, - 14019, - 1294, - 1278, - 2725, - 15568, - 3039, - 1046, - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710 - ], - "latency": 2.1998238563537598, - "cuda_graph_request_count_map": null, - "step_count": 240, - "logprobs": [ - -9.358587265014648, - -2.7594826221466064, - -4.608366012573242, - -1.4093360900878906, - -0.6152952313423157, - -1.7217562198638916, - -2.496668815612793, - -2.0547454357147217, - -2.441960573196411, - -6.280838966369629, - -1.5643692016601562, - -3.462346076965332, - -4.428728103637695, - -3.8633861541748047, - -1.9936373233795166, - -1.8929449319839478, - -3.796365737915039, - -6.8360137939453125, - -0.2901247441768646, - -0.9246833324432373, - -6.633338928222656, - -7.166708469390869, - -12.771251678466797, - -2.198296308517456, - -3.7778120040893555, - -0.4983733296394348, - -4.381269454956055, - -0.0666784718632698, - -0.09580295532941818, - -3.2437636852264404, - -10.079947471618652, - -1.172220230102539, - -5.977442741394043, - -5.046236038208008, - -3.855658531188965, - -2.5585858821868896, - -3.356245994567871, - -5.557229518890381, - -1.6787731647491455, - -5.483290672302246, - -12.218501091003418, - -12.61402702331543, - -0.09662941098213196, - -2.5431432723999023, - -1.4071024656295776, - -2.9154715538024902, - -1.1964417695999146, - -0.006458481773734093, - -3.3625335693359375, - -13.262511253356934, - -4.314079761505127, - -2.617699146270752, - -5.987792015075684, - -0.778266429901123, - -0.048888545483350754, - -1.548882007598877, - -1.1381981372833252, - -5.627166748046875, - -0.4078553318977356, - -4.958505630493164, - -0.6187160611152649, - -0.7174848914146423, - -2.469533920288086, - -13.620073318481445, - -0.09088654816150665, - -3.526974678039551, - -1.4195809364318848, - -6.402483940124512, - -0.5898402333259583, - -3.565917491912842, - -0.8561318516731262, - -1.6140165328979492, - -5.370549201965332, - -17.159223556518555, - -6.583524703979492, - -0.8855001926422119, - -4.19431209564209, - -1.2012220621109009, - -2.2563133239746094, - -1.7674944400787354, - -0.22064533829689026, - -9.292220115661621, - -0.12445646524429321, - -7.29617977142334, - -2.526529312133789, - -4.071560859680176, - -3.5568013191223145, - -1.926215410232544, - -2.349026918411255, - -2.2132363319396973, - -0.3125414550304413, - -1.4718132019042969, - -2.149106740951538, - -1.0855519771575928, - -1.631832242012024, - -1.3751734495162964, - -1.9396103620529175, - -1.5293723344802856, - -0.8444125056266785, - -1.2414811849594116, - -1.9522171020507812, - -2.4338042736053467, - -1.5651824474334717, - -0.9498789310455322, - -1.8044980764389038, - -2.356677770614624, - -1.247452974319458, - -1.550165057182312, - -0.5635553598403931, - -0.6177330017089844, - -0.4778785705566406, - -0.020452087745070457, - -0.48500269651412964, - -0.23854275047779083, - -0.06543659418821335, - -0.11837350577116013, - -0.0585334412753582 - ] - }, - "throughput": [ - 0.7170174223459943, - 12.998776662244524, - 13.163004282426089, - 13.581765270525981, - 13.619124445335821, - 13.655332144429561, - 13.608264815678803, - 13.614656540485411 - ] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 2.1998238563537598, + "cuda_graph_request_count_map": null, + "step_count": 240, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + }, + "throughput": 13.581765270525981 } \ No newline at end of file From eb07b693b4aa7c3267b44dce7b55365c8dcc1258 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Mon, 27 Oct 2025 06:22:31 -0500 Subject: [PATCH 072/248] Update dev branch codeowners (#1963) Signed-off-by: Charlie Truong --- .github/CODEOWNERS | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cc3cb0dbc58..7613dc59da5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,15 +1,12 @@ -megatron/core @NVIDIA/core-nemo @NVIDIA/core-devtech +* @NVIDIA/core-nemo @NVIDIA/core-devtech .gitlab/ @NVIDIA/ci .github/ @NVIDIA/ci .gitlab-ci.yml @NVIDIA/ci docker/ @NVIDIA/ci tests/unit_tests/run_ci_test.sh @NVIDIA/ci -tests/test_utils/python_scripts/ +tests/test_utils/python_scripts/ tests/functional_tests/python_test_utils/ @NVIDIA/ci tests/functional_tests/shell_test_utils/ @NVIDIA/ci -megatron/core/transformer/transformer_block.py @NVIDIA/ci -megatron/core/transformer/transformer_layer.py @NVIDIA/ci -tests/functional_tests/test_cases/ @NVIDIA/ci -tests/functional_tests/recipes/ @NVIDIA/ci -tests/unit_tests/ @NVIDIA/ci +pyproject.toml @NVIDIA/ci +uv.lock @NVIDIA/ci From fa384d200e4571d0f60ce954eef7d029a0d9cbb6 Mon Sep 17 00:00:00 2001 From: Xin Yao Date: Mon, 27 Oct 2025 16:56:51 +0800 Subject: [PATCH 073/248] [Dev] JIT for MoE router and preprocess (#1918) Signed-off-by: Xin Yao --- .../core/fusions/fused_pad_routing_map.py | 5 ++++- megatron/core/transformer/moe/moe_utils.py | 11 +++++++--- megatron/core/transformer/moe/router.py | 20 +++++++++++++------ .../core/transformer/moe/token_dispatcher.py | 4 +++- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/megatron/core/fusions/fused_pad_routing_map.py b/megatron/core/fusions/fused_pad_routing_map.py index e7c3a7e48c9..8e4d1763270 100644 --- a/megatron/core/fusions/fused_pad_routing_map.py +++ b/megatron/core/fusions/fused_pad_routing_map.py @@ -1,9 +1,11 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from unittest.mock import MagicMock import torch from packaging import version +from megatron.core.jit import jit_fuser from megatron.core.utils import experimental_fn, null_decorator try: @@ -69,6 +71,7 @@ def _pad_routing_map_kernel( @experimental_fn(introduced_with_version="0.13.0") +@jit_fuser def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> torch.Tensor: """Fused version of pad_routing_map. Args: diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index dc857129834..17942fa5a3e 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import math from typing import List, Optional, Union @@ -7,6 +7,7 @@ from megatron.core import parallel_state from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.cuda_graphs import is_graph_capturing try: import transformer_engine as te # pylint: disable=unused-import @@ -905,12 +906,16 @@ class RandomSTE(torch.autograd.Function): """ generator = None + random_logits = None @staticmethod def forward(ctx, logits): """ Forward pass returns random logits with rank-specific seed. """ + if is_graph_capturing() and RandomSTE.random_logits is not None: + return RandomSTE.random_logits + if RandomSTE.generator is None: global_rank = torch.distributed.get_rank() base_seed = 42 @@ -918,8 +923,8 @@ def forward(ctx, logits): RandomSTE.generator = torch.Generator(device=logits.device) RandomSTE.generator.manual_seed(seed) - random_logits = logits.clone().normal_(generator=RandomSTE.generator) - return random_logits + RandomSTE.random_logits = logits.clone().normal_(generator=RandomSTE.generator) + return RandomSTE.random_logits @staticmethod def backward(ctx, grad_output): diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 7fa4692ef2f..16fc9d9af8f 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -1,10 +1,11 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod from typing import Optional import torch +from megatron.core.jit import jit_fuser from megatron.core.tensor_parallel import reduce_from_tensor_model_parallel_region from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.moe.moe_utils import ( @@ -468,6 +469,16 @@ def apply_input_jitter(self, input: torch.Tensor): else: return input + @jit_fuser + def _apply_expert_bias(self, routing_map: torch.Tensor): + """ + Update expert bias and tokens_per_expert + Prevent extra local tokens accumulation on evaluation or activation recomputation + """ + if self.enable_expert_bias and torch.is_grad_enabled(): + with torch.no_grad(): + self.local_tokens_per_expert += routing_map.sum(dim=0) + def routing(self, logits: torch.Tensor): """Top-k routing function @@ -526,11 +537,8 @@ def routing(self, logits: torch.Tensor): probs, scores_for_aux_loss, routing_map_for_aux_loss ) - # Update expert bias and tokens_per_expert - # Prevent extra local tokens accumulation on evaluation or activation recomputation - if self.enable_expert_bias and torch.is_grad_enabled(): - with torch.no_grad(): - self.local_tokens_per_expert += routing_map.sum(dim=0) + # Optionally apply expert bias + self._apply_expert_bias(routing_map) return probs, routing_map diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 46f94ebe79a..bb034292715 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging from abc import ABC, abstractmethod @@ -12,6 +12,7 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.fusions.fused_indices_converter import fused_indices_to_multihot from megatron.core.fusions.fused_pad_routing_map import fused_pad_routing_map +from megatron.core.jit import jit_fuser from megatron.core.tensor_parallel import ( all_to_all, gather_from_sequence_parallel_region, @@ -1386,6 +1387,7 @@ def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor) - ).contiguous() return routing_map, probs + @jit_fuser def dispatch_preprocess( self, hidden_states: torch.Tensor, routing_map: torch.Tensor, probs: torch.Tensor ): From 9069e1268f495407598d9f6771e363737505dab7 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Mon, 27 Oct 2025 16:57:51 +0800 Subject: [PATCH 074/248] [Dev] feat(moe): Fine-grained activation offloading (#1912) Signed-off-by: Hongbin Liu --- .../fine_grained_activation_offloading.md | 29 + docs/source/api-guide/index.rst | 1 + .../offloading_and_recomputing.png | Bin 0 -> 332427 bytes .../core/extensions/transformer_engine.py | 12 +- .../common/model_chunk_schedule_plan.py | 9 +- .../core/models/gpt/fine_grained_callables.py | 23 +- megatron/core/models/gpt/gpt_model.py | 27 +- .../fine_grained_activation_offload.py | 603 ++++++++++++++++++ megatron/core/pipeline_parallel/schedules.py | 14 +- megatron/core/tensor_parallel/random.py | 9 +- megatron/core/transformer/attention.py | 70 +- megatron/core/transformer/moe/README.md | 14 + megatron/core/transformer/moe/experts.py | 65 +- .../transformer/multi_latent_attention.py | 40 +- .../transformer/multi_token_prediction.py | 7 +- .../core/transformer/transformer_block.py | 10 +- .../core/transformer/transformer_config.py | 43 +- .../core/transformer/transformer_layer.py | 56 +- megatron/training/arguments.py | 11 +- .../golden_values_dev_coreweave.json | 110 ++++ .../golden_values_dev_eos.json | 110 ++++ .../model_config.yaml | 139 ++++ .../golden_values_dev_coreweave.json | 92 +++ .../golden_values_dev_eos.json | 92 +++ .../model_config.yaml | 134 ++++ tests/test_utils/recipes/moe.yaml | 10 + ...test_fine_grained_activation_offloading.py | 187 ++++++ 27 files changed, 1856 insertions(+), 61 deletions(-) create mode 100644 docs/source/api-guide/fine_grained_activation_offloading.md create mode 100644 docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png create mode 100644 megatron/core/pipeline_parallel/fine_grained_activation_offload.py create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml create mode 100644 tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py diff --git a/docs/source/api-guide/fine_grained_activation_offloading.md b/docs/source/api-guide/fine_grained_activation_offloading.md new file mode 100644 index 00000000000..b4c2ea753fa --- /dev/null +++ b/docs/source/api-guide/fine_grained_activation_offloading.md @@ -0,0 +1,29 @@ +# Fine-grained Activation Offloading (collaborated with rednote) + +Memory capacity is more and more important with the rising of extreme sparse MoE models like DeepSeek-V3 and Qwen3-235B. Fine-grained recomputing reduces the memory footprint at the cost of extra recomputation, while offloading could utilize the host-device bandwidth to achieve nearly zero-overhead. Fine-grained Activation Offloading targets at offloading the activation at the granularity of specific modules, so that we can calibrate the amount of offloading activation to maximize the training throughput. + +**Features** +* Support PP=1/PP/Interleaved PP +* Compatible with fine-grained recomputation +* Support FP8 +* Support MTP +* Support mixed dense & moe layer +* Support A2A Overlap +* Support CUDA Graph + * (Temporary) cuda graph scope cannot contains the offloading modules + +**Usage** +```bash +# Enable fine-grained activation offloading +--fine-grained-activation-offloading + +# Specify which modules are going to offload its input +# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". +--offload-modules expert_fc1 +``` +**Compatible with Fine-grained Recomputation** +- For modules with minor perf overhead like layernorm or moe_act, use recomputing to reduce memory footprint; +- For other modules, use offloading to reduce memory footprint; +- Make sure the offloading/reloading could be overlapped with computing; + +![Fine-grained Activation Offloading and Fine-grained Recomputation](../images/fine_grained_activation_offloading/offloading_and_recomputing.png) diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst index 710a7caf4de..ac6d7cb0b2d 100644 --- a/docs/source/api-guide/index.rst +++ b/docs/source/api-guide/index.rst @@ -22,3 +22,4 @@ API Guide optimizer_cpu_offload multi_token_prediction tokenizers + fine_grained_activation_offloading diff --git a/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png new file mode 100644 index 0000000000000000000000000000000000000000..6c8afa78bb180a0815aff02693690b864e9b01f8 GIT binary patch literal 332427 zcmeFZXH-*bw+4z>P!Lfeq7L@*-C?8~5C;#x)#z@6hLxX~se14JQ{F&<%XMbHn z{-QX;L~;Jla|#O8GtB>->zv{K>l#W5iZBO?bAMf9Og{cPRsHqy&+%;bng8A~oAU3g zFBfK?{rmjFujimh(_@$-A1=L8e(p&@A>8)s`wX|W@iYa+BMOzLkM;b{Y|c{E^DAT! zcOzcEet(CTdP)Ck)tyGC_Sct|nI?BX{XH#RoPUn2RrZz8KNTrPak z{?_uwgRA#0UI#ore?V%T8BLe-Y$_=ZEJk?^1fl&QcD9YhB_$x%g2uqXRMC6qu0EnT z^Y6>QGWZt<|C+(SRPZko{>y~_GU2~W_%9Rw%Y^?j;lE7y|6?W$b~q=NX=~l3@V{{y zTuPZ%SnI?ssCy#nG*DHsshXO*>~>Z?pu3UEUQnOyH3j9xJO7_A`$AtvPKK6d26K`! z+`4Z$?a?a3(u!t?!%{z)E0r(axeuq@JiF%mV=c967$&$X@v($e9!WoaLk0}g4XbbP z{pruwLY%BzG|fBUU~vWYfi~s0y(p*c4_X*}wN34K%F7H9qJVLHZo~nRveQ32i?6jn zLesr;Nxd`q`=Db#J%hNusTQooQaS{BbJ;h1l%YVg8KRRl^z7Sr5pM(}`3=}#9ZKBy z%=GlwOeSGNB%iiXO5&O2BSNR74ytuGw2oYdT16G&gfI42P!L`pnbkI|@q3l3Ne-?! zDyhoCu#$PvW5Neq1**@k{8ICsf=8Q1N*2zVE*nq&he$76l`!&dwDK-pWd0EJP=_0= zmXd=bgmPY|{O!)06#8R6={x6Zsn$}5d6E0GZ9xY6q*#5|T4kFfg#dls9k zpl>snd{v9w!^R~Yg_;=5{>E}P+COxuF??JdKk^M!Q0NL-7X~#cF=c*swK#Bg8od>} z{;}mIrR0#Lqz`|WREH_s2T%BeYEE#sJdG}h^5SKIwxjXk_En_CclV>4?sZ@gtL0&q zNXtF|u79GstKUfgR2ax;x`WSO`V-E<>6GbBt7Ex>zfnB(jh#{Sy>p>gCk+z1ZfoDN zY#e+43npJ~QW9;N(}up5ZWsH%n@vamIH3xGMrDp^yr2PC6x2J_jd06yQSy@{pMUVY z&kV`lQ=%rJJ|^*P=e|2Nqyt>y1X7s|p#cTFayiP9nT19Z2QuTh=OMs*?rDwpK*JtE zFW%loUr|rwKWA{UzfTesp0ae7yd>T>$Y*9DaQAF8chs-v7}j_(64RIV-bpQlaE#l}DBJJHx)9K66_;g)-*ZwZEc|V4ia)G(H5Mna3>+g~ zTD^SolekVFsBA}LN+wxn({gk6RU*_)O0l2&OUuUbm!-7u=OijFi>ERMyVji_dPome zE%MXbVobV#mD`Cb!Z-gHodt(5`n^4O@&+WN{fh2ud&L!sJNJ>4rbf zCcHo=sN9lWe7t(Tk^zz>ee|*iX;WCeekS2AdT`kKzc>6#7B9-rmlrn#hP0C&B)lcd zvp*D8dmo#SAF#s4){Wk$az0V8~>w+nrVLw9l$D1Cc ziD5NE&&w~fl=%JpUSgiFS^khCBlihV{}5Y84XUr2)iG78m^V<*zl+XHzmfGfeEv~( zw%Ni!?{VHz-cr z*y+ho_a0Ebipj=X=x>%C!l!mx-Q2S7FRAIRjfaCc;>#Gx;y(8Bi}_1deUt}xg-mw- zBx8!;afZW8PU25>+DTKasTGfFp&_3Aq-7~Fe5|^Nnpm-PAv{7?=@ZNER{oDulgSo_ zqua7Xomzvv``53FZVL&}dMOpSO|g;9?}YjbUGlt5UwX*-lMg-1XBFg%nhD%N>>j#% z^;CR8rChXpZ4gJJc^GnV4@kJqq(s;*vORGM%QwxfbkjRrjPjI@M17jW09zRXH^%9T zWgaaqtMVUp#mfWgbkd8HIky;Y9VxHE`E~LI0=pZh^Lt*M^PqmU*xWX-v#e=P+<)h?U!T5ayr7iY6v(5gmfUB;pP<35dhLH- z@~7m#L)QU(|GX2te!f-@Ku4qJS-TfhwadkjbiQ|1#lP;#%EypD?V~`^v(8ntYkcx0j#STyd@)@5ZhrqK$oqBO9nLnW_WMh*e;d%n75bg?Z~@A3u!28S_e67d^^v5V z#y-tqY;lj`FYsKvxLMeYI=@#^x{vLmg9Vi^-CscG6%!~rY3TFaMH3{Sjue1id{jQ= zfj3VXbbj!z)A+dff5@=%HQ}>GD#fe5x55m+=;SZn{_oj>N3fiD?xTM=@XoN-!ll>| zIfFh>Gp==gtCi_zTBORK_pJi>MRPI+Cb!cu%PH=b{19^);1g}ZbEuD!##e7$T5Y^> zB#*o%6;JkGR|c}V|F^Y!a_nTKrok20?92AV!{hHV|GzxVFMRnu3IUw?`@Q6-wla~F z>GPae##L}-BI#NSydLMjw>2)>MpyLV&(+v+eW~H}e$>8lg8r;@FP`~Rca{FS|0Mui z*+xu_Tj~EICD|gFM%0PxsqGL65Y(i6tMdPl@GtSyo8G~dRNTLNA43UIb~5a|626tR zBo%HwFYRq{8U!!MAR8 z$g?A?Rg{@rj-TF(<^pbvK=8?PTmK*CS}=pfQ&=j*f z_(sX17HT3PS8x;342c4UJ?6gemmK&T3_kxiow`l}4rWAxCs@U*{B6iC5q{vhxMk>%Jb( zBvle&^Myj9a|P7#m5@NlPo*nD|?d8O&)KS1V6__cx4Fmpu$& zFMFGYaQU&tZn;4g4+k<-WnB}7f##0$Pr zHH;U1)VlY~=&Snqex&Nq^YG<^4fJ!XpAjulQ!s;Pv(pQ;nPoeoJG?=((N6cD+Ub++24%9o$uXvtY;=qR2 zm(g|yg#cAmRN#g=Lws^&uhNF}`>pomoZZNNnD}X1XcM+F*n#Jh%a#Ct+G#mBa;NVo zjv@X*I@pk3!S?jdf6nXY6PcmQ&|4P_*w^XI9Df~Z=qr4Z)GnSA_`^qIySXJ$z%>ug z%tH>_)0bQpE_N=4R{=&jacA7WmJ}&F5GX(*C~!%g8t5Xx1UZv~yViL({4)box8KU6 z;HMSaL));Pm9$WxsN@&i4??Rn@65FZrXIpfG9Jch5)bPV>n zE+SyB@rEiDuV+OfCBoqlZM=89Sy29|k!S|fSIfA45)>koO&hzNuS$1OFQ|CW{A#yb zA5$O9db1BcAS4zf4VbiW*Xqn*FptrR8`5gcK8#$Vrv|mf@%dZGRiGxpQDVq<2W+!ig}))>)`>$Xfg?LEQA&Tq7*6jC&>;|Tu{xy$mvXo&jf&b#O~A z1^!co^QMRr28$@b_6L$_0v3Oi?-hJOu7&3_>PwSTGuMT#)8sw5sW?y{`l5hFz2(l$ zL744&EP7^(x~)6yJI+6E9?tNb#hU%Mh_X%QB1>t?j)s=YZULq3o=Gq+u(ehzJq$>- z;lJ{fReZ$GSG%Sm@3L>f={^9KR;8Eg>w#*Pel@6*Tg{`_mGM|e+c_sWX~m)plDK58 zU_U4C-^w%BH^RtOvch~Y)pg{e&%HQqGrjSnN{k2zfIiROv}Q$>Z1MrjK{|2L+$0nC zSj)d`sRwW_gn){YkG6Hhw@#toP&V6x`U!=f^CHKDwo>@ta8?~*!_3CT-L7?_3e9+ z58V1(s>Etf15Kp4!eUO-tA-jnR%DL%o44K|h^(y+w=T;%_mh!V2DO@ZyWCRFT6*bi z^sF2mYhZ7U8#ZWg`gqHq7e?2!m8s7}cwm3|wA)IK(_glyhVBJ{4oGL~&FVnTwh{Ij zS6Gw4RanP7q*HrNtt2M%7L(zIyOSuPL-z+_M*P|o73hWKg_g`y(`zQc={+rEeN1Yh z=iYJ`(~fWmL3euNJuRuU3mY&Z9-^LUTr%czYN${M<(rtOa5|ylIawI0<{67NJ$2S+ zwnAP+>YryexASMsdAXNFH>Cnacw|5lZ#N9iPMMZ0=56n*wm^$KP=LR^_^%l5#-rB} z5q&h0W5K$lKTw0|5ezWR)UgwKeWXiLTI|;B8~srshN3NH0pIYA0}b!If)iDw3$b}xj%QZ`dC*fx0NjZ~Z&J^D5pmiu-!>q(Fe zARy|~P{SkpBdH<6GW3~x;8*v#=nA$JsTKv%Zzox@SiL=io$jL=z&hP4MQVWULSzNcajlz2vz2}l3YyAbEH{d)(x~P0iF!4= za#^ZjE zLSMw5tFNzbu9jz&ApU{6QWnE~42h+QLNQbL8m_1kYZ8YKJXTtCMw4-&ou>QKHMc=8 z=$m$HV@7~_7*;@4z@t^4RD=02r7Ot#94|f_qTUE1XGmgu1^lw`ad5LKtKR0>MghzW zfx;~bYH;uz`eeD;z`IK-i;lX~!m4-banvR6sp*$lkKVKlU3@M%B>tH$Lu~xqiIWzr zR{bM1X9KQet*H7_8Lx5wL70QkPq7CL52Z!SKon_lT|xXjFOLwBZl5UUb>AlHM>oef zG8smRa()15A&_m(KE2@?R!5JQoV<$N58LJBzD*Q-W=>t)51I4>?}wEJCejtgR|+5* zG0Aug)<>*GXf^#KMuIQNoM(Q~ur{DxE6@4OzQ+~qUWiq_s4IBj;BEy&W)gPVQvtNs zYa@MB6mO-yc_v}np)2&KlHh9c%4+Y?_{x;o8g9tl|I~v8eU_IPGt9_qIlo^~wYZTi zllyIX!sxg|zB}(gBxnu z&kRA?6}D>wNYvL~ig)WW8v2iSDcd$~$qECTJK%-% zam~LMuiGduy>fT=e;FqA>g2rJ&oqaj5fRrfGwJ4-nVuupP7&4@W?kTl{8fu2Zl0sX zSnUGCq$4+8H1s9V_(H2ua_SSt8<;ZHXkC$-yEe%PH?=z5P++Y}5tna@Qne`WeHlY4 z0O9p^pA1rN;g)eLM41PFtT&GDQ=ZL}$=>_a9dt<9bt7E+I98Z1DIZB64_fI6BkGq` zC`=8HmCJPqPIzsX*sF1P2Gv6uhXKDbrdw^{&Mp{c+4u6%cSVd(CxXa^ zpOH)ly$lYM_?#2en;loX>Ims-g;q@cI57S$$u;kunDjzrelf{JcBUz=_N%`H6}JQY z!HjWTtL^azL+YW9@p0ThWK367u{mSKX6VxmxwK!Pc7<>jcXu(!up~6KIsxR{$f%seBLdrx==r6KNT>2j^}-nUB(cvCHh5HkOjPP>-s_z zHembF=uz~>R=}*R^^)h|GL>Tq>nza6Ng+$*Dagb`S~c*IO+U&g&%$t5N?3@{tg~(5 zI4yW9`tQQoui%*GPFrW^Y6#0lTgv!AHO9hK6Xo0b9(z$CE4s#^>nJy0a%G@D<1Qt; z3=|N==>mq`7Yz~Q*BtHHNXuwx72zPYkEew9_zZ3AJEY=7Q<-ku6pM(6*gQLheD8oi z!^2n>YEDx1d(=tzG5JQ!jnJ38aNwosgpWmJmRy<1N zl6(exeABUN<<*#-@nf+uCx_`5>nMu6E%8ueOmV=veEGczcw#GLG^W|ZUkuEExiy3c@;1EOfkE<0H+NUNw(Lg^bNb)B&>XclmLz#9uM37c&B`XpwtF=61 zFe19TZAYUqpN|#OYPDdz>7R(aHWLZUvKsi#j%+Xi?+%X2RlE&k+`u&}{kJNRUDw9h zg0OyiuJ>xaXKo6yW+97j=F$Ou=MQQUZ?xOq303Df1JC+1-h-@gw*rPt?&T}TJgjyr2hozW=Dx6Hu`mc8!blM_(WPv~N)_ z_}T8`PC5Yb{7sT&6jbu0oLZ>B3{aASOSN1Ib224OY3Uvt(jJ7ojrWS9K1#X)E&DRE z9_|U4<_(fRoXSH6u4Bn8<|PXVC|=U%39&N@1wWF;i4b0!6i#_>=~r~`LM8~MbUct^ z30-&n5WFrV+Y;Dm+}WJpW$qP)APbsxvt z`qCRMQ#j`G#X%9#O3umcjU6wm0d5kGw~qW^OOW>uy>fbwq(g>wPgsxnJF@@>>@<(p zbwc1=P;Hpo^^TGx|FbVCwtDIbmgO$AK3%?Ulk$BVAwFN?@3eQ z44|C-bPu<&}nl)|gL&CXuf}%L$PlltX(P4U=6<^g> z51tRL*v)_Q4D3uEO&YK7@ev0Tn|m8{g%fA49eFC~l7r&b2O^>k&V}Xq&1xcpuy8z2 z(946S@CXKMrT921-^+V{}pw>E=RHCms^t}LA@Z@?aU zr{dqKS2@NENt@@?8$ao6Eh2J-JI)NFKhke_#!mW=kQrJ1Mw$-@3(&ur3is92{DQJ` zQ-b`wFH#0(^G*uj-*t}iJ~CX9%RCPKGQ3x`M=bOZGB~{Z+Nd#~x>?^nP<~)2^#%vzEchhCMdy-RZm)EWy>0 zlovkSki1VU^~?AJvf@*su%#rpo1Clf&q;cm=hCz!_5FCO5aWF>y3hQLVZPCPJHp@w z@Zcl-4KUB%goN&hF>Dw^tLlHo$y5;))Wp zdD7!$V4=MUd=>)%DU@PbQny+4#6+-x2FD+F%4L@~oXRqe9O3(#I@Pj|3QD_HPDjE3utqKI zlz1zr&E`T(NQeVom!Pz^8?;&=^Df)+Yjv21d?L}NVbV)gHu0uyY2UnyXFxSubx?8h zvfdygT@$Cpp^Wac@lo6ccz;$(&wOLIri0&y_n0riwF zCbROvSZj18w3tx&C~e5zQMvDnPTi^w9+Q{psTsJfT?Wb9euKZ{>^9!of%psqhgHmk z`3bl)^LwB2dBY!k3oxvB=u_UT4DR;f@jlH5HnbXu!-cTFe6yx^WMs?nO(_kK$9r;u zwWR=p^ZxSAn>D^2BB4}z<;AwApiT?`cB+|x!E~7E6rs0nr1rq@VPw`DS!L;`6O%3K zpvIp|z&@xnw$QbR`7R-@o*qpoJ<0lLv8y6AXA!K%bQ>d@F-y!XrxdEC^r%6>^knlw57_a^t6uDNv=i@I+B4AS zcN&NcUn?~t05boR(Xk1+kH9CYE$dXQ&$!mI9p!l zz1LEAZJrzCBBcsYpO8KJAwFF2tF|}<+d3;Y+J2f~>Cu1$5uUVx^2*QAuQq2QCL)fj zyNIuQKZ=5|IL%hPoO9TvH^T@%zK6JZSo6Fyp{7E6R>QwyxUEM_3qnlk^-2jl_G-@3 zk_-h^%jxtH-iQWOx?qP0wULa>EGEFZ7`+9WRwAG_z=U%iB%`CGiZfv!_4CY)i;LmN ztbqwa>jtKmFNOE4${ih*@Q(VF8XAbq8gXbEs82{A9np-WCeq|OXO)w5I+F)sNvJ!R zrhUEHT)WeK!#zzwCl|S-NV7sMStRSm-Rna;+TH{|6AJ?@zKEQ$H3}p(LWPdfJ*PaW z+-Nj8w%@3c>)htDqqOn6IUv*}qD zTA3ruigt04gIq0ixrXzP@NnU9p~++Ttpt8CQjv@}_dS~9;ViA6cR!;092}_zs{*TG za=rA}yK8V>~Mx23JoE$xKw0X#nvPgOF-&Ex{b>OalH7$By z*KNu(P)CTrwL7FhJqCG5Zr8E>W|p6>D0-}Mur?Gvd_ZjAP2cI2yW{MDA_GhZ_r^`t z2$#bKxpUu}nTc?g7v_DkfA?Ktb37GV=k`zgB4YU0))Yw}*dANLuEy?YZicAGWQO{nndT-~_-v_cqdv~B5Aju9> zD_gry6zsvCb6eNB^p+%jmP)+^*CJlRsAS}RG)(-6-jUW2tad%53{+(zLq#jFuu({N z)@2`}*sETmUNr4dI%{=WuDvH|7wt(^k@z35WYzQiG%Y1ZVBfG6E2oMmK*{n&nCEJ( z)rxk9VHMpEFOKs019pVraa_#fBJhh*9D7oL`rdbomHm=G^SdoVnN>VMy6YG&qD~JQ zJLKXUG<@4ihnQMC(FG98=Qt7?=Msm3x2KZ=QxfKe4rc`UOv*N8v2r#UJ}E-HjdEW4 z8XGmS>nDx8$vde5=2NRVN%=jl#lfv~cUC;#@71|5HM}&khsR9~O=!C;XP*@j8Qb=b zvZ|qr1dDHYvhviLMb}^kROS8Z=g6g=2PPHFd6@k!FH&3*>;Gu__;U{}p-@a_66SKg;viSNC~njQ|Y&W`4z z3;&p;4K%;q0MwRL@owx0oMH%UTkCexaO=v zQ};Yi>N4YTjjU9n{b-{LvrlL*J(t9c$TBsVJZT~NE{*2P>OU?$9rZ~!a0$}4Hb!xS ze7oFB6H^mFrQZJQq-k&*-_$8>0Ck%0enz*7_3_MKBUn=4T(p8W4AG-6vKx@LKV ziD?N?^<4F&&AQ?pLpFAqr$=+;9=YPyh4)aBKA~$O28&dBmPLK%e;S#|uMpD36U%S& zeaSu|N4$C4^I+4J^oK51%%gIC*IGLY1QJ$t@?78ad0vHx=VN0|75$#aPMf_@N7tEX zfjsYlMoTisBTb&YBPHIKEv7c3e6 z%DV~l+(^@zKn@k}X=?yqY&2B)FNE9yiJyPE9ns@v!^IZUnoqKWx^2?OAG& zTJ-S7%PjHQ{lOweCcr;?o8*=ko)7ypd)Bi-iZA#niQd;h*|F?kS)CqzwxdBDt?TwA zZC(=hK0b$%bw6l13w=4SDOA`l9i{nYnAijVv>y`b5>T?E$C02*pj%@o3{~CEJ8w)i!odBJsIv+YMW;#NS{Wl39p;%znTktc}2=?HYFHq zt+@m{M9KC|e{GFboZ47YPO06LpAt?CC4eADaBlUFmW(S`^jD!7wv;vQ&|a)%Tad`k?lZ;MI9wGSjc}@`*J!Omfl+1 z@2di0Kw-WX;FAa9Voak-LqNsx&&H8I+jWfW zgEejhY^NM4Tsy3_6+u9!z98RU6CpWT@z1K9pLq;WwYX%1>nBupc~Z?M7Txrfs2Ve zuK=MYFQjg;dfX(u#x{;_pS^wcxpJ%DhyAq~k}epk9e?PwO{de;qD7l0FP{?IjOUqW zq?Tu-!5I=G0ASpix-ma-ooWtDpojU*T`w6Bo&Mp?1JBM~HH^?0S|%57&q>rD!DSC* zB-9zW$9+-@h^G#lI7?zP-DI4QNc6gfF~Hr9U9x!Aip0+J5QZKyAZhi2#8(?;L>!zE zNQX5JG1uN!ug&=~-Bvm?o(r(Qtx^)DK0Vx3OP7hq>^P{|*c6eKncz8c;^2%15~9{t zzYRkutxVwSO3^|Hf6#^1pGui1E1e)(6``*<_*}T$%7M3SrfTC*8#47op4TG7vg^MM z3L6^-R)FR^HLuhfMvX&}RM0A74l2;1YNx}M{sisrT^e@?2>q_uvrxtW&`Y0V}MYg0;wyi_Tz%wH@n;VopYNJ=k}&6 zaZY_kOc3RJp!fmUyPg;Kvha`c~k(*R{`5IQzt(wxHoZY%BHiE0z z(+ZHUtaIDzV<7oZa1YyLPV3$AR>!mt0EZ zJCTQ)l4qTN#@iX9i*UCV;;vbv^N>)o&Z?!)$G5c?IrY&g45Yzmm$g>i1}MFt+Qm z{|JxmqJ_BemKRv`MZ_a^;wPDTmJI3D&5Q49T)J&o(tNUlNK_F7-nl|Qjod#T+%~jC zgRK({tc5n@+A@NrD?Uu_bkK>jh+scFcTLh2_bK`Vy)NqW7(IK7)5rb^D-~zF18=3( zXxr46Wk-yEn&3{k@<=&d;pV4fS7713^D3D40|;IEJ%+=q#WnX@ZB*nzhB8s)$9#BX zbK0JGMErZ3n5{!5CydAu51nh|DG0LrS`{w(;#%!0M7Ymtbq4M^*s#eID4q}AVG9b2 z5r`S@A8Me@A4HH9fnzRGi}EO6M) z#rSdO-Im`v%(Cfp^|=fwr;xGe9j_hk-5%OCkWgoq%SP^<3Dff^@YGvW39kvy8NEvS zanCRf{BTPEV6FuYUb>=&smQf{HfHko{{lej&=X#-av*(yU!saHW;{GZzJ@f)q&Q9;( zmzf$TbP-doyp=pxQ4}}z1;256711|!&v<@t8J6p9Ya(1eF5JwtX*23QUD~DLA7wU% zb)W6)AgWn{x?%}9t1!}+irJQp4v%nrySNy0Rk8Dl}3@@AVK`K0tYFa z%U<-!2+vYiN~a0_b~z!`0|}okHm`w3bY+grZ&_)^xhIT7Y5k{*BbN>@7*RP4q)sMP z3M#^)kBrQ=4`+pSe#-UwmMS_nxo<5x_E%kS9F`gXH2?M7S;*%8vW)AXT zl<4KO@$4v(99D)_dw2QHs6@+xJVW200TWB$ht{XlxHB-#b$ZeF5u;omP4~ijI1su! zmtyl-@=kQwY;-EYGE%0y>B@^|6EHvHEEz?h-}8if&F?Ox!B2;~K^70JJH!c&+P!Ca z=aQP7xmmCGrVG1WBU@ol`K?=DQ7rOuj}B>^ZK2f zQ|uhN84{{lmrHnc*I>L&$`!CxDY|J7uzxv@nBh5i{QV+)sA1r^*(0@@tjmJkb>VGC zw>L#DtfRkz6z`&%B(kFN29hT**9oz9Kf*Tuj8(=rer+IEd(dQV(@#SiEvE5m(S7To zE*zlpNUmC&+ROOerC@4{j_ImAb61PX*^%#@!=9m^x(6wVu!NMC| zuPdsO70cHOOxb@vESr2&0rM=88?{bPpp5W~uF<(ebRZk5l@umY=$h(pPHkBCHI-gK z`vc==l#=iT4e4c*B1&JGNKu#3?_ZQ04>L^m@>yk$9z8AHcj~_KZNDHR<=gHl&2g2g zIqG(C{bx(xP7Od#ntp<;?|bm0V>Xv*s&9hBmuoZ$ptpzl#DVX2FZI>t^mXwa7CBy8 zHbeovJDzh^dh?INL0*O8J`em0S7&qHcizCtv*%Fh;S7lT48)43In`mp$P3hmNyCt< z-I&x)SZ>K=swgai8;&-&DR|JFwF?cSVcoZKiPU(djzAij6b#^nQcqPDEx26NKBB5! zRt>BUZ=~8g&lAz(_C{w$BVD-i-1@Ah$<&$B`ixx4{bg^XN$PcRR?`=BO+bHU0~@?P&7)Aw5-Gmo^yPb^6$FD~5P4ATI? z7@F3x-r=%m^v)Os9Zd)T%KK1ZO3InFkoPWOgX10#Z@XIGI zsXzerfEAg>olP!|YnWG&BaZ>WoSe=5CsHfFjT>lxpr9*csQzN_=oSn*xc!jy&V?0! zR4`W2RYr(P^e!E5{&EwY+$Z|E>TtD*b$rr}=ZdzetYHUDSr}Oze>nY8hu{^5%|*|%EJa8kQF%ReY-Ir^>!23xENmft2geZ19aGQYX|u+xRIY}coyXesVd%6M<% ztj_e4QIZ3h*I$=TK@XCBOtHwj; z(&-l)GEokvR`A%yG43SZ`}x>3$jYIP$y=-8;nVIs;KwB=m4HjNB(IV(lW-4H*Wi@J z1&ot1{#mp6?pU`MnEsTHfsw})GNrdi?z}?IGT@yQsXLEp**Vp~4ZT3eHS0v92|_T7j-EF5rLbyuobePB1?b*-_)YP%#lN1kFRr-L4yo6 z%!=bn&UIBo5QFl?ViglkCcy9f(+@F-I~6hVVBhp4E1f}X6dCPRE`%7Dk1R=}Q3X8y zMT(HrrJDB%Ck4H-axmvYBl5Q6_2Uk!T6tE!jlRI2kgn(b@?c08k@!b2(@)aO?ZCqo z6^7~Iqyzs0xZ7W!7}c2utVPE3dpM8%JE)* z+>4K*a9RJ@D^VcuP&nO6&nDYaBI^cI@z0y2QH1t>>gPEihn@%Wt#HY+d#g9e zapi!Tt$FRL4i1y@sRNwe4MsQc1CHIUCLCh{`?E3ef%xeed$>7h!&31R*R_3tfmDh0`gbk^YTUSqa z-{|Ms1UvD=2Z(|`r^#~5kd<^Rtfv!`6Pe95_6P3&HdwHx*=7;{Yv@!un7q5Z6=7Zi zUUuI+^X{(nYG`R-cTGR*PoRMe-~Q8P!61uWP&gHNymUv%E#&)EMosxqK7ooca2tkg zm{Do3peNcpQA^i%&^_xzRpprt)l(r}=vcMYG>kMLT?B8FWE!W7t>z%e)eGo$;AvJQ z(I1o{CSr&~`bk2B_E1jJix~ z&Mkd>CRhK~23Ysfd(YBlHJ<9|098|y9<+wbf4j>sUgp&9)yR5tM*9Kf!yE6+L_9rl zDeh?-ADKJg^s;qx*`at?ngFw+%54$r*`BDM&vs~;P&u+)B{56T^`Pcp`(MTFP$iXve%KB1-3!X`aG#0`6-?UsHuhtTt;0pC629@bjn3 zAk(+Ye}v8w;KGzo^kg5u;x9+;k&6AHIw64 z{x%nR0WyZCo|2r{x-@KZo-2tZ$*v{;TLGR6@WW&g$QH3x7xV^8#Q-hX@iWscLc3f# zU3@St&?H!Td(~?69aiHYdF4sW^=Wg$(zgv+6M)wKyz>b##UtM+&{NYXwNV0*s_53? z{sZM=-+uKsNQeI#No8YXjuJFln0Z?ZRk7(hK;OHhF=%|xwpxCosH}9ekq>lZcm1y8 z>KRs(W3>cQ=8{r4P_QL47dZ*l(2iZuKo1Vh9{rR;@pMkO4WV!1&%h!gU=j4A{2k;e zt$Z`AJAyc`J2No=h%r0~13itChLY#7{t&Ujp+t!5;pm#Ns-#dt=w6RSW>9>n%am|5 z6)=AX`s8OSxdsA=NQ&9Q$WI$JGZ#Ztdp|aW@OOs1fOuwG5gY%i4G^ zLk-$RVwAb73Kdl66V-kNBG1#;@8gAZB&5wr&R2{PO9svzG?|aTAjg{Y`7xg8|MP# z$UE(~t7aWNUS8fwfpneGKC5re{y|*Bz*({jw+wTz;4#RyBJxjcMQcoDOQ4LHKd8&T z2G5PX)%(3AUE-YIBO8{!U<3Z>wxd-2`I#Gy?@1SWfzxMshn}TP@PsNCXzDSg;m!2t zAU^#mGY&rJbiiPC`G81N9j7J=$%_YtYjHzjB{X$mMkpKmS=VB>kyd=t*J zvI7Dd~-6fKojJZ4DPJGSSyuq9lY>d^pkbloc_!H(#6q} zP-!WF4zC+!;W3V(Kwt{F7o7xWfWD7UQ@{zPcAsA7JXn7#9ATWh^4AQFjjKaYn47sP zhwGaGPvPwxG99%9+0S}Wa^UU`RMP!uB-7EvgJ=~8O)FmgC7c^5y3fiu!k~8b&9R?b zC%xen*OLPVT3MY%xetOxW7vs(2^?`Jj=#b60}#Z!=NJ4X@oG?Br?8&2vHW#nKM);zyYL6HI@(ii&P((7SsM zxtSr+QIV)L0kG#m1>0()?Yt5Dp<#uZD%~@yij3g}!=ipQ0GKF1u*OcSUr9a;%6mT) zWXhwxA-CW>(3KZnyMMvAea>|EtioV*2`@82UW-nf*=l-mk#`@NexKHx_0vlsowv04!HpjQP=kI?DZPhC-0N@@At(I!EU!5Piiu1-5TGSJ zS2MRK$B!R0NzL;LirC$Q&~SS8)nnUvG2u`6^@69pA6jzSt}(5g>i%Sdksf~k3<%>j zke1b@y~xTq(~9AN{1nes434W?y^jCoW6MK71G{|>*A(=Y#3p69mT!}rb7%U(CN#LT zErH*QMh84lFv$yfvF_@so=mz$*Xy7JchlRW#?>!PGV>newd+B=iE}xW7KZ7!_>B65 zyj!sRk>pDLO6&V6-uCG$luh2;(|Pu#a}Gf5-J14vhm!OJT5Y7fp55kgHdmkbvFqXH zexv_0rr+Yi5$ad$yY`H}A?7Rx``m|1M#n$pe! zZb&q~JY2~>%S9y;M1*vFInKU&Tf3o>a|h6A#{e=kB~LdASm-yKz%*kC$U42%V$LDd zgWwx8X8u^3`-F47QhjthSQQqv}|5-<5xko80vJ-#Di+9gnC-Bp`QA%%K4l4 zy&UgRn)t~ukyFzGWZMJofp9CG)X_=Rgg@zN+PJz>mT}`Vp2}PkR$W8g{J@~_F8mvI zqUyna%z?>({sPCv!^f3(iGkg&u?_?&UH%ZF(}BJvyKg@ng4nAP9F&~6zqGj@>Yiut zn@6w}BnMv)x$OS9SbAjqZ6UK4xWPG-R=f?>kx^b`j;zSJX+=Hz1&x!leXWR*ev~0^ zuoqXPE=-9?xcH$`ZTA0R??1zu+P1%8cw4rJ4G|TkDhNmyq*s-u z^xlgS0qFvU4iO8zNbf{IdXpMzQ0cwb&>@5t0tr2kJPY0DZ1+CpKJWi}Kl;ThxYn9; zuF>Y0WBkTgB%VCH*1LfVyKQv%x1fgrw5WDF#M716dCA^=^5wUzpuTUcaq8i4NTeSmVt&wSx%<^%)B+y0J=~2{03x%$qe56jz*VF+P?OlyPf_Wy7 zoP@~R_mBG|e#Ic~Y=&;)qj@fQ;~e~N7|BP=&)V~eFhA9?D6wfn(MI?^ zv+*1=y1X~*?=EFDNX5tFBytaGO~{z+)4f%BqTuB1np6DOc0;0;6VONQcdL{j=da!6 z!$?`}(x0-0EUBkJ#?~@@93^~PV!Q^HXcQ7Ou@~{~pK|M>A=}46`x@&{2x(a{wg?sH znlEGRSYd6B{Tn;^x~V?c=}+!1PqGn6&#z%uuiter;Bq@YRsrWcYc8NrfGa-YIB%ie z%(6pm#=)XL1{7jSeDZce*mHzFd?pzB6gQ8NtThH8qEn|?cn7aHgFwh zs$^{=JfLwX*p%}mzWf44bkr+4)!MaQjZjP)?MJ7&xiq4j&6v?jtJzAH6U^@r%D)x? zqz?1B%YTY--=jEghe3Lu;^EWxuSG?;4S*;Y8TZ7?-a#o(+#V;D)tG*R=lrVFRZ+o( zqUMieFVZM~XRa;ArceNJm2JZi)p0^E*-&G@3+Q)|Mag@qhtU-69PZ)jyC>wVz8>BF zyfL9xt8%N}ItG$j(9u*>xQ)6?eD|0s_U<&kRF8>B83I%xul*Li`IgOg8P+aT%;{w5 zx*o*18>)!04nG`iU{Bo-(G-7C0m}(Ho%uNyTbQ=7b6MW+9q%bOoFm80Rcu%R%=f+%kt!V-&IOP*>kV6vB(_FYZ;g)#U3$&y~H9+%vuPgTise; zSxx+wG>!~DDJ4o{SI2NKL!$@M?IP#J@wGI8gFb&|;lHq*k8+mUVRPbfJ6&>i&wOyS zUbw!Xea|P{(2`MWYwoex`n$*b{7aAd^-7M|H;NPUp#{8<*k@KQTC^%1sy+F6kC&si z-ilR4Ku8z&Nuz<-KTXb{*K5eNWsUGL4<7(yfnQhB3WII(WQh+}8Zlfv%*XbGMztj$ z6}~$M(}^dMY4_rdz?dMa9X=^q?I>x!+GwE9E<;69ydayo1ozA*T zR1mj#Y8=W0G9t)XSM5m-z3>0pJ|Uv{5j0hxNgxe(P}U%jF_^VpI)7+9R_tUwGBZDB zaaEXp=2e~TGLf~R-Yy{`CF7Cz=kTkPLi#fDh#z+ArU^@n?%YZKK+HymLgs?)OE<1| zw-)d8Br_R>n)6ZPeuG}h_^_`$cWqjDSl8|^PJD;gfQ<8E5R2APsS<6-2*%0RqkBKYa*+#H`X-~@e8*<-=$KGBz0+uCpt=LayI+NFHFc zGio1In*YMm#V5-9I&cmw44Me*1hPnW!Q!xE+zWPeB!3(2kz(8}{~{L;nIxlGwa?83 zEE}x{MCd4Zh^j)jhR(~OhG+|U{jzJ%_@`K*H zFQ95e?~$Q&g~i=DLTB!-;ogWn|0$6KoJ<&b?o(!g(1h@I z!z{T5`;QLZ;i?dRFJcg8aG_+#HL|GKgz-so^TksG*_r#BgY68+l*D@lRtd%y8bEq! zXZzE-a3-LM#chK9I%sQ;NkK5vi@PWj5RMyNK4e5MvCsv-s8nOxJG8a_?J;$a4!lmA zQCx$fKon@!(6w9)DPK>FO>8b8OLV?jD>lHFLYIR=w__q5(OISg-o@tzjWnh`gx9a; ziG7eHYq`=h6&Fi%wlZm#KtoS;urhU!x$S?A9IeJtMmr>`tLLCbS#dFWo#k46Ie~E= z(ZqVT8tw4==)(5H%O?CGqpPfJtIoShbb@>#McW0s^B1*t&3pu);iFUi70CWw3=`C8 zQN-mgnBH*3Wv|ibp<{9XKVXz??AkqVfTFi|><&jeG`&Fij4)6>^BtnAh}Z{*dN1?? z0I1<0Z*SlP)Z!gvk~%MMXfN(@jJGJTPlLC2 z&e7PYD1C-^E5jx8r#`YywH(Z<7wNSq=I(fSjF(t=Oj;b;$!O^y@`_YGWya<_H4q5(;YH43qGr0#FHe-C$1ckSrm;NDZb~cElGiDlY?Y$)%w_#hjog~}gbnwbjFtPR za|h;Sx^>&0CLD@NQYN~uL=UaCAgY6i(Jg`l*Tnt(N9yfeh1<)7vWBwn^{y}E=uPQA z$1>}fEGl8V7?(hZ#GwQy9ptC{ks&@Zs{+f>V3yp2+>5qFm>U(OU&oI+)L8`^>zQ>s z68JsX>TL-t4R|g;j~wr58?_>9>`_zuv^Kc{KKIK)<#x{$_pWfv;k=NM|k9t#@CQV)A5waXamt%xl#-ueS1spOH%Zynk<)9bRL zC;4$s`=CDGcU%)$Rljdecl-7agru=L&`LrpzDZcuWENuhJ%KQqp@6^pHqGk-L~0>)rl@} zNl#vfK>f7LTPZ|`T30&H)jy3q@aqNT%-mc_7l%lBZVvMB4JtIIhy9VsjlA0Xn%&8# zsab(Cc$rI@dnQCS8G{t=>zSFU z+#K5Iz0Xr+N0I+c;4GUuP?q!QkYXyiJ$E%PaSLr z=Gl$bUi7uezulv5`026Hh+!e-Ejbl9cMBXsYmJ)BO!&6N2)UOUVv*Om82-fTZYrHY zt-)+!j79|ar$ZP6(kGbmlV&#Wu1;f20C9d;{Mxn&HLm#CeHnxHa;Q~ay@c3y(nkp> zeuRB#AHD@ZYCYCErt7zmPE`jb)*eKMXoI}RN-(Xjqwe}jt1qIr_bsg0w)YZ5Ai^E! zf_5%7&0$uc8VHg#cRMieu6hLhg9(+*`g73v48<(Ad9QWGs{_oK)jW|zW7PhMXuyiN zjn>DycHcy>*h4GHV`|B`Hbp(>OqVtD7yhU!k&%~srX9+ZlpfbA zADZ6RDAb7YJjRK*0*-*b8J9}wCn`;)PCB6H6eJlflKQqG|@KIpQ&dgJn&$T6*JPrmeSheMVq?ils`dt&85&uaHK_| zNZl4PDWjK)MrQA*wf(huPr~m&&GZ^?kaB^QRaO0rk-{}-=2=4-4zKoz<8# zD{~R@p*W9->1oo~>c9&L) z{o_|HtNJo`jr8IR2a;c}N98wTuq&1sg(_7d*`p+9y;!@f=w}i&H0WQ8J~4x4ykwbs zjxB1DRNgd_iQUd;tDz#xp5RL@V98Nze=l;4nggAYz@F8+>K%T4?^!4X#j1!3lAu+b zXv({v)}$PG>T0Uj^SXkmu&i|zyVR0x2PsS)1lMqqk|SiEBy?Q8@JQ3(XnWRURVsTj z(hdGH3HzI_7nKr5K{CcQMP8Qxf3Ond7}-(0jFVy8m3_~B$sG}|C}Ax5z@hS6%Veeq zv!5HZ?rj=St+Y38CgBwA^_lIU)U;m$iEEDcEyqN)0SxRUb)h zvpf(JYH482UBm))imOEsXHk4+DxEi}EsF%*t0zph9k|aB?#-n8BUMsu_!*@#H7^ad zX~D*+q8*n#F1*iR$?++<2*R{3s6LVdM_wBCPqpnuk%CmJb_!B@B1QmY(!8}*K+8u} z`hXrHP_q#BB!&3Bn+USp(x7e|oEdp76(lWQ)N@?b)Uh~J17cXr+k72}*y(&+v$Fb4 zL;TSrw(ZLDj77Axj8_U`3Yz7Z*GGSB&dPH;P3di7E6k9yep(^%*zLUVWkZ%?Lfn&q z+P(2+kE<>^eB0F1vJEDAm5=-Q2n(NGatxtc%prVwItYv1I&l>TlJ~pkmA&ig=2!#g zuiDNx`xXsU8mX_Zp3Q)QkUQBiGKEgCT!|gGXldLk7ZuI8cBmCyNa|0=^INYU)ci(C z8EXq{ZU&r`$p@2{O*Yokw709q)=r1Gl+95Dg07(e!zYo54ILIrWN%KXP&;+>;mLql zr`6~(?FVy*XMJ4`C{QfXaW0X0(qW~$BbF^C(=}i&`P}qXw+aS7RF{^DeEV@7(^K%y ziLjwZ>m_hk$z6)~^i-mcW|dJv*Nj%;I6OWyL&5Il%tB+KZ9PPRRat(;9=xB%m2r2) zhRVpdp0hI%@2XcWdGGauA@Z{%AJn=1HNl424xT(r{I;2Ij1am?t6@|w=`p0pag}n- zdehHgE%0o;a`v6^-MPNNfEB4_z&}!9?I1Q;S=&@8FXF#OVD8p(YT>)5ce7Eqsj{ru zj`qgE-cC{VVQf#1hWgsf#0M=Pa+MyT@dml`-}m$z;0o`aBV zK}LJ9?Z*1TH6S)-oAF*)?>R(xg%%qA09m18qxT~v0+j!2=CyNlf4vT1x~)XFR`t$d zG$c~@0_U#Eh)eDeUPaeurm0}1gi4oAZG)S7tNN=$L;h)>hUS2JB<&%T&W;;6BpTz= z*V4P=J=i|y`Db&-V+iWpNkcZT|99_zXIobdU19lnI>^YTx%6arX_d*VT9z zwxVYaJT-7Q1~4@HaPaC1hm(bV_aN+TebIc&&>k7CAOUtv&g=Re8;s@ zmwMl<<5eFR)T}f>at?@T*<^Y3&~2dE0aJJHj!`)}lTFZfiN@ z;qr@+UYLvck(F*V~W-6XtRo&&b6=L%J$(`+0`=YWP33Nl; z{(Y*5xpKa<-kCRO%SmYgevWw3g>_kxe^8JE!sPVU>oy_$oa8HVdH7eS;5H(86YVQ0 z7RfssitiylM+)Hqlk5>HJH97@RQXh@p4;bdL>~qhY!P?Wx|5@hn2gu%NF}`OcGMw# zK`7a^xM#*g4)lo&8m7NT5slx6Q4^R5w>lLY9q$riR5uPZWFdHl0RBelrIKk~T9-@_>f@BKd3vcg8_ipR z!IW6sw~Hv7V&ACaoG=$i{KzL^pFTk8h$PyH-cJ6 z6*EH?4env$OG0~>iZn7Iz3eWIHTcZT)yLoM?2lyVEUJ@xg92r zm1w)CLyW=s|p|e0s0MWbhN`&_ph*uV?0xiWuHE9L4h0ytD zhfv-7gM4`=@H`mGWNTJk<^|osqL<0`x^dztnqgqGK@2g}4y!Yx|D4*Xf050Be#6BI z#!c8YYGG~>H2`aoi6oO}@7_Xo@$SdYG9yIx$H*K*SocjEo>Q|(;-59;u?URQZg{sk zl!Veiy~OIZOhv~7_usfcE5u+K-wg3NF)P$0#JkVv4p*N`eYz6H1on%FAsSn`b{{P+ zxD(Ze>a}RITJUZnE$cjZJ=skOTti?z8*^;mWvCa|p2T|yzioCSL{*Q~;9JGo^;n>L z#-M@_suR3^PX#}71>T^uI;vWfVfz+WMu`4AeZ#6QZzY4QDngeiqO!WH-=!Yl^_V@j z_49G_oUUs6sxkmcb-BApU_yD=N;TlvbRHiVO3rVS08yMK@qyb0FRbeQ%Zjb)@#vZ@ z@dB;=D6p1_6IuLV+EHR6QlInWhkvrJ#5&233$&9NM&yq@Pgsb{aO{%$c{VTw#^q~WSD z3ya3u88COE?)M$_mTf#)9a&|{^*sn1m*y+I9_Ee%pK~KxOeRYS92#CJ$Wv_a?3YhI za@s}W>g=gBu<%0cxjMSh)IRv@O}?i10I*-?YHiT9A}yQ3I04(VbH+fGsS|HKS~7NF zTm>9oWFa>WiN_XAXM77aQxJ_Av?wY=LiqU!O@X*;k*m+AdyFmU^IF3f(R&FEpX_OC ztfeIvZit3%8Ldx+!Sx&%n|c8&D{z$#Utw5Yxp3scPv%XF|Gsg$O^}2EiEJ-g>J_gi z|Lmz%-mSIW?`QZVACe3^26RPtz|aG6VGr{Fr@x=4ts@GTx5!^Jh=zRcqvhZW#~1AY zDMZ9Nxh1IX2%b}=aID2n5Lcw+4m?0F=`ETy_~1ZzA6`bHnTw~U`?xQ{&ubrv6WQNS zq$1K^lVT$3WSHD#&O&UL7EMn-PYd@uy<#M&Ca?_xyr`j93ZgB!)pI}>1c2U}?+ z#L~^GR_f0V9KZRzzw8>Z$h(z6SU7q0k%=<5I=(1Hh%8mP;JjvwdBFCks|fA)gtQl> zl580fjl9%(6W)P6cMq*^+YVf~xLd>zc~}f%$aUIVUC+#mXI8|z8o_SUVt@rISNwP-tzK_<1p}&$TogC zvrBfaEZwRni^NU}x^mRp;znvqd=S z*xse8Ul)cvIE)5tNl%AvkFZm+C{Rx0YBN2%x;p#;qcC;`1wNu)^#-+qyTJo~wd$_M zBN)K^YjY)!c~T*GUX%kpT!fk33T6)3YHfinfP2MZjO=5Xm*n;a>>cH9rz2TFS!sfw zf~H*uUBMq>RDPow=#CNO^w~gq7}3D?iPdD1UdQ&7Qqjnhe97l??y%gUdGy+~b0#3W z1oMG=yW$5VPU~rLGSwx_*b7a?ftfpH7oF;pi;!Znvm(p`d3+yjC%}>KMZLShw5bwf z5#7Zr=pDS_aXU6<#K@Lg_^w-YONBc{b*8JP)(4%R_7ii?%hmg?!?w@dfv)XbpW{n* zR3{W>NjMH=1_UdGCs@JZ-vn7wvV;_76W`YZx%XC?!1EI-GNEU$dc8B4>HOn{uQee+;IVm$#QO1u=FA zf1?@mx~HQSGH&Ex&=hiYN=0kd={`NsLp`ZTNN$s&X*c#8UKQFC-ps1{?KqWF2Okth zL^l8#} z*G&vV%F>w+RGVn88k6PqB07rB6J(3B*g|z`O=f%Y;(e<_RabnV`;`hJD=BtGwQ%g3v%?SI_& z0La0sk|wLz3<)qa`~>~@UQ2fO;fy>JUxWYy5Y0TdH)3oae`rGYXgOGw+3j!hoaOlZ z+w?4@>+Jni;0lm~dU-(~`TLhY8k2t?>~H^MpSMvbe(?Zg^68zEh+JApH2_agPoEMT z$-l-c)BOti*o?qL{d51-A5-REoe@3!M@#fh<=3Fzwfy*hz3s1LXgX)8!bx^yFi&Qd zQYy|cfX~AgrMM;h*#Q2J+ksA)8?@eZF$}n4>PE4C;mPhq%I5u`i|(rv#CxVo`g__z zml_X$%7OlgTl!$|)Jf~cN>t5*{Y3MmB|pXiQHlAo>~nvK1p9x~b{d`hnITP#}iR;OOADJ7IQOq)<~t7nGsfQk2m&+!T&J3fWBrF{83XUpw)*5}kmH|hxkD-@m+^=! zTL^;-%)2vG@hsJde~AOYK${?5YW{rm_(9zVO0O{+{1>La(+Ge8fO-|H;&1#R?$0j+ zD7_2*nm?sM|I~yBIzGw#-^>C>Pu;24h^J()(EjvEG`Cmo8Y`Ii5IJ$_Utg;1_@yR& zX^!os&$Hi!&RkqoK>tCi4<q zv(){$rhhzlHaO_Z6CNwVqfXa5sby3+qN!I=B$gLr}`NZAI^{s@zwN&0;PpC_!JO>Bm$$U)J; z{bJoAJUfhxSO0d`(^n@6WRfD13IOR}e`y71A2QJ$@q@+>EKc9ae$_mt|8K}r#; z2qR))tMnJ1-aB#9h~l$y-W}k6-Lq@dZ=ajCEt9iIa|FG}vhi2ek@FN`z_dClynFOl z9-Iw^0z3|b7K9%Y`svFPn1^riaSA_fdSyIgR@`2vpYLp2R$tU4*7cVuA3T*m`&c#p zieH=Tm0P{OU$9HK*Zwl(L3lbG|9Ysu0ed+83hQ5=c>Sf3bWifIam?U8z^%-h*SjB= zz0Zt7{zl)wE%8sf{EI03y7Y^5zog}t7W~qJUs~`>3w~+A|4|FFve+2^(F@>juGlY^ z<8Ln9FZ%oviC-e|OACH!!7nZNr3JsV;FlKs(t=-F@JkDRX~8co_@xEEwBVN({L+G7 zTJTE?{+kvw%YU@aKf`Uq7ZPvxW>(?;9d&nlC;i(162OOWax%kkD|f!$ zRgI|4|F@9$(=;Iat{7FjedS*U`#C-EQFi7?QhYwCu+Y)Di{pDHz4uPF2pQ_|+r+kBs_17ZD75FoD)Q zexd(D1E02AGu^C_vv)_&0NL_{2P&*to-3RMCMDxoZ zKF-nnO?v-FvAYosuuE-IFYf;$;}0IZ33U3Ogz8^Lf2s+nhisF@v;PyDjv4&l>G_|y z@c#z}`!e2h5=zD&gg%ca&u?b<9|`?4d%j4MKBkMiyPs1$l@7XZLg!%B=uIBPC?Gv? z#lfi&#VBz@jT6m*3(=hkUU|fGl;a<*WSg7OdG_6iz(Y@l6_`MIj&jnM=OVIlXWpWQ z8B{ip{LJ0Po{6@z{jW@z@7oKrx;MYx}w-?MT;SQ1JnQig^@l! z0Qb)%ykJF=`fSV!+$zl8y4l7X-K2C$>ZA1yFL3gW6D==HCkmqfMQ;A9wE2);SxWi1 zt&d)~?kbm^R4N;U9ZU*iXx?(qUr&gg`QjujDdF3htefn41lNEpu@dYjHWVV2vG_zzQ| z$m)~X7_~lC|IR&gkZV-;e`dtJ%;PO8$|?4S&BHF^B!dBMTdFqRA*9kVpzoOSneQGj zXLcs(O$};QcL!c1D4DiLjXF|4`5#vQV}NEIz>x19C^z+?25o?ESK1C`hI42bP4WF9 ziBHwfCmqCqTckzWsXZt(s!iqpAHKjJ&wXFO0br_^TT=TUJ@pFO`I;bC()n~gUha>z zEmgj5j0%l3xc&1J|Bc&XzrcAn0W`|uSw z%nwr&kq|X?zBHxN9n^xLihf8}1DZOYxJYDJiRCX3sW@o8aVu;)fE+KqcC%QR!9=AH z_>1!@+k5bV;+%Nz~LnsvD7K!)M^}Il%NVP>gu`_ zd9*De;AV9_fK>xmIcD|0N_Cu&Pi?DOe8S0=7V~;Z28!Q&+2U2Pv;Hud38MjFZtpJX zVCRmNC0CEem$75%_P<*+gPcqs8S8qgO&@BLfvS4ItGbs%?EPJpr1=9r7TTwCQgC6@ z=0;>E`d-j1I6Uk0@MsKTaHjrbV*HWV@2?_jW3{sDqDDkyR`hAp)1F#rLWD$+o^nfd zH@=ljDT|taimc(m1^as(h?w!VXEE^hJ^PFeiQ6ynSM+|76}&6_0ddc`<*paJw=5E! zv|H7h_1HwxmV;SIov2hDJLd0H{lOrIKy5wO{CfsBu6LJgcLW=gHc;I#7u}($e1`GAqEp% zPhd9tJ|X!`2Set!Rq&*QVnD@IW7JKgQuz5-+9UQ?-i%MLr=$?8W-gvj=_8zU|Eq8Q zT@jbKH+-Ne4)Q8GRi7c&M14=YA^dsZE6_ukrC;9Yb+q@pkwj0La|P-en)i&>*^I%9 zZ6-=GCtzSDy?W%jBdYmOq|id-{@AYNt;@M;`ishB2xPnB*!3L z|71x_VVnmje=omsvy4uELQhER z`9pe&aLSu7HjkFzBH;FJspwkQ!*p^vvLT1yMU*#RZ0)Eh_eZ;yI}{nbPnaSvhX%Et zKZ=#UphZVd3#WIGmNvGT%6&CQ(+Unl@#NmOE3Z$sA$I7Fi85mG`OflFo42_)BpK&m zfaaI6#S49CyBL2sWG5^nVjl6>*dN-a0G^Jjy)_P{;Tc8_C!AOfMLc)68??c`&aC!w z1#P)^yiVD^sN%n}oFM~GGg)+4-xA^>$QoZX=_$9r zptY~Hy3`?2BmObTVKmX2q}q9Vw_Q-j`=hh5(R|t%J_-~g0jH^*Nv&&R1$*V%Qo{;4 z*@url8P+e7AglTJw+-<~hYl0HP~9Yx_v;+g75QvX_(6I2*?~i9Yf*gJ{q1k*lIn}n zkCe&}5!WE96}t-<<-en?0lCmPJ{D?lx9G(^_t}3oYEkt01&gu1Eu-t_ zA)*Yhn#qVZzId_d={seSN&PnjN1`b^$SR8Fy)G?OD3*hw5Mtww&KF*GpCu7yi>cvb zpoq{O3U+u}W|r3fGD0J_q4o}&j&vDONfErfnV4eJ6B52=3Ev%I%32TPsdtNw zE|M1GPjg3wWh?9IU9~1iON+L#4uy0+PmGnT%^TOfKN&ER+-$WQ7BC{f#z`khji$C* zM4v-F@3I3IJ~5R?awchR6x<`?j*T9uJV|_Sa99GMm$nXAMNC#V8!xRH3pU%cX$##aILamH1SQ5gA$F+Nzm7+e9!ky=VP;zVEOv7o)$IFRy2(9T$okMxDm6~GP69sz zJ&QuT&e(?<(}N`c`wYC{i$x6GVM_roxc1$LCnt1sTvotYCG96LQh6YA{@cj;p>Cyt znsMQ=GLLzUuEr>hY54m6&8`*hYWb1-)#LS%gL^g~)Q-ztoKNl(;sTQ74LvB%awL)d zY$oD-PoB5HXr*|zPDc&I)>|p=5d3BK%iCTk=uEtS{%U?rz{x=vx8V}y$KlcbIYJ`= z<_VpoS50wftZ9F{>nV@Re>N0maL`!dNqFm99^C!&*gqW5nu-S=k_{!+)QZt{qGAGN z!o_!BMgRmxm=>A19K#U_ln>#)N?$H9gF@WTgvD| zlpoLjQ1TG12BV&N%QsuaV3dTQHWXSdQK`1u1Y&6sk2I;AoVhYrQLaSW*bmoeR zTug{@mrF{k(}5fLh_-AiN-8B58d7;{?k)My>GxtVToTbux;#m}pz#%)842TxKh)yf z7l}YbppMMQhd#yBrKR*V*~@BnyjQD&jo*_UFri-*qS&9fv6xx0=55^B)>sJ???klR z+2w>zyBWqv^R?gR*&O$G;iF=1o3C$iuH|Bg6ro}=NuRnFUS!gR@OW8vq%6H-$(8Ir zX=!YqPS}rY-Hr21?u=*?s4AH5Zoh1%BgEo2+SJ!0flLCoIQMYauPm<|9Jxj&U%-^6 zPqQ)tVUnj=U*dVmgA5p zHgw0c$Vxoms3NYnL`_;~)$7wty>vrGWzp|cl(bA7`N5tBO&J+YA{UN~H3e*qbPDw+ zyXAVfPl?;@vv*YK)Kvo>Lk|3{_Rmv&58L=yGTn31N23R)#!5jrNkZG`JSAJ;KP-x^+@|`ckqMhcUPE-ePb__#MD06^&yY<)Z%23&ER^Np;06{ zWNWEt!7x(F$g5YT)?dGWgKtrb-sQbhNwp+M!bnLm;A>4Y<9(*7FJThfwyRT`lY5n- z9sHEF&j?w$wgs6gZushv7H9BC7iMKS_HK20$4-AKHcgP@a4*#NA+xd<;y@C^CSEfH z9;3UzG7j=suin@`pDztHz4&DYB>a1GZ$@>aAJzv4|76y4>!^&-OqAJi-*dq@~h0cQ8qA|@| zr>M5Q!#F)ldNON94sjV6BY3e1F`?bk-stFEKmYwmJ3p-o`$;U3OT;4VHc{o}IAU1b z*+XTdVf9QHiza$M(hIZ7h9b;t<1pC<%80mgH00?>r{w^U$jTs1aMPbvbjR}bC5l35 zxog$9x-JWah)pTo7}2nsXW2XhctI9bhlSkbL1&X>DS)vfR3ZqmdKBHf$6h&l#yPYzxmbEM7S* zIUpEx9B}5fdl`&GjZHc=nCsau&_3NsglxSAgPTZ1v*`D79d|RSJ93rS&ve>n-5%MD zJ2KYgs&%U-YRii1AN)%s)7k6z@Qb=3Sz)e06d2cM%F?WX++`~Y7OL%!b76K>mhMi> z&!E72$`5p-jBaAN=+RNT!gNL+tgOgFC40&XMqA#jhH!g|x?S<&)SV$iV z>+h$j9ar>(Mb;D6FQ68Q05$Cs1T@>uJ3~hXb9NYZmj(bIDN|{=U=h9`U{{zZER861 zRK8T4$e1`8&=}Pm(-d3P(zumKz3sWS%(`Daru|0#PH7S?cS>n)2PV$WTML~?{)8g7 zaiPU!i`phFk0nC6CT0kn(-pXk9(6JkJ2aT&JvCn44s9ypT6{l?j$L=Oe$|ay`jAoc z%EHm!_D$n!mt=UKY5jUoTg4U4>YDZ-M#-g3PQg!2Jg!RxDsGstj$TqW2Tv$i)|qAfz|g6}Wd zA!0C;S2R~%HWzYE*I+TF!XUwgDB6IgDDhVUN&j>Jzg)e#wCtwuvV73{1eK3}b?Dz} zz?2)@#Z&8c#NFPrUWm6%{n`Q@5-*~uU$d;^@sK4>zi7J?JRd7w?rAGyKu=L_*he0u zLgnyA-sLc6@>vkwkmFja`E%ZG9)&aBufW8Li%5hZ(WAU2=1M^JiP(i)cY+a(*aq(PxrT45J>2wnWRqOc13kuse` z$126=IWgjS(Q)g8!uO)X)F||T55?8)RA#2cpT%ZmR(#C`OZtE{&3qEs2143UEscJR zD-igaU>%lW&QE#)$HV=o+dj}&BsNW<<%e=S>EL{r9OaR<+r8idu5rwm1UbR z6sCu>R-Tlp1@_z=#KJY(b%n{2ns35-FcZ+o_4pGY!MJcBa&>R{H1BfBx|eOg?F$kC ze;JYw1#kYz)!%%QPloUI(gI)Z)7&?^lbeKJavD}%#7sHc?Qsc}o8g-s$7rFf(7GM_ zj$!mE2+7iCnJ})JuKfCa`x+WQZjs{2!czyFCL1%n_KV+6pwR-r{yfNQ-8|tv3EQ<{ zH>um4MCJ}n7b+<)j|j8HAL8mq>Z=wewG%pKgH$*GKX70|EdO9WratX?+K0ba*8+>L z1HRg8W<^7+#*0H7;}QvDOV9weQnn7IMY1;0@*Yb^TNBL}XOaysJ2~N%^OpJ3a|+h8 zKEV42Ek3WXm@X!gtlkU7aan@as!bZYPe-T98{bW7JNuT2{nZLhTzd=9LAZJ!F9*WG zPPv5>U&_xbvksZ@_T{MwGy>5#OMn%5eK>OO z%)YkLwA^5slJr1`Z6P1#Uc?i~{=wwkF4Fg7Q#rulNp(lQOZY=^dArZ7TyOp)7vC?{PI=Zvta z_HbW`IBAfdgQ`P#$P=Mq!6}o);hfj+7Eg&9bXnVlJ(Hk;1?!DZOc`JT6!@*=bw+l| z`Mb)M5U^cOu^Vtb!P00Io#mUZ{EsGo>$^S=Vdi3P+buegF|;?f*@E|SL%jaoimJjM zb5LKn?kLrjub7&*-3GU=uxYK2_vLU{Po_upB-|Di0544qN4s36DQV<>>CGZgLO7&( zj`ygBO-U@wi2POUd|0_Y1h5sK?^#}7c8?LwRb2aOSZ*m(wKp6Vt#QfqN&UsPns*$D zE|2w!3k$*?9dIJDITA`$f*o`xWRTUe8i+7Cu0s4?6}qV-qKf3=l4nzHQ46htx4=iQ z&%R2C;b@$lr;hNde|^|mZnBB>;+7ANUbC>$Vu2pX5m#p+>{d@1ecp~R^EB}(-M<8L zzi@E@NsOaYhjU9J6VQ;hnhO&Hu)sbJVq8L3ZE>dU=giudT-;B^^81%hL@&1q2;bZh zhIk2bSN90B)Stq-T6*tj$F4+{^{7CrL z;X{ji)AtpqI0s*PHFx9U1U|sJ4F{__e8|UPZ%1QZ=QaWHM@s75mt{ViG(;&Z%1vW` zC*M;vbG18XO)$Rw*g5-_Hi@zuSI_B<{G`nAclBSh zUZFzRAg*<{3Alh4CPgB$`!&4XW~r|yiT4-O&V-!rzPVh2=5U!^r6d@M9?6cQz6o|h zdLGBxT2uv1RX#PWAA>K2oR{qHb`CDYp)oRpZwA(s+U z|H|bPCpk~w$t`=m{1R7$sr!65On?|xCwSjCn_ZjrD)Z`E!9+7&hYs4WhWU$zTo8O{ zCVS}Eof>pNTpuu^wqH_;TNd2A72V{7weBDwW43j6>|c~RqnfQd(A0%&_%6xn=_?A0 zi%QLMI-z>*?p4lROvp?ydg9RY)hdH9hV-Npa%^7MFJtG5tnJ=e>GT-5pSp38cDI)w3| z8D9szYUvcU^|-H9`}v4GQZ~1T?u0;RgZg%{!5NSkgkeV*O4bwXYAj8+YDS*U35LG1 z=zmhf6e)y_?O1KL4CYC2%Hxrpoc|2*sj>f1vvP*|p+Q!+j|iezLY8s#$id)FWd2IK z{Sg19^ka8iqu3^NzyEf#kieqFp^P-aW-OLLg3jTgbY280*S;a`s-y9Ry*`06$xA3% z>O`UEv9oua7oNt?=mO~~yKjphsQJEKY8VSc+vcYOo;f(t(bfo*XbWrzV~d5Ma09t3 zTC(-=vVWv#eoAS?eLHzYi+N`TDMMJTn9#c3J(V}@+X>Zz;HJHk*`lp=O!O}qY-M}9 zPzihX1t233guH(1rM9<}5@nq!;U;bC93%N~S6x!J_}X5>xlmY_lI@6nplA^vOZFNM zY@HE1EM*M1pN?z&EEkqGDo+_3ch{|`01inq{+v5>l1?>-+f`MM*TSvsXcW{?lU7Zu`ht zGM9$J8^7lKd~2+fJJs9?a|k%mCf*LaBki2jg`>qbAO;HcOL}`7j<4>VLJRNCl;HEl z-Fy1B4c1wA?W(ILN-fRymyL)BAY#KYAO7dQ`orS%J^}pW+%O(yVjB$p&N*{8*nB^rDKr6gXtMnFC+>Ux#)3ck|)RHgK-x#b--3x44 z)!WmmBctYs!k5YwRe7M1g79qYC@f#nZZSATa678-{`@Dpl{(8cfiI6wpH(#XY1^O^ zjQ4a^%uIh~@Uu**AIsGG%PxOWdA?XYtiaxmsw^eo0xX|sW^W6w-YgRA9@}4(&Xg6HDyR~>WfNJ@L%tlg4vvl1Z;nd zlEKJDk3{YKBL~lqmw3$(TZ!Xo!&iRqLW*KcKPg) za*iIn04qOF!6j?(wb*n~3DM^#yZS5BJ99dSpLjGe^Gep-+vdyD8KS zdwJ|rb@R&iJUWFx9(I2%VpmwabSN)M@r;*O4Yl+*c%MKEWs|H~?GTMHOBl&e{!0b| zh-Tg3_VIyshKwWxi&FWlC%ng5IInO`6fRe%Th4~&BvXLA95CJCa)Lu`Ufy0dD+u4> zs`#glPk?Bn#C0Cs98p=fwQ-_lih9L`iZ>2PFk6}yJBuadDL5`o_5 zoRVhA(uCRPXRaEGYnD7>8!(=9DS~!_NHiG zMS_=uG~7k}Q~y+4r&q#AcH?{zV|U&lly(5Gy#ftere-vk5#yXAV+;)#l{9s3Z1-Pf z!1CkXl8?xiX=J{U5yhwe1^wnl%eHc=ZuX2(47uVq3f<`*g(nz=i!<-sMMPN`npwW-eW z1R?Q$D*_9LLH0*sjx%rkqJ9tmO`S9TmIHc4p*;{RnIjOk{A%j!oydpg&OMeM6KnY1 zI`1Ny>J4(Ff?#bOFf`&;2Ou5j*T1(!%V8o-+{(FD8RC}&&1SHrrbjB`3)^+(1#+Xp zEEu>`b1EvT#|VEQL`F~3#tPAL3{Gv&zvYJs%XWN1Fj*0x1gBmXxM=8N3)m)E25bwpW@{$ zYp2`LD`SnC_s_wauTF=#>P$~C%;A&uiZpS}vdkIokQa2J*R})&%^C&9nluKalCm)q z1r5FdLw!jB9GAW2B$_U4e{?Ote~Vjf_iwA3PlL5kooJ2FJ!3Yv2)X0dn>fFdCx!w< zQQmyHvSYPuv@z-1m?)JS6;Zf-ob7J-^+<^XWs{E)tIa^F`-v%~o(;{&H;CzHmrbS6 zso$YojCQBdIv3PL(K8mwrbwkHq~V|E;;Je*EL($v<%zB;Nl4xH%*Y+a0t4Sm(G7Kz z{2=P0p09~gBFy}ES7(MmsX?B`Eg_&6(d`a=BUOoiRtD@TPCq`nH@SCbGX;Nkk5*V( zOG+SU=-3;nk4x#9q2FSoem2pltG8JE>b{ju(|MzM7Z*x;MDv=<#}jchFePPaCSiy! z7Utd?c4dr57qz{Fe`5}2DrwF;(ZF@#5wi(*n7H{nBT9K3hCi*>Q^*eO3#t7wOC5ue zTwgp=AZkzS?5sujZoKi}xLopx@hmlZ?e?887sjAi$=PKH5FHoq1jeZ&op5o6^j$UU>d|`m?^=v-rILjU1$JcwJX*OXdqV99k<4S6Ip_`u@Q%s5)aDXD6IIH+W z4TY;x8Y(D5yizg}Dv6+B6t<;jjH9#FjXW=3?t?lubE0ianyHoW1oBEA$w{>5O4grpB@bWtkT8Yai{ak?ZZ#Pamjy8*54_oTz^$ z9ecUER=c{Vy?;2s+wltX(1sUGi#33k*q27d?T2PZVQ2Otrf1OOlEcOkMFOddL|paDtZE- zhM2bH&8Y~FWGik!UWjt{vygR9Csq}5aw2ydu2g*6V!F?rQ0_5mi+()QBeb0EjGfv^ zFte;R{}9clMEl7NRYnfn%Zz%Q8y1eiXm2Xadmj#5nAx2G=_@9Sn)xkWcwy)`<^+jnq!IDQ_&T5#mP{%K(@M;Z{?G} zK1ez=ms2{dHP}NA%j1k&Fj)I7JibLk^&u2I7~Oa3ics*!HNgU6W#O_K+Xs5*e)lDS zqB4(_JQhEeAHcz1=iS3?Yzz)#|8>pfF^(!5$VPHd@r0g*QEmwd(OB{XdAha)st5Cr z(spAL>-uRz3Qe}}wiCrcIkH%c%P$AK)iauD-0E3Tq|+;}Mb|6lo1XOG(JdiBRIQLJ zArcZm;E|M+bn+v=-WEQW5#u3|cQY58{+dV18VK}>^1Jh#;QkbL!9Av6Wp}x!gEyxK zF}K_3u{#1Aju^Xv>o(E~HSzLteDc~`7^UGb`mNJ>fy6nz8d&DrnLjqun!Z|JO;%W3 zST%pYD7k&sV?AP^!>xelws4W{hM+oP`M!TVNxbot00QW5ThsxItpyH5!`yC7vLX zINNd%eg010Lnil0VzS7Ih+@sQ+~u5De0r06N8XaFO*t5y;{$8iQKAf(k(&JzIqYAk z&Jv71Bdnm|y$=|?x(Jk46R6U3N4Yw0pI$~2vW z@tWqEMc8`TlLxe#2DLES`GB)Os@;(N(Rcku{tnp;Q$`jE;24;CGv01&_$7Cw5pGi& zgb5;0Lbe{uniM)2wXm__S5{H1`8OyTng0q%b5)F|F?UquA5}J&iGD5{tLuzfl}PVWC#Il|ZJ`p_471hQX8B41xQd zoeioX*=QcvG>lGy(6_>Mnhv{iUlrlJ9?`ae-Y_kZTnWTz#EnYUccqjX#*3SgpNX3Z z@OaNiVz{d1(_0lgH*A3-412gfBGdl^U9!j%jSp{u_^@NV%99$0?kqM`up)15z0cgM z>e!qsc#`6;xnPaFb4@^M&zt6Ms4?j!dbS~`E5sJA<)G)S?^&_!X64p0JOD%hf-?iC zWBAXBd&}f##v&S$wik4xyT`|gHm=Q6y9{c21aI?IAU&21CyoWe{l&wL`VplXowCur zMr(94o9Dcht<^?E9Dw)f!&Y{TYT<_Ukos6W%23(>-(V93h|#(v_`p!ed|KGJcU3fx z(PnFzjV21|EIHFZF0Ia7gN=$frPgiUy;~hkX(V@>dnYM(&omWo#_(XidCbHsG>H2; zTH{tiuqf@rV-laquphH|66fjGf)Qjm4(+Ociv9Shdj_7s zcvHQRW_zbOVb^{$7QTjqn`o5pEM`|V=x)h@;(z*l%lo73P5K$RofpwY3e3}U24YU< zGVOCh`{omVhB{Zk(y|-JDnvMX3x$H;ubcC=T1PwK(>r%=tYQi(&I-jpxE>OL+j$Iv z9x=;!GS&SyL`h;m?{K`qHZDl1Hkf6@N!KM z3uom@2`&yI+fR-~Lhet+iu0|F^vk8PQwJ-SRceTenug*-clzecK`e!?=!*jsy6_bW zOQ`2#*-9)13a4L99cn$N78Waa!sC7ZUi@2)3YRnhGexF;wEP7%M9Byn=s5F7LrHe3 zjP6`}FSxDZNqPs2F*b!RtVzt!slM7`t(kCfK4`#i+&_ zH8b-@jvB+)l`iTX+c?X_?4e%ku3)&<5)L@B#nv-*snLLox?#)cFjEwQ+`Q?%7ZjUF zY~f9siS3fSx=imZBl@_QU+JdU&edfh=b?E=YD@on+eJojXsziGF)RH1z}KX9fiM&$ z45)dkBB9>M_r7JJj)N~gQv<99pU>E+Rw{tBIq2GwO?o7Dm_oAAZKM3N(I=LjY>1;M z$TAY0?>s%mvZ<_QJyfi{4LB@lR?!(@WvshNHL0J;anHbJX3-1 z2BVg(ye(UH5}*Tl)(yRWuguv)`_6_$Ycu!y&z+Fpsq5EC?m~X`ArB|p4*7maCU@$6 zc@2W52r4-!o9JypLSE5^)h_0#uHk=erPdz9v-Z^dlnrSovdh^mBF7N@mxe4i9p?)x zHzPF#PunnPUFB)%c1&5e>*M74vMbXPoK1uTlVAXuYJ4Zth|!8MxS_3}97H`?tI(J{!y@OG(gczH67 zJ>~NEKxy;!3nHZpy{Q=Ex>uwm+Y9!ks()D7nyOc&`Ot2WEwKcY-*W<4z)As+`G&9y4m9)`F9P0XFzBKCxK|L`au5e5 zd)R2&aRxfS-nLw*x>hAus1o_^GG{?Z^jwCi7|*Y8=d40LVl|)o?r&e@x1V@jS%HzQ zyL+0hkEQ4gpshL}AL>M=Zil?eZj)2pcsKi-0Z^dK6GF-YLGKy4=wCP+-w;6uR|OuS z$H{!CrfK^csyF&u$R&bgPhi^`8}ry;%Fxd6R3fOiqk}#+wZ)TS!fzq=WE-G9=(VW@ zHncYXN_n5VZI&e5*xNoUC6bcbxcw}0frI*(uZ3;_+PuArt5)f_QOVhnFAQ{7UC+Es zk-2s4l;2O2N8X-RVgE+;O6K<$pZShnF!+gaqQv~g?JB1uhRYEJTm>FWJ=;BAx^?{G z_2T^XlG7$J5%UhI$$dHEgG+z-Oa?vEs7t z@E#yy_V~Q>Ljf#?2W}#0c7G6KFjSSJtuPO>1iOqcgtD{e8|@TX!}~_#LvP1+VmNay zQh<8KU{k+xn?*x%qInuN3N7bM)gis1^Gh*tg%C?SZw9+dSzh50&GNKUPVd*Z^Pk%h zHkmON)@IWBKFt<;OlhXV3R`Ux^p6TFKNR~zZjY>O;XBH7JQO3+R7f#>6*kI8wz{yM zW9e&0ufvEoZLb7WH_OJRFPKKtFb`(A5L#5|c7txwC@nNVBBA5A&R$hW&E(A7>*{2h zQ?!&s=COXg^!6i~VT!$wop1q<{>N^1PT#o4>xJ5+U{w}v{B%38gwIl0vkhJhYoElH z?1=@H#xp5rh22xvjPJYv8}G_s!d}dRMV-GL^Y(g&(%DgZxBRhAeAd=#Cfo3YvDJsr z*Q9m*dO^6kkY_ti<_8~lQO(b(-etN9Rnp988eq#7nA#8$=H>hQt}Pp$O%ty=X~>E{ z+eSQvAZFv|@yS;7oM5os3?D`4ijExiNjN%!#Uolex+_eRWFkwxC>$2&{Iq2a@DBTE z`ylPYZo08aC!=T}ctW7Gi|DDVn32?8!=}r2&a&v(0k&DWThym3@C;uCne$xM9ZTQy zdkK%N)$y9FRXRP8Wmad0u(io*Nxv0OcvXpg5{Xj)m_2A_NF6HQG6QrMU>BqjVGUPJ zeP!Kh#Tm%8`A)Wj(#%e3+S>yeWUPPEPv6@78|>67I)w#hbKL0Qv>9W@z$-$jn1QI_ zlU)8ovo5=x{yt2zYvIN11FYF{TCpAq9|GrFB}=3(tcZJG9Y;eI8`<)c>YjYa7#Htg zUrhcbG@&_{j-Jk2Z?{@ojiVDzJ!I&6iD?Z1>M1h#(v?e5&>q(-Y^v^ILWq)J;!`)Y zch@j)klK}bdo|t7w?MaI2jsE_-9~SXGS%PvZj=+w_-WiG8*3R%B)pg#D^O6=Zz5A) zdY7ePXu}httSM%ZGdI0W`DF>$DV^+)Z{`{p7>(3#7v9Cxk~il{=nFHev9M8({wY_= zta+oVIE1cO!VTw|@+rgJZpEe>$z(jkgxZeTBn!e(^pgvdN=UynnusSdjQOdWNw`6 z6)d>S1D1diZ?i3y;Vkw;+hdX8A_~Op)^WB#qzc-dJ7-firHETJoCCAy;k6NbZE#Ht z0&+5G$;{taq}xDz4anEOG}I8vE;pgwM|wK*V%1qoR!!n|OlYoF^(7o#4uzc~l|v}b zAwb3#>}3i*Q{mU!(gmg-d`z7ntO1bF`S|;_cI|^Zh0t@q{p8oj{)Qq|>??u0(MR}Z z%($_#f{;bd!`XK9cZ0&_Q;53@zaJ-j;9@`Ap6zTliSX z3R|0f7ej48fDDk8MX8wh`)wie3j)v|y zxh~!1ktd0`FPHaE0^!*}nulCqsc#|HnD(Fno)t3u6@0|XrS1X+Ze7oT&>`#5gwA@lmFdBe)EC*3LWGS|){}!~+ch&%k*%s=_7w_# zeOmk#4&0ix=_6VNHA;P%uFxYUi7J~7R2(6yqV4XR6Iylz?acKNOMV@%#B4^{*`0!I zIepC;!M2RTckIWjteXYj1ucCHn|lU3o={up@@0d%Vv)4Oc_N|F#bopmM*NP-;m}FF zLYn=i-LTH!4{L}M{G)K-b;%g@#7JsYt$QEU@cr!M# zP6Gz81w*?= zkw{;eu0F0`5;{sGNlT`O@6H+9GaJv`1`*}S1~(vxV*jlNh4C*cX3P=vtLgrB*=G>V z5qjpnB&g&KgmlyGsSQv_lkRXixo^xe6YU|tusY5IWMdZ}P(_Dv!L~k;2V0H|8{A|k z=&)}KmCmD@K82iJe$OOGqTUf1W;kudy1&NT@AWZfQ>7N+dJWp5tF+PPNl;RnhVi*nsI%OKPs2SS zbD#~{cF!utW1*x5lvs=QI?n^p;fF1b$W z#15y&y0QGU-?dX^9rUx8!uU%mnDg(7{8_`!8;b)8_M1ZGwaJ}lZrsZ%kl6afDK7kE zU){|gF`g1eAR4hKzjMt_45Q_1C9C$X#`4X^yHFxd{tC@1+W)-+cTx0Kgu(;*T!C_x z4U$b^#z;t5fZu9EAx`c%*`m!o|+pAC-pI<1@CIKD9IzznGyRoK;n=)SNojz-G#Rk!H(M#X`ZSX(>`8z6u~h+|!BX;+w5$M2DFsUbNENzCDJ9g^>9 zP{?l)#aS$}SVtEVrqVm60x|DZ)}+F)z4CpzDX`AnwX;=WVOwglaEuk!Z0pw+5QxIS z$DrKUmblp%-qhG81_%tOcQ-j0XTBg>g>qA!-PHu#x03o}c+8!lnb7%HV)MjCFIb3g zxot&AWwty`Oe|8Q8D?8yeULJa91RS(tsJZ3Jr}K%JfYbHg>{xq#M7`bd1F=4Ux6>Z`Ii2Ns?AOQo zMq$#^11uqC5*8Fk7w#Bp0Q`ryW3}G@4rn6=qtGTVJ!Iz^zam%~(m*K`n{8OOWlNzi z1*|@QWwH|4atL~H~u?uIrVI1xFJyE%py8~ zw#kHf!C}1thiY$=z5y0cFLTMr0_egCVgL&jYHfA?T~AG;J#`)TGJ2P7j14dN)Vt*) zi1-SR#uMT`@jbLASaQM{d>z17=!>DL__MFJWwDtN4bs&cErBUjo_X~~43glWPhlTA z+Pt({-h-xrex@N9h=^M#^G_PQo6uoJ9I+CfQt9`4SZF&%AVcwZq2kM%H?|2 z`U@?EwL7-4*%*nZkHj8h=+@_Uw_uQ{<y?mnDV zR>>PDu}9w)BL~z|mob&z<12lAxcp^mWrFf0JJCejS;VehgdTT`%zjT;`X9v@Q zZ)%mBgWB{s`;yTlppmn)HhpXBeLc1DN@`wuAhrMj!`A5596A0qmyhnz@iYW5&3MjP zVvU2I+r8AI(R0&lYn`u(?N-JjxQ)p1u7tNM%FvfFrlMx=F*)82B(qZM=D9n`MBD6+ zyM-L9S`xObHVeq(Hrg2L!dlC(x{A3vSFp=LVAkABE!32_n`L#%mMdU zfa?v7CTr7Ww(KjluI&}E41!sYfA^*P^rFw`DUPzF#*nuxP-*5a278au5vI;V>!V{F zAe)JSZrHwiv#MU6gHa#(!Fh4jcHNiwv|!AL;k>8I{?X1q@V?Aq;$O=Jnw8?&1%)kK;OAjhDGOgQt*ZE>wz*HEPek+%=m zOa0`ReT<>;RMy;C{Wtp~zkP@Rv+G-I5c5C&!B2{cm{=UH=(-fVv~xWjZfeH&)07}iL(Ud|zxJ%+3bDtqDuNgGIkhA_7!N6zS_un-$^qshw3 zcsU2%O3`k%{J364t`mL2ewKTfDx;Y!w6jAo?8y}rNS5*nIm|fZe%w6Aieh!)uAA53BELtGdF#vFf$2@}z8`p`a2 zychY~7w~`?(9Pvm0qz<~o{1tBD<+rnAPY%zJyl*eot-=MPeWqyQgq5{%(ZSy#eu3H z#2QUMxRn-52fd8Q5voKl87B_xY5fyw;#H=^wjOVXI^Q9pwIMP)kYErEEX}FPAl%sd z&TyD(X(0O#(Cjzc(?o%)u*`zmiqB((R*w`TXhcFjl`?jEGr69YC zL7b4^6$GUQHGhEekbGWGfM=4iw)k3XOOe}-?J3Q_uFP*cJgOjHtGe( zm~;CGaaQ8Drg|9G0#!w=v(Zmt&@)x*Y8OosAF#r}ToeeX@`A}P1WV! zxP;bK!nA`JUZ)>uj)fh&p1m$Oin^m)yz^MhXV}`>;kxNXWwlum+ezt6z?T5u4tsWY zu*_$-(z-Kr!+NU5T2OZ=dA)>4NLWj--dWnksJ>Q#8zqRoU1_5^p(4rJ`1Kq70kyq; z_@1P9#k5Lig@_MlE+r=ptgo*1F9w`uJE!8pI;$ooaKReC#>b%y#2;LTb~hxd)17PO zZO7}YGEc|CZ%X1yHjxuy?}t5*B-?%zqk{@DF6ajPM8noMaQ5kC5;97`25{)B0_FyZ z#VQItP>s@B?>eJpB`B8ZUzinJz^tM(sab-1Piy_;SIYXZH>BnTld_2dzcAulmC4$7 z%_zfpR0_mdsP=G_XRPVcYMDQI5AdUWlt$3d49TkIZ5`iy0gyudw6JoA2+C&(@pO`f zpSlXgMuo`<7})HZNSF0R%TgLt(ByH}I#!!#OZjO-o35>2D~1d#-Wd#v$R2pUEBC^! zXx|ZSZUnL8#sEc2P{B?3m#FE~9J)IcJ1_E7Say|$_s0~hY)D(Tx5;%=pZfI1cqQF= zmFl8h?e5Ii4Na2_l35;@D%T>gbj=cMju<4BaMPn}(Z0wtS<>m=^>nUWtAd<}NC9Wl zwvO)Vp#j(@*fQD^R_8DT9>~~)8fY$vQF_3cbkb9ifSK2Uvs9UjLnUuT(7XoRQRU$5 zGN~p0)ieQ4mHgu0&}S*Lek-DBIpgN5FsdIqsqzf_36~rPVX^%=MhkG(5G?hSG2qkcoY7@B#N>@Q)=MO==izQmZH-Igym^f-6_u!b~^M|0je zyWZwOV^$!~M{9+1W6{!T0My-6jXO?pbPjW=7E~8~Z3Zz2&-zg*HZB{}3B(>OM^hFvIbh>9^|{Ci7JwC@Hu{z1UUOc*#3V(Fxw*Y`7)gh8SKYzQfOCs)~~$j7F#g%LD@F~bRc z12K*o9RatZ!)#!lN276a+m#?rHbU5^afS{=x9yFTDAVMXr2x-4sUjM@EYr;GKK8Jm zA`8jqogTH&uv*9taukQOc2`=gSA>`c9XZ#@b$}M=yQvU$6vMd^!1|q=#%V8FcpEB+ z?Magr!0!qqSa5I(8L?!zzFUgCUg|txZ{0mS?DLZRWp;c(*ue|7smL3ayg3mSmNV+S z`HHdsBd3%!SG!oq@&_OyKll4gY5)k_^B_xuOrKHeOvyH%8y800klgrEoe5*tmCmsJ z;%2`vkpyVl(Ph~zKktOYBlyl={T$r<(BZkYg?G%MpnCoNtL(-S{6Jsx&;t%AoR%P) z!knAL$iMc9;xDL8e{~vT8Ly)gc!uS+yol zz}%lJ-yv%<#}kq(L{JDQlAlFr>1#bI$>;9F+ify=)$eLqnHqaFIqOgjvt@x^g&ngu ztC006J$M434SSV^rCj%fc;4Kbxwf3$n=H%$%=V4OBp7O}mJiiL5Vjs2%bE*yLo8K8 zHi8XsS;NEv6NfYsiqTOKuD22C+9d@p{#pfXs~gnrd4W7^GZD zPM%dY5O`G9nNSF{Tk7me>^g18p#tQ7E^Z`D7$6@krBoW)iVn}~J0cP~b&L&1on3Gw z1LMl?LU+IAXu$e;Yg*Buj8W&%! zOs$6rPicdI0WLSwt9`EDJnmJoz-H>(w0(oUYF!ua#1r-@Kt$9!Q;#gpQM(lKvGqvy znz1W-rfOsP!aG@@uGCly)v3^m_0gtcjS;96Kc z=o8!9^_UlCtn9gi9OCJ#sz4PoHZgWHELVFus8!;57mig2IsX|vu2=RIcV6JnIaz-k z#;=dF3(e!B56Ph2b@t^l54Jx)t91s>Gn#b_D*F*^c zB9c?j{GlLdMVL41a&V2{RD%E8D2XYh&=0I?L2)d6?5ymb@BC}T(!Chr%=RHMm^Lct zEz^Z0V$3g@Yy6%i|;&t%oy+xzCrZVG)1x@MQjAF~SupLs0!Gq;!y z*KvuH{QDj3{bAqz#LsbTjHAHj!0LbX)gO-s_*`lIak1$C-Osgw0ciO$-_xWAq|x76 z?fb@?H8cAoym z-bBdufD*=o(0_FF=rHq|Fr6=3)j)FEg<_=K^9~|giE-kOOZ@oB$cJNNgm2R>o(I?b zyU4yn3s9Hoqk_M4{vS49oC7XISm$5d`jKq^#wY*X`|o0DA8!8(^S?n^W_oD?uQpM8 zPiF4}v0vi@U`Q6)wTg*zzjXd#Ju`zeiR?{u&t9lr^vDYAp8=46hDL%bfC{s4+G@!I zs_npLf4xI9^G#q9F17gEbFUAECI7knzx{)^e@}`SOb~uJ_dY85%N4xinrkE%J+J(y zk=#HvVlY+biqth@;yBWy*y;TaBquVI(#2bZ%0GD~3>YYFEGA)FMtcYVY5IWqBEV@W zdg$|gSl9#1nT+Ozux;{<4>e*12U22xLao{V7r=Hr5S&zb=ii$Ik7WFOUQ}L>J`&IcOx%IH()rMYrJ?WTO`XjSMkQ81fd_(hQ!7^JiPHtG~d0g~A{!`9{chJDasF!H(|o$|YIu$KSu-hX*ZzuCPQ zfTQJue}AaISkG^7uJhf$$v212O&{&_s-#WZO@#(@V_72U32^(3IBPa|Fr$9>wvgY{C^nS&G26Y{~eV77s3CD5)aTX zFq!?o9Q^mB?teM>@4@LG^mU5k{~HnAdhI_8N&>SV4W^OYJ2uwYTSW2ie4V2;KW8FE zz}{Gp(qAc?(Q29wyumMJ>-|(E?{|P zJT?Zt8GVzs;^YtCZl=G;`@Dl*!rDFlpt@TBC8^@cu634{RDHG7mM!sH)D$rnmF2V8 zt~?BL{$}pwm4j!}<%(aO^V(k6-x$5OMX-q<)3^}ab>d|^9651|V}nUoG>ba>hUH>z zaka^hwvL%GV0r0DzaXR@I0V7Qug5;o!G>%R(LbeeyD{%bv##)G-NT8JM;A|H)wQPH zUrGgMX|Sc96?iHu54QeE`SACiH8WrPT%~a3w-<-Vz&lT-DL&JPn_{>?jZwW(C|q^y zAU*SzQ9&<8)6YC{;jFS*kJ8CCU_T7WN3N#<>j8ypLmxDM{q7wTtF|Y57C79TL)9cI zdEIE(pC92IagDRxnaLGeMs(=2a!uR4O z;pj7-RD0I+e&_)%+NF$K&(9y}o31X3{czlqdq&nXOZ9_W@fsX85Dg4WRFd!1uVOif zC?G`o3V!htbsasq{7J~cf7?p{UIVNcyCin+L$m#lU$jEq;&9-&d>*ZGaI=yVC(qz+ zaWK1;IO{a0asZY`TT7as2JbCwwnbI4@Wb67 z17`N!GyXb%)Up3lF31Np6YLYEX@y_dF^BE~M}B(C;KV^p&@)oqXOa8(g1fafzAE;I zg@*x?)H=CP@Q|MkqN8t8*P76}JH_66{Lp0_Xz4q?12qwx(tYrEz}E7908rJRURb0$ zbPhGYNqV`Dsy9AB<=-9hV1m#3%_^v`rUjKcy;{0)@OOX*UkOl>!Gi6fYLQ1R$6iT& zr=*?C(W}SS#)AHw1#oZyAZFM79kLl%?C9Bp1OgDg0}wtdO9VeVbQ3?FlV{BL2)|+~ zSnm)o0w4_u`;M#Wt}*4qE6OYY{!|5eg!Z3U`u<0=|7ii~KaTrk^y;A_np1`Ral}RJ zDAVBun;k9!M`T>JPuS0WM^9F0q{Z*ysK0~X6m<}>kL*qk0-K#M4i4eB;W>yuU=cPp zfRmmi+5eH&{ioLyTq84`kz14Zqy-#t_K*xw=Xlt=hjczAa0==Nhkx-V5P+{gg|;}j zRkPwrfxF-DJQX~0sQeTF;){O(QRPP0cl^*L3@YT1{ z^~*#B*qZU&sO6>*^hSx{(W%=Uu?^&!6-b$(kVD1+Wb&rT_q1tHM~|P66WvXz*m`rizq?wh zvK|@LPMv8MDzT!UrQtRFQ&7CzBiW-%CE{C5TJ7~=aTz{jWK1VOT zT5a8aS)9S(q2(b20y?Tu{wlLt-ak2N?+>6r#o1YDVhfX;)E~HE$nIxxl)3h6TVA8R z;bwZ6*v{*`!aJ=5^^?ID{B}~H%Yv2=2_^S7hl6X}yBe16OpLgU;Sz}d5e!!!0;)0V z`TYa@{DJ+dsQDIGq4n0$n`%a}=fttUM=5|sJtxgSRGZqe@~TkrMRd%4N8N`Cv4TT+ z^#kbmL+{HgKAJdb-)6C&-b-{(VG|3cZ`%IWoO|(8Ms8OEw7ySMerB{(6GcTGCgpbO z2Z?)_k7OP#o}|BLzL>=6eJHW~N^bAgpPzHSHaFJ!abD*^YvlWZuL35L6Vn0E{jl9Z zz573xIkOj9ezV1ONHPD<7yj+CSMPlk{1*rMtIhVU#d{r)o%g58c@N8jzwuj{XU~pa zLLRuHd!&rDus!mSBtGz!{a50h*fY}%2TA)s{M#4tjt7)|7GUF zp#FdNB-o#(^`bd&azYu+@*lM|e|f{dxXgz;fNZRhD-8R;{ox*=0fjw^4@W!IYChcf zi{<|X|NIYx&r~aerP#+9XxaxkcN zqRlRq%0hL2B*2P*wa5VuNcOPQLzEb};p-A<7 zE`cVP(ipl@)E?a=WIt9aWIsZFm74vqzZ`D?$BEYxxRXMHJqSnVyO)m-}NtA&u$% zu`;v!UQaujtUtb5=-*{@R5Q*iUyhbsd*iX!si|E~sj~tl0?m&?6vX&OMR+?+2L2lan+Gz@_$Up>A_M8wl`T8F1ZR32*s) zf|Tv#LjP2Jj@CM=2Bs~xGApcbw<{))AkSGWG8JU~>-C|Fop2*cY+gY_#>kUHX5F6< zAej2`OP7$18HHk`J%(QA6psa6z5=*eA*Us-(GtB{9*6MQoce?99$nwF95Soy_Y>43 zY8orf%AC-uQ}@O4SC=9=V<1s#!K9#pI(zEO&SkP`oxL&h5$LymlliY6kXaQ8UZBSP z%{oxibpq0NRnqyn0=#WPCx0b%R5X10rQ))vS87fsK`getUeJ1Cef^Vujz=4e(NQTD zz55%z_RR1-f+upZp>Q6SD*D)03`&ONDC_w6>bBj?2D&FU^*v|`N)PJOKQDT6@jVrv z{MLB3%I(IbU>BjJdM8aJ`}O?A_w3_uLG&hm>}~VcbIug!{ZW(T7jv?r@=4+0Dx_Dp zcwsvY>+AQ?DJeBiJ>_vG64Tgg%jzZ#7unvNv6qZ&>sIMCTz~7jIAJLem88+Qwyg@4 z^?E-QmSskO%5S`TwV=>TT&s9GpTTI-^H<_5P2Eo;PoQ!z2F)D6;USye@mkGuyG}Hu z=stYkPvmm|vDgPSS-fin)O{Gk=R|{FBPt2%uhnj9z2-xpPf!6vb9=y&>YF00h@Ws$& z)el1=mocs`g`;kj-Ek#wyXnsr&TIB=8^uF&-}G~aZHnrbUcw-87OE0y7L<)~o$Li& z<2S>0jqLU#O&oiS9uoF4+^LO}7X@C*INP zKXH@uW{Z~NS5wlR*sheP9Ub+UtT5}(bQL9VJ249~mGQ`SL8g8K*ZI{C!rwaI%^(ho=TtUYiO7;S_vso2K#n!pyRk3Ubl_ zUKrh$wF~mR%1UicDfegD7@W?^bGGB!==cA?b}lfEB^+_RE{nY=6zM3BT@o_r2|k7= zt$p4I3;fiE5M*7UEP?1`u{otfYgx*-NsU)cZDhGe@2u!@kBW$!QyM~!fyUWl6u5%6 zoLg>yDm(J}wvZqw3w7|<;F{Z0VBlRHA7RrHjYdb}18d4?`9mp7;0-qr6frA@4G(e% z`LsNJ=wBNV?LPLiwk3K+K^@wXXGaCbXQw??z=o?c#pxOi zY6zwcA0}!o`Kd7`tL=}_A)KXvvG~W`#boB5)I_u!Ed(b!45QG>XRf?`- zRw{C`=_cOWo-8@og*h^`qDk~vMtTv%i#LP)8!7vhvrG~+8oufujM!P?TsCnJ8hy_3Wi3#sf|4Mz@x}f z>fTWGsFAb=%v_>jCfxq!`+B+rrrrbc%VO#b7 zWeYd^8X#+HlyoeH znrOKb>c1)HBbttgz#t(bn67ITxR+HsO#*{=@94ZK4i}aWUa>SMdWiwAF%A?5ehvP# zf^iJ3kxti3xfDIWnX{rW?h=@MJU->pn#?oZW~o(P40|0Fb&k`ju)3a*3r7UQ@H(AL>X`*qG($vzJjM|hjZRC4}+0r02ABVQ_`rgghAs~QYw>|Ma* z@%j3)x;T+P(8EJu*}yv_xkLDcG(z6nQIp)fjRyKC77V90%KT{x#;4G?L6k*3d5pRw=x);d9$fKOoIm132E%M|9-aeuOdXQ7D@{9bsZKvAej zm6yd^lTKkwNn+tS&h-VyyDQ-3M8__;k1X0m5@I2Fo3zVeDlZGq8g6P77lR9?i-~3p zi=|u%fns{{1_!5IlEPrq#qh}1`P-E~3;GV%Nfr16iJRxHvfmrrmP!cQ{NkT7d)hUNIQkG(IF71l4gVK%-Ujs7it>;Ib=!%p3}O-XiBmMmFJHg&!2snGS5@L zw(VHh-(j0R-1{pTw!LE@-ajzB?IBZWUcRXfBJSFh+~pSAx!=;udVITapkp)1piRB_ zo>Z$rCvFa zyG%mUr!KLDa3wft)J>s_wcmM^bO3`J5?iZveQ(wQeWNJ0x-(UFD?0}5r)LI-#Oo)K z^+HC8vtY?|(^`ERI<3KlJR`*XbJxM7o!wu}MQz{P?ln@}9~;)*tu-`9OukzZCW5Rd zD-?1@X=t;3xXN!{Ie&^{T61si3$$&l!nu=auE!1VI)JvKu?jOc<3uSNLgYYZs4=E{ zqGq1X+KkKlb46R`^d`(VfsSpyqS`G}bsU-bx{4dT*Kkk=-7x2?+W3H*RWI*V(iFE? znHXkask;SOZu#|6K$&}g_~U8}_CrRm6bcxXYHPT;18|5kQgW*tXi8u3;2swb2Zq(h z`}&BnUhpHtdp_N>Vz(JkCA)7cygCEdKgZ0>%|}arC13t(-1=j(anFt+xF2E5KQn&m zjpOg#;2@S}-S6`tFJjPX8Nu1$BRzWTqGepKA(O2*ILsI)vgmf#q(tJpG5 znj?-{!J*P?$8CEcO-9U4zqW+Da0?~6uyKc2VU-Jae(Pmt>j@p%$dc7Mej92iI*Fy6 z=`Z$hZ-T7vHb(z$3(Q=q^EVjp5xz)qY~g3~W#$|1P&BIlV19S3D7m-QuM&^DNeSi( z$;Hhsr#m$wy`AFw1W+Beccr_(v4aK@B)%Y9oQa*?E{R6I9$fB9cI)DxrTZgW?@JdA z+0YO5+DLI^lNQLOF=u?|X;x=euw+(^-Eu0%l2YILys+V>ja`xWWgO6tb8#%6a_(z# zG{TeOCCe5d107BBB#!xq%_vD{4XucN>=s(9e&|H(BlsFmRIBqPQ@%Xh@A3jZaxRS( z&&DqM!f?;ovrCq0rZwIl&Y~VrH=gc^D}h{(xe}w}&{SD%{Tca@e+_3)-?5t*lO^Spcb&UJm6pAX7odS<@|rNCID_OVHQ zRGi7_AU+srQ39AX5CqIrek?jLB#bW#sGRLPg&*j2A$EM=K~W%AC~$?hXz^Onn^ISt z9=EuGh zrqSKkA6w(do&E;Gqs5_wFa<0-4#LPMJ+1|GVG}<%ejQblLcnz@yBmt32=C6_MjFXt zHm4;Wd*@F>ea=h4ExoF~V3BvPMrQhAig!8L++~yK!#8o4L1v8=q=YW%USi`rJ%>Og z-!KvBsV4uO|HIyUhc%gg?V@8J#z90BLDsqJp55(7S+w^b+Zv48uq-(mP1+ zAiWcmUPBK(N>3nQLJ295{W9~Lv&EUS=ljmSuJhOTZ^~PqwVw5?df#zvq^IVf;DP79 zlRKO{p>_Onb|E*>c~WloApkxvFbMja59bmC;M_`HuY}0?MGZhRjTzN_qQY#vv;it| zQgc>4$x@l54bEGCXc9&?p42I6x_3*pI0chl2rjod!2zyqOrNx=_CLNd-KYkWJ9c*l z>(kR@M)X3L#R2TuwQ81(>*}HR5};B2P`MS`4^6M((Z@zo*8^I}IjcuavrbwQC6IRc zF3`+>GAI9uyQZ5joD8?h$8QkdhoG6 zKPmzNoePSjn;SXy`(Em$W7`kgYu^+{fOMt5#f1#I%g2adDg~eNnfR?c1C`UVpxh21 zbGJhcc*{KN;xc_lCaehurRZb~p(a?#8--^hm#H1)?s4a!&8h3<_7e659CbV0S@}YH zMM`O9u0m8Mc2(9PfPy2{`$TT)Ci+y{f*$GVJX|KKlN~$Ba$1NF8TIPQYJDOMs7DoSfv^)bX|e>P(b;PrX9i<;yRwjeC+I3<6@= zI(lVuk8vfM(UU}Yo{_yp|)Gxbk zAV2iDJZtCt0i zrh*^7ts@c9X4H6hll`5rfE=_nb)#x!c&9hW(R*@SR-kutHhl*^+4H~#wJblBF-?6c z`}4~PubQIj^bXrYULE_kY763{2+!YR#)0Ab+ITmqAIwe$2XElkcIYnh2Pw<6v3OzGeK?ju&Ln}hqHSTGDpk5vc~V}7Y+xWq}-tEO&QvQDISkbl5qLtrqxWdy(Djx z{@wH%NEz%bH{H=ViiEd8YYY^+!Hmj?J+)3_$NWkPL!KH&eVok+%F)5Q^DfJ>&}R&0 zLcjb~`%;T)4^f(u&eO8-_AzX<<$>s%z_L)XKx+<{8egp|pYKkM^T9|5gGx}XsN33n zune<`L~ZJUc~qt7^0`eDL#_fJS^b5)d7VWMj6?yXCF8>I;pZmZgeJvHQ{(mZs*Sv^ z#-w3_Up~x(9GDoVxZ2^JScY>&V^gIMo5>z8FznqaVHu9Yg!OLbXLyU=!2YRyL+Q#r zGw0n1hjR&n!}`RUn?;AepOKhMYV_44Y?(7>)KztthUN>x^dS~PX*#2N3g!ydb94}V zx!Ay^O|P{}Q|WR2c^p(z6<)`Y6Z;V+ZQk8gr~?$Ci222CQe^porFK1)4niGxx5S-f z1K)sGnOz{Gs^j3ugKdTx;vhp|R`vWoG_lRH8+;KJyPjV4dMImoZq7tl4~oL4uVUW< zUwO4rJGDDuW8!x?YE*rOeoeO-+}h&eZQX0tgczIYJk=ygygJA|sDqSRmFA4;bCI{l z+*QbX0RB#7;RD@jUPSeP8fVt$FV zv-k08pKBk}maP!u#%HMEWyfMk2tvVOf@Hw{SA7{j*xpnOaN zg((T|9~6Uv(!Y&w+8238bZ#YrQpv?}3h9M>0Qv*92$E{gNMZjP6}%q6Cm4W}?=T7- zHKa3IC{K$FrMl<=MVm5-IYO$a&sx!qY5?maGu&mhd>@{BGK0)W|NMH_gIY6?`2MfDd6yG24egk) zd-Gpq#A1YP;)%kGaVHr~7jnL#hBU;Po#{Ho)K#XQ|Yu&~^1t>%Drg&-?s4d@?u~@9;o@#7$Rw9_) z{Uq6dKM(@S7(!+wKgS4ZVfia5+N1nYxlW(__!a3Yda1%S{&KU=T~0gPp6sbjob{Vs zD47s`W9ba!818#rz()MG9=^CR$9k|;BN}>{g?)=!L9ixZI)_IwuOYb>$jUs9@!3W9 zY#QXh@Y0sNz>ZLxcveKVY~pp;s*pHF@h@=ju%!yeYWl%;AnW z-_q8d)}OC2H*Q8YF`Us;uz!M=80F3kI-}GWH_R2UxO4M^Cg7*{Gve2MJtOpQs8k-u znjPWSJ_IUy>*6%)6DjM|6m;jzmI>BAN9o8lJ)CPmIpl@-0}_KV#IOkfR8cM1rj6#p55IW| z9K{4fxN9a;21Q1ptyAGo=E$8cuD>aY&0>d^&c$a=CV5&#)cL1iiTdl07)=+cSj$61 z>qhtyKR`4(-|yJIHFtxr#N1+F+P1ClW)GYfx>cU8L!Mtr#bs)+#aA5pI~?wykC90p zz)q*_G0@jA3kupznHQPLv=3WNGR@0Grakkp@ARdv2beiV_(N`9UfnBBf5EuFTRclB zF@9mzJ#ow4B&3d%p|?+*0EOlU58Nh)KLC_V}UMJ9kf^7L)a(oAJN^X`v*q+bIYr z@5J^kip$M3A}7TQA+t|@zE|2_@bypKeVNJ26k;%k8 zqX=zHZWeV;4pM2D{f&VY$Lg`mRAi6-=a0Lpc2>H&imm_S?mU8?MK5>X0H+}9GwJ`R z+ruxfFg&?iAn8N@VR{pC`v;(uukcfxCTC1}s7TCqbdXqQK$Bns)a#uqfJ0nDSYb%K zq+-Cu6Pww#xAH_vRhlaspq126mf;kk?OzZ$+IdTO6l)Td?0HoGbpAS~Ab7Drjq&EV zSmF0N| zDQy^u^JVgDc$!t2*@sggLUf0`pm%Zn5Zz95GyfGrair&Jn@m96|@OBVpOYql;vceKCp&(i=W=DJV-#nSmc+a3f$)=D^9%5TSLG`j&kkXT$c9< zJ5_-LV(OZ9>S`E@31j*`U=VS+i{{JN;X$9ZDRDwN$5e~o`^1sM$z{t|uhzLjLFq;i z-RP^8R^877*oAJ31L?Mb6}dn|_!@vJDdUYh!``RP^mBtn$7*h@%eDS5cVo`S{K#VX zB+2SQ(#v+?kD^Nm(SS4lA8(%Pmb;`ZG#Y%HP3`_qzfjz{e#RUbr~1WdT8OM$k~*%e z3fmscpPTXC^lVElN1?V+?q2I|mW0$4y%`}7UBbcsjv~}`BbAjROY(W1A54szwMX51 z5jUff@pLVaQbq|5Zumv=77Z(^K6L~8xp(SW+M$)#;uo;(Y*)yoV@Cq1;+Tgrd~!e*Wg+uD_utO8^vCP%tRd=QlNMyzn;;w}=IEdfobqpJ4; z*E%RvdK{ExD&w1q<0a}|#DkiV#Rjf}J<+H)1VSzNxjUHKJF`B;&xyD`7NC>y&gnPJ z%2I@%Az{rGtjg-hdt)FW%X{3t9OCZPN4*ol@7+T7ZIWyb{i|xXq4(aP)hSn_57B)x zuK2#>{8B|$bos3hAFm}nJ&RdjW-iky_I7`et5{q|wouI@gdCo+$K@}MzH-?Ul!}p- z88d8sAB%}PXx*Q3Yr7x=ZSrx=EifQlRD(V1P|=K#osJO}7OpAOR%lBtma;W0wO?rQ zCCzAJyW?%drL;@X>9z}_fm{K$sA66^GxB=(F=DYwHtYswpQu}}H#s6XE$F09erA4_ zSn*=&Lk?~H$1Inp5b*jkb5Z1rY12f?Abas8yT+$&RTh?K(5XgQ;Vv$x zm8wI=-z>K2Wat?ddXt_kI7x&R01Sh(b0+Y{^z-pk%x?409Eoj{`9_z8;k>EKcnbuH z?7kaxwXD9#WLIWC-Z5d2F}f*x1-?!q>r0ReCP7x#JS&?m zTja<^gMv@X1&RbH%WS`&s=-w>Kc5{$ej<#S2e_8{O?lZ`n(9Ex94Fbztf#*BW}P+k zg)8Ehc77Gaq-aXGVH_V^^fYNE>^FHxR~G|BRytCQ7iGUKSvTJK0Lz}#bYp>+L0<|b zzkDDu`24bnJ?W{q3qHqf+X7GW9V>oYVn!Q3wIgx6afw2Lxc>Gb1OM~6z&WC16F|+` zKU)TmVD}<*A6GD_m=+p7++OOxa4Gk8Pnnl-f5?^f9%WXjF%Giq0Dm6zFx4*_V4}g0;@oV@V>^*Mjwye9{IdLV z8lFl?MmT-lV=Xn&*U11!o?#egs0OfS~(FG>AqF|FTl)+EX$ox0nDMy9&NHVQ3OJG}ajh@9c9UsK|M) zMUz@oEaoc1E0}CoBVG4H?$&Ns1iw9J8&4rUwNoz|_GEE2g<`wDL4P?&5?9umHEo?n zcvA`!xxs7<-|=@HD5ISNvH*pzmp7GW`%^AvJ7!aGp_y2j-DNwQf^Rfd<-|!@*1A@& zRvzCfiLfrNSU?hH<|pC9%B1-?BUur6Mxij1Ga{`}+d5~*4*7rv{#qIi6c@8ra-DyB z1WGAn1HE7xsQo>j^vZc=Q(eg2wGWYywBUT3KH8*t*Q^}()J3C>HeKdcWvN3>VmH?0 z=?a+xHQ=06)zWq+K}NH19i?nU!xqG3Z?lg3RcCH-+UDtVZ-K{o1tXU1#?C(l&FiO! zKVIzPmB3ASWo=oOERcgZb|Jo-)I%dFAV{x;mEM(a^&IFTZ*Kqmwa(=uqX|gHgc!k@ z)!?7rALkxBl04jGJok@LH#yCLdO?K0czV`^qc|+;5~^WrQ(YQ);T2bb0i=fM5M>zYUiVw?qM&d2+VyB+^hZYPs&%q=(xVK49V;WG*Etr)F<#S17N3xB*V_Hk)3 z7_6{zH?%S)%h_o_i6y_uhu`h6k8|3(jafQs2dMm!=ewzza$a3ygDyhEmRwzly%ldc zo~G%*6X;Cz3)jI=l1&z`t|cnvp4gt1rUdj-&1_vPSwY?a{aS&j+_F}4>(lF>vC{7Q zo316hDL2s9v@E*DUtaysI{jHhcz23d-5|BcZTz`qS5vMPs#lIh_{ld{YYEswBZ&!V%%Se zDdCE-+x*6I{{?(?#6kbD5>t$^Pp3>lg%NQo&JvLn`arbmVfX&%UQ36n!|XyV)nU#a z-UNDQRo|Lj1G^>ODs@Dey4Zy5cj-wOZhOD7Bmtd?dtZfE3KHI`*ETnsgM%$U)Bxyr$_LE^5{*k94 zoYnZo{^xYd@r2xP48g7H_+8VmxyaS}-G~*}W~)ZM0}G;Dj7jKnA9Q^MyvDrQ@tcn< zR!*@Qs=6kiTr)rYCD8pkVTXEu=D2wgIO{GXaF%SNY%SUS91gffQF|TD4=ba56%Par zQ!TJ5q~%8ql1rH^*g?+ac#>dxthkfU0~Rn;DqGJ_hVIXU9o z=I4d!16q#C{aT03l4^n2tJK%#=kRZL$O88Vi{(7ZfC?=y4R9Vm@3u*D!jR$Qhw?UT z$3tdCsDqhXD8YW#Xc%A%X~3)_zb2LC>R|N_^gZO7%xs;VI;puc^frYFNtRnl{l6y@ z(xbG30b`FX>sQB|9 zrt&>cT=2CK_kwlfMzGF@xko0cejekOQoBb%9e!cVV-x7bB!S#<4Y!QTgoIpAsW6R< ztbA>HCVFLtOWQWJUh@b0=jsch^OI9jMFuETx~g8iKcoj$_W16nozMBJJK>MSM|&x) zhhF%0ZZJt^Y{ z^5wU}Oig$KSlLTOPPJ3O6dC^IH4SXX`p z9MubcJs74(6Ahkk;aSR|M*B!u*Xwm6@11P8`{QS?Y36JB6h^eJk`oJZRuBp{d0VOd zCLQ(F{+i&`WtMiHm%}XmMVHq(*`_RJ3+Q&dL zRbpfAIUKM#67;GZzV7?Anr^&2mqT=Oc&CUzt|1FdH<^AGX@owC{--f9q8y4(K2{X+H{0Vp2muni2-e2a5cLtREiq-P%II zH+9w@a?ggz{8(+OcJcJA-valWRjGoWEe5<)1-8oO%NJx)Yi4X@`1yB=c^Kcs0;WCk zm~>d=Gnuwk=zvJZ>&?l{VSAu{)oj?h@b%AmQNfkc>=t@r`F*@N{porhspB)y)Hdf1nN?I=oHKAL-NsGl=Vo6* zJ3eW$Snxy=DCyjM(&8^)v6;14ZckUSUw-&HXDM9NGxdV6(s=tLv{d_At6i{}Hfz`G zZ1a~5`)xgQ>)jt~mtEZ1$a==hWcfLz_;79H4>4E&)$VDs4}|v;3{EO{>so zb`us}Nh15~452$Eo7@r!Cd^R?UhAFQqGJ_8+Mof?rRxmrdeHpW&9d2Lv8kVT`K}C; zC*pl%QBtn^4u*rw6E7A&z7J#Gsu48uX~yLxu#sKv*CZHO0x4wAuL3&Cld!d$K}mbX z$%l=c&4F0h3@>B2Uv>peGirAJ27IOI>mc*&y50>;s%xbQPd!&u=nXs=%2A8DA2th- z>BbJzF4Kn}aI^3&U_4eMS5ub+64TCOkS!yR@mhOB5LC!M=cOLm=UkJ+(I z`SA{xFxkA6QYdq5r=Jl5Pn&TnZ=MvL^{reeQ%mDHRT5XuY)?0=P$^j0^LIfLGII+U z_qr$M#u0Bm1numXUS}50s|Fv*$r81+oSt;O9JCun#O|awRIHqjcLo!c%ftyDE1^9i zcm3^{7tLchxR-ooBtf`?zD6U66bKV&p&?fMTHSvz1UXw3Un*^S>IgQi;=_QUD6^+ z`ZFW1d%eTwx`fSHeSu`4+0Ev7Q1RO=_YeMjH+m^(cY^g6KOQD4U+j$AR!%tdpg4vPpT9Oct*GNZujOWDv&=wV z-g&^GRvO+Qo$Y54Ta^l-3X!H$aM1__PAP*yn0kG}IkV4YestU!n#YfL5!1#d{V|tO z@by*Cu0Y!Tr7a%bCf9jM8u)fOX=dwYZUCjl(t7^Zt$rEH^;)rEAH=R61+5v{Lf0r& z&`~AQbrIh&4ciHx-+4u?8r2KQk$M31;N7J3TpKc(C=+E(C}0)anP@U|sY)W3dqesz zGGz0TkB)1DBiJ@=Rq*5pQ~Din7deD-@y4iO{i}?pt6|jm+I?zQv?r#Tdh&^3SyM?x z$gHT_{xaY&ZK0g!d87Cro7elyrk`TMb!%;}-Gtl<;bK-+l{w^3jbVjIF;1;BuuK2V=nlGg8Up@|Bm{i2`z3X z+Ruy(gqI8p2=U{wsjM~J86xUhU^k-UNUA?07A5-pm$@L}1!w>T52f}c_7CwYeX6n65DMpu;qyyZM5M5(%;s!}X9vr{G%S1(fA;4* zifzBGs9cn8I5RX|>$)rjvY5jhNG%eoQZ1tx!8tMeLGp%v&Qg#7ErH7AK&t`f_6bCf zhu(w)r=`MrT5gO|nj16$9d| zp;wQ2Zqtu4JF1seeoZ3N4q*7HIi9Y-yYc{o`bxb>RSgd(#8Z`|x)dh587Il!T? zG9%r?eJRo9Ufbf0=J{Kc1y3b&9G)5v_xZqzYzM;dJ4-OySyYLjr^*DgdmS&V({|WS zVtv6>QmVO2U;O#08gx?pS$L_8h>?e-=QTx8`!ocjt|`4;0#6^WflX3Ua1--=X&<&R z`%Sh2o^VKfhN6jA_eV&&aSK5TsZB+Q+6z*EaJFI}GpA1vqw2)Tk^Fgv!mG)_3hXj(_Zf48YW4cNSkY+RJx;@3K-`=a0Sh~b^t{0JSXqd@N zF3E*?sF3LytGcp-TUjy~q9a1t5OQ%inZ+TQE_8meG!3ohLmeVuo7x@N00NF;R^&pe zh%S}bF0ay}%g5%#g;v8egKe=xB1^Q;eI)0oj?zVI%TBy|z$S}BeBUD1MOXv>HTGW`h8ak@kBsV3Mw5ik^5*#lF{>m5QvplRkhK`UXbIU z8-$8D5QFh;9;s_pgH0YO0?|5lUm$r5C26rVn?BfJxmyQ>U(N&UQH+E|K7B`j7|yOEzXS9IXO0@X?!aksYz zMyEhdii>;JY;(Bynj5Lo+E#kk_S050mDR+!XB~pAv@MoWtX#KdCZuU9#Hx`#?x~f) zD4l(V4ac|BpZ+tPZ$6?;K=lp|ewl?D$mgn3hx}u|EiNV2DsM|VOn==Kx}PRBPHCtKM&(dzt#1%NCWR02YCK6LY8Z;j8fbdik3Wqkqi8BHf+ z-T8C`)ShQx4Lv@6|3dA@XnW+|L+0n0FE7UzL6V=N#KjZsJGx+1OPe01w5cE^2<*3- z_Z9frY4MiIaU-PaHwiG{XfU8y0cQ+}G+YPA-3qCTWzDz_aB5Yv@Pz^NQ<(|?QI(Zn zHEW)Ny!VVsUw@yni@!nl>7I?^s9f z_+WO4QgSpdGRni;c>035IIr7Br08AXhD|1qO#4gyuJGRr=OY{MeS9*fH(5VXmPR)a zq)=K0>_<9URHF&#tV2OdcCMyve-CYQAueDP04eir@o;b!B;}7GUh=TH>eOO{3%Hl5~xPs|IWE z;Wmq6Sb*Y{iR~Z)UYrmRgWe^}0MVVhLtEhHKuNaK6JJ>#NWL0>2?+O4-EDAYO87)q z&}r^rm$_1d&&mBRpUzZdjyZ4S#6>tqx^;vS1V0OP_!S-x0)@EioR#>ia<&6~hhd*bTI7kBZ$umMB7P7n3? zU-Ku^*()U0K!2HCWEw?L2BiIfDGbg8YJ_dcHjb5*1DpII7)Vu@*w1pr64o}os^gUy z;XeomJ}^)`Q$~hz5G%~$x)7cjboS`GDQotsJZa{j2ZwrDXnS=SRB$2@{&aPk5E!#s zwSrI?-BED$9kBSJuaDnzu2S*X7mZ+DqZ41C-e;zpZK@9L=6 z#oHjE(q@~58Or?-t?d~%=Wz=lpX{e;SU~ALh=5why3T=ZZBmN^Nw@89HdI@?Q_U|w z7f5~Xy>kIkxsS?6;|xjf;w)2;xwotK8uGVKf@quZzcM8M8O3A2^G2hu4nN1xraxwE zH+!2B%p)uh%;@aI)wtVgW_acu@UEJrcl1>{HH>^2DNJPh;YQP+6s0d{3iDCjsApcd zwV_GbVV{8*5u>l$=H`fbX*j6a5VpQ9@HHy&XK6@b?A(C#ZM~~#aO7bkFb*QJwP;|~ zB90QH&u&}sO{2r45W1UI3wp3{7Q*H%c4?gjJJeAdLnM9BnZY!$FzxQ#hZtB@Xfxlw zj87fT(6%k~_R-{_qj5v&xC51Rvs}Q>@cqI4O^sz{!PALEpAJ{hFZU88vcC*IG4nG9 zg?Y$jN$KwHK*T~e@*2k;&uEO{&W*YO^qlu*xwECSEC7!~bR~@kI6-1fWJ9YJw&?9; zLV8VpRO7+#6z)82X=*SHCjg3K_H)*&Nf(n?Jb03lQlW)^l%Jz~1=`x*KW0Tz>lD1q z4UY2Ky@YxTYHsjy?8A_Mna$7~Gqx3Hp~ZS0>6B%CZD>&gYY$QInZ=VJMt-Uwk@nFr{6mj)C}7AOG@;hR-;rd@%ieFi>gkOsm6v<`BVc~ z4O6F7a=&y(&=p9|bps)V-@dOokf+Q2_@(=3{r8|bVRF@!`KPFQyS5f&%Ym_nmAFNP zWx+`@mjYdXty)mVr(z8zKV{GFS1y>fkmw*JTE&~*w#JI|3~H6q=E4uSi1P#JOs>V< z98vL#u;GYZ9v-SNXFPt^ZQC+LHPwqYx9WW6(x9jW)ywYyid^{K$tXeWURB#n><%TM z6Efv{NW9~ChTP48*c`0MKBZ9$sZO{j1L}(#pyub_gNa-;(y`J!nMn5+O^g|U_490-!G#9nsl_T+>RAuM?j;yzf7uC*kU)`d zMpcAPH#_()!XXMI>rOn7utB8deoYE4kh?y>Hu|;O)NFz*$x<=!oTl~xKdtm_Ob@ju z0-wKaPYq1sLC>w3(G8I0WlQvzLw%dG{2u-MK*n7r@=#w6G%t34={dkJy>IHNBsBlR zZeE?NCz+wAmxznUdFuc~!@eg$`f((qihVqTlGl-Z&7@!AVBbiipMw$f&r_#>wt%?K z&f&HC`Zu45726!0gw9U@E$I3KnZz)bgK*J1rrL4^=IrG|pl+db_-oLxH(1V3s*E3h zZnZ#u^o5Q;kB5GbW*e2(cv;Ye3U<~7ZL<{XbQMo zP~^K{I(bR9rDLk19F2RI=ulvk?&as6fT8nl&%`70m zqT3$vX=zSl{*V-G(xy_afeO5wZ0$Q9$B$@1R^8m8V#!*erun)u3vsb?(_794_U-;zJ667^A0SB z%w1vd&d{Cy1tBUzr`UF9eeVirMR2q}>AjOiO>1+2@5Ucqoe|%gZW$h5%f!r2+oB|& zb6zi-ZaD1Qj1aKVq~-9u$O*{VCIsO&6;P} zr<1eety#BeyF-kiW)8o%m_+()H6UOt#wxy&id;8HrsIeMPY(;|;GdO}Ms6~;QE!$< ziPq%8C`J=?_QfcdF@9okvGaKcMDNLH&3I^u1R+yq2Vhz9O}K6FMq0tslRrRfqD zX~yV^E`y@ZPiAg%W%k)of{ylX59$DC25}=mjBAPQ?c>$VR8zyOfngNfj1y`+d{A}g z;vp&+obdCW=V*diX_d6jB8@t^$yIG4DxyKuv;|wV@zp;Dgs=VD!-g(P$tMCH@chiw zA099|ZGF$fYi(~v>@?!?2A4l2kgMGV!!p4_i8(kQuUYrebE&$@A68K@uLBACQdoxq zuE*xYulX)`se*?QLa--JpUyzu8NWGf9$UOj1vW65`k*`5f}QVJr0sjiAzaqx*Kq;4 zh!zp4RdwwEiW0(Q2)~}+-MulhYd8wJJ>KUq$rUgw<~2Tv=sUv1k*gs8K&f)R8J*w) zuVV-uLY>m)M=!I_s`AWy)nx;+e)0CP;1!<5d9g)5i)8}0#O(=6wj&(B^|;)hPV%6% zKe+21D7eA#tn5H_iKG+@!be=EuctTUT7AT1(QBh)+9O1GL-}i?GENRXNf{PI`22JF zkl~%+ULs6H5cbx|DXml6KI54*85(?89o#Zs{Bu{R-9+DhbY?}t(-(}RAbq-Y;~N4~ zqE5dlRt|R78cMFcD1vpuhUm_1(o>_4eYA6S41tj#s{-K5nPE{BQF;Agvag@alK1{4 zJ7+O4F(f!q{&l^>y%}seFyO!!m)=8ux-39l!LN@Gjj5~3hbfqWQn(Xf^0IYXxxk-C z`$XMb5y>7yGn4BLbGJGw$S(~ht@Xh4`Rv7x#!4t_q7Nqz>)K6 zs9R~QE-Qjpgi*!g9=*ZgWZz&4of=e ziT|1mNVDnHSt1M(XmD_`v!7ziw$6U6k22VE%T^KeGV?!fD-hrAU0y+`i}TWTUR028 zxw8bfxAXDb^L#FM+{(ofU86Z10s@#m7)HSKmH~5SN46|XjEzPouNL>jK=)JTw3#3r!g)NWdTBWZf z3x^NM@Rd;sso zK}YlyGx)&(S~5}46Fms13;me{6T`(D(KOKp@uZZP*o9+(Y{r6FKOjslWh^-mT|CSO zS_E$ec<6LeP>V%dwotk3ZjOkYkebW}6Ecemhf~J-_jm<9Ewg}_Epzd+;=D^m)Eq#n zIoh{M*VprjuX*??N!;Dym2cg#a#q;_3OGH?%{EqQ>l=Ja*0ZPi;S$JHB9J=)NrU)e z1$yV|ogD=6UghCR13QzN+OxXGnzIM2;mk1-`%GA0=f#ICu9PeJvINe!Rg}} zPK5$C4y!b`ap+T1m}|O&_?KhA{gE_s!{Np|4!qWWKX61v?5Og4e})PVB~>**%)O4k z?A%mgYI5FuXl;9!8eRMObB#j7^@^ZbfBu%;IG4>e(gnNkRqBC%7n?Uq=uS>u*rwT+ zSGE0-J@FXPMU}+2Di%ijfmambz@$cQ(~2haLGY?4b@e`bVN;gDpHYAj_Zyq=mIqIB zU}Ven&WPKWo=WnWW(JeU-faRR#$j47fFew+z`X=sNuVpfx0lRK(dv{ z@7}3`nMNM(@=yZmcH0xtBR_G_dM9Cb$6L&4sk1KFtzT@^nx)N?Yz}p#mfp&l^ct!|j<>c5Y=P*A6TbN^6q>iqL2$Gllg$43C5rP|n_ap_ zqF(Q)176u>&{0ngWYZg>xXX&2-gPPgdL0}}%<64+5^OES{TxEs-Q_!E$qxot_~xkbFT&WMSNOSv-PHK~K9HBEg`vYL zHOlaZjy6AAvB0gQqe9w1lRsn0wdn~+kD=u7$4q27Mz>Y+etMItHMxy*AvOKG_?Gi( zgY#xBGNu;syrvSbV;t5jd0>TlN}66G`}|mgc)7Qph(Q~rx+#)hxJDmgJO*yaEt^YW zcM4FN@x-~zg)s2{86Di`dPj5CN!hYd$SuQ6yx(QpTfAF>kvJ$)2ywXOmIhfL5NTac zD%Vi@J*D>;sNbabHZ;9TMIJZKT>o+W#9Y*hMXx)GSM&kQf00SO(@oZ?X}`y?a(q&8 z_td)m`Oez_l6#hP^2rX5rH3nnB^D~g1bwBOhcI4lse-QAr_R$6?Xxni%V9Ybj9rg$ z2YUsvS4gp@yT7AEz)g}C8@(p&qk2;gjeKQuezs=I*C)4Uy?lY@Lb9Duk*UXB2olyg zz?)kB%!IUW0(f*{_Dd4Uoe8q6bK^jFK}_)^>h}Yuo`jO|#0tV@9J2)Pxemi|DhD4E z@y!HsXoUg;(#84cyUBTvv*vyGF89n8b`h59i`N`%4fo0(y2p?NlEFS@j9`|i^hNMH z6&&AfE7^%%k=~6E-0b|maqn^3jwaQe!8c^GH9EedxxhK#t zlpg~3paGj!yH0a4~ zk?4-p@%>GHW!(xHk%bRmENhMG_Vo_tqzmJ88nFBHbA&nIS}oUq0NvaKhGWdyuPOq-r(|h zBaWd~gZeo7gnfs46J*_6TW$#l#$iDd{&D7g!p$_*2o!Okp<>`}Pxi z@%gnAoajd4ytv@ES$E~;b~blkj@6`cr}6Hv@gROrKf`7jbUsz??eql6a|)55fwEM{ z%)zt)T7tv!h08iz-t6(tSdG(1e`}y`D5tUz5;b9nR0Vo0tp`S=tex*3+l3z2gWI^q zY`RrR_axTD>V3smlTg@Y!()lnB;`hnBTrzqchUT$Cn~S&hke*q)PRi!iW)b=O&pbh1+xzxw-Z& z`DvZ%B=*3xw3t{en>;(4u`2o(Hb-Gm++mc0T026ISKDJ8Z_sEpO=ho52%R!pj}kBQ zgkY$)f=L&ytA_V7ukZyI#*=VImuznU{d~E1A9rbgm*cVUZ?8+bvphAVJN#Sp*6bZS zhQ<3Sg;ll+Up7w3^U*IxYNlU*w;HJSTuIfOvBe^sO4w3z8^$z54cTokQ)tv_P)YDT_1yJ3V5O!Yq)+Pm9#%f<777&~Q1RA7blj zFPN6|^j%M9qW|pQI6AxDvovxvIhrSF*2{c-w!4@wJRZ!2{EB`ucko0?cP>5sE^HXP z5`*rs;Y!VW&4RmfoBERHsNSVw!+OaCa!a|fA<4`BRgYP{!F7;q{I|_8=huF1uM-Vg z3_fD*kI@vCCz>b#Pc}iJyrPodG{k!vI`-6S{n2CMFUIkKZFplMrrOK7hL_8l-9N}+!3n^(>A^Z;Jl;I_uH@txtVtczm=i|4)Z`IMwqRK;t5gTY&{wRi#HHnG zm~034tD+c88&i3ID#-q}esJb25OR*+T!xt(zawumm%whO8a{ow4N%e~H^a&V|IeH> zKlq1SZh^iY_sv0vQFpx#Glj1|?U)OVP=`t`g+3v+szX~x5@ZG-s-1d$%&L+5WR2F zAjs*-vm&)O+XDeWgU)q>Qn>QVdfJV-GT36@>ABs1SV$*-!5n;eaN3U=CK)vGnF=j; zmn`?WO+Ae}n^}&FM>0w}s3Ce|8L86rnEX#oE#$OzOg*(NY6{If0J#+%B@hiGiNMh% zb%)KBZ_GONsl{gQGVh>*-4vO6)T!}nX=yJtxIJYc%$UgFmKP=f!ja6R$ONx@_0t7w zy}$CYTTcF@W)$^X^%o|EB`2bPWb023*M`gArg3~Dmq8%}L3XaQI>i7j=Z`|#^Hn!S z6Tn`5A@^F}?(>HcF0=B5=rLe^4faXv^O-V9@QAUr@tlhc4jOg#axu25mXso_>c6a- z%%cX}MRrq9Iek2!b_R7UvrP-xUDnn>gWDf>A8Y`Wd+DK|pVlSd9oy%rdm-z74hn%ncbnbt89+1i;O^)Q5B zO+(k0son$&z&${9D2Mt|xl{$X{KYT99@RAQK=0-EXZ!0EN8KtIe) zCQBl`lK_qwFK8gdS$X)$fk+#(ma5tApSUg!+C+Jg?t2)H1v*0JR z`J_YjJ&8;9-Ty)xSu3L+uwB3Nsy?Htp!w`K(iKwO9`?LJgEc#C3X_ zT<6U2jX*578q;)8)uoh!(Wd5)?0tUUQzs78UyAi0J< zW?90qI@peKh#;6SYr!2XSUY}noI|zMdfMKp&XAK;8uavU6N3bTzN7F2*6iifk6Na8 zC-ntyo%{A*j{pkGsdu-%pQc@{1hQv9`m|nt{-&cn&=<#U;PgZ(vI8jmRho2WP~+t1 zZ#z25Ig(Yw>Occ9GXR3%26a?;0|!ecJS8~oUA0(tIu^OHCqu0o>G4dC37j#yiehwB zmt6ZjHUxOyq+YSvz*9!|q!rE*z`YzyE*J~>MywnsO7D6S^`bf&^rGt25* zWc|Uj{*q>Woa2&_X&W(SO#@A$?NQz0y_Jx?SWDhTT;V96yAfu8uY^1lTQ3y%rSRyD zf>3Evc7JJQQub~o!}kJgPrJU69Ba9ndnyY3ha>3whyQbXSttXV5w0%r&&Wvsa?O_x zCl)*vR#N@nLN@)6pZVip(j!DTW1s(bckwsk`>>XvP>@O5g30$dw)?FY^hlvtn|+io zK6-!ez6Jo5W~ce@^okbsztU@2;C9VizS#~gMeUi4#E9x9myJM`|Jow?M*?V`F3J|? zzFqkm4z;rU`g*t~|H-F?6j~bMo3N;(?!MY4g>QfNjb8k3ALhLTpf)Z&yRYNEeX^`a zZ66I{6zQZL1C@Vm=cDQE<9zOf;M;Y7`EM7zPw(H>XJ==4>UTE{U-t0)zybNXw4N^M zrfN$u7~W2ZAj;g0+*|Ug8A{bPqJFMbFgO^^_!c-J4ll-6V`pU~4j6KxJoUfbVn6uL zu#P~jR;OmI)+$6dbSp_^!GbpSOsB(>Ka}e=fSg=@YMBOqZI6_Trx%Tpy?WE|GX{ zT<}wZra1wmbd)gsH@E1!C;N~1eg_bq6VLqfcmHzl_FO>h%Av6CZ>0V&fBAOLUekJU z?*H3AW3K^k%qh+AZ~n_azZC%CP5Iw=%x@l1SrM=;xrcWDze9L)0*V;BR4wq&FZ3@j z^EU|Z@66Nx`$hjp!2jFEo!WBW=)^aR|9^qe{yzf#uiLxxKmN}S{zmKmk1G0qxr2Rw z{8wAyzo78B59 z`%wOLKI`ufasEdamN&;TXlI3D9db^rJ^Pf1g=&%Uo(#?0l+NFOz5uZE=FMdYEdMJw zf9&;}x26dhNeizx@45@Ft+8?*nTg{&vjimczayCf*7+Zu?|+o8|H1iu2-ZL@z03{qzj+XAfOilth7l4!4QKWbdvzqa zeQzjRqxCq=Wztp%=ok1-mM!z~^1q6j=6X_sKG2Zx;G<=S#rNy)zkNbTz5vS{ z{kP-apW?Fe5f7JzYVz0ESEm8Wfj0oMbkEd~SdNje|KRzZ+F$Lj&*!_GEVuGn(TDFH zJYb~##GT$BxBUA3lYW1&zH~aPDH0Ysqsfj_p8gvu|HuL$wk9H8(Cd`j`wwe<-`bxu z`Dxb{XRII7*5A&hk)f08;AaHg{`EQ+&c0g5#7l;13X_N|GLgv6v{q-tn;MvOPTLi)Bl7?YitOXQTeZO=e(XY@`v0%S+<0O^znl2>y@jjB&IN|Lrte1F;dyl3r)vP}=-tSjp^ zzyI;0dLRNuPFW5&{HwHtD8gm_kb20X!{2^vbYj*9=a+@Ip%nv7eF2T40x z5h5L6e}DJ?y2TazWyFg%9<{%&!%rCe`R9*!{wB8bP0!v}gmv2#{{1%JelpE_e%sew z{MDYEf}7+vttI-dWlp=wqyi=M)nP$EM^Ob-xNUvUXRiv3BOS?f-sbwr--vr8c6fGq zXI#J8?P?_GyPY`!1WOZlCI4n)Pd(sE+8^hMhJX8=8c;wprWm3pjS^`Cj8j)Vci6n;B$2mf-VpjCk*K`*jNRBK6~eCMY6!= zL^98*rv@4VjZUb*;4+uw|M9^3e{73C@%-I_hHDmC?M(5tNP4NvXPyUfv0Ry%Z&`(M z7mVMhfP(MF)3|wG`r-6jyrjFlhhAEi0I*X}fAQTS|K7d<06bqmZ&C0lN;Tj z(^aF?7dXDwd5B*Ajw9|DfC&-c_Y(J3_$xPi%I<%=Kt^}oS*nX={V&qqJD{m_+Z$Fy zR1_VhN*|TpRGM@U>C$@_PfWK;KlA3-WBX`!6-cHHAu5Z<*Mt6&(4T`5QUM^{=W4^}pLz1v z_hs6Is7km?ys`@Pmi*F@S|LK zTjJjw9rM>Q{7n4cZutCKj$MBMp8vX)&u{;qwjX-zthIhI>5U>S(mQqIktxV?>_4GO+aUCUyRB9yXXI!UH_1Ury(G*36N|0f9*s3 ztgS%ZU6FL(c8>G%Kldm4)jfZ5FW2bNv|BJ|ci^s*IBK;`tMKxbY$2F%jI(iSVXCX< zXWyf{6y`^tg}gp5bLmHlqb#>edBzffhVTK_*E_(}|^y7LzaiTsN(2dFYWFN?X=IT2qlEy8aoDl6+ zJN>b8h2DxYN=f^MGJ-j?M|WeK5rPyJdo3_vWlQ!-b>KMKEoVUJl`8fCS2J2EAF7&y zFGH(c)Ckf9CW}i9e(aSVn#So4HO~2KLhxM8RVPdHnG8TW<=*%8zm~spO)~8^3Wbie zLm-L7PYDG_p#mu@lPEOR`R!71c0T`kF^B$2>Er;KHL0_Ku!}#be`vvv0-t^??!?-; zpSa|CA9#_p!^TBK>;lV?q32A{LWjyW(>@F~hYv~Izw9?*CdT%&V|%Ad0`gpOPP_CU z6%FWxkB|(9syi%22zF2e5OkkR4a6_-sU+<2==5%e>E`HgVHrv;Y8)g9RqXW$`R&=6 z;>xWTNV)IWsOIGPE?izx{#kZRz`<_If2jR@wBvD7-gT3p1c<$`wU;2?+FMuL=h zw*+p-Vd{)!pO=Z4{}X_LZ$;c1#3&|_H zpv=|o$s_sk;B}exVv}E3Qkew#Hs%$_TG~(W@WeZ18$b35qcu_yemaG&rj}?)_(VNk zyWr?|0T;*7Um=Q*hky%NuPOV-{NI1;rt8Z;wn-W0bfbfe3k_23RE@uXsmaq@PCuQ# zOV<>l!kBmWu(fbIXQ<}s{_D72whUeD!S#D1q?hmLULJ$zg^-+qT{tcnR#~9gF*C`2 zlD|s^p{;2_*x#iJu$exsEH$H&**F*KVaIWTJBM44PCY7j`fbVL;q2x`)H?L$@nyEVE;+EC@jKq(=88mMZ5swVxIhar6(eq1PMbl%~c z9gE6mU!wJ5bvnNu&3eNA}eZy>fJ9kR|irdO)2y#%DZGWhv&AjPhSd(;)0Uros?Eg3g|9$k; zE2!Px?^y;3$fLHzs;tX5@fL9ru97z?%y47zg`aZ>_Ic&LXr(V*5&~+6)4+7djs~{3 z2YTwme7>)wp?eeO1dcLM8pM9!SHI~}inWV0QuJ}DrZ9RH&>y24*HDo7O$ZJQ` zX)=Y`^Lgh7JJ5&spxqud9X8wB#@h`7Ia3Khi?lNVq7WFPh$gn??Oh2`r*FWWb68z= zdwA3LsG`8FGveL#$HbfHH#CydC2wz66hEoy-!ci-N}+wrd@uw1){a#VGYsZ(Ih?%IOVjrL~Bso^0Dr?z$$F~=w03@u%fGY9H>-5oxuU^NfVWoWHVeMeT>W1k`= zat>MI78I`a9<&~qkNe5(YUH_WCAtt?dNPMAe4hb&VS|C82~bTl;OW zZ&4h1YqSSW<31y3rS<+t<@Gx8jSZR z1z{ zDc+sZd$9CU3ef)L{TQL6@;lMM6r2;dd&xQ!!OvX({$$DKM>geu(9VFMMc3|iEuX#< z*s)zZe*7GlF5A#)34RB9i<}x@Q2w&?#=ho*a36*R7JO@gFE)!FZm1M7_)QP>o? zc3mEb13+9_z^0hgeoT%EU1i;YMv8Bj zlaifJ!A|Gviy51t{hrqm^9}n8?n#lO77#N2Lp^!~!SIe;$3g!_m(2PVT}{R!BN|B@ zb^y19P96A;o8k68iK}D{fzHV?IIo@ccs%}yB=M(7m=Xfqnyuz_2DHhse|#55ax>LU zKnmK?+^CcC;JGSPV&gomCWTHu{A(vZ^=n+!004ZG!UBJ&thncw;=9e`TiSWC zvFMe?)+-Q*es`~xe%v$14r_$Zq!p>UVz!?Qo3C$TSsRY<&a!FTcq#OBBcx@ofsN1a zKt`w-Yg~I5ho2F8I+o{K{kpi3m*T;kPRy4W>*uN(71wLGDJ2M^{CWc)?1Ck)$u9_t zD7m1{vM8f^4p8xmWLV1@;y%o~`?V6K>L%t+0#QL?RR6{&+u(c74uWzy;`8i{~uge2_j+J>{uC~Xsf_Ic8!!G!*FPYA$ zFWwaoQ`L0}q)1@9qMYEbMlZfty`d&dVMg5RcO71180cH01wtK{!{gcRSvPoAYhMX% z_9x3={|O?hC)l*35skY_>ZXT`FdF zc}^z82Gdnshu?VR2_%2tXhprSmWQSeyVV!z$EzVk+5%r_ zWG+PNvZfTz4H_u;OY%Z?Ttv(Y_vh8pqM|h;%(9zn>LWFPDK6t4ggUIVU?_V|Xs2{4 zjpSZq0em`FCP_S<9Hlroa9cbIrl6`FF|%yb}_P%mWR}RBZWriI}7S!%|md2;{2Ww~AgNSd9Ky z@N`p)GY^Z3gCD_Yl71uxIGL6L?d~pbTGXN3wV=bBu%{~;_Rt-DVqqgtEnNm$8Fwd} zaVHv+wpx!?Y_$tqrz#pZr>--?<&T%NLdV#C6U@;ISQL&a01j8yHRYf?tjy#1lSZD$%l|6zt8DT;%Y zc%ceh#l(>Ib>k2lyF;ZoUV71G0FBznJCtzjFyV%V>d_koz8N-(cFN1f?#d1yG?y22 zkw(U_LI*cI+iM|VO$T|q$_ZFf8?Z?f?AStvA8Pthh1AXc;u7Pn=`bv3cHT|4SaN}wK z0EHw47%)!Zs_SOkB6_ar4SbByn?$^yrUN1n*N%XLJ<2$0%yxl^5b3bI8j_IMFh_8h zrql!iWRS(Je8xJpSl!Lqj%$vEY*C^Ty;g-Z9C^-FwtqHW~G z3E;5hr*Y|%-H?j7W|CtA!F(lYJSSPbj8Uqdic=ghA03)4}JiSlH6?5hTEn**yd; zl2|B@8ebCr#4q+fSF140l$=l3lFvJJ|1Sr%!x$0f2*g!}}BfFFhXLjf%{nV384bIov=E06`(w=Ne-G&H7uK zBb@J#4A^n&`>+UMZE2+G-VU!(e_6NvU6rol0j&>~%t{hbTsKXz>hhu)sntiA&kkFn zmXMK}>=UJbf`s799*1w+g1$d2>zhGD$=!QA zmg!P*A>e+8gWz}#bN03C+u4uLTpdR3ym-eE2)P!QPVmX_ndS!JDVobP>piX|FAMC9 z@d-D(jphA*;a{Zvg~$rhcbPgg3=+Q|A04aK5ebeefl0!4@>+xnN5S{)`27(5<=_9x z0oG@-e54W11bwVS0DG8U&9>#;`bPnMd(9M5cPt~-`I;PV+ecG|eKJjBd=aL!Ew1hO z#ZfMxYAtMCbbUR+m&dArQcn?=o*A2t3Vg2WxGgS6XqBFYaJO~&OS}vG94d!LUD+91 zV0XUz%G7yAJtV%X%P;_eV}!jL75^e5fD!f=LUkY?ng?}FW)i>MxcXNKo+bsK&o)Heu}5#` zOP9cNCutwsHXu1HRc|oD&EA-}X0eti^VL|rp|3KyjfJ+7$&EHNM3(0j@z%~d$t_kI zp8fnfZNfLUW2q`yL(9R`Kb7^oup}!g0WY~hf!n63@oGAJ3e`g)rMAoBrZgqmZ3RDJ zI%}>}R9dVKi&(NXg7@bWgHtlEc?oFA_bvCR< zJZH4J)R+0d+CYrHl+_2X%>3`u~;v|9l0fEz8S)TsdY6_STEj zUFbHYBW4;&RNp+I4}D!E(|y#>pInCXpWpd%sfIUbj@GqsZoll1|B51`r{-P+<5rhnk=kFx=ORNht_o^ zp|V}rbvbx-`b(UybMr$2oB)sP8&n-9ERa8*(NuO%_3rg6FzjF-x->;ZHF|_|1J53_ z^BnC|I1*IL04bj^%O95|H$IXk82IPZX&l%1&_rrrtW-_A7=_m%zF!C}sD6@@?69Re zFC03%`{+x%=GlU)MLTsLOC9j%(t9mY9F~qB2$QV@%w7B0n}oSdwr07T`_3I*0a=AxMrSfjfSYg4;rI zIhvxz1N>6{dTF z$s&p7SX9eyEvCFQx%%O`CZ_{>F8|T@)}{$Nyn^PTMWrS@aK0WL2sfPMb}7Qry|`(B zjq{~kV_;&wG`1|^3eRJ%7MSfey^9it_EvAJnr^o%TnJoI$9pYuai^RTbUxyF{&dMh zTM+B9G}YAqI7dIwr88*Tr0;OurqSaUdVZQTk$fA(zcJuaeA1silZ`u%)YTi1t9^as zfm!Cd=^p@QiR3-*C}hJfTpFLbh1Jyx;J-PXn-_NJO7o|iVNa1@Ima%z99_g&=+h~_ zjMW;I3(5G5%W#62-}dUYBwbs@jPbZ7oV{{btNDubIB)(8ejjyCv*+_xJ)$_SWv4=Oe^?d=F(rsXQ~DN%!qQG6r( zf`?76Taf)XYY2K$8tzG~6m=Kza3HVHt8q6eb89%{?3xpHs#0n{V<=xZ>}u;2%j1Ox zCcww%vy*i%CEtp-@aIVn)*5nd_vrT=JsZuuHx?_-^Rq3VA_1L!mpL%{;?hY0sB?V_ z0juIZ5F`wOp0|AzPyDKC3FcN=quY zUN030$p==IUtkUsk%Alc7%m48-m(fihqDShmawJj$|q5be_dzxY7AJ7HG{>b7q#CS zw7fLXS;~^L&#T)RT--~Vw0CrmO_|9(6=w={3si@wlr&jp1WEQE5-UG7m}-FYjFe6c z<;Ugs^Sp;DG`)&CKQXCK@&`I`770}IQjnBli+vPzT+5z)v=03L#>v zz1?b#45s3(J*p2-Rke#(ZRp}pq3o((>-^Mwt|`ecqlfm0hBs*Y!v4PE#ndP68EdK}>z1Y0-$6BU8h!tE<++$4CO_C5<9eCQ#X~Q{EdCuKFYI|& z!<8(@j2x*8?7{)*7JFSv&=oV(xpZADCP7ojcpEePDa{G@U0Q*?od}crdB`Rz&p7g7 z14Z3cCtNkGK^6;yX@P}hF1`@8Q^)RN zb;y_m9US#KY}*lqPZL0iHI3?;VWN1GuM6PDMJ0~H9<2-4pLJPXW4Sao{gSnD#Yc{A z;Yhu=w1$OZ0ZmNEByyxlLC!i{cHWK>{uskh8lKtWgs)*_9BV1Bat6ch_ba+Wr_^_U%gZ@CX*q?j?U=#Bf=0Yr7{9A1V zjsrhH0rEXV&xtEaZQFjI9`l}3hew_u=HFfR$^OV2GHIhrPgo>tpCyT|#E3>|ovsZ!ImmzV{jYo_zc7(1#))k5^wA80u7dBYk41d^ zc;WH$db0I7VD0;@&qOohi0^sN>uGEe*a*#qi^_Ym(7&!{cQUW05(V6GP=ii9~uJTwP;1IdfCTr*pAZ)D&m%t@&py z4}DHj@Z59^;C(mycNOzKLGa~pb&dD`FdNzEi!WDqpYCi>E;5eITr!jXHv|8t1db6W z8%`oAmvdj%M=A3YDNZkB)TK;%(EK0Tng9GF|6)k3qBx^hz&i@ffoV5=GxPAlr^+X_ z_k_rfU53(J1-wgiL!TM@UkGC_({0@rK37$h!Av?GRd|`anYE2#7wOF^VhTg{oMqEzAFf9j8fugtQ0Oc>f}1`x~L@;$D)Pht3^E2*z zPy?jD8XqMrj48yXMUlLo7)}o8I2c70qT7(Ath1dp^~nl6UH8@24peE~WBUXenr`Q< zkT*lo5)JO8(xpOFRK~XQ6lXYb!>I*0iCML3?qMIa+zAQVvud6aNGDqafwNeXhg({& zp|~vA{Jq~p?PBB($lwm=$ak8OCZ?iA)2hJ1^-q5OCJB)q7q)~9>tB_4*6q&F>CfPo zy$ewwv;3i%DLyaat_XgoAc?>})w&xH(&RSNTD>z*m7$Ata=|7DQ&mz8Qu%W88#d%B zn-u!Hv9HO;`V2R4bPq^}jC8gb@l}!sLYts|MjHYx=Hvu3$B2qp{^s>zcQ6HYCpTuQ zg767(gbDm2jbhfDcwg3g5Y&Q`Y8$gBNUhF+_3?&h2rpy6lcOOox&0Fl{VkDpC7^Q& zTfWZ-Vo%U@A#FQjuOuw2TC1@G!&LHA3pQVA@H`fV@w)8xdBlU#befC{VskEY7%J`afQ*}G(!J-*LCRXBW@;@V`>PFy*Y7Eqyv=sbQS{~caCniAHh)r+ zO5|;p(U=9rOlyEZO&CQ*%?05N7lij}9L0-QV0Fbh&1olw-mwQM-XGE>)oLvJPkE&x z@0ebdNpw3js4D;TDZC?ed}&>xwC{4cOPCaac)R-039`0cH?vb-4w@V`nW$eEiyodF z&9hi%s6h`0pdTz=FG$GXao&_Y*zp>5d311r91XI`5OytCXuK{n`l+xM#6;`%#;!31 zKgZ$Slbw+@f;XgY?ypbV=k_|B7Ek$LJ$iY z4#FPX*X}@*4DWX>1Clyypz*dzbZ25I5uH8|OL`>}qOtfw#j}E~Wv|lBxvMC5^nVR? z??MCu`mBxT{Jua<6FaiOWW^?m{`8Ghyl}{`cQPcVCKX&!VV{=1^RT zXya8pXSKd_ZW+g$FC4}BD|~^RPE89Y~yMlD-QagC-uRb$eK)uKoOI~?NJ+TmtI%R<}zI0M=s5&>A;KQ+vw^bIUgnV3m zh4&>MJ@E}&SpdaYzp+bAt!Nr0Dp_GS8mT;~_Ef86Joiynl;&-+A-#*0&hHNR7eTOK zaper^+<>X*J^)fl(j<7Q3ZlOZcCn{D?LC{2hNRIV3z@FI((w#hVNE@ zmpLNbFoM(43bE~6?z7NV9*SCv?!W;E;xXlfYI52&4%jF`ZR5e2V$_$|RqC9eDR`2w zL6jR@W!R3#Iaaj+sK8Cf=Y_@0~nX3%f*c zdE4b-Vwbz7Wt4m)2-p(O0Kc=b*u9T@p;AfgOH`GCwsW->enwn)5LJ4UA%J6;Z9GZw zf34&1{Pj*&jFRGwDfhR_Bxwceo=L8k`Cpenq28(YW)k;Q(@K!qcXY}VXS7;|$!m}w zr?(~^7YDx?dHr%YOP_(!d67AWcxS&$WNI)(E&89P7PMbwpDkZ~1z7-e72XPyd3 z$2EhyWV92c8;y?vu@|qNuDoF76O6LW;8Efv=GsT-tOX883mAB3T4+tOL*C^yMxv;3 z7Dp+~pk7Yuw4Aa$JE8k3MR849$h#KK*Zu%mxE2{{Tg7dSXkk9=CNv+u)aAaXa_6f- znn!_dP2psQN=JYrENrlkeGrMaYtSPg{>b1l@$Du*WcqE5k}pz`#{>ag(qhaqW0eo41sMZ^Z=r=I2 zUE_7rW?b!sN^G>8+*D&jd{}zvQr1gd&$6Z>#{}Lb=nO}24lFcpF<{WR)s zCCp);vBjh(0Bb_kTe^&<5GVw+3@e2vSQmQ$xJIg|-%7SWK06Pi zXfsY-%@%pOR|!@q?}Qt#(rAyF^57nW{rr`v8K|5o#WxesPGRbayMe`_hxe-~$RWRV zj;;I--UT+-A47*G7gUKxZ^Us&mNa)`4+d>adfdK}HRo35y|7*aQt zs}>g*?VxOS?iXEgnmtWkZ(vy`-%d%j5%_65J+~~hb`Mu6vZLeZjx|^{Ik%4}Xdx^( zHCOh&@T)UP5co=2zEeEH!HsE@;YxJN9(F{>iM)5#JvVo4c(R>qgWaug^N4D+HO_=3 zZ#OB9@KzTZxe^M|IDmR+25xQ)T`WW{HmGO6Oe)_{b)8&q?4)L@gY(^Q(FWhgf2a~u z1P;_F@UWD8zvGCe?O6`ck2_%MbBZ*Y4=y3FCo8BsKi!A`+hjd(tqV&G@${(L?Q+)` zD>8}tEE{5hhrbCZewlGEkWyW#Jc=vQz|C5<76R$moY&ISX@607;ggBx*~z1ZEr~2k z{)M}xvspXOaZke+#tRmacO!v?5Syg;j{BdPk&r*cBNb{;S(2A76xQqhLK!8 z)E}>u_vx@l-yUgt`N~d0u@y5$!(_QOFxKVSzAt0r#m(e8n(nYVd~~~NcXzVRsocrb zgTKTM4!6h{xMS=1smEy+X9==ddUNNdD0V=)fUPxF!L^a65qS&KW2>t+5{7xMhd*~E zt*QakOT{}Y&R!F~>@rfNm_8W|9AUC*RKBC2HsXo$=0CiC$4<4iZOM&zfAM^)cSB9A z@H@10ajWEHa(6B*zqCQXQxOS)YS*=aixp$W<`-$Cc0MO8nxyM^hwgYsWbk0n9wioK z_B#^KR)5vC`KbgKdiqB>?aB7_HP6hAcm#jwl66_zoKbvrc0aS2R?gQexYfmN+!^zF=vy z3eui^MZao^G$E7k@WL7%l_TK1Ini%F4?xVMugaMh0Wd_g%({yF^8{z;sZJf6AwdiH4=tP}(O=58jRN?FHvF z6v=;mX~FR@@=7&oq=m_Jg1=@e4)4rx4Q8J0)!z#acS$V~5_)*pdihR%udvZbyXpP4 ziZP77iNttec*lZeh@DX(2#Ff8P0qKfcTV(@yc8i|+$*dFs>3`b1WyQzhd&u6#YlV; zY57)|{@(raONH46gKeO-@;CCn6fIWipn?km(3Pa^l%DT?Z!3}c)!k_#B5hcxMyb|L z5+sJ9#Y4Udp;YWzc$I8rcxGy+ zZ(GeV^UnxZ{Z_A6Ai4Dmv3O_EdMBvJ6E>ZMXeuU$*bSlJnBk zA5^Ny<473-byUlHS%nLtD?$Ux5zYIqn&}l$Oc$wDI*K5QVUl8M>iS4u^#-JG@?_^; zM|lq!p$pO6K0JeXs!^8>oKgWkWGc%puiTVv2CZ#%l}cDloNfa=T~r3z9Qws7QwBQe zR8SPaKD;IQ#>?Qf#`XMHJKL=7?SfBmckrZdt@HY9%scLuage z;Al@=)t#&DU-)D`zlWrB`Cvn3!Y|PFh`9G3y3)OaTskl=r`sKU#{8d) z*#E56UAzdZ`d)=^&7nQ4vP*v~9Inn9MavLxqU!%;5J^U<)WRNkDVn=(p2>ur>5(&~ zF#A1%(r|iK0&;T&84x?nFx19g2y3P1NYC^Lan1-T!Ul*SNfWGF$mYf}H}8k_Y%$GV z*%R^G#F^ASkr366+yRj1FdP$#aC6*1>}8+p4bG8tsMDe7RUfa_7$2U2q!l@V9+p>m zc4rtKDu+w?x6c?Tl(&024*b?4ov}vXZTB_tlDi4B#@Au%ZxWrK<%3`iLzTV#9eeCs zm$Zp}_N^0}8hn!~52|Vf(p-5960NHr=S_6ziN4L7%4tyN7Tn6OjeF*}kxZG6sww_c z9-vWO(YpFxW+F`7v#p28Q^d`w7m}6d&zNmY^+Y|JGeCO`6TSa=iA5Z~z{@Kgss6G( z!d!rMa+E$BZaKjF8tq`XYB{7-v|YT4jP~@wOuR_FJyaa${BDVOp)$E9%z0M{WvI(d z!=kpTZJA|*bB=zMQ8j26yVJyzmh5uTweoVyYv?kc)Z)O~fW(biymO~*-uOH%Pk|Ty zfE9()(v;{e_ZThTWqg~Em~vztlE=Y0X&yk|adG7KO3{v6`y6|T(w_6{XkkoUE}oK6 z@|`qCoVlx4uWs|D$=rF~b*FWcVrC|pV2gQM4l!w$zhTM$xFu7Jl-}Wahsq7hx(7ai zn{8eSYy=wd`RH(wh3XcL*azLOM>^=WbGn`!)UNANVcxjK`a3Dj9>gf`xPd_Dt`yBq zK@)b;u~GDYkjq!!D{PEk#nYebX+yrgEu31X66SAUxGPc z!F9SdjwS0CdkeyDg`C^PRZckl(A9O)g!sW+nhdcXu0VYrcfZQfw+GbdK%Qm%(>nAP z5pqcj2YEHExjc04b;doLXx7P*_jRW-Mlh!iEsvYyj?3RWHdXMK5(a5dJtbYvVlG5u}|gPigIKMQEdfdmyiqwtGk+qPv`5zt3e!E ze_u?eLK{ftFY$K=mT|l7vM_}dUF&sk97h3_XdItcM-sV9mrG@61EQix!1zADG^?*d zZ}7T6S76F5)sl4w;o&%*#EQ`DjSJi&a;=bkYt7qSdeYpm@{iRIn#jx_9a*QUUN>@X ztr)#2EtfOYZqTx?8?#nb>sz!l#kfSsrNO4Ea^xd3*YH$`S-sidmDe^SGZ0v?MOInw zQm<)_?3f;k*V&`xQi~;Z5)X!pWe=?GDkh2kO1@cRk5=PG%8U;a`OAn*MP=9tYN|tH zjNp-Y%cAwUs@~6&R5dLg)d4ScYb>Xa;()yWPg$u2th}rb;X>cM?rWE45#7tU7~fc^ zLS$a4k}tm7moZviI&`!$SJ^6AV8IOQ{BqqiM_60o3iMtkIv{;58HRL=oR$4M$^O3_ zXbDH3GNsg0yX^+SIO8=G@m6s#JX(R1bUbLQ#TF`b?{+8P;L5gNzSg>xDR z2U95X=RKxee|*T97k{@5`Rb_DGxkm)sxA&UIM&Giqu4kMfAc?X(0D2gpU7dzxf3B$ zs+TX9?ou=_wWnZSf0#l%OCIQsxeRBmW-hrLW58|pwU?5q0}?dD;K$^G z=Ti!ZRUupS3B(me#`orEHO&CiEqy$x#_mhRm+_h3N9m-D0Cn{onH(6?65*?F$Tptv z70(RwM8-+RCiY}gbUKfs@wR+el|btj0V+do=gluxU?ytBsKK(u{DWZ;-5|v~C8LS^ zC20kv>6Z8L&Uc3n!A)HJy74;9TC=oNEYWLTZ${RLG3=Q^2CM66&Lplyo~j0ci7k#j ziI0EzgzH?v=;808FpTe zv#bt>RxdBN&YBn?`qVTPOzBF#-_4s50B48TjrqnMO5Ikyn$TKxSFDiAxU_T)SEJM* zncdM?{z5%dO2rDuroV3X7n&N1X}FT!A;{(j4H#ds=6%cZUKXy<>>I z(5N^cd6TBdQOa`6fT7d?6AP7Nn%C5_?CBlM(3sEAmrO8~PBINKSK|@6Fmu(kIq;D# z6(P{escJ9%hDC6$>c9o=)xx#cGZIdd2+>(|cx#a>sNL}L2w8N#Sy6(iN5U`+)}%W# z@2^kBQ>yEH75&DPr6?y{J!r3DMicDDoy$$05KzN8-zI`%#PLw=N{DsdcynjA=7hqx#Wm`*`L;K5%|ATVBu^T|@2gcA>6~BK4AT{eVjn1dc8X72}bH+86+!{I4WoKJ?y=HO=@xf~gjGXw=>Sy$h%EXPvR?2_%lU5J@q zZYhXYe}a6lrqsi1d8Zmp89=ojr<>^kwtZuzRhHjrBU$jxXlorM`GhwvKXTX$o!)z7 zUti7YqN!Kj%L()$x2empwNYSgkm!835o(J%k`rh6P}T`Y(&{@_|_ z+T9y+=bS?b1Usb`3-J9T{>^l?`faCz7P0-R&y1J;R$lX)y+NpAO6Cm1%k*vUg2(9W z#kSEN(WgE|K`|HSMn|d@z%|tCUtSe-cyrA_7sU&cbk?@ytwR^U;Tb?DqZg}G{ZCk9woW6vpL73J~}hS)P}dskq*Z} zfzpkz<~fXonu+r@*&q*D#No>FRB?;cb+JqR9b{mimMrZ7W9b}t3!b{33**_(G*nmT z0Yq{cgGNY)?MfG2o;(hbmRV-0BVaJm9yu~~*Z=@>J~lU93=_(JRpdSPnNg2{i**OJ z2ERqgJm{*7n^ei@j9?z5GzmdkfSyL3}-;KC@{BImL zR-rBPDAozFo+uwfsgz4I&YK3UJOr?HiZ!B{UG%`@Dm8kU{f2GoZmyUF6mmn7a55F_fd>QM)42->zpUJtRAeCh zfVa10BJV(y8dp=Lep^b_%4Lyjq5^zPl#{HU!O`d_spm;Zukpd4r$yk3DQq&aX#>U; zUlHP#5`O4X*Dy0kF6a@h?p0;Ou@~4rS<|#KT)hMh_F()w$viMMbNR;{rPL&3u$8EV z^BYaBrf~+AZAoQ9X_22yuxyD8W~d~cqD)oi6;22Yd9ztvt?pe7&E8K+H$$w@k3R}q znA}tn?SHhRK4cl#{II+NLC?4>RH34{&RSuAU#nrcXG0aT)8B#mR+pVKI9S&A5e}dt z2)}SpApk0uby8-OOY-6BtULzJZ>$FJPfOOjtwFD<4G9o4eGU_)t00;2#Yv72akR$4 zOJnO>TS5$FLlF=`j>EHF^{+KHFm7lWl@80Hw)I&*F@ps6jTa*VzAFgFD(pqH*eaJ{ zbZ=S!++ttn!$)(w*FoQU1cCFH9}7y@RvI(3smoakM@D= z9fb|pjAM%-6^aXDSUW$fy30OOzgDOpVsMTK;^*H=`@$ag0OcY!HuyEEB18bGYl}fY z)up;fg%f=Ra?{kRrF@vMvS+$XXL>zThcEgxoKYkCzs@Q7GY|-s$_kyTaffE!2_RB< z50Xi@bcPn*y8gw6FfK6qHJXAReh<>OF9uf7SobV+o1dE~bbB<+C=9-0u`PA_3wsk1 z5#Dikd{-@8;`EI9{5n6(Cd^qv^AOkuXb(&gOsKoDCbV0*vXwSbq%m3O^Lm%;wURmt zW{u!ch$8*6e?{z{djWLTK}1AYWh@I?8dI?oF_0e~iu+1zBd`{CE-N0iZkb<}r+C6fNZNn>WvneOtQmpMbjBOustY;p=LpnWcpZ7oTo0X&h?BQC_|>D_=C;Z zpuh8Xw+6=2j!dIw#A^)gN^}BHMQ-V_yn+Jaf$o)26&CK_YPX8t;x(n6zAz^{+uEt? z)d>dY-|n)V^&Yuue?Rmn&uP5N(>z0a-7K7dt;%1nG#NA4kd6O?&!ri9n3|Ui2GLr^ zEOPMR9(eP)Cl@h0%Cjl@V5rc1{el5 z>rO!!#O;((vGF7{_@Pvgp4FL=iHgK)3zbQgpU6y6q$D*BMPD9sI6ymHz%FSG`Yun? z)o+M5bF;o$GZ@R$JVc%ikeR}t7_+%* zBR@80=NRB^2Z(@JV0jGP9^y8Xhs1m%RJ+nJkQgUhXK=nXMP$u4YLreDz77=k0(+eE z+thXhl_0&>XxY-{IrA*?9plW%sR@Ww(|kEs(<*EzHuC4p&Giu-v9_ObC>N>mG*1qAU5WQy6-4&ky z$JuuWG?i_8kEkFh;sg+tW}#P+-cbZ8(wlT?(m{F+h=2@LX+c^50qF_7Lo5^l=_T|i zy+=wCNC?(va20$!BEREZ- z6q0quFoT;Y{+&i=`kdBA348d=Ow$`sY#pG56-G-Z-1i z%s0M@gKx+0S+p)DA*CcYi>HShnkj{<8Y$JIRe*D6#iX~k6iZ;cbvymxW85Dz^jcS8 z$@DbX)@r2t>kn+O7@g8Wjv4?`#FMW90EC>~XQ^nge%JU#CR{Ya)ku8JEVA0ML3Qt6 zV^!yu)*;MX7Fm%Pla1K}&EL_%g^iSB#45n`6s5t=2SOYI>78{uXl}z41DnlzE?>b-g5i4{ z%-#?zJC(;n*itfL(v>H;NDWL_JYPe&xyKqwvs-i?mv|#N<2)zP1^8k0xL@YsdGadsa)53V$bYj#I zQKJD=~>q}#^! zaBMlg_S}i3R40Uefa(utE;oaZ)lrw688@N7@x>vvp0?w0v!N zmnS^-K0UxIVle)0o|aa`cg?hYx53v`RnnHT9Xj+lG?jN_+?ex?RsT>UrAwu+ zZiQTY^>oRL#Vb?d5M_T*!HmCpOHwiAh>VE^BCWZ}lCoVXJav*$S8#&_l9dmrMczGlpZA8oSnm^Ge2x)cG7s8D=xmXWBpp0hFi)SiEy7i2EZ6-ZG^Yh z1%vSbH=XuJ?bpM>vlE1NQoGsQ*YLsu?c&wMwZJ{YCt0!UTpe6a*SCfn${oF;!K73k ziL7G=K9i`jM(|j&aj~2Eba7vj$ID6KB!W1tDFTdy7!_ku1utDZjgdAl7T*Lwfk9^j zkM(`aK5tFp5(y1X#~qc3nrjUYc`J6Yz*I&Gf~b>jhh6)TVkO$*S24kI=P9adz6c|y zSf|2%i*|q%cs`AGNOE9mx<)MP5O98mPu3HAS{G*@I?G$E_a;(vkpTe+wK&ls3=9;7 zQ*VVi%u3eOi}v_2miTF;ts*?M)KS;$nbX58EvUabXw{A?i+HQ=NfwwX*QDVTSc%1p ztpFITXtCxgA7B=<8BPV@vyCTb?tj4F(~u-g6%MM3C*B4lGaWq7;a`5|gv*tj+ZYfC zNV)w9^3CdrA#ufYFY8jJ@fOfu+o@budDnZ5$e0@hsg~aSSz7Iy0ni{2OPCz=_{$!r z-Th*{d~;(}S5!>A_^&Wn_vV-Mm^;;}i3>X(wm%3uR6X8$%dqGq#)Lzra(8n5(huUj1H@k!fNWV*|guI~J z$DQSFN$-8a4?7CCHgDl4tSpYoQrZ3hQ~>@HfNuH7DNFA~%KW21xl4Wl$*OQ#+xZ!K zE2;^=D{EEUZIzAX`Wvsl!XAQ?>b$xRnEDBpi?~&05%Km`#2hOZ3sr{H znCsFw=sflj`70XJ)O-j1;#E@6p4v^hy8z}SK#fML-b0TTAvmD8Xv{sfvoMpSmZmI` z$~uW;3u$a;8%Z+i8@niS_wA#KE>00;RC>)*R!5cy665Wj4X7YL=UVX7v1vax)&neu>Q3q>-ob^ zRh`}4rnyhqjqBB@bmtgEi|~R`w$cYzyIbZLHc_*5Toh`*rDkuUe6Xt87~+(5B8#Mi zvlFvqaL%!MIii*T^~D-53R2Vw6^92fj(jh0b`Bv9QYSMcgfP4x+kXz3x!>)-vW^*K9@B+=`e zq%^yjTtg=fGk_w=o2H@>Dt9deD@;2~Z= z$chm#3IOz^3bBERURdQM+IGeNXiY;pKz5F$TSLZYNbf%m>BA2_%ScT;Y31s@^%%k8 zs-gT75@#$m94ZQ0A3dsApMu$z?H>oKsBHvHl|r>k zi+~6*>4SmOht)vT3lms8FNMkF^zXN8#~0G4u!}&6_>LFhUsA|(w^TY^dPpJB;iKoz zfc+U(%TDSPfmZj2Bea+AaUvhqV0_46ql+y|E91?$nZW(8+_DDl39j-yxW+Mf?C5+6 z)*9kWOW<$<#Yl=TsxLzIS$IWqmL_h-it<#>%s7D$!{$7qT4(?$S7vusVZEMKp-`hj zj-wU*zw5$pBx<(?g8$T7H4mMqH~vjg{^|#~deXsnI@a{N7rv(b<#)^^o0-%jMYOGb zmOPEQz9;+mF-y&X>T~aHHqk=&S0C7dle9OAc(C^&fi__PG$bQ%QPFu{xiuDg_;67s zzkZ~uH^qVFy!mn0_kukD0%i=*N)z)261AfxfB{Hv?hF0X0RPWU(u@F8tVY~%cl&PR zanGa&KSEzwCQ2NYed3@JgA=vlmX_zDw@H$*Vsyn|l&w7>P|xl8oO3^x0~4p;rwe#=>@6(3vDWc>X?EW? zGRqB<>mJe2LZL&RqG##CzB4?)$0UvceanEf@3E)@7f7%pm!cN|kGzCaU-yE3bH14&7;ziafOOvOL3r%F4R6xfnx~ZG;r{HC^^3UfLjLk8dmNYAa9ZAzQh%k z+%xipDvCTRmUhe_Agw35!B!TSd?Rwum z`1-SF8jlauE>d_)_;_nL`Hs`JvMqf7esbC2CyxQ(GKbpdfG2PoRV#eA5#VFmiO<$c zm~^-KPKti|F^T1|h8B6%ggF~3vY>#a?jL8tzy9&SG@vy0Tga3W`5zzm<0e`Do`R+F zCSTxt{RF0GjhrmDmXZ?vON$AGEc~g@`CUWb&zm}7q{|5-aqB%lG}Znr-qNg;ODHG= z)Gxts;?z&1!r^2wOi$0x4yJ#l$DeC!Hp*@Px!922#Oa?b_TTLcBA4^$VwFl08$*R%C?=|kv%zf+cN+5D1bpy zlVp^BEGt(~i!Qlw|Guxk{^c2vwh1TvJI8)S+`5DuG?K`>l6NJ+1 zSEv6DH-ApF|9!18ULaN&3nDi7|3vYAv`asI<7`WSx4I8Mo%-$_|BH_yV<#f!h&VEl z{xv=N8M`$r0N!%3E&TSszw-a~TRF!Q5vwr|hW*5e`0304L?;R#0lcN9cK_|aBq@hh zc3JnPs+3gkU$SXu6rzprNWc92d`atXqQ;n5@zehuamx70D}HM&yaIouCtxICu>ICD zZB7wr{VHU!4{o}~d}Z^Mv03gSy?Wf5Wk~hq0`V&b#Yd3DT#4Hxm_5AHb&^@*(;m!E z_0QntuXp*_9ZJ^8_8D#oY*e9}qT+!jH#{ch`B3)B;XXhY=knj_^}uzKECX5RcSV%< zVpD8%D?+|qe%T?y;*3beHr%ewDwE0c1;$^P@;tUuG=*z*XOe50o*Nln{}@oh=Rz(q zc%5XuFpDutht`Akv}r<6#6?fp+}&V3Zqy%KMlv}kwmol9Unh3V0{@FO>VK*b<*o7p ze>``%qsUQikB}oLksrY_v{yx0(O|xP`9qO# zv8LpxqLpu^atj<0^w6%3d#wv@X4d&)>w)y=5eGXtiSJW_|Epop zgvzne#@MSyAk~zzuqI1~!))rS_(5i2Lsp&H!SRxdXBA5{j05n(h7GDTXzmUUeNCwX zjvC_RWW?Vc*JWonlv&ru@OcV-25UZc@gMcPcKs#rj8VzO{+BPxTn8lN>LqXWsQ+~v z-zjvt?)7OgT2iFCo#bZ5mqU1#k1di4ZuO>HJCoFmzN9rm{g_z`421c`^bYw+*ukK` zQ)+fzAr&NcVdw$kXnzl_$lc_jFcPWD73Ai+d_Qrs{-Z|Qoe9yE_=H`vz2cIWlWg$P zRdTc8WF*K{27hoP(wi%m?e>S^uO(6q9C{7}J6XtIHl?~zt|d5PGE|sPce?bky9za` z3{Z|ax)jx4@qGVbNv>&Kzwq&8`0AHpa^o%HbbX{p+>VF#ec7ulX1Rjt?nb`L>2iO1_A_x^qgaQJi5&V zzItxybFNW7G~wOvC&eU?@8T9A(#hqgO0+6G?j!KfYGU$WA1xnYJ#E*^iq*N?BSvTZ z-LwsD52w?{k_ixkT1Ao3`fz-1$gu1@%&>F}Z*g(tc*6OXb4%9*=BuU^ZHb5per@Ty zWxZR|EK%ozY{{BC7mo4xaTLb}lcCOFneW|g=62I4dYVOu1B#4D#J`@mr46;D3ydB^ z#f|Qky{-Ow#&&+VSax9npNl7FY%@bM(r|M_40yDGmr>Z(XgBxp;#J}DhEG)>|D2x@ zMMaC|X3{@YwZOfYvBH?)KX#iL2e?vt@r}utL%rNvi)RYRfL0Y-M9YJ+dNex5Jq7Dz zc!+1p$AD?Dvu0Wr@Xj8JfM9cAx<-?`YVZ|>W$&_$-F>6*mbwk1TgC^OTqpO=-6&() z6pkSZLZBd76WNS=3~1M?YrWcRll&&xIzxWYvf0)3T-VW~%jHOSV2%j|p|2KQlH$Hy z4*`HL=w-|HR)Tl$@QU{$zvjcfs!Y#NigEg|AAN1!o8`j|RaB$-W?4~uHI{e#LVtNr_me8yS){FFu5wc1Wh=lGE%g zla%4w;wr+gIfJrlnJn|AS=aq!Z4AB|rIMP(^0lz7%DyO9!Gdv~^$ zM|W$m<23ECN37GGZs90KcTC`U`jx}80FV}#mI7whUB?JyrDsGxbVp6`h0<OyaoKaGPhT0Ur}qVeXZ;TCy)%udlnWlQH+x|yYW_^`tHdF_~W@tjG!O;q6GC1(EI6%eM=(+Wyh@Y@090| z#dW258?rPaU2h&$oLa!HAJfx0j#Eu-N=#Qqf<&%JMRxT$*e2>aJ==RiLBD{$SZ`3Q zF(IEyM!U}K^vxqjJDQN427lRKKY$r?pAz+~JWp%!A#rk8M%`dMFTI5p&bdRIZf!x$ zpt#`prR$|$A6ujoy!3r$ES7r=K6hkh zSUNlJOr_TXokRisY(nGPiAmC4E1429AD23MZ zLlaH@>v6D6Vu6c3OKQfOC(`YGHld3rpgFSGJv>bnmkmMO;Sj7abZTqc$~CY1G)2`e zZ9qbtnm(6}yUL*$))1HPYxxXWVuN8fMcP0e3B~SHemjs!O$)SW-JuD20M%tKZ1x${ zLK5F%!SaM#{Z6pKM!ao65)7i{dARQ`tfwB$F15bZ#;lQeN*UAXv&B5JD@e20?BlGx zgrhKNP`iJiX+ixMC`W^07CF>-H2at#YMqR9nmR;`jGwKC~(G;m`0$7b+ z8g{NN{rYz3J-3Q<-5Ju7$>DQ6L2sT$Q%c0uPVJOGlC}@Py;fcuZGIw&AXc_Yi?>6z z@2Ju?!JKKi0XPBfT9hdbLVH}!r7k0{?TYBtk|{Zqmleitl#O!2YqGY6=44vNl@(By z=eHOMk7CP!cHeeUS`QkRgJuwxCZVu7FB#5_)e6+Nc#Gzm-mzh&P7mc~j z{#I;NQeDC03!X)_$t`4NZhUvj@T^gH)o0g`92=;6+dE-gs6W_2FxfKi08T~j4X`V` zvyRpjFeu1%B!w|vr6x9TSt!>xwWxdg(VTnP)S2uR!a8ic)=u~c{A4o_XE9QXfGh?s zcl3i_aXsjuZoo_!O$PX9d^k!E0S6mSEz`JE^lzV@wuT)gLAbd$JRbY=b-w93yFtbH zLoMUy)(QIVedEHhPN2bd&sfu;nL&&K|3xm~Pvzz*!m#VpMI)|?u!gUx^X$Dx#~U+b z8T9R#ciV=_Nc8m6)1l7qURw@Jz8#n2$O#KGBgr|Na0!$UOp(JF2D%=5=^Sf&*sDtp z_gpK~>NrWQ8Y!=wCUr7-+Q4Ohj9#r?9(5SqV#!p;_bwj~^lfZx^!a$%njR5^Quwz_ z#4xc-Ntr-m~Ga;}uWW3d05nnH&f{agkP~w5&HONDHlUdbt-aZng zD#ia*UCI7cyyyt~ZZxXVN--04VlWL$V<8F%AK@y%Y)Rsj3xn-uQFm}Ql5 zm|*{_O)?g-1wl;h(%))N!OIkB2d3dVTw+%wbh%p0`<`Knk(qgqrhMjU2)en z6z%K+8Q46FLjWb5cJy$Cu)`N=4?-%BD2XYcrtq!Eh{+k7%BGuB_yFZZGkyO_jL%E6 z=vK7(t4A6iI6vCR2Qy_oy*39QUi4}x;AI|tpmRanj$3#P)uW+YpUB6iL<}TXqj}oB zSXmLP6NzxpesOeUkE^yVmSJIxiyzpPR6s`@O|SO@E$TNw>JyqGK|DFgFTCL*JJ5bp z!~-sTfMIZ*o%sE^5|D<<3L1uEt=_m^Vc_Jr8O)T}%a7ELuFUVaX5>y3^?b&#+c_Q| zdF-cG{6P7$s*}iZeB)7TtHIpkei`CKOaJha?#Dh-c=C-)#WD7G{`Twt50m}Jd8eCj zq{_z2Ki)qVy=#8XQ<`CV+9`Eu$xejsc7vaPA+g5||E_HH%9R*KRD;B&0LGy!2#yW*Ka$N^pI{|ce1fa|LUrBizT~4M{~9M zb>VAw^shI&cf~AGP7TGD5xiZ@B(nR$_Hs0-DLXo&J!5rPy(B@d=e=vhF_5u74YO8v zrH+V0Hc@_N8p_v{oBP{2`@Y_C9dR9SgIdgCjvcaO%dzG;^JfXTz`Uq2`e>Inv}7$5 z|E{8|mu;X~Z8_be?`0e{szrV{e#MZ8X*k|1{qXHuy=Hlq5c^!WlV!Dl^n(Hr4y+ z*|LfJfyqFp?f_FjR{M*&tDiTLRs+OZrwt8@OeVFZQOeBD5Xh zc7OEs?hDh#wJ6rF;lZq0`NrRFPB^T=yqueP_yd&c6X~Z-_k1E{#+&coW1jX(F;`s- zilp7@08R0AWKiv*wJu*pUKPn$li&i58$Ml22R>(%FO`dX9SS!a>~9&Rc8j z+6OsdYzp-~)`7I?w#ZgY2@CJ{`c{`k{g~PIu9`yQnmjS6h1ChpzSpx6lU7J( zS#z9Bs>9UKBE9r+6QvQD%u6_`U(LM+{~II-?UtfExsG;^_O6XdACxj-E`Ml@ZS@?j zxwDblhml-<$~2Th?QQ6%xaxGY9afN@KD#Hxu2=RYE@gx=7Chfv@*&=|HxEwY|JZ*k>L$huFOaa zf4d@B4^(yG^D#sdp^bSeE~m3@o`2@0 z=X$IE)EL=qI9i6UUC}FaV(?7q11}$3>_+Q&4H7+LI~B6Ka0x6XWX)lHwA}P|Qs^xKC1zz#d7Vv&Gxg$GO>k5&-(;8eF z5}HPGYW>C5`B$gFot~!8wx}jUBo`Mg8jkVb0&n!JJhLI%``UfxS3Gndr9DpBwLYV5>1%YwUg? zV$-NmpqM2>+!lW~43m{l@tZs>4t}KwzGA|R<@X*WOY84_s!dcp5|jpex};gZf=SI) zdECkpapu}1WFuy?XiN-6MG1R{wbl8m>TVbPltlK>iNp+2zs#Ezp>3+$wZHEa*KSqT zG$mLXQ=2a-t=Kl;SZ5Po-xy5EpIBZvsCK6cj?@*hf6Q)>8&o+QTtyZCn(M3AD1y@^ z{!@(!3t4^Q!^1d)#x>z03woqF5nP?#HVmtMf~>7F0}Q{$%;XP!=W6d{65D;vEPN)0~RDl5qV}vl9Nf&6rHYyofUR z5jH?*@l)(8V*CUqX(M+-0$Z9wo4k%Tn$EX9Z7o(jKz=da?h|BGn?m2Z1|(_E8zu$i zy)=T;_^MKF3NvT+*|Z?zL9}MVRe20o9DKj={May|#9thGq(y5zFCbJkkS(_pPeuOeTx|J+CL&;fdph^%1 z(G`rFe|iGuASPa5-ygR?uUXB#Zt(#w=uk?3~~mgdDbX?fUn47x2(@0WPmFNUnB>}^r_In zm8?P?J6R^ivO2e#m-k!!cJP_|-hBL3dAlDCnX;7yalUva9xwL0`QwUPst1}}4tksl z=wkz^#8F&08`N6o=tkz=vVO1z;?{O;SSuaJx%#8qq(3P)|1^xtc0V)69ZBQ#A)Lva zui3-vR27r;p&bXO-dm)}&Y3cq``kU#o-I$rA3wV2peDu#)xB9_Qs_DNy05h`b2Yux zs-P}cKf1KAYW!+v?c!Qv6fcSC)b@I^o<-w2gnT75=W28eJF3#D9~n2Z>kZTDjCBpb zr9Re;>oIv#TVt~GU@zpucnzyb?z+EMUJb!gnLf*w$rL7j#f>wvRe<}zuT)<#;zfM<(zio}%gSntng$z!1?KntnsMG`!A^&k(q|f#-1A`j`@8)v zPZpEL8D~B&w|ck8272e&HL6YuYvp&>Ad;wcql9gR2T@)XC?8-FWV%|V4&2o>OFgK_ zxM_cHSnp;fFJ5lDeZsrX3gHx#wNF$ZBg+y5=4rl-Jhw&mjTpz-abiQ5W#LX!ipmeb z12d*jAA`zqZ;;lssi>Qt&?%eRHgA~7jFkMGSmcAb?ZW1@)7X5YD2v5Rf|UJ!e3!ZWsV;3LZ^m^~h{=t(-{b3h- z_iPlx)Sk-H*5+IomsQ$l(Eoj8o}_j;%tM~C1m5GgR2sdaazVRP4taV@znSTxXU)0_ zr6X=4b0X@I_d?rKAzkKI1+|7nS8fvz-$q-f zDBkUkIiChg3=yW#errDN8F8Uh>(!n4=N;_UkJ0iaev-(TU9#2Cs>EH9Y}@PtzAJ@P zFPY}-`;5KD{9^F~xNSfB0l|TorLXwg>#5Ks8F;tGZoy72j$87uT}?1e)Mvl_{oau;@4x6_QX;-2Lz{ke9c$FPB-KPTR=G$CqgCKdMp z2qw<$DPB*Z5@`{q>ipH*8;#PJqVr^Qo~+O9=`Q%g+V_X`A3$5$OI{&%S5n%Em&mNE|Qvc&x|VZ z(}V4RY_XIxv}y|j!5+j8HyB(L&2M`F?WoUf=|`1-2W8blMQG>2UzXF*gyG=>x-~Ji z>K*QY1L38CB{kjs6owKjXuL&CWTpB1F$+X>n%^zDO(;@Vf!btmSs1l=~6Hp zqJ#}@$-Xk58-CF803!ik-AW~NM2IuJA&&#*!1;W5Vt%c+-;IgbLU@`*zS5F&eZcV) z@zVXDT)*>UycPmIzoJv)-A~2ebZeWV$+$ZV_?wU+x`8&D1LS^jtxb=s?~xCEHROmE zH$jV^*jor0(U1}x2tM{oe)XJ9$iFYuxn|sB?99t7y5b0{CxolUsNmES2_s^TtSeRPP??1gm^qBlfoJ$;YClY?&6hHetaB&%{m|XS zHX$Lc`!8KnzCjZ6zFv2=Vp^L;CAL@Zn66C@Jqmz(LlWV6YSAHYpH*yPc&g9#$9dL`0;y|yZa?U zgpwdhco03N1oTpAnmxxkHA?0WkEk{|gH8kMfW{K2iw=Su&!G`PmvFeofm05I#FGNEPh` znh2zK4z75$`d2|}hxKt6h^vLZ=V-dNtYi|xcT640X{fxme9E;bg%lA{+d`bioZ3QV2n#^YU`zTbdbX0{othih&FN%-e-V?9ML+Ip{X7}2?3#A?TcAxY;yrL z?^DTa4i=RF3l(i$B*&p%^|d;gRc-scoHvtJm@-fm%Q}l6;@D{YL^1BNo0>}qru#Ob zMolt79#*-o_~R`#Kk*2-j2_p2KDt8=F7HC0%D6EEt4M7V0O6l4je-z~cJWGjwTXwZCDoz=CQn3_R`4B%uLygRPCCXwJjcC1! zGUI?733XW)yI6B%9c&;2wj(tHir->%Hdhpyc8}KDFP)XG2GT~Y&9W0RR2wNKWAul9 zGGtjdoT>2D%R_lx97Wv}6FHGw*FLhg&P8FYCG{q>yjM`d0dLndzhGO-hd5X1E>CmC zbBQY-w-TGef>PhBM4I&2iGRT`TpT*unmcM(<-Uc(=yulz<%HGI%%RB8;)Mp!-~0C=O^&EA56hJ1379QzW6IEO-O_mVeNF(10{x=+ZAEvgApMVrv3pJ5%a>S_Y-yY zED9bfyp+|pOlUg-6)~CZNWS#!cjW94-|>Hc;p8&Bq$WArw((K%L+{$ymiw)C#I@wF zqe#qrTN1n*T-N<{b}e*r>rB`z0~+@Y{pjj(G6U>ElRvdCL?zwKmJ?$6*$~x^*iLPh z*(l3mMaMr{tU}pen{%o+BL>}>rgE1w#h9&d@l4Qwp9syIK9o>&oi(2@kvGSjcso~c z?8)X{#f(sI^cpR6M%(~}5xSuWs+FY2azeQpqM3D{`=Zt3=4M#*@$rK%R>@va3-LsVy{*~b7 zLuN2?t~95-;~?;Lt>ThGy>22;kzYc#wsu`;^e-4;4uKFI;C`#scHz~J8U)57kAKVp z;8)t<+P2{D;H;4sKInjt57`}Tz7hdzZgNI;f#Ax2*RVesy{p~i7ZR{pje@M35ip;G z4VfRMG0srUV;3j_JepDRcS(5!E827;#rlYNn0U>e+%8R?o$T zhwbU#a*0G4odYm*nrHY6jEKMPSFA^q;nLz4EN+!F=lGX%;6RT(1>3dXb-cEW9FG=3 zggti-xIO#A{b58~0jMZp#=!>VORn}Uzz_&LP*<-RC~w@#K}@BwOe-WlnkMH$c%gZ| zcng$e*ssvBWi05uhl6un0`CJA*L}?@6 ztBDmf2p8};m%(^_MO|DLNyP!WgVnv1@35T+~KG z4t}@lOTIaE^DFPVN~p|2F6#0W-6Fa8s{OWa+cL=>eMBY^@1LlV^-xVzg{-D9 zS&5$K5lHk6V%0)5rFaH@Sq6-`F{;hm{(9Fst83lCV}hk%C8x1FqpqI$nm1OOjUF#o z(UieSOS}M48|?Q_HUg|S#~9Hb*ATgWP<+r946z4ucO#vSp!->SO>!TM$8aJvMP?Vcak{{;u+5c?CS4Ji zcufP>!6SbYK80X18xL8}&&-4ifryf_aYdNUvW=i|1kmrF61sw5JuhX&z=ClKmJu@i zO!Sbszs4;!uT`db3Z%=m*CFWqe^>BAs!8yh&3MfmoCTt=T3B_1)2*iMLSzHzIB7qm zO&t_Ob#v?bW)PK!ipTpNdiNKIHml>UMr9D=`~;@+1h6|@@pfHZca`}cuyXj4pOjGncZrFsh7VK zoIF>QwP~-aHzVi0f*{Pg7u4m1tpTifBC`wbE$D?SCk7xkt8ym`)DSsAFABi*Bf?!A zI)l5`0a=2vqs%zEd9Y?FVIFsBpUH4K>F&_-J`a9za`9~cDZJ>2FtpZl^t_2A1`OwR z=W(mC--(57e39BMMO5v%Mma?tInVJ11 z;&e2Oi`U3yB;5^TdzYW4bt}9(urcM^wP5;bb?7_iXF>i;#GOwNnbTnTNj}KHeOkmD z@=KB`Qs^oj=5tQp?9u&d4|>{(|$e9GXn9aC!~i6Mow4Z^h?Nv7@d>8T}=VV9j8eY!qNbxP-I+~)dBICM8K zi+pQgc*$8?6?`Qy%qL=GJ#eymRV2&1C72pKnaKv$-&2WHEXcWMex4|!Ic7PXMHRS( zvKU%Q;aK#f?0Ru@h;Kn=$Vh{Ukqj$Df)boPJU+DTff1bB z8)S*L%8Xjam?FI^M9(`fetv+EvM?_I1?wSQl~=m28cnjQL;kR9*K55wat1rb42>%rj37Bzf!_siy3ZKP{1Z=2|4LPjx% zT7S1;<-v=QC#%cg6?=L6UDe+bp8Qz|lw&17^W2-~Zo3H9-D--i8tz0dNjyB2A{#JT zu9v)^xzI3sYdK)rqwlUZGI&a-tgWJ9W7yS5R_TLr&<)Djh~XA6zR_$&95E;O-c@xf zM`lxfn5}X+jPiC_}3~e)zmIR6eN7EQR&g$eo$2V>8KsMjh>fY_@b#wyC zf3n&?JS;BJc|upBm8T-N|CCjp+A%ORuu6maTF{#Pu!BHMpnAGTqG*L8*?wkNqp;G2 z{Dcy-5e6cecVpDSVYU@)%D<3dp3udc{HRk{*|gkNn)s5!{D#VlK&wVNv_L0aK)5=fScGED!KpKo!czv;Q+WhTo)58Ve!_)7a~|QA{l$?< zFix*ypVJdVxb_Q?NY*`KMNJsamEOKPU=WcUmvm+B>VZ%ZGY=pZ^dP)oHbONn0jOUZ z%f!FP3b=;J!IiKEt=c2Q8i*`=76)}Kdymv=hx3wiS*K-RYR4v<_XJ~bM&^)vMn!06 z5PpphuCnOv6Y5B~HzA{Isbi2ndyDZw@Uc(*{<%s~>LJ)X&A zYhPRxPHj7OfBlJf2$`N;`LfzH{^J$TKl`0Hmht0~Et9(3(fMz*!ZRBNBTIYo7 z&6Ypa%4kaIy)pX032|CE-FA&8IOJX{Rfgc@`J=-J>e!Glg7kbwaHW&oKCN2b5ZMYl zVRJ(vJuaZKZ4=}LbyHL!{^9HE&8q7a16SgCXt!Bx!p2X{Dzx2ZTAVlM?1A=iMR6xP zPE&%bO_jL4Cc6dU>P5k-_soeZ=|XF0cmXR3?4m>*Tg_os3GeXRBblKyqzD#LTt#qF zH}fdfI;>^J+MsRCY|M3eyjDFhc%_HaIF?nu<5x$vwE z@xej$;6M8R+NlS~!Vr8pk-s-26?(SbpW_DMA3Fwfg=g%7N!*IU@C(||q(1y_->%A# zWqui65+bUKTul0eWf~vC-6IE|YZ+3#fg+1@&OvKDQqPkOWtL$7Jcj|VRw3G&dGOf0 z=%Wh{A)gl0GIm$ixX6_3NaD>0ad5VGQVVs0^65+jx=1UQkun;U8>LuCB=M82dy4WD zLzcJCu!r_RXw!|>vU#;_dmm9@)kRYu`7~!D(KoIQ$9jhd;|bGBQ|<9XQi#vBUDzEs zlYFE(Vw4yq)4OfymCrzw&O;b;obx>YDtk!nJ?XCquRsVdnBnq%wT;7&E7){#*!~X< z<6y^bOitCyKsK=t1+i>@lb!?iOeU!@K8#JDr~9b`5whpVS_`RHx;gx@eW+dZKtc9a zK37oJomD&`c3$*FllhJ1z>o|($;gM?y{n}i!r0<`5g|c7_NoSNyZW|4 z_}?1iQj44UB=^1^o2yvwC<`mPPF9KAHf3bzN3d_VKByEo2PoYqL;(h_8=zKuRe#?y z(nA{Vtvsc5zzMY4f)H>DD>F!X^H1ulWyy;i_U42aSRd2`u zt=P@k%QVk zI(K?vyt}B)JmQn%TQV7`h$IuQyKx42RIdYX2#k15Kc0X=p31(PpX~j|J%%*OUsT0k zvgzBcEJK`GU#PCwLXtsvaV2@?d(MfU@;^iR@XtJIS&>IRJ?|P~&?XOVEK5C5uY|Xe zg63I7AXm6^B6~dMx8s}rqbJ7#=&K6(Au`LVhbs5bnKF-?bM6;}1uf|g8?a=!zpm2C zHUNKWV8`8GM&rb#(tC85p-Cr}XPtA8Xn2c#C`>?2d+>e?IcQzw4C~&}I_lOitI*wd zoXg!W6dQ+zvv?VGWKoWH(mU+PTVTI~@O964bC7#MQZjLMU8-Y?%Bu|p_jAMgeBQJ@ zA}3DW3$|VTxQiFWR^Km@U4d$pm|C~CaHb99B4quwh3;;vz>{^ zSgrepoi$6b?|?ZO=yHa-$N3;;Z_>ubvUI4f5a5dUmSI!vJPk+mYb_!~@J^4f6Dxd5 zSuBf8CS|j3F2`dyYcY^UO}hu#@~IPZTaoF^J-;iVH-5;6`~&L#FTdbmKSg|D;aapK zVr}Rndv1vVvocfDip(FkYKzPuM>z9DelUaz#l?p}v1g>R*{hSlcT`NN;QrvOdB}csL%O^2(MngHQ&a!w z@TN(YsOGQ*G_fqHF-XU|5lD*-naUFW5c52!$>lBZ)H}T)#2?>APT`#I zac5T4YoWwnNnKr!eP@{kwgOMy5w-_K_?*3@ras6*TQJ`DVtv&0EFE$ytoH_a&77V) zkU)I5Jk(6iMI8NL>mc+Km}D~hrQ&8zL91Umo`xuzzf}47*bk3NNd#i^9VYv2JMU?v zqkKc9z(aaI59_L)5o%K^FscGGT_UUMbuat-=%k&2M+;K^J*MWLffy~{(?32VuAEoC zzaT}WY)z>13rAgkYkmO;t)p^2i~q=%A4+QY#CJRmi$LSKm!OFxiPgRH zS#h|4$w_IivL{Uvk+8nXy8i9ev_pN))h>C#^iRrhuj+5jbj=Y+yVt1#@iXi8>+s7^ zd(H1!xC@_#1os?bg6TLE#TC`xEazU=1d&J|FsG`_eP}rME5xE;8MrwwvnGi38`4J) zZz14;2!1gp}J& zWm39#w@4?p`Y%I^zTco+HU~-LB=^~ngx`UjzqQL*x*uWwKggNpJrcxr10;S3%TMlF z8D6(;G4wRPPHPRW+WghtoDb74<%~b>L1^DJ-RYdIJm%_mo6rR^amH-E^p&OSZvK?3 zP-(II*yA?a=Jb5tH6T#_ac5lj-l1JTN4@rap*)vx>W`U4?k{F>^?|lspMQa=mp-XI zZuHDz%Za?c{^)6Fn>D#kMzD1}UeyGyuM4pMuQVv|0 z6jiD1PZ;pnTR2`ubO=ML{1Ti3IRxbIC93TO?y^*J`ZH3q#w8A%p)ObvTUIf-nKQMu z)2hqoBU)ces0w$y=JfC7I2-ot;M1;isiS02iBZ0gvW5Isr7Mm1I8Ug((sleu?+#sm z_&;U&2QvRp+mxnU$h^&l0gLA%jn9M<7r%<#&4|{Z8x{F&wV)bMkOH+Z6f-*8l)Q~g&`aik=CKRYle{SP#WoOq`Mh9zJrRdqVIFP zzwaNenPKLfz3)}`T6^zx%&fbU8@3)|NvSm?88SDM;Nf>zkI*-lE0k<<+4+hg_#(3> zE_sN_+q~bh{P={0*Y+>YXo~LY{n1n=mj7j74h*Ys?-&n47U;tcan9+B9yyeEZ~F3s zw?>0w{p_~YJ63J5qz>v6XA$(pcO?)V6x^)4RO&@ywnu#Gn62dma{L107E!hp#x6Ol z;pfc$6rw;47EsZ6Q4oRkKU6d(@5dk=7HMNWNOLIS@a7szvNw_eBC;&q+n$%Ho?k%~ zi^X82w&Wb137YW06B9SY^Um;uOW2EXnPk^@;*IUVNY@}UdTBWy#=(BvV&l3VGV+`4 z)2$RO{(VmbA<%}TO%5;9Y2m*-i8*HRyr3Sx5Y@K_m;Xog{`yM58N<5!s!uGQKcy(> zQgF%#C5pA!YA9Z&_-3x5pjT}z9#lMyI?K1((!-BZbj^gGefJyxsDLB=c7IOGOMg;#S$1@alz`@R z-MfE+{l|WN5xh16iDlZ=BGz`1_%=Pah_4wFG(tp2Yn-I*bV&mZP<$Kh%zDMIe(B?tAh`y>1kN(!jJtpEj`VK3zkd3+N&olH1jf!kk)0`WOxWda2PU44Xh5bXSCn#XDz^)=^7qG10^7J4Giy*vPq!@aE+YiD(p zNPZTqpv3ED_v1A8+5i7x=mJzQ*`zCre>C4Z*+T+WaIM`lewGo?Ess6Xm~wh80SDOd z12vmMf?Ny9QIjQ&!M^UGEbYs1-Ge+3VdjDM(#R#%eEZe@X*6oR*&6fz0?oJl>~`Wm zbSVi5&oIvde<&6si%j|GAbxC*`#kZ-on@WTTgyxU=9xc?YUU4-&DxMrSamqSnZbilVIVBCJ5^{)4W z_4`W*hZixU_opfep-lg{|Eu1<#UL2LIK}Ez>H!`C<&dY(Pyx*LIu9@ZxWjBOdTaSn zJ-_K-lx96hxXSH`^@B+Q7WT7X{Bp5?I6(4=FCm4eRiK}q#^L_^(G$@px);EzPG<-^ zz`E{8K%gkoFxsq5aXPVY(Es*_R`q0ieED(HnU)kWO&IHWp4iCbfPRwKb3c?vpyI~; zNm|;psE*80Vl|%sK3Ymjz$=4>uRQpp=j8~_q`}Fs<_;sZes&1DErWhuRm81Ze%;!g z=i1wgIGYaNVBH{n_UQiu)*JPD!0j=K9E9 z^k<(qY>M|A-#(1t{`B$BiT_lAtAw{7yA`0H>dk4WB)I+F_yp9{SK+PKe@H6E^t|Qw zk#3n)R{z^bVX_TsSZ9*?fYi!$x~9p#uDG>c=1A&JS|?cYqC8&_*LuMloZ$wC$!;7W zG3Q>jpf<_#uyGuWCht!(VK-~|nGC*HlVZqmW@%~hAsVzSAz{up(xTua?q&ZnvG=%u z7OuzE6aDrb0c_Y*aHzgcxR-WG`t?Xd(U>RbP!+pSqjsI9FIz`hEIlDNtZZkwOKyHy zR7{Zg%6c*3#{1F0b?%MvPcscGQl}?5bs1uiDUxB&D|dtRd?akHB9jx~v487XcIt80 zX|C4KXnCJ8#gB@&gCewR{N;PtbKC@?v2PI;#_bNpsZ=#+bEONi(?Spe@!LQ7yB;bv@5S;KI;Jtq=0C}2a1})d*P?TAdkG=e> zXFj6|*mr;ca7W=L{;%Jm)p+HOq&s4e?f9qIv+*%39Ee#7D{?iOLm~u>|NPL#qKgS? zCzfFOEP?T?LjEGfP3hZRB~_e%Hu2jFf0Yr(+~ZtUq0Zj>y+4lt2-}*PN=*4MmBK>y zyt+aknRX;mXoYE@vcWr+rDzOk4eMjQr2fol#x{c)XKB5x#7&L zuin25Ge(xNK6r!tAtiDg`d-Xqv(CJE|876(&xyfq0`=J8^39A*EB82Rr*mGT-u1J* z+@8~8;-qS?I6t|aa8O?Qa@C$f0<}WY&>b9JnL(H~KzB8_BIvMmajo_d`O8H!Dz?l6 zv;4Y&T8&k;jY@vq!D#1SkoltyfL7>iz! z5NE1YfkA(xwhffWO@K!GAXpSoyWvoQW1goF*!;7y>pi z%uMEDhsK@C#ayR;4}tdcrYy>E1uDYB(uk{Q7>?&POv#QyvzAC4?WXhEkG?AJPpPhR zQM~?L$-e^=kaiN}p(ce7|0@)7*T14I$d%Hm`QHPqI^qUTZ|b_IKq{7+f-zw=>b7hH z`j~GIotcgbj?wK2P5rRsH#qiJYJa-gQ%C`QzfXD+`ernqpS0fn>0@P$&=nJiy&(u9W#>H=L`+^02(tob-^VK8;U|iB_)_=SCAK``v zgG+J{?56T9x=iqXrvGK6KfVl8!vOq)h*$ZqS~S%62SS929VUDLM_QdwL} zg8iK1&)@u?M(Bv~nKT_T4gD6sqqi|x&W2$?Z7qS~FeM?w-mmEWdE4(pl@&$Lc|g8q z@BEz;IzS&1{3-R@)PRsI$K!slGya^w$(58ZDk*Qq;+#4m*}Lew-#nA}WRm60KcNyJ z2WSManji6RoZu9EnB2Ac(QgRSZU-!!6&f(%Jwd>D%VlRG{=Ctj^A30~z^8bUW8f9X zs+~rsU*y(?d6NI2aV7sRYceqb7!DQRn19FNq{``l|KrDDCPm%czwKL6T zP(U)^8>Z|vLw|1^FI$ctq?d-+d$X(jDeMhbVBsr*C)v!Oj9tjo4%H3MR$5{{pWIDW z>M%5j$);aAvRghoFpm6BfMiF|Ve3(3Isa3|3uX$q2Lhcz4&A{~A|R6+@r*iyF7Q^s z9!NCgMz(|d|K|99QG#z$dhZWJ$Ggb5TzxBmuIR@9wGPXSAsEOcrT=e-FQ#aY!niKmH_{#qKz}`cETw70Wqmjg0 zFVf#xOHl^2Ql;^yWyWL!-Y7L~zuEMxPOK7q9E61RBtMYsBi!BTf5Y|Dt^u7ze_aO%;WE$H^E!z5?8InrWmgJTE7I6rCBPe1sW9# zU|rPxmXC;kWq*Nn_SS%cfUq#c=Zy>hEm;+~?e?nsVFzo}YHn)(+F}G;zO>3s6tJ`k zHi_EZSUWpAm<&YM`1om;`72BRw>@C7RN}pTj?|;KH+BM>I+4P4y4@EDw6rWg=3H$mH%O35c zF4$+l$=L)!74bfJjp%!}470or)U3SCM&I)Q+!Vwf`?sz{0#R4u&f0_QKQP1v`LgbY zBx8ReDXq3zs)x^Wv(WG86gU~hX+C&Jx-w$Ir4eqt7)V|#tx+~?=L0T~-gXn_3pM(T zLk9Hau&n}m1R@G}JBcLlF7YJlY@uoq>9#k$$(IbQ{Q4rv*h>e$ zSOnsz#WlJkCa6lIh{)$@z_{SOT%J3dZJ)G}K3KBQB8pN>CPEqo$g@QPHI|a(fvdNQ zUZ+OAbj9)+?!^e};ZR(vRH`ssu*6sA8?`Rqj!`<0&!4d&k6sXcUY(z0eDI7aw;gT0 zoS1IgJY}foRo1=8T61O7yIz_sksIU>BP~V=kTYE6otmR8u|Bx0vF-j+hGmgH-4E5K zSsrci_3+uUnl6I9Bwx0t_3bBEjT|bsA|1k6v?n-c=0i3Jbuu+5X7&-O!%I9bhDMZP ziSgC%ZS0O;-c{ZWFFcfxCbqpY_TwQbp7<%D*0ER_EF*R!?TmzBTM!H8cDfZ3{;KlKIeI zXU4BqbAMg?k(~$*eXgB%1ZimhO{Q2{%<;BL8>7Xp3Z$H25}S&*wT!7E4UBDh>M-UnP+v~M!ud3iE;alved_UnW=Q@V zKgSZ|(MZ?g^L(D?g+_MA`N|ays;-`3?rK)D{Wn;RoMnj{W$se@O-RY+7JjC@k|X+O zb~7o%O<~+uBn`hP0A>zNvP~wZvNnl6OojuGA}1CLefv6@O3iM@5k$2{k%Do-+;ARu zPeQPwzOI$H#3?bAdsS4vtp6I;euQT^A-WB>cwC)tl5NLslFruUB%3`MAFKi{b>{L5 zZ%vKolhkM$Xf!vLOwJ> z_r-Z4=PlYDLPM3kP^918tV*$%{G(52;zh%|R?Y0m)KU=tXjL+9x-E;37I$aHAv$%| zrH<)XKKjCcwpbIh6?1tdktQlf;UT&aZ5}6tMOOD`N}eor6yjF0f*<}Sf= zv@fTiZ3re%1s%y#K!@|7{|K-bc~s{D3$xE|zLRAD;-Be5K1a_PQi&U8kV zw?}TcZ&ATZ#@09N2jZrBfeECk#A)wSquw8A#b-D_U9U0FyE$sIHAhJFsFpR7i&_u3 z<}Ig-x%%!ol2sqJzrgA8Muy8#ZG}RToisV}RLMJ}x?qlwV%{*Ri8Yo zfy2cgb&-hkh=PNlw09lvABw0qn>Lq;*;1k#WHl=7R2k_vFYSM6v}ia^QZ%#GHSMl= zgD-o6*4~pJb%(uK;ztbV8nCK!6>Qx}3ldbm*cxW8iq(T|TM(z?9d`^ z{$WhF02X}$1qj=HhDW-&C%JK+X2Oc^K_{AG^X<;(O*&MpmV*fy zq!z0|2CUjRcf%aIm;dMp4UuR zC0BS@+iaQS(lQ|xr4+-rk;Hi`aiDe8E8 z5E-6N)}_``f~p8{R1h`Q(tHRvTWb0c$>->1_OV^uuy+)PN_j$(Peur1+vO!=oFfxb zdrgmdJJ%X7gO;R)wCeSfjw-`__*mx|J}{0jFXd!%d}y5^f;K0wj|>bMGK&LIDny{u z0c-fR5c`R%QycLjq{=O!v)MnWwd)S32jbb`6u)>}yF{(VVJdwf-os*JNv+u%N&3m; zpGH|qsUmenZt}XPT~I2#J8E?UPO$Qs zftTNnGJ2vb!OEt!u73K7a&sF7a#E$Fe3(C6X!f-L4R$qIM!v*aBe1IC-40f$RnAf7 zb>k4C!2ynlaKr~NzBf6`)MJKDR(iL&n+|@HA?)r&C7UYAC0%=>QTaGJes+^(z9wNl zXc1Iq(~CpWT{xNIAuE}E1=f|(Y^df4J0e^%o+|lCwrfFd>n_sBY1Z1v*TF8-x8TTZ ze(SI-t?LeMTa6c0AqL>u_g4Vi1arN61W;*A%tuGG1$X#Q&+G=K9&C^A`3A!CIQKczt+jO zFt0T3zYlAgUEhThSydRh%=smak=Qi~B|{5+nUnN`qeHehU6{5Saq$Q8@QCsgSLXRi zGl#WRfF7w16nuAp{X9`jl94iORmkF23|C}} zfO_$$^_#oiD_~*cb{rM12r!v`_C?b1``Y7u9W*3@$=-Hrpp~S7yR*RBdG)27t0z^- zRsx2YIU{nJ)C9rKC#x7p!Vs-a>|-`~!e;=4nIkurIRKJGdmQD`=0D~G=IDB(yVzz3 z+EceM&|$k2O!4&-oMjMoJl@_Kv4s$!Dm4u9pbxvrt!cemn$`j?YrdJa7STewr)-3} z%Q*^>S~8z$F!-?KLx5mvXY-O0=C6|ww3{J0hWm-aU-`#jh?$xoYRUBlNl0&l%5;3d z{0fSmI8V%98Z=R4@38Hvensot-BQnT@O%a=NkilG2Q9+e-tLD3)X(xk79oDgk_7&T zR^!NQ?KtP84&fiifPj501c41bk-Im&pZ=y$Kio#698d?yE=`yG{ZU-^GXzjQ+1hSA z?^hn|{WCD{vKYGlm`4gv$!39~)4GEm|JFc`JKFHf!blKiH?g1dBO}V|!=rrr7drQ5 z?eyrF`eXL{wiu;mxa~;P*=p1g)WDYasmn{MW&LAV9t;CFoL8RN&n{aH0};jYm_x4x zcekb6T=QAa(2MsJsZ9Za3U4VlQvnxj0SGGNjp$ND8-&hGM%{2Ax`HQl*hqGxyI{#| z9=C+|&RjLz;y43{YDrTg_aZZfGrdb_4tK4!vS=Z zoRmyPgL8%r^HhqR_m$_&73f3jqAUZA{1Ya&LLpN@@95~VyQGUN6;^i6;ir&JUK%|# z?!Ll$2ejVZhB3uvNA1$Gu-x=-X9y<20zLZ4XOo*YZnTxgJ6tMt3obID5gktXzTl*# zeob2lwCrj3?FJfLP+Vm~LhOKMsVH2&{N9og{IYJQGhJ?h0?1W7NsetT9Hf3wD?YD4 zbvbf_>`7?ExFl2YxKk(7zKnFi@BF`DjO zeQdH zpx!FLL&|j~3>F_|N*LR52tyuNLv#jXU$}=P$?GReN`trrh^+%lxj;ikJkD>^vYCf6 zB`j4xDk?UXsNy+4SeKGs1G+A48b>wH*so;$@p!tQDaTcT+jb}AMvjTIJ8pwtf%!jU z@FLavY!ws>>+gC9)B-$wlq!3ZwD%qF(53G1xA({&#$MifLL4KNk%;mjr^?%Bd&3`L zewXeB#Ue2GC(|K?TyINWvuWjG`9v+6Gq(p(CRrVw!!nyvGlK4YPD>)xV5Liq27{x! zs761LLIpZX;>~J{;5ygW*!O#uz~rE~u5{3T>1cTdsmigPuaW(%b?4SWrevL}UD)bp zYAJV_4p4{I+S_*&qZ9reGb9DOEd@uf_(R{uQ#`h>I@EPp zJJKZkG^TnI>zy)b)O8FuHIhUXZ-XFdXgU;gd)M&Q3_7icnU`^{+%z!O4g8=c4p@pd zvLb;yNoSb>s|1<~`pbSfmAI?uc~KT6pUdTgkT*>FCo~zwH~`CAc_c_ek_iFi&BHy>~3hahAv_0Wx~MxGytB;T@(D%wN1JIQE`D z^Yh}l)cbt$Fv+-SGU3iZR8e&!wKJeRc!8Xk%rOM3n!`kks&Cu}lTQ zpgNaEuscp&e(Py3fcrTrPE}3{$HTt#PTWYZf?mg{AsI5_&7jJUSfEi)GqEbFg9b^9 z-dt3Don>o2CvU9G^Hr+ak_Y0G{xd$JGM>&X_Rk|3;JD39k131k2cttL<=cPcTtzql z!=MQgKwy!L$mOlUvWW*^>R6wFxHyJO=7q}Zb(~&O`RJhAu#gqFv&r&-AdLxT!_L7r z7-@ulj{OOj!BPg0)#hZdQ*}h;%8<-&?fR^#vQgfRCK%8>iuZ8bgJ(fpXhR9n@PX6@ zH3iFmL>!zV<%hlK>D!z6r+v91lHK?_Fe9rE8$*|hG)IokR3hDA(7Gi1N zoNw?>NB*;Y%2}88OQ~7l*$EL#V~qcOSF0YQ7MrhRn2z<7JIHK~_^xJYsJUIO6IdrH zVXYJaiE=M6kEMyUjz$84Gt$C&cpTsR2)@pL3O!%ZD15W(3tla;>9%9la^#) zmnJ@Mpxl7VI$r>XdJ$XGy)Y2yuBg7uaR4XCZs85x9 zM1fDl&NNom;h(ki#N8u9t)pSq4vqh2wKs$%MS~3uP_6uMeYW(P|DCr70^gwgM!5i$UbKi{_un zj~HXs>0r-P!{(c>$QZA@T)NS@{E_#r!B3?KQa4n$tu~Y8hOcTAK9=$)08)aXWMMiQzqJn5}#sUjz`!02OLVMdtd$7Gn4+!`1c=YjJGf(iE z%d+0YzP)!u*T6Ud_l-w`IWuZCT*7l9-Q1^P`XfF1&VufrtlT$F@G;~3Rw0gQ#X!I8 zj73X60~{7dBX8<;pH>{rM)H*6x9o)h1-tj|LrI~CtbJ>8{1nZ)H|$)uw`A4r;!T0s zS(JjVj#OSptN;O+ESE%tGcX)}a;C5d-MiuHw^p@Au21u`&vB zuzdTd@&(1U+LU%hHz&I_nEe8a#H?#D*U5Zzm!IH%9~!Na06$Ac(ZtujM*9(hM+Zu- z3#PQkZ!L_C$FVYsi$!Njf%X_IAN-;}2svz97nq4P&kt^U`Au1!hld~==O2T+}Yg-cMoE{R<`#%Q<8Xt!{1*@F4)V7iG`AsEnB%pDS@?h>@o z=1do(Aj5OV=+<$!d=eFkZsr2CJFZlwb)Dx`NGm=@XL<}Cf~9*f$|$Vmw(-g_eYmLeoP(NyAb=sim zw4UaXXS=hT6FKufx7#;3QzkNE{1y#}c8T$zCDo=v1=SE8Yh%z|tGTp1VG3l;fL701 zBaAf605%^9+D{*NeBwvAX7~f%N5Byl&AtZIOW2GKpK@DEnNxXFAH(a|uxhUqP%a`y z7B4D`P}x(KAzoloZ+42;4OIXn;sw{B*%v9*Fx{KU{9z0WO(7w$w8QQ-v4 zVAWgUZDJJ-MH@bYcms`P+JstDxJX2~mZ_r8@qQaOaVS7Fq_|9E@hZJbbaBQ3zHL9%j}$yQ~jxm?sW* z7AjA-H%?>7Z9k&=s{A;~YQzcNCz$=1Lj{*34he^hc{uOJ@+9mk;EJh1Lo5WPz^{!S z&3{V6laGT-5~%Mbc{s8UzTvkYdBg88fPrZNtY}8(Z&8^3Siuwcbl#dRum@9yJsmgt znIOqBkN|VFy{TQ+aTVfd-5Ubv;6o{DJ_E&LD~sKN!>KesFL6tw$qh*MJUI&F`_cFS z!sbzLfa4Q&T`K=_MvlmOEGJcpaB#B(Syd+<5&I_$9=e>q_K%M2VRLh#<;1%?oH{4O z!{0f>GLsOWn73akD6mZo-|pb(3TSpxVRFNzZgD|!cm%@DPoyRvD;r zM|?r?{U4-3o}te^lk=G01wu~~8^ApL_mmf?$mwL{s8I2z`!TU^UtM}QwoT^g=4;ND zW+E8B4JY8E`m%5xG(Z%+l~7!)Qq}Gk(p|9p3@E-x5BB!oR4ekq>Uq2Ff zSi~@R1GL=JItaTHF76$?Yo;%_Aj1rMhVPHZnN8X;myATSZEiMNbGdwFl-9`IIVy9D zBAmizoppy2|NL81cpF_CD{X8>;A}9?Hy0VCZ>QPm^NH_w|N5J|t*39>NOocNlll>O zPCYRU`|7&~GDmNxDqpp8HuLnpxv}zfY@2*Rnz-F*i6xU6;5bHt_wqrrDkb~pMInzo z9@sO)@`5R|fr5zRT9PBq_JBp)6`#<%$g)d8Zb}1E-GxGUS9}0t{=C8;)+U;0fDSC& zP>~YdDl(H(`R4{XlIe@wprsb{^j8gat9Jcjglb9@vA2{iM)@5+e?Jif*qr_@FS^`j*Ap)~dcjZ8{udoyX{`KMSEx4SG7>u|8ke!7s7 z6^D*a`|?BFCeBc{lfjX}1-t(Ke(1{b(iBH@tNsj+RZA$#XuqGSR+JNVwU>l0%|lr? z>ouPv1QEB)4I!t3Ji3HgCs9xX0S=DLvPYOIB&9SwmpY34{^oGqGCtoPMsRM=-3=#ae!6pRhaglk_fbyE2Uj-#$a?&a`TVO+W6sn@p+L{4%F3 zZ+DRK^Wvd%wju%9_beLUE^5Br>i3{&cxMP(Hys)QIqqb}k^SGKbXhI^4?oZQaSfTo;@I)#=3AI!XQx0Olew#Ecj zo^d54sl3Wj-WBsm$-C6^Zn{HMX+smkmyNq_;VPjVoCVKYJK}eHk#3k!DkcfZM@-!Y zul{XI;ef`g=yg-5PRs8=TU(&X^X_Dl?X(UrR{MN?%-?*Ot<>TT#iOG7^2@E`oNx^e z&2fw4cKq#B+FPGqyNr2oTz^8p7c5@+f;wl%j$7)W#HE>qB5RdQ##?~pyh6v*qnXC2 z)Wy{WiLi&PC>1QybqUn%c^f(BjxxDjpCY*z)0cX0xzX+m};G@6tpmAs81@S3L916$9 zbV}-ErBo%aAUzSJ7jfopl?cj2nwp{S3!CJIo0?fk?0GugtXgzMYpd{)oZ6OVrr(t4 zH%8_F|A#=qp<{Sj<+r#8W5;_}v2Tavg0jfyf4%Q4Gj4Gdjfl%`Cup_6*eX)nY>eEf zO7HM3<5EB5+M1jHRL+#KyZFX!GCR3?=wl@TW7o&K3zT^W%51F-{Z+6R$q3A$G}FNm zP`bTP)4ZP=6EOq+6X*2+B6(XoDQ$?52VO}b7;8NGx$lguc02E~k~Av1 z3DEgkZ4T}+&DU9w-wkBNtt9C$&4%GZs+?j@Y&g zXB zxluG=Tg9<>wVl>I)m^sdrSH^*gNatf=rI#Zp?)VRUaiK{1A+|KWxH9{kzsuebt=j( zDAJsrHywh=cNH$GFpR)(~cE^A_OD@&^P#^G5Dy3kZ?6Gej!uH3M0#0nVTH@IK&4a9VstBVX_ z2zHf+yC2zF<=551FuMIhF}9CmXUC2%`({9AiBJY>ix>O~wqb2!6(8CHqPdgvkR{zC z?2iv$Hr6ebU1~Y|{CU7D7h+gLy#6SpHedTFhvzis!h3H?Ga;jNR-K6lw{Z?nmOzoV z{3vc#?~rFgg++_MK6vM=ZEHaJC#b~iS0wM6AZ#+3r6z4ucJ@7f4*V&I-ah5i#4W=l zfo$nG#^WxQzFBS+r;L#gUv+JoHWvEB-!XQ&5_i*ReeUsOD>G-Q{CJyBgjUpzjU_{5W~6d>>3vjsoDL_ zBcWjwM+hM1IYG{Q5j%FgEqS(O+ZR(ySIwAAdY%|BamVde%|l{w<#1b3wkVS?qML!y z9xhE-2)sJkUL#k`V?lT{PpUI)?;K>PQx>dCw9EEpGhH#?8+TuNZ;?%qiO0M4U z0Nvx_ltZu4+8B?}jFu9MS>4pq10B!jqQQBo{V^T+-7$GG)Abd~0Ud^jzdR72p3v~n zC9t0V+?igHFtU4BLtn%u^pt6?I-oSFC_WAvjgvb`mQW=>@_JfPN*apdZ7pW4VjRO? zc`8%8%Uid8&cn@a#yxwf)>n@^tJ4&1->SPUGk^4jywp}~kPbI~7%NHCl*{(eujg}020D+O0yGd@^Qg?Sp_zk4<;5G?qTP!pp8eR6|d9o0yhA9@DTMW0dU9NGiFY3AB=4-R&Ez*Ee#EEvp51;GI zzBgH~ipnpV69S`6t6Qj1QYI*tFM8k`@}gM{VyBi^cv1xOcBX;+!8wV`pWtQ@uX*oy6%Lb;>PIhesWf zPmF687uTA6nX30=B&L;?p;{|HdR=RLU|&&1{%Oed%VE`dl|ePcRyiO!i>7 z)v8pslBm=#Aeu#DQP}f9cIM#bhsXP&WXp_G z$vi1=1&1l=rGWxuzgv>poM-#@D&uXst7+GpHG|9sxVJ`?yZBKyC_9yv{>w9Z-A;ED zhi5XJmxfn;_!Mawoi)85;}7n4Md1D}qZ6aK52}R7S#X^#7s)0Ftou_}JI@`~dRX)+ zptOc%+XbD#1YXUwccxfCVBVEQLQeJkaBzue{*lI)e2M-ny3($LL>C$%H7ER)QQ9pe z(sZu%STz@%l`-9eh?}9G)TItHEum4Em3^AEId4xprW^Inp{;}CK(>9~`$oFyt36ZH z)97SlrUGY#23XrXRc^^gq&al3!80v*)+!ie+5&DdfUM*o_iRv1Etyt>xOFNhk}e_I z6$yt(2U(G(p;-9H>f@*E)3t6QC@U%H@4gyF_TcLA3Tm9(Dh>!{4it+k7=eUkbaTz* zsh%b>5)REIwC%Q$h+;V;t*1AhP9>T4r6D0osYM!FSI@{kctt;Me67a5jdP*;;9!Mj zp-ifqgT$_pA80bwDq^K=IewfX^Nqq(06r=k%QA6!b3IQD!M zwwRTl(aXNTULTeW=Y{Ts$lGIj~4~?)5Sq zhu|lye=bn4Jh*A!AX1D$eJm^}cF^LFF%kCa8%Y!JyU~WNFfMMP66ZXrEj9Ck$yItv zvqVlMcI9EDL2TM-w)IO=-SQ(S8^%uX5Xt}HjYHO*HUxTu19d|xc%~_| zNjgE?<>>8(_;hvuj}84W-)kR_O*33(iGYNg1V}+16kd!!Ue>MixOiQn=8;Ky_!T}AQa(BDGFw<3kW4BqOIkHm}$Ir5JUl)u`M!ubyb z2>8GYJifyhsF>*o>36jg{qZ6Xoj}f;hRCMgIFLf(hdt`n7l==WN6ftnwx7kcS}h8- z-2hg#;z>p}D?Z-#WzWN+3qp~ntr&ZU!B<3I?f#Sje=?qk?M74LZk0B5UteZz{^qn_ zcNwo@plqsHVYs+#hCE2Tr3ib-rNq+fkTzF2%M3BDu_EaH4%qi@K==`V(Q&@Q17R#* z(8r2u-FU01HzhK-WjW9+1=$t@gQP?Twj2gt*dp1Mr6g8z_L?9O9Ab{#u{R5J=o51mdxl491wqN7xnTn2pKl&VMMPkVepTc?-jK?!8-tPE`s*bhmPM&d; zaMW>r2%=cbpueeEf?|EKfWmQzKhpDFO~w~-YKo&4O-lCdV1SCaD4^gZYC`9r5xx3A z8aFOG>?BWdl1Gt4;R~fTtZ;Bnq)C2Uc-W#jBs8^TMT>^t>Ma7Et?7TbbVc9VOBh(7 zX>1Mc*3Aozqe{R>znGi)is>c!njbgVG2Rc{U>h%tgDZ|FwD~NHuF;c`9=2xSp-Fj@_}tdjbjSBry!yKS#K)-!Z&WPBNSU&FFJ^yMbH5R| zAmTj6R{4;!6;*)J^JFC(&36{P6W!4@5U;FG=a%Ri{jy??@Ea1^W5aN@tz3-l!>D)W zj{`X)S<2V+V&A1F;f53j)N8ek19HL_4i$((VleT9X`Gf52LDcfTM zr90FRd@-9ntOaRx6etKbOfE&MqLkaZP~jhZTAcFv=~4q3k*&i{B4E}|1y^yQ2AZvC zllGYhV3+BryD*QLnFp_({;a6P$e*0%qA@?`_3LuiuOlS?O?WsaPzM!uTBvsei5roEM|G^3G*_D3Cp0_vkqBV@^Aw@$wx;Eo+K$S03Wb^OWRNN@hJCdzAO z>-86hsfX>fZ&NTlVw6%-_7b=Av=Nw`&1T4aYN3z@c#oNRIy}9iSDbiaKEks;pI@<9 zP>L)-{$Wz=)(hc)MRQbV~v?!BxFrqBWqo0`5G_gi{ zGK-Fg=rxrYhaB!J6TL%H5_!sn1jW)A1l{5j3Pe$nQ-wcAw!;;`4@bSSXadP);Bm6L zJX(2DCB-27K+jzo;&G);;xL|?AT3E-ghN7ino{mX5ci#@ovbw6M^{qIgeVJBk3|Vb ztZDe?G@~Z3a~DyrH|=H|m#16;wO%iO<}UvHc!f}X?n}i&mi8r1fq8OHjSiF1ttQC* zHbryp7Vy_%Q8Jp?d(H*%5-?tQ%Jb|&F>+c@f-miEmuf+4STs{+Q6z_ToYAPlJr-uJ zeb2Z$N=W(LwO;3RxbF0I-nYJ|68evC7Z3wFrFA@Ak0AWr&|?fP&Z5m79Asef)K;*zWMG{@uSygU5C%kVY8+91jNE>-q{L5iIhylC_qb;}YEQH*-R zEc+@5@lTj7$WZ);K!(nxRLMWKfT!qjTjgGN9I{wUs*w@Z-4@Z<_akqm# ztu*1qn11Hj#R{q=Y`HDjMimybvg}cNiy~`&d{l$W*4&g-h#M*{&(~WM7VAl{Noi?s z46Q)n8Arl+D@W<=M-7O6Jm?_@Tw_Be{_xRh0qPgftf_znYp%{=Erwq{{1!BGU0;nr zb?KTuU;pDjk*`K9?7*<`E!PH`IVlOSb^`-Kr3)Q=(MFN z{Fo*IPh30msF9$RHIDI6*zsPnN4v|mWW7CZ3O2PzA}%-RRS~B59Zd1rj|-MUEza{B z1r$JVje5xh+rhMZV!3t>E(dZCL%f|Lv3-Y;tVUBFi&^y61ct3=8QwLL=l2eA18iu* zSRgoKPOD#ILZQqi0d5!=w??hJ$k-7hog)rXT7NITZqG!LLl>7fqj)^YIiJJY&zU#^ z4R~{Alaf7GFeRY#jyB z=Ej%o1vyQ%B2-#R<~O!AAw%X$Yi4pAt;5v{Ao{cnVnj)A@lwnVJF%mCYehG)d$blDp7 zM@|WEWSpL$mqdTyP1+Tdv+X(tO?t58WDP}b7Vpx|O{vguf95Sv2UA+tv*Cpolrf9< zx4RI;kA@7zwm44~eNkAi!ImjaeHBUMuK-rTHm9|}$2@_4Eim^zl)-ki?M}r(wx|0P z@xX|AhW`v;CPz>3aF**uJ2XpE;jc!T@=F$cWAWk_=Q(!6m#(?YFz$|-4zAg6smeJw zp6d@On;^Ffd$%%3+dw_<+4&c)Q=AQyKGb)Yvuu9@GN$Arg>n9Iqpn?1RO!OIiYe*x z25w1wSQKSU>K7Y+o$j2BXyk9Fq=6`X&d>F5f?$oqVzrtVO#)2$6btQ$iTYh+2)rM? z#6?Q`+_oG6&kcRSjO`r~j(0A-6W9uxyP*^}_-{K@WZ^|vXPI~pzjJwBLc#A)jc-A# zr782efrj(-QuWPa^JK52{$m59{UpexYdZC4aE*Ois>xEjH{m8|h!1K^>F$1m;7 z_v@k-i%K_pS}FLB$$`4jZGh*3WOj=X?{gqq?{B0jL{IpKp>`XkKVSL1!ggYCZ4DTk z()naTzn^2@yoE;Aw#>O5Mkn$Qj1>K#z{36fuc_AN6Et{^ZY-MT#arAI+zk_OjXc{XplXih$^~H*fq#pI)Qryi@ifOIUN~S(0)G zbg_F*1jWI@pooR-XTP^s7ZF0ZaGztFd3@!ln{@Uu!*QQH4}`?!u;8wK2R}7YOgw{_JZzaAdnp10EwutX%ho( zpAw8xV^WXg^T1u%*hKk6OncfSs4fdO&Fd8bt|=3|NdNy>d+WHUw(oyf5CajBR*;Zz zfB{B8L>iP6vtKi}8$`|mL4aL(Rq zuYRw!w{SI6&U|4PI-n`~@J+LjjQf5ug*m-cKct7($p`AP=avz)E~zW<;0H)`LMpA< zQMAIL$e909)C>LzQb!yWyWK?qth1)Pz90tk*^5wc3!)^X%D2@|?*kNa2iO6{Saajo z1@^+3aSUyt&7k}%=l+k^j+_)J)mJLbg3e8o51K;l3uMmfFP#PCa*bD%nJ(QM(UM?O zB5P*WFiVYJ+?hUT`)VMRS!udJi_Q6%#v(r`u;TKgQ{(^^lkE51{>-Sf8*LIA3(f9v zb|Z_RM&$T{*_~Z)cb8AsJySTrq2$`VE=&A(i?c&|GVI}Y=_t)mLe|>i`Bj|Z<#Nm| z%%y!2AXdMfE){F`PHOy8Rz21<*=D#m>D;)NYI;O}r~8ya>SQrOVuIUZejv4z4{AqJ2kX|_ii{m-*WDx7VYuMP-?6+_CU`z>>m@J!%Wdz~|==mrN;c2KEfw5Px9C*P5R3Jdb$aEnRP zsm{OM@eMUIdlk2bqOE}Ny(UbBK8&mPo3{?iEReNOU|)a`J-&@v~*MQFHC z;1B9mbX!@q_0Z%gX`Ef;`8IA5zSkgS_TIwfQg#nosOBgdekKnuf3oh#T5euyVQaR6 zj!KGA6ui>9vc|cYl)fPGrYx+5fuc;RB^%t62?Yuss4LA~;zw(LVX*HJgBv3)FqiNO zGx}dPrYei73HPMaFdN7!CcYw3xkSgwUMD84V!l}E-fqBF(Ws8|i%RbBywGA6>lrHP zVSx5Nvj_afD;ijCc#L4j=UZ-K;a;x8jqr5J{2b}| zRq+9(qXmvBpe(0f7YD=Kwl3Oa2R10B(4%mSXOSE=H+O5>m#4lPQn^K5;ueAgv8iA<>GB!`i>R0JIWy&ZHue@3jm?G0d9FQILvs) zxxo%cdO^1_)QV~qo?&u3YR8sW-04$_-s=a;ur8>{7qE`G%90mbZ-)ZT%NVX;WjiI^ zk+-_41AOR!@JLky08@!B$;6bVV6$2&zNjjX+J0MrVl2x3sKNb5-Bb@Ir*WK*h9adC zm#Z|Y=GftT8!A%Y!S55O+IO2DdI8oGyV#`kVel*g`;7<6+yO`-m218Lm&51V8~d?W zs3!|x;1RgeUnD+)Mj)zj!#2av9`TIV2hGUi-cAZVqs_@EJ zpd1L7YX$^>;W>=}pd*$IpY|H?h1srW#ntk`lUW9*t9o$^7dn>Gr3@X~$ol6gfSiKE z*e(h~kJJmt^32qJXD(0C-SfsLh5zqG(U1T#8H}(`%8bA8-fy%2{%1$%1)bbu#g;(` zSNL_FZ|0qsmL@%?FD~dkU&}Y}N+B+NoHtg8qa|IUBW5(mI!4Sf@C9KmIDIgjJ{82S zt>{XXXWwTKH2?%f+#>q6T@X-!)6IeOAB+l=4!M+Dz;ST-ft@b z{OOj1)GSKXFSGu8*WMW@Y9ohd)qz~Q*J6@illS+s4GAkyT-tL0F=xb|ZpBV)r)gJC zb$}++PjJ*SBRHt z(U@M~M8SeXVzV;Jb{^)?E+)O|H?4-%z4wJ#H^cKUaKc_i9WdNIqDLp$1g-kA%T!9@ zIM={;H8?DWB+}X!j~#la$!wDLhO0pJ9kg&=2SlE%%N9NQg3Ie^<%LDwv-N5YA!57D^O8EvFC~2xNoKbj${gz7y?=4)-ERyRO?o#kA*Zv)Ou2)^u9jdg)8# zl%K8ZEB8gJfCCdS-7^a2b)7HmgPm;osdV9j2AWi%gw?5*dZ>|UEN{4#7GoT}(*y%` z*bpn}UGrY7I_9brv%cPf%d8IpbXLxj_QMM~K?AM$sqH&3J&s~;wJALYC$>yp+6MwN zHDsRskiuj(difNM)C{P}KzES(YQm1~uGjqF#wcad+KXc^m14!NxpkwuB9Bpvw`G0% zB|IXL{GQZU;$tJ#`cV~UNWh5j^DA^CQ;L?iPnwRi1mHPeAGSwJi8OVblls#SQM~j3 z%;= zeB|CETp&k@6Yo+`o3Ge`-_Tk zs*;HU90%GPrOiDy43Ls*fN#0e0H*=+)||13VXD-Y=Nt8PbDCN~#%sU0V1G@?8Vsap z%Dya4=$w|0lS=%%u-w)*SG#Sq{U~tayPta9>i#f2`{sheHf2fW>}O*$4)KPbndj5 z5QmG*%z2t$6#pYkNP)az;X9i+7Jh5M1CxdY_$6DN3^h)Org) z16rS~)0^GQ4V=<3&n9X;nwvD$+Zm+FG-Q*NF059{3e(JYan^l2pVs^bYhrW9-T0gzrx@mE6a9aC}%J8@wC|XQ0GrxwTry@AMp({o}R_!7BqjG+v z9!IA4%%mC?fOs|?4Jgx0cMrCwII>*Dm-N%v3C&zEyvxPt$hW z<>*jZcoaYI^>Hl`hy6DZ+pJl_OR#))VUf#u3FbdL5I>X{9e85OEXoad6h{6 z(JieOw_aHD+wS2KX(WbzPi+IPx$oCA;+j?#?p)Vz?5ZH6{}PB$>atW8E2vcZ5_Y3G z`z6-~U~*}tpJJY+PqvH3I2h}nuVY)g+zEc;smLQL$Wnxuek5KuTv4DM8XNZZ0gw*K z(>g1AW{Aw5<&R+Z{*gi(cTsdZMgbp>x0Y?;D=BPJ9WaS@XhEAn84jbkOYcz>JnM4& zWKCX$1mzZElnkKXPH)$I%9y?CKuN9T{08e?a7qvv_u#jA?ccYWLHax*$P?f?ZJ)27 zp@IIGm*m1sVf8SJU@wbqhMu-oR z%;NkfljtT1in~d1POrxmwpq7) z@5JpeLj%lEd28Y6-L3%+FvN>{0X3`uXMI0-_pHbs_>Bnf?!{%wqi3mJ78`j9+oaU0 zcF&EX7!^5a74=>M7+T20+RdPY8K%^cuYy}Bp-I^N%U3#IRH~0;MSTfUw)O}2N`GS0 ziMsJA)!`Z_oQnp?`3~4==Xwie-ik5}@x* zoQ_ato%}X)p@|;=Eq$#JY#IKx@CP~J10$^fG(h4{@@xh^31`|-w61WXlb1nJSAT^w zg=9NF>OORqr>Ii^$~<5s#q{DWW?%)KPv9Io7lC=!haU|OM66k*mJC3807EelzYo3x z6f;Gd4ROw5VhGBG@_bC?p%Sjn%h81idyRUq2;{L+EVKuYbSYiV= zI$ar56rIfV4d5%hot<*%vyKhs(v9t7)8h?nJkAW#3lp_&gHQq0eM(wHNL6EXA*)}} zbHKH2;t8Ok+Klst;+{ta%rq-y2zDP~!i*Qsrz=>gIYOs?;#McTg3rhuRxiCgo%Z=h zf@a>L0(~B0rO=6vgKr6hY3Xz%{{r;&UhS&@`W+&nE@_pz2~cy-62C>G8K8xtnyZl( zW+^@qx&CFhcX=jbsu|(*>00R5BZO^+^8^{VXKUFZ>xRI!fIShv@QuM@44c*3!6vhP z&nxQPpDC%WYnQp^KVt%kHu*`S<^ExwM~kMRpOVJrAs#V4@=XSS6~fnYl;ysD$l{qf zL46-^B;t}$6|+=za@7aAj2#wscGQ+8;G`HIUa>J~yV|^qS`z-PnF>%pGs-BHx>D@} zf7BB77Y=Z%fCdCE8Cgr{l226(R*2WS7Vav(q`80)oEamI5ZmF7@29@E}OTL%b6JLUZUUrVS|3=qF6KN4;wO zA$EaSnTmc_!>Jfd3 zTuZis$&yKD)1J2m6ZM1L2ESy}=O+E3nsbURO<3K(rV>vG1`tOYAkT7<&t_J%6aP3?Mq!vcwJhamcbRVkJH3(5HGk>M*b>UCLy zF@n?T)RKiE^6Eg;6&eTKVk)NHmzQtx^`?r~+%iaajEIfX0nJhojLzO_KPE$XeA2kx z{5}NWwu2dLTdp!0ApIpj*5pcMJ!69tH76;#md-ls3{ya7h%Kw{{6|-ux^kOBpJ^qo zr#bE0|6>v3%q{>~lgzo>!Q3Fv*8R|0`85G^OQ<>MZy*gQr{Q9wkAhI?iyX#+!-hnW zMR1xqt(fflFon>L+kJZD`Xa{)J}$cN7h5Ee*cocQmqh-IEhp0*HugGJX=)2W1NE<^ z1m#jQ*UM$Sr{>Jr28UYn&*6Ji?N9=NjS7`aCJqcI=xFRw=s9b62IIQ-!x3+PV;(71 zyz;U~E|$}+yWJah<6(=9V#-CLm0Zy32V2C=PMCkC+O#vRDa3H8H zq$jm~$~Ms#sLt7_gicw{pg#Dw1I9a0ff!o#d4u#HIM;|6#rZ2~!kfL4JH`JuOiVFZ zIw8l7$#)|q83;FWB61{SYu}7&Wo-kX7!e}SJn|Kgd}l7tJ~qEon{`ynV)1lYJh?B~ zo7#G3Q|BtTvLFkWbFG9I{?M2&E`MA;y0WE)p&HISGD_yCe16AjPm`gW^N>S@{P!&1 zh?y%*9tAj`{)zGoUVF%9*m^EFa4?RjnDe z7dR{j;&AK`wl({hh4vUiGSK2wg0dVmvj$n=M}Ps&LftywtCpjb**!7XRwmqwJTb$X z4hc~73Rr5TmeJDxc1loODpEzuEpzq#=)gH`-S|KR9S;w$lH#QEftpLBbnbEs0sKKG=vJ!$8%^Jicw5M5kSSgp~DRjaT ze~-I5J^}s1v){{NJw5fuU)sUNK}wIUH0uk#Pg1B91`8%{B1c zJ)jdxr~09sD|I@ng(%><)eY|%%jnL!^NCz>icISY6#qJW`2nh2Zh4cWBlda8q`BnUacm9Ve9oN}IZZ8BKvtSVdXY*$~F; z^QV&yI^2Pt*{E}4cYl|V6C6}f{aj_gbUFzrR0&dLZO4VMYOBm9yxxIvYrfcbF8)yU z!MU0e=)9rYoXyAFGCp|zW#`*L;*n*ifU~*`7$3L%J=9?Ss(EB#!C^TyppJm<=^8k) zQ*YIuUB>73zGql)ft(SEgNha{uW1Tkqo^9Omcc!*&VcHF2I3(!Kyuqt`^*6PhX5B5 z-pvfRmqaFUsqxgmdKL@ufr)8=${rI-lX9uWMO>i2=Lr%CtL*bQkWMtKv@Dq}Q7LZ2 zpEqLb(;3WrY`!Nwwl%-04*s~!Qu2wUhqJOSvJ0;FnF&` z_TmJ=e&=B6iWPUr1KF)m-g3X)h8T*ih}sujI0{Zc5S@_j znvo})vrA;skI%nWyfJ929`9#u#gODn1oYT;?)wu8RBjxVdjp9>LAZ>9jcnk>b|WPv z&&my;7xAcA{_Vb((FdrKR%JV5kS2FKe&QL&aQvK5GV+XovYV8@xV67#XyURjWEDiv z1=dQ7T`pA{_|_+`SI=f8;JfrDMHB4C-EOrhma9cxXuG)RrBCF5V1lFJ?JHYO-V&>@ z^&af5t=?A|Bc94Bdko4>Qmmu^(hrg8QBHl1{!fDpaWIxm-sdXbmj;pq4c(C4Roo@z zOQofRooit!vll?Gh%q=0;)8nt&kh1Ov($HwB>En6_w;=^>y(k7 z(`&`0IVj$5jrNxLv0gv(U?ItDy&jY&2iB>UV7UnG260BU7Zq~c zIeiIm+Py7$WOg3|{V#K8)*<|0Jhv|7P8cIT*ne244$>b`T;#(4ODN7;F)!aS0z0Im zAUf$lPvJ`)p(P0mP=WEkKyi|UR^t?67DqJIN;_9P>0qPb zOG91EHe&a_AgROQr4u*z+_XIrrkrg{Q$n^01d@x%on4gG7TB8V1v zhB{89eW1MN0UYXZSudgO)|Psq?Gi%*j*!=g)2CEuF1XjD)Yvmg0q6CosXLJ4OU03U zaUs>CrRrhp@}q^%*5-u8Esd%>=8CSePhWSq@#5rZA$tJRB#M<3JgYXUWnHFN5{x-r z-n3R-il)d&kEA2LUibV$=23 zXO(vSlz9?HWaDv2?BUxRKwEW|(E6J?AOW|&cB=(FiOrRM6N?NSl-BBLj$q*2gXnCG ze8Kf^D}_>X$lH%aJ}))SldDTwSV0&(Y%Avu=Qi9JuqYjS#QGOL*OnPy6&4vZvyKSu zS-Xh|bW;uuGB_p>EGciAQfpLZeW;~ybf}Ci+lCe13cV2mR1SDn9ngB0Dp}b7{`0MTJjoNHIK?;_|B>ZNp7TkBT8OuT!aH(I&<*r({V=5cS zCgkS^e9jLC@L$I508>O3g1pndIw(p_L>%a#>B~zYGMnt}uy5Vl#3V7fXRef(?mF>6 z4Cn-=XIb1R43g=)QMoOu#Cd<4P+(QZw)|QrLc{8hX^k+9 z9aWwSEI;}3EV?XG2553iQNL}$T@ClP*ol`^zyw-Bjn2T?2o?SXXG0Iw`$lY&bmROT z4?vB1Kx=^JtZ?g1W-%ZIGMIO~ruj2xDV7oDYry@X%i2!f!hNmMq={RmxUC9`~k%L#`$eL=Nh|C)Jl0Rar)3yoksr14<~qJ;~tP*a2O<+cKB1a`7FBs^P8I@@*wq4{>W?p z{PusnnlVNm5mvz2w)yMkZ6}8M-&4w_2SO6?OJw5xoRXGLLy@PUS5|Kl4 zjG^;qbj^Jyn?nLp;#PC4TK|=&S_zN^yvAGF7yp9^J02kuA}MTP&HqYtBLNig9Mew& zk;qES85h*3mRZL_iDehUdi;M)MK@?p%qmbrruH_3vZ4>CFO2)Jx3%QW9+XE zDW@k8Cj~nb2$Oesp_T5*j-3b=q1K|PM0CyBG?8l~Ur|8{bgH|OW zKg)q(kUXNOJ!hjd&`2x!qRG;9ncC@o`C|6eM_{yW1Q@rtDuJCQv*f&EKNEoxoCih* z_XB2GUFl$1)V+qsF{Aqybz7-tY?lGlOW^}!m>=a-{C8UZWZUO_ji|GN6S`V=56nU> zWfj-7JL^rN4$|D8A?V^E8fWRnvUkr%U-h+hoptXNq2mPIie1?3D)m*0Lmp(^9LdJd zRa;*+tLJ-t){sH2_Y$SjpF;H?U*+pWHPaG$HAgFixXN*cIE(;ip@*7VQkGu*7p?S*(t%rk!v>H5WN;FX2?m)ApzX6RF@{vxSQPtAabJwDH_}(-IzThRD{= zo>4Pn&5LAd1ox!ZDcLxKI5c8$lM3=_a9NxOF{kCr%^aC_WUEO3t!DiU*+7yo`AR|3 z_r?8^!h6On50Pz0F54mC|7n(bZc|D~wrlLadSdcFrcj2o?jtKHmSX=4qudjTV@J-i z>y~l-iM9SQjS(q8x|VFEN-6%EgHh0;sHK6!>oZdCVMOtt{x}~SQXvzM$}|3{0O-fk zxJ_LNkwwaVtAfF;|KB7VK&|^uvi!iCXLaxjAQ|J^qsgEjU;h*58hP?DeJ2^t_k&jDlw{B^(SeWqake`fnf}AKUc~L9!%M=3@5n1Y=A0 zZ^7(B{)Gj^3R*bwUKbNj37&>pO-{lX#_Q@t&yYzi^e=L2LPjTI_m$%TPAoeyFH7h4 zxB23l2@?gi;L8+rbV0E2IKrN0eE!;VFA@@z4!v(GUK6`&hv_k^iH(nqP1&SyJ&YOy z6)KHQWk|L7zIKG=c$=HEvZuGVWuql>iN&gI{cvj}XOGVMOZOLUZo`bS+viX&^PNMz z{+w^CLtgaz)j2fsA3p+@9dFPW$y8^XRI_1{tB-P9qqikCYsa=fk)C|$@8@UmQC+;D zg^K%AL5trOX~mg8od5gnzrM2u6~F%1Uyv_|Ye&DTWFkU1`bu?j!c;0^P5RsZ<4f0{ zpL!Cgx2+ZuvJkhYdxB>ng>HQP^{GdY6Zw4k9UnF22LfPnzuxaiMe}m?zG5LY+a1rJ z|C0hgA44A;Pk)~Tbv@R3Zhb&a-p&zf8zGDF3&sB?<_&)|&$Ezke8AB(CK9`_b7V zX6~N)XL5>_gYU0)e@%1h=J&PcdwziqT^z54ro2|d)2@3fEnJnmT~fE>*)vwH||P!qKP zIEHlZ^69(Zd5)Y4czM9Z$EOyBFBW%ZN)1@JCbk^z$4vsxe(pd8+V5XDgKz&HLI{#9 z)O-enzm19foZI`$*2@fZh4xZBa)&x9iw3->9yvv;f97@g2zc+I^MMif@6_PaMn@g7 zL$aa@D)`!|3H?HuU&ifr2Vjmav}k6m&Oji-fdr^bI$mb;GChFe`sAw)bq|gicwck zE>D2o`Mf8k9-*n!&|!)q*QP|vczS}A5z1%>1fdn&Q;W3rVu4uyJ8$rj-vMMU;_|c0 z8@z5x=@@LH0zrfs+P^G!4H{By_Lvk}sow4w3jxJCrR>!y?ux$ocP>87Nqk-aX=UYC z_~WY47viTZx)2(1o4mxW;47T5UY_J4c~_YLs=J8gL>sVDxcQt0b5 z*6A)$wrxdRV@x#iyPB5^9FRhJ6Cx7j^6!K{LD}z@Zh4e$P)Bl1)Bp&#kZid@|(G^XRiX2hOIY;`SHjz0D%U6ao$uq4q3K7;#h)_8mcyK~oO7}ut$|Bni~&`Uk=v7n6b%NW_a_>a zCL->MYq{RfHq9tvZJgc{=u+q+OBx8u6|dAegqcrF7Ns!naYfy!lvY((E7?}wG|cB6 zcN(dD$f311-Kng0b6Uo!!h0u+&Rh4U^QvL$GHsc?R^Q@$IeUq3vsmP=s+9=AW2C~5 z8$xOhlGjsj<+2qI4^zg9Q)dvKBcoWG{Qu6Y-*xnT^a_#P${oGCbj1~8(S0^HAz{TW zy>#2VX<5@6_L@XR4^Y}&Jyf!(5ZK;^l=okwIn3U8X72lqPrL6rO<+y^U|+xnPviRb zU0@KHsO7Y|AS%N$wa2azu{U?#7d)BwY>tNJEs{6r%ZB9+Y>7?oXWbJrY0%7Vsgf~Q zv<-truaVHyv8`BAo2l7Dg5IyS7D<5-N>q>uBmE(;fO6lQYO4o?gPG=37 zzHJR0vt6lOfPUvtU*Yt-%&aOc*li3)CD+ku(OVnJ|7Co2h57n+#$V1KrZ!~VUbumJ zohI~o+rD$Ym6n?8rTX8^_c_!^-!$I2aS+3XGwzzj9Y^yrmQMz7| zEGFyLmVSA8-u(`LoHtsSjE0DqOIihQbfq@-%X4UhR4YW2SQh5oft|Se0$R7=nH!VI z9Xu%>u_mO`X86aMO;2tp_K8PMCv-DH$r zjlj5Bcty&r<8wlT`OA?a>xPC@rTRy`D+%qy%MHQHhxHzc4fiFEbtqg-+@OKFa*vet zVir2yB%ludL~F7bIf zrctrj{HemSdc)nZagAC-g`(VqV*QO|;%yrtEkr7YHCLEQob1L-q6^>-xj(iyosf%K#hCuLNuAjM+tkO=Es!baSbsu=iL}=Z=QQzAzAQ zs^keIOV-yr(AjzxB!~1VD%B5k4ccSS#LgdE%qn$d$ZD=vRm@QN@1oB)P%*cKF9*6K za`2N7&~nbMH;VQBiv(es#B88UT_HGI)t#M3{6j_PD~V=P%Y_RW0v3Iw&l22Pwf`^CG+VwvED|ND-{|J z*&772FN4U=nawA)d<;6k`rC8iBSm$j>Aqvn9CQhl$xV{rcCC%i_KtSXhX%Y@!ksEI zcQ(0x3=y66OkW<#Me=MKGnz$k3}f9?V`)Qu8~IYI3EB19{IBX4Dwd6dl+t!-OWLA= z(BjEm{4_ACYMeO}`O;=FtzF7*Jbmupp0 zgPLpM(Be&W+M?~x-`JpfFmF`2Pj%`9gi0%rG(cya5TC`oFGtC(PxQ!ge9N5PS$6G3 zXh#Ld?od#%eiXtdUr8e$mva;%YFicY2vA?0Su&-7@(Ou2x)T+!7q|W5?lQu-DrW(5 zC`^Q}+e6H|6Vd#YbP? z7s1wQE)MPusCfR^+1PMaP1fl!G}F?f!0SA2kHcYpM$9K!d9AT$y$e-2KJ$*85}&`@ zo6Ntm4vutK`;K%MPdkeL_<1wVsOP_!T?8pWB3y=t2k7Nz4?le|z(}QYWu+T0PHYK~ z40eIU+>W8*W*A6;E`~j0|HtD2RAMsD3aY#6d3%%^$u2&PB+*m^Y}HTn7!9?ddMxN7#$W2_Ghh! z7Iu6lOK8U{RtszT+;<`zZbuMG*G^e|)+M*mEc0uxzkmlZeNvPko6s;}GiUpG!=cXE z*G$i4{o&kRxmrTax3$A}=j>55pHw=S7g%K-+7pptyg9Tluo)hpIkNELrKr_OiT&6y+;?s(hx9t5rTA89v6eaH0OjNx>3+r2QtIy0tHmBe6!TJ$oYf84b>Di8hW#{wN^747#d}6yJLE4Hh{Xk$el}Zrv2Z=& z1{%>Ly$a0{t&9sVT$LQ05}jh5GK*-Y^`tobnl%GxY@bpfD%X+;4BhK0AABoHTd?w~ zCjUIq*z=6I0*9Lwg@@)o&Vh777UKTbV&B?cqla@?V?Htm(OwOURtIA=TDP2S6Vy%C zEE8cG>1!{aCCeOOdOF*6Led?)9Nk=e;)L^&%vY1ZQk_2LJ~jQZsC~b z62BCeXEU$~$VcVg{+4oJZIW`ES5e`|;V);a&{H&%*ur`Ie9aJ|vw<{P5^Q{-lw zFSGr}v1ZCx!iTj}o+VQCZ8S$EV=hDDM!|Tia`P-h>=%J}Kya*+>q-x}A`z28X6 z^=u;OMyL)o_7e4m;&K8-534Z>eFRR&Q4 z9mF?Ja>et$^*MTRS0MP^?txbD#@*Q)LN25ncrG#4&nD&z(@OEUVcv7=%Kp*JZPDTA za6&ovEpeeEA`T+=!{q(Uu1?G@$1fW75fv<*yXPkzRXNn7!bOXly-~Ef$g{(Tb)C%% znsu!3`=tkyaxN7*3peQ&Q!_J!uJ{`E+3jV~B|>bC?iQ5gg2(rG;0;vHfD?GytNbOL zYmsd7Xk0W zI??QvStnXnt%pk~Q+uNfmGWy$YFbg0_AVnAy0)p0Q#8Zu{W@G^x5#D55Z&{fw$dKR%ljV{zl0z1$rhx-GLtT%x$o1E$4Z%)fJ)Zd77l%{K$tbOyyz$zS z*+o|OlN9gAU0wEJ*2vun4YVleRLVdlGv&SN#L#HP85`=vU8=AbV-4fZd89U#w^YXV zJ*=C3JbQbVZWb%3K?r`S6s$(uYcc&gKIEAT9=jrx{oSD0&Az=X$kAJu%!1N__=$FT z*RWyP@M)>tsQGn<0(`vH!h!LQY3U~x;h*~ETU-*JPiN>{SN3_%`yAjuC+4E}HldFP zXu2n5Dik)PJg-Jy;6d0i64sJ%X|1)|l9sDQ9^4LKMugN&VMuYp!kG6394w_jo3-hN zPk!Fuqwu%jQ;oEs;YxSfRlJEi%PX-%RO&>p<0!N+^TF<>ZgHBq|0dE%*(;#J5o3XU zl5+Kl>jq1KPWnD#uFtl^w{|Yhaj%h(?|S=gI?j@&Bc5cn`@-yZ^C`<$Th(A*%zL*a zoO3Xzvi06h5KF}>=x;8j<@WQKFH5X~FJ$}O%OyrHDtwoRA6xT_eR z0yS^G-Gw@Pb@nMOYpUc@-4mfYzwm2eW4mv19C3{a8y8oGBfFECThbgd|D1%zZmm7k*b*)NAlCVUl#g;?YTDawlu$M-NwsRo*LL`ZO z#)iBeum>8e`xIJqbM|q^#MDk=tAIHf;J0y>8RQuhA<&RQX zkE_ORj&ELR0YXtk-vOY421%(g#g4Kz480en8>by8IccE}w4M>92}*k~sy`kQzjaMU zx7Q?}A0h98(DO-olCLp3OrHlYQlV%cnppR3HKNb5Z50%3uD={_vRY}*EF~5igYxD9E=qk* z%;fqgN*$Z(_l9Tg_!KHcHn>kl>$>LH&F@g5n=Yr*R351))K8nkzuQY6jZ+z})7mJG z4;Anw>D1fH_bg^Ly7=0n!WFBq#P&tu^!tYCu~|Mm)iQiljP~Y&ocGgOEJC@AhpH{u zqCsP&BF7+m9D8td*40!|wW6l}Wb19pBErUJ@Q^o+l1;XG6Jq^p;_#X3jGiW#(rYkP zJGXs>*c5a9b6c$mCMc76*JM`^PYSoQW)o*>E0+IpNI~(&au588lt)8RNj9rNH@2C;MGp0Cx>i9c6b{mTWQvM;^({|J}r+-AJaR7OS zdv`|0p=RC-n*JU^tS3V0{#!4Qciw0loxZrm9T(?u+f)=s8bQZ@IGC-Um)l!IW_H(bMY0||fZ1wbd06I=&*=9rPu zml=5+nZ|3~ib-kaZpyx1MiQf3j;l}R0ekN?HflQx6Rs2Y3}XVkfX{z<0fbavN1|6) ztwxe-S)=u5@(P$6CLYg~SIavoHm%pyP8SeLDyNDv9{b=8xmhmYbCzHC&&~7AxHWH; zC{wPwTN%;HR;oGdF$ua02NF~>;(tlsuK!*3lNs{AVT`}C>3C8#QdFAIxsBV(b;a*0 zxN8!)dKb`v95r7$nsQnf(^7GI(lmeV)Eg%knYCz|BZ_QFGY|>YBK@nP3JOF($^}WO z*PD7)LrYz6L>{&u+lIVwb>LC8m;9!d1Zzf9n7BspWjYJx1qOU$b($iC_Q$3BnH(VxB zrl_jAHdO2vUQq)n@CRF_Z?MePJG^i%@A))fzzXNKE0=LtD!($ zCwxV8DZ@8dIF$K%h@|t^U4Pw?cTag$oSSJ^BnTQyHJAdISKT2n1`iT$9I0oCT;% zdyQ;uRrYI`%5(iql@TOa{k=iQ+Lqfow7O(B0mi{a6-n!OW`?)%kOA@FSeQd$&X%~< zKE$x%wUgID)|KsUu@axEiPBv0!o;DzZOI+!i$C!TAjbUFfy74Jl1bJI<1(I%7Fvk& zpK})?BcVs)WPEXQfb<)C52rL8`U-mTHfve%QYIAF z3SB$>4B{xn^Hp{89}>>?WVUE^CDoE4Eq#hi(> z#7kfvx0o%`+2Y43*FM6zVg`8g`QB+jF8klP;ZkC_DWMQIg4KhW|8X-LB(~oyQayOl zq{KpYEo%ME6n=3w{WALuRv<-|Pu`GNdAW@_?EP|uJ`hIB>VBRc;I=1DcZ$&j)tfx= ztBY`wcg_LVZYCzU+baS z(aTTi*EZ5ZpOJ>HswfbQYR-xS{^a|J_hkR>75H9$lHwkm)OWAxuG_)82);|vImloe zC^3!O@8U2c->qvz4A3 z;uDUpA1(T{l8!;n4S_ZHWp*mamjIxlW`A~CcVes9^f(9AQRo#HTU+qA=vJn~oUY|y z7qI?R-r{_-PfPq_MwcT0@CUgh_CScwXyWHL)DU@evnUGJ%OqD^X}$b~th3rNw@U%T zGXof&Jis|zuBs68ntrhNc7WgH9#&u9Ua^>KSPC?9clkUd;3$;6{~;hj_8>`^s%!b= zmb_b|86J_MrmJYbupI0$X3X)izo1O)ipmg9qglDRScozN=?&IOxn4)yc?Iq_m&A*r zLlmOBvh=-{8T;h5e`7NE`q!ldHR2W$UUB&~^+=u^npw%?hUc_Ol-Nyi8I%to^) z6%TZ}fr_cwhIaN64)qaosZn|^+-Z4ncez2 zki@FsQp)}D{!*6LmxS-UVrwM?UF(B$PbN?*m4mk$R&O@6+HXEBEOT72z$YUmF(@%{rRl6ohx~st^lHG2Y&Txar zaLZ7!Y|dVaqhP#4msl!_YiG;aMAyQMfAPgYtjg+GJv%|l!rhGyj$jaEeMxb5Y-8|X zPCwsY^;<^N4-|r;c8&4gtyL%jM^1g`1|jrXIheNudJZ)&)QRI>0%MALifhKRYD|75*Ccp7_)) z7Ou1x`YBJj#{0x1Q~x}S_qL0C{ucD14At&koI8+%I3mOSm|D+Fj#!5;&>H6yF%y$*!VTos^=v}G?^JkFm*Im3`uPCWZc~=FR=!azSPY!OZ zrREGVT?5~5abLnsm>@ryz>Y#VXWR!gCmgm&;Sz|?^=#bOd78<}itL9nrEm;%;`Lt}fzK9ttYuCvaUkia&p?}M+ zX=>g74dQ3rCUPz|pw8!2k}fmSEB1Htt!c1L&F&;+MZ7E4DD_vf%{pE+^UFjnsP}cr zF}ae{6*TE@99+CTS>-jd?d~@N@}MtR&(7St<&eP@-!RoZII_&HL+zt~WuLA<@0RJ* zxK!o=X4PkUv<@ldr?aRmg;4w%q$YYc`%@MMUUC7w?^C>JtF;P9j;9K&Jl#(~d!Q}b zc5mE6Uk3BJCEoNF0Hx-)UaORs%Yx~pJg_8%Nu(q6{{aR7~2RKQJTF z^E*v8H9=FsjRf@U@ffp0S*c@#*E?`jjh{FZz|Yqmr0mWbYC$ zz1#p%eyw`Yp$w*g^}d8wD)dZNNX0dr8S4wzc6g8S1aZwDBaFHt{@H&1FQh==KprYY zofVn%wWl=za50+sb$+m~I>Cf$yb)@d)72);HleGO*}p)&a)6e&@8M({{BAV^{VXE- zRXkQ-drBJ>)8m}K*l(nO=zTtPL_pt`&1;1Pv>f+TLMl5IfB9xP#GHaaGzL(90!isg89g^W9v~nkf>ne}CEN)x?rQnr+N8uH~| zIl|>GRY*cye0qfx{gW1yQ5id@S+5^H86swnX}HYYebub+1!$-*5FH@44Y7-*o=kpt ztaB~EYxhnF>T_|9=EcmNAhY8q-<>G~O*C`QV^a3{cL$$nHdYqIw|M90g@MM2y# z8~GL4s5t?hfU~erEN^Vp0qB4z^l-Nx!zqL z8*MZM7!o5?K8I6~a0N%Wd?F_fgx*cX7(u&aI&W#(?Ap&889Cjhd=?XTGO~dx`h*vr z|Gn-%x3&1&IPv%M6nuDf$O#TDu1SG_`rI#yftGshhbMe^+DkD*AB}7;!Y@U2q$-E; z$QS838;bQNXfz(o47o1=?HI~VHEwCO`1QP|&fwtjFkLWuXGV{A%@xOq$C{1P*!t(~ zP3orAnG~m46vi~gSZfc+gd7boZz7?fJ&U5_xUW!&zU+%>Q7ax7+YLdCW!{e=d&f^9 zF>R>+@~H=YAJ6u0jfQ!|)L*JOI=_3D(WUXMy=KH6^el7fmV5x}iC6Oe8dDi~;*I(b zh}*-(S#|i$L&7Hv-xIwiU%xx$$8!VZ*m&j-TZoBSy?$m`zLR=Mg!^0^Xy3N`4Kw|8 z!^(U*nu>7t37+AB2^C9|nW>=AqI01WfYk}B#9a$;36LI$}!#pTCnW_O-= z$^YpjX%N`dOopa^!c17eodG31I=J)1s!`foG(y__G%xnIUZpN3`fw6zI?$ zc&Jlmzi1{0e|!KJyNrCGqS;{VpU~y*;tsVw8ymNiDLYVt%gbtq)A(+fCPCfb%G3>} z@a1;GreuF&M6@b|@iUxyAro>#UjC0GR_f>3U-;+^^aY<7TRDj>g5=w}jY^ zfNKhGZQVl&!=4}^j`-;zAvmVT2r^qaz;;m1Q`|-))p>?z=}@`Rw0>p_AZ}s`q zc5K~ewbUW7tX?>rZ>)m(cofx-PR>T`0T!cJEaX+<^s|E~a8F)!3UHjOqo2U=z>du5 zz(RViDi0X6TA~BFD_vD2B*@N5M{epQ;GxOLOM7gKe6GJo%?-7GF6D zRD57nny^KY{GbN2;=?aj$O~ z$`RGwuWWvV&?U!zTSVQ3?B|xnY*Wy*Jy#Cv!pW>I+3Uwg23JU8{(SGdiNvccHSPxutnj%+1K1^IYG*=Nqz)+W0(Iu^3}&bCz_gbXP5Y z&qB8PD^z*J;tm~M83W{y_soF_RgQL3b(XCcWLq0Lp&Ap9TA0cx43wK8#F~ZsAF&ol zcAkh(e>+Tpmc<~g^G02GkJsxh-4kc-9T}DL$BgKBaBaF}>j=zknbqsb8jEG$F*Rl= z+?eOGzll+l#4YsbjL@o8Z0rjm2Qig6mzgK3jiOu2UR_alf=gbO?a1`d={RY&qTPl_T)T6y(lu7SqMTj=j!kLE73M1xvIXAAZkVZVW?^zbed_P_ChDP!Ak`zDmCQ16k!BiGGjC&bG_tlx`n+@_C zn8$KEewwhQ4tb`yz*zr~?vifo{PVKi%5LnB5rQc2jKQ&8$KpGp1cxXz0sDuDa@rE~ zH~sr!Gqr?hLxh>#2{F~&?#-1@305_P2hRI{WR)x}sYtGpWQD+ZasI2rSdR~#hd?|i zH--g5y$3ihR}Zj@J>Cy?6^IB_+M!V6E0fXdz?btV<<4*Ib1&mROk4XgdhJvd zIcSxxV&Mqbvmp15m!AJMXlJ!tb@<9lefx##FRz$?1#x#-E(mB8ZuoyD)Aio0sLR{E zR+3MCqh4gzd9i^5`8|MGUJ^l5x-QpZaoY_p6X$-bkUmORufKar&|NWV`Cgb~;g&g< zJqec=e{jH zr`_UNNkXG?8t+5gwBY;k-q6cADz%o`ugOn;4+mmBO`icm{y9FLp(Kj((NZoS5CM|~XH#+fyi1M}70 zp{KHrSGeGYn;q*evbhx)Wr> zbX$%59c*`Q9rl7_9HA?v7zx7FEClG9Pl$?+Ry2Crzq*Tf2ubqHqbwiUWg^#wXutyE zk5mSkvI>WzGv1z&trY8=TF%Lrn3S#)**>V&&y&+QV0!FRTx`4((Ts4BYvY>*@H`sB`Rww?x@XLf(LoRrnD8cE+gW zqp)>-FI$8b0ix{po*;6jhV_7 zC(D+7QO(&(X$YB{1mI`OTuyxx77xB z(BtFe-DfM_h4FhAqf}hGeiH)9YCdV>)o}Di#V=pAUDPE7<`=zG-O(doh+q<4Q6Isl zx6qW9$hGLE%a>jF1$ES*-B5-&bCwOvuN4PfmLCpb_lB_acg<$JZi#npRw2pWvbT1o z)Tabjmk0?ab~)&1MHY70N=EU=Z!EFNz>xt9C$4Rn*xo2@x^Zj3mYJ6G4aWn8!f+&y z-;$U6N5R=rrf z8~XgDeMaJ|DzIPfP}eqKSRyBkf!Q@9J7jZb7Hw#0d!ch~85F^98Pd%39;bx-M=U+z zz3Lm%A*7Rmaje{zEI+@DXJ6SvPwOBjj!t}w0vRz52l3qJIvZnAx~Bm^9*?7-T!x%c zH3>B?;06cWT@)d_FO;&c{oDFsgIbnBvAJHUek|h25{$IL$kkZiPp*AI+*`b)>45l1 zRaZ_--M47f?V#@P#;g4YhaVTb_dmb~>@neiTmwgvxTD~~otI1OXpzw(0uS4HARa?4 zZ$=Gsq)yBE;+10ic!JPBM-_F6)<+9{znSxk?yVPfII z+!e>a?q>#3oiF z!!#9Knr^5qFj}4G`615Iw;g5=DO!6x6bC{ri*F7{a5b&I5+|I+NVt&Hx< zJy0(Oa*3FZ%Ie~1;=DWO?jT#Lce%LeAt>&&=_gwH4{U9tr-y>2w-+U2<>YK*2=XR+ z%#Y&<+{2#H#HVIF4c&qK{-1~H(Y@4urdJ5%?ir5>Ll;Zeg052^7|owh{#hE3az6D< zCh;p&j>CLFaCD5L+G>?L(a+6BDW#@&w7%-T1<U54g%=+^L}VPv?cBe-YNgl{y6 z_xgX3qmPM06!lgNRkgQ)dLt0b47kCXa&OmUV4hC2yG32%xWA)C*5bYQNJb5g%Z;-8f(v`D15f^^}|J7pD9> zI>q58jbmYWfyzS!k5|i4TLHn>`WT?m4<+r5wsQ$3{Kij&7?VqS7TuJ2M0nZLU`<%H zk>_BCRd&=Vt9Ibz>vFg=tL@g^?vor9(&7 zNfnM7WPhZKTRy1r_yjV2g9ba%t4?uclnlrC!tK2abb@{koghYq%?Oz+AB$ zeKt(rN#1aSE>3PzsQmTLYs+4*lQ8DufY>&KdgbXod^K#8U%OL_Hh`&jd0HLjKFB$v zH3WT+;XNW9bW3n=HH7sIF?|{Z+yZW-#7%3Xg%Y~LT3So_j+SP4UdhVYsb?r+3l|vg=agZ0~rfXL-&5t=B`^5ZJtP^gL-HwIV+UTnUzy zkdhpMkd3WNH5Rs|s_Ev|qxZay9(-9sg)dqy({@G4#OavxtX6T?lz#}CUhl;Yk7<=j z7Rph31z``Ne2+A&uVGagDEQQnzZ>&$JJC-) zL^so8Qv~mk#J0?Sj}kko`G-DFHfoSO{X8NtV?L=14DA->!lf4<26FAT7;dA0!4I5y zdN=QwF4D)X5U5fQo;BKBgii@{r&SY`EC8wvfUl7BcvJ~bYKGjnkFxEvc0*m-i$*!E zNh&P^v4tm1V>V2VMWW1p`1J{$Vk~`GyN68+OK2^a_MXh6ECN3kU7^!Co0d4HURS<1 z6I#Ak2TTPdZ{)t{!cHe=_l)dB1~J+#)h}l>4)?uV8;ZW^7nCFno;$~z zOb*sNNnIrN(m_s}tI(ukMsV@Sj76zIyF7r;>b?7?QvB0uPJzEGu>WMwJtJC$i@Km{ za~Kg!V2&W5e?Daj4X(-j_~8#;F0=bl2@S1ozhVxAwbC}&yY7Pd zgTk=M+(I2YXYQ?sE3f+oY#I^hK*;S>Y`p)^>_keDe&GJiZ~YR|(}87PCmu`IAwYot zPJ;z&XlVV&AtF0+fzcdAaG}e6wI$ZD6bA%!AEFa=v8fV`nP34m_81nmMrXZku@2Hd zMzxOeJ`hpceTPw!V%_qa?eBazHOd27{=~8+cJ0{mVnyy)habC16G1~oU_(2f{U(*; zz9inyGI4ym^@;^&){CJgAi=@ekL~bfOq1veLPMoPd|5{;@Na{R!Yr`?VEx`+c$`x0 z_N@QnHmF2${!o%b3ykn@%&^2os6^>bnU-oG(W5*{kX4WZA}WPyJ*4`> zNg(h;T)EnzRg!Tq2$yp*mKehjd9vYKaWv*iCH?BtazZc|Aop0XTVm#wa$HEX#htDn zwJ!Fbz1>4~q03dgP|B^NR61wl46Rxw-S>wl(8)RFwbDM@eAQ%`XL$M$ftthqmBJlX zPJ=>jT;ezFTj#8^?fQj3MXj=gxhK}vd~G@@`#uDFb7MNvg5|Vgwf$qKR`)($p zt|sQROU<;`Unu~Icpp1Pyt)3@$IkisMJ65PWQ$doZ#4EN5h)UzdSWX}t_-vq(29y6ssr`0|sF8hji~eCSe5;Eev~Z}lt{13ZI0<6aTF(9v0YHChvh+XzLMAXawnx$8@yJC=*)n+h`ER0qo~n9lf0 zzJ=YvsRO|MU=sxRyubX1oJH#Q=LLc8YKJ1DcGt{Un)yAHCWl^;L9%3mNgp~CbIn~E z0=qDzVzp86H)AQ-pEUgtdV7y=o>n$1y<@bi2ba#%%wQosZXzaqE=+XcaLS^mABoQ$`-{h;&RKcyid>T773Z}Asj(VNvi=G zmw;&yBd|M0FJBl+TmAJQUMSQPU#@N(Dt2Pxue-Cj={oVPeeH&Z$tpz+V9(qHRvFEt zj1C)~=mr}62OJcg_yPjN$Q6BoWaEeno4iAX@j5tYh+^vdZl6c0OyP0J@qGv<29$I0 z!)bUPiYS!jX9{blIn)h=!F@<(6_bgDCBakowI~OjgoYxpq96wcuqf^!$Gun z0ya}dNW4~L@d2CnzC%;D+_`$tzI$8orJw^;9{87ylg>c{Q0%poI~t`Z5b(x$Yqsg)P5+=^__I=!$;Pwun~r;Z z?@Lkd&*5^+M=A0k+)JpTdZT*9Q5BvwekYc5^9MhnOK4$wH2X;&BzToqrIm^I&&3^D zcVck)O|g@Unsvu(^cx=c&u6nTOI^TceNLjT2e)c;bER|Bb2f6mx#@qgc-)n~@nE<6 zGL!>^x2Z9ZRU*|*Eb{)=Zc<~gmPM=lA9(UNt`JxKZy@3u8;_JvMMu5+ps910xHDRX z7}uq|=1#?>a6yYPGzIJ27!uc^hrdO|y8_}cXNa+&S@P#L82ZW!4$pwLRm0pvi9^1H zDwFX`$nPEGXI9GlezDAVdHo&&ToCrrSo(RHxZdqI)!J`AhO{Akn#H#y4{25y(1mCn z9Xove2nCFfS2L?y==NqswDm4eg0(U81`i=@Tle;a3!C4px&xFdBFNI879YxDaf9J?67%m@=!34gH_Wfp zXOJKm{ni`xu(_3|I*HRttV$JrsIpSXIjA2g&vA4YFrJlW3P{Q$-_~>?WU}P3uFRR= z{YpA{LNW`ix0A*{95=LZ@8gfhkxO16`F1+x$y?gvd#SKC zDPvB{bFGm%SgfXUxHP}U!~eKc)u7f}$q^ga3RYDa{Z454sl>W3PBnJbpS#y$(xrbR z;&hEY@YC-n2)zC7C*UG5`$d~Nq}*9G<2qi}aai1Y6>|^^jZHk>s6()vtXH`3XC_|G zX%aWRb1OlduulIdAeCjCh${cw$X@|B$!d^blFUTs3#)E~H88p92g~HrNM3%VrOcFl zG?5&aPm}0QyZK4|&>;6{gAE&>r#>wjckYm3@Fn-?e?M4x2+#g^!8oWfyFd>|XUcUc6oSH4o*H0^mNx4_| zQEG*0*lBOt#7x+4D5zNf^vXTL5}|`;E7TNs+WM^_50APKg_jT8{j-Pql?`q+A|I2g z*qNk281Tyf-?bEf%w0BY!fHyg0CHp=9NRz#OPZR2v#Gg19`c)b^ z+RgvCZ3(@du;y$d!8u!luF&BTk6EQHl}80<4rpbpKULlS;Z=7gpMArO1sqcr zJaUI+F2?0rsFx?su*vf5eW{09JQxJdw zJ5q`&0)HPaMeME$9WVjtlF#4G$-gD6DJr%c`rJ*aI$HP~!diVDUupsq9_#lWE#f05Xc%%c@+7%G{12l3T`O@SXt#O#(j8-}I7^T)zk_3`J7VW+%5;Gc_%8ZZTn)_?fbO~7J zB{hU$C1I7?k*nN1JNnqKg3r0G4<$7gPOqnzX?|dlGo7=StC_2rQR~r>mdhokT(gG!W#JF-n*%(l1?`@@MgC?3(U>cCLG{s^x6EG*w0= zdAY5YGAMe<(H<)EKUDQjGlbA3MPvN&aI^64=pQ)6#3@=Io)^2&7h>$d@UthUG z%+7wHyksxN#{tpv`PY1sB~NmAxp7*)+jBw})0AYX9BcsZ?}aR+IAk^&7NM@F<16is zfHPhR=ihunhek;~z2&?&sb38v$^1=`_|@L5A+O?*qP6}9n-O&cr?7VYGed@gA!6-& z(oQuY%Ci91EE9T0^+LV#!qH~TP?6B&c}ruWxf2`?)?p|ZHaqyZifkjLWS||wQyM)^ zmXQ%Gk(oFMT%<3N+3OBfbgsr%%8H7999%dkKXS1hpZ&b-RP))MTkMRU(adBv-?>J; zcgP8(9~RCbD`}S6WL7F(Ck*i|?4lXUNIz&jCCK)kX9~~~n~-gZ4MSRkG>fAtm>H-k zzB3Oh|05T(k!=mWvQE7#{k*eX9c&zb$v7*nZusUz#%dkDDf|yS@|!*JscF7gd`W>E zhV9Kr3HcMb_d#Z6h9r&jK8>7C*I#Ev%Bdijur)$Y^CxVmESHYD8X^lar7}5wmZ1eB zKLkg|Yh=@iz3h*Dp&sfO+r6q#1;w8&^P8~OBr2^u#qP;Zg7O-FHf^0EApu>M%LIir z9VwV%!{dM~MqOU>iP`wB^%6qO9hhufs&_D4$Cs@L4f7tl&JGv1$xA18(CgF|}8+h45e(y1_RSE1VgD&qe9eZs}11jgv5gU+5By88WO z;+=O*$JQnly1F&~;~L7n4wR;pH_sj#m0!>F`U0z~EyAo2y8ShIGmYozGjRtMf+e|% zH%Ys?P2E-=+~$A18HPWBbl=p2l-T!bUOuL})yC|34$D!vH=J{Mw0YUU9|At!Oew!BEQM_q=m^*SX zWA*#X{M{K?uj-8C2Q7zZ9j$F!-NBdjGDl50-`9S0+b~T|!y;t%ZfP+3t#t=+%YQ#e z`mw-|- zOFc(km@co=sdrdH))mee(rDMMH66YgZhmW<{$aIc^GnT=>#>D|K-G*BvNvV4;Bg+8@L=XTTn za>v=|8F3Ny-j}EQX%ipYQCuaLWM2)h-j}Q)w!iP1fL;jxP#~xKug+A2brE<(&^l)( zW@+1nr8M3-PN^+DUO?Ysvkp^Kw(;Q@8Hv!4#?foHb%^)xH#@Uz7j=)8^L}u<9r||( z6~VxbWy`;YEjZhrtbdR)5-F)OgY8kS4w1Sixb)Nfa4G!7<-Zo|CxN#=9ln8=YlFaq zFR_kumk<>E5z1Y@VS@6dSJj2prB#oM)QR>BHGDA^G3?tDPZ`(wOD<~;)l_itE=jJv+7Kx*_@jLjgU%pRCQ1I z>QbYaTYf)?GagG5Uv6QnG|l|`#QDb?6I`p5zH>N5wPYt*o;Y*NFkKEI0cJtvg*AjFp;dnqs^3w(7e-VS z^FRC&ej%cWYK5+DVkyk!9w8RV@vt>P$x-$DX;epy(Q~l~y$QkDICHQd)JF(tu)jhO zA4{hf?A#fSU&dYmr_CvIzn9JGUVN`O8*E4B75YE8uZ99q@D4ZO(`zyZPV~OM{BZei zP)W!p`Ma}Zs^38QG0q)?tNl9a^DIaZ+I=Nw!qzRWM8`n zdhZYBo^j`O^sC&bZ5=g{D5S2D-w1R+3I{GE(6H{2ZQ*LcKfVBoAk?0jZR%_g!>Mwz zuj_o*O4609&+0uJMUDMm?Ru{$qZ<;=eZ7-*K`Z_%He6WJfa8$RhF5=7&Oh%n9rk5w`ODM?vif%|#h1{w{@aa~0`711Z_CAE;_JkvP-jX=bx@z zRSW%JS+?K>NM?%y?$O(Cr!T^tb0mO}YZR3G?Tn{B;c*0U5RDpE^Ke5-`PdV-blm)+ zJxc!cmx#X8GyKJbgMl7*{7z-$%Z-q0n0w}&M_9HQOur7JhjeYi&9_ah_eDl`xK?O; zW(gp|Le+cLs|O-cS@gtzi>#l^S*ll?j5dtQa%yEBql3q85c?-8a6n2{$G?92o!R<5 zk7W6f^d0xX#+|152jhaRMv*xa1H%BX@Tt;L@j!shCPA$lB6UIF<5tca_yqG@Q4DLNfpVz z9^ub?M9V}xQ{-mG6Y4JU3?-gemeBOJeG;N>3Y-M=s&PrHdP}006+a zrAlp&gz2A)tkDdFKFw`({SFbKyiiE~PkEu$C8~-wnLRaD)flf3mN^%zG1-Vri`j;< z2wja%Y`s_Xy;vtZf~<5E5+Q8H-SFsr;Q`zf73bjxGR|kQhq=h(3RK;%{i)0$e{(eb}%Q#1WsL+za9oi(1T~Q9Hkv ztnE_VJ=<{hdbpzUgK@Ne%h>;v!d>bR`9evs?=$5q{VCbP!92y7)rKr(KY6qHQ9j(4 z9=vo11OPr9h&?Lh4Q2(h?zHnZ9p)vUnMGJQCf)-CXl#!HFj1$44lNaGdl&xaq<_ZQ zyQjoPV}kqPfk3fY!5%2&lz0N3>xY&_PbZug_*7DwWC-_$U!ufUcp5ISHoR@k5)W4u z)Y45e^jCh|$vNTNW$*E9Oxo)65!Hoze?X(*t?R4?$MTEkE#|SQJI9}qtph2moB0Zz zoj(mex4p0OX7|a*H+^RjZAo6HT`}Sc1jc1P@NZb?ppl0+ePXLl;=!q}{P<>UH3oXn zpYiYV!it zi5maFJpO2xLgem!uk$~=!Q~D+&5GK{vAgy9G#q_yD5dXIdt0d|YW+^=Y1UN%#$+y6 zR?wLmW{Z6NMog%m9z!ooa0RSxTLxk zR(Y~0qkf#X>%gtpHtNnxZr*z7-&tQ^RnQYs-8) z6$Ml9NQskqOIXk&5U3iHV(I+{6 zI*e;M8X5jQ1h~mFgz(%AWlV?DWH-hn0}P8VDh@jxUJT?jlld0ftldG(x@EaXaJ8uL zd(=~IWq-Rxdv+sd=RRoYYClMJ@BZ7S%!iia;2SN*H%^pab^5(TentzUO+UyeMxHa{ z$o|<_UAP);NEFv;_{@1a z_F7aB-WzmOPtSOO!OUH(2`7sfI2MM(PtWTup1l`U#I;jTgXOQzFEx~njF4(cRRmiP zs55c>1xNd{=62q=06OjL3~ObaD9x0cM7muS_S9kL)cSaXXjmMVP!mXWHxrj!rFwOj z`~oQY?DejC&+K`Ei7|(*$*{T@5C5uc7-EtqeKjwOK zKwRiOsw97=^42a&w~{5}%{JKzjXx%L zg5C>>`%O;zw{EhDVv8R5mHQJE{KJaMHO}!0gmzJjy5Q8$E2gr@W;@s_e*G7%f9C$5 zot_}s)$1Cswe}sDhV%7WOY4hB>|%j zC8MRILwpp5rChfd<1h4`A{#T1HIZ8#4vn;+I5JTN>4i0CYcwit2giW(!!j`{^DX5~ zSCo#<5>qHxn0}|q{xJ>w+npr%_>m|PIw@<9kNHk@!w#bmou=_YpGmGNkeL&^DB~Y~>mdG7{d*AD)e*!j;@^y+ohAWq2Rg%l-gmWMOD` z$~ot`BL%3A-u&Be`|pqV!c`4s0xYzW{_0HWisnj=#+AnRSO4C5gdo_q96l^UDG)Mc z_TvIVZU)wQ=OXnK-%oD+d33l1wnQ#p#9{ zLS@*O3F;f~7|&2o1k7KjVoI}rui4bUaKXA3A*$M7+MsZZ7+Qfs^{ZWXa>~zgs8-p6 z0zu{B)7Vk<6QJ9gKiUIrj_xOrL!mlI?}Q!Zo_VK>Jro^o33&eg#IFfb6(71D*2Tr$ zK`39#oCu7aJ6Qa2`U2M^g7j&gGgLviDOnT(0!u9TO4*tf{0(q>3x7BeyC!!vtj$!H z6;6!ggy)9m6z9mRG14$vG`k@-5&9Em6OVCX=+Q9K)dmC8OvRdafjRa4(?WH4vIY|ZeopM`vJLYE)G z;fZzO2=V6`J@a4xIw~n(Z>#aC z^vKmsy5?dBt#^V9cy^Y7U{ikgPX47GdoyV}0@8Kq)O>BFJ1KHsDoQVP`+rBQ|JMx~ zBCgfHL8G4WeHxJQW{0W=+ONAWA!#~HmKQ+o&0*Vw4I)2DS$a)9(vKIP7^Y+k1mlR( zKDP9??Z6L2dnnR(#v+lP2@C3T{i}O9^F9C|2QJ6};WoKnB$F!ukjWNJ+*pOnVw-GT_6WCyA`LI zy#+-s3;oz(i)#iXH@(Z#4yrg8UzAmIvUAs&1ChQ()f?>l+jdVf9V{w1e$>z1+hyrI za4V9w39Op~r#-h@xV7WUvX-ZVBv>jHo>9%N?bqO0oou?x^?i$QK02MebIyC#-pJZ` zx3?x6Cz6Xt4X}e#eS_cJC$F(@u?;g>w8UYbWFqYE#;)S#wtSy;d22IbPfT^dlNNX8 z_cOlbY^72hVMgztA4iVttXUqxnDwdA5lCj+?6vuxdB#>snVuHgpS9wz z^ybU+Ks z@q+rKG~hrjrZ36}55nEwH6SclAM;)Q)dXpGi1?VMaC0X|M#EbN5yb`~EGtW?UZW}D zOsSq;+)H&3hcbeejDGN1-=*{m)Co-WtQ*RSb9BeE2F>Iv3lukHCN%w6!sJTls4S`P zfL{-NwK*(Ap36E0tTZQZ&~$pgy&KT+mH!LYRVTc>t_!dk05;HtALy;zNT3Ng&=}!AO0*?6x7<=rQHIK@2u0dQAiFkr~X(w|Jee@4x_? zbSk;Pw3PAox4Dznl;{?_x4al(`2gs}dU`0VIF-clU zFeA5Do4qW&Q7Mu}_V5PJh6#u5#B}%$%>d>T1K4OTzh=?+4OvqEoG#U&FSWXhArz~_ zg_?tXJXCF0iJS#I7d5 z6~n5HN+-cokzkM{7ObP*G33{FTQt1tcFuf8l}yV3hsW{CAL##Jr2jou|M64YZ3lZI zK*n*WVn$sd#T9lbTtQ&Q16f6!&9j!zbH6$07M|0`4?^<~iq?3vQ?fJOrR4@RW)7*} zp>pnxTy-D2DjzfBt`(69kXMzf+9ECyyD#rG1DcyQK28+1h}^g>X%Io!v#FSHOHkW+ zdGBD*k)E(<&=T+exh?dHM8QKW^wc0~g`iI6$Qheq4aX=YjPqR!)SJ-tkFv?TrB)I2 zI94Uf5=ndI%s0>~Y@<^`_kx9lRRACsr54QdEi$2N5rJOOagnpLOA=-}@cQsNl~P{s zc{-4wf$I+;PMA_dbZCQMWlS3#BWw*wm{fX$(D+h`mw`SicfUw z^j39);cz0k;gj`~Ho?}9*BjTFudTi)D2Vv}lD+ds6uS`W%VA}qp9+K3xBDpzcgU-{ z*rEcyC`G7TqI^hp!PtiWiBb6YJInptB>!bmUB$gp)9GQLzM&6fIZ1K)q-8lRVABUX zwEOM^tN7xV3q-_MZ!-%9hFDuA?XtM~f6i~h3vM!zIfK;&=J47jK^2Jy>XR+B&ft2^ z%T=xx8KWP0#na@<`|m*X$BlCPRm`z$k~`{Oxq9qBJ~xy3#5PhgT)yZj#ny=i+K<;} zzNy%K8C;t~)1vcIVI~M@9r$3<^7=k1&e~?#a00|5pCf78OCt0Z>?Bar8;tjo5tp6h z+RUU|`)X9RVUaLcONM&Lx(O;p1>JlIVNo4fwf4JV?ebvx$iyX+;h>WixHOEMoC~S; z*YRuFb>V1tsb{^6oSeRMU~f+k-^ho{X1u9s0!?UP1jcdFLR$h}1zhpmrs%YtB`?$H z@;-67j~HtCbDoj)n)^FA^qb`q+wIrmez1MrgfFC4ngS7dDTrI={3Gj`2mbOr+b}PI zn8PvR1!Mf6IK6+sb(@pCvp5_2#Elm2d=xg30p*rw zaqX>7Y@?OTi3p(9eFAk7;1v|T%+?{=s64Y5LB_1`1bsEF&i{mJ*#eRM&S~l^usTh2 zboJ5I+r9?O_~aIXrJ;Oq@a#BiE#Y_-UbOxoK4lP*l5ZyY`M=*!{kMPQw06lldafP(qwI_8h z*=c9H=V_>Y;7N?e<0Fl)O-MzEu5IjSk_2)b4^IpB>yeG|y(tG5Nvc4|m-kC{710*w zdUmReFF&a!;a)*`diU*;FPze`!xA1;29@{h0_S&aC(aD=G-upQ3fZLhnHI%do~)Nj zdYI5yD>e(JB#@8T9ea13g;pd2STnFm9Z*}%L410JH;i4l+rBvy{!mLcFI?`9YOhpN zZl}o5v3X`#Zva;<840(<&{Mw5Sl7zSdxs}EVTBHlem;jgA*6G``_fVy5V4W>ct-VN zyIkTEm-Ahppqhl>&EDE`v*;U%zdW~;LGa~U&@Ho881VR=BTj86Dwqk_Z z2hfRsJBgnc(d*LXfNdM@q9(!R54uE({atG(cx?6|X@*&@kmsr<$AP9r|lS}mgs7LD2PJry|jFJGvR5oYvd;IJ6_IHOug9SzO5V6ng=@)P7mhclCC#xIJnC3FiOC7OJcqBOK3V9ghxyO$K!%$O%DVHADp>g z!A9*B8C9J#25@#>26m?vP;XaK{+b5=aWj6sIe(r|$n4b|8b7!_@y$4hAHv5Fz!X6`i`NjxY`X^G+~IT}$rzUp@TD<~!L?fZiZ-#1%2OBY~xjVYS7=873gXYLqZM5d(fB7`a8 z{UmogRlJG}4;4qLZq^K8Z#KnSvmMV|TeKLQhuvL2aI@F6|MuqRGoib;MY?@|)r%FgZ9;T6Uz&Xa~vE`Td69m78DCC}C_wjo9GQ@;u(zx>9?2mi=h+2&nu{OqTgyne%>JY5QnE1vc3%*D5&K_)q2`KHcVVOb)E zI*wV}ETy|;8dX`-NPVyz_j@Bl@Eot5yBW!x?Pu#~!&at(toW<}qRV{etVe#A$epEg z5-wKaNtd`g6udt93TMZ>P@A$eA61r@mV?4kVP!gFKHf#v^EvkPdDCI(>@#n56_61r zX^9i=vzP|OqOtY4E$J!5y(iNz7Hh}qo}e4R&E;yg5oQdf2zRs&%w2d+!oX}UZR8e< zWDc}RKKpIOO!P7LTR~S@Ku0!3?A9LEJvO8-AsWFP+O6lbFZ3<-c^o-IFn5(}cio#t zMYhzIlHNmZ48^=xQ;;JRkP6?+x^smLF`aE40h3TIBuU1st4Mk0qze&nQcRk@13QQP z3HZNc?J@f^6f;IIX-rWq#v$IQj>d2b4se{&H)T+o=3hEomP*`qu<*~DoEo`W@8j&E z+ceivm26$ETY`A?BFeBvWp6|$S{tc|s+3GuFdes%<1QZ7QyX8iuY>I~z;|L9N-MDU ztj%7*b{n0min`AgtZhVYeF8q-y+zMIi_!RS=ODaE4*_@aL>;6Kmu5|?&s}zqNCUow zP1hhsHN8iY?lMZq#6HgWgfBUx|F#A}+p(L>$i!nOJoo_dlN@pm_FRBN(E4U*r*L#X z^jbg6ytDw~`(!^`M;!_t=;4vqi=9X6sGBMn%I!RqdZ~pHS(TcF?AniFDZpIw%Of{Z z`B(#p78 zLIO^Ui6D!GX@#W{<9H`DiCE zo*5*4JR9z%bvw5hWhc?Fx+=MRN0}$(Y;41wR>$uqBuVBfx@fCT$B1IJ)=WZdC{Cv^ z7Bv^(AGW6sAzfVD^o5{VAUXFU5N%#hiUvCvwm!C*qT|ny2x^0MSp1B_jObUfF9ksQ z$Xa2vSnl;9HC=L{0+zhpW-}<|D&Vn%hh$8Is>rS%D9lnk5&b&W&h;! z-=h5$`9tpF8?f_h?Ey6H3jHdNTiKh@yH2oYi3c7V5p8=p%#n!Zq$s!6vYTDBHXy*l zsCqpDw;PkJxY)FvZ{r#o;g;}!7W+V)fZ`By!9r*RvdvuiX^aFh5}HD~@d8o{gqN3c zro$vO5~=C3&O$WQ-#lDNH>Bx`t8V_tS^kN41m-8Zz>;r19=2NnSm?rTTiePri5}O} zW4rQKtMp4ZkTpv0g92oe6m^<kHSP*nlfriQl%xKd%Zg9+cuU(%hMF>cX-ia z>uivSoGeOhH6{NID9?~N|SK;`d-66L}0_Rjiw}Dk@;oPt%ukDiTkv~XU-D` z_ie8&d@5dc1i$B=?vflqjfPwJKq~u&Q4OEBWMNSkKExWHujL&oZ9(Si5O8`TZcNtB z96gxc?H}}rnc5Pz9u=Y6qE`$=4d_|irb?D!ITyZ5?};enCf(PDV8BK8p?iCfG-|Nn zI1QsJ5G18hfRXT-<@wKBH@uydMY-4nqg2!GylogBu)kKC&A+XiZ!Y=PRvos1?CVkT z|4xRg($!+E?d&H6Z9HjFAhlw5Ptmlm<8OuKkU|~Rq+cFg;IkSIG38ylFZ}Qy;D|@n z_%?Ii8l#6?(-`LlN?I94$t#$`9L(QLzAR8NVy5 z3YN!i10^SBE4-f%i4|gMjksnvc3Pl zkDE%Ibj{M)(#72N+WSQ+-zXhN_DwFN8V=oJxfYgVFKRxe(8y_eRORA8!;+k_t+6IH zqA=e?I=&eFH{&Rl+QLqcEk(?8+F8EUNm90Ev%$ z5p|oFZjoh65*PQ2fB{pq+-LxJc0aBR4H-MKu*j%mj+$Z3qi1YWoTdbuaT!4h$YJga zPC@lzQ?QPAYzEmf*Be#~Q}h!4)^Om2BH#Tb?b#$CS=hM0ub28@Kq&S4#{Gq8(<9DG z<*QZtS5}`YqDT_a7NfRb_9TJ5u)41Mx!$fOMTp00feQSsy&Kkb*O&QxsB( z-ej~YcH5Npselb%Y>FJ80W3`Tdnv+qY_g7e$~U=P6b&6NL+Y}g4ao9Ly@tRrI_Y7R zdT;c##od5#;loCp&W^lZ=8Zuh5@>(9EV8DTTBt43KAj$bQY3X7r-4;ly`7%B0Ul9DadO@QSuqJ7nGb$z>J2%+g^Emch-e34oEc%QeI+x!Um&KqG}p$=K3GLTwC76HYX`5 zA5_vLr;!e+`h5rzjINXeeAa`>bYBr6Yu#h2?;KO@3fbsU#J-&MV-NY&`G&ht_j0BV zQ>3d(5m__R9Y`4E1wDC&K#(cM)nPNPW>JkpWt&Y*%7ad4)9~}zQnh53%DWx!jaGfs zoRmL+_2lN4&&&zT9!feCko)t`>e6m~3+i*!R33+5N?;-_pH#VEm`|+)f)D!H#DaYP zfd%}J+YdN{cS$47_i{fGKABYQIJA-(gAn3_JgK#&jEK4Xn#-u`T+(CcqhW8Kp1qt{ z`X}tfW^awUtw&ymWK&X$c>3n^i1>UXG}^gknPs`$X?8x7)^X7uNTaUSUTba8o_g0_Mdw&g z_RU~NL)pu)@~}0jM^v8`{1n=DFg`J^ zt48MYi=Af(sy1LD0vx-aJl>!aqtWscQW6xM>p2?>YaN|M-@A1t)Wl2WTZ)alzZez*ap#nUY`&WpBn)Pf2 zA}D|WavWA!6Nv4t%qI@~ilo$u9!oAh-XB=E zH0>_A`|fJ1-^Eq1X?NTquY+rmeK%vmQxxDQt~mM&Hha6ET z=S^qpT+;a=;RUI0VaYeu#JPD3I9sQwSJPXQxQs#rT+Kh(H^brPC~H0zm#pU4>jTQl ziIiskw5y!A+lJcEyOwr!9Z4RuoP2C5R6?!DD#f}AZ^wy;FuR(z`FW$co`y1Mz;u)i zgjr;&2Lj@Rvqg1^(Izq=jRG|KHgV0d9nXh5d^8FQ?!@C!boCQY$QF{$jOl@F4 zj-i;FY8=nsZw=~{hVFGjB&oo9yyo|dO2hZw=;)gkYejq(d37m3ZB{|l=+Ie$Z{44L z$Yt8@g{xPQ^{qlv5Pjixm~?&mu^Ccn?rbj+chU*VUYxDb*$5SoV9ISx!mJI|W<8GX zG*u#eG(R6d*8^)tqILYQfW0sSh|jBL-TH*HAvCa(%TZB0l2=xPO}V()b=R{YIn80- zrb@o*Z$R>To-#%A{#0{z{)ak8_DE$qSX9YeH{=#a^1a)KZ94wfhjuMJw2RpLzE0?H zhYINTyCdu^pE*K-B1{iEptFmB&bEhT$RPQKUTh%wD`lM)5N~K|ccklt{BqE{sYV>v zm(^ERzvq(sUC7a;XH%iPdvGd zzZlt;q!cB5tyPVoz<`1->v2ek>WXhYd*Wg-B!@bqY`&l7q3gRcF2;#VV6P7i8l;hQ zxouwh><(Xd_74&b9k3cj41mSHW|u`hrmne-zk;UEDi@r^WM$=6LgJ*jmnZUzL{$ge zfYS2nKqtJgjNrkC^j37XjuiE3MVF4k9A=gDOIPMEwbo(r&B&$SmiWZ9newCGSPmkioBh!01lu>1pe+W zpzA>biRAot-`vsNj((GnoJH}{tU-0AR(~myH;}@?kW}%P+&RJVWu#NrqKvH?pUO zQR(Efk)}$LP3+U*;R6cG_ORvMoD>}*U%{dHHoZcB-YT?o(QupSpaKBTZ~7ak0See_ z%#lQtvX+_WjVf>5BJdL%sR@haLnF#nhXoB$Sv}DFtd9EA`-XBbd->l!h;Dqn2>F6Y zH${KYiPqMm#0? zAj;1hW6Z*>Ymp~8KR3U}EoBH>Ra*4)FS2Y#rdIGhl;bAHsuXdD1GPW0wxlT8vcg}1 z6QKq>1hzgxAco}NE2VlLI2j*uWBqRoW%?9^^Nx9NElyIalz9iTbx2;9NiCg%-Kh!l znocK}6HyM`BQu-446cmAO!$cp4Vcyg8yuP!1`UFo(2wZZC8D9UtG)tZNon0e&BzoW z}owi~lZC>VwB9YS&_753Z46z5Tnw}Et2x=7^$FJy1)VBA!K zZODZz__zqC;7qf;qXDzb=CAhh6I`j4ESBE|ilA8WSgB4vJSm9 z1Cg&r=xm6qGQCVU)3s>*)bFdnUB^TV$eL%{-YMKx8uoT7;xq;7 z<-SJQk0W*$SCKm9ghsqyDp(|*SvKn-*C#JlZ-}Teg{QaMWo1vn5DynMr%Kac@`s)) zkVyb)Z)d-$V=$~Z`{&*O_@&2Y^J2k06H#HRdNw+ke zUVDM#dn9h~wQg{{^u}c(tWZm6%Ph)HM*`I}-v;;`9ufSbA|c=-x`n?2g=oI>2Vyz2 z>rX{ag*8b=8O4XWBm8`I*L#=;^VL>7>z7#$HA-#&wb0`cv3u9Apkot%<&ZvW^a~2J z+v?0ehgRY4&DD#HQnVLUUFG$A+g7y5rvrnub>yePI&``rW5Ubcl7n9U!||7D3%aj! zOGZh0&b>IUhyPV*x^8>-?gSWZBQDOfBp8j}P`c=m4wf|+Me@;yk zD?Og)wW|30viS=?!dgQFekHFIN?TmfK_yW(*8S4}>iY;Q+*j{pV_JKD=US`G*PX4b zfh#Vv=`R=M7S;c!p>pzofQ;R14kY-UOd6Sh2wwV5)xTqpbSi!t0>i-l#G1=c4P_nL z7NjcNTjw$6Sy+wrEcSi*e0GdZN_eqDZybEO^2Fl^ltmOLye${L4{yv9Fu7&?R!f!E zhB4xAA)v(CJ6wyJimO`2GN>y**u*N_D7tyM(SFcNuI1vC9#|iKnqJl_fT$2(Y**TM zRB2EMvoIzY!oSkA359KW1YQ3-w5{B57zmEHc~O{gWP9WC6J>tdQJ2{j_7;7N`D5Qb zKv!@;nqURsiFJ&D&*(dzRFgl}j3rs$$9?P%%qZRuNJD#LvWlINrQtS|u~%9`XCfO{ zh`pZJ6x05Jvj0r+@hr7f;@2IapQl^K33%4Pk#09JaGqL{x1Nr0L=#s!?E?a((>tK7 z?*Yyb`khfq|Ia*s^H12ltHp2BE||a`B1uQvR@h}OS)Zog=9vC*23Lu{!^G`9H16A9y->C18dqtet~HONI7U?LO|D0dg!? z@0hDiDpUWb5#p3I|2=FFHxE)%qbTjRkE_Hf)rwY-e%kS^CcPvGhoz_16!y?qqcBfL z!u;NX=xH8DAM@Bn2tmPJN{lj zTb09|BjMh$sQS8H*m2p0yE?#uG#TA*Sz&*>H58A_0iMLOfO*TQUDi9rHk@%k-oOv4 zJeK%_{9>_FbNsoHeu}9$5`RSerQ!Q9;kN(6<-kYgORq9+huGyPoYJ?Gqx^zrc1^XD zgQMkQ$@xO=CEAxew9jx!lQy0u24HxA=%vK{F%D_}3gD%r@uG|c{6J{6kC&p#6CT`^ z0Uh6502uF1OGdN(9SHy6;2t@R4JBn^?w7WSyi+rU+ysXCby~vtr^C2j%KOf!m3wwN zxXbw%NM)7WTQ1~@d*dV%6z>Lb+Sj^h))X~dL`n@*-8B|JTLt&9->7at&8(XmfxVZH zI3tm*sqv{t8~zt>eWz`+6!jpuAxO3WE~lqtAPid%s*=eusRnIlXA)EXhTKWI_NOrOaQaP5w8((F&7HO zt1P@PC8MBVFfMzomlHRLTk1j ziI0-j9^QA@eLYQRy%$3n3^F1@G57N0IkLoRYEQW>7vvYJ4eTFjEOdjwdoek6VyuJQn_VmF!+p3K*Q)Z7h-1!jqYD>&r?mF& zd0@Z@9n3RAUhQb+Q@EyK0wg^*w_fzKfpCq}@rnZEt^-K#pOB~iJ?=lr-;+2!$(ZlU z{Qtq+G*!|jBN8sS?Eg-vv>9Mm`CCVI|3|WaA1qo6@K{;(mAC%MZvQFf_tF4j3wLy* z!cm@651tY@F#E?WN&xuQ_Qk}fJl9`H=@jk!4}c*H8^Ond!CG?tb>^&Ni&XGOhZx4b`8bK3Ods?A^U@&7Y%Te$^9#4TqQx@wiH3>=+e zlMK;ARobYBXMQgEk0Zn>JR6Oh3lx&%5y^-~2-Cz1T)qUHBd!oQ@KHXU6`ev(LO&jH ztyZ9B=QJZ}qT^AFo5L2)Xdk~F`}Pm(xrINMPm<2pP2Jq*=@Q#ycNH=#LoBVI;EeXK zfEnV73(%L0xYz$mSl7uDhD2qZIv#%A#hp9J5Wwre^sf}Bt>0e&6qjuNY9J(BXppk_ z2OK&*oRt>im(1J%F;Ul1b|5>OajASk^BQ&!ZM}36^36t;0%oG)mx%@G(9Ft zc~@V-(pE`!lmpcHkl}&nN{ze9@SvJiA8bw_D1d_1La&cV<-oY$Ef0&PqtDQYQ3vS!D}{i z&j%td$in>WgnjmxP25nkJjJ$dH!kblwTI+l4U!S!^o&!Oxy_GNOuHMFD4{t&?bRZO z<~6>?ZCbWq7QP6Qf$@dFo}olE;zoCM=?I-$i+80T0*!WsS0y)9p;aOd)E3-s5wrC` z^ROaGlMrD^vKA#}&3LXj-@rjI8S)mUv=_H05cTJc8Q!nk@|&;58Lsm8TRSUm_}~Yak9GF5K}X4 ziOnpDERqVxxv52UO!UYgS35o|wtNfIZ}gYi?n`)IRtkmJ^~}#ua!xTUxHD+=39&S0 zB}FMHhlTJ;O>B-jpmIv*Sy?n(A_8KVoiSzAfw9gom>9jD{#a92#B=oC=TXbO>U!Uu z!I?I?hRgd1cn@5iD#h&I>@9R`;dDf&sNFFrD+sCi23YCuWJD8%t((LcAe-VUbP3es3 zyjZ)`pql5`Wu}L!>?Q}eXl4xqbehn)P%-8wTFHS=M47oC*RxqJ+rUjPxmpow<{1z} zn!)Gw%&hOFwVOI<$xI3^k2j#|(K{MTxXQOX zlKn|-oR1nRN-uy)*)#%|=6O=SqU8NBKYQJ;I%oApDkli$9FH-;b}$?1qh@mSl~{^J zo%>8S!Z>^A^^%;r)2&+)<`(LCZYDO|_Hpd?HjC=mzlH+74NDw{-M|FqT%%*MdbJQY z5#KG$>w^gKzp<)ym=ecOaXPEKjK>u{%VBW#C+NB;eAS`-5mS+fNOqvx=dih^ow+C1 zxtT1~WLMI)x)b^YOZhd{7xRhzSyO6v!<*!l*_m#zNY?qyr#EXF#l;3YS4XCzo^Ezf zZZK#$$sbA|+XO@Ru3#Dc%(vp=CFA8|)XxoSiaNI62BMc+rMH5FCx6s+IolzEIINK(=a{igCnEW@NIQE9+jc z*@C3h&2&#E{0`ce+_V>0*C*-LN4;!E>MU&&h6KgBmUXUZQ;YC*SurJqMQJgLumL-Z z@Hv0J41a6OzKXX^p1g-FrA5=OMjj61WKBVnWX+lqZh`n9*}!bTda!jv^L|atF0|G1 zpKV_pJ~CWd9*-9?NEr6Cdhc*3cI@tE58~=0pNk3kU$>d66!A=`ts-wr(BDNNc0%T}p|j>ud@F|oT%L(R0kJ8a0Kj-_i|=kl>( zqouBOkcKIQLT2ruZkId&td^o^i6)s*P!Y*mJm` zF@AI0UVp>AL;RF@e(+B~SP4{eJ`B4fd-XR^F^|8i+US zlq9G+n15y^QjFxxuA3Q&Y;6w2KhtV(+jssqbicBMIibdBLEehbFw+u<)sQQfaD zS#&5EL4+o1wh9t#wmpwmAr+3TvkFa<`V9BYcbQ0aDp^^IQtG~% zUt`G=uSE1=l0^6S_cWOEvIMO!oC>sgUgMv`AR|J2SQeB|d2yTohV*<^R#>U{0VecY zj3p^`6WI_J41AoBa~M3{z}BSAR8#k>2#bJnp^J|#T=uR!4?DNDYiylU>l^Lh52?W# z^vKrhOvFOP*tOrj#cV%de>A@X_7f`k2+0wjfTn8wYuP6C5xJ4zToz_fyNsaUvwts+ zEjOlmltMdBPK!(&{cPL8;8XT>*cKDlA7=riK-c_9UWB*xJaXfyR!*B?(DQKC;HB&H3X8T+K2I0IZ{8s)Tod?RA#rYFhQUlCH+B+E7Ovq zJ@u8nkI1P{MP7;zj${3;k{Z^Z*apXd+BD?CCrZxk8#So?AOb!@+HALJBgAq+JAHn4 zjs#gyF)Hnbzy@~BDwV26&ywUtcIrHzO34d}s<{XtgrMV`9hFWplMnqI=Tb7a2zo^l ztWK>}a=u)WRe7Rpi<8o}JfI@9b8i*HzPk%&a#%|E03#!=iJ-X%-t7t1wbV-4j_5Wj zO}ZInTeq-GSC%<(HvP-e=g~!t3Z=rVi4TWRV1kcm6(c6W4Nnhg;mN}5mgy2#ErM}f z7mx07W;G&_Y~vm2H$f$uYE^Oq|6um=Kxm|EXDfeHpZsHs;L?(QZ-kTr=dmD)^A~J{|wI zQ|qVD1&RXq8oYn1)|7bT`=zDUEicm=d?|gwAo#B=pDt*|bqhn@v=61kYI9sM$J0kh zybcNM9Tem{4na{UHw2S@Z{?deO7e}u?9n!AeK#l)hAYJ>2r1?0C`pp8s-EXE(Q13c zQ@YbDJZ+9n5AG7ac5_Ko{4&w=Bn#ns;Ixj~iULJmNu|oTFI754^}fY(PfjI83FnV% z){UUsvj1K0F6QYP`)}~c(FwTU0Kb3htv+hF*KMk(PgBqp*-k!MW`=FGvHFajl+c%v ze_;(?rs=H${OJf3n+R4S9^AmiEncJ}e_&Ojc0L#;%5^i<&qhZW{+S2r?3pkL zd2d>oG}GhhHJ%cBORPpnyZM9?{B9(-0gd@q`{L3m>v5tC^6@MsjnY$jBZzxfNlkrr zxoHvn`a@N@XIXwbe1*v_RXJHPB=_kQV$ z5sUfAnG1c_gzlL8+Y&0sT!8bOa-{Odj`ULX=>7k`9OISqyO$9{EiLQIqmOtat5ULJ zLm!9)G?=Q~uh5B8??`bR4}?{ggFh7MlFy z1E@73E|m@Mve^T5Z$`ptueCYKtq>}Yc9RVvg`Co&1UGvF&4rLu_vdYH6O4jzPhr3j zzkXM13BF&;G7ypf;<;+xi+S1wmUEgemrpNni5l33zjOLJ&K1{-BLnC)BDU-qOZS@` zwnImW@n6bH-L;oq5?;wJbDgf~U-$&OKM*PUUgD9Eh5JoHS(r$*DYCYvIDy==P_Apm#^ttH z$#+@L1PI6jt_Z$rKiq+y#dQ*4gpV z)O9W&4+Yo#czpc2H%MQMp3bxXIz&jgjki_pbjyxCa3kW|)45uP!r4^@0R&eSl_VAH z`Wm#1+FGLV)NqNZ5b35 z3_JvN5>)t3^&6d^_FLm70A3;VT<;eO1X4RI>1ZYIp&-~KcKnQK_!zqi6?>|4(>lQu zWg0ooq@o00t^CzbumfT#QoDG2@%oK+Ai`mPJ>SJrN-`{gsh;mHumg$kO75)>{T<>}ls1UVXqopG#;o!r7oincag zi8wim;*(<^Is-{{*D#n%d}%}>s+XhPyn~m^<)sNEP_?tYXH=$D$@Oc|vXl^t1*41o z1zeReo~fNpKrup_zuvM_fD-ZmpXm8es5%@z`G~xbN*LU}dX70IW~{R68(ZEtmgp4W zjYgigwto5GK^RbrnaS!}Vb9-^8MnLA%Vxiq0%YaC*sh(B#PJ$g-vFt7h{teO;>i`q z5bi#ooj0#_9lPl_+d8NqG-U12bkfoW$ZpwwP)}yrG_ycrbV01PTBDg;bS>57&M-OU*0!|#M~}fr8)RIGfv?c@-i<)w zpgskh9<2iR%$oH9+^J-jm60kr?3`AQ1b0gLx4lsZtY2l|D=|SMhJFr?`RvMUiMnr_ z?$1kxbmwP6;l%OK>s>ZX37^Y+v90f{VKQgN_IKfQ@6z4A$66;8_B*wT{Ku6ectj#j zBhlycU%ZcZD<6M^el2U^{ zq2jbCt*5l5Wsb0Cl)39RspN)^&9zNM`}Zl^)UE>gv*@mNQ;Y18-Tg+Td>K7BbLlRn zcOzohw`^9m#z@M2E&X}+KxCn?VD8+j<4VN&wk}`876=eS!9sQ1m7mJ%eONd$hhEm| zk}QPwY>3vqIo&*xF^a!ETx7eio6LmcaCxMGK9c?`M5Qzr>B|`kbHiw}15U(a#_~iy zJFlgPrTiz-wTlvNUoWZJT6v4*0h}1Q;JWA4wUBh;IeE67nmBExqW7M2@r^Ygsp1t6 z?7(m^V0>=Ui|%CoV%yJgb&<M_BM^cFUh($fow1%_Pd>A!j{cVW_E8#5m{$SHBea zWxq@qfK*87=F%YLGoPh{tEcH1Cmp?o#Us=WEnIIB#TOsVz2huVi?-R{TVJWx=ZAlO z)w#Rr0(ra7kYLhDj3{ck>Ky%>T-i)N__1NnEoKw4b2|ReQ&@1MRij((n z()@rZ>H6=TFLEPnCpk>>;bTb}*9>&XkeoubYM!)zP>_>VCWY~MC*s{zR?~{hdd}mM zKz`E{Ipi;i(DB?Hk3^d*j(NNhcX6ndDES-}C8nI>Z(^`Dcc>Yyu7)j@bn(O%+$0c- zYGrmDaVi9+d z^mcNE&$5`qSJMtquc~Kd-Rfe(wps|LD2D86MZEKJ(6a6mR3%4yBW-;$`QCmN z(WpsQ(xX(s9=^3#=}%7)R~7HZDXHGyxfa1f>3C9JK%gSc2W$rIeCrB7X*)0GfdE}` znp%NtQxiSc{xVhQrBIlwQSErrw=wc|L=$(|SZSnA(r<-MiZsv92^j z44z_$NZdJcI4-HoTDJurY#h|hpv;DV9{wc?$IDbNg<%wPC8gb)UA ztu}5avIW=0N_L>6%FtB~13|DB<|I^SlM=-DSk>CxrpS9h$kzAB&X=58a_s4k<&SiB zc1eKhG?d=WRcg?GsgFU5aPfI{j1ck5TqX%)soLmEE^2RjLPss1m%p6>Qr$Up0mfTe z=r~r`2=Tw5vfdyhrqO}{l=V5*XGvxTztfMsq|wO$yO*VDIyOrrlzqF_?9oV-ls{#> z8eRIy00`I?x44vtfw~9`-0a3J-Q>xA1zHkQ2lT8ym|RH z*4uOy8`m9Vl`} zdbA|EaI2N|r={6CPJA%2L@Z#&Cac{qROIwr+(K>DxTm=d;;S!)g4j>u(Ms|c&OH8U z30|BxTX{?aAbQ8WPS+doMRD9&6298q&u7N%iawyNuH+1g0OoMqD#$9S!P-1}uXba# zG(1J4eqCD+z5q$B7TQ)zd5MGctE$uiD`42ryvsQjoI=_PS1)yFd?bdxMP=f>*K%uL zq}f@V0P3VY_}!ZX+yciBQGRX1D#i#^k`Fl?HYV$k**4=u(pXcPVGHCIE=&Tafxx!wlZ+&V% z1t-Bf{;le*UabbRWE^Y(otHp6?2eW_v_|zto#{Kya<2|!(Ni79zAb4EO*JaPB~_y& zO&+F;Ds*nQ*iNVAH1z-!`sl6%oScQfQd;kRw!nq7j7v`k)sv8{N%S~Uzzfh90u7!m z=XsugL-hqUC?%A7J?SJ8KAxqpN;KXP0O2t2_l90mh=*&X9l%}I>pxbX0~RPYpStZ# zQti0*X@4P~r-XB?9v?3htMxh-*e2OjfCm3X$@zG2rg~D%h&V8RCp1YP^9J>a+GV+lz zfc4@MXV8Y=Cw@g77${l;nSeTSzt-ai5w38(6R1Bhy1=w1rII;i$tM|*i9a>C9kD4^ zKfEX8quI$e7Ee8NG}T*Qakckf0S6#jC?9t^LXuFOq}Pq;kAVVMDLodKwLyq@sSg!$ zmX>;d$!HotKR25qPT1#<5Al)F>O=%l>JLO;_?>Ol`8pD(-`U^2(VH>X2B{ z|3T_6coa%k>y5eO%->83?6621i{q;~$s4rjIZb;^on6FvwTj^E*uxWm1p04wp#~s& zlw&Te)*4V+IQ+HsPdC%MhDV_Fiw=5g{1#|(%~=4Nf9;)Z zTP9m8@SRN%yae>p|4MXT1HVAYgr;D*^ig=xaQ0#S&TirNu1_48hLr-=VQbIb?N3sU z#l&+9kKwuRoRcBjpVI$0)~ z^NtNrsEXxDw%~aBza5T{BkjbrVe!%?b;@gXf2jdDGy~7j=mSb=wrc$kJoE{Izpv+G z3pk^ zUBOK&PIKrluJFtscdviYUw`_4LW>zE5PjK5gg^Liu=s;c+-DM;04tI+!R7wxpE3C) zF9IOPZI)GO5B^VGmWm?B7JNOf*7zh=dF1d#7XuqBo;TILf4xgPMO*O?@SsR|JlL#Y z?L(Zw67h*L=VFYA%KdGXtfZF11q_!SBEJY%Sp1ye0QMWbg7~GQEV!76zL(*;Wof^7rf3=8uvTAS=1{bL1I^0=ud}g_LsP2jU$bV$O6-3O9)XL zM~dfLrpT2l_IekCt0=iii-}b(6ca0-=OT_EeqL3gx;9E3x+PvuXHPashH+nN*pRyG zI2*}rvdU(Q62D9DUIqL!ND^)&%{1!uF}+di~H1gFSrm=5%o8W0L-l&2ZL zdJ&JU3BRYOm6(qvRzlh0d5Py1qtuR+++~#4UB|YX-tzUF-rDuxOz&?07Vm}}B#TQA zVXEbkm*T&yNs3>p6k825P7*2V z@g{U_u>y>Ufep$oX-54#PPcPzPr@0~Afb;+pI#?`GH#%uWPYIR0N)>D99#?bZ-dHu z{5B<_*PP;soP+|D;o44{x!U4SH5RE8>&?zJK>b{!DRXfjdNl31?%g!CDs{1D0*>~Hs80;70KFSLxfO>=1n)eHFv2l|dXp|XrS z-z)|$b7RMUY4c`8T3CTRWUo8Em;0GBnm=#`E@nY2lz2!Hjk&Nat4nvCK5 zUy4@0#s2)950&0Hih^0s6i>xl@!*ExX0FMGl4FC^(ZEHo{R>v(m&Lf?nL9<(7eI^t zS6RclN<_0`^3)QXJ(gYwq&Vm#*k%)5vbsi5-pm(4K@Ye%nDx$}P34qgi?! zs13A};;xA4h;L2jB@Mj=dfFW5cDlUGzW=_YpUjQ*amJkbqm8?5TcvakMHK_LZ4Ipw z-P{zrM8sTD5dp+tV-TjAVIGKf!EprFsjQ?9yG<67?emgS zqJ$h)pPDs=DM(T0)bDoGH6`j?a{=CI;XvvFs0%VnxUW2zoEj%HeqBnkUP=p+ZIW{$$7MG`S-(hBnvy*&0{Kgw@05y~h!+d=;6Mm1 zex+jV*Xx8O17U;s1T&d_kFNG+rRE-7HFRgM~l4f;$-(hLI-f_g2lvH)@0Su`x13P0xR|5F$OOr0 zdiw!bwxtMxS>kS-0#8%iHA0lPRTYzXGolPf{M*=?x`g~JKN4MMH_6`Ftqz_XJsTUL zYl37+FBFyq8VgZ&b-%JY6p?0*cyH4CCa<5L#HQDyev8iLpa=#{@(DH0Ut-Qx;Q!50 z>HM$8hD`;W7LMJu>jmjGHHA+j&Rc+n|5dwgMEym;l$5ii@xI2rvl?gby}EkmX;yNR zA_9??xSW|&lap(??lQfcxOn8p4O*1QEl`#KNeoD*io|17s#1rL6@_1d)~5 z;SCAmkb@WQts_&nZ%r4dO;@n$L$`}ZH=nRSqbqNiWQ^kSS^}4kO%AtQNZgxq10lZ* zlyx-Pw$86qX4=dhGSSnn_O?Q*#!|Ig&c4`qOy|EBVv}s?E3i6^>NB&oHtV}Sonm{` zoonW#l48jjyV8SzQB5t{C-*cgSLa%;HX!;x54Tkgt|QIjK<-+KLkAkd>94pXj}9E+ z)=|BUD0;fM-0*pDH7Lq9^kFumWD6n3GodtUL|1MbQb;hTT*Yi!tF&^@{V`oX3cZS@ zo_t~k5(4eEC*+m;7BxE?PG}=&J9EPuQpviJ`+kaoE@+XtZH}LZIst}MG*~t(??d^p z)DxJosFXHR%v+jX-OS-F!S>OWkx2Q?9kjE5Fq9EO|EiBbYk#|aJ(-x)&mwM*n>OC) z=!KA_uNb57t$2>rhQkS)CKY8GArY2ob|np6-!Yl;Psz7F=LS~;-#*wdyC24ov`xX% zzh@H~d$&xfa4t#EylD=*Ph3v2SZ6zN%lheEY+58uyh1FAlxN#!VDQc3k@6eX<7~RI z^wG9R^zim_Q9o2ze|v;1ywkPz_EFkpaCT?Di=Yz%B*Q z5##Qh+C#pJExaQ0>b#-ethxL_8q*%yRr{POA(RGo;H*|d>LQP|kt&jt@%>*VwjT{H z^4u}MRrj*wM9oOz>btKQxkBMiOC{j)aB9CYb|XcbhP9WYy~R+5_>!A?aa_Wi)^kp1 z6r)5*8b@-e1n2k>s#I`3`4VsDV|3*7ml#G#SZ15ReCMU)QlwwivXC`PczfrY;QEsM zfv-x?Ef$X54~ z_y(2Omhs39!Iw{xOONaX585`W=^w#M3d@zKQ^&sKmf65^QhwUR4xAwykFTh@Jns%0 z`O#4H%HR=`B9n9a?2bm|Y?TdP;LF`Qw9zUZKGq`h}Y=wch%_~(U+d{zSs z>n@88Ya6B8A}1F^my7I&2?XbBVs)@{Ld{QvfTQnI5Dhj1^G4n0urA6)yZx=-GWvcY zZFwA#qXz1YN#Y72S!{S)oOxl&#ZMjD@ibdWgz)F$mF6biMQnc!KdBF1G8tuXM&0QM zf(SYrUn_M*Q4wE*5>4|te5UY0Ey>sBGIR>H+3oDHnEP{M=Ss2Gf*Q9)j16+{ngiA& zB33u;_O0=1#Ot{g3e6|(1|p9J-ut*N%9K4ZF@NP{%R^VUMLma9(xR=IS=8On(2iIg zs&2Y7y5N|&t{G$}&aJ?ks-|1!JXAe^PI6Ceq0k7j!}{)cDBRwwtmd~wQ3)EYB8?CNyymUKriLQWr~$pbtuK%NOn;L zmC3|$=`ku~O-RhFTQ@4%4yLo(z=U?;+rRm^deo}HVN_JC$^5Py%*b`u>AS3*1!9a} zVv;pOQ{)6~Suv=`Xm5rzO?OHDLX9hAHC+Yd@uWFb>s2@D42;GABPvvU zD`!>@mE?5kq@l~u$CF7O0`)4^b5l(^@Xd~2w2vu)N8h3HYKzF~hML8$zimYJ@2iM?yP zvTkw9+1N4AaC2aC2wU$rGA-RYvj)+s`rR!ihViuZ2Lf8Ei4xgKdmGrmBKt_$$G)|a zRvyn|R`xl|I_PyIvfL1^aK5&0EI4Uj7i+-IbonY*kfVG)m5204Qo2k6Rs?GWL7@8X!3RlhDOVrqzEe_HfUkp-oub)R zbjP@h*IskSwm?GVU#iva6i^kdwqEor&Ttr&qr5ZScHusO~`X0yz zjIyhyrZ(N?IB_RI9gjFp5Txw8Q}=LVK&4w2rD;f%z)tN6Y#X zb>7pB3kGzF9cA@NUV}rEV=>8gZ>o^3IUumRQ9KD|nN8_1pS5bGIYg($B6-oLv^azv zaOxJ7Cs&?SKECu&J(rz%g>Kzu1q5n%!!BfPI#X6u^SCX?ZqK!w zvx49C3E!jIxsIGPnQFtRV$`~$(xIXJP2ZJeT`VIR_==OSm9t`lZX$`{j{mV|3wh66 zSc-NA%5xiFWXx$acjy$}r`cV~Iht@uYk zjmYCw)|T|F?CEt0nyzLS(tT3{+L7ENaktd99e7+@HECEoN2cV9a`SdjFNwc22>*ZVy?0boX%{y- zqK;jRsGu|j=}i>r0UQEImo6}5c`%p@Kk$uBns+1`0@w*Ts`ZJvTJ!o zHP`aE(H1sSQ5m&nmd_)AlQLvl&lb0|`qifQnbGaSjxSx*=GLU0q8Comz756(`nh|y zb*mK#n7qNyf@&u352@N&K&o(Pls;hWp%zxOFQ`Js$Y^N4M)68g-m`> z2WvGVIx$ip)o*@Cbk5==zw)`>4D8lafVq6n3%CeqPqNAJEeE5Q4D4&$c^_6_#3%HJi5{F13?b|>-iEp=ZH@pW;I zETba%SUH2-Ixm4A?E;uf#0*!1gX*AiUdelFUvE{Iv$gw&w$y3m?UE+Ixx1FfJC>7U zxjt=V+iq)Vvd-Ah?+h=yqJSQk69inr{-os?o+|Wue7vBcqn<_~SAdmKpXL05y?wiK z`3f()pnO9vl- z0nxXcJN~`4?FEQ_;}>t-$#6r9F@K-*l~O%nv)j*B@rILCEg z2gT&woctP_P;bt>F_@vCbBD~NBQ1jbwaPPBGiLFX?RtKZ1t9YR<KrqoB&$CMNG;DG59hm_JQCtDwNKW+}n4@N6M|nBWRC*5FNv z?UvZyKakb0n)3$Lw<$)2)B6mIILogNeSDY7hP77V>iDB`yh)m+3%u@ z_C{cdcv@F+|Deu*h)9i4Oso2q)z^mnC_ArVe($*&lWZGi~kX=J0V0R~{Zv=|8)Z9cd8 z`o5@}fERHbE$+L`Co*#wg^{Cdn&*iDWX%YmxE<{z%y^)3^3Ehdkn`C@1u3yoDA&-p zU@S~KzS?X^z;}%VW(?!;+PH|C>1Yf;L!4fV)hyvuT#-Q+QnnHBUb}y82Acx*)z~am z1(qFl#j9X!)xUUYw*}+hc&$Sna=B)s5;d=iF3oLs-fUwW`D^^%KY5qfjy((uNk)~T6BwN%}EdGXAyhYPfWRCmr~N+5~R{4 zn7C6|l6kocIO5UGoc!d3jV0%m%t`G_++D~cp%)6;_w>`caW_4-y0k{Cvb&=BNI1Y8 zXg+BzFEi>9^3MdIgXj@*owB9rjX-gBjW@cu0ZW`WTCL(P2+`o)DCd8J4dG%-EQ=uM zxSbv>JKr^TBnG>vAzmcNht*)OB1Ls%G70x zIs^C7Ri5st&4gPwq` zy%3kL2Omy2UT~cVrsi$kKwd%4yx-i*Jj&L!tpo@)fiTBrjxWXnaK8}`O()3$Z}4=_ zTibP70Zx1Wa*Zf$&GcaD>>1!>8EC|(`RkCfxvO=6@D4^Z3Yy%V{CeNo4NAW;@{q`L zPq#!vdu;VfRmkBIz=!YLGxvbOGYRi4n1DG5IIds$p zns5e6e0EuEbU^eTjJSmA0lc`@8Q#-O1&~|rOXM>tIb)W50H;sIU@rWymXvYc-iWRc zn)@zR+BtQ_8&dhn$YHqyvpS4@PH4^!PS@z|KYaeH{eK%0zUxu6A1eEgdH(0aBgH)&WX_;!QdMcc25Uv{zVq?;x!4p6 z*6dCMeoX6tsrp`h_qY&EQSc4ouPC}NR0h8D;OLR$D;k#haS=Ze?t3U#L?P4=))j0B zPB{SJ9Q(&9&sX=D+`0uA%{u>s*c7SrH|{jUzPFeF&1S--Tpm>HmJ2y=1P&0t+(u;v zsV>cnB<%;H&V}qjA$@z1zS(A|@RrL!eEjO`fq9r$Ur}eza2;$j*0`9>i+0Pco!gH9 z#KHG0VcsBlGI>G|Jk4-Q|3~@QGw&XNC0x65aMAt0^mCwimN3vLsjaM=5)MqrluJ9n zr>n21<`A|cKZN}N5j+iFS8(Y24m6rYWr(>fDeR>X2fL^D!vm!7d)QTf%6vj-emHp2 zV&jzR3h8)a*a0DYO??E2yq~+Z(R?3y1GB;|91%<=`*Gj~qNHMuaym3@In|#0;RM7* z1N1O||Dn0D{r3=t#ey)}Z|_(9A-s~R+*>`mY<=&qKwo&*e`W#vXCJTGzZ_ahg#%~n z`ki-(hV8GswX)yI|IF(>n)5E4a{7ui0Fdze==kpR2~CSR;j_P-A-Yep|61sVDv)%D z%d)TQIFLK};UWIB*YD2ny#+cW0cZOb3s!jFrvAsze_YbfgA-6`aoB(69REjwJ5~Ks z29o2zw(Ii~?&vQTvuscEl8F@Mi-{=7yUhEA;oxJf0m?mf+va<%G=6c!)!n+n@x|Hc zeE{cu5A@$1#%}D{2twtj-@X%yIbA=`&BA2*T0dZQW9t)|B~fB^VA>(QEH~(W#vGu5 zu_v&i5~2(GJ+WdnjkgY2M`*ITAoofVrWE`lEEUSjw1ha56T_7W)#uXx_Jw#c@O|b%wS(s5&2#w8e{G7i@vjW2 z+x$EPykzu%WbOM4%_YDd1%JN>Ki=anRWz|5;u4=qS^F`MZwv^Cn7co1*V;H{I5Tdu zlo7RWiT`l;zf=a&USK~fuQA*YuHmn?^51pt@d818brAQzzv8z)$An_cHNowznSr zQWw6Xu%F=3%}}4iAo96l!9U`RD6k$_JT9^xc^K7R9Cb7b%>Y}Q=VOFn z=B4#E1_4;7U*bC|laZ-NI=J#4$>*xx)se?cpDMJj@U#6}?)q7tXgVELX0)4jJ^_)V zX`0$DpV#O-RjY$`(`zlI`i=nfR6uUYwCwEPBlCYPKessVwdF=BOji@5+E+|c+o?l< zN=V9OaKp3jP=q&Z0jbGUw)oZ1jDMbIE?uf#q68RgC?d<6E8aM4RF9jB&>Jq#Joce>wyoESkBZG;+V5WRR5$=KinZ3^#w zQNBvZfaB^W6q%vt9i*D#;T_D~FSnGM2ypFbQ18}fxSY)5C5IxoGN@A4fPVEEeC6f9 zq`BVoo#Me$%t%NEjCm7YbqkZ*TO|x_JQvn2*6*ma_$tJ=5xygP{aG-`a(582djnGo z5|UHWM#6}Ra9cSs)>?r=&2T42g_S#^tx!+`b7AORJkAot#oXSW#=PwYvV6G%IW1!V zTHS#=F?GWqX$4N^%FL1(#;(;HwQUwLBV%0ys=$~~kflLNa20!MQh4F&Ht3X7QX}Jt zZ+Ac2uoB*)hH`y6JbTwsAWSL3wkF#D<;)r%Ux)$;N_70(S1+!HZGnmPRtIfkc5dk} zCt!rvH{y0jr$R@D#yJF`m&3*va2_Zz6RoB*APbp91_wh(ZAJwCVXbLwzERdnn^~OC z@ASe~xG?1mVG=8qG6v^X(}2Aq5t%4;&!zQVEJD`PNEBA&yikw6trA%m5?H0$6avDQ zrbrZF#S{qO6d&>~6Q&h}bz5_8kM{PD#iGG|mm6UR4CNTLFvc)1G&no7MW;rR!s1)F zW_O$psL)QeAa;g)^o6>ybVrWczz(4|Do9?&N4~T2optZ{*$N~!VhiGf0yZ1g6Kt0B`rnyo}aeAs8)1if24Ak&D4a0kUI zc<4E+P+eGSjeoQgKZg&8H^5!npQZJwGO>z6B^s|0U;<3aGmo^+2DR+^RED^!P!46a z32axCED0pzp4g?Ox3NnGu`eVbGUNqzgj_j2ctY}4zk0-fE*53-%`aKHm8Rr#lk^MS=P%Ef`Uc(3|vrTTj^@4CJ71#PKxLD4q?js@Kel_qe~<@<;g}? z%nFlUl!Y8n-Z+N^^F%|py`2fAVFm!hE-Z;@siKdLGZyowZpFoAOVGe18N*o1%k&>& z#F~_d%a(4Pr8PzbWcIHu(%+2m(`CC?WFDnZ+jqPrw4q47Dvk@r19Yzhy-@_k0Dg{$V+GB;3%v3|sv>{%ua-qO{EZ8HM{fV5bP4~V@h!`(=; z-WjbGVldUri?PV-l{rc(ZB&`~os_iz-4gJK1Y;7(=K7`znrqC{y00m`2%?G+*Ijg5D6~vsch*Hc>DT1w4t@ zUKAViWvmPWXWpu}5tyMhuN%pMvDs@_W;ATjM>*-b-iZt3n_Gsx9=3M=eNH2m>#yMg z4(ay?b+LU@F~|tc9*r}euMKK5AjTCNZh{}WO!ZZV2HvG|a0|E#;|l`5 z{yN_m)Rx`Ij$vWh7GsVHfhj>e?@1Iu*Gk|ikPC_g>+P%y5j^u)_u#eKp)G&-N2ZXF zyORb~YTQL35IyBowZgz)+*Oj1oVX+=4O9ua(6PW2=AEU-8$h2-Qw)P#Ss{VQSHf8H zrxf+%XC#@C)|DNDm_IF>UtF+6suqj0oe$o`#p3~n=+hyh2Q2zveZdH2Lfm7i6MFII zqPtUuk%*{S8E>J~K}DhEHry@KP>RnN)l{ie!_5hFeDJ#|cp=#-$y;jlysfR0eF}P| z1)b3dIUxz!u_~RKRnlg&yaXr=x^AweiCii3msva?#(Hm=-odQN7jjzmLPrO4u{6Jk zUJ7>Qoc>y^#nWq>cB1tnKV}8?b8XM)-0(eue8$BXdY6*A8-LS+J4C{A!y407H~5uS zuWcS%uFmLD5Zu4;aohc~*Hk%F*K2dNLqf0)ah)tY-_(!D)8wojY~7>>%b8EbHap2! zi)$qJcZ)til&5jO7Jj$Gne%TPT z^BHDY>2gw7@oeGJX1}%1>ImghwC}sxskzlZH{K<2m5>_h)lD-V@4hJ}%z5$aF}?)m z57^b&Ouyr`R-lYml&~`1;0?-sR!cc>=_Rdcxu9ofFF&PVcfXK{HocxNOsrLMm#2~^ zkK6FAlZ#Db*rfK3Tu7y&xE} zI8ym&>OPEC7P>k+0USpUjzy4j;z4n29ZAD;d4MBv~F9%C16ww7PD(<<}CkVNPtN zcS6qhSv`BkRzx>6G0Y=8qz$-Sii@C(fQd7ImCX}@mR|yN42Nv`tFdsU6V^Nq;4FvS_?%;Ca zUz~?EgkFvB=;npTcynn+2By&V`W;WXmlS0G&%ac?D~s7Z0h* zeWjX$7f7BWbN~d1Hca(_G=(y)O)qAT#;JDpY{qGbcMt4(J5QE|3D-CjDTTrG#`;#~ zj$PVRYIH*qUF5I%N`KZ!qz!9GNME_UeS!mvhZ5?O8sw%F*&3v6+E$Wxhh~~w$%$1b zsehR)IbO#6*vwgha(bF^*T$l;ddNaWx9RX8nFcX7W#8*GZo5dzE@r+U6yJS&SJT$>?hWt_VUHYqeW6~!vBcKj#cDbp!MMS$<6)ax zsCopv;C-745IY#@WAo~+qWQXo8eR_UTtV>3!X|qvykxTyTHLsEv`PnY=CGp6c9&@v zA2s%R8p-=?SB%h|ujl0w!BlBxG?JtE@66eLZ2Rzx0DJB^d546Xau{e71{-XxPZ0HX zO?#gb6gg2(DY+ks4!EbAc-7aigQYg>FZnar>6O{^u9sMCXqXaDUOMG}Ol&}OO{gZV z#@%0r?_~_%l%9eR9_3P*)tdhd_tvap=}4>9VwLz{LsM+ZPdMX`nc+MGm5dnH_;{@Y|i zi)Q@9pltn6a{oMIu)J}rs`IEU3GP?BX*7g(yE7>qL0`!rN%sPgVxD5;v0H(CfRCPj z>sRY$Q1^P}!-R?{Rii8`MY*U>8SyG>TYbayd$A$mn`#+rf$dL^@&(BIWNDSWk3fj} z_%DdMw^u)&!}YAKGYd}h&gd5>F&QP(MP*G<6~mk$gt#zPXC+Mu`ZJavAf#CZLR%z* zD3}%a34IiJEnrAxCo{!s`^1%PUM#*jh{=b(m?>2)nT8f0xYh9ple8)-jQ7q?#er z<8S)a@*B60zC4;rw&B_Bg-o;sn`}CQ)YjEiD$y70byJyBtnBJ)TsAdZ)>Q2%O%$2W zWUE%Ea3Xt%q*jJ$7{E@yW9dq7Q8>+ms84VH9Rz@r{hFX|IpW9V9 zFgExrA^9avEKI|$mw^w~Wh+-^M{R{m2y60`4Dy z5;*5|)H2bPHQAe+P`BP6FmV$v@Eg1{ID=*N?(`yOO1AejkH6gW)#qbxlijGOUXVn~Qwqr_w z>X{e9@W9K=4kVv$v>vf4UyM}9NM?AVtQd1z(o8!v>%B&~Ube-CySKPY-b3p?YcZAq zjV<+hb+N{qHTHBKQ#5I|E&-5S*UZBKnuCi@>X&rozUjp zGlP@F%hfMNx93qax0!2I*ANP>hV@XgrD^yzh#EWp&PYZm0RiT7!6ODmpFaS)$P0*Cu@n{l74fuUeYwN z&M}RtE^GzI;(ceI@0}hSRBPl=e~!Y$V4z8HgK& zB!qk~KB>KQuJQ6l$I0pvD?-EC3TTDXIx(Z#ZY;u=vzinGv6F5}U}7cUpX2D4?tn_` zZI=FWpKEk#HlU1jY{NpBC%sj-p(BKD24Z0Vd-U_?5Vl6?3o5sL^FER>U|&9T)W67n zdKwu|pXiW4wdDyf{Atd(GPR13@X;l74A66>O|A^xZrV}}c%L53dOt&5_9}2JXhVy*Z8Ffx{>#s zcJuD3j3z4gN8i(&}W5nq@$P;;j4P-`_%orPVX!*=ky46?Wt@ij%jN>?}<~ zw@0UQ1^Ijsb1@90&Y1PJD8UL^Dz!C3%62WJS!hPJ=kTrr5Y@iTY`fB$o}|#8rs!zI z-d3)V1PaEkS6?0E2=*vhTW=GEc%fkNK=nx*ITeYM+7^s(;kG0j$lIlAdNKx1Pc%1i z=etcdEPXadyx7U4O-Yc8{w-5We3&OCG-So9a-Dl_umCuGq^j95*PdJZxO_G`7p85b z;P9}?jA|~mBcDruXT3KVAzQf|WmjK~#aj@%dzbMc0)^(FwwgsZuT_0BKm^i@*{>Nt z4IDXOa=xp{e4m%qSAFd>k;M?t&_b(|xu!W@MIRL9=Qr$udPm=kd)Ys^#Z~HOZL#P+ zCekyk2NrYbGbTQhWE|opnlGcYp$vzIZ3CYvIw94!9mXL*3jm z9^5WqwBE{7nEL9h2T>qYe{>0M!Zbe^J*AiNB9*^D;jvUrHdLOA+N5(94&U&TJJNSN~`h&cg$oeZ?833Ac6N49~es zB~BKD@Ih58bJm`}@qu4QFt`(3AcVXUbjaQ6Hih`k)e`u0bqICqCaT&quM79C!|5cw z&*hDj8AmK1I?932T=+FaP9 z>36QasBqJ+Nm$>XL9XqGmYm~dQ!ZA++8OTE^A9$#BrGVpCU;N8VBC?mi4BSwD{@p( z{ZzM=!i!XqbG&taO)@q za!84F-r3UZf)V_bHDyu?I3E)+V}Me2SHGzS$T&);PLMBNk-W4~da61g7yBTRlKKjo zdSmxURAZ3ml~%~KecCmdG;a>UH9*SJBlrZa$Qkj44$6RxLfpSR;w6_badI%wuGYYd zWa?Nj!|(tFzWYi7uHW7|8T*{V(nJBA=gmZ$@4L0=XU>{wgYps>npy5kI>L{dSVuPs zMml$^R`6KbiA71Onr3Abt8IR9NvmIUsSP(H_zdNqF1H#Dv^8ffcoAU)X4%lE;*W^X zAm}A|$J+t}Dzik&l+hMVuV#d$0~pphv~Gd6kaf&3GUZpw(?S zwTw6$<}17eFNmCM??`ne!!2Y+Zxos@048EKtW z(A|Mj!n`{jY0`RdKys}E&s$sX{z*j7{`R$sB*k$gk#@GKq&UnA=3?W^r<2nC(9^v} zjWjg$seU#bTfRuu0*Jq2;sh(TW^DVQPixLVcrLH>2yVPQ1DOCECd|K<^L>lrbd;Y~q!=_V&?T-_Kt@1&DU`c>{?-{RJrYvpxYS{Fq?RVQ%hpoiBK6 zX>P$hZNxWlP!l(p(I5OH9wvy(b&03ys4gn29 zrgQ{3yn5VZaMi*loEgG*Sd=3bRy6pc>rH!>RPM=hEoS%+uV>hp+-ayUu76w1-)8Ob zJX*e{vVSyqNAQf9wU6M))7Xbue;sZ(5yd>#G7aQ95^K@x+&)kxRO_Nzu~v2a3fV$H zz3~`CoP&wD>gNw6PScpfNJRW#^@~!ITB^oSL}!wzpuOv4DL?IURGcRzk}-1Yaoc-s znPnG{QRsa`}Tb|&MKi& z@~485QUDuSQn2db9J3@0y{Tb{H4)?-uY3M%7)h+0!kJUHOSoS-Hp;h&y0AJ6$25gl zGYip#`H~#S*{T_dv`P5OW7ojqNe!nmR^-hosTj|1+e6CqU4ufzE{dYnv+?R1PdH9* zO7vC=>u%?U6Z>{_xD^8fPRt)ud4&#l1}bp^4F%h%(eo*7n@cUTGd1CN(sv5R!ddl{ z*E-C$?v+lDFAjiFU-ju|kI8YvtKNoj*O3J0IV8@l4#9Cw4!V#A@vEG`QO?*V)ctYE z&RehssFkkSDv=FYh(gc2@sXU+tO0GHM@RIU-Xft&yTAuU9H)s6YW7OMa|lFfhlVWn zgM=XYp?9h1kY+=&@|4lgk?fZZ5-dXF_!8Q`4O*39G7Coj;W#os@!?9VJw^xz!;tkZ2fe`p~9 zFzQ%IF8&2mq_U2X8UfR`kkq`Ef47b+RojN9AnU|A${^vQbfQaGLi@*H7XMh6oY@w* z#I5()f-)p?kP){fBYT_tHx!2-tDk4a_8nAgq@#{@USnQ`$Kz&HE1y?Z0Ee=TzSwHV zhu@Hgr^wJx<+(GAo=_TkN_bVl+mA<|d(Ovb+goSVWv6SHK97`?8!3OjX_%JW&|0ps z8KQ!Hvu(0%#osHvA#IXx=@Qr0ey4Zh=yMgzr@fRXt2J$vj<*YeMUCQ&y>>n|K7LwJ zf}07SN)0zqaG7jK`tli^vJ5Ce$UQyXSHmpxmJ}$nX=v}8F~sA_<*v4t#Nk>8)y#kg zJ8}ZaY^stg44_`1_80s5e*119Y(}G=HYIop1?10YS4Ln8D&g8omRaeD?K}n1%#~x! z$efjP!RtUFPt`OC?`9Tnd-DN1v=27m$8*8mrj3N1sIiZl3d+F^TA=SM$%2V4 z%Q@orbR6NjQ3_l|0t_=Yx8M3bqT_TP;+r?8dUA41^BepSkY>OeX%f>Z7_Y@EIlXJ&y=t8rk2M+&H#x|07>*-GzoAM723#>r|oS+Mept{5HJ>G#`^woPef z9fJuoHM>9hkE5W%OYXQYn`0g*z9WJdau zZ5Wrz5C>K;PXiy)Ye=*DHe79)w%TmH=@0$*tq4}i$S8d#57#%>>iHyJZsq+Uq1D$k zBM>m>k#qBTNn!d*qs(#-T;R}$%Td2&g%bV4iY;xHv z4f^f+J}jE!TU8VaHE;|s<81F!Tnqkur#Ni6HcRmQ^TFxop1H1TVh~Z1rVl=yBX`zACel2T?C7xQ|3ntxk?(w;2XLYm}wNyg3X0JB6| zWr_9A6FI+en}+RP*9+~!F#~-E(mR#rV~<)lG~!$+>_b8Hw8f?m%!=Tio^S!MlBZ=; zDqNs=N(AHul$V3U*bCi}bL-1P!n`K4vP{svVvtJCA$hksHXmOO*j<0v&E9O z{l;B+BPDpC)&U)oY^BoROmK=|^~~_^?*oS~2g{-a1-pSdbq5a08+?~aCnRu)#UwX_ zb`R+^j|0cC&cbdm5yiV=vLMqZ0AHy}oQcS-lCmveIb98EtGq99Apmzv&GOP5&Vo6q zxDT(euGQtu3ahXW{e(7(%J${LB!G;9sw9|TZudc}m7vg|Y*{7E&7{|Ffb%1h@OMp{ ze7MG~=)DpH6h6BwQ*jB5YV(U;!q_Sy5H4`y zIygU`k1|jpP8s)eR!juJMd#4d;!|u z*iu(b3q&FMoB!(6 z*C{2Z^^~!#(ewb*R++?0Vau(uIZZ9ha%o`+%hp26U9PPI!Cq@qG@Wli2%giNFuTC& zKVpjKg-p1Jb(-x*YgSQnE%oX?F1KQosU(dqtrmn!OE5cf=tb40t%ksNSE694q%ZIw z<~5d$rO$5g9MW=IsI`g>b)#@sK#(KoXCd(Ss$_iX$g)w%N;yk)n>zMTSU*dWD|g$5 z-h2(mSz%&Sp9!iw_TJwuzcZ2VDkrT%0%Pa#FaCU`t>d8%@yxE&lApaB!9p6%)sWXG za4m&~R@I~zgegRDCtnU@a7|Lkk(s4pMy~c9ep?Uy9YaCtnUP7i_LXk3W=6W^-J}U_ zd&G&C^K49}aWIF7O&88N8$j}k(@$`21H=>bZK$po3r2KNA6{w$E%`Le_u}b6v$ACx zH4gR_w?br7RL~J^vl$??ed6PjMV4q)IUtg+=Ab&YSEs^ynPc)2s-vCHu*sAWstsK$ zoS3#>!*yZ=8nUm)sACP1w$Ab=p}|zAi-r1FvvD&!4?q8cTs-P=eHAI4QUyiC389#Ev8Box5~P?Krq^sJwV{A>P@ z^b_Dp#+CbO`Ujq~fapA1B^d2^c`8-C zD4f%rJ}5NMT?RtBb+KPpgVjm-@Y2j|8sPQpA(h;nRN)cg*D=4Df*Zw#PJR^j5TYyxitw|2wMsf8=&WnO-qG;fg-m3`y9)y;Uqe zeG%YCk&z%b!qGS4$pBgoC{;H8DlYNh3hl%G6qXFFPKq6 z#X8Nd@t+czloN-}0gfJJlS->FeWT4y7qMCKw@E5g04m~;q>xD^a1<=k7aKy>l=9*X z==h`LnR{XR>GY%hN^31q1*9iLss z&J-+Z8xsXobj@dNqZTrDGO1D$Qo9Pmge5RA5Z1}#UXu@iu0_^6sX=OlXs0g+2MZdB zx!C*gbW0{7d;#=?lZlO-tPO3LM3BF1KKwpSn9q0yqGJQlrVDI`AXa(bkdc2?cl+!& z?J-A!(Lw-Z$)K>sQ(>GbikqID`!ryGlI{Dh4?qmOz zO_v%x($1R39XEr*bD~x%!VM&}+H_b^$@9;sBj^g3W~@fQa-yt#NvC$wk0r8t!Aq%u z*X?Bd-CS07?k*LDx^u~{;MwFVYD+Q|^kz9-h~e^hn9YhEU1Z13-7L$Y-rYZ5FwDg8QyV zUa(9x5mI0qkHC_au`L^}Q*`|ybp88Z>2SbB1g0ZJ7Q8Moxe^mW;&dW#^=+#MwVU}G ziV|IhT|3rX#kwCi!ZO|K1k|#&UVec17Kr-JLEnA~9o*Ll4q*AmPhKETBI=$$p7@0& zk}}r(GH^cN;-1~MJTK>P^p$`Ji?X_9q^Bt}Ua#V*VxhD#yz&OKM~#h*e5{9joY~8WQ{RkL7_sz*>yw z0TiT54l*_U#H}Tjhn92?r2!trv-T-v)iJvgtRSfLnB7&NfF>Z%Vq_Q4?y(KA!I#KP zsQ(qB{X-)u)l=RgvXWzD=#BnUzR<;2-C4lo4xlGF{;(474|C^!B0Dp2@B9@D-6}u9 z_)~>ZsY4r=!`v#sFK;BAs>YGm(2QZ)cjL0@sLn9PW*5se*f@4w;~`X9 zDePqMmw9epB{ug9#n4SPGU@CtWY^oMIR>cjyO$I##i^0GG{w;4WrR=&*zv@R16|tt zx%|-J%c3wwmqiZdImH|3{zzWm^EKuf2o1ll{fIG;fTnf~p0~I{TRk}2vPp(4CUsOU{tC}4P=SwxB~)Egdrj5prJQ!r9Bz06$r0)!EI03>c|nn z&kfRsJ1zxV|I@D;t4Lu^l{kyssJ^t<<-$(=b-i&^#`U}(Nq+eDFVY*o`E7H4{ZREa zh#A?=v0dV;$qTRY!ENxZdXnKl9xQ~Ni%oXT!u(9w4g&5QFvnFUDLL;;yP5zWVR8+< zJBtb@5Bi!f0{M=N=rW9=X@88fEhV3;m71@5S%#Kzc)vaKW`Mg{(;B>FE&gsHpEcZ| zNMGr(F7j`3`z4>-(t23}r0n98X{zt+$@M#>Q{q?8Ryk)hDhp=_;VWE zmRiyN7bB_(y2bTQkoYP9+8klY8!VhIGrLpvC?T68cpa3+ua?r)`*j|Ei*5egcC~RY zW=-HfeK`-y-@Tw2(<=4+^Y?GBxg9%Sm7~w*m|V?rFG)XU(rRCF4 z$T9Xd^u=m6nO!$|eg1yyOUQ|QA`T%5U(z*>Q z2byRod$*$DhnX7&a!m_uO>M}^kocap1cL^f^~x}DUd#{>cCZ!%-i&gTaaM_t=?JIh4rX=@xPQSxqsav*4sN+LIV;BZgkzon>C#R35C^6k zPRqm_!p=Yr2urZ4W_P{Yik85<0l7ia7iuJ!lt1ax_Ns{Rh{AX=vG5(4^wrFZhJ|#& zrzUSJ3&-v46Kjxmu&2dI7U+}`S=X*PxBn6Tv1@;Qme=BR>Ar8%-?QY0uSEHP{?Xasq+glkpRRr* zYp;2_&%P*V-*D@nJkw7WaE1aT@1_+yyq{G6!-xK131gL006DYJQ0fD(q5hu_``rR_ zb%6FH3Az4$fS~_kGw%{F&876|%woP<@aH%6<2ir-t#j)}@L}#@pA>ZKW+ijC)zrc9 zamUZy1v*-*D1W`+iKCb7DqCx0CriNO`S2l9jO^Hf@e@E_kW~2Q7)xGx1jIk1MHUCN4e?_kYckcrMGkQg>HV=$d`d5!a z12n0czx?p~{yqJK(FdRH|0~x2>`DHAcK<6D?PJ>iTXqw&N+b3wx*zJ=*H3<8I+~>< z*f6~^Psj4$mi5IA&x-Z5-?bT0%MGY|wnPNFe)nm`&ccsxHvGB7->~1aC_?t-r+x^r z{Hoc3{^(jwE|f@Lb%g?pSZ=uT19U_CF^o6>)jKvK@$=Qb*Iqvn-kz6>=|ydIRKUQnJp@8a9wI?_ZRhLu=u&9{_)~D8NhgWt{UEY_~ZHTiYnWD!)-b@ z36*b=Qnyv#VYkrvp8nt1+1G=6(@i(rdbWP>gp9{eIs5F5MD5&3IK7XxKijDBd0=C# zhmiZw>YjIAr|pdd%%HYAWcup~A;)dBfMqH;8zX*<9{l1!bEBM`+NyDJZlBZ+7^Z{l zaMA}1T{V$ZBl-iKV&y34b@yJ=xjuK+G3VfA`~TvH-8;--VqW{=UR{)$3FYS+L2 z{a*}tiERc>(N2s|YtN5+f2Vo$q2t9UPq5fD27un!=9=~#d|mWG?K38w0JcRQciPt+ z;B?=32AI5CWLR0e_FhD38u{bhe$K{+_W)8|ZrUZIYAJ<9+6kePh()Nf>Ks@TTd#0R z=1xuO{d*Vgy<5{EU%GJU1O-8EkwQE0?lCn_)ju@!q>jI#I`64`x3;4^iHvuvu1`ox z8cP}*85^nJ5$E$+6i0}!civiwOBh}1E~Qky_T)E;<5&Ot&%IL=)3GwoQ5segO2}hJ z&Pn}t=$HSz_BnE8J}VD*;lNKZ$A`E%YgG)LgJwOBHRYrj`lB>Tl-cj>Thw^#_@d)H z223SS?%9%?vY$#E)EPtTt%&V%E>|?nHtw$UwU8z3VDtNTySMMpMZX=^oX3y$I_=-6 z<4NG|@;bx6U3{q8*4BP<|MxGROXX%Qp&%c#^>0t5mv0?j6U2$*8yzawz1?g^Yd%Zw z+veVXv+eyLFt?O(=KqrHuTN5uPr6buJ0u>pg|!%dF9jdQ0j7Pwxksj;Bsxb`CasW* z*W@sb-i-+!^3(W_3_qi?)tx7YILjYjnmhahe}E+~o_njS8E6}g(g?hmCb9q94UV2M zluyNqftQTeO6{$&&-FE2HI!H6e)cSv5B;9~xijZA^Y;?4x9Lco*F(CR0>T*u1s}z- z>gr$L-cP%8*)o3Zh|~{@9OdrgSgQTIIDYk>Jg~T?dQ1gqp9s;I_+cKZts=a3Elkn$ z7~WE^gKLLN$ylg8*o@>%l5u_Zqk!&h?{f&ia;sT`9;*G9w)38Iy9BVCLkxW80QuJ` zq7uG|HfgHw&i*UUNxi0^IDIeYH%{dP7b;S$KHS@mo~Q-x=sqF$vpS(^ws*^%0j#Oy zenMV-BBgxdhEztT%CV&VQvZ6F;`z5_*An`e_7iY}=^jZ}GZvfy?FJJe2btH-^X|7O z#~>ub{tIujzw-UIo%b5DG5ZB8J6=ln8)0*SfacGi5H@~~uuCs{826Kh^Xe<>OBWNs zGGt;ZMQytL{_oFaQ~9lR?a70SV>_00|E>Ue0PV@cpHZ^=az)yG{cCAIjb{ ztjTBF162`F6cHt$BA_Tr5m1oO1*IsU^xk_5y+a^~NHHKFy;p(ILhl`oNbdv)y+ud> zDM{$u=-&Ie|Baq=?)}D_gqe5NthIit&bW@aTNhXFTsTGHv*+{K6k~rABQZDB=Px>d z?;(*GiNY){r~lQj^TPYz>+KX6Vn6*(IM&Czzei>46ZVn*3yIl&k=S~W_VhpUaYYD* zKz*N;=ML{)%mSa@b5$in#d6qVuq>dDrvAmB<65iyl59`?AAr*X1E^p1We)O#I|Tvb z7eAh6!?)sk6w%K@D2UT8UCbRiJuQ*d5=d{PE1N4Oo>pAxqolEYne%DwL}`WO{9rmg z?+6{M)o-3T{$L#bmvJd6h%8K|K%sYfHR+M(fFYfj^&=(?C#H&ze=rn|P4Vy%dv~gG znnj}iA1d?Axg|Qox3YV0tf)_IxnCk(ys+z(8Q+YIQ1Bn4N-!&SKVolkwy4bmyHd%# z9A`VdAmSSSXSw@xN3Mz+T9VEaDb#%UR&3i|0;Piw`(suj3v>KZ5HY;Z#ZJrBDHZqo z5k~DN@S6=jCO@k4`*XPA;9s(~DA`Z>hb;YNIBTi&pT8-|frdi0e0TlKSJ)pIbpB z8)fAn_5NRIaC#vnx`wTNlhM=JsxusqFI@iA!f|{>bO!5LQqa!#G^gI-)qH~%{{wj; zL`n?T9Ki4HwO2JauPZq523l@lY5!R13*vTN^ox-C+lK$wXMxIwty*^Z+f!DJK0gD? z9%AXwqDS$m&g6T(F38-bfv!?$5++P0*(rE?^6u`P$BXBPXUC>`?thJPow@&qK5B`$ zbU(0x+~vQf_=9S{zfIRjcXxmy`2|R?pSPBr{rR<6Hwv_|)g>^wj-DKNAm6 zYH?S2`pT>`*!|4uclNtWYk6b-=r0)kCudyTd8WH2;U{kU%j}}}2qzjrwyBI>mAbP} z8@tc^$(MohL_H{B(U*D}5&j&SXm?#Yn2z}@X;5v;Q zpqN8nMK@PKf%>&$)lG&;@h%Nkeae_mdR!61UG5S$;@D55VWHUm3&Rt8o4Q_Ye`QF4_A z&2pNPX;=!b181mg2_;)ZdCAx@TeDci_UCVQrrl=0C;aqWr5mK`s>H6>92#T^|JaIk z6}1&n+f)mnqzH|Y9e*HSKb)uMha0K<*;h{~Z}e*tUa!Fa{cZmE7C`e#wZt}&OK&!7 zVkaE>b2i!`M1hooIDCmk%)A!y3j@ zU<()ORE^4wzrM+~xn6C%diU|8^KL2o>#J`%VpDR{gz$Anj^ek^{~@0LB>x`^zOYC` z+zFAq}gX$ z(uQLcV=d9k-(s)(%m`0AAAb~5B!gW>mG}7lp^E;T_8%`*qWjAZac6n09@lo=SuDS- zaoeRb-u6o0(3OIJbM>9nmKSQ^JF2(UrSJrb=G291kB7mB5WT zC)59B!2clNKYw!dE)fkcMBnAI4jVhcP@A=8Dpy^!>*M*)9w}CwAyI)oF!{d~=YJ{0 zz@lIJVPTkyEh8!0My1Ve?EJskN}6amt4L4WM0@=I(?6__h%eeE9bk15C;6ynxeI^!h?APUF%5CP)lOhn%n^0IqJw@~95{$L2RQS9gq8tjQL9)xr z#59q0`arsWmP2z|TBkFY_(}G3LoOBx2vs=6Zcznwc7B?Y`8Za4Tw<(-fI{U1rWBT0 z?Nlbm;*fGm9dK6=*uN>hP{UxeiOA~rp!GCvdSf`Xt~^~r@Drg|2esvCV%ryUxnV63 z+M$*FrU2vW(zE_m;j?`;Dad9*r3G?KpW@(VweQPO#R^t)4y?Y#ragiKs{LxtL?%y# z4(HkpAHN46XT|DQmbJl4<7nKP>m-fZRag@~cvj%mD!?Z(6bD+*v^S*!rl@-GHxX;l zOMq5l50CnU4vYU0zwi`HP?PCp_N5(bF^TPSdq&mYTcVBJWd-m2&l=`$0MPlskV+#` z_WGD~(vi9m$r%b3dd01mhy*BsnH}||5_t8V-M%dq(?>8R}qdxI(3 zp6}Yo#J>L8Pj**G_i|bWNho-VA5{O7@26ah5BG4YncUIdgc>R}s|1s-SVsQSm-vT+ zV@9WTbNReGl)-qolHD1Q%fY3-D4RA6RX}SC25;$9*m|W)sPZl~z4U%5T!lhsDHlU!WlC$=}p zpuICrYa##O)zH-|5 zYzs{vhCv)~BJO>XXq6i(%6v>V{7>K6&*=hI$aO#~6^&~OS~@uHpnAtRqbC~o zsB`rxO=Hu-wROdD$VuN@zEMd0%`Eo|rk5V=CM0DR$0epVK2V&u8mC&+`aP&7*K94( z`dmFS7Lfdd2@HeJW0StiATEZg;|$oE3QO!g!lSTf3HEH*Ih;RWkFj%PHBoLvz#I%C z=pwA87=4s+;i_XK$MtO3yUV&(fT9{TBPw<}S$s*Q5B*6_0DG)eBg0F9aS{yvt=sME z^sHw7sy0{KXeq$`zLA-V`@!rYurWm=s&2IGNDUdgCnVF`nE_PU)m+SwT2vAqHGe-z zY5qQf*kH)E%;v;Ge`x4`_SBlePou%**jCFzYiI*tW#&} z(q-G21OX~j7={4Q2L+0E#+#7_8e>Nwv4Ec2JgRJ~n(}HEkZa`Q-W0= zCfc>~lRyqozM)KyA=-0!kpy^6Gs*cn72l9nKejr#N=pZdi+{&sA|3ta*dlBkDWeFE zA2u;HUvKzGVp#O7JLr224(sGNAw9STVN~sURfZT|GHP462 zFqLIC#D;%>M1}50uw*5bZG)Rf9&_((wBd}6VLOsWjEeTkX4w%JHJ?DX&Qb`C65(Ew z*^)V;6*~Kxty4OOPj!FlJ0D2^Pkib++buDpO5W0rQJJ*LKc^JvS%0bp^jMe1%}XM#NeM6%pG*31KSh7IjX*=r z`mEsbP&Hs~=MCTo=q&f~T^@S(c=)8@>IzmE-@6>E{ zuVX5~wkMuvnDBQ0<ks&`L-vXTNJBJ5N>vNi&@1*ivGnuO*n_|7h_9pAa^5Lw3$^DY6R@7LJ|trR%DpKlVH-o@dpjq+4B*QT@mPDC>iI^<o*r$UMg%*?xN$c z0p8q}eMk>0sh#g}c|HT;gZp`jiQO$58#gF0_rPDS&k0v-GEn!riCc3S#wLS_lN ze4}^*KY*W{Vr`=8pljNXhyF~7=NRj ziy|!$%kH)Sq!9fA?n!gLbv-|>;SJ-BmgsMzO#8j%n34t(l^2hMQz}hkMx&W|j)AjANxP~Jr**O( zzPVwGbWmcOBZ!Et4PSJpnXv2}UZK?gg0F?}uS`uMFJw9hr^Y1;A4NOPrTIBjP9N%9 za2@CLx-vHosyt{+#*7e-_$>}S3Mm~pD5Y%(_ge@FDnSUPD=S@CWol> zYVVl2dKmXotN?DdFL5S7I>)w$3bq>8T}RupDpsspg+7E_~^&^;Tbc~vzL?s@KfSB1kjv+Z!@vRIa6|J`an}1sY^_JF>Z@>Z~Gqce<52`I& z8C{Mo>%@z}Y*iW}6B<=e{T^U^6@|IeBUDS=LmGRwWmbLVZpF)HiPt7PYd5GfmJ1T! zSP@$+Br;5?k{AK`(VJ&nEvFRC6o$esQS#J2cRel`6Y7_#Z-;0)8l*q69%$&!;I%e` z*@P=jmx(D~M*EiEyCUeWBxUisqZAdJI_&kqbUs{A9sLNeEJF`db9gIj?Bb*r_tVR_ zm}2IPGY}4F+O`e!dq<-BP2qKzoIoFhF+{dWPCtVZ+myoZ`e-t})KzMudXL?VNzT``HD0AhbyKX_PVcS)_Ib5&y3AJ874@h^^3DYq9Rxvco1jm;q8 zsNRB*u)M{mRn2LnRGk|Zb}FQD`Q=#!5|)PdK1l7NocD5YYo=1OoOO*2_ceD&7p`I3 zbc4cvr_+mbM_$vMK6{x6(ev8)U75vYy}p@Qp|qw<_ma>Ve5%}QU4x^U-b(M?`*AlK zWxoOSB|_+Zg30o93#&y`0%r8P$Ko&lX)Zp#;CF1V$S6sF&uFO9HG;PFsYJp%Dtf%F zA=TFj8>YDm$C~uOtq~z}@b~_vIXB;co?RoVMpxGP&vEzI`2sqtRGw{5380U&%W@@f znPqaS@r36Cu#cUZ=2VW&`ss~G=FvW-@$TR)!%w!k==;2b(3`bONbl_81FvUdcj04N zV1ieOqAmReCllBs13VQq2mi(B#%r4L+z_Qpc*ETvF3-Ofh$dT;kYmvUO^A~?=nG_I zzUou8eMY5$UOLN3#(hYGID?KY;oZ+ZP?6}{{fK>!0#~3kbYFET{RXN>B}G7gWS_tx z9S5qBEps)|Qx#qf9M=BQ_!;yY*EY}F%vu*6*iIFjKI&kK z!EObY<|=A0mZM;?Ruf^~RUKRry_pw$N{KIM^rny*1d1l68i>a&i zYg6~?d_5(#nleF^gJk(n$=FA>&cs+VlqWQd(Vb0f2r;P~-LH0+Ec&VO6MI{!-TXsv z5|PdSQa#8`}jpuD+SSyT-`gQ;pwe}P!@gn&IRvP525p! z)Q&ABGbndtbF;Pk>zD0Nzl@emucPGos-_yfVl@wbJusW-FXDUkku@((#=jY}kp&v| zuKTReFowo)G36th=}H&s@2#$4>M0%^R5Q!xe{1BB7qhPG4OMCHRU~(x+TLMdlDU7B zmk5~q60`20z>afcUO8ezLPXw}U90WgW`aJV0zB}Vpxga0xdifxj#?`6$eR(K!JO!I z5S})_%2)(DUs97P)-dti(Pf|6pk2v7Dqi&+-rzaH_+KhJERgf$Anjs~Bas*SCiTgc znL?$>SEh}SL#~m1&X%-msEfJ6H`kb@PeE@*n0pj=)y4sj9T#o&hMT4IU`%(s;keeV z**;Vxi`*I^@EzOIVbV3o#v{;8SAScx!XI5OZp4&v^d+-Xrqu683GP`wQZX;B>9kZr zs@t*g0lazjVG+ z9I|GG{zbMGW&FHur8^bgQRlq&g;6=V_(peCRo4{OH{&UPGK zlGCEFTI$B#L2F%-5dks5E56^MYx|VBeFcU^4o08$9O|3<@o_IJj5aEK^SNzsD$t#` zG2n{Zu5Yv&wO=pQyu7^R*u!Cro;%C_fw`(kdZdQHfC&+ObnIvR&TLnHvLOA`$$uz@YH(qE&4oh(;V^S(~t=-T&!t8_)b+vFOpu_cgpx;}H) zn{H(zq5nKNNFVOBAn(!Y*^zTR|MkAl`!NA&gKR4%zE6KKd^<0F_4(esCCb(iR9w&r z3@9)guF)2xPv?oZEo$Xk`@Lo1QI+H#{v@hcza1rtq>jwdLB-qUdf{p6x3f!}^>*pY z)bhlkUO#H%&$0u97Z{{eg;(^b@cXjW*CCbUMshM6v1LkMfkeU2)a~+tFki8K6=D;i zraHl!UlLsQ^Gpm!MNy?ydCsC-5-2-Bw4y=Wh$?mB_UsT%Z6u{bY1)WWuFmjmfak5E zucFL->+DVW#EJ5t1ak*r)@4l_1)w3XfH+I2quem$xxw=m*a z0JOngLH&$yPGCT@9YTc!NYWyQ0ICM;k32J{U)&@?7+QR~FjRPhOY!()aGAGMvGo%B z9=odL)ciFR&cx0FY0#wl5tN?Vr|_)a8}rV8cx!~iZE7)?(&m9*E{&L)X`fPRW2+=! z{LGU}^sL4xch0Zo-raxe+8p$875Y%|3sg2skfI{U;YuJ-t|$+E~S(MtIBjD$Ap;Sc(L##a7l~RycI6TU!Y1e72sU+uz^=2E~|!0$uCl?OP2` zI{9p!-h(C&x4CNP&3ow^oi;WodZDiC!ohYPA=Oc@PY;z=yz?Szf4NeAb?PJ1m`TAjXAzoIx2&ho!;1C9L6AqC zJE3KU51v@vLspE$>3vxh=4`fe<@6w)&4H?KM32K`);)(@rt~ruUUG z>NO4aN|z7Kp2l!J7DoU}O^n~lUKYv!M?US}AxEwAH`1?#D;XR`#@H-J!N{x2*~ut) zW{z~5GS!(B;mc3b9zl6Y6-IKzZ)tn?T_!IBjUh!jB>D4SNj{BL* zlO^j28J0fdS9fhpxpC@cX`XRZlg5F2=dr%WOCyD#$()+!)CBtr2R?m}xLY2BZME$& zeMdC@>STk7+Ep{CcPmC_>r^yjvkN8w{KoXC-_}khLsXM~m6&Kjw$8qDuaA0H0qGXf z$`k&=U~ct_lK1n{1Y=66sChMn98?RsO+%tE%Nq4?#9u0Py`18|Bg5(1=RU|VITe9Z zidxx%q}K!JzHNCD1itRfR`y#CTkp4Twbv*G>8(UY0lPIV z65f+dYd$S3jRH-tY9euiw(z6%jhf+uKGTsN4aJMqJIB{JZ0DqXu;D8<8z4e5O3&$w z#1!nPua8y1a`vOtf)Lm$y_5IWRU`Xy$%n$ev@64VSh0K|@odi^d#ThY7vD$@+huKy zeHkItK^@kwoVhq|ksVbR>jTMKNR3nyOOXlHmO*>6G=3>72MylIUQY4-5j#aNLE^ec z@~`b{N|Kbuih6{>H%Ii0yH^{L^||BzhX{Pt>%+y57n_#Ky@i{>T8Ct?4}WHVFDs`T zMWB%T%eMYM3>{FgAo~XeYc*7?+4ZIJUKtyeFN?P&%qk;`MxQVhkv!vVe~xO27>7PQ z&`(>}fk+j{O0#iQ1&FK^1-rU2>W8!TshLs*#|V=a7dut!!SVTyl$V$GK7dc?HMshl z-1LJjH#}mdZY$}({zNI=5%a8{f*xjES~h>rPgAd}cJPX}W36$xE+Ah~e+4WFxMC!U zHMV6~QjxlyH$s&I13!20dD1mf;|>@~tAh(K3_o#SIkRq@E~}R&Efjv{+`u`!@PHNt z{VQXCk+zN|=^;e!!q%k?*5hQN6;5co50tAm~b1@*9M`?@Z;CcOkv= z*EcsmdZnVZsdfu=4mErfG~D50+t=-OT7CO&B^&$eOxPX1!xQl0N!HxSw?-$Y`0P)_ z8p}i8Bc+z;kux(a8mLyz#{DzM?d2~@YGs}5OTdz>;00|9dnC_NE{Pg*ta6Bchp;${ zaa7$S>&Zh_%GtfBUPn3X@k5O<7(si3cz-hz($3FQE$3PrM$SY2%~U=3Q@NI=dS$f< zPgIYW#OdNbVq_wp4CG3G=QnS_3k=q1ZRgJWRvS={w5Z}Ut)*~OC$qB0 z+s=cD(S5Cuyj$%yRPO?q8hApaD8N5fgb>D3@TY9XQoD0y)y%77p1bano~oaYLKner zXR2-zt;Nb2ZPtQse2O2pqq@r2kWfX>Wu$R{dThjq_qtXGymaS?9b}epX<7*!hYR3q z%r^E#I}U|P9;N5TRQi*P|?FWCrdFU z!aK#7v0s2`>~3D8YMS2ZAQ*L<{M?0|i(@ml@Kz_g%#wxc2~Y+0E9wn7-(%W*qZae| zP}30iNwv~6=9!H%N(X}v%-)QQz0?wZZF0(BAvyOk2{QU| zQ2Z+Qr`xQLjPs0e3{&yklN+=K>ES%gWJxZ~F69^xEY+`Ir+GzUa!o2|L_7pv-;|=1 z+WT~4O9EMfXI!y40R@@cu0Zy}j=kXlu_?`QAhq7DuQ0%5eBabstnMZ?a{HUVAZ8pY zTfa2niP0D9s?2Y00I%!89Yw)i>C#bT?=A!RHe516QwP|{B&+Fl4BX;)8@@d-6Ei72 z9QY>jV>^g3uC$Y&0!ObsEZeqmFPSC1)h|5m?o8+tfuB3 zpm@}xfq6fA*{AAOdqcn#XX77=Yr1Pi!Mwh%9xdT~fMfPFPxTwJ^g&yDW%z#A4EL|( zV4o$_i6Z=8+iem!MIBK@ZKfnePZa*lN7ipUfh-l(@TwKnhP>~5{Y6q3pz_#kQvwvW z^O=GZ0xz-uT6Lhk+bT)_l~|>-#j#ZIDc><1_RTT$y-%UR*XI+W0^Y@8iMY?f4|wy{ z7p7xMzkG`BxB;8r%Uww#bElVS?<-|~+fs5>K>g(=>iO*tscw!`9BxXIM_j_eIYBNe z^xoHFya9nHQ}W32=c5Gn99Aqr=Gy6ih}6aV4{Qr>vL0U(6v#$5T0AXTJe(%S9A(?r zeWgeepbs6aSbDbM%`Z`)lRxP)gaB>O~xixoRVnDn+P7WTiM{N{kF0kUg z%#PgN_dQpMYn`GIKPLI~^2t|) zL&%lULmJi*eR@Zvc>&G`aRsF%I|gY3l@%%*@M{YX>tpp4ddF-DNxXWwWPGG5eZnM6 zQR9_kW$2r&BS&3Tw#Xr&J}(?!R4Bh4C$eJv6Wj{JE!-!!gDeji_30NmIqCNXGs>5B~aRET^P|_$VH5e%0!&Y`_I$K6+e9K~L=g2r;P1DM0uV z>_w1@9+6ez3-gry+Nz&jMoh2T48e-0FB%NF!}P0}H6ib|@G-#JW$bAF@y^n5_NQt^ zqN${f&b>FkV+Qe}R>~;0ES@Zo)!q)Q($2hbeay>%v$)8EMKRWXVE<`R@rzu21GUK_ zfu`bavZUMa_`(wRdfUg7))e5D#|C9Ud9}j_s#ZOUP1@sZ)WYlpQD$ShBPsn8<--8w zst+?>YoiqhtDeL#Q#?bfJJIoSF<}*pw;}qU{wp&jV?a?wv*MvR>_X(RDp~&JkfF}! zcOG9m4;*;DyszC!Ua{^)EPWvWfQqB3swK_j(roWKAjNg+rAf@Igl@wrSx%J7?UiOV z8}+y&3E{Gv+d1T&%jaVk9v#cXz*R+HH;Ts89F1oR^!JA(IghkTLzsIv&K}XnI%W*g znqk>bcl4Ymm1b}$B?FD6rI2(e-~^JgO~-vyQZP4n%Y`dvrI1JSAK?oF&Qm`WbQf-o zPWTw!D2;W9&ov$B$QAF2(gVMNVsE zkp_wQGIVbPcx zEa z&8SW$IFpY3yFsRNUGSq3@W`}OWBG!|M-qG9aU8O6ds02;EPD|+b?I7`M(=7|&mGCX zp&(nt9v{QVq_fW&=8dbn9jvQ>^`u8p3y#&+Hlo@H9O9dMcdJulA)Y@>V7g|ep+2)p zl*{$^>Wf#7gpq@Yw(*_(f+~LBlbF^BuN2p6mQ>bv{uZ< z7ol2BH>Byd-%RI{0jod|A4EMdcjR53%Ip(feJ~>LhufoF^XqniZe2p=NjBXg1iLh6 zJyWulFYeyd?gR=Z<{uE#lIQUbHW?Ke?A~0ed3t*&VqH)NQ62KdA5#|C=B5Fc*H>~w zr=~XC4)&XTPfCOYu8AuVn#R~;~Z)Unw^bK51YYP z={Su~!?(wM&xfQS`{%U@ds5O`kB7ELUsHMsa%v9O+mz>5V7;t;JuxpK7^3<9qc4|k z!cJ7)pIcwv!lZekC7w7^C?%gq)>=+hSr}rK_-p8K+EI$;h+EeW`#VDrYJkk1y~gGkmy)$a@kmJpxut6 zPmeK3-LsSwSgyE@A7}D)2L;A*^=SUpkYbB~c+@5y6e4|m_%jzC=65r`?u?TLEiuu| zk*Cb4k2mfC3;kSUo-Nbe3mafO!x7y*$QY{4l4(3F>=7q5T<%=$Dd}1F9RloC50#+{ z?MKQQkbKkrxP2Y>q19-@efUi9PM%%=yCbZ$p2dn#b`!*ae&S0+tLf2)ZNoI%dJP8& zPL%(9G+mtKvH6Au~o_lpa6ywzn~De_&zOR}_XIlcn+KQtf~CwQ-o z9Mti;oZ&zVUI5?OB|uGKz0J|~mtrV^Iv>GE!vlfBDvR{a5}}mam(P7sn?(sdP6Dtb zKg)PG1elxY8N```jK~qbgQR{M^xby88_Rt((%f>&TFOYN-iU`;W;%uwdjp))6n zryCJR431OwB3L2Dif}Z=eIJuB$_qd1*T4WW<)Lpp1=1t`ISU}oZPp8%u7_wF zv~AwhGWJHcrTeOVm<0oYx51?&gfJg4{BY=*BZffu4Brvs zt@a?XA&J}qr9EL9R6r*uh>CkUdpFdKJ%Ujs$@haL@=>9PwPCamhR^{`<7iva1~gE! zHoWNyIp?V}DQ!feFN1%{ zN;0Euhr3JOFb#g{Q8o27UY-?o4TlWBzL^!=FhHq<4$i7?sC_ z6Y;L1__ym^6kwQ0GF<&R+&-HU$*}6p627OzFnK^h|0q%(=kCelVQn}&cJ9c6rmBdtqMTU#fGYDB3+cq?82mVKQ=o^MqH}H$ zE(lk7Qj^!@cADd2P!Ek`tHDx2#1e2X)Euz@`ARo@3Jh>N#_pG;m5b7t0OnlsDfBPV z#j+W!fHe~81Vxfc{n&-d>IuXeg6ZDvDu&dT9ZGlKiZfh%3M-zQ+3=1T-!O3;^3S*Q z&ALrH(UO|8=L{>8Cj1# zmSakDskSKMQ0?EI=7ocMivsd-#T(wM*N_z-d~BXZ+!`Hd*CtRd1`XP#)9oADA75(4 z25aHJ6|NKE>pSrp^4uZU-hCZut)|!Q_nHT6IMAyFV%V&P*6bY4@PTeN-_hA%MlN0UJ`?oPuT`LhT3|4K5;GZ zsia3D#k~}WdgWxWS6}^naoo;vu4OAT`}moQ;C?&V;m+sLTJ__9i)+^OVwX58!+l-sRSBm9~o+))qPtB-q ziW{1-q0R%DszThtr>eQP7(PGqTS=*WXpqucSn#b8^!SLL8YM66T6i>V^8A-r1uC~3 zKfA%O+H!d*HLA$1*Yv7GXriEFBH48o z?D868PNLbejN6p&9>+1UU6V?m%&Id`9;@$mXMMM_-T9szHnz_}W6xoO58i~n=+Fol zqG3|P>ch4k8yIwe?!3`h**4j@f~u_`-x!tIsA$35E~-wD3)6V56fHcC6O%G3XGg3> z<>Mb5NG{4zfG>!#kK;fjhBl*6l1%nRwsKo;Rqr=SU+Y&rwIBJYRjyMr!9s|fqOw>= zH-09@kZ3u~;=fpV=3n{7s655`?fH1KAHL(ez40)EO$;#zLxy@;^}8D#)*wE-vDFt_ zf`ZxF7?x53?YJ`E^O%U2!?DfvGm+V?94_LnMWWA|sx=(8gS zia$keN*5)&!j#aQ1R{~W3I2^gUc9iqV-+BKQ-zIz|l6B)rxRBR=Uur+^-ZEa+`cmBi- zu^E7Tiv@gRk}S(x^@-FI4aYR*!^dNG>|HTARso{oKCx{RT`{SH&?A5a>-_GbwZ-El zM?7d-tU9J9>ZhxkrEg3F*LLdh z4qZY-cJCu|hF@`#mVRQ5>6^Ky{JK}~*`LsIhQavRG7G1nP_Xs4T8(>G9Abibpr;kq z_iI13IS9EfJl`&}kKXkiKZneD&R(m0kTTO*$*%5ct@O#)+MUY@(w-Sw)%Z;*k>K;% zSBtYIo3>$WChb$zRmR7r?Hd}7z?<~xPnXNx7jGvILN>zr1k4pjfdW6)b}qn9x7%S`nY8nG>AEs3OH}x76f?u-Oj3Hlg62 ztK4q0CYCX#Q5tJZg4l<45X+Bk)D0v>&zn_wxmUq*MfBw%^3 zhB@Vm9JlIE9HdeY>sh@%wi#X(a4k}hE{WZf^7Dc6%67S`a9OJ{@D+m0cK9!@_|W&lyhJ z-08*6BxoxDe{2jv)c^)#^4 zLLqt7BvSmf3QH*a+1!QaPd{SR>a()@lu|u&b4F!sqhRGJu`=ht%;BxQ7enzWZF^zo z($1BJAAK?It)Qyv?1k)axM#8D#$9%~b$C>Btg}xBKtHooF{TjLl-3C`nIs073P|~z zeIUNR(v$BfZ&0u_7YuJuU*=Q2lN4W)O7HWPldHBZhEh5*W|Fcob!Ngq|%3>Iq->VMV z<7EHTy_3-dlROx)99pNMjcFp@Q+6KU{F>MR_WA&)j3mS9YJW-LL^g7)8ll&3J+rI( z)%+N0>PJdVZg8l=+(m!!im^ODs99qa%x*SoHta|x!l$yqyz{%e@Bc0TOE*53hZvvP zj?fn9b8U9oNGGfuDEY$8`BA^Jp`P`VB#MdENBJ19%?HS%LC$gdcJ@t^k=5d1fe9Av z2RjzFtsA3+VJq_(mweJ{7X#^eko6`=D`@Rw=>|66MiyR`MSqk>uV-B0qhW5oB}`kN zjbvD_Lr*8vzPtU0gxq=jp4dEn^M$wDf13JTIUYaDL_1mpe;AAY?X>w(P)JfrTX8!2 zf~0K-c9L2(u=ulg#tqtU+7%t{8I#AxugGTDja}yjelf{V#n1-q_j&!dL*)0I;`1`< zGb$kuouUONdgwftIHI?;HJq}pG?W-()Gam*svIETWU(XOB)L!>;qr8Gt|x9qg}3rp z@IHzpRBz%EKn zJ@1_o`Jr05QlWB*X94f(+vb|=6M;g*DF@Qxv9s(8Wiw9rIP>E?(P6teliOB%!{pA% z8@b77$_AFmU;%qiy@t=AsSWSOg27p3riQ}!1?BBDXHol}?ukO96raf-Uy7aeCFDly z)~gxa<%$Kr@5xPn*M52SFMB-vYmZlXTaN$JeK~y|JCqy#(nYA3q?0;H={NQ&8u__1 zPo&*eQipvfb9`@cQUY$KRKEB?EPh2NED^zysck>-S(k%dQy+UK21}$ym$$XKrv+vp z^IiAT|8UR0NpEa8Cq#rY;nJF{~LTxq(^#=8^Q@xHl7cJTZLo|t83UNjoNM8 zc_(7~)D{Oo)(qU$I)kUbq5hKE zn%+#wG(-#MHSK*r%Z7D|M~Fw6SF|g+`3E2RBa;U`g1)j$E;MZte1tHPfj4D=rk86% z^QY`Zmt%E5V^|_0LF>!XzFYg>TiNuuCl_P&Pi~^?V#%ruPoGcm%Q|EDb$Cv1ZTiE1 ztu30_p#*qqw+^nX9bEk;Jb&t1@oGt-Yhsvn**pNT4rQQeefshy8SK22=C5L6>zA{0 z!EkI2511l{*H{mp87^H|;!D@Ymfv%6PS$g4&evp1l@rAdrf7qtU7)J!sPy+?RyTdd zSKjbKR9o`)EvZx7YzFuV_9BlYIwJyEc0)Y5A!j;mH~`3!)*e}EV&hoeWpm)5yT z+j8KJD&Q_5Uf13_f$}>kLpBext~3pMW&|A{^=8A5Xu$`_;|9DQs;9lCwEbD?6XyV1 z6BO58ebWZ>awuyRYMCkxD`cUk?2xpa(u{8Src@>4D_hyPQFrfl{`%-l)-}4iI5YdP z2+zWCfOYlfgX|=|g~je_yI;LKob@M{(cB#;n;(!o9enn-_AkwHy8T{V)C0xUe1*mE zaIhgaf7HT{dhM;*R*TXLS7}bEsb7y04-~Ny91^2SZlV9KACTbcf4V4oBk@XM#?&%> z@>b5M!Q-cTCjY|qC+83zde(eTRSp)8 zXKA$k8pX&$see>@tKuwAKqRGn>2vnhohqJ_i?lnO$O5Ua8K-T1MR`9ns^k^rkeTn+ zPQ!k5wO?g*%wxzeH9P^;47{AgHc0Xi8%e?tw`QG;q_u>G%20W5qmLrhB?oxuV(-nfvc{{6PKLxYc)4#r{@mW^l#Z*_t9e3Z)=V-YPfXKl<6ir4Ij!CP|RFxi` zh<{R+%=TB?JqiZ+DrpS^an{t1!8E75ooKam&<9W&-iLp`zq=#PRI`bzU3-VmL@KU&?8d90Q|0UCpfE~66zZfQ8sRz;N%tUr(juu z@$>*Q!pVKz1`z|P>K|8VG^}}t->VEgcxmY{h=GHrpB8OB75`xt=hZiDp64IEI_!F& z$G;qfv~U?USGx(`G5J-C?I>Do0SS8f3t@0s=sKdknOOSKEOwkPRX%a49bWolu~sP~ zE#`;UKsEz0VT5bgbJ&rU{jch|-`=?&&n4oqINj+_gMLefcnoRP@UgU-RT_TnhaQwt z;s7id&19!_FuW`W#qGB-K7uG48Gy{wB&9`f6dM0dB#TpCe63-C%_FBjf9-V;s|jIS zfrF6}-tQDO|40fS#oUX79iEqhyJV;zvQ`Y@jr;{{&U`QMa|K!M(Nmxl*nG1r#8&1e z=|)CH5SOK17#)D0E3s-Z0X4kkjr`>XJ0=8VHm?V8k`LWBgWP&_;g?+fc>Mj;c`4C4 z>6n!+Jp!=3KCqJT-EzYGv*pwQ8g53&ga5L`&_@=@$fw^}G^RUQ9tO;$>Ni>a$_gKd zQd1H5!yL1&)IJ4Q0VtC#iIv8-Hui+TNOLZ$`aMg2e0+N{jwQwB8!ZU5NveZM*?}QSCgkBRuhuniY^9ne3=9}-=eeNHgkdw3b+N-a%_g-s* zfagPA^m0qwEd3pCYo7H?pNhUP9+L` zD$#w|zM6l#)P0uadpWiy#>a041oO^ySQ$nxhT|l!azpF%fZ%6AbA&Wy3Y#bQCZ)Rfa^PrE5$@! zY|_y#cyCxq%j6~Zd{L2kyJc>ew)0`Z(PMFxcz}U#4%%PK{TBViVaO%RfI|UuL2n~{ zpC)Z2JILhsB)?A#I0|uXJ%B2KRf1ulRtc zVOU31MyUel?MywTo!rKM1Zk6bx8i;7PSnp>tv&j zl)>7BE)QvZm*KyT$lZS6+xIDYEKJ7P4siW6lw@=rmLALGZ9*|4i=C6eUO;!FE=Dc( z*>7(pHkda)0DFceYQat4ww-cg!JnGpR--HL9@|M&&p(gXX>7yXZ9yCF28-Lw_Ma{9 z0Plz8YvIbJ%dyLEWC>(&t9H>A9wMfe+~Kl@pWOKd2Pgc!ab@e=5W6ggbyU@$m1|T2 zaAuWw@N)4lv=BM*gFdsGnPX+?P$aG@7vZ%5XJyP>Lrb>aOvL;1>d%_X*N&zz+|tcy zp-~_D5@Oi%HQS1#5Gid`{OQaWr1YTqXIuN+Kn9$s@eVVazXqwQiad$pUdEAnIVn%^ z_6twj%oe}=@Hu*EwtQ+d&ySOzV5xnMb$^5{j?Y+Rn%$eL&>NGN@hEaf*E(;}WQU`_ zQ!85ai%ZT@_niEC@@#uWVw|dw#+Qt8ogFGf7YmubUDi$8)9S`78Bx`f@a4S%?X{O! zosCwPM{k)UvLdvr_tK-P6K$Q4{P961v;8y&kHX5aL9_7V);&)_1Rxhr`v!cYFx0{45w^AexZr3mYr*|n%rS_9G zC)x0-JbWg(kQJE=6m1FDS|#|5*kP)S&M^!|PpjCRl(2f`M-{)9Yb%c#l4Y3X)FkMZ zUzlk%Dqiv-mS^zJ){p~5F8IYEnDlBTpR3x}h0B0?R9Y4x;%k+E+$h725Qs4*`$g(o`@r*G%u1U) zLWgHS?bG`#YAUjdZ(n!yp3Ku+ePNh1ar3he@iU-yYy-(W^SZ)})YVd;x`}ILx~mRo zywRJ9nOi9R;@drZ|3zX9v9lF71B&>wlK5-K0;ZwsWFEAJ%FnU=)wkB4S9d9MbexHg zh?{Fp?K65La3ovt@w_9e)0l$|g%xx0`zjN|f?-Af#2c`glbI)ZC@%L1ZFG#^Zxum2 z=#~^gQVgYJ0m)M`w0GC>)rlHp0yU8C_L)A}J9^O%_M~0S-kO--;F&H9Q=)L!r6uxu zhs2t;*Rsz-76~@;mhrD10BXyqhqgaiO5G zlUfE<^Xw!q4d;ZAH%PX=KH>%b?6&@;Ex+BBkh!jyOI2{JQs^(i_;tlKpwvh01*SSo zrM=1#I{)(DzM>~QCPCTjo%(2Z)s4oE<1_h&-U8|=g*rFz=^ZBCk>&sR_l4H=Ze#G9cXA6$75_~sc(1^{^PCq3PfmXL_2d5~6%RgTzIjr!aI$paW3uT&s>f1MpkOQ| zUKV6fc=2opMe*;!=TYoS-l!xrk95HCh5v@P2PBDRGYn1~L80$M_IH_1&oAZ;DpFfW z!UZ2Lnquu=vxXmoD8==J&#-5y)i$9&;uWj=X@2S!%F&YqczmY?b5|YmjE~nf?1PG? zN^IZbIX)B`k6>_kx-lB&!r>gWXkxy#?s)W+g)jf*FDDlHd8HhIq)iK3ZB_ol8_uIO zM33hEj;!rp{VVYb*q)QAn-b%D+5!) z+g~4X^N;-Q`@afrYxlB3DxOu4Y&77U^pzH-`Xp2M@fcm?ot*3Sj$!ivwQxz1Agz(9n|mp#Gvn`jcrBS;1jB zs@Sugibpi&bp-){KeG}Y?#chnw0{xP;17Z|VT1PTksxp4$D2UKr-&zKGBSmq?fCzm z_E%-N@0WOhlg}WtOxgEE{)bfFuQBQ`ieULonKa?E)o-xP0=cczdRU%em0a%@E$@+B zfsw5xIC}Xx35Y@Idbhys&vJt1Tb7~P%2a>B^(SXAT?h2S`IH{|jO#b8sp}%KXnEe?rN%C_KN1ZYq>l|8evqll}Cs zX#YXFoXg@@rK5j|o1fnO6Ok9>0d~F6iD?0jZT$J&zkTDJ-#S{B8he7{+F}DQ| zY_MP_v*h#t48}PXIiS9jYMP1UQEB)K(tmtVy8;w~Y*`V^r#w~;`XAr@3vdN40I4cL zaa8{oIITM~OS8{$Rm#^6H7*n;fx zl2!Rg;@0{7_`JEfxxD=8bqSX}`9HhO{i1)w0_O>QCV?Qh-!Gp1s(X&;lb_p8j4@un z2NPJ%(fogz;qOOrj(c({GErht>}a2k!%b3t)t}_aZ@x=cU#0XUZuGzXPml}9i9T(C zq&d?bYAEvSsg8jo>T>>p!feNmh1!3DJn`1`d&U`x0-$39|BTze89GJaG<~Myv=
  • ;lrGM=y2xykGsXq$naP8D(Udsv z)sSvj+UCOGks`Vk+k1;1MaQY_7!T0arXp?X{y)+7*5zx|L!O$531jx4h;SH@BZaB@ z?1~a&l}SRPcso0bMW4j6A-cTqP)qS)WdBV1N&Ma2MVD>mT(zAp*B#URs`Z!2eI0z( z>kS-Nj!9dsC&{anyUEMJmjBoh(c$`S!FMR#;)s&Ak6j}#16SpxEJLW^;|9-;$x%wq zfoj1na`F9>YUyITz45INl&&tpQQ%T44o2D7;}ZC%Q4|<%M}HgTl@8spQT{b73M{u> zzKx=xbF4$*zlY^^UaijCR6}fpUD@I>E8tiR@JL>D)vbT*0og#>l#>}@QA`+uFv|9L zn{-$8&fcL$Tn8L7MhjR??WH$v$GP$FH@@cQE?lzOD%HR~YUR9SEl@wlXiZQ4KfUzh z*Fto_c&0YReuvvy9YrZ%Go5{hwGaAbEtYE?8>Q-C6qlvq*nb@5ci&-+gHfIrksn*z zzlX)pH#YwO;f9*IM-{&BI0&%q0vmAIW8)wBY3&C*j&B{nCJW*Bj!NGQ8DKJm9CG9$ zjSn!#e#am`7PJh0YhErdZ%EEg8Xo zOclCI02Z!u)Pm?x)4roa98fF~U$)xczh9X@yf5G=NxK99E9*Y37ti>sH-83jM#CWW z0KYwV(t>~f{y^+~lMXnKqZBUoPn;(@jW_v?e>;e=$2#Gy1p^9Wyll`!`Us;PmTv&k zbb5uNgC(VlWpW$^JjE1nTGrZfxfPFM{^;UwbNN>KOK0~B7fv$dWGuXUcid6wVgbZk zvyS`dB*y@}En#40&?;cmAHiDIJ;!KdrgBVGoQeYg<9s-DY~9C}*KmH}o2oD}?)$|k zr?=ynOa2Mbz*=+h+=wIh#bvxB2dD-5%H2l5R;VzB;W@6wsCIDj@H~ewt7^;sk!!JNtD>)F9YMhDu zF+TifT3(JR0Ny4eBL6S%Uhf*g^Qsc^dJjH45rOuXkCwLD$`G%!F8l!&(i>Ku+TyuU z>)tc)iy;eZL!3He=-zNOT;b$C|36vE0}QSO0x%|(_1`+I3dR@|q)}ciu(i6`wp?xCjHmhrG6ixqV zLIC`5Tn1KkQt;(rPV(KNzdiD;A*t|~?%?mg0^qQ4#vp)IO>OMiIUNSwKpO9T#W(Zv zNB%MMa(vj7mjI>KR@!--4ymviVc%h`1BY*HWBfMVp+~UZc2l7IwFYz9w*@nLVnclJC|RfQUe3`=ET zvp2IegpAA4!G9M+39+v{z3e*5!k}~2$dhg2JxE>dVdb;bWdZGNeHtfxCRPSaQCzaQ zb(K#}+9E4v&0Uw|J6XDD6>PF{4%74mp6>Nm>WZ=ZoPe9@B_Mh9>3713<`W1`M8u<+ zs9V}jEfK%05t{IahSQRWGgL7u%(a`xbA6lPZ-w;t13v%=A=m#%JnyU}1p7^#=rE`J zMX;EsqWad&9w9<*lP)ya5&`Pl&+8mnNt&1~C~53${-{qjr2Eq^_jLn81LN`0^Q1>u z(L~+A{TrJ)FF)-*BnW@;R?6ThX_$(is})RX>vI9vb`xG;Td^oyWukB!(cV2kuAQ2z z)>WE03pLxdsh>+tg#R_n)Js5dZrRc3oIbpQ7$@JXyo10&v*z@fUwa_FdRH@H(nfHY z(k#0&3|IL;Wo!42D3HgA(ih!u>HyHzM4R~@oqrOiz8_mjGTeRHxyKli46Wt3V}c|B_f=yBw9&2 zG(*W%QWExil^0TO-ubsoEs#EZ8hAn zV+D%0c*p-G*$xZ<96{cBC{YXi!JQ88y0wPh(W!KSer z8+%hj%G{pFP%y$?q1C$|4P!Q0FpeQ8+q zS|o|S_d+wpxJw(HVk0rrhj4jEylwC1X$*oHuBY3T=l7@gWk3q}Im4fMYG)6dl3E=C z*^&c2H?@7~+L4yaZQ8yy!ORsb6PlRy^Dg7h3x?bVI)D_UFIncmr$1`}P+!iJi%ID@ z_3ncTQ&sk8>4HmRQsrtQp(kcrD1LpI62!}@C#?Tu_0!W5iQZnr$8t@jnIS{0mbI!#Y%KLDP4zO)1b3>eJp5r@71~I-%&J!=ez5B@5T|uTds3GKUGNz z2EHoaJIDB$P3tiBL|x#J8xL*Z)G9iSToD*o6!w zYgZ)K#p!h|zMwuMpX`AbGAMYIQ)7M$VKYRF+O3D`<=_iunVpt95K8S_u?vGQNU@*i z6}US11UW0MXIHu==2mK>zJ86ANpo*3`T8E=(G?|icF-4qTlba%6AVJ6B4MH@-D;>B z$-|mt0|<20*JB<{D)@26oWC$K>vb%|Y>`Ycdl6j-v6h}OTOz@-RLY@SluunvdH zOV%gn6GcT9Y-~a+URm9R8iU$z>W@4Gh4?pNnsQJ`haVn9Egs2?cJ1I&&tZK3 z=6mThnu&=}yJCN2RnE#N>>wDukz(_ggn#>x;&F zb^*4xRkk2*KC9P?zWITs#?eZ6GO`7J7 znHuSg{DF!orU-tERVU0ALql@mq(+}D167-)?v=65h2FlJCAf!m*Qeh^;>IHOU9mSU zL$1XrL@s>^cZqKIwAW9%xCv+?c%pIv|UzB;uBdfNqIzcaO=ySe1WBli$$ zv$&lWqoIe7?25t{zBQMN4^o!8R4z}MEJBJB3xTx8&1~v}_{b(E@20z{Eq)>a@F8WQ;4Bvi}8#FeyuW zY0N#DRC_On`&QGDy46IX;YNUX_SS>xmiRO!cfM2AcZ;>FTOY4&neoG+?(}W8z9ZwI z!CCqboz|SZ_49J(4Si|)lYAB{USMis7Z?axyf$`;c9PbC)u}Iye3%O;``h+I{MgQ>@aGKCE$ zvzEFg-AYBewGn2Jaw&5I?1S|7deSTM8&NVLessyjldE64?F`$7vPHNB?|N312yK~Y zGoft1CUf3{^+oHcK_a84-y}XDAt!TKdjxxx6GC+|^1b0+*rZ^$ZSvP=A(iy7OMI9=P)l%gc)o(xuE9`lajag57E5jj#y`wM4BWf*48glJ|IhT886-6h#s_ zaC=1pPu?rA+ir5gXyHLdRV@?Sfyk&JLPZ8KLCDgK)0~b>2I!&V9}aP+Qfkao=0H zIlK0{$l2Pd&~5>II{;4^7dz^R-o!8P*i8~K?i`D*;lHxEU(D^;`?5pFV`d~^sV1X#N}0astb=r*P5C0WL41TncJIHJS6tOpqFxv3LmQnNsG*X zcn>76Y6FGOv8T9|tNd(97q0k@K%^`uX-*)tMBFCzue&ry@-q5HP-|{0@uzK<&L+T( z<|zYur`EEoqlGy4eYakFo?Ec@-P9x#;lLShl-763{V)Xo^Yh9@S-p7Ga6@3F8{65T zvPw*<%%MdJelBEONA9gp37p*tvD>0s_ zK2L!K>O$(#)!b5LE~cq(eS^roBSk zN48%^Frx?q=~n{jJL^wRd6d%eFBD5)dL~^@dy>n}{oJh^ft8kUZ9^HPHC70D( z(TF%%(8lX|^0gen@@#n_bA~G}E&U1foMBDjJzs9lLq_+Weq&vls5jMPM$cjEs-6?n zm((YH>bzDh&6~S=KF+Ol4a}{43H$n0+osHEgqVicbL7$Fi2ikjmMn7)_Emw0%dYZO z8GSVy^T9%Hb&NchboW-N^_;R8V%fp5vZY9x^E;)Ko|=6KqSY!W1+Bf|$UTs}Uy~oL zoA|3KdruW%62%C?HT7=2h=jr$a0FR?idqJ7QHr)Js|cG~riU|7%6#uKZ`JghWAD@9Q! zApYAAS*^>qC!xF3ABRErL4AqV9 zx%s&Q6)8qNx)T}f)Je+>_~~k_UMy-S_&C7JV(9z&^mBpqt727GrMR(glI9+u<;D>Y zR)+0&(t{G)1j5d;vBB07B~m+@HUUc)qUodHg-pm&lV2e-?;O0Vv!>uM!e}&_kZ>2J z)Y^Q3Dx$kS^^$w=Vwk!B*@%)(05UvgzE36tUvSH-=j#j|zFL_>2XuC=oT+#$bwjaR zgoZcFAe_g+y!;NGIhO#Oc6E9;lS@P+Da4F9AUohvu*-zD?I#~z;`R>-JOcdLFP z*Hmhbyn$`Z-1+cZnQw}wm-9VZTf%Q$ zE3clU_ZL-C}5a`3)qx|{4yw+7yBeSBZ)jq1~Jc;`+JrL65pYj!-JAHMCOoK z`C!CZJ1Hae)vWwYqMDHGP|7}ad4|@VofmYcZ}+&L;n{Otq*f;l772T+{=qUXXeU}4 zh8xPjy~4fkWSp1Ra@lv0m&p*RN<;Yc3L-oO7vxWq7OUUW&X8*?a;)T?<5Q#uyK_d1 zh@q-LZlA;S#W=ymVuoh!uJt$x&S+J|W-0pIn>bll_j(koKs5>0R>G4jl}nK>zprQ4 zfquD0Ud*9cihYZp&B7*dQ6#uQb9LK?&R(f$2^16-E|UaOl)rAp2|>sC$fFAECuG^= zgI#~GCv-3_8WqsPIF&{bO4f;pU*KMBs(D$sh!3))lkh&*@Zv{2%pwcKMFg1YGK>Fl z{V9^CPmmGMA4j_{kuPfrgQ-;a^vXZ1L)ZH3_cu;HuN5T9gc;FFp<*5=@i`>v#2nffx+6h%1?(O7*^`m^)6zq{7rCDs`JgCkO_jtp%W9rkoi+e`#eKpjv`ZPMj~&S2FMHC&z+0u1 zqo*U^*UY?(j0C!5q^XeSflsQ^F)~oE=jT_3hih73>628lRL=N}G;fB!!pys_;EYyB z?SvFF3jF0ENWVAFJgXC4MW|!_kSTC8LwTVr*o^O5iSmF&vGtwxi!D<@09XZyo# z(b*>o@Z{!wslc_!KQc?wv?VR`qb*UvSI!yEmpLxoPXwyJC6j>GjwL?g{wG3wVpWSQa0k9y^r@0?zNk`B^h6K zvC|axnu}b*9#P}i^r~(4nB^qgso%v%*qa-vD(wCmNi3(;s+W6_YG4EEEm^YGo)c1Z zD}&RejGHMxxT*PSJ1Qo#XnzaM(>;*T6t<&J6G~*jJuQ8&xdxF$WanKU2bE|>@JCG# zZ#EQUa5{5VRo^Yq4p)wI9^_#U&<_dO42hz$)S4qwWe-m547_GGw}vQ4Oe?G8X2K%4 z<${uf7NdFk<9Q|(O&;$akl zzz-b`hpEZOhVV3Uah)0`r!eJa()&3e0e(N>rszY_=a}qzu&Hy~5^weLHec5Mb$1L; zXJ^Zb2>(MY>W)G%AERl~J9OJM-o}K8Zq>WDg;(|AC~4G zSjARa4e>e-WTQ}N@#?G1%$vwaPifTamdB7OGPy5)>wt?0X~?>y+8OV;rXiie?-P77ZW*sjlJ5@4FNWgQ;Xm&$2Gh>Ry~)@7!x zVpg$Y>AM+&SEV(AM7&3nuQ{DhlPj>ya&)C{@thiJnIG2?*y3UbgsfjAe&QTKfd{}z z-lj;?>kfu>DlLhr4)Yhw<z;qv_p}w2A%D2rh&k9xE7mM*e~6;m z(w7sE5_I!f^UxK8Tvz&@3iCb91mt^s$9bcVBz)4DfqbH-a;v^Q?5_;Xj}N_8n5H-PWm>GeQ7Js!E}B}YaAAuoTACtz^%r7j9wlr2UJaZb57}n zej>Ls%qR(3`N~U@cOmr1(Z1$pTYcxrlB(W&>=th)h#Kpq1?l`m`yU|5q}ZCJe$NV1 zKarixli#77w_hH<>X59J?L`Ns`fNhKzwn?h3ykek-)f#ue?!%L&#gaMX|ctOnT4<0 zbm#Xw&xK6JuMK+IWvRLEl)v7X64~&0ef6dUvh&gwrd>s5UuaM(iS(j2o&D4L^nt`Q zY=VK(nr3#H$G$AqwEbd%m$eu~aQ2NSKmVSa801MA3#|{^j+Qr7KVBKk>OX^z&Y){|i z$&)y;RSqQC6tjALa$*L1AKU%qczyd|V0wOm2;7R;-Q9(zp1pzL5t{zsQIC*7zKlf> zuZ}}^Z%?|S_6sH_Sp^U@SRf07uJcl;U#Q&?%Xt z7fvyC{!3V~B(naG0TE_XtmAyJ{#Jx0)w?7HjvTtBYj2}Ea{DbZ>RayV2Q|QhsdQ_K z%MD)ey?(=B2TPfcNl$oUZzHz9(5<6)CDK&f(}lB|G@q~R5z|c{5}g72TEf)vS&_Eg zJKNNQ_MQ$-x0(26StK)Vf|*5VQ_V&bJo}|)*E|W?uCe3`_L?{2^IuWn@9zou=3}q<^6Q$$0;jh5qS7j5;3Q#|dkYIH5$a%j>e`W*~>KiiGaId#K1L?aVmG6J8RV zls2=uD{2aXG!qI!)X=N)0j?VLMsy9@@ZuT4ZlIM#2XEtBy>}lEh6R*&;M=*o6@ye# zj3RQy7WCz z74+S$j=|w^>~-D2na-U5*%BN{8-81R?b>O*%P*^Kw_j%LG!)c8T_4n|a$Ga5rgBH~ zl5%#Re7@6oN5r4^*2JeDIY`Hh+P8TrO4`xcvq+6QsKBNAIajr_q-zOMPg|x9D<#E| zs>-*RBpW&?^FuzcPEdd{!~{Vlg7?!%&x84Icb>8+R)TT%VreX91?u|eq(lVm z`iC)A?iF+P)H;tR5F|x%uvt_$U;ZofIuxSzyfxX#{_cZdc?smCw#W$aO+OuqJ(G%E zqYIl2&}SwdU+fxo>UbA(+dYZ_BUApg#4ir*&v8ZN)|J8^Ol)+(#G6weIDcYdAnl8* ztIyj6Z?c(k3H|ioJ7HPyS^Vs(FL%dTDFv03iYHtREu}!E4sBYAL8^I$Xu+kN(P?Su zbf}NH&8XB!biJEBvhYqS*S(wI5l`CX1PQW0BR%p^njWi&+=5jsbXmfaeu9QJP6mS9 zs({LK@hrVrEGrt4QhrQ+Fm=o_-m1K4uR9z2{&1#Rc7HXu2D$Lo+esGzm+DRju)aRT z#+p^c);h*e$R}#Ypgw~xiD-2Vn(#NVm#+MRUoeyFA($pgM1H#8l%I7A1N+lOSMW9Wx~!Mt;52-cLw*WX zzdcG(_Enc`-0ga3*-Ut2DpI+DneW8J2@oR0%t{p(lcn6^Ax(iL7<7Q;#Du-FI=h)%4C41>|>b?6*j zBX_@L4TD)cq<7Iy0ix1eAYY%%^?AQQNUu40%l@f7rg$$e1><|#y4SkJuV(UQ2C_bQ zcfaiM8g?*kM7`0HjZYETS9#|p>c!InLcbDA0(hfkoLv5D$Lk*lWMb$5u5dVIsd1F{ zhlA_O8eMoHY#+8OX`95F8H4WIO)Z*+Z+o8a^UP#ice=ceG`w9k^Rv@tNFCM?<8&)16Y^q(Bhj?EZ5s8i5{3A2_}Aw+?rpEv zDa+yK;hf(#lzmULD`W>rwXno>mH-~b2NeoI&#_hJBz8U+)XZaKZpaq+eg>`t!V*%7t z%jchJCe-_U)(~g5@bo-04Q!;;1T^K0>t0^Xg+i`yp-q#CXtiW8=Nb9|D)_DAjzSc~%eonlE6!4Wc0dNNBk7e)bm? zH}Oz?k|Gy)1d$#(`&Q62c=_rGA&}^+OqH@Ur+>u>*Uq9WnebdjoX=?`L@=YW9Pc%N5zB9;u>2+jA~w5~@;7DcspuWS!AFb>GG&){C0 z#k2VF<`<&do*H3_xuZ)Zj8n>Pl;~d#RiBN|q@)r+^z}S6>chDdE&4}!m2KBPEg4Av z-2E!t%U{?-j(d8bfIrJ8J*Af-b+PH`x`eWK6gCmWOn&i4MC$vT@%BjTdsf-g>PMoT zHwzMm> zX}(x3YQ5%y4{Z%h=Gz3S~ArQrILcLjTMLw z`uR9`bp5(K9a@?;+04S$N|=p%pMw@W!x=3vX*^<9=hiMDQL0awj|kp$|5!ch=vr?| zXOEUq)YR?ctt*lLyviP621y#`m@}0`?rcN?u8Ii7w56w&L)xFg|3l!Ai)PG5yXsHD z|1Vv*QT$Q0+m7>fPfzQfk0&R*%Dqf!lgHAvxGOrVBGX-oDK%#P#!M+Wr!4OCjA7e~ zU)n5YOSN0w%}*NsicNE{2Mj_rCxH>F({$ONH_P}fSYoe}&K~bG48-@^wG?`vE(sJG z&Z5;?k@k8!XJijL9^tXPS^H9MhvEH~47BLtMngBX@I)PGqQ*xkZeS zpF~%l+uB={V?&chaZS{8e2{_QW2#;5cg7Ue6}4}9TLVIyzNkb?Kt9Ff+Dl(>d<4XJ z=4@LWGM3ty{i(%#36xj+MY$1|`gH=j>T97y5^=3wX<t@iaBPgo}6m&7-h}%-b=qzuppJJCCc$Q>yWAd zPA3&TZHbE#PE^6vL3c_V+Yk(iHCyhfJwR4KY!6-CAMO;Bwzu`LAuA#wvv)i2(?zNa z?=G|O$Gl+sK2R;^DQ}9#O2&y5XujO!rr)Jz=EB|k^`+oB&W>u&1u*SCx1m{raM{*8 z$nA9ayo2%ioB4uhQ&0^%@2wCwd$E04zDRW4OhUM!p3>tkhx2fgv{{EcO_Ox?hO*$B z6-n3Z)WC_@9|_T^4AFW0{kF~=N7X~i5H*0Dk~Z(I!rjPqpG9Tipr%f)01kS;c9pXl z_$o1B*(r#!?nX|VlC1RU^8d^m<;q_go|@+#SOG?`9n#j%hK-L8w&nY2_qv{CGh%ei zd^oK)`28q*r!^BQ_V$r;AQQMS3NFGZ9^|Uy~F_JXt)|&10BS8%MR;1+C<75v_lOYV5uN9zycz z3P?@w4$qth5eD*_KCK4VSMNyf8ax$t8(ak8h*qaTjbb`vfD2T~B%{(>hhYzoXuk5c zQD8T4$Zim-OpC?_s^$$UowA^AkoaBS`mX^ch7hAO`wC4~3+bS2eUtCje$Cqf7 z1Ww`X(b|9Ncnx|6;xnHfg<@#QQH((s^M7#guU!UTvh!G-9fhT z+5FLSrENUE$&IHkigV)Pw?8WS*wvq<<7bpf>lGn&+?9|SDT#8=n{secLO*-f=+sk? zCEtBNgs=Ni_DvCuO&*Z`!k60q?&;+7+g`;m>(^REviBpuduSRdcodW6nCV(d2UWH} zUxzleW+?`~f%btX4!n?cAxqAw70wqwTpVXut>b=}xw>Ay!(r_Y90iYvS`>mIBRoF= z8ex)v>!|zpF6R_?qaTJxvNWnxtnLwlcZ_!mRZ!pqIM~~;^u@~98pZNrq;0?Oo8p3$ zBq9@jITLfrq#CY@V&tQ^*$AS^6iBtM05}6lA3252$V?a;y*xrNoNCOJeONw$*n6DF zY`-LWJidzTFpz*Wn%qk%!44E*O>x{NZ(fUd`pD`Y-nu49sP&lePad-G-;vmJ0Cfg^ zNHf-Vz%nYjSvu6stbbm0my+4BR;;-}8#2bQ)p`p-w%P_wC6>3ZskCny!d+|64UbOn z?9~c3vz~dE6w*4l(g2;~Z4~5#M!)uD>ys0=2o#ym)yLGu!dW}6~`4Uy&GUXU|XP)(o z=-yJ!DPsLMT2wJ#MKLogO1s7PVIr{Ct4TbuL6$-628#jH-Y>X24c=W#6WKi*P%Y|w zrjGyc{jc(S*TK4id_d+YSBlBa`)rWJcv7b4b?2!_)dptu#^6p#KuHaRsT;eAS{f&BRt}&t|Qe}8r*qVFK#ao9-JXs=Z+|!)A z61}4=GF8aV_2NOm(%!C$mRs%m_Lt8%cGcw%g?jl59*$a$NCzcET;m&b%Sj^}y6HTd zGS(?Oaq``K?IOa{j(oHKZ0}0Yw2SA*fTf+VZQh`hXV^ByvaKuG#wp|<$WU)bsSIyqs?2Ev zZ`lN{vP+#L-BY&TL zO&{<@z4b_y5l63))k!Xv@=gL}bMbxJkx-?o*)Yz{k$leVL;G1kV)gtk6PplN&+`Ut z{`tF+g~I$IA``7tlxt6C-rt>I=bssw3B%FS(v;?PcYU?gi$uMJCJZU5pq-G&cuRlh z;1U#WroEXyKPg2XlQ)tU))`M^0qkb98y0Q>yBSA>Y6c%jvIyD5%Z2^?fZ##OXfg@S zUUm`kD_G)FQ??zuAUHchjV6Ia{EdeoD=Xg;P-A#7Szfyum#s2lFEe5nK<8#KduP%| z(cFi=4rvzQi}2)&RHwO%GCCtkq|%Z@DA$^_>DTZWJNQ*(pT55&S6+q&*IhKTZNzNnFb-J?Y#n5sY3OXRJUJ5RV_zG3GLIh%bCw?AY>xzaCccf zMqf4*Mbr5*O5$!brhk1|$!EPeB-FUj8QGKlthdKC8A%@pc)PmNIhSNn(ESR*JN>E4 zB@V)`O7_|A_TYLv#rM6 zt9Bq94{ay2h`_68r}mr_&X%~L7>0W{eFW+)Y1yDqoGWApINMoyUQX}cR2`_4*onbR z+l=J2a@Voi1T#Vad8=GH+Yc#Ij9Q9)?S0y;$r?k|fC{-Sa+w09##FkbNTwa3+t#~% z_vxfysNoPL^A@9Yz$fLE`B=dB)p$4@k^>gyESFo03x5nShri1BJhgc*X>vr*%79<_ za)Q@s%$&dq$SyJeZT_R4X7M$e^$#E9388#KH!kYq0`M=H*O+>}q>PTD3{Q3=t5F%8 zHhmE%!^j%^DEVgS2PNPXR;AIW#Zi~$XNXtZgg(-6$L;W2n(gU(T42i2!}r}ZGV$E{ zUr~C9Qxb(nL&m^{cJ)pmb@5%cR1E5VnuAPpPA-y{Wg--`Be_5p8ZNMZ%}!5jelsMf zBrD2c@`>QS4Mf%bP&Y}rTkp~na8>Z%) z<;hN-yFEsTJLU-A00>1dxP~T57;S9?PglxXa2qU|a1+S^3#-~zSuwqwlQ|^cW56CO z%T^DiJUh4?88dJ1k?_UG)HUoY@BU8Oox0cO3H2Y4JSXZp>L6a8Ieox2kE7c|4=Dvv zgV{8Ma!kTq&v}&j2p>&gB)4~@1x3C*qndOxb7;gyxvAD}?O=L~&C8N@o9;dRsFEZq zzYEU{kgKV3%lDG^BF>cq|ce-cE;H@)pjk73)k$> zxvwcYnP((P7R_jFCzTNn(%p$${5{yXF8TKtJ^78cs z6|JJXSgOr;MVmKlWxdV{y2=jrq!ODCPvSVNH<5cv*{Ep^6FrNOfsBs`Z|fb=NUsoS znP9K!b$3idSFwo9vuH1_aqSw}e%8`AEy5epUOOVZtbn$Lt7$Q^kI}K>=JVfN%n*Lu zwv+89_#<`qCw5Y_PcLm0zEA$(iJ+VBt=J07Pe*vJmW}bNVh+W--)w7)@4>adFMNm1 zx`|{;eY2-6rvv&&5KdNx+WU7c^t*x_81m6m zaTgyg@((k8i;b$b?6Vl9HuB7MKU(OTmqB<rhPy2XcEMUE z(rfv^fV8MoO=f4>lJNlW_-@alg9A1)$ zD=6B{hlF@M9=y}2O3P@`T!)A&$@8d-aVGxR`S@Ahe&ToEh?dKTg|s%0WTs|bzgYKo zJ>5qb7rm>CM&Zg0j&Co$0fh0+#l6)Gqc{gx<&G{KJ|$B(WxK0OZ?|iz(-Dk}XejP4 z8B5~!TJ7QcyJ!R+diERSUNXZhrB@gYmsm` z*kTA>lP`6$pb$bnbUEX1t#F9v;;}=y}#eH(2mglonh5l<)p&t%tm{wQ_)?^nA|7i2VHZl&>i6 zp}bjzzpuX6^}DCfjKD`IEX-GUGD^9kqZS4RxM|1%ugrved#n^(88Nq56GAL5JUWn3 zgnm*|vek3jag4p^3rs&|&AucFI}o9M+nfzWyaQ|TCN#5&d+f2|Z=2yufS%(ZimtvN zY(0P56-D-ZIl3e`bh8k3Hs8{cja4spJH*pQyW-406A!pK23#P+Z4vvmK~!$yQA&sj380&oV|z&N<9idGF%Z^BYY`v$xtdm@#v-*#?#P((!$IAz zWkgi^ib^HHAt5n%crJn|t8)95%J`%W$R+4aL`6pRNt&n@51LqMSkKd#`%V}+#*5ao zGg_7>Dd5|YpA3YfzQ+3q!CJG&y^XbTJuN5UaDQ@(x|3Pu`*W$fM)2)R?)5S5c^44w z02Llx6vr1w!74I}T%DvxmSPqj|G0VZGp`MyNFB8Nb|0VyxVM27SlcBK)=I?bj%mU6$+L3Og2o6UFvA9 z^A?&2TuLm6J$yCuU@@|iKv6EoKFY)Q=p5*}N;z4}2QfVpXREnY*Lgi{ItmCRbFDwv zkS2L8L3DLRp4Z2FlAWAawjSA5(W|@Z#h(C9Bsjgz*=z!Xs{`i1e z)nrRaNi&57BTcmWf;k)PW`V!|k$a8qVg}vWdptbp<>)7AZ2pTp+;fK`jPH7c^Pma> zedVE>oV9Gk46I`Uxvop{>5(x?MV~&iCART`^PS$I8+l)L%ugwm`LqnG=cp=YVKbG= zieb5WBlecs+KTs`THCGLvGg$|1DDAC)!t`8s{Lio(#;R&-Xl4LUjk%&;Qe@Rv>z(e zfDtnQ9AFv1El63ku&8@xJ~PMB+eK5I{KB^K`b6qMfTvy4Dj4+WKX*dcL`3e9^=178Sj6N$pbj)H<;ioZtN4==xWH zuBR>^PXBo~?f>EJy`!4Unt<^YMMVWx5KyX0ldkj*Dxh=_q!*>P&^rXgLNC&#gOt#F zlaA7n0HFj3y%PvXhXlg+u>0=5E4tt7`F-d7{_&9H@ws#7PM^6mK8N>|CQ;Cw6Kh3K zZ8@@Sbs&pFg#Og#t^^n0XVRnaE@T3ym4K$ii^_^mEzK2qm|1D=OstxlfF0yTY#|?y zjWJl>E3y#Fs@x5Y$~_-RW$~RZ`5Bsk&;aDL!^a?sSM#6p@2i+k1CKXpP*SiiDnl(V z3m+mvH#Dq2)Q_vaCzM;Cav|C|Qov#b!FMFPseY{@Efm`4veaI=OVNOh1%bHC?~xa6 zq&~?QtYXbf>!Bi?@E(~%pw-~|6T?~G73TjajZeSnTg7QmsDw|QTl}Z1>!Zu6RG!=V z&8cyXUx1!b3h+!?_B5j~2n2&!95j$+K)~d+tVVGy;HD}Bt?!P97B-eUZj>KjIV@L& zTu0RrV%gx6p0t*)cDxYZ%4wu)zG+W*4K~cCFhXiyvhQ^uIu_LMu z(rj*NN*YJFv3Y%4Rmt(rcXHZhGhdFUk++|};X;_N_*fmAib{3=+&Mc0LP_EW(R_{CcKS^0%xlQ_R0q zrZhr8{#&q#fh~v^eRK?Jpt9&w+DXS!7CoVxih~N+;~=IR21INAUkoGXlBRF?OO?70 z%{!5Rz1eVsB|YC95ldme)dg$!y2s`XL>O|$)yGXc*WaN&mGBnlfTlir2#c_}?KDCf zYK56*J9Fyb;f83rr4J9z?}!_39_uBl39kmLYL%RfjH;1B=-V4u*iM2JAU_HQSoRfa>heXVFH z*BmfeIkSGa6ZJDG@j@c?fQef0d!Bf~h^K1P!}3fgiWxI*`Slitz*R}>Frxd^tdsk~ zr%x_TzpCQa>Xl;;!2n>Kuo9^-8=|Y!x@u`i<5>G4FN{Nx1&tt-UJ-}ewB@s|>&6Fd zR|k9**EF6R3$_nlLYC+Qs@_#=H&84K+0&@-g-u$%;QBjRBP!!My zU=-u;9?Z9^<#Z2pxZXX`M>0iO)SX`r=haK#O~THXADK^?$v*33?AVChN;>g`Ie>)r zzW`3o38%Rj)z3xfcl)VlR$3nlNZ|XaM@tR8%bbEIroUoxM|A(R75XZx-%gO#9cMkUbUWVaztavBrO#xFDk z5-&P!r4=m%O8(~|d|1MH(}=awI+>}>04;g54&{C!)&nW9cH%Luig{{`)vV()J-=j- z8FgYVlvO&_4+4ewB70&bB3kAE4KFt7)i5?4)3%Pb+zw>@LxvD)Aa)>&$hU|Jkc)(9 zOC=uWYiv>l^2OMeSdl+nI$0*NjiRkSADKll5JQu6a~}nLw{JOqKEYWP@?ykMBNvD< zjJb}zfn^nm)+c8_%NvS!sR9vBxl4|fuFIPrKOst5vA{mqaJy~%y@MN^21fC zhRq{Kj1bW8fg$gj{z-TF=0MofO!`_#SHqL`KxV$IQ+0zxTOz|RBGesMc0K@}TFTZg zx4sQvO00ysu$f!OJCc}S|LO&>k2p`}DU7et(8*nM>IKA-w@gWw{B9)+>{~%> zJ6MPjsV$WJWH;-osnLknM=kQBo|WBGyYDp2!dxVL8xXun&y1yk7&tGHW}(dCB>u_D z5x?gwUCe{I*sQ%t3WxmfjpUzt8^DD+Aal?x&yD*}CU#5^2YB85V61HQK{YW$Y~(}V z!XvL^Il}?HHnBs;&|BW+>|-hvxkaJR%0Kpa#v6mW<##2bM{BZxa1QG6bH6dfPx@f~ z06>ju>UichC)+=lK79yiA$yNaYsk5<4jJ|<#wp3Ewzh}#ME(`)`47i_;PL?`6_2$Q zS~q+ukU#$&%l-h&DenU@A^;n!NjyI!t08zhQwss0rCiXb%wK&Ky2i<1@l2bv8qL)5 z*;h$|^*~LX3-WLg(@8q9PtUZ=rcfbNs1SXgCLX=98C)q_IJuQj$&3Fz>3%1O{_~TM zH3`rrsK8~o?mOWHWQ4kcU`TbW9RzB#?V_K25sLYyAhL8MqRGT2ut=3t=eTdjQBt$% z%k8)uH@9B(W3SF1R_g=dELbPJKRk=NG7cp@aK{>9jrbGR4ZAA5NR_Sfi|m4IfG>xW zl-B(}D(3$1%5(v*Y_Ny&Y=6a_V(}>-!Z_PK1oZeR5S*!!8)^SO#{4rq3-tcL+i^m? z{raCW&A*;;tQU8>4HB3O$tkz$P2wE_f)pBJ>Y!DW`D!l~^wrI;P}oqe9>Fq%C_0t1 zoY5{~tZL6j(?utcXtAsrv83433>kU#wBdxp3$I@|8eV`df9R&!bVN3?w_PHAS7RWSx0^xY=@eD_{ZcJBfcA<6ZJk~IDwab8Xt zT^hxazAQv{C(dYiJ$EB9d0pDcX_t3kU|?dq(H8c+%6g(^H!nGPB~B3hvyl|!kD5UK z%A8OCzmFaH&8LP02=`la;Gci|lMw^rzxEyZW59cBt5ygm`e6jZ@4xtp-?O#P5vd~g zVE6Bz-AyisZQlMl0ncI~swrt~F8WgHlMA_braR?_mVHHk>HVqBIGbZsfwbTE(kFiM z0?kzFx~oaNkAFD9ZarDsxIJqWkFgOhwnQB+tgi+&uI&}( zkVPYsoqyKWKZFHnpgE@}B=y~3x?T3^?nlS9$L71Da&4yr%Lb!g(FQ@hRceqHa@Q_? zhdwjTa~wn4)f5{Cd0C0&>RuMmy--@lnA#K$d!0`p^E2bl{usT`H-ckrNSM-3``sXh zU6%XD+U{>E=yR0*Kvx%W-~0_w$Et~&R{8f+hoo3-;hG6`YX{h{C*u!Ch0wA#`y0nj zqdn$l=dGL$D(LIO`I-vriw$f3=X&DxS2H42C2G`CI<0=&^(tWKI#4?h+W5HSo{_ZC zUD}f#FR_aWuwPF!rMgEm?C-F`}}<#)^!zgCm~j((OWl#!zE!n zVw!rETHhbzpUCH~`3;Xb#0&S9*gpqIaK9C2P|m#5)^$5U`^EZa6;=WIq}0+D@3}@d zYu^3c#kI5KbC2#tS@u_y7RJynv53@ytH#|#@m3Oz$3HjtpBW7yjMKc6?{7W$=X$rl z!}~wTB3P}-M0r!v{Ru*G9@~)Eyv7vHse1Rvr{zznWMVkPGU@Aw$9RqFLq&|31^KKv4BAH|>lfi-5m>owmq z)*rhdS3DJywI;zgzvJ^C5cbEfKDk$dz5W0ViJQjt6Re#(1+6Kl?fox-R@s#O2WS@h z1n8egdU6E-G~ZgIYrg~<@@lZ>AE04}09kzYgv3)fF94v`S+@EA3TQWpeXjR>{Jh3w zXwSBM<44SxaRZ8h92NNBmj-~fJx$%xRQ(`g9L$b8@e>BaX9O7Ri!SHIzh)u--`JsP z1W%Pa=t>zRc;X$^_-dQ&FKtxTXX_8jEvX~eIWtE6R1$E!LHuB;tu1xPAXW(_6|3C8WpNQ#u zaVkAVa1E>f`R$M5>(5^YAEl14GB2;(;|YA<{7eN5){EDzKk4}GM`WBKDjz(CVzgCUtf(=^Hh(iTWuSn@$MhIn z#{pTVl#>EKFrsq3k!;CZRJ|A)8VJ{8$&uYbY^XEv5f za6v7FNq4r}d?k(F>Hcls`)Rn9_Q~(R9Mg2C)Ny1|@rE1QC)Q4bXFD(q+}9ZOX>ib5 zIYn_-A>Ok3d`v%J{{A}7dUe>VbN?#E|NBD*F07MDfk6P5_ySU5`pr5n>~8%^KP3sK zuq9D7DLDA$?Nl6ed`_9dg#VAj+kzVbx}5XIzxk8-nrBGhKApJzmEO04H$JS_F>g3~ zv-5g%y@c8zZQouq|9tg;Q!!F?QQq~^Z)XYcrm4>XKTXP)pQzch+s+n8FIv!Cxqwsw zH0tQ^r<73@TDGz9`Q60cA}3PbA9^E1$v5-ygYy0DSh?(+d#(gfGsJ>c4$_kqAhhTa2y*K|kTv zzgPM%0sQQ*|DWMdWt~>7yCn28NKU`wxemw!hOd+t{%y-=xalV!&GZ0G5x+lXZ}Q7$ z@dGsC@)u@#1g>2A!LsmqaGgIj5A&svHa2GB6UPHPUoA|7r`h`F^#=R>p*`>Fg1l|w zg-Qt3uLy4dzNVJzFD^z0I24?9J$x8$RO>=9>Ha6d$y%10GacXjp7V!}Y~Xf{^^%!O zcTEe(KkZ=pnE?D@GXQO~itf@+xTz!uLYMhQ_tSj5lcUt|pLg<^P`Q+?NlrJQP%C{udR8*@} z!rBX3;>rUKL|9rqj?FjK#Mp9%WF;=bdAe5aEBsVOAk~0Vt*Pg}cGgYs={gl)pP=fB zBzDe)R=vx^E2ApUat6|?o}HBx;>TC`sVHQ-*V~_VC%4O5Er!xxLKc7(&>P1u1&wUM zoqOCJWVWVj_mbKtV#{8B+Z21+%kyMu@VQ}YoUnG5=Xz&i;l-50gq@?!#{y3IKO??D zwWb2p3_tk~ED;>{s^{Zf?8n^Q-ZiG@=^3=&@6u=Nay6H8L?|KlVV38nfVtStcxi-+ zmCTLu{GV$`)1E4}K)7(~Y~WYQ$>bmK{)#1n<5`NmezzD}_iVoRz7!+w?@7QOvS zd=#LpD0#`p-!BKuE7u2<2~gNdfqEJzVqE@uE$sWlRkHMHfzm`n6{l#~guIjzH4G&D zT!c)_4Y~zjhjD(bCM&5w+dr`4;pvJtlCnRpn5tElecwaoSl?3WjB|aTZo2X*2H{N? z8G2B_&lZ{Yktm@uL=Jl()@nYL&3xRC_mjQ-hC}@Dw&-<*2#2!<)r-4$fH~%nRFup- zyCxj5ZL${?o?Sxt{X3dm@f_?7OBTEHgJMB_rHtpmjcnmMPI0tz~>N zC-q|QhwByISGt)75!-Z|Q%pH>thny?W-jG^#=9>dRQ!ziT%&iv&g2m)(J(W)md!MY z!=7E=FkY|2@Aeg3);xs;$?I`O06EK>dk`~P~u)3mT6?5!KM9lSWxD zFpu=O+U+u(riizZIe$8(FFsyf4l!*^tS!bgpRXNmu=%KMi~V!WjzS87!ns5)WBYba zS=G^s)#%RbQyw5b4Gjdi4hfv`mwfQYgf5cwgj}kO*U(kTNh2Lay(6K)lK?V(!Y?oL5sXgQC5q8h+oC2ed0}>| zLNbW={?zoTITt8RW9W0`VkK)c5h^S9B~qCBMy5nb3?Xyn1AqOYT^(&e%S@UZTt zM)s~8j&h;|@T-<3N`?~6+NV~F&{^oXa-wU(L8sTIhhFp`3t`vvfOm8=BlaE^O?K5O zQe~>N-EP(qN?l0dx=gfLtDXTj)TX2>@BphBBy2DOVv?&JNo<31h;Tpjx>dc=9^tvy zHUst}*ay$boUD_lfqGT*xFt)o1W4cU8(XH?W2eH z%$2{4n0N#Yl$0#@e4X8QSuIg}YQG|8G+n|%w|j&#c1amIz9|AP@I`p!B}X*3v(xDH zvcH|UNOQRR?XZP+LR-rjlcASa6^Qusjvu5lQ0rEo*Q#AFQPi(*^sT?!@i52Ag+#24 zcT|>0i1N-RU!BT$j>FEl>B^_tS;)oRPu~JX$*3cm>ERvIbB&5c>Yl$jpAu%9vbcU1 z&#HAc|LhuV$;cFs@&c)JSCzFk_d?7XFZG|S^Wi;CeD=qsV=8%70ds5k*tVMN6d?_2 z%yr^z`E)|18S6z!A1AbYZL6g2 z4!*EmG=t%Ith&B=>vZH;FVd9z<;%(py}Bn6y%WrxB&3-eWlr2LKOXmO$OJJMN{}#4 zGDD--xJwMKN?YwyZ&rX#MOXu(muN+wxI(rDnp6|wShqwbfQan2iHc(2cdR?9f|br) zvacKsq;F|4v5F0iNeYu=Y{_p#J`Hc)U8!9Es%sR%Fsnf%UtOJ)uzG02&bg#ia)B@* zI_o5PBgo|W+k_Y$7rsfAc5EZjlw(m{rGh1-MT;lkNqb*SueZ#}d9m2ZMsI1lp;$Pt z#*ADtTjZpNbg@jNFn1hXR4UOzjQcvTe6S`9Y6A`2INO;?6fI@3#yHv`_QMYrF;5Xb zSGk*)7-1fBop(DYts*1B)`O`s!B~O##4T(`r2=cX0)Od}rUAkwK?wSA8H|QN5yi_XX%JOcUZV zQEZH|OAlOvgus|=hR06m>$@|vbPSytv_+wE3V2l&PO`5*9Q8L^F3PZqBs^JyYy<#b zZ>*NfMeKcnMh+h<;@L2NqE0y`T{)U(3-4r77OHC4j8t znoMCho1>To{O)~l(#-r~9Xed?0G#JY8ip-(sTvgCSqg}w7{!a6=Y-gs(dA%8nA(qLMzfgf)RTs9Ci$=`OG9YWG|LzK4WI$9q5`@ zoMI`F-!j$7$w+ww5}I_dbHAAI2yu0Yagj2stdmmPY}*$6$aFt$CoPY8ugCfKDNp^l zqa1C?*=XIAHLUQdAJ0STA1$0bM>*3*ObA6G8?2@ZD}>M?+=*^0@c;F@OcN7ueo{ZaRwno9}`nx?@h$6v2a$4 z=GK06def`J_$Ex3Nw1eB{#8jdoinACxq>KckT8Unxtq+|m zEl-8E51N>NEAWV{7AJkst!YP8=TIYv{|QxJ#a=Au2f^b#E(ixlU!ujI$HMSd++B(! zyzD4+IFevZ%OiPb=VZFlfe)GdJ7cf(-B;o&SAi`5U~FvU*>LxZr+{>bTlL&kJrY!HZ$;goH; z=Ea&Bd>w+F6wx3L_2XQy1vZB@J7Nb{H{tdt&V7-_fVigd(RG=qSi2D2lx5Z!M`!m7pgCb$oAnv_wN?!y*EkPbBPUy@ zqGBdOhvOb*ErJvZytIb(u>`FY9S9RH&-{~PjPX{8}E;oDW-x)Mv}KiCxjw*h;tcNrMBgZRD#prk$XJ@e1CF2Nj;@WAH4%-C zC_EhF!}SSZ>0=mp;yD%PWm)nS%28fXNd-x{d6F&~?y!!E-x_Nfn*aKzmDLw^Yfyp& zyzk`8kn1@_^!ypNiEtem|A~|WVAIt83sOf?Y5fXOn`Z;2_xo`_mt|*|DoSUODmaos zX}%YvsU;T@{courC2xL$@HFfm3D-3n^WBhFxHb7QRolJjB;L^rxj4hDZL1|jpC(Oz z{sir%03PgWIu>JlpBP~-Qmt??`QT0Yx|`1AI+)K>gj0YXaeo(9KU?5_*Oc@+MZ$on z?9t>_ePr(hP{~H`!Mb@db3aQZb2~D~FVE^u!%#C(LVaUKaw9FotDNPHqOJkxFT% zF@#WdO3GuI$LpSUtD*R`Vh%@FS*dKQl(QqyT=tlmJhxkTq=uM6Puj*|$2u-8-l|^V zLindQ3^9|lGQy??bn1~WwCMUr@2Utgc4nbdSl6gUD;47B%?-Y~qu7`lOuGFecc?rD zgYujdY80vj6a*hg!8*CLQmoT=heao(5T;1qy?%6>XgLRn<1rXLYl2aLYg6#m_YBQt zK|o`DDb;avW_J}9Z}L`t7&e825|jA`_mzkgS#u&!GQwVV3MMmb`O`(%$)lu;Oe0() z&d0P2#3ar`BHn; zc@(4J&y$5w77sAU~ySgweKF$fC=4kq31aR+{j&(53C-)rZxC z)gV@avNm7GLHB$Zlj_nrLSinYx0i8TmY=nQg^e1K=TRthNLTesBw0NRW7sg--{uMs z5%iA=rJG=gY%&ZGUlC2(AxmPIx(43NBz_C)4$Uh=8bw%hw%?TL>vtFNa%LShtPW*3 zm+4Cs?3v`&9y+H@6h+JOH^XjI6eK+~p zad->hxa6jmN3J?jN@ABD>ew-LcO`WHx7WGEO2p z4nDI|1>Y5XckHzv%r`vx_eqkiWTuRMqbdx86-!2cO1b|y5I!c?zj^49#MNm+SNK5# z_QRGX20&iT^I&~#=T0Pxb+oF)v&SWyJ5x@2daq@yy59t$A|@9ZF)bznOwW&;IOchz z3-lt?$)3-wvU1EAR6%03pA`wyb()gJB6QFjCz@WbyLM3Bla1rPIniOULxcJ!_FlCb z&~ow>6c8W@q;MxdyZ5?p9SUvlb3BW5Uw00-eI1y5@**6}u$5I++Zd&8d^M?tu2A#* z15K7Py=u1IhQ?Jc)orH4YA??UQM9wZoDMOiFA|1Fpx_5}HV`1QlMnPa7g9`tQwipE zD0vi6jI~RPY0<=Yyw9OPK8a+s`be_=iD?cmdoQN_kS*I5YNkM=_?!p$G)c?nJQyS(9ucr4O-cMN?UthJi$l|Cl58#gI7d!P(P2HDOZ zIAvATxr0Z$^9Ixoop`;(PL6h${OT&vUnKQ(-H)z=c0C??A^e>y8yl#XDu(VXz$MGe zj-58@D*M5URk9m2{WgP#wbvCYy;Zf3^`iN*I-FMswh%R;*qda`;aMbPG)&6JsgWGaj*c>fLiT6S6>1C@c{?zNaa$TA>kOyPTM;^=b>(Q(; zm5OBMNzAautm*YhBxJV|4a>QTeW9pFqg<9d&jY(6HcIha*IC!X*qB;*&fauQX>YU3 zd|C6gO574jTxuj#vqR+B!J(NpRg%|7`Y5Z2T6L_J)iBg8dOaA)xI9ml@sLII_c9FR zd}}x(B!xMJU~tg7B(GbfcjRe}{%3RU>%3zc-i8iQJoqodZgHOKJP?6yx!Krih= zLXg77^?X?l!YU|=1+p(rLDiu|Hu_|O4MI~uBii=uQT9fE5%K}=uyzT@$mf*ZiqWgJ zy~`fhfS`@z(aFrll@M$nas8$4_s6wT))&`XRLyyVQrak8Vt4XfZ^5~asnUX-R~Yt^ z?x>b*RgKfuX~(2garGcq zoo-}@E#LlVVdm$LfBktsW3EUPg10m&SptV>pE0=>#m4*U%F{^ zZQEa5j0+EacN}gYKaU0m-QGXDirz5?{9|VircOxYKH|nxr8-IhH3aT;eJh z>B{Q!WSg5ipxycb^&lO!Y>gcVbPrGJ*`}(XIY1bWjP*i1>#pdV0~wT4W!fM7BFta` z4+I?)+$$*nxs{J_mR&26ppswj+!2T}OV1%)aR;SGM)LEDL`*#4N%WjEHJO&~{b1NG zzyIl_6ji;lvzhRUQCgZ&iWO0B--Ck2G$R@?k38dz%=XAh4v~wxnKwvmjjq(S>a-k}p}VT!Y4)*;7|5>+ zGyYIK*(Z#HzO-GNX=%Ql_`B)r@AF>w!!Oi1aV#cTTd8@KKGjQSR_70pb`w#HfB#_6 zhoe&8YEePpSO6p#BojWnU&K~g(UPSK5`8Pm9G5-PakNh6h~pmx6*bsTBqVV`4!f^b z7h-a(hO((qH;332JRD!k@PrZ@UW~T54{LY5vb=G3Bf9V?C$Y^zZr9#Qesa`$fsu%+ zAZc^26Hk6DpdEvDx=8)RblSW`(}i34$rpKKP%76=MdVkZ6({*w0Mp={!FE-_hwTxr z+~JQ+XK#&}baFGvca_x4#5p_0|JF|A1d|pXeM=$xv426XRB_!(NUI2ik{mHv?{9#$ zQ@IaHP}yZI+f9nbB*xQa;4wk>;j&wo2opt)cT0px!L)r}apLJ*g61&2NPJYyO=k2v z(%h0;{98yE=5y%hTQ#&%UDe2iQ>476^nVGzvF$(lurpwNN zv!Eay5007XYc3Ezm|dk_%V;Vc5X@{M&Al$3I)lsYW*XqQyXvOFtt0TFc@-fTpn>IE zut*Hx;8+ovTdHRVi@B9s&N&S&M~MFZp#!JNN&awLLev{dXA}G)M!!eD`CPvOg!UitJD&~xoNB`}@+rH%X?g&TtQSCkLZSQ1n~74k z)oyw2z+Mdb#}e040UbTO4PCv9FmCdynyeylFDbG71`gegmNDNdLXH+KUosX=TfwRh z%k|y52z9tC$tQUB;6c(}99jTdAkb6CBkLrx)7&D!*5=)sOfxfEpvm58YD1u|UMEV5 z$3sH+n(8zbd)X%gI>TAI;eI{IIClquDGQxT!X)OSTT6lVY7H}_)=<*FgOU2_8(=4f z2OF<1)jXB^GamWxag;NgJq`PkeuLNF&LKOY-|X4%6zFep-YC#|Ez@0?(y@X-C9-W6 z5cH3VT=ZK#ZW#K~J&K#)mN^T9Fn4tpk8Wvgkq^1O8z@nhq;-T?CBY}Pd3b4{ zDK4qXs;+rkrX$kzTg5v%4mO6H-5sRKoLDkPB zfGP;?HymqJnt)BUhKDbOjk!B;10m<9=ZI9uw@M$FUHJVbY4#Y!yBTWDANKL%L5jb) zhI=su?Eu)&&?kW&K%A$3YdSlW5@Dqu&@+CgC=7eayRa%Y%XhVYmxGYQlx#fb5RY^e z^o}|`g7c_>C;l3jl3A>8jthg0TE9m6Q5`&`FgtA1Y^}hZbECLKe!nX(9S2y}jNu#~ z&&^0z164={SC;Usz@(;&u=@mw{em(@LuY6%+-|~+Pz3#wyw+?j(Je76EM~|;C0!(S zlP~0CprhAus7tBDD0GfH>mqUkzCdw)wUrbUSegC^e|>mY`@B<9jc1*2-e#LP*ts7H z5vlzc7G|kKp;6TuW`3b@49zM&HAWo!l`)7J`wJl$xUN@20#i|VwUP|XL23r z9WxQ4fyE{g(AIS|Qu)OWt4;<20V1tXTCA_jJP>4K-%lQ@)3sEaDbURNBzu0gWi1(z zW>B`ypQP2j*y(ul6ByoTHDy1hRA^#YhXwc2#}%T4}Gi*0tc4#ZC72g3fM>9RsB2^u!Wuf)!aGFldiUCp=x9)Q`Xl^>(U}cw4PR4Oe;67WGU=Rs$piu>x#$k#}729ee(dz0hB!Xq3kN~=pIOnNNUzitAwd>IHfPz_u zwW3Ev9?0KHDC7Z21eyI-Wg4@Tf#B2PiPS}$&}xz?mn^>NNu(S0mP?}OZcRL)>l%R> z9x239u3-B2;k|1igqDmg=tIaHNNHVVriAsn3HD7ddR>mkU0{9NLUr+vs~-%Glfo{rbaj~ zus&$ruIEY=wSl5^RTSw3GOuyR1LXA>$YR5kVuPB0fxJGvnyP<yiol43!;N6VFpvC=e0=|H&{AzYG; z&RIxnZ11F~PCh4E@hLL;(iW1U;%-Tg_Nhz=B4}KY*Y6uu3GKEV@R);3)-~6m-Sd}g zQP(zW^gAfD+C8x9;~{}lmaC^}=1_DHzx8_kq?dQqj+m>RUW)2z4ax59X@3>RizGb~ zeHfeL6WPN_ud1Zs!#zYTFg+wSt_$kUNk+n-hA5vb)myS0;X9W=-95Hv_O{zZYttiS z_?s_z0fO5!rL$$~Zs=h`le6W1nLlr4u9byQ*>c~9%uO$Rt}C%CS}%`dp;q`Cnn}A_ z8mgulJAm7f|+FHoE~P^b>%X%vZsz#6j& zBbVMU#Q>>#80d+Gedm;sxXqSg{hrO{Fze7;zYL%jD01X$5?y7h4-B2v8x(2QJ0VaO zqhl8YsL$X+N$+VF`xPex(5yCfRGGH{G!5omupY ze#TM}3NUwVTUUo89+!A=DmN2R+RybWbaQgI3K(!?-6@qFtB(5=|?(H z23eL9In^$y)VR->7)gVwx4-VU4s7bJWM1s8GwcucTeVwzep4l3Z&L_gw)!%ukV|)$ zPR7gbQ3HZlbB(Qc@{uN;Y|W)rg&M=i&o7KaVrYiuG&D3Jo6sGYuQzM>JQ;-=i&iP? zK(G0QGB%9kL_@PS=T}8;(Sqx&n3XOFw--r%e*cz+*5L9D{%gSfx%5}`S^N#&;a-OP zC!vKoc}Tn9U_3y#WoL=EAl_OyE9yTdEnbbzFYXOrQ<=GA_T6>7vmxX^iui)xi0A4L zADbl0VZWu&Hw@izJ8w%V|J%r(?n(zj`vk{B>j?9kW-H`NjuDEej}ahY zqjuCWgI(<{=5hJGej@a|hM2_4(dhZ0eVGmUsjkAU(|W?elvj~3j|dztvBQW#3R{k$ z;J0NP>+xGg$N1H`wCKRSN)L%Jn!y^eCxfPWtmh19LcGn^QVPBbduh@%LP|tcr#kXz z`kVEiPhYrLKAELhW|R{4Zp?87?Mj#Z*0-4%a2oCl`lQ`bnZJ~LuoV7*Q&4tK3b@vB zdhMj42+l58*K7F|KB8k7?^}=OrZ*=LB4%;)lvbl_)byF{C0%SmTc8qE-ea6S1;L|) zUfBwEZ)=m5w#TWHg}nrUsot3#OrGEcTkf7K@EUV=dd8iF8j-(x0i1DC|Eov)zTq5O z8ci&0L=hx?`e^*5mcwy}zG6#S)4R2{b%W)2rV?ATWPBX$*A3KaKxLnmgjb`uTEJfr zvIMy&Kw@@^ZEmx9h2BrC;-W&`b0S1zm6>-;A`Da2D~nvf1Xbq2Lqy<|ERmvH>Pcy) z?R9qq0|)|oCpoCxD9)3t%x6!nTpA&CykwzyT1AMIZL;$#Q$e@xv+rH3cd!<7KyRVR z7e0h67W*fIq}~IO<5AtU$)X~Q@$iBUVQ62wqS**5OOKM(V)Ua&0v=OG9G3(3ug z)t;uE^A}3m4>g>NWL)=%^;elc#V#v=P8dVJ`K40lI`u@P+Q%o$3M@#2jKnU0g$GAC zwWO8s(uxBCeZw)0!H&K;wx66ZRn*ixNjH9b>JpCtmK!H;VLEC^RCI~a*m3J=YR}U> zyfvr+E{*~&$Xv_gT%jVWN8s-N&AnE-gp<58<&f4UNi?-S0Yfix$ap4H1lB-ng_77`9Tx zVUuEyki@@TSG1AYl6Twgv-U|ue&bpJ(k`M+y)~&^dhlgSgyTi*vGz)>xetR$9rORl^$Hn|wwWLwNS9F2KRV$ABTh`O1wd2U$CanWJ^Mm!qIbK-D@uP&Q9LnafC@dFRa#nk$#% zv6N+C$Zndl1(=Y7W;#pkVW~C^GgS@k>k6C&RH8^iGX_-)gZ$ME4H0S`sc~eAY!w z{hLO!sMMUvvBt;F_>D^3+I%yCidhrQliCDHqDZ=ig@>~W9&h+jWi93jSpoccJ8s_` z(VwOZyi8gG1W54x+x}2PL%;Li`xx0yiM z`(KZZT>R9Jt9?4)_|zuw=5UCXONlR7=c^b~zcfFuW}B(IpZA})KY7<`}|g-FC&{ati|)SiCN1+VMgoxly9qkKWTD@mQd zah>b5%I+r-rT>0sX6rL_eNItgFj{4ZrvDy^GTiL>451dg>5=jhDyEDU^xyvvv^WON zvxb^Ce%Sv`bbO^99Y%uEm-(+1QrB@4A44@fB;>|%1z~)+OYaA!?4^!jrR$3R=l+-F zFaG7S1bupWnC*_X#M-`vgl=(e#C%xjbSRHbA$LiO9N(1?n}WjqcgT-w6wS02LkkXUm@ek2#$EY`xn* zRLGyITD;=IM-!zuuGdi1`2aqA)Y1yNOG($Yf?h}22y77!GW=wocSWd5BFXkyx2*AA z|7B|9wNYxKTME>A%0AZ@$k zi#p~dSnT6Dd2;8T%E98Vm?duOzJts*`w%UX?nN;fL;J;7&R~viR)kxl`dm6WubPJn zZ+6L}Ii?U}u&-GE(O1~bzG8T!DK|FD&dEfVhM9R!r>dc< zyM+?G!?Q+4vA068Z+4{eC7EpRC^r^nrca@4J>&4LydUU%JH~^3+g8i7X6a#wU(29q zIRf7=;$GV=q?S1lUIWJed~I4=HEQs!R~X&@JW38C53?9KV6%q1su|KPC{37`Pg%Zc zuh<;ZC`&OgoYH287sCS;Eh*Dq7FETp9e!wYA}6yPv2++oo769I>Il)DJR0?;&$6oj z0*S)j5Q#)3PDe4hK^7UebF=@;3IFz2F5`=R+qd0`wvxA2ooT|3j|DW=yw24^ZfmbK zgxUtgH`9+Z)LQTBgghy6$jh+0)qjxbUn8~a&rkBVEc96w;-$dx+~? zMN_^%%>OK1e~yvd)T}P+I=fnlt)cLx*jx3Q-ptDFdonz|0v0!uv_6S{ zf%1KL`ny$ac;VOn;u6ZDa9Fj@oFzGjWfZeYd2<|jySd|Aa_uSS$6I&Ec|(c0*$1*E z-GZoFptRboFPlO^b0rk%s0TW0J#qA$BO!xTW|V0qGG78kPQ3D7BEGcldE~W`@nF+7 z1wu@@yG|+LX5Ok{z^%qBNz?-CL^H8j#F&ZuoxPyfMF zgI4XQ;&iWklTeQIsgQs%<`T_;G-=kO>m2LUU61629moH{mMsw%0I(U>;8HVR0m7 zs2*ZFs>(vptvcp*qB?gY#6G#U5}{GQII=y_mdMt-`gwzA%?aLMbE(_8#=crA))yQy zwjU_vT0i`f*EtSeQQi~KE2v;9Te9S>+1RuLHc>KP%Y!a|S48o*GWc6`W_$Q#Vxpl|?=; zH4FY_HuMeRY-WkpmI!>39zz|Hr}P)oWOOqXjCELZV&D5t%nMHP-CF&fnYn_0i4f`U zm0hAKNlP4HjsdK0eC(b{uQeyTDsMl&_fRoKvTXEQ-f9iQ*k|~8XK$YOsV!H;ylR^UyX^URqzn~%q;xSkZ?yB#lIv0B zt`Zg?EHr7>A*eOcsK+3`D>8#3cNZpU$b^F@{ETS4+$D=ko3>O7sxZjU3%R!M3gNJH zEz;|dha`U=;5ZKL;knXJk}@e1VdVk!X*UBVNNc7F$Y(PAL<{oDgN;&T!oopfq)Uk? z1T|Yp%jA126Bb%l2gk<9pfw?ts);OR1$&udAjizYqciz5%qdceo?)-9bUsJSGAc6h zV-Kt~)R?cEj0NC7v+`AyuelK*i20lYPnWy`-d=rK;dt6hXXRF}J1+xdqUj(PFePw- z1Y^jgOg$JB+|uqP?d0APx_32yjrHkF4{vvuKKCT)&t|Frj=VNGRS+hZF<1&jzt zR}c{Cy@Q3KbOfo9-h1zmVHBhVq}NcTcj+}Ky@VnqKnzh@=tv1IK;AeG%sAicT+jFY z{T_bh;^buSv-Z8#y4PA~?*;KN->WrOPN`BPRUar47U>A;zjnVG>PYkb?fm~!N&Z}o zfDopk>^z?r?W`b$6;X!e?7~%tg%mm4a;O49AKUx5{;iZ9rF z#IjO%g}vxlm8enYlXttk^QZl8qI=Qy%8Qb|}cNlq} z)5L9HGGHQa0+C1XN($y*GM>~SNkwflL4?$^rYYVeC2EWk14h6@m%jA&A2RrT5;RU> zx;7~?CFrxgs|L2=U(UQpN*p+- za3jGtwI?j8G=lar#Rsw-L3AThEF+8uR6a_p@#MwgUSG$yWR^yYk{U293CHq&-HF!O{H)OO|eEg2z=-v|B|~agcaI2YPVQ9sV~xll~fSo?XU=K6hH_KH3plr~FhN@2n3nP(gw zzlpwaDg9YmtNYX{lreca$Blkin{L89{Wk{(D*v1c)i(P_BrUAM(r~GqO&cOM1wNy` z3seRbfe-5C5^z-7gPc?Tn)fFyEF-s0X!~3SB0e`alv}tnZBLX$x(KZ1t{HkMYq8r8 z*Qw4DgAE_`-hEMPV?@aQ{@v@+38jNCTP7v@D&Y66K^ATYSBO&7-YPJ9B3<2pq0I&2 zGHoO@eifVs*_&KoVsO6kpq39Qg>Rc{eW^i~Q%aDRSs2s()!a?v8isA{F!Y0LgY@e< zlSLGkeetR`KFahZ6?at~3`lmHs#JRF9?r_)>pGLhgLXaxQnBCiFIP(tHqlJY+FM%X z6Dz^sjt6KaLYVy(q6HHF~Yr%Y_x*F9IRqXt-?PV~&XZFz3kU&QrY5R;o}ShFmo$f(TV%jUpvgNKcFieW*A zjq+DdwLZ|OJeJO+EJkyc!dP%T#WqLUDhO>FqFy|`;Ps-+cA0rf`6EisU)hJqUxjM( z;ZBOXb-P$4Ju;N4l>|j}Z**3AH&tSCNr2PVB zp0hO4(ldOV1Fl{C#d=~h?c=??31Rn<{W{sNlB6qL>rBEg+Tvf%Q@$WXnOy&*&lYK| zcX2QPzhU6J>B=a$B)j#}QvKykk@^u=Jqt^0bA$hzPQ!f3Mr#K;a?7SdBcaA$`}SPt zt-+sRP4Cw*<-jl(Nro9|=vy}uxlw|mv>~#}>J-dNGi67nO54iE?T;`CONZ#kZj;Xq zC{~G7pS4Zjim*7SraOXOx%Qc-1Q#beN&AB&(oWAsze( z8<-q5U8RJK7@Z05Ho)wg7 zF>Ne-@mF0MHSOY_+*Kw=iRed;XwI9A^}%K0RKnIzgQ1`KiHw1Zl9{vh1|}mzb<-Iu z?ozFx{j)ZQb&Z|+RDmaw)_bgGQqpP^F1;#lup4T|BGr#Uh?6(t68W2jo#s^Tsl%U0 zZpC?KrIRM6ZFZcbIdlxzp?02m;hNM_n%^2}l33a1Y?6ZusMI>v(aYeeHw8s;UE10= zMK}b{Vn-MwcQXBx6uHbMJ`v%pNug?c?kq_l2Bo_x1715%EjT_OX){sd&4K4*UYRkq zwg-KW-2yco!Y8;zG%OsVSHa5V`wh>OXL@9&2JA1^rW|)=YyaNQ%0%oNXO9EsgCXfm zR5#f`VH1`PZ`#c8E&#uvA2j2NJ!JT6sE@*!@=QJIH7QO?H+O9VB?sUWdb0*e_$E)o zwlB>FKg$Q0Dfm_x04KP9O&j*qHK~woj`QB1$`1-T-pUPFjpv+In{~Y(=*c zLK=CT&>~7(!lfG0q*V~*mQtC(<&S789dl`_OOR)3*5DmZ+k$)bM2u@#m$M{o{fdpY z$tzWoI9$wTt>4OjW&jfLtNfP2;eYd>F``<=MCk(9t2ijML%ZkWPfv*Y4XNi;*EvS@ z~C=8 zO^^&-13ipGg#wi$iZp2YlH z7(%DGX6uD7LMQQ;jiKfDYLB$3TO>w3EnW;V=TD?;I*Q1$9rCy2R!BDSR8D6{zLyR@ z59@69Zh{IHo^woa--7dsiM~J_^Nj8`H|UcP-uGA(UMB%_SdW$J>e#R*y3=6?39Nmnbt-jE;qR zKDv8+H^tVbv`TsGiWMA}+sEwiN!MU)716{TauD5M^nOn8MGe|=*m*GtNpaZ1r()2p zen_i93OssSh9%)x>9=W#k$0%nnV^x1o6hZHGcW)hJw_p_hHN&z@{ov_tPvQuROW3E zW9&?}{opsOJ|5TCq)o=JBP#9uoyVpyS_vb#rqVdQCm$e8FE`&(#Kb0SsdMnCbZxLm zC@nr^a`k$l$2+;4^Q*T&0gJ$!RGxaF>N*>0>}2uV!LK3Zcz-d=`t^>){kWCL$|TW0E0~SSwyuv%sLXm$39>z5GAj|3RGuXNFTNY(ks zP0iutVHs-jWAuw|@$ne}7T507=U9FIj~}lM_C%%m^c?eJ--8cM?kde( z21Rn#I}lM>N(P&k#U}Y?qQq;RXbAhC6ck%UFDB-xDkJ<>KJmks0vp|N=b^XRB`89 z2_pz1VaYTJbhs=uy}!03*rqwm3mR@9b?e7jJ>?T^aovhsf3wp;cz1KZU`Md1c8(4i zzcsfa1oGKxlM#c1x5?cGNA5sD_YN%XiLgN0W0a?&-mcg@dqWF%5xRSMse!ErI$dsf zxD|cu(fqDpwq|3ou_ZzwCQ@8!Cgh-?j(N+nH>MQ3Y?zSl&#nF~xo^vf&TO zE7QQ)${B`VTD5pjZ`ouL9yxLuX)=6SD}?8Np~5ELj(Rm2=J5M=jp<_A*c#TxS9R7D z?8hH;ayjoRVE$HB*I!X zqx26{I$qH@vw$$Rp(lj*yH?C5sy-+fNi~&zyaj7<@2=lf$!EV>lAt?Ajrf@K1@SA@ z`Yr9!+754F3m+F_Aqt(<3?uzr%awUa524|jFI&L6tnP;DYE$CImzAoNYuiPt&_Rl+ z>-n`pu3z5q1Jw+X)8ga`ss$Y`8RYQ1)}=_jY6(!=e2i{|S>3x27)R)CUr-;zVfn=X zMn0ny`r&To;t4L>>@@Z9(zISrkex}((@8p$DcDb2^@rxMs+o$dNfgt9t zO}VIg6Ytc-eSaYIrHy|$XCA3(Y8_uw>eIxhEKg&-p0$`9KL>iliKYeva>8c2LUG*y zWFXA{Ml+L!nCp6G@5iDR3a%m$YSW@Fq7w5}#I)GCd*_3xg8MLFqu=&Lg&gD#T4@ZU14lPRv_4RvWxD;rG z^>S>_=3{A6Pj)XkqmNI;cp%`eXpfJ_OM+a@sZSijY<`cl*Qu|wknRsKy^B6IUZY$o+K= z8U7s16oGc*_)c*m>@Oo=fz`b|F3Sm2QQElY1CzJ&Bn{btNRT;{6a_TN4Px?`kWk*U zd4;9#L0N2!_!om3Rg4_K&#Bz?+4$4O?j3ybn(FLjqJoYKYnV_FQJ488Rvi0H&*&2= z2vS-V1j<#j%X&xrb&@Q9>VtZr19)4A-$Q~5t-uTQj}Qow?(9i8aed%1>HWS(EAz!u zL_NE|-qEWW4I~Dq*~-vLM0ANDwl^86Q(a1M5#2YWdo~hxwIgHUf)i^?|Z^v&YqHo!1ovD-6V+KU<{`-wjK* zE#NoeYc!~)XA>bQYkzOEnXGPmSs~Ppy`lt9wV9Nic2XP;cSn7y5mW5mzqzb7%P$_8B zRVPM^Mu<|OK55=HI-GP{pkOz1UMNAJF^J`ap5|DlcvfdiQIt}a)UGMhY)vJGct<$? ztxZ?W-PC?TCwO3xuHhJHdJjUtZ*(UypA_DZc`?oPynwZ3d0wwB>6w(+}GRrhjk%Dt^V_(vq zm59E@eay+fab3lLcdba*szeO%v7n*G`3Uy36Qw1p=k%-JcHL{pwMy2l?2RoI8gj)i z8uN_LyGgW4$$4GJ>LI0gd3sB)tm|KmwAQ{UeopYLlmxq*A>^gjdwmgXX(Du0OZP*1 zx&5BwBoTkLXPfGhwvFFF{M;rHn1hYhGGsTtnVxgE;9^`X*_3CDynFsL#5!O%d+$U1 z^LMrP$*qCkxxgd)oaI`MF~e5~G`>paRpKMvCTUZ`XY5mF-Jm%EMv~*p9w%z)8cyNk zP`T_WrXwnyd>lUXyT@NDznq};Q5(@hW9lJ-5Nf#5#5BcJ@H>B~mYF6ebp$uxkeW#0~{|X)F)3^`X!~ zmqylX;Kqu*N>_7%IkbA=PwS1|$8K15k()dlFQpdFt3)0o6%mSGU==ctjIW#R{MBT@ z@lhnHg3ftm(M-7!s!y@Z`<{_;0=EYnr(KYwug$wF=4h10ovI>TA4RrH=+(TU(jwJ) z_f%-S)1$A!T=aR5Lg~Jc4sN`b`L*_=Hw$t5k3&S0Z<`)t;}iK~+4-wrn|<8Eo&vi+*zdkKV z6EWbAB>z}_s7TBH^gfYu$1Udbm%|%>W6oQ=MIM0_zV$7g!5whPeErB@2e;Msv35C$ zu(-A0%mZIe#BTzgmmqnU4xq6`X0oXX*%KgMCFH$gb8Su*+LLvankm#x+pcwuwb0;x zrodhQVwMVDo(ql3w=snen1|l0y%vus?_KM}nz!Ih4%L^#@Xwn4S%Vkz7RRF;!W;C{ z-1i=X6$B&;HgSkX3T~ak!v~*{E?g83IX5!0TIbC)q*d>9l96@fQ=>>8z!S+_#|9m; zJCSww&tI4+{hTZfTgcQIcbmQHn_&-32WEW*XtYy=T)BUq!CIB>cOYHu9*(}@5pt_v zaj)MMRhov{#E!TrLkdOFS`v>VN1=l}fXL8$81Mo|@1~lV@XRDY%b~(zMXP3auv7l> zHD{7#_~Z=ro=I03Lyfz$yq_;y#S30aG}MMb_nc8$h;o~cb6Nqo^nhM@+Gt(Rna*bl ztdg?S-^5C{FJ*e5MQxiCId9NKDSw_Ky~W1Aly=er0cKhA#UGAka+m4GD`&Eb@Tl07 z(vx+$@r`;MXRWPnWo~0z|5SA?&Ks8e1hup?lOEL?gX?RDh#AYItt;NexSB6$E*`jf zz#b5~*6!Nlc59I9PCWb75OF!?;eMt?4MfTYKAMm7T)1K1U?b?!fD;~i)reg2flRY3 zM1vN1jf|@y!d)+lR_6}ZH+!~XSG@tUej#3TY5F%dC3ehF1Y#`+FkSbsKvF{~!sA$_iisDdS&{wqV~ zd?Wo%@xZQ+czGpqeQyJZ>EJ8Q`k3Wok$>7~MeXvn!{2uAl}fCNfTV~g%!6NU8Ljy~ zVe4~_fXyn9#5x4;DmbIB=Okq~zo1E(O>O6m5JA_@Lrb_1@a5Dv06n;_pd4yFRXvs8 zEC>yN?p*hnetpC|^-%sKoPFmPP?|~oM%ly83W3A$Sqbh4EMGdZ;lQ1LDuqpp-<{;e zvHq7}-^l0CvMY^RRZA95HG3FFm7_PoEOIooPTJCy;-XZ<;C*kquKJ4OFkQbKL8;1T zEY|P6c0+lH@$Cfdc3|yU-mJ%~qkFYZJ~U95sl|7?ou3e%9}g=<<BO!wVGDQrnGEGBgeeau#{7woeA{&2X^Liq&u>i9@|i_ zk4zGPG}CSC7Ui>>o1eBBG3>oG6`S!c^|e!ii*3s#c@h}i%{6-JnPVq>w56AgQ=!H@ zEsQW3An1DKVWlX#F*eJ1Qa`L;C`ju6PDN7uIhRzS)**uAUKI=;OA=L<p_>c%ddZUAjME(LHj)0~f5g-YA@< z;T)J1&c_fD+KqpEn{k;bMthm?K1dg3<@5ABmc&WpWWf5wo~=ZXx1E*c-3lI-I)_d@y0()M zOt5Rf_wXPAkTj8!9w14(84_$u{dNvo8yPuKFn$m{F|A$1Rh0zYi?DLtcOD%qtDZ+! zQ^=YAev(;fWRo1}Py)iJ3safzXB@BX0$^FnB}G zmu&_leeF=^EfF~nZQxH<>AH4kYi5_*nKmH7SXi_~4FS1@aZ{4s6#Z-7Wa>q(+^J5BZe5QY0IGb=H!FceMaw zqXV0-CZ@$^dUX+xQbclr9d{L&3+67oGzj+8k&ags^ipmQ3ih?!%mWnk34C)2(_p(r9tnwpfJqGxNOF~6}N2P7{ks&9rb0F7A zpY_@>@Q^Oz_^nDGu8`?#@R6AxMD9tpRAFL(odv~T9pe4<&!HhJ{tU$>9Lx;{#GAS)vS|3-)ABZm9vF=}HmdfC2<*&Xuw zCPO|=6ZKPdAiO_s=)X{Y_$)`Zq->nji3Ao^UmW*&znj5)rGQIQG3x$^=6N8z~U zU`5$7Nv<(>3L%fs@eu~jU~P6Uk0BRwE`GyV=TbFMLDHu!H`L^nt%zA8r9>Gx=sta# z%ugsCTYR^=EL>W#`cU-TPnv%v3I@oz8Th$+z#6bZ_+3Qj{JIlP}5}aZ#5nUEB&`gu$755 zpDdlcj@cud+M4Fr;eLK%n@Xo3@wt7j%L(MKU8g=6Uh^zkW6aIYc5cvq z$xjobv%Het9*{y!fcDg%uw;9_J+V;8S^u%sU|cl)1)@~Y1+;|Do=Pu62BYEBhlqn; z^4@KocL-HquV}U6b6SD`Y^eMOLJm2=HC6@R6roDHq|gf}-iYQvL&|5Dgiub176`0a zok4BB^G2&;8RNs8i`ig&>!jAO?C6m9NsjMgoh@l%dpfb~^PdSNnPVKrDLUb7RnL`@ zZbCpKl5efQ{=Kd34{|NO5uh!RgGdo6;78ckit&vswNzB8P@jIMbqC)-obrec#NXgw zI_!teN7r@^)KYa_Ffs7Ix~9&MJWn({X8Z9SY zZEe{UPsi`$uwzxKU7=U?;@0MB=cW!)U`8F#WnmVVpavy~#>aZ-JSPYk>{G8aDn-5d zRn|w7Bx2vUefu5n?_2o#OygCwQkX3llPr>PN+npBF-u31h6p}N<>CfjXhV>IYZQT# zwc}+iNhEh$7-#lYz!iRW(kK0_Y+PRvPQ77_#9XPO^xiNKRJ@>KLrV>0mu;6^IJ*zw zQ_?T8=cItGyU*6Ml%^kJXj2%~m~K4vfti*}(-skTydu1QOZGNSvTsldJl9wl8CL$&5EvvRU?kLO{|n*6Dgwy8sh_s~aU ziQ8^~B=%=pbrLI59NJMN!na>@$ zNYkZ+xMO8$sMT;f;mg2-jm^O$NN%|7XMd-uCb&U(_nz@|q_2S_Geb;J@_f5xUBjfX z8Ac5^ki=U$CIY!?;c+O_f8-YFxm|W*i`CyC+639RurDKa8#GmQARCt}nYrIQ@J~Bb zmG)mKOE_ZpwwCs{?_DeOWfPpHkLsaFnoH^7z(liIEm)MRFM~}!4Ge}*>hK0&dR+88 zaE!Q?gLDrqg6Rec?-ucy{wctL&R0@Edc^)p@GQgfJV4FMsNH7;e;Zv~$F022&;H)U z?X-*Cw>E_`=f{>=k(aufa-pHUVdSf3EN3b~hOpai==I(V>W!!S+9Gg7A^Sw8A( zSuku?>TQ3-7sE95OjZP~xF90QK>93sK56((g=+mB_k|0E2abNvcO$^BWr5EGb_}3^ zGSIEfv|u;D^B$=<|62OzDR7QUvIlherTh_C7iHk;YXD|X!R?sQgfUx#H{?FOs`$Gq z7fL&?=~)KJTNN*6#rGl(U9iIw)iMHun?3+7`(babdD!OG=U@|RYz=FbK*Ny@#-swX z3o4ns75Cso+53?cc|)f``+x`gsFD0G7;nB}Rg*epSXRG|sCte?Rl=LA8{Ls15c>_y z`*`6b%6%Ri_IETwRc5cv2JRNJ#?TW-?gORyLJs1F<9nKfsTzdC_k0l;DM*t*h1W?W*Ht{GS3Bt zvPri2KG@d*B;DdsoTP!MfV*G!INVxjC0rEi6#+3Uc<4$WUQ<80BT7#dg-V=e2MyUS z!lka4;RKCUar!$FtTPILqGr4%s7^*4o_qpO^rl1UpHt9$#t;mnp8^tyhCEnbY#oD4 z+qv=kTw+KR7KF^uZAUrL8z{x!4m7OL40c|&{$Aanhll; zy|wEx|64+0o&7h%vabm)Jf6qOm>PEogr&|TRNx?h*V3sp6%%b8IqeBRCZo)E zu_nuCHd+HzIwCuDJH+YawT$l_uBr$1lDmF_Ga&#uSs);)hY=c)pTM{r*STA-1_iaJ zFmCy7$2oLG`0nGB05A9ZJ<{f@NSu*MP62xHo>2=}>xvaHUdT#DqkCwqQ|3zsW$?b2 z+wcdQlTR%Y3^5s;5th0#BdumUaIUVTEs2BP;0bT+CvxSHLEcgSPoiwGl&kYu;417TX zNJ96w3!OgbzfRib_H#+buZB9BW`591-@>MeWo28B7ExIl<~N{AOKkoWf6Aw!9l!Xy z;l%VXWy>lKjpMEfFXha(*{gPTcD+z70@fr`Z04h`Jn|`iz~9Wl4;)hcE_bcByvOF4 zdLiqcmYPULp%2q(=$q&tu)d?>#LnmY1UR3Gx5TSdPA1Z&!Ed5SYEe`?c4itb(`*@A zk##SRR_UThR2CBbha!uxcALDePe)QWUgCegIIJ6` z1(%&0geNfY7Rc!61SlpeAiR6VtO^OlHfRwl!QCj3m@JUeNk{sG4qq6QrjdMZHVSpg zz0zm;ExTcOHJ~j!{^@&6boyS95~r62jf?;B-3my0_Ex27b8P8e#NyXN@4QPj^wih^ zn^lm)r~%5dsIaTElb}c*hauZM)B@KOCZ~&RZObMySZFe=wCEhawkXO)ky9F+1p=JBvQJhQyt!LFVApg7H08ZCfYz7~}c zt+k5r#$oX}2!pErBC*s{g~(t!Z==UM@RZ(MvOlIGxi^0ym88dF?t)1+Cp7fx+~Zjo zL6@8#(nX(DM}*JlkYOkT5<>2scaqR-Qg*{w*z&^R3bg~7pF5i#Sk~s zd6%L**Ix}Ja+2LaWN{;$smP_RLKxfW4Xry@`V}`u`@iL^{^p)si8*fp)hzXjS+xZF z$mmkc;O397uGAl*xtX9E89M7@wyG?rF#IQUbER)8F!T8(#0ht%qbrvOTAIZ9f(rZ#z!>yjE)@KQ(KNNK7NCa8Aeqvf) zp|UId+YEm&@~t;7_m`x7r1i(dS}z|z*C5Vc)<5$Hxmx&cv^&#!d}nj`V?XK3;-$xj^|FtS{LYVG6?6ry%S%sJG9oQeXGgm(&HvAp(hJ_#WgOe0jl|V|+(_bcA zyuRLAZu8H5GJ>)WU+K(8P2r;i7GxH`x!q45*Kq90x2M%PTxKN${_l%Q_(V0S#m8Fq z+b@)hUX;8!0$gsZZo4AEMclo@`p55PfY0ZUX$Z0#F(5j5_UvgA2vgw#Q}SbOIh|fp z?#X$6u`nMJuPwzz=LY^ ziO)D8E+v!1R{2SGs(in@xT%$!B)ary3=;JRDA-j#TLiWV-JBw~>Af4V0P^pdi(R;H z()=MrzNqeI1598|6cqueAxX)2r)9*Rb1_a(_hxQx$d8ac14OZMK>e6{LmwXVA6Lq< zoF6>jJBx6*{m$Re$lXXw27CjfVoNIr8+Uu0aMzVPlu*kz|jjAa>tk5oSL#!OAKQEyXu_H126e2mkP(! zHQkr-4wTJD)8Y((iMIl(N%?ME_ztoW)~{oU;{o7;<8n0vdCVQBpoRulw*4QD8(mdzIb&Dx!u*PfLfk_CIW16^PSh zrDUGGI|xaCo<{AYsY%W%)^VamcIaC?G%g$Vf&XXTCSZ)^o3pW)*HkIE_>C#Ke_2&d z(6alRD#>M>{5G)D->djtSaX*@Q9Y*Yz!f;d<`jpYeE+BP)wx2G^bk*ZlUNq% znoK@2JBUgrUri~XB~~YM&1kqrr9JWrc3CNjpU;w4!42p?ZW*tbPg#?ClOUB=P2RI0 zoh?S4_i2tkZxZH%_hV3F33*_FCBvx^`G?6VdXk|l!35Q{ng0WOnkS6hIeYB_0A?Cy4!4kP>w$IVgT!V&KEiTkuT9DNoUHkIw?D#B zvS=xbWIKJjJ@)K(*u$4J6N@oja~O(49AO$c7WdW(sEMMjs2`fv4Evz*bKx@n;mPN< zz6+N|b*%Y}cqvAT+%Neb$tC@Qo}P=47bCDePFxz&%0sW{ZbQehY6c4DxfRPZ6lu4=5vt~!74z2FaC;B*?7uWL(u3sJHmPI%q6 zESp(9Q+yK&?L~xMm*g#qyFV@t5qmq!$|5J-Shgo961usz;!F8)tKacxYaGr$ERre{*1Es z9qm$&tpMra_m~PyXJlR6Vk{5ATLYF}+YZCqS+c+Ua=rYi^h1p*`^h(iw-4y;^~=q+ zfgD=qB2D<@2ZWj$N){Eu_A*v6A77ctxFjlN7~q%lLZnj;pC*t(*hhNmwBt9~PnNBb zixB)Q@mow4VDGSy22tMFdC2l<#XX${746pz#OZI0Y$h9w3aBPcenJR|%Bxa7QDFJg zK0Oe_6ZfiFNlK<4mYLn)bCFZ1V$;5=bmF?n|DA0GlP zUE&4IMgfuxoz&VFf&+W^jL|lX#KE`tMVJO_S!XB#gJA8|ln*VGeka^Bi@}PodY-EV zv7*2ODiPgxd{#xI*FsJ(ci<5r%sxW_5UB+UFvU3AM`#x}v4|d}2Y1Pd z-(%?V_Gvdet`pYhn=~!RQ6g7>WY7Kq>7V$*Kc`isiKjq?%^2p>Y|(d&-+5CM=j(ax zM$&3D&$6>0@bh+GT{pyO6@6F{KEm#4smD&9`U-NU>;LjW?@GU|&hs<>cBVrA(Fg8S zJSF_nw8Q+P1^JiZ{(i~wGQQ32Z~14o^oN!Bcd=Z&e2oRaU00Z^`HU?72g3Yoa4I~R zpt}82R{HOQx8XS&<$!|2Pq@Rs%OW5MFS1{Q@1NrE--r5Z#^24K<9VCcE2TpJxOvDp zfZAl$g1-}FpIv&!nEq`ZhL-0D?>~+a`O!LFuG0J*bn5_QwYErf79#!QD}P@kL*q;4 z%%&NBOUg{a45SbKLy@t78GxF=G6T|HAgQLm!mE zy#@ezpIld-MYvu2k!jx2xP5Z+$$Y5g@T}~i2*vZ+C1{uVS>EQK{Gs@K0L)|8K1F8) zl44bhND2YlX}zyf`kp{MrE4`=4~~w#JeJ(;p^e6W)vE-q%z;u>|Aj~Da+X_-(4sqp z;Qr0?Biz0}aB|B`{KH_C%RKsUHeZH_l1>p~c+I%N+ev1gB5=FAwtnbcyuNSKPT-cIMFp TdHBO$@PAKbl%-1_8UFr15!>LM diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index e807ee54fbf..e95409e08e9 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import dataclasses import inspect @@ -299,7 +299,6 @@ def __init__( extra_kwargs["delay_wgrad_compute"] = self.config.delay_wgrad_compute else: raise RuntimeError("Only TE with version >=2.3.0 supports delay_wgrad_compute now.") - if ( self.config.tp_comm_overlap and tp_comm_buffer_name @@ -2117,12 +2116,3 @@ def set_save_original_input(module): "set_save_original_input is only needed on transformer-engine modules that save " "quantized tensors by default. It needs transformer-engine>=2.6.0dev0." ) - - -try: - # pylint: disable=unused-import - from transformer_engine.pytorch import cpu_offload - from transformer_engine.pytorch.float8_tensor import Float8Tensor -except ImportError: - Float8Tensor = None - cpu_offload = None diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 74b9a90764d..d501c11a0a9 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from contextlib import nullcontext from typing import Optional @@ -8,9 +8,6 @@ from megatron.core.enums import Fp8Recipe from megatron.core.fp8_utils import get_fp8_context -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.pipeline_parallel.utils import ( AbstractSchedulePlan, NoopScheduleNode, @@ -453,8 +450,6 @@ def run( f_layer = f_schedule_plan.get_layer(i) b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i) torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_num_layers - 1 - i}b") - if f_layer.layer.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, b_grad = TransformerLayerSchedulePlan.run( f_layer, b_layer, @@ -477,8 +472,6 @@ def run( for i in range(overlapped_layers, f_num_layers): f_layer = f_schedule_plan.get_layer(i) torch.cuda.nvtx.range_push(f"layer_{i}f") - if f_layer.layer.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, _ = TransformerLayerSchedulePlan.run(f_layer, None, f_input=f_input) torch.cuda.nvtx.range_pop() diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 786a1b850dd..fd1cc3d33c6 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import weakref from contextlib import nullcontext @@ -8,11 +8,6 @@ import torch from megatron.core import tensor_parallel -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, -) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer @@ -355,17 +350,13 @@ def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor) Run forward pass for computations between attention and dispatch: pre mlp layernorm->router->dispatch preprocess """ - if layer.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") if layer.recompute_pre_mlp_layernorm: layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) else: - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) local_tokens, probs, _ = layer.mlp.router_and_preprocess(pre_mlp_layernorm_output) @@ -446,10 +437,6 @@ def submodule_combine_forward( hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout ) - if layer.offload_mlp_norm: - (hidden_states,) = fine_grained_offloading_group_commit( - hidden_states, name="mlp_norm", forced_released_tensors=[residual] - ) output = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index ae292649561..654827dc6fb 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from collections import OrderedDict from typing import Dict, Literal, Optional @@ -18,9 +18,6 @@ ) from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_init_chunk_handler, -) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region @@ -120,7 +117,6 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.vp_stage = vp_stage - self.disable_param_offloading = True if hasattr(self.config, 'position_embedding_type'): self.position_embedding_type = self.config.position_embedding_type @@ -414,22 +410,6 @@ def _preprocess( return preproc_output - def preprocess_for_fine_grained_offloading(self): - """Preprocess for fine-grained activation offloading.""" - fine_grained_offloading_init_chunk_handler( - self.vp_stage, self.config.min_offloaded_tensor_size - ) - if self.disable_param_offloading: - for param in self.decoder.parameters(): - param.offloading_activation = False - if self.mtp_process: - for param in self.mtp.parameters(): - param.offloading_activation = False - if self.post_process: - for param in self.output_layer.parameters(): - param.offloading_activation = False - self.disable_param_offloading = False - def forward( self, input_ids: Tensor, @@ -455,8 +435,6 @@ def forward( runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. """ - if self.config.fine_grained_activation_offloading: - self.preprocess_for_fine_grained_offloading() inference_context = deprecate_inference_params(inference_context, inference_params) @@ -723,9 +701,6 @@ def build_schedule_plan( TransformerModelChunkSchedulePlan: The model chunk schedule plan. """ - if self.config.fine_grained_activation_offloading: - self.preprocess_for_fine_grained_offloading() - from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan return TransformerModelChunkSchedulePlan( diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py deleted file mode 100644 index b28bbcbeddc..00000000000 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ /dev/null @@ -1,603 +0,0 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -import warnings -from collections import deque -from contextlib import nullcontext -from typing import Any - -import torch - -# CPU offload implementation for pipeline parallelism -DEBUG = False -DEBUG_RANK = 0 - - -def debug_rank(message): - """Print debug message for a specific rank when DEBUG is enabled.""" - # pylint: disable=bad-builtin - if not DEBUG: - return - assert torch.distributed.is_initialized() - if torch.distributed.get_rank() == DEBUG_RANK: - print(message) - - -def set_ideal_affinity_for_current_gpu(): - """Set CPU affinity for the current GPU to optimize host-device transfers.""" - import uuid - - try: - import cuda.bindings.driver as cuda_driver - import cuda.bindings.runtime as cuda_runtime - except ImportError: - import cuda.cuda as cuda_driver - import cuda.cudart as cuda_runtime - try: - import pynvml - except ImportError: - warnings.warn("pynvml is not installed, skipping GPU affinity setting") - return - - # Get current CUDA device ID - err, device_id = cuda_runtime.cudaGetDevice() - assert err == cuda_runtime.cudaError_t.cudaSuccess - # Get device UUID - err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) - assert err == cuda_driver.CUresult.CUDA_SUCCESS - # Set CPU affinity based on GPU's NUMA node - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) - pynvml.nvmlDeviceSetCpuAffinity(handle) - - -class PipelineOffloadManager: - """ - Singleton manager for coordinating activation offloading across pipeline stages. - Manages chunk handlers, synchronizes GPU-CPU transfers, - and handles virtual pipeline parallelism. - """ - - OFFLOAD_MGR = None - - @classmethod - def get_instance(cls): - """Get the singleton instance of PipelineOffloadManager.""" - if cls.OFFLOAD_MGR is None: - cls.OFFLOAD_MGR = PipelineOffloadManager() - return cls.OFFLOAD_MGR - - def __init__(self): - """Initialize the manager with queues and dedicated CUDA streams.""" - from megatron.core import parallel_state - - # Queue to store chunk handlers for backward pass - self._queue = deque() - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is None: - self._vpp = 1 - else: - self._vpp = parallel_state.get_virtual_pipeline_model_parallel_world_size() - - # Cache chunk handlers for each virtual pipeline stage - self._stages = [[] for _ in range(self._vpp)] - # allocate streams and events for synchronization - self._d2h_stream = torch.cuda.Stream() - self._h2d_stream = torch.cuda.Stream() - self.reset() - - @property - def d2h_stream(self): - """Get the device-to-host (GPU to CPU) transfer stream.""" - return self._d2h_stream - - @property - def h2d_stream(self): - """Get the host-to-device (CPU to GPU) transfer stream.""" - return self._h2d_stream - - def reset(self): - """Reset manager state for a new training iteration.""" - set_ideal_affinity_for_current_gpu() - self._inside_context = False - self._cur_forward_chunk = None - self._cur_backward_chunk = None - # Track the first microbatch of the last virtual pipeline stage - self._is_first_last_vpp_chunk = True - - def flush(self): - """Flush all staged chunks to the backward queue in reverse order.""" - # Ensure all virtual pipeline stages have the same number of chunks - if len(self._stages[0]) == len(self._stages[-1]): - lens = [len(e) for e in self._stages] - assert min(lens) == max(lens), "All stages must have same chunk count" - # Clear the last stage and push all chunks in reverse order for backward - self._stages[-1] = [] - for chunks in reversed(self._stages): - for chunk in chunks: - self.push(chunk) - # Clear all stages after flushing - for i in range(self._vpp): - self._stages[i] = [] - - def push(self, handler): - """Add a chunk handler to the backward queue.""" - debug_rank(f"pushing handler {handler}") - self._queue.append(handler) - - def pop(self): - """Remove and set the next non-empty chunk as the current backward chunk.""" - assert self.size(), "Cannot pop from empty queue" - while self._queue: - self._cur_backward_chunk = self._queue.popleft() - if not self._cur_backward_chunk.is_empty_chunk(): - break - debug_rank(f"popping handler {self._cur_backward_chunk}") - - def front(self): - """Get the first non-empty chunk handler without removing it from the queue.""" - if not self.size(): - return None - for chunk_handler in self._queue: - if not chunk_handler.is_empty_chunk(): - return chunk_handler - return None - - def size(self): - """Return the number of chunk handlers in the queue.""" - return len(self._queue) - - def init_model_chunk_offload_handler(self, vp_stage, min_offloaded_tensor_size=1024 * 1024): - """ - Initialize a chunk offload handler for a model chunk (microbatch). - - Args: - vp_stage: Virtual pipeline stage index (None means stage 0) - min_offloaded_tensor_size: Minimum tensor size (in elements) to offload - """ - if vp_stage is None: - cur_vpp_rank = 0 - else: - cur_vpp_rank = vp_stage - - is_first_last_vpp_chunk = self._is_first_last_vpp_chunk - # Flush staged chunks when reaching the last virtual pipeline stage - if cur_vpp_rank == self._vpp - 1: - self.flush() - # Determine if this is the first microbatch of the last virtual pipeline stage - is_first_last_vpp_chunk = is_first_last_vpp_chunk and (cur_vpp_rank == self._vpp - 1) - - cur_chunk = ChunkOffloadHandler(is_first_last_vpp_chunk, min_offloaded_tensor_size) - self._stages[cur_vpp_rank].append(cur_chunk) - # For the last stage, push immediately and flush - if cur_vpp_rank == self._vpp - 1: - self._is_first_last_vpp_chunk = False - self.push(cur_chunk) - self.flush() - self._cur_forward_chunk = cur_chunk - cur_chunk.vpp_rank = cur_vpp_rank - - def set_last_layer(self, is_last_layer): - """Mark whether the current forward chunk is processing the last layer.""" - self._cur_forward_chunk.is_last_layer = is_last_layer - - def cur_forward_chunk(self): - """Get the current forward pass chunk handler.""" - return self._cur_forward_chunk - - def cur_backward_chunk(self): - """Get the current backward pass chunk handler.""" - return self._cur_backward_chunk - - def __enter__(self): - """Enter context manager to enable activation offloading hooks.""" - debug_rank("----__enter__") - from megatron.core.extensions.transformer_engine import cpu_offload - - if cpu_offload is not None: - cpu_offload.CPUOffloadEnabled = True - self.inside_context = True - - torch._C._autograd._push_saved_tensors_default_hooks( - self.on_save_for_backward, self.on_get_saved_tensor - ) - - def __exit__(self, *args: Any): - """Exit context manager and restore original tensor saving behavior.""" - debug_rank("----__exit__") - from megatron.core.extensions.transformer_engine import cpu_offload - - if cpu_offload is not None: - cpu_offload.CPUOffloadEnabled = False - self.inside_context = False - torch._C._autograd._pop_saved_tensors_default_hooks() - - def on_save_for_backward(self, tensor: torch.Tensor) -> Any: - """ - Hook called when autograd saves a tensor for backward pass. - Returns a tag to identify the tensor later. - """ - debug_rank(f"------on_save_for_backward {tensor.shape}") - assert self.inside_context, "Must be inside offload context" - return self.cur_forward_chunk().tensor_push(tensor) - - def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: - """ - Hook called when autograd retrieves a saved tensor during backward pass. - Returns the actual tensor (potentially reloading from CPU). - """ - debug_rank(f"----on_get_saved_tensor {saved_state}") - return self.cur_backward_chunk().tensor_pop(saved_state) - - -class ChunkOffloadHandler: - """ - Handles activation offloading and reloading for a single pipeline chunk (microbatch). - Manages tensor groups, coordinates asynchronous GPU-CPU transfers, and handles synchronization. - """ - - @staticmethod - def offload(src_tensor, pin_memory=True): - """Offload.""" - debug_rank("--------offload") - from megatron.core.extensions.transformer_engine import Float8Tensor - - fp8_offload = isinstance(src_tensor, Float8Tensor) if Float8Tensor is not None else False - - if not src_tensor.is_contiguous(): - src_tensor = src_tensor.contiguous() - - cpu_backup = torch.empty( - src_tensor.size(), - dtype=torch.uint8 if fp8_offload else src_tensor.dtype, - layout=src_tensor.layout, - device="cpu", - pin_memory=pin_memory, - ) - - if fp8_offload: - cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) - - cpu_backup.copy_(src_tensor, non_blocking=pin_memory) - state = (src_tensor.device, cpu_backup) - return state - - @staticmethod - def reload(state, non_blocking=None): - """Reload.""" - debug_rank("------reload") - dev, cpu_backup = state - if non_blocking is None: - non_blocking = cpu_backup.is_pinned() - return cpu_backup.to(dev, non_blocking=non_blocking) - - def __init__(self, is_first_last_vpp_chunk, min_offloaded_tensor_size): - # Data Structure to maintain reference to activation tensors - self._tensor_tag_to_state = {} - # Mark the first microbatch of the last virtual pipeline stage - self._is_first_last_vpp_chunk = is_first_last_vpp_chunk - - # Group management for batching offload/reload operations - self._offloaded_group_index = 0 - self._groups_to_offload = [] - self._groups_to_reload = [] - self._tensor_count_current_group = 0 - - # Counter for special torch tensor types (FakeTensor, FunctionalTensor) - self.torch_tensor_count = 0 - self.d2h_stream = PipelineOffloadManager.get_instance().d2h_stream - self.h2d_stream = PipelineOffloadManager.get_instance().h2d_stream - self._offload_events = {} - self._reload_events = {} - self.min_offloaded_tensor_size = min_offloaded_tensor_size - self.is_last_layer = False - - def is_empty_chunk(self): - """Check if this chunk has no tensors to manage.""" - return len(self._tensor_tag_to_state) == 0 - - def is_first_last_layer(self): - """ - Check if this is the last layer of the first microbatch of the last vp stage. - These tensors should not be offloaded to avoid unnecessary overhead. - """ - debug_rank( - f"------is_first_last_layer {self._is_first_last_vpp_chunk} {self.is_last_layer}" - ) - return self._is_first_last_vpp_chunk and self.is_last_layer - - def tensor_push(self, tensor): - """Push tensor to the offload handler.""" - torch_stray_tensor = isinstance( - tensor, - ( - torch._subclasses.fake_tensor.FakeTensor, - torch._subclasses.functional_tensor.FunctionalTensor, - ), - ) - - if not torch_stray_tensor: - # Assign unique tag based on group index and position within group - tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group) - self._tensor_count_current_group += 1 - assert tensor_tag not in self._tensor_tag_to_state, "Duplicate tensor tag" - self._tensor_tag_to_state[tensor_tag] = tensor - else: - # Use negative group ID for special tensor types - tensor_tag = (-1, self.torch_tensor_count) - self.torch_tensor_count += 1 - self._tensor_tag_to_state[tensor_tag] = tensor - debug_rank(f"--------tensor_push {tensor_tag}") - return tensor_tag - - def tensor_pop(self, tensor_tag): - """Pop tensor from the offload handler.""" - debug_rank(f"--------tensor_pop {tensor_tag}") - assert tensor_tag in self._tensor_tag_to_state, f"Tag {tensor_tag} not found" - tensor = self._tensor_tag_to_state.pop(tensor_tag) - # If tensor is offloaded (stored as tuple), reload it - if isinstance(tensor, tuple): - tensor = self.reload(tensor) - debug_rank(f"--------tensor_pop {tensor.shape}") - return tensor - - def tensor_need_offloading_checker(self, tensor): - """Check if the tensor needs to be offloaded.""" - if tensor.numel() < self.min_offloaded_tensor_size: - return False - # Respect tensor's offload preference if specified - if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation: - return False - return True - - def bulk_offload_group(self, group_to_offload): - """offload a group of tensors recorded in tensor_push().""" - debug_rank("------bulk_offload_group") - assert not self.is_first_last_layer(), "Should not offload first-last layer" - group_id_to_offload, name = group_to_offload - torch.cuda.nvtx.range_push("activation offloading " + name) - with torch.cuda.stream(self.d2h_stream): - for tensor_tag, state in self._tensor_tag_to_state.items(): - group_id, _ = tensor_tag - if group_id == group_id_to_offload: - debug_rank(f"------tensor_tag {tensor_tag}") - debug_rank(f"------group_to_offload {group_to_offload}") - assert not isinstance(state, tuple), "Tensor already offloaded" - tensor_on_device = state - if self.tensor_need_offloading_checker(tensor_on_device): - state = self.offload(tensor_on_device) - event = torch.cuda.Event() - event.record(self.d2h_stream) - self._offload_events[name] = event - tensor_on_device.record_stream(self.d2h_stream) - self._tensor_tag_to_state[tensor_tag] = state - torch.cuda.nvtx.range_pop() - - def get_offload_event(self, name): - """Get the CUDA event for a named offload operation.""" - return self._offload_events.get(name, None) - - def get_reload_event(self, name): - """Get the CUDA event for a named reload operation.""" - return self._reload_events.get(name, None) - - def bulk_reload_group(self, group_to_reload): - """Bulk reload group.""" - debug_rank("----bulk_reload_group") - found_reload_group = False - group_id_to_reload, name = group_to_reload - torch.cuda.nvtx.range_push("activation reloading " + name) - with torch.cuda.stream(self.h2d_stream): - for tensor_label, state in self._tensor_tag_to_state.items(): - group_id, _ = tensor_label - if group_id == group_id_to_reload: - debug_rank(f"----tensor_label {tensor_label}") - found_reload_group = True - event = self.get_offload_event(name) - # Only reload if tensor was offloaded (stored as tuple) - if isinstance(state, tuple): - # Wait for offload to complete before reloading - torch.cuda.current_stream().wait_event(event) - recovered_tensor = self.reload(state) - event.record(self.h2d_stream) - self._reload_events[name] = event - debug_rank(f"----recovered_tensor {recovered_tensor.shape}") - self._tensor_tag_to_state[tensor_label] = recovered_tensor - torch.cuda.nvtx.range_pop() - return found_reload_group - - def pre_reload_last_layer(self): - """Pre-reload the last layer of this chunk to hide reload latency.""" - debug_rank("pre_reload_last_layer") - assert not self._is_first_last_vpp_chunk, "Should not pre-reload first chunk" - debug_rank(f"len(self._groups_to_reload) {len(self._groups_to_reload)}") - if len(self._groups_to_reload) > 0: - # Reload the last group (last layer) early - if self.bulk_reload_group(self._groups_to_reload[-1]): - self._groups_to_reload.pop() - - def should_bulk_offload(self): - """Determine if the current group should be offloaded.""" - # Don't offload the first backward chunk's last layer - if self.is_first_last_layer(): - return False - - # Check if next backward chunk is this chunk (for last pipeline stage) - next_backward_chunk = PipelineOffloadManager.get_instance().front() - if next_backward_chunk is not None and next_backward_chunk is self: - # Don't offload last layer if it's about to be used immediately - if self.is_last_layer: - return False - - return True - - def bulk_offload(self, forced_released_tensors): - """Offload a group of tensors and optionally release their GPU memory.""" - debug_rank("----bulk_offload") - if self.should_bulk_offload(): - group_to_offload = self._groups_to_offload.pop() - self._groups_to_reload.append(group_to_offload) - self.bulk_offload_group(group_to_offload) - # Manually release tensors not auto-freed by torch GC - if len(forced_released_tensors) > 0: - cur_stream = torch.cuda.current_stream() - for release_tensor in forced_released_tensors: - if self.tensor_need_offloading_checker(release_tensor): - # Ensure tensor is not in use before freeing - release_tensor.record_stream(cur_stream) - release_tensor.untyped_storage().resize_(0) - - def on_group_commit_forward(self, forced_released_tensors): - """Called at the end of a layer group's forward pass to trigger offloading.""" - debug_rank("--on_group_commit_forward") - # Wait for compute to finish before starting offload - self.d2h_stream.wait_stream(torch.cuda.current_stream()) - self.bulk_offload(forced_released_tensors) - - def bulk_reload(self): - """Reload the next group of tensors from CPU to GPU.""" - debug_rank("--bulk_reload") - if len(self._groups_to_reload) > 0: - # Reload the next layer group - if self.bulk_reload_group(self._groups_to_reload[-1]): - debug_rank(f"--bulk_reload_group {self._groups_to_reload}") - self._groups_to_reload.pop() - else: - # Pre-load the last layer of the next backward chunk to hide latency - next_backward_chunk = PipelineOffloadManager.get_instance().front() - if next_backward_chunk is not None: - next_backward_chunk.pre_reload_last_layer() - - def on_group_commit_backward(self, name): - """ - Called at the end of a layer group's backward pass. - Ensures correct chunk is active and synchronizes reloads. - """ - debug_rank("--on_group_commit_backward") - cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() - # Switch to this chunk if it's not already current - if cur_backward_chunk is not self: - PipelineOffloadManager.get_instance().pop() - cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() - assert cur_backward_chunk is self, "Chunk mismatch" - # Wait for reload to complete before using tensors - event = self.get_reload_event(name) - if event is not None: - torch.cuda.current_stream().wait_event(event) - self._offloaded_group_index = self._offloaded_group_index - 1 - - def on_group_start_forward(self, name): - """ - Called at the start of a layer group's forward pass. - Increments group index and prepares for offloading. - """ - debug_rank(f"--on_group_start_forward") - self._offloaded_group_index = self._offloaded_group_index + 1 - self._tensor_count_current_group = 0 - self._groups_to_offload.append((self._offloaded_group_index, name)) - - def on_group_start_backward(self): - """ - Called at the start of a layer group's backward pass. - Triggers reloading of tensors from CPU. - """ - debug_rank("--on_group_start_backward") - # Wait for compute to finish before starting reload - self.h2d_stream.wait_stream(torch.cuda.current_stream()) - self.bulk_reload() - - -class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): - """ - Identity operation that marks the end of a layer group for offload synchronization. - Triggers offload during forward and synchronizes reload during backward. - """ - - @staticmethod - def forward(ctx, *args): - # pylint: disable=missing-function-docstring - debug_rank("FineGrainedOffloadingGroupCommitFunction forward") - - forced_released_tensors = args[-1] - name = args[-2] - cpu_offload_handler = args[-3] - tensor = args[:-3] - cpu_offload_handler.on_group_commit_forward(forced_released_tensors) - ctx.cpu_offload_handler = cpu_offload_handler - ctx.name = name - - # return the identical tensor - return tensor - - @staticmethod - def backward(ctx, *grad_output): - # pylint: disable=missing-function-docstring - debug_rank("FineGrainedOffloadingGroupCommitFunction backward") - - cpu_offload_handler = ctx.cpu_offload_handler - cpu_offload_handler.on_group_commit_backward(ctx.name) - return grad_output + (None, None, None) - - -def fine_grained_offloading_group_commit(*tensor, name, forced_released_tensors=[]): - """ - Specify the tensors to be released after offloading. - forced_released_tensors is a list of tensors to be released after offloading. - The tensors will be untyped_storage().resize_(0) after offloading. - Note: specify the tensors only when they are not automatically released by torch gc. - """ - cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() - return FineGrainedOffloadingGroupCommitFunction.apply( - *tensor, cur_forward_chunk, name, forced_released_tensors - ) - - -class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function): - """ - Identity operation that marks the start of a layer group for offload/reload. - Prepares for offload during forward and triggers reload during backward. - """ - - @staticmethod - def forward(ctx, tensor, cpu_offload_handler, name): - # pylint: disable=missing-function-docstring - ctx.cpu_offload_handler = cpu_offload_handler - debug_rank("FineGrainedOffloadingGroupStartFunction forward") - - cpu_offload_handler.on_group_start_forward(name) - # return the identical tensor - return tensor - - @staticmethod - def backward(ctx, grad_output): - # pylint: disable=missing-function-docstring - debug_rank("FineGrainedOffloadingGroupStartFunction backward") - cpu_offload_handler = ctx.cpu_offload_handler - cpu_offload_handler.on_group_start_backward() - return grad_output, None, None - - -def fine_grained_offloading_group_start(tensor, name=None): - """Mark the start of a layer group and prepare for offload/reload.""" - cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() - return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name) - - -def get_fine_grained_offloading_context(flag): - """Get the fine-grained offload context""" - return PipelineOffloadManager.get_instance() if flag else nullcontext() - - -def fine_grained_offloading_set_last_layer(is_last_layer): - """Set the last layer flag.""" - PipelineOffloadManager.get_instance().set_last_layer(is_last_layer) - - -def fine_grained_offloading_init_chunk_handler(vp_stage, min_offloaded_tensor_size): - """Initialize the chunk handler, called at the start of a microbatch forward pass.""" - PipelineOffloadManager.get_instance().init_model_chunk_offload_handler( - vp_stage, min_offloaded_tensor_size - ) - - -def fine_grained_offloading_reset(): - """Reset the chunk handler, called at the start of a training iteration.""" - PipelineOffloadManager.get_instance().reset() diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 09f95ac25d2..e83f8d90635 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import contextlib from functools import partial @@ -9,9 +9,6 @@ from megatron.core import parallel_state from megatron.core.enums import ModelType -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_reset, -) from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator from megatron.core.pipeline_parallel.utils import ( is_pp_first_stage, @@ -565,9 +562,6 @@ def forward_backward_no_pipelining( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext @@ -904,9 +898,6 @@ def forward_backward_pipelining_with_interleaving( adjust_tensor_shapes_fn is None ), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism" - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - if config.overlap_p2p_comm and config.batch_p2p_comm: raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") @@ -2052,9 +2043,6 @@ def forward_backward_pipelining_without_interleaving( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - # Disable async grad reductions no_sync_func = config.no_sync_func if no_sync_func is None: diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 2ae15bef0d9..54cac0e41e3 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch @@ -510,11 +510,10 @@ def forward(ctx, run_function, checkpoint_without_output_obj, *args): @staticmethod def backward(ctx, *args): """Backward pass.""" - inputs = ctx.inputs + inputs = ctx.saved_tensors outputs = ctx.outputs torch.autograd.backward(outputs, args) ctx.outputs = None - ctx.inputs = None grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs) return (None, None) + grads @@ -574,9 +573,8 @@ def _recompute(self, _): recompute_ctx = contextlib.nullcontext() fp8_ctx = contextlib.nullcontext() - inputs = self.ctx.saved_tensors with torch.enable_grad(), fp8_ctx, recompute_ctx: - outputs = self.run_function(*inputs) + outputs = self.run_function(*self.ctx.saved_tensors) self.run_function = None self.rng_states = None @@ -592,7 +590,6 @@ def _recompute(self, _): output.untyped_storage().copy_(recomputation_output.untyped_storage()) self.ctx.outputs = outputs - self.ctx.inputs = inputs self.outputs = None self.ctx = None diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 3427b5ee3ab..d4e990041ca 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from abc import ABC, abstractmethod from dataclasses import dataclass @@ -22,11 +22,6 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, -) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule @@ -193,21 +188,6 @@ def __init__( and "core_attn" in self.config.recompute_modules ) - self.offload_qkv_linear = ( - self.config.fine_grained_activation_offloading - and "qkv_linear" in self.config.offload_modules - ) - - self.offload_core_attention = ( - self.config.fine_grained_activation_offloading - and "core_attn" in self.config.offload_modules - ) - - self.offload_attn_proj = ( - self.config.fine_grained_activation_offloading - and "attn_proj" in self.config.offload_modules - ) - # Output. self.linear_proj = build_module( submodules.linear_proj, @@ -750,17 +730,9 @@ def forward( if output_gate: assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor." - if self.offload_qkv_linear: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="qkv_linear") - with get_fine_grained_offloading_context(self.offload_qkv_linear): - qkv_output = self.get_query_key_value_tensors( - hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv - ) - if self.offload_qkv_linear: - qkv_output, _ = fine_grained_offloading_group_commit( - qkv_output, name="qkv_linear", forced_released_tensors=[hidden_states] - ) - + qkv_output = self.get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) attn_mask_type = self.attn_mask_type block_table = None gate = None @@ -909,20 +881,17 @@ def forward( packed_seq_params=packed_seq_params, ) else: - if self.offload_core_attention and self.training: - query = fine_grained_offloading_group_start(query, name="core_attn") if inference_context is None or inference_context.is_static_batching(): # Static batching attention kernel. - with get_fine_grained_offloading_context(self.offload_core_attention): - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - attn_mask_type=attn_mask_type, - attention_bias=attention_bias, - packed_seq_params=packed_seq_params, - ) + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + ) else: # Dynamic batching attention kernel. @@ -942,10 +911,6 @@ def forward( block_table, ) core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') - if self.offload_core_attention and self.training: - (core_attn_out,) = fine_grained_offloading_group_commit( - core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] - ) if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': # reshape to same output shape as unpacked case @@ -966,14 +931,7 @@ def forward( # ================= nvtx_range_push(suffix="linear_proj") - if self.offload_attn_proj: - core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") - with get_fine_grained_offloading_context(self.offload_attn_proj): - output, bias = self.linear_proj(core_attn_out) - if self.offload_attn_proj: - output, bias = fine_grained_offloading_group_commit( - output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] - ) + output, bias = self.linear_proj(core_attn_out) nvtx_range_pop(suffix="linear_proj") return output, bias diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index a44daea38e2..0a933aed0df 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -210,20 +210,6 @@ Enable A2A overlap across different batches inspired by the DSv3 DualPipe implme --delay-wgrad-compute ``` -### Fine-grained Activation Offloading (collaborated with rednote) -Offload the input activation at the granularity of modules - -**Usage** -```bash -# Enable fine-grained activation offloading ---fine-grained-activation-offloading - -# Specify which modules are going to offload its input -# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". ---offload-modules expert_fc1 -``` -For more details, please refer to the ```docs/source/api-guide/fine_grained_activation_offloading.md``` - ### MoE Related Arguments | Item | Description | | --- | --- | diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index ca308da0d21..d0ac20a7536 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import copy import itertools @@ -27,11 +27,6 @@ from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl from megatron.core.jit import jit_fuser -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, -) from megatron.core.tensor_parallel.layers import ( _initialize_affine_weight_cpu, _initialize_affine_weight_gpu, @@ -830,16 +825,6 @@ def __init__( tp_group=pg_collection.expt_tp, ) - self.offload_expert_fc1 = ( - self.config.fine_grained_activation_offloading - and "expert_fc1" in self.config.offload_modules - ) - - self.offload_moe_act = ( - self.config.fine_grained_activation_offloading - and "moe_act" in self.config.offload_modules - ) - self.activation_recompute = ( self.config.recompute_granularity == 'selective' and "moe_act" in self.config.recompute_modules @@ -849,12 +834,6 @@ def __init__( set_save_original_input(self.linear_fc2) - # This is to avoid the CPU overhead of multiple d2h copies - if self.offload_expert_fc1 and not (self.config.fp8 or self.config.fp4): - from megatron.core.extensions.transformer_engine import set_save_original_input - - set_save_original_input(self.linear_fc1) - if self.config.fp8 or self.config.fp4: assert HAVE_TE, "FP8 and FP4 requires TE." self.quantization_padding = Fp8Padding(self.num_local_experts) @@ -919,21 +898,9 @@ def forward( # Probs already applied, so reset to 1. permuted_probs = torch.ones_like(permuted_probs) - if self.offload_expert_fc1: - permuted_local_hidden_states = fine_grained_offloading_group_start( - permuted_local_hidden_states, name="expert_fc1" - ) - with get_fine_grained_offloading_context(self.offload_expert_fc1): - fc1_output, bias_parallel = self.linear_fc1( - permuted_local_hidden_states, tokens_per_expert - ) - if self.offload_expert_fc1: - fc1_output, bias_parallel = fine_grained_offloading_group_commit( - fc1_output, - bias_parallel, - name="expert_fc1", - forced_released_tensors=[permuted_local_hidden_states], - ) + intermediate_parallel, bias_parallel = self.linear_fc1( + permuted_local_hidden_states, tokens_per_expert + ) def bias_act_func(intermediate_parallel, bias_parallel, permuted_probs): if self.config.use_te_activation_func: @@ -993,26 +960,18 @@ def glu(x): intermediate_parallel = intermediate_parallel.to(original_dtype) return intermediate_parallel - if self.offload_moe_act: - fc1_output = fine_grained_offloading_group_start(fc1_output, name="moe_act") - if self.activation_recompute: self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_moe_act): - bias_act_output = self.activation_checkpoint.checkpoint( - bias_act_func, fc1_output, bias_parallel, permuted_probs - ) - else: - with get_fine_grained_offloading_context(self.offload_moe_act): - bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs) - - output, output_bias = self.linear_fc2(bias_act_output, tokens_per_expert) - if self.activation_recompute: + intermediate_parallel = self.activation_checkpoint.checkpoint( + bias_act_func, intermediate_parallel, bias_parallel, permuted_probs + ) + output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) self.activation_checkpoint.discard_output_and_register_recompute(output) - if self.offload_moe_act: - (output,) = fine_grained_offloading_group_commit( - output, name="moe_act", forced_released_tensors=[fc1_output] + else: + intermediate_parallel = bias_act_func( + intermediate_parallel, bias_parallel, permuted_probs ) + output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) # upad and concat the output if self.config.fp8 or self.config.fp4: diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index 5d3f16c1041..a8893ebec36 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import math @@ -22,11 +22,6 @@ _yarn_get_mscale, apply_rotary_pos_emb, ) -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, -) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.layers import ColumnParallelLinear from megatron.core.tensor_parallel.mappings import ( @@ -271,19 +266,15 @@ def forward( query, key, value, attention_mask, packed_seq_params=packed_seq_params ) else: - if self.offload_core_attention and self.training: - query = fine_grained_offloading_group_start(query, name="core_attn") - if inference_context is None or inference_context.is_static_batching(): - with get_fine_grained_offloading_context(self.offload_core_attention): - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - packed_seq_params=packed_seq_params, - attn_mask_type=attn_mask_type, - ) + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) elif self.cache_mla_latents: # Dynamic batching attention kernel. q, k, v = (query, key, value) @@ -304,10 +295,6 @@ def forward( # Only rearrange if not in absorption mode (Flash MLA handles format correctly) if not inference_context.is_decode_only(): core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') - if self.offload_core_attention and self.training: - (core_attn_out,) = fine_grained_offloading_group_commit( - core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] - ) # We are doing absorption with cache mla latents and decode mode. if self.cache_mla_latents and inference_context.is_decode_only(): @@ -333,14 +320,7 @@ def forward( # ================= # Output. [sq, b, h] # ================= - if self.offload_attn_proj: - core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") - with get_fine_grained_offloading_context(self.offload_attn_proj): - output, bias = self.linear_proj(core_attn_out) - if self.offload_attn_proj: - output, bias = fine_grained_offloading_group_commit( - output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] - ) + output, bias = self.linear_proj(core_attn_out) return output, bias diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index a619b9ffa55..bd3aa9c8c96 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from contextlib import nullcontext from dataclasses import dataclass @@ -13,9 +13,6 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( @@ -904,8 +901,6 @@ def forward( hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] for layer_number in range(len(self.layers)): - if self.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(layer_number == len(self.layers) - 1) (hidden_states, input_ids, position_ids) = self.layers[layer_number]( input_ids=input_ids, position_ids=position_ids, diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 06e8f1372f4..aead6133f22 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import logging from contextlib import nullcontext from dataclasses import dataclass @@ -16,9 +16,6 @@ from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.inference.contexts import BaseInferenceContext from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import LayerType @@ -696,11 +693,6 @@ def forward( else: inner_quantization_context = nullcontext() - if self.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer( - l_no == self.num_layers_per_pipeline_rank - 1 - ) - with self.offload_context, inner_quantization_context: hidden_states, context = layer( hidden_states=hidden_states, diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index ecc700375cd..b39b7706feb 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import warnings from dataclasses import dataclass @@ -772,25 +772,6 @@ class TransformerConfig(ModelParallelConfig): """Transformer implementation to use. Options are 'transformer_engine' for Transformer Engine and 'local' for MCore.""" - ##################################### - # Fine-grained Activation Offloading - ##################################### - fine_grained_activation_offloading: bool = False - """If True, offload the input of the specified modules to the CPU.""" - - offload_modules: Optional[list[str]] = None - """The submodules to offload its input. - choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". - "attn_norm": offload the input of the normalization in the attention part. - "core_attn": offload the input of the core attention part. - "mlp_norm": offload the input of the normalization in the mlp part. - "attn_proj": offload the input of the attn linear projection part. - "expert_fc1": offload the input of the expert fc1 part. - "moe_act": offload the input of the moe act part. - """ - min_offloaded_tensor_size: int = 1024 * 1024 - """The minimum size of the tensor to be offloaded.""" - def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more @@ -1136,28 +1117,6 @@ def __post_init__(self): if "moe" not in self.recompute_modules: self.recompute_modules.append("moe") - if self.fine_grained_activation_offloading: - assert self.offload_modules is not None and len(self.offload_modules) > 0 - allowed_modules = { - "core_attn", - "attn_proj", - "expert_fc1", - "moe_act", - "attn_norm", - "mlp_norm", - } - invalid_modules = set(self.offload_modules) - allowed_modules - assert not invalid_modules, ( - f'Invalid choices for offload_modules: {invalid_modules}. ' - f'Allowed modules are: {allowed_modules}' - ) - if "attn_proj" in self.offload_modules and "core_attn" not in self.offload_modules: - raise ValueError( - "attn_proj cannot be set to offload_modules alone without core_attn " - "because the input of attn_proj is the output of core_attn, " - "which is needed in core_attn.backward()." - ) - if ( self.num_layers_in_first_pipeline_stage is not None or self.num_layers_in_last_pipeline_stage is not None diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index c36ff7515e4..a5babece9d0 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import logging import warnings @@ -397,16 +397,6 @@ def __init__( if "mlp" in self.config.recompute_modules: if not isinstance(self.mlp, MoELayer): self.recompute_mlp = True - self.offload_attn_norm = ( - self.config.fine_grained_activation_offloading - and "attn_norm" in self.config.offload_modules - and not isinstance(self.input_layernorm, IdentityOp) - ) - self.offload_mlp_norm = ( - self.config.fine_grained_activation_offloading - and "mlp_norm" in self.config.offload_modules - and not isinstance(self.pre_mlp_layernorm, IdentityOp) - ) # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. @@ -489,29 +479,20 @@ def _forward_attention( context (Tensor): Updated context tensor if cross-attention is used, otherwise None. """ - from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, - ) inference_context = deprecate_inference_params(inference_context, inference_params) # Residual connection. residual = hidden_states - if self.offload_attn_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="attn_norm") # Optional Input Layer norm if self.recompute_input_layernorm: self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_attn_norm): - input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( - self.input_layernorm, hidden_states - ) + input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( + self.input_layernorm, hidden_states + ) else: - with get_fine_grained_offloading_context(self.offload_attn_norm): - input_layernorm_output = self.input_layernorm(hidden_states) + input_layernorm_output = self.input_layernorm(hidden_states) # Self attention. nvtx_range_push(suffix="self_attention") @@ -545,11 +526,6 @@ def _forward_attention( ) nvtx_range_pop(suffix="self_attn_bda") - if self.offload_attn_norm: - (hidden_states,) = fine_grained_offloading_group_commit( - hidden_states, name="attn_norm", forced_released_tensors=[residual] - ) - # Residual connection. residual = hidden_states @@ -587,27 +563,17 @@ def _forward_mlp(self, hidden_states, inference_context=None): output (Tensor): Transformed hidden states of shape [s, b, h]. """ - from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, - ) - # Residual connection. residual = hidden_states - if self.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") # Optional Layer norm post the cross-attention. if self.recompute_pre_mlp_layernorm: self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_mlp_norm): - pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( - self.pre_mlp_layernorm, hidden_states - ) + pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( + self.pre_mlp_layernorm, hidden_states + ) else: - with get_fine_grained_offloading_context(self.offload_mlp_norm): - pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) nvtx_range_push(suffix="mlp") # Potentially chunk the MLP computation during prefill to minimize the peak activation size @@ -667,10 +633,6 @@ def _forward_mlp(self, hidden_states, inference_context=None): mlp_output_with_bias, residual, self.hidden_dropout ) nvtx_range_pop(suffix="mlp_bda") - if self.offload_mlp_norm: - (hidden_states,) = fine_grained_offloading_group_commit( - hidden_states, name="mlp_norm", forced_released_tensors=[residual] - ) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 8e5f343b73c..bdf915a8ae1 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1216,10 +1216,6 @@ def validate_args(args, defaults={}): "when enabling delay_wgrad_compute" ) - if args.fine_grained_activation_offloading: - assert args.transformer_impl == 'transformer_engine', \ - "Fine-grained activation offloading is only supported with transformer_engine implementation" - if args.mtp_num_layers: assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( @@ -2331,12 +2327,7 @@ def _add_training_args(parser): help='The communicator group names to use high priority streams.') group.add_argument('--use-te-activation-func', action='store_true', help='Use activation function kernel from Transformer Engine in MLP module.') - group.add_argument('--fine-grained-activation-offloading', action='store_true', - help='Enable fine-grained activation offloading.') - group.add_argument('--offload-modules', nargs='*', type=str, default=[], - help='The submodules to offload its input. Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".') - group.add_argument('--min-offloaded-tensor-size', type=int, default=1024*1024, - help='The minimum size of the tensor to be offloaded.') + return parser diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json deleted file mode 100644 index 30ea509a50b..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.0637, - "5": 9.48263, - "10": 9.04035, - "15": 8.00837, - "20": 7.88364, - "25": 7.67597, - "30": 7.63447, - "35": 7.21393, - "40": 7.55564, - "45": 7.21045, - "50": 7.05439 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 38802064.0, - "5": 394456256.0, - "10": 571185472.0, - "15": 699100416.0, - "20": 891692160.0, - "25": 748799104.0, - "30": 794511296.0, - "35": 671593792.0, - "40": 421718816.0, - "45": 517934176.0, - "50": 472902496.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 6025468416.0, - "5": 6025470464.0, - "10": 6025470464.0, - "15": 6025470464.0, - "20": 6025470464.0, - "25": 6025470464.0, - "30": 6025470464.0, - "35": 6025470464.0, - "40": 6025470464.0, - "45": 6025470464.0, - "50": 6025470464.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 45099868160.0, - "5": 49175810048.0, - "10": 49175810048.0, - "15": 49175810048.0, - "20": 49175810048.0, - "25": 49175810048.0, - "30": 49211260928.0, - "35": 49211260928.0, - "40": 49211260928.0, - "45": 49211260928.0, - "50": 49211260928.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.04508, - "5": 9.76285, - "10": 9.04997, - "15": 7.93865, - "20": 7.79984, - "25": 7.60324, - "30": 7.56633, - "35": 7.13802, - "40": 7.45784, - "45": 7.11892, - "50": 6.9559 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 52.8667, - "5": 2.06295, - "10": 1.09336, - "15": 1.10509, - "20": 1.08631, - "25": 1.08991, - "30": 1.10548, - "35": 1.10049, - "40": 1.11219, - "45": 1.09542, - "50": 1.09805 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json deleted file mode 100644 index 30ea509a50b..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.0637, - "5": 9.48263, - "10": 9.04035, - "15": 8.00837, - "20": 7.88364, - "25": 7.67597, - "30": 7.63447, - "35": 7.21393, - "40": 7.55564, - "45": 7.21045, - "50": 7.05439 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 38802064.0, - "5": 394456256.0, - "10": 571185472.0, - "15": 699100416.0, - "20": 891692160.0, - "25": 748799104.0, - "30": 794511296.0, - "35": 671593792.0, - "40": 421718816.0, - "45": 517934176.0, - "50": 472902496.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 6025468416.0, - "5": 6025470464.0, - "10": 6025470464.0, - "15": 6025470464.0, - "20": 6025470464.0, - "25": 6025470464.0, - "30": 6025470464.0, - "35": 6025470464.0, - "40": 6025470464.0, - "45": 6025470464.0, - "50": 6025470464.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 45099868160.0, - "5": 49175810048.0, - "10": 49175810048.0, - "15": 49175810048.0, - "20": 49175810048.0, - "25": 49175810048.0, - "30": 49211260928.0, - "35": 49211260928.0, - "40": 49211260928.0, - "45": 49211260928.0, - "50": 49211260928.0 - } - }, - "mtp_1 loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.04508, - "5": 9.76285, - "10": 9.04997, - "15": 7.93865, - "20": 7.79984, - "25": 7.60324, - "30": 7.56633, - "35": 7.13802, - "40": 7.45784, - "45": 7.11892, - "50": 6.9559 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 52.8667, - "5": 2.06295, - "10": 1.09336, - "15": 1.10509, - "20": 1.08631, - "25": 1.08991, - "30": 1.10548, - "35": 1.10049, - "40": 1.11219, - "45": 1.09542, - "50": 1.09805 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml deleted file mode 100644 index d9ec0456190..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml +++ /dev/null @@ -1,139 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 32 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True - NCCL_NVLS_ENABLE: 0 - PYTHONWARNINGS: ignore - NCCL_DEBUG: VERSION -MODEL_ARGS: - # Distributed args - --distributed-timeout-minutes: 60 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --expert-model-parallel-size: 4 - --context-parallel-size: 1 - --expert-tensor-parallel-size: 1 - --use-distributed-optimizer: true - # NOTE: uncomment if TE >= 2.9.0 - # --overlap-grad-reduce: true - # --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix - # Training args - --use-mcore-models: true - --sequence-parallel: true - --disable-bias-linear: true - --micro-batch-size: 4 - --global-batch-size: 32 - --train-iters: 50 - --exit-duration-in-mins: 230 - --no-check-for-nan-in-loss-and-grad: true - --no-rope-fusion: true - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native - --manual-gc: true - --manual-gc-interval: 100 - --recompute-granularity: selective - --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" - --fine-grained-activation-offloading: true - --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" - # Transformer Engine args - --transformer-impl: transformer_engine - # Data args - --seq-length: 4096 - --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json - --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt - --split: 949,50,1 - # Add network size args - --num-layers: 15 - --moe-layer-freq: ([0]*3+[1]*12) - --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6mL # Et*3|(tt|)*6mL - --hidden-size: 1024 - --ffn-hidden-size: 4096 - --num-attention-heads: 32 - --kv-channels: 128 - --max-position-embeddings: 4096 - --position-embedding-type: rope - --rotary-base: 10000 - --make-vocab-size-divisible-by: 3232 - --normalization: RMSNorm - --norm-epsilon: 1e-6 - --swiglu: true - --untie-embeddings-and-output-weights: true - --multi-latent-attention: true - # Comment out the following MTP args to disable MTP - --mtp-num-layers: 1 - --mtp-loss-scaling-factor: 0.1 - # Add regularization args - --attention-dropout: 0.0 - --hidden-dropout: 0.0 - --clip-grad: 1.0 - --weight-decay: 0.1 - --qk-layernorm: true - # Add learning rate args - --lr-warmup-fraction: .01 - --lr: 0.00015 - --min-lr: 1.0e-5 - --lr-decay-style: cosine - --adam-beta1: 0.9 - --adam-beta2: 0.95 - # Add MoE args - --num-experts: 32 - --moe-ffn-hidden-size: 1024 - --moe-shared-expert-intermediate-size: 1024 - --moe-router-load-balancing-type: seq_aux_loss - --moe-router-topk: 4 - --moe-token-dispatcher-type: alltoall - --moe-router-pre-softmax: true - --moe-grouped-gemm: true - --moe-aux-loss-coeff: 1e-4 - --moe-router-group-topk: 2 - --moe-router-num-groups: 4 - --moe-router-topk-scaling-factor: 2.0 - --moe-router-score-function: sigmoid - --moe-router-enable-expert-bias: true - --moe-router-bias-update-rate: 1e-3 - --moe-router-dtype: fp32 - --moe-permute-fusion: true - # Add MLA args - --q-lora-rank: 1536 - --kv-lora-rank: 512 - --qk-head-dim: 128 - --qk-pos-emb-head-dim: 64 - --v-head-dim: 128 - --rotary-scaling-factor: 40 - --mscale: 1.0 - --mscale-all-dim: 1.0 - # Add validation args - --eval-iters: 32 - --eval-interval: 200 - # Add checkpointing args - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 25 - # Add initialization args - --init-method-std: 0.02 - # Add logging args - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-params-norm: true - --log-validation-ppl-to-tensorboard: true - --log-throughput: true - --log-interval: 1 - --logging-level: 40 - --tensorboard-dir: ${TENSORBOARD_PATH} - # Add mixed precision args - --bf16: true - --exit-interval: 50 - --overlap-moe-expert-parallel-comm: true -TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular -METRICS: - - "iteration-time" - - "lm loss" - - "num-zeros" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" - - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json deleted file mode 100644 index 3687e19e563..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.04266, - "5": 9.38536, - "10": 8.82761, - "15": 7.86966, - "20": 7.72022, - "25": 7.53119, - "30": 7.5026, - "35": 7.10343, - "40": 7.42037, - "45": 7.07056, - "50": 6.90946 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 844114112.0, - "5": 856834688.0, - "10": 928751040.0, - "15": 952825152.0, - "20": 987111232.0, - "25": 926008384.0, - "30": 864767232.0, - "35": 855095360.0, - "40": 849505920.0, - "45": 847187584.0, - "50": 846195840.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 4419107328.0, - "5": 4419108864.0, - "10": 4419108864.0, - "15": 4419108864.0, - "20": 4419108864.0, - "25": 4419108864.0, - "30": 4419108864.0, - "35": 4419108864.0, - "40": 4419108864.0, - "45": 4419108864.0, - "50": 4419108864.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 37959917568.0, - "5": 39583289344.0, - "10": 39583289344.0, - "15": 39583289344.0, - "20": 39583289344.0, - "25": 39583289344.0, - "30": 39583289344.0, - "35": 39583289344.0, - "40": 39583289344.0, - "45": 39583289344.0, - "50": 39583289344.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 58.78709, - "5": 2.40565, - "10": 1.13046, - "15": 1.39764, - "20": 1.1273, - "25": 1.12154, - "30": 1.03587, - "35": 1.09545, - "40": 1.09901, - "45": 1.00656, - "50": 1.00794 - } - } -} diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json deleted file mode 100644 index 3687e19e563..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 11.04266, - "5": 9.38536, - "10": 8.82761, - "15": 7.86966, - "20": 7.72022, - "25": 7.53119, - "30": 7.5026, - "35": 7.10343, - "40": 7.42037, - "45": 7.07056, - "50": 6.90946 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 844114112.0, - "5": 856834688.0, - "10": 928751040.0, - "15": 952825152.0, - "20": 987111232.0, - "25": 926008384.0, - "30": 864767232.0, - "35": 855095360.0, - "40": 849505920.0, - "45": 847187584.0, - "50": 846195840.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 4419107328.0, - "5": 4419108864.0, - "10": 4419108864.0, - "15": 4419108864.0, - "20": 4419108864.0, - "25": 4419108864.0, - "30": 4419108864.0, - "35": 4419108864.0, - "40": 4419108864.0, - "45": 4419108864.0, - "50": 4419108864.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 37959917568.0, - "5": 39583289344.0, - "10": 39583289344.0, - "15": 39583289344.0, - "20": 39583289344.0, - "25": 39583289344.0, - "30": 39583289344.0, - "35": 39583289344.0, - "40": 39583289344.0, - "45": 39583289344.0, - "50": 39583289344.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 5, - "values": { - "1": 58.78709, - "5": 2.40565, - "10": 1.13046, - "15": 1.39764, - "20": 1.1273, - "25": 1.12154, - "30": 1.03587, - "35": 1.09545, - "40": 1.09901, - "45": 1.00656, - "50": 1.00794 - } - } -} diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml deleted file mode 100644 index f4b64722712..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml +++ /dev/null @@ -1,134 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True - NCCL_NVLS_ENABLE: 0 - PYTHONWARNINGS: ignore - NCCL_DEBUG: VERSION -MODEL_ARGS: - # Distributed args - --distributed-timeout-minutes: 60 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --expert-model-parallel-size: 4 - --context-parallel-size: 1 - --expert-tensor-parallel-size: 1 - --use-distributed-optimizer: true - # NOTE: uncomment if TE >= 2.9.0 - # --overlap-grad-reduce: true - # --overlap-param-gather: true - # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN - --attention-backend: unfused # TODO: switch back to fused attention after fix - # Training args - --use-mcore-models: true - --sequence-parallel: true - --disable-bias-linear: true - --micro-batch-size: 4 - --global-batch-size: 32 - --train-iters: 50 - --exit-duration-in-mins: 230 - --no-check-for-nan-in-loss-and-grad: true - --no-rope-fusion: true - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native - --manual-gc: true - --manual-gc-interval: 100 - --recompute-granularity: selective - --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" - --fine-grained-activation-offloading: true - --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" - # Transformer Engine args - --transformer-impl: transformer_engine - # Data args - --seq-length: 4096 - --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json - --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt - --split: 949,50,1 - # Add network size args - --num-layers: 15 - --moe-layer-freq: ([0]*3+[1]*12) - --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6L # Et*3|(tt|)*6L - --hidden-size: 1024 - --ffn-hidden-size: 4096 - --num-attention-heads: 32 - --kv-channels: 128 - --max-position-embeddings: 4096 - --position-embedding-type: rope - --rotary-base: 10000 - --make-vocab-size-divisible-by: 3232 - --normalization: RMSNorm - --norm-epsilon: 1e-6 - --swiglu: true - --untie-embeddings-and-output-weights: true - --multi-latent-attention: true - # Add regularization args - --attention-dropout: 0.0 - --hidden-dropout: 0.0 - --clip-grad: 1.0 - --weight-decay: 0.1 - --qk-layernorm: true - # Add learning rate args - --lr-warmup-fraction: .01 - --lr: 0.00015 - --min-lr: 1.0e-5 - --lr-decay-style: cosine - --adam-beta1: 0.9 - --adam-beta2: 0.95 - # Add MoE args - --num-experts: 32 - --moe-ffn-hidden-size: 1024 - --moe-shared-expert-intermediate-size: 1024 - --moe-router-load-balancing-type: seq_aux_loss - --moe-router-topk: 4 - --moe-token-dispatcher-type: alltoall - --moe-router-pre-softmax: true - --moe-grouped-gemm: true - --moe-aux-loss-coeff: 1e-4 - --moe-router-group-topk: 2 - --moe-router-num-groups: 4 - --moe-router-topk-scaling-factor: 2.0 - --moe-router-score-function: sigmoid - --moe-router-enable-expert-bias: true - --moe-router-bias-update-rate: 1e-3 - --moe-router-dtype: fp32 - --moe-permute-fusion: true - # Add MLA args - --q-lora-rank: 1536 - --kv-lora-rank: 512 - --qk-head-dim: 128 - --qk-pos-emb-head-dim: 64 - --v-head-dim: 128 - --rotary-scaling-factor: 40 - --mscale: 1.0 - --mscale-all-dim: 1.0 - # Add validation args - --eval-iters: 32 - --eval-interval: 200 - # Add checkpointing args - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --save-interval: 25 - # Add initialization args - --init-method-std: 0.02 - # Add logging args - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-params-norm: true - --log-validation-ppl-to-tensorboard: true - --log-throughput: true - --log-interval: 1 - --logging-level: 40 - --tensorboard-dir: ${TENSORBOARD_PATH} - # Add mixed precision args - --bf16: true - --exit-interval: 50 -TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular -METRICS: - - "iteration-time" - - "lm loss" - - "num-zeros" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 63320ae3c3d..8164ca37df8 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -124,16 +124,6 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py deleted file mode 100644 index edec95288c2..00000000000 --- a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -import gc - -import pytest -import torch - -EPSILON = 0.1 - -# Skip all tests if CUDA is not available -cuda_available = torch.cuda.is_available() - - -def _reset_cuda_memory(): - gc.collect() - if cuda_available: - torch.cuda.empty_cache() - - -class ToyModel(torch.nn.Module): - def __init__(self, hidden_size: int = 2048, num_layers: int = 4, dtype=torch.bfloat16): - super().__init__() - layers = [] - for _ in range(num_layers): - layers.append( - torch.nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device="cuda") - ) - self.net = torch.nn.Sequential(*layers).to(device="cuda", dtype=dtype) - self.hidden_size = hidden_size - self.num_layers = num_layers - self.dtype = dtype - - # Prevent weights/bias from being considered activation tensors for offload; - # ensure we only count activation tensors (inputs x) in memory accounting. - for p in self.parameters(): - try: - setattr(p, "offloading_activation", False) - except Exception: - pass - - def forward(self, x, use_offload: bool = False): - from megatron.core.pipeline_parallel import fine_grained_activation_offload as off - - if use_offload: - # Initialize a new chunk (microbatch) and enable offload context. - with off.get_fine_grained_offloading_context(True): - off.fine_grained_offloading_init_chunk_handler( - vp_stage=None, min_offloaded_tensor_size=1 - ) - for i, layer in enumerate(self.net): - # Group by module; with this linear-only model, each group corresponds to a layer. - off.fine_grained_offloading_set_last_layer(i == len(self.net) - 1) - x = off.fine_grained_offloading_group_start(x, name=f"layer_{i}") - x = layer(x) - # Commit the group; returns a tuple of tensors - (x,) = off.fine_grained_offloading_group_commit( - x, name=f"layer_{i}", forced_released_tensors=[] - ) - return x - # Baseline path (no offload hooks) - with ( - torch.autocast(device_type="cuda", dtype=self.dtype) - if self.dtype in (torch.float16, torch.bfloat16) - else torch.cuda.amp.autocast(enabled=False) - ): - for layer in self.net: - x = layer(x) - return x - - -@pytest.fixture(autouse=True) -def _monkeypatch_offload_deps(monkeypatch): - # Avoid requiring torch.distributed initialization and NVML in tests - import megatron.core.pipeline_parallel.fine_grained_activation_offload as off - - monkeypatch.setattr(off, "debug_rank", lambda *args, **kwargs: None, raising=False) - monkeypatch.setattr(off, "set_ideal_affinity_for_current_gpu", lambda: None, raising=False) - # Ensure a clean state each test - off.fine_grained_offloading_reset() - yield - off.fine_grained_offloading_reset() - - -def test_fine_grained_activation_offload_memory_reduction(): - torch.manual_seed(1234) - # Use a linear-only stack so theoretical saved memory equals sum of per-layer input x bytes. - model = ToyModel(hidden_size=2048, num_layers=8, dtype=torch.bfloat16).eval() - - # Create input - inp = torch.randn( - (2048, model.hidden_size), device="cuda", dtype=torch.bfloat16, requires_grad=True - ) - - # Warmup to stabilize allocator behavior - _reset_cuda_memory() - out = model(inp, use_offload=False) - (out.sum()).backward() - torch.cuda.synchronize() - _reset_cuda_memory() - - # Baseline memory measurement (no offload) - _reset_cuda_memory() - inp_baseline = inp.detach().clone().requires_grad_(True) - baseline_mem_before = torch.cuda.memory_allocated() / (1024**2) - out_base = model(inp_baseline, use_offload=False) - baseline_mem_after = (torch.cuda.memory_allocated() - out_base.nbytes) / (1024**2) - (out_base.sum()).backward() - torch.cuda.synchronize() - baseline_delta = baseline_mem_after - baseline_mem_before - - # Offload memory measurement - from megatron.core.pipeline_parallel import fine_grained_activation_offload as off - - off.fine_grained_offloading_reset() - _reset_cuda_memory() - inp_off = inp.detach().clone().requires_grad_(True) - offload_mem_before = torch.cuda.memory_allocated() / (1024**2) - out_off = model(inp_off, use_offload=True) - offload_mem_after = (torch.cuda.memory_allocated() - out_off.nbytes) / (1024**2) - (out_off.sum()).backward() - torch.cuda.synchronize() - offload_delta = offload_mem_after - offload_mem_before - - # Offload should reduce peak cached memory usage after forward - assert ( - offload_delta < baseline_delta - ), f"offload did not reduce memory: off={offload_delta:.2f}MiB base={baseline_delta:.2f}MiB" - - # Theoretical savings: storing per-layer input x (same shape each layer). - bytes_per_elem = inp.element_size() # 2 for bfloat16 - input_bytes = inp.numel() * bytes_per_elem - # -2 because the first and last activations are not offloaded - expected_saved_mib = (model.num_layers - 2) * (input_bytes / (1024**2)) - - # Actual savings ≈ baseline_delta - offload_delta (both exclude output tensor memory). - actual_saved_mib = baseline_delta - offload_delta - - # Allow slack for allocator jitter and extra intermediates; magnitudes should match. - rel_err = abs(actual_saved_mib - expected_saved_mib) / max(expected_saved_mib, 1e-6) - assert ( - rel_err <= EPSILON - ), f"saved mismatch: actual={actual_saved_mib:.2f}MiB expected~={expected_saved_mib:.2f}MiB (rel_err={rel_err:.2f})" - - -def test_fine_grained_activation_offload_output_and_grad_consistency(): - torch.manual_seed(2025) - hidden = 1024 - layers = 3 - - # Create identical models by resetting seed - torch.manual_seed(2025) - model_base = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() - torch.manual_seed(2025) - model_off = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() - - # Same input and target - inp = torch.randn((32, hidden), device="cuda", dtype=torch.bfloat16, requires_grad=True) - target = torch.randn_like(inp) - - # Baseline forward/backward - out_base = model_base(inp, use_offload=False) - loss_base = torch.nn.functional.mse_loss(out_base, target) - loss_base.backward() - grads_base = [ - p.grad.detach().clone() if p.grad is not None else None for p in model_base.parameters() - ] - - # Offload forward/backward - from megatron.core.pipeline_parallel import fine_grained_activation_offload as off - - off.fine_grained_offloading_reset() - out_off = model_off(inp.detach().clone().requires_grad_(True), use_offload=True) - loss_off = torch.nn.functional.mse_loss(out_off, target) - loss_off.backward() - grads_off = [ - p.grad.detach().clone() if p.grad is not None else None for p in model_off.parameters() - ] - - # Compare outputs - assert torch.allclose(out_off.float(), out_base.float(), rtol=1e-3, atol=1e-3) - - # Compare gradients parameter-wise - for gb, go in zip(grads_base, grads_off): - if gb is None and go is None: - continue - assert gb is not None and go is not None - assert torch.allclose(go.float(), gb.float(), rtol=1e-3, atol=1e-3) From d95e86a25bce1c3357755699a2e9e08d39411eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 16:16:24 +0100 Subject: [PATCH 077/248] fix: Missing logger (#1966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- megatron/core/transformer/transformer_config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index b39b7706feb..d14f991046e 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging import warnings from dataclasses import dataclass from typing import Callable, List, Literal, Optional, Tuple, Union @@ -29,6 +30,8 @@ except ImportError: HAVE_PACKAGING = False +logger = logging.getLogger(__name__) + @dataclass class TransformerConfig(ModelParallelConfig): @@ -918,7 +921,7 @@ def __post_init__(self): if self.moe_enable_deepep: if self.moe_token_dispatcher_type != "flex": raise ValueError("DeepEP backend is only supported with flex token dispatcher.") - logging.warning( + logger.warning( "moe_enable_deepep is deprecated." "Please use --moe-flex-dispatcher-backend=deepep instead." ) From 113cefb24a7d7d77b88672630b6670724b877fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 17:28:32 +0100 Subject: [PATCH 078/248] ci: Update copyright checker (#1974) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/copyright-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index 0463e1dd962..74469adf75d 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -31,7 +31,7 @@ jobs: if: | !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.9 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.11 copyright-check-summary: needs: [pre-flight, copyright-check] From d9e0806d180cdde70450cfaaff9cb7addac20b21 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 27 Oct 2025 23:20:49 -0700 Subject: [PATCH 079/248] [Dev] Update symmetric registration interface to sync-up with upstream pytorch change (#1930) Signed-off-by: Youngeun Kwon Signed-off-by: Youngeun --- megatron/core/distributed/fsdp/src/README.md | 9 +- .../megatron_fsdp/param_and_grad_buffer.py | 59 +++++++--- .../core/distributed/param_and_grad_buffer.py | 5 +- megatron/core/nccl_allocator.py | 104 ++++++++++++------ 4 files changed, 126 insertions(+), 51 deletions(-) diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index 8af58d07826..d879c6c26f8 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -35,10 +35,14 @@ Megatron-FSDP can provide up to 25% speed up and 23% memory savings compared to - **Advanced Bucketing**: Data-type aware bucketing system to minimize the overhead of collective operations - **Buffer Management**: Zero copy communication is achieved by reorganizing the storage of parameters and main grad with `ParamAndGradBuffer` class - **Communication Overlapping**: Improved communication overlap of paramter all-gather and gradient reduce-scatter -- **User-Buffer-Registration NCCL communication**: Offload NCCL collective communication to NVL/IB Sharp to reduce GPU SM usage for communication - **FP8 Mixed Precision with Transformer Engine**: Compatibility with Transformer Engine enables efficient FP8 mixed precision training - **Gradient accumulate fusion support with Transformer Engine**: Remove the explicit gradient copy to the communication buffer in backwards pass +### Advanced Collective Communication +- **SM Usage Reduction with SHARP**: FSDP's `All-Gather` (AG) and `Reduce-Scatter` (RS) collectives are designed to overlap with compute kernels. However, standard NCCL communication kernels can consume a significant number of GPU SMs (e.g., 16-32 SMs), "stealing" resources from compute (GEMM) kernels and reducing overall TFLOPS. +- **In-Switch Processing**: We leverage **SHARP** (Scalable Hierarchical Aggregation and Reduction Protocol) to offload these collective operations. SHARP performs aggregation and reduction computations directly on the network switches (InfiniBand or NVLink Switch) instead of on the GPU SMs. This dramatically reduces the SM consumption for communication to **1-6 SM** freeing up GPU resources for compute. It also provides lower communication latency, especially in large, scaled-out workloads. +- **Symmetric Optimizations for MNNVL**: We support **symmetric-based optimizations**, introduced in NCCL v2.27, which enable switch offloading for **Multi-Node NVLink (MNNVL)** systems such as GB200/GB300. This allows the same SM-saving benefits over the high-bandwidth NVLink fabric itself. +- **Hierarchical Collectives**: When an FSDP sharding domain spans both NVLink and InfiniBand, the library utilizes **hierarchical SHARP collectives** (e.g., NVL-SHARP + IB-SHARP) to optimize the communication path across the entire system topology. ## 📦 Installation @@ -207,6 +211,9 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"]) - `nccl_ub` will allocate and register the NCCL userbuffer for param and grad buffers. This option enables an SM-efficient NCCL algorithm that could improve the performance of overlapped computations. This flag will be much more effective when used together with SHARP if the FSDP communication includes both NVL and IB domains. Enabling this option will cause additional memory overhead due to the requirement to enable the `fsdp_double_buffer` option. - **Only effective when using Megatron-LM.** - Defaults to `False`. + - By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registraion. +- `disable_symmetric_registration` will disable NCCL window (i.e. symmetric) registraion when using `nccl_ub`. + - Dafaults to `False`. - `fsdp_double_buffer` will use persistently allocated double buffers for temporarily-defined memory needed in `MegatronFSDP` communications. Having persistent double buffers may increase peak VRAM utilization, but is required to register NCCL user buffers (`nccl_ub=True`) for `MegatronFSDP`. Currently, this is only supported for simple repetitive model structures such as GPT. - **Only effective when using Megatron-LM.** - Defaults to `False`. Automatically overridden to `True` when `nccl_ub` is enabled. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index a987ec2cec4..c8116150d52 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -76,13 +76,19 @@ except Exception: HAVE_TE = False +NCCL_ALLOCATOR = None + try: # Try to import the MCore NCCL nccl_allocator first. # If it fails, try to import the APEX NCCL nccl_allocator. import megatron.core.nccl_allocator as nccl_allocator + + NCCL_ALLOCATOR = "MCORE" except ImportError: try: import apex.contrib.nccl_allocator as nccl_allocator + + NCCL_ALLOCATOR = "APEX" except ImportError: nccl_allocator = None @@ -94,8 +100,8 @@ def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None: message ``s`` since otherwise, it is swallowed. """ if not cond: - print(s) - traceback.print_stack() + logger.error(s) + logger.error(''.join(traceback.format_stack())) if raise_assertion_error: raise AssertionError(s) @@ -205,7 +211,7 @@ def __exit__(self, *args): for group in self.groups[1:]: backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) if torch.distributed.get_rank() == 0: - print( + logger.info( f"[MultiGroupUBRAllocator] Registering mem pool to group {group}, " f"group.group_desc:{group.group_desc}" ) @@ -1612,7 +1618,9 @@ def __init__( # If using nccl_ub, it returns a function that registers buffers to the NCCL memory pool # Buffer is registered to data_parallel_group and expert_data_parallel_group if it exists # In the case of not using nccl_ub, it returns a nullcontext - self.mem_alloc_context = self.get_mem_alloc_context(groups=self.ubr_groups) + self.mem_alloc_context = self.get_mem_alloc_context( + groups=self.ubr_groups, symmetric=not self.ddp_config.disable_symmetric_registration + ) # Mark FP8 params. If TransformerEngine is not installed, we can skip this. meta_device_init_fp8_params = {} @@ -1640,7 +1648,7 @@ def __init__( self._log_parameter_groups() - def get_mem_alloc_context(self, groups=None): + def get_mem_alloc_context(self, groups=None, symmetric=True): """ Get the memory allocation context for the parameter and gradient buffers. """ @@ -1653,22 +1661,43 @@ def get_mem_alloc_context(self, groups=None): if groups is None: # data parallel group is a default group for user buffer registration groups = [self.dist_index.get_fsdp_group(is_expert_parallel=False)] - if len(groups) == 1: - # register buffers to the default group directly using apex memory allocator - mem_alloc_context = functools.partial( - nccl_allocator.nccl_mem, NCCL_MEMORY_POOL, group=groups[0] - ) - else: - if hasattr(nccl_allocator, "MultiGroupMemPoolAllocator"): - # Case of MCore NCCL allocator + + if NCCL_ALLOCATOR == "MCORE": + if len(groups) == 1: + # register buffers to the default group directly using nccl memory allocator mem_alloc_context = functools.partial( - nccl_allocator.MultiGroupMemPoolAllocator, NCCL_MEMORY_POOL, groups=groups + nccl_allocator.nccl_mem, + NCCL_MEMORY_POOL, + group=groups[0], + symmetric=symmetric, ) else: - # Case of APEX NCCL allocator. + mem_alloc_context = functools.partial( + nccl_allocator.MultiGroupMemPoolAllocator, + NCCL_MEMORY_POOL, + groups=groups, + symmetric=symmetric, + ) + elif NCCL_ALLOCATOR == "APEX": + if symmetric: + logging.warning( + "Symmetric registration is not supported for APEX NCCL allocator." + "falling back to non-symmetric registration. " + "Please use Megatron Core NCCL allocator for symmetric registration." + ) + + if len(groups) == 1: + # register buffers to the default group directly using nccl memory allocator + mem_alloc_context = functools.partial( + nccl_allocator.nccl_mem, NCCL_MEMORY_POOL, group=groups[0] + ) + else: + # Supports multiple groups registration for APEX NCCL allocator. mem_alloc_context = functools.partial( MultiGroupUBRAllocator, NCCL_MEMORY_POOL, groups=groups ) + else: + raise ValueError(f"Invalid NCCL allocator: {NCCL_ALLOCATOR}") return mem_alloc_context else: return nullcontext diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py index d49d77f6393..30a3c5dd8e2 100644 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -685,7 +685,10 @@ def _does_param_require_new_bucket(param): symmetric=not self.ddp_config.disable_symmetric_registration ) mem_alloc_context = functools.partial( - nccl_allocator.nccl_mem, pool, group=self.data_parallel_group + nccl_allocator.nccl_mem, + pool, + group=self.data_parallel_group, + symmetric=not self.ddp_config.disable_symmetric_registration, ) else: # If nccl_ub is False, mem_alloc_context is nullcontext. diff --git a/megatron/core/nccl_allocator.py b/megatron/core/nccl_allocator.py index a328360ba0c..b46157e9d00 100644 --- a/megatron/core/nccl_allocator.py +++ b/megatron/core/nccl_allocator.py @@ -2,6 +2,7 @@ import logging import os from contextlib import nullcontext +from functools import lru_cache import torch @@ -94,6 +95,7 @@ def _build_nccl_allocator(): _allocator = nccl_allocator.get_nccl_allocator() +@lru_cache(maxsize=None) def get_func_args(func): """ Get the argument names of a function. @@ -122,15 +124,17 @@ def create_nccl_mem_pool(symmetric=None): # symmetric: bool | None = None -> to _pool = torch.cuda.MemPool(_allocator) else: if 'symmetric' in get_func_args(torch.cuda.MemPool): + # The PyTorch version >= 2.9.0a0 and before PyTorch PR #161238, + # The symmetric knob should passed to the MemPool constructor. + # Since PyTorch PR #161238 symmetric knob is now in registration function. _pool = torch.cuda.MemPool(_allocator, symmetric=symmetric) elif 'symm_mem' in get_func_args(torch.cuda.MemPool): # This path handles argument name divergence between # nvidia pytorch and the official pytorch. _pool = torch.cuda.MemPool(_allocator, symm_mem=symmetric) else: - raise ValueError( - "symmetric setting with torch.cuda.MemPool requires " "higher PyTorch version" - ) + # This path handles the case where the symmetric knob is in the registration function. + _pool = torch.cuda.MemPool(_allocator) return _pool @@ -149,7 +153,7 @@ def init() -> None: # Disables the use of the tensor register allocator hook os.environ["TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK"] = "0" _build_nccl_allocator() - print(f"[MCORE][NCCL_ALLOCATOR] Initialized NCCL Allocator") + logging.info(f"[MCORE][NCCL_ALLOCATOR] Initialized NCCL Allocator") # Preserve the original APEX NCCL allocator interface for backward compatibility @@ -158,11 +162,12 @@ class nccl_mem: An NCCL memory allocator, which inherits APEX nccl_allocator implementation. """ - def __init__(self, pool, enabled=True, device=None, group=None): + def __init__(self, pool, enabled=True, device=None, group=None, symmetric=True): self.device = None self.group = None self.mem_context = None self.pool = pool + self.symmetric = symmetric if enabled: if device is None: @@ -185,26 +190,41 @@ def __init__(self, pool, enabled=True, device=None, group=None): def __enter__(self): self.mem_context.__enter__() if self.group is not None: - backend = self.group._get_backend(self.device) - try: - # Deregister first to avoid duplicate registration of previously - # registered memory. - backend.deregister_mem_pool(self.pool) - except RuntimeError: - desc = getattr(self.group, "group_desc", None) - print( - f"[MCORE][NCCL_ALLOCATOR] Failed to deregister mem pool from" - f"{repr(self.group)}({desc}) group!!" - ) + # If the pool is not empty, deregister the pool from the group. + if self.pool.snapshot(): + backend = self.group._get_backend(self.device) + try: + # Deregister first to avoid duplicate registration of previously + # registered memory. + backend.deregister_mem_pool(self.pool) + except RuntimeError: + desc = getattr(self.group, "group_desc", None) + logging.warning( + f"[MCORE][NCCL_ALLOCATOR] Failed to deregister mem pool from" + f"{repr(self.group)}({desc}) group!!" + ) def __exit__(self, *args): if self.group is not None: backend = self.group._get_backend(self.device) try: - backend.register_mem_pool(self.pool) + # Prefer attempting symmetric registration first; fall back if unsupported. + if self.symmetric: + try: + # Since PyTorch PR #161238 symmetric knob is now in registration function. + backend.register_mem_pool(self.pool, symm=self.symmetric) + except TypeError: + # Older PyTorch/APIs without 'symm' keyword. + logging.warning( + f"[MCORE][NCCL_ALLOCATOR] Failed in symmetric registration." + f"Falling back to non-symmetric registration!!" + ) + backend.register_mem_pool(self.pool) + else: + backend.register_mem_pool(self.pool) except RuntimeError: desc = getattr(self.group, "group_desc", None) - print( + logging.warning( f"[MCORE][NCCL_ALLOCATOR] Failed to register mem pool to" f"{repr(self.group)}({desc}) group!!" ) @@ -238,11 +258,12 @@ class MultiGroupMemPoolAllocator: """ def __init__( - self, pool, groups + self, pool, groups, symmetric=True ): # pool: torch.cuda.MemPool, groups: List[torch.distributed.ProcessGroup] self.pool = pool self.groups = groups self.mem_context = torch.cuda.use_mem_pool(self.pool) + self.symmetric = symmetric assert isinstance(self.pool, torch.cuda.MemPool), "pool must be a torch.cuda.MemPool" assert isinstance(self.groups, list), "groups must be a list" @@ -252,28 +273,43 @@ def __init__( def __enter__(self): self.mem_context.__enter__() - for group in self.groups: - backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) - try: - # Since the registration is done in mempool granularity, we need to deregister - # the tensors in the mempool and re-register the mempool including the newly created - # tensors after the context is exited. - backend.deregister_mem_pool(self.pool) - except RuntimeError: - desc = getattr(group, "group_desc", None) - print( - f"[MCORE][MultiGroupMemPoolAllocator] Failed to deregister mem pool from" - f"{repr(group)}({desc}) group!!" - ) + # If the pool is not empty, deregister the pool from all the groups. + if self.pool.snapshot(): + for group in self.groups: + backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) + try: + # Since the registration is done in mempool granularity, we need to deregister + # the tensors in the mempool and re-register the mempool including + # the newly created tensors after the context is exited. + backend.deregister_mem_pool(self.pool) + except RuntimeError: + desc = getattr(group, "group_desc", None) + logging.warning( + f"[MCORE][MultiGroupMemPoolAllocator] Failed to deregister mem pool from" + f"{repr(group)}({desc}) group!!" + ) def __exit__(self, *args): for group in self.groups: backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) try: - backend.register_mem_pool(self.pool) + # Prefer attempting symmetric registration first; fall back if unsupported. + if self.symmetric: + try: + # Since PyTorch PR #161238 symmetric knob is now in registration function. + backend.register_mem_pool(self.pool, symm=self.symmetric) + except TypeError: + # Older PyTorch/APIs without 'symm' keyword. + logging.warning( + f"[MCORE][MultiGroupMemPoolAllocator] Failed in symmetric registration." + f"Falling back to non-symmetric registration!!" + ) + backend.register_mem_pool(self.pool) + else: + backend.register_mem_pool(self.pool) except RuntimeError: desc = getattr(group, "group_desc", None) - print( + logging.warning( f"[MCORE][MultiGroupMemPoolAllocator] Failed to register mem pool to" f"{repr(group)}({desc}) group!!" ) From cc33e0056b00ee67455fadfb6710e4dbde9e1c33 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 28 Oct 2025 03:03:31 -0500 Subject: [PATCH 080/248] cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` into `dev` (#1987) Signed-off-by: Charlie Truong Co-authored-by: Jack Chang Co-authored-by: jianbinc Co-authored-by: xuwenc --- .../distributed/fsdp/mcore_fsdp_adapter.py | 133 +++- megatron/core/distributed/fsdp/src/README.md | 11 + .../fsdp/src/megatron_fsdp/fully_shard.py | 10 +- .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 11 +- .../megatron_fsdp/param_and_grad_buffer.py | 83 ++- .../fsdp/src/megatron_fsdp/uneven_dtensor.py | 4 +- .../fsdp/src/megatron_fsdp/utils.py | 130 +++- .../embeddings/yarn_rotary_pos_embedding.py | 10 +- megatron/core/optimizer/__init__.py | 23 + megatron/core/optimizer/distrib_optimizer.py | 2 + .../transformer/fsdp_dtensor_checkpoint.py | 336 ++++++++-- megatron/training/arguments.py | 4 + megatron/training/checkpointing.py | 74 ++- megatron/training/training.py | 1 + .../golden_values_dev_dgxh100_coreweave.json | 598 +++++++++--------- .../golden_values_dev_dgxh100_coreweave.json | 500 +++++++-------- .../golden_values_dev_dgx_h100.json | 143 ++++- .../golden_values_dev_dgxh100_coreweave.json | 537 ++++++++++++++++ .../model_config.yaml | 2 +- tests/test_utils/recipes/moe.yaml | 15 +- tools/checkpoint/checkpoint_inspector.py | 362 +++++++++-- 21 files changed, 2224 insertions(+), 765 deletions(-) create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index a7c0d5802ab..7432a7f9a36 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import random from typing import List, Optional try: @@ -22,6 +23,7 @@ except ImportError: HAVE_EINOPS = False +import numpy as np import torch import torch.distributed as dist @@ -32,10 +34,11 @@ except ImportError: HAVE_DTENSOR = False -from megatron.core import parallel_state +from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.distributed.data_parallel_base import _BaseDataParallel from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig +from megatron.core.extensions.transformer_engine import TELinear from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer @@ -95,6 +98,8 @@ def __init__( else: self.fsdp_unit_modules = [] + self._fix_tensor_parallel_attributes(module) + super().__init__( config=config, module=MegatronFSDP( @@ -119,6 +124,8 @@ def __init__( self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict + self.sync_rng_states_across_tp_group() + def load_state_dict(self, state_dict, strict=True): """ Load the state dictionary into the module. @@ -141,6 +148,44 @@ def load_state_dict(self, state_dict, strict=True): self.module.load_state_dict(custom_state_dict, strict=strict) + def _fix_tensor_parallel_attributes(self, module): + is_expert_param = lambda n, p: ".experts." in n + is_router_param = lambda n, p: ".router.weight" in n + + if parallel_state.get_tensor_model_parallel_group(): + tp_size = parallel_state.get_tensor_model_parallel_group().size() + else: + tp_size = 1 + + if parallel_state.get_expert_tensor_parallel_group(): + expt_tp_size = parallel_state.get_expert_tensor_parallel_group().size() + else: + expt_tp_size = 1 + + param_to_direct_module = {} + for name, m in module.named_modules(): + for p in m.parameters(recurse=False): + param_to_direct_module[p] = (name, m) + + for name, param in module.named_parameters(): + if is_expert_param(name, param) and expt_tp_size > 1: + setattr(param, "_mcore_tp", True) + if "linear_fc1.weight" in name: + setattr(param, "_tp_partition_dim", 0) + elif "linear_fc2.weight" in name: + setattr(param, "_tp_partition_dim", 1) + + if not is_expert_param(name, param) and tp_size > 1: + m_name, direct_module = param_to_direct_module[param] + if isinstance(direct_module, (TELinear,)): + parallel_mode = getattr(direct_module, "parallel_mode", None) + if parallel_mode is None: + setattr(param, "_mcore_tp", True) + setattr(param, "_tp_duplicated", True) + elif is_router_param(name, param): + setattr(param, "_mcore_tp", True) + setattr(param, "_tp_duplicated", True) + def _init_dist_index(self, pg_collection): """ Initialize the distributed index for the module. @@ -154,6 +199,7 @@ def _init_dist_index(self, pg_collection): enable_hsdp = self.ddp_config.num_distributed_optimizer_instances > 1 if pg_collection is None: tp_group = parallel_state.get_tensor_model_parallel_group() + expt_tp_group = parallel_state.get_expert_tensor_parallel_group() if enable_hsdp: dp_cp_group = parallel_state.get_data_parallel_group( with_context_parallel=True, partial_data_parallel=True @@ -168,8 +214,11 @@ def _init_dist_index(self, pg_collection): ) outer_fsdp_group = None hybrid_fsdp_group = None + expt_dp_group = parallel_state.get_expert_data_parallel_group() + ep_group = parallel_state.get_expert_model_parallel_group() else: tp_group = getattr(pg_collection, 'tp', None) + expt_tp_group = getattr(pg_collection, 'expt_tp', None) if enable_hsdp: dp_cp_group = pg_collection.intra_dp_cp outer_fsdp_group = pg_collection.inter_dist_opt @@ -178,11 +227,17 @@ def _init_dist_index(self, pg_collection): dp_cp_group = pg_collection.dp_cp outer_fsdp_group = None hybrid_fsdp_group = None + expt_dp_group = getattr(pg_collection, 'expt_dp', None) + ep_group = getattr(pg_collection, 'ep', None) if tp_group is None: single_rank_group = dist.new_group(ranks=[dist.get_rank()]) tp_group = single_rank_group + if expt_tp_group is None: + single_rank_group = dist.new_group(ranks=[dist.get_rank()]) + expt_tp_group = single_rank_group + if enable_hsdp: mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( @@ -199,6 +254,17 @@ def _init_dist_index(self, pg_collection): hybrid_fsdp_group=hybrid_fsdp_group, ) else: + if ep_group is not None: + expt_mesh = _get_dp_tp_mesh(expt_dp_group, expt_tp_group, ep_size=ep_group.size()) + expt_device_mesh = DeviceMesh.from_group( + [expt_dp_group, expt_tp_group], + device_type="cuda", + mesh=expt_mesh.tolist(), + mesh_dim_names=["dp_cp", "tp"], + ) + else: + expt_device_mesh = None + mesh = _get_dp_tp_mesh(dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( device_mesh=DeviceMesh.from_group( @@ -209,8 +275,11 @@ def _init_dist_index(self, pg_collection): ), dp_shard_dim="dp_cp", tp_dim="tp", + expt_device_mesh=expt_device_mesh, ) + self.tp_group = tp_group + return dist_index def stop_communication(self): @@ -220,6 +289,20 @@ def stop_communication(self): self.module.synchronize_gradient_reduce() self.module.synchronize_param_gather() + def sync_rng_states_across_tp_group(self): + """ + Synchronize the tensor parallel random number generator states. + """ + if self.tp_group.size() <= 1: + return + + if self.tp_group.rank() == 0: + broadcast_list = [_get_rng_state_dict()] + else: + broadcast_list = [None] + torch.distributed.broadcast_object_list(broadcast_list, group=self.tp_group, group_src=0) + _load_rng_state_dict(broadcast_list[0]) + def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." @@ -273,29 +356,46 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): return mesh -def _get_dp_tp_mesh(dp_cp_group, tp_group): +def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." world_size = dist.get_world_size() tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1 - # TODO: Supports configurable (dp, cp, tp) order. - mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size) + # TODO: Supports configurable (dp, cp, ep, tp) order. + mesh = einops.rearrange( + torch.arange(world_size), + "(dp_cp ep tp) -> ep dp_cp tp", + dp_cp=dp_cp_group.size(), + tp=tp_size, + ep=ep_size, + ) - mesh_dp_ranks = einops.rearrange(mesh, 'dp_cp tp -> tp dp_cp', tp=tp_size) + mesh_dp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (ep tp) dp_cp', dp_cp=dp_cp_group.size()) dp_cp_group_ranks = dist.get_process_group_ranks(dp_cp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_dp_ranks, dp_cp_group_ranks), ( f"[Megatron-FSDP] Data Parallel ranks in the mesh {mesh_dp_ranks} " f"do not match the ranks in the DP group {dp_cp_group_ranks}." ) - mesh_tp_ranks = einops.rearrange(mesh, 'dp_cp tp -> (dp_cp) tp', tp=tp_size) + mesh_tp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (dp_cp ep) tp', tp=tp_size) tp_group_ranks = dist.get_process_group_ranks(tp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), ( f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} " f"do not match the ranks in the TP group {tp_group_ranks}." ) - return mesh + # Exclude the expert parallel dimension + rank = dist.get_rank() + dp_tp_meshes = [per_ep_mesh for per_ep_mesh in mesh if rank in per_ep_mesh.reshape(-1).tolist()] + assert ( + len(dp_tp_meshes) == 1 + ), f"[Megatron-FSDP] Current rank {rank} is not unique in the mesh ranks {mesh.tolist()}." + assert len(dp_tp_meshes[0].reshape(-1).tolist()) == dp_cp_group.size() * tp_group.size(), ( + f"[Megatron-FSDP] DP-TP mesh size {len(dp_tp_meshes[0].reshape(-1).tolist())} " + f"does not match expected size {dp_cp_group.size() * tp_group.size()}." + ) + + return dp_tp_meshes[0] def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): @@ -310,3 +410,22 @@ def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): f"{mesh_ranks.tolist()} does not match the group ranks {group_ranks}." ) return sorted(current_ranks[0]) == sorted(group_ranks) + + +def _get_rng_state_dict(): + rng_state_dict = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'cuda_rng_state': torch.cuda.get_rng_state(), + 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(), + } + return rng_state_dict + + +def _load_rng_state_dict(rng_state_dict): + random.setstate(rng_state_dict['random_rng_state']) + np.random.set_state(rng_state_dict['np_rng_state']) + torch.set_rng_state(rng_state_dict['torch_rng_state']) + torch.cuda.set_rng_state(rng_state_dict['cuda_rng_state']) + tensor_parallel.get_cuda_rng_tracker().set_states(rng_state_dict['rng_tracker_states']) diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index d879c6c26f8..9e036f22f67 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -127,6 +127,12 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp") # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group. device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp") hsdp_group = device_mesh["hsdp"].get_group() +# Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP. +expert_device_mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", + mesh_shape=(expt_dp_shard_size, expt_tp_size), + mesh_dim_names=("dp_shard", "tp"), +) # Fully-shards your model and distributes your optimizer. model, optimizer = fully_shard( @@ -145,6 +151,8 @@ model, optimizer = fully_shard( tp_dim="tp", # Only required when using HSDP. Otherwise, set this to None. hybrid_fsdp_group=hsdp_group, + # Only required for FSDP + EP. Otherwise, set this to None. + expt_device_mesh=expt_device_mesh, # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3) zero_dp_strategy=3, outer_dp_sharding_strategy=1, @@ -192,6 +200,9 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"]) - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP. - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053). - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded coordinate system for the weight and gradient buffers. Required for HSDP. +- `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`. + - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP). + - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP. - `init_model_with_meta_device` has `MegatronFSDP` initialize your `meta`-device model in shards on every CUDA device to avoid OOM when initializing extremely large models that cannot fit on a single device. Users can initialize their model on a [`meta`-device](https://docs.pytorch.org/docs/stable/meta.html) (`with torch.device('meta'): ...`), and ``MegatronFSDP`` will further shard and initialize the model parameters layer-by-layer adhering to the customizable `module.reset_parameters` method, which prevents the entire model from being allocated in memory at any point during runtime. - Defaults to `False`. - Note that the `device` argument which installs your model on a specific device or rank will be deactivated when `init_model_with_meta_device=True`. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py index 24e86cede72..e98362a1a03 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py @@ -64,6 +64,7 @@ def fully_shard_model( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -183,8 +184,10 @@ def fully_shard_model( tp_dim=tp_dim, # Only required for HSDP. hybrid_fsdp_group=hybrid_fsdp_group, - # Access to flattened DP rank assignments for HFSDP. + # Access to flattened DP rank assignments for HSDP. hsdp_outer_dp_shard=_outer_fsdp_sharding, + # Only required for Megatron-FSDP + EP. + expt_device_mesh=expt_device_mesh, ) # Wrap model in Megatron FSDP. @@ -330,6 +333,7 @@ def fully_shard( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -391,6 +395,9 @@ def fully_shard( by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None. + expt_device_mesh (Optional[DeviceMesh]): + Expert parallel device mesh object defining the topology for MoE distributed training. + fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]): List of (sub-)module classes or (sub-)module class import paths that are "units", which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP. @@ -503,6 +510,7 @@ def fully_shard( dp_outer_dim=dp_outer_dim, tp_dim=tp_dim, hybrid_fsdp_group=hybrid_fsdp_group, + expt_device_mesh=expt_device_mesh, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=zero_dp_strategy, outer_dp_sharding_strategy=outer_dp_sharding_strategy, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 10a8ae14d65..d6ef5f6210e 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -235,7 +235,10 @@ def __init__( self.dist_index = dist_index # If Megatron Expert Parallelism is enabled, you need to provide an expt_dp_group. - if has_expert_parameters and self.dist_index.get_expert_dp_group() is None: + if ( + has_expert_parameters + and self.dist_index.get_fsdp_group(is_expert_parallel=True) is None + ): raise ValueError( "[Megatron-FSDP] Megatron Expert Parallelism is enabled, but no expt_dp_group is" "provided." @@ -353,9 +356,7 @@ def _init_fsdp_param_and_grad_buffer(self): ) # Set the suggested communication unit size for reduce-scatter and all-gather pipelines. - suggested_communication_unit_size = ( - self.ddp_config.suggested_communication_unit_size or 1_000_000_000 - ) + suggested_communication_unit_size = self.ddp_config.suggested_communication_unit_size if suggested_communication_unit_size is None: if self.data_parallel_sharding_strategy == "optim_grads_params": total_param_elements = 0 @@ -370,6 +371,8 @@ def _init_fsdp_param_and_grad_buffer(self): suggested_communication_unit_size = total_param_elements // total_fsdp_module * 2 elif self.bucket_size is not None: suggested_communication_unit_size = self.bucket_size + else: + suggested_communication_unit_size = 1_000_000_000 # Cap to 1B elements. suggested_communication_unit_size = max( diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index c8116150d52..bdf480d867b 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -34,7 +34,14 @@ from torch.distributed.tensor.device_mesh import _mesh_resources from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor -from .utils import _MODEL_PARALLEL_RNG_TRACKER_NAME, FSDPDistributedIndex, get_global_memory_buffer +from .utils import ( + _MODEL_PARALLEL_RNG_TRACKER_NAME, + FSDPDistributedIndex, + get_global_memory_buffer, + get_mcore_tensor_parallel_partition_dim, + is_mcore_tensor_model_parallel, + is_mcore_tensor_parallel_duplicated, +) logger = logging.getLogger(__name__) @@ -1299,7 +1306,7 @@ def _does_param_require_new_bucket(param): and policy.data_parallel_sharding_strategy != "no_shard" ) - is_expert_parameter = lambda p: not getattr(p, "allreduce", True) + is_expert_parameter = lambda n, p: ".experts." in n # Step 1: Group the parameters according to their execution order and attributes. # FSDP unit module parameters are split into multiple parameter sub-groups. @@ -1313,7 +1320,7 @@ def _does_param_require_new_bucket(param): if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) else param.dtype ), - is_expert_param=is_expert_parameter(param), + is_expert_param=is_expert_parameter(name, param), requires_grad=param.requires_grad, fsdp_unit_id=None, ) @@ -2257,6 +2264,10 @@ def _reset_parameters(self, old_params, new_params): self.param_to_direct_module[new_param] = self.param_to_direct_module[old_param] del self.param_to_direct_module[old_param] + for tp_attr in ["_mcore_tp", "_tp_partition_dim", "_tp_duplicated"]: + if getattr(old_param, tp_attr, None) is not None: + setattr(new_param, tp_attr, getattr(old_param, tp_attr)) + for item_id, p in enumerate(self.params): if p in param_map: new_p = param_map[p] @@ -2340,6 +2351,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param elif wbuf: @@ -2351,6 +2363,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param else: @@ -2365,6 +2378,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=False, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param @@ -2399,6 +2413,9 @@ def set_param_attribute(): "partition_dim", "partition_stride", "is_embedding_or_output_parameter", + "_mcore_tp", + "_tp_duplicated", + "_tp_partition_dim", ]: if hasattr(orig_param, attr_name): setattr(param, attr_name, getattr(orig_param, attr_name)) @@ -3546,7 +3563,9 @@ def to_local_if_dtensor(tensor): return tensor -def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_param): +def _get_fsdp_tensor_spec( + param, dist_index: FSDPDistributedIndex, is_sharded_param, is_expert_param +): """ Get the DeviceMesh for the parameter and modify the placement for Megatron-FSDP. """ @@ -3557,7 +3576,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa dtensor_mesh = getattr(dtensor_spec, "mesh", None) # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh. - megatron_fsdp_global_mesh = dist_index.get_root_mesh() + megatron_fsdp_global_mesh = dist_index.get_root_mesh(is_expert_parallel=is_expert_param) dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh) # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh: # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh @@ -3602,7 +3621,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa placements = [Shard(0), dtensor_placement] shard_order = [1, 0] - device_mesh = dist_index.get_submesh(mesh_dim_names) + device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3627,7 +3646,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa else: placements = [Shard(0)] - device_mesh = dist_index.get_submesh(mesh_dim_names) + device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3642,6 +3661,7 @@ def make_fsdp_dtensor( is_expert_param: bool = False, run_check: bool = False, update_uneven_dtensor_chunk_meta: bool = False, + force_sync_tp_duplicated_param: bool = False, ): """ Creates a distributed tensor (DTensor) from a local tensor with support for @@ -3720,38 +3740,39 @@ def make_fsdp_dtensor( orig_param = param # Handle tensor model parallel specific logic - if getattr(param, "tensor_model_parallel", False): + if is_mcore_tensor_model_parallel(param): # Ensure parameter is not already a DTensor assert not isinstance(param, DTensor), ( - "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " - "is True. Check usage." + "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " "is True." ) - # Validate M-Core TP attributes - assert hasattr( - param, "partition_dim" - ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_dim'." - assert hasattr( - param, "partition_stride" - ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_stride'." - assert ( - param.partition_stride == 1 - ), "[Megatron-FSDP] Only partition_stride=1 is currently supported for " - "tensor_model_parallel." - - tp_dim = param.partition_dim - tp_mesh = dist_index.get_submesh(dist_index.tp_dim) - - # Adjust shape for global dimension + tp_mesh = dist_index.get_submesh(dist_index.tp_dim, is_expert_parallel=is_expert_param) + global_shape = list(param.shape) if tp_mesh.mesh.numel() > 1: - global_shape = list(param.shape) - global_shape[tp_dim] *= tp_mesh.mesh.numel() + if is_mcore_tensor_parallel_duplicated(param): + placements = [Replicate()] + if force_sync_tp_duplicated_param: + if local_tensor.numel() > 0: + torch.distributed.broadcast( + local_tensor, group=tp_mesh.get_group(), group_src=0 + ) + elif run_check: + # TODO: Implement consistency check for duplicated TP parameters + pass + else: + tp_dim = get_mcore_tensor_parallel_partition_dim(param) + assert tp_dim is not None, ( + "[Megatron-FSDP] Parameter is not tensor model parallel, " + "yet tensor_model_parallel is True." + ) + placements = [Shard(tp_dim)] + global_shape[tp_dim] *= tp_mesh.mesh.numel() # Construct TP-sharded DTensor using Megatron-style placement param = DTensor.from_local( - local_tensor=param, + local_tensor=local_tensor, device_mesh=tp_mesh, - placements=[Shard(tp_dim)], + placements=placements, run_check=run_check, shape=global_shape, stride=torch.empty(global_shape).stride(), @@ -3759,7 +3780,7 @@ def make_fsdp_dtensor( # Get FSDP-configured mesh and placements from provided param device_mesh, placements = _get_fsdp_tensor_spec( - param, dist_index, is_sharded_param=is_sharded_param + param, dist_index, is_sharded_param=is_sharded_param, is_expert_param=is_expert_param ) # Reshape local tensor for sharded layouts beyond 1D diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py index 523d8fae333..490d80c0f21 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py @@ -365,7 +365,9 @@ def _assemble_full_tensor_from_uneven_chunks( # Wrap into a replicated DTensor and return return DTensor.from_local( - full_tensor, placements=[Replicate()], device_mesh=dtensor.device_mesh + full_tensor, + placements=[Replicate()] * len(dtensor.placements), + device_mesh=dtensor.device_mesh, ) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index 1dfe08b90f4..b94a332bb0d 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -675,6 +675,7 @@ def __init__( tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, hsdp_outer_dp_shard: bool = False, + expt_device_mesh: Optional[DeviceMesh] = None, ): """ Args: @@ -691,6 +692,8 @@ def __init__( in hybrid FSDP. Specifying outer sharding will lift the bucket sharding coordinate system to flattened ranks of (dp_shard, dp_outer) instead of just sharding across dp_shard ranks and replicating across dp_outer ranks. + expt_device_mesh (Optional[DeviceMesh]): The expert parallel device mesh + to use for the DistributedIndex. """ # Device mesh arguments. self.device_mesh = device_mesh @@ -701,6 +704,11 @@ def __init__( self.use_hybrid_fsdp = dp_outer_dim is not None # Helper flag to denote if we are outer-sharding in hybrid FSDP. self.hsdp_outer_dp_shard = hsdp_outer_dp_shard + self.expt_device_mesh = expt_device_mesh + + # Handling the situation where M-Core MoE EP=1 + if self.expt_device_mesh is None: + self.expt_device_mesh = device_mesh # Hybrid FSDP Process Groups # Retrieve the FSDP process group from the DeviceMesh. @@ -719,6 +727,14 @@ def __init__( # combination of the outer-FSDP and FSDP process groups. self.hybrid_fsdp_group = hybrid_fsdp_group + # Retrieve the expert parallel process groups from the DeviceMesh. + self.expt_fsdp_group = ( + self.expt_device_mesh[self.dp_shard_dim].get_group() + if self.expt_device_mesh is not None + and contains_submesh(self.expt_device_mesh, self.dp_shard_dim) + else None + ) + """ Store a persistent reference to the core device meshes that back Megatron-FSDP. This is necessary because _MeshEnv (_mesh_resources) may not persist: @@ -732,26 +748,33 @@ def __init__( FIXME(@cspades): Identify the root cause of this behavior. """ self.mesh_library = {} - # TP Mesh + + def register_submesh(device_mesh, submesh, is_expert_parallel): + """Register a submesh with identifier: (*submesh, is_expert_parallel) + in the mesh library.""" + if contains_submesh(device_mesh, submesh): + submesh_identifier = tuple(list(submesh) + [is_expert_parallel]) + self.mesh_library[submesh_identifier] = device_mesh[submesh] + + # Define common submesh patterns tp_submesh = (self.tp_dim,) - if contains_submesh(self.device_mesh, tp_submesh): - self.mesh_library[tp_submesh] = self.device_mesh[tp_submesh] - # HSDP-TP Mesh hsdp_tp_submesh = (self.dp_outer_dim, self.dp_shard_dim, self.tp_dim) - if contains_submesh(self.device_mesh, hsdp_tp_submesh): - self.mesh_library[hsdp_tp_submesh] = self.device_mesh[hsdp_tp_submesh] - # FSDP-TP Mesh fsdp_tp_submesh = (self.dp_shard_dim, self.tp_dim) - if contains_submesh(self.device_mesh, fsdp_tp_submesh): - self.mesh_library[fsdp_tp_submesh] = self.device_mesh[fsdp_tp_submesh] - # HSDP Mesh hsdp_submesh = (self.dp_outer_dim, self.dp_shard_dim) - if contains_submesh(self.device_mesh, hsdp_submesh): - self.mesh_library[hsdp_submesh] = self.device_mesh[hsdp_submesh] - # FSDP Mesh fsdp_submesh = (self.dp_shard_dim,) - if contains_submesh(self.device_mesh, fsdp_submesh): - self.mesh_library[fsdp_submesh] = self.device_mesh[fsdp_submesh] + + # Register non-EP submeshes + register_submesh(self.device_mesh, tp_submesh, False) + register_submesh(self.device_mesh, hsdp_tp_submesh, False) + register_submesh(self.device_mesh, fsdp_tp_submesh, False) + register_submesh(self.device_mesh, hsdp_submesh, False) + register_submesh(self.device_mesh, fsdp_submesh, False) + + # Register EP submeshes + if self.expt_device_mesh is not None: + register_submesh(self.expt_device_mesh, tp_submesh, True) + register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True) + register_submesh(self.expt_device_mesh, fsdp_submesh, True) # Validate FSDP arguments. if self.fsdp_group is None: @@ -776,36 +799,54 @@ def __init__( "process groups or sub-meshes." ) - def get_submesh(self, mesh_dim_names: str | Sequence[str]) -> DeviceMesh: + def get_submesh( + self, mesh_dim_names: str | Sequence[str], is_expert_parallel: bool = False + ) -> DeviceMesh: """ - Retrieve an Megatron-FSDP-registered sub-mesh by name(s). + Retrieve an Megatron-FSDP-registered submesh by name(s). """ if isinstance(mesh_dim_names, str): mesh_dim_names = (mesh_dim_names,) - # Search for the sub-mesh in the mesh library. - device_submesh = self.mesh_library.get(tuple(mesh_dim_names), None) + + # Construct submesh identifier: (*mesh_dim_names, is_expert_parallel) + submesh_identifier = tuple(list(mesh_dim_names) + [is_expert_parallel]) + + # Retrieve the submesh from the mesh library + device_submesh = self.mesh_library.get(submesh_identifier, None) + if device_submesh is None: - if self.tp_dim is None: - # Warn about not specifying tp_dim for - # layers or frameworks that depend on this. + # Warn about not specifying tp_dim for layers or frameworks that depend on this. + if self.tp_dim is None and not is_expert_parallel: logger.warning( - "[FSDPDistributedIndex] Note: For TransformerEngine, or other machine learning " - "frameworks like Megatron that assume TP=1, you must specify tp_dim to use " - "Megatron-FSDP. Create a trivial TP dimension by setting the TP dimension size " + "[FSDPDistributedIndex] Note: For TransformerEngine, or " + "other machine learning frameworks like Megatron that assume " + "TP=1, you must specify tp_dim to use Megatron-FSDP. " + "Create a trivial TP dimension by setting the TP dimension size " "to 1 in the DeviceMesh.\n" f"DeviceMesh: {self.device_mesh}" ) + elif self.tp_dim is None and is_expert_parallel: + logger.warning( + "[FSDPDistributedIndex] Note: For TransformerEngine, or " + "other machine learning frameworks like Megatron that assume " + "ETP=1, you must specify tp_dim to use Megatron-FSDP. " + "Create a trivial ETP dimension by setting the ETP dimension size " + "to 1 in the DeviceMesh.\n" + f"DeviceMesh: {self.expt_device_mesh}" + ) + raise ValueError( - f"[FSDPDistributedIndex][get_submesh] No sub-mesh with " - f"mesh_dim_names={mesh_dim_names} has been registered with Megatron-FSDP." + f"[FSDPDistributedIndex][get_submesh] No submesh with " + f"mesh_dim_names={mesh_dim_names}, is_expert_parallel={is_expert_parallel} " + f"has been registered with Megatron-FSDP." ) + return device_submesh def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the data parallel process group.""" if is_expert_parallel: - # Expert parallel is not supported - return None + return self.expt_fsdp_group if self.use_hybrid_fsdp: return self.hybrid_fsdp_group return self.fsdp_group @@ -813,8 +854,7 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: def get_fsdp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the FSDP process group.""" if is_expert_parallel: - # Expert parallel is not supported - return None + return self.expt_fsdp_group return self.fsdp_group def get_outer_fsdp_group(self) -> ProcessGroup: @@ -826,7 +866,7 @@ def get_outer_fsdp_group(self) -> ProcessGroup: def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: """Get the device mesh.""" if is_expert_parallel: - raise NotImplementedError("Expert parallel is not supported in Megatron-FSDP.") + return self.expt_device_mesh return self.device_mesh def get_logical_hybrid_fsdp_rank(self): @@ -924,3 +964,29 @@ def create_updated_function_signature(original_function, **extended_kwargs: dict # Return the updated function signature. return inspect.Signature(params) + + +def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool: + """ + Check if the given parameter is Megatron-Core tensor model parallel. + """ + return getattr(param, "_mcore_tp", False) or getattr(param, "tensor_model_parallel", False) + + +def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool: + """ + Check if the given parameter is Megatron-Core tensor model parallel and duplicated. + """ + return getattr(param, "_tp_duplicated", False) + + +def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Optional[int]: + """ + Get the partition dimension for a Megatron-Core tensor model parallel parameter. + """ + if is_mcore_tensor_model_parallel(param): + if hasattr(param, "_tp_partition_dim"): + return param._tp_partition_dim + else: + return param.partition_dim + return None diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index 507472f789f..455a7757d28 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -130,9 +130,9 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) - self.original_max_position_embeddings, self.correction_range_round_to_int, ) - inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( - device=self.inv_freq_extra.device, dtype=torch.float32 - ) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask( + low, high, self.dim // 2, device=self.inv_freq_extra.device + ).to(dtype=torch.float32) inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask seq = ( @@ -211,11 +211,11 @@ def _yarn_find_correction_range( return max(low, 0), min(high, dim - 1) # Clamp values just in case -def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: +def _yarn_linear_ramp_mask(min: float, max: float, dim: int, device: torch.device) -> Tensor: if min == max: max += 0.001 # Prevent singularity - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + linear_func = (torch.arange(dim, dtype=torch.float32, device=device) - min) / (max - min) ramp_func = torch.clamp(linear_func, 0, 1) return ramp_func diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 307538fad22..c254b2f6882 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -34,6 +34,7 @@ from megatron.core import parallel_state from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.fsdp_dtensor_checkpoint import get_global_unique_param_name from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer from ..transformer.module import MegatronModule @@ -481,6 +482,7 @@ def get_megatron_optimizer( use_gloo_process_groups: bool = True, default_skip_embedding_weight_decay: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, + dump_param_to_param_group_map: Optional[str] = None, ) -> MegatronOptimizer: """Retrieve the Megatron optimizer for model chunks. @@ -502,6 +504,7 @@ def get_megatron_optimizer( This is useful if you do not want embeddings to shrink to zero in training as recommended in https://arxiv.org/abs/2312.16903 pg_collection: Optional unified process group for distributed training. + dump_param_to_param_group_map (Optional[str]): path to dump parameter to param group map. Returns: Instance of MegatronOptimizer. @@ -579,6 +582,9 @@ def get_megatron_optimizer( return ChainedOptimizer(optimizers) + if dump_param_to_param_group_map is not None: + param_to_param_group = {} + param_group_id = 0 for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip( all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags ): @@ -597,6 +603,12 @@ def get_megatron_optimizer( model_chunk.overlap_param_gather_with_optimizer_step = ( overlap_param_gather_with_optimizer_step ) + if dump_param_to_param_group_map is not None: + for param_group in param_groups: + for param in param_group["params"]: + param_name = get_global_unique_param_name(model_chunks, param) + param_to_param_group[param_name] = param_group_id + param_group_id += 1 # Pass Gloo process groups into optimizer only if needed. optimizers.append( @@ -626,6 +638,12 @@ def get_megatron_optimizer( buffer_name='expert_parallel_buffers', default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) + if dump_param_to_param_group_map is not None: + for param_group in moe_param_groups: + for param in param_group["params"]: + param_name = get_global_unique_param_name(model_chunks, param) + param_to_param_group[param_name] = param_group_id + param_group_id += 1 if len(moe_param_groups) > 0: expt_model_parallel_rank = get_pg_rank(expt_tp_pp_group) # Pass Gloo process groups into optimizer only if needed. @@ -648,4 +666,9 @@ def get_megatron_optimizer( ) ) + if dump_param_to_param_group_map is not None: + torch.distributed.checkpoint.save( + state_dict=param_to_param_group, checkpoint_id=dump_param_to_param_group_map + ) + return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 2925edcce60..8b4740516e2 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -47,6 +47,7 @@ from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard +from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys @@ -1152,6 +1153,7 @@ def _param_name(self, param: torch.nn.Parameter) -> str: "Ensure that each model chunk has unique parameter names." ) name_to_param.update(_name_to_param) + name_to_param = handle_experts_in_state_dict(name_to_param) self.param_to_name = {param: name for name, param in name_to_param.items()} assert ( param in self.param_to_name diff --git a/megatron/core/transformer/fsdp_dtensor_checkpoint.py b/megatron/core/transformer/fsdp_dtensor_checkpoint.py index dad1947a183..9ef3f1f1b82 100644 --- a/megatron/core/transformer/fsdp_dtensor_checkpoint.py +++ b/megatron/core/transformer/fsdp_dtensor_checkpoint.py @@ -12,18 +12,160 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +import re + import torch +import torch.distributed as dist +from torch.distributed.checkpoint import default_planner + +logger = logging.getLogger(__name__) try: + from torch.distributed import DeviceMesh + from torch.distributed._tensor import DTensor + from torch.distributed.checkpoint.metadata import TensorStorageMetadata + from torch.distributed.tensor.placement_types import Replicate, Shard + from megatron.core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer import ( make_fsdp_dtensor, ) + from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import ( + gather_uneven_dtensor_to_full_tensor, + ) + from megatron.core.distributed.fsdp.src.megatron_fsdp.utils import ( + get_mcore_tensor_parallel_partition_dim, + is_mcore_tensor_model_parallel, + ) HAVE_MEGATRON_FSDP = True except ImportError: HAVE_MEGATRON_FSDP = False +from megatron.core import parallel_state from megatron.core.tensor_parallel.layers import copy_tensor_model_parallel_attributes +from megatron.core.transformer.transformer_layer import TransformerLayer + + +def get_ep_layer_offset(): + """ + Get the expert layer offset for the current model. + """ + from megatron.training.global_vars import get_args + + args = get_args() + ep_size = parallel_state.get_expert_model_parallel_world_size() + ep_rank = parallel_state.get_expert_model_parallel_rank() + num_local_experts = args.num_experts // ep_size if args.num_experts else 0 + local_expert_offset = ep_rank * num_local_experts + + return local_expert_offset + + +def get_total_num_experts(): + """ + Get the total number of experts for the current model. + """ + from megatron.training.global_vars import get_args + + args = get_args() + return args.num_experts if args.num_experts else 0 + + +def get_expert_index_from_key(key): + """Extract expert index from various expert key formats. + + Supported formats: + - GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + - SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', + 'mlp.experts.local_experts.0.linear_fc2.weight' + + Returns: + int: Expert index if found, None otherwise. + """ + # GroupedMLP: index is at the end after 'weight' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + m = re.search(r'^.*\.mlp\.experts\.linear_fc\d\.weight(\d+)', key) + assert m, f"Failed to parse expert index from key: {key}" + return int(m.group(1)) + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + m = re.search(r'^.*\.mlp\.experts\.local_experts\.(\d+)', key) + assert m, f"Failed to parse expert index from key: {key}" + return int(m.group(1)) + return None + + +def handle_experts_in_state_dict(state_dict): + """ + Rewrite expert keys in state dict. + """ + local_expert_start = get_ep_layer_offset() + local_expert_end = get_total_num_experts() + + def should_keep_expert_key(expert_index): + """Determine if this rank should keep this expert key based on expert index""" + if expert_index is None: + # If we can't determine expert index, keep the key (non-expert weights) + return True + + # Check if this expert belongs to this rank + return local_expert_start <= expert_index < local_expert_end + + def replace_expert_index_in_key(key, expert_index, state_dict): + """Replace expert index in key with new index corresponding to the current rank""" + new_expert_index = expert_index + local_expert_start + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + # Handle SwiGLU weight{idx}_w and weight{idx}_v format + if key.endswith('_w') or key.endswith('_v'): + suffix = key[-2:] # '_w' or '_v' + new_key = key.replace( + f'weight{expert_index}{suffix}', f'weight{new_expert_index}{suffix}' + ) + # Handle regular weight{idx} format + else: + new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + new_key = key.replace( + f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' + ) + else: + raise ValueError(f"Unexpected expert key format: {key}") + + state_dict[new_key] = state_dict[key] + del state_dict[key] + + # Process model state dict + state_dict = state_dict.copy() + for key in list(state_dict.keys()): + expert_index = get_expert_index_from_key(key) + if not should_keep_expert_key(expert_index): + replace_expert_index_in_key(key, expert_index, state_dict) + + return state_dict + + +def expert_param_local_key(key): + """Get the module parameter corresponding to the key.""" + local_expert_offset = get_ep_layer_offset() + expert_index = get_expert_index_from_key(key) + if expert_index is not None: + new_expert_index = expert_index - local_expert_offset + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + new_key = key.replace( + f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' + ) + else: + raise ValueError(f"Unexpected expert key format: {key}") + key = new_key + + return key def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state_dict): @@ -43,7 +185,29 @@ def intersection(s1, s2): def offset_slice(s, offset): return slice(s.start + offset, s.stop + offset) - def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): + def is_swiglu_key(key): + """ + Check if this key should be handled as SwiGLU linear_fc1 weight or bias. + """ + # Non-expert MLP: 'mlp.linear_fc1.weight', 'mlp.linear_fc1.bias' + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc1.bias0' + # SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', + # 'mlp.experts.local_experts.0.linear_fc1.bias' + return any( + re.search(pat, key) + for pat in [ + r"(.*)\.mlp\.linear_fc1\.weight$", + r"(.*)\.mlp\.linear_fc1\.bias$", + r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)$", + r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)$", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight$", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias$", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight$", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias$", + ] + ) + + def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param): """ Split the SWiGLU linear_fc1 parameter into two parts: weight_w and weight_v. """ @@ -55,7 +219,9 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): fsdp_slice = dist_param.megatron_fsdp_slice megatron_fsdp_dist_index = dist_param.megatron_fsdp_dist_index - tp_mesh = megatron_fsdp_dist_index.get_submesh([megatron_fsdp_dist_index.tp_dim]) + tp_mesh = megatron_fsdp_dist_index.get_submesh( + [megatron_fsdp_dist_index.tp_dim], is_expert_parallel=is_expert_param + ) data_size = data.numel() // tp_mesh.mesh.numel() w_slice = slice(0, data_size // 2) v_slice = slice(data_size // 2, data_size) @@ -75,8 +241,9 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): # Fake parameters w and v are used to provide the correct parameter # shape and Tensor-Parallelism information. per_tp_rank_shape = list(data.shape) - if getattr(dist_param, "tensor_model_parallel", False): - tp_dim = dist_param.partition_dim + if is_mcore_tensor_model_parallel(dist_param): + tp_dim = get_mcore_tensor_parallel_partition_dim(dist_param) + assert tp_dim is not None, "Tensor model parallel dimension not found" per_tp_rank_shape[tp_dim] //= tp_mesh.mesh.numel() linear_fc1_meta = torch.empty(*per_tp_rank_shape, device="meta") w_meta, v_meta = torch.chunk(linear_fc1_meta, 2, dim=swiglu_shard_axis) @@ -87,6 +254,7 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): weight_w.data, w_meta, dist_index=megatron_fsdp_dist_index, + is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) @@ -94,16 +262,21 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): weight_v.data, v_meta, dist_index=megatron_fsdp_dist_index, + is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) return weight_w, weight_v + model_state_dict = model_state_dict.copy() for key in list(model_state_dict.keys()): - if key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias'): + if is_swiglu_key(key): dist_param = model.get_parameter(f"module.{key}") weight_w, weight_v = split_swiglu_linear_fc1( - model_state_dict[key], dist_param, swiglu_shard_axis=0 + model_state_dict[key], + dist_param, + swiglu_shard_axis=0, + is_expert_param='mlp.experts' in key, ) # Update the model state dict with the new keys @@ -111,26 +284,32 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): model_state_dict[f"{key}_v"] = weight_v del model_state_dict[key] - try: - optimizer_state_dict = optimizer_state_dict["state"] - except KeyError: - optimizer_state_dict = {} + if optimizer_state_dict is not None: + optimizer_state_dict = optimizer_state_dict.copy() + if len(optimizer_state_dict["state"]) != 0: + opt_state_dict = optimizer_state_dict["state"] + new_opt_state_dict = {} + for key in list(opt_state_dict.keys()): + # Only process SWIGLU keys + if not is_swiglu_key(key): + new_opt_state_dict[key] = opt_state_dict[key] + continue + new_opt_state_dict[f"{key}_w"] = opt_state_dict[key].copy() + new_opt_state_dict[f"{key}_v"] = opt_state_dict[key].copy() + for subkey in ["exp_avg", "exp_avg_sq"]: + dist_param = model.get_parameter(expert_param_local_key(key[len("module.") :])) + weight_w, weight_v = split_swiglu_linear_fc1( + opt_state_dict[key][subkey], + dist_param, + swiglu_shard_axis=0, + is_expert_param="mlp.experts" in key, + ) + # Update the optimizer state dict with the new keys + new_opt_state_dict[f"{key}_w"][subkey] = weight_w + new_opt_state_dict[f"{key}_v"][subkey] = weight_v + optimizer_state_dict["state"] = new_opt_state_dict - if len(optimizer_state_dict) != 0: - for key in list(optimizer_state_dict.keys()): - if not (key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias')): - continue - optimizer_state_dict[f"{key}_w"] = optimizer_state_dict[key].copy() - optimizer_state_dict[f"{key}_v"] = optimizer_state_dict[key].copy() - for subkey in ["exp_avg", "exp_avg_sq"]: - dist_param = model.get_parameter(key[len("module.") :]) - weight_w, weight_v = split_swiglu_linear_fc1( - optimizer_state_dict[key][subkey], dist_param, swiglu_shard_axis=0 - ) - # Update the optimizer state dict with the new keys - optimizer_state_dict[f"{key}_w"][subkey] = weight_w - optimizer_state_dict[f"{key}_v"][subkey] = weight_v - del optimizer_state_dict[key] + return model_state_dict, optimizer_state_dict def handle_fp8_extra_state_case(model_state_dict): @@ -162,7 +341,7 @@ def flatten_state_dict(obj, parent_key="", sep="."): return items -def print_diff_in_state_dicts(state_dict_metadata, load_state_dict): +def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limit=100): """ Print the differences between two state dicts: metadata state dict and load state dict. This function compares the keys and shapes of the tensors in both dicts. @@ -172,24 +351,105 @@ def print_diff_in_state_dicts(state_dict_metadata, load_state_dict): meta_keys = set(state_dict_metadata.keys()) load_keys = set(load_state_dict.keys()) - only_in_meta = meta_keys - load_keys - only_in_load = load_keys - meta_keys - in_both = meta_keys & load_keys + only_in_meta = list(meta_keys - load_keys) + only_in_load = list(load_keys - meta_keys) + in_both = list(meta_keys & load_keys) - print("Keys only in checkpoint metadata_state_dict:") - for k in sorted(only_in_meta): - print(f" {k}") + logger.info(f"Keys only in checkpoint metadata_state_dict(first {limit}):") + for k in sorted(only_in_meta[:limit]): + logger.info(f" {k}") - print("\nKeys only in load_state_dict:") - for k in sorted(only_in_load): - print(f" {k}") + logger.info(f"\nKeys only in load_state_dict(first {limit}):") + for k in sorted(only_in_load[:limit]): + logger.info(f" {k}") - print("\nKeys in both but with different shapes:") - for k in sorted(in_both): + logger.info(f"\nKeys in both but with different shapes(first {limit}):") + for k in sorted(in_both[:limit]): v_meta = state_dict_metadata[k] v_load = load_state_dict[k] # If tensors, compare shape; else, compare type/values meta_shape = v_meta.size if hasattr(v_meta, "size") else type(v_meta) load_shape = v_load.shape if hasattr(v_load, "shape") else type(v_load) if meta_shape != load_shape: - print(f" {k}: meta shape={meta_shape}, load shape={load_shape}") + logger.info(f" {k}: meta shape={meta_shape}, load shape={load_shape}") + + +def validate_loaded_state_dict(state_dict, checkpoint_path): + """ + Validate the loaded state dict against the expected structure and types. + """ + assert HAVE_MEGATRON_FSDP, "This function requires Megatron-FSDP to be installed." + + # Initialize reader + reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_path) + metadata = reader.read_metadata() + flat_state_dict = flatten_state_dict(state_dict) + + for key, value in flat_state_dict.items(): + tensor_metadata = metadata.state_dict_metadata[key] + + if not isinstance(tensor_metadata, TensorStorageMetadata): + continue + if not isinstance(value, DTensor): + load_item_dict = {key: torch.empty_like(value)} + else: + load_item_dict = { + key: torch.distributed.tensor.empty( + tensor_metadata.size, + dtype=tensor_metadata.properties.dtype, + device_mesh=DeviceMesh.from_group( + group=dist.group.WORLD, + device_type="cuda", + mesh=torch.arange(dist.get_world_size()), + mesh_dim_names=("world",), + ), + placements=[Shard(0)], + ) + } + torch.distributed.checkpoint.load( + load_item_dict, storage_reader=reader, planner=default_planner.DefaultLoadPlanner() + ) + if isinstance(value, DTensor): + full_value = gather_uneven_dtensor_to_full_tensor(value) + loaded_tensor = load_item_dict[key].redistribute( + placements=[Replicate()] * len(value.placements) + ) + assert torch.allclose( + loaded_tensor._local_tensor, full_value._local_tensor, atol=1e-8, rtol=1e-5 + ), f"key: {key}; {loaded_tensor} {full_value}" + else: + assert torch.allclose( + value, load_item_dict[key] + ), f"key: {key}; {value} {load_item_dict[key]}" + + +def get_global_unique_param_name(model_chunks, param): + """ + Get the global unique parameter name for a given model and parameter. + """ + param_name = None + for model in model_chunks: + for name, p in model.named_parameters(): + if p is param: + param_name = name + break + if param_name is None: + raise ValueError("Parameter not found in model chunks") + + # Get PP unique parameter name + if re.search(r"layers\.(\d+)", param_name) and "mtp" not in param_name: + tf_layer_number = -1 + for module in model.modules(): + if not isinstance(module, TransformerLayer): + continue + for p in module.parameters(): + if p is param: + tf_layer_number = module.layer_number + break + if tf_layer_number != -1: + param_name = re.sub(r"layers\.(\d+)", f"layers.{tf_layer_number - 1}", param_name) + + # Get EP unique parameter name + param_name = list(handle_experts_in_state_dict({param_name: None}).keys())[0] + + return param_name diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index bdf915a8ae1..1d29aff0827 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2267,6 +2267,10 @@ def _add_training_args(parser): help="Use torch.optim.Optimizer instead of Megatron's optimizer in optimizer cpu offload mode.") group.add_argument('--overlap-cpu-optimizer-d2h-h2d', action='store_true', default=False, help='Overlap CPU optimizer step, gradients D2H and updated parameters H2D.') + group.add_argument('--dump-param-to-param-group-map', type=str, default=None, + help="Path to a file containing parameter-to-parameter-group mapping. " + "Provide a JSON file that specifies which parameters belong to which " + "parameter group for global coordination.") group.add_argument('--no-pin-cpu-grads', action='store_false', dest='pin_cpu_grads', help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 71b9cd97021..93c23255f4c 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -42,9 +42,10 @@ try: from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import preprocess_state_dict_for_uneven_dtensor from megatron.core.transformer.fsdp_dtensor_checkpoint import ( + print_diff_in_state_dicts, handle_fp8_extra_state_case, handle_swiglu_in_state_dict, - print_diff_in_state_dicts, + handle_experts_in_state_dict, ) HAVE_MEGATRON_FSDP = True except ImportError: @@ -561,6 +562,9 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati # TODO Handle non-empty directories (e.g., after a crash during saving). ensure_directory_exists(checkpoint_name, check_parent=False) + if ckpt_format == "fsdp_dtensor": + state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) + fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter(checkpoint_name) torch.distributed.checkpoint.save( state_dict=state_dict, @@ -784,9 +788,17 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path) torch.save(dataloader_save_dict, data_state_save_path) -def generate_state_dict(args, model, optimizer, opt_param_scheduler, - rng_state, iteration=None, - optim_sd_kwargs=None, model_sd_kwargs=None, rerun_state=None): +def generate_state_dict( + args, + model, + optimizer, + opt_param_scheduler, + rng_state, + iteration=None, + optim_sd_kwargs=None, + model_sd_kwargs=None, + rerun_state=None, +): """Generate a state dict from given model, optimizer, scheduler, rng state and others. """ # Arguments, iteration, and model. @@ -839,16 +851,27 @@ def generate_state_dict(args, model, optimizer, opt_param_scheduler, if not args.no_save_rng and rng_state: state_dict["rng_state"] = rng_state - # fsdp_dtensor ckpt specific state dict preprocessing - if args.ckpt_format == "fsdp_dtensor": - assert HAVE_MEGATRON_FSDP, "Megatron FSDP is enabled but Megatron-FSDP is not available." - assert len(model) == 1, "FSDP DTensor checkpoints are not supported for multiple models." - if args.swiglu: - state_dict = state_dict.copy() - handle_swiglu_in_state_dict( - model[0], state_dict["model"], state_dict["optimizer"]) - handle_fp8_extra_state_case(state_dict["model"]) - preprocess_state_dict_for_uneven_dtensor(state_dict) + return state_dict + + +def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model): + state_dict = raw_state_dict.copy() + handle_fp8_extra_state_case(state_dict["model"]) + if args.swiglu: + if "optimizer" in state_dict: + model_state_dict, optimizer_state_dict = handle_swiglu_in_state_dict( + model, state_dict["model"], state_dict["optimizer"] + ) + state_dict["model"] = model_state_dict + state_dict["optimizer"] = optimizer_state_dict + else: + model_state_dict, _ = handle_swiglu_in_state_dict( + model, state_dict["model"], None + ) + state_dict["model"] = model_state_dict + if args.num_experts: + state_dict["model"] = handle_experts_in_state_dict(state_dict["model"]) + preprocess_state_dict_for_uneven_dtensor(state_dict) return state_dict @@ -1169,6 +1192,12 @@ def _load_base_checkpoint( if rank0: return {}, checkpoint_name, release, CheckpointType.FSDP_DTENSOR + state_dict = sharded_state_dict + raw_optimizer_state_dict = state_dict["optimizer"].copy() if "optimizer" in state_dict else None + raw_model_state_dict = state_dict["model"].copy() if "model" in state_dict else None + model = state_dict.pop("_model") + state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) + ckpt_type = CheckpointType.FSDP_DTENSOR fs_storage_reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_name) allow_partial_load = not getattr(args, 'strict_fsdp_dtensor_load', False) @@ -1177,15 +1206,20 @@ def _load_base_checkpoint( rank = torch.distributed.get_rank() import time as _time _time.sleep(rank * 0.001) # Make that logs of different ranks do not overlap - print_diff_in_state_dicts(state_dict_metadata, sharded_state_dict) + print_diff_in_state_dicts(state_dict_metadata, state_dict) planner = default_planner.DefaultLoadPlanner(allow_partial_load=allow_partial_load) torch.distributed.checkpoint.load_state_dict( - state_dict=sharded_state_dict, + state_dict=state_dict, storage_reader=fs_storage_reader, planner=planner, ) - state_dict = sharded_state_dict + + if raw_optimizer_state_dict is not None: + state_dict["optimizer"] = raw_optimizer_state_dict + + if raw_model_state_dict is not None: + state_dict["model"] = raw_model_state_dict else: raise NotImplementedError(f"checkpoint format {ckpt_format} not supported") @@ -1520,7 +1554,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', except FileNotFoundError: state_dict_metadata = {} - gen_sd_rerun_state = None + gen_sd_rerun_state = {} gen_sd_opt_param_scheduler = None gen_sd_rng_state = None gen_sd_optim = None @@ -1537,7 +1571,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', optim_sd_kwargs = dict(metadata=_build_sharded_state_dict_metadata(args), is_loading=True) - load_kwargs["sharded_state_dict"] = generate_state_dict( + state_dict = generate_state_dict( args, model=model, optimizer=gen_sd_optim, @@ -1547,6 +1581,8 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', rerun_state=gen_sd_rerun_state, iteration=1, ) + state_dict["_model"] = model + load_kwargs["sharded_state_dict"] = state_dict state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( load_dir, args, rank0=False, checkpointing_context=checkpointing_context, diff --git a/megatron/training/training.py b/megatron/training/training.py index f805dab0f15..bda9e42dc82 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1210,6 +1210,7 @@ def setup_model_and_optimizer( # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, + dump_param_to_param_group_map=args.dump_param_to_param_group_map, ) else: optimizer = get_megatron_muon_optimizer( diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json index 0f2637a9511..717ae3f5fa6 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04748, - "2": 11.03561, - "3": 9.58774, - "4": 9.25819, - "5": 9.53583, - "6": 9.8804, - "7": 9.48247, - "8": 8.93575, - "9": 8.65813, - "10": 9.0567, - "11": 8.49445, - "12": 8.52444, - "13": 8.45239, - "14": 7.97323, - "15": 8.0476, - "16": 8.07971, - "17": 8.09081, - "18": 7.76437, - "19": 8.14892, - "20": 7.89868, - "21": 7.59371, - "22": 7.54743, - "23": 7.43222, - "24": 7.4302, - "25": 7.67579, - "26": 7.06929, - "27": 7.62041, - "28": 7.32495, - "29": 7.49042, - "30": 7.64391, - "31": 7.39435, - "32": 7.58789, - "33": 7.64037, - "34": 7.69778, - "35": 7.20998, - "36": 7.08538, - "37": 7.42584, - "38": 7.18804, - "39": 7.55054, - "40": 7.54446, - "41": 7.49287, - "42": 7.24937, - "43": 7.23587, - "44": 7.41595, - "45": 7.18755, - "46": 6.89949, - "47": 7.29966, - "48": 7.14134, - "49": 7.58963, - "50": 7.03602 + "1": 11.04722, + "2": 11.03572, + "3": 9.58802, + "4": 9.25807, + "5": 9.46595, + "6": 9.99646, + "7": 9.50952, + "8": 8.97596, + "9": 8.64768, + "10": 9.40103, + "11": 8.86556, + "12": 8.63563, + "13": 8.52125, + "14": 8.08824, + "15": 8.1958, + "16": 8.22112, + "17": 8.14098, + "18": 7.8386, + "19": 8.23438, + "20": 7.95361, + "21": 7.62549, + "22": 7.60352, + "23": 7.47957, + "24": 7.46573, + "25": 7.70343, + "26": 7.10719, + "27": 7.64313, + "28": 7.34582, + "29": 7.5169, + "30": 7.67511, + "31": 7.41799, + "32": 7.61213, + "33": 7.66582, + "34": 7.73101, + "35": 7.23081, + "36": 7.10765, + "37": 7.4476, + "38": 7.21053, + "39": 7.57508, + "40": 7.5662, + "41": 7.51605, + "42": 7.27243, + "43": 7.25706, + "44": 7.44, + "45": 7.21244, + "46": 6.92421, + "47": 7.32604, + "48": 7.17147, + "49": 7.62154, + "50": 7.0624 } }, "num-zeros": { @@ -62,55 +62,55 @@ "step_interval": 1, "values": { "1": 38802612.0, - "2": 38543592.0, - "3": 38739528.0, - "4": 279937824.0, - "5": 259189728.0, - "6": 271446400.0, - "7": 604773504.0, - "8": 768892544.0, - "9": 645824128.0, - "10": 744257088.0, - "11": 718888576.0, - "12": 746732544.0, - "13": 871990976.0, - "14": 821645632.0, - "15": 724250816.0, - "16": 932241472.0, - "17": 648958912.0, - "18": 649120000.0, - "19": 925992960.0, - "20": 989207936.0, - "21": 819324096.0, - "22": 736955072.0, - "23": 910497792.0, - "24": 876716672.0, - "25": 843170688.0, - "26": 809573824.0, - "27": 854086912.0, - "28": 802857664.0, - "29": 805523328.0, - "30": 775645184.0, - "31": 771754624.0, - "32": 749733696.0, - "33": 718385216.0, - "34": 724771200.0, - "35": 737655104.0, - "36": 690419968.0, - "37": 673203456.0, - "38": 627239552.0, - "39": 614047168.0, - "40": 607288512.0, - "41": 582590592.0, - "42": 548211200.0, - "43": 532740640.0, - "44": 554239168.0, - "45": 514790528.0, - "46": 350258560.0, - "47": 472420128.0, - "48": 453788736.0, - "49": 440597216.0, - "50": 303063296.0 + "2": 38543656.0, + "3": 38739356.0, + "4": 273649600.0, + "5": 252887040.0, + "6": 255692384.0, + "7": 598483264.0, + "8": 787737984.0, + "9": 696133120.0, + "10": 505146368.0, + "11": 718888640.0, + "12": 872597184.0, + "13": 947495104.0, + "14": 1076398976.0, + "15": 856390592.0, + "16": 1048635648.0, + "17": 831370688.0, + "18": 963679552.0, + "19": 970018240.0, + "20": 935737344.0, + "21": 904189312.0, + "22": 887937280.0, + "23": 894777856.0, + "24": 703744192.0, + "25": 909232512.0, + "26": 875633216.0, + "27": 894981376.0, + "28": 919242816.0, + "29": 931351552.0, + "30": 929784768.0, + "31": 941621376.0, + "32": 885000768.0, + "33": 828484096.0, + "34": 822284800.0, + "35": 832032128.0, + "36": 787939392.0, + "37": 770719808.0, + "38": 561204672.0, + "39": 617201536.0, + "40": 695374592.0, + "41": 698978816.0, + "42": 692913728.0, + "43": 668003776.0, + "44": 673780992.0, + "45": 631182912.0, + "46": 444613312.0, + "47": 591957824.0, + "48": 617363968.0, + "49": 585295808.0, + "50": 570423872.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6637267456.0, - "2": 6637269504.0, - "3": 6637269504.0, - "4": 6637269504.0, - "5": 6637269504.0, - "6": 6637269504.0, - "7": 6637269504.0, - "8": 6637269504.0, - "9": 6637269504.0, - "10": 6637269504.0, - "11": 6637269504.0, - "12": 6637269504.0, - "13": 6637269504.0, - "14": 6637269504.0, - "15": 6637269504.0, - "16": 6637269504.0, - "17": 6637269504.0, - "18": 6637269504.0, - "19": 6637269504.0, - "20": 6637269504.0, - "21": 6637269504.0, - "22": 6637269504.0, - "23": 6637269504.0, - "24": 6637269504.0, - "25": 6637269504.0, - "26": 6637269504.0, - "27": 6637269504.0, - "28": 6637269504.0, - "29": 6637269504.0, - "30": 6637269504.0, - "31": 6637269504.0, - "32": 6637269504.0, - "33": 6637269504.0, - "34": 6637269504.0, - "35": 6637269504.0, - "36": 6637269504.0, - "37": 6637269504.0, - "38": 6637269504.0, - "39": 6637269504.0, - "40": 6637269504.0, - "41": 6637269504.0, - "42": 6637269504.0, - "43": 6637269504.0, - "44": 6637269504.0, - "45": 6637269504.0, - "46": 6637269504.0, - "47": 6637269504.0, - "48": 6637269504.0, - "49": 6637269504.0, - "50": 6637269504.0 + "1": 6637272576.0, + "2": 6637274624.0, + "3": 6637274624.0, + "4": 6637274624.0, + "5": 6637274624.0, + "6": 6637274624.0, + "7": 6637274624.0, + "8": 6637274624.0, + "9": 6637274624.0, + "10": 6637274624.0, + "11": 6637274624.0, + "12": 6637274624.0, + "13": 6637274624.0, + "14": 6637274624.0, + "15": 6637274624.0, + "16": 6637274624.0, + "17": 6637274624.0, + "18": 6637274624.0, + "19": 6637274624.0, + "20": 6637274624.0, + "21": 6637274624.0, + "22": 6637274624.0, + "23": 6637274624.0, + "24": 6637274624.0, + "25": 6637274624.0, + "26": 6637274624.0, + "27": 6637274624.0, + "28": 6637274624.0, + "29": 6637274624.0, + "30": 6637274624.0, + "31": 6637274624.0, + "32": 6637274624.0, + "33": 6637274624.0, + "34": 6637274624.0, + "35": 6637274624.0, + "36": 6637274624.0, + "37": 6637274624.0, + "38": 6637274624.0, + "39": 6637274624.0, + "40": 6637274624.0, + "41": 6637274624.0, + "42": 6637274624.0, + "43": 6637274624.0, + "44": 6637274624.0, + "45": 6637274624.0, + "46": 6637274624.0, + "47": 6637274624.0, + "48": 6637274624.0, + "49": 6637274624.0, + "50": 6637274624.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55055331328.0, - "2": 57809321984.0, - "3": 57918455808.0, - "4": 57918455808.0, - "5": 57918455808.0, - "6": 57918455808.0, - "7": 57918455808.0, - "8": 57918455808.0, - "9": 57918455808.0, - "10": 57918455808.0, - "11": 57918455808.0, - "12": 57918455808.0, - "13": 57931390976.0, - "14": 57931390976.0, - "15": 57931390976.0, - "16": 57931390976.0, - "17": 57931390976.0, - "18": 57931390976.0, - "19": 57931390976.0, - "20": 57931390976.0, - "21": 57931390976.0, - "22": 57931390976.0, - "23": 57931390976.0, - "24": 57931390976.0, - "25": 57931390976.0, - "26": 57931390976.0, - "27": 57931390976.0, - "28": 57931390976.0, - "29": 57931390976.0, - "30": 57931390976.0, - "31": 57931390976.0, - "32": 58003226624.0, - "33": 58003226624.0, - "34": 58003226624.0, - "35": 58003226624.0, - "36": 58003226624.0, - "37": 58003226624.0, - "38": 58003226624.0, - "39": 58003226624.0, - "40": 58003226624.0, - "41": 58003226624.0, - "42": 58003226624.0, - "43": 58003226624.0, - "44": 58183614464.0, - "45": 58234208256.0, - "46": 58555555840.0, - "47": 58555555840.0, - "48": 58555555840.0, - "49": 58555555840.0, - "50": 58780934144.0 + "1": 55056003072.0, + "2": 57810763776.0, + "3": 57920647168.0, + "4": 57920647168.0, + "5": 57920647168.0, + "6": 57920647168.0, + "7": 57920647168.0, + "8": 57920647168.0, + "9": 57920647168.0, + "10": 57920647168.0, + "11": 57920647168.0, + "12": 57920647168.0, + "13": 57920647168.0, + "14": 57920647168.0, + "15": 57920647168.0, + "16": 57920647168.0, + "17": 57920647168.0, + "18": 57920647168.0, + "19": 57920647168.0, + "20": 57920647168.0, + "21": 57920647168.0, + "22": 57920647168.0, + "23": 57920647168.0, + "24": 57920647168.0, + "25": 57920647168.0, + "26": 57920647168.0, + "27": 57920647168.0, + "28": 57920647168.0, + "29": 57920647168.0, + "30": 57920647168.0, + "31": 57920647168.0, + "32": 57920647168.0, + "33": 57920647168.0, + "34": 57961472000.0, + "35": 57961472000.0, + "36": 57961472000.0, + "37": 57961472000.0, + "38": 57961472000.0, + "39": 57961472000.0, + "40": 57961472000.0, + "41": 57961472000.0, + "42": 57961472000.0, + "43": 57961472000.0, + "44": 57961472000.0, + "45": 57961472000.0, + "46": 57961472000.0, + "47": 57961472000.0, + "48": 57961472000.0, + "49": 57961472000.0, + "50": 57961472000.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07654, - "2": 11.07406, - "3": 10.53881, - "4": 10.09803, - "5": 9.81154, - "6": 10.06236, - "7": 9.79762, - "8": 9.07117, - "9": 8.87049, - "10": 9.127, - "11": 8.49853, - "12": 8.53046, - "13": 8.42444, - "14": 7.847, - "15": 7.99077, - "16": 8.05015, - "17": 8.00064, - "18": 7.73104, - "19": 8.11087, - "20": 7.82933, - "21": 7.52501, - "22": 7.49916, - "23": 7.36982, - "24": 7.37235, - "25": 7.61578, - "26": 7.02029, - "27": 7.56014, - "28": 7.2681, - "29": 7.44399, - "30": 7.58618, - "31": 7.32468, - "32": 7.50596, - "33": 7.5715, - "34": 7.63581, - "35": 7.15224, - "36": 7.01784, - "37": 7.35163, - "38": 7.12551, - "39": 7.48656, - "40": 7.47408, - "41": 7.42096, - "42": 7.17595, - "43": 7.16059, - "44": 7.34289, - "45": 7.11969, - "46": 6.82753, - "47": 7.23525, - "48": 7.08042, - "49": 7.51043, - "50": 6.9735 + "1": 11.07648, + "2": 11.07404, + "3": 10.53854, + "4": 10.09813, + "5": 9.81166, + "6": 10.09741, + "7": 9.79481, + "8": 9.0642, + "9": 8.86016, + "10": 9.34039, + "11": 8.51318, + "12": 8.59467, + "13": 8.5292, + "14": 7.95757, + "15": 8.06962, + "16": 8.11802, + "17": 8.06993, + "18": 7.80587, + "19": 8.19192, + "20": 7.8906, + "21": 7.57063, + "22": 7.55091, + "23": 7.41606, + "24": 7.42454, + "25": 7.65274, + "26": 7.05583, + "27": 7.59747, + "28": 7.29984, + "29": 7.472, + "30": 7.61908, + "31": 7.35179, + "32": 7.52979, + "33": 7.59161, + "34": 7.66287, + "35": 7.17383, + "36": 7.04133, + "37": 7.37081, + "38": 7.1443, + "39": 7.50879, + "40": 7.48921, + "41": 7.43802, + "42": 7.19405, + "43": 7.17581, + "44": 7.35785, + "45": 7.13985, + "46": 6.84014, + "47": 7.25094, + "48": 7.09407, + "49": 7.52321, + "50": 6.98987 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 69.29797, - "2": 1.7261, - "3": 1.40981, - "4": 2.16562, - "5": 1.7862, - "6": 1.7469, - "7": 1.96688, - "8": 1.97301, - "9": 1.74665, - "10": 1.69613, - "11": 1.02979, - "12": 1.02408, - "13": 1.03261, - "14": 1.02432, - "15": 1.0529, - "16": 1.04491, - "17": 1.03693, - "18": 1.03399, - "19": 1.03627, - "20": 1.02284, - "21": 1.01667, - "22": 1.02932, - "23": 1.03591, - "24": 1.03466, - "25": 1.03149, - "26": 1.03165, - "27": 1.02342, - "28": 1.03777, - "29": 1.04061, - "30": 1.05641, - "31": 1.02382, - "32": 1.01775, - "33": 1.03039, - "34": 1.03693, - "35": 1.03153, - "36": 1.02699, - "37": 1.02756, - "38": 1.02919, - "39": 1.01773, - "40": 1.03491, - "41": 1.03152, - "42": 1.03035, - "43": 1.0221, - "44": 1.05201, - "45": 1.02579, - "46": 1.02798, - "47": 1.03857, - "48": 1.02772, - "49": 1.0408, - "50": 1.03745 + "1": 93.39829, + "2": 1.82958, + "3": 1.3241, + "4": 2.19661, + "5": 2.13156, + "6": 1.75452, + "7": 2.08539, + "8": 1.58016, + "9": 1.60816, + "10": 1.03407, + "11": 1.01797, + "12": 1.0168, + "13": 1.01666, + "14": 1.0748, + "15": 1.04137, + "16": 1.05864, + "17": 1.05961, + "18": 1.03233, + "19": 1.02728, + "20": 1.02917, + "21": 1.04313, + "22": 1.03054, + "23": 1.0313, + "24": 1.03789, + "25": 1.04414, + "26": 1.05561, + "27": 1.03361, + "28": 1.03142, + "29": 1.02437, + "30": 1.02195, + "31": 1.0172, + "32": 1.03318, + "33": 1.03742, + "34": 1.03628, + "35": 1.03575, + "36": 1.05127, + "37": 1.03273, + "38": 1.03381, + "39": 1.02923, + "40": 1.02986, + "41": 1.03249, + "42": 1.033, + "43": 1.03169, + "44": 1.03818, + "45": 1.02736, + "46": 1.02698, + "47": 1.03158, + "48": 1.02471, + "49": 1.03674, + "50": 1.0291 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json index 58eb3fc16cd..8cea616921e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.95004, - "2": 10.9521, - "3": 10.5115, - "4": 9.96454, - "5": 9.93941, - "6": 9.67273, - "7": 10.20975, - "8": 9.49716, - "9": 9.55902, - "10": 9.79742, - "11": 9.30109, - "12": 9.40483, - "13": 9.39546, - "14": 8.84681, - "15": 9.02444, - "16": 9.07121, - "17": 9.04574, - "18": 8.75678, - "19": 9.18159, - "20": 8.8595, - "21": 8.53503, - "22": 8.55182, - "23": 8.42441, - "24": 8.37608, - "25": 8.64304, - "26": 7.97393, - "27": 8.56806, - "28": 8.19764, - "29": 8.3928, - "30": 8.67283, - "31": 8.289, - "32": 8.43572, - "33": 8.5568, - "34": 8.66018, - "35": 8.07934, - "36": 7.94976, - "37": 8.29565, - "38": 7.98044, - "39": 8.39201, - "40": 8.35513, - "41": 8.31876, - "42": 8.0583, - "43": 8.03283, - "44": 8.24243, - "45": 8.10277, - "46": 7.61696, - "47": 8.15273, - "48": 8.00569, - "49": 8.38688, - "50": 7.81491 + "1": 10.94971, + "2": 10.95163, + "3": 10.51641, + "4": 9.9652, + "5": 9.94116, + "6": 9.67394, + "7": 10.19887, + "8": 9.50035, + "9": 9.54982, + "10": 9.79667, + "11": 9.30128, + "12": 9.40566, + "13": 9.39438, + "14": 8.84572, + "15": 9.02231, + "16": 9.06973, + "17": 9.04712, + "18": 8.75662, + "19": 9.18074, + "20": 8.86175, + "21": 8.53558, + "22": 8.55288, + "23": 8.42513, + "24": 8.37683, + "25": 8.64426, + "26": 7.9756, + "27": 8.57026, + "28": 8.1987, + "29": 8.39406, + "30": 8.67631, + "31": 8.29096, + "32": 8.43692, + "33": 8.55897, + "34": 8.66123, + "35": 8.08, + "36": 7.95214, + "37": 8.2979, + "38": 7.98177, + "39": 8.39281, + "40": 8.35852, + "41": 8.32006, + "42": 8.05954, + "43": 8.03381, + "44": 8.24236, + "45": 8.1025, + "46": 7.61814, + "47": 8.15364, + "48": 8.00693, + "49": 8.38704, + "50": 7.81592 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403624.0, - "2": 19274194.0, - "3": 19372760.0, - "4": 86525248.0, - "5": 148575568.0, - "6": 145226704.0, - "7": 171879984.0, - "8": 195785248.0, - "9": 164124752.0, - "10": 167684736.0, - "11": 221077344.0, - "12": 200384224.0, - "13": 248872528.0, - "14": 211169424.0, - "15": 214304608.0, - "16": 216075632.0, - "17": 267845984.0, - "18": 170470336.0, - "19": 176865072.0, - "20": 187955392.0, - "21": 225750704.0, - "22": 247396816.0, - "23": 211643856.0, - "24": 205638464.0, - "25": 277022272.0, - "26": 291562304.0, - "27": 225789840.0, - "28": 288202368.0, - "29": 198390384.0, - "30": 213302208.0, - "31": 227204752.0, - "32": 271112416.0, - "33": 231840432.0, - "34": 203575536.0, - "35": 191152368.0, - "36": 222566928.0, - "37": 177810112.0, - "38": 228708544.0, - "39": 211168784.0, - "40": 215603968.0, - "41": 200089440.0, - "42": 228529888.0, - "43": 198782848.0, - "44": 141902272.0, - "45": 181922816.0, - "46": 115369856.0, - "47": 170214176.0, - "48": 137292832.0, - "49": 97654936.0, - "50": 160979632.0 + "1": 19403704.0, + "2": 19274216.0, + "3": 22517470.0, + "4": 83429816.0, + "5": 139167728.0, + "6": 138921280.0, + "7": 173470304.0, + "8": 200511856.0, + "9": 165696320.0, + "10": 166120112.0, + "11": 213254416.0, + "12": 187847360.0, + "13": 231586656.0, + "14": 226879072.0, + "15": 219025920.0, + "16": 205179664.0, + "17": 280450432.0, + "18": 181477792.0, + "19": 191026096.0, + "20": 186395632.0, + "21": 233632576.0, + "22": 231696832.0, + "23": 216390688.0, + "24": 215133760.0, + "25": 233079504.0, + "26": 244437920.0, + "27": 222637584.0, + "28": 278773952.0, + "29": 253409264.0, + "30": 240036736.0, + "31": 236599008.0, + "32": 205066624.0, + "33": 263303312.0, + "34": 200444544.0, + "35": 199033824.0, + "36": 243001216.0, + "37": 151181872.0, + "38": 175301280.0, + "39": 219001024.0, + "40": 220307936.0, + "41": 217385856.0, + "42": 230074176.0, + "43": 208226784.0, + "44": 148172720.0, + "45": 141103744.0, + "46": 132664976.0, + "47": 179619392.0, + "48": 118381144.0, + "49": 86643984.0, + "50": 113798320.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4883602432.0, - "2": 4885017088.0, - "3": 4882657792.0, - "4": 4883046912.0, - "5": 4883725824.0, - "6": 4883713536.0, - "7": 4883040768.0, - "8": 4883273216.0, - "9": 4882952704.0, - "10": 4885949952.0, - "11": 4883990016.0, - "12": 4887679488.0, - "13": 4884011520.0, - "14": 4882899456.0, - "15": 4883515904.0, - "16": 4883990016.0, - "17": 4883410432.0, - "18": 4883673600.0, - "19": 4882903552.0, - "20": 4884541952.0, - "21": 4883138048.0, - "22": 4883247616.0, - "23": 4883839488.0, - "24": 4885058048.0, - "25": 4882676224.0, - "26": 4884058624.0, - "27": 4884724224.0, - "28": 4884874752.0, - "29": 4883127808.0, - "30": 4883252736.0, - "31": 4882955776.0, - "32": 4885190144.0, - "33": 4883845632.0, - "34": 4884392448.0, - "35": 4883083776.0, - "36": 4883851776.0, - "37": 4885246464.0, - "38": 4882680320.0, - "39": 4884296192.0, - "40": 4884689408.0, - "41": 4882836992.0, - "42": 4883972608.0, - "43": 4884519424.0, - "44": 4883354112.0, - "45": 4883495424.0, - "46": 4882788864.0, - "47": 4883144192.0, - "48": 4883688960.0, - "49": 4884182528.0, - "50": 4885279232.0 + "1": 4883287040.0, + "2": 4883441152.0, + "3": 4881697280.0, + "4": 4883730944.0, + "5": 4882556416.0, + "6": 4882616832.0, + "7": 4883438080.0, + "8": 4881568256.0, + "9": 4883173888.0, + "10": 4882272768.0, + "11": 4883676672.0, + "12": 4881393152.0, + "13": 4883141120.0, + "14": 4883697152.0, + "15": 4882622976.0, + "16": 4881830400.0, + "17": 4881658368.0, + "18": 4881863168.0, + "19": 4883804672.0, + "20": 4881795584.0, + "21": 4883333632.0, + "22": 4882194944.0, + "23": 4882084352.0, + "24": 4884065792.0, + "25": 4881804800.0, + "26": 4883596800.0, + "27": 4883047936.0, + "28": 4882476544.0, + "29": 4883087872.0, + "30": 4882151936.0, + "31": 4882625024.0, + "32": 4883104256.0, + "33": 4882526720.0, + "34": 4882292224.0, + "35": 4882485760.0, + "36": 4882867712.0, + "37": 4882634240.0, + "38": 4882610688.0, + "39": 4881474048.0, + "40": 4881961472.0, + "41": 4882663936.0, + "42": 4881860096.0, + "43": 4881499648.0, + "44": 4883392000.0, + "45": 4882392576.0, + "46": 4882815488.0, + "47": 4883113472.0, + "48": 4882158080.0, + "49": 4881207808.0, + "50": 4881588736.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41210470400.0, - "2": 41210470400.0, - "3": 41210470400.0, - "4": 41210470400.0, - "5": 41210470400.0, - "6": 41210470400.0, - "7": 41210470400.0, - "8": 41210470400.0, - "9": 41210470400.0, - "10": 41210470400.0, - "11": 41210470400.0, - "12": 41210470400.0, - "13": 41210470400.0, - "14": 41210470400.0, - "15": 41210470400.0, - "16": 41210470400.0, - "17": 41210470400.0, - "18": 41210470400.0, - "19": 41210470400.0, - "20": 41210470400.0, - "21": 41210470400.0, - "22": 41210470400.0, - "23": 41210470400.0, - "24": 41210470400.0, - "25": 41210470400.0, - "26": 41210470400.0, - "27": 41210470400.0, - "28": 41210470400.0, - "29": 41210470400.0, - "30": 41210470400.0, - "31": 41210470400.0, - "32": 41210470400.0, - "33": 41210470400.0, - "34": 41210470400.0, - "35": 41210470400.0, - "36": 41210470400.0, - "37": 41210470400.0, - "38": 41210470400.0, - "39": 41210470400.0, - "40": 41210470400.0, - "41": 41210470400.0, - "42": 41210470400.0, - "43": 41210470400.0, - "44": 41210470400.0, - "45": 41210470400.0, - "46": 41210470400.0, - "47": 41210470400.0, - "48": 41210470400.0, - "49": 41210470400.0, - "50": 41210470400.0 + "1": 41208348672.0, + "2": 41208348672.0, + "3": 41208348672.0, + "4": 41208348672.0, + "5": 41208348672.0, + "6": 41208348672.0, + "7": 41208348672.0, + "8": 41208348672.0, + "9": 41208348672.0, + "10": 41208348672.0, + "11": 41208348672.0, + "12": 41208348672.0, + "13": 41208348672.0, + "14": 41208348672.0, + "15": 41208348672.0, + "16": 41208348672.0, + "17": 41208348672.0, + "18": 41208348672.0, + "19": 41208348672.0, + "20": 41208348672.0, + "21": 41208348672.0, + "22": 41208348672.0, + "23": 41208348672.0, + "24": 41208348672.0, + "25": 41208348672.0, + "26": 41208348672.0, + "27": 41208348672.0, + "28": 41208348672.0, + "29": 41208348672.0, + "30": 41208348672.0, + "31": 41208348672.0, + "32": 41208348672.0, + "33": 41208348672.0, + "34": 41208348672.0, + "35": 41208348672.0, + "36": 41208348672.0, + "37": 41208348672.0, + "38": 41208348672.0, + "39": 41208348672.0, + "40": 41208348672.0, + "41": 41208348672.0, + "42": 41208348672.0, + "43": 41208348672.0, + "44": 41208348672.0, + "45": 41208348672.0, + "46": 41208348672.0, + "47": 41208348672.0, + "48": 41208348672.0, + "49": 41208348672.0, + "50": 41208348672.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 86.8085, - "2": 1.10913, - "3": 0.99097, - "4": 0.89412, - "5": 1.25997, - "6": 0.98162, - "7": 0.98318, - "8": 1.13296, - "9": 0.88126, - "10": 0.8633, - "11": 2.2744, - "12": 4.5393, - "13": 3.22763, - "14": 1.64923, - "15": 0.86595, - "16": 0.86575, - "17": 0.85272, - "18": 0.85454, - "19": 0.85281, - "20": 0.87018, - "21": 0.84654, - "22": 0.8494, - "23": 0.84882, - "24": 0.84482, - "25": 0.85311, - "26": 0.84678, - "27": 0.84096, - "28": 0.8412, - "29": 0.84156, - "30": 0.84475, - "31": 0.84747, - "32": 0.85058, - "33": 0.84977, - "34": 0.8479, - "35": 0.85234, - "36": 0.85012, - "37": 0.85087, - "38": 0.84594, - "39": 0.84558, - "40": 0.84807, - "41": 0.84183, - "42": 0.8439, - "43": 0.84221, - "44": 0.84248, - "45": 0.84257, - "46": 0.83922, - "47": 0.84311, - "48": 0.84159, - "49": 0.84011, - "50": 0.8353 + "1": 89.10928, + "2": 1.08143, + "3": 0.94222, + "4": 0.89675, + "5": 1.34524, + "6": 1.06972, + "7": 1.00314, + "8": 1.04961, + "9": 0.86611, + "10": 0.86248, + "11": 0.98739, + "12": 0.86057, + "13": 0.86777, + "14": 0.85834, + "15": 0.8559, + "16": 0.85522, + "17": 0.84644, + "18": 0.85748, + "19": 0.85218, + "20": 0.85342, + "21": 0.84029, + "22": 0.84342, + "23": 0.84297, + "24": 0.83925, + "25": 0.8439, + "26": 0.85696, + "27": 0.83981, + "28": 0.84643, + "29": 0.8433, + "30": 0.86234, + "31": 0.85636, + "32": 0.84184, + "33": 0.84501, + "34": 0.84316, + "35": 0.83806, + "36": 0.84143, + "37": 0.84447, + "38": 0.84137, + "39": 0.84133, + "40": 0.84321, + "41": 0.84019, + "42": 0.84164, + "43": 0.83741, + "44": 0.84203, + "45": 0.83966, + "46": 0.84109, + "47": 0.83945, + "48": 0.84001, + "49": 0.84194, + "50": 0.83578 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json index 1ba051f4889..0835e95b926 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json @@ -1 +1,142 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.83281, "5": 10.85975, "10": 10.79613, "15": 10.80527, "20": 10.72502, "25": 10.53599, "30": 10.3571, "35": 10.24605, "40": 10.05992, "45": 9.7836, "50": 9.8722, "55": 9.83189, "60": 9.45075, "65": 8.89679, "70": 9.71414, "75": 9.39795, "80": 9.38169, "85": 9.58585, "90": 9.7999, "95": 9.50528, "100": 9.37224}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 27013.0, "5": 31736.0, "10": 25785.0, "15": 30383.0, "20": 28435.0, "25": 27493.0, "30": 30329.0, "35": 31750.0, "40": 34279.0, "45": 34634.0, "50": 38531.0, "55": 37465.0, "60": 40172.0, "65": 40624.0, "70": 44852.0, "75": 39231.0, "80": 130535.0, "85": 123250.0, "90": 47793.0, "95": 167340.0, "100": 163328.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 814390272.0, "5": 814420480.0, "10": 814376448.0, "15": 814376960.0, "20": 814373376.0, "25": 814321152.0, "30": 814306304.0, "35": 814292992.0, "40": 814288896.0, "45": 814272000.0, "50": 814262272.0, "55": 814258688.0, "60": 814268416.0, "65": 814220800.0, "70": 814266880.0, "75": 814318080.0, "80": 814285312.0, "85": 814289408.0, "90": 814315520.0, "95": 814320128.0, "100": 814311424.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2111314944.0, "5": 2370209280.0, "10": 2370209280.0, "15": 2370209280.0, "20": 2370209280.0, "25": 2370209280.0, "30": 2370209280.0, "35": 2370209280.0, "40": 2370209280.0, "45": 2370209280.0, "50": 2370209280.0, "55": 2370209280.0, "60": 2370209280.0, "65": 2370209280.0, "70": 2370209280.0, "75": 2370209280.0, "80": 2370209280.0, "85": 2370209280.0, "90": 2370209280.0, "95": 2370209280.0, "100": 2370209280.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 20.98318, "5": 0.79797, "10": 0.74028, "15": 0.67279, "20": 0.62948, "25": 0.61132, "30": 0.61547, "35": 0.6152, "40": 0.60421, "45": 0.59124, "50": 0.5891, "55": 0.57048, "60": 0.54799, "65": 0.52185, "70": 0.51195, "75": 0.50105, "80": 0.4628, "85": 0.45992, "90": 0.46498, "95": 0.4599, "100": 0.42568}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 10.82922, + "5": 10.85652, + "10": 10.79298, + "15": 10.8067, + "20": 10.72654, + "25": 10.53282, + "30": 10.35802, + "35": 10.24483, + "40": 10.05533, + "45": 9.77951, + "50": 9.86874, + "55": 9.82995, + "60": 9.449, + "65": 8.89366, + "70": 9.71127, + "75": 9.39451, + "80": 9.38198, + "85": 9.58333, + "90": 9.79944, + "95": 9.50213, + "100": 9.37131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 27245.0, + "5": 31369.0, + "10": 25870.0, + "15": 29830.0, + "20": 28243.0, + "25": 27636.0, + "30": 30387.0, + "35": 31488.0, + "40": 34779.0, + "45": 35158.0, + "50": 38234.0, + "55": 37133.0, + "60": 40450.0, + "65": 40947.0, + "70": 43436.0, + "75": 39925.0, + "80": 51863.0, + "85": 2145177.0, + "90": 51330.0, + "95": 45247.0, + "100": 163741.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 787511296.0, + "5": 787542016.0, + "10": 787500032.0, + "15": 787499008.0, + "20": 787500032.0, + "25": 787446272.0, + "30": 787429888.0, + "35": 787413504.0, + "40": 787409920.0, + "45": 787394560.0, + "50": 787384320.0, + "55": 787383808.0, + "60": 787389952.0, + "65": 787346432.0, + "70": 787387904.0, + "75": 787437568.0, + "80": 787405312.0, + "85": 787407360.0, + "90": 787441664.0, + "95": 787445248.0, + "100": 787433472.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 2465793024.0, + "5": 2492764160.0, + "10": 2492764160.0, + "15": 2492764160.0, + "20": 2492764160.0, + "25": 2492764160.0, + "30": 2492764160.0, + "35": 2492764160.0, + "40": 2492764160.0, + "45": 2492764160.0, + "50": 2492764160.0, + "55": 2492764160.0, + "60": 2492764160.0, + "65": 2492764160.0, + "70": 2492764160.0, + "75": 2492764160.0, + "80": 2492764160.0, + "85": 2492764160.0, + "90": 2492764160.0, + "95": 2492764160.0, + "100": 2492764160.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 9.68104, + "5": 0.32859, + "10": 0.30772, + "15": 0.31234, + "20": 0.29254, + "25": 0.29296, + "30": 0.31344, + "35": 0.31026, + "40": 0.30514, + "45": 0.30481, + "50": 0.30324, + "55": 0.29929, + "60": 0.30103, + "65": 0.32008, + "70": 0.31307, + "75": 0.2933, + "80": 0.29351, + "85": 0.29283, + "90": 0.29375, + "95": 0.29458, + "100": 0.29103 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7e299df5257 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82922, + "2": 10.84163, + "3": 10.84245, + "4": 10.82, + "5": 10.85652, + "6": 10.86906, + "7": 10.83778, + "8": 10.84312, + "9": 10.84423, + "10": 10.79298, + "11": 10.86697, + "12": 10.86875, + "13": 10.86207, + "14": 10.86919, + "15": 10.8067, + "16": 10.8057, + "17": 10.77686, + "18": 10.79541, + "19": 10.78384, + "20": 10.72654, + "21": 10.69491, + "22": 10.54462, + "23": 10.6993, + "24": 10.58151, + "25": 10.53282, + "26": 10.58817, + "27": 10.601, + "28": 10.57563, + "29": 10.58022, + "30": 10.35802, + "31": 10.08769, + "32": 10.44466, + "33": 10.4477, + "34": 10.18704, + "35": 10.24483, + "36": 10.19713, + "37": 10.32294, + "38": 10.17101, + "39": 10.37026, + "40": 10.05533, + "41": 10.09491, + "42": 10.17971, + "43": 9.78263, + "44": 9.91346, + "45": 9.77951, + "46": 9.75648, + "47": 10.09647, + "48": 9.80391, + "49": 9.46649, + "50": 9.86874, + "51": 9.79428, + "52": 9.68303, + "53": 10.03314, + "54": 9.9113, + "55": 9.82995, + "56": 9.57839, + "57": 9.42377, + "58": 9.80549, + "59": 9.53292, + "60": 9.449, + "61": 9.65293, + "62": 9.95672, + "63": 9.33775, + "64": 9.74194, + "65": 8.89366, + "66": 9.67317, + "67": 9.33002, + "68": 9.76517, + "69": 9.76336, + "70": 9.71127, + "71": 9.59511, + "72": 9.54797, + "73": 9.47124, + "74": 8.89297, + "75": 9.39451, + "76": 9.04721, + "77": 10.04318, + "78": 9.70313, + "79": 9.35169, + "80": 9.38198, + "81": 9.45146, + "82": 9.67546, + "83": 9.27658, + "84": 9.39241, + "85": 9.58333, + "86": 9.04518, + "87": 9.56487, + "88": 9.72459, + "89": 9.57019, + "90": 9.79944, + "91": 9.30737, + "92": 9.3313, + "93": 9.04109, + "94": 8.80259, + "95": 9.50213, + "96": 9.5021, + "97": 9.28183, + "98": 9.64883, + "99": 8.8594, + "100": 9.37131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 27245.0, + "2": 28958.0, + "3": 29464.0, + "4": 28046.0, + "5": 31369.0, + "6": 33287.0, + "7": 31200.0, + "8": 26921.0, + "9": 30008.0, + "10": 25870.0, + "11": 33681.0, + "12": 30344.0, + "13": 32737.0, + "14": 33315.0, + "15": 29830.0, + "16": 32475.0, + "17": 30747.0, + "18": 30381.0, + "19": 31032.0, + "20": 28243.0, + "21": 29224.0, + "22": 27340.0, + "23": 34119.0, + "24": 29049.0, + "25": 27636.0, + "26": 30662.0, + "27": 32009.0, + "28": 33355.0, + "29": 34714.0, + "30": 30387.0, + "31": 28212.0, + "32": 33411.0, + "33": 34696.0, + "34": 30053.0, + "35": 31488.0, + "36": 32943.0, + "37": 35829.0, + "38": 33740.0, + "39": 37632.0, + "40": 34779.0, + "41": 33958.0, + "42": 36396.0, + "43": 34088.0, + "44": 34090.0, + "45": 35158.0, + "46": 36174.0, + "47": 39772.0, + "48": 36516.0, + "49": 36733.0, + "50": 38234.0, + "51": 38608.0, + "52": 37030.0, + "53": 42442.0, + "54": 40944.0, + "55": 37133.0, + "56": 41001.0, + "57": 37524.0, + "58": 42317.0, + "59": 40804.0, + "60": 40450.0, + "61": 41478.0, + "62": 39766.0, + "63": 37941.0, + "64": 42197.0, + "65": 40947.0, + "66": 44094.0, + "67": 41958.0, + "68": 40060.0, + "69": 42189.0, + "70": 43436.0, + "71": 42748.0, + "72": 44280.0, + "73": 47478.0, + "74": 41456.0, + "75": 39925.0, + "76": 43490.0, + "77": 45636.0, + "78": 2141470.0, + "79": 46055.0, + "80": 51863.0, + "81": 151341.0, + "82": 49835.0, + "83": 143360.0, + "84": 2141546.0, + "85": 2145177.0, + "86": 132114.0, + "87": 2147022.0, + "88": 59899.0, + "89": 162883.0, + "90": 51330.0, + "91": 2141901.0, + "92": 44946.0, + "93": 138194.0, + "94": 2145772.0, + "95": 45247.0, + "96": 135045.0, + "97": 53170.0, + "98": 168576.0, + "99": 2141797.0, + "100": 163741.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 787516416.0, + "2": 787540992.0, + "3": 787524096.0, + "4": 787512320.0, + "5": 787547136.0, + "6": 787537920.0, + "7": 787512832.0, + "8": 787524608.0, + "9": 787528192.0, + "10": 787505152.0, + "11": 787522048.0, + "12": 787520000.0, + "13": 787529728.0, + "14": 787529216.0, + "15": 787504128.0, + "16": 787513344.0, + "17": 787503104.0, + "18": 787489280.0, + "19": 787514880.0, + "20": 787505152.0, + "21": 787479552.0, + "22": 787486208.0, + "23": 787478528.0, + "24": 787486208.0, + "25": 787451392.0, + "26": 787482112.0, + "27": 787470848.0, + "28": 787450368.0, + "29": 787458048.0, + "30": 787435008.0, + "31": 787406848.0, + "32": 787424256.0, + "33": 787435520.0, + "34": 787426304.0, + "35": 787418624.0, + "36": 787436544.0, + "37": 787428352.0, + "38": 787436544.0, + "39": 787417600.0, + "40": 787415040.0, + "41": 787405824.0, + "42": 787415040.0, + "43": 787367936.0, + "44": 787392512.0, + "45": 787399680.0, + "46": 787355136.0, + "47": 787411456.0, + "48": 787354112.0, + "49": 787374080.0, + "50": 787389440.0, + "51": 787375616.0, + "52": 787383808.0, + "53": 787379712.0, + "54": 787384832.0, + "55": 787388928.0, + "56": 787388928.0, + "57": 787351040.0, + "58": 787382784.0, + "59": 787374080.0, + "60": 787395072.0, + "61": 787405312.0, + "62": 787405824.0, + "63": 787373056.0, + "64": 787388928.0, + "65": 787351552.0, + "66": 787386880.0, + "67": 787392000.0, + "68": 787399168.0, + "69": 787383296.0, + "70": 787393024.0, + "71": 787406848.0, + "72": 787400704.0, + "73": 787401216.0, + "74": 787403264.0, + "75": 787442688.0, + "76": 787444736.0, + "77": 787445760.0, + "78": 787395072.0, + "79": 787430400.0, + "80": 787410432.0, + "81": 787412992.0, + "82": 787427840.0, + "83": 787428864.0, + "84": 787412480.0, + "85": 787412480.0, + "86": 787394560.0, + "87": 787452928.0, + "88": 787414528.0, + "89": 787404800.0, + "90": 787446784.0, + "91": 787446272.0, + "92": 787446784.0, + "93": 787430400.0, + "94": 787440128.0, + "95": 787450368.0, + "96": 787454976.0, + "97": 787427328.0, + "98": 787475968.0, + "99": 787419136.0, + "100": 787438592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2479493120.0, + "2": 2485449728.0, + "3": 2487249408.0, + "4": 2487249408.0, + "5": 2495991808.0, + "6": 2495991808.0, + "7": 2495991808.0, + "8": 2495991808.0, + "9": 2495991808.0, + "10": 2495991808.0, + "11": 2495991808.0, + "12": 2495991808.0, + "13": 2495991808.0, + "14": 2495991808.0, + "15": 2495991808.0, + "16": 2495991808.0, + "17": 2495991808.0, + "18": 2495991808.0, + "19": 2495991808.0, + "20": 2495991808.0, + "21": 2495991808.0, + "22": 2495991808.0, + "23": 2495991808.0, + "24": 2495991808.0, + "25": 2495991808.0, + "26": 2495991808.0, + "27": 2495991808.0, + "28": 2495991808.0, + "29": 2495991808.0, + "30": 2495991808.0, + "31": 2495991808.0, + "32": 2495991808.0, + "33": 2495991808.0, + "34": 2495991808.0, + "35": 2495991808.0, + "36": 2495991808.0, + "37": 2495991808.0, + "38": 2495991808.0, + "39": 2495991808.0, + "40": 2495991808.0, + "41": 2495991808.0, + "42": 2495991808.0, + "43": 2495991808.0, + "44": 2495991808.0, + "45": 2495991808.0, + "46": 2495991808.0, + "47": 2495991808.0, + "48": 2495991808.0, + "49": 2495991808.0, + "50": 2495991808.0, + "51": 2495991808.0, + "52": 2495991808.0, + "53": 2495991808.0, + "54": 2495991808.0, + "55": 2495991808.0, + "56": 2495991808.0, + "57": 2495991808.0, + "58": 2495991808.0, + "59": 2495991808.0, + "60": 2495991808.0, + "61": 2495991808.0, + "62": 2495991808.0, + "63": 2495991808.0, + "64": 2495991808.0, + "65": 2495991808.0, + "66": 2495991808.0, + "67": 2495991808.0, + "68": 2495991808.0, + "69": 2495991808.0, + "70": 2495991808.0, + "71": 2495991808.0, + "72": 2495991808.0, + "73": 2495991808.0, + "74": 2495991808.0, + "75": 2495991808.0, + "76": 2495991808.0, + "77": 2495991808.0, + "78": 2495991808.0, + "79": 2495991808.0, + "80": 2495991808.0, + "81": 2495991808.0, + "82": 2495991808.0, + "83": 2495991808.0, + "84": 2495991808.0, + "85": 2495991808.0, + "86": 2495991808.0, + "87": 2495991808.0, + "88": 2495991808.0, + "89": 2495991808.0, + "90": 2495991808.0, + "91": 2495991808.0, + "92": 2495991808.0, + "93": 2495991808.0, + "94": 2495991808.0, + "95": 2495991808.0, + "96": 2495991808.0, + "97": 2495991808.0, + "98": 2495991808.0, + "99": 2495991808.0, + "100": 2495991808.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.11313, + "2": 0.4805, + "3": 0.36965, + "4": 0.36695, + "5": 0.31705, + "6": 0.31275, + "7": 0.31299, + "8": 0.29866, + "9": 0.28961, + "10": 0.28859, + "11": 0.29067, + "12": 0.29044, + "13": 0.29806, + "14": 0.29287, + "15": 0.29391, + "16": 0.3175, + "17": 0.28363, + "18": 0.2818, + "19": 0.29347, + "20": 0.28931, + "21": 0.29103, + "22": 0.28444, + "23": 0.28907, + "24": 0.27608, + "25": 0.28277, + "26": 0.28656, + "27": 0.28921, + "28": 0.30243, + "29": 0.30435, + "30": 0.31231, + "31": 0.30439, + "32": 0.31412, + "33": 0.28887, + "34": 0.29613, + "35": 0.29738, + "36": 0.29754, + "37": 0.3019, + "38": 0.2933, + "39": 0.2944, + "40": 0.29283, + "41": 0.29592, + "42": 0.29673, + "43": 0.29319, + "44": 0.30127, + "45": 0.29921, + "46": 0.29904, + "47": 0.28795, + "48": 0.29918, + "49": 0.28711, + "50": 0.29645, + "51": 0.28777, + "52": 0.29536, + "53": 0.2847, + "54": 0.28286, + "55": 0.2874, + "56": 0.28699, + "57": 0.28614, + "58": 0.29825, + "59": 0.28363, + "60": 0.29423, + "61": 0.29226, + "62": 0.2896, + "63": 0.28065, + "64": 0.29533, + "65": 0.29842, + "66": 0.28487, + "67": 0.28419, + "68": 0.29474, + "69": 0.28383, + "70": 0.28417, + "71": 0.29253, + "72": 0.28737, + "73": 0.27923, + "74": 0.28728, + "75": 0.29383, + "76": 0.28157, + "77": 0.64771, + "78": 0.29148, + "79": 0.28742, + "80": 0.29245, + "81": 0.28827, + "82": 0.28368, + "83": 0.28963, + "84": 0.29234, + "85": 0.28183, + "86": 0.28337, + "87": 0.27879, + "88": 0.28388, + "89": 0.28309, + "90": 0.28852, + "91": 0.28254, + "92": 0.28375, + "93": 0.28633, + "94": 0.28567, + "95": 0.28235, + "96": 0.28513, + "97": 0.27951, + "98": 0.27851, + "99": 0.28336, + "100": 0.27744 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml index 3ecd68b9841..8874f9cf045 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml @@ -56,7 +56,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true - --ckpt-format: torch_dist + --ckpt-format: fsdp_dtensor --dist-ckpt-optim-fully-reshardable: true --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 8164ca37df8..607d48380d5 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -106,14 +106,13 @@ products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - # TODO: The migration of custom fsdp causes EP + FSDP to be temporarily unavailable, which will be fixed in a subsequent MR. - # - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] - # products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] - # - environment: [lts] - # scope: [nightly] + - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - environment: [lts] + scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] diff --git a/tools/checkpoint/checkpoint_inspector.py b/tools/checkpoint/checkpoint_inspector.py index 34afa27755f..c62f0ca7417 100644 --- a/tools/checkpoint/checkpoint_inspector.py +++ b/tools/checkpoint/checkpoint_inspector.py @@ -8,6 +8,8 @@ import time import re import shutil +from typing import Optional +import tempfile import click import torch @@ -19,6 +21,7 @@ FileSystemReader, FileSystemWriter, ) +from torch.distributed.checkpoint.format_utils import dcp_to_torch_save from torch.distributed.checkpoint.metadata import ( BytesStorageMetadata, TensorStorageMetadata, @@ -64,7 +67,8 @@ def cli(): @cli.command() @click.argument("checkpoint_dir", type=click.Path(exists=True)) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -def inspect(checkpoint_dir, enable_msc): +@click.option("--not-ignore-param-to-group-meta", is_flag=True, help="Ignore parameter-to-group metadata.") +def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta): """Inspect a Megatron Core Distributed Checkpoint""" ckpt_path = Path(checkpoint_dir) @@ -138,6 +142,8 @@ def inspect(checkpoint_dir, enable_msc): ] click.echo(" | ".join(stats) + "\n") + ignore_param_to_group_meta = not not_ignore_param_to_group_meta + ignore_param_to_group_meta_count = 0 for key, value in metadata.state_dict_metadata.items(): bullet = click.style("►", fg="blue") key_styled = click.style(key, fg="green") @@ -147,11 +153,18 @@ def inspect(checkpoint_dir, enable_msc): shape = click.style(f"{tuple(value.size)}", fg="magenta") click.echo(f" {bullet} {key_styled} [{dtype}, shape={shape}]") elif isinstance(value, BytesStorageMetadata): + if ignore_param_to_group_meta and key.startswith("optimizer.param_to_group_meta."): + ignore_param_to_group_meta_count += 1 + continue click.echo(f" {bullet} {key_styled} {click.style('[BYTES]', fg='yellow')}") else: click.echo( f" {bullet} {key_styled} {click.style('[UNKNOWN TYPE]', fg='red')}" ) + if ignore_param_to_group_meta: + click.echo( + click.style(f"Ignored parameter-to-group metadata: {ignore_param_to_group_meta_count}", fg="yellow") + ) # MCore data section try: @@ -323,8 +336,10 @@ def convert_checkpoint( output_dir, swiglu, process_group, + optimizer_param_to_group_prefix="optimizer.param_to_group_meta.module.module.module", optimizer_state_prefix="optimizer.state.module.module.module", model_weight_prefix="model.module", + param_to_param_group_map={}, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to standard fsdp_dtensor format.""" device_mesh = DeviceMesh.from_group(process_group, device_type="cuda") @@ -371,6 +386,104 @@ def _free_up_some_gpu_memory(): gc.collect() torch.cuda.empty_cache() + def split_layers( + key: str, + value: torch.Tensor, + orig_shape: Optional[torch.Size] = None, + ) -> dict[str, torch.Tensor]: + """ + Split layers into separate tensors. + """ + _free_up_some_gpu_memory() + layers = {} + for i, v in enumerate(split_dtensor(value, 1, dim=0)): + v = gather_uneven_dtensor_to_full_tensor(v).reshape( + orig_shape[1:] if orig_shape else value.shape[1:] + ).redistribute(placements=[Shard(0)]) + + layer_key = key.replace(".layers.", f".layers.{i}.") + layers[layer_key] = v + + return layers + + def split_expert_weights( + key: str, + value: torch.Tensor, + orig_shape: Optional[torch.Size] = None, + ) -> dict[str, torch.Tensor]: + """ + Split expert weights into separate tensors for each expert. + """ + experts = {} + layer_key = key.replace(".experts.experts.", ".experts.") + expert_weights = split_dtensor(value, 1, dim=0) + for expert_idx, expert_weight in enumerate(expert_weights): + layer_key_parts = layer_key.split(".weight", 1) + if len(layer_key_parts) == 1: + expert_key = f"{layer_key}{expert_idx}" + elif len(layer_key_parts) == 2: + expert_key = f"{layer_key_parts[0]}.weight{expert_idx}{layer_key_parts[1]}" + else: + raise ValueError(f"Unexpected expert layer key: {layer_key}") + + expert_weight = gather_uneven_dtensor_to_full_tensor(expert_weight) + expert_shape = orig_shape[1:] if orig_shape else value.shape[1:] + # Handle optimizer states for expert linear_fc2 when ETP is enabled + if ( + layer_key.startswith("optimizer.state.") + and "linear_fc2" in layer_key + and expert_weight.shape[-2] > 1 + ): + tp_size = expert_weight.shape[-2] + rows, cols = expert_shape + # Reshape to split column dimension by tp_size + expert_weight = expert_weight.reshape( + *expert_weight.shape[:-1], rows, cols // tp_size + ) + dims = list(range(expert_weight.ndim)) + dims[-3], dims[-2] = dims[-2], dims[-3] + expert_weight = ( + expert_weight.permute(*dims) + .reshape(expert_shape) + .redistribute(placements=[Shard(0)]) + ) + else: + expert_weight = expert_weight.reshape(expert_shape).redistribute( + placements=[Shard(0)] + ) + experts[expert_key] = expert_weight + return experts + + def is_swiglu_key(key): + return any(re.search(pat, key) for pat in [ + r"(.*)\.mlp\.linear_fc1\.weight", + r"(.*)\.mlp\.linear_fc1\.bias", + r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)", + r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias", + ]) + + def split_swiglu_weight(key: str, value: torch.Tensor) -> dict[str, torch.Tensor]: + """ + Split SwiGLU weights into separate tensors. + """ + value = gather_uneven_dtensor_to_full_tensor(value) + swiglu_w_and_v = {} + w, v = torch.chunk(value, 2, dim=0) + w = w.redistribute(placements=[Shard(0)]) + v = v.redistribute(placements=[Shard(0)]) + w_key = re.sub(r'(weight\d*)(.*)', r'\1_w\2', key) + v_key = re.sub(r'(weight\d*)(.*)', r'\1_v\2', key) + swiglu_w_and_v[w_key] = w + swiglu_w_and_v[v_key] = v + return swiglu_w_and_v + + def has_layer_index(key: str) -> bool: + return bool(re.search(r"layers\.(\d+)\.", key)) + while state_dict: key, value = state_dict.popitem() if torch.distributed.get_rank() == 0: @@ -387,9 +500,11 @@ def _free_up_some_gpu_memory(): # Special handling for optimizer state key_list = key.split(".") new_key = f"{optimizer_state_prefix}.{'.'.join(key_list[3:])}.{key_list[2]}" + is_param = False else: # Special handling for module parameters new_key = f"{model_weight_prefix}.{key}" + is_param = True # Handle dist-opt flatten tensors if ( @@ -406,68 +521,47 @@ def _free_up_some_gpu_memory(): else: orig_shape = None - # Handle multi-layer tensors - if ".layers." in new_key: - n_layer = value.shape[0] - - _free_up_some_gpu_memory() - per_layer_values = [ - gather_uneven_dtensor_to_full_tensor(v).redistribute( - placements=[Shard(len(v.shape) - 1)] - ) - for v in split_dtensor(value, 1, dim=0) - ] - for i in range(n_layer): - if orig_shape is not None: - layer_shape = orig_shape[1:] - else: - layer_shape = value.shape[1:] - - per_layer_values[i] = ( - per_layer_values[i] - .reshape(layer_shape) - .redistribute(placements=[Shard(0)]) - ) - for i in range(0, n_layer): - layer_key = new_key.replace(".layers.", f".layers.{i}.") - if swiglu and "mlp.linear_fc1.weight" in layer_key: - # Special case for SwiGLU - w, v = torch.chunk(per_layer_values[i], 2, dim=0) - w = w.redistribute(placements=[Shard(0)]) - v = v.redistribute(placements=[Shard(0)]) - w_key = layer_key.replace( - "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_w" - ) - v_key = layer_key.replace( - "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_v" - ) - # Store both w and v in the state_dict - fsdp_dtensor_state_dict[w_key] = w - fsdp_dtensor_state_dict[v_key] = v - elif ( - "experts.experts.linear_fc1.weight" in layer_key - or "experts.experts.linear_fc2.weight" in layer_key + # Handle multi-layer / experts tensors + split_tensors = {} + if ".layers." in new_key and not has_layer_index(new_key): + split_tensors = split_layers(new_key, value, orig_shape) + elif ".experts.experts." in new_key: + split_tensors = split_expert_weights(new_key, value, orig_shape) + else: + if orig_shape: + value = gather_uneven_dtensor_to_full_tensor(value) + # Handle optimizer states with partition_dim=1 when TP is enabled + if ( + new_key.startswith("optimizer.state.") + and value.ndim > 2 + and value.shape[-2] > 1 ): - # Special case for MoE - layer_key = layer_key.replace(".experts.experts.", ".experts.") - expert_weights = torch.split(per_layer_values[i], 1, dim=0) - for expert_idx, expert_weight in enumerate(expert_weights): - expert_key = f"{layer_key}{expert_idx}" - fsdp_dtensor_state_dict[expert_key] = expert_weight.squeeze( - 0 - ) + tp_size = value.shape[-2] + rows, cols = orig_shape + # Reshape to split column dimension by tp_size + value = value.reshape(*value.shape[:-1], rows, cols // tp_size) + dims = list(range(value.ndim)) + dims[-3], dims[-2] = dims[-2], dims[-3] + value = ( + value.permute(*dims) + .reshape(orig_shape) + .redistribute(placements=[Shard(0)]) + ) else: - # General case - fsdp_dtensor_state_dict[layer_key] = per_layer_values[i] - else: - if orig_shape is not None: - _free_up_some_gpu_memory() - value = ( - value.redistribute(placements=[Replicate()]) - .reshape(orig_shape) - .redistribute(placements=[Shard(0)]) - ) - fsdp_dtensor_state_dict[new_key] = value + value = value.reshape(orig_shape).redistribute(placements=[Shard(0)]) + split_tensors = {new_key: value} + + # Handle SWiGLU weights + for key, value in list(split_tensors.items()): + if swiglu and is_swiglu_key(key): + swiglu_w_and_v = split_swiglu_weight(key, value) + split_tensors.update(swiglu_w_and_v) + del split_tensors[key] + + fsdp_dtensor_state_dict.update(split_tensors) + if is_param and key in param_to_param_group_map: + for new_key in split_tensors.keys(): + param_to_param_group_map[new_key] = param_to_param_group_map[key] elif key.startswith("rng_state"): # Skip RNG states continue @@ -530,6 +624,15 @@ def _free_up_some_gpu_memory(): ) ) common_state = common_strategy.load_common(input_dir) + try: + if "param_groups" in common_state["optimizer"]: + ckpt_param_groups = common_state["optimizer"]["param_groups"] + else: + ckpt_param_groups = [] + for opt_state_dict in common_state["optimizer"].values(): + ckpt_param_groups.extend(opt_state_dict["optimizer"]["param_groups"]) + except: + ckpt_param_groups = None common_state = flatten(common_state) for key, value in common_state.items(): if key.startswith("optimizer.optimizer.param_groups."): @@ -541,12 +644,29 @@ def _free_up_some_gpu_memory(): ) fsdp_dtensor_state_dict[key] = value + # set up per-parameter param_groups + if param_to_param_group_map and ckpt_param_groups is not None: + for name in list(fsdp_dtensor_state_dict.keys()): + if not name.startswith(model_weight_prefix) or name.endswith(".expert_bias"): + continue + + assert name in param_to_param_group_map, f"Missing param group for {name}" + param_group_id = param_to_param_group_map[name] + assert param_group_id < len(ckpt_param_groups), f"Invalid param group id {param_group_id} for {name}" + name_without_prefix = name[len(model_weight_prefix):] + fsdp_dtensor_state_dict[ + f"{optimizer_param_to_group_prefix}.{name_without_prefix}" + ] = ckpt_param_groups[param_group_id] + if "checkpoint_version" not in fsdp_dtensor_state_dict: fsdp_dtensor_state_dict["checkpoint_version"] = 3.0 # Save modified checkpoint save_checkpoint_with_pickle_protocol(fsdp_dtensor_state_dict, output_dir) + dist.barrier() # Synchronize all ranks + dist.destroy_process_group() + @cli.command() @click.argument("input_dir", type=click.Path(exists=True)) @@ -560,12 +680,6 @@ def _free_up_some_gpu_memory(): "--oom-traceback", is_flag=True, help="Enable OOM traceback for debugging." ) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -@click.option( - "--distributed-timeout-minutes", - default=10, - type=int, - help="Timeout for distributed operations in minutes.", -) @click.option( "--output-optimizer-state-prefix", default="optimizer.state.module.module.module", @@ -576,15 +690,21 @@ def _free_up_some_gpu_memory(): default="model.module", help="Prefix for model weight keys in the checkpoint.", ) +@click.option( + "--param-to-param-group-map-json", + type=str, + default="{}", + help="JSON string representing the param to parameter group map." +) def convert_torch_dist_to_fsdp_dtensor( input_dir, output_dir, swiglu, oom_traceback, enable_msc, - distributed_timeout_minutes, output_optimizer_state_prefix, output_model_weight_prefix, + param_to_param_group_map_json, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to fsdp_dtensor format.""" if not enable_msc: @@ -624,10 +744,13 @@ def oom_observer(device, alloc, device_alloc, device_free): ckpt_path = Path(input_dir) output_dir = Path(output_dir) + with open(param_to_param_group_map_json, "r") as f: + param_to_param_group_map = json.load(f) convert_checkpoint( ckpt_path, output_dir, swiglu, process_group=dist.group.WORLD, optimizer_state_prefix=output_optimizer_state_prefix, model_weight_prefix=output_model_weight_prefix, + param_to_param_group_map=param_to_param_group_map, ) click.echo( @@ -742,6 +865,109 @@ def modify_state_dict(input_dir, output_dir, op, enable_msc): ) +def _compare_two_checkpoint(checkpoint_1, checkpoint_2): + reader_1 = FileSystemReader(checkpoint_1) + metadata_1 = reader_1.read_metadata() + + reader_2 = FileSystemReader(checkpoint_2) + metadata_2 = reader_2.read_metadata() + + keys_1 = set(metadata_1.state_dict_metadata.keys()) + keys_2 = set(metadata_2.state_dict_metadata.keys()) + + click.echo(click.style("Comparing checkpoints...", fg="blue")) + + # Compare keys + missing_in_1 = keys_2 - keys_1 + missing_in_2 = keys_1 - keys_2 + common_keys = keys_1 & keys_2 + + click.echo(click.style("Keys missing in checkpoint 1:", fg="red")) + for key in missing_in_1: + click.echo(click.style(f" - {key}", fg="red")) + + click.echo(click.style("Keys missing in checkpoint 2:", fg="red")) + for key in missing_in_2: + click.echo(click.style(f" - {key}", fg="red")) + + # Compare common keys + click.echo(click.style("Common keys in both checkpoints:", fg="green")) + for key in common_keys: + meta_1 = metadata_1.state_dict_metadata[key] + meta_2 = metadata_2.state_dict_metadata[key] + + if not isinstance(meta_1, TensorStorageMetadata): + continue + + if meta_1.size != meta_2.size or meta_1.properties.dtype != meta_2.properties.dtype: + click.echo(click.style(f" - {key} (metadata differ) meta_1: {meta_1}, meta_2: {meta_2}", fg="red")) + else: + value_1 = torch.empty(meta_1.size, dtype=meta_1.properties.dtype) + value_2 = value_1.clone() + + dcp.load({key: value_1}, storage_reader=reader_1, planner=DefaultLoadPlanner()) + dcp.load({key: value_2}, storage_reader=reader_2, planner=DefaultLoadPlanner()) + + if not torch.allclose( + value_1, value_2, atol=1e-8, rtol=1e-5 + ): + click.echo(click.style(f" - {key} (values differ) value_1: {value_1}, value_2: {value_2}", fg="red")) + + +@cli.command() +@click.argument("checkpoint_1", type=click.Path(exists=True)) +@click.argument("checkpoint_2", type=click.Path(exists=True)) +@click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") +def compare_two_checkpoint(checkpoint_1, checkpoint_2, enable_msc): + """ + Compare two checkpoints. + """ + init_process_group(f"compare_two_checkpoint from {checkpoint_1} to {checkpoint_2}") + + if not enable_msc: + MultiStorageClientFeature.disable() + + _compare_two_checkpoint( + Path(checkpoint_1), + Path(checkpoint_2), + ) + + click.echo( + click.style( + f"Comparison between {checkpoint_1} and {checkpoint_2} completed.", fg="green", bold=True + ) + ) + + +@cli.command() +@click.argument("torch_dcp_dir", type=click.Path(exists=True)) +def print_torch_dcp_in_json(torch_dcp_dir, model_weight_prefix="model.module"): + # Use a temporary file context + with tempfile.NamedTemporaryFile(suffix=".pth") as tmp_file: + # Convert distributed checkpoint directory to a single-file checkpoint + dcp_to_torch_save(torch_dcp_dir, tmp_file.name) + + # Load the state dict from the temporary file + state_dict = torch.load(tmp_file.name, map_location="cpu") + + click.echo(f"torch dcp content: {json.dumps(state_dict)}") + + # Replace all "module.module." with model_weight_prefix in dict keys + new_state_dict = {} + for key, value in state_dict.items(): + new_key = key.replace("module.module", model_weight_prefix) + new_state_dict[new_key] = value + + # Convert state dict to JSON-serializable format + serializable_dict = {k: v.tolist() if hasattr(v, "tolist") else v for k, v in new_state_dict.items()} + + # Save to a JSON file + json_file_path = os.path.join(torch_dcp_dir, "param_to_param_group_map.json") + with open(json_file_path, "w") as json_file: + json.dump(serializable_dict, json_file, indent=2) + click.echo(f"Saved converted param_to_param_group_map to: {json_file_path}") + + def init_process_group(message): rank = int(os.getenv("RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) From 13edb58560d083ef7ce5d42b90adda3bd9b53306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 28 Oct 2025 12:02:51 +0000 Subject: [PATCH 081/248] Revert "cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` into `dev` (#1987)" This reverts commit cc33e0056b00ee67455fadfb6710e4dbde9e1c33. --- .../distributed/fsdp/mcore_fsdp_adapter.py | 133 +--- megatron/core/distributed/fsdp/src/README.md | 11 - .../fsdp/src/megatron_fsdp/fully_shard.py | 10 +- .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 11 +- .../megatron_fsdp/param_and_grad_buffer.py | 83 +-- .../fsdp/src/megatron_fsdp/uneven_dtensor.py | 4 +- .../fsdp/src/megatron_fsdp/utils.py | 130 +--- .../embeddings/yarn_rotary_pos_embedding.py | 10 +- megatron/core/optimizer/__init__.py | 23 - megatron/core/optimizer/distrib_optimizer.py | 2 - .../transformer/fsdp_dtensor_checkpoint.py | 336 ++-------- megatron/training/arguments.py | 4 - megatron/training/checkpointing.py | 74 +-- megatron/training/training.py | 1 - .../golden_values_dev_dgxh100_coreweave.json | 598 +++++++++--------- .../golden_values_dev_dgxh100_coreweave.json | 500 +++++++-------- .../golden_values_dev_dgx_h100.json | 143 +---- .../golden_values_dev_dgxh100_coreweave.json | 537 ---------------- .../model_config.yaml | 2 +- tests/test_utils/recipes/moe.yaml | 15 +- tools/checkpoint/checkpoint_inspector.py | 362 ++--------- 21 files changed, 765 insertions(+), 2224 deletions(-) delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index 7432a7f9a36..a7c0d5802ab 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -import random from typing import List, Optional try: @@ -23,7 +22,6 @@ except ImportError: HAVE_EINOPS = False -import numpy as np import torch import torch.distributed as dist @@ -34,11 +32,10 @@ except ImportError: HAVE_DTENSOR = False -from megatron.core import parallel_state, tensor_parallel +from megatron.core import parallel_state from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.distributed.data_parallel_base import _BaseDataParallel from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig -from megatron.core.extensions.transformer_engine import TELinear from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer @@ -98,8 +95,6 @@ def __init__( else: self.fsdp_unit_modules = [] - self._fix_tensor_parallel_attributes(module) - super().__init__( config=config, module=MegatronFSDP( @@ -124,8 +119,6 @@ def __init__( self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict - self.sync_rng_states_across_tp_group() - def load_state_dict(self, state_dict, strict=True): """ Load the state dictionary into the module. @@ -148,44 +141,6 @@ def load_state_dict(self, state_dict, strict=True): self.module.load_state_dict(custom_state_dict, strict=strict) - def _fix_tensor_parallel_attributes(self, module): - is_expert_param = lambda n, p: ".experts." in n - is_router_param = lambda n, p: ".router.weight" in n - - if parallel_state.get_tensor_model_parallel_group(): - tp_size = parallel_state.get_tensor_model_parallel_group().size() - else: - tp_size = 1 - - if parallel_state.get_expert_tensor_parallel_group(): - expt_tp_size = parallel_state.get_expert_tensor_parallel_group().size() - else: - expt_tp_size = 1 - - param_to_direct_module = {} - for name, m in module.named_modules(): - for p in m.parameters(recurse=False): - param_to_direct_module[p] = (name, m) - - for name, param in module.named_parameters(): - if is_expert_param(name, param) and expt_tp_size > 1: - setattr(param, "_mcore_tp", True) - if "linear_fc1.weight" in name: - setattr(param, "_tp_partition_dim", 0) - elif "linear_fc2.weight" in name: - setattr(param, "_tp_partition_dim", 1) - - if not is_expert_param(name, param) and tp_size > 1: - m_name, direct_module = param_to_direct_module[param] - if isinstance(direct_module, (TELinear,)): - parallel_mode = getattr(direct_module, "parallel_mode", None) - if parallel_mode is None: - setattr(param, "_mcore_tp", True) - setattr(param, "_tp_duplicated", True) - elif is_router_param(name, param): - setattr(param, "_mcore_tp", True) - setattr(param, "_tp_duplicated", True) - def _init_dist_index(self, pg_collection): """ Initialize the distributed index for the module. @@ -199,7 +154,6 @@ def _init_dist_index(self, pg_collection): enable_hsdp = self.ddp_config.num_distributed_optimizer_instances > 1 if pg_collection is None: tp_group = parallel_state.get_tensor_model_parallel_group() - expt_tp_group = parallel_state.get_expert_tensor_parallel_group() if enable_hsdp: dp_cp_group = parallel_state.get_data_parallel_group( with_context_parallel=True, partial_data_parallel=True @@ -214,11 +168,8 @@ def _init_dist_index(self, pg_collection): ) outer_fsdp_group = None hybrid_fsdp_group = None - expt_dp_group = parallel_state.get_expert_data_parallel_group() - ep_group = parallel_state.get_expert_model_parallel_group() else: tp_group = getattr(pg_collection, 'tp', None) - expt_tp_group = getattr(pg_collection, 'expt_tp', None) if enable_hsdp: dp_cp_group = pg_collection.intra_dp_cp outer_fsdp_group = pg_collection.inter_dist_opt @@ -227,17 +178,11 @@ def _init_dist_index(self, pg_collection): dp_cp_group = pg_collection.dp_cp outer_fsdp_group = None hybrid_fsdp_group = None - expt_dp_group = getattr(pg_collection, 'expt_dp', None) - ep_group = getattr(pg_collection, 'ep', None) if tp_group is None: single_rank_group = dist.new_group(ranks=[dist.get_rank()]) tp_group = single_rank_group - if expt_tp_group is None: - single_rank_group = dist.new_group(ranks=[dist.get_rank()]) - expt_tp_group = single_rank_group - if enable_hsdp: mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( @@ -254,17 +199,6 @@ def _init_dist_index(self, pg_collection): hybrid_fsdp_group=hybrid_fsdp_group, ) else: - if ep_group is not None: - expt_mesh = _get_dp_tp_mesh(expt_dp_group, expt_tp_group, ep_size=ep_group.size()) - expt_device_mesh = DeviceMesh.from_group( - [expt_dp_group, expt_tp_group], - device_type="cuda", - mesh=expt_mesh.tolist(), - mesh_dim_names=["dp_cp", "tp"], - ) - else: - expt_device_mesh = None - mesh = _get_dp_tp_mesh(dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( device_mesh=DeviceMesh.from_group( @@ -275,11 +209,8 @@ def _init_dist_index(self, pg_collection): ), dp_shard_dim="dp_cp", tp_dim="tp", - expt_device_mesh=expt_device_mesh, ) - self.tp_group = tp_group - return dist_index def stop_communication(self): @@ -289,20 +220,6 @@ def stop_communication(self): self.module.synchronize_gradient_reduce() self.module.synchronize_param_gather() - def sync_rng_states_across_tp_group(self): - """ - Synchronize the tensor parallel random number generator states. - """ - if self.tp_group.size() <= 1: - return - - if self.tp_group.rank() == 0: - broadcast_list = [_get_rng_state_dict()] - else: - broadcast_list = [None] - torch.distributed.broadcast_object_list(broadcast_list, group=self.tp_group, group_src=0) - _load_rng_state_dict(broadcast_list[0]) - def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." @@ -356,46 +273,29 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): return mesh -def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1): +def _get_dp_tp_mesh(dp_cp_group, tp_group): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." world_size = dist.get_world_size() tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1 - # TODO: Supports configurable (dp, cp, ep, tp) order. - mesh = einops.rearrange( - torch.arange(world_size), - "(dp_cp ep tp) -> ep dp_cp tp", - dp_cp=dp_cp_group.size(), - tp=tp_size, - ep=ep_size, - ) + # TODO: Supports configurable (dp, cp, tp) order. + mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size) - mesh_dp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (ep tp) dp_cp', dp_cp=dp_cp_group.size()) + mesh_dp_ranks = einops.rearrange(mesh, 'dp_cp tp -> tp dp_cp', tp=tp_size) dp_cp_group_ranks = dist.get_process_group_ranks(dp_cp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_dp_ranks, dp_cp_group_ranks), ( f"[Megatron-FSDP] Data Parallel ranks in the mesh {mesh_dp_ranks} " f"do not match the ranks in the DP group {dp_cp_group_ranks}." ) - mesh_tp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (dp_cp ep) tp', tp=tp_size) + mesh_tp_ranks = einops.rearrange(mesh, 'dp_cp tp -> (dp_cp) tp', tp=tp_size) tp_group_ranks = dist.get_process_group_ranks(tp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), ( f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} " f"do not match the ranks in the TP group {tp_group_ranks}." ) - # Exclude the expert parallel dimension - rank = dist.get_rank() - dp_tp_meshes = [per_ep_mesh for per_ep_mesh in mesh if rank in per_ep_mesh.reshape(-1).tolist()] - assert ( - len(dp_tp_meshes) == 1 - ), f"[Megatron-FSDP] Current rank {rank} is not unique in the mesh ranks {mesh.tolist()}." - assert len(dp_tp_meshes[0].reshape(-1).tolist()) == dp_cp_group.size() * tp_group.size(), ( - f"[Megatron-FSDP] DP-TP mesh size {len(dp_tp_meshes[0].reshape(-1).tolist())} " - f"does not match expected size {dp_cp_group.size() * tp_group.size()}." - ) - - return dp_tp_meshes[0] + return mesh def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): @@ -410,22 +310,3 @@ def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): f"{mesh_ranks.tolist()} does not match the group ranks {group_ranks}." ) return sorted(current_ranks[0]) == sorted(group_ranks) - - -def _get_rng_state_dict(): - rng_state_dict = { - 'random_rng_state': random.getstate(), - 'np_rng_state': np.random.get_state(), - 'torch_rng_state': torch.get_rng_state(), - 'cuda_rng_state': torch.cuda.get_rng_state(), - 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(), - } - return rng_state_dict - - -def _load_rng_state_dict(rng_state_dict): - random.setstate(rng_state_dict['random_rng_state']) - np.random.set_state(rng_state_dict['np_rng_state']) - torch.set_rng_state(rng_state_dict['torch_rng_state']) - torch.cuda.set_rng_state(rng_state_dict['cuda_rng_state']) - tensor_parallel.get_cuda_rng_tracker().set_states(rng_state_dict['rng_tracker_states']) diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index 9e036f22f67..d879c6c26f8 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -127,12 +127,6 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp") # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group. device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp") hsdp_group = device_mesh["hsdp"].get_group() -# Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP. -expert_device_mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - mesh_shape=(expt_dp_shard_size, expt_tp_size), - mesh_dim_names=("dp_shard", "tp"), -) # Fully-shards your model and distributes your optimizer. model, optimizer = fully_shard( @@ -151,8 +145,6 @@ model, optimizer = fully_shard( tp_dim="tp", # Only required when using HSDP. Otherwise, set this to None. hybrid_fsdp_group=hsdp_group, - # Only required for FSDP + EP. Otherwise, set this to None. - expt_device_mesh=expt_device_mesh, # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3) zero_dp_strategy=3, outer_dp_sharding_strategy=1, @@ -200,9 +192,6 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"]) - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP. - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053). - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded coordinate system for the weight and gradient buffers. Required for HSDP. -- `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`. - - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP). - - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP. - `init_model_with_meta_device` has `MegatronFSDP` initialize your `meta`-device model in shards on every CUDA device to avoid OOM when initializing extremely large models that cannot fit on a single device. Users can initialize their model on a [`meta`-device](https://docs.pytorch.org/docs/stable/meta.html) (`with torch.device('meta'): ...`), and ``MegatronFSDP`` will further shard and initialize the model parameters layer-by-layer adhering to the customizable `module.reset_parameters` method, which prevents the entire model from being allocated in memory at any point during runtime. - Defaults to `False`. - Note that the `device` argument which installs your model on a specific device or rank will be deactivated when `init_model_with_meta_device=True`. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py index e98362a1a03..24e86cede72 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py @@ -64,7 +64,6 @@ def fully_shard_model( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, - expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -184,10 +183,8 @@ def fully_shard_model( tp_dim=tp_dim, # Only required for HSDP. hybrid_fsdp_group=hybrid_fsdp_group, - # Access to flattened DP rank assignments for HSDP. + # Access to flattened DP rank assignments for HFSDP. hsdp_outer_dp_shard=_outer_fsdp_sharding, - # Only required for Megatron-FSDP + EP. - expt_device_mesh=expt_device_mesh, ) # Wrap model in Megatron FSDP. @@ -333,7 +330,6 @@ def fully_shard( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, - expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -395,9 +391,6 @@ def fully_shard( by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None. - expt_device_mesh (Optional[DeviceMesh]): - Expert parallel device mesh object defining the topology for MoE distributed training. - fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]): List of (sub-)module classes or (sub-)module class import paths that are "units", which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP. @@ -510,7 +503,6 @@ def fully_shard( dp_outer_dim=dp_outer_dim, tp_dim=tp_dim, hybrid_fsdp_group=hybrid_fsdp_group, - expt_device_mesh=expt_device_mesh, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=zero_dp_strategy, outer_dp_sharding_strategy=outer_dp_sharding_strategy, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index d6ef5f6210e..10a8ae14d65 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -235,10 +235,7 @@ def __init__( self.dist_index = dist_index # If Megatron Expert Parallelism is enabled, you need to provide an expt_dp_group. - if ( - has_expert_parameters - and self.dist_index.get_fsdp_group(is_expert_parallel=True) is None - ): + if has_expert_parameters and self.dist_index.get_expert_dp_group() is None: raise ValueError( "[Megatron-FSDP] Megatron Expert Parallelism is enabled, but no expt_dp_group is" "provided." @@ -356,7 +353,9 @@ def _init_fsdp_param_and_grad_buffer(self): ) # Set the suggested communication unit size for reduce-scatter and all-gather pipelines. - suggested_communication_unit_size = self.ddp_config.suggested_communication_unit_size + suggested_communication_unit_size = ( + self.ddp_config.suggested_communication_unit_size or 1_000_000_000 + ) if suggested_communication_unit_size is None: if self.data_parallel_sharding_strategy == "optim_grads_params": total_param_elements = 0 @@ -371,8 +370,6 @@ def _init_fsdp_param_and_grad_buffer(self): suggested_communication_unit_size = total_param_elements // total_fsdp_module * 2 elif self.bucket_size is not None: suggested_communication_unit_size = self.bucket_size - else: - suggested_communication_unit_size = 1_000_000_000 # Cap to 1B elements. suggested_communication_unit_size = max( diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index bdf480d867b..c8116150d52 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -34,14 +34,7 @@ from torch.distributed.tensor.device_mesh import _mesh_resources from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor -from .utils import ( - _MODEL_PARALLEL_RNG_TRACKER_NAME, - FSDPDistributedIndex, - get_global_memory_buffer, - get_mcore_tensor_parallel_partition_dim, - is_mcore_tensor_model_parallel, - is_mcore_tensor_parallel_duplicated, -) +from .utils import _MODEL_PARALLEL_RNG_TRACKER_NAME, FSDPDistributedIndex, get_global_memory_buffer logger = logging.getLogger(__name__) @@ -1306,7 +1299,7 @@ def _does_param_require_new_bucket(param): and policy.data_parallel_sharding_strategy != "no_shard" ) - is_expert_parameter = lambda n, p: ".experts." in n + is_expert_parameter = lambda p: not getattr(p, "allreduce", True) # Step 1: Group the parameters according to their execution order and attributes. # FSDP unit module parameters are split into multiple parameter sub-groups. @@ -1320,7 +1313,7 @@ def _does_param_require_new_bucket(param): if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) else param.dtype ), - is_expert_param=is_expert_parameter(name, param), + is_expert_param=is_expert_parameter(param), requires_grad=param.requires_grad, fsdp_unit_id=None, ) @@ -2264,10 +2257,6 @@ def _reset_parameters(self, old_params, new_params): self.param_to_direct_module[new_param] = self.param_to_direct_module[old_param] del self.param_to_direct_module[old_param] - for tp_attr in ["_mcore_tp", "_tp_partition_dim", "_tp_duplicated"]: - if getattr(old_param, tp_attr, None) is not None: - setattr(new_param, tp_attr, getattr(old_param, tp_attr)) - for item_id, p in enumerate(self.params): if p in param_map: new_p = param_map[p] @@ -2351,7 +2340,6 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, - force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param elif wbuf: @@ -2363,7 +2351,6 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, - force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param else: @@ -2378,7 +2365,6 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=False, - force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param @@ -2413,9 +2399,6 @@ def set_param_attribute(): "partition_dim", "partition_stride", "is_embedding_or_output_parameter", - "_mcore_tp", - "_tp_duplicated", - "_tp_partition_dim", ]: if hasattr(orig_param, attr_name): setattr(param, attr_name, getattr(orig_param, attr_name)) @@ -3563,9 +3546,7 @@ def to_local_if_dtensor(tensor): return tensor -def _get_fsdp_tensor_spec( - param, dist_index: FSDPDistributedIndex, is_sharded_param, is_expert_param -): +def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_param): """ Get the DeviceMesh for the parameter and modify the placement for Megatron-FSDP. """ @@ -3576,7 +3557,7 @@ def _get_fsdp_tensor_spec( dtensor_mesh = getattr(dtensor_spec, "mesh", None) # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh. - megatron_fsdp_global_mesh = dist_index.get_root_mesh(is_expert_parallel=is_expert_param) + megatron_fsdp_global_mesh = dist_index.get_root_mesh() dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh) # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh: # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh @@ -3621,7 +3602,7 @@ def _get_fsdp_tensor_spec( placements = [Shard(0), dtensor_placement] shard_order = [1, 0] - device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) + device_mesh = dist_index.get_submesh(mesh_dim_names) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3646,7 +3627,7 @@ def _get_fsdp_tensor_spec( else: placements = [Shard(0)] - device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) + device_mesh = dist_index.get_submesh(mesh_dim_names) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3661,7 +3642,6 @@ def make_fsdp_dtensor( is_expert_param: bool = False, run_check: bool = False, update_uneven_dtensor_chunk_meta: bool = False, - force_sync_tp_duplicated_param: bool = False, ): """ Creates a distributed tensor (DTensor) from a local tensor with support for @@ -3740,39 +3720,38 @@ def make_fsdp_dtensor( orig_param = param # Handle tensor model parallel specific logic - if is_mcore_tensor_model_parallel(param): + if getattr(param, "tensor_model_parallel", False): # Ensure parameter is not already a DTensor assert not isinstance(param, DTensor), ( - "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " "is True." + "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " + "is True. Check usage." ) - tp_mesh = dist_index.get_submesh(dist_index.tp_dim, is_expert_parallel=is_expert_param) - global_shape = list(param.shape) + # Validate M-Core TP attributes + assert hasattr( + param, "partition_dim" + ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_dim'." + assert hasattr( + param, "partition_stride" + ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_stride'." + assert ( + param.partition_stride == 1 + ), "[Megatron-FSDP] Only partition_stride=1 is currently supported for " + "tensor_model_parallel." + + tp_dim = param.partition_dim + tp_mesh = dist_index.get_submesh(dist_index.tp_dim) + + # Adjust shape for global dimension if tp_mesh.mesh.numel() > 1: - if is_mcore_tensor_parallel_duplicated(param): - placements = [Replicate()] - if force_sync_tp_duplicated_param: - if local_tensor.numel() > 0: - torch.distributed.broadcast( - local_tensor, group=tp_mesh.get_group(), group_src=0 - ) - elif run_check: - # TODO: Implement consistency check for duplicated TP parameters - pass - else: - tp_dim = get_mcore_tensor_parallel_partition_dim(param) - assert tp_dim is not None, ( - "[Megatron-FSDP] Parameter is not tensor model parallel, " - "yet tensor_model_parallel is True." - ) - placements = [Shard(tp_dim)] - global_shape[tp_dim] *= tp_mesh.mesh.numel() + global_shape = list(param.shape) + global_shape[tp_dim] *= tp_mesh.mesh.numel() # Construct TP-sharded DTensor using Megatron-style placement param = DTensor.from_local( - local_tensor=local_tensor, + local_tensor=param, device_mesh=tp_mesh, - placements=placements, + placements=[Shard(tp_dim)], run_check=run_check, shape=global_shape, stride=torch.empty(global_shape).stride(), @@ -3780,7 +3759,7 @@ def make_fsdp_dtensor( # Get FSDP-configured mesh and placements from provided param device_mesh, placements = _get_fsdp_tensor_spec( - param, dist_index, is_sharded_param=is_sharded_param, is_expert_param=is_expert_param + param, dist_index, is_sharded_param=is_sharded_param ) # Reshape local tensor for sharded layouts beyond 1D diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py index 490d80c0f21..523d8fae333 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py @@ -365,9 +365,7 @@ def _assemble_full_tensor_from_uneven_chunks( # Wrap into a replicated DTensor and return return DTensor.from_local( - full_tensor, - placements=[Replicate()] * len(dtensor.placements), - device_mesh=dtensor.device_mesh, + full_tensor, placements=[Replicate()], device_mesh=dtensor.device_mesh ) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index b94a332bb0d..1dfe08b90f4 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -675,7 +675,6 @@ def __init__( tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, hsdp_outer_dp_shard: bool = False, - expt_device_mesh: Optional[DeviceMesh] = None, ): """ Args: @@ -692,8 +691,6 @@ def __init__( in hybrid FSDP. Specifying outer sharding will lift the bucket sharding coordinate system to flattened ranks of (dp_shard, dp_outer) instead of just sharding across dp_shard ranks and replicating across dp_outer ranks. - expt_device_mesh (Optional[DeviceMesh]): The expert parallel device mesh - to use for the DistributedIndex. """ # Device mesh arguments. self.device_mesh = device_mesh @@ -704,11 +701,6 @@ def __init__( self.use_hybrid_fsdp = dp_outer_dim is not None # Helper flag to denote if we are outer-sharding in hybrid FSDP. self.hsdp_outer_dp_shard = hsdp_outer_dp_shard - self.expt_device_mesh = expt_device_mesh - - # Handling the situation where M-Core MoE EP=1 - if self.expt_device_mesh is None: - self.expt_device_mesh = device_mesh # Hybrid FSDP Process Groups # Retrieve the FSDP process group from the DeviceMesh. @@ -727,14 +719,6 @@ def __init__( # combination of the outer-FSDP and FSDP process groups. self.hybrid_fsdp_group = hybrid_fsdp_group - # Retrieve the expert parallel process groups from the DeviceMesh. - self.expt_fsdp_group = ( - self.expt_device_mesh[self.dp_shard_dim].get_group() - if self.expt_device_mesh is not None - and contains_submesh(self.expt_device_mesh, self.dp_shard_dim) - else None - ) - """ Store a persistent reference to the core device meshes that back Megatron-FSDP. This is necessary because _MeshEnv (_mesh_resources) may not persist: @@ -748,33 +732,26 @@ def __init__( FIXME(@cspades): Identify the root cause of this behavior. """ self.mesh_library = {} - - def register_submesh(device_mesh, submesh, is_expert_parallel): - """Register a submesh with identifier: (*submesh, is_expert_parallel) - in the mesh library.""" - if contains_submesh(device_mesh, submesh): - submesh_identifier = tuple(list(submesh) + [is_expert_parallel]) - self.mesh_library[submesh_identifier] = device_mesh[submesh] - - # Define common submesh patterns + # TP Mesh tp_submesh = (self.tp_dim,) + if contains_submesh(self.device_mesh, tp_submesh): + self.mesh_library[tp_submesh] = self.device_mesh[tp_submesh] + # HSDP-TP Mesh hsdp_tp_submesh = (self.dp_outer_dim, self.dp_shard_dim, self.tp_dim) + if contains_submesh(self.device_mesh, hsdp_tp_submesh): + self.mesh_library[hsdp_tp_submesh] = self.device_mesh[hsdp_tp_submesh] + # FSDP-TP Mesh fsdp_tp_submesh = (self.dp_shard_dim, self.tp_dim) + if contains_submesh(self.device_mesh, fsdp_tp_submesh): + self.mesh_library[fsdp_tp_submesh] = self.device_mesh[fsdp_tp_submesh] + # HSDP Mesh hsdp_submesh = (self.dp_outer_dim, self.dp_shard_dim) + if contains_submesh(self.device_mesh, hsdp_submesh): + self.mesh_library[hsdp_submesh] = self.device_mesh[hsdp_submesh] + # FSDP Mesh fsdp_submesh = (self.dp_shard_dim,) - - # Register non-EP submeshes - register_submesh(self.device_mesh, tp_submesh, False) - register_submesh(self.device_mesh, hsdp_tp_submesh, False) - register_submesh(self.device_mesh, fsdp_tp_submesh, False) - register_submesh(self.device_mesh, hsdp_submesh, False) - register_submesh(self.device_mesh, fsdp_submesh, False) - - # Register EP submeshes - if self.expt_device_mesh is not None: - register_submesh(self.expt_device_mesh, tp_submesh, True) - register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True) - register_submesh(self.expt_device_mesh, fsdp_submesh, True) + if contains_submesh(self.device_mesh, fsdp_submesh): + self.mesh_library[fsdp_submesh] = self.device_mesh[fsdp_submesh] # Validate FSDP arguments. if self.fsdp_group is None: @@ -799,54 +776,36 @@ def register_submesh(device_mesh, submesh, is_expert_parallel): "process groups or sub-meshes." ) - def get_submesh( - self, mesh_dim_names: str | Sequence[str], is_expert_parallel: bool = False - ) -> DeviceMesh: + def get_submesh(self, mesh_dim_names: str | Sequence[str]) -> DeviceMesh: """ - Retrieve an Megatron-FSDP-registered submesh by name(s). + Retrieve an Megatron-FSDP-registered sub-mesh by name(s). """ if isinstance(mesh_dim_names, str): mesh_dim_names = (mesh_dim_names,) - - # Construct submesh identifier: (*mesh_dim_names, is_expert_parallel) - submesh_identifier = tuple(list(mesh_dim_names) + [is_expert_parallel]) - - # Retrieve the submesh from the mesh library - device_submesh = self.mesh_library.get(submesh_identifier, None) - + # Search for the sub-mesh in the mesh library. + device_submesh = self.mesh_library.get(tuple(mesh_dim_names), None) if device_submesh is None: - # Warn about not specifying tp_dim for layers or frameworks that depend on this. - if self.tp_dim is None and not is_expert_parallel: + if self.tp_dim is None: + # Warn about not specifying tp_dim for + # layers or frameworks that depend on this. logger.warning( - "[FSDPDistributedIndex] Note: For TransformerEngine, or " - "other machine learning frameworks like Megatron that assume " - "TP=1, you must specify tp_dim to use Megatron-FSDP. " - "Create a trivial TP dimension by setting the TP dimension size " + "[FSDPDistributedIndex] Note: For TransformerEngine, or other machine learning " + "frameworks like Megatron that assume TP=1, you must specify tp_dim to use " + "Megatron-FSDP. Create a trivial TP dimension by setting the TP dimension size " "to 1 in the DeviceMesh.\n" f"DeviceMesh: {self.device_mesh}" ) - elif self.tp_dim is None and is_expert_parallel: - logger.warning( - "[FSDPDistributedIndex] Note: For TransformerEngine, or " - "other machine learning frameworks like Megatron that assume " - "ETP=1, you must specify tp_dim to use Megatron-FSDP. " - "Create a trivial ETP dimension by setting the ETP dimension size " - "to 1 in the DeviceMesh.\n" - f"DeviceMesh: {self.expt_device_mesh}" - ) - raise ValueError( - f"[FSDPDistributedIndex][get_submesh] No submesh with " - f"mesh_dim_names={mesh_dim_names}, is_expert_parallel={is_expert_parallel} " - f"has been registered with Megatron-FSDP." + f"[FSDPDistributedIndex][get_submesh] No sub-mesh with " + f"mesh_dim_names={mesh_dim_names} has been registered with Megatron-FSDP." ) - return device_submesh def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the data parallel process group.""" if is_expert_parallel: - return self.expt_fsdp_group + # Expert parallel is not supported + return None if self.use_hybrid_fsdp: return self.hybrid_fsdp_group return self.fsdp_group @@ -854,7 +813,8 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: def get_fsdp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the FSDP process group.""" if is_expert_parallel: - return self.expt_fsdp_group + # Expert parallel is not supported + return None return self.fsdp_group def get_outer_fsdp_group(self) -> ProcessGroup: @@ -866,7 +826,7 @@ def get_outer_fsdp_group(self) -> ProcessGroup: def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: """Get the device mesh.""" if is_expert_parallel: - return self.expt_device_mesh + raise NotImplementedError("Expert parallel is not supported in Megatron-FSDP.") return self.device_mesh def get_logical_hybrid_fsdp_rank(self): @@ -964,29 +924,3 @@ def create_updated_function_signature(original_function, **extended_kwargs: dict # Return the updated function signature. return inspect.Signature(params) - - -def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool: - """ - Check if the given parameter is Megatron-Core tensor model parallel. - """ - return getattr(param, "_mcore_tp", False) or getattr(param, "tensor_model_parallel", False) - - -def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool: - """ - Check if the given parameter is Megatron-Core tensor model parallel and duplicated. - """ - return getattr(param, "_tp_duplicated", False) - - -def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Optional[int]: - """ - Get the partition dimension for a Megatron-Core tensor model parallel parameter. - """ - if is_mcore_tensor_model_parallel(param): - if hasattr(param, "_tp_partition_dim"): - return param._tp_partition_dim - else: - return param.partition_dim - return None diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index 455a7757d28..507472f789f 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -130,9 +130,9 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) - self.original_max_position_embeddings, self.correction_range_round_to_int, ) - inv_freq_mask = 1.0 - _yarn_linear_ramp_mask( - low, high, self.dim // 2, device=self.inv_freq_extra.device - ).to(dtype=torch.float32) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( + device=self.inv_freq_extra.device, dtype=torch.float32 + ) inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask seq = ( @@ -211,11 +211,11 @@ def _yarn_find_correction_range( return max(low, 0), min(high, dim - 1) # Clamp values just in case -def _yarn_linear_ramp_mask(min: float, max: float, dim: int, device: torch.device) -> Tensor: +def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: if min == max: max += 0.001 # Prevent singularity - linear_func = (torch.arange(dim, dtype=torch.float32, device=device) - min) / (max - min) + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) ramp_func = torch.clamp(linear_func, 0, 1) return ramp_func diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index c254b2f6882..307538fad22 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -34,7 +34,6 @@ from megatron.core import parallel_state from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.fsdp_dtensor_checkpoint import get_global_unique_param_name from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer from ..transformer.module import MegatronModule @@ -482,7 +481,6 @@ def get_megatron_optimizer( use_gloo_process_groups: bool = True, default_skip_embedding_weight_decay: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, - dump_param_to_param_group_map: Optional[str] = None, ) -> MegatronOptimizer: """Retrieve the Megatron optimizer for model chunks. @@ -504,7 +502,6 @@ def get_megatron_optimizer( This is useful if you do not want embeddings to shrink to zero in training as recommended in https://arxiv.org/abs/2312.16903 pg_collection: Optional unified process group for distributed training. - dump_param_to_param_group_map (Optional[str]): path to dump parameter to param group map. Returns: Instance of MegatronOptimizer. @@ -582,9 +579,6 @@ def get_megatron_optimizer( return ChainedOptimizer(optimizers) - if dump_param_to_param_group_map is not None: - param_to_param_group = {} - param_group_id = 0 for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip( all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags ): @@ -603,12 +597,6 @@ def get_megatron_optimizer( model_chunk.overlap_param_gather_with_optimizer_step = ( overlap_param_gather_with_optimizer_step ) - if dump_param_to_param_group_map is not None: - for param_group in param_groups: - for param in param_group["params"]: - param_name = get_global_unique_param_name(model_chunks, param) - param_to_param_group[param_name] = param_group_id - param_group_id += 1 # Pass Gloo process groups into optimizer only if needed. optimizers.append( @@ -638,12 +626,6 @@ def get_megatron_optimizer( buffer_name='expert_parallel_buffers', default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) - if dump_param_to_param_group_map is not None: - for param_group in moe_param_groups: - for param in param_group["params"]: - param_name = get_global_unique_param_name(model_chunks, param) - param_to_param_group[param_name] = param_group_id - param_group_id += 1 if len(moe_param_groups) > 0: expt_model_parallel_rank = get_pg_rank(expt_tp_pp_group) # Pass Gloo process groups into optimizer only if needed. @@ -666,9 +648,4 @@ def get_megatron_optimizer( ) ) - if dump_param_to_param_group_map is not None: - torch.distributed.checkpoint.save( - state_dict=param_to_param_group, checkpoint_id=dump_param_to_param_group_map - ) - return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 8b4740516e2..2925edcce60 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -47,7 +47,6 @@ from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard -from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys @@ -1153,7 +1152,6 @@ def _param_name(self, param: torch.nn.Parameter) -> str: "Ensure that each model chunk has unique parameter names." ) name_to_param.update(_name_to_param) - name_to_param = handle_experts_in_state_dict(name_to_param) self.param_to_name = {param: name for name, param in name_to_param.items()} assert ( param in self.param_to_name diff --git a/megatron/core/transformer/fsdp_dtensor_checkpoint.py b/megatron/core/transformer/fsdp_dtensor_checkpoint.py index 9ef3f1f1b82..dad1947a183 100644 --- a/megatron/core/transformer/fsdp_dtensor_checkpoint.py +++ b/megatron/core/transformer/fsdp_dtensor_checkpoint.py @@ -12,160 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging -import re - import torch -import torch.distributed as dist -from torch.distributed.checkpoint import default_planner - -logger = logging.getLogger(__name__) try: - from torch.distributed import DeviceMesh - from torch.distributed._tensor import DTensor - from torch.distributed.checkpoint.metadata import TensorStorageMetadata - from torch.distributed.tensor.placement_types import Replicate, Shard - from megatron.core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer import ( make_fsdp_dtensor, ) - from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import ( - gather_uneven_dtensor_to_full_tensor, - ) - from megatron.core.distributed.fsdp.src.megatron_fsdp.utils import ( - get_mcore_tensor_parallel_partition_dim, - is_mcore_tensor_model_parallel, - ) HAVE_MEGATRON_FSDP = True except ImportError: HAVE_MEGATRON_FSDP = False -from megatron.core import parallel_state from megatron.core.tensor_parallel.layers import copy_tensor_model_parallel_attributes -from megatron.core.transformer.transformer_layer import TransformerLayer - - -def get_ep_layer_offset(): - """ - Get the expert layer offset for the current model. - """ - from megatron.training.global_vars import get_args - - args = get_args() - ep_size = parallel_state.get_expert_model_parallel_world_size() - ep_rank = parallel_state.get_expert_model_parallel_rank() - num_local_experts = args.num_experts // ep_size if args.num_experts else 0 - local_expert_offset = ep_rank * num_local_experts - - return local_expert_offset - - -def get_total_num_experts(): - """ - Get the total number of experts for the current model. - """ - from megatron.training.global_vars import get_args - - args = get_args() - return args.num_experts if args.num_experts else 0 - - -def get_expert_index_from_key(key): - """Extract expert index from various expert key formats. - - Supported formats: - - GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' - - SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', - 'mlp.experts.local_experts.0.linear_fc2.weight' - - Returns: - int: Expert index if found, None otherwise. - """ - # GroupedMLP: index is at the end after 'weight' - if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: - m = re.search(r'^.*\.mlp\.experts\.linear_fc\d\.weight(\d+)', key) - assert m, f"Failed to parse expert index from key: {key}" - return int(m.group(1)) - # SequentialMLP: index is between 'local_experts.' and next '.' - elif 'mlp.experts.local_experts' in key: - m = re.search(r'^.*\.mlp\.experts\.local_experts\.(\d+)', key) - assert m, f"Failed to parse expert index from key: {key}" - return int(m.group(1)) - return None - - -def handle_experts_in_state_dict(state_dict): - """ - Rewrite expert keys in state dict. - """ - local_expert_start = get_ep_layer_offset() - local_expert_end = get_total_num_experts() - - def should_keep_expert_key(expert_index): - """Determine if this rank should keep this expert key based on expert index""" - if expert_index is None: - # If we can't determine expert index, keep the key (non-expert weights) - return True - - # Check if this expert belongs to this rank - return local_expert_start <= expert_index < local_expert_end - - def replace_expert_index_in_key(key, expert_index, state_dict): - """Replace expert index in key with new index corresponding to the current rank""" - new_expert_index = expert_index + local_expert_start - # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' - if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: - # Handle SwiGLU weight{idx}_w and weight{idx}_v format - if key.endswith('_w') or key.endswith('_v'): - suffix = key[-2:] # '_w' or '_v' - new_key = key.replace( - f'weight{expert_index}{suffix}', f'weight{new_expert_index}{suffix}' - ) - # Handle regular weight{idx} format - else: - new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') - # SequentialMLP: index is between 'local_experts.' and next '.' - elif 'mlp.experts.local_experts' in key: - new_key = key.replace( - f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' - ) - else: - raise ValueError(f"Unexpected expert key format: {key}") - - state_dict[new_key] = state_dict[key] - del state_dict[key] - - # Process model state dict - state_dict = state_dict.copy() - for key in list(state_dict.keys()): - expert_index = get_expert_index_from_key(key) - if not should_keep_expert_key(expert_index): - replace_expert_index_in_key(key, expert_index, state_dict) - - return state_dict - - -def expert_param_local_key(key): - """Get the module parameter corresponding to the key.""" - local_expert_offset = get_ep_layer_offset() - expert_index = get_expert_index_from_key(key) - if expert_index is not None: - new_expert_index = expert_index - local_expert_offset - # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' - if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: - new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') - # SequentialMLP: index is between 'local_experts.' and next '.' - elif 'mlp.experts.local_experts' in key: - new_key = key.replace( - f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' - ) - else: - raise ValueError(f"Unexpected expert key format: {key}") - key = new_key - - return key def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state_dict): @@ -185,29 +43,7 @@ def intersection(s1, s2): def offset_slice(s, offset): return slice(s.start + offset, s.stop + offset) - def is_swiglu_key(key): - """ - Check if this key should be handled as SwiGLU linear_fc1 weight or bias. - """ - # Non-expert MLP: 'mlp.linear_fc1.weight', 'mlp.linear_fc1.bias' - # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc1.bias0' - # SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', - # 'mlp.experts.local_experts.0.linear_fc1.bias' - return any( - re.search(pat, key) - for pat in [ - r"(.*)\.mlp\.linear_fc1\.weight$", - r"(.*)\.mlp\.linear_fc1\.bias$", - r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)$", - r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)$", - r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight$", - r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias$", - r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight$", - r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias$", - ] - ) - - def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param): + def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): """ Split the SWiGLU linear_fc1 parameter into two parts: weight_w and weight_v. """ @@ -219,9 +55,7 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param fsdp_slice = dist_param.megatron_fsdp_slice megatron_fsdp_dist_index = dist_param.megatron_fsdp_dist_index - tp_mesh = megatron_fsdp_dist_index.get_submesh( - [megatron_fsdp_dist_index.tp_dim], is_expert_parallel=is_expert_param - ) + tp_mesh = megatron_fsdp_dist_index.get_submesh([megatron_fsdp_dist_index.tp_dim]) data_size = data.numel() // tp_mesh.mesh.numel() w_slice = slice(0, data_size // 2) v_slice = slice(data_size // 2, data_size) @@ -241,9 +75,8 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param # Fake parameters w and v are used to provide the correct parameter # shape and Tensor-Parallelism information. per_tp_rank_shape = list(data.shape) - if is_mcore_tensor_model_parallel(dist_param): - tp_dim = get_mcore_tensor_parallel_partition_dim(dist_param) - assert tp_dim is not None, "Tensor model parallel dimension not found" + if getattr(dist_param, "tensor_model_parallel", False): + tp_dim = dist_param.partition_dim per_tp_rank_shape[tp_dim] //= tp_mesh.mesh.numel() linear_fc1_meta = torch.empty(*per_tp_rank_shape, device="meta") w_meta, v_meta = torch.chunk(linear_fc1_meta, 2, dim=swiglu_shard_axis) @@ -254,7 +87,6 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param weight_w.data, w_meta, dist_index=megatron_fsdp_dist_index, - is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) @@ -262,21 +94,16 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param weight_v.data, v_meta, dist_index=megatron_fsdp_dist_index, - is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) return weight_w, weight_v - model_state_dict = model_state_dict.copy() for key in list(model_state_dict.keys()): - if is_swiglu_key(key): + if key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias'): dist_param = model.get_parameter(f"module.{key}") weight_w, weight_v = split_swiglu_linear_fc1( - model_state_dict[key], - dist_param, - swiglu_shard_axis=0, - is_expert_param='mlp.experts' in key, + model_state_dict[key], dist_param, swiglu_shard_axis=0 ) # Update the model state dict with the new keys @@ -284,32 +111,26 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param model_state_dict[f"{key}_v"] = weight_v del model_state_dict[key] - if optimizer_state_dict is not None: - optimizer_state_dict = optimizer_state_dict.copy() - if len(optimizer_state_dict["state"]) != 0: - opt_state_dict = optimizer_state_dict["state"] - new_opt_state_dict = {} - for key in list(opt_state_dict.keys()): - # Only process SWIGLU keys - if not is_swiglu_key(key): - new_opt_state_dict[key] = opt_state_dict[key] - continue - new_opt_state_dict[f"{key}_w"] = opt_state_dict[key].copy() - new_opt_state_dict[f"{key}_v"] = opt_state_dict[key].copy() - for subkey in ["exp_avg", "exp_avg_sq"]: - dist_param = model.get_parameter(expert_param_local_key(key[len("module.") :])) - weight_w, weight_v = split_swiglu_linear_fc1( - opt_state_dict[key][subkey], - dist_param, - swiglu_shard_axis=0, - is_expert_param="mlp.experts" in key, - ) - # Update the optimizer state dict with the new keys - new_opt_state_dict[f"{key}_w"][subkey] = weight_w - new_opt_state_dict[f"{key}_v"][subkey] = weight_v - optimizer_state_dict["state"] = new_opt_state_dict + try: + optimizer_state_dict = optimizer_state_dict["state"] + except KeyError: + optimizer_state_dict = {} - return model_state_dict, optimizer_state_dict + if len(optimizer_state_dict) != 0: + for key in list(optimizer_state_dict.keys()): + if not (key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias')): + continue + optimizer_state_dict[f"{key}_w"] = optimizer_state_dict[key].copy() + optimizer_state_dict[f"{key}_v"] = optimizer_state_dict[key].copy() + for subkey in ["exp_avg", "exp_avg_sq"]: + dist_param = model.get_parameter(key[len("module.") :]) + weight_w, weight_v = split_swiglu_linear_fc1( + optimizer_state_dict[key][subkey], dist_param, swiglu_shard_axis=0 + ) + # Update the optimizer state dict with the new keys + optimizer_state_dict[f"{key}_w"][subkey] = weight_w + optimizer_state_dict[f"{key}_v"][subkey] = weight_v + del optimizer_state_dict[key] def handle_fp8_extra_state_case(model_state_dict): @@ -341,7 +162,7 @@ def flatten_state_dict(obj, parent_key="", sep="."): return items -def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limit=100): +def print_diff_in_state_dicts(state_dict_metadata, load_state_dict): """ Print the differences between two state dicts: metadata state dict and load state dict. This function compares the keys and shapes of the tensors in both dicts. @@ -351,105 +172,24 @@ def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limit=100): meta_keys = set(state_dict_metadata.keys()) load_keys = set(load_state_dict.keys()) - only_in_meta = list(meta_keys - load_keys) - only_in_load = list(load_keys - meta_keys) - in_both = list(meta_keys & load_keys) + only_in_meta = meta_keys - load_keys + only_in_load = load_keys - meta_keys + in_both = meta_keys & load_keys - logger.info(f"Keys only in checkpoint metadata_state_dict(first {limit}):") - for k in sorted(only_in_meta[:limit]): - logger.info(f" {k}") + print("Keys only in checkpoint metadata_state_dict:") + for k in sorted(only_in_meta): + print(f" {k}") - logger.info(f"\nKeys only in load_state_dict(first {limit}):") - for k in sorted(only_in_load[:limit]): - logger.info(f" {k}") + print("\nKeys only in load_state_dict:") + for k in sorted(only_in_load): + print(f" {k}") - logger.info(f"\nKeys in both but with different shapes(first {limit}):") - for k in sorted(in_both[:limit]): + print("\nKeys in both but with different shapes:") + for k in sorted(in_both): v_meta = state_dict_metadata[k] v_load = load_state_dict[k] # If tensors, compare shape; else, compare type/values meta_shape = v_meta.size if hasattr(v_meta, "size") else type(v_meta) load_shape = v_load.shape if hasattr(v_load, "shape") else type(v_load) if meta_shape != load_shape: - logger.info(f" {k}: meta shape={meta_shape}, load shape={load_shape}") - - -def validate_loaded_state_dict(state_dict, checkpoint_path): - """ - Validate the loaded state dict against the expected structure and types. - """ - assert HAVE_MEGATRON_FSDP, "This function requires Megatron-FSDP to be installed." - - # Initialize reader - reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_path) - metadata = reader.read_metadata() - flat_state_dict = flatten_state_dict(state_dict) - - for key, value in flat_state_dict.items(): - tensor_metadata = metadata.state_dict_metadata[key] - - if not isinstance(tensor_metadata, TensorStorageMetadata): - continue - if not isinstance(value, DTensor): - load_item_dict = {key: torch.empty_like(value)} - else: - load_item_dict = { - key: torch.distributed.tensor.empty( - tensor_metadata.size, - dtype=tensor_metadata.properties.dtype, - device_mesh=DeviceMesh.from_group( - group=dist.group.WORLD, - device_type="cuda", - mesh=torch.arange(dist.get_world_size()), - mesh_dim_names=("world",), - ), - placements=[Shard(0)], - ) - } - torch.distributed.checkpoint.load( - load_item_dict, storage_reader=reader, planner=default_planner.DefaultLoadPlanner() - ) - if isinstance(value, DTensor): - full_value = gather_uneven_dtensor_to_full_tensor(value) - loaded_tensor = load_item_dict[key].redistribute( - placements=[Replicate()] * len(value.placements) - ) - assert torch.allclose( - loaded_tensor._local_tensor, full_value._local_tensor, atol=1e-8, rtol=1e-5 - ), f"key: {key}; {loaded_tensor} {full_value}" - else: - assert torch.allclose( - value, load_item_dict[key] - ), f"key: {key}; {value} {load_item_dict[key]}" - - -def get_global_unique_param_name(model_chunks, param): - """ - Get the global unique parameter name for a given model and parameter. - """ - param_name = None - for model in model_chunks: - for name, p in model.named_parameters(): - if p is param: - param_name = name - break - if param_name is None: - raise ValueError("Parameter not found in model chunks") - - # Get PP unique parameter name - if re.search(r"layers\.(\d+)", param_name) and "mtp" not in param_name: - tf_layer_number = -1 - for module in model.modules(): - if not isinstance(module, TransformerLayer): - continue - for p in module.parameters(): - if p is param: - tf_layer_number = module.layer_number - break - if tf_layer_number != -1: - param_name = re.sub(r"layers\.(\d+)", f"layers.{tf_layer_number - 1}", param_name) - - # Get EP unique parameter name - param_name = list(handle_experts_in_state_dict({param_name: None}).keys())[0] - - return param_name + print(f" {k}: meta shape={meta_shape}, load shape={load_shape}") diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 1d29aff0827..bdf915a8ae1 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2267,10 +2267,6 @@ def _add_training_args(parser): help="Use torch.optim.Optimizer instead of Megatron's optimizer in optimizer cpu offload mode.") group.add_argument('--overlap-cpu-optimizer-d2h-h2d', action='store_true', default=False, help='Overlap CPU optimizer step, gradients D2H and updated parameters H2D.') - group.add_argument('--dump-param-to-param-group-map', type=str, default=None, - help="Path to a file containing parameter-to-parameter-group mapping. " - "Provide a JSON file that specifies which parameters belong to which " - "parameter group for global coordination.") group.add_argument('--no-pin-cpu-grads', action='store_false', dest='pin_cpu_grads', help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 93c23255f4c..71b9cd97021 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -42,10 +42,9 @@ try: from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import preprocess_state_dict_for_uneven_dtensor from megatron.core.transformer.fsdp_dtensor_checkpoint import ( - print_diff_in_state_dicts, handle_fp8_extra_state_case, handle_swiglu_in_state_dict, - handle_experts_in_state_dict, + print_diff_in_state_dicts, ) HAVE_MEGATRON_FSDP = True except ImportError: @@ -562,9 +561,6 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati # TODO Handle non-empty directories (e.g., after a crash during saving). ensure_directory_exists(checkpoint_name, check_parent=False) - if ckpt_format == "fsdp_dtensor": - state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) - fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter(checkpoint_name) torch.distributed.checkpoint.save( state_dict=state_dict, @@ -788,17 +784,9 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path) torch.save(dataloader_save_dict, data_state_save_path) -def generate_state_dict( - args, - model, - optimizer, - opt_param_scheduler, - rng_state, - iteration=None, - optim_sd_kwargs=None, - model_sd_kwargs=None, - rerun_state=None, -): +def generate_state_dict(args, model, optimizer, opt_param_scheduler, + rng_state, iteration=None, + optim_sd_kwargs=None, model_sd_kwargs=None, rerun_state=None): """Generate a state dict from given model, optimizer, scheduler, rng state and others. """ # Arguments, iteration, and model. @@ -851,27 +839,16 @@ def generate_state_dict( if not args.no_save_rng and rng_state: state_dict["rng_state"] = rng_state - return state_dict - - -def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model): - state_dict = raw_state_dict.copy() - handle_fp8_extra_state_case(state_dict["model"]) - if args.swiglu: - if "optimizer" in state_dict: - model_state_dict, optimizer_state_dict = handle_swiglu_in_state_dict( - model, state_dict["model"], state_dict["optimizer"] - ) - state_dict["model"] = model_state_dict - state_dict["optimizer"] = optimizer_state_dict - else: - model_state_dict, _ = handle_swiglu_in_state_dict( - model, state_dict["model"], None - ) - state_dict["model"] = model_state_dict - if args.num_experts: - state_dict["model"] = handle_experts_in_state_dict(state_dict["model"]) - preprocess_state_dict_for_uneven_dtensor(state_dict) + # fsdp_dtensor ckpt specific state dict preprocessing + if args.ckpt_format == "fsdp_dtensor": + assert HAVE_MEGATRON_FSDP, "Megatron FSDP is enabled but Megatron-FSDP is not available." + assert len(model) == 1, "FSDP DTensor checkpoints are not supported for multiple models." + if args.swiglu: + state_dict = state_dict.copy() + handle_swiglu_in_state_dict( + model[0], state_dict["model"], state_dict["optimizer"]) + handle_fp8_extra_state_case(state_dict["model"]) + preprocess_state_dict_for_uneven_dtensor(state_dict) return state_dict @@ -1192,12 +1169,6 @@ def _load_base_checkpoint( if rank0: return {}, checkpoint_name, release, CheckpointType.FSDP_DTENSOR - state_dict = sharded_state_dict - raw_optimizer_state_dict = state_dict["optimizer"].copy() if "optimizer" in state_dict else None - raw_model_state_dict = state_dict["model"].copy() if "model" in state_dict else None - model = state_dict.pop("_model") - state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) - ckpt_type = CheckpointType.FSDP_DTENSOR fs_storage_reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_name) allow_partial_load = not getattr(args, 'strict_fsdp_dtensor_load', False) @@ -1206,20 +1177,15 @@ def _load_base_checkpoint( rank = torch.distributed.get_rank() import time as _time _time.sleep(rank * 0.001) # Make that logs of different ranks do not overlap - print_diff_in_state_dicts(state_dict_metadata, state_dict) + print_diff_in_state_dicts(state_dict_metadata, sharded_state_dict) planner = default_planner.DefaultLoadPlanner(allow_partial_load=allow_partial_load) torch.distributed.checkpoint.load_state_dict( - state_dict=state_dict, + state_dict=sharded_state_dict, storage_reader=fs_storage_reader, planner=planner, ) - - if raw_optimizer_state_dict is not None: - state_dict["optimizer"] = raw_optimizer_state_dict - - if raw_model_state_dict is not None: - state_dict["model"] = raw_model_state_dict + state_dict = sharded_state_dict else: raise NotImplementedError(f"checkpoint format {ckpt_format} not supported") @@ -1554,7 +1520,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', except FileNotFoundError: state_dict_metadata = {} - gen_sd_rerun_state = {} + gen_sd_rerun_state = None gen_sd_opt_param_scheduler = None gen_sd_rng_state = None gen_sd_optim = None @@ -1571,7 +1537,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', optim_sd_kwargs = dict(metadata=_build_sharded_state_dict_metadata(args), is_loading=True) - state_dict = generate_state_dict( + load_kwargs["sharded_state_dict"] = generate_state_dict( args, model=model, optimizer=gen_sd_optim, @@ -1581,8 +1547,6 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', rerun_state=gen_sd_rerun_state, iteration=1, ) - state_dict["_model"] = model - load_kwargs["sharded_state_dict"] = state_dict state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( load_dir, args, rank0=False, checkpointing_context=checkpointing_context, diff --git a/megatron/training/training.py b/megatron/training/training.py index bda9e42dc82..f805dab0f15 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1210,7 +1210,6 @@ def setup_model_and_optimizer( # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, - dump_param_to_param_group_map=args.dump_param_to_param_group_map, ) else: optimizer = get_megatron_muon_optimizer( diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json index 717ae3f5fa6..0f2637a9511 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04722, - "2": 11.03572, - "3": 9.58802, - "4": 9.25807, - "5": 9.46595, - "6": 9.99646, - "7": 9.50952, - "8": 8.97596, - "9": 8.64768, - "10": 9.40103, - "11": 8.86556, - "12": 8.63563, - "13": 8.52125, - "14": 8.08824, - "15": 8.1958, - "16": 8.22112, - "17": 8.14098, - "18": 7.8386, - "19": 8.23438, - "20": 7.95361, - "21": 7.62549, - "22": 7.60352, - "23": 7.47957, - "24": 7.46573, - "25": 7.70343, - "26": 7.10719, - "27": 7.64313, - "28": 7.34582, - "29": 7.5169, - "30": 7.67511, - "31": 7.41799, - "32": 7.61213, - "33": 7.66582, - "34": 7.73101, - "35": 7.23081, - "36": 7.10765, - "37": 7.4476, - "38": 7.21053, - "39": 7.57508, - "40": 7.5662, - "41": 7.51605, - "42": 7.27243, - "43": 7.25706, - "44": 7.44, - "45": 7.21244, - "46": 6.92421, - "47": 7.32604, - "48": 7.17147, - "49": 7.62154, - "50": 7.0624 + "1": 11.04748, + "2": 11.03561, + "3": 9.58774, + "4": 9.25819, + "5": 9.53583, + "6": 9.8804, + "7": 9.48247, + "8": 8.93575, + "9": 8.65813, + "10": 9.0567, + "11": 8.49445, + "12": 8.52444, + "13": 8.45239, + "14": 7.97323, + "15": 8.0476, + "16": 8.07971, + "17": 8.09081, + "18": 7.76437, + "19": 8.14892, + "20": 7.89868, + "21": 7.59371, + "22": 7.54743, + "23": 7.43222, + "24": 7.4302, + "25": 7.67579, + "26": 7.06929, + "27": 7.62041, + "28": 7.32495, + "29": 7.49042, + "30": 7.64391, + "31": 7.39435, + "32": 7.58789, + "33": 7.64037, + "34": 7.69778, + "35": 7.20998, + "36": 7.08538, + "37": 7.42584, + "38": 7.18804, + "39": 7.55054, + "40": 7.54446, + "41": 7.49287, + "42": 7.24937, + "43": 7.23587, + "44": 7.41595, + "45": 7.18755, + "46": 6.89949, + "47": 7.29966, + "48": 7.14134, + "49": 7.58963, + "50": 7.03602 } }, "num-zeros": { @@ -62,55 +62,55 @@ "step_interval": 1, "values": { "1": 38802612.0, - "2": 38543656.0, - "3": 38739356.0, - "4": 273649600.0, - "5": 252887040.0, - "6": 255692384.0, - "7": 598483264.0, - "8": 787737984.0, - "9": 696133120.0, - "10": 505146368.0, - "11": 718888640.0, - "12": 872597184.0, - "13": 947495104.0, - "14": 1076398976.0, - "15": 856390592.0, - "16": 1048635648.0, - "17": 831370688.0, - "18": 963679552.0, - "19": 970018240.0, - "20": 935737344.0, - "21": 904189312.0, - "22": 887937280.0, - "23": 894777856.0, - "24": 703744192.0, - "25": 909232512.0, - "26": 875633216.0, - "27": 894981376.0, - "28": 919242816.0, - "29": 931351552.0, - "30": 929784768.0, - "31": 941621376.0, - "32": 885000768.0, - "33": 828484096.0, - "34": 822284800.0, - "35": 832032128.0, - "36": 787939392.0, - "37": 770719808.0, - "38": 561204672.0, - "39": 617201536.0, - "40": 695374592.0, - "41": 698978816.0, - "42": 692913728.0, - "43": 668003776.0, - "44": 673780992.0, - "45": 631182912.0, - "46": 444613312.0, - "47": 591957824.0, - "48": 617363968.0, - "49": 585295808.0, - "50": 570423872.0 + "2": 38543592.0, + "3": 38739528.0, + "4": 279937824.0, + "5": 259189728.0, + "6": 271446400.0, + "7": 604773504.0, + "8": 768892544.0, + "9": 645824128.0, + "10": 744257088.0, + "11": 718888576.0, + "12": 746732544.0, + "13": 871990976.0, + "14": 821645632.0, + "15": 724250816.0, + "16": 932241472.0, + "17": 648958912.0, + "18": 649120000.0, + "19": 925992960.0, + "20": 989207936.0, + "21": 819324096.0, + "22": 736955072.0, + "23": 910497792.0, + "24": 876716672.0, + "25": 843170688.0, + "26": 809573824.0, + "27": 854086912.0, + "28": 802857664.0, + "29": 805523328.0, + "30": 775645184.0, + "31": 771754624.0, + "32": 749733696.0, + "33": 718385216.0, + "34": 724771200.0, + "35": 737655104.0, + "36": 690419968.0, + "37": 673203456.0, + "38": 627239552.0, + "39": 614047168.0, + "40": 607288512.0, + "41": 582590592.0, + "42": 548211200.0, + "43": 532740640.0, + "44": 554239168.0, + "45": 514790528.0, + "46": 350258560.0, + "47": 472420128.0, + "48": 453788736.0, + "49": 440597216.0, + "50": 303063296.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6637272576.0, - "2": 6637274624.0, - "3": 6637274624.0, - "4": 6637274624.0, - "5": 6637274624.0, - "6": 6637274624.0, - "7": 6637274624.0, - "8": 6637274624.0, - "9": 6637274624.0, - "10": 6637274624.0, - "11": 6637274624.0, - "12": 6637274624.0, - "13": 6637274624.0, - "14": 6637274624.0, - "15": 6637274624.0, - "16": 6637274624.0, - "17": 6637274624.0, - "18": 6637274624.0, - "19": 6637274624.0, - "20": 6637274624.0, - "21": 6637274624.0, - "22": 6637274624.0, - "23": 6637274624.0, - "24": 6637274624.0, - "25": 6637274624.0, - "26": 6637274624.0, - "27": 6637274624.0, - "28": 6637274624.0, - "29": 6637274624.0, - "30": 6637274624.0, - "31": 6637274624.0, - "32": 6637274624.0, - "33": 6637274624.0, - "34": 6637274624.0, - "35": 6637274624.0, - "36": 6637274624.0, - "37": 6637274624.0, - "38": 6637274624.0, - "39": 6637274624.0, - "40": 6637274624.0, - "41": 6637274624.0, - "42": 6637274624.0, - "43": 6637274624.0, - "44": 6637274624.0, - "45": 6637274624.0, - "46": 6637274624.0, - "47": 6637274624.0, - "48": 6637274624.0, - "49": 6637274624.0, - "50": 6637274624.0 + "1": 6637267456.0, + "2": 6637269504.0, + "3": 6637269504.0, + "4": 6637269504.0, + "5": 6637269504.0, + "6": 6637269504.0, + "7": 6637269504.0, + "8": 6637269504.0, + "9": 6637269504.0, + "10": 6637269504.0, + "11": 6637269504.0, + "12": 6637269504.0, + "13": 6637269504.0, + "14": 6637269504.0, + "15": 6637269504.0, + "16": 6637269504.0, + "17": 6637269504.0, + "18": 6637269504.0, + "19": 6637269504.0, + "20": 6637269504.0, + "21": 6637269504.0, + "22": 6637269504.0, + "23": 6637269504.0, + "24": 6637269504.0, + "25": 6637269504.0, + "26": 6637269504.0, + "27": 6637269504.0, + "28": 6637269504.0, + "29": 6637269504.0, + "30": 6637269504.0, + "31": 6637269504.0, + "32": 6637269504.0, + "33": 6637269504.0, + "34": 6637269504.0, + "35": 6637269504.0, + "36": 6637269504.0, + "37": 6637269504.0, + "38": 6637269504.0, + "39": 6637269504.0, + "40": 6637269504.0, + "41": 6637269504.0, + "42": 6637269504.0, + "43": 6637269504.0, + "44": 6637269504.0, + "45": 6637269504.0, + "46": 6637269504.0, + "47": 6637269504.0, + "48": 6637269504.0, + "49": 6637269504.0, + "50": 6637269504.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55056003072.0, - "2": 57810763776.0, - "3": 57920647168.0, - "4": 57920647168.0, - "5": 57920647168.0, - "6": 57920647168.0, - "7": 57920647168.0, - "8": 57920647168.0, - "9": 57920647168.0, - "10": 57920647168.0, - "11": 57920647168.0, - "12": 57920647168.0, - "13": 57920647168.0, - "14": 57920647168.0, - "15": 57920647168.0, - "16": 57920647168.0, - "17": 57920647168.0, - "18": 57920647168.0, - "19": 57920647168.0, - "20": 57920647168.0, - "21": 57920647168.0, - "22": 57920647168.0, - "23": 57920647168.0, - "24": 57920647168.0, - "25": 57920647168.0, - "26": 57920647168.0, - "27": 57920647168.0, - "28": 57920647168.0, - "29": 57920647168.0, - "30": 57920647168.0, - "31": 57920647168.0, - "32": 57920647168.0, - "33": 57920647168.0, - "34": 57961472000.0, - "35": 57961472000.0, - "36": 57961472000.0, - "37": 57961472000.0, - "38": 57961472000.0, - "39": 57961472000.0, - "40": 57961472000.0, - "41": 57961472000.0, - "42": 57961472000.0, - "43": 57961472000.0, - "44": 57961472000.0, - "45": 57961472000.0, - "46": 57961472000.0, - "47": 57961472000.0, - "48": 57961472000.0, - "49": 57961472000.0, - "50": 57961472000.0 + "1": 55055331328.0, + "2": 57809321984.0, + "3": 57918455808.0, + "4": 57918455808.0, + "5": 57918455808.0, + "6": 57918455808.0, + "7": 57918455808.0, + "8": 57918455808.0, + "9": 57918455808.0, + "10": 57918455808.0, + "11": 57918455808.0, + "12": 57918455808.0, + "13": 57931390976.0, + "14": 57931390976.0, + "15": 57931390976.0, + "16": 57931390976.0, + "17": 57931390976.0, + "18": 57931390976.0, + "19": 57931390976.0, + "20": 57931390976.0, + "21": 57931390976.0, + "22": 57931390976.0, + "23": 57931390976.0, + "24": 57931390976.0, + "25": 57931390976.0, + "26": 57931390976.0, + "27": 57931390976.0, + "28": 57931390976.0, + "29": 57931390976.0, + "30": 57931390976.0, + "31": 57931390976.0, + "32": 58003226624.0, + "33": 58003226624.0, + "34": 58003226624.0, + "35": 58003226624.0, + "36": 58003226624.0, + "37": 58003226624.0, + "38": 58003226624.0, + "39": 58003226624.0, + "40": 58003226624.0, + "41": 58003226624.0, + "42": 58003226624.0, + "43": 58003226624.0, + "44": 58183614464.0, + "45": 58234208256.0, + "46": 58555555840.0, + "47": 58555555840.0, + "48": 58555555840.0, + "49": 58555555840.0, + "50": 58780934144.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07648, - "2": 11.07404, - "3": 10.53854, - "4": 10.09813, - "5": 9.81166, - "6": 10.09741, - "7": 9.79481, - "8": 9.0642, - "9": 8.86016, - "10": 9.34039, - "11": 8.51318, - "12": 8.59467, - "13": 8.5292, - "14": 7.95757, - "15": 8.06962, - "16": 8.11802, - "17": 8.06993, - "18": 7.80587, - "19": 8.19192, - "20": 7.8906, - "21": 7.57063, - "22": 7.55091, - "23": 7.41606, - "24": 7.42454, - "25": 7.65274, - "26": 7.05583, - "27": 7.59747, - "28": 7.29984, - "29": 7.472, - "30": 7.61908, - "31": 7.35179, - "32": 7.52979, - "33": 7.59161, - "34": 7.66287, - "35": 7.17383, - "36": 7.04133, - "37": 7.37081, - "38": 7.1443, - "39": 7.50879, - "40": 7.48921, - "41": 7.43802, - "42": 7.19405, - "43": 7.17581, - "44": 7.35785, - "45": 7.13985, - "46": 6.84014, - "47": 7.25094, - "48": 7.09407, - "49": 7.52321, - "50": 6.98987 + "1": 11.07654, + "2": 11.07406, + "3": 10.53881, + "4": 10.09803, + "5": 9.81154, + "6": 10.06236, + "7": 9.79762, + "8": 9.07117, + "9": 8.87049, + "10": 9.127, + "11": 8.49853, + "12": 8.53046, + "13": 8.42444, + "14": 7.847, + "15": 7.99077, + "16": 8.05015, + "17": 8.00064, + "18": 7.73104, + "19": 8.11087, + "20": 7.82933, + "21": 7.52501, + "22": 7.49916, + "23": 7.36982, + "24": 7.37235, + "25": 7.61578, + "26": 7.02029, + "27": 7.56014, + "28": 7.2681, + "29": 7.44399, + "30": 7.58618, + "31": 7.32468, + "32": 7.50596, + "33": 7.5715, + "34": 7.63581, + "35": 7.15224, + "36": 7.01784, + "37": 7.35163, + "38": 7.12551, + "39": 7.48656, + "40": 7.47408, + "41": 7.42096, + "42": 7.17595, + "43": 7.16059, + "44": 7.34289, + "45": 7.11969, + "46": 6.82753, + "47": 7.23525, + "48": 7.08042, + "49": 7.51043, + "50": 6.9735 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 93.39829, - "2": 1.82958, - "3": 1.3241, - "4": 2.19661, - "5": 2.13156, - "6": 1.75452, - "7": 2.08539, - "8": 1.58016, - "9": 1.60816, - "10": 1.03407, - "11": 1.01797, - "12": 1.0168, - "13": 1.01666, - "14": 1.0748, - "15": 1.04137, - "16": 1.05864, - "17": 1.05961, - "18": 1.03233, - "19": 1.02728, - "20": 1.02917, - "21": 1.04313, - "22": 1.03054, - "23": 1.0313, - "24": 1.03789, - "25": 1.04414, - "26": 1.05561, - "27": 1.03361, - "28": 1.03142, - "29": 1.02437, - "30": 1.02195, - "31": 1.0172, - "32": 1.03318, - "33": 1.03742, - "34": 1.03628, - "35": 1.03575, - "36": 1.05127, - "37": 1.03273, - "38": 1.03381, - "39": 1.02923, - "40": 1.02986, - "41": 1.03249, - "42": 1.033, - "43": 1.03169, - "44": 1.03818, - "45": 1.02736, - "46": 1.02698, - "47": 1.03158, - "48": 1.02471, - "49": 1.03674, - "50": 1.0291 + "1": 69.29797, + "2": 1.7261, + "3": 1.40981, + "4": 2.16562, + "5": 1.7862, + "6": 1.7469, + "7": 1.96688, + "8": 1.97301, + "9": 1.74665, + "10": 1.69613, + "11": 1.02979, + "12": 1.02408, + "13": 1.03261, + "14": 1.02432, + "15": 1.0529, + "16": 1.04491, + "17": 1.03693, + "18": 1.03399, + "19": 1.03627, + "20": 1.02284, + "21": 1.01667, + "22": 1.02932, + "23": 1.03591, + "24": 1.03466, + "25": 1.03149, + "26": 1.03165, + "27": 1.02342, + "28": 1.03777, + "29": 1.04061, + "30": 1.05641, + "31": 1.02382, + "32": 1.01775, + "33": 1.03039, + "34": 1.03693, + "35": 1.03153, + "36": 1.02699, + "37": 1.02756, + "38": 1.02919, + "39": 1.01773, + "40": 1.03491, + "41": 1.03152, + "42": 1.03035, + "43": 1.0221, + "44": 1.05201, + "45": 1.02579, + "46": 1.02798, + "47": 1.03857, + "48": 1.02772, + "49": 1.0408, + "50": 1.03745 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json index 8cea616921e..58eb3fc16cd 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.94971, - "2": 10.95163, - "3": 10.51641, - "4": 9.9652, - "5": 9.94116, - "6": 9.67394, - "7": 10.19887, - "8": 9.50035, - "9": 9.54982, - "10": 9.79667, - "11": 9.30128, - "12": 9.40566, - "13": 9.39438, - "14": 8.84572, - "15": 9.02231, - "16": 9.06973, - "17": 9.04712, - "18": 8.75662, - "19": 9.18074, - "20": 8.86175, - "21": 8.53558, - "22": 8.55288, - "23": 8.42513, - "24": 8.37683, - "25": 8.64426, - "26": 7.9756, - "27": 8.57026, - "28": 8.1987, - "29": 8.39406, - "30": 8.67631, - "31": 8.29096, - "32": 8.43692, - "33": 8.55897, - "34": 8.66123, - "35": 8.08, - "36": 7.95214, - "37": 8.2979, - "38": 7.98177, - "39": 8.39281, - "40": 8.35852, - "41": 8.32006, - "42": 8.05954, - "43": 8.03381, - "44": 8.24236, - "45": 8.1025, - "46": 7.61814, - "47": 8.15364, - "48": 8.00693, - "49": 8.38704, - "50": 7.81592 + "1": 10.95004, + "2": 10.9521, + "3": 10.5115, + "4": 9.96454, + "5": 9.93941, + "6": 9.67273, + "7": 10.20975, + "8": 9.49716, + "9": 9.55902, + "10": 9.79742, + "11": 9.30109, + "12": 9.40483, + "13": 9.39546, + "14": 8.84681, + "15": 9.02444, + "16": 9.07121, + "17": 9.04574, + "18": 8.75678, + "19": 9.18159, + "20": 8.8595, + "21": 8.53503, + "22": 8.55182, + "23": 8.42441, + "24": 8.37608, + "25": 8.64304, + "26": 7.97393, + "27": 8.56806, + "28": 8.19764, + "29": 8.3928, + "30": 8.67283, + "31": 8.289, + "32": 8.43572, + "33": 8.5568, + "34": 8.66018, + "35": 8.07934, + "36": 7.94976, + "37": 8.29565, + "38": 7.98044, + "39": 8.39201, + "40": 8.35513, + "41": 8.31876, + "42": 8.0583, + "43": 8.03283, + "44": 8.24243, + "45": 8.10277, + "46": 7.61696, + "47": 8.15273, + "48": 8.00569, + "49": 8.38688, + "50": 7.81491 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403704.0, - "2": 19274216.0, - "3": 22517470.0, - "4": 83429816.0, - "5": 139167728.0, - "6": 138921280.0, - "7": 173470304.0, - "8": 200511856.0, - "9": 165696320.0, - "10": 166120112.0, - "11": 213254416.0, - "12": 187847360.0, - "13": 231586656.0, - "14": 226879072.0, - "15": 219025920.0, - "16": 205179664.0, - "17": 280450432.0, - "18": 181477792.0, - "19": 191026096.0, - "20": 186395632.0, - "21": 233632576.0, - "22": 231696832.0, - "23": 216390688.0, - "24": 215133760.0, - "25": 233079504.0, - "26": 244437920.0, - "27": 222637584.0, - "28": 278773952.0, - "29": 253409264.0, - "30": 240036736.0, - "31": 236599008.0, - "32": 205066624.0, - "33": 263303312.0, - "34": 200444544.0, - "35": 199033824.0, - "36": 243001216.0, - "37": 151181872.0, - "38": 175301280.0, - "39": 219001024.0, - "40": 220307936.0, - "41": 217385856.0, - "42": 230074176.0, - "43": 208226784.0, - "44": 148172720.0, - "45": 141103744.0, - "46": 132664976.0, - "47": 179619392.0, - "48": 118381144.0, - "49": 86643984.0, - "50": 113798320.0 + "1": 19403624.0, + "2": 19274194.0, + "3": 19372760.0, + "4": 86525248.0, + "5": 148575568.0, + "6": 145226704.0, + "7": 171879984.0, + "8": 195785248.0, + "9": 164124752.0, + "10": 167684736.0, + "11": 221077344.0, + "12": 200384224.0, + "13": 248872528.0, + "14": 211169424.0, + "15": 214304608.0, + "16": 216075632.0, + "17": 267845984.0, + "18": 170470336.0, + "19": 176865072.0, + "20": 187955392.0, + "21": 225750704.0, + "22": 247396816.0, + "23": 211643856.0, + "24": 205638464.0, + "25": 277022272.0, + "26": 291562304.0, + "27": 225789840.0, + "28": 288202368.0, + "29": 198390384.0, + "30": 213302208.0, + "31": 227204752.0, + "32": 271112416.0, + "33": 231840432.0, + "34": 203575536.0, + "35": 191152368.0, + "36": 222566928.0, + "37": 177810112.0, + "38": 228708544.0, + "39": 211168784.0, + "40": 215603968.0, + "41": 200089440.0, + "42": 228529888.0, + "43": 198782848.0, + "44": 141902272.0, + "45": 181922816.0, + "46": 115369856.0, + "47": 170214176.0, + "48": 137292832.0, + "49": 97654936.0, + "50": 160979632.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4883287040.0, - "2": 4883441152.0, - "3": 4881697280.0, - "4": 4883730944.0, - "5": 4882556416.0, - "6": 4882616832.0, - "7": 4883438080.0, - "8": 4881568256.0, - "9": 4883173888.0, - "10": 4882272768.0, - "11": 4883676672.0, - "12": 4881393152.0, - "13": 4883141120.0, - "14": 4883697152.0, - "15": 4882622976.0, - "16": 4881830400.0, - "17": 4881658368.0, - "18": 4881863168.0, - "19": 4883804672.0, - "20": 4881795584.0, - "21": 4883333632.0, - "22": 4882194944.0, - "23": 4882084352.0, - "24": 4884065792.0, - "25": 4881804800.0, - "26": 4883596800.0, - "27": 4883047936.0, - "28": 4882476544.0, - "29": 4883087872.0, - "30": 4882151936.0, - "31": 4882625024.0, - "32": 4883104256.0, - "33": 4882526720.0, - "34": 4882292224.0, - "35": 4882485760.0, - "36": 4882867712.0, - "37": 4882634240.0, - "38": 4882610688.0, - "39": 4881474048.0, - "40": 4881961472.0, - "41": 4882663936.0, - "42": 4881860096.0, - "43": 4881499648.0, - "44": 4883392000.0, - "45": 4882392576.0, - "46": 4882815488.0, - "47": 4883113472.0, - "48": 4882158080.0, - "49": 4881207808.0, - "50": 4881588736.0 + "1": 4883602432.0, + "2": 4885017088.0, + "3": 4882657792.0, + "4": 4883046912.0, + "5": 4883725824.0, + "6": 4883713536.0, + "7": 4883040768.0, + "8": 4883273216.0, + "9": 4882952704.0, + "10": 4885949952.0, + "11": 4883990016.0, + "12": 4887679488.0, + "13": 4884011520.0, + "14": 4882899456.0, + "15": 4883515904.0, + "16": 4883990016.0, + "17": 4883410432.0, + "18": 4883673600.0, + "19": 4882903552.0, + "20": 4884541952.0, + "21": 4883138048.0, + "22": 4883247616.0, + "23": 4883839488.0, + "24": 4885058048.0, + "25": 4882676224.0, + "26": 4884058624.0, + "27": 4884724224.0, + "28": 4884874752.0, + "29": 4883127808.0, + "30": 4883252736.0, + "31": 4882955776.0, + "32": 4885190144.0, + "33": 4883845632.0, + "34": 4884392448.0, + "35": 4883083776.0, + "36": 4883851776.0, + "37": 4885246464.0, + "38": 4882680320.0, + "39": 4884296192.0, + "40": 4884689408.0, + "41": 4882836992.0, + "42": 4883972608.0, + "43": 4884519424.0, + "44": 4883354112.0, + "45": 4883495424.0, + "46": 4882788864.0, + "47": 4883144192.0, + "48": 4883688960.0, + "49": 4884182528.0, + "50": 4885279232.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41208348672.0, - "2": 41208348672.0, - "3": 41208348672.0, - "4": 41208348672.0, - "5": 41208348672.0, - "6": 41208348672.0, - "7": 41208348672.0, - "8": 41208348672.0, - "9": 41208348672.0, - "10": 41208348672.0, - "11": 41208348672.0, - "12": 41208348672.0, - "13": 41208348672.0, - "14": 41208348672.0, - "15": 41208348672.0, - "16": 41208348672.0, - "17": 41208348672.0, - "18": 41208348672.0, - "19": 41208348672.0, - "20": 41208348672.0, - "21": 41208348672.0, - "22": 41208348672.0, - "23": 41208348672.0, - "24": 41208348672.0, - "25": 41208348672.0, - "26": 41208348672.0, - "27": 41208348672.0, - "28": 41208348672.0, - "29": 41208348672.0, - "30": 41208348672.0, - "31": 41208348672.0, - "32": 41208348672.0, - "33": 41208348672.0, - "34": 41208348672.0, - "35": 41208348672.0, - "36": 41208348672.0, - "37": 41208348672.0, - "38": 41208348672.0, - "39": 41208348672.0, - "40": 41208348672.0, - "41": 41208348672.0, - "42": 41208348672.0, - "43": 41208348672.0, - "44": 41208348672.0, - "45": 41208348672.0, - "46": 41208348672.0, - "47": 41208348672.0, - "48": 41208348672.0, - "49": 41208348672.0, - "50": 41208348672.0 + "1": 41210470400.0, + "2": 41210470400.0, + "3": 41210470400.0, + "4": 41210470400.0, + "5": 41210470400.0, + "6": 41210470400.0, + "7": 41210470400.0, + "8": 41210470400.0, + "9": 41210470400.0, + "10": 41210470400.0, + "11": 41210470400.0, + "12": 41210470400.0, + "13": 41210470400.0, + "14": 41210470400.0, + "15": 41210470400.0, + "16": 41210470400.0, + "17": 41210470400.0, + "18": 41210470400.0, + "19": 41210470400.0, + "20": 41210470400.0, + "21": 41210470400.0, + "22": 41210470400.0, + "23": 41210470400.0, + "24": 41210470400.0, + "25": 41210470400.0, + "26": 41210470400.0, + "27": 41210470400.0, + "28": 41210470400.0, + "29": 41210470400.0, + "30": 41210470400.0, + "31": 41210470400.0, + "32": 41210470400.0, + "33": 41210470400.0, + "34": 41210470400.0, + "35": 41210470400.0, + "36": 41210470400.0, + "37": 41210470400.0, + "38": 41210470400.0, + "39": 41210470400.0, + "40": 41210470400.0, + "41": 41210470400.0, + "42": 41210470400.0, + "43": 41210470400.0, + "44": 41210470400.0, + "45": 41210470400.0, + "46": 41210470400.0, + "47": 41210470400.0, + "48": 41210470400.0, + "49": 41210470400.0, + "50": 41210470400.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 89.10928, - "2": 1.08143, - "3": 0.94222, - "4": 0.89675, - "5": 1.34524, - "6": 1.06972, - "7": 1.00314, - "8": 1.04961, - "9": 0.86611, - "10": 0.86248, - "11": 0.98739, - "12": 0.86057, - "13": 0.86777, - "14": 0.85834, - "15": 0.8559, - "16": 0.85522, - "17": 0.84644, - "18": 0.85748, - "19": 0.85218, - "20": 0.85342, - "21": 0.84029, - "22": 0.84342, - "23": 0.84297, - "24": 0.83925, - "25": 0.8439, - "26": 0.85696, - "27": 0.83981, - "28": 0.84643, - "29": 0.8433, - "30": 0.86234, - "31": 0.85636, - "32": 0.84184, - "33": 0.84501, - "34": 0.84316, - "35": 0.83806, - "36": 0.84143, - "37": 0.84447, - "38": 0.84137, - "39": 0.84133, - "40": 0.84321, - "41": 0.84019, - "42": 0.84164, - "43": 0.83741, - "44": 0.84203, - "45": 0.83966, - "46": 0.84109, - "47": 0.83945, - "48": 0.84001, - "49": 0.84194, - "50": 0.83578 + "1": 86.8085, + "2": 1.10913, + "3": 0.99097, + "4": 0.89412, + "5": 1.25997, + "6": 0.98162, + "7": 0.98318, + "8": 1.13296, + "9": 0.88126, + "10": 0.8633, + "11": 2.2744, + "12": 4.5393, + "13": 3.22763, + "14": 1.64923, + "15": 0.86595, + "16": 0.86575, + "17": 0.85272, + "18": 0.85454, + "19": 0.85281, + "20": 0.87018, + "21": 0.84654, + "22": 0.8494, + "23": 0.84882, + "24": 0.84482, + "25": 0.85311, + "26": 0.84678, + "27": 0.84096, + "28": 0.8412, + "29": 0.84156, + "30": 0.84475, + "31": 0.84747, + "32": 0.85058, + "33": 0.84977, + "34": 0.8479, + "35": 0.85234, + "36": 0.85012, + "37": 0.85087, + "38": 0.84594, + "39": 0.84558, + "40": 0.84807, + "41": 0.84183, + "42": 0.8439, + "43": 0.84221, + "44": 0.84248, + "45": 0.84257, + "46": 0.83922, + "47": 0.84311, + "48": 0.84159, + "49": 0.84011, + "50": 0.8353 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json index 0835e95b926..1ba051f4889 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json @@ -1,142 +1 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 10.82922, - "5": 10.85652, - "10": 10.79298, - "15": 10.8067, - "20": 10.72654, - "25": 10.53282, - "30": 10.35802, - "35": 10.24483, - "40": 10.05533, - "45": 9.77951, - "50": 9.86874, - "55": 9.82995, - "60": 9.449, - "65": 8.89366, - "70": 9.71127, - "75": 9.39451, - "80": 9.38198, - "85": 9.58333, - "90": 9.79944, - "95": 9.50213, - "100": 9.37131 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 27245.0, - "5": 31369.0, - "10": 25870.0, - "15": 29830.0, - "20": 28243.0, - "25": 27636.0, - "30": 30387.0, - "35": 31488.0, - "40": 34779.0, - "45": 35158.0, - "50": 38234.0, - "55": 37133.0, - "60": 40450.0, - "65": 40947.0, - "70": 43436.0, - "75": 39925.0, - "80": 51863.0, - "85": 2145177.0, - "90": 51330.0, - "95": 45247.0, - "100": 163741.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 787511296.0, - "5": 787542016.0, - "10": 787500032.0, - "15": 787499008.0, - "20": 787500032.0, - "25": 787446272.0, - "30": 787429888.0, - "35": 787413504.0, - "40": 787409920.0, - "45": 787394560.0, - "50": 787384320.0, - "55": 787383808.0, - "60": 787389952.0, - "65": 787346432.0, - "70": 787387904.0, - "75": 787437568.0, - "80": 787405312.0, - "85": 787407360.0, - "90": 787441664.0, - "95": 787445248.0, - "100": 787433472.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 2465793024.0, - "5": 2492764160.0, - "10": 2492764160.0, - "15": 2492764160.0, - "20": 2492764160.0, - "25": 2492764160.0, - "30": 2492764160.0, - "35": 2492764160.0, - "40": 2492764160.0, - "45": 2492764160.0, - "50": 2492764160.0, - "55": 2492764160.0, - "60": 2492764160.0, - "65": 2492764160.0, - "70": 2492764160.0, - "75": 2492764160.0, - "80": 2492764160.0, - "85": 2492764160.0, - "90": 2492764160.0, - "95": 2492764160.0, - "100": 2492764160.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 9.68104, - "5": 0.32859, - "10": 0.30772, - "15": 0.31234, - "20": 0.29254, - "25": 0.29296, - "30": 0.31344, - "35": 0.31026, - "40": 0.30514, - "45": 0.30481, - "50": 0.30324, - "55": 0.29929, - "60": 0.30103, - "65": 0.32008, - "70": 0.31307, - "75": 0.2933, - "80": 0.29351, - "85": 0.29283, - "90": 0.29375, - "95": 0.29458, - "100": 0.29103 - } - } -} \ No newline at end of file +{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.83281, "5": 10.85975, "10": 10.79613, "15": 10.80527, "20": 10.72502, "25": 10.53599, "30": 10.3571, "35": 10.24605, "40": 10.05992, "45": 9.7836, "50": 9.8722, "55": 9.83189, "60": 9.45075, "65": 8.89679, "70": 9.71414, "75": 9.39795, "80": 9.38169, "85": 9.58585, "90": 9.7999, "95": 9.50528, "100": 9.37224}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 27013.0, "5": 31736.0, "10": 25785.0, "15": 30383.0, "20": 28435.0, "25": 27493.0, "30": 30329.0, "35": 31750.0, "40": 34279.0, "45": 34634.0, "50": 38531.0, "55": 37465.0, "60": 40172.0, "65": 40624.0, "70": 44852.0, "75": 39231.0, "80": 130535.0, "85": 123250.0, "90": 47793.0, "95": 167340.0, "100": 163328.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 814390272.0, "5": 814420480.0, "10": 814376448.0, "15": 814376960.0, "20": 814373376.0, "25": 814321152.0, "30": 814306304.0, "35": 814292992.0, "40": 814288896.0, "45": 814272000.0, "50": 814262272.0, "55": 814258688.0, "60": 814268416.0, "65": 814220800.0, "70": 814266880.0, "75": 814318080.0, "80": 814285312.0, "85": 814289408.0, "90": 814315520.0, "95": 814320128.0, "100": 814311424.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2111314944.0, "5": 2370209280.0, "10": 2370209280.0, "15": 2370209280.0, "20": 2370209280.0, "25": 2370209280.0, "30": 2370209280.0, "35": 2370209280.0, "40": 2370209280.0, "45": 2370209280.0, "50": 2370209280.0, "55": 2370209280.0, "60": 2370209280.0, "65": 2370209280.0, "70": 2370209280.0, "75": 2370209280.0, "80": 2370209280.0, "85": 2370209280.0, "90": 2370209280.0, "95": 2370209280.0, "100": 2370209280.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 20.98318, "5": 0.79797, "10": 0.74028, "15": 0.67279, "20": 0.62948, "25": 0.61132, "30": 0.61547, "35": 0.6152, "40": 0.60421, "45": 0.59124, "50": 0.5891, "55": 0.57048, "60": 0.54799, "65": 0.52185, "70": 0.51195, "75": 0.50105, "80": 0.4628, "85": 0.45992, "90": 0.46498, "95": 0.4599, "100": 0.42568}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json deleted file mode 100644 index 7e299df5257..00000000000 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json +++ /dev/null @@ -1,537 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 10.82922, - "2": 10.84163, - "3": 10.84245, - "4": 10.82, - "5": 10.85652, - "6": 10.86906, - "7": 10.83778, - "8": 10.84312, - "9": 10.84423, - "10": 10.79298, - "11": 10.86697, - "12": 10.86875, - "13": 10.86207, - "14": 10.86919, - "15": 10.8067, - "16": 10.8057, - "17": 10.77686, - "18": 10.79541, - "19": 10.78384, - "20": 10.72654, - "21": 10.69491, - "22": 10.54462, - "23": 10.6993, - "24": 10.58151, - "25": 10.53282, - "26": 10.58817, - "27": 10.601, - "28": 10.57563, - "29": 10.58022, - "30": 10.35802, - "31": 10.08769, - "32": 10.44466, - "33": 10.4477, - "34": 10.18704, - "35": 10.24483, - "36": 10.19713, - "37": 10.32294, - "38": 10.17101, - "39": 10.37026, - "40": 10.05533, - "41": 10.09491, - "42": 10.17971, - "43": 9.78263, - "44": 9.91346, - "45": 9.77951, - "46": 9.75648, - "47": 10.09647, - "48": 9.80391, - "49": 9.46649, - "50": 9.86874, - "51": 9.79428, - "52": 9.68303, - "53": 10.03314, - "54": 9.9113, - "55": 9.82995, - "56": 9.57839, - "57": 9.42377, - "58": 9.80549, - "59": 9.53292, - "60": 9.449, - "61": 9.65293, - "62": 9.95672, - "63": 9.33775, - "64": 9.74194, - "65": 8.89366, - "66": 9.67317, - "67": 9.33002, - "68": 9.76517, - "69": 9.76336, - "70": 9.71127, - "71": 9.59511, - "72": 9.54797, - "73": 9.47124, - "74": 8.89297, - "75": 9.39451, - "76": 9.04721, - "77": 10.04318, - "78": 9.70313, - "79": 9.35169, - "80": 9.38198, - "81": 9.45146, - "82": 9.67546, - "83": 9.27658, - "84": 9.39241, - "85": 9.58333, - "86": 9.04518, - "87": 9.56487, - "88": 9.72459, - "89": 9.57019, - "90": 9.79944, - "91": 9.30737, - "92": 9.3313, - "93": 9.04109, - "94": 8.80259, - "95": 9.50213, - "96": 9.5021, - "97": 9.28183, - "98": 9.64883, - "99": 8.8594, - "100": 9.37131 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 27245.0, - "2": 28958.0, - "3": 29464.0, - "4": 28046.0, - "5": 31369.0, - "6": 33287.0, - "7": 31200.0, - "8": 26921.0, - "9": 30008.0, - "10": 25870.0, - "11": 33681.0, - "12": 30344.0, - "13": 32737.0, - "14": 33315.0, - "15": 29830.0, - "16": 32475.0, - "17": 30747.0, - "18": 30381.0, - "19": 31032.0, - "20": 28243.0, - "21": 29224.0, - "22": 27340.0, - "23": 34119.0, - "24": 29049.0, - "25": 27636.0, - "26": 30662.0, - "27": 32009.0, - "28": 33355.0, - "29": 34714.0, - "30": 30387.0, - "31": 28212.0, - "32": 33411.0, - "33": 34696.0, - "34": 30053.0, - "35": 31488.0, - "36": 32943.0, - "37": 35829.0, - "38": 33740.0, - "39": 37632.0, - "40": 34779.0, - "41": 33958.0, - "42": 36396.0, - "43": 34088.0, - "44": 34090.0, - "45": 35158.0, - "46": 36174.0, - "47": 39772.0, - "48": 36516.0, - "49": 36733.0, - "50": 38234.0, - "51": 38608.0, - "52": 37030.0, - "53": 42442.0, - "54": 40944.0, - "55": 37133.0, - "56": 41001.0, - "57": 37524.0, - "58": 42317.0, - "59": 40804.0, - "60": 40450.0, - "61": 41478.0, - "62": 39766.0, - "63": 37941.0, - "64": 42197.0, - "65": 40947.0, - "66": 44094.0, - "67": 41958.0, - "68": 40060.0, - "69": 42189.0, - "70": 43436.0, - "71": 42748.0, - "72": 44280.0, - "73": 47478.0, - "74": 41456.0, - "75": 39925.0, - "76": 43490.0, - "77": 45636.0, - "78": 2141470.0, - "79": 46055.0, - "80": 51863.0, - "81": 151341.0, - "82": 49835.0, - "83": 143360.0, - "84": 2141546.0, - "85": 2145177.0, - "86": 132114.0, - "87": 2147022.0, - "88": 59899.0, - "89": 162883.0, - "90": 51330.0, - "91": 2141901.0, - "92": 44946.0, - "93": 138194.0, - "94": 2145772.0, - "95": 45247.0, - "96": 135045.0, - "97": 53170.0, - "98": 168576.0, - "99": 2141797.0, - "100": 163741.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 787516416.0, - "2": 787540992.0, - "3": 787524096.0, - "4": 787512320.0, - "5": 787547136.0, - "6": 787537920.0, - "7": 787512832.0, - "8": 787524608.0, - "9": 787528192.0, - "10": 787505152.0, - "11": 787522048.0, - "12": 787520000.0, - "13": 787529728.0, - "14": 787529216.0, - "15": 787504128.0, - "16": 787513344.0, - "17": 787503104.0, - "18": 787489280.0, - "19": 787514880.0, - "20": 787505152.0, - "21": 787479552.0, - "22": 787486208.0, - "23": 787478528.0, - "24": 787486208.0, - "25": 787451392.0, - "26": 787482112.0, - "27": 787470848.0, - "28": 787450368.0, - "29": 787458048.0, - "30": 787435008.0, - "31": 787406848.0, - "32": 787424256.0, - "33": 787435520.0, - "34": 787426304.0, - "35": 787418624.0, - "36": 787436544.0, - "37": 787428352.0, - "38": 787436544.0, - "39": 787417600.0, - "40": 787415040.0, - "41": 787405824.0, - "42": 787415040.0, - "43": 787367936.0, - "44": 787392512.0, - "45": 787399680.0, - "46": 787355136.0, - "47": 787411456.0, - "48": 787354112.0, - "49": 787374080.0, - "50": 787389440.0, - "51": 787375616.0, - "52": 787383808.0, - "53": 787379712.0, - "54": 787384832.0, - "55": 787388928.0, - "56": 787388928.0, - "57": 787351040.0, - "58": 787382784.0, - "59": 787374080.0, - "60": 787395072.0, - "61": 787405312.0, - "62": 787405824.0, - "63": 787373056.0, - "64": 787388928.0, - "65": 787351552.0, - "66": 787386880.0, - "67": 787392000.0, - "68": 787399168.0, - "69": 787383296.0, - "70": 787393024.0, - "71": 787406848.0, - "72": 787400704.0, - "73": 787401216.0, - "74": 787403264.0, - "75": 787442688.0, - "76": 787444736.0, - "77": 787445760.0, - "78": 787395072.0, - "79": 787430400.0, - "80": 787410432.0, - "81": 787412992.0, - "82": 787427840.0, - "83": 787428864.0, - "84": 787412480.0, - "85": 787412480.0, - "86": 787394560.0, - "87": 787452928.0, - "88": 787414528.0, - "89": 787404800.0, - "90": 787446784.0, - "91": 787446272.0, - "92": 787446784.0, - "93": 787430400.0, - "94": 787440128.0, - "95": 787450368.0, - "96": 787454976.0, - "97": 787427328.0, - "98": 787475968.0, - "99": 787419136.0, - "100": 787438592.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2479493120.0, - "2": 2485449728.0, - "3": 2487249408.0, - "4": 2487249408.0, - "5": 2495991808.0, - "6": 2495991808.0, - "7": 2495991808.0, - "8": 2495991808.0, - "9": 2495991808.0, - "10": 2495991808.0, - "11": 2495991808.0, - "12": 2495991808.0, - "13": 2495991808.0, - "14": 2495991808.0, - "15": 2495991808.0, - "16": 2495991808.0, - "17": 2495991808.0, - "18": 2495991808.0, - "19": 2495991808.0, - "20": 2495991808.0, - "21": 2495991808.0, - "22": 2495991808.0, - "23": 2495991808.0, - "24": 2495991808.0, - "25": 2495991808.0, - "26": 2495991808.0, - "27": 2495991808.0, - "28": 2495991808.0, - "29": 2495991808.0, - "30": 2495991808.0, - "31": 2495991808.0, - "32": 2495991808.0, - "33": 2495991808.0, - "34": 2495991808.0, - "35": 2495991808.0, - "36": 2495991808.0, - "37": 2495991808.0, - "38": 2495991808.0, - "39": 2495991808.0, - "40": 2495991808.0, - "41": 2495991808.0, - "42": 2495991808.0, - "43": 2495991808.0, - "44": 2495991808.0, - "45": 2495991808.0, - "46": 2495991808.0, - "47": 2495991808.0, - "48": 2495991808.0, - "49": 2495991808.0, - "50": 2495991808.0, - "51": 2495991808.0, - "52": 2495991808.0, - "53": 2495991808.0, - "54": 2495991808.0, - "55": 2495991808.0, - "56": 2495991808.0, - "57": 2495991808.0, - "58": 2495991808.0, - "59": 2495991808.0, - "60": 2495991808.0, - "61": 2495991808.0, - "62": 2495991808.0, - "63": 2495991808.0, - "64": 2495991808.0, - "65": 2495991808.0, - "66": 2495991808.0, - "67": 2495991808.0, - "68": 2495991808.0, - "69": 2495991808.0, - "70": 2495991808.0, - "71": 2495991808.0, - "72": 2495991808.0, - "73": 2495991808.0, - "74": 2495991808.0, - "75": 2495991808.0, - "76": 2495991808.0, - "77": 2495991808.0, - "78": 2495991808.0, - "79": 2495991808.0, - "80": 2495991808.0, - "81": 2495991808.0, - "82": 2495991808.0, - "83": 2495991808.0, - "84": 2495991808.0, - "85": 2495991808.0, - "86": 2495991808.0, - "87": 2495991808.0, - "88": 2495991808.0, - "89": 2495991808.0, - "90": 2495991808.0, - "91": 2495991808.0, - "92": 2495991808.0, - "93": 2495991808.0, - "94": 2495991808.0, - "95": 2495991808.0, - "96": 2495991808.0, - "97": 2495991808.0, - "98": 2495991808.0, - "99": 2495991808.0, - "100": 2495991808.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 12.11313, - "2": 0.4805, - "3": 0.36965, - "4": 0.36695, - "5": 0.31705, - "6": 0.31275, - "7": 0.31299, - "8": 0.29866, - "9": 0.28961, - "10": 0.28859, - "11": 0.29067, - "12": 0.29044, - "13": 0.29806, - "14": 0.29287, - "15": 0.29391, - "16": 0.3175, - "17": 0.28363, - "18": 0.2818, - "19": 0.29347, - "20": 0.28931, - "21": 0.29103, - "22": 0.28444, - "23": 0.28907, - "24": 0.27608, - "25": 0.28277, - "26": 0.28656, - "27": 0.28921, - "28": 0.30243, - "29": 0.30435, - "30": 0.31231, - "31": 0.30439, - "32": 0.31412, - "33": 0.28887, - "34": 0.29613, - "35": 0.29738, - "36": 0.29754, - "37": 0.3019, - "38": 0.2933, - "39": 0.2944, - "40": 0.29283, - "41": 0.29592, - "42": 0.29673, - "43": 0.29319, - "44": 0.30127, - "45": 0.29921, - "46": 0.29904, - "47": 0.28795, - "48": 0.29918, - "49": 0.28711, - "50": 0.29645, - "51": 0.28777, - "52": 0.29536, - "53": 0.2847, - "54": 0.28286, - "55": 0.2874, - "56": 0.28699, - "57": 0.28614, - "58": 0.29825, - "59": 0.28363, - "60": 0.29423, - "61": 0.29226, - "62": 0.2896, - "63": 0.28065, - "64": 0.29533, - "65": 0.29842, - "66": 0.28487, - "67": 0.28419, - "68": 0.29474, - "69": 0.28383, - "70": 0.28417, - "71": 0.29253, - "72": 0.28737, - "73": 0.27923, - "74": 0.28728, - "75": 0.29383, - "76": 0.28157, - "77": 0.64771, - "78": 0.29148, - "79": 0.28742, - "80": 0.29245, - "81": 0.28827, - "82": 0.28368, - "83": 0.28963, - "84": 0.29234, - "85": 0.28183, - "86": 0.28337, - "87": 0.27879, - "88": 0.28388, - "89": 0.28309, - "90": 0.28852, - "91": 0.28254, - "92": 0.28375, - "93": 0.28633, - "94": 0.28567, - "95": 0.28235, - "96": 0.28513, - "97": 0.27951, - "98": 0.27851, - "99": 0.28336, - "100": 0.27744 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml index 8874f9cf045..3ecd68b9841 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml @@ -56,7 +56,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true - --ckpt-format: fsdp_dtensor + --ckpt-format: torch_dist --dist-ckpt-optim-fully-reshardable: true --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 607d48380d5..8164ca37df8 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -106,13 +106,14 @@ products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - environment: [lts] - scope: [nightly] + # TODO: The migration of custom fsdp causes EP + FSDP to be temporarily unavailable, which will be fixed in a subsequent MR. + # - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_h100] + # - environment: [lts] + # scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] diff --git a/tools/checkpoint/checkpoint_inspector.py b/tools/checkpoint/checkpoint_inspector.py index c62f0ca7417..34afa27755f 100644 --- a/tools/checkpoint/checkpoint_inspector.py +++ b/tools/checkpoint/checkpoint_inspector.py @@ -8,8 +8,6 @@ import time import re import shutil -from typing import Optional -import tempfile import click import torch @@ -21,7 +19,6 @@ FileSystemReader, FileSystemWriter, ) -from torch.distributed.checkpoint.format_utils import dcp_to_torch_save from torch.distributed.checkpoint.metadata import ( BytesStorageMetadata, TensorStorageMetadata, @@ -67,8 +64,7 @@ def cli(): @cli.command() @click.argument("checkpoint_dir", type=click.Path(exists=True)) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -@click.option("--not-ignore-param-to-group-meta", is_flag=True, help="Ignore parameter-to-group metadata.") -def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta): +def inspect(checkpoint_dir, enable_msc): """Inspect a Megatron Core Distributed Checkpoint""" ckpt_path = Path(checkpoint_dir) @@ -142,8 +138,6 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta): ] click.echo(" | ".join(stats) + "\n") - ignore_param_to_group_meta = not not_ignore_param_to_group_meta - ignore_param_to_group_meta_count = 0 for key, value in metadata.state_dict_metadata.items(): bullet = click.style("►", fg="blue") key_styled = click.style(key, fg="green") @@ -153,18 +147,11 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta): shape = click.style(f"{tuple(value.size)}", fg="magenta") click.echo(f" {bullet} {key_styled} [{dtype}, shape={shape}]") elif isinstance(value, BytesStorageMetadata): - if ignore_param_to_group_meta and key.startswith("optimizer.param_to_group_meta."): - ignore_param_to_group_meta_count += 1 - continue click.echo(f" {bullet} {key_styled} {click.style('[BYTES]', fg='yellow')}") else: click.echo( f" {bullet} {key_styled} {click.style('[UNKNOWN TYPE]', fg='red')}" ) - if ignore_param_to_group_meta: - click.echo( - click.style(f"Ignored parameter-to-group metadata: {ignore_param_to_group_meta_count}", fg="yellow") - ) # MCore data section try: @@ -336,10 +323,8 @@ def convert_checkpoint( output_dir, swiglu, process_group, - optimizer_param_to_group_prefix="optimizer.param_to_group_meta.module.module.module", optimizer_state_prefix="optimizer.state.module.module.module", model_weight_prefix="model.module", - param_to_param_group_map={}, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to standard fsdp_dtensor format.""" device_mesh = DeviceMesh.from_group(process_group, device_type="cuda") @@ -386,104 +371,6 @@ def _free_up_some_gpu_memory(): gc.collect() torch.cuda.empty_cache() - def split_layers( - key: str, - value: torch.Tensor, - orig_shape: Optional[torch.Size] = None, - ) -> dict[str, torch.Tensor]: - """ - Split layers into separate tensors. - """ - _free_up_some_gpu_memory() - layers = {} - for i, v in enumerate(split_dtensor(value, 1, dim=0)): - v = gather_uneven_dtensor_to_full_tensor(v).reshape( - orig_shape[1:] if orig_shape else value.shape[1:] - ).redistribute(placements=[Shard(0)]) - - layer_key = key.replace(".layers.", f".layers.{i}.") - layers[layer_key] = v - - return layers - - def split_expert_weights( - key: str, - value: torch.Tensor, - orig_shape: Optional[torch.Size] = None, - ) -> dict[str, torch.Tensor]: - """ - Split expert weights into separate tensors for each expert. - """ - experts = {} - layer_key = key.replace(".experts.experts.", ".experts.") - expert_weights = split_dtensor(value, 1, dim=0) - for expert_idx, expert_weight in enumerate(expert_weights): - layer_key_parts = layer_key.split(".weight", 1) - if len(layer_key_parts) == 1: - expert_key = f"{layer_key}{expert_idx}" - elif len(layer_key_parts) == 2: - expert_key = f"{layer_key_parts[0]}.weight{expert_idx}{layer_key_parts[1]}" - else: - raise ValueError(f"Unexpected expert layer key: {layer_key}") - - expert_weight = gather_uneven_dtensor_to_full_tensor(expert_weight) - expert_shape = orig_shape[1:] if orig_shape else value.shape[1:] - # Handle optimizer states for expert linear_fc2 when ETP is enabled - if ( - layer_key.startswith("optimizer.state.") - and "linear_fc2" in layer_key - and expert_weight.shape[-2] > 1 - ): - tp_size = expert_weight.shape[-2] - rows, cols = expert_shape - # Reshape to split column dimension by tp_size - expert_weight = expert_weight.reshape( - *expert_weight.shape[:-1], rows, cols // tp_size - ) - dims = list(range(expert_weight.ndim)) - dims[-3], dims[-2] = dims[-2], dims[-3] - expert_weight = ( - expert_weight.permute(*dims) - .reshape(expert_shape) - .redistribute(placements=[Shard(0)]) - ) - else: - expert_weight = expert_weight.reshape(expert_shape).redistribute( - placements=[Shard(0)] - ) - experts[expert_key] = expert_weight - return experts - - def is_swiglu_key(key): - return any(re.search(pat, key) for pat in [ - r"(.*)\.mlp\.linear_fc1\.weight", - r"(.*)\.mlp\.linear_fc1\.bias", - r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)", - r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)", - r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight", - r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias", - r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight", - r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias", - ]) - - def split_swiglu_weight(key: str, value: torch.Tensor) -> dict[str, torch.Tensor]: - """ - Split SwiGLU weights into separate tensors. - """ - value = gather_uneven_dtensor_to_full_tensor(value) - swiglu_w_and_v = {} - w, v = torch.chunk(value, 2, dim=0) - w = w.redistribute(placements=[Shard(0)]) - v = v.redistribute(placements=[Shard(0)]) - w_key = re.sub(r'(weight\d*)(.*)', r'\1_w\2', key) - v_key = re.sub(r'(weight\d*)(.*)', r'\1_v\2', key) - swiglu_w_and_v[w_key] = w - swiglu_w_and_v[v_key] = v - return swiglu_w_and_v - - def has_layer_index(key: str) -> bool: - return bool(re.search(r"layers\.(\d+)\.", key)) - while state_dict: key, value = state_dict.popitem() if torch.distributed.get_rank() == 0: @@ -500,11 +387,9 @@ def has_layer_index(key: str) -> bool: # Special handling for optimizer state key_list = key.split(".") new_key = f"{optimizer_state_prefix}.{'.'.join(key_list[3:])}.{key_list[2]}" - is_param = False else: # Special handling for module parameters new_key = f"{model_weight_prefix}.{key}" - is_param = True # Handle dist-opt flatten tensors if ( @@ -521,47 +406,68 @@ def has_layer_index(key: str) -> bool: else: orig_shape = None - # Handle multi-layer / experts tensors - split_tensors = {} - if ".layers." in new_key and not has_layer_index(new_key): - split_tensors = split_layers(new_key, value, orig_shape) - elif ".experts.experts." in new_key: - split_tensors = split_expert_weights(new_key, value, orig_shape) - else: - if orig_shape: - value = gather_uneven_dtensor_to_full_tensor(value) - # Handle optimizer states with partition_dim=1 when TP is enabled - if ( - new_key.startswith("optimizer.state.") - and value.ndim > 2 - and value.shape[-2] > 1 - ): - tp_size = value.shape[-2] - rows, cols = orig_shape - # Reshape to split column dimension by tp_size - value = value.reshape(*value.shape[:-1], rows, cols // tp_size) - dims = list(range(value.ndim)) - dims[-3], dims[-2] = dims[-2], dims[-3] - value = ( - value.permute(*dims) - .reshape(orig_shape) - .redistribute(placements=[Shard(0)]) + # Handle multi-layer tensors + if ".layers." in new_key: + n_layer = value.shape[0] + + _free_up_some_gpu_memory() + per_layer_values = [ + gather_uneven_dtensor_to_full_tensor(v).redistribute( + placements=[Shard(len(v.shape) - 1)] + ) + for v in split_dtensor(value, 1, dim=0) + ] + for i in range(n_layer): + if orig_shape is not None: + layer_shape = orig_shape[1:] + else: + layer_shape = value.shape[1:] + + per_layer_values[i] = ( + per_layer_values[i] + .reshape(layer_shape) + .redistribute(placements=[Shard(0)]) + ) + for i in range(0, n_layer): + layer_key = new_key.replace(".layers.", f".layers.{i}.") + if swiglu and "mlp.linear_fc1.weight" in layer_key: + # Special case for SwiGLU + w, v = torch.chunk(per_layer_values[i], 2, dim=0) + w = w.redistribute(placements=[Shard(0)]) + v = v.redistribute(placements=[Shard(0)]) + w_key = layer_key.replace( + "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_w" + ) + v_key = layer_key.replace( + "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_v" ) + # Store both w and v in the state_dict + fsdp_dtensor_state_dict[w_key] = w + fsdp_dtensor_state_dict[v_key] = v + elif ( + "experts.experts.linear_fc1.weight" in layer_key + or "experts.experts.linear_fc2.weight" in layer_key + ): + # Special case for MoE + layer_key = layer_key.replace(".experts.experts.", ".experts.") + expert_weights = torch.split(per_layer_values[i], 1, dim=0) + for expert_idx, expert_weight in enumerate(expert_weights): + expert_key = f"{layer_key}{expert_idx}" + fsdp_dtensor_state_dict[expert_key] = expert_weight.squeeze( + 0 + ) else: - value = value.reshape(orig_shape).redistribute(placements=[Shard(0)]) - split_tensors = {new_key: value} - - # Handle SWiGLU weights - for key, value in list(split_tensors.items()): - if swiglu and is_swiglu_key(key): - swiglu_w_and_v = split_swiglu_weight(key, value) - split_tensors.update(swiglu_w_and_v) - del split_tensors[key] - - fsdp_dtensor_state_dict.update(split_tensors) - if is_param and key in param_to_param_group_map: - for new_key in split_tensors.keys(): - param_to_param_group_map[new_key] = param_to_param_group_map[key] + # General case + fsdp_dtensor_state_dict[layer_key] = per_layer_values[i] + else: + if orig_shape is not None: + _free_up_some_gpu_memory() + value = ( + value.redistribute(placements=[Replicate()]) + .reshape(orig_shape) + .redistribute(placements=[Shard(0)]) + ) + fsdp_dtensor_state_dict[new_key] = value elif key.startswith("rng_state"): # Skip RNG states continue @@ -624,15 +530,6 @@ def has_layer_index(key: str) -> bool: ) ) common_state = common_strategy.load_common(input_dir) - try: - if "param_groups" in common_state["optimizer"]: - ckpt_param_groups = common_state["optimizer"]["param_groups"] - else: - ckpt_param_groups = [] - for opt_state_dict in common_state["optimizer"].values(): - ckpt_param_groups.extend(opt_state_dict["optimizer"]["param_groups"]) - except: - ckpt_param_groups = None common_state = flatten(common_state) for key, value in common_state.items(): if key.startswith("optimizer.optimizer.param_groups."): @@ -644,29 +541,12 @@ def has_layer_index(key: str) -> bool: ) fsdp_dtensor_state_dict[key] = value - # set up per-parameter param_groups - if param_to_param_group_map and ckpt_param_groups is not None: - for name in list(fsdp_dtensor_state_dict.keys()): - if not name.startswith(model_weight_prefix) or name.endswith(".expert_bias"): - continue - - assert name in param_to_param_group_map, f"Missing param group for {name}" - param_group_id = param_to_param_group_map[name] - assert param_group_id < len(ckpt_param_groups), f"Invalid param group id {param_group_id} for {name}" - name_without_prefix = name[len(model_weight_prefix):] - fsdp_dtensor_state_dict[ - f"{optimizer_param_to_group_prefix}.{name_without_prefix}" - ] = ckpt_param_groups[param_group_id] - if "checkpoint_version" not in fsdp_dtensor_state_dict: fsdp_dtensor_state_dict["checkpoint_version"] = 3.0 # Save modified checkpoint save_checkpoint_with_pickle_protocol(fsdp_dtensor_state_dict, output_dir) - dist.barrier() # Synchronize all ranks - dist.destroy_process_group() - @cli.command() @click.argument("input_dir", type=click.Path(exists=True)) @@ -680,6 +560,12 @@ def has_layer_index(key: str) -> bool: "--oom-traceback", is_flag=True, help="Enable OOM traceback for debugging." ) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") +@click.option( + "--distributed-timeout-minutes", + default=10, + type=int, + help="Timeout for distributed operations in minutes.", +) @click.option( "--output-optimizer-state-prefix", default="optimizer.state.module.module.module", @@ -690,21 +576,15 @@ def has_layer_index(key: str) -> bool: default="model.module", help="Prefix for model weight keys in the checkpoint.", ) -@click.option( - "--param-to-param-group-map-json", - type=str, - default="{}", - help="JSON string representing the param to parameter group map." -) def convert_torch_dist_to_fsdp_dtensor( input_dir, output_dir, swiglu, oom_traceback, enable_msc, + distributed_timeout_minutes, output_optimizer_state_prefix, output_model_weight_prefix, - param_to_param_group_map_json, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to fsdp_dtensor format.""" if not enable_msc: @@ -744,13 +624,10 @@ def oom_observer(device, alloc, device_alloc, device_free): ckpt_path = Path(input_dir) output_dir = Path(output_dir) - with open(param_to_param_group_map_json, "r") as f: - param_to_param_group_map = json.load(f) convert_checkpoint( ckpt_path, output_dir, swiglu, process_group=dist.group.WORLD, optimizer_state_prefix=output_optimizer_state_prefix, model_weight_prefix=output_model_weight_prefix, - param_to_param_group_map=param_to_param_group_map, ) click.echo( @@ -865,109 +742,6 @@ def modify_state_dict(input_dir, output_dir, op, enable_msc): ) -def _compare_two_checkpoint(checkpoint_1, checkpoint_2): - reader_1 = FileSystemReader(checkpoint_1) - metadata_1 = reader_1.read_metadata() - - reader_2 = FileSystemReader(checkpoint_2) - metadata_2 = reader_2.read_metadata() - - keys_1 = set(metadata_1.state_dict_metadata.keys()) - keys_2 = set(metadata_2.state_dict_metadata.keys()) - - click.echo(click.style("Comparing checkpoints...", fg="blue")) - - # Compare keys - missing_in_1 = keys_2 - keys_1 - missing_in_2 = keys_1 - keys_2 - common_keys = keys_1 & keys_2 - - click.echo(click.style("Keys missing in checkpoint 1:", fg="red")) - for key in missing_in_1: - click.echo(click.style(f" - {key}", fg="red")) - - click.echo(click.style("Keys missing in checkpoint 2:", fg="red")) - for key in missing_in_2: - click.echo(click.style(f" - {key}", fg="red")) - - # Compare common keys - click.echo(click.style("Common keys in both checkpoints:", fg="green")) - for key in common_keys: - meta_1 = metadata_1.state_dict_metadata[key] - meta_2 = metadata_2.state_dict_metadata[key] - - if not isinstance(meta_1, TensorStorageMetadata): - continue - - if meta_1.size != meta_2.size or meta_1.properties.dtype != meta_2.properties.dtype: - click.echo(click.style(f" - {key} (metadata differ) meta_1: {meta_1}, meta_2: {meta_2}", fg="red")) - else: - value_1 = torch.empty(meta_1.size, dtype=meta_1.properties.dtype) - value_2 = value_1.clone() - - dcp.load({key: value_1}, storage_reader=reader_1, planner=DefaultLoadPlanner()) - dcp.load({key: value_2}, storage_reader=reader_2, planner=DefaultLoadPlanner()) - - if not torch.allclose( - value_1, value_2, atol=1e-8, rtol=1e-5 - ): - click.echo(click.style(f" - {key} (values differ) value_1: {value_1}, value_2: {value_2}", fg="red")) - - -@cli.command() -@click.argument("checkpoint_1", type=click.Path(exists=True)) -@click.argument("checkpoint_2", type=click.Path(exists=True)) -@click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -def compare_two_checkpoint(checkpoint_1, checkpoint_2, enable_msc): - """ - Compare two checkpoints. - """ - init_process_group(f"compare_two_checkpoint from {checkpoint_1} to {checkpoint_2}") - - if not enable_msc: - MultiStorageClientFeature.disable() - - _compare_two_checkpoint( - Path(checkpoint_1), - Path(checkpoint_2), - ) - - click.echo( - click.style( - f"Comparison between {checkpoint_1} and {checkpoint_2} completed.", fg="green", bold=True - ) - ) - - -@cli.command() -@click.argument("torch_dcp_dir", type=click.Path(exists=True)) -def print_torch_dcp_in_json(torch_dcp_dir, model_weight_prefix="model.module"): - # Use a temporary file context - with tempfile.NamedTemporaryFile(suffix=".pth") as tmp_file: - # Convert distributed checkpoint directory to a single-file checkpoint - dcp_to_torch_save(torch_dcp_dir, tmp_file.name) - - # Load the state dict from the temporary file - state_dict = torch.load(tmp_file.name, map_location="cpu") - - click.echo(f"torch dcp content: {json.dumps(state_dict)}") - - # Replace all "module.module." with model_weight_prefix in dict keys - new_state_dict = {} - for key, value in state_dict.items(): - new_key = key.replace("module.module", model_weight_prefix) - new_state_dict[new_key] = value - - # Convert state dict to JSON-serializable format - serializable_dict = {k: v.tolist() if hasattr(v, "tolist") else v for k, v in new_state_dict.items()} - - # Save to a JSON file - json_file_path = os.path.join(torch_dcp_dir, "param_to_param_group_map.json") - with open(json_file_path, "w") as json_file: - json.dump(serializable_dict, json_file, indent=2) - click.echo(f"Saved converted param_to_param_group_map to: {json_file_path}") - - def init_process_group(message): rank = int(os.getenv("RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) From c22c2aa5d0a26ad544b2d4d48911eadc07346f05 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Tue, 28 Oct 2025 22:15:48 +0800 Subject: [PATCH 082/248] [Was PR1912][Dev] feat(moe): Fine-grained activation offloading (#1969) Signed-off-by: Hongbin Liu Signed-off-by: Hongbin Liu Co-authored-by: Hongbin Liu --- .../fine_grained_activation_offloading.md | 31 + docs/source/api-guide/index.rst | 1 + .../offloading_and_recomputing.png | Bin 0 -> 332427 bytes .../core/extensions/transformer_engine.py | 12 +- .../common/model_chunk_schedule_plan.py | 9 +- .../core/models/gpt/fine_grained_callables.py | 23 +- megatron/core/models/gpt/gpt_model.py | 29 +- .../fine_grained_activation_offload.py | 609 ++++++++++++++++++ megatron/core/pipeline_parallel/schedules.py | 14 +- megatron/core/tensor_parallel/random.py | 13 +- megatron/core/transformer/attention.py | 70 +- megatron/core/transformer/moe/README.md | 14 + megatron/core/transformer/moe/experts.py | 65 +- .../transformer/multi_latent_attention.py | 40 +- .../transformer/multi_token_prediction.py | 7 +- .../core/transformer/transformer_block.py | 10 +- .../core/transformer/transformer_config.py | 51 +- .../core/transformer/transformer_layer.py | 56 +- megatron/training/arguments.py | 11 +- .../golden_values_dev_coreweave.json | 344 ++++++++++ .../golden_values_dev_eos.json | 344 ++++++++++ .../model_config.yaml | 139 ++++ .../golden_values_dev_coreweave.json | 287 +++++++++ .../golden_values_dev_eos.json | 287 +++++++++ .../model_config.yaml | 134 ++++ tests/test_utils/recipes/moe.yaml | 10 + ...test_fine_grained_activation_offloading.py | 187 ++++++ 27 files changed, 2736 insertions(+), 61 deletions(-) create mode 100644 docs/source/api-guide/fine_grained_activation_offloading.md create mode 100644 docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png create mode 100644 megatron/core/pipeline_parallel/fine_grained_activation_offload.py create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json create mode 100644 tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml create mode 100644 tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py diff --git a/docs/source/api-guide/fine_grained_activation_offloading.md b/docs/source/api-guide/fine_grained_activation_offloading.md new file mode 100644 index 00000000000..969098263fc --- /dev/null +++ b/docs/source/api-guide/fine_grained_activation_offloading.md @@ -0,0 +1,31 @@ +# Fine-grained Activation Offloading (collaborated with rednote) + +Memory capacity is more and more important with the rising of extreme sparse MoE models like DeepSeek-V3 and Qwen3-235B. Fine-grained recomputing reduces the memory footprint at the cost of extra recomputation, while offloading could utilize the host-device bandwidth to achieve nearly zero-overhead. Fine-grained Activation Offloading targets at offloading the activation at the granularity of specific modules, so that we can calibrate the amount of offloading activation to maximize the training throughput. + +Currently, the supported offloading modules are `"attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"`, which could work with fine-grained recomputation to release almost all activations of a transformer layer. + +**Features** +* Support PP=1/PP/Interleaved PP +* Compatible with fine-grained recomputation +* Support FP8 +* Support MTP +* Support mixed dense & moe layer +* Support A2A Overlap +* Support CUDA Graph + * (Temporary) cuda graph scope cannot contains the offloading modules + +**Usage** +```bash +# Enable fine-grained activation offloading +--fine-grained-activation-offloading + +# Specify which modules are going to offload its input +# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". +--offload-modules expert_fc1 +``` +**Compatible with Fine-grained Recomputation** +- For modules with minor perf overhead like layernorm or moe_act, use recomputing to reduce memory footprint; +- For other modules, use offloading to reduce memory footprint; +- Make sure the offloading/reloading could be overlapped with computing; + +![Fine-grained Activation Offloading and Fine-grained Recomputation](../images/fine_grained_activation_offloading/offloading_and_recomputing.png) diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst index 710a7caf4de..ac6d7cb0b2d 100644 --- a/docs/source/api-guide/index.rst +++ b/docs/source/api-guide/index.rst @@ -22,3 +22,4 @@ API Guide optimizer_cpu_offload multi_token_prediction tokenizers + fine_grained_activation_offloading diff --git a/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png new file mode 100644 index 0000000000000000000000000000000000000000..6c8afa78bb180a0815aff02693690b864e9b01f8 GIT binary patch literal 332427 zcmeFZXH-*bw+4z>P!Lfeq7L@*-C?8~5C;#x)#z@6hLxX~se14JQ{F&<%XMbHn z{-QX;L~;Jla|#O8GtB>->zv{K>l#W5iZBO?bAMf9Og{cPRsHqy&+%;bng8A~oAU3g zFBfK?{rmjFujimh(_@$-A1=L8e(p&@A>8)s`wX|W@iYa+BMOzLkM;b{Y|c{E^DAT! zcOzcEet(CTdP)Ck)tyGC_Sct|nI?BX{XH#RoPUn2RrZz8KNTrPak z{?_uwgRA#0UI#ore?V%T8BLe-Y$_=ZEJk?^1fl&QcD9YhB_$x%g2uqXRMC6qu0EnT z^Y6>QGWZt<|C+(SRPZko{>y~_GU2~W_%9Rw%Y^?j;lE7y|6?W$b~q=NX=~l3@V{{y zTuPZ%SnI?ssCy#nG*DHsshXO*>~>Z?pu3UEUQnOyH3j9xJO7_A`$AtvPKK6d26K`! z+`4Z$?a?a3(u!t?!%{z)E0r(axeuq@JiF%mV=c967$&$X@v($e9!WoaLk0}g4XbbP z{pruwLY%BzG|fBUU~vWYfi~s0y(p*c4_X*}wN34K%F7H9qJVLHZo~nRveQ32i?6jn zLesr;Nxd`q`=Db#J%hNusTQooQaS{BbJ;h1l%YVg8KRRl^z7Sr5pM(}`3=}#9ZKBy z%=GlwOeSGNB%iiXO5&O2BSNR74ytuGw2oYdT16G&gfI42P!L`pnbkI|@q3l3Ne-?! zDyhoCu#$PvW5Neq1**@k{8ICsf=8Q1N*2zVE*nq&he$76l`!&dwDK-pWd0EJP=_0= zmXd=bgmPY|{O!)06#8R6={x6Zsn$}5d6E0GZ9xY6q*#5|T4kFfg#dls9k zpl>snd{v9w!^R~Yg_;=5{>E}P+COxuF??JdKk^M!Q0NL-7X~#cF=c*swK#Bg8od>} z{;}mIrR0#Lqz`|WREH_s2T%BeYEE#sJdG}h^5SKIwxjXk_En_CclV>4?sZ@gtL0&q zNXtF|u79GstKUfgR2ax;x`WSO`V-E<>6GbBt7Ex>zfnB(jh#{Sy>p>gCk+z1ZfoDN zY#e+43npJ~QW9;N(}up5ZWsH%n@vamIH3xGMrDp^yr2PC6x2J_jd06yQSy@{pMUVY z&kV`lQ=%rJJ|^*P=e|2Nqyt>y1X7s|p#cTFayiP9nT19Z2QuTh=OMs*?rDwpK*JtE zFW%loUr|rwKWA{UzfTesp0ae7yd>T>$Y*9DaQAF8chs-v7}j_(64RIV-bpQlaE#l}DBJJHx)9K66_;g)-*ZwZEc|V4ia)G(H5Mna3>+g~ zTD^SolekVFsBA}LN+wxn({gk6RU*_)O0l2&OUuUbm!-7u=OijFi>ERMyVji_dPome zE%MXbVobV#mD`Cb!Z-gHodt(5`n^4O@&+WN{fh2ud&L!sJNJ>4rbf zCcHo=sN9lWe7t(Tk^zz>ee|*iX;WCeekS2AdT`kKzc>6#7B9-rmlrn#hP0C&B)lcd zvp*D8dmo#SAF#s4){Wk$az0V8~>w+nrVLw9l$D1Cc ziD5NE&&w~fl=%JpUSgiFS^khCBlihV{}5Y84XUr2)iG78m^V<*zl+XHzmfGfeEv~( zw%Ni!?{VHz-cr z*y+ho_a0Ebipj=X=x>%C!l!mx-Q2S7FRAIRjfaCc;>#Gx;y(8Bi}_1deUt}xg-mw- zBx8!;afZW8PU25>+DTKasTGfFp&_3Aq-7~Fe5|^Nnpm-PAv{7?=@ZNER{oDulgSo_ zqua7Xomzvv``53FZVL&}dMOpSO|g;9?}YjbUGlt5UwX*-lMg-1XBFg%nhD%N>>j#% z^;CR8rChXpZ4gJJc^GnV4@kJqq(s;*vORGM%QwxfbkjRrjPjI@M17jW09zRXH^%9T zWgaaqtMVUp#mfWgbkd8HIky;Y9VxHE`E~LI0=pZh^Lt*M^PqmU*xWX-v#e=P+<)h?U!T5ayr7iY6v(5gmfUB;pP<35dhLH- z@~7m#L)QU(|GX2te!f-@Ku4qJS-TfhwadkjbiQ|1#lP;#%EypD?V~`^v(8ntYkcx0j#STyd@)@5ZhrqK$oqBO9nLnW_WMh*e;d%n75bg?Z~@A3u!28S_e67d^^v5V z#y-tqY;lj`FYsKvxLMeYI=@#^x{vLmg9Vi^-CscG6%!~rY3TFaMH3{Sjue1id{jQ= zfj3VXbbj!z)A+dff5@=%HQ}>GD#fe5x55m+=;SZn{_oj>N3fiD?xTM=@XoN-!ll>| zIfFh>Gp==gtCi_zTBORK_pJi>MRPI+Cb!cu%PH=b{19^);1g}ZbEuD!##e7$T5Y^> zB#*o%6;JkGR|c}V|F^Y!a_nTKrok20?92AV!{hHV|GzxVFMRnu3IUw?`@Q6-wla~F z>GPae##L}-BI#NSydLMjw>2)>MpyLV&(+v+eW~H}e$>8lg8r;@FP`~Rca{FS|0Mui z*+xu_Tj~EICD|gFM%0PxsqGL65Y(i6tMdPl@GtSyo8G~dRNTLNA43UIb~5a|626tR zBo%HwFYRq{8U!!MAR8 z$g?A?Rg{@rj-TF(<^pbvK=8?PTmK*CS}=pfQ&=j*f z_(sX17HT3PS8x;342c4UJ?6gemmK&T3_kxiow`l}4rWAxCs@U*{B6iC5q{vhxMk>%Jb( zBvle&^Myj9a|P7#m5@NlPo*nD|?d8O&)KS1V6__cx4Fmpu$& zFMFGYaQU&tZn;4g4+k<-WnB}7f##0$Pr zHH;U1)VlY~=&Snqex&Nq^YG<^4fJ!XpAjulQ!s;Pv(pQ;nPoeoJG?=((N6cD+Ub++24%9o$uXvtY;=qR2 zm(g|yg#cAmRN#g=Lws^&uhNF}`>pomoZZNNnD}X1XcM+F*n#Jh%a#Ct+G#mBa;NVo zjv@X*I@pk3!S?jdf6nXY6PcmQ&|4P_*w^XI9Df~Z=qr4Z)GnSA_`^qIySXJ$z%>ug z%tH>_)0bQpE_N=4R{=&jacA7WmJ}&F5GX(*C~!%g8t5Xx1UZv~yViL({4)box8KU6 z;HMSaL));Pm9$WxsN@&i4??Rn@65FZrXIpfG9Jch5)bPV>n zE+SyB@rEiDuV+OfCBoqlZM=89Sy29|k!S|fSIfA45)>koO&hzNuS$1OFQ|CW{A#yb zA5$O9db1BcAS4zf4VbiW*Xqn*FptrR8`5gcK8#$Vrv|mf@%dZGRiGxpQDVq<2W+!ig}))>)`>$Xfg?LEQA&Tq7*6jC&>;|Tu{xy$mvXo&jf&b#O~A z1^!co^QMRr28$@b_6L$_0v3Oi?-hJOu7&3_>PwSTGuMT#)8sw5sW?y{`l5hFz2(l$ zL744&EP7^(x~)6yJI+6E9?tNb#hU%Mh_X%QB1>t?j)s=YZULq3o=Gq+u(ehzJq$>- z;lJ{fReZ$GSG%Sm@3L>f={^9KR;8Eg>w#*Pel@6*Tg{`_mGM|e+c_sWX~m)plDK58 zU_U4C-^w%BH^RtOvch~Y)pg{e&%HQqGrjSnN{k2zfIiROv}Q$>Z1MrjK{|2L+$0nC zSj)d`sRwW_gn){YkG6Hhw@#toP&V6x`U!=f^CHKDwo>@ta8?~*!_3CT-L7?_3e9+ z58V1(s>Etf15Kp4!eUO-tA-jnR%DL%o44K|h^(y+w=T;%_mh!V2DO@ZyWCRFT6*bi z^sF2mYhZ7U8#ZWg`gqHq7e?2!m8s7}cwm3|wA)IK(_glyhVBJ{4oGL~&FVnTwh{Ij zS6Gw4RanP7q*HrNtt2M%7L(zIyOSuPL-z+_M*P|o73hWKg_g`y(`zQc={+rEeN1Yh z=iYJ`(~fWmL3euNJuRuU3mY&Z9-^LUTr%czYN${M<(rtOa5|ylIawI0<{67NJ$2S+ zwnAP+>YryexASMsdAXNFH>Cnacw|5lZ#N9iPMMZ0=56n*wm^$KP=LR^_^%l5#-rB} z5q&h0W5K$lKTw0|5ezWR)UgwKeWXiLTI|;B8~srshN3NH0pIYA0}b!If)iDw3$b}xj%QZ`dC*fx0NjZ~Z&J^D5pmiu-!>q(Fe zARy|~P{SkpBdH<6GW3~x;8*v#=nA$JsTKv%Zzox@SiL=io$jL=z&hP4MQVWULSzNcajlz2vz2}l3YyAbEH{d)(x~P0iF!4= za#^ZjE zLSMw5tFNzbu9jz&ApU{6QWnE~42h+QLNQbL8m_1kYZ8YKJXTtCMw4-&ou>QKHMc=8 z=$m$HV@7~_7*;@4z@t^4RD=02r7Ot#94|f_qTUE1XGmgu1^lw`ad5LKtKR0>MghzW zfx;~bYH;uz`eeD;z`IK-i;lX~!m4-banvR6sp*$lkKVKlU3@M%B>tH$Lu~xqiIWzr zR{bM1X9KQet*H7_8Lx5wL70QkPq7CL52Z!SKon_lT|xXjFOLwBZl5UUb>AlHM>oef zG8smRa()15A&_m(KE2@?R!5JQoV<$N58LJBzD*Q-W=>t)51I4>?}wEJCejtgR|+5* zG0Aug)<>*GXf^#KMuIQNoM(Q~ur{DxE6@4OzQ+~qUWiq_s4IBj;BEy&W)gPVQvtNs zYa@MB6mO-yc_v}np)2&KlHh9c%4+Y?_{x;o8g9tl|I~v8eU_IPGt9_qIlo^~wYZTi zllyIX!sxg|zB}(gBxnu z&kRA?6}D>wNYvL~ig)WW8v2iSDcd$~$qECTJK%-% zam~LMuiGduy>fT=e;FqA>g2rJ&oqaj5fRrfGwJ4-nVuupP7&4@W?kTl{8fu2Zl0sX zSnUGCq$4+8H1s9V_(H2ua_SSt8<;ZHXkC$-yEe%PH?=z5P++Y}5tna@Qne`WeHlY4 z0O9p^pA1rN;g)eLM41PFtT&GDQ=ZL}$=>_a9dt<9bt7E+I98Z1DIZB64_fI6BkGq` zC`=8HmCJPqPIzsX*sF1P2Gv6uhXKDbrdw^{&Mp{c+4u6%cSVd(CxXa^ zpOH)ly$lYM_?#2en;loX>Ims-g;q@cI57S$$u;kunDjzrelf{JcBUz=_N%`H6}JQY z!HjWTtL^azL+YW9@p0ThWK367u{mSKX6VxmxwK!Pc7<>jcXu(!up~6KIsxR{$f%seBLdrx==r6KNT>2j^}-nUB(cvCHh5HkOjPP>-s_z zHembF=uz~>R=}*R^^)h|GL>Tq>nza6Ng+$*Dagb`S~c*IO+U&g&%$t5N?3@{tg~(5 zI4yW9`tQQoui%*GPFrW^Y6#0lTgv!AHO9hK6Xo0b9(z$CE4s#^>nJy0a%G@D<1Qt; z3=|N==>mq`7Yz~Q*BtHHNXuwx72zPYkEew9_zZ3AJEY=7Q<-ku6pM(6*gQLheD8oi z!^2n>YEDx1d(=tzG5JQ!jnJ38aNwosgpWmJmRy<1N zl6(exeABUN<<*#-@nf+uCx_`5>nMu6E%8ueOmV=veEGczcw#GLG^W|ZUkuEExiy3c@;1EOfkE<0H+NUNw(Lg^bNb)B&>XclmLz#9uM37c&B`XpwtF=61 zFe19TZAYUqpN|#OYPDdz>7R(aHWLZUvKsi#j%+Xi?+%X2RlE&k+`u&}{kJNRUDw9h zg0OyiuJ>xaXKo6yW+97j=F$Ou=MQQUZ?xOq303Df1JC+1-h-@gw*rPt?&T}TJgjyr2hozW=Dx6Hu`mc8!blM_(WPv~N)_ z_}T8`PC5Yb{7sT&6jbu0oLZ>B3{aASOSN1Ib224OY3Uvt(jJ7ojrWS9K1#X)E&DRE z9_|U4<_(fRoXSH6u4Bn8<|PXVC|=U%39&N@1wWF;i4b0!6i#_>=~r~`LM8~MbUct^ z30-&n5WFrV+Y;Dm+}WJpW$qP)APbsxvt z`qCRMQ#j`G#X%9#O3umcjU6wm0d5kGw~qW^OOW>uy>fbwq(g>wPgsxnJF@@>>@<(p zbwc1=P;Hpo^^TGx|FbVCwtDIbmgO$AK3%?Ulk$BVAwFN?@3eQ z44|C-bPu<&}nl)|gL&CXuf}%L$PlltX(P4U=6<^g> z51tRL*v)_Q4D3uEO&YK7@ev0Tn|m8{g%fA49eFC~l7r&b2O^>k&V}Xq&1xcpuy8z2 z(946S@CXKMrT921-^+V{}pw>E=RHCms^t}LA@Z@?aU zr{dqKS2@NENt@@?8$ao6Eh2J-JI)NFKhke_#!mW=kQrJ1Mw$-@3(&ur3is92{DQJ` zQ-b`wFH#0(^G*uj-*t}iJ~CX9%RCPKGQ3x`M=bOZGB~{Z+Nd#~x>?^nP<~)2^#%vzEchhCMdy-RZm)EWy>0 zlovkSki1VU^~?AJvf@*su%#rpo1Clf&q;cm=hCz!_5FCO5aWF>y3hQLVZPCPJHp@w z@Zcl-4KUB%goN&hF>Dw^tLlHo$y5;))Wp zdD7!$V4=MUd=>)%DU@PbQny+4#6+-x2FD+F%4L@~oXRqe9O3(#I@Pj|3QD_HPDjE3utqKI zlz1zr&E`T(NQeVom!Pz^8?;&=^Df)+Yjv21d?L}NVbV)gHu0uyY2UnyXFxSubx?8h zvfdygT@$Cpp^Wac@lo6ccz;$(&wOLIri0&y_n0riwF zCbROvSZj18w3tx&C~e5zQMvDnPTi^w9+Q{psTsJfT?Wb9euKZ{>^9!of%psqhgHmk z`3bl)^LwB2dBY!k3oxvB=u_UT4DR;f@jlH5HnbXu!-cTFe6yx^WMs?nO(_kK$9r;u zwWR=p^ZxSAn>D^2BB4}z<;AwApiT?`cB+|x!E~7E6rs0nr1rq@VPw`DS!L;`6O%3K zpvIp|z&@xnw$QbR`7R-@o*qpoJ<0lLv8y6AXA!K%bQ>d@F-y!XrxdEC^r%6>^knlw57_a^t6uDNv=i@I+B4AS zcN&NcUn?~t05boR(Xk1+kH9CYE$dXQ&$!mI9p!l zz1LEAZJrzCBBcsYpO8KJAwFF2tF|}<+d3;Y+J2f~>Cu1$5uUVx^2*QAuQq2QCL)fj zyNIuQKZ=5|IL%hPoO9TvH^T@%zK6JZSo6Fyp{7E6R>QwyxUEM_3qnlk^-2jl_G-@3 zk_-h^%jxtH-iQWOx?qP0wULa>EGEFZ7`+9WRwAG_z=U%iB%`CGiZfv!_4CY)i;LmN ztbqwa>jtKmFNOE4${ih*@Q(VF8XAbq8gXbEs82{A9np-WCeq|OXO)w5I+F)sNvJ!R zrhUEHT)WeK!#zzwCl|S-NV7sMStRSm-Rna;+TH{|6AJ?@zKEQ$H3}p(LWPdfJ*PaW z+-Nj8w%@3c>)htDqqOn6IUv*}qD zTA3ruigt04gIq0ixrXzP@NnU9p~++Ttpt8CQjv@}_dS~9;ViA6cR!;092}_zs{*TG za=rA}yK8V>~Mx23JoE$xKw0X#nvPgOF-&Ex{b>OalH7$By z*KNu(P)CTrwL7FhJqCG5Zr8E>W|p6>D0-}Mur?Gvd_ZjAP2cI2yW{MDA_GhZ_r^`t z2$#bKxpUu}nTc?g7v_DkfA?Ktb37GV=k`zgB4YU0))Yw}*dANLuEy?YZicAGWQO{nndT-~_-v_cqdv~B5Aju9> zD_gry6zsvCb6eNB^p+%jmP)+^*CJlRsAS}RG)(-6-jUW2tad%53{+(zLq#jFuu({N z)@2`}*sETmUNr4dI%{=WuDvH|7wt(^k@z35WYzQiG%Y1ZVBfG6E2oMmK*{n&nCEJ( z)rxk9VHMpEFOKs019pVraa_#fBJhh*9D7oL`rdbomHm=G^SdoVnN>VMy6YG&qD~JQ zJLKXUG<@4ihnQMC(FG98=Qt7?=Msm3x2KZ=QxfKe4rc`UOv*N8v2r#UJ}E-HjdEW4 z8XGmS>nDx8$vde5=2NRVN%=jl#lfv~cUC;#@71|5HM}&khsR9~O=!C;XP*@j8Qb=b zvZ|qr1dDHYvhviLMb}^kROS8Z=g6g=2PPHFd6@k!FH&3*>;Gu__;U{}p-@a_66SKg;viSNC~njQ|Y&W`4z z3;&p;4K%;q0MwRL@owx0oMH%UTkCexaO=v zQ};Yi>N4YTjjU9n{b-{LvrlL*J(t9c$TBsVJZT~NE{*2P>OU?$9rZ~!a0$}4Hb!xS ze7oFB6H^mFrQZJQq-k&*-_$8>0Ck%0enz*7_3_MKBUn=4T(p8W4AG-6vKx@LKV ziD?N?^<4F&&AQ?pLpFAqr$=+;9=YPyh4)aBKA~$O28&dBmPLK%e;S#|uMpD36U%S& zeaSu|N4$C4^I+4J^oK51%%gIC*IGLY1QJ$t@?78ad0vHx=VN0|75$#aPMf_@N7tEX zfjsYlMoTisBTb&YBPHIKEv7c3e6 z%DV~l+(^@zKn@k}X=?yqY&2B)FNE9yiJyPE9ns@v!^IZUnoqKWx^2?OAG& zTJ-S7%PjHQ{lOweCcr;?o8*=ko)7ypd)Bi-iZA#niQd;h*|F?kS)CqzwxdBDt?TwA zZC(=hK0b$%bw6l13w=4SDOA`l9i{nYnAijVv>y`b5>T?E$C02*pj%@o3{~CEJ8w)i!odBJsIv+YMW;#NS{Wl39p;%znTktc}2=?HYFHq zt+@m{M9KC|e{GFboZ47YPO06LpAt?CC4eADaBlUFmW(S`^jD!7wv;vQ&|a)%Tad`k?lZ;MI9wGSjc}@`*J!Omfl+1 z@2di0Kw-WX;FAa9Voak-LqNsx&&H8I+jWfW zgEejhY^NM4Tsy3_6+u9!z98RU6CpWT@z1K9pLq;WwYX%1>nBupc~Z?M7Txrfs2Ve zuK=MYFQjg;dfX(u#x{;_pS^wcxpJ%DhyAq~k}epk9e?PwO{de;qD7l0FP{?IjOUqW zq?Tu-!5I=G0ASpix-ma-ooWtDpojU*T`w6Bo&Mp?1JBM~HH^?0S|%57&q>rD!DSC* zB-9zW$9+-@h^G#lI7?zP-DI4QNc6gfF~Hr9U9x!Aip0+J5QZKyAZhi2#8(?;L>!zE zNQX5JG1uN!ug&=~-Bvm?o(r(Qtx^)DK0Vx3OP7hq>^P{|*c6eKncz8c;^2%15~9{t zzYRkutxVwSO3^|Hf6#^1pGui1E1e)(6``*<_*}T$%7M3SrfTC*8#47op4TG7vg^MM z3L6^-R)FR^HLuhfMvX&}RM0A74l2;1YNx}M{sisrT^e@?2>q_uvrxtW&`Y0V}MYg0;wyi_Tz%wH@n;VopYNJ=k}&6 zaZY_kOc3RJp!fmUyPg;Kvha`c~k(*R{`5IQzt(wxHoZY%BHiE0z z(+ZHUtaIDzV<7oZa1YyLPV3$AR>!mt0EZ zJCTQ)l4qTN#@iX9i*UCV;;vbv^N>)o&Z?!)$G5c?IrY&g45Yzmm$g>i1}MFt+Qm z{|JxmqJ_BemKRv`MZ_a^;wPDTmJI3D&5Q49T)J&o(tNUlNK_F7-nl|Qjod#T+%~jC zgRK({tc5n@+A@NrD?Uu_bkK>jh+scFcTLh2_bK`Vy)NqW7(IK7)5rb^D-~zF18=3( zXxr46Wk-yEn&3{k@<=&d;pV4fS7713^D3D40|;IEJ%+=q#WnX@ZB*nzhB8s)$9#BX zbK0JGMErZ3n5{!5CydAu51nh|DG0LrS`{w(;#%!0M7Ymtbq4M^*s#eID4q}AVG9b2 z5r`S@A8Me@A4HH9fnzRGi}EO6M) z#rSdO-Im`v%(Cfp^|=fwr;xGe9j_hk-5%OCkWgoq%SP^<3Dff^@YGvW39kvy8NEvS zanCRf{BTPEV6FuYUb>=&smQf{HfHko{{lej&=X#-av*(yU!saHW;{GZzJ@f)q&Q9;( zmzf$TbP-doyp=pxQ4}}z1;256711|!&v<@t8J6p9Ya(1eF5JwtX*23QUD~DLA7wU% zb)W6)AgWn{x?%}9t1!}+irJQp4v%nrySNy0Rk8Dl}3@@AVK`K0tYFa z%U<-!2+vYiN~a0_b~z!`0|}okHm`w3bY+grZ&_)^xhIT7Y5k{*BbN>@7*RP4q)sMP z3M#^)kBrQ=4`+pSe#-UwmMS_nxo<5x_E%kS9F`gXH2?M7S;*%8vW)AXT zl<4KO@$4v(99D)_dw2QHs6@+xJVW200TWB$ht{XlxHB-#b$ZeF5u;omP4~ijI1su! zmtyl-@=kQwY;-EYGE%0y>B@^|6EHvHEEz?h-}8if&F?Ox!B2;~K^70JJH!c&+P!Ca z=aQP7xmmCGrVG1WBU@ol`K?=DQ7rOuj}B>^ZK2f zQ|uhN84{{lmrHnc*I>L&$`!CxDY|J7uzxv@nBh5i{QV+)sA1r^*(0@@tjmJkb>VGC zw>L#DtfRkz6z`&%B(kFN29hT**9oz9Kf*Tuj8(=rer+IEd(dQV(@#SiEvE5m(S7To zE*zlpNUmC&+ROOerC@4{j_ImAb61PX*^%#@!=9m^x(6wVu!NMC| zuPdsO70cHOOxb@vESr2&0rM=88?{bPpp5W~uF<(ebRZk5l@umY=$h(pPHkBCHI-gK z`vc==l#=iT4e4c*B1&JGNKu#3?_ZQ04>L^m@>yk$9z8AHcj~_KZNDHR<=gHl&2g2g zIqG(C{bx(xP7Od#ntp<;?|bm0V>Xv*s&9hBmuoZ$ptpzl#DVX2FZI>t^mXwa7CBy8 zHbeovJDzh^dh?INL0*O8J`em0S7&qHcizCtv*%Fh;S7lT48)43In`mp$P3hmNyCt< z-I&x)SZ>K=swgai8;&-&DR|JFwF?cSVcoZKiPU(djzAij6b#^nQcqPDEx26NKBB5! zRt>BUZ=~8g&lAz(_C{w$BVD-i-1@Ah$<&$B`ixx4{bg^XN$PcRR?`=BO+bHU0~@?P&7)Aw5-Gmo^yPb^6$FD~5P4ATI? z7@F3x-r=%m^v)Os9Zd)T%KK1ZO3InFkoPWOgX10#Z@XIGI zsXzerfEAg>olP!|YnWG&BaZ>WoSe=5CsHfFjT>lxpr9*csQzN_=oSn*xc!jy&V?0! zR4`W2RYr(P^e!E5{&EwY+$Z|E>TtD*b$rr}=ZdzetYHUDSr}Oze>nY8hu{^5%|*|%EJa8kQF%ReY-Ir^>!23xENmft2geZ19aGQYX|u+xRIY}coyXesVd%6M<% ztj_e4QIZ3h*I$=TK@XCBOtHwj; z(&-l)GEokvR`A%yG43SZ`}x>3$jYIP$y=-8;nVIs;KwB=m4HjNB(IV(lW-4H*Wi@J z1&ot1{#mp6?pU`MnEsTHfsw})GNrdi?z}?IGT@yQsXLEp**Vp~4ZT3eHS0v92|_T7j-EF5rLbyuobePB1?b*-_)YP%#lN1kFRr-L4yo6 z%!=bn&UIBo5QFl?ViglkCcy9f(+@F-I~6hVVBhp4E1f}X6dCPRE`%7Dk1R=}Q3X8y zMT(HrrJDB%Ck4H-axmvYBl5Q6_2Uk!T6tE!jlRI2kgn(b@?c08k@!b2(@)aO?ZCqo z6^7~Iqyzs0xZ7W!7}c2utVPE3dpM8%JE)* z+>4K*a9RJ@D^VcuP&nO6&nDYaBI^cI@z0y2QH1t>>gPEihn@%Wt#HY+d#g9e zapi!Tt$FRL4i1y@sRNwe4MsQc1CHIUCLCh{`?E3ef%xeed$>7h!&31R*R_3tfmDh0`gbk^YTUSqa z-{|Ms1UvD=2Z(|`r^#~5kd<^Rtfv!`6Pe95_6P3&HdwHx*=7;{Yv@!un7q5Z6=7Zi zUUuI+^X{(nYG`R-cTGR*PoRMe-~Q8P!61uWP&gHNymUv%E#&)EMosxqK7ooca2tkg zm{Do3peNcpQA^i%&^_xzRpprt)l(r}=vcMYG>kMLT?B8FWE!W7t>z%e)eGo$;AvJQ z(I1o{CSr&~`bk2B_E1jJix~ z&Mkd>CRhK~23Ysfd(YBlHJ<9|098|y9<+wbf4j>sUgp&9)yR5tM*9Kf!yE6+L_9rl zDeh?-ADKJg^s;qx*`at?ngFw+%54$r*`BDM&vs~;P&u+)B{56T^`Pcp`(MTFP$iXve%KB1-3!X`aG#0`6-?UsHuhtTt;0pC629@bjn3 zAk(+Ye}v8w;KGzo^kg5u;x9+;k&6AHIw64 z{x%nR0WyZCo|2r{x-@KZo-2tZ$*v{;TLGR6@WW&g$QH3x7xV^8#Q-hX@iWscLc3f# zU3@St&?H!Td(~?69aiHYdF4sW^=Wg$(zgv+6M)wKyz>b##UtM+&{NYXwNV0*s_53? z{sZM=-+uKsNQeI#No8YXjuJFln0Z?ZRk7(hK;OHhF=%|xwpxCosH}9ekq>lZcm1y8 z>KRs(W3>cQ=8{r4P_QL47dZ*l(2iZuKo1Vh9{rR;@pMkO4WV!1&%h!gU=j4A{2k;e zt$Z`AJAyc`J2No=h%r0~13itChLY#7{t&Ujp+t!5;pm#Ns-#dt=w6RSW>9>n%am|5 z6)=AX`s8OSxdsA=NQ&9Q$WI$JGZ#Ztdp|aW@OOs1fOuwG5gY%i4G^ zLk-$RVwAb73Kdl66V-kNBG1#;@8gAZB&5wr&R2{PO9svzG?|aTAjg{Y`7xg8|MP# z$UE(~t7aWNUS8fwfpneGKC5re{y|*Bz*({jw+wTz;4#RyBJxjcMQcoDOQ4LHKd8&T z2G5PX)%(3AUE-YIBO8{!U<3Z>wxd-2`I#Gy?@1SWfzxMshn}TP@PsNCXzDSg;m!2t zAU^#mGY&rJbiiPC`G81N9j7J=$%_YtYjHzjB{X$mMkpKmS=VB>kyd=t*J zvI7Dd~-6fKojJZ4DPJGSSyuq9lY>d^pkbloc_!H(#6q} zP-!WF4zC+!;W3V(Kwt{F7o7xWfWD7UQ@{zPcAsA7JXn7#9ATWh^4AQFjjKaYn47sP zhwGaGPvPwxG99%9+0S}Wa^UU`RMP!uB-7EvgJ=~8O)FmgC7c^5y3fiu!k~8b&9R?b zC%xen*OLPVT3MY%xetOxW7vs(2^?`Jj=#b60}#Z!=NJ4X@oG?Br?8&2vHW#nKM);zyYL6HI@(ii&P((7SsM zxtSr+QIV)L0kG#m1>0()?Yt5Dp<#uZD%~@yij3g}!=ipQ0GKF1u*OcSUr9a;%6mT) zWXhwxA-CW>(3KZnyMMvAea>|EtioV*2`@82UW-nf*=l-mk#`@NexKHx_0vlsowv04!HpjQP=kI?DZPhC-0N@@At(I!EU!5Piiu1-5TGSJ zS2MRK$B!R0NzL;LirC$Q&~SS8)nnUvG2u`6^@69pA6jzSt}(5g>i%Sdksf~k3<%>j zke1b@y~xTq(~9AN{1nes434W?y^jCoW6MK71G{|>*A(=Y#3p69mT!}rb7%U(CN#LT zErH*QMh84lFv$yfvF_@so=mz$*Xy7JchlRW#?>!PGV>newd+B=iE}xW7KZ7!_>B65 zyj!sRk>pDLO6&V6-uCG$luh2;(|Pu#a}Gf5-J14vhm!OJT5Y7fp55kgHdmkbvFqXH zexv_0rr+Yi5$ad$yY`H}A?7Rx``m|1M#n$pe! zZb&q~JY2~>%S9y;M1*vFInKU&Tf3o>a|h6A#{e=kB~LdASm-yKz%*kC$U42%V$LDd zgWwx8X8u^3`-F47QhjthSQQqv}|5-<5xko80vJ-#Di+9gnC-Bp`QA%%K4l4 zy&UgRn)t~ukyFzGWZMJofp9CG)X_=Rgg@zN+PJz>mT}`Vp2}PkR$W8g{J@~_F8mvI zqUyna%z?>({sPCv!^f3(iGkg&u?_?&UH%ZF(}BJvyKg@ng4nAP9F&~6zqGj@>Yiut zn@6w}BnMv)x$OS9SbAjqZ6UK4xWPG-R=f?>kx^b`j;zSJX+=Hz1&x!leXWR*ev~0^ zuoqXPE=-9?xcH$`ZTA0R??1zu+P1%8cw4rJ4G|TkDhNmyq*s-u z^xlgS0qFvU4iO8zNbf{IdXpMzQ0cwb&>@5t0tr2kJPY0DZ1+CpKJWi}Kl;ThxYn9; zuF>Y0WBkTgB%VCH*1LfVyKQv%x1fgrw5WDF#M716dCA^=^5wUzpuTUcaq8i4NTeSmVt&wSx%<^%)B+y0J=~2{03x%$qe56jz*VF+P?OlyPf_Wy7 zoP@~R_mBG|e#Ic~Y=&;)qj@fQ;~e~N7|BP=&)V~eFhA9?D6wfn(MI?^ zv+*1=y1X~*?=EFDNX5tFBytaGO~{z+)4f%BqTuB1np6DOc0;0;6VONQcdL{j=da!6 z!$?`}(x0-0EUBkJ#?~@@93^~PV!Q^HXcQ7Ou@~{~pK|M>A=}46`x@&{2x(a{wg?sH znlEGRSYd6B{Tn;^x~V?c=}+!1PqGn6&#z%uuiter;Bq@YRsrWcYc8NrfGa-YIB%ie z%(6pm#=)XL1{7jSeDZce*mHzFd?pzB6gQ8NtThH8qEn|?cn7aHgFwh zs$^{=JfLwX*p%}mzWf44bkr+4)!MaQjZjP)?MJ7&xiq4j&6v?jtJzAH6U^@r%D)x? zqz?1B%YTY--=jEghe3Lu;^EWxuSG?;4S*;Y8TZ7?-a#o(+#V;D)tG*R=lrVFRZ+o( zqUMieFVZM~XRa;ArceNJm2JZi)p0^E*-&G@3+Q)|Mag@qhtU-69PZ)jyC>wVz8>BF zyfL9xt8%N}ItG$j(9u*>xQ)6?eD|0s_U<&kRF8>B83I%xul*Li`IgOg8P+aT%;{w5 zx*o*18>)!04nG`iU{Bo-(G-7C0m}(Ho%uNyTbQ=7b6MW+9q%bOoFm80Rcu%R%=f+%kt!V-&IOP*>kV6vB(_FYZ;g)#U3$&y~H9+%vuPgTise; zSxx+wG>!~DDJ4o{SI2NKL!$@M?IP#J@wGI8gFb&|;lHq*k8+mUVRPbfJ6&>i&wOyS zUbw!Xea|P{(2`MWYwoex`n$*b{7aAd^-7M|H;NPUp#{8<*k@KQTC^%1sy+F6kC&si z-ilR4Ku8z&Nuz<-KTXb{*K5eNWsUGL4<7(yfnQhB3WII(WQh+}8Zlfv%*XbGMztj$ z6}~$M(}^dMY4_rdz?dMa9X=^q?I>x!+GwE9E<;69ydayo1ozA*T zR1mj#Y8=W0G9t)XSM5m-z3>0pJ|Uv{5j0hxNgxe(P}U%jF_^VpI)7+9R_tUwGBZDB zaaEXp=2e~TGLf~R-Yy{`CF7Cz=kTkPLi#fDh#z+ArU^@n?%YZKK+HymLgs?)OE<1| zw-)d8Br_R>n)6ZPeuG}h_^_`$cWqjDSl8|^PJD;gfQ<8E5R2APsS<6-2*%0RqkBKYa*+#H`X-~@e8*<-=$KGBz0+uCpt=LayI+NFHFc zGio1In*YMm#V5-9I&cmw44Me*1hPnW!Q!xE+zWPeB!3(2kz(8}{~{L;nIxlGwa?83 zEE}x{MCd4Zh^j)jhR(~OhG+|U{jzJ%_@`K*H zFQ95e?~$Q&g~i=DLTB!-;ogWn|0$6KoJ<&b?o(!g(1h@I z!z{T5`;QLZ;i?dRFJcg8aG_+#HL|GKgz-so^TksG*_r#BgY68+l*D@lRtd%y8bEq! zXZzE-a3-LM#chK9I%sQ;NkK5vi@PWj5RMyNK4e5MvCsv-s8nOxJG8a_?J;$a4!lmA zQCx$fKon@!(6w9)DPK>FO>8b8OLV?jD>lHFLYIR=w__q5(OISg-o@tzjWnh`gx9a; ziG7eHYq`=h6&Fi%wlZm#KtoS;urhU!x$S?A9IeJtMmr>`tLLCbS#dFWo#k46Ie~E= z(ZqVT8tw4==)(5H%O?CGqpPfJtIoShbb@>#McW0s^B1*t&3pu);iFUi70CWw3=`C8 zQN-mgnBH*3Wv|ibp<{9XKVXz??AkqVfTFi|><&jeG`&Fij4)6>^BtnAh}Z{*dN1?? z0I1<0Z*SlP)Z!gvk~%MMXfN(@jJGJTPlLC2 z&e7PYD1C-^E5jx8r#`YywH(Z<7wNSq=I(fSjF(t=Oj;b;$!O^y@`_YGWya<_H4q5(;YH43qGr0#FHe-C$1ckSrm;NDZb~cElGiDlY?Y$)%w_#hjog~}gbnwbjFtPR za|h;Sx^>&0CLD@NQYN~uL=UaCAgY6i(Jg`l*Tnt(N9yfeh1<)7vWBwn^{y}E=uPQA z$1>}fEGl8V7?(hZ#GwQy9ptC{ks&@Zs{+f>V3yp2+>5qFm>U(OU&oI+)L8`^>zQ>s z68JsX>TL-t4R|g;j~wr58?_>9>`_zuv^Kc{KKIK)<#x{$_pWfv;k=NM|k9t#@CQV)A5waXamt%xl#-ueS1spOH%Zynk<)9bRL zC;4$s`=CDGcU%)$Rljdecl-7agru=L&`LrpzDZcuWENuhJ%KQqp@6^pHqGk-L~0>)rl@} zNl#vfK>f7LTPZ|`T30&H)jy3q@aqNT%-mc_7l%lBZVvMB4JtIIhy9VsjlA0Xn%&8# zsab(Cc$rI@dnQCS8G{t=>zSFU z+#K5Iz0Xr+N0I+c;4GUuP?q!QkYXyiJ$E%PaSLr z=Gl$bUi7uezulv5`026Hh+!e-Ejbl9cMBXsYmJ)BO!&6N2)UOUVv*Om82-fTZYrHY zt-)+!j79|ar$ZP6(kGbmlV&#Wu1;f20C9d;{Mxn&HLm#CeHnxHa;Q~ay@c3y(nkp> zeuRB#AHD@ZYCYCErt7zmPE`jb)*eKMXoI}RN-(Xjqwe}jt1qIr_bsg0w)YZ5Ai^E! zf_5%7&0$uc8VHg#cRMieu6hLhg9(+*`g73v48<(Ad9QWGs{_oK)jW|zW7PhMXuyiN zjn>DycHcy>*h4GHV`|B`Hbp(>OqVtD7yhU!k&%~srX9+ZlpfbA zADZ6RDAb7YJjRK*0*-*b8J9}wCn`;)PCB6H6eJlflKQqG|@KIpQ&dgJn&$T6*JPrmeSheMVq?ils`dt&85&uaHK_| zNZl4PDWjK)MrQA*wf(huPr~m&&GZ^?kaB^QRaO0rk-{}-=2=4-4zKoz<8# zD{~R@p*W9->1oo~>c9&L) z{o_|HtNJo`jr8IR2a;c}N98wTuq&1sg(_7d*`p+9y;!@f=w}i&H0WQ8J~4x4ykwbs zjxB1DRNgd_iQUd;tDz#xp5RL@V98Nze=l;4nggAYz@F8+>K%T4?^!4X#j1!3lAu+b zXv({v)}$PG>T0Uj^SXkmu&i|zyVR0x2PsS)1lMqqk|SiEBy?Q8@JQ3(XnWRURVsTj z(hdGH3HzI_7nKr5K{CcQMP8Qxf3Ond7}-(0jFVy8m3_~B$sG}|C}Ax5z@hS6%Veeq zv!5HZ?rj=St+Y38CgBwA^_lIU)U;m$iEEDcEyqN)0SxRUb)h zvpf(JYH482UBm))imOEsXHk4+DxEi}EsF%*t0zph9k|aB?#-n8BUMsu_!*@#H7^ad zX~D*+q8*n#F1*iR$?++<2*R{3s6LVdM_wBCPqpnuk%CmJb_!B@B1QmY(!8}*K+8u} z`hXrHP_q#BB!&3Bn+USp(x7e|oEdp76(lWQ)N@?b)Uh~J17cXr+k72}*y(&+v$Fb4 zL;TSrw(ZLDj77Axj8_U`3Yz7Z*GGSB&dPH;P3di7E6k9yep(^%*zLUVWkZ%?Lfn&q z+P(2+kE<>^eB0F1vJEDAm5=-Q2n(NGatxtc%prVwItYv1I&l>TlJ~pkmA&ig=2!#g zuiDNx`xXsU8mX_Zp3Q)QkUQBiGKEgCT!|gGXldLk7ZuI8cBmCyNa|0=^INYU)ci(C z8EXq{ZU&r`$p@2{O*Yokw709q)=r1Gl+95Dg07(e!zYo54ILIrWN%KXP&;+>;mLql zr`6~(?FVy*XMJ4`C{QfXaW0X0(qW~$BbF^C(=}i&`P}qXw+aS7RF{^DeEV@7(^K%y ziLjwZ>m_hk$z6)~^i-mcW|dJv*Nj%;I6OWyL&5Il%tB+KZ9PPRRat(;9=xB%m2r2) zhRVpdp0hI%@2XcWdGGauA@Z{%AJn=1HNl424xT(r{I;2Ij1am?t6@|w=`p0pag}n- zdehHgE%0o;a`v6^-MPNNfEB4_z&}!9?I1Q;S=&@8FXF#OVD8p(YT>)5ce7Eqsj{ru zj`qgE-cC{VVQf#1hWgsf#0M=Pa+MyT@dml`-}m$z;0o`aBV zK}LJ9?Z*1TH6S)-oAF*)?>R(xg%%qA09m18qxT~v0+j!2=CyNlf4vT1x~)XFR`t$d zG$c~@0_U#Eh)eDeUPaeurm0}1gi4oAZG)S7tNN=$L;h)>hUS2JB<&%T&W;;6BpTz= z*V4P=J=i|y`Db&-V+iWpNkcZT|99_zXIobdU19lnI>^YTx%6arX_d*VT9z zwxVYaJT-7Q1~4@HaPaC1hm(bV_aN+TebIc&&>k7CAOUtv&g=Re8;s@ zmwMl<<5eFR)T}f>at?@T*<^Y3&~2dE0aJJHj!`)}lTFZfiN@ z;qr@+UYLvck(F*V~W-6XtRo&&b6=L%J$(`+0`=YWP33Nl; z{(Y*5xpKa<-kCRO%SmYgevWw3g>_kxe^8JE!sPVU>oy_$oa8HVdH7eS;5H(86YVQ0 z7RfssitiylM+)Hqlk5>HJH97@RQXh@p4;bdL>~qhY!P?Wx|5@hn2gu%NF}`OcGMw# zK`7a^xM#*g4)lo&8m7NT5slx6Q4^R5w>lLY9q$riR5uPZWFdHl0RBelrIKk~T9-@_>f@BKd3vcg8_ipR z!IW6sw~Hv7V&ACaoG=$i{KzL^pFTk8h$PyH-cJ6 z6*EH?4env$OG0~>iZn7Iz3eWIHTcZT)yLoM?2lyVEUJ@xg92r zm1w)CLyW=s|p|e0s0MWbhN`&_ph*uV?0xiWuHE9L4h0ytD zhfv-7gM4`=@H`mGWNTJk<^|osqL<0`x^dztnqgqGK@2g}4y!Yx|D4*Xf050Be#6BI z#!c8YYGG~>H2`aoi6oO}@7_Xo@$SdYG9yIx$H*K*SocjEo>Q|(;-59;u?URQZg{sk zl!Veiy~OIZOhv~7_usfcE5u+K-wg3NF)P$0#JkVv4p*N`eYz6H1on%FAsSn`b{{P+ zxD(Ze>a}RITJUZnE$cjZJ=skOTti?z8*^;mWvCa|p2T|yzioCSL{*Q~;9JGo^;n>L z#-M@_suR3^PX#}71>T^uI;vWfVfz+WMu`4AeZ#6QZzY4QDngeiqO!WH-=!Yl^_V@j z_49G_oUUs6sxkmcb-BApU_yD=N;TlvbRHiVO3rVS08yMK@qyb0FRbeQ%Zjb)@#vZ@ z@dB;=D6p1_6IuLV+EHR6QlInWhkvrJ#5&233$&9NM&yq@Pgsb{aO{%$c{VTw#^q~WSD z3ya3u88COE?)M$_mTf#)9a&|{^*sn1m*y+I9_Ee%pK~KxOeRYS92#CJ$Wv_a?3YhI za@s}W>g=gBu<%0cxjMSh)IRv@O}?i10I*-?YHiT9A}yQ3I04(VbH+fGsS|HKS~7NF zTm>9oWFa>WiN_XAXM77aQxJ_Av?wY=LiqU!O@X*;k*m+AdyFmU^IF3f(R&FEpX_OC ztfeIvZit3%8Ldx+!Sx&%n|c8&D{z$#Utw5Yxp3scPv%XF|Gsg$O^}2EiEJ-g>J_gi z|Lmz%-mSIW?`QZVACe3^26RPtz|aG6VGr{Fr@x=4ts@GTx5!^Jh=zRcqvhZW#~1AY zDMZ9Nxh1IX2%b}=aID2n5Lcw+4m?0F=`ETy_~1ZzA6`bHnTw~U`?xQ{&ubrv6WQNS zq$1K^lVT$3WSHD#&O&UL7EMn-PYd@uy<#M&Ca?_xyr`j93ZgB!)pI}>1c2U}?+ z#L~^GR_f0V9KZRzzw8>Z$h(z6SU7q0k%=<5I=(1Hh%8mP;JjvwdBFCks|fA)gtQl> zl580fjl9%(6W)P6cMq*^+YVf~xLd>zc~}f%$aUIVUC+#mXI8|z8o_SUVt@rISNwP-tzK_<1p}&$TogC zvrBfaEZwRni^NU}x^mRp;znvqd=S z*xse8Ul)cvIE)5tNl%AvkFZm+C{Rx0YBN2%x;p#;qcC;`1wNu)^#-+qyTJo~wd$_M zBN)K^YjY)!c~T*GUX%kpT!fk33T6)3YHfinfP2MZjO=5Xm*n;a>>cH9rz2TFS!sfw zf~H*uUBMq>RDPow=#CNO^w~gq7}3D?iPdD1UdQ&7Qqjnhe97l??y%gUdGy+~b0#3W z1oMG=yW$5VPU~rLGSwx_*b7a?ftfpH7oF;pi;!Znvm(p`d3+yjC%}>KMZLShw5bwf z5#7Zr=pDS_aXU6<#K@Lg_^w-YONBc{b*8JP)(4%R_7ii?%hmg?!?w@dfv)XbpW{n* zR3{W>NjMH=1_UdGCs@JZ-vn7wvV;_76W`YZx%XC?!1EI-GNEU$dc8B4>HOn{uQee+;IVm$#QO1u=FA zf1?@mx~HQSGH&Ex&=hiYN=0kd={`NsLp`ZTNN$s&X*c#8UKQFC-ps1{?KqWF2Okth zL^l8#} z*G&vV%F>w+RGVn88k6PqB07rB6J(3B*g|z`O=f%Y;(e<_RabnV`;`hJD=BtGwQ%g3v%?SI_& z0La0sk|wLz3<)qa`~>~@UQ2fO;fy>JUxWYy5Y0TdH)3oae`rGYXgOGw+3j!hoaOlZ z+w?4@>+Jni;0lm~dU-(~`TLhY8k2t?>~H^MpSMvbe(?Zg^68zEh+JApH2_agPoEMT z$-l-c)BOti*o?qL{d51-A5-REoe@3!M@#fh<=3Fzwfy*hz3s1LXgX)8!bx^yFi&Qd zQYy|cfX~AgrMM;h*#Q2J+ksA)8?@eZF$}n4>PE4C;mPhq%I5u`i|(rv#CxVo`g__z zml_X$%7OlgTl!$|)Jf~cN>t5*{Y3MmB|pXiQHlAo>~nvK1p9x~b{d`hnITP#}iR;OOADJ7IQOq)<~t7nGsfQk2m&+!T&J3fWBrF{83XUpw)*5}kmH|hxkD-@m+^=! zTL^;-%)2vG@hsJde~AOYK${?5YW{rm_(9zVO0O{+{1>La(+Ge8fO-|H;&1#R?$0j+ zD7_2*nm?sM|I~yBIzGw#-^>C>Pu;24h^J()(EjvEG`Cmo8Y`Ii5IJ$_Utg;1_@yR& zX^!os&$Hi!&RkqoK>tCi4<q zv(){$rhhzlHaO_Z6CNwVqfXa5sby3+qN!I=B$gLr}`NZAI^{s@zwN&0;PpC_!JO>Bm$$U)J; z{bJoAJUfhxSO0d`(^n@6WRfD13IOR}e`y71A2QJ$@q@+>EKc9ae$_mt|8K}r#; z2qR))tMnJ1-aB#9h~l$y-W}k6-Lq@dZ=ajCEt9iIa|FG}vhi2ek@FN`z_dClynFOl z9-Iw^0z3|b7K9%Y`svFPn1^riaSA_fdSyIgR@`2vpYLp2R$tU4*7cVuA3T*m`&c#p zieH=Tm0P{OU$9HK*Zwl(L3lbG|9Ysu0ed+83hQ5=c>Sf3bWifIam?U8z^%-h*SjB= zz0Zt7{zl)wE%8sf{EI03y7Y^5zog}t7W~qJUs~`>3w~+A|4|FFve+2^(F@>juGlY^ z<8Ln9FZ%oviC-e|OACH!!7nZNr3JsV;FlKs(t=-F@JkDRX~8co_@xEEwBVN({L+G7 zTJTE?{+kvw%YU@aKf`Uq7ZPvxW>(?;9d&nlC;i(162OOWax%kkD|f!$ zRgI|4|F@9$(=;Iat{7FjedS*U`#C-EQFi7?QhYwCu+Y)Di{pDHz4uPF2pQ_|+r+kBs_17ZD75FoD)Q zexd(D1E02AGu^C_vv)_&0NL_{2P&*to-3RMCMDxoZ zKF-nnO?v-FvAYosuuE-IFYf;$;}0IZ33U3Ogz8^Lf2s+nhisF@v;PyDjv4&l>G_|y z@c#z}`!e2h5=zD&gg%ca&u?b<9|`?4d%j4MKBkMiyPs1$l@7XZLg!%B=uIBPC?Gv? z#lfi&#VBz@jT6m*3(=hkUU|fGl;a<*WSg7OdG_6iz(Y@l6_`MIj&jnM=OVIlXWpWQ z8B{ip{LJ0Po{6@z{jW@z@7oKrx;MYx}w-?MT;SQ1JnQig^@l! z0Qb)%ykJF=`fSV!+$zl8y4l7X-K2C$>ZA1yFL3gW6D==HCkmqfMQ;A9wE2);SxWi1 zt&d)~?kbm^R4N;U9ZU*iXx?(qUr&gg`QjujDdF3htefn41lNEpu@dYjHWVV2vG_zzQ| z$m)~X7_~lC|IR&gkZV-;e`dtJ%;PO8$|?4S&BHF^B!dBMTdFqRA*9kVpzoOSneQGj zXLcs(O$};QcL!c1D4DiLjXF|4`5#vQV}NEIz>x19C^z+?25o?ESK1C`hI42bP4WF9 ziBHwfCmqCqTckzWsXZt(s!iqpAHKjJ&wXFO0br_^TT=TUJ@pFO`I;bC()n~gUha>z zEmgj5j0%l3xc&1J|Bc&XzrcAn0W`|uSw z%nwr&kq|X?zBHxN9n^xLihf8}1DZOYxJYDJiRCX3sW@o8aVu;)fE+KqcC%QR!9=AH z_>1!@+k5bV;+%Nz~LnsvD7K!)M^}Il%NVP>gu`_ zd9*De;AV9_fK>xmIcD|0N_Cu&Pi?DOe8S0=7V~;Z28!Q&+2U2Pv;Hud38MjFZtpJX zVCRmNC0CEem$75%_P<*+gPcqs8S8qgO&@BLfvS4ItGbs%?EPJpr1=9r7TTwCQgC6@ z=0;>E`d-j1I6Uk0@MsKTaHjrbV*HWV@2?_jW3{sDqDDkyR`hAp)1F#rLWD$+o^nfd zH@=ljDT|taimc(m1^as(h?w!VXEE^hJ^PFeiQ6ynSM+|76}&6_0ddc`<*paJw=5E! zv|H7h_1HwxmV;SIov2hDJLd0H{lOrIKy5wO{CfsBu6LJgcLW=gHc;I#7u}($e1`GAqEp% zPhd9tJ|X!`2Set!Rq&*QVnD@IW7JKgQuz5-+9UQ?-i%MLr=$?8W-gvj=_8zU|Eq8Q zT@jbKH+-Ne4)Q8GRi7c&M14=YA^dsZE6_ukrC;9Yb+q@pkwj0La|P-en)i&>*^I%9 zZ6-=GCtzSDy?W%jBdYmOq|id-{@AYNt;@M;`ishB2xPnB*!3L z|71x_VVnmje=omsvy4uELQhER z`9pe&aLSu7HjkFzBH;FJspwkQ!*p^vvLT1yMU*#RZ0)Eh_eZ;yI}{nbPnaSvhX%Et zKZ=#UphZVd3#WIGmNvGT%6&CQ(+Unl@#NmOE3Z$sA$I7Fi85mG`OflFo42_)BpK&m zfaaI6#S49CyBL2sWG5^nVjl6>*dN-a0G^Jjy)_P{;Tc8_C!AOfMLc)68??c`&aC!w z1#P)^yiVD^sN%n}oFM~GGg)+4-xA^>$QoZX=_$9r zptY~Hy3`?2BmObTVKmX2q}q9Vw_Q-j`=hh5(R|t%J_-~g0jH^*Nv&&R1$*V%Qo{;4 z*@url8P+e7AglTJw+-<~hYl0HP~9Yx_v;+g75QvX_(6I2*?~i9Yf*gJ{q1k*lIn}n zkCe&}5!WE96}t-<<-en?0lCmPJ{D?lx9G(^_t}3oYEkt01&gu1Eu-t_ zA)*Yhn#qVZzId_d={seSN&PnjN1`b^$SR8Fy)G?OD3*hw5Mtww&KF*GpCu7yi>cvb zpoq{O3U+u}W|r3fGD0J_q4o}&j&vDONfErfnV4eJ6B52=3Ev%I%32TPsdtNw zE|M1GPjg3wWh?9IU9~1iON+L#4uy0+PmGnT%^TOfKN&ER+-$WQ7BC{f#z`khji$C* zM4v-F@3I3IJ~5R?awchR6x<`?j*T9uJV|_Sa99GMm$nXAMNC#V8!xRH3pU%cX$##aILamH1SQ5gA$F+Nzm7+e9!ky=VP;zVEOv7o)$IFRy2(9T$okMxDm6~GP69sz zJ&QuT&e(?<(}N`c`wYC{i$x6GVM_roxc1$LCnt1sTvotYCG96LQh6YA{@cj;p>Cyt znsMQ=GLLzUuEr>hY54m6&8`*hYWb1-)#LS%gL^g~)Q-ztoKNl(;sTQ74LvB%awL)d zY$oD-PoB5HXr*|zPDc&I)>|p=5d3BK%iCTk=uEtS{%U?rz{x=vx8V}y$KlcbIYJ`= z<_VpoS50wftZ9F{>nV@Re>N0maL`!dNqFm99^C!&*gqW5nu-S=k_{!+)QZt{qGAGN z!o_!BMgRmxm=>A19K#U_ln>#)N?$H9gF@WTgvD| zlpoLjQ1TG12BV&N%QsuaV3dTQHWXSdQK`1u1Y&6sk2I;AoVhYrQLaSW*bmoeR zTug{@mrF{k(}5fLh_-AiN-8B58d7;{?k)My>GxtVToTbux;#m}pz#%)842TxKh)yf z7l}YbppMMQhd#yBrKR*V*~@BnyjQD&jo*_UFri-*qS&9fv6xx0=55^B)>sJ???klR z+2w>zyBWqv^R?gR*&O$G;iF=1o3C$iuH|Bg6ro}=NuRnFUS!gR@OW8vq%6H-$(8Ir zX=!YqPS}rY-Hr21?u=*?s4AH5Zoh1%BgEo2+SJ!0flLCoIQMYauPm<|9Jxj&U%-^6 zPqQ)tVUnj=U*dVmgA5p zHgw0c$Vxoms3NYnL`_;~)$7wty>vrGWzp|cl(bA7`N5tBO&J+YA{UN~H3e*qbPDw+ zyXAVfPl?;@vv*YK)Kvo>Lk|3{_Rmv&58L=yGTn31N23R)#!5jrNkZG`JSAJ;KP-x^+@|`ckqMhcUPE-ePb__#MD06^&yY<)Z%23&ER^Np;06{ zWNWEt!7x(F$g5YT)?dGWgKtrb-sQbhNwp+M!bnLm;A>4Y<9(*7FJThfwyRT`lY5n- z9sHEF&j?w$wgs6gZushv7H9BC7iMKS_HK20$4-AKHcgP@a4*#NA+xd<;y@C^CSEfH z9;3UzG7j=suin@`pDztHz4&DYB>a1GZ$@>aAJzv4|76y4>!^&-OqAJi-*dq@~h0cQ8qA|@| zr>M5Q!#F)ldNON94sjV6BY3e1F`?bk-stFEKmYwmJ3p-o`$;U3OT;4VHc{o}IAU1b z*+XTdVf9QHiza$M(hIZ7h9b;t<1pC<%80mgH00?>r{w^U$jTs1aMPbvbjR}bC5l35 zxog$9x-JWah)pTo7}2nsXW2XhctI9bhlSkbL1&X>DS)vfR3ZqmdKBHf$6h&l#yPYzxmbEM7S* zIUpEx9B}5fdl`&GjZHc=nCsau&_3NsglxSAgPTZ1v*`D79d|RSJ93rS&ve>n-5%MD zJ2KYgs&%U-YRii1AN)%s)7k6z@Qb=3Sz)e06d2cM%F?WX++`~Y7OL%!b76K>mhMi> z&!E72$`5p-jBaAN=+RNT!gNL+tgOgFC40&XMqA#jhH!g|x?S<&)SV$iV z>+h$j9ar>(Mb;D6FQ68Q05$Cs1T@>uJ3~hXb9NYZmj(bIDN|{=U=h9`U{{zZER861 zRK8T4$e1`8&=}Pm(-d3P(zumKz3sWS%(`Daru|0#PH7S?cS>n)2PV$WTML~?{)8g7 zaiPU!i`phFk0nC6CT0kn(-pXk9(6JkJ2aT&JvCn44s9ypT6{l?j$L=Oe$|ay`jAoc z%EHm!_D$n!mt=UKY5jUoTg4U4>YDZ-M#-g3PQg!2Jg!RxDsGstj$TqW2Tv$i)|qAfz|g6}Wd zA!0C;S2R~%HWzYE*I+TF!XUwgDB6IgDDhVUN&j>Jzg)e#wCtwuvV73{1eK3}b?Dz} zz?2)@#Z&8c#NFPrUWm6%{n`Q@5-*~uU$d;^@sK4>zi7J?JRd7w?rAGyKu=L_*he0u zLgnyA-sLc6@>vkwkmFja`E%ZG9)&aBufW8Li%5hZ(WAU2=1M^JiP(i)cY+a(*aq(PxrT45J>2wnWRqOc13kuse` z$126=IWgjS(Q)g8!uO)X)F||T55?8)RA#2cpT%ZmR(#C`OZtE{&3qEs2143UEscJR zD-igaU>%lW&QE#)$HV=o+dj}&BsNW<<%e=S>EL{r9OaR<+r8idu5rwm1UbR z6sCu>R-Tlp1@_z=#KJY(b%n{2ns35-FcZ+o_4pGY!MJcBa&>R{H1BfBx|eOg?F$kC ze;JYw1#kYz)!%%QPloUI(gI)Z)7&?^lbeKJavD}%#7sHc?Qsc}o8g-s$7rFf(7GM_ zj$!mE2+7iCnJ})JuKfCa`x+WQZjs{2!czyFCL1%n_KV+6pwR-r{yfNQ-8|tv3EQ<{ zH>um4MCJ}n7b+<)j|j8HAL8mq>Z=wewG%pKgH$*GKX70|EdO9WratX?+K0ba*8+>L z1HRg8W<^7+#*0H7;}QvDOV9weQnn7IMY1;0@*Yb^TNBL}XOaysJ2~N%^OpJ3a|+h8 zKEV42Ek3WXm@X!gtlkU7aan@as!bZYPe-T98{bW7JNuT2{nZLhTzd=9LAZJ!F9*WG zPPv5>U&_xbvksZ@_T{MwGy>5#OMn%5eK>OO z%)YkLwA^5slJr1`Z6P1#Uc?i~{=wwkF4Fg7Q#rulNp(lQOZY=^dArZ7TyOp)7vC?{PI=Zvta z_HbW`IBAfdgQ`P#$P=Mq!6}o);hfj+7Eg&9bXnVlJ(Hk;1?!DZOc`JT6!@*=bw+l| z`Mb)M5U^cOu^Vtb!P00Io#mUZ{EsGo>$^S=Vdi3P+buegF|;?f*@E|SL%jaoimJjM zb5LKn?kLrjub7&*-3GU=uxYK2_vLU{Po_upB-|Di0544qN4s36DQV<>>CGZgLO7&( zj`ygBO-U@wi2POUd|0_Y1h5sK?^#}7c8?LwRb2aOSZ*m(wKp6Vt#QfqN&UsPns*$D zE|2w!3k$*?9dIJDITA`$f*o`xWRTUe8i+7Cu0s4?6}qV-qKf3=l4nzHQ46htx4=iQ z&%R2C;b@$lr;hNde|^|mZnBB>;+7ANUbC>$Vu2pX5m#p+>{d@1ecp~R^EB}(-M<8L zzi@E@NsOaYhjU9J6VQ;hnhO&Hu)sbJVq8L3ZE>dU=giudT-;B^^81%hL@&1q2;bZh zhIk2bSN90B)Stq-T6*tj$F4+{^{7CrL z;X{ji)AtpqI0s*PHFx9U1U|sJ4F{__e8|UPZ%1QZ=QaWHM@s75mt{ViG(;&Z%1vW` zC*M;vbG18XO)$Rw*g5-_Hi@zuSI_B<{G`nAclBSh zUZFzRAg*<{3Alh4CPgB$`!&4XW~r|yiT4-O&V-!rzPVh2=5U!^r6d@M9?6cQz6o|h zdLGBxT2uv1RX#PWAA>K2oR{qHb`CDYp)oRpZwA(s+U z|H|bPCpk~w$t`=m{1R7$sr!65On?|xCwSjCn_ZjrD)Z`E!9+7&hYs4WhWU$zTo8O{ zCVS}Eof>pNTpuu^wqH_;TNd2A72V{7weBDwW43j6>|c~RqnfQd(A0%&_%6xn=_?A0 zi%QLMI-z>*?p4lROvp?ydg9RY)hdH9hV-Npa%^7MFJtG5tnJ=e>GT-5pSp38cDI)w3| z8D9szYUvcU^|-H9`}v4GQZ~1T?u0;RgZg%{!5NSkgkeV*O4bwXYAj8+YDS*U35LG1 z=zmhf6e)y_?O1KL4CYC2%Hxrpoc|2*sj>f1vvP*|p+Q!+j|iezLY8s#$id)FWd2IK z{Sg19^ka8iqu3^NzyEf#kieqFp^P-aW-OLLg3jTgbY280*S;a`s-y9Ry*`06$xA3% z>O`UEv9oua7oNt?=mO~~yKjphsQJEKY8VSc+vcYOo;f(t(bfo*XbWrzV~d5Ma09t3 zTC(-=vVWv#eoAS?eLHzYi+N`TDMMJTn9#c3J(V}@+X>Zz;HJHk*`lp=O!O}qY-M}9 zPzihX1t233guH(1rM9<}5@nq!;U;bC93%N~S6x!J_}X5>xlmY_lI@6nplA^vOZFNM zY@HE1EM*M1pN?z&EEkqGDo+_3ch{|`01inq{+v5>l1?>-+f`MM*TSvsXcW{?lU7Zu`ht zGM9$J8^7lKd~2+fJJs9?a|k%mCf*LaBki2jg`>qbAO;HcOL}`7j<4>VLJRNCl;HEl z-Fy1B4c1wA?W(ILN-fRymyL)BAY#KYAO7dQ`orS%J^}pW+%O(yVjB$p&N*{8*nB^rDKr6gXtMnFC+>Ux#)3ck|)RHgK-x#b--3x44 z)!WmmBctYs!k5YwRe7M1g79qYC@f#nZZSATa678-{`@Dpl{(8cfiI6wpH(#XY1^O^ zjQ4a^%uIh~@Uu**AIsGG%PxOWdA?XYtiaxmsw^eo0xX|sW^W6w-YgRA9@}4(&Xg6HDyR~>WfNJ@L%tlg4vvl1Z;nd zlEKJDk3{YKBL~lqmw3$(TZ!Xo!&iRqLW*KcKPg) za*iIn04qOF!6j?(wb*n~3DM^#yZS5BJ99dSpLjGe^Gep-+vdyD8KS zdwJ|rb@R&iJUWFx9(I2%VpmwabSN)M@r;*O4Yl+*c%MKEWs|H~?GTMHOBl&e{!0b| zh-Tg3_VIyshKwWxi&FWlC%ng5IInO`6fRe%Th4~&BvXLA95CJCa)Lu`Ufy0dD+u4> zs`#glPk?Bn#C0Cs98p=fwQ-_lih9L`iZ>2PFk6}yJBuadDL5`o_5 zoRVhA(uCRPXRaEGYnD7>8!(=9DS~!_NHiG zMS_=uG~7k}Q~y+4r&q#AcH?{zV|U&lly(5Gy#ftere-vk5#yXAV+;)#l{9s3Z1-Pf z!1CkXl8?xiX=J{U5yhwe1^wnl%eHc=ZuX2(47uVq3f<`*g(nz=i!<-sMMPN`npwW-eW z1R?Q$D*_9LLH0*sjx%rkqJ9tmO`S9TmIHc4p*;{RnIjOk{A%j!oydpg&OMeM6KnY1 zI`1Ny>J4(Ff?#bOFf`&;2Ou5j*T1(!%V8o-+{(FD8RC}&&1SHrrbjB`3)^+(1#+Xp zEEu>`b1EvT#|VEQL`F~3#tPAL3{Gv&zvYJs%XWN1Fj*0x1gBmXxM=8N3)m)E25bwpW@{$ zYp2`LD`SnC_s_wauTF=#>P$~C%;A&uiZpS}vdkIokQa2J*R})&%^C&9nluKalCm)q z1r5FdLw!jB9GAW2B$_U4e{?Ote~Vjf_iwA3PlL5kooJ2FJ!3Yv2)X0dn>fFdCx!w< zQQmyHvSYPuv@z-1m?)JS6;Zf-ob7J-^+<^XWs{E)tIa^F`-v%~o(;{&H;CzHmrbS6 zso$YojCQBdIv3PL(K8mwrbwkHq~V|E;;Je*EL($v<%zB;Nl4xH%*Y+a0t4Sm(G7Kz z{2=P0p09~gBFy}ES7(MmsX?B`Eg_&6(d`a=BUOoiRtD@TPCq`nH@SCbGX;Nkk5*V( zOG+SU=-3;nk4x#9q2FSoem2pltG8JE>b{ju(|MzM7Z*x;MDv=<#}jchFePPaCSiy! z7Utd?c4dr57qz{Fe`5}2DrwF;(ZF@#5wi(*n7H{nBT9K3hCi*>Q^*eO3#t7wOC5ue zTwgp=AZkzS?5sujZoKi}xLopx@hmlZ?e?887sjAi$=PKH5FHoq1jeZ&op5o6^j$UU>d|`m?^=v-rILjU1$JcwJX*OXdqV99k<4S6Ip_`u@Q%s5)aDXD6IIH+W z4TY;x8Y(D5yizg}Dv6+B6t<;jjH9#FjXW=3?t?lubE0ianyHoW1oBEA$w{>5O4grpB@bWtkT8Yai{ak?ZZ#Pamjy8*54_oTz^$ z9ecUER=c{Vy?;2s+wltX(1sUGi#33k*q27d?T2PZVQ2Otrf1OOlEcOkMFOddL|paDtZE- zhM2bH&8Y~FWGik!UWjt{vygR9Csq}5aw2ydu2g*6V!F?rQ0_5mi+()QBeb0EjGfv^ zFte;R{}9clMEl7NRYnfn%Zz%Q8y1eiXm2Xadmj#5nAx2G=_@9Sn)xkWcwy)`<^+jnq!IDQ_&T5#mP{%K(@M;Z{?G} zK1ez=ms2{dHP}NA%j1k&Fj)I7JibLk^&u2I7~Oa3ics*!HNgU6W#O_K+Xs5*e)lDS zqB4(_JQhEeAHcz1=iS3?Yzz)#|8>pfF^(!5$VPHd@r0g*QEmwd(OB{XdAha)st5Cr z(spAL>-uRz3Qe}}wiCrcIkH%c%P$AK)iauD-0E3Tq|+;}Mb|6lo1XOG(JdiBRIQLJ zArcZm;E|M+bn+v=-WEQW5#u3|cQY58{+dV18VK}>^1Jh#;QkbL!9Av6Wp}x!gEyxK zF}K_3u{#1Aju^Xv>o(E~HSzLteDc~`7^UGb`mNJ>fy6nz8d&DrnLjqun!Z|JO;%W3 zST%pYD7k&sV?AP^!>xelws4W{hM+oP`M!TVNxbot00QW5ThsxItpyH5!`yC7vLX zINNd%eg010Lnil0VzS7Ih+@sQ+~u5De0r06N8XaFO*t5y;{$8iQKAf(k(&JzIqYAk z&Jv71Bdnm|y$=|?x(Jk46R6U3N4Yw0pI$~2vW z@tWqEMc8`TlLxe#2DLES`GB)Os@;(N(Rcku{tnp;Q$`jE;24;CGv01&_$7Cw5pGi& zgb5;0Lbe{uniM)2wXm__S5{H1`8OyTng0q%b5)F|F?UquA5}J&iGD5{tLuzfl}PVWC#Il|ZJ`p_471hQX8B41xQd zoeioX*=QcvG>lGy(6_>Mnhv{iUlrlJ9?`ae-Y_kZTnWTz#EnYUccqjX#*3SgpNX3Z z@OaNiVz{d1(_0lgH*A3-412gfBGdl^U9!j%jSp{u_^@NV%99$0?kqM`up)15z0cgM z>e!qsc#`6;xnPaFb4@^M&zt6Ms4?j!dbS~`E5sJA<)G)S?^&_!X64p0JOD%hf-?iC zWBAXBd&}f##v&S$wik4xyT`|gHm=Q6y9{c21aI?IAU&21CyoWe{l&wL`VplXowCur zMr(94o9Dcht<^?E9Dw)f!&Y{TYT<_Ukos6W%23(>-(V93h|#(v_`p!ed|KGJcU3fx z(PnFzjV21|EIHFZF0Ia7gN=$frPgiUy;~hkX(V@>dnYM(&omWo#_(XidCbHsG>H2; zTH{tiuqf@rV-laquphH|66fjGf)Qjm4(+Ociv9Shdj_7s zcvHQRW_zbOVb^{$7QTjqn`o5pEM`|V=x)h@;(z*l%lo73P5K$RofpwY3e3}U24YU< zGVOCh`{omVhB{Zk(y|-JDnvMX3x$H;ubcC=T1PwK(>r%=tYQi(&I-jpxE>OL+j$Iv z9x=;!GS&SyL`h;m?{K`qHZDl1Hkf6@N!KM z3uom@2`&yI+fR-~Lhet+iu0|F^vk8PQwJ-SRceTenug*-clzecK`e!?=!*jsy6_bW zOQ`2#*-9)13a4L99cn$N78Waa!sC7ZUi@2)3YRnhGexF;wEP7%M9Byn=s5F7LrHe3 zjP6`}FSxDZNqPs2F*b!RtVzt!slM7`t(kCfK4`#i+&_ zH8b-@jvB+)l`iTX+c?X_?4e%ku3)&<5)L@B#nv-*snLLox?#)cFjEwQ+`Q?%7ZjUF zY~f9siS3fSx=imZBl@_QU+JdU&edfh=b?E=YD@on+eJojXsziGF)RH1z}KX9fiM&$ z45)dkBB9>M_r7JJj)N~gQv<99pU>E+Rw{tBIq2GwO?o7Dm_oAAZKM3N(I=LjY>1;M z$TAY0?>s%mvZ<_QJyfi{4LB@lR?!(@WvshNHL0J;anHbJX3-1 z2BVg(ye(UH5}*Tl)(yRWuguv)`_6_$Ycu!y&z+Fpsq5EC?m~X`ArB|p4*7maCU@$6 zc@2W52r4-!o9JypLSE5^)h_0#uHk=erPdz9v-Z^dlnrSovdh^mBF7N@mxe4i9p?)x zHzPF#PunnPUFB)%c1&5e>*M74vMbXPoK1uTlVAXuYJ4Zth|!8MxS_3}97H`?tI(J{!y@OG(gczH67 zJ>~NEKxy;!3nHZpy{Q=Ex>uwm+Y9!ks()D7nyOc&`Ot2WEwKcY-*W<4z)As+`G&9y4m9)`F9P0XFzBKCxK|L`au5e5 zd)R2&aRxfS-nLw*x>hAus1o_^GG{?Z^jwCi7|*Y8=d40LVl|)o?r&e@x1V@jS%HzQ zyL+0hkEQ4gpshL}AL>M=Zil?eZj)2pcsKi-0Z^dK6GF-YLGKy4=wCP+-w;6uR|OuS z$H{!CrfK^csyF&u$R&bgPhi^`8}ry;%Fxd6R3fOiqk}#+wZ)TS!fzq=WE-G9=(VW@ zHncYXN_n5VZI&e5*xNoUC6bcbxcw}0frI*(uZ3;_+PuArt5)f_QOVhnFAQ{7UC+Es zk-2s4l;2O2N8X-RVgE+;O6K<$pZShnF!+gaqQv~g?JB1uhRYEJTm>FWJ=;BAx^?{G z_2T^XlG7$J5%UhI$$dHEgG+z-Oa?vEs7t z@E#yy_V~Q>Ljf#?2W}#0c7G6KFjSSJtuPO>1iOqcgtD{e8|@TX!}~_#LvP1+VmNay zQh<8KU{k+xn?*x%qInuN3N7bM)gis1^Gh*tg%C?SZw9+dSzh50&GNKUPVd*Z^Pk%h zHkmON)@IWBKFt<;OlhXV3R`Ux^p6TFKNR~zZjY>O;XBH7JQO3+R7f#>6*kI8wz{yM zW9e&0ufvEoZLb7WH_OJRFPKKtFb`(A5L#5|c7txwC@nNVBBA5A&R$hW&E(A7>*{2h zQ?!&s=COXg^!6i~VT!$wop1q<{>N^1PT#o4>xJ5+U{w}v{B%38gwIl0vkhJhYoElH z?1=@H#xp5rh22xvjPJYv8}G_s!d}dRMV-GL^Y(g&(%DgZxBRhAeAd=#Cfo3YvDJsr z*Q9m*dO^6kkY_ti<_8~lQO(b(-etN9Rnp988eq#7nA#8$=H>hQt}Pp$O%ty=X~>E{ z+eSQvAZFv|@yS;7oM5os3?D`4ijExiNjN%!#Uolex+_eRWFkwxC>$2&{Iq2a@DBTE z`ylPYZo08aC!=T}ctW7Gi|DDVn32?8!=}r2&a&v(0k&DWThym3@C;uCne$xM9ZTQy zdkK%N)$y9FRXRP8Wmad0u(io*Nxv0OcvXpg5{Xj)m_2A_NF6HQG6QrMU>BqjVGUPJ zeP!Kh#Tm%8`A)Wj(#%e3+S>yeWUPPEPv6@78|>67I)w#hbKL0Qv>9W@z$-$jn1QI_ zlU)8ovo5=x{yt2zYvIN11FYF{TCpAq9|GrFB}=3(tcZJG9Y;eI8`<)c>YjYa7#Htg zUrhcbG@&_{j-Jk2Z?{@ojiVDzJ!I&6iD?Z1>M1h#(v?e5&>q(-Y^v^ILWq)J;!`)Y zch@j)klK}bdo|t7w?MaI2jsE_-9~SXGS%PvZj=+w_-WiG8*3R%B)pg#D^O6=Zz5A) zdY7ePXu}httSM%ZGdI0W`DF>$DV^+)Z{`{p7>(3#7v9Cxk~il{=nFHev9M8({wY_= zta+oVIE1cO!VTw|@+rgJZpEe>$z(jkgxZeTBn!e(^pgvdN=UynnusSdjQOdWNw`6 z6)d>S1D1diZ?i3y;Vkw;+hdX8A_~Op)^WB#qzc-dJ7-firHETJoCCAy;k6NbZE#Ht z0&+5G$;{taq}xDz4anEOG}I8vE;pgwM|wK*V%1qoR!!n|OlYoF^(7o#4uzc~l|v}b zAwb3#>}3i*Q{mU!(gmg-d`z7ntO1bF`S|;_cI|^Zh0t@q{p8oj{)Qq|>??u0(MR}Z z%($_#f{;bd!`XK9cZ0&_Q;53@zaJ-j;9@`Ap6zTliSX z3R|0f7ej48fDDk8MX8wh`)wie3j)v|y zxh~!1ktd0`FPHaE0^!*}nulCqsc#|HnD(Fno)t3u6@0|XrS1X+Ze7oT&>`#5gwA@lmFdBe)EC*3LWGS|){}!~+ch&%k*%s=_7w_# zeOmk#4&0ix=_6VNHA;P%uFxYUi7J~7R2(6yqV4XR6Iylz?acKNOMV@%#B4^{*`0!I zIepC;!M2RTckIWjteXYj1ucCHn|lU3o={up@@0d%Vv)4Oc_N|F#bopmM*NP-;m}FF zLYn=i-LTH!4{L}M{G)K-b;%g@#7JsYt$QEU@cr!M# zP6Gz81w*?= zkw{;eu0F0`5;{sGNlT`O@6H+9GaJv`1`*}S1~(vxV*jlNh4C*cX3P=vtLgrB*=G>V z5qjpnB&g&KgmlyGsSQv_lkRXixo^xe6YU|tusY5IWMdZ}P(_Dv!L~k;2V0H|8{A|k z=&)}KmCmD@K82iJe$OOGqTUf1W;kudy1&NT@AWZfQ>7N+dJWp5tF+PPNl;RnhVi*nsI%OKPs2SS zbD#~{cF!utW1*x5lvs=QI?n^p;fF1b$W z#15y&y0QGU-?dX^9rUx8!uU%mnDg(7{8_`!8;b)8_M1ZGwaJ}lZrsZ%kl6afDK7kE zU){|gF`g1eAR4hKzjMt_45Q_1C9C$X#`4X^yHFxd{tC@1+W)-+cTx0Kgu(;*T!C_x z4U$b^#z;t5fZu9EAx`c%*`m!o|+pAC-pI<1@CIKD9IzznGyRoK;n=)SNojz-G#Rk!H(M#X`ZSX(>`8z6u~h+|!BX;+w5$M2DFsUbNENzCDJ9g^>9 zP{?l)#aS$}SVtEVrqVm60x|DZ)}+F)z4CpzDX`AnwX;=WVOwglaEuk!Z0pw+5QxIS z$DrKUmblp%-qhG81_%tOcQ-j0XTBg>g>qA!-PHu#x03o}c+8!lnb7%HV)MjCFIb3g zxot&AWwty`Oe|8Q8D?8yeULJa91RS(tsJZ3Jr}K%JfYbHg>{xq#M7`bd1F=4Ux6>Z`Ii2Ns?AOQo zMq$#^11uqC5*8Fk7w#Bp0Q`ryW3}G@4rn6=qtGTVJ!Iz^zam%~(m*K`n{8OOWlNzi z1*|@QWwH|4atL~H~u?uIrVI1xFJyE%py8~ zw#kHf!C}1thiY$=z5y0cFLTMr0_egCVgL&jYHfA?T~AG;J#`)TGJ2P7j14dN)Vt*) zi1-SR#uMT`@jbLASaQM{d>z17=!>DL__MFJWwDtN4bs&cErBUjo_X~~43glWPhlTA z+Pt({-h-xrex@N9h=^M#^G_PQo6uoJ9I+CfQt9`4SZF&%AVcwZq2kM%H?|2 z`U@?EwL7-4*%*nZkHj8h=+@_Uw_uQ{<y?mnDV zR>>PDu}9w)BL~z|mob&z<12lAxcp^mWrFf0JJCejS;VehgdTT`%zjT;`X9v@Q zZ)%mBgWB{s`;yTlppmn)HhpXBeLc1DN@`wuAhrMj!`A5596A0qmyhnz@iYW5&3MjP zVvU2I+r8AI(R0&lYn`u(?N-JjxQ)p1u7tNM%FvfFrlMx=F*)82B(qZM=D9n`MBD6+ zyM-L9S`xObHVeq(Hrg2L!dlC(x{A3vSFp=LVAkABE!32_n`L#%mMdU zfa?v7CTr7Ww(KjluI&}E41!sYfA^*P^rFw`DUPzF#*nuxP-*5a278au5vI;V>!V{F zAe)JSZrHwiv#MU6gHa#(!Fh4jcHNiwv|!AL;k>8I{?X1q@V?Aq;$O=Jnw8?&1%)kK;OAjhDGOgQt*ZE>wz*HEPek+%=m zOa0`ReT<>;RMy;C{Wtp~zkP@Rv+G-I5c5C&!B2{cm{=UH=(-fVv~xWjZfeH&)07}iL(Ud|zxJ%+3bDtqDuNgGIkhA_7!N6zS_un-$^qshw3 zcsU2%O3`k%{J364t`mL2ewKTfDx;Y!w6jAo?8y}rNS5*nIm|fZe%w6Aieh!)uAA53BELtGdF#vFf$2@}z8`p`a2 zychY~7w~`?(9Pvm0qz<~o{1tBD<+rnAPY%zJyl*eot-=MPeWqyQgq5{%(ZSy#eu3H z#2QUMxRn-52fd8Q5voKl87B_xY5fyw;#H=^wjOVXI^Q9pwIMP)kYErEEX}FPAl%sd z&TyD(X(0O#(Cjzc(?o%)u*`zmiqB((R*w`TXhcFjl`?jEGr69YC zL7b4^6$GUQHGhEekbGWGfM=4iw)k3XOOe}-?J3Q_uFP*cJgOjHtGe( zm~;CGaaQ8Drg|9G0#!w=v(Zmt&@)x*Y8OosAF#r}ToeeX@`A}P1WV! zxP;bK!nA`JUZ)>uj)fh&p1m$Oin^m)yz^MhXV}`>;kxNXWwlum+ezt6z?T5u4tsWY zu*_$-(z-Kr!+NU5T2OZ=dA)>4NLWj--dWnksJ>Q#8zqRoU1_5^p(4rJ`1Kq70kyq; z_@1P9#k5Lig@_MlE+r=ptgo*1F9w`uJE!8pI;$ooaKReC#>b%y#2;LTb~hxd)17PO zZO7}YGEc|CZ%X1yHjxuy?}t5*B-?%zqk{@DF6ajPM8noMaQ5kC5;97`25{)B0_FyZ z#VQItP>s@B?>eJpB`B8ZUzinJz^tM(sab-1Piy_;SIYXZH>BnTld_2dzcAulmC4$7 z%_zfpR0_mdsP=G_XRPVcYMDQI5AdUWlt$3d49TkIZ5`iy0gyudw6JoA2+C&(@pO`f zpSlXgMuo`<7})HZNSF0R%TgLt(ByH}I#!!#OZjO-o35>2D~1d#-Wd#v$R2pUEBC^! zXx|ZSZUnL8#sEc2P{B?3m#FE~9J)IcJ1_E7Say|$_s0~hY)D(Tx5;%=pZfI1cqQF= zmFl8h?e5Ii4Na2_l35;@D%T>gbj=cMju<4BaMPn}(Z0wtS<>m=^>nUWtAd<}NC9Wl zwvO)Vp#j(@*fQD^R_8DT9>~~)8fY$vQF_3cbkb9ifSK2Uvs9UjLnUuT(7XoRQRU$5 zGN~p0)ieQ4mHgu0&}S*Lek-DBIpgN5FsdIqsqzf_36~rPVX^%=MhkG(5G?hSG2qkcoY7@B#N>@Q)=MO==izQmZH-Igym^f-6_u!b~^M|0je zyWZwOV^$!~M{9+1W6{!T0My-6jXO?pbPjW=7E~8~Z3Zz2&-zg*HZB{}3B(>OM^hFvIbh>9^|{Ci7JwC@Hu{z1UUOc*#3V(Fxw*Y`7)gh8SKYzQfOCs)~~$j7F#g%LD@F~bRc z12K*o9RatZ!)#!lN276a+m#?rHbU5^afS{=x9yFTDAVMXr2x-4sUjM@EYr;GKK8Jm zA`8jqogTH&uv*9taukQOc2`=gSA>`c9XZ#@b$}M=yQvU$6vMd^!1|q=#%V8FcpEB+ z?Magr!0!qqSa5I(8L?!zzFUgCUg|txZ{0mS?DLZRWp;c(*ue|7smL3ayg3mSmNV+S z`HHdsBd3%!SG!oq@&_OyKll4gY5)k_^B_xuOrKHeOvyH%8y800klgrEoe5*tmCmsJ z;%2`vkpyVl(Ph~zKktOYBlyl={T$r<(BZkYg?G%MpnCoNtL(-S{6Jsx&;t%AoR%P) z!knAL$iMc9;xDL8e{~vT8Ly)gc!uS+yol zz}%lJ-yv%<#}kq(L{JDQlAlFr>1#bI$>;9F+ify=)$eLqnHqaFIqOgjvt@x^g&ngu ztC006J$M434SSV^rCj%fc;4Kbxwf3$n=H%$%=V4OBp7O}mJiiL5Vjs2%bE*yLo8K8 zHi8XsS;NEv6NfYsiqTOKuD22C+9d@p{#pfXs~gnrd4W7^GZD zPM%dY5O`G9nNSF{Tk7me>^g18p#tQ7E^Z`D7$6@krBoW)iVn}~J0cP~b&L&1on3Gw z1LMl?LU+IAXu$e;Yg*Buj8W&%! zOs$6rPicdI0WLSwt9`EDJnmJoz-H>(w0(oUYF!ua#1r-@Kt$9!Q;#gpQM(lKvGqvy znz1W-rfOsP!aG@@uGCly)v3^m_0gtcjS;96Kc z=o8!9^_UlCtn9gi9OCJ#sz4PoHZgWHELVFus8!;57mig2IsX|vu2=RIcV6JnIaz-k z#;=dF3(e!B56Ph2b@t^l54Jx)t91s>Gn#b_D*F*^c zB9c?j{GlLdMVL41a&V2{RD%E8D2XYh&=0I?L2)d6?5ymb@BC}T(!Chr%=RHMm^Lct zEz^Z0V$3g@Yy6%i|;&t%oy+xzCrZVG)1x@MQjAF~SupLs0!Gq;!y z*KvuH{QDj3{bAqz#LsbTjHAHj!0LbX)gO-s_*`lIak1$C-Osgw0ciO$-_xWAq|x76 z?fb@?H8cAoym z-bBdufD*=o(0_FF=rHq|Fr6=3)j)FEg<_=K^9~|giE-kOOZ@oB$cJNNgm2R>o(I?b zyU4yn3s9Hoqk_M4{vS49oC7XISm$5d`jKq^#wY*X`|o0DA8!8(^S?n^W_oD?uQpM8 zPiF4}v0vi@U`Q6)wTg*zzjXd#Ju`zeiR?{u&t9lr^vDYAp8=46hDL%bfC{s4+G@!I zs_npLf4xI9^G#q9F17gEbFUAECI7knzx{)^e@}`SOb~uJ_dY85%N4xinrkE%J+J(y zk=#HvVlY+biqth@;yBWy*y;TaBquVI(#2bZ%0GD~3>YYFEGA)FMtcYVY5IWqBEV@W zdg$|gSl9#1nT+Ozux;{<4>e*12U22xLao{V7r=Hr5S&zb=ii$Ik7WFOUQ}L>J`&IcOx%IH()rMYrJ?WTO`XjSMkQ81fd_(hQ!7^JiPHtG~d0g~A{!`9{chJDasF!H(|o$|YIu$KSu-hX*ZzuCPQ zfTQJue}AaISkG^7uJhf$$v212O&{&_s-#WZO@#(@V_72U32^(3IBPa|Fr$9>wvgY{C^nS&G26Y{~eV77s3CD5)aTX zFq!?o9Q^mB?teM>@4@LG^mU5k{~HnAdhI_8N&>SV4W^OYJ2uwYTSW2ie4V2;KW8FE zz}{Gp(qAc?(Q29wyumMJ>-|(E?{|P zJT?Zt8GVzs;^YtCZl=G;`@Dl*!rDFlpt@TBC8^@cu634{RDHG7mM!sH)D$rnmF2V8 zt~?BL{$}pwm4j!}<%(aO^V(k6-x$5OMX-q<)3^}ab>d|^9651|V}nUoG>ba>hUH>z zaka^hwvL%GV0r0DzaXR@I0V7Qug5;o!G>%R(LbeeyD{%bv##)G-NT8JM;A|H)wQPH zUrGgMX|Sc96?iHu54QeE`SACiH8WrPT%~a3w-<-Vz&lT-DL&JPn_{>?jZwW(C|q^y zAU*SzQ9&<8)6YC{;jFS*kJ8CCU_T7WN3N#<>j8ypLmxDM{q7wTtF|Y57C79TL)9cI zdEIE(pC92IagDRxnaLGeMs(=2a!uR4O z;pj7-RD0I+e&_)%+NF$K&(9y}o31X3{czlqdq&nXOZ9_W@fsX85Dg4WRFd!1uVOif zC?G`o3V!htbsasq{7J~cf7?p{UIVNcyCin+L$m#lU$jEq;&9-&d>*ZGaI=yVC(qz+ zaWK1;IO{a0asZY`TT7as2JbCwwnbI4@Wb67 z17`N!GyXb%)Up3lF31Np6YLYEX@y_dF^BE~M}B(C;KV^p&@)oqXOa8(g1fafzAE;I zg@*x?)H=CP@Q|MkqN8t8*P76}JH_66{Lp0_Xz4q?12qwx(tYrEz}E7908rJRURb0$ zbPhGYNqV`Dsy9AB<=-9hV1m#3%_^v`rUjKcy;{0)@OOX*UkOl>!Gi6fYLQ1R$6iT& zr=*?C(W}SS#)AHw1#oZyAZFM79kLl%?C9Bp1OgDg0}wtdO9VeVbQ3?FlV{BL2)|+~ zSnm)o0w4_u`;M#Wt}*4qE6OYY{!|5eg!Z3U`u<0=|7ii~KaTrk^y;A_np1`Ral}RJ zDAVBun;k9!M`T>JPuS0WM^9F0q{Z*ysK0~X6m<}>kL*qk0-K#M4i4eB;W>yuU=cPp zfRmmi+5eH&{ioLyTq84`kz14Zqy-#t_K*xw=Xlt=hjczAa0==Nhkx-V5P+{gg|;}j zRkPwrfxF-DJQX~0sQeTF;){O(QRPP0cl^*L3@YT1{ z^~*#B*qZU&sO6>*^hSx{(W%=Uu?^&!6-b$(kVD1+Wb&rT_q1tHM~|P66WvXz*m`rizq?wh zvK|@LPMv8MDzT!UrQtRFQ&7CzBiW-%CE{C5TJ7~=aTz{jWK1VOT zT5a8aS)9S(q2(b20y?Tu{wlLt-ak2N?+>6r#o1YDVhfX;)E~HE$nIxxl)3h6TVA8R z;bwZ6*v{*`!aJ=5^^?ID{B}~H%Yv2=2_^S7hl6X}yBe16OpLgU;Sz}d5e!!!0;)0V z`TYa@{DJ+dsQDIGq4n0$n`%a}=fttUM=5|sJtxgSRGZqe@~TkrMRd%4N8N`Cv4TT+ z^#kbmL+{HgKAJdb-)6C&-b-{(VG|3cZ`%IWoO|(8Ms8OEw7ySMerB{(6GcTGCgpbO z2Z?)_k7OP#o}|BLzL>=6eJHW~N^bAgpPzHSHaFJ!abD*^YvlWZuL35L6Vn0E{jl9Z zz573xIkOj9ezV1ONHPD<7yj+CSMPlk{1*rMtIhVU#d{r)o%g58c@N8jzwuj{XU~pa zLLRuHd!&rDus!mSBtGz!{a50h*fY}%2TA)s{M#4tjt7)|7GUF zp#FdNB-o#(^`bd&azYu+@*lM|e|f{dxXgz;fNZRhD-8R;{ox*=0fjw^4@W!IYChcf zi{<|X|NIYx&r~aerP#+9XxaxkcN zqRlRq%0hL2B*2P*wa5VuNcOPQLzEb};p-A<7 zE`cVP(ipl@)E?a=WIt9aWIsZFm74vqzZ`D?$BEYxxRXMHJqSnVyO)m-}NtA&u$% zu`;v!UQaujtUtb5=-*{@R5Q*iUyhbsd*iX!si|E~sj~tl0?m&?6vX&OMR+?+2L2lan+Gz@_$Up>A_M8wl`T8F1ZR32*s) zf|Tv#LjP2Jj@CM=2Bs~xGApcbw<{))AkSGWG8JU~>-C|Fop2*cY+gY_#>kUHX5F6< zAej2`OP7$18HHk`J%(QA6psa6z5=*eA*Us-(GtB{9*6MQoce?99$nwF95Soy_Y>43 zY8orf%AC-uQ}@O4SC=9=V<1s#!K9#pI(zEO&SkP`oxL&h5$LymlliY6kXaQ8UZBSP z%{oxibpq0NRnqyn0=#WPCx0b%R5X10rQ))vS87fsK`getUeJ1Cef^Vujz=4e(NQTD zz55%z_RR1-f+upZp>Q6SD*D)03`&ONDC_w6>bBj?2D&FU^*v|`N)PJOKQDT6@jVrv z{MLB3%I(IbU>BjJdM8aJ`}O?A_w3_uLG&hm>}~VcbIug!{ZW(T7jv?r@=4+0Dx_Dp zcwsvY>+AQ?DJeBiJ>_vG64Tgg%jzZ#7unvNv6qZ&>sIMCTz~7jIAJLem88+Qwyg@4 z^?E-QmSskO%5S`TwV=>TT&s9GpTTI-^H<_5P2Eo;PoQ!z2F)D6;USye@mkGuyG}Hu z=stYkPvmm|vDgPSS-fin)O{Gk=R|{FBPt2%uhnj9z2-xpPf!6vb9=y&>YF00h@Ws$& z)el1=mocs`g`;kj-Ek#wyXnsr&TIB=8^uF&-}G~aZHnrbUcw-87OE0y7L<)~o$Li& z<2S>0jqLU#O&oiS9uoF4+^LO}7X@C*INP zKXH@uW{Z~NS5wlR*sheP9Ub+UtT5}(bQL9VJ249~mGQ`SL8g8K*ZI{C!rwaI%^(ho=TtUYiO7;S_vso2K#n!pyRk3Ubl_ zUKrh$wF~mR%1UicDfegD7@W?^bGGB!==cA?b}lfEB^+_RE{nY=6zM3BT@o_r2|k7= zt$p4I3;fiE5M*7UEP?1`u{otfYgx*-NsU)cZDhGe@2u!@kBW$!QyM~!fyUWl6u5%6 zoLg>yDm(J}wvZqw3w7|<;F{Z0VBlRHA7RrHjYdb}18d4?`9mp7;0-qr6frA@4G(e% z`LsNJ=wBNV?LPLiwk3K+K^@wXXGaCbXQw??z=o?c#pxOi zY6zwcA0}!o`Kd7`tL=}_A)KXvvG~W`#boB5)I_u!Ed(b!45QG>XRf?`- zRw{C`=_cOWo-8@og*h^`qDk~vMtTv%i#LP)8!7vhvrG~+8oufujM!P?TsCnJ8hy_3Wi3#sf|4Mz@x}f z>fTWGsFAb=%v_>jCfxq!`+B+rrrrbc%VO#b7 zWeYd^8X#+HlyoeH znrOKb>c1)HBbttgz#t(bn67ITxR+HsO#*{=@94ZK4i}aWUa>SMdWiwAF%A?5ehvP# zf^iJ3kxti3xfDIWnX{rW?h=@MJU->pn#?oZW~o(P40|0Fb&k`ju)3a*3r7UQ@H(AL>X`*qG($vzJjM|hjZRC4}+0r02ABVQ_`rgghAs~QYw>|Ma* z@%j3)x;T+P(8EJu*}yv_xkLDcG(z6nQIp)fjRyKC77V90%KT{x#;4G?L6k*3d5pRw=x);d9$fKOoIm132E%M|9-aeuOdXQ7D@{9bsZKvAej zm6yd^lTKkwNn+tS&h-VyyDQ-3M8__;k1X0m5@I2Fo3zVeDlZGq8g6P77lR9?i-~3p zi=|u%fns{{1_!5IlEPrq#qh}1`P-E~3;GV%Nfr16iJRxHvfmrrmP!cQ{NkT7d)hUNIQkG(IF71l4gVK%-Ujs7it>;Ib=!%p3}O-XiBmMmFJHg&!2snGS5@L zw(VHh-(j0R-1{pTw!LE@-ajzB?IBZWUcRXfBJSFh+~pSAx!=;udVITapkp)1piRB_ zo>Z$rCvFa zyG%mUr!KLDa3wft)J>s_wcmM^bO3`J5?iZveQ(wQeWNJ0x-(UFD?0}5r)LI-#Oo)K z^+HC8vtY?|(^`ERI<3KlJR`*XbJxM7o!wu}MQz{P?ln@}9~;)*tu-`9OukzZCW5Rd zD-?1@X=t;3xXN!{Ie&^{T61si3$$&l!nu=auE!1VI)JvKu?jOc<3uSNLgYYZs4=E{ zqGq1X+KkKlb46R`^d`(VfsSpyqS`G}bsU-bx{4dT*Kkk=-7x2?+W3H*RWI*V(iFE? znHXkask;SOZu#|6K$&}g_~U8}_CrRm6bcxXYHPT;18|5kQgW*tXi8u3;2swb2Zq(h z`}&BnUhpHtdp_N>Vz(JkCA)7cygCEdKgZ0>%|}arC13t(-1=j(anFt+xF2E5KQn&m zjpOg#;2@S}-S6`tFJjPX8Nu1$BRzWTqGepKA(O2*ILsI)vgmf#q(tJpG5 znj?-{!J*P?$8CEcO-9U4zqW+Da0?~6uyKc2VU-Jae(Pmt>j@p%$dc7Mej92iI*Fy6 z=`Z$hZ-T7vHb(z$3(Q=q^EVjp5xz)qY~g3~W#$|1P&BIlV19S3D7m-QuM&^DNeSi( z$;Hhsr#m$wy`AFw1W+Beccr_(v4aK@B)%Y9oQa*?E{R6I9$fB9cI)DxrTZgW?@JdA z+0YO5+DLI^lNQLOF=u?|X;x=euw+(^-Eu0%l2YILys+V>ja`xWWgO6tb8#%6a_(z# zG{TeOCCe5d107BBB#!xq%_vD{4XucN>=s(9e&|H(BlsFmRIBqPQ@%Xh@A3jZaxRS( z&&DqM!f?;ovrCq0rZwIl&Y~VrH=gc^D}h{(xe}w}&{SD%{Tca@e+_3)-?5t*lO^Spcb&UJm6pAX7odS<@|rNCID_OVHQ zRGi7_AU+srQ39AX5CqIrek?jLB#bW#sGRLPg&*j2A$EM=K~W%AC~$?hXz^Onn^ISt z9=EuGh zrqSKkA6w(do&E;Gqs5_wFa<0-4#LPMJ+1|GVG}<%ejQblLcnz@yBmt32=C6_MjFXt zHm4;Wd*@F>ea=h4ExoF~V3BvPMrQhAig!8L++~yK!#8o4L1v8=q=YW%USi`rJ%>Og z-!KvBsV4uO|HIyUhc%gg?V@8J#z90BLDsqJp55(7S+w^b+Zv48uq-(mP1+ zAiWcmUPBK(N>3nQLJ295{W9~Lv&EUS=ljmSuJhOTZ^~PqwVw5?df#zvq^IVf;DP79 zlRKO{p>_Onb|E*>c~WloApkxvFbMja59bmC;M_`HuY}0?MGZhRjTzN_qQY#vv;it| zQgc>4$x@l54bEGCXc9&?p42I6x_3*pI0chl2rjod!2zyqOrNx=_CLNd-KYkWJ9c*l z>(kR@M)X3L#R2TuwQ81(>*}HR5};B2P`MS`4^6M((Z@zo*8^I}IjcuavrbwQC6IRc zF3`+>GAI9uyQZ5joD8?h$8QkdhoG6 zKPmzNoePSjn;SXy`(Em$W7`kgYu^+{fOMt5#f1#I%g2adDg~eNnfR?c1C`UVpxh21 zbGJhcc*{KN;xc_lCaehurRZb~p(a?#8--^hm#H1)?s4a!&8h3<_7e659CbV0S@}YH zMM`O9u0m8Mc2(9PfPy2{`$TT)Ci+y{f*$GVJX|KKlN~$Ba$1NF8TIPQYJDOMs7DoSfv^)bX|e>P(b;PrX9i<;yRwjeC+I3<6@= zI(lVuk8vfM(UU}Yo{_yp|)Gxbk zAV2iDJZtCt0i zrh*^7ts@c9X4H6hll`5rfE=_nb)#x!c&9hW(R*@SR-kutHhl*^+4H~#wJblBF-?6c z`}4~PubQIj^bXrYULE_kY763{2+!YR#)0Ab+ITmqAIwe$2XElkcIYnh2Pw<6v3OzGeK?ju&Ln}hqHSTGDpk5vc~V}7Y+xWq}-tEO&QvQDISkbl5qLtrqxWdy(Djx z{@wH%NEz%bH{H=ViiEd8YYY^+!Hmj?J+)3_$NWkPL!KH&eVok+%F)5Q^DfJ>&}R&0 zLcjb~`%;T)4^f(u&eO8-_AzX<<$>s%z_L)XKx+<{8egp|pYKkM^T9|5gGx}XsN33n zune<`L~ZJUc~qt7^0`eDL#_fJS^b5)d7VWMj6?yXCF8>I;pZmZgeJvHQ{(mZs*Sv^ z#-w3_Up~x(9GDoVxZ2^JScY>&V^gIMo5>z8FznqaVHu9Yg!OLbXLyU=!2YRyL+Q#r zGw0n1hjR&n!}`RUn?;AepOKhMYV_44Y?(7>)KztthUN>x^dS~PX*#2N3g!ydb94}V zx!Ay^O|P{}Q|WR2c^p(z6<)`Y6Z;V+ZQk8gr~?$Ci222CQe^porFK1)4niGxx5S-f z1K)sGnOz{Gs^j3ugKdTx;vhp|R`vWoG_lRH8+;KJyPjV4dMImoZq7tl4~oL4uVUW< zUwO4rJGDDuW8!x?YE*rOeoeO-+}h&eZQX0tgczIYJk=ygygJA|sDqSRmFA4;bCI{l z+*QbX0RB#7;RD@jUPSeP8fVt$FV zv-k08pKBk}maP!u#%HMEWyfMk2tvVOf@Hw{SA7{j*xpnOaN zg((T|9~6Uv(!Y&w+8238bZ#YrQpv?}3h9M>0Qv*92$E{gNMZjP6}%q6Cm4W}?=T7- zHKa3IC{K$FrMl<=MVm5-IYO$a&sx!qY5?maGu&mhd>@{BGK0)W|NMH_gIY6?`2MfDd6yG24egk) zd-Gpq#A1YP;)%kGaVHr~7jnL#hBU;Po#{Ho)K#XQ|Yu&~^1t>%Drg&-?s4d@?u~@9;o@#7$Rw9_) z{Uq6dKM(@S7(!+wKgS4ZVfia5+N1nYxlW(__!a3Yda1%S{&KU=T~0gPp6sbjob{Vs zD47s`W9ba!818#rz()MG9=^CR$9k|;BN}>{g?)=!L9ixZI)_IwuOYb>$jUs9@!3W9 zY#QXh@Y0sNz>ZLxcveKVY~pp;s*pHF@h@=ju%!yeYWl%;AnW z-_q8d)}OC2H*Q8YF`Us;uz!M=80F3kI-}GWH_R2UxO4M^Cg7*{Gve2MJtOpQs8k-u znjPWSJ_IUy>*6%)6DjM|6m;jzmI>BAN9o8lJ)CPmIpl@-0}_KV#IOkfR8cM1rj6#p55IW| z9K{4fxN9a;21Q1ptyAGo=E$8cuD>aY&0>d^&c$a=CV5&#)cL1iiTdl07)=+cSj$61 z>qhtyKR`4(-|yJIHFtxr#N1+F+P1ClW)GYfx>cU8L!Mtr#bs)+#aA5pI~?wykC90p zz)q*_G0@jA3kupznHQPLv=3WNGR@0Grakkp@ARdv2beiV_(N`9UfnBBf5EuFTRclB zF@9mzJ#ow4B&3d%p|?+*0EOlU58Nh)KLC_V}UMJ9kf^7L)a(oAJN^X`v*q+bIYr z@5J^kip$M3A}7TQA+t|@zE|2_@bypKeVNJ26k;%k8 zqX=zHZWeV;4pM2D{f&VY$Lg`mRAi6-=a0Lpc2>H&imm_S?mU8?MK5>X0H+}9GwJ`R z+ruxfFg&?iAn8N@VR{pC`v;(uukcfxCTC1}s7TCqbdXqQK$Bns)a#uqfJ0nDSYb%K zq+-Cu6Pww#xAH_vRhlaspq126mf;kk?OzZ$+IdTO6l)Td?0HoGbpAS~Ab7Drjq&EV zSmF0N| zDQy^u^JVgDc$!t2*@sggLUf0`pm%Zn5Zz95GyfGrair&Jn@m96|@OBVpOYql;vceKCp&(i=W=DJV-#nSmc+a3f$)=D^9%5TSLG`j&kkXT$c9< zJ5_-LV(OZ9>S`E@31j*`U=VS+i{{JN;X$9ZDRDwN$5e~o`^1sM$z{t|uhzLjLFq;i z-RP^8R^877*oAJ31L?Mb6}dn|_!@vJDdUYh!``RP^mBtn$7*h@%eDS5cVo`S{K#VX zB+2SQ(#v+?kD^Nm(SS4lA8(%Pmb;`ZG#Y%HP3`_qzfjz{e#RUbr~1WdT8OM$k~*%e z3fmscpPTXC^lVElN1?V+?q2I|mW0$4y%`}7UBbcsjv~}`BbAjROY(W1A54szwMX51 z5jUff@pLVaQbq|5Zumv=77Z(^K6L~8xp(SW+M$)#;uo;(Y*)yoV@Cq1;+Tgrd~!e*Wg+uD_utO8^vCP%tRd=QlNMyzn;;w}=IEdfobqpJ4; z*E%RvdK{ExD&w1q<0a}|#DkiV#Rjf}J<+H)1VSzNxjUHKJF`B;&xyD`7NC>y&gnPJ z%2I@%Az{rGtjg-hdt)FW%X{3t9OCZPN4*ol@7+T7ZIWyb{i|xXq4(aP)hSn_57B)x zuK2#>{8B|$bos3hAFm}nJ&RdjW-iky_I7`et5{q|wouI@gdCo+$K@}MzH-?Ul!}p- z88d8sAB%}PXx*Q3Yr7x=ZSrx=EifQlRD(V1P|=K#osJO}7OpAOR%lBtma;W0wO?rQ zCCzAJyW?%drL;@X>9z}_fm{K$sA66^GxB=(F=DYwHtYswpQu}}H#s6XE$F09erA4_ zSn*=&Lk?~H$1Inp5b*jkb5Z1rY12f?Abas8yT+$&RTh?K(5XgQ;Vv$x zm8wI=-z>K2Wat?ddXt_kI7x&R01Sh(b0+Y{^z-pk%x?409Eoj{`9_z8;k>EKcnbuH z?7kaxwXD9#WLIWC-Z5d2F}f*x1-?!q>r0ReCP7x#JS&?m zTja<^gMv@X1&RbH%WS`&s=-w>Kc5{$ej<#S2e_8{O?lZ`n(9Ex94Fbztf#*BW}P+k zg)8Ehc77Gaq-aXGVH_V^^fYNE>^FHxR~G|BRytCQ7iGUKSvTJK0Lz}#bYp>+L0<|b zzkDDu`24bnJ?W{q3qHqf+X7GW9V>oYVn!Q3wIgx6afw2Lxc>Gb1OM~6z&WC16F|+` zKU)TmVD}<*A6GD_m=+p7++OOxa4Gk8Pnnl-f5?^f9%WXjF%Giq0Dm6zFx4*_V4}g0;@oV@V>^*Mjwye9{IdLV z8lFl?MmT-lV=Xn&*U11!o?#egs0OfS~(FG>AqF|FTl)+EX$ox0nDMy9&NHVQ3OJG}ajh@9c9UsK|M) zMUz@oEaoc1E0}CoBVG4H?$&Ns1iw9J8&4rUwNoz|_GEE2g<`wDL4P?&5?9umHEo?n zcvA`!xxs7<-|=@HD5ISNvH*pzmp7GW`%^AvJ7!aGp_y2j-DNwQf^Rfd<-|!@*1A@& zRvzCfiLfrNSU?hH<|pC9%B1-?BUur6Mxij1Ga{`}+d5~*4*7rv{#qIi6c@8ra-DyB z1WGAn1HE7xsQo>j^vZc=Q(eg2wGWYywBUT3KH8*t*Q^}()J3C>HeKdcWvN3>VmH?0 z=?a+xHQ=06)zWq+K}NH19i?nU!xqG3Z?lg3RcCH-+UDtVZ-K{o1tXU1#?C(l&FiO! zKVIzPmB3ASWo=oOERcgZb|Jo-)I%dFAV{x;mEM(a^&IFTZ*Kqmwa(=uqX|gHgc!k@ z)!?7rALkxBl04jGJok@LH#yCLdO?K0czV`^qc|+;5~^WrQ(YQ);T2bb0i=fM5M>zYUiVw?qM&d2+VyB+^hZYPs&%q=(xVK49V;WG*Etr)F<#S17N3xB*V_Hk)3 z7_6{zH?%S)%h_o_i6y_uhu`h6k8|3(jafQs2dMm!=ewzza$a3ygDyhEmRwzly%ldc zo~G%*6X;Cz3)jI=l1&z`t|cnvp4gt1rUdj-&1_vPSwY?a{aS&j+_F}4>(lF>vC{7Q zo316hDL2s9v@E*DUtaysI{jHhcz23d-5|BcZTz`qS5vMPs#lIh_{ld{YYEswBZ&!V%%Se zDdCE-+x*6I{{?(?#6kbD5>t$^Pp3>lg%NQo&JvLn`arbmVfX&%UQ36n!|XyV)nU#a z-UNDQRo|Lj1G^>ODs@Dey4Zy5cj-wOZhOD7Bmtd?dtZfE3KHI`*ETnsgM%$U)Bxyr$_LE^5{*k94 zoYnZo{^xYd@r2xP48g7H_+8VmxyaS}-G~*}W~)ZM0}G;Dj7jKnA9Q^MyvDrQ@tcn< zR!*@Qs=6kiTr)rYCD8pkVTXEu=D2wgIO{GXaF%SNY%SUS91gffQF|TD4=ba56%Par zQ!TJ5q~%8ql1rH^*g?+ac#>dxthkfU0~Rn;DqGJ_hVIXU9o z=I4d!16q#C{aT03l4^n2tJK%#=kRZL$O88Vi{(7ZfC?=y4R9Vm@3u*D!jR$Qhw?UT z$3tdCsDqhXD8YW#Xc%A%X~3)_zb2LC>R|N_^gZO7%xs;VI;puc^frYFNtRnl{l6y@ z(xbG30b`FX>sQB|9 zrt&>cT=2CK_kwlfMzGF@xko0cejekOQoBb%9e!cVV-x7bB!S#<4Y!QTgoIpAsW6R< ztbA>HCVFLtOWQWJUh@b0=jsch^OI9jMFuETx~g8iKcoj$_W16nozMBJJK>MSM|&x) zhhF%0ZZJt^Y{ z^5wU}Oig$KSlLTOPPJ3O6dC^IH4SXX`p z9MubcJs74(6Ahkk;aSR|M*B!u*Xwm6@11P8`{QS?Y36JB6h^eJk`oJZRuBp{d0VOd zCLQ(F{+i&`WtMiHm%}XmMVHq(*`_RJ3+Q&dL zRbpfAIUKM#67;GZzV7?Anr^&2mqT=Oc&CUzt|1FdH<^AGX@owC{--f9q8y4(K2{X+H{0Vp2muni2-e2a5cLtREiq-P%II zH+9w@a?ggz{8(+OcJcJA-valWRjGoWEe5<)1-8oO%NJx)Yi4X@`1yB=c^Kcs0;WCk zm~>d=Gnuwk=zvJZ>&?l{VSAu{)oj?h@b%AmQNfkc>=t@r`F*@N{porhspB)y)Hdf1nN?I=oHKAL-NsGl=Vo6* zJ3eW$Snxy=DCyjM(&8^)v6;14ZckUSUw-&HXDM9NGxdV6(s=tLv{d_At6i{}Hfz`G zZ1a~5`)xgQ>)jt~mtEZ1$a==hWcfLz_;79H4>4E&)$VDs4}|v;3{EO{>so zb`us}Nh15~452$Eo7@r!Cd^R?UhAFQqGJ_8+Mof?rRxmrdeHpW&9d2Lv8kVT`K}C; zC*pl%QBtn^4u*rw6E7A&z7J#Gsu48uX~yLxu#sKv*CZHO0x4wAuL3&Cld!d$K}mbX z$%l=c&4F0h3@>B2Uv>peGirAJ27IOI>mc*&y50>;s%xbQPd!&u=nXs=%2A8DA2th- z>BbJzF4Kn}aI^3&U_4eMS5ub+64TCOkS!yR@mhOB5LC!M=cOLm=UkJ+(I z`SA{xFxkA6QYdq5r=Jl5Pn&TnZ=MvL^{reeQ%mDHRT5XuY)?0=P$^j0^LIfLGII+U z_qr$M#u0Bm1numXUS}50s|Fv*$r81+oSt;O9JCun#O|awRIHqjcLo!c%ftyDE1^9i zcm3^{7tLchxR-ooBtf`?zD6U66bKV&p&?fMTHSvz1UXw3Un*^S>IgQi;=_QUD6^+ z`ZFW1d%eTwx`fSHeSu`4+0Ev7Q1RO=_YeMjH+m^(cY^g6KOQD4U+j$AR!%tdpg4vPpT9Oct*GNZujOWDv&=wV z-g&^GRvO+Qo$Y54Ta^l-3X!H$aM1__PAP*yn0kG}IkV4YestU!n#YfL5!1#d{V|tO z@by*Cu0Y!Tr7a%bCf9jM8u)fOX=dwYZUCjl(t7^Zt$rEH^;)rEAH=R61+5v{Lf0r& z&`~AQbrIh&4ciHx-+4u?8r2KQk$M31;N7J3TpKc(C=+E(C}0)anP@U|sY)W3dqesz zGGz0TkB)1DBiJ@=Rq*5pQ~Din7deD-@y4iO{i}?pt6|jm+I?zQv?r#Tdh&^3SyM?x z$gHT_{xaY&ZK0g!d87Cro7elyrk`TMb!%;}-Gtl<;bK-+l{w^3jbVjIF;1;BuuK2V=nlGg8Up@|Bm{i2`z3X z+Ruy(gqI8p2=U{wsjM~J86xUhU^k-UNUA?07A5-pm$@L}1!w>T52f}c_7CwYeX6n65DMpu;qyyZM5M5(%;s!}X9vr{G%S1(fA;4* zifzBGs9cn8I5RX|>$)rjvY5jhNG%eoQZ1tx!8tMeLGp%v&Qg#7ErH7AK&t`f_6bCf zhu(w)r=`MrT5gO|nj16$9d| zp;wQ2Zqtu4JF1seeoZ3N4q*7HIi9Y-yYc{o`bxb>RSgd(#8Z`|x)dh587Il!T? zG9%r?eJRo9Ufbf0=J{Kc1y3b&9G)5v_xZqzYzM;dJ4-OySyYLjr^*DgdmS&V({|WS zVtv6>QmVO2U;O#08gx?pS$L_8h>?e-=QTx8`!ocjt|`4;0#6^WflX3Ua1--=X&<&R z`%Sh2o^VKfhN6jA_eV&&aSK5TsZB+Q+6z*EaJFI}GpA1vqw2)Tk^Fgv!mG)_3hXj(_Zf48YW4cNSkY+RJx;@3K-`=a0Sh~b^t{0JSXqd@N zF3E*?sF3LytGcp-TUjy~q9a1t5OQ%inZ+TQE_8meG!3ohLmeVuo7x@N00NF;R^&pe zh%S}bF0ay}%g5%#g;v8egKe=xB1^Q;eI)0oj?zVI%TBy|z$S}BeBUD1MOXv>HTGW`h8ak@kBsV3Mw5ik^5*#lF{>m5QvplRkhK`UXbIU z8-$8D5QFh;9;s_pgH0YO0?|5lUm$r5C26rVn?BfJxmyQ>U(N&UQH+E|K7B`j7|yOEzXS9IXO0@X?!aksYz zMyEhdii>;JY;(Bynj5Lo+E#kk_S050mDR+!XB~pAv@MoWtX#KdCZuU9#Hx`#?x~f) zD4l(V4ac|BpZ+tPZ$6?;K=lp|ewl?D$mgn3hx}u|EiNV2DsM|VOn==Kx}PRBPHCtKM&(dzt#1%NCWR02YCK6LY8Z;j8fbdik3Wqkqi8BHf+ z-T8C`)ShQx4Lv@6|3dA@XnW+|L+0n0FE7UzL6V=N#KjZsJGx+1OPe01w5cE^2<*3- z_Z9frY4MiIaU-PaHwiG{XfU8y0cQ+}G+YPA-3qCTWzDz_aB5Yv@Pz^NQ<(|?QI(Zn zHEW)Ny!VVsUw@yni@!nl>7I?^s9f z_+WO4QgSpdGRni;c>035IIr7Br08AXhD|1qO#4gyuJGRr=OY{MeS9*fH(5VXmPR)a zq)=K0>_<9URHF&#tV2OdcCMyve-CYQAueDP04eir@o;b!B;}7GUh=TH>eOO{3%Hl5~xPs|IWE z;Wmq6Sb*Y{iR~Z)UYrmRgWe^}0MVVhLtEhHKuNaK6JJ>#NWL0>2?+O4-EDAYO87)q z&}r^rm$_1d&&mBRpUzZdjyZ4S#6>tqx^;vS1V0OP_!S-x0)@EioR#>ia<&6~hhd*bTI7kBZ$umMB7P7n3? zU-Ku^*()U0K!2HCWEw?L2BiIfDGbg8YJ_dcHjb5*1DpII7)Vu@*w1pr64o}os^gUy z;XeomJ}^)`Q$~hz5G%~$x)7cjboS`GDQotsJZa{j2ZwrDXnS=SRB$2@{&aPk5E!#s zwSrI?-BED$9kBSJuaDnzu2S*X7mZ+DqZ41C-e;zpZK@9L=6 z#oHjE(q@~58Or?-t?d~%=Wz=lpX{e;SU~ALh=5why3T=ZZBmN^Nw@89HdI@?Q_U|w z7f5~Xy>kIkxsS?6;|xjf;w)2;xwotK8uGVKf@quZzcM8M8O3A2^G2hu4nN1xraxwE zH+!2B%p)uh%;@aI)wtVgW_acu@UEJrcl1>{HH>^2DNJPh;YQP+6s0d{3iDCjsApcd zwV_GbVV{8*5u>l$=H`fbX*j6a5VpQ9@HHy&XK6@b?A(C#ZM~~#aO7bkFb*QJwP;|~ zB90QH&u&}sO{2r45W1UI3wp3{7Q*H%c4?gjJJeAdLnM9BnZY!$FzxQ#hZtB@Xfxlw zj87fT(6%k~_R-{_qj5v&xC51Rvs}Q>@cqI4O^sz{!PALEpAJ{hFZU88vcC*IG4nG9 zg?Y$jN$KwHK*T~e@*2k;&uEO{&W*YO^qlu*xwECSEC7!~bR~@kI6-1fWJ9YJw&?9; zLV8VpRO7+#6z)82X=*SHCjg3K_H)*&Nf(n?Jb03lQlW)^l%Jz~1=`x*KW0Tz>lD1q z4UY2Ky@YxTYHsjy?8A_Mna$7~Gqx3Hp~ZS0>6B%CZD>&gYY$QInZ=VJMt-Uwk@nFr{6mj)C}7AOG@;hR-;rd@%ieFi>gkOsm6v<`BVc~ z4O6F7a=&y(&=p9|bps)V-@dOokf+Q2_@(=3{r8|bVRF@!`KPFQyS5f&%Ym_nmAFNP zWx+`@mjYdXty)mVr(z8zKV{GFS1y>fkmw*JTE&~*w#JI|3~H6q=E4uSi1P#JOs>V< z98vL#u;GYZ9v-SNXFPt^ZQC+LHPwqYx9WW6(x9jW)ywYyid^{K$tXeWURB#n><%TM z6Efv{NW9~ChTP48*c`0MKBZ9$sZO{j1L}(#pyub_gNa-;(y`J!nMn5+O^g|U_490-!G#9nsl_T+>RAuM?j;yzf7uC*kU)`d zMpcAPH#_()!XXMI>rOn7utB8deoYE4kh?y>Hu|;O)NFz*$x<=!oTl~xKdtm_Ob@ju z0-wKaPYq1sLC>w3(G8I0WlQvzLw%dG{2u-MK*n7r@=#w6G%t34={dkJy>IHNBsBlR zZeE?NCz+wAmxznUdFuc~!@eg$`f((qihVqTlGl-Z&7@!AVBbiipMw$f&r_#>wt%?K z&f&HC`Zu45726!0gw9U@E$I3KnZz)bgK*J1rrL4^=IrG|pl+db_-oLxH(1V3s*E3h zZnZ#u^o5Q;kB5GbW*e2(cv;Ye3U<~7ZL<{XbQMo zP~^K{I(bR9rDLk19F2RI=ulvk?&as6fT8nl&%`70m zqT3$vX=zSl{*V-G(xy_afeO5wZ0$Q9$B$@1R^8m8V#!*erun)u3vsb?(_794_U-;zJ667^A0SB z%w1vd&d{Cy1tBUzr`UF9eeVirMR2q}>AjOiO>1+2@5Ucqoe|%gZW$h5%f!r2+oB|& zb6zi-ZaD1Qj1aKVq~-9u$O*{VCIsO&6;P} zr<1eety#BeyF-kiW)8o%m_+()H6UOt#wxy&id;8HrsIeMPY(;|;GdO}Ms6~;QE!$< ziPq%8C`J=?_QfcdF@9okvGaKcMDNLH&3I^u1R+yq2Vhz9O}K6FMq0tslRrRfqD zX~yV^E`y@ZPiAg%W%k)of{ylX59$DC25}=mjBAPQ?c>$VR8zyOfngNfj1y`+d{A}g z;vp&+obdCW=V*diX_d6jB8@t^$yIG4DxyKuv;|wV@zp;Dgs=VD!-g(P$tMCH@chiw zA099|ZGF$fYi(~v>@?!?2A4l2kgMGV!!p4_i8(kQuUYrebE&$@A68K@uLBACQdoxq zuE*xYulX)`se*?QLa--JpUyzu8NWGf9$UOj1vW65`k*`5f}QVJr0sjiAzaqx*Kq;4 zh!zp4RdwwEiW0(Q2)~}+-MulhYd8wJJ>KUq$rUgw<~2Tv=sUv1k*gs8K&f)R8J*w) zuVV-uLY>m)M=!I_s`AWy)nx;+e)0CP;1!<5d9g)5i)8}0#O(=6wj&(B^|;)hPV%6% zKe+21D7eA#tn5H_iKG+@!be=EuctTUT7AT1(QBh)+9O1GL-}i?GENRXNf{PI`22JF zkl~%+ULs6H5cbx|DXml6KI54*85(?89o#Zs{Bu{R-9+DhbY?}t(-(}RAbq-Y;~N4~ zqE5dlRt|R78cMFcD1vpuhUm_1(o>_4eYA6S41tj#s{-K5nPE{BQF;Agvag@alK1{4 zJ7+O4F(f!q{&l^>y%}seFyO!!m)=8ux-39l!LN@Gjj5~3hbfqWQn(Xf^0IYXxxk-C z`$XMb5y>7yGn4BLbGJGw$S(~ht@Xh4`Rv7x#!4t_q7Nqz>)K6 zs9R~QE-Qjpgi*!g9=*ZgWZz&4of=e ziT|1mNVDnHSt1M(XmD_`v!7ziw$6U6k22VE%T^KeGV?!fD-hrAU0y+`i}TWTUR028 zxw8bfxAXDb^L#FM+{(ofU86Z10s@#m7)HSKmH~5SN46|XjEzPouNL>jK=)JTw3#3r!g)NWdTBWZf z3x^NM@Rd;sso zK}YlyGx)&(S~5}46Fms13;me{6T`(D(KOKp@uZZP*o9+(Y{r6FKOjslWh^-mT|CSO zS_E$ec<6LeP>V%dwotk3ZjOkYkebW}6Ecemhf~J-_jm<9Ewg}_Epzd+;=D^m)Eq#n zIoh{M*VprjuX*??N!;Dym2cg#a#q;_3OGH?%{EqQ>l=Ja*0ZPi;S$JHB9J=)NrU)e z1$yV|ogD=6UghCR13QzN+OxXGnzIM2;mk1-`%GA0=f#ICu9PeJvINe!Rg}} zPK5$C4y!b`ap+T1m}|O&_?KhA{gE_s!{Np|4!qWWKX61v?5Og4e})PVB~>**%)O4k z?A%mgYI5FuXl;9!8eRMObB#j7^@^ZbfBu%;IG4>e(gnNkRqBC%7n?Uq=uS>u*rwT+ zSGE0-J@FXPMU}+2Di%ijfmambz@$cQ(~2haLGY?4b@e`bVN;gDpHYAj_Zyq=mIqIB zU}Ven&WPKWo=WnWW(JeU-faRR#$j47fFew+z`X=sNuVpfx0lRK(dv{ z@7}3`nMNM(@=yZmcH0xtBR_G_dM9Cb$6L&4sk1KFtzT@^nx)N?Yz}p#mfp&l^ct!|j<>c5Y=P*A6TbN^6q>iqL2$Gllg$43C5rP|n_ap_ zqF(Q)176u>&{0ngWYZg>xXX&2-gPPgdL0}}%<64+5^OES{TxEs-Q_!E$qxot_~xkbFT&WMSNOSv-PHK~K9HBEg`vYL zHOlaZjy6AAvB0gQqe9w1lRsn0wdn~+kD=u7$4q27Mz>Y+etMItHMxy*AvOKG_?Gi( zgY#xBGNu;syrvSbV;t5jd0>TlN}66G`}|mgc)7Qph(Q~rx+#)hxJDmgJO*yaEt^YW zcM4FN@x-~zg)s2{86Di`dPj5CN!hYd$SuQ6yx(QpTfAF>kvJ$)2ywXOmIhfL5NTac zD%Vi@J*D>;sNbabHZ;9TMIJZKT>o+W#9Y*hMXx)GSM&kQf00SO(@oZ?X}`y?a(q&8 z_td)m`Oez_l6#hP^2rX5rH3nnB^D~g1bwBOhcI4lse-QAr_R$6?Xxni%V9Ybj9rg$ z2YUsvS4gp@yT7AEz)g}C8@(p&qk2;gjeKQuezs=I*C)4Uy?lY@Lb9Duk*UXB2olyg zz?)kB%!IUW0(f*{_Dd4Uoe8q6bK^jFK}_)^>h}Yuo`jO|#0tV@9J2)Pxemi|DhD4E z@y!HsXoUg;(#84cyUBTvv*vyGF89n8b`h59i`N`%4fo0(y2p?NlEFS@j9`|i^hNMH z6&&AfE7^%%k=~6E-0b|maqn^3jwaQe!8c^GH9EedxxhK#t zlpg~3paGj!yH0a4~ zk?4-p@%>GHW!(xHk%bRmENhMG_Vo_tqzmJ88nFBHbA&nIS}oUq0NvaKhGWdyuPOq-r(|h zBaWd~gZeo7gnfs46J*_6TW$#l#$iDd{&D7g!p$_*2o!Okp<>`}Pxi z@%gnAoajd4ytv@ES$E~;b~blkj@6`cr}6Hv@gROrKf`7jbUsz??eql6a|)55fwEM{ z%)zt)T7tv!h08iz-t6(tSdG(1e`}y`D5tUz5;b9nR0Vo0tp`S=tex*3+l3z2gWI^q zY`RrR_axTD>V3smlTg@Y!()lnB;`hnBTrzqchUT$Cn~S&hke*q)PRi!iW)b=O&pbh1+xzxw-Z& z`DvZ%B=*3xw3t{en>;(4u`2o(Hb-Gm++mc0T026ISKDJ8Z_sEpO=ho52%R!pj}kBQ zgkY$)f=L&ytA_V7ukZyI#*=VImuznU{d~E1A9rbgm*cVUZ?8+bvphAVJN#Sp*6bZS zhQ<3Sg;ll+Up7w3^U*IxYNlU*w;HJSTuIfOvBe^sO4w3z8^$z54cTokQ)tv_P)YDT_1yJ3V5O!Yq)+Pm9#%f<777&~Q1RA7blj zFPN6|^j%M9qW|pQI6AxDvovxvIhrSF*2{c-w!4@wJRZ!2{EB`ucko0?cP>5sE^HXP z5`*rs;Y!VW&4RmfoBERHsNSVw!+OaCa!a|fA<4`BRgYP{!F7;q{I|_8=huF1uM-Vg z3_fD*kI@vCCz>b#Pc}iJyrPodG{k!vI`-6S{n2CMFUIkKZFplMrrOK7hL_8l-9N}+!3n^(>A^Z;Jl;I_uH@txtVtczm=i|4)Z`IMwqRK;t5gTY&{wRi#HHnG zm~034tD+c88&i3ID#-q}esJb25OR*+T!xt(zawumm%whO8a{ow4N%e~H^a&V|IeH> zKlq1SZh^iY_sv0vQFpx#Glj1|?U)OVP=`t`g+3v+szX~x5@ZG-s-1d$%&L+5WR2F zAjs*-vm&)O+XDeWgU)q>Qn>QVdfJV-GT36@>ABs1SV$*-!5n;eaN3U=CK)vGnF=j; zmn`?WO+Ae}n^}&FM>0w}s3Ce|8L86rnEX#oE#$OzOg*(NY6{If0J#+%B@hiGiNMh% zb%)KBZ_GONsl{gQGVh>*-4vO6)T!}nX=yJtxIJYc%$UgFmKP=f!ja6R$ONx@_0t7w zy}$CYTTcF@W)$^X^%o|EB`2bPWb023*M`gArg3~Dmq8%}L3XaQI>i7j=Z`|#^Hn!S z6Tn`5A@^F}?(>HcF0=B5=rLe^4faXv^O-V9@QAUr@tlhc4jOg#axu25mXso_>c6a- z%%cX}MRrq9Iek2!b_R7UvrP-xUDnn>gWDf>A8Y`Wd+DK|pVlSd9oy%rdm-z74hn%ncbnbt89+1i;O^)Q5B zO+(k0son$&z&${9D2Mt|xl{$X{KYT99@RAQK=0-EXZ!0EN8KtIe) zCQBl`lK_qwFK8gdS$X)$fk+#(ma5tApSUg!+C+Jg?t2)H1v*0JR z`J_YjJ&8;9-Ty)xSu3L+uwB3Nsy?Htp!w`K(iKwO9`?LJgEc#C3X_ zT<6U2jX*578q;)8)uoh!(Wd5)?0tUUQzs78UyAi0J< zW?90qI@peKh#;6SYr!2XSUY}noI|zMdfMKp&XAK;8uavU6N3bTzN7F2*6iifk6Na8 zC-ntyo%{A*j{pkGsdu-%pQc@{1hQv9`m|nt{-&cn&=<#U;PgZ(vI8jmRho2WP~+t1 zZ#z25Ig(Yw>Occ9GXR3%26a?;0|!ecJS8~oUA0(tIu^OHCqu0o>G4dC37j#yiehwB zmt6ZjHUxOyq+YSvz*9!|q!rE*z`YzyE*J~>MywnsO7D6S^`bf&^rGt25* zWc|Uj{*q>Woa2&_X&W(SO#@A$?NQz0y_Jx?SWDhTT;V96yAfu8uY^1lTQ3y%rSRyD zf>3Evc7JJQQub~o!}kJgPrJU69Ba9ndnyY3ha>3whyQbXSttXV5w0%r&&Wvsa?O_x zCl)*vR#N@nLN@)6pZVip(j!DTW1s(bckwsk`>>XvP>@O5g30$dw)?FY^hlvtn|+io zK6-!ez6Jo5W~ce@^okbsztU@2;C9VizS#~gMeUi4#E9x9myJM`|Jow?M*?V`F3J|? zzFqkm4z;rU`g*t~|H-F?6j~bMo3N;(?!MY4g>QfNjb8k3ALhLTpf)Z&yRYNEeX^`a zZ66I{6zQZL1C@Vm=cDQE<9zOf;M;Y7`EM7zPw(H>XJ==4>UTE{U-t0)zybNXw4N^M zrfN$u7~W2ZAj;g0+*|Ug8A{bPqJFMbFgO^^_!c-J4ll-6V`pU~4j6KxJoUfbVn6uL zu#P~jR;OmI)+$6dbSp_^!GbpSOsB(>Ka}e=fSg=@YMBOqZI6_Trx%Tpy?WE|GX{ zT<}wZra1wmbd)gsH@E1!C;N~1eg_bq6VLqfcmHzl_FO>h%Av6CZ>0V&fBAOLUekJU z?*H3AW3K^k%qh+AZ~n_azZC%CP5Iw=%x@l1SrM=;xrcWDze9L)0*V;BR4wq&FZ3@j z^EU|Z@66Nx`$hjp!2jFEo!WBW=)^aR|9^qe{yzf#uiLxxKmN}S{zmKmk1G0qxr2Rw z{8wAyzo78B59 z`%wOLKI`ufasEdamN&;TXlI3D9db^rJ^Pf1g=&%Uo(#?0l+NFOz5uZE=FMdYEdMJw zf9&;}x26dhNeizx@45@Ft+8?*nTg{&vjimczayCf*7+Zu?|+o8|H1iu2-ZL@z03{qzj+XAfOilth7l4!4QKWbdvzqa zeQzjRqxCq=Wztp%=ok1-mM!z~^1q6j=6X_sKG2Zx;G<=S#rNy)zkNbTz5vS{ z{kP-apW?Fe5f7JzYVz0ESEm8Wfj0oMbkEd~SdNje|KRzZ+F$Lj&*!_GEVuGn(TDFH zJYb~##GT$BxBUA3lYW1&zH~aPDH0Ysqsfj_p8gvu|HuL$wk9H8(Cd`j`wwe<-`bxu z`Dxb{XRII7*5A&hk)f08;AaHg{`EQ+&c0g5#7l;13X_N|GLgv6v{q-tn;MvOPTLi)Bl7?YitOXQTeZO=e(XY@`v0%S+<0O^znl2>y@jjB&IN|Lrte1F;dyl3r)vP}=-tSjp^ zzyI;0dLRNuPFW5&{HwHtD8gm_kb20X!{2^vbYj*9=a+@Ip%nv7eF2T40x z5h5L6e}DJ?y2TazWyFg%9<{%&!%rCe`R9*!{wB8bP0!v}gmv2#{{1%JelpE_e%sew z{MDYEf}7+vttI-dWlp=wqyi=M)nP$EM^Ob-xNUvUXRiv3BOS?f-sbwr--vr8c6fGq zXI#J8?P?_GyPY`!1WOZlCI4n)Pd(sE+8^hMhJX8=8c;wprWm3pjS^`Cj8j)Vci6n;B$2mf-VpjCk*K`*jNRBK6~eCMY6!= zL^98*rv@4VjZUb*;4+uw|M9^3e{73C@%-I_hHDmC?M(5tNP4NvXPyUfv0Ry%Z&`(M z7mVMhfP(MF)3|wG`r-6jyrjFlhhAEi0I*X}fAQTS|K7d<06bqmZ&C0lN;Tj z(^aF?7dXDwd5B*Ajw9|DfC&-c_Y(J3_$xPi%I<%=Kt^}oS*nX={V&qqJD{m_+Z$Fy zR1_VhN*|TpRGM@U>C$@_PfWK;KlA3-WBX`!6-cHHAu5Z<*Mt6&(4T`5QUM^{=W4^}pLz1v z_hs6Is7km?ys`@Pmi*F@S|LK zTjJjw9rM>Q{7n4cZutCKj$MBMp8vX)&u{;qwjX-zthIhI>5U>S(mQqIktxV?>_4GO+aUCUyRB9yXXI!UH_1Ury(G*36N|0f9*s3 ztgS%ZU6FL(c8>G%Kldm4)jfZ5FW2bNv|BJ|ci^s*IBK;`tMKxbY$2F%jI(iSVXCX< zXWyf{6y`^tg}gp5bLmHlqb#>edBzffhVTK_*E_(}|^y7LzaiTsN(2dFYWFN?X=IT2qlEy8aoDl6+ zJN>b8h2DxYN=f^MGJ-j?M|WeK5rPyJdo3_vWlQ!-b>KMKEoVUJl`8fCS2J2EAF7&y zFGH(c)Ckf9CW}i9e(aSVn#So4HO~2KLhxM8RVPdHnG8TW<=*%8zm~spO)~8^3Wbie zLm-L7PYDG_p#mu@lPEOR`R!71c0T`kF^B$2>Er;KHL0_Ku!}#be`vvv0-t^??!?-; zpSa|CA9#_p!^TBK>;lV?q32A{LWjyW(>@F~hYv~Izw9?*CdT%&V|%Ad0`gpOPP_CU z6%FWxkB|(9syi%22zF2e5OkkR4a6_-sU+<2==5%e>E`HgVHrv;Y8)g9RqXW$`R&=6 z;>xWTNV)IWsOIGPE?izx{#kZRz`<_If2jR@wBvD7-gT3p1c<$`wU;2?+FMuL=h zw*+p-Vd{)!pO=Z4{}X_LZ$;c1#3&|_H zpv=|o$s_sk;B}exVv}E3Qkew#Hs%$_TG~(W@WeZ18$b35qcu_yemaG&rj}?)_(VNk zyWr?|0T;*7Um=Q*hky%NuPOV-{NI1;rt8Z;wn-W0bfbfe3k_23RE@uXsmaq@PCuQ# zOV<>l!kBmWu(fbIXQ<}s{_D72whUeD!S#D1q?hmLULJ$zg^-+qT{tcnR#~9gF*C`2 zlD|s^p{;2_*x#iJu$exsEH$H&**F*KVaIWTJBM44PCY7j`fbVL;q2x`)H?L$@nyEVE;+EC@jKq(=88mMZ5swVxIhar6(eq1PMbl%~c z9gE6mU!wJ5bvnNu&3eNA}eZy>fJ9kR|irdO)2y#%DZGWhv&AjPhSd(;)0Uros?Eg3g|9$k; zE2!Px?^y;3$fLHzs;tX5@fL9ru97z?%y47zg`aZ>_Ic&LXr(V*5&~+6)4+7djs~{3 z2YTwme7>)wp?eeO1dcLM8pM9!SHI~}inWV0QuJ}DrZ9RH&>y24*HDo7O$ZJQ` zX)=Y`^Lgh7JJ5&spxqud9X8wB#@h`7Ia3Khi?lNVq7WFPh$gn??Oh2`r*FWWb68z= zdwA3LsG`8FGveL#$HbfHH#CydC2wz66hEoy-!ci-N}+wrd@uw1){a#VGYsZ(Ih?%IOVjrL~Bso^0Dr?z$$F~=w03@u%fGY9H>-5oxuU^NfVWoWHVeMeT>W1k`= zat>MI78I`a9<&~qkNe5(YUH_WCAtt?dNPMAe4hb&VS|C82~bTl;OW zZ&4h1YqSSW<31y3rS<+t<@Gx8jSZR z1z{ zDc+sZd$9CU3ef)L{TQL6@;lMM6r2;dd&xQ!!OvX({$$DKM>geu(9VFMMc3|iEuX#< z*s)zZe*7GlF5A#)34RB9i<}x@Q2w&?#=ho*a36*R7JO@gFE)!FZm1M7_)QP>o? zc3mEb13+9_z^0hgeoT%EU1i;YMv8Bj zlaifJ!A|Gviy51t{hrqm^9}n8?n#lO77#N2Lp^!~!SIe;$3g!_m(2PVT}{R!BN|B@ zb^y19P96A;o8k68iK}D{fzHV?IIo@ccs%}yB=M(7m=Xfqnyuz_2DHhse|#55ax>LU zKnmK?+^CcC;JGSPV&gomCWTHu{A(vZ^=n+!004ZG!UBJ&thncw;=9e`TiSWC zvFMe?)+-Q*es`~xe%v$14r_$Zq!p>UVz!?Qo3C$TSsRY<&a!FTcq#OBBcx@ofsN1a zKt`w-Yg~I5ho2F8I+o{K{kpi3m*T;kPRy4W>*uN(71wLGDJ2M^{CWc)?1Ck)$u9_t zD7m1{vM8f^4p8xmWLV1@;y%o~`?V6K>L%t+0#QL?RR6{&+u(c74uWzy;`8i{~uge2_j+J>{uC~Xsf_Ic8!!G!*FPYA$ zFWwaoQ`L0}q)1@9qMYEbMlZfty`d&dVMg5RcO71180cH01wtK{!{gcRSvPoAYhMX% z_9x3={|O?hC)l*35skY_>ZXT`FdF zc}^z82Gdnshu?VR2_%2tXhprSmWQSeyVV!z$EzVk+5%r_ zWG+PNvZfTz4H_u;OY%Z?Ttv(Y_vh8pqM|h;%(9zn>LWFPDK6t4ggUIVU?_V|Xs2{4 zjpSZq0em`FCP_S<9Hlroa9cbIrl6`FF|%yb}_P%mWR}RBZWriI}7S!%|md2;{2Ww~AgNSd9Ky z@N`p)GY^Z3gCD_Yl71uxIGL6L?d~pbTGXN3wV=bBu%{~;_Rt-DVqqgtEnNm$8Fwd} zaVHv+wpx!?Y_$tqrz#pZr>--?<&T%NLdV#C6U@;ISQL&a01j8yHRYf?tjy#1lSZD$%l|6zt8DT;%Y zc%ceh#l(>Ib>k2lyF;ZoUV71G0FBznJCtzjFyV%V>d_koz8N-(cFN1f?#d1yG?y22 zkw(U_LI*cI+iM|VO$T|q$_ZFf8?Z?f?AStvA8Pthh1AXc;u7Pn=`bv3cHT|4SaN}wK z0EHw47%)!Zs_SOkB6_ar4SbByn?$^yrUN1n*N%XLJ<2$0%yxl^5b3bI8j_IMFh_8h zrql!iWRS(Je8xJpSl!Lqj%$vEY*C^Ty;g-Z9C^-FwtqHW~G z3E;5hr*Y|%-H?j7W|CtA!F(lYJSSPbj8Uqdic=ghA03)4}JiSlH6?5hTEn**yd; zl2|B@8ebCr#4q+fSF140l$=l3lFvJJ|1Sr%!x$0f2*g!}}BfFFhXLjf%{nV384bIov=E06`(w=Ne-G&H7uK zBb@J#4A^n&`>+UMZE2+G-VU!(e_6NvU6rol0j&>~%t{hbTsKXz>hhu)sntiA&kkFn zmXMK}>=UJbf`s799*1w+g1$d2>zhGD$=!QA zmg!P*A>e+8gWz}#bN03C+u4uLTpdR3ym-eE2)P!QPVmX_ndS!JDVobP>piX|FAMC9 z@d-D(jphA*;a{Zvg~$rhcbPgg3=+Q|A04aK5ebeefl0!4@>+xnN5S{)`27(5<=_9x z0oG@-e54W11bwVS0DG8U&9>#;`bPnMd(9M5cPt~-`I;PV+ecG|eKJjBd=aL!Ew1hO z#ZfMxYAtMCbbUR+m&dArQcn?=o*A2t3Vg2WxGgS6XqBFYaJO~&OS}vG94d!LUD+91 zV0XUz%G7yAJtV%X%P;_eV}!jL75^e5fD!f=LUkY?ng?}FW)i>MxcXNKo+bsK&o)Heu}5#` zOP9cNCutwsHXu1HRc|oD&EA-}X0eti^VL|rp|3KyjfJ+7$&EHNM3(0j@z%~d$t_kI zp8fnfZNfLUW2q`yL(9R`Kb7^oup}!g0WY~hf!n63@oGAJ3e`g)rMAoBrZgqmZ3RDJ zI%}>}R9dVKi&(NXg7@bWgHtlEc?oFA_bvCR< zJZH4J)R+0d+CYrHl+_2X%>3`u~;v|9l0fEz8S)TsdY6_STEj zUFbHYBW4;&RNp+I4}D!E(|y#>pInCXpWpd%sfIUbj@GqsZoll1|B51`r{-P+<5rhnk=kFx=ORNht_o^ zp|V}rbvbx-`b(UybMr$2oB)sP8&n-9ERa8*(NuO%_3rg6FzjF-x->;ZHF|_|1J53_ z^BnC|I1*IL04bj^%O95|H$IXk82IPZX&l%1&_rrrtW-_A7=_m%zF!C}sD6@@?69Re zFC03%`{+x%=GlU)MLTsLOC9j%(t9mY9F~qB2$QV@%w7B0n}oSdwr07T`_3I*0a=AxMrSfjfSYg4;rI zIhvxz1N>6{dTF z$s&p7SX9eyEvCFQx%%O`CZ_{>F8|T@)}{$Nyn^PTMWrS@aK0WL2sfPMb}7Qry|`(B zjq{~kV_;&wG`1|^3eRJ%7MSfey^9it_EvAJnr^o%TnJoI$9pYuai^RTbUxyF{&dMh zTM+B9G}YAqI7dIwr88*Tr0;OurqSaUdVZQTk$fA(zcJuaeA1silZ`u%)YTi1t9^as zfm!Cd=^p@QiR3-*C}hJfTpFLbh1Jyx;J-PXn-_NJO7o|iVNa1@Ima%z99_g&=+h~_ zjMW;I3(5G5%W#62-}dUYBwbs@jPbZ7oV{{btNDubIB)(8ejjyCv*+_xJ)$_SWv4=Oe^?d=F(rsXQ~DN%!qQG6r( zf`?76Taf)XYY2K$8tzG~6m=Kza3HVHt8q6eb89%{?3xpHs#0n{V<=xZ>}u;2%j1Ox zCcww%vy*i%CEtp-@aIVn)*5nd_vrT=JsZuuHx?_-^Rq3VA_1L!mpL%{;?hY0sB?V_ z0juIZ5F`wOp0|AzPyDKC3FcN=quY zUN030$p==IUtkUsk%Alc7%m48-m(fihqDShmawJj$|q5be_dzxY7AJ7HG{>b7q#CS zw7fLXS;~^L&#T)RT--~Vw0CrmO_|9(6=w={3si@wlr&jp1WEQE5-UG7m}-FYjFe6c z<;Ugs^Sp;DG`)&CKQXCK@&`I`770}IQjnBli+vPzT+5z)v=03L#>v zz1?b#45s3(J*p2-Rke#(ZRp}pq3o((>-^Mwt|`ecqlfm0hBs*Y!v4PE#ndP68EdK}>z1Y0-$6BU8h!tE<++$4CO_C5<9eCQ#X~Q{EdCuKFYI|& z!<8(@j2x*8?7{)*7JFSv&=oV(xpZADCP7ojcpEePDa{G@U0Q*?od}crdB`Rz&p7g7 z14Z3cCtNkGK^6;yX@P}hF1`@8Q^)RN zb;y_m9US#KY}*lqPZL0iHI3?;VWN1GuM6PDMJ0~H9<2-4pLJPXW4Sao{gSnD#Yc{A z;Yhu=w1$OZ0ZmNEByyxlLC!i{cHWK>{uskh8lKtWgs)*_9BV1Bat6ch_ba+Wr_^_U%gZ@CX*q?j?U=#Bf=0Yr7{9A1V zjsrhH0rEXV&xtEaZQFjI9`l}3hew_u=HFfR$^OV2GHIhrPgo>tpCyT|#E3>|ovsZ!ImmzV{jYo_zc7(1#))k5^wA80u7dBYk41d^ zc;WH$db0I7VD0;@&qOohi0^sN>uGEe*a*#qi^_Ym(7&!{cQUW05(V6GP=ii9~uJTwP;1IdfCTr*pAZ)D&m%t@&py z4}DHj@Z59^;C(mycNOzKLGa~pb&dD`FdNzEi!WDqpYCi>E;5eITr!jXHv|8t1db6W z8%`oAmvdj%M=A3YDNZkB)TK;%(EK0Tng9GF|6)k3qBx^hz&i@ffoV5=GxPAlr^+X_ z_k_rfU53(J1-wgiL!TM@UkGC_({0@rK37$h!Av?GRd|`anYE2#7wOF^VhTg{oMqEzAFf9j8fugtQ0Oc>f}1`x~L@;$D)Pht3^E2*z zPy?jD8XqMrj48yXMUlLo7)}o8I2c70qT7(Ath1dp^~nl6UH8@24peE~WBUXenr`Q< zkT*lo5)JO8(xpOFRK~XQ6lXYb!>I*0iCML3?qMIa+zAQVvud6aNGDqafwNeXhg({& zp|~vA{Jq~p?PBB($lwm=$ak8OCZ?iA)2hJ1^-q5OCJB)q7q)~9>tB_4*6q&F>CfPo zy$ewwv;3i%DLyaat_XgoAc?>})w&xH(&RSNTD>z*m7$Ata=|7DQ&mz8Qu%W88#d%B zn-u!Hv9HO;`V2R4bPq^}jC8gb@l}!sLYts|MjHYx=Hvu3$B2qp{^s>zcQ6HYCpTuQ zg767(gbDm2jbhfDcwg3g5Y&Q`Y8$gBNUhF+_3?&h2rpy6lcOOox&0Fl{VkDpC7^Q& zTfWZ-Vo%U@A#FQjuOuw2TC1@G!&LHA3pQVA@H`fV@w)8xdBlU#befC{VskEY7%J`afQ*}G(!J-*LCRXBW@;@V`>PFy*Y7Eqyv=sbQS{~caCniAHh)r+ zO5|;p(U=9rOlyEZO&CQ*%?05N7lij}9L0-QV0Fbh&1olw-mwQM-XGE>)oLvJPkE&x z@0ebdNpw3js4D;TDZC?ed}&>xwC{4cOPCaac)R-039`0cH?vb-4w@V`nW$eEiyodF z&9hi%s6h`0pdTz=FG$GXao&_Y*zp>5d311r91XI`5OytCXuK{n`l+xM#6;`%#;!31 zKgZ$Slbw+@f;XgY?ypbV=k_|B7Ek$LJ$iY z4#FPX*X}@*4DWX>1Clyypz*dzbZ25I5uH8|OL`>}qOtfw#j}E~Wv|lBxvMC5^nVR? z??MCu`mBxT{Jua<6FaiOWW^?m{`8Ghyl}{`cQPcVCKX&!VV{=1^RT zXya8pXSKd_ZW+g$FC4}BD|~^RPE89Y~yMlD-QagC-uRb$eK)uKoOI~?NJ+TmtI%R<}zI0M=s5&>A;KQ+vw^bIUgnV3m zh4&>MJ@E}&SpdaYzp+bAt!Nr0Dp_GS8mT;~_Ef86Joiynl;&-+A-#*0&hHNR7eTOK zaper^+<>X*J^)fl(j<7Q3ZlOZcCn{D?LC{2hNRIV3z@FI((w#hVNE@ zmpLNbFoM(43bE~6?z7NV9*SCv?!W;E;xXlfYI52&4%jF`ZR5e2V$_$|RqC9eDR`2w zL6jR@W!R3#Iaaj+sK8Cf=Y_@0~nX3%f*c zdE4b-Vwbz7Wt4m)2-p(O0Kc=b*u9T@p;AfgOH`GCwsW->enwn)5LJ4UA%J6;Z9GZw zf34&1{Pj*&jFRGwDfhR_Bxwceo=L8k`Cpenq28(YW)k;Q(@K!qcXY}VXS7;|$!m}w zr?(~^7YDx?dHr%YOP_(!d67AWcxS&$WNI)(E&89P7PMbwpDkZ~1z7-e72XPyd3 z$2EhyWV92c8;y?vu@|qNuDoF76O6LW;8Efv=GsT-tOX883mAB3T4+tOL*C^yMxv;3 z7Dp+~pk7Yuw4Aa$JE8k3MR849$h#KK*Zu%mxE2{{Tg7dSXkk9=CNv+u)aAaXa_6f- znn!_dP2psQN=JYrENrlkeGrMaYtSPg{>b1l@$Du*WcqE5k}pz`#{>ag(qhaqW0eo41sMZ^Z=r=I2 zUE_7rW?b!sN^G>8+*D&jd{}zvQr1gd&$6Z>#{}Lb=nO}24lFcpF<{WR)s zCCp);vBjh(0Bb_kTe^&<5GVw+3@e2vSQmQ$xJIg|-%7SWK06Pi zXfsY-%@%pOR|!@q?}Qt#(rAyF^57nW{rr`v8K|5o#WxesPGRbayMe`_hxe-~$RWRV zj;;I--UT+-A47*G7gUKxZ^Us&mNa)`4+d>adfdK}HRo35y|7*aQt zs}>g*?VxOS?iXEgnmtWkZ(vy`-%d%j5%_65J+~~hb`Mu6vZLeZjx|^{Ik%4}Xdx^( zHCOh&@T)UP5co=2zEeEH!HsE@;YxJN9(F{>iM)5#JvVo4c(R>qgWaug^N4D+HO_=3 zZ#OB9@KzTZxe^M|IDmR+25xQ)T`WW{HmGO6Oe)_{b)8&q?4)L@gY(^Q(FWhgf2a~u z1P;_F@UWD8zvGCe?O6`ck2_%MbBZ*Y4=y3FCo8BsKi!A`+hjd(tqV&G@${(L?Q+)` zD>8}tEE{5hhrbCZewlGEkWyW#Jc=vQz|C5<76R$moY&ISX@607;ggBx*~z1ZEr~2k z{)M}xvspXOaZke+#tRmacO!v?5Syg;j{BdPk&r*cBNb{;S(2A76xQqhLK!8 z)E}>u_vx@l-yUgt`N~d0u@y5$!(_QOFxKVSzAt0r#m(e8n(nYVd~~~NcXzVRsocrb zgTKTM4!6h{xMS=1smEy+X9==ddUNNdD0V=)fUPxF!L^a65qS&KW2>t+5{7xMhd*~E zt*QakOT{}Y&R!F~>@rfNm_8W|9AUC*RKBC2HsXo$=0CiC$4<4iZOM&zfAM^)cSB9A z@H@10ajWEHa(6B*zqCQXQxOS)YS*=aixp$W<`-$Cc0MO8nxyM^hwgYsWbk0n9wioK z_B#^KR)5vC`KbgKdiqB>?aB7_HP6hAcm#jwl66_zoKbvrc0aS2R?gQexYfmN+!^zF=vy z3eui^MZao^G$E7k@WL7%l_TK1Ini%F4?xVMugaMh0Wd_g%({yF^8{z;sZJf6AwdiH4=tP}(O=58jRN?FHvF z6v=;mX~FR@@=7&oq=m_Jg1=@e4)4rx4Q8J0)!z#acS$V~5_)*pdihR%udvZbyXpP4 ziZP77iNttec*lZeh@DX(2#Ff8P0qKfcTV(@yc8i|+$*dFs>3`b1WyQzhd&u6#YlV; zY57)|{@(raONH46gKeO-@;CCn6fIWipn?km(3Pa^l%DT?Z!3}c)!k_#B5hcxMyb|L z5+sJ9#Y4Udp;YWzc$I8rcxGy+ zZ(GeV^UnxZ{Z_A6Ai4Dmv3O_EdMBvJ6E>ZMXeuU$*bSlJnBk zA5^Ny<473-byUlHS%nLtD?$Ux5zYIqn&}l$Oc$wDI*K5QVUl8M>iS4u^#-JG@?_^; zM|lq!p$pO6K0JeXs!^8>oKgWkWGc%puiTVv2CZ#%l}cDloNfa=T~r3z9Qws7QwBQe zR8SPaKD;IQ#>?Qf#`XMHJKL=7?SfBmckrZdt@HY9%scLuage z;Al@=)t#&DU-)D`zlWrB`Cvn3!Y|PFh`9G3y3)OaTskl=r`sKU#{8d) z*#E56UAzdZ`d)=^&7nQ4vP*v~9Inn9MavLxqU!%;5J^U<)WRNkDVn=(p2>ur>5(&~ zF#A1%(r|iK0&;T&84x?nFx19g2y3P1NYC^Lan1-T!Ul*SNfWGF$mYf}H}8k_Y%$GV z*%R^G#F^ASkr366+yRj1FdP$#aC6*1>}8+p4bG8tsMDe7RUfa_7$2U2q!l@V9+p>m zc4rtKDu+w?x6c?Tl(&024*b?4ov}vXZTB_tlDi4B#@Au%ZxWrK<%3`iLzTV#9eeCs zm$Zp}_N^0}8hn!~52|Vf(p-5960NHr=S_6ziN4L7%4tyN7Tn6OjeF*}kxZG6sww_c z9-vWO(YpFxW+F`7v#p28Q^d`w7m}6d&zNmY^+Y|JGeCO`6TSa=iA5Z~z{@Kgss6G( z!d!rMa+E$BZaKjF8tq`XYB{7-v|YT4jP~@wOuR_FJyaa${BDVOp)$E9%z0M{WvI(d z!=kpTZJA|*bB=zMQ8j26yVJyzmh5uTweoVyYv?kc)Z)O~fW(biymO~*-uOH%Pk|Ty zfE9()(v;{e_ZThTWqg~Em~vztlE=Y0X&yk|adG7KO3{v6`y6|T(w_6{XkkoUE}oK6 z@|`qCoVlx4uWs|D$=rF~b*FWcVrC|pV2gQM4l!w$zhTM$xFu7Jl-}Wahsq7hx(7ai zn{8eSYy=wd`RH(wh3XcL*azLOM>^=WbGn`!)UNANVcxjK`a3Dj9>gf`xPd_Dt`yBq zK@)b;u~GDYkjq!!D{PEk#nYebX+yrgEu31X66SAUxGPc z!F9SdjwS0CdkeyDg`C^PRZckl(A9O)g!sW+nhdcXu0VYrcfZQfw+GbdK%Qm%(>nAP z5pqcj2YEHExjc04b;doLXx7P*_jRW-Mlh!iEsvYyj?3RWHdXMK5(a5dJtbYvVlG5u}|gPigIKMQEdfdmyiqwtGk+qPv`5zt3e!E ze_u?eLK{ftFY$K=mT|l7vM_}dUF&sk97h3_XdItcM-sV9mrG@61EQix!1zADG^?*d zZ}7T6S76F5)sl4w;o&%*#EQ`DjSJi&a;=bkYt7qSdeYpm@{iRIn#jx_9a*QUUN>@X ztr)#2EtfOYZqTx?8?#nb>sz!l#kfSsrNO4Ea^xd3*YH$`S-sidmDe^SGZ0v?MOInw zQm<)_?3f;k*V&`xQi~;Z5)X!pWe=?GDkh2kO1@cRk5=PG%8U;a`OAn*MP=9tYN|tH zjNp-Y%cAwUs@~6&R5dLg)d4ScYb>Xa;()yWPg$u2th}rb;X>cM?rWE45#7tU7~fc^ zLS$a4k}tm7moZviI&`!$SJ^6AV8IOQ{BqqiM_60o3iMtkIv{;58HRL=oR$4M$^O3_ zXbDH3GNsg0yX^+SIO8=G@m6s#JX(R1bUbLQ#TF`b?{+8P;L5gNzSg>xDR z2U95X=RKxee|*T97k{@5`Rb_DGxkm)sxA&UIM&Giqu4kMfAc?X(0D2gpU7dzxf3B$ zs+TX9?ou=_wWnZSf0#l%OCIQsxeRBmW-hrLW58|pwU?5q0}?dD;K$^G z=Ti!ZRUupS3B(me#`orEHO&CiEqy$x#_mhRm+_h3N9m-D0Cn{onH(6?65*?F$Tptv z70(RwM8-+RCiY}gbUKfs@wR+el|btj0V+do=gluxU?ytBsKK(u{DWZ;-5|v~C8LS^ zC20kv>6Z8L&Uc3n!A)HJy74;9TC=oNEYWLTZ${RLG3=Q^2CM66&Lplyo~j0ci7k#j ziI0EzgzH?v=;808FpTe zv#bt>RxdBN&YBn?`qVTPOzBF#-_4s50B48TjrqnMO5Ikyn$TKxSFDiAxU_T)SEJM* zncdM?{z5%dO2rDuroV3X7n&N1X}FT!A;{(j4H#ds=6%cZUKXy<>>I z(5N^cd6TBdQOa`6fT7d?6AP7Nn%C5_?CBlM(3sEAmrO8~PBINKSK|@6Fmu(kIq;D# z6(P{escJ9%hDC6$>c9o=)xx#cGZIdd2+>(|cx#a>sNL}L2w8N#Sy6(iN5U`+)}%W# z@2^kBQ>yEH75&DPr6?y{J!r3DMicDDoy$$05KzN8-zI`%#PLw=N{DsdcynjA=7hqx#Wm`*`L;K5%|ATVBu^T|@2gcA>6~BK4AT{eVjn1dc8X72}bH+86+!{I4WoKJ?y=HO=@xf~gjGXw=>Sy$h%EXPvR?2_%lU5J@q zZYhXYe}a6lrqsi1d8Zmp89=ojr<>^kwtZuzRhHjrBU$jxXlorM`GhwvKXTX$o!)z7 zUti7YqN!Kj%L()$x2empwNYSgkm!835o(J%k`rh6P}T`Y(&{@_|_ z+T9y+=bS?b1Usb`3-J9T{>^l?`faCz7P0-R&y1J;R$lX)y+NpAO6Cm1%k*vUg2(9W z#kSEN(WgE|K`|HSMn|d@z%|tCUtSe-cyrA_7sU&cbk?@ytwR^U;Tb?DqZg}G{ZCk9woW6vpL73J~}hS)P}dskq*Z} zfzpkz<~fXonu+r@*&q*D#No>FRB?;cb+JqR9b{mimMrZ7W9b}t3!b{33**_(G*nmT z0Yq{cgGNY)?MfG2o;(hbmRV-0BVaJm9yu~~*Z=@>J~lU93=_(JRpdSPnNg2{i**OJ z2ERqgJm{*7n^ei@j9?z5GzmdkfSyL3}-;KC@{BImL zR-rBPDAozFo+uwfsgz4I&YK3UJOr?HiZ!B{UG%`@Dm8kU{f2GoZmyUF6mmn7a55F_fd>QM)42->zpUJtRAeCh zfVa10BJV(y8dp=Lep^b_%4Lyjq5^zPl#{HU!O`d_spm;Zukpd4r$yk3DQq&aX#>U; zUlHP#5`O4X*Dy0kF6a@h?p0;Ou@~4rS<|#KT)hMh_F()w$viMMbNR;{rPL&3u$8EV z^BYaBrf~+AZAoQ9X_22yuxyD8W~d~cqD)oi6;22Yd9ztvt?pe7&E8K+H$$w@k3R}q znA}tn?SHhRK4cl#{II+NLC?4>RH34{&RSuAU#nrcXG0aT)8B#mR+pVKI9S&A5e}dt z2)}SpApk0uby8-OOY-6BtULzJZ>$FJPfOOjtwFD<4G9o4eGU_)t00;2#Yv72akR$4 zOJnO>TS5$FLlF=`j>EHF^{+KHFm7lWl@80Hw)I&*F@ps6jTa*VzAFgFD(pqH*eaJ{ zbZ=S!++ttn!$)(w*FoQU1cCFH9}7y@RvI(3smoakM@D= z9fb|pjAM%-6^aXDSUW$fy30OOzgDOpVsMTK;^*H=`@$ag0OcY!HuyEEB18bGYl}fY z)up;fg%f=Ra?{kRrF@vMvS+$XXL>zThcEgxoKYkCzs@Q7GY|-s$_kyTaffE!2_RB< z50Xi@bcPn*y8gw6FfK6qHJXAReh<>OF9uf7SobV+o1dE~bbB<+C=9-0u`PA_3wsk1 z5#Dikd{-@8;`EI9{5n6(Cd^qv^AOkuXb(&gOsKoDCbV0*vXwSbq%m3O^Lm%;wURmt zW{u!ch$8*6e?{z{djWLTK}1AYWh@I?8dI?oF_0e~iu+1zBd`{CE-N0iZkb<}r+C6fNZNn>WvneOtQmpMbjBOustY;p=LpnWcpZ7oTo0X&h?BQC_|>D_=C;Z zpuh8Xw+6=2j!dIw#A^)gN^}BHMQ-V_yn+Jaf$o)26&CK_YPX8t;x(n6zAz^{+uEt? z)d>dY-|n)V^&Yuue?Rmn&uP5N(>z0a-7K7dt;%1nG#NA4kd6O?&!ri9n3|Ui2GLr^ zEOPMR9(eP)Cl@h0%Cjl@V5rc1{el5 z>rO!!#O;((vGF7{_@Pvgp4FL=iHgK)3zbQgpU6y6q$D*BMPD9sI6ymHz%FSG`Yun? z)o+M5bF;o$GZ@R$JVc%ikeR}t7_+%* zBR@80=NRB^2Z(@JV0jGP9^y8Xhs1m%RJ+nJkQgUhXK=nXMP$u4YLreDz77=k0(+eE z+thXhl_0&>XxY-{IrA*?9plW%sR@Ww(|kEs(<*EzHuC4p&Giu-v9_ObC>N>mG*1qAU5WQy6-4&ky z$JuuWG?i_8kEkFh;sg+tW}#P+-cbZ8(wlT?(m{F+h=2@LX+c^50qF_7Lo5^l=_T|i zy+=wCNC?(va20$!BEREZ- z6q0quFoT;Y{+&i=`kdBA348d=Ow$`sY#pG56-G-Z-1i z%s0M@gKx+0S+p)DA*CcYi>HShnkj{<8Y$JIRe*D6#iX~k6iZ;cbvymxW85Dz^jcS8 z$@DbX)@r2t>kn+O7@g8Wjv4?`#FMW90EC>~XQ^nge%JU#CR{Ya)ku8JEVA0ML3Qt6 zV^!yu)*;MX7Fm%Pla1K}&EL_%g^iSB#45n`6s5t=2SOYI>78{uXl}z41DnlzE?>b-g5i4{ z%-#?zJC(;n*itfL(v>H;NDWL_JYPe&xyKqwvs-i?mv|#N<2)zP1^8k0xL@YsdGadsa)53V$bYj#I zQKJD=~>q}#^! zaBMlg_S}i3R40Uefa(utE;oaZ)lrw688@N7@x>vvp0?w0v!N zmnS^-K0UxIVle)0o|aa`cg?hYx53v`RnnHT9Xj+lG?jN_+?ex?RsT>UrAwu+ zZiQTY^>oRL#Vb?d5M_T*!HmCpOHwiAh>VE^BCWZ}lCoVXJav*$S8#&_l9dmrMczGlpZA8oSnm^Ge2x)cG7s8D=xmXWBpp0hFi)SiEy7i2EZ6-ZG^Yh z1%vSbH=XuJ?bpM>vlE1NQoGsQ*YLsu?c&wMwZJ{YCt0!UTpe6a*SCfn${oF;!K73k ziL7G=K9i`jM(|j&aj~2Eba7vj$ID6KB!W1tDFTdy7!_ku1utDZjgdAl7T*Lwfk9^j zkM(`aK5tFp5(y1X#~qc3nrjUYc`J6Yz*I&Gf~b>jhh6)TVkO$*S24kI=P9adz6c|y zSf|2%i*|q%cs`AGNOE9mx<)MP5O98mPu3HAS{G*@I?G$E_a;(vkpTe+wK&ls3=9;7 zQ*VVi%u3eOi}v_2miTF;ts*?M)KS;$nbX58EvUabXw{A?i+HQ=NfwwX*QDVTSc%1p ztpFITXtCxgA7B=<8BPV@vyCTb?tj4F(~u-g6%MM3C*B4lGaWq7;a`5|gv*tj+ZYfC zNV)w9^3CdrA#ufYFY8jJ@fOfu+o@budDnZ5$e0@hsg~aSSz7Iy0ni{2OPCz=_{$!r z-Th*{d~;(}S5!>A_^&Wn_vV-Mm^;;}i3>X(wm%3uR6X8$%dqGq#)Lzra(8n5(huUj1H@k!fNWV*|guI~J z$DQSFN$-8a4?7CCHgDl4tSpYoQrZ3hQ~>@HfNuH7DNFA~%KW21xl4Wl$*OQ#+xZ!K zE2;^=D{EEUZIzAX`Wvsl!XAQ?>b$xRnEDBpi?~&05%Km`#2hOZ3sr{H znCsFw=sflj`70XJ)O-j1;#E@6p4v^hy8z}SK#fML-b0TTAvmD8Xv{sfvoMpSmZmI` z$~uW;3u$a;8%Z+i8@niS_wA#KE>00;RC>)*R!5cy665Wj4X7YL=UVX7v1vax)&neu>Q3q>-ob^ zRh`}4rnyhqjqBB@bmtgEi|~R`w$cYzyIbZLHc_*5Toh`*rDkuUe6Xt87~+(5B8#Mi zvlFvqaL%!MIii*T^~D-53R2Vw6^92fj(jh0b`Bv9QYSMcgfP4x+kXz3x!>)-vW^*K9@B+=`e zq%^yjTtg=fGk_w=o2H@>Dt9deD@;2~Z= z$chm#3IOz^3bBERURdQM+IGeNXiY;pKz5F$TSLZYNbf%m>BA2_%ScT;Y31s@^%%k8 zs-gT75@#$m94ZQ0A3dsApMu$z?H>oKsBHvHl|r>k zi+~6*>4SmOht)vT3lms8FNMkF^zXN8#~0G4u!}&6_>LFhUsA|(w^TY^dPpJB;iKoz zfc+U(%TDSPfmZj2Bea+AaUvhqV0_46ql+y|E91?$nZW(8+_DDl39j-yxW+Mf?C5+6 z)*9kWOW<$<#Yl=TsxLzIS$IWqmL_h-it<#>%s7D$!{$7qT4(?$S7vusVZEMKp-`hj zj-wU*zw5$pBx<(?g8$T7H4mMqH~vjg{^|#~deXsnI@a{N7rv(b<#)^^o0-%jMYOGb zmOPEQz9;+mF-y&X>T~aHHqk=&S0C7dle9OAc(C^&fi__PG$bQ%QPFu{xiuDg_;67s zzkZ~uH^qVFy!mn0_kukD0%i=*N)z)261AfxfB{Hv?hF0X0RPWU(u@F8tVY~%cl&PR zanGa&KSEzwCQ2NYed3@JgA=vlmX_zDw@H$*Vsyn|l&w7>P|xl8oO3^x0~4p;rwe#=>@6(3vDWc>X?EW? zGRqB<>mJe2LZL&RqG##CzB4?)$0UvceanEf@3E)@7f7%pm!cN|kGzCaU-yE3bH14&7;ziafOOvOL3r%F4R6xfnx~ZG;r{HC^^3UfLjLk8dmNYAa9ZAzQh%k z+%xipDvCTRmUhe_Agw35!B!TSd?Rwum z`1-SF8jlauE>d_)_;_nL`Hs`JvMqf7esbC2CyxQ(GKbpdfG2PoRV#eA5#VFmiO<$c zm~^-KPKti|F^T1|h8B6%ggF~3vY>#a?jL8tzy9&SG@vy0Tga3W`5zzm<0e`Do`R+F zCSTxt{RF0GjhrmDmXZ?vON$AGEc~g@`CUWb&zm}7q{|5-aqB%lG}Znr-qNg;ODHG= z)Gxts;?z&1!r^2wOi$0x4yJ#l$DeC!Hp*@Px!922#Oa?b_TTLcBA4^$VwFl08$*R%C?=|kv%zf+cN+5D1bpy zlVp^BEGt(~i!Qlw|Guxk{^c2vwh1TvJI8)S+`5DuG?K`>l6NJ+1 zSEv6DH-ApF|9!18ULaN&3nDi7|3vYAv`asI<7`WSx4I8Mo%-$_|BH_yV<#f!h&VEl z{xv=N8M`$r0N!%3E&TSszw-a~TRF!Q5vwr|hW*5e`0304L?;R#0lcN9cK_|aBq@hh zc3JnPs+3gkU$SXu6rzprNWc92d`atXqQ;n5@zehuamx70D}HM&yaIouCtxICu>ICD zZB7wr{VHU!4{o}~d}Z^Mv03gSy?Wf5Wk~hq0`V&b#Yd3DT#4Hxm_5AHb&^@*(;m!E z_0QntuXp*_9ZJ^8_8D#oY*e9}qT+!jH#{ch`B3)B;XXhY=knj_^}uzKECX5RcSV%< zVpD8%D?+|qe%T?y;*3beHr%ewDwE0c1;$^P@;tUuG=*z*XOe50o*Nln{}@oh=Rz(q zc%5XuFpDutht`Akv}r<6#6?fp+}&V3Zqy%KMlv}kwmol9Unh3V0{@FO>VK*b<*o7p ze>``%qsUQikB}oLksrY_v{yx0(O|xP`9qO# zv8LpxqLpu^atj<0^w6%3d#wv@X4d&)>w)y=5eGXtiSJW_|Epop zgvzne#@MSyAk~zzuqI1~!))rS_(5i2Lsp&H!SRxdXBA5{j05n(h7GDTXzmUUeNCwX zjvC_RWW?Vc*JWonlv&ru@OcV-25UZc@gMcPcKs#rj8VzO{+BPxTn8lN>LqXWsQ+~v z-zjvt?)7OgT2iFCo#bZ5mqU1#k1di4ZuO>HJCoFmzN9rm{g_z`421c`^bYw+*ukK` zQ)+fzAr&NcVdw$kXnzl_$lc_jFcPWD73Ai+d_Qrs{-Z|Qoe9yE_=H`vz2cIWlWg$P zRdTc8WF*K{27hoP(wi%m?e>S^uO(6q9C{7}J6XtIHl?~zt|d5PGE|sPce?bky9za` z3{Z|ax)jx4@qGVbNv>&Kzwq&8`0AHpa^o%HbbX{p+>VF#ec7ulX1Rjt?nb`L>2iO1_A_x^qgaQJi5&V zzItxybFNW7G~wOvC&eU?@8T9A(#hqgO0+6G?j!KfYGU$WA1xnYJ#E*^iq*N?BSvTZ z-LwsD52w?{k_ixkT1Ao3`fz-1$gu1@%&>F}Z*g(tc*6OXb4%9*=BuU^ZHb5per@Ty zWxZR|EK%ozY{{BC7mo4xaTLb}lcCOFneW|g=62I4dYVOu1B#4D#J`@mr46;D3ydB^ z#f|Qky{-Ow#&&+VSax9npNl7FY%@bM(r|M_40yDGmr>Z(XgBxp;#J}DhEG)>|D2x@ zMMaC|X3{@YwZOfYvBH?)KX#iL2e?vt@r}utL%rNvi)RYRfL0Y-M9YJ+dNex5Jq7Dz zc!+1p$AD?Dvu0Wr@Xj8JfM9cAx<-?`YVZ|>W$&_$-F>6*mbwk1TgC^OTqpO=-6&() z6pkSZLZBd76WNS=3~1M?YrWcRll&&xIzxWYvf0)3T-VW~%jHOSV2%j|p|2KQlH$Hy z4*`HL=w-|HR)Tl$@QU{$zvjcfs!Y#NigEg|AAN1!o8`j|RaB$-W?4~uHI{e#LVtNr_me8yS){FFu5wc1Wh=lGE%g zla%4w;wr+gIfJrlnJn|AS=aq!Z4AB|rIMP(^0lz7%DyO9!Gdv~^$ zM|W$m<23ECN37GGZs90KcTC`U`jx}80FV}#mI7whUB?JyrDsGxbVp6`h0<OyaoKaGPhT0Ur}qVeXZ;TCy)%udlnWlQH+x|yYW_^`tHdF_~W@tjG!O;q6GC1(EI6%eM=(+Wyh@Y@090| z#dW258?rPaU2h&$oLa!HAJfx0j#Eu-N=#Qqf<&%JMRxT$*e2>aJ==RiLBD{$SZ`3Q zF(IEyM!U}K^vxqjJDQN427lRKKY$r?pAz+~JWp%!A#rk8M%`dMFTI5p&bdRIZf!x$ zpt#`prR$|$A6ujoy!3r$ES7r=K6hkh zSUNlJOr_TXokRisY(nGPiAmC4E1429AD23MZ zLlaH@>v6D6Vu6c3OKQfOC(`YGHld3rpgFSGJv>bnmkmMO;Sj7abZTqc$~CY1G)2`e zZ9qbtnm(6}yUL*$))1HPYxxXWVuN8fMcP0e3B~SHemjs!O$)SW-JuD20M%tKZ1x${ zLK5F%!SaM#{Z6pKM!ao65)7i{dARQ`tfwB$F15bZ#;lQeN*UAXv&B5JD@e20?BlGx zgrhKNP`iJiX+ixMC`W^07CF>-H2at#YMqR9nmR;`jGwKC~(G;m`0$7b+ z8g{NN{rYz3J-3Q<-5Ju7$>DQ6L2sT$Q%c0uPVJOGlC}@Py;fcuZGIw&AXc_Yi?>6z z@2Ju?!JKKi0XPBfT9hdbLVH}!r7k0{?TYBtk|{Zqmleitl#O!2YqGY6=44vNl@(By z=eHOMk7CP!cHeeUS`QkRgJuwxCZVu7FB#5_)e6+Nc#Gzm-mzh&P7mc~j z{#I;NQeDC03!X)_$t`4NZhUvj@T^gH)o0g`92=;6+dE-gs6W_2FxfKi08T~j4X`V` zvyRpjFeu1%B!w|vr6x9TSt!>xwWxdg(VTnP)S2uR!a8ic)=u~c{A4o_XE9QXfGh?s zcl3i_aXsjuZoo_!O$PX9d^k!E0S6mSEz`JE^lzV@wuT)gLAbd$JRbY=b-w93yFtbH zLoMUy)(QIVedEHhPN2bd&sfu;nL&&K|3xm~Pvzz*!m#VpMI)|?u!gUx^X$Dx#~U+b z8T9R#ciV=_Nc8m6)1l7qURw@Jz8#n2$O#KGBgr|Na0!$UOp(JF2D%=5=^Sf&*sDtp z_gpK~>NrWQ8Y!=wCUr7-+Q4Ohj9#r?9(5SqV#!p;_bwj~^lfZx^!a$%njR5^Quwz_ z#4xc-Ntr-m~Ga;}uWW3d05nnH&f{agkP~w5&HONDHlUdbt-aZng zD#ia*UCI7cyyyt~ZZxXVN--04VlWL$V<8F%AK@y%Y)Rsj3xn-uQFm}Ql5 zm|*{_O)?g-1wl;h(%))N!OIkB2d3dVTw+%wbh%p0`<`Knk(qgqrhMjU2)en z6z%K+8Q46FLjWb5cJy$Cu)`N=4?-%BD2XYcrtq!Eh{+k7%BGuB_yFZZGkyO_jL%E6 z=vK7(t4A6iI6vCR2Qy_oy*39QUi4}x;AI|tpmRanj$3#P)uW+YpUB6iL<}TXqj}oB zSXmLP6NzxpesOeUkE^yVmSJIxiyzpPR6s`@O|SO@E$TNw>JyqGK|DFgFTCL*JJ5bp z!~-sTfMIZ*o%sE^5|D<<3L1uEt=_m^Vc_Jr8O)T}%a7ELuFUVaX5>y3^?b&#+c_Q| zdF-cG{6P7$s*}iZeB)7TtHIpkei`CKOaJha?#Dh-c=C-)#WD7G{`Twt50m}Jd8eCj zq{_z2Ki)qVy=#8XQ<`CV+9`Eu$xejsc7vaPA+g5||E_HH%9R*KRD;B&0LGy!2#yW*Ka$N^pI{|ce1fa|LUrBizT~4M{~9M zb>VAw^shI&cf~AGP7TGD5xiZ@B(nR$_Hs0-DLXo&J!5rPy(B@d=e=vhF_5u74YO8v zrH+V0Hc@_N8p_v{oBP{2`@Y_C9dR9SgIdgCjvcaO%dzG;^JfXTz`Uq2`e>Inv}7$5 z|E{8|mu;X~Z8_be?`0e{szrV{e#MZ8X*k|1{qXHuy=Hlq5c^!WlV!Dl^n(Hr4y+ z*|LfJfyqFp?f_FjR{M*&tDiTLRs+OZrwt8@OeVFZQOeBD5Xh zc7OEs?hDh#wJ6rF;lZq0`NrRFPB^T=yqueP_yd&c6X~Z-_k1E{#+&coW1jX(F;`s- zilp7@08R0AWKiv*wJu*pUKPn$li&i58$Ml22R>(%FO`dX9SS!a>~9&Rc8j z+6OsdYzp-~)`7I?w#ZgY2@CJ{`c{`k{g~PIu9`yQnmjS6h1ChpzSpx6lU7J( zS#z9Bs>9UKBE9r+6QvQD%u6_`U(LM+{~II-?UtfExsG;^_O6XdACxj-E`Ml@ZS@?j zxwDblhml-<$~2Th?QQ6%xaxGY9afN@KD#Hxu2=RYE@gx=7Chfv@*&=|HxEwY|JZ*k>L$huFOaa zf4d@B4^(yG^D#sdp^bSeE~m3@o`2@0 z=X$IE)EL=qI9i6UUC}FaV(?7q11}$3>_+Q&4H7+LI~B6Ka0x6XWX)lHwA}P|Qs^xKC1zz#d7Vv&Gxg$GO>k5&-(;8eF z5}HPGYW>C5`B$gFot~!8wx}jUBo`Mg8jkVb0&n!JJhLI%``UfxS3Gndr9DpBwLYV5>1%YwUg? zV$-NmpqM2>+!lW~43m{l@tZs>4t}KwzGA|R<@X*WOY84_s!dcp5|jpex};gZf=SI) zdECkpapu}1WFuy?XiN-6MG1R{wbl8m>TVbPltlK>iNp+2zs#Ezp>3+$wZHEa*KSqT zG$mLXQ=2a-t=Kl;SZ5Po-xy5EpIBZvsCK6cj?@*hf6Q)>8&o+QTtyZCn(M3AD1y@^ z{!@(!3t4^Q!^1d)#x>z03woqF5nP?#HVmtMf~>7F0}Q{$%;XP!=W6d{65D;vEPN)0~RDl5qV}vl9Nf&6rHYyofUR z5jH?*@l)(8V*CUqX(M+-0$Z9wo4k%Tn$EX9Z7o(jKz=da?h|BGn?m2Z1|(_E8zu$i zy)=T;_^MKF3NvT+*|Z?zL9}MVRe20o9DKj={May|#9thGq(y5zFCbJkkS(_pPeuOeTx|J+CL&;fdph^%1 z(G`rFe|iGuASPa5-ygR?uUXB#Zt(#w=uk?3~~mgdDbX?fUn47x2(@0WPmFNUnB>}^r_In zm8?P?J6R^ivO2e#m-k!!cJP_|-hBL3dAlDCnX;7yalUva9xwL0`QwUPst1}}4tksl z=wkz^#8F&08`N6o=tkz=vVO1z;?{O;SSuaJx%#8qq(3P)|1^xtc0V)69ZBQ#A)Lva zui3-vR27r;p&bXO-dm)}&Y3cq``kU#o-I$rA3wV2peDu#)xB9_Qs_DNy05h`b2Yux zs-P}cKf1KAYW!+v?c!Qv6fcSC)b@I^o<-w2gnT75=W28eJF3#D9~n2Z>kZTDjCBpb zr9Re;>oIv#TVt~GU@zpucnzyb?z+EMUJb!gnLf*w$rL7j#f>wvRe<}zuT)<#;zfM<(zio}%gSntng$z!1?KntnsMG`!A^&k(q|f#-1A`j`@8)v zPZpEL8D~B&w|ck8272e&HL6YuYvp&>Ad;wcql9gR2T@)XC?8-FWV%|V4&2o>OFgK_ zxM_cHSnp;fFJ5lDeZsrX3gHx#wNF$ZBg+y5=4rl-Jhw&mjTpz-abiQ5W#LX!ipmeb z12d*jAA`zqZ;;lssi>Qt&?%eRHgA~7jFkMGSmcAb?ZW1@)7X5YD2v5Rf|UJ!e3!ZWsV;3LZ^m^~h{=t(-{b3h- z_iPlx)Sk-H*5+IomsQ$l(Eoj8o}_j;%tM~C1m5GgR2sdaazVRP4taV@znSTxXU)0_ zr6X=4b0X@I_d?rKAzkKI1+|7nS8fvz-$q-f zDBkUkIiChg3=yW#errDN8F8Uh>(!n4=N;_UkJ0iaev-(TU9#2Cs>EH9Y}@PtzAJ@P zFPY}-`;5KD{9^F~xNSfB0l|TorLXwg>#5Ks8F;tGZoy72j$87uT}?1e)Mvl_{oau;@4x6_QX;-2Lz{ke9c$FPB-KPTR=G$CqgCKdMp z2qw<$DPB*Z5@`{q>ipH*8;#PJqVr^Qo~+O9=`Q%g+V_X`A3$5$OI{&%S5n%Em&mNE|Qvc&x|VZ z(}V4RY_XIxv}y|j!5+j8HyB(L&2M`F?WoUf=|`1-2W8blMQG>2UzXF*gyG=>x-~Ji z>K*QY1L38CB{kjs6owKjXuL&CWTpB1F$+X>n%^zDO(;@Vf!btmSs1l=~6Hp zqJ#}@$-Xk58-CF803!ik-AW~NM2IuJA&&#*!1;W5Vt%c+-;IgbLU@`*zS5F&eZcV) z@zVXDT)*>UycPmIzoJv)-A~2ebZeWV$+$ZV_?wU+x`8&D1LS^jtxb=s?~xCEHROmE zH$jV^*jor0(U1}x2tM{oe)XJ9$iFYuxn|sB?99t7y5b0{CxolUsNmES2_s^TtSeRPP??1gm^qBlfoJ$;YClY?&6hHetaB&%{m|XS zHX$Lc`!8KnzCjZ6zFv2=Vp^L;CAL@Zn66C@Jqmz(LlWV6YSAHYpH*yPc&g9#$9dL`0;y|yZa?U zgpwdhco03N1oTpAnmxxkHA?0WkEk{|gH8kMfW{K2iw=Su&!G`PmvFeofm05I#FGNEPh` znh2zK4z75$`d2|}hxKt6h^vLZ=V-dNtYi|xcT640X{fxme9E;bg%lA{+d`bioZ3QV2n#^YU`zTbdbX0{othih&FN%-e-V?9ML+Ip{X7}2?3#A?TcAxY;yrL z?^DTa4i=RF3l(i$B*&p%^|d;gRc-scoHvtJm@-fm%Q}l6;@D{YL^1BNo0>}qru#Ob zMolt79#*-o_~R`#Kk*2-j2_p2KDt8=F7HC0%D6EEt4M7V0O6l4je-z~cJWGjwTXwZCDoz=CQn3_R`4B%uLygRPCCXwJjcC1! zGUI?733XW)yI6B%9c&;2wj(tHir->%Hdhpyc8}KDFP)XG2GT~Y&9W0RR2wNKWAul9 zGGtjdoT>2D%R_lx97Wv}6FHGw*FLhg&P8FYCG{q>yjM`d0dLndzhGO-hd5X1E>CmC zbBQY-w-TGef>PhBM4I&2iGRT`TpT*unmcM(<-Uc(=yulz<%HGI%%RB8;)Mp!-~0C=O^&EA56hJ1379QzW6IEO-O_mVeNF(10{x=+ZAEvgApMVrv3pJ5%a>S_Y-yY zED9bfyp+|pOlUg-6)~CZNWS#!cjW94-|>Hc;p8&Bq$WArw((K%L+{$ymiw)C#I@wF zqe#qrTN1n*T-N<{b}e*r>rB`z0~+@Y{pjj(G6U>ElRvdCL?zwKmJ?$6*$~x^*iLPh z*(l3mMaMr{tU}pen{%o+BL>}>rgE1w#h9&d@l4Qwp9syIK9o>&oi(2@kvGSjcso~c z?8)X{#f(sI^cpR6M%(~}5xSuWs+FY2azeQpqM3D{`=Zt3=4M#*@$rK%R>@va3-LsVy{*~b7 zLuN2?t~95-;~?;Lt>ThGy>22;kzYc#wsu`;^e-4;4uKFI;C`#scHz~J8U)57kAKVp z;8)t<+P2{D;H;4sKInjt57`}Tz7hdzZgNI;f#Ax2*RVesy{p~i7ZR{pje@M35ip;G z4VfRMG0srUV;3j_JepDRcS(5!E827;#rlYNn0U>e+%8R?o$T zhwbU#a*0G4odYm*nrHY6jEKMPSFA^q;nLz4EN+!F=lGX%;6RT(1>3dXb-cEW9FG=3 zggti-xIO#A{b58~0jMZp#=!>VORn}Uzz_&LP*<-RC~w@#K}@BwOe-WlnkMH$c%gZ| zcng$e*ssvBWi05uhl6un0`CJA*L}?@6 ztBDmf2p8};m%(^_MO|DLNyP!WgVnv1@35T+~KG z4t}@lOTIaE^DFPVN~p|2F6#0W-6Fa8s{OWa+cL=>eMBY^@1LlV^-xVzg{-D9 zS&5$K5lHk6V%0)5rFaH@Sq6-`F{;hm{(9Fst83lCV}hk%C8x1FqpqI$nm1OOjUF#o z(UieSOS}M48|?Q_HUg|S#~9Hb*ATgWP<+r946z4ucO#vSp!->SO>!TM$8aJvMP?Vcak{{;u+5c?CS4Ji zcufP>!6SbYK80X18xL8}&&-4ifryf_aYdNUvW=i|1kmrF61sw5JuhX&z=ClKmJu@i zO!Sbszs4;!uT`db3Z%=m*CFWqe^>BAs!8yh&3MfmoCTt=T3B_1)2*iMLSzHzIB7qm zO&t_Ob#v?bW)PK!ipTpNdiNKIHml>UMr9D=`~;@+1h6|@@pfHZca`}cuyXj4pOjGncZrFsh7VK zoIF>QwP~-aHzVi0f*{Pg7u4m1tpTifBC`wbE$D?SCk7xkt8ym`)DSsAFABi*Bf?!A zI)l5`0a=2vqs%zEd9Y?FVIFsBpUH4K>F&_-J`a9za`9~cDZJ>2FtpZl^t_2A1`OwR z=W(mC--(57e39BMMO5v%Mma?tInVJ11 z;&e2Oi`U3yB;5^TdzYW4bt}9(urcM^wP5;bb?7_iXF>i;#GOwNnbTnTNj}KHeOkmD z@=KB`Qs^oj=5tQp?9u&d4|>{(|$e9GXn9aC!~i6Mow4Z^h?Nv7@d>8T}=VV9j8eY!qNbxP-I+~)dBICM8K zi+pQgc*$8?6?`Qy%qL=GJ#eymRV2&1C72pKnaKv$-&2WHEXcWMex4|!Ic7PXMHRS( zvKU%Q;aK#f?0Ru@h;Kn=$Vh{Ukqj$Df)boPJU+DTff1bB z8)S*L%8Xjam?FI^M9(`fetv+EvM?_I1?wSQl~=m28cnjQL;kR9*K55wat1rb42>%rj37Bzf!_siy3ZKP{1Z=2|4LPjx% zT7S1;<-v=QC#%cg6?=L6UDe+bp8Qz|lw&17^W2-~Zo3H9-D--i8tz0dNjyB2A{#JT zu9v)^xzI3sYdK)rqwlUZGI&a-tgWJ9W7yS5R_TLr&<)Djh~XA6zR_$&95E;O-c@xf zM`lxfn5}X+jPiC_}3~e)zmIR6eN7EQR&g$eo$2V>8KsMjh>fY_@b#wyC zf3n&?JS;BJc|upBm8T-N|CCjp+A%ORuu6maTF{#Pu!BHMpnAGTqG*L8*?wkNqp;G2 z{Dcy-5e6cecVpDSVYU@)%D<3dp3udc{HRk{*|gkNn)s5!{D#VlK&wVNv_L0aK)5=fScGED!KpKo!czv;Q+WhTo)58Ve!_)7a~|QA{l$?< zFix*ypVJdVxb_Q?NY*`KMNJsamEOKPU=WcUmvm+B>VZ%ZGY=pZ^dP)oHbONn0jOUZ z%f!FP3b=;J!IiKEt=c2Q8i*`=76)}Kdymv=hx3wiS*K-RYR4v<_XJ~bM&^)vMn!06 z5PpphuCnOv6Y5B~HzA{Isbi2ndyDZw@Uc(*{<%s~>LJ)X&A zYhPRxPHj7OfBlJf2$`N;`LfzH{^J$TKl`0Hmht0~Et9(3(fMz*!ZRBNBTIYo7 z&6Ypa%4kaIy)pX032|CE-FA&8IOJX{Rfgc@`J=-J>e!Glg7kbwaHW&oKCN2b5ZMYl zVRJ(vJuaZKZ4=}LbyHL!{^9HE&8q7a16SgCXt!Bx!p2X{Dzx2ZTAVlM?1A=iMR6xP zPE&%bO_jL4Cc6dU>P5k-_soeZ=|XF0cmXR3?4m>*Tg_os3GeXRBblKyqzD#LTt#qF zH}fdfI;>^J+MsRCY|M3eyjDFhc%_HaIF?nu<5x$vwE z@xej$;6M8R+NlS~!Vr8pk-s-26?(SbpW_DMA3Fwfg=g%7N!*IU@C(||q(1y_->%A# zWqui65+bUKTul0eWf~vC-6IE|YZ+3#fg+1@&OvKDQqPkOWtL$7Jcj|VRw3G&dGOf0 z=%Wh{A)gl0GIm$ixX6_3NaD>0ad5VGQVVs0^65+jx=1UQkun;U8>LuCB=M82dy4WD zLzcJCu!r_RXw!|>vU#;_dmm9@)kRYu`7~!D(KoIQ$9jhd;|bGBQ|<9XQi#vBUDzEs zlYFE(Vw4yq)4OfymCrzw&O;b;obx>YDtk!nJ?XCquRsVdnBnq%wT;7&E7){#*!~X< z<6y^bOitCyKsK=t1+i>@lb!?iOeU!@K8#JDr~9b`5whpVS_`RHx;gx@eW+dZKtc9a zK37oJomD&`c3$*FllhJ1z>o|($;gM?y{n}i!r0<`5g|c7_NoSNyZW|4 z_}?1iQj44UB=^1^o2yvwC<`mPPF9KAHf3bzN3d_VKByEo2PoYqL;(h_8=zKuRe#?y z(nA{Vtvsc5zzMY4f)H>DD>F!X^H1ulWyy;i_U42aSRd2`u zt=P@k%QVk zI(K?vyt}B)JmQn%TQV7`h$IuQyKx42RIdYX2#k15Kc0X=p31(PpX~j|J%%*OUsT0k zvgzBcEJK`GU#PCwLXtsvaV2@?d(MfU@;^iR@XtJIS&>IRJ?|P~&?XOVEK5C5uY|Xe zg63I7AXm6^B6~dMx8s}rqbJ7#=&K6(Au`LVhbs5bnKF-?bM6;}1uf|g8?a=!zpm2C zHUNKWV8`8GM&rb#(tC85p-Cr}XPtA8Xn2c#C`>?2d+>e?IcQzw4C~&}I_lOitI*wd zoXg!W6dQ+zvv?VGWKoWH(mU+PTVTI~@O964bC7#MQZjLMU8-Y?%Bu|p_jAMgeBQJ@ zA}3DW3$|VTxQiFWR^Km@U4d$pm|C~CaHb99B4quwh3;;vz>{^ zSgrepoi$6b?|?ZO=yHa-$N3;;Z_>ubvUI4f5a5dUmSI!vJPk+mYb_!~@J^4f6Dxd5 zSuBf8CS|j3F2`dyYcY^UO}hu#@~IPZTaoF^J-;iVH-5;6`~&L#FTdbmKSg|D;aapK zVr}Rndv1vVvocfDip(FkYKzPuM>z9DelUaz#l?p}v1g>R*{hSlcT`NN;QrvOdB}csL%O^2(MngHQ&a!w z@TN(YsOGQ*G_fqHF-XU|5lD*-naUFW5c52!$>lBZ)H}T)#2?>APT`#I zac5T4YoWwnNnKr!eP@{kwgOMy5w-_K_?*3@ras6*TQJ`DVtv&0EFE$ytoH_a&77V) zkU)I5Jk(6iMI8NL>mc+Km}D~hrQ&8zL91Umo`xuzzf}47*bk3NNd#i^9VYv2JMU?v zqkKc9z(aaI59_L)5o%K^FscGGT_UUMbuat-=%k&2M+;K^J*MWLffy~{(?32VuAEoC zzaT}WY)z>13rAgkYkmO;t)p^2i~q=%A4+QY#CJRmi$LSKm!OFxiPgRH zS#h|4$w_IivL{Uvk+8nXy8i9ev_pN))h>C#^iRrhuj+5jbj=Y+yVt1#@iXi8>+s7^ zd(H1!xC@_#1os?bg6TLE#TC`xEazU=1d&J|FsG`_eP}rME5xE;8MrwwvnGi38`4J) zZz14;2!1gp}J& zWm39#w@4?p`Y%I^zTco+HU~-LB=^~ngx`UjzqQL*x*uWwKggNpJrcxr10;S3%TMlF z8D6(;G4wRPPHPRW+WghtoDb74<%~b>L1^DJ-RYdIJm%_mo6rR^amH-E^p&OSZvK?3 zP-(II*yA?a=Jb5tH6T#_ac5lj-l1JTN4@rap*)vx>W`U4?k{F>^?|lspMQa=mp-XI zZuHDz%Za?c{^)6Fn>D#kMzD1}UeyGyuM4pMuQVv|0 z6jiD1PZ;pnTR2`ubO=ML{1Ti3IRxbIC93TO?y^*J`ZH3q#w8A%p)ObvTUIf-nKQMu z)2hqoBU)ces0w$y=JfC7I2-ot;M1;isiS02iBZ0gvW5Isr7Mm1I8Ug((sleu?+#sm z_&;U&2QvRp+mxnU$h^&l0gLA%jn9M<7r%<#&4|{Z8x{F&wV)bMkOH+Z6f-*8l)Q~g&`aik=CKRYle{SP#WoOq`Mh9zJrRdqVIFP zzwaNenPKLfz3)}`T6^zx%&fbU8@3)|NvSm?88SDM;Nf>zkI*-lE0k<<+4+hg_#(3> zE_sN_+q~bh{P={0*Y+>YXo~LY{n1n=mj7j74h*Ys?-&n47U;tcan9+B9yyeEZ~F3s zw?>0w{p_~YJ63J5qz>v6XA$(pcO?)V6x^)4RO&@ywnu#Gn62dma{L107E!hp#x6Ol z;pfc$6rw;47EsZ6Q4oRkKU6d(@5dk=7HMNWNOLIS@a7szvNw_eBC;&q+n$%Ho?k%~ zi^X82w&Wb137YW06B9SY^Um;uOW2EXnPk^@;*IUVNY@}UdTBWy#=(BvV&l3VGV+`4 z)2$RO{(VmbA<%}TO%5;9Y2m*-i8*HRyr3Sx5Y@K_m;Xog{`yM58N<5!s!uGQKcy(> zQgF%#C5pA!YA9Z&_-3x5pjT}z9#lMyI?K1((!-BZbj^gGefJyxsDLB=c7IOGOMg;#S$1@alz`@R z-MfE+{l|WN5xh16iDlZ=BGz`1_%=Pah_4wFG(tp2Yn-I*bV&mZP<$Kh%zDMIe(B?tAh`y>1kN(!jJtpEj`VK3zkd3+N&olH1jf!kk)0`WOxWda2PU44Xh5bXSCn#XDz^)=^7qG10^7J4Giy*vPq!@aE+YiD(p zNPZTqpv3ED_v1A8+5i7x=mJzQ*`zCre>C4Z*+T+WaIM`lewGo?Ess6Xm~wh80SDOd z12vmMf?Ny9QIjQ&!M^UGEbYs1-Ge+3VdjDM(#R#%eEZe@X*6oR*&6fz0?oJl>~`Wm zbSVi5&oIvde<&6si%j|GAbxC*`#kZ-on@WTTgyxU=9xc?YUU4-&DxMrSamqSnZbilVIVBCJ5^{)4W z_4`W*hZixU_opfep-lg{|Eu1<#UL2LIK}Ez>H!`C<&dY(Pyx*LIu9@ZxWjBOdTaSn zJ-_K-lx96hxXSH`^@B+Q7WT7X{Bp5?I6(4=FCm4eRiK}q#^L_^(G$@px);EzPG<-^ zz`E{8K%gkoFxsq5aXPVY(Es*_R`q0ieED(HnU)kWO&IHWp4iCbfPRwKb3c?vpyI~; zNm|;psE*80Vl|%sK3Ymjz$=4>uRQpp=j8~_q`}Fs<_;sZes&1DErWhuRm81Ze%;!g z=i1wgIGYaNVBH{n_UQiu)*JPD!0j=K9E9 z^k<(qY>M|A-#(1t{`B$BiT_lAtAw{7yA`0H>dk4WB)I+F_yp9{SK+PKe@H6E^t|Qw zk#3n)R{z^bVX_TsSZ9*?fYi!$x~9p#uDG>c=1A&JS|?cYqC8&_*LuMloZ$wC$!;7W zG3Q>jpf<_#uyGuWCht!(VK-~|nGC*HlVZqmW@%~hAsVzSAz{up(xTua?q&ZnvG=%u z7OuzE6aDrb0c_Y*aHzgcxR-WG`t?Xd(U>RbP!+pSqjsI9FIz`hEIlDNtZZkwOKyHy zR7{Zg%6c*3#{1F0b?%MvPcscGQl}?5bs1uiDUxB&D|dtRd?akHB9jx~v487XcIt80 zX|C4KXnCJ8#gB@&gCewR{N;PtbKC@?v2PI;#_bNpsZ=#+bEONi(?Spe@!LQ7yB;bv@5S;KI;Jtq=0C}2a1})d*P?TAdkG=e> zXFj6|*mr;ca7W=L{;%Jm)p+HOq&s4e?f9qIv+*%39Ee#7D{?iOLm~u>|NPL#qKgS? zCzfFOEP?T?LjEGfP3hZRB~_e%Hu2jFf0Yr(+~ZtUq0Zj>y+4lt2-}*PN=*4MmBK>y zyt+aknRX;mXoYE@vcWr+rDzOk4eMjQr2fol#x{c)XKB5x#7&L zuin25Ge(xNK6r!tAtiDg`d-Xqv(CJE|876(&xyfq0`=J8^39A*EB82Rr*mGT-u1J* z+@8~8;-qS?I6t|aa8O?Qa@C$f0<}WY&>b9JnL(H~KzB8_BIvMmajo_d`O8H!Dz?l6 zv;4Y&T8&k;jY@vq!D#1SkoltyfL7>iz! z5NE1YfkA(xwhffWO@K!GAXpSoyWvoQW1goF*!;7y>pi z%uMEDhsK@C#ayR;4}tdcrYy>E1uDYB(uk{Q7>?&POv#QyvzAC4?WXhEkG?AJPpPhR zQM~?L$-e^=kaiN}p(ce7|0@)7*T14I$d%Hm`QHPqI^qUTZ|b_IKq{7+f-zw=>b7hH z`j~GIotcgbj?wK2P5rRsH#qiJYJa-gQ%C`QzfXD+`ernqpS0fn>0@P$&=nJiy&(u9W#>H=L`+^02(tob-^VK8;U|iB_)_=SCAK``v zgG+J{?56T9x=iqXrvGK6KfVl8!vOq)h*$ZqS~S%62SS929VUDLM_QdwL} zg8iK1&)@u?M(Bv~nKT_T4gD6sqqi|x&W2$?Z7qS~FeM?w-mmEWdE4(pl@&$Lc|g8q z@BEz;IzS&1{3-R@)PRsI$K!slGya^w$(58ZDk*Qq;+#4m*}Lew-#nA}WRm60KcNyJ z2WSManji6RoZu9EnB2Ac(QgRSZU-!!6&f(%Jwd>D%VlRG{=Ctj^A30~z^8bUW8f9X zs+~rsU*y(?d6NI2aV7sRYceqb7!DQRn19FNq{``l|KrDDCPm%czwKL6T zP(U)^8>Z|vLw|1^FI$ctq?d-+d$X(jDeMhbVBsr*C)v!Oj9tjo4%H3MR$5{{pWIDW z>M%5j$);aAvRghoFpm6BfMiF|Ve3(3Isa3|3uX$q2Lhcz4&A{~A|R6+@r*iyF7Q^s z9!NCgMz(|d|K|99QG#z$dhZWJ$Ggb5TzxBmuIR@9wGPXSAsEOcrT=e-FQ#aY!niKmH_{#qKz}`cETw70Wqmjg0 zFVf#xOHl^2Ql;^yWyWL!-Y7L~zuEMxPOK7q9E61RBtMYsBi!BTf5Y|Dt^u7ze_aO%;WE$H^E!z5?8InrWmgJTE7I6rCBPe1sW9# zU|rPxmXC;kWq*Nn_SS%cfUq#c=Zy>hEm;+~?e?nsVFzo}YHn)(+F}G;zO>3s6tJ`k zHi_EZSUWpAm<&YM`1om;`72BRw>@C7RN}pTj?|;KH+BM>I+4P4y4@EDw6rWg=3H$mH%O35c zF4$+l$=L)!74bfJjp%!}470or)U3SCM&I)Q+!Vwf`?sz{0#R4u&f0_QKQP1v`LgbY zBx8ReDXq3zs)x^Wv(WG86gU~hX+C&Jx-w$Ir4eqt7)V|#tx+~?=L0T~-gXn_3pM(T zLk9Hau&n}m1R@G}JBcLlF7YJlY@uoq>9#k$$(IbQ{Q4rv*h>e$ zSOnsz#WlJkCa6lIh{)$@z_{SOT%J3dZJ)G}K3KBQB8pN>CPEqo$g@QPHI|a(fvdNQ zUZ+OAbj9)+?!^e};ZR(vRH`ssu*6sA8?`Rqj!`<0&!4d&k6sXcUY(z0eDI7aw;gT0 zoS1IgJY}foRo1=8T61O7yIz_sksIU>BP~V=kTYE6otmR8u|Bx0vF-j+hGmgH-4E5K zSsrci_3+uUnl6I9Bwx0t_3bBEjT|bsA|1k6v?n-c=0i3Jbuu+5X7&-O!%I9bhDMZP ziSgC%ZS0O;-c{ZWFFcfxCbqpY_TwQbp7<%D*0ER_EF*R!?TmzBTM!H8cDfZ3{;KlKIeI zXU4BqbAMg?k(~$*eXgB%1ZimhO{Q2{%<;BL8>7Xp3Z$H25}S&*wT!7E4UBDh>M-UnP+v~M!ud3iE;alved_UnW=Q@V zKgSZ|(MZ?g^L(D?g+_MA`N|ays;-`3?rK)D{Wn;RoMnj{W$se@O-RY+7JjC@k|X+O zb~7o%O<~+uBn`hP0A>zNvP~wZvNnl6OojuGA}1CLefv6@O3iM@5k$2{k%Do-+;ARu zPeQPwzOI$H#3?bAdsS4vtp6I;euQT^A-WB>cwC)tl5NLslFruUB%3`MAFKi{b>{L5 zZ%vKolhkM$Xf!vLOwJ> z_r-Z4=PlYDLPM3kP^918tV*$%{G(52;zh%|R?Y0m)KU=tXjL+9x-E;37I$aHAv$%| zrH<)XKKjCcwpbIh6?1tdktQlf;UT&aZ5}6tMOOD`N}eor6yjF0f*<}Sf= zv@fTiZ3re%1s%y#K!@|7{|K-bc~s{D3$xE|zLRAD;-Be5K1a_PQi&U8kV zw?}TcZ&ATZ#@09N2jZrBfeECk#A)wSquw8A#b-D_U9U0FyE$sIHAhJFsFpR7i&_u3 z<}Ig-x%%!ol2sqJzrgA8Muy8#ZG}RToisV}RLMJ}x?qlwV%{*Ri8Yo zfy2cgb&-hkh=PNlw09lvABw0qn>Lq;*;1k#WHl=7R2k_vFYSM6v}ia^QZ%#GHSMl= zgD-o6*4~pJb%(uK;ztbV8nCK!6>Qx}3ldbm*cxW8iq(T|TM(z?9d`^ z{$WhF02X}$1qj=HhDW-&C%JK+X2Oc^K_{AG^X<;(O*&MpmV*fy zq!z0|2CUjRcf%aIm;dMp4UuR zC0BS@+iaQS(lQ|xr4+-rk;Hi`aiDe8E8 z5E-6N)}_``f~p8{R1h`Q(tHRvTWb0c$>->1_OV^uuy+)PN_j$(Peur1+vO!=oFfxb zdrgmdJJ%X7gO;R)wCeSfjw-`__*mx|J}{0jFXd!%d}y5^f;K0wj|>bMGK&LIDny{u z0c-fR5c`R%QycLjq{=O!v)MnWwd)S32jbb`6u)>}yF{(VVJdwf-os*JNv+u%N&3m; zpGH|qsUmenZt}XPT~I2#J8E?UPO$Qs zftTNnGJ2vb!OEt!u73K7a&sF7a#E$Fe3(C6X!f-L4R$qIM!v*aBe1IC-40f$RnAf7 zb>k4C!2ynlaKr~NzBf6`)MJKDR(iL&n+|@HA?)r&C7UYAC0%=>QTaGJes+^(z9wNl zXc1Iq(~CpWT{xNIAuE}E1=f|(Y^df4J0e^%o+|lCwrfFd>n_sBY1Z1v*TF8-x8TTZ ze(SI-t?LeMTa6c0AqL>u_g4Vi1arN61W;*A%tuGG1$X#Q&+G=K9&C^A`3A!CIQKczt+jO zFt0T3zYlAgUEhThSydRh%=smak=Qi~B|{5+nUnN`qeHehU6{5Saq$Q8@QCsgSLXRi zGl#WRfF7w16nuAp{X9`jl94iORmkF23|C}} zfO_$$^_#oiD_~*cb{rM12r!v`_C?b1``Y7u9W*3@$=-Hrpp~S7yR*RBdG)27t0z^- zRsx2YIU{nJ)C9rKC#x7p!Vs-a>|-`~!e;=4nIkurIRKJGdmQD`=0D~G=IDB(yVzz3 z+EceM&|$k2O!4&-oMjMoJl@_Kv4s$!Dm4u9pbxvrt!cemn$`j?YrdJa7STewr)-3} z%Q*^>S~8z$F!-?KLx5mvXY-O0=C6|ww3{J0hWm-aU-`#jh?$xoYRUBlNl0&l%5;3d z{0fSmI8V%98Z=R4@38Hvensot-BQnT@O%a=NkilG2Q9+e-tLD3)X(xk79oDgk_7&T zR^!NQ?KtP84&fiifPj501c41bk-Im&pZ=y$Kio#698d?yE=`yG{ZU-^GXzjQ+1hSA z?^hn|{WCD{vKYGlm`4gv$!39~)4GEm|JFc`JKFHf!blKiH?g1dBO}V|!=rrr7drQ5 z?eyrF`eXL{wiu;mxa~;P*=p1g)WDYasmn{MW&LAV9t;CFoL8RN&n{aH0};jYm_x4x zcekb6T=QAa(2MsJsZ9Za3U4VlQvnxj0SGGNjp$ND8-&hGM%{2Ax`HQl*hqGxyI{#| z9=C+|&RjLz;y43{YDrTg_aZZfGrdb_4tK4!vS=Z zoRmyPgL8%r^HhqR_m$_&73f3jqAUZA{1Ya&LLpN@@95~VyQGUN6;^i6;ir&JUK%|# z?!Ll$2ejVZhB3uvNA1$Gu-x=-X9y<20zLZ4XOo*YZnTxgJ6tMt3obID5gktXzTl*# zeob2lwCrj3?FJfLP+Vm~LhOKMsVH2&{N9og{IYJQGhJ?h0?1W7NsetT9Hf3wD?YD4 zbvbf_>`7?ExFl2YxKk(7zKnFi@BF`DjO zeQdH zpx!FLL&|j~3>F_|N*LR52tyuNLv#jXU$}=P$?GReN`trrh^+%lxj;ikJkD>^vYCf6 zB`j4xDk?UXsNy+4SeKGs1G+A48b>wH*so;$@p!tQDaTcT+jb}AMvjTIJ8pwtf%!jU z@FLavY!ws>>+gC9)B-$wlq!3ZwD%qF(53G1xA({&#$MifLL4KNk%;mjr^?%Bd&3`L zewXeB#Ue2GC(|K?TyINWvuWjG`9v+6Gq(p(CRrVw!!nyvGlK4YPD>)xV5Liq27{x! zs761LLIpZX;>~J{;5ygW*!O#uz~rE~u5{3T>1cTdsmigPuaW(%b?4SWrevL}UD)bp zYAJV_4p4{I+S_*&qZ9reGb9DOEd@uf_(R{uQ#`h>I@EPp zJJKZkG^TnI>zy)b)O8FuHIhUXZ-XFdXgU;gd)M&Q3_7icnU`^{+%z!O4g8=c4p@pd zvLb;yNoSb>s|1<~`pbSfmAI?uc~KT6pUdTgkT*>FCo~zwH~`CAc_c_ek_iFi&BHy>~3hahAv_0Wx~MxGytB;T@(D%wN1JIQE`D z^Yh}l)cbt$Fv+-SGU3iZR8e&!wKJeRc!8Xk%rOM3n!`kks&Cu}lTQ zpgNaEuscp&e(Py3fcrTrPE}3{$HTt#PTWYZf?mg{AsI5_&7jJUSfEi)GqEbFg9b^9 z-dt3Don>o2CvU9G^Hr+ak_Y0G{xd$JGM>&X_Rk|3;JD39k131k2cttL<=cPcTtzql z!=MQgKwy!L$mOlUvWW*^>R6wFxHyJO=7q}Zb(~&O`RJhAu#gqFv&r&-AdLxT!_L7r z7-@ulj{OOj!BPg0)#hZdQ*}h;%8<-&?fR^#vQgfRCK%8>iuZ8bgJ(fpXhR9n@PX6@ zH3iFmL>!zV<%hlK>D!z6r+v91lHK?_Fe9rE8$*|hG)IokR3hDA(7Gi1N zoNw?>NB*;Y%2}88OQ~7l*$EL#V~qcOSF0YQ7MrhRn2z<7JIHK~_^xJYsJUIO6IdrH zVXYJaiE=M6kEMyUjz$84Gt$C&cpTsR2)@pL3O!%ZD15W(3tla;>9%9la^#) zmnJ@Mpxl7VI$r>XdJ$XGy)Y2yuBg7uaR4XCZs85x9 zM1fDl&NNom;h(ki#N8u9t)pSq4vqh2wKs$%MS~3uP_6uMeYW(P|DCr70^gwgM!5i$UbKi{_un zj~HXs>0r-P!{(c>$QZA@T)NS@{E_#r!B3?KQa4n$tu~Y8hOcTAK9=$)08)aXWMMiQzqJn5}#sUjz`!02OLVMdtd$7Gn4+!`1c=YjJGf(iE z%d+0YzP)!u*T6Ud_l-w`IWuZCT*7l9-Q1^P`XfF1&VufrtlT$F@G;~3Rw0gQ#X!I8 zj73X60~{7dBX8<;pH>{rM)H*6x9o)h1-tj|LrI~CtbJ>8{1nZ)H|$)uw`A4r;!T0s zS(JjVj#OSptN;O+ESE%tGcX)}a;C5d-MiuHw^p@Au21u`&vB zuzdTd@&(1U+LU%hHz&I_nEe8a#H?#D*U5Zzm!IH%9~!Na06$Ac(ZtujM*9(hM+Zu- z3#PQkZ!L_C$FVYsi$!Njf%X_IAN-;}2svz97nq4P&kt^U`Au1!hld~==O2T+}Yg-cMoE{R<`#%Q<8Xt!{1*@F4)V7iG`AsEnB%pDS@?h>@o z=1do(Aj5OV=+<$!d=eFkZsr2CJFZlwb)Dx`NGm=@XL<}Cf~9*f$|$Vmw(-g_eYmLeoP(NyAb=sim zw4UaXXS=hT6FKufx7#;3QzkNE{1y#}c8T$zCDo=v1=SE8Yh%z|tGTp1VG3l;fL701 zBaAf605%^9+D{*NeBwvAX7~f%N5Byl&AtZIOW2GKpK@DEnNxXFAH(a|uxhUqP%a`y z7B4D`P}x(KAzoloZ+42;4OIXn;sw{B*%v9*Fx{KU{9z0WO(7w$w8QQ-v4 zVAWgUZDJJ-MH@bYcms`P+JstDxJX2~mZ_r8@qQaOaVS7Fq_|9E@hZJbbaBQ3zHL9%j}$yQ~jxm?sW* z7AjA-H%?>7Z9k&=s{A;~YQzcNCz$=1Lj{*34he^hc{uOJ@+9mk;EJh1Lo5WPz^{!S z&3{V6laGT-5~%Mbc{s8UzTvkYdBg88fPrZNtY}8(Z&8^3Siuwcbl#dRum@9yJsmgt znIOqBkN|VFy{TQ+aTVfd-5Ubv;6o{DJ_E&LD~sKN!>KesFL6tw$qh*MJUI&F`_cFS z!sbzLfa4Q&T`K=_MvlmOEGJcpaB#B(Syd+<5&I_$9=e>q_K%M2VRLh#<;1%?oH{4O z!{0f>GLsOWn73akD6mZo-|pb(3TSpxVRFNzZgD|!cm%@DPoyRvD;r zM|?r?{U4-3o}te^lk=G01wu~~8^ApL_mmf?$mwL{s8I2z`!TU^UtM}QwoT^g=4;ND zW+E8B4JY8E`m%5xG(Z%+l~7!)Qq}Gk(p|9p3@E-x5BB!oR4ekq>Uq2Ff zSi~@R1GL=JItaTHF76$?Yo;%_Aj1rMhVPHZnN8X;myATSZEiMNbGdwFl-9`IIVy9D zBAmizoppy2|NL81cpF_CD{X8>;A}9?Hy0VCZ>QPm^NH_w|N5J|t*39>NOocNlll>O zPCYRU`|7&~GDmNxDqpp8HuLnpxv}zfY@2*Rnz-F*i6xU6;5bHt_wqrrDkb~pMInzo z9@sO)@`5R|fr5zRT9PBq_JBp)6`#<%$g)d8Zb}1E-GxGUS9}0t{=C8;)+U;0fDSC& zP>~YdDl(H(`R4{XlIe@wprsb{^j8gat9Jcjglb9@vA2{iM)@5+e?Jif*qr_@FS^`j*Ap)~dcjZ8{udoyX{`KMSEx4SG7>u|8ke!7s7 z6^D*a`|?BFCeBc{lfjX}1-t(Ke(1{b(iBH@tNsj+RZA$#XuqGSR+JNVwU>l0%|lr? z>ouPv1QEB)4I!t3Ji3HgCs9xX0S=DLvPYOIB&9SwmpY34{^oGqGCtoPMsRM=-3=#ae!6pRhaglk_fbyE2Uj-#$a?&a`TVO+W6sn@p+L{4%F3 zZ+DRK^Wvd%wju%9_beLUE^5Br>i3{&cxMP(Hys)QIqqb}k^SGKbXhI^4?oZQaSfTo;@I)#=3AI!XQx0Olew#Ecj zo^d54sl3Wj-WBsm$-C6^Zn{HMX+smkmyNq_;VPjVoCVKYJK}eHk#3k!DkcfZM@-!Y zul{XI;ef`g=yg-5PRs8=TU(&X^X_Dl?X(UrR{MN?%-?*Ot<>TT#iOG7^2@E`oNx^e z&2fw4cKq#B+FPGqyNr2oTz^8p7c5@+f;wl%j$7)W#HE>qB5RdQ##?~pyh6v*qnXC2 z)Wy{WiLi&PC>1QybqUn%c^f(BjxxDjpCY*z)0cX0xzX+m};G@6tpmAs81@S3L916$9 zbV}-ErBo%aAUzSJ7jfopl?cj2nwp{S3!CJIo0?fk?0GugtXgzMYpd{)oZ6OVrr(t4 zH%8_F|A#=qp<{Sj<+r#8W5;_}v2Tavg0jfyf4%Q4Gj4Gdjfl%`Cup_6*eX)nY>eEf zO7HM3<5EB5+M1jHRL+#KyZFX!GCR3?=wl@TW7o&K3zT^W%51F-{Z+6R$q3A$G}FNm zP`bTP)4ZP=6EOq+6X*2+B6(XoDQ$?52VO}b7;8NGx$lguc02E~k~Av1 z3DEgkZ4T}+&DU9w-wkBNtt9C$&4%GZs+?j@Y&g zXB zxluG=Tg9<>wVl>I)m^sdrSH^*gNatf=rI#Zp?)VRUaiK{1A+|KWxH9{kzsuebt=j( zDAJsrHywh=cNH$GFpR)(~cE^A_OD@&^P#^G5Dy3kZ?6Gej!uH3M0#0nVTH@IK&4a9VstBVX_ z2zHf+yC2zF<=551FuMIhF}9CmXUC2%`({9AiBJY>ix>O~wqb2!6(8CHqPdgvkR{zC z?2iv$Hr6ebU1~Y|{CU7D7h+gLy#6SpHedTFhvzis!h3H?Ga;jNR-K6lw{Z?nmOzoV z{3vc#?~rFgg++_MK6vM=ZEHaJC#b~iS0wM6AZ#+3r6z4ucJ@7f4*V&I-ah5i#4W=l zfo$nG#^WxQzFBS+r;L#gUv+JoHWvEB-!XQ&5_i*ReeUsOD>G-Q{CJyBgjUpzjU_{5W~6d>>3vjsoDL_ zBcWjwM+hM1IYG{Q5j%FgEqS(O+ZR(ySIwAAdY%|BamVde%|l{w<#1b3wkVS?qML!y z9xhE-2)sJkUL#k`V?lT{PpUI)?;K>PQx>dCw9EEpGhH#?8+TuNZ;?%qiO0M4U z0Nvx_ltZu4+8B?}jFu9MS>4pq10B!jqQQBo{V^T+-7$GG)Abd~0Ud^jzdR72p3v~n zC9t0V+?igHFtU4BLtn%u^pt6?I-oSFC_WAvjgvb`mQW=>@_JfPN*apdZ7pW4VjRO? zc`8%8%Uid8&cn@a#yxwf)>n@^tJ4&1->SPUGk^4jywp}~kPbI~7%NHCl*{(eujg}020D+O0yGd@^Qg?Sp_zk4<;5G?qTP!pp8eR6|d9o0yhA9@DTMW0dU9NGiFY3AB=4-R&Ez*Ee#EEvp51;GI zzBgH~ipnpV69S`6t6Qj1QYI*tFM8k`@}gM{VyBi^cv1xOcBX;+!8wV`pWtQ@uX*oy6%Lb;>PIhesWf zPmF687uTA6nX30=B&L;?p;{|HdR=RLU|&&1{%Oed%VE`dl|ePcRyiO!i>7 z)v8pslBm=#Aeu#DQP}f9cIM#bhsXP&WXp_G z$vi1=1&1l=rGWxuzgv>poM-#@D&uXst7+GpHG|9sxVJ`?yZBKyC_9yv{>w9Z-A;ED zhi5XJmxfn;_!Mawoi)85;}7n4Md1D}qZ6aK52}R7S#X^#7s)0Ftou_}JI@`~dRX)+ zptOc%+XbD#1YXUwccxfCVBVEQLQeJkaBzue{*lI)e2M-ny3($LL>C$%H7ER)QQ9pe z(sZu%STz@%l`-9eh?}9G)TItHEum4Em3^AEId4xprW^Inp{;}CK(>9~`$oFyt36ZH z)97SlrUGY#23XrXRc^^gq&al3!80v*)+!ie+5&DdfUM*o_iRv1Etyt>xOFNhk}e_I z6$yt(2U(G(p;-9H>f@*E)3t6QC@U%H@4gyF_TcLA3Tm9(Dh>!{4it+k7=eUkbaTz* zsh%b>5)REIwC%Q$h+;V;t*1AhP9>T4r6D0osYM!FSI@{kctt;Me67a5jdP*;;9!Mj zp-ifqgT$_pA80bwDq^K=IewfX^Nqq(06r=k%QA6!b3IQD!M zwwRTl(aXNTULTeW=Y{Ts$lGIj~4~?)5Sq zhu|lye=bn4Jh*A!AX1D$eJm^}cF^LFF%kCa8%Y!JyU~WNFfMMP66ZXrEj9Ck$yItv zvqVlMcI9EDL2TM-w)IO=-SQ(S8^%uX5Xt}HjYHO*HUxTu19d|xc%~_| zNjgE?<>>8(_;hvuj}84W-)kR_O*33(iGYNg1V}+16kd!!Ue>MixOiQn=8;Ky_!T}AQa(BDGFw<3kW4BqOIkHm}$Ir5JUl)u`M!ubyb z2>8GYJifyhsF>*o>36jg{qZ6Xoj}f;hRCMgIFLf(hdt`n7l==WN6ftnwx7kcS}h8- z-2hg#;z>p}D?Z-#WzWN+3qp~ntr&ZU!B<3I?f#Sje=?qk?M74LZk0B5UteZz{^qn_ zcNwo@plqsHVYs+#hCE2Tr3ib-rNq+fkTzF2%M3BDu_EaH4%qi@K==`V(Q&@Q17R#* z(8r2u-FU01HzhK-WjW9+1=$t@gQP?Twj2gt*dp1Mr6g8z_L?9O9Ab{#u{R5J=o51mdxl491wqN7xnTn2pKl&VMMPkVepTc?-jK?!8-tPE`s*bhmPM&d; zaMW>r2%=cbpueeEf?|EKfWmQzKhpDFO~w~-YKo&4O-lCdV1SCaD4^gZYC`9r5xx3A z8aFOG>?BWdl1Gt4;R~fTtZ;Bnq)C2Uc-W#jBs8^TMT>^t>Ma7Et?7TbbVc9VOBh(7 zX>1Mc*3Aozqe{R>znGi)is>c!njbgVG2Rc{U>h%tgDZ|FwD~NHuF;c`9=2xSp-Fj@_}tdjbjSBry!yKS#K)-!Z&WPBNSU&FFJ^yMbH5R| zAmTj6R{4;!6;*)J^JFC(&36{P6W!4@5U;FG=a%Ri{jy??@Ea1^W5aN@tz3-l!>D)W zj{`X)S<2V+V&A1F;f53j)N8ek19HL_4i$((VleT9X`Gf52LDcfTM zr90FRd@-9ntOaRx6etKbOfE&MqLkaZP~jhZTAcFv=~4q3k*&i{B4E}|1y^yQ2AZvC zllGYhV3+BryD*QLnFp_({;a6P$e*0%qA@?`_3LuiuOlS?O?WsaPzM!uTBvsei5roEM|G^3G*_D3Cp0_vkqBV@^Aw@$wx;Eo+K$S03Wb^OWRNN@hJCdzAO z>-86hsfX>fZ&NTlVw6%-_7b=Av=Nw`&1T4aYN3z@c#oNRIy}9iSDbiaKEks;pI@<9 zP>L)-{$Wz=)(hc)MRQbV~v?!BxFrqBWqo0`5G_gi{ zGK-Fg=rxrYhaB!J6TL%H5_!sn1jW)A1l{5j3Pe$nQ-wcAw!;;`4@bSSXadP);Bm6L zJX(2DCB-27K+jzo;&G);;xL|?AT3E-ghN7ino{mX5ci#@ovbw6M^{qIgeVJBk3|Vb ztZDe?G@~Z3a~DyrH|=H|m#16;wO%iO<}UvHc!f}X?n}i&mi8r1fq8OHjSiF1ttQC* zHbryp7Vy_%Q8Jp?d(H*%5-?tQ%Jb|&F>+c@f-miEmuf+4STs{+Q6z_ToYAPlJr-uJ zeb2Z$N=W(LwO;3RxbF0I-nYJ|68evC7Z3wFrFA@Ak0AWr&|?fP&Z5m79Asef)K;*zWMG{@uSygU5C%kVY8+91jNE>-q{L5iIhylC_qb;}YEQH*-R zEc+@5@lTj7$WZ);K!(nxRLMWKfT!qjTjgGN9I{wUs*w@Z-4@Z<_akqm# ztu*1qn11Hj#R{q=Y`HDjMimybvg}cNiy~`&d{l$W*4&g-h#M*{&(~WM7VAl{Noi?s z46Q)n8Arl+D@W<=M-7O6Jm?_@Tw_Be{_xRh0qPgftf_znYp%{=Erwq{{1!BGU0;nr zb?KTuU;pDjk*`K9?7*<`E!PH`IVlOSb^`-Kr3)Q=(MFN z{Fo*IPh30msF9$RHIDI6*zsPnN4v|mWW7CZ3O2PzA}%-RRS~B59Zd1rj|-MUEza{B z1r$JVje5xh+rhMZV!3t>E(dZCL%f|Lv3-Y;tVUBFi&^y61ct3=8QwLL=l2eA18iu* zSRgoKPOD#ILZQqi0d5!=w??hJ$k-7hog)rXT7NITZqG!LLl>7fqj)^YIiJJY&zU#^ z4R~{Alaf7GFeRY#jyB z=Ej%o1vyQ%B2-#R<~O!AAw%X$Yi4pAt;5v{Ao{cnVnj)A@lwnVJF%mCYehG)d$blDp7 zM@|WEWSpL$mqdTyP1+Tdv+X(tO?t58WDP}b7Vpx|O{vguf95Sv2UA+tv*Cpolrf9< zx4RI;kA@7zwm44~eNkAi!ImjaeHBUMuK-rTHm9|}$2@_4Eim^zl)-ki?M}r(wx|0P z@xX|AhW`v;CPz>3aF**uJ2XpE;jc!T@=F$cWAWk_=Q(!6m#(?YFz$|-4zAg6smeJw zp6d@On;^Ffd$%%3+dw_<+4&c)Q=AQyKGb)Yvuu9@GN$Arg>n9Iqpn?1RO!OIiYe*x z25w1wSQKSU>K7Y+o$j2BXyk9Fq=6`X&d>F5f?$oqVzrtVO#)2$6btQ$iTYh+2)rM? z#6?Q`+_oG6&kcRSjO`r~j(0A-6W9uxyP*^}_-{K@WZ^|vXPI~pzjJwBLc#A)jc-A# zr782efrj(-QuWPa^JK52{$m59{UpexYdZC4aE*Ois>xEjH{m8|h!1K^>F$1m;7 z_v@k-i%K_pS}FLB$$`4jZGh*3WOj=X?{gqq?{B0jL{IpKp>`XkKVSL1!ggYCZ4DTk z()naTzn^2@yoE;Aw#>O5Mkn$Qj1>K#z{36fuc_AN6Et{^ZY-MT#arAI+zk_OjXc{XplXih$^~H*fq#pI)Qryi@ifOIUN~S(0)G zbg_F*1jWI@pooR-XTP^s7ZF0ZaGztFd3@!ln{@Uu!*QQH4}`?!u;8wK2R}7YOgw{_JZzaAdnp10EwutX%ho( zpAw8xV^WXg^T1u%*hKk6OncfSs4fdO&Fd8bt|=3|NdNy>d+WHUw(oyf5CajBR*;Zz zfB{B8L>iP6vtKi}8$`|mL4aL(Rq zuYRw!w{SI6&U|4PI-n`~@J+LjjQf5ug*m-cKct7($p`AP=avz)E~zW<;0H)`LMpA< zQMAIL$e909)C>LzQb!yWyWK?qth1)Pz90tk*^5wc3!)^X%D2@|?*kNa2iO6{Saajo z1@^+3aSUyt&7k}%=l+k^j+_)J)mJLbg3e8o51K;l3uMmfFP#PCa*bD%nJ(QM(UM?O zB5P*WFiVYJ+?hUT`)VMRS!udJi_Q6%#v(r`u;TKgQ{(^^lkE51{>-Sf8*LIA3(f9v zb|Z_RM&$T{*_~Z)cb8AsJySTrq2$`VE=&A(i?c&|GVI}Y=_t)mLe|>i`Bj|Z<#Nm| z%%y!2AXdMfE){F`PHOy8Rz21<*=D#m>D;)NYI;O}r~8ya>SQrOVuIUZejv4z4{AqJ2kX|_ii{m-*WDx7VYuMP-?6+_CU`z>>m@J!%Wdz~|==mrN;c2KEfw5Px9C*P5R3Jdb$aEnRP zsm{OM@eMUIdlk2bqOE}Ny(UbBK8&mPo3{?iEReNOU|)a`J-&@v~*MQFHC z;1B9mbX!@q_0Z%gX`Ef;`8IA5zSkgS_TIwfQg#nosOBgdekKnuf3oh#T5euyVQaR6 zj!KGA6ui>9vc|cYl)fPGrYx+5fuc;RB^%t62?Yuss4LA~;zw(LVX*HJgBv3)FqiNO zGx}dPrYei73HPMaFdN7!CcYw3xkSgwUMD84V!l}E-fqBF(Ws8|i%RbBywGA6>lrHP zVSx5Nvj_afD;ijCc#L4j=UZ-K;a;x8jqr5J{2b}| zRq+9(qXmvBpe(0f7YD=Kwl3Oa2R10B(4%mSXOSE=H+O5>m#4lPQn^K5;ueAgv8iA<>GB!`i>R0JIWy&ZHue@3jm?G0d9FQILvs) zxxo%cdO^1_)QV~qo?&u3YR8sW-04$_-s=a;ur8>{7qE`G%90mbZ-)ZT%NVX;WjiI^ zk+-_41AOR!@JLky08@!B$;6bVV6$2&zNjjX+J0MrVl2x3sKNb5-Bb@Ir*WK*h9adC zm#Z|Y=GftT8!A%Y!S55O+IO2DdI8oGyV#`kVel*g`;7<6+yO`-m218Lm&51V8~d?W zs3!|x;1RgeUnD+)Mj)zj!#2av9`TIV2hGUi-cAZVqs_@EJ zpd1L7YX$^>;W>=}pd*$IpY|H?h1srW#ntk`lUW9*t9o$^7dn>Gr3@X~$ol6gfSiKE z*e(h~kJJmt^32qJXD(0C-SfsLh5zqG(U1T#8H}(`%8bA8-fy%2{%1$%1)bbu#g;(` zSNL_FZ|0qsmL@%?FD~dkU&}Y}N+B+NoHtg8qa|IUBW5(mI!4Sf@C9KmIDIgjJ{82S zt>{XXXWwTKH2?%f+#>q6T@X-!)6IeOAB+l=4!M+Dz;ST-ft@b z{OOj1)GSKXFSGu8*WMW@Y9ohd)qz~Q*J6@illS+s4GAkyT-tL0F=xb|ZpBV)r)gJC zb$}++PjJ*SBRHt z(U@M~M8SeXVzV;Jb{^)?E+)O|H?4-%z4wJ#H^cKUaKc_i9WdNIqDLp$1g-kA%T!9@ zIM={;H8?DWB+}X!j~#la$!wDLhO0pJ9kg&=2SlE%%N9NQg3Ie^<%LDwv-N5YA!57D^O8EvFC~2xNoKbj${gz7y?=4)-ERyRO?o#kA*Zv)Ou2)^u9jdg)8# zl%K8ZEB8gJfCCdS-7^a2b)7HmgPm;osdV9j2AWi%gw?5*dZ>|UEN{4#7GoT}(*y%` z*bpn}UGrY7I_9brv%cPf%d8IpbXLxj_QMM~K?AM$sqH&3J&s~;wJALYC$>yp+6MwN zHDsRskiuj(difNM)C{P}KzES(YQm1~uGjqF#wcad+KXc^m14!NxpkwuB9Bpvw`G0% zB|IXL{GQZU;$tJ#`cV~UNWh5j^DA^CQ;L?iPnwRi1mHPeAGSwJi8OVblls#SQM~j3 z%;= zeB|CETp&k@6Yo+`o3Ge`-_Tk zs*;HU90%GPrOiDy43Ls*fN#0e0H*=+)||13VXD-Y=Nt8PbDCN~#%sU0V1G@?8Vsap z%Dya4=$w|0lS=%%u-w)*SG#Sq{U~tayPta9>i#f2`{sheHf2fW>}O*$4)KPbndj5 z5QmG*%z2t$6#pYkNP)az;X9i+7Jh5M1CxdY_$6DN3^h)Org) z16rS~)0^GQ4V=<3&n9X;nwvD$+Zm+FG-Q*NF059{3e(JYan^l2pVs^bYhrW9-T0gzrx@mE6a9aC}%J8@wC|XQ0GrxwTry@AMp({o}R_!7BqjG+v z9!IA4%%mC?fOs|?4Jgx0cMrCwII>*Dm-N%v3C&zEyvxPt$hW z<>*jZcoaYI^>Hl`hy6DZ+pJl_OR#))VUf#u3FbdL5I>X{9e85OEXoad6h{6 z(JieOw_aHD+wS2KX(WbzPi+IPx$oCA;+j?#?p)Vz?5ZH6{}PB$>atW8E2vcZ5_Y3G z`z6-~U~*}tpJJY+PqvH3I2h}nuVY)g+zEc;smLQL$Wnxuek5KuTv4DM8XNZZ0gw*K z(>g1AW{Aw5<&R+Z{*gi(cTsdZMgbp>x0Y?;D=BPJ9WaS@XhEAn84jbkOYcz>JnM4& zWKCX$1mzZElnkKXPH)$I%9y?CKuN9T{08e?a7qvv_u#jA?ccYWLHax*$P?f?ZJ)27 zp@IIGm*m1sVf8SJU@wbqhMu-oR z%;NkfljtT1in~d1POrxmwpq7) z@5JpeLj%lEd28Y6-L3%+FvN>{0X3`uXMI0-_pHbs_>Bnf?!{%wqi3mJ78`j9+oaU0 zcF&EX7!^5a74=>M7+T20+RdPY8K%^cuYy}Bp-I^N%U3#IRH~0;MSTfUw)O}2N`GS0 ziMsJA)!`Z_oQnp?`3~4==Xwie-ik5}@x* zoQ_ato%}X)p@|;=Eq$#JY#IKx@CP~J10$^fG(h4{@@xh^31`|-w61WXlb1nJSAT^w zg=9NF>OORqr>Ii^$~<5s#q{DWW?%)KPv9Io7lC=!haU|OM66k*mJC3807EelzYo3x z6f;Gd4ROw5VhGBG@_bC?p%Sjn%h81idyRUq2;{L+EVKuYbSYiV= zI$ar56rIfV4d5%hot<*%vyKhs(v9t7)8h?nJkAW#3lp_&gHQq0eM(wHNL6EXA*)}} zbHKH2;t8Ok+Klst;+{ta%rq-y2zDP~!i*Qsrz=>gIYOs?;#McTg3rhuRxiCgo%Z=h zf@a>L0(~B0rO=6vgKr6hY3Xz%{{r;&UhS&@`W+&nE@_pz2~cy-62C>G8K8xtnyZl( zW+^@qx&CFhcX=jbsu|(*>00R5BZO^+^8^{VXKUFZ>xRI!fIShv@QuM@44c*3!6vhP z&nxQPpDC%WYnQp^KVt%kHu*`S<^ExwM~kMRpOVJrAs#V4@=XSS6~fnYl;ysD$l{qf zL46-^B;t}$6|+=za@7aAj2#wscGQ+8;G`HIUa>J~yV|^qS`z-PnF>%pGs-BHx>D@} zf7BB77Y=Z%fCdCE8Cgr{l226(R*2WS7Vav(q`80)oEamI5ZmF7@29@E}OTL%b6JLUZUUrVS|3=qF6KN4;wO zA$EaSnTmc_!>Jfd3 zTuZis$&yKD)1J2m6ZM1L2ESy}=O+E3nsbURO<3K(rV>vG1`tOYAkT7<&t_J%6aP3?Mq!vcwJhamcbRVkJH3(5HGk>M*b>UCLy zF@n?T)RKiE^6Eg;6&eTKVk)NHmzQtx^`?r~+%iaajEIfX0nJhojLzO_KPE$XeA2kx z{5}NWwu2dLTdp!0ApIpj*5pcMJ!69tH76;#md-ls3{ya7h%Kw{{6|-ux^kOBpJ^qo zr#bE0|6>v3%q{>~lgzo>!Q3Fv*8R|0`85G^OQ<>MZy*gQr{Q9wkAhI?iyX#+!-hnW zMR1xqt(fflFon>L+kJZD`Xa{)J}$cN7h5Ee*cocQmqh-IEhp0*HugGJX=)2W1NE<^ z1m#jQ*UM$Sr{>Jr28UYn&*6Ji?N9=NjS7`aCJqcI=xFRw=s9b62IIQ-!x3+PV;(71 zyz;U~E|$}+yWJah<6(=9V#-CLm0Zy32V2C=PMCkC+O#vRDa3H8H zq$jm~$~Ms#sLt7_gicw{pg#Dw1I9a0ff!o#d4u#HIM;|6#rZ2~!kfL4JH`JuOiVFZ zIw8l7$#)|q83;FWB61{SYu}7&Wo-kX7!e}SJn|Kgd}l7tJ~qEon{`ynV)1lYJh?B~ zo7#G3Q|BtTvLFkWbFG9I{?M2&E`MA;y0WE)p&HISGD_yCe16AjPm`gW^N>S@{P!&1 zh?y%*9tAj`{)zGoUVF%9*m^EFa4?RjnDe z7dR{j;&AK`wl({hh4vUiGSK2wg0dVmvj$n=M}Ps&LftywtCpjb**!7XRwmqwJTb$X z4hc~73Rr5TmeJDxc1loODpEzuEpzq#=)gH`-S|KR9S;w$lH#QEftpLBbnbEs0sKKG=vJ!$8%^Jicw5M5kSSgp~DRjaT ze~-I5J^}s1v){{NJw5fuU)sUNK}wIUH0uk#Pg1B91`8%{B1c zJ)jdxr~09sD|I@ng(%><)eY|%%jnL!^NCz>icISY6#qJW`2nh2Zh4cWBlda8q`BnUacm9Ve9oN}IZZ8BKvtSVdXY*$~F; z^QV&yI^2Pt*{E}4cYl|V6C6}f{aj_gbUFzrR0&dLZO4VMYOBm9yxxIvYrfcbF8)yU z!MU0e=)9rYoXyAFGCp|zW#`*L;*n*ifU~*`7$3L%J=9?Ss(EB#!C^TyppJm<=^8k) zQ*YIuUB>73zGql)ft(SEgNha{uW1Tkqo^9Omcc!*&VcHF2I3(!Kyuqt`^*6PhX5B5 z-pvfRmqaFUsqxgmdKL@ufr)8=${rI-lX9uWMO>i2=Lr%CtL*bQkWMtKv@Dq}Q7LZ2 zpEqLb(;3WrY`!Nwwl%-04*s~!Qu2wUhqJOSvJ0;FnF&` z_TmJ=e&=B6iWPUr1KF)m-g3X)h8T*ih}sujI0{Zc5S@_j znvo})vrA;skI%nWyfJ929`9#u#gODn1oYT;?)wu8RBjxVdjp9>LAZ>9jcnk>b|WPv z&&my;7xAcA{_Vb((FdrKR%JV5kS2FKe&QL&aQvK5GV+XovYV8@xV67#XyURjWEDiv z1=dQ7T`pA{_|_+`SI=f8;JfrDMHB4C-EOrhma9cxXuG)RrBCF5V1lFJ?JHYO-V&>@ z^&af5t=?A|Bc94Bdko4>Qmmu^(hrg8QBHl1{!fDpaWIxm-sdXbmj;pq4c(C4Roo@z zOQofRooit!vll?Gh%q=0;)8nt&kh1Ov($HwB>En6_w;=^>y(k7 z(`&`0IVj$5jrNxLv0gv(U?ItDy&jY&2iB>UV7UnG260BU7Zq~c zIeiIm+Py7$WOg3|{V#K8)*<|0Jhv|7P8cIT*ne244$>b`T;#(4ODN7;F)!aS0z0Im zAUf$lPvJ`)p(P0mP=WEkKyi|UR^t?67DqJIN;_9P>0qPb zOG91EHe&a_AgROQr4u*z+_XIrrkrg{Q$n^01d@x%on4gG7TB8V1v zhB{89eW1MN0UYXZSudgO)|Psq?Gi%*j*!=g)2CEuF1XjD)Yvmg0q6CosXLJ4OU03U zaUs>CrRrhp@}q^%*5-u8Esd%>=8CSePhWSq@#5rZA$tJRB#M<3JgYXUWnHFN5{x-r z-n3R-il)d&kEA2LUibV$=23 zXO(vSlz9?HWaDv2?BUxRKwEW|(E6J?AOW|&cB=(FiOrRM6N?NSl-BBLj$q*2gXnCG ze8Kf^D}_>X$lH%aJ}))SldDTwSV0&(Y%Avu=Qi9JuqYjS#QGOL*OnPy6&4vZvyKSu zS-Xh|bW;uuGB_p>EGciAQfpLZeW;~ybf}Ci+lCe13cV2mR1SDn9ngB0Dp}b7{`0MTJjoNHIK?;_|B>ZNp7TkBT8OuT!aH(I&<*r({V=5cS zCgkS^e9jLC@L$I508>O3g1pndIw(p_L>%a#>B~zYGMnt}uy5Vl#3V7fXRef(?mF>6 z4Cn-=XIb1R43g=)QMoOu#Cd<4P+(QZw)|QrLc{8hX^k+9 z9aWwSEI;}3EV?XG2553iQNL}$T@ClP*ol`^zyw-Bjn2T?2o?SXXG0Iw`$lY&bmROT z4?vB1Kx=^JtZ?g1W-%ZIGMIO~ruj2xDV7oDYry@X%i2!f!hNmMq={RmxUC9`~k%L#`$eL=Nh|C)Jl0Rar)3yoksr14<~qJ;~tP*a2O<+cKB1a`7FBs^P8I@@*wq4{>W?p z{PusnnlVNm5mvz2w)yMkZ6}8M-&4w_2SO6?OJw5xoRXGLLy@PUS5|Kl4 zjG^;qbj^Jyn?nLp;#PC4TK|=&S_zN^yvAGF7yp9^J02kuA}MTP&HqYtBLNig9Mew& zk;qES85h*3mRZL_iDehUdi;M)MK@?p%qmbrruH_3vZ4>CFO2)Jx3%QW9+XE zDW@k8Cj~nb2$Oesp_T5*j-3b=q1K|PM0CyBG?8l~Ur|8{bgH|OW zKg)q(kUXNOJ!hjd&`2x!qRG;9ncC@o`C|6eM_{yW1Q@rtDuJCQv*f&EKNEoxoCih* z_XB2GUFl$1)V+qsF{Aqybz7-tY?lGlOW^}!m>=a-{C8UZWZUO_ji|GN6S`V=56nU> zWfj-7JL^rN4$|D8A?V^E8fWRnvUkr%U-h+hoptXNq2mPIie1?3D)m*0Lmp(^9LdJd zRa;*+tLJ-t){sH2_Y$SjpF;H?U*+pWHPaG$HAgFixXN*cIE(;ip@*7VQkGu*7p?S*(t%rk!v>H5WN;FX2?m)ApzX6RF@{vxSQPtAabJwDH_}(-IzThRD{= zo>4Pn&5LAd1ox!ZDcLxKI5c8$lM3=_a9NxOF{kCr%^aC_WUEO3t!DiU*+7yo`AR|3 z_r?8^!h6On50Pz0F54mC|7n(bZc|D~wrlLadSdcFrcj2o?jtKHmSX=4qudjTV@J-i z>y~l-iM9SQjS(q8x|VFEN-6%EgHh0;sHK6!>oZdCVMOtt{x}~SQXvzM$}|3{0O-fk zxJ_LNkwwaVtAfF;|KB7VK&|^uvi!iCXLaxjAQ|J^qsgEjU;h*58hP?DeJ2^t_k&jDlw{B^(SeWqake`fnf}AKUc~L9!%M=3@5n1Y=A0 zZ^7(B{)Gj^3R*bwUKbNj37&>pO-{lX#_Q@t&yYzi^e=L2LPjTI_m$%TPAoeyFH7h4 zxB23l2@?gi;L8+rbV0E2IKrN0eE!;VFA@@z4!v(GUK6`&hv_k^iH(nqP1&SyJ&YOy z6)KHQWk|L7zIKG=c$=HEvZuGVWuql>iN&gI{cvj}XOGVMOZOLUZo`bS+viX&^PNMz z{+w^CLtgaz)j2fsA3p+@9dFPW$y8^XRI_1{tB-P9qqikCYsa=fk)C|$@8@UmQC+;D zg^K%AL5trOX~mg8od5gnzrM2u6~F%1Uyv_|Ye&DTWFkU1`bu?j!c;0^P5RsZ<4f0{ zpL!Cgx2+ZuvJkhYdxB>ng>HQP^{GdY6Zw4k9UnF22LfPnzuxaiMe}m?zG5LY+a1rJ z|C0hgA44A;Pk)~Tbv@R3Zhb&a-p&zf8zGDF3&sB?<_&)|&$Ezke8AB(CK9`_b7V zX6~N)XL5>_gYU0)e@%1h=J&PcdwziqT^z54ro2|d)2@3fEnJnmT~fE>*)vwH||P!qKP zIEHlZ^69(Zd5)Y4czM9Z$EOyBFBW%ZN)1@JCbk^z$4vsxe(pd8+V5XDgKz&HLI{#9 z)O-enzm19foZI`$*2@fZh4xZBa)&x9iw3->9yvv;f97@g2zc+I^MMif@6_PaMn@g7 zL$aa@D)`!|3H?HuU&ifr2Vjmav}k6m&Oji-fdr^bI$mb;GChFe`sAw)bq|gicwck zE>D2o`Mf8k9-*n!&|!)q*QP|vczS}A5z1%>1fdn&Q;W3rVu4uyJ8$rj-vMMU;_|c0 z8@z5x=@@LH0zrfs+P^G!4H{By_Lvk}sow4w3jxJCrR>!y?ux$ocP>87Nqk-aX=UYC z_~WY47viTZx)2(1o4mxW;47T5UY_J4c~_YLs=J8gL>sVDxcQt0b5 z*6A)$wrxdRV@x#iyPB5^9FRhJ6Cx7j^6!K{LD}z@Zh4e$P)Bl1)Bp&#kZid@|(G^XRiX2hOIY;`SHjz0D%U6ao$uq4q3K7;#h)_8mcyK~oO7}ut$|Bni~&`Uk=v7n6b%NW_a_>a zCL->MYq{RfHq9tvZJgc{=u+q+OBx8u6|dAegqcrF7Ns!naYfy!lvY((E7?}wG|cB6 zcN(dD$f311-Kng0b6Uo!!h0u+&Rh4U^QvL$GHsc?R^Q@$IeUq3vsmP=s+9=AW2C~5 z8$xOhlGjsj<+2qI4^zg9Q)dvKBcoWG{Qu6Y-*xnT^a_#P${oGCbj1~8(S0^HAz{TW zy>#2VX<5@6_L@XR4^Y}&Jyf!(5ZK;^l=okwIn3U8X72lqPrL6rO<+y^U|+xnPviRb zU0@KHsO7Y|AS%N$wa2azu{U?#7d)BwY>tNJEs{6r%ZB9+Y>7?oXWbJrY0%7Vsgf~Q zv<-truaVHyv8`BAo2l7Dg5IyS7D<5-N>q>uBmE(;fO6lQYO4o?gPG=37 zzHJR0vt6lOfPUvtU*Yt-%&aOc*li3)CD+ku(OVnJ|7Co2h57n+#$V1KrZ!~VUbumJ zohI~o+rD$Ym6n?8rTX8^_c_!^-!$I2aS+3XGwzzj9Y^yrmQMz7| zEGFyLmVSA8-u(`LoHtsSjE0DqOIihQbfq@-%X4UhR4YW2SQh5oft|Se0$R7=nH!VI z9Xu%>u_mO`X86aMO;2tp_K8PMCv-DH$r zjlj5Bcty&r<8wlT`OA?a>xPC@rTRy`D+%qy%MHQHhxHzc4fiFEbtqg-+@OKFa*vet zVir2yB%ludL~F7bIf zrctrj{HemSdc)nZagAC-g`(VqV*QO|;%yrtEkr7YHCLEQob1L-q6^>-xj(iyosf%K#hCuLNuAjM+tkO=Es!baSbsu=iL}=Z=QQzAzAQ zs^keIOV-yr(AjzxB!~1VD%B5k4ccSS#LgdE%qn$d$ZD=vRm@QN@1oB)P%*cKF9*6K za`2N7&~nbMH;VQBiv(es#B88UT_HGI)t#M3{6j_PD~V=P%Y_RW0v3Iw&l22Pwf`^CG+VwvED|ND-{|J z*&772FN4U=nawA)d<;6k`rC8iBSm$j>Aqvn9CQhl$xV{rcCC%i_KtSXhX%Y@!ksEI zcQ(0x3=y66OkW<#Me=MKGnz$k3}f9?V`)Qu8~IYI3EB19{IBX4Dwd6dl+t!-OWLA= z(BjEm{4_ACYMeO}`O;=FtzF7*Jbmupp0 zgPLpM(Be&W+M?~x-`JpfFmF`2Pj%`9gi0%rG(cya5TC`oFGtC(PxQ!ge9N5PS$6G3 zXh#Ld?od#%eiXtdUr8e$mva;%YFicY2vA?0Su&-7@(Ou2x)T+!7q|W5?lQu-DrW(5 zC`^Q}+e6H|6Vd#YbP? z7s1wQE)MPusCfR^+1PMaP1fl!G}F?f!0SA2kHcYpM$9K!d9AT$y$e-2KJ$*85}&`@ zo6Ntm4vutK`;K%MPdkeL_<1wVsOP_!T?8pWB3y=t2k7Nz4?le|z(}QYWu+T0PHYK~ z40eIU+>W8*W*A6;E`~j0|HtD2RAMsD3aY#6d3%%^$u2&PB+*m^Y}HTn7!9?ddMxN7#$W2_Ghh! z7Iu6lOK8U{RtszT+;<`zZbuMG*G^e|)+M*mEc0uxzkmlZeNvPko6s;}GiUpG!=cXE z*G$i4{o&kRxmrTax3$A}=j>55pHw=S7g%K-+7pptyg9Tluo)hpIkNELrKr_OiT&6y+;?s(hx9t5rTA89v6eaH0OjNx>3+r2QtIy0tHmBe6!TJ$oYf84b>Di8hW#{wN^747#d}6yJLE4Hh{Xk$el}Zrv2Z=& z1{%>Ly$a0{t&9sVT$LQ05}jh5GK*-Y^`tobnl%GxY@bpfD%X+;4BhK0AABoHTd?w~ zCjUIq*z=6I0*9Lwg@@)o&Vh777UKTbV&B?cqla@?V?Htm(OwOURtIA=TDP2S6Vy%C zEE8cG>1!{aCCeOOdOF*6Led?)9Nk=e;)L^&%vY1ZQk_2LJ~jQZsC~b z62BCeXEU$~$VcVg{+4oJZIW`ES5e`|;V);a&{H&%*ur`Ie9aJ|vw<{P5^Q{-lw zFSGr}v1ZCx!iTj}o+VQCZ8S$EV=hDDM!|Tia`P-h>=%J}Kya*+>q-x}A`z28X6 z^=u;OMyL)o_7e4m;&K8-534Z>eFRR&Q4 z9mF?Ja>et$^*MTRS0MP^?txbD#@*Q)LN25ncrG#4&nD&z(@OEUVcv7=%Kp*JZPDTA za6&ovEpeeEA`T+=!{q(Uu1?G@$1fW75fv<*yXPkzRXNn7!bOXly-~Ef$g{(Tb)C%% znsu!3`=tkyaxN7*3peQ&Q!_J!uJ{`E+3jV~B|>bC?iQ5gg2(rG;0;vHfD?GytNbOL zYmsd7Xk0W zI??QvStnXnt%pk~Q+uNfmGWy$YFbg0_AVnAy0)p0Q#8Zu{W@G^x5#D55Z&{fw$dKR%ljV{zl0z1$rhx-GLtT%x$o1E$4Z%)fJ)Zd77l%{K$tbOyyz$zS z*+o|OlN9gAU0wEJ*2vun4YVleRLVdlGv&SN#L#HP85`=vU8=AbV-4fZd89U#w^YXV zJ*=C3JbQbVZWb%3K?r`S6s$(uYcc&gKIEAT9=jrx{oSD0&Az=X$kAJu%!1N__=$FT z*RWyP@M)>tsQGn<0(`vH!h!LQY3U~x;h*~ETU-*JPiN>{SN3_%`yAjuC+4E}HldFP zXu2n5Dik)PJg-Jy;6d0i64sJ%X|1)|l9sDQ9^4LKMugN&VMuYp!kG6394w_jo3-hN zPk!Fuqwu%jQ;oEs;YxSfRlJEi%PX-%RO&>p<0!N+^TF<>ZgHBq|0dE%*(;#J5o3XU zl5+Kl>jq1KPWnD#uFtl^w{|Yhaj%h(?|S=gI?j@&Bc5cn`@-yZ^C`<$Th(A*%zL*a zoO3Xzvi06h5KF}>=x;8j<@WQKFH5X~FJ$}O%OyrHDtwoRA6xT_eR z0yS^G-Gw@Pb@nMOYpUc@-4mfYzwm2eW4mv19C3{a8y8oGBfFECThbgd|D1%zZmm7k*b*)NAlCVUl#g;?YTDawlu$M-NwsRo*LL`ZO z#)iBeum>8e`xIJqbM|q^#MDk=tAIHf;J0y>8RQuhA<&RQX zkE_ORj&ELR0YXtk-vOY421%(g#g4Kz480en8>by8IccE}w4M>92}*k~sy`kQzjaMU zx7Q?}A0h98(DO-olCLp3OrHlYQlV%cnppR3HKNb5Z50%3uD={_vRY}*EF~5igYxD9E=qk* z%;fqgN*$Z(_l9Tg_!KHcHn>kl>$>LH&F@g5n=Yr*R351))K8nkzuQY6jZ+z})7mJG z4;Anw>D1fH_bg^Ly7=0n!WFBq#P&tu^!tYCu~|Mm)iQiljP~Y&ocGgOEJC@AhpH{u zqCsP&BF7+m9D8td*40!|wW6l}Wb19pBErUJ@Q^o+l1;XG6Jq^p;_#X3jGiW#(rYkP zJGXs>*c5a9b6c$mCMc76*JM`^PYSoQW)o*>E0+IpNI~(&au588lt)8RNj9rNH@2C;MGp0Cx>i9c6b{mTWQvM;^({|J}r+-AJaR7OS zdv`|0p=RC-n*JU^tS3V0{#!4Qciw0loxZrm9T(?u+f)=s8bQZ@IGC-Um)l!IW_H(bMY0||fZ1wbd06I=&*=9rPu zml=5+nZ|3~ib-kaZpyx1MiQf3j;l}R0ekN?HflQx6Rs2Y3}XVkfX{z<0fbavN1|6) ztwxe-S)=u5@(P$6CLYg~SIavoHm%pyP8SeLDyNDv9{b=8xmhmYbCzHC&&~7AxHWH; zC{wPwTN%;HR;oGdF$ua02NF~>;(tlsuK!*3lNs{AVT`}C>3C8#QdFAIxsBV(b;a*0 zxN8!)dKb`v95r7$nsQnf(^7GI(lmeV)Eg%knYCz|BZ_QFGY|>YBK@nP3JOF($^}WO z*PD7)LrYz6L>{&u+lIVwb>LC8m;9!d1Zzf9n7BspWjYJx1qOU$b($iC_Q$3BnH(VxB zrl_jAHdO2vUQq)n@CRF_Z?MePJG^i%@A))fzzXNKE0=LtD!($ zCwxV8DZ@8dIF$K%h@|t^U4Pw?cTag$oSSJ^BnTQyHJAdISKT2n1`iT$9I0oCT;% zdyQ;uRrYI`%5(iql@TOa{k=iQ+Lqfow7O(B0mi{a6-n!OW`?)%kOA@FSeQd$&X%~< zKE$x%wUgID)|KsUu@axEiPBv0!o;DzZOI+!i$C!TAjbUFfy74Jl1bJI<1(I%7Fvk& zpK})?BcVs)WPEXQfb<)C52rL8`U-mTHfve%QYIAF z3SB$>4B{xn^Hp{89}>>?WVUE^CDoE4Eq#hi(> z#7kfvx0o%`+2Y43*FM6zVg`8g`QB+jF8klP;ZkC_DWMQIg4KhW|8X-LB(~oyQayOl zq{KpYEo%ME6n=3w{WALuRv<-|Pu`GNdAW@_?EP|uJ`hIB>VBRc;I=1DcZ$&j)tfx= ztBY`wcg_LVZYCzU+baS z(aTTi*EZ5ZpOJ>HswfbQYR-xS{^a|J_hkR>75H9$lHwkm)OWAxuG_)82);|vImloe zC^3!O@8U2c->qvz4A3 z;uDUpA1(T{l8!;n4S_ZHWp*mamjIxlW`A~CcVes9^f(9AQRo#HTU+qA=vJn~oUY|y z7qI?R-r{_-PfPq_MwcT0@CUgh_CScwXyWHL)DU@evnUGJ%OqD^X}$b~th3rNw@U%T zGXof&Jis|zuBs68ntrhNc7WgH9#&u9Ua^>KSPC?9clkUd;3$;6{~;hj_8>`^s%!b= zmb_b|86J_MrmJYbupI0$X3X)izo1O)ipmg9qglDRScozN=?&IOxn4)yc?Iq_m&A*r zLlmOBvh=-{8T;h5e`7NE`q!ldHR2W$UUB&~^+=u^npw%?hUc_Ol-Nyi8I%to^) z6%TZ}fr_cwhIaN64)qaosZn|^+-Z4ncez2 zki@FsQp)}D{!*6LmxS-UVrwM?UF(B$PbN?*m4mk$R&O@6+HXEBEOT72z$YUmF(@%{rRl6ohx~st^lHG2Y&Txar zaLZ7!Y|dVaqhP#4msl!_YiG;aMAyQMfAPgYtjg+GJv%|l!rhGyj$jaEeMxb5Y-8|X zPCwsY^;<^N4-|r;c8&4gtyL%jM^1g`1|jrXIheNudJZ)&)QRI>0%MALifhKRYD|75*Ccp7_)) z7Ou1x`YBJj#{0x1Q~x}S_qL0C{ucD14At&koI8+%I3mOSm|D+Fj#!5;&>H6yF%y$*!VTos^=v}G?^JkFm*Im3`uPCWZc~=FR=!azSPY!OZ zrREGVT?5~5abLnsm>@ryz>Y#VXWR!gCmgm&;Sz|?^=#bOd78<}itL9nrEm;%;`Lt}fzK9ttYuCvaUkia&p?}M+ zX=>g74dQ3rCUPz|pw8!2k}fmSEB1Htt!c1L&F&;+MZ7E4DD_vf%{pE+^UFjnsP}cr zF}ae{6*TE@99+CTS>-jd?d~@N@}MtR&(7St<&eP@-!RoZII_&HL+zt~WuLA<@0RJ* zxK!o=X4PkUv<@ldr?aRmg;4w%q$YYc`%@MMUUC7w?^C>JtF;P9j;9K&Jl#(~d!Q}b zc5mE6Uk3BJCEoNF0Hx-)UaORs%Yx~pJg_8%Nu(q6{{aR7~2RKQJTF z^E*v8H9=FsjRf@U@ffp0S*c@#*E?`jjh{FZz|Yqmr0mWbYC$ zz1#p%eyw`Yp$w*g^}d8wD)dZNNX0dr8S4wzc6g8S1aZwDBaFHt{@H&1FQh==KprYY zofVn%wWl=za50+sb$+m~I>Cf$yb)@d)72);HleGO*}p)&a)6e&@8M({{BAV^{VXE- zRXkQ-drBJ>)8m}K*l(nO=zTtPL_pt`&1;1Pv>f+TLMl5IfB9xP#GHaaGzL(90!isg89g^W9v~nkf>ne}CEN)x?rQnr+N8uH~| zIl|>GRY*cye0qfx{gW1yQ5id@S+5^H86swnX}HYYebub+1!$-*5FH@44Y7-*o=kpt ztaB~EYxhnF>T_|9=EcmNAhY8q-<>G~O*C`QV^a3{cL$$nHdYqIw|M90g@MM2y# z8~GL4s5t?hfU~erEN^Vp0qB4z^l-Nx!zqL z8*MZM7!o5?K8I6~a0N%Wd?F_fgx*cX7(u&aI&W#(?Ap&889Cjhd=?XTGO~dx`h*vr z|Gn-%x3&1&IPv%M6nuDf$O#TDu1SG_`rI#yftGshhbMe^+DkD*AB}7;!Y@U2q$-E; z$QS838;bQNXfz(o47o1=?HI~VHEwCO`1QP|&fwtjFkLWuXGV{A%@xOq$C{1P*!t(~ zP3orAnG~m46vi~gSZfc+gd7boZz7?fJ&U5_xUW!&zU+%>Q7ax7+YLdCW!{e=d&f^9 zF>R>+@~H=YAJ6u0jfQ!|)L*JOI=_3D(WUXMy=KH6^el7fmV5x}iC6Oe8dDi~;*I(b zh}*-(S#|i$L&7Hv-xIwiU%xx$$8!VZ*m&j-TZoBSy?$m`zLR=Mg!^0^Xy3N`4Kw|8 z!^(U*nu>7t37+AB2^C9|nW>=AqI01WfYk}B#9a$;36LI$}!#pTCnW_O-= z$^YpjX%N`dOopa^!c17eodG31I=J)1s!`foG(y__G%xnIUZpN3`fw6zI?$ zc&Jlmzi1{0e|!KJyNrCGqS;{VpU~y*;tsVw8ymNiDLYVt%gbtq)A(+fCPCfb%G3>} z@a1;GreuF&M6@b|@iUxyAro>#UjC0GR_f>3U-;+^^aY<7TRDj>g5=w}jY^ zfNKhGZQVl&!=4}^j`-;zAvmVT2r^qaz;;m1Q`|-))p>?z=}@`Rw0>p_AZ}s`q zc5K~ewbUW7tX?>rZ>)m(cofx-PR>T`0T!cJEaX+<^s|E~a8F)!3UHjOqo2U=z>du5 zz(RViDi0X6TA~BFD_vD2B*@N5M{epQ;GxOLOM7gKe6GJo%?-7GF6D zRD57nny^KY{GbN2;=?aj$O~ z$`RGwuWWvV&?U!zTSVQ3?B|xnY*Wy*Jy#Cv!pW>I+3Uwg23JU8{(SGdiNvccHSPxutnj%+1K1^IYG*=Nqz)+W0(Iu^3}&bCz_gbXP5Y z&qB8PD^z*J;tm~M83W{y_soF_RgQL3b(XCcWLq0Lp&Ap9TA0cx43wK8#F~ZsAF&ol zcAkh(e>+Tpmc<~g^G02GkJsxh-4kc-9T}DL$BgKBaBaF}>j=zknbqsb8jEG$F*Rl= z+?eOGzll+l#4YsbjL@o8Z0rjm2Qig6mzgK3jiOu2UR_alf=gbO?a1`d={RY&qTPl_T)T6y(lu7SqMTj=j!kLE73M1xvIXAAZkVZVW?^zbed_P_ChDP!Ak`zDmCQ16k!BiGGjC&bG_tlx`n+@_C zn8$KEewwhQ4tb`yz*zr~?vifo{PVKi%5LnB5rQc2jKQ&8$KpGp1cxXz0sDuDa@rE~ zH~sr!Gqr?hLxh>#2{F~&?#-1@305_P2hRI{WR)x}sYtGpWQD+ZasI2rSdR~#hd?|i zH--g5y$3ihR}Zj@J>Cy?6^IB_+M!V6E0fXdz?btV<<4*Ib1&mROk4XgdhJvd zIcSxxV&Mqbvmp15m!AJMXlJ!tb@<9lefx##FRz$?1#x#-E(mB8ZuoyD)Aio0sLR{E zR+3MCqh4gzd9i^5`8|MGUJ^l5x-QpZaoY_p6X$-bkUmORufKar&|NWV`Cgb~;g&g< zJqec=e{jH zr`_UNNkXG?8t+5gwBY;k-q6cADz%o`ugOn;4+mmBO`icm{y9FLp(Kj((NZoS5CM|~XH#+fyi1M}70 zp{KHrSGeGYn;q*evbhx)Wr> zbX$%59c*`Q9rl7_9HA?v7zx7FEClG9Pl$?+Ry2Crzq*Tf2ubqHqbwiUWg^#wXutyE zk5mSkvI>WzGv1z&trY8=TF%Lrn3S#)**>V&&y&+QV0!FRTx`4((Ts4BYvY>*@H`sB`Rww?x@XLf(LoRrnD8cE+gW zqp)>-FI$8b0ix{po*;6jhV_7 zC(D+7QO(&(X$YB{1mI`OTuyxx77xB z(BtFe-DfM_h4FhAqf}hGeiH)9YCdV>)o}Di#V=pAUDPE7<`=zG-O(doh+q<4Q6Isl zx6qW9$hGLE%a>jF1$ES*-B5-&bCwOvuN4PfmLCpb_lB_acg<$JZi#npRw2pWvbT1o z)Tabjmk0?ab~)&1MHY70N=EU=Z!EFNz>xt9C$4Rn*xo2@x^Zj3mYJ6G4aWn8!f+&y z-;$U6N5R=rrf z8~XgDeMaJ|DzIPfP}eqKSRyBkf!Q@9J7jZb7Hw#0d!ch~85F^98Pd%39;bx-M=U+z zz3Lm%A*7Rmaje{zEI+@DXJ6SvPwOBjj!t}w0vRz52l3qJIvZnAx~Bm^9*?7-T!x%c zH3>B?;06cWT@)d_FO;&c{oDFsgIbnBvAJHUek|h25{$IL$kkZiPp*AI+*`b)>45l1 zRaZ_--M47f?V#@P#;g4YhaVTb_dmb~>@neiTmwgvxTD~~otI1OXpzw(0uS4HARa?4 zZ$=Gsq)yBE;+10ic!JPBM-_F6)<+9{znSxk?yVPfII z+!e>a?q>#3oiF z!!#9Knr^5qFj}4G`615Iw;g5=DO!6x6bC{ri*F7{a5b&I5+|I+NVt&Hx< zJy0(Oa*3FZ%Ie~1;=DWO?jT#Lce%LeAt>&&=_gwH4{U9tr-y>2w-+U2<>YK*2=XR+ z%#Y&<+{2#H#HVIF4c&qK{-1~H(Y@4urdJ5%?ir5>Ll;Zeg052^7|owh{#hE3az6D< zCh;p&j>CLFaCD5L+G>?L(a+6BDW#@&w7%-T1<U54g%=+^L}VPv?cBe-YNgl{y6 z_xgX3qmPM06!lgNRkgQ)dLt0b47kCXa&OmUV4hC2yG32%xWA)C*5bYQNJb5g%Z;-8f(v`D15f^^}|J7pD9> zI>q58jbmYWfyzS!k5|i4TLHn>`WT?m4<+r5wsQ$3{Kij&7?VqS7TuJ2M0nZLU`<%H zk>_BCRd&=Vt9Ibz>vFg=tL@g^?vor9(&7 zNfnM7WPhZKTRy1r_yjV2g9ba%t4?uclnlrC!tK2abb@{koghYq%?Oz+AB$ zeKt(rN#1aSE>3PzsQmTLYs+4*lQ8DufY>&KdgbXod^K#8U%OL_Hh`&jd0HLjKFB$v zH3WT+;XNW9bW3n=HH7sIF?|{Z+yZW-#7%3Xg%Y~LT3So_j+SP4UdhVYsb?r+3l|vg=agZ0~rfXL-&5t=B`^5ZJtP^gL-HwIV+UTnUzy zkdhpMkd3WNH5Rs|s_Ev|qxZay9(-9sg)dqy({@G4#OavxtX6T?lz#}CUhl;Yk7<=j z7Rph31z``Ne2+A&uVGagDEQQnzZ>&$JJC-) zL^so8Qv~mk#J0?Sj}kko`G-DFHfoSO{X8NtV?L=14DA->!lf4<26FAT7;dA0!4I5y zdN=QwF4D)X5U5fQo;BKBgii@{r&SY`EC8wvfUl7BcvJ~bYKGjnkFxEvc0*m-i$*!E zNh&P^v4tm1V>V2VMWW1p`1J{$Vk~`GyN68+OK2^a_MXh6ECN3kU7^!Co0d4HURS<1 z6I#Ak2TTPdZ{)t{!cHe=_l)dB1~J+#)h}l>4)?uV8;ZW^7nCFno;$~z zOb*sNNnIrN(m_s}tI(ukMsV@Sj76zIyF7r;>b?7?QvB0uPJzEGu>WMwJtJC$i@Km{ za~Kg!V2&W5e?Daj4X(-j_~8#;F0=bl2@S1ozhVxAwbC}&yY7Pd zgTk=M+(I2YXYQ?sE3f+oY#I^hK*;S>Y`p)^>_keDe&GJiZ~YR|(}87PCmu`IAwYot zPJ;z&XlVV&AtF0+fzcdAaG}e6wI$ZD6bA%!AEFa=v8fV`nP34m_81nmMrXZku@2Hd zMzxOeJ`hpceTPw!V%_qa?eBazHOd27{=~8+cJ0{mVnyy)habC16G1~oU_(2f{U(*; zz9inyGI4ym^@;^&){CJgAi=@ekL~bfOq1veLPMoPd|5{;@Na{R!Yr`?VEx`+c$`x0 z_N@QnHmF2${!o%b3ykn@%&^2os6^>bnU-oG(W5*{kX4WZA}WPyJ*4`> zNg(h;T)EnzRg!Tq2$yp*mKehjd9vYKaWv*iCH?BtazZc|Aop0XTVm#wa$HEX#htDn zwJ!Fbz1>4~q03dgP|B^NR61wl46Rxw-S>wl(8)RFwbDM@eAQ%`XL$M$ftthqmBJlX zPJ=>jT;ezFTj#8^?fQj3MXj=gxhK}vd~G@@`#uDFb7MNvg5|Vgwf$qKR`)($p zt|sQROU<;`Unu~Icpp1Pyt)3@$IkisMJ65PWQ$doZ#4EN5h)UzdSWX}t_-vq(29y6ssr`0|sF8hji~eCSe5;Eev~Z}lt{13ZI0<6aTF(9v0YHChvh+XzLMAXawnx$8@yJC=*)n+h`ER0qo~n9lf0 zzJ=YvsRO|MU=sxRyubX1oJH#Q=LLc8YKJ1DcGt{Un)yAHCWl^;L9%3mNgp~CbIn~E z0=qDzVzp86H)AQ-pEUgtdV7y=o>n$1y<@bi2ba#%%wQosZXzaqE=+XcaLS^mABoQ$`-{h;&RKcyid>T773Z}Asj(VNvi=G zmw;&yBd|M0FJBl+TmAJQUMSQPU#@N(Dt2Pxue-Cj={oVPeeH&Z$tpz+V9(qHRvFEt zj1C)~=mr}62OJcg_yPjN$Q6BoWaEeno4iAX@j5tYh+^vdZl6c0OyP0J@qGv<29$I0 z!)bUPiYS!jX9{blIn)h=!F@<(6_bgDCBakowI~OjgoYxpq96wcuqf^!$Gun z0ya}dNW4~L@d2CnzC%;D+_`$tzI$8orJw^;9{87ylg>c{Q0%poI~t`Z5b(x$Yqsg)P5+=^__I=!$;Pwun~r;Z z?@Lkd&*5^+M=A0k+)JpTdZT*9Q5BvwekYc5^9MhnOK4$wH2X;&BzToqrIm^I&&3^D zcVck)O|g@Unsvu(^cx=c&u6nTOI^TceNLjT2e)c;bER|Bb2f6mx#@qgc-)n~@nE<6 zGL!>^x2Z9ZRU*|*Eb{)=Zc<~gmPM=lA9(UNt`JxKZy@3u8;_JvMMu5+ps910xHDRX z7}uq|=1#?>a6yYPGzIJ27!uc^hrdO|y8_}cXNa+&S@P#L82ZW!4$pwLRm0pvi9^1H zDwFX`$nPEGXI9GlezDAVdHo&&ToCrrSo(RHxZdqI)!J`AhO{Akn#H#y4{25y(1mCn z9Xove2nCFfS2L?y==NqswDm4eg0(U81`i=@Tle;a3!C4px&xFdBFNI879YxDaf9J?67%m@=!34gH_Wfp zXOJKm{ni`xu(_3|I*HRttV$JrsIpSXIjA2g&vA4YFrJlW3P{Q$-_~>?WU}P3uFRR= z{YpA{LNW`ix0A*{95=LZ@8gfhkxO16`F1+x$y?gvd#SKC zDPvB{bFGm%SgfXUxHP}U!~eKc)u7f}$q^ga3RYDa{Z454sl>W3PBnJbpS#y$(xrbR z;&hEY@YC-n2)zC7C*UG5`$d~Nq}*9G<2qi}aai1Y6>|^^jZHk>s6()vtXH`3XC_|G zX%aWRb1OlduulIdAeCjCh${cw$X@|B$!d^blFUTs3#)E~H88p92g~HrNM3%VrOcFl zG?5&aPm}0QyZK4|&>;6{gAE&>r#>wjckYm3@Fn-?e?M4x2+#g^!8oWfyFd>|XUcUc6oSH4o*H0^mNx4_| zQEG*0*lBOt#7x+4D5zNf^vXTL5}|`;E7TNs+WM^_50APKg_jT8{j-Pql?`q+A|I2g z*qNk281Tyf-?bEf%w0BY!fHyg0CHp=9NRz#OPZR2v#Gg19`c)b^ z+RgvCZ3(@du;y$d!8u!luF&BTk6EQHl}80<4rpbpKULlS;Z=7gpMArO1sqcr zJaUI+F2?0rsFx?su*vf5eW{09JQxJdw zJ5q`&0)HPaMeME$9WVjtlF#4G$-gD6DJr%c`rJ*aI$HP~!diVDUupsq9_#lWE#f05Xc%%c@+7%G{12l3T`O@SXt#O#(j8-}I7^T)zk_3`J7VW+%5;Gc_%8ZZTn)_?fbO~7J zB{hU$C1I7?k*nN1JNnqKg3r0G4<$7gPOqnzX?|dlGo7=StC_2rQR~r>mdhokT(gG!W#JF-n*%(l1?`@@MgC?3(U>cCLG{s^x6EG*w0= zdAY5YGAMe<(H<)EKUDQjGlbA3MPvN&aI^64=pQ)6#3@=Io)^2&7h>$d@UthUG z%+7wHyksxN#{tpv`PY1sB~NmAxp7*)+jBw})0AYX9BcsZ?}aR+IAk^&7NM@F<16is zfHPhR=ihunhek;~z2&?&sb38v$^1=`_|@L5A+O?*qP6}9n-O&cr?7VYGed@gA!6-& z(oQuY%Ci91EE9T0^+LV#!qH~TP?6B&c}ruWxf2`?)?p|ZHaqyZifkjLWS||wQyM)^ zmXQ%Gk(oFMT%<3N+3OBfbgsr%%8H7999%dkKXS1hpZ&b-RP))MTkMRU(adBv-?>J; zcgP8(9~RCbD`}S6WL7F(Ck*i|?4lXUNIz&jCCK)kX9~~~n~-gZ4MSRkG>fAtm>H-k zzB3Oh|05T(k!=mWvQE7#{k*eX9c&zb$v7*nZusUz#%dkDDf|yS@|!*JscF7gd`W>E zhV9Kr3HcMb_d#Z6h9r&jK8>7C*I#Ev%Bdijur)$Y^CxVmESHYD8X^lar7}5wmZ1eB zKLkg|Yh=@iz3h*Dp&sfO+r6q#1;w8&^P8~OBr2^u#qP;Zg7O-FHf^0EApu>M%LIir z9VwV%!{dM~MqOU>iP`wB^%6qO9hhufs&_D4$Cs@L4f7tl&JGv1$xA18(CgF|}8+h45e(y1_RSE1VgD&qe9eZs}11jgv5gU+5By88WO z;+=O*$JQnly1F&~;~L7n4wR;pH_sj#m0!>F`U0z~EyAo2y8ShIGmYozGjRtMf+e|% zH%Ys?P2E-=+~$A18HPWBbl=p2l-T!bUOuL})yC|34$D!vH=J{Mw0YUU9|At!Oew!BEQM_q=m^*SX zWA*#X{M{K?uj-8C2Q7zZ9j$F!-NBdjGDl50-`9S0+b~T|!y;t%ZfP+3t#t=+%YQ#e z`mw-|- zOFc(km@co=sdrdH))mee(rDMMH66YgZhmW<{$aIc^GnT=>#>D|K-G*BvNvV4;Bg+8@L=XTTn za>v=|8F3Ny-j}EQX%ipYQCuaLWM2)h-j}Q)w!iP1fL;jxP#~xKug+A2brE<(&^l)( zW@+1nr8M3-PN^+DUO?Ysvkp^Kw(;Q@8Hv!4#?foHb%^)xH#@Uz7j=)8^L}u<9r||( z6~VxbWy`;YEjZhrtbdR)5-F)OgY8kS4w1Sixb)Nfa4G!7<-Zo|CxN#=9ln8=YlFaq zFR_kumk<>E5z1Y@VS@6dSJj2prB#oM)QR>BHGDA^G3?tDPZ`(wOD<~;)l_itE=jJv+7Kx*_@jLjgU%pRCQ1I z>QbYaTYf)?GagG5Uv6QnG|l|`#QDb?6I`p5zH>N5wPYt*o;Y*NFkKEI0cJtvg*AjFp;dnqs^3w(7e-VS z^FRC&ej%cWYK5+DVkyk!9w8RV@vt>P$x-$DX;epy(Q~l~y$QkDICHQd)JF(tu)jhO zA4{hf?A#fSU&dYmr_CvIzn9JGUVN`O8*E4B75YE8uZ99q@D4ZO(`zyZPV~OM{BZei zP)W!p`Ma}Zs^38QG0q)?tNl9a^DIaZ+I=Nw!qzRWM8`n zdhZYBo^j`O^sC&bZ5=g{D5S2D-w1R+3I{GE(6H{2ZQ*LcKfVBoAk?0jZR%_g!>Mwz zuj_o*O4609&+0uJMUDMm?Ru{$qZ<;=eZ7-*K`Z_%He6WJfa8$RhF5=7&Oh%n9rk5w`ODM?vif%|#h1{w{@aa~0`711Z_CAE;_JkvP-jX=bx@z zRSW%JS+?K>NM?%y?$O(Cr!T^tb0mO}YZR3G?Tn{B;c*0U5RDpE^Ke5-`PdV-blm)+ zJxc!cmx#X8GyKJbgMl7*{7z-$%Z-q0n0w}&M_9HQOur7JhjeYi&9_ah_eDl`xK?O; zW(gp|Le+cLs|O-cS@gtzi>#l^S*ll?j5dtQa%yEBql3q85c?-8a6n2{$G?92o!R<5 zk7W6f^d0xX#+|152jhaRMv*xa1H%BX@Tt;L@j!shCPA$lB6UIF<5tca_yqG@Q4DLNfpVz z9^ub?M9V}xQ{-mG6Y4JU3?-gemeBOJeG;N>3Y-M=s&PrHdP}006+a zrAlp&gz2A)tkDdFKFw`({SFbKyiiE~PkEu$C8~-wnLRaD)flf3mN^%zG1-Vri`j;< z2wja%Y`s_Xy;vtZf~<5E5+Q8H-SFsr;Q`zf73bjxGR|kQhq=h(3RK;%{i)0$e{(eb}%Q#1WsL+za9oi(1T~Q9Hkv ztnE_VJ=<{hdbpzUgK@Ne%h>;v!d>bR`9evs?=$5q{VCbP!92y7)rKr(KY6qHQ9j(4 z9=vo11OPr9h&?Lh4Q2(h?zHnZ9p)vUnMGJQCf)-CXl#!HFj1$44lNaGdl&xaq<_ZQ zyQjoPV}kqPfk3fY!5%2&lz0N3>xY&_PbZug_*7DwWC-_$U!ufUcp5ISHoR@k5)W4u z)Y45e^jCh|$vNTNW$*E9Oxo)65!Hoze?X(*t?R4?$MTEkE#|SQJI9}qtph2moB0Zz zoj(mex4p0OX7|a*H+^RjZAo6HT`}Sc1jc1P@NZb?ppl0+ePXLl;=!q}{P<>UH3oXn zpYiYV!it zi5maFJpO2xLgem!uk$~=!Q~D+&5GK{vAgy9G#q_yD5dXIdt0d|YW+^=Y1UN%#$+y6 zR?wLmW{Z6NMog%m9z!ooa0RSxTLxk zR(Y~0qkf#X>%gtpHtNnxZr*z7-&tQ^RnQYs-8) z6$Ml9NQskqOIXk&5U3iHV(I+{6 zI*e;M8X5jQ1h~mFgz(%AWlV?DWH-hn0}P8VDh@jxUJT?jlld0ftldG(x@EaXaJ8uL zd(=~IWq-Rxdv+sd=RRoYYClMJ@BZ7S%!iia;2SN*H%^pab^5(TentzUO+UyeMxHa{ z$o|<_UAP);NEFv;_{@1a z_F7aB-WzmOPtSOO!OUH(2`7sfI2MM(PtWTup1l`U#I;jTgXOQzFEx~njF4(cRRmiP zs55c>1xNd{=62q=06OjL3~ObaD9x0cM7muS_S9kL)cSaXXjmMVP!mXWHxrj!rFwOj z`~oQY?DejC&+K`Ei7|(*$*{T@5C5uc7-EtqeKjwOK zKwRiOsw97=^42a&w~{5}%{JKzjXx%L zg5C>>`%O;zw{EhDVv8R5mHQJE{KJaMHO}!0gmzJjy5Q8$E2gr@W;@s_e*G7%f9C$5 zot_}s)$1Cswe}sDhV%7WOY4hB>|%j zC8MRILwpp5rChfd<1h4`A{#T1HIZ8#4vn;+I5JTN>4i0CYcwit2giW(!!j`{^DX5~ zSCo#<5>qHxn0}|q{xJ>w+npr%_>m|PIw@<9kNHk@!w#bmou=_YpGmGNkeL&^DB~Y~>mdG7{d*AD)e*!j;@^y+ohAWq2Rg%l-gmWMOD` z$~ot`BL%3A-u&Be`|pqV!c`4s0xYzW{_0HWisnj=#+AnRSO4C5gdo_q96l^UDG)Mc z_TvIVZU)wQ=OXnK-%oD+d33l1wnQ#p#9{ zLS@*O3F;f~7|&2o1k7KjVoI}rui4bUaKXA3A*$M7+MsZZ7+Qfs^{ZWXa>~zgs8-p6 z0zu{B)7Vk<6QJ9gKiUIrj_xOrL!mlI?}Q!Zo_VK>Jro^o33&eg#IFfb6(71D*2Tr$ zK`39#oCu7aJ6Qa2`U2M^g7j&gGgLviDOnT(0!u9TO4*tf{0(q>3x7BeyC!!vtj$!H z6;6!ggy)9m6z9mRG14$vG`k@-5&9Em6OVCX=+Q9K)dmC8OvRdafjRa4(?WH4vIY|ZeopM`vJLYE)G z;fZzO2=V6`J@a4xIw~n(Z>#aC z^vKmsy5?dBt#^V9cy^Y7U{ikgPX47GdoyV}0@8Kq)O>BFJ1KHsDoQVP`+rBQ|JMx~ zBCgfHL8G4WeHxJQW{0W=+ONAWA!#~HmKQ+o&0*Vw4I)2DS$a)9(vKIP7^Y+k1mlR( zKDP9??Z6L2dnnR(#v+lP2@C3T{i}O9^F9C|2QJ6};WoKnB$F!ukjWNJ+*pOnVw-GT_6WCyA`LI zy#+-s3;oz(i)#iXH@(Z#4yrg8UzAmIvUAs&1ChQ()f?>l+jdVf9V{w1e$>z1+hyrI za4V9w39Op~r#-h@xV7WUvX-ZVBv>jHo>9%N?bqO0oou?x^?i$QK02MebIyC#-pJZ` zx3?x6Cz6Xt4X}e#eS_cJC$F(@u?;g>w8UYbWFqYE#;)S#wtSy;d22IbPfT^dlNNX8 z_cOlbY^72hVMgztA4iVttXUqxnDwdA5lCj+?6vuxdB#>snVuHgpS9wz z^ybU+Ks z@q+rKG~hrjrZ36}55nEwH6SclAM;)Q)dXpGi1?VMaC0X|M#EbN5yb`~EGtW?UZW}D zOsSq;+)H&3hcbeejDGN1-=*{m)Co-WtQ*RSb9BeE2F>Iv3lukHCN%w6!sJTls4S`P zfL{-NwK*(Ap36E0tTZQZ&~$pgy&KT+mH!LYRVTc>t_!dk05;HtALy;zNT3Ng&=}!AO0*?6x7<=rQHIK@2u0dQAiFkr~X(w|Jee@4x_? zbSk;Pw3PAox4Dznl;{?_x4al(`2gs}dU`0VIF-clU zFeA5Do4qW&Q7Mu}_V5PJh6#u5#B}%$%>d>T1K4OTzh=?+4OvqEoG#U&FSWXhArz~_ zg_?tXJXCF0iJS#I7d5 z6~n5HN+-cokzkM{7ObP*G33{FTQt1tcFuf8l}yV3hsW{CAL##Jr2jou|M64YZ3lZI zK*n*WVn$sd#T9lbTtQ&Q16f6!&9j!zbH6$07M|0`4?^<~iq?3vQ?fJOrR4@RW)7*} zp>pnxTy-D2DjzfBt`(69kXMzf+9ECyyD#rG1DcyQK28+1h}^g>X%Io!v#FSHOHkW+ zdGBD*k)E(<&=T+exh?dHM8QKW^wc0~g`iI6$Qheq4aX=YjPqR!)SJ-tkFv?TrB)I2 zI94Uf5=ndI%s0>~Y@<^`_kx9lRRACsr54QdEi$2N5rJOOagnpLOA=-}@cQsNl~P{s zc{-4wf$I+;PMA_dbZCQMWlS3#BWw*wm{fX$(D+h`mw`SicfUw z^j39);cz0k;gj`~Ho?}9*BjTFudTi)D2Vv}lD+ds6uS`W%VA}qp9+K3xBDpzcgU-{ z*rEcyC`G7TqI^hp!PtiWiBb6YJInptB>!bmUB$gp)9GQLzM&6fIZ1K)q-8lRVABUX zwEOM^tN7xV3q-_MZ!-%9hFDuA?XtM~f6i~h3vM!zIfK;&=J47jK^2Jy>XR+B&ft2^ z%T=xx8KWP0#na@<`|m*X$BlCPRm`z$k~`{Oxq9qBJ~xy3#5PhgT)yZj#ny=i+K<;} zzNy%K8C;t~)1vcIVI~M@9r$3<^7=k1&e~?#a00|5pCf78OCt0Z>?Bar8;tjo5tp6h z+RUU|`)X9RVUaLcONM&Lx(O;p1>JlIVNo4fwf4JV?ebvx$iyX+;h>WixHOEMoC~S; z*YRuFb>V1tsb{^6oSeRMU~f+k-^ho{X1u9s0!?UP1jcdFLR$h}1zhpmrs%YtB`?$H z@;-67j~HtCbDoj)n)^FA^qb`q+wIrmez1MrgfFC4ngS7dDTrI={3Gj`2mbOr+b}PI zn8PvR1!Mf6IK6+sb(@pCvp5_2#Elm2d=xg30p*rw zaqX>7Y@?OTi3p(9eFAk7;1v|T%+?{=s64Y5LB_1`1bsEF&i{mJ*#eRM&S~l^usTh2 zboJ5I+r9?O_~aIXrJ;Oq@a#BiE#Y_-UbOxoK4lP*l5ZyY`M=*!{kMPQw06lldafP(qwI_8h z*=c9H=V_>Y;7N?e<0Fl)O-MzEu5IjSk_2)b4^IpB>yeG|y(tG5Nvc4|m-kC{710*w zdUmReFF&a!;a)*`diU*;FPze`!xA1;29@{h0_S&aC(aD=G-upQ3fZLhnHI%do~)Nj zdYI5yD>e(JB#@8T9ea13g;pd2STnFm9Z*}%L410JH;i4l+rBvy{!mLcFI?`9YOhpN zZl}o5v3X`#Zva;<840(<&{Mw5Sl7zSdxs}EVTBHlem;jgA*6G``_fVy5V4W>ct-VN zyIkTEm-Ahppqhl>&EDE`v*;U%zdW~;LGa~U&@Ho881VR=BTj86Dwqk_Z z2hfRsJBgnc(d*LXfNdM@q9(!R54uE({atG(cx?6|X@*&@kmsr<$AP9r|lS}mgs7LD2PJry|jFJGvR5oYvd;IJ6_IHOug9SzO5V6ng=@)P7mhclCC#xIJnC3FiOC7OJcqBOK3V9ghxyO$K!%$O%DVHADp>g z!A9*B8C9J#25@#>26m?vP;XaK{+b5=aWj6sIe(r|$n4b|8b7!_@y$4hAHv5Fz!X6`i`NjxY`X^G+~IT}$rzUp@TD<~!L?fZiZ-#1%2OBY~xjVYS7=873gXYLqZM5d(fB7`a8 z{UmogRlJG}4;4qLZq^K8Z#KnSvmMV|TeKLQhuvL2aI@F6|MuqRGoib;MY?@|)r%FgZ9;T6Uz&Xa~vE`Td69m78DCC}C_wjo9GQ@;u(zx>9?2mi=h+2&nu{OqTgyne%>JY5QnE1vc3%*D5&K_)q2`KHcVVOb)E zI*wV}ETy|;8dX`-NPVyz_j@Bl@Eot5yBW!x?Pu#~!&at(toW<}qRV{etVe#A$epEg z5-wKaNtd`g6udt93TMZ>P@A$eA61r@mV?4kVP!gFKHf#v^EvkPdDCI(>@#n56_61r zX^9i=vzP|OqOtY4E$J!5y(iNz7Hh}qo}e4R&E;yg5oQdf2zRs&%w2d+!oX}UZR8e< zWDc}RKKpIOO!P7LTR~S@Ku0!3?A9LEJvO8-AsWFP+O6lbFZ3<-c^o-IFn5(}cio#t zMYhzIlHNmZ48^=xQ;;JRkP6?+x^smLF`aE40h3TIBuU1st4Mk0qze&nQcRk@13QQP z3HZNc?J@f^6f;IIX-rWq#v$IQj>d2b4se{&H)T+o=3hEomP*`qu<*~DoEo`W@8j&E z+ceivm26$ETY`A?BFeBvWp6|$S{tc|s+3GuFdes%<1QZ7QyX8iuY>I~z;|L9N-MDU ztj%7*b{n0min`AgtZhVYeF8q-y+zMIi_!RS=ODaE4*_@aL>;6Kmu5|?&s}zqNCUow zP1hhsHN8iY?lMZq#6HgWgfBUx|F#A}+p(L>$i!nOJoo_dlN@pm_FRBN(E4U*r*L#X z^jbg6ytDw~`(!^`M;!_t=;4vqi=9X6sGBMn%I!RqdZ~pHS(TcF?AniFDZpIw%Of{Z z`B(#p78 zLIO^Ui6D!GX@#W{<9H`DiCE zo*5*4JR9z%bvw5hWhc?Fx+=MRN0}$(Y;41wR>$uqBuVBfx@fCT$B1IJ)=WZdC{Cv^ z7Bv^(AGW6sAzfVD^o5{VAUXFU5N%#hiUvCvwm!C*qT|ny2x^0MSp1B_jObUfF9ksQ z$Xa2vSnl;9HC=L{0+zhpW-}<|D&Vn%hh$8Is>rS%D9lnk5&b&W&h;! z-=h5$`9tpF8?f_h?Ey6H3jHdNTiKh@yH2oYi3c7V5p8=p%#n!Zq$s!6vYTDBHXy*l zsCqpDw;PkJxY)FvZ{r#o;g;}!7W+V)fZ`By!9r*RvdvuiX^aFh5}HD~@d8o{gqN3c zro$vO5~=C3&O$WQ-#lDNH>Bx`t8V_tS^kN41m-8Zz>;r19=2NnSm?rTTiePri5}O} zW4rQKtMp4ZkTpv0g92oe6m^<kHSP*nlfriQl%xKd%Zg9+cuU(%hMF>cX-ia z>uivSoGeOhH6{NID9?~N|SK;`d-66L}0_Rjiw}Dk@;oPt%ukDiTkv~XU-D` z_ie8&d@5dc1i$B=?vflqjfPwJKq~u&Q4OEBWMNSkKExWHujL&oZ9(Si5O8`TZcNtB z96gxc?H}}rnc5Pz9u=Y6qE`$=4d_|irb?D!ITyZ5?};enCf(PDV8BK8p?iCfG-|Nn zI1QsJ5G18hfRXT-<@wKBH@uydMY-4nqg2!GylogBu)kKC&A+XiZ!Y=PRvos1?CVkT z|4xRg($!+E?d&H6Z9HjFAhlw5Ptmlm<8OuKkU|~Rq+cFg;IkSIG38ylFZ}Qy;D|@n z_%?Ii8l#6?(-`LlN?I94$t#$`9L(QLzAR8NVy5 z3YN!i10^SBE4-f%i4|gMjksnvc3Pl zkDE%Ibj{M)(#72N+WSQ+-zXhN_DwFN8V=oJxfYgVFKRxe(8y_eRORA8!;+k_t+6IH zqA=e?I=&eFH{&Rl+QLqcEk(?8+F8EUNm90Ev%$ z5p|oFZjoh65*PQ2fB{pq+-LxJc0aBR4H-MKu*j%mj+$Z3qi1YWoTdbuaT!4h$YJga zPC@lzQ?QPAYzEmf*Be#~Q}h!4)^Om2BH#Tb?b#$CS=hM0ub28@Kq&S4#{Gq8(<9DG z<*QZtS5}`YqDT_a7NfRb_9TJ5u)41Mx!$fOMTp00feQSsy&Kkb*O&QxsB( z-ej~YcH5Npselb%Y>FJ80W3`Tdnv+qY_g7e$~U=P6b&6NL+Y}g4ao9Ly@tRrI_Y7R zdT;c##od5#;loCp&W^lZ=8Zuh5@>(9EV8DTTBt43KAj$bQY3X7r-4;ly`7%B0Ul9DadO@QSuqJ7nGb$z>J2%+g^Emch-e34oEc%QeI+x!Um&KqG}p$=K3GLTwC76HYX`5 zA5_vLr;!e+`h5rzjINXeeAa`>bYBr6Yu#h2?;KO@3fbsU#J-&MV-NY&`G&ht_j0BV zQ>3d(5m__R9Y`4E1wDC&K#(cM)nPNPW>JkpWt&Y*%7ad4)9~}zQnh53%DWx!jaGfs zoRmL+_2lN4&&&zT9!feCko)t`>e6m~3+i*!R33+5N?;-_pH#VEm`|+)f)D!H#DaYP zfd%}J+YdN{cS$47_i{fGKABYQIJA-(gAn3_JgK#&jEK4Xn#-u`T+(CcqhW8Kp1qt{ z`X}tfW^awUtw&ymWK&X$c>3n^i1>UXG}^gknPs`$X?8x7)^X7uNTaUSUTba8o_g0_Mdw&g z_RU~NL)pu)@~}0jM^v8`{1n=DFg`J^ zt48MYi=Af(sy1LD0vx-aJl>!aqtWscQW6xM>p2?>YaN|M-@A1t)Wl2WTZ)alzZez*ap#nUY`&WpBn)Pf2 zA}D|WavWA!6Nv4t%qI@~ilo$u9!oAh-XB=E zH0>_A`|fJ1-^Eq1X?NTquY+rmeK%vmQxxDQt~mM&Hha6ET z=S^qpT+;a=;RUI0VaYeu#JPD3I9sQwSJPXQxQs#rT+Kh(H^brPC~H0zm#pU4>jTQl ziIiskw5y!A+lJcEyOwr!9Z4RuoP2C5R6?!DD#f}AZ^wy;FuR(z`FW$co`y1Mz;u)i zgjr;&2Lj@Rvqg1^(Izq=jRG|KHgV0d9nXh5d^8FQ?!@C!boCQY$QF{$jOl@F4 zj-i;FY8=nsZw=~{hVFGjB&oo9yyo|dO2hZw=;)gkYejq(d37m3ZB{|l=+Ie$Z{44L z$Yt8@g{xPQ^{qlv5Pjixm~?&mu^Ccn?rbj+chU*VUYxDb*$5SoV9ISx!mJI|W<8GX zG*u#eG(R6d*8^)tqILYQfW0sSh|jBL-TH*HAvCa(%TZB0l2=xPO}V()b=R{YIn80- zrb@o*Z$R>To-#%A{#0{z{)ak8_DE$qSX9YeH{=#a^1a)KZ94wfhjuMJw2RpLzE0?H zhYINTyCdu^pE*K-B1{iEptFmB&bEhT$RPQKUTh%wD`lM)5N~K|ccklt{BqE{sYV>v zm(^ERzvq(sUC7a;XH%iPdvGd zzZlt;q!cB5tyPVoz<`1->v2ek>WXhYd*Wg-B!@bqY`&l7q3gRcF2;#VV6P7i8l;hQ zxouwh><(Xd_74&b9k3cj41mSHW|u`hrmne-zk;UEDi@r^WM$=6LgJ*jmnZUzL{$ge zfYS2nKqtJgjNrkC^j37XjuiE3MVF4k9A=gDOIPMEwbo(r&B&$SmiWZ9newCGSPmkioBh!01lu>1pe+W zpzA>biRAot-`vsNj((GnoJH}{tU-0AR(~myH;}@?kW}%P+&RJVWu#NrqKvH?pUO zQR(Efk)}$LP3+U*;R6cG_ORvMoD>}*U%{dHHoZcB-YT?o(QupSpaKBTZ~7ak0See_ z%#lQtvX+_WjVf>5BJdL%sR@haLnF#nhXoB$Sv}DFtd9EA`-XBbd->l!h;Dqn2>F6Y zH${KYiPqMm#0? zAj;1hW6Z*>Ymp~8KR3U}EoBH>Ra*4)FS2Y#rdIGhl;bAHsuXdD1GPW0wxlT8vcg}1 z6QKq>1hzgxAco}NE2VlLI2j*uWBqRoW%?9^^Nx9NElyIalz9iTbx2;9NiCg%-Kh!l znocK}6HyM`BQu-446cmAO!$cp4Vcyg8yuP!1`UFo(2wZZC8D9UtG)tZNon0e&BzoW z}owi~lZC>VwB9YS&_753Z46z5Tnw}Et2x=7^$FJy1)VBA!K zZODZz__zqC;7qf;qXDzb=CAhh6I`j4ESBE|ilA8WSgB4vJSm9 z1Cg&r=xm6qGQCVU)3s>*)bFdnUB^TV$eL%{-YMKx8uoT7;xq;7 z<-SJQk0W*$SCKm9ghsqyDp(|*SvKn-*C#JlZ-}Teg{QaMWo1vn5DynMr%Kac@`s)) zkVyb)Z)d-$V=$~Z`{&*O_@&2Y^J2k06H#HRdNw+ke zUVDM#dn9h~wQg{{^u}c(tWZm6%Ph)HM*`I}-v;;`9ufSbA|c=-x`n?2g=oI>2Vyz2 z>rX{ag*8b=8O4XWBm8`I*L#=;^VL>7>z7#$HA-#&wb0`cv3u9Apkot%<&ZvW^a~2J z+v?0ehgRY4&DD#HQnVLUUFG$A+g7y5rvrnub>yePI&``rW5Ubcl7n9U!||7D3%aj! zOGZh0&b>IUhyPV*x^8>-?gSWZBQDOfBp8j}P`c=m4wf|+Me@;yk zD?Og)wW|30viS=?!dgQFekHFIN?TmfK_yW(*8S4}>iY;Q+*j{pV_JKD=US`G*PX4b zfh#Vv=`R=M7S;c!p>pzofQ;R14kY-UOd6Sh2wwV5)xTqpbSi!t0>i-l#G1=c4P_nL z7NjcNTjw$6Sy+wrEcSi*e0GdZN_eqDZybEO^2Fl^ltmOLye${L4{yv9Fu7&?R!f!E zhB4xAA)v(CJ6wyJimO`2GN>y**u*N_D7tyM(SFcNuI1vC9#|iKnqJl_fT$2(Y**TM zRB2EMvoIzY!oSkA359KW1YQ3-w5{B57zmEHc~O{gWP9WC6J>tdQJ2{j_7;7N`D5Qb zKv!@;nqURsiFJ&D&*(dzRFgl}j3rs$$9?P%%qZRuNJD#LvWlINrQtS|u~%9`XCfO{ zh`pZJ6x05Jvj0r+@hr7f;@2IapQl^K33%4Pk#09JaGqL{x1Nr0L=#s!?E?a((>tK7 z?*Yyb`khfq|Ia*s^H12ltHp2BE||a`B1uQvR@h}OS)Zog=9vC*23Lu{!^G`9H16A9y->C18dqtet~HONI7U?LO|D0dg!? z@0hDiDpUWb5#p3I|2=FFHxE)%qbTjRkE_Hf)rwY-e%kS^CcPvGhoz_16!y?qqcBfL z!u;NX=xH8DAM@Bn2tmPJN{lj zTb09|BjMh$sQS8H*m2p0yE?#uG#TA*Sz&*>H58A_0iMLOfO*TQUDi9rHk@%k-oOv4 zJeK%_{9>_FbNsoHeu}9$5`RSerQ!Q9;kN(6<-kYgORq9+huGyPoYJ?Gqx^zrc1^XD zgQMkQ$@xO=CEAxew9jx!lQy0u24HxA=%vK{F%D_}3gD%r@uG|c{6J{6kC&p#6CT`^ z0Uh6502uF1OGdN(9SHy6;2t@R4JBn^?w7WSyi+rU+ysXCby~vtr^C2j%KOf!m3wwN zxXbw%NM)7WTQ1~@d*dV%6z>Lb+Sj^h))X~dL`n@*-8B|JTLt&9->7at&8(XmfxVZH zI3tm*sqv{t8~zt>eWz`+6!jpuAxO3WE~lqtAPid%s*=eusRnIlXA)EXhTKWI_NOrOaQaP5w8((F&7HO zt1P@PC8MBVFfMzomlHRLTk1j ziI0-j9^QA@eLYQRy%$3n3^F1@G57N0IkLoRYEQW>7vvYJ4eTFjEOdjwdoek6VyuJQn_VmF!+p3K*Q)Z7h-1!jqYD>&r?mF& zd0@Z@9n3RAUhQb+Q@EyK0wg^*w_fzKfpCq}@rnZEt^-K#pOB~iJ?=lr-;+2!$(ZlU z{Qtq+G*!|jBN8sS?Eg-vv>9Mm`CCVI|3|WaA1qo6@K{;(mAC%MZvQFf_tF4j3wLy* z!cm@651tY@F#E?WN&xuQ_Qk}fJl9`H=@jk!4}c*H8^Ond!CG?tb>^&Ni&XGOhZx4b`8bK3Ods?A^U@&7Y%Te$^9#4TqQx@wiH3>=+e zlMK;ARobYBXMQgEk0Zn>JR6Oh3lx&%5y^-~2-Cz1T)qUHBd!oQ@KHXU6`ev(LO&jH ztyZ9B=QJZ}qT^AFo5L2)Xdk~F`}Pm(xrINMPm<2pP2Jq*=@Q#ycNH=#LoBVI;EeXK zfEnV73(%L0xYz$mSl7uDhD2qZIv#%A#hp9J5Wwre^sf}Bt>0e&6qjuNY9J(BXppk_ z2OK&*oRt>im(1J%F;Ul1b|5>OajASk^BQ&!ZM}36^36t;0%oG)mx%@G(9Ft zc~@V-(pE`!lmpcHkl}&nN{ze9@SvJiA8bw_D1d_1La&cV<-oY$Ef0&PqtDQYQ3vS!D}{i z&j%td$in>WgnjmxP25nkJjJ$dH!kblwTI+l4U!S!^o&!Oxy_GNOuHMFD4{t&?bRZO z<~6>?ZCbWq7QP6Qf$@dFo}olE;zoCM=?I-$i+80T0*!WsS0y)9p;aOd)E3-s5wrC` z^ROaGlMrD^vKA#}&3LXj-@rjI8S)mUv=_H05cTJc8Q!nk@|&;58Lsm8TRSUm_}~Yak9GF5K}X4 ziOnpDERqVxxv52UO!UYgS35o|wtNfIZ}gYi?n`)IRtkmJ^~}#ua!xTUxHD+=39&S0 zB}FMHhlTJ;O>B-jpmIv*Sy?n(A_8KVoiSzAfw9gom>9jD{#a92#B=oC=TXbO>U!Uu z!I?I?hRgd1cn@5iD#h&I>@9R`;dDf&sNFFrD+sCi23YCuWJD8%t((LcAe-VUbP3es3 zyjZ)`pql5`Wu}L!>?Q}eXl4xqbehn)P%-8wTFHS=M47oC*RxqJ+rUjPxmpow<{1z} zn!)Gw%&hOFwVOI<$xI3^k2j#|(K{MTxXQOX zlKn|-oR1nRN-uy)*)#%|=6O=SqU8NBKYQJ;I%oApDkli$9FH-;b}$?1qh@mSl~{^J zo%>8S!Z>^A^^%;r)2&+)<`(LCZYDO|_Hpd?HjC=mzlH+74NDw{-M|FqT%%*MdbJQY z5#KG$>w^gKzp<)ym=ecOaXPEKjK>u{%VBW#C+NB;eAS`-5mS+fNOqvx=dih^ow+C1 zxtT1~WLMI)x)b^YOZhd{7xRhzSyO6v!<*!l*_m#zNY?qyr#EXF#l;3YS4XCzo^Ezf zZZK#$$sbA|+XO@Ru3#Dc%(vp=CFA8|)XxoSiaNI62BMc+rMH5FCx6s+IolzEIINK(=a{igCnEW@NIQE9+jc z*@C3h&2&#E{0`ce+_V>0*C*-LN4;!E>MU&&h6KgBmUXUZQ;YC*SurJqMQJgLumL-Z z@Hv0J41a6OzKXX^p1g-FrA5=OMjj61WKBVnWX+lqZh`n9*}!bTda!jv^L|atF0|G1 zpKV_pJ~CWd9*-9?NEr6Cdhc*3cI@tE58~=0pNk3kU$>d66!A=`ts-wr(BDNNc0%T}p|j>ud@F|oT%L(R0kJ8a0Kj-_i|=kl>( zqouBOkcKIQLT2ruZkId&td^o^i6)s*P!Y*mJm` zF@AI0UVp>AL;RF@e(+B~SP4{eJ`B4fd-XR^F^|8i+US zlq9G+n15y^QjFxxuA3Q&Y;6w2KhtV(+jssqbicBMIibdBLEehbFw+u<)sQQfaD zS#&5EL4+o1wh9t#wmpwmAr+3TvkFa<`V9BYcbQ0aDp^^IQtG~% zUt`G=uSE1=l0^6S_cWOEvIMO!oC>sgUgMv`AR|J2SQeB|d2yTohV*<^R#>U{0VecY zj3p^`6WI_J41AoBa~M3{z}BSAR8#k>2#bJnp^J|#T=uR!4?DNDYiylU>l^Lh52?W# z^vKrhOvFOP*tOrj#cV%de>A@X_7f`k2+0wjfTn8wYuP6C5xJ4zToz_fyNsaUvwts+ zEjOlmltMdBPK!(&{cPL8;8XT>*cKDlA7=riK-c_9UWB*xJaXfyR!*B?(DQKC;HB&H3X8T+K2I0IZ{8s)Tod?RA#rYFhQUlCH+B+E7Ovq zJ@u8nkI1P{MP7;zj${3;k{Z^Z*apXd+BD?CCrZxk8#So?AOb!@+HALJBgAq+JAHn4 zjs#gyF)Hnbzy@~BDwV26&ywUtcIrHzO34d}s<{XtgrMV`9hFWplMnqI=Tb7a2zo^l ztWK>}a=u)WRe7Rpi<8o}JfI@9b8i*HzPk%&a#%|E03#!=iJ-X%-t7t1wbV-4j_5Wj zO}ZInTeq-GSC%<(HvP-e=g~!t3Z=rVi4TWRV1kcm6(c6W4Nnhg;mN}5mgy2#ErM}f z7mx07W;G&_Y~vm2H$f$uYE^Oq|6um=Kxm|EXDfeHpZsHs;L?(QZ-kTr=dmD)^A~J{|wI zQ|qVD1&RXq8oYn1)|7bT`=zDUEicm=d?|gwAo#B=pDt*|bqhn@v=61kYI9sM$J0kh zybcNM9Tem{4na{UHw2S@Z{?deO7e}u?9n!AeK#l)hAYJ>2r1?0C`pp8s-EXE(Q13c zQ@YbDJZ+9n5AG7ac5_Ko{4&w=Bn#ns;Ixj~iULJmNu|oTFI754^}fY(PfjI83FnV% z){UUsvj1K0F6QYP`)}~c(FwTU0Kb3htv+hF*KMk(PgBqp*-k!MW`=FGvHFajl+c%v ze_;(?rs=H${OJf3n+R4S9^AmiEncJ}e_&Ojc0L#;%5^i<&qhZW{+S2r?3pkL zd2d>oG}GhhHJ%cBORPpnyZM9?{B9(-0gd@q`{L3m>v5tC^6@MsjnY$jBZzxfNlkrr zxoHvn`a@N@XIXwbe1*v_RXJHPB=_kQV$ z5sUfAnG1c_gzlL8+Y&0sT!8bOa-{Odj`ULX=>7k`9OISqyO$9{EiLQIqmOtat5ULJ zLm!9)G?=Q~uh5B8??`bR4}?{ggFh7MlFy z1E@73E|m@Mve^T5Z$`ptueCYKtq>}Yc9RVvg`Co&1UGvF&4rLu_vdYH6O4jzPhr3j zzkXM13BF&;G7ypf;<;+xi+S1wmUEgemrpNni5l33zjOLJ&K1{-BLnC)BDU-qOZS@` zwnImW@n6bH-L;oq5?;wJbDgf~U-$&OKM*PUUgD9Eh5JoHS(r$*DYCYvIDy==P_Apm#^ttH z$#+@L1PI6jt_Z$rKiq+y#dQ*4gpV z)O9W&4+Yo#czpc2H%MQMp3bxXIz&jgjki_pbjyxCa3kW|)45uP!r4^@0R&eSl_VAH z`Wm#1+FGLV)NqNZ5b35 z3_JvN5>)t3^&6d^_FLm70A3;VT<;eO1X4RI>1ZYIp&-~KcKnQK_!zqi6?>|4(>lQu zWg0ooq@o00t^CzbumfT#QoDG2@%oK+Ai`mPJ>SJrN-`{gsh;mHumg$kO75)>{T<>}ls1UVXqopG#;o!r7oincag zi8wim;*(<^Is-{{*D#n%d}%}>s+XhPyn~m^<)sNEP_?tYXH=$D$@Oc|vXl^t1*41o z1zeReo~fNpKrup_zuvM_fD-ZmpXm8es5%@z`G~xbN*LU}dX70IW~{R68(ZEtmgp4W zjYgigwto5GK^RbrnaS!}Vb9-^8MnLA%Vxiq0%YaC*sh(B#PJ$g-vFt7h{teO;>i`q z5bi#ooj0#_9lPl_+d8NqG-U12bkfoW$ZpwwP)}yrG_ycrbV01PTBDg;bS>57&M-OU*0!|#M~}fr8)RIGfv?c@-i<)w zpgskh9<2iR%$oH9+^J-jm60kr?3`AQ1b0gLx4lsZtY2l|D=|SMhJFr?`RvMUiMnr_ z?$1kxbmwP6;l%OK>s>ZX37^Y+v90f{VKQgN_IKfQ@6z4A$66;8_B*wT{Ku6ectj#j zBhlycU%ZcZD<6M^el2U^{ zq2jbCt*5l5Wsb0Cl)39RspN)^&9zNM`}Zl^)UE>gv*@mNQ;Y18-Tg+Td>K7BbLlRn zcOzohw`^9m#z@M2E&X}+KxCn?VD8+j<4VN&wk}`876=eS!9sQ1m7mJ%eONd$hhEm| zk}QPwY>3vqIo&*xF^a!ETx7eio6LmcaCxMGK9c?`M5Qzr>B|`kbHiw}15U(a#_~iy zJFlgPrTiz-wTlvNUoWZJT6v4*0h}1Q;JWA4wUBh;IeE67nmBExqW7M2@r^Ygsp1t6 z?7(m^V0>=Ui|%CoV%yJgb&<M_BM^cFUh($fow1%_Pd>A!j{cVW_E8#5m{$SHBea zWxq@qfK*87=F%YLGoPh{tEcH1Cmp?o#Us=WEnIIB#TOsVz2huVi?-R{TVJWx=ZAlO z)w#Rr0(ra7kYLhDj3{ck>Ky%>T-i)N__1NnEoKw4b2|ReQ&@1MRij((n z()@rZ>H6=TFLEPnCpk>>;bTb}*9>&XkeoubYM!)zP>_>VCWY~MC*s{zR?~{hdd}mM zKz`E{Ipi;i(DB?Hk3^d*j(NNhcX6ndDES-}C8nI>Z(^`Dcc>Yyu7)j@bn(O%+$0c- zYGrmDaVi9+d z^mcNE&$5`qSJMtquc~Kd-Rfe(wps|LD2D86MZEKJ(6a6mR3%4yBW-;$`QCmN z(WpsQ(xX(s9=^3#=}%7)R~7HZDXHGyxfa1f>3C9JK%gSc2W$rIeCrB7X*)0GfdE}` znp%NtQxiSc{xVhQrBIlwQSErrw=wc|L=$(|SZSnA(r<-MiZsv92^j z44z_$NZdJcI4-HoTDJurY#h|hpv;DV9{wc?$IDbNg<%wPC8gb)UA ztu}5avIW=0N_L>6%FtB~13|DB<|I^SlM=-DSk>CxrpS9h$kzAB&X=58a_s4k<&SiB zc1eKhG?d=WRcg?GsgFU5aPfI{j1ck5TqX%)soLmEE^2RjLPss1m%p6>Qr$Up0mfTe z=r~r`2=Tw5vfdyhrqO}{l=V5*XGvxTztfMsq|wO$yO*VDIyOrrlzqF_?9oV-ls{#> z8eRIy00`I?x44vtfw~9`-0a3J-Q>xA1zHkQ2lT8ym|RH z*4uOy8`m9Vl`} zdbA|EaI2N|r={6CPJA%2L@Z#&Cac{qROIwr+(K>DxTm=d;;S!)g4j>u(Ms|c&OH8U z30|BxTX{?aAbQ8WPS+doMRD9&6298q&u7N%iawyNuH+1g0OoMqD#$9S!P-1}uXba# zG(1J4eqCD+z5q$B7TQ)zd5MGctE$uiD`42ryvsQjoI=_PS1)yFd?bdxMP=f>*K%uL zq}f@V0P3VY_}!ZX+yciBQGRX1D#i#^k`Fl?HYV$k**4=u(pXcPVGHCIE=&Tafxx!wlZ+&V% z1t-Bf{;le*UabbRWE^Y(otHp6?2eW_v_|zto#{Kya<2|!(Ni79zAb4EO*JaPB~_y& zO&+F;Ds*nQ*iNVAH1z-!`sl6%oScQfQd;kRw!nq7j7v`k)sv8{N%S~Uzzfh90u7!m z=XsugL-hqUC?%A7J?SJ8KAxqpN;KXP0O2t2_l90mh=*&X9l%}I>pxbX0~RPYpStZ# zQti0*X@4P~r-XB?9v?3htMxh-*e2OjfCm3X$@zG2rg~D%h&V8RCp1YP^9J>a+GV+lz zfc4@MXV8Y=Cw@g77${l;nSeTSzt-ai5w38(6R1Bhy1=w1rII;i$tM|*i9a>C9kD4^ zKfEX8quI$e7Ee8NG}T*Qakckf0S6#jC?9t^LXuFOq}Pq;kAVVMDLodKwLyq@sSg!$ zmX>;d$!HotKR25qPT1#<5Al)F>O=%l>JLO;_?>Ol`8pD(-`U^2(VH>X2B{ z|3T_6coa%k>y5eO%->83?6621i{q;~$s4rjIZb;^on6FvwTj^E*uxWm1p04wp#~s& zlw&Te)*4V+IQ+HsPdC%MhDV_Fiw=5g{1#|(%~=4Nf9;)Z zTP9m8@SRN%yae>p|4MXT1HVAYgr;D*^ig=xaQ0#S&TirNu1_48hLr-=VQbIb?N3sU z#l&+9kKwuRoRcBjpVI$0)~ z^NtNrsEXxDw%~aBza5T{BkjbrVe!%?b;@gXf2jdDGy~7j=mSb=wrc$kJoE{Izpv+G z3pk^ zUBOK&PIKrluJFtscdviYUw`_4LW>zE5PjK5gg^Liu=s;c+-DM;04tI+!R7wxpE3C) zF9IOPZI)GO5B^VGmWm?B7JNOf*7zh=dF1d#7XuqBo;TILf4xgPMO*O?@SsR|JlL#Y z?L(Zw67h*L=VFYA%KdGXtfZF11q_!SBEJY%Sp1ye0QMWbg7~GQEV!76zL(*;Wof^7rf3=8uvTAS=1{bL1I^0=ud}g_LsP2jU$bV$O6-3O9)XL zM~dfLrpT2l_IekCt0=iii-}b(6ca0-=OT_EeqL3gx;9E3x+PvuXHPashH+nN*pRyG zI2*}rvdU(Q62D9DUIqL!ND^)&%{1!uF}+di~H1gFSrm=5%o8W0L-l&2ZL zdJ&JU3BRYOm6(qvRzlh0d5Py1qtuR+++~#4UB|YX-tzUF-rDuxOz&?07Vm}}B#TQA zVXEbkm*T&yNs3>p6k825P7*2V z@g{U_u>y>Ufep$oX-54#PPcPzPr@0~Afb;+pI#?`GH#%uWPYIR0N)>D99#?bZ-dHu z{5B<_*PP;soP+|D;o44{x!U4SH5RE8>&?zJK>b{!DRXfjdNl31?%g!CDs{1D0*>~Hs80;70KFSLxfO>=1n)eHFv2l|dXp|XrS z-z)|$b7RMUY4c`8T3CTRWUo8Em;0GBnm=#`E@nY2lz2!Hjk&Nat4nvCK5 zUy4@0#s2)950&0Hih^0s6i>xl@!*ExX0FMGl4FC^(ZEHo{R>v(m&Lf?nL9<(7eI^t zS6RclN<_0`^3)QXJ(gYwq&Vm#*k%)5vbsi5-pm(4K@Ye%nDx$}P34qgi?! zs13A};;xA4h;L2jB@Mj=dfFW5cDlUGzW=_YpUjQ*amJkbqm8?5TcvakMHK_LZ4Ipw z-P{zrM8sTD5dp+tV-TjAVIGKf!EprFsjQ?9yG<67?emgS zqJ$h)pPDs=DM(T0)bDoGH6`j?a{=CI;XvvFs0%VnxUW2zoEj%HeqBnkUP=p+ZIW{$$7MG`S-(hBnvy*&0{Kgw@05y~h!+d=;6Mm1 zex+jV*Xx8O17U;s1T&d_kFNG+rRE-7HFRgM~l4f;$-(hLI-f_g2lvH)@0Su`x13P0xR|5F$OOr0 zdiw!bwxtMxS>kS-0#8%iHA0lPRTYzXGolPf{M*=?x`g~JKN4MMH_6`Ftqz_XJsTUL zYl37+FBFyq8VgZ&b-%JY6p?0*cyH4CCa<5L#HQDyev8iLpa=#{@(DH0Ut-Qx;Q!50 z>HM$8hD`;W7LMJu>jmjGHHA+j&Rc+n|5dwgMEym;l$5ii@xI2rvl?gby}EkmX;yNR zA_9??xSW|&lap(??lQfcxOn8p4O*1QEl`#KNeoD*io|17s#1rL6@_1d)~5 z;SCAmkb@WQts_&nZ%r4dO;@n$L$`}ZH=nRSqbqNiWQ^kSS^}4kO%AtQNZgxq10lZ* zlyx-Pw$86qX4=dhGSSnn_O?Q*#!|Ig&c4`qOy|EBVv}s?E3i6^>NB&oHtV}Sonm{` zoonW#l48jjyV8SzQB5t{C-*cgSLa%;HX!;x54Tkgt|QIjK<-+KLkAkd>94pXj}9E+ z)=|BUD0;fM-0*pDH7Lq9^kFumWD6n3GodtUL|1MbQb;hTT*Yi!tF&^@{V`oX3cZS@ zo_t~k5(4eEC*+m;7BxE?PG}=&J9EPuQpviJ`+kaoE@+XtZH}LZIst}MG*~t(??d^p z)DxJosFXHR%v+jX-OS-F!S>OWkx2Q?9kjE5Fq9EO|EiBbYk#|aJ(-x)&mwM*n>OC) z=!KA_uNb57t$2>rhQkS)CKY8GArY2ob|np6-!Yl;Psz7F=LS~;-#*wdyC24ov`xX% zzh@H~d$&xfa4t#EylD=*Ph3v2SZ6zN%lheEY+58uyh1FAlxN#!VDQc3k@6eX<7~RI z^wG9R^zim_Q9o2ze|v;1ywkPz_EFkpaCT?Di=Yz%B*Q z5##Qh+C#pJExaQ0>b#-ethxL_8q*%yRr{POA(RGo;H*|d>LQP|kt&jt@%>*VwjT{H z^4u}MRrj*wM9oOz>btKQxkBMiOC{j)aB9CYb|XcbhP9WYy~R+5_>!A?aa_Wi)^kp1 z6r)5*8b@-e1n2k>s#I`3`4VsDV|3*7ml#G#SZ15ReCMU)QlwwivXC`PczfrY;QEsM zfv-x?Ef$X54~ z_y(2Omhs39!Iw{xOONaX585`W=^w#M3d@zKQ^&sKmf65^QhwUR4xAwykFTh@Jns%0 z`O#4H%HR=`B9n9a?2bm|Y?TdP;LF`Qw9zUZKGq`h}Y=wch%_~(U+d{zSs z>n@88Ya6B8A}1F^my7I&2?XbBVs)@{Ld{QvfTQnI5Dhj1^G4n0urA6)yZx=-GWvcY zZFwA#qXz1YN#Y72S!{S)oOxl&#ZMjD@ibdWgz)F$mF6biMQnc!KdBF1G8tuXM&0QM zf(SYrUn_M*Q4wE*5>4|te5UY0Ey>sBGIR>H+3oDHnEP{M=Ss2Gf*Q9)j16+{ngiA& zB33u;_O0=1#Ot{g3e6|(1|p9J-ut*N%9K4ZF@NP{%R^VUMLma9(xR=IS=8On(2iIg zs&2Y7y5N|&t{G$}&aJ?ks-|1!JXAe^PI6Ceq0k7j!}{)cDBRwwtmd~wQ3)EYB8?CNyymUKriLQWr~$pbtuK%NOn;L zmC3|$=`ku~O-RhFTQ@4%4yLo(z=U?;+rRm^deo}HVN_JC$^5Py%*b`u>AS3*1!9a} zVv;pOQ{)6~Suv=`Xm5rzO?OHDLX9hAHC+Yd@uWFb>s2@D42;GABPvvU zD`!>@mE?5kq@l~u$CF7O0`)4^b5l(^@Xd~2w2vu)N8h3HYKzF~hML8$zimYJ@2iM?yP zvTkw9+1N4AaC2aC2wU$rGA-RYvj)+s`rR!ihViuZ2Lf8Ei4xgKdmGrmBKt_$$G)|a zRvyn|R`xl|I_PyIvfL1^aK5&0EI4Uj7i+-IbonY*kfVG)m5204Qo2k6Rs?GWL7@8X!3RlhDOVrqzEe_HfUkp-oub)R zbjP@h*IskSwm?GVU#iva6i^kdwqEor&Ttr&qr5ZScHusO~`X0yz zjIyhyrZ(N?IB_RI9gjFp5Txw8Q}=LVK&4w2rD;f%z)tN6Y#X zb>7pB3kGzF9cA@NUV}rEV=>8gZ>o^3IUumRQ9KD|nN8_1pS5bGIYg($B6-oLv^azv zaOxJ7Cs&?SKECu&J(rz%g>Kzu1q5n%!!BfPI#X6u^SCX?ZqK!w zvx49C3E!jIxsIGPnQFtRV$`~$(xIXJP2ZJeT`VIR_==OSm9t`lZX$`{j{mV|3wh66 zSc-NA%5xiFWXx$acjy$}r`cV~Iht@uYk zjmYCw)|T|F?CEt0nyzLS(tT3{+L7ENaktd99e7+@HECEoN2cV9a`SdjFNwc22>*ZVy?0boX%{y- zqK;jRsGu|j=}i>r0UQEImo6}5c`%p@Kk$uBns+1`0@w*Ts`ZJvTJ!o zHP`aE(H1sSQ5m&nmd_)AlQLvl&lb0|`qifQnbGaSjxSx*=GLU0q8Comz756(`nh|y zb*mK#n7qNyf@&u352@N&K&o(Pls;hWp%zxOFQ`Js$Y^N4M)68g-m`> z2WvGVIx$ip)o*@Cbk5==zw)`>4D8lafVq6n3%CeqPqNAJEeE5Q4D4&$c^_6_#3%HJi5{F13?b|>-iEp=ZH@pW;I zETba%SUH2-Ixm4A?E;uf#0*!1gX*AiUdelFUvE{Iv$gw&w$y3m?UE+Ixx1FfJC>7U zxjt=V+iq)Vvd-Ah?+h=yqJSQk69inr{-os?o+|Wue7vBcqn<_~SAdmKpXL05y?wiK z`3f()pnO9vl- z0nxXcJN~`4?FEQ_;}>t-$#6r9F@K-*l~O%nv)j*B@rILCEg z2gT&woctP_P;bt>F_@vCbBD~NBQ1jbwaPPBGiLFX?RtKZ1t9YR<KrqoB&$CMNG;DG59hm_JQCtDwNKW+}n4@N6M|nBWRC*5FNv z?UvZyKakb0n)3$Lw<$)2)B6mIILogNeSDY7hP77V>iDB`yh)m+3%u@ z_C{cdcv@F+|Deu*h)9i4Oso2q)z^mnC_ArVe($*&lWZGi~kX=J0V0R~{Zv=|8)Z9cd8 z`o5@}fERHbE$+L`Co*#wg^{Cdn&*iDWX%YmxE<{z%y^)3^3Ehdkn`C@1u3yoDA&-p zU@S~KzS?X^z;}%VW(?!;+PH|C>1Yf;L!4fV)hyvuT#-Q+QnnHBUb}y82Acx*)z~am z1(qFl#j9X!)xUUYw*}+hc&$Sna=B)s5;d=iF3oLs-fUwW`D^^%KY5qfjy((uNk)~T6BwN%}EdGXAyhYPfWRCmr~N+5~R{4 zn7C6|l6kocIO5UGoc!d3jV0%m%t`G_++D~cp%)6;_w>`caW_4-y0k{Cvb&=BNI1Y8 zXg+BzFEi>9^3MdIgXj@*owB9rjX-gBjW@cu0ZW`WTCL(P2+`o)DCd8J4dG%-EQ=uM zxSbv>JKr^TBnG>vAzmcNht*)OB1Ls%G70x zIs^C7Ri5st&4gPwq` zy%3kL2Omy2UT~cVrsi$kKwd%4yx-i*Jj&L!tpo@)fiTBrjxWXnaK8}`O()3$Z}4=_ zTibP70Zx1Wa*Zf$&GcaD>>1!>8EC|(`RkCfxvO=6@D4^Z3Yy%V{CeNo4NAW;@{q`L zPq#!vdu;VfRmkBIz=!YLGxvbOGYRi4n1DG5IIds$p zns5e6e0EuEbU^eTjJSmA0lc`@8Q#-O1&~|rOXM>tIb)W50H;sIU@rWymXvYc-iWRc zn)@zR+BtQ_8&dhn$YHqyvpS4@PH4^!PS@z|KYaeH{eK%0zUxu6A1eEgdH(0aBgH)&WX_;!QdMcc25Uv{zVq?;x!4p6 z*6dCMeoX6tsrp`h_qY&EQSc4ouPC}NR0h8D;OLR$D;k#haS=Ze?t3U#L?P4=))j0B zPB{SJ9Q(&9&sX=D+`0uA%{u>s*c7SrH|{jUzPFeF&1S--Tpm>HmJ2y=1P&0t+(u;v zsV>cnB<%;H&V}qjA$@z1zS(A|@RrL!eEjO`fq9r$Ur}eza2;$j*0`9>i+0Pco!gH9 z#KHG0VcsBlGI>G|Jk4-Q|3~@QGw&XNC0x65aMAt0^mCwimN3vLsjaM=5)MqrluJ9n zr>n21<`A|cKZN}N5j+iFS8(Y24m6rYWr(>fDeR>X2fL^D!vm!7d)QTf%6vj-emHp2 zV&jzR3h8)a*a0DYO??E2yq~+Z(R?3y1GB;|91%<=`*Gj~qNHMuaym3@In|#0;RM7* z1N1O||Dn0D{r3=t#ey)}Z|_(9A-s~R+*>`mY<=&qKwo&*e`W#vXCJTGzZ_ahg#%~n z`ki-(hV8GswX)yI|IF(>n)5E4a{7ui0Fdze==kpR2~CSR;j_P-A-Yep|61sVDv)%D z%d)TQIFLK};UWIB*YD2ny#+cW0cZOb3s!jFrvAsze_YbfgA-6`aoB(69REjwJ5~Ks z29o2zw(Ii~?&vQTvuscEl8F@Mi-{=7yUhEA;oxJf0m?mf+va<%G=6c!)!n+n@x|Hc zeE{cu5A@$1#%}D{2twtj-@X%yIbA=`&BA2*T0dZQW9t)|B~fB^VA>(QEH~(W#vGu5 zu_v&i5~2(GJ+WdnjkgY2M`*ITAoofVrWE`lEEUSjw1ha56T_7W)#uXx_Jw#c@O|b%wS(s5&2#w8e{G7i@vjW2 z+x$EPykzu%WbOM4%_YDd1%JN>Ki=anRWz|5;u4=qS^F`MZwv^Cn7co1*V;H{I5Tdu zlo7RWiT`l;zf=a&USK~fuQA*YuHmn?^51pt@d818brAQzzv8z)$An_cHNowznSr zQWw6Xu%F=3%}}4iAo96l!9U`RD6k$_JT9^xc^K7R9Cb7b%>Y}Q=VOFn z=B4#E1_4;7U*bC|laZ-NI=J#4$>*xx)se?cpDMJj@U#6}?)q7tXgVELX0)4jJ^_)V zX`0$DpV#O-RjY$`(`zlI`i=nfR6uUYwCwEPBlCYPKessVwdF=BOji@5+E+|c+o?l< zN=V9OaKp3jP=q&Z0jbGUw)oZ1jDMbIE?uf#q68RgC?d<6E8aM4RF9jB&>Jq#Joce>wyoESkBZG;+V5WRR5$=KinZ3^#w zQNBvZfaB^W6q%vt9i*D#;T_D~FSnGM2ypFbQ18}fxSY)5C5IxoGN@A4fPVEEeC6f9 zq`BVoo#Me$%t%NEjCm7YbqkZ*TO|x_JQvn2*6*ma_$tJ=5xygP{aG-`a(582djnGo z5|UHWM#6}Ra9cSs)>?r=&2T42g_S#^tx!+`b7AORJkAot#oXSW#=PwYvV6G%IW1!V zTHS#=F?GWqX$4N^%FL1(#;(;HwQUwLBV%0ys=$~~kflLNa20!MQh4F&Ht3X7QX}Jt zZ+Ac2uoB*)hH`y6JbTwsAWSL3wkF#D<;)r%Ux)$;N_70(S1+!HZGnmPRtIfkc5dk} zCt!rvH{y0jr$R@D#yJF`m&3*va2_Zz6RoB*APbp91_wh(ZAJwCVXbLwzERdnn^~OC z@ASe~xG?1mVG=8qG6v^X(}2Aq5t%4;&!zQVEJD`PNEBA&yikw6trA%m5?H0$6avDQ zrbrZF#S{qO6d&>~6Q&h}bz5_8kM{PD#iGG|mm6UR4CNTLFvc)1G&no7MW;rR!s1)F zW_O$psL)QeAa;g)^o6>ybVrWczz(4|Do9?&N4~T2optZ{*$N~!VhiGf0yZ1g6Kt0B`rnyo}aeAs8)1if24Ak&D4a0kUI zc<4E+P+eGSjeoQgKZg&8H^5!npQZJwGO>z6B^s|0U;<3aGmo^+2DR+^RED^!P!46a z32axCED0pzp4g?Ox3NnGu`eVbGUNqzgj_j2ctY}4zk0-fE*53-%`aKHm8Rr#lk^MS=P%Ef`Uc(3|vrTTj^@4CJ71#PKxLD4q?js@Kel_qe~<@<;g}? z%nFlUl!Y8n-Z+N^^F%|py`2fAVFm!hE-Z;@siKdLGZyowZpFoAOVGe18N*o1%k&>& z#F~_d%a(4Pr8PzbWcIHu(%+2m(`CC?WFDnZ+jqPrw4q47Dvk@r19Yzhy-@_k0Dg{$V+GB;3%v3|sv>{%ua-qO{EZ8HM{fV5bP4~V@h!`(=; z-WjbGVldUri?PV-l{rc(ZB&`~os_iz-4gJK1Y;7(=K7`znrqC{y00m`2%?G+*Ijg5D6~vsch*Hc>DT1w4t@ zUKAViWvmPWXWpu}5tyMhuN%pMvDs@_W;ATjM>*-b-iZt3n_Gsx9=3M=eNH2m>#yMg z4(ay?b+LU@F~|tc9*r}euMKK5AjTCNZh{}WO!ZZV2HvG|a0|E#;|l`5 z{yN_m)Rx`Ij$vWh7GsVHfhj>e?@1Iu*Gk|ikPC_g>+P%y5j^u)_u#eKp)G&-N2ZXF zyORb~YTQL35IyBowZgz)+*Oj1oVX+=4O9ua(6PW2=AEU-8$h2-Qw)P#Ss{VQSHf8H zrxf+%XC#@C)|DNDm_IF>UtF+6suqj0oe$o`#p3~n=+hyh2Q2zveZdH2Lfm7i6MFII zqPtUuk%*{S8E>J~K}DhEHry@KP>RnN)l{ie!_5hFeDJ#|cp=#-$y;jlysfR0eF}P| z1)b3dIUxz!u_~RKRnlg&yaXr=x^AweiCii3msva?#(Hm=-odQN7jjzmLPrO4u{6Jk zUJ7>Qoc>y^#nWq>cB1tnKV}8?b8XM)-0(eue8$BXdY6*A8-LS+J4C{A!y407H~5uS zuWcS%uFmLD5Zu4;aohc~*Hk%F*K2dNLqf0)ah)tY-_(!D)8wojY~7>>%b8EbHap2! zi)$qJcZ)til&5jO7Jj$Gne%TPT z^BHDY>2gw7@oeGJX1}%1>ImghwC}sxskzlZH{K<2m5>_h)lD-V@4hJ}%z5$aF}?)m z57^b&Ouyr`R-lYml&~`1;0?-sR!cc>=_Rdcxu9ofFF&PVcfXK{HocxNOsrLMm#2~^ zkK6FAlZ#Db*rfK3Tu7y&xE} zI8ym&>OPEC7P>k+0USpUjzy4j;z4n29ZAD;d4MBv~F9%C16ww7PD(<<}CkVNPtN zcS6qhSv`BkRzx>6G0Y=8qz$-Sii@C(fQd7ImCX}@mR|yN42Nv`tFdsU6V^Nq;4FvS_?%;Ca zUz~?EgkFvB=;npTcynn+2By&V`W;WXmlS0G&%ac?D~s7Z0h* zeWjX$7f7BWbN~d1Hca(_G=(y)O)qAT#;JDpY{qGbcMt4(J5QE|3D-CjDTTrG#`;#~ zj$PVRYIH*qUF5I%N`KZ!qz!9GNME_UeS!mvhZ5?O8sw%F*&3v6+E$Wxhh~~w$%$1b zsehR)IbO#6*vwgha(bF^*T$l;ddNaWx9RX8nFcX7W#8*GZo5dzE@r+U6yJS&SJT$>?hWt_VUHYqeW6~!vBcKj#cDbp!MMS$<6)ax zsCopv;C-745IY#@WAo~+qWQXo8eR_UTtV>3!X|qvykxTyTHLsEv`PnY=CGp6c9&@v zA2s%R8p-=?SB%h|ujl0w!BlBxG?JtE@66eLZ2Rzx0DJB^d546Xau{e71{-XxPZ0HX zO?#gb6gg2(DY+ks4!EbAc-7aigQYg>FZnar>6O{^u9sMCXqXaDUOMG}Ol&}OO{gZV z#@%0r?_~_%l%9eR9_3P*)tdhd_tvap=}4>9VwLz{LsM+ZPdMX`nc+MGm5dnH_;{@Y|i zi)Q@9pltn6a{oMIu)J}rs`IEU3GP?BX*7g(yE7>qL0`!rN%sPgVxD5;v0H(CfRCPj z>sRY$Q1^P}!-R?{Rii8`MY*U>8SyG>TYbayd$A$mn`#+rf$dL^@&(BIWNDSWk3fj} z_%DdMw^u)&!}YAKGYd}h&gd5>F&QP(MP*G<6~mk$gt#zPXC+Mu`ZJavAf#CZLR%z* zD3}%a34IiJEnrAxCo{!s`^1%PUM#*jh{=b(m?>2)nT8f0xYh9ple8)-jQ7q?#er z<8S)a@*B60zC4;rw&B_Bg-o;sn`}CQ)YjEiD$y70byJyBtnBJ)TsAdZ)>Q2%O%$2W zWUE%Ea3Xt%q*jJ$7{E@yW9dq7Q8>+ms84VH9Rz@r{hFX|IpW9V9 zFgExrA^9avEKI|$mw^w~Wh+-^M{R{m2y60`4Dy z5;*5|)H2bPHQAe+P`BP6FmV$v@Eg1{ID=*N?(`yOO1AejkH6gW)#qbxlijGOUXVn~Qwqr_w z>X{e9@W9K=4kVv$v>vf4UyM}9NM?AVtQd1z(o8!v>%B&~Ube-CySKPY-b3p?YcZAq zjV<+hb+N{qHTHBKQ#5I|E&-5S*UZBKnuCi@>X&rozUjp zGlP@F%hfMNx93qax0!2I*ANP>hV@XgrD^yzh#EWp&PYZm0RiT7!6ODmpFaS)$P0*Cu@n{l74fuUeYwN z&M}RtE^GzI;(ceI@0}hSRBPl=e~!Y$V4z8HgK& zB!qk~KB>KQuJQ6l$I0pvD?-EC3TTDXIx(Z#ZY;u=vzinGv6F5}U}7cUpX2D4?tn_` zZI=FWpKEk#HlU1jY{NpBC%sj-p(BKD24Z0Vd-U_?5Vl6?3o5sL^FER>U|&9T)W67n zdKwu|pXiW4wdDyf{Atd(GPR13@X;l74A66>O|A^xZrV}}c%L53dOt&5_9}2JXhVy*Z8Ffx{>#s zcJuD3j3z4gN8i(&}W5nq@$P;;j4P-`_%orPVX!*=ky46?Wt@ij%jN>?}<~ zw@0UQ1^Ijsb1@90&Y1PJD8UL^Dz!C3%62WJS!hPJ=kTrr5Y@iTY`fB$o}|#8rs!zI z-d3)V1PaEkS6?0E2=*vhTW=GEc%fkNK=nx*ITeYM+7^s(;kG0j$lIlAdNKx1Pc%1i z=etcdEPXadyx7U4O-Yc8{w-5We3&OCG-So9a-Dl_umCuGq^j95*PdJZxO_G`7p85b z;P9}?jA|~mBcDruXT3KVAzQf|WmjK~#aj@%dzbMc0)^(FwwgsZuT_0BKm^i@*{>Nt z4IDXOa=xp{e4m%qSAFd>k;M?t&_b(|xu!W@MIRL9=Qr$udPm=kd)Ys^#Z~HOZL#P+ zCekyk2NrYbGbTQhWE|opnlGcYp$vzIZ3CYvIw94!9mXL*3jm z9^5WqwBE{7nEL9h2T>qYe{>0M!Zbe^J*AiNB9*^D;jvUrHdLOA+N5(94&U&TJJNSN~`h&cg$oeZ?833Ac6N49~es zB~BKD@Ih58bJm`}@qu4QFt`(3AcVXUbjaQ6Hih`k)e`u0bqICqCaT&quM79C!|5cw z&*hDj8AmK1I?932T=+FaP9 z>36QasBqJ+Nm$>XL9XqGmYm~dQ!ZA++8OTE^A9$#BrGVpCU;N8VBC?mi4BSwD{@p( z{ZzM=!i!XqbG&taO)@q za!84F-r3UZf)V_bHDyu?I3E)+V}Me2SHGzS$T&);PLMBNk-W4~da61g7yBTRlKKjo zdSmxURAZ3ml~%~KecCmdG;a>UH9*SJBlrZa$Qkj44$6RxLfpSR;w6_badI%wuGYYd zWa?Nj!|(tFzWYi7uHW7|8T*{V(nJBA=gmZ$@4L0=XU>{wgYps>npy5kI>L{dSVuPs zMml$^R`6KbiA71Onr3Abt8IR9NvmIUsSP(H_zdNqF1H#Dv^8ffcoAU)X4%lE;*W^X zAm}A|$J+t}Dzik&l+hMVuV#d$0~pphv~Gd6kaf&3GUZpw(?S zwTw6$<}17eFNmCM??`ne!!2Y+Zxos@048EKtW z(A|Mj!n`{jY0`RdKys}E&s$sX{z*j7{`R$sB*k$gk#@GKq&UnA=3?W^r<2nC(9^v} zjWjg$seU#bTfRuu0*Jq2;sh(TW^DVQPixLVcrLH>2yVPQ1DOCECd|K<^L>lrbd;Y~q!=_V&?T-_Kt@1&DU`c>{?-{RJrYvpxYS{Fq?RVQ%hpoiBK6 zX>P$hZNxWlP!l(p(I5OH9wvy(b&03ys4gn29 zrgQ{3yn5VZaMi*loEgG*Sd=3bRy6pc>rH!>RPM=hEoS%+uV>hp+-ayUu76w1-)8Ob zJX*e{vVSyqNAQf9wU6M))7Xbue;sZ(5yd>#G7aQ95^K@x+&)kxRO_Nzu~v2a3fV$H zz3~`CoP&wD>gNw6PScpfNJRW#^@~!ITB^oSL}!wzpuOv4DL?IURGcRzk}-1Yaoc-s znPnG{QRsa`}Tb|&MKi& z@~485QUDuSQn2db9J3@0y{Tb{H4)?-uY3M%7)h+0!kJUHOSoS-Hp;h&y0AJ6$25gl zGYip#`H~#S*{T_dv`P5OW7ojqNe!nmR^-hosTj|1+e6CqU4ufzE{dYnv+?R1PdH9* zO7vC=>u%?U6Z>{_xD^8fPRt)ud4&#l1}bp^4F%h%(eo*7n@cUTGd1CN(sv5R!ddl{ z*E-C$?v+lDFAjiFU-ju|kI8YvtKNoj*O3J0IV8@l4#9Cw4!V#A@vEG`QO?*V)ctYE z&RehssFkkSDv=FYh(gc2@sXU+tO0GHM@RIU-Xft&yTAuU9H)s6YW7OMa|lFfhlVWn zgM=XYp?9h1kY+=&@|4lgk?fZZ5-dXF_!8Q`4O*39G7Coj;W#os@!?9VJw^xz!;tkZ2fe`p~9 zFzQ%IF8&2mq_U2X8UfR`kkq`Ef47b+RojN9AnU|A${^vQbfQaGLi@*H7XMh6oY@w* z#I5()f-)p?kP){fBYT_tHx!2-tDk4a_8nAgq@#{@USnQ`$Kz&HE1y?Z0Ee=TzSwHV zhu@Hgr^wJx<+(GAo=_TkN_bVl+mA<|d(Ovb+goSVWv6SHK97`?8!3OjX_%JW&|0ps z8KQ!Hvu(0%#osHvA#IXx=@Qr0ey4Zh=yMgzr@fRXt2J$vj<*YeMUCQ&y>>n|K7LwJ zf}07SN)0zqaG7jK`tli^vJ5Ce$UQyXSHmpxmJ}$nX=v}8F~sA_<*v4t#Nk>8)y#kg zJ8}ZaY^stg44_`1_80s5e*119Y(}G=HYIop1?10YS4Ln8D&g8omRaeD?K}n1%#~x! z$efjP!RtUFPt`OC?`9Tnd-DN1v=27m$8*8mrj3N1sIiZl3d+F^TA=SM$%2V4 z%Q@orbR6NjQ3_l|0t_=Yx8M3bqT_TP;+r?8dUA41^BepSkY>OeX%f>Z7_Y@EIlXJ&y=t8rk2M+&H#x|07>*-GzoAM723#>r|oS+Mept{5HJ>G#`^woPef z9fJuoHM>9hkE5W%OYXQYn`0g*z9WJdau zZ5Wrz5C>K;PXiy)Ye=*DHe79)w%TmH=@0$*tq4}i$S8d#57#%>>iHyJZsq+Uq1D$k zBM>m>k#qBTNn!d*qs(#-T;R}$%Td2&g%bV4iY;xHv z4f^f+J}jE!TU8VaHE;|s<81F!Tnqkur#Ni6HcRmQ^TFxop1H1TVh~Z1rVl=yBX`zACel2T?C7xQ|3ntxk?(w;2XLYm}wNyg3X0JB6| zWr_9A6FI+en}+RP*9+~!F#~-E(mR#rV~<)lG~!$+>_b8Hw8f?m%!=Tio^S!MlBZ=; zDqNs=N(AHul$V3U*bCi}bL-1P!n`K4vP{svVvtJCA$hksHXmOO*j<0v&E9O z{l;B+BPDpC)&U)oY^BoROmK=|^~~_^?*oS~2g{-a1-pSdbq5a08+?~aCnRu)#UwX_ zb`R+^j|0cC&cbdm5yiV=vLMqZ0AHy}oQcS-lCmveIb98EtGq99Apmzv&GOP5&Vo6q zxDT(euGQtu3ahXW{e(7(%J${LB!G;9sw9|TZudc}m7vg|Y*{7E&7{|Ffb%1h@OMp{ ze7MG~=)DpH6h6BwQ*jB5YV(U;!q_Sy5H4`y zIygU`k1|jpP8s)eR!juJMd#4d;!|u z*iu(b3q&FMoB!(6 z*C{2Z^^~!#(ewb*R++?0Vau(uIZZ9ha%o`+%hp26U9PPI!Cq@qG@Wli2%giNFuTC& zKVpjKg-p1Jb(-x*YgSQnE%oX?F1KQosU(dqtrmn!OE5cf=tb40t%ksNSE694q%ZIw z<~5d$rO$5g9MW=IsI`g>b)#@sK#(KoXCd(Ss$_iX$g)w%N;yk)n>zMTSU*dWD|g$5 z-h2(mSz%&Sp9!iw_TJwuzcZ2VDkrT%0%Pa#FaCU`t>d8%@yxE&lApaB!9p6%)sWXG za4m&~R@I~zgegRDCtnU@a7|Lkk(s4pMy~c9ep?Uy9YaCtnUP7i_LXk3W=6W^-J}U_ zd&G&C^K49}aWIF7O&88N8$j}k(@$`21H=>bZK$po3r2KNA6{w$E%`Le_u}b6v$ACx zH4gR_w?br7RL~J^vl$??ed6PjMV4q)IUtg+=Ab&YSEs^ynPc)2s-vCHu*sAWstsK$ zoS3#>!*yZ=8nUm)sACP1w$Ab=p}|zAi-r1FvvD&!4?q8cTs-P=eHAI4QUyiC389#Ev8Box5~P?Krq^sJwV{A>P@ z^b_Dp#+CbO`Ujq~fapA1B^d2^c`8-C zD4f%rJ}5NMT?RtBb+KPpgVjm-@Y2j|8sPQpA(h;nRN)cg*D=4Df*Zw#PJR^j5TYyxitw|2wMsf8=&WnO-qG;fg-m3`y9)y;Uqe zeG%YCk&z%b!qGS4$pBgoC{;H8DlYNh3hl%G6qXFFPKq6 z#X8Nd@t+czloN-}0gfJJlS->FeWT4y7qMCKw@E5g04m~;q>xD^a1<=k7aKy>l=9*X z==h`LnR{XR>GY%hN^31q1*9iLss z&J-+Z8xsXobj@dNqZTrDGO1D$Qo9Pmge5RA5Z1}#UXu@iu0_^6sX=OlXs0g+2MZdB zx!C*gbW0{7d;#=?lZlO-tPO3LM3BF1KKwpSn9q0yqGJQlrVDI`AXa(bkdc2?cl+!& z?J-A!(Lw-Z$)K>sQ(>GbikqID`!ryGlI{Dh4?qmOz zO_v%x($1R39XEr*bD~x%!VM&}+H_b^$@9;sBj^g3W~@fQa-yt#NvC$wk0r8t!Aq%u z*X?Bd-CS07?k*LDx^u~{;MwFVYD+Q|^kz9-h~e^hn9YhEU1Z13-7L$Y-rYZ5FwDg8QyV zUa(9x5mI0qkHC_au`L^}Q*`|ybp88Z>2SbB1g0ZJ7Q8Moxe^mW;&dW#^=+#MwVU}G ziV|IhT|3rX#kwCi!ZO|K1k|#&UVec17Kr-JLEnA~9o*Ll4q*AmPhKETBI=$$p7@0& zk}}r(GH^cN;-1~MJTK>P^p$`Ji?X_9q^Bt}Ua#V*VxhD#yz&OKM~#h*e5{9joY~8WQ{RkL7_sz*>yw z0TiT54l*_U#H}Tjhn92?r2!trv-T-v)iJvgtRSfLnB7&NfF>Z%Vq_Q4?y(KA!I#KP zsQ(qB{X-)u)l=RgvXWzD=#BnUzR<;2-C4lo4xlGF{;(474|C^!B0Dp2@B9@D-6}u9 z_)~>ZsY4r=!`v#sFK;BAs>YGm(2QZ)cjL0@sLn9PW*5se*f@4w;~`X9 zDePqMmw9epB{ug9#n4SPGU@CtWY^oMIR>cjyO$I##i^0GG{w;4WrR=&*zv@R16|tt zx%|-J%c3wwmqiZdImH|3{zzWm^EKuf2o1ll{fIG;fTnf~p0~I{TRk}2vPp(4CUsOU{tC}4P=SwxB~)Egdrj5prJQ!r9Bz06$r0)!EI03>c|nn z&kfRsJ1zxV|I@D;t4Lu^l{kyssJ^t<<-$(=b-i&^#`U}(Nq+eDFVY*o`E7H4{ZREa zh#A?=v0dV;$qTRY!ENxZdXnKl9xQ~Ni%oXT!u(9w4g&5QFvnFUDLL;;yP5zWVR8+< zJBtb@5Bi!f0{M=N=rW9=X@88fEhV3;m71@5S%#Kzc)vaKW`Mg{(;B>FE&gsHpEcZ| zNMGr(F7j`3`z4>-(t23}r0n98X{zt+$@M#>Q{q?8Ryk)hDhp=_;VWE zmRiyN7bB_(y2bTQkoYP9+8klY8!VhIGrLpvC?T68cpa3+ua?r)`*j|Ei*5egcC~RY zW=-HfeK`-y-@Tw2(<=4+^Y?GBxg9%Sm7~w*m|V?rFG)XU(rRCF4 z$T9Xd^u=m6nO!$|eg1yyOUQ|QA`T%5U(z*>Q z2byRod$*$DhnX7&a!m_uO>M}^kocap1cL^f^~x}DUd#{>cCZ!%-i&gTaaM_t=?JIh4rX=@xPQSxqsav*4sN+LIV;BZgkzon>C#R35C^6k zPRqm_!p=Yr2urZ4W_P{Yik85<0l7ia7iuJ!lt1ax_Ns{Rh{AX=vG5(4^wrFZhJ|#& zrzUSJ3&-v46Kjxmu&2dI7U+}`S=X*PxBn6Tv1@;Qme=BR>Ar8%-?QY0uSEHP{?Xasq+glkpRRr* zYp;2_&%P*V-*D@nJkw7WaE1aT@1_+yyq{G6!-xK131gL006DYJQ0fD(q5hu_``rR_ zb%6FH3Az4$fS~_kGw%{F&876|%woP<@aH%6<2ir-t#j)}@L}#@pA>ZKW+ijC)zrc9 zamUZy1v*-*D1W`+iKCb7DqCx0CriNO`S2l9jO^Hf@e@E_kW~2Q7)xGx1jIk1MHUCN4e?_kYckcrMGkQg>HV=$d`d5!a z12n0czx?p~{yqJK(FdRH|0~x2>`DHAcK<6D?PJ>iTXqw&N+b3wx*zJ=*H3<8I+~>< z*f6~^Psj4$mi5IA&x-Z5-?bT0%MGY|wnPNFe)nm`&ccsxHvGB7->~1aC_?t-r+x^r z{Hoc3{^(jwE|f@Lb%g?pSZ=uT19U_CF^o6>)jKvK@$=Qb*Iqvn-kz6>=|ydIRKUQnJp@8a9wI?_ZRhLu=u&9{_)~D8NhgWt{UEY_~ZHTiYnWD!)-b@ z36*b=Qnyv#VYkrvp8nt1+1G=6(@i(rdbWP>gp9{eIs5F5MD5&3IK7XxKijDBd0=C# zhmiZw>YjIAr|pdd%%HYAWcup~A;)dBfMqH;8zX*<9{l1!bEBM`+NyDJZlBZ+7^Z{l zaMA}1T{V$ZBl-iKV&y34b@yJ=xjuK+G3VfA`~TvH-8;--VqW{=UR{)$3FYS+L2 z{a*}tiERc>(N2s|YtN5+f2Vo$q2t9UPq5fD27un!=9=~#d|mWG?K38w0JcRQciPt+ z;B?=32AI5CWLR0e_FhD38u{bhe$K{+_W)8|ZrUZIYAJ<9+6kePh()Nf>Ks@TTd#0R z=1xuO{d*Vgy<5{EU%GJU1O-8EkwQE0?lCn_)ju@!q>jI#I`64`x3;4^iHvuvu1`ox z8cP}*85^nJ5$E$+6i0}!civiwOBh}1E~Qky_T)E;<5&Ot&%IL=)3GwoQ5segO2}hJ z&Pn}t=$HSz_BnE8J}VD*;lNKZ$A`E%YgG)LgJwOBHRYrj`lB>Tl-cj>Thw^#_@d)H z223SS?%9%?vY$#E)EPtTt%&V%E>|?nHtw$UwU8z3VDtNTySMMpMZX=^oX3y$I_=-6 z<4NG|@;bx6U3{q8*4BP<|MxGROXX%Qp&%c#^>0t5mv0?j6U2$*8yzawz1?g^Yd%Zw z+veVXv+eyLFt?O(=KqrHuTN5uPr6buJ0u>pg|!%dF9jdQ0j7Pwxksj;Bsxb`CasW* z*W@sb-i-+!^3(W_3_qi?)tx7YILjYjnmhahe}E+~o_njS8E6}g(g?hmCb9q94UV2M zluyNqftQTeO6{$&&-FE2HI!H6e)cSv5B;9~xijZA^Y;?4x9Lco*F(CR0>T*u1s}z- z>gr$L-cP%8*)o3Zh|~{@9OdrgSgQTIIDYk>Jg~T?dQ1gqp9s;I_+cKZts=a3Elkn$ z7~WE^gKLLN$ylg8*o@>%l5u_Zqk!&h?{f&ia;sT`9;*G9w)38Iy9BVCLkxW80QuJ` zq7uG|HfgHw&i*UUNxi0^IDIeYH%{dP7b;S$KHS@mo~Q-x=sqF$vpS(^ws*^%0j#Oy zenMV-BBgxdhEztT%CV&VQvZ6F;`z5_*An`e_7iY}=^jZ}GZvfy?FJJe2btH-^X|7O z#~>ub{tIujzw-UIo%b5DG5ZB8J6=ln8)0*SfacGi5H@~~uuCs{826Kh^Xe<>OBWNs zGGt;ZMQytL{_oFaQ~9lR?a70SV>_00|E>Ue0PV@cpHZ^=az)yG{cCAIjb{ ztjTBF162`F6cHt$BA_Tr5m1oO1*IsU^xk_5y+a^~NHHKFy;p(ILhl`oNbdv)y+ud> zDM{$u=-&Ie|Baq=?)}D_gqe5NthIit&bW@aTNhXFTsTGHv*+{K6k~rABQZDB=Px>d z?;(*GiNY){r~lQj^TPYz>+KX6Vn6*(IM&Czzei>46ZVn*3yIl&k=S~W_VhpUaYYD* zKz*N;=ML{)%mSa@b5$in#d6qVuq>dDrvAmB<65iyl59`?AAr*X1E^p1We)O#I|Tvb z7eAh6!?)sk6w%K@D2UT8UCbRiJuQ*d5=d{PE1N4Oo>pAxqolEYne%DwL}`WO{9rmg z?+6{M)o-3T{$L#bmvJd6h%8K|K%sYfHR+M(fFYfj^&=(?C#H&ze=rn|P4Vy%dv~gG znnj}iA1d?Axg|Qox3YV0tf)_IxnCk(ys+z(8Q+YIQ1Bn4N-!&SKVolkwy4bmyHd%# z9A`VdAmSSSXSw@xN3Mz+T9VEaDb#%UR&3i|0;Piw`(suj3v>KZ5HY;Z#ZJrBDHZqo z5k~DN@S6=jCO@k4`*XPA;9s(~DA`Z>hb;YNIBTi&pT8-|frdi0e0TlKSJ)pIbpB z8)fAn_5NRIaC#vnx`wTNlhM=JsxusqFI@iA!f|{>bO!5LQqa!#G^gI-)qH~%{{wj; zL`n?T9Ki4HwO2JauPZq523l@lY5!R13*vTN^ox-C+lK$wXMxIwty*^Z+f!DJK0gD? z9%AXwqDS$m&g6T(F38-bfv!?$5++P0*(rE?^6u`P$BXBPXUC>`?thJPow@&qK5B`$ zbU(0x+~vQf_=9S{zfIRjcXxmy`2|R?pSPBr{rR<6Hwv_|)g>^wj-DKNAm6 zYH?S2`pT>`*!|4uclNtWYk6b-=r0)kCudyTd8WH2;U{kU%j}}}2qzjrwyBI>mAbP} z8@tc^$(MohL_H{B(U*D}5&j&SXm?#Yn2z}@X;5v;Q zpqN8nMK@PKf%>&$)lG&;@h%Nkeae_mdR!61UG5S$;@D55VWHUm3&Rt8o4Q_Ye`QF4_A z&2pNPX;=!b181mg2_;)ZdCAx@TeDci_UCVQrrl=0C;aqWr5mK`s>H6>92#T^|JaIk z6}1&n+f)mnqzH|Y9e*HSKb)uMha0K<*;h{~Z}e*tUa!Fa{cZmE7C`e#wZt}&OK&!7 zVkaE>b2i!`M1hooIDCmk%)A!y3j@ zU<()ORE^4wzrM+~xn6C%diU|8^KL2o>#J`%VpDR{gz$Anj^ek^{~@0LB>x`^zOYC` z+zFAq}gX$ z(uQLcV=d9k-(s)(%m`0AAAb~5B!gW>mG}7lp^E;T_8%`*qWjAZac6n09@lo=SuDS- zaoeRb-u6o0(3OIJbM>9nmKSQ^JF2(UrSJrb=G291kB7mB5WT zC)59B!2clNKYw!dE)fkcMBnAI4jVhcP@A=8Dpy^!>*M*)9w}CwAyI)oF!{d~=YJ{0 zz@lIJVPTkyEh8!0My1Ve?EJskN}6amt4L4WM0@=I(?6__h%eeE9bk15C;6ynxeI^!h?APUF%5CP)lOhn%n^0IqJw@~95{$L2RQS9gq8tjQL9)xr z#59q0`arsWmP2z|TBkFY_(}G3LoOBx2vs=6Zcznwc7B?Y`8Za4Tw<(-fI{U1rWBT0 z?Nlbm;*fGm9dK6=*uN>hP{UxeiOA~rp!GCvdSf`Xt~^~r@Drg|2esvCV%ryUxnV63 z+M$*FrU2vW(zE_m;j?`;Dad9*r3G?KpW@(VweQPO#R^t)4y?Y#ragiKs{LxtL?%y# z4(HkpAHN46XT|DQmbJl4<7nKP>m-fZRag@~cvj%mD!?Z(6bD+*v^S*!rl@-GHxX;l zOMq5l50CnU4vYU0zwi`HP?PCp_N5(bF^TPSdq&mYTcVBJWd-m2&l=`$0MPlskV+#` z_WGD~(vi9m$r%b3dd01mhy*BsnH}||5_t8V-M%dq(?>8R}qdxI(3 zp6}Yo#J>L8Pj**G_i|bWNho-VA5{O7@26ah5BG4YncUIdgc>R}s|1s-SVsQSm-vT+ zV@9WTbNReGl)-qolHD1Q%fY3-D4RA6RX}SC25;$9*m|W)sPZl~z4U%5T!lhsDHlU!WlC$=}p zpuICrYa##O)zH-|5 zYzs{vhCv)~BJO>XXq6i(%6v>V{7>K6&*=hI$aO#~6^&~OS~@uHpnAtRqbC~o zsB`rxO=Hu-wROdD$VuN@zEMd0%`Eo|rk5V=CM0DR$0epVK2V&u8mC&+`aP&7*K94( z`dmFS7Lfdd2@HeJW0StiATEZg;|$oE3QO!g!lSTf3HEH*Ih;RWkFj%PHBoLvz#I%C z=pwA87=4s+;i_XK$MtO3yUV&(fT9{TBPw<}S$s*Q5B*6_0DG)eBg0F9aS{yvt=sME z^sHw7sy0{KXeq$`zLA-V`@!rYurWm=s&2IGNDUdgCnVF`nE_PU)m+SwT2vAqHGe-z zY5qQf*kH)E%;v;Ge`x4`_SBlePou%**jCFzYiI*tW#&} z(q-G21OX~j7={4Q2L+0E#+#7_8e>Nwv4Ec2JgRJ~n(}HEkZa`Q-W0= zCfc>~lRyqozM)KyA=-0!kpy^6Gs*cn72l9nKejr#N=pZdi+{&sA|3ta*dlBkDWeFE zA2u;HUvKzGVp#O7JLr224(sGNAw9STVN~sURfZT|GHP462 zFqLIC#D;%>M1}50uw*5bZG)Rf9&_((wBd}6VLOsWjEeTkX4w%JHJ?DX&Qb`C65(Ew z*^)V;6*~Kxty4OOPj!FlJ0D2^Pkib++buDpO5W0rQJJ*LKc^JvS%0bp^jMe1%}XM#NeM6%pG*31KSh7IjX*=r z`mEsbP&Hs~=MCTo=q&f~T^@S(c=)8@>IzmE-@6>E{ zuVX5~wkMuvnDBQ0<ks&`L-vXTNJBJ5N>vNi&@1*ivGnuO*n_|7h_9pAa^5Lw3$^DY6R@7LJ|trR%DpKlVH-o@dpjq+4B*QT@mPDC>iI^<o*r$UMg%*?xN$c z0p8q}eMk>0sh#g}c|HT;gZp`jiQO$58#gF0_rPDS&k0v-GEn!riCc3S#wLS_lN ze4}^*KY*W{Vr`=8pljNXhyF~7=NRj ziy|!$%kH)Sq!9fA?n!gLbv-|>;SJ-BmgsMzO#8j%n34t(l^2hMQz}hkMx&W|j)AjANxP~Jr**O( zzPVwGbWmcOBZ!Et4PSJpnXv2}UZK?gg0F?}uS`uMFJw9hr^Y1;A4NOPrTIBjP9N%9 za2@CLx-vHosyt{+#*7e-_$>}S3Mm~pD5Y%(_ge@FDnSUPD=S@CWol> zYVVl2dKmXotN?DdFL5S7I>)w$3bq>8T}RupDpsspg+7E_~^&^;Tbc~vzL?s@KfSB1kjv+Z!@vRIa6|J`an}1sY^_JF>Z@>Z~Gqce<52`I& z8C{Mo>%@z}Y*iW}6B<=e{T^U^6@|IeBUDS=LmGRwWmbLVZpF)HiPt7PYd5GfmJ1T! zSP@$+Br;5?k{AK`(VJ&nEvFRC6o$esQS#J2cRel`6Y7_#Z-;0)8l*q69%$&!;I%e` z*@P=jmx(D~M*EiEyCUeWBxUisqZAdJI_&kqbUs{A9sLNeEJF`db9gIj?Bb*r_tVR_ zm}2IPGY}4F+O`e!dq<-BP2qKzoIoFhF+{dWPCtVZ+myoZ`e-t})KzMudXL?VNzT``HD0AhbyKX_PVcS)_Ib5&y3AJ874@h^^3DYq9Rxvco1jm;q8 zsNRB*u)M{mRn2LnRGk|Zb}FQD`Q=#!5|)PdK1l7NocD5YYo=1OoOO*2_ceD&7p`I3 zbc4cvr_+mbM_$vMK6{x6(ev8)U75vYy}p@Qp|qw<_ma>Ve5%}QU4x^U-b(M?`*AlK zWxoOSB|_+Zg30o93#&y`0%r8P$Ko&lX)Zp#;CF1V$S6sF&uFO9HG;PFsYJp%Dtf%F zA=TFj8>YDm$C~uOtq~z}@b~_vIXB;co?RoVMpxGP&vEzI`2sqtRGw{5380U&%W@@f znPqaS@r36Cu#cUZ=2VW&`ss~G=FvW-@$TR)!%w!k==;2b(3`bONbl_81FvUdcj04N zV1ieOqAmReCllBs13VQq2mi(B#%r4L+z_Qpc*ETvF3-Ofh$dT;kYmvUO^A~?=nG_I zzUou8eMY5$UOLN3#(hYGID?KY;oZ+ZP?6}{{fK>!0#~3kbYFET{RXN>B}G7gWS_tx z9S5qBEps)|Qx#qf9M=BQ_!;yY*EY}F%vu*6*iIFjKI&kK z!EObY<|=A0mZM;?Ruf^~RUKRry_pw$N{KIM^rny*1d1l68i>a&i zYg6~?d_5(#nleF^gJk(n$=FA>&cs+VlqWQd(Vb0f2r;P~-LH0+Ec&VO6MI{!-TXsv z5|PdSQa#8`}jpuD+SSyT-`gQ;pwe}P!@gn&IRvP525p! z)Q&ABGbndtbF;Pk>zD0Nzl@emucPGos-_yfVl@wbJusW-FXDUkku@((#=jY}kp&v| zuKTReFowo)G36th=}H&s@2#$4>M0%^R5Q!xe{1BB7qhPG4OMCHRU~(x+TLMdlDU7B zmk5~q60`20z>afcUO8ezLPXw}U90WgW`aJV0zB}Vpxga0xdifxj#?`6$eR(K!JO!I z5S})_%2)(DUs97P)-dti(Pf|6pk2v7Dqi&+-rzaH_+KhJERgf$Anjs~Bas*SCiTgc znL?$>SEh}SL#~m1&X%-msEfJ6H`kb@PeE@*n0pj=)y4sj9T#o&hMT4IU`%(s;keeV z**;Vxi`*I^@EzOIVbV3o#v{;8SAScx!XI5OZp4&v^d+-Xrqu683GP`wQZX;B>9kZr zs@t*g0lazjVG+ z9I|GG{zbMGW&FHur8^bgQRlq&g;6=V_(peCRo4{OH{&UPGK zlGCEFTI$B#L2F%-5dks5E56^MYx|VBeFcU^4o08$9O|3<@o_IJj5aEK^SNzsD$t#` zG2n{Zu5Yv&wO=pQyu7^R*u!Cro;%C_fw`(kdZdQHfC&+ObnIvR&TLnHvLOA`$$uz@YH(qE&4oh(;V^S(~t=-T&!t8_)b+vFOpu_cgpx;}H) zn{H(zq5nKNNFVOBAn(!Y*^zTR|MkAl`!NA&gKR4%zE6KKd^<0F_4(esCCb(iR9w&r z3@9)guF)2xPv?oZEo$Xk`@Lo1QI+H#{v@hcza1rtq>jwdLB-qUdf{p6x3f!}^>*pY z)bhlkUO#H%&$0u97Z{{eg;(^b@cXjW*CCbUMshM6v1LkMfkeU2)a~+tFki8K6=D;i zraHl!UlLsQ^Gpm!MNy?ydCsC-5-2-Bw4y=Wh$?mB_UsT%Z6u{bY1)WWuFmjmfak5E zucFL->+DVW#EJ5t1ak*r)@4l_1)w3XfH+I2quem$xxw=m*a z0JOngLH&$yPGCT@9YTc!NYWyQ0ICM;k32J{U)&@?7+QR~FjRPhOY!()aGAGMvGo%B z9=odL)ciFR&cx0FY0#wl5tN?Vr|_)a8}rV8cx!~iZE7)?(&m9*E{&L)X`fPRW2+=! z{LGU}^sL4xch0Zo-raxe+8p$875Y%|3sg2skfI{U;YuJ-t|$+E~S(MtIBjD$Ap;Sc(L##a7l~RycI6TU!Y1e72sU+uz^=2E~|!0$uCl?OP2` zI{9p!-h(C&x4CNP&3ow^oi;WodZDiC!ohYPA=Oc@PY;z=yz?Szf4NeAb?PJ1m`TAjXAzoIx2&ho!;1C9L6AqC zJE3KU51v@vLspE$>3vxh=4`fe<@6w)&4H?KM32K`);)(@rt~ruUUG z>NO4aN|z7Kp2l!J7DoU}O^n~lUKYv!M?US}AxEwAH`1?#D;XR`#@H-J!N{x2*~ut) zW{z~5GS!(B;mc3b9zl6Y6-IKzZ)tn?T_!IBjUh!jB>D4SNj{BL* zlO^j28J0fdS9fhpxpC@cX`XRZlg5F2=dr%WOCyD#$()+!)CBtr2R?m}xLY2BZME$& zeMdC@>STk7+Ep{CcPmC_>r^yjvkN8w{KoXC-_}khLsXM~m6&Kjw$8qDuaA0H0qGXf z$`k&=U~ct_lK1n{1Y=66sChMn98?RsO+%tE%Nq4?#9u0Py`18|Bg5(1=RU|VITe9Z zidxx%q}K!JzHNCD1itRfR`y#CTkp4Twbv*G>8(UY0lPIV z65f+dYd$S3jRH-tY9euiw(z6%jhf+uKGTsN4aJMqJIB{JZ0DqXu;D8<8z4e5O3&$w z#1!nPua8y1a`vOtf)Lm$y_5IWRU`Xy$%n$ev@64VSh0K|@odi^d#ThY7vD$@+huKy zeHkItK^@kwoVhq|ksVbR>jTMKNR3nyOOXlHmO*>6G=3>72MylIUQY4-5j#aNLE^ec z@~`b{N|Kbuih6{>H%Ii0yH^{L^||BzhX{Pt>%+y57n_#Ky@i{>T8Ct?4}WHVFDs`T zMWB%T%eMYM3>{FgAo~XeYc*7?+4ZIJUKtyeFN?P&%qk;`MxQVhkv!vVe~xO27>7PQ z&`(>}fk+j{O0#iQ1&FK^1-rU2>W8!TshLs*#|V=a7dut!!SVTyl$V$GK7dc?HMshl z-1LJjH#}mdZY$}({zNI=5%a8{f*xjES~h>rPgAd}cJPX}W36$xE+Ah~e+4WFxMC!U zHMV6~QjxlyH$s&I13!20dD1mf;|>@~tAh(K3_o#SIkRq@E~}R&Efjv{+`u`!@PHNt z{VQXCk+zN|=^;e!!q%k?*5hQN6;5co50tAm~b1@*9M`?@Z;CcOkv= z*EcsmdZnVZsdfu=4mErfG~D50+t=-OT7CO&B^&$eOxPX1!xQl0N!HxSw?-$Y`0P)_ z8p}i8Bc+z;kux(a8mLyz#{DzM?d2~@YGs}5OTdz>;00|9dnC_NE{Pg*ta6Bchp;${ zaa7$S>&Zh_%GtfBUPn3X@k5O<7(si3cz-hz($3FQE$3PrM$SY2%~U=3Q@NI=dS$f< zPgIYW#OdNbVq_wp4CG3G=QnS_3k=q1ZRgJWRvS={w5Z}Ut)*~OC$qB0 z+s=cD(S5Cuyj$%yRPO?q8hApaD8N5fgb>D3@TY9XQoD0y)y%77p1bano~oaYLKner zXR2-zt;Nb2ZPtQse2O2pqq@r2kWfX>Wu$R{dThjq_qtXGymaS?9b}epX<7*!hYR3q z%r^E#I}U|P9;N5TRQi*P|?FWCrdFU z!aK#7v0s2`>~3D8YMS2ZAQ*L<{M?0|i(@ml@Kz_g%#wxc2~Y+0E9wn7-(%W*qZae| zP}30iNwv~6=9!H%N(X}v%-)QQz0?wZZF0(BAvyOk2{QU| zQ2Z+Qr`xQLjPs0e3{&yklN+=K>ES%gWJxZ~F69^xEY+`Ir+GzUa!o2|L_7pv-;|=1 z+WT~4O9EMfXI!y40R@@cu0Zy}j=kXlu_?`QAhq7DuQ0%5eBabstnMZ?a{HUVAZ8pY zTfa2niP0D9s?2Y00I%!89Yw)i>C#bT?=A!RHe516QwP|{B&+Fl4BX;)8@@d-6Ei72 z9QY>jV>^g3uC$Y&0!ObsEZeqmFPSC1)h|5m?o8+tfuB3 zpm@}xfq6fA*{AAOdqcn#XX77=Yr1Pi!Mwh%9xdT~fMfPFPxTwJ^g&yDW%z#A4EL|( zV4o$_i6Z=8+iem!MIBK@ZKfnePZa*lN7ipUfh-l(@TwKnhP>~5{Y6q3pz_#kQvwvW z^O=GZ0xz-uT6Lhk+bT)_l~|>-#j#ZIDc><1_RTT$y-%UR*XI+W0^Y@8iMY?f4|wy{ z7p7xMzkG`BxB;8r%Uww#bElVS?<-|~+fs5>K>g(=>iO*tscw!`9BxXIM_j_eIYBNe z^xoHFya9nHQ}W32=c5Gn99Aqr=Gy6ih}6aV4{Qr>vL0U(6v#$5T0AXTJe(%S9A(?r zeWgeepbs6aSbDbM%`Z`)lRxP)gaB>O~xixoRVnDn+P7WTiM{N{kF0kUg z%#PgN_dQpMYn`GIKPLI~^2t|) zL&%lULmJi*eR@Zvc>&G`aRsF%I|gY3l@%%*@M{YX>tpp4ddF-DNxXWwWPGG5eZnM6 zQR9_kW$2r&BS&3Tw#Xr&J}(?!R4Bh4C$eJv6Wj{JE!-!!gDeji_30NmIqCNXGs>5B~aRET^P|_$VH5e%0!&Y`_I$K6+e9K~L=g2r;P1DM0uV z>_w1@9+6ez3-gry+Nz&jMoh2T48e-0FB%NF!}P0}H6ib|@G-#JW$bAF@y^n5_NQt^ zqN${f&b>FkV+Qe}R>~;0ES@Zo)!q)Q($2hbeay>%v$)8EMKRWXVE<`R@rzu21GUK_ zfu`bavZUMa_`(wRdfUg7))e5D#|C9Ud9}j_s#ZOUP1@sZ)WYlpQD$ShBPsn8<--8w zst+?>YoiqhtDeL#Q#?bfJJIoSF<}*pw;}qU{wp&jV?a?wv*MvR>_X(RDp~&JkfF}! zcOG9m4;*;DyszC!Ua{^)EPWvWfQqB3swK_j(roWKAjNg+rAf@Igl@wrSx%J7?UiOV z8}+y&3E{Gv+d1T&%jaVk9v#cXz*R+HH;Ts89F1oR^!JA(IghkTLzsIv&K}XnI%W*g znqk>bcl4Ymm1b}$B?FD6rI2(e-~^JgO~-vyQZP4n%Y`dvrI1JSAK?oF&Qm`WbQf-o zPWTw!D2;W9&ov$B$QAF2(gVMNVsE zkp_wQGIVbPcx zEa z&8SW$IFpY3yFsRNUGSq3@W`}OWBG!|M-qG9aU8O6ds02;EPD|+b?I7`M(=7|&mGCX zp&(nt9v{QVq_fW&=8dbn9jvQ>^`u8p3y#&+Hlo@H9O9dMcdJulA)Y@>V7g|ep+2)p zl*{$^>Wf#7gpq@Yw(*_(f+~LBlbF^BuN2p6mQ>bv{uZ< z7ol2BH>Byd-%RI{0jod|A4EMdcjR53%Ip(feJ~>LhufoF^XqniZe2p=NjBXg1iLh6 zJyWulFYeyd?gR=Z<{uE#lIQUbHW?Ke?A~0ed3t*&VqH)NQ62KdA5#|C=B5Fc*H>~w zr=~XC4)&XTPfCOYu8AuVn#R~;~Z)Unw^bK51YYP z={Su~!?(wM&xfQS`{%U@ds5O`kB7ELUsHMsa%v9O+mz>5V7;t;JuxpK7^3<9qc4|k z!cJ7)pIcwv!lZekC7w7^C?%gq)>=+hSr}rK_-p8K+EI$;h+EeW`#VDrYJkk1y~gGkmy)$a@kmJpxut6 zPmeK3-LsSwSgyE@A7}D)2L;A*^=SUpkYbB~c+@5y6e4|m_%jzC=65r`?u?TLEiuu| zk*Cb4k2mfC3;kSUo-Nbe3mafO!x7y*$QY{4l4(3F>=7q5T<%=$Dd}1F9RloC50#+{ z?MKQQkbKkrxP2Y>q19-@efUi9PM%%=yCbZ$p2dn#b`!*ae&S0+tLf2)ZNoI%dJP8& zPL%(9G+mtKvH6Au~o_lpa6ywzn~De_&zOR}_XIlcn+KQtf~CwQ-o z9Mti;oZ&zVUI5?OB|uGKz0J|~mtrV^Iv>GE!vlfBDvR{a5}}mam(P7sn?(sdP6Dtb zKg)PG1elxY8N```jK~qbgQR{M^xby88_Rt((%f>&TFOYN-iU`;W;%uwdjp))6n zryCJR431OwB3L2Dif}Z=eIJuB$_qd1*T4WW<)Lpp1=1t`ISU}oZPp8%u7_wF zv~AwhGWJHcrTeOVm<0oYx51?&gfJg4{BY=*BZffu4Brvs zt@a?XA&J}qr9EL9R6r*uh>CkUdpFdKJ%Ujs$@haL@=>9PwPCamhR^{`<7iva1~gE! zHoWNyIp?V}DQ!feFN1%{ zN;0Euhr3JOFb#g{Q8o27UY-?o4TlWBzL^!=FhHq<4$i7?sC_ z6Y;L1__ym^6kwQ0GF<&R+&-HU$*}6p627OzFnK^h|0q%(=kCelVQn}&cJ9c6rmBdtqMTU#fGYDB3+cq?82mVKQ=o^MqH}H$ zE(lk7Qj^!@cADd2P!Ek`tHDx2#1e2X)Euz@`ARo@3Jh>N#_pG;m5b7t0OnlsDfBPV z#j+W!fHe~81Vxfc{n&-d>IuXeg6ZDvDu&dT9ZGlKiZfh%3M-zQ+3=1T-!O3;^3S*Q z&ALrH(UO|8=L{>8Cj1# zmSakDskSKMQ0?EI=7ocMivsd-#T(wM*N_z-d~BXZ+!`Hd*CtRd1`XP#)9oADA75(4 z25aHJ6|NKE>pSrp^4uZU-hCZut)|!Q_nHT6IMAyFV%V&P*6bY4@PTeN-_hA%MlN0UJ`?oPuT`LhT3|4K5;GZ zsia3D#k~}WdgWxWS6}^naoo;vu4OAT`}moQ;C?&V;m+sLTJ__9i)+^OVwX58!+l-sRSBm9~o+))qPtB-q ziW{1-q0R%DszThtr>eQP7(PGqTS=*WXpqucSn#b8^!SLL8YM66T6i>V^8A-r1uC~3 zKfA%O+H!d*HLA$1*Yv7GXriEFBH48o z?D868PNLbejN6p&9>+1UU6V?m%&Id`9;@$mXMMM_-T9szHnz_}W6xoO58i~n=+Fol zqG3|P>ch4k8yIwe?!3`h**4j@f~u_`-x!tIsA$35E~-wD3)6V56fHcC6O%G3XGg3> z<>Mb5NG{4zfG>!#kK;fjhBl*6l1%nRwsKo;Rqr=SU+Y&rwIBJYRjyMr!9s|fqOw>= zH-09@kZ3u~;=fpV=3n{7s655`?fH1KAHL(ez40)EO$;#zLxy@;^}8D#)*wE-vDFt_ zf`ZxF7?x53?YJ`E^O%U2!?DfvGm+V?94_LnMWWA|sx=(8gS zia$keN*5)&!j#aQ1R{~W3I2^gUc9iqV-+BKQ-zIz|l6B)rxRBR=Uur+^-ZEa+`cmBi- zu^E7Tiv@gRk}S(x^@-FI4aYR*!^dNG>|HTARso{oKCx{RT`{SH&?A5a>-_GbwZ-El zM?7d-tU9J9>ZhxkrEg3F*LLdh z4qZY-cJCu|hF@`#mVRQ5>6^Ky{JK}~*`LsIhQavRG7G1nP_Xs4T8(>G9Abibpr;kq z_iI13IS9EfJl`&}kKXkiKZneD&R(m0kTTO*$*%5ct@O#)+MUY@(w-Sw)%Z;*k>K;% zSBtYIo3>$WChb$zRmR7r?Hd}7z?<~xPnXNx7jGvILN>zr1k4pjfdW6)b}qn9x7%S`nY8nG>AEs3OH}x76f?u-Oj3Hlg62 ztK4q0CYCX#Q5tJZg4l<45X+Bk)D0v>&zn_wxmUq*MfBw%^3 zhB@Vm9JlIE9HdeY>sh@%wi#X(a4k}hE{WZf^7Dc6%67S`a9OJ{@D+m0cK9!@_|W&lyhJ z-08*6BxoxDe{2jv)c^)#^4 zLLqt7BvSmf3QH*a+1!QaPd{SR>a()@lu|u&b4F!sqhRGJu`=ht%;BxQ7enzWZF^zo z($1BJAAK?It)Qyv?1k)axM#8D#$9%~b$C>Btg}xBKtHooF{TjLl-3C`nIs073P|~z zeIUNR(v$BfZ&0u_7YuJuU*=Q2lN4W)O7HWPldHBZhEh5*W|Fcob!Ngq|%3>Iq->VMV z<7EHTy_3-dlROx)99pNMjcFp@Q+6KU{F>MR_WA&)j3mS9YJW-LL^g7)8ll&3J+rI( z)%+N0>PJdVZg8l=+(m!!im^ODs99qa%x*SoHta|x!l$yqyz{%e@Bc0TOE*53hZvvP zj?fn9b8U9oNGGfuDEY$8`BA^Jp`P`VB#MdENBJ19%?HS%LC$gdcJ@t^k=5d1fe9Av z2RjzFtsA3+VJq_(mweJ{7X#^eko6`=D`@Rw=>|66MiyR`MSqk>uV-B0qhW5oB}`kN zjbvD_Lr*8vzPtU0gxq=jp4dEn^M$wDf13JTIUYaDL_1mpe;AAY?X>w(P)JfrTX8!2 zf~0K-c9L2(u=ulg#tqtU+7%t{8I#AxugGTDja}yjelf{V#n1-q_j&!dL*)0I;`1`< zGb$kuouUONdgwftIHI?;HJq}pG?W-()Gam*svIETWU(XOB)L!>;qr8Gt|x9qg}3rp z@IHzpRBz%EKn zJ@1_o`Jr05QlWB*X94f(+vb|=6M;g*DF@Qxv9s(8Wiw9rIP>E?(P6teliOB%!{pA% z8@b77$_AFmU;%qiy@t=AsSWSOg27p3riQ}!1?BBDXHol}?ukO96raf-Uy7aeCFDly z)~gxa<%$Kr@5xPn*M52SFMB-vYmZlXTaN$JeK~y|JCqy#(nYA3q?0;H={NQ&8u__1 zPo&*eQipvfb9`@cQUY$KRKEB?EPh2NED^zysck>-S(k%dQy+UK21}$ym$$XKrv+vp z^IiAT|8UR0NpEa8Cq#rY;nJF{~LTxq(^#=8^Q@xHl7cJTZLo|t83UNjoNM8 zc_(7~)D{Oo)(qU$I)kUbq5hKE zn%+#wG(-#MHSK*r%Z7D|M~Fw6SF|g+`3E2RBa;U`g1)j$E;MZte1tHPfj4D=rk86% z^QY`Zmt%E5V^|_0LF>!XzFYg>TiNuuCl_P&Pi~^?V#%ruPoGcm%Q|EDb$Cv1ZTiE1 ztu30_p#*qqw+^nX9bEk;Jb&t1@oGt-Yhsvn**pNT4rQQeefshy8SK22=C5L6>zA{0 z!EkI2511l{*H{mp87^H|;!D@Ymfv%6PS$g4&evp1l@rAdrf7qtU7)J!sPy+?RyTdd zSKjbKR9o`)EvZx7YzFuV_9BlYIwJyEc0)Y5A!j;mH~`3!)*e}EV&hoeWpm)5yT z+j8KJD&Q_5Uf13_f$}>kLpBext~3pMW&|A{^=8A5Xu$`_;|9DQs;9lCwEbD?6XyV1 z6BO58ebWZ>awuyRYMCkxD`cUk?2xpa(u{8Src@>4D_hyPQFrfl{`%-l)-}4iI5YdP z2+zWCfOYlfgX|=|g~je_yI;LKob@M{(cB#;n;(!o9enn-_AkwHy8T{V)C0xUe1*mE zaIhgaf7HT{dhM;*R*TXLS7}bEsb7y04-~Ny91^2SZlV9KACTbcf4V4oBk@XM#?&%> z@>b5M!Q-cTCjY|qC+83zde(eTRSp)8 zXKA$k8pX&$see>@tKuwAKqRGn>2vnhohqJ_i?lnO$O5Ua8K-T1MR`9ns^k^rkeTn+ zPQ!k5wO?g*%wxzeH9P^;47{AgHc0Xi8%e?tw`QG;q_u>G%20W5qmLrhB?oxuV(-nfvc{{6PKLxYc)4#r{@mW^l#Z*_t9e3Z)=V-YPfXKl<6ir4Ij!CP|RFxi` zh<{R+%=TB?JqiZ+DrpS^an{t1!8E75ooKam&<9W&-iLp`zq=#PRI`bzU3-VmL@KU&?8d90Q|0UCpfE~66zZfQ8sRz;N%tUr(juu z@$>*Q!pVKz1`z|P>K|8VG^}}t->VEgcxmY{h=GHrpB8OB75`xt=hZiDp64IEI_!F& z$G;qfv~U?USGx(`G5J-C?I>Do0SS8f3t@0s=sKdknOOSKEOwkPRX%a49bWolu~sP~ zE#`;UKsEz0VT5bgbJ&rU{jch|-`=?&&n4oqINj+_gMLefcnoRP@UgU-RT_TnhaQwt z;s7id&19!_FuW`W#qGB-K7uG48Gy{wB&9`f6dM0dB#TpCe63-C%_FBjf9-V;s|jIS zfrF6}-tQDO|40fS#oUX79iEqhyJV;zvQ`Y@jr;{{&U`QMa|K!M(Nmxl*nG1r#8&1e z=|)CH5SOK17#)D0E3s-Z0X4kkjr`>XJ0=8VHm?V8k`LWBgWP&_;g?+fc>Mj;c`4C4 z>6n!+Jp!=3KCqJT-EzYGv*pwQ8g53&ga5L`&_@=@$fw^}G^RUQ9tO;$>Ni>a$_gKd zQd1H5!yL1&)IJ4Q0VtC#iIv8-Hui+TNOLZ$`aMg2e0+N{jwQwB8!ZU5NveZM*?}QSCgkBRuhuniY^9ne3=9}-=eeNHgkdw3b+N-a%_g-s* zfagPA^m0qwEd3pCYo7H?pNhUP9+L` zD$#w|zM6l#)P0uadpWiy#>a041oO^ySQ$nxhT|l!azpF%fZ%6AbA&Wy3Y#bQCZ)Rfa^PrE5$@! zY|_y#cyCxq%j6~Zd{L2kyJc>ew)0`Z(PMFxcz}U#4%%PK{TBViVaO%RfI|UuL2n~{ zpC)Z2JILhsB)?A#I0|uXJ%B2KRf1ulRtc zVOU31MyUel?MywTo!rKM1Zk6bx8i;7PSnp>tv&j zl)>7BE)QvZm*KyT$lZS6+xIDYEKJ7P4siW6lw@=rmLALGZ9*|4i=C6eUO;!FE=Dc( z*>7(pHkda)0DFceYQat4ww-cg!JnGpR--HL9@|M&&p(gXX>7yXZ9yCF28-Lw_Ma{9 z0Plz8YvIbJ%dyLEWC>(&t9H>A9wMfe+~Kl@pWOKd2Pgc!ab@e=5W6ggbyU@$m1|T2 zaAuWw@N)4lv=BM*gFdsGnPX+?P$aG@7vZ%5XJyP>Lrb>aOvL;1>d%_X*N&zz+|tcy zp-~_D5@Oi%HQS1#5Gid`{OQaWr1YTqXIuN+Kn9$s@eVVazXqwQiad$pUdEAnIVn%^ z_6twj%oe}=@Hu*EwtQ+d&ySOzV5xnMb$^5{j?Y+Rn%$eL&>NGN@hEaf*E(;}WQU`_ zQ!85ai%ZT@_niEC@@#uWVw|dw#+Qt8ogFGf7YmubUDi$8)9S`78Bx`f@a4S%?X{O! zosCwPM{k)UvLdvr_tK-P6K$Q4{P961v;8y&kHX5aL9_7V);&)_1Rxhr`v!cYFx0{45w^AexZr3mYr*|n%rS_9G zC)x0-JbWg(kQJE=6m1FDS|#|5*kP)S&M^!|PpjCRl(2f`M-{)9Yb%c#l4Y3X)FkMZ zUzlk%Dqiv-mS^zJ){p~5F8IYEnDlBTpR3x}h0B0?R9Y4x;%k+E+$h725Qs4*`$g(o`@r*G%u1U) zLWgHS?bG`#YAUjdZ(n!yp3Ku+ePNh1ar3he@iU-yYy-(W^SZ)})YVd;x`}ILx~mRo zywRJ9nOi9R;@drZ|3zX9v9lF71B&>wlK5-K0;ZwsWFEAJ%FnU=)wkB4S9d9MbexHg zh?{Fp?K65La3ovt@w_9e)0l$|g%xx0`zjN|f?-Af#2c`glbI)ZC@%L1ZFG#^Zxum2 z=#~^gQVgYJ0m)M`w0GC>)rlHp0yU8C_L)A}J9^O%_M~0S-kO--;F&H9Q=)L!r6uxu zhs2t;*Rsz-76~@;mhrD10BXyqhqgaiO5G zlUfE<^Xw!q4d;ZAH%PX=KH>%b?6&@;Ex+BBkh!jyOI2{JQs^(i_;tlKpwvh01*SSo zrM=1#I{)(DzM>~QCPCTjo%(2Z)s4oE<1_h&-U8|=g*rFz=^ZBCk>&sR_l4H=Ze#G9cXA6$75_~sc(1^{^PCq3PfmXL_2d5~6%RgTzIjr!aI$paW3uT&s>f1MpkOQ| zUKV6fc=2opMe*;!=TYoS-l!xrk95HCh5v@P2PBDRGYn1~L80$M_IH_1&oAZ;DpFfW z!UZ2Lnquu=vxXmoD8==J&#-5y)i$9&;uWj=X@2S!%F&YqczmY?b5|YmjE~nf?1PG? zN^IZbIX)B`k6>_kx-lB&!r>gWXkxy#?s)W+g)jf*FDDlHd8HhIq)iK3ZB_ol8_uIO zM33hEj;!rp{VVYb*q)QAn-b%D+5!) z+g~4X^N;-Q`@afrYxlB3DxOu4Y&77U^pzH-`Xp2M@fcm?ot*3Sj$!ivwQxz1Agz(9n|mp#Gvn`jcrBS;1jB zs@Sugibpi&bp-){KeG}Y?#chnw0{xP;17Z|VT1PTksxp4$D2UKr-&zKGBSmq?fCzm z_E%-N@0WOhlg}WtOxgEE{)bfFuQBQ`ieULonKa?E)o-xP0=cczdRU%em0a%@E$@+B zfsw5xIC}Xx35Y@Idbhys&vJt1Tb7~P%2a>B^(SXAT?h2S`IH{|jO#b8sp}%KXnEe?rN%C_KN1ZYq>l|8evqll}Cs zX#YXFoXg@@rK5j|o1fnO6Ok9>0d~F6iD?0jZT$J&zkTDJ-#S{B8he7{+F}DQ| zY_MP_v*h#t48}PXIiS9jYMP1UQEB)K(tmtVy8;w~Y*`V^r#w~;`XAr@3vdN40I4cL zaa8{oIITM~OS8{$Rm#^6H7*n;fx zl2!Rg;@0{7_`JEfxxD=8bqSX}`9HhO{i1)w0_O>QCV?Qh-!Gp1s(X&;lb_p8j4@un z2NPJ%(fogz;qOOrj(c({GErht>}a2k!%b3t)t}_aZ@x=cU#0XUZuGzXPml}9i9T(C zq&d?bYAEvSsg8jo>T>>p!feNmh1!3DJn`1`d&U`x0-$39|BTze89GJaG<~Myv=
  • ;lrGM=y2xykGsXq$naP8D(Udsv z)sSvj+UCOGks`Vk+k1;1MaQY_7!T0arXp?X{y)+7*5zx|L!O$531jx4h;SH@BZaB@ z?1~a&l}SRPcso0bMW4j6A-cTqP)qS)WdBV1N&Ma2MVD>mT(zAp*B#URs`Z!2eI0z( z>kS-Nj!9dsC&{anyUEMJmjBoh(c$`S!FMR#;)s&Ak6j}#16SpxEJLW^;|9-;$x%wq zfoj1na`F9>YUyITz45INl&&tpQQ%T44o2D7;}ZC%Q4|<%M}HgTl@8spQT{b73M{u> zzKx=xbF4$*zlY^^UaijCR6}fpUD@I>E8tiR@JL>D)vbT*0og#>l#>}@QA`+uFv|9L zn{-$8&fcL$Tn8L7MhjR??WH$v$GP$FH@@cQE?lzOD%HR~YUR9SEl@wlXiZQ4KfUzh z*Fto_c&0YReuvvy9YrZ%Go5{hwGaAbEtYE?8>Q-C6qlvq*nb@5ci&-+gHfIrksn*z zzlX)pH#YwO;f9*IM-{&BI0&%q0vmAIW8)wBY3&C*j&B{nCJW*Bj!NGQ8DKJm9CG9$ zjSn!#e#am`7PJh0YhErdZ%EEg8Xo zOclCI02Z!u)Pm?x)4roa98fF~U$)xczh9X@yf5G=NxK99E9*Y37ti>sH-83jM#CWW z0KYwV(t>~f{y^+~lMXnKqZBUoPn;(@jW_v?e>;e=$2#Gy1p^9Wyll`!`Us;PmTv&k zbb5uNgC(VlWpW$^JjE1nTGrZfxfPFM{^;UwbNN>KOK0~B7fv$dWGuXUcid6wVgbZk zvyS`dB*y@}En#40&?;cmAHiDIJ;!KdrgBVGoQeYg<9s-DY~9C}*KmH}o2oD}?)$|k zr?=ynOa2Mbz*=+h+=wIh#bvxB2dD-5%H2l5R;VzB;W@6wsCIDj@H~ewt7^;sk!!JNtD>)F9YMhDu zF+TifT3(JR0Ny4eBL6S%Uhf*g^Qsc^dJjH45rOuXkCwLD$`G%!F8l!&(i>Ku+TyuU z>)tc)iy;eZL!3He=-zNOT;b$C|36vE0}QSO0x%|(_1`+I3dR@|q)}ciu(i6`wp?xCjHmhrG6ixqV zLIC`5Tn1KkQt;(rPV(KNzdiD;A*t|~?%?mg0^qQ4#vp)IO>OMiIUNSwKpO9T#W(Zv zNB%MMa(vj7mjI>KR@!--4ymviVc%h`1BY*HWBfMVp+~UZc2l7IwFYz9w*@nLVnclJC|RfQUe3`=ET zvp2IegpAA4!G9M+39+v{z3e*5!k}~2$dhg2JxE>dVdb;bWdZGNeHtfxCRPSaQCzaQ zb(K#}+9E4v&0Uw|J6XDD6>PF{4%74mp6>Nm>WZ=ZoPe9@B_Mh9>3713<`W1`M8u<+ zs9V}jEfK%05t{IahSQRWGgL7u%(a`xbA6lPZ-w;t13v%=A=m#%JnyU}1p7^#=rE`J zMX;EsqWad&9w9<*lP)ya5&`Pl&+8mnNt&1~C~53${-{qjr2Eq^_jLn81LN`0^Q1>u z(L~+A{TrJ)FF)-*BnW@;R?6ThX_$(is})RX>vI9vb`xG;Td^oyWukB!(cV2kuAQ2z z)>WE03pLxdsh>+tg#R_n)Js5dZrRc3oIbpQ7$@JXyo10&v*z@fUwa_FdRH@H(nfHY z(k#0&3|IL;Wo!42D3HgA(ih!u>HyHzM4R~@oqrOiz8_mjGTeRHxyKli46Wt3V}c|B_f=yBw9&2 zG(*W%QWExil^0TO-ubsoEs#EZ8hAn zV+D%0c*p-G*$xZ<96{cBC{YXi!JQ88y0wPh(W!KSer z8+%hj%G{pFP%y$?q1C$|4P!Q0FpeQ8+q zS|o|S_d+wpxJw(HVk0rrhj4jEylwC1X$*oHuBY3T=l7@gWk3q}Im4fMYG)6dl3E=C z*^&c2H?@7~+L4yaZQ8yy!ORsb6PlRy^Dg7h3x?bVI)D_UFIncmr$1`}P+!iJi%ID@ z_3ncTQ&sk8>4HmRQsrtQp(kcrD1LpI62!}@C#?Tu_0!W5iQZnr$8t@jnIS{0mbI!#Y%KLDP4zO)1b3>eJp5r@71~I-%&J!=ez5B@5T|uTds3GKUGNz z2EHoaJIDB$P3tiBL|x#J8xL*Z)G9iSToD*o6!w zYgZ)K#p!h|zMwuMpX`AbGAMYIQ)7M$VKYRF+O3D`<=_iunVpt95K8S_u?vGQNU@*i z6}US11UW0MXIHu==2mK>zJ86ANpo*3`T8E=(G?|icF-4qTlba%6AVJ6B4MH@-D;>B z$-|mt0|<20*JB<{D)@26oWC$K>vb%|Y>`Ycdl6j-v6h}OTOz@-RLY@SluunvdH zOV%gn6GcT9Y-~a+URm9R8iU$z>W@4Gh4?pNnsQJ`haVn9Egs2?cJ1I&&tZK3 z=6mThnu&=}yJCN2RnE#N>>wDukz(_ggn#>x;&F zb^*4xRkk2*KC9P?zWITs#?eZ6GO`7J7 znHuSg{DF!orU-tERVU0ALql@mq(+}D167-)?v=65h2FlJCAf!m*Qeh^;>IHOU9mSU zL$1XrL@s>^cZqKIwAW9%xCv+?c%pIv|UzB;uBdfNqIzcaO=ySe1WBli$$ zv$&lWqoIe7?25t{zBQMN4^o!8R4z}MEJBJB3xTx8&1~v}_{b(E@20z{Eq)>a@F8WQ;4Bvi}8#FeyuW zY0N#DRC_On`&QGDy46IX;YNUX_SS>xmiRO!cfM2AcZ;>FTOY4&neoG+?(}W8z9ZwI z!CCqboz|SZ_49J(4Si|)lYAB{USMis7Z?axyf$`;c9PbC)u}Iye3%O;``h+I{MgQ>@aGKCE$ zvzEFg-AYBewGn2Jaw&5I?1S|7deSTM8&NVLessyjldE64?F`$7vPHNB?|N312yK~Y zGoft1CUf3{^+oHcK_a84-y}XDAt!TKdjxxx6GC+|^1b0+*rZ^$ZSvP=A(iy7OMI9=P)l%gc)o(xuE9`lajag57E5jj#y`wM4BWf*48glJ|IhT886-6h#s_ zaC=1pPu?rA+ir5gXyHLdRV@?Sfyk&JLPZ8KLCDgK)0~b>2I!&V9}aP+Qfkao=0H zIlK0{$l2Pd&~5>II{;4^7dz^R-o!8P*i8~K?i`D*;lHxEU(D^;`?5pFV`d~^sV1X#N}0astb=r*P5C0WL41TncJIHJS6tOpqFxv3LmQnNsG*X zcn>76Y6FGOv8T9|tNd(97q0k@K%^`uX-*)tMBFCzue&ry@-q5HP-|{0@uzK<&L+T( z<|zYur`EEoqlGy4eYakFo?Ec@-P9x#;lLShl-763{V)Xo^Yh9@S-p7Ga6@3F8{65T zvPw*<%%MdJelBEONA9gp37p*tvD>0s_ zK2L!K>O$(#)!b5LE~cq(eS^roBSk zN48%^Frx?q=~n{jJL^wRd6d%eFBD5)dL~^@dy>n}{oJh^ft8kUZ9^HPHC70D( z(TF%%(8lX|^0gen@@#n_bA~G}E&U1foMBDjJzs9lLq_+Weq&vls5jMPM$cjEs-6?n zm((YH>bzDh&6~S=KF+Ol4a}{43H$n0+osHEgqVicbL7$Fi2ikjmMn7)_Emw0%dYZO z8GSVy^T9%Hb&NchboW-N^_;R8V%fp5vZY9x^E;)Ko|=6KqSY!W1+Bf|$UTs}Uy~oL zoA|3KdruW%62%C?HT7=2h=jr$a0FR?idqJ7QHr)Js|cG~riU|7%6#uKZ`JghWAD@9Q! zApYAAS*^>qC!xF3ABRErL4AqV9 zx%s&Q6)8qNx)T}f)Je+>_~~k_UMy-S_&C7JV(9z&^mBpqt727GrMR(glI9+u<;D>Y zR)+0&(t{G)1j5d;vBB07B~m+@HUUc)qUodHg-pm&lV2e-?;O0Vv!>uM!e}&_kZ>2J z)Y^Q3Dx$kS^^$w=Vwk!B*@%)(05UvgzE36tUvSH-=j#j|zFL_>2XuC=oT+#$bwjaR zgoZcFAe_g+y!;NGIhO#Oc6E9;lS@P+Da4F9AUohvu*-zD?I#~z;`R>-JOcdLFP z*Hmhbyn$`Z-1+cZnQw}wm-9VZTf%Q$ zE3clU_ZL-C}5a`3)qx|{4yw+7yBeSBZ)jq1~Jc;`+JrL65pYj!-JAHMCOoK z`C!CZJ1Hae)vWwYqMDHGP|7}ad4|@VofmYcZ}+&L;n{Otq*f;l772T+{=qUXXeU}4 zh8xPjy~4fkWSp1Ra@lv0m&p*RN<;Yc3L-oO7vxWq7OUUW&X8*?a;)T?<5Q#uyK_d1 zh@q-LZlA;S#W=ymVuoh!uJt$x&S+J|W-0pIn>bll_j(koKs5>0R>G4jl}nK>zprQ4 zfquD0Ud*9cihYZp&B7*dQ6#uQb9LK?&R(f$2^16-E|UaOl)rAp2|>sC$fFAECuG^= zgI#~GCv-3_8WqsPIF&{bO4f;pU*KMBs(D$sh!3))lkh&*@Zv{2%pwcKMFg1YGK>Fl z{V9^CPmmGMA4j_{kuPfrgQ-;a^vXZ1L)ZH3_cu;HuN5T9gc;FFp<*5=@i`>v#2nffx+6h%1?(O7*^`m^)6zq{7rCDs`JgCkO_jtp%W9rkoi+e`#eKpjv`ZPMj~&S2FMHC&z+0u1 zqo*U^*UY?(j0C!5q^XeSflsQ^F)~oE=jT_3hih73>628lRL=N}G;fB!!pys_;EYyB z?SvFF3jF0ENWVAFJgXC4MW|!_kSTC8LwTVr*o^O5iSmF&vGtwxi!D<@09XZyo# z(b*>o@Z{!wslc_!KQc?wv?VR`qb*UvSI!yEmpLxoPXwyJC6j>GjwL?g{wG3wVpWSQa0k9y^r@0?zNk`B^h6K zvC|axnu}b*9#P}i^r~(4nB^qgso%v%*qa-vD(wCmNi3(;s+W6_YG4EEEm^YGo)c1Z zD}&RejGHMxxT*PSJ1Qo#XnzaM(>;*T6t<&J6G~*jJuQ8&xdxF$WanKU2bE|>@JCG# zZ#EQUa5{5VRo^Yq4p)wI9^_#U&<_dO42hz$)S4qwWe-m547_GGw}vQ4Oe?G8X2K%4 z<${uf7NdFk<9Q|(O&;$akl zzz-b`hpEZOhVV3Uah)0`r!eJa()&3e0e(N>rszY_=a}qzu&Hy~5^weLHec5Mb$1L; zXJ^Zb2>(MY>W)G%AERl~J9OJM-o}K8Zq>WDg;(|AC~4G zSjARa4e>e-WTQ}N@#?G1%$vwaPifTamdB7OGPy5)>wt?0X~?>y+8OV;rXiie?-P77ZW*sjlJ5@4FNWgQ;Xm&$2Gh>Ry~)@7!x zVpg$Y>AM+&SEV(AM7&3nuQ{DhlPj>ya&)C{@thiJnIG2?*y3UbgsfjAe&QTKfd{}z z-lj;?>kfu>DlLhr4)Yhw<z;qv_p}w2A%D2rh&k9xE7mM*e~6;m z(w7sE5_I!f^UxK8Tvz&@3iCb91mt^s$9bcVBz)4DfqbH-a;v^Q?5_;Xj}N_8n5H-PWm>GeQ7Js!E}B}YaAAuoTACtz^%r7j9wlr2UJaZb57}n zej>Ls%qR(3`N~U@cOmr1(Z1$pTYcxrlB(W&>=th)h#Kpq1?l`m`yU|5q}ZCJe$NV1 zKarixli#77w_hH<>X59J?L`Ns`fNhKzwn?h3ykek-)f#ue?!%L&#gaMX|ctOnT4<0 zbm#Xw&xK6JuMK+IWvRLEl)v7X64~&0ef6dUvh&gwrd>s5UuaM(iS(j2o&D4L^nt`Q zY=VK(nr3#H$G$AqwEbd%m$eu~aQ2NSKmVSa801MA3#|{^j+Qr7KVBKk>OX^z&Y){|i z$&)y;RSqQC6tjALa$*L1AKU%qczyd|V0wOm2;7R;-Q9(zp1pzL5t{zsQIC*7zKlf> zuZ}}^Z%?|S_6sH_Sp^U@SRf07uJcl;U#Q&?%Xt z7fvyC{!3V~B(naG0TE_XtmAyJ{#Jx0)w?7HjvTtBYj2}Ea{DbZ>RayV2Q|QhsdQ_K z%MD)ey?(=B2TPfcNl$oUZzHz9(5<6)CDK&f(}lB|G@q~R5z|c{5}g72TEf)vS&_Eg zJKNNQ_MQ$-x0(26StK)Vf|*5VQ_V&bJo}|)*E|W?uCe3`_L?{2^IuWn@9zou=3}q<^6Q$$0;jh5qS7j5;3Q#|dkYIH5$a%j>e`W*~>KiiGaId#K1L?aVmG6J8RV zls2=uD{2aXG!qI!)X=N)0j?VLMsy9@@ZuT4ZlIM#2XEtBy>}lEh6R*&;M=*o6@ye# zj3RQy7WCz z74+S$j=|w^>~-D2na-U5*%BN{8-81R?b>O*%P*^Kw_j%LG!)c8T_4n|a$Ga5rgBH~ zl5%#Re7@6oN5r4^*2JeDIY`Hh+P8TrO4`xcvq+6QsKBNAIajr_q-zOMPg|x9D<#E| zs>-*RBpW&?^FuzcPEdd{!~{Vlg7?!%&x84Icb>8+R)TT%VreX91?u|eq(lVm z`iC)A?iF+P)H;tR5F|x%uvt_$U;ZofIuxSzyfxX#{_cZdc?smCw#W$aO+OuqJ(G%E zqYIl2&}SwdU+fxo>UbA(+dYZ_BUApg#4ir*&v8ZN)|J8^Ol)+(#G6weIDcYdAnl8* ztIyj6Z?c(k3H|ioJ7HPyS^Vs(FL%dTDFv03iYHtREu}!E4sBYAL8^I$Xu+kN(P?Su zbf}NH&8XB!biJEBvhYqS*S(wI5l`CX1PQW0BR%p^njWi&+=5jsbXmfaeu9QJP6mS9 zs({LK@hrVrEGrt4QhrQ+Fm=o_-m1K4uR9z2{&1#Rc7HXu2D$Lo+esGzm+DRju)aRT z#+p^c);h*e$R}#Ypgw~xiD-2Vn(#NVm#+MRUoeyFA($pgM1H#8l%I7A1N+lOSMW9Wx~!Mt;52-cLw*WX zzdcG(_Enc`-0ga3*-Ut2DpI+DneW8J2@oR0%t{p(lcn6^Ax(iL7<7Q;#Du-FI=h)%4C41>|>b?6*j zBX_@L4TD)cq<7Iy0ix1eAYY%%^?AQQNUu40%l@f7rg$$e1><|#y4SkJuV(UQ2C_bQ zcfaiM8g?*kM7`0HjZYETS9#|p>c!InLcbDA0(hfkoLv5D$Lk*lWMb$5u5dVIsd1F{ zhlA_O8eMoHY#+8OX`95F8H4WIO)Z*+Z+o8a^UP#ice=ceG`w9k^Rv@tNFCM?<8&)16Y^q(Bhj?EZ5s8i5{3A2_}Aw+?rpEv zDa+yK;hf(#lzmULD`W>rwXno>mH-~b2NeoI&#_hJBz8U+)XZaKZpaq+eg>`t!V*%7t z%jchJCe-_U)(~g5@bo-04Q!;;1T^K0>t0^Xg+i`yp-q#CXtiW8=Nb9|D)_DAjzSc~%eonlE6!4Wc0dNNBk7e)bm? zH}Oz?k|Gy)1d$#(`&Q62c=_rGA&}^+OqH@Ur+>u>*Uq9WnebdjoX=?`L@=YW9Pc%N5zB9;u>2+jA~w5~@;7DcspuWS!AFb>GG&){C0 z#k2VF<`<&do*H3_xuZ)Zj8n>Pl;~d#RiBN|q@)r+^z}S6>chDdE&4}!m2KBPEg4Av z-2E!t%U{?-j(d8bfIrJ8J*Af-b+PH`x`eWK6gCmWOn&i4MC$vT@%BjTdsf-g>PMoT zHwzMm> zX}(x3YQ5%y4{Z%h=Gz3S~ArQrILcLjTMLw z`uR9`bp5(K9a@?;+04S$N|=p%pMw@W!x=3vX*^<9=hiMDQL0awj|kp$|5!ch=vr?| zXOEUq)YR?ctt*lLyviP621y#`m@}0`?rcN?u8Ii7w56w&L)xFg|3l!Ai)PG5yXsHD z|1Vv*QT$Q0+m7>fPfzQfk0&R*%Dqf!lgHAvxGOrVBGX-oDK%#P#!M+Wr!4OCjA7e~ zU)n5YOSN0w%}*NsicNE{2Mj_rCxH>F({$ONH_P}fSYoe}&K~bG48-@^wG?`vE(sJG z&Z5;?k@k8!XJijL9^tXPS^H9MhvEH~47BLtMngBX@I)PGqQ*xkZeS zpF~%l+uB={V?&chaZS{8e2{_QW2#;5cg7Ue6}4}9TLVIyzNkb?Kt9Ff+Dl(>d<4XJ z=4@LWGM3ty{i(%#36xj+MY$1|`gH=j>T97y5^=3wX<t@iaBPgo}6m&7-h}%-b=qzuppJJCCc$Q>yWAd zPA3&TZHbE#PE^6vL3c_V+Yk(iHCyhfJwR4KY!6-CAMO;Bwzu`LAuA#wvv)i2(?zNa z?=G|O$Gl+sK2R;^DQ}9#O2&y5XujO!rr)Jz=EB|k^`+oB&W>u&1u*SCx1m{raM{*8 z$nA9ayo2%ioB4uhQ&0^%@2wCwd$E04zDRW4OhUM!p3>tkhx2fgv{{EcO_Ox?hO*$B z6-n3Z)WC_@9|_T^4AFW0{kF~=N7X~i5H*0Dk~Z(I!rjPqpG9Tipr%f)01kS;c9pXl z_$o1B*(r#!?nX|VlC1RU^8d^m<;q_go|@+#SOG?`9n#j%hK-L8w&nY2_qv{CGh%ei zd^oK)`28q*r!^BQ_V$r;AQQMS3NFGZ9^|Uy~F_JXt)|&10BS8%MR;1+C<75v_lOYV5uN9zycz z3P?@w4$qth5eD*_KCK4VSMNyf8ax$t8(ak8h*qaTjbb`vfD2T~B%{(>hhYzoXuk5c zQD8T4$Zim-OpC?_s^$$UowA^AkoaBS`mX^ch7hAO`wC4~3+bS2eUtCje$Cqf7 z1Ww`X(b|9Ncnx|6;xnHfg<@#QQH((s^M7#guU!UTvh!G-9fhT z+5FLSrENUE$&IHkigV)Pw?8WS*wvq<<7bpf>lGn&+?9|SDT#8=n{secLO*-f=+sk? zCEtBNgs=Ni_DvCuO&*Z`!k60q?&;+7+g`;m>(^REviBpuduSRdcodW6nCV(d2UWH} zUxzleW+?`~f%btX4!n?cAxqAw70wqwTpVXut>b=}xw>Ay!(r_Y90iYvS`>mIBRoF= z8ex)v>!|zpF6R_?qaTJxvNWnxtnLwlcZ_!mRZ!pqIM~~;^u@~98pZNrq;0?Oo8p3$ zBq9@jITLfrq#CY@V&tQ^*$AS^6iBtM05}6lA3252$V?a;y*xrNoNCOJeONw$*n6DF zY`-LWJidzTFpz*Wn%qk%!44E*O>x{NZ(fUd`pD`Y-nu49sP&lePad-G-;vmJ0Cfg^ zNHf-Vz%nYjSvu6stbbm0my+4BR;;-}8#2bQ)p`p-w%P_wC6>3ZskCny!d+|64UbOn z?9~c3vz~dE6w*4l(g2;~Z4~5#M!)uD>ys0=2o#ym)yLGu!dW}6~`4Uy&GUXU|XP)(o z=-yJ!DPsLMT2wJ#MKLogO1s7PVIr{Ct4TbuL6$-628#jH-Y>X24c=W#6WKi*P%Y|w zrjGyc{jc(S*TK4id_d+YSBlBa`)rWJcv7b4b?2!_)dptu#^6p#KuHaRsT;eAS{f&BRt}&t|Qe}8r*qVFK#ao9-JXs=Z+|!)A z61}4=GF8aV_2NOm(%!C$mRs%m_Lt8%cGcw%g?jl59*$a$NCzcET;m&b%Sj^}y6HTd zGS(?Oaq``K?IOa{j(oHKZ0}0Yw2SA*fTf+VZQh`hXV^ByvaKuG#wp|<$WU)bsSIyqs?2Ev zZ`lN{vP+#L-BY&TL zO&{<@z4b_y5l63))k!Xv@=gL}bMbxJkx-?o*)Yz{k$leVL;G1kV)gtk6PplN&+`Ut z{`tF+g~I$IA``7tlxt6C-rt>I=bssw3B%FS(v;?PcYU?gi$uMJCJZU5pq-G&cuRlh z;1U#WroEXyKPg2XlQ)tU))`M^0qkb98y0Q>yBSA>Y6c%jvIyD5%Z2^?fZ##OXfg@S zUUm`kD_G)FQ??zuAUHchjV6Ia{EdeoD=Xg;P-A#7Szfyum#s2lFEe5nK<8#KduP%| z(cFi=4rvzQi}2)&RHwO%GCCtkq|%Z@DA$^_>DTZWJNQ*(pT55&S6+q&*IhKTZNzNnFb-J?Y#n5sY3OXRJUJ5RV_zG3GLIh%bCw?AY>xzaCccf zMqf4*Mbr5*O5$!brhk1|$!EPeB-FUj8QGKlthdKC8A%@pc)PmNIhSNn(ESR*JN>E4 zB@V)`O7_|A_TYLv#rM6 zt9Bq94{ay2h`_68r}mr_&X%~L7>0W{eFW+)Y1yDqoGWApINMoyUQX}cR2`_4*onbR z+l=J2a@Voi1T#Vad8=GH+Yc#Ij9Q9)?S0y;$r?k|fC{-Sa+w09##FkbNTwa3+t#~% z_vxfysNoPL^A@9Yz$fLE`B=dB)p$4@k^>gyESFo03x5nShri1BJhgc*X>vr*%79<_ za)Q@s%$&dq$SyJeZT_R4X7M$e^$#E9388#KH!kYq0`M=H*O+>}q>PTD3{Q3=t5F%8 zHhmE%!^j%^DEVgS2PNPXR;AIW#Zi~$XNXtZgg(-6$L;W2n(gU(T42i2!}r}ZGV$E{ zUr~C9Qxb(nL&m^{cJ)pmb@5%cR1E5VnuAPpPA-y{Wg--`Be_5p8ZNMZ%}!5jelsMf zBrD2c@`>QS4Mf%bP&Y}rTkp~na8>Z%) z<;hN-yFEsTJLU-A00>1dxP~T57;S9?PglxXa2qU|a1+S^3#-~zSuwqwlQ|^cW56CO z%T^DiJUh4?88dJ1k?_UG)HUoY@BU8Oox0cO3H2Y4JSXZp>L6a8Ieox2kE7c|4=Dvv zgV{8Ma!kTq&v}&j2p>&gB)4~@1x3C*qndOxb7;gyxvAD}?O=L~&C8N@o9;dRsFEZq zzYEU{kgKV3%lDG^BF>cq|ce-cE;H@)pjk73)k$> zxvwcYnP((P7R_jFCzTNn(%p$${5{yXF8TKtJ^78cs z6|JJXSgOr;MVmKlWxdV{y2=jrq!ODCPvSVNH<5cv*{Ep^6FrNOfsBs`Z|fb=NUsoS znP9K!b$3idSFwo9vuH1_aqSw}e%8`AEy5epUOOVZtbn$Lt7$Q^kI}K>=JVfN%n*Lu zwv+89_#<`qCw5Y_PcLm0zEA$(iJ+VBt=J07Pe*vJmW}bNVh+W--)w7)@4>adFMNm1 zx`|{;eY2-6rvv&&5KdNx+WU7c^t*x_81m6m zaTgyg@((k8i;b$b?6Vl9HuB7MKU(OTmqB<rhPy2XcEMUE z(rfv^fV8MoO=f4>lJNlW_-@alg9A1)$ zD=6B{hlF@M9=y}2O3P@`T!)A&$@8d-aVGxR`S@Ahe&ToEh?dKTg|s%0WTs|bzgYKo zJ>5qb7rm>CM&Zg0j&Co$0fh0+#l6)Gqc{gx<&G{KJ|$B(WxK0OZ?|iz(-Dk}XejP4 z8B5~!TJ7QcyJ!R+diERSUNXZhrB@gYmsm` z*kTA>lP`6$pb$bnbUEX1t#F9v;;}=y}#eH(2mglonh5l<)p&t%tm{wQ_)?^nA|7i2VHZl&>i6 zp}bjzzpuX6^}DCfjKD`IEX-GUGD^9kqZS4RxM|1%ugrved#n^(88Nq56GAL5JUWn3 zgnm*|vek3jag4p^3rs&|&AucFI}o9M+nfzWyaQ|TCN#5&d+f2|Z=2yufS%(ZimtvN zY(0P56-D-ZIl3e`bh8k3Hs8{cja4spJH*pQyW-406A!pK23#P+Z4vvmK~!$yQA&sj380&oV|z&N<9idGF%Z^BYY`v$xtdm@#v-*#?#P((!$IAz zWkgi^ib^HHAt5n%crJn|t8)95%J`%W$R+4aL`6pRNt&n@51LqMSkKd#`%V}+#*5ao zGg_7>Dd5|YpA3YfzQ+3q!CJG&y^XbTJuN5UaDQ@(x|3Pu`*W$fM)2)R?)5S5c^44w z02Llx6vr1w!74I}T%DvxmSPqj|G0VZGp`MyNFB8Nb|0VyxVM27SlcBK)=I?bj%mU6$+L3Og2o6UFvA9 z^A?&2TuLm6J$yCuU@@|iKv6EoKFY)Q=p5*}N;z4}2QfVpXREnY*Lgi{ItmCRbFDwv zkS2L8L3DLRp4Z2FlAWAawjSA5(W|@Z#h(C9Bsjgz*=z!Xs{`i1e z)nrRaNi&57BTcmWf;k)PW`V!|k$a8qVg}vWdptbp<>)7AZ2pTp+;fK`jPH7c^Pma> zedVE>oV9Gk46I`Uxvop{>5(x?MV~&iCART`^PS$I8+l)L%ugwm`LqnG=cp=YVKbG= zieb5WBlecs+KTs`THCGLvGg$|1DDAC)!t`8s{Lio(#;R&-Xl4LUjk%&;Qe@Rv>z(e zfDtnQ9AFv1El63ku&8@xJ~PMB+eK5I{KB^K`b6qMfTvy4Dj4+WKX*dcL`3e9^=178Sj6N$pbj)H<;ioZtN4==xWH zuBR>^PXBo~?f>EJy`!4Unt<^YMMVWx5KyX0ldkj*Dxh=_q!*>P&^rXgLNC&#gOt#F zlaA7n0HFj3y%PvXhXlg+u>0=5E4tt7`F-d7{_&9H@ws#7PM^6mK8N>|CQ;Cw6Kh3K zZ8@@Sbs&pFg#Og#t^^n0XVRnaE@T3ym4K$ii^_^mEzK2qm|1D=OstxlfF0yTY#|?y zjWJl>E3y#Fs@x5Y$~_-RW$~RZ`5Bsk&;aDL!^a?sSM#6p@2i+k1CKXpP*SiiDnl(V z3m+mvH#Dq2)Q_vaCzM;Cav|C|Qov#b!FMFPseY{@Efm`4veaI=OVNOh1%bHC?~xa6 zq&~?QtYXbf>!Bi?@E(~%pw-~|6T?~G73TjajZeSnTg7QmsDw|QTl}Z1>!Zu6RG!=V z&8cyXUx1!b3h+!?_B5j~2n2&!95j$+K)~d+tVVGy;HD}Bt?!P97B-eUZj>KjIV@L& zTu0RrV%gx6p0t*)cDxYZ%4wu)zG+W*4K~cCFhXiyvhQ^uIu_LMu z(rj*NN*YJFv3Y%4Rmt(rcXHZhGhdFUk++|};X;_N_*fmAib{3=+&Mc0LP_EW(R_{CcKS^0%xlQ_R0q zrZhr8{#&q#fh~v^eRK?Jpt9&w+DXS!7CoVxih~N+;~=IR21INAUkoGXlBRF?OO?70 z%{!5Rz1eVsB|YC95ldme)dg$!y2s`XL>O|$)yGXc*WaN&mGBnlfTlir2#c_}?KDCf zYK56*J9Fyb;f83rr4J9z?}!_39_uBl39kmLYL%RfjH;1B=-V4u*iM2JAU_HQSoRfa>heXVFH z*BmfeIkSGa6ZJDG@j@c?fQef0d!Bf~h^K1P!}3fgiWxI*`Slitz*R}>Frxd^tdsk~ zr%x_TzpCQa>Xl;;!2n>Kuo9^-8=|Y!x@u`i<5>G4FN{Nx1&tt-UJ-}ewB@s|>&6Fd zR|k9**EF6R3$_nlLYC+Qs@_#=H&84K+0&@-g-u$%;QBjRBP!!My zU=-u;9?Z9^<#Z2pxZXX`M>0iO)SX`r=haK#O~THXADK^?$v*33?AVChN;>g`Ie>)r zzW`3o38%Rj)z3xfcl)VlR$3nlNZ|XaM@tR8%bbEIroUoxM|A(R75XZx-%gO#9cMkUbUWVaztavBrO#xFDk z5-&P!r4=m%O8(~|d|1MH(}=awI+>}>04;g54&{C!)&nW9cH%Luig{{`)vV()J-=j- z8FgYVlvO&_4+4ewB70&bB3kAE4KFt7)i5?4)3%Pb+zw>@LxvD)Aa)>&$hU|Jkc)(9 zOC=uWYiv>l^2OMeSdl+nI$0*NjiRkSADKll5JQu6a~}nLw{JOqKEYWP@?ykMBNvD< zjJb}zfn^nm)+c8_%NvS!sR9vBxl4|fuFIPrKOst5vA{mqaJy~%y@MN^21fC zhRq{Kj1bW8fg$gj{z-TF=0MofO!`_#SHqL`KxV$IQ+0zxTOz|RBGesMc0K@}TFTZg zx4sQvO00ysu$f!OJCc}S|LO&>k2p`}DU7et(8*nM>IKA-w@gWw{B9)+>{~%> zJ6MPjsV$WJWH;-osnLknM=kQBo|WBGyYDp2!dxVL8xXun&y1yk7&tGHW}(dCB>u_D z5x?gwUCe{I*sQ%t3WxmfjpUzt8^DD+Aal?x&yD*}CU#5^2YB85V61HQK{YW$Y~(}V z!XvL^Il}?HHnBs;&|BW+>|-hvxkaJR%0Kpa#v6mW<##2bM{BZxa1QG6bH6dfPx@f~ z06>ju>UichC)+=lK79yiA$yNaYsk5<4jJ|<#wp3Ewzh}#ME(`)`47i_;PL?`6_2$Q zS~q+ukU#$&%l-h&DenU@A^;n!NjyI!t08zhQwss0rCiXb%wK&Ky2i<1@l2bv8qL)5 z*;h$|^*~LX3-WLg(@8q9PtUZ=rcfbNs1SXgCLX=98C)q_IJuQj$&3Fz>3%1O{_~TM zH3`rrsK8~o?mOWHWQ4kcU`TbW9RzB#?V_K25sLYyAhL8MqRGT2ut=3t=eTdjQBt$% z%k8)uH@9B(W3SF1R_g=dELbPJKRk=NG7cp@aK{>9jrbGR4ZAA5NR_Sfi|m4IfG>xW zl-B(}D(3$1%5(v*Y_Ny&Y=6a_V(}>-!Z_PK1oZeR5S*!!8)^SO#{4rq3-tcL+i^m? z{raCW&A*;;tQU8>4HB3O$tkz$P2wE_f)pBJ>Y!DW`D!l~^wrI;P}oqe9>Fq%C_0t1 zoY5{~tZL6j(?utcXtAsrv83433>kU#wBdxp3$I@|8eV`df9R&!bVN3?w_PHAS7RWSx0^xY=@eD_{ZcJBfcA<6ZJk~IDwab8Xt zT^hxazAQv{C(dYiJ$EB9d0pDcX_t3kU|?dq(H8c+%6g(^H!nGPB~B3hvyl|!kD5UK z%A8OCzmFaH&8LP02=`la;Gci|lMw^rzxEyZW59cBt5ygm`e6jZ@4xtp-?O#P5vd~g zVE6Bz-AyisZQlMl0ncI~swrt~F8WgHlMA_braR?_mVHHk>HVqBIGbZsfwbTE(kFiM z0?kzFx~oaNkAFD9ZarDsxIJqWkFgOhwnQB+tgi+&uI&}( zkVPYsoqyKWKZFHnpgE@}B=y~3x?T3^?nlS9$L71Da&4yr%Lb!g(FQ@hRceqHa@Q_? zhdwjTa~wn4)f5{Cd0C0&>RuMmy--@lnA#K$d!0`p^E2bl{usT`H-ckrNSM-3``sXh zU6%XD+U{>E=yR0*Kvx%W-~0_w$Et~&R{8f+hoo3-;hG6`YX{h{C*u!Ch0wA#`y0nj zqdn$l=dGL$D(LIO`I-vriw$f3=X&DxS2H42C2G`CI<0=&^(tWKI#4?h+W5HSo{_ZC zUD}f#FR_aWuwPF!rMgEm?C-F`}}<#)^!zgCm~j((OWl#!zE!n zVw!rETHhbzpUCH~`3;Xb#0&S9*gpqIaK9C2P|m#5)^$5U`^EZa6;=WIq}0+D@3}@d zYu^3c#kI5KbC2#tS@u_y7RJynv53@ytH#|#@m3Oz$3HjtpBW7yjMKc6?{7W$=X$rl z!}~wTB3P}-M0r!v{Ru*G9@~)Eyv7vHse1Rvr{zznWMVkPGU@Aw$9RqFLq&|31^KKv4BAH|>lfi-5m>owmq z)*rhdS3DJywI;zgzvJ^C5cbEfKDk$dz5W0ViJQjt6Re#(1+6Kl?fox-R@s#O2WS@h z1n8egdU6E-G~ZgIYrg~<@@lZ>AE04}09kzYgv3)fF94v`S+@EA3TQWpeXjR>{Jh3w zXwSBM<44SxaRZ8h92NNBmj-~fJx$%xRQ(`g9L$b8@e>BaX9O7Ri!SHIzh)u--`JsP z1W%Pa=t>zRc;X$^_-dQ&FKtxTXX_8jEvX~eIWtE6R1$E!LHuB;tu1xPAXW(_6|3C8WpNQ#u zaVkAVa1E>f`R$M5>(5^YAEl14GB2;(;|YA<{7eN5){EDzKk4}GM`WBKDjz(CVzgCUtf(=^Hh(iTWuSn@$MhIn z#{pTVl#>EKFrsq3k!;CZRJ|A)8VJ{8$&uYbY^XEv5f za6v7FNq4r}d?k(F>Hcls`)Rn9_Q~(R9Mg2C)Ny1|@rE1QC)Q4bXFD(q+}9ZOX>ib5 zIYn_-A>Ok3d`v%J{{A}7dUe>VbN?#E|NBD*F07MDfk6P5_ySU5`pr5n>~8%^KP3sK zuq9D7DLDA$?Nl6ed`_9dg#VAj+kzVbx}5XIzxk8-nrBGhKApJzmEO04H$JS_F>g3~ zv-5g%y@c8zZQouq|9tg;Q!!F?QQq~^Z)XYcrm4>XKTXP)pQzch+s+n8FIv!Cxqwsw zH0tQ^r<73@TDGz9`Q60cA}3PbA9^E1$v5-ygYy0DSh?(+d#(gfGsJ>c4$_kqAhhTa2y*K|kTv zzgPM%0sQQ*|DWMdWt~>7yCn28NKU`wxemw!hOd+t{%y-=xalV!&GZ0G5x+lXZ}Q7$ z@dGsC@)u@#1g>2A!LsmqaGgIj5A&svHa2GB6UPHPUoA|7r`h`F^#=R>p*`>Fg1l|w zg-Qt3uLy4dzNVJzFD^z0I24?9J$x8$RO>=9>Ha6d$y%10GacXjp7V!}Y~Xf{^^%!O zcTEe(KkZ=pnE?D@GXQO~itf@+xTz!uLYMhQ_tSj5lcUt|pLg<^P`Q+?NlrJQP%C{udR8*@} z!rBX3;>rUKL|9rqj?FjK#Mp9%WF;=bdAe5aEBsVOAk~0Vt*Pg}cGgYs={gl)pP=fB zBzDe)R=vx^E2ApUat6|?o}HBx;>TC`sVHQ-*V~_VC%4O5Er!xxLKc7(&>P1u1&wUM zoqOCJWVWVj_mbKtV#{8B+Z21+%kyMu@VQ}YoUnG5=Xz&i;l-50gq@?!#{y3IKO??D zwWb2p3_tk~ED;>{s^{Zf?8n^Q-ZiG@=^3=&@6u=Nay6H8L?|KlVV38nfVtStcxi-+ zmCTLu{GV$`)1E4}K)7(~Y~WYQ$>bmK{)#1n<5`NmezzD}_iVoRz7!+w?@7QOvS zd=#LpD0#`p-!BKuE7u2<2~gNdfqEJzVqE@uE$sWlRkHMHfzm`n6{l#~guIjzH4G&D zT!c)_4Y~zjhjD(bCM&5w+dr`4;pvJtlCnRpn5tElecwaoSl?3WjB|aTZo2X*2H{N? z8G2B_&lZ{Yktm@uL=Jl()@nYL&3xRC_mjQ-hC}@Dw&-<*2#2!<)r-4$fH~%nRFup- zyCxj5ZL${?o?Sxt{X3dm@f_?7OBTEHgJMB_rHtpmjcnmMPI0tz~>N zC-q|QhwByISGt)75!-Z|Q%pH>thny?W-jG^#=9>dRQ!ziT%&iv&g2m)(J(W)md!MY z!=7E=FkY|2@Aeg3);xs;$?I`O06EK>dk`~P~u)3mT6?5!KM9lSWxD zFpu=O+U+u(riizZIe$8(FFsyf4l!*^tS!bgpRXNmu=%KMi~V!WjzS87!ns5)WBYba zS=G^s)#%RbQyw5b4Gjdi4hfv`mwfQYgf5cwgj}kO*U(kTNh2Lay(6K)lK?V(!Y?oL5sXgQC5q8h+oC2ed0}>| zLNbW={?zoTITt8RW9W0`VkK)c5h^S9B~qCBMy5nb3?Xyn1AqOYT^(&e%S@UZTt zM)s~8j&h;|@T-<3N`?~6+NV~F&{^oXa-wU(L8sTIhhFp`3t`vvfOm8=BlaE^O?K5O zQe~>N-EP(qN?l0dx=gfLtDXTj)TX2>@BphBBy2DOVv?&JNo<31h;Tpjx>dc=9^tvy zHUst}*ay$boUD_lfqGT*xFt)o1W4cU8(XH?W2eH z%$2{4n0N#Yl$0#@e4X8QSuIg}YQG|8G+n|%w|j&#c1amIz9|AP@I`p!B}X*3v(xDH zvcH|UNOQRR?XZP+LR-rjlcASa6^Qusjvu5lQ0rEo*Q#AFQPi(*^sT?!@i52Ag+#24 zcT|>0i1N-RU!BT$j>FEl>B^_tS;)oRPu~JX$*3cm>ERvIbB&5c>Yl$jpAu%9vbcU1 z&#HAc|LhuV$;cFs@&c)JSCzFk_d?7XFZG|S^Wi;CeD=qsV=8%70ds5k*tVMN6d?_2 z%yr^z`E)|18S6z!A1AbYZL6g2 z4!*EmG=t%Ith&B=>vZH;FVd9z<;%(py}Bn6y%WrxB&3-eWlr2LKOXmO$OJJMN{}#4 zGDD--xJwMKN?YwyZ&rX#MOXu(muN+wxI(rDnp6|wShqwbfQan2iHc(2cdR?9f|br) zvacKsq;F|4v5F0iNeYu=Y{_p#J`Hc)U8!9Es%sR%Fsnf%UtOJ)uzG02&bg#ia)B@* zI_o5PBgo|W+k_Y$7rsfAc5EZjlw(m{rGh1-MT;lkNqb*SueZ#}d9m2ZMsI1lp;$Pt z#*ADtTjZpNbg@jNFn1hXR4UOzjQcvTe6S`9Y6A`2INO;?6fI@3#yHv`_QMYrF;5Xb zSGk*)7-1fBop(DYts*1B)`O`s!B~O##4T(`r2=cX0)Od}rUAkwK?wSA8H|QN5yi_XX%JOcUZV zQEZH|OAlOvgus|=hR06m>$@|vbPSytv_+wE3V2l&PO`5*9Q8L^F3PZqBs^JyYy<#b zZ>*NfMeKcnMh+h<;@L2NqE0y`T{)U(3-4r77OHC4j8t znoMCho1>To{O)~l(#-r~9Xed?0G#JY8ip-(sTvgCSqg}w7{!a6=Y-gs(dA%8nA(qLMzfgf)RTs9Ci$=`OG9YWG|LzK4WI$9q5`@ zoMI`F-!j$7$w+ww5}I_dbHAAI2yu0Yagj2stdmmPY}*$6$aFt$CoPY8ugCfKDNp^l zqa1C?*=XIAHLUQdAJ0STA1$0bM>*3*ObA6G8?2@ZD}>M?+=*^0@c;F@OcN7ueo{ZaRwno9}`nx?@h$6v2a$4 z=GK06def`J_$Ex3Nw1eB{#8jdoinACxq>KckT8Unxtq+|m zEl-8E51N>NEAWV{7AJkst!YP8=TIYv{|QxJ#a=Au2f^b#E(ixlU!ujI$HMSd++B(! zyzD4+IFevZ%OiPb=VZFlfe)GdJ7cf(-B;o&SAi`5U~FvU*>LxZr+{>bTlL&kJrY!HZ$;goH; z=Ea&Bd>w+F6wx3L_2XQy1vZB@J7Nb{H{tdt&V7-_fVigd(RG=qSi2D2lx5Z!M`!m7pgCb$oAnv_wN?!y*EkPbBPUy@ zqGBdOhvOb*ErJvZytIb(u>`FY9S9RH&-{~PjPX{8}E;oDW-x)Mv}KiCxjw*h;tcNrMBgZRD#prk$XJ@e1CF2Nj;@WAH4%-C zC_EhF!}SSZ>0=mp;yD%PWm)nS%28fXNd-x{d6F&~?y!!E-x_Nfn*aKzmDLw^Yfyp& zyzk`8kn1@_^!ypNiEtem|A~|WVAIt83sOf?Y5fXOn`Z;2_xo`_mt|*|DoSUODmaos zX}%YvsU;T@{courC2xL$@HFfm3D-3n^WBhFxHb7QRolJjB;L^rxj4hDZL1|jpC(Oz z{sir%03PgWIu>JlpBP~-Qmt??`QT0Yx|`1AI+)K>gj0YXaeo(9KU?5_*Oc@+MZ$on z?9t>_ePr(hP{~H`!Mb@db3aQZb2~D~FVE^u!%#C(LVaUKaw9FotDNPHqOJkxFT% zF@#WdO3GuI$LpSUtD*R`Vh%@FS*dKQl(QqyT=tlmJhxkTq=uM6Puj*|$2u-8-l|^V zLindQ3^9|lGQy??bn1~WwCMUr@2Utgc4nbdSl6gUD;47B%?-Y~qu7`lOuGFecc?rD zgYujdY80vj6a*hg!8*CLQmoT=heao(5T;1qy?%6>XgLRn<1rXLYl2aLYg6#m_YBQt zK|o`DDb;avW_J}9Z}L`t7&e825|jA`_mzkgS#u&!GQwVV3MMmb`O`(%$)lu;Oe0() z&d0P2#3ar`BHn; zc@(4J&y$5w77sAU~ySgweKF$fC=4kq31aR+{j&(53C-)rZxC z)gV@avNm7GLHB$Zlj_nrLSinYx0i8TmY=nQg^e1K=TRthNLTesBw0NRW7sg--{uMs z5%iA=rJG=gY%&ZGUlC2(AxmPIx(43NBz_C)4$Uh=8bw%hw%?TL>vtFNa%LShtPW*3 zm+4Cs?3v`&9y+H@6h+JOH^XjI6eK+~p zad->hxa6jmN3J?jN@ABD>ew-LcO`WHx7WGEO2p z4nDI|1>Y5XckHzv%r`vx_eqkiWTuRMqbdx86-!2cO1b|y5I!c?zj^49#MNm+SNK5# z_QRGX20&iT^I&~#=T0Pxb+oF)v&SWyJ5x@2daq@yy59t$A|@9ZF)bznOwW&;IOchz z3-lt?$)3-wvU1EAR6%03pA`wyb()gJB6QFjCz@WbyLM3Bla1rPIniOULxcJ!_FlCb z&~ow>6c8W@q;MxdyZ5?p9SUvlb3BW5Uw00-eI1y5@**6}u$5I++Zd&8d^M?tu2A#* z15K7Py=u1IhQ?Jc)orH4YA??UQM9wZoDMOiFA|1Fpx_5}HV`1QlMnPa7g9`tQwipE zD0vi6jI~RPY0<=Yyw9OPK8a+s`be_=iD?cmdoQN_kS*I5YNkM=_?!p$G)c?nJQyS(9ucr4O-cMN?UthJi$l|Cl58#gI7d!P(P2HDOZ zIAvATxr0Z$^9Ixoop`;(PL6h${OT&vUnKQ(-H)z=c0C??A^e>y8yl#XDu(VXz$MGe zj-58@D*M5URk9m2{WgP#wbvCYy;Zf3^`iN*I-FMswh%R;*qda`;aMbPG)&6JsgWGaj*c>fLiT6S6>1C@c{?zNaa$TA>kOyPTM;^=b>(Q(; zm5OBMNzAautm*YhBxJV|4a>QTeW9pFqg<9d&jY(6HcIha*IC!X*qB;*&fauQX>YU3 zd|C6gO574jTxuj#vqR+B!J(NpRg%|7`Y5Z2T6L_J)iBg8dOaA)xI9ml@sLII_c9FR zd}}x(B!xMJU~tg7B(GbfcjRe}{%3RU>%3zc-i8iQJoqodZgHOKJP?6yx!Krih= zLXg77^?X?l!YU|=1+p(rLDiu|Hu_|O4MI~uBii=uQT9fE5%K}=uyzT@$mf*ZiqWgJ zy~`fhfS`@z(aFrll@M$nas8$4_s6wT))&`XRLyyVQrak8Vt4XfZ^5~asnUX-R~Yt^ z?x>b*RgKfuX~(2garGcq zoo-}@E#LlVVdm$LfBktsW3EUPg10m&SptV>pE0=>#m4*U%F{^ zZQEa5j0+EacN}gYKaU0m-QGXDirz5?{9|VircOxYKH|nxr8-IhH3aT;eJh z>B{Q!WSg5ipxycb^&lO!Y>gcVbPrGJ*`}(XIY1bWjP*i1>#pdV0~wT4W!fM7BFta` z4+I?)+$$*nxs{J_mR&26ppswj+!2T}OV1%)aR;SGM)LEDL`*#4N%WjEHJO&~{b1NG zzyIl_6ji;lvzhRUQCgZ&iWO0B--Ck2G$R@?k38dz%=XAh4v~wxnKwvmjjq(S>a-k}p}VT!Y4)*;7|5>+ zGyYIK*(Z#HzO-GNX=%Ql_`B)r@AF>w!!Oi1aV#cTTd8@KKGjQSR_70pb`w#HfB#_6 zhoe&8YEePpSO6p#BojWnU&K~g(UPSK5`8Pm9G5-PakNh6h~pmx6*bsTBqVV`4!f^b z7h-a(hO((qH;332JRD!k@PrZ@UW~T54{LY5vb=G3Bf9V?C$Y^zZr9#Qesa`$fsu%+ zAZc^26Hk6DpdEvDx=8)RblSW`(}i34$rpKKP%76=MdVkZ6({*w0Mp={!FE-_hwTxr z+~JQ+XK#&}baFGvca_x4#5p_0|JF|A1d|pXeM=$xv426XRB_!(NUI2ik{mHv?{9#$ zQ@IaHP}yZI+f9nbB*xQa;4wk>;j&wo2opt)cT0px!L)r}apLJ*g61&2NPJYyO=k2v z(%h0;{98yE=5y%hTQ#&%UDe2iQ>476^nVGzvF$(lurpwNN zv!Eay5007XYc3Ezm|dk_%V;Vc5X@{M&Al$3I)lsYW*XqQyXvOFtt0TFc@-fTpn>IE zut*Hx;8+ovTdHRVi@B9s&N&S&M~MFZp#!JNN&awLLev{dXA}G)M!!eD`CPvOg!UitJD&~xoNB`}@+rH%X?g&TtQSCkLZSQ1n~74k z)oyw2z+Mdb#}e040UbTO4PCv9FmCdynyeylFDbG71`gegmNDNdLXH+KUosX=TfwRh z%k|y52z9tC$tQUB;6c(}99jTdAkb6CBkLrx)7&D!*5=)sOfxfEpvm58YD1u|UMEV5 z$3sH+n(8zbd)X%gI>TAI;eI{IIClquDGQxT!X)OSTT6lVY7H}_)=<*FgOU2_8(=4f z2OF<1)jXB^GamWxag;NgJq`PkeuLNF&LKOY-|X4%6zFep-YC#|Ez@0?(y@X-C9-W6 z5cH3VT=ZK#ZW#K~J&K#)mN^T9Fn4tpk8Wvgkq^1O8z@nhq;-T?CBY}Pd3b4{ zDK4qXs;+rkrX$kzTg5v%4mO6H-5sRKoLDkPB zfGP;?HymqJnt)BUhKDbOjk!B;10m<9=ZI9uw@M$FUHJVbY4#Y!yBTWDANKL%L5jb) zhI=su?Eu)&&?kW&K%A$3YdSlW5@Dqu&@+CgC=7eayRa%Y%XhVYmxGYQlx#fb5RY^e z^o}|`g7c_>C;l3jl3A>8jthg0TE9m6Q5`&`FgtA1Y^}hZbECLKe!nX(9S2y}jNu#~ z&&^0z164={SC;Usz@(;&u=@mw{em(@LuY6%+-|~+Pz3#wyw+?j(Je76EM~|;C0!(S zlP~0CprhAus7tBDD0GfH>mqUkzCdw)wUrbUSegC^e|>mY`@B<9jc1*2-e#LP*ts7H z5vlzc7G|kKp;6TuW`3b@49zM&HAWo!l`)7J`wJl$xUN@20#i|VwUP|XL23r z9WxQ4fyE{g(AIS|Qu)OWt4;<20V1tXTCA_jJP>4K-%lQ@)3sEaDbURNBzu0gWi1(z zW>B`ypQP2j*y(ul6ByoTHDy1hRA^#YhXwc2#}%T4}Gi*0tc4#ZC72g3fM>9RsB2^u!Wuf)!aGFldiUCp=x9)Q`Xl^>(U}cw4PR4Oe;67WGU=Rs$piu>x#$k#}729ee(dz0hB!Xq3kN~=pIOnNNUzitAwd>IHfPz_u zwW3Ev9?0KHDC7Z21eyI-Wg4@Tf#B2PiPS}$&}xz?mn^>NNu(S0mP?}OZcRL)>l%R> z9x239u3-B2;k|1igqDmg=tIaHNNHVVriAsn3HD7ddR>mkU0{9NLUr+vs~-%Glfo{rbaj~ zus&$ruIEY=wSl5^RTSw3GOuyR1LXA>$YR5kVuPB0fxJGvnyP<yiol43!;N6VFpvC=e0=|H&{AzYG; z&RIxnZ11F~PCh4E@hLL;(iW1U;%-Tg_Nhz=B4}KY*Y6uu3GKEV@R);3)-~6m-Sd}g zQP(zW^gAfD+C8x9;~{}lmaC^}=1_DHzx8_kq?dQqj+m>RUW)2z4ax59X@3>RizGb~ zeHfeL6WPN_ud1Zs!#zYTFg+wSt_$kUNk+n-hA5vb)myS0;X9W=-95Hv_O{zZYttiS z_?s_z0fO5!rL$$~Zs=h`le6W1nLlr4u9byQ*>c~9%uO$Rt}C%CS}%`dp;q`Cnn}A_ z8mgulJAm7f|+FHoE~P^b>%X%vZsz#6j& zBbVMU#Q>>#80d+Gedm;sxXqSg{hrO{Fze7;zYL%jD01X$5?y7h4-B2v8x(2QJ0VaO zqhl8YsL$X+N$+VF`xPex(5yCfRGGH{G!5omupY ze#TM}3NUwVTUUo89+!A=DmN2R+RybWbaQgI3K(!?-6@qFtB(5=|?(H z23eL9In^$y)VR->7)gVwx4-VU4s7bJWM1s8GwcucTeVwzep4l3Z&L_gw)!%ukV|)$ zPR7gbQ3HZlbB(Qc@{uN;Y|W)rg&M=i&o7KaVrYiuG&D3Jo6sGYuQzM>JQ;-=i&iP? zK(G0QGB%9kL_@PS=T}8;(Sqx&n3XOFw--r%e*cz+*5L9D{%gSfx%5}`S^N#&;a-OP zC!vKoc}Tn9U_3y#WoL=EAl_OyE9yTdEnbbzFYXOrQ<=GA_T6>7vmxX^iui)xi0A4L zADbl0VZWu&Hw@izJ8w%V|J%r(?n(zj`vk{B>j?9kW-H`NjuDEej}ahY zqjuCWgI(<{=5hJGej@a|hM2_4(dhZ0eVGmUsjkAU(|W?elvj~3j|dztvBQW#3R{k$ z;J0NP>+xGg$N1H`wCKRSN)L%Jn!y^eCxfPWtmh19LcGn^QVPBbduh@%LP|tcr#kXz z`kVEiPhYrLKAELhW|R{4Zp?87?Mj#Z*0-4%a2oCl`lQ`bnZJ~LuoV7*Q&4tK3b@vB zdhMj42+l58*K7F|KB8k7?^}=OrZ*=LB4%;)lvbl_)byF{C0%SmTc8qE-ea6S1;L|) zUfBwEZ)=m5w#TWHg}nrUsot3#OrGEcTkf7K@EUV=dd8iF8j-(x0i1DC|Eov)zTq5O z8ci&0L=hx?`e^*5mcwy}zG6#S)4R2{b%W)2rV?ATWPBX$*A3KaKxLnmgjb`uTEJfr zvIMy&Kw@@^ZEmx9h2BrC;-W&`b0S1zm6>-;A`Da2D~nvf1Xbq2Lqy<|ERmvH>Pcy) z?R9qq0|)|oCpoCxD9)3t%x6!nTpA&CykwzyT1AMIZL;$#Q$e@xv+rH3cd!<7KyRVR z7e0h67W*fIq}~IO<5AtU$)X~Q@$iBUVQ62wqS**5OOKM(V)Ua&0v=OG9G3(3ug z)t;uE^A}3m4>g>NWL)=%^;elc#V#v=P8dVJ`K40lI`u@P+Q%o$3M@#2jKnU0g$GAC zwWO8s(uxBCeZw)0!H&K;wx66ZRn*ixNjH9b>JpCtmK!H;VLEC^RCI~a*m3J=YR}U> zyfvr+E{*~&$Xv_gT%jVWN8s-N&AnE-gp<58<&f4UNi?-S0Yfix$ap4H1lB-ng_77`9Tx zVUuEyki@@TSG1AYl6Twgv-U|ue&bpJ(k`M+y)~&^dhlgSgyTi*vGz)>xetR$9rORl^$Hn|wwWLwNS9F2KRV$ABTh`O1wd2U$CanWJ^Mm!qIbK-D@uP&Q9LnafC@dFRa#nk$#% zv6N+C$Zndl1(=Y7W;#pkVW~C^GgS@k>k6C&RH8^iGX_-)gZ$ME4H0S`sc~eAY!w z{hLO!sMMUvvBt;F_>D^3+I%yCidhrQliCDHqDZ=ig@>~W9&h+jWi93jSpoccJ8s_` z(VwOZyi8gG1W54x+x}2PL%;Li`xx0yiM z`(KZZT>R9Jt9?4)_|zuw=5UCXONlR7=c^b~zcfFuW}B(IpZA})KY7<`}|g-FC&{ati|)SiCN1+VMgoxly9qkKWTD@mQd zah>b5%I+r-rT>0sX6rL_eNItgFj{4ZrvDy^GTiL>451dg>5=jhDyEDU^xyvvv^WON zvxb^Ce%Sv`bbO^99Y%uEm-(+1QrB@4A44@fB;>|%1z~)+OYaA!?4^!jrR$3R=l+-F zFaG7S1bupWnC*_X#M-`vgl=(e#C%xjbSRHbA$LiO9N(1?n}WjqcgT-w6wS02LkkXUm@ek2#$EY`xn* zRLGyITD;=IM-!zuuGdi1`2aqA)Y1yNOG($Yf?h}22y77!GW=wocSWd5BFXkyx2*AA z|7B|9wNYxKTME>A%0AZ@$k zi#p~dSnT6Dd2;8T%E98Vm?duOzJts*`w%UX?nN;fL;J;7&R~viR)kxl`dm6WubPJn zZ+6L}Ii?U}u&-GE(O1~bzG8T!DK|FD&dEfVhM9R!r>dc< zyM+?G!?Q+4vA068Z+4{eC7EpRC^r^nrca@4J>&4LydUU%JH~^3+g8i7X6a#wU(29q zIRf7=;$GV=q?S1lUIWJed~I4=HEQs!R~X&@JW38C53?9KV6%q1su|KPC{37`Pg%Zc zuh<;ZC`&OgoYH287sCS;Eh*Dq7FETp9e!wYA}6yPv2++oo769I>Il)DJR0?;&$6oj z0*S)j5Q#)3PDe4hK^7UebF=@;3IFz2F5`=R+qd0`wvxA2ooT|3j|DW=yw24^ZfmbK zgxUtgH`9+Z)LQTBgghy6$jh+0)qjxbUn8~a&rkBVEc96w;-$dx+~? zMN_^%%>OK1e~yvd)T}P+I=fnlt)cLx*jx3Q-ptDFdonz|0v0!uv_6S{ zf%1KL`ny$ac;VOn;u6ZDa9Fj@oFzGjWfZeYd2<|jySd|Aa_uSS$6I&Ec|(c0*$1*E z-GZoFptRboFPlO^b0rk%s0TW0J#qA$BO!xTW|V0qGG78kPQ3D7BEGcldE~W`@nF+7 z1wu@@yG|+LX5Ok{z^%qBNz?-CL^H8j#F&ZuoxPyfMF zgI4XQ;&iWklTeQIsgQs%<`T_;G-=kO>m2LUU61629moH{mMsw%0I(U>;8HVR0m7 zs2*ZFs>(vptvcp*qB?gY#6G#U5}{GQII=y_mdMt-`gwzA%?aLMbE(_8#=crA))yQy zwjU_vT0i`f*EtSeQQi~KE2v;9Te9S>+1RuLHc>KP%Y!a|S48o*GWc6`W_$Q#Vxpl|?=; zH4FY_HuMeRY-WkpmI!>39zz|Hr}P)oWOOqXjCELZV&D5t%nMHP-CF&fnYn_0i4f`U zm0hAKNlP4HjsdK0eC(b{uQeyTDsMl&_fRoKvTXEQ-f9iQ*k|~8XK$YOsV!H;ylR^UyX^URqzn~%q;xSkZ?yB#lIv0B zt`Zg?EHr7>A*eOcsK+3`D>8#3cNZpU$b^F@{ETS4+$D=ko3>O7sxZjU3%R!M3gNJH zEz;|dha`U=;5ZKL;knXJk}@e1VdVk!X*UBVNNc7F$Y(PAL<{oDgN;&T!oopfq)Uk? z1T|Yp%jA126Bb%l2gk<9pfw?ts);OR1$&udAjizYqciz5%qdceo?)-9bUsJSGAc6h zV-Kt~)R?cEj0NC7v+`AyuelK*i20lYPnWy`-d=rK;dt6hXXRF}J1+xdqUj(PFePw- z1Y^jgOg$JB+|uqP?d0APx_32yjrHkF4{vvuKKCT)&t|Frj=VNGRS+hZF<1&jzt zR}c{Cy@Q3KbOfo9-h1zmVHBhVq}NcTcj+}Ky@VnqKnzh@=tv1IK;AeG%sAicT+jFY z{T_bh;^buSv-Z8#y4PA~?*;KN->WrOPN`BPRUar47U>A;zjnVG>PYkb?fm~!N&Z}o zfDopk>^z?r?W`b$6;X!e?7~%tg%mm4a;O49AKUx5{;iZ9rF z#IjO%g}vxlm8enYlXttk^QZl8qI=Qy%8Qb|}cNlq} z)5L9HGGHQa0+C1XN($y*GM>~SNkwflL4?$^rYYVeC2EWk14h6@m%jA&A2RrT5;RU> zx;7~?CFrxgs|L2=U(UQpN*p+- za3jGtwI?j8G=lar#Rsw-L3AThEF+8uR6a_p@#MwgUSG$yWR^yYk{U293CHq&-HF!O{H)OO|eEg2z=-v|B|~agcaI2YPVQ9sV~xll~fSo?XU=K6hH_KH3plr~FhN@2n3nP(gw zzlpwaDg9YmtNYX{lreca$Blkin{L89{Wk{(D*v1c)i(P_BrUAM(r~GqO&cOM1wNy` z3seRbfe-5C5^z-7gPc?Tn)fFyEF-s0X!~3SB0e`alv}tnZBLX$x(KZ1t{HkMYq8r8 z*Qw4DgAE_`-hEMPV?@aQ{@v@+38jNCTP7v@D&Y66K^ATYSBO&7-YPJ9B3<2pq0I&2 zGHoO@eifVs*_&KoVsO6kpq39Qg>Rc{eW^i~Q%aDRSs2s()!a?v8isA{F!Y0LgY@e< zlSLGkeetR`KFahZ6?at~3`lmHs#JRF9?r_)>pGLhgLXaxQnBCiFIP(tHqlJY+FM%X z6Dz^sjt6KaLYVy(q6HHF~Yr%Y_x*F9IRqXt-?PV~&XZFz3kU&QrY5R;o}ShFmo$f(TV%jUpvgNKcFieW*A zjq+DdwLZ|OJeJO+EJkyc!dP%T#WqLUDhO>FqFy|`;Ps-+cA0rf`6EisU)hJqUxjM( z;ZBOXb-P$4Ju;N4l>|j}Z**3AH&tSCNr2PVB zp0hO4(ldOV1Fl{C#d=~h?c=??31Rn<{W{sNlB6qL>rBEg+Tvf%Q@$WXnOy&*&lYK| zcX2QPzhU6J>B=a$B)j#}QvKykk@^u=Jqt^0bA$hzPQ!f3Mr#K;a?7SdBcaA$`}SPt zt-+sRP4Cw*<-jl(Nro9|=vy}uxlw|mv>~#}>J-dNGi67nO54iE?T;`CONZ#kZj;Xq zC{~G7pS4Zjim*7SraOXOx%Qc-1Q#beN&AB&(oWAsze( z8<-q5U8RJK7@Z05Ho)wg7 zF>Ne-@mF0MHSOY_+*Kw=iRed;XwI9A^}%K0RKnIzgQ1`KiHw1Zl9{vh1|}mzb<-Iu z?ozFx{j)ZQb&Z|+RDmaw)_bgGQqpP^F1;#lup4T|BGr#Uh?6(t68W2jo#s^Tsl%U0 zZpC?KrIRM6ZFZcbIdlxzp?02m;hNM_n%^2}l33a1Y?6ZusMI>v(aYeeHw8s;UE10= zMK}b{Vn-MwcQXBx6uHbMJ`v%pNug?c?kq_l2Bo_x1715%EjT_OX){sd&4K4*UYRkq zwg-KW-2yco!Y8;zG%OsVSHa5V`wh>OXL@9&2JA1^rW|)=YyaNQ%0%oNXO9EsgCXfm zR5#f`VH1`PZ`#c8E&#uvA2j2NJ!JT6sE@*!@=QJIH7QO?H+O9VB?sUWdb0*e_$E)o zwlB>FKg$Q0Dfm_x04KP9O&j*qHK~woj`QB1$`1-T-pUPFjpv+In{~Y(=*c zLK=CT&>~7(!lfG0q*V~*mQtC(<&S789dl`_OOR)3*5DmZ+k$)bM2u@#m$M{o{fdpY z$tzWoI9$wTt>4OjW&jfLtNfP2;eYd>F``<=MCk(9t2ijML%ZkWPfv*Y4XNi;*EvS@ z~C=8 zO^^&-13ipGg#wi$iZp2YlH z7(%DGX6uD7LMQQ;jiKfDYLB$3TO>w3EnW;V=TD?;I*Q1$9rCy2R!BDSR8D6{zLyR@ z59@69Zh{IHo^woa--7dsiM~J_^Nj8`H|UcP-uGA(UMB%_SdW$J>e#R*y3=6?39Nmnbt-jE;qR zKDv8+H^tVbv`TsGiWMA}+sEwiN!MU)716{TauD5M^nOn8MGe|=*m*GtNpaZ1r()2p zen_i93OssSh9%)x>9=W#k$0%nnV^x1o6hZHGcW)hJw_p_hHN&z@{ov_tPvQuROW3E zW9&?}{opsOJ|5TCq)o=JBP#9uoyVpyS_vb#rqVdQCm$e8FE`&(#Kb0SsdMnCbZxLm zC@nr^a`k$l$2+;4^Q*T&0gJ$!RGxaF>N*>0>}2uV!LK3Zcz-d=`t^>){kWCL$|TW0E0~SSwyuv%sLXm$39>z5GAj|3RGuXNFTNY(ks zP0iutVHs-jWAuw|@$ne}7T507=U9FIj~}lM_C%%m^c?eJ--8cM?kde( z21Rn#I}lM>N(P&k#U}Y?qQq;RXbAhC6ck%UFDB-xDkJ<>KJmks0vp|N=b^XRB`89 z2_pz1VaYTJbhs=uy}!03*rqwm3mR@9b?e7jJ>?T^aovhsf3wp;cz1KZU`Md1c8(4i zzcsfa1oGKxlM#c1x5?cGNA5sD_YN%XiLgN0W0a?&-mcg@dqWF%5xRSMse!ErI$dsf zxD|cu(fqDpwq|3ou_ZzwCQ@8!Cgh-?j(N+nH>MQ3Y?zSl&#nF~xo^vf&TO zE7QQ)${B`VTD5pjZ`ouL9yxLuX)=6SD}?8Np~5ELj(Rm2=J5M=jp<_A*c#TxS9R7D z?8hH;ayjoRVE$HB*I!X zqx26{I$qH@vw$$Rp(lj*yH?C5sy-+fNi~&zyaj7<@2=lf$!EV>lAt?Ajrf@K1@SA@ z`Yr9!+754F3m+F_Aqt(<3?uzr%awUa524|jFI&L6tnP;DYE$CImzAoNYuiPt&_Rl+ z>-n`pu3z5q1Jw+X)8ga`ss$Y`8RYQ1)}=_jY6(!=e2i{|S>3x27)R)CUr-;zVfn=X zMn0ny`r&To;t4L>>@@Z9(zISrkex}((@8p$DcDb2^@rxMs+o$dNfgt9t zO}VIg6Ytc-eSaYIrHy|$XCA3(Y8_uw>eIxhEKg&-p0$`9KL>iliKYeva>8c2LUG*y zWFXA{Ml+L!nCp6G@5iDR3a%m$YSW@Fq7w5}#I)GCd*_3xg8MLFqu=&Lg&gD#T4@ZU14lPRv_4RvWxD;rG z^>S>_=3{A6Pj)XkqmNI;cp%`eXpfJ_OM+a@sZSijY<`cl*Qu|wknRsKy^B6IUZY$o+K= z8U7s16oGc*_)c*m>@Oo=fz`b|F3Sm2QQElY1CzJ&Bn{btNRT;{6a_TN4Px?`kWk*U zd4;9#L0N2!_!om3Rg4_K&#Bz?+4$4O?j3ybn(FLjqJoYKYnV_FQJ488Rvi0H&*&2= z2vS-V1j<#j%X&xrb&@Q9>VtZr19)4A-$Q~5t-uTQj}Qow?(9i8aed%1>HWS(EAz!u zL_NE|-qEWW4I~Dq*~-vLM0ANDwl^86Q(a1M5#2YWdo~hxwIgHUf)i^?|Z^v&YqHo!1ovD-6V+KU<{`-wjK* zE#NoeYc!~)XA>bQYkzOEnXGPmSs~Ppy`lt9wV9Nic2XP;cSn7y5mW5mzqzb7%P$_8B zRVPM^Mu<|OK55=HI-GP{pkOz1UMNAJF^J`ap5|DlcvfdiQIt}a)UGMhY)vJGct<$? ztxZ?W-PC?TCwO3xuHhJHdJjUtZ*(UypA_DZc`?oPynwZ3d0wwB>6w(+}GRrhjk%Dt^V_(vq zm59E@eay+fab3lLcdba*szeO%v7n*G`3Uy36Qw1p=k%-JcHL{pwMy2l?2RoI8gj)i z8uN_LyGgW4$$4GJ>LI0gd3sB)tm|KmwAQ{UeopYLlmxq*A>^gjdwmgXX(Du0OZP*1 zx&5BwBoTkLXPfGhwvFFF{M;rHn1hYhGGsTtnVxgE;9^`X*_3CDynFsL#5!O%d+$U1 z^LMrP$*qCkxxgd)oaI`MF~e5~G`>paRpKMvCTUZ`XY5mF-Jm%EMv~*p9w%z)8cyNk zP`T_WrXwnyd>lUXyT@NDznq};Q5(@hW9lJ-5Nf#5#5BcJ@H>B~mYF6ebp$uxkeW#0~{|X)F)3^`X!~ zmqylX;Kqu*N>_7%IkbA=PwS1|$8K15k()dlFQpdFt3)0o6%mSGU==ctjIW#R{MBT@ z@lhnHg3ftm(M-7!s!y@Z`<{_;0=EYnr(KYwug$wF=4h10ovI>TA4RrH=+(TU(jwJ) z_f%-S)1$A!T=aR5Lg~Jc4sN`b`L*_=Hw$t5k3&S0Z<`)t;}iK~+4-wrn|<8Eo&vi+*zdkKV z6EWbAB>z}_s7TBH^gfYu$1Udbm%|%>W6oQ=MIM0_zV$7g!5whPeErB@2e;Msv35C$ zu(-A0%mZIe#BTzgmmqnU4xq6`X0oXX*%KgMCFH$gb8Su*+LLvankm#x+pcwuwb0;x zrodhQVwMVDo(ql3w=snen1|l0y%vus?_KM}nz!Ih4%L^#@Xwn4S%Vkz7RRF;!W;C{ z-1i=X6$B&;HgSkX3T~ak!v~*{E?g83IX5!0TIbC)q*d>9l96@fQ=>>8z!S+_#|9m; zJCSww&tI4+{hTZfTgcQIcbmQHn_&-32WEW*XtYy=T)BUq!CIB>cOYHu9*(}@5pt_v zaj)MMRhov{#E!TrLkdOFS`v>VN1=l}fXL8$81Mo|@1~lV@XRDY%b~(zMXP3auv7l> zHD{7#_~Z=ro=I03Lyfz$yq_;y#S30aG}MMb_nc8$h;o~cb6Nqo^nhM@+Gt(Rna*bl ztdg?S-^5C{FJ*e5MQxiCId9NKDSw_Ky~W1Aly=er0cKhA#UGAka+m4GD`&Eb@Tl07 z(vx+$@r`;MXRWPnWo~0z|5SA?&Ks8e1hup?lOEL?gX?RDh#AYItt;NexSB6$E*`jf zz#b5~*6!Nlc59I9PCWb75OF!?;eMt?4MfTYKAMm7T)1K1U?b?!fD;~i)reg2flRY3 zM1vN1jf|@y!d)+lR_6}ZH+!~XSG@tUej#3TY5F%dC3ehF1Y#`+FkSbsKvF{~!sA$_iisDdS&{wqV~ zd?Wo%@xZQ+czGpqeQyJZ>EJ8Q`k3Wok$>7~MeXvn!{2uAl}fCNfTV~g%!6NU8Ljy~ zVe4~_fXyn9#5x4;DmbIB=Okq~zo1E(O>O6m5JA_@Lrb_1@a5Dv06n;_pd4yFRXvs8 zEC>yN?p*hnetpC|^-%sKoPFmPP?|~oM%ly83W3A$Sqbh4EMGdZ;lQ1LDuqpp-<{;e zvHq7}-^l0CvMY^RRZA95HG3FFm7_PoEOIooPTJCy;-XZ<;C*kquKJ4OFkQbKL8;1T zEY|P6c0+lH@$Cfdc3|yU-mJ%~qkFYZJ~U95sl|7?ou3e%9}g=<<BO!wVGDQrnGEGBgeeau#{7woeA{&2X^Liq&u>i9@|i_ zk4zGPG}CSC7Ui>>o1eBBG3>oG6`S!c^|e!ii*3s#c@h}i%{6-JnPVq>w56AgQ=!H@ zEsQW3An1DKVWlX#F*eJ1Qa`L;C`ju6PDN7uIhRzS)**uAUKI=;OA=L<p_>c%ddZUAjME(LHj)0~f5g-YA@< z;T)J1&c_fD+KqpEn{k;bMthm?K1dg3<@5ABmc&WpWWf5wo~=ZXx1E*c-3lI-I)_d@y0()M zOt5Rf_wXPAkTj8!9w14(84_$u{dNvo8yPuKFn$m{F|A$1Rh0zYi?DLtcOD%qtDZ+! zQ^=YAev(;fWRo1}Py)iJ3safzXB@BX0$^FnB}G zmu&_leeF=^EfF~nZQxH<>AH4kYi5_*nKmH7SXi_~4FS1@aZ{4s6#Z-7Wa>q(+^J5BZe5QY0IGb=H!FceMaw zqXV0-CZ@$^dUX+xQbclr9d{L&3+67oGzj+8k&ags^ipmQ3ih?!%mWnk34C)2(_p(r9tnwpfJqGxNOF~6}N2P7{ks&9rb0F7A zpY_@>@Q^Oz_^nDGu8`?#@R6AxMD9tpRAFL(odv~T9pe4<&!HhJ{tU$>9Lx;{#GAS)vS|3-)ABZm9vF=}HmdfC2<*&Xuw zCPO|=6ZKPdAiO_s=)X{Y_$)`Zq->nji3Ao^UmW*&znj5)rGQIQG3x$^=6N8z~U zU`5$7Nv<(>3L%fs@eu~jU~P6Uk0BRwE`GyV=TbFMLDHu!H`L^nt%zA8r9>Gx=sta# z%ugsCTYR^=EL>W#`cU-TPnv%v3I@oz8Th$+z#6bZ_+3Qj{JIlP}5}aZ#5nUEB&`gu$755 zpDdlcj@cud+M4Fr;eLK%n@Xo3@wt7j%L(MKU8g=6Uh^zkW6aIYc5cvq z$xjobv%Het9*{y!fcDg%uw;9_J+V;8S^u%sU|cl)1)@~Y1+;|Do=Pu62BYEBhlqn; z^4@KocL-HquV}U6b6SD`Y^eMOLJm2=HC6@R6roDHq|gf}-iYQvL&|5Dgiub176`0a zok4BB^G2&;8RNs8i`ig&>!jAO?C6m9NsjMgoh@l%dpfb~^PdSNnPVKrDLUb7RnL`@ zZbCpKl5efQ{=Kd34{|NO5uh!RgGdo6;78ckit&vswNzB8P@jIMbqC)-obrec#NXgw zI_!teN7r@^)KYa_Ffs7Ix~9&MJWn({X8Z9SY zZEe{UPsi`$uwzxKU7=U?;@0MB=cW!)U`8F#WnmVVpavy~#>aZ-JSPYk>{G8aDn-5d zRn|w7Bx2vUefu5n?_2o#OygCwQkX3llPr>PN+npBF-u31h6p}N<>CfjXhV>IYZQT# zwc}+iNhEh$7-#lYz!iRW(kK0_Y+PRvPQ77_#9XPO^xiNKRJ@>KLrV>0mu;6^IJ*zw zQ_?T8=cItGyU*6Ml%^kJXj2%~m~K4vfti*}(-skTydu1QOZGNSvTsldJl9wl8CL$&5EvvRU?kLO{|n*6Dgwy8sh_s~aU ziQ8^~B=%=pbrLI59NJMN!na>@$ zNYkZ+xMO8$sMT;f;mg2-jm^O$NN%|7XMd-uCb&U(_nz@|q_2S_Geb;J@_f5xUBjfX z8Ac5^ki=U$CIY!?;c+O_f8-YFxm|W*i`CyC+639RurDKa8#GmQARCt}nYrIQ@J~Bb zmG)mKOE_ZpwwCs{?_DeOWfPpHkLsaFnoH^7z(liIEm)MRFM~}!4Ge}*>hK0&dR+88 zaE!Q?gLDrqg6Rec?-ucy{wctL&R0@Edc^)p@GQgfJV4FMsNH7;e;Zv~$F022&;H)U z?X-*Cw>E_`=f{>=k(aufa-pHUVdSf3EN3b~hOpai==I(V>W!!S+9Gg7A^Sw8A( zSuku?>TQ3-7sE95OjZP~xF90QK>93sK56((g=+mB_k|0E2abNvcO$^BWr5EGb_}3^ zGSIEfv|u;D^B$=<|62OzDR7QUvIlherTh_C7iHk;YXD|X!R?sQgfUx#H{?FOs`$Gq z7fL&?=~)KJTNN*6#rGl(U9iIw)iMHun?3+7`(babdD!OG=U@|RYz=FbK*Ny@#-swX z3o4ns75Cso+53?cc|)f``+x`gsFD0G7;nB}Rg*epSXRG|sCte?Rl=LA8{Ls15c>_y z`*`6b%6%Ri_IETwRc5cv2JRNJ#?TW-?gORyLJs1F<9nKfsTzdC_k0l;DM*t*h1W?W*Ht{GS3Bt zvPri2KG@d*B;DdsoTP!MfV*G!INVxjC0rEi6#+3Uc<4$WUQ<80BT7#dg-V=e2MyUS z!lka4;RKCUar!$FtTPILqGr4%s7^*4o_qpO^rl1UpHt9$#t;mnp8^tyhCEnbY#oD4 z+qv=kTw+KR7KF^uZAUrL8z{x!4m7OL40c|&{$Aanhll; zy|wEx|64+0o&7h%vabm)Jf6qOm>PEogr&|TRNx?h*V3sp6%%b8IqeBRCZo)E zu_nuCHd+HzIwCuDJH+YawT$l_uBr$1lDmF_Ga&#uSs);)hY=c)pTM{r*STA-1_iaJ zFmCy7$2oLG`0nGB05A9ZJ<{f@NSu*MP62xHo>2=}>xvaHUdT#DqkCwqQ|3zsW$?b2 z+wcdQlTR%Y3^5s;5th0#BdumUaIUVTEs2BP;0bT+CvxSHLEcgSPoiwGl&kYu;417TX zNJ96w3!OgbzfRib_H#+buZB9BW`591-@>MeWo28B7ExIl<~N{AOKkoWf6Aw!9l!Xy z;l%VXWy>lKjpMEfFXha(*{gPTcD+z70@fr`Z04h`Jn|`iz~9Wl4;)hcE_bcByvOF4 zdLiqcmYPULp%2q(=$q&tu)d?>#LnmY1UR3Gx5TSdPA1Z&!Ed5SYEe`?c4itb(`*@A zk##SRR_UThR2CBbha!uxcALDePe)QWUgCegIIJ6` z1(%&0geNfY7Rc!61SlpeAiR6VtO^OlHfRwl!QCj3m@JUeNk{sG4qq6QrjdMZHVSpg zz0zm;ExTcOHJ~j!{^@&6boyS95~r62jf?;B-3my0_Ex27b8P8e#NyXN@4QPj^wih^ zn^lm)r~%5dsIaTElb}c*hauZM)B@KOCZ~&RZObMySZFe=wCEhawkXO)ky9F+1p=JBvQJhQyt!LFVApg7H08ZCfYz7~}c zt+k5r#$oX}2!pErBC*s{g~(t!Z==UM@RZ(MvOlIGxi^0ym88dF?t)1+Cp7fx+~Zjo zL6@8#(nX(DM}*JlkYOkT5<>2scaqR-Qg*{w*z&^R3bg~7pF5i#Sk~s zd6%L**Ix}Ja+2LaWN{;$smP_RLKxfW4Xry@`V}`u`@iL^{^p)si8*fp)hzXjS+xZF z$mmkc;O397uGAl*xtX9E89M7@wyG?rF#IQUbER)8F!T8(#0ht%qbrvOTAIZ9f(rZ#z!>yjE)@KQ(KNNK7NCa8Aeqvf) zp|UId+YEm&@~t;7_m`x7r1i(dS}z|z*C5Vc)<5$Hxmx&cv^&#!d}nj`V?XK3;-$xj^|FtS{LYVG6?6ry%S%sJG9oQeXGgm(&HvAp(hJ_#WgOe0jl|V|+(_bcA zyuRLAZu8H5GJ>)WU+K(8P2r;i7GxH`x!q45*Kq90x2M%PTxKN${_l%Q_(V0S#m8Fq z+b@)hUX;8!0$gsZZo4AEMclo@`p55PfY0ZUX$Z0#F(5j5_UvgA2vgw#Q}SbOIh|fp z?#X$6u`nMJuPwzz=LY^ ziO)D8E+v!1R{2SGs(in@xT%$!B)ary3=;JRDA-j#TLiWV-JBw~>Af4V0P^pdi(R;H z()=MrzNqeI1598|6cqueAxX)2r)9*Rb1_a(_hxQx$d8ac14OZMK>e6{LmwXVA6Lq< zoF6>jJBx6*{m$Re$lXXw27CjfVoNIr8+Uu0aMzVPlu*kz|jjAa>tk5oSL#!OAKQEyXu_H126e2mkP(! zHQkr-4wTJD)8Y((iMIl(N%?ME_ztoW)~{oU;{o7;<8n0vdCVQBpoRulw*4QD8(mdzIb&Dx!u*PfLfk_CIW16^PSh zrDUGGI|xaCo<{AYsY%W%)^VamcIaC?G%g$Vf&XXTCSZ)^o3pW)*HkIE_>C#Ke_2&d z(6alRD#>M>{5G)D->djtSaX*@Q9Y*Yz!f;d<`jpYeE+BP)wx2G^bk*ZlUNq% znoK@2JBUgrUri~XB~~YM&1kqrr9JWrc3CNjpU;w4!42p?ZW*tbPg#?ClOUB=P2RI0 zoh?S4_i2tkZxZH%_hV3F33*_FCBvx^`G?6VdXk|l!35Q{ng0WOnkS6hIeYB_0A?Cy4!4kP>w$IVgT!V&KEiTkuT9DNoUHkIw?D#B zvS=xbWIKJjJ@)K(*u$4J6N@oja~O(49AO$c7WdW(sEMMjs2`fv4Evz*bKx@n;mPN< zz6+N|b*%Y}cqvAT+%Neb$tC@Qo}P=47bCDePFxz&%0sW{ZbQehY6c4DxfRPZ6lu4=5vt~!74z2FaC;B*?7uWL(u3sJHmPI%q6 zESp(9Q+yK&?L~xMm*g#qyFV@t5qmq!$|5J-Shgo961usz;!F8)tKacxYaGr$ERre{*1Es z9qm$&tpMra_m~PyXJlR6Vk{5ATLYF}+YZCqS+c+Ua=rYi^h1p*`^h(iw-4y;^~=q+ zfgD=qB2D<@2ZWj$N){Eu_A*v6A77ctxFjlN7~q%lLZnj;pC*t(*hhNmwBt9~PnNBb zixB)Q@mow4VDGSy22tMFdC2l<#XX${746pz#OZI0Y$h9w3aBPcenJR|%Bxa7QDFJg zK0Oe_6ZfiFNlK<4mYLn)bCFZ1V$;5=bmF?n|DA0GlP zUE&4IMgfuxoz&VFf&+W^jL|lX#KE`tMVJO_S!XB#gJA8|ln*VGeka^Bi@}PodY-EV zv7*2ODiPgxd{#xI*FsJ(ci<5r%sxW_5UB+UFvU3AM`#x}v4|d}2Y1Pd z-(%?V_Gvdet`pYhn=~!RQ6g7>WY7Kq>7V$*Kc`isiKjq?%^2p>Y|(d&-+5CM=j(ax zM$&3D&$6>0@bh+GT{pyO6@6F{KEm#4smD&9`U-NU>;LjW?@GU|&hs<>cBVrA(Fg8S zJSF_nw8Q+P1^JiZ{(i~wGQQ32Z~14o^oN!Bcd=Z&e2oRaU00Z^`HU?72g3Yoa4I~R zpt}82R{HOQx8XS&<$!|2Pq@Rs%OW5MFS1{Q@1NrE--r5Z#^24K<9VCcE2TpJxOvDp zfZAl$g1-}FpIv&!nEq`ZhL-0D?>~+a`O!LFuG0J*bn5_QwYErf79#!QD}P@kL*q;4 z%%&NBOUg{a45SbKLy@t78GxF=G6T|HAgQLm!mE zy#@ezpIld-MYvu2k!jx2xP5Z+$$Y5g@T}~i2*vZ+C1{uVS>EQK{Gs@K0L)|8K1F8) zl44bhND2YlX}zyf`kp{MrE4`=4~~w#JeJ(;p^e6W)vE-q%z;u>|Aj~Da+X_-(4sqp z;Qr0?Biz0}aB|B`{KH_C%RKsUHeZH_l1>p~c+I%N+ev1gB5=FAwtnbcyuNSKPT-cIMFp TdHBO$@PAKbl%-1_8UFr15!>LM literal 0 HcmV?d00001 diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index e95409e08e9..e807ee54fbf 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import dataclasses import inspect @@ -299,6 +299,7 @@ def __init__( extra_kwargs["delay_wgrad_compute"] = self.config.delay_wgrad_compute else: raise RuntimeError("Only TE with version >=2.3.0 supports delay_wgrad_compute now.") + if ( self.config.tp_comm_overlap and tp_comm_buffer_name @@ -2116,3 +2117,12 @@ def set_save_original_input(module): "set_save_original_input is only needed on transformer-engine modules that save " "quantized tensors by default. It needs transformer-engine>=2.6.0dev0." ) + + +try: + # pylint: disable=unused-import + from transformer_engine.pytorch import cpu_offload + from transformer_engine.pytorch.float8_tensor import Float8Tensor +except ImportError: + Float8Tensor = None + cpu_offload = None diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index d501c11a0a9..74b9a90764d 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from contextlib import nullcontext from typing import Optional @@ -8,6 +8,9 @@ from megatron.core.enums import Fp8Recipe from megatron.core.fp8_utils import get_fp8_context +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import ( AbstractSchedulePlan, NoopScheduleNode, @@ -450,6 +453,8 @@ def run( f_layer = f_schedule_plan.get_layer(i) b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i) torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_num_layers - 1 - i}b") + if f_layer.layer.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, b_grad = TransformerLayerSchedulePlan.run( f_layer, b_layer, @@ -472,6 +477,8 @@ def run( for i in range(overlapped_layers, f_num_layers): f_layer = f_schedule_plan.get_layer(i) torch.cuda.nvtx.range_push(f"layer_{i}f") + if f_layer.layer.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, _ = TransformerLayerSchedulePlan.run(f_layer, None, f_input=f_input) torch.cuda.nvtx.range_pop() diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index fd1cc3d33c6..786a1b850dd 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import weakref from contextlib import nullcontext @@ -8,6 +8,11 @@ import torch from megatron.core import tensor_parallel +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer @@ -350,13 +355,17 @@ def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor) Run forward pass for computations between attention and dispatch: pre mlp layernorm->router->dispatch preprocess """ + if layer.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") if layer.recompute_pre_mlp_layernorm: layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) else: - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) local_tokens, probs, _ = layer.mlp.router_and_preprocess(pre_mlp_layernorm_output) @@ -437,6 +446,10 @@ def submodule_combine_forward( hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout ) + if layer.offload_mlp_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="mlp_norm", forced_released_tensors=[residual] + ) output = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 654827dc6fb..209fdc9530d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from collections import OrderedDict from typing import Dict, Literal, Optional @@ -18,6 +18,9 @@ ) from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_init_chunk_handler, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region @@ -117,6 +120,7 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.vp_stage = vp_stage + self.disable_param_offloading = True if hasattr(self.config, 'position_embedding_type'): self.position_embedding_type = self.config.position_embedding_type @@ -410,6 +414,24 @@ def _preprocess( return preproc_output + def preprocess_for_fine_grained_offloading(self): + """Preprocess for fine-grained activation offloading.""" + fine_grained_offloading_init_chunk_handler( + vp_size=self.config.virtual_pipeline_model_parallel_size, + vp_stage=self.vp_stage, + min_offloaded_tensor_size=self.config.min_offloaded_tensor_size, + ) + if self.disable_param_offloading: + for param in self.decoder.parameters(): + param.offloading_activation = False + if self.mtp_process: + for param in self.mtp.parameters(): + param.offloading_activation = False + if self.post_process: + for param in self.output_layer.parameters(): + param.offloading_activation = False + self.disable_param_offloading = False + def forward( self, input_ids: Tensor, @@ -435,6 +457,8 @@ def forward( runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. """ + if self.config.fine_grained_activation_offloading: + self.preprocess_for_fine_grained_offloading() inference_context = deprecate_inference_params(inference_context, inference_params) @@ -701,6 +725,9 @@ def build_schedule_plan( TransformerModelChunkSchedulePlan: The model chunk schedule plan. """ + if self.config.fine_grained_activation_offloading: + self.preprocess_for_fine_grained_offloading() + from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan return TransformerModelChunkSchedulePlan( diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py new file mode 100644 index 00000000000..1e280a09d35 --- /dev/null +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -0,0 +1,609 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import warnings +from collections import deque +from contextlib import nullcontext +from typing import Any + +import torch + +# CPU offload implementation for pipeline parallelism +DEBUG = False +DEBUG_RANK = 0 + + +def debug_rank(message): + """Print debug message for a specific rank when DEBUG is enabled.""" + # pylint: disable=bad-builtin + if not DEBUG: + return + assert torch.distributed.is_initialized() + if torch.distributed.get_rank() == DEBUG_RANK: + print(message) + + +def set_ideal_affinity_for_current_gpu(): + """Set CPU affinity for the current GPU to optimize host-device transfers.""" + import uuid + + try: + import cuda.bindings.driver as cuda_driver + import cuda.bindings.runtime as cuda_runtime + except ImportError: + try: + import cuda.cuda as cuda_driver + import cuda.cudart as cuda_runtime + except ImportError: + # print("cuda-python may not be installed, skipping GPU affinity setting") + warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") + return + try: + import pynvml + except ImportError: + warnings.warn("pynvml is not installed, skipping GPU affinity setting") + return + + # Get current CUDA device ID + err, device_id = cuda_runtime.cudaGetDevice() + assert err == cuda_runtime.cudaError_t.cudaSuccess + # Get device UUID + err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) + assert err == cuda_driver.CUresult.CUDA_SUCCESS + # Set CPU affinity based on GPU's NUMA node + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) + pynvml.nvmlDeviceSetCpuAffinity(handle) + + +class PipelineOffloadManager: + """ + Singleton manager for coordinating activation offloading across pipeline stages. + Manages chunk handlers, synchronizes GPU-CPU transfers, + and handles virtual pipeline parallelism. + """ + + OFFLOAD_MGR = None + + @classmethod + def get_instance(cls): + """Get the singleton instance of PipelineOffloadManager.""" + if cls.OFFLOAD_MGR is None: + cls.OFFLOAD_MGR = PipelineOffloadManager() + return cls.OFFLOAD_MGR + + def __init__(self): + """Initialize the manager with queues and dedicated CUDA streams.""" + # Queue to store chunk handlers for backward pass + self._queue = deque() + # Cache chunk handlers for each virtual pipeline stage + self._stages = None + # allocate streams and events for synchronization + self._d2h_stream = torch.cuda.Stream() + self._h2d_stream = torch.cuda.Stream() + self.reset() + + @property + def d2h_stream(self): + """Get the device-to-host (GPU to CPU) transfer stream.""" + return self._d2h_stream + + @property + def h2d_stream(self): + """Get the host-to-device (CPU to GPU) transfer stream.""" + return self._h2d_stream + + def reset(self): + """Reset manager state for a new training iteration.""" + set_ideal_affinity_for_current_gpu() + self._inside_context = False + self._cur_forward_chunk = None + self._cur_backward_chunk = None + # Track the first microbatch of the last virtual pipeline stage + self._is_first_last_vpp_chunk = True + + def flush(self): + """Flush all staged chunks to the backward queue in reverse order.""" + # Ensure all virtual pipeline stages have the same number of chunks + if len(self._stages[0]) == len(self._stages[-1]): + lens = [len(e) for e in self._stages] + assert min(lens) == max(lens), "All stages must have same chunk count" + # Clear the last stage and push all chunks in reverse order for backward + self._stages[-1] = [] + for chunks in reversed(self._stages): + for chunk in chunks: + self.push(chunk) + # Clear all stages after flushing + for i in range(self._vpp): + self._stages[i] = [] + + def push(self, handler): + """Add a chunk handler to the backward queue.""" + debug_rank(f"pushing handler {handler}") + self._queue.append(handler) + + def pop(self): + """Remove and set the next non-empty chunk as the current backward chunk.""" + assert self.size(), "Cannot pop from empty queue" + while self._queue: + self._cur_backward_chunk = self._queue.popleft() + if not self._cur_backward_chunk.is_empty_chunk(): + break + debug_rank(f"popping handler {self._cur_backward_chunk}") + + def front(self): + """Get the first non-empty chunk handler without removing it from the queue.""" + if not self.size(): + return None + for chunk_handler in self._queue: + if not chunk_handler.is_empty_chunk(): + return chunk_handler + return None + + def size(self): + """Return the number of chunk handlers in the queue.""" + return len(self._queue) + + def init_model_chunk_offload_handler( + self, vp_size, vp_stage, min_offloaded_tensor_size=1024 * 1024 + ): + """ + Initialize a chunk offload handler for a model chunk (microbatch). + + Args: + vp_size: Virtual pipeline size + vp_stage: Virtual pipeline stage index (None means stage 0) + min_offloaded_tensor_size: Minimum tensor size (in elements) to offload + """ + if self._stages is None: + vp_size = 1 if vp_size is None else vp_size + self._vpp = vp_size + self._stages = [[] for _ in range(vp_size)] + + if vp_stage is None: + cur_vpp_rank = 0 + else: + cur_vpp_rank = vp_stage + + is_first_last_vpp_chunk = self._is_first_last_vpp_chunk + # Flush staged chunks when reaching the last virtual pipeline stage + if cur_vpp_rank == self._vpp - 1: + self.flush() + # Determine if this is the first microbatch of the last virtual pipeline stage + is_first_last_vpp_chunk = is_first_last_vpp_chunk and (cur_vpp_rank == self._vpp - 1) + + cur_chunk = ChunkOffloadHandler(is_first_last_vpp_chunk, min_offloaded_tensor_size) + self._stages[cur_vpp_rank].append(cur_chunk) + # For the last stage, push immediately and flush + if cur_vpp_rank == self._vpp - 1: + self._is_first_last_vpp_chunk = False + self.push(cur_chunk) + self.flush() + self._cur_forward_chunk = cur_chunk + cur_chunk.vpp_rank = cur_vpp_rank + + def set_last_layer(self, is_last_layer): + """Mark whether the current forward chunk is processing the last layer.""" + self._cur_forward_chunk.is_last_layer = is_last_layer + + def cur_forward_chunk(self): + """Get the current forward pass chunk handler.""" + return self._cur_forward_chunk + + def cur_backward_chunk(self): + """Get the current backward pass chunk handler.""" + return self._cur_backward_chunk + + def __enter__(self): + """Enter context manager to enable activation offloading hooks.""" + debug_rank("----__enter__") + from megatron.core.extensions.transformer_engine import cpu_offload + + if cpu_offload is not None: + cpu_offload.CPUOffloadEnabled = True + self.inside_context = True + + torch._C._autograd._push_saved_tensors_default_hooks( + self.on_save_for_backward, self.on_get_saved_tensor + ) + + def __exit__(self, *args: Any): + """Exit context manager and restore original tensor saving behavior.""" + debug_rank("----__exit__") + from megatron.core.extensions.transformer_engine import cpu_offload + + if cpu_offload is not None: + cpu_offload.CPUOffloadEnabled = False + self.inside_context = False + torch._C._autograd._pop_saved_tensors_default_hooks() + + def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + """ + Hook called when autograd saves a tensor for backward pass. + Returns a tag to identify the tensor later. + """ + debug_rank(f"------on_save_for_backward {tensor.shape}") + assert self.inside_context, "Must be inside offload context" + return self.cur_forward_chunk().tensor_push(tensor) + + def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + """ + Hook called when autograd retrieves a saved tensor during backward pass. + Returns the actual tensor (potentially reloading from CPU). + """ + debug_rank(f"----on_get_saved_tensor {saved_state}") + return self.cur_backward_chunk().tensor_pop(saved_state) + + +class ChunkOffloadHandler: + """ + Handles activation offloading and reloading for a single pipeline chunk (microbatch). + Manages tensor groups, coordinates asynchronous GPU-CPU transfers, and handles synchronization. + """ + + @staticmethod + def offload(src_tensor, pin_memory=True): + """Offload.""" + debug_rank("--------offload") + from megatron.core.extensions.transformer_engine import Float8Tensor + + fp8_offload = isinstance(src_tensor, Float8Tensor) if Float8Tensor is not None else False + + if not src_tensor.is_contiguous(): + src_tensor = src_tensor.contiguous() + + cpu_backup = torch.empty( + src_tensor.size(), + dtype=torch.uint8 if fp8_offload else src_tensor.dtype, + layout=src_tensor.layout, + device="cpu", + pin_memory=pin_memory, + ) + + if fp8_offload: + cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) + + cpu_backup.copy_(src_tensor, non_blocking=pin_memory) + state = (src_tensor.device, cpu_backup) + return state + + @staticmethod + def reload(state, non_blocking=None): + """Reload.""" + debug_rank("------reload") + dev, cpu_backup = state + if non_blocking is None: + non_blocking = cpu_backup.is_pinned() + return cpu_backup.to(dev, non_blocking=non_blocking) + + def __init__(self, is_first_last_vpp_chunk, min_offloaded_tensor_size): + # Data Structure to maintain reference to activation tensors + self._tensor_tag_to_state = {} + # Mark the first microbatch of the last virtual pipeline stage + self._is_first_last_vpp_chunk = is_first_last_vpp_chunk + + # Group management for batching offload/reload operations + self._offloaded_group_index = 0 + self._groups_to_offload = [] + self._groups_to_reload = [] + self._tensor_count_current_group = 0 + + # Counter for special torch tensor types (FakeTensor, FunctionalTensor) + self.torch_tensor_count = 0 + self.d2h_stream = PipelineOffloadManager.get_instance().d2h_stream + self.h2d_stream = PipelineOffloadManager.get_instance().h2d_stream + self._offload_events = {} + self._reload_events = {} + self.min_offloaded_tensor_size = min_offloaded_tensor_size + self.is_last_layer = False + + def is_empty_chunk(self): + """Check if this chunk has no tensors to manage.""" + return len(self._tensor_tag_to_state) == 0 + + def is_first_last_layer(self): + """ + Check if this is the last layer of the first microbatch of the last vp stage. + These tensors should not be offloaded to avoid unnecessary overhead. + """ + debug_rank( + f"------is_first_last_layer {self._is_first_last_vpp_chunk} {self.is_last_layer}" + ) + return self._is_first_last_vpp_chunk and self.is_last_layer + + def tensor_push(self, tensor): + """Push tensor to the offload handler.""" + torch_stray_tensor = isinstance( + tensor, + ( + torch._subclasses.fake_tensor.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor, + ), + ) + + if not torch_stray_tensor: + # Assign unique tag based on group index and position within group + tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group) + self._tensor_count_current_group += 1 + assert tensor_tag not in self._tensor_tag_to_state, "Duplicate tensor tag" + self._tensor_tag_to_state[tensor_tag] = tensor + else: + # Use negative group ID for special tensor types + tensor_tag = (-1, self.torch_tensor_count) + self.torch_tensor_count += 1 + self._tensor_tag_to_state[tensor_tag] = tensor + debug_rank(f"--------tensor_push {tensor_tag}") + return tensor_tag + + def tensor_pop(self, tensor_tag): + """Pop tensor from the offload handler.""" + debug_rank(f"--------tensor_pop {tensor_tag}") + assert tensor_tag in self._tensor_tag_to_state, f"Tag {tensor_tag} not found" + tensor = self._tensor_tag_to_state.pop(tensor_tag) + # If tensor is offloaded (stored as tuple), reload it + if isinstance(tensor, tuple): + tensor = self.reload(tensor) + debug_rank(f"--------tensor_pop {tensor.shape}") + return tensor + + def tensor_need_offloading_checker(self, tensor): + """Check if the tensor needs to be offloaded.""" + if tensor.numel() < self.min_offloaded_tensor_size: + return False + # Respect tensor's offload preference if specified + if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation: + return False + return True + + def bulk_offload_group(self, group_to_offload): + """offload a group of tensors recorded in tensor_push().""" + debug_rank("------bulk_offload_group") + assert not self.is_first_last_layer(), "Should not offload first-last layer" + group_id_to_offload, name = group_to_offload + torch.cuda.nvtx.range_push("activation offloading " + name) + with torch.cuda.stream(self.d2h_stream): + for tensor_tag, state in self._tensor_tag_to_state.items(): + group_id, _ = tensor_tag + if group_id == group_id_to_offload: + debug_rank(f"------tensor_tag {tensor_tag}") + debug_rank(f"------group_to_offload {group_to_offload}") + assert not isinstance(state, tuple), "Tensor already offloaded" + tensor_on_device = state + if self.tensor_need_offloading_checker(tensor_on_device): + state = self.offload(tensor_on_device) + event = torch.cuda.Event() + event.record(self.d2h_stream) + self._offload_events[name] = event + tensor_on_device.record_stream(self.d2h_stream) + self._tensor_tag_to_state[tensor_tag] = state + torch.cuda.nvtx.range_pop() + + def get_offload_event(self, name): + """Get the CUDA event for a named offload operation.""" + return self._offload_events.get(name, None) + + def get_reload_event(self, name): + """Get the CUDA event for a named reload operation.""" + return self._reload_events.get(name, None) + + def bulk_reload_group(self, group_to_reload): + """Bulk reload group.""" + debug_rank("----bulk_reload_group") + found_reload_group = False + group_id_to_reload, name = group_to_reload + torch.cuda.nvtx.range_push("activation reloading " + name) + with torch.cuda.stream(self.h2d_stream): + for tensor_label, state in self._tensor_tag_to_state.items(): + group_id, _ = tensor_label + if group_id == group_id_to_reload: + debug_rank(f"----tensor_label {tensor_label}") + found_reload_group = True + event = self.get_offload_event(name) + # Only reload if tensor was offloaded (stored as tuple) + if isinstance(state, tuple): + # Wait for offload to complete before reloading + torch.cuda.current_stream().wait_event(event) + recovered_tensor = self.reload(state) + event.record(self.h2d_stream) + self._reload_events[name] = event + debug_rank(f"----recovered_tensor {recovered_tensor.shape}") + self._tensor_tag_to_state[tensor_label] = recovered_tensor + torch.cuda.nvtx.range_pop() + return found_reload_group + + def pre_reload_last_layer(self): + """Pre-reload the last layer of this chunk to hide reload latency.""" + debug_rank("pre_reload_last_layer") + assert not self._is_first_last_vpp_chunk, "Should not pre-reload first chunk" + debug_rank(f"len(self._groups_to_reload) {len(self._groups_to_reload)}") + if len(self._groups_to_reload) > 0: + # Reload the last group (last layer) early + if self.bulk_reload_group(self._groups_to_reload[-1]): + self._groups_to_reload.pop() + + def should_bulk_offload(self): + """Determine if the current group should be offloaded.""" + # Don't offload the first backward chunk's last layer + if self.is_first_last_layer(): + return False + + # Check if next backward chunk is this chunk (for last pipeline stage) + next_backward_chunk = PipelineOffloadManager.get_instance().front() + if next_backward_chunk is not None and next_backward_chunk is self: + # Don't offload last layer if it's about to be used immediately + if self.is_last_layer: + return False + + return True + + def bulk_offload(self, forced_released_tensors): + """Offload a group of tensors and optionally release their GPU memory.""" + debug_rank("----bulk_offload") + if self.should_bulk_offload(): + group_to_offload = self._groups_to_offload.pop() + self._groups_to_reload.append(group_to_offload) + self.bulk_offload_group(group_to_offload) + # Manually release tensors not auto-freed by torch GC + if len(forced_released_tensors) > 0: + cur_stream = torch.cuda.current_stream() + for release_tensor in forced_released_tensors: + if self.tensor_need_offloading_checker(release_tensor): + # Ensure tensor is not in use before freeing + release_tensor.record_stream(cur_stream) + release_tensor.untyped_storage().resize_(0) + + def on_group_commit_forward(self, forced_released_tensors): + """Called at the end of a layer group's forward pass to trigger offloading.""" + debug_rank("--on_group_commit_forward") + # Wait for compute to finish before starting offload + self.d2h_stream.wait_stream(torch.cuda.current_stream()) + self.bulk_offload(forced_released_tensors) + + def bulk_reload(self): + """Reload the next group of tensors from CPU to GPU.""" + debug_rank("--bulk_reload") + if len(self._groups_to_reload) > 0: + # Reload the next layer group + if self.bulk_reload_group(self._groups_to_reload[-1]): + debug_rank(f"--bulk_reload_group {self._groups_to_reload}") + self._groups_to_reload.pop() + else: + # Pre-load the last layer of the next backward chunk to hide latency + next_backward_chunk = PipelineOffloadManager.get_instance().front() + if next_backward_chunk is not None: + next_backward_chunk.pre_reload_last_layer() + + def on_group_commit_backward(self, name): + """ + Called at the end of a layer group's backward pass. + Ensures correct chunk is active and synchronizes reloads. + """ + debug_rank("--on_group_commit_backward") + cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() + # Switch to this chunk if it's not already current + if cur_backward_chunk is not self: + PipelineOffloadManager.get_instance().pop() + cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() + assert cur_backward_chunk is self, "Chunk mismatch" + # Wait for reload to complete before using tensors + event = self.get_reload_event(name) + if event is not None: + torch.cuda.current_stream().wait_event(event) + self._offloaded_group_index = self._offloaded_group_index - 1 + + def on_group_start_forward(self, name): + """ + Called at the start of a layer group's forward pass. + Increments group index and prepares for offloading. + """ + debug_rank(f"--on_group_start_forward") + self._offloaded_group_index = self._offloaded_group_index + 1 + self._tensor_count_current_group = 0 + self._groups_to_offload.append((self._offloaded_group_index, name)) + + def on_group_start_backward(self): + """ + Called at the start of a layer group's backward pass. + Triggers reloading of tensors from CPU. + """ + debug_rank("--on_group_start_backward") + # Wait for compute to finish before starting reload + self.h2d_stream.wait_stream(torch.cuda.current_stream()) + self.bulk_reload() + + +class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): + """ + Identity operation that marks the end of a layer group for offload synchronization. + Triggers offload during forward and synchronizes reload during backward. + """ + + @staticmethod + def forward(ctx, *args): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupCommitFunction forward") + + forced_released_tensors = args[-1] + name = args[-2] + cpu_offload_handler = args[-3] + tensor = args[:-3] + cpu_offload_handler.on_group_commit_forward(forced_released_tensors) + ctx.cpu_offload_handler = cpu_offload_handler + ctx.name = name + + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, *grad_output): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupCommitFunction backward") + + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_commit_backward(ctx.name) + return grad_output + (None, None, None) + + +def fine_grained_offloading_group_commit(*tensor, name, forced_released_tensors=[]): + """ + Specify the tensors to be released after offloading. + forced_released_tensors is a list of tensors to be released after offloading. + The tensors will be untyped_storage().resize_(0) after offloading. + Note: specify the tensors only when they are not automatically released by torch gc. + """ + cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + return FineGrainedOffloadingGroupCommitFunction.apply( + *tensor, cur_forward_chunk, name, forced_released_tensors + ) + + +class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function): + """ + Identity operation that marks the start of a layer group for offload/reload. + Prepares for offload during forward and triggers reload during backward. + """ + + @staticmethod + def forward(ctx, tensor, cpu_offload_handler, name): + # pylint: disable=missing-function-docstring + ctx.cpu_offload_handler = cpu_offload_handler + debug_rank("FineGrainedOffloadingGroupStartFunction forward") + + cpu_offload_handler.on_group_start_forward(name) + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, grad_output): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupStartFunction backward") + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_start_backward() + return grad_output, None, None + + +def fine_grained_offloading_group_start(tensor, name=None): + """Mark the start of a layer group and prepare for offload/reload.""" + cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name) + + +def get_fine_grained_offloading_context(flag): + """Get the fine-grained offload context""" + return PipelineOffloadManager.get_instance() if flag else nullcontext() + + +def fine_grained_offloading_set_last_layer(is_last_layer): + """Set the last layer flag.""" + PipelineOffloadManager.get_instance().set_last_layer(is_last_layer) + + +def fine_grained_offloading_init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size): + """Initialize the chunk handler, called at the start of a microbatch forward pass.""" + PipelineOffloadManager.get_instance().init_model_chunk_offload_handler( + vp_size, vp_stage, min_offloaded_tensor_size + ) + + +def fine_grained_offloading_reset(): + """Reset the chunk handler, called at the start of a training iteration.""" + PipelineOffloadManager.get_instance().reset() diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index e83f8d90635..09f95ac25d2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import contextlib from functools import partial @@ -9,6 +9,9 @@ from megatron.core import parallel_state from megatron.core.enums import ModelType +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_reset, +) from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator from megatron.core.pipeline_parallel.utils import ( is_pp_first_stage, @@ -562,6 +565,9 @@ def forward_backward_no_pipelining( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext @@ -898,6 +904,9 @@ def forward_backward_pipelining_with_interleaving( adjust_tensor_shapes_fn is None ), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism" + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + if config.overlap_p2p_comm and config.batch_p2p_comm: raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") @@ -2043,6 +2052,9 @@ def forward_backward_pipelining_without_interleaving( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + # Disable async grad reductions no_sync_func = config.no_sync_func if no_sync_func is None: diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 54cac0e41e3..5a44c38713d 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch @@ -510,10 +510,14 @@ def forward(ctx, run_function, checkpoint_without_output_obj, *args): @staticmethod def backward(ctx, *args): """Backward pass.""" - inputs = ctx.saved_tensors + # Get the inputs from the context instead of the saved tensors + # because the saved tensors are already cached by the recomputation. + # This is to avoid double-reloading the inputs in CPU offloading scenario. + inputs = ctx.inputs outputs = ctx.outputs torch.autograd.backward(outputs, args) ctx.outputs = None + ctx.inputs = None grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs) return (None, None) + grads @@ -573,8 +577,10 @@ def _recompute(self, _): recompute_ctx = contextlib.nullcontext() fp8_ctx = contextlib.nullcontext() + # Store the inputs for backward pass + inputs = self.ctx.saved_tensors with torch.enable_grad(), fp8_ctx, recompute_ctx: - outputs = self.run_function(*self.ctx.saved_tensors) + outputs = self.run_function(*inputs) self.run_function = None self.rng_states = None @@ -590,6 +596,7 @@ def _recompute(self, _): output.untyped_storage().copy_(recomputation_output.untyped_storage()) self.ctx.outputs = outputs + self.ctx.inputs = inputs self.outputs = None self.ctx = None diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index d4e990041ca..af6dada6746 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod from dataclasses import dataclass @@ -22,6 +22,11 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule @@ -188,6 +193,21 @@ def __init__( and "core_attn" in self.config.recompute_modules ) + self.offload_qkv_linear = ( + self.config.fine_grained_activation_offloading + and "qkv_linear" in self.config.offload_modules + ) + + self.offload_core_attention = ( + self.config.fine_grained_activation_offloading + and "core_attn" in self.config.offload_modules + ) + + self.offload_attn_proj = ( + self.config.fine_grained_activation_offloading + and "attn_proj" in self.config.offload_modules + ) + # Output. self.linear_proj = build_module( submodules.linear_proj, @@ -730,9 +750,17 @@ def forward( if output_gate: assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor." - qkv_output = self.get_query_key_value_tensors( - hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv - ) + if self.offload_qkv_linear: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="qkv_linear") + with get_fine_grained_offloading_context(self.offload_qkv_linear): + qkv_output = self.get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) + if self.offload_qkv_linear: + (qkv_output,) = fine_grained_offloading_group_commit( + qkv_output, name="qkv_linear", forced_released_tensors=[] + ) + attn_mask_type = self.attn_mask_type block_table = None gate = None @@ -881,17 +909,20 @@ def forward( packed_seq_params=packed_seq_params, ) else: + if self.offload_core_attention and self.training: + query = fine_grained_offloading_group_start(query, name="core_attn") if inference_context is None or inference_context.is_static_batching(): # Static batching attention kernel. - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - attn_mask_type=attn_mask_type, - attention_bias=attention_bias, - packed_seq_params=packed_seq_params, - ) + with get_fine_grained_offloading_context(self.offload_core_attention): + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + ) else: # Dynamic batching attention kernel. @@ -911,6 +942,10 @@ def forward( block_table, ) core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') + if self.offload_core_attention and self.training: + (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] + ) if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': # reshape to same output shape as unpacked case @@ -931,7 +966,14 @@ def forward( # ================= nvtx_range_push(suffix="linear_proj") - output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") + with get_fine_grained_offloading_context(self.offload_attn_proj): + output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + output, bias = fine_grained_offloading_group_commit( + output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + ) nvtx_range_pop(suffix="linear_proj") return output, bias diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index 0a933aed0df..a44daea38e2 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -210,6 +210,20 @@ Enable A2A overlap across different batches inspired by the DSv3 DualPipe implme --delay-wgrad-compute ``` +### Fine-grained Activation Offloading (collaborated with rednote) +Offload the input activation at the granularity of modules + +**Usage** +```bash +# Enable fine-grained activation offloading +--fine-grained-activation-offloading + +# Specify which modules are going to offload its input +# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". +--offload-modules expert_fc1 +``` +For more details, please refer to the ```docs/source/api-guide/fine_grained_activation_offloading.md``` + ### MoE Related Arguments | Item | Description | | --- | --- | diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index d0ac20a7536..ca308da0d21 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy import itertools @@ -27,6 +27,11 @@ from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl from megatron.core.jit import jit_fuser +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.tensor_parallel.layers import ( _initialize_affine_weight_cpu, _initialize_affine_weight_gpu, @@ -825,6 +830,16 @@ def __init__( tp_group=pg_collection.expt_tp, ) + self.offload_expert_fc1 = ( + self.config.fine_grained_activation_offloading + and "expert_fc1" in self.config.offload_modules + ) + + self.offload_moe_act = ( + self.config.fine_grained_activation_offloading + and "moe_act" in self.config.offload_modules + ) + self.activation_recompute = ( self.config.recompute_granularity == 'selective' and "moe_act" in self.config.recompute_modules @@ -834,6 +849,12 @@ def __init__( set_save_original_input(self.linear_fc2) + # This is to avoid the CPU overhead of multiple d2h copies + if self.offload_expert_fc1 and not (self.config.fp8 or self.config.fp4): + from megatron.core.extensions.transformer_engine import set_save_original_input + + set_save_original_input(self.linear_fc1) + if self.config.fp8 or self.config.fp4: assert HAVE_TE, "FP8 and FP4 requires TE." self.quantization_padding = Fp8Padding(self.num_local_experts) @@ -898,9 +919,21 @@ def forward( # Probs already applied, so reset to 1. permuted_probs = torch.ones_like(permuted_probs) - intermediate_parallel, bias_parallel = self.linear_fc1( - permuted_local_hidden_states, tokens_per_expert - ) + if self.offload_expert_fc1: + permuted_local_hidden_states = fine_grained_offloading_group_start( + permuted_local_hidden_states, name="expert_fc1" + ) + with get_fine_grained_offloading_context(self.offload_expert_fc1): + fc1_output, bias_parallel = self.linear_fc1( + permuted_local_hidden_states, tokens_per_expert + ) + if self.offload_expert_fc1: + fc1_output, bias_parallel = fine_grained_offloading_group_commit( + fc1_output, + bias_parallel, + name="expert_fc1", + forced_released_tensors=[permuted_local_hidden_states], + ) def bias_act_func(intermediate_parallel, bias_parallel, permuted_probs): if self.config.use_te_activation_func: @@ -960,18 +993,26 @@ def glu(x): intermediate_parallel = intermediate_parallel.to(original_dtype) return intermediate_parallel + if self.offload_moe_act: + fc1_output = fine_grained_offloading_group_start(fc1_output, name="moe_act") + if self.activation_recompute: self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput() - intermediate_parallel = self.activation_checkpoint.checkpoint( - bias_act_func, intermediate_parallel, bias_parallel, permuted_probs - ) - output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) - self.activation_checkpoint.discard_output_and_register_recompute(output) + with get_fine_grained_offloading_context(self.offload_moe_act): + bias_act_output = self.activation_checkpoint.checkpoint( + bias_act_func, fc1_output, bias_parallel, permuted_probs + ) else: - intermediate_parallel = bias_act_func( - intermediate_parallel, bias_parallel, permuted_probs + with get_fine_grained_offloading_context(self.offload_moe_act): + bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs) + + output, output_bias = self.linear_fc2(bias_act_output, tokens_per_expert) + if self.activation_recompute: + self.activation_checkpoint.discard_output_and_register_recompute(output) + if self.offload_moe_act: + (output,) = fine_grained_offloading_group_commit( + output, name="moe_act", forced_released_tensors=[fc1_output] ) - output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) # upad and concat the output if self.config.fp8 or self.config.fp4: diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index a8893ebec36..5d3f16c1041 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import math @@ -22,6 +22,11 @@ _yarn_get_mscale, apply_rotary_pos_emb, ) +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.layers import ColumnParallelLinear from megatron.core.tensor_parallel.mappings import ( @@ -266,15 +271,19 @@ def forward( query, key, value, attention_mask, packed_seq_params=packed_seq_params ) else: + if self.offload_core_attention and self.training: + query = fine_grained_offloading_group_start(query, name="core_attn") + if inference_context is None or inference_context.is_static_batching(): - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - packed_seq_params=packed_seq_params, - attn_mask_type=attn_mask_type, - ) + with get_fine_grained_offloading_context(self.offload_core_attention): + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) elif self.cache_mla_latents: # Dynamic batching attention kernel. q, k, v = (query, key, value) @@ -295,6 +304,10 @@ def forward( # Only rearrange if not in absorption mode (Flash MLA handles format correctly) if not inference_context.is_decode_only(): core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') + if self.offload_core_attention and self.training: + (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] + ) # We are doing absorption with cache mla latents and decode mode. if self.cache_mla_latents and inference_context.is_decode_only(): @@ -320,7 +333,14 @@ def forward( # ================= # Output. [sq, b, h] # ================= - output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") + with get_fine_grained_offloading_context(self.offload_attn_proj): + output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + output, bias = fine_grained_offloading_group_commit( + output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + ) return output, bias diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index bd3aa9c8c96..a619b9ffa55 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from contextlib import nullcontext from dataclasses import dataclass @@ -13,6 +13,9 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( @@ -901,6 +904,8 @@ def forward( hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] for layer_number in range(len(self.layers)): + if self.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(layer_number == len(self.layers) - 1) (hidden_states, input_ids, position_ids) = self.layers[layer_number]( input_ids=input_ids, position_ids=position_ids, diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index aead6133f22..06e8f1372f4 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging from contextlib import nullcontext from dataclasses import dataclass @@ -16,6 +16,9 @@ from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.inference.contexts import BaseInferenceContext from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import LayerType @@ -693,6 +696,11 @@ def forward( else: inner_quantization_context = nullcontext() + if self.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer( + l_no == self.num_layers_per_pipeline_rank - 1 + ) + with self.offload_context, inner_quantization_context: hidden_states, context = layer( hidden_states=hidden_states, diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index d14f991046e..9f1b112ba83 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging import warnings @@ -775,6 +775,29 @@ class TransformerConfig(ModelParallelConfig): """Transformer implementation to use. Options are 'transformer_engine' for Transformer Engine and 'local' for MCore.""" + ##################################### + # Fine-grained Activation Offloading + ##################################### + fine_grained_activation_offloading: bool = False + """If True, offload the input of the specified modules to the CPU. + Fine-grained activation offloading is a module-level offloading method + instead of a layer-level offloading method like cpu_offloading.""" + + offload_modules: Optional[list[str]] = None + """The submodules to offload its input. + choices: "attn_norm", "qkv_linear", "core_attn", "attn_proj", + "mlp_norm", "expert_fc1", "moe_act". + "attn_norm": offload the input of the normalization in the attention part. + "qkv_linear": offload the input of the qkv linear part. + "core_attn": offload the input of the core attention part. + "attn_proj": offload the input of the attn linear projection part. + "mlp_norm": offload the input of the normalization in the mlp part. + "expert_fc1": offload the input of the expert fc1 part. + "moe_act": offload the input of the moe act part. + """ + min_offloaded_tensor_size: int = 1024 * 1024 + """The minimum size of the tensor to be offloaded.""" + def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more @@ -1120,6 +1143,32 @@ def __post_init__(self): if "moe" not in self.recompute_modules: self.recompute_modules.append("moe") + if self.fine_grained_activation_offloading: + assert ( + not self.cpu_offloading + ), "fine_grained_activation_offloading cannot be enabled with cpu_offloading." + assert self.offload_modules is not None and len(self.offload_modules) > 0 + allowed_modules = { + "core_attn", + "attn_proj", + "expert_fc1", + "moe_act", + "attn_norm", + "mlp_norm", + "qkv_linear", + } + invalid_modules = set(self.offload_modules) - allowed_modules + assert not invalid_modules, ( + f'Invalid choices for offload_modules: {invalid_modules}. ' + f'Allowed modules are: {allowed_modules}' + ) + if "attn_proj" in self.offload_modules and "core_attn" not in self.offload_modules: + raise ValueError( + "attn_proj cannot be set to offload_modules alone without core_attn " + "because the input of attn_proj is the output of core_attn, " + "which is needed in core_attn.backward()." + ) + if ( self.num_layers_in_first_pipeline_stage is not None or self.num_layers_in_last_pipeline_stage is not None diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index a5babece9d0..c36ff7515e4 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging import warnings @@ -397,6 +397,16 @@ def __init__( if "mlp" in self.config.recompute_modules: if not isinstance(self.mlp, MoELayer): self.recompute_mlp = True + self.offload_attn_norm = ( + self.config.fine_grained_activation_offloading + and "attn_norm" in self.config.offload_modules + and not isinstance(self.input_layernorm, IdentityOp) + ) + self.offload_mlp_norm = ( + self.config.fine_grained_activation_offloading + and "mlp_norm" in self.config.offload_modules + and not isinstance(self.pre_mlp_layernorm, IdentityOp) + ) # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. @@ -479,20 +489,29 @@ def _forward_attention( context (Tensor): Updated context tensor if cross-attention is used, otherwise None. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, + ) inference_context = deprecate_inference_params(inference_context, inference_params) # Residual connection. residual = hidden_states + if self.offload_attn_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="attn_norm") # Optional Input Layer norm if self.recompute_input_layernorm: self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( - self.input_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(self.offload_attn_norm): + input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( + self.input_layernorm, hidden_states + ) else: - input_layernorm_output = self.input_layernorm(hidden_states) + with get_fine_grained_offloading_context(self.offload_attn_norm): + input_layernorm_output = self.input_layernorm(hidden_states) # Self attention. nvtx_range_push(suffix="self_attention") @@ -526,6 +545,11 @@ def _forward_attention( ) nvtx_range_pop(suffix="self_attn_bda") + if self.offload_attn_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="attn_norm", forced_released_tensors=[residual] + ) + # Residual connection. residual = hidden_states @@ -563,17 +587,27 @@ def _forward_mlp(self, hidden_states, inference_context=None): output (Tensor): Transformed hidden states of shape [s, b, h]. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, + ) + # Residual connection. residual = hidden_states + if self.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") # Optional Layer norm post the cross-attention. if self.recompute_pre_mlp_layernorm: self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( - self.pre_mlp_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(self.offload_mlp_norm): + pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( + self.pre_mlp_layernorm, hidden_states + ) else: - pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + with get_fine_grained_offloading_context(self.offload_mlp_norm): + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) nvtx_range_push(suffix="mlp") # Potentially chunk the MLP computation during prefill to minimize the peak activation size @@ -633,6 +667,10 @@ def _forward_mlp(self, hidden_states, inference_context=None): mlp_output_with_bias, residual, self.hidden_dropout ) nvtx_range_pop(suffix="mlp_bda") + if self.offload_mlp_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="mlp_norm", forced_released_tensors=[residual] + ) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index bdf915a8ae1..8e5f343b73c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1216,6 +1216,10 @@ def validate_args(args, defaults={}): "when enabling delay_wgrad_compute" ) + if args.fine_grained_activation_offloading: + assert args.transformer_impl == 'transformer_engine', \ + "Fine-grained activation offloading is only supported with transformer_engine implementation" + if args.mtp_num_layers: assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( @@ -2327,7 +2331,12 @@ def _add_training_args(parser): help='The communicator group names to use high priority streams.') group.add_argument('--use-te-activation-func', action='store_true', help='Use activation function kernel from Transformer Engine in MLP module.') - + group.add_argument('--fine-grained-activation-offloading', action='store_true', + help='Enable fine-grained activation offloading.') + group.add_argument('--offload-modules', nargs='*', type=str, default=[], + help='The submodules to offload its input. Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".') + group.add_argument('--min-offloaded-tensor-size', type=int, default=1024*1024, + help='The minimum size of the tensor to be offloaded.') return parser diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json new file mode 100644 index 00000000000..b3f192ba287 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07546, + "2": 11.03837, + "3": 9.66011, + "4": 9.91381, + "5": 9.32909, + "6": 9.13922, + "7": 9.13574, + "8": 8.65508, + "9": 8.51394, + "10": 8.8409, + "11": 8.29149, + "12": 8.34581, + "13": 8.25518, + "14": 7.73711, + "15": 7.86249, + "16": 7.9371, + "17": 7.89319, + "18": 7.63123, + "19": 7.99731, + "20": 7.74538, + "21": 7.44348, + "22": 7.42249, + "23": 7.29714, + "24": 7.27462, + "25": 7.54574, + "26": 6.96838, + "27": 7.50556, + "28": 7.22743, + "29": 7.36588, + "30": 7.52622, + "31": 7.27026, + "32": 7.45521, + "33": 7.50954, + "34": 7.55686, + "35": 7.10177, + "36": 6.96431, + "37": 7.28463, + "38": 7.0808, + "39": 7.40923, + "40": 7.43338, + "41": 7.38496, + "42": 7.15749, + "43": 7.15858, + "44": 7.28852, + "45": 7.16793, + "46": 6.78468, + "47": 7.4114, + "48": 7.0027, + "49": 7.46249, + "50": 6.92151 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 911219392.0, + "2": 910960384.0, + "3": 911156352.0, + "4": 912204800.0, + "5": 920796544.0, + "6": 940387968.0, + "7": 990599872.0, + "8": 976457728.0, + "9": 998097664.0, + "10": 995852672.0, + "11": 994583680.0, + "12": 977344896.0, + "13": 1028141824.0, + "14": 1007166208.0, + "15": 987423616.0, + "16": 993054784.0, + "17": 982319168.0, + "18": 998261760.0, + "19": 984696320.0, + "20": 982914752.0, + "21": 979667456.0, + "22": 953988864.0, + "23": 972353984.0, + "24": 964792064.0, + "25": 958512192.0, + "26": 946928512.0, + "27": 948458304.0, + "28": 949643968.0, + "29": 942877440.0, + "30": 935020160.0, + "31": 935327616.0, + "32": 934281088.0, + "33": 921805568.0, + "34": 928189312.0, + "35": 922202496.0, + "36": 924246656.0, + "37": 920661248.0, + "38": 922930752.0, + "39": 922322816.0, + "40": 921856512.0, + "41": 920227968.0, + "42": 918353664.0, + "43": 918607040.0, + "44": 914948032.0, + "45": 914295232.0, + "46": 914344448.0, + "47": 911769536.0, + "48": 912013312.0, + "49": 910349440.0, + "50": 914351552.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5498353152.0, + "2": 5499147776.0, + "3": 5499940352.0, + "4": 5500732928.0, + "5": 5501525504.0, + "6": 5502318080.0, + "7": 5503110656.0, + "8": 5503903232.0, + "9": 5497958912.0, + "10": 5498751488.0, + "11": 5499544064.0, + "12": 5500336640.0, + "13": 5501129216.0, + "14": 5501921792.0, + "15": 5502714368.0, + "16": 5503506944.0, + "17": 5504299520.0, + "18": 5505092096.0, + "19": 5505884672.0, + "20": 5506677248.0, + "21": 5507469824.0, + "22": 5508262400.0, + "23": 5509054976.0, + "24": 5509847552.0, + "25": 5510640128.0, + "26": 5511432704.0, + "27": 5512225280.0, + "28": 5513017856.0, + "29": 5513810432.0, + "30": 5514603008.0, + "31": 5515395584.0, + "32": 5516188160.0, + "33": 5516980736.0, + "34": 5517773312.0, + "35": 5518565888.0, + "36": 5519358464.0, + "37": 5520151040.0, + "38": 5520943616.0, + "39": 5521736192.0, + "40": 5522528768.0, + "41": 5523321344.0, + "42": 5524113920.0, + "43": 5524906496.0, + "44": 5525699072.0, + "45": 5526491648.0, + "46": 5527284224.0, + "47": 5528076800.0, + "48": 5528869376.0, + "49": 5529661952.0, + "50": 5530454528.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 41739952128.0, + "2": 43687571456.0, + "3": 43687571456.0, + "4": 43983216640.0, + "5": 43983216640.0, + "6": 43983216640.0, + "7": 43983216640.0, + "8": 44024635392.0, + "9": 44041216000.0, + "10": 44041216000.0, + "11": 44041216000.0, + "12": 44041216000.0, + "13": 44041216000.0, + "14": 44041216000.0, + "15": 44041216000.0, + "16": 44041216000.0, + "17": 44041216000.0, + "18": 44041216000.0, + "19": 44041216000.0, + "20": 44041216000.0, + "21": 44041216000.0, + "22": 44041216000.0, + "23": 44041216000.0, + "24": 44041216000.0, + "25": 44041216000.0, + "26": 44041216000.0, + "27": 44041216000.0, + "28": 44041216000.0, + "29": 44041326592.0, + "30": 44162326528.0, + "31": 44220485632.0, + "32": 44270411776.0, + "33": 44293799936.0, + "34": 44293799936.0, + "35": 44293799936.0, + "36": 44293799936.0, + "37": 44293799936.0, + "38": 44293799936.0, + "39": 44293799936.0, + "40": 44293799936.0, + "41": 44293799936.0, + "42": 44293799936.0, + "43": 44293799936.0, + "44": 44293799936.0, + "45": 44293799936.0, + "46": 44293799936.0, + "47": 44293799936.0, + "48": 44293799936.0, + "49": 44293799936.0, + "50": 44293799936.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.08617, + "2": 11.10475, + "3": 10.48001, + "4": 10.13466, + "5": 9.79047, + "6": 9.50601, + "7": 9.5113, + "8": 8.85336, + "9": 8.66683, + "10": 8.95866, + "11": 8.29315, + "12": 8.36982, + "13": 8.25544, + "14": 7.73322, + "15": 7.86639, + "16": 7.92442, + "17": 7.86278, + "18": 7.61012, + "19": 8.00269, + "20": 7.73019, + "21": 7.4165, + "22": 7.41478, + "23": 7.28671, + "24": 7.27903, + "25": 7.54456, + "26": 6.96542, + "27": 7.50538, + "28": 7.20607, + "29": 7.377, + "30": 7.52777, + "31": 7.27094, + "32": 7.4604, + "33": 7.51419, + "34": 7.56867, + "35": 7.09252, + "36": 6.96015, + "37": 7.29846, + "38": 7.0742, + "39": 7.43347, + "40": 7.43116, + "41": 7.40919, + "42": 7.15527, + "43": 7.15652, + "44": 7.30441, + "45": 7.1893, + "46": 6.77296, + "47": 7.45045, + "48": 7.02403, + "49": 7.45719, + "50": 6.92656 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 64.40054, + "2": 2.16564, + "3": 3.72378, + "4": 1.63174, + "5": 2.30947, + "6": 1.7246, + "7": 1.5089, + "8": 1.60943, + "9": 1.48606, + "10": 1.47162, + "11": 1.05608, + "12": 1.3309, + "13": 1.06824, + "14": 1.41914, + "15": 1.10033, + "16": 1.15759, + "17": 1.23897, + "18": 1.10439, + "19": 1.11869, + "20": 1.09363, + "21": 1.23622, + "22": 1.14797, + "23": 1.23037, + "24": 1.03991, + "25": 1.07795, + "26": 1.04416, + "27": 1.03654, + "28": 1.04098, + "29": 1.03502, + "30": 1.02909, + "31": 1.17935, + "32": 1.14717, + "33": 1.05403, + "34": 1.13894, + "35": 1.04538, + "36": 1.04367, + "37": 1.0843, + "38": 1.04631, + "39": 1.06131, + "40": 1.06988, + "41": 1.09756, + "42": 1.04759, + "43": 1.09649, + "44": 1.05666, + "45": 1.05249, + "46": 1.04539, + "47": 1.04041, + "48": 1.04904, + "49": 1.04777, + "50": 1.06237 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json new file mode 100644 index 00000000000..d7372742ca7 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json @@ -0,0 +1,344 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.07546, + "2": 11.03837, + "3": 9.66011, + "4": 9.91381, + "5": 9.32909, + "6": 9.13922, + "7": 9.13574, + "8": 8.65508, + "9": 8.51394, + "10": 8.8409, + "11": 8.29149, + "12": 8.34581, + "13": 8.25518, + "14": 7.73711, + "15": 7.86249, + "16": 7.9371, + "17": 7.89319, + "18": 7.63123, + "19": 7.99731, + "20": 7.74538, + "21": 7.44348, + "22": 7.42249, + "23": 7.29714, + "24": 7.27462, + "25": 7.54574, + "26": 6.96838, + "27": 7.50556, + "28": 7.22743, + "29": 7.36588, + "30": 7.52622, + "31": 7.27026, + "32": 7.45521, + "33": 7.50954, + "34": 7.55686, + "35": 7.10177, + "36": 6.96431, + "37": 7.28463, + "38": 7.0808, + "39": 7.40923, + "40": 7.43338, + "41": 7.38496, + "42": 7.15749, + "43": 7.15858, + "44": 7.28852, + "45": 7.16793, + "46": 6.78468, + "47": 7.4114, + "48": 7.0027, + "49": 7.46249, + "50": 6.92151 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 911219392.0, + "2": 910960384.0, + "3": 911156352.0, + "4": 912204800.0, + "5": 920796544.0, + "6": 940387968.0, + "7": 990599872.0, + "8": 976457728.0, + "9": 998097664.0, + "10": 995852672.0, + "11": 994583680.0, + "12": 977344896.0, + "13": 1028141824.0, + "14": 1007166208.0, + "15": 987423616.0, + "16": 993054784.0, + "17": 982319168.0, + "18": 998261760.0, + "19": 984696320.0, + "20": 982914752.0, + "21": 979667456.0, + "22": 953988864.0, + "23": 972353984.0, + "24": 964792064.0, + "25": 958512192.0, + "26": 946928512.0, + "27": 948458304.0, + "28": 949643968.0, + "29": 942877440.0, + "30": 935020160.0, + "31": 935327616.0, + "32": 934281088.0, + "33": 921805568.0, + "34": 928189312.0, + "35": 922202496.0, + "36": 924246656.0, + "37": 920661248.0, + "38": 922930752.0, + "39": 922322816.0, + "40": 921856512.0, + "41": 920227968.0, + "42": 918353664.0, + "43": 918607040.0, + "44": 914948032.0, + "45": 914295232.0, + "46": 914344448.0, + "47": 911769536.0, + "48": 912013312.0, + "49": 910349440.0, + "50": 914351552.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 5498353152.0, + "2": 5499147776.0, + "3": 5499940352.0, + "4": 5500732928.0, + "5": 5501525504.0, + "6": 5502318080.0, + "7": 5503110656.0, + "8": 5503903232.0, + "9": 5497958912.0, + "10": 5498751488.0, + "11": 5499544064.0, + "12": 5500336640.0, + "13": 5501129216.0, + "14": 5501921792.0, + "15": 5502714368.0, + "16": 5503506944.0, + "17": 5504299520.0, + "18": 5505092096.0, + "19": 5505884672.0, + "20": 5506677248.0, + "21": 5507469824.0, + "22": 5508262400.0, + "23": 5509054976.0, + "24": 5509847552.0, + "25": 5510640128.0, + "26": 5511432704.0, + "27": 5512225280.0, + "28": 5513017856.0, + "29": 5513810432.0, + "30": 5514603008.0, + "31": 5515395584.0, + "32": 5516188160.0, + "33": 5516980736.0, + "34": 5517773312.0, + "35": 5518565888.0, + "36": 5519358464.0, + "37": 5520151040.0, + "38": 5520943616.0, + "39": 5521736192.0, + "40": 5522528768.0, + "41": 5523321344.0, + "42": 5524113920.0, + "43": 5524906496.0, + "44": 5525699072.0, + "45": 5526491648.0, + "46": 5527284224.0, + "47": 5528076800.0, + "48": 5528869376.0, + "49": 5529661952.0, + "50": 5530454528.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 41739952128.0, + "2": 43687571456.0, + "3": 43687571456.0, + "4": 43983216640.0, + "5": 43983216640.0, + "6": 43983216640.0, + "7": 43983216640.0, + "8": 44024635392.0, + "9": 44041216000.0, + "10": 44041216000.0, + "11": 44041216000.0, + "12": 44041216000.0, + "13": 44041216000.0, + "14": 44041216000.0, + "15": 44041216000.0, + "16": 44041216000.0, + "17": 44041216000.0, + "18": 44041216000.0, + "19": 44041216000.0, + "20": 44041216000.0, + "21": 44041216000.0, + "22": 44041216000.0, + "23": 44041216000.0, + "24": 44041216000.0, + "25": 44041216000.0, + "26": 44041216000.0, + "27": 44041216000.0, + "28": 44041216000.0, + "29": 44041326592.0, + "30": 44162326528.0, + "31": 44220485632.0, + "32": 44270411776.0, + "33": 44293799936.0, + "34": 44293799936.0, + "35": 44293799936.0, + "36": 44293799936.0, + "37": 44293799936.0, + "38": 44293799936.0, + "39": 44293799936.0, + "40": 44293799936.0, + "41": 44293799936.0, + "42": 44293799936.0, + "43": 44293799936.0, + "44": 44293799936.0, + "45": 44293799936.0, + "46": 44293799936.0, + "47": 44293799936.0, + "48": 44293799936.0, + "49": 44293799936.0, + "50": 44293799936.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.08617, + "2": 11.10475, + "3": 10.48001, + "4": 10.13466, + "5": 9.79047, + "6": 9.50601, + "7": 9.5113, + "8": 8.85336, + "9": 8.66683, + "10": 8.95866, + "11": 8.29315, + "12": 8.36982, + "13": 8.25544, + "14": 7.73322, + "15": 7.86639, + "16": 7.92442, + "17": 7.86278, + "18": 7.61012, + "19": 8.00269, + "20": 7.73019, + "21": 7.4165, + "22": 7.41478, + "23": 7.28671, + "24": 7.27903, + "25": 7.54456, + "26": 6.96542, + "27": 7.50538, + "28": 7.20607, + "29": 7.377, + "30": 7.52777, + "31": 7.27094, + "32": 7.4604, + "33": 7.51419, + "34": 7.56867, + "35": 7.09252, + "36": 6.96015, + "37": 7.29846, + "38": 7.0742, + "39": 7.43347, + "40": 7.43116, + "41": 7.40919, + "42": 7.15527, + "43": 7.15652, + "44": 7.30441, + "45": 7.1893, + "46": 6.77296, + "47": 7.45045, + "48": 7.02403, + "49": 7.45719, + "50": 6.92656 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 87.63934, + "2": 1.98402, + "3": 3.95877, + "4": 1.64812, + "5": 2.312, + "6": 2.02902, + "7": 1.56333, + "8": 1.66703, + "9": 1.6393, + "10": 1.40472, + "11": 1.086, + "12": 1.34921, + "13": 1.0854, + "14": 1.4242, + "15": 1.09539, + "16": 1.79766, + "17": 1.2562, + "18": 1.08887, + "19": 1.08371, + "20": 1.10071, + "21": 1.25979, + "22": 1.3212, + "23": 1.25044, + "24": 1.05384, + "25": 1.11356, + "26": 1.0605, + "27": 1.03418, + "28": 1.0405, + "29": 1.05174, + "30": 1.04166, + "31": 1.20036, + "32": 1.12936, + "33": 1.02917, + "34": 1.13473, + "35": 1.02829, + "36": 1.04352, + "37": 1.0843, + "38": 1.03714, + "39": 1.04534, + "40": 1.07031, + "41": 1.07618, + "42": 1.03008, + "43": 1.06043, + "44": 1.04049, + "45": 1.02875, + "46": 1.03669, + "47": 1.03128, + "48": 1.02808, + "49": 1.03038, + "50": 1.04621 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml new file mode 100644 index 00000000000..d9ec0456190 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -0,0 +1,139 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 32 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + # NOTE: uncomment if TE >= 2.9.0 + # --overlap-grad-reduce: true + # --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --manual-gc: true + --manual-gc-interval: 100 + --recompute-granularity: selective + --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" + --fine-grained-activation-offloading: true + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + # Transformer Engine args + --transformer-impl: transformer_engine + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt + --split: 949,50,1 + # Add network size args + --num-layers: 15 + --moe-layer-freq: ([0]*3+[1]*12) + --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6mL # Et*3|(tt|)*6mL + --hidden-size: 1024 + --ffn-hidden-size: 4096 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + # Comment out the following MTP args to disable MTP + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add MoE args + --num-experts: 32 + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-token-dispatcher-type: alltoall + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 25 + # Add initialization args + --init-method-std: 0.02 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add mixed precision args + --bf16: true + --exit-interval: 50 + --overlap-moe-expert-parallel-comm: true +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json new file mode 100644 index 00000000000..4e979e64295 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04266, + "2": 11.02309, + "3": 9.43552, + "4": 10.04614, + "5": 9.38535, + "6": 9.14543, + "7": 9.21141, + "8": 8.63458, + "9": 8.48937, + "10": 8.82763, + "11": 8.29457, + "12": 8.3282, + "13": 8.23008, + "14": 7.71714, + "15": 7.86981, + "16": 7.92286, + "17": 7.8604, + "18": 7.62039, + "19": 7.98493, + "20": 7.72023, + "21": 7.39758, + "22": 7.39771, + "23": 7.28314, + "24": 7.25048, + "25": 7.53113, + "26": 6.95329, + "27": 7.49432, + "28": 7.20394, + "29": 7.37282, + "30": 7.50232, + "31": 7.25348, + "32": 7.4305, + "33": 7.48364, + "34": 7.53486, + "35": 7.10336, + "36": 6.94516, + "37": 7.26117, + "38": 7.07009, + "39": 7.40543, + "40": 7.42044, + "41": 7.34202, + "42": 7.11816, + "43": 7.11373, + "44": 7.27067, + "45": 7.07036, + "46": 6.77823, + "47": 7.1875, + "48": 6.99998, + "49": 7.45868, + "50": 6.90956 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 844114112.0, + "2": 843855104.0, + "3": 844048640.0, + "4": 842998144.0, + "5": 855786112.0, + "6": 874329728.0, + "7": 925591552.0, + "8": 915644608.0, + "9": 935187584.0, + "10": 927702400.0, + "11": 957888256.0, + "12": 923872512.0, + "13": 969427072.0, + "14": 965228416.0, + "15": 952825344.0, + "16": 943777088.0, + "17": 928845824.0, + "18": 925913856.0, + "19": 955339136.0, + "20": 989208256.0, + "21": 924095424.0, + "22": 908902272.0, + "23": 892664576.0, + "24": 900830400.0, + "25": 928105472.0, + "26": 877724352.0, + "27": 912808320.0, + "28": 904557696.0, + "29": 872625088.0, + "30": 864767104.0, + "31": 868220416.0, + "32": 861931136.0, + "33": 859941312.0, + "34": 855839104.0, + "35": 854046848.0, + "36": 852944896.0, + "37": 851456704.0, + "38": 849532096.0, + "39": 849972608.0, + "40": 849505792.0, + "41": 845780288.0, + "42": 846003328.0, + "43": 846257472.0, + "44": 852034880.0, + "45": 847187456.0, + "46": 855625856.0, + "47": 844661952.0, + "48": 851197248.0, + "49": 851630464.0, + "50": 846195904.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4419107328.0, + "2": 4419108864.0, + "3": 4419108864.0, + "4": 4419108864.0, + "5": 4419108864.0, + "6": 4419108864.0, + "7": 4419108864.0, + "8": 4419108864.0, + "9": 4419108864.0, + "10": 4419108864.0, + "11": 4419108864.0, + "12": 4419108864.0, + "13": 4419108864.0, + "14": 4419108864.0, + "15": 4419108864.0, + "16": 4419108864.0, + "17": 4419108864.0, + "18": 4419108864.0, + "19": 4419108864.0, + "20": 4419108864.0, + "21": 4419108864.0, + "22": 4419108864.0, + "23": 4419108864.0, + "24": 4419108864.0, + "25": 4419108864.0, + "26": 4419108864.0, + "27": 4419108864.0, + "28": 4419108864.0, + "29": 4419108864.0, + "30": 4419108864.0, + "31": 4419108864.0, + "32": 4419108864.0, + "33": 4419108864.0, + "34": 4419108864.0, + "35": 4419108864.0, + "36": 4419108864.0, + "37": 4419108864.0, + "38": 4419108864.0, + "39": 4419108864.0, + "40": 4419108864.0, + "41": 4419108864.0, + "42": 4419108864.0, + "43": 4419108864.0, + "44": 4419108864.0, + "45": 4419108864.0, + "46": 4419108864.0, + "47": 4419108864.0, + "48": 4419108864.0, + "49": 4419108864.0, + "50": 4419108864.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 37959917568.0, + "2": 39578677248.0, + "3": 39580196864.0, + "4": 39580196864.0, + "5": 39583309824.0, + "6": 39583309824.0, + "7": 39583309824.0, + "8": 39583309824.0, + "9": 39583309824.0, + "10": 39583309824.0, + "11": 39583309824.0, + "12": 39583309824.0, + "13": 39583309824.0, + "14": 39583309824.0, + "15": 39583309824.0, + "16": 39583309824.0, + "17": 39583309824.0, + "18": 39583309824.0, + "19": 39583309824.0, + "20": 39583309824.0, + "21": 39583309824.0, + "22": 39583309824.0, + "23": 39583309824.0, + "24": 39583309824.0, + "25": 39583309824.0, + "26": 39583309824.0, + "27": 39583309824.0, + "28": 39583309824.0, + "29": 39583309824.0, + "30": 39583309824.0, + "31": 39583309824.0, + "32": 39583309824.0, + "33": 39583309824.0, + "34": 39583309824.0, + "35": 39583309824.0, + "36": 39583309824.0, + "37": 39583309824.0, + "38": 39583309824.0, + "39": 39583309824.0, + "40": 39583309824.0, + "41": 39583309824.0, + "42": 39583309824.0, + "43": 39583309824.0, + "44": 39583309824.0, + "45": 39583309824.0, + "46": 39583309824.0, + "47": 39583309824.0, + "48": 39583309824.0, + "49": 39583309824.0, + "50": 39583309824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 60.48727, + "2": 2.0537, + "3": 3.26481, + "4": 2.56819, + "5": 2.40218, + "6": 1.26492, + "7": 1.5836, + "8": 1.37182, + "9": 1.10133, + "10": 1.10352, + "11": 1.18687, + "12": 1.53724, + "13": 1.25166, + "14": 1.69801, + "15": 1.42166, + "16": 1.104, + "17": 1.22214, + "18": 1.34911, + "19": 1.09323, + "20": 1.08552, + "21": 1.22223, + "22": 1.19712, + "23": 1.05456, + "24": 1.03745, + "25": 1.14154, + "26": 1.07349, + "27": 1.05181, + "28": 1.0364, + "29": 1.17111, + "30": 1.02943, + "31": 1.0758, + "32": 1.03304, + "33": 1.04107, + "34": 1.03092, + "35": 1.07869, + "36": 1.02457, + "37": 1.08557, + "38": 1.00729, + "39": 1.07249, + "40": 1.08655, + "41": 1.02362, + "42": 1.02046, + "43": 1.07618, + "44": 1.08709, + "45": 1.00443, + "46": 1.00379, + "47": 1.06019, + "48": 0.98958, + "49": 1.08317, + "50": 0.9932 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json new file mode 100644 index 00000000000..537e20b09d8 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.04266, + "2": 11.02309, + "3": 9.43552, + "4": 10.04614, + "5": 9.38535, + "6": 9.14543, + "7": 9.21141, + "8": 8.63458, + "9": 8.48937, + "10": 8.82763, + "11": 8.29457, + "12": 8.3282, + "13": 8.23008, + "14": 7.71714, + "15": 7.86981, + "16": 7.92286, + "17": 7.8604, + "18": 7.62039, + "19": 7.98493, + "20": 7.72023, + "21": 7.39758, + "22": 7.39771, + "23": 7.28314, + "24": 7.25048, + "25": 7.53113, + "26": 6.95329, + "27": 7.49432, + "28": 7.20394, + "29": 7.37282, + "30": 7.50232, + "31": 7.25348, + "32": 7.4305, + "33": 7.48364, + "34": 7.53486, + "35": 7.10336, + "36": 6.94516, + "37": 7.26117, + "38": 7.07009, + "39": 7.40543, + "40": 7.42044, + "41": 7.34202, + "42": 7.11816, + "43": 7.11373, + "44": 7.27067, + "45": 7.07036, + "46": 6.77823, + "47": 7.1875, + "48": 6.99998, + "49": 7.45868, + "50": 6.90956 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 844114112.0, + "2": 843855104.0, + "3": 844048640.0, + "4": 842998144.0, + "5": 855786112.0, + "6": 874329728.0, + "7": 925591552.0, + "8": 915644608.0, + "9": 935187584.0, + "10": 927702400.0, + "11": 957888256.0, + "12": 923872512.0, + "13": 969427072.0, + "14": 965228416.0, + "15": 952825344.0, + "16": 943777088.0, + "17": 928845824.0, + "18": 925913856.0, + "19": 955339136.0, + "20": 989208256.0, + "21": 924095424.0, + "22": 908902272.0, + "23": 892664576.0, + "24": 900830400.0, + "25": 928105472.0, + "26": 877724352.0, + "27": 912808320.0, + "28": 904557696.0, + "29": 872625088.0, + "30": 864767104.0, + "31": 868220416.0, + "32": 861931136.0, + "33": 859941312.0, + "34": 855839104.0, + "35": 854046848.0, + "36": 852944896.0, + "37": 851456704.0, + "38": 849532096.0, + "39": 849972608.0, + "40": 849505792.0, + "41": 845780288.0, + "42": 846003328.0, + "43": 846257472.0, + "44": 852034880.0, + "45": 847187456.0, + "46": 855625856.0, + "47": 844661952.0, + "48": 851197248.0, + "49": 851630464.0, + "50": 846195904.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4419107328.0, + "2": 4419108864.0, + "3": 4419108864.0, + "4": 4419108864.0, + "5": 4419108864.0, + "6": 4419108864.0, + "7": 4419108864.0, + "8": 4419108864.0, + "9": 4419108864.0, + "10": 4419108864.0, + "11": 4419108864.0, + "12": 4419108864.0, + "13": 4419108864.0, + "14": 4419108864.0, + "15": 4419108864.0, + "16": 4419108864.0, + "17": 4419108864.0, + "18": 4419108864.0, + "19": 4419108864.0, + "20": 4419108864.0, + "21": 4419108864.0, + "22": 4419108864.0, + "23": 4419108864.0, + "24": 4419108864.0, + "25": 4419108864.0, + "26": 4419108864.0, + "27": 4419108864.0, + "28": 4419108864.0, + "29": 4419108864.0, + "30": 4419108864.0, + "31": 4419108864.0, + "32": 4419108864.0, + "33": 4419108864.0, + "34": 4419108864.0, + "35": 4419108864.0, + "36": 4419108864.0, + "37": 4419108864.0, + "38": 4419108864.0, + "39": 4419108864.0, + "40": 4419108864.0, + "41": 4419108864.0, + "42": 4419108864.0, + "43": 4419108864.0, + "44": 4419108864.0, + "45": 4419108864.0, + "46": 4419108864.0, + "47": 4419108864.0, + "48": 4419108864.0, + "49": 4419108864.0, + "50": 4419108864.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 37959917568.0, + "2": 39578677248.0, + "3": 39580196864.0, + "4": 39580196864.0, + "5": 39583309824.0, + "6": 39583309824.0, + "7": 39583309824.0, + "8": 39583309824.0, + "9": 39583309824.0, + "10": 39583309824.0, + "11": 39583309824.0, + "12": 39583309824.0, + "13": 39583309824.0, + "14": 39583309824.0, + "15": 39583309824.0, + "16": 39583309824.0, + "17": 39583309824.0, + "18": 39583309824.0, + "19": 39583309824.0, + "20": 39583309824.0, + "21": 39583309824.0, + "22": 39583309824.0, + "23": 39583309824.0, + "24": 39583309824.0, + "25": 39583309824.0, + "26": 39583309824.0, + "27": 39583309824.0, + "28": 39583309824.0, + "29": 39583309824.0, + "30": 39583309824.0, + "31": 39583309824.0, + "32": 39583309824.0, + "33": 39583309824.0, + "34": 39583309824.0, + "35": 39583309824.0, + "36": 39583309824.0, + "37": 39583309824.0, + "38": 39583309824.0, + "39": 39583309824.0, + "40": 39583309824.0, + "41": 39583309824.0, + "42": 39583309824.0, + "43": 39583309824.0, + "44": 39583309824.0, + "45": 39583309824.0, + "46": 39583309824.0, + "47": 39583309824.0, + "48": 39583309824.0, + "49": 39583309824.0, + "50": 39583309824.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 67.13422, + "2": 1.95457, + "3": 3.25371, + "4": 2.66673, + "5": 3.05794, + "6": 1.35128, + "7": 1.66174, + "8": 2.19011, + "9": 1.16207, + "10": 1.16456, + "11": 1.26279, + "12": 1.60263, + "13": 1.29219, + "14": 2.93489, + "15": 1.48729, + "16": 1.15146, + "17": 1.27648, + "18": 1.39906, + "19": 1.13846, + "20": 1.14415, + "21": 1.27567, + "22": 1.26287, + "23": 1.11223, + "24": 1.10986, + "25": 1.20096, + "26": 1.13382, + "27": 1.11305, + "28": 1.11424, + "29": 1.22341, + "30": 1.08856, + "31": 1.15539, + "32": 1.10684, + "33": 1.11399, + "34": 1.09048, + "35": 1.1509, + "36": 1.09151, + "37": 1.13904, + "38": 1.06658, + "39": 1.1325, + "40": 1.14715, + "41": 1.07533, + "42": 1.08243, + "43": 1.13881, + "44": 1.14004, + "45": 1.06323, + "46": 1.06103, + "47": 1.11785, + "48": 1.04242, + "49": 1.13933, + "50": 1.0407 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml new file mode 100644 index 00000000000..f4b64722712 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml @@ -0,0 +1,134 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + # NOTE: uncomment if TE >= 2.9.0 + # --overlap-grad-reduce: true + # --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --manual-gc: true + --manual-gc-interval: 100 + --recompute-granularity: selective + --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" + --fine-grained-activation-offloading: true + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + # Transformer Engine args + --transformer-impl: transformer_engine + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt + --split: 949,50,1 + # Add network size args + --num-layers: 15 + --moe-layer-freq: ([0]*3+[1]*12) + --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6L # Et*3|(tt|)*6L + --hidden-size: 1024 + --ffn-hidden-size: 4096 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add MoE args + --num-experts: 32 + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-token-dispatcher-type: alltoall + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 25 + # Add initialization args + --init-method-std: 0.02 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add mixed precision args + --bf16: true + --exit-interval: 50 +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 8164ca37df8..7a0f7d8a3f6 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -124,6 +124,16 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] + - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py new file mode 100644 index 00000000000..7c1b7f1fe4b --- /dev/null +++ b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py @@ -0,0 +1,187 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import gc + +import pytest +import torch + +EPSILON = 0.1 + +# Skip all tests if CUDA is not available +cuda_available = torch.cuda.is_available() + + +def _reset_cuda_memory(): + gc.collect() + if cuda_available: + torch.cuda.empty_cache() + + +class ToyModel(torch.nn.Module): + def __init__(self, hidden_size: int = 2048, num_layers: int = 4, dtype=torch.bfloat16): + super().__init__() + layers = [] + for _ in range(num_layers): + layers.append( + torch.nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device="cuda") + ) + self.net = torch.nn.Sequential(*layers).to(device="cuda", dtype=dtype) + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dtype = dtype + + # Prevent weights/bias from being considered activation tensors for offload; + # ensure we only count activation tensors (inputs x) in memory accounting. + for p in self.parameters(): + try: + setattr(p, "offloading_activation", False) + except Exception: + pass + + def forward(self, x, use_offload: bool = False): + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + if use_offload: + # Initialize a new chunk (microbatch) and enable offload context. + with off.get_fine_grained_offloading_context(True): + off.fine_grained_offloading_init_chunk_handler( + vp_size=1, vp_stage=None, min_offloaded_tensor_size=1 + ) + for i, layer in enumerate(self.net): + # Group by module; with this linear-only model, each group corresponds to a layer. + off.fine_grained_offloading_set_last_layer(i == len(self.net) - 1) + x = off.fine_grained_offloading_group_start(x, name=f"layer_{i}") + x = layer(x) + # Commit the group; returns a tuple of tensors + (x,) = off.fine_grained_offloading_group_commit( + x, name=f"layer_{i}", forced_released_tensors=[] + ) + return x + # Baseline path (no offload hooks) + with ( + torch.autocast(device_type="cuda", dtype=self.dtype) + if self.dtype in (torch.float16, torch.bfloat16) + else torch.cuda.amp.autocast(enabled=False) + ): + for layer in self.net: + x = layer(x) + return x + + +@pytest.fixture(autouse=True) +def _monkeypatch_offload_deps(monkeypatch): + # Avoid requiring torch.distributed initialization and NVML in tests + import megatron.core.pipeline_parallel.fine_grained_activation_offload as off + + monkeypatch.setattr(off, "debug_rank", lambda *args, **kwargs: None, raising=False) + monkeypatch.setattr(off, "set_ideal_affinity_for_current_gpu", lambda: None, raising=False) + # Ensure a clean state each test + off.fine_grained_offloading_reset() + yield + off.fine_grained_offloading_reset() + + +def test_fine_grained_activation_offload_memory_reduction(): + torch.manual_seed(1234) + # Use a linear-only stack so theoretical saved memory equals sum of per-layer input x bytes. + model = ToyModel(hidden_size=2048, num_layers=8, dtype=torch.bfloat16).eval() + + # Create input + inp = torch.randn( + (2048, model.hidden_size), device="cuda", dtype=torch.bfloat16, requires_grad=True + ) + + # Warmup to stabilize allocator behavior + _reset_cuda_memory() + out = model(inp, use_offload=False) + (out.sum()).backward() + torch.cuda.synchronize() + _reset_cuda_memory() + + # Baseline memory measurement (no offload) + _reset_cuda_memory() + inp_baseline = inp.detach().clone().requires_grad_(True) + baseline_mem_before = torch.cuda.memory_allocated() / (1024**2) + out_base = model(inp_baseline, use_offload=False) + baseline_mem_after = (torch.cuda.memory_allocated() - out_base.nbytes) / (1024**2) + (out_base.sum()).backward() + torch.cuda.synchronize() + baseline_delta = baseline_mem_after - baseline_mem_before + + # Offload memory measurement + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + off.fine_grained_offloading_reset() + _reset_cuda_memory() + inp_off = inp.detach().clone().requires_grad_(True) + offload_mem_before = torch.cuda.memory_allocated() / (1024**2) + out_off = model(inp_off, use_offload=True) + offload_mem_after = (torch.cuda.memory_allocated() - out_off.nbytes) / (1024**2) + (out_off.sum()).backward() + torch.cuda.synchronize() + offload_delta = offload_mem_after - offload_mem_before + + # Offload should reduce peak cached memory usage after forward + assert ( + offload_delta < baseline_delta + ), f"offload did not reduce memory: off={offload_delta:.2f}MiB base={baseline_delta:.2f}MiB" + + # Theoretical savings: storing per-layer input x (same shape each layer). + bytes_per_elem = inp.element_size() # 2 for bfloat16 + input_bytes = inp.numel() * bytes_per_elem + # -2 because the first and last activations are not offloaded + expected_saved_mib = (model.num_layers - 2) * (input_bytes / (1024**2)) + + # Actual savings ≈ baseline_delta - offload_delta (both exclude output tensor memory). + actual_saved_mib = baseline_delta - offload_delta + + # Allow slack for allocator jitter and extra intermediates; magnitudes should match. + rel_err = abs(actual_saved_mib - expected_saved_mib) / max(expected_saved_mib, 1e-6) + assert ( + rel_err <= EPSILON + ), f"saved mismatch: actual={actual_saved_mib:.2f}MiB expected~={expected_saved_mib:.2f}MiB (rel_err={rel_err:.2f})" + + +def test_fine_grained_activation_offload_output_and_grad_consistency(): + torch.manual_seed(2025) + hidden = 1024 + layers = 3 + + # Create identical models by resetting seed + torch.manual_seed(2025) + model_base = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() + torch.manual_seed(2025) + model_off = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() + + # Same input and target + inp = torch.randn((32, hidden), device="cuda", dtype=torch.bfloat16, requires_grad=True) + target = torch.randn_like(inp) + + # Baseline forward/backward + out_base = model_base(inp, use_offload=False) + loss_base = torch.nn.functional.mse_loss(out_base, target) + loss_base.backward() + grads_base = [ + p.grad.detach().clone() if p.grad is not None else None for p in model_base.parameters() + ] + + # Offload forward/backward + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + off.fine_grained_offloading_reset() + out_off = model_off(inp.detach().clone().requires_grad_(True), use_offload=True) + loss_off = torch.nn.functional.mse_loss(out_off, target) + loss_off.backward() + grads_off = [ + p.grad.detach().clone() if p.grad is not None else None for p in model_off.parameters() + ] + + # Compare outputs + assert torch.allclose(out_off.float(), out_base.float(), rtol=1e-3, atol=1e-3) + + # Compare gradients parameter-wise + for gb, go in zip(grads_base, grads_off): + if gb is None and go is None: + continue + assert gb is not None and go is not None + assert torch.allclose(go.float(), gb.float(), rtol=1e-3, atol=1e-3) From bada8f96681f7610500e6acd5aa51a7cca0bd5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Oct 2025 10:09:18 +0100 Subject: [PATCH 083/248] ci(fix): `Run tests` label (#1970) (#2006) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/auto-assign-milestone.yml | 1 + .github/workflows/auto-reminder-bot.yml | 34 ++++ .github/workflows/auto-swap-labels.yml | 33 ++++ .../workflows/build-test-publish-wheel.yml | 3 + .../workflows/cherry-pick-release-commit.yml | 1 + .github/workflows/cicd-approve-test-queue.yml | 1 + .github/workflows/cicd-main.yml | 28 ++-- .github/workflows/close-inactive-issue-pr.yml | 1 + .github/workflows/community-bot.yml | 1 + .github/workflows/copyright-check.yml | 11 +- .github/workflows/dependabot.yml | 4 +- .github/workflows/install-test.yml | 4 + .gitlab/stages/05.publish.yml | 2 +- hello_world | 0 .../launch_nemo_run_workload.py | 2 + .../python_scripts/swap_pr_labels.py | 147 ++++++++++++++++++ tests/test_utils/recipes/ckpt_converter.yaml | 2 +- .../gpt-dynamic-inference-cuda-graphs.yaml | 2 +- .../recipes/gpt-dynamic-inference.yaml | 2 +- tests/test_utils/recipes/gpt-grads.yaml | 2 +- tests/test_utils/recipes/gpt.yaml | 88 +++++------ .../recipes/mamba-static-inference.yaml | 2 +- tests/test_utils/recipes/mamba.yaml | 2 +- .../recipes/moe-dynamic-inference.yaml | 2 +- .../recipes/moe-static-inference.yaml | 6 +- tests/test_utils/recipes/moe.yaml | 28 ++-- 26 files changed, 321 insertions(+), 88 deletions(-) create mode 100644 .github/workflows/auto-reminder-bot.yml create mode 100644 .github/workflows/auto-swap-labels.yml create mode 100644 hello_world create mode 100644 tests/test_utils/python_scripts/swap_pr_labels.py diff --git a/.github/workflows/auto-assign-milestone.yml b/.github/workflows/auto-assign-milestone.yml index 7eae6838332..8153728f9fd 100644 --- a/.github/workflows/auto-assign-milestone.yml +++ b/.github/workflows/auto-assign-milestone.yml @@ -14,6 +14,7 @@ jobs: assign-milestone: runs-on: ubuntu-latest environment: nemo-ci + if: github.repository == 'NVIDIA/Megatron-LM' steps: - name: Get PR info id: get-pr-info diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml new file mode 100644 index 00000000000..c3aa8169b50 --- /dev/null +++ b/.github/workflows/auto-reminder-bot.yml @@ -0,0 +1,34 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +name: Auto Reminder Bot + +on: + workflow_dispatch: + schedule: + - cron: "0 12 * * *" + +jobs: + run-script: + environment: main + name: Run Auto Reminder Bot + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip install --no-cache-dir PyGithub slack-sdk + + - name: Run Auto Reminder Bot + run: | + export SLACK_TOKEN=${{ secrets.SLACK_TOKEN }} + export SLACK_WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK_URL }} + export GH_TOKEN=${{ secrets.PAT }} + python tests/test_utils/python_scripts/auto_reminder_github.py diff --git a/.github/workflows/auto-swap-labels.yml b/.github/workflows/auto-swap-labels.yml new file mode 100644 index 00000000000..5335026e2af --- /dev/null +++ b/.github/workflows/auto-swap-labels.yml @@ -0,0 +1,33 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +name: Auto Swap Labels +on: + pull_request_review: + types: [submitted] + +permissions: + pull-requests: write + contents: read + +jobs: + check-approval: + runs-on: ubuntu-latest + if: github.event.review.state == 'approved' && github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip install --no-cache-dir PyGithub slack-sdk + + - name: Run Auto Reminder Bot + run: | + export GH_TOKEN=${{ github.token }} + export PR_NUMBER=${{ github.event.pull_request.number }} + python tests/test_utils/python_scripts/swap_pr_labels.py diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index 1ff9f53202b..0f3a037979a 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -35,6 +35,7 @@ permissions: jobs: pre-flight: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5 + if: github.repository == 'NVIDIA/Megatron-LM' build-test-publish-wheel: needs: [pre-flight] @@ -42,6 +43,7 @@ jobs: !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') + && github.repository == 'NVIDIA/Megatron-LM' uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.63.1 with: dry-run: true @@ -68,6 +70,7 @@ jobs: || needs.pre-flight.outputs.is_deployment_workflow == 'true' || always() ) + && github.repository == 'NVIDIA/Megatron-LM' && !cancelled() runs-on: ubuntu-latest steps: diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 9cf8ed98660..58b447939a7 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -22,6 +22,7 @@ on: jobs: cherry-pick: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9 + if: github.repository == 'NVIDIA/Megatron-LM' with: target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+' secrets: diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index 1f23905d5d8..ccc8327368d 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -23,6 +23,7 @@ jobs: approve-queue: runs-on: ubuntu-latest environment: main + if: github.repository == 'NVIDIA/Megatron-LM' strategy: matrix: branch: [main, dev] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d1e411be98f..27e1f6cdacb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + name: CICD Megatron-LM on: schedule: @@ -150,6 +151,7 @@ jobs: pre-flight: needs: [is-not-external-contributor] + if: github.repository == 'NVIDIA/Megatron-LM' uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10 linting: @@ -251,11 +253,6 @@ jobs: apt-get update apt-get install -y gh - - name: Pull cache - run: | - docker pull ${{ env.container-registry }}/megatron-lm:main || true - docker pull ${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} || true - - name: Get last merged PR id: cache_from env: @@ -271,13 +268,16 @@ jobs: } } }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do - echo "${{ env.container-registry }}/megatron-lm:$number" + echo "type=registry,ref=${{ env.container-registry }}/megatron-lm:$number-buildcache,mode=max" done) echo "LAST_PRS< latest_reviews[review.user.login].submitted_at + ): + latest_reviews[review.user.login] = review + except Exception as e: + logger.warning(f"Could not get reviews for PR #{pr.number}: {e}") + + # 2. Separate reviewers into approvers (List B) and non-approvers + approvers = {user for user, review in latest_reviews.items() if review.state == "APPROVED"} + non_approving_reviewers = { + user for user, review in latest_reviews.items() if review.state == "CHANGES_REQUESTED" + } + + # 3. Get all *currently pending* review requests + try: + pending_users_req, pending_teams_req = pr.get_review_requests() + pending_individuals = {r.login for r in pending_users_req} + pending_teams_slugs = {t.slug for t in pending_teams_req} + except Exception as e: + logger.warning(f"Could not get review requests for PR #{pr.number}: {e}") + pending_individuals = set() + pending_teams_slugs = set() + + # 4. Filter pending teams based on the current stage + teams_to_query = ( + pending_teams_slugs - self.EXCLUDED_TEAMS + if self.stage == self.EXPERT_REVIEW + else pending_teams_slugs & self.EXCLUDED_TEAMS + ) + + # 5. Get members from the required pending teams + pending_team_members = set() + for slug in teams_to_query: + try: + pending_team_members.update( + m.login for m in self.org.get_team_by_slug(slug).get_members() + ) + except Exception as e: + logger.warning(f"Could not get members for team {slug} on PR #{pr.number}: {e}") + + # 6. "List A": Combine all users who *still need to review* + all_required_reviewers = ( + pending_individuals | pending_team_members | non_approving_reviewers + ) + + # 7. Final list (List A - List B): + pending_reviewers = all_required_reviewers - approvers + logger.info(f"Pending reviewers: {pending_reviewers}") + if len(pending_reviewers) == 0: + try: + pr.remove_from_labels(self.EXPERT_REVIEW) + logger.info(f'Removed "{self.EXPERT_REVIEW}" label from PR #{pr.number}') + except Exception as e: + logger.warning( + f'Failed to remove "{self.EXPERT_REVIEW}" label from PR #{pr.number}: {e}' + ) + + try: + pr.add_to_labels(self.FINAL_REVIEW) + logger.info(f'Added "{self.FINAL_REVIEW}" label to PR #{pr.number}') + except Exception as e: + logger.warning(f'Failed to add "{self.FINAL_REVIEW}" label to PR #{pr.number}: {e}') + + +def main(): + token = os.environ.get("GH_TOKEN") + repo = os.environ.get("REPO", "NVIDIA/Megatron-LM") + pr_number = int(os.environ.get("PR_NUMBER")) + + if not token: + logger.error("GH_TOKEN environment variable is required") + sys.exit(1) + + logger.info(f"Starting PR review reminder for {repo}") + tracker = PRReviewTracker(token, repo, pr_number) + tracker.swap_labels() + + +if __name__ == "__main__": + main() diff --git a/tests/test_utils/recipes/ckpt_converter.yaml b/tests/test_utils/recipes/ckpt_converter.yaml index f78f184a326..bf328ae44c9 100644 --- a/tests/test_utils/recipes/ckpt_converter.yaml +++ b/tests/test_utils/recipes/ckpt_converter.yaml @@ -48,7 +48,7 @@ products: - test_case: [ckpt_converter] products: - environment: [dev] - scope: [mr-broken] + scope: [mr-github-broken, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly-broken] diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml index 47b8d346150..f4a7d6c786b 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-cuda-graphs.yaml @@ -47,5 +47,5 @@ products: - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation] products: - environment: [dev] - scope: [mr-broken] + scope: [mr-broken, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml index 748e4734a6d..77a98d4bd7f 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml @@ -72,5 +72,5 @@ products: - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt-grads.yaml b/tests/test_utils/recipes/gpt-grads.yaml index cdd3a050ff2..bf048542410 100644 --- a/tests/test_utils/recipes/gpt-grads.yaml +++ b/tests/test_utils/recipes/gpt-grads.yaml @@ -62,5 +62,5 @@ products: - test_case: [gpt3_mcore_reruns_resume_check_grads] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 0dafb8685c2..baf07cb9759 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -104,75 +104,75 @@ products: scope: [nightly] platforms: [dgx_h100] ####################################################################### - # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for # + # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for # # some very important tests. # ####################################################################### - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # - environment: [lts] # scope: [nightly] # Non-deterministic: #487 - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # outdated TE: #501 - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #436 - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #437 - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -193,42 +193,42 @@ products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # Hangs: #513 # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # Hangs: #513 - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied] products: # - environment: [dev] - # scope: [mr] # Hangs: #513 + # scope: [mr, mr-github] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap] products: # - environment: [dev] - # scope: [mr] # Hangs: #513 + # scope: [mr, mr-github] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -326,14 +326,14 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -345,49 +345,49 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader] products: # - environment: [dev] - # scope: [mr] # Hangs: #513 + # scope: [mr, mr-github] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -415,25 +415,25 @@ products: - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # Outdated: #502 # - test_case: [gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist] # products: # - environment: [dev] - # scope: [mr] # Broken: #484 + # scope: [mr, mr-github] # Broken: #484 # - environment: [lts] # scope: [nightly] # Requires PyT 2.4: #481 ####################################################################### - # Super important MR tests that run for both DEV and LTS per MR # + # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github # ####################################################################### - test_case: [gpt3_mcore_reruns_persistent_1] products: @@ -445,19 +445,16 @@ products: # - test_case: [gpt3_mcore_reruns_persistent_2] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [lts] - scope: [mr] - - environment: [dev] scope: [mr, mr-github] - platforms: [dgx_h100] - environment: [dev] - scope: [mr-slim] + scope: [mr, mr-github, mr-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather] products: @@ -465,43 +462,40 @@ products: scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] - scope: [mr] + scope: [mr, mr-github] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [lts] - scope: [mr] - - environment: [dev] scope: [mr, mr-github] - platforms: [dgx_h100] - environment: [dev] - scope: [mr-slim] + scope: [mr, mr-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - environment: [lts] - scope: [mr] + scope: [mr, mr-github] # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_a100, dgx_h100] # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] # products: @@ -551,4 +545,4 @@ products: # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te] # products: # - environment: [dev, lts] - # scope: [mr] # Non-deterministic: #483 + # scope: [mr, mr-github] # Non-deterministic: #483 diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index e727c4db5ee..9fcc86830f0 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -62,5 +62,5 @@ products: - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dg x_h100] diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/mamba.yaml index 0f8a4085ea5..40d1d095aa4 100644 --- a/tests/test_utils/recipes/mamba.yaml +++ b/tests/test_utils/recipes/mamba.yaml @@ -67,7 +67,7 @@ products: # - test_case: [hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/moe-dynamic-inference.yaml index c9d1be57add..d477bdeda4a 100644 --- a/tests/test_utils/recipes/moe-dynamic-inference.yaml +++ b/tests/test_utils/recipes/moe-dynamic-inference.yaml @@ -62,5 +62,5 @@ products: - test_case: [gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml index c11cd294592..bd7c4ca0f50 100644 --- a/tests/test_utils/recipes/moe-static-inference.yaml +++ b/tests/test_utils/recipes/moe-static-inference.yaml @@ -57,15 +57,15 @@ products: - test_case: [gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 7a0f7d8a3f6..649da3ba518 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -78,28 +78,28 @@ products: # Weekly tests: Run both DEV and LTS unless something is flaky # ####################################################################### ####################################################################### - # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for # + # mr, mr-github tests: Mostly DEV on mr, mr-github, and LTS on nightly cadence, except for # # some very important tests. # ####################################################################### - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] # hang: #513 # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # hang: #513 - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4] products: @@ -122,7 +122,7 @@ products: - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] products: @@ -135,17 +135,17 @@ products: scope: [mr] platforms: [dgx_h100] ####################################################################### - # Super important MR tests that run for both DEV and LTS per MR # + # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github # ####################################################################### # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM] # products: # - environment: [dev] - # scope: [mr] + # scope: [mr, mr-github] # platforms: [dgx_h100] ########################### # Merge train tests # @@ -153,18 +153,12 @@ products: - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - environment: [dev] - scope: [mr-slim] + scope: [mr, mr-github, mr-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] - scope: [mr, mr-github] - platforms: [dgx_h100] - - environment: [dev] - scope: [mr-slim] + scope: [mr, mr-github, mr-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] products: From ccf794e8e51af72bed287219e9da3ab32c0938e1 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 29 Oct 2025 17:56:26 +0800 Subject: [PATCH 084/248] Renaming golden values (#2020) Signed-off-by: Hongbin Liu --- ...ev_coreweave.json => golden_values_dev_dgxh100_coreweave.json} | 0 ...den_values_dev_eos.json => golden_values_dev_dgxh100_eos.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/{golden_values_dev_coreweave.json => golden_values_dev_dgxh100_coreweave.json} (100%) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/{golden_values_dev_eos.json => golden_values_dev_dgxh100_eos.json} (100%) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json From 7342f67d2f2dc8cb3b5a9d18bf6674f56f505678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Oct 2025 10:56:40 +0100 Subject: [PATCH 085/248] Ko3n1g/chore/sync main to dev (#2018) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig Co-authored-by: James Shen Co-authored-by: Chen-Han Yu Co-authored-by: Shanmugam Ramasamy Co-authored-by: Shanmugam Ramasamy Co-authored-by: Mcore Bot Co-authored-by: Shanmugam Ramasamy Co-authored-by: Siddharth Singh Co-authored-by: Shanmugam Ramasamy Co-authored-by: Youngeun Kwon Co-authored-by: Shunjia Ding Co-authored-by: Maanu Grover Co-authored-by: Jack Chang Co-authored-by: jianbinc Co-authored-by: xuwenc Co-authored-by: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com> --- .github/workflows/cicd-approve-test-queue.yml | 8 +- .github/workflows/cicd-main.yml | 2 +- .github/workflows/copyright-check.yml | 1 + .gitlab/stages/00.pre.yml | 24 +- .gitlab/stages/05.publish.yml | 56 ++ pyproject.toml | 7 +- .../python_scripts/auto_reminder_github.py | 326 ++++++++++ .../python_scripts/check_status_of_main.py | 2 + .../launch_nemo_run_workload.py | 6 - tests/test_utils/recipes/gpt.yaml | 2 +- .../recipes/mamba-static-inference.yaml | 2 +- uv.lock | 586 ++++++++++-------- 12 files changed, 716 insertions(+), 306 deletions(-) create mode 100644 tests/test_utils/python_scripts/auto_reminder_github.py diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index ccc8327368d..1c35031cb35 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -26,7 +26,7 @@ jobs: if: github.repository == 'NVIDIA/Megatron-LM' strategy: matrix: - branch: [main, dev] + branch: [main, dev, others] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -45,6 +45,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }} + PYTHONUNBUFFERED: 1 shell: python run: | import os @@ -100,7 +101,10 @@ jobs: return False base_branch = pr_info.get("base", {}).get("ref") - if base_branch == target_branch: + if ( + (base_branch == target_branch) or + (base_branch != "main" and base_branch != "dev" and target_branch == "others") + ): print(f"PR #{pr_number} targets {target_branch}") return True diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 27e1f6cdacb..855b444ad64 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -192,7 +192,7 @@ jobs: export PATH=".venv/bin:$PATH" export GITLAB_ENDPOINT=github.com export CI_PROJECT_NAMESPACE=NVIDIA - export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" + export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" export CHECK_ONLY=true export SKIP_DOCS=false bash tools/autoformat.sh diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index bb9640a1147..05ca4b4cec9 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -33,6 +33,7 @@ jobs: needs: [pre-flight] if: | !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') && github.repository == 'NVIDIA/Megatron-LM' uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.12 diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index dca3a7b47ae..a22c2cf3ea7 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -21,29 +21,6 @@ include: - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin -pre:mirror_to_github: - rules: - - if: '($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") && $CI_PIPELINE_SOURCE == "push"' - allow_failure: true - - when: never - tags: - - arch/amd64 - - env/prod - - origin/jet-fleet - - owner/jet-core - - purpose/utility - - team/megatron - stage: .pre - image: python:3.10 - variables: - GIT_STRATEGY: "clone" - script: - - git checkout $CI_COMMIT_BRANCH - - git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true - - git push -u github $CI_COMMIT_BRANCH - retry: - max: 2 - pre:create_ci_branches: rules: - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"' @@ -61,6 +38,7 @@ pre:create_ci_branches: - branch: ci-upgrade-dependencies - branch: ci-approve-main - branch: ci-approve-dev + - branch: ci-sync-branches tags: - arch/amd64 - env/prod diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml index 3b50562629a..39f072c88ae 100644 --- a/.gitlab/stages/05.publish.yml +++ b/.gitlab/stages/05.publish.yml @@ -800,3 +800,59 @@ publish:approve_merge_gate: - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') when: always - when: never + +publish:sync_branches: + stage: publish + image: python:3.10 + script: + - set -x + - git remote add github https://github.com/NVIDIA/Megatron-LM.git || true + - git remote add gitlab https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/${CI_PROJECT_NAMESPACE}/Megatron-LM.git || true + - BRANCHES=("main" "dev") + - | + while IFS= read -r line; do + BRANCHES+=("$line") # Add each line to the array + done < <( \ + git ls-remote --heads "https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git" 'refs/heads/core_*' | \ + cut -d'/' -f3- \ + ) + - | + for BRANCH in "${BRANCHES[@]}"; do + # Define the full refspec for the branch + BRANCH_REF="refs/heads/$BRANCH" + + echo "--- Processing branch: $BRANCH ---" + + # 1. Explicitly fetch the branch ref from 'github' + # This avoids fetching a tag with the same name. + # It updates/creates the remote-tracking branch (e.g., 'refs/remotes/github/core_r0.10.0') + if ! git fetch github "$BRANCH_REF:refs/remotes/github/$BRANCH"; then + echo "Failed to fetch branch $BRANCH. Skipping." + continue + fi + + # 2. Create or update the local branch from the remote-tracking branch we just fetched. + # The -B flag creates the branch if it doesn't exist or resets it if it does. + if ! git checkout -B "$BRANCH" "github/$BRANCH"; then + echo "Failed to checkout local branch $BRANCH. Skipping." + continue + fi + + # 3. Now you are on the correct local branch, ready to push. + echo "Successfully on branch $BRANCH. Echoing push command:" + git push -u gitlab HEAD:refs/heads/$BRANCH --force + echo "-----------------------------------" + done + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + retry: + max: 2 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-sync-branches') + when: always + - when: never diff --git a/pyproject.toml b/pyproject.toml index db91ce393e7..246189d6bd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,7 @@ dev = [ "mamba-ssm~=2.2", "causal-conv1d~=1.5", "nv-grouped-gemm~=1.1", - "transformer-engine[pytorch]>=2.7.0a0,<2.9.0", + "transformer-engine[pytorch]>=2.7.0a0,<2.10.0", "nvidia-resiliency-ext>=0.4.0a0,<0.5.0", "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'", "megatron-energon[av_decode]~=6.0", @@ -168,9 +168,10 @@ override-dependencies = [ flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] -transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.8" } # on `release_v2.8` +transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9` +nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" } emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "cf9909b777ffac18e05b67a6708282cadc000942" } -nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" } + [tool.isort] profile = "black" # black-compatible line_length = 100 # should match black parameters diff --git a/tests/test_utils/python_scripts/auto_reminder_github.py b/tests/test_utils/python_scripts/auto_reminder_github.py new file mode 100644 index 00000000000..df75ec0542c --- /dev/null +++ b/tests/test_utils/python_scripts/auto_reminder_github.py @@ -0,0 +1,326 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 +""" +GitHub PR Review Reminder Automation +Requirements: pip install PyGithub slack-sdk requests +Usage: GH_TOKEN=ghp_... SLACK_TOKEN=xoxb-... SLACK_WEBHOOK_URL=https://... REPO=NVIDIA/Megatron-LM python github_pr_reminder.py +""" + +import logging +import os +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import List + +import requests +from github import Github +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class Reminder: + id: int + pr: str + milestone: str + author: str + priority: str + review_stage: str + total_review_time: int + current_stage_time: int + reviewers: List[str] + action_message: str + + +class PRReviewTracker: + EXPERT_REVIEW = "Expert Review" + FINAL_REVIEW = "Final Review" + EXCLUDED_TEAMS = {"core-adlr", "core-nemo"} + + def __init__( + self, token: str, repo_name: str, slack_token: str = None, webhook_url: str = None + ): + self.github = Github(token) + self.repo = self.github.get_repo(repo_name) + self.email_cache = {} + self.slack_id_cache = {} + self.slack_client = WebClient(token=slack_token) if slack_token else None + self.webhook_url = webhook_url + + def get_user_email(self, username: str): + """Get user's email, prioritizing public profile, then recent commits.""" + if username in self.email_cache: + return self.email_cache[username] + + try: + user = self.github.get_user(username) + + # 1. Try public profile email first + if user.email and not user.email.endswith("@users.noreply.github.com"): + self.email_cache[username] = user.email + return user.email + + # 2. If no public email, check recent commits on the main repo + try: + # Use get_commits(author=...) which is more direct than search_commits + for commit in self.repo.get_commits(author=user)[:10]: + email = commit.commit.author.email + if email and not email.endswith("@users.noreply.github.com"): + self.email_cache[username] = email + return email + except Exception as e: + logger.debug(f"Could not check commits for {username}: {e}") + + # 3. Fallback to public email (even if noreply) or a constructed noreply + email = user.email or f"{username}@users.noreply.github.com" + self.email_cache[username] = email + return email + + except Exception as e: + logger.warning(f"Could not get user object for {username}: {e}") + email = f"{username}@users.noreply.github.com" + self.email_cache[username] = email + return email + + def get_slack_user_id(self, email: str): + """Get Slack user ID from email.""" + if not self.slack_client: + return email + if email in self.slack_id_cache: + return self.slack_id_cache[email] + try: + response = self.slack_client.users_lookupByEmail(email=email) + user_id = response["user"]["id"] + self.slack_id_cache[email] = f"<@{user_id}>" + return self.slack_id_cache[email] + except SlackApiError as e: + logger.warning(f"Could not find Slack user for {email}: {e.response['error']}") + self.slack_id_cache[email] = email + return email + + def get_label_date(self, pr, label: str): + """Get most recent date when label was attached.""" + dates = [ + e.created_at + for e in pr.as_issue().get_events() + if e.event == "labeled" and e.label and e.label.name == label + ] + return max(dates) if dates else None + + def days_since(self, date): + """Calculate days since given date.""" + if not date: + return 0 + if date.tzinfo is None: + date = date.replace(tzinfo=timezone.utc) + return (datetime.now(timezone.utc) - date).days + + def get_stage(self, pr): + """Get current review stage.""" + labels = {l.name for l in pr.labels} + return self.FINAL_REVIEW if self.FINAL_REVIEW in labels else self.EXPERT_REVIEW + + def get_reviewers(self, pr): + """Get filtered reviewer emails who haven't approved yet.""" + stage = self.get_stage(pr) + org = self.github.get_organization(self.repo.organization.login) + + # 1. Get the latest review state for everyone who has submitted a review + latest_reviews = {} + try: + for review in pr.get_reviews(): + if not review.user: # Handle rare cases of deleted users + continue + # Only track 'APPROVED' or 'CHANGES_REQUESTED' as definitive states + if review.state in ("APPROVED", "CHANGES_REQUESTED"): + if ( + review.user.login not in latest_reviews + or review.submitted_at > latest_reviews[review.user.login].submitted_at + ): + latest_reviews[review.user.login] = review + except Exception as e: + logger.warning(f"Could not get reviews for PR #{pr.number}: {e}") + + # 2. Separate reviewers into approvers (List B) and non-approvers + approvers = {user for user, review in latest_reviews.items() if review.state == "APPROVED"} + non_approving_reviewers = { + user for user, review in latest_reviews.items() if review.state == "CHANGES_REQUESTED" + } + + # 3. Get all *currently pending* review requests + try: + pending_users_req, pending_teams_req = pr.get_review_requests() + pending_individuals = {r.login for r in pending_users_req} + pending_teams_slugs = {t.slug for t in pending_teams_req} + except Exception as e: + logger.warning(f"Could not get review requests for PR #{pr.number}: {e}") + pending_individuals = set() + pending_teams_slugs = set() + + # 4. Filter pending teams based on the current stage + teams_to_query = ( + pending_teams_slugs - self.EXCLUDED_TEAMS + if stage == self.EXPERT_REVIEW + else pending_teams_slugs & self.EXCLUDED_TEAMS + ) + + # 5. Get members from the required pending teams + pending_team_members = set() + for slug in teams_to_query: + try: + pending_team_members.update( + m.login for m in org.get_team_by_slug(slug).get_members() + ) + except Exception as e: + logger.warning(f"Could not get members for team {slug} on PR #{pr.number}: {e}") + + # 6. "List A": Combine all users who *still need to review* + all_required_reviewers = ( + pending_individuals | pending_team_members | non_approving_reviewers + ) + + # 7. Final list (List A - List B): + pending_reviewers = all_required_reviewers - approvers + reviewer_emails = sorted([self.get_user_email(u) for u in pending_reviewers]) + action_message = "Please review the PR." + + # 8. Handle the original edge cases + if len(reviewer_emails) == 0: + if stage == self.EXPERT_REVIEW: + # Assign to PR author + reviewer_emails = [self.get_user_email(pr.user.login)] + action_message = "All Expert Reviewers approved the PR. Please attach the Final Review label to proceed with the review." + elif stage == self.FINAL_REVIEW: + # Assign to mcore-reviewers who approved + try: + mcore_team = org.get_team_by_slug("mcore-reviewers") + mcore_members = {m.login for m in mcore_team.get_members()} + valid_approvers = approvers & mcore_members + reviewer_emails = sorted([self.get_user_email(u) for u in valid_approvers]) + action_message = "All Final Reviewers approved the PR. Please ping an Expert or Final Reviewer to merge the PR." + + except Exception as e: + logger.warning( + f"Could not get mcore-reviewers approvers for PR #{pr.number}: {e}" + ) + + return reviewer_emails, action_message + + def create_reminder(self, pr): + """Create reminder for PR.""" + stage = self.get_stage(pr) + stage_days = self.days_since(self.get_label_date(pr, stage)) + author_email = self.get_user_email(pr.user.login) + reviewer_emails, action_message = self.get_reviewers(pr) + + return Reminder( + id=pr.number, + pr=f"<{pr.html_url}|#{pr.number} - {pr.title}>", + milestone=pr.milestone.title if pr.milestone else "No Milestone", + author=self.get_slack_user_id(author_email), + priority="P0" if stage_days > 3 else "P1" if stage_days >= 1 else "P2", + review_stage=stage, + total_review_time=self.days_since(self.get_label_date(pr, self.EXPERT_REVIEW)), + current_stage_time=stage_days, + reviewers=[self.get_slack_user_id(email) for email in reviewer_emails], + action_message=action_message, + ) + + def generate_reminders(self): + """Generate all reminders.""" + milestones = list(self.repo.get_milestones(state="open", sort="due_on", direction="desc"))[ + :2 + ] + logger.info(f"Found milestones: {', '.join(m.title for m in milestones)}") + + reminders = [] + for milestone in milestones: + # Find issues with the 'Expert Review' or 'Final Review' label + query = ( + f'repo:"{self.repo.full_name}" ' + f'milestone:"{milestone.title}" ' + f'is:open is:pr ' + f'label:"{self.EXPERT_REVIEW}","{self.FINAL_REVIEW}"' + ) + try: + # Use search_issues for a more direct query instead of get_issues + filtering + issues = self.github.search_issues(query) + for issue in issues: + try: + reminders.append(self.create_reminder(issue.as_pull_request())) + logger.info(f"Processed PR #{issue.number}") + except Exception as e: + logger.error(f"Failed to process PR #{issue.number}: {e}") + except Exception as e: + logger.error(f"Failed to search issues for milestone {milestone.title}: {e}") + + return sorted(reminders, key=lambda r: (r.priority, -r.current_stage_time)) + + def send_slack_notification(self, reminder: Reminder): + """Send Slack notification via webhook.""" + if not self.webhook_url: + return + + reviewers_str = ', '.join(reminder.reviewers) if reminder.reviewers else 'None' + message = [ + f"*PR*: {reminder.pr}", + f"*Milestone*: {reminder.milestone}", + f"*Author*: {reminder.author}", + f"*Priority*: {reminder.priority}", + f"*Review stage*: {reminder.review_stage}", + f"*Days in review*: {reminder.total_review_time}", + f"*Days in {reminder.review_stage}*: {reminder.current_stage_time}", + f"*Reviewers*: {reviewers_str}", + ] + + payload = { + "text": f"PR Review Reminder: {reminder.priority} - PR #{reminder.id}", + "blocks": [{"type": "section", "text": {"type": "mrkdwn", "text": "\n".join(message)}}], + } + + try: + response = requests.post(self.webhook_url, json=payload, timeout=10) + response.raise_for_status() + logger.info(f"Sent Slack notification for PR #{reminder.id}") + except requests.exceptions.RequestException as e: + logger.error(f"Failed to send Slack notification for PR #{reminder.id}: {e}") + + +def main(): + token = os.environ.get("GH_TOKEN") + slack_token = os.environ.get("SLACK_TOKEN") + webhook_url = os.environ.get("SLACK_WEBHOOK_URL") + repo = os.environ.get("REPO", "NVIDIA/Megatron-LM") + + if not token: + logger.error("GH_TOKEN environment variable is required") + sys.exit(1) + + logger.info(f"Starting PR review reminder for {repo}") + tracker = PRReviewTracker(token, repo, slack_token, webhook_url) + reminders = tracker.generate_reminders() + logger.info(f"Generated {len(reminders)} reminders\n{'=' * 80}") + + if not reminders: + logger.info("No reminders to send.") + return + + for r in reminders: + logger.info(f"{r.priority} | PR #{r.id} | {r.milestone}") + logger.info(f" Author: {r.author} | Stage: {r.review_stage}") + logger.info(f" Stage time: {r.current_stage_time}d | Total: {r.total_review_time}") + logger.info(f" Reviewers: {', '.join(r.reviewers) if r.reviewers else 'None'}") + logger.info(f" Action message: {r.action_message}") + logger.info("-" * 80) + if webhook_url: + tracker.send_slack_notification(r) + + logger.info("All reminders processed.") + + +if __name__ == "__main__": + main() diff --git a/tests/test_utils/python_scripts/check_status_of_main.py b/tests/test_utils/python_scripts/check_status_of_main.py index a1cae393bfb..ce777814b91 100644 --- a/tests/test_utils/python_scripts/check_status_of_main.py +++ b/tests/test_utils/python_scripts/check_status_of_main.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from __future__ import annotations import logging diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index 33d2a4a6a74..6e2b73e430f 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -153,12 +153,6 @@ def main( sys.exit(1) - result_dict = exp.status(return_dict=True) - _, job_dict = list(result_dict.items())[0] - - logger.info(f"Job status: {job_dict["status"]}") - sys.exit(0 if str(job_dict["status"]) == "SUCCEEDED" else 1) - if __name__ == "__main__": main() diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index baf07cb9759..488f3747a0f 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -468,7 +468,7 @@ products: - environment: [lts] scope: [mr, mr-github] - environment: [dev] - scope: [mr, mr-slim] + scope: [mr, mr-github, mr-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index 9fcc86830f0..79a5ab4eee2 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -63,4 +63,4 @@ products: products: - environment: [dev] scope: [mr, mr-github] - platforms: [dg x_h100] + platforms: [dgx_h100] diff --git a/uv.lock b/uv.lock index c20d3f55dfe..92ad88abd33 100644 --- a/uv.lock +++ b/uv.lock @@ -76,7 +76,7 @@ wheels = [ [[package]] name = "aiobotocore" -version = "2.25.0" +version = "2.25.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -87,9 +87,9 @@ dependencies = [ { name = "python-dateutil" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/29/89/b1ae494cfd12520c5d3b19704a14ffa19153634be47d48052e45223eee86/aiobotocore-2.25.0.tar.gz", hash = "sha256:169d07de312fd51292292f2c8faf8f67d0f466f525cea03855fe065ddc85f79d", size = 120514, upload-time = "2025-10-10T17:39:12.291Z" } +sdist = { url = "https://files.pythonhosted.org/packages/62/94/2e4ec48cf1abb89971cb2612d86f979a6240520f0a659b53a43116d344dc/aiobotocore-2.25.1.tar.gz", hash = "sha256:ea9be739bfd7ece8864f072ec99bb9ed5c7e78ebb2b0b15f29781fbe02daedbc", size = 120560, upload-time = "2025-10-28T22:33:21.787Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/4e/3592d88436bbd60984a08440793c0ba245f538f9f6287b59c1e2c0aead8c/aiobotocore-2.25.0-py3-none-any.whl", hash = "sha256:0524fd36f6d522ddc9d013df2c19fb56369ffdfbffd129895918fbfe95216dad", size = 86028, upload-time = "2025-10-10T17:39:10.423Z" }, + { url = "https://files.pythonhosted.org/packages/95/2a/d275ec4ce5cd0096665043995a7d76f5d0524853c76a3d04656de49f8808/aiobotocore-2.25.1-py3-none-any.whl", hash = "sha256:eb6daebe3cbef5b39a0bb2a97cffbe9c7cb46b2fcc399ad141f369f3c2134b1f", size = 86039, upload-time = "2025-10-28T22:33:19.949Z" }, ] [[package]] @@ -103,7 +103,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.1" +version = "3.13.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -115,110 +115,110 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ba/fa/3ae643cd525cf6844d3dc810481e5748107368eb49563c15a5fb9f680750/aiohttp-3.13.1.tar.gz", hash = "sha256:4b7ee9c355015813a6aa085170b96ec22315dabc3d866fd77d147927000e9464", size = 7835344, upload-time = "2025-10-17T14:03:29.337Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e6/34/5097441cc3047eccc2e0bfed3760ed068489b8392545d3aec0d8fbfab2b5/aiohttp-3.13.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2349a6b642020bf20116a8a5c83bae8ba071acf1461c7cbe45fc7fafd552e7e2", size = 735069, upload-time = "2025-10-17T13:58:56.602Z" }, - { url = "https://files.pythonhosted.org/packages/8c/2b/726466b4b4b16271a3db2a8a914d754d6cb9cee7bebde1f3ac6043e4e030/aiohttp-3.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2a8434ca31c093a90edb94d7d70e98706ce4d912d7f7a39f56e1af26287f4bb7", size = 492575, upload-time = "2025-10-17T13:58:58.696Z" }, - { url = "https://files.pythonhosted.org/packages/82/1f/364e64292c95bb6c9e2823b0afa1ad3f06524c573d45df82294be572489d/aiohttp-3.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0bd610a7e87431741021a9a6ab775e769ea8c01bf01766d481282bfb17df597f", size = 487862, upload-time = "2025-10-17T13:59:00.315Z" }, - { url = "https://files.pythonhosted.org/packages/23/b0/c5a774b3125ac854987b8ca45a6d995829987d01ece4525d3fc369a9ca88/aiohttp-3.13.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:777ec887264b629395b528af59b8523bf3164d4c6738cd8989485ff3eda002e2", size = 1666761, upload-time = "2025-10-17T13:59:02.224Z" }, - { url = "https://files.pythonhosted.org/packages/29/be/32c6c1d3a6c69e594b855bbf4014bea4c42008b0daac8c6e5c9f03207b89/aiohttp-3.13.1-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ac1892f56e2c445aca5ba28f3bf8e16b26dfc05f3c969867b7ef553b74cb4ebe", size = 1634627, upload-time = "2025-10-17T13:59:03.829Z" }, - { url = "https://files.pythonhosted.org/packages/73/8d/fde3a8f4801b14e0b9490f5bc86c5106cb7d96bd60ff2aaee53749c72fe1/aiohttp-3.13.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:499a047d1c5e490c31d16c033e2e47d1358f0e15175c7a1329afc6dfeb04bc09", size = 1726564, upload-time = "2025-10-17T13:59:05.997Z" }, - { url = "https://files.pythonhosted.org/packages/52/b2/8290556f1f6b17b1af976a9abb17f9b54dc7218e11bbf6abbebaa7cc70fb/aiohttp-3.13.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:610be925f89501938c770f1e28ca9dd62e9b308592c81bd5d223ce92434c0089", size = 1814413, upload-time = "2025-10-17T13:59:08.975Z" }, - { url = "https://files.pythonhosted.org/packages/ef/6b/4b657e9fa72479df38117609d4ec8e4b07e8110b872df3872f9c6a96e26b/aiohttp-3.13.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90eb902c06c6ac85d6b80fa9f2bd681f25b1ebf73433d428b3d182a507242711", size = 1667964, upload-time = "2025-10-17T13:59:10.606Z" }, - { url = "https://files.pythonhosted.org/packages/ee/ed/563de175d01fa26459a60a7c82dbf69d20e356d459476a7526329091b4c3/aiohttp-3.13.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ab8ac3224b2beb46266c094b3869d68d5f96f35dba98e03dea0acbd055eefa03", size = 1553917, upload-time = "2025-10-17T13:59:12.312Z" }, - { url = "https://files.pythonhosted.org/packages/39/26/48a4b5681eada16eb5b39cae277765aed1644b03610c43eadb8b331ccfea/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:79ac65b6e2731558aad1e4c1a655d2aa2a77845b62acecf5898b0d4fe8c76618", size = 1637730, upload-time = "2025-10-17T13:59:14.395Z" }, - { url = "https://files.pythonhosted.org/packages/c1/43/57b137af37344e03c7f6b28ddf38a4af820b53c1fa9ce13f668fe468d2e2/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:4dadbd858ed8c04d1aa7a2a91ad65f8e1fbd253ae762ef5be8111e763d576c3c", size = 1644088, upload-time = "2025-10-17T13:59:16.749Z" }, - { url = "https://files.pythonhosted.org/packages/0d/c4/e49bafa4babef09929b10968a6b6efe3707fbaa5c5bb7c8db7f810232269/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e0b2ccd331bc77149e88e919aa95c228a011e03e1168fd938e6aeb1a317d7a8a", size = 1696215, upload-time = "2025-10-17T13:59:18.711Z" }, - { url = "https://files.pythonhosted.org/packages/15/e4/8414be434b3e50f9089ffa7c4d5130ba6ff0d1c6fa9f55cd760b088abbe0/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:fba3c85fb24fe204e73f3c92f09f4f5cfa55fa7e54b34d59d91b7c5a258d0f6a", size = 1540617, upload-time = "2025-10-17T13:59:20.46Z" }, - { url = "https://files.pythonhosted.org/packages/bd/8b/31cb6725f819b74a9c0b0055c500187294e73aea40708b6a5aa7b328ea4c/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d5011e4e741d2635cda18f2997a56e8e1d1b94591dc8732f2ef1d3e1bfc5f45", size = 1713509, upload-time = "2025-10-17T13:59:22.61Z" }, - { url = "https://files.pythonhosted.org/packages/24/ac/49a79c2711423cfa091e265c46e58617de31258c64502b890f25421cb742/aiohttp-3.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c5fe2728a89c82574bd3132d59237c3b5fb83e2e00a320e928d05d74d1ae895f", size = 1654702, upload-time = "2025-10-17T13:59:24.396Z" }, - { url = "https://files.pythonhosted.org/packages/30/52/1cf23cffeda1f079f20cd9c72174a76e8b0c6595def6803892e37ee35c8a/aiohttp-3.13.1-cp310-cp310-win32.whl", hash = "sha256:add14a5e68cbcfc526c89c1ed8ea963f5ff8b9b4b854985b07820c6fbfdb3c3c", size = 430898, upload-time = "2025-10-17T13:59:26.227Z" }, - { url = "https://files.pythonhosted.org/packages/0e/13/214a01f2936f4645b1fbd5cba9001331ca5af5c04bbdbe747eed330a8516/aiohttp-3.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:a4cc9d9cfdf75a69ae921c407e02d0c1799ab333b0bc6f7928c175f47c080d6a", size = 453684, upload-time = "2025-10-17T13:59:28.129Z" }, - { url = "https://files.pythonhosted.org/packages/be/2c/739d03730ffce57d2093e2e611e1541ac9a4b3bb88288c33275058b9ffc2/aiohttp-3.13.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9eefa0a891e85dca56e2d00760945a6325bd76341ec386d3ad4ff72eb97b7e64", size = 742004, upload-time = "2025-10-17T13:59:29.73Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f8/7f5b7f7184d7c80e421dbaecbd13e0b2a0bb8663fd0406864f9a167a438c/aiohttp-3.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c20eb646371a5a57a97de67e52aac6c47badb1564e719b3601bbb557a2e8fd0", size = 495601, upload-time = "2025-10-17T13:59:31.312Z" }, - { url = "https://files.pythonhosted.org/packages/3e/af/fb78d028b9642dd33ff127d9a6a151586f33daff631b05250fecd0ab23f8/aiohttp-3.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bfc28038cd86fb1deed5cc75c8fda45c6b0f5c51dfd76f8c63d3d22dc1ab3d1b", size = 491790, upload-time = "2025-10-17T13:59:33.304Z" }, - { url = "https://files.pythonhosted.org/packages/1e/ae/e40e422ee995e4f91f7f087b86304e3dd622d3a5b9ca902a1e94ebf9a117/aiohttp-3.13.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b22eeffca2e522451990c31a36fe0e71079e6112159f39a4391f1c1e259a795", size = 1746350, upload-time = "2025-10-17T13:59:35.158Z" }, - { url = "https://files.pythonhosted.org/packages/28/a5/fe6022bb869bf2d2633b155ed8348d76358c22d5ff9692a15016b2d1019f/aiohttp-3.13.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:65782b2977c05ebd78787e3c834abe499313bf69d6b8be4ff9c340901ee7541f", size = 1703046, upload-time = "2025-10-17T13:59:37.077Z" }, - { url = "https://files.pythonhosted.org/packages/5a/a5/c4ef3617d7cdc49f2d5af077f19794946f0f2d94b93c631ace79047361a2/aiohttp-3.13.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dacba54f9be3702eb866b0b9966754b475e1e39996e29e442c3cd7f1117b43a9", size = 1806161, upload-time = "2025-10-17T13:59:38.837Z" }, - { url = "https://files.pythonhosted.org/packages/ad/45/b87d2430aee7e7d00b24e3dff2c5bd69f21017f6edb19cfd91e514664fc8/aiohttp-3.13.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:aa878da718e8235302c365e376b768035add36b55177706d784a122cb822a6a4", size = 1894546, upload-time = "2025-10-17T13:59:40.741Z" }, - { url = "https://files.pythonhosted.org/packages/e8/a2/79eb466786a7f11a0292c353a8a9b95e88268c48c389239d7531d66dbb48/aiohttp-3.13.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e4b4e607fbd4964d65945a7b9d1e7f98b0d5545736ea613f77d5a2a37ff1e46", size = 1745683, upload-time = "2025-10-17T13:59:42.59Z" }, - { url = "https://files.pythonhosted.org/packages/93/1a/153b0ad694f377e94eacc85338efe03ed4776a396c8bb47bd9227135792a/aiohttp-3.13.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0c3db2d0e5477ad561bf7ba978c3ae5f8f78afda70daa05020179f759578754f", size = 1605418, upload-time = "2025-10-17T13:59:45.229Z" }, - { url = "https://files.pythonhosted.org/packages/3f/4e/18605b1bfeb4b00d3396d833647cdb213118e2a96862e5aebee62ad065b4/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9739d34506fdf59bf2c092560d502aa728b8cdb33f34ba15fb5e2852c35dd829", size = 1722379, upload-time = "2025-10-17T13:59:46.969Z" }, - { url = "https://files.pythonhosted.org/packages/72/13/0a38ad385d547fb283e0e1fe1ff1dff8899bd4ed0aaceeb13ec14abbf136/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:b902e30a268a85d50197b4997edc6e78842c14c0703450f632c2d82f17577845", size = 1716693, upload-time = "2025-10-17T13:59:49.217Z" }, - { url = "https://files.pythonhosted.org/packages/55/65/7029d7573ab9009adde380052c6130d02c8db52195fda112db35e914fe7b/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbfc04c8de7def6504cce0a97f9885a5c805fd2395a0634bc10f9d6ecb42524", size = 1784174, upload-time = "2025-10-17T13:59:51.439Z" }, - { url = "https://files.pythonhosted.org/packages/2d/36/fd46e39cb85418e45b0e4a8bfc39651ee0b8f08ea006adf217a221cdb269/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:6941853405a38a5eeb7d9776db77698df373ff7fa8c765cb81ea14a344fccbeb", size = 1593716, upload-time = "2025-10-17T13:59:53.367Z" }, - { url = "https://files.pythonhosted.org/packages/85/b8/188e0cb1be37b4408373171070fda17c3bf9c67c0d3d4fd5ee5b1fa108e1/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7764adcd2dc8bd21c8228a53dda2005428498dc4d165f41b6086f0ac1c65b1c9", size = 1799254, upload-time = "2025-10-17T13:59:55.352Z" }, - { url = "https://files.pythonhosted.org/packages/67/ff/fdf768764eb427b0cc9ebb2cebddf990f94d98b430679f8383c35aa114be/aiohttp-3.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c09e08d38586fa59e5a2f9626505a0326fadb8e9c45550f029feeb92097a0afc", size = 1738122, upload-time = "2025-10-17T13:59:57.263Z" }, - { url = "https://files.pythonhosted.org/packages/94/84/fce7a4d575943394d7c0e632273838eb6f39de8edf25386017bf5f0de23b/aiohttp-3.13.1-cp311-cp311-win32.whl", hash = "sha256:ce1371675e74f6cf271d0b5530defb44cce713fd0ab733713562b3a2b870815c", size = 430491, upload-time = "2025-10-17T13:59:59.466Z" }, - { url = "https://files.pythonhosted.org/packages/ac/d2/d21b8ab6315a5d588c550ab285b4f02ae363edf012920e597904c5a56608/aiohttp-3.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:77a2f5cc28cf4704cc157be135c6a6cfb38c9dea478004f1c0fd7449cf445c28", size = 454808, upload-time = "2025-10-17T14:00:01.247Z" }, - { url = "https://files.pythonhosted.org/packages/1a/72/d463a10bf29871f6e3f63bcf3c91362dc4d72ed5917a8271f96672c415ad/aiohttp-3.13.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0760bd9a28efe188d77b7c3fe666e6ef74320d0f5b105f2e931c7a7e884c8230", size = 736218, upload-time = "2025-10-17T14:00:03.51Z" }, - { url = "https://files.pythonhosted.org/packages/26/13/f7bccedbe52ea5a6eef1e4ebb686a8d7765319dfd0a5939f4238cb6e79e6/aiohttp-3.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7129a424b441c3fe018a414401bf1b9e1d49492445f5676a3aecf4f74f67fcdb", size = 491251, upload-time = "2025-10-17T14:00:05.756Z" }, - { url = "https://files.pythonhosted.org/packages/0c/7c/7ea51b5aed6cc69c873f62548da8345032aa3416336f2d26869d4d37b4a2/aiohttp-3.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e1cb04ae64a594f6ddf5cbb024aba6b4773895ab6ecbc579d60414f8115e9e26", size = 490394, upload-time = "2025-10-17T14:00:07.504Z" }, - { url = "https://files.pythonhosted.org/packages/31/05/1172cc4af4557f6522efdee6eb2b9f900e1e320a97e25dffd3c5a6af651b/aiohttp-3.13.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:782d656a641e755decd6bd98d61d2a8ea062fd45fd3ff8d4173605dd0d2b56a1", size = 1737455, upload-time = "2025-10-17T14:00:09.403Z" }, - { url = "https://files.pythonhosted.org/packages/24/3d/ce6e4eca42f797d6b1cd3053cf3b0a22032eef3e4d1e71b9e93c92a3f201/aiohttp-3.13.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f92ad8169767429a6d2237331726c03ccc5f245222f9373aa045510976af2b35", size = 1699176, upload-time = "2025-10-17T14:00:11.314Z" }, - { url = "https://files.pythonhosted.org/packages/25/04/7127ba55653e04da51477372566b16ae786ef854e06222a1c96b4ba6c8ef/aiohttp-3.13.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0e778f634ca50ec005eefa2253856921c429581422d887be050f2c1c92e5ce12", size = 1767216, upload-time = "2025-10-17T14:00:13.668Z" }, - { url = "https://files.pythonhosted.org/packages/b8/3b/43bca1e75847e600f40df829a6b2f0f4e1d4c70fb6c4818fdc09a462afd5/aiohttp-3.13.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:9bc36b41cf4aab5d3b34d22934a696ab83516603d1bc1f3e4ff9930fe7d245e5", size = 1865870, upload-time = "2025-10-17T14:00:15.852Z" }, - { url = "https://files.pythonhosted.org/packages/9e/69/b204e5d43384197a614c88c1717c324319f5b4e7d0a1b5118da583028d40/aiohttp-3.13.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3fd4570ea696aee27204dd524f287127ed0966d14d309dc8cc440f474e3e7dbd", size = 1751021, upload-time = "2025-10-17T14:00:18.297Z" }, - { url = "https://files.pythonhosted.org/packages/1c/af/845dc6b6fdf378791d720364bf5150f80d22c990f7e3a42331d93b337cc7/aiohttp-3.13.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7bda795f08b8a620836ebfb0926f7973972a4bf8c74fdf9145e489f88c416811", size = 1561448, upload-time = "2025-10-17T14:00:20.152Z" }, - { url = "https://files.pythonhosted.org/packages/7a/91/d2ab08cd77ed76a49e4106b1cfb60bce2768242dd0c4f9ec0cb01e2cbf94/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:055a51d90e351aae53dcf324d0eafb2abe5b576d3ea1ec03827d920cf81a1c15", size = 1698196, upload-time = "2025-10-17T14:00:22.131Z" }, - { url = "https://files.pythonhosted.org/packages/5e/d1/082f0620dc428ecb8f21c08a191a4694915cd50f14791c74a24d9161cc50/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:d4131df864cbcc09bb16d3612a682af0db52f10736e71312574d90f16406a867", size = 1719252, upload-time = "2025-10-17T14:00:24.453Z" }, - { url = "https://files.pythonhosted.org/packages/fc/78/2af2f44491be7b08e43945b72d2b4fd76f0a14ba850ba9e41d28a7ce716a/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:163d3226e043f79bf47c87f8dfc89c496cc7bc9128cb7055ce026e435d551720", size = 1736529, upload-time = "2025-10-17T14:00:26.567Z" }, - { url = "https://files.pythonhosted.org/packages/b0/34/3e919ecdc93edaea8d140138049a0d9126141072e519535e2efa38eb7a02/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a2370986a3b75c1a5f3d6f6d763fc6be4b430226577b0ed16a7c13a75bf43d8f", size = 1553723, upload-time = "2025-10-17T14:00:28.592Z" }, - { url = "https://files.pythonhosted.org/packages/21/4b/d8003aeda2f67f359b37e70a5a4b53fee336d8e89511ac307ff62aeefcdb/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d7c14de0c7c9f1e6e785ce6cbe0ed817282c2af0012e674f45b4e58c6d4ea030", size = 1763394, upload-time = "2025-10-17T14:00:31.051Z" }, - { url = "https://files.pythonhosted.org/packages/4c/7b/1dbe6a39e33af9baaafc3fc016a280663684af47ba9f0e5d44249c1f72ec/aiohttp-3.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb611489cf0db10b99beeb7280bd39e0ef72bc3eb6d8c0f0a16d8a56075d1eb7", size = 1718104, upload-time = "2025-10-17T14:00:33.407Z" }, - { url = "https://files.pythonhosted.org/packages/5c/88/bd1b38687257cce67681b9b0fa0b16437be03383fa1be4d1a45b168bef25/aiohttp-3.13.1-cp312-cp312-win32.whl", hash = "sha256:f90fe0ee75590f7428f7c8b5479389d985d83c949ea10f662ab928a5ed5cf5e6", size = 425303, upload-time = "2025-10-17T14:00:35.829Z" }, - { url = "https://files.pythonhosted.org/packages/0e/e3/4481f50dd6f27e9e58c19a60cff44029641640237e35d32b04aaee8cf95f/aiohttp-3.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:3461919a9dca272c183055f2aab8e6af0adc810a1b386cce28da11eb00c859d9", size = 452071, upload-time = "2025-10-17T14:00:37.764Z" }, - { url = "https://files.pythonhosted.org/packages/16/6d/d267b132342e1080f4c1bb7e1b4e96b168b3cbce931ec45780bff693ff95/aiohttp-3.13.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:55785a7f8f13df0c9ca30b5243d9909bd59f48b274262a8fe78cee0828306e5d", size = 730727, upload-time = "2025-10-17T14:00:39.681Z" }, - { url = "https://files.pythonhosted.org/packages/92/c8/1cf495bac85cf71b80fad5f6d7693e84894f11b9fe876b64b0a1e7cbf32f/aiohttp-3.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bef5b83296cebb8167707b4f8d06c1805db0af632f7a72d7c5288a84667e7c3", size = 488678, upload-time = "2025-10-17T14:00:41.541Z" }, - { url = "https://files.pythonhosted.org/packages/a8/19/23c6b81cca587ec96943d977a58d11d05a82837022e65cd5502d665a7d11/aiohttp-3.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:27af0619c33f9ca52f06069ec05de1a357033449ab101836f431768ecfa63ff5", size = 487637, upload-time = "2025-10-17T14:00:43.527Z" }, - { url = "https://files.pythonhosted.org/packages/48/58/8f9464afb88b3eed145ad7c665293739b3a6f91589694a2bb7e5778cbc72/aiohttp-3.13.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a47fe43229a8efd3764ef7728a5c1158f31cdf2a12151fe99fde81c9ac87019c", size = 1718975, upload-time = "2025-10-17T14:00:45.496Z" }, - { url = "https://files.pythonhosted.org/packages/e1/8b/c3da064ca392b2702f53949fd7c403afa38d9ee10bf52c6ad59a42537103/aiohttp-3.13.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6e68e126de5b46e8b2bee73cab086b5d791e7dc192056916077aa1e2e2b04437", size = 1686905, upload-time = "2025-10-17T14:00:47.707Z" }, - { url = "https://files.pythonhosted.org/packages/0a/a4/9c8a3843ecf526daee6010af1a66eb62579be1531d2d5af48ea6f405ad3c/aiohttp-3.13.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e65ef49dd22514329c55970d39079618a8abf856bae7147913bb774a3ab3c02f", size = 1754907, upload-time = "2025-10-17T14:00:49.702Z" }, - { url = "https://files.pythonhosted.org/packages/a4/80/1f470ed93e06436e3fc2659a9fc329c192fa893fb7ed4e884d399dbfb2a8/aiohttp-3.13.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e425a7e0511648b3376839dcc9190098671a47f21a36e815b97762eb7d556b0", size = 1857129, upload-time = "2025-10-17T14:00:51.822Z" }, - { url = "https://files.pythonhosted.org/packages/cc/e6/33d305e6cce0a8daeb79c7d8d6547d6e5f27f4e35fa4883fc9c9eb638596/aiohttp-3.13.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:010dc9b7110f055006acd3648d5d5955bb6473b37c3663ec42a1b4cba7413e6b", size = 1738189, upload-time = "2025-10-17T14:00:53.976Z" }, - { url = "https://files.pythonhosted.org/packages/ac/42/8df03367e5a64327fe0c39291080697795430c438fc1139c7cc1831aa1df/aiohttp-3.13.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b5c722d0ca5f57d61066b5dfa96cdb87111e2519156b35c1f8dd17c703bee7a", size = 1553608, upload-time = "2025-10-17T14:00:56.144Z" }, - { url = "https://files.pythonhosted.org/packages/96/17/6d5c73cd862f1cf29fddcbb54aac147037ff70a043a2829d03a379e95742/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:93029f0e9b77b714904a281b5aa578cdc8aa8ba018d78c04e51e1c3d8471b8ec", size = 1681809, upload-time = "2025-10-17T14:00:58.603Z" }, - { url = "https://files.pythonhosted.org/packages/be/31/8926c8ab18533f6076ce28d2c329a203b58c6861681906e2d73b9c397588/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:d1824c7d08d8ddfc8cb10c847f696942e5aadbd16fd974dfde8bd2c3c08a9fa1", size = 1711161, upload-time = "2025-10-17T14:01:01.744Z" }, - { url = "https://files.pythonhosted.org/packages/f2/36/2f83e1ca730b1e0a8cf1c8ab9559834c5eec9f5da86e77ac71f0d16b521d/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:8f47d0ff5b3eb9c1278a2f56ea48fda667da8ebf28bd2cb378b7c453936ce003", size = 1731999, upload-time = "2025-10-17T14:01:04.626Z" }, - { url = "https://files.pythonhosted.org/packages/b9/ec/1f818cc368dfd4d5ab4e9efc8f2f6f283bfc31e1c06d3e848bcc862d4591/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:8a396b1da9b51ded79806ac3b57a598f84e0769eaa1ba300655d8b5e17b70c7b", size = 1548684, upload-time = "2025-10-17T14:01:06.828Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ad/33d36efd16e4fefee91b09a22a3a0e1b830f65471c3567ac5a8041fac812/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d9c52a65f54796e066b5d674e33b53178014752d28bca555c479c2c25ffcec5b", size = 1756676, upload-time = "2025-10-17T14:01:09.517Z" }, - { url = "https://files.pythonhosted.org/packages/3c/c4/4a526d84e77d464437713ca909364988ed2e0cd0cdad2c06cb065ece9e08/aiohttp-3.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a89da72d18d6c95a653470b78d8ee5aa3c4b37212004c103403d0776cbea6ff0", size = 1715577, upload-time = "2025-10-17T14:01:11.958Z" }, - { url = "https://files.pythonhosted.org/packages/a2/21/e39638b7d9c7f1362c4113a91870f89287e60a7ea2d037e258b81e8b37d5/aiohttp-3.13.1-cp313-cp313-win32.whl", hash = "sha256:02e0258b7585ddf5d01c79c716ddd674386bfbf3041fbbfe7bdf9c7c32eb4a9b", size = 424468, upload-time = "2025-10-17T14:01:14.344Z" }, - { url = "https://files.pythonhosted.org/packages/cc/00/f3a92c592a845ebb2f47d102a67f35f0925cb854c5e7386f1a3a1fdff2ab/aiohttp-3.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:ef56ffe60e8d97baac123272bde1ab889ee07d3419606fae823c80c2b86c403e", size = 450806, upload-time = "2025-10-17T14:01:16.437Z" }, - { url = "https://files.pythonhosted.org/packages/97/be/0f6c41d2fd0aab0af133c509cabaf5b1d78eab882cb0ceb872e87ceeabf7/aiohttp-3.13.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:77f83b3dc5870a2ea79a0fcfdcc3fc398187ec1675ff61ec2ceccad27ecbd303", size = 733828, upload-time = "2025-10-17T14:01:18.58Z" }, - { url = "https://files.pythonhosted.org/packages/75/14/24e2ac5efa76ae30e05813e0f50737005fd52da8ddffee474d4a5e7f38a6/aiohttp-3.13.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:9cafd2609ebb755e47323306c7666283fbba6cf82b5f19982ea627db907df23a", size = 489320, upload-time = "2025-10-17T14:01:20.644Z" }, - { url = "https://files.pythonhosted.org/packages/da/5a/4cbe599358d05ea7db4869aff44707b57d13f01724d48123dc68b3288d5a/aiohttp-3.13.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9c489309a2ca548d5f11131cfb4092f61d67954f930bba7e413bcdbbb82d7fae", size = 489899, upload-time = "2025-10-17T14:01:22.638Z" }, - { url = "https://files.pythonhosted.org/packages/67/96/3aec9d9cfc723273d4386328a1e2562cf23629d2f57d137047c49adb2afb/aiohttp-3.13.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79ac15fe5fdbf3c186aa74b656cd436d9a1e492ba036db8901c75717055a5b1c", size = 1716556, upload-time = "2025-10-17T14:01:25.406Z" }, - { url = "https://files.pythonhosted.org/packages/b9/99/39a3d250595b5c8172843831221fa5662884f63f8005b00b4034f2a7a836/aiohttp-3.13.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:095414be94fce3bc080684b4cd50fb70d439bc4662b2a1984f45f3bf9ede08aa", size = 1665814, upload-time = "2025-10-17T14:01:27.683Z" }, - { url = "https://files.pythonhosted.org/packages/3b/96/8319e7060a85db14a9c178bc7b3cf17fad458db32ba6d2910de3ca71452d/aiohttp-3.13.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c68172e1a2dca65fa1272c85ca72e802d78b67812b22827df01017a15c5089fa", size = 1755767, upload-time = "2025-10-17T14:01:29.914Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c6/0a2b3d886b40aa740fa2294cd34ed46d2e8108696748492be722e23082a7/aiohttp-3.13.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3751f9212bcd119944d4ea9de6a3f0fee288c177b8ca55442a2cdff0c8201eb3", size = 1836591, upload-time = "2025-10-17T14:01:32.28Z" }, - { url = "https://files.pythonhosted.org/packages/fb/34/8ab5904b3331c91a58507234a1e2f662f837e193741609ee5832eb436251/aiohttp-3.13.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8619dca57d98a8353abdc7a1eeb415548952b39d6676def70d9ce76d41a046a9", size = 1714915, upload-time = "2025-10-17T14:01:35.138Z" }, - { url = "https://files.pythonhosted.org/packages/b5/d3/d36077ca5f447649112189074ac6c192a666bf68165b693e48c23b0d008c/aiohttp-3.13.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:97795a0cb0a5f8a843759620e9cbd8889f8079551f5dcf1ccd99ed2f056d9632", size = 1546579, upload-time = "2025-10-17T14:01:38.237Z" }, - { url = "https://files.pythonhosted.org/packages/a8/14/dbc426a1bb1305c4fc78ce69323498c9e7c699983366ef676aa5d3f949fa/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1060e058da8f9f28a7026cdfca9fc886e45e551a658f6a5c631188f72a3736d2", size = 1680633, upload-time = "2025-10-17T14:01:40.902Z" }, - { url = "https://files.pythonhosted.org/packages/29/83/1e68e519aff9f3ef6d4acb6cdda7b5f592ef5c67c8f095dc0d8e06ce1c3e/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:f48a2c26333659101ef214907d29a76fe22ad7e912aa1e40aeffdff5e8180977", size = 1678675, upload-time = "2025-10-17T14:01:43.779Z" }, - { url = "https://files.pythonhosted.org/packages/38/b9/7f3e32a81c08b6d29ea15060c377e1f038ad96cd9923a85f30e817afff22/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f1dfad638b9c91ff225162b2824db0e99ae2d1abe0dc7272b5919701f0a1e685", size = 1726829, upload-time = "2025-10-17T14:01:46.546Z" }, - { url = "https://files.pythonhosted.org/packages/23/ce/610b1f77525a0a46639aea91377b12348e9f9412cc5ddcb17502aa4681c7/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:8fa09ab6dd567cb105db4e8ac4d60f377a7a94f67cf669cac79982f626360f32", size = 1542985, upload-time = "2025-10-17T14:01:49.082Z" }, - { url = "https://files.pythonhosted.org/packages/53/39/3ac8dfdad5de38c401846fa071fcd24cb3b88ccfb024854df6cbd9b4a07e/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4159fae827f9b5f655538a4f99b7cbc3a2187e5ca2eee82f876ef1da802ccfa9", size = 1741556, upload-time = "2025-10-17T14:01:51.846Z" }, - { url = "https://files.pythonhosted.org/packages/2a/48/b1948b74fea7930b0f29595d1956842324336de200593d49a51a40607fdc/aiohttp-3.13.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ad671118c19e9cfafe81a7a05c294449fe0ebb0d0c6d5bb445cd2190023f5cef", size = 1696175, upload-time = "2025-10-17T14:01:54.232Z" }, - { url = "https://files.pythonhosted.org/packages/96/26/063bba38e4b27b640f56cc89fe83cc3546a7ae162c2e30ca345f0ccdc3d1/aiohttp-3.13.1-cp314-cp314-win32.whl", hash = "sha256:c5c970c148c48cf6acb65224ca3c87a47f74436362dde75c27bc44155ccf7dfc", size = 430254, upload-time = "2025-10-17T14:01:56.451Z" }, - { url = "https://files.pythonhosted.org/packages/88/aa/25fd764384dc4eab714023112d3548a8dd69a058840d61d816ea736097a2/aiohttp-3.13.1-cp314-cp314-win_amd64.whl", hash = "sha256:748a00167b7a88385756fa615417d24081cba7e58c8727d2e28817068b97c18c", size = 456256, upload-time = "2025-10-17T14:01:58.752Z" }, - { url = "https://files.pythonhosted.org/packages/d4/9f/9ba6059de4bad25c71cd88e3da53f93e9618ea369cf875c9f924b1c167e2/aiohttp-3.13.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:390b73e99d7a1f0f658b3f626ba345b76382f3edc65f49d6385e326e777ed00e", size = 765956, upload-time = "2025-10-17T14:02:01.515Z" }, - { url = "https://files.pythonhosted.org/packages/1f/30/b86da68b494447d3060f45c7ebb461347535dab4af9162a9267d9d86ca31/aiohttp-3.13.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:27e83abb330e687e019173d8fc1fd6a1cf471769624cf89b1bb49131198a810a", size = 503206, upload-time = "2025-10-17T14:02:03.818Z" }, - { url = "https://files.pythonhosted.org/packages/c1/21/d27a506552843ff9eeb9fcc2d45f943b09eefdfdf205aab044f4f1f39f6a/aiohttp-3.13.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2b20eed07131adbf3e873e009c2869b16a579b236e9d4b2f211bf174d8bef44a", size = 507719, upload-time = "2025-10-17T14:02:05.947Z" }, - { url = "https://files.pythonhosted.org/packages/58/23/4042230ec7e4edc7ba43d0342b5a3d2fe0222ca046933c4251a35aaf17f5/aiohttp-3.13.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58fee9ef8477fd69e823b92cfd1f590ee388521b5ff8f97f3497e62ee0656212", size = 1862758, upload-time = "2025-10-17T14:02:08.469Z" }, - { url = "https://files.pythonhosted.org/packages/df/88/525c45bea7cbb9f65df42cadb4ff69f6a0dbf95931b0ff7d1fdc40a1cb5f/aiohttp-3.13.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1f62608fcb7b3d034d5e9496bea52d94064b7b62b06edba82cd38191336bbeda", size = 1717790, upload-time = "2025-10-17T14:02:11.37Z" }, - { url = "https://files.pythonhosted.org/packages/1d/80/21e9b5eb77df352a5788713f37359b570a793f0473f3a72db2e46df379b9/aiohttp-3.13.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fdc4d81c3dfc999437f23e36d197e8b557a3f779625cd13efe563a9cfc2ce712", size = 1842088, upload-time = "2025-10-17T14:02:13.872Z" }, - { url = "https://files.pythonhosted.org/packages/d2/bf/d1738f6d63fe8b2a0ad49533911b3347f4953cd001bf3223cb7b61f18dff/aiohttp-3.13.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:601d7ec812f746fd80ff8af38eeb3f196e1bab4a4d39816ccbc94c222d23f1d0", size = 1934292, upload-time = "2025-10-17T14:02:16.624Z" }, - { url = "https://files.pythonhosted.org/packages/04/e6/26cab509b42610ca49573f2fc2867810f72bd6a2070182256c31b14f2e98/aiohttp-3.13.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47c3f21c469b840d9609089435c0d9918ae89f41289bf7cc4afe5ff7af5458db", size = 1791328, upload-time = "2025-10-17T14:02:19.051Z" }, - { url = "https://files.pythonhosted.org/packages/8a/6d/baf7b462852475c9d045bee8418d9cdf280efb687752b553e82d0c58bcc2/aiohttp-3.13.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d6c6cdc0750db88520332d4aaa352221732b0cafe89fd0e42feec7cb1b5dc236", size = 1622663, upload-time = "2025-10-17T14:02:21.397Z" }, - { url = "https://files.pythonhosted.org/packages/c8/48/396a97318af9b5f4ca8b3dc14a67976f71c6400a9609c622f96da341453f/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:58a12299eeb1fca2414ee2bc345ac69b0f765c20b82c3ab2a75d91310d95a9f6", size = 1787791, upload-time = "2025-10-17T14:02:24.212Z" }, - { url = "https://files.pythonhosted.org/packages/a8/e2/6925f6784134ce3ff3ce1a8502ab366432a3b5605387618c1a939ce778d9/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:0989cbfc195a4de1bb48f08454ef1cb47424b937e53ed069d08404b9d3c7aea1", size = 1775459, upload-time = "2025-10-17T14:02:26.971Z" }, - { url = "https://files.pythonhosted.org/packages/c3/e3/b372047ba739fc39f199b99290c4cc5578ce5fd125f69168c967dac44021/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:feb5ee664300e2435e0d1bc3443a98925013dfaf2cae9699c1f3606b88544898", size = 1789250, upload-time = "2025-10-17T14:02:29.686Z" }, - { url = "https://files.pythonhosted.org/packages/02/8c/9f48b93d7d57fc9ef2ad4adace62e4663ea1ce1753806c4872fb36b54c39/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:58a6f8702da0c3606fb5cf2e669cce0ca681d072fe830968673bb4c69eb89e88", size = 1616139, upload-time = "2025-10-17T14:02:32.151Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c6/c64e39d61aaa33d7de1be5206c0af3ead4b369bf975dac9fdf907a4291c1/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a417ceb433b9d280e2368ffea22d4bc6e3e0d894c4bc7768915124d57d0964b6", size = 1815829, upload-time = "2025-10-17T14:02:34.635Z" }, - { url = "https://files.pythonhosted.org/packages/22/75/e19e93965ea675f1151753b409af97a14f1d888588a555e53af1e62b83eb/aiohttp-3.13.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8ac8854f7b0466c5d6a9ea49249b3f6176013859ac8f4bb2522ad8ed6b94ded2", size = 1760923, upload-time = "2025-10-17T14:02:37.364Z" }, - { url = "https://files.pythonhosted.org/packages/6c/a4/06ed38f1dabd98ea136fd116cba1d02c9b51af5a37d513b6850a9a567d86/aiohttp-3.13.1-cp314-cp314t-win32.whl", hash = "sha256:be697a5aeff42179ed13b332a411e674994bcd406c81642d014ace90bf4bb968", size = 463318, upload-time = "2025-10-17T14:02:39.924Z" }, - { url = "https://files.pythonhosted.org/packages/04/0f/27e4fdde899e1e90e35eeff56b54ed63826435ad6cdb06b09ed312d1b3fa/aiohttp-3.13.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f1d6aa90546a4e8f20c3500cb68ab14679cd91f927fa52970035fd3207dfb3da", size = 496721, upload-time = "2025-10-17T14:02:42.199Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/1c/ce/3b83ebba6b3207a7135e5fcaba49706f8a4b6008153b4e30540c982fae26/aiohttp-3.13.2.tar.gz", hash = "sha256:40176a52c186aefef6eb3cad2cdd30cd06e3afbe88fe8ab2af9c0b90f228daca", size = 7837994, upload-time = "2025-10-28T20:59:39.937Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/34/939730e66b716b76046dedfe0842995842fa906ccc4964bba414ff69e429/aiohttp-3.13.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2372b15a5f62ed37789a6b383ff7344fc5b9f243999b0cd9b629d8bc5f5b4155", size = 736471, upload-time = "2025-10-28T20:55:27.924Z" }, + { url = "https://files.pythonhosted.org/packages/fd/cf/dcbdf2df7f6ca72b0bb4c0b4509701f2d8942cf54e29ca197389c214c07f/aiohttp-3.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7f8659a48995edee7229522984bd1009c1213929c769c2daa80b40fe49a180c", size = 493985, upload-time = "2025-10-28T20:55:29.456Z" }, + { url = "https://files.pythonhosted.org/packages/9d/87/71c8867e0a1d0882dcbc94af767784c3cb381c1c4db0943ab4aae4fed65e/aiohttp-3.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:939ced4a7add92296b0ad38892ce62b98c619288a081170695c6babe4f50e636", size = 489274, upload-time = "2025-10-28T20:55:31.134Z" }, + { url = "https://files.pythonhosted.org/packages/38/0f/46c24e8dae237295eaadd113edd56dee96ef6462adf19b88592d44891dc5/aiohttp-3.13.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6315fb6977f1d0dd41a107c527fee2ed5ab0550b7d885bc15fee20ccb17891da", size = 1668171, upload-time = "2025-10-28T20:55:36.065Z" }, + { url = "https://files.pythonhosted.org/packages/eb/c6/4cdfb4440d0e28483681a48f69841fa5e39366347d66ef808cbdadddb20e/aiohttp-3.13.2-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6e7352512f763f760baaed2637055c49134fd1d35b37c2dedfac35bfe5cf8725", size = 1636036, upload-time = "2025-10-28T20:55:37.576Z" }, + { url = "https://files.pythonhosted.org/packages/84/37/8708cf678628216fb678ab327a4e1711c576d6673998f4f43e86e9ae90dd/aiohttp-3.13.2-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e09a0a06348a2dd73e7213353c90d709502d9786219f69b731f6caa0efeb46f5", size = 1727975, upload-time = "2025-10-28T20:55:39.457Z" }, + { url = "https://files.pythonhosted.org/packages/e6/2e/3ebfe12fdcb9b5f66e8a0a42dffcd7636844c8a018f261efb2419f68220b/aiohttp-3.13.2-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a09a6d073fb5789456545bdee2474d14395792faa0527887f2f4ec1a486a59d3", size = 1815823, upload-time = "2025-10-28T20:55:40.958Z" }, + { url = "https://files.pythonhosted.org/packages/a1/4f/ca2ef819488cbb41844c6cf92ca6dd15b9441e6207c58e5ae0e0fc8d70ad/aiohttp-3.13.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b59d13c443f8e049d9e94099c7e412e34610f1f49be0f230ec656a10692a5802", size = 1669374, upload-time = "2025-10-28T20:55:42.745Z" }, + { url = "https://files.pythonhosted.org/packages/f8/fe/1fe2e1179a0d91ce09c99069684aab619bf2ccde9b20bd6ca44f8837203e/aiohttp-3.13.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:20db2d67985d71ca033443a1ba2001c4b5693fe09b0e29f6d9358a99d4d62a8a", size = 1555315, upload-time = "2025-10-28T20:55:44.264Z" }, + { url = "https://files.pythonhosted.org/packages/5a/2b/f3781899b81c45d7cbc7140cddb8a3481c195e7cbff8e36374759d2ab5a5/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:960c2fc686ba27b535f9fd2b52d87ecd7e4fd1cf877f6a5cba8afb5b4a8bd204", size = 1639140, upload-time = "2025-10-28T20:55:46.626Z" }, + { url = "https://files.pythonhosted.org/packages/72/27/c37e85cd3ece6f6c772e549bd5a253d0c122557b25855fb274224811e4f2/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6c00dbcf5f0d88796151e264a8eab23de2997c9303dd7c0bf622e23b24d3ce22", size = 1645496, upload-time = "2025-10-28T20:55:48.933Z" }, + { url = "https://files.pythonhosted.org/packages/66/20/3af1ab663151bd3780b123e907761cdb86ec2c4e44b2d9b195ebc91fbe37/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fed38a5edb7945f4d1bcabe2fcd05db4f6ec7e0e82560088b754f7e08d93772d", size = 1697625, upload-time = "2025-10-28T20:55:50.377Z" }, + { url = "https://files.pythonhosted.org/packages/95/eb/ae5cab15efa365e13d56b31b0d085a62600298bf398a7986f8388f73b598/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b395bbca716c38bef3c764f187860e88c724b342c26275bc03e906142fc5964f", size = 1542025, upload-time = "2025-10-28T20:55:51.861Z" }, + { url = "https://files.pythonhosted.org/packages/e9/2d/1683e8d67ec72d911397fe4e575688d2a9b8f6a6e03c8fdc9f3fd3d4c03f/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:204ffff2426c25dfda401ba08da85f9c59525cdc42bda26660463dd1cbcfec6f", size = 1714918, upload-time = "2025-10-28T20:55:53.515Z" }, + { url = "https://files.pythonhosted.org/packages/99/a2/ffe8e0e1c57c5e542d47ffa1fcf95ef2b3ea573bf7c4d2ee877252431efc/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:05c4dd3c48fb5f15db31f57eb35374cb0c09afdde532e7fb70a75aede0ed30f6", size = 1656113, upload-time = "2025-10-28T20:55:55.438Z" }, + { url = "https://files.pythonhosted.org/packages/0d/42/d511aff5c3a2b06c09d7d214f508a4ad8ac7799817f7c3d23e7336b5e896/aiohttp-3.13.2-cp310-cp310-win32.whl", hash = "sha256:e574a7d61cf10351d734bcddabbe15ede0eaa8a02070d85446875dc11189a251", size = 432290, upload-time = "2025-10-28T20:55:56.96Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ea/1c2eb7098b5bad4532994f2b7a8228d27674035c9b3234fe02c37469ef14/aiohttp-3.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:364f55663085d658b8462a1c3f17b2b84a5c2e1ba858e1b79bff7b2e24ad1514", size = 455075, upload-time = "2025-10-28T20:55:58.373Z" }, + { url = "https://files.pythonhosted.org/packages/35/74/b321e7d7ca762638cdf8cdeceb39755d9c745aff7a64c8789be96ddf6e96/aiohttp-3.13.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4647d02df098f6434bafd7f32ad14942f05a9caa06c7016fdcc816f343997dd0", size = 743409, upload-time = "2025-10-28T20:56:00.354Z" }, + { url = "https://files.pythonhosted.org/packages/99/3d/91524b905ec473beaf35158d17f82ef5a38033e5809fe8742e3657cdbb97/aiohttp-3.13.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e3403f24bcb9c3b29113611c3c16a2a447c3953ecf86b79775e7be06f7ae7ccb", size = 497006, upload-time = "2025-10-28T20:56:01.85Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d3/7f68bc02a67716fe80f063e19adbd80a642e30682ce74071269e17d2dba1/aiohttp-3.13.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:43dff14e35aba17e3d6d5ba628858fb8cb51e30f44724a2d2f0c75be492c55e9", size = 493195, upload-time = "2025-10-28T20:56:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/98/31/913f774a4708775433b7375c4f867d58ba58ead833af96c8af3621a0d243/aiohttp-3.13.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2a9ea08e8c58bb17655630198833109227dea914cd20be660f52215f6de5613", size = 1747759, upload-time = "2025-10-28T20:56:04.904Z" }, + { url = "https://files.pythonhosted.org/packages/e8/63/04efe156f4326f31c7c4a97144f82132c3bb21859b7bb84748d452ccc17c/aiohttp-3.13.2-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53b07472f235eb80e826ad038c9d106c2f653584753f3ddab907c83f49eedead", size = 1704456, upload-time = "2025-10-28T20:56:06.986Z" }, + { url = "https://files.pythonhosted.org/packages/8e/02/4e16154d8e0a9cf4ae76f692941fd52543bbb148f02f098ca73cab9b1c1b/aiohttp-3.13.2-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e736c93e9c274fce6419af4aac199984d866e55f8a4cec9114671d0ea9688780", size = 1807572, upload-time = "2025-10-28T20:56:08.558Z" }, + { url = "https://files.pythonhosted.org/packages/34/58/b0583defb38689e7f06798f0285b1ffb3a6fb371f38363ce5fd772112724/aiohttp-3.13.2-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ff5e771f5dcbc81c64898c597a434f7682f2259e0cd666932a913d53d1341d1a", size = 1895954, upload-time = "2025-10-28T20:56:10.545Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f3/083907ee3437425b4e376aa58b2c915eb1a33703ec0dc30040f7ae3368c6/aiohttp-3.13.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3b6fb0c207cc661fa0bf8c66d8d9b657331ccc814f4719468af61034b478592", size = 1747092, upload-time = "2025-10-28T20:56:12.118Z" }, + { url = "https://files.pythonhosted.org/packages/ac/61/98a47319b4e425cc134e05e5f3fc512bf9a04bf65aafd9fdcda5d57ec693/aiohttp-3.13.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:97a0895a8e840ab3520e2288db7cace3a1981300d48babeb50e7425609e2e0ab", size = 1606815, upload-time = "2025-10-28T20:56:14.191Z" }, + { url = "https://files.pythonhosted.org/packages/97/4b/e78b854d82f66bb974189135d31fce265dee0f5344f64dd0d345158a5973/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9e8f8afb552297aca127c90cb840e9a1d4bfd6a10d7d8f2d9176e1acc69bad30", size = 1723789, upload-time = "2025-10-28T20:56:16.101Z" }, + { url = "https://files.pythonhosted.org/packages/ed/fc/9d2ccc794fc9b9acd1379d625c3a8c64a45508b5091c546dea273a41929e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed2f9c7216e53c3df02264f25d824b079cc5914f9e2deba94155190ef648ee40", size = 1718104, upload-time = "2025-10-28T20:56:17.655Z" }, + { url = "https://files.pythonhosted.org/packages/66/65/34564b8765ea5c7d79d23c9113135d1dd3609173da13084830f1507d56cf/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:99c5280a329d5fa18ef30fd10c793a190d996567667908bef8a7f81f8202b948", size = 1785584, upload-time = "2025-10-28T20:56:19.238Z" }, + { url = "https://files.pythonhosted.org/packages/30/be/f6a7a426e02fc82781afd62016417b3948e2207426d90a0e478790d1c8a4/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ca6ffef405fc9c09a746cb5d019c1672cd7f402542e379afc66b370833170cf", size = 1595126, upload-time = "2025-10-28T20:56:20.836Z" }, + { url = "https://files.pythonhosted.org/packages/e5/c7/8e22d5d28f94f67d2af496f14a83b3c155d915d1fe53d94b66d425ec5b42/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:47f438b1a28e926c37632bff3c44df7d27c9b57aaf4e34b1def3c07111fdb782", size = 1800665, upload-time = "2025-10-28T20:56:22.922Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/91133c8b68b1da9fc16555706aa7276fdf781ae2bb0876c838dd86b8116e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9acda8604a57bb60544e4646a4615c1866ee6c04a8edef9b8ee6fd1d8fa2ddc8", size = 1739532, upload-time = "2025-10-28T20:56:25.924Z" }, + { url = "https://files.pythonhosted.org/packages/17/6b/3747644d26a998774b21a616016620293ddefa4d63af6286f389aedac844/aiohttp-3.13.2-cp311-cp311-win32.whl", hash = "sha256:868e195e39b24aaa930b063c08bb0c17924899c16c672a28a65afded9c46c6ec", size = 431876, upload-time = "2025-10-28T20:56:27.524Z" }, + { url = "https://files.pythonhosted.org/packages/c3/63/688462108c1a00eb9f05765331c107f95ae86f6b197b865d29e930b7e462/aiohttp-3.13.2-cp311-cp311-win_amd64.whl", hash = "sha256:7fd19df530c292542636c2a9a85854fab93474396a52f1695e799186bbd7f24c", size = 456205, upload-time = "2025-10-28T20:56:29.062Z" }, + { url = "https://files.pythonhosted.org/packages/29/9b/01f00e9856d0a73260e86dd8ed0c2234a466c5c1712ce1c281548df39777/aiohttp-3.13.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b1e56bab2e12b2b9ed300218c351ee2a3d8c8fdab5b1ec6193e11a817767e47b", size = 737623, upload-time = "2025-10-28T20:56:30.797Z" }, + { url = "https://files.pythonhosted.org/packages/5a/1b/4be39c445e2b2bd0aab4ba736deb649fabf14f6757f405f0c9685019b9e9/aiohttp-3.13.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:364e25edaabd3d37b1db1f0cbcee8c73c9a3727bfa262b83e5e4cf3489a2a9dc", size = 492664, upload-time = "2025-10-28T20:56:32.708Z" }, + { url = "https://files.pythonhosted.org/packages/28/66/d35dcfea8050e131cdd731dff36434390479b4045a8d0b9d7111b0a968f1/aiohttp-3.13.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c5c94825f744694c4b8db20b71dba9a257cd2ba8e010a803042123f3a25d50d7", size = 491808, upload-time = "2025-10-28T20:56:34.57Z" }, + { url = "https://files.pythonhosted.org/packages/00/29/8e4609b93e10a853b65f8291e64985de66d4f5848c5637cddc70e98f01f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba2715d842ffa787be87cbfce150d5e88c87a98e0b62e0f5aa489169a393dbbb", size = 1738863, upload-time = "2025-10-28T20:56:36.377Z" }, + { url = "https://files.pythonhosted.org/packages/9d/fa/4ebdf4adcc0def75ced1a0d2d227577cd7b1b85beb7edad85fcc87693c75/aiohttp-3.13.2-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:585542825c4bc662221fb257889e011a5aa00f1ae4d75d1d246a5225289183e3", size = 1700586, upload-time = "2025-10-28T20:56:38.034Z" }, + { url = "https://files.pythonhosted.org/packages/da/04/73f5f02ff348a3558763ff6abe99c223381b0bace05cd4530a0258e52597/aiohttp-3.13.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:39d02cb6025fe1aabca329c5632f48c9532a3dabccd859e7e2f110668972331f", size = 1768625, upload-time = "2025-10-28T20:56:39.75Z" }, + { url = "https://files.pythonhosted.org/packages/f8/49/a825b79ffec124317265ca7d2344a86bcffeb960743487cb11988ffb3494/aiohttp-3.13.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e67446b19e014d37342f7195f592a2a948141d15a312fe0e700c2fd2f03124f6", size = 1867281, upload-time = "2025-10-28T20:56:41.471Z" }, + { url = "https://files.pythonhosted.org/packages/b9/48/adf56e05f81eac31edcfae45c90928f4ad50ef2e3ea72cb8376162a368f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4356474ad6333e41ccefd39eae869ba15a6c5299c9c01dfdcfdd5c107be4363e", size = 1752431, upload-time = "2025-10-28T20:56:43.162Z" }, + { url = "https://files.pythonhosted.org/packages/30/ab/593855356eead019a74e862f21523db09c27f12fd24af72dbc3555b9bfd9/aiohttp-3.13.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eeacf451c99b4525f700f078becff32c32ec327b10dcf31306a8a52d78166de7", size = 1562846, upload-time = "2025-10-28T20:56:44.85Z" }, + { url = "https://files.pythonhosted.org/packages/39/0f/9f3d32271aa8dc35036e9668e31870a9d3b9542dd6b3e2c8a30931cb27ae/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8a9b889aeabd7a4e9af0b7f4ab5ad94d42e7ff679aaec6d0db21e3b639ad58d", size = 1699606, upload-time = "2025-10-28T20:56:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3c/52d2658c5699b6ef7692a3f7128b2d2d4d9775f2a68093f74bca06cf01e1/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fa89cb11bc71a63b69568d5b8a25c3ca25b6d54c15f907ca1c130d72f320b76b", size = 1720663, upload-time = "2025-10-28T20:56:48.528Z" }, + { url = "https://files.pythonhosted.org/packages/9b/d4/8f8f3ff1fb7fb9e3f04fcad4e89d8a1cd8fc7d05de67e3de5b15b33008ff/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8aa7c807df234f693fed0ecd507192fc97692e61fee5702cdc11155d2e5cadc8", size = 1737939, upload-time = "2025-10-28T20:56:50.77Z" }, + { url = "https://files.pythonhosted.org/packages/03/d3/ddd348f8a27a634daae39a1b8e291ff19c77867af438af844bf8b7e3231b/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9eb3e33fdbe43f88c3c75fa608c25e7c47bbd80f48d012763cb67c47f39a7e16", size = 1555132, upload-time = "2025-10-28T20:56:52.568Z" }, + { url = "https://files.pythonhosted.org/packages/39/b8/46790692dc46218406f94374903ba47552f2f9f90dad554eed61bfb7b64c/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9434bc0d80076138ea986833156c5a48c9c7a8abb0c96039ddbb4afc93184169", size = 1764802, upload-time = "2025-10-28T20:56:54.292Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e4/19ce547b58ab2a385e5f0b8aa3db38674785085abcf79b6e0edd1632b12f/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ff15c147b2ad66da1f2cbb0622313f2242d8e6e8f9b79b5206c84523a4473248", size = 1719512, upload-time = "2025-10-28T20:56:56.428Z" }, + { url = "https://files.pythonhosted.org/packages/70/30/6355a737fed29dcb6dfdd48682d5790cb5eab050f7b4e01f49b121d3acad/aiohttp-3.13.2-cp312-cp312-win32.whl", hash = "sha256:27e569eb9d9e95dbd55c0fc3ec3a9335defbf1d8bc1d20171a49f3c4c607b93e", size = 426690, upload-time = "2025-10-28T20:56:58.736Z" }, + { url = "https://files.pythonhosted.org/packages/0a/0d/b10ac09069973d112de6ef980c1f6bb31cb7dcd0bc363acbdad58f927873/aiohttp-3.13.2-cp312-cp312-win_amd64.whl", hash = "sha256:8709a0f05d59a71f33fd05c17fc11fcb8c30140506e13c2f5e8ee1b8964e1b45", size = 453465, upload-time = "2025-10-28T20:57:00.795Z" }, + { url = "https://files.pythonhosted.org/packages/bf/78/7e90ca79e5aa39f9694dcfd74f4720782d3c6828113bb1f3197f7e7c4a56/aiohttp-3.13.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7519bdc7dfc1940d201651b52bf5e03f5503bda45ad6eacf64dda98be5b2b6be", size = 732139, upload-time = "2025-10-28T20:57:02.455Z" }, + { url = "https://files.pythonhosted.org/packages/db/ed/1f59215ab6853fbaa5c8495fa6cbc39edfc93553426152b75d82a5f32b76/aiohttp-3.13.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:088912a78b4d4f547a1f19c099d5a506df17eacec3c6f4375e2831ec1d995742", size = 490082, upload-time = "2025-10-28T20:57:04.784Z" }, + { url = "https://files.pythonhosted.org/packages/68/7b/fe0fe0f5e05e13629d893c760465173a15ad0039c0a5b0d0040995c8075e/aiohttp-3.13.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5276807b9de9092af38ed23ce120539ab0ac955547b38563a9ba4f5b07b95293", size = 489035, upload-time = "2025-10-28T20:57:06.894Z" }, + { url = "https://files.pythonhosted.org/packages/d2/04/db5279e38471b7ac801d7d36a57d1230feeee130bbe2a74f72731b23c2b1/aiohttp-3.13.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1237c1375eaef0db4dcd7c2559f42e8af7b87ea7d295b118c60c36a6e61cb811", size = 1720387, upload-time = "2025-10-28T20:57:08.685Z" }, + { url = "https://files.pythonhosted.org/packages/31/07/8ea4326bd7dae2bd59828f69d7fdc6e04523caa55e4a70f4a8725a7e4ed2/aiohttp-3.13.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:96581619c57419c3d7d78703d5b78c1e5e5fc0172d60f555bdebaced82ded19a", size = 1688314, upload-time = "2025-10-28T20:57:10.693Z" }, + { url = "https://files.pythonhosted.org/packages/48/ab/3d98007b5b87ffd519d065225438cc3b668b2f245572a8cb53da5dd2b1bc/aiohttp-3.13.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2713a95b47374169409d18103366de1050fe0ea73db358fc7a7acb2880422d4", size = 1756317, upload-time = "2025-10-28T20:57:12.563Z" }, + { url = "https://files.pythonhosted.org/packages/97/3d/801ca172b3d857fafb7b50c7c03f91b72b867a13abca982ed6b3081774ef/aiohttp-3.13.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:228a1cd556b3caca590e9511a89444925da87d35219a49ab5da0c36d2d943a6a", size = 1858539, upload-time = "2025-10-28T20:57:14.623Z" }, + { url = "https://files.pythonhosted.org/packages/f7/0d/4764669bdf47bd472899b3d3db91fffbe925c8e3038ec591a2fd2ad6a14d/aiohttp-3.13.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac6cde5fba8d7d8c6ac963dbb0256a9854e9fafff52fbcc58fdf819357892c3e", size = 1739597, upload-time = "2025-10-28T20:57:16.399Z" }, + { url = "https://files.pythonhosted.org/packages/c4/52/7bd3c6693da58ba16e657eb904a5b6decfc48ecd06e9ac098591653b1566/aiohttp-3.13.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2bef8237544f4e42878c61cef4e2839fee6346dc60f5739f876a9c50be7fcdb", size = 1555006, upload-time = "2025-10-28T20:57:18.288Z" }, + { url = "https://files.pythonhosted.org/packages/48/30/9586667acec5993b6f41d2ebcf96e97a1255a85f62f3c653110a5de4d346/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:16f15a4eac3bc2d76c45f7ebdd48a65d41b242eb6c31c2245463b40b34584ded", size = 1683220, upload-time = "2025-10-28T20:57:20.241Z" }, + { url = "https://files.pythonhosted.org/packages/71/01/3afe4c96854cfd7b30d78333852e8e851dceaec1c40fd00fec90c6402dd2/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:bb7fb776645af5cc58ab804c58d7eba545a97e047254a52ce89c157b5af6cd0b", size = 1712570, upload-time = "2025-10-28T20:57:22.253Z" }, + { url = "https://files.pythonhosted.org/packages/11/2c/22799d8e720f4697a9e66fd9c02479e40a49de3de2f0bbe7f9f78a987808/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e1b4951125ec10c70802f2cb09736c895861cd39fd9dcb35107b4dc8ae6220b8", size = 1733407, upload-time = "2025-10-28T20:57:24.37Z" }, + { url = "https://files.pythonhosted.org/packages/34/cb/90f15dd029f07cebbd91f8238a8b363978b530cd128488085b5703683594/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:550bf765101ae721ee1d37d8095f47b1f220650f85fe1af37a90ce75bab89d04", size = 1550093, upload-time = "2025-10-28T20:57:26.257Z" }, + { url = "https://files.pythonhosted.org/packages/69/46/12dce9be9d3303ecbf4d30ad45a7683dc63d90733c2d9fe512be6716cd40/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fe91b87fc295973096251e2d25a811388e7d8adf3bd2b97ef6ae78bc4ac6c476", size = 1758084, upload-time = "2025-10-28T20:57:28.349Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c8/0932b558da0c302ffd639fc6362a313b98fdf235dc417bc2493da8394df7/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e0c8e31cfcc4592cb200160344b2fb6ae0f9e4effe06c644b5a125d4ae5ebe23", size = 1716987, upload-time = "2025-10-28T20:57:30.233Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8b/f5bd1a75003daed099baec373aed678f2e9b34f2ad40d85baa1368556396/aiohttp-3.13.2-cp313-cp313-win32.whl", hash = "sha256:0740f31a60848d6edb296a0df827473eede90c689b8f9f2a4cdde74889eb2254", size = 425859, upload-time = "2025-10-28T20:57:32.105Z" }, + { url = "https://files.pythonhosted.org/packages/5d/28/a8a9fc6957b2cee8902414e41816b5ab5536ecf43c3b1843c10e82c559b2/aiohttp-3.13.2-cp313-cp313-win_amd64.whl", hash = "sha256:a88d13e7ca367394908f8a276b89d04a3652044612b9a408a0bb22a5ed976a1a", size = 452192, upload-time = "2025-10-28T20:57:34.166Z" }, + { url = "https://files.pythonhosted.org/packages/9b/36/e2abae1bd815f01c957cbf7be817b3043304e1c87bad526292a0410fdcf9/aiohttp-3.13.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2475391c29230e063ef53a66669b7b691c9bfc3f1426a0f7bcdf1216bdbac38b", size = 735234, upload-time = "2025-10-28T20:57:36.415Z" }, + { url = "https://files.pythonhosted.org/packages/ca/e3/1ee62dde9b335e4ed41db6bba02613295a0d5b41f74a783c142745a12763/aiohttp-3.13.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:f33c8748abef4d8717bb20e8fb1b3e07c6adacb7fd6beaae971a764cf5f30d61", size = 490733, upload-time = "2025-10-28T20:57:38.205Z" }, + { url = "https://files.pythonhosted.org/packages/1a/aa/7a451b1d6a04e8d15a362af3e9b897de71d86feac3babf8894545d08d537/aiohttp-3.13.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ae32f24bbfb7dbb485a24b30b1149e2f200be94777232aeadba3eecece4d0aa4", size = 491303, upload-time = "2025-10-28T20:57:40.122Z" }, + { url = "https://files.pythonhosted.org/packages/57/1e/209958dbb9b01174870f6a7538cd1f3f28274fdbc88a750c238e2c456295/aiohttp-3.13.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d7f02042c1f009ffb70067326ef183a047425bb2ff3bc434ead4dd4a4a66a2b", size = 1717965, upload-time = "2025-10-28T20:57:42.28Z" }, + { url = "https://files.pythonhosted.org/packages/08/aa/6a01848d6432f241416bc4866cae8dc03f05a5a884d2311280f6a09c73d6/aiohttp-3.13.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:93655083005d71cd6c072cdab54c886e6570ad2c4592139c3fb967bfc19e4694", size = 1667221, upload-time = "2025-10-28T20:57:44.869Z" }, + { url = "https://files.pythonhosted.org/packages/87/4f/36c1992432d31bbc789fa0b93c768d2e9047ec8c7177e5cd84ea85155f36/aiohttp-3.13.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0db1e24b852f5f664cd728db140cf11ea0e82450471232a394b3d1a540b0f906", size = 1757178, upload-time = "2025-10-28T20:57:47.216Z" }, + { url = "https://files.pythonhosted.org/packages/ac/b4/8e940dfb03b7e0f68a82b88fd182b9be0a65cb3f35612fe38c038c3112cf/aiohttp-3.13.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b009194665bcd128e23eaddef362e745601afa4641930848af4c8559e88f18f9", size = 1838001, upload-time = "2025-10-28T20:57:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ef/39f3448795499c440ab66084a9db7d20ca7662e94305f175a80f5b7e0072/aiohttp-3.13.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c038a8fdc8103cd51dbd986ecdce141473ffd9775a7a8057a6ed9c3653478011", size = 1716325, upload-time = "2025-10-28T20:57:51.327Z" }, + { url = "https://files.pythonhosted.org/packages/d7/51/b311500ffc860b181c05d91c59a1313bdd05c82960fdd4035a15740d431e/aiohttp-3.13.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:66bac29b95a00db411cd758fea0e4b9bdba6d549dfe333f9a945430f5f2cc5a6", size = 1547978, upload-time = "2025-10-28T20:57:53.554Z" }, + { url = "https://files.pythonhosted.org/packages/31/64/b9d733296ef79815226dab8c586ff9e3df41c6aff2e16c06697b2d2e6775/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4ebf9cfc9ba24a74cf0718f04aac2a3bbe745902cc7c5ebc55c0f3b5777ef213", size = 1682042, upload-time = "2025-10-28T20:57:55.617Z" }, + { url = "https://files.pythonhosted.org/packages/3f/30/43d3e0f9d6473a6db7d472104c4eff4417b1e9df01774cb930338806d36b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a4b88ebe35ce54205c7074f7302bd08a4cb83256a3e0870c72d6f68a3aaf8e49", size = 1680085, upload-time = "2025-10-28T20:57:57.59Z" }, + { url = "https://files.pythonhosted.org/packages/16/51/c709f352c911b1864cfd1087577760ced64b3e5bee2aa88b8c0c8e2e4972/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:98c4fb90bb82b70a4ed79ca35f656f4281885be076f3f970ce315402b53099ae", size = 1728238, upload-time = "2025-10-28T20:57:59.525Z" }, + { url = "https://files.pythonhosted.org/packages/19/e2/19bd4c547092b773caeb48ff5ae4b1ae86756a0ee76c16727fcfd281404b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:ec7534e63ae0f3759df3a1ed4fa6bc8f75082a924b590619c0dd2f76d7043caa", size = 1544395, upload-time = "2025-10-28T20:58:01.914Z" }, + { url = "https://files.pythonhosted.org/packages/cf/87/860f2803b27dfc5ed7be532832a3498e4919da61299b4a1f8eb89b8ff44d/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5b927cf9b935a13e33644cbed6c8c4b2d0f25b713d838743f8fe7191b33829c4", size = 1742965, upload-time = "2025-10-28T20:58:03.972Z" }, + { url = "https://files.pythonhosted.org/packages/67/7f/db2fc7618925e8c7a601094d5cbe539f732df4fb570740be88ed9e40e99a/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:88d6c017966a78c5265d996c19cdb79235be5e6412268d7e2ce7dee339471b7a", size = 1697585, upload-time = "2025-10-28T20:58:06.189Z" }, + { url = "https://files.pythonhosted.org/packages/0c/07/9127916cb09bb38284db5036036042b7b2c514c8ebaeee79da550c43a6d6/aiohttp-3.13.2-cp314-cp314-win32.whl", hash = "sha256:f7c183e786e299b5d6c49fb43a769f8eb8e04a2726a2bd5887b98b5cc2d67940", size = 431621, upload-time = "2025-10-28T20:58:08.636Z" }, + { url = "https://files.pythonhosted.org/packages/fb/41/554a8a380df6d3a2bba8a7726429a23f4ac62aaf38de43bb6d6cde7b4d4d/aiohttp-3.13.2-cp314-cp314-win_amd64.whl", hash = "sha256:fe242cd381e0fb65758faf5ad96c2e460df6ee5b2de1072fe97e4127927e00b4", size = 457627, upload-time = "2025-10-28T20:58:11Z" }, + { url = "https://files.pythonhosted.org/packages/c7/8e/3824ef98c039d3951cb65b9205a96dd2b20f22241ee17d89c5701557c826/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:f10d9c0b0188fe85398c61147bbd2a657d616c876863bfeff43376e0e3134673", size = 767360, upload-time = "2025-10-28T20:58:13.358Z" }, + { url = "https://files.pythonhosted.org/packages/a4/0f/6a03e3fc7595421274fa34122c973bde2d89344f8a881b728fa8c774e4f1/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e7c952aefdf2460f4ae55c5e9c3e80aa72f706a6317e06020f80e96253b1accd", size = 504616, upload-time = "2025-10-28T20:58:15.339Z" }, + { url = "https://files.pythonhosted.org/packages/c6/aa/ed341b670f1bc8a6f2c6a718353d13b9546e2cef3544f573c6a1ff0da711/aiohttp-3.13.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c20423ce14771d98353d2e25e83591fa75dfa90a3c1848f3d7c68243b4fbded3", size = 509131, upload-time = "2025-10-28T20:58:17.693Z" }, + { url = "https://files.pythonhosted.org/packages/7f/f0/c68dac234189dae5c4bbccc0f96ce0cc16b76632cfc3a08fff180045cfa4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e96eb1a34396e9430c19d8338d2ec33015e4a87ef2b4449db94c22412e25ccdf", size = 1864168, upload-time = "2025-10-28T20:58:20.113Z" }, + { url = "https://files.pythonhosted.org/packages/8f/65/75a9a76db8364b5d0e52a0c20eabc5d52297385d9af9c35335b924fafdee/aiohttp-3.13.2-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:23fb0783bc1a33640036465019d3bba069942616a6a2353c6907d7fe1ccdaf4e", size = 1719200, upload-time = "2025-10-28T20:58:22.583Z" }, + { url = "https://files.pythonhosted.org/packages/f5/55/8df2ed78d7f41d232f6bd3ff866b6f617026551aa1d07e2f03458f964575/aiohttp-3.13.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1a9bea6244a1d05a4e57c295d69e159a5c50d8ef16aa390948ee873478d9a5", size = 1843497, upload-time = "2025-10-28T20:58:24.672Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e0/94d7215e405c5a02ccb6a35c7a3a6cfff242f457a00196496935f700cde5/aiohttp-3.13.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0a3d54e822688b56e9f6b5816fb3de3a3a64660efac64e4c2dc435230ad23bad", size = 1935703, upload-time = "2025-10-28T20:58:26.758Z" }, + { url = "https://files.pythonhosted.org/packages/0b/78/1eeb63c3f9b2d1015a4c02788fb543141aad0a03ae3f7a7b669b2483f8d4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a653d872afe9f33497215745da7a943d1dc15b728a9c8da1c3ac423af35178e", size = 1792738, upload-time = "2025-10-28T20:58:29.787Z" }, + { url = "https://files.pythonhosted.org/packages/41/75/aaf1eea4c188e51538c04cc568040e3082db263a57086ea74a7d38c39e42/aiohttp-3.13.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:56d36e80d2003fa3fc0207fac644216d8532e9504a785ef9a8fd013f84a42c61", size = 1624061, upload-time = "2025-10-28T20:58:32.529Z" }, + { url = "https://files.pythonhosted.org/packages/9b/c2/3b6034de81fbcc43de8aeb209073a2286dfb50b86e927b4efd81cf848197/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:78cd586d8331fb8e241c2dd6b2f4061778cc69e150514b39a9e28dd050475661", size = 1789201, upload-time = "2025-10-28T20:58:34.618Z" }, + { url = "https://files.pythonhosted.org/packages/c9/38/c15dcf6d4d890217dae79d7213988f4e5fe6183d43893a9cf2fe9e84ca8d/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:20b10bbfbff766294fe99987f7bb3b74fdd2f1a2905f2562132641ad434dcf98", size = 1776868, upload-time = "2025-10-28T20:58:38.835Z" }, + { url = "https://files.pythonhosted.org/packages/04/75/f74fd178ac81adf4f283a74847807ade5150e48feda6aef024403716c30c/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9ec49dff7e2b3c85cdeaa412e9d438f0ecd71676fde61ec57027dd392f00c693", size = 1790660, upload-time = "2025-10-28T20:58:41.507Z" }, + { url = "https://files.pythonhosted.org/packages/e7/80/7368bd0d06b16b3aba358c16b919e9c46cf11587dc572091031b0e9e3ef0/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:94f05348c4406450f9d73d38efb41d669ad6cd90c7ee194810d0eefbfa875a7a", size = 1617548, upload-time = "2025-10-28T20:58:43.674Z" }, + { url = "https://files.pythonhosted.org/packages/7d/4b/a6212790c50483cb3212e507378fbe26b5086d73941e1ec4b56a30439688/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:fa4dcb605c6f82a80c7f95713c2b11c3b8e9893b3ebd2bc9bde93165ed6107be", size = 1817240, upload-time = "2025-10-28T20:58:45.787Z" }, + { url = "https://files.pythonhosted.org/packages/ff/f7/ba5f0ba4ea8d8f3c32850912944532b933acbf0f3a75546b89269b9b7dde/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cf00e5db968c3f67eccd2778574cf64d8b27d95b237770aa32400bd7a1ca4f6c", size = 1762334, upload-time = "2025-10-28T20:58:47.936Z" }, + { url = "https://files.pythonhosted.org/packages/7e/83/1a5a1856574588b1cad63609ea9ad75b32a8353ac995d830bf5da9357364/aiohttp-3.13.2-cp314-cp314t-win32.whl", hash = "sha256:d23b5fe492b0805a50d3371e8a728a9134d8de5447dce4c885f5587294750734", size = 464685, upload-time = "2025-10-28T20:58:50.642Z" }, + { url = "https://files.pythonhosted.org/packages/9f/4d/d22668674122c08f4d56972297c51a624e64b3ed1efaa40187607a7cb66e/aiohttp-3.13.2-cp314-cp314t-win_amd64.whl", hash = "sha256:ff0a7b0a82a7ab905cbda74006318d1b12e37c797eb1b0d4eb3e316cf47f658f", size = 498093, upload-time = "2025-10-28T20:58:52.782Z" }, ] [[package]] @@ -261,6 +261,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/59/75/e0e10dc7ed1408c28e03a6cb2d7a407f99320eb953f229d008a7a6d05546/aniso8601-10.0.1-py2.py3-none-any.whl", hash = "sha256:eb19717fd4e0db6de1aab06f12450ab92144246b257423fe020af5748c0cb89e", size = 52848, upload-time = "2025-04-18T17:29:41.492Z" }, ] +[[package]] +name = "annotated-doc" +version = "0.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/a6/dc46877b911e40c00d395771ea710d5e77b6de7bacd5fdcd78d70cc5a48f/annotated_doc-0.0.3.tar.gz", hash = "sha256:e18370014c70187422c33e945053ff4c286f453a984eba84d0dbfa0c935adeda", size = 5535, upload-time = "2025-10-24T14:57:10.718Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/b7/cf592cb5de5cb3bade3357f8d2cf42bf103bbe39f459824b4939fd212911/annotated_doc-0.0.3-py3-none-any.whl", hash = "sha256:348ec6664a76f1fd3be81f43dffbee4c7e8ce931ba71ec67cc7f4ade7fbbb580", size = 5488, upload-time = "2025-10-24T14:57:09.462Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -604,16 +613,16 @@ wheels = [ [[package]] name = "botocore" -version = "1.40.49" +version = "1.40.61" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/01/6a/eb7503536552bbd3388b2607bc7a64e59d4f988336406b51a69d29f17ed2/botocore-1.40.49.tar.gz", hash = "sha256:fe8d4cbcc22de84c20190ae728c46b931bafeb40fce247010fb071c31b6532b5", size = 14415240, upload-time = "2025-10-09T19:21:37.133Z" } +sdist = { url = "https://files.pythonhosted.org/packages/28/a3/81d3a47c2dbfd76f185d3b894f2ad01a75096c006a2dd91f237dca182188/botocore-1.40.61.tar.gz", hash = "sha256:a2487ad69b090f9cccd64cf07c7021cd80ee9c0655ad974f87045b02f3ef52cd", size = 14393956, upload-time = "2025-10-28T19:26:46.108Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/7b/dce396a3f7078e0432d40a9778602cbf0785ca91e7bcb64e05f19dfb5662/botocore-1.40.49-py3-none-any.whl", hash = "sha256:bf1089d0e77e4fc2e195d81c519b194ab62a4d4dd3e7113ee4e2bf903b0b75ab", size = 14085172, upload-time = "2025-10-09T19:21:32.721Z" }, + { url = "https://files.pythonhosted.org/packages/38/c5/f6ce561004db45f0b847c2cd9b19c67c6bf348a82018a48cb718be6b58b0/botocore-1.40.61-py3-none-any.whl", hash = "sha256:17ebae412692fd4824f99cde0f08d50126dc97954008e5ba2b522eb049238aa7", size = 14055973, upload-time = "2025-10-28T19:26:42.15Z" }, ] [[package]] @@ -1152,7 +1161,7 @@ wheels = [ [[package]] name = "datasets" -version = "4.2.0" +version = "4.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dill" }, @@ -1170,9 +1179,9 @@ dependencies = [ { name = "tqdm" }, { name = "xxhash" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/70/48/0186fbc4b86a4f9ecaf04eb01e877e78b53bfa0b03be9c84b2298431ba33/datasets-4.2.0.tar.gz", hash = "sha256:8333a7db9f3bb8044c1b819a35d4e3e2809596c837793b0921382efffdc36e78", size = 582256, upload-time = "2025-10-09T16:10:15.534Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/47/325206ac160f7699ed9f1798afa8f8f8d5189b03bf3815654859ac1d5cba/datasets-4.3.0.tar.gz", hash = "sha256:bc9118ed9afd92346c5be7ed3aaa00177eb907c25467f9d072a0d22777efbd2b", size = 582801, upload-time = "2025-10-23T16:31:51.547Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/91/9e/0bbbd09b116fd8ee2d3617e28e6598551d2f0f24d3a2ce99cc87ec85aeb0/datasets-4.2.0-py3-none-any.whl", hash = "sha256:fdc43aaf4a73b31f64f80f72f195ab413a1141ed15555d675b2fd17926f8b026", size = 506316, upload-time = "2025-10-09T16:10:13.375Z" }, + { url = "https://files.pythonhosted.org/packages/ca/51/409a8184ed35453d9cbb3d6b20d524b1115c2c2d117b85d5e9b06cd70b45/datasets-4.3.0-py3-none-any.whl", hash = "sha256:0ea157e72138b3ca6c7d2415f19a164ecf7d4c4fa72da2a570da286882e96903", size = 506846, upload-time = "2025-10-23T16:31:49.965Z" }, ] [[package]] @@ -1315,16 +1324,17 @@ wheels = [ [[package]] name = "fastapi" -version = "0.119.1" +version = "0.120.1" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "annotated-doc" }, { name = "pydantic" }, { name = "starlette" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a6/f4/152127681182e6413e7a89684c434e19e7414ed7ac0c632999c3c6980640/fastapi-0.119.1.tar.gz", hash = "sha256:a5e3426edce3fe221af4e1992c6d79011b247e3b03cc57999d697fe76cbf8ae0", size = 338616, upload-time = "2025-10-20T11:30:27.734Z" } +sdist = { url = "https://files.pythonhosted.org/packages/40/cc/28aff6e246ee85bd571b26e4a793b84d42700e3bdc3008c3d747eda7b06d/fastapi-0.120.1.tar.gz", hash = "sha256:b5c6217e9ddca6dfcf54c97986180d4a1955e10c693d74943fc5327700178bff", size = 337616, upload-time = "2025-10-27T17:53:42.954Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/26/e6d959b4ac959fdb3e9c4154656fc160794db6af8e64673d52759456bf07/fastapi-0.119.1-py3-none-any.whl", hash = "sha256:0b8c2a2cce853216e150e9bd4faaed88227f8eb37de21cb200771f491586a27f", size = 108123, upload-time = "2025-10-20T11:30:26.185Z" }, + { url = "https://files.pythonhosted.org/packages/7e/bb/1a74dbe87e9a595bf63052c886dfef965dc5b91d149456a8301eb3d41ce2/fastapi-0.120.1-py3-none-any.whl", hash = "sha256:0e8a2c328e96c117272d8c794d3a97d205f753cc2e69dd7ee387b7488a75601f", size = 108254, upload-time = "2025-10-27T17:53:40.076Z" }, ] [[package]] @@ -1736,17 +1746,31 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.1.10" +version = "1.2.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/74/31/feeddfce1748c4a233ec1aa5b7396161c07ae1aa9b7bdbc9a72c3c7dd768/hf_xet-1.1.10.tar.gz", hash = "sha256:408aef343800a2102374a883f283ff29068055c111f003ff840733d3b715bb97", size = 487910, upload-time = "2025-09-12T20:10:27.12Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/a2/343e6d05de96908366bdc0081f2d8607d61200be2ac802769c4284cc65bd/hf_xet-1.1.10-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:686083aca1a6669bc85c21c0563551cbcdaa5cf7876a91f3d074a030b577231d", size = 2761466, upload-time = "2025-09-12T20:10:22.836Z" }, - { url = "https://files.pythonhosted.org/packages/31/f9/6215f948ac8f17566ee27af6430ea72045e0418ce757260248b483f4183b/hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71081925383b66b24eedff3013f8e6bbd41215c3338be4b94ba75fd75b21513b", size = 2623807, upload-time = "2025-09-12T20:10:21.118Z" }, - { url = "https://files.pythonhosted.org/packages/15/07/86397573efefff941e100367bbda0b21496ffcdb34db7ab51912994c32a2/hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6bceb6361c80c1cc42b5a7b4e3efd90e64630bcf11224dcac50ef30a47e435", size = 3186960, upload-time = "2025-09-12T20:10:19.336Z" }, - { url = "https://files.pythonhosted.org/packages/01/a7/0b2e242b918cc30e1f91980f3c4b026ff2eedaf1e2ad96933bca164b2869/hf_xet-1.1.10-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eae7c1fc8a664e54753ffc235e11427ca61f4b0477d757cc4eb9ae374b69f09c", size = 3087167, upload-time = "2025-09-12T20:10:17.255Z" }, - { url = "https://files.pythonhosted.org/packages/4a/25/3e32ab61cc7145b11eee9d745988e2f0f4fafda81b25980eebf97d8cff15/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0a0005fd08f002180f7a12d4e13b22be277725bc23ed0529f8add5c7a6309c06", size = 3248612, upload-time = "2025-09-12T20:10:24.093Z" }, - { url = "https://files.pythonhosted.org/packages/2c/3d/ab7109e607ed321afaa690f557a9ada6d6d164ec852fd6bf9979665dc3d6/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f900481cf6e362a6c549c61ff77468bd59d6dd082f3170a36acfef2eb6a6793f", size = 3353360, upload-time = "2025-09-12T20:10:25.563Z" }, - { url = "https://files.pythonhosted.org/packages/ee/0e/471f0a21db36e71a2f1752767ad77e92d8cde24e974e03d662931b1305ec/hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = "sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045", size = 2804691, upload-time = "2025-09-12T20:10:28.433Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" }, + { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload-time = "2025-10-24T19:04:09.586Z" }, + { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload-time = "2025-10-24T19:04:00.314Z" }, + { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload-time = "2025-10-24T19:03:58.111Z" }, + { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" }, + { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" }, + { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" }, + { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" }, + { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" }, + { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" }, + { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" }, + { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" }, + { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" }, + { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" }, + { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" }, + { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" }, + { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" }, + { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" }, + { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, ] [[package]] @@ -2012,11 +2036,11 @@ wheels = [ [[package]] name = "lark" -version = "1.3.0" +version = "1.3.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1d/37/a13baf0135f348af608c667633cbe5d13aa2c5c15a56ae9ad3e6cba45ae3/lark-1.3.0.tar.gz", hash = "sha256:9a3839d0ca5e1faf7cfa3460e420e859b66bcbde05b634e73c369c8244c5fa48", size = 259551, upload-time = "2025-09-22T13:45:05.072Z" } +sdist = { url = "https://files.pythonhosted.org/packages/da/34/28fff3ab31ccff1fd4f6c7c7b0ceb2b6968d8ea4950663eadcb5720591a0/lark-1.3.1.tar.gz", hash = "sha256:b426a7a6d6d53189d318f2b6236ab5d6429eaf09259f1ca33eb716eed10d2905", size = 382732, upload-time = "2025-10-27T18:25:56.653Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/3e/1c6b43277de64fc3c0333b0e72ab7b52ddaaea205210d60d9b9f83c3d0c7/lark-1.3.0-py3-none-any.whl", hash = "sha256:80661f261fb2584a9828a097a2432efd575af27d20be0fd35d17f0fe37253831", size = 113002, upload-time = "2025-09-22T13:45:03.747Z" }, + { url = "https://files.pythonhosted.org/packages/82/3d/14ce75ef66813643812f3093ab17e46d3a206942ce7376d31ec2d36229e7/lark-1.3.1-py3-none-any.whl", hash = "sha256:c629b661023a014c37da873b4ff58a817398d12635d3bbb2c5a03be7fe5d1e12", size = 113151, upload-time = "2025-10-27T18:25:54.882Z" }, ] [[package]] @@ -2426,7 +2450,7 @@ requires-dist = [ { name = "torch" }, { name = "tqdm", marker = "extra == 'dev'" }, { name = "tqdm", marker = "extra == 'lts'" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.8" }, + { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, { name = "transformers", marker = "extra == 'lts'" }, { name = "transformers", marker = "extra == 'mlm'" }, { name = "wandb", marker = "extra == 'mlm'" }, @@ -2469,7 +2493,7 @@ linting = [ ] test = [ { name = "coverage" }, - { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" }, + { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" }, { name = "nltk" }, { name = "pydantic" }, { name = "pygithub" }, @@ -2887,7 +2911,7 @@ wheels = [ [[package]] name = "nemo-run" version = "0.7.0rc0.dev0" -source = { git = "https://github.com/NVIDIA-NeMo/Run.git?rev=8ca8f7952a597f944985f1f1368a7acb9aa3a6c2#8ca8f7952a597f944985f1f1368a7acb9aa3a6c2" } +source = { git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d#01a9a8ba360f7b2908728ad0516e0ad9d936966d" } dependencies = [ { name = "catalogue" }, { name = "cryptography" }, @@ -3296,7 +3320,7 @@ dependencies = [ { name = "rich" }, { name = "safetensors" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "scipy", version = "1.16.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "torch", marker = "sys_platform == 'never'" }, { name = "torchprofile" }, { name = "torchvision", marker = "sys_platform == 'never'" }, @@ -3551,7 +3575,7 @@ wheels = [ [[package]] name = "onnx-ir" -version = "0.1.11" +version = "0.1.12" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version == '3.12.*' and sys_platform == 'linux'", @@ -3567,9 +3591,9 @@ dependencies = [ { name = "onnx", version = "1.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4b/c4/d7c52d89120ae2d90025bf30999f44ec029bb297be706ada81a2b7ce3e73/onnx_ir-0.1.11.tar.gz", hash = "sha256:05fd55f7548f4301a17476c53e19c16f92f4fc4c0f468fcd8d3afb6869f8ae75", size = 112093, upload-time = "2025-10-15T22:20:46.785Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/1a/2a94112a39d01a9d1490f5ef3c205d8a17fe1ca27f307b026c40d62d8e9f/onnx_ir-0.1.12.tar.gz", hash = "sha256:742e0bff875d0547724187560b3f441833191c8aa939c05f14176f4892784deb", size = 112699, upload-time = "2025-10-28T23:43:54.129Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/de/a9bb49f36e2d27ff2b1941972ce01838c9032155256e3380960c6f545455/onnx_ir-0.1.11-py3-none-any.whl", hash = "sha256:f23edd0d3f49b92abfab275625cb325da3978f5b41ba8cdaa28e85e87b44d2c1", size = 128694, upload-time = "2025-10-15T22:20:45.208Z" }, + { url = "https://files.pythonhosted.org/packages/c8/36/c4df116f5dcaa82ec7944e5d25624a3811f6603fd190660b0b079ea759fb/onnx_ir-0.1.12-py3-none-any.whl", hash = "sha256:17f86faf8a53b979430bde1bc6022c7a162b0d1534550ddb17a1d37eb993e765", size = 129277, upload-time = "2025-10-28T23:43:52.493Z" }, ] [[package]] @@ -3613,7 +3637,7 @@ dependencies = [ { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, { name = "numpy", marker = "python_full_version < '3.13'" }, { name = "onnx", version = "1.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "onnx-ir", version = "0.1.11", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, + { name = "onnx-ir", version = "0.1.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, { name = "packaging", marker = "python_full_version < '3.13'" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] @@ -4031,18 +4055,28 @@ wheels = [ [[package]] name = "psutil" -version = "7.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/89/fc/889242351a932d6183eec5df1fc6539b6f36b6a88444f1e63f18668253aa/psutil-7.1.1.tar.gz", hash = "sha256:092b6350145007389c1cfe5716050f02030a05219d90057ea867d18fe8d372fc", size = 487067, upload-time = "2025-10-19T15:43:59.373Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/30/f97f8fb1f9ecfbeae4b5ca738dcae66ab28323b5cfbc96cb5565f3754056/psutil-7.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:8fa59d7b1f01f0337f12cd10dbd76e4312a4d3c730a4fedcbdd4e5447a8b8460", size = 244221, upload-time = "2025-10-19T15:44:03.145Z" }, - { url = "https://files.pythonhosted.org/packages/7b/98/b8d1f61ebf35f4dbdbaabadf9208282d8adc820562f0257e5e6e79e67bf2/psutil-7.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:2a95104eae85d088891716db676f780c1404fc15d47fde48a46a5d61e8f5ad2c", size = 245660, upload-time = "2025-10-19T15:44:05.657Z" }, - { url = "https://files.pythonhosted.org/packages/f0/4a/b8015d7357fefdfe34bc4a3db48a107bae4bad0b94fb6eb0613f09a08ada/psutil-7.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98629cd8567acefcc45afe2f4ba1e9290f579eacf490a917967decce4b74ee9b", size = 286963, upload-time = "2025-10-19T15:44:08.877Z" }, - { url = "https://files.pythonhosted.org/packages/3d/3c/b56076bb35303d0733fc47b110a1c9cce081a05ae2e886575a3587c1ee76/psutil-7.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92ebc58030fb054fa0f26c3206ef01c31c29d67aee1367e3483c16665c25c8d2", size = 290118, upload-time = "2025-10-19T15:44:11.897Z" }, - { url = "https://files.pythonhosted.org/packages/dc/af/c13d360c0adc6f6218bf9e2873480393d0f729c8dd0507d171f53061c0d3/psutil-7.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146a704f224fb2ded2be3da5ac67fc32b9ea90c45b51676f9114a6ac45616967", size = 292587, upload-time = "2025-10-19T15:44:14.67Z" }, - { url = "https://files.pythonhosted.org/packages/90/2d/c933e7071ba60c7862813f2c7108ec4cf8304f1c79660efeefd0de982258/psutil-7.1.1-cp37-abi3-win32.whl", hash = "sha256:295c4025b5cd880f7445e4379e6826f7307e3d488947bf9834e865e7847dc5f7", size = 243772, upload-time = "2025-10-19T15:44:16.938Z" }, - { url = "https://files.pythonhosted.org/packages/be/f3/11fd213fff15427bc2853552138760c720fd65032d99edfb161910d04127/psutil-7.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:9b4f17c5f65e44f69bd3a3406071a47b79df45cf2236d1f717970afcb526bcd3", size = 246936, upload-time = "2025-10-19T15:44:18.663Z" }, - { url = "https://files.pythonhosted.org/packages/0a/8d/8a9a45c8b655851f216c1d44f68e3533dc8d2c752ccd0f61f1aa73be4893/psutil-7.1.1-cp37-abi3-win_arm64.whl", hash = "sha256:5457cf741ca13da54624126cd5d333871b454ab133999a9a103fb097a7d7d21a", size = 243944, upload-time = "2025-10-19T15:44:20.666Z" }, +version = "7.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/ec/7b8e6b9b1d22708138630ef34c53ab2b61032c04f16adfdbb96791c8c70c/psutil-7.1.2.tar.gz", hash = "sha256:aa225cdde1335ff9684708ee8c72650f6598d5ed2114b9a7c5802030b1785018", size = 487424, upload-time = "2025-10-25T10:46:34.931Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/d9/b56cc9f883140ac10021a8c9b0f4e16eed1ba675c22513cdcbce3ba64014/psutil-7.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0cc5c6889b9871f231ed5455a9a02149e388fffcb30b607fb7a8896a6d95f22e", size = 238575, upload-time = "2025-10-25T10:46:38.728Z" }, + { url = "https://files.pythonhosted.org/packages/36/eb/28d22de383888deb252c818622196e709da98816e296ef95afda33f1c0a2/psutil-7.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8e9e77a977208d84aa363a4a12e0f72189d58bbf4e46b49aae29a2c6e93ef206", size = 239297, upload-time = "2025-10-25T10:46:41.347Z" }, + { url = "https://files.pythonhosted.org/packages/89/5d/220039e2f28cc129626e54d63892ab05c0d56a29818bfe7268dcb5008932/psutil-7.1.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d9623a5e4164d2220ecceb071f4b333b3c78866141e8887c072129185f41278", size = 280420, upload-time = "2025-10-25T10:46:44.122Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7a/286f0e1c167445b2ef4a6cbdfc8c59fdb45a5a493788950cf8467201dc73/psutil-7.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:364b1c10fe4ed59c89ec49e5f1a70da353b27986fa8233b4b999df4742a5ee2f", size = 283049, upload-time = "2025-10-25T10:46:47.095Z" }, + { url = "https://files.pythonhosted.org/packages/aa/cc/7eb93260794a42e39b976f3a4dde89725800b9f573b014fac142002a5c98/psutil-7.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:f101ef84de7e05d41310e3ccbdd65a6dd1d9eed85e8aaf0758405d022308e204", size = 248713, upload-time = "2025-10-25T10:46:49.573Z" }, + { url = "https://files.pythonhosted.org/packages/ab/1a/0681a92b53366e01f0a099f5237d0c8a2f79d322ac589cccde5e30c8a4e2/psutil-7.1.2-cp313-cp313t-win_arm64.whl", hash = "sha256:20c00824048a95de67f00afedc7b08b282aa08638585b0206a9fb51f28f1a165", size = 244644, upload-time = "2025-10-25T10:46:51.924Z" }, + { url = "https://files.pythonhosted.org/packages/56/9e/f1c5c746b4ed5320952acd3002d3962fe36f30524c00ea79fdf954cc6779/psutil-7.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:e09cfe92aa8e22b1ec5e2d394820cf86c5dff6367ac3242366485dfa874d43bc", size = 238640, upload-time = "2025-10-25T10:46:54.089Z" }, + { url = "https://files.pythonhosted.org/packages/32/ee/fd26216a735395cc25c3899634e34aeb41fb1f3dbb44acc67d9e594be562/psutil-7.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fa6342cf859c48b19df3e4aa170e4cfb64aadc50b11e06bb569c6c777b089c9e", size = 239303, upload-time = "2025-10-25T10:46:56.932Z" }, + { url = "https://files.pythonhosted.org/packages/3c/cd/7d96eaec4ef7742b845a9ce2759a2769ecce4ab7a99133da24abacbc9e41/psutil-7.1.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:625977443498ee7d6c1e63e93bacca893fd759a66c5f635d05e05811d23fb5ee", size = 281717, upload-time = "2025-10-25T10:46:59.116Z" }, + { url = "https://files.pythonhosted.org/packages/bc/1a/7f0b84bdb067d35fe7fade5fff888408688caf989806ce2d6dae08c72dd5/psutil-7.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a24bcd7b7f2918d934af0fb91859f621b873d6aa81267575e3655cd387572a7", size = 284575, upload-time = "2025-10-25T10:47:00.944Z" }, + { url = "https://files.pythonhosted.org/packages/de/05/7820ef8f7b275268917e0c750eada5834581206d9024ca88edce93c4b762/psutil-7.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:329f05610da6380982e6078b9d0881d9ab1e9a7eb7c02d833bfb7340aa634e31", size = 249491, upload-time = "2025-10-25T10:47:03.174Z" }, + { url = "https://files.pythonhosted.org/packages/db/9a/58de399c7cb58489f08498459ff096cd76b3f1ddc4f224ec2c5ef729c7d0/psutil-7.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:7b04c29e3c0c888e83ed4762b70f31e65c42673ea956cefa8ced0e31e185f582", size = 244880, upload-time = "2025-10-25T10:47:05.228Z" }, + { url = "https://files.pythonhosted.org/packages/ae/89/b9f8d47ddbc52d7301fc868e8224e5f44ed3c7f55e6d0f54ecaf5dd9ff5e/psutil-7.1.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c9ba5c19f2d46203ee8c152c7b01df6eec87d883cfd8ee1af2ef2727f6b0f814", size = 237244, upload-time = "2025-10-25T10:47:07.086Z" }, + { url = "https://files.pythonhosted.org/packages/c8/7a/8628c2f6b240680a67d73d8742bb9ff39b1820a693740e43096d5dcb01e5/psutil-7.1.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:2a486030d2fe81bec023f703d3d155f4823a10a47c36784c84f1cc7f8d39bedb", size = 238101, upload-time = "2025-10-25T10:47:09.523Z" }, + { url = "https://files.pythonhosted.org/packages/30/28/5e27f4d5a0e347f8e3cc16cd7d35533dbce086c95807f1f0e9cd77e26c10/psutil-7.1.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3efd8fc791492e7808a51cb2b94889db7578bfaea22df931424f874468e389e3", size = 258675, upload-time = "2025-10-25T10:47:11.082Z" }, + { url = "https://files.pythonhosted.org/packages/e5/5c/79cf60c9acf36d087f0db0f82066fca4a780e97e5b3a2e4c38209c03d170/psutil-7.1.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2aeb9b64f481b8eabfc633bd39e0016d4d8bbcd590d984af764d80bf0851b8a", size = 260203, upload-time = "2025-10-25T10:47:13.226Z" }, + { url = "https://files.pythonhosted.org/packages/f7/03/0a464404c51685dcb9329fdd660b1721e076ccd7b3d97dee066bcc9ffb15/psutil-7.1.2-cp37-abi3-win_amd64.whl", hash = "sha256:8e17852114c4e7996fe9da4745c2bdef001ebbf2f260dec406290e66628bdb91", size = 246714, upload-time = "2025-10-25T10:47:15.093Z" }, + { url = "https://files.pythonhosted.org/packages/6a/32/97ca2090f2f1b45b01b6aa7ae161cfe50671de097311975ca6eea3e7aabc/psutil-7.1.2-cp37-abi3-win_arm64.whl", hash = "sha256:3e988455e61c240cc879cb62a008c2699231bf3e3d061d7fce4234463fd2abb4", size = 243742, upload-time = "2025-10-25T10:47:17.302Z" }, ] [[package]] @@ -4056,45 +4090,59 @@ wheels = [ [[package]] name = "pyarrow" -version = "21.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/17/d9/110de31880016e2afc52d8580b397dbe47615defbf09ca8cf55f56c62165/pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26", size = 31196837, upload-time = "2025-07-18T00:54:34.755Z" }, - { url = "https://files.pythonhosted.org/packages/df/5f/c1c1997613abf24fceb087e79432d24c19bc6f7259cab57c2c8e5e545fab/pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79", size = 32659470, upload-time = "2025-07-18T00:54:38.329Z" }, - { url = "https://files.pythonhosted.org/packages/3e/ed/b1589a777816ee33ba123ba1e4f8f02243a844fed0deec97bde9fb21a5cf/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:7be45519b830f7c24b21d630a31d48bcebfd5d4d7f9d3bdb49da9cdf6d764edb", size = 41055619, upload-time = "2025-07-18T00:54:42.172Z" }, - { url = "https://files.pythonhosted.org/packages/44/28/b6672962639e85dc0ac36f71ab3a8f5f38e01b51343d7aa372a6b56fa3f3/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26bfd95f6bff443ceae63c65dc7e048670b7e98bc892210acba7e4995d3d4b51", size = 42733488, upload-time = "2025-07-18T00:54:47.132Z" }, - { url = "https://files.pythonhosted.org/packages/f8/cc/de02c3614874b9089c94eac093f90ca5dfa6d5afe45de3ba847fd950fdf1/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bd04ec08f7f8bd113c55868bd3fc442a9db67c27af098c5f814a3091e71cc61a", size = 43329159, upload-time = "2025-07-18T00:54:51.686Z" }, - { url = "https://files.pythonhosted.org/packages/a6/3e/99473332ac40278f196e105ce30b79ab8affab12f6194802f2593d6b0be2/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9b0b14b49ac10654332a805aedfc0147fb3469cbf8ea951b3d040dab12372594", size = 45050567, upload-time = "2025-07-18T00:54:56.679Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f5/c372ef60593d713e8bfbb7e0c743501605f0ad00719146dc075faf11172b/pyarrow-21.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9d9f8bcb4c3be7738add259738abdeddc363de1b80e3310e04067aa1ca596634", size = 26217959, upload-time = "2025-07-18T00:55:00.482Z" }, - { url = "https://files.pythonhosted.org/packages/94/dc/80564a3071a57c20b7c32575e4a0120e8a330ef487c319b122942d665960/pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b", size = 31243234, upload-time = "2025-07-18T00:55:03.812Z" }, - { url = "https://files.pythonhosted.org/packages/ea/cc/3b51cb2db26fe535d14f74cab4c79b191ed9a8cd4cbba45e2379b5ca2746/pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10", size = 32714370, upload-time = "2025-07-18T00:55:07.495Z" }, - { url = "https://files.pythonhosted.org/packages/24/11/a4431f36d5ad7d83b87146f515c063e4d07ef0b7240876ddb885e6b44f2e/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e", size = 41135424, upload-time = "2025-07-18T00:55:11.461Z" }, - { url = "https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569", size = 42823810, upload-time = "2025-07-18T00:55:16.301Z" }, - { url = "https://files.pythonhosted.org/packages/2e/3b/89fced102448a9e3e0d4dded1f37fa3ce4700f02cdb8665457fcc8015f5b/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e", size = 43391538, upload-time = "2025-07-18T00:55:23.82Z" }, - { url = "https://files.pythonhosted.org/packages/fb/bb/ea7f1bd08978d39debd3b23611c293f64a642557e8141c80635d501e6d53/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c", size = 45120056, upload-time = "2025-07-18T00:55:28.231Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6", size = 26220568, upload-time = "2025-07-18T00:55:32.122Z" }, - { url = "https://files.pythonhosted.org/packages/ca/d4/d4f817b21aacc30195cf6a46ba041dd1be827efa4a623cc8bf39a1c2a0c0/pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd", size = 31160305, upload-time = "2025-07-18T00:55:35.373Z" }, - { url = "https://files.pythonhosted.org/packages/a2/9c/dcd38ce6e4b4d9a19e1d36914cb8e2b1da4e6003dd075474c4cfcdfe0601/pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876", size = 32684264, upload-time = "2025-07-18T00:55:39.303Z" }, - { url = "https://files.pythonhosted.org/packages/4f/74/2a2d9f8d7a59b639523454bec12dba35ae3d0a07d8ab529dc0809f74b23c/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d", size = 41108099, upload-time = "2025-07-18T00:55:42.889Z" }, - { url = "https://files.pythonhosted.org/packages/ad/90/2660332eeb31303c13b653ea566a9918484b6e4d6b9d2d46879a33ab0622/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e", size = 42829529, upload-time = "2025-07-18T00:55:47.069Z" }, - { url = "https://files.pythonhosted.org/packages/33/27/1a93a25c92717f6aa0fca06eb4700860577d016cd3ae51aad0e0488ac899/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82", size = 43367883, upload-time = "2025-07-18T00:55:53.069Z" }, - { url = "https://files.pythonhosted.org/packages/05/d9/4d09d919f35d599bc05c6950095e358c3e15148ead26292dfca1fb659b0c/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623", size = 45133802, upload-time = "2025-07-18T00:55:57.714Z" }, - { url = "https://files.pythonhosted.org/packages/71/30/f3795b6e192c3ab881325ffe172e526499eb3780e306a15103a2764916a2/pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18", size = 26203175, upload-time = "2025-07-18T00:56:01.364Z" }, - { url = "https://files.pythonhosted.org/packages/16/ca/c7eaa8e62db8fb37ce942b1ea0c6d7abfe3786ca193957afa25e71b81b66/pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a", size = 31154306, upload-time = "2025-07-18T00:56:04.42Z" }, - { url = "https://files.pythonhosted.org/packages/ce/e8/e87d9e3b2489302b3a1aea709aaca4b781c5252fcb812a17ab6275a9a484/pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe", size = 32680622, upload-time = "2025-07-18T00:56:07.505Z" }, - { url = "https://files.pythonhosted.org/packages/84/52/79095d73a742aa0aba370c7942b1b655f598069489ab387fe47261a849e1/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd", size = 41104094, upload-time = "2025-07-18T00:56:10.994Z" }, - { url = "https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61", size = 42825576, upload-time = "2025-07-18T00:56:15.569Z" }, - { url = "https://files.pythonhosted.org/packages/b3/62/0f29de6e0a1e33518dec92c65be0351d32d7ca351e51ec5f4f837a9aab91/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d", size = 43368342, upload-time = "2025-07-18T00:56:19.531Z" }, - { url = "https://files.pythonhosted.org/packages/90/c7/0fa1f3f29cf75f339768cc698c8ad4ddd2481c1742e9741459911c9ac477/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99", size = 45131218, upload-time = "2025-07-18T00:56:23.347Z" }, - { url = "https://files.pythonhosted.org/packages/01/63/581f2076465e67b23bc5a37d4a2abff8362d389d29d8105832e82c9c811c/pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636", size = 26087551, upload-time = "2025-07-18T00:56:26.758Z" }, - { url = "https://files.pythonhosted.org/packages/c9/ab/357d0d9648bb8241ee7348e564f2479d206ebe6e1c47ac5027c2e31ecd39/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da", size = 31290064, upload-time = "2025-07-18T00:56:30.214Z" }, - { url = "https://files.pythonhosted.org/packages/3f/8a/5685d62a990e4cac2043fc76b4661bf38d06efed55cf45a334b455bd2759/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7", size = 32727837, upload-time = "2025-07-18T00:56:33.935Z" }, - { url = "https://files.pythonhosted.org/packages/fc/de/c0828ee09525c2bafefd3e736a248ebe764d07d0fd762d4f0929dbc516c9/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6", size = 41014158, upload-time = "2025-07-18T00:56:37.528Z" }, - { url = "https://files.pythonhosted.org/packages/6e/26/a2865c420c50b7a3748320b614f3484bfcde8347b2639b2b903b21ce6a72/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8", size = 42667885, upload-time = "2025-07-18T00:56:41.483Z" }, - { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, - { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, - { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, +version = "22.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/9b/cb3f7e0a345353def531ca879053e9ef6b9f38ed91aebcf68b09ba54dec0/pyarrow-22.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:77718810bd3066158db1e95a63c160ad7ce08c6b0710bc656055033e39cdad88", size = 34223968, upload-time = "2025-10-24T10:03:31.21Z" }, + { url = "https://files.pythonhosted.org/packages/6c/41/3184b8192a120306270c5307f105b70320fdaa592c99843c5ef78aaefdcf/pyarrow-22.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:44d2d26cda26d18f7af7db71453b7b783788322d756e81730acb98f24eb90ace", size = 35942085, upload-time = "2025-10-24T10:03:38.146Z" }, + { url = "https://files.pythonhosted.org/packages/d9/3d/a1eab2f6f08001f9fb714b8ed5cfb045e2fe3e3e3c0c221f2c9ed1e6d67d/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b9d71701ce97c95480fecb0039ec5bb889e75f110da72005743451339262f4ce", size = 44964613, upload-time = "2025-10-24T10:03:46.516Z" }, + { url = "https://files.pythonhosted.org/packages/46/46/a1d9c24baf21cfd9ce994ac820a24608decf2710521b29223d4334985127/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:710624ab925dc2b05a6229d47f6f0dac1c1155e6ed559be7109f684eba048a48", size = 47627059, upload-time = "2025-10-24T10:03:55.353Z" }, + { url = "https://files.pythonhosted.org/packages/3a/4c/f711acb13075c1391fd54bc17e078587672c575f8de2a6e62509af026dcf/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f963ba8c3b0199f9d6b794c90ec77545e05eadc83973897a4523c9e8d84e9340", size = 47947043, upload-time = "2025-10-24T10:04:05.408Z" }, + { url = "https://files.pythonhosted.org/packages/4e/70/1f3180dd7c2eab35c2aca2b29ace6c519f827dcd4cfeb8e0dca41612cf7a/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bd0d42297ace400d8febe55f13fdf46e86754842b860c978dfec16f081e5c653", size = 50206505, upload-time = "2025-10-24T10:04:15.786Z" }, + { url = "https://files.pythonhosted.org/packages/80/07/fea6578112c8c60ffde55883a571e4c4c6bc7049f119d6b09333b5cc6f73/pyarrow-22.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:00626d9dc0f5ef3a75fe63fd68b9c7c8302d2b5bbc7f74ecaedba83447a24f84", size = 28101641, upload-time = "2025-10-24T10:04:22.57Z" }, + { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" }, + { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" }, + { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" }, + { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" }, + { url = "https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, upload-time = "2025-10-24T10:05:47.314Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" }, + { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" }, + { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = "2025-10-24T10:06:27.301Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" }, + { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" }, + { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" }, + { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" }, + { url = "https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" }, + { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" }, + { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" }, + { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" }, + { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" }, + { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" }, + { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" }, + { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" }, ] [[package]] @@ -5074,7 +5122,7 @@ wheels = [ [[package]] name = "scipy" -version = "1.16.2" +version = "1.16.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", @@ -5091,68 +5139,68 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "python_full_version >= '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4c/3b/546a6f0bfe791bbb7f8d591613454d15097e53f906308ec6f7c1ce588e8e/scipy-1.16.2.tar.gz", hash = "sha256:af029b153d243a80afb6eabe40b0a07f8e35c9adc269c019f364ad747f826a6b", size = 30580599, upload-time = "2025-09-11T17:48:08.271Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/ef/37ed4b213d64b48422df92560af7300e10fe30b5d665dd79932baebee0c6/scipy-1.16.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:6ab88ea43a57da1af33292ebd04b417e8e2eaf9d5aa05700be8d6e1b6501cd92", size = 36619956, upload-time = "2025-09-11T17:39:20.5Z" }, - { url = "https://files.pythonhosted.org/packages/85/ab/5c2eba89b9416961a982346a4d6a647d78c91ec96ab94ed522b3b6baf444/scipy-1.16.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c95e96c7305c96ede73a7389f46ccd6c659c4da5ef1b2789466baeaed3622b6e", size = 28931117, upload-time = "2025-09-11T17:39:29.06Z" }, - { url = "https://files.pythonhosted.org/packages/80/d1/eed51ab64d227fe60229a2d57fb60ca5898cfa50ba27d4f573e9e5f0b430/scipy-1.16.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:87eb178db04ece7c698220d523c170125dbffebb7af0345e66c3554f6f60c173", size = 20921997, upload-time = "2025-09-11T17:39:34.892Z" }, - { url = "https://files.pythonhosted.org/packages/be/7c/33ea3e23bbadde96726edba6bf9111fb1969d14d9d477ffa202c67bec9da/scipy-1.16.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:4e409eac067dcee96a57fbcf424c13f428037827ec7ee3cb671ff525ca4fc34d", size = 23523374, upload-time = "2025-09-11T17:39:40.846Z" }, - { url = "https://files.pythonhosted.org/packages/96/0b/7399dc96e1e3f9a05e258c98d716196a34f528eef2ec55aad651ed136d03/scipy-1.16.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e574be127bb760f0dad24ff6e217c80213d153058372362ccb9555a10fc5e8d2", size = 33583702, upload-time = "2025-09-11T17:39:49.011Z" }, - { url = "https://files.pythonhosted.org/packages/1a/bc/a5c75095089b96ea72c1bd37a4497c24b581ec73db4ef58ebee142ad2d14/scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f5db5ba6188d698ba7abab982ad6973265b74bb40a1efe1821b58c87f73892b9", size = 35883427, upload-time = "2025-09-11T17:39:57.406Z" }, - { url = "https://files.pythonhosted.org/packages/ab/66/e25705ca3d2b87b97fe0a278a24b7f477b4023a926847935a1a71488a6a6/scipy-1.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec6e74c4e884104ae006d34110677bfe0098203a3fec2f3faf349f4cb05165e3", size = 36212940, upload-time = "2025-09-11T17:40:06.013Z" }, - { url = "https://files.pythonhosted.org/packages/d6/fd/0bb911585e12f3abdd603d721d83fc1c7492835e1401a0e6d498d7822b4b/scipy-1.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:912f46667d2d3834bc3d57361f854226475f695eb08c08a904aadb1c936b6a88", size = 38865092, upload-time = "2025-09-11T17:40:15.143Z" }, - { url = "https://files.pythonhosted.org/packages/d6/73/c449a7d56ba6e6f874183759f8483cde21f900a8be117d67ffbb670c2958/scipy-1.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:91e9e8a37befa5a69e9cacbe0bcb79ae5afb4a0b130fd6db6ee6cc0d491695fa", size = 38687626, upload-time = "2025-09-11T17:40:24.041Z" }, - { url = "https://files.pythonhosted.org/packages/68/72/02f37316adf95307f5d9e579023c6899f89ff3a051fa079dbd6faafc48e5/scipy-1.16.2-cp311-cp311-win_arm64.whl", hash = "sha256:f3bf75a6dcecab62afde4d1f973f1692be013110cad5338007927db8da73249c", size = 25503506, upload-time = "2025-09-11T17:40:30.703Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8d/6396e00db1282279a4ddd507c5f5e11f606812b608ee58517ce8abbf883f/scipy-1.16.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:89d6c100fa5c48472047632e06f0876b3c4931aac1f4291afc81a3644316bb0d", size = 36646259, upload-time = "2025-09-11T17:40:39.329Z" }, - { url = "https://files.pythonhosted.org/packages/3b/93/ea9edd7e193fceb8eef149804491890bde73fb169c896b61aa3e2d1e4e77/scipy-1.16.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ca748936cd579d3f01928b30a17dc474550b01272d8046e3e1ee593f23620371", size = 28888976, upload-time = "2025-09-11T17:40:46.82Z" }, - { url = "https://files.pythonhosted.org/packages/91/4d/281fddc3d80fd738ba86fd3aed9202331180b01e2c78eaae0642f22f7e83/scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:fac4f8ce2ddb40e2e3d0f7ec36d2a1e7f92559a2471e59aec37bd8d9de01fec0", size = 20879905, upload-time = "2025-09-11T17:40:52.545Z" }, - { url = "https://files.pythonhosted.org/packages/69/40/b33b74c84606fd301b2915f0062e45733c6ff5708d121dd0deaa8871e2d0/scipy-1.16.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:033570f1dcefd79547a88e18bccacff025c8c647a330381064f561d43b821232", size = 23553066, upload-time = "2025-09-11T17:40:59.014Z" }, - { url = "https://files.pythonhosted.org/packages/55/a7/22c739e2f21a42cc8f16bc76b47cff4ed54fbe0962832c589591c2abec34/scipy-1.16.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ea3421209bf00c8a5ef2227de496601087d8f638a2363ee09af059bd70976dc1", size = 33336407, upload-time = "2025-09-11T17:41:06.796Z" }, - { url = "https://files.pythonhosted.org/packages/53/11/a0160990b82999b45874dc60c0c183d3a3a969a563fffc476d5a9995c407/scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f66bd07ba6f84cd4a380b41d1bf3c59ea488b590a2ff96744845163309ee8e2f", size = 35673281, upload-time = "2025-09-11T17:41:15.055Z" }, - { url = "https://files.pythonhosted.org/packages/96/53/7ef48a4cfcf243c3d0f1643f5887c81f29fdf76911c4e49331828e19fc0a/scipy-1.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e9feab931bd2aea4a23388c962df6468af3d808ddf2d40f94a81c5dc38f32ef", size = 36004222, upload-time = "2025-09-11T17:41:23.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7f/71a69e0afd460049d41c65c630c919c537815277dfea214031005f474d78/scipy-1.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:03dfc75e52f72cf23ec2ced468645321407faad8f0fe7b1f5b49264adbc29cb1", size = 38664586, upload-time = "2025-09-11T17:41:31.021Z" }, - { url = "https://files.pythonhosted.org/packages/34/95/20e02ca66fb495a95fba0642fd48e0c390d0ece9b9b14c6e931a60a12dea/scipy-1.16.2-cp312-cp312-win_amd64.whl", hash = "sha256:0ce54e07bbb394b417457409a64fd015be623f36e330ac49306433ffe04bc97e", size = 38550641, upload-time = "2025-09-11T17:41:36.61Z" }, - { url = "https://files.pythonhosted.org/packages/92/ad/13646b9beb0a95528ca46d52b7babafbe115017814a611f2065ee4e61d20/scipy-1.16.2-cp312-cp312-win_arm64.whl", hash = "sha256:2a8ffaa4ac0df81a0b94577b18ee079f13fecdb924df3328fc44a7dc5ac46851", size = 25456070, upload-time = "2025-09-11T17:41:41.3Z" }, - { url = "https://files.pythonhosted.org/packages/c1/27/c5b52f1ee81727a9fc457f5ac1e9bf3d6eab311805ea615c83c27ba06400/scipy-1.16.2-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:84f7bf944b43e20b8a894f5fe593976926744f6c185bacfcbdfbb62736b5cc70", size = 36604856, upload-time = "2025-09-11T17:41:47.695Z" }, - { url = "https://files.pythonhosted.org/packages/32/a9/15c20d08e950b540184caa8ced675ba1128accb0e09c653780ba023a4110/scipy-1.16.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5c39026d12edc826a1ef2ad35ad1e6d7f087f934bb868fc43fa3049c8b8508f9", size = 28864626, upload-time = "2025-09-11T17:41:52.642Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fc/ea36098df653cca26062a627c1a94b0de659e97127c8491e18713ca0e3b9/scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e52729ffd45b68777c5319560014d6fd251294200625d9d70fd8626516fc49f5", size = 20855689, upload-time = "2025-09-11T17:41:57.886Z" }, - { url = "https://files.pythonhosted.org/packages/dc/6f/d0b53be55727f3e6d7c72687ec18ea6d0047cf95f1f77488b99a2bafaee1/scipy-1.16.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:024dd4a118cccec09ca3209b7e8e614931a6ffb804b2a601839499cb88bdf925", size = 23512151, upload-time = "2025-09-11T17:42:02.303Z" }, - { url = "https://files.pythonhosted.org/packages/11/85/bf7dab56e5c4b1d3d8eef92ca8ede788418ad38a7dc3ff50262f00808760/scipy-1.16.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7a5dc7ee9c33019973a470556081b0fd3c9f4c44019191039f9769183141a4d9", size = 33329824, upload-time = "2025-09-11T17:42:07.549Z" }, - { url = "https://files.pythonhosted.org/packages/da/6a/1a927b14ddc7714111ea51f4e568203b2bb6ed59bdd036d62127c1a360c8/scipy-1.16.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c2275ff105e508942f99d4e3bc56b6ef5e4b3c0af970386ca56b777608ce95b7", size = 35681881, upload-time = "2025-09-11T17:42:13.255Z" }, - { url = "https://files.pythonhosted.org/packages/c1/5f/331148ea5780b4fcc7007a4a6a6ee0a0c1507a796365cc642d4d226e1c3a/scipy-1.16.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:af80196eaa84f033e48444d2e0786ec47d328ba00c71e4299b602235ffef9acb", size = 36006219, upload-time = "2025-09-11T17:42:18.765Z" }, - { url = "https://files.pythonhosted.org/packages/46/3a/e991aa9d2aec723b4a8dcfbfc8365edec5d5e5f9f133888067f1cbb7dfc1/scipy-1.16.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9fb1eb735fe3d6ed1f89918224e3385fbf6f9e23757cacc35f9c78d3b712dd6e", size = 38682147, upload-time = "2025-09-11T17:42:25.177Z" }, - { url = "https://files.pythonhosted.org/packages/a1/57/0f38e396ad19e41b4c5db66130167eef8ee620a49bc7d0512e3bb67e0cab/scipy-1.16.2-cp313-cp313-win_amd64.whl", hash = "sha256:fda714cf45ba43c9d3bae8f2585c777f64e3f89a2e073b668b32ede412d8f52c", size = 38520766, upload-time = "2025-09-11T17:43:25.342Z" }, - { url = "https://files.pythonhosted.org/packages/1b/a5/85d3e867b6822d331e26c862a91375bb7746a0b458db5effa093d34cdb89/scipy-1.16.2-cp313-cp313-win_arm64.whl", hash = "sha256:2f5350da923ccfd0b00e07c3e5cfb316c1c0d6c1d864c07a72d092e9f20db104", size = 25451169, upload-time = "2025-09-11T17:43:30.198Z" }, - { url = "https://files.pythonhosted.org/packages/09/d9/60679189bcebda55992d1a45498de6d080dcaf21ce0c8f24f888117e0c2d/scipy-1.16.2-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:53d8d2ee29b925344c13bda64ab51785f016b1b9617849dac10897f0701b20c1", size = 37012682, upload-time = "2025-09-11T17:42:30.677Z" }, - { url = "https://files.pythonhosted.org/packages/83/be/a99d13ee4d3b7887a96f8c71361b9659ba4ef34da0338f14891e102a127f/scipy-1.16.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:9e05e33657efb4c6a9d23bd8300101536abd99c85cca82da0bffff8d8764d08a", size = 29389926, upload-time = "2025-09-11T17:42:35.845Z" }, - { url = "https://files.pythonhosted.org/packages/bf/0a/130164a4881cec6ca8c00faf3b57926f28ed429cd6001a673f83c7c2a579/scipy-1.16.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:7fe65b36036357003b3ef9d37547abeefaa353b237e989c21027b8ed62b12d4f", size = 21381152, upload-time = "2025-09-11T17:42:40.07Z" }, - { url = "https://files.pythonhosted.org/packages/47/a6/503ffb0310ae77fba874e10cddfc4a1280bdcca1d13c3751b8c3c2996cf8/scipy-1.16.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:6406d2ac6d40b861cccf57f49592f9779071655e9f75cd4f977fa0bdd09cb2e4", size = 23914410, upload-time = "2025-09-11T17:42:44.313Z" }, - { url = "https://files.pythonhosted.org/packages/fa/c7/1147774bcea50d00c02600aadaa919facbd8537997a62496270133536ed6/scipy-1.16.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ff4dc42bd321991fbf611c23fc35912d690f731c9914bf3af8f417e64aca0f21", size = 33481880, upload-time = "2025-09-11T17:42:49.325Z" }, - { url = "https://files.pythonhosted.org/packages/6a/74/99d5415e4c3e46b2586f30cdbecb95e101c7192628a484a40dd0d163811a/scipy-1.16.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:654324826654d4d9133e10675325708fb954bc84dae6e9ad0a52e75c6b1a01d7", size = 35791425, upload-time = "2025-09-11T17:42:54.711Z" }, - { url = "https://files.pythonhosted.org/packages/1b/ee/a6559de7c1cc710e938c0355d9d4fbcd732dac4d0d131959d1f3b63eb29c/scipy-1.16.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63870a84cd15c44e65220eaed2dac0e8f8b26bbb991456a033c1d9abfe8a94f8", size = 36178622, upload-time = "2025-09-11T17:43:00.375Z" }, - { url = "https://files.pythonhosted.org/packages/4e/7b/f127a5795d5ba8ece4e0dce7d4a9fb7cb9e4f4757137757d7a69ab7d4f1a/scipy-1.16.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fa01f0f6a3050fa6a9771a95d5faccc8e2f5a92b4a2e5440a0fa7264a2398472", size = 38783985, upload-time = "2025-09-11T17:43:06.661Z" }, - { url = "https://files.pythonhosted.org/packages/3e/9f/bc81c1d1e033951eb5912cd3750cc005943afa3e65a725d2443a3b3c4347/scipy-1.16.2-cp313-cp313t-win_amd64.whl", hash = "sha256:116296e89fba96f76353a8579820c2512f6e55835d3fad7780fece04367de351", size = 38631367, upload-time = "2025-09-11T17:43:14.44Z" }, - { url = "https://files.pythonhosted.org/packages/d6/5e/2cc7555fd81d01814271412a1d59a289d25f8b63208a0a16c21069d55d3e/scipy-1.16.2-cp313-cp313t-win_arm64.whl", hash = "sha256:98e22834650be81d42982360382b43b17f7ba95e0e6993e2a4f5b9ad9283a94d", size = 25787992, upload-time = "2025-09-11T17:43:19.745Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ac/ad8951250516db71619f0bd3b2eb2448db04b720a003dd98619b78b692c0/scipy-1.16.2-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:567e77755019bb7461513c87f02bb73fb65b11f049aaaa8ca17cfaa5a5c45d77", size = 36595109, upload-time = "2025-09-11T17:43:35.713Z" }, - { url = "https://files.pythonhosted.org/packages/ff/f6/5779049ed119c5b503b0f3dc6d6f3f68eefc3a9190d4ad4c276f854f051b/scipy-1.16.2-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:17d9bb346194e8967296621208fcdfd39b55498ef7d2f376884d5ac47cec1a70", size = 28859110, upload-time = "2025-09-11T17:43:40.814Z" }, - { url = "https://files.pythonhosted.org/packages/82/09/9986e410ae38bf0a0c737ff8189ac81a93b8e42349aac009891c054403d7/scipy-1.16.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0a17541827a9b78b777d33b623a6dcfe2ef4a25806204d08ead0768f4e529a88", size = 20850110, upload-time = "2025-09-11T17:43:44.981Z" }, - { url = "https://files.pythonhosted.org/packages/0d/ad/485cdef2d9215e2a7df6d61b81d2ac073dfacf6ae24b9ae87274c4e936ae/scipy-1.16.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:d7d4c6ba016ffc0f9568d012f5f1eb77ddd99412aea121e6fa8b4c3b7cbad91f", size = 23497014, upload-time = "2025-09-11T17:43:49.074Z" }, - { url = "https://files.pythonhosted.org/packages/a7/74/f6a852e5d581122b8f0f831f1d1e32fb8987776ed3658e95c377d308ed86/scipy-1.16.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9702c4c023227785c779cba2e1d6f7635dbb5b2e0936cdd3a4ecb98d78fd41eb", size = 33401155, upload-time = "2025-09-11T17:43:54.661Z" }, - { url = "https://files.pythonhosted.org/packages/d9/f5/61d243bbc7c6e5e4e13dde9887e84a5cbe9e0f75fd09843044af1590844e/scipy-1.16.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d1cdf0ac28948d225decdefcc45ad7dd91716c29ab56ef32f8e0d50657dffcc7", size = 35691174, upload-time = "2025-09-11T17:44:00.101Z" }, - { url = "https://files.pythonhosted.org/packages/03/99/59933956331f8cc57e406cdb7a483906c74706b156998f322913e789c7e1/scipy-1.16.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:70327d6aa572a17c2941cdfb20673f82e536e91850a2e4cb0c5b858b690e1548", size = 36070752, upload-time = "2025-09-11T17:44:05.619Z" }, - { url = "https://files.pythonhosted.org/packages/c6/7d/00f825cfb47ee19ef74ecf01244b43e95eae74e7e0ff796026ea7cd98456/scipy-1.16.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5221c0b2a4b58aa7c4ed0387d360fd90ee9086d383bb34d9f2789fafddc8a936", size = 38701010, upload-time = "2025-09-11T17:44:11.322Z" }, - { url = "https://files.pythonhosted.org/packages/e4/9f/b62587029980378304ba5a8563d376c96f40b1e133daacee76efdcae32de/scipy-1.16.2-cp314-cp314-win_amd64.whl", hash = "sha256:f5a85d7b2b708025af08f060a496dd261055b617d776fc05a1a1cc69e09fe9ff", size = 39360061, upload-time = "2025-09-11T17:45:09.814Z" }, - { url = "https://files.pythonhosted.org/packages/82/04/7a2f1609921352c7fbee0815811b5050582f67f19983096c4769867ca45f/scipy-1.16.2-cp314-cp314-win_arm64.whl", hash = "sha256:2cc73a33305b4b24556957d5857d6253ce1e2dcd67fa0ff46d87d1670b3e1e1d", size = 26126914, upload-time = "2025-09-11T17:45:14.73Z" }, - { url = "https://files.pythonhosted.org/packages/51/b9/60929ce350c16b221928725d2d1d7f86cf96b8bc07415547057d1196dc92/scipy-1.16.2-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:9ea2a3fed83065d77367775d689401a703d0f697420719ee10c0780bcab594d8", size = 37013193, upload-time = "2025-09-11T17:44:16.757Z" }, - { url = "https://files.pythonhosted.org/packages/2a/41/ed80e67782d4bc5fc85a966bc356c601afddd175856ba7c7bb6d9490607e/scipy-1.16.2-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7280d926f11ca945c3ef92ba960fa924e1465f8d07ce3a9923080363390624c4", size = 29390172, upload-time = "2025-09-11T17:44:21.783Z" }, - { url = "https://files.pythonhosted.org/packages/c4/a3/2f673ace4090452696ccded5f5f8efffb353b8f3628f823a110e0170b605/scipy-1.16.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8afae1756f6a1fe04636407ef7dbece33d826a5d462b74f3d0eb82deabefd831", size = 21381326, upload-time = "2025-09-11T17:44:25.982Z" }, - { url = "https://files.pythonhosted.org/packages/42/bf/59df61c5d51395066c35836b78136accf506197617c8662e60ea209881e1/scipy-1.16.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:5c66511f29aa8d233388e7416a3f20d5cae7a2744d5cee2ecd38c081f4e861b3", size = 23915036, upload-time = "2025-09-11T17:44:30.527Z" }, - { url = "https://files.pythonhosted.org/packages/91/c3/edc7b300dc16847ad3672f1a6f3f7c5d13522b21b84b81c265f4f2760d4a/scipy-1.16.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efe6305aeaa0e96b0ccca5ff647a43737d9a092064a3894e46c414db84bc54ac", size = 33484341, upload-time = "2025-09-11T17:44:35.981Z" }, - { url = "https://files.pythonhosted.org/packages/26/c7/24d1524e72f06ff141e8d04b833c20db3021020563272ccb1b83860082a9/scipy-1.16.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f3a337d9ae06a1e8d655ee9d8ecb835ea5ddcdcbd8d23012afa055ab014f374", size = 35790840, upload-time = "2025-09-11T17:44:41.76Z" }, - { url = "https://files.pythonhosted.org/packages/aa/b7/5aaad984eeedd56858dc33d75efa59e8ce798d918e1033ef62d2708f2c3d/scipy-1.16.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bab3605795d269067d8ce78a910220262711b753de8913d3deeaedb5dded3bb6", size = 36174716, upload-time = "2025-09-11T17:44:47.316Z" }, - { url = "https://files.pythonhosted.org/packages/fd/c2/e276a237acb09824822b0ada11b028ed4067fdc367a946730979feacb870/scipy-1.16.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b0348d8ddb55be2a844c518cd8cc8deeeb8aeba707cf834db5758fc89b476a2c", size = 38790088, upload-time = "2025-09-11T17:44:53.011Z" }, - { url = "https://files.pythonhosted.org/packages/c6/b4/5c18a766e8353015439f3780f5fc473f36f9762edc1a2e45da3ff5a31b21/scipy-1.16.2-cp314-cp314t-win_amd64.whl", hash = "sha256:26284797e38b8a75e14ea6631d29bda11e76ceaa6ddb6fdebbfe4c4d90faf2f9", size = 39457455, upload-time = "2025-09-11T17:44:58.899Z" }, - { url = "https://files.pythonhosted.org/packages/97/30/2f9a5243008f76dfc5dee9a53dfb939d9b31e16ce4bd4f2e628bfc5d89d2/scipy-1.16.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d2a4472c231328d4de38d5f1f68fdd6d28a615138f842580a8a321b5845cf779", size = 26448374, upload-time = "2025-09-11T17:45:03.45Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/5f/6f37d7439de1455ce9c5a556b8d1db0979f03a796c030bafdf08d35b7bf9/scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97", size = 36630881, upload-time = "2025-10-28T17:31:47.104Z" }, + { url = "https://files.pythonhosted.org/packages/7c/89/d70e9f628749b7e4db2aa4cd89735502ff3f08f7b9b27d2e799485987cd9/scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511", size = 28941012, upload-time = "2025-10-28T17:31:53.411Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a8/0e7a9a6872a923505dbdf6bb93451edcac120363131c19013044a1e7cb0c/scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005", size = 20931935, upload-time = "2025-10-28T17:31:57.361Z" }, + { url = "https://files.pythonhosted.org/packages/bd/c7/020fb72bd79ad798e4dbe53938543ecb96b3a9ac3fe274b7189e23e27353/scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb", size = 23534466, upload-time = "2025-10-28T17:32:01.875Z" }, + { url = "https://files.pythonhosted.org/packages/be/a0/668c4609ce6dbf2f948e167836ccaf897f95fb63fa231c87da7558a374cd/scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876", size = 33593618, upload-time = "2025-10-28T17:32:06.902Z" }, + { url = "https://files.pythonhosted.org/packages/ca/6e/8942461cf2636cdae083e3eb72622a7fbbfa5cf559c7d13ab250a5dbdc01/scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2", size = 35899798, upload-time = "2025-10-28T17:32:12.665Z" }, + { url = "https://files.pythonhosted.org/packages/79/e8/d0f33590364cdbd67f28ce79368b373889faa4ee959588beddf6daef9abe/scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e", size = 36226154, upload-time = "2025-10-28T17:32:17.961Z" }, + { url = "https://files.pythonhosted.org/packages/39/c1/1903de608c0c924a1749c590064e65810f8046e437aba6be365abc4f7557/scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733", size = 38878540, upload-time = "2025-10-28T17:32:23.907Z" }, + { url = "https://files.pythonhosted.org/packages/f1/d0/22ec7036ba0b0a35bccb7f25ab407382ed34af0b111475eb301c16f8a2e5/scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78", size = 38722107, upload-time = "2025-10-28T17:32:29.921Z" }, + { url = "https://files.pythonhosted.org/packages/7b/60/8a00e5a524bb3bf8898db1650d350f50e6cffb9d7a491c561dc9826c7515/scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184", size = 25506272, upload-time = "2025-10-28T17:32:34.577Z" }, + { url = "https://files.pythonhosted.org/packages/40/41/5bf55c3f386b1643812f3a5674edf74b26184378ef0f3e7c7a09a7e2ca7f/scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6", size = 36659043, upload-time = "2025-10-28T17:32:40.285Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0f/65582071948cfc45d43e9870bf7ca5f0e0684e165d7c9ef4e50d783073eb/scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07", size = 28898986, upload-time = "2025-10-28T17:32:45.325Z" }, + { url = "https://files.pythonhosted.org/packages/96/5e/36bf3f0ac298187d1ceadde9051177d6a4fe4d507e8f59067dc9dd39e650/scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9", size = 20889814, upload-time = "2025-10-28T17:32:49.277Z" }, + { url = "https://files.pythonhosted.org/packages/80/35/178d9d0c35394d5d5211bbff7ac4f2986c5488b59506fef9e1de13ea28d3/scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686", size = 23565795, upload-time = "2025-10-28T17:32:53.337Z" }, + { url = "https://files.pythonhosted.org/packages/fa/46/d1146ff536d034d02f83c8afc3c4bab2eddb634624d6529a8512f3afc9da/scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203", size = 33349476, upload-time = "2025-10-28T17:32:58.353Z" }, + { url = "https://files.pythonhosted.org/packages/79/2e/415119c9ab3e62249e18c2b082c07aff907a273741b3f8160414b0e9193c/scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1", size = 35676692, upload-time = "2025-10-28T17:33:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/27/82/df26e44da78bf8d2aeaf7566082260cfa15955a5a6e96e6a29935b64132f/scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe", size = 36019345, upload-time = "2025-10-28T17:33:09.773Z" }, + { url = "https://files.pythonhosted.org/packages/82/31/006cbb4b648ba379a95c87262c2855cd0d09453e500937f78b30f02fa1cd/scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70", size = 38678975, upload-time = "2025-10-28T17:33:15.809Z" }, + { url = "https://files.pythonhosted.org/packages/c2/7f/acbd28c97e990b421af7d6d6cd416358c9c293fc958b8529e0bd5d2a2a19/scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc", size = 38555926, upload-time = "2025-10-28T17:33:21.388Z" }, + { url = "https://files.pythonhosted.org/packages/ce/69/c5c7807fd007dad4f48e0a5f2153038dc96e8725d3345b9ee31b2b7bed46/scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2", size = 25463014, upload-time = "2025-10-28T17:33:25.975Z" }, + { url = "https://files.pythonhosted.org/packages/72/f1/57e8327ab1508272029e27eeef34f2302ffc156b69e7e233e906c2a5c379/scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c", size = 36617856, upload-time = "2025-10-28T17:33:31.375Z" }, + { url = "https://files.pythonhosted.org/packages/44/13/7e63cfba8a7452eb756306aa2fd9b37a29a323b672b964b4fdeded9a3f21/scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d", size = 28874306, upload-time = "2025-10-28T17:33:36.516Z" }, + { url = "https://files.pythonhosted.org/packages/15/65/3a9400efd0228a176e6ec3454b1fa998fbbb5a8defa1672c3f65706987db/scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9", size = 20865371, upload-time = "2025-10-28T17:33:42.094Z" }, + { url = "https://files.pythonhosted.org/packages/33/d7/eda09adf009a9fb81827194d4dd02d2e4bc752cef16737cc4ef065234031/scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4", size = 23524877, upload-time = "2025-10-28T17:33:48.483Z" }, + { url = "https://files.pythonhosted.org/packages/7d/6b/3f911e1ebc364cb81320223a3422aab7d26c9c7973109a9cd0f27c64c6c0/scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959", size = 33342103, upload-time = "2025-10-28T17:33:56.495Z" }, + { url = "https://files.pythonhosted.org/packages/21/f6/4bfb5695d8941e5c570a04d9fcd0d36bce7511b7d78e6e75c8f9791f82d0/scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88", size = 35697297, upload-time = "2025-10-28T17:34:04.722Z" }, + { url = "https://files.pythonhosted.org/packages/04/e1/6496dadbc80d8d896ff72511ecfe2316b50313bfc3ebf07a3f580f08bd8c/scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234", size = 36021756, upload-time = "2025-10-28T17:34:13.482Z" }, + { url = "https://files.pythonhosted.org/packages/fe/bd/a8c7799e0136b987bda3e1b23d155bcb31aec68a4a472554df5f0937eef7/scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d", size = 38696566, upload-time = "2025-10-28T17:34:22.384Z" }, + { url = "https://files.pythonhosted.org/packages/cd/01/1204382461fcbfeb05b6161b594f4007e78b6eba9b375382f79153172b4d/scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304", size = 38529877, upload-time = "2025-10-28T17:35:51.076Z" }, + { url = "https://files.pythonhosted.org/packages/7f/14/9d9fbcaa1260a94f4bb5b64ba9213ceb5d03cd88841fe9fd1ffd47a45b73/scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2", size = 25455366, upload-time = "2025-10-28T17:35:59.014Z" }, + { url = "https://files.pythonhosted.org/packages/e2/a3/9ec205bd49f42d45d77f1730dbad9ccf146244c1647605cf834b3a8c4f36/scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b", size = 37027931, upload-time = "2025-10-28T17:34:31.451Z" }, + { url = "https://files.pythonhosted.org/packages/25/06/ca9fd1f3a4589cbd825b1447e5db3a8ebb969c1eaf22c8579bd286f51b6d/scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079", size = 29400081, upload-time = "2025-10-28T17:34:39.087Z" }, + { url = "https://files.pythonhosted.org/packages/6a/56/933e68210d92657d93fb0e381683bc0e53a965048d7358ff5fbf9e6a1b17/scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a", size = 21391244, upload-time = "2025-10-28T17:34:45.234Z" }, + { url = "https://files.pythonhosted.org/packages/a8/7e/779845db03dc1418e215726329674b40576879b91814568757ff0014ad65/scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119", size = 23929753, upload-time = "2025-10-28T17:34:51.793Z" }, + { url = "https://files.pythonhosted.org/packages/4c/4b/f756cf8161d5365dcdef9e5f460ab226c068211030a175d2fc7f3f41ca64/scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c", size = 33496912, upload-time = "2025-10-28T17:34:59.8Z" }, + { url = "https://files.pythonhosted.org/packages/09/b5/222b1e49a58668f23839ca1542a6322bb095ab8d6590d4f71723869a6c2c/scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e", size = 35802371, upload-time = "2025-10-28T17:35:08.173Z" }, + { url = "https://files.pythonhosted.org/packages/c1/8d/5964ef68bb31829bde27611f8c9deeac13764589fe74a75390242b64ca44/scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135", size = 36190477, upload-time = "2025-10-28T17:35:16.7Z" }, + { url = "https://files.pythonhosted.org/packages/ab/f2/b31d75cb9b5fa4dd39a0a931ee9b33e7f6f36f23be5ef560bf72e0f92f32/scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6", size = 38796678, upload-time = "2025-10-28T17:35:26.354Z" }, + { url = "https://files.pythonhosted.org/packages/b4/1e/b3723d8ff64ab548c38d87055483714fefe6ee20e0189b62352b5e015bb1/scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc", size = 38640178, upload-time = "2025-10-28T17:35:35.304Z" }, + { url = "https://files.pythonhosted.org/packages/8e/f3/d854ff38789aca9b0cc23008d607ced9de4f7ab14fa1ca4329f86b3758ca/scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a", size = 25803246, upload-time = "2025-10-28T17:35:42.155Z" }, + { url = "https://files.pythonhosted.org/packages/99/f6/99b10fd70f2d864c1e29a28bbcaa0c6340f9d8518396542d9ea3b4aaae15/scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6", size = 36606469, upload-time = "2025-10-28T17:36:08.741Z" }, + { url = "https://files.pythonhosted.org/packages/4d/74/043b54f2319f48ea940dd025779fa28ee360e6b95acb7cd188fad4391c6b/scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657", size = 28872043, upload-time = "2025-10-28T17:36:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/4d/e1/24b7e50cc1c4ee6ffbcb1f27fe9f4c8b40e7911675f6d2d20955f41c6348/scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26", size = 20862952, upload-time = "2025-10-28T17:36:22.966Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3a/3e8c01a4d742b730df368e063787c6808597ccb38636ed821d10b39ca51b/scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc", size = 23508512, upload-time = "2025-10-28T17:36:29.731Z" }, + { url = "https://files.pythonhosted.org/packages/1f/60/c45a12b98ad591536bfe5330cb3cfe1850d7570259303563b1721564d458/scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22", size = 33413639, upload-time = "2025-10-28T17:36:37.982Z" }, + { url = "https://files.pythonhosted.org/packages/71/bc/35957d88645476307e4839712642896689df442f3e53b0fa016ecf8a3357/scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc", size = 35704729, upload-time = "2025-10-28T17:36:46.547Z" }, + { url = "https://files.pythonhosted.org/packages/3b/15/89105e659041b1ca11c386e9995aefacd513a78493656e57789f9d9eab61/scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0", size = 36086251, upload-time = "2025-10-28T17:36:55.161Z" }, + { url = "https://files.pythonhosted.org/packages/1a/87/c0ea673ac9c6cc50b3da2196d860273bc7389aa69b64efa8493bdd25b093/scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800", size = 38716681, upload-time = "2025-10-28T17:37:04.1Z" }, + { url = "https://files.pythonhosted.org/packages/91/06/837893227b043fb9b0d13e4bd7586982d8136cb249ffb3492930dab905b8/scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d", size = 39358423, upload-time = "2025-10-28T17:38:20.005Z" }, + { url = "https://files.pythonhosted.org/packages/95/03/28bce0355e4d34a7c034727505a02d19548549e190bedd13a721e35380b7/scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f", size = 26135027, upload-time = "2025-10-28T17:38:24.966Z" }, + { url = "https://files.pythonhosted.org/packages/b2/6f/69f1e2b682efe9de8fe9f91040f0cd32f13cfccba690512ba4c582b0bc29/scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c", size = 37028379, upload-time = "2025-10-28T17:37:14.061Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2d/e826f31624a5ebbab1cd93d30fd74349914753076ed0593e1d56a98c4fb4/scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40", size = 29400052, upload-time = "2025-10-28T17:37:21.709Z" }, + { url = "https://files.pythonhosted.org/packages/69/27/d24feb80155f41fd1f156bf144e7e049b4e2b9dd06261a242905e3bc7a03/scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d", size = 21391183, upload-time = "2025-10-28T17:37:29.559Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d3/1b229e433074c5738a24277eca520a2319aac7465eea7310ea6ae0e98ae2/scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa", size = 23930174, upload-time = "2025-10-28T17:37:36.306Z" }, + { url = "https://files.pythonhosted.org/packages/16/9d/d9e148b0ec680c0f042581a2be79a28a7ab66c0c4946697f9e7553ead337/scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8", size = 33497852, upload-time = "2025-10-28T17:37:42.228Z" }, + { url = "https://files.pythonhosted.org/packages/2f/22/4e5f7561e4f98b7bea63cf3fd7934bff1e3182e9f1626b089a679914d5c8/scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353", size = 35798595, upload-time = "2025-10-28T17:37:48.102Z" }, + { url = "https://files.pythonhosted.org/packages/83/42/6644d714c179429fc7196857866f219fef25238319b650bb32dde7bf7a48/scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146", size = 36186269, upload-time = "2025-10-28T17:37:53.72Z" }, + { url = "https://files.pythonhosted.org/packages/ac/70/64b4d7ca92f9cf2e6fc6aaa2eecf80bb9b6b985043a9583f32f8177ea122/scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d", size = 38802779, upload-time = "2025-10-28T17:37:59.393Z" }, + { url = "https://files.pythonhosted.org/packages/61/82/8d0e39f62764cce5ffd5284131e109f07cf8955aef9ab8ed4e3aa5e30539/scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7", size = 39471128, upload-time = "2025-10-28T17:38:05.259Z" }, + { url = "https://files.pythonhosted.org/packages/64/47/a494741db7280eae6dc033510c319e34d42dd41b7ac0c7ead39354d1a2b5/scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562", size = 26464127, upload-time = "2025-10-28T17:38:11.34Z" }, ] [[package]] @@ -5581,15 +5629,15 @@ wheels = [ [[package]] name = "starlette" -version = "0.48.0" +version = "0.49.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a7/a5/d6f429d43394057b67a6b5bbe6eae2f77a6bf7459d961fdb224bf206eee6/starlette-0.48.0.tar.gz", hash = "sha256:7e8cee469a8ab2352911528110ce9088fdc6a37d9876926e73da7ce4aa4c7a46", size = 2652949, upload-time = "2025-09-13T08:41:05.699Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/3f/507c21db33b66fb027a332f2cb3abbbe924cc3a79ced12f01ed8645955c9/starlette-0.49.1.tar.gz", hash = "sha256:481a43b71e24ed8c43b11ea02f5353d77840e01480881b8cb5a26b8cae64a8cb", size = 2654703, upload-time = "2025-10-28T17:34:10.928Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" }, + { url = "https://files.pythonhosted.org/packages/51/da/545b75d420bb23b5d494b0517757b351963e974e79933f01e05c929f20a6/starlette-0.49.1-py3-none-any.whl", hash = "sha256:d92ce9f07e4a3caa3ac13a79523bd18e3bc0042bb8ff2d759a8e7dd0e1859875", size = 74175, upload-time = "2025-10-28T17:34:09.13Z" }, ] [[package]] @@ -6026,8 +6074,8 @@ wheels = [ [[package]] name = "transformer-engine" -version = "2.8.0+40c69e75" -source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.8#40c69e751a47ec87786283e125c5eb264101270f" } +version = "2.9.0+c4c185db" +source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9#c4c185dbec1aab3627ab2ecffbc4c429d31f23c0" } dependencies = [ { name = "einops" }, { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" } }, @@ -6174,7 +6222,7 @@ wheels = [ [[package]] name = "wandb" -version = "0.22.2" +version = "0.22.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -6188,17 +6236,17 @@ dependencies = [ { name = "sentry-sdk" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c1/a8/680bd77e11a278e6c14a2cb4646e8ab9525b2baaa81c3d12dc0f616aa4aa/wandb-0.22.2.tar.gz", hash = "sha256:510f5a1ac30d16921c36c3b932da852f046641d4aee98a86a7f5ec03a6e95bda", size = 41401439, upload-time = "2025-10-07T19:54:21.88Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/b3/8c637fb594cfd574ce9c9f7d0ac2f2d12742eb38ec59dcbb713beae95343/wandb-0.22.2-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2e29c9fa4462b5411b2cd2175ae33eff4309c91de7c426bca6bc8e7abc7e5dec", size = 18677549, upload-time = "2025-10-07T19:54:00.839Z" }, - { url = "https://files.pythonhosted.org/packages/d3/f3/e309a726eaebddad6b8d9a73a50891e5796962ec8a091bb6a61d31692d1e/wandb-0.22.2-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:c42d594cd7a9da4fd39ecdb0abbc081b61f304123277b2b6c4ba84283956fd21", size = 19715188, upload-time = "2025-10-07T19:54:03.805Z" }, - { url = "https://files.pythonhosted.org/packages/f9/73/fad59910215876008f4781b57d828d1b19b3677c9b46af615e7229746435/wandb-0.22.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5188d84e66d3fd584f3b3ae4d2a70e78f29403c0528e6aecaa4188a1fcf54d8", size = 18463148, upload-time = "2025-10-07T19:54:05.676Z" }, - { url = "https://files.pythonhosted.org/packages/87/11/572c1913b5b92e4c519f735adfae572b46f2d79d99ede63eec0d6a272d6e/wandb-0.22.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88ccd484af9f21cfc127976793c3cf66cfe1acd75bd8cd650086a64e88bac4bf", size = 19908645, upload-time = "2025-10-07T19:54:07.693Z" }, - { url = "https://files.pythonhosted.org/packages/6d/0d/133aa82f5a505ba638b4fda5014cefddfe7f1f6238ef4afc0871ec61c41f/wandb-0.22.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:abf0ed175e791af64110e0a0b99ce02bbbbd1017722bc32d3bc328efb86450cd", size = 18501348, upload-time = "2025-10-07T19:54:10.234Z" }, - { url = "https://files.pythonhosted.org/packages/d0/d5/776203be2601872f01dacc6a5b4274106ec0db7cd3bf2cdb3b741f8fc932/wandb-0.22.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:44e77c56403b90bf3473a7ca3bfc4d42c636b7c0e31a5fb9cd0382f08302f74b", size = 20001756, upload-time = "2025-10-07T19:54:12.452Z" }, - { url = "https://files.pythonhosted.org/packages/30/43/ae3fa46e20b1d9a6508dd9abe716d57205c038ed4661c5c98ace48a60eac/wandb-0.22.2-py3-none-win32.whl", hash = "sha256:44d12bd379dbe15be5ceed6bdf23803d42f648ba0dd111297b4c47a3c7be6dbd", size = 19075950, upload-time = "2025-10-07T19:54:14.892Z" }, - { url = "https://files.pythonhosted.org/packages/09/59/c174321e868205f7a659d1e5ec51f546e62267296d6f4179bb9119294964/wandb-0.22.2-py3-none-win_amd64.whl", hash = "sha256:c95eb221bf316c0872f7ac55071856b9f25f95a2de983ada48acf653ce259386", size = 19075953, upload-time = "2025-10-07T19:54:16.837Z" }, - { url = "https://files.pythonhosted.org/packages/7a/a2/c7c24fda78513cab5686949d8cb36459dbbccbbb4b2b6fc67237ece31a00/wandb-0.22.2-py3-none-win_arm64.whl", hash = "sha256:20d2ab9aa10445aab3d60914a980f002a4f66566e28b0cd156b1e462f0080a0d", size = 17383217, upload-time = "2025-10-07T19:54:19.384Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/c1/d1/6b70f365ed86bd69debba8ad55dec8606fc21006e7ca703a5a091bd3b719/wandb-0.22.3.tar.gz", hash = "sha256:04468a8ab2769a46f5e384c9c4ada5da0dced005ca689a8424e4b8b5cb2a0291", size = 44337368, upload-time = "2025-10-28T23:59:10.275Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/23/02/87fb60f587ec249f784a40bd91c30de1b2b24d691ee72675d5b66c3d0728/wandb-0.22.3-py3-none-macosx_12_0_arm64.whl", hash = "sha256:81b3b6e405f38342b0a080898b7d00c5b9375432f5ba358942a09e65cdcfe781", size = 18758047, upload-time = "2025-10-28T23:58:46.56Z" }, + { url = "https://files.pythonhosted.org/packages/26/88/64081740ef2b2efc7fbcb2139a07a849e42bcb09ae0c56ae50c41bd0ad63/wandb-0.22.3-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:d29c16817cca6401b4919069ec7570c781eacb67dc0b1ff2e0096a9a59581720", size = 19798011, upload-time = "2025-10-28T23:58:49.718Z" }, + { url = "https://files.pythonhosted.org/packages/19/72/c4f922b33dbb84d1c81ee045ff8791dd14e26d79e1e9bbafff964b7043e2/wandb-0.22.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb955d73a4ba55df9adc61fafbabef5556784d33fc39c7b5c8165d2694ddeb3b", size = 18542713, upload-time = "2025-10-28T23:58:51.927Z" }, + { url = "https://files.pythonhosted.org/packages/ad/98/3ce5f6e2086d91b0c51b38ae7ff591109e7da2bb25fe1a12eec0cdbaa494/wandb-0.22.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f3ebe41a26506117a098fdfd2706ed0e50b37899bfbefe3a0628fcbd70c69d", size = 19984910, upload-time = "2025-10-28T23:58:54.641Z" }, + { url = "https://files.pythonhosted.org/packages/5e/57/e68cb38427b60490d6ddf1b992e6c7f36be83be1079d291ce87a8d347f48/wandb-0.22.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2973462bed5d4a653b1a97cf9fc350673bb200fb356a2f4eba34beae9b87e0aa", size = 18581776, upload-time = "2025-10-28T23:58:56.975Z" }, + { url = "https://files.pythonhosted.org/packages/66/6d/543f907ce0c6b6da13628b23d19ca7282c559fd73eb47b04977b9a61d0c6/wandb-0.22.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c5c2bd18f95c1639863c527da0a5818ac6b0e5194f9c691426b265908ddd8b2c", size = 20078800, upload-time = "2025-10-28T23:58:59.217Z" }, + { url = "https://files.pythonhosted.org/packages/da/91/1decaf1a6ac2017481c782e0fad7f90bc9ae4057f3d76d478cb6527f3dd3/wandb-0.22.3-py3-none-win32.whl", hash = "sha256:09ca1edfe0fd6dc30447d368acddb825668e60ee705c98594a6bbfd30d34d47e", size = 19160297, upload-time = "2025-10-28T23:59:01.536Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ba/3b092634279994b0c79fe05220532822be09f3a353ae95c54e7142769db8/wandb-0.22.3-py3-none-win_amd64.whl", hash = "sha256:55403bf93872c9978433d101324f51e43e78c70c809bf6d06ca7b2760e39f497", size = 19160300, upload-time = "2025-10-28T23:59:04.06Z" }, + { url = "https://files.pythonhosted.org/packages/7f/80/4662fce9eebcc8c71f5083e9152ccaf7d43d4ca9c446e1422f9aa784a51c/wandb-0.22.3-py3-none-win_arm64.whl", hash = "sha256:49f66b05882abfa53816cc8d01b3c2435a89c5a090176802fa6928b5979d34d9", size = 17461959, upload-time = "2025-10-28T23:59:07.059Z" }, ] [[package]] From 0d0f29cd8a5f2f6c39786c979cea2b61fdda8626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Oct 2025 23:54:28 +0100 Subject: [PATCH 086/248] Ko3n1g/fix/golden values (#2037) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/stages/02.test.yml | 7 +++++++ ...weave.json => golden_values_dev_dgxh100_coreweave.json} | 0 ...ues_dev_eos.json => golden_values_dev_dgxh100_eos.json} | 0 3 files changed, 7 insertions(+) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/{golden_values_dev_coreweave.json => golden_values_dev_dgxh100_coreweave.json} (100%) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/{golden_values_dev_eos.json => golden_values_dev_dgxh100_eos.json} (100%) diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml index 2f018f94e66..33dd8d7a5fb 100644 --- a/.gitlab/stages/02.test.yml +++ b/.gitlab/stages/02.test.yml @@ -228,6 +228,13 @@ test:linting_docs_build: - mv megatron-lm/ documentation/ - cd documentation/ - ./repo docs + rules: + - if: $PUBLISH == "yes" + when: never + - if: $BUILD == "no" + when: never + - when: on_success + allow_failure: true # Override from template secret_detection: diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json From 1d1ac739c69180d3c7410064748f1005f789154d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 30 Oct 2025 18:57:52 -0500 Subject: [PATCH 087/248] cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` into `dev` (#2007) Signed-off-by: Charlie Truong Co-authored-by: Jack Chang Co-authored-by: jianbinc Co-authored-by: xuwenc --- .../distributed/fsdp/mcore_fsdp_adapter.py | 133 +++- megatron/core/distributed/fsdp/src/README.md | 11 + .../fsdp/src/megatron_fsdp/fully_shard.py | 10 +- .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 11 +- .../megatron_fsdp/param_and_grad_buffer.py | 83 ++- .../fsdp/src/megatron_fsdp/uneven_dtensor.py | 4 +- .../fsdp/src/megatron_fsdp/utils.py | 130 +++- .../embeddings/yarn_rotary_pos_embedding.py | 10 +- megatron/core/optimizer/__init__.py | 23 + megatron/core/optimizer/distrib_optimizer.py | 2 + .../transformer/fsdp_dtensor_checkpoint.py | 336 ++++++++-- megatron/training/arguments.py | 4 + megatron/training/checkpointing.py | 74 ++- megatron/training/training.py | 1 + .../golden_values_dev_dgxh100_coreweave.json | 598 ++++++++--------- .../golden_values_dev_dgxh100_coreweave.json | 600 +++++++++--------- .../golden_values_dev_dgxh100_eos.json | 600 +++++++++--------- .../golden_values_dev_dgxh100_coreweave.json | 500 +++++++-------- .../golden_values_dev_dgx_h100.json | 143 ++++- .../golden_values_dev_dgxh100_coreweave.json | 537 ++++++++++++++++ .../model_config.yaml | 2 +- .../golden_values_dev_dgxh100_coreweave.json | 478 +++++++------- .../golden_values_dev_dgxh100_eos.json | 478 +++++++------- tests/test_utils/recipes/moe.yaml | 15 +- tools/checkpoint/checkpoint_inspector.py | 362 +++++++++-- 25 files changed, 3302 insertions(+), 1843 deletions(-) create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index a7c0d5802ab..7432a7f9a36 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import random from typing import List, Optional try: @@ -22,6 +23,7 @@ except ImportError: HAVE_EINOPS = False +import numpy as np import torch import torch.distributed as dist @@ -32,10 +34,11 @@ except ImportError: HAVE_DTENSOR = False -from megatron.core import parallel_state +from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk from megatron.core.distributed.data_parallel_base import _BaseDataParallel from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig +from megatron.core.extensions.transformer_engine import TELinear from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer @@ -95,6 +98,8 @@ def __init__( else: self.fsdp_unit_modules = [] + self._fix_tensor_parallel_attributes(module) + super().__init__( config=config, module=MegatronFSDP( @@ -119,6 +124,8 @@ def __init__( self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict + self.sync_rng_states_across_tp_group() + def load_state_dict(self, state_dict, strict=True): """ Load the state dictionary into the module. @@ -141,6 +148,44 @@ def load_state_dict(self, state_dict, strict=True): self.module.load_state_dict(custom_state_dict, strict=strict) + def _fix_tensor_parallel_attributes(self, module): + is_expert_param = lambda n, p: ".experts." in n + is_router_param = lambda n, p: ".router.weight" in n + + if parallel_state.get_tensor_model_parallel_group(): + tp_size = parallel_state.get_tensor_model_parallel_group().size() + else: + tp_size = 1 + + if parallel_state.get_expert_tensor_parallel_group(): + expt_tp_size = parallel_state.get_expert_tensor_parallel_group().size() + else: + expt_tp_size = 1 + + param_to_direct_module = {} + for name, m in module.named_modules(): + for p in m.parameters(recurse=False): + param_to_direct_module[p] = (name, m) + + for name, param in module.named_parameters(): + if is_expert_param(name, param) and expt_tp_size > 1: + setattr(param, "_mcore_tp", True) + if "linear_fc1.weight" in name: + setattr(param, "_tp_partition_dim", 0) + elif "linear_fc2.weight" in name: + setattr(param, "_tp_partition_dim", 1) + + if not is_expert_param(name, param) and tp_size > 1: + m_name, direct_module = param_to_direct_module[param] + if isinstance(direct_module, (TELinear,)): + parallel_mode = getattr(direct_module, "parallel_mode", None) + if parallel_mode is None: + setattr(param, "_mcore_tp", True) + setattr(param, "_tp_duplicated", True) + elif is_router_param(name, param): + setattr(param, "_mcore_tp", True) + setattr(param, "_tp_duplicated", True) + def _init_dist_index(self, pg_collection): """ Initialize the distributed index for the module. @@ -154,6 +199,7 @@ def _init_dist_index(self, pg_collection): enable_hsdp = self.ddp_config.num_distributed_optimizer_instances > 1 if pg_collection is None: tp_group = parallel_state.get_tensor_model_parallel_group() + expt_tp_group = parallel_state.get_expert_tensor_parallel_group() if enable_hsdp: dp_cp_group = parallel_state.get_data_parallel_group( with_context_parallel=True, partial_data_parallel=True @@ -168,8 +214,11 @@ def _init_dist_index(self, pg_collection): ) outer_fsdp_group = None hybrid_fsdp_group = None + expt_dp_group = parallel_state.get_expert_data_parallel_group() + ep_group = parallel_state.get_expert_model_parallel_group() else: tp_group = getattr(pg_collection, 'tp', None) + expt_tp_group = getattr(pg_collection, 'expt_tp', None) if enable_hsdp: dp_cp_group = pg_collection.intra_dp_cp outer_fsdp_group = pg_collection.inter_dist_opt @@ -178,11 +227,17 @@ def _init_dist_index(self, pg_collection): dp_cp_group = pg_collection.dp_cp outer_fsdp_group = None hybrid_fsdp_group = None + expt_dp_group = getattr(pg_collection, 'expt_dp', None) + ep_group = getattr(pg_collection, 'ep', None) if tp_group is None: single_rank_group = dist.new_group(ranks=[dist.get_rank()]) tp_group = single_rank_group + if expt_tp_group is None: + single_rank_group = dist.new_group(ranks=[dist.get_rank()]) + expt_tp_group = single_rank_group + if enable_hsdp: mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( @@ -199,6 +254,17 @@ def _init_dist_index(self, pg_collection): hybrid_fsdp_group=hybrid_fsdp_group, ) else: + if ep_group is not None: + expt_mesh = _get_dp_tp_mesh(expt_dp_group, expt_tp_group, ep_size=ep_group.size()) + expt_device_mesh = DeviceMesh.from_group( + [expt_dp_group, expt_tp_group], + device_type="cuda", + mesh=expt_mesh.tolist(), + mesh_dim_names=["dp_cp", "tp"], + ) + else: + expt_device_mesh = None + mesh = _get_dp_tp_mesh(dp_cp_group, tp_group) dist_index = FSDPDistributedIndex( device_mesh=DeviceMesh.from_group( @@ -209,8 +275,11 @@ def _init_dist_index(self, pg_collection): ), dp_shard_dim="dp_cp", tp_dim="tp", + expt_device_mesh=expt_device_mesh, ) + self.tp_group = tp_group + return dist_index def stop_communication(self): @@ -220,6 +289,20 @@ def stop_communication(self): self.module.synchronize_gradient_reduce() self.module.synchronize_param_gather() + def sync_rng_states_across_tp_group(self): + """ + Synchronize the tensor parallel random number generator states. + """ + if self.tp_group.size() <= 1: + return + + if self.tp_group.rank() == 0: + broadcast_list = [_get_rng_state_dict()] + else: + broadcast_list = [None] + torch.distributed.broadcast_object_list(broadcast_list, group=self.tp_group, group_src=0) + _load_rng_state_dict(broadcast_list[0]) + def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." @@ -273,29 +356,46 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group): return mesh -def _get_dp_tp_mesh(dp_cp_group, tp_group): +def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1): assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`." world_size = dist.get_world_size() tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1 - # TODO: Supports configurable (dp, cp, tp) order. - mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size) + # TODO: Supports configurable (dp, cp, ep, tp) order. + mesh = einops.rearrange( + torch.arange(world_size), + "(dp_cp ep tp) -> ep dp_cp tp", + dp_cp=dp_cp_group.size(), + tp=tp_size, + ep=ep_size, + ) - mesh_dp_ranks = einops.rearrange(mesh, 'dp_cp tp -> tp dp_cp', tp=tp_size) + mesh_dp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (ep tp) dp_cp', dp_cp=dp_cp_group.size()) dp_cp_group_ranks = dist.get_process_group_ranks(dp_cp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_dp_ranks, dp_cp_group_ranks), ( f"[Megatron-FSDP] Data Parallel ranks in the mesh {mesh_dp_ranks} " f"do not match the ranks in the DP group {dp_cp_group_ranks}." ) - mesh_tp_ranks = einops.rearrange(mesh, 'dp_cp tp -> (dp_cp) tp', tp=tp_size) + mesh_tp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (dp_cp ep) tp', tp=tp_size) tp_group_ranks = dist.get_process_group_ranks(tp_group) assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), ( f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} " f"do not match the ranks in the TP group {tp_group_ranks}." ) - return mesh + # Exclude the expert parallel dimension + rank = dist.get_rank() + dp_tp_meshes = [per_ep_mesh for per_ep_mesh in mesh if rank in per_ep_mesh.reshape(-1).tolist()] + assert ( + len(dp_tp_meshes) == 1 + ), f"[Megatron-FSDP] Current rank {rank} is not unique in the mesh ranks {mesh.tolist()}." + assert len(dp_tp_meshes[0].reshape(-1).tolist()) == dp_cp_group.size() * tp_group.size(), ( + f"[Megatron-FSDP] DP-TP mesh size {len(dp_tp_meshes[0].reshape(-1).tolist())} " + f"does not match expected size {dp_cp_group.size() * tp_group.size()}." + ) + + return dp_tp_meshes[0] def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): @@ -310,3 +410,22 @@ def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks): f"{mesh_ranks.tolist()} does not match the group ranks {group_ranks}." ) return sorted(current_ranks[0]) == sorted(group_ranks) + + +def _get_rng_state_dict(): + rng_state_dict = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'cuda_rng_state': torch.cuda.get_rng_state(), + 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(), + } + return rng_state_dict + + +def _load_rng_state_dict(rng_state_dict): + random.setstate(rng_state_dict['random_rng_state']) + np.random.set_state(rng_state_dict['np_rng_state']) + torch.set_rng_state(rng_state_dict['torch_rng_state']) + torch.cuda.set_rng_state(rng_state_dict['cuda_rng_state']) + tensor_parallel.get_cuda_rng_tracker().set_states(rng_state_dict['rng_tracker_states']) diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index d879c6c26f8..9e036f22f67 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -127,6 +127,12 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp") # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group. device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp") hsdp_group = device_mesh["hsdp"].get_group() +# Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP. +expert_device_mesh = torch.distributed.device_mesh.init_device_mesh( + "cuda", + mesh_shape=(expt_dp_shard_size, expt_tp_size), + mesh_dim_names=("dp_shard", "tp"), +) # Fully-shards your model and distributes your optimizer. model, optimizer = fully_shard( @@ -145,6 +151,8 @@ model, optimizer = fully_shard( tp_dim="tp", # Only required when using HSDP. Otherwise, set this to None. hybrid_fsdp_group=hsdp_group, + # Only required for FSDP + EP. Otherwise, set this to None. + expt_device_mesh=expt_device_mesh, # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3) zero_dp_strategy=3, outer_dp_sharding_strategy=1, @@ -192,6 +200,9 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"]) - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP. - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053). - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded coordinate system for the weight and gradient buffers. Required for HSDP. +- `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`. + - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP). + - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP. - `init_model_with_meta_device` has `MegatronFSDP` initialize your `meta`-device model in shards on every CUDA device to avoid OOM when initializing extremely large models that cannot fit on a single device. Users can initialize their model on a [`meta`-device](https://docs.pytorch.org/docs/stable/meta.html) (`with torch.device('meta'): ...`), and ``MegatronFSDP`` will further shard and initialize the model parameters layer-by-layer adhering to the customizable `module.reset_parameters` method, which prevents the entire model from being allocated in memory at any point during runtime. - Defaults to `False`. - Note that the `device` argument which installs your model on a specific device or rank will be deactivated when `init_model_with_meta_device=True`. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py index 24e86cede72..e98362a1a03 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py @@ -64,6 +64,7 @@ def fully_shard_model( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -183,8 +184,10 @@ def fully_shard_model( tp_dim=tp_dim, # Only required for HSDP. hybrid_fsdp_group=hybrid_fsdp_group, - # Access to flattened DP rank assignments for HFSDP. + # Access to flattened DP rank assignments for HSDP. hsdp_outer_dp_shard=_outer_fsdp_sharding, + # Only required for Megatron-FSDP + EP. + expt_device_mesh=expt_device_mesh, ) # Wrap model in Megatron FSDP. @@ -330,6 +333,7 @@ def fully_shard( dp_outer_dim: Optional[str] = None, tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, + expt_device_mesh: Optional[DeviceMesh] = None, fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None, zero_dp_strategy: str | int = 3, outer_dp_sharding_strategy: str | int = 0, @@ -391,6 +395,9 @@ def fully_shard( by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None. + expt_device_mesh (Optional[DeviceMesh]): + Expert parallel device mesh object defining the topology for MoE distributed training. + fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]): List of (sub-)module classes or (sub-)module class import paths that are "units", which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP. @@ -503,6 +510,7 @@ def fully_shard( dp_outer_dim=dp_outer_dim, tp_dim=tp_dim, hybrid_fsdp_group=hybrid_fsdp_group, + expt_device_mesh=expt_device_mesh, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=zero_dp_strategy, outer_dp_sharding_strategy=outer_dp_sharding_strategy, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 10a8ae14d65..d6ef5f6210e 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -235,7 +235,10 @@ def __init__( self.dist_index = dist_index # If Megatron Expert Parallelism is enabled, you need to provide an expt_dp_group. - if has_expert_parameters and self.dist_index.get_expert_dp_group() is None: + if ( + has_expert_parameters + and self.dist_index.get_fsdp_group(is_expert_parallel=True) is None + ): raise ValueError( "[Megatron-FSDP] Megatron Expert Parallelism is enabled, but no expt_dp_group is" "provided." @@ -353,9 +356,7 @@ def _init_fsdp_param_and_grad_buffer(self): ) # Set the suggested communication unit size for reduce-scatter and all-gather pipelines. - suggested_communication_unit_size = ( - self.ddp_config.suggested_communication_unit_size or 1_000_000_000 - ) + suggested_communication_unit_size = self.ddp_config.suggested_communication_unit_size if suggested_communication_unit_size is None: if self.data_parallel_sharding_strategy == "optim_grads_params": total_param_elements = 0 @@ -370,6 +371,8 @@ def _init_fsdp_param_and_grad_buffer(self): suggested_communication_unit_size = total_param_elements // total_fsdp_module * 2 elif self.bucket_size is not None: suggested_communication_unit_size = self.bucket_size + else: + suggested_communication_unit_size = 1_000_000_000 # Cap to 1B elements. suggested_communication_unit_size = max( diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index c8116150d52..bdf480d867b 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -34,7 +34,14 @@ from torch.distributed.tensor.device_mesh import _mesh_resources from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor -from .utils import _MODEL_PARALLEL_RNG_TRACKER_NAME, FSDPDistributedIndex, get_global_memory_buffer +from .utils import ( + _MODEL_PARALLEL_RNG_TRACKER_NAME, + FSDPDistributedIndex, + get_global_memory_buffer, + get_mcore_tensor_parallel_partition_dim, + is_mcore_tensor_model_parallel, + is_mcore_tensor_parallel_duplicated, +) logger = logging.getLogger(__name__) @@ -1299,7 +1306,7 @@ def _does_param_require_new_bucket(param): and policy.data_parallel_sharding_strategy != "no_shard" ) - is_expert_parameter = lambda p: not getattr(p, "allreduce", True) + is_expert_parameter = lambda n, p: ".experts." in n # Step 1: Group the parameters according to their execution order and attributes. # FSDP unit module parameters are split into multiple parameter sub-groups. @@ -1313,7 +1320,7 @@ def _does_param_require_new_bucket(param): if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) else param.dtype ), - is_expert_param=is_expert_parameter(param), + is_expert_param=is_expert_parameter(name, param), requires_grad=param.requires_grad, fsdp_unit_id=None, ) @@ -2257,6 +2264,10 @@ def _reset_parameters(self, old_params, new_params): self.param_to_direct_module[new_param] = self.param_to_direct_module[old_param] del self.param_to_direct_module[old_param] + for tp_attr in ["_mcore_tp", "_tp_partition_dim", "_tp_duplicated"]: + if getattr(old_param, tp_attr, None) is not None: + setattr(new_param, tp_attr, getattr(old_param, tp_attr)) + for item_id, p in enumerate(self.params): if p in param_map: new_p = param_map[p] @@ -2340,6 +2351,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param elif wbuf: @@ -2351,6 +2363,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param else: @@ -2365,6 +2378,7 @@ def _init_distributed_params(self): is_expert_param=pg.is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=False, + force_sync_tp_duplicated_param=True, ) dist_main_weight[param_name] = dist_param @@ -2399,6 +2413,9 @@ def set_param_attribute(): "partition_dim", "partition_stride", "is_embedding_or_output_parameter", + "_mcore_tp", + "_tp_duplicated", + "_tp_partition_dim", ]: if hasattr(orig_param, attr_name): setattr(param, attr_name, getattr(orig_param, attr_name)) @@ -3546,7 +3563,9 @@ def to_local_if_dtensor(tensor): return tensor -def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_param): +def _get_fsdp_tensor_spec( + param, dist_index: FSDPDistributedIndex, is_sharded_param, is_expert_param +): """ Get the DeviceMesh for the parameter and modify the placement for Megatron-FSDP. """ @@ -3557,7 +3576,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa dtensor_mesh = getattr(dtensor_spec, "mesh", None) # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh. - megatron_fsdp_global_mesh = dist_index.get_root_mesh() + megatron_fsdp_global_mesh = dist_index.get_root_mesh(is_expert_parallel=is_expert_param) dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh) # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh: # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh @@ -3602,7 +3621,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa placements = [Shard(0), dtensor_placement] shard_order = [1, 0] - device_mesh = dist_index.get_submesh(mesh_dim_names) + device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3627,7 +3646,7 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa else: placements = [Shard(0)] - device_mesh = dist_index.get_submesh(mesh_dim_names) + device_mesh = dist_index.get_submesh(mesh_dim_names, is_expert_parallel=is_expert_param) if shard_order is not None: setattr(device_mesh, "_shard_order", shard_order) @@ -3642,6 +3661,7 @@ def make_fsdp_dtensor( is_expert_param: bool = False, run_check: bool = False, update_uneven_dtensor_chunk_meta: bool = False, + force_sync_tp_duplicated_param: bool = False, ): """ Creates a distributed tensor (DTensor) from a local tensor with support for @@ -3720,38 +3740,39 @@ def make_fsdp_dtensor( orig_param = param # Handle tensor model parallel specific logic - if getattr(param, "tensor_model_parallel", False): + if is_mcore_tensor_model_parallel(param): # Ensure parameter is not already a DTensor assert not isinstance(param, DTensor), ( - "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " - "is True. Check usage." + "[Megatron-FSDP] Parameter is already a DTensor, yet tensor_model_parallel " "is True." ) - # Validate M-Core TP attributes - assert hasattr( - param, "partition_dim" - ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_dim'." - assert hasattr( - param, "partition_stride" - ), "[Megatron-FSDP] tensor_model_parallel param missing 'partition_stride'." - assert ( - param.partition_stride == 1 - ), "[Megatron-FSDP] Only partition_stride=1 is currently supported for " - "tensor_model_parallel." - - tp_dim = param.partition_dim - tp_mesh = dist_index.get_submesh(dist_index.tp_dim) - - # Adjust shape for global dimension + tp_mesh = dist_index.get_submesh(dist_index.tp_dim, is_expert_parallel=is_expert_param) + global_shape = list(param.shape) if tp_mesh.mesh.numel() > 1: - global_shape = list(param.shape) - global_shape[tp_dim] *= tp_mesh.mesh.numel() + if is_mcore_tensor_parallel_duplicated(param): + placements = [Replicate()] + if force_sync_tp_duplicated_param: + if local_tensor.numel() > 0: + torch.distributed.broadcast( + local_tensor, group=tp_mesh.get_group(), group_src=0 + ) + elif run_check: + # TODO: Implement consistency check for duplicated TP parameters + pass + else: + tp_dim = get_mcore_tensor_parallel_partition_dim(param) + assert tp_dim is not None, ( + "[Megatron-FSDP] Parameter is not tensor model parallel, " + "yet tensor_model_parallel is True." + ) + placements = [Shard(tp_dim)] + global_shape[tp_dim] *= tp_mesh.mesh.numel() # Construct TP-sharded DTensor using Megatron-style placement param = DTensor.from_local( - local_tensor=param, + local_tensor=local_tensor, device_mesh=tp_mesh, - placements=[Shard(tp_dim)], + placements=placements, run_check=run_check, shape=global_shape, stride=torch.empty(global_shape).stride(), @@ -3759,7 +3780,7 @@ def make_fsdp_dtensor( # Get FSDP-configured mesh and placements from provided param device_mesh, placements = _get_fsdp_tensor_spec( - param, dist_index, is_sharded_param=is_sharded_param + param, dist_index, is_sharded_param=is_sharded_param, is_expert_param=is_expert_param ) # Reshape local tensor for sharded layouts beyond 1D diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py index 523d8fae333..490d80c0f21 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py @@ -365,7 +365,9 @@ def _assemble_full_tensor_from_uneven_chunks( # Wrap into a replicated DTensor and return return DTensor.from_local( - full_tensor, placements=[Replicate()], device_mesh=dtensor.device_mesh + full_tensor, + placements=[Replicate()] * len(dtensor.placements), + device_mesh=dtensor.device_mesh, ) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index 1dfe08b90f4..b94a332bb0d 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -675,6 +675,7 @@ def __init__( tp_dim: Optional[str] = None, hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None, hsdp_outer_dp_shard: bool = False, + expt_device_mesh: Optional[DeviceMesh] = None, ): """ Args: @@ -691,6 +692,8 @@ def __init__( in hybrid FSDP. Specifying outer sharding will lift the bucket sharding coordinate system to flattened ranks of (dp_shard, dp_outer) instead of just sharding across dp_shard ranks and replicating across dp_outer ranks. + expt_device_mesh (Optional[DeviceMesh]): The expert parallel device mesh + to use for the DistributedIndex. """ # Device mesh arguments. self.device_mesh = device_mesh @@ -701,6 +704,11 @@ def __init__( self.use_hybrid_fsdp = dp_outer_dim is not None # Helper flag to denote if we are outer-sharding in hybrid FSDP. self.hsdp_outer_dp_shard = hsdp_outer_dp_shard + self.expt_device_mesh = expt_device_mesh + + # Handling the situation where M-Core MoE EP=1 + if self.expt_device_mesh is None: + self.expt_device_mesh = device_mesh # Hybrid FSDP Process Groups # Retrieve the FSDP process group from the DeviceMesh. @@ -719,6 +727,14 @@ def __init__( # combination of the outer-FSDP and FSDP process groups. self.hybrid_fsdp_group = hybrid_fsdp_group + # Retrieve the expert parallel process groups from the DeviceMesh. + self.expt_fsdp_group = ( + self.expt_device_mesh[self.dp_shard_dim].get_group() + if self.expt_device_mesh is not None + and contains_submesh(self.expt_device_mesh, self.dp_shard_dim) + else None + ) + """ Store a persistent reference to the core device meshes that back Megatron-FSDP. This is necessary because _MeshEnv (_mesh_resources) may not persist: @@ -732,26 +748,33 @@ def __init__( FIXME(@cspades): Identify the root cause of this behavior. """ self.mesh_library = {} - # TP Mesh + + def register_submesh(device_mesh, submesh, is_expert_parallel): + """Register a submesh with identifier: (*submesh, is_expert_parallel) + in the mesh library.""" + if contains_submesh(device_mesh, submesh): + submesh_identifier = tuple(list(submesh) + [is_expert_parallel]) + self.mesh_library[submesh_identifier] = device_mesh[submesh] + + # Define common submesh patterns tp_submesh = (self.tp_dim,) - if contains_submesh(self.device_mesh, tp_submesh): - self.mesh_library[tp_submesh] = self.device_mesh[tp_submesh] - # HSDP-TP Mesh hsdp_tp_submesh = (self.dp_outer_dim, self.dp_shard_dim, self.tp_dim) - if contains_submesh(self.device_mesh, hsdp_tp_submesh): - self.mesh_library[hsdp_tp_submesh] = self.device_mesh[hsdp_tp_submesh] - # FSDP-TP Mesh fsdp_tp_submesh = (self.dp_shard_dim, self.tp_dim) - if contains_submesh(self.device_mesh, fsdp_tp_submesh): - self.mesh_library[fsdp_tp_submesh] = self.device_mesh[fsdp_tp_submesh] - # HSDP Mesh hsdp_submesh = (self.dp_outer_dim, self.dp_shard_dim) - if contains_submesh(self.device_mesh, hsdp_submesh): - self.mesh_library[hsdp_submesh] = self.device_mesh[hsdp_submesh] - # FSDP Mesh fsdp_submesh = (self.dp_shard_dim,) - if contains_submesh(self.device_mesh, fsdp_submesh): - self.mesh_library[fsdp_submesh] = self.device_mesh[fsdp_submesh] + + # Register non-EP submeshes + register_submesh(self.device_mesh, tp_submesh, False) + register_submesh(self.device_mesh, hsdp_tp_submesh, False) + register_submesh(self.device_mesh, fsdp_tp_submesh, False) + register_submesh(self.device_mesh, hsdp_submesh, False) + register_submesh(self.device_mesh, fsdp_submesh, False) + + # Register EP submeshes + if self.expt_device_mesh is not None: + register_submesh(self.expt_device_mesh, tp_submesh, True) + register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True) + register_submesh(self.expt_device_mesh, fsdp_submesh, True) # Validate FSDP arguments. if self.fsdp_group is None: @@ -776,36 +799,54 @@ def __init__( "process groups or sub-meshes." ) - def get_submesh(self, mesh_dim_names: str | Sequence[str]) -> DeviceMesh: + def get_submesh( + self, mesh_dim_names: str | Sequence[str], is_expert_parallel: bool = False + ) -> DeviceMesh: """ - Retrieve an Megatron-FSDP-registered sub-mesh by name(s). + Retrieve an Megatron-FSDP-registered submesh by name(s). """ if isinstance(mesh_dim_names, str): mesh_dim_names = (mesh_dim_names,) - # Search for the sub-mesh in the mesh library. - device_submesh = self.mesh_library.get(tuple(mesh_dim_names), None) + + # Construct submesh identifier: (*mesh_dim_names, is_expert_parallel) + submesh_identifier = tuple(list(mesh_dim_names) + [is_expert_parallel]) + + # Retrieve the submesh from the mesh library + device_submesh = self.mesh_library.get(submesh_identifier, None) + if device_submesh is None: - if self.tp_dim is None: - # Warn about not specifying tp_dim for - # layers or frameworks that depend on this. + # Warn about not specifying tp_dim for layers or frameworks that depend on this. + if self.tp_dim is None and not is_expert_parallel: logger.warning( - "[FSDPDistributedIndex] Note: For TransformerEngine, or other machine learning " - "frameworks like Megatron that assume TP=1, you must specify tp_dim to use " - "Megatron-FSDP. Create a trivial TP dimension by setting the TP dimension size " + "[FSDPDistributedIndex] Note: For TransformerEngine, or " + "other machine learning frameworks like Megatron that assume " + "TP=1, you must specify tp_dim to use Megatron-FSDP. " + "Create a trivial TP dimension by setting the TP dimension size " "to 1 in the DeviceMesh.\n" f"DeviceMesh: {self.device_mesh}" ) + elif self.tp_dim is None and is_expert_parallel: + logger.warning( + "[FSDPDistributedIndex] Note: For TransformerEngine, or " + "other machine learning frameworks like Megatron that assume " + "ETP=1, you must specify tp_dim to use Megatron-FSDP. " + "Create a trivial ETP dimension by setting the ETP dimension size " + "to 1 in the DeviceMesh.\n" + f"DeviceMesh: {self.expt_device_mesh}" + ) + raise ValueError( - f"[FSDPDistributedIndex][get_submesh] No sub-mesh with " - f"mesh_dim_names={mesh_dim_names} has been registered with Megatron-FSDP." + f"[FSDPDistributedIndex][get_submesh] No submesh with " + f"mesh_dim_names={mesh_dim_names}, is_expert_parallel={is_expert_parallel} " + f"has been registered with Megatron-FSDP." ) + return device_submesh def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the data parallel process group.""" if is_expert_parallel: - # Expert parallel is not supported - return None + return self.expt_fsdp_group if self.use_hybrid_fsdp: return self.hybrid_fsdp_group return self.fsdp_group @@ -813,8 +854,7 @@ def get_dp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: def get_fsdp_group(self, is_expert_parallel: bool = False) -> ProcessGroup: """Get the FSDP process group.""" if is_expert_parallel: - # Expert parallel is not supported - return None + return self.expt_fsdp_group return self.fsdp_group def get_outer_fsdp_group(self) -> ProcessGroup: @@ -826,7 +866,7 @@ def get_outer_fsdp_group(self) -> ProcessGroup: def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh: """Get the device mesh.""" if is_expert_parallel: - raise NotImplementedError("Expert parallel is not supported in Megatron-FSDP.") + return self.expt_device_mesh return self.device_mesh def get_logical_hybrid_fsdp_rank(self): @@ -924,3 +964,29 @@ def create_updated_function_signature(original_function, **extended_kwargs: dict # Return the updated function signature. return inspect.Signature(params) + + +def is_mcore_tensor_model_parallel(param: torch.Tensor) -> bool: + """ + Check if the given parameter is Megatron-Core tensor model parallel. + """ + return getattr(param, "_mcore_tp", False) or getattr(param, "tensor_model_parallel", False) + + +def is_mcore_tensor_parallel_duplicated(param: torch.Tensor) -> bool: + """ + Check if the given parameter is Megatron-Core tensor model parallel and duplicated. + """ + return getattr(param, "_tp_duplicated", False) + + +def get_mcore_tensor_parallel_partition_dim(param: torch.Tensor) -> Optional[int]: + """ + Get the partition dimension for a Megatron-Core tensor model parallel parameter. + """ + if is_mcore_tensor_model_parallel(param): + if hasattr(param, "_tp_partition_dim"): + return param._tp_partition_dim + else: + return param.partition_dim + return None diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index 507472f789f..455a7757d28 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -130,9 +130,9 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) - self.original_max_position_embeddings, self.correction_range_round_to_int, ) - inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( - device=self.inv_freq_extra.device, dtype=torch.float32 - ) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask( + low, high, self.dim // 2, device=self.inv_freq_extra.device + ).to(dtype=torch.float32) inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask seq = ( @@ -211,11 +211,11 @@ def _yarn_find_correction_range( return max(low, 0), min(high, dim - 1) # Clamp values just in case -def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: +def _yarn_linear_ramp_mask(min: float, max: float, dim: int, device: torch.device) -> Tensor: if min == max: max += 0.001 # Prevent singularity - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + linear_func = (torch.arange(dim, dtype=torch.float32, device=device) - min) / (max - min) ramp_func = torch.clamp(linear_func, 0, 1) return ramp_func diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 307538fad22..c254b2f6882 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -34,6 +34,7 @@ from megatron.core import parallel_state from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.fsdp_dtensor_checkpoint import get_global_unique_param_name from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer from ..transformer.module import MegatronModule @@ -481,6 +482,7 @@ def get_megatron_optimizer( use_gloo_process_groups: bool = True, default_skip_embedding_weight_decay: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, + dump_param_to_param_group_map: Optional[str] = None, ) -> MegatronOptimizer: """Retrieve the Megatron optimizer for model chunks. @@ -502,6 +504,7 @@ def get_megatron_optimizer( This is useful if you do not want embeddings to shrink to zero in training as recommended in https://arxiv.org/abs/2312.16903 pg_collection: Optional unified process group for distributed training. + dump_param_to_param_group_map (Optional[str]): path to dump parameter to param group map. Returns: Instance of MegatronOptimizer. @@ -579,6 +582,9 @@ def get_megatron_optimizer( return ChainedOptimizer(optimizers) + if dump_param_to_param_group_map is not None: + param_to_param_group = {} + param_group_id = 0 for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip( all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags ): @@ -597,6 +603,12 @@ def get_megatron_optimizer( model_chunk.overlap_param_gather_with_optimizer_step = ( overlap_param_gather_with_optimizer_step ) + if dump_param_to_param_group_map is not None: + for param_group in param_groups: + for param in param_group["params"]: + param_name = get_global_unique_param_name(model_chunks, param) + param_to_param_group[param_name] = param_group_id + param_group_id += 1 # Pass Gloo process groups into optimizer only if needed. optimizers.append( @@ -626,6 +638,12 @@ def get_megatron_optimizer( buffer_name='expert_parallel_buffers', default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) + if dump_param_to_param_group_map is not None: + for param_group in moe_param_groups: + for param in param_group["params"]: + param_name = get_global_unique_param_name(model_chunks, param) + param_to_param_group[param_name] = param_group_id + param_group_id += 1 if len(moe_param_groups) > 0: expt_model_parallel_rank = get_pg_rank(expt_tp_pp_group) # Pass Gloo process groups into optimizer only if needed. @@ -648,4 +666,9 @@ def get_megatron_optimizer( ) ) + if dump_param_to_param_group_map is not None: + torch.distributed.checkpoint.save( + state_dict=param_to_param_group, checkpoint_id=dump_param_to_param_group_map + ) + return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 2925edcce60..8b4740516e2 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -47,6 +47,7 @@ from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard +from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys @@ -1152,6 +1153,7 @@ def _param_name(self, param: torch.nn.Parameter) -> str: "Ensure that each model chunk has unique parameter names." ) name_to_param.update(_name_to_param) + name_to_param = handle_experts_in_state_dict(name_to_param) self.param_to_name = {param: name for name, param in name_to_param.items()} assert ( param in self.param_to_name diff --git a/megatron/core/transformer/fsdp_dtensor_checkpoint.py b/megatron/core/transformer/fsdp_dtensor_checkpoint.py index dad1947a183..9ef3f1f1b82 100644 --- a/megatron/core/transformer/fsdp_dtensor_checkpoint.py +++ b/megatron/core/transformer/fsdp_dtensor_checkpoint.py @@ -12,18 +12,160 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +import re + import torch +import torch.distributed as dist +from torch.distributed.checkpoint import default_planner + +logger = logging.getLogger(__name__) try: + from torch.distributed import DeviceMesh + from torch.distributed._tensor import DTensor + from torch.distributed.checkpoint.metadata import TensorStorageMetadata + from torch.distributed.tensor.placement_types import Replicate, Shard + from megatron.core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer import ( make_fsdp_dtensor, ) + from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import ( + gather_uneven_dtensor_to_full_tensor, + ) + from megatron.core.distributed.fsdp.src.megatron_fsdp.utils import ( + get_mcore_tensor_parallel_partition_dim, + is_mcore_tensor_model_parallel, + ) HAVE_MEGATRON_FSDP = True except ImportError: HAVE_MEGATRON_FSDP = False +from megatron.core import parallel_state from megatron.core.tensor_parallel.layers import copy_tensor_model_parallel_attributes +from megatron.core.transformer.transformer_layer import TransformerLayer + + +def get_ep_layer_offset(): + """ + Get the expert layer offset for the current model. + """ + from megatron.training.global_vars import get_args + + args = get_args() + ep_size = parallel_state.get_expert_model_parallel_world_size() + ep_rank = parallel_state.get_expert_model_parallel_rank() + num_local_experts = args.num_experts // ep_size if args.num_experts else 0 + local_expert_offset = ep_rank * num_local_experts + + return local_expert_offset + + +def get_total_num_experts(): + """ + Get the total number of experts for the current model. + """ + from megatron.training.global_vars import get_args + + args = get_args() + return args.num_experts if args.num_experts else 0 + + +def get_expert_index_from_key(key): + """Extract expert index from various expert key formats. + + Supported formats: + - GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + - SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', + 'mlp.experts.local_experts.0.linear_fc2.weight' + + Returns: + int: Expert index if found, None otherwise. + """ + # GroupedMLP: index is at the end after 'weight' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + m = re.search(r'^.*\.mlp\.experts\.linear_fc\d\.weight(\d+)', key) + assert m, f"Failed to parse expert index from key: {key}" + return int(m.group(1)) + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + m = re.search(r'^.*\.mlp\.experts\.local_experts\.(\d+)', key) + assert m, f"Failed to parse expert index from key: {key}" + return int(m.group(1)) + return None + + +def handle_experts_in_state_dict(state_dict): + """ + Rewrite expert keys in state dict. + """ + local_expert_start = get_ep_layer_offset() + local_expert_end = get_total_num_experts() + + def should_keep_expert_key(expert_index): + """Determine if this rank should keep this expert key based on expert index""" + if expert_index is None: + # If we can't determine expert index, keep the key (non-expert weights) + return True + + # Check if this expert belongs to this rank + return local_expert_start <= expert_index < local_expert_end + + def replace_expert_index_in_key(key, expert_index, state_dict): + """Replace expert index in key with new index corresponding to the current rank""" + new_expert_index = expert_index + local_expert_start + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + # Handle SwiGLU weight{idx}_w and weight{idx}_v format + if key.endswith('_w') or key.endswith('_v'): + suffix = key[-2:] # '_w' or '_v' + new_key = key.replace( + f'weight{expert_index}{suffix}', f'weight{new_expert_index}{suffix}' + ) + # Handle regular weight{idx} format + else: + new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + new_key = key.replace( + f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' + ) + else: + raise ValueError(f"Unexpected expert key format: {key}") + + state_dict[new_key] = state_dict[key] + del state_dict[key] + + # Process model state dict + state_dict = state_dict.copy() + for key in list(state_dict.keys()): + expert_index = get_expert_index_from_key(key) + if not should_keep_expert_key(expert_index): + replace_expert_index_in_key(key, expert_index, state_dict) + + return state_dict + + +def expert_param_local_key(key): + """Get the module parameter corresponding to the key.""" + local_expert_offset = get_ep_layer_offset() + expert_index = get_expert_index_from_key(key) + if expert_index is not None: + new_expert_index = expert_index - local_expert_offset + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc2.weight0' + if 'mlp.experts.linear_fc1.weight' in key or 'mlp.experts.linear_fc2.weight' in key: + new_key = key.replace(f'weight{expert_index}', f'weight{new_expert_index}') + # SequentialMLP: index is between 'local_experts.' and next '.' + elif 'mlp.experts.local_experts' in key: + new_key = key.replace( + f'local_experts.{expert_index}.', f'local_experts.{new_expert_index}.' + ) + else: + raise ValueError(f"Unexpected expert key format: {key}") + key = new_key + + return key def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state_dict): @@ -43,7 +185,29 @@ def intersection(s1, s2): def offset_slice(s, offset): return slice(s.start + offset, s.stop + offset) - def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): + def is_swiglu_key(key): + """ + Check if this key should be handled as SwiGLU linear_fc1 weight or bias. + """ + # Non-expert MLP: 'mlp.linear_fc1.weight', 'mlp.linear_fc1.bias' + # GroupedMLP: 'mlp.experts.linear_fc1.weight0', 'mlp.experts.linear_fc1.bias0' + # SequentialMLP: 'mlp.experts.local_experts.0.linear_fc1.weight', + # 'mlp.experts.local_experts.0.linear_fc1.bias' + return any( + re.search(pat, key) + for pat in [ + r"(.*)\.mlp\.linear_fc1\.weight$", + r"(.*)\.mlp\.linear_fc1\.bias$", + r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)$", + r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)$", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight$", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias$", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight$", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias$", + ] + ) + + def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis, is_expert_param): """ Split the SWiGLU linear_fc1 parameter into two parts: weight_w and weight_v. """ @@ -55,7 +219,9 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): fsdp_slice = dist_param.megatron_fsdp_slice megatron_fsdp_dist_index = dist_param.megatron_fsdp_dist_index - tp_mesh = megatron_fsdp_dist_index.get_submesh([megatron_fsdp_dist_index.tp_dim]) + tp_mesh = megatron_fsdp_dist_index.get_submesh( + [megatron_fsdp_dist_index.tp_dim], is_expert_parallel=is_expert_param + ) data_size = data.numel() // tp_mesh.mesh.numel() w_slice = slice(0, data_size // 2) v_slice = slice(data_size // 2, data_size) @@ -75,8 +241,9 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): # Fake parameters w and v are used to provide the correct parameter # shape and Tensor-Parallelism information. per_tp_rank_shape = list(data.shape) - if getattr(dist_param, "tensor_model_parallel", False): - tp_dim = dist_param.partition_dim + if is_mcore_tensor_model_parallel(dist_param): + tp_dim = get_mcore_tensor_parallel_partition_dim(dist_param) + assert tp_dim is not None, "Tensor model parallel dimension not found" per_tp_rank_shape[tp_dim] //= tp_mesh.mesh.numel() linear_fc1_meta = torch.empty(*per_tp_rank_shape, device="meta") w_meta, v_meta = torch.chunk(linear_fc1_meta, 2, dim=swiglu_shard_axis) @@ -87,6 +254,7 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): weight_w.data, w_meta, dist_index=megatron_fsdp_dist_index, + is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) @@ -94,16 +262,21 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): weight_v.data, v_meta, dist_index=megatron_fsdp_dist_index, + is_expert_param=is_expert_param, run_check=True, update_uneven_dtensor_chunk_meta=True, ) return weight_w, weight_v + model_state_dict = model_state_dict.copy() for key in list(model_state_dict.keys()): - if key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias'): + if is_swiglu_key(key): dist_param = model.get_parameter(f"module.{key}") weight_w, weight_v = split_swiglu_linear_fc1( - model_state_dict[key], dist_param, swiglu_shard_axis=0 + model_state_dict[key], + dist_param, + swiglu_shard_axis=0, + is_expert_param='mlp.experts' in key, ) # Update the model state dict with the new keys @@ -111,26 +284,32 @@ def split_swiglu_linear_fc1(data, dist_param, swiglu_shard_axis): model_state_dict[f"{key}_v"] = weight_v del model_state_dict[key] - try: - optimizer_state_dict = optimizer_state_dict["state"] - except KeyError: - optimizer_state_dict = {} + if optimizer_state_dict is not None: + optimizer_state_dict = optimizer_state_dict.copy() + if len(optimizer_state_dict["state"]) != 0: + opt_state_dict = optimizer_state_dict["state"] + new_opt_state_dict = {} + for key in list(opt_state_dict.keys()): + # Only process SWIGLU keys + if not is_swiglu_key(key): + new_opt_state_dict[key] = opt_state_dict[key] + continue + new_opt_state_dict[f"{key}_w"] = opt_state_dict[key].copy() + new_opt_state_dict[f"{key}_v"] = opt_state_dict[key].copy() + for subkey in ["exp_avg", "exp_avg_sq"]: + dist_param = model.get_parameter(expert_param_local_key(key[len("module.") :])) + weight_w, weight_v = split_swiglu_linear_fc1( + opt_state_dict[key][subkey], + dist_param, + swiglu_shard_axis=0, + is_expert_param="mlp.experts" in key, + ) + # Update the optimizer state dict with the new keys + new_opt_state_dict[f"{key}_w"][subkey] = weight_w + new_opt_state_dict[f"{key}_v"][subkey] = weight_v + optimizer_state_dict["state"] = new_opt_state_dict - if len(optimizer_state_dict) != 0: - for key in list(optimizer_state_dict.keys()): - if not (key.endswith('mlp.linear_fc1.weight') or key.endswith('mlp.linear_fc1.bias')): - continue - optimizer_state_dict[f"{key}_w"] = optimizer_state_dict[key].copy() - optimizer_state_dict[f"{key}_v"] = optimizer_state_dict[key].copy() - for subkey in ["exp_avg", "exp_avg_sq"]: - dist_param = model.get_parameter(key[len("module.") :]) - weight_w, weight_v = split_swiglu_linear_fc1( - optimizer_state_dict[key][subkey], dist_param, swiglu_shard_axis=0 - ) - # Update the optimizer state dict with the new keys - optimizer_state_dict[f"{key}_w"][subkey] = weight_w - optimizer_state_dict[f"{key}_v"][subkey] = weight_v - del optimizer_state_dict[key] + return model_state_dict, optimizer_state_dict def handle_fp8_extra_state_case(model_state_dict): @@ -162,7 +341,7 @@ def flatten_state_dict(obj, parent_key="", sep="."): return items -def print_diff_in_state_dicts(state_dict_metadata, load_state_dict): +def print_diff_in_state_dicts(state_dict_metadata, load_state_dict, limit=100): """ Print the differences between two state dicts: metadata state dict and load state dict. This function compares the keys and shapes of the tensors in both dicts. @@ -172,24 +351,105 @@ def print_diff_in_state_dicts(state_dict_metadata, load_state_dict): meta_keys = set(state_dict_metadata.keys()) load_keys = set(load_state_dict.keys()) - only_in_meta = meta_keys - load_keys - only_in_load = load_keys - meta_keys - in_both = meta_keys & load_keys + only_in_meta = list(meta_keys - load_keys) + only_in_load = list(load_keys - meta_keys) + in_both = list(meta_keys & load_keys) - print("Keys only in checkpoint metadata_state_dict:") - for k in sorted(only_in_meta): - print(f" {k}") + logger.info(f"Keys only in checkpoint metadata_state_dict(first {limit}):") + for k in sorted(only_in_meta[:limit]): + logger.info(f" {k}") - print("\nKeys only in load_state_dict:") - for k in sorted(only_in_load): - print(f" {k}") + logger.info(f"\nKeys only in load_state_dict(first {limit}):") + for k in sorted(only_in_load[:limit]): + logger.info(f" {k}") - print("\nKeys in both but with different shapes:") - for k in sorted(in_both): + logger.info(f"\nKeys in both but with different shapes(first {limit}):") + for k in sorted(in_both[:limit]): v_meta = state_dict_metadata[k] v_load = load_state_dict[k] # If tensors, compare shape; else, compare type/values meta_shape = v_meta.size if hasattr(v_meta, "size") else type(v_meta) load_shape = v_load.shape if hasattr(v_load, "shape") else type(v_load) if meta_shape != load_shape: - print(f" {k}: meta shape={meta_shape}, load shape={load_shape}") + logger.info(f" {k}: meta shape={meta_shape}, load shape={load_shape}") + + +def validate_loaded_state_dict(state_dict, checkpoint_path): + """ + Validate the loaded state dict against the expected structure and types. + """ + assert HAVE_MEGATRON_FSDP, "This function requires Megatron-FSDP to be installed." + + # Initialize reader + reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_path) + metadata = reader.read_metadata() + flat_state_dict = flatten_state_dict(state_dict) + + for key, value in flat_state_dict.items(): + tensor_metadata = metadata.state_dict_metadata[key] + + if not isinstance(tensor_metadata, TensorStorageMetadata): + continue + if not isinstance(value, DTensor): + load_item_dict = {key: torch.empty_like(value)} + else: + load_item_dict = { + key: torch.distributed.tensor.empty( + tensor_metadata.size, + dtype=tensor_metadata.properties.dtype, + device_mesh=DeviceMesh.from_group( + group=dist.group.WORLD, + device_type="cuda", + mesh=torch.arange(dist.get_world_size()), + mesh_dim_names=("world",), + ), + placements=[Shard(0)], + ) + } + torch.distributed.checkpoint.load( + load_item_dict, storage_reader=reader, planner=default_planner.DefaultLoadPlanner() + ) + if isinstance(value, DTensor): + full_value = gather_uneven_dtensor_to_full_tensor(value) + loaded_tensor = load_item_dict[key].redistribute( + placements=[Replicate()] * len(value.placements) + ) + assert torch.allclose( + loaded_tensor._local_tensor, full_value._local_tensor, atol=1e-8, rtol=1e-5 + ), f"key: {key}; {loaded_tensor} {full_value}" + else: + assert torch.allclose( + value, load_item_dict[key] + ), f"key: {key}; {value} {load_item_dict[key]}" + + +def get_global_unique_param_name(model_chunks, param): + """ + Get the global unique parameter name for a given model and parameter. + """ + param_name = None + for model in model_chunks: + for name, p in model.named_parameters(): + if p is param: + param_name = name + break + if param_name is None: + raise ValueError("Parameter not found in model chunks") + + # Get PP unique parameter name + if re.search(r"layers\.(\d+)", param_name) and "mtp" not in param_name: + tf_layer_number = -1 + for module in model.modules(): + if not isinstance(module, TransformerLayer): + continue + for p in module.parameters(): + if p is param: + tf_layer_number = module.layer_number + break + if tf_layer_number != -1: + param_name = re.sub(r"layers\.(\d+)", f"layers.{tf_layer_number - 1}", param_name) + + # Get EP unique parameter name + param_name = list(handle_experts_in_state_dict({param_name: None}).keys())[0] + + return param_name diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 8e5f343b73c..cd1de6a5118 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2271,6 +2271,10 @@ def _add_training_args(parser): help="Use torch.optim.Optimizer instead of Megatron's optimizer in optimizer cpu offload mode.") group.add_argument('--overlap-cpu-optimizer-d2h-h2d', action='store_true', default=False, help='Overlap CPU optimizer step, gradients D2H and updated parameters H2D.') + group.add_argument('--dump-param-to-param-group-map', type=str, default=None, + help="Path to a file containing parameter-to-parameter-group mapping. " + "Provide a JSON file that specifies which parameters belong to which " + "parameter group for global coordination.") group.add_argument('--no-pin-cpu-grads', action='store_false', dest='pin_cpu_grads', help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 71b9cd97021..93c23255f4c 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -42,9 +42,10 @@ try: from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import preprocess_state_dict_for_uneven_dtensor from megatron.core.transformer.fsdp_dtensor_checkpoint import ( + print_diff_in_state_dicts, handle_fp8_extra_state_case, handle_swiglu_in_state_dict, - print_diff_in_state_dicts, + handle_experts_in_state_dict, ) HAVE_MEGATRON_FSDP = True except ImportError: @@ -561,6 +562,9 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati # TODO Handle non-empty directories (e.g., after a crash during saving). ensure_directory_exists(checkpoint_name, check_parent=False) + if ckpt_format == "fsdp_dtensor": + state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) + fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter(checkpoint_name) torch.distributed.checkpoint.save( state_dict=state_dict, @@ -784,9 +788,17 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path) torch.save(dataloader_save_dict, data_state_save_path) -def generate_state_dict(args, model, optimizer, opt_param_scheduler, - rng_state, iteration=None, - optim_sd_kwargs=None, model_sd_kwargs=None, rerun_state=None): +def generate_state_dict( + args, + model, + optimizer, + opt_param_scheduler, + rng_state, + iteration=None, + optim_sd_kwargs=None, + model_sd_kwargs=None, + rerun_state=None, +): """Generate a state dict from given model, optimizer, scheduler, rng state and others. """ # Arguments, iteration, and model. @@ -839,16 +851,27 @@ def generate_state_dict(args, model, optimizer, opt_param_scheduler, if not args.no_save_rng and rng_state: state_dict["rng_state"] = rng_state - # fsdp_dtensor ckpt specific state dict preprocessing - if args.ckpt_format == "fsdp_dtensor": - assert HAVE_MEGATRON_FSDP, "Megatron FSDP is enabled but Megatron-FSDP is not available." - assert len(model) == 1, "FSDP DTensor checkpoints are not supported for multiple models." - if args.swiglu: - state_dict = state_dict.copy() - handle_swiglu_in_state_dict( - model[0], state_dict["model"], state_dict["optimizer"]) - handle_fp8_extra_state_case(state_dict["model"]) - preprocess_state_dict_for_uneven_dtensor(state_dict) + return state_dict + + +def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model): + state_dict = raw_state_dict.copy() + handle_fp8_extra_state_case(state_dict["model"]) + if args.swiglu: + if "optimizer" in state_dict: + model_state_dict, optimizer_state_dict = handle_swiglu_in_state_dict( + model, state_dict["model"], state_dict["optimizer"] + ) + state_dict["model"] = model_state_dict + state_dict["optimizer"] = optimizer_state_dict + else: + model_state_dict, _ = handle_swiglu_in_state_dict( + model, state_dict["model"], None + ) + state_dict["model"] = model_state_dict + if args.num_experts: + state_dict["model"] = handle_experts_in_state_dict(state_dict["model"]) + preprocess_state_dict_for_uneven_dtensor(state_dict) return state_dict @@ -1169,6 +1192,12 @@ def _load_base_checkpoint( if rank0: return {}, checkpoint_name, release, CheckpointType.FSDP_DTENSOR + state_dict = sharded_state_dict + raw_optimizer_state_dict = state_dict["optimizer"].copy() if "optimizer" in state_dict else None + raw_model_state_dict = state_dict["model"].copy() if "model" in state_dict else None + model = state_dict.pop("_model") + state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0]) + ckpt_type = CheckpointType.FSDP_DTENSOR fs_storage_reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_name) allow_partial_load = not getattr(args, 'strict_fsdp_dtensor_load', False) @@ -1177,15 +1206,20 @@ def _load_base_checkpoint( rank = torch.distributed.get_rank() import time as _time _time.sleep(rank * 0.001) # Make that logs of different ranks do not overlap - print_diff_in_state_dicts(state_dict_metadata, sharded_state_dict) + print_diff_in_state_dicts(state_dict_metadata, state_dict) planner = default_planner.DefaultLoadPlanner(allow_partial_load=allow_partial_load) torch.distributed.checkpoint.load_state_dict( - state_dict=sharded_state_dict, + state_dict=state_dict, storage_reader=fs_storage_reader, planner=planner, ) - state_dict = sharded_state_dict + + if raw_optimizer_state_dict is not None: + state_dict["optimizer"] = raw_optimizer_state_dict + + if raw_model_state_dict is not None: + state_dict["model"] = raw_model_state_dict else: raise NotImplementedError(f"checkpoint format {ckpt_format} not supported") @@ -1520,7 +1554,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', except FileNotFoundError: state_dict_metadata = {} - gen_sd_rerun_state = None + gen_sd_rerun_state = {} gen_sd_opt_param_scheduler = None gen_sd_rng_state = None gen_sd_optim = None @@ -1537,7 +1571,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', optim_sd_kwargs = dict(metadata=_build_sharded_state_dict_metadata(args), is_loading=True) - load_kwargs["sharded_state_dict"] = generate_state_dict( + state_dict = generate_state_dict( args, model=model, optimizer=gen_sd_optim, @@ -1547,6 +1581,8 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', rerun_state=gen_sd_rerun_state, iteration=1, ) + state_dict["_model"] = model + load_kwargs["sharded_state_dict"] = state_dict state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( load_dir, args, rank0=False, checkpointing_context=checkpointing_context, diff --git a/megatron/training/training.py b/megatron/training/training.py index f805dab0f15..bda9e42dc82 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1210,6 +1210,7 @@ def setup_model_and_optimizer( # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, + dump_param_to_param_group_map=args.dump_param_to_param_group_map, ) else: optimizer = get_megatron_muon_optimizer( diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json index 0f2637a9511..717ae3f5fa6 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04748, - "2": 11.03561, - "3": 9.58774, - "4": 9.25819, - "5": 9.53583, - "6": 9.8804, - "7": 9.48247, - "8": 8.93575, - "9": 8.65813, - "10": 9.0567, - "11": 8.49445, - "12": 8.52444, - "13": 8.45239, - "14": 7.97323, - "15": 8.0476, - "16": 8.07971, - "17": 8.09081, - "18": 7.76437, - "19": 8.14892, - "20": 7.89868, - "21": 7.59371, - "22": 7.54743, - "23": 7.43222, - "24": 7.4302, - "25": 7.67579, - "26": 7.06929, - "27": 7.62041, - "28": 7.32495, - "29": 7.49042, - "30": 7.64391, - "31": 7.39435, - "32": 7.58789, - "33": 7.64037, - "34": 7.69778, - "35": 7.20998, - "36": 7.08538, - "37": 7.42584, - "38": 7.18804, - "39": 7.55054, - "40": 7.54446, - "41": 7.49287, - "42": 7.24937, - "43": 7.23587, - "44": 7.41595, - "45": 7.18755, - "46": 6.89949, - "47": 7.29966, - "48": 7.14134, - "49": 7.58963, - "50": 7.03602 + "1": 11.04722, + "2": 11.03572, + "3": 9.58802, + "4": 9.25807, + "5": 9.46595, + "6": 9.99646, + "7": 9.50952, + "8": 8.97596, + "9": 8.64768, + "10": 9.40103, + "11": 8.86556, + "12": 8.63563, + "13": 8.52125, + "14": 8.08824, + "15": 8.1958, + "16": 8.22112, + "17": 8.14098, + "18": 7.8386, + "19": 8.23438, + "20": 7.95361, + "21": 7.62549, + "22": 7.60352, + "23": 7.47957, + "24": 7.46573, + "25": 7.70343, + "26": 7.10719, + "27": 7.64313, + "28": 7.34582, + "29": 7.5169, + "30": 7.67511, + "31": 7.41799, + "32": 7.61213, + "33": 7.66582, + "34": 7.73101, + "35": 7.23081, + "36": 7.10765, + "37": 7.4476, + "38": 7.21053, + "39": 7.57508, + "40": 7.5662, + "41": 7.51605, + "42": 7.27243, + "43": 7.25706, + "44": 7.44, + "45": 7.21244, + "46": 6.92421, + "47": 7.32604, + "48": 7.17147, + "49": 7.62154, + "50": 7.0624 } }, "num-zeros": { @@ -62,55 +62,55 @@ "step_interval": 1, "values": { "1": 38802612.0, - "2": 38543592.0, - "3": 38739528.0, - "4": 279937824.0, - "5": 259189728.0, - "6": 271446400.0, - "7": 604773504.0, - "8": 768892544.0, - "9": 645824128.0, - "10": 744257088.0, - "11": 718888576.0, - "12": 746732544.0, - "13": 871990976.0, - "14": 821645632.0, - "15": 724250816.0, - "16": 932241472.0, - "17": 648958912.0, - "18": 649120000.0, - "19": 925992960.0, - "20": 989207936.0, - "21": 819324096.0, - "22": 736955072.0, - "23": 910497792.0, - "24": 876716672.0, - "25": 843170688.0, - "26": 809573824.0, - "27": 854086912.0, - "28": 802857664.0, - "29": 805523328.0, - "30": 775645184.0, - "31": 771754624.0, - "32": 749733696.0, - "33": 718385216.0, - "34": 724771200.0, - "35": 737655104.0, - "36": 690419968.0, - "37": 673203456.0, - "38": 627239552.0, - "39": 614047168.0, - "40": 607288512.0, - "41": 582590592.0, - "42": 548211200.0, - "43": 532740640.0, - "44": 554239168.0, - "45": 514790528.0, - "46": 350258560.0, - "47": 472420128.0, - "48": 453788736.0, - "49": 440597216.0, - "50": 303063296.0 + "2": 38543656.0, + "3": 38739356.0, + "4": 273649600.0, + "5": 252887040.0, + "6": 255692384.0, + "7": 598483264.0, + "8": 787737984.0, + "9": 696133120.0, + "10": 505146368.0, + "11": 718888640.0, + "12": 872597184.0, + "13": 947495104.0, + "14": 1076398976.0, + "15": 856390592.0, + "16": 1048635648.0, + "17": 831370688.0, + "18": 963679552.0, + "19": 970018240.0, + "20": 935737344.0, + "21": 904189312.0, + "22": 887937280.0, + "23": 894777856.0, + "24": 703744192.0, + "25": 909232512.0, + "26": 875633216.0, + "27": 894981376.0, + "28": 919242816.0, + "29": 931351552.0, + "30": 929784768.0, + "31": 941621376.0, + "32": 885000768.0, + "33": 828484096.0, + "34": 822284800.0, + "35": 832032128.0, + "36": 787939392.0, + "37": 770719808.0, + "38": 561204672.0, + "39": 617201536.0, + "40": 695374592.0, + "41": 698978816.0, + "42": 692913728.0, + "43": 668003776.0, + "44": 673780992.0, + "45": 631182912.0, + "46": 444613312.0, + "47": 591957824.0, + "48": 617363968.0, + "49": 585295808.0, + "50": 570423872.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6637267456.0, - "2": 6637269504.0, - "3": 6637269504.0, - "4": 6637269504.0, - "5": 6637269504.0, - "6": 6637269504.0, - "7": 6637269504.0, - "8": 6637269504.0, - "9": 6637269504.0, - "10": 6637269504.0, - "11": 6637269504.0, - "12": 6637269504.0, - "13": 6637269504.0, - "14": 6637269504.0, - "15": 6637269504.0, - "16": 6637269504.0, - "17": 6637269504.0, - "18": 6637269504.0, - "19": 6637269504.0, - "20": 6637269504.0, - "21": 6637269504.0, - "22": 6637269504.0, - "23": 6637269504.0, - "24": 6637269504.0, - "25": 6637269504.0, - "26": 6637269504.0, - "27": 6637269504.0, - "28": 6637269504.0, - "29": 6637269504.0, - "30": 6637269504.0, - "31": 6637269504.0, - "32": 6637269504.0, - "33": 6637269504.0, - "34": 6637269504.0, - "35": 6637269504.0, - "36": 6637269504.0, - "37": 6637269504.0, - "38": 6637269504.0, - "39": 6637269504.0, - "40": 6637269504.0, - "41": 6637269504.0, - "42": 6637269504.0, - "43": 6637269504.0, - "44": 6637269504.0, - "45": 6637269504.0, - "46": 6637269504.0, - "47": 6637269504.0, - "48": 6637269504.0, - "49": 6637269504.0, - "50": 6637269504.0 + "1": 6637272576.0, + "2": 6637274624.0, + "3": 6637274624.0, + "4": 6637274624.0, + "5": 6637274624.0, + "6": 6637274624.0, + "7": 6637274624.0, + "8": 6637274624.0, + "9": 6637274624.0, + "10": 6637274624.0, + "11": 6637274624.0, + "12": 6637274624.0, + "13": 6637274624.0, + "14": 6637274624.0, + "15": 6637274624.0, + "16": 6637274624.0, + "17": 6637274624.0, + "18": 6637274624.0, + "19": 6637274624.0, + "20": 6637274624.0, + "21": 6637274624.0, + "22": 6637274624.0, + "23": 6637274624.0, + "24": 6637274624.0, + "25": 6637274624.0, + "26": 6637274624.0, + "27": 6637274624.0, + "28": 6637274624.0, + "29": 6637274624.0, + "30": 6637274624.0, + "31": 6637274624.0, + "32": 6637274624.0, + "33": 6637274624.0, + "34": 6637274624.0, + "35": 6637274624.0, + "36": 6637274624.0, + "37": 6637274624.0, + "38": 6637274624.0, + "39": 6637274624.0, + "40": 6637274624.0, + "41": 6637274624.0, + "42": 6637274624.0, + "43": 6637274624.0, + "44": 6637274624.0, + "45": 6637274624.0, + "46": 6637274624.0, + "47": 6637274624.0, + "48": 6637274624.0, + "49": 6637274624.0, + "50": 6637274624.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55055331328.0, - "2": 57809321984.0, - "3": 57918455808.0, - "4": 57918455808.0, - "5": 57918455808.0, - "6": 57918455808.0, - "7": 57918455808.0, - "8": 57918455808.0, - "9": 57918455808.0, - "10": 57918455808.0, - "11": 57918455808.0, - "12": 57918455808.0, - "13": 57931390976.0, - "14": 57931390976.0, - "15": 57931390976.0, - "16": 57931390976.0, - "17": 57931390976.0, - "18": 57931390976.0, - "19": 57931390976.0, - "20": 57931390976.0, - "21": 57931390976.0, - "22": 57931390976.0, - "23": 57931390976.0, - "24": 57931390976.0, - "25": 57931390976.0, - "26": 57931390976.0, - "27": 57931390976.0, - "28": 57931390976.0, - "29": 57931390976.0, - "30": 57931390976.0, - "31": 57931390976.0, - "32": 58003226624.0, - "33": 58003226624.0, - "34": 58003226624.0, - "35": 58003226624.0, - "36": 58003226624.0, - "37": 58003226624.0, - "38": 58003226624.0, - "39": 58003226624.0, - "40": 58003226624.0, - "41": 58003226624.0, - "42": 58003226624.0, - "43": 58003226624.0, - "44": 58183614464.0, - "45": 58234208256.0, - "46": 58555555840.0, - "47": 58555555840.0, - "48": 58555555840.0, - "49": 58555555840.0, - "50": 58780934144.0 + "1": 55056003072.0, + "2": 57810763776.0, + "3": 57920647168.0, + "4": 57920647168.0, + "5": 57920647168.0, + "6": 57920647168.0, + "7": 57920647168.0, + "8": 57920647168.0, + "9": 57920647168.0, + "10": 57920647168.0, + "11": 57920647168.0, + "12": 57920647168.0, + "13": 57920647168.0, + "14": 57920647168.0, + "15": 57920647168.0, + "16": 57920647168.0, + "17": 57920647168.0, + "18": 57920647168.0, + "19": 57920647168.0, + "20": 57920647168.0, + "21": 57920647168.0, + "22": 57920647168.0, + "23": 57920647168.0, + "24": 57920647168.0, + "25": 57920647168.0, + "26": 57920647168.0, + "27": 57920647168.0, + "28": 57920647168.0, + "29": 57920647168.0, + "30": 57920647168.0, + "31": 57920647168.0, + "32": 57920647168.0, + "33": 57920647168.0, + "34": 57961472000.0, + "35": 57961472000.0, + "36": 57961472000.0, + "37": 57961472000.0, + "38": 57961472000.0, + "39": 57961472000.0, + "40": 57961472000.0, + "41": 57961472000.0, + "42": 57961472000.0, + "43": 57961472000.0, + "44": 57961472000.0, + "45": 57961472000.0, + "46": 57961472000.0, + "47": 57961472000.0, + "48": 57961472000.0, + "49": 57961472000.0, + "50": 57961472000.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07654, - "2": 11.07406, - "3": 10.53881, - "4": 10.09803, - "5": 9.81154, - "6": 10.06236, - "7": 9.79762, - "8": 9.07117, - "9": 8.87049, - "10": 9.127, - "11": 8.49853, - "12": 8.53046, - "13": 8.42444, - "14": 7.847, - "15": 7.99077, - "16": 8.05015, - "17": 8.00064, - "18": 7.73104, - "19": 8.11087, - "20": 7.82933, - "21": 7.52501, - "22": 7.49916, - "23": 7.36982, - "24": 7.37235, - "25": 7.61578, - "26": 7.02029, - "27": 7.56014, - "28": 7.2681, - "29": 7.44399, - "30": 7.58618, - "31": 7.32468, - "32": 7.50596, - "33": 7.5715, - "34": 7.63581, - "35": 7.15224, - "36": 7.01784, - "37": 7.35163, - "38": 7.12551, - "39": 7.48656, - "40": 7.47408, - "41": 7.42096, - "42": 7.17595, - "43": 7.16059, - "44": 7.34289, - "45": 7.11969, - "46": 6.82753, - "47": 7.23525, - "48": 7.08042, - "49": 7.51043, - "50": 6.9735 + "1": 11.07648, + "2": 11.07404, + "3": 10.53854, + "4": 10.09813, + "5": 9.81166, + "6": 10.09741, + "7": 9.79481, + "8": 9.0642, + "9": 8.86016, + "10": 9.34039, + "11": 8.51318, + "12": 8.59467, + "13": 8.5292, + "14": 7.95757, + "15": 8.06962, + "16": 8.11802, + "17": 8.06993, + "18": 7.80587, + "19": 8.19192, + "20": 7.8906, + "21": 7.57063, + "22": 7.55091, + "23": 7.41606, + "24": 7.42454, + "25": 7.65274, + "26": 7.05583, + "27": 7.59747, + "28": 7.29984, + "29": 7.472, + "30": 7.61908, + "31": 7.35179, + "32": 7.52979, + "33": 7.59161, + "34": 7.66287, + "35": 7.17383, + "36": 7.04133, + "37": 7.37081, + "38": 7.1443, + "39": 7.50879, + "40": 7.48921, + "41": 7.43802, + "42": 7.19405, + "43": 7.17581, + "44": 7.35785, + "45": 7.13985, + "46": 6.84014, + "47": 7.25094, + "48": 7.09407, + "49": 7.52321, + "50": 6.98987 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 69.29797, - "2": 1.7261, - "3": 1.40981, - "4": 2.16562, - "5": 1.7862, - "6": 1.7469, - "7": 1.96688, - "8": 1.97301, - "9": 1.74665, - "10": 1.69613, - "11": 1.02979, - "12": 1.02408, - "13": 1.03261, - "14": 1.02432, - "15": 1.0529, - "16": 1.04491, - "17": 1.03693, - "18": 1.03399, - "19": 1.03627, - "20": 1.02284, - "21": 1.01667, - "22": 1.02932, - "23": 1.03591, - "24": 1.03466, - "25": 1.03149, - "26": 1.03165, - "27": 1.02342, - "28": 1.03777, - "29": 1.04061, - "30": 1.05641, - "31": 1.02382, - "32": 1.01775, - "33": 1.03039, - "34": 1.03693, - "35": 1.03153, - "36": 1.02699, - "37": 1.02756, - "38": 1.02919, - "39": 1.01773, - "40": 1.03491, - "41": 1.03152, - "42": 1.03035, - "43": 1.0221, - "44": 1.05201, - "45": 1.02579, - "46": 1.02798, - "47": 1.03857, - "48": 1.02772, - "49": 1.0408, - "50": 1.03745 + "1": 93.39829, + "2": 1.82958, + "3": 1.3241, + "4": 2.19661, + "5": 2.13156, + "6": 1.75452, + "7": 2.08539, + "8": 1.58016, + "9": 1.60816, + "10": 1.03407, + "11": 1.01797, + "12": 1.0168, + "13": 1.01666, + "14": 1.0748, + "15": 1.04137, + "16": 1.05864, + "17": 1.05961, + "18": 1.03233, + "19": 1.02728, + "20": 1.02917, + "21": 1.04313, + "22": 1.03054, + "23": 1.0313, + "24": 1.03789, + "25": 1.04414, + "26": 1.05561, + "27": 1.03361, + "28": 1.03142, + "29": 1.02437, + "30": 1.02195, + "31": 1.0172, + "32": 1.03318, + "33": 1.03742, + "34": 1.03628, + "35": 1.03575, + "36": 1.05127, + "37": 1.03273, + "38": 1.03381, + "39": 1.02923, + "40": 1.02986, + "41": 1.03249, + "42": 1.033, + "43": 1.03169, + "44": 1.03818, + "45": 1.02736, + "46": 1.02698, + "47": 1.03158, + "48": 1.02471, + "49": 1.03674, + "50": 1.0291 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json index 0af1bff480e..adec1b3bd58 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04624, - "2": 11.03476, - "3": 9.59903, - "4": 9.26301, - "5": 9.36373, - "6": 9.59608, - "7": 9.45214, - "8": 8.95198, - "9": 8.65952, - "10": 9.17778, - "11": 9.21306, - "12": 8.68184, - "13": 8.6038, - "14": 8.01576, - "15": 8.13595, - "16": 8.20124, - "17": 8.13602, - "18": 7.83369, - "19": 8.22974, - "20": 7.9452, - "21": 7.62338, - "22": 7.60791, - "23": 7.48374, - "24": 7.46559, - "25": 7.71274, - "26": 7.12081, - "27": 7.64626, - "28": 7.35234, - "29": 7.52084, - "30": 7.67784, - "31": 7.42246, - "32": 7.6137, - "33": 7.66159, - "34": 7.72817, - "35": 7.23134, - "36": 7.10612, - "37": 7.44953, - "38": 7.20946, - "39": 7.57073, - "40": 7.56124, - "41": 7.51119, - "42": 7.27048, - "43": 7.25633, - "44": 7.43634, - "45": 7.21132, - "46": 6.91913, - "47": 7.32211, - "48": 7.16551, - "49": 7.6155, - "50": 7.05648 + "1": 11.04577, + "2": 11.03578, + "3": 9.5968, + "4": 9.26068, + "5": 9.09365, + "6": 8.97825, + "7": 9.18096, + "8": 8.70673, + "9": 8.55632, + "10": 8.85377, + "11": 8.31245, + "12": 8.35862, + "13": 8.28114, + "14": 7.73951, + "15": 7.91242, + "16": 7.94944, + "17": 7.89918, + "18": 7.64375, + "19": 8.02647, + "20": 7.73813, + "21": 7.44557, + "22": 7.43367, + "23": 7.31291, + "24": 7.30268, + "25": 7.57549, + "26": 6.98093, + "27": 7.50005, + "28": 7.241, + "29": 7.40369, + "30": 7.51839, + "31": 7.29514, + "32": 7.47818, + "33": 7.52568, + "34": 7.57647, + "35": 7.12091, + "36": 6.97439, + "37": 7.30929, + "38": 7.09349, + "39": 7.43659, + "40": 7.45122, + "41": 7.37904, + "42": 7.14627, + "43": 7.13408, + "44": 7.30886, + "45": 7.08523, + "46": 6.8067, + "47": 7.21159, + "48": 7.0245, + "49": 7.50096, + "50": 6.92687 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802568, - "2": 38543544, - "3": 41886704, - "4": 264367872, - "5": 224737792, - "6": 302994528, - "7": 645808768, - "8": 775291136, - "9": 765475328, - "10": 675259904, - "11": 615098624, - "12": 702764352, - "13": 934951360, - "14": 1060699008, - "15": 802967296, - "16": 1026771392, - "17": 756706880, - "18": 715253696, - "19": 929126208, - "20": 875969472, - "21": 665188032, - "22": 903854976, - "23": 747044352, - "24": 920777856, - "25": 733230528, - "26": 863183104, - "27": 879318336, - "28": 916219136, - "29": 909384256, - "30": 879622720, - "31": 866425152, - "32": 819074560, - "33": 589493056, - "34": 772011648, - "35": 778655488, - "36": 759651584, - "37": 761302144, - "38": 463804224, - "39": 543038400, - "40": 497278720, - "41": 658241792, - "42": 661600512, - "43": 495713632, - "44": 673788672, - "45": 470873536, - "46": 614455040, - "47": 554219584, - "48": 570200064, - "49": 557109312, - "50": 347212736 + "1": 38802664.0, + "2": 38543552.0, + "3": 38740472.0, + "4": 273766176.0, + "5": 196515488.0, + "6": 432153600.0, + "7": 715038528.0, + "8": 797328960.0, + "9": 696279488.0, + "10": 668928192.0, + "11": 583742720.0, + "12": 595799040.0, + "13": 695916288.0, + "14": 617245056.0, + "15": 629936832.0, + "16": 639940800.0, + "17": 642766016.0, + "18": 664898112.0, + "19": 671247104.0, + "20": 602545216.0, + "21": 542607872.0, + "22": 551419008.0, + "23": 533094816.0, + "24": 527647904.0, + "25": 570717824.0, + "26": 510874176.0, + "27": 498748096.0, + "28": 510353632.0, + "29": 506802112.0, + "30": 486336928.0, + "31": 410143360.0, + "32": 372280800.0, + "33": 369351776.0, + "34": 353666688.0, + "35": 344549376.0, + "36": 278456576.0, + "37": 289517152.0, + "38": 274950816.0, + "39": 242921776.0, + "40": 223597264.0, + "41": 186386944.0, + "42": 180387488.0, + "43": 224573440.0, + "44": 217714800.0, + "45": 143723568.0, + "46": 161525888.0, + "47": 120124336.0, + "48": 183368272.0, + "49": 154411968.0, + "50": 167778288.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7321308672, - "2": 7321310720, - "3": 7321310720, - "4": 7321310720, - "5": 7321310720, - "6": 7321310720, - "7": 7321310720, - "8": 7321310720, - "9": 7321310720, - "10": 7321310720, - "11": 7321310720, - "12": 7321310720, - "13": 7321310720, - "14": 7321310720, - "15": 7321310720, - "16": 7321310720, - "17": 7321310720, - "18": 7321310720, - "19": 7321310720, - "20": 7321310720, - "21": 7321310720, - "22": 7321310720, - "23": 7321310720, - "24": 7321310720, - "25": 7321310720, - "26": 7321310720, - "27": 7321310720, - "28": 7321310720, - "29": 7321310720, - "30": 7321310720, - "31": 7321310720, - "32": 7321310720, - "33": 7321310720, - "34": 7321310720, - "35": 7321310720, - "36": 7321310720, - "37": 7321310720, - "38": 7321310720, - "39": 7321310720, - "40": 7321310720, - "41": 7321310720, - "42": 7321310720, - "43": 7321310720, - "44": 7321310720, - "45": 7321310720, - "46": 7321310720, - "47": 7321310720, - "48": 7321310720, - "49": 7321310720, - "50": 7321310720 + "1": 7321336320.0, + "2": 7321338368.0, + "3": 7321338368.0, + "4": 7321338368.0, + "5": 7321338368.0, + "6": 7321338368.0, + "7": 7321338368.0, + "8": 7321338368.0, + "9": 7321338368.0, + "10": 7321338368.0, + "11": 7321338368.0, + "12": 7321338368.0, + "13": 7321338368.0, + "14": 7321338368.0, + "15": 7321338368.0, + "16": 7321338368.0, + "17": 7321338368.0, + "18": 7321338368.0, + "19": 7321338368.0, + "20": 7321338368.0, + "21": 7321338368.0, + "22": 7321338368.0, + "23": 7321338368.0, + "24": 7321338368.0, + "25": 7321338368.0, + "26": 7321338368.0, + "27": 7321338368.0, + "28": 7321338368.0, + "29": 7321338368.0, + "30": 7321338368.0, + "31": 7321338368.0, + "32": 7321338368.0, + "33": 7321338368.0, + "34": 7321338368.0, + "35": 7321338368.0, + "36": 7321338368.0, + "37": 7321338368.0, + "38": 7321338368.0, + "39": 7321338368.0, + "40": 7321338368.0, + "41": 7321338368.0, + "42": 7321338368.0, + "43": 7321338368.0, + "44": 7321338368.0, + "45": 7321338368.0, + "46": 7321338368.0, + "47": 7321338368.0, + "48": 7321338368.0, + "49": 7321338368.0, + "50": 7321338368.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 54396813312, - "2": 57149165568, - "3": 57165475840, - "4": 57165475840, - "5": 57165475840, - "6": 57165475840, - "7": 57165475840, - "8": 57165475840, - "9": 57165475840, - "10": 57165475840, - "11": 57165475840, - "12": 57165475840, - "13": 57165475840, - "14": 57165475840, - "15": 57165475840, - "16": 57165475840, - "17": 57165475840, - "18": 57165475840, - "19": 57165475840, - "20": 57165475840, - "21": 57165475840, - "22": 57165475840, - "23": 57165475840, - "24": 57165475840, - "25": 57165475840, - "26": 57165475840, - "27": 57165475840, - "28": 57165475840, - "29": 57165475840, - "30": 57165475840, - "31": 57165475840, - "32": 57165475840, - "33": 57165475840, - "34": 57165475840, - "35": 57165475840, - "36": 57165475840, - "37": 57165475840, - "38": 57165475840, - "39": 57165475840, - "40": 57295986688, - "41": 57295986688, - "42": 57331482624, - "43": 57360437248, - "44": 57561960448, - "45": 57561960448, - "46": 57561960448, - "47": 57585307648, - "48": 57602347008, - "49": 57823961088, - "50": 57823961088 + "1": 54402162688.0, + "2": 57150373888.0, + "3": 57150373888.0, + "4": 57150373888.0, + "5": 57150373888.0, + "6": 57150373888.0, + "7": 57150373888.0, + "8": 57150373888.0, + "9": 57150373888.0, + "10": 57150373888.0, + "11": 57150373888.0, + "12": 57150373888.0, + "13": 57150373888.0, + "14": 57150373888.0, + "15": 57150373888.0, + "16": 57150373888.0, + "17": 57150373888.0, + "18": 57150373888.0, + "19": 57150373888.0, + "20": 57150373888.0, + "21": 57150373888.0, + "22": 57150373888.0, + "23": 57150373888.0, + "24": 57150373888.0, + "25": 57150373888.0, + "26": 57150373888.0, + "27": 57150373888.0, + "28": 57150373888.0, + "29": 57150373888.0, + "30": 57150373888.0, + "31": 57150373888.0, + "32": 57150373888.0, + "33": 57150373888.0, + "34": 57150373888.0, + "35": 57152438272.0, + "36": 57344114688.0, + "37": 57344114688.0, + "38": 57449279488.0, + "39": 57449279488.0, + "40": 57449279488.0, + "41": 57449279488.0, + "42": 57449279488.0, + "43": 57449279488.0, + "44": 57449279488.0, + "45": 57470353408.0, + "46": 57470353408.0, + "47": 57470353408.0, + "48": 57470353408.0, + "49": 57470353408.0, + "50": 57470353408.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07779, - "2": 11.07564, - "3": 10.52904, - "4": 10.08924, - "5": 9.81101, - "6": 9.88786, - "7": 9.72987, - "8": 9.02044, - "9": 8.8145, - "10": 9.09362, - "11": 8.77612, - "12": 8.56714, - "13": 8.54777, - "14": 8.04338, - "15": 8.10946, - "16": 8.13231, - "17": 8.0853, - "18": 7.83475, - "19": 8.21923, - "20": 7.91097, - "21": 7.58489, - "22": 7.56231, - "23": 7.44204, - "24": 7.44303, - "25": 7.67594, - "26": 7.07138, - "27": 7.60696, - "28": 7.30925, - "29": 7.48219, - "30": 7.62699, - "31": 7.3655, - "32": 7.54203, - "33": 7.60199, - "34": 7.66716, - "35": 7.18385, - "36": 7.05252, - "37": 7.38377, - "38": 7.15521, - "39": 7.51639, - "40": 7.4929, - "41": 7.44762, - "42": 7.20298, - "43": 7.18681, - "44": 7.36683, - "45": 7.15506, - "46": 6.85064, - "47": 7.26072, - "48": 7.10489, - "49": 7.53477, - "50": 6.99715 + "1": 11.07769, + "2": 11.07625, + "3": 10.52909, + "4": 10.08687, + "5": 9.82013, + "6": 9.48246, + "7": 9.54169, + "8": 8.83661, + "9": 8.64933, + "10": 8.95821, + "11": 8.32934, + "12": 8.36033, + "13": 8.26936, + "14": 7.73441, + "15": 7.87122, + "16": 7.9153, + "17": 7.86923, + "18": 7.61191, + "19": 7.99919, + "20": 7.72174, + "21": 7.4147, + "22": 7.40336, + "23": 7.27676, + "24": 7.28557, + "25": 7.53782, + "26": 6.94933, + "27": 7.48504, + "28": 7.20219, + "29": 7.38696, + "30": 7.51152, + "31": 7.26613, + "32": 7.45631, + "33": 7.51482, + "34": 7.57527, + "35": 7.10374, + "36": 6.97224, + "37": 7.31053, + "38": 7.08607, + "39": 7.44371, + "40": 7.43612, + "41": 7.37848, + "42": 7.13561, + "43": 7.11558, + "44": 7.30254, + "45": 7.08147, + "46": 6.78911, + "47": 7.21791, + "48": 7.03066, + "49": 7.46668, + "50": 6.93251 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 98.46571, - "2": 1.63304, - "3": 1.32772, - "4": 1.63453, - "5": 1.11673, - "6": 1.14377, - "7": 1.33213, - "8": 1.32699, - "9": 1.07499, - "10": 1.12938, - "11": 1.07438, - "12": 1.11078, - "13": 1.06958, - "14": 1.08718, - "15": 1.10547, - "16": 1.07557, - "17": 1.08606, - "18": 1.0832, - "19": 1.08226, - "20": 1.126, - "21": 1.08645, - "22": 1.07978, - "23": 1.07859, - "24": 1.08221, - "25": 1.08192, - "26": 1.09185, - "27": 1.0923, - "28": 1.09562, - "29": 1.10486, - "30": 1.10038, - "31": 1.09094, - "32": 1.08693, - "33": 1.0883, - "34": 1.08169, - "35": 1.08611, - "36": 1.07758, - "37": 1.07933, - "38": 1.08289, - "39": 1.07885, - "40": 1.08075, - "41": 1.0781, - "42": 1.08028, - "43": 1.08035, - "44": 1.08973, - "45": 1.08944, - "46": 1.07483, - "47": 1.08306, - "48": 1.07701, - "49": 1.0768, - "50": 1.07022 + "1": 92.7075, + "2": 1.62502, + "3": 1.31213, + "4": 1.71707, + "5": 1.11852, + "6": 1.39151, + "7": 1.37049, + "8": 1.22293, + "9": 1.10694, + "10": 1.11053, + "11": 1.10169, + "12": 1.14642, + "13": 1.11639, + "14": 1.12927, + "15": 1.12868, + "16": 1.11899, + "17": 1.10545, + "18": 1.11542, + "19": 1.11417, + "20": 1.11349, + "21": 1.11071, + "22": 1.11032, + "23": 1.11836, + "24": 1.11402, + "25": 1.11546, + "26": 1.10471, + "27": 1.10368, + "28": 1.09929, + "29": 1.10324, + "30": 1.10507, + "31": 1.10255, + "32": 1.10727, + "33": 1.1043, + "34": 1.10476, + "35": 1.10252, + "36": 1.10053, + "37": 1.1068, + "38": 1.09229, + "39": 1.08165, + "40": 1.07889, + "41": 1.07583, + "42": 1.07174, + "43": 1.07738, + "44": 1.08604, + "45": 1.09529, + "46": 1.08309, + "47": 1.08896, + "48": 1.08318, + "49": 1.08597, + "50": 1.08649 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json index 585139e83c9..b7df693e1f7 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04624, - "2": 11.03476, - "3": 9.59903, - "4": 9.26301, - "5": 9.36373, - "6": 9.59608, - "7": 9.45214, - "8": 8.95198, - "9": 8.65952, - "10": 9.17778, - "11": 9.21306, - "12": 8.68184, - "13": 8.6038, - "14": 8.01576, - "15": 8.13595, - "16": 8.20124, - "17": 8.13602, - "18": 7.83369, - "19": 8.22974, - "20": 7.9452, - "21": 7.62338, - "22": 7.60791, - "23": 7.48374, - "24": 7.46559, - "25": 7.71274, - "26": 7.12081, - "27": 7.64626, - "28": 7.35234, - "29": 7.52084, - "30": 7.67784, - "31": 7.42246, - "32": 7.6137, - "33": 7.66159, - "34": 7.72817, - "35": 7.23134, - "36": 7.10612, - "37": 7.44953, - "38": 7.20946, - "39": 7.57073, - "40": 7.56124, - "41": 7.51119, - "42": 7.27048, - "43": 7.25633, - "44": 7.43634, - "45": 7.21132, - "46": 6.91913, - "47": 7.32211, - "48": 7.16551, - "49": 7.6155, - "50": 7.05648 + "1": 11.04577, + "2": 11.03578, + "3": 9.5968, + "4": 9.26068, + "5": 9.09365, + "6": 8.97825, + "7": 9.18096, + "8": 8.70673, + "9": 8.55632, + "10": 8.85377, + "11": 8.31245, + "12": 8.35862, + "13": 8.28114, + "14": 7.73951, + "15": 7.91242, + "16": 7.94944, + "17": 7.89918, + "18": 7.64375, + "19": 8.02647, + "20": 7.73813, + "21": 7.44557, + "22": 7.43367, + "23": 7.31291, + "24": 7.30268, + "25": 7.57549, + "26": 6.98093, + "27": 7.50005, + "28": 7.241, + "29": 7.40369, + "30": 7.51839, + "31": 7.29514, + "32": 7.47818, + "33": 7.52568, + "34": 7.57647, + "35": 7.12091, + "36": 6.97439, + "37": 7.30929, + "38": 7.09349, + "39": 7.43659, + "40": 7.45122, + "41": 7.37904, + "42": 7.14627, + "43": 7.13408, + "44": 7.30886, + "45": 7.08523, + "46": 6.8067, + "47": 7.21159, + "48": 7.0245, + "49": 7.50096, + "50": 6.92687 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802568, - "2": 38543544, - "3": 41886704, - "4": 264367872, - "5": 224737792, - "6": 302994528, - "7": 645808768, - "8": 775291136, - "9": 765475328, - "10": 675259904, - "11": 615098624, - "12": 702764352, - "13": 934951360, - "14": 1060699008, - "15": 802967296, - "16": 1026771392, - "17": 756706880, - "18": 715253696, - "19": 929126208, - "20": 875969472, - "21": 665188032, - "22": 903854976, - "23": 747044352, - "24": 920777856, - "25": 733230528, - "26": 863183104, - "27": 879318336, - "28": 916219136, - "29": 909384256, - "30": 879622720, - "31": 866425152, - "32": 819074560, - "33": 589493056, - "34": 772011648, - "35": 778655488, - "36": 759651584, - "37": 761302144, - "38": 463804224, - "39": 543038400, - "40": 497278720, - "41": 658241792, - "42": 661600512, - "43": 495713632, - "44": 673788672, - "45": 470873536, - "46": 614455040, - "47": 554219584, - "48": 570200064, - "49": 557109312, - "50": 347212736 + "1": 38802664.0, + "2": 38543552.0, + "3": 38740472.0, + "4": 273766176.0, + "5": 196515488.0, + "6": 432153600.0, + "7": 715038528.0, + "8": 797328960.0, + "9": 696279488.0, + "10": 668928192.0, + "11": 583742720.0, + "12": 595799040.0, + "13": 695916288.0, + "14": 617245056.0, + "15": 629936832.0, + "16": 639940800.0, + "17": 642766016.0, + "18": 664898112.0, + "19": 671247104.0, + "20": 602545216.0, + "21": 542607872.0, + "22": 551419008.0, + "23": 533094816.0, + "24": 527647904.0, + "25": 570717824.0, + "26": 510874176.0, + "27": 498748096.0, + "28": 510353632.0, + "29": 506802112.0, + "30": 486336928.0, + "31": 410143360.0, + "32": 372280800.0, + "33": 369351776.0, + "34": 353666688.0, + "35": 344549376.0, + "36": 278456576.0, + "37": 289517152.0, + "38": 274950816.0, + "39": 242921776.0, + "40": 223597264.0, + "41": 186386944.0, + "42": 180387488.0, + "43": 224573440.0, + "44": 217714800.0, + "45": 143723568.0, + "46": 161525888.0, + "47": 120124336.0, + "48": 183368272.0, + "49": 154411968.0, + "50": 167778288.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 7321308672, - "2": 7321310720, - "3": 7321310720, - "4": 7321310720, - "5": 7321310720, - "6": 7321310720, - "7": 7321310720, - "8": 7321310720, - "9": 7321310720, - "10": 7321310720, - "11": 7321310720, - "12": 7321310720, - "13": 7321310720, - "14": 7321310720, - "15": 7321310720, - "16": 7321310720, - "17": 7321310720, - "18": 7321310720, - "19": 7321310720, - "20": 7321310720, - "21": 7321310720, - "22": 7321310720, - "23": 7321310720, - "24": 7321310720, - "25": 7321310720, - "26": 7321310720, - "27": 7321310720, - "28": 7321310720, - "29": 7321310720, - "30": 7321310720, - "31": 7321310720, - "32": 7321310720, - "33": 7321310720, - "34": 7321310720, - "35": 7321310720, - "36": 7321310720, - "37": 7321310720, - "38": 7321310720, - "39": 7321310720, - "40": 7321310720, - "41": 7321310720, - "42": 7321310720, - "43": 7321310720, - "44": 7321310720, - "45": 7321310720, - "46": 7321310720, - "47": 7321310720, - "48": 7321310720, - "49": 7321310720, - "50": 7321310720 + "1": 7321336320.0, + "2": 7321338368.0, + "3": 7321338368.0, + "4": 7321338368.0, + "5": 7321338368.0, + "6": 7321338368.0, + "7": 7321338368.0, + "8": 7321338368.0, + "9": 7321338368.0, + "10": 7321338368.0, + "11": 7321338368.0, + "12": 7321338368.0, + "13": 7321338368.0, + "14": 7321338368.0, + "15": 7321338368.0, + "16": 7321338368.0, + "17": 7321338368.0, + "18": 7321338368.0, + "19": 7321338368.0, + "20": 7321338368.0, + "21": 7321338368.0, + "22": 7321338368.0, + "23": 7321338368.0, + "24": 7321338368.0, + "25": 7321338368.0, + "26": 7321338368.0, + "27": 7321338368.0, + "28": 7321338368.0, + "29": 7321338368.0, + "30": 7321338368.0, + "31": 7321338368.0, + "32": 7321338368.0, + "33": 7321338368.0, + "34": 7321338368.0, + "35": 7321338368.0, + "36": 7321338368.0, + "37": 7321338368.0, + "38": 7321338368.0, + "39": 7321338368.0, + "40": 7321338368.0, + "41": 7321338368.0, + "42": 7321338368.0, + "43": 7321338368.0, + "44": 7321338368.0, + "45": 7321338368.0, + "46": 7321338368.0, + "47": 7321338368.0, + "48": 7321338368.0, + "49": 7321338368.0, + "50": 7321338368.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 54396813312, - "2": 57149165568, - "3": 57165475840, - "4": 57165475840, - "5": 57165475840, - "6": 57165475840, - "7": 57165475840, - "8": 57165475840, - "9": 57165475840, - "10": 57165475840, - "11": 57165475840, - "12": 57165475840, - "13": 57165475840, - "14": 57165475840, - "15": 57165475840, - "16": 57165475840, - "17": 57165475840, - "18": 57165475840, - "19": 57165475840, - "20": 57165475840, - "21": 57165475840, - "22": 57165475840, - "23": 57165475840, - "24": 57165475840, - "25": 57165475840, - "26": 57165475840, - "27": 57165475840, - "28": 57165475840, - "29": 57165475840, - "30": 57165475840, - "31": 57165475840, - "32": 57165475840, - "33": 57165475840, - "34": 57165475840, - "35": 57165475840, - "36": 57165475840, - "37": 57165475840, - "38": 57165475840, - "39": 57165475840, - "40": 57295986688, - "41": 57295986688, - "42": 57331482624, - "43": 57360437248, - "44": 57561960448, - "45": 57561960448, - "46": 57561960448, - "47": 57585307648, - "48": 57602347008, - "49": 57823961088, - "50": 57823961088 + "1": 54402162688.0, + "2": 57150373888.0, + "3": 57150373888.0, + "4": 57150373888.0, + "5": 57150373888.0, + "6": 57150373888.0, + "7": 57150373888.0, + "8": 57150373888.0, + "9": 57150373888.0, + "10": 57150373888.0, + "11": 57150373888.0, + "12": 57150373888.0, + "13": 57150373888.0, + "14": 57150373888.0, + "15": 57150373888.0, + "16": 57150373888.0, + "17": 57150373888.0, + "18": 57150373888.0, + "19": 57150373888.0, + "20": 57150373888.0, + "21": 57150373888.0, + "22": 57150373888.0, + "23": 57150373888.0, + "24": 57150373888.0, + "25": 57150373888.0, + "26": 57150373888.0, + "27": 57150373888.0, + "28": 57150373888.0, + "29": 57150373888.0, + "30": 57150373888.0, + "31": 57150373888.0, + "32": 57150373888.0, + "33": 57150373888.0, + "34": 57150373888.0, + "35": 57152438272.0, + "36": 57344114688.0, + "37": 57344114688.0, + "38": 57449279488.0, + "39": 57449279488.0, + "40": 57449279488.0, + "41": 57449279488.0, + "42": 57449279488.0, + "43": 57449279488.0, + "44": 57449279488.0, + "45": 57470353408.0, + "46": 57470353408.0, + "47": 57470353408.0, + "48": 57470353408.0, + "49": 57470353408.0, + "50": 57470353408.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07779, - "2": 11.07564, - "3": 10.52904, - "4": 10.08924, - "5": 9.81101, - "6": 9.88786, - "7": 9.72987, - "8": 9.02044, - "9": 8.8145, - "10": 9.09362, - "11": 8.77612, - "12": 8.56714, - "13": 8.54777, - "14": 8.04338, - "15": 8.10946, - "16": 8.13231, - "17": 8.0853, - "18": 7.83475, - "19": 8.21923, - "20": 7.91097, - "21": 7.58489, - "22": 7.56231, - "23": 7.44204, - "24": 7.44303, - "25": 7.67594, - "26": 7.07138, - "27": 7.60696, - "28": 7.30925, - "29": 7.48219, - "30": 7.62699, - "31": 7.3655, - "32": 7.54203, - "33": 7.60199, - "34": 7.66716, - "35": 7.18385, - "36": 7.05252, - "37": 7.38377, - "38": 7.15521, - "39": 7.51639, - "40": 7.4929, - "41": 7.44762, - "42": 7.20298, - "43": 7.18681, - "44": 7.36683, - "45": 7.15506, - "46": 6.85064, - "47": 7.26072, - "48": 7.10489, - "49": 7.53477, - "50": 6.99715 + "1": 11.07769, + "2": 11.07625, + "3": 10.52909, + "4": 10.08687, + "5": 9.82013, + "6": 9.48246, + "7": 9.54169, + "8": 8.83661, + "9": 8.64933, + "10": 8.95821, + "11": 8.32934, + "12": 8.36033, + "13": 8.26936, + "14": 7.73441, + "15": 7.87122, + "16": 7.9153, + "17": 7.86923, + "18": 7.61191, + "19": 7.99919, + "20": 7.72174, + "21": 7.4147, + "22": 7.40336, + "23": 7.27676, + "24": 7.28557, + "25": 7.53782, + "26": 6.94933, + "27": 7.48504, + "28": 7.20219, + "29": 7.38696, + "30": 7.51152, + "31": 7.26613, + "32": 7.45631, + "33": 7.51482, + "34": 7.57527, + "35": 7.10374, + "36": 6.97224, + "37": 7.31053, + "38": 7.08607, + "39": 7.44371, + "40": 7.43612, + "41": 7.37848, + "42": 7.13561, + "43": 7.11558, + "44": 7.30254, + "45": 7.08147, + "46": 6.78911, + "47": 7.21791, + "48": 7.03066, + "49": 7.46668, + "50": 6.93251 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 89.12995, - "2": 1.33749, - "3": 1.24205, - "4": 1.63759, - "5": 1.13139, - "6": 1.12938, - "7": 1.37914, - "8": 1.3886, - "9": 1.10046, - "10": 1.11649, - "11": 1.11259, - "12": 1.10822, - "13": 1.10532, - "14": 1.11189, - "15": 1.1132, - "16": 1.10539, - "17": 1.11434, - "18": 1.11836, - "19": 1.11073, - "20": 1.11278, - "21": 1.11212, - "22": 1.10671, - "23": 1.11034, - "24": 1.11107, - "25": 1.11085, - "26": 1.10756, - "27": 1.10109, - "28": 1.1069, - "29": 1.11354, - "30": 1.11254, - "31": 1.10893, - "32": 1.11311, - "33": 1.10722, - "34": 1.10243, - "35": 1.10358, - "36": 1.09746, - "37": 1.09875, - "38": 1.10151, - "39": 1.10188, - "40": 1.10069, - "41": 1.10545, - "42": 1.10709, - "43": 1.1028, - "44": 1.10723, - "45": 1.10614, - "46": 1.09997, - "47": 1.1053, - "48": 1.10274, - "49": 1.09986, - "50": 1.10191 + "1": 95.02242, + "2": 1.29728, + "3": 1.24413, + "4": 1.67309, + "5": 1.12527, + "6": 1.39226, + "7": 1.33351, + "8": 1.19614, + "9": 1.10737, + "10": 1.09796, + "11": 1.10736, + "12": 1.10105, + "13": 1.10552, + "14": 1.11007, + "15": 1.09853, + "16": 1.10142, + "17": 1.09718, + "18": 1.10103, + "19": 1.10339, + "20": 1.1069, + "21": 1.10541, + "22": 1.10374, + "23": 1.1028, + "24": 1.1, + "25": 1.09935, + "26": 1.09318, + "27": 1.09779, + "28": 1.09457, + "29": 1.09, + "30": 1.09267, + "31": 1.08899, + "32": 1.09268, + "33": 1.08757, + "34": 1.08991, + "35": 1.09705, + "36": 1.09429, + "37": 1.09459, + "38": 1.08857, + "39": 1.09547, + "40": 1.09224, + "41": 1.089, + "42": 1.08879, + "43": 1.0834, + "44": 1.08212, + "45": 1.08363, + "46": 1.08596, + "47": 1.07798, + "48": 1.07329, + "49": 1.07678, + "50": 1.07483 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json index 58eb3fc16cd..8cea616921e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.95004, - "2": 10.9521, - "3": 10.5115, - "4": 9.96454, - "5": 9.93941, - "6": 9.67273, - "7": 10.20975, - "8": 9.49716, - "9": 9.55902, - "10": 9.79742, - "11": 9.30109, - "12": 9.40483, - "13": 9.39546, - "14": 8.84681, - "15": 9.02444, - "16": 9.07121, - "17": 9.04574, - "18": 8.75678, - "19": 9.18159, - "20": 8.8595, - "21": 8.53503, - "22": 8.55182, - "23": 8.42441, - "24": 8.37608, - "25": 8.64304, - "26": 7.97393, - "27": 8.56806, - "28": 8.19764, - "29": 8.3928, - "30": 8.67283, - "31": 8.289, - "32": 8.43572, - "33": 8.5568, - "34": 8.66018, - "35": 8.07934, - "36": 7.94976, - "37": 8.29565, - "38": 7.98044, - "39": 8.39201, - "40": 8.35513, - "41": 8.31876, - "42": 8.0583, - "43": 8.03283, - "44": 8.24243, - "45": 8.10277, - "46": 7.61696, - "47": 8.15273, - "48": 8.00569, - "49": 8.38688, - "50": 7.81491 + "1": 10.94971, + "2": 10.95163, + "3": 10.51641, + "4": 9.9652, + "5": 9.94116, + "6": 9.67394, + "7": 10.19887, + "8": 9.50035, + "9": 9.54982, + "10": 9.79667, + "11": 9.30128, + "12": 9.40566, + "13": 9.39438, + "14": 8.84572, + "15": 9.02231, + "16": 9.06973, + "17": 9.04712, + "18": 8.75662, + "19": 9.18074, + "20": 8.86175, + "21": 8.53558, + "22": 8.55288, + "23": 8.42513, + "24": 8.37683, + "25": 8.64426, + "26": 7.9756, + "27": 8.57026, + "28": 8.1987, + "29": 8.39406, + "30": 8.67631, + "31": 8.29096, + "32": 8.43692, + "33": 8.55897, + "34": 8.66123, + "35": 8.08, + "36": 7.95214, + "37": 8.2979, + "38": 7.98177, + "39": 8.39281, + "40": 8.35852, + "41": 8.32006, + "42": 8.05954, + "43": 8.03381, + "44": 8.24236, + "45": 8.1025, + "46": 7.61814, + "47": 8.15364, + "48": 8.00693, + "49": 8.38704, + "50": 7.81592 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403624.0, - "2": 19274194.0, - "3": 19372760.0, - "4": 86525248.0, - "5": 148575568.0, - "6": 145226704.0, - "7": 171879984.0, - "8": 195785248.0, - "9": 164124752.0, - "10": 167684736.0, - "11": 221077344.0, - "12": 200384224.0, - "13": 248872528.0, - "14": 211169424.0, - "15": 214304608.0, - "16": 216075632.0, - "17": 267845984.0, - "18": 170470336.0, - "19": 176865072.0, - "20": 187955392.0, - "21": 225750704.0, - "22": 247396816.0, - "23": 211643856.0, - "24": 205638464.0, - "25": 277022272.0, - "26": 291562304.0, - "27": 225789840.0, - "28": 288202368.0, - "29": 198390384.0, - "30": 213302208.0, - "31": 227204752.0, - "32": 271112416.0, - "33": 231840432.0, - "34": 203575536.0, - "35": 191152368.0, - "36": 222566928.0, - "37": 177810112.0, - "38": 228708544.0, - "39": 211168784.0, - "40": 215603968.0, - "41": 200089440.0, - "42": 228529888.0, - "43": 198782848.0, - "44": 141902272.0, - "45": 181922816.0, - "46": 115369856.0, - "47": 170214176.0, - "48": 137292832.0, - "49": 97654936.0, - "50": 160979632.0 + "1": 19403704.0, + "2": 19274216.0, + "3": 22517470.0, + "4": 83429816.0, + "5": 139167728.0, + "6": 138921280.0, + "7": 173470304.0, + "8": 200511856.0, + "9": 165696320.0, + "10": 166120112.0, + "11": 213254416.0, + "12": 187847360.0, + "13": 231586656.0, + "14": 226879072.0, + "15": 219025920.0, + "16": 205179664.0, + "17": 280450432.0, + "18": 181477792.0, + "19": 191026096.0, + "20": 186395632.0, + "21": 233632576.0, + "22": 231696832.0, + "23": 216390688.0, + "24": 215133760.0, + "25": 233079504.0, + "26": 244437920.0, + "27": 222637584.0, + "28": 278773952.0, + "29": 253409264.0, + "30": 240036736.0, + "31": 236599008.0, + "32": 205066624.0, + "33": 263303312.0, + "34": 200444544.0, + "35": 199033824.0, + "36": 243001216.0, + "37": 151181872.0, + "38": 175301280.0, + "39": 219001024.0, + "40": 220307936.0, + "41": 217385856.0, + "42": 230074176.0, + "43": 208226784.0, + "44": 148172720.0, + "45": 141103744.0, + "46": 132664976.0, + "47": 179619392.0, + "48": 118381144.0, + "49": 86643984.0, + "50": 113798320.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4883602432.0, - "2": 4885017088.0, - "3": 4882657792.0, - "4": 4883046912.0, - "5": 4883725824.0, - "6": 4883713536.0, - "7": 4883040768.0, - "8": 4883273216.0, - "9": 4882952704.0, - "10": 4885949952.0, - "11": 4883990016.0, - "12": 4887679488.0, - "13": 4884011520.0, - "14": 4882899456.0, - "15": 4883515904.0, - "16": 4883990016.0, - "17": 4883410432.0, - "18": 4883673600.0, - "19": 4882903552.0, - "20": 4884541952.0, - "21": 4883138048.0, - "22": 4883247616.0, - "23": 4883839488.0, - "24": 4885058048.0, - "25": 4882676224.0, - "26": 4884058624.0, - "27": 4884724224.0, - "28": 4884874752.0, - "29": 4883127808.0, - "30": 4883252736.0, - "31": 4882955776.0, - "32": 4885190144.0, - "33": 4883845632.0, - "34": 4884392448.0, - "35": 4883083776.0, - "36": 4883851776.0, - "37": 4885246464.0, - "38": 4882680320.0, - "39": 4884296192.0, - "40": 4884689408.0, - "41": 4882836992.0, - "42": 4883972608.0, - "43": 4884519424.0, - "44": 4883354112.0, - "45": 4883495424.0, - "46": 4882788864.0, - "47": 4883144192.0, - "48": 4883688960.0, - "49": 4884182528.0, - "50": 4885279232.0 + "1": 4883287040.0, + "2": 4883441152.0, + "3": 4881697280.0, + "4": 4883730944.0, + "5": 4882556416.0, + "6": 4882616832.0, + "7": 4883438080.0, + "8": 4881568256.0, + "9": 4883173888.0, + "10": 4882272768.0, + "11": 4883676672.0, + "12": 4881393152.0, + "13": 4883141120.0, + "14": 4883697152.0, + "15": 4882622976.0, + "16": 4881830400.0, + "17": 4881658368.0, + "18": 4881863168.0, + "19": 4883804672.0, + "20": 4881795584.0, + "21": 4883333632.0, + "22": 4882194944.0, + "23": 4882084352.0, + "24": 4884065792.0, + "25": 4881804800.0, + "26": 4883596800.0, + "27": 4883047936.0, + "28": 4882476544.0, + "29": 4883087872.0, + "30": 4882151936.0, + "31": 4882625024.0, + "32": 4883104256.0, + "33": 4882526720.0, + "34": 4882292224.0, + "35": 4882485760.0, + "36": 4882867712.0, + "37": 4882634240.0, + "38": 4882610688.0, + "39": 4881474048.0, + "40": 4881961472.0, + "41": 4882663936.0, + "42": 4881860096.0, + "43": 4881499648.0, + "44": 4883392000.0, + "45": 4882392576.0, + "46": 4882815488.0, + "47": 4883113472.0, + "48": 4882158080.0, + "49": 4881207808.0, + "50": 4881588736.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41210470400.0, - "2": 41210470400.0, - "3": 41210470400.0, - "4": 41210470400.0, - "5": 41210470400.0, - "6": 41210470400.0, - "7": 41210470400.0, - "8": 41210470400.0, - "9": 41210470400.0, - "10": 41210470400.0, - "11": 41210470400.0, - "12": 41210470400.0, - "13": 41210470400.0, - "14": 41210470400.0, - "15": 41210470400.0, - "16": 41210470400.0, - "17": 41210470400.0, - "18": 41210470400.0, - "19": 41210470400.0, - "20": 41210470400.0, - "21": 41210470400.0, - "22": 41210470400.0, - "23": 41210470400.0, - "24": 41210470400.0, - "25": 41210470400.0, - "26": 41210470400.0, - "27": 41210470400.0, - "28": 41210470400.0, - "29": 41210470400.0, - "30": 41210470400.0, - "31": 41210470400.0, - "32": 41210470400.0, - "33": 41210470400.0, - "34": 41210470400.0, - "35": 41210470400.0, - "36": 41210470400.0, - "37": 41210470400.0, - "38": 41210470400.0, - "39": 41210470400.0, - "40": 41210470400.0, - "41": 41210470400.0, - "42": 41210470400.0, - "43": 41210470400.0, - "44": 41210470400.0, - "45": 41210470400.0, - "46": 41210470400.0, - "47": 41210470400.0, - "48": 41210470400.0, - "49": 41210470400.0, - "50": 41210470400.0 + "1": 41208348672.0, + "2": 41208348672.0, + "3": 41208348672.0, + "4": 41208348672.0, + "5": 41208348672.0, + "6": 41208348672.0, + "7": 41208348672.0, + "8": 41208348672.0, + "9": 41208348672.0, + "10": 41208348672.0, + "11": 41208348672.0, + "12": 41208348672.0, + "13": 41208348672.0, + "14": 41208348672.0, + "15": 41208348672.0, + "16": 41208348672.0, + "17": 41208348672.0, + "18": 41208348672.0, + "19": 41208348672.0, + "20": 41208348672.0, + "21": 41208348672.0, + "22": 41208348672.0, + "23": 41208348672.0, + "24": 41208348672.0, + "25": 41208348672.0, + "26": 41208348672.0, + "27": 41208348672.0, + "28": 41208348672.0, + "29": 41208348672.0, + "30": 41208348672.0, + "31": 41208348672.0, + "32": 41208348672.0, + "33": 41208348672.0, + "34": 41208348672.0, + "35": 41208348672.0, + "36": 41208348672.0, + "37": 41208348672.0, + "38": 41208348672.0, + "39": 41208348672.0, + "40": 41208348672.0, + "41": 41208348672.0, + "42": 41208348672.0, + "43": 41208348672.0, + "44": 41208348672.0, + "45": 41208348672.0, + "46": 41208348672.0, + "47": 41208348672.0, + "48": 41208348672.0, + "49": 41208348672.0, + "50": 41208348672.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 86.8085, - "2": 1.10913, - "3": 0.99097, - "4": 0.89412, - "5": 1.25997, - "6": 0.98162, - "7": 0.98318, - "8": 1.13296, - "9": 0.88126, - "10": 0.8633, - "11": 2.2744, - "12": 4.5393, - "13": 3.22763, - "14": 1.64923, - "15": 0.86595, - "16": 0.86575, - "17": 0.85272, - "18": 0.85454, - "19": 0.85281, - "20": 0.87018, - "21": 0.84654, - "22": 0.8494, - "23": 0.84882, - "24": 0.84482, - "25": 0.85311, - "26": 0.84678, - "27": 0.84096, - "28": 0.8412, - "29": 0.84156, - "30": 0.84475, - "31": 0.84747, - "32": 0.85058, - "33": 0.84977, - "34": 0.8479, - "35": 0.85234, - "36": 0.85012, - "37": 0.85087, - "38": 0.84594, - "39": 0.84558, - "40": 0.84807, - "41": 0.84183, - "42": 0.8439, - "43": 0.84221, - "44": 0.84248, - "45": 0.84257, - "46": 0.83922, - "47": 0.84311, - "48": 0.84159, - "49": 0.84011, - "50": 0.8353 + "1": 89.10928, + "2": 1.08143, + "3": 0.94222, + "4": 0.89675, + "5": 1.34524, + "6": 1.06972, + "7": 1.00314, + "8": 1.04961, + "9": 0.86611, + "10": 0.86248, + "11": 0.98739, + "12": 0.86057, + "13": 0.86777, + "14": 0.85834, + "15": 0.8559, + "16": 0.85522, + "17": 0.84644, + "18": 0.85748, + "19": 0.85218, + "20": 0.85342, + "21": 0.84029, + "22": 0.84342, + "23": 0.84297, + "24": 0.83925, + "25": 0.8439, + "26": 0.85696, + "27": 0.83981, + "28": 0.84643, + "29": 0.8433, + "30": 0.86234, + "31": 0.85636, + "32": 0.84184, + "33": 0.84501, + "34": 0.84316, + "35": 0.83806, + "36": 0.84143, + "37": 0.84447, + "38": 0.84137, + "39": 0.84133, + "40": 0.84321, + "41": 0.84019, + "42": 0.84164, + "43": 0.83741, + "44": 0.84203, + "45": 0.83966, + "46": 0.84109, + "47": 0.83945, + "48": 0.84001, + "49": 0.84194, + "50": 0.83578 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json index 1ba051f4889..0835e95b926 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_h100.json @@ -1 +1,142 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.83281, "5": 10.85975, "10": 10.79613, "15": 10.80527, "20": 10.72502, "25": 10.53599, "30": 10.3571, "35": 10.24605, "40": 10.05992, "45": 9.7836, "50": 9.8722, "55": 9.83189, "60": 9.45075, "65": 8.89679, "70": 9.71414, "75": 9.39795, "80": 9.38169, "85": 9.58585, "90": 9.7999, "95": 9.50528, "100": 9.37224}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 27013.0, "5": 31736.0, "10": 25785.0, "15": 30383.0, "20": 28435.0, "25": 27493.0, "30": 30329.0, "35": 31750.0, "40": 34279.0, "45": 34634.0, "50": 38531.0, "55": 37465.0, "60": 40172.0, "65": 40624.0, "70": 44852.0, "75": 39231.0, "80": 130535.0, "85": 123250.0, "90": 47793.0, "95": 167340.0, "100": 163328.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 814390272.0, "5": 814420480.0, "10": 814376448.0, "15": 814376960.0, "20": 814373376.0, "25": 814321152.0, "30": 814306304.0, "35": 814292992.0, "40": 814288896.0, "45": 814272000.0, "50": 814262272.0, "55": 814258688.0, "60": 814268416.0, "65": 814220800.0, "70": 814266880.0, "75": 814318080.0, "80": 814285312.0, "85": 814289408.0, "90": 814315520.0, "95": 814320128.0, "100": 814311424.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2111314944.0, "5": 2370209280.0, "10": 2370209280.0, "15": 2370209280.0, "20": 2370209280.0, "25": 2370209280.0, "30": 2370209280.0, "35": 2370209280.0, "40": 2370209280.0, "45": 2370209280.0, "50": 2370209280.0, "55": 2370209280.0, "60": 2370209280.0, "65": 2370209280.0, "70": 2370209280.0, "75": 2370209280.0, "80": 2370209280.0, "85": 2370209280.0, "90": 2370209280.0, "95": 2370209280.0, "100": 2370209280.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 20.98318, "5": 0.79797, "10": 0.74028, "15": 0.67279, "20": 0.62948, "25": 0.61132, "30": 0.61547, "35": 0.6152, "40": 0.60421, "45": 0.59124, "50": 0.5891, "55": 0.57048, "60": 0.54799, "65": 0.52185, "70": 0.51195, "75": 0.50105, "80": 0.4628, "85": 0.45992, "90": 0.46498, "95": 0.4599, "100": 0.42568}}} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 10.82922, + "5": 10.85652, + "10": 10.79298, + "15": 10.8067, + "20": 10.72654, + "25": 10.53282, + "30": 10.35802, + "35": 10.24483, + "40": 10.05533, + "45": 9.77951, + "50": 9.86874, + "55": 9.82995, + "60": 9.449, + "65": 8.89366, + "70": 9.71127, + "75": 9.39451, + "80": 9.38198, + "85": 9.58333, + "90": 9.79944, + "95": 9.50213, + "100": 9.37131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 27245.0, + "5": 31369.0, + "10": 25870.0, + "15": 29830.0, + "20": 28243.0, + "25": 27636.0, + "30": 30387.0, + "35": 31488.0, + "40": 34779.0, + "45": 35158.0, + "50": 38234.0, + "55": 37133.0, + "60": 40450.0, + "65": 40947.0, + "70": 43436.0, + "75": 39925.0, + "80": 51863.0, + "85": 2145177.0, + "90": 51330.0, + "95": 45247.0, + "100": 163741.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 787511296.0, + "5": 787542016.0, + "10": 787500032.0, + "15": 787499008.0, + "20": 787500032.0, + "25": 787446272.0, + "30": 787429888.0, + "35": 787413504.0, + "40": 787409920.0, + "45": 787394560.0, + "50": 787384320.0, + "55": 787383808.0, + "60": 787389952.0, + "65": 787346432.0, + "70": 787387904.0, + "75": 787437568.0, + "80": 787405312.0, + "85": 787407360.0, + "90": 787441664.0, + "95": 787445248.0, + "100": 787433472.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 2465793024.0, + "5": 2492764160.0, + "10": 2492764160.0, + "15": 2492764160.0, + "20": 2492764160.0, + "25": 2492764160.0, + "30": 2492764160.0, + "35": 2492764160.0, + "40": 2492764160.0, + "45": 2492764160.0, + "50": 2492764160.0, + "55": 2492764160.0, + "60": 2492764160.0, + "65": 2492764160.0, + "70": 2492764160.0, + "75": 2492764160.0, + "80": 2492764160.0, + "85": 2492764160.0, + "90": 2492764160.0, + "95": 2492764160.0, + "100": 2492764160.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 5, + "values": { + "1": 9.68104, + "5": 0.32859, + "10": 0.30772, + "15": 0.31234, + "20": 0.29254, + "25": 0.29296, + "30": 0.31344, + "35": 0.31026, + "40": 0.30514, + "45": 0.30481, + "50": 0.30324, + "55": 0.29929, + "60": 0.30103, + "65": 0.32008, + "70": 0.31307, + "75": 0.2933, + "80": 0.29351, + "85": 0.29283, + "90": 0.29375, + "95": 0.29458, + "100": 0.29103 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json new file mode 100644 index 00000000000..7e299df5257 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_coreweave.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82922, + "2": 10.84163, + "3": 10.84245, + "4": 10.82, + "5": 10.85652, + "6": 10.86906, + "7": 10.83778, + "8": 10.84312, + "9": 10.84423, + "10": 10.79298, + "11": 10.86697, + "12": 10.86875, + "13": 10.86207, + "14": 10.86919, + "15": 10.8067, + "16": 10.8057, + "17": 10.77686, + "18": 10.79541, + "19": 10.78384, + "20": 10.72654, + "21": 10.69491, + "22": 10.54462, + "23": 10.6993, + "24": 10.58151, + "25": 10.53282, + "26": 10.58817, + "27": 10.601, + "28": 10.57563, + "29": 10.58022, + "30": 10.35802, + "31": 10.08769, + "32": 10.44466, + "33": 10.4477, + "34": 10.18704, + "35": 10.24483, + "36": 10.19713, + "37": 10.32294, + "38": 10.17101, + "39": 10.37026, + "40": 10.05533, + "41": 10.09491, + "42": 10.17971, + "43": 9.78263, + "44": 9.91346, + "45": 9.77951, + "46": 9.75648, + "47": 10.09647, + "48": 9.80391, + "49": 9.46649, + "50": 9.86874, + "51": 9.79428, + "52": 9.68303, + "53": 10.03314, + "54": 9.9113, + "55": 9.82995, + "56": 9.57839, + "57": 9.42377, + "58": 9.80549, + "59": 9.53292, + "60": 9.449, + "61": 9.65293, + "62": 9.95672, + "63": 9.33775, + "64": 9.74194, + "65": 8.89366, + "66": 9.67317, + "67": 9.33002, + "68": 9.76517, + "69": 9.76336, + "70": 9.71127, + "71": 9.59511, + "72": 9.54797, + "73": 9.47124, + "74": 8.89297, + "75": 9.39451, + "76": 9.04721, + "77": 10.04318, + "78": 9.70313, + "79": 9.35169, + "80": 9.38198, + "81": 9.45146, + "82": 9.67546, + "83": 9.27658, + "84": 9.39241, + "85": 9.58333, + "86": 9.04518, + "87": 9.56487, + "88": 9.72459, + "89": 9.57019, + "90": 9.79944, + "91": 9.30737, + "92": 9.3313, + "93": 9.04109, + "94": 8.80259, + "95": 9.50213, + "96": 9.5021, + "97": 9.28183, + "98": 9.64883, + "99": 8.8594, + "100": 9.37131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 27245.0, + "2": 28958.0, + "3": 29464.0, + "4": 28046.0, + "5": 31369.0, + "6": 33287.0, + "7": 31200.0, + "8": 26921.0, + "9": 30008.0, + "10": 25870.0, + "11": 33681.0, + "12": 30344.0, + "13": 32737.0, + "14": 33315.0, + "15": 29830.0, + "16": 32475.0, + "17": 30747.0, + "18": 30381.0, + "19": 31032.0, + "20": 28243.0, + "21": 29224.0, + "22": 27340.0, + "23": 34119.0, + "24": 29049.0, + "25": 27636.0, + "26": 30662.0, + "27": 32009.0, + "28": 33355.0, + "29": 34714.0, + "30": 30387.0, + "31": 28212.0, + "32": 33411.0, + "33": 34696.0, + "34": 30053.0, + "35": 31488.0, + "36": 32943.0, + "37": 35829.0, + "38": 33740.0, + "39": 37632.0, + "40": 34779.0, + "41": 33958.0, + "42": 36396.0, + "43": 34088.0, + "44": 34090.0, + "45": 35158.0, + "46": 36174.0, + "47": 39772.0, + "48": 36516.0, + "49": 36733.0, + "50": 38234.0, + "51": 38608.0, + "52": 37030.0, + "53": 42442.0, + "54": 40944.0, + "55": 37133.0, + "56": 41001.0, + "57": 37524.0, + "58": 42317.0, + "59": 40804.0, + "60": 40450.0, + "61": 41478.0, + "62": 39766.0, + "63": 37941.0, + "64": 42197.0, + "65": 40947.0, + "66": 44094.0, + "67": 41958.0, + "68": 40060.0, + "69": 42189.0, + "70": 43436.0, + "71": 42748.0, + "72": 44280.0, + "73": 47478.0, + "74": 41456.0, + "75": 39925.0, + "76": 43490.0, + "77": 45636.0, + "78": 2141470.0, + "79": 46055.0, + "80": 51863.0, + "81": 151341.0, + "82": 49835.0, + "83": 143360.0, + "84": 2141546.0, + "85": 2145177.0, + "86": 132114.0, + "87": 2147022.0, + "88": 59899.0, + "89": 162883.0, + "90": 51330.0, + "91": 2141901.0, + "92": 44946.0, + "93": 138194.0, + "94": 2145772.0, + "95": 45247.0, + "96": 135045.0, + "97": 53170.0, + "98": 168576.0, + "99": 2141797.0, + "100": 163741.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 787516416.0, + "2": 787540992.0, + "3": 787524096.0, + "4": 787512320.0, + "5": 787547136.0, + "6": 787537920.0, + "7": 787512832.0, + "8": 787524608.0, + "9": 787528192.0, + "10": 787505152.0, + "11": 787522048.0, + "12": 787520000.0, + "13": 787529728.0, + "14": 787529216.0, + "15": 787504128.0, + "16": 787513344.0, + "17": 787503104.0, + "18": 787489280.0, + "19": 787514880.0, + "20": 787505152.0, + "21": 787479552.0, + "22": 787486208.0, + "23": 787478528.0, + "24": 787486208.0, + "25": 787451392.0, + "26": 787482112.0, + "27": 787470848.0, + "28": 787450368.0, + "29": 787458048.0, + "30": 787435008.0, + "31": 787406848.0, + "32": 787424256.0, + "33": 787435520.0, + "34": 787426304.0, + "35": 787418624.0, + "36": 787436544.0, + "37": 787428352.0, + "38": 787436544.0, + "39": 787417600.0, + "40": 787415040.0, + "41": 787405824.0, + "42": 787415040.0, + "43": 787367936.0, + "44": 787392512.0, + "45": 787399680.0, + "46": 787355136.0, + "47": 787411456.0, + "48": 787354112.0, + "49": 787374080.0, + "50": 787389440.0, + "51": 787375616.0, + "52": 787383808.0, + "53": 787379712.0, + "54": 787384832.0, + "55": 787388928.0, + "56": 787388928.0, + "57": 787351040.0, + "58": 787382784.0, + "59": 787374080.0, + "60": 787395072.0, + "61": 787405312.0, + "62": 787405824.0, + "63": 787373056.0, + "64": 787388928.0, + "65": 787351552.0, + "66": 787386880.0, + "67": 787392000.0, + "68": 787399168.0, + "69": 787383296.0, + "70": 787393024.0, + "71": 787406848.0, + "72": 787400704.0, + "73": 787401216.0, + "74": 787403264.0, + "75": 787442688.0, + "76": 787444736.0, + "77": 787445760.0, + "78": 787395072.0, + "79": 787430400.0, + "80": 787410432.0, + "81": 787412992.0, + "82": 787427840.0, + "83": 787428864.0, + "84": 787412480.0, + "85": 787412480.0, + "86": 787394560.0, + "87": 787452928.0, + "88": 787414528.0, + "89": 787404800.0, + "90": 787446784.0, + "91": 787446272.0, + "92": 787446784.0, + "93": 787430400.0, + "94": 787440128.0, + "95": 787450368.0, + "96": 787454976.0, + "97": 787427328.0, + "98": 787475968.0, + "99": 787419136.0, + "100": 787438592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2479493120.0, + "2": 2485449728.0, + "3": 2487249408.0, + "4": 2487249408.0, + "5": 2495991808.0, + "6": 2495991808.0, + "7": 2495991808.0, + "8": 2495991808.0, + "9": 2495991808.0, + "10": 2495991808.0, + "11": 2495991808.0, + "12": 2495991808.0, + "13": 2495991808.0, + "14": 2495991808.0, + "15": 2495991808.0, + "16": 2495991808.0, + "17": 2495991808.0, + "18": 2495991808.0, + "19": 2495991808.0, + "20": 2495991808.0, + "21": 2495991808.0, + "22": 2495991808.0, + "23": 2495991808.0, + "24": 2495991808.0, + "25": 2495991808.0, + "26": 2495991808.0, + "27": 2495991808.0, + "28": 2495991808.0, + "29": 2495991808.0, + "30": 2495991808.0, + "31": 2495991808.0, + "32": 2495991808.0, + "33": 2495991808.0, + "34": 2495991808.0, + "35": 2495991808.0, + "36": 2495991808.0, + "37": 2495991808.0, + "38": 2495991808.0, + "39": 2495991808.0, + "40": 2495991808.0, + "41": 2495991808.0, + "42": 2495991808.0, + "43": 2495991808.0, + "44": 2495991808.0, + "45": 2495991808.0, + "46": 2495991808.0, + "47": 2495991808.0, + "48": 2495991808.0, + "49": 2495991808.0, + "50": 2495991808.0, + "51": 2495991808.0, + "52": 2495991808.0, + "53": 2495991808.0, + "54": 2495991808.0, + "55": 2495991808.0, + "56": 2495991808.0, + "57": 2495991808.0, + "58": 2495991808.0, + "59": 2495991808.0, + "60": 2495991808.0, + "61": 2495991808.0, + "62": 2495991808.0, + "63": 2495991808.0, + "64": 2495991808.0, + "65": 2495991808.0, + "66": 2495991808.0, + "67": 2495991808.0, + "68": 2495991808.0, + "69": 2495991808.0, + "70": 2495991808.0, + "71": 2495991808.0, + "72": 2495991808.0, + "73": 2495991808.0, + "74": 2495991808.0, + "75": 2495991808.0, + "76": 2495991808.0, + "77": 2495991808.0, + "78": 2495991808.0, + "79": 2495991808.0, + "80": 2495991808.0, + "81": 2495991808.0, + "82": 2495991808.0, + "83": 2495991808.0, + "84": 2495991808.0, + "85": 2495991808.0, + "86": 2495991808.0, + "87": 2495991808.0, + "88": 2495991808.0, + "89": 2495991808.0, + "90": 2495991808.0, + "91": 2495991808.0, + "92": 2495991808.0, + "93": 2495991808.0, + "94": 2495991808.0, + "95": 2495991808.0, + "96": 2495991808.0, + "97": 2495991808.0, + "98": 2495991808.0, + "99": 2495991808.0, + "100": 2495991808.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 12.11313, + "2": 0.4805, + "3": 0.36965, + "4": 0.36695, + "5": 0.31705, + "6": 0.31275, + "7": 0.31299, + "8": 0.29866, + "9": 0.28961, + "10": 0.28859, + "11": 0.29067, + "12": 0.29044, + "13": 0.29806, + "14": 0.29287, + "15": 0.29391, + "16": 0.3175, + "17": 0.28363, + "18": 0.2818, + "19": 0.29347, + "20": 0.28931, + "21": 0.29103, + "22": 0.28444, + "23": 0.28907, + "24": 0.27608, + "25": 0.28277, + "26": 0.28656, + "27": 0.28921, + "28": 0.30243, + "29": 0.30435, + "30": 0.31231, + "31": 0.30439, + "32": 0.31412, + "33": 0.28887, + "34": 0.29613, + "35": 0.29738, + "36": 0.29754, + "37": 0.3019, + "38": 0.2933, + "39": 0.2944, + "40": 0.29283, + "41": 0.29592, + "42": 0.29673, + "43": 0.29319, + "44": 0.30127, + "45": 0.29921, + "46": 0.29904, + "47": 0.28795, + "48": 0.29918, + "49": 0.28711, + "50": 0.29645, + "51": 0.28777, + "52": 0.29536, + "53": 0.2847, + "54": 0.28286, + "55": 0.2874, + "56": 0.28699, + "57": 0.28614, + "58": 0.29825, + "59": 0.28363, + "60": 0.29423, + "61": 0.29226, + "62": 0.2896, + "63": 0.28065, + "64": 0.29533, + "65": 0.29842, + "66": 0.28487, + "67": 0.28419, + "68": 0.29474, + "69": 0.28383, + "70": 0.28417, + "71": 0.29253, + "72": 0.28737, + "73": 0.27923, + "74": 0.28728, + "75": 0.29383, + "76": 0.28157, + "77": 0.64771, + "78": 0.29148, + "79": 0.28742, + "80": 0.29245, + "81": 0.28827, + "82": 0.28368, + "83": 0.28963, + "84": 0.29234, + "85": 0.28183, + "86": 0.28337, + "87": 0.27879, + "88": 0.28388, + "89": 0.28309, + "90": 0.28852, + "91": 0.28254, + "92": 0.28375, + "93": 0.28633, + "94": 0.28567, + "95": 0.28235, + "96": 0.28513, + "97": 0.27951, + "98": 0.27851, + "99": 0.28336, + "100": 0.27744 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml index 3ecd68b9841..8874f9cf045 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/model_config.yaml @@ -56,7 +56,7 @@ MODEL_ARGS: --attention-softmax-in-fp32: true --use-checkpoint-opt_param-scheduler: true --use-mcore-models: true - --ckpt-format: torch_dist + --ckpt-format: fsdp_dtensor --dist-ckpt-optim-fully-reshardable: true --dist-ckpt-strictness: log_all # backward compatibility for TE changes --data-cache-path: ${DATA_CACHE_PATH} diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json index b3f192ba287..73fb00c9231 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07546, - "2": 11.03837, - "3": 9.66011, - "4": 9.91381, - "5": 9.32909, - "6": 9.13922, - "7": 9.13574, - "8": 8.65508, - "9": 8.51394, - "10": 8.8409, - "11": 8.29149, - "12": 8.34581, - "13": 8.25518, - "14": 7.73711, - "15": 7.86249, - "16": 7.9371, - "17": 7.89319, - "18": 7.63123, - "19": 7.99731, - "20": 7.74538, - "21": 7.44348, - "22": 7.42249, - "23": 7.29714, - "24": 7.27462, - "25": 7.54574, - "26": 6.96838, - "27": 7.50556, - "28": 7.22743, - "29": 7.36588, - "30": 7.52622, - "31": 7.27026, - "32": 7.45521, - "33": 7.50954, - "34": 7.55686, - "35": 7.10177, - "36": 6.96431, - "37": 7.28463, - "38": 7.0808, - "39": 7.40923, - "40": 7.43338, - "41": 7.38496, - "42": 7.15749, - "43": 7.15858, - "44": 7.28852, - "45": 7.16793, - "46": 6.78468, - "47": 7.4114, - "48": 7.0027, - "49": 7.46249, - "50": 6.92151 + "1": 11.07559, + "2": 11.03834, + "3": 9.66022, + "4": 9.91367, + "5": 9.3291, + "6": 9.13927, + "7": 9.13591, + "8": 8.65527, + "9": 8.51396, + "10": 8.84095, + "11": 8.29144, + "12": 8.34584, + "13": 8.25509, + "14": 7.73685, + "15": 7.86273, + "16": 7.93699, + "17": 7.89257, + "18": 7.63116, + "19": 7.99719, + "20": 7.7453, + "21": 7.44298, + "22": 7.42242, + "23": 7.29721, + "24": 7.27467, + "25": 7.54562, + "26": 6.96839, + "27": 7.50569, + "28": 7.22761, + "29": 7.36579, + "30": 7.52635, + "31": 7.27036, + "32": 7.45548, + "33": 7.50952, + "34": 7.55694, + "35": 7.10212, + "36": 6.96414, + "37": 7.28438, + "38": 7.08049, + "39": 7.40908, + "40": 7.4335, + "41": 7.38491, + "42": 7.15766, + "43": 7.15867, + "44": 7.28831, + "45": 7.16729, + "46": 6.78429, + "47": 7.40937, + "48": 7.00259, + "49": 7.46241, + "50": 6.92143 } }, "num-zeros": { @@ -63,54 +63,54 @@ "values": { "1": 911219392.0, "2": 910960384.0, - "3": 911156352.0, - "4": 912204800.0, - "5": 920796544.0, - "6": 940387968.0, - "7": 990599872.0, - "8": 976457728.0, - "9": 998097664.0, - "10": 995852672.0, - "11": 994583680.0, - "12": 977344896.0, - "13": 1028141824.0, - "14": 1007166208.0, - "15": 987423616.0, - "16": 993054784.0, - "17": 982319168.0, - "18": 998261760.0, - "19": 984696320.0, - "20": 982914752.0, - "21": 979667456.0, - "22": 953988864.0, - "23": 972353984.0, - "24": 964792064.0, - "25": 958512192.0, - "26": 946928512.0, + "3": 911156288.0, + "4": 913253376.0, + "5": 921845056.0, + "6": 941436672.0, + "7": 993745472.0, + "8": 974360512.0, + "9": 999146112.0, + "10": 992706944.0, + "11": 991438144.0, + "12": 979442048.0, + "13": 1029190272.0, + "14": 1008214656.0, + "15": 988472000.0, + "16": 988861120.0, + "17": 979173312.0, + "18": 996164608.0, + "19": 979453440.0, + "20": 982914688.0, + "21": 975473344.0, + "22": 955037568.0, + "23": 969208128.0, + "24": 965840832.0, + "25": 953269440.0, + "26": 949025536.0, "27": 948458304.0, - "28": 949643968.0, - "29": 942877440.0, + "28": 951741184.0, + "29": 943926272.0, "30": 935020160.0, - "31": 935327616.0, - "32": 934281088.0, - "33": 921805568.0, - "34": 928189312.0, - "35": 922202496.0, - "36": 924246656.0, - "37": 920661248.0, + "31": 933230336.0, + "32": 930086848.0, + "33": 922853952.0, + "34": 927140800.0, + "35": 925348224.0, + "36": 925295168.0, + "37": 922758272.0, "38": 922930752.0, - "39": 922322816.0, - "40": 921856512.0, - "41": 920227968.0, + "39": 922322880.0, + "40": 921856640.0, + "41": 920227776.0, "42": 918353664.0, - "43": 918607040.0, - "44": 914948032.0, - "45": 914295232.0, + "43": 919655616.0, + "44": 914948224.0, + "45": 916392512.0, "46": 914344448.0, "47": 911769536.0, - "48": 912013312.0, - "49": 910349440.0, - "50": 914351552.0 + "48": 912013248.0, + "49": 910349376.0, + "50": 914351616.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41739952128.0, - "2": 43687571456.0, - "3": 43687571456.0, - "4": 43983216640.0, - "5": 43983216640.0, - "6": 43983216640.0, - "7": 43983216640.0, - "8": 44024635392.0, - "9": 44041216000.0, - "10": 44041216000.0, - "11": 44041216000.0, - "12": 44041216000.0, - "13": 44041216000.0, - "14": 44041216000.0, - "15": 44041216000.0, - "16": 44041216000.0, - "17": 44041216000.0, - "18": 44041216000.0, - "19": 44041216000.0, - "20": 44041216000.0, - "21": 44041216000.0, - "22": 44041216000.0, - "23": 44041216000.0, - "24": 44041216000.0, - "25": 44041216000.0, - "26": 44041216000.0, - "27": 44041216000.0, - "28": 44041216000.0, - "29": 44041326592.0, - "30": 44162326528.0, - "31": 44220485632.0, - "32": 44270411776.0, - "33": 44293799936.0, - "34": 44293799936.0, - "35": 44293799936.0, - "36": 44293799936.0, - "37": 44293799936.0, - "38": 44293799936.0, - "39": 44293799936.0, - "40": 44293799936.0, - "41": 44293799936.0, - "42": 44293799936.0, - "43": 44293799936.0, - "44": 44293799936.0, - "45": 44293799936.0, - "46": 44293799936.0, - "47": 44293799936.0, - "48": 44293799936.0, - "49": 44293799936.0, - "50": 44293799936.0 + "1": 41740259328.0, + "2": 43687292928.0, + "3": 43687292928.0, + "4": 43984064512.0, + "5": 43984064512.0, + "6": 43984064512.0, + "7": 43984064512.0, + "8": 44026380288.0, + "9": 44041506816.0, + "10": 44041506816.0, + "11": 44041506816.0, + "12": 44041506816.0, + "13": 44041506816.0, + "14": 44041506816.0, + "15": 44041506816.0, + "16": 44041506816.0, + "17": 44041506816.0, + "18": 44041506816.0, + "19": 44041506816.0, + "20": 44041506816.0, + "21": 44041506816.0, + "22": 44041506816.0, + "23": 44041506816.0, + "24": 44041506816.0, + "25": 44041506816.0, + "26": 44041506816.0, + "27": 44041506816.0, + "28": 44041506816.0, + "29": 44044173312.0, + "30": 44164231168.0, + "31": 44221079552.0, + "32": 44271415296.0, + "33": 44290232320.0, + "34": 44290232320.0, + "35": 44290232320.0, + "36": 44290232320.0, + "37": 44290232320.0, + "38": 44290232320.0, + "39": 44290232320.0, + "40": 44290232320.0, + "41": 44290232320.0, + "42": 44290232320.0, + "43": 44290232320.0, + "44": 44290232320.0, + "45": 44290232320.0, + "46": 44290232320.0, + "47": 44290232320.0, + "48": 44290232320.0, + "49": 44290232320.0, + "50": 44290232320.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.08617, - "2": 11.10475, - "3": 10.48001, - "4": 10.13466, - "5": 9.79047, - "6": 9.50601, - "7": 9.5113, - "8": 8.85336, - "9": 8.66683, - "10": 8.95866, - "11": 8.29315, - "12": 8.36982, - "13": 8.25544, - "14": 7.73322, + "1": 11.08623, + "2": 11.1047, + "3": 10.47999, + "4": 10.13471, + "5": 9.79045, + "6": 9.50607, + "7": 9.51139, + "8": 8.85331, + "9": 8.66688, + "10": 8.95867, + "11": 8.29318, + "12": 8.36986, + "13": 8.25545, + "14": 7.73323, "15": 7.86639, - "16": 7.92442, - "17": 7.86278, - "18": 7.61012, - "19": 8.00269, - "20": 7.73019, - "21": 7.4165, - "22": 7.41478, - "23": 7.28671, - "24": 7.27903, - "25": 7.54456, - "26": 6.96542, - "27": 7.50538, - "28": 7.20607, - "29": 7.377, - "30": 7.52777, - "31": 7.27094, - "32": 7.4604, + "16": 7.92438, + "17": 7.86276, + "18": 7.61004, + "19": 8.00261, + "20": 7.73004, + "21": 7.41636, + "22": 7.41466, + "23": 7.28656, + "24": 7.27882, + "25": 7.54458, + "26": 6.96533, + "27": 7.5053, + "28": 7.20603, + "29": 7.37687, + "30": 7.52783, + "31": 7.27097, + "32": 7.46043, "33": 7.51419, - "34": 7.56867, - "35": 7.09252, - "36": 6.96015, - "37": 7.29846, - "38": 7.0742, - "39": 7.43347, - "40": 7.43116, - "41": 7.40919, + "34": 7.56879, + "35": 7.09276, + "36": 6.96019, + "37": 7.29843, + "38": 7.07417, + "39": 7.43338, + "40": 7.43134, + "41": 7.40946, "42": 7.15527, - "43": 7.15652, - "44": 7.30441, - "45": 7.1893, - "46": 6.77296, - "47": 7.45045, - "48": 7.02403, - "49": 7.45719, - "50": 6.92656 + "43": 7.15684, + "44": 7.30429, + "45": 7.18917, + "46": 6.77286, + "47": 7.44985, + "48": 7.02383, + "49": 7.4572, + "50": 6.92645 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 64.40054, - "2": 2.16564, - "3": 3.72378, - "4": 1.63174, - "5": 2.30947, - "6": 1.7246, - "7": 1.5089, - "8": 1.60943, - "9": 1.48606, - "10": 1.47162, - "11": 1.05608, - "12": 1.3309, - "13": 1.06824, - "14": 1.41914, - "15": 1.10033, - "16": 1.15759, - "17": 1.23897, - "18": 1.10439, - "19": 1.11869, - "20": 1.09363, - "21": 1.23622, - "22": 1.14797, - "23": 1.23037, - "24": 1.03991, - "25": 1.07795, - "26": 1.04416, - "27": 1.03654, - "28": 1.04098, - "29": 1.03502, - "30": 1.02909, - "31": 1.17935, - "32": 1.14717, - "33": 1.05403, - "34": 1.13894, - "35": 1.04538, - "36": 1.04367, - "37": 1.0843, - "38": 1.04631, - "39": 1.06131, - "40": 1.06988, - "41": 1.09756, - "42": 1.04759, - "43": 1.09649, - "44": 1.05666, - "45": 1.05249, - "46": 1.04539, - "47": 1.04041, - "48": 1.04904, - "49": 1.04777, - "50": 1.06237 + "1": 89.89187, + "2": 2.19484, + "3": 3.80506, + "4": 1.63188, + "5": 2.52939, + "6": 2.46374, + "7": 1.5097, + "8": 1.75664, + "9": 1.62191, + "10": 1.35808, + "11": 1.04295, + "12": 1.35317, + "13": 1.07545, + "14": 1.42301, + "15": 1.10347, + "16": 1.28287, + "17": 1.22104, + "18": 1.07676, + "19": 1.08763, + "20": 1.12221, + "21": 1.25145, + "22": 1.04596, + "23": 1.22539, + "24": 1.06194, + "25": 1.11205, + "26": 1.05389, + "27": 1.03357, + "28": 1.0291, + "29": 1.04027, + "30": 1.06631, + "31": 1.18617, + "32": 1.142, + "33": 1.03842, + "34": 1.12457, + "35": 1.04164, + "36": 1.04698, + "37": 1.07674, + "38": 1.03833, + "39": 1.03043, + "40": 1.02697, + "41": 1.11388, + "42": 1.04538, + "43": 1.03328, + "44": 1.04873, + "45": 1.03241, + "46": 1.03847, + "47": 1.04164, + "48": 1.04077, + "49": 1.03715, + "50": 1.02734 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json index d7372742ca7..0a6724a3e95 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07546, - "2": 11.03837, - "3": 9.66011, - "4": 9.91381, - "5": 9.32909, - "6": 9.13922, - "7": 9.13574, - "8": 8.65508, - "9": 8.51394, - "10": 8.8409, - "11": 8.29149, - "12": 8.34581, - "13": 8.25518, - "14": 7.73711, - "15": 7.86249, - "16": 7.9371, - "17": 7.89319, - "18": 7.63123, - "19": 7.99731, - "20": 7.74538, - "21": 7.44348, - "22": 7.42249, - "23": 7.29714, - "24": 7.27462, - "25": 7.54574, - "26": 6.96838, - "27": 7.50556, - "28": 7.22743, - "29": 7.36588, - "30": 7.52622, - "31": 7.27026, - "32": 7.45521, - "33": 7.50954, - "34": 7.55686, - "35": 7.10177, - "36": 6.96431, - "37": 7.28463, - "38": 7.0808, - "39": 7.40923, - "40": 7.43338, - "41": 7.38496, - "42": 7.15749, - "43": 7.15858, - "44": 7.28852, - "45": 7.16793, - "46": 6.78468, - "47": 7.4114, - "48": 7.0027, - "49": 7.46249, - "50": 6.92151 + "1": 11.07559, + "2": 11.03834, + "3": 9.66022, + "4": 9.91367, + "5": 9.3291, + "6": 9.13927, + "7": 9.13591, + "8": 8.65527, + "9": 8.51396, + "10": 8.84095, + "11": 8.29144, + "12": 8.34584, + "13": 8.25509, + "14": 7.73685, + "15": 7.86273, + "16": 7.93699, + "17": 7.89257, + "18": 7.63116, + "19": 7.99719, + "20": 7.7453, + "21": 7.44298, + "22": 7.42242, + "23": 7.29721, + "24": 7.27467, + "25": 7.54562, + "26": 6.96839, + "27": 7.50569, + "28": 7.22761, + "29": 7.36579, + "30": 7.52635, + "31": 7.27036, + "32": 7.45548, + "33": 7.50952, + "34": 7.55694, + "35": 7.10212, + "36": 6.96414, + "37": 7.28438, + "38": 7.08049, + "39": 7.40908, + "40": 7.4335, + "41": 7.38491, + "42": 7.15766, + "43": 7.15867, + "44": 7.28831, + "45": 7.16729, + "46": 6.78429, + "47": 7.40937, + "48": 7.00259, + "49": 7.46241, + "50": 6.92143 } }, "num-zeros": { @@ -63,54 +63,54 @@ "values": { "1": 911219392.0, "2": 910960384.0, - "3": 911156352.0, - "4": 912204800.0, - "5": 920796544.0, - "6": 940387968.0, - "7": 990599872.0, - "8": 976457728.0, - "9": 998097664.0, - "10": 995852672.0, - "11": 994583680.0, - "12": 977344896.0, - "13": 1028141824.0, - "14": 1007166208.0, - "15": 987423616.0, - "16": 993054784.0, - "17": 982319168.0, - "18": 998261760.0, - "19": 984696320.0, - "20": 982914752.0, - "21": 979667456.0, - "22": 953988864.0, - "23": 972353984.0, - "24": 964792064.0, - "25": 958512192.0, - "26": 946928512.0, + "3": 911156288.0, + "4": 913253376.0, + "5": 921845056.0, + "6": 941436672.0, + "7": 993745472.0, + "8": 974360512.0, + "9": 999146112.0, + "10": 992706944.0, + "11": 991438144.0, + "12": 979442048.0, + "13": 1029190272.0, + "14": 1008214656.0, + "15": 988472000.0, + "16": 988861120.0, + "17": 979173312.0, + "18": 996164608.0, + "19": 979453440.0, + "20": 982914688.0, + "21": 975473344.0, + "22": 955037568.0, + "23": 969208128.0, + "24": 965840832.0, + "25": 953269440.0, + "26": 949025536.0, "27": 948458304.0, - "28": 949643968.0, - "29": 942877440.0, + "28": 951741184.0, + "29": 943926272.0, "30": 935020160.0, - "31": 935327616.0, - "32": 934281088.0, - "33": 921805568.0, - "34": 928189312.0, - "35": 922202496.0, - "36": 924246656.0, - "37": 920661248.0, + "31": 933230336.0, + "32": 930086848.0, + "33": 922853952.0, + "34": 927140800.0, + "35": 925348224.0, + "36": 925295168.0, + "37": 922758272.0, "38": 922930752.0, - "39": 922322816.0, - "40": 921856512.0, - "41": 920227968.0, + "39": 922322880.0, + "40": 921856640.0, + "41": 920227776.0, "42": 918353664.0, - "43": 918607040.0, - "44": 914948032.0, - "45": 914295232.0, + "43": 919655616.0, + "44": 914948224.0, + "45": 916392512.0, "46": 914344448.0, "47": 911769536.0, - "48": 912013312.0, - "49": 910349440.0, - "50": 914351552.0 + "48": 912013248.0, + "49": 910349376.0, + "50": 914351616.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41739952128.0, - "2": 43687571456.0, - "3": 43687571456.0, - "4": 43983216640.0, - "5": 43983216640.0, - "6": 43983216640.0, - "7": 43983216640.0, - "8": 44024635392.0, - "9": 44041216000.0, - "10": 44041216000.0, - "11": 44041216000.0, - "12": 44041216000.0, - "13": 44041216000.0, - "14": 44041216000.0, - "15": 44041216000.0, - "16": 44041216000.0, - "17": 44041216000.0, - "18": 44041216000.0, - "19": 44041216000.0, - "20": 44041216000.0, - "21": 44041216000.0, - "22": 44041216000.0, - "23": 44041216000.0, - "24": 44041216000.0, - "25": 44041216000.0, - "26": 44041216000.0, - "27": 44041216000.0, - "28": 44041216000.0, - "29": 44041326592.0, - "30": 44162326528.0, - "31": 44220485632.0, - "32": 44270411776.0, - "33": 44293799936.0, - "34": 44293799936.0, - "35": 44293799936.0, - "36": 44293799936.0, - "37": 44293799936.0, - "38": 44293799936.0, - "39": 44293799936.0, - "40": 44293799936.0, - "41": 44293799936.0, - "42": 44293799936.0, - "43": 44293799936.0, - "44": 44293799936.0, - "45": 44293799936.0, - "46": 44293799936.0, - "47": 44293799936.0, - "48": 44293799936.0, - "49": 44293799936.0, - "50": 44293799936.0 + "1": 41740259328.0, + "2": 43687292928.0, + "3": 43687292928.0, + "4": 43984064512.0, + "5": 43984064512.0, + "6": 43984064512.0, + "7": 43984064512.0, + "8": 44026380288.0, + "9": 44041506816.0, + "10": 44041506816.0, + "11": 44041506816.0, + "12": 44041506816.0, + "13": 44041506816.0, + "14": 44041506816.0, + "15": 44041506816.0, + "16": 44041506816.0, + "17": 44041506816.0, + "18": 44041506816.0, + "19": 44041506816.0, + "20": 44041506816.0, + "21": 44041506816.0, + "22": 44041506816.0, + "23": 44041506816.0, + "24": 44041506816.0, + "25": 44041506816.0, + "26": 44041506816.0, + "27": 44041506816.0, + "28": 44041506816.0, + "29": 44044173312.0, + "30": 44164231168.0, + "31": 44221079552.0, + "32": 44271415296.0, + "33": 44290232320.0, + "34": 44290232320.0, + "35": 44290232320.0, + "36": 44290232320.0, + "37": 44290232320.0, + "38": 44290232320.0, + "39": 44290232320.0, + "40": 44290232320.0, + "41": 44290232320.0, + "42": 44290232320.0, + "43": 44290232320.0, + "44": 44290232320.0, + "45": 44290232320.0, + "46": 44290232320.0, + "47": 44290232320.0, + "48": 44290232320.0, + "49": 44290232320.0, + "50": 44290232320.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.08617, - "2": 11.10475, - "3": 10.48001, - "4": 10.13466, - "5": 9.79047, - "6": 9.50601, - "7": 9.5113, - "8": 8.85336, - "9": 8.66683, - "10": 8.95866, - "11": 8.29315, - "12": 8.36982, - "13": 8.25544, - "14": 7.73322, + "1": 11.08623, + "2": 11.1047, + "3": 10.47999, + "4": 10.13471, + "5": 9.79045, + "6": 9.50607, + "7": 9.51139, + "8": 8.85331, + "9": 8.66688, + "10": 8.95867, + "11": 8.29318, + "12": 8.36986, + "13": 8.25545, + "14": 7.73323, "15": 7.86639, - "16": 7.92442, - "17": 7.86278, - "18": 7.61012, - "19": 8.00269, - "20": 7.73019, - "21": 7.4165, - "22": 7.41478, - "23": 7.28671, - "24": 7.27903, - "25": 7.54456, - "26": 6.96542, - "27": 7.50538, - "28": 7.20607, - "29": 7.377, - "30": 7.52777, - "31": 7.27094, - "32": 7.4604, + "16": 7.92438, + "17": 7.86276, + "18": 7.61004, + "19": 8.00261, + "20": 7.73004, + "21": 7.41636, + "22": 7.41466, + "23": 7.28656, + "24": 7.27882, + "25": 7.54458, + "26": 6.96533, + "27": 7.5053, + "28": 7.20603, + "29": 7.37687, + "30": 7.52783, + "31": 7.27097, + "32": 7.46043, "33": 7.51419, - "34": 7.56867, - "35": 7.09252, - "36": 6.96015, - "37": 7.29846, - "38": 7.0742, - "39": 7.43347, - "40": 7.43116, - "41": 7.40919, + "34": 7.56879, + "35": 7.09276, + "36": 6.96019, + "37": 7.29843, + "38": 7.07417, + "39": 7.43338, + "40": 7.43134, + "41": 7.40946, "42": 7.15527, - "43": 7.15652, - "44": 7.30441, - "45": 7.1893, - "46": 6.77296, - "47": 7.45045, - "48": 7.02403, - "49": 7.45719, - "50": 6.92656 + "43": 7.15684, + "44": 7.30429, + "45": 7.18917, + "46": 6.77286, + "47": 7.44985, + "48": 7.02383, + "49": 7.4572, + "50": 6.92645 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 87.63934, - "2": 1.98402, - "3": 3.95877, - "4": 1.64812, - "5": 2.312, - "6": 2.02902, - "7": 1.56333, - "8": 1.66703, - "9": 1.6393, - "10": 1.40472, - "11": 1.086, - "12": 1.34921, - "13": 1.0854, - "14": 1.4242, - "15": 1.09539, - "16": 1.79766, - "17": 1.2562, - "18": 1.08887, - "19": 1.08371, - "20": 1.10071, - "21": 1.25979, - "22": 1.3212, - "23": 1.25044, - "24": 1.05384, - "25": 1.11356, - "26": 1.0605, - "27": 1.03418, - "28": 1.0405, - "29": 1.05174, - "30": 1.04166, - "31": 1.20036, - "32": 1.12936, - "33": 1.02917, - "34": 1.13473, - "35": 1.02829, - "36": 1.04352, - "37": 1.0843, - "38": 1.03714, - "39": 1.04534, - "40": 1.07031, - "41": 1.07618, - "42": 1.03008, - "43": 1.06043, - "44": 1.04049, - "45": 1.02875, - "46": 1.03669, - "47": 1.03128, - "48": 1.02808, - "49": 1.03038, - "50": 1.04621 + "1": 85.92313, + "2": 1.99152, + "3": 3.91366, + "4": 1.68454, + "5": 2.53883, + "6": 2.55539, + "7": 1.60104, + "8": 1.70562, + "9": 1.72325, + "10": 1.4332, + "11": 1.07958, + "12": 1.399, + "13": 1.10259, + "14": 1.43922, + "15": 1.12046, + "16": 1.33695, + "17": 1.24765, + "18": 1.11257, + "19": 1.10335, + "20": 1.12919, + "21": 1.27711, + "22": 1.09482, + "23": 1.27635, + "24": 1.112, + "25": 1.17791, + "26": 1.10426, + "27": 1.09103, + "28": 1.08338, + "29": 1.07904, + "30": 1.08709, + "31": 1.2237, + "32": 1.18059, + "33": 1.07913, + "34": 1.17232, + "35": 1.09059, + "36": 1.09648, + "37": 1.12683, + "38": 1.10153, + "39": 1.09557, + "40": 1.07747, + "41": 1.12905, + "42": 1.09275, + "43": 1.08609, + "44": 1.08042, + "45": 1.08321, + "46": 1.0732, + "47": 1.08666, + "48": 1.08865, + "49": 1.08808, + "50": 1.08086 } } } \ No newline at end of file diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 649da3ba518..53047ff4a3b 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -106,14 +106,13 @@ products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] - # TODO: The migration of custom fsdp causes EP + FSDP to be temporarily unavailable, which will be fixed in a subsequent MR. - # - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] - # products: - # - environment: [dev] - # scope: [mr] - # platforms: [dgx_h100] - # - environment: [lts] - # scope: [nightly] + - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - environment: [lts] + scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] diff --git a/tools/checkpoint/checkpoint_inspector.py b/tools/checkpoint/checkpoint_inspector.py index 34afa27755f..c62f0ca7417 100644 --- a/tools/checkpoint/checkpoint_inspector.py +++ b/tools/checkpoint/checkpoint_inspector.py @@ -8,6 +8,8 @@ import time import re import shutil +from typing import Optional +import tempfile import click import torch @@ -19,6 +21,7 @@ FileSystemReader, FileSystemWriter, ) +from torch.distributed.checkpoint.format_utils import dcp_to_torch_save from torch.distributed.checkpoint.metadata import ( BytesStorageMetadata, TensorStorageMetadata, @@ -64,7 +67,8 @@ def cli(): @cli.command() @click.argument("checkpoint_dir", type=click.Path(exists=True)) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -def inspect(checkpoint_dir, enable_msc): +@click.option("--not-ignore-param-to-group-meta", is_flag=True, help="Ignore parameter-to-group metadata.") +def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta): """Inspect a Megatron Core Distributed Checkpoint""" ckpt_path = Path(checkpoint_dir) @@ -138,6 +142,8 @@ def inspect(checkpoint_dir, enable_msc): ] click.echo(" | ".join(stats) + "\n") + ignore_param_to_group_meta = not not_ignore_param_to_group_meta + ignore_param_to_group_meta_count = 0 for key, value in metadata.state_dict_metadata.items(): bullet = click.style("►", fg="blue") key_styled = click.style(key, fg="green") @@ -147,11 +153,18 @@ def inspect(checkpoint_dir, enable_msc): shape = click.style(f"{tuple(value.size)}", fg="magenta") click.echo(f" {bullet} {key_styled} [{dtype}, shape={shape}]") elif isinstance(value, BytesStorageMetadata): + if ignore_param_to_group_meta and key.startswith("optimizer.param_to_group_meta."): + ignore_param_to_group_meta_count += 1 + continue click.echo(f" {bullet} {key_styled} {click.style('[BYTES]', fg='yellow')}") else: click.echo( f" {bullet} {key_styled} {click.style('[UNKNOWN TYPE]', fg='red')}" ) + if ignore_param_to_group_meta: + click.echo( + click.style(f"Ignored parameter-to-group metadata: {ignore_param_to_group_meta_count}", fg="yellow") + ) # MCore data section try: @@ -323,8 +336,10 @@ def convert_checkpoint( output_dir, swiglu, process_group, + optimizer_param_to_group_prefix="optimizer.param_to_group_meta.module.module.module", optimizer_state_prefix="optimizer.state.module.module.module", model_weight_prefix="model.module", + param_to_param_group_map={}, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to standard fsdp_dtensor format.""" device_mesh = DeviceMesh.from_group(process_group, device_type="cuda") @@ -371,6 +386,104 @@ def _free_up_some_gpu_memory(): gc.collect() torch.cuda.empty_cache() + def split_layers( + key: str, + value: torch.Tensor, + orig_shape: Optional[torch.Size] = None, + ) -> dict[str, torch.Tensor]: + """ + Split layers into separate tensors. + """ + _free_up_some_gpu_memory() + layers = {} + for i, v in enumerate(split_dtensor(value, 1, dim=0)): + v = gather_uneven_dtensor_to_full_tensor(v).reshape( + orig_shape[1:] if orig_shape else value.shape[1:] + ).redistribute(placements=[Shard(0)]) + + layer_key = key.replace(".layers.", f".layers.{i}.") + layers[layer_key] = v + + return layers + + def split_expert_weights( + key: str, + value: torch.Tensor, + orig_shape: Optional[torch.Size] = None, + ) -> dict[str, torch.Tensor]: + """ + Split expert weights into separate tensors for each expert. + """ + experts = {} + layer_key = key.replace(".experts.experts.", ".experts.") + expert_weights = split_dtensor(value, 1, dim=0) + for expert_idx, expert_weight in enumerate(expert_weights): + layer_key_parts = layer_key.split(".weight", 1) + if len(layer_key_parts) == 1: + expert_key = f"{layer_key}{expert_idx}" + elif len(layer_key_parts) == 2: + expert_key = f"{layer_key_parts[0]}.weight{expert_idx}{layer_key_parts[1]}" + else: + raise ValueError(f"Unexpected expert layer key: {layer_key}") + + expert_weight = gather_uneven_dtensor_to_full_tensor(expert_weight) + expert_shape = orig_shape[1:] if orig_shape else value.shape[1:] + # Handle optimizer states for expert linear_fc2 when ETP is enabled + if ( + layer_key.startswith("optimizer.state.") + and "linear_fc2" in layer_key + and expert_weight.shape[-2] > 1 + ): + tp_size = expert_weight.shape[-2] + rows, cols = expert_shape + # Reshape to split column dimension by tp_size + expert_weight = expert_weight.reshape( + *expert_weight.shape[:-1], rows, cols // tp_size + ) + dims = list(range(expert_weight.ndim)) + dims[-3], dims[-2] = dims[-2], dims[-3] + expert_weight = ( + expert_weight.permute(*dims) + .reshape(expert_shape) + .redistribute(placements=[Shard(0)]) + ) + else: + expert_weight = expert_weight.reshape(expert_shape).redistribute( + placements=[Shard(0)] + ) + experts[expert_key] = expert_weight + return experts + + def is_swiglu_key(key): + return any(re.search(pat, key) for pat in [ + r"(.*)\.mlp\.linear_fc1\.weight", + r"(.*)\.mlp\.linear_fc1\.bias", + r"(.*)\.mlp\.experts\.linear_fc1\.weight(\d+)", + r"(.*)\.mlp\.experts\.linear_fc1\.bias(\d+)", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.weight", + r"(.*)\.mlp\.experts\.local_experts\.(\d+)\.linear_fc1\.bias", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.weight", + r"(.*)\.mlp\.shared_experts\.linear_fc1\.bias", + ]) + + def split_swiglu_weight(key: str, value: torch.Tensor) -> dict[str, torch.Tensor]: + """ + Split SwiGLU weights into separate tensors. + """ + value = gather_uneven_dtensor_to_full_tensor(value) + swiglu_w_and_v = {} + w, v = torch.chunk(value, 2, dim=0) + w = w.redistribute(placements=[Shard(0)]) + v = v.redistribute(placements=[Shard(0)]) + w_key = re.sub(r'(weight\d*)(.*)', r'\1_w\2', key) + v_key = re.sub(r'(weight\d*)(.*)', r'\1_v\2', key) + swiglu_w_and_v[w_key] = w + swiglu_w_and_v[v_key] = v + return swiglu_w_and_v + + def has_layer_index(key: str) -> bool: + return bool(re.search(r"layers\.(\d+)\.", key)) + while state_dict: key, value = state_dict.popitem() if torch.distributed.get_rank() == 0: @@ -387,9 +500,11 @@ def _free_up_some_gpu_memory(): # Special handling for optimizer state key_list = key.split(".") new_key = f"{optimizer_state_prefix}.{'.'.join(key_list[3:])}.{key_list[2]}" + is_param = False else: # Special handling for module parameters new_key = f"{model_weight_prefix}.{key}" + is_param = True # Handle dist-opt flatten tensors if ( @@ -406,68 +521,47 @@ def _free_up_some_gpu_memory(): else: orig_shape = None - # Handle multi-layer tensors - if ".layers." in new_key: - n_layer = value.shape[0] - - _free_up_some_gpu_memory() - per_layer_values = [ - gather_uneven_dtensor_to_full_tensor(v).redistribute( - placements=[Shard(len(v.shape) - 1)] - ) - for v in split_dtensor(value, 1, dim=0) - ] - for i in range(n_layer): - if orig_shape is not None: - layer_shape = orig_shape[1:] - else: - layer_shape = value.shape[1:] - - per_layer_values[i] = ( - per_layer_values[i] - .reshape(layer_shape) - .redistribute(placements=[Shard(0)]) - ) - for i in range(0, n_layer): - layer_key = new_key.replace(".layers.", f".layers.{i}.") - if swiglu and "mlp.linear_fc1.weight" in layer_key: - # Special case for SwiGLU - w, v = torch.chunk(per_layer_values[i], 2, dim=0) - w = w.redistribute(placements=[Shard(0)]) - v = v.redistribute(placements=[Shard(0)]) - w_key = layer_key.replace( - "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_w" - ) - v_key = layer_key.replace( - "mlp.linear_fc1.weight", "mlp.linear_fc1.weight_v" - ) - # Store both w and v in the state_dict - fsdp_dtensor_state_dict[w_key] = w - fsdp_dtensor_state_dict[v_key] = v - elif ( - "experts.experts.linear_fc1.weight" in layer_key - or "experts.experts.linear_fc2.weight" in layer_key + # Handle multi-layer / experts tensors + split_tensors = {} + if ".layers." in new_key and not has_layer_index(new_key): + split_tensors = split_layers(new_key, value, orig_shape) + elif ".experts.experts." in new_key: + split_tensors = split_expert_weights(new_key, value, orig_shape) + else: + if orig_shape: + value = gather_uneven_dtensor_to_full_tensor(value) + # Handle optimizer states with partition_dim=1 when TP is enabled + if ( + new_key.startswith("optimizer.state.") + and value.ndim > 2 + and value.shape[-2] > 1 ): - # Special case for MoE - layer_key = layer_key.replace(".experts.experts.", ".experts.") - expert_weights = torch.split(per_layer_values[i], 1, dim=0) - for expert_idx, expert_weight in enumerate(expert_weights): - expert_key = f"{layer_key}{expert_idx}" - fsdp_dtensor_state_dict[expert_key] = expert_weight.squeeze( - 0 - ) + tp_size = value.shape[-2] + rows, cols = orig_shape + # Reshape to split column dimension by tp_size + value = value.reshape(*value.shape[:-1], rows, cols // tp_size) + dims = list(range(value.ndim)) + dims[-3], dims[-2] = dims[-2], dims[-3] + value = ( + value.permute(*dims) + .reshape(orig_shape) + .redistribute(placements=[Shard(0)]) + ) else: - # General case - fsdp_dtensor_state_dict[layer_key] = per_layer_values[i] - else: - if orig_shape is not None: - _free_up_some_gpu_memory() - value = ( - value.redistribute(placements=[Replicate()]) - .reshape(orig_shape) - .redistribute(placements=[Shard(0)]) - ) - fsdp_dtensor_state_dict[new_key] = value + value = value.reshape(orig_shape).redistribute(placements=[Shard(0)]) + split_tensors = {new_key: value} + + # Handle SWiGLU weights + for key, value in list(split_tensors.items()): + if swiglu and is_swiglu_key(key): + swiglu_w_and_v = split_swiglu_weight(key, value) + split_tensors.update(swiglu_w_and_v) + del split_tensors[key] + + fsdp_dtensor_state_dict.update(split_tensors) + if is_param and key in param_to_param_group_map: + for new_key in split_tensors.keys(): + param_to_param_group_map[new_key] = param_to_param_group_map[key] elif key.startswith("rng_state"): # Skip RNG states continue @@ -530,6 +624,15 @@ def _free_up_some_gpu_memory(): ) ) common_state = common_strategy.load_common(input_dir) + try: + if "param_groups" in common_state["optimizer"]: + ckpt_param_groups = common_state["optimizer"]["param_groups"] + else: + ckpt_param_groups = [] + for opt_state_dict in common_state["optimizer"].values(): + ckpt_param_groups.extend(opt_state_dict["optimizer"]["param_groups"]) + except: + ckpt_param_groups = None common_state = flatten(common_state) for key, value in common_state.items(): if key.startswith("optimizer.optimizer.param_groups."): @@ -541,12 +644,29 @@ def _free_up_some_gpu_memory(): ) fsdp_dtensor_state_dict[key] = value + # set up per-parameter param_groups + if param_to_param_group_map and ckpt_param_groups is not None: + for name in list(fsdp_dtensor_state_dict.keys()): + if not name.startswith(model_weight_prefix) or name.endswith(".expert_bias"): + continue + + assert name in param_to_param_group_map, f"Missing param group for {name}" + param_group_id = param_to_param_group_map[name] + assert param_group_id < len(ckpt_param_groups), f"Invalid param group id {param_group_id} for {name}" + name_without_prefix = name[len(model_weight_prefix):] + fsdp_dtensor_state_dict[ + f"{optimizer_param_to_group_prefix}.{name_without_prefix}" + ] = ckpt_param_groups[param_group_id] + if "checkpoint_version" not in fsdp_dtensor_state_dict: fsdp_dtensor_state_dict["checkpoint_version"] = 3.0 # Save modified checkpoint save_checkpoint_with_pickle_protocol(fsdp_dtensor_state_dict, output_dir) + dist.barrier() # Synchronize all ranks + dist.destroy_process_group() + @cli.command() @click.argument("input_dir", type=click.Path(exists=True)) @@ -560,12 +680,6 @@ def _free_up_some_gpu_memory(): "--oom-traceback", is_flag=True, help="Enable OOM traceback for debugging." ) @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") -@click.option( - "--distributed-timeout-minutes", - default=10, - type=int, - help="Timeout for distributed operations in minutes.", -) @click.option( "--output-optimizer-state-prefix", default="optimizer.state.module.module.module", @@ -576,15 +690,21 @@ def _free_up_some_gpu_memory(): default="model.module", help="Prefix for model weight keys in the checkpoint.", ) +@click.option( + "--param-to-param-group-map-json", + type=str, + default="{}", + help="JSON string representing the param to parameter group map." +) def convert_torch_dist_to_fsdp_dtensor( input_dir, output_dir, swiglu, oom_traceback, enable_msc, - distributed_timeout_minutes, output_optimizer_state_prefix, output_model_weight_prefix, + param_to_param_group_map_json, ): """Convert a Megatron Core Distributed Checkpoint from torch_dist to fsdp_dtensor format.""" if not enable_msc: @@ -624,10 +744,13 @@ def oom_observer(device, alloc, device_alloc, device_free): ckpt_path = Path(input_dir) output_dir = Path(output_dir) + with open(param_to_param_group_map_json, "r") as f: + param_to_param_group_map = json.load(f) convert_checkpoint( ckpt_path, output_dir, swiglu, process_group=dist.group.WORLD, optimizer_state_prefix=output_optimizer_state_prefix, model_weight_prefix=output_model_weight_prefix, + param_to_param_group_map=param_to_param_group_map, ) click.echo( @@ -742,6 +865,109 @@ def modify_state_dict(input_dir, output_dir, op, enable_msc): ) +def _compare_two_checkpoint(checkpoint_1, checkpoint_2): + reader_1 = FileSystemReader(checkpoint_1) + metadata_1 = reader_1.read_metadata() + + reader_2 = FileSystemReader(checkpoint_2) + metadata_2 = reader_2.read_metadata() + + keys_1 = set(metadata_1.state_dict_metadata.keys()) + keys_2 = set(metadata_2.state_dict_metadata.keys()) + + click.echo(click.style("Comparing checkpoints...", fg="blue")) + + # Compare keys + missing_in_1 = keys_2 - keys_1 + missing_in_2 = keys_1 - keys_2 + common_keys = keys_1 & keys_2 + + click.echo(click.style("Keys missing in checkpoint 1:", fg="red")) + for key in missing_in_1: + click.echo(click.style(f" - {key}", fg="red")) + + click.echo(click.style("Keys missing in checkpoint 2:", fg="red")) + for key in missing_in_2: + click.echo(click.style(f" - {key}", fg="red")) + + # Compare common keys + click.echo(click.style("Common keys in both checkpoints:", fg="green")) + for key in common_keys: + meta_1 = metadata_1.state_dict_metadata[key] + meta_2 = metadata_2.state_dict_metadata[key] + + if not isinstance(meta_1, TensorStorageMetadata): + continue + + if meta_1.size != meta_2.size or meta_1.properties.dtype != meta_2.properties.dtype: + click.echo(click.style(f" - {key} (metadata differ) meta_1: {meta_1}, meta_2: {meta_2}", fg="red")) + else: + value_1 = torch.empty(meta_1.size, dtype=meta_1.properties.dtype) + value_2 = value_1.clone() + + dcp.load({key: value_1}, storage_reader=reader_1, planner=DefaultLoadPlanner()) + dcp.load({key: value_2}, storage_reader=reader_2, planner=DefaultLoadPlanner()) + + if not torch.allclose( + value_1, value_2, atol=1e-8, rtol=1e-5 + ): + click.echo(click.style(f" - {key} (values differ) value_1: {value_1}, value_2: {value_2}", fg="red")) + + +@cli.command() +@click.argument("checkpoint_1", type=click.Path(exists=True)) +@click.argument("checkpoint_2", type=click.Path(exists=True)) +@click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.") +def compare_two_checkpoint(checkpoint_1, checkpoint_2, enable_msc): + """ + Compare two checkpoints. + """ + init_process_group(f"compare_two_checkpoint from {checkpoint_1} to {checkpoint_2}") + + if not enable_msc: + MultiStorageClientFeature.disable() + + _compare_two_checkpoint( + Path(checkpoint_1), + Path(checkpoint_2), + ) + + click.echo( + click.style( + f"Comparison between {checkpoint_1} and {checkpoint_2} completed.", fg="green", bold=True + ) + ) + + +@cli.command() +@click.argument("torch_dcp_dir", type=click.Path(exists=True)) +def print_torch_dcp_in_json(torch_dcp_dir, model_weight_prefix="model.module"): + # Use a temporary file context + with tempfile.NamedTemporaryFile(suffix=".pth") as tmp_file: + # Convert distributed checkpoint directory to a single-file checkpoint + dcp_to_torch_save(torch_dcp_dir, tmp_file.name) + + # Load the state dict from the temporary file + state_dict = torch.load(tmp_file.name, map_location="cpu") + + click.echo(f"torch dcp content: {json.dumps(state_dict)}") + + # Replace all "module.module." with model_weight_prefix in dict keys + new_state_dict = {} + for key, value in state_dict.items(): + new_key = key.replace("module.module", model_weight_prefix) + new_state_dict[new_key] = value + + # Convert state dict to JSON-serializable format + serializable_dict = {k: v.tolist() if hasattr(v, "tolist") else v for k, v in new_state_dict.items()} + + # Save to a JSON file + json_file_path = os.path.join(torch_dcp_dir, "param_to_param_group_map.json") + with open(json_file_path, "w") as json_file: + json.dump(serializable_dict, json_file, indent=2) + click.echo(f"Saved converted param_to_param_group_map to: {json_file_path}") + + def init_process_group(message): rank = int(os.getenv("RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) From 2c854484431191e661242eb27185492f3760dfb6 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 30 Oct 2025 23:30:40 -0500 Subject: [PATCH 088/248] Update golden values due to PR #2007 (#2057) Signed-off-by: Charlie Truong --- .../golden_values_dev_dgxh100_eos.json | 598 +++++++++--------- .../golden_values_dev_dgxh100_eos.json | 500 +++++++-------- .../golden_values_dev_dgxh100_eos.json | 537 ++++++++++++++++ .../golden_values_dev_dgxh100_eos.json | 380 +++++------ 4 files changed, 1276 insertions(+), 739 deletions(-) create mode 100644 tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json index b3668b31178..01651f27b62 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04748, - "2": 11.03561, - "3": 9.58773, - "4": 9.25819, - "5": 9.52742, - "6": 9.87911, - "7": 9.48366, - "8": 8.93879, - "9": 8.6551, - "10": 9.10915, - "11": 8.51806, - "12": 8.54732, - "13": 8.48144, - "14": 8.05312, - "15": 8.10118, - "16": 8.10344, - "17": 8.08878, - "18": 7.78589, - "19": 8.15794, - "20": 7.88069, - "21": 7.58542, - "22": 7.54895, - "23": 7.4296, - "24": 7.41901, - "25": 7.67277, - "26": 7.07835, - "27": 7.61157, - "28": 7.31513, - "29": 7.49487, - "30": 7.64287, - "31": 7.39102, - "32": 7.59148, - "33": 7.6393, - "34": 7.70086, - "35": 7.2119, - "36": 7.08623, - "37": 7.43064, - "38": 7.18999, - "39": 7.5525, - "40": 7.54961, - "41": 7.49385, - "42": 7.25481, - "43": 7.24066, - "44": 7.42131, - "45": 7.19201, - "46": 6.90547, - "47": 7.30704, - "48": 7.15325, - "49": 7.60504, - "50": 7.04512 + "1": 11.04722, + "2": 11.03572, + "3": 9.58802, + "4": 9.25807, + "5": 9.46595, + "6": 9.99646, + "7": 9.50952, + "8": 8.97596, + "9": 8.64768, + "10": 9.40103, + "11": 8.86557, + "12": 8.63562, + "13": 8.52126, + "14": 8.08764, + "15": 8.19553, + "16": 8.22117, + "17": 8.14088, + "18": 7.83923, + "19": 8.23508, + "20": 7.95432, + "21": 7.62712, + "22": 7.60353, + "23": 7.48451, + "24": 7.46602, + "25": 7.70409, + "26": 7.10906, + "27": 7.6443, + "28": 7.34234, + "29": 7.5189, + "30": 7.67585, + "31": 7.41996, + "32": 7.61477, + "33": 7.66691, + "34": 7.73349, + "35": 7.23566, + "36": 7.11008, + "37": 7.44958, + "38": 7.21125, + "39": 7.57837, + "40": 7.56809, + "41": 7.51465, + "42": 7.27318, + "43": 7.25818, + "44": 7.44014, + "45": 7.21234, + "46": 6.92392, + "47": 7.32631, + "48": 7.17263, + "49": 7.62149, + "50": 7.06495 } }, "num-zeros": { @@ -62,55 +62,55 @@ "step_interval": 1, "values": { "1": 38802612.0, - "2": 38543592.0, - "3": 38739480.0, - "4": 279954336.0, - "5": 249745312.0, - "6": 268288496.0, - "7": 604756224.0, - "8": 781485184.0, - "9": 636362112.0, - "10": 653025216.0, - "11": 668551168.0, - "12": 765583616.0, - "13": 815362944.0, - "14": 834270656.0, - "15": 755756096.0, - "16": 995153536.0, - "17": 938291584.0, - "18": 721524928.0, - "19": 756173504.0, - "20": 901129600.0, - "21": 721816384.0, - "22": 831311872.0, - "23": 803536768.0, - "24": 628253248.0, - "25": 663895680.0, - "26": 847321664.0, - "27": 828927424.0, - "28": 777678976.0, - "29": 764628608.0, - "30": 781930112.0, - "31": 771767616.0, - "32": 771755392.0, - "33": 586323648.0, - "34": 734207552.0, - "35": 690468480.0, - "36": 485982688.0, - "37": 506506336.0, - "38": 642964160.0, - "39": 661240000.0, - "40": 645048768.0, - "41": 636072704.0, - "42": 491645856.0, - "43": 601942528.0, - "44": 623448960.0, - "45": 539959424.0, - "46": 532669088.0, - "47": 529039680.0, - "48": 504121984.0, - "49": 478344480.0, - "50": 331385728.0 + "2": 38543656.0, + "3": 38739356.0, + "4": 273649600.0, + "5": 252887040.0, + "6": 255692384.0, + "7": 598483264.0, + "8": 787737984.0, + "9": 696133120.0, + "10": 505146400.0, + "11": 715718272.0, + "12": 872566848.0, + "13": 947497344.0, + "14": 1076390912.0, + "15": 853234624.0, + "16": 1045488064.0, + "17": 831385088.0, + "18": 969961792.0, + "19": 973165952.0, + "20": 951461376.0, + "21": 901033280.0, + "22": 897373440.0, + "23": 901066560.0, + "24": 710038592.0, + "25": 912381952.0, + "26": 866199936.0, + "27": 876109696.0, + "28": 912952192.0, + "29": 972247104.0, + "30": 951806720.0, + "31": 960493312.0, + "32": 910169408.0, + "33": 853655744.0, + "34": 834879424.0, + "35": 835171520.0, + "36": 797371392.0, + "37": 777009408.0, + "38": 598948480.0, + "39": 664393152.0, + "40": 767727104.0, + "41": 771335168.0, + "42": 752681344.0, + "43": 715187840.0, + "44": 714677440.0, + "45": 687806016.0, + "46": 501256736.0, + "47": 629706368.0, + "48": 651967104.0, + "49": 629336832.0, + "50": 589310016.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6637267456.0, - "2": 6637269504.0, - "3": 6637269504.0, - "4": 6637269504.0, - "5": 6637269504.0, - "6": 6637269504.0, - "7": 6637269504.0, - "8": 6637269504.0, - "9": 6637269504.0, - "10": 6637269504.0, - "11": 6637269504.0, - "12": 6637269504.0, - "13": 6637269504.0, - "14": 6637269504.0, - "15": 6637269504.0, - "16": 6637269504.0, - "17": 6637269504.0, - "18": 6637269504.0, - "19": 6637269504.0, - "20": 6637269504.0, - "21": 6637269504.0, - "22": 6637269504.0, - "23": 6637269504.0, - "24": 6637269504.0, - "25": 6637269504.0, - "26": 6637269504.0, - "27": 6637269504.0, - "28": 6637269504.0, - "29": 6637269504.0, - "30": 6637269504.0, - "31": 6637269504.0, - "32": 6637269504.0, - "33": 6637269504.0, - "34": 6637269504.0, - "35": 6637269504.0, - "36": 6637269504.0, - "37": 6637269504.0, - "38": 6637269504.0, - "39": 6637269504.0, - "40": 6637269504.0, - "41": 6637269504.0, - "42": 6637269504.0, - "43": 6637269504.0, - "44": 6637269504.0, - "45": 6637269504.0, - "46": 6637269504.0, - "47": 6637269504.0, - "48": 6637269504.0, - "49": 6637269504.0, - "50": 6637269504.0 + "1": 6637272576.0, + "2": 6637274624.0, + "3": 6637274624.0, + "4": 6637274624.0, + "5": 6637274624.0, + "6": 6637274624.0, + "7": 6637274624.0, + "8": 6637274624.0, + "9": 6637274624.0, + "10": 6637274624.0, + "11": 6637274624.0, + "12": 6637274624.0, + "13": 6637274624.0, + "14": 6637274624.0, + "15": 6637274624.0, + "16": 6637274624.0, + "17": 6637274624.0, + "18": 6637274624.0, + "19": 6637274624.0, + "20": 6637274624.0, + "21": 6637274624.0, + "22": 6637274624.0, + "23": 6637274624.0, + "24": 6637274624.0, + "25": 6637274624.0, + "26": 6637274624.0, + "27": 6637274624.0, + "28": 6637274624.0, + "29": 6637274624.0, + "30": 6637274624.0, + "31": 6637274624.0, + "32": 6637274624.0, + "33": 6637274624.0, + "34": 6637274624.0, + "35": 6637274624.0, + "36": 6637274624.0, + "37": 6637274624.0, + "38": 6637274624.0, + "39": 6637274624.0, + "40": 6637274624.0, + "41": 6637274624.0, + "42": 6637274624.0, + "43": 6637274624.0, + "44": 6637274624.0, + "45": 6637274624.0, + "46": 6637274624.0, + "47": 6637274624.0, + "48": 6637274624.0, + "49": 6637274624.0, + "50": 6637274624.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55055331328.0, - "2": 57809321984.0, - "3": 57919823872.0, - "4": 57919823872.0, - "5": 57919823872.0, - "6": 57919823872.0, - "7": 57919823872.0, - "8": 57919823872.0, - "9": 57919823872.0, - "10": 57919823872.0, - "11": 57919823872.0, - "12": 57919823872.0, - "13": 57932275712.0, - "14": 57932275712.0, - "15": 57932275712.0, - "16": 57932275712.0, - "17": 57932275712.0, - "18": 57932275712.0, - "19": 57932275712.0, - "20": 57932275712.0, - "21": 57932275712.0, - "22": 57932275712.0, - "23": 57932275712.0, - "24": 57932275712.0, - "25": 57932275712.0, - "26": 57932275712.0, - "27": 57932275712.0, - "28": 57932275712.0, - "29": 57932275712.0, - "30": 57932275712.0, - "31": 57932275712.0, - "32": 57932275712.0, - "33": 57932275712.0, - "34": 57932275712.0, - "35": 57932275712.0, - "36": 57932275712.0, - "37": 57932275712.0, - "38": 57932275712.0, - "39": 57932275712.0, - "40": 57932275712.0, - "41": 57932275712.0, - "42": 57932275712.0, - "43": 57932275712.0, - "44": 57932275712.0, - "45": 57932275712.0, - "46": 57932275712.0, - "47": 57932275712.0, - "48": 57932275712.0, - "49": 57932275712.0, - "50": 57932275712.0 + "1": 55056003072.0, + "2": 57810763776.0, + "3": 57920647168.0, + "4": 57920647168.0, + "5": 57920647168.0, + "6": 57920647168.0, + "7": 57920647168.0, + "8": 57920647168.0, + "9": 57920647168.0, + "10": 57920647168.0, + "11": 57920647168.0, + "12": 57920647168.0, + "13": 57920647168.0, + "14": 57920647168.0, + "15": 57920647168.0, + "16": 57920647168.0, + "17": 57920647168.0, + "18": 57920647168.0, + "19": 57920647168.0, + "20": 57920647168.0, + "21": 57920647168.0, + "22": 57920647168.0, + "23": 57920647168.0, + "24": 57920647168.0, + "25": 57920647168.0, + "26": 57920647168.0, + "27": 57920647168.0, + "28": 57920647168.0, + "29": 57920647168.0, + "30": 57920647168.0, + "31": 57920647168.0, + "32": 57920647168.0, + "33": 57920647168.0, + "34": 57920647168.0, + "35": 57920647168.0, + "36": 57920647168.0, + "37": 57920647168.0, + "38": 57920647168.0, + "39": 57920647168.0, + "40": 57920647168.0, + "41": 57920647168.0, + "42": 57920647168.0, + "43": 57920647168.0, + "44": 57920647168.0, + "45": 57920647168.0, + "46": 57921617920.0, + "47": 57921617920.0, + "48": 57921617920.0, + "49": 57921617920.0, + "50": 57921617920.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07654, - "2": 11.07406, - "3": 10.53883, - "4": 10.09801, - "5": 9.81156, - "6": 10.06025, - "7": 9.7962, - "8": 9.06987, - "9": 8.86879, - "10": 9.13393, - "11": 8.5017, - "12": 8.54094, - "13": 8.43678, - "14": 7.85637, - "15": 7.99846, - "16": 8.05889, - "17": 8.01134, - "18": 7.73929, - "19": 8.1188, - "20": 7.83458, - "21": 7.53103, - "22": 7.50125, - "23": 7.37135, - "24": 7.37419, - "25": 7.61596, - "26": 7.01586, - "27": 7.55739, - "28": 7.26274, - "29": 7.43991, - "30": 7.58436, - "31": 7.32289, - "32": 7.50362, - "33": 7.56884, - "34": 7.6339, - "35": 7.151, - "36": 7.01725, - "37": 7.35013, - "38": 7.12483, - "39": 7.48708, - "40": 7.47451, - "41": 7.4181, - "42": 7.17557, - "43": 7.15957, - "44": 7.34227, - "45": 7.12176, - "46": 6.82526, - "47": 7.23374, - "48": 7.07893, - "49": 7.5077, - "50": 6.97094 + "1": 11.07648, + "2": 11.07404, + "3": 10.53854, + "4": 10.09813, + "5": 9.81166, + "6": 10.09741, + "7": 9.79481, + "8": 9.0642, + "9": 8.86016, + "10": 9.34039, + "11": 8.51318, + "12": 8.59468, + "13": 8.52921, + "14": 7.95758, + "15": 8.06962, + "16": 8.11803, + "17": 8.06994, + "18": 7.80584, + "19": 8.19191, + "20": 7.89063, + "21": 7.5707, + "22": 7.55089, + "23": 7.41603, + "24": 7.42509, + "25": 7.65319, + "26": 7.05604, + "27": 7.59797, + "28": 7.29977, + "29": 7.47274, + "30": 7.61938, + "31": 7.35308, + "32": 7.53089, + "33": 7.59296, + "34": 7.66429, + "35": 7.17544, + "36": 7.04045, + "37": 7.37008, + "38": 7.14419, + "39": 7.51022, + "40": 7.48928, + "41": 7.43717, + "42": 7.19432, + "43": 7.17612, + "44": 7.35764, + "45": 7.13893, + "46": 6.84092, + "47": 7.25121, + "48": 7.09497, + "49": 7.52321, + "50": 6.98958 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 57.80279, - "2": 1.26321, - "3": 1.18918, - "4": 2.24643, - "5": 2.25191, - "6": 1.80757, - "7": 2.09086, - "8": 1.69153, - "9": 1.81279, - "10": 1.64882, - "11": 1.03476, - "12": 1.03593, - "13": 1.04348, - "14": 1.03841, - "15": 1.04432, - "16": 1.05281, - "17": 1.04826, - "18": 1.04981, - "19": 1.05351, - "20": 1.04668, - "21": 1.05254, - "22": 1.05391, - "23": 1.04635, - "24": 1.05503, - "25": 1.04226, - "26": 1.0684, - "27": 1.04985, - "28": 1.04233, - "29": 1.05036, - "30": 1.06219, - "31": 1.044, - "32": 1.05614, - "33": 1.05729, - "34": 1.05618, - "35": 1.06289, - "36": 1.05761, - "37": 1.05956, - "38": 1.06343, - "39": 1.06848, - "40": 1.06027, - "41": 1.05493, - "42": 1.05258, - "43": 1.04879, - "44": 1.04949, - "45": 1.05964, - "46": 1.04465, - "47": 1.0491, - "48": 1.05387, - "49": 1.05218, - "50": 1.05453 + "1": 85.33545, + "2": 1.29783, + "3": 1.20289, + "4": 2.24602, + "5": 2.32616, + "6": 1.7486, + "7": 2.17383, + "8": 1.65491, + "9": 1.70888, + "10": 1.05169, + "11": 1.03097, + "12": 1.02332, + "13": 1.0314, + "14": 1.03723, + "15": 1.02333, + "16": 1.04585, + "17": 1.05489, + "18": 1.05149, + "19": 1.04366, + "20": 1.04123, + "21": 1.04123, + "22": 1.05131, + "23": 1.04784, + "24": 1.05156, + "25": 1.05897, + "26": 1.05841, + "27": 1.03255, + "28": 1.03763, + "29": 1.0362, + "30": 1.04244, + "31": 1.03393, + "32": 1.04177, + "33": 1.06033, + "34": 1.06132, + "35": 1.06434, + "36": 1.05438, + "37": 1.64369, + "38": 1.06374, + "39": 1.07491, + "40": 1.07295, + "41": 1.06978, + "42": 1.06102, + "43": 1.05808, + "44": 1.06997, + "45": 1.06476, + "46": 1.06795, + "47": 1.06701, + "48": 1.06649, + "49": 1.06638, + "50": 1.06224 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json index daa04af43dd..dc2c39d712d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.95004, - "2": 10.9521, - "3": 10.5115, - "4": 9.96454, - "5": 9.93941, - "6": 9.67273, - "7": 10.20975, - "8": 9.49716, - "9": 9.55902, - "10": 9.79742, - "11": 9.30109, - "12": 9.40483, - "13": 9.39546, - "14": 8.84681, - "15": 9.02444, - "16": 9.07121, - "17": 9.04574, - "18": 8.75678, - "19": 9.18159, - "20": 8.8595, - "21": 8.53503, - "22": 8.55182, - "23": 8.42441, - "24": 8.37608, - "25": 8.64304, - "26": 7.97393, - "27": 8.56806, - "28": 8.19764, - "29": 8.3928, - "30": 8.67283, - "31": 8.289, - "32": 8.43572, - "33": 8.5568, - "34": 8.66018, - "35": 8.07934, - "36": 7.94976, - "37": 8.29565, - "38": 7.98044, - "39": 8.39201, - "40": 8.35513, - "41": 8.31876, - "42": 8.0583, - "43": 8.03283, - "44": 8.24243, - "45": 8.10277, - "46": 7.61696, - "47": 8.15273, - "48": 8.00569, - "49": 8.38688, - "50": 7.81491 + "1": 10.94971, + "2": 10.95174, + "3": 10.51547, + "4": 9.96574, + "5": 9.941, + "6": 9.67424, + "7": 10.20193, + "8": 9.50006, + "9": 9.54983, + "10": 9.79714, + "11": 9.30093, + "12": 9.40563, + "13": 9.39461, + "14": 8.84641, + "15": 9.02323, + "16": 9.07046, + "17": 9.04704, + "18": 8.75684, + "19": 9.18168, + "20": 8.86245, + "21": 8.53735, + "22": 8.55361, + "23": 8.42666, + "24": 8.37856, + "25": 8.64287, + "26": 7.9729, + "27": 8.56717, + "28": 8.19494, + "29": 8.39321, + "30": 8.67278, + "31": 8.2887, + "32": 8.43529, + "33": 8.5564, + "34": 8.65783, + "35": 8.07826, + "36": 7.94839, + "37": 8.29395, + "38": 7.9776, + "39": 8.39027, + "40": 8.35602, + "41": 8.31509, + "42": 8.06463, + "43": 8.03334, + "44": 8.24022, + "45": 8.10462, + "46": 7.61777, + "47": 8.15389, + "48": 8.0077, + "49": 8.38728, + "50": 7.81501 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403624.0, - "2": 19274194.0, - "3": 19372760.0, - "4": 86525248.0, - "5": 148575568.0, - "6": 145226704.0, - "7": 171879984.0, - "8": 195785248.0, - "9": 164124752.0, - "10": 167684736.0, - "11": 221077344.0, - "12": 200384224.0, - "13": 248872528.0, - "14": 211169424.0, - "15": 214304608.0, - "16": 216075632.0, - "17": 267845984.0, - "18": 170470336.0, - "19": 176865072.0, - "20": 187955392.0, - "21": 225750704.0, - "22": 247396816.0, - "23": 211643856.0, - "24": 205638464.0, - "25": 277022272.0, - "26": 291562304.0, - "27": 225789840.0, - "28": 288202368.0, - "29": 198390384.0, - "30": 213302208.0, - "31": 227204752.0, - "32": 271112416.0, - "33": 231840432.0, - "34": 203575536.0, - "35": 191152368.0, - "36": 222566928.0, - "37": 177810112.0, - "38": 228708544.0, - "39": 211168784.0, - "40": 215603968.0, - "41": 200089440.0, - "42": 228529888.0, - "43": 198782848.0, - "44": 141902272.0, - "45": 181922816.0, - "46": 115369856.0, - "47": 170214176.0, - "48": 137292832.0, - "49": 97654936.0, - "50": 160979632.0 + "1": 19403704.0, + "2": 19274202.0, + "3": 19372672.0, + "4": 84955472.0, + "5": 148573088.0, + "6": 140513744.0, + "7": 176606368.0, + "8": 198919440.0, + "9": 175143840.0, + "10": 164545552.0, + "11": 216370368.0, + "12": 201999712.0, + "13": 239390272.0, + "14": 230012880.0, + "15": 215921904.0, + "16": 211344080.0, + "17": 274153920.0, + "18": 173627616.0, + "19": 176950304.0, + "20": 194330304.0, + "21": 243134016.0, + "22": 234854608.0, + "23": 219609264.0, + "24": 205630080.0, + "25": 198436912.0, + "26": 293244384.0, + "27": 274552608.0, + "28": 277179296.0, + "29": 210959616.0, + "30": 233757584.0, + "31": 236548544.0, + "32": 264864608.0, + "33": 250754976.0, + "34": 258614240.0, + "35": 208476240.0, + "36": 241437056.0, + "37": 177817504.0, + "38": 227178000.0, + "39": 222169216.0, + "40": 214031296.0, + "41": 209523040.0, + "42": 212816672.0, + "43": 195600416.0, + "44": 154459088.0, + "45": 166289280.0, + "46": 116993536.0, + "47": 168587312.0, + "48": 162414240.0, + "49": 119666904.0, + "50": 171972272.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4882187264.0, - "2": 4881607168.0, - "3": 4882283008.0, - "4": 4881322496.0, - "5": 4882174464.0, - "6": 4883177984.0, - "7": 4883252736.0, - "8": 4881774080.0, - "9": 4881443328.0, - "10": 4884319744.0, - "11": 4882319872.0, - "12": 4881232384.0, - "13": 4880836096.0, - "14": 4882124288.0, - "15": 4882108928.0, - "16": 4883384832.0, - "17": 4880466432.0, - "18": 4881518080.0, - "19": 4881734144.0, - "20": 4883215872.0, - "21": 4883534336.0, - "22": 4882774528.0, - "23": 4881818112.0, - "24": 4882441728.0, - "25": 4880546304.0, - "26": 4882178560.0, - "27": 4881892864.0, - "28": 4881869312.0, - "29": 4882979328.0, - "30": 4882715136.0, - "31": 4883084800.0, - "32": 4881436160.0, - "33": 4881766912.0, - "34": 4881406464.0, - "35": 4881531392.0, - "36": 4881479168.0, - "37": 4882455040.0, - "38": 4882054656.0, - "39": 4882005504.0, - "40": 4882743808.0, - "41": 4881211904.0, - "42": 4881378816.0, - "43": 4882133504.0, - "44": 4881860096.0, - "45": 4883165696.0, - "46": 4882168320.0, - "47": 4881526272.0, - "48": 4882125312.0, - "49": 4881533440.0, - "50": 4881598976.0 + "1": 4880827392.0, + "2": 4880161280.0, + "3": 4879780352.0, + "4": 4881006080.0, + "5": 4881443328.0, + "6": 4880235008.0, + "7": 4878593536.0, + "8": 4880183808.0, + "9": 4878518784.0, + "10": 4880639488.0, + "11": 4878592512.0, + "12": 4879459840.0, + "13": 4879073792.0, + "14": 4881052160.0, + "15": 4878580224.0, + "16": 4878705152.0, + "17": 4880005632.0, + "18": 4880081408.0, + "19": 4879190528.0, + "20": 4879407616.0, + "21": 4878837248.0, + "22": 4878897664.0, + "23": 4878346752.0, + "24": 4880498176.0, + "25": 4880417280.0, + "26": 4878027264.0, + "27": 4878756352.0, + "28": 4880044544.0, + "29": 4879154688.0, + "30": 4879779328.0, + "31": 4881071616.0, + "32": 4879392256.0, + "33": 4879744512.0, + "34": 4878250496.0, + "35": 4878979584.0, + "36": 4880133632.0, + "37": 4880431616.0, + "38": 4878993920.0, + "39": 4878280192.0, + "40": 4879473152.0, + "41": 4880439808.0, + "42": 4879638016.0, + "43": 4879913472.0, + "44": 4879031808.0, + "45": 4879471104.0, + "46": 4878890496.0, + "47": 4879007232.0, + "48": 4879195648.0, + "49": 4879473152.0, + "50": 4878174720.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41210470400.0, - "2": 41210470400.0, - "3": 41210470400.0, - "4": 41210470400.0, - "5": 41210470400.0, - "6": 41210470400.0, - "7": 41210470400.0, - "8": 41210470400.0, - "9": 41210470400.0, - "10": 41210470400.0, - "11": 41210470400.0, - "12": 41210470400.0, - "13": 41210470400.0, - "14": 41210470400.0, - "15": 41210470400.0, - "16": 41210470400.0, - "17": 41210470400.0, - "18": 41210470400.0, - "19": 41210470400.0, - "20": 41210470400.0, - "21": 41210470400.0, - "22": 41210470400.0, - "23": 41210470400.0, - "24": 41210470400.0, - "25": 41210470400.0, - "26": 41210470400.0, - "27": 41210470400.0, - "28": 41210470400.0, - "29": 41210470400.0, - "30": 41210470400.0, - "31": 41210470400.0, - "32": 41210470400.0, - "33": 41210470400.0, - "34": 41210470400.0, - "35": 41210470400.0, - "36": 41210470400.0, - "37": 41210470400.0, - "38": 41210470400.0, - "39": 41210470400.0, - "40": 41210470400.0, - "41": 41210470400.0, - "42": 41210470400.0, - "43": 41210470400.0, - "44": 41210470400.0, - "45": 41210470400.0, - "46": 41210470400.0, - "47": 41210470400.0, - "48": 41210470400.0, - "49": 41210470400.0, - "50": 41210470400.0 + "1": 41208373248.0, + "2": 41208373248.0, + "3": 41208373248.0, + "4": 41208373248.0, + "5": 41208373248.0, + "6": 41208373248.0, + "7": 41208373248.0, + "8": 41208373248.0, + "9": 41208373248.0, + "10": 41208373248.0, + "11": 41208373248.0, + "12": 41208373248.0, + "13": 41208373248.0, + "14": 41208373248.0, + "15": 41208373248.0, + "16": 41208373248.0, + "17": 41208373248.0, + "18": 41208373248.0, + "19": 41208373248.0, + "20": 41208373248.0, + "21": 41208373248.0, + "22": 41208373248.0, + "23": 41208373248.0, + "24": 41208373248.0, + "25": 41208373248.0, + "26": 41208373248.0, + "27": 41208373248.0, + "28": 41208373248.0, + "29": 41208373248.0, + "30": 41208373248.0, + "31": 41208373248.0, + "32": 41208373248.0, + "33": 41208373248.0, + "34": 41208373248.0, + "35": 41208373248.0, + "36": 41208373248.0, + "37": 41208373248.0, + "38": 41208373248.0, + "39": 41208373248.0, + "40": 41208373248.0, + "41": 41208373248.0, + "42": 41208373248.0, + "43": 41208373248.0, + "44": 41208373248.0, + "45": 41208373248.0, + "46": 41208373248.0, + "47": 41208373248.0, + "48": 41208373248.0, + "49": 41208373248.0, + "50": 41208373248.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 96.21947, - "2": 1.10023, - "3": 0.96399, - "4": 0.91113, - "5": 1.27509, - "6": 1.00484, - "7": 1.01236, - "8": 1.1739, - "9": 0.89406, - "10": 0.88836, - "11": 0.92033, - "12": 0.88331, - "13": 0.88179, - "14": 0.88307, - "15": 0.88648, - "16": 0.88425, - "17": 0.87155, - "18": 0.87556, - "19": 0.87374, - "20": 0.8744, - "21": 0.86757, - "22": 0.87217, - "23": 0.8736, - "24": 0.86646, - "25": 0.87328, - "26": 0.87121, - "27": 0.85886, - "28": 0.86392, - "29": 0.86385, - "30": 0.86425, - "31": 0.8631, - "32": 0.8617, - "33": 0.86069, - "34": 0.86829, - "35": 0.86837, - "36": 0.86776, - "37": 0.86686, - "38": 0.86359, - "39": 0.8677, - "40": 0.86441, - "41": 0.86179, - "42": 0.86079, - "43": 0.86149, - "44": 0.86222, - "45": 0.86336, - "46": 0.85875, - "47": 0.86219, - "48": 0.86026, - "49": 0.85894, - "50": 0.8544 + "1": 94.76465, + "2": 1.07136, + "3": 0.97804, + "4": 0.91812, + "5": 1.39406, + "6": 1.11113, + "7": 1.05399, + "8": 1.07764, + "9": 0.8817, + "10": 0.88267, + "11": 0.97121, + "12": 0.87696, + "13": 0.87547, + "14": 0.87457, + "15": 0.87326, + "16": 0.87868, + "17": 0.86846, + "18": 0.86669, + "19": 0.86508, + "20": 0.86847, + "21": 0.86661, + "22": 0.85614, + "23": 0.8576, + "24": 0.86445, + "25": 0.86658, + "26": 0.86708, + "27": 0.86226, + "28": 0.85806, + "29": 0.86248, + "30": 0.85836, + "31": 0.85969, + "32": 0.85739, + "33": 0.86134, + "34": 0.8621, + "35": 0.86104, + "36": 0.85793, + "37": 0.85834, + "38": 0.85618, + "39": 0.85754, + "40": 0.8554, + "41": 0.85094, + "42": 0.85738, + "43": 0.85524, + "44": 0.85844, + "45": 0.85739, + "46": 0.85581, + "47": 0.85717, + "48": 0.85118, + "49": 0.85577, + "50": 0.85127 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_eos.json new file mode 100644 index 00000000000..fe8428055c3 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgxh100_eos.json @@ -0,0 +1,537 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.82922, + "2": 10.84163, + "3": 10.84245, + "4": 10.82, + "5": 10.85652, + "6": 10.86906, + "7": 10.83778, + "8": 10.84312, + "9": 10.84423, + "10": 10.79298, + "11": 10.86697, + "12": 10.86875, + "13": 10.86207, + "14": 10.86919, + "15": 10.8067, + "16": 10.8057, + "17": 10.77686, + "18": 10.79541, + "19": 10.78384, + "20": 10.72654, + "21": 10.69491, + "22": 10.54462, + "23": 10.6993, + "24": 10.58151, + "25": 10.53282, + "26": 10.58817, + "27": 10.601, + "28": 10.57563, + "29": 10.58022, + "30": 10.35802, + "31": 10.08769, + "32": 10.44466, + "33": 10.4477, + "34": 10.18704, + "35": 10.24483, + "36": 10.19713, + "37": 10.32294, + "38": 10.17101, + "39": 10.37026, + "40": 10.05533, + "41": 10.09491, + "42": 10.17971, + "43": 9.78263, + "44": 9.91346, + "45": 9.77951, + "46": 9.75648, + "47": 10.09647, + "48": 9.80391, + "49": 9.46649, + "50": 9.86874, + "51": 9.79428, + "52": 9.68303, + "53": 10.03314, + "54": 9.9113, + "55": 9.82995, + "56": 9.57839, + "57": 9.42377, + "58": 9.80549, + "59": 9.53292, + "60": 9.449, + "61": 9.65293, + "62": 9.95672, + "63": 9.33775, + "64": 9.74194, + "65": 8.89366, + "66": 9.67317, + "67": 9.33002, + "68": 9.76517, + "69": 9.76336, + "70": 9.71127, + "71": 9.59511, + "72": 9.54797, + "73": 9.47124, + "74": 8.89297, + "75": 9.39451, + "76": 9.04721, + "77": 10.04318, + "78": 9.70313, + "79": 9.35169, + "80": 9.38198, + "81": 9.45146, + "82": 9.67546, + "83": 9.27658, + "84": 9.39241, + "85": 9.58333, + "86": 9.04518, + "87": 9.56487, + "88": 9.72459, + "89": 9.57019, + "90": 9.79944, + "91": 9.30737, + "92": 9.3313, + "93": 9.04109, + "94": 8.80259, + "95": 9.50213, + "96": 9.5021, + "97": 9.28183, + "98": 9.64883, + "99": 8.8594, + "100": 9.37131 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 27245.0, + "2": 28958.0, + "3": 29464.0, + "4": 28046.0, + "5": 31369.0, + "6": 33287.0, + "7": 31200.0, + "8": 26921.0, + "9": 30008.0, + "10": 25870.0, + "11": 33681.0, + "12": 30344.0, + "13": 32737.0, + "14": 33315.0, + "15": 29830.0, + "16": 32475.0, + "17": 30747.0, + "18": 30381.0, + "19": 31032.0, + "20": 28243.0, + "21": 29224.0, + "22": 27340.0, + "23": 34119.0, + "24": 29049.0, + "25": 27636.0, + "26": 30662.0, + "27": 32009.0, + "28": 33355.0, + "29": 34714.0, + "30": 30387.0, + "31": 28212.0, + "32": 33411.0, + "33": 34696.0, + "34": 30053.0, + "35": 31488.0, + "36": 32943.0, + "37": 35829.0, + "38": 33740.0, + "39": 37632.0, + "40": 34779.0, + "41": 33958.0, + "42": 36396.0, + "43": 34088.0, + "44": 34090.0, + "45": 35158.0, + "46": 36174.0, + "47": 39772.0, + "48": 36516.0, + "49": 36733.0, + "50": 38234.0, + "51": 38608.0, + "52": 37030.0, + "53": 42442.0, + "54": 40944.0, + "55": 37133.0, + "56": 41001.0, + "57": 37524.0, + "58": 42317.0, + "59": 40804.0, + "60": 40450.0, + "61": 41478.0, + "62": 39766.0, + "63": 37941.0, + "64": 42197.0, + "65": 40947.0, + "66": 44094.0, + "67": 41958.0, + "68": 40060.0, + "69": 42189.0, + "70": 43436.0, + "71": 42748.0, + "72": 44280.0, + "73": 47478.0, + "74": 41456.0, + "75": 39925.0, + "76": 43490.0, + "77": 45636.0, + "78": 2141470.0, + "79": 46055.0, + "80": 51863.0, + "81": 151341.0, + "82": 49835.0, + "83": 143360.0, + "84": 2141546.0, + "85": 2145177.0, + "86": 132114.0, + "87": 2147022.0, + "88": 59899.0, + "89": 162883.0, + "90": 51330.0, + "91": 2141901.0, + "92": 44946.0, + "93": 138194.0, + "94": 2145772.0, + "95": 45247.0, + "96": 135045.0, + "97": 53170.0, + "98": 168576.0, + "99": 2141797.0, + "100": 163741.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 787516416.0, + "2": 787540992.0, + "3": 787524096.0, + "4": 787512320.0, + "5": 787547136.0, + "6": 787537920.0, + "7": 787512832.0, + "8": 787524608.0, + "9": 787528192.0, + "10": 787505152.0, + "11": 787522048.0, + "12": 787520000.0, + "13": 787529728.0, + "14": 787529216.0, + "15": 787504128.0, + "16": 787513344.0, + "17": 787503104.0, + "18": 787489280.0, + "19": 787514880.0, + "20": 787505152.0, + "21": 787479552.0, + "22": 787486208.0, + "23": 787478528.0, + "24": 787486208.0, + "25": 787451392.0, + "26": 787482112.0, + "27": 787470848.0, + "28": 787450368.0, + "29": 787458048.0, + "30": 787435008.0, + "31": 787406848.0, + "32": 787424256.0, + "33": 787435520.0, + "34": 787426304.0, + "35": 787418624.0, + "36": 787436544.0, + "37": 787428352.0, + "38": 787436544.0, + "39": 787417600.0, + "40": 787415040.0, + "41": 787405824.0, + "42": 787415040.0, + "43": 787367936.0, + "44": 787392512.0, + "45": 787399680.0, + "46": 787355136.0, + "47": 787411456.0, + "48": 787354112.0, + "49": 787374080.0, + "50": 787389440.0, + "51": 787375616.0, + "52": 787383808.0, + "53": 787379712.0, + "54": 787384832.0, + "55": 787388928.0, + "56": 787388928.0, + "57": 787351040.0, + "58": 787382784.0, + "59": 787374080.0, + "60": 787395072.0, + "61": 787405312.0, + "62": 787405824.0, + "63": 787373056.0, + "64": 787388928.0, + "65": 787351552.0, + "66": 787386880.0, + "67": 787392000.0, + "68": 787399168.0, + "69": 787383296.0, + "70": 787393024.0, + "71": 787406848.0, + "72": 787400704.0, + "73": 787401216.0, + "74": 787403264.0, + "75": 787442688.0, + "76": 787444736.0, + "77": 787445760.0, + "78": 787395072.0, + "79": 787430400.0, + "80": 787410432.0, + "81": 787412992.0, + "82": 787427840.0, + "83": 787428864.0, + "84": 787412480.0, + "85": 787412480.0, + "86": 787394560.0, + "87": 787452928.0, + "88": 787414528.0, + "89": 787404800.0, + "90": 787446784.0, + "91": 787446272.0, + "92": 787446784.0, + "93": 787430400.0, + "94": 787440128.0, + "95": 787450368.0, + "96": 787454976.0, + "97": 787427328.0, + "98": 787475968.0, + "99": 787419136.0, + "100": 787438592.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 2579673088.0, + "2": 2590714880.0, + "3": 2590714880.0, + "4": 2590714880.0, + "5": 2596039680.0, + "6": 2596039680.0, + "7": 2596039680.0, + "8": 2596039680.0, + "9": 2596039680.0, + "10": 2596039680.0, + "11": 2596039680.0, + "12": 2596039680.0, + "13": 2596039680.0, + "14": 2596039680.0, + "15": 2596039680.0, + "16": 2596039680.0, + "17": 2596039680.0, + "18": 2596039680.0, + "19": 2596039680.0, + "20": 2596039680.0, + "21": 2596039680.0, + "22": 2596039680.0, + "23": 2596039680.0, + "24": 2596039680.0, + "25": 2596039680.0, + "26": 2596039680.0, + "27": 2596039680.0, + "28": 2596039680.0, + "29": 2596039680.0, + "30": 2596039680.0, + "31": 2596039680.0, + "32": 2596039680.0, + "33": 2596039680.0, + "34": 2596039680.0, + "35": 2596039680.0, + "36": 2596039680.0, + "37": 2596039680.0, + "38": 2596039680.0, + "39": 2596039680.0, + "40": 2596039680.0, + "41": 2596039680.0, + "42": 2596039680.0, + "43": 2596039680.0, + "44": 2596039680.0, + "45": 2596039680.0, + "46": 2596039680.0, + "47": 2596039680.0, + "48": 2596039680.0, + "49": 2596039680.0, + "50": 2596039680.0, + "51": 2596039680.0, + "52": 2596039680.0, + "53": 2596039680.0, + "54": 2596039680.0, + "55": 2596039680.0, + "56": 2596039680.0, + "57": 2596039680.0, + "58": 2596039680.0, + "59": 2596039680.0, + "60": 2596039680.0, + "61": 2596039680.0, + "62": 2596039680.0, + "63": 2596039680.0, + "64": 2596039680.0, + "65": 2596039680.0, + "66": 2596039680.0, + "67": 2596039680.0, + "68": 2596039680.0, + "69": 2596039680.0, + "70": 2596039680.0, + "71": 2596039680.0, + "72": 2596039680.0, + "73": 2596039680.0, + "74": 2596039680.0, + "75": 2596039680.0, + "76": 2596039680.0, + "77": 2596039680.0, + "78": 2596039680.0, + "79": 2596039680.0, + "80": 2596039680.0, + "81": 2596039680.0, + "82": 2596039680.0, + "83": 2596039680.0, + "84": 2596039680.0, + "85": 2596039680.0, + "86": 2596039680.0, + "87": 2596039680.0, + "88": 2596039680.0, + "89": 2596039680.0, + "90": 2596039680.0, + "91": 2596039680.0, + "92": 2596039680.0, + "93": 2596039680.0, + "94": 2596039680.0, + "95": 2596039680.0, + "96": 2596039680.0, + "97": 2596039680.0, + "98": 2596039680.0, + "99": 2596039680.0, + "100": 2596039680.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 11.07685, + "2": 0.45645, + "3": 0.41285, + "4": 0.40148, + "5": 0.35405, + "6": 0.35535, + "7": 0.35437, + "8": 0.32989, + "9": 0.32686, + "10": 0.32734, + "11": 0.32243, + "12": 0.32634, + "13": 0.33475, + "14": 0.33636, + "15": 0.33838, + "16": 0.32741, + "17": 0.33364, + "18": 0.33147, + "19": 0.33328, + "20": 0.33281, + "21": 0.33587, + "22": 0.3271, + "23": 0.33537, + "24": 0.32125, + "25": 0.33225, + "26": 0.33085, + "27": 0.3387, + "28": 0.34305, + "29": 0.34938, + "30": 0.34814, + "31": 0.35223, + "32": 0.36489, + "33": 0.33408, + "34": 0.34688, + "35": 0.33945, + "36": 0.34851, + "37": 0.3471, + "38": 0.3338, + "39": 0.3395, + "40": 0.3414, + "41": 0.34662, + "42": 0.34093, + "43": 0.34012, + "44": 0.34423, + "45": 0.34205, + "46": 0.34681, + "47": 0.33694, + "48": 0.34136, + "49": 0.34255, + "50": 0.34412, + "51": 0.32987, + "52": 0.34834, + "53": 0.34028, + "54": 0.33718, + "55": 0.33563, + "56": 0.3372, + "57": 0.33927, + "58": 0.34337, + "59": 0.34056, + "60": 0.34048, + "61": 0.33816, + "62": 0.3357, + "63": 0.3365, + "64": 0.33906, + "65": 0.34134, + "66": 0.34125, + "67": 0.33859, + "68": 0.34726, + "69": 0.3385, + "70": 0.34428, + "71": 0.34339, + "72": 0.33789, + "73": 0.33975, + "74": 0.34759, + "75": 0.33612, + "76": 0.33913, + "77": 0.34664, + "78": 0.33673, + "79": 0.33903, + "80": 0.33519, + "81": 0.33434, + "82": 0.34003, + "83": 0.33784, + "84": 0.33367, + "85": 0.33382, + "86": 0.34029, + "87": 0.33537, + "88": 0.33703, + "89": 0.33416, + "90": 0.33113, + "91": 0.33369, + "92": 0.33443, + "93": 0.33841, + "94": 0.339, + "95": 0.33271, + "96": 0.33211, + "97": 0.33492, + "98": 0.33877, + "99": 0.33548, + "100": 0.33195 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json index 537e20b09d8..eca2cabacaf 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgxh100_eos.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04266, - "2": 11.02309, - "3": 9.43552, - "4": 10.04614, - "5": 9.38535, - "6": 9.14543, - "7": 9.21141, - "8": 8.63458, - "9": 8.48937, - "10": 8.82763, - "11": 8.29457, - "12": 8.3282, - "13": 8.23008, - "14": 7.71714, - "15": 7.86981, - "16": 7.92286, - "17": 7.8604, - "18": 7.62039, - "19": 7.98493, - "20": 7.72023, - "21": 7.39758, - "22": 7.39771, - "23": 7.28314, - "24": 7.25048, - "25": 7.53113, - "26": 6.95329, - "27": 7.49432, - "28": 7.20394, - "29": 7.37282, - "30": 7.50232, - "31": 7.25348, - "32": 7.4305, - "33": 7.48364, - "34": 7.53486, - "35": 7.10336, - "36": 6.94516, - "37": 7.26117, - "38": 7.07009, - "39": 7.40543, - "40": 7.42044, - "41": 7.34202, - "42": 7.11816, - "43": 7.11373, - "44": 7.27067, - "45": 7.07036, - "46": 6.77823, - "47": 7.1875, - "48": 6.99998, - "49": 7.45868, - "50": 6.90956 + "1": 11.04276, + "2": 11.02298, + "3": 9.43542, + "4": 10.04672, + "5": 9.38572, + "6": 9.14547, + "7": 9.21155, + "8": 8.63445, + "9": 8.48944, + "10": 8.82764, + "11": 8.29479, + "12": 8.32819, + "13": 8.23003, + "14": 7.71724, + "15": 7.86963, + "16": 7.9228, + "17": 7.86049, + "18": 7.62035, + "19": 7.9851, + "20": 7.72027, + "21": 7.39754, + "22": 7.39767, + "23": 7.28334, + "24": 7.25057, + "25": 7.53131, + "26": 6.95335, + "27": 7.49421, + "28": 7.20415, + "29": 7.373, + "30": 7.50279, + "31": 7.25342, + "32": 7.43069, + "33": 7.48385, + "34": 7.53476, + "35": 7.10325, + "36": 6.94471, + "37": 7.26141, + "38": 7.07026, + "39": 7.40536, + "40": 7.42025, + "41": 7.34194, + "42": 7.11724, + "43": 7.11421, + "44": 7.27077, + "45": 7.0701, + "46": 6.77811, + "47": 7.18895, + "48": 7.00013, + "49": 7.45875, + "50": 6.90988 } }, "num-zeros": { @@ -62,55 +62,55 @@ "step_interval": 1, "values": { "1": 844114112.0, - "2": 843855104.0, + "2": 843855296.0, "3": 844048640.0, - "4": 842998144.0, + "4": 842998208.0, "5": 855786112.0, - "6": 874329728.0, - "7": 925591552.0, - "8": 915644608.0, - "9": 935187584.0, - "10": 927702400.0, - "11": 957888256.0, - "12": 923872512.0, - "13": 969427072.0, + "6": 878524160.0, + "7": 924542976.0, + "8": 917741504.0, + "9": 932042112.0, + "10": 930847360.0, + "11": 954742400.0, + "12": 922824128.0, + "13": 968378816.0, "14": 965228416.0, - "15": 952825344.0, - "16": 943777088.0, - "17": 928845824.0, - "18": 925913856.0, - "19": 955339136.0, - "20": 989208256.0, - "21": 924095424.0, - "22": 908902272.0, - "23": 892664576.0, - "24": 900830400.0, - "25": 928105472.0, - "26": 877724352.0, - "27": 912808320.0, - "28": 904557696.0, - "29": 872625088.0, - "30": 864767104.0, - "31": 868220416.0, - "32": 861931136.0, - "33": 859941312.0, + "15": 951776640.0, + "16": 941679424.0, + "17": 929894336.0, + "18": 928011136.0, + "19": 955339264.0, + "20": 987111232.0, + "21": 924095488.0, + "22": 906805504.0, + "23": 895810432.0, + "24": 902927680.0, + "25": 927056960.0, + "26": 879821440.0, + "27": 911759744.0, + "28": 902460416.0, + "29": 872625216.0, + "30": 865815744.0, + "31": 868220352.0, + "32": 865076800.0, + "33": 864135552.0, "34": 855839104.0, - "35": 854046848.0, - "36": 852944896.0, - "37": 851456704.0, - "38": 849532096.0, + "35": 854046784.0, + "36": 855042176.0, + "37": 850408192.0, + "38": 850580480.0, "39": 849972608.0, "40": 849505792.0, - "41": 845780288.0, - "42": 846003328.0, - "43": 846257472.0, - "44": 852034880.0, - "45": 847187456.0, + "41": 845780352.0, + "42": 846003392.0, + "43": 848354688.0, + "44": 850986496.0, + "45": 848236160.0, "46": 855625856.0, - "47": 844661952.0, - "48": 851197248.0, + "47": 843613312.0, + "48": 851197312.0, "49": 851630464.0, - "50": 846195904.0 + "50": 846195968.0 } }, "mem-allocated-bytes": { @@ -176,55 +176,55 @@ "step_interval": 1, "values": { "1": 37959917568.0, - "2": 39578677248.0, - "3": 39580196864.0, - "4": 39580196864.0, - "5": 39583309824.0, - "6": 39583309824.0, - "7": 39583309824.0, - "8": 39583309824.0, - "9": 39583309824.0, - "10": 39583309824.0, - "11": 39583309824.0, - "12": 39583309824.0, - "13": 39583309824.0, - "14": 39583309824.0, - "15": 39583309824.0, - "16": 39583309824.0, - "17": 39583309824.0, - "18": 39583309824.0, - "19": 39583309824.0, - "20": 39583309824.0, - "21": 39583309824.0, - "22": 39583309824.0, - "23": 39583309824.0, - "24": 39583309824.0, - "25": 39583309824.0, - "26": 39583309824.0, - "27": 39583309824.0, - "28": 39583309824.0, - "29": 39583309824.0, - "30": 39583309824.0, - "31": 39583309824.0, - "32": 39583309824.0, - "33": 39583309824.0, - "34": 39583309824.0, - "35": 39583309824.0, - "36": 39583309824.0, - "37": 39583309824.0, - "38": 39583309824.0, - "39": 39583309824.0, - "40": 39583309824.0, - "41": 39583309824.0, - "42": 39583309824.0, - "43": 39583309824.0, - "44": 39583309824.0, - "45": 39583309824.0, - "46": 39583309824.0, - "47": 39583309824.0, - "48": 39583309824.0, - "49": 39583309824.0, - "50": 39583309824.0 + "2": 39578673152.0, + "3": 39580192768.0, + "4": 39580192768.0, + "5": 39583301632.0, + "6": 39583301632.0, + "7": 39583301632.0, + "8": 39583301632.0, + "9": 39583301632.0, + "10": 39583301632.0, + "11": 39583301632.0, + "12": 39583301632.0, + "13": 39583301632.0, + "14": 39583301632.0, + "15": 39583301632.0, + "16": 39583301632.0, + "17": 39583301632.0, + "18": 39583301632.0, + "19": 39583301632.0, + "20": 39583301632.0, + "21": 39583301632.0, + "22": 39583301632.0, + "23": 39583301632.0, + "24": 39583301632.0, + "25": 39583301632.0, + "26": 39583301632.0, + "27": 39583301632.0, + "28": 39583301632.0, + "29": 39583301632.0, + "30": 39583301632.0, + "31": 39583301632.0, + "32": 39583301632.0, + "33": 39583301632.0, + "34": 39583301632.0, + "35": 39583301632.0, + "36": 39583301632.0, + "37": 39583301632.0, + "38": 39583301632.0, + "39": 39583301632.0, + "40": 39583301632.0, + "41": 39583301632.0, + "42": 39583301632.0, + "43": 39583301632.0, + "44": 39583301632.0, + "45": 39583301632.0, + "46": 39583301632.0, + "47": 39583301632.0, + "48": 39583301632.0, + "49": 39583301632.0, + "50": 39583301632.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 67.13422, - "2": 1.95457, - "3": 3.25371, - "4": 2.66673, - "5": 3.05794, - "6": 1.35128, - "7": 1.66174, - "8": 2.19011, - "9": 1.16207, - "10": 1.16456, - "11": 1.26279, - "12": 1.60263, - "13": 1.29219, - "14": 2.93489, - "15": 1.48729, - "16": 1.15146, - "17": 1.27648, - "18": 1.39906, - "19": 1.13846, - "20": 1.14415, - "21": 1.27567, - "22": 1.26287, - "23": 1.11223, - "24": 1.10986, - "25": 1.20096, - "26": 1.13382, - "27": 1.11305, - "28": 1.11424, - "29": 1.22341, - "30": 1.08856, - "31": 1.15539, - "32": 1.10684, - "33": 1.11399, - "34": 1.09048, - "35": 1.1509, - "36": 1.09151, - "37": 1.13904, - "38": 1.06658, - "39": 1.1325, - "40": 1.14715, - "41": 1.07533, - "42": 1.08243, - "43": 1.13881, - "44": 1.14004, - "45": 1.06323, - "46": 1.06103, - "47": 1.11785, - "48": 1.04242, - "49": 1.13933, - "50": 1.0407 + "1": 89.14162, + "2": 2.00665, + "3": 3.2832, + "4": 2.63833, + "5": 2.43073, + "6": 1.4868, + "7": 1.81732, + "8": 2.74562, + "9": 1.18286, + "10": 1.18542, + "11": 1.27273, + "12": 1.63885, + "13": 1.31323, + "14": 2.29007, + "15": 1.52021, + "16": 1.87975, + "17": 1.3507, + "18": 1.48627, + "19": 1.17842, + "20": 1.17004, + "21": 1.30369, + "22": 1.24781, + "23": 1.13565, + "24": 1.13418, + "25": 1.21915, + "26": 1.24288, + "27": 1.15052, + "28": 1.12573, + "29": 1.15398, + "30": 1.13143, + "31": 1.17104, + "32": 1.12919, + "33": 1.1286, + "34": 1.14327, + "35": 1.1721, + "36": 1.12494, + "37": 1.2626, + "38": 1.11425, + "39": 1.14594, + "40": 1.18189, + "41": 1.09297, + "42": 1.09247, + "43": 1.18621, + "44": 1.19564, + "45": 1.08252, + "46": 1.08511, + "47": 1.23319, + "48": 1.08249, + "49": 1.0979, + "50": 1.07182 } } } \ No newline at end of file From 402bc50b1c2693dbde1fdc6c45416e37e1692f85 Mon Sep 17 00:00:00 2001 From: Santosh Bhavani Date: Thu, 30 Oct 2025 23:37:28 -0700 Subject: [PATCH 089/248] Add DeepSeek-V3 GB200 NVL72 optimization guide (#2059) Co-authored-by: Xin Yao --- docs/discussions/README.md | 22 ++ .../deepseek-v3-gb200-optimization.md | 252 ++++++++++++++++++ .../images/image1.png | Bin 0 -> 325505 bytes .../images/image2.png | Bin 0 -> 205208 bytes .../images/image3.png | Bin 0 -> 98729 bytes .../images/image4.png | Bin 0 -> 191466 bytes .../images/image5.png | Bin 0 -> 330297 bytes .../images/image6.png | Bin 0 -> 203011 bytes .../images/image7.png | Bin 0 -> 209740 bytes 9 files changed, 274 insertions(+) create mode 100644 docs/discussions/README.md create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image1.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image2.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image3.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image4.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image5.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image6.png create mode 100644 docs/discussions/deepseek-v3-gb200-optimization/images/image7.png diff --git a/docs/discussions/README.md b/docs/discussions/README.md new file mode 100644 index 00000000000..5dc19181842 --- /dev/null +++ b/docs/discussions/README.md @@ -0,0 +1,22 @@ +# Megatron Discussions + +This directory contains in-depth guides, tutorials, and discussions about optimizing and using Megatron for various use cases. + +## Available Guides + +### Performance Optimization + +- **[Optimizing DeepSeek-V3 Training Performance on NVIDIA GB200 NVL72](deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md)** + + A comprehensive guide on optimizing DeepSeek-V3 model training on NVIDIA GB200 NVL72 systems, covering profiling techniques, performance bottlenecks, and optimization strategies. + +## Contributing + +If you'd like to contribute a guide or tutorial, please follow this structure: + +1. Create a new directory: `docs/discussions/your-guide-name/` +2. Add your main guide: `docs/discussions/your-guide-name/your-guide-name.md` +3. Create an images directory: `docs/discussions/your-guide-name/images/` +4. Update this README.md with a link to your guide + +Each guide should be self-contained with its own images and supporting files. diff --git a/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md new file mode 100644 index 00000000000..e3573fa76ba --- /dev/null +++ b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md @@ -0,0 +1,252 @@ +# **Optimizing DeepSeek-V3 Training Performance on NVIDIA GB200 NVL72** + +**Authors:** Xin Yao (@yaox12), Hongxiao Bai (@hxbai), Yaobin Zhang (@buptzyb), Tong Liu (@Autumn1998), Fan Yu (@HWZealot), Kunlun Li (@kunlunl), Zhongbo Zhu (@zhongbozhu), Zijie Yan (@yanring) + +--- + +This guide describes how we used Megatron Core (MCore) and Transformer Engine (TE) to pre-train the DeepSeek-V3 model with MXFP8 precision on 256 GB200 GPUs. We will detail the step-by-step process of optimizing performance to **970 TFLOPS/GPU**, which is a **2.55x** speedup compared to the estimated 380 TFLOPS on H100/H800 (refer to the estimation in this article \[[1](https://zhuanlan.zhihu.com/p/16480858047)\] in Chinese). The related features have been or will be open-sourced to the [Megatron Core](https://github.com/NVIDIA/Megatron-LM) and [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) repositories. + +## **0. Methodology** + +To optimize the pre-training performance of a model, our methodology is generally as follows: + +1. Find a performance baseline. This baseline is usually the best performance that the current software stack can achieve on a given hardware platform and training precision by adjusting model parallelism, recomputation, and other configurations. +2. Use performance analysis tools such as [Nsight Systems](https://developer.nvidia.com/nsight-systems) (Nsys) or [PyTorch Profiler](https://docs.pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) to capture a profile file (also called a timeline or trace) and analyze it to find performance bottlenecks. For example, are there significant exposed communications, kernels with a significantly high proportion, or whether the GPU kernel layout is dense? We usually prefer to use Nsys because, with the help of NVTX, it provides a clearer display of CUDA API and GPU kernel execution. +3. Optimize for performance bottlenecks. Then repeat steps 1-3 until the performance expectations are met. + +## **1. Baseline** + +DeepSeek-V3 innovatively uses FP8 mixed precision for pre-training, which saves memory and improves training speed without sacrificing model accuracy. We refer to the FP8 recipe used by DeepSeek-V3, where activations are quantized at a 1x128 granularity and weights are quantized at a 128x128 granularity, as the blockwise scaling recipe. MCore (v0.13+) and TE (v2.3+) also support it. + +On the Blackwell platform, thanks to the native support of the fifth-generation Tensor Core for the MXFP8 format, we adopted the MXFP8 recipe, a more fine-grained quantization scheme for training. Both activations and weights are quantized at a 1x32 granularity, and E8M0 is used as the format for the scaling factor. + +Here, we will briefly introduce the difference in implementation between MXFP8 GEMM on the Blackwell platform and Blockwise FP8 GEMM on the Hopper platform. On the Hopper platform, since the Tensor Core itself does not support multiplication with a scale, after the matrix multiplication of each tile, it is necessary to multiply by the scale and accumulate the result with the CUDA Core. This also determines that on the Hopper platform, 1x128 is almost the finest quantization granularity available. If a finer granularity was used for quantization, the GEMM performance would suffer a great loss. On the other hand, since the Blackwell platform natively supports MXFP8, the dequantization process in GEMM (i.e., multiplying by the scale) is completed inside the Tensor Core, so the CUDA Core is not involved throughout the process, which can achieve better performance and support finer-grained quantization (1x32). + +When we started optimizing DeepSeek-V3 on the GB200 NVL72 platform with MCore, our baseline already included the following features: + +1. **MXFP8 recipe**, where the fprop/wgrad/dgrad inputs of all linear layers in the model are quantized at a 1x32 granularity, while Scaled Dot Product Attention (SDPA)/Embedding/LM Head/Router/Loss/Optimizer, etc., remain at their original high precision. For details on the FP8 recipe, please refer to our presentation at the NVIDIA AI Open Day in June 2025 (Video \[[2](https://www.bilibili.com/video/BV1mpMwz9Ey5/)\] in Chinese) and GTC 2025 (Video \[[3](https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/)\] in English). The option to enable this in MCore is `--fp8-recipe mxfp8 --fp8-format e4m3`. +2. **Multi-head Latent Attention (MLA) kernels** on the Blackwell platform, provided by cuDNN 9.11. +3. **MXFP8 Grouped GEMM**, implemented using multi-stream \+ cuBLAS. The advantage of this implementation is that we can support various quantization schemes at the fastest speed: as long as the single GEMM is ready, we can have a Grouped GEMM implementation with good performance. Our multi-stream \+ cuBLAS solution can achieve 2,672 TFLOP/s (flush L2) on the shape K=7,168, N=2,048, which is basically equivalent to a highly optimized Grouped GEMM \[[4](https://cursor.com/cn/blog/kernels)\]. We will continue to optimize the performance of Grouped GEMM. The option to enable this in MCore is `--moe-grouped-gemm`. +4. **Kernel fusions**, such as: + 1. Yarn RoPE fusion, enabled by default. + 2. Permute fusion, the option to enable this in MCore is `--moe-permute-fusion`. + 3. Cross-entropy loss fusion, the option to enable this in MCore is `--cross-entropy-loss-fusion`. +5. **Flexible Pipeline Parallelism (PP) layout**, making PP more balanced. The corresponding option in MCore is `--pipeline-model-parallel-layout [layout]`. +6. **Primary weights in FP8**. FP8 mixed-precision training supports two weight schemes: + 1. Dual precision weights (default): Maintains both BF16 and FP8 weight copies. Simple implementation but uses more memory than BF16 training alone. + 2. FP8 only weights: Stores only FP8 weights, saving memory and enables FP8 AllGather of the updated parameters for per-tensor and blockwise FP8 recipes when using Distributed Optimizer (ZeRO-1). Complex implementation requiring recipe-specific handling. The option to enable this in MCore is `--fp8-param-gather`. +7. **BF16 optimizer states**. According to the technical report, DeepSeek-v3 uses BF16 for optimizer states. This feature is orthogonal to the training precision, and it can be used for both BF16 and FP8 training. The options to enable this in MCore are `--use-precision-aware-optimizer --main-grads-dtype fp32 --main-params-dtype fp32 --exp-avg-dtype bf16 --exp-avg-sq-dtype bf16`. +8. **Fine-grained recompute**. By recomputing some modules with smaller computational workload but larger memory occupation, a large amount of memory is saved at a small recomputation cost, thereby minimizing model parallel sizes. In our baseline version, fine-grained recompute only supports BF16, and FP8 training is currently not supported. The options to enable this in MCore are `--recompute-granularity selective --recompute-modules [modules]`. +9. **Token dispatcher** supports both NCCL AlltoAll and DeepEP backends. However, at the time we tested the baseline performance, DeepEP did not support the Multi-Node NVLink (MNNVL) of GB200, so we could only use the NCCL AlltoAll backend. The option to use the AlltoAll dispatcher in MCore is `--moe-token-dispatcher-type alltoall`. + +On the above software stack, using the parallel configuration of TP1/PP8/VPP4/EP32/MBS1/GBS2048 on 256 GB200s, enabling recomputation of the MLP part of the dense layers (i.e., the first three layers of DeepSeek-v3) and the MLA up projection (`--recompute-modules mlp up_proj`), with the PP layout as `--pipeline-model-parallel-layout Et|(tt|)*30L` (a total of 32 stages, where the first stage is Embedding \+ 1 transformer layer, the last stage is Loss, and the middle 30 stages are 2 transformer layers), using the AlltoAll token dispatcher (NCCL backend), and enabling BF16 optimizer states, we achieved a performance of 494 TFLOPS/GPU. This performance is obviously not satisfactory, and we will optimize it from several aspects. + +## **2. Performance Optimization** + +By capturing and analyzing the Nsys timeline corresponding to the baseline, taking a forward iteration as an example, we can see that the biggest performance issue is that there are large gaps between kernels, and the CPU kernel launch speed cannot keep up with the kernel execution speed on GPU. We call this phenomenon *CPU overhead* or *host boundedness*. This overhead mainly comes from Python code (such as loops, `getattr`, etc.), PyTorch's Python and C++ logic code (for example, a simple `torch.empty` will not call any CUDA kernel, but it will generate a few microseconds of overhead on the host side), CUDA kernel launch, etc. The reason for this phenomenon is that, on the one hand, the speed of GPU executing kernels is getting faster and faster, resulting in not enough time to overlap the CPU execution time. On the other hand, FP8 training and fine-grained MoE models introduce more quantization, router, and other kernels. The main idea to solve CPU overhead is to reduce the number of kernels through kernel fusion and use CUDA Graphs for graph launch to bypass repeated work on the CPU side. + +![images/image1.png](images/image1.png) + +In addition to CPU overhead, we can also see several other obvious problems: + +* The length of the Permute kernel is clearly abnormal, suggesting that this kernel needs to be optimized. +* Before the GEMM in the Expert part, there are a large number of small, fragmented kernels. This is obviously abnormal, and we need to locate what these kernels are doing and whether they can be eliminated or fused. +* The NCCL-based token dispatcher, which requires explicit global token permutation, is not optimal. +* The overhead of recomputing MLA up projection is not as small as expected due to the CPU overhead. + +Therefore, our optimization plan is roughly as follows: + +1. Kernel fusion and optimization +2. Memory saving to allow more optimizations +3. CUDA Graphs to resolve CPU-side overhead +4. CPU-side optimizations +5. HybridEP: An Expert Parallel (EP) communication library developed based on a new set of API, with functions similar to DeepEP, but able to achieve higher bandwidth with fewer SMs, and fully supporting MNNVL. + +### **2.1 Kernel Fusion and Optimization** + +#### **2.1.1 Optimizing the Permute Kernel** + +The permute operation in the MoE model rearranges tokens in memory for communication and computation. The AlltoAll dispatcher using the NCCL backend requires one global and one local permute before and after EP communication, respectively. The Flex Dispatcher of DeepEP or HybridEP fuses the global permute into the communication kernel, eliminating the need to explicitly copy the tokens top-k times, but still requires a permute kernel to copy and rearrange the tokens distributed to different local experts after EP communication. TE [PR 1927](https://github.com/NVIDIA/TransformerEngine/pull/1927) significantly improves performance when top-k is much smaller than the number of experts (e.g., DeepSeek-v3's 256 experts with top-8), with up to a 10x unit speedup. The option to enable this in MCore is `--moe-permute-fusion`, and we recommend setting `--enable-experimental` for more aggressive fusions. + +#### **2.1.2 Fused Memory Allocation for the MXFP8 Quantization** + +By comparing the code and the Nsys GPU trace timeline, we found that there are mainly two types of fragmented kernels in the Expert part: `torch.zeros` kernels that allocate the scaling factor for MXFP8, and the kernels that swizzle the MXFP8 scaling factors. The reason for using `torch.zeros` instead of `torch.empty` to allocate memory for the scaling factor is that the Tensor Core requires the scaling factor to be padded to a specific shape, with the padded part filled with 0. In optimization 2.1.3, we fuse the zero-padding to the swizzle scaling factor kernel to avoid `torch.zeros` kernels. + +When performing MXFP8 quantization for each tensor, four tensors need to be allocated, namely {row-wise, col-wise} * {data, scaling factor}. As mentioned earlier, even when using `torch.empty` to allocate memory, each PyTorch API call introduces several microseconds of overhead, resulting in significant CPU overhead. Our solution here is to pre-allocate a large memory buffer for data and scaling factors, and then construct tensors from this buffer using the `aten::from_blob` API by calculating pointer offsets, thus avoiding a large number of tiny `torch.empty/zeros`. For the specific implementation, please refer to TE PR [1793](https://github.com/NVIDIA/TransformerEngine/pull/1793), [1934](https://github.com/NVIDIA/TransformerEngine/pull/1934), and [2134](https://github.com/NVIDIA/TransformerEngine/pull/2134). This optimization replaces the previous implementation and is enabled by default. + +#### **2.1.3 Fused Multiple Swizzle Scaling Factor Kernels** + +As mentioned earlier, the second type of fragmented kernels in the Expert part is swizzling the scaling factor. This is because the Tensor Core requires the scaling factors to be swizzled according to certain rules (refer to the [cuBLAS documentation](https://docs.nvidia.com/cuda/cublas/#d-block-scaling-factors-layout)). We fused the swizzle operations of the scaling factors of multiple input tensors into a single kernel, and handled padding with 0 in it. This completely eliminates the `torch.zeros` kernel when allocating the buffer mentioned above, reduces the number of kernels, and alleviates CPU overhead. For the specific implementation, please refer to TE [PR 2019](https://github.com/NVIDIA/TransformerEngine/pull/2019). This optimization replaces the previous implementation and is enabled by default. + +In addition, theoretically, we can fuse the swizzle scaling factor into the quantization kernel. The main reason we haven't done so yet is to consider that when MXFP8 data needs to be communicated, such as in TP and EP Dispatch (which are not yet supported), un-swizzled scaling factors are more convenient for communication. Of course, the ideal situation is to make the quantization kernel configurable, so that it does not perform swizzling where communication is needed, and performs swizzling otherwise, thus avoiding redundant operations. + +#### **2.1.4 Kernel Fusion in the Router Part** + +The Router part contains a large number of element-wise operators, mainly for calculating the routing map, i.e., which experts the tokens should be assigned to, and for calculating and counting the aux loss. We fused some of these kernels, reducing the total number of kernels in the router part from 72 to 31. For the specific implementation, please refer to TE [PR 1883](https://github.com/NVIDIA/TransformerEngine/pull/1883). The option to enable this in MCore is `--moe-router-fusion`. + +The reason why it cannot be completely fused is that the remaining kernels are separated by communication kernels of global auxiliary losses calculation, which are not easy to fuse. There are also many kernels scattered in different Python logic codes. If they are forcibly fused, it will mess up the code structure of Python. Moreover, we will apply CUDA Graphs for the router part later, which can already solve the CPU overhead problem well, so there is little performance gain from fusing the remaining kernels. + +#### **2.1.5 Quantization Fused to Normalization** + +cuDNN supports fusing MXFP8 quantization into normalization, including layer norm and RMS norm. To enable this feature, we suggest using cuDNN 9.14 or later and set the following environment variables. + +```shell +NVTE_NORM_FWD_USE_CUDNN=1 +NVTE_NORM_BWD_USE_CUDNN=1 +``` + +Under the same parallel configuration, we measured that optimizations 2.1.1 and 2.1.2 improved the end-to-end (E2E) performance by 35 TFLOPS, optimization 2.1.3 improved it by 35.5 TFLOPS, optimization 2.1.4 improved it by 10.5 TFLOPS, and optimization 2.1.5 improved it by 13.8 TFLOPS. The Nsys timeline with optimizations 2.1.1, 2.1.2, and 2.1.4 enabled is as follows (the reason for not including 2.1.3 nor 2.1.5 is that they were done later, and at that time the timeline had already been superimposed with other optimizations, so it could not be directly compared): + +![images/image2.png](images/image2.png) + +Although it still doesn't look very satisfactory, it has improved. + +### **2.2 Memory Saving to Allow More Optimizations** + +#### **2.2.1 DeepEP** + +Theoretically, on the GB200 NVL72 system, all EP communication is within the NVLink domain. Thanks to the bidirectional 1.8 TB/s bandwidth of MNNVL on the GB200, EP communication will be greatly accelerated. However, DeepEP still does not officially support scenarios where the NVLink domain is larger than 8. We have supported the EP32 scenario based on [this community PR](https://github.com/deepseek-ai/DeepEP/pull/218). But this support is not well-optimized. In the EP32 scenario, the dispatch can only reach about 400 GB/s and the combine can only reach about 190 GB/s algorithm bandwidth with 24 SMs, which is a large gap from the unidirectional bandwidth of 900 GB/s for MNNVL on the GB200 NVL72. Therefore, after switching to DeepEP, we did not get the communication benefits, but got some memory-saving benefits (DeepEP does not need explicit global permute, so it reduces the peak memory consumption), and reduced CPU overhead (DeepEP uses a fused kernel for the EP communication preprocess, further reducing the number of kernels in the router and preprocess parts to 17), so we put DeepEP in the memory optimization part. + +The options to enable DeepEP in MCore are: + +```shell +--moe-token-dispatcher-type flex +--moe-flex-dispatcher-backend deepep +``` + +#### **2.2.2 Fine-grained Recompute for FP8** + +The conventional recomputation method recomputes multiple modules to save all intermediate activations of a Transformer layer, but recomputing a single module does not take effect. We want to do more fine-grained recomputation, that is, recomputing some modules within a Transformer layer with low computational intensity but high memory consumption, to save more memory at a lower performance cost. Therefore, we implemented the [output discarding recompute](https://github.com/NVIDIA/Megatron-LM/blob/e000263e21ac89571123303c4043ec9ea7261513/megatron/core/tensor_parallel/random.py#L521) in MCore to support recomputing a single module. + +In addition, for FP8, we need to consider that the FP8 quantized version of the discarded output may be saved by subsequent layers, which would not achieve the goal of saving memory. Therefore, we need to tell the FP8 module to save the original input (so that it can be correctly discarded) instead of the quantized version. The cost is that we need to re-quantize during the backward pass. For implementation details, please refer to \[[MCore commit](https://github.com/NVIDIA/Megatron-LM/commit/781e765818b86b8f2e03ac6bb6b09aaaa9d17074)\] and \[[TE PR 1865](https://github.com/NVIDIA/TransformerEngine/pull/1865)\]. + +This technique is also applicable to SDPA and the subsequent Linear module (called Projection Linear). Because SDPA is a special module, it saves its own output for backward computation, while Projection Linear saves the input for backward computation. In BF16 training, these two tensors are actually the same tensor, occupying only one copy of memory. In FP8 training, SDPA saves a BF16 output tensor, while Projection Linear saves an FP8 tensor quantized from the input tensor. These two tensors do not share memory, so it actually saves 1.5 times the size. We can use a similar method to tell Projection Linear to save the original input instead of the quantized version to save memory. Similarly, the cost is that it needs to be re-quantized during the backward pass. + +![images/image3.png](images/image3.png) + +E2E testing shows that enabling DeepEP reduces the CPU overhead of the router and preprocess, improving performance by 54.3 TFLOPS. By using fine-grained recompute, the redundant activation saved between SDPA and Projection is eliminated, allowing us to turn off the recomputation of MLA up projection, which improves performance by 44.7 TFLOPS. The reason is that although the MLA up projection has a low computational density and the cost of recomputation is theoretically small, it also has serious CPU overhead, so turning off recomputation can achieve a certain performance improvement. Correspondingly, the recomputation parameters were changed to `--recompute-modules mlp moe_act`. The following figure shows the Nsys timeline with DeepEP enabled and using new recompute parameters: + +![images/image4.png](images/image4.png) + +### **2.3 CUDA Graphs to Resolve CPU-side Overhead** + +CUDA Graphs significantly reduce CPU overhead by capturing GPU kernels into a static graph that replays entire kernel sequences in subsequent iterations, bypassing most CPU logic. However, captured parts must be static with no dynamic shapes allowed. In Dropless MoE models, routed experts are dynamic while attention, router, EP preprocess, and shared experts remain static, so we capture these static components to minimize CPU overhead. + +We have developed the Partial CUDA Graphs feature in MCore and TE, which allows us to capture only a part of the model. The parameter in MCore is `--cuda-graph-scope`, and the supported options are: + +* `attn`: capture the attention part. +* `mlp`: capture the MLP part of the dense layer, for example, the first three layers of DeepSeek-V3 are dense layers. +* `moe`: capture the moe part, only supports token-drop MoE. +* `moe_router`: capture the moe router part. Also capture shared experts unless the shared experts overlap is enabled. +* `moe_preprocess`: capture the EP preprocess part, must be used with `moe_router`. +* `mamba`: captures the mamba layer. + +In DeepSeek-v3, we finally used `--cuda-graph-impl transformer_engine --cuda-graph-scope attn moe_router moe_preprocess` to capture attention, router, EP preprocess, and shared experts of each layer. The partial CUDA Graphs feature is temporarily only available in `--cuda-graph-impl transformer_engine` implementation. Another implementation is called `local`, which introduces full-layer and full-iteration CUDA Graphs support, but not feasible for MoE models due to the dynamic shape issue. + +One limitation of CUDA Graphs is that it occupies additional memory. The number of CUDA Graphs we need to capture is `L*M*2`, where `L` is the number of layers per GPU and `M` is the number of micro-batches in one iteration. `*2` because we need to capture both forward and backward graphs. This additional memory of these graphs comes from three aspects. + +1. The structure of CUDA Graphs itself occupies some memory. This memory usage increases with the number of nodes in the graph, but the amount is typically negligible. +2. CUDA Graphs need to use an independent memory pool. PyTorch’s caching allocator cannot reuse the memory in this pool for operators outside of CUDA Graphs. +3. CUDA Graphs need static memory buffers for input and output data of the graphs. + +We have made a series of optimizations to optimize the memory consumption of CUDA Graphs, especially targeting 2 and 3. For 2, though graphed and non-graphed parts must use separate pools, we managed to make all graphs share one pool by capturing them in the same order they will be replayed. For 3, we reuse the static memory buffers between graphs as much as possible following its PP pattern. For details, please refer to the `_order` and `_reuse_graph_input_output_buffers` arguments in TE [make_graphed_callables()](https://github.com/NVIDIA/TransformerEngine/blob/release_v2.8/transformer_engine/pytorch/graph.py#L847-L863) API. In addition, we have also made a series of adaptations and optimizations for CUDA Graphs for MoE models, different FP8 recipes, MTP support, flexible PP layouts, and precision alignment to ensure it works correctly and efficiently. + +The following figure shows our timeline after enabling CUDA Graphs (this figure also includes 2.1.3 fuse swizzle scaling factor). It can be seen that the CPU overhead problem has been greatly alleviated, and currently only the routed experts part still has some CPU overhead. Enabling CUDA Graphs has improved the E2E performance by a total of 84.8 TFLOPS. + +![images/image5.png](images/image5.png) + +At this point, we can see that the performance problem of DeepEP is beginning to become a bottleneck, and we will have work to optimize it later. + +### **2.4 CPU-side Optimizations** + +Adding [bindpcie](https://github.com/NVIDIA/mlperf-common/blob/main/client/bindpcie) to the startup phase of each training process, so as to automatically detect the GPU/NUMA topology of the local machine based on the rank of the local process, and use `numactl` to bind the CPU and memory of the process to the local NUMA node corresponding to its GPU. This reduces per-GPU kernel launch latency and the latency variation among GPUs, and improves E2E performance by 70.6 TFLOPS. + +It is worth mentioning that since CPU overhead is a major performance issue in FP8 training, and in language model training tasks where the data loading pressure is small, usually only a few CPU cores are responsible for launching kernels and are in a high-load state. For example, on a DGX/HGX NVL8 system, if core binding is performed, then 8 GPUs correspond to 8 processes, which correspond to 8 CPU cores. Therefore, we recommend configuring the CPU to a mode that allows some cores to boost to the highest frequency, which can significantly improve the performance of FP8 training. + +With the help of CPU-side profiling, we're working on simplifying the host-side code of TE, such as removing unnecessary checks, PyTorch APIs, and CUDA calls. In addition, we are working with CPU experts to explore other CPU-side optimizations. + +### **2.5 HybridEP** + +HybridEP is a new EP communication library developed by NVIDIA, with functions similar to DeepEP, but it can fully release the performance potential of the NVL72 architecture and also supports intra-node and inter-node communication on the Hopper platform. HybridEP mainly has the following features: + +* Fully adapted to the NVL72 architecture. Within the NVLink domain, Tensor Memory Accelerator (TMA) is used for data copy to minimize the number of instructions and reduce resource occupation. +* Deeply optimized RDMA communication across NVLink domains using IBGDA technology. +* Ensured that there is no redundant communication during data distribution. +* Completely asynchronous at the kernel level and adapted to CUDA Graphs. +* Can flexibly adjust the number of occupied SMs and achieve excellent performance with as few SMs as possible. + +HybridEP is fully adapted to the NVL72 architecture and can achieve high transmission bandwidth with fewer SM resources. +![images/image6.png](images/image6.png) + +It is worth mentioning that although we only report the performance of EP36 here, HybridEP actually supports the full NVL72. Therefore, if future models are designed with the number of experts being a multiple of 72, HybridEP can fully utilize the bandwidth of NVL72. This also reflects the philosophy of model and hardware architecture co-design. + +When integrating HybridEP into MCore, we need to solve a problem: in the implementation, we need to register some special buffers so that they can be accessed by other ranks in the same NVLink domain. And since the output of dispatch and the input of combine both exist in the buffer managed by HybridEP itself. This buffer is globally unique on the current rank and is reused between layers. We need an extra D2D (Device to Device) copy to copy the output of the dispatch kernel from the buffer to the downstream required PyTorch tensor, or to copy the input of the combine kernel from the upstream PyTorch tensor to the combine input buffer. And the duration of this D2D copy is about 10%-20% of the communication time. + +Considering that the MoE permute operation following dispatch, we’re doing + +1. EP communication over NVLink: dispatch -> HybridEP managed buffer +2. D2D copy: HybridEP managed buffer -> output buffer in PyTorch tensors +3. Permute: output buffer -> permuted tensors to be fed into experts + +Therefore, we choose to fuse this D2D copy with the subsequent permute, that is, while permuting, we also complete the data transfer between the HybridEP managed buffer and the ordinary PyTorch tensor. Furthermore, since cuBLAS FP8 GEMM requires the input M dimension to be aligned to 16 (per-tensor recipe or blockwise recipe) or 32 (MXFP8 recipe), and the output generated by permute is very likely not to meet this requirement, it needs to be padded in the M dimension. This padding task is also essentially a D2D copy, and we also fuse it into the permute process. + +The options to enable HybridEP in MCore are: + +```shell +--moe-token-dispatcher-type flex +--moe-flex-dispatcher-backend hybridep +``` + +The figure below shows the timeline after we used HybridEP to optimize EP communication and permute/pad, which improved the E2E performance by 113.6 TFLOPS. + +![images/image7.png](images/image7.png) + +HybridEP has been open-sourced as an [independent branch](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) in the DeepEP repository, have a try now! + +## **3. Summary and Outlook** + +We started from a baseline of 494 TFLOPS, and through multiple rounds of performance analysis and optimization, we finally reached 970 TFLOPS, achieving a 1.96x performance improvement. The following is our optimization history sorted by time: + +| Model | System | Precision | Dispatcher | Feature Roadmap | TFLOPS/GPU | +| ----- | ----- | ----- | ----- | ----- | ----- | +| DeepSeek-V3 | GB200 | MXFP8 | AlltoAll | Baseline | 494.46 | +| DeepSeek-V3 | GB200 | MXFP8 | AlltoAll | Fuse torch.zeros for scaling factor allocation & Permute kernel Optimization | 529.55 | +| DeepSeek-V3 | GB200 | MXFP8 | AlltoAll | Router fusion | 540.00 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | Enable DeepEP (Will switch to HybridEP) | 566.07 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | Remove up\_proj recompute | 610.71 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | CUDA Graphs | 663.27 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | Tune DeepEP (Will switch to HybridEP) | 691.49 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | CPU-side optimization | 762.12 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | PDL for quantization kernels & Fuse MXFP8 swizzle scaling factor | 797.67 | +| DeepSeek-V3 | GB200 | MXFP8 | DeepEP | CUDA Graphs capture shared expert | 829.93 | +| DeepSeek-V3 | GB200 | MXFP8 | HybridEP | HybridEP | 943.56 | +| DeepSeek-V3 | GB200 | MXFP8 | HybridEP | CPU-side optimization | 956.21 | +| DeepSeek-V3 | GB200 | MXFP8 | HybridEP | Fuse quantization to normalization (cuDNN 9.14) | 970.01 | + +### **3.1 Future Work** + +1. Completely eliminate CPU overhead. We hope to eliminate the device-host sync in the MoE model (its purpose is to get the tokens per expert information), so that we can use CUDA Graphs for the entire model and completely eliminate CPU overhead. We used a small proxy model to estimate that this optimization can achieve at least a 10% additional performance gain. Please refer to the MCore MoE [roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729). +2. Scale to a larger amount of GPUs. Our current parallel configuration is already limited by the number of GPUs (EP32 * PP8 = 256 GPUs). If we expand to 512 cards, we can explore the performance of EP64. Theoretically, since EP64 is still within the NVLink domain, its communication overhead is still small. And a large EP can reduce the number of local experts, thereby reducing quantization and other overheads, and improving the performance of Grouped GEMM. +3. Explore the use of NVLink-C2C's CPU offloading technology. Since the GB200 NVL72 system has NVLink-C2C, the connection between CPU and GPU is faster than PCIe 5.0, so offloading is a very promising feature. For example, with the help of CPU offloading, can we increase MBS to 2? If so, it will greatly improve the computational intensity, and many of the CPU overhead problems mentioned earlier may no longer exist. + +### **3.2 Some Discussions** + +1. Why didn't we use FP8 dispatch on the GB200? + * FP8 dispatch is not a free lunch. Since we can only transmit row-wise FP8 data, we need some extra "de-quantize and re-quantize" kernels to calculate col-wise FP8 data for backward computation. The overhead of these kernels offsets the communication time saved by FP8 dispatch. +2. Why didn't we use 1F1B AlltoAll overlap on the GB200 (a kind of inter-batch overlap scheme similar to DualPipe, for details see MCore commits [8333bd5](https://github.com/NVIDIA/Megatron-LM/commit/8333bd5bb6de2bdbdb3ebebf224b4a339a04ec90), [ae1c882](https://github.com/NVIDIA/Megatron-LM/commit/ae1c88296f465ab4ac9c503d75a57ba4044c47d1), [d7bf5aa](https://github.com/NVIDIA/Megatron-LM/commit/d7bf5aaaa8e331f901366621db009b0c2880c8fd))? + * First, thanks to NVL72, EP communication is very fast, and the necessity of overlap is not great. Second, 1F1B AlltoAll overlap is not a free lunch either. It divides the forward and backward into multiple stages for scheduling, and there is some synchronization between different stages, which aggravates the CPU overhead, so the overall benefit is negative on the GB200. If we can further solve the CPU overhead problem, we can re-evaluate the benefits of 1F1B AlltoAll overlap. +3. How much performance improvement is there compared to the H100? + * DeepSeek's technical report did not announce the TFLOPS during its pre-training phase, but some article \[[1](https://zhuanlan.zhihu.com/p/16480858047)\] (in Chinese, we recommend reading it by translation) has estimated it to be around 380 TFLOPS, so the 970 TFLOPS on the GB200 is a 2.55x performance improvement. This surpasses the 2.5x improvement of the GB200 over the H100 in FP8 computing power. This significant performance gain is attributed to leveraging MNNVL on the GB200 for optimized EP communication and utilizing the substantially larger device memory on the GB200 to explore enhanced parallel configurations. + +## **4. Resources** + +**Complete Training Examples** + +* [DeepSeek-V3 Training Scripts](https://github.com/yanring/Megatron-MoE-ModelZoo) \- End-to-end training configurations and launch scripts + +**Papers and Technical Reports** + +1. [DeepSeek-V3 MFU Estimation](https://zhuanlan.zhihu.com/p/16480858047). An article in Chinese estimates the MFU of DeepSeek-V3 training. +2. [FP8 Training Recipes, Performance and Convergence](https://www.bilibili.com/video/BV1mpMwz9Ey5/). A video in Chinese introduces FP8 training recipes, performance and convergence. +3. [Stable and Scalable FP8 Deep Learning Training on Blackwell](https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/). GTC talk on FP8 training on Blackwell. +4. [Cursor's Blog on Faster Grouped GEMM Kernels and MoE Training.](https://cursor.com/cn/blog/kernels) diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png new file mode 100644 index 0000000000000000000000000000000000000000..6e4dad685c4251ecee64a1c8d221ae869ea5ff43 GIT binary patch literal 325505 zcmZU41yGeyw>BtJ5>kRlgS51igfu8AQqm35NT&kQjY_wK7)VH`G)PN#cX!`)zWd*q zZ{~N#I>0&ayZ2t}sr7~^E6QMFl42quAz{nOO1?xwLVbpWbc-7u6<*Oi8k&H=P~M8m zy+VgyUg##lNJvyja*|@N+)_4cU7gfiT~4k9R_^OMuN9o&pi>Lp8uYX)b9h)@B)s5$ zAe1P-oGKL8;lbd(Pp8GS(eWj`#LTpX+xBlWNwU#BD#2U*Vs|j!lcGQQSM9VRuyVDq ze%VJ<#_jS8d)24b$T#yX1G;LrN^QG?@ zAIZ$vUG;|am|OEfjKS)@cumt`2`kbIZVM};dS}S7Hm4IJ%#;);bQs+h;sZQ93Ue`q zdhw=o3b)JI8`hinko=Lmw_1Hyo|1gCrExBOH-BChwLa%hm;FIgMYZ^s=x^*tA{ZTO zmw|N?#tTSFK}-HT4zU4W-lPTeNQ~F6!R4|1|Cbj;UK~C-{JcT`rk(^RB~Y@@Lk# zKl}}LOVQR79qwhCuLbOkYfU4l-3ASP9H^F;9T&eCUOZ#~32BK)b^WmVF=-ySj zA$6VpQ&_Go9ot9uAO(k&vOXyDoOF~Nx3$@lN)G?E8%ock4g1~{wAjTGK9SR1Y;jWI z+h4wX3CPHx;z`FppY3Hq&Y~1aDF3p zy~)iV)uJFT_JZ>ZVLx5)*&%K0_4q?0jYC|Fq<@*RHwm8V)j1QrNU}?GL?>eY#JLk?=8SKQ2An)Z~4mMzk9x5B514 z$K_1DxL{p+19QUotEghbur>-|i>r@xUpgHZg(;s2QQcO+J!V1SqS~G{onKCqO1!D3 zqT4KD9fw;F!lpr{k;5-2$YKm-N1pe`HJ)`6{NN+B-f>HJgd7JgG)w6I%A!B+0%t85UE#U ze9>~&MKr~ePwti#-FVHCEG32C#1NydZhq48i+4hy93&iMqDa+>V$6NFJKs;`6F6?G z@ru5Qe`bK)Ctndf`klB)I`jnS&y@5umh^x<5iSgerCj&x$4WdYvDo8LsHM}aXt zEix+a>ySyXEN(7#IIdQFqNdKOMa8_&E-bFEagUSa@MD`I=i+c@XYT~xu^D&zsDJgK znNFx;M}_c9m^sJg=}Ppi%b^*uY4MMy-@H!VRC>2bw+W$!md1zTsiTuCr*2&`>R<(-~?Ub_Q&+=C%OAC+JhpI z4EReQE)Cu6ozf|JpB{4#%-Zxpj$e=17sB)Gw;3U=fy z%OrNH^(F~4xmb~dI3+U@BvwCs?(9;jbF$WFkzWwn<;>2V z*`+OMXl2!k#?tsq0Pyexg(ih$aY5FftICoHbw zF`DCV1x@dlzwzIAP|K09o;ckppP-GyBl%0uF&!5_vHgs+NU69#z=Stsgqe3jx{a{wW;QG47G#0?JCio1C=`b*3RxFM|bC&@U$&H_1yD|2x;14R$l>qmCu=Z1min? z7^OPO*);ub@>TA17)LobA(z@pw*o$+Mme9`P3OUaa!r-k7#E&dxJP`-u~RzkmWUtG zoS~?3R#H=&v@IWb3tJ!;8%?*y?TouSYNd7{IjfT~R}xfDDU#>*c5$Y^3bv_7rLN7Q z(!2jH(sr6{F9D7+gM!41?f_2DEi`A5H`4U-?ExFV`|Jt2w@CTOsj`=zxcFK_0g_9yv)IXUSQxr`^VnrQ7@Jc)_le(RN4c#8ce9%*kbGw`RfOZ5~BztH#@$%Ti1 zI2i$Y`mqjF&g&v9ICnUcsq7hR@s@E z$fBa6@ICcOX>wcxvq*vSMz}IRP&(Of@J`_hi90iQC`RSt?a?GrlsT zYUNhe`WF-=TOcs?ce_yKKyr5EHp6=3`ELDeCKYpWr3|<$meK z^?5NekJe|0N*zLCVs3tZ^oHgTf{?JVrY#p^4wC#IKXSf)#ZXg|Q&gmX{J6V|__JwZ zW24yf7cUU6Ffpx_SXS`VV44f3HyYD~+1*F=M^5f;{cxAqUNDse3)Rul5giv72qqTl zZP<1vS5H4TOG`_j7wDq>1{@sx{OzevU01iZ9$+7STt65PV7G~FD~AD@F1k1 z01lag0=9xe8Zt^qNQjAv$?36)J}q3p*4Ea)fB)cz(J?TnpFY)Y?=maFNPd8iFRrK< zDf^i*RNUG5$&VjD;JRPFd?_n0e^?e0Yo7D5bzFjifc(6QO73LD(eh+iIIg34YofgJ z?FtTX1VX~*@CV#Ty@Ep{BQmC@kJ#eF-n~OI>rIQuVr(xCZ5>g#Jkpc$>gymnQpJNY?+ff(}iFooZ@oDn-OQ zDLfpDO~3AMrbJM1XsEcF+W7dmW~Jk!7x8RYwcjG8^(m)T*E*Ofg`kJfoFY*Ol$Mtt z^vh6k^6*G{c?p%<&Ex{#9MSWPNK2#msZ;5fo*pDFKFD)^alx4=&#L+BnOAFH-#tp7 zQ|jBdZ_6j~VEFppTnK&Gkjuk|=+4{+&$g^Ve_RjCZDMkphB(2K)LNpuM;s z4QarE%l!Ub{?(;rlC*{f@z~hd;_@=|!92y(d1xJ@*1XF-sa^lRs=mJ{tiQhQjSHiD zA?sZ$yMjrDQYsautp4+ePPElhD!kD`@|IyqI6Z->Mq#n3O#7hT`669?GrD){Q%{l z^bakON_UAuRD%6muM`#*JK_k5h++~F+F?=iBFW4q%53D5c_TVEM#g?y@zKY{G&DSe z)9_-ZK#f(m>ZA7W-(LG23>7+TSy|b@fWNG%lPigweR1WRvUo77AkF!jK9)6G;jdU~ zJyO%La9Uf^Ln_zv5iiraN0-DW*{oeqP=yboLbHCoT~UO7{m5x?edwF@Sn&sF?hb1M zkB+yeST=Q3RPZ)N3eC@*dY}CB-hdO{lP=mDi2YFZ`E&dkhQ-W{NEfBow4^ROcIh*F ziG4@%@yUP{uC67p6R&c3k;9zu@ooNL*IgGu%J-C#&SLe=Uwo}iIjA%?v zO*hAXhqbrM>aY<@NlC%&F1;Z7>o(TXzmclcwmlKnQT!Kr-)Nop)9dSNP!Y@~Z$;G^}9 zl~04_sXJ7kaWL!6m4p~)Se`I_Nn?aA`7u7e#A<|1TtWgMGOw~Sae7)${k!IS=u58k z7v!)6D0lAkPfz#2SzuBBZs6vYK>pOFZ*)`^ZX&{F>+r;B_f>Q4&fflt=*e!K-(*1T zUP?00`Cc3PImPRlxWsNV%)565JPsK-IXPux0X!H+YuvfH&3jDj?W0|1yor$jVinVb zI^Z4>M7*mi(x87q6aQs5BeL9^uJrnK5~B}kSdr&yFI;%~!@}M1u!vcBAH*3=N%_0G z%Yi7=55#&@EG%Kwt~7`K zGa!auPfgR@z4r@+qydyu$omzmfj(@u<;h=F^X7MUdZFtl2)T1179I|?c`wED=g$+K zSlvgGkdQb%JLBZwctrQ%PLe<4Z zPEL++X;`(({<1Lo&NpM6pP$zVidyqNgpW_?HbX^4g?H8bp$>-*w$>s^T2qtcnpUVl zF)t2!D+qaHWMt@$H`8(-+wi2bWBmMX2?`317MtFE|NcEtkI=BNqKU1`%N0>#uBlp2 zzV7bs?>d#SL@esTCo`3AgJ47BwMxw3?ig8E7EbpTgw7V@{-y{J$i>=6qL^M>Tu6PI zZD>F;D@lan2n-5Z+uD-)Z2G6EY470R;8o?s7FKyV-dO)TE!028$`ayzliO`|eyb%J z1XBJ7VGW_=iHV8HaQ)v`>+w>#ni&h8mryN{ z`kuJBcx$Z!O0UPwVEXmPh=+V!cGE&U3fYB)QCV3u%gake zrd{__6dCT^xua8I{~nIPz`uWz-rmADySO*+I!xvd*9I3?R!ZCt%t%Q|`8|(W?ds0x z0c(JJ?)N<1f$COx{hAcc={s`?v+@ZHbo7QdzXt{fX@>>A4Kd$}hOcBj7vbkm=aL9M zp-g-6-#YJ$3YRU?!CZxf`Iazf+1H!B*I3#8?Qp^TZo5=amvpqW{yja4c$!XvsR022 z_ee-&mF(F4{rzEqfRxh+(t1`^Kd`StSACUrczb6@V(=9OUC>GSttp-G0y(_8HSO-|~l7wRc!Y9?f; z{2sFgvS!ScNJ2(NC{ky812vO5r|#GI7MD~^<&l!Bo<09uW20WFSQOpib1x)!6P>2^ zg&QK^mqSBCH5c2}mrMHBIkNZktDG?6yIOF4PbL2R`2!0ye|0jGmy>g+WK13aqwDiy zI%_5^a z1qJ);H~Z|bUcI_UP98C3+csxU^$-i|_ymvJnwAlL^+RM2&gSE8NfflZGd_oU^7=D1 z_+28wMj)}M0eTf)zQnhy*+YfaKUM9j`ucTespTNz&_H?dO}qI6q5(w685&Y2CMF)9 zoTx%|13c-{E2YWb>~f&_9Evs~$<8_ouB}rxa0oRw%u+$Wg3~)|&%e61)@Z?lfB!!H zygW!KXmfK5XlTBTLb!$o(pP79tj@C0)zlF532p-Jf5EYmgoMOkED>gqr?{B38x3c-^`RX!M*utE z^9S&s=(M!R!8JlD@^g_f+j8UbiI%#G?CjgsQ;|?Vcz9{nU+CoER0V6w!g&tP^d>sR zM}S}7?E`mrcMy@kJWp)*QgSWx3pxSvOU$~B=Kg%}Yf@Qpwzjj&gVH%Zb^(?nr=akN zo__fSF4s-X=3;huzbp?St(yU9hR)sw^unwR@4#GzxpsDN^+wq{qTiV&REjelG%Ze$ziHQJ?@INHHY^VVX;EMAqG1A8 zglx&l32|pd^%p}?aITX~_1s(TP6o()sj4caW&oNFQ3~wrQTzLj4JFY}o;=y>U`W4* zhbN}5PYL)2qSNtYTX%c15+v}K+}!=ix4oH+n8a*4v#`EK2W3Vrn9IQTx7~uA-PayQ z_V35ig}}yed7mG8>Y_?#YkUf0W6v9u1Ik`dz-9Oc1@uQHG}8wU9suiB)z*FiD~evG zSLc;DKW_v@y}G^*Vn^x2;ScT7va)$6!Nez4pGOVInHdds2i((~D>#yo6r@yt7Z#dc zP=8=!zj7^~c)W4nm*4LPjjZfQ9DfN^wmbN~SFSiAhp)$>UD z1+s7{MdX3m`}p_(!6YCdX$PPIokI{Z2V#xOyoU^+-t%}fv$9f)jsjLl>E+9Tilb_$ z$~9fTD&avM?2@s+p+qdeqW{d^lm#|#2|gF_gis1^Xgf7E6`=@d#!P>|9xJnXh=+%_ zz>JYqC|ac#Q5G^A({g^a0phgTw*0o+-tnHu(QMNmoxuXW>`d*K07Q7wo(8v z8JwTYs;cwpIXV3L%O&=ZD9~As{S!020pUEnzj=(YQCqM^032M#t?{$6zyAE0o00L1 zj@JBdLSyWs7ZBKy#HThHAIanXUFQmqidsPkWqy7Vur`-GSBv}2!xH7`=UzOIl>{RI zn3jB+P$HBQurkRzwh|Jz(Qql~0O7Z{Z2?u{7?kMg=mG`?UV=pDrG6iww1{WNvb_lp`lfC%Nnh9sA#JtytEAc_ugj+#YI8)|3j8Mh& z+|BdHxe6%_pzZ7{b-+~!gmR+Z0V5|iF7CF_IcIR;tu^kde^{|tv>s&!jkkcCHK)nz zcO7CL_Vp>xFDwAmEiNq;0sDp?92_3Lm}s6xO-(I!8i5*z@rnvv@ZpC0h5wc&5pk-n zuHSY=QBe$FJ*@e`N-wd~KQtUNgo4X2crGi8zA^lR;FZ`(mjVHI-?s1{W3l%+8;N=u zmL+50R#Z&cLeFNeJ&bLWA&Jp=_Us*l7-JW7Vs0QXG&D57Cbqzycn1LPi@)!{PnII; zOOf3#D=8VEA!G$W>R0@=ygYt)*B%%U5ay;O$2rGJ&EIc`fQNkn?1Kj1H#r$zPyh|4 z#AWL>G~V}nIp*10S#)wxY*?uNqYpQ9znuGw&CKYZestZQGz8*c zHkr?^*lo{yE4k>IuoTYvue0sPuFIwnR8gi(8LZ;C)1yzo+8y>)Fc5BkR9kcNIvZEY&Z({T-KN=R$3kn9t$3t1WfHPz>V!G~3x6I6>&(0da zdNS$0dG&UDVgkW5VB78&f+TENa@+$p zKj$Q<95s*rf3pA}L+8HcrQuhBoKE-sjg78QKIoW_bxK{_5(&s;?k zFev_VTzU@9`B=3pJ3y?#{$dA$Z$EY2{slq|YV>TV&_~L_p=gvNif=U)MORlBAOH?0 z$Wp%@Awj_;=qDjh0Lj9vMt*Vv+5%?)N0LfJB(2u-1Rqqhjm_h?D?Okd5&iQ~e40E6 zLmG}9mufJpg{}^PKXx|yqak+2YWO=fCua=&oFmvtVBS_Zt>MEVtvQ_!167PrOJGs} zQcYAme}{B5#EsdZwO+Xot+UZ96<NBC(pZ>( zD=4U`^Mq?SJ#|BzOsgs$7kO{tDf&PoQSJ>s26^JWnyOLGZnv=8BkDgcziq$BU7H*o;O-S&YRM-WR;Ez)&o3 zscpcj1RNL9frd1imGnwRd-$ zf=c~2GIFq%m(B&8vF;#shXaoLj>viNQ_h%X+V{7v-y?tbS(kgsLz}*R9?;w@$;q%B zS(|?C@R7Or3WGnq!{E> zU>yKUl27E4+H8R;Kxhg$v9mGT;h^aNnE<}#51@&|nS}U2MTlMwA5=zKB|g=0AM^@p zdiv1lvWNC(TuzO#rd=3!Bj$E}l=3>DxZxH|!6j&JX+e0YB3Fm1K%6A^dffcBS&k2n zqcPj@^Bem-PS|X#{ew9@i{bMSr7tH(O+zETxpv4Vp-T@$4oM)2-+|I zeE{62K^fUkUsUGyq*FLi?B) z7;|SU>3IO}&+sb=aGi5`6m9{h*rUlACuAQyB(_it>-RY*oj z|4~rzH%&MhfDum~iVd@y~Mfyn*H!<{ft=ZJiv5XTSx1wJ#BB0U2G8rY3v)(BPtEw1ij zY6laIOvot&K@iZ;;*ye>Qz{^^>+I|V{gP2%Kf}V2#~=q@MmU`u=sOq}YVYZ3ee=86 zyUPG1h2CRi24uV+$+rV>DBa}MDIZxs(%McsnfkPy#q(rPH1KZw0d1!{b6p-oO!b{4cY z;XW8`^9}EB(>;2`2_jadg#r(cI$s}L3%&`8w0`(oxQQK5ezw1RRWx# z+hHv?3Xa<7timpAVXux#?6f%lpkVr zAuUbOz<>&bHx(Njj<2t8R(AFSGBP?|-uq!K;@~;M&Vts80i7u;+fMtqko|7$!&$!i zGi>Q>6{n?}^O1Tp2zn4E-oJmZ4BK!hgk*uj%$)VOqGMq}I1>aUSOw4#2I*kkh(jJv z>HpiBT@`^H0+clU#GhmN!IV9;qUHaArV{#w90j47m%W zWvHh4rlj}k)4i8MH7E4i@5d1gH`8-F)`DD|5mOuZ`g31jQMh>tH8m);zrf?pAU2Q# zPa{8HRylO-I!n*mbG03BSTPxO$0mrUFGb=D!=o5sOMLngEhlTGB%To18Hj>kKoV(N zuI-cob#!aHii0c+K^UMe zrDQrli3f*;(Skt%(G5^bs9d9Tb#?VXIynF)P9Y(Az)YZ|HRtQ!ptuW#o8#YW+>*3= zNUG#_GB?*cN)_ni`#iSUs<1Y&Yl#MWYTh7|89i8WTp(BBIx8I(Pyqo9K}Z-KH?H<> zZ*48*#Ov*!_W+k=Z^AqwOK5A+`bUg;g?3LYl~yLQZ)BuRRY~|!cv_v?Mk?RApBYyo zz+@J4-;yI98QEWm6#0Fxg&;Wswpt9)OW~W1J=Cz4KP9~)VXR#UPI7+k0pAq}+DAx8 z2c?IFg+%~}+e2P!(5$T+KHyBH@H_Z1spbZoyoA#ZEd)~8cl6{tZ=iFk_1`faH5ZLe@pqXELZ0(HR&Iu%Sa;CnPk z4%e6Nh#(toW64zi@L4N#V1xm1KTL{F9@@lRVm6p5f~5x3^mwG{ETgB?u&PE>MZ4uB z*wNEtc9ryGQrHz$+m*nK)+jiuAU`7#S9g#A*F)+WPPmk5afb<%xOk~WO8?LmFQ8id?)c20~+tt-HXRi!; zKL%zU6ckLa&X4qvBlnQxVgdd!!BjGmh7kDxVdYu#hGpo2!3)+|b8l}HxD4pq2gYrY znUG8lgoY;YMVrjT<^IU;Y7wtX1zL3~LGw^&B^2=zNa3V0W+xx7>~Q+5QsL z9}KI6U+hb%uM=t(glelrQ=DO;rzL*PR+La9;*mn$n!aL`!#`N=HinypX*7q+Q-{y& z`idqdUsfw#s(mUe{z)SC0OF$^=e7s_}TehXagDTfiP3* zsA6z!5kMTWO+f2P&`luj|K#;uNyT*tF&oliJb=_302~n+#Q1;fq|otV!#O`CciC+EekVoaPkrAZ!{nD{PEUA8@MEzbzTApe+K3U zKo9Q^ykN(`tnY|p2%AraBo0V+ZD<#GT1mMf+ivX7<%E&Vk;(}hmf{#H9M`duN51bp z4-_s?J`xUHlw)NMotJy(h2y0E?4&IU43RMQI!6A^BgItUzzF&eo@jeV2cB*VaB=hQ zq$lFj^`|ZD6>t>7h}oCR>#rUI?ta%QL6js!TZjM>flkPxQ|0|n$&u%*);Dwk!g_@c zurz+^%mKoKNdttDo9)Ep+#1>2-UYmP_PW!oRiyrtW|)2e&qf{V%gEVUJ%~kn;Mc&h ze!c!-G&Sh$Rof>QMZ8K*U8Pr{%Qk6CEG^Z@F;MVhy@X6=kqG2C!uW#V?WxP=J*Nzh z#s)Wt37Xql_Ws0ab6zc_OJF8k7)~vBS4cF4ga+UHmjek1)6(?zFUIg0}EoxBB?tN>bsrT9l>xwK}2qW8VOqRrKG-u)ydIEn&o9B|_|u zAKxVfmSaZbQ_vJ3^lJ_WUKsfpnVHdp@XUp%26i0`xJvf}ddLQ0$uT`O)YLvS&1HZp zDJWouuhn|tPNA+>2_!w&Q6U7o2HqI5@(|=1nDroW70fMQ26%o187@S`V5UfA>CE`A z?F}Ft7;^IN{iUuJ=b@33q$E;c`Uohks)9i?VAfFM+|blbv=GS{2y;XhCN^kM{nfQ6 zB9eQ(t{uQ6nl1$F-vQ`5U*zJzXjt=?d$;QLF_oB-f-*cehx9Joo{UV8y(XO;jfjYz zvJUI((qQPsedFVu|2s;CKk1Vlp)D_uB@;zq3jG!dG5rFZ!1K{-X^jY<={=zW5^uJG z0!JXLKx4)cuOC)4BArKIWn+_*mlp)P2C3E8{QQiJjCYXrHzo>B!!QNR zg~8Yx%&J+~+R_6K_k~#|wF1QQ+s`5)Mqv=0ZuCbv5kO!TBPJ-7Ruis}&|{G2K!YpX z4{|kkl~VZM*VotQy_n@*!jn@Vaq9lAo2;&$u83~_1^KPG3TwPuQ|89es1#Rsx zz}`oXq}uOhfH@9>1;amey80@7CZnVFcVnVCa@6tc6uit7MJ7!6w6A*N5%mf|4g z#rGA6&R|rob#tXH)OiC#XqPvioUus)Yhv^Cz-{a+yI-y)h+}**vhV_rz~(9)Hb``8 zR?*pXDxOJ6-GPcFRP(8_1=Aa{2|#gx<+!Q~)us1dJiW|nLJg_ec}T)SOnenw{Itgl}~R?<8p zffP5CT)Vou#(n-wUtj{?vU_quT%rj+o_+Nc__2LsV{-cXGu}kT=k?h(6BfE}tEVQb zoWp=Y;$=Y7BnH3C2>P#H`ay-nAP~<2m@Fu%Gv^CIKZYr zpdd7y!XOZs1q2A6oDh(cD?s-E=4}E^->+D)JlS|_MpJk&9TYPFB{-Z3O+R>(s~LqkSSj{=(c@US{O zDsvlX_MProw|;hVkC9B)E+ceDu#e#VXbfkDhNM7n03$c(&yq4)mQ+&0g*i(<5rl{W zhWh&T>*faQpeE{NidC_|{!`Yes*{Dx))AhDJJ;99tXk#o-TV{%F0T+$G%@jPwbL5w zsE&h3-yQ}^Vd#%Bb{UZ!QDm*=LsJme<9^5bF=5O}cLCjzTEI5ELurnn{& z1Rf3WPl+I$ke~dg)=A4n*FInq0Jz4AVPdEb;XCVxd~e&ef&t5|G;jtLJX2HWEevV2naT*|EYR(bOg4t=Y+(z zx!R!g^}*kaehVur*miImqF}%g&I1hA94sf>g&w^X0zG?q`_tT-jr{u* z@3~?#hN@-wY$s(sJgTCT2iHtm!ym|}@Z+heF|o0M&-Bvi?W)v6a6JHwNWa$gTwI*Z z*FJDN?&GtYZU!L^6b8Y`OU$Eo_h86URMmDGcdD9z!jw{KkOzF906R@Bt&m0oDsFC7 z!ruVp(1gCipe$II$D!TOHb77(3OX|<d z8jLm_jp&~+yTH^bf~CT1Fww*hLO$W_8v{Sw0R}cU(M45%>e)HxYTt>k2VW#>7mTw2 z^Cw4)zyLjfUbhS$o=&CX2MFAJa%gmC?m58#@G5dszwgG~@&7j=lgw*l1ZxPY7DzFG zI1ewcOXZlK+ix>yfAaUH3LI*j|2tJMT`^T<7FVfhNXiZj^4mh@x(=>eZj9N z5)dR_04O?Gt!>u4gOih3&%i8C(o5u^ui!hk^WKBWO+X!Rni$A`dk5 z70BNcX+M0h?n?cORd3L1tE;B+>W+eE={Jsg*24nf+NaWmNCP~*vzq|jj^~G#Fo_^x zZ~u!#k^)AqVm()S(-8!U20dRRYxQ9+!s&ytYdnJi@{0b*ea)*|l1H0BAz=5+w#I+^ zJ(n*0M`m}PBN3krPt@3!-z6X9QRurMA|eXG4q+>YHG=*Eeg#$>y#fvR%+UP}$|qo0 z3em_R8bFX!P%)Lj)PTHr?Rd@*ypJquJSy_Yy8-8X5D*wBsifzrdB0^O+N$8PVu9m{qNvh{`c=De*!RIpr4&>7waFL}h+g%UwP4+f>!4dMhRcH#B6!St85%=#Yb!nhLDNC6D7%+&^;8c+i9;DBa+^o4 zs@Ke{ko+Daj{zzK4TKvahR>{8G+vVFh54ZkjhgyJ;OE07f|U$odNj(myvZ+y;}UkO=_}BL+l)>?8Q6sBhZgp%Y?ONm3F8m>kcOCsP{XoV?F6FYY6?XYhN( zD9}%`Decbn$0xhGy+#qKpY>`z53SPu7Z#J0lr#z>l>lW>_k;cY?+XeLfc@|xkb*_n z4OXq<01%%2!^2Q{+`%CsFzyxwI|mjGjvHes?C1vtG859~q7X382O6hEV z(NT3Zbwbkm#8c%XBb-|QnSwvro?2X7jE;_`I-~(X4+Bkbph2HOE*TF*kqZ=RzZ|#C zI1ig%jV#RILlTJ?YebOUxDKau>rxd|ok5|E*n)f-WJf3uSUGVQm$Hh&){?^fugnEp zulZz#`Qq=EbVvfHW>8EqINludt5O-{p)Q9h10YBiFm(^UIrz4q)LHR~AYuR&GB`35 zRsACZ<;s(XNbTqt&skvgX1#?JdLfJv{Q^H!<=OcMkvhS3P7EG}eLBM-_A-H}Thcon zt?P-f@4apP=1o>XK`3N}Ft8j&C+GKM0URB~*aVCs0UX1CT#NTP&ujJBt%;|piijr| z0D_=SNg~n?fsUVc^6qgHP=Vk7>zM|U@GMeVw;jyoLT%3P&Naaia)9SFcob-#K23&( zhL}_b=;-L|%z_jHC{d#T|M9^S1!FiBMopa48G}b~x6vOzN`hR6yPZFSvjY*x3{5IISO2{FohQQg1UIej);rx%( zASNT!Jv;;<$;~M(eP#c}7N&G}lOkon{({H@Mp7m>&KsH#1G%1_5al4v7QFyaVb!lo z0VW1J2)RmD26dk)xCk)GFxQtU0m$R%$c32KcpGtOxSF*5H4CP7PDo=@V`ARZ$sy8= z$TW%qTRn#C+V}Pku-=QIoLyu`4m#_6Hbv|;o!lR>;)yMI(j+cA`Wb*2+zhf%nJ*ku z$m`>PdIKO*RrxrXb8vIV!F;pw>(|fS-777YhT^tikk9FjT&$#pMLs`{!U5gx;NT#L z9+*82f#(1aV{CZ&>5zNf4>O&-&;@Ha{c6)3f@X)ukaxm_ANY6(?yRNtV79*lF{TAl z3F;5HDBJuuW4+f9{fjo4KnESwOoQ)w$rn~2d(L=3qsuW4tki8k=*>;S??;SvuL9e0u-9=W23YWkHt+7cpkl+s67$# zofIs$Q`2zPa4y^aBEn9qt7fiWe<7EX56|X+gp;JfgMo7>UAjbhMEy z2Zcu7Pn{gn_ow>is{ZDr!u>AFt%TeT_>dJ0ba?2na=qYN1^jlK`#^j;hwB?`nFQs% z_a4`)fnXOK3+oqnlzN_9PoO`73HXW66-3U#Hd_-10DI5AVhW!t8xfHT-@hbexh{il zb{|EV197(MO|JW_nj9rNT)u;M9&J5obAaADS9h`96-qlZJS+`#`Z*H-)=Gag=`& z6Y2pr)p0ob=xI_b_~#0|6{6V63S%>ZVJC#?!NbF24-aysf1IYI_;}(wHe)Iz1j#Dm z!7j+l^U^#p?JZki6XNf(*F+0@9`AOt6e??LcR7#fwn0d%#QZ5MFAuGW1164*ngX%E zyyLDKw~V}ngVLn-Z)n)PR9a2#(}^_xtUIsf=OwMI`rSWwryrWb+4B+GiMpH;Dq7Zf z-T||IA(lyc0iR8{)}$Io!SI7%6h-I5aP{;N+h;c2=MZNiEJ?t!Nso;mDIV)z>Eh(r zzjIWi9gSNlrhA@W(F#w)kT?a*L0-U7GEAZ$(RquFaFh{`;(UD-3V_2_cWT65v)+L| zQC7KCHUW#LxBAUZND&4J9zTAJuvZb}7AcL(==$=^dZH`>wg#RKX@O@RI>9pi#To`3 zmA&*_0#{fP252BV_yWwdzh7m?o4H#CXsa2MSXO_&%ufA|_l_|vhR|L!LE7f}pS$cS zT$J>%s%kLlm6zg(36vmhxHz0E_bzNC!rz1umm?frrbi)?e(<62XhpDeHWU?*`puSt ziORPOMVjFjJe|N~EZ@9I8W5)_ah(1gsCZ_ zU{dZAQGwD7Xk3Yf9UT?i_n%(&=ol={70m>?{Xy{njn zg5vmNt>FDblNC$mPk%VDqGGyJ*=;J#lIQ~sqvh#C9;yBl+Cwj~tw(#;?CTqUicN{k zS)z}u@=iH){p24l+Iw$~3ah7z5(GwmC3G`hXEUe=d4mfHig_U|MvO~=vVGGIT3a0- z7t+(=@P5jC=j8Qqy^Vt2YekrYGO+I8?fj0OTy#geOb zYX(+bX*IgoBeMnh04^-QhiSAPLr3@g7KQaq4%2)kn`Odsa~xN?ZhDMt1#4vvh2aT` zuiw8Pu8P(2!S*3(B;=hKu-W9VD2Z;TKJ`5p32!k1=Mt~u??^{ouj9z3iE9jnD^t!% zWNbZaS24^fF%V+nMM&3H9?q!T%0G?w#PB)rjD9!wq@LP&&GVPEOk0majw~I1vk@n> zyBGLb>A~%z0jk;>QJPq9aGQ@e|9rK2W-{zix3|}uJWV;S+Mtx}86zSe*QcHQ)%(QL zkoSwIy|DMqv8s3ozsd55(t=f*_TcIB%FIUuApz!^s=t(88f{wUte2Eim)W*e?K?_e z_e${pJ|dW2;>JGfNqoipiSLI<;iaC_^;xGB+vEJD2R@ab)bHKa;=;3O{^HIhtj};D z;y9lq2RdxxwoY+qkY-G71+_lMVdUpvI^ zl)MbJ=ikJ$S$b=V^&xmeyc7E|8S8Q?(p5txg`KZRxAIa?_VxD(q7ccNyXz(=qy6pV zu8;Y9;}x})?pGxo)j%uBcvbeo$5w1%CfH*Z4ceJrG@4-Slg`B)o z*;=)vZ_fu)EvN=z{VJVQrMskdVRJumj zd)1?nMd$pwRoo-j&2zEx_=G$r4(|+}c<@^K>D#?2?~=AWtyOujBFbPhL2r=6^GY2L z9=v_5a$~nFj$VFMx0RJ=ly#J_Q|^W7k*Xe)`>n9>q_rwKXDXmH^B&4=JY{96!S;jIbapB$a*G!8M24=`xgL!=*ZbH_yEUa(#wk%{CoA2LHR^3#=$2VF zKCMQzuRcbz$s-67PA zdbGuzqa3*q_wKCqF~dwh8Sah1Q}UBUMO8-cOfLG35ff8|k++=H`o(lI)HlhRGT*#O zbWQW~(dL&cnDRu6IR8uf?#Z|XCT5{iE zoN(!t)=!&sj68OG*WmE|@0iNCmx5;cOGaOZ*9I?}PtA$B_sBU_1lYE8>AL5#AAds1 zN%_vdGB_vyZ}()wV)!lp8BV%7Yua&pfsDX|h0vnjvaQ)s5z|8%t?e2HxNIZO6 z<4#B$H>bH>(%}}LJucFC%K4VdJZ8r>y;G#b2CY#4Wz|Ybd`B&T368Lvq??8JXAg<5 zT;_?SBO?PRIoldO?mf-!FY+v-9@*?@FyH4szOK4FWB4Yspcx*dT^EYr5rgWC1K#y$CS@ZzwR?) zqWmA8&N3{ju3zJHNlQwXba%Iu)PMp40@BjmAt~M6Ez%*~U4noJ(jC&>aMrx3T%wZfqXH~AX&yJf-ciJs1QCy{6XJU!uFVV#4}IrXu#NtMjruisBhL+v`Xc_?k)( zj@Jppj6c|u6v^tmqc=cSheYK0;rK<1?LAe{ed@X9JparoK*4-O`SEE@AYAL?-=R_V z&Np0w9-R-rHnh<%+!N=;3>9w+@jz}E3G8<_`@`4>JV^&>RC#Pvk8%}XVRf}mEA6FN zzAD!_X8{r^TPa(S=GxuGPC09t2;Em-o>shF@BRb)tf$hFKhWk@oT4&B>&V^|L9wV$ z=sHG@k%8BMD{XVH#BWg7&ZfUsu>iSPie&Ffnb&}EgDNN&R01yd4Gb3=$Pg#Qp*l# zxw+s|-maF(HCAcPGw|n(Fo@LNZG0ZAJyZT+P}No&_rog3*wC={GBwJCO9$ekL5x+T zsUQ+T0s0h7;9@ZbR0PFGrgw31^0eXPSbm*~UrcpCrJF*h-P33I-h=58P9)o$X+zz* zLx#HBs_GHgui30&et&o@OE*xAMgE!`j^_Ipe`?lwLrgF9FyHsm)bECs@S-CHVn~Dp z8Ofi3qgotPy}!Fd3GE%r;II0oB!$vFG=u`Gw%`~Fl0LeS4j>KZPcnd1N*YH_BK1qG zsV)x1eJvJ(0+6Z)3M_yF=tJ(owg@OFsr5e1Kw!QE^nk-)c81qE!gjcf{e_R`ja9|P z12612i7z%Xz`k#p#=ieTNpToR>7Zsn_lKX)uK!|mrBj;)+SvrDFp$YGgIHw(oeW6! zrBziqJnfwZ;D?Hgj7pkO%>1$gU2zr{zC>jn}dOku&^#(A`c& zsvx&Av}Q1&+&UM!SCj~H6jVipH8l=X{OJ$3AU_d=9RLg$LT7f06?U#`ZOe<64FM3I ziBl;_6Zd;6jQnau1k{_CXu+wXUapgNIjB@nZV4XRWU9wz%bfQSId>a(ZR)8TX(%wL zjEJDgqT!>I*k$6>QiMVY%9=wS@Rl+ZrHcFcsvlYb+7EDC2L=Z6g9eQs|tUD7S zAefFDCIi$6Qt=7JEoWuQ!mgSmesUm1u->7oyiNd>JDQ+eApCx7)gQF^v$KwXpWB!E zOOH|hwFbjrH;d!re1kGDY9}ptp55^zgtCgkdDLD2yyMeBmY2Sc#~k3{;%!eF3f{CB zS7yE(n*{)AQP$&{SrCnMT?Vr>OGpwg*@#n3N&|7a|_0Sh8O zofe=~xO{91|CGDWY%1csBwNv+=?#t{;M4Ww>Byq5GRM!h%sJ8m6^qHsz72=}>MMnw zdfFX5w+|v|iPW7p1fb0hirlP>E1*RWF8+k==^d}hM_l93^^;%NS%-&XrB(E=qEzPj z^bJ@DjDcJT4zcWoOD4bCMu81Mz|QI6b0>H4j8LO9^{&Z)wHD%##e6ToiqV^OWG4>c`UvXz#Lj=GtEP74V79S)Sw=S)~ z*>|}$6LMpffHL6DAJx$zwtgxA#1FYBq^l3Hc{4T(X5h4$Fm`d$9CT>&P2wW= z?07rPP->va2<=$2L;y9Eu$>%MNLb&vAC&3|l*=-T6}-HJyAACj%ccjm$#lajhLH`b zCDSY)+nNf!!TSvZPRXhQPa9H(?cnTAr?Zf(J`jdUwJ0|O>Kbz`KX{J6?xFa&V_{%; z_`nm9MFE`#U=xgiHYEFE*;(%c6#}@DWgDe;NJvQm0?A7edi(GgeV{V581iYm`juzP zQs(|b#ZMGuL}VLP#a*@D-k;47M)ZRlhJh&tn?pn-91>nxDP0IjuxfvV-VDqwK2P5L z4HG%+1QGc9IjwBQ0@2_?7N();m7U0_R27H&pm?uXqn7%)^X?7_s0x9|SX=@E97Vo+ zu0jyVU#G98r3h;%g~-TrEj}dg2dQKRqI7pifMxmJ?{2aQG9Zm2X=g_PG!1CQZFW{y zQ_SxkDDEX9PeoT{?)f?#JUo%6f1!DfqoeifI@=Y{Kpf5%OV?qNU8_9bbMm2Ne_D2Sb_%z8N(($7G~3G&PRO+Dy8Xu-8IHxmY6A~d z;FnKzl&D>MFzr1Vq4G1{G&`$5v#dqQ1n3PlB=Hy|n*xn%m2sqP1tt&g7@;T4w9a&a zZ;CgR@*LZ-<21EgOq3ivn`Mg#dz4#EN8h6toZP*GY3QH2a(z|i&vD`AVsAf6Fhhw)R;@IE1P&XJ&Nk2D!T|T?lvGVRcsy>)+e;5><#GE!+%`E(f z1+%&1+HnW6yxPuU;!!$dH^R{xxHhGnTwG`jSH~*`xD(n;tQi?q+S!x_9pDpwo4(wh zLFRfmugsuHX(r|mVL3c>ndlX`xVq-%;24q*tDlS`ih)>o+e88Z8Wwz9;oH1QKYq{d zoB%>j!zX-gb>w0gb3W^pmLwUMsB6Qfajg+|3*xV)Iw1TW8l7$X&+X7ePy+jnF}$_s z28I+(?=o@j>?2S;mHwrWV=b}M(Z-EGX6BIgqey$3e6`VSjQ#$7*b`w%bs=S{DlQJk zdg0S-WtJ?-&dok^h}KknAS4`Q2@^%gHHP_xgu)@1;F4heV2`D-0sF`!!?*!^aVBUEtU0 z9B?28Ml2nM46$kEF1C~!h-4WN1@*2GSHYap5TRg4H4zX@hNIMxkoce0$kR(jBk!nB z>`Vyjh#>~`;(#5;mM`0+tVXQqw?H`3+2cP-DRh~seNqf7$1iZcB7m`{Fm+m$A*>GABKYse%weCo>wE>7Yj@o@IzGJ-)HcK6_ozRjGrY zc_aPG17i{3c&T(Z>{k`4c`W2}Z3!vnB}$l*JHM1_iHUy|FV7H(A4ZudTB)qjEwgbj z2prY6`g!3k`+ws<4XQ<`Buq(^@e``%&InmQpN_EbboPHpPnzoRzE!NS%fDy-QkB*ytg^3XmTh2`|kOChs@=eok8Pm7^?g93w#kE$aAQ7iNS5C73%vYVe0^ z1&SQU>$tE)Pjj;$MgdoEr~#VeAAM12xDHnTs`i$H=z^K)W!dgoQhpyjY)M-sPiKPg z`iA@WG0B%ea|!jO*9%1}H1c%QpvBE=`Hp5()P9B)YQ}0Rs*=cleXsY*FW=G7asFH3 z6AUI=9?iM{Hd-i7npf4_9Z#)g|Lw9{v=R2yM|!j%BQ+dM9Q%Vd9!o|dw{1lQ0aDjn zeniMCP0b)wO&M%o_OUThK6M*nCjK(al=mzm21VxuaiIWC*Wt@1}DrkHwP zx#QFR~=O zIT`GYdP~_&tix)ybmxoQ^;{UYW~Hs zKoS$JK8&D3JV*owGJ$rB2nyJIzsC#$PNSXT{;V-#l`!*EXuovUvFpSgc=HQCMMhcH z<(DLNEFICiW}YnJ(-0$fw{Lzv+R5+ml=tMQ=3PjU?v1WUcY{}_P>uzh7{@J-DYF{( zlt9Xzux0L$mau$&{rWBQ_}9D{txUnf<#GGJbL!gd$yl|g~3&Or#lD=yvp*@Du^E+y}rk;G4xRpRpeO?H_y3X=#Wz; zbgQdTIHD-NtG~9csn663-JrDJjhyg?H(dDAjHAWIlS(l7_PxDGp1s1rU97YOLksR3 zMxkWl7l=enT(30DI-9w9OXb1ZVDbxnAPjCceqgPudOb{3VNB%1V_I|~uHK*6?ZptD z5bPj~;G$Yl%0d|Frj@^#I<+{c2c&#AGf&t!(ngHkX}knw+jOWw1wp+=?C4+2V5IWY zmWe9CDFPfcd0x@shPkiMv8oKW4CJu@Ve#ecT+3n z;D=4(ft!&(dEiQmi+ZrhF!3YUL)eTo!G+8Xfqz1^rh1+luac4!lN&U>uOfMPDgMfeuS|MFpr`b$*(mh$p8Z~_dioEaK-E#O94j)#oe(91c38Y# zFfZ7y%a=^c6imkJeRFvRO?vuBoOF3wB4T1dJm3R2`Q>}y3!o!~4Q_xR1stC^loQ-V zq7ziA^SJw}>9jvsP3l>;dJ(mGx(GHuLf7*K8*t?BHssi^>&$7P0FIo_VZ`9Za>0NU zN)_qz@u_C2uzHJ5?4m^D=Td?ITRR^8lP{Hq+R|m3*dV*Iyu(b zX>fbDW|tD}RCQk^`i8z2)H;`s!KR)b|II%&v}99{(=(SxU_+9(=j-pYj^^j(8R-dn zdOK2hCP9)lqdP^O-HAg0Sky)RVtREoH--G9$7SZvA5SV_>xE*dcyP*P|Hy218kA^!apYXJ!dxw5uCl|OIDj~XT$G!YRim5pOzu9ZUs8R+a3`26Q z+jf$y%%Hzmu+NQQNvHkZ=Gc(&aH-{I@v?p&6uzzt8PB=LRI?Sr_(^gzN(M|i zQcUxo8b{aZ@275^1wTx#F~|!CT~u1~0_(hKTk!^#Jn zx)xx;G9{U~jctYnN-{-v3n>ZN_XohFOi&a&#t-A|^>MJ*htBa>oGlOV5RNJ0lz*ev z+-!vtP)W8LKG{Wq9ew@j)$+sE8DIK#`M05M&WjrhaIpe`MKoP2#ty(D32e%yXMX?2 zMnM1yC5UVS0#=snXmoUR{M*;<4=p0f89UT&VlPfHo40naEIw!(gSTvp?)!I?t^*=aN5FbA@RF%mGcqsvoNgN2`=NP6bQ&>~wu4d1zR!2Yj!>*=_?`eDsig zKKD!b^xm+YiF8zNF+VL+IH4aI6G z{Psey@PHF5t`@E^k}Db>93QYGltxn{)&V>pbP(5oVq= zMmJH~qyM>#t&lIWAvN%Nc%Syvuy_0WaN>@v|EL?r>K*|f7vXkb?&k_|Xm;Z27=Z=( z{d{2OLvLD{)PBR9cYk?Z*p=*&`>ps3)AJ8ihHg?E%->SAj5Y+Qj|B*s3>sCR^%?X# z`1VhK4q|U@Pv;mMpiPOvQRpE3z)9lIvJK5{^&WB4&v`x=u2>Pjx*bXTw$$<@WuD&Y z?f2Mx$Y}C+?|I3=ItY$&FhzqQ$1l@yA&Xr#&tYtq-LGZ%Zmmk%3pXi1%1L+3_gvRl zg~#^kiSV-*=LWZqVDp}{xFTv%{^&S=_LEPELtzD12}wPjAE%n4k@D`WPV@6*VkuwFfaO_ab`v-)f8PEGi<>y2UpA**Kxx1$@aiFBv9t z`DQ@-e;5QL&^zy*9EPtWnUxM1F(svdyF8MkgH?9}u^h+B!bh6zMU)Nuu@K z%!6Z-?j=Up%e}ioU+j@pUN^l$?nhRv**1m0x;IYAiWBOA6PjI^>oT+|{nHX$20e_D z^?U5XD?*vTld8AxaArUz&2awU$Wi%?Gx|wLX)ZiY^T_Al>KvamV`yLjW&vZcjV@iN z+c48~>tJNSxd^ec-scvnogvN9QC4&6JN1+e{+5$zjOM*FAMvZ!o?099wGXGlm;#@@ zJ;i@!+dX=;`5V)`_|+yig3TAfr$ua<3N!j;IVgY7K34Wj4{o;+Z?8VkhLF6(+lC>y z(TNnch zm6)D({3Vqo9N#(N+U)7Q_fJ$G;Cv&LwN3G~MgeroO~5t*EB(pMMvN2S44i|%Dc&K< z1CIg9&;c6uUCXPWLnb)IXkt<6T>FaC7L9;$JnLOv`BKiZ^jsKe2Bi2S80~1Khgk&D zS3Tj{irDeQsyOwhyKxld-ZC>c$9ixRc*Kax3R|sCdXVff@piGou832SrS?2$znK0O zK3U&_brvITA={I_*dF}^Gn_-r;hu93+TrO9!P(&bacqvXQ=vab{Kb5)0}eR@weIOC zro#U)COWZfZ`kiM(D@Dd->*bZS)z8hS9ZtG_kO>tUf@}p;1hSm{}5`N<<4{G!!kP= z_OC1!lYbw)*FMWcV}M)cxGAUad;M6TiIDX+Gh)? zsGD~=#@qI4<#;Lt5~K}@Wq(-&$9fT{CU46p3-8%U7(i;>H$*)J=Q4JH)Fi;XQve3`-pm=!3*->v6@BZwyEC)vNiC21(rcioa48BzIn} z5mDl4cf3KAlwi2iKvG!1u>4KJ-_-U&Es180r9LHkhj>0amPJuvLR;1BXaK@;9yLbI z@`JGGVqwdO*9La_b2r3_~QpoL_aim3u;<}4p6zA2p!tL zA~w(L28}9c3be5i=$hKqhF^A){|R7%2B#rNa{$6Hd!{T9W#_-!7x!*7a}zdL+3;sq z&%I2s7RQzm=Of_D?NOUU>!BZ1OL9o0^fyw9b(ILA;NqH0Q_#&U~`^8)8#Zwt9%6io|3>Y}+C+?V&^ks!c(>jkM%=RkB|Fu6tvB=q5b=NX&%Hxdh5#?lu(EDaXN%p~FUoApqPEb{_{Wly2#Ul%h$%Zz zU*u_`gmYJlOj3dhmr}RiA*KgftYzEQ2*XC-8&Hvg0xFLISR7Q>J9?4DZzQ{15cL~v z$tXaO5ZbEDxg9f~>+}-`QuLzhf{pjxbdDLQGK0nXyj0aEe8T6*Nzc9;QP1acM&$Ch zMz-$$OI49>-dVj~9rLVy(d)%t5bKTbjl3`kWX$dHM?+QbHATm5pcKW>7saP6x_>=1 zoZp>O8Vg5Vyk1lZBff}V?2>EkWZxH8W5QWd^Qn8tJw2?}pJGS^=>R2LjA(PmOUk*} zykX0p*ciLR&oogz#Ovu;Ro-_R-_WH|L59gBWE-7S&dAmyC;MiO&+u%tor=#H*#hkw zDxx8z>#1oo)CRU?V>%+YmMm^*A0zlt93qJyd1fOT29=W& zJUcI@Fx!6PuObUm(~jO(MG{9AvdRk{b^hW$CW**2 zxyj(yLS{Po`Dq7P{16!0@6NNI^~~7JY4*ID(rzaViR zb}BiunMUsoajZ@5yN^G??MzA;wCeY?<99P=F4nsdBR&o)D(x$-2XnAp%k5sk2jvCz zj77ia$87UKzXidk7VN)U7jLQRTKAPh+4;WG*h3pz#!L%fV$dye^N=1bU!)i#Y&K; zjds%*5$F+^iZS`y@YCRF%iwH_-7jswvNgKOdy5&ki?C4eSY`9}E3m_y0Bz%MRaHBK z;!ia@RO75i7qlh&Lot$! zZtMv3b6BvYDCD8BJd%4I7*6?J2yk8B5DQElH&VOaA8BLh!qu^iuo>f`P}1gdGv2Np zagcbybKi-9G>0*UMpv<9AjJV@on7HUl)=F17dSY_PxB3tTq8pR>M!J%fs_xJO(Wp3 z^Gt6cc%d^_)%!I6=aIni&k1(n4!>T;KJ$JwI@Ln#rKDxH2$zjFKz^Q zz7MxK&w|h=c)MH~*`N_9U_6|qbb>H`@Pch?6iD1|9bN+%g!^t@C`=OoQ43x#I<>DP zLgpHNPOFOl1o7FX)qodg$woQ^6XMSz&@Q_+V3sv5xsm{HarqcZ^oG7Z2&6z?AG|xl zc7i}}0enCx8P|T^vg9-_#ybe4p|_NEQLw~rD1n)-=eFdM%>~{A-`i0GXcK^d{o~km z=d(}8NiaJH@ZQTie8Hz<1X`-lLIlO*uAH9V<7KAnyd@M+49d%f>90W+TZyeX2n4y^ z(PBVw0Nn5ZoyS~Sk11;x(5V*nPKP?@AmN2-VpS`(i?EDE0CU;yEv7b^rpHhGT zL=5mr6CpsY)BGE~VGp9&XV6a`aQsvRx9~pI5ET>@fCM;hSRanoZYGPsM}lOCKjUvB z%W@h>&6S!zZ`SK$eeBiw?^9>U2-XLvCsR}jFMq{JH z>LMv$715gWzDSRD3w$j(rzTP>{B};Q+@k&)mau@>7lV-y^Z5YhL41BTdGeS0lwX*e zKF;D8&f%74X1u12v3dCHr;P(EC*#Z(TxRcC%{$ifZSy5>K=|3YoJI&HqkwvVv_oL1 zYY4Fl<3D&)g{xt79a@aSiJYTClRh>te-p{34FLcl;)=%9Q@Y`7VYPjMG!>w!Gifm3 zAXTJUzK5=c?|;pfT#pd#7BRwLe7EuK#2q|sb3wiuJX9P9h|N#_l~sYvAN1*iEoHyr za}8K{SxM#qinQplljUCqb2F9FAp@cvko&O{wxZv;q-K-;+QJ_q=fMnSyXUQ>*JC+$sQ(!-IO)YehzAPXS!#+O4h?v)5qj z2dzeemQFLky4+s-J-bcq@g$p`LWgKTr;mU}4k-6S?*m{*Zp@#9<9aQaNb+`?_EnVQ zvd7LTz%0X7PVF)q+@gU;#(q_2Hb~&(k+$0To@aoba6EVgOj3a^z=|IQbow^29G|Pd zK(ds|!`{hs!US}O^(HRxpg7N1*-X~8H3FSysO3ju*A@pnXM%Em3h1)~y2FK{O^#dL zMaDsD3)*1-sh)>k@dxQY$E!>mww;fzAWZ}CaXXc*M`zp6ff1`i zfWUox4U{|(?b5D4%hQgGj>e4{1Wyx?LpcZZwjjUzM!${wm1i-4l@fAWbb#_=cfKH# zuf{8}t;vbs;S8JL8tUm@-2Hx5E`qdy1bqQ`xTt!szl5&Q|Jz{04MPCNt6ZBnJv=&i zWV<)?7BU}^XD#a@Yp~R%J`@Md3I%!}{7@ucprHK_kQayjcdGEf#97N_xZ7afPm>tn%8)g5;LVC!=LNawm){;IPV#(5$!P3%a1u@ zmNF81@Fk8S)2#nt*)y>1g()56L}h=vPM~0vt8QM|#L|a-vS;WTU4osG zTdkmmjbxI9B+-4<2$or6*i}VCig)aJ`7TeMNwV#sX+t3@5o4(!KF|5|{^;u@IX&+8 z)gRH6F~@95FJpMZuX)ma%g$8WVa3iA={k?O+fTBPwtjVSTBO6Cel6({fcRx3nz^!GD3tQ$z7Dem(2! z05(}5V*PY8C=Lhsm~R010d!bE;PCum#^CYYM>~J$B`ino5d?+XmEz^x{VusVjqw{Z zc<46(iHtGh7|{U$=Mw-&h34m{`)2UU&0u^$*qZ{7iXr#8PzPgLx}hW)w$2B~r=#Pi zXb?Sd0#5?RX=Q!~fWO0ydULzDBQ^-YCc~f?t_pe^&@Ku1&z<^d6{x`m6l&0%Z3aot zCa|q;Iv*EC4VEGma{dZkD)c<}ilB`)a6?V_JvnuN)y^y`0vPUq90OiSLq}(L<>BYb z$$8FmW`=D`7(m?t#s-wT2huZC0ItUSe4+Z>2bibPiSeL32mnGq5efBTw<r3t*I zJHYuht%Z;XIPYSEZEV}=00{`f5FSQ>YR-!;{*F_Oalsu@==dG%07DLFRSJ~MpC7NC zMp+w8+8s^-y~}A*maxYA_RPVmheuiLswTz(d`3Xx90fO{y4Yo@>M1w|r&Rep?#nZx zBEZT0HkMKxrWZR!1*efO{z`f|zIS`j&dh&D%KdD=ULM8ZUz7ku1uW4EZ16L(NudDV zOfLZ0z<|dI!1v6bzxfP^i{NnzrGH_DdaQmyo&yAX&~1S({h2O*bPxh1$vt)eWrSE|O-*j4jTMyx0^wIr*tX>;%(*~gjCm6w? zqt+>3!(KqJ8xgY{Qi%UJKq7E{53+XWsSkgp8z{;e-<)~E8CJzYvP2;j9u?J)@Kc;3 zPT@g7*s#kDqq1KatJVlGx<{$kWpKqveQ3~Ew~@LK2)bY-=q0Ehq?>%jZ=yog2MNel zmz4dnznwMlQVCYF6oq8u1;#ZbW>`ryAVL=b*jP$`n+=XR!%EPTClbb9MEOv}qMfxl zaHb|-L@B=N4e!F^wMnLNZ(#0&4!EP9h_PPlcF*t$Zq4?TL;P{Mfv*h#5{eUgqTB4|WA88IzND-+@^+rGFnf}w9 z%)-`=ru^s6lS@zawS<=UbKA)*5>hhxe{F}^6tjtj zy7OBnC`RHLbJiHbobn=xED^0YwPR;tO@eW;b%F23MK@MQa+pl{WoWy|;whW)_J$hd zEb0$O*>?KkqS{9+jSx8Qms~YNo5p^G0`aefCW@I0hki+VBPSVOoD}w=1)Lt-Lcb%vB-KKy--3fF9;tl?9iV_$o!ylksu6neKAdD~stRFx&6QJP$O`jZkcR-hJ zcHh$%zK{C`z?Z=20`ml#K$7kD$gOh}EW}Og;mjgN&w@Ld@DkKkl+C9G1#HYtikPw7PK}A1)~Ae2i$q)j4!}y zt^?@AP*I_FlnowAjf4Rsk)SL2$=!Zc9oV-fLcorBQA0@Ip&#G-eqm)BEd3)8e~TDm zeU*3b@S)!t^yfE08<3U=ogkB2oOEz4>)0C#E?b4}iEsZjb|hV9B*t@(3&8tBlWKro zVzcn@Ppfl1-Kx?E6l=lL&`GZuqG(r<1)e3|8zd|L*$jX~H%p66k%Xe4Z5A3;8@B!` z6Zi6Y_I2c_7pS0m`XL8OE*ZAzFRj&#o+f+M;zt$uL{wHs_k(+xWDjt!_u0yps2Me2h5AGj16IPBQ?Z;U% zM$;h(K~{MOjd+bSn=+0IvE99`J+Y|xCd}30LydkztCMfV$X*+je{@-7q}OPnvKh-f zCFseb2o_j>e~Gw2ZRpb&w#(P#S<_w4bnO!qyVf$Y?@>oY(%`R&@=^jPwz`tLOBeoi zf$JLqJ(&gibU8*mNx@{>?`obule9y64v$UdI1SH+#!;?+D-hhJ#^mP3hMbY3`~4U+ zIqc>D|G16BeMs$&B)kZSo8=F;eSn=j=Fz?d?`GzCQ(Ggx7jF`;5Z&O!Vbi zQYEF`wY;s~>$HqXwRy#p&#L|_WOxsUQwYH7M~jy*^Z9q2H74-yv1E;*+NSLkL*Cw7 zDhT|r(xwhJmAKPCu@f;l_*7baGCbiQTszluw$VPLH|nJ^(wi{6KgOzKHae*psc}%9 zJMor_Awcvoz4?X+lrZWt&M8En9@hgm&+j9cqslZ2bcHmpJ0{oP%(f z@au8qX^7?NZY2t%oc%OJ!N$b;E%uT2t^>-eTf)ZdDg#m!&Un`wsjR{Q|CM^gza-qZ zo-a4>kWqC{?xkT}FJ4eXj0!|#(>{uOW-s2VAL$%sbmr#0#?wPJ5YjxAstncbC4!sB zO>_DKJK9M8T59SGhUn`IWLoCswFVy>-B@*{Q>`fMp9jxt^{b_iRP&*4kD5$G0?S#o z;kkMxl(f`~@S~o@LePVqU)yC?NSJIleBCtO%0Q!W>il=N=#ncU8=XL$*_~+k^9fs@ z=5?b1M30OcH5vZ-hw$4XMxwWL*xwVp+Y?d?t%U3thMDk>cPwM3ryMOcN<-fNbCqd+MRDEfY7x z-PRoI{|!mQ+077i4jn5TeTThSd_IyCUM+al%D8w>$KOu1ct4_)B2D6TAG3Z;r_@a4 zMiY{6ta)|;%kOMrt2E8mX4Nq)`198*ZtYHHskzLDhZ?dbdt3h(%&V7T3E~a11CwV$ zOMji-vgvaCi7_57`=A)LK**+CUhR_|bvu<5;IB#k6E7BRkm$bL$SiUl-7Z^yp2JOa z*ZD5{`j7N@3vVzr3mJPB6&}@IdxJ&TV|OY0u5fRhP$+^-5?=Ba%JN(C5<3DcyQ4xV^{faFr= zed`F-CH^zr35abV4&D#cYQ(0I(8Z33AB88)OM}{1|mY6$xsD-%|5mDxFZAf z4wDYGPJkK=^6x4_Syy-hV0#6|k!>xhsjIIAH%w7|uMYqTVh~IRWWno1k*6A<=9xpQ z+tYjHr;}T|r?C(K5g!5;@*rd1#T~X(WX2!}PC&Pa&=V_I2ZY2xZ$Dbs9D3#j@7yNf z7jHV{+>?Ux32x`(p-D6$VhvWPHc22ril*4G`Sr+ z;Lg8$;9~+IRhZvNm) zIfK?C6;J9olHrpQ+;^0yP)LyQk$6lHQZ145xBE0>5(BDvzNS+;I{Tw4VT+8PF7msC z-t@B#^OhjF9|zPN(}Wm0$atPRmFFhjbF_&m<=8}=IBNMva5uJ5Y7D93gtICZpljuO7{w9OnJ=THKl63d?0??JQ;1Q z4!vS?@p!#Rtp;jn8@XHmeLb!yjJ;OQ1Dc zk2>J9C>wnnVYvzSKN-2LB1=Lq#Dmx&Ax8VFvT<9M2FE7OLEQJLC~5JL@NiFwq_AU= zO5QhF?@V0DioO=aSoHWJDXaDiFFxoyDf!%c1yc$8b7Axk-`5s&%c`?Zy)+nQhp@fx z;3VI{kT2*Af30XbWKl52sWi;8O!$g7hPj6w7>;DM?qONKn^@hAlr%_~U6Ay9 z1%i)d;JD@!wj$iO7rgU+(SIg`)r?GLouSgZT{OxQ@Wni!MXo(bvMVcz&kDAsqj$IHDeUc^3 z8L24DQg2;kUddp5mC2>zaX80dOnn*hN^sfg{6@-fAwWVx0?j^`8YY4|K9{%RmFJj| zuo@wg4Z16Oy2$be1*PECmpeUIsl)6IS~c~Gj-!8%^!+{~Y9iK-E^%V7mQ*CAemcNR zed#<=Ul0GT=#3{tqRYrCrn^cmKekq&49#ru^?7%EfTHrT5SwT&{<9qx(#@pq%)>2p z!YEg!cmZ0l=N+tI~mtO^6PMBS$fj;Kkn3*4UV~LAniYU6R5k) z*}+QEDlO8k<*1V|jI0u>-rCxAu7#F>2dKddwuUejEBrNCi(IXF^ahhrL?u9-VVkwi+od$(Diq zJWa5@)c`{wO4HTcVGeozg=41Q3T|_Ro4VWTZHb`S7Qn_ZbiAeJkAmYPM?0O)ihthG z3*;s-G}ih`fAwndEEHJ#OrQe7-CM*!G#U3&A{`pIoQRBar-UVlI%4MDEiPnK?~i$# zHbfHoFHb&!7jh%<-hu?`bnN*0)$sQ$v`dP0@C;4q2u7$${tgPpiQ6iDXB^Fc;cjh* zc#mw2&M5ykJSNN3@lBSJaEMRF#;GWNkegg{f66TE#{-mey$l5lYQiyU93{r17I`lE ze`o7G*sD$EZW`E3)VO(y5x?>hmEiIAFtj^_s*GWo=R)*_s1f{aSfgM$ZS0ycueF^I1HQFvcPb_DIDyU~gJm)y6 zykno9uEXN)g6V;&ML}z#hE><&$nRt2bl`vWGKF^K;M$XSnD^GwHgYnfkEkw9mhRPt zBk!b)G*5W`z*iEGa7II~bG^Q}fV|temUFpwp182);DGFnSKghpYMl{S%Li#o)NR>v%IeGI)eW4qsRCw zFn&Bn!d0>xjcIYxD}V(kq-XAM8|TpaD} zU@~3EihHETB$T*Vt*cmEI7k^!K-!z_RUC4z>UIdopT`20Vf|?Br=G-A<~!BkE#w7E z(_=J>FDu=o$KDC>L|!^biYA(V7q#>gr^nw5)m3$Sq*p$7fP8FcHcKr}oxJvGv-e$d zvHsdj5dVVyUcmRWTYJg)nZxItA6>ef@-*6FOM_rQ*=Ux)&%v;1v?>F%8*j6=53CXO zu1P{LDHITa=oNwdXY&pG^9bY*cRhLiF?M?aS~MrO(r}19`&&N0;VWwoHtH4~?kKa~ zP<`XWf3s|>rZJyC`bUpz!hol6_T39)LhOI*h|&>xCx>|uC@-)a zkS;$q7t36E>G#Xqy5iq;d~mE;a7LSAT`i=FZFK&{N4w%C9gruPx=J#wKdvCzLhKCA z2A+*Y)3KjjpARX73vJ<6xe-=~gRIy1iis%#mNOBbAhC{G4 zqGq#9X0xtU{N}R^X0y(I*+j{DJ|@SP*9SuVjr$=EZHTeXslxBM+|m`EC0VYi55~BQ zh_{tFQd4eiD74F(j4Y2&bvc#|yKjzmu5d7rVC-5=4Q#VsC8cNder@6QCse?xicqu ziG)0Zb^K~0K=8FmB3X&*wZq-T@(xSJ=UNukVgteH!I8A!9hB7}1td9@kqoQ^WddbB zudGIE@j%mimmIfW+JPB^R~?nA<2<*UV@GH7Dx882iYwP^tEWiFF%<6JD%L!LoX5Uw z8P>Jdj>n!8f|ROH2SyUC2&A1k-KX~$i_5Z3;d18!9?3Zg=VCd+x&cgyPPF#c1;aJ1 zMo+Bd<`Pk`$K6eiffHobQd>tc{g?fL$SWZRs48hk8;R8CQ@zG^((uQ9_o!cf*sn>k zjIM*23hW1~j=PyE%x@1cvl;~e!3)Nr?DXJ#DXX^l{CR{C+tZc>t9~+biDhE@04mGE z-1JgBbphY@Z<7PfgxEa`Xg(bm!3-z215*f0=oX6JSUta0Q-tcTJL9_5hOl7hGe5{X zgQ?SCQXLau4vqs!c@)Z81p2m)X7dzxo-^Np=99xh0~n6Z@crwMrv?fY*s;?mDPBS_ z8Hzw>25N00le_x^8XCZ(_FQ*di$Xn}%J!>y-$C{aiPlu=eD+$Zu`^L25J=zJ+7djD zf7l1Smbd9VrCsN=x-lC9h#PU2GQ;f0=eJ-Ag5!~Aga`581}Nk=McA zDCUxH@B+Z7ZUzJ_58(B>14Ra^E`=1vj(X;SB9>Uv`+GELm(FWYlAC+?)al0Cxf>*| zikGRzKh;hwHtxBwW>7j7%43~MpT}k{cJ`E!!P8l1Xj8Bn zTie}uZK0G{k=#qnYsz!ucDya4*lV{k`yf|mHVQXccq3qLVy(W|?yv43aL#HOv(%^>}B`#g1Qb#*U!>}4jKyA|__Hu8S4 z!%Ci;fwU#~=YyZu!Q+8(BShFE8h2Uba_*)3xwU3qh3M{JvBOH3$
  • ;lrGM=y2xykGsXq$naP8D(Udsv z)sSvj+UCOGks`Vk+k1;1MaQY_7!T0arXp?X{y)+7*5zx|L!O$531jx4h;SH@BZaB@ z?1~a&l}SRPcso0bMW4j6A-cTqP)qS)WdBV1N&Ma2MVD>mT(zAp*B#URs`Z!2eI0z( z>kS-Nj!9dsC&{anyUEMJmjBoh(c$`S!FMR#;)s&Ak6j}#16SpxEJLW^;|9-;$x%wq zfoj1na`F9>YUyITz45INl&&tpQQ%T44o2D7;}ZC%Q4|<%M}HgTl@8spQT{b73M{u> zzKx=xbF4$*zlY^^UaijCR6}fpUD@I>E8tiR@JL>D)vbT*0og#>l#>}@QA`+uFv|9L zn{-$8&fcL$Tn8L7MhjR??WH$v$GP$FH@@cQE?lzOD%HR~YUR9SEl@wlXiZQ4KfUzh z*Fto_c&0YReuvvy9YrZ%Go5{hwGaAbEtYE?8>Q-C6qlvq*nb@5ci&-+gHfIrksn*z zzlX)pH#YwO;f9*IM-{&BI0&%q0vmAIW8)wBY3&C*j&B{nCJW*Bj!NGQ8DKJm9CG9$ zjSn!#e#am`7PJh0YhErdZ%EEg8Xo zOclCI02Z!u)Pm?x)4roa98fF~U$)xczh9X@yf5G=NxK99E9*Y37ti>sH-83jM#CWW z0KYwV(t>~f{y^+~lMXnKqZBUoPn;(@jW_v?e>;e=$2#Gy1p^9Wyll`!`Us;PmTv&k zbb5uNgC(VlWpW$^JjE1nTGrZfxfPFM{^;UwbNN>KOK0~B7fv$dWGuXUcid6wVgbZk zvyS`dB*y@}En#40&?;cmAHiDIJ;!KdrgBVGoQeYg<9s-DY~9C}*KmH}o2oD}?)$|k zr?=ynOa2Mbz*=+h+=wIh#bvxB2dD-5%H2l5R;VzB;W@6wsCIDj@H~ewt7^;sk!!JNtD>)F9YMhDu zF+TifT3(JR0Ny4eBL6S%Uhf*g^Qsc^dJjH45rOuXkCwLD$`G%!F8l!&(i>Ku+TyuU z>)tc)iy;eZL!3He=-zNOT;b$C|36vE0}QSO0x%|(_1`+I3dR@|q)}ciu(i6`wp?xCjHmhrG6ixqV zLIC`5Tn1KkQt;(rPV(KNzdiD;A*t|~?%?mg0^qQ4#vp)IO>OMiIUNSwKpO9T#W(Zv zNB%MMa(vj7mjI>KR@!--4ymviVc%h`1BY*HWBfMVp+~UZc2l7IwFYz9w*@nLVnclJC|RfQUe3`=ET zvp2IegpAA4!G9M+39+v{z3e*5!k}~2$dhg2JxE>dVdb;bWdZGNeHtfxCRPSaQCzaQ zb(K#}+9E4v&0Uw|J6XDD6>PF{4%74mp6>Nm>WZ=ZoPe9@B_Mh9>3713<`W1`M8u<+ zs9V}jEfK%05t{IahSQRWGgL7u%(a`xbA6lPZ-w;t13v%=A=m#%JnyU}1p7^#=rE`J zMX;EsqWad&9w9<*lP)ya5&`Pl&+8mnNt&1~C~53${-{qjr2Eq^_jLn81LN`0^Q1>u z(L~+A{TrJ)FF)-*BnW@;R?6ThX_$(is})RX>vI9vb`xG;Td^oyWukB!(cV2kuAQ2z z)>WE03pLxdsh>+tg#R_n)Js5dZrRc3oIbpQ7$@JXyo10&v*z@fUwa_FdRH@H(nfHY z(k#0&3|IL;Wo!42D3HgA(ih!u>HyHzM4R~@oqrOiz8_mjGTeRHxyKli46Wt3V}c|B_f=yBw9&2 zG(*W%QWExil^0TO-ubsoEs#EZ8hAn zV+D%0c*p-G*$xZ<96{cBC{YXi!JQ88y0wPh(W!KSer z8+%hj%G{pFP%y$?q1C$|4P!Q0FpeQ8+q zS|o|S_d+wpxJw(HVk0rrhj4jEylwC1X$*oHuBY3T=l7@gWk3q}Im4fMYG)6dl3E=C z*^&c2H?@7~+L4yaZQ8yy!ORsb6PlRy^Dg7h3x?bVI)D_UFIncmr$1`}P+!iJi%ID@ z_3ncTQ&sk8>4HmRQsrtQp(kcrD1LpI62!}@C#?Tu_0!W5iQZnr$8t@jnIS{0mbI!#Y%KLDP4zO)1b3>eJp5r@71~I-%&J!=ez5B@5T|uTds3GKUGNz z2EHoaJIDB$P3tiBL|x#J8xL*Z)G9iSToD*o6!w zYgZ)K#p!h|zMwuMpX`AbGAMYIQ)7M$VKYRF+O3D`<=_iunVpt95K8S_u?vGQNU@*i z6}US11UW0MXIHu==2mK>zJ86ANpo*3`T8E=(G?|icF-4qTlba%6AVJ6B4MH@-D;>B z$-|mt0|<20*JB<{D)@26oWC$K>vb%|Y>`Ycdl6j-v6h}OTOz@-RLY@SluunvdH zOV%gn6GcT9Y-~a+URm9R8iU$z>W@4Gh4?pNnsQJ`haVn9Egs2?cJ1I&&tZK3 z=6mThnu&=}yJCN2RnE#N>>wDukz(_ggn#>x;&F zb^*4xRkk2*KC9P?zWITs#?eZ6GO`7J7 znHuSg{DF!orU-tERVU0ALql@mq(+}D167-)?v=65h2FlJCAf!m*Qeh^;>IHOU9mSU zL$1XrL@s>^cZqKIwAW9%xCv+?c%pIv|UzB;uBdfNqIzcaO=ySe1WBli$$ zv$&lWqoIe7?25t{zBQMN4^o!8R4z}MEJBJB3xTx8&1~v}_{b(E@20z{Eq)>a@F8WQ;4Bvi}8#FeyuW zY0N#DRC_On`&QGDy46IX;YNUX_SS>xmiRO!cfM2AcZ;>FTOY4&neoG+?(}W8z9ZwI z!CCqboz|SZ_49J(4Si|)lYAB{USMis7Z?axyf$`;c9PbC)u}Iye3%O;``h+I{MgQ>@aGKCE$ zvzEFg-AYBewGn2Jaw&5I?1S|7deSTM8&NVLessyjldE64?F`$7vPHNB?|N312yK~Y zGoft1CUf3{^+oHcK_a84-y}XDAt!TKdjxxx6GC+|^1b0+*rZ^$ZSvP=A(iy7OMI9=P)l%gc)o(xuE9`lajag57E5jj#y`wM4BWf*48glJ|IhT886-6h#s_ zaC=1pPu?rA+ir5gXyHLdRV@?Sfyk&JLPZ8KLCDgK)0~b>2I!&V9}aP+Qfkao=0H zIlK0{$l2Pd&~5>II{;4^7dz^R-o!8P*i8~K?i`D*;lHxEU(D^;`?5pFV`d~^sV1X#N}0astb=r*P5C0WL41TncJIHJS6tOpqFxv3LmQnNsG*X zcn>76Y6FGOv8T9|tNd(97q0k@K%^`uX-*)tMBFCzue&ry@-q5HP-|{0@uzK<&L+T( z<|zYur`EEoqlGy4eYakFo?Ec@-P9x#;lLShl-763{V)Xo^Yh9@S-p7Ga6@3F8{65T zvPw*<%%MdJelBEONA9gp37p*tvD>0s_ zK2L!K>O$(#)!b5LE~cq(eS^roBSk zN48%^Frx?q=~n{jJL^wRd6d%eFBD5)dL~^@dy>n}{oJh^ft8kUZ9^HPHC70D( z(TF%%(8lX|^0gen@@#n_bA~G}E&U1foMBDjJzs9lLq_+Weq&vls5jMPM$cjEs-6?n zm((YH>bzDh&6~S=KF+Ol4a}{43H$n0+osHEgqVicbL7$Fi2ikjmMn7)_Emw0%dYZO z8GSVy^T9%Hb&NchboW-N^_;R8V%fp5vZY9x^E;)Ko|=6KqSY!W1+Bf|$UTs}Uy~oL zoA|3KdruW%62%C?HT7=2h=jr$a0FR?idqJ7QHr)Js|cG~riU|7%6#uKZ`JghWAD@9Q! zApYAAS*^>qC!xF3ABRErL4AqV9 zx%s&Q6)8qNx)T}f)Je+>_~~k_UMy-S_&C7JV(9z&^mBpqt727GrMR(glI9+u<;D>Y zR)+0&(t{G)1j5d;vBB07B~m+@HUUc)qUodHg-pm&lV2e-?;O0Vv!>uM!e}&_kZ>2J z)Y^Q3Dx$kS^^$w=Vwk!B*@%)(05UvgzE36tUvSH-=j#j|zFL_>2XuC=oT+#$bwjaR zgoZcFAe_g+y!;NGIhO#Oc6E9;lS@P+Da4F9AUohvu*-zD?I#~z;`R>-JOcdLFP z*Hmhbyn$`Z-1+cZnQw}wm-9VZTf%Q$ zE3clU_ZL-C}5a`3)qx|{4yw+7yBeSBZ)jq1~Jc;`+JrL65pYj!-JAHMCOoK z`C!CZJ1Hae)vWwYqMDHGP|7}ad4|@VofmYcZ}+&L;n{Otq*f;l772T+{=qUXXeU}4 zh8xPjy~4fkWSp1Ra@lv0m&p*RN<;Yc3L-oO7vxWq7OUUW&X8*?a;)T?<5Q#uyK_d1 zh@q-LZlA;S#W=ymVuoh!uJt$x&S+J|W-0pIn>bll_j(koKs5>0R>G4jl}nK>zprQ4 zfquD0Ud*9cihYZp&B7*dQ6#uQb9LK?&R(f$2^16-E|UaOl)rAp2|>sC$fFAECuG^= zgI#~GCv-3_8WqsPIF&{bO4f;pU*KMBs(D$sh!3))lkh&*@Zv{2%pwcKMFg1YGK>Fl z{V9^CPmmGMA4j_{kuPfrgQ-;a^vXZ1L)ZH3_cu;HuN5T9gc;FFp<*5=@i`>v#2nffx+6h%1?(O7*^`m^)6zq{7rCDs`JgCkO_jtp%W9rkoi+e`#eKpjv`ZPMj~&S2FMHC&z+0u1 zqo*U^*UY?(j0C!5q^XeSflsQ^F)~oE=jT_3hih73>628lRL=N}G;fB!!pys_;EYyB z?SvFF3jF0ENWVAFJgXC4MW|!_kSTC8LwTVr*o^O5iSmF&vGtwxi!D<@09XZyo# z(b*>o@Z{!wslc_!KQc?wv?VR`qb*UvSI!yEmpLxoPXwyJC6j>GjwL?g{wG3wVpWSQa0k9y^r@0?zNk`B^h6K zvC|axnu}b*9#P}i^r~(4nB^qgso%v%*qa-vD(wCmNi3(;s+W6_YG4EEEm^YGo)c1Z zD}&RejGHMxxT*PSJ1Qo#XnzaM(>;*T6t<&J6G~*jJuQ8&xdxF$WanKU2bE|>@JCG# zZ#EQUa5{5VRo^Yq4p)wI9^_#U&<_dO42hz$)S4qwWe-m547_GGw}vQ4Oe?G8X2K%4 z<${uf7NdFk<9Q|(O&;$akl zzz-b`hpEZOhVV3Uah)0`r!eJa()&3e0e(N>rszY_=a}qzu&Hy~5^weLHec5Mb$1L; zXJ^Zb2>(MY>W)G%AERl~J9OJM-o}K8Zq>WDg;(|AC~4G zSjARa4e>e-WTQ}N@#?G1%$vwaPifTamdB7OGPy5)>wt?0X~?>y+8OV;rXiie?-P77ZW*sjlJ5@4FNWgQ;Xm&$2Gh>Ry~)@7!x zVpg$Y>AM+&SEV(AM7&3nuQ{DhlPj>ya&)C{@thiJnIG2?*y3UbgsfjAe&QTKfd{}z z-lj;?>kfu>DlLhr4)Yhw<z;qv_p}w2A%D2rh&k9xE7mM*e~6;m z(w7sE5_I!f^UxK8Tvz&@3iCb91mt^s$9bcVBz)4DfqbH-a;v^Q?5_;Xj}N_8n5H-PWm>GeQ7Js!E}B}YaAAuoTACtz^%r7j9wlr2UJaZb57}n zej>Ls%qR(3`N~U@cOmr1(Z1$pTYcxrlB(W&>=th)h#Kpq1?l`m`yU|5q}ZCJe$NV1 zKarixli#77w_hH<>X59J?L`Ns`fNhKzwn?h3ykek-)f#ue?!%L&#gaMX|ctOnT4<0 zbm#Xw&xK6JuMK+IWvRLEl)v7X64~&0ef6dUvh&gwrd>s5UuaM(iS(j2o&D4L^nt`Q zY=VK(nr3#H$G$AqwEbd%m$eu~aQ2NSKmVSa801MA3#|{^j+Qr7KVBKk>OX^z&Y){|i z$&)y;RSqQC6tjALa$*L1AKU%qczyd|V0wOm2;7R;-Q9(zp1pzL5t{zsQIC*7zKlf> zuZ}}^Z%?|S_6sH_Sp^U@SRf07uJcl;U#Q&?%Xt z7fvyC{!3V~B(naG0TE_XtmAyJ{#Jx0)w?7HjvTtBYj2}Ea{DbZ>RayV2Q|QhsdQ_K z%MD)ey?(=B2TPfcNl$oUZzHz9(5<6)CDK&f(}lB|G@q~R5z|c{5}g72TEf)vS&_Eg zJKNNQ_MQ$-x0(26StK)Vf|*5VQ_V&bJo}|)*E|W?uCe3`_L?{2^IuWn@9zou=3}q<^6Q$$0;jh5qS7j5;3Q#|dkYIH5$a%j>e`W*~>KiiGaId#K1L?aVmG6J8RV zls2=uD{2aXG!qI!)X=N)0j?VLMsy9@@ZuT4ZlIM#2XEtBy>}lEh6R*&;M=*o6@ye# zj3RQy7WCz z74+S$j=|w^>~-D2na-U5*%BN{8-81R?b>O*%P*^Kw_j%LG!)c8T_4n|a$Ga5rgBH~ zl5%#Re7@6oN5r4^*2JeDIY`Hh+P8TrO4`xcvq+6QsKBNAIajr_q-zOMPg|x9D<#E| zs>-*RBpW&?^FuzcPEdd{!~{Vlg7?!%&x84Icb>8+R)TT%VreX91?u|eq(lVm z`iC)A?iF+P)H;tR5F|x%uvt_$U;ZofIuxSzyfxX#{_cZdc?smCw#W$aO+OuqJ(G%E zqYIl2&}SwdU+fxo>UbA(+dYZ_BUApg#4ir*&v8ZN)|J8^Ol)+(#G6weIDcYdAnl8* ztIyj6Z?c(k3H|ioJ7HPyS^Vs(FL%dTDFv03iYHtREu}!E4sBYAL8^I$Xu+kN(P?Su zbf}NH&8XB!biJEBvhYqS*S(wI5l`CX1PQW0BR%p^njWi&+=5jsbXmfaeu9QJP6mS9 zs({LK@hrVrEGrt4QhrQ+Fm=o_-m1K4uR9z2{&1#Rc7HXu2D$Lo+esGzm+DRju)aRT z#+p^c);h*e$R}#Ypgw~xiD-2Vn(#NVm#+MRUoeyFA($pgM1H#8l%I7A1N+lOSMW9Wx~!Mt;52-cLw*WX zzdcG(_Enc`-0ga3*-Ut2DpI+DneW8J2@oR0%t{p(lcn6^Ax(iL7<7Q;#Du-FI=h)%4C41>|>b?6*j zBX_@L4TD)cq<7Iy0ix1eAYY%%^?AQQNUu40%l@f7rg$$e1><|#y4SkJuV(UQ2C_bQ zcfaiM8g?*kM7`0HjZYETS9#|p>c!InLcbDA0(hfkoLv5D$Lk*lWMb$5u5dVIsd1F{ zhlA_O8eMoHY#+8OX`95F8H4WIO)Z*+Z+o8a^UP#ice=ceG`w9k^Rv@tNFCM?<8&)16Y^q(Bhj?EZ5s8i5{3A2_}Aw+?rpEv zDa+yK;hf(#lzmULD`W>rwXno>mH-~b2NeoI&#_hJBz8U+)XZaKZpaq+eg>`t!V*%7t z%jchJCe-_U)(~g5@bo-04Q!;;1T^K0>t0^Xg+i`yp-q#CXtiW8=Nb9|D)_DAjzSc~%eonlE6!4Wc0dNNBk7e)bm? zH}Oz?k|Gy)1d$#(`&Q62c=_rGA&}^+OqH@Ur+>u>*Uq9WnebdjoX=?`L@=YW9Pc%N5zB9;u>2+jA~w5~@;7DcspuWS!AFb>GG&){C0 z#k2VF<`<&do*H3_xuZ)Zj8n>Pl;~d#RiBN|q@)r+^z}S6>chDdE&4}!m2KBPEg4Av z-2E!t%U{?-j(d8bfIrJ8J*Af-b+PH`x`eWK6gCmWOn&i4MC$vT@%BjTdsf-g>PMoT zHwzMm> zX}(x3YQ5%y4{Z%h=Gz3S~ArQrILcLjTMLw z`uR9`bp5(K9a@?;+04S$N|=p%pMw@W!x=3vX*^<9=hiMDQL0awj|kp$|5!ch=vr?| zXOEUq)YR?ctt*lLyviP621y#`m@}0`?rcN?u8Ii7w56w&L)xFg|3l!Ai)PG5yXsHD z|1Vv*QT$Q0+m7>fPfzQfk0&R*%Dqf!lgHAvxGOrVBGX-oDK%#P#!M+Wr!4OCjA7e~ zU)n5YOSN0w%}*NsicNE{2Mj_rCxH>F({$ONH_P}fSYoe}&K~bG48-@^wG?`vE(sJG z&Z5;?k@k8!XJijL9^tXPS^H9MhvEH~47BLtMngBX@I)PGqQ*xkZeS zpF~%l+uB={V?&chaZS{8e2{_QW2#;5cg7Ue6}4}9TLVIyzNkb?Kt9Ff+Dl(>d<4XJ z=4@LWGM3ty{i(%#36xj+MY$1|`gH=j>T97y5^=3wX<t@iaBPgo}6m&7-h}%-b=qzuppJJCCc$Q>yWAd zPA3&TZHbE#PE^6vL3c_V+Yk(iHCyhfJwR4KY!6-CAMO;Bwzu`LAuA#wvv)i2(?zNa z?=G|O$Gl+sK2R;^DQ}9#O2&y5XujO!rr)Jz=EB|k^`+oB&W>u&1u*SCx1m{raM{*8 z$nA9ayo2%ioB4uhQ&0^%@2wCwd$E04zDRW4OhUM!p3>tkhx2fgv{{EcO_Ox?hO*$B z6-n3Z)WC_@9|_T^4AFW0{kF~=N7X~i5H*0Dk~Z(I!rjPqpG9Tipr%f)01kS;c9pXl z_$o1B*(r#!?nX|VlC1RU^8d^m<;q_go|@+#SOG?`9n#j%hK-L8w&nY2_qv{CGh%ei zd^oK)`28q*r!^BQ_V$r;AQQMS3NFGZ9^|Uy~F_JXt)|&10BS8%MR;1+C<75v_lOYV5uN9zycz z3P?@w4$qth5eD*_KCK4VSMNyf8ax$t8(ak8h*qaTjbb`vfD2T~B%{(>hhYzoXuk5c zQD8T4$Zim-OpC?_s^$$UowA^AkoaBS`mX^ch7hAO`wC4~3+bS2eUtCje$Cqf7 z1Ww`X(b|9Ncnx|6;xnHfg<@#QQH((s^M7#guU!UTvh!G-9fhT z+5FLSrENUE$&IHkigV)Pw?8WS*wvq<<7bpf>lGn&+?9|SDT#8=n{secLO*-f=+sk? zCEtBNgs=Ni_DvCuO&*Z`!k60q?&;+7+g`;m>(^REviBpuduSRdcodW6nCV(d2UWH} zUxzleW+?`~f%btX4!n?cAxqAw70wqwTpVXut>b=}xw>Ay!(r_Y90iYvS`>mIBRoF= z8ex)v>!|zpF6R_?qaTJxvNWnxtnLwlcZ_!mRZ!pqIM~~;^u@~98pZNrq;0?Oo8p3$ zBq9@jITLfrq#CY@V&tQ^*$AS^6iBtM05}6lA3252$V?a;y*xrNoNCOJeONw$*n6DF zY`-LWJidzTFpz*Wn%qk%!44E*O>x{NZ(fUd`pD`Y-nu49sP&lePad-G-;vmJ0Cfg^ zNHf-Vz%nYjSvu6stbbm0my+4BR;;-}8#2bQ)p`p-w%P_wC6>3ZskCny!d+|64UbOn z?9~c3vz~dE6w*4l(g2;~Z4~5#M!)uD>ys0=2o#ym)yLGu!dW}6~`4Uy&GUXU|XP)(o z=-yJ!DPsLMT2wJ#MKLogO1s7PVIr{Ct4TbuL6$-628#jH-Y>X24c=W#6WKi*P%Y|w zrjGyc{jc(S*TK4id_d+YSBlBa`)rWJcv7b4b?2!_)dptu#^6p#KuHaRsT;eAS{f&BRt}&t|Qe}8r*qVFK#ao9-JXs=Z+|!)A z61}4=GF8aV_2NOm(%!C$mRs%m_Lt8%cGcw%g?jl59*$a$NCzcET;m&b%Sj^}y6HTd zGS(?Oaq``K?IOa{j(oHKZ0}0Yw2SA*fTf+VZQh`hXV^ByvaKuG#wp|<$WU)bsSIyqs?2Ev zZ`lN{vP+#L-BY&TL zO&{<@z4b_y5l63))k!Xv@=gL}bMbxJkx-?o*)Yz{k$leVL;G1kV)gtk6PplN&+`Ut z{`tF+g~I$IA``7tlxt6C-rt>I=bssw3B%FS(v;?PcYU?gi$uMJCJZU5pq-G&cuRlh z;1U#WroEXyKPg2XlQ)tU))`M^0qkb98y0Q>yBSA>Y6c%jvIyD5%Z2^?fZ##OXfg@S zUUm`kD_G)FQ??zuAUHchjV6Ia{EdeoD=Xg;P-A#7Szfyum#s2lFEe5nK<8#KduP%| z(cFi=4rvzQi}2)&RHwO%GCCtkq|%Z@DA$^_>DTZWJNQ*(pT55&S6+q&*IhKTZNzNnFb-J?Y#n5sY3OXRJUJ5RV_zG3GLIh%bCw?AY>xzaCccf zMqf4*Mbr5*O5$!brhk1|$!EPeB-FUj8QGKlthdKC8A%@pc)PmNIhSNn(ESR*JN>E4 zB@V)`O7_|A_TYLv#rM6 zt9Bq94{ay2h`_68r}mr_&X%~L7>0W{eFW+)Y1yDqoGWApINMoyUQX}cR2`_4*onbR z+l=J2a@Voi1T#Vad8=GH+Yc#Ij9Q9)?S0y;$r?k|fC{-Sa+w09##FkbNTwa3+t#~% z_vxfysNoPL^A@9Yz$fLE`B=dB)p$4@k^>gyESFo03x5nShri1BJhgc*X>vr*%79<_ za)Q@s%$&dq$SyJeZT_R4X7M$e^$#E9388#KH!kYq0`M=H*O+>}q>PTD3{Q3=t5F%8 zHhmE%!^j%^DEVgS2PNPXR;AIW#Zi~$XNXtZgg(-6$L;W2n(gU(T42i2!}r}ZGV$E{ zUr~C9Qxb(nL&m^{cJ)pmb@5%cR1E5VnuAPpPA-y{Wg--`Be_5p8ZNMZ%}!5jelsMf zBrD2c@`>QS4Mf%bP&Y}rTkp~na8>Z%) z<;hN-yFEsTJLU-A00>1dxP~T57;S9?PglxXa2qU|a1+S^3#-~zSuwqwlQ|^cW56CO z%T^DiJUh4?88dJ1k?_UG)HUoY@BU8Oox0cO3H2Y4JSXZp>L6a8Ieox2kE7c|4=Dvv zgV{8Ma!kTq&v}&j2p>&gB)4~@1x3C*qndOxb7;gyxvAD}?O=L~&C8N@o9;dRsFEZq zzYEU{kgKV3%lDG^BF>cq|ce-cE;H@)pjk73)k$> zxvwcYnP((P7R_jFCzTNn(%p$${5{yXF8TKtJ^78cs z6|JJXSgOr;MVmKlWxdV{y2=jrq!ODCPvSVNH<5cv*{Ep^6FrNOfsBs`Z|fb=NUsoS znP9K!b$3idSFwo9vuH1_aqSw}e%8`AEy5epUOOVZtbn$Lt7$Q^kI}K>=JVfN%n*Lu zwv+89_#<`qCw5Y_PcLm0zEA$(iJ+VBt=J07Pe*vJmW}bNVh+W--)w7)@4>adFMNm1 zx`|{;eY2-6rvv&&5KdNx+WU7c^t*x_81m6m zaTgyg@((k8i;b$b?6Vl9HuB7MKU(OTmqB<rhPy2XcEMUE z(rfv^fV8MoO=f4>lJNlW_-@alg9A1)$ zD=6B{hlF@M9=y}2O3P@`T!)A&$@8d-aVGxR`S@Ahe&ToEh?dKTg|s%0WTs|bzgYKo zJ>5qb7rm>CM&Zg0j&Co$0fh0+#l6)Gqc{gx<&G{KJ|$B(WxK0OZ?|iz(-Dk}XejP4 z8B5~!TJ7QcyJ!R+diERSUNXZhrB@gYmsm` z*kTA>lP`6$pb$bnbUEX1t#F9v;;}=y}#eH(2mglonh5l<)p&t%tm{wQ_)?^nA|7i2VHZl&>i6 zp}bjzzpuX6^}DCfjKD`IEX-GUGD^9kqZS4RxM|1%ugrved#n^(88Nq56GAL5JUWn3 zgnm*|vek3jag4p^3rs&|&AucFI}o9M+nfzWyaQ|TCN#5&d+f2|Z=2yufS%(ZimtvN zY(0P56-D-ZIl3e`bh8k3Hs8{cja4spJH*pQyW-406A!pK23#P+Z4vvmK~!$yQA&sj380&oV|z&N<9idGF%Z^BYY`v$xtdm@#v-*#?#P((!$IAz zWkgi^ib^HHAt5n%crJn|t8)95%J`%W$R+4aL`6pRNt&n@51LqMSkKd#`%V}+#*5ao zGg_7>Dd5|YpA3YfzQ+3q!CJG&y^XbTJuN5UaDQ@(x|3Pu`*W$fM)2)R?)5S5c^44w z02Llx6vr1w!74I}T%DvxmSPqj|G0VZGp`MyNFB8Nb|0VyxVM27SlcBK)=I?bj%mU6$+L3Og2o6UFvA9 z^A?&2TuLm6J$yCuU@@|iKv6EoKFY)Q=p5*}N;z4}2QfVpXREnY*Lgi{ItmCRbFDwv zkS2L8L3DLRp4Z2FlAWAawjSA5(W|@Z#h(C9Bsjgz*=z!Xs{`i1e z)nrRaNi&57BTcmWf;k)PW`V!|k$a8qVg}vWdptbp<>)7AZ2pTp+;fK`jPH7c^Pma> zedVE>oV9Gk46I`Uxvop{>5(x?MV~&iCART`^PS$I8+l)L%ugwm`LqnG=cp=YVKbG= zieb5WBlecs+KTs`THCGLvGg$|1DDAC)!t`8s{Lio(#;R&-Xl4LUjk%&;Qe@Rv>z(e zfDtnQ9AFv1El63ku&8@xJ~PMB+eK5I{KB^K`b6qMfTvy4Dj4+WKX*dcL`3e9^=178Sj6N$pbj)H<;ioZtN4==xWH zuBR>^PXBo~?f>EJy`!4Unt<^YMMVWx5KyX0ldkj*Dxh=_q!*>P&^rXgLNC&#gOt#F zlaA7n0HFj3y%PvXhXlg+u>0=5E4tt7`F-d7{_&9H@ws#7PM^6mK8N>|CQ;Cw6Kh3K zZ8@@Sbs&pFg#Og#t^^n0XVRnaE@T3ym4K$ii^_^mEzK2qm|1D=OstxlfF0yTY#|?y zjWJl>E3y#Fs@x5Y$~_-RW$~RZ`5Bsk&;aDL!^a?sSM#6p@2i+k1CKXpP*SiiDnl(V z3m+mvH#Dq2)Q_vaCzM;Cav|C|Qov#b!FMFPseY{@Efm`4veaI=OVNOh1%bHC?~xa6 zq&~?QtYXbf>!Bi?@E(~%pw-~|6T?~G73TjajZeSnTg7QmsDw|QTl}Z1>!Zu6RG!=V z&8cyXUx1!b3h+!?_B5j~2n2&!95j$+K)~d+tVVGy;HD}Bt?!P97B-eUZj>KjIV@L& zTu0RrV%gx6p0t*)cDxYZ%4wu)zG+W*4K~cCFhXiyvhQ^uIu_LMu z(rj*NN*YJFv3Y%4Rmt(rcXHZhGhdFUk++|};X;_N_*fmAib{3=+&Mc0LP_EW(R_{CcKS^0%xlQ_R0q zrZhr8{#&q#fh~v^eRK?Jpt9&w+DXS!7CoVxih~N+;~=IR21INAUkoGXlBRF?OO?70 z%{!5Rz1eVsB|YC95ldme)dg$!y2s`XL>O|$)yGXc*WaN&mGBnlfTlir2#c_}?KDCf zYK56*J9Fyb;f83rr4J9z?}!_39_uBl39kmLYL%RfjH;1B=-V4u*iM2JAU_HQSoRfa>heXVFH z*BmfeIkSGa6ZJDG@j@c?fQef0d!Bf~h^K1P!}3fgiWxI*`Slitz*R}>Frxd^tdsk~ zr%x_TzpCQa>Xl;;!2n>Kuo9^-8=|Y!x@u`i<5>G4FN{Nx1&tt-UJ-}ewB@s|>&6Fd zR|k9**EF6R3$_nlLYC+Qs@_#=H&84K+0&@-g-u$%;QBjRBP!!My zU=-u;9?Z9^<#Z2pxZXX`M>0iO)SX`r=haK#O~THXADK^?$v*33?AVChN;>g`Ie>)r zzW`3o38%Rj)z3xfcl)VlR$3nlNZ|XaM@tR8%bbEIroUoxM|A(R75XZx-%gO#9cMkUbUWVaztavBrO#xFDk z5-&P!r4=m%O8(~|d|1MH(}=awI+>}>04;g54&{C!)&nW9cH%Luig{{`)vV()J-=j- z8FgYVlvO&_4+4ewB70&bB3kAE4KFt7)i5?4)3%Pb+zw>@LxvD)Aa)>&$hU|Jkc)(9 zOC=uWYiv>l^2OMeSdl+nI$0*NjiRkSADKll5JQu6a~}nLw{JOqKEYWP@?ykMBNvD< zjJb}zfn^nm)+c8_%NvS!sR9vBxl4|fuFIPrKOst5vA{mqaJy~%y@MN^21fC zhRq{Kj1bW8fg$gj{z-TF=0MofO!`_#SHqL`KxV$IQ+0zxTOz|RBGesMc0K@}TFTZg zx4sQvO00ysu$f!OJCc}S|LO&>k2p`}DU7et(8*nM>IKA-w@gWw{B9)+>{~%> zJ6MPjsV$WJWH;-osnLknM=kQBo|WBGyYDp2!dxVL8xXun&y1yk7&tGHW}(dCB>u_D z5x?gwUCe{I*sQ%t3WxmfjpUzt8^DD+Aal?x&yD*}CU#5^2YB85V61HQK{YW$Y~(}V z!XvL^Il}?HHnBs;&|BW+>|-hvxkaJR%0Kpa#v6mW<##2bM{BZxa1QG6bH6dfPx@f~ z06>ju>UichC)+=lK79yiA$yNaYsk5<4jJ|<#wp3Ewzh}#ME(`)`47i_;PL?`6_2$Q zS~q+ukU#$&%l-h&DenU@A^;n!NjyI!t08zhQwss0rCiXb%wK&Ky2i<1@l2bv8qL)5 z*;h$|^*~LX3-WLg(@8q9PtUZ=rcfbNs1SXgCLX=98C)q_IJuQj$&3Fz>3%1O{_~TM zH3`rrsK8~o?mOWHWQ4kcU`TbW9RzB#?V_K25sLYyAhL8MqRGT2ut=3t=eTdjQBt$% z%k8)uH@9B(W3SF1R_g=dELbPJKRk=NG7cp@aK{>9jrbGR4ZAA5NR_Sfi|m4IfG>xW zl-B(}D(3$1%5(v*Y_Ny&Y=6a_V(}>-!Z_PK1oZeR5S*!!8)^SO#{4rq3-tcL+i^m? z{raCW&A*;;tQU8>4HB3O$tkz$P2wE_f)pBJ>Y!DW`D!l~^wrI;P}oqe9>Fq%C_0t1 zoY5{~tZL6j(?utcXtAsrv83433>kU#wBdxp3$I@|8eV`df9R&!bVN3?w_PHAS7RWSx0^xY=@eD_{ZcJBfcA<6ZJk~IDwab8Xt zT^hxazAQv{C(dYiJ$EB9d0pDcX_t3kU|?dq(H8c+%6g(^H!nGPB~B3hvyl|!kD5UK z%A8OCzmFaH&8LP02=`la;Gci|lMw^rzxEyZW59cBt5ygm`e6jZ@4xtp-?O#P5vd~g zVE6Bz-AyisZQlMl0ncI~swrt~F8WgHlMA_braR?_mVHHk>HVqBIGbZsfwbTE(kFiM z0?kzFx~oaNkAFD9ZarDsxIJqWkFgOhwnQB+tgi+&uI&}( zkVPYsoqyKWKZFHnpgE@}B=y~3x?T3^?nlS9$L71Da&4yr%Lb!g(FQ@hRceqHa@Q_? zhdwjTa~wn4)f5{Cd0C0&>RuMmy--@lnA#K$d!0`p^E2bl{usT`H-ckrNSM-3``sXh zU6%XD+U{>E=yR0*Kvx%W-~0_w$Et~&R{8f+hoo3-;hG6`YX{h{C*u!Ch0wA#`y0nj zqdn$l=dGL$D(LIO`I-vriw$f3=X&DxS2H42C2G`CI<0=&^(tWKI#4?h+W5HSo{_ZC zUD}f#FR_aWuwPF!rMgEm?C-F`}}<#)^!zgCm~j((OWl#!zE!n zVw!rETHhbzpUCH~`3;Xb#0&S9*gpqIaK9C2P|m#5)^$5U`^EZa6;=WIq}0+D@3}@d zYu^3c#kI5KbC2#tS@u_y7RJynv53@ytH#|#@m3Oz$3HjtpBW7yjMKc6?{7W$=X$rl z!}~wTB3P}-M0r!v{Ru*G9@~)Eyv7vHse1Rvr{zznWMVkPGU@Aw$9RqFLq&|31^KKv4BAH|>lfi-5m>owmq z)*rhdS3DJywI;zgzvJ^C5cbEfKDk$dz5W0ViJQjt6Re#(1+6Kl?fox-R@s#O2WS@h z1n8egdU6E-G~ZgIYrg~<@@lZ>AE04}09kzYgv3)fF94v`S+@EA3TQWpeXjR>{Jh3w zXwSBM<44SxaRZ8h92NNBmj-~fJx$%xRQ(`g9L$b8@e>BaX9O7Ri!SHIzh)u--`JsP z1W%Pa=t>zRc;X$^_-dQ&FKtxTXX_8jEvX~eIWtE6R1$E!LHuB;tu1xPAXW(_6|3C8WpNQ#u zaVkAVa1E>f`R$M5>(5^YAEl14GB2;(;|YA<{7eN5){EDzKk4}GM`WBKDjz(CVzgCUtf(=^Hh(iTWuSn@$MhIn z#{pTVl#>EKFrsq3k!;CZRJ|A)8VJ{8$&uYbY^XEv5f za6v7FNq4r}d?k(F>Hcls`)Rn9_Q~(R9Mg2C)Ny1|@rE1QC)Q4bXFD(q+}9ZOX>ib5 zIYn_-A>Ok3d`v%J{{A}7dUe>VbN?#E|NBD*F07MDfk6P5_ySU5`pr5n>~8%^KP3sK zuq9D7DLDA$?Nl6ed`_9dg#VAj+kzVbx}5XIzxk8-nrBGhKApJzmEO04H$JS_F>g3~ zv-5g%y@c8zZQouq|9tg;Q!!F?QQq~^Z)XYcrm4>XKTXP)pQzch+s+n8FIv!Cxqwsw zH0tQ^r<73@TDGz9`Q60cA}3PbA9^E1$v5-ygYy0DSh?(+d#(gfGsJ>c4$_kqAhhTa2y*K|kTv zzgPM%0sQQ*|DWMdWt~>7yCn28NKU`wxemw!hOd+t{%y-=xalV!&GZ0G5x+lXZ}Q7$ z@dGsC@)u@#1g>2A!LsmqaGgIj5A&svHa2GB6UPHPUoA|7r`h`F^#=R>p*`>Fg1l|w zg-Qt3uLy4dzNVJzFD^z0I24?9J$x8$RO>=9>Ha6d$y%10GacXjp7V!}Y~Xf{^^%!O zcTEe(KkZ=pnE?D@GXQO~itf@+xTz!uLYMhQ_tSj5lcUt|pLg<^P`Q+?NlrJQP%C{udR8*@} z!rBX3;>rUKL|9rqj?FjK#Mp9%WF;=bdAe5aEBsVOAk~0Vt*Pg}cGgYs={gl)pP=fB zBzDe)R=vx^E2ApUat6|?o}HBx;>TC`sVHQ-*V~_VC%4O5Er!xxLKc7(&>P1u1&wUM zoqOCJWVWVj_mbKtV#{8B+Z21+%kyMu@VQ}YoUnG5=Xz&i;l-50gq@?!#{y3IKO??D zwWb2p3_tk~ED;>{s^{Zf?8n^Q-ZiG@=^3=&@6u=Nay6H8L?|KlVV38nfVtStcxi-+ zmCTLu{GV$`)1E4}K)7(~Y~WYQ$>bmK{)#1n<5`NmezzD}_iVoRz7!+w?@7QOvS zd=#LpD0#`p-!BKuE7u2<2~gNdfqEJzVqE@uE$sWlRkHMHfzm`n6{l#~guIjzH4G&D zT!c)_4Y~zjhjD(bCM&5w+dr`4;pvJtlCnRpn5tElecwaoSl?3WjB|aTZo2X*2H{N? z8G2B_&lZ{Yktm@uL=Jl()@nYL&3xRC_mjQ-hC}@Dw&-<*2#2!<)r-4$fH~%nRFup- zyCxj5ZL${?o?Sxt{X3dm@f_?7OBTEHgJMB_rHtpmjcnmMPI0tz~>N zC-q|QhwByISGt)75!-Z|Q%pH>thny?W-jG^#=9>dRQ!ziT%&iv&g2m)(J(W)md!MY z!=7E=FkY|2@Aeg3);xs;$?I`O06EK>dk`~P~u)3mT6?5!KM9lSWxD zFpu=O+U+u(riizZIe$8(FFsyf4l!*^tS!bgpRXNmu=%KMi~V!WjzS87!ns5)WBYba zS=G^s)#%RbQyw5b4Gjdi4hfv`mwfQYgf5cwgj}kO*U(kTNh2Lay(6K)lK?V(!Y?oL5sXgQC5q8h+oC2ed0}>| zLNbW={?zoTITt8RW9W0`VkK)c5h^S9B~qCBMy5nb3?Xyn1AqOYT^(&e%S@UZTt zM)s~8j&h;|@T-<3N`?~6+NV~F&{^oXa-wU(L8sTIhhFp`3t`vvfOm8=BlaE^O?K5O zQe~>N-EP(qN?l0dx=gfLtDXTj)TX2>@BphBBy2DOVv?&JNo<31h;Tpjx>dc=9^tvy zHUst}*ay$boUD_lfqGT*xFt)o1W4cU8(XH?W2eH z%$2{4n0N#Yl$0#@e4X8QSuIg}YQG|8G+n|%w|j&#c1amIz9|AP@I`p!B}X*3v(xDH zvcH|UNOQRR?XZP+LR-rjlcASa6^Qusjvu5lQ0rEo*Q#AFQPi(*^sT?!@i52Ag+#24 zcT|>0i1N-RU!BT$j>FEl>B^_tS;)oRPu~JX$*3cm>ERvIbB&5c>Yl$jpAu%9vbcU1 z&#HAc|LhuV$;cFs@&c)JSCzFk_d?7XFZG|S^Wi;CeD=qsV=8%70ds5k*tVMN6d?_2 z%yr^z`E)|18S6z!A1AbYZL6g2 z4!*EmG=t%Ith&B=>vZH;FVd9z<;%(py}Bn6y%WrxB&3-eWlr2LKOXmO$OJJMN{}#4 zGDD--xJwMKN?YwyZ&rX#MOXu(muN+wxI(rDnp6|wShqwbfQan2iHc(2cdR?9f|br) zvacKsq;F|4v5F0iNeYu=Y{_p#J`Hc)U8!9Es%sR%Fsnf%UtOJ)uzG02&bg#ia)B@* zI_o5PBgo|W+k_Y$7rsfAc5EZjlw(m{rGh1-MT;lkNqb*SueZ#}d9m2ZMsI1lp;$Pt z#*ADtTjZpNbg@jNFn1hXR4UOzjQcvTe6S`9Y6A`2INO;?6fI@3#yHv`_QMYrF;5Xb zSGk*)7-1fBop(DYts*1B)`O`s!B~O##4T(`r2=cX0)Od}rUAkwK?wSA8H|QN5yi_XX%JOcUZV zQEZH|OAlOvgus|=hR06m>$@|vbPSytv_+wE3V2l&PO`5*9Q8L^F3PZqBs^JyYy<#b zZ>*NfMeKcnMh+h<;@L2NqE0y`T{)U(3-4r77OHC4j8t znoMCho1>To{O)~l(#-r~9Xed?0G#JY8ip-(sTvgCSqg}w7{!a6=Y-gs(dA%8nA(qLMzfgf)RTs9Ci$=`OG9YWG|LzK4WI$9q5`@ zoMI`F-!j$7$w+ww5}I_dbHAAI2yu0Yagj2stdmmPY}*$6$aFt$CoPY8ugCfKDNp^l zqa1C?*=XIAHLUQdAJ0STA1$0bM>*3*ObA6G8?2@ZD}>M?+=*^0@c;F@OcN7ueo{ZaRwno9}`nx?@h$6v2a$4 z=GK06def`J_$Ex3Nw1eB{#8jdoinACxq>KckT8Unxtq+|m zEl-8E51N>NEAWV{7AJkst!YP8=TIYv{|QxJ#a=Au2f^b#E(ixlU!ujI$HMSd++B(! zyzD4+IFevZ%OiPb=VZFlfe)GdJ7cf(-B;o&SAi`5U~FvU*>LxZr+{>bTlL&kJrY!HZ$;goH; z=Ea&Bd>w+F6wx3L_2XQy1vZB@J7Nb{H{tdt&V7-_fVigd(RG=qSi2D2lx5Z!M`!m7pgCb$oAnv_wN?!y*EkPbBPUy@ zqGBdOhvOb*ErJvZytIb(u>`FY9S9RH&-{~PjPX{8}E;oDW-x)Mv}KiCxjw*h;tcNrMBgZRD#prk$XJ@e1CF2Nj;@WAH4%-C zC_EhF!}SSZ>0=mp;yD%PWm)nS%28fXNd-x{d6F&~?y!!E-x_Nfn*aKzmDLw^Yfyp& zyzk`8kn1@_^!ypNiEtem|A~|WVAIt83sOf?Y5fXOn`Z;2_xo`_mt|*|DoSUODmaos zX}%YvsU;T@{courC2xL$@HFfm3D-3n^WBhFxHb7QRolJjB;L^rxj4hDZL1|jpC(Oz z{sir%03PgWIu>JlpBP~-Qmt??`QT0Yx|`1AI+)K>gj0YXaeo(9KU?5_*Oc@+MZ$on z?9t>_ePr(hP{~H`!Mb@db3aQZb2~D~FVE^u!%#C(LVaUKaw9FotDNPHqOJkxFT% zF@#WdO3GuI$LpSUtD*R`Vh%@FS*dKQl(QqyT=tlmJhxkTq=uM6Puj*|$2u-8-l|^V zLindQ3^9|lGQy??bn1~WwCMUr@2Utgc4nbdSl6gUD;47B%?-Y~qu7`lOuGFecc?rD zgYujdY80vj6a*hg!8*CLQmoT=heao(5T;1qy?%6>XgLRn<1rXLYl2aLYg6#m_YBQt zK|o`DDb;avW_J}9Z}L`t7&e825|jA`_mzkgS#u&!GQwVV3MMmb`O`(%$)lu;Oe0() z&d0P2#3ar`BHn; zc@(4J&y$5w77sAU~ySgweKF$fC=4kq31aR+{j&(53C-)rZxC z)gV@avNm7GLHB$Zlj_nrLSinYx0i8TmY=nQg^e1K=TRthNLTesBw0NRW7sg--{uMs z5%iA=rJG=gY%&ZGUlC2(AxmPIx(43NBz_C)4$Uh=8bw%hw%?TL>vtFNa%LShtPW*3 zm+4Cs?3v`&9y+H@6h+JOH^XjI6eK+~p zad->hxa6jmN3J?jN@ABD>ew-LcO`WHx7WGEO2p z4nDI|1>Y5XckHzv%r`vx_eqkiWTuRMqbdx86-!2cO1b|y5I!c?zj^49#MNm+SNK5# z_QRGX20&iT^I&~#=T0Pxb+oF)v&SWyJ5x@2daq@yy59t$A|@9ZF)bznOwW&;IOchz z3-lt?$)3-wvU1EAR6%03pA`wyb()gJB6QFjCz@WbyLM3Bla1rPIniOULxcJ!_FlCb z&~ow>6c8W@q;MxdyZ5?p9SUvlb3BW5Uw00-eI1y5@**6}u$5I++Zd&8d^M?tu2A#* z15K7Py=u1IhQ?Jc)orH4YA??UQM9wZoDMOiFA|1Fpx_5}HV`1QlMnPa7g9`tQwipE zD0vi6jI~RPY0<=Yyw9OPK8a+s`be_=iD?cmdoQN_kS*I5YNkM=_?!p$G)c?nJQyS(9ucr4O-cMN?UthJi$l|Cl58#gI7d!P(P2HDOZ zIAvATxr0Z$^9Ixoop`;(PL6h${OT&vUnKQ(-H)z=c0C??A^e>y8yl#XDu(VXz$MGe zj-58@D*M5URk9m2{WgP#wbvCYy;Zf3^`iN*I-FMswh%R;*qda`;aMbPG)&6JsgWGaj*c>fLiT6S6>1C@c{?zNaa$TA>kOyPTM;^=b>(Q(; zm5OBMNzAautm*YhBxJV|4a>QTeW9pFqg<9d&jY(6HcIha*IC!X*qB;*&fauQX>YU3 zd|C6gO574jTxuj#vqR+B!J(NpRg%|7`Y5Z2T6L_J)iBg8dOaA)xI9ml@sLII_c9FR zd}}x(B!xMJU~tg7B(GbfcjRe}{%3RU>%3zc-i8iQJoqodZgHOKJP?6yx!Krih= zLXg77^?X?l!YU|=1+p(rLDiu|Hu_|O4MI~uBii=uQT9fE5%K}=uyzT@$mf*ZiqWgJ zy~`fhfS`@z(aFrll@M$nas8$4_s6wT))&`XRLyyVQrak8Vt4XfZ^5~asnUX-R~Yt^ z?x>b*RgKfuX~(2garGcq zoo-}@E#LlVVdm$LfBktsW3EUPg10m&SptV>pE0=>#m4*U%F{^ zZQEa5j0+EacN}gYKaU0m-QGXDirz5?{9|VircOxYKH|nxr8-IhH3aT;eJh z>B{Q!WSg5ipxycb^&lO!Y>gcVbPrGJ*`}(XIY1bWjP*i1>#pdV0~wT4W!fM7BFta` z4+I?)+$$*nxs{J_mR&26ppswj+!2T}OV1%)aR;SGM)LEDL`*#4N%WjEHJO&~{b1NG zzyIl_6ji;lvzhRUQCgZ&iWO0B--Ck2G$R@?k38dz%=XAh4v~wxnKwvmjjq(S>a-k}p}VT!Y4)*;7|5>+ zGyYIK*(Z#HzO-GNX=%Ql_`B)r@AF>w!!Oi1aV#cTTd8@KKGjQSR_70pb`w#HfB#_6 zhoe&8YEePpSO6p#BojWnU&K~g(UPSK5`8Pm9G5-PakNh6h~pmx6*bsTBqVV`4!f^b z7h-a(hO((qH;332JRD!k@PrZ@UW~T54{LY5vb=G3Bf9V?C$Y^zZr9#Qesa`$fsu%+ zAZc^26Hk6DpdEvDx=8)RblSW`(}i34$rpKKP%76=MdVkZ6({*w0Mp={!FE-_hwTxr z+~JQ+XK#&}baFGvca_x4#5p_0|JF|A1d|pXeM=$xv426XRB_!(NUI2ik{mHv?{9#$ zQ@IaHP}yZI+f9nbB*xQa;4wk>;j&wo2opt)cT0px!L)r}apLJ*g61&2NPJYyO=k2v z(%h0;{98yE=5y%hTQ#&%UDe2iQ>476^nVGzvF$(lurpwNN zv!Eay5007XYc3Ezm|dk_%V;Vc5X@{M&Al$3I)lsYW*XqQyXvOFtt0TFc@-fTpn>IE zut*Hx;8+ovTdHRVi@B9s&N&S&M~MFZp#!JNN&awLLev{dXA}G)M!!eD`CPvOg!UitJD&~xoNB`}@+rH%X?g&TtQSCkLZSQ1n~74k z)oyw2z+Mdb#}e040UbTO4PCv9FmCdynyeylFDbG71`gegmNDNdLXH+KUosX=TfwRh z%k|y52z9tC$tQUB;6c(}99jTdAkb6CBkLrx)7&D!*5=)sOfxfEpvm58YD1u|UMEV5 z$3sH+n(8zbd)X%gI>TAI;eI{IIClquDGQxT!X)OSTT6lVY7H}_)=<*FgOU2_8(=4f z2OF<1)jXB^GamWxag;NgJq`PkeuLNF&LKOY-|X4%6zFep-YC#|Ez@0?(y@X-C9-W6 z5cH3VT=ZK#ZW#K~J&K#)mN^T9Fn4tpk8Wvgkq^1O8z@nhq;-T?CBY}Pd3b4{ zDK4qXs;+rkrX$kzTg5v%4mO6H-5sRKoLDkPB zfGP;?HymqJnt)BUhKDbOjk!B;10m<9=ZI9uw@M$FUHJVbY4#Y!yBTWDANKL%L5jb) zhI=su?Eu)&&?kW&K%A$3YdSlW5@Dqu&@+CgC=7eayRa%Y%XhVYmxGYQlx#fb5RY^e z^o}|`g7c_>C;l3jl3A>8jthg0TE9m6Q5`&`FgtA1Y^}hZbECLKe!nX(9S2y}jNu#~ z&&^0z164={SC;Usz@(;&u=@mw{em(@LuY6%+-|~+Pz3#wyw+?j(Je76EM~|;C0!(S zlP~0CprhAus7tBDD0GfH>mqUkzCdw)wUrbUSegC^e|>mY`@B<9jc1*2-e#LP*ts7H z5vlzc7G|kKp;6TuW`3b@49zM&HAWo!l`)7J`wJl$xUN@20#i|VwUP|XL23r z9WxQ4fyE{g(AIS|Qu)OWt4;<20V1tXTCA_jJP>4K-%lQ@)3sEaDbURNBzu0gWi1(z zW>B`ypQP2j*y(ul6ByoTHDy1hRA^#YhXwc2#}%T4}Gi*0tc4#ZC72g3fM>9RsB2^u!Wuf)!aGFldiUCp=x9)Q`Xl^>(U}cw4PR4Oe;67WGU=Rs$piu>x#$k#}729ee(dz0hB!Xq3kN~=pIOnNNUzitAwd>IHfPz_u zwW3Ev9?0KHDC7Z21eyI-Wg4@Tf#B2PiPS}$&}xz?mn^>NNu(S0mP?}OZcRL)>l%R> z9x239u3-B2;k|1igqDmg=tIaHNNHVVriAsn3HD7ddR>mkU0{9NLUr+vs~-%Glfo{rbaj~ zus&$ruIEY=wSl5^RTSw3GOuyR1LXA>$YR5kVuPB0fxJGvnyP<yiol43!;N6VFpvC=e0=|H&{AzYG; z&RIxnZ11F~PCh4E@hLL;(iW1U;%-Tg_Nhz=B4}KY*Y6uu3GKEV@R);3)-~6m-Sd}g zQP(zW^gAfD+C8x9;~{}lmaC^}=1_DHzx8_kq?dQqj+m>RUW)2z4ax59X@3>RizGb~ zeHfeL6WPN_ud1Zs!#zYTFg+wSt_$kUNk+n-hA5vb)myS0;X9W=-95Hv_O{zZYttiS z_?s_z0fO5!rL$$~Zs=h`le6W1nLlr4u9byQ*>c~9%uO$Rt}C%CS}%`dp;q`Cnn}A_ z8mgulJAm7f|+FHoE~P^b>%X%vZsz#6j& zBbVMU#Q>>#80d+Gedm;sxXqSg{hrO{Fze7;zYL%jD01X$5?y7h4-B2v8x(2QJ0VaO zqhl8YsL$X+N$+VF`xPex(5yCfRGGH{G!5omupY ze#TM}3NUwVTUUo89+!A=DmN2R+RybWbaQgI3K(!?-6@qFtB(5=|?(H z23eL9In^$y)VR->7)gVwx4-VU4s7bJWM1s8GwcucTeVwzep4l3Z&L_gw)!%ukV|)$ zPR7gbQ3HZlbB(Qc@{uN;Y|W)rg&M=i&o7KaVrYiuG&D3Jo6sGYuQzM>JQ;-=i&iP? zK(G0QGB%9kL_@PS=T}8;(Sqx&n3XOFw--r%e*cz+*5L9D{%gSfx%5}`S^N#&;a-OP zC!vKoc}Tn9U_3y#WoL=EAl_OyE9yTdEnbbzFYXOrQ<=GA_T6>7vmxX^iui)xi0A4L zADbl0VZWu&Hw@izJ8w%V|J%r(?n(zj`vk{B>j?9kW-H`NjuDEej}ahY zqjuCWgI(<{=5hJGej@a|hM2_4(dhZ0eVGmUsjkAU(|W?elvj~3j|dztvBQW#3R{k$ z;J0NP>+xGg$N1H`wCKRSN)L%Jn!y^eCxfPWtmh19LcGn^QVPBbduh@%LP|tcr#kXz z`kVEiPhYrLKAELhW|R{4Zp?87?Mj#Z*0-4%a2oCl`lQ`bnZJ~LuoV7*Q&4tK3b@vB zdhMj42+l58*K7F|KB8k7?^}=OrZ*=LB4%;)lvbl_)byF{C0%SmTc8qE-ea6S1;L|) zUfBwEZ)=m5w#TWHg}nrUsot3#OrGEcTkf7K@EUV=dd8iF8j-(x0i1DC|Eov)zTq5O z8ci&0L=hx?`e^*5mcwy}zG6#S)4R2{b%W)2rV?ATWPBX$*A3KaKxLnmgjb`uTEJfr zvIMy&Kw@@^ZEmx9h2BrC;-W&`b0S1zm6>-;A`Da2D~nvf1Xbq2Lqy<|ERmvH>Pcy) z?R9qq0|)|oCpoCxD9)3t%x6!nTpA&CykwzyT1AMIZL;$#Q$e@xv+rH3cd!<7KyRVR z7e0h67W*fIq}~IO<5AtU$)X~Q@$iBUVQ62wqS**5OOKM(V)Ua&0v=OG9G3(3ug z)t;uE^A}3m4>g>NWL)=%^;elc#V#v=P8dVJ`K40lI`u@P+Q%o$3M@#2jKnU0g$GAC zwWO8s(uxBCeZw)0!H&K;wx66ZRn*ixNjH9b>JpCtmK!H;VLEC^RCI~a*m3J=YR}U> zyfvr+E{*~&$Xv_gT%jVWN8s-N&AnE-gp<58<&f4UNi?-S0Yfix$ap4H1lB-ng_77`9Tx zVUuEyki@@TSG1AYl6Twgv-U|ue&bpJ(k`M+y)~&^dhlgSgyTi*vGz)>xetR$9rORl^$Hn|wwWLwNS9F2KRV$ABTh`O1wd2U$CanWJ^Mm!qIbK-D@uP&Q9LnafC@dFRa#nk$#% zv6N+C$Zndl1(=Y7W;#pkVW~C^GgS@k>k6C&RH8^iGX_-)gZ$ME4H0S`sc~eAY!w z{hLO!sMMUvvBt;F_>D^3+I%yCidhrQliCDHqDZ=ig@>~W9&h+jWi93jSpoccJ8s_` z(VwOZyi8gG1W54x+x}2PL%;Li`xx0yiM z`(KZZT>R9Jt9?4)_|zuw=5UCXONlR7=c^b~zcfFuW}B(IpZA})KY7<`}|g-FC&{ati|)SiCN1+VMgoxly9qkKWTD@mQd zah>b5%I+r-rT>0sX6rL_eNItgFj{4ZrvDy^GTiL>451dg>5=jhDyEDU^xyvvv^WON zvxb^Ce%Sv`bbO^99Y%uEm-(+1QrB@4A44@fB;>|%1z~)+OYaA!?4^!jrR$3R=l+-F zFaG7S1bupWnC*_X#M-`vgl=(e#C%xjbSRHbA$LiO9N(1?n}WjqcgT-w6wS02LkkXUm@ek2#$EY`xn* zRLGyITD;=IM-!zuuGdi1`2aqA)Y1yNOG($Yf?h}22y77!GW=wocSWd5BFXkyx2*AA z|7B|9wNYxKTME>A%0AZ@$k zi#p~dSnT6Dd2;8T%E98Vm?duOzJts*`w%UX?nN;fL;J;7&R~viR)kxl`dm6WubPJn zZ+6L}Ii?U}u&-GE(O1~bzG8T!DK|FD&dEfVhM9R!r>dc< zyM+?G!?Q+4vA068Z+4{eC7EpRC^r^nrca@4J>&4LydUU%JH~^3+g8i7X6a#wU(29q zIRf7=;$GV=q?S1lUIWJed~I4=HEQs!R~X&@JW38C53?9KV6%q1su|KPC{37`Pg%Zc zuh<;ZC`&OgoYH287sCS;Eh*Dq7FETp9e!wYA}6yPv2++oo769I>Il)DJR0?;&$6oj z0*S)j5Q#)3PDe4hK^7UebF=@;3IFz2F5`=R+qd0`wvxA2ooT|3j|DW=yw24^ZfmbK zgxUtgH`9+Z)LQTBgghy6$jh+0)qjxbUn8~a&rkBVEc96w;-$dx+~? zMN_^%%>OK1e~yvd)T}P+I=fnlt)cLx*jx3Q-ptDFdonz|0v0!uv_6S{ zf%1KL`ny$ac;VOn;u6ZDa9Fj@oFzGjWfZeYd2<|jySd|Aa_uSS$6I&Ec|(c0*$1*E z-GZoFptRboFPlO^b0rk%s0TW0J#qA$BO!xTW|V0qGG78kPQ3D7BEGcldE~W`@nF+7 z1wu@@yG|+LX5Ok{z^%qBNz?-CL^H8j#F&ZuoxPyfMF zgI4XQ;&iWklTeQIsgQs%<`T_;G-=kO>m2LUU61629moH{mMsw%0I(U>;8HVR0m7 zs2*ZFs>(vptvcp*qB?gY#6G#U5}{GQII=y_mdMt-`gwzA%?aLMbE(_8#=crA))yQy zwjU_vT0i`f*EtSeQQi~KE2v;9Te9S>+1RuLHc>KP%Y!a|S48o*GWc6`W_$Q#Vxpl|?=; zH4FY_HuMeRY-WkpmI!>39zz|Hr}P)oWOOqXjCELZV&D5t%nMHP-CF&fnYn_0i4f`U zm0hAKNlP4HjsdK0eC(b{uQeyTDsMl&_fRoKvTXEQ-f9iQ*k|~8XK$YOsV!H;ylR^UyX^URqzn~%q;xSkZ?yB#lIv0B zt`Zg?EHr7>A*eOcsK+3`D>8#3cNZpU$b^F@{ETS4+$D=ko3>O7sxZjU3%R!M3gNJH zEz;|dha`U=;5ZKL;knXJk}@e1VdVk!X*UBVNNc7F$Y(PAL<{oDgN;&T!oopfq)Uk? z1T|Yp%jA126Bb%l2gk<9pfw?ts);OR1$&udAjizYqciz5%qdceo?)-9bUsJSGAc6h zV-Kt~)R?cEj0NC7v+`AyuelK*i20lYPnWy`-d=rK;dt6hXXRF}J1+xdqUj(PFePw- z1Y^jgOg$JB+|uqP?d0APx_32yjrHkF4{vvuKKCT)&t|Frj=VNGRS+hZF<1&jzt zR}c{Cy@Q3KbOfo9-h1zmVHBhVq}NcTcj+}Ky@VnqKnzh@=tv1IK;AeG%sAicT+jFY z{T_bh;^buSv-Z8#y4PA~?*;KN->WrOPN`BPRUar47U>A;zjnVG>PYkb?fm~!N&Z}o zfDopk>^z?r?W`b$6;X!e?7~%tg%mm4a;O49AKUx5{;iZ9rF z#IjO%g}vxlm8enYlXttk^QZl8qI=Qy%8Qb|}cNlq} z)5L9HGGHQa0+C1XN($y*GM>~SNkwflL4?$^rYYVeC2EWk14h6@m%jA&A2RrT5;RU> zx;7~?CFrxgs|L2=U(UQpN*p+- za3jGtwI?j8G=lar#Rsw-L3AThEF+8uR6a_p@#MwgUSG$yWR^yYk{U293CHq&-HF!O{H)OO|eEg2z=-v|B|~agcaI2YPVQ9sV~xll~fSo?XU=K6hH_KH3plr~FhN@2n3nP(gw zzlpwaDg9YmtNYX{lreca$Blkin{L89{Wk{(D*v1c)i(P_BrUAM(r~GqO&cOM1wNy` z3seRbfe-5C5^z-7gPc?Tn)fFyEF-s0X!~3SB0e`alv}tnZBLX$x(KZ1t{HkMYq8r8 z*Qw4DgAE_`-hEMPV?@aQ{@v@+38jNCTP7v@D&Y66K^ATYSBO&7-YPJ9B3<2pq0I&2 zGHoO@eifVs*_&KoVsO6kpq39Qg>Rc{eW^i~Q%aDRSs2s()!a?v8isA{F!Y0LgY@e< zlSLGkeetR`KFahZ6?at~3`lmHs#JRF9?r_)>pGLhgLXaxQnBCiFIP(tHqlJY+FM%X z6Dz^sjt6KaLYVy(q6HHF~Yr%Y_x*F9IRqXt-?PV~&XZFz3kU&QrY5R;o}ShFmo$f(TV%jUpvgNKcFieW*A zjq+DdwLZ|OJeJO+EJkyc!dP%T#WqLUDhO>FqFy|`;Ps-+cA0rf`6EisU)hJqUxjM( z;ZBOXb-P$4Ju;N4l>|j}Z**3AH&tSCNr2PVB zp0hO4(ldOV1Fl{C#d=~h?c=??31Rn<{W{sNlB6qL>rBEg+Tvf%Q@$WXnOy&*&lYK| zcX2QPzhU6J>B=a$B)j#}QvKykk@^u=Jqt^0bA$hzPQ!f3Mr#K;a?7SdBcaA$`}SPt zt-+sRP4Cw*<-jl(Nro9|=vy}uxlw|mv>~#}>J-dNGi67nO54iE?T;`CONZ#kZj;Xq zC{~G7pS4Zjim*7SraOXOx%Qc-1Q#beN&AB&(oWAsze( z8<-q5U8RJK7@Z05Ho)wg7 zF>Ne-@mF0MHSOY_+*Kw=iRed;XwI9A^}%K0RKnIzgQ1`KiHw1Zl9{vh1|}mzb<-Iu z?ozFx{j)ZQb&Z|+RDmaw)_bgGQqpP^F1;#lup4T|BGr#Uh?6(t68W2jo#s^Tsl%U0 zZpC?KrIRM6ZFZcbIdlxzp?02m;hNM_n%^2}l33a1Y?6ZusMI>v(aYeeHw8s;UE10= zMK}b{Vn-MwcQXBx6uHbMJ`v%pNug?c?kq_l2Bo_x1715%EjT_OX){sd&4K4*UYRkq zwg-KW-2yco!Y8;zG%OsVSHa5V`wh>OXL@9&2JA1^rW|)=YyaNQ%0%oNXO9EsgCXfm zR5#f`VH1`PZ`#c8E&#uvA2j2NJ!JT6sE@*!@=QJIH7QO?H+O9VB?sUWdb0*e_$E)o zwlB>FKg$Q0Dfm_x04KP9O&j*qHK~woj`QB1$`1-T-pUPFjpv+In{~Y(=*c zLK=CT&>~7(!lfG0q*V~*mQtC(<&S789dl`_OOR)3*5DmZ+k$)bM2u@#m$M{o{fdpY z$tzWoI9$wTt>4OjW&jfLtNfP2;eYd>F``<=MCk(9t2ijML%ZkWPfv*Y4XNi;*EvS@ z~C=8 zO^^&-13ipGg#wi$iZp2YlH z7(%DGX6uD7LMQQ;jiKfDYLB$3TO>w3EnW;V=TD?;I*Q1$9rCy2R!BDSR8D6{zLyR@ z59@69Zh{IHo^woa--7dsiM~J_^Nj8`H|UcP-uGA(UMB%_SdW$J>e#R*y3=6?39Nmnbt-jE;qR zKDv8+H^tVbv`TsGiWMA}+sEwiN!MU)716{TauD5M^nOn8MGe|=*m*GtNpaZ1r()2p zen_i93OssSh9%)x>9=W#k$0%nnV^x1o6hZHGcW)hJw_p_hHN&z@{ov_tPvQuROW3E zW9&?}{opsOJ|5TCq)o=JBP#9uoyVpyS_vb#rqVdQCm$e8FE`&(#Kb0SsdMnCbZxLm zC@nr^a`k$l$2+;4^Q*T&0gJ$!RGxaF>N*>0>}2uV!LK3Zcz-d=`t^>){kWCL$|TW0E0~SSwyuv%sLXm$39>z5GAj|3RGuXNFTNY(ks zP0iutVHs-jWAuw|@$ne}7T507=U9FIj~}lM_C%%m^c?eJ--8cM?kde( z21Rn#I}lM>N(P&k#U}Y?qQq;RXbAhC6ck%UFDB-xDkJ<>KJmks0vp|N=b^XRB`89 z2_pz1VaYTJbhs=uy}!03*rqwm3mR@9b?e7jJ>?T^aovhsf3wp;cz1KZU`Md1c8(4i zzcsfa1oGKxlM#c1x5?cGNA5sD_YN%XiLgN0W0a?&-mcg@dqWF%5xRSMse!ErI$dsf zxD|cu(fqDpwq|3ou_ZzwCQ@8!Cgh-?j(N+nH>MQ3Y?zSl&#nF~xo^vf&TO zE7QQ)${B`VTD5pjZ`ouL9yxLuX)=6SD}?8Np~5ELj(Rm2=J5M=jp<_A*c#TxS9R7D z?8hH;ayjoRVE$HB*I!X zqx26{I$qH@vw$$Rp(lj*yH?C5sy-+fNi~&zyaj7<@2=lf$!EV>lAt?Ajrf@K1@SA@ z`Yr9!+754F3m+F_Aqt(<3?uzr%awUa524|jFI&L6tnP;DYE$CImzAoNYuiPt&_Rl+ z>-n`pu3z5q1Jw+X)8ga`ss$Y`8RYQ1)}=_jY6(!=e2i{|S>3x27)R)CUr-;zVfn=X zMn0ny`r&To;t4L>>@@Z9(zISrkex}((@8p$DcDb2^@rxMs+o$dNfgt9t zO}VIg6Ytc-eSaYIrHy|$XCA3(Y8_uw>eIxhEKg&-p0$`9KL>iliKYeva>8c2LUG*y zWFXA{Ml+L!nCp6G@5iDR3a%m$YSW@Fq7w5}#I)GCd*_3xg8MLFqu=&Lg&gD#T4@ZU14lPRv_4RvWxD;rG z^>S>_=3{A6Pj)XkqmNI;cp%`eXpfJ_OM+a@sZSijY<`cl*Qu|wknRsKy^B6IUZY$o+K= z8U7s16oGc*_)c*m>@Oo=fz`b|F3Sm2QQElY1CzJ&Bn{btNRT;{6a_TN4Px?`kWk*U zd4;9#L0N2!_!om3Rg4_K&#Bz?+4$4O?j3ybn(FLjqJoYKYnV_FQJ488Rvi0H&*&2= z2vS-V1j<#j%X&xrb&@Q9>VtZr19)4A-$Q~5t-uTQj}Qow?(9i8aed%1>HWS(EAz!u zL_NE|-qEWW4I~Dq*~-vLM0ANDwl^86Q(a1M5#2YWdo~hxwIgHUf)i^?|Z^v&YqHo!1ovD-6V+KU<{`-wjK* zE#NoeYc!~)XA>bQYkzOEnXGPmSs~Ppy`lt9wV9Nic2XP;cSn7y5mW5mzqzb7%P$_8B zRVPM^Mu<|OK55=HI-GP{pkOz1UMNAJF^J`ap5|DlcvfdiQIt}a)UGMhY)vJGct<$? ztxZ?W-PC?TCwO3xuHhJHdJjUtZ*(UypA_DZc`?oPynwZ3d0wwB>6w(+}GRrhjk%Dt^V_(vq zm59E@eay+fab3lLcdba*szeO%v7n*G`3Uy36Qw1p=k%-JcHL{pwMy2l?2RoI8gj)i z8uN_LyGgW4$$4GJ>LI0gd3sB)tm|KmwAQ{UeopYLlmxq*A>^gjdwmgXX(Du0OZP*1 zx&5BwBoTkLXPfGhwvFFF{M;rHn1hYhGGsTtnVxgE;9^`X*_3CDynFsL#5!O%d+$U1 z^LMrP$*qCkxxgd)oaI`MF~e5~G`>paRpKMvCTUZ`XY5mF-Jm%EMv~*p9w%z)8cyNk zP`T_WrXwnyd>lUXyT@NDznq};Q5(@hW9lJ-5Nf#5#5BcJ@H>B~mYF6ebp$uxkeW#0~{|X)F)3^`X!~ zmqylX;Kqu*N>_7%IkbA=PwS1|$8K15k()dlFQpdFt3)0o6%mSGU==ctjIW#R{MBT@ z@lhnHg3ftm(M-7!s!y@Z`<{_;0=EYnr(KYwug$wF=4h10ovI>TA4RrH=+(TU(jwJ) z_f%-S)1$A!T=aR5Lg~Jc4sN`b`L*_=Hw$t5k3&S0Z<`)t;}iK~+4-wrn|<8Eo&vi+*zdkKV z6EWbAB>z}_s7TBH^gfYu$1Udbm%|%>W6oQ=MIM0_zV$7g!5whPeErB@2e;Msv35C$ zu(-A0%mZIe#BTzgmmqnU4xq6`X0oXX*%KgMCFH$gb8Su*+LLvankm#x+pcwuwb0;x zrodhQVwMVDo(ql3w=snen1|l0y%vus?_KM}nz!Ih4%L^#@Xwn4S%Vkz7RRF;!W;C{ z-1i=X6$B&;HgSkX3T~ak!v~*{E?g83IX5!0TIbC)q*d>9l96@fQ=>>8z!S+_#|9m; zJCSww&tI4+{hTZfTgcQIcbmQHn_&-32WEW*XtYy=T)BUq!CIB>cOYHu9*(}@5pt_v zaj)MMRhov{#E!TrLkdOFS`v>VN1=l}fXL8$81Mo|@1~lV@XRDY%b~(zMXP3auv7l> zHD{7#_~Z=ro=I03Lyfz$yq_;y#S30aG}MMb_nc8$h;o~cb6Nqo^nhM@+Gt(Rna*bl ztdg?S-^5C{FJ*e5MQxiCId9NKDSw_Ky~W1Aly=er0cKhA#UGAka+m4GD`&Eb@Tl07 z(vx+$@r`;MXRWPnWo~0z|5SA?&Ks8e1hup?lOEL?gX?RDh#AYItt;NexSB6$E*`jf zz#b5~*6!Nlc59I9PCWb75OF!?;eMt?4MfTYKAMm7T)1K1U?b?!fD;~i)reg2flRY3 zM1vN1jf|@y!d)+lR_6}ZH+!~XSG@tUej#3TY5F%dC3ehF1Y#`+FkSbsKvF{~!sA$_iisDdS&{wqV~ zd?Wo%@xZQ+czGpqeQyJZ>EJ8Q`k3Wok$>7~MeXvn!{2uAl}fCNfTV~g%!6NU8Ljy~ zVe4~_fXyn9#5x4;DmbIB=Okq~zo1E(O>O6m5JA_@Lrb_1@a5Dv06n;_pd4yFRXvs8 zEC>yN?p*hnetpC|^-%sKoPFmPP?|~oM%ly83W3A$Sqbh4EMGdZ;lQ1LDuqpp-<{;e zvHq7}-^l0CvMY^RRZA95HG3FFm7_PoEOIooPTJCy;-XZ<;C*kquKJ4OFkQbKL8;1T zEY|P6c0+lH@$Cfdc3|yU-mJ%~qkFYZJ~U95sl|7?ou3e%9}g=<<BO!wVGDQrnGEGBgeeau#{7woeA{&2X^Liq&u>i9@|i_ zk4zGPG}CSC7Ui>>o1eBBG3>oG6`S!c^|e!ii*3s#c@h}i%{6-JnPVq>w56AgQ=!H@ zEsQW3An1DKVWlX#F*eJ1Qa`L;C`ju6PDN7uIhRzS)**uAUKI=;OA=L<p_>c%ddZUAjME(LHj)0~f5g-YA@< z;T)J1&c_fD+KqpEn{k;bMthm?K1dg3<@5ABmc&WpWWf5wo~=ZXx1E*c-3lI-I)_d@y0()M zOt5Rf_wXPAkTj8!9w14(84_$u{dNvo8yPuKFn$m{F|A$1Rh0zYi?DLtcOD%qtDZ+! zQ^=YAev(;fWRo1}Py)iJ3safzXB@BX0$^FnB}G zmu&_leeF=^EfF~nZQxH<>AH4kYi5_*nKmH7SXi_~4FS1@aZ{4s6#Z-7Wa>q(+^J5BZe5QY0IGb=H!FceMaw zqXV0-CZ@$^dUX+xQbclr9d{L&3+67oGzj+8k&ags^ipmQ3ih?!%mWnk34C)2(_p(r9tnwpfJqGxNOF~6}N2P7{ks&9rb0F7A zpY_@>@Q^Oz_^nDGu8`?#@R6AxMD9tpRAFL(odv~T9pe4<&!HhJ{tU$>9Lx;{#GAS)vS|3-)ABZm9vF=}HmdfC2<*&Xuw zCPO|=6ZKPdAiO_s=)X{Y_$)`Zq->nji3Ao^UmW*&znj5)rGQIQG3x$^=6N8z~U zU`5$7Nv<(>3L%fs@eu~jU~P6Uk0BRwE`GyV=TbFMLDHu!H`L^nt%zA8r9>Gx=sta# z%ugsCTYR^=EL>W#`cU-TPnv%v3I@oz8Th$+z#6bZ_+3Qj{JIlP}5}aZ#5nUEB&`gu$755 zpDdlcj@cud+M4Fr;eLK%n@Xo3@wt7j%L(MKU8g=6Uh^zkW6aIYc5cvq z$xjobv%Het9*{y!fcDg%uw;9_J+V;8S^u%sU|cl)1)@~Y1+;|Do=Pu62BYEBhlqn; z^4@KocL-HquV}U6b6SD`Y^eMOLJm2=HC6@R6roDHq|gf}-iYQvL&|5Dgiub176`0a zok4BB^G2&;8RNs8i`ig&>!jAO?C6m9NsjMgoh@l%dpfb~^PdSNnPVKrDLUb7RnL`@ zZbCpKl5efQ{=Kd34{|NO5uh!RgGdo6;78ckit&vswNzB8P@jIMbqC)-obrec#NXgw zI_!teN7r@^)KYa_Ffs7Ix~9&MJWn({X8Z9SY zZEe{UPsi`$uwzxKU7=U?;@0MB=cW!)U`8F#WnmVVpavy~#>aZ-JSPYk>{G8aDn-5d zRn|w7Bx2vUefu5n?_2o#OygCwQkX3llPr>PN+npBF-u31h6p}N<>CfjXhV>IYZQT# zwc}+iNhEh$7-#lYz!iRW(kK0_Y+PRvPQ77_#9XPO^xiNKRJ@>KLrV>0mu;6^IJ*zw zQ_?T8=cItGyU*6Ml%^kJXj2%~m~K4vfti*}(-skTydu1QOZGNSvTsldJl9wl8CL$&5EvvRU?kLO{|n*6Dgwy8sh_s~aU ziQ8^~B=%=pbrLI59NJMN!na>@$ zNYkZ+xMO8$sMT;f;mg2-jm^O$NN%|7XMd-uCb&U(_nz@|q_2S_Geb;J@_f5xUBjfX z8Ac5^ki=U$CIY!?;c+O_f8-YFxm|W*i`CyC+639RurDKa8#GmQARCt}nYrIQ@J~Bb zmG)mKOE_ZpwwCs{?_DeOWfPpHkLsaFnoH^7z(liIEm)MRFM~}!4Ge}*>hK0&dR+88 zaE!Q?gLDrqg6Rec?-ucy{wctL&R0@Edc^)p@GQgfJV4FMsNH7;e;Zv~$F022&;H)U z?X-*Cw>E_`=f{>=k(aufa-pHUVdSf3EN3b~hOpai==I(V>W!!S+9Gg7A^Sw8A( zSuku?>TQ3-7sE95OjZP~xF90QK>93sK56((g=+mB_k|0E2abNvcO$^BWr5EGb_}3^ zGSIEfv|u;D^B$=<|62OzDR7QUvIlherTh_C7iHk;YXD|X!R?sQgfUx#H{?FOs`$Gq z7fL&?=~)KJTNN*6#rGl(U9iIw)iMHun?3+7`(babdD!OG=U@|RYz=FbK*Ny@#-swX z3o4ns75Cso+53?cc|)f``+x`gsFD0G7;nB}Rg*epSXRG|sCte?Rl=LA8{Ls15c>_y z`*`6b%6%Ri_IETwRc5cv2JRNJ#?TW-?gORyLJs1F<9nKfsTzdC_k0l;DM*t*h1W?W*Ht{GS3Bt zvPri2KG@d*B;DdsoTP!MfV*G!INVxjC0rEi6#+3Uc<4$WUQ<80BT7#dg-V=e2MyUS z!lka4;RKCUar!$FtTPILqGr4%s7^*4o_qpO^rl1UpHt9$#t;mnp8^tyhCEnbY#oD4 z+qv=kTw+KR7KF^uZAUrL8z{x!4m7OL40c|&{$Aanhll; zy|wEx|64+0o&7h%vabm)Jf6qOm>PEogr&|TRNx?h*V3sp6%%b8IqeBRCZo)E zu_nuCHd+HzIwCuDJH+YawT$l_uBr$1lDmF_Ga&#uSs);)hY=c)pTM{r*STA-1_iaJ zFmCy7$2oLG`0nGB05A9ZJ<{f@NSu*MP62xHo>2=}>xvaHUdT#DqkCwqQ|3zsW$?b2 z+wcdQlTR%Y3^5s;5th0#BdumUaIUVTEs2BP;0bT+CvxSHLEcgSPoiwGl&kYu;417TX zNJ96w3!OgbzfRib_H#+buZB9BW`591-@>MeWo28B7ExIl<~N{AOKkoWf6Aw!9l!Xy z;l%VXWy>lKjpMEfFXha(*{gPTcD+z70@fr`Z04h`Jn|`iz~9Wl4;)hcE_bcByvOF4 zdLiqcmYPULp%2q(=$q&tu)d?>#LnmY1UR3Gx5TSdPA1Z&!Ed5SYEe`?c4itb(`*@A zk##SRR_UThR2CBbha!uxcALDePe)QWUgCegIIJ6` z1(%&0geNfY7Rc!61SlpeAiR6VtO^OlHfRwl!QCj3m@JUeNk{sG4qq6QrjdMZHVSpg zz0zm;ExTcOHJ~j!{^@&6boyS95~r62jf?;B-3my0_Ex27b8P8e#NyXN@4QPj^wih^ zn^lm)r~%5dsIaTElb}c*hauZM)B@KOCZ~&RZObMySZFe=wCEhawkXO)ky9F+1p=JBvQJhQyt!LFVApg7H08ZCfYz7~}c zt+k5r#$oX}2!pErBC*s{g~(t!Z==UM@RZ(MvOlIGxi^0ym88dF?t)1+Cp7fx+~Zjo zL6@8#(nX(DM}*JlkYOkT5<>2scaqR-Qg*{w*z&^R3bg~7pF5i#Sk~s zd6%L**Ix}Ja+2LaWN{;$smP_RLKxfW4Xry@`V}`u`@iL^{^p)si8*fp)hzXjS+xZF z$mmkc;O397uGAl*xtX9E89M7@wyG?rF#IQUbER)8F!T8(#0ht%qbrvOTAIZ9f(rZ#z!>yjE)@KQ(KNNK7NCa8Aeqvf) zp|UId+YEm&@~t;7_m`x7r1i(dS}z|z*C5Vc)<5$Hxmx&cv^&#!d}nj`V?XK3;-$xj^|FtS{LYVG6?6ry%S%sJG9oQeXGgm(&HvAp(hJ_#WgOe0jl|V|+(_bcA zyuRLAZu8H5GJ>)WU+K(8P2r;i7GxH`x!q45*Kq90x2M%PTxKN${_l%Q_(V0S#m8Fq z+b@)hUX;8!0$gsZZo4AEMclo@`p55PfY0ZUX$Z0#F(5j5_UvgA2vgw#Q}SbOIh|fp z?#X$6u`nMJuPwzz=LY^ ziO)D8E+v!1R{2SGs(in@xT%$!B)ary3=;JRDA-j#TLiWV-JBw~>Af4V0P^pdi(R;H z()=MrzNqeI1598|6cqueAxX)2r)9*Rb1_a(_hxQx$d8ac14OZMK>e6{LmwXVA6Lq< zoF6>jJBx6*{m$Re$lXXw27CjfVoNIr8+Uu0aMzVPlu*kz|jjAa>tk5oSL#!OAKQEyXu_H126e2mkP(! zHQkr-4wTJD)8Y((iMIl(N%?ME_ztoW)~{oU;{o7;<8n0vdCVQBpoRulw*4QD8(mdzIb&Dx!u*PfLfk_CIW16^PSh zrDUGGI|xaCo<{AYsY%W%)^VamcIaC?G%g$Vf&XXTCSZ)^o3pW)*HkIE_>C#Ke_2&d z(6alRD#>M>{5G)D->djtSaX*@Q9Y*Yz!f;d<`jpYeE+BP)wx2G^bk*ZlUNq% znoK@2JBUgrUri~XB~~YM&1kqrr9JWrc3CNjpU;w4!42p?ZW*tbPg#?ClOUB=P2RI0 zoh?S4_i2tkZxZH%_hV3F33*_FCBvx^`G?6VdXk|l!35Q{ng0WOnkS6hIeYB_0A?Cy4!4kP>w$IVgT!V&KEiTkuT9DNoUHkIw?D#B zvS=xbWIKJjJ@)K(*u$4J6N@oja~O(49AO$c7WdW(sEMMjs2`fv4Evz*bKx@n;mPN< zz6+N|b*%Y}cqvAT+%Neb$tC@Qo}P=47bCDePFxz&%0sW{ZbQehY6c4DxfRPZ6lu4=5vt~!74z2FaC;B*?7uWL(u3sJHmPI%q6 zESp(9Q+yK&?L~xMm*g#qyFV@t5qmq!$|5J-Shgo961usz;!F8)tKacxYaGr$ERre{*1Es z9qm$&tpMra_m~PyXJlR6Vk{5ATLYF}+YZCqS+c+Ua=rYi^h1p*`^h(iw-4y;^~=q+ zfgD=qB2D<@2ZWj$N){Eu_A*v6A77ctxFjlN7~q%lLZnj;pC*t(*hhNmwBt9~PnNBb zixB)Q@mow4VDGSy22tMFdC2l<#XX${746pz#OZI0Y$h9w3aBPcenJR|%Bxa7QDFJg zK0Oe_6ZfiFNlK<4mYLn)bCFZ1V$;5=bmF?n|DA0GlP zUE&4IMgfuxoz&VFf&+W^jL|lX#KE`tMVJO_S!XB#gJA8|ln*VGeka^Bi@}PodY-EV zv7*2ODiPgxd{#xI*FsJ(ci<5r%sxW_5UB+UFvU3AM`#x}v4|d}2Y1Pd z-(%?V_Gvdet`pYhn=~!RQ6g7>WY7Kq>7V$*Kc`isiKjq?%^2p>Y|(d&-+5CM=j(ax zM$&3D&$6>0@bh+GT{pyO6@6F{KEm#4smD&9`U-NU>;LjW?@GU|&hs<>cBVrA(Fg8S zJSF_nw8Q+P1^JiZ{(i~wGQQ32Z~14o^oN!Bcd=Z&e2oRaU00Z^`HU?72g3Yoa4I~R zpt}82R{HOQx8XS&<$!|2Pq@Rs%OW5MFS1{Q@1NrE--r5Z#^24K<9VCcE2TpJxOvDp zfZAl$g1-}FpIv&!nEq`ZhL-0D?>~+a`O!LFuG0J*bn5_QwYErf79#!QD}P@kL*q;4 z%%&NBOUg{a45SbKLy@t78GxF=G6T|HAgQLm!mE zy#@ezpIld-MYvu2k!jx2xP5Z+$$Y5g@T}~i2*vZ+C1{uVS>EQK{Gs@K0L)|8K1F8) zl44bhND2YlX}zyf`kp{MrE4`=4~~w#JeJ(;p^e6W)vE-q%z;u>|Aj~Da+X_-(4sqp z;Qr0?Biz0}aB|B`{KH_C%RKsUHeZH_l1>p~c+I%N+ev1gB5=FAwtnbcyuNSKPT-cIMFp TdHBO$@PAKbl%-1_8UFr15!>LM literal 0 HcmV?d00001 diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index e95409e08e9..e807ee54fbf 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import dataclasses import inspect @@ -299,6 +299,7 @@ def __init__( extra_kwargs["delay_wgrad_compute"] = self.config.delay_wgrad_compute else: raise RuntimeError("Only TE with version >=2.3.0 supports delay_wgrad_compute now.") + if ( self.config.tp_comm_overlap and tp_comm_buffer_name @@ -2116,3 +2117,12 @@ def set_save_original_input(module): "set_save_original_input is only needed on transformer-engine modules that save " "quantized tensors by default. It needs transformer-engine>=2.6.0dev0." ) + + +try: + # pylint: disable=unused-import + from transformer_engine.pytorch import cpu_offload + from transformer_engine.pytorch.float8_tensor import Float8Tensor +except ImportError: + Float8Tensor = None + cpu_offload = None diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index d501c11a0a9..74b9a90764d 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from contextlib import nullcontext from typing import Optional @@ -8,6 +8,9 @@ from megatron.core.enums import Fp8Recipe from megatron.core.fp8_utils import get_fp8_context +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import ( AbstractSchedulePlan, NoopScheduleNode, @@ -450,6 +453,8 @@ def run( f_layer = f_schedule_plan.get_layer(i) b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i) torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_num_layers - 1 - i}b") + if f_layer.layer.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, b_grad = TransformerLayerSchedulePlan.run( f_layer, b_layer, @@ -472,6 +477,8 @@ def run( for i in range(overlapped_layers, f_num_layers): f_layer = f_schedule_plan.get_layer(i) torch.cuda.nvtx.range_push(f"layer_{i}f") + if f_layer.layer.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, _ = TransformerLayerSchedulePlan.run(f_layer, None, f_input=f_input) torch.cuda.nvtx.range_pop() diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index fd1cc3d33c6..786a1b850dd 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import weakref from contextlib import nullcontext @@ -8,6 +8,11 @@ import torch from megatron.core import tensor_parallel +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer @@ -350,13 +355,17 @@ def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor) Run forward pass for computations between attention and dispatch: pre mlp layernorm->router->dispatch preprocess """ + if layer.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") if layer.recompute_pre_mlp_layernorm: layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) else: - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) local_tokens, probs, _ = layer.mlp.router_and_preprocess(pre_mlp_layernorm_output) @@ -437,6 +446,10 @@ def submodule_combine_forward( hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout ) + if layer.offload_mlp_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="mlp_norm", forced_released_tensors=[residual] + ) output = make_viewless_tensor( inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 654827dc6fb..ae292649561 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from collections import OrderedDict from typing import Dict, Literal, Optional @@ -18,6 +18,9 @@ ) from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_init_chunk_handler, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region @@ -117,6 +120,7 @@ def __init__( self.parallel_output = parallel_output self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.vp_stage = vp_stage + self.disable_param_offloading = True if hasattr(self.config, 'position_embedding_type'): self.position_embedding_type = self.config.position_embedding_type @@ -410,6 +414,22 @@ def _preprocess( return preproc_output + def preprocess_for_fine_grained_offloading(self): + """Preprocess for fine-grained activation offloading.""" + fine_grained_offloading_init_chunk_handler( + self.vp_stage, self.config.min_offloaded_tensor_size + ) + if self.disable_param_offloading: + for param in self.decoder.parameters(): + param.offloading_activation = False + if self.mtp_process: + for param in self.mtp.parameters(): + param.offloading_activation = False + if self.post_process: + for param in self.output_layer.parameters(): + param.offloading_activation = False + self.disable_param_offloading = False + def forward( self, input_ids: Tensor, @@ -435,6 +455,8 @@ def forward( runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. """ + if self.config.fine_grained_activation_offloading: + self.preprocess_for_fine_grained_offloading() inference_context = deprecate_inference_params(inference_context, inference_params) @@ -701,6 +723,9 @@ def build_schedule_plan( TransformerModelChunkSchedulePlan: The model chunk schedule plan. """ + if self.config.fine_grained_activation_offloading: + self.preprocess_for_fine_grained_offloading() + from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan return TransformerModelChunkSchedulePlan( diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py new file mode 100644 index 00000000000..b28bbcbeddc --- /dev/null +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -0,0 +1,603 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import warnings +from collections import deque +from contextlib import nullcontext +from typing import Any + +import torch + +# CPU offload implementation for pipeline parallelism +DEBUG = False +DEBUG_RANK = 0 + + +def debug_rank(message): + """Print debug message for a specific rank when DEBUG is enabled.""" + # pylint: disable=bad-builtin + if not DEBUG: + return + assert torch.distributed.is_initialized() + if torch.distributed.get_rank() == DEBUG_RANK: + print(message) + + +def set_ideal_affinity_for_current_gpu(): + """Set CPU affinity for the current GPU to optimize host-device transfers.""" + import uuid + + try: + import cuda.bindings.driver as cuda_driver + import cuda.bindings.runtime as cuda_runtime + except ImportError: + import cuda.cuda as cuda_driver + import cuda.cudart as cuda_runtime + try: + import pynvml + except ImportError: + warnings.warn("pynvml is not installed, skipping GPU affinity setting") + return + + # Get current CUDA device ID + err, device_id = cuda_runtime.cudaGetDevice() + assert err == cuda_runtime.cudaError_t.cudaSuccess + # Get device UUID + err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) + assert err == cuda_driver.CUresult.CUDA_SUCCESS + # Set CPU affinity based on GPU's NUMA node + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) + pynvml.nvmlDeviceSetCpuAffinity(handle) + + +class PipelineOffloadManager: + """ + Singleton manager for coordinating activation offloading across pipeline stages. + Manages chunk handlers, synchronizes GPU-CPU transfers, + and handles virtual pipeline parallelism. + """ + + OFFLOAD_MGR = None + + @classmethod + def get_instance(cls): + """Get the singleton instance of PipelineOffloadManager.""" + if cls.OFFLOAD_MGR is None: + cls.OFFLOAD_MGR = PipelineOffloadManager() + return cls.OFFLOAD_MGR + + def __init__(self): + """Initialize the manager with queues and dedicated CUDA streams.""" + from megatron.core import parallel_state + + # Queue to store chunk handlers for backward pass + self._queue = deque() + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is None: + self._vpp = 1 + else: + self._vpp = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + # Cache chunk handlers for each virtual pipeline stage + self._stages = [[] for _ in range(self._vpp)] + # allocate streams and events for synchronization + self._d2h_stream = torch.cuda.Stream() + self._h2d_stream = torch.cuda.Stream() + self.reset() + + @property + def d2h_stream(self): + """Get the device-to-host (GPU to CPU) transfer stream.""" + return self._d2h_stream + + @property + def h2d_stream(self): + """Get the host-to-device (CPU to GPU) transfer stream.""" + return self._h2d_stream + + def reset(self): + """Reset manager state for a new training iteration.""" + set_ideal_affinity_for_current_gpu() + self._inside_context = False + self._cur_forward_chunk = None + self._cur_backward_chunk = None + # Track the first microbatch of the last virtual pipeline stage + self._is_first_last_vpp_chunk = True + + def flush(self): + """Flush all staged chunks to the backward queue in reverse order.""" + # Ensure all virtual pipeline stages have the same number of chunks + if len(self._stages[0]) == len(self._stages[-1]): + lens = [len(e) for e in self._stages] + assert min(lens) == max(lens), "All stages must have same chunk count" + # Clear the last stage and push all chunks in reverse order for backward + self._stages[-1] = [] + for chunks in reversed(self._stages): + for chunk in chunks: + self.push(chunk) + # Clear all stages after flushing + for i in range(self._vpp): + self._stages[i] = [] + + def push(self, handler): + """Add a chunk handler to the backward queue.""" + debug_rank(f"pushing handler {handler}") + self._queue.append(handler) + + def pop(self): + """Remove and set the next non-empty chunk as the current backward chunk.""" + assert self.size(), "Cannot pop from empty queue" + while self._queue: + self._cur_backward_chunk = self._queue.popleft() + if not self._cur_backward_chunk.is_empty_chunk(): + break + debug_rank(f"popping handler {self._cur_backward_chunk}") + + def front(self): + """Get the first non-empty chunk handler without removing it from the queue.""" + if not self.size(): + return None + for chunk_handler in self._queue: + if not chunk_handler.is_empty_chunk(): + return chunk_handler + return None + + def size(self): + """Return the number of chunk handlers in the queue.""" + return len(self._queue) + + def init_model_chunk_offload_handler(self, vp_stage, min_offloaded_tensor_size=1024 * 1024): + """ + Initialize a chunk offload handler for a model chunk (microbatch). + + Args: + vp_stage: Virtual pipeline stage index (None means stage 0) + min_offloaded_tensor_size: Minimum tensor size (in elements) to offload + """ + if vp_stage is None: + cur_vpp_rank = 0 + else: + cur_vpp_rank = vp_stage + + is_first_last_vpp_chunk = self._is_first_last_vpp_chunk + # Flush staged chunks when reaching the last virtual pipeline stage + if cur_vpp_rank == self._vpp - 1: + self.flush() + # Determine if this is the first microbatch of the last virtual pipeline stage + is_first_last_vpp_chunk = is_first_last_vpp_chunk and (cur_vpp_rank == self._vpp - 1) + + cur_chunk = ChunkOffloadHandler(is_first_last_vpp_chunk, min_offloaded_tensor_size) + self._stages[cur_vpp_rank].append(cur_chunk) + # For the last stage, push immediately and flush + if cur_vpp_rank == self._vpp - 1: + self._is_first_last_vpp_chunk = False + self.push(cur_chunk) + self.flush() + self._cur_forward_chunk = cur_chunk + cur_chunk.vpp_rank = cur_vpp_rank + + def set_last_layer(self, is_last_layer): + """Mark whether the current forward chunk is processing the last layer.""" + self._cur_forward_chunk.is_last_layer = is_last_layer + + def cur_forward_chunk(self): + """Get the current forward pass chunk handler.""" + return self._cur_forward_chunk + + def cur_backward_chunk(self): + """Get the current backward pass chunk handler.""" + return self._cur_backward_chunk + + def __enter__(self): + """Enter context manager to enable activation offloading hooks.""" + debug_rank("----__enter__") + from megatron.core.extensions.transformer_engine import cpu_offload + + if cpu_offload is not None: + cpu_offload.CPUOffloadEnabled = True + self.inside_context = True + + torch._C._autograd._push_saved_tensors_default_hooks( + self.on_save_for_backward, self.on_get_saved_tensor + ) + + def __exit__(self, *args: Any): + """Exit context manager and restore original tensor saving behavior.""" + debug_rank("----__exit__") + from megatron.core.extensions.transformer_engine import cpu_offload + + if cpu_offload is not None: + cpu_offload.CPUOffloadEnabled = False + self.inside_context = False + torch._C._autograd._pop_saved_tensors_default_hooks() + + def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + """ + Hook called when autograd saves a tensor for backward pass. + Returns a tag to identify the tensor later. + """ + debug_rank(f"------on_save_for_backward {tensor.shape}") + assert self.inside_context, "Must be inside offload context" + return self.cur_forward_chunk().tensor_push(tensor) + + def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + """ + Hook called when autograd retrieves a saved tensor during backward pass. + Returns the actual tensor (potentially reloading from CPU). + """ + debug_rank(f"----on_get_saved_tensor {saved_state}") + return self.cur_backward_chunk().tensor_pop(saved_state) + + +class ChunkOffloadHandler: + """ + Handles activation offloading and reloading for a single pipeline chunk (microbatch). + Manages tensor groups, coordinates asynchronous GPU-CPU transfers, and handles synchronization. + """ + + @staticmethod + def offload(src_tensor, pin_memory=True): + """Offload.""" + debug_rank("--------offload") + from megatron.core.extensions.transformer_engine import Float8Tensor + + fp8_offload = isinstance(src_tensor, Float8Tensor) if Float8Tensor is not None else False + + if not src_tensor.is_contiguous(): + src_tensor = src_tensor.contiguous() + + cpu_backup = torch.empty( + src_tensor.size(), + dtype=torch.uint8 if fp8_offload else src_tensor.dtype, + layout=src_tensor.layout, + device="cpu", + pin_memory=pin_memory, + ) + + if fp8_offload: + cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) + + cpu_backup.copy_(src_tensor, non_blocking=pin_memory) + state = (src_tensor.device, cpu_backup) + return state + + @staticmethod + def reload(state, non_blocking=None): + """Reload.""" + debug_rank("------reload") + dev, cpu_backup = state + if non_blocking is None: + non_blocking = cpu_backup.is_pinned() + return cpu_backup.to(dev, non_blocking=non_blocking) + + def __init__(self, is_first_last_vpp_chunk, min_offloaded_tensor_size): + # Data Structure to maintain reference to activation tensors + self._tensor_tag_to_state = {} + # Mark the first microbatch of the last virtual pipeline stage + self._is_first_last_vpp_chunk = is_first_last_vpp_chunk + + # Group management for batching offload/reload operations + self._offloaded_group_index = 0 + self._groups_to_offload = [] + self._groups_to_reload = [] + self._tensor_count_current_group = 0 + + # Counter for special torch tensor types (FakeTensor, FunctionalTensor) + self.torch_tensor_count = 0 + self.d2h_stream = PipelineOffloadManager.get_instance().d2h_stream + self.h2d_stream = PipelineOffloadManager.get_instance().h2d_stream + self._offload_events = {} + self._reload_events = {} + self.min_offloaded_tensor_size = min_offloaded_tensor_size + self.is_last_layer = False + + def is_empty_chunk(self): + """Check if this chunk has no tensors to manage.""" + return len(self._tensor_tag_to_state) == 0 + + def is_first_last_layer(self): + """ + Check if this is the last layer of the first microbatch of the last vp stage. + These tensors should not be offloaded to avoid unnecessary overhead. + """ + debug_rank( + f"------is_first_last_layer {self._is_first_last_vpp_chunk} {self.is_last_layer}" + ) + return self._is_first_last_vpp_chunk and self.is_last_layer + + def tensor_push(self, tensor): + """Push tensor to the offload handler.""" + torch_stray_tensor = isinstance( + tensor, + ( + torch._subclasses.fake_tensor.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor, + ), + ) + + if not torch_stray_tensor: + # Assign unique tag based on group index and position within group + tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group) + self._tensor_count_current_group += 1 + assert tensor_tag not in self._tensor_tag_to_state, "Duplicate tensor tag" + self._tensor_tag_to_state[tensor_tag] = tensor + else: + # Use negative group ID for special tensor types + tensor_tag = (-1, self.torch_tensor_count) + self.torch_tensor_count += 1 + self._tensor_tag_to_state[tensor_tag] = tensor + debug_rank(f"--------tensor_push {tensor_tag}") + return tensor_tag + + def tensor_pop(self, tensor_tag): + """Pop tensor from the offload handler.""" + debug_rank(f"--------tensor_pop {tensor_tag}") + assert tensor_tag in self._tensor_tag_to_state, f"Tag {tensor_tag} not found" + tensor = self._tensor_tag_to_state.pop(tensor_tag) + # If tensor is offloaded (stored as tuple), reload it + if isinstance(tensor, tuple): + tensor = self.reload(tensor) + debug_rank(f"--------tensor_pop {tensor.shape}") + return tensor + + def tensor_need_offloading_checker(self, tensor): + """Check if the tensor needs to be offloaded.""" + if tensor.numel() < self.min_offloaded_tensor_size: + return False + # Respect tensor's offload preference if specified + if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation: + return False + return True + + def bulk_offload_group(self, group_to_offload): + """offload a group of tensors recorded in tensor_push().""" + debug_rank("------bulk_offload_group") + assert not self.is_first_last_layer(), "Should not offload first-last layer" + group_id_to_offload, name = group_to_offload + torch.cuda.nvtx.range_push("activation offloading " + name) + with torch.cuda.stream(self.d2h_stream): + for tensor_tag, state in self._tensor_tag_to_state.items(): + group_id, _ = tensor_tag + if group_id == group_id_to_offload: + debug_rank(f"------tensor_tag {tensor_tag}") + debug_rank(f"------group_to_offload {group_to_offload}") + assert not isinstance(state, tuple), "Tensor already offloaded" + tensor_on_device = state + if self.tensor_need_offloading_checker(tensor_on_device): + state = self.offload(tensor_on_device) + event = torch.cuda.Event() + event.record(self.d2h_stream) + self._offload_events[name] = event + tensor_on_device.record_stream(self.d2h_stream) + self._tensor_tag_to_state[tensor_tag] = state + torch.cuda.nvtx.range_pop() + + def get_offload_event(self, name): + """Get the CUDA event for a named offload operation.""" + return self._offload_events.get(name, None) + + def get_reload_event(self, name): + """Get the CUDA event for a named reload operation.""" + return self._reload_events.get(name, None) + + def bulk_reload_group(self, group_to_reload): + """Bulk reload group.""" + debug_rank("----bulk_reload_group") + found_reload_group = False + group_id_to_reload, name = group_to_reload + torch.cuda.nvtx.range_push("activation reloading " + name) + with torch.cuda.stream(self.h2d_stream): + for tensor_label, state in self._tensor_tag_to_state.items(): + group_id, _ = tensor_label + if group_id == group_id_to_reload: + debug_rank(f"----tensor_label {tensor_label}") + found_reload_group = True + event = self.get_offload_event(name) + # Only reload if tensor was offloaded (stored as tuple) + if isinstance(state, tuple): + # Wait for offload to complete before reloading + torch.cuda.current_stream().wait_event(event) + recovered_tensor = self.reload(state) + event.record(self.h2d_stream) + self._reload_events[name] = event + debug_rank(f"----recovered_tensor {recovered_tensor.shape}") + self._tensor_tag_to_state[tensor_label] = recovered_tensor + torch.cuda.nvtx.range_pop() + return found_reload_group + + def pre_reload_last_layer(self): + """Pre-reload the last layer of this chunk to hide reload latency.""" + debug_rank("pre_reload_last_layer") + assert not self._is_first_last_vpp_chunk, "Should not pre-reload first chunk" + debug_rank(f"len(self._groups_to_reload) {len(self._groups_to_reload)}") + if len(self._groups_to_reload) > 0: + # Reload the last group (last layer) early + if self.bulk_reload_group(self._groups_to_reload[-1]): + self._groups_to_reload.pop() + + def should_bulk_offload(self): + """Determine if the current group should be offloaded.""" + # Don't offload the first backward chunk's last layer + if self.is_first_last_layer(): + return False + + # Check if next backward chunk is this chunk (for last pipeline stage) + next_backward_chunk = PipelineOffloadManager.get_instance().front() + if next_backward_chunk is not None and next_backward_chunk is self: + # Don't offload last layer if it's about to be used immediately + if self.is_last_layer: + return False + + return True + + def bulk_offload(self, forced_released_tensors): + """Offload a group of tensors and optionally release their GPU memory.""" + debug_rank("----bulk_offload") + if self.should_bulk_offload(): + group_to_offload = self._groups_to_offload.pop() + self._groups_to_reload.append(group_to_offload) + self.bulk_offload_group(group_to_offload) + # Manually release tensors not auto-freed by torch GC + if len(forced_released_tensors) > 0: + cur_stream = torch.cuda.current_stream() + for release_tensor in forced_released_tensors: + if self.tensor_need_offloading_checker(release_tensor): + # Ensure tensor is not in use before freeing + release_tensor.record_stream(cur_stream) + release_tensor.untyped_storage().resize_(0) + + def on_group_commit_forward(self, forced_released_tensors): + """Called at the end of a layer group's forward pass to trigger offloading.""" + debug_rank("--on_group_commit_forward") + # Wait for compute to finish before starting offload + self.d2h_stream.wait_stream(torch.cuda.current_stream()) + self.bulk_offload(forced_released_tensors) + + def bulk_reload(self): + """Reload the next group of tensors from CPU to GPU.""" + debug_rank("--bulk_reload") + if len(self._groups_to_reload) > 0: + # Reload the next layer group + if self.bulk_reload_group(self._groups_to_reload[-1]): + debug_rank(f"--bulk_reload_group {self._groups_to_reload}") + self._groups_to_reload.pop() + else: + # Pre-load the last layer of the next backward chunk to hide latency + next_backward_chunk = PipelineOffloadManager.get_instance().front() + if next_backward_chunk is not None: + next_backward_chunk.pre_reload_last_layer() + + def on_group_commit_backward(self, name): + """ + Called at the end of a layer group's backward pass. + Ensures correct chunk is active and synchronizes reloads. + """ + debug_rank("--on_group_commit_backward") + cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() + # Switch to this chunk if it's not already current + if cur_backward_chunk is not self: + PipelineOffloadManager.get_instance().pop() + cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() + assert cur_backward_chunk is self, "Chunk mismatch" + # Wait for reload to complete before using tensors + event = self.get_reload_event(name) + if event is not None: + torch.cuda.current_stream().wait_event(event) + self._offloaded_group_index = self._offloaded_group_index - 1 + + def on_group_start_forward(self, name): + """ + Called at the start of a layer group's forward pass. + Increments group index and prepares for offloading. + """ + debug_rank(f"--on_group_start_forward") + self._offloaded_group_index = self._offloaded_group_index + 1 + self._tensor_count_current_group = 0 + self._groups_to_offload.append((self._offloaded_group_index, name)) + + def on_group_start_backward(self): + """ + Called at the start of a layer group's backward pass. + Triggers reloading of tensors from CPU. + """ + debug_rank("--on_group_start_backward") + # Wait for compute to finish before starting reload + self.h2d_stream.wait_stream(torch.cuda.current_stream()) + self.bulk_reload() + + +class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): + """ + Identity operation that marks the end of a layer group for offload synchronization. + Triggers offload during forward and synchronizes reload during backward. + """ + + @staticmethod + def forward(ctx, *args): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupCommitFunction forward") + + forced_released_tensors = args[-1] + name = args[-2] + cpu_offload_handler = args[-3] + tensor = args[:-3] + cpu_offload_handler.on_group_commit_forward(forced_released_tensors) + ctx.cpu_offload_handler = cpu_offload_handler + ctx.name = name + + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, *grad_output): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupCommitFunction backward") + + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_commit_backward(ctx.name) + return grad_output + (None, None, None) + + +def fine_grained_offloading_group_commit(*tensor, name, forced_released_tensors=[]): + """ + Specify the tensors to be released after offloading. + forced_released_tensors is a list of tensors to be released after offloading. + The tensors will be untyped_storage().resize_(0) after offloading. + Note: specify the tensors only when they are not automatically released by torch gc. + """ + cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + return FineGrainedOffloadingGroupCommitFunction.apply( + *tensor, cur_forward_chunk, name, forced_released_tensors + ) + + +class FineGrainedOffloadingGroupStartFunction(torch.autograd.Function): + """ + Identity operation that marks the start of a layer group for offload/reload. + Prepares for offload during forward and triggers reload during backward. + """ + + @staticmethod + def forward(ctx, tensor, cpu_offload_handler, name): + # pylint: disable=missing-function-docstring + ctx.cpu_offload_handler = cpu_offload_handler + debug_rank("FineGrainedOffloadingGroupStartFunction forward") + + cpu_offload_handler.on_group_start_forward(name) + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, grad_output): + # pylint: disable=missing-function-docstring + debug_rank("FineGrainedOffloadingGroupStartFunction backward") + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_start_backward() + return grad_output, None, None + + +def fine_grained_offloading_group_start(tensor, name=None): + """Mark the start of a layer group and prepare for offload/reload.""" + cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name) + + +def get_fine_grained_offloading_context(flag): + """Get the fine-grained offload context""" + return PipelineOffloadManager.get_instance() if flag else nullcontext() + + +def fine_grained_offloading_set_last_layer(is_last_layer): + """Set the last layer flag.""" + PipelineOffloadManager.get_instance().set_last_layer(is_last_layer) + + +def fine_grained_offloading_init_chunk_handler(vp_stage, min_offloaded_tensor_size): + """Initialize the chunk handler, called at the start of a microbatch forward pass.""" + PipelineOffloadManager.get_instance().init_model_chunk_offload_handler( + vp_stage, min_offloaded_tensor_size + ) + + +def fine_grained_offloading_reset(): + """Reset the chunk handler, called at the start of a training iteration.""" + PipelineOffloadManager.get_instance().reset() diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index e83f8d90635..09f95ac25d2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import contextlib from functools import partial @@ -9,6 +9,9 @@ from megatron.core import parallel_state from megatron.core.enums import ModelType +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_reset, +) from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator from megatron.core.pipeline_parallel.utils import ( is_pp_first_stage, @@ -562,6 +565,9 @@ def forward_backward_no_pipelining( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext @@ -898,6 +904,9 @@ def forward_backward_pipelining_with_interleaving( adjust_tensor_shapes_fn is None ), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism" + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + if config.overlap_p2p_comm and config.batch_p2p_comm: raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") @@ -2043,6 +2052,9 @@ def forward_backward_pipelining_without_interleaving( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + if not forward_only and config.fine_grained_activation_offloading: + fine_grained_offloading_reset() + # Disable async grad reductions no_sync_func = config.no_sync_func if no_sync_func is None: diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 54cac0e41e3..2ae15bef0d9 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch @@ -510,10 +510,11 @@ def forward(ctx, run_function, checkpoint_without_output_obj, *args): @staticmethod def backward(ctx, *args): """Backward pass.""" - inputs = ctx.saved_tensors + inputs = ctx.inputs outputs = ctx.outputs torch.autograd.backward(outputs, args) ctx.outputs = None + ctx.inputs = None grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs) return (None, None) + grads @@ -573,8 +574,9 @@ def _recompute(self, _): recompute_ctx = contextlib.nullcontext() fp8_ctx = contextlib.nullcontext() + inputs = self.ctx.saved_tensors with torch.enable_grad(), fp8_ctx, recompute_ctx: - outputs = self.run_function(*self.ctx.saved_tensors) + outputs = self.run_function(*inputs) self.run_function = None self.rng_states = None @@ -590,6 +592,7 @@ def _recompute(self, _): output.untyped_storage().copy_(recomputation_output.untyped_storage()) self.ctx.outputs = outputs + self.ctx.inputs = inputs self.outputs = None self.ctx = None diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index d4e990041ca..3427b5ee3ab 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod from dataclasses import dataclass @@ -22,6 +22,11 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule @@ -188,6 +193,21 @@ def __init__( and "core_attn" in self.config.recompute_modules ) + self.offload_qkv_linear = ( + self.config.fine_grained_activation_offloading + and "qkv_linear" in self.config.offload_modules + ) + + self.offload_core_attention = ( + self.config.fine_grained_activation_offloading + and "core_attn" in self.config.offload_modules + ) + + self.offload_attn_proj = ( + self.config.fine_grained_activation_offloading + and "attn_proj" in self.config.offload_modules + ) + # Output. self.linear_proj = build_module( submodules.linear_proj, @@ -730,9 +750,17 @@ def forward( if output_gate: assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor." - qkv_output = self.get_query_key_value_tensors( - hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv - ) + if self.offload_qkv_linear: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="qkv_linear") + with get_fine_grained_offloading_context(self.offload_qkv_linear): + qkv_output = self.get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) + if self.offload_qkv_linear: + qkv_output, _ = fine_grained_offloading_group_commit( + qkv_output, name="qkv_linear", forced_released_tensors=[hidden_states] + ) + attn_mask_type = self.attn_mask_type block_table = None gate = None @@ -881,17 +909,20 @@ def forward( packed_seq_params=packed_seq_params, ) else: + if self.offload_core_attention and self.training: + query = fine_grained_offloading_group_start(query, name="core_attn") if inference_context is None or inference_context.is_static_batching(): # Static batching attention kernel. - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - attn_mask_type=attn_mask_type, - attention_bias=attention_bias, - packed_seq_params=packed_seq_params, - ) + with get_fine_grained_offloading_context(self.offload_core_attention): + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + ) else: # Dynamic batching attention kernel. @@ -911,6 +942,10 @@ def forward( block_table, ) core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') + if self.offload_core_attention and self.training: + (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] + ) if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': # reshape to same output shape as unpacked case @@ -931,7 +966,14 @@ def forward( # ================= nvtx_range_push(suffix="linear_proj") - output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") + with get_fine_grained_offloading_context(self.offload_attn_proj): + output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + output, bias = fine_grained_offloading_group_commit( + output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + ) nvtx_range_pop(suffix="linear_proj") return output, bias diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index 0a933aed0df..a44daea38e2 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -210,6 +210,20 @@ Enable A2A overlap across different batches inspired by the DSv3 DualPipe implme --delay-wgrad-compute ``` +### Fine-grained Activation Offloading (collaborated with rednote) +Offload the input activation at the granularity of modules + +**Usage** +```bash +# Enable fine-grained activation offloading +--fine-grained-activation-offloading + +# Specify which modules are going to offload its input +# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". +--offload-modules expert_fc1 +``` +For more details, please refer to the ```docs/source/api-guide/fine_grained_activation_offloading.md``` + ### MoE Related Arguments | Item | Description | | --- | --- | diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index d0ac20a7536..ca308da0d21 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy import itertools @@ -27,6 +27,11 @@ from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl from megatron.core.jit import jit_fuser +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.tensor_parallel.layers import ( _initialize_affine_weight_cpu, _initialize_affine_weight_gpu, @@ -825,6 +830,16 @@ def __init__( tp_group=pg_collection.expt_tp, ) + self.offload_expert_fc1 = ( + self.config.fine_grained_activation_offloading + and "expert_fc1" in self.config.offload_modules + ) + + self.offload_moe_act = ( + self.config.fine_grained_activation_offloading + and "moe_act" in self.config.offload_modules + ) + self.activation_recompute = ( self.config.recompute_granularity == 'selective' and "moe_act" in self.config.recompute_modules @@ -834,6 +849,12 @@ def __init__( set_save_original_input(self.linear_fc2) + # This is to avoid the CPU overhead of multiple d2h copies + if self.offload_expert_fc1 and not (self.config.fp8 or self.config.fp4): + from megatron.core.extensions.transformer_engine import set_save_original_input + + set_save_original_input(self.linear_fc1) + if self.config.fp8 or self.config.fp4: assert HAVE_TE, "FP8 and FP4 requires TE." self.quantization_padding = Fp8Padding(self.num_local_experts) @@ -898,9 +919,21 @@ def forward( # Probs already applied, so reset to 1. permuted_probs = torch.ones_like(permuted_probs) - intermediate_parallel, bias_parallel = self.linear_fc1( - permuted_local_hidden_states, tokens_per_expert - ) + if self.offload_expert_fc1: + permuted_local_hidden_states = fine_grained_offloading_group_start( + permuted_local_hidden_states, name="expert_fc1" + ) + with get_fine_grained_offloading_context(self.offload_expert_fc1): + fc1_output, bias_parallel = self.linear_fc1( + permuted_local_hidden_states, tokens_per_expert + ) + if self.offload_expert_fc1: + fc1_output, bias_parallel = fine_grained_offloading_group_commit( + fc1_output, + bias_parallel, + name="expert_fc1", + forced_released_tensors=[permuted_local_hidden_states], + ) def bias_act_func(intermediate_parallel, bias_parallel, permuted_probs): if self.config.use_te_activation_func: @@ -960,18 +993,26 @@ def glu(x): intermediate_parallel = intermediate_parallel.to(original_dtype) return intermediate_parallel + if self.offload_moe_act: + fc1_output = fine_grained_offloading_group_start(fc1_output, name="moe_act") + if self.activation_recompute: self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput() - intermediate_parallel = self.activation_checkpoint.checkpoint( - bias_act_func, intermediate_parallel, bias_parallel, permuted_probs - ) - output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) - self.activation_checkpoint.discard_output_and_register_recompute(output) + with get_fine_grained_offloading_context(self.offload_moe_act): + bias_act_output = self.activation_checkpoint.checkpoint( + bias_act_func, fc1_output, bias_parallel, permuted_probs + ) else: - intermediate_parallel = bias_act_func( - intermediate_parallel, bias_parallel, permuted_probs + with get_fine_grained_offloading_context(self.offload_moe_act): + bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs) + + output, output_bias = self.linear_fc2(bias_act_output, tokens_per_expert) + if self.activation_recompute: + self.activation_checkpoint.discard_output_and_register_recompute(output) + if self.offload_moe_act: + (output,) = fine_grained_offloading_group_commit( + output, name="moe_act", forced_released_tensors=[fc1_output] ) - output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) # upad and concat the output if self.config.fp8 or self.config.fp4: diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index a8893ebec36..5d3f16c1041 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import math @@ -22,6 +22,11 @@ _yarn_get_mscale, apply_rotary_pos_emb, ) +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.layers import ColumnParallelLinear from megatron.core.tensor_parallel.mappings import ( @@ -266,15 +271,19 @@ def forward( query, key, value, attention_mask, packed_seq_params=packed_seq_params ) else: + if self.offload_core_attention and self.training: + query = fine_grained_offloading_group_start(query, name="core_attn") + if inference_context is None or inference_context.is_static_batching(): - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - packed_seq_params=packed_seq_params, - attn_mask_type=attn_mask_type, - ) + with get_fine_grained_offloading_context(self.offload_core_attention): + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) elif self.cache_mla_latents: # Dynamic batching attention kernel. q, k, v = (query, key, value) @@ -295,6 +304,10 @@ def forward( # Only rearrange if not in absorption mode (Flash MLA handles format correctly) if not inference_context.is_decode_only(): core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') + if self.offload_core_attention and self.training: + (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] + ) # We are doing absorption with cache mla latents and decode mode. if self.cache_mla_latents and inference_context.is_decode_only(): @@ -320,7 +333,14 @@ def forward( # ================= # Output. [sq, b, h] # ================= - output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") + with get_fine_grained_offloading_context(self.offload_attn_proj): + output, bias = self.linear_proj(core_attn_out) + if self.offload_attn_proj: + output, bias = fine_grained_offloading_group_commit( + output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + ) return output, bias diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index bd3aa9c8c96..a619b9ffa55 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from contextlib import nullcontext from dataclasses import dataclass @@ -13,6 +13,9 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( @@ -901,6 +904,8 @@ def forward( hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] for layer_number in range(len(self.layers)): + if self.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer(layer_number == len(self.layers) - 1) (hidden_states, input_ids, position_ids) = self.layers[layer_number]( input_ids=input_ids, position_ids=position_ids, diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index aead6133f22..06e8f1372f4 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging from contextlib import nullcontext from dataclasses import dataclass @@ -16,6 +16,9 @@ from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.inference.contexts import BaseInferenceContext from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import LayerType @@ -693,6 +696,11 @@ def forward( else: inner_quantization_context = nullcontext() + if self.config.fine_grained_activation_offloading: + fine_grained_offloading_set_last_layer( + l_no == self.num_layers_per_pipeline_rank - 1 + ) + with self.offload_context, inner_quantization_context: hidden_states, context = layer( hidden_states=hidden_states, diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index b39b7706feb..ecc700375cd 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import warnings from dataclasses import dataclass @@ -772,6 +772,25 @@ class TransformerConfig(ModelParallelConfig): """Transformer implementation to use. Options are 'transformer_engine' for Transformer Engine and 'local' for MCore.""" + ##################################### + # Fine-grained Activation Offloading + ##################################### + fine_grained_activation_offloading: bool = False + """If True, offload the input of the specified modules to the CPU.""" + + offload_modules: Optional[list[str]] = None + """The submodules to offload its input. + choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". + "attn_norm": offload the input of the normalization in the attention part. + "core_attn": offload the input of the core attention part. + "mlp_norm": offload the input of the normalization in the mlp part. + "attn_proj": offload the input of the attn linear projection part. + "expert_fc1": offload the input of the expert fc1 part. + "moe_act": offload the input of the moe act part. + """ + min_offloaded_tensor_size: int = 1024 * 1024 + """The minimum size of the tensor to be offloaded.""" + def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more @@ -1117,6 +1136,28 @@ def __post_init__(self): if "moe" not in self.recompute_modules: self.recompute_modules.append("moe") + if self.fine_grained_activation_offloading: + assert self.offload_modules is not None and len(self.offload_modules) > 0 + allowed_modules = { + "core_attn", + "attn_proj", + "expert_fc1", + "moe_act", + "attn_norm", + "mlp_norm", + } + invalid_modules = set(self.offload_modules) - allowed_modules + assert not invalid_modules, ( + f'Invalid choices for offload_modules: {invalid_modules}. ' + f'Allowed modules are: {allowed_modules}' + ) + if "attn_proj" in self.offload_modules and "core_attn" not in self.offload_modules: + raise ValueError( + "attn_proj cannot be set to offload_modules alone without core_attn " + "because the input of attn_proj is the output of core_attn, " + "which is needed in core_attn.backward()." + ) + if ( self.num_layers_in_first_pipeline_stage is not None or self.num_layers_in_last_pipeline_stage is not None diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index a5babece9d0..c36ff7515e4 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging import warnings @@ -397,6 +397,16 @@ def __init__( if "mlp" in self.config.recompute_modules: if not isinstance(self.mlp, MoELayer): self.recompute_mlp = True + self.offload_attn_norm = ( + self.config.fine_grained_activation_offloading + and "attn_norm" in self.config.offload_modules + and not isinstance(self.input_layernorm, IdentityOp) + ) + self.offload_mlp_norm = ( + self.config.fine_grained_activation_offloading + and "mlp_norm" in self.config.offload_modules + and not isinstance(self.pre_mlp_layernorm, IdentityOp) + ) # @jcasper how should we handle nvfuser? # Set bias+dropout+add fusion grad_enable execution handler. @@ -479,20 +489,29 @@ def _forward_attention( context (Tensor): Updated context tensor if cross-attention is used, otherwise None. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, + ) inference_context = deprecate_inference_params(inference_context, inference_params) # Residual connection. residual = hidden_states + if self.offload_attn_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="attn_norm") # Optional Input Layer norm if self.recompute_input_layernorm: self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( - self.input_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(self.offload_attn_norm): + input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( + self.input_layernorm, hidden_states + ) else: - input_layernorm_output = self.input_layernorm(hidden_states) + with get_fine_grained_offloading_context(self.offload_attn_norm): + input_layernorm_output = self.input_layernorm(hidden_states) # Self attention. nvtx_range_push(suffix="self_attention") @@ -526,6 +545,11 @@ def _forward_attention( ) nvtx_range_pop(suffix="self_attn_bda") + if self.offload_attn_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="attn_norm", forced_released_tensors=[residual] + ) + # Residual connection. residual = hidden_states @@ -563,17 +587,27 @@ def _forward_mlp(self, hidden_states, inference_context=None): output (Tensor): Transformed hidden states of shape [s, b, h]. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, + ) + # Residual connection. residual = hidden_states + if self.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") # Optional Layer norm post the cross-attention. if self.recompute_pre_mlp_layernorm: self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( - self.pre_mlp_layernorm, hidden_states - ) + with get_fine_grained_offloading_context(self.offload_mlp_norm): + pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( + self.pre_mlp_layernorm, hidden_states + ) else: - pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + with get_fine_grained_offloading_context(self.offload_mlp_norm): + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) nvtx_range_push(suffix="mlp") # Potentially chunk the MLP computation during prefill to minimize the peak activation size @@ -633,6 +667,10 @@ def _forward_mlp(self, hidden_states, inference_context=None): mlp_output_with_bias, residual, self.hidden_dropout ) nvtx_range_pop(suffix="mlp_bda") + if self.offload_mlp_norm: + (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states, name="mlp_norm", forced_released_tensors=[residual] + ) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index bdf915a8ae1..8e5f343b73c 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1216,6 +1216,10 @@ def validate_args(args, defaults={}): "when enabling delay_wgrad_compute" ) + if args.fine_grained_activation_offloading: + assert args.transformer_impl == 'transformer_engine', \ + "Fine-grained activation offloading is only supported with transformer_engine implementation" + if args.mtp_num_layers: assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", ( @@ -2327,7 +2331,12 @@ def _add_training_args(parser): help='The communicator group names to use high priority streams.') group.add_argument('--use-te-activation-func', action='store_true', help='Use activation function kernel from Transformer Engine in MLP module.') - + group.add_argument('--fine-grained-activation-offloading', action='store_true', + help='Enable fine-grained activation offloading.') + group.add_argument('--offload-modules', nargs='*', type=str, default=[], + help='The submodules to offload its input. Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".') + group.add_argument('--min-offloaded-tensor-size', type=int, default=1024*1024, + help='The minimum size of the tensor to be offloaded.') return parser diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json new file mode 100644 index 00000000000..30ea509a50b --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json @@ -0,0 +1,110 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.0637, + "5": 9.48263, + "10": 9.04035, + "15": 8.00837, + "20": 7.88364, + "25": 7.67597, + "30": 7.63447, + "35": 7.21393, + "40": 7.55564, + "45": 7.21045, + "50": 7.05439 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 38802064.0, + "5": 394456256.0, + "10": 571185472.0, + "15": 699100416.0, + "20": 891692160.0, + "25": 748799104.0, + "30": 794511296.0, + "35": 671593792.0, + "40": 421718816.0, + "45": 517934176.0, + "50": 472902496.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 6025468416.0, + "5": 6025470464.0, + "10": 6025470464.0, + "15": 6025470464.0, + "20": 6025470464.0, + "25": 6025470464.0, + "30": 6025470464.0, + "35": 6025470464.0, + "40": 6025470464.0, + "45": 6025470464.0, + "50": 6025470464.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 45099868160.0, + "5": 49175810048.0, + "10": 49175810048.0, + "15": 49175810048.0, + "20": 49175810048.0, + "25": 49175810048.0, + "30": 49211260928.0, + "35": 49211260928.0, + "40": 49211260928.0, + "45": 49211260928.0, + "50": 49211260928.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.04508, + "5": 9.76285, + "10": 9.04997, + "15": 7.93865, + "20": 7.79984, + "25": 7.60324, + "30": 7.56633, + "35": 7.13802, + "40": 7.45784, + "45": 7.11892, + "50": 6.9559 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 52.8667, + "5": 2.06295, + "10": 1.09336, + "15": 1.10509, + "20": 1.08631, + "25": 1.08991, + "30": 1.10548, + "35": 1.10049, + "40": 1.11219, + "45": 1.09542, + "50": 1.09805 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json new file mode 100644 index 00000000000..30ea509a50b --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json @@ -0,0 +1,110 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.0637, + "5": 9.48263, + "10": 9.04035, + "15": 8.00837, + "20": 7.88364, + "25": 7.67597, + "30": 7.63447, + "35": 7.21393, + "40": 7.55564, + "45": 7.21045, + "50": 7.05439 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 38802064.0, + "5": 394456256.0, + "10": 571185472.0, + "15": 699100416.0, + "20": 891692160.0, + "25": 748799104.0, + "30": 794511296.0, + "35": 671593792.0, + "40": 421718816.0, + "45": 517934176.0, + "50": 472902496.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 6025468416.0, + "5": 6025470464.0, + "10": 6025470464.0, + "15": 6025470464.0, + "20": 6025470464.0, + "25": 6025470464.0, + "30": 6025470464.0, + "35": 6025470464.0, + "40": 6025470464.0, + "45": 6025470464.0, + "50": 6025470464.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 45099868160.0, + "5": 49175810048.0, + "10": 49175810048.0, + "15": 49175810048.0, + "20": 49175810048.0, + "25": 49175810048.0, + "30": 49211260928.0, + "35": 49211260928.0, + "40": 49211260928.0, + "45": 49211260928.0, + "50": 49211260928.0 + } + }, + "mtp_1 loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.04508, + "5": 9.76285, + "10": 9.04997, + "15": 7.93865, + "20": 7.79984, + "25": 7.60324, + "30": 7.56633, + "35": 7.13802, + "40": 7.45784, + "45": 7.11892, + "50": 6.9559 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 52.8667, + "5": 2.06295, + "10": 1.09336, + "15": 1.10509, + "20": 1.08631, + "25": 1.08991, + "30": 1.10548, + "35": 1.10049, + "40": 1.11219, + "45": 1.09542, + "50": 1.09805 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml new file mode 100644 index 00000000000..9a125a1cf74 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -0,0 +1,139 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 32 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + # NOTE: uncomment if TE >= 2.9.0 + # --overlap-grad-reduce: true + # --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --manual-gc: true + --manual-gc-interval: 100 + --recompute-granularity: selective + --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" + --fine-grained-activation-offloading: true + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + # Transformer Engine args + --transformer-impl: transformer_engine + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + # Add network size args + --num-layers: 15 + --moe-layer-freq: ([0]*3+[1]*12) + --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6mL # Et*3|(tt|)*6mL + --hidden-size: 1024 + --ffn-hidden-size: 4096 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + # Comment out the following MTP args to disable MTP + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add MoE args + --num-experts: 32 + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-token-dispatcher-type: alltoall + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 25 + # Add initialization args + --init-method-std: 0.02 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add mixed precision args + --bf16: true + --exit-interval: 50 + --overlap-moe-expert-parallel-comm: true +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json new file mode 100644 index 00000000000..3687e19e563 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json @@ -0,0 +1,92 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.04266, + "5": 9.38536, + "10": 8.82761, + "15": 7.86966, + "20": 7.72022, + "25": 7.53119, + "30": 7.5026, + "35": 7.10343, + "40": 7.42037, + "45": 7.07056, + "50": 6.90946 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 844114112.0, + "5": 856834688.0, + "10": 928751040.0, + "15": 952825152.0, + "20": 987111232.0, + "25": 926008384.0, + "30": 864767232.0, + "35": 855095360.0, + "40": 849505920.0, + "45": 847187584.0, + "50": 846195840.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 4419107328.0, + "5": 4419108864.0, + "10": 4419108864.0, + "15": 4419108864.0, + "20": 4419108864.0, + "25": 4419108864.0, + "30": 4419108864.0, + "35": 4419108864.0, + "40": 4419108864.0, + "45": 4419108864.0, + "50": 4419108864.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 37959917568.0, + "5": 39583289344.0, + "10": 39583289344.0, + "15": 39583289344.0, + "20": 39583289344.0, + "25": 39583289344.0, + "30": 39583289344.0, + "35": 39583289344.0, + "40": 39583289344.0, + "45": 39583289344.0, + "50": 39583289344.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 58.78709, + "5": 2.40565, + "10": 1.13046, + "15": 1.39764, + "20": 1.1273, + "25": 1.12154, + "30": 1.03587, + "35": 1.09545, + "40": 1.09901, + "45": 1.00656, + "50": 1.00794 + } + } +} diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json new file mode 100644 index 00000000000..3687e19e563 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json @@ -0,0 +1,92 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 11.04266, + "5": 9.38536, + "10": 8.82761, + "15": 7.86966, + "20": 7.72022, + "25": 7.53119, + "30": 7.5026, + "35": 7.10343, + "40": 7.42037, + "45": 7.07056, + "50": 6.90946 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 844114112.0, + "5": 856834688.0, + "10": 928751040.0, + "15": 952825152.0, + "20": 987111232.0, + "25": 926008384.0, + "30": 864767232.0, + "35": 855095360.0, + "40": 849505920.0, + "45": 847187584.0, + "50": 846195840.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 4419107328.0, + "5": 4419108864.0, + "10": 4419108864.0, + "15": 4419108864.0, + "20": 4419108864.0, + "25": 4419108864.0, + "30": 4419108864.0, + "35": 4419108864.0, + "40": 4419108864.0, + "45": 4419108864.0, + "50": 4419108864.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 37959917568.0, + "5": 39583289344.0, + "10": 39583289344.0, + "15": 39583289344.0, + "20": 39583289344.0, + "25": 39583289344.0, + "30": 39583289344.0, + "35": 39583289344.0, + "40": 39583289344.0, + "45": 39583289344.0, + "50": 39583289344.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 5, + "values": { + "1": 58.78709, + "5": 2.40565, + "10": 1.13046, + "15": 1.39764, + "20": 1.1273, + "25": 1.12154, + "30": 1.03587, + "35": 1.09545, + "40": 1.09901, + "45": 1.00656, + "50": 1.00794 + } + } +} diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml new file mode 100644 index 00000000000..8832d687004 --- /dev/null +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml @@ -0,0 +1,134 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + # NOTE: uncomment if TE >= 2.9.0 + # --overlap-grad-reduce: true + # --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --manual-gc: true + --manual-gc-interval: 100 + --recompute-granularity: selective + --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" + --fine-grained-activation-offloading: true + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + # Transformer Engine args + --transformer-impl: transformer_engine + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + # Add network size args + --num-layers: 15 + --moe-layer-freq: ([0]*3+[1]*12) + --pipeline-model-parallel-layout: Et*3\\|\\(tt\\|\\)*6L # Et*3|(tt|)*6L + --hidden-size: 1024 + --ffn-hidden-size: 4096 + --num-attention-heads: 32 + --kv-channels: 128 + --max-position-embeddings: 4096 + --position-embedding-type: rope + --rotary-base: 10000 + --make-vocab-size-divisible-by: 3232 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --swiglu: true + --untie-embeddings-and-output-weights: true + --multi-latent-attention: true + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + --qk-layernorm: true + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add MoE args + --num-experts: 32 + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-token-dispatcher-type: alltoall + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + --moe-permute-fusion: true + # Add MLA args + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 25 + # Add initialization args + --init-method-std: 0.02 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add mixed precision args + --bf16: true + --exit-interval: 50 +TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular +METRICS: + - "iteration-time" + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 8164ca37df8..63320ae3c3d 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -124,6 +124,16 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - test_case: [gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] ####################################################################### # Super important MR tests that run for both DEV and LTS per MR # ####################################################################### diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py new file mode 100644 index 00000000000..edec95288c2 --- /dev/null +++ b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py @@ -0,0 +1,187 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import gc + +import pytest +import torch + +EPSILON = 0.1 + +# Skip all tests if CUDA is not available +cuda_available = torch.cuda.is_available() + + +def _reset_cuda_memory(): + gc.collect() + if cuda_available: + torch.cuda.empty_cache() + + +class ToyModel(torch.nn.Module): + def __init__(self, hidden_size: int = 2048, num_layers: int = 4, dtype=torch.bfloat16): + super().__init__() + layers = [] + for _ in range(num_layers): + layers.append( + torch.nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device="cuda") + ) + self.net = torch.nn.Sequential(*layers).to(device="cuda", dtype=dtype) + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dtype = dtype + + # Prevent weights/bias from being considered activation tensors for offload; + # ensure we only count activation tensors (inputs x) in memory accounting. + for p in self.parameters(): + try: + setattr(p, "offloading_activation", False) + except Exception: + pass + + def forward(self, x, use_offload: bool = False): + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + if use_offload: + # Initialize a new chunk (microbatch) and enable offload context. + with off.get_fine_grained_offloading_context(True): + off.fine_grained_offloading_init_chunk_handler( + vp_stage=None, min_offloaded_tensor_size=1 + ) + for i, layer in enumerate(self.net): + # Group by module; with this linear-only model, each group corresponds to a layer. + off.fine_grained_offloading_set_last_layer(i == len(self.net) - 1) + x = off.fine_grained_offloading_group_start(x, name=f"layer_{i}") + x = layer(x) + # Commit the group; returns a tuple of tensors + (x,) = off.fine_grained_offloading_group_commit( + x, name=f"layer_{i}", forced_released_tensors=[] + ) + return x + # Baseline path (no offload hooks) + with ( + torch.autocast(device_type="cuda", dtype=self.dtype) + if self.dtype in (torch.float16, torch.bfloat16) + else torch.cuda.amp.autocast(enabled=False) + ): + for layer in self.net: + x = layer(x) + return x + + +@pytest.fixture(autouse=True) +def _monkeypatch_offload_deps(monkeypatch): + # Avoid requiring torch.distributed initialization and NVML in tests + import megatron.core.pipeline_parallel.fine_grained_activation_offload as off + + monkeypatch.setattr(off, "debug_rank", lambda *args, **kwargs: None, raising=False) + monkeypatch.setattr(off, "set_ideal_affinity_for_current_gpu", lambda: None, raising=False) + # Ensure a clean state each test + off.fine_grained_offloading_reset() + yield + off.fine_grained_offloading_reset() + + +def test_fine_grained_activation_offload_memory_reduction(): + torch.manual_seed(1234) + # Use a linear-only stack so theoretical saved memory equals sum of per-layer input x bytes. + model = ToyModel(hidden_size=2048, num_layers=8, dtype=torch.bfloat16).eval() + + # Create input + inp = torch.randn( + (2048, model.hidden_size), device="cuda", dtype=torch.bfloat16, requires_grad=True + ) + + # Warmup to stabilize allocator behavior + _reset_cuda_memory() + out = model(inp, use_offload=False) + (out.sum()).backward() + torch.cuda.synchronize() + _reset_cuda_memory() + + # Baseline memory measurement (no offload) + _reset_cuda_memory() + inp_baseline = inp.detach().clone().requires_grad_(True) + baseline_mem_before = torch.cuda.memory_allocated() / (1024**2) + out_base = model(inp_baseline, use_offload=False) + baseline_mem_after = (torch.cuda.memory_allocated() - out_base.nbytes) / (1024**2) + (out_base.sum()).backward() + torch.cuda.synchronize() + baseline_delta = baseline_mem_after - baseline_mem_before + + # Offload memory measurement + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + off.fine_grained_offloading_reset() + _reset_cuda_memory() + inp_off = inp.detach().clone().requires_grad_(True) + offload_mem_before = torch.cuda.memory_allocated() / (1024**2) + out_off = model(inp_off, use_offload=True) + offload_mem_after = (torch.cuda.memory_allocated() - out_off.nbytes) / (1024**2) + (out_off.sum()).backward() + torch.cuda.synchronize() + offload_delta = offload_mem_after - offload_mem_before + + # Offload should reduce peak cached memory usage after forward + assert ( + offload_delta < baseline_delta + ), f"offload did not reduce memory: off={offload_delta:.2f}MiB base={baseline_delta:.2f}MiB" + + # Theoretical savings: storing per-layer input x (same shape each layer). + bytes_per_elem = inp.element_size() # 2 for bfloat16 + input_bytes = inp.numel() * bytes_per_elem + # -2 because the first and last activations are not offloaded + expected_saved_mib = (model.num_layers - 2) * (input_bytes / (1024**2)) + + # Actual savings ≈ baseline_delta - offload_delta (both exclude output tensor memory). + actual_saved_mib = baseline_delta - offload_delta + + # Allow slack for allocator jitter and extra intermediates; magnitudes should match. + rel_err = abs(actual_saved_mib - expected_saved_mib) / max(expected_saved_mib, 1e-6) + assert ( + rel_err <= EPSILON + ), f"saved mismatch: actual={actual_saved_mib:.2f}MiB expected~={expected_saved_mib:.2f}MiB (rel_err={rel_err:.2f})" + + +def test_fine_grained_activation_offload_output_and_grad_consistency(): + torch.manual_seed(2025) + hidden = 1024 + layers = 3 + + # Create identical models by resetting seed + torch.manual_seed(2025) + model_base = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() + torch.manual_seed(2025) + model_off = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() + + # Same input and target + inp = torch.randn((32, hidden), device="cuda", dtype=torch.bfloat16, requires_grad=True) + target = torch.randn_like(inp) + + # Baseline forward/backward + out_base = model_base(inp, use_offload=False) + loss_base = torch.nn.functional.mse_loss(out_base, target) + loss_base.backward() + grads_base = [ + p.grad.detach().clone() if p.grad is not None else None for p in model_base.parameters() + ] + + # Offload forward/backward + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + + off.fine_grained_offloading_reset() + out_off = model_off(inp.detach().clone().requires_grad_(True), use_offload=True) + loss_off = torch.nn.functional.mse_loss(out_off, target) + loss_off.backward() + grads_off = [ + p.grad.detach().clone() if p.grad is not None else None for p in model_off.parameters() + ] + + # Compare outputs + assert torch.allclose(out_off.float(), out_base.float(), rtol=1e-3, atol=1e-3) + + # Compare gradients parameter-wise + for gb, go in zip(grads_base, grads_off): + if gb is None and go is None: + continue + assert gb is not None and go is not None + assert torch.allclose(go.float(), gb.float(), rtol=1e-3, atol=1e-3) From 65c8f40b4e8df619b5c829c699b353fe7ee6894d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 12:36:12 +0000 Subject: [PATCH 075/248] tests: Fix paths for test_cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../model_config.yaml | 6 +++--- .../model_config.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml index 9a125a1cf74..d9ec0456190 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -42,9 +42,9 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 15 diff --git a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml index 8832d687004..f4b64722712 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml @@ -42,9 +42,9 @@ MODEL_ARGS: # Data args --seq-length: 4096 --data-cache-path: ${DATA_CACHE_PATH} - --data-path: ${DATA_PATH}/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/bpe/vocab.json - --merge-file: ${DATA_PATH}/bpe/merges.txt + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt --split: 949,50,1 # Add network size args --num-layers: 15 From 2155c47d19fa2af5e10160194d6b7a79695f091f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 27 Oct 2025 13:38:29 +0000 Subject: [PATCH 076/248] Revert "[Dev] feat(moe): Fine-grained activation offloading (#1912)" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 9069e1268f495407598d9f6771e363737505dab7. Signed-off-by: oliver könig --- .../fine_grained_activation_offloading.md | 29 - docs/source/api-guide/index.rst | 1 - .../offloading_and_recomputing.png | Bin 332427 -> 0 bytes .../core/extensions/transformer_engine.py | 12 +- .../common/model_chunk_schedule_plan.py | 9 +- .../core/models/gpt/fine_grained_callables.py | 23 +- megatron/core/models/gpt/gpt_model.py | 27 +- .../fine_grained_activation_offload.py | 603 ------------------ megatron/core/pipeline_parallel/schedules.py | 14 +- megatron/core/tensor_parallel/random.py | 9 +- megatron/core/transformer/attention.py | 70 +- megatron/core/transformer/moe/README.md | 14 - megatron/core/transformer/moe/experts.py | 65 +- .../transformer/multi_latent_attention.py | 40 +- .../transformer/multi_token_prediction.py | 7 +- .../core/transformer/transformer_block.py | 10 +- .../core/transformer/transformer_config.py | 43 +- .../core/transformer/transformer_layer.py | 56 +- megatron/training/arguments.py | 11 +- .../golden_values_dev_coreweave.json | 110 ---- .../golden_values_dev_eos.json | 110 ---- .../model_config.yaml | 139 ---- .../golden_values_dev_coreweave.json | 92 --- .../golden_values_dev_eos.json | 92 --- .../model_config.yaml | 134 ---- tests/test_utils/recipes/moe.yaml | 10 - ...test_fine_grained_activation_offloading.py | 187 ------ 27 files changed, 61 insertions(+), 1856 deletions(-) delete mode 100644 docs/source/api-guide/fine_grained_activation_offloading.md delete mode 100644 docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png delete mode 100644 megatron/core/pipeline_parallel/fine_grained_activation_offload.py delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_coreweave.json delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_eos.json delete mode 100644 tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml delete mode 100644 tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py diff --git a/docs/source/api-guide/fine_grained_activation_offloading.md b/docs/source/api-guide/fine_grained_activation_offloading.md deleted file mode 100644 index b4c2ea753fa..00000000000 --- a/docs/source/api-guide/fine_grained_activation_offloading.md +++ /dev/null @@ -1,29 +0,0 @@ -# Fine-grained Activation Offloading (collaborated with rednote) - -Memory capacity is more and more important with the rising of extreme sparse MoE models like DeepSeek-V3 and Qwen3-235B. Fine-grained recomputing reduces the memory footprint at the cost of extra recomputation, while offloading could utilize the host-device bandwidth to achieve nearly zero-overhead. Fine-grained Activation Offloading targets at offloading the activation at the granularity of specific modules, so that we can calibrate the amount of offloading activation to maximize the training throughput. - -**Features** -* Support PP=1/PP/Interleaved PP -* Compatible with fine-grained recomputation -* Support FP8 -* Support MTP -* Support mixed dense & moe layer -* Support A2A Overlap -* Support CUDA Graph - * (Temporary) cuda graph scope cannot contains the offloading modules - -**Usage** -```bash -# Enable fine-grained activation offloading ---fine-grained-activation-offloading - -# Specify which modules are going to offload its input -# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". ---offload-modules expert_fc1 -``` -**Compatible with Fine-grained Recomputation** -- For modules with minor perf overhead like layernorm or moe_act, use recomputing to reduce memory footprint; -- For other modules, use offloading to reduce memory footprint; -- Make sure the offloading/reloading could be overlapped with computing; - -![Fine-grained Activation Offloading and Fine-grained Recomputation](../images/fine_grained_activation_offloading/offloading_and_recomputing.png) diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst index ac6d7cb0b2d..710a7caf4de 100644 --- a/docs/source/api-guide/index.rst +++ b/docs/source/api-guide/index.rst @@ -22,4 +22,3 @@ API Guide optimizer_cpu_offload multi_token_prediction tokenizers - fine_grained_activation_offloading diff --git a/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png deleted file mode 100644 index 6c8afa78bb180a0815aff02693690b864e9b01f8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 332427 zcmeFZXH-*bw+4z>P!Lfeq7L@*-C?8~5C;#x)#z@6hLxX~se14JQ{F&<%XMbHn z{-QX;L~;Jla|#O8GtB>->zv{K>l#W5iZBO?bAMf9Og{cPRsHqy&+%;bng8A~oAU3g zFBfK?{rmjFujimh(_@$-A1=L8e(p&@A>8)s`wX|W@iYa+BMOzLkM;b{Y|c{E^DAT! zcOzcEet(CTdP)Ck)tyGC_Sct|nI?BX{XH#RoPUn2RrZz8KNTrPak z{?_uwgRA#0UI#ore?V%T8BLe-Y$_=ZEJk?^1fl&QcD9YhB_$x%g2uqXRMC6qu0EnT z^Y6>QGWZt<|C+(SRPZko{>y~_GU2~W_%9Rw%Y^?j;lE7y|6?W$b~q=NX=~l3@V{{y zTuPZ%SnI?ssCy#nG*DHsshXO*>~>Z?pu3UEUQnOyH3j9xJO7_A`$AtvPKK6d26K`! z+`4Z$?a?a3(u!t?!%{z)E0r(axeuq@JiF%mV=c967$&$X@v($e9!WoaLk0}g4XbbP z{pruwLY%BzG|fBUU~vWYfi~s0y(p*c4_X*}wN34K%F7H9qJVLHZo~nRveQ32i?6jn zLesr;Nxd`q`=Db#J%hNusTQooQaS{BbJ;h1l%YVg8KRRl^z7Sr5pM(}`3=}#9ZKBy z%=GlwOeSGNB%iiXO5&O2BSNR74ytuGw2oYdT16G&gfI42P!L`pnbkI|@q3l3Ne-?! zDyhoCu#$PvW5Neq1**@k{8ICsf=8Q1N*2zVE*nq&he$76l`!&dwDK-pWd0EJP=_0= zmXd=bgmPY|{O!)06#8R6={x6Zsn$}5d6E0GZ9xY6q*#5|T4kFfg#dls9k zpl>snd{v9w!^R~Yg_;=5{>E}P+COxuF??JdKk^M!Q0NL-7X~#cF=c*swK#Bg8od>} z{;}mIrR0#Lqz`|WREH_s2T%BeYEE#sJdG}h^5SKIwxjXk_En_CclV>4?sZ@gtL0&q zNXtF|u79GstKUfgR2ax;x`WSO`V-E<>6GbBt7Ex>zfnB(jh#{Sy>p>gCk+z1ZfoDN zY#e+43npJ~QW9;N(}up5ZWsH%n@vamIH3xGMrDp^yr2PC6x2J_jd06yQSy@{pMUVY z&kV`lQ=%rJJ|^*P=e|2Nqyt>y1X7s|p#cTFayiP9nT19Z2QuTh=OMs*?rDwpK*JtE zFW%loUr|rwKWA{UzfTesp0ae7yd>T>$Y*9DaQAF8chs-v7}j_(64RIV-bpQlaE#l}DBJJHx)9K66_;g)-*ZwZEc|V4ia)G(H5Mna3>+g~ zTD^SolekVFsBA}LN+wxn({gk6RU*_)O0l2&OUuUbm!-7u=OijFi>ERMyVji_dPome zE%MXbVobV#mD`Cb!Z-gHodt(5`n^4O@&+WN{fh2ud&L!sJNJ>4rbf zCcHo=sN9lWe7t(Tk^zz>ee|*iX;WCeekS2AdT`kKzc>6#7B9-rmlrn#hP0C&B)lcd zvp*D8dmo#SAF#s4){Wk$az0V8~>w+nrVLw9l$D1Cc ziD5NE&&w~fl=%JpUSgiFS^khCBlihV{}5Y84XUr2)iG78m^V<*zl+XHzmfGfeEv~( zw%Ni!?{VHz-cr z*y+ho_a0Ebipj=X=x>%C!l!mx-Q2S7FRAIRjfaCc;>#Gx;y(8Bi}_1deUt}xg-mw- zBx8!;afZW8PU25>+DTKasTGfFp&_3Aq-7~Fe5|^Nnpm-PAv{7?=@ZNER{oDulgSo_ zqua7Xomzvv``53FZVL&}dMOpSO|g;9?}YjbUGlt5UwX*-lMg-1XBFg%nhD%N>>j#% z^;CR8rChXpZ4gJJc^GnV4@kJqq(s;*vORGM%QwxfbkjRrjPjI@M17jW09zRXH^%9T zWgaaqtMVUp#mfWgbkd8HIky;Y9VxHE`E~LI0=pZh^Lt*M^PqmU*xWX-v#e=P+<)h?U!T5ayr7iY6v(5gmfUB;pP<35dhLH- z@~7m#L)QU(|GX2te!f-@Ku4qJS-TfhwadkjbiQ|1#lP;#%EypD?V~`^v(8ntYkcx0j#STyd@)@5ZhrqK$oqBO9nLnW_WMh*e;d%n75bg?Z~@A3u!28S_e67d^^v5V z#y-tqY;lj`FYsKvxLMeYI=@#^x{vLmg9Vi^-CscG6%!~rY3TFaMH3{Sjue1id{jQ= zfj3VXbbj!z)A+dff5@=%HQ}>GD#fe5x55m+=;SZn{_oj>N3fiD?xTM=@XoN-!ll>| zIfFh>Gp==gtCi_zTBORK_pJi>MRPI+Cb!cu%PH=b{19^);1g}ZbEuD!##e7$T5Y^> zB#*o%6;JkGR|c}V|F^Y!a_nTKrok20?92AV!{hHV|GzxVFMRnu3IUw?`@Q6-wla~F z>GPae##L}-BI#NSydLMjw>2)>MpyLV&(+v+eW~H}e$>8lg8r;@FP`~Rca{FS|0Mui z*+xu_Tj~EICD|gFM%0PxsqGL65Y(i6tMdPl@GtSyo8G~dRNTLNA43UIb~5a|626tR zBo%HwFYRq{8U!!MAR8 z$g?A?Rg{@rj-TF(<^pbvK=8?PTmK*CS}=pfQ&=j*f z_(sX17HT3PS8x;342c4UJ?6gemmK&T3_kxiow`l}4rWAxCs@U*{B6iC5q{vhxMk>%Jb( zBvle&^Myj9a|P7#m5@NlPo*nD|?d8O&)KS1V6__cx4Fmpu$& zFMFGYaQU&tZn;4g4+k<-WnB}7f##0$Pr zHH;U1)VlY~=&Snqex&Nq^YG<^4fJ!XpAjulQ!s;Pv(pQ;nPoeoJG?=((N6cD+Ub++24%9o$uXvtY;=qR2 zm(g|yg#cAmRN#g=Lws^&uhNF}`>pomoZZNNnD}X1XcM+F*n#Jh%a#Ct+G#mBa;NVo zjv@X*I@pk3!S?jdf6nXY6PcmQ&|4P_*w^XI9Df~Z=qr4Z)GnSA_`^qIySXJ$z%>ug z%tH>_)0bQpE_N=4R{=&jacA7WmJ}&F5GX(*C~!%g8t5Xx1UZv~yViL({4)box8KU6 z;HMSaL));Pm9$WxsN@&i4??Rn@65FZrXIpfG9Jch5)bPV>n zE+SyB@rEiDuV+OfCBoqlZM=89Sy29|k!S|fSIfA45)>koO&hzNuS$1OFQ|CW{A#yb zA5$O9db1BcAS4zf4VbiW*Xqn*FptrR8`5gcK8#$Vrv|mf@%dZGRiGxpQDVq<2W+!ig}))>)`>$Xfg?LEQA&Tq7*6jC&>;|Tu{xy$mvXo&jf&b#O~A z1^!co^QMRr28$@b_6L$_0v3Oi?-hJOu7&3_>PwSTGuMT#)8sw5sW?y{`l5hFz2(l$ zL744&EP7^(x~)6yJI+6E9?tNb#hU%Mh_X%QB1>t?j)s=YZULq3o=Gq+u(ehzJq$>- z;lJ{fReZ$GSG%Sm@3L>f={^9KR;8Eg>w#*Pel@6*Tg{`_mGM|e+c_sWX~m)plDK58 zU_U4C-^w%BH^RtOvch~Y)pg{e&%HQqGrjSnN{k2zfIiROv}Q$>Z1MrjK{|2L+$0nC zSj)d`sRwW_gn){YkG6Hhw@#toP&V6x`U!=f^CHKDwo>@ta8?~*!_3CT-L7?_3e9+ z58V1(s>Etf15Kp4!eUO-tA-jnR%DL%o44K|h^(y+w=T;%_mh!V2DO@ZyWCRFT6*bi z^sF2mYhZ7U8#ZWg`gqHq7e?2!m8s7}cwm3|wA)IK(_glyhVBJ{4oGL~&FVnTwh{Ij zS6Gw4RanP7q*HrNtt2M%7L(zIyOSuPL-z+_M*P|o73hWKg_g`y(`zQc={+rEeN1Yh z=iYJ`(~fWmL3euNJuRuU3mY&Z9-^LUTr%czYN${M<(rtOa5|ylIawI0<{67NJ$2S+ zwnAP+>YryexASMsdAXNFH>Cnacw|5lZ#N9iPMMZ0=56n*wm^$KP=LR^_^%l5#-rB} z5q&h0W5K$lKTw0|5ezWR)UgwKeWXiLTI|;B8~srshN3NH0pIYA0}b!If)iDw3$b}xj%QZ`dC*fx0NjZ~Z&J^D5pmiu-!>q(Fe zARy|~P{SkpBdH<6GW3~x;8*v#=nA$JsTKv%Zzox@SiL=io$jL=z&hP4MQVWULSzNcajlz2vz2}l3YyAbEH{d)(x~P0iF!4= za#^ZjE zLSMw5tFNzbu9jz&ApU{6QWnE~42h+QLNQbL8m_1kYZ8YKJXTtCMw4-&ou>QKHMc=8 z=$m$HV@7~_7*;@4z@t^4RD=02r7Ot#94|f_qTUE1XGmgu1^lw`ad5LKtKR0>MghzW zfx;~bYH;uz`eeD;z`IK-i;lX~!m4-banvR6sp*$lkKVKlU3@M%B>tH$Lu~xqiIWzr zR{bM1X9KQet*H7_8Lx5wL70QkPq7CL52Z!SKon_lT|xXjFOLwBZl5UUb>AlHM>oef zG8smRa()15A&_m(KE2@?R!5JQoV<$N58LJBzD*Q-W=>t)51I4>?}wEJCejtgR|+5* zG0Aug)<>*GXf^#KMuIQNoM(Q~ur{DxE6@4OzQ+~qUWiq_s4IBj;BEy&W)gPVQvtNs zYa@MB6mO-yc_v}np)2&KlHh9c%4+Y?_{x;o8g9tl|I~v8eU_IPGt9_qIlo^~wYZTi zllyIX!sxg|zB}(gBxnu z&kRA?6}D>wNYvL~ig)WW8v2iSDcd$~$qECTJK%-% zam~LMuiGduy>fT=e;FqA>g2rJ&oqaj5fRrfGwJ4-nVuupP7&4@W?kTl{8fu2Zl0sX zSnUGCq$4+8H1s9V_(H2ua_SSt8<;ZHXkC$-yEe%PH?=z5P++Y}5tna@Qne`WeHlY4 z0O9p^pA1rN;g)eLM41PFtT&GDQ=ZL}$=>_a9dt<9bt7E+I98Z1DIZB64_fI6BkGq` zC`=8HmCJPqPIzsX*sF1P2Gv6uhXKDbrdw^{&Mp{c+4u6%cSVd(CxXa^ zpOH)ly$lYM_?#2en;loX>Ims-g;q@cI57S$$u;kunDjzrelf{JcBUz=_N%`H6}JQY z!HjWTtL^azL+YW9@p0ThWK367u{mSKX6VxmxwK!Pc7<>jcXu(!up~6KIsxR{$f%seBLdrx==r6KNT>2j^}-nUB(cvCHh5HkOjPP>-s_z zHembF=uz~>R=}*R^^)h|GL>Tq>nza6Ng+$*Dagb`S~c*IO+U&g&%$t5N?3@{tg~(5 zI4yW9`tQQoui%*GPFrW^Y6#0lTgv!AHO9hK6Xo0b9(z$CE4s#^>nJy0a%G@D<1Qt; z3=|N==>mq`7Yz~Q*BtHHNXuwx72zPYkEew9_zZ3AJEY=7Q<-ku6pM(6*gQLheD8oi z!^2n>YEDx1d(=tzG5JQ!jnJ38aNwosgpWmJmRy<1N zl6(exeABUN<<*#-@nf+uCx_`5>nMu6E%8ueOmV=veEGczcw#GLG^W|ZUkuEExiy3c@;1EOfkE<0H+NUNw(Lg^bNb)B&>XclmLz#9uM37c&B`XpwtF=61 zFe19TZAYUqpN|#OYPDdz>7R(aHWLZUvKsi#j%+Xi?+%X2RlE&k+`u&}{kJNRUDw9h zg0OyiuJ>xaXKo6yW+97j=F$Ou=MQQUZ?xOq303Df1JC+1-h-@gw*rPt?&T}TJgjyr2hozW=Dx6Hu`mc8!blM_(WPv~N)_ z_}T8`PC5Yb{7sT&6jbu0oLZ>B3{aASOSN1Ib224OY3Uvt(jJ7ojrWS9K1#X)E&DRE z9_|U4<_(fRoXSH6u4Bn8<|PXVC|=U%39&N@1wWF;i4b0!6i#_>=~r~`LM8~MbUct^ z30-&n5WFrV+Y;Dm+}WJpW$qP)APbsxvt z`qCRMQ#j`G#X%9#O3umcjU6wm0d5kGw~qW^OOW>uy>fbwq(g>wPgsxnJF@@>>@<(p zbwc1=P;Hpo^^TGx|FbVCwtDIbmgO$AK3%?Ulk$BVAwFN?@3eQ z44|C-bPu<&}nl)|gL&CXuf}%L$PlltX(P4U=6<^g> z51tRL*v)_Q4D3uEO&YK7@ev0Tn|m8{g%fA49eFC~l7r&b2O^>k&V}Xq&1xcpuy8z2 z(946S@CXKMrT921-^+V{}pw>E=RHCms^t}LA@Z@?aU zr{dqKS2@NENt@@?8$ao6Eh2J-JI)NFKhke_#!mW=kQrJ1Mw$-@3(&ur3is92{DQJ` zQ-b`wFH#0(^G*uj-*t}iJ~CX9%RCPKGQ3x`M=bOZGB~{Z+Nd#~x>?^nP<~)2^#%vzEchhCMdy-RZm)EWy>0 zlovkSki1VU^~?AJvf@*su%#rpo1Clf&q;cm=hCz!_5FCO5aWF>y3hQLVZPCPJHp@w z@Zcl-4KUB%goN&hF>Dw^tLlHo$y5;))Wp zdD7!$V4=MUd=>)%DU@PbQny+4#6+-x2FD+F%4L@~oXRqe9O3(#I@Pj|3QD_HPDjE3utqKI zlz1zr&E`T(NQeVom!Pz^8?;&=^Df)+Yjv21d?L}NVbV)gHu0uyY2UnyXFxSubx?8h zvfdygT@$Cpp^Wac@lo6ccz;$(&wOLIri0&y_n0riwF zCbROvSZj18w3tx&C~e5zQMvDnPTi^w9+Q{psTsJfT?Wb9euKZ{>^9!of%psqhgHmk z`3bl)^LwB2dBY!k3oxvB=u_UT4DR;f@jlH5HnbXu!-cTFe6yx^WMs?nO(_kK$9r;u zwWR=p^ZxSAn>D^2BB4}z<;AwApiT?`cB+|x!E~7E6rs0nr1rq@VPw`DS!L;`6O%3K zpvIp|z&@xnw$QbR`7R-@o*qpoJ<0lLv8y6AXA!K%bQ>d@F-y!XrxdEC^r%6>^knlw57_a^t6uDNv=i@I+B4AS zcN&NcUn?~t05boR(Xk1+kH9CYE$dXQ&$!mI9p!l zz1LEAZJrzCBBcsYpO8KJAwFF2tF|}<+d3;Y+J2f~>Cu1$5uUVx^2*QAuQq2QCL)fj zyNIuQKZ=5|IL%hPoO9TvH^T@%zK6JZSo6Fyp{7E6R>QwyxUEM_3qnlk^-2jl_G-@3 zk_-h^%jxtH-iQWOx?qP0wULa>EGEFZ7`+9WRwAG_z=U%iB%`CGiZfv!_4CY)i;LmN ztbqwa>jtKmFNOE4${ih*@Q(VF8XAbq8gXbEs82{A9np-WCeq|OXO)w5I+F)sNvJ!R zrhUEHT)WeK!#zzwCl|S-NV7sMStRSm-Rna;+TH{|6AJ?@zKEQ$H3}p(LWPdfJ*PaW z+-Nj8w%@3c>)htDqqOn6IUv*}qD zTA3ruigt04gIq0ixrXzP@NnU9p~++Ttpt8CQjv@}_dS~9;ViA6cR!;092}_zs{*TG za=rA}yK8V>~Mx23JoE$xKw0X#nvPgOF-&Ex{b>OalH7$By z*KNu(P)CTrwL7FhJqCG5Zr8E>W|p6>D0-}Mur?Gvd_ZjAP2cI2yW{MDA_GhZ_r^`t z2$#bKxpUu}nTc?g7v_DkfA?Ktb37GV=k`zgB4YU0))Yw}*dANLuEy?YZicAGWQO{nndT-~_-v_cqdv~B5Aju9> zD_gry6zsvCb6eNB^p+%jmP)+^*CJlRsAS}RG)(-6-jUW2tad%53{+(zLq#jFuu({N z)@2`}*sETmUNr4dI%{=WuDvH|7wt(^k@z35WYzQiG%Y1ZVBfG6E2oMmK*{n&nCEJ( z)rxk9VHMpEFOKs019pVraa_#fBJhh*9D7oL`rdbomHm=G^SdoVnN>VMy6YG&qD~JQ zJLKXUG<@4ihnQMC(FG98=Qt7?=Msm3x2KZ=QxfKe4rc`UOv*N8v2r#UJ}E-HjdEW4 z8XGmS>nDx8$vde5=2NRVN%=jl#lfv~cUC;#@71|5HM}&khsR9~O=!C;XP*@j8Qb=b zvZ|qr1dDHYvhviLMb}^kROS8Z=g6g=2PPHFd6@k!FH&3*>;Gu__;U{}p-@a_66SKg;viSNC~njQ|Y&W`4z z3;&p;4K%;q0MwRL@owx0oMH%UTkCexaO=v zQ};Yi>N4YTjjU9n{b-{LvrlL*J(t9c$TBsVJZT~NE{*2P>OU?$9rZ~!a0$}4Hb!xS ze7oFB6H^mFrQZJQq-k&*-_$8>0Ck%0enz*7_3_MKBUn=4T(p8W4AG-6vKx@LKV ziD?N?^<4F&&AQ?pLpFAqr$=+;9=YPyh4)aBKA~$O28&dBmPLK%e;S#|uMpD36U%S& zeaSu|N4$C4^I+4J^oK51%%gIC*IGLY1QJ$t@?78ad0vHx=VN0|75$#aPMf_@N7tEX zfjsYlMoTisBTb&YBPHIKEv7c3e6 z%DV~l+(^@zKn@k}X=?yqY&2B)FNE9yiJyPE9ns@v!^IZUnoqKWx^2?OAG& zTJ-S7%PjHQ{lOweCcr;?o8*=ko)7ypd)Bi-iZA#niQd;h*|F?kS)CqzwxdBDt?TwA zZC(=hK0b$%bw6l13w=4SDOA`l9i{nYnAijVv>y`b5>T?E$C02*pj%@o3{~CEJ8w)i!odBJsIv+YMW;#NS{Wl39p;%znTktc}2=?HYFHq zt+@m{M9KC|e{GFboZ47YPO06LpAt?CC4eADaBlUFmW(S`^jD!7wv;vQ&|a)%Tad`k?lZ;MI9wGSjc}@`*J!Omfl+1 z@2di0Kw-WX;FAa9Voak-LqNsx&&H8I+jWfW zgEejhY^NM4Tsy3_6+u9!z98RU6CpWT@z1K9pLq;WwYX%1>nBupc~Z?M7Txrfs2Ve zuK=MYFQjg;dfX(u#x{;_pS^wcxpJ%DhyAq~k}epk9e?PwO{de;qD7l0FP{?IjOUqW zq?Tu-!5I=G0ASpix-ma-ooWtDpojU*T`w6Bo&Mp?1JBM~HH^?0S|%57&q>rD!DSC* zB-9zW$9+-@h^G#lI7?zP-DI4QNc6gfF~Hr9U9x!Aip0+J5QZKyAZhi2#8(?;L>!zE zNQX5JG1uN!ug&=~-Bvm?o(r(Qtx^)DK0Vx3OP7hq>^P{|*c6eKncz8c;^2%15~9{t zzYRkutxVwSO3^|Hf6#^1pGui1E1e)(6``*<_*}T$%7M3SrfTC*8#47op4TG7vg^MM z3L6^-R)FR^HLuhfMvX&}RM0A74l2;1YNx}M{sisrT^e@?2>q_uvrxtW&`Y0V}MYg0;wyi_Tz%wH@n;VopYNJ=k}&6 zaZY_kOc3RJp!fmUyPg;Kvha`c~k(*R{`5IQzt(wxHoZY%BHiE0z z(+ZHUtaIDzV<7oZa1YyLPV3$AR>!mt0EZ zJCTQ)l4qTN#@iX9i*UCV;;vbv^N>)o&Z?!)$G5c?IrY&g45Yzmm$g>i1}MFt+Qm z{|JxmqJ_BemKRv`MZ_a^;wPDTmJI3D&5Q49T)J&o(tNUlNK_F7-nl|Qjod#T+%~jC zgRK({tc5n@+A@NrD?Uu_bkK>jh+scFcTLh2_bK`Vy)NqW7(IK7)5rb^D-~zF18=3( zXxr46Wk-yEn&3{k@<=&d;pV4fS7713^D3D40|;IEJ%+=q#WnX@ZB*nzhB8s)$9#BX zbK0JGMErZ3n5{!5CydAu51nh|DG0LrS`{w(;#%!0M7Ymtbq4M^*s#eID4q}AVG9b2 z5r`S@A8Me@A4HH9fnzRGi}EO6M) z#rSdO-Im`v%(Cfp^|=fwr;xGe9j_hk-5%OCkWgoq%SP^<3Dff^@YGvW39kvy8NEvS zanCRf{BTPEV6FuYUb>=&smQf{HfHko{{lej&=X#-av*(yU!saHW;{GZzJ@f)q&Q9;( zmzf$TbP-doyp=pxQ4}}z1;256711|!&v<@t8J6p9Ya(1eF5JwtX*23QUD~DLA7wU% zb)W6)AgWn{x?%}9t1!}+irJQp4v%nrySNy0Rk8Dl}3@@AVK`K0tYFa z%U<-!2+vYiN~a0_b~z!`0|}okHm`w3bY+grZ&_)^xhIT7Y5k{*BbN>@7*RP4q)sMP z3M#^)kBrQ=4`+pSe#-UwmMS_nxo<5x_E%kS9F`gXH2?M7S;*%8vW)AXT zl<4KO@$4v(99D)_dw2QHs6@+xJVW200TWB$ht{XlxHB-#b$ZeF5u;omP4~ijI1su! zmtyl-@=kQwY;-EYGE%0y>B@^|6EHvHEEz?h-}8if&F?Ox!B2;~K^70JJH!c&+P!Ca z=aQP7xmmCGrVG1WBU@ol`K?=DQ7rOuj}B>^ZK2f zQ|uhN84{{lmrHnc*I>L&$`!CxDY|J7uzxv@nBh5i{QV+)sA1r^*(0@@tjmJkb>VGC zw>L#DtfRkz6z`&%B(kFN29hT**9oz9Kf*Tuj8(=rer+IEd(dQV(@#SiEvE5m(S7To zE*zlpNUmC&+ROOerC@4{j_ImAb61PX*^%#@!=9m^x(6wVu!NMC| zuPdsO70cHOOxb@vESr2&0rM=88?{bPpp5W~uF<(ebRZk5l@umY=$h(pPHkBCHI-gK z`vc==l#=iT4e4c*B1&JGNKu#3?_ZQ04>L^m@>yk$9z8AHcj~_KZNDHR<=gHl&2g2g zIqG(C{bx(xP7Od#ntp<;?|bm0V>Xv*s&9hBmuoZ$ptpzl#DVX2FZI>t^mXwa7CBy8 zHbeovJDzh^dh?INL0*O8J`em0S7&qHcizCtv*%Fh;S7lT48)43In`mp$P3hmNyCt< z-I&x)SZ>K=swgai8;&-&DR|JFwF?cSVcoZKiPU(djzAij6b#^nQcqPDEx26NKBB5! zRt>BUZ=~8g&lAz(_C{w$BVD-i-1@Ah$<&$B`ixx4{bg^XN$PcRR?`=BO+bHU0~@?P&7)Aw5-Gmo^yPb^6$FD~5P4ATI? z7@F3x-r=%m^v)Os9Zd)T%KK1ZO3InFkoPWOgX10#Z@XIGI zsXzerfEAg>olP!|YnWG&BaZ>WoSe=5CsHfFjT>lxpr9*csQzN_=oSn*xc!jy&V?0! zR4`W2RYr(P^e!E5{&EwY+$Z|E>TtD*b$rr}=ZdzetYHUDSr}Oze>nY8hu{^5%|*|%EJa8kQF%ReY-Ir^>!23xENmft2geZ19aGQYX|u+xRIY}coyXesVd%6M<% ztj_e4QIZ3h*I$=TK@XCBOtHwj; z(&-l)GEokvR`A%yG43SZ`}x>3$jYIP$y=-8;nVIs;KwB=m4HjNB(IV(lW-4H*Wi@J z1&ot1{#mp6?pU`MnEsTHfsw})GNrdi?z}?IGT@yQsXLEp**Vp~4ZT3eHS0v92|_T7j-EF5rLbyuobePB1?b*-_)YP%#lN1kFRr-L4yo6 z%!=bn&UIBo5QFl?ViglkCcy9f(+@F-I~6hVVBhp4E1f}X6dCPRE`%7Dk1R=}Q3X8y zMT(HrrJDB%Ck4H-axmvYBl5Q6_2Uk!T6tE!jlRI2kgn(b@?c08k@!b2(@)aO?ZCqo z6^7~Iqyzs0xZ7W!7}c2utVPE3dpM8%JE)* z+>4K*a9RJ@D^VcuP&nO6&nDYaBI^cI@z0y2QH1t>>gPEihn@%Wt#HY+d#g9e zapi!Tt$FRL4i1y@sRNwe4MsQc1CHIUCLCh{`?E3ef%xeed$>7h!&31R*R_3tfmDh0`gbk^YTUSqa z-{|Ms1UvD=2Z(|`r^#~5kd<^Rtfv!`6Pe95_6P3&HdwHx*=7;{Yv@!un7q5Z6=7Zi zUUuI+^X{(nYG`R-cTGR*PoRMe-~Q8P!61uWP&gHNymUv%E#&)EMosxqK7ooca2tkg zm{Do3peNcpQA^i%&^_xzRpprt)l(r}=vcMYG>kMLT?B8FWE!W7t>z%e)eGo$;AvJQ z(I1o{CSr&~`bk2B_E1jJix~ z&Mkd>CRhK~23Ysfd(YBlHJ<9|098|y9<+wbf4j>sUgp&9)yR5tM*9Kf!yE6+L_9rl zDeh?-ADKJg^s;qx*`at?ngFw+%54$r*`BDM&vs~;P&u+)B{56T^`Pcp`(MTFP$iXve%KB1-3!X`aG#0`6-?UsHuhtTt;0pC629@bjn3 zAk(+Ye}v8w;KGzo^kg5u;x9+;k&6AHIw64 z{x%nR0WyZCo|2r{x-@KZo-2tZ$*v{;TLGR6@WW&g$QH3x7xV^8#Q-hX@iWscLc3f# zU3@St&?H!Td(~?69aiHYdF4sW^=Wg$(zgv+6M)wKyz>b##UtM+&{NYXwNV0*s_53? z{sZM=-+uKsNQeI#No8YXjuJFln0Z?ZRk7(hK;OHhF=%|xwpxCosH}9ekq>lZcm1y8 z>KRs(W3>cQ=8{r4P_QL47dZ*l(2iZuKo1Vh9{rR;@pMkO4WV!1&%h!gU=j4A{2k;e zt$Z`AJAyc`J2No=h%r0~13itChLY#7{t&Ujp+t!5;pm#Ns-#dt=w6RSW>9>n%am|5 z6)=AX`s8OSxdsA=NQ&9Q$WI$JGZ#Ztdp|aW@OOs1fOuwG5gY%i4G^ zLk-$RVwAb73Kdl66V-kNBG1#;@8gAZB&5wr&R2{PO9svzG?|aTAjg{Y`7xg8|MP# z$UE(~t7aWNUS8fwfpneGKC5re{y|*Bz*({jw+wTz;4#RyBJxjcMQcoDOQ4LHKd8&T z2G5PX)%(3AUE-YIBO8{!U<3Z>wxd-2`I#Gy?@1SWfzxMshn}TP@PsNCXzDSg;m!2t zAU^#mGY&rJbiiPC`G81N9j7J=$%_YtYjHzjB{X$mMkpKmS=VB>kyd=t*J zvI7Dd~-6fKojJZ4DPJGSSyuq9lY>d^pkbloc_!H(#6q} zP-!WF4zC+!;W3V(Kwt{F7o7xWfWD7UQ@{zPcAsA7JXn7#9ATWh^4AQFjjKaYn47sP zhwGaGPvPwxG99%9+0S}Wa^UU`RMP!uB-7EvgJ=~8O)FmgC7c^5y3fiu!k~8b&9R?b zC%xen*OLPVT3MY%xetOxW7vs(2^?`Jj=#b60}#Z!=NJ4X@oG?Br?8&2vHW#nKM);zyYL6HI@(ii&P((7SsM zxtSr+QIV)L0kG#m1>0()?Yt5Dp<#uZD%~@yij3g}!=ipQ0GKF1u*OcSUr9a;%6mT) zWXhwxA-CW>(3KZnyMMvAea>|EtioV*2`@82UW-nf*=l-mk#`@NexKHx_0vlsowv04!HpjQP=kI?DZPhC-0N@@At(I!EU!5Piiu1-5TGSJ zS2MRK$B!R0NzL;LirC$Q&~SS8)nnUvG2u`6^@69pA6jzSt}(5g>i%Sdksf~k3<%>j zke1b@y~xTq(~9AN{1nes434W?y^jCoW6MK71G{|>*A(=Y#3p69mT!}rb7%U(CN#LT zErH*QMh84lFv$yfvF_@so=mz$*Xy7JchlRW#?>!PGV>newd+B=iE}xW7KZ7!_>B65 zyj!sRk>pDLO6&V6-uCG$luh2;(|Pu#a}Gf5-J14vhm!OJT5Y7fp55kgHdmkbvFqXH zexv_0rr+Yi5$ad$yY`H}A?7Rx``m|1M#n$pe! zZb&q~JY2~>%S9y;M1*vFInKU&Tf3o>a|h6A#{e=kB~LdASm-yKz%*kC$U42%V$LDd zgWwx8X8u^3`-F47QhjthSQQqv}|5-<5xko80vJ-#Di+9gnC-Bp`QA%%K4l4 zy&UgRn)t~ukyFzGWZMJofp9CG)X_=Rgg@zN+PJz>mT}`Vp2}PkR$W8g{J@~_F8mvI zqUyna%z?>({sPCv!^f3(iGkg&u?_?&UH%ZF(}BJvyKg@ng4nAP9F&~6zqGj@>Yiut zn@6w}BnMv)x$OS9SbAjqZ6UK4xWPG-R=f?>kx^b`j;zSJX+=Hz1&x!leXWR*ev~0^ zuoqXPE=-9?xcH$`ZTA0R??1zu+P1%8cw4rJ4G|TkDhNmyq*s-u z^xlgS0qFvU4iO8zNbf{IdXpMzQ0cwb&>@5t0tr2kJPY0DZ1+CpKJWi}Kl;ThxYn9; zuF>Y0WBkTgB%VCH*1LfVyKQv%x1fgrw5WDF#M716dCA^=^5wUzpuTUcaq8i4NTeSmVt&wSx%<^%)B+y0J=~2{03x%$qe56jz*VF+P?OlyPf_Wy7 zoP@~R_mBG|e#Ic~Y=&;)qj@fQ;~e~N7|BP=&)V~eFhA9?D6wfn(MI?^ zv+*1=y1X~*?=EFDNX5tFBytaGO~{z+)4f%BqTuB1np6DOc0;0;6VONQcdL{j=da!6 z!$?`}(x0-0EUBkJ#?~@@93^~PV!Q^HXcQ7Ou@~{~pK|M>A=}46`x@&{2x(a{wg?sH znlEGRSYd6B{Tn;^x~V?c=}+!1PqGn6&#z%uuiter;Bq@YRsrWcYc8NrfGa-YIB%ie z%(6pm#=)XL1{7jSeDZce*mHzFd?pzB6gQ8NtThH8qEn|?cn7aHgFwh zs$^{=JfLwX*p%}mzWf44bkr+4)!MaQjZjP)?MJ7&xiq4j&6v?jtJzAH6U^@r%D)x? zqz?1B%YTY--=jEghe3Lu;^EWxuSG?;4S*;Y8TZ7?-a#o(+#V;D)tG*R=lrVFRZ+o( zqUMieFVZM~XRa;ArceNJm2JZi)p0^E*-&G@3+Q)|Mag@qhtU-69PZ)jyC>wVz8>BF zyfL9xt8%N}ItG$j(9u*>xQ)6?eD|0s_U<&kRF8>B83I%xul*Li`IgOg8P+aT%;{w5 zx*o*18>)!04nG`iU{Bo-(G-7C0m}(Ho%uNyTbQ=7b6MW+9q%bOoFm80Rcu%R%=f+%kt!V-&IOP*>kV6vB(_FYZ;g)#U3$&y~H9+%vuPgTise; zSxx+wG>!~DDJ4o{SI2NKL!$@M?IP#J@wGI8gFb&|;lHq*k8+mUVRPbfJ6&>i&wOyS zUbw!Xea|P{(2`MWYwoex`n$*b{7aAd^-7M|H;NPUp#{8<*k@KQTC^%1sy+F6kC&si z-ilR4Ku8z&Nuz<-KTXb{*K5eNWsUGL4<7(yfnQhB3WII(WQh+}8Zlfv%*XbGMztj$ z6}~$M(}^dMY4_rdz?dMa9X=^q?I>x!+GwE9E<;69ydayo1ozA*T zR1mj#Y8=W0G9t)XSM5m-z3>0pJ|Uv{5j0hxNgxe(P}U%jF_^VpI)7+9R_tUwGBZDB zaaEXp=2e~TGLf~R-Yy{`CF7Cz=kTkPLi#fDh#z+ArU^@n?%YZKK+HymLgs?)OE<1| zw-)d8Br_R>n)6ZPeuG}h_^_`$cWqjDSl8|^PJD;gfQ<8E5R2APsS<6-2*%0RqkBKYa*+#H`X-~@e8*<-=$KGBz0+uCpt=LayI+NFHFc zGio1In*YMm#V5-9I&cmw44Me*1hPnW!Q!xE+zWPeB!3(2kz(8}{~{L;nIxlGwa?83 zEE}x{MCd4Zh^j)jhR(~OhG+|U{jzJ%_@`K*H zFQ95e?~$Q&g~i=DLTB!-;ogWn|0$6KoJ<&b?o(!g(1h@I z!z{T5`;QLZ;i?dRFJcg8aG_+#HL|GKgz-so^TksG*_r#BgY68+l*D@lRtd%y8bEq! zXZzE-a3-LM#chK9I%sQ;NkK5vi@PWj5RMyNK4e5MvCsv-s8nOxJG8a_?J;$a4!lmA zQCx$fKon@!(6w9)DPK>FO>8b8OLV?jD>lHFLYIR=w__q5(OISg-o@tzjWnh`gx9a; ziG7eHYq`=h6&Fi%wlZm#KtoS;urhU!x$S?A9IeJtMmr>`tLLCbS#dFWo#k46Ie~E= z(ZqVT8tw4==)(5H%O?CGqpPfJtIoShbb@>#McW0s^B1*t&3pu);iFUi70CWw3=`C8 zQN-mgnBH*3Wv|ibp<{9XKVXz??AkqVfTFi|><&jeG`&Fij4)6>^BtnAh}Z{*dN1?? z0I1<0Z*SlP)Z!gvk~%MMXfN(@jJGJTPlLC2 z&e7PYD1C-^E5jx8r#`YywH(Z<7wNSq=I(fSjF(t=Oj;b;$!O^y@`_YGWya<_H4q5(;YH43qGr0#FHe-C$1ckSrm;NDZb~cElGiDlY?Y$)%w_#hjog~}gbnwbjFtPR za|h;Sx^>&0CLD@NQYN~uL=UaCAgY6i(Jg`l*Tnt(N9yfeh1<)7vWBwn^{y}E=uPQA z$1>}fEGl8V7?(hZ#GwQy9ptC{ks&@Zs{+f>V3yp2+>5qFm>U(OU&oI+)L8`^>zQ>s z68JsX>TL-t4R|g;j~wr58?_>9>`_zuv^Kc{KKIK)<#x{$_pWfv;k=NM|k9t#@CQV)A5waXamt%xl#-ueS1spOH%Zynk<)9bRL zC;4$s`=CDGcU%)$Rljdecl-7agru=L&`LrpzDZcuWENuhJ%KQqp@6^pHqGk-L~0>)rl@} zNl#vfK>f7LTPZ|`T30&H)jy3q@aqNT%-mc_7l%lBZVvMB4JtIIhy9VsjlA0Xn%&8# zsab(Cc$rI@dnQCS8G{t=>zSFU z+#K5Iz0Xr+N0I+c;4GUuP?q!QkYXyiJ$E%PaSLr z=Gl$bUi7uezulv5`026Hh+!e-Ejbl9cMBXsYmJ)BO!&6N2)UOUVv*Om82-fTZYrHY zt-)+!j79|ar$ZP6(kGbmlV&#Wu1;f20C9d;{Mxn&HLm#CeHnxHa;Q~ay@c3y(nkp> zeuRB#AHD@ZYCYCErt7zmPE`jb)*eKMXoI}RN-(Xjqwe}jt1qIr_bsg0w)YZ5Ai^E! zf_5%7&0$uc8VHg#cRMieu6hLhg9(+*`g73v48<(Ad9QWGs{_oK)jW|zW7PhMXuyiN zjn>DycHcy>*h4GHV`|B`Hbp(>OqVtD7yhU!k&%~srX9+ZlpfbA zADZ6RDAb7YJjRK*0*-*b8J9}wCn`;)PCB6H6eJlflKQqG|@KIpQ&dgJn&$T6*JPrmeSheMVq?ils`dt&85&uaHK_| zNZl4PDWjK)MrQA*wf(huPr~m&&GZ^?kaB^QRaO0rk-{}-=2=4-4zKoz<8# zD{~R@p*W9->1oo~>c9&L) z{o_|HtNJo`jr8IR2a;c}N98wTuq&1sg(_7d*`p+9y;!@f=w}i&H0WQ8J~4x4ykwbs zjxB1DRNgd_iQUd;tDz#xp5RL@V98Nze=l;4nggAYz@F8+>K%T4?^!4X#j1!3lAu+b zXv({v)}$PG>T0Uj^SXkmu&i|zyVR0x2PsS)1lMqqk|SiEBy?Q8@JQ3(XnWRURVsTj z(hdGH3HzI_7nKr5K{CcQMP8Qxf3Ond7}-(0jFVy8m3_~B$sG}|C}Ax5z@hS6%Veeq zv!5HZ?rj=St+Y38CgBwA^_lIU)U;m$iEEDcEyqN)0SxRUb)h zvpf(JYH482UBm))imOEsXHk4+DxEi}EsF%*t0zph9k|aB?#-n8BUMsu_!*@#H7^ad zX~D*+q8*n#F1*iR$?++<2*R{3s6LVdM_wBCPqpnuk%CmJb_!B@B1QmY(!8}*K+8u} z`hXrHP_q#BB!&3Bn+USp(x7e|oEdp76(lWQ)N@?b)Uh~J17cXr+k72}*y(&+v$Fb4 zL;TSrw(ZLDj77Axj8_U`3Yz7Z*GGSB&dPH;P3di7E6k9yep(^%*zLUVWkZ%?Lfn&q z+P(2+kE<>^eB0F1vJEDAm5=-Q2n(NGatxtc%prVwItYv1I&l>TlJ~pkmA&ig=2!#g zuiDNx`xXsU8mX_Zp3Q)QkUQBiGKEgCT!|gGXldLk7ZuI8cBmCyNa|0=^INYU)ci(C z8EXq{ZU&r`$p@2{O*Yokw709q)=r1Gl+95Dg07(e!zYo54ILIrWN%KXP&;+>;mLql zr`6~(?FVy*XMJ4`C{QfXaW0X0(qW~$BbF^C(=}i&`P}qXw+aS7RF{^DeEV@7(^K%y ziLjwZ>m_hk$z6)~^i-mcW|dJv*Nj%;I6OWyL&5Il%tB+KZ9PPRRat(;9=xB%m2r2) zhRVpdp0hI%@2XcWdGGauA@Z{%AJn=1HNl424xT(r{I;2Ij1am?t6@|w=`p0pag}n- zdehHgE%0o;a`v6^-MPNNfEB4_z&}!9?I1Q;S=&@8FXF#OVD8p(YT>)5ce7Eqsj{ru zj`qgE-cC{VVQf#1hWgsf#0M=Pa+MyT@dml`-}m$z;0o`aBV zK}LJ9?Z*1TH6S)-oAF*)?>R(xg%%qA09m18qxT~v0+j!2=CyNlf4vT1x~)XFR`t$d zG$c~@0_U#Eh)eDeUPaeurm0}1gi4oAZG)S7tNN=$L;h)>hUS2JB<&%T&W;;6BpTz= z*V4P=J=i|y`Db&-V+iWpNkcZT|99_zXIobdU19lnI>^YTx%6arX_d*VT9z zwxVYaJT-7Q1~4@HaPaC1hm(bV_aN+TebIc&&>k7CAOUtv&g=Re8;s@ zmwMl<<5eFR)T}f>at?@T*<^Y3&~2dE0aJJHj!`)}lTFZfiN@ z;qr@+UYLvck(F*V~W-6XtRo&&b6=L%J$(`+0`=YWP33Nl; z{(Y*5xpKa<-kCRO%SmYgevWw3g>_kxe^8JE!sPVU>oy_$oa8HVdH7eS;5H(86YVQ0 z7RfssitiylM+)Hqlk5>HJH97@RQXh@p4;bdL>~qhY!P?Wx|5@hn2gu%NF}`OcGMw# zK`7a^xM#*g4)lo&8m7NT5slx6Q4^R5w>lLY9q$riR5uPZWFdHl0RBelrIKk~T9-@_>f@BKd3vcg8_ipR z!IW6sw~Hv7V&ACaoG=$i{KzL^pFTk8h$PyH-cJ6 z6*EH?4env$OG0~>iZn7Iz3eWIHTcZT)yLoM?2lyVEUJ@xg92r zm1w)CLyW=s|p|e0s0MWbhN`&_ph*uV?0xiWuHE9L4h0ytD zhfv-7gM4`=@H`mGWNTJk<^|osqL<0`x^dztnqgqGK@2g}4y!Yx|D4*Xf050Be#6BI z#!c8YYGG~>H2`aoi6oO}@7_Xo@$SdYG9yIx$H*K*SocjEo>Q|(;-59;u?URQZg{sk zl!Veiy~OIZOhv~7_usfcE5u+K-wg3NF)P$0#JkVv4p*N`eYz6H1on%FAsSn`b{{P+ zxD(Ze>a}RITJUZnE$cjZJ=skOTti?z8*^;mWvCa|p2T|yzioCSL{*Q~;9JGo^;n>L z#-M@_suR3^PX#}71>T^uI;vWfVfz+WMu`4AeZ#6QZzY4QDngeiqO!WH-=!Yl^_V@j z_49G_oUUs6sxkmcb-BApU_yD=N;TlvbRHiVO3rVS08yMK@qyb0FRbeQ%Zjb)@#vZ@ z@dB;=D6p1_6IuLV+EHR6QlInWhkvrJ#5&233$&9NM&yq@Pgsb{aO{%$c{VTw#^q~WSD z3ya3u88COE?)M$_mTf#)9a&|{^*sn1m*y+I9_Ee%pK~KxOeRYS92#CJ$Wv_a?3YhI za@s}W>g=gBu<%0cxjMSh)IRv@O}?i10I*-?YHiT9A}yQ3I04(VbH+fGsS|HKS~7NF zTm>9oWFa>WiN_XAXM77aQxJ_Av?wY=LiqU!O@X*;k*m+AdyFmU^IF3f(R&FEpX_OC ztfeIvZit3%8Ldx+!Sx&%n|c8&D{z$#Utw5Yxp3scPv%XF|Gsg$O^}2EiEJ-g>J_gi z|Lmz%-mSIW?`QZVACe3^26RPtz|aG6VGr{Fr@x=4ts@GTx5!^Jh=zRcqvhZW#~1AY zDMZ9Nxh1IX2%b}=aID2n5Lcw+4m?0F=`ETy_~1ZzA6`bHnTw~U`?xQ{&ubrv6WQNS zq$1K^lVT$3WSHD#&O&UL7EMn-PYd@uy<#M&Ca?_xyr`j93ZgB!)pI}>1c2U}?+ z#L~^GR_f0V9KZRzzw8>Z$h(z6SU7q0k%=<5I=(1Hh%8mP;JjvwdBFCks|fA)gtQl> zl580fjl9%(6W)P6cMq*^+YVf~xLd>zc~}f%$aUIVUC+#mXI8|z8o_SUVt@rISNwP-tzK_<1p}&$TogC zvrBfaEZwRni^NU}x^mRp;znvqd=S z*xse8Ul)cvIE)5tNl%AvkFZm+C{Rx0YBN2%x;p#;qcC;`1wNu)^#-+qyTJo~wd$_M zBN)K^YjY)!c~T*GUX%kpT!fk33T6)3YHfinfP2MZjO=5Xm*n;a>>cH9rz2TFS!sfw zf~H*uUBMq>RDPow=#CNO^w~gq7}3D?iPdD1UdQ&7Qqjnhe97l??y%gUdGy+~b0#3W z1oMG=yW$5VPU~rLGSwx_*b7a?ftfpH7oF;pi;!Znvm(p`d3+yjC%}>KMZLShw5bwf z5#7Zr=pDS_aXU6<#K@Lg_^w-YONBc{b*8JP)(4%R_7ii?%hmg?!?w@dfv)XbpW{n* zR3{W>NjMH=1_UdGCs@JZ-vn7wvV;_76W`YZx%XC?!1EI-GNEU$dc8B4>HOn{uQee+;IVm$#QO1u=FA zf1?@mx~HQSGH&Ex&=hiYN=0kd={`NsLp`ZTNN$s&X*c#8UKQFC-ps1{?KqWF2Okth zL^l8#} z*G&vV%F>w+RGVn88k6PqB07rB6J(3B*g|z`O=f%Y;(e<_RabnV`;`hJD=BtGwQ%g3v%?SI_& z0La0sk|wLz3<)qa`~>~@UQ2fO;fy>JUxWYy5Y0TdH)3oae`rGYXgOGw+3j!hoaOlZ z+w?4@>+Jni;0lm~dU-(~`TLhY8k2t?>~H^MpSMvbe(?Zg^68zEh+JApH2_agPoEMT z$-l-c)BOti*o?qL{d51-A5-REoe@3!M@#fh<=3Fzwfy*hz3s1LXgX)8!bx^yFi&Qd zQYy|cfX~AgrMM;h*#Q2J+ksA)8?@eZF$}n4>PE4C;mPhq%I5u`i|(rv#CxVo`g__z zml_X$%7OlgTl!$|)Jf~cN>t5*{Y3MmB|pXiQHlAo>~nvK1p9x~b{d`hnITP#}iR;OOADJ7IQOq)<~t7nGsfQk2m&+!T&J3fWBrF{83XUpw)*5}kmH|hxkD-@m+^=! zTL^;-%)2vG@hsJde~AOYK${?5YW{rm_(9zVO0O{+{1>La(+Ge8fO-|H;&1#R?$0j+ zD7_2*nm?sM|I~yBIzGw#-^>C>Pu;24h^J()(EjvEG`Cmo8Y`Ii5IJ$_Utg;1_@yR& zX^!os&$Hi!&RkqoK>tCi4<q zv(){$rhhzlHaO_Z6CNwVqfXa5sby3+qN!I=B$gLr}`NZAI^{s@zwN&0;PpC_!JO>Bm$$U)J; z{bJoAJUfhxSO0d`(^n@6WRfD13IOR}e`y71A2QJ$@q@+>EKc9ae$_mt|8K}r#; z2qR))tMnJ1-aB#9h~l$y-W}k6-Lq@dZ=ajCEt9iIa|FG}vhi2ek@FN`z_dClynFOl z9-Iw^0z3|b7K9%Y`svFPn1^riaSA_fdSyIgR@`2vpYLp2R$tU4*7cVuA3T*m`&c#p zieH=Tm0P{OU$9HK*Zwl(L3lbG|9Ysu0ed+83hQ5=c>Sf3bWifIam?U8z^%-h*SjB= zz0Zt7{zl)wE%8sf{EI03y7Y^5zog}t7W~qJUs~`>3w~+A|4|FFve+2^(F@>juGlY^ z<8Ln9FZ%oviC-e|OACH!!7nZNr3JsV;FlKs(t=-F@JkDRX~8co_@xEEwBVN({L+G7 zTJTE?{+kvw%YU@aKf`Uq7ZPvxW>(?;9d&nlC;i(162OOWax%kkD|f!$ zRgI|4|F@9$(=;Iat{7FjedS*U`#C-EQFi7?QhYwCu+Y)Di{pDHz4uPF2pQ_|+r+kBs_17ZD75FoD)Q zexd(D1E02AGu^C_vv)_&0NL_{2P&*to-3RMCMDxoZ zKF-nnO?v-FvAYosuuE-IFYf;$;}0IZ33U3Ogz8^Lf2s+nhisF@v;PyDjv4&l>G_|y z@c#z}`!e2h5=zD&gg%ca&u?b<9|`?4d%j4MKBkMiyPs1$l@7XZLg!%B=uIBPC?Gv? z#lfi&#VBz@jT6m*3(=hkUU|fGl;a<*WSg7OdG_6iz(Y@l6_`MIj&jnM=OVIlXWpWQ z8B{ip{LJ0Po{6@z{jW@z@7oKrx;MYx}w-?MT;SQ1JnQig^@l! z0Qb)%ykJF=`fSV!+$zl8y4l7X-K2C$>ZA1yFL3gW6D==HCkmqfMQ;A9wE2);SxWi1 zt&d)~?kbm^R4N;U9ZU*iXx?(qUr&gg`QjujDdF3htefn41lNEpu@dYjHWVV2vG_zzQ| z$m)~X7_~lC|IR&gkZV-;e`dtJ%;PO8$|?4S&BHF^B!dBMTdFqRA*9kVpzoOSneQGj zXLcs(O$};QcL!c1D4DiLjXF|4`5#vQV}NEIz>x19C^z+?25o?ESK1C`hI42bP4WF9 ziBHwfCmqCqTckzWsXZt(s!iqpAHKjJ&wXFO0br_^TT=TUJ@pFO`I;bC()n~gUha>z zEmgj5j0%l3xc&1J|Bc&XzrcAn0W`|uSw z%nwr&kq|X?zBHxN9n^xLihf8}1DZOYxJYDJiRCX3sW@o8aVu;)fE+KqcC%QR!9=AH z_>1!@+k5bV;+%Nz~LnsvD7K!)M^}Il%NVP>gu`_ zd9*De;AV9_fK>xmIcD|0N_Cu&Pi?DOe8S0=7V~;Z28!Q&+2U2Pv;Hud38MjFZtpJX zVCRmNC0CEem$75%_P<*+gPcqs8S8qgO&@BLfvS4ItGbs%?EPJpr1=9r7TTwCQgC6@ z=0;>E`d-j1I6Uk0@MsKTaHjrbV*HWV@2?_jW3{sDqDDkyR`hAp)1F#rLWD$+o^nfd zH@=ljDT|taimc(m1^as(h?w!VXEE^hJ^PFeiQ6ynSM+|76}&6_0ddc`<*paJw=5E! zv|H7h_1HwxmV;SIov2hDJLd0H{lOrIKy5wO{CfsBu6LJgcLW=gHc;I#7u}($e1`GAqEp% zPhd9tJ|X!`2Set!Rq&*QVnD@IW7JKgQuz5-+9UQ?-i%MLr=$?8W-gvj=_8zU|Eq8Q zT@jbKH+-Ne4)Q8GRi7c&M14=YA^dsZE6_ukrC;9Yb+q@pkwj0La|P-en)i&>*^I%9 zZ6-=GCtzSDy?W%jBdYmOq|id-{@AYNt;@M;`ishB2xPnB*!3L z|71x_VVnmje=omsvy4uELQhER z`9pe&aLSu7HjkFzBH;FJspwkQ!*p^vvLT1yMU*#RZ0)Eh_eZ;yI}{nbPnaSvhX%Et zKZ=#UphZVd3#WIGmNvGT%6&CQ(+Unl@#NmOE3Z$sA$I7Fi85mG`OflFo42_)BpK&m zfaaI6#S49CyBL2sWG5^nVjl6>*dN-a0G^Jjy)_P{;Tc8_C!AOfMLc)68??c`&aC!w z1#P)^yiVD^sN%n}oFM~GGg)+4-xA^>$QoZX=_$9r zptY~Hy3`?2BmObTVKmX2q}q9Vw_Q-j`=hh5(R|t%J_-~g0jH^*Nv&&R1$*V%Qo{;4 z*@url8P+e7AglTJw+-<~hYl0HP~9Yx_v;+g75QvX_(6I2*?~i9Yf*gJ{q1k*lIn}n zkCe&}5!WE96}t-<<-en?0lCmPJ{D?lx9G(^_t}3oYEkt01&gu1Eu-t_ zA)*Yhn#qVZzId_d={seSN&PnjN1`b^$SR8Fy)G?OD3*hw5Mtww&KF*GpCu7yi>cvb zpoq{O3U+u}W|r3fGD0J_q4o}&j&vDONfErfnV4eJ6B52=3Ev%I%32TPsdtNw zE|M1GPjg3wWh?9IU9~1iON+L#4uy0+PmGnT%^TOfKN&ER+-$WQ7BC{f#z`khji$C* zM4v-F@3I3IJ~5R?awchR6x<`?j*T9uJV|_Sa99GMm$nXAMNC#V8!xRH3pU%cX$##aILamH1SQ5gA$F+Nzm7+e9!ky=VP;zVEOv7o)$IFRy2(9T$okMxDm6~GP69sz zJ&QuT&e(?<(}N`c`wYC{i$x6GVM_roxc1$LCnt1sTvotYCG96LQh6YA{@cj;p>Cyt znsMQ=GLLzUuEr>hY54m6&8`*hYWb1-)#LS%gL^g~)Q-ztoKNl(;sTQ74LvB%awL)d zY$oD-PoB5HXr*|zPDc&I)>|p=5d3BK%iCTk=uEtS{%U?rz{x=vx8V}y$KlcbIYJ`= z<_VpoS50wftZ9F{>nV@Re>N0maL`!dNqFm99^C!&*gqW5nu-S=k_{!+)QZt{qGAGN z!o_!BMgRmxm=>A19K#U_ln>#)N?$H9gF@WTgvD| zlpoLjQ1TG12BV&N%QsuaV3dTQHWXSdQK`1u1Y&6sk2I;AoVhYrQLaSW*bmoeR zTug{@mrF{k(}5fLh_-AiN-8B58d7;{?k)My>GxtVToTbux;#m}pz#%)842TxKh)yf z7l}YbppMMQhd#yBrKR*V*~@BnyjQD&jo*_UFri-*qS&9fv6xx0=55^B)>sJ???klR z+2w>zyBWqv^R?gR*&O$G;iF=1o3C$iuH|Bg6ro}=NuRnFUS!gR@OW8vq%6H-$(8Ir zX=!YqPS}rY-Hr21?u=*?s4AH5Zoh1%BgEo2+SJ!0flLCoIQMYauPm<|9Jxj&U%-^6 zPqQ)tVUnj=U*dVmgA5p zHgw0c$Vxoms3NYnL`_;~)$7wty>vrGWzp|cl(bA7`N5tBO&J+YA{UN~H3e*qbPDw+ zyXAVfPl?;@vv*YK)Kvo>Lk|3{_Rmv&58L=yGTn31N23R)#!5jrNkZG`JSAJ;KP-x^+@|`ckqMhcUPE-ePb__#MD06^&yY<)Z%23&ER^Np;06{ zWNWEt!7x(F$g5YT)?dGWgKtrb-sQbhNwp+M!bnLm;A>4Y<9(*7FJThfwyRT`lY5n- z9sHEF&j?w$wgs6gZushv7H9BC7iMKS_HK20$4-AKHcgP@a4*#NA+xd<;y@C^CSEfH z9;3UzG7j=suin@`pDztHz4&DYB>a1GZ$@>aAJzv4|76y4>!^&-OqAJi-*dq@~h0cQ8qA|@| zr>M5Q!#F)ldNON94sjV6BY3e1F`?bk-stFEKmYwmJ3p-o`$;U3OT;4VHc{o}IAU1b z*+XTdVf9QHiza$M(hIZ7h9b;t<1pC<%80mgH00?>r{w^U$jTs1aMPbvbjR}bC5l35 zxog$9x-JWah)pTo7}2nsXW2XhctI9bhlSkbL1&X>DS)vfR3ZqmdKBHf$6h&l#yPYzxmbEM7S* zIUpEx9B}5fdl`&GjZHc=nCsau&_3NsglxSAgPTZ1v*`D79d|RSJ93rS&ve>n-5%MD zJ2KYgs&%U-YRii1AN)%s)7k6z@Qb=3Sz)e06d2cM%F?WX++`~Y7OL%!b76K>mhMi> z&!E72$`5p-jBaAN=+RNT!gNL+tgOgFC40&XMqA#jhH!g|x?S<&)SV$iV z>+h$j9ar>(Mb;D6FQ68Q05$Cs1T@>uJ3~hXb9NYZmj(bIDN|{=U=h9`U{{zZER861 zRK8T4$e1`8&=}Pm(-d3P(zumKz3sWS%(`Daru|0#PH7S?cS>n)2PV$WTML~?{)8g7 zaiPU!i`phFk0nC6CT0kn(-pXk9(6JkJ2aT&JvCn44s9ypT6{l?j$L=Oe$|ay`jAoc z%EHm!_D$n!mt=UKY5jUoTg4U4>YDZ-M#-g3PQg!2Jg!RxDsGstj$TqW2Tv$i)|qAfz|g6}Wd zA!0C;S2R~%HWzYE*I+TF!XUwgDB6IgDDhVUN&j>Jzg)e#wCtwuvV73{1eK3}b?Dz} zz?2)@#Z&8c#NFPrUWm6%{n`Q@5-*~uU$d;^@sK4>zi7J?JRd7w?rAGyKu=L_*he0u zLgnyA-sLc6@>vkwkmFja`E%ZG9)&aBufW8Li%5hZ(WAU2=1M^JiP(i)cY+a(*aq(PxrT45J>2wnWRqOc13kuse` z$126=IWgjS(Q)g8!uO)X)F||T55?8)RA#2cpT%ZmR(#C`OZtE{&3qEs2143UEscJR zD-igaU>%lW&QE#)$HV=o+dj}&BsNW<<%e=S>EL{r9OaR<+r8idu5rwm1UbR z6sCu>R-Tlp1@_z=#KJY(b%n{2ns35-FcZ+o_4pGY!MJcBa&>R{H1BfBx|eOg?F$kC ze;JYw1#kYz)!%%QPloUI(gI)Z)7&?^lbeKJavD}%#7sHc?Qsc}o8g-s$7rFf(7GM_ zj$!mE2+7iCnJ})JuKfCa`x+WQZjs{2!czyFCL1%n_KV+6pwR-r{yfNQ-8|tv3EQ<{ zH>um4MCJ}n7b+<)j|j8HAL8mq>Z=wewG%pKgH$*GKX70|EdO9WratX?+K0ba*8+>L z1HRg8W<^7+#*0H7;}QvDOV9weQnn7IMY1;0@*Yb^TNBL}XOaysJ2~N%^OpJ3a|+h8 zKEV42Ek3WXm@X!gtlkU7aan@as!bZYPe-T98{bW7JNuT2{nZLhTzd=9LAZJ!F9*WG zPPv5>U&_xbvksZ@_T{MwGy>5#OMn%5eK>OO z%)YkLwA^5slJr1`Z6P1#Uc?i~{=wwkF4Fg7Q#rulNp(lQOZY=^dArZ7TyOp)7vC?{PI=Zvta z_HbW`IBAfdgQ`P#$P=Mq!6}o);hfj+7Eg&9bXnVlJ(Hk;1?!DZOc`JT6!@*=bw+l| z`Mb)M5U^cOu^Vtb!P00Io#mUZ{EsGo>$^S=Vdi3P+buegF|;?f*@E|SL%jaoimJjM zb5LKn?kLrjub7&*-3GU=uxYK2_vLU{Po_upB-|Di0544qN4s36DQV<>>CGZgLO7&( zj`ygBO-U@wi2POUd|0_Y1h5sK?^#}7c8?LwRb2aOSZ*m(wKp6Vt#QfqN&UsPns*$D zE|2w!3k$*?9dIJDITA`$f*o`xWRTUe8i+7Cu0s4?6}qV-qKf3=l4nzHQ46htx4=iQ z&%R2C;b@$lr;hNde|^|mZnBB>;+7ANUbC>$Vu2pX5m#p+>{d@1ecp~R^EB}(-M<8L zzi@E@NsOaYhjU9J6VQ;hnhO&Hu)sbJVq8L3ZE>dU=giudT-;B^^81%hL@&1q2;bZh zhIk2bSN90B)Stq-T6*tj$F4+{^{7CrL z;X{ji)AtpqI0s*PHFx9U1U|sJ4F{__e8|UPZ%1QZ=QaWHM@s75mt{ViG(;&Z%1vW` zC*M;vbG18XO)$Rw*g5-_Hi@zuSI_B<{G`nAclBSh zUZFzRAg*<{3Alh4CPgB$`!&4XW~r|yiT4-O&V-!rzPVh2=5U!^r6d@M9?6cQz6o|h zdLGBxT2uv1RX#PWAA>K2oR{qHb`CDYp)oRpZwA(s+U z|H|bPCpk~w$t`=m{1R7$sr!65On?|xCwSjCn_ZjrD)Z`E!9+7&hYs4WhWU$zTo8O{ zCVS}Eof>pNTpuu^wqH_;TNd2A72V{7weBDwW43j6>|c~RqnfQd(A0%&_%6xn=_?A0 zi%QLMI-z>*?p4lROvp?ydg9RY)hdH9hV-Npa%^7MFJtG5tnJ=e>GT-5pSp38cDI)w3| z8D9szYUvcU^|-H9`}v4GQZ~1T?u0;RgZg%{!5NSkgkeV*O4bwXYAj8+YDS*U35LG1 z=zmhf6e)y_?O1KL4CYC2%Hxrpoc|2*sj>f1vvP*|p+Q!+j|iezLY8s#$id)FWd2IK z{Sg19^ka8iqu3^NzyEf#kieqFp^P-aW-OLLg3jTgbY280*S;a`s-y9Ry*`06$xA3% z>O`UEv9oua7oNt?=mO~~yKjphsQJEKY8VSc+vcYOo;f(t(bfo*XbWrzV~d5Ma09t3 zTC(-=vVWv#eoAS?eLHzYi+N`TDMMJTn9#c3J(V}@+X>Zz;HJHk*`lp=O!O}qY-M}9 zPzihX1t233guH(1rM9<}5@nq!;U;bC93%N~S6x!J_}X5>xlmY_lI@6nplA^vOZFNM zY@HE1EM*M1pN?z&EEkqGDo+_3ch{|`01inq{+v5>l1?>-+f`MM*TSvsXcW{?lU7Zu`ht zGM9$J8^7lKd~2+fJJs9?a|k%mCf*LaBki2jg`>qbAO;HcOL}`7j<4>VLJRNCl;HEl z-Fy1B4c1wA?W(ILN-fRymyL)BAY#KYAO7dQ`orS%J^}pW+%O(yVjB$p&N*{8*nB^rDKr6gXtMnFC+>Ux#)3ck|)RHgK-x#b--3x44 z)!WmmBctYs!k5YwRe7M1g79qYC@f#nZZSATa678-{`@Dpl{(8cfiI6wpH(#XY1^O^ zjQ4a^%uIh~@Uu**AIsGG%PxOWdA?XYtiaxmsw^eo0xX|sW^W6w-YgRA9@}4(&Xg6HDyR~>WfNJ@L%tlg4vvl1Z;nd zlEKJDk3{YKBL~lqmw3$(TZ!Xo!&iRqLW*KcKPg) za*iIn04qOF!6j?(wb*n~3DM^#yZS5BJ99dSpLjGe^Gep-+vdyD8KS zdwJ|rb@R&iJUWFx9(I2%VpmwabSN)M@r;*O4Yl+*c%MKEWs|H~?GTMHOBl&e{!0b| zh-Tg3_VIyshKwWxi&FWlC%ng5IInO`6fRe%Th4~&BvXLA95CJCa)Lu`Ufy0dD+u4> zs`#glPk?Bn#C0Cs98p=fwQ-_lih9L`iZ>2PFk6}yJBuadDL5`o_5 zoRVhA(uCRPXRaEGYnD7>8!(=9DS~!_NHiG zMS_=uG~7k}Q~y+4r&q#AcH?{zV|U&lly(5Gy#ftere-vk5#yXAV+;)#l{9s3Z1-Pf z!1CkXl8?xiX=J{U5yhwe1^wnl%eHc=ZuX2(47uVq3f<`*g(nz=i!<-sMMPN`npwW-eW z1R?Q$D*_9LLH0*sjx%rkqJ9tmO`S9TmIHc4p*;{RnIjOk{A%j!oydpg&OMeM6KnY1 zI`1Ny>J4(Ff?#bOFf`&;2Ou5j*T1(!%V8o-+{(FD8RC}&&1SHrrbjB`3)^+(1#+Xp zEEu>`b1EvT#|VEQL`F~3#tPAL3{Gv&zvYJs%XWN1Fj*0x1gBmXxM=8N3)m)E25bwpW@{$ zYp2`LD`SnC_s_wauTF=#>P$~C%;A&uiZpS}vdkIokQa2J*R})&%^C&9nluKalCm)q z1r5FdLw!jB9GAW2B$_U4e{?Ote~Vjf_iwA3PlL5kooJ2FJ!3Yv2)X0dn>fFdCx!w< zQQmyHvSYPuv@z-1m?)JS6;Zf-ob7J-^+<^XWs{E)tIa^F`-v%~o(;{&H;CzHmrbS6 zso$YojCQBdIv3PL(K8mwrbwkHq~V|E;;Je*EL($v<%zB;Nl4xH%*Y+a0t4Sm(G7Kz z{2=P0p09~gBFy}ES7(MmsX?B`Eg_&6(d`a=BUOoiRtD@TPCq`nH@SCbGX;Nkk5*V( zOG+SU=-3;nk4x#9q2FSoem2pltG8JE>b{ju(|MzM7Z*x;MDv=<#}jchFePPaCSiy! z7Utd?c4dr57qz{Fe`5}2DrwF;(ZF@#5wi(*n7H{nBT9K3hCi*>Q^*eO3#t7wOC5ue zTwgp=AZkzS?5sujZoKi}xLopx@hmlZ?e?887sjAi$=PKH5FHoq1jeZ&op5o6^j$UU>d|`m?^=v-rILjU1$JcwJX*OXdqV99k<4S6Ip_`u@Q%s5)aDXD6IIH+W z4TY;x8Y(D5yizg}Dv6+B6t<;jjH9#FjXW=3?t?lubE0ianyHoW1oBEA$w{>5O4grpB@bWtkT8Yai{ak?ZZ#Pamjy8*54_oTz^$ z9ecUER=c{Vy?;2s+wltX(1sUGi#33k*q27d?T2PZVQ2Otrf1OOlEcOkMFOddL|paDtZE- zhM2bH&8Y~FWGik!UWjt{vygR9Csq}5aw2ydu2g*6V!F?rQ0_5mi+()QBeb0EjGfv^ zFte;R{}9clMEl7NRYnfn%Zz%Q8y1eiXm2Xadmj#5nAx2G=_@9Sn)xkWcwy)`<^+jnq!IDQ_&T5#mP{%K(@M;Z{?G} zK1ez=ms2{dHP}NA%j1k&Fj)I7JibLk^&u2I7~Oa3ics*!HNgU6W#O_K+Xs5*e)lDS zqB4(_JQhEeAHcz1=iS3?Yzz)#|8>pfF^(!5$VPHd@r0g*QEmwd(OB{XdAha)st5Cr z(spAL>-uRz3Qe}}wiCrcIkH%c%P$AK)iauD-0E3Tq|+;}Mb|6lo1XOG(JdiBRIQLJ zArcZm;E|M+bn+v=-WEQW5#u3|cQY58{+dV18VK}>^1Jh#;QkbL!9Av6Wp}x!gEyxK zF}K_3u{#1Aju^Xv>o(E~HSzLteDc~`7^UGb`mNJ>fy6nz8d&DrnLjqun!Z|JO;%W3 zST%pYD7k&sV?AP^!>xelws4W{hM+oP`M!TVNxbot00QW5ThsxItpyH5!`yC7vLX zINNd%eg010Lnil0VzS7Ih+@sQ+~u5De0r06N8XaFO*t5y;{$8iQKAf(k(&JzIqYAk z&Jv71Bdnm|y$=|?x(Jk46R6U3N4Yw0pI$~2vW z@tWqEMc8`TlLxe#2DLES`GB)Os@;(N(Rcku{tnp;Q$`jE;24;CGv01&_$7Cw5pGi& zgb5;0Lbe{uniM)2wXm__S5{H1`8OyTng0q%b5)F|F?UquA5}J&iGD5{tLuzfl}PVWC#Il|ZJ`p_471hQX8B41xQd zoeioX*=QcvG>lGy(6_>Mnhv{iUlrlJ9?`ae-Y_kZTnWTz#EnYUccqjX#*3SgpNX3Z z@OaNiVz{d1(_0lgH*A3-412gfBGdl^U9!j%jSp{u_^@NV%99$0?kqM`up)15z0cgM z>e!qsc#`6;xnPaFb4@^M&zt6Ms4?j!dbS~`E5sJA<)G)S?^&_!X64p0JOD%hf-?iC zWBAXBd&}f##v&S$wik4xyT`|gHm=Q6y9{c21aI?IAU&21CyoWe{l&wL`VplXowCur zMr(94o9Dcht<^?E9Dw)f!&Y{TYT<_Ukos6W%23(>-(V93h|#(v_`p!ed|KGJcU3fx z(PnFzjV21|EIHFZF0Ia7gN=$frPgiUy;~hkX(V@>dnYM(&omWo#_(XidCbHsG>H2; zTH{tiuqf@rV-laquphH|66fjGf)Qjm4(+Ociv9Shdj_7s zcvHQRW_zbOVb^{$7QTjqn`o5pEM`|V=x)h@;(z*l%lo73P5K$RofpwY3e3}U24YU< zGVOCh`{omVhB{Zk(y|-JDnvMX3x$H;ubcC=T1PwK(>r%=tYQi(&I-jpxE>OL+j$Iv z9x=;!GS&SyL`h;m?{K`qHZDl1Hkf6@N!KM z3uom@2`&yI+fR-~Lhet+iu0|F^vk8PQwJ-SRceTenug*-clzecK`e!?=!*jsy6_bW zOQ`2#*-9)13a4L99cn$N78Waa!sC7ZUi@2)3YRnhGexF;wEP7%M9Byn=s5F7LrHe3 zjP6`}FSxDZNqPs2F*b!RtVzt!slM7`t(kCfK4`#i+&_ zH8b-@jvB+)l`iTX+c?X_?4e%ku3)&<5)L@B#nv-*snLLox?#)cFjEwQ+`Q?%7ZjUF zY~f9siS3fSx=imZBl@_QU+JdU&edfh=b?E=YD@on+eJojXsziGF)RH1z}KX9fiM&$ z45)dkBB9>M_r7JJj)N~gQv<99pU>E+Rw{tBIq2GwO?o7Dm_oAAZKM3N(I=LjY>1;M z$TAY0?>s%mvZ<_QJyfi{4LB@lR?!(@WvshNHL0J;anHbJX3-1 z2BVg(ye(UH5}*Tl)(yRWuguv)`_6_$Ycu!y&z+Fpsq5EC?m~X`ArB|p4*7maCU@$6 zc@2W52r4-!o9JypLSE5^)h_0#uHk=erPdz9v-Z^dlnrSovdh^mBF7N@mxe4i9p?)x zHzPF#PunnPUFB)%c1&5e>*M74vMbXPoK1uTlVAXuYJ4Zth|!8MxS_3}97H`?tI(J{!y@OG(gczH67 zJ>~NEKxy;!3nHZpy{Q=Ex>uwm+Y9!ks()D7nyOc&`Ot2WEwKcY-*W<4z)As+`G&9y4m9)`F9P0XFzBKCxK|L`au5e5 zd)R2&aRxfS-nLw*x>hAus1o_^GG{?Z^jwCi7|*Y8=d40LVl|)o?r&e@x1V@jS%HzQ zyL+0hkEQ4gpshL}AL>M=Zil?eZj)2pcsKi-0Z^dK6GF-YLGKy4=wCP+-w;6uR|OuS z$H{!CrfK^csyF&u$R&bgPhi^`8}ry;%Fxd6R3fOiqk}#+wZ)TS!fzq=WE-G9=(VW@ zHncYXN_n5VZI&e5*xNoUC6bcbxcw}0frI*(uZ3;_+PuArt5)f_QOVhnFAQ{7UC+Es zk-2s4l;2O2N8X-RVgE+;O6K<$pZShnF!+gaqQv~g?JB1uhRYEJTm>FWJ=;BAx^?{G z_2T^XlG7$J5%UhI$$dHEgG+z-Oa?vEs7t z@E#yy_V~Q>Ljf#?2W}#0c7G6KFjSSJtuPO>1iOqcgtD{e8|@TX!}~_#LvP1+VmNay zQh<8KU{k+xn?*x%qInuN3N7bM)gis1^Gh*tg%C?SZw9+dSzh50&GNKUPVd*Z^Pk%h zHkmON)@IWBKFt<;OlhXV3R`Ux^p6TFKNR~zZjY>O;XBH7JQO3+R7f#>6*kI8wz{yM zW9e&0ufvEoZLb7WH_OJRFPKKtFb`(A5L#5|c7txwC@nNVBBA5A&R$hW&E(A7>*{2h zQ?!&s=COXg^!6i~VT!$wop1q<{>N^1PT#o4>xJ5+U{w}v{B%38gwIl0vkhJhYoElH z?1=@H#xp5rh22xvjPJYv8}G_s!d}dRMV-GL^Y(g&(%DgZxBRhAeAd=#Cfo3YvDJsr z*Q9m*dO^6kkY_ti<_8~lQO(b(-etN9Rnp988eq#7nA#8$=H>hQt}Pp$O%ty=X~>E{ z+eSQvAZFv|@yS;7oM5os3?D`4ijExiNjN%!#Uolex+_eRWFkwxC>$2&{Iq2a@DBTE z`ylPYZo08aC!=T}ctW7Gi|DDVn32?8!=}r2&a&v(0k&DWThym3@C;uCne$xM9ZTQy zdkK%N)$y9FRXRP8Wmad0u(io*Nxv0OcvXpg5{Xj)m_2A_NF6HQG6QrMU>BqjVGUPJ zeP!Kh#Tm%8`A)Wj(#%e3+S>yeWUPPEPv6@78|>67I)w#hbKL0Qv>9W@z$-$jn1QI_ zlU)8ovo5=x{yt2zYvIN11FYF{TCpAq9|GrFB}=3(tcZJG9Y;eI8`<)c>YjYa7#Htg zUrhcbG@&_{j-Jk2Z?{@ojiVDzJ!I&6iD?Z1>M1h#(v?e5&>q(-Y^v^ILWq)J;!`)Y zch@j)klK}bdo|t7w?MaI2jsE_-9~SXGS%PvZj=+w_-WiG8*3R%B)pg#D^O6=Zz5A) zdY7ePXu}httSM%ZGdI0W`DF>$DV^+)Z{`{p7>(3#7v9Cxk~il{=nFHev9M8({wY_= zta+oVIE1cO!VTw|@+rgJZpEe>$z(jkgxZeTBn!e(^pgvdN=UynnusSdjQOdWNw`6 z6)d>S1D1diZ?i3y;Vkw;+hdX8A_~Op)^WB#qzc-dJ7-firHETJoCCAy;k6NbZE#Ht z0&+5G$;{taq}xDz4anEOG}I8vE;pgwM|wK*V%1qoR!!n|OlYoF^(7o#4uzc~l|v}b zAwb3#>}3i*Q{mU!(gmg-d`z7ntO1bF`S|;_cI|^Zh0t@q{p8oj{)Qq|>??u0(MR}Z z%($_#f{;bd!`XK9cZ0&_Q;53@zaJ-j;9@`Ap6zTliSX z3R|0f7ej48fDDk8MX8wh`)wie3j)v|y zxh~!1ktd0`FPHaE0^!*}nulCqsc#|HnD(Fno)t3u6@0|XrS1X+Ze7oT&>`#5gwA@lmFdBe)EC*3LWGS|){}!~+ch&%k*%s=_7w_# zeOmk#4&0ix=_6VNHA;P%uFxYUi7J~7R2(6yqV4XR6Iylz?acKNOMV@%#B4^{*`0!I zIepC;!M2RTckIWjteXYj1ucCHn|lU3o={up@@0d%Vv)4Oc_N|F#bopmM*NP-;m}FF zLYn=i-LTH!4{L}M{G)K-b;%g@#7JsYt$QEU@cr!M# zP6Gz81w*?= zkw{;eu0F0`5;{sGNlT`O@6H+9GaJv`1`*}S1~(vxV*jlNh4C*cX3P=vtLgrB*=G>V z5qjpnB&g&KgmlyGsSQv_lkRXixo^xe6YU|tusY5IWMdZ}P(_Dv!L~k;2V0H|8{A|k z=&)}KmCmD@K82iJe$OOGqTUf1W;kudy1&NT@AWZfQ>7N+dJWp5tF+PPNl;RnhVi*nsI%OKPs2SS zbD#~{cF!utW1*x5lvs=QI?n^p;fF1b$W z#15y&y0QGU-?dX^9rUx8!uU%mnDg(7{8_`!8;b)8_M1ZGwaJ}lZrsZ%kl6afDK7kE zU){|gF`g1eAR4hKzjMt_45Q_1C9C$X#`4X^yHFxd{tC@1+W)-+cTx0Kgu(;*T!C_x z4U$b^#z;t5fZu9EAx`c%*`m!o|+pAC-pI<1@CIKD9IzznGyRoK;n=)SNojz-G#Rk!H(M#X`ZSX(>`8z6u~h+|!BX;+w5$M2DFsUbNENzCDJ9g^>9 zP{?l)#aS$}SVtEVrqVm60x|DZ)}+F)z4CpzDX`AnwX;=WVOwglaEuk!Z0pw+5QxIS z$DrKUmblp%-qhG81_%tOcQ-j0XTBg>g>qA!-PHu#x03o}c+8!lnb7%HV)MjCFIb3g zxot&AWwty`Oe|8Q8D?8yeULJa91RS(tsJZ3Jr}K%JfYbHg>{xq#M7`bd1F=4Ux6>Z`Ii2Ns?AOQo zMq$#^11uqC5*8Fk7w#Bp0Q`ryW3}G@4rn6=qtGTVJ!Iz^zam%~(m*K`n{8OOWlNzi z1*|@QWwH|4atL~H~u?uIrVI1xFJyE%py8~ zw#kHf!C}1thiY$=z5y0cFLTMr0_egCVgL&jYHfA?T~AG;J#`)TGJ2P7j14dN)Vt*) zi1-SR#uMT`@jbLASaQM{d>z17=!>DL__MFJWwDtN4bs&cErBUjo_X~~43glWPhlTA z+Pt({-h-xrex@N9h=^M#^G_PQo6uoJ9I+CfQt9`4SZF&%AVcwZq2kM%H?|2 z`U@?EwL7-4*%*nZkHj8h=+@_Uw_uQ{<y?mnDV zR>>PDu}9w)BL~z|mob&z<12lAxcp^mWrFf0JJCejS;VehgdTT`%zjT;`X9v@Q zZ)%mBgWB{s`;yTlppmn)HhpXBeLc1DN@`wuAhrMj!`A5596A0qmyhnz@iYW5&3MjP zVvU2I+r8AI(R0&lYn`u(?N-JjxQ)p1u7tNM%FvfFrlMx=F*)82B(qZM=D9n`MBD6+ zyM-L9S`xObHVeq(Hrg2L!dlC(x{A3vSFp=LVAkABE!32_n`L#%mMdU zfa?v7CTr7Ww(KjluI&}E41!sYfA^*P^rFw`DUPzF#*nuxP-*5a278au5vI;V>!V{F zAe)JSZrHwiv#MU6gHa#(!Fh4jcHNiwv|!AL;k>8I{?X1q@V?Aq;$O=Jnw8?&1%)kK;OAjhDGOgQt*ZE>wz*HEPek+%=m zOa0`ReT<>;RMy;C{Wtp~zkP@Rv+G-I5c5C&!B2{cm{=UH=(-fVv~xWjZfeH&)07}iL(Ud|zxJ%+3bDtqDuNgGIkhA_7!N6zS_un-$^qshw3 zcsU2%O3`k%{J364t`mL2ewKTfDx;Y!w6jAo?8y}rNS5*nIm|fZe%w6Aieh!)uAA53BELtGdF#vFf$2@}z8`p`a2 zychY~7w~`?(9Pvm0qz<~o{1tBD<+rnAPY%zJyl*eot-=MPeWqyQgq5{%(ZSy#eu3H z#2QUMxRn-52fd8Q5voKl87B_xY5fyw;#H=^wjOVXI^Q9pwIMP)kYErEEX}FPAl%sd z&TyD(X(0O#(Cjzc(?o%)u*`zmiqB((R*w`TXhcFjl`?jEGr69YC zL7b4^6$GUQHGhEekbGWGfM=4iw)k3XOOe}-?J3Q_uFP*cJgOjHtGe( zm~;CGaaQ8Drg|9G0#!w=v(Zmt&@)x*Y8OosAF#r}ToeeX@`A}P1WV! zxP;bK!nA`JUZ)>uj)fh&p1m$Oin^m)yz^MhXV}`>;kxNXWwlum+ezt6z?T5u4tsWY zu*_$-(z-Kr!+NU5T2OZ=dA)>4NLWj--dWnksJ>Q#8zqRoU1_5^p(4rJ`1Kq70kyq; z_@1P9#k5Lig@_MlE+r=ptgo*1F9w`uJE!8pI;$ooaKReC#>b%y#2;LTb~hxd)17PO zZO7}YGEc|CZ%X1yHjxuy?}t5*B-?%zqk{@DF6ajPM8noMaQ5kC5;97`25{)B0_FyZ z#VQItP>s@B?>eJpB`B8ZUzinJz^tM(sab-1Piy_;SIYXZH>BnTld_2dzcAulmC4$7 z%_zfpR0_mdsP=G_XRPVcYMDQI5AdUWlt$3d49TkIZ5`iy0gyudw6JoA2+C&(@pO`f zpSlXgMuo`<7})HZNSF0R%TgLt(ByH}I#!!#OZjO-o35>2D~1d#-Wd#v$R2pUEBC^! zXx|ZSZUnL8#sEc2P{B?3m#FE~9J)IcJ1_E7Say|$_s0~hY)D(Tx5;%=pZfI1cqQF= zmFl8h?e5Ii4Na2_l35;@D%T>gbj=cMju<4BaMPn}(Z0wtS<>m=^>nUWtAd<}NC9Wl zwvO)Vp#j(@*fQD^R_8DT9>~~)8fY$vQF_3cbkb9ifSK2Uvs9UjLnUuT(7XoRQRU$5 zGN~p0)ieQ4mHgu0&}S*Lek-DBIpgN5FsdIqsqzf_36~rPVX^%=MhkG(5G?hSG2qkcoY7@B#N>@Q)=MO==izQmZH-Igym^f-6_u!b~^M|0je zyWZwOV^$!~M{9+1W6{!T0My-6jXO?pbPjW=7E~8~Z3Zz2&-zg*HZB{}3B(>OM^hFvIbh>9^|{Ci7JwC@Hu{z1UUOc*#3V(Fxw*Y`7)gh8SKYzQfOCs)~~$j7F#g%LD@F~bRc z12K*o9RatZ!)#!lN276a+m#?rHbU5^afS{=x9yFTDAVMXr2x-4sUjM@EYr;GKK8Jm zA`8jqogTH&uv*9taukQOc2`=gSA>`c9XZ#@b$}M=yQvU$6vMd^!1|q=#%V8FcpEB+ z?Magr!0!qqSa5I(8L?!zzFUgCUg|txZ{0mS?DLZRWp;c(*ue|7smL3ayg3mSmNV+S z`HHdsBd3%!SG!oq@&_OyKll4gY5)k_^B_xuOrKHeOvyH%8y800klgrEoe5*tmCmsJ z;%2`vkpyVl(Ph~zKktOYBlyl={T$r<(BZkYg?G%MpnCoNtL(-S{6Jsx&;t%AoR%P) z!knAL$iMc9;xDL8e{~vT8Ly)gc!uS+yol zz}%lJ-yv%<#}kq(L{JDQlAlFr>1#bI$>;9F+ify=)$eLqnHqaFIqOgjvt@x^g&ngu ztC006J$M434SSV^rCj%fc;4Kbxwf3$n=H%$%=V4OBp7O}mJiiL5Vjs2%bE*yLo8K8 zHi8XsS;NEv6NfYsiqTOKuD22C+9d@p{#pfXs~gnrd4W7^GZD zPM%dY5O`G9nNSF{Tk7me>^g18p#tQ7E^Z`D7$6@krBoW)iVn}~J0cP~b&L&1on3Gw z1LMl?LU+IAXu$e;Yg*Buj8W&%! zOs$6rPicdI0WLSwt9`EDJnmJoz-H>(w0(oUYF!ua#1r-@Kt$9!Q;#gpQM(lKvGqvy znz1W-rfOsP!aG@@uGCly)v3^m_0gtcjS;96Kc z=o8!9^_UlCtn9gi9OCJ#sz4PoHZgWHELVFus8!;57mig2IsX|vu2=RIcV6JnIaz-k z#;=dF3(e!B56Ph2b@t^l54Jx)t91s>Gn#b_D*F*^c zB9c?j{GlLdMVL41a&V2{RD%E8D2XYh&=0I?L2)d6?5ymb@BC}T(!Chr%=RHMm^Lct zEz^Z0V$3g@Yy6%i|;&t%oy+xzCrZVG)1x@MQjAF~SupLs0!Gq;!y z*KvuH{QDj3{bAqz#LsbTjHAHj!0LbX)gO-s_*`lIak1$C-Osgw0ciO$-_xWAq|x76 z?fb@?H8cAoym z-bBdufD*=o(0_FF=rHq|Fr6=3)j)FEg<_=K^9~|giE-kOOZ@oB$cJNNgm2R>o(I?b zyU4yn3s9Hoqk_M4{vS49oC7XISm$5d`jKq^#wY*X`|o0DA8!8(^S?n^W_oD?uQpM8 zPiF4}v0vi@U`Q6)wTg*zzjXd#Ju`zeiR?{u&t9lr^vDYAp8=46hDL%bfC{s4+G@!I zs_npLf4xI9^G#q9F17gEbFUAECI7knzx{)^e@}`SOb~uJ_dY85%N4xinrkE%J+J(y zk=#HvVlY+biqth@;yBWy*y;TaBquVI(#2bZ%0GD~3>YYFEGA)FMtcYVY5IWqBEV@W zdg$|gSl9#1nT+Ozux;{<4>e*12U22xLao{V7r=Hr5S&zb=ii$Ik7WFOUQ}L>J`&IcOx%IH()rMYrJ?WTO`XjSMkQ81fd_(hQ!7^JiPHtG~d0g~A{!`9{chJDasF!H(|o$|YIu$KSu-hX*ZzuCPQ zfTQJue}AaISkG^7uJhf$$v212O&{&_s-#WZO@#(@V_72U32^(3IBPa|Fr$9>wvgY{C^nS&G26Y{~eV77s3CD5)aTX zFq!?o9Q^mB?teM>@4@LG^mU5k{~HnAdhI_8N&>SV4W^OYJ2uwYTSW2ie4V2;KW8FE zz}{Gp(qAc?(Q29wyumMJ>-|(E?{|P zJT?Zt8GVzs;^YtCZl=G;`@Dl*!rDFlpt@TBC8^@cu634{RDHG7mM!sH)D$rnmF2V8 zt~?BL{$}pwm4j!}<%(aO^V(k6-x$5OMX-q<)3^}ab>d|^9651|V}nUoG>ba>hUH>z zaka^hwvL%GV0r0DzaXR@I0V7Qug5;o!G>%R(LbeeyD{%bv##)G-NT8JM;A|H)wQPH zUrGgMX|Sc96?iHu54QeE`SACiH8WrPT%~a3w-<-Vz&lT-DL&JPn_{>?jZwW(C|q^y zAU*SzQ9&<8)6YC{;jFS*kJ8CCU_T7WN3N#<>j8ypLmxDM{q7wTtF|Y57C79TL)9cI zdEIE(pC92IagDRxnaLGeMs(=2a!uR4O z;pj7-RD0I+e&_)%+NF$K&(9y}o31X3{czlqdq&nXOZ9_W@fsX85Dg4WRFd!1uVOif zC?G`o3V!htbsasq{7J~cf7?p{UIVNcyCin+L$m#lU$jEq;&9-&d>*ZGaI=yVC(qz+ zaWK1;IO{a0asZY`TT7as2JbCwwnbI4@Wb67 z17`N!GyXb%)Up3lF31Np6YLYEX@y_dF^BE~M}B(C;KV^p&@)oqXOa8(g1fafzAE;I zg@*x?)H=CP@Q|MkqN8t8*P76}JH_66{Lp0_Xz4q?12qwx(tYrEz}E7908rJRURb0$ zbPhGYNqV`Dsy9AB<=-9hV1m#3%_^v`rUjKcy;{0)@OOX*UkOl>!Gi6fYLQ1R$6iT& zr=*?C(W}SS#)AHw1#oZyAZFM79kLl%?C9Bp1OgDg0}wtdO9VeVbQ3?FlV{BL2)|+~ zSnm)o0w4_u`;M#Wt}*4qE6OYY{!|5eg!Z3U`u<0=|7ii~KaTrk^y;A_np1`Ral}RJ zDAVBun;k9!M`T>JPuS0WM^9F0q{Z*ysK0~X6m<}>kL*qk0-K#M4i4eB;W>yuU=cPp zfRmmi+5eH&{ioLyTq84`kz14Zqy-#t_K*xw=Xlt=hjczAa0==Nhkx-V5P+{gg|;}j zRkPwrfxF-DJQX~0sQeTF;){O(QRPP0cl^*L3@YT1{ z^~*#B*qZU&sO6>*^hSx{(W%=Uu?^&!6-b$(kVD1+Wb&rT_q1tHM~|P66WvXz*m`rizq?wh zvK|@LPMv8MDzT!UrQtRFQ&7CzBiW-%CE{C5TJ7~=aTz{jWK1VOT zT5a8aS)9S(q2(b20y?Tu{wlLt-ak2N?+>6r#o1YDVhfX;)E~HE$nIxxl)3h6TVA8R z;bwZ6*v{*`!aJ=5^^?ID{B}~H%Yv2=2_^S7hl6X}yBe16OpLgU;Sz}d5e!!!0;)0V z`TYa@{DJ+dsQDIGq4n0$n`%a}=fttUM=5|sJtxgSRGZqe@~TkrMRd%4N8N`Cv4TT+ z^#kbmL+{HgKAJdb-)6C&-b-{(VG|3cZ`%IWoO|(8Ms8OEw7ySMerB{(6GcTGCgpbO z2Z?)_k7OP#o}|BLzL>=6eJHW~N^bAgpPzHSHaFJ!abD*^YvlWZuL35L6Vn0E{jl9Z zz573xIkOj9ezV1ONHPD<7yj+CSMPlk{1*rMtIhVU#d{r)o%g58c@N8jzwuj{XU~pa zLLRuHd!&rDus!mSBtGz!{a50h*fY}%2TA)s{M#4tjt7)|7GUF zp#FdNB-o#(^`bd&azYu+@*lM|e|f{dxXgz;fNZRhD-8R;{ox*=0fjw^4@W!IYChcf zi{<|X|NIYx&r~aerP#+9XxaxkcN zqRlRq%0hL2B*2P*wa5VuNcOPQLzEb};p-A<7 zE`cVP(ipl@)E?a=WIt9aWIsZFm74vqzZ`D?$BEYxxRXMHJqSnVyO)m-}NtA&u$% zu`;v!UQaujtUtb5=-*{@R5Q*iUyhbsd*iX!si|E~sj~tl0?m&?6vX&OMR+?+2L2lan+Gz@_$Up>A_M8wl`T8F1ZR32*s) zf|Tv#LjP2Jj@CM=2Bs~xGApcbw<{))AkSGWG8JU~>-C|Fop2*cY+gY_#>kUHX5F6< zAej2`OP7$18HHk`J%(QA6psa6z5=*eA*Us-(GtB{9*6MQoce?99$nwF95Soy_Y>43 zY8orf%AC-uQ}@O4SC=9=V<1s#!K9#pI(zEO&SkP`oxL&h5$LymlliY6kXaQ8UZBSP z%{oxibpq0NRnqyn0=#WPCx0b%R5X10rQ))vS87fsK`getUeJ1Cef^Vujz=4e(NQTD zz55%z_RR1-f+upZp>Q6SD*D)03`&ONDC_w6>bBj?2D&FU^*v|`N)PJOKQDT6@jVrv z{MLB3%I(IbU>BjJdM8aJ`}O?A_w3_uLG&hm>}~VcbIug!{ZW(T7jv?r@=4+0Dx_Dp zcwsvY>+AQ?DJeBiJ>_vG64Tgg%jzZ#7unvNv6qZ&>sIMCTz~7jIAJLem88+Qwyg@4 z^?E-QmSskO%5S`TwV=>TT&s9GpTTI-^H<_5P2Eo;PoQ!z2F)D6;USye@mkGuyG}Hu z=stYkPvmm|vDgPSS-fin)O{Gk=R|{FBPt2%uhnj9z2-xpPf!6vb9=y&>YF00h@Ws$& z)el1=mocs`g`;kj-Ek#wyXnsr&TIB=8^uF&-}G~aZHnrbUcw-87OE0y7L<)~o$Li& z<2S>0jqLU#O&oiS9uoF4+^LO}7X@C*INP zKXH@uW{Z~NS5wlR*sheP9Ub+UtT5}(bQL9VJ249~mGQ`SL8g8K*ZI{C!rwaI%^(ho=TtUYiO7;S_vso2K#n!pyRk3Ubl_ zUKrh$wF~mR%1UicDfegD7@W?^bGGB!==cA?b}lfEB^+_RE{nY=6zM3BT@o_r2|k7= zt$p4I3;fiE5M*7UEP?1`u{otfYgx*-NsU)cZDhGe@2u!@kBW$!QyM~!fyUWl6u5%6 zoLg>yDm(J}wvZqw3w7|<;F{Z0VBlRHA7RrHjYdb}18d4?`9mp7;0-qr6frA@4G(e% z`LsNJ=wBNV?LPLiwk3K+K^@wXXGaCbXQw??z=o?c#pxOi zY6zwcA0}!o`Kd7`tL=}_A)KXvvG~W`#boB5)I_u!Ed(b!45QG>XRf?`- zRw{C`=_cOWo-8@og*h^`qDk~vMtTv%i#LP)8!7vhvrG~+8oufujM!P?TsCnJ8hy_3Wi3#sf|4Mz@x}f z>fTWGsFAb=%v_>jCfxq!`+B+rrrrbc%VO#b7 zWeYd^8X#+HlyoeH znrOKb>c1)HBbttgz#t(bn67ITxR+HsO#*{=@94ZK4i}aWUa>SMdWiwAF%A?5ehvP# zf^iJ3kxti3xfDIWnX{rW?h=@MJU->pn#?oZW~o(P40|0Fb&k`ju)3a*3r7UQ@H(AL>X`*qG($vzJjM|hjZRC4}+0r02ABVQ_`rgghAs~QYw>|Ma* z@%j3)x;T+P(8EJu*}yv_xkLDcG(z6nQIp)fjRyKC77V90%KT{x#;4G?L6k*3d5pRw=x);d9$fKOoIm132E%M|9-aeuOdXQ7D@{9bsZKvAej zm6yd^lTKkwNn+tS&h-VyyDQ-3M8__;k1X0m5@I2Fo3zVeDlZGq8g6P77lR9?i-~3p zi=|u%fns{{1_!5IlEPrq#qh}1`P-E~3;GV%Nfr16iJRxHvfmrrmP!cQ{NkT7d)hUNIQkG(IF71l4gVK%-Ujs7it>;Ib=!%p3}O-XiBmMmFJHg&!2snGS5@L zw(VHh-(j0R-1{pTw!LE@-ajzB?IBZWUcRXfBJSFh+~pSAx!=;udVITapkp)1piRB_ zo>Z$rCvFa zyG%mUr!KLDa3wft)J>s_wcmM^bO3`J5?iZveQ(wQeWNJ0x-(UFD?0}5r)LI-#Oo)K z^+HC8vtY?|(^`ERI<3KlJR`*XbJxM7o!wu}MQz{P?ln@}9~;)*tu-`9OukzZCW5Rd zD-?1@X=t;3xXN!{Ie&^{T61si3$$&l!nu=auE!1VI)JvKu?jOc<3uSNLgYYZs4=E{ zqGq1X+KkKlb46R`^d`(VfsSpyqS`G}bsU-bx{4dT*Kkk=-7x2?+W3H*RWI*V(iFE? znHXkask;SOZu#|6K$&}g_~U8}_CrRm6bcxXYHPT;18|5kQgW*tXi8u3;2swb2Zq(h z`}&BnUhpHtdp_N>Vz(JkCA)7cygCEdKgZ0>%|}arC13t(-1=j(anFt+xF2E5KQn&m zjpOg#;2@S}-S6`tFJjPX8Nu1$BRzWTqGepKA(O2*ILsI)vgmf#q(tJpG5 znj?-{!J*P?$8CEcO-9U4zqW+Da0?~6uyKc2VU-Jae(Pmt>j@p%$dc7Mej92iI*Fy6 z=`Z$hZ-T7vHb(z$3(Q=q^EVjp5xz)qY~g3~W#$|1P&BIlV19S3D7m-QuM&^DNeSi( z$;Hhsr#m$wy`AFw1W+Beccr_(v4aK@B)%Y9oQa*?E{R6I9$fB9cI)DxrTZgW?@JdA z+0YO5+DLI^lNQLOF=u?|X;x=euw+(^-Eu0%l2YILys+V>ja`xWWgO6tb8#%6a_(z# zG{TeOCCe5d107BBB#!xq%_vD{4XucN>=s(9e&|H(BlsFmRIBqPQ@%Xh@A3jZaxRS( z&&DqM!f?;ovrCq0rZwIl&Y~VrH=gc^D}h{(xe}w}&{SD%{Tca@e+_3)-?5t*lO^Spcb&UJm6pAX7odS<@|rNCID_OVHQ zRGi7_AU+srQ39AX5CqIrek?jLB#bW#sGRLPg&*j2A$EM=K~W%AC~$?hXz^Onn^ISt z9=EuGh zrqSKkA6w(do&E;Gqs5_wFa<0-4#LPMJ+1|GVG}<%ejQblLcnz@yBmt32=C6_MjFXt zHm4;Wd*@F>ea=h4ExoF~V3BvPMrQhAig!8L++~yK!#8o4L1v8=q=YW%USi`rJ%>Og z-!KvBsV4uO|HIyUhc%gg?V@8J#z90BLDsqJp55(7S+w^b+Zv48uq-(mP1+ zAiWcmUPBK(N>3nQLJ295{W9~Lv&EUS=ljmSuJhOTZ^~PqwVw5?df#zvq^IVf;DP79 zlRKO{p>_Onb|E*>c~WloApkxvFbMja59bmC;M_`HuY}0?MGZhRjTzN_qQY#vv;it| zQgc>4$x@l54bEGCXc9&?p42I6x_3*pI0chl2rjod!2zyqOrNx=_CLNd-KYkWJ9c*l z>(kR@M)X3L#R2TuwQ81(>*}HR5};B2P`MS`4^6M((Z@zo*8^I}IjcuavrbwQC6IRc zF3`+>GAI9uyQZ5joD8?h$8QkdhoG6 zKPmzNoePSjn;SXy`(Em$W7`kgYu^+{fOMt5#f1#I%g2adDg~eNnfR?c1C`UVpxh21 zbGJhcc*{KN;xc_lCaehurRZb~p(a?#8--^hm#H1)?s4a!&8h3<_7e659CbV0S@}YH zMM`O9u0m8Mc2(9PfPy2{`$TT)Ci+y{f*$GVJX|KKlN~$Ba$1NF8TIPQYJDOMs7DoSfv^)bX|e>P(b;PrX9i<;yRwjeC+I3<6@= zI(lVuk8vfM(UU}Yo{_yp|)Gxbk zAV2iDJZtCt0i zrh*^7ts@c9X4H6hll`5rfE=_nb)#x!c&9hW(R*@SR-kutHhl*^+4H~#wJblBF-?6c z`}4~PubQIj^bXrYULE_kY763{2+!YR#)0Ab+ITmqAIwe$2XElkcIYnh2Pw<6v3OzGeK?ju&Ln}hqHSTGDpk5vc~V}7Y+xWq}-tEO&QvQDISkbl5qLtrqxWdy(Djx z{@wH%NEz%bH{H=ViiEd8YYY^+!Hmj?J+)3_$NWkPL!KH&eVok+%F)5Q^DfJ>&}R&0 zLcjb~`%;T)4^f(u&eO8-_AzX<<$>s%z_L)XKx+<{8egp|pYKkM^T9|5gGx}XsN33n zune<`L~ZJUc~qt7^0`eDL#_fJS^b5)d7VWMj6?yXCF8>I;pZmZgeJvHQ{(mZs*Sv^ z#-w3_Up~x(9GDoVxZ2^JScY>&V^gIMo5>z8FznqaVHu9Yg!OLbXLyU=!2YRyL+Q#r zGw0n1hjR&n!}`RUn?;AepOKhMYV_44Y?(7>)KztthUN>x^dS~PX*#2N3g!ydb94}V zx!Ay^O|P{}Q|WR2c^p(z6<)`Y6Z;V+ZQk8gr~?$Ci222CQe^porFK1)4niGxx5S-f z1K)sGnOz{Gs^j3ugKdTx;vhp|R`vWoG_lRH8+;KJyPjV4dMImoZq7tl4~oL4uVUW< zUwO4rJGDDuW8!x?YE*rOeoeO-+}h&eZQX0tgczIYJk=ygygJA|sDqSRmFA4;bCI{l z+*QbX0RB#7;RD@jUPSeP8fVt$FV zv-k08pKBk}maP!u#%HMEWyfMk2tvVOf@Hw{SA7{j*xpnOaN zg((T|9~6Uv(!Y&w+8238bZ#YrQpv?}3h9M>0Qv*92$E{gNMZjP6}%q6Cm4W}?=T7- zHKa3IC{K$FrMl<=MVm5-IYO$a&sx!qY5?maGu&mhd>@{BGK0)W|NMH_gIY6?`2MfDd6yG24egk) zd-Gpq#A1YP;)%kGaVHr~7jnL#hBU;Po#{Ho)K#XQ|Yu&~^1t>%Drg&-?s4d@?u~@9;o@#7$Rw9_) z{Uq6dKM(@S7(!+wKgS4ZVfia5+N1nYxlW(__!a3Yda1%S{&KU=T~0gPp6sbjob{Vs zD47s`W9ba!818#rz()MG9=^CR$9k|;BN}>{g?)=!L9ixZI)_IwuOYb>$jUs9@!3W9 zY#QXh@Y0sNz>ZLxcveKVY~pp;s*pHF@h@=ju%!yeYWl%;AnW z-_q8d)}OC2H*Q8YF`Us;uz!M=80F3kI-}GWH_R2UxO4M^Cg7*{Gve2MJtOpQs8k-u znjPWSJ_IUy>*6%)6DjM|6m;jzmI>BAN9o8lJ)CPmIpl@-0}_KV#IOkfR8cM1rj6#p55IW| z9K{4fxN9a;21Q1ptyAGo=E$8cuD>aY&0>d^&c$a=CV5&#)cL1iiTdl07)=+cSj$61 z>qhtyKR`4(-|yJIHFtxr#N1+F+P1ClW)GYfx>cU8L!Mtr#bs)+#aA5pI~?wykC90p zz)q*_G0@jA3kupznHQPLv=3WNGR@0Grakkp@ARdv2beiV_(N`9UfnBBf5EuFTRclB zF@9mzJ#ow4B&3d%p|?+*0EOlU58Nh)KLC_V}UMJ9kf^7L)a(oAJN^X`v*q+bIYr z@5J^kip$M3A}7TQA+t|@zE|2_@bypKeVNJ26k;%k8 zqX=zHZWeV;4pM2D{f&VY$Lg`mRAi6-=a0Lpc2>H&imm_S?mU8?MK5>X0H+}9GwJ`R z+ruxfFg&?iAn8N@VR{pC`v;(uukcfxCTC1}s7TCqbdXqQK$Bns)a#uqfJ0nDSYb%K zq+-Cu6Pww#xAH_vRhlaspq126mf;kk?OzZ$+IdTO6l)Td?0HoGbpAS~Ab7Drjq&EV zSmF0N| zDQy^u^JVgDc$!t2*@sggLUf0`pm%Zn5Zz95GyfGrair&Jn@m96|@OBVpOYql;vceKCp&(i=W=DJV-#nSmc+a3f$)=D^9%5TSLG`j&kkXT$c9< zJ5_-LV(OZ9>S`E@31j*`U=VS+i{{JN;X$9ZDRDwN$5e~o`^1sM$z{t|uhzLjLFq;i z-RP^8R^877*oAJ31L?Mb6}dn|_!@vJDdUYh!``RP^mBtn$7*h@%eDS5cVo`S{K#VX zB+2SQ(#v+?kD^Nm(SS4lA8(%Pmb;`ZG#Y%HP3`_qzfjz{e#RUbr~1WdT8OM$k~*%e z3fmscpPTXC^lVElN1?V+?q2I|mW0$4y%`}7UBbcsjv~}`BbAjROY(W1A54szwMX51 z5jUff@pLVaQbq|5Zumv=77Z(^K6L~8xp(SW+M$)#;uo;(Y*)yoV@Cq1;+Tgrd~!e*Wg+uD_utO8^vCP%tRd=QlNMyzn;;w}=IEdfobqpJ4; z*E%RvdK{ExD&w1q<0a}|#DkiV#Rjf}J<+H)1VSzNxjUHKJF`B;&xyD`7NC>y&gnPJ z%2I@%Az{rGtjg-hdt)FW%X{3t9OCZPN4*ol@7+T7ZIWyb{i|xXq4(aP)hSn_57B)x zuK2#>{8B|$bos3hAFm}nJ&RdjW-iky_I7`et5{q|wouI@gdCo+$K@}MzH-?Ul!}p- z88d8sAB%}PXx*Q3Yr7x=ZSrx=EifQlRD(V1P|=K#osJO}7OpAOR%lBtma;W0wO?rQ zCCzAJyW?%drL;@X>9z}_fm{K$sA66^GxB=(F=DYwHtYswpQu}}H#s6XE$F09erA4_ zSn*=&Lk?~H$1Inp5b*jkb5Z1rY12f?Abas8yT+$&RTh?K(5XgQ;Vv$x zm8wI=-z>K2Wat?ddXt_kI7x&R01Sh(b0+Y{^z-pk%x?409Eoj{`9_z8;k>EKcnbuH z?7kaxwXD9#WLIWC-Z5d2F}f*x1-?!q>r0ReCP7x#JS&?m zTja<^gMv@X1&RbH%WS`&s=-w>Kc5{$ej<#S2e_8{O?lZ`n(9Ex94Fbztf#*BW}P+k zg)8Ehc77Gaq-aXGVH_V^^fYNE>^FHxR~G|BRytCQ7iGUKSvTJK0Lz}#bYp>+L0<|b zzkDDu`24bnJ?W{q3qHqf+X7GW9V>oYVn!Q3wIgx6afw2Lxc>Gb1OM~6z&WC16F|+` zKU)TmVD}<*A6GD_m=+p7++OOxa4Gk8Pnnl-f5?^f9%WXjF%Giq0Dm6zFx4*_V4}g0;@oV@V>^*Mjwye9{IdLV z8lFl?MmT-lV=Xn&*U11!o?#egs0OfS~(FG>AqF|FTl)+EX$ox0nDMy9&NHVQ3OJG}ajh@9c9UsK|M) zMUz@oEaoc1E0}CoBVG4H?$&Ns1iw9J8&4rUwNoz|_GEE2g<`wDL4P?&5?9umHEo?n zcvA`!xxs7<-|=@HD5ISNvH*pzmp7GW`%^AvJ7!aGp_y2j-DNwQf^Rfd<-|!@*1A@& zRvzCfiLfrNSU?hH<|pC9%B1-?BUur6Mxij1Ga{`}+d5~*4*7rv{#qIi6c@8ra-DyB z1WGAn1HE7xsQo>j^vZc=Q(eg2wGWYywBUT3KH8*t*Q^}()J3C>HeKdcWvN3>VmH?0 z=?a+xHQ=06)zWq+K}NH19i?nU!xqG3Z?lg3RcCH-+UDtVZ-K{o1tXU1#?C(l&FiO! zKVIzPmB3ASWo=oOERcgZb|Jo-)I%dFAV{x;mEM(a^&IFTZ*Kqmwa(=uqX|gHgc!k@ z)!?7rALkxBl04jGJok@LH#yCLdO?K0czV`^qc|+;5~^WrQ(YQ);T2bb0i=fM5M>zYUiVw?qM&d2+VyB+^hZYPs&%q=(xVK49V;WG*Etr)F<#S17N3xB*V_Hk)3 z7_6{zH?%S)%h_o_i6y_uhu`h6k8|3(jafQs2dMm!=ewzza$a3ygDyhEmRwzly%ldc zo~G%*6X;Cz3)jI=l1&z`t|cnvp4gt1rUdj-&1_vPSwY?a{aS&j+_F}4>(lF>vC{7Q zo316hDL2s9v@E*DUtaysI{jHhcz23d-5|BcZTz`qS5vMPs#lIh_{ld{YYEswBZ&!V%%Se zDdCE-+x*6I{{?(?#6kbD5>t$^Pp3>lg%NQo&JvLn`arbmVfX&%UQ36n!|XyV)nU#a z-UNDQRo|Lj1G^>ODs@Dey4Zy5cj-wOZhOD7Bmtd?dtZfE3KHI`*ETnsgM%$U)Bxyr$_LE^5{*k94 zoYnZo{^xYd@r2xP48g7H_+8VmxyaS}-G~*}W~)ZM0}G;Dj7jKnA9Q^MyvDrQ@tcn< zR!*@Qs=6kiTr)rYCD8pkVTXEu=D2wgIO{GXaF%SNY%SUS91gffQF|TD4=ba56%Par zQ!TJ5q~%8ql1rH^*g?+ac#>dxthkfU0~Rn;DqGJ_hVIXU9o z=I4d!16q#C{aT03l4^n2tJK%#=kRZL$O88Vi{(7ZfC?=y4R9Vm@3u*D!jR$Qhw?UT z$3tdCsDqhXD8YW#Xc%A%X~3)_zb2LC>R|N_^gZO7%xs;VI;puc^frYFNtRnl{l6y@ z(xbG30b`FX>sQB|9 zrt&>cT=2CK_kwlfMzGF@xko0cejekOQoBb%9e!cVV-x7bB!S#<4Y!QTgoIpAsW6R< ztbA>HCVFLtOWQWJUh@b0=jsch^OI9jMFuETx~g8iKcoj$_W16nozMBJJK>MSM|&x) zhhF%0ZZJt^Y{ z^5wU}Oig$KSlLTOPPJ3O6dC^IH4SXX`p z9MubcJs74(6Ahkk;aSR|M*B!u*Xwm6@11P8`{QS?Y36JB6h^eJk`oJZRuBp{d0VOd zCLQ(F{+i&`WtMiHm%}XmMVHq(*`_RJ3+Q&dL zRbpfAIUKM#67;GZzV7?Anr^&2mqT=Oc&CUzt|1FdH<^AGX@owC{--f9q8y4(K2{X+H{0Vp2muni2-e2a5cLtREiq-P%II zH+9w@a?ggz{8(+OcJcJA-valWRjGoWEe5<)1-8oO%NJx)Yi4X@`1yB=c^Kcs0;WCk zm~>d=Gnuwk=zvJZ>&?l{VSAu{)oj?h@b%AmQNfkc>=t@r`F*@N{porhspB)y)Hdf1nN?I=oHKAL-NsGl=Vo6* zJ3eW$Snxy=DCyjM(&8^)v6;14ZckUSUw-&HXDM9NGxdV6(s=tLv{d_At6i{}Hfz`G zZ1a~5`)xgQ>)jt~mtEZ1$a==hWcfLz_;79H4>4E&)$VDs4}|v;3{EO{>so zb`us}Nh15~452$Eo7@r!Cd^R?UhAFQqGJ_8+Mof?rRxmrdeHpW&9d2Lv8kVT`K}C; zC*pl%QBtn^4u*rw6E7A&z7J#Gsu48uX~yLxu#sKv*CZHO0x4wAuL3&Cld!d$K}mbX z$%l=c&4F0h3@>B2Uv>peGirAJ27IOI>mc*&y50>;s%xbQPd!&u=nXs=%2A8DA2th- z>BbJzF4Kn}aI^3&U_4eMS5ub+64TCOkS!yR@mhOB5LC!M=cOLm=UkJ+(I z`SA{xFxkA6QYdq5r=Jl5Pn&TnZ=MvL^{reeQ%mDHRT5XuY)?0=P$^j0^LIfLGII+U z_qr$M#u0Bm1numXUS}50s|Fv*$r81+oSt;O9JCun#O|awRIHqjcLo!c%ftyDE1^9i zcm3^{7tLchxR-ooBtf`?zD6U66bKV&p&?fMTHSvz1UXw3Un*^S>IgQi;=_QUD6^+ z`ZFW1d%eTwx`fSHeSu`4+0Ev7Q1RO=_YeMjH+m^(cY^g6KOQD4U+j$AR!%tdpg4vPpT9Oct*GNZujOWDv&=wV z-g&^GRvO+Qo$Y54Ta^l-3X!H$aM1__PAP*yn0kG}IkV4YestU!n#YfL5!1#d{V|tO z@by*Cu0Y!Tr7a%bCf9jM8u)fOX=dwYZUCjl(t7^Zt$rEH^;)rEAH=R61+5v{Lf0r& z&`~AQbrIh&4ciHx-+4u?8r2KQk$M31;N7J3TpKc(C=+E(C}0)anP@U|sY)W3dqesz zGGz0TkB)1DBiJ@=Rq*5pQ~Din7deD-@y4iO{i}?pt6|jm+I?zQv?r#Tdh&^3SyM?x z$gHT_{xaY&ZK0g!d87Cro7elyrk`TMb!%;}-Gtl<;bK-+l{w^3jbVjIF;1;BuuK2V=nlGg8Up@|Bm{i2`z3X z+Ruy(gqI8p2=U{wsjM~J86xUhU^k-UNUA?07A5-pm$@L}1!w>T52f}c_7CwYeX6n65DMpu;qyyZM5M5(%;s!}X9vr{G%S1(fA;4* zifzBGs9cn8I5RX|>$)rjvY5jhNG%eoQZ1tx!8tMeLGp%v&Qg#7ErH7AK&t`f_6bCf zhu(w)r=`MrT5gO|nj16$9d| zp;wQ2Zqtu4JF1seeoZ3N4q*7HIi9Y-yYc{o`bxb>RSgd(#8Z`|x)dh587Il!T? zG9%r?eJRo9Ufbf0=J{Kc1y3b&9G)5v_xZqzYzM;dJ4-OySyYLjr^*DgdmS&V({|WS zVtv6>QmVO2U;O#08gx?pS$L_8h>?e-=QTx8`!ocjt|`4;0#6^WflX3Ua1--=X&<&R z`%Sh2o^VKfhN6jA_eV&&aSK5TsZB+Q+6z*EaJFI}GpA1vqw2)Tk^Fgv!mG)_3hXj(_Zf48YW4cNSkY+RJx;@3K-`=a0Sh~b^t{0JSXqd@N zF3E*?sF3LytGcp-TUjy~q9a1t5OQ%inZ+TQE_8meG!3ohLmeVuo7x@N00NF;R^&pe zh%S}bF0ay}%g5%#g;v8egKe=xB1^Q;eI)0oj?zVI%TBy|z$S}BeBUD1MOXv>HTGW`h8ak@kBsV3Mw5ik^5*#lF{>m5QvplRkhK`UXbIU z8-$8D5QFh;9;s_pgH0YO0?|5lUm$r5C26rVn?BfJxmyQ>U(N&UQH+E|K7B`j7|yOEzXS9IXO0@X?!aksYz zMyEhdii>;JY;(Bynj5Lo+E#kk_S050mDR+!XB~pAv@MoWtX#KdCZuU9#Hx`#?x~f) zD4l(V4ac|BpZ+tPZ$6?;K=lp|ewl?D$mgn3hx}u|EiNV2DsM|VOn==Kx}PRBPHCtKM&(dzt#1%NCWR02YCK6LY8Z;j8fbdik3Wqkqi8BHf+ z-T8C`)ShQx4Lv@6|3dA@XnW+|L+0n0FE7UzL6V=N#KjZsJGx+1OPe01w5cE^2<*3- z_Z9frY4MiIaU-PaHwiG{XfU8y0cQ+}G+YPA-3qCTWzDz_aB5Yv@Pz^NQ<(|?QI(Zn zHEW)Ny!VVsUw@yni@!nl>7I?^s9f z_+WO4QgSpdGRni;c>035IIr7Br08AXhD|1qO#4gyuJGRr=OY{MeS9*fH(5VXmPR)a zq)=K0>_<9URHF&#tV2OdcCMyve-CYQAueDP04eir@o;b!B;}7GUh=TH>eOO{3%Hl5~xPs|IWE z;Wmq6Sb*Y{iR~Z)UYrmRgWe^}0MVVhLtEhHKuNaK6JJ>#NWL0>2?+O4-EDAYO87)q z&}r^rm$_1d&&mBRpUzZdjyZ4S#6>tqx^;vS1V0OP_!S-x0)@EioR#>ia<&6~hhd*bTI7kBZ$umMB7P7n3? zU-Ku^*()U0K!2HCWEw?L2BiIfDGbg8YJ_dcHjb5*1DpII7)Vu@*w1pr64o}os^gUy z;XeomJ}^)`Q$~hz5G%~$x)7cjboS`GDQotsJZa{j2ZwrDXnS=SRB$2@{&aPk5E!#s zwSrI?-BED$9kBSJuaDnzu2S*X7mZ+DqZ41C-e;zpZK@9L=6 z#oHjE(q@~58Or?-t?d~%=Wz=lpX{e;SU~ALh=5why3T=ZZBmN^Nw@89HdI@?Q_U|w z7f5~Xy>kIkxsS?6;|xjf;w)2;xwotK8uGVKf@quZzcM8M8O3A2^G2hu4nN1xraxwE zH+!2B%p)uh%;@aI)wtVgW_acu@UEJrcl1>{HH>^2DNJPh;YQP+6s0d{3iDCjsApcd zwV_GbVV{8*5u>l$=H`fbX*j6a5VpQ9@HHy&XK6@b?A(C#ZM~~#aO7bkFb*QJwP;|~ zB90QH&u&}sO{2r45W1UI3wp3{7Q*H%c4?gjJJeAdLnM9BnZY!$FzxQ#hZtB@Xfxlw zj87fT(6%k~_R-{_qj5v&xC51Rvs}Q>@cqI4O^sz{!PALEpAJ{hFZU88vcC*IG4nG9 zg?Y$jN$KwHK*T~e@*2k;&uEO{&W*YO^qlu*xwECSEC7!~bR~@kI6-1fWJ9YJw&?9; zLV8VpRO7+#6z)82X=*SHCjg3K_H)*&Nf(n?Jb03lQlW)^l%Jz~1=`x*KW0Tz>lD1q z4UY2Ky@YxTYHsjy?8A_Mna$7~Gqx3Hp~ZS0>6B%CZD>&gYY$QInZ=VJMt-Uwk@nFr{6mj)C}7AOG@;hR-;rd@%ieFi>gkOsm6v<`BVc~ z4O6F7a=&y(&=p9|bps)V-@dOokf+Q2_@(=3{r8|bVRF@!`KPFQyS5f&%Ym_nmAFNP zWx+`@mjYdXty)mVr(z8zKV{GFS1y>fkmw*JTE&~*w#JI|3~H6q=E4uSi1P#JOs>V< z98vL#u;GYZ9v-SNXFPt^ZQC+LHPwqYx9WW6(x9jW)ywYyid^{K$tXeWURB#n><%TM z6Efv{NW9~ChTP48*c`0MKBZ9$sZO{j1L}(#pyub_gNa-;(y`J!nMn5+O^g|U_490-!G#9nsl_T+>RAuM?j;yzf7uC*kU)`d zMpcAPH#_()!XXMI>rOn7utB8deoYE4kh?y>Hu|;O)NFz*$x<=!oTl~xKdtm_Ob@ju z0-wKaPYq1sLC>w3(G8I0WlQvzLw%dG{2u-MK*n7r@=#w6G%t34={dkJy>IHNBsBlR zZeE?NCz+wAmxznUdFuc~!@eg$`f((qihVqTlGl-Z&7@!AVBbiipMw$f&r_#>wt%?K z&f&HC`Zu45726!0gw9U@E$I3KnZz)bgK*J1rrL4^=IrG|pl+db_-oLxH(1V3s*E3h zZnZ#u^o5Q;kB5GbW*e2(cv;Ye3U<~7ZL<{XbQMo zP~^K{I(bR9rDLk19F2RI=ulvk?&as6fT8nl&%`70m zqT3$vX=zSl{*V-G(xy_afeO5wZ0$Q9$B$@1R^8m8V#!*erun)u3vsb?(_794_U-;zJ667^A0SB z%w1vd&d{Cy1tBUzr`UF9eeVirMR2q}>AjOiO>1+2@5Ucqoe|%gZW$h5%f!r2+oB|& zb6zi-ZaD1Qj1aKVq~-9u$O*{VCIsO&6;P} zr<1eety#BeyF-kiW)8o%m_+()H6UOt#wxy&id;8HrsIeMPY(;|;GdO}Ms6~;QE!$< ziPq%8C`J=?_QfcdF@9okvGaKcMDNLH&3I^u1R+yq2Vhz9O}K6FMq0tslRrRfqD zX~yV^E`y@ZPiAg%W%k)of{ylX59$DC25}=mjBAPQ?c>$VR8zyOfngNfj1y`+d{A}g z;vp&+obdCW=V*diX_d6jB8@t^$yIG4DxyKuv;|wV@zp;Dgs=VD!-g(P$tMCH@chiw zA099|ZGF$fYi(~v>@?!?2A4l2kgMGV!!p4_i8(kQuUYrebE&$@A68K@uLBACQdoxq zuE*xYulX)`se*?QLa--JpUyzu8NWGf9$UOj1vW65`k*`5f}QVJr0sjiAzaqx*Kq;4 zh!zp4RdwwEiW0(Q2)~}+-MulhYd8wJJ>KUq$rUgw<~2Tv=sUv1k*gs8K&f)R8J*w) zuVV-uLY>m)M=!I_s`AWy)nx;+e)0CP;1!<5d9g)5i)8}0#O(=6wj&(B^|;)hPV%6% zKe+21D7eA#tn5H_iKG+@!be=EuctTUT7AT1(QBh)+9O1GL-}i?GENRXNf{PI`22JF zkl~%+ULs6H5cbx|DXml6KI54*85(?89o#Zs{Bu{R-9+DhbY?}t(-(}RAbq-Y;~N4~ zqE5dlRt|R78cMFcD1vpuhUm_1(o>_4eYA6S41tj#s{-K5nPE{BQF;Agvag@alK1{4 zJ7+O4F(f!q{&l^>y%}seFyO!!m)=8ux-39l!LN@Gjj5~3hbfqWQn(Xf^0IYXxxk-C z`$XMb5y>7yGn4BLbGJGw$S(~ht@Xh4`Rv7x#!4t_q7Nqz>)K6 zs9R~QE-Qjpgi*!g9=*ZgWZz&4of=e ziT|1mNVDnHSt1M(XmD_`v!7ziw$6U6k22VE%T^KeGV?!fD-hrAU0y+`i}TWTUR028 zxw8bfxAXDb^L#FM+{(ofU86Z10s@#m7)HSKmH~5SN46|XjEzPouNL>jK=)JTw3#3r!g)NWdTBWZf z3x^NM@Rd;sso zK}YlyGx)&(S~5}46Fms13;me{6T`(D(KOKp@uZZP*o9+(Y{r6FKOjslWh^-mT|CSO zS_E$ec<6LeP>V%dwotk3ZjOkYkebW}6Ecemhf~J-_jm<9Ewg}_Epzd+;=D^m)Eq#n zIoh{M*VprjuX*??N!;Dym2cg#a#q;_3OGH?%{EqQ>l=Ja*0ZPi;S$JHB9J=)NrU)e z1$yV|ogD=6UghCR13QzN+OxXGnzIM2;mk1-`%GA0=f#ICu9PeJvINe!Rg}} zPK5$C4y!b`ap+T1m}|O&_?KhA{gE_s!{Np|4!qWWKX61v?5Og4e})PVB~>**%)O4k z?A%mgYI5FuXl;9!8eRMObB#j7^@^ZbfBu%;IG4>e(gnNkRqBC%7n?Uq=uS>u*rwT+ zSGE0-J@FXPMU}+2Di%ijfmambz@$cQ(~2haLGY?4b@e`bVN;gDpHYAj_Zyq=mIqIB zU}Ven&WPKWo=WnWW(JeU-faRR#$j47fFew+z`X=sNuVpfx0lRK(dv{ z@7}3`nMNM(@=yZmcH0xtBR_G_dM9Cb$6L&4sk1KFtzT@^nx)N?Yz}p#mfp&l^ct!|j<>c5Y=P*A6TbN^6q>iqL2$Gllg$43C5rP|n_ap_ zqF(Q)176u>&{0ngWYZg>xXX&2-gPPgdL0}}%<64+5^OES{TxEs-Q_!E$qxot_~xkbFT&WMSNOSv-PHK~K9HBEg`vYL zHOlaZjy6AAvB0gQqe9w1lRsn0wdn~+kD=u7$4q27Mz>Y+etMItHMxy*AvOKG_?Gi( zgY#xBGNu;syrvSbV;t5jd0>TlN}66G`}|mgc)7Qph(Q~rx+#)hxJDmgJO*yaEt^YW zcM4FN@x-~zg)s2{86Di`dPj5CN!hYd$SuQ6yx(QpTfAF>kvJ$)2ywXOmIhfL5NTac zD%Vi@J*D>;sNbabHZ;9TMIJZKT>o+W#9Y*hMXx)GSM&kQf00SO(@oZ?X}`y?a(q&8 z_td)m`Oez_l6#hP^2rX5rH3nnB^D~g1bwBOhcI4lse-QAr_R$6?Xxni%V9Ybj9rg$ z2YUsvS4gp@yT7AEz)g}C8@(p&qk2;gjeKQuezs=I*C)4Uy?lY@Lb9Duk*UXB2olyg zz?)kB%!IUW0(f*{_Dd4Uoe8q6bK^jFK}_)^>h}Yuo`jO|#0tV@9J2)Pxemi|DhD4E z@y!HsXoUg;(#84cyUBTvv*vyGF89n8b`h59i`N`%4fo0(y2p?NlEFS@j9`|i^hNMH z6&&AfE7^%%k=~6E-0b|maqn^3jwaQe!8c^GH9EedxxhK#t zlpg~3paGj!yH0a4~ zk?4-p@%>GHW!(xHk%bRmENhMG_Vo_tqzmJ88nFBHbA&nIS}oUq0NvaKhGWdyuPOq-r(|h zBaWd~gZeo7gnfs46J*_6TW$#l#$iDd{&D7g!p$_*2o!Okp<>`}Pxi z@%gnAoajd4ytv@ES$E~;b~blkj@6`cr}6Hv@gROrKf`7jbUsz??eql6a|)55fwEM{ z%)zt)T7tv!h08iz-t6(tSdG(1e`}y`D5tUz5;b9nR0Vo0tp`S=tex*3+l3z2gWI^q zY`RrR_axTD>V3smlTg@Y!()lnB;`hnBTrzqchUT$Cn~S&hke*q)PRi!iW)b=O&pbh1+xzxw-Z& z`DvZ%B=*3xw3t{en>;(4u`2o(Hb-Gm++mc0T026ISKDJ8Z_sEpO=ho52%R!pj}kBQ zgkY$)f=L&ytA_V7ukZyI#*=VImuznU{d~E1A9rbgm*cVUZ?8+bvphAVJN#Sp*6bZS zhQ<3Sg;ll+Up7w3^U*IxYNlU*w;HJSTuIfOvBe^sO4w3z8^$z54cTokQ)tv_P)YDT_1yJ3V5O!Yq)+Pm9#%f<777&~Q1RA7blj zFPN6|^j%M9qW|pQI6AxDvovxvIhrSF*2{c-w!4@wJRZ!2{EB`ucko0?cP>5sE^HXP z5`*rs;Y!VW&4RmfoBERHsNSVw!+OaCa!a|fA<4`BRgYP{!F7;q{I|_8=huF1uM-Vg z3_fD*kI@vCCz>b#Pc}iJyrPodG{k!vI`-6S{n2CMFUIkKZFplMrrOK7hL_8l-9N}+!3n^(>A^Z;Jl;I_uH@txtVtczm=i|4)Z`IMwqRK;t5gTY&{wRi#HHnG zm~034tD+c88&i3ID#-q}esJb25OR*+T!xt(zawumm%whO8a{ow4N%e~H^a&V|IeH> zKlq1SZh^iY_sv0vQFpx#Glj1|?U)OVP=`t`g+3v+szX~x5@ZG-s-1d$%&L+5WR2F zAjs*-vm&)O+XDeWgU)q>Qn>QVdfJV-GT36@>ABs1SV$*-!5n;eaN3U=CK)vGnF=j; zmn`?WO+Ae}n^}&FM>0w}s3Ce|8L86rnEX#oE#$OzOg*(NY6{If0J#+%B@hiGiNMh% zb%)KBZ_GONsl{gQGVh>*-4vO6)T!}nX=yJtxIJYc%$UgFmKP=f!ja6R$ONx@_0t7w zy}$CYTTcF@W)$^X^%o|EB`2bPWb023*M`gArg3~Dmq8%}L3XaQI>i7j=Z`|#^Hn!S z6Tn`5A@^F}?(>HcF0=B5=rLe^4faXv^O-V9@QAUr@tlhc4jOg#axu25mXso_>c6a- z%%cX}MRrq9Iek2!b_R7UvrP-xUDnn>gWDf>A8Y`Wd+DK|pVlSd9oy%rdm-z74hn%ncbnbt89+1i;O^)Q5B zO+(k0son$&z&${9D2Mt|xl{$X{KYT99@RAQK=0-EXZ!0EN8KtIe) zCQBl`lK_qwFK8gdS$X)$fk+#(ma5tApSUg!+C+Jg?t2)H1v*0JR z`J_YjJ&8;9-Ty)xSu3L+uwB3Nsy?Htp!w`K(iKwO9`?LJgEc#C3X_ zT<6U2jX*578q;)8)uoh!(Wd5)?0tUUQzs78UyAi0J< zW?90qI@peKh#;6SYr!2XSUY}noI|zMdfMKp&XAK;8uavU6N3bTzN7F2*6iifk6Na8 zC-ntyo%{A*j{pkGsdu-%pQc@{1hQv9`m|nt{-&cn&=<#U;PgZ(vI8jmRho2WP~+t1 zZ#z25Ig(Yw>Occ9GXR3%26a?;0|!ecJS8~oUA0(tIu^OHCqu0o>G4dC37j#yiehwB zmt6ZjHUxOyq+YSvz*9!|q!rE*z`YzyE*J~>MywnsO7D6S^`bf&^rGt25* zWc|Uj{*q>Woa2&_X&W(SO#@A$?NQz0y_Jx?SWDhTT;V96yAfu8uY^1lTQ3y%rSRyD zf>3Evc7JJQQub~o!}kJgPrJU69Ba9ndnyY3ha>3whyQbXSttXV5w0%r&&Wvsa?O_x zCl)*vR#N@nLN@)6pZVip(j!DTW1s(bckwsk`>>XvP>@O5g30$dw)?FY^hlvtn|+io zK6-!ez6Jo5W~ce@^okbsztU@2;C9VizS#~gMeUi4#E9x9myJM`|Jow?M*?V`F3J|? zzFqkm4z;rU`g*t~|H-F?6j~bMo3N;(?!MY4g>QfNjb8k3ALhLTpf)Z&yRYNEeX^`a zZ66I{6zQZL1C@Vm=cDQE<9zOf;M;Y7`EM7zPw(H>XJ==4>UTE{U-t0)zybNXw4N^M zrfN$u7~W2ZAj;g0+*|Ug8A{bPqJFMbFgO^^_!c-J4ll-6V`pU~4j6KxJoUfbVn6uL zu#P~jR;OmI)+$6dbSp_^!GbpSOsB(>Ka}e=fSg=@YMBOqZI6_Trx%Tpy?WE|GX{ zT<}wZra1wmbd)gsH@E1!C;N~1eg_bq6VLqfcmHzl_FO>h%Av6CZ>0V&fBAOLUekJU z?*H3AW3K^k%qh+AZ~n_azZC%CP5Iw=%x@l1SrM=;xrcWDze9L)0*V;BR4wq&FZ3@j z^EU|Z@66Nx`$hjp!2jFEo!WBW=)^aR|9^qe{yzf#uiLxxKmN}S{zmKmk1G0qxr2Rw z{8wAyzo78B59 z`%wOLKI`ufasEdamN&;TXlI3D9db^rJ^Pf1g=&%Uo(#?0l+NFOz5uZE=FMdYEdMJw zf9&;}x26dhNeizx@45@Ft+8?*nTg{&vjimczayCf*7+Zu?|+o8|H1iu2-ZL@z03{qzj+XAfOilth7l4!4QKWbdvzqa zeQzjRqxCq=Wztp%=ok1-mM!z~^1q6j=6X_sKG2Zx;G<=S#rNy)zkNbTz5vS{ z{kP-apW?Fe5f7JzYVz0ESEm8Wfj0oMbkEd~SdNje|KRzZ+F$Lj&*!_GEVuGn(TDFH zJYb~##GT$BxBUA3lYW1&zH~aPDH0Ysqsfj_p8gvu|HuL$wk9H8(Cd`j`wwe<-`bxu z`Dxb{XRII7*5A&hk)f08;AaHg{`EQ+&c0g5#7l;13X_N|GLgv6v{q-tn;MvOPTLi)Bl7?YitOXQTeZO=e(XY@`v0%S+<0O^znl2>y@jjB&IN|Lrte1F;dyl3r)vP}=-tSjp^ zzyI;0dLRNuPFW5&{HwHtD8gm_kb20X!{2^vbYj*9=a+@Ip%nv7eF2T40x z5h5L6e}DJ?y2TazWyFg%9<{%&!%rCe`R9*!{wB8bP0!v}gmv2#{{1%JelpE_e%sew z{MDYEf}7+vttI-dWlp=wqyi=M)nP$EM^Ob-xNUvUXRiv3BOS?f-sbwr--vr8c6fGq zXI#J8?P?_GyPY`!1WOZlCI4n)Pd(sE+8^hMhJX8=8c;wprWm3pjS^`Cj8j)Vci6n;B$2mf-VpjCk*K`*jNRBK6~eCMY6!= zL^98*rv@4VjZUb*;4+uw|M9^3e{73C@%-I_hHDmC?M(5tNP4NvXPyUfv0Ry%Z&`(M z7mVMhfP(MF)3|wG`r-6jyrjFlhhAEi0I*X}fAQTS|K7d<06bqmZ&C0lN;Tj z(^aF?7dXDwd5B*Ajw9|DfC&-c_Y(J3_$xPi%I<%=Kt^}oS*nX={V&qqJD{m_+Z$Fy zR1_VhN*|TpRGM@U>C$@_PfWK;KlA3-WBX`!6-cHHAu5Z<*Mt6&(4T`5QUM^{=W4^}pLz1v z_hs6Is7km?ys`@Pmi*F@S|LK zTjJjw9rM>Q{7n4cZutCKj$MBMp8vX)&u{;qwjX-zthIhI>5U>S(mQqIktxV?>_4GO+aUCUyRB9yXXI!UH_1Ury(G*36N|0f9*s3 ztgS%ZU6FL(c8>G%Kldm4)jfZ5FW2bNv|BJ|ci^s*IBK;`tMKxbY$2F%jI(iSVXCX< zXWyf{6y`^tg}gp5bLmHlqb#>edBzffhVTK_*E_(}|^y7LzaiTsN(2dFYWFN?X=IT2qlEy8aoDl6+ zJN>b8h2DxYN=f^MGJ-j?M|WeK5rPyJdo3_vWlQ!-b>KMKEoVUJl`8fCS2J2EAF7&y zFGH(c)Ckf9CW}i9e(aSVn#So4HO~2KLhxM8RVPdHnG8TW<=*%8zm~spO)~8^3Wbie zLm-L7PYDG_p#mu@lPEOR`R!71c0T`kF^B$2>Er;KHL0_Ku!}#be`vvv0-t^??!?-; zpSa|CA9#_p!^TBK>;lV?q32A{LWjyW(>@F~hYv~Izw9?*CdT%&V|%Ad0`gpOPP_CU z6%FWxkB|(9syi%22zF2e5OkkR4a6_-sU+<2==5%e>E`HgVHrv;Y8)g9RqXW$`R&=6 z;>xWTNV)IWsOIGPE?izx{#kZRz`<_If2jR@wBvD7-gT3p1c<$`wU;2?+FMuL=h zw*+p-Vd{)!pO=Z4{}X_LZ$;c1#3&|_H zpv=|o$s_sk;B}exVv}E3Qkew#Hs%$_TG~(W@WeZ18$b35qcu_yemaG&rj}?)_(VNk zyWr?|0T;*7Um=Q*hky%NuPOV-{NI1;rt8Z;wn-W0bfbfe3k_23RE@uXsmaq@PCuQ# zOV<>l!kBmWu(fbIXQ<}s{_D72whUeD!S#D1q?hmLULJ$zg^-+qT{tcnR#~9gF*C`2 zlD|s^p{;2_*x#iJu$exsEH$H&**F*KVaIWTJBM44PCY7j`fbVL;q2x`)H?L$@nyEVE;+EC@jKq(=88mMZ5swVxIhar6(eq1PMbl%~c z9gE6mU!wJ5bvnNu&3eNA}eZy>fJ9kR|irdO)2y#%DZGWhv&AjPhSd(;)0Uros?Eg3g|9$k; zE2!Px?^y;3$fLHzs;tX5@fL9ru97z?%y47zg`aZ>_Ic&LXr(V*5&~+6)4+7djs~{3 z2YTwme7>)wp?eeO1dcLM8pM9!SHI~}inWV0QuJ}DrZ9RH&>y24*HDo7O$ZJQ` zX)=Y`^Lgh7JJ5&spxqud9X8wB#@h`7Ia3Khi?lNVq7WFPh$gn??Oh2`r*FWWb68z= zdwA3LsG`8FGveL#$HbfHH#CydC2wz66hEoy-!ci-N}+wrd@uw1){a#VGYsZ(Ih?%IOVjrL~Bso^0Dr?z$$F~=w03@u%fGY9H>-5oxuU^NfVWoWHVeMeT>W1k`= zat>MI78I`a9<&~qkNe5(YUH_WCAtt?dNPMAe4hb&VS|C82~bTl;OW zZ&4h1YqSSW<31y3rS<+t<@Gx8jSZR z1z{ zDc+sZd$9CU3ef)L{TQL6@;lMM6r2;dd&xQ!!OvX({$$DKM>geu(9VFMMc3|iEuX#< z*s)zZe*7GlF5A#)34RB9i<}x@Q2w&?#=ho*a36*R7JO@gFE)!FZm1M7_)QP>o? zc3mEb13+9_z^0hgeoT%EU1i;YMv8Bj zlaifJ!A|Gviy51t{hrqm^9}n8?n#lO77#N2Lp^!~!SIe;$3g!_m(2PVT}{R!BN|B@ zb^y19P96A;o8k68iK}D{fzHV?IIo@ccs%}yB=M(7m=Xfqnyuz_2DHhse|#55ax>LU zKnmK?+^CcC;JGSPV&gomCWTHu{A(vZ^=n+!004ZG!UBJ&thncw;=9e`TiSWC zvFMe?)+-Q*es`~xe%v$14r_$Zq!p>UVz!?Qo3C$TSsRY<&a!FTcq#OBBcx@ofsN1a zKt`w-Yg~I5ho2F8I+o{K{kpi3m*T;kPRy4W>*uN(71wLGDJ2M^{CWc)?1Ck)$u9_t zD7m1{vM8f^4p8xmWLV1@;y%o~`?V6K>L%t+0#QL?RR6{&+u(c74uWzy;`8i{~uge2_j+J>{uC~Xsf_Ic8!!G!*FPYA$ zFWwaoQ`L0}q)1@9qMYEbMlZfty`d&dVMg5RcO71180cH01wtK{!{gcRSvPoAYhMX% z_9x3={|O?hC)l*35skY_>ZXT`FdF zc}^z82Gdnshu?VR2_%2tXhprSmWQSeyVV!z$EzVk+5%r_ zWG+PNvZfTz4H_u;OY%Z?Ttv(Y_vh8pqM|h;%(9zn>LWFPDK6t4ggUIVU?_V|Xs2{4 zjpSZq0em`FCP_S<9Hlroa9cbIrl6`FF|%yb}_P%mWR}RBZWriI}7S!%|md2;{2Ww~AgNSd9Ky z@N`p)GY^Z3gCD_Yl71uxIGL6L?d~pbTGXN3wV=bBu%{~;_Rt-DVqqgtEnNm$8Fwd} zaVHv+wpx!?Y_$tqrz#pZr>--?<&T%NLdV#C6U@;ISQL&a01j8yHRYf?tjy#1lSZD$%l|6zt8DT;%Y zc%ceh#l(>Ib>k2lyF;ZoUV71G0FBznJCtzjFyV%V>d_koz8N-(cFN1f?#d1yG?y22 zkw(U_LI*cI+iM|VO$T|q$_ZFf8?Z?f?AStvA8Pthh1AXc;u7Pn=`bv3cHT|4SaN}wK z0EHw47%)!Zs_SOkB6_ar4SbByn?$^yrUN1n*N%XLJ<2$0%yxl^5b3bI8j_IMFh_8h zrql!iWRS(Je8xJpSl!Lqj%$vEY*C^Ty;g-Z9C^-FwtqHW~G z3E;5hr*Y|%-H?j7W|CtA!F(lYJSSPbj8Uqdic=ghA03)4}JiSlH6?5hTEn**yd; zl2|B@8ebCr#4q+fSF140l$=l3lFvJJ|1Sr%!x$0f2*g!}}BfFFhXLjf%{nV384bIov=E06`(w=Ne-G&H7uK zBb@J#4A^n&`>+UMZE2+G-VU!(e_6NvU6rol0j&>~%t{hbTsKXz>hhu)sntiA&kkFn zmXMK}>=UJbf`s799*1w+g1$d2>zhGD$=!QA zmg!P*A>e+8gWz}#bN03C+u4uLTpdR3ym-eE2)P!QPVmX_ndS!JDVobP>piX|FAMC9 z@d-D(jphA*;a{Zvg~$rhcbPgg3=+Q|A04aK5ebeefl0!4@>+xnN5S{)`27(5<=_9x z0oG@-e54W11bwVS0DG8U&9>#;`bPnMd(9M5cPt~-`I;PV+ecG|eKJjBd=aL!Ew1hO z#ZfMxYAtMCbbUR+m&dArQcn?=o*A2t3Vg2WxGgS6XqBFYaJO~&OS}vG94d!LUD+91 zV0XUz%G7yAJtV%X%P;_eV}!jL75^e5fD!f=LUkY?ng?}FW)i>MxcXNKo+bsK&o)Heu}5#` zOP9cNCutwsHXu1HRc|oD&EA-}X0eti^VL|rp|3KyjfJ+7$&EHNM3(0j@z%~d$t_kI zp8fnfZNfLUW2q`yL(9R`Kb7^oup}!g0WY~hf!n63@oGAJ3e`g)rMAoBrZgqmZ3RDJ zI%}>}R9dVKi&(NXg7@bWgHtlEc?oFA_bvCR< zJZH4J)R+0d+CYrHl+_2X%>3`u~;v|9l0fEz8S)TsdY6_STEj zUFbHYBW4;&RNp+I4}D!E(|y#>pInCXpWpd%sfIUbj@GqsZoll1|B51`r{-P+<5rhnk=kFx=ORNht_o^ zp|V}rbvbx-`b(UybMr$2oB)sP8&n-9ERa8*(NuO%_3rg6FzjF-x->;ZHF|_|1J53_ z^BnC|I1*IL04bj^%O95|H$IXk82IPZX&l%1&_rrrtW-_A7=_m%zF!C}sD6@@?69Re zFC03%`{+x%=GlU)MLTsLOC9j%(t9mY9F~qB2$QV@%w7B0n}oSdwr07T`_3I*0a=AxMrSfjfSYg4;rI zIhvxz1N>6{dTF z$s&p7SX9eyEvCFQx%%O`CZ_{>F8|T@)}{$Nyn^PTMWrS@aK0WL2sfPMb}7Qry|`(B zjq{~kV_;&wG`1|^3eRJ%7MSfey^9it_EvAJnr^o%TnJoI$9pYuai^RTbUxyF{&dMh zTM+B9G}YAqI7dIwr88*Tr0;OurqSaUdVZQTk$fA(zcJuaeA1silZ`u%)YTi1t9^as zfm!Cd=^p@QiR3-*C}hJfTpFLbh1Jyx;J-PXn-_NJO7o|iVNa1@Ima%z99_g&=+h~_ zjMW;I3(5G5%W#62-}dUYBwbs@jPbZ7oV{{btNDubIB)(8ejjyCv*+_xJ)$_SWv4=Oe^?d=F(rsXQ~DN%!qQG6r( zf`?76Taf)XYY2K$8tzG~6m=Kza3HVHt8q6eb89%{?3xpHs#0n{V<=xZ>}u;2%j1Ox zCcww%vy*i%CEtp-@aIVn)*5nd_vrT=JsZuuHx?_-^Rq3VA_1L!mpL%{;?hY0sB?V_ z0juIZ5F`wOp0|AzPyDKC3FcN=quY zUN030$p==IUtkUsk%Alc7%m48-m(fihqDShmawJj$|q5be_dzxY7AJ7HG{>b7q#CS zw7fLXS;~^L&#T)RT--~Vw0CrmO_|9(6=w={3si@wlr&jp1WEQE5-UG7m}-FYjFe6c z<;Ugs^Sp;DG`)&CKQXCK@&`I`770}IQjnBli+vPzT+5z)v=03L#>v zz1?b#45s3(J*p2-Rke#(ZRp}pq3o((>-^Mwt|`ecqlfm0hBs*Y!v4PE#ndP68EdK}>z1Y0-$6BU8h!tE<++$4CO_C5<9eCQ#X~Q{EdCuKFYI|& z!<8(@j2x*8?7{)*7JFSv&=oV(xpZADCP7ojcpEePDa{G@U0Q*?od}crdB`Rz&p7g7 z14Z3cCtNkGK^6;yX@P}hF1`@8Q^)RN zb;y_m9US#KY}*lqPZL0iHI3?;VWN1GuM6PDMJ0~H9<2-4pLJPXW4Sao{gSnD#Yc{A z;Yhu=w1$OZ0ZmNEByyxlLC!i{cHWK>{uskh8lKtWgs)*_9BV1Bat6ch_ba+Wr_^_U%gZ@CX*q?j?U=#Bf=0Yr7{9A1V zjsrhH0rEXV&xtEaZQFjI9`l}3hew_u=HFfR$^OV2GHIhrPgo>tpCyT|#E3>|ovsZ!ImmzV{jYo_zc7(1#))k5^wA80u7dBYk41d^ zc;WH$db0I7VD0;@&qOohi0^sN>uGEe*a*#qi^_Ym(7&!{cQUW05(V6GP=ii9~uJTwP;1IdfCTr*pAZ)D&m%t@&py z4}DHj@Z59^;C(mycNOzKLGa~pb&dD`FdNzEi!WDqpYCi>E;5eITr!jXHv|8t1db6W z8%`oAmvdj%M=A3YDNZkB)TK;%(EK0Tng9GF|6)k3qBx^hz&i@ffoV5=GxPAlr^+X_ z_k_rfU53(J1-wgiL!TM@UkGC_({0@rK37$h!Av?GRd|`anYE2#7wOF^VhTg{oMqEzAFf9j8fugtQ0Oc>f}1`x~L@;$D)Pht3^E2*z zPy?jD8XqMrj48yXMUlLo7)}o8I2c70qT7(Ath1dp^~nl6UH8@24peE~WBUXenr`Q< zkT*lo5)JO8(xpOFRK~XQ6lXYb!>I*0iCML3?qMIa+zAQVvud6aNGDqafwNeXhg({& zp|~vA{Jq~p?PBB($lwm=$ak8OCZ?iA)2hJ1^-q5OCJB)q7q)~9>tB_4*6q&F>CfPo zy$ewwv;3i%DLyaat_XgoAc?>})w&xH(&RSNTD>z*m7$Ata=|7DQ&mz8Qu%W88#d%B zn-u!Hv9HO;`V2R4bPq^}jC8gb@l}!sLYts|MjHYx=Hvu3$B2qp{^s>zcQ6HYCpTuQ zg767(gbDm2jbhfDcwg3g5Y&Q`Y8$gBNUhF+_3?&h2rpy6lcOOox&0Fl{VkDpC7^Q& zTfWZ-Vo%U@A#FQjuOuw2TC1@G!&LHA3pQVA@H`fV@w)8xdBlU#befC{VskEY7%J`afQ*}G(!J-*LCRXBW@;@V`>PFy*Y7Eqyv=sbQS{~caCniAHh)r+ zO5|;p(U=9rOlyEZO&CQ*%?05N7lij}9L0-QV0Fbh&1olw-mwQM-XGE>)oLvJPkE&x z@0ebdNpw3js4D;TDZC?ed}&>xwC{4cOPCaac)R-039`0cH?vb-4w@V`nW$eEiyodF z&9hi%s6h`0pdTz=FG$GXao&_Y*zp>5d311r91XI`5OytCXuK{n`l+xM#6;`%#;!31 zKgZ$Slbw+@f;XgY?ypbV=k_|B7Ek$LJ$iY z4#FPX*X}@*4DWX>1Clyypz*dzbZ25I5uH8|OL`>}qOtfw#j}E~Wv|lBxvMC5^nVR? z??MCu`mBxT{Jua<6FaiOWW^?m{`8Ghyl}{`cQPcVCKX&!VV{=1^RT zXya8pXSKd_ZW+g$FC4}BD|~^RPE89Y~yMlD-QagC-uRb$eK)uKoOI~?NJ+TmtI%R<}zI0M=s5&>A;KQ+vw^bIUgnV3m zh4&>MJ@E}&SpdaYzp+bAt!Nr0Dp_GS8mT;~_Ef86Joiynl;&-+A-#*0&hHNR7eTOK zaper^+<>X*J^)fl(j<7Q3ZlOZcCn{D?LC{2hNRIV3z@FI((w#hVNE@ zmpLNbFoM(43bE~6?z7NV9*SCv?!W;E;xXlfYI52&4%jF`ZR5e2V$_$|RqC9eDR`2w zL6jR@W!R3#Iaaj+sK8Cf=Y_@0~nX3%f*c zdE4b-Vwbz7Wt4m)2-p(O0Kc=b*u9T@p;AfgOH`GCwsW->enwn)5LJ4UA%J6;Z9GZw zf34&1{Pj*&jFRGwDfhR_Bxwceo=L8k`Cpenq28(YW)k;Q(@K!qcXY}VXS7;|$!m}w zr?(~^7YDx?dHr%YOP_(!d67AWcxS&$WNI)(E&89P7PMbwpDkZ~1z7-e72XPyd3 z$2EhyWV92c8;y?vu@|qNuDoF76O6LW;8Efv=GsT-tOX883mAB3T4+tOL*C^yMxv;3 z7Dp+~pk7Yuw4Aa$JE8k3MR849$h#KK*Zu%mxE2{{Tg7dSXkk9=CNv+u)aAaXa_6f- znn!_dP2psQN=JYrENrlkeGrMaYtSPg{>b1l@$Du*WcqE5k}pz`#{>ag(qhaqW0eo41sMZ^Z=r=I2 zUE_7rW?b!sN^G>8+*D&jd{}zvQr1gd&$6Z>#{}Lb=nO}24lFcpF<{WR)s zCCp);vBjh(0Bb_kTe^&<5GVw+3@e2vSQmQ$xJIg|-%7SWK06Pi zXfsY-%@%pOR|!@q?}Qt#(rAyF^57nW{rr`v8K|5o#WxesPGRbayMe`_hxe-~$RWRV zj;;I--UT+-A47*G7gUKxZ^Us&mNa)`4+d>adfdK}HRo35y|7*aQt zs}>g*?VxOS?iXEgnmtWkZ(vy`-%d%j5%_65J+~~hb`Mu6vZLeZjx|^{Ik%4}Xdx^( zHCOh&@T)UP5co=2zEeEH!HsE@;YxJN9(F{>iM)5#JvVo4c(R>qgWaug^N4D+HO_=3 zZ#OB9@KzTZxe^M|IDmR+25xQ)T`WW{HmGO6Oe)_{b)8&q?4)L@gY(^Q(FWhgf2a~u z1P;_F@UWD8zvGCe?O6`ck2_%MbBZ*Y4=y3FCo8BsKi!A`+hjd(tqV&G@${(L?Q+)` zD>8}tEE{5hhrbCZewlGEkWyW#Jc=vQz|C5<76R$moY&ISX@607;ggBx*~z1ZEr~2k z{)M}xvspXOaZke+#tRmacO!v?5Syg;j{BdPk&r*cBNb{;S(2A76xQqhLK!8 z)E}>u_vx@l-yUgt`N~d0u@y5$!(_QOFxKVSzAt0r#m(e8n(nYVd~~~NcXzVRsocrb zgTKTM4!6h{xMS=1smEy+X9==ddUNNdD0V=)fUPxF!L^a65qS&KW2>t+5{7xMhd*~E zt*QakOT{}Y&R!F~>@rfNm_8W|9AUC*RKBC2HsXo$=0CiC$4<4iZOM&zfAM^)cSB9A z@H@10ajWEHa(6B*zqCQXQxOS)YS*=aixp$W<`-$Cc0MO8nxyM^hwgYsWbk0n9wioK z_B#^KR)5vC`KbgKdiqB>?aB7_HP6hAcm#jwl66_zoKbvrc0aS2R?gQexYfmN+!^zF=vy z3eui^MZao^G$E7k@WL7%l_TK1Ini%F4?xVMugaMh0Wd_g%({yF^8{z;sZJf6AwdiH4=tP}(O=58jRN?FHvF z6v=;mX~FR@@=7&oq=m_Jg1=@e4)4rx4Q8J0)!z#acS$V~5_)*pdihR%udvZbyXpP4 ziZP77iNttec*lZeh@DX(2#Ff8P0qKfcTV(@yc8i|+$*dFs>3`b1WyQzhd&u6#YlV; zY57)|{@(raONH46gKeO-@;CCn6fIWipn?km(3Pa^l%DT?Z!3}c)!k_#B5hcxMyb|L z5+sJ9#Y4Udp;YWzc$I8rcxGy+ zZ(GeV^UnxZ{Z_A6Ai4Dmv3O_EdMBvJ6E>ZMXeuU$*bSlJnBk zA5^Ny<473-byUlHS%nLtD?$Ux5zYIqn&}l$Oc$wDI*K5QVUl8M>iS4u^#-JG@?_^; zM|lq!p$pO6K0JeXs!^8>oKgWkWGc%puiTVv2CZ#%l}cDloNfa=T~r3z9Qws7QwBQe zR8SPaKD;IQ#>?Qf#`XMHJKL=7?SfBmckrZdt@HY9%scLuage z;Al@=)t#&DU-)D`zlWrB`Cvn3!Y|PFh`9G3y3)OaTskl=r`sKU#{8d) z*#E56UAzdZ`d)=^&7nQ4vP*v~9Inn9MavLxqU!%;5J^U<)WRNkDVn=(p2>ur>5(&~ zF#A1%(r|iK0&;T&84x?nFx19g2y3P1NYC^Lan1-T!Ul*SNfWGF$mYf}H}8k_Y%$GV z*%R^G#F^ASkr366+yRj1FdP$#aC6*1>}8+p4bG8tsMDe7RUfa_7$2U2q!l@V9+p>m zc4rtKDu+w?x6c?Tl(&024*b?4ov}vXZTB_tlDi4B#@Au%ZxWrK<%3`iLzTV#9eeCs zm$Zp}_N^0}8hn!~52|Vf(p-5960NHr=S_6ziN4L7%4tyN7Tn6OjeF*}kxZG6sww_c z9-vWO(YpFxW+F`7v#p28Q^d`w7m}6d&zNmY^+Y|JGeCO`6TSa=iA5Z~z{@Kgss6G( z!d!rMa+E$BZaKjF8tq`XYB{7-v|YT4jP~@wOuR_FJyaa${BDVOp)$E9%z0M{WvI(d z!=kpTZJA|*bB=zMQ8j26yVJyzmh5uTweoVyYv?kc)Z)O~fW(biymO~*-uOH%Pk|Ty zfE9()(v;{e_ZThTWqg~Em~vztlE=Y0X&yk|adG7KO3{v6`y6|T(w_6{XkkoUE}oK6 z@|`qCoVlx4uWs|D$=rF~b*FWcVrC|pV2gQM4l!w$zhTM$xFu7Jl-}Wahsq7hx(7ai zn{8eSYy=wd`RH(wh3XcL*azLOM>^=WbGn`!)UNANVcxjK`a3Dj9>gf`xPd_Dt`yBq zK@)b;u~GDYkjq!!D{PEk#nYebX+yrgEu31X66SAUxGPc z!F9SdjwS0CdkeyDg`C^PRZckl(A9O)g!sW+nhdcXu0VYrcfZQfw+GbdK%Qm%(>nAP z5pqcj2YEHExjc04b;doLXx7P*_jRW-Mlh!iEsvYyj?3RWHdXMK5(a5dJtbYvVlG5u}|gPigIKMQEdfdmyiqwtGk+qPv`5zt3e!E ze_u?eLK{ftFY$K=mT|l7vM_}dUF&sk97h3_XdItcM-sV9mrG@61EQix!1zADG^?*d zZ}7T6S76F5)sl4w;o&%*#EQ`DjSJi&a;=bkYt7qSdeYpm@{iRIn#jx_9a*QUUN>@X ztr)#2EtfOYZqTx?8?#nb>sz!l#kfSsrNO4Ea^xd3*YH$`S-sidmDe^SGZ0v?MOInw zQm<)_?3f;k*V&`xQi~;Z5)X!pWe=?GDkh2kO1@cRk5=PG%8U;a`OAn*MP=9tYN|tH zjNp-Y%cAwUs@~6&R5dLg)d4ScYb>Xa;()yWPg$u2th}rb;X>cM?rWE45#7tU7~fc^ zLS$a4k}tm7moZviI&`!$SJ^6AV8IOQ{BqqiM_60o3iMtkIv{;58HRL=oR$4M$^O3_ zXbDH3GNsg0yX^+SIO8=G@m6s#JX(R1bUbLQ#TF`b?{+8P;L5gNzSg>xDR z2U95X=RKxee|*T97k{@5`Rb_DGxkm)sxA&UIM&Giqu4kMfAc?X(0D2gpU7dzxf3B$ zs+TX9?ou=_wWnZSf0#l%OCIQsxeRBmW-hrLW58|pwU?5q0}?dD;K$^G z=Ti!ZRUupS3B(me#`orEHO&CiEqy$x#_mhRm+_h3N9m-D0Cn{onH(6?65*?F$Tptv z70(RwM8-+RCiY}gbUKfs@wR+el|btj0V+do=gluxU?ytBsKK(u{DWZ;-5|v~C8LS^ zC20kv>6Z8L&Uc3n!A)HJy74;9TC=oNEYWLTZ${RLG3=Q^2CM66&Lplyo~j0ci7k#j ziI0EzgzH?v=;808FpTe zv#bt>RxdBN&YBn?`qVTPOzBF#-_4s50B48TjrqnMO5Ikyn$TKxSFDiAxU_T)SEJM* zncdM?{z5%dO2rDuroV3X7n&N1X}FT!A;{(j4H#ds=6%cZUKXy<>>I z(5N^cd6TBdQOa`6fT7d?6AP7Nn%C5_?CBlM(3sEAmrO8~PBINKSK|@6Fmu(kIq;D# z6(P{escJ9%hDC6$>c9o=)xx#cGZIdd2+>(|cx#a>sNL}L2w8N#Sy6(iN5U`+)}%W# z@2^kBQ>yEH75&DPr6?y{J!r3DMicDDoy$$05KzN8-zI`%#PLw=N{DsdcynjA=7hqx#Wm`*`L;K5%|ATVBu^T|@2gcA>6~BK4AT{eVjn1dc8X72}bH+86+!{I4WoKJ?y=HO=@xf~gjGXw=>Sy$h%EXPvR?2_%lU5J@q zZYhXYe}a6lrqsi1d8Zmp89=ojr<>^kwtZuzRhHjrBU$jxXlorM`GhwvKXTX$o!)z7 zUti7YqN!Kj%L()$x2empwNYSgkm!835o(J%k`rh6P}T`Y(&{@_|_ z+T9y+=bS?b1Usb`3-J9T{>^l?`faCz7P0-R&y1J;R$lX)y+NpAO6Cm1%k*vUg2(9W z#kSEN(WgE|K`|HSMn|d@z%|tCUtSe-cyrA_7sU&cbk?@ytwR^U;Tb?DqZg}G{ZCk9woW6vpL73J~}hS)P}dskq*Z} zfzpkz<~fXonu+r@*&q*D#No>FRB?;cb+JqR9b{mimMrZ7W9b}t3!b{33**_(G*nmT z0Yq{cgGNY)?MfG2o;(hbmRV-0BVaJm9yu~~*Z=@>J~lU93=_(JRpdSPnNg2{i**OJ z2ERqgJm{*7n^ei@j9?z5GzmdkfSyL3}-;KC@{BImL zR-rBPDAozFo+uwfsgz4I&YK3UJOr?HiZ!B{UG%`@Dm8kU{f2GoZmyUF6mmn7a55F_fd>QM)42->zpUJtRAeCh zfVa10BJV(y8dp=Lep^b_%4Lyjq5^zPl#{HU!O`d_spm;Zukpd4r$yk3DQq&aX#>U; zUlHP#5`O4X*Dy0kF6a@h?p0;Ou@~4rS<|#KT)hMh_F()w$viMMbNR;{rPL&3u$8EV z^BYaBrf~+AZAoQ9X_22yuxyD8W~d~cqD)oi6;22Yd9ztvt?pe7&E8K+H$$w@k3R}q znA}tn?SHhRK4cl#{II+NLC?4>RH34{&RSuAU#nrcXG0aT)8B#mR+pVKI9S&A5e}dt z2)}SpApk0uby8-OOY-6BtULzJZ>$FJPfOOjtwFD<4G9o4eGU_)t00;2#Yv72akR$4 zOJnO>TS5$FLlF=`j>EHF^{+KHFm7lWl@80Hw)I&*F@ps6jTa*VzAFgFD(pqH*eaJ{ zbZ=S!++ttn!$)(w*FoQU1cCFH9}7y@RvI(3smoakM@D= z9fb|pjAM%-6^aXDSUW$fy30OOzgDOpVsMTK;^*H=`@$ag0OcY!HuyEEB18bGYl}fY z)up;fg%f=Ra?{kRrF@vMvS+$XXL>zThcEgxoKYkCzs@Q7GY|-s$_kyTaffE!2_RB< z50Xi@bcPn*y8gw6FfK6qHJXAReh<>OF9uf7SobV+o1dE~bbB<+C=9-0u`PA_3wsk1 z5#Dikd{-@8;`EI9{5n6(Cd^qv^AOkuXb(&gOsKoDCbV0*vXwSbq%m3O^Lm%;wURmt zW{u!ch$8*6e?{z{djWLTK}1AYWh@I?8dI?oF_0e~iu+1zBd`{CE-N0iZkb<}r+C6fNZNn>WvneOtQmpMbjBOustY;p=LpnWcpZ7oTo0X&h?BQC_|>D_=C;Z zpuh8Xw+6=2j!dIw#A^)gN^}BHMQ-V_yn+Jaf$o)26&CK_YPX8t;x(n6zAz^{+uEt? z)d>dY-|n)V^&Yuue?Rmn&uP5N(>z0a-7K7dt;%1nG#NA4kd6O?&!ri9n3|Ui2GLr^ zEOPMR9(eP)Cl@h0%Cjl@V5rc1{el5 z>rO!!#O;((vGF7{_@Pvgp4FL=iHgK)3zbQgpU6y6q$D*BMPD9sI6ymHz%FSG`Yun? z)o+M5bF;o$GZ@R$JVc%ikeR}t7_+%* zBR@80=NRB^2Z(@JV0jGP9^y8Xhs1m%RJ+nJkQgUhXK=nXMP$u4YLreDz77=k0(+eE z+thXhl_0&>XxY-{IrA*?9plW%sR@Ww(|kEs(<*EzHuC4p&Giu-v9_ObC>N>mG*1qAU5WQy6-4&ky z$JuuWG?i_8kEkFh;sg+tW}#P+-cbZ8(wlT?(m{F+h=2@LX+c^50qF_7Lo5^l=_T|i zy+=wCNC?(va20$!BEREZ- z6q0quFoT;Y{+&i=`kdBA348d=Ow$`sY#pG56-G-Z-1i z%s0M@gKx+0S+p)DA*CcYi>HShnkj{<8Y$JIRe*D6#iX~k6iZ;cbvymxW85Dz^jcS8 z$@DbX)@r2t>kn+O7@g8Wjv4?`#FMW90EC>~XQ^nge%JU#CR{Ya)ku8JEVA0ML3Qt6 zV^!yu)*;MX7Fm%Pla1K}&EL_%g^iSB#45n`6s5t=2SOYI>78{uXl}z41DnlzE?>b-g5i4{ z%-#?zJC(;n*itfL(v>H;NDWL_JYPe&xyKqwvs-i?mv|#N<2)zP1^8k0xL@YsdGadsa)53V$bYj#I zQKJD=~>q}#^! zaBMlg_S}i3R40Uefa(utE;oaZ)lrw688@N7@x>vvp0?w0v!N zmnS^-K0UxIVle)0o|aa`cg?hYx53v`RnnHT9Xj+lG?jN_+?ex?RsT>UrAwu+ zZiQTY^>oRL#Vb?d5M_T*!HmCpOHwiAh>VE^BCWZ}lCoVXJav*$S8#&_l9dmrMczGlpZA8oSnm^Ge2x)cG7s8D=xmXWBpp0hFi)SiEy7i2EZ6-ZG^Yh z1%vSbH=XuJ?bpM>vlE1NQoGsQ*YLsu?c&wMwZJ{YCt0!UTpe6a*SCfn${oF;!K73k ziL7G=K9i`jM(|j&aj~2Eba7vj$ID6KB!W1tDFTdy7!_ku1utDZjgdAl7T*Lwfk9^j zkM(`aK5tFp5(y1X#~qc3nrjUYc`J6Yz*I&Gf~b>jhh6)TVkO$*S24kI=P9adz6c|y zSf|2%i*|q%cs`AGNOE9mx<)MP5O98mPu3HAS{G*@I?G$E_a;(vkpTe+wK&ls3=9;7 zQ*VVi%u3eOi}v_2miTF;ts*?M)KS;$nbX58EvUabXw{A?i+HQ=NfwwX*QDVTSc%1p ztpFITXtCxgA7B=<8BPV@vyCTb?tj4F(~u-g6%MM3C*B4lGaWq7;a`5|gv*tj+ZYfC zNV)w9^3CdrA#ufYFY8jJ@fOfu+o@budDnZ5$e0@hsg~aSSz7Iy0ni{2OPCz=_{$!r z-Th*{d~;(}S5!>A_^&Wn_vV-Mm^;;}i3>X(wm%3uR6X8$%dqGq#)Lzra(8n5(huUj1H@k!fNWV*|guI~J z$DQSFN$-8a4?7CCHgDl4tSpYoQrZ3hQ~>@HfNuH7DNFA~%KW21xl4Wl$*OQ#+xZ!K zE2;^=D{EEUZIzAX`Wvsl!XAQ?>b$xRnEDBpi?~&05%Km`#2hOZ3sr{H znCsFw=sflj`70XJ)O-j1;#E@6p4v^hy8z}SK#fML-b0TTAvmD8Xv{sfvoMpSmZmI` z$~uW;3u$a;8%Z+i8@niS_wA#KE>00;RC>)*R!5cy665Wj4X7YL=UVX7v1vax)&neu>Q3q>-ob^ zRh`}4rnyhqjqBB@bmtgEi|~R`w$cYzyIbZLHc_*5Toh`*rDkuUe6Xt87~+(5B8#Mi zvlFvqaL%!MIii*T^~D-53R2Vw6^92fj(jh0b`Bv9QYSMcgfP4x+kXz3x!>)-vW^*K9@B+=`e zq%^yjTtg=fGk_w=o2H@>Dt9deD@;2~Z= z$chm#3IOz^3bBERURdQM+IGeNXiY;pKz5F$TSLZYNbf%m>BA2_%ScT;Y31s@^%%k8 zs-gT75@#$m94ZQ0A3dsApMu$z?H>oKsBHvHl|r>k zi+~6*>4SmOht)vT3lms8FNMkF^zXN8#~0G4u!}&6_>LFhUsA|(w^TY^dPpJB;iKoz zfc+U(%TDSPfmZj2Bea+AaUvhqV0_46ql+y|E91?$nZW(8+_DDl39j-yxW+Mf?C5+6 z)*9kWOW<$<#Yl=TsxLzIS$IWqmL_h-it<#>%s7D$!{$7qT4(?$S7vusVZEMKp-`hj zj-wU*zw5$pBx<(?g8$T7H4mMqH~vjg{^|#~deXsnI@a{N7rv(b<#)^^o0-%jMYOGb zmOPEQz9;+mF-y&X>T~aHHqk=&S0C7dle9OAc(C^&fi__PG$bQ%QPFu{xiuDg_;67s zzkZ~uH^qVFy!mn0_kukD0%i=*N)z)261AfxfB{Hv?hF0X0RPWU(u@F8tVY~%cl&PR zanGa&KSEzwCQ2NYed3@JgA=vlmX_zDw@H$*Vsyn|l&w7>P|xl8oO3^x0~4p;rwe#=>@6(3vDWc>X?EW? zGRqB<>mJe2LZL&RqG##CzB4?)$0UvceanEf@3E)@7f7%pm!cN|kGzCaU-yE3bH14&7;ziafOOvOL3r%F4R6xfnx~ZG;r{HC^^3UfLjLk8dmNYAa9ZAzQh%k z+%xipDvCTRmUhe_Agw35!B!TSd?Rwum z`1-SF8jlauE>d_)_;_nL`Hs`JvMqf7esbC2CyxQ(GKbpdfG2PoRV#eA5#VFmiO<$c zm~^-KPKti|F^T1|h8B6%ggF~3vY>#a?jL8tzy9&SG@vy0Tga3W`5zzm<0e`Do`R+F zCSTxt{RF0GjhrmDmXZ?vON$AGEc~g@`CUWb&zm}7q{|5-aqB%lG}Znr-qNg;ODHG= z)Gxts;?z&1!r^2wOi$0x4yJ#l$DeC!Hp*@Px!922#Oa?b_TTLcBA4^$VwFl08$*R%C?=|kv%zf+cN+5D1bpy zlVp^BEGt(~i!Qlw|Guxk{^c2vwh1TvJI8)S+`5DuG?K`>l6NJ+1 zSEv6DH-ApF|9!18ULaN&3nDi7|3vYAv`asI<7`WSx4I8Mo%-$_|BH_yV<#f!h&VEl z{xv=N8M`$r0N!%3E&TSszw-a~TRF!Q5vwr|hW*5e`0304L?;R#0lcN9cK_|aBq@hh zc3JnPs+3gkU$SXu6rzprNWc92d`atXqQ;n5@zehuamx70D}HM&yaIouCtxICu>ICD zZB7wr{VHU!4{o}~d}Z^Mv03gSy?Wf5Wk~hq0`V&b#Yd3DT#4Hxm_5AHb&^@*(;m!E z_0QntuXp*_9ZJ^8_8D#oY*e9}qT+!jH#{ch`B3)B;XXhY=knj_^}uzKECX5RcSV%< zVpD8%D?+|qe%T?y;*3beHr%ewDwE0c1;$^P@;tUuG=*z*XOe50o*Nln{}@oh=Rz(q zc%5XuFpDutht`Akv}r<6#6?fp+}&V3Zqy%KMlv}kwmol9Unh3V0{@FO>VK*b<*o7p ze>``%qsUQikB}oLksrY_v{yx0(O|xP`9qO# zv8LpxqLpu^atj<0^w6%3d#wv@X4d&)>w)y=5eGXtiSJW_|Epop zgvzne#@MSyAk~zzuqI1~!))rS_(5i2Lsp&H!SRxdXBA5{j05n(h7GDTXzmUUeNCwX zjvC_RWW?Vc*JWonlv&ru@OcV-25UZc@gMcPcKs#rj8VzO{+BPxTn8lN>LqXWsQ+~v z-zjvt?)7OgT2iFCo#bZ5mqU1#k1di4ZuO>HJCoFmzN9rm{g_z`421c`^bYw+*ukK` zQ)+fzAr&NcVdw$kXnzl_$lc_jFcPWD73Ai+d_Qrs{-Z|Qoe9yE_=H`vz2cIWlWg$P zRdTc8WF*K{27hoP(wi%m?e>S^uO(6q9C{7}J6XtIHl?~zt|d5PGE|sPce?bky9za` z3{Z|ax)jx4@qGVbNv>&Kzwq&8`0AHpa^o%HbbX{p+>VF#ec7ulX1Rjt?nb`L>2iO1_A_x^qgaQJi5&V zzItxybFNW7G~wOvC&eU?@8T9A(#hqgO0+6G?j!KfYGU$WA1xnYJ#E*^iq*N?BSvTZ z-LwsD52w?{k_ixkT1Ao3`fz-1$gu1@%&>F}Z*g(tc*6OXb4%9*=BuU^ZHb5per@Ty zWxZR|EK%ozY{{BC7mo4xaTLb}lcCOFneW|g=62I4dYVOu1B#4D#J`@mr46;D3ydB^ z#f|Qky{-Ow#&&+VSax9npNl7FY%@bM(r|M_40yDGmr>Z(XgBxp;#J}DhEG)>|D2x@ zMMaC|X3{@YwZOfYvBH?)KX#iL2e?vt@r}utL%rNvi)RYRfL0Y-M9YJ+dNex5Jq7Dz zc!+1p$AD?Dvu0Wr@Xj8JfM9cAx<-?`YVZ|>W$&_$-F>6*mbwk1TgC^OTqpO=-6&() z6pkSZLZBd76WNS=3~1M?YrWcRll&&xIzxWYvf0)3T-VW~%jHOSV2%j|p|2KQlH$Hy z4*`HL=w-|HR)Tl$@QU{$zvjcfs!Y#NigEg|AAN1!o8`j|RaB$-W?4~uHI{e#LVtNr_me8yS){FFu5wc1Wh=lGE%g zla%4w;wr+gIfJrlnJn|AS=aq!Z4AB|rIMP(^0lz7%DyO9!Gdv~^$ zM|W$m<23ECN37GGZs90KcTC`U`jx}80FV}#mI7whUB?JyrDsGxbVp6`h0<OyaoKaGPhT0Ur}qVeXZ;TCy)%udlnWlQH+x|yYW_^`tHdF_~W@tjG!O;q6GC1(EI6%eM=(+Wyh@Y@090| z#dW258?rPaU2h&$oLa!HAJfx0j#Eu-N=#Qqf<&%JMRxT$*e2>aJ==RiLBD{$SZ`3Q zF(IEyM!U}K^vxqjJDQN427lRKKY$r?pAz+~JWp%!A#rk8M%`dMFTI5p&bdRIZf!x$ zpt#`prR$|$A6ujoy!3r$ES7r=K6hkh zSUNlJOr_TXokRisY(nGPiAmC4E1429AD23MZ zLlaH@>v6D6Vu6c3OKQfOC(`YGHld3rpgFSGJv>bnmkmMO;Sj7abZTqc$~CY1G)2`e zZ9qbtnm(6}yUL*$))1HPYxxXWVuN8fMcP0e3B~SHemjs!O$)SW-JuD20M%tKZ1x${ zLK5F%!SaM#{Z6pKM!ao65)7i{dARQ`tfwB$F15bZ#;lQeN*UAXv&B5JD@e20?BlGx zgrhKNP`iJiX+ixMC`W^07CF>-H2at#YMqR9nmR;`jGwKC~(G;m`0$7b+ z8g{NN{rYz3J-3Q<-5Ju7$>DQ6L2sT$Q%c0uPVJOGlC}@Py;fcuZGIw&AXc_Yi?>6z z@2Ju?!JKKi0XPBfT9hdbLVH}!r7k0{?TYBtk|{Zqmleitl#O!2YqGY6=44vNl@(By z=eHOMk7CP!cHeeUS`QkRgJuwxCZVu7FB#5_)e6+Nc#Gzm-mzh&P7mc~j z{#I;NQeDC03!X)_$t`4NZhUvj@T^gH)o0g`92=;6+dE-gs6W_2FxfKi08T~j4X`V` zvyRpjFeu1%B!w|vr6x9TSt!>xwWxdg(VTnP)S2uR!a8ic)=u~c{A4o_XE9QXfGh?s zcl3i_aXsjuZoo_!O$PX9d^k!E0S6mSEz`JE^lzV@wuT)gLAbd$JRbY=b-w93yFtbH zLoMUy)(QIVedEHhPN2bd&sfu;nL&&K|3xm~Pvzz*!m#VpMI)|?u!gUx^X$Dx#~U+b z8T9R#ciV=_Nc8m6)1l7qURw@Jz8#n2$O#KGBgr|Na0!$UOp(JF2D%=5=^Sf&*sDtp z_gpK~>NrWQ8Y!=wCUr7-+Q4Ohj9#r?9(5SqV#!p;_bwj~^lfZx^!a$%njR5^Quwz_ z#4xc-Ntr-m~Ga;}uWW3d05nnH&f{agkP~w5&HONDHlUdbt-aZng zD#ia*UCI7cyyyt~ZZxXVN--04VlWL$V<8F%AK@y%Y)Rsj3xn-uQFm}Ql5 zm|*{_O)?g-1wl;h(%))N!OIkB2d3dVTw+%wbh%p0`<`Knk(qgqrhMjU2)en z6z%K+8Q46FLjWb5cJy$Cu)`N=4?-%BD2XYcrtq!Eh{+k7%BGuB_yFZZGkyO_jL%E6 z=vK7(t4A6iI6vCR2Qy_oy*39QUi4}x;AI|tpmRanj$3#P)uW+YpUB6iL<}TXqj}oB zSXmLP6NzxpesOeUkE^yVmSJIxiyzpPR6s`@O|SO@E$TNw>JyqGK|DFgFTCL*JJ5bp z!~-sTfMIZ*o%sE^5|D<<3L1uEt=_m^Vc_Jr8O)T}%a7ELuFUVaX5>y3^?b&#+c_Q| zdF-cG{6P7$s*}iZeB)7TtHIpkei`CKOaJha?#Dh-c=C-)#WD7G{`Twt50m}Jd8eCj zq{_z2Ki)qVy=#8XQ<`CV+9`Eu$xejsc7vaPA+g5||E_HH%9R*KRD;B&0LGy!2#yW*Ka$N^pI{|ce1fa|LUrBizT~4M{~9M zb>VAw^shI&cf~AGP7TGD5xiZ@B(nR$_Hs0-DLXo&J!5rPy(B@d=e=vhF_5u74YO8v zrH+V0Hc@_N8p_v{oBP{2`@Y_C9dR9SgIdgCjvcaO%dzG;^JfXTz`Uq2`e>Inv}7$5 z|E{8|mu;X~Z8_be?`0e{szrV{e#MZ8X*k|1{qXHuy=Hlq5c^!WlV!Dl^n(Hr4y+ z*|LfJfyqFp?f_FjR{M*&tDiTLRs+OZrwt8@OeVFZQOeBD5Xh zc7OEs?hDh#wJ6rF;lZq0`NrRFPB^T=yqueP_yd&c6X~Z-_k1E{#+&coW1jX(F;`s- zilp7@08R0AWKiv*wJu*pUKPn$li&i58$Ml22R>(%FO`dX9SS!a>~9&Rc8j z+6OsdYzp-~)`7I?w#ZgY2@CJ{`c{`k{g~PIu9`yQnmjS6h1ChpzSpx6lU7J( zS#z9Bs>9UKBE9r+6QvQD%u6_`U(LM+{~II-?UtfExsG;^_O6XdACxj-E`Ml@ZS@?j zxwDblhml-<$~2Th?QQ6%xaxGY9afN@KD#Hxu2=RYE@gx=7Chfv@*&=|HxEwY|JZ*k>L$huFOaa zf4d@B4^(yG^D#sdp^bSeE~m3@o`2@0 z=X$IE)EL=qI9i6UUC}FaV(?7q11}$3>_+Q&4H7+LI~B6Ka0x6XWX)lHwA}P|Qs^xKC1zz#d7Vv&Gxg$GO>k5&-(;8eF z5}HPGYW>C5`B$gFot~!8wx}jUBo`Mg8jkVb0&n!JJhLI%``UfxS3Gndr9DpBwLYV5>1%YwUg? zV$-NmpqM2>+!lW~43m{l@tZs>4t}KwzGA|R<@X*WOY84_s!dcp5|jpex};gZf=SI) zdECkpapu}1WFuy?XiN-6MG1R{wbl8m>TVbPltlK>iNp+2zs#Ezp>3+$wZHEa*KSqT zG$mLXQ=2a-t=Kl;SZ5Po-xy5EpIBZvsCK6cj?@*hf6Q)>8&o+QTtyZCn(M3AD1y@^ z{!@(!3t4^Q!^1d)#x>z03woqF5nP?#HVmtMf~>7F0}Q{$%;XP!=W6d{65D;vEPN)0~RDl5qV}vl9Nf&6rHYyofUR z5jH?*@l)(8V*CUqX(M+-0$Z9wo4k%Tn$EX9Z7o(jKz=da?h|BGn?m2Z1|(_E8zu$i zy)=T;_^MKF3NvT+*|Z?zL9}MVRe20o9DKj={May|#9thGq(y5zFCbJkkS(_pPeuOeTx|J+CL&;fdph^%1 z(G`rFe|iGuASPa5-ygR?uUXB#Zt(#w=uk?3~~mgdDbX?fUn47x2(@0WPmFNUnB>}^r_In zm8?P?J6R^ivO2e#m-k!!cJP_|-hBL3dAlDCnX;7yalUva9xwL0`QwUPst1}}4tksl z=wkz^#8F&08`N6o=tkz=vVO1z;?{O;SSuaJx%#8qq(3P)|1^xtc0V)69ZBQ#A)Lva zui3-vR27r;p&bXO-dm)}&Y3cq``kU#o-I$rA3wV2peDu#)xB9_Qs_DNy05h`b2Yux zs-P}cKf1KAYW!+v?c!Qv6fcSC)b@I^o<-w2gnT75=W28eJF3#D9~n2Z>kZTDjCBpb zr9Re;>oIv#TVt~GU@zpucnzyb?z+EMUJb!gnLf*w$rL7j#f>wvRe<}zuT)<#;zfM<(zio}%gSntng$z!1?KntnsMG`!A^&k(q|f#-1A`j`@8)v zPZpEL8D~B&w|ck8272e&HL6YuYvp&>Ad;wcql9gR2T@)XC?8-FWV%|V4&2o>OFgK_ zxM_cHSnp;fFJ5lDeZsrX3gHx#wNF$ZBg+y5=4rl-Jhw&mjTpz-abiQ5W#LX!ipmeb z12d*jAA`zqZ;;lssi>Qt&?%eRHgA~7jFkMGSmcAb?ZW1@)7X5YD2v5Rf|UJ!e3!ZWsV;3LZ^m^~h{=t(-{b3h- z_iPlx)Sk-H*5+IomsQ$l(Eoj8o}_j;%tM~C1m5GgR2sdaazVRP4taV@znSTxXU)0_ zr6X=4b0X@I_d?rKAzkKI1+|7nS8fvz-$q-f zDBkUkIiChg3=yW#errDN8F8Uh>(!n4=N;_UkJ0iaev-(TU9#2Cs>EH9Y}@PtzAJ@P zFPY}-`;5KD{9^F~xNSfB0l|TorLXwg>#5Ks8F;tGZoy72j$87uT}?1e)Mvl_{oau;@4x6_QX;-2Lz{ke9c$FPB-KPTR=G$CqgCKdMp z2qw<$DPB*Z5@`{q>ipH*8;#PJqVr^Qo~+O9=`Q%g+V_X`A3$5$OI{&%S5n%Em&mNE|Qvc&x|VZ z(}V4RY_XIxv}y|j!5+j8HyB(L&2M`F?WoUf=|`1-2W8blMQG>2UzXF*gyG=>x-~Ji z>K*QY1L38CB{kjs6owKjXuL&CWTpB1F$+X>n%^zDO(;@Vf!btmSs1l=~6Hp zqJ#}@$-Xk58-CF803!ik-AW~NM2IuJA&&#*!1;W5Vt%c+-;IgbLU@`*zS5F&eZcV) z@zVXDT)*>UycPmIzoJv)-A~2ebZeWV$+$ZV_?wU+x`8&D1LS^jtxb=s?~xCEHROmE zH$jV^*jor0(U1}x2tM{oe)XJ9$iFYuxn|sB?99t7y5b0{CxolUsNmES2_s^TtSeRPP??1gm^qBlfoJ$;YClY?&6hHetaB&%{m|XS zHX$Lc`!8KnzCjZ6zFv2=Vp^L;CAL@Zn66C@Jqmz(LlWV6YSAHYpH*yPc&g9#$9dL`0;y|yZa?U zgpwdhco03N1oTpAnmxxkHA?0WkEk{|gH8kMfW{K2iw=Su&!G`PmvFeofm05I#FGNEPh` znh2zK4z75$`d2|}hxKt6h^vLZ=V-dNtYi|xcT640X{fxme9E;bg%lA{+d`bioZ3QV2n#^YU`zTbdbX0{othih&FN%-e-V?9ML+Ip{X7}2?3#A?TcAxY;yrL z?^DTa4i=RF3l(i$B*&p%^|d;gRc-scoHvtJm@-fm%Q}l6;@D{YL^1BNo0>}qru#Ob zMolt79#*-o_~R`#Kk*2-j2_p2KDt8=F7HC0%D6EEt4M7V0O6l4je-z~cJWGjwTXwZCDoz=CQn3_R`4B%uLygRPCCXwJjcC1! zGUI?733XW)yI6B%9c&;2wj(tHir->%Hdhpyc8}KDFP)XG2GT~Y&9W0RR2wNKWAul9 zGGtjdoT>2D%R_lx97Wv}6FHGw*FLhg&P8FYCG{q>yjM`d0dLndzhGO-hd5X1E>CmC zbBQY-w-TGef>PhBM4I&2iGRT`TpT*unmcM(<-Uc(=yulz<%HGI%%RB8;)Mp!-~0C=O^&EA56hJ1379QzW6IEO-O_mVeNF(10{x=+ZAEvgApMVrv3pJ5%a>S_Y-yY zED9bfyp+|pOlUg-6)~CZNWS#!cjW94-|>Hc;p8&Bq$WArw((K%L+{$ymiw)C#I@wF zqe#qrTN1n*T-N<{b}e*r>rB`z0~+@Y{pjj(G6U>ElRvdCL?zwKmJ?$6*$~x^*iLPh z*(l3mMaMr{tU}pen{%o+BL>}>rgE1w#h9&d@l4Qwp9syIK9o>&oi(2@kvGSjcso~c z?8)X{#f(sI^cpR6M%(~}5xSuWs+FY2azeQpqM3D{`=Zt3=4M#*@$rK%R>@va3-LsVy{*~b7 zLuN2?t~95-;~?;Lt>ThGy>22;kzYc#wsu`;^e-4;4uKFI;C`#scHz~J8U)57kAKVp z;8)t<+P2{D;H;4sKInjt57`}Tz7hdzZgNI;f#Ax2*RVesy{p~i7ZR{pje@M35ip;G z4VfRMG0srUV;3j_JepDRcS(5!E827;#rlYNn0U>e+%8R?o$T zhwbU#a*0G4odYm*nrHY6jEKMPSFA^q;nLz4EN+!F=lGX%;6RT(1>3dXb-cEW9FG=3 zggti-xIO#A{b58~0jMZp#=!>VORn}Uzz_&LP*<-RC~w@#K}@BwOe-WlnkMH$c%gZ| zcng$e*ssvBWi05uhl6un0`CJA*L}?@6 ztBDmf2p8};m%(^_MO|DLNyP!WgVnv1@35T+~KG z4t}@lOTIaE^DFPVN~p|2F6#0W-6Fa8s{OWa+cL=>eMBY^@1LlV^-xVzg{-D9 zS&5$K5lHk6V%0)5rFaH@Sq6-`F{;hm{(9Fst83lCV}hk%C8x1FqpqI$nm1OOjUF#o z(UieSOS}M48|?Q_HUg|S#~9Hb*ATgWP<+r946z4ucO#vSp!->SO>!TM$8aJvMP?Vcak{{;u+5c?CS4Ji zcufP>!6SbYK80X18xL8}&&-4ifryf_aYdNUvW=i|1kmrF61sw5JuhX&z=ClKmJu@i zO!Sbszs4;!uT`db3Z%=m*CFWqe^>BAs!8yh&3MfmoCTt=T3B_1)2*iMLSzHzIB7qm zO&t_Ob#v?bW)PK!ipTpNdiNKIHml>UMr9D=`~;@+1h6|@@pfHZca`}cuyXj4pOjGncZrFsh7VK zoIF>QwP~-aHzVi0f*{Pg7u4m1tpTifBC`wbE$D?SCk7xkt8ym`)DSsAFABi*Bf?!A zI)l5`0a=2vqs%zEd9Y?FVIFsBpUH4K>F&_-J`a9za`9~cDZJ>2FtpZl^t_2A1`OwR z=W(mC--(57e39BMMO5v%Mma?tInVJ11 z;&e2Oi`U3yB;5^TdzYW4bt}9(urcM^wP5;bb?7_iXF>i;#GOwNnbTnTNj}KHeOkmD z@=KB`Qs^oj=5tQp?9u&d4|>{(|$e9GXn9aC!~i6Mow4Z^h?Nv7@d>8T}=VV9j8eY!qNbxP-I+~)dBICM8K zi+pQgc*$8?6?`Qy%qL=GJ#eymRV2&1C72pKnaKv$-&2WHEXcWMex4|!Ic7PXMHRS( zvKU%Q;aK#f?0Ru@h;Kn=$Vh{Ukqj$Df)boPJU+DTff1bB z8)S*L%8Xjam?FI^M9(`fetv+EvM?_I1?wSQl~=m28cnjQL;kR9*K55wat1rb42>%rj37Bzf!_siy3ZKP{1Z=2|4LPjx% zT7S1;<-v=QC#%cg6?=L6UDe+bp8Qz|lw&17^W2-~Zo3H9-D--i8tz0dNjyB2A{#JT zu9v)^xzI3sYdK)rqwlUZGI&a-tgWJ9W7yS5R_TLr&<)Djh~XA6zR_$&95E;O-c@xf zM`lxfn5}X+jPiC_}3~e)zmIR6eN7EQR&g$eo$2V>8KsMjh>fY_@b#wyC zf3n&?JS;BJc|upBm8T-N|CCjp+A%ORuu6maTF{#Pu!BHMpnAGTqG*L8*?wkNqp;G2 z{Dcy-5e6cecVpDSVYU@)%D<3dp3udc{HRk{*|gkNn)s5!{D#VlK&wVNv_L0aK)5=fScGED!KpKo!czv;Q+WhTo)58Ve!_)7a~|QA{l$?< zFix*ypVJdVxb_Q?NY*`KMNJsamEOKPU=WcUmvm+B>VZ%ZGY=pZ^dP)oHbONn0jOUZ z%f!FP3b=;J!IiKEt=c2Q8i*`=76)}Kdymv=hx3wiS*K-RYR4v<_XJ~bM&^)vMn!06 z5PpphuCnOv6Y5B~HzA{Isbi2ndyDZw@Uc(*{<%s~>LJ)X&A zYhPRxPHj7OfBlJf2$`N;`LfzH{^J$TKl`0Hmht0~Et9(3(fMz*!ZRBNBTIYo7 z&6Ypa%4kaIy)pX032|CE-FA&8IOJX{Rfgc@`J=-J>e!Glg7kbwaHW&oKCN2b5ZMYl zVRJ(vJuaZKZ4=}LbyHL!{^9HE&8q7a16SgCXt!Bx!p2X{Dzx2ZTAVlM?1A=iMR6xP zPE&%bO_jL4Cc6dU>P5k-_soeZ=|XF0cmXR3?4m>*Tg_os3GeXRBblKyqzD#LTt#qF zH}fdfI;>^J+MsRCY|M3eyjDFhc%_HaIF?nu<5x$vwE z@xej$;6M8R+NlS~!Vr8pk-s-26?(SbpW_DMA3Fwfg=g%7N!*IU@C(||q(1y_->%A# zWqui65+bUKTul0eWf~vC-6IE|YZ+3#fg+1@&OvKDQqPkOWtL$7Jcj|VRw3G&dGOf0 z=%Wh{A)gl0GIm$ixX6_3NaD>0ad5VGQVVs0^65+jx=1UQkun;U8>LuCB=M82dy4WD zLzcJCu!r_RXw!|>vU#;_dmm9@)kRYu`7~!D(KoIQ$9jhd;|bGBQ|<9XQi#vBUDzEs zlYFE(Vw4yq)4OfymCrzw&O;b;obx>YDtk!nJ?XCquRsVdnBnq%wT;7&E7){#*!~X< z<6y^bOitCyKsK=t1+i>@lb!?iOeU!@K8#JDr~9b`5whpVS_`RHx;gx@eW+dZKtc9a zK37oJomD&`c3$*FllhJ1z>o|($;gM?y{n}i!r0<`5g|c7_NoSNyZW|4 z_}?1iQj44UB=^1^o2yvwC<`mPPF9KAHf3bzN3d_VKByEo2PoYqL;(h_8=zKuRe#?y z(nA{Vtvsc5zzMY4f)H>DD>F!X^H1ulWyy;i_U42aSRd2`u zt=P@k%QVk zI(K?vyt}B)JmQn%TQV7`h$IuQyKx42RIdYX2#k15Kc0X=p31(PpX~j|J%%*OUsT0k zvgzBcEJK`GU#PCwLXtsvaV2@?d(MfU@;^iR@XtJIS&>IRJ?|P~&?XOVEK5C5uY|Xe zg63I7AXm6^B6~dMx8s}rqbJ7#=&K6(Au`LVhbs5bnKF-?bM6;}1uf|g8?a=!zpm2C zHUNKWV8`8GM&rb#(tC85p-Cr}XPtA8Xn2c#C`>?2d+>e?IcQzw4C~&}I_lOitI*wd zoXg!W6dQ+zvv?VGWKoWH(mU+PTVTI~@O964bC7#MQZjLMU8-Y?%Bu|p_jAMgeBQJ@ zA}3DW3$|VTxQiFWR^Km@U4d$pm|C~CaHb99B4quwh3;;vz>{^ zSgrepoi$6b?|?ZO=yHa-$N3;;Z_>ubvUI4f5a5dUmSI!vJPk+mYb_!~@J^4f6Dxd5 zSuBf8CS|j3F2`dyYcY^UO}hu#@~IPZTaoF^J-;iVH-5;6`~&L#FTdbmKSg|D;aapK zVr}Rndv1vVvocfDip(FkYKzPuM>z9DelUaz#l?p}v1g>R*{hSlcT`NN;QrvOdB}csL%O^2(MngHQ&a!w z@TN(YsOGQ*G_fqHF-XU|5lD*-naUFW5c52!$>lBZ)H}T)#2?>APT`#I zac5T4YoWwnNnKr!eP@{kwgOMy5w-_K_?*3@ras6*TQJ`DVtv&0EFE$ytoH_a&77V) zkU)I5Jk(6iMI8NL>mc+Km}D~hrQ&8zL91Umo`xuzzf}47*bk3NNd#i^9VYv2JMU?v zqkKc9z(aaI59_L)5o%K^FscGGT_UUMbuat-=%k&2M+;K^J*MWLffy~{(?32VuAEoC zzaT}WY)z>13rAgkYkmO;t)p^2i~q=%A4+QY#CJRmi$LSKm!OFxiPgRH zS#h|4$w_IivL{Uvk+8nXy8i9ev_pN))h>C#^iRrhuj+5jbj=Y+yVt1#@iXi8>+s7^ zd(H1!xC@_#1os?bg6TLE#TC`xEazU=1d&J|FsG`_eP}rME5xE;8MrwwvnGi38`4J) zZz14;2!1gp}J& zWm39#w@4?p`Y%I^zTco+HU~-LB=^~ngx`UjzqQL*x*uWwKggNpJrcxr10;S3%TMlF z8D6(;G4wRPPHPRW+WghtoDb74<%~b>L1^DJ-RYdIJm%_mo6rR^amH-E^p&OSZvK?3 zP-(II*yA?a=Jb5tH6T#_ac5lj-l1JTN4@rap*)vx>W`U4?k{F>^?|lspMQa=mp-XI zZuHDz%Za?c{^)6Fn>D#kMzD1}UeyGyuM4pMuQVv|0 z6jiD1PZ;pnTR2`ubO=ML{1Ti3IRxbIC93TO?y^*J`ZH3q#w8A%p)ObvTUIf-nKQMu z)2hqoBU)ces0w$y=JfC7I2-ot;M1;isiS02iBZ0gvW5Isr7Mm1I8Ug((sleu?+#sm z_&;U&2QvRp+mxnU$h^&l0gLA%jn9M<7r%<#&4|{Z8x{F&wV)bMkOH+Z6f-*8l)Q~g&`aik=CKRYle{SP#WoOq`Mh9zJrRdqVIFP zzwaNenPKLfz3)}`T6^zx%&fbU8@3)|NvSm?88SDM;Nf>zkI*-lE0k<<+4+hg_#(3> zE_sN_+q~bh{P={0*Y+>YXo~LY{n1n=mj7j74h*Ys?-&n47U;tcan9+B9yyeEZ~F3s zw?>0w{p_~YJ63J5qz>v6XA$(pcO?)V6x^)4RO&@ywnu#Gn62dma{L107E!hp#x6Ol z;pfc$6rw;47EsZ6Q4oRkKU6d(@5dk=7HMNWNOLIS@a7szvNw_eBC;&q+n$%Ho?k%~ zi^X82w&Wb137YW06B9SY^Um;uOW2EXnPk^@;*IUVNY@}UdTBWy#=(BvV&l3VGV+`4 z)2$RO{(VmbA<%}TO%5;9Y2m*-i8*HRyr3Sx5Y@K_m;Xog{`yM58N<5!s!uGQKcy(> zQgF%#C5pA!YA9Z&_-3x5pjT}z9#lMyI?K1((!-BZbj^gGefJyxsDLB=c7IOGOMg;#S$1@alz`@R z-MfE+{l|WN5xh16iDlZ=BGz`1_%=Pah_4wFG(tp2Yn-I*bV&mZP<$Kh%zDMIe(B?tAh`y>1kN(!jJtpEj`VK3zkd3+N&olH1jf!kk)0`WOxWda2PU44Xh5bXSCn#XDz^)=^7qG10^7J4Giy*vPq!@aE+YiD(p zNPZTqpv3ED_v1A8+5i7x=mJzQ*`zCre>C4Z*+T+WaIM`lewGo?Ess6Xm~wh80SDOd z12vmMf?Ny9QIjQ&!M^UGEbYs1-Ge+3VdjDM(#R#%eEZe@X*6oR*&6fz0?oJl>~`Wm zbSVi5&oIvde<&6si%j|GAbxC*`#kZ-on@WTTgyxU=9xc?YUU4-&DxMrSamqSnZbilVIVBCJ5^{)4W z_4`W*hZixU_opfep-lg{|Eu1<#UL2LIK}Ez>H!`C<&dY(Pyx*LIu9@ZxWjBOdTaSn zJ-_K-lx96hxXSH`^@B+Q7WT7X{Bp5?I6(4=FCm4eRiK}q#^L_^(G$@px);EzPG<-^ zz`E{8K%gkoFxsq5aXPVY(Es*_R`q0ieED(HnU)kWO&IHWp4iCbfPRwKb3c?vpyI~; zNm|;psE*80Vl|%sK3Ymjz$=4>uRQpp=j8~_q`}Fs<_;sZes&1DErWhuRm81Ze%;!g z=i1wgIGYaNVBH{n_UQiu)*JPD!0j=K9E9 z^k<(qY>M|A-#(1t{`B$BiT_lAtAw{7yA`0H>dk4WB)I+F_yp9{SK+PKe@H6E^t|Qw zk#3n)R{z^bVX_TsSZ9*?fYi!$x~9p#uDG>c=1A&JS|?cYqC8&_*LuMloZ$wC$!;7W zG3Q>jpf<_#uyGuWCht!(VK-~|nGC*HlVZqmW@%~hAsVzSAz{up(xTua?q&ZnvG=%u z7OuzE6aDrb0c_Y*aHzgcxR-WG`t?Xd(U>RbP!+pSqjsI9FIz`hEIlDNtZZkwOKyHy zR7{Zg%6c*3#{1F0b?%MvPcscGQl}?5bs1uiDUxB&D|dtRd?akHB9jx~v487XcIt80 zX|C4KXnCJ8#gB@&gCewR{N;PtbKC@?v2PI;#_bNpsZ=#+bEONi(?Spe@!LQ7yB;bv@5S;KI;Jtq=0C}2a1})d*P?TAdkG=e> zXFj6|*mr;ca7W=L{;%Jm)p+HOq&s4e?f9qIv+*%39Ee#7D{?iOLm~u>|NPL#qKgS? zCzfFOEP?T?LjEGfP3hZRB~_e%Hu2jFf0Yr(+~ZtUq0Zj>y+4lt2-}*PN=*4MmBK>y zyt+aknRX;mXoYE@vcWr+rDzOk4eMjQr2fol#x{c)XKB5x#7&L zuin25Ge(xNK6r!tAtiDg`d-Xqv(CJE|876(&xyfq0`=J8^39A*EB82Rr*mGT-u1J* z+@8~8;-qS?I6t|aa8O?Qa@C$f0<}WY&>b9JnL(H~KzB8_BIvMmajo_d`O8H!Dz?l6 zv;4Y&T8&k;jY@vq!D#1SkoltyfL7>iz! z5NE1YfkA(xwhffWO@K!GAXpSoyWvoQW1goF*!;7y>pi z%uMEDhsK@C#ayR;4}tdcrYy>E1uDYB(uk{Q7>?&POv#QyvzAC4?WXhEkG?AJPpPhR zQM~?L$-e^=kaiN}p(ce7|0@)7*T14I$d%Hm`QHPqI^qUTZ|b_IKq{7+f-zw=>b7hH z`j~GIotcgbj?wK2P5rRsH#qiJYJa-gQ%C`QzfXD+`ernqpS0fn>0@P$&=nJiy&(u9W#>H=L`+^02(tob-^VK8;U|iB_)_=SCAK``v zgG+J{?56T9x=iqXrvGK6KfVl8!vOq)h*$ZqS~S%62SS929VUDLM_QdwL} zg8iK1&)@u?M(Bv~nKT_T4gD6sqqi|x&W2$?Z7qS~FeM?w-mmEWdE4(pl@&$Lc|g8q z@BEz;IzS&1{3-R@)PRsI$K!slGya^w$(58ZDk*Qq;+#4m*}Lew-#nA}WRm60KcNyJ z2WSManji6RoZu9EnB2Ac(QgRSZU-!!6&f(%Jwd>D%VlRG{=Ctj^A30~z^8bUW8f9X zs+~rsU*y(?d6NI2aV7sRYceqb7!DQRn19FNq{``l|KrDDCPm%czwKL6T zP(U)^8>Z|vLw|1^FI$ctq?d-+d$X(jDeMhbVBsr*C)v!Oj9tjo4%H3MR$5{{pWIDW z>M%5j$);aAvRghoFpm6BfMiF|Ve3(3Isa3|3uX$q2Lhcz4&A{~A|R6+@r*iyF7Q^s z9!NCgMz(|d|K|99QG#z$dhZWJ$Ggb5TzxBmuIR@9wGPXSAsEOcrT=e-FQ#aY!niKmH_{#qKz}`cETw70Wqmjg0 zFVf#xOHl^2Ql;^yWyWL!-Y7L~zuEMxPOK7q9E61RBtMYsBi!BTf5Y|Dt^u7ze_aO%;WE$H^E!z5?8InrWmgJTE7I6rCBPe1sW9# zU|rPxmXC;kWq*Nn_SS%cfUq#c=Zy>hEm;+~?e?nsVFzo}YHn)(+F}G;zO>3s6tJ`k zHi_EZSUWpAm<&YM`1om;`72BRw>@C7RN}pTj?|;KH+BM>I+4P4y4@EDw6rWg=3H$mH%O35c zF4$+l$=L)!74bfJjp%!}470or)U3SCM&I)Q+!Vwf`?sz{0#R4u&f0_QKQP1v`LgbY zBx8ReDXq3zs)x^Wv(WG86gU~hX+C&Jx-w$Ir4eqt7)V|#tx+~?=L0T~-gXn_3pM(T zLk9Hau&n}m1R@G}JBcLlF7YJlY@uoq>9#k$$(IbQ{Q4rv*h>e$ zSOnsz#WlJkCa6lIh{)$@z_{SOT%J3dZJ)G}K3KBQB8pN>CPEqo$g@QPHI|a(fvdNQ zUZ+OAbj9)+?!^e};ZR(vRH`ssu*6sA8?`Rqj!`<0&!4d&k6sXcUY(z0eDI7aw;gT0 zoS1IgJY}foRo1=8T61O7yIz_sksIU>BP~V=kTYE6otmR8u|Bx0vF-j+hGmgH-4E5K zSsrci_3+uUnl6I9Bwx0t_3bBEjT|bsA|1k6v?n-c=0i3Jbuu+5X7&-O!%I9bhDMZP ziSgC%ZS0O;-c{ZWFFcfxCbqpY_TwQbp7<%D*0ER_EF*R!?TmzBTM!H8cDfZ3{;KlKIeI zXU4BqbAMg?k(~$*eXgB%1ZimhO{Q2{%<;BL8>7Xp3Z$H25}S&*wT!7E4UBDh>M-UnP+v~M!ud3iE;alved_UnW=Q@V zKgSZ|(MZ?g^L(D?g+_MA`N|ays;-`3?rK)D{Wn;RoMnj{W$se@O-RY+7JjC@k|X+O zb~7o%O<~+uBn`hP0A>zNvP~wZvNnl6OojuGA}1CLefv6@O3iM@5k$2{k%Do-+;ARu zPeQPwzOI$H#3?bAdsS4vtp6I;euQT^A-WB>cwC)tl5NLslFruUB%3`MAFKi{b>{L5 zZ%vKolhkM$Xf!vLOwJ> z_r-Z4=PlYDLPM3kP^918tV*$%{G(52;zh%|R?Y0m)KU=tXjL+9x-E;37I$aHAv$%| zrH<)XKKjCcwpbIh6?1tdktQlf;UT&aZ5}6tMOOD`N}eor6yjF0f*<}Sf= zv@fTiZ3re%1s%y#K!@|7{|K-bc~s{D3$xE|zLRAD;-Be5K1a_PQi&U8kV zw?}TcZ&ATZ#@09N2jZrBfeECk#A)wSquw8A#b-D_U9U0FyE$sIHAhJFsFpR7i&_u3 z<}Ig-x%%!ol2sqJzrgA8Muy8#ZG}RToisV}RLMJ}x?qlwV%{*Ri8Yo zfy2cgb&-hkh=PNlw09lvABw0qn>Lq;*;1k#WHl=7R2k_vFYSM6v}ia^QZ%#GHSMl= zgD-o6*4~pJb%(uK;ztbV8nCK!6>Qx}3ldbm*cxW8iq(T|TM(z?9d`^ z{$WhF02X}$1qj=HhDW-&C%JK+X2Oc^K_{AG^X<;(O*&MpmV*fy zq!z0|2CUjRcf%aIm;dMp4UuR zC0BS@+iaQS(lQ|xr4+-rk;Hi`aiDe8E8 z5E-6N)}_``f~p8{R1h`Q(tHRvTWb0c$>->1_OV^uuy+)PN_j$(Peur1+vO!=oFfxb zdrgmdJJ%X7gO;R)wCeSfjw-`__*mx|J}{0jFXd!%d}y5^f;K0wj|>bMGK&LIDny{u z0c-fR5c`R%QycLjq{=O!v)MnWwd)S32jbb`6u)>}yF{(VVJdwf-os*JNv+u%N&3m; zpGH|qsUmenZt}XPT~I2#J8E?UPO$Qs zftTNnGJ2vb!OEt!u73K7a&sF7a#E$Fe3(C6X!f-L4R$qIM!v*aBe1IC-40f$RnAf7 zb>k4C!2ynlaKr~NzBf6`)MJKDR(iL&n+|@HA?)r&C7UYAC0%=>QTaGJes+^(z9wNl zXc1Iq(~CpWT{xNIAuE}E1=f|(Y^df4J0e^%o+|lCwrfFd>n_sBY1Z1v*TF8-x8TTZ ze(SI-t?LeMTa6c0AqL>u_g4Vi1arN61W;*A%tuGG1$X#Q&+G=K9&C^A`3A!CIQKczt+jO zFt0T3zYlAgUEhThSydRh%=smak=Qi~B|{5+nUnN`qeHehU6{5Saq$Q8@QCsgSLXRi zGl#WRfF7w16nuAp{X9`jl94iORmkF23|C}} zfO_$$^_#oiD_~*cb{rM12r!v`_C?b1``Y7u9W*3@$=-Hrpp~S7yR*RBdG)27t0z^- zRsx2YIU{nJ)C9rKC#x7p!Vs-a>|-`~!e;=4nIkurIRKJGdmQD`=0D~G=IDB(yVzz3 z+EceM&|$k2O!4&-oMjMoJl@_Kv4s$!Dm4u9pbxvrt!cemn$`j?YrdJa7STewr)-3} z%Q*^>S~8z$F!-?KLx5mvXY-O0=C6|ww3{J0hWm-aU-`#jh?$xoYRUBlNl0&l%5;3d z{0fSmI8V%98Z=R4@38Hvensot-BQnT@O%a=NkilG2Q9+e-tLD3)X(xk79oDgk_7&T zR^!NQ?KtP84&fiifPj501c41bk-Im&pZ=y$Kio#698d?yE=`yG{ZU-^GXzjQ+1hSA z?^hn|{WCD{vKYGlm`4gv$!39~)4GEm|JFc`JKFHf!blKiH?g1dBO}V|!=rrr7drQ5 z?eyrF`eXL{wiu;mxa~;P*=p1g)WDYasmn{MW&LAV9t;CFoL8RN&n{aH0};jYm_x4x zcekb6T=QAa(2MsJsZ9Za3U4VlQvnxj0SGGNjp$ND8-&hGM%{2Ax`HQl*hqGxyI{#| z9=C+|&RjLz;y43{YDrTg_aZZfGrdb_4tK4!vS=Z zoRmyPgL8%r^HhqR_m$_&73f3jqAUZA{1Ya&LLpN@@95~VyQGUN6;^i6;ir&JUK%|# z?!Ll$2ejVZhB3uvNA1$Gu-x=-X9y<20zLZ4XOo*YZnTxgJ6tMt3obID5gktXzTl*# zeob2lwCrj3?FJfLP+Vm~LhOKMsVH2&{N9og{IYJQGhJ?h0?1W7NsetT9Hf3wD?YD4 zbvbf_>`7?ExFl2YxKk(7zKnFi@BF`DjO zeQdH zpx!FLL&|j~3>F_|N*LR52tyuNLv#jXU$}=P$?GReN`trrh^+%lxj;ikJkD>^vYCf6 zB`j4xDk?UXsNy+4SeKGs1G+A48b>wH*so;$@p!tQDaTcT+jb}AMvjTIJ8pwtf%!jU z@FLavY!ws>>+gC9)B-$wlq!3ZwD%qF(53G1xA({&#$MifLL4KNk%;mjr^?%Bd&3`L zewXeB#Ue2GC(|K?TyINWvuWjG`9v+6Gq(p(CRrVw!!nyvGlK4YPD>)xV5Liq27{x! zs761LLIpZX;>~J{;5ygW*!O#uz~rE~u5{3T>1cTdsmigPuaW(%b?4SWrevL}UD)bp zYAJV_4p4{I+S_*&qZ9reGb9DOEd@uf_(R{uQ#`h>I@EPp zJJKZkG^TnI>zy)b)O8FuHIhUXZ-XFdXgU;gd)M&Q3_7icnU`^{+%z!O4g8=c4p@pd zvLb;yNoSb>s|1<~`pbSfmAI?uc~KT6pUdTgkT*>FCo~zwH~`CAc_c_ek_iFi&BHy>~3hahAv_0Wx~MxGytB;T@(D%wN1JIQE`D z^Yh}l)cbt$Fv+-SGU3iZR8e&!wKJeRc!8Xk%rOM3n!`kks&Cu}lTQ zpgNaEuscp&e(Py3fcrTrPE}3{$HTt#PTWYZf?mg{AsI5_&7jJUSfEi)GqEbFg9b^9 z-dt3Don>o2CvU9G^Hr+ak_Y0G{xd$JGM>&X_Rk|3;JD39k131k2cttL<=cPcTtzql z!=MQgKwy!L$mOlUvWW*^>R6wFxHyJO=7q}Zb(~&O`RJhAu#gqFv&r&-AdLxT!_L7r z7-@ulj{OOj!BPg0)#hZdQ*}h;%8<-&?fR^#vQgfRCK%8>iuZ8bgJ(fpXhR9n@PX6@ zH3iFmL>!zV<%hlK>D!z6r+v91lHK?_Fe9rE8$*|hG)IokR3hDA(7Gi1N zoNw?>NB*;Y%2}88OQ~7l*$EL#V~qcOSF0YQ7MrhRn2z<7JIHK~_^xJYsJUIO6IdrH zVXYJaiE=M6kEMyUjz$84Gt$C&cpTsR2)@pL3O!%ZD15W(3tla;>9%9la^#) zmnJ@Mpxl7VI$r>XdJ$XGy)Y2yuBg7uaR4XCZs85x9 zM1fDl&NNom;h(ki#N8u9t)pSq4vqh2wKs$%MS~3uP_6uMeYW(P|DCr70^gwgM!5i$UbKi{_un zj~HXs>0r-P!{(c>$QZA@T)NS@{E_#r!B3?KQa4n$tu~Y8hOcTAK9=$)08)aXWMMiQzqJn5}#sUjz`!02OLVMdtd$7Gn4+!`1c=YjJGf(iE z%d+0YzP)!u*T6Ud_l-w`IWuZCT*7l9-Q1^P`XfF1&VufrtlT$F@G;~3Rw0gQ#X!I8 zj73X60~{7dBX8<;pH>{rM)H*6x9o)h1-tj|LrI~CtbJ>8{1nZ)H|$)uw`A4r;!T0s zS(JjVj#OSptN;O+ESE%tGcX)}a;C5d-MiuHw^p@Au21u`&vB zuzdTd@&(1U+LU%hHz&I_nEe8a#H?#D*U5Zzm!IH%9~!Na06$Ac(ZtujM*9(hM+Zu- z3#PQkZ!L_C$FVYsi$!Njf%X_IAN-;}2svz97nq4P&kt^U`Au1!hld~==O2T+}Yg-cMoE{R<`#%Q<8Xt!{1*@F4)V7iG`AsEnB%pDS@?h>@o z=1do(Aj5OV=+<$!d=eFkZsr2CJFZlwb)Dx`NGm=@XL<}Cf~9*f$|$Vmw(-g_eYmLeoP(NyAb=sim zw4UaXXS=hT6FKufx7#;3QzkNE{1y#}c8T$zCDo=v1=SE8Yh%z|tGTp1VG3l;fL701 zBaAf605%^9+D{*NeBwvAX7~f%N5Byl&AtZIOW2GKpK@DEnNxXFAH(a|uxhUqP%a`y z7B4D`P}x(KAzoloZ+42;4OIXn;sw{B*%v9*Fx{KU{9z0WO(7w$w8QQ-v4 zVAWgUZDJJ-MH@bYcms`P+JstDxJX2~mZ_r8@qQaOaVS7Fq_|9E@hZJbbaBQ3zHL9%j}$yQ~jxm?sW* z7AjA-H%?>7Z9k&=s{A;~YQzcNCz$=1Lj{*34he^hc{uOJ@+9mk;EJh1Lo5WPz^{!S z&3{V6laGT-5~%Mbc{s8UzTvkYdBg88fPrZNtY}8(Z&8^3Siuwcbl#dRum@9yJsmgt znIOqBkN|VFy{TQ+aTVfd-5Ubv;6o{DJ_E&LD~sKN!>KesFL6tw$qh*MJUI&F`_cFS z!sbzLfa4Q&T`K=_MvlmOEGJcpaB#B(Syd+<5&I_$9=e>q_K%M2VRLh#<;1%?oH{4O z!{0f>GLsOWn73akD6mZo-|pb(3TSpxVRFNzZgD|!cm%@DPoyRvD;r zM|?r?{U4-3o}te^lk=G01wu~~8^ApL_mmf?$mwL{s8I2z`!TU^UtM}QwoT^g=4;ND zW+E8B4JY8E`m%5xG(Z%+l~7!)Qq}Gk(p|9p3@E-x5BB!oR4ekq>Uq2Ff zSi~@R1GL=JItaTHF76$?Yo;%_Aj1rMhVPHZnN8X;myATSZEiMNbGdwFl-9`IIVy9D zBAmizoppy2|NL81cpF_CD{X8>;A}9?Hy0VCZ>QPm^NH_w|N5J|t*39>NOocNlll>O zPCYRU`|7&~GDmNxDqpp8HuLnpxv}zfY@2*Rnz-F*i6xU6;5bHt_wqrrDkb~pMInzo z9@sO)@`5R|fr5zRT9PBq_JBp)6`#<%$g)d8Zb}1E-GxGUS9}0t{=C8;)+U;0fDSC& zP>~YdDl(H(`R4{XlIe@wprsb{^j8gat9Jcjglb9@vA2{iM)@5+e?Jif*qr_@FS^`j*Ap)~dcjZ8{udoyX{`KMSEx4SG7>u|8ke!7s7 z6^D*a`|?BFCeBc{lfjX}1-t(Ke(1{b(iBH@tNsj+RZA$#XuqGSR+JNVwU>l0%|lr? z>ouPv1QEB)4I!t3Ji3HgCs9xX0S=DLvPYOIB&9SwmpY34{^oGqGCtoPMsRM=-3=#ae!6pRhaglk_fbyE2Uj-#$a?&a`TVO+W6sn@p+L{4%F3 zZ+DRK^Wvd%wju%9_beLUE^5Br>i3{&cxMP(Hys)QIqqb}k^SGKbXhI^4?oZQaSfTo;@I)#=3AI!XQx0Olew#Ecj zo^d54sl3Wj-WBsm$-C6^Zn{HMX+smkmyNq_;VPjVoCVKYJK}eHk#3k!DkcfZM@-!Y zul{XI;ef`g=yg-5PRs8=TU(&X^X_Dl?X(UrR{MN?%-?*Ot<>TT#iOG7^2@E`oNx^e z&2fw4cKq#B+FPGqyNr2oTz^8p7c5@+f;wl%j$7)W#HE>qB5RdQ##?~pyh6v*qnXC2 z)Wy{WiLi&PC>1QybqUn%c^f(BjxxDjpCY*z)0cX0xzX+m};G@6tpmAs81@S3L916$9 zbV}-ErBo%aAUzSJ7jfopl?cj2nwp{S3!CJIo0?fk?0GugtXgzMYpd{)oZ6OVrr(t4 zH%8_F|A#=qp<{Sj<+r#8W5;_}v2Tavg0jfyf4%Q4Gj4Gdjfl%`Cup_6*eX)nY>eEf zO7HM3<5EB5+M1jHRL+#KyZFX!GCR3?=wl@TW7o&K3zT^W%51F-{Z+6R$q3A$G}FNm zP`bTP)4ZP=6EOq+6X*2+B6(XoDQ$?52VO}b7;8NGx$lguc02E~k~Av1 z3DEgkZ4T}+&DU9w-wkBNtt9C$&4%GZs+?j@Y&g zXB zxluG=Tg9<>wVl>I)m^sdrSH^*gNatf=rI#Zp?)VRUaiK{1A+|KWxH9{kzsuebt=j( zDAJsrHywh=cNH$GFpR)(~cE^A_OD@&^P#^G5Dy3kZ?6Gej!uH3M0#0nVTH@IK&4a9VstBVX_ z2zHf+yC2zF<=551FuMIhF}9CmXUC2%`({9AiBJY>ix>O~wqb2!6(8CHqPdgvkR{zC z?2iv$Hr6ebU1~Y|{CU7D7h+gLy#6SpHedTFhvzis!h3H?Ga;jNR-K6lw{Z?nmOzoV z{3vc#?~rFgg++_MK6vM=ZEHaJC#b~iS0wM6AZ#+3r6z4ucJ@7f4*V&I-ah5i#4W=l zfo$nG#^WxQzFBS+r;L#gUv+JoHWvEB-!XQ&5_i*ReeUsOD>G-Q{CJyBgjUpzjU_{5W~6d>>3vjsoDL_ zBcWjwM+hM1IYG{Q5j%FgEqS(O+ZR(ySIwAAdY%|BamVde%|l{w<#1b3wkVS?qML!y z9xhE-2)sJkUL#k`V?lT{PpUI)?;K>PQx>dCw9EEpGhH#?8+TuNZ;?%qiO0M4U z0Nvx_ltZu4+8B?}jFu9MS>4pq10B!jqQQBo{V^T+-7$GG)Abd~0Ud^jzdR72p3v~n zC9t0V+?igHFtU4BLtn%u^pt6?I-oSFC_WAvjgvb`mQW=>@_JfPN*apdZ7pW4VjRO? zc`8%8%Uid8&cn@a#yxwf)>n@^tJ4&1->SPUGk^4jywp}~kPbI~7%NHCl*{(eujg}020D+O0yGd@^Qg?Sp_zk4<;5G?qTP!pp8eR6|d9o0yhA9@DTMW0dU9NGiFY3AB=4-R&Ez*Ee#EEvp51;GI zzBgH~ipnpV69S`6t6Qj1QYI*tFM8k`@}gM{VyBi^cv1xOcBX;+!8wV`pWtQ@uX*oy6%Lb;>PIhesWf zPmF687uTA6nX30=B&L;?p;{|HdR=RLU|&&1{%Oed%VE`dl|ePcRyiO!i>7 z)v8pslBm=#Aeu#DQP}f9cIM#bhsXP&WXp_G z$vi1=1&1l=rGWxuzgv>poM-#@D&uXst7+GpHG|9sxVJ`?yZBKyC_9yv{>w9Z-A;ED zhi5XJmxfn;_!Mawoi)85;}7n4Md1D}qZ6aK52}R7S#X^#7s)0Ftou_}JI@`~dRX)+ zptOc%+XbD#1YXUwccxfCVBVEQLQeJkaBzue{*lI)e2M-ny3($LL>C$%H7ER)QQ9pe z(sZu%STz@%l`-9eh?}9G)TItHEum4Em3^AEId4xprW^Inp{;}CK(>9~`$oFyt36ZH z)97SlrUGY#23XrXRc^^gq&al3!80v*)+!ie+5&DdfUM*o_iRv1Etyt>xOFNhk}e_I z6$yt(2U(G(p;-9H>f@*E)3t6QC@U%H@4gyF_TcLA3Tm9(Dh>!{4it+k7=eUkbaTz* zsh%b>5)REIwC%Q$h+;V;t*1AhP9>T4r6D0osYM!FSI@{kctt;Me67a5jdP*;;9!Mj zp-ifqgT$_pA80bwDq^K=IewfX^Nqq(06r=k%QA6!b3IQD!M zwwRTl(aXNTULTeW=Y{Ts$lGIj~4~?)5Sq zhu|lye=bn4Jh*A!AX1D$eJm^}cF^LFF%kCa8%Y!JyU~WNFfMMP66ZXrEj9Ck$yItv zvqVlMcI9EDL2TM-w)IO=-SQ(S8^%uX5Xt}HjYHO*HUxTu19d|xc%~_| zNjgE?<>>8(_;hvuj}84W-)kR_O*33(iGYNg1V}+16kd!!Ue>MixOiQn=8;Ky_!T}AQa(BDGFw<3kW4BqOIkHm}$Ir5JUl)u`M!ubyb z2>8GYJifyhsF>*o>36jg{qZ6Xoj}f;hRCMgIFLf(hdt`n7l==WN6ftnwx7kcS}h8- z-2hg#;z>p}D?Z-#WzWN+3qp~ntr&ZU!B<3I?f#Sje=?qk?M74LZk0B5UteZz{^qn_ zcNwo@plqsHVYs+#hCE2Tr3ib-rNq+fkTzF2%M3BDu_EaH4%qi@K==`V(Q&@Q17R#* z(8r2u-FU01HzhK-WjW9+1=$t@gQP?Twj2gt*dp1Mr6g8z_L?9O9Ab{#u{R5J=o51mdxl491wqN7xnTn2pKl&VMMPkVepTc?-jK?!8-tPE`s*bhmPM&d; zaMW>r2%=cbpueeEf?|EKfWmQzKhpDFO~w~-YKo&4O-lCdV1SCaD4^gZYC`9r5xx3A z8aFOG>?BWdl1Gt4;R~fTtZ;Bnq)C2Uc-W#jBs8^TMT>^t>Ma7Et?7TbbVc9VOBh(7 zX>1Mc*3Aozqe{R>znGi)is>c!njbgVG2Rc{U>h%tgDZ|FwD~NHuF;c`9=2xSp-Fj@_}tdjbjSBry!yKS#K)-!Z&WPBNSU&FFJ^yMbH5R| zAmTj6R{4;!6;*)J^JFC(&36{P6W!4@5U;FG=a%Ri{jy??@Ea1^W5aN@tz3-l!>D)W zj{`X)S<2V+V&A1F;f53j)N8ek19HL_4i$((VleT9X`Gf52LDcfTM zr90FRd@-9ntOaRx6etKbOfE&MqLkaZP~jhZTAcFv=~4q3k*&i{B4E}|1y^yQ2AZvC zllGYhV3+BryD*QLnFp_({;a6P$e*0%qA@?`_3LuiuOlS?O?WsaPzM!uTBvsei5roEM|G^3G*_D3Cp0_vkqBV@^Aw@$wx;Eo+K$S03Wb^OWRNN@hJCdzAO z>-86hsfX>fZ&NTlVw6%-_7b=Av=Nw`&1T4aYN3z@c#oNRIy}9iSDbiaKEks;pI@<9 zP>L)-{$Wz=)(hc)MRQbV~v?!BxFrqBWqo0`5G_gi{ zGK-Fg=rxrYhaB!J6TL%H5_!sn1jW)A1l{5j3Pe$nQ-wcAw!;;`4@bSSXadP);Bm6L zJX(2DCB-27K+jzo;&G);;xL|?AT3E-ghN7ino{mX5ci#@ovbw6M^{qIgeVJBk3|Vb ztZDe?G@~Z3a~DyrH|=H|m#16;wO%iO<}UvHc!f}X?n}i&mi8r1fq8OHjSiF1ttQC* zHbryp7Vy_%Q8Jp?d(H*%5-?tQ%Jb|&F>+c@f-miEmuf+4STs{+Q6z_ToYAPlJr-uJ zeb2Z$N=W(LwO;3RxbF0I-nYJ|68evC7Z3wFrFA@Ak0AWr&|?fP&Z5m79Asef)K;*zWMG{@uSygU5C%kVY8+91jNE>-q{L5iIhylC_qb;}YEQH*-R zEc+@5@lTj7$WZ);K!(nxRLMWKfT!qjTjgGN9I{wUs*w@Z-4@Z<_akqm# ztu*1qn11Hj#R{q=Y`HDjMimybvg}cNiy~`&d{l$W*4&g-h#M*{&(~WM7VAl{Noi?s z46Q)n8Arl+D@W<=M-7O6Jm?_@Tw_Be{_xRh0qPgftf_znYp%{=Erwq{{1!BGU0;nr zb?KTuU;pDjk*`K9?7*<`E!PH`IVlOSb^`-Kr3)Q=(MFN z{Fo*IPh30msF9$RHIDI6*zsPnN4v|mWW7CZ3O2PzA}%-RRS~B59Zd1rj|-MUEza{B z1r$JVje5xh+rhMZV!3t>E(dZCL%f|Lv3-Y;tVUBFi&^y61ct3=8QwLL=l2eA18iu* zSRgoKPOD#ILZQqi0d5!=w??hJ$k-7hog)rXT7NITZqG!LLl>7fqj)^YIiJJY&zU#^ z4R~{Alaf7GFeRY#jyB z=Ej%o1vyQ%B2-#R<~O!AAw%X$Yi4pAt;5v{Ao{cnVnj)A@lwnVJF%mCYehG)d$blDp7 zM@|WEWSpL$mqdTyP1+Tdv+X(tO?t58WDP}b7Vpx|O{vguf95Sv2UA+tv*Cpolrf9< zx4RI;kA@7zwm44~eNkAi!ImjaeHBUMuK-rTHm9|}$2@_4Eim^zl)-ki?M}r(wx|0P z@xX|AhW`v;CPz>3aF**uJ2XpE;jc!T@=F$cWAWk_=Q(!6m#(?YFz$|-4zAg6smeJw zp6d@On;^Ffd$%%3+dw_<+4&c)Q=AQyKGb)Yvuu9@GN$Arg>n9Iqpn?1RO!OIiYe*x z25w1wSQKSU>K7Y+o$j2BXyk9Fq=6`X&d>F5f?$oqVzrtVO#)2$6btQ$iTYh+2)rM? z#6?Q`+_oG6&kcRSjO`r~j(0A-6W9uxyP*^}_-{K@WZ^|vXPI~pzjJwBLc#A)jc-A# zr782efrj(-QuWPa^JK52{$m59{UpexYdZC4aE*Ois>xEjH{m8|h!1K^>F$1m;7 z_v@k-i%K_pS}FLB$$`4jZGh*3WOj=X?{gqq?{B0jL{IpKp>`XkKVSL1!ggYCZ4DTk z()naTzn^2@yoE;Aw#>O5Mkn$Qj1>K#z{36fuc_AN6Et{^ZY-MT#arAI+zk_OjXc{XplXih$^~H*fq#pI)Qryi@ifOIUN~S(0)G zbg_F*1jWI@pooR-XTP^s7ZF0ZaGztFd3@!ln{@Uu!*QQH4}`?!u;8wK2R}7YOgw{_JZzaAdnp10EwutX%ho( zpAw8xV^WXg^T1u%*hKk6OncfSs4fdO&Fd8bt|=3|NdNy>d+WHUw(oyf5CajBR*;Zz zfB{B8L>iP6vtKi}8$`|mL4aL(Rq zuYRw!w{SI6&U|4PI-n`~@J+LjjQf5ug*m-cKct7($p`AP=avz)E~zW<;0H)`LMpA< zQMAIL$e909)C>LzQb!yWyWK?qth1)Pz90tk*^5wc3!)^X%D2@|?*kNa2iO6{Saajo z1@^+3aSUyt&7k}%=l+k^j+_)J)mJLbg3e8o51K;l3uMmfFP#PCa*bD%nJ(QM(UM?O zB5P*WFiVYJ+?hUT`)VMRS!udJi_Q6%#v(r`u;TKgQ{(^^lkE51{>-Sf8*LIA3(f9v zb|Z_RM&$T{*_~Z)cb8AsJySTrq2$`VE=&A(i?c&|GVI}Y=_t)mLe|>i`Bj|Z<#Nm| z%%y!2AXdMfE){F`PHOy8Rz21<*=D#m>D;)NYI;O}r~8ya>SQrOVuIUZejv4z4{AqJ2kX|_ii{m-*WDx7VYuMP-?6+_CU`z>>m@J!%Wdz~|==mrN;c2KEfw5Px9C*P5R3Jdb$aEnRP zsm{OM@eMUIdlk2bqOE}Ny(UbBK8&mPo3{?iEReNOU|)a`J-&@v~*MQFHC z;1B9mbX!@q_0Z%gX`Ef;`8IA5zSkgS_TIwfQg#nosOBgdekKnuf3oh#T5euyVQaR6 zj!KGA6ui>9vc|cYl)fPGrYx+5fuc;RB^%t62?Yuss4LA~;zw(LVX*HJgBv3)FqiNO zGx}dPrYei73HPMaFdN7!CcYw3xkSgwUMD84V!l}E-fqBF(Ws8|i%RbBywGA6>lrHP zVSx5Nvj_afD;ijCc#L4j=UZ-K;a;x8jqr5J{2b}| zRq+9(qXmvBpe(0f7YD=Kwl3Oa2R10B(4%mSXOSE=H+O5>m#4lPQn^K5;ueAgv8iA<>GB!`i>R0JIWy&ZHue@3jm?G0d9FQILvs) zxxo%cdO^1_)QV~qo?&u3YR8sW-04$_-s=a;ur8>{7qE`G%90mbZ-)ZT%NVX;WjiI^ zk+-_41AOR!@JLky08@!B$;6bVV6$2&zNjjX+J0MrVl2x3sKNb5-Bb@Ir*WK*h9adC zm#Z|Y=GftT8!A%Y!S55O+IO2DdI8oGyV#`kVel*g`;7<6+yO`-m218Lm&51V8~d?W zs3!|x;1RgeUnD+)Mj)zj!#2av9`TIV2hGUi-cAZVqs_@EJ zpd1L7YX$^>;W>=}pd*$IpY|H?h1srW#ntk`lUW9*t9o$^7dn>Gr3@X~$ol6gfSiKE z*e(h~kJJmt^32qJXD(0C-SfsLh5zqG(U1T#8H}(`%8bA8-fy%2{%1$%1)bbu#g;(` zSNL_FZ|0qsmL@%?FD~dkU&}Y}N+B+NoHtg8qa|IUBW5(mI!4Sf@C9KmIDIgjJ{82S zt>{XXXWwTKH2?%f+#>q6T@X-!)6IeOAB+l=4!M+Dz;ST-ft@b z{OOj1)GSKXFSGu8*WMW@Y9ohd)qz~Q*J6@illS+s4GAkyT-tL0F=xb|ZpBV)r)gJC zb$}++PjJ*SBRHt z(U@M~M8SeXVzV;Jb{^)?E+)O|H?4-%z4wJ#H^cKUaKc_i9WdNIqDLp$1g-kA%T!9@ zIM={;H8?DWB+}X!j~#la$!wDLhO0pJ9kg&=2SlE%%N9NQg3Ie^<%LDwv-N5YA!57D^O8EvFC~2xNoKbj${gz7y?=4)-ERyRO?o#kA*Zv)Ou2)^u9jdg)8# zl%K8ZEB8gJfCCdS-7^a2b)7HmgPm;osdV9j2AWi%gw?5*dZ>|UEN{4#7GoT}(*y%` z*bpn}UGrY7I_9brv%cPf%d8IpbXLxj_QMM~K?AM$sqH&3J&s~;wJALYC$>yp+6MwN zHDsRskiuj(difNM)C{P}KzES(YQm1~uGjqF#wcad+KXc^m14!NxpkwuB9Bpvw`G0% zB|IXL{GQZU;$tJ#`cV~UNWh5j^DA^CQ;L?iPnwRi1mHPeAGSwJi8OVblls#SQM~j3 z%;= zeB|CETp&k@6Yo+`o3Ge`-_Tk zs*;HU90%GPrOiDy43Ls*fN#0e0H*=+)||13VXD-Y=Nt8PbDCN~#%sU0V1G@?8Vsap z%Dya4=$w|0lS=%%u-w)*SG#Sq{U~tayPta9>i#f2`{sheHf2fW>}O*$4)KPbndj5 z5QmG*%z2t$6#pYkNP)az;X9i+7Jh5M1CxdY_$6DN3^h)Org) z16rS~)0^GQ4V=<3&n9X;nwvD$+Zm+FG-Q*NF059{3e(JYan^l2pVs^bYhrW9-T0gzrx@mE6a9aC}%J8@wC|XQ0GrxwTry@AMp({o}R_!7BqjG+v z9!IA4%%mC?fOs|?4Jgx0cMrCwII>*Dm-N%v3C&zEyvxPt$hW z<>*jZcoaYI^>Hl`hy6DZ+pJl_OR#))VUf#u3FbdL5I>X{9e85OEXoad6h{6 z(JieOw_aHD+wS2KX(WbzPi+IPx$oCA;+j?#?p)Vz?5ZH6{}PB$>atW8E2vcZ5_Y3G z`z6-~U~*}tpJJY+PqvH3I2h}nuVY)g+zEc;smLQL$Wnxuek5KuTv4DM8XNZZ0gw*K z(>g1AW{Aw5<&R+Z{*gi(cTsdZMgbp>x0Y?;D=BPJ9WaS@XhEAn84jbkOYcz>JnM4& zWKCX$1mzZElnkKXPH)$I%9y?CKuN9T{08e?a7qvv_u#jA?ccYWLHax*$P?f?ZJ)27 zp@IIGm*m1sVf8SJU@wbqhMu-oR z%;NkfljtT1in~d1POrxmwpq7) z@5JpeLj%lEd28Y6-L3%+FvN>{0X3`uXMI0-_pHbs_>Bnf?!{%wqi3mJ78`j9+oaU0 zcF&EX7!^5a74=>M7+T20+RdPY8K%^cuYy}Bp-I^N%U3#IRH~0;MSTfUw)O}2N`GS0 ziMsJA)!`Z_oQnp?`3~4==Xwie-ik5}@x* zoQ_ato%}X)p@|;=Eq$#JY#IKx@CP~J10$^fG(h4{@@xh^31`|-w61WXlb1nJSAT^w zg=9NF>OORqr>Ii^$~<5s#q{DWW?%)KPv9Io7lC=!haU|OM66k*mJC3807EelzYo3x z6f;Gd4ROw5VhGBG@_bC?p%Sjn%h81idyRUq2;{L+EVKuYbSYiV= zI$ar56rIfV4d5%hot<*%vyKhs(v9t7)8h?nJkAW#3lp_&gHQq0eM(wHNL6EXA*)}} zbHKH2;t8Ok+Klst;+{ta%rq-y2zDP~!i*Qsrz=>gIYOs?;#McTg3rhuRxiCgo%Z=h zf@a>L0(~B0rO=6vgKr6hY3Xz%{{r;&UhS&@`W+&nE@_pz2~cy-62C>G8K8xtnyZl( zW+^@qx&CFhcX=jbsu|(*>00R5BZO^+^8^{VXKUFZ>xRI!fIShv@QuM@44c*3!6vhP z&nxQPpDC%WYnQp^KVt%kHu*`S<^ExwM~kMRpOVJrAs#V4@=XSS6~fnYl;ysD$l{qf zL46-^B;t}$6|+=za@7aAj2#wscGQ+8;G`HIUa>J~yV|^qS`z-PnF>%pGs-BHx>D@} zf7BB77Y=Z%fCdCE8Cgr{l226(R*2WS7Vav(q`80)oEamI5ZmF7@29@E}OTL%b6JLUZUUrVS|3=qF6KN4;wO zA$EaSnTmc_!>Jfd3 zTuZis$&yKD)1J2m6ZM1L2ESy}=O+E3nsbURO<3K(rV>vG1`tOYAkT7<&t_J%6aP3?Mq!vcwJhamcbRVkJH3(5HGk>M*b>UCLy zF@n?T)RKiE^6Eg;6&eTKVk)NHmzQtx^`?r~+%iaajEIfX0nJhojLzO_KPE$XeA2kx z{5}NWwu2dLTdp!0ApIpj*5pcMJ!69tH76;#md-ls3{ya7h%Kw{{6|-ux^kOBpJ^qo zr#bE0|6>v3%q{>~lgzo>!Q3Fv*8R|0`85G^OQ<>MZy*gQr{Q9wkAhI?iyX#+!-hnW zMR1xqt(fflFon>L+kJZD`Xa{)J}$cN7h5Ee*cocQmqh-IEhp0*HugGJX=)2W1NE<^ z1m#jQ*UM$Sr{>Jr28UYn&*6Ji?N9=NjS7`aCJqcI=xFRw=s9b62IIQ-!x3+PV;(71 zyz;U~E|$}+yWJah<6(=9V#-CLm0Zy32V2C=PMCkC+O#vRDa3H8H zq$jm~$~Ms#sLt7_gicw{pg#Dw1I9a0ff!o#d4u#HIM;|6#rZ2~!kfL4JH`JuOiVFZ zIw8l7$#)|q83;FWB61{SYu}7&Wo-kX7!e}SJn|Kgd}l7tJ~qEon{`ynV)1lYJh?B~ zo7#G3Q|BtTvLFkWbFG9I{?M2&E`MA;y0WE)p&HISGD_yCe16AjPm`gW^N>S@{P!&1 zh?y%*9tAj`{)zGoUVF%9*m^EFa4?RjnDe z7dR{j;&AK`wl({hh4vUiGSK2wg0dVmvj$n=M}Ps&LftywtCpjb**!7XRwmqwJTb$X z4hc~73Rr5TmeJDxc1loODpEzuEpzq#=)gH`-S|KR9S;w$lH#QEftpLBbnbEs0sKKG=vJ!$8%^Jicw5M5kSSgp~DRjaT ze~-I5J^}s1v){{NJw5fuU)sUNK}wIUH0uk#Pg1B91`8%{B1c zJ)jdxr~09sD|I@ng(%><)eY|%%jnL!^NCz>icISY6#qJW`2nh2Zh4cWBlda8q`BnUacm9Ve9oN}IZZ8BKvtSVdXY*$~F; z^QV&yI^2Pt*{E}4cYl|V6C6}f{aj_gbUFzrR0&dLZO4VMYOBm9yxxIvYrfcbF8)yU z!MU0e=)9rYoXyAFGCp|zW#`*L;*n*ifU~*`7$3L%J=9?Ss(EB#!C^TyppJm<=^8k) zQ*YIuUB>73zGql)ft(SEgNha{uW1Tkqo^9Omcc!*&VcHF2I3(!Kyuqt`^*6PhX5B5 z-pvfRmqaFUsqxgmdKL@ufr)8=${rI-lX9uWMO>i2=Lr%CtL*bQkWMtKv@Dq}Q7LZ2 zpEqLb(;3WrY`!Nwwl%-04*s~!Qu2wUhqJOSvJ0;FnF&` z_TmJ=e&=B6iWPUr1KF)m-g3X)h8T*ih}sujI0{Zc5S@_j znvo})vrA;skI%nWyfJ929`9#u#gODn1oYT;?)wu8RBjxVdjp9>LAZ>9jcnk>b|WPv z&&my;7xAcA{_Vb((FdrKR%JV5kS2FKe&QL&aQvK5GV+XovYV8@xV67#XyURjWEDiv z1=dQ7T`pA{_|_+`SI=f8;JfrDMHB4C-EOrhma9cxXuG)RrBCF5V1lFJ?JHYO-V&>@ z^&af5t=?A|Bc94Bdko4>Qmmu^(hrg8QBHl1{!fDpaWIxm-sdXbmj;pq4c(C4Roo@z zOQofRooit!vll?Gh%q=0;)8nt&kh1Ov($HwB>En6_w;=^>y(k7 z(`&`0IVj$5jrNxLv0gv(U?ItDy&jY&2iB>UV7UnG260BU7Zq~c zIeiIm+Py7$WOg3|{V#K8)*<|0Jhv|7P8cIT*ne244$>b`T;#(4ODN7;F)!aS0z0Im zAUf$lPvJ`)p(P0mP=WEkKyi|UR^t?67DqJIN;_9P>0qPb zOG91EHe&a_AgROQr4u*z+_XIrrkrg{Q$n^01d@x%on4gG7TB8V1v zhB{89eW1MN0UYXZSudgO)|Psq?Gi%*j*!=g)2CEuF1XjD)Yvmg0q6CosXLJ4OU03U zaUs>CrRrhp@}q^%*5-u8Esd%>=8CSePhWSq@#5rZA$tJRB#M<3JgYXUWnHFN5{x-r z-n3R-il)d&kEA2LUibV$=23 zXO(vSlz9?HWaDv2?BUxRKwEW|(E6J?AOW|&cB=(FiOrRM6N?NSl-BBLj$q*2gXnCG ze8Kf^D}_>X$lH%aJ}))SldDTwSV0&(Y%Avu=Qi9JuqYjS#QGOL*OnPy6&4vZvyKSu zS-Xh|bW;uuGB_p>EGciAQfpLZeW;~ybf}Ci+lCe13cV2mR1SDn9ngB0Dp}b7{`0MTJjoNHIK?;_|B>ZNp7TkBT8OuT!aH(I&<*r({V=5cS zCgkS^e9jLC@L$I508>O3g1pndIw(p_L>%a#>B~zYGMnt}uy5Vl#3V7fXRef(?mF>6 z4Cn-=XIb1R43g=)QMoOu#Cd<4P+(QZw)|QrLc{8hX^k+9 z9aWwSEI;}3EV?XG2553iQNL}$T@ClP*ol`^zyw-Bjn2T?2o?SXXG0Iw`$lY&bmROT z4?vB1Kx=^JtZ?g1W-%ZIGMIO~ruj2xDV7oDYry@X%i2!f!hNmMq={RmxUC9`~k%L#`$eL=Nh|C)Jl0Rar)3yoksr14<~qJ;~tP*a2O<+cKB1a`7FBs^P8I@@*wq4{>W?p z{PusnnlVNm5mvz2w)yMkZ6}8M-&4w_2SO6?OJw5xoRXGLLy@PUS5|Kl4 zjG^;qbj^Jyn?nLp;#PC4TK|=&S_zN^yvAGF7yp9^J02kuA}MTP&HqYtBLNig9Mew& zk;qES85h*3mRZL_iDehUdi;M)MK@?p%qmbrruH_3vZ4>CFO2)Jx3%QW9+XE zDW@k8Cj~nb2$Oesp_T5*j-3b=q1K|PM0CyBG?8l~Ur|8{bgH|OW zKg)q(kUXNOJ!hjd&`2x!qRG;9ncC@o`C|6eM_{yW1Q@rtDuJCQv*f&EKNEoxoCih* z_XB2GUFl$1)V+qsF{Aqybz7-tY?lGlOW^}!m>=a-{C8UZWZUO_ji|GN6S`V=56nU> zWfj-7JL^rN4$|D8A?V^E8fWRnvUkr%U-h+hoptXNq2mPIie1?3D)m*0Lmp(^9LdJd zRa;*+tLJ-t){sH2_Y$SjpF;H?U*+pWHPaG$HAgFixXN*cIE(;ip@*7VQkGu*7p?S*(t%rk!v>H5WN;FX2?m)ApzX6RF@{vxSQPtAabJwDH_}(-IzThRD{= zo>4Pn&5LAd1ox!ZDcLxKI5c8$lM3=_a9NxOF{kCr%^aC_WUEO3t!DiU*+7yo`AR|3 z_r?8^!h6On50Pz0F54mC|7n(bZc|D~wrlLadSdcFrcj2o?jtKHmSX=4qudjTV@J-i z>y~l-iM9SQjS(q8x|VFEN-6%EgHh0;sHK6!>oZdCVMOtt{x}~SQXvzM$}|3{0O-fk zxJ_LNkwwaVtAfF;|KB7VK&|^uvi!iCXLaxjAQ|J^qsgEjU;h*58hP?DeJ2^t_k&jDlw{B^(SeWqake`fnf}AKUc~L9!%M=3@5n1Y=A0 zZ^7(B{)Gj^3R*bwUKbNj37&>pO-{lX#_Q@t&yYzi^e=L2LPjTI_m$%TPAoeyFH7h4 zxB23l2@?gi;L8+rbV0E2IKrN0eE!;VFA@@z4!v(GUK6`&hv_k^iH(nqP1&SyJ&YOy z6)KHQWk|L7zIKG=c$=HEvZuGVWuql>iN&gI{cvj}XOGVMOZOLUZo`bS+viX&^PNMz z{+w^CLtgaz)j2fsA3p+@9dFPW$y8^XRI_1{tB-P9qqikCYsa=fk)C|$@8@UmQC+;D zg^K%AL5trOX~mg8od5gnzrM2u6~F%1Uyv_|Ye&DTWFkU1`bu?j!c;0^P5RsZ<4f0{ zpL!Cgx2+ZuvJkhYdxB>ng>HQP^{GdY6Zw4k9UnF22LfPnzuxaiMe}m?zG5LY+a1rJ z|C0hgA44A;Pk)~Tbv@R3Zhb&a-p&zf8zGDF3&sB?<_&)|&$Ezke8AB(CK9`_b7V zX6~N)XL5>_gYU0)e@%1h=J&PcdwziqT^z54ro2|d)2@3fEnJnmT~fE>*)vwH||P!qKP zIEHlZ^69(Zd5)Y4czM9Z$EOyBFBW%ZN)1@JCbk^z$4vsxe(pd8+V5XDgKz&HLI{#9 z)O-enzm19foZI`$*2@fZh4xZBa)&x9iw3->9yvv;f97@g2zc+I^MMif@6_PaMn@g7 zL$aa@D)`!|3H?HuU&ifr2Vjmav}k6m&Oji-fdr^bI$mb;GChFe`sAw)bq|gicwck zE>D2o`Mf8k9-*n!&|!)q*QP|vczS}A5z1%>1fdn&Q;W3rVu4uyJ8$rj-vMMU;_|c0 z8@z5x=@@LH0zrfs+P^G!4H{By_Lvk}sow4w3jxJCrR>!y?ux$ocP>87Nqk-aX=UYC z_~WY47viTZx)2(1o4mxW;47T5UY_J4c~_YLs=J8gL>sVDxcQt0b5 z*6A)$wrxdRV@x#iyPB5^9FRhJ6Cx7j^6!K{LD}z@Zh4e$P)Bl1)Bp&#kZid@|(G^XRiX2hOIY;`SHjz0D%U6ao$uq4q3K7;#h)_8mcyK~oO7}ut$|Bni~&`Uk=v7n6b%NW_a_>a zCL->MYq{RfHq9tvZJgc{=u+q+OBx8u6|dAegqcrF7Ns!naYfy!lvY((E7?}wG|cB6 zcN(dD$f311-Kng0b6Uo!!h0u+&Rh4U^QvL$GHsc?R^Q@$IeUq3vsmP=s+9=AW2C~5 z8$xOhlGjsj<+2qI4^zg9Q)dvKBcoWG{Qu6Y-*xnT^a_#P${oGCbj1~8(S0^HAz{TW zy>#2VX<5@6_L@XR4^Y}&Jyf!(5ZK;^l=okwIn3U8X72lqPrL6rO<+y^U|+xnPviRb zU0@KHsO7Y|AS%N$wa2azu{U?#7d)BwY>tNJEs{6r%ZB9+Y>7?oXWbJrY0%7Vsgf~Q zv<-truaVHyv8`BAo2l7Dg5IyS7D<5-N>q>uBmE(;fO6lQYO4o?gPG=37 zzHJR0vt6lOfPUvtU*Yt-%&aOc*li3)CD+ku(OVnJ|7Co2h57n+#$V1KrZ!~VUbumJ zohI~o+rD$Ym6n?8rTX8^_c_!^-!$I2aS+3XGwzzj9Y^yrmQMz7| zEGFyLmVSA8-u(`LoHtsSjE0DqOIihQbfq@-%X4UhR4YW2SQh5oft|Se0$R7=nH!VI z9Xu%>u_mO`X86aMO;2tp_K8PMCv-DH$r zjlj5Bcty&r<8wlT`OA?a>xPC@rTRy`D+%qy%MHQHhxHzc4fiFEbtqg-+@OKFa*vet zVir2yB%ludL~F7bIf zrctrj{HemSdc)nZagAC-g`(VqV*QO|;%yrtEkr7YHCLEQob1L-q6^>-xj(iyosf%K#hCuLNuAjM+tkO=Es!baSbsu=iL}=Z=QQzAzAQ zs^keIOV-yr(AjzxB!~1VD%B5k4ccSS#LgdE%qn$d$ZD=vRm@QN@1oB)P%*cKF9*6K za`2N7&~nbMH;VQBiv(es#B88UT_HGI)t#M3{6j_PD~V=P%Y_RW0v3Iw&l22Pwf`^CG+VwvED|ND-{|J z*&772FN4U=nawA)d<;6k`rC8iBSm$j>Aqvn9CQhl$xV{rcCC%i_KtSXhX%Y@!ksEI zcQ(0x3=y66OkW<#Me=MKGnz$k3}f9?V`)Qu8~IYI3EB19{IBX4Dwd6dl+t!-OWLA= z(BjEm{4_ACYMeO}`O;=FtzF7*Jbmupp0 zgPLpM(Be&W+M?~x-`JpfFmF`2Pj%`9gi0%rG(cya5TC`oFGtC(PxQ!ge9N5PS$6G3 zXh#Ld?od#%eiXtdUr8e$mva;%YFicY2vA?0Su&-7@(Ou2x)T+!7q|W5?lQu-DrW(5 zC`^Q}+e6H|6Vd#YbP? z7s1wQE)MPusCfR^+1PMaP1fl!G}F?f!0SA2kHcYpM$9K!d9AT$y$e-2KJ$*85}&`@ zo6Ntm4vutK`;K%MPdkeL_<1wVsOP_!T?8pWB3y=t2k7Nz4?le|z(}QYWu+T0PHYK~ z40eIU+>W8*W*A6;E`~j0|HtD2RAMsD3aY#6d3%%^$u2&PB+*m^Y}HTn7!9?ddMxN7#$W2_Ghh! z7Iu6lOK8U{RtszT+;<`zZbuMG*G^e|)+M*mEc0uxzkmlZeNvPko6s;}GiUpG!=cXE z*G$i4{o&kRxmrTax3$A}=j>55pHw=S7g%K-+7pptyg9Tluo)hpIkNELrKr_OiT&6y+;?s(hx9t5rTA89v6eaH0OjNx>3+r2QtIy0tHmBe6!TJ$oYf84b>Di8hW#{wN^747#d}6yJLE4Hh{Xk$el}Zrv2Z=& z1{%>Ly$a0{t&9sVT$LQ05}jh5GK*-Y^`tobnl%GxY@bpfD%X+;4BhK0AABoHTd?w~ zCjUIq*z=6I0*9Lwg@@)o&Vh777UKTbV&B?cqla@?V?Htm(OwOURtIA=TDP2S6Vy%C zEE8cG>1!{aCCeOOdOF*6Led?)9Nk=e;)L^&%vY1ZQk_2LJ~jQZsC~b z62BCeXEU$~$VcVg{+4oJZIW`ES5e`|;V);a&{H&%*ur`Ie9aJ|vw<{P5^Q{-lw zFSGr}v1ZCx!iTj}o+VQCZ8S$EV=hDDM!|Tia`P-h>=%J}Kya*+>q-x}A`z28X6 z^=u;OMyL)o_7e4m;&K8-534Z>eFRR&Q4 z9mF?Ja>et$^*MTRS0MP^?txbD#@*Q)LN25ncrG#4&nD&z(@OEUVcv7=%Kp*JZPDTA za6&ovEpeeEA`T+=!{q(Uu1?G@$1fW75fv<*yXPkzRXNn7!bOXly-~Ef$g{(Tb)C%% znsu!3`=tkyaxN7*3peQ&Q!_J!uJ{`E+3jV~B|>bC?iQ5gg2(rG;0;vHfD?GytNbOL zYmsd7Xk0W zI??QvStnXnt%pk~Q+uNfmGWy$YFbg0_AVnAy0)p0Q#8Zu{W@G^x5#D55Z&{fw$dKR%ljV{zl0z1$rhx-GLtT%x$o1E$4Z%)fJ)Zd77l%{K$tbOyyz$zS z*+o|OlN9gAU0wEJ*2vun4YVleRLVdlGv&SN#L#HP85`=vU8=AbV-4fZd89U#w^YXV zJ*=C3JbQbVZWb%3K?r`S6s$(uYcc&gKIEAT9=jrx{oSD0&Az=X$kAJu%!1N__=$FT z*RWyP@M)>tsQGn<0(`vH!h!LQY3U~x;h*~ETU-*JPiN>{SN3_%`yAjuC+4E}HldFP zXu2n5Dik)PJg-Jy;6d0i64sJ%X|1)|l9sDQ9^4LKMugN&VMuYp!kG6394w_jo3-hN zPk!Fuqwu%jQ;oEs;YxSfRlJEi%PX-%RO&>p<0!N+^TF<>ZgHBq|0dE%*(;#J5o3XU zl5+Kl>jq1KPWnD#uFtl^w{|Yhaj%h(?|S=gI?j@&Bc5cn`@-yZ^C`<$Th(A*%zL*a zoO3Xzvi06h5KF}>=x;8j<@WQKFH5X~FJ$}O%OyrHDtwoRA6xT_eR z0yS^G-Gw@Pb@nMOYpUc@-4mfYzwm2eW4mv19C3{a8y8oGBfFECThbgd|D1%zZmm7k*b*)NAlCVUl#g;?YTDawlu$M-NwsRo*LL`ZO z#)iBeum>8e`xIJqbM|q^#MDk=tAIHf;J0y>8RQuhA<&RQX zkE_ORj&ELR0YXtk-vOY421%(g#g4Kz480en8>by8IccE}w4M>92}*k~sy`kQzjaMU zx7Q?}A0h98(DO-olCLp3OrHlYQlV%cnppR3HKNb5Z50%3uD={_vRY}*EF~5igYxD9E=qk* z%;fqgN*$Z(_l9Tg_!KHcHn>kl>$>LH&F@g5n=Yr*R351))K8nkzuQY6jZ+z})7mJG z4;Anw>D1fH_bg^Ly7=0n!WFBq#P&tu^!tYCu~|Mm)iQiljP~Y&ocGgOEJC@AhpH{u zqCsP&BF7+m9D8td*40!|wW6l}Wb19pBErUJ@Q^o+l1;XG6Jq^p;_#X3jGiW#(rYkP zJGXs>*c5a9b6c$mCMc76*JM`^PYSoQW)o*>E0+IpNI~(&au588lt)8RNj9rNH@2C;MGp0Cx>i9c6b{mTWQvM;^({|J}r+-AJaR7OS zdv`|0p=RC-n*JU^tS3V0{#!4Qciw0loxZrm9T(?u+f)=s8bQZ@IGC-Um)l!IW_H(bMY0||fZ1wbd06I=&*=9rPu zml=5+nZ|3~ib-kaZpyx1MiQf3j;l}R0ekN?HflQx6Rs2Y3}XVkfX{z<0fbavN1|6) ztwxe-S)=u5@(P$6CLYg~SIavoHm%pyP8SeLDyNDv9{b=8xmhmYbCzHC&&~7AxHWH; zC{wPwTN%;HR;oGdF$ua02NF~>;(tlsuK!*3lNs{AVT`}C>3C8#QdFAIxsBV(b;a*0 zxN8!)dKb`v95r7$nsQnf(^7GI(lmeV)Eg%knYCz|BZ_QFGY|>YBK@nP3JOF($^}WO z*PD7)LrYz6L>{&u+lIVwb>LC8m;9!d1Zzf9n7BspWjYJx1qOU$b($iC_Q$3BnH(VxB zrl_jAHdO2vUQq)n@CRF_Z?MePJG^i%@A))fzzXNKE0=LtD!($ zCwxV8DZ@8dIF$K%h@|t^U4Pw?cTag$oSSJ^BnTQyHJAdISKT2n1`iT$9I0oCT;% zdyQ;uRrYI`%5(iql@TOa{k=iQ+Lqfow7O(B0mi{a6-n!OW`?)%kOA@FSeQd$&X%~< zKE$x%wUgID)|KsUu@axEiPBv0!o;DzZOI+!i$C!TAjbUFfy74Jl1bJI<1(I%7Fvk& zpK})?BcVs)WPEXQfb<)C52rL8`U-mTHfve%QYIAF z3SB$>4B{xn^Hp{89}>>?WVUE^CDoE4Eq#hi(> z#7kfvx0o%`+2Y43*FM6zVg`8g`QB+jF8klP;ZkC_DWMQIg4KhW|8X-LB(~oyQayOl zq{KpYEo%ME6n=3w{WALuRv<-|Pu`GNdAW@_?EP|uJ`hIB>VBRc;I=1DcZ$&j)tfx= ztBY`wcg_LVZYCzU+baS z(aTTi*EZ5ZpOJ>HswfbQYR-xS{^a|J_hkR>75H9$lHwkm)OWAxuG_)82);|vImloe zC^3!O@8U2c->qvz4A3 z;uDUpA1(T{l8!;n4S_ZHWp*mamjIxlW`A~CcVes9^f(9AQRo#HTU+qA=vJn~oUY|y z7qI?R-r{_-PfPq_MwcT0@CUgh_CScwXyWHL)DU@evnUGJ%OqD^X}$b~th3rNw@U%T zGXof&Jis|zuBs68ntrhNc7WgH9#&u9Ua^>KSPC?9clkUd;3$;6{~;hj_8>`^s%!b= zmb_b|86J_MrmJYbupI0$X3X)izo1O)ipmg9qglDRScozN=?&IOxn4)yc?Iq_m&A*r zLlmOBvh=-{8T;h5e`7NE`q!ldHR2W$UUB&~^+=u^npw%?hUc_Ol-Nyi8I%to^) z6%TZ}fr_cwhIaN64)qaosZn|^+-Z4ncez2 zki@FsQp)}D{!*6LmxS-UVrwM?UF(B$PbN?*m4mk$R&O@6+HXEBEOT72z$YUmF(@%{rRl6ohx~st^lHG2Y&Txar zaLZ7!Y|dVaqhP#4msl!_YiG;aMAyQMfAPgYtjg+GJv%|l!rhGyj$jaEeMxb5Y-8|X zPCwsY^;<^N4-|r;c8&4gtyL%jM^1g`1|jrXIheNudJZ)&)QRI>0%MALifhKRYD|75*Ccp7_)) z7Ou1x`YBJj#{0x1Q~x}S_qL0C{ucD14At&koI8+%I3mOSm|D+Fj#!5;&>H6yF%y$*!VTos^=v}G?^JkFm*Im3`uPCWZc~=FR=!azSPY!OZ zrREGVT?5~5abLnsm>@ryz>Y#VXWR!gCmgm&;Sz|?^=#bOd78<}itL9nrEm;%;`Lt}fzK9ttYuCvaUkia&p?}M+ zX=>g74dQ3rCUPz|pw8!2k}fmSEB1Htt!c1L&F&;+MZ7E4DD_vf%{pE+^UFjnsP}cr zF}ae{6*TE@99+CTS>-jd?d~@N@}MtR&(7St<&eP@-!RoZII_&HL+zt~WuLA<@0RJ* zxK!o=X4PkUv<@ldr?aRmg;4w%q$YYc`%@MMUUC7w?^C>JtF;P9j;9K&Jl#(~d!Q}b zc5mE6Uk3BJCEoNF0Hx-)UaORs%Yx~pJg_8%Nu(q6{{aR7~2RKQJTF z^E*v8H9=FsjRf@U@ffp0S*c@#*E?`jjh{FZz|Yqmr0mWbYC$ zz1#p%eyw`Yp$w*g^}d8wD)dZNNX0dr8S4wzc6g8S1aZwDBaFHt{@H&1FQh==KprYY zofVn%wWl=za50+sb$+m~I>Cf$yb)@d)72);HleGO*}p)&a)6e&@8M({{BAV^{VXE- zRXkQ-drBJ>)8m}K*l(nO=zTtPL_pt`&1;1Pv>f+TLMl5IfB9xP#GHaaGzL(90!isg89g^W9v~nkf>ne}CEN)x?rQnr+N8uH~| zIl|>GRY*cye0qfx{gW1yQ5id@S+5^H86swnX}HYYebub+1!$-*5FH@44Y7-*o=kpt ztaB~EYxhnF>T_|9=EcmNAhY8q-<>G~O*C`QV^a3{cL$$nHdYqIw|M90g@MM2y# z8~GL4s5t?hfU~erEN^Vp0qB4z^l-Nx!zqL z8*MZM7!o5?K8I6~a0N%Wd?F_fgx*cX7(u&aI&W#(?Ap&889Cjhd=?XTGO~dx`h*vr z|Gn-%x3&1&IPv%M6nuDf$O#TDu1SG_`rI#yftGshhbMe^+DkD*AB}7;!Y@U2q$-E; z$QS838;bQNXfz(o47o1=?HI~VHEwCO`1QP|&fwtjFkLWuXGV{A%@xOq$C{1P*!t(~ zP3orAnG~m46vi~gSZfc+gd7boZz7?fJ&U5_xUW!&zU+%>Q7ax7+YLdCW!{e=d&f^9 zF>R>+@~H=YAJ6u0jfQ!|)L*JOI=_3D(WUXMy=KH6^el7fmV5x}iC6Oe8dDi~;*I(b zh}*-(S#|i$L&7Hv-xIwiU%xx$$8!VZ*m&j-TZoBSy?$m`zLR=Mg!^0^Xy3N`4Kw|8 z!^(U*nu>7t37+AB2^C9|nW>=AqI01WfYk}B#9a$;36LI$}!#pTCnW_O-= z$^YpjX%N`dOopa^!c17eodG31I=J)1s!`foG(y__G%xnIUZpN3`fw6zI?$ zc&Jlmzi1{0e|!KJyNrCGqS;{VpU~y*;tsVw8ymNiDLYVt%gbtq)A(+fCPCfb%G3>} z@a1;GreuF&M6@b|@iUxyAro>#UjC0GR_f>3U-;+^^aY<7TRDj>g5=w}jY^ zfNKhGZQVl&!=4}^j`-;zAvmVT2r^qaz;;m1Q`|-))p>?z=}@`Rw0>p_AZ}s`q zc5K~ewbUW7tX?>rZ>)m(cofx-PR>T`0T!cJEaX+<^s|E~a8F)!3UHjOqo2U=z>du5 zz(RViDi0X6TA~BFD_vD2B*@N5M{epQ;GxOLOM7gKe6GJo%?-7GF6D zRD57nny^KY{GbN2;=?aj$O~ z$`RGwuWWvV&?U!zTSVQ3?B|xnY*Wy*Jy#Cv!pW>I+3Uwg23JU8{(SGdiNvccHSPxutnj%+1K1^IYG*=Nqz)+W0(Iu^3}&bCz_gbXP5Y z&qB8PD^z*J;tm~M83W{y_soF_RgQL3b(XCcWLq0Lp&Ap9TA0cx43wK8#F~ZsAF&ol zcAkh(e>+Tpmc<~g^G02GkJsxh-4kc-9T}DL$BgKBaBaF}>j=zknbqsb8jEG$F*Rl= z+?eOGzll+l#4YsbjL@o8Z0rjm2Qig6mzgK3jiOu2UR_alf=gbO?a1`d={RY&qTPl_T)T6y(lu7SqMTj=j!kLE73M1xvIXAAZkVZVW?^zbed_P_ChDP!Ak`zDmCQ16k!BiGGjC&bG_tlx`n+@_C zn8$KEewwhQ4tb`yz*zr~?vifo{PVKi%5LnB5rQc2jKQ&8$KpGp1cxXz0sDuDa@rE~ zH~sr!Gqr?hLxh>#2{F~&?#-1@305_P2hRI{WR)x}sYtGpWQD+ZasI2rSdR~#hd?|i zH--g5y$3ihR}Zj@J>Cy?6^IB_+M!V6E0fXdz?btV<<4*Ib1&mROk4XgdhJvd zIcSxxV&Mqbvmp15m!AJMXlJ!tb@<9lefx##FRz$?1#x#-E(mB8ZuoyD)Aio0sLR{E zR+3MCqh4gzd9i^5`8|MGUJ^l5x-QpZaoY_p6X$-bkUmORufKar&|NWV`Cgb~;g&g< zJqec=e{jH zr`_UNNkXG?8t+5gwBY;k-q6cADz%o`ugOn;4+mmBO`icm{y9FLp(Kj((NZoS5CM|~XH#+fyi1M}70 zp{KHrSGeGYn;q*evbhx)Wr> zbX$%59c*`Q9rl7_9HA?v7zx7FEClG9Pl$?+Ry2Crzq*Tf2ubqHqbwiUWg^#wXutyE zk5mSkvI>WzGv1z&trY8=TF%Lrn3S#)**>V&&y&+QV0!FRTx`4((Ts4BYvY>*@H`sB`Rww?x@XLf(LoRrnD8cE+gW zqp)>-FI$8b0ix{po*;6jhV_7 zC(D+7QO(&(X$YB{1mI`OTuyxx77xB z(BtFe-DfM_h4FhAqf}hGeiH)9YCdV>)o}Di#V=pAUDPE7<`=zG-O(doh+q<4Q6Isl zx6qW9$hGLE%a>jF1$ES*-B5-&bCwOvuN4PfmLCpb_lB_acg<$JZi#npRw2pWvbT1o z)Tabjmk0?ab~)&1MHY70N=EU=Z!EFNz>xt9C$4Rn*xo2@x^Zj3mYJ6G4aWn8!f+&y z-;$U6N5R=rrf z8~XgDeMaJ|DzIPfP}eqKSRyBkf!Q@9J7jZb7Hw#0d!ch~85F^98Pd%39;bx-M=U+z zz3Lm%A*7Rmaje{zEI+@DXJ6SvPwOBjj!t}w0vRz52l3qJIvZnAx~Bm^9*?7-T!x%c zH3>B?;06cWT@)d_FO;&c{oDFsgIbnBvAJHUek|h25{$IL$kkZiPp*AI+*`b)>45l1 zRaZ_--M47f?V#@P#;g4YhaVTb_dmb~>@neiTmwgvxTD~~otI1OXpzw(0uS4HARa?4 zZ$=Gsq)yBE;+10ic!JPBM-_F6)<+9{znSxk?yVPfII z+!e>a?q>#3oiF z!!#9Knr^5qFj}4G`615Iw;g5=DO!6x6bC{ri*F7{a5b&I5+|I+NVt&Hx< zJy0(Oa*3FZ%Ie~1;=DWO?jT#Lce%LeAt>&&=_gwH4{U9tr-y>2w-+U2<>YK*2=XR+ z%#Y&<+{2#H#HVIF4c&qK{-1~H(Y@4urdJ5%?ir5>Ll;Zeg052^7|owh{#hE3az6D< zCh;p&j>CLFaCD5L+G>?L(a+6BDW#@&w7%-T1<U54g%=+^L}VPv?cBe-YNgl{y6 z_xgX3qmPM06!lgNRkgQ)dLt0b47kCXa&OmUV4hC2yG32%xWA)C*5bYQNJb5g%Z;-8f(v`D15f^^}|J7pD9> zI>q58jbmYWfyzS!k5|i4TLHn>`WT?m4<+r5wsQ$3{Kij&7?VqS7TuJ2M0nZLU`<%H zk>_BCRd&=Vt9Ibz>vFg=tL@g^?vor9(&7 zNfnM7WPhZKTRy1r_yjV2g9ba%t4?uclnlrC!tK2abb@{koghYq%?Oz+AB$ zeKt(rN#1aSE>3PzsQmTLYs+4*lQ8DufY>&KdgbXod^K#8U%OL_Hh`&jd0HLjKFB$v zH3WT+;XNW9bW3n=HH7sIF?|{Z+yZW-#7%3Xg%Y~LT3So_j+SP4UdhVYsb?r+3l|vg=agZ0~rfXL-&5t=B`^5ZJtP^gL-HwIV+UTnUzy zkdhpMkd3WNH5Rs|s_Ev|qxZay9(-9sg)dqy({@G4#OavxtX6T?lz#}CUhl;Yk7<=j z7Rph31z``Ne2+A&uVGagDEQQnzZ>&$JJC-) zL^so8Qv~mk#J0?Sj}kko`G-DFHfoSO{X8NtV?L=14DA->!lf4<26FAT7;dA0!4I5y zdN=QwF4D)X5U5fQo;BKBgii@{r&SY`EC8wvfUl7BcvJ~bYKGjnkFxEvc0*m-i$*!E zNh&P^v4tm1V>V2VMWW1p`1J{$Vk~`GyN68+OK2^a_MXh6ECN3kU7^!Co0d4HURS<1 z6I#Ak2TTPdZ{)t{!cHe=_l)dB1~J+#)h}l>4)?uV8;ZW^7nCFno;$~z zOb*sNNnIrN(m_s}tI(ukMsV@Sj76zIyF7r;>b?7?QvB0uPJzEGu>WMwJtJC$i@Km{ za~Kg!V2&W5e?Daj4X(-j_~8#;F0=bl2@S1ozhVxAwbC}&yY7Pd zgTk=M+(I2YXYQ?sE3f+oY#I^hK*;S>Y`p)^>_keDe&GJiZ~YR|(}87PCmu`IAwYot zPJ;z&XlVV&AtF0+fzcdAaG}e6wI$ZD6bA%!AEFa=v8fV`nP34m_81nmMrXZku@2Hd zMzxOeJ`hpceTPw!V%_qa?eBazHOd27{=~8+cJ0{mVnyy)habC16G1~oU_(2f{U(*; zz9inyGI4ym^@;^&){CJgAi=@ekL~bfOq1veLPMoPd|5{;@Na{R!Yr`?VEx`+c$`x0 z_N@QnHmF2${!o%b3ykn@%&^2os6^>bnU-oG(W5*{kX4WZA}WPyJ*4`> zNg(h;T)EnzRg!Tq2$yp*mKehjd9vYKaWv*iCH?BtazZc|Aop0XTVm#wa$HEX#htDn zwJ!Fbz1>4~q03dgP|B^NR61wl46Rxw-S>wl(8)RFwbDM@eAQ%`XL$M$ftthqmBJlX zPJ=>jT;ezFTj#8^?fQj3MXj=gxhK}vd~G@@`#uDFb7MNvg5|Vgwf$qKR`)($p zt|sQROU<;`Unu~Icpp1Pyt)3@$IkisMJ65PWQ$doZ#4EN5h)UzdSWX}t_-vq(29y6ssr`0|sF8hji~eCSe5;Eev~Z}lt{13ZI0<6aTF(9v0YHChvh+XzLMAXawnx$8@yJC=*)n+h`ER0qo~n9lf0 zzJ=YvsRO|MU=sxRyubX1oJH#Q=LLc8YKJ1DcGt{Un)yAHCWl^;L9%3mNgp~CbIn~E z0=qDzVzp86H)AQ-pEUgtdV7y=o>n$1y<@bi2ba#%%wQosZXzaqE=+XcaLS^mABoQ$`-{h;&RKcyid>T773Z}Asj(VNvi=G zmw;&yBd|M0FJBl+TmAJQUMSQPU#@N(Dt2Pxue-Cj={oVPeeH&Z$tpz+V9(qHRvFEt zj1C)~=mr}62OJcg_yPjN$Q6BoWaEeno4iAX@j5tYh+^vdZl6c0OyP0J@qGv<29$I0 z!)bUPiYS!jX9{blIn)h=!F@<(6_bgDCBakowI~OjgoYxpq96wcuqf^!$Gun z0ya}dNW4~L@d2CnzC%;D+_`$tzI$8orJw^;9{87ylg>c{Q0%poI~t`Z5b(x$Yqsg)P5+=^__I=!$;Pwun~r;Z z?@Lkd&*5^+M=A0k+)JpTdZT*9Q5BvwekYc5^9MhnOK4$wH2X;&BzToqrIm^I&&3^D zcVck)O|g@Unsvu(^cx=c&u6nTOI^TceNLjT2e)c;bER|Bb2f6mx#@qgc-)n~@nE<6 zGL!>^x2Z9ZRU*|*Eb{)=Zc<~gmPM=lA9(UNt`JxKZy@3u8;_JvMMu5+ps910xHDRX z7}uq|=1#?>a6yYPGzIJ27!uc^hrdO|y8_}cXNa+&S@P#L82ZW!4$pwLRm0pvi9^1H zDwFX`$nPEGXI9GlezDAVdHo&&ToCrrSo(RHxZdqI)!J`AhO{Akn#H#y4{25y(1mCn z9Xove2nCFfS2L?y==NqswDm4eg0(U81`i=@Tle;a3!C4px&xFdBFNI879YxDaf9J?67%m@=!34gH_Wfp zXOJKm{ni`xu(_3|I*HRttV$JrsIpSXIjA2g&vA4YFrJlW3P{Q$-_~>?WU}P3uFRR= z{YpA{LNW`ix0A*{95=LZ@8gfhkxO16`F1+x$y?gvd#SKC zDPvB{bFGm%SgfXUxHP}U!~eKc)u7f}$q^ga3RYDa{Z454sl>W3PBnJbpS#y$(xrbR z;&hEY@YC-n2)zC7C*UG5`$d~Nq}*9G<2qi}aai1Y6>|^^jZHk>s6()vtXH`3XC_|G zX%aWRb1OlduulIdAeCjCh${cw$X@|B$!d^blFUTs3#)E~H88p92g~HrNM3%VrOcFl zG?5&aPm}0QyZK4|&>;6{gAE&>r#>wjckYm3@Fn-?e?M4x2+#g^!8oWfyFd>|XUcUc6oSH4o*H0^mNx4_| zQEG*0*lBOt#7x+4D5zNf^vXTL5}|`;E7TNs+WM^_50APKg_jT8{j-Pql?`q+A|I2g z*qNk281Tyf-?bEf%w0BY!fHyg0CHp=9NRz#OPZR2v#Gg19`c)b^ z+RgvCZ3(@du;y$d!8u!luF&BTk6EQHl}80<4rpbpKULlS;Z=7gpMArO1sqcr zJaUI+F2?0rsFx?su*vf5eW{09JQxJdw zJ5q`&0)HPaMeME$9WVjtlF#4G$-gD6DJr%c`rJ*aI$HP~!diVDUupsq9_#lWE#f05Xc%%c@+7%G{12l3T`O@SXt#O#(j8-}I7^T)zk_3`J7VW+%5;Gc_%8ZZTn)_?fbO~7J zB{hU$C1I7?k*nN1JNnqKg3r0G4<$7gPOqnzX?|dlGo7=StC_2rQR~r>mdhokT(gG!W#JF-n*%(l1?`@@MgC?3(U>cCLG{s^x6EG*w0= zdAY5YGAMe<(H<)EKUDQjGlbA3MPvN&aI^64=pQ)6#3@=Io)^2&7h>$d@UthUG z%+7wHyksxN#{tpv`PY1sB~NmAxp7*)+jBw})0AYX9BcsZ?}aR+IAk^&7NM@F<16is zfHPhR=ihunhek;~z2&?&sb38v$^1=`_|@L5A+O?*qP6}9n-O&cr?7VYGed@gA!6-& z(oQuY%Ci91EE9T0^+LV#!qH~TP?6B&c}ruWxf2`?)?p|ZHaqyZifkjLWS||wQyM)^ zmXQ%Gk(oFMT%<3N+3OBfbgsr%%8H7999%dkKXS1hpZ&b-RP))MTkMRU(adBv-?>J; zcgP8(9~RCbD`}S6WL7F(Ck*i|?4lXUNIz&jCCK)kX9~~~n~-gZ4MSRkG>fAtm>H-k zzB3Oh|05T(k!=mWvQE7#{k*eX9c&zb$v7*nZusUz#%dkDDf|yS@|!*JscF7gd`W>E zhV9Kr3HcMb_d#Z6h9r&jK8>7C*I#Ev%Bdijur)$Y^CxVmESHYD8X^lar7}5wmZ1eB zKLkg|Yh=@iz3h*Dp&sfO+r6q#1;w8&^P8~OBr2^u#qP;Zg7O-FHf^0EApu>M%LIir z9VwV%!{dM~MqOU>iP`wB^%6qO9hhufs&_D4$Cs@L4f7tl&JGv1$xA18(CgF|}8+h45e(y1_RSE1VgD&qe9eZs}11jgv5gU+5By88WO z;+=O*$JQnly1F&~;~L7n4wR;pH_sj#m0!>F`U0z~EyAo2y8ShIGmYozGjRtMf+e|% zH%Ys?P2E-=+~$A18HPWBbl=p2l-T!bUOuL})yC|34$D!vH=J{Mw0YUU9|At!Oew!BEQM_q=m^*SX zWA*#X{M{K?uj-8C2Q7zZ9j$F!-NBdjGDl50-`9S0+b~T|!y;t%ZfP+3t#t=+%YQ#e z`mw-|- zOFc(km@co=sdrdH))mee(rDMMH66YgZhmW<{$aIc^GnT=>#>D|K-G*BvNvV4;Bg+8@L=XTTn za>v=|8F3Ny-j}EQX%ipYQCuaLWM2)h-j}Q)w!iP1fL;jxP#~xKug+A2brE<(&^l)( zW@+1nr8M3-PN^+DUO?Ysvkp^Kw(;Q@8Hv!4#?foHb%^)xH#@Uz7j=)8^L}u<9r||( z6~VxbWy`;YEjZhrtbdR)5-F)OgY8kS4w1Sixb)Nfa4G!7<-Zo|CxN#=9ln8=YlFaq zFR_kumk<>E5z1Y@VS@6dSJj2prB#oM)QR>BHGDA^G3?tDPZ`(wOD<~;)l_itE=jJv+7Kx*_@jLjgU%pRCQ1I z>QbYaTYf)?GagG5Uv6QnG|l|`#QDb?6I`p5zH>N5wPYt*o;Y*NFkKEI0cJtvg*AjFp;dnqs^3w(7e-VS z^FRC&ej%cWYK5+DVkyk!9w8RV@vt>P$x-$DX;epy(Q~l~y$QkDICHQd)JF(tu)jhO zA4{hf?A#fSU&dYmr_CvIzn9JGUVN`O8*E4B75YE8uZ99q@D4ZO(`zyZPV~OM{BZei zP)W!p`Ma}Zs^38QG0q)?tNl9a^DIaZ+I=Nw!qzRWM8`n zdhZYBo^j`O^sC&bZ5=g{D5S2D-w1R+3I{GE(6H{2ZQ*LcKfVBoAk?0jZR%_g!>Mwz zuj_o*O4609&+0uJMUDMm?Ru{$qZ<;=eZ7-*K`Z_%He6WJfa8$RhF5=7&Oh%n9rk5w`ODM?vif%|#h1{w{@aa~0`711Z_CAE;_JkvP-jX=bx@z zRSW%JS+?K>NM?%y?$O(Cr!T^tb0mO}YZR3G?Tn{B;c*0U5RDpE^Ke5-`PdV-blm)+ zJxc!cmx#X8GyKJbgMl7*{7z-$%Z-q0n0w}&M_9HQOur7JhjeYi&9_ah_eDl`xK?O; zW(gp|Le+cLs|O-cS@gtzi>#l^S*ll?j5dtQa%yEBql3q85c?-8a6n2{$G?92o!R<5 zk7W6f^d0xX#+|152jhaRMv*xa1H%BX@Tt;L@j!shCPA$lB6UIF<5tca_yqG@Q4DLNfpVz z9^ub?M9V}xQ{-mG6Y4JU3?-gemeBOJeG;N>3Y-M=s&PrHdP}006+a zrAlp&gz2A)tkDdFKFw`({SFbKyiiE~PkEu$C8~-wnLRaD)flf3mN^%zG1-Vri`j;< z2wja%Y`s_Xy;vtZf~<5E5+Q8H-SFsr;Q`zf73bjxGR|kQhq=h(3RK;%{i)0$e{(eb}%Q#1WsL+za9oi(1T~Q9Hkv ztnE_VJ=<{hdbpzUgK@Ne%h>;v!d>bR`9evs?=$5q{VCbP!92y7)rKr(KY6qHQ9j(4 z9=vo11OPr9h&?Lh4Q2(h?zHnZ9p)vUnMGJQCf)-CXl#!HFj1$44lNaGdl&xaq<_ZQ zyQjoPV}kqPfk3fY!5%2&lz0N3>xY&_PbZug_*7DwWC-_$U!ufUcp5ISHoR@k5)W4u z)Y45e^jCh|$vNTNW$*E9Oxo)65!Hoze?X(*t?R4?$MTEkE#|SQJI9}qtph2moB0Zz zoj(mex4p0OX7|a*H+^RjZAo6HT`}Sc1jc1P@NZb?ppl0+ePXLl;=!q}{P<>UH3oXn zpYiYV!it zi5maFJpO2xLgem!uk$~=!Q~D+&5GK{vAgy9G#q_yD5dXIdt0d|YW+^=Y1UN%#$+y6 zR?wLmW{Z6NMog%m9z!ooa0RSxTLxk zR(Y~0qkf#X>%gtpHtNnxZr*z7-&tQ^RnQYs-8) z6$Ml9NQskqOIXk&5U3iHV(I+{6 zI*e;M8X5jQ1h~mFgz(%AWlV?DWH-hn0}P8VDh@jxUJT?jlld0ftldG(x@EaXaJ8uL zd(=~IWq-Rxdv+sd=RRoYYClMJ@BZ7S%!iia;2SN*H%^pab^5(TentzUO+UyeMxHa{ z$o|<_UAP);NEFv;_{@1a z_F7aB-WzmOPtSOO!OUH(2`7sfI2MM(PtWTup1l`U#I;jTgXOQzFEx~njF4(cRRmiP zs55c>1xNd{=62q=06OjL3~ObaD9x0cM7muS_S9kL)cSaXXjmMVP!mXWHxrj!rFwOj z`~oQY?DejC&+K`Ei7|(*$*{T@5C5uc7-EtqeKjwOK zKwRiOsw97=^42a&w~{5}%{JKzjXx%L zg5C>>`%O;zw{EhDVv8R5mHQJE{KJaMHO}!0gmzJjy5Q8$E2gr@W;@s_e*G7%f9C$5 zot_}s)$1Cswe}sDhV%7WOY4hB>|%j zC8MRILwpp5rChfd<1h4`A{#T1HIZ8#4vn;+I5JTN>4i0CYcwit2giW(!!j`{^DX5~ zSCo#<5>qHxn0}|q{xJ>w+npr%_>m|PIw@<9kNHk@!w#bmou=_YpGmGNkeL&^DB~Y~>mdG7{d*AD)e*!j;@^y+ohAWq2Rg%l-gmWMOD` z$~ot`BL%3A-u&Be`|pqV!c`4s0xYzW{_0HWisnj=#+AnRSO4C5gdo_q96l^UDG)Mc z_TvIVZU)wQ=OXnK-%oD+d33l1wnQ#p#9{ zLS@*O3F;f~7|&2o1k7KjVoI}rui4bUaKXA3A*$M7+MsZZ7+Qfs^{ZWXa>~zgs8-p6 z0zu{B)7Vk<6QJ9gKiUIrj_xOrL!mlI?}Q!Zo_VK>Jro^o33&eg#IFfb6(71D*2Tr$ zK`39#oCu7aJ6Qa2`U2M^g7j&gGgLviDOnT(0!u9TO4*tf{0(q>3x7BeyC!!vtj$!H z6;6!ggy)9m6z9mRG14$vG`k@-5&9Em6OVCX=+Q9K)dmC8OvRdafjRa4(?WH4vIY|ZeopM`vJLYE)G z;fZzO2=V6`J@a4xIw~n(Z>#aC z^vKmsy5?dBt#^V9cy^Y7U{ikgPX47GdoyV}0@8Kq)O>BFJ1KHsDoQVP`+rBQ|JMx~ zBCgfHL8G4WeHxJQW{0W=+ONAWA!#~HmKQ+o&0*Vw4I)2DS$a)9(vKIP7^Y+k1mlR( zKDP9??Z6L2dnnR(#v+lP2@C3T{i}O9^F9C|2QJ6};WoKnB$F!ukjWNJ+*pOnVw-GT_6WCyA`LI zy#+-s3;oz(i)#iXH@(Z#4yrg8UzAmIvUAs&1ChQ()f?>l+jdVf9V{w1e$>z1+hyrI za4V9w39Op~r#-h@xV7WUvX-ZVBv>jHo>9%N?bqO0oou?x^?i$QK02MebIyC#-pJZ` zx3?x6Cz6Xt4X}e#eS_cJC$F(@u?;g>w8UYbWFqYE#;)S#wtSy;d22IbPfT^dlNNX8 z_cOlbY^72hVMgztA4iVttXUqxnDwdA5lCj+?6vuxdB#>snVuHgpS9wz z^ybU+Ks z@q+rKG~hrjrZ36}55nEwH6SclAM;)Q)dXpGi1?VMaC0X|M#EbN5yb`~EGtW?UZW}D zOsSq;+)H&3hcbeejDGN1-=*{m)Co-WtQ*RSb9BeE2F>Iv3lukHCN%w6!sJTls4S`P zfL{-NwK*(Ap36E0tTZQZ&~$pgy&KT+mH!LYRVTc>t_!dk05;HtALy;zNT3Ng&=}!AO0*?6x7<=rQHIK@2u0dQAiFkr~X(w|Jee@4x_? zbSk;Pw3PAox4Dznl;{?_x4al(`2gs}dU`0VIF-clU zFeA5Do4qW&Q7Mu}_V5PJh6#u5#B}%$%>d>T1K4OTzh=?+4OvqEoG#U&FSWXhArz~_ zg_?tXJXCF0iJS#I7d5 z6~n5HN+-cokzkM{7ObP*G33{FTQt1tcFuf8l}yV3hsW{CAL##Jr2jou|M64YZ3lZI zK*n*WVn$sd#T9lbTtQ&Q16f6!&9j!zbH6$07M|0`4?^<~iq?3vQ?fJOrR4@RW)7*} zp>pnxTy-D2DjzfBt`(69kXMzf+9ECyyD#rG1DcyQK28+1h}^g>X%Io!v#FSHOHkW+ zdGBD*k)E(<&=T+exh?dHM8QKW^wc0~g`iI6$Qheq4aX=YjPqR!)SJ-tkFv?TrB)I2 zI94Uf5=ndI%s0>~Y@<^`_kx9lRRACsr54QdEi$2N5rJOOagnpLOA=-}@cQsNl~P{s zc{-4wf$I+;PMA_dbZCQMWlS3#BWw*wm{fX$(D+h`mw`SicfUw z^j39);cz0k;gj`~Ho?}9*BjTFudTi)D2Vv}lD+ds6uS`W%VA}qp9+K3xBDpzcgU-{ z*rEcyC`G7TqI^hp!PtiWiBb6YJInptB>!bmUB$gp)9GQLzM&6fIZ1K)q-8lRVABUX zwEOM^tN7xV3q-_MZ!-%9hFDuA?XtM~f6i~h3vM!zIfK;&=J47jK^2Jy>XR+B&ft2^ z%T=xx8KWP0#na@<`|m*X$BlCPRm`z$k~`{Oxq9qBJ~xy3#5PhgT)yZj#ny=i+K<;} zzNy%K8C;t~)1vcIVI~M@9r$3<^7=k1&e~?#a00|5pCf78OCt0Z>?Bar8;tjo5tp6h z+RUU|`)X9RVUaLcONM&Lx(O;p1>JlIVNo4fwf4JV?ebvx$iyX+;h>WixHOEMoC~S; z*YRuFb>V1tsb{^6oSeRMU~f+k-^ho{X1u9s0!?UP1jcdFLR$h}1zhpmrs%YtB`?$H z@;-67j~HtCbDoj)n)^FA^qb`q+wIrmez1MrgfFC4ngS7dDTrI={3Gj`2mbOr+b}PI zn8PvR1!Mf6IK6+sb(@pCvp5_2#Elm2d=xg30p*rw zaqX>7Y@?OTi3p(9eFAk7;1v|T%+?{=s64Y5LB_1`1bsEF&i{mJ*#eRM&S~l^usTh2 zboJ5I+r9?O_~aIXrJ;Oq@a#BiE#Y_-UbOxoK4lP*l5ZyY`M=*!{kMPQw06lldafP(qwI_8h z*=c9H=V_>Y;7N?e<0Fl)O-MzEu5IjSk_2)b4^IpB>yeG|y(tG5Nvc4|m-kC{710*w zdUmReFF&a!;a)*`diU*;FPze`!xA1;29@{h0_S&aC(aD=G-upQ3fZLhnHI%do~)Nj zdYI5yD>e(JB#@8T9ea13g;pd2STnFm9Z*}%L410JH;i4l+rBvy{!mLcFI?`9YOhpN zZl}o5v3X`#Zva;<840(<&{Mw5Sl7zSdxs}EVTBHlem;jgA*6G``_fVy5V4W>ct-VN zyIkTEm-Ahppqhl>&EDE`v*;U%zdW~;LGa~U&@Ho881VR=BTj86Dwqk_Z z2hfRsJBgnc(d*LXfNdM@q9(!R54uE({atG(cx?6|X@*&@kmsr<$AP9r|lS}mgs7LD2PJry|jFJGvR5oYvd;IJ6_IHOug9SzO5V6ng=@)P7mhclCC#xIJnC3FiOC7OJcqBOK3V9ghxyO$K!%$O%DVHADp>g z!A9*B8C9J#25@#>26m?vP;XaK{+b5=aWj6sIe(r|$n4b|8b7!_@y$4hAHv5Fz!X6`i`NjxY`X^G+~IT}$rzUp@TD<~!L?fZiZ-#1%2OBY~xjVYS7=873gXYLqZM5d(fB7`a8 z{UmogRlJG}4;4qLZq^K8Z#KnSvmMV|TeKLQhuvL2aI@F6|MuqRGoib;MY?@|)r%FgZ9;T6Uz&Xa~vE`Td69m78DCC}C_wjo9GQ@;u(zx>9?2mi=h+2&nu{OqTgyne%>JY5QnE1vc3%*D5&K_)q2`KHcVVOb)E zI*wV}ETy|;8dX`-NPVyz_j@Bl@Eot5yBW!x?Pu#~!&at(toW<}qRV{etVe#A$epEg z5-wKaNtd`g6udt93TMZ>P@A$eA61r@mV?4kVP!gFKHf#v^EvkPdDCI(>@#n56_61r zX^9i=vzP|OqOtY4E$J!5y(iNz7Hh}qo}e4R&E;yg5oQdf2zRs&%w2d+!oX}UZR8e< zWDc}RKKpIOO!P7LTR~S@Ku0!3?A9LEJvO8-AsWFP+O6lbFZ3<-c^o-IFn5(}cio#t zMYhzIlHNmZ48^=xQ;;JRkP6?+x^smLF`aE40h3TIBuU1st4Mk0qze&nQcRk@13QQP z3HZNc?J@f^6f;IIX-rWq#v$IQj>d2b4se{&H)T+o=3hEomP*`qu<*~DoEo`W@8j&E z+ceivm26$ETY`A?BFeBvWp6|$S{tc|s+3GuFdes%<1QZ7QyX8iuY>I~z;|L9N-MDU ztj%7*b{n0min`AgtZhVYeF8q-y+zMIi_!RS=ODaE4*_@aL>;6Kmu5|?&s}zqNCUow zP1hhsHN8iY?lMZq#6HgWgfBUx|F#A}+p(L>$i!nOJoo_dlN@pm_FRBN(E4U*r*L#X z^jbg6ytDw~`(!^`M;!_t=;4vqi=9X6sGBMn%I!RqdZ~pHS(TcF?AniFDZpIw%Of{Z z`B(#p78 zLIO^Ui6D!GX@#W{<9H`DiCE zo*5*4JR9z%bvw5hWhc?Fx+=MRN0}$(Y;41wR>$uqBuVBfx@fCT$B1IJ)=WZdC{Cv^ z7Bv^(AGW6sAzfVD^o5{VAUXFU5N%#hiUvCvwm!C*qT|ny2x^0MSp1B_jObUfF9ksQ z$Xa2vSnl;9HC=L{0+zhpW-}<|D&Vn%hh$8Is>rS%D9lnk5&b&W&h;! z-=h5$`9tpF8?f_h?Ey6H3jHdNTiKh@yH2oYi3c7V5p8=p%#n!Zq$s!6vYTDBHXy*l zsCqpDw;PkJxY)FvZ{r#o;g;}!7W+V)fZ`By!9r*RvdvuiX^aFh5}HD~@d8o{gqN3c zro$vO5~=C3&O$WQ-#lDNH>Bx`t8V_tS^kN41m-8Zz>;r19=2NnSm?rTTiePri5}O} zW4rQKtMp4ZkTpv0g92oe6m^<kHSP*nlfriQl%xKd%Zg9+cuU(%hMF>cX-ia z>uivSoGeOhH6{NID9?~N|SK;`d-66L}0_Rjiw}Dk@;oPt%ukDiTkv~XU-D` z_ie8&d@5dc1i$B=?vflqjfPwJKq~u&Q4OEBWMNSkKExWHujL&oZ9(Si5O8`TZcNtB z96gxc?H}}rnc5Pz9u=Y6qE`$=4d_|irb?D!ITyZ5?};enCf(PDV8BK8p?iCfG-|Nn zI1QsJ5G18hfRXT-<@wKBH@uydMY-4nqg2!GylogBu)kKC&A+XiZ!Y=PRvos1?CVkT z|4xRg($!+E?d&H6Z9HjFAhlw5Ptmlm<8OuKkU|~Rq+cFg;IkSIG38ylFZ}Qy;D|@n z_%?Ii8l#6?(-`LlN?I94$t#$`9L(QLzAR8NVy5 z3YN!i10^SBE4-f%i4|gMjksnvc3Pl zkDE%Ibj{M)(#72N+WSQ+-zXhN_DwFN8V=oJxfYgVFKRxe(8y_eRORA8!;+k_t+6IH zqA=e?I=&eFH{&Rl+QLqcEk(?8+F8EUNm90Ev%$ z5p|oFZjoh65*PQ2fB{pq+-LxJc0aBR4H-MKu*j%mj+$Z3qi1YWoTdbuaT!4h$YJga zPC@lzQ?QPAYzEmf*Be#~Q}h!4)^Om2BH#Tb?b#$CS=hM0ub28@Kq&S4#{Gq8(<9DG z<*QZtS5}`YqDT_a7NfRb_9TJ5u)41Mx!$fOMTp00feQSsy&Kkb*O&QxsB( z-ej~YcH5Npselb%Y>FJ80W3`Tdnv+qY_g7e$~U=P6b&6NL+Y}g4ao9Ly@tRrI_Y7R zdT;c##od5#;loCp&W^lZ=8Zuh5@>(9EV8DTTBt43KAj$bQY3X7r-4;ly`7%B0Ul9DadO@QSuqJ7nGb$z>J2%+g^Emch-e34oEc%QeI+x!Um&KqG}p$=K3GLTwC76HYX`5 zA5_vLr;!e+`h5rzjINXeeAa`>bYBr6Yu#h2?;KO@3fbsU#J-&MV-NY&`G&ht_j0BV zQ>3d(5m__R9Y`4E1wDC&K#(cM)nPNPW>JkpWt&Y*%7ad4)9~}zQnh53%DWx!jaGfs zoRmL+_2lN4&&&zT9!feCko)t`>e6m~3+i*!R33+5N?;-_pH#VEm`|+)f)D!H#DaYP zfd%}J+YdN{cS$47_i{fGKABYQIJA-(gAn3_JgK#&jEK4Xn#-u`T+(CcqhW8Kp1qt{ z`X}tfW^awUtw&ymWK&X$c>3n^i1>UXG}^gknPs`$X?8x7)^X7uNTaUSUTba8o_g0_Mdw&g z_RU~NL)pu)@~}0jM^v8`{1n=DFg`J^ zt48MYi=Af(sy1LD0vx-aJl>!aqtWscQW6xM>p2?>YaN|M-@A1t)Wl2WTZ)alzZez*ap#nUY`&WpBn)Pf2 zA}D|WavWA!6Nv4t%qI@~ilo$u9!oAh-XB=E zH0>_A`|fJ1-^Eq1X?NTquY+rmeK%vmQxxDQt~mM&Hha6ET z=S^qpT+;a=;RUI0VaYeu#JPD3I9sQwSJPXQxQs#rT+Kh(H^brPC~H0zm#pU4>jTQl ziIiskw5y!A+lJcEyOwr!9Z4RuoP2C5R6?!DD#f}AZ^wy;FuR(z`FW$co`y1Mz;u)i zgjr;&2Lj@Rvqg1^(Izq=jRG|KHgV0d9nXh5d^8FQ?!@C!boCQY$QF{$jOl@F4 zj-i;FY8=nsZw=~{hVFGjB&oo9yyo|dO2hZw=;)gkYejq(d37m3ZB{|l=+Ie$Z{44L z$Yt8@g{xPQ^{qlv5Pjixm~?&mu^Ccn?rbj+chU*VUYxDb*$5SoV9ISx!mJI|W<8GX zG*u#eG(R6d*8^)tqILYQfW0sSh|jBL-TH*HAvCa(%TZB0l2=xPO}V()b=R{YIn80- zrb@o*Z$R>To-#%A{#0{z{)ak8_DE$qSX9YeH{=#a^1a)KZ94wfhjuMJw2RpLzE0?H zhYINTyCdu^pE*K-B1{iEptFmB&bEhT$RPQKUTh%wD`lM)5N~K|ccklt{BqE{sYV>v zm(^ERzvq(sUC7a;XH%iPdvGd zzZlt;q!cB5tyPVoz<`1->v2ek>WXhYd*Wg-B!@bqY`&l7q3gRcF2;#VV6P7i8l;hQ zxouwh><(Xd_74&b9k3cj41mSHW|u`hrmne-zk;UEDi@r^WM$=6LgJ*jmnZUzL{$ge zfYS2nKqtJgjNrkC^j37XjuiE3MVF4k9A=gDOIPMEwbo(r&B&$SmiWZ9newCGSPmkioBh!01lu>1pe+W zpzA>biRAot-`vsNj((GnoJH}{tU-0AR(~myH;}@?kW}%P+&RJVWu#NrqKvH?pUO zQR(Efk)}$LP3+U*;R6cG_ORvMoD>}*U%{dHHoZcB-YT?o(QupSpaKBTZ~7ak0See_ z%#lQtvX+_WjVf>5BJdL%sR@haLnF#nhXoB$Sv}DFtd9EA`-XBbd->l!h;Dqn2>F6Y zH${KYiPqMm#0? zAj;1hW6Z*>Ymp~8KR3U}EoBH>Ra*4)FS2Y#rdIGhl;bAHsuXdD1GPW0wxlT8vcg}1 z6QKq>1hzgxAco}NE2VlLI2j*uWBqRoW%?9^^Nx9NElyIalz9iTbx2;9NiCg%-Kh!l znocK}6HyM`BQu-446cmAO!$cp4Vcyg8yuP!1`UFo(2wZZC8D9UtG)tZNon0e&BzoW z}owi~lZC>VwB9YS&_753Z46z5Tnw}Et2x=7^$FJy1)VBA!K zZODZz__zqC;7qf;qXDzb=CAhh6I`j4ESBE|ilA8WSgB4vJSm9 z1Cg&r=xm6qGQCVU)3s>*)bFdnUB^TV$eL%{-YMKx8uoT7;xq;7 z<-SJQk0W*$SCKm9ghsqyDp(|*SvKn-*C#JlZ-}Teg{QaMWo1vn5DynMr%Kac@`s)) zkVyb)Z)d-$V=$~Z`{&*O_@&2Y^J2k06H#HRdNw+ke zUVDM#dn9h~wQg{{^u}c(tWZm6%Ph)HM*`I}-v;;`9ufSbA|c=-x`n?2g=oI>2Vyz2 z>rX{ag*8b=8O4XWBm8`I*L#=;^VL>7>z7#$HA-#&wb0`cv3u9Apkot%<&ZvW^a~2J z+v?0ehgRY4&DD#HQnVLUUFG$A+g7y5rvrnub>yePI&``rW5Ubcl7n9U!||7D3%aj! zOGZh0&b>IUhyPV*x^8>-?gSWZBQDOfBp8j}P`c=m4wf|+Me@;yk zD?Og)wW|30viS=?!dgQFekHFIN?TmfK_yW(*8S4}>iY;Q+*j{pV_JKD=US`G*PX4b zfh#Vv=`R=M7S;c!p>pzofQ;R14kY-UOd6Sh2wwV5)xTqpbSi!t0>i-l#G1=c4P_nL z7NjcNTjw$6Sy+wrEcSi*e0GdZN_eqDZybEO^2Fl^ltmOLye${L4{yv9Fu7&?R!f!E zhB4xAA)v(CJ6wyJimO`2GN>y**u*N_D7tyM(SFcNuI1vC9#|iKnqJl_fT$2(Y**TM zRB2EMvoIzY!oSkA359KW1YQ3-w5{B57zmEHc~O{gWP9WC6J>tdQJ2{j_7;7N`D5Qb zKv!@;nqURsiFJ&D&*(dzRFgl}j3rs$$9?P%%qZRuNJD#LvWlINrQtS|u~%9`XCfO{ zh`pZJ6x05Jvj0r+@hr7f;@2IapQl^K33%4Pk#09JaGqL{x1Nr0L=#s!?E?a((>tK7 z?*Yyb`khfq|Ia*s^H12ltHp2BE||a`B1uQvR@h}OS)Zog=9vC*23Lu{!^G`9H16A9y->C18dqtet~HONI7U?LO|D0dg!? z@0hDiDpUWb5#p3I|2=FFHxE)%qbTjRkE_Hf)rwY-e%kS^CcPvGhoz_16!y?qqcBfL z!u;NX=xH8DAM@Bn2tmPJN{lj zTb09|BjMh$sQS8H*m2p0yE?#uG#TA*Sz&*>H58A_0iMLOfO*TQUDi9rHk@%k-oOv4 zJeK%_{9>_FbNsoHeu}9$5`RSerQ!Q9;kN(6<-kYgORq9+huGyPoYJ?Gqx^zrc1^XD zgQMkQ$@xO=CEAxew9jx!lQy0u24HxA=%vK{F%D_}3gD%r@uG|c{6J{6kC&p#6CT`^ z0Uh6502uF1OGdN(9SHy6;2t@R4JBn^?w7WSyi+rU+ysXCby~vtr^C2j%KOf!m3wwN zxXbw%NM)7WTQ1~@d*dV%6z>Lb+Sj^h))X~dL`n@*-8B|JTLt&9->7at&8(XmfxVZH zI3tm*sqv{t8~zt>eWz`+6!jpuAxO3WE~lqtAPid%s*=eusRnIlXA)EXhTKWI_NOrOaQaP5w8((F&7HO zt1P@PC8MBVFfMzomlHRLTk1j ziI0-j9^QA@eLYQRy%$3n3^F1@G57N0IkLoRYEQW>7vvYJ4eTFjEOdjwdoek6VyuJQn_VmF!+p3K*Q)Z7h-1!jqYD>&r?mF& zd0@Z@9n3RAUhQb+Q@EyK0wg^*w_fzKfpCq}@rnZEt^-K#pOB~iJ?=lr-;+2!$(ZlU z{Qtq+G*!|jBN8sS?Eg-vv>9Mm`CCVI|3|WaA1qo6@K{;(mAC%MZvQFf_tF4j3wLy* z!cm@651tY@F#E?WN&xuQ_Qk}fJl9`H=@jk!4}c*H8^Ond!CG?tb>^&Ni&XGOhZx4b`8bK3Ods?A^U@&7Y%Te$^9#4TqQx@wiH3>=+e zlMK;ARobYBXMQgEk0Zn>JR6Oh3lx&%5y^-~2-Cz1T)qUHBd!oQ@KHXU6`ev(LO&jH ztyZ9B=QJZ}qT^AFo5L2)Xdk~F`}Pm(xrINMPm<2pP2Jq*=@Q#ycNH=#LoBVI;EeXK zfEnV73(%L0xYz$mSl7uDhD2qZIv#%A#hp9J5Wwre^sf}Bt>0e&6qjuNY9J(BXppk_ z2OK&*oRt>im(1J%F;Ul1b|5>OajASk^BQ&!ZM}36^36t;0%oG)mx%@G(9Ft zc~@V-(pE`!lmpcHkl}&nN{ze9@SvJiA8bw_D1d_1La&cV<-oY$Ef0&PqtDQYQ3vS!D}{i z&j%td$in>WgnjmxP25nkJjJ$dH!kblwTI+l4U!S!^o&!Oxy_GNOuHMFD4{t&?bRZO z<~6>?ZCbWq7QP6Qf$@dFo}olE;zoCM=?I-$i+80T0*!WsS0y)9p;aOd)E3-s5wrC` z^ROaGlMrD^vKA#}&3LXj-@rjI8S)mUv=_H05cTJc8Q!nk@|&;58Lsm8TRSUm_}~Yak9GF5K}X4 ziOnpDERqVxxv52UO!UYgS35o|wtNfIZ}gYi?n`)IRtkmJ^~}#ua!xTUxHD+=39&S0 zB}FMHhlTJ;O>B-jpmIv*Sy?n(A_8KVoiSzAfw9gom>9jD{#a92#B=oC=TXbO>U!Uu z!I?I?hRgd1cn@5iD#h&I>@9R`;dDf&sNFFrD+sCi23YCuWJD8%t((LcAe-VUbP3es3 zyjZ)`pql5`Wu}L!>?Q}eXl4xqbehn)P%-8wTFHS=M47oC*RxqJ+rUjPxmpow<{1z} zn!)Gw%&hOFwVOI<$xI3^k2j#|(K{MTxXQOX zlKn|-oR1nRN-uy)*)#%|=6O=SqU8NBKYQJ;I%oApDkli$9FH-;b}$?1qh@mSl~{^J zo%>8S!Z>^A^^%;r)2&+)<`(LCZYDO|_Hpd?HjC=mzlH+74NDw{-M|FqT%%*MdbJQY z5#KG$>w^gKzp<)ym=ecOaXPEKjK>u{%VBW#C+NB;eAS`-5mS+fNOqvx=dih^ow+C1 zxtT1~WLMI)x)b^YOZhd{7xRhzSyO6v!<*!l*_m#zNY?qyr#EXF#l;3YS4XCzo^Ezf zZZK#$$sbA|+XO@Ru3#Dc%(vp=CFA8|)XxoSiaNI62BMc+rMH5FCx6s+IolzEIINK(=a{igCnEW@NIQE9+jc z*@C3h&2&#E{0`ce+_V>0*C*-LN4;!E>MU&&h6KgBmUXUZQ;YC*SurJqMQJgLumL-Z z@Hv0J41a6OzKXX^p1g-FrA5=OMjj61WKBVnWX+lqZh`n9*}!bTda!jv^L|atF0|G1 zpKV_pJ~CWd9*-9?NEr6Cdhc*3cI@tE58~=0pNk3kU$>d66!A=`ts-wr(BDNNc0%T}p|j>ud@F|oT%L(R0kJ8a0Kj-_i|=kl>( zqouBOkcKIQLT2ruZkId&td^o^i6)s*P!Y*mJm` zF@AI0UVp>AL;RF@e(+B~SP4{eJ`B4fd-XR^F^|8i+US zlq9G+n15y^QjFxxuA3Q&Y;6w2KhtV(+jssqbicBMIibdBLEehbFw+u<)sQQfaD zS#&5EL4+o1wh9t#wmpwmAr+3TvkFa<`V9BYcbQ0aDp^^IQtG~% zUt`G=uSE1=l0^6S_cWOEvIMO!oC>sgUgMv`AR|J2SQeB|d2yTohV*<^R#>U{0VecY zj3p^`6WI_J41AoBa~M3{z}BSAR8#k>2#bJnp^J|#T=uR!4?DNDYiylU>l^Lh52?W# z^vKrhOvFOP*tOrj#cV%de>A@X_7f`k2+0wjfTn8wYuP6C5xJ4zToz_fyNsaUvwts+ zEjOlmltMdBPK!(&{cPL8;8XT>*cKDlA7=riK-c_9UWB*xJaXfyR!*B?(DQKC;HB&H3X8T+K2I0IZ{8s)Tod?RA#rYFhQUlCH+B+E7Ovq zJ@u8nkI1P{MP7;zj${3;k{Z^Z*apXd+BD?CCrZxk8#So?AOb!@+HALJBgAq+JAHn4 zjs#gyF)Hnbzy@~BDwV26&ywUtcIrHzO34d}s<{XtgrMV`9hFWplMnqI=Tb7a2zo^l ztWK>}a=u)WRe7Rpi<8o}JfI@9b8i*HzPk%&a#%|E03#!=iJ-X%-t7t1wbV-4j_5Wj zO}ZInTeq-GSC%<(HvP-e=g~!t3Z=rVi4TWRV1kcm6(c6W4Nnhg;mN}5mgy2#ErM}f z7mx07W;G&_Y~vm2H$f$uYE^Oq|6um=Kxm|EXDfeHpZsHs;L?(QZ-kTr=dmD)^A~J{|wI zQ|qVD1&RXq8oYn1)|7bT`=zDUEicm=d?|gwAo#B=pDt*|bqhn@v=61kYI9sM$J0kh zybcNM9Tem{4na{UHw2S@Z{?deO7e}u?9n!AeK#l)hAYJ>2r1?0C`pp8s-EXE(Q13c zQ@YbDJZ+9n5AG7ac5_Ko{4&w=Bn#ns;Ixj~iULJmNu|oTFI754^}fY(PfjI83FnV% z){UUsvj1K0F6QYP`)}~c(FwTU0Kb3htv+hF*KMk(PgBqp*-k!MW`=FGvHFajl+c%v ze_;(?rs=H${OJf3n+R4S9^AmiEncJ}e_&Ojc0L#;%5^i<&qhZW{+S2r?3pkL zd2d>oG}GhhHJ%cBORPpnyZM9?{B9(-0gd@q`{L3m>v5tC^6@MsjnY$jBZzxfNlkrr zxoHvn`a@N@XIXwbe1*v_RXJHPB=_kQV$ z5sUfAnG1c_gzlL8+Y&0sT!8bOa-{Odj`ULX=>7k`9OISqyO$9{EiLQIqmOtat5ULJ zLm!9)G?=Q~uh5B8??`bR4}?{ggFh7MlFy z1E@73E|m@Mve^T5Z$`ptueCYKtq>}Yc9RVvg`Co&1UGvF&4rLu_vdYH6O4jzPhr3j zzkXM13BF&;G7ypf;<;+xi+S1wmUEgemrpNni5l33zjOLJ&K1{-BLnC)BDU-qOZS@` zwnImW@n6bH-L;oq5?;wJbDgf~U-$&OKM*PUUgD9Eh5JoHS(r$*DYCYvIDy==P_Apm#^ttH z$#+@L1PI6jt_Z$rKiq+y#dQ*4gpV z)O9W&4+Yo#czpc2H%MQMp3bxXIz&jgjki_pbjyxCa3kW|)45uP!r4^@0R&eSl_VAH z`Wm#1+FGLV)NqNZ5b35 z3_JvN5>)t3^&6d^_FLm70A3;VT<;eO1X4RI>1ZYIp&-~KcKnQK_!zqi6?>|4(>lQu zWg0ooq@o00t^CzbumfT#QoDG2@%oK+Ai`mPJ>SJrN-`{gsh;mHumg$kO75)>{T<>}ls1UVXqopG#;o!r7oincag zi8wim;*(<^Is-{{*D#n%d}%}>s+XhPyn~m^<)sNEP_?tYXH=$D$@Oc|vXl^t1*41o z1zeReo~fNpKrup_zuvM_fD-ZmpXm8es5%@z`G~xbN*LU}dX70IW~{R68(ZEtmgp4W zjYgigwto5GK^RbrnaS!}Vb9-^8MnLA%Vxiq0%YaC*sh(B#PJ$g-vFt7h{teO;>i`q z5bi#ooj0#_9lPl_+d8NqG-U12bkfoW$ZpwwP)}yrG_ycrbV01PTBDg;bS>57&M-OU*0!|#M~}fr8)RIGfv?c@-i<)w zpgskh9<2iR%$oH9+^J-jm60kr?3`AQ1b0gLx4lsZtY2l|D=|SMhJFr?`RvMUiMnr_ z?$1kxbmwP6;l%OK>s>ZX37^Y+v90f{VKQgN_IKfQ@6z4A$66;8_B*wT{Ku6ectj#j zBhlycU%ZcZD<6M^el2U^{ zq2jbCt*5l5Wsb0Cl)39RspN)^&9zNM`}Zl^)UE>gv*@mNQ;Y18-Tg+Td>K7BbLlRn zcOzohw`^9m#z@M2E&X}+KxCn?VD8+j<4VN&wk}`876=eS!9sQ1m7mJ%eONd$hhEm| zk}QPwY>3vqIo&*xF^a!ETx7eio6LmcaCxMGK9c?`M5Qzr>B|`kbHiw}15U(a#_~iy zJFlgPrTiz-wTlvNUoWZJT6v4*0h}1Q;JWA4wUBh;IeE67nmBExqW7M2@r^Ygsp1t6 z?7(m^V0>=Ui|%CoV%yJgb&<M_BM^cFUh($fow1%_Pd>A!j{cVW_E8#5m{$SHBea zWxq@qfK*87=F%YLGoPh{tEcH1Cmp?o#Us=WEnIIB#TOsVz2huVi?-R{TVJWx=ZAlO z)w#Rr0(ra7kYLhDj3{ck>Ky%>T-i)N__1NnEoKw4b2|ReQ&@1MRij((n z()@rZ>H6=TFLEPnCpk>>;bTb}*9>&XkeoubYM!)zP>_>VCWY~MC*s{zR?~{hdd}mM zKz`E{Ipi;i(DB?Hk3^d*j(NNhcX6ndDES-}C8nI>Z(^`Dcc>Yyu7)j@bn(O%+$0c- zYGrmDaVi9+d z^mcNE&$5`qSJMtquc~Kd-Rfe(wps|LD2D86MZEKJ(6a6mR3%4yBW-;$`QCmN z(WpsQ(xX(s9=^3#=}%7)R~7HZDXHGyxfa1f>3C9JK%gSc2W$rIeCrB7X*)0GfdE}` znp%NtQxiSc{xVhQrBIlwQSErrw=wc|L=$(|SZSnA(r<-MiZsv92^j z44z_$NZdJcI4-HoTDJurY#h|hpv;DV9{wc?$IDbNg<%wPC8gb)UA ztu}5avIW=0N_L>6%FtB~13|DB<|I^SlM=-DSk>CxrpS9h$kzAB&X=58a_s4k<&SiB zc1eKhG?d=WRcg?GsgFU5aPfI{j1ck5TqX%)soLmEE^2RjLPss1m%p6>Qr$Up0mfTe z=r~r`2=Tw5vfdyhrqO}{l=V5*XGvxTztfMsq|wO$yO*VDIyOrrlzqF_?9oV-ls{#> z8eRIy00`I?x44vtfw~9`-0a3J-Q>xA1zHkQ2lT8ym|RH z*4uOy8`m9Vl`} zdbA|EaI2N|r={6CPJA%2L@Z#&Cac{qROIwr+(K>DxTm=d;;S!)g4j>u(Ms|c&OH8U z30|BxTX{?aAbQ8WPS+doMRD9&6298q&u7N%iawyNuH+1g0OoMqD#$9S!P-1}uXba# zG(1J4eqCD+z5q$B7TQ)zd5MGctE$uiD`42ryvsQjoI=_PS1)yFd?bdxMP=f>*K%uL zq}f@V0P3VY_}!ZX+yciBQGRX1D#i#^k`Fl?HYV$k**4=u(pXcPVGHCIE=&Tafxx!wlZ+&V% z1t-Bf{;le*UabbRWE^Y(otHp6?2eW_v_|zto#{Kya<2|!(Ni79zAb4EO*JaPB~_y& zO&+F;Ds*nQ*iNVAH1z-!`sl6%oScQfQd;kRw!nq7j7v`k)sv8{N%S~Uzzfh90u7!m z=XsugL-hqUC?%A7J?SJ8KAxqpN;KXP0O2t2_l90mh=*&X9l%}I>pxbX0~RPYpStZ# zQti0*X@4P~r-XB?9v?3htMxh-*e2OjfCm3X$@zG2rg~D%h&V8RCp1YP^9J>a+GV+lz zfc4@MXV8Y=Cw@g77${l;nSeTSzt-ai5w38(6R1Bhy1=w1rII;i$tM|*i9a>C9kD4^ zKfEX8quI$e7Ee8NG}T*Qakckf0S6#jC?9t^LXuFOq}Pq;kAVVMDLodKwLyq@sSg!$ zmX>;d$!HotKR25qPT1#<5Al)F>O=%l>JLO;_?>Ol`8pD(-`U^2(VH>X2B{ z|3T_6coa%k>y5eO%->83?6621i{q;~$s4rjIZb;^on6FvwTj^E*uxWm1p04wp#~s& zlw&Te)*4V+IQ+HsPdC%MhDV_Fiw=5g{1#|(%~=4Nf9;)Z zTP9m8@SRN%yae>p|4MXT1HVAYgr;D*^ig=xaQ0#S&TirNu1_48hLr-=VQbIb?N3sU z#l&+9kKwuRoRcBjpVI$0)~ z^NtNrsEXxDw%~aBza5T{BkjbrVe!%?b;@gXf2jdDGy~7j=mSb=wrc$kJoE{Izpv+G z3pk^ zUBOK&PIKrluJFtscdviYUw`_4LW>zE5PjK5gg^Liu=s;c+-DM;04tI+!R7wxpE3C) zF9IOPZI)GO5B^VGmWm?B7JNOf*7zh=dF1d#7XuqBo;TILf4xgPMO*O?@SsR|JlL#Y z?L(Zw67h*L=VFYA%KdGXtfZF11q_!SBEJY%Sp1ye0QMWbg7~GQEV!76zL(*;Wof^7rf3=8uvTAS=1{bL1I^0=ud}g_LsP2jU$bV$O6-3O9)XL zM~dfLrpT2l_IekCt0=iii-}b(6ca0-=OT_EeqL3gx;9E3x+PvuXHPashH+nN*pRyG zI2*}rvdU(Q62D9DUIqL!ND^)&%{1!uF}+di~H1gFSrm=5%o8W0L-l&2ZL zdJ&JU3BRYOm6(qvRzlh0d5Py1qtuR+++~#4UB|YX-tzUF-rDuxOz&?07Vm}}B#TQA zVXEbkm*T&yNs3>p6k825P7*2V z@g{U_u>y>Ufep$oX-54#PPcPzPr@0~Afb;+pI#?`GH#%uWPYIR0N)>D99#?bZ-dHu z{5B<_*PP;soP+|D;o44{x!U4SH5RE8>&?zJK>b{!DRXfjdNl31?%g!CDs{1D0*>~Hs80;70KFSLxfO>=1n)eHFv2l|dXp|XrS z-z)|$b7RMUY4c`8T3CTRWUo8Em;0GBnm=#`E@nY2lz2!Hjk&Nat4nvCK5 zUy4@0#s2)950&0Hih^0s6i>xl@!*ExX0FMGl4FC^(ZEHo{R>v(m&Lf?nL9<(7eI^t zS6RclN<_0`^3)QXJ(gYwq&Vm#*k%)5vbsi5-pm(4K@Ye%nDx$}P34qgi?! zs13A};;xA4h;L2jB@Mj=dfFW5cDlUGzW=_YpUjQ*amJkbqm8?5TcvakMHK_LZ4Ipw z-P{zrM8sTD5dp+tV-TjAVIGKf!EprFsjQ?9yG<67?emgS zqJ$h)pPDs=DM(T0)bDoGH6`j?a{=CI;XvvFs0%VnxUW2zoEj%HeqBnkUP=p+ZIW{$$7MG`S-(hBnvy*&0{Kgw@05y~h!+d=;6Mm1 zex+jV*Xx8O17U;s1T&d_kFNG+rRE-7HFRgM~l4f;$-(hLI-f_g2lvH)@0Su`x13P0xR|5F$OOr0 zdiw!bwxtMxS>kS-0#8%iHA0lPRTYzXGolPf{M*=?x`g~JKN4MMH_6`Ftqz_XJsTUL zYl37+FBFyq8VgZ&b-%JY6p?0*cyH4CCa<5L#HQDyev8iLpa=#{@(DH0Ut-Qx;Q!50 z>HM$8hD`;W7LMJu>jmjGHHA+j&Rc+n|5dwgMEym;l$5ii@xI2rvl?gby}EkmX;yNR zA_9??xSW|&lap(??lQfcxOn8p4O*1QEl`#KNeoD*io|17s#1rL6@_1d)~5 z;SCAmkb@WQts_&nZ%r4dO;@n$L$`}ZH=nRSqbqNiWQ^kSS^}4kO%AtQNZgxq10lZ* zlyx-Pw$86qX4=dhGSSnn_O?Q*#!|Ig&c4`qOy|EBVv}s?E3i6^>NB&oHtV}Sonm{` zoonW#l48jjyV8SzQB5t{C-*cgSLa%;HX!;x54Tkgt|QIjK<-+KLkAkd>94pXj}9E+ z)=|BUD0;fM-0*pDH7Lq9^kFumWD6n3GodtUL|1MbQb;hTT*Yi!tF&^@{V`oX3cZS@ zo_t~k5(4eEC*+m;7BxE?PG}=&J9EPuQpviJ`+kaoE@+XtZH}LZIst}MG*~t(??d^p z)DxJosFXHR%v+jX-OS-F!S>OWkx2Q?9kjE5Fq9EO|EiBbYk#|aJ(-x)&mwM*n>OC) z=!KA_uNb57t$2>rhQkS)CKY8GArY2ob|np6-!Yl;Psz7F=LS~;-#*wdyC24ov`xX% zzh@H~d$&xfa4t#EylD=*Ph3v2SZ6zN%lheEY+58uyh1FAlxN#!VDQc3k@6eX<7~RI z^wG9R^zim_Q9o2ze|v;1ywkPz_EFkpaCT?Di=Yz%B*Q z5##Qh+C#pJExaQ0>b#-ethxL_8q*%yRr{POA(RGo;H*|d>LQP|kt&jt@%>*VwjT{H z^4u}MRrj*wM9oOz>btKQxkBMiOC{j)aB9CYb|XcbhP9WYy~R+5_>!A?aa_Wi)^kp1 z6r)5*8b@-e1n2k>s#I`3`4VsDV|3*7ml#G#SZ15ReCMU)QlwwivXC`PczfrY;QEsM zfv-x?Ef$X54~ z_y(2Omhs39!Iw{xOONaX585`W=^w#M3d@zKQ^&sKmf65^QhwUR4xAwykFTh@Jns%0 z`O#4H%HR=`B9n9a?2bm|Y?TdP;LF`Qw9zUZKGq`h}Y=wch%_~(U+d{zSs z>n@88Ya6B8A}1F^my7I&2?XbBVs)@{Ld{QvfTQnI5Dhj1^G4n0urA6)yZx=-GWvcY zZFwA#qXz1YN#Y72S!{S)oOxl&#ZMjD@ibdWgz)F$mF6biMQnc!KdBF1G8tuXM&0QM zf(SYrUn_M*Q4wE*5>4|te5UY0Ey>sBGIR>H+3oDHnEP{M=Ss2Gf*Q9)j16+{ngiA& zB33u;_O0=1#Ot{g3e6|(1|p9J-ut*N%9K4ZF@NP{%R^VUMLma9(xR=IS=8On(2iIg zs&2Y7y5N|&t{G$}&aJ?ks-|1!JXAe^PI6Ceq0k7j!}{)cDBRwwtmd~wQ3)EYB8?CNyymUKriLQWr~$pbtuK%NOn;L zmC3|$=`ku~O-RhFTQ@4%4yLo(z=U?;+rRm^deo}HVN_JC$^5Py%*b`u>AS3*1!9a} zVv;pOQ{)6~Suv=`Xm5rzO?OHDLX9hAHC+Yd@uWFb>s2@D42;GABPvvU zD`!>@mE?5kq@l~u$CF7O0`)4^b5l(^@Xd~2w2vu)N8h3HYKzF~hML8$zimYJ@2iM?yP zvTkw9+1N4AaC2aC2wU$rGA-RYvj)+s`rR!ihViuZ2Lf8Ei4xgKdmGrmBKt_$$G)|a zRvyn|R`xl|I_PyIvfL1^aK5&0EI4Uj7i+-IbonY*kfVG)m5204Qo2k6Rs?GWL7@8X!3RlhDOVrqzEe_HfUkp-oub)R zbjP@h*IskSwm?GVU#iva6i^kdwqEor&Ttr&qr5ZScHusO~`X0yz zjIyhyrZ(N?IB_RI9gjFp5Txw8Q}=LVK&4w2rD;f%z)tN6Y#X zb>7pB3kGzF9cA@NUV}rEV=>8gZ>o^3IUumRQ9KD|nN8_1pS5bGIYg($B6-oLv^azv zaOxJ7Cs&?SKECu&J(rz%g>Kzu1q5n%!!BfPI#X6u^SCX?ZqK!w zvx49C3E!jIxsIGPnQFtRV$`~$(xIXJP2ZJeT`VIR_==OSm9t`lZX$`{j{mV|3wh66 zSc-NA%5xiFWXx$acjy$}r`cV~Iht@uYk zjmYCw)|T|F?CEt0nyzLS(tT3{+L7ENaktd99e7+@HECEoN2cV9a`SdjFNwc22>*ZVy?0boX%{y- zqK;jRsGu|j=}i>r0UQEImo6}5c`%p@Kk$uBns+1`0@w*Ts`ZJvTJ!o zHP`aE(H1sSQ5m&nmd_)AlQLvl&lb0|`qifQnbGaSjxSx*=GLU0q8Comz756(`nh|y zb*mK#n7qNyf@&u352@N&K&o(Pls;hWp%zxOFQ`Js$Y^N4M)68g-m`> z2WvGVIx$ip)o*@Cbk5==zw)`>4D8lafVq6n3%CeqPqNAJEeE5Q4D4&$c^_6_#3%HJi5{F13?b|>-iEp=ZH@pW;I zETba%SUH2-Ixm4A?E;uf#0*!1gX*AiUdelFUvE{Iv$gw&w$y3m?UE+Ixx1FfJC>7U zxjt=V+iq)Vvd-Ah?+h=yqJSQk69inr{-os?o+|Wue7vBcqn<_~SAdmKpXL05y?wiK z`3f()pnO9vl- z0nxXcJN~`4?FEQ_;}>t-$#6r9F@K-*l~O%nv)j*B@rILCEg z2gT&woctP_P;bt>F_@vCbBD~NBQ1jbwaPPBGiLFX?RtKZ1t9YR<KrqoB&$CMNG;DG59hm_JQCtDwNKW+}n4@N6M|nBWRC*5FNv z?UvZyKakb0n)3$Lw<$)2)B6mIILogNeSDY7hP77V>iDB`yh)m+3%u@ z_C{cdcv@F+|Deu*h)9i4Oso2q)z^mnC_ArVe($*&lWZGi~kX=J0V0R~{Zv=|8)Z9cd8 z`o5@}fERHbE$+L`Co*#wg^{Cdn&*iDWX%YmxE<{z%y^)3^3Ehdkn`C@1u3yoDA&-p zU@S~KzS?X^z;}%VW(?!;+PH|C>1Yf;L!4fV)hyvuT#-Q+QnnHBUb}y82Acx*)z~am z1(qFl#j9X!)xUUYw*}+hc&$Sna=B)s5;d=iF3oLs-fUwW`D^^%KY5qfjy((uNk)~T6BwN%}EdGXAyhYPfWRCmr~N+5~R{4 zn7C6|l6kocIO5UGoc!d3jV0%m%t`G_++D~cp%)6;_w>`caW_4-y0k{Cvb&=BNI1Y8 zXg+BzFEi>9^3MdIgXj@*owB9rjX-gBjW@cu0ZW`WTCL(P2+`o)DCd8J4dG%-EQ=uM zxSbv>JKr^TBnG>vAzmcNht*)OB1Ls%G70x zIs^C7Ri5st&4gPwq` zy%3kL2Omy2UT~cVrsi$kKwd%4yx-i*Jj&L!tpo@)fiTBrjxWXnaK8}`O()3$Z}4=_ zTibP70Zx1Wa*Zf$&GcaD>>1!>8EC|(`RkCfxvO=6@D4^Z3Yy%V{CeNo4NAW;@{q`L zPq#!vdu;VfRmkBIz=!YLGxvbOGYRi4n1DG5IIds$p zns5e6e0EuEbU^eTjJSmA0lc`@8Q#-O1&~|rOXM>tIb)W50H;sIU@rWymXvYc-iWRc zn)@zR+BtQ_8&dhn$YHqyvpS4@PH4^!PS@z|KYaeH{eK%0zUxu6A1eEgdH(0aBgH)&WX_;!QdMcc25Uv{zVq?;x!4p6 z*6dCMeoX6tsrp`h_qY&EQSc4ouPC}NR0h8D;OLR$D;k#haS=Ze?t3U#L?P4=))j0B zPB{SJ9Q(&9&sX=D+`0uA%{u>s*c7SrH|{jUzPFeF&1S--Tpm>HmJ2y=1P&0t+(u;v zsV>cnB<%;H&V}qjA$@z1zS(A|@RrL!eEjO`fq9r$Ur}eza2;$j*0`9>i+0Pco!gH9 z#KHG0VcsBlGI>G|Jk4-Q|3~@QGw&XNC0x65aMAt0^mCwimN3vLsjaM=5)MqrluJ9n zr>n21<`A|cKZN}N5j+iFS8(Y24m6rYWr(>fDeR>X2fL^D!vm!7d)QTf%6vj-emHp2 zV&jzR3h8)a*a0DYO??E2yq~+Z(R?3y1GB;|91%<=`*Gj~qNHMuaym3@In|#0;RM7* z1N1O||Dn0D{r3=t#ey)}Z|_(9A-s~R+*>`mY<=&qKwo&*e`W#vXCJTGzZ_ahg#%~n z`ki-(hV8GswX)yI|IF(>n)5E4a{7ui0Fdze==kpR2~CSR;j_P-A-Yep|61sVDv)%D z%d)TQIFLK};UWIB*YD2ny#+cW0cZOb3s!jFrvAsze_YbfgA-6`aoB(69REjwJ5~Ks z29o2zw(Ii~?&vQTvuscEl8F@Mi-{=7yUhEA;oxJf0m?mf+va<%G=6c!)!n+n@x|Hc zeE{cu5A@$1#%}D{2twtj-@X%yIbA=`&BA2*T0dZQW9t)|B~fB^VA>(QEH~(W#vGu5 zu_v&i5~2(GJ+WdnjkgY2M`*ITAoofVrWE`lEEUSjw1ha56T_7W)#uXx_Jw#c@O|b%wS(s5&2#w8e{G7i@vjW2 z+x$EPykzu%WbOM4%_YDd1%JN>Ki=anRWz|5;u4=qS^F`MZwv^Cn7co1*V;H{I5Tdu zlo7RWiT`l;zf=a&USK~fuQA*YuHmn?^51pt@d818brAQzzv8z)$An_cHNowznSr zQWw6Xu%F=3%}}4iAo96l!9U`RD6k$_JT9^xc^K7R9Cb7b%>Y}Q=VOFn z=B4#E1_4;7U*bC|laZ-NI=J#4$>*xx)se?cpDMJj@U#6}?)q7tXgVELX0)4jJ^_)V zX`0$DpV#O-RjY$`(`zlI`i=nfR6uUYwCwEPBlCYPKessVwdF=BOji@5+E+|c+o?l< zN=V9OaKp3jP=q&Z0jbGUw)oZ1jDMbIE?uf#q68RgC?d<6E8aM4RF9jB&>Jq#Joce>wyoESkBZG;+V5WRR5$=KinZ3^#w zQNBvZfaB^W6q%vt9i*D#;T_D~FSnGM2ypFbQ18}fxSY)5C5IxoGN@A4fPVEEeC6f9 zq`BVoo#Me$%t%NEjCm7YbqkZ*TO|x_JQvn2*6*ma_$tJ=5xygP{aG-`a(582djnGo z5|UHWM#6}Ra9cSs)>?r=&2T42g_S#^tx!+`b7AORJkAot#oXSW#=PwYvV6G%IW1!V zTHS#=F?GWqX$4N^%FL1(#;(;HwQUwLBV%0ys=$~~kflLNa20!MQh4F&Ht3X7QX}Jt zZ+Ac2uoB*)hH`y6JbTwsAWSL3wkF#D<;)r%Ux)$;N_70(S1+!HZGnmPRtIfkc5dk} zCt!rvH{y0jr$R@D#yJF`m&3*va2_Zz6RoB*APbp91_wh(ZAJwCVXbLwzERdnn^~OC z@ASe~xG?1mVG=8qG6v^X(}2Aq5t%4;&!zQVEJD`PNEBA&yikw6trA%m5?H0$6avDQ zrbrZF#S{qO6d&>~6Q&h}bz5_8kM{PD#iGG|mm6UR4CNTLFvc)1G&no7MW;rR!s1)F zW_O$psL)QeAa;g)^o6>ybVrWczz(4|Do9?&N4~T2optZ{*$N~!VhiGf0yZ1g6Kt0B`rnyo}aeAs8)1if24Ak&D4a0kUI zc<4E+P+eGSjeoQgKZg&8H^5!npQZJwGO>z6B^s|0U;<3aGmo^+2DR+^RED^!P!46a z32axCED0pzp4g?Ox3NnGu`eVbGUNqzgj_j2ctY}4zk0-fE*53-%`aKHm8Rr#lk^MS=P%Ef`Uc(3|vrTTj^@4CJ71#PKxLD4q?js@Kel_qe~<@<;g}? z%nFlUl!Y8n-Z+N^^F%|py`2fAVFm!hE-Z;@siKdLGZyowZpFoAOVGe18N*o1%k&>& z#F~_d%a(4Pr8PzbWcIHu(%+2m(`CC?WFDnZ+jqPrw4q47Dvk@r19Yzhy-@_k0Dg{$V+GB;3%v3|sv>{%ua-qO{EZ8HM{fV5bP4~V@h!`(=; z-WjbGVldUri?PV-l{rc(ZB&`~os_iz-4gJK1Y;7(=K7`znrqC{y00m`2%?G+*Ijg5D6~vsch*Hc>DT1w4t@ zUKAViWvmPWXWpu}5tyMhuN%pMvDs@_W;ATjM>*-b-iZt3n_Gsx9=3M=eNH2m>#yMg z4(ay?b+LU@F~|tc9*r}euMKK5AjTCNZh{}WO!ZZV2HvG|a0|E#;|l`5 z{yN_m)Rx`Ij$vWh7GsVHfhj>e?@1Iu*Gk|ikPC_g>+P%y5j^u)_u#eKp)G&-N2ZXF zyORb~YTQL35IyBowZgz)+*Oj1oVX+=4O9ua(6PW2=AEU-8$h2-Qw)P#Ss{VQSHf8H zrxf+%XC#@C)|DNDm_IF>UtF+6suqj0oe$o`#p3~n=+hyh2Q2zveZdH2Lfm7i6MFII zqPtUuk%*{S8E>J~K}DhEHry@KP>RnN)l{ie!_5hFeDJ#|cp=#-$y;jlysfR0eF}P| z1)b3dIUxz!u_~RKRnlg&yaXr=x^AweiCii3msva?#(Hm=-odQN7jjzmLPrO4u{6Jk zUJ7>Qoc>y^#nWq>cB1tnKV}8?b8XM)-0(eue8$BXdY6*A8-LS+J4C{A!y407H~5uS zuWcS%uFmLD5Zu4;aohc~*Hk%F*K2dNLqf0)ah)tY-_(!D)8wojY~7>>%b8EbHap2! zi)$qJcZ)til&5jO7Jj$Gne%TPT z^BHDY>2gw7@oeGJX1}%1>ImghwC}sxskzlZH{K<2m5>_h)lD-V@4hJ}%z5$aF}?)m z57^b&Ouyr`R-lYml&~`1;0?-sR!cc>=_Rdcxu9ofFF&PVcfXK{HocxNOsrLMm#2~^ zkK6FAlZ#Db*rfK3Tu7y&xE} zI8ym&>OPEC7P>k+0USpUjzy4j;z4n29ZAD;d4MBv~F9%C16ww7PD(<<}CkVNPtN zcS6qhSv`BkRzx>6G0Y=8qz$-Sii@C(fQd7ImCX}@mR|yN42Nv`tFdsU6V^Nq;4FvS_?%;Ca zUz~?EgkFvB=;npTcynn+2By&V`W;WXmlS0G&%ac?D~s7Z0h* zeWjX$7f7BWbN~d1Hca(_G=(y)O)qAT#;JDpY{qGbcMt4(J5QE|3D-CjDTTrG#`;#~ zj$PVRYIH*qUF5I%N`KZ!qz!9GNME_UeS!mvhZ5?O8sw%F*&3v6+E$Wxhh~~w$%$1b zsehR)IbO#6*vwgha(bF^*T$l;ddNaWx9RX8nFcX7W#8*GZo5dzE@r+U6yJS&SJT$>?hWt_VUHYqeW6~!vBcKj#cDbp!MMS$<6)ax zsCopv;C-745IY#@WAo~+qWQXo8eR_UTtV>3!X|qvykxTyTHLsEv`PnY=CGp6c9&@v zA2s%R8p-=?SB%h|ujl0w!BlBxG?JtE@66eLZ2Rzx0DJB^d546Xau{e71{-XxPZ0HX zO?#gb6gg2(DY+ks4!EbAc-7aigQYg>FZnar>6O{^u9sMCXqXaDUOMG}Ol&}OO{gZV z#@%0r?_~_%l%9eR9_3P*)tdhd_tvap=}4>9VwLz{LsM+ZPdMX`nc+MGm5dnH_;{@Y|i zi)Q@9pltn6a{oMIu)J}rs`IEU3GP?BX*7g(yE7>qL0`!rN%sPgVxD5;v0H(CfRCPj z>sRY$Q1^P}!-R?{Rii8`MY*U>8SyG>TYbayd$A$mn`#+rf$dL^@&(BIWNDSWk3fj} z_%DdMw^u)&!}YAKGYd}h&gd5>F&QP(MP*G<6~mk$gt#zPXC+Mu`ZJavAf#CZLR%z* zD3}%a34IiJEnrAxCo{!s`^1%PUM#*jh{=b(m?>2)nT8f0xYh9ple8)-jQ7q?#er z<8S)a@*B60zC4;rw&B_Bg-o;sn`}CQ)YjEiD$y70byJyBtnBJ)TsAdZ)>Q2%O%$2W zWUE%Ea3Xt%q*jJ$7{E@yW9dq7Q8>+ms84VH9Rz@r{hFX|IpW9V9 zFgExrA^9avEKI|$mw^w~Wh+-^M{R{m2y60`4Dy z5;*5|)H2bPHQAe+P`BP6FmV$v@Eg1{ID=*N?(`yOO1AejkH6gW)#qbxlijGOUXVn~Qwqr_w z>X{e9@W9K=4kVv$v>vf4UyM}9NM?AVtQd1z(o8!v>%B&~Ube-CySKPY-b3p?YcZAq zjV<+hb+N{qHTHBKQ#5I|E&-5S*UZBKnuCi@>X&rozUjp zGlP@F%hfMNx93qax0!2I*ANP>hV@XgrD^yzh#EWp&PYZm0RiT7!6ODmpFaS)$P0*Cu@n{l74fuUeYwN z&M}RtE^GzI;(ceI@0}hSRBPl=e~!Y$V4z8HgK& zB!qk~KB>KQuJQ6l$I0pvD?-EC3TTDXIx(Z#ZY;u=vzinGv6F5}U}7cUpX2D4?tn_` zZI=FWpKEk#HlU1jY{NpBC%sj-p(BKD24Z0Vd-U_?5Vl6?3o5sL^FER>U|&9T)W67n zdKwu|pXiW4wdDyf{Atd(GPR13@X;l74A66>O|A^xZrV}}c%L53dOt&5_9}2JXhVy*Z8Ffx{>#s zcJuD3j3z4gN8i(&}W5nq@$P;;j4P-`_%orPVX!*=ky46?Wt@ij%jN>?}<~ zw@0UQ1^Ijsb1@90&Y1PJD8UL^Dz!C3%62WJS!hPJ=kTrr5Y@iTY`fB$o}|#8rs!zI z-d3)V1PaEkS6?0E2=*vhTW=GEc%fkNK=nx*ITeYM+7^s(;kG0j$lIlAdNKx1Pc%1i z=etcdEPXadyx7U4O-Yc8{w-5We3&OCG-So9a-Dl_umCuGq^j95*PdJZxO_G`7p85b z;P9}?jA|~mBcDruXT3KVAzQf|WmjK~#aj@%dzbMc0)^(FwwgsZuT_0BKm^i@*{>Nt z4IDXOa=xp{e4m%qSAFd>k;M?t&_b(|xu!W@MIRL9=Qr$udPm=kd)Ys^#Z~HOZL#P+ zCekyk2NrYbGbTQhWE|opnlGcYp$vzIZ3CYvIw94!9mXL*3jm z9^5WqwBE{7nEL9h2T>qYe{>0M!Zbe^J*AiNB9*^D;jvUrHdLOA+N5(94&U&TJJNSN~`h&cg$oeZ?833Ac6N49~es zB~BKD@Ih58bJm`}@qu4QFt`(3AcVXUbjaQ6Hih`k)e`u0bqICqCaT&quM79C!|5cw z&*hDj8AmK1I?932T=+FaP9 z>36QasBqJ+Nm$>XL9XqGmYm~dQ!ZA++8OTE^A9$#BrGVpCU;N8VBC?mi4BSwD{@p( z{ZzM=!i!XqbG&taO)@q za!84F-r3UZf)V_bHDyu?I3E)+V}Me2SHGzS$T&);PLMBNk-W4~da61g7yBTRlKKjo zdSmxURAZ3ml~%~KecCmdG;a>UH9*SJBlrZa$Qkj44$6RxLfpSR;w6_badI%wuGYYd zWa?Nj!|(tFzWYi7uHW7|8T*{V(nJBA=gmZ$@4L0=XU>{wgYps>npy5kI>L{dSVuPs zMml$^R`6KbiA71Onr3Abt8IR9NvmIUsSP(H_zdNqF1H#Dv^8ffcoAU)X4%lE;*W^X zAm}A|$J+t}Dzik&l+hMVuV#d$0~pphv~Gd6kaf&3GUZpw(?S zwTw6$<}17eFNmCM??`ne!!2Y+Zxos@048EKtW z(A|Mj!n`{jY0`RdKys}E&s$sX{z*j7{`R$sB*k$gk#@GKq&UnA=3?W^r<2nC(9^v} zjWjg$seU#bTfRuu0*Jq2;sh(TW^DVQPixLVcrLH>2yVPQ1DOCECd|K<^L>lrbd;Y~q!=_V&?T-_Kt@1&DU`c>{?-{RJrYvpxYS{Fq?RVQ%hpoiBK6 zX>P$hZNxWlP!l(p(I5OH9wvy(b&03ys4gn29 zrgQ{3yn5VZaMi*loEgG*Sd=3bRy6pc>rH!>RPM=hEoS%+uV>hp+-ayUu76w1-)8Ob zJX*e{vVSyqNAQf9wU6M))7Xbue;sZ(5yd>#G7aQ95^K@x+&)kxRO_Nzu~v2a3fV$H zz3~`CoP&wD>gNw6PScpfNJRW#^@~!ITB^oSL}!wzpuOv4DL?IURGcRzk}-1Yaoc-s znPnG{QRsa`}Tb|&MKi& z@~485QUDuSQn2db9J3@0y{Tb{H4)?-uY3M%7)h+0!kJUHOSoS-Hp;h&y0AJ6$25gl zGYip#`H~#S*{T_dv`P5OW7ojqNe!nmR^-hosTj|1+e6CqU4ufzE{dYnv+?R1PdH9* zO7vC=>u%?U6Z>{_xD^8fPRt)ud4&#l1}bp^4F%h%(eo*7n@cUTGd1CN(sv5R!ddl{ z*E-C$?v+lDFAjiFU-ju|kI8YvtKNoj*O3J0IV8@l4#9Cw4!V#A@vEG`QO?*V)ctYE z&RehssFkkSDv=FYh(gc2@sXU+tO0GHM@RIU-Xft&yTAuU9H)s6YW7OMa|lFfhlVWn zgM=XYp?9h1kY+=&@|4lgk?fZZ5-dXF_!8Q`4O*39G7Coj;W#os@!?9VJw^xz!;tkZ2fe`p~9 zFzQ%IF8&2mq_U2X8UfR`kkq`Ef47b+RojN9AnU|A${^vQbfQaGLi@*H7XMh6oY@w* z#I5()f-)p?kP){fBYT_tHx!2-tDk4a_8nAgq@#{@USnQ`$Kz&HE1y?Z0Ee=TzSwHV zhu@Hgr^wJx<+(GAo=_TkN_bVl+mA<|d(Ovb+goSVWv6SHK97`?8!3OjX_%JW&|0ps z8KQ!Hvu(0%#osHvA#IXx=@Qr0ey4Zh=yMgzr@fRXt2J$vj<*YeMUCQ&y>>n|K7LwJ zf}07SN)0zqaG7jK`tli^vJ5Ce$UQyXSHmpxmJ}$nX=v}8F~sA_<*v4t#Nk>8)y#kg zJ8}ZaY^stg44_`1_80s5e*119Y(}G=HYIop1?10YS4Ln8D&g8omRaeD?K}n1%#~x! z$efjP!RtUFPt`OC?`9Tnd-DN1v=27m$8*8mrj3N1sIiZl3d+F^TA=SM$%2V4 z%Q@orbR6NjQ3_l|0t_=Yx8M3bqT_TP;+r?8dUA41^BepSkY>OeX%f>Z7_Y@EIlXJ&y=t8rk2M+&H#x|07>*-GzoAM723#>r|oS+Mept{5HJ>G#`^woPef z9fJuoHM>9hkE5W%OYXQYn`0g*z9WJdau zZ5Wrz5C>K;PXiy)Ye=*DHe79)w%TmH=@0$*tq4}i$S8d#57#%>>iHyJZsq+Uq1D$k zBM>m>k#qBTNn!d*qs(#-T;R}$%Td2&g%bV4iY;xHv z4f^f+J}jE!TU8VaHE;|s<81F!Tnqkur#Ni6HcRmQ^TFxop1H1TVh~Z1rVl=yBX`zACel2T?C7xQ|3ntxk?(w;2XLYm}wNyg3X0JB6| zWr_9A6FI+en}+RP*9+~!F#~-E(mR#rV~<)lG~!$+>_b8Hw8f?m%!=Tio^S!MlBZ=; zDqNs=N(AHul$V3U*bCi}bL-1P!n`K4vP{svVvtJCA$hksHXmOO*j<0v&E9O z{l;B+BPDpC)&U)oY^BoROmK=|^~~_^?*oS~2g{-a1-pSdbq5a08+?~aCnRu)#UwX_ zb`R+^j|0cC&cbdm5yiV=vLMqZ0AHy}oQcS-lCmveIb98EtGq99Apmzv&GOP5&Vo6q zxDT(euGQtu3ahXW{e(7(%J${LB!G;9sw9|TZudc}m7vg|Y*{7E&7{|Ffb%1h@OMp{ ze7MG~=)DpH6h6BwQ*jB5YV(U;!q_Sy5H4`y zIygU`k1|jpP8s)eR!juJMd#4d;!|u z*iu(b3q&FMoB!(6 z*C{2Z^^~!#(ewb*R++?0Vau(uIZZ9ha%o`+%hp26U9PPI!Cq@qG@Wli2%giNFuTC& zKVpjKg-p1Jb(-x*YgSQnE%oX?F1KQosU(dqtrmn!OE5cf=tb40t%ksNSE694q%ZIw z<~5d$rO$5g9MW=IsI`g>b)#@sK#(KoXCd(Ss$_iX$g)w%N;yk)n>zMTSU*dWD|g$5 z-h2(mSz%&Sp9!iw_TJwuzcZ2VDkrT%0%Pa#FaCU`t>d8%@yxE&lApaB!9p6%)sWXG za4m&~R@I~zgegRDCtnU@a7|Lkk(s4pMy~c9ep?Uy9YaCtnUP7i_LXk3W=6W^-J}U_ zd&G&C^K49}aWIF7O&88N8$j}k(@$`21H=>bZK$po3r2KNA6{w$E%`Le_u}b6v$ACx zH4gR_w?br7RL~J^vl$??ed6PjMV4q)IUtg+=Ab&YSEs^ynPc)2s-vCHu*sAWstsK$ zoS3#>!*yZ=8nUm)sACP1w$Ab=p}|zAi-r1FvvD&!4?q8cTs-P=eHAI4QUyiC389#Ev8Box5~P?Krq^sJwV{A>P@ z^b_Dp#+CbO`Ujq~fapA1B^d2^c`8-C zD4f%rJ}5NMT?RtBb+KPpgVjm-@Y2j|8sPQpA(h;nRN)cg*D=4Df*Zw#PJR^j5TYyxitw|2wMsf8=&WnO-qG;fg-m3`y9)y;Uqe zeG%YCk&z%b!qGS4$pBgoC{;H8DlYNh3hl%G6qXFFPKq6 z#X8Nd@t+czloN-}0gfJJlS->FeWT4y7qMCKw@E5g04m~;q>xD^a1<=k7aKy>l=9*X z==h`LnR{XR>GY%hN^31q1*9iLss z&J-+Z8xsXobj@dNqZTrDGO1D$Qo9Pmge5RA5Z1}#UXu@iu0_^6sX=OlXs0g+2MZdB zx!C*gbW0{7d;#=?lZlO-tPO3LM3BF1KKwpSn9q0yqGJQlrVDI`AXa(bkdc2?cl+!& z?J-A!(Lw-Z$)K>sQ(>GbikqID`!ryGlI{Dh4?qmOz zO_v%x($1R39XEr*bD~x%!VM&}+H_b^$@9;sBj^g3W~@fQa-yt#NvC$wk0r8t!Aq%u z*X?Bd-CS07?k*LDx^u~{;MwFVYD+Q|^kz9-h~e^hn9YhEU1Z13-7L$Y-rYZ5FwDg8QyV zUa(9x5mI0qkHC_au`L^}Q*`|ybp88Z>2SbB1g0ZJ7Q8Moxe^mW;&dW#^=+#MwVU}G ziV|IhT|3rX#kwCi!ZO|K1k|#&UVec17Kr-JLEnA~9o*Ll4q*AmPhKETBI=$$p7@0& zk}}r(GH^cN;-1~MJTK>P^p$`Ji?X_9q^Bt}Ua#V*VxhD#yz&OKM~#h*e5{9joY~8WQ{RkL7_sz*>yw z0TiT54l*_U#H}Tjhn92?r2!trv-T-v)iJvgtRSfLnB7&NfF>Z%Vq_Q4?y(KA!I#KP zsQ(qB{X-)u)l=RgvXWzD=#BnUzR<;2-C4lo4xlGF{;(474|C^!B0Dp2@B9@D-6}u9 z_)~>ZsY4r=!`v#sFK;BAs>YGm(2QZ)cjL0@sLn9PW*5se*f@4w;~`X9 zDePqMmw9epB{ug9#n4SPGU@CtWY^oMIR>cjyO$I##i^0GG{w;4WrR=&*zv@R16|tt zx%|-J%c3wwmqiZdImH|3{zzWm^EKuf2o1ll{fIG;fTnf~p0~I{TRk}2vPp(4CUsOU{tC}4P=SwxB~)Egdrj5prJQ!r9Bz06$r0)!EI03>c|nn z&kfRsJ1zxV|I@D;t4Lu^l{kyssJ^t<<-$(=b-i&^#`U}(Nq+eDFVY*o`E7H4{ZREa zh#A?=v0dV;$qTRY!ENxZdXnKl9xQ~Ni%oXT!u(9w4g&5QFvnFUDLL;;yP5zWVR8+< zJBtb@5Bi!f0{M=N=rW9=X@88fEhV3;m71@5S%#Kzc)vaKW`Mg{(;B>FE&gsHpEcZ| zNMGr(F7j`3`z4>-(t23}r0n98X{zt+$@M#>Q{q?8Ryk)hDhp=_;VWE zmRiyN7bB_(y2bTQkoYP9+8klY8!VhIGrLpvC?T68cpa3+ua?r)`*j|Ei*5egcC~RY zW=-HfeK`-y-@Tw2(<=4+^Y?GBxg9%Sm7~w*m|V?rFG)XU(rRCF4 z$T9Xd^u=m6nO!$|eg1yyOUQ|QA`T%5U(z*>Q z2byRod$*$DhnX7&a!m_uO>M}^kocap1cL^f^~x}DUd#{>cCZ!%-i&gTaaM_t=?JIh4rX=@xPQSxqsav*4sN+LIV;BZgkzon>C#R35C^6k zPRqm_!p=Yr2urZ4W_P{Yik85<0l7ia7iuJ!lt1ax_Ns{Rh{AX=vG5(4^wrFZhJ|#& zrzUSJ3&-v46Kjxmu&2dI7U+}`S=X*PxBn6Tv1@;Qme=BR>Ar8%-?QY0uSEHP{?Xasq+glkpRRr* zYp;2_&%P*V-*D@nJkw7WaE1aT@1_+yyq{G6!-xK131gL006DYJQ0fD(q5hu_``rR_ zb%6FH3Az4$fS~_kGw%{F&876|%woP<@aH%6<2ir-t#j)}@L}#@pA>ZKW+ijC)zrc9 zamUZy1v*-*D1W`+iKCb7DqCx0CriNO`S2l9jO^Hf@e@E_kW~2Q7)xGx1jIk1MHUCN4e?_kYckcrMGkQg>HV=$d`d5!a z12n0czx?p~{yqJK(FdRH|0~x2>`DHAcK<6D?PJ>iTXqw&N+b3wx*zJ=*H3<8I+~>< z*f6~^Psj4$mi5IA&x-Z5-?bT0%MGY|wnPNFe)nm`&ccsxHvGB7->~1aC_?t-r+x^r z{Hoc3{^(jwE|f@Lb%g?pSZ=uT19U_CF^o6>)jKvK@$=Qb*Iqvn-kz6>=|ydIRKUQnJp@8a9wI?_ZRhLu=u&9{_)~D8NhgWt{UEY_~ZHTiYnWD!)-b@ z36*b=Qnyv#VYkrvp8nt1+1G=6(@i(rdbWP>gp9{eIs5F5MD5&3IK7XxKijDBd0=C# zhmiZw>YjIAr|pdd%%HYAWcup~A;)dBfMqH;8zX*<9{l1!bEBM`+NyDJZlBZ+7^Z{l zaMA}1T{V$ZBl-iKV&y34b@yJ=xjuK+G3VfA`~TvH-8;--VqW{=UR{)$3FYS+L2 z{a*}tiERc>(N2s|YtN5+f2Vo$q2t9UPq5fD27un!=9=~#d|mWG?K38w0JcRQciPt+ z;B?=32AI5CWLR0e_FhD38u{bhe$K{+_W)8|ZrUZIYAJ<9+6kePh()Nf>Ks@TTd#0R z=1xuO{d*Vgy<5{EU%GJU1O-8EkwQE0?lCn_)ju@!q>jI#I`64`x3;4^iHvuvu1`ox z8cP}*85^nJ5$E$+6i0}!civiwOBh}1E~Qky_T)E;<5&Ot&%IL=)3GwoQ5segO2}hJ z&Pn}t=$HSz_BnE8J}VD*;lNKZ$A`E%YgG)LgJwOBHRYrj`lB>Tl-cj>Thw^#_@d)H z223SS?%9%?vY$#E)EPtTt%&V%E>|?nHtw$UwU8z3VDtNTySMMpMZX=^oX3y$I_=-6 z<4NG|@;bx6U3{q8*4BP<|MxGROXX%Qp&%c#^>0t5mv0?j6U2$*8yzawz1?g^Yd%Zw z+veVXv+eyLFt?O(=KqrHuTN5uPr6buJ0u>pg|!%dF9jdQ0j7Pwxksj;Bsxb`CasW* z*W@sb-i-+!^3(W_3_qi?)tx7YILjYjnmhahe}E+~o_njS8E6}g(g?hmCb9q94UV2M zluyNqftQTeO6{$&&-FE2HI!H6e)cSv5B;9~xijZA^Y;?4x9Lco*F(CR0>T*u1s}z- z>gr$L-cP%8*)o3Zh|~{@9OdrgSgQTIIDYk>Jg~T?dQ1gqp9s;I_+cKZts=a3Elkn$ z7~WE^gKLLN$ylg8*o@>%l5u_Zqk!&h?{f&ia;sT`9;*G9w)38Iy9BVCLkxW80QuJ` zq7uG|HfgHw&i*UUNxi0^IDIeYH%{dP7b;S$KHS@mo~Q-x=sqF$vpS(^ws*^%0j#Oy zenMV-BBgxdhEztT%CV&VQvZ6F;`z5_*An`e_7iY}=^jZ}GZvfy?FJJe2btH-^X|7O z#~>ub{tIujzw-UIo%b5DG5ZB8J6=ln8)0*SfacGi5H@~~uuCs{826Kh^Xe<>OBWNs zGGt;ZMQytL{_oFaQ~9lR?a70SV>_00|E>Ue0PV@cpHZ^=az)yG{cCAIjb{ ztjTBF162`F6cHt$BA_Tr5m1oO1*IsU^xk_5y+a^~NHHKFy;p(ILhl`oNbdv)y+ud> zDM{$u=-&Ie|Baq=?)}D_gqe5NthIit&bW@aTNhXFTsTGHv*+{K6k~rABQZDB=Px>d z?;(*GiNY){r~lQj^TPYz>+KX6Vn6*(IM&Czzei>46ZVn*3yIl&k=S~W_VhpUaYYD* zKz*N;=ML{)%mSa@b5$in#d6qVuq>dDrvAmB<65iyl59`?AAr*X1E^p1We)O#I|Tvb z7eAh6!?)sk6w%K@D2UT8UCbRiJuQ*d5=d{PE1N4Oo>pAxqolEYne%DwL}`WO{9rmg z?+6{M)o-3T{$L#bmvJd6h%8K|K%sYfHR+M(fFYfj^&=(?C#H&ze=rn|P4Vy%dv~gG znnj}iA1d?Axg|Qox3YV0tf)_IxnCk(ys+z(8Q+YIQ1Bn4N-!&SKVolkwy4bmyHd%# z9A`VdAmSSSXSw@xN3Mz+T9VEaDb#%UR&3i|0;Piw`(suj3v>KZ5HY;Z#ZJrBDHZqo z5k~DN@S6=jCO@k4`*XPA;9s(~DA`Z>hb;YNIBTi&pT8-|frdi0e0TlKSJ)pIbpB z8)fAn_5NRIaC#vnx`wTNlhM=JsxusqFI@iA!f|{>bO!5LQqa!#G^gI-)qH~%{{wj; zL`n?T9Ki4HwO2JauPZq523l@lY5!R13*vTN^ox-C+lK$wXMxIwty*^Z+f!DJK0gD? z9%AXwqDS$m&g6T(F38-bfv!?$5++P0*(rE?^6u`P$BXBPXUC>`?thJPow@&qK5B`$ zbU(0x+~vQf_=9S{zfIRjcXxmy`2|R?pSPBr{rR<6Hwv_|)g>^wj-DKNAm6 zYH?S2`pT>`*!|4uclNtWYk6b-=r0)kCudyTd8WH2;U{kU%j}}}2qzjrwyBI>mAbP} z8@tc^$(MohL_H{B(U*D}5&j&SXm?#Yn2z}@X;5v;Q zpqN8nMK@PKf%>&$)lG&;@h%Nkeae_mdR!61UG5S$;@D55VWHUm3&Rt8o4Q_Ye`QF4_A z&2pNPX;=!b181mg2_;)ZdCAx@TeDci_UCVQrrl=0C;aqWr5mK`s>H6>92#T^|JaIk z6}1&n+f)mnqzH|Y9e*HSKb)uMha0K<*;h{~Z}e*tUa!Fa{cZmE7C`e#wZt}&OK&!7 zVkaE>b2i!`M1hooIDCmk%)A!y3j@ zU<()ORE^4wzrM+~xn6C%diU|8^KL2o>#J`%VpDR{gz$Anj^ek^{~@0LB>x`^zOYC` z+zFAq}gX$ z(uQLcV=d9k-(s)(%m`0AAAb~5B!gW>mG}7lp^E;T_8%`*qWjAZac6n09@lo=SuDS- zaoeRb-u6o0(3OIJbM>9nmKSQ^JF2(UrSJrb=G291kB7mB5WT zC)59B!2clNKYw!dE)fkcMBnAI4jVhcP@A=8Dpy^!>*M*)9w}CwAyI)oF!{d~=YJ{0 zz@lIJVPTkyEh8!0My1Ve?EJskN}6amt4L4WM0@=I(?6__h%eeE9bk15C;6ynxeI^!h?APUF%5CP)lOhn%n^0IqJw@~95{$L2RQS9gq8tjQL9)xr z#59q0`arsWmP2z|TBkFY_(}G3LoOBx2vs=6Zcznwc7B?Y`8Za4Tw<(-fI{U1rWBT0 z?Nlbm;*fGm9dK6=*uN>hP{UxeiOA~rp!GCvdSf`Xt~^~r@Drg|2esvCV%ryUxnV63 z+M$*FrU2vW(zE_m;j?`;Dad9*r3G?KpW@(VweQPO#R^t)4y?Y#ragiKs{LxtL?%y# z4(HkpAHN46XT|DQmbJl4<7nKP>m-fZRag@~cvj%mD!?Z(6bD+*v^S*!rl@-GHxX;l zOMq5l50CnU4vYU0zwi`HP?PCp_N5(bF^TPSdq&mYTcVBJWd-m2&l=`$0MPlskV+#` z_WGD~(vi9m$r%b3dd01mhy*BsnH}||5_t8V-M%dq(?>8R}qdxI(3 zp6}Yo#J>L8Pj**G_i|bWNho-VA5{O7@26ah5BG4YncUIdgc>R}s|1s-SVsQSm-vT+ zV@9WTbNReGl)-qolHD1Q%fY3-D4RA6RX}SC25;$9*m|W)sPZl~z4U%5T!lhsDHlU!WlC$=}p zpuICrYa##O)zH-|5 zYzs{vhCv)~BJO>XXq6i(%6v>V{7>K6&*=hI$aO#~6^&~OS~@uHpnAtRqbC~o zsB`rxO=Hu-wROdD$VuN@zEMd0%`Eo|rk5V=CM0DR$0epVK2V&u8mC&+`aP&7*K94( z`dmFS7Lfdd2@HeJW0StiATEZg;|$oE3QO!g!lSTf3HEH*Ih;RWkFj%PHBoLvz#I%C z=pwA87=4s+;i_XK$MtO3yUV&(fT9{TBPw<}S$s*Q5B*6_0DG)eBg0F9aS{yvt=sME z^sHw7sy0{KXeq$`zLA-V`@!rYurWm=s&2IGNDUdgCnVF`nE_PU)m+SwT2vAqHGe-z zY5qQf*kH)E%;v;Ge`x4`_SBlePou%**jCFzYiI*tW#&} z(q-G21OX~j7={4Q2L+0E#+#7_8e>Nwv4Ec2JgRJ~n(}HEkZa`Q-W0= zCfc>~lRyqozM)KyA=-0!kpy^6Gs*cn72l9nKejr#N=pZdi+{&sA|3ta*dlBkDWeFE zA2u;HUvKzGVp#O7JLr224(sGNAw9STVN~sURfZT|GHP462 zFqLIC#D;%>M1}50uw*5bZG)Rf9&_((wBd}6VLOsWjEeTkX4w%JHJ?DX&Qb`C65(Ew z*^)V;6*~Kxty4OOPj!FlJ0D2^Pkib++buDpO5W0rQJJ*LKc^JvS%0bp^jMe1%}XM#NeM6%pG*31KSh7IjX*=r z`mEsbP&Hs~=MCTo=q&f~T^@S(c=)8@>IzmE-@6>E{ zuVX5~wkMuvnDBQ0<ks&`L-vXTNJBJ5N>vNi&@1*ivGnuO*n_|7h_9pAa^5Lw3$^DY6R@7LJ|trR%DpKlVH-o@dpjq+4B*QT@mPDC>iI^<o*r$UMg%*?xN$c z0p8q}eMk>0sh#g}c|HT;gZp`jiQO$58#gF0_rPDS&k0v-GEn!riCc3S#wLS_lN ze4}^*KY*W{Vr`=8pljNXhyF~7=NRj ziy|!$%kH)Sq!9fA?n!gLbv-|>;SJ-BmgsMzO#8j%n34t(l^2hMQz}hkMx&W|j)AjANxP~Jr**O( zzPVwGbWmcOBZ!Et4PSJpnXv2}UZK?gg0F?}uS`uMFJw9hr^Y1;A4NOPrTIBjP9N%9 za2@CLx-vHosyt{+#*7e-_$>}S3Mm~pD5Y%(_ge@FDnSUPD=S@CWol> zYVVl2dKmXotN?DdFL5S7I>)w$3bq>8T}RupDpsspg+7E_~^&^;Tbc~vzL?s@KfSB1kjv+Z!@vRIa6|J`an}1sY^_JF>Z@>Z~Gqce<52`I& z8C{Mo>%@z}Y*iW}6B<=e{T^U^6@|IeBUDS=LmGRwWmbLVZpF)HiPt7PYd5GfmJ1T! zSP@$+Br;5?k{AK`(VJ&nEvFRC6o$esQS#J2cRel`6Y7_#Z-;0)8l*q69%$&!;I%e` z*@P=jmx(D~M*EiEyCUeWBxUisqZAdJI_&kqbUs{A9sLNeEJF`db9gIj?Bb*r_tVR_ zm}2IPGY}4F+O`e!dq<-BP2qKzoIoFhF+{dWPCtVZ+myoZ`e-t})KzMudXL?VNzT``HD0AhbyKX_PVcS)_Ib5&y3AJ874@h^^3DYq9Rxvco1jm;q8 zsNRB*u)M{mRn2LnRGk|Zb}FQD`Q=#!5|)PdK1l7NocD5YYo=1OoOO*2_ceD&7p`I3 zbc4cvr_+mbM_$vMK6{x6(ev8)U75vYy}p@Qp|qw<_ma>Ve5%}QU4x^U-b(M?`*AlK zWxoOSB|_+Zg30o93#&y`0%r8P$Ko&lX)Zp#;CF1V$S6sF&uFO9HG;PFsYJp%Dtf%F zA=TFj8>YDm$C~uOtq~z}@b~_vIXB;co?RoVMpxGP&vEzI`2sqtRGw{5380U&%W@@f znPqaS@r36Cu#cUZ=2VW&`ss~G=FvW-@$TR)!%w!k==;2b(3`bONbl_81FvUdcj04N zV1ieOqAmReCllBs13VQq2mi(B#%r4L+z_Qpc*ETvF3-Ofh$dT;kYmvUO^A~?=nG_I zzUou8eMY5$UOLN3#(hYGID?KY;oZ+ZP?6}{{fK>!0#~3kbYFET{RXN>B}G7gWS_tx z9S5qBEps)|Qx#qf9M=BQ_!;yY*EY}F%vu*6*iIFjKI&kK z!EObY<|=A0mZM;?Ruf^~RUKRry_pw$N{KIM^rny*1d1l68i>a&i zYg6~?d_5(#nleF^gJk(n$=FA>&cs+VlqWQd(Vb0f2r;P~-LH0+Ec&VO6MI{!-TXsv z5|PdSQa#8`}jpuD+SSyT-`gQ;pwe}P!@gn&IRvP525p! z)Q&ABGbndtbF;Pk>zD0Nzl@emucPGos-_yfVl@wbJusW-FXDUkku@((#=jY}kp&v| zuKTReFowo)G36th=}H&s@2#$4>M0%^R5Q!xe{1BB7qhPG4OMCHRU~(x+TLMdlDU7B zmk5~q60`20z>afcUO8ezLPXw}U90WgW`aJV0zB}Vpxga0xdifxj#?`6$eR(K!JO!I z5S})_%2)(DUs97P)-dti(Pf|6pk2v7Dqi&+-rzaH_+KhJERgf$Anjs~Bas*SCiTgc znL?$>SEh}SL#~m1&X%-msEfJ6H`kb@PeE@*n0pj=)y4sj9T#o&hMT4IU`%(s;keeV z**;Vxi`*I^@EzOIVbV3o#v{;8SAScx!XI5OZp4&v^d+-Xrqu683GP`wQZX;B>9kZr zs@t*g0lazjVG+ z9I|GG{zbMGW&FHur8^bgQRlq&g;6=V_(peCRo4{OH{&UPGK zlGCEFTI$B#L2F%-5dks5E56^MYx|VBeFcU^4o08$9O|3<@o_IJj5aEK^SNzsD$t#` zG2n{Zu5Yv&wO=pQyu7^R*u!Cro;%C_fw`(kdZdQHfC&+ObnIvR&TLnHvLOA`$$uz@YH(qE&4oh(;V^S(~t=-T&!t8_)b+vFOpu_cgpx;}H) zn{H(zq5nKNNFVOBAn(!Y*^zTR|MkAl`!NA&gKR4%zE6KKd^<0F_4(esCCb(iR9w&r z3@9)guF)2xPv?oZEo$Xk`@Lo1QI+H#{v@hcza1rtq>jwdLB-qUdf{p6x3f!}^>*pY z)bhlkUO#H%&$0u97Z{{eg;(^b@cXjW*CCbUMshM6v1LkMfkeU2)a~+tFki8K6=D;i zraHl!UlLsQ^Gpm!MNy?ydCsC-5-2-Bw4y=Wh$?mB_UsT%Z6u{bY1)WWuFmjmfak5E zucFL->+DVW#EJ5t1ak*r)@4l_1)w3XfH+I2quem$xxw=m*a z0JOngLH&$yPGCT@9YTc!NYWyQ0ICM;k32J{U)&@?7+QR~FjRPhOY!()aGAGMvGo%B z9=odL)ciFR&cx0FY0#wl5tN?Vr|_)a8}rV8cx!~iZE7)?(&m9*E{&L)X`fPRW2+=! z{LGU}^sL4xch0Zo-raxe+8p$875Y%|3sg2skfI{U;YuJ-t|$+E~S(MtIBjD$Ap;Sc(L##a7l~RycI6TU!Y1e72sU+uz^=2E~|!0$uCl?OP2` zI{9p!-h(C&x4CNP&3ow^oi;WodZDiC!ohYPA=Oc@PY;z=yz?Szf4NeAb?PJ1m`TAjXAzoIx2&ho!;1C9L6AqC zJE3KU51v@vLspE$>3vxh=4`fe<@6w)&4H?KM32K`);)(@rt~ruUUG z>NO4aN|z7Kp2l!J7DoU}O^n~lUKYv!M?US}AxEwAH`1?#D;XR`#@H-J!N{x2*~ut) zW{z~5GS!(B;mc3b9zl6Y6-IKzZ)tn?T_!IBjUh!jB>D4SNj{BL* zlO^j28J0fdS9fhpxpC@cX`XRZlg5F2=dr%WOCyD#$()+!)CBtr2R?m}xLY2BZME$& zeMdC@>STk7+Ep{CcPmC_>r^yjvkN8w{KoXC-_}khLsXM~m6&Kjw$8qDuaA0H0qGXf z$`k&=U~ct_lK1n{1Y=66sChMn98?RsO+%tE%Nq4?#9u0Py`18|Bg5(1=RU|VITe9Z zidxx%q}K!JzHNCD1itRfR`y#CTkp4Twbv*G>8(UY0lPIV z65f+dYd$S3jRH-tY9euiw(z6%jhf+uKGTsN4aJMqJIB{JZ0DqXu;D8<8z4e5O3&$w z#1!nPua8y1a`vOtf)Lm$y_5IWRU`Xy$%n$ev@64VSh0K|@odi^d#ThY7vD$@+huKy zeHkItK^@kwoVhq|ksVbR>jTMKNR3nyOOXlHmO*>6G=3>72MylIUQY4-5j#aNLE^ec z@~`b{N|Kbuih6{>H%Ii0yH^{L^||BzhX{Pt>%+y57n_#Ky@i{>T8Ct?4}WHVFDs`T zMWB%T%eMYM3>{FgAo~XeYc*7?+4ZIJUKtyeFN?P&%qk;`MxQVhkv!vVe~xO27>7PQ z&`(>}fk+j{O0#iQ1&FK^1-rU2>W8!TshLs*#|V=a7dut!!SVTyl$V$GK7dc?HMshl z-1LJjH#}mdZY$}({zNI=5%a8{f*xjES~h>rPgAd}cJPX}W36$xE+Ah~e+4WFxMC!U zHMV6~QjxlyH$s&I13!20dD1mf;|>@~tAh(K3_o#SIkRq@E~}R&Efjv{+`u`!@PHNt z{VQXCk+zN|=^;e!!q%k?*5hQN6;5co50tAm~b1@*9M`?@Z;CcOkv= z*EcsmdZnVZsdfu=4mErfG~D50+t=-OT7CO&B^&$eOxPX1!xQl0N!HxSw?-$Y`0P)_ z8p}i8Bc+z;kux(a8mLyz#{DzM?d2~@YGs}5OTdz>;00|9dnC_NE{Pg*ta6Bchp;${ zaa7$S>&Zh_%GtfBUPn3X@k5O<7(si3cz-hz($3FQE$3PrM$SY2%~U=3Q@NI=dS$f< zPgIYW#OdNbVq_wp4CG3G=QnS_3k=q1ZRgJWRvS={w5Z}Ut)*~OC$qB0 z+s=cD(S5Cuyj$%yRPO?q8hApaD8N5fgb>D3@TY9XQoD0y)y%77p1bano~oaYLKner zXR2-zt;Nb2ZPtQse2O2pqq@r2kWfX>Wu$R{dThjq_qtXGymaS?9b}epX<7*!hYR3q z%r^E#I}U|P9;N5TRQi*P|?FWCrdFU z!aK#7v0s2`>~3D8YMS2ZAQ*L<{M?0|i(@ml@Kz_g%#wxc2~Y+0E9wn7-(%W*qZae| zP}30iNwv~6=9!H%N(X}v%-)QQz0?wZZF0(BAvyOk2{QU| zQ2Z+Qr`xQLjPs0e3{&yklN+=K>ES%gWJxZ~F69^xEY+`Ir+GzUa!o2|L_7pv-;|=1 z+WT~4O9EMfXI!y40R@@cu0Zy}j=kXlu_?`QAhq7DuQ0%5eBabstnMZ?a{HUVAZ8pY zTfa2niP0D9s?2Y00I%!89Yw)i>C#bT?=A!RHe516QwP|{B&+Fl4BX;)8@@d-6Ei72 z9QY>jV>^g3uC$Y&0!ObsEZeqmFPSC1)h|5m?o8+tfuB3 zpm@}xfq6fA*{AAOdqcn#XX77=Yr1Pi!Mwh%9xdT~fMfPFPxTwJ^g&yDW%z#A4EL|( zV4o$_i6Z=8+iem!MIBK@ZKfnePZa*lN7ipUfh-l(@TwKnhP>~5{Y6q3pz_#kQvwvW z^O=GZ0xz-uT6Lhk+bT)_l~|>-#j#ZIDc><1_RTT$y-%UR*XI+W0^Y@8iMY?f4|wy{ z7p7xMzkG`BxB;8r%Uww#bElVS?<-|~+fs5>K>g(=>iO*tscw!`9BxXIM_j_eIYBNe z^xoHFya9nHQ}W32=c5Gn99Aqr=Gy6ih}6aV4{Qr>vL0U(6v#$5T0AXTJe(%S9A(?r zeWgeepbs6aSbDbM%`Z`)lRxP)gaB>O~xixoRVnDn+P7WTiM{N{kF0kUg z%#PgN_dQpMYn`GIKPLI~^2t|) zL&%lULmJi*eR@Zvc>&G`aRsF%I|gY3l@%%*@M{YX>tpp4ddF-DNxXWwWPGG5eZnM6 zQR9_kW$2r&BS&3Tw#Xr&J}(?!R4Bh4C$eJv6Wj{JE!-!!gDeji_30NmIqCNXGs>5B~aRET^P|_$VH5e%0!&Y`_I$K6+e9K~L=g2r;P1DM0uV z>_w1@9+6ez3-gry+Nz&jMoh2T48e-0FB%NF!}P0}H6ib|@G-#JW$bAF@y^n5_NQt^ zqN${f&b>FkV+Qe}R>~;0ES@Zo)!q)Q($2hbeay>%v$)8EMKRWXVE<`R@rzu21GUK_ zfu`bavZUMa_`(wRdfUg7))e5D#|C9Ud9}j_s#ZOUP1@sZ)WYlpQD$ShBPsn8<--8w zst+?>YoiqhtDeL#Q#?bfJJIoSF<}*pw;}qU{wp&jV?a?wv*MvR>_X(RDp~&JkfF}! zcOG9m4;*;DyszC!Ua{^)EPWvWfQqB3swK_j(roWKAjNg+rAf@Igl@wrSx%J7?UiOV z8}+y&3E{Gv+d1T&%jaVk9v#cXz*R+HH;Ts89F1oR^!JA(IghkTLzsIv&K}XnI%W*g znqk>bcl4Ymm1b}$B?FD6rI2(e-~^JgO~-vyQZP4n%Y`dvrI1JSAK?oF&Qm`WbQf-o zPWTw!D2;W9&ov$B$QAF2(gVMNVsE zkp_wQGIVbPcx zEa z&8SW$IFpY3yFsRNUGSq3@W`}OWBG!|M-qG9aU8O6ds02;EPD|+b?I7`M(=7|&mGCX zp&(nt9v{QVq_fW&=8dbn9jvQ>^`u8p3y#&+Hlo@H9O9dMcdJulA)Y@>V7g|ep+2)p zl*{$^>Wf#7gpq@Yw(*_(f+~LBlbF^BuN2p6mQ>bv{uZ< z7ol2BH>Byd-%RI{0jod|A4EMdcjR53%Ip(feJ~>LhufoF^XqniZe2p=NjBXg1iLh6 zJyWulFYeyd?gR=Z<{uE#lIQUbHW?Ke?A~0ed3t*&VqH)NQ62KdA5#|C=B5Fc*H>~w zr=~XC4)&XTPfCOYu8AuVn#R~;~Z)Unw^bK51YYP z={Su~!?(wM&xfQS`{%U@ds5O`kB7ELUsHMsa%v9O+mz>5V7;t;JuxpK7^3<9qc4|k z!cJ7)pIcwv!lZekC7w7^C?%gq)>=+hSr}rK_-p8K+EI$;h+EeW`#VDrYJkk1y~gGkmy)$a@kmJpxut6 zPmeK3-LsSwSgyE@A7}D)2L;A*^=SUpkYbB~c+@5y6e4|m_%jzC=65r`?u?TLEiuu| zk*Cb4k2mfC3;kSUo-Nbe3mafO!x7y*$QY{4l4(3F>=7q5T<%=$Dd}1F9RloC50#+{ z?MKQQkbKkrxP2Y>q19-@efUi9PM%%=yCbZ$p2dn#b`!*ae&S0+tLf2)ZNoI%dJP8& zPL%(9G+mtKvH6Au~o_lpa6ywzn~De_&zOR}_XIlcn+KQtf~CwQ-o z9Mti;oZ&zVUI5?OB|uGKz0J|~mtrV^Iv>GE!vlfBDvR{a5}}mam(P7sn?(sdP6Dtb zKg)PG1elxY8N```jK~qbgQR{M^xby88_Rt((%f>&TFOYN-iU`;W;%uwdjp))6n zryCJR431OwB3L2Dif}Z=eIJuB$_qd1*T4WW<)Lpp1=1t`ISU}oZPp8%u7_wF zv~AwhGWJHcrTeOVm<0oYx51?&gfJg4{BY=*BZffu4Brvs zt@a?XA&J}qr9EL9R6r*uh>CkUdpFdKJ%Ujs$@haL@=>9PwPCamhR^{`<7iva1~gE! zHoWNyIp?V}DQ!feFN1%{ zN;0Euhr3JOFb#g{Q8o27UY-?o4TlWBzL^!=FhHq<4$i7?sC_ z6Y;L1__ym^6kwQ0GF<&R+&-HU$*}6p627OzFnK^h|0q%(=kCelVQn}&cJ9c6rmBdtqMTU#fGYDB3+cq?82mVKQ=o^MqH}H$ zE(lk7Qj^!@cADd2P!Ek`tHDx2#1e2X)Euz@`ARo@3Jh>N#_pG;m5b7t0OnlsDfBPV z#j+W!fHe~81Vxfc{n&-d>IuXeg6ZDvDu&dT9ZGlKiZfh%3M-zQ+3=1T-!O3;^3S*Q z&ALrH(UO|8=L{>8Cj1# zmSakDskSKMQ0?EI=7ocMivsd-#T(wM*N_z-d~BXZ+!`Hd*CtRd1`XP#)9oADA75(4 z25aHJ6|NKE>pSrp^4uZU-hCZut)|!Q_nHT6IMAyFV%V&P*6bY4@PTeN-_hA%MlN0UJ`?oPuT`LhT3|4K5;GZ zsia3D#k~}WdgWxWS6}^naoo;vu4OAT`}moQ;C?&V;m+sLTJ__9i)+^OVwX58!+l-sRSBm9~o+))qPtB-q ziW{1-q0R%DszThtr>eQP7(PGqTS=*WXpqucSn#b8^!SLL8YM66T6i>V^8A-r1uC~3 zKfA%O+H!d*HLA$1*Yv7GXriEFBH48o z?D868PNLbejN6p&9>+1UU6V?m%&Id`9;@$mXMMM_-T9szHnz_}W6xoO58i~n=+Fol zqG3|P>ch4k8yIwe?!3`h**4j@f~u_`-x!tIsA$35E~-wD3)6V56fHcC6O%G3XGg3> z<>Mb5NG{4zfG>!#kK;fjhBl*6l1%nRwsKo;Rqr=SU+Y&rwIBJYRjyMr!9s|fqOw>= zH-09@kZ3u~;=fpV=3n{7s655`?fH1KAHL(ez40)EO$;#zLxy@;^}8D#)*wE-vDFt_ zf`ZxF7?x53?YJ`E^O%U2!?DfvGm+V?94_LnMWWA|sx=(8gS zia$keN*5)&!j#aQ1R{~W3I2^gUc9iqV-+BKQ-zIz|l6B)rxRBR=Uur+^-ZEa+`cmBi- zu^E7Tiv@gRk}S(x^@-FI4aYR*!^dNG>|HTARso{oKCx{RT`{SH&?A5a>-_GbwZ-El zM?7d-tU9J9>ZhxkrEg3F*LLdh z4qZY-cJCu|hF@`#mVRQ5>6^Ky{JK}~*`LsIhQavRG7G1nP_Xs4T8(>G9Abibpr;kq z_iI13IS9EfJl`&}kKXkiKZneD&R(m0kTTO*$*%5ct@O#)+MUY@(w-Sw)%Z;*k>K;% zSBtYIo3>$WChb$zRmR7r?Hd}7z?<~xPnXNx7jGvILN>zr1k4pjfdW6)b}qn9x7%S`nY8nG>AEs3OH}x76f?u-Oj3Hlg62 ztK4q0CYCX#Q5tJZg4l<45X+Bk)D0v>&zn_wxmUq*MfBw%^3 zhB@Vm9JlIE9HdeY>sh@%wi#X(a4k}hE{WZf^7Dc6%67S`a9OJ{@D+m0cK9!@_|W&lyhJ z-08*6BxoxDe{2jv)c^)#^4 zLLqt7BvSmf3QH*a+1!QaPd{SR>a()@lu|u&b4F!sqhRGJu`=ht%;BxQ7enzWZF^zo z($1BJAAK?It)Qyv?1k)axM#8D#$9%~b$C>Btg}xBKtHooF{TjLl-3C`nIs073P|~z zeIUNR(v$BfZ&0u_7YuJuU*=Q2lN4W)O7HWPldHBZhEh5*W|Fcob!Ngq|%3>Iq->VMV z<7EHTy_3-dlROx)99pNMjcFp@Q+6KU{F>MR_WA&)j3mS9YJW-LL^g7)8ll&3J+rI( z)%+N0>PJdVZg8l=+(m!!im^ODs99qa%x*SoHta|x!l$yqyz{%e@Bc0TOE*53hZvvP zj?fn9b8U9oNGGfuDEY$8`BA^Jp`P`VB#MdENBJ19%?HS%LC$gdcJ@t^k=5d1fe9Av z2RjzFtsA3+VJq_(mweJ{7X#^eko6`=D`@Rw=>|66MiyR`MSqk>uV-B0qhW5oB}`kN zjbvD_Lr*8vzPtU0gxq=jp4dEn^M$wDf13JTIUYaDL_1mpe;AAY?X>w(P)JfrTX8!2 zf~0K-c9L2(u=ulg#tqtU+7%t{8I#AxugGTDja}yjelf{V#n1-q_j&!dL*)0I;`1`< zGb$kuouUONdgwftIHI?;HJq}pG?W-()Gam*svIETWU(XOB)L!>;qr8Gt|x9qg}3rp z@IHzpRBz%EKn zJ@1_o`Jr05QlWB*X94f(+vb|=6M;g*DF@Qxv9s(8Wiw9rIP>E?(P6teliOB%!{pA% z8@b77$_AFmU;%qiy@t=AsSWSOg27p3riQ}!1?BBDXHol}?ukO96raf-Uy7aeCFDly z)~gxa<%$Kr@5xPn*M52SFMB-vYmZlXTaN$JeK~y|JCqy#(nYA3q?0;H={NQ&8u__1 zPo&*eQipvfb9`@cQUY$KRKEB?EPh2NED^zysck>-S(k%dQy+UK21}$ym$$XKrv+vp z^IiAT|8UR0NpEa8Cq#rY;nJF{~LTxq(^#=8^Q@xHl7cJTZLo|t83UNjoNM8 zc_(7~)D{Oo)(qU$I)kUbq5hKE zn%+#wG(-#MHSK*r%Z7D|M~Fw6SF|g+`3E2RBa;U`g1)j$E;MZte1tHPfj4D=rk86% z^QY`Zmt%E5V^|_0LF>!XzFYg>TiNuuCl_P&Pi~^?V#%ruPoGcm%Q|EDb$Cv1ZTiE1 ztu30_p#*qqw+^nX9bEk;Jb&t1@oGt-Yhsvn**pNT4rQQeefshy8SK22=C5L6>zA{0 z!EkI2511l{*H{mp87^H|;!D@Ymfv%6PS$g4&evp1l@rAdrf7qtU7)J!sPy+?RyTdd zSKjbKR9o`)EvZx7YzFuV_9BlYIwJyEc0)Y5A!j;mH~`3!)*e}EV&hoeWpm)5yT z+j8KJD&Q_5Uf13_f$}>kLpBext~3pMW&|A{^=8A5Xu$`_;|9DQs;9lCwEbD?6XyV1 z6BO58ebWZ>awuyRYMCkxD`cUk?2xpa(u{8Src@>4D_hyPQFrfl{`%-l)-}4iI5YdP z2+zWCfOYlfgX|=|g~je_yI;LKob@M{(cB#;n;(!o9enn-_AkwHy8T{V)C0xUe1*mE zaIhgaf7HT{dhM;*R*TXLS7}bEsb7y04-~Ny91^2SZlV9KACTbcf4V4oBk@XM#?&%> z@>b5M!Q-cTCjY|qC+83zde(eTRSp)8 zXKA$k8pX&$see>@tKuwAKqRGn>2vnhohqJ_i?lnO$O5Ua8K-T1MR`9ns^k^rkeTn+ zPQ!k5wO?g*%wxzeH9P^;47{AgHc0Xi8%e?tw`QG;q_u>G%20W5qmLrhB?oxuV(-nfvc{{6PKLxYc)4#r{@mW^l#Z*_t9e3Z)=V-YPfXKl<6ir4Ij!CP|RFxi` zh<{R+%=TB?JqiZ+DrpS^an{t1!8E75ooKam&<9W&-iLp`zq=#PRI`bzU3-VmL@KU&?8d90Q|0UCpfE~66zZfQ8sRz;N%tUr(juu z@$>*Q!pVKz1`z|P>K|8VG^}}t->VEgcxmY{h=GHrpB8OB75`xt=hZiDp64IEI_!F& z$G;qfv~U?USGx(`G5J-C?I>Do0SS8f3t@0s=sKdknOOSKEOwkPRX%a49bWolu~sP~ zE#`;UKsEz0VT5bgbJ&rU{jch|-`=?&&n4oqINj+_gMLefcnoRP@UgU-RT_TnhaQwt z;s7id&19!_FuW`W#qGB-K7uG48Gy{wB&9`f6dM0dB#TpCe63-C%_FBjf9-V;s|jIS zfrF6}-tQDO|40fS#oUX79iEqhyJV;zvQ`Y@jr;{{&U`QMa|K!M(Nmxl*nG1r#8&1e z=|)CH5SOK17#)D0E3s-Z0X4kkjr`>XJ0=8VHm?V8k`LWBgWP&_;g?+fc>Mj;c`4C4 z>6n!+Jp!=3KCqJT-EzYGv*pwQ8g53&ga5L`&_@=@$fw^}G^RUQ9tO;$>Ni>a$_gKd zQd1H5!yL1&)IJ4Q0VtC#iIv8-Hui+TNOLZ$`aMg2e0+N{jwQwB8!ZU5NveZM*?}QSCgkBRuhuniY^9ne3=9}-=eeNHgkdw3b+N-a%_g-s* zfagPA^m0qwEd3pCYo7H?pNhUP9+L` zD$#w|zM6l#)P0uadpWiy#>a041oO^ySQ$nxhT|l!azpF%fZ%6AbA&Wy3Y#bQCZ)Rfa^PrE5$@! zY|_y#cyCxq%j6~Zd{L2kyJc>ew)0`Z(PMFxcz}U#4%%PK{TBViVaO%RfI|UuL2n~{ zpC)Z2JILhsB)?A#I0|uXJ%B2KRf1ulRtc zVOU31MyUel?MywTo!rKM1Zk6bx8i;7PSnp>tv&j zl)>7BE)QvZm*KyT$lZS6+xIDYEKJ7P4siW6lw@=rmLALGZ9*|4i=C6eUO;!FE=Dc( z*>7(pHkda)0DFceYQat4ww-cg!JnGpR--HL9@|M&&p(gXX>7yXZ9yCF28-Lw_Ma{9 z0Plz8YvIbJ%dyLEWC>(&t9H>A9wMfe+~Kl@pWOKd2Pgc!ab@e=5W6ggbyU@$m1|T2 zaAuWw@N)4lv=BM*gFdsGnPX+?P$aG@7vZ%5XJyP>Lrb>aOvL;1>d%_X*N&zz+|tcy zp-~_D5@Oi%HQS1#5Gid`{OQaWr1YTqXIuN+Kn9$s@eVVazXqwQiad$pUdEAnIVn%^ z_6twj%oe}=@Hu*EwtQ+d&ySOzV5xnMb$^5{j?Y+Rn%$eL&>NGN@hEaf*E(;}WQU`_ zQ!85ai%ZT@_niEC@@#uWVw|dw#+Qt8ogFGf7YmubUDi$8)9S`78Bx`f@a4S%?X{O! zosCwPM{k)UvLdvr_tK-P6K$Q4{P961v;8y&kHX5aL9_7V);&)_1Rxhr`v!cYFx0{45w^AexZr3mYr*|n%rS_9G zC)x0-JbWg(kQJE=6m1FDS|#|5*kP)S&M^!|PpjCRl(2f`M-{)9Yb%c#l4Y3X)FkMZ zUzlk%Dqiv-mS^zJ){p~5F8IYEnDlBTpR3x}h0B0?R9Y4x;%k+E+$h725Qs4*`$g(o`@r*G%u1U) zLWgHS?bG`#YAUjdZ(n!yp3Ku+ePNh1ar3he@iU-yYy-(W^SZ)})YVd;x`}ILx~mRo zywRJ9nOi9R;@drZ|3zX9v9lF71B&>wlK5-K0;ZwsWFEAJ%FnU=)wkB4S9d9MbexHg zh?{Fp?K65La3ovt@w_9e)0l$|g%xx0`zjN|f?-Af#2c`glbI)ZC@%L1ZFG#^Zxum2 z=#~^gQVgYJ0m)M`w0GC>)rlHp0yU8C_L)A}J9^O%_M~0S-kO--;F&H9Q=)L!r6uxu zhs2t;*Rsz-76~@;mhrD10BXyqhqgaiO5G zlUfE<^Xw!q4d;ZAH%PX=KH>%b?6&@;Ex+BBkh!jyOI2{JQs^(i_;tlKpwvh01*SSo zrM=1#I{)(DzM>~QCPCTjo%(2Z)s4oE<1_h&-U8|=g*rFz=^ZBCk>&sR_l4H=Ze#G9cXA6$75_~sc(1^{^PCq3PfmXL_2d5~6%RgTzIjr!aI$paW3uT&s>f1MpkOQ| zUKV6fc=2opMe*;!=TYoS-l!xrk95HCh5v@P2PBDRGYn1~L80$M_IH_1&oAZ;DpFfW z!UZ2Lnquu=vxXmoD8==J&#-5y)i$9&;uWj=X@2S!%F&YqczmY?b5|YmjE~nf?1PG? zN^IZbIX)B`k6>_kx-lB&!r>gWXkxy#?s)W+g)jf*FDDlHd8HhIq)iK3ZB_ol8_uIO zM33hEj;!rp{VVYb*q)QAn-b%D+5!) z+g~4X^N;-Q`@afrYxlB3DxOu4Y&77U^pzH-`Xp2M@fcm?ot*3Sj$!ivwQxz1Agz(9n|mp#Gvn`jcrBS;1jB zs@Sugibpi&bp-){KeG}Y?#chnw0{xP;17Z|VT1PTksxp4$D2UKr-&zKGBSmq?fCzm z_E%-N@0WOhlg}WtOxgEE{)bfFuQBQ`ieULonKa?E)o-xP0=cczdRU%em0a%@E$@+B zfsw5xIC}Xx35Y@Idbhys&vJt1Tb7~P%2a>B^(SXAT?h2S`IH{|jO#b8sp}%KXnEe?rN%C_KN1ZYq>l|8evqll}Cs zX#YXFoXg@@rK5j|o1fnO6Ok9>0d~F6iD?0jZT$J&zkTDJ-#S{B8he7{+F}DQ| zY_MP_v*h#t48}PXIiS9jYMP1UQEB)K(tmtVy8;w~Y*`V^r#w~;`XAr@3vdN40I4cL zaa8{oIITM~OS8{$Rm#^6H7*n;fx zl2!Rg;@0{7_`JEfxxD=8bqSX}`9HhO{i1)w0_O>QCV?Qh-!Gp1s(X&;lb_p8j4@un z2NPJ%(fogz;qOOrj(c({GErht>}a2k!%b3t)t}_aZ@x=cU#0XUZuGzXPml}9i9T(C zq&d?bYAEvSsg8jo>T>>p!feNmh1!3DJn`1`d&U`x0-$39|BTze89GJaG<~Myv=
  • [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (**ModelOpt**, `nvidia-modelopt`) -provides end-to-end model optimization for -NVIDIA hardware including quantization (real or simulated), sparsity, knowledge distillation, pruning, -neural architecture search, and speulative decoding. +provides end-to-end model optimization for NVIDIA hardware including quantization (real or simulated), +knowledge distillation, pruning, speculative decoding, and more. ## Major Features -- Start from Hugging Face pretrained model checkpoint with on-the-fly conversion. +- Start from Hugging Face pretrained model checkpoint with on-the-fly conversion to Megatron-LM checkpoint format. - Support all kinds of model parallelism (TP, EP, ETP, PP). - Export to TensorRT-LLM, vLLM, and SGLang ready unified checkpoint. @@ -28,11 +27,14 @@ neural architecture search, and speulative decoding. | Model (`conf/`) | Quantization | EAGLE3 | Pruning (PP only) | Distillation | | :---: | :---: | :---: | :---: | :---: | -| `moonshotai/Kimi-K2-Instruct` | ✅ | ✅ | - | - | -| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | ✅ | - | - | -| `Qwen/Qwen3-{0.6B, 8B}` | ✅ | ✅ | ✅ | ✅ | | `deepseek-ai/DeepSeek-R1` | ✅ | ✅ | - | - | | `meta-llama/Llama-{3.1-8B, 3.1-405B, 3.2-1B}-Instruct` | ✅ | ✅ | ✅ | ✅ | +| `meta-llama/Llama-4-{Scout,Maverick}-17B-{16,128}E-Instruct` | ✅ | ✅ | - | - | +| `moonshotai/Kimi-K2-Instruct` | ✅ | ✅ | - | - | +| `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | ✅ | - | ✅ | ✅ | +| `openai/gpt-oss-{20b, 120b}` | ✅ | **Online** | ✅ | ✅ | +| `Qwen/Qwen3-{0.6B, 8B}` | ✅ | ✅ | ✅ | ✅ | +| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | ✅ | ✅ | ✅ | ## Getting Started in a Local Environment @@ -43,6 +45,10 @@ pip install -U nvidia-modelopt Alternatively, you can install from [source](https://github.com/NVIDIA/TensorRT-Model-Optimizer) to try our latest features. +> **❗ IMPORTANT:** The first positional argument (e.g. `meta-llama/Llama-3.2-1B-Instruct`) of each script +> is the config name used to match the supported model config in `conf/`. The pretrained HF checkpoint should +> be downloaded and provided through `${HF_MODEL_CKPT}`. + ### ⭐ NVFP4 Quantization, Qauntization-Aware Training, and Model Export @@ -55,7 +61,7 @@ provide `${EXPORT_DIR}` to `export.sh`. > low-precision numerical behavior (fake-quant) which can be run on GPUs with compute > 80. > Real low-precision paramters (e.g. `E4M3` or `E2M1`) > and low-precision compute (e.g. `FP8Linear`) are also supported depending on GPU compute capability. -> **See [Adanvanced Topics](advanced.md) for details**. +> **See [Adanvanced Topics](./ADVANCED.md) for details**. ```sh \ @@ -72,31 +78,6 @@ provide `${EXPORT_DIR}` to `export.sh`. ./export.sh meta-llama/Llama-3.2-1B-Instruct ``` -> **❗ IMPORTANT:** The first positional arugment (e.g. `meta-llama/Llama-3.2-1B-Instruct`) of each script -> is the config name used to match the supported model config in `conf/`. The pretrained checkpoint should -> be downloaded and provided through `${HF_MODEL_CKPT}`. - -Loading the saved distributed checkpoint, the quantized Megatron model can be resumed for inference -(generate or evaluate) or training (SFT or PEFT). To read more about these features, see -[Adanvanced Topics](advanced.md). To learn more about the design, see our [Design]() document [WIP]. - -```sh -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./generate.sh meta-llama/Llama-3.2-1B-Instruct - -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./mmlu.sh meta-llama/Llama-3.2-1B-Instruct - -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./finetune.sh meta-llama/Llama-3.2-1B-Instruct -``` - ### ⭐ Online BF16 EAGLE3 Training Online EAGLE3 training has both the target (frozen) and draft models in the memory where the `hidden_states` @@ -119,19 +100,23 @@ deployment. ./export.sh meta-llama/Llama-3.2-1B-Instruct ``` -See [Adanvanced Topics](ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`. +See [Adanvanced Topics](./ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`. ### ⭐ Pruning Checkout pruning getting started section and guidelines for configuring pruning parameters in the [ModelOpt pruning README](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/pruning). -Pruning is supported for GPT and Mamba models. Available pruning options are: +Pruning is supported for GPT and Mamba models in Pipeline Parallel mode. Available pruning dimensions are: + - `TARGET_FFN_HIDDEN_SIZE` - `TARGET_HIDDEN_SIZE` - `TARGET_NUM_ATTENTION_HEADS` - `TARGET_NUM_QUERY_GROUPS` - `TARGET_MAMBA_NUM_HEADS` - `TARGET_MAMBA_HEAD_DIM` +- `TARGET_NUM_MOE_EXPERTS` +- `TARGET_MOE_FFN_HIDDEN_SIZE` +- `TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE` - `TARGET_NUM_LAYERS` - `LAYERS_TO_DROP` (comma separated, 1-indexed list of layer numbers to directly drop) @@ -142,12 +127,44 @@ PP=1 \ TARGET_NUM_LAYERS=24 \ HF_MODEL_CKPT= \ MLM_MODEL_SAVE=Qwen3-8B-Pruned \ -./prune.sh qwen/Qwen3-8B +./prune.sh Qwen/Qwen3-8B ``` > [!TIP] > If number of layers in the model is not divisible by pipeline parallel size (PP), you can configure uneven > PP by setting `MLM_EXTRA_ARGS="--decoder-first-pipeline-num-layers --decoder-last-pipeline-num-layers "` +> [!TIP] +> You can reuse pruning scores for pruning same model again to different architectures by setting +> `PRUNE_ARGS="--pruning-scores-path "` + +> [!NOTE] +> When loading pruned M-LM checkpoint for subsequent steps, make sure overwrite the pruned parameters in the +> default `conf/` by setting `MLM_EXTRA_ARGS`. E.g.: for loading above pruned Qwen3-8B checkpoint for mmlu, set: +> `MLM_EXTRA_ARGS="--num-layers 24"` + +### ⭐ Inference and Training + +The saved Megatron-LM distributed checkpoint (output of above scripts) can be resumed for inference +(generate or evaluate) or training (SFT or PEFT). To read more about these features, see +[Advanced Topics](./ADVANCED.md). + +```sh +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./generate.sh meta-llama/Llama-3.2-1B-Instruct + +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./mmlu.sh meta-llama/Llama-3.2-1B-Instruct + +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./finetune.sh meta-llama/Llama-3.2-1B-Instruct +``` + ## Advanced Usage TBD diff --git a/examples/post_training/modelopt/conf/qwen/Qwen2.5-0.5B-Instruct.sh b/examples/post_training/modelopt/conf/Qwen/Qwen2.5-0.5B-Instruct.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen2.5-0.5B-Instruct.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen2.5-0.5B-Instruct.sh diff --git a/examples/post_training/modelopt/conf/qwen/Qwen2.5-7B-Instruct.sh b/examples/post_training/modelopt/conf/Qwen/Qwen2.5-7B-Instruct.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen2.5-7B-Instruct.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen2.5-7B-Instruct.sh diff --git a/examples/post_training/modelopt/conf/qwen/Qwen3-0.6B.sh b/examples/post_training/modelopt/conf/Qwen/Qwen3-0.6B.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen3-0.6B.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen3-0.6B.sh diff --git a/examples/post_training/modelopt/conf/qwen/Qwen3-235B-A22B.sh b/examples/post_training/modelopt/conf/Qwen/Qwen3-235B-A22B.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen3-235B-A22B.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen3-235B-A22B.sh diff --git a/examples/post_training/modelopt/conf/qwen/Qwen3-30B-A3B.sh b/examples/post_training/modelopt/conf/Qwen/Qwen3-30B-A3B.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen3-30B-A3B.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen3-30B-A3B.sh diff --git a/examples/post_training/modelopt/conf/qwen/Qwen3-8B.sh b/examples/post_training/modelopt/conf/Qwen/Qwen3-8B.sh similarity index 100% rename from examples/post_training/modelopt/conf/qwen/Qwen3-8B.sh rename to examples/post_training/modelopt/conf/Qwen/Qwen3-8B.sh diff --git a/examples/post_training/modelopt/conf/arguments.sh b/examples/post_training/modelopt/conf/arguments.sh index f29e0a9d989..0193bf8b643 100644 --- a/examples/post_training/modelopt/conf/arguments.sh +++ b/examples/post_training/modelopt/conf/arguments.sh @@ -1,3 +1,6 @@ +#!/bin/bash +set -e + MLM_MODEL_CFG=$1 # Bash coloring diff --git a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh new file mode 100644 index 00000000000..4f301f31c1d --- /dev/null +++ b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +HF_MODEL_CKPT=/workspace/scratch/moonshotai/Kimi-K2-Instruct +TP=8 +ETP=1 +EP=64 + diff --git a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh new file mode 100644 index 00000000000..73ee80a6d93 --- /dev/null +++ b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +HF_MODEL_CKPT=/workspace/scratch/moonshotai/Kimi-K2-Instruct + +MLM_EXTRA_ARGS=" \ + --decoder-first-pipeline-num-layers 3 \ + --decoder-last-pipeline-num-layers 2 \ + --init-model-with-meta-device \ + --use-cpu-initialization \ + +" + +# Layer distribution over PP: 3, [4] * 14, 2. +PP=16 + diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh deleted file mode 100644 index d6ba1e1dcc4..00000000000 --- a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -if [ -z ${HF_MODEL_CKPT} ]; then - HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base - TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base -else - TOKENIZER_MODEL=${HF_MODEL_CKPT} -fi - -MODEL_ARGS=" \ - --save-interval 100000 \ - --micro-batch-size 1 \ - --bf16 \ - --no-masked-softmax-fusion \ - --disable-bias-linear \ - --untie-embeddings-and-output-weights \ - --position-embedding-type none \ - --no-rope-fusion \ - --normalization RMSNorm \ - --squared-relu \ - --num-layers 56 \ - --hidden-size 4480 \ - --ffn-hidden-size 15680 \ - --num-attention-heads 40 \ - --kv-channels 128 \ - --group-query-attention \ - --num-query-groups 8 \ - --hybrid-override-pattern M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M- \ - --is-hybrid-model \ - --mamba-head-dim 80 \ - --mamba-num-heads 128 \ - --mamba-num-groups 8 \ - --mamba-state-dim 128 \ - --seq-length 4096 \ - --max-position-embeddings 131072 \ - --tokenizer-type HuggingFaceTokenizer \ - --make-vocab-size-divisible-by 1 \ - --use-mcore-models \ - --export-model-type MambaModel \ - --padded-vocab-size 131072 \ -" diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh new file mode 120000 index 00000000000..3771c930263 --- /dev/null +++ b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh @@ -0,0 +1 @@ +NVIDIA-Nemotron-Nano-9B-v2.sh \ No newline at end of file diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh new file mode 100644 index 00000000000..d6ba1e1dcc4 --- /dev/null +++ b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +if [ -z ${HF_MODEL_CKPT} ]; then + HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base + TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base +else + TOKENIZER_MODEL=${HF_MODEL_CKPT} +fi + +MODEL_ARGS=" \ + --save-interval 100000 \ + --micro-batch-size 1 \ + --bf16 \ + --no-masked-softmax-fusion \ + --disable-bias-linear \ + --untie-embeddings-and-output-weights \ + --position-embedding-type none \ + --no-rope-fusion \ + --normalization RMSNorm \ + --squared-relu \ + --num-layers 56 \ + --hidden-size 4480 \ + --ffn-hidden-size 15680 \ + --num-attention-heads 40 \ + --kv-channels 128 \ + --group-query-attention \ + --num-query-groups 8 \ + --hybrid-override-pattern M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M- \ + --is-hybrid-model \ + --mamba-head-dim 80 \ + --mamba-num-heads 128 \ + --mamba-num-groups 8 \ + --mamba-state-dim 128 \ + --seq-length 4096 \ + --max-position-embeddings 131072 \ + --tokenizer-type HuggingFaceTokenizer \ + --make-vocab-size-divisible-by 1 \ + --use-mcore-models \ + --export-model-type MambaModel \ + --padded-vocab-size 131072 \ +" diff --git a/examples/post_training/modelopt/convert_model.py b/examples/post_training/modelopt/convert_model.py index 9790d73fc4c..20ee59a2fe0 100644 --- a/examples/post_training/modelopt/convert_model.py +++ b/examples/post_training/modelopt/convert_model.py @@ -162,17 +162,7 @@ def check_arguments(): if eagle_module is not None: mcore_eagle_state_dict = torch.load(args.extra_model_path) eagle_module.load_state_dict(mcore_eagle_state_dict, strict=False) - - # Add mask tokens for parallel draft - if unwrapped_model.eagle_config.parallel_draft_step > 1: - assert unwrapped_model.eagle_config.parallel_draft_step <= 4, "Parallel draft only supports steps less than or equal to 4." - tokenizer = get_tokenizer() - for i in range(unwrapped_model.eagle_config.parallel_draft_step - 1): - mask_token = "[MASK_{}]".format(i) - tokenizer._tokenizer.add_tokens([mask_token], special_tokens=True) - token_id = tokenizer._tokenizer.convert_tokens_to_ids(mask_token) - setattr(unwrapped_model, "mask_token_{}".format(i), torch.tensor(token_id)) - + elif args.algorithm == "medusa": config = {"medusa_num_heads": args.export_num_medusa_heads, "medusa_num_layers": 1} unwrapped_model = mtsp.convert(unwrapped_model, [("medusa", config)]) diff --git a/examples/post_training/modelopt/finetune.py b/examples/post_training/modelopt/finetune.py index bd0569bb513..6489d394392 100755 --- a/examples/post_training/modelopt/finetune.py +++ b/examples/post_training/modelopt/finetune.py @@ -167,7 +167,7 @@ def __init__( hf_dataset_kwargs = SFTDataset.hf_dataset_to_kwargs.get( self.hf_dataset, {"split": "train"} ) - self._raw_samples = datasets.load_dataset(self.hf_dataset, **hf_dataset_kwargs) + self._raw_samples = datasets.load_dataset(self.hf_dataset, token=os.environ.get("HF_TOKEN", None), **hf_dataset_kwargs) self._raw_samples = self._raw_samples.shard( num_shards=self.num_shards, index=shard_index ) @@ -455,7 +455,10 @@ def non_loss_data_func(model: GPTModel): """Callback to compute the acceptance length.""" args = get_args() if not args.export_offline_model: - report_draft_acceptance_length(model) + try: + report_draft_acceptance_length(model) + except Exception as e: + print(e) diff --git a/examples/post_training/modelopt/finetune.sh b/examples/post_training/modelopt/finetune.sh index 0579dd69157..21493697374 100755 --- a/examples/post_training/modelopt/finetune.sh +++ b/examples/post_training/modelopt/finetune.sh @@ -14,6 +14,7 @@ MLM_DEFAULT_ARGS=" \ --distributed-timeout-minutes 30 \ --auto-detect-ckpt-format \ --export-te-mcore-model \ + --finetune \ " @@ -67,6 +68,8 @@ if [ -z ${MLM_EVAL_ARGS} ]; then " fi +export HF_TOKEN=${HF_TOKEN} + ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/finetune.py \ ${MODEL_ARGS} \ --tensor-model-parallel-size ${TP} \ diff --git a/examples/post_training/modelopt/prune.py b/examples/post_training/modelopt/prune.py index 7819b2ed2af..6a0178a1420 100644 --- a/examples/post_training/modelopt/prune.py +++ b/examples/post_training/modelopt/prune.py @@ -20,6 +20,7 @@ from modelopt.torch.export import import_mcore_gpt_from_hf from modelopt.torch.prune.plugins.mcore_minitron import SUPPORTED_HPARAMS +from megatron.core.parallel_state import get_pipeline_model_parallel_group, get_tensor_model_parallel_group from megatron.post_training.arguments import add_modelopt_args from megatron.post_training.checkpointing import load_modelopt_checkpoint from megatron.post_training.generate import simple_generate @@ -91,6 +92,21 @@ def add_prune_args(parser): type=int, help="Prune dimension of Mamba attention heads to this value", ) + group.add_argument( + "--target-num-moe-experts", + type=int, + help="Prune number of MoE experts to this value", + ) + group.add_argument( + "--target-moe-ffn-hidden-size", + type=int, + help="Prune MoE FFN hidden size to this value", + ) + group.add_argument( + "--target-moe-shared-expert-intermediate-size", + type=int, + help="Prune MoE shared expert intermediate size to this value", + ) group.add_argument( "--target-num-layers", type=int, @@ -104,6 +120,12 @@ def add_prune_args(parser): nargs="*", help="Drop specific model layers (1-indexed). Cannot be used with rest of the pruning options", ) + group.add_argument( + "--pruning-scores-path", + type=str, + default=None, + help="Path to the cache and reuse pruning scores for pruning again to different params", + ) add_modelopt_args(parser) return parser @@ -125,6 +147,14 @@ def get_calib_dataloader(calib_size=1024, max_sequence_length=512): yield dataset[i][text_column][:max_sequence_length] +def get_params(model): + params = sum(p.numel() for p in model.parameters()) + reduced_params = torch.Tensor([params]).to(device=next(model.parameters()).device) + torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group()) + torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group()) + return reduced_params.item() + + if __name__ == "__main__": initialize_megatron( extra_args_provider=add_prune_args, @@ -181,7 +211,7 @@ def _hf_dataset_forword_loop_func(model): simple_generate(model, tokens.input_ids.cuda(), osl=1) if args.layers_to_drop: - mtp.plugins.drop_mcore_language_model_layers(model, layers_to_drop=args.layers_to_drop) + mtp.mcore_minitron.drop_mcore_language_model_layers(model, layers_to_drop=args.layers_to_drop) else: print_rank_0("Pruning model...") export_config = { @@ -189,18 +219,22 @@ def _hf_dataset_forword_loop_func(model): for k in SUPPORTED_HPARAMS if getattr(args, f"target_{k}", None) is not None } + config = {"forward_loop": _hf_dataset_forword_loop_func} + if args.pruning_scores_path is not None: + config["scores_path"] = args.pruning_scores_path mtp.prune( unwrapped_model, mode="mcore_minitron", constraints={"export_config": export_config}, dummy_input=None, # Not used - config={"forward_loop": _hf_dataset_forword_loop_func}, + config=config, ) # [WAR till modelopt 0.39]: Remove prune state to avoid converting again on restore which forces TP=1. if mto.ModeloptStateManager.has_state_for_mode_type("prune", model=unwrapped_model): mto.ModeloptStateManager.remove_state(unwrapped_model) print_rank_0(f"Pruned Model:\n {unwrapped_model}") + print_rank_0(f"Pruned Model Params: {get_params(unwrapped_model)/1e9:.2f}B") _custom_prompt_forward_loop_func(unwrapped_model) diff --git a/examples/post_training/modelopt/prune.sh b/examples/post_training/modelopt/prune.sh index ef86260b062..33f3e615e96 100755 --- a/examples/post_training/modelopt/prune.sh +++ b/examples/post_training/modelopt/prune.sh @@ -23,23 +23,27 @@ MLM_DEFAULT_ARGS=" # Example: export LAYERS_TO_DROP="1 5 10" # Define pruning argument mappings: "env_var:cli_arg" -PRUNE_ARG_MAPPINGS=( - "TARGET_FFN_HIDDEN_SIZE:--target-ffn-hidden-size" - "TARGET_HIDDEN_SIZE:--target-hidden-size" - "TARGET_NUM_ATTENTION_HEADS:--target-num-attention-heads" - "TARGET_NUM_QUERY_GROUPS:--target-num-query-groups" - "TARGET_MAMBA_NUM_HEADS:--target-mamba-num-heads" - "TARGET_MAMBA_HEAD_DIM:--target-mamba-head-dim" - "TARGET_NUM_LAYERS:--target-num-layers" - "LAYERS_TO_DROP:--layers-to-drop" +# List of environment variables we want to check for pruning CLI args +PRUNE_ENV_VARS=( + TARGET_FFN_HIDDEN_SIZE + TARGET_HIDDEN_SIZE + TARGET_NUM_ATTENTION_HEADS + TARGET_NUM_QUERY_GROUPS + TARGET_MAMBA_NUM_HEADS + TARGET_MAMBA_HEAD_DIM + TARGET_NUM_MOE_EXPERTS + TARGET_MOE_FFN_HIDDEN_SIZE + TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE + TARGET_NUM_LAYERS + LAYERS_TO_DROP ) -# Build arguments from environment variables -PRUNE_ARGS="" -for mapping in "${PRUNE_ARG_MAPPINGS[@]}"; do - env_var="${mapping%%:*}" - cli_arg="${mapping##*:}" +# Build arguments from environment variables (TARGET_NUM_LAYERS -> --target-num-layers, etc.) +PRUNE_ARGS=${PRUNE_ARGS:-""} +for env_var in "${PRUNE_ENV_VARS[@]}"; do if [ ! -z "${!env_var}" ]; then + # prepend --, convert to lowercase, replace _ with - + cli_arg="--$(echo "${env_var}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')" PRUNE_ARGS="${PRUNE_ARGS} ${cli_arg} ${!env_var}" fi done @@ -59,6 +63,9 @@ else LOAD_ARGS="--load ${MLM_MODEL_CKPT}" fi + +set -ex + ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/prune.py \ ${MODEL_ARGS} \ ${LOAD_ARGS} \ @@ -67,6 +74,5 @@ ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/prune.py \ --tokenizer-model ${TOKENIZER_MODEL} \ --save ${MLM_MODEL_SAVE} \ --references "${MLM_REF_LABEL}" \ - --calib-size 1024 \ ${PRUNE_ARGS} \ ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} diff --git a/examples/post_training/modelopt/slurm/env_setup_template.sh b/examples/post_training/modelopt/slurm/env_setup_template.sh new file mode 100644 index 00000000000..12b59f06eed --- /dev/null +++ b/examples/post_training/modelopt/slurm/env_setup_template.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +HF_MODEL_CKPT=/workspace/scratch/meta-llama/Llama-3.2-1B-Instruct +TP=1 +ETP=1 +EP=1 +PP=1 diff --git a/examples/post_training/modelopt/slurm/sbatch.sh b/examples/post_training/modelopt/slurm/sbatch.sh new file mode 100644 index 00000000000..3916c5de2b5 --- /dev/null +++ b/examples/post_training/modelopt/slurm/sbatch.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +#SBATCH -A +#SBATCH -p +#SBATCH --job-name= +#SBATCH --nodes=1 --ntasks-per-node=8 --gpus-per-node=8 +#SBATCH -t 04:00:00 +#SBATCH --exclusive --mem=0 --overcommit + +# Bash coloring +RED='\033[0;31m' +YELLOW='\033[0;33m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +WHITE='\033[0;37m' + +# Predefined logging +MLM_ERROR="${RED}ERROR: ${WHITE}" +MLM_WARNING="${YELLOW}WARNING:${WHITE}" + +# CHANGE THE FOLLOWING TO YOUR DATA, MEGATRON, and CHECKPOINT DIR +if [[ -z ${USER_FSW} ]]; then + printf "${MLM_ERROR} Variable USER_FSW (read/write scratch space) must be set!\n" + exit 1 +fi + +if [ -z ${SANDBOX_DIR} ]; then + SANDBOX_DIR="$(pwd)" + printf "${MLM_WARNING} Variable SANDBOX_DIR not set! (default: ${SANDBOX_DIR})\n" +fi + +if [ -z ${SANDBOX_ENV_SETUP} ]; then + SANDBOX_ENV_SETUP=./env_setup_template.sh + printf "${MLM_WARNING} Variable SANDBOX_ENV_SETUP not set! (default: ${SANDBOX_ENV_SETUP})\n" +fi + +if [ -z ${CONTAINER_IMAGE} ]; then + CONTAINER_IMAGE="nvidia-modelopt-megatron:latest" + printf "${MLM_WARNING} Variable CONTAINER_IMAGE not set! (default: ${CONTAINER_IMAGE})\n" +fi + +if [ -z ${LAUNCH_SCRIPT} ]; then + LAUNCH_SCRIPT="python" + printf "${MLM_WARNING} Variable LAUNCH_SCRIPT not set! (default: ${LAUNCH_SCRIPT})\n" +fi + +# DO NOT MODIFY THE VALUES BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! +DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` + +CONTAINER_MOUNT="${SANDBOX_DIR}:/workspace/nmm-sandbox,${USER_FSW}:/workspace/scratch" + +srun -l \ + --mpi=pmix \ + --output=%x_%j_$DATETIME.log \ + --container-image ${CONTAINER_IMAGE} \ + --container-workdir "/workspace/nmm-sandbox" \ + --container-mounts ${CONTAINER_MOUNT} \ + --export "HF_MODEL_CKPT=${HF_MODEL_CKPT},SANDBOX_ENV_SETUP=${SANDBOX_ENV_SETUP},LAUNCH_SCRIPT=${LAUNCH_SCRIPT}" \ + bash ${1} + +set +x + diff --git a/examples/post_training/modelopt/validate.sh b/examples/post_training/modelopt/validate.sh index 90ff4810117..796231e508e 100644 --- a/examples/post_training/modelopt/validate.sh +++ b/examples/post_training/modelopt/validate.sh @@ -16,8 +16,9 @@ if [ -z ${MLM_MODEL_CKPT} ]; then fi if [ -z ${PROMPTS_PATH} ]; then - printf "${MLM_ERROR} Variable ${PURPLE}PROMPTS_PATH${WHITE} must be set!\n" - exit 1 + PROMPT_ARGS="" +else + PROMPT_ARGS="--prompts-path ${PROMPTS_PATH}" fi if [ -z ${STEPS} ]; then @@ -40,6 +41,7 @@ if [ -z ${OSL} ]; then STEPS=64 fi +export HF_TOKEN=${HF_TOKEN} ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/validate.py \ ${MODEL_ARGS} \ @@ -49,9 +51,9 @@ ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/validate.py \ --pipeline-model-parallel-size ${PP} \ --tokenizer-model ${TOKENIZER_MODEL} \ --load ${MLM_MODEL_CKPT} \ - --prompts-path ${PROMPTS_PATH} \ --steps ${STEPS} \ --osl ${OSL} \ + ${PROMPT_ARGS} \ ${GT_ARGS} \ ${SAVE_ARGS} \ ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} diff --git a/gpt_builders.py b/gpt_builders.py index 9fa1aff72c7..2ef41846f2c 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -5,6 +5,7 @@ get_gpt_decoder_block_spec, get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, + get_gpt_layer_with_inference_spec, get_gpt_mtp_block_spec, get_gpt_decoder_layer_specs, ) @@ -43,6 +44,7 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): use_te = args.transformer_impl == "transformer_engine" if args.num_experts or (args.linear_attention_type is not None): + assert not (config.transformer_impl == "inference_optimized") # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( config, @@ -52,12 +54,14 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): vp_stage=vp_stage, ) elif args.heterogeneous_layers_config_path is not None: + assert not (config.transformer_impl == "inference_optimized") transformer_layer_spec = get_gpt_heterogeneous_layer_spec(config, use_te) else: # Define the decoder layer spec transformer_layer_spec = _get_transformer_layer_spec(use_te, config) mtp_block_spec = None if args.mtp_num_layers is not None: + assert not (config.transformer_impl == "inference_optimized") # Get GPT decoder layer specs for the model. if args.spec is not None: mtp_transformer_layer_spec = import_module(args.spec) @@ -120,6 +124,12 @@ def _get_transformer_layer_spec(use_te, config): use_kitchen=config.use_kitchen, fallback_to_eager_attn=config.fallback_to_eager_attn, ) + elif config.transformer_impl == "inference_optimized": + return get_gpt_layer_with_inference_spec( + args.qk_layernorm, + args.multi_latent_attention, + qk_l2_norm=args.qk_l2_norm, + ) else: return get_gpt_layer_local_spec( args.num_experts, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index d6ef5f6210e..8a63e0f5cf7 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -898,9 +898,10 @@ def forward_hook(_module, inputs, output): # Register pre state_dict hook to ensure that the module parameters are # distributed before saving the state_dict. - self._state_dict_pre_hook = self.module.register_state_dict_pre_hook( - lambda *args, **kwargs: self._replace_param_with_distributed_if_needed() - ) + for name, module in self.named_modules(): + module.register_state_dict_pre_hook( + lambda *args, **kwargs: self._replace_param_with_distributed_if_needed() + ) @contextmanager def no_sync(self): diff --git a/megatron/core/fusions/fused_pad_routing_map.py b/megatron/core/fusions/fused_pad_routing_map.py index c382178b6c9..8e4d1763270 100644 --- a/megatron/core/fusions/fused_pad_routing_map.py +++ b/megatron/core/fusions/fused_pad_routing_map.py @@ -6,7 +6,7 @@ from packaging import version from megatron.core.jit import jit_fuser -from megatron.core.utils import null_decorator +from megatron.core.utils import experimental_fn, null_decorator try: import triton @@ -70,6 +70,7 @@ def _pad_routing_map_kernel( tl.store(output_row_ptr + token_indices, output_row, mask=token_mask) +@experimental_fn(introduced_with_version="0.13.0") @jit_fuser def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> torch.Tensor: """Fused version of pad_routing_map. diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py index 18fbb18f2f0..a5bfe75fbb6 100644 --- a/megatron/core/inference/communication_utils.py +++ b/megatron/core/inference/communication_utils.py @@ -71,8 +71,7 @@ def broadcast_from_last_pipeline_stage( tensor.shape ), f"Expected tensor of shape {size} but got {list(tensor.shape)}" assert dtype == tensor.dtype, f"Expected tensor of type {dtype} but got {tensor.dtype}" - _is_cuda(tensor) - assert tensor.is_contiguous() + _is_cuda_contiguous(tensor) else: tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device()) diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py index e9cd99a6c48..ecb0296559f 100644 --- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py +++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py @@ -1,8 +1,28 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +from dataclasses import dataclass +from typing import List, Optional, Tuple + import torch +@dataclass +class MambaInferenceStateConfig: + """Config for initializing Mamba model inference state tensors.""" + + layer_type_list: List[str] + """ + A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. + See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. + """ + + mamba_conv_states_shape: Tuple[int] + """Mamba conv states shape per request.""" + + mamba_ssm_states_shape: Tuple[int] + """Mamba ssm states shape per request.""" + + class MambaMetadata: """Manages the metadata tensors required for Mamba layers during inference.""" @@ -64,7 +84,7 @@ def update_cudagraph_mapping( """ self.request_to_mamba_state_idx_cudagraph_only[0:num_active_requests] = active_mamba_indices - def allocate_slot(self) -> int: + def allocate_slot(self) -> Optional[int]: """ Allocates a new slot for a request in the Mamba state buffers. diff --git a/megatron/core/inference/contexts/dynamic_block_allocator.py b/megatron/core/inference/contexts/dynamic_block_allocator.py index 4baa3f5212c..026ee47d094 100644 --- a/megatron/core/inference/contexts/dynamic_block_allocator.py +++ b/megatron/core/inference/contexts/dynamic_block_allocator.py @@ -13,60 +13,86 @@ class BlockAllocator: - Initializing a pool of block IDs - Allocating blocks from the pool - Releasing blocks back to the pool - - Managing the guaranteed block count for active requests Args: - block_count_total (int): Total number of blocks available in the buffer. - gtd_block_count (int): Number of blocks reserved for guaranteed requests. + context (DynamicInferenceContext): Dynamic inference context. + active_count (int): Total number of active blocks available in the buffer. + The full buffer size is 2*active_count, to accommodate an equal-size + space for paused requests that live on the CPU. """ - def __init__(self, block_count_total: int, gtd_block_count: int): - self.block_count_total = block_count_total - self.gtd_block_count = gtd_block_count + def __init__(self, context: "DynamicInferenceContext", total_count: int): - # Reserve last block ID as dummy block for decode-only inference steps - self.block_count_avail = self.block_count_total - 1 - self.dummy_block_idx = self.block_count_total - 1 + self.context = context + + active_count = (total_count - 1) // 2 # -1 for dummy_block_idx (see below) + active_count = max(1, active_count) # need at least one block + self.total_count = 2 * active_count + 1 # +1 for dummy_block_idx + self.total_avail = self.total_count - 1 # -1 for dummy_block_idx + self.active_count = active_count + self.paused_count = self.total_count - self.active_count - 1 # -1 for dummy_block_idx + self.dummy_block_idx = self.total_count - 1 # Initialize block pool as a "stack" data structure self.block_bag = torch.arange( - self.block_count_total, dtype=torch.int32, device=torch.cuda.current_device() + self.total_count, dtype=torch.int32, device=torch.cuda.current_device() ) - def is_memory_available(self, num_blocks: int, safe: bool = False) -> bool: - """Check if memory blocks are available. + def __str__(self): + return ( + f"total avail {self.total_avail} / {self.total_count - 1}" + f"; active {self.active_count}" + ) - Use 'safe' to avoid all requests being deadlocked. A fraction of the KV cache - memory buffer is reserved to guarantee that a minimum number of active - requests can run on any given step. + def get_active_used(self): + """Compute number of active blocks used.""" + return ( + self.context.request_kv_block_counts[ + self.context.paused_request_count : self.context.total_request_count + ] + .sum() + .item() + ) + + def get_paused_used(self): + """Compute number of paused blocks used.""" + return ( + self.context.request_kv_block_counts[: self.context.paused_request_count].sum().item() + ) + + def get_active_avail(self): + """Compute number of active blocks available.""" + return self.active_count - self.get_active_used() + + def get_paused_avail(self): + """Compute number of paused blocks available.""" + return self.paused_count - self.get_paused_used() + + def is_memory_available(self, num_blocks: int) -> bool: + """Check if memory blocks are available. Args: num_blocks (int): Number of blocks to check. - safe (bool): Include extra space for guaranteeing ability to run - requests to completion. Return: (bool) Is memory available? """ - if safe: - return self.block_count_avail >= num_blocks + self.gtd_block_count - else: - return self.block_count_avail >= num_blocks + return self.get_active_avail() >= num_blocks - def allocate_memory_blocks(self, num_blocks: int = 1, safe: bool = False) -> Optional[Tensor]: + def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]: """Allocate memory blocks if available, else return None. Args: num_blocks (int): Number of blocks to allocate. - safe (bool): Include extra space for guaranteeing ability to run - requests to completion. Return: (Optional[Tensor]) Allocated block IDs. """ - if self.is_memory_available(num_blocks, safe): - self.block_count_avail -= num_blocks - return self.block_bag[self.block_count_avail : (self.block_count_avail + num_blocks)] + if self.is_memory_available(num_blocks): + self.total_avail -= num_blocks + block_ids = self.block_bag[self.total_avail : (self.total_avail + num_blocks)] + assert num_blocks == block_ids.numel() + return block_ids else: return None @@ -80,8 +106,8 @@ def release_memory_blocks(self, blocks: Tensor) -> None: None """ num_blocks = blocks.size(dim=0) - self.block_bag[self.block_count_avail : (self.block_count_avail + num_blocks)] = blocks - self.block_count_avail += num_blocks + self.block_bag[self.total_avail : (self.total_avail + num_blocks)] = blocks + self.total_avail += num_blocks def reset(self) -> None: """Reset the allocator to initial state. @@ -89,4 +115,4 @@ def reset(self) -> None: This resets the available block count to the entire memory pool (except for the dummy block). """ - self.block_count_avail = self.block_count_total - 1 + self.total_avail = self.total_count - 1 diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index 000b58200f8..d15daa90d10 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -1,5 +1,6 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import logging import math import warnings from contextlib import nullcontext @@ -23,14 +24,11 @@ from megatron.core.inference.utils import tensor_swap from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb from megatron.core.package_info import __version__ as mcore_version -from megatron.core.ssm.mamba_hybrid_layer_allocation import ( - Symbols, - get_layer_maps_from_layer_type_list, -) +from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list from megatron.core.transformer import TransformerConfig from megatron.core.utils import divide as core_divide -from .attention_context.mamba_metadata import MambaMetadata +from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata from .base_context import BaseInferenceContext from .dynamic_block_allocator import BlockAllocator @@ -113,7 +111,7 @@ class BlockOverflowError(ContextOverflowError): class ActiveRequestCountOverflowError(ContextOverflowError): '''Used when `initialize_attention_state()` is called with - `num_warmup_requests > max_requests.''' + `num_warmup_requests > max_active_requests.''' def __init__(self, max_request_count, active_request_count): assert active_request_count > max_request_count @@ -124,6 +122,13 @@ def __init__(self, max_request_count, active_request_count): ) +class TensorStateDeallocatedError(ContextOverflowError): + """Context's tensor state is currently deallocated, such as when the engine + has been suspended.""" + + pass + + class ContextErrorFactory: """Factory class for serializing/deserializing context errors.""" @@ -175,6 +180,15 @@ class WarmupEngineMode(Enum): NON_DECODE = "non_decode" +def get_mem_size_str(n_bytes: int) -> str: + """Convert number of bytes to human-readable string.""" + for exp, suffix in ((4, "TB"), (3, "GB"), (2, "MB"), (3, "KB"), (0, "bytes")): + nquery = int(1024**exp) + if round(n_bytes / nquery) >= 1: + return "%.3g %s" % (n_bytes / nquery, suffix) + raise Exception(f"something went wrong, n_bytes={n_bytes}.") + + # pylint: disable=line-too-long class DynamicInferenceContext(BaseInferenceContext): """Inference context that is passed to the main model in order @@ -185,64 +199,37 @@ class DynamicInferenceContext(BaseInferenceContext): arbitrary sequence length may be added, paused, or removed from the context at any step. The only constraint is the maximum number of requests or tokens that the context is defined to support. For the block-level KV cache, a memory - buffer is allocated up front (size `buffer_size_gb`), that is divided into - blocks and dynamically assigned to requests. At any given step, any unassigned - blocks equate to unused space. - - Additionally, a fraction of the memory buffer (`gtd_request_fraction`, i.e., - the 'guaranteed' request fraction) is reserved for guaranteeing that a - minimum number of active requests may continue to generate tokens on any step. - The reason for this is that the context manages two pools of requests: 1) - active requests, and 2) paused requests. Paused requests are requests where - insufficient memory blocks remain for future assignment, and these requests - are set aside until enough memory blocks are available. Active requests are - requests that have sufficient memory blocks to proceed with their generations. - - The situation can arise where all requests eventually become paused due to all - memory blocks being assigned. In this case, there are no active requests and - thus no progress can be made. To handle this case, a fraction of the memory - buffer is reserved that only allows active requests, and no paused requests. - This fraction must be carefully tuned, as it can have an order of magnitude - impact on overall latency. + buffer is allocated up front (size `buffer_size_gb` if `unified_memory_level` + == 0, or `2 * buffer_size_gb` if `unified_memory_level` == 1), that is + divided into blocks and dynamically assigned to requests. At any given step, + any unassigned blocks equate to unused space. Args: params_dtype (torch.dtype): Dtype used for KV cache. - num_layers (int): Number of layers. + num_layers (int): Number of layers on this pipeline parallel rank. kv_channels (int): Hidden dimension per attention head. num_attention_heads (int): Number of attention heads. max_sequence_length (int): Max possible sequence length (prompt + output) that will occur. - buffer_size_gb (float): Total buffer size (GB), shared by main and - fallback contexts. + buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache. + if `unified_memory_level` >= 1, then CPU memory is additionally + utilized, resulting in a total buffer size of `2 * buffer_size_gb`. + Regardless of total buffer size, the KV cache is conceptually divided + into 50% active requests and 50% paused requests. + max_tokens (int): Max number of tokens to use for forward passes. This is + primarily limited by prefill activation memory usage. (Defaults to + 16384). block_size_tokens (int): Size of KV cache block size. - buffer_guaranteed_fraction (float): Fraction of the memory buffer that is - reserved to guarantee that one or more active requests are able to - run to completion. Without reserving this memory, paused requests are - able to fill the memory buffer and block execution of any requests. - buffer_overflow_factor (Optional[float]): Scaling factor over the buffer - size for auto computing `max_requests` and `max_tokens`. This scaling - factor is used for fitting more requests and tokens in the memory - buffer than it can safely hold, which in turn increases throughput. - max_requests_override (Optional[int]): If set, overrides value computed - from `buffer_overflow_factor`. - max_tokens_override (Optional[int]): If set, overrides value computed - from `buffer_overflow_factor`. tensor_model_parallel_size (Optional[int]): Tensor model parallel size. num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture, - where the cuda graph batch sizes range from 1 to `max_requests` (as - computed below). Due to rounding, the actual number of cuda graphs may - not equal this argument. + where the cuda graph batch sizes range from 1 to `max_active_requests` + (as computed below). Due to rounding, the actual number of cuda graphs + may not equal this argument. materialize_only_last_token_logits (Optional[bool]): Whether to only materialize logits for the last token. This should be set to False if returning log probs. - layer_type_list (Optional[List[str]]): A list of strings that indicates - the layer type (Mamba / Attention / MLP) for each layer. - See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list - of symbols. This must be provided for hybrid models. - mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request. - This must be provided for hybrid models. - mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request. - This must be provided for hybrid models. + mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba + inference state config if the model is a hybrid model. use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode engine steps. unified_memory_level (Optional[int]): Set unified memory usage within the @@ -250,10 +237,17 @@ class DynamicInferenceContext(BaseInferenceContext): allocate `memory_buffer` in unified memory. Eventually, additional levels will be included to control other tensors within the context. use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation. - If None, defaults to using flash-infer if available. + If None, defaults to using flash-infer if available. metrics_writer (Optional['WandbModule']): Wandb module for writing metrics. + num_request_metadata (Optional[int]): Number of metadata fields to track per request. + These represent metadata that is needed by the text generation controller, + and that must be kept in sync with active requests through update_requests. """ + DEFAULT_MAX_TOKENS = 16384 + TOKEN_ROUNDER = 64 + REQUEST_ROUNDER = 4 + def __init__( self, *, @@ -263,24 +257,20 @@ def __init__( num_attention_heads: int, max_sequence_length: int, buffer_size_gb: float, - buffer_guaranteed_fraction: float, + max_tokens: int = DEFAULT_MAX_TOKENS, block_size_tokens: int = 256, - buffer_overflow_factor: Optional[float] = None, - max_requests_override: Optional[int] = None, - max_tokens_override: Optional[int] = None, tensor_model_parallel_size: Optional[int] = None, cache_mla_latent: bool = False, kv_lora_rank: Optional[int] = None, qk_pos_emb_head_dim: Optional[int] = None, num_cuda_graphs: Optional[int] = None, materialize_only_last_token_logits: Optional[bool] = True, - layer_type_list: Optional[List[str]] = None, - mamba_conv_states_shape: Optional[Tuple[int]] = None, - mamba_ssm_states_shape: Optional[Tuple[int]] = None, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, use_cuda_graphs_for_non_decode_steps: bool = True, use_flashinfer_fused_rope: bool = False, - unified_memory_level: Optional[int] = 0, + unified_memory_level: Optional[int] = 1, metrics_writer: Optional['WandbModule'] = None, + num_request_metadata: Optional[int] = None, ): super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits) @@ -298,36 +288,40 @@ def __init__( tp_size = parallel_state.get_tensor_model_parallel_world_size() else: tp_size = tensor_model_parallel_size - hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) - num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) + self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) + self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) # Mamba states. - self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list + self.is_hybrid_model = mamba_inference_state_config is not None if self.is_hybrid_model: + mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape + mamba_ssm_states_shape = mamba_inference_state_config.mamba_ssm_states_shape assert ( mamba_conv_states_shape is not None ), "`mamba_conv_states_shape` must be specified for hybrid models" assert ( mamba_ssm_states_shape is not None ), "`mamba_ssm_states_shape` must be specified for hybrid models" - assert ( - not use_cuda_graphs_for_non_decode_steps + assert not ( + num_cuda_graphs is not None and use_cuda_graphs_for_non_decode_steps ), "Non-decode CUDA graphs not yet supported for hybrid models" # For hybrid models, the layer map converts the global layer index to the # corresponding attention layer index or Mamba layer index depending on the # layer type. - attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list( - layer_type_list + attention_layer_map, mamba_layer_map, _, _ = get_layer_maps_from_layer_type_list( + mamba_inference_state_config.layer_type_list ) self.num_attention_layers = len(attention_layer_map) self.num_mamba_layers = len(mamba_layer_map) + self.mamba_conv_states_shape = mamba_conv_states_shape + self.mamba_ssm_states_shape = mamba_ssm_states_shape self.layer_map = attention_layer_map | mamba_layer_map else: # The layer map is the identity function for pure Transformer models. self.num_attention_layers = num_layers self.num_mamba_layers = 0 - (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None) + (self.mamba_conv_states_shape, self.mamba_ssm_states_shape) = (None, None) self.layer_map = {i: i for i in range(self.num_attention_layers)} if self.num_attention_layers == 0: @@ -340,10 +334,12 @@ def __init__( self.block_size_tokens = block_size_tokens if self.cache_mla_latent: # one vector c_t (rank) + optional RoPE phase slice - kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim - self.kv_reduced_dim = kv_reduced_dim + self.kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim self.block_size_bytes = ( - dtype_size_bytes * num_layers * self.block_size_tokens * kv_reduced_dim + dtype_size_bytes + * self.num_attention_layers + * self.block_size_tokens + * self.kv_reduced_dim ) else: self.block_size_bytes = ( @@ -351,62 +347,18 @@ def __init__( * 2 # key, value * self.num_attention_layers * self.block_size_tokens - * num_attention_heads_per_partition - * hidden_size_per_attention_head + * self.num_attention_heads_per_partition + * self.hidden_size_per_attention_head ) assert self.block_size_bytes > 0 - # Adjust buffer to be a multiple of block size. - buffer_size_bytes = int(buffer_size_gb * 1024**3) - buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes - buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem - mamba_states_memory_per_request = 0 if self.is_hybrid_model: - mamba_states_memory_per_request += math.prod(mamba_conv_states_shape) - mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape) + mamba_states_memory_per_request += math.prod(self.mamba_conv_states_shape) + mamba_states_memory_per_request += math.prod(self.mamba_ssm_states_shape) mamba_states_memory_per_request *= self.num_mamba_layers mamba_states_memory_per_request *= dtype_size_bytes - # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size. - def bytes_to_max_requests_and_tokens(n_bytes): - bytes_per_token = self.block_size_bytes / self.block_size_tokens - cost_per_request_bytes = ( - mamba_states_memory_per_request + max_sequence_length * bytes_per_token - ) - # TODO(ksanthanam): Leave room for an extra request in the event of padding - # for non-decode CUDA graphs - n_requests = n_bytes / cost_per_request_bytes - n_tokens = n_requests * max_sequence_length - n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size) - n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size) - return n_requests, n_tokens - - self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes) - if buffer_overflow_factor is not None: - self.max_requests = self.round_up_requests( - int(self.max_requests * buffer_overflow_factor), tp_size=tp_size - ) - self.max_tokens = self.round_up_tokens( - int(self.max_tokens * buffer_overflow_factor / 50.0), tp_size=tp_size - ) - - if max_requests_override is not None: - self.max_requests = ( - max_requests_override - if max_requests_override < self.REQUEST_ROUNDER - else self.round_up_requests(max_requests_override, tp_size=tp_size) - ) - - if max_tokens_override is not None: - self.max_tokens = self.round_up_tokens(max_tokens_override, tp_size=tp_size) - - self.max_requests = min(self.max_requests, self.max_tokens) # e.g., decode only. - - # Initialize context state. - self.params_dtype = params_dtype - self.max_sequence_length = max_sequence_length - # Unified memory. self.unified_memory_level = unified_memory_level if unified_memory_level > 0: @@ -419,6 +371,38 @@ def bytes_to_max_requests_and_tokens(n_bytes): ) self.unified_memory_level = 0 + # Initialize block allocator. + buffer_size_bytes = int(buffer_size_gb * 1024**3) + block_count_total = buffer_size_bytes // ( + self.block_size_bytes + mamba_states_memory_per_request + ) + self.block_allocator = BlockAllocator( + context=self, + total_count=( + block_count_total if self.unified_memory_level == 0 else 2 * block_count_total + ), + ) + + # Set max_total_requests, max_active_requests, max_tokens. + self.max_total_requests = self.block_allocator.total_count - 1 # -1 for dummy block + self.max_active_requests = self.block_allocator.active_count + self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS + + assert self.max_tokens >= self.max_active_requests, ( + f"max_tokens ({self.max_tokens}) must be >= " + f"max_active_requests ({self.max_active_requests}), " + "to have consistency between cuda graph sizes and the block table size." + ) + + # Track request metadata. + if num_request_metadata is None: + num_request_metadata = len(DynamicInferenceRequest.get_metadata_labels()) + self.num_request_metadata = num_request_metadata + + # Initialize context state. + self.params_dtype = params_dtype + self.max_sequence_length = max_sequence_length + # Request and token counts. self.total_request_count = 0 self.active_token_count = 0 @@ -427,93 +411,19 @@ def bytes_to_max_requests_and_tokens(n_bytes): self.padded_active_request_count = None self.paused_tokens = None - # Per-request state. - self.request_ids = torch.full( - (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device() - ) - # request_query_lengths is the input prompt tokens length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) - self.request_query_lengths = torch.empty_like(self.request_ids) - # request_output_lengths is len(input_prompt_tokens) + num_tokens_to_generate - self.request_output_lengths = torch.empty_like(self.request_ids) - # request_kv_length_offsets is the same as query length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) - self.request_kv_length_offsets = torch.empty_like(self.request_ids) - self.request_kv_block_counts = torch.empty_like(self.request_ids) - self.request_last_kv_block_id = torch.empty_like(self.request_ids) - # request_last_kv_block_offset represents number of tokens in the last kv block - self.request_last_kv_block_offset = torch.empty_like(self.request_ids) - - # Per-token state. - self.token_to_input_ids = torch.full( - (self.max_tokens,), 0, dtype=torch.long, device=torch.cuda.current_device() - ) - self.token_to_pos_ids = torch.full_like(self.token_to_input_ids, 0) - self.token_to_request_idx = torch.empty_like(self.token_to_input_ids) - self.token_to_block_idx = torch.empty_like(self.token_to_input_ids) - # i.e For a set of tokens A B C D E F .. and block_size 4: - # token_to_position_in_request is [0, 1, 2, 3, 4, 5] - # token_to_local_position_within_kv_block is [0 , 1, 2, 3, 0, 1, 2] - self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids) - self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids) - - # Calculate the total number of chunks available in the buffer - total_mamba_states_memory = mamba_states_memory_per_request * self.max_requests - block_count_total = ( - max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes - ) - - # Memory buffer. - ctx_manager = ( - torch.cuda.use_mem_pool(self.unified_memory_mempool) - if self.unified_memory_level > 0 - else nullcontext() - ) - with ctx_manager: - if cache_mla_latent: - self.memory_buffer = torch.full( - ( - self.num_attention_layers, - block_count_total, - self.block_size_tokens, - kv_reduced_dim, - ), - -1, - dtype=self.params_dtype, - device=torch.cuda.current_device(), - ) - else: - self.memory_buffer = torch.full( - ( - 2, # key and value - self.num_attention_layers, - block_count_total, - self.block_size_tokens, - num_attention_heads_per_partition, - hidden_size_per_attention_head, - ), - -1, - dtype=self.params_dtype, - device=torch.cuda.current_device(), - ) - # Block ids. self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens) - self.request_to_kv_block_ids = torch.full( - (self.max_requests, self.max_kv_block_count), - -1, - dtype=torch.int, - device=torch.cuda.current_device(), - ) # Cuda graph token-counts (i.e., token counts used by cuda-graph steps, both decode and non-decode). self.cuda_graph_token_counts = None if num_cuda_graphs is not None: # Ensure valid num_cuda_graphs. - num_cuda_graphs = min(max(num_cuda_graphs, 1), self.max_requests) + num_cuda_graphs = min(max(num_cuda_graphs, 1), self.max_active_requests) # Cuda graph step size. cuda_graph_rounder = 8 - self.cuda_graph_step_size = self.max_requests / num_cuda_graphs + self.cuda_graph_step_size = self.max_active_requests / num_cuda_graphs self.cuda_graph_step_size = ( math.ceil(self.cuda_graph_step_size / cuda_graph_rounder) * cuda_graph_rounder ) @@ -522,13 +432,17 @@ def bytes_to_max_requests_and_tokens(n_bytes): # Cuda graph token counts. if num_cuda_graphs == 1: - self.cuda_graph_token_counts = [self.max_requests] + self.cuda_graph_token_counts = [self.max_active_requests] else: self.cuda_graph_token_counts = list( - range(self.cuda_graph_step_size, self.max_requests, self.cuda_graph_step_size) + range( + self.cuda_graph_step_size, + self.max_active_requests, + self.cuda_graph_step_size, + ) ) - if self.cuda_graph_token_counts[-1] != self.max_requests: - self.cuda_graph_token_counts.append(self.max_requests) + if self.cuda_graph_token_counts[-1] != self.max_active_requests: + self.cuda_graph_token_counts.append(self.max_active_requests) self.cuda_graph_token_counts.reverse() # Set used for validating active cuda graph token count. @@ -550,82 +464,205 @@ def bytes_to_max_requests_and_tokens(n_bytes): self.active_attn_metadata = None self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata( - block_count_total=block_count_total, + block_count_total=self.block_allocator.total_count, max_kv_block_count=self.max_kv_block_count, - max_requests=self.max_requests, + max_requests=self.max_total_requests, block_size_tokens=self.block_size_tokens, max_seqlen=self.max_sequence_length, ) self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata( - block_count_total=block_count_total, + block_count_total=self.block_allocator.total_count, max_kv_block_count=self.max_kv_block_count, - max_requests=self.max_requests, + max_requests=self.max_total_requests, block_size_tokens=self.block_size_tokens, max_seqlen=self.max_sequence_length, ) - # Guaranteed active requests. - # * See details in the class docstring above. `gtd_request_fraction` is - # the fraction of blocks in the memory buffer that are reserved for - # guaranteeing that some number of active requests can always proceed - # with their generations. The number of blocks defined by - # `buffer_guaranteed_fraction * block_count_total` is converted to a - # number of requests that this reserved space can safely handle - # (`gtd_request_count`). - # * Note: computing the size of this guaranteed space from blocks rather - # than bytes is safer due to the non-linear impacts of a large - # `block_size_tokens` or `max_kv_block_count`. When computing from - # blocks, this space will always be less than `block_count_total`. When - # computing from bytes, this space can unexpectedly be much larger than - # `block_count_total`, resulting in stalled generations. - gtd_block_count = int(buffer_guaranteed_fraction * block_count_total) - gtd_block_count = min(gtd_block_count, block_count_total) - self.gtd_request_count = max(1, gtd_block_count // self.max_kv_block_count) - self.gtd_block_count = self.gtd_request_count * self.max_kv_block_count - - # Initialize allocator for KV memory blocks - self.block_allocator = BlockAllocator( - block_count_total=block_count_total, gtd_block_count=self.gtd_block_count + # Deal with chunked prefill + self.chunked_prefill_request_id = -1 + + # FlashInfer. + if use_flashinfer_fused_rope is True: + assert HAVE_FLASHINFER, "flashinfer is not installed" + elif use_flashinfer_fused_rope is None: + use_flashinfer_fused_rope = HAVE_FLASHINFER + self.use_flashinfer_fused_rope = use_flashinfer_fused_rope + + # Allocate GPU state. + self.is_tensor_state_allocated = False + self.allocate_all_tensors(is_init=True) + + # Print info. + logging.info( + "DynamicInferenceContext: allocated context with active buffer size %s (%d blocks)." + % ( + get_mem_size_str(self.block_allocator.active_count * self.block_size_bytes), + self.block_allocator.active_count, + ) ) - # Optional state tensors for hybrid models - if self.is_hybrid_model: - self.mamba_metadata = MambaMetadata(max_requests=self.max_requests) + def allocate_all_tensors(self, *, is_init: bool) -> None: + """Allocate GPU state. + + This method is used for both 1) initial allocation, and 2) resuming the + GPU state after a suspend. + + Args: + is_init (bool): True if this is being called from `__init__()`. + """ + + # Only allocate tensors when not using unified memory at all (level 0), + # or for initial allocation during `__init__()`. For levels 1 and 2, we do + # not perform any explicit allocations or deallocations after the initial + # call to `__init__()`. + if self.unified_memory_level != 0 and not is_init: + return + + # Mark allocated. + if self.is_tensor_state_allocated: + return + self.is_tensor_state_allocated = True + + # Validate no tensors allocated prior to this method. + for key in vars(self).keys(): + value = getattr(self, key) + assert not isinstance(value, torch.Tensor), ( + "All tensors should be allocated within `allocate_all_tensors()." + f"Please move tensor '{key}'." + ) + + # Per-request state. + self.request_ids = torch.full( + (self.max_total_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device() + ) + # request_query_lengths is the input prompt tokens length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) + self.request_query_lengths = torch.empty_like(self.request_ids) + # request_output_lengths is len(input_prompt_tokens) + num_tokens_to_generate + self.request_output_lengths = torch.empty_like(self.request_ids) + # request_kv_length_offsets is the same as query length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) + self.request_kv_length_offsets = torch.empty_like(self.request_ids) + self.request_kv_block_counts = torch.empty_like(self.request_ids) + self.request_last_kv_block_id = torch.empty_like(self.request_ids) + # request_last_kv_block_offset represents number of tokens in the last kv block + self.request_last_kv_block_offset = torch.empty_like(self.request_ids) + self.request_to_kv_block_ids = torch.full( + (self.max_total_requests, self.max_kv_block_count), + -1, + dtype=torch.int, + device=torch.cuda.current_device(), + ) + + # Track request metadata. + self.request_metadata = torch.empty( + (self.max_total_requests, self.num_request_metadata), + dtype=torch.float32, + device=torch.cuda.current_device(), + ) - with ctx_manager: + # Per-token state. + self.token_to_input_ids = torch.full( + (self.max_tokens,), 0, dtype=torch.long, device=torch.cuda.current_device() + ) + self.token_to_pos_ids = torch.full_like(self.token_to_input_ids, 0) + self.token_to_request_idx = torch.empty_like(self.token_to_input_ids) + self.token_to_block_idx = torch.empty_like(self.token_to_input_ids) + # i.e For a set of tokens A B C D E F .. and block_size 4: + # token_to_position_in_request is [0, 1, 2, 3, 4, 5] + # token_to_local_position_within_kv_block is [0 , 1, 2, 3, 0, 1, 2] + self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids) + self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids) + + # Memory buffer. + def allocate_memory_buffer(): + """Allocate the memory buffer. This function is called below within + `with ctx_manager:`.""" + if self.cache_mla_latent: + self.memory_buffer = torch.full( + ( + self.num_attention_layers, + self.block_allocator.total_count, + self.block_size_tokens, + self.kv_reduced_dim, + ), + -1, + dtype=self.params_dtype, + device=torch.cuda.current_device(), + ) + else: + self.memory_buffer = torch.full( + ( + 2, # key and value + self.num_attention_layers, + self.block_allocator.total_count, + self.block_size_tokens, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ), + -1, + dtype=self.params_dtype, + device=torch.cuda.current_device(), + ) + + # Optional state tensors for hybrid models + def allocate_mamba_states(): + """Allocate Mamba states. This function is called below within + `with ctx_manager:`.""" + if self.is_hybrid_model: + self.mamba_metadata = MambaMetadata(max_requests=self.max_total_requests) self.mamba_conv_states = torch.zeros( - (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape, + (self.num_mamba_layers, self.max_total_requests) + self.mamba_conv_states_shape, dtype=self.params_dtype, device=torch.cuda.current_device(), ) self.mamba_ssm_states = torch.zeros( - (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape, + (self.num_mamba_layers, self.max_total_requests) + self.mamba_ssm_states_shape, dtype=self.params_dtype, device=torch.cuda.current_device(), ) - else: - self.mamba_metadata = None - - # Store the dummy block idx reference for convenience - self.dummy_block_idx = self.block_allocator.dummy_block_idx + else: + self.mamba_metadata = None - # Deal with chunked prefill - self.chunked_prefill_request_id = -1 + # Allocate `ctx_manager`-managed buffers. (For currently unknown reasons, + # `ctx_manager` can only be used once.) + ctx_manager = ( + torch.cuda.use_mem_pool(self.unified_memory_mempool) + if self.unified_memory_level > 0 + else nullcontext() + ) + with ctx_manager: + allocate_memory_buffer() + allocate_mamba_states() # Reset attention and Mamba state. self.reset_attention_state() self.reset_mamba_state() - if use_flashinfer_fused_rope is True: - assert HAVE_FLASHINFER, "flashinfer is not installed" - elif use_flashinfer_fused_rope is None: - use_flashinfer_fused_rope = HAVE_FLASHINFER - self.use_flashinfer_fused_rope = use_flashinfer_fused_rope + def deallocate_all_tensors(self): + """Deallocate GPU state. - TOKEN_ROUNDER = 64 - REQUEST_ROUNDER = 4 + This method is used for suspending the dynamic engine. + """ + + # Only deallocate tensors when not using unified memory at all (level 0). + # For levels 1 and 2, we do not perform any explicit allocations or + # deallocations after the initial call to `__init__()`. + if self.unified_memory_level != 0: + return + + # Mark deallocated. + if not self.is_tensor_state_allocated: + return + self.is_tensor_state_allocated = False + + # Delete all tensor attributes. + # TODO(@lmcafee): check that device == 'cuda'? + keys = list(vars(self).keys()) + for key in keys: + value = getattr(self, key) + if isinstance(value, torch.Tensor): + delattr(self, key) @classmethod def round_up_tokens(cls, value, tp_size=None): @@ -656,13 +693,13 @@ def from_config( max_batch_size: int, buffer_size_gb: float = 40, num_cuda_graphs: int = None, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, ): """ Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`. """ # TODO: Add other necessary configs from inference_config - buffer_guaranteed_fraction = 0.1 model_config = model.config max_sequence_length = ( inference_config.inference_max_seq_length or model_config.max_sequence_length @@ -670,16 +707,15 @@ def from_config( max_sequence_length = max(max_sequence_length, max_batch_size) return cls( params_dtype=inference_config.params_dtype, - num_layers=model_config.num_layers, + num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size, kv_channels=model_config.kv_channels, num_attention_heads=model_config.num_query_groups, max_sequence_length=inference_config.inference_max_seq_length, buffer_size_gb=buffer_size_gb, - buffer_guaranteed_fraction=buffer_guaranteed_fraction, materialize_only_last_token_logits=False, - max_requests_override=max_batch_size, num_cuda_graphs=num_cuda_graphs, use_flashinfer_fused_rope=None, + mamba_inference_state_config=mamba_inference_state_config, ) @classmethod @@ -820,6 +856,7 @@ def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]: to blocks within the block-level memory buffer. """ attention_layer_number = self.layer_map[layer_number - 1] + if self.cache_mla_latent: return ( self.memory_buffer[attention_layer_number], @@ -988,7 +1025,7 @@ def initialize_attention_state( Args: num_warmup_tokens (Optional[int]): Number of tokens to use for warming up cuda graphs. Must be less than or equal to - `max_requests`. + `max_active_requests`. warmup_engine_mode (WarmupEngineMode): Denote whether to setup for a decode or a non-decode cuda-graph warmup. num_warmup_requests (Optional[int]): [DEPRECATED] Use num_warmup_tokens instead. @@ -1008,8 +1045,8 @@ def initialize_attention_state( # warmup both decode and non-decode engine steps if num_warmup_tokens is not None: - if num_warmup_tokens > self.max_requests: - raise ActiveRequestCountOverflowError(self.max_requests, num_warmup_tokens) + if num_warmup_tokens > self.max_active_requests: + raise ActiveRequestCountOverflowError(self.max_active_requests, num_warmup_tokens) if warmup_engine_mode == WarmupEngineMode.NON_DECODE: assert self.non_decode_cuda_graphs, "Set non-decode cuda graphs to True" @@ -1028,7 +1065,9 @@ def initialize_attention_state( math.ceil(active_token_count / self.cuda_graph_step_size) * self.cuda_graph_step_size ) - self.padded_active_token_count = min(self.padded_active_token_count, self.max_requests) + self.padded_active_token_count = min( + self.padded_active_token_count, self.max_active_requests + ) assert ( self.padded_active_token_count in self.cuda_graph_token_counts_set ), f"padded_active_token_count: {self.padded_active_token_count} not in cuda_graph_token_counts_set: {self.cuda_graph_token_counts_set}" @@ -1038,7 +1077,7 @@ def initialize_attention_state( if self.is_decode_only(): # For decode-only, the padded active token count cannot exceed max-requests. self.padded_active_token_count = min( - self.padded_active_token_count, self.max_requests + self.padded_active_token_count, self.max_active_requests ) # How are we calculating the padded active request count? @@ -1056,7 +1095,7 @@ def initialize_attention_state( # Update token position indexes. self.token_to_block_idx[self.active_token_count : self.padded_active_token_count] = ( - self.dummy_block_idx + self.block_allocator.dummy_block_idx ) self.token_to_local_position_within_kv_block[ self.active_token_count : self.padded_active_token_count @@ -1131,6 +1170,7 @@ def reset(self) -> None: self.request_last_kv_block_id.fill_(-1) self.request_last_kv_block_offset.fill_(0) self.request_to_kv_block_ids.fill_(-1) + self.request_metadata.fill_(0) # Reset token indexes. self.token_to_input_ids.fill_(0) @@ -1198,20 +1238,20 @@ def last_token_logits(self, logits: Tensor) -> Tensor: return last_token_logits - def check_availability( - self, req: DynamicInferenceRequest, safe: bool = False - ) -> (bool, bool, bool): + def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool): """ Check if the request can be added to the context. """ - request_can_be_added = self.total_request_count < self.max_requests + request_can_be_added = ( + self.total_request_count - self.paused_request_count < self.max_active_requests + ) request_tokens_can_be_added = ( self.active_token_count + req.remaining_prompt_length <= self.max_tokens ) blocks = math.ceil( (req.remaining_prompt_length + req.finished_chunk_token_count) / self.block_size_tokens ) - math.ceil(req.finished_chunk_token_count / self.block_size_tokens) - kv_cache_available = self.block_allocator.is_memory_available(blocks, safe=safe) + kv_cache_available = self.block_allocator.is_memory_available(blocks) return request_can_be_added, request_tokens_can_be_added, kv_cache_available def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] = None) -> None: @@ -1224,6 +1264,12 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] Return: None """ + + # If tensor state is deallocated, do not add request. + if not self.is_tensor_state_allocated: + raise TensorStateDeallocatedError(req.request_id) + + # Chunk length. if chunk_length is None: chunk_length = req.remaining_prompt_length @@ -1251,9 +1297,7 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] num_blocks_needed = overall_required_blocks - already_allocated_blocks if num_blocks_needed > 0: - new_block_ids = self.block_allocator.allocate_memory_blocks( - num_blocks_needed, safe=not is_chunked_prefill - ) + new_block_ids = self.block_allocator.allocate_memory_blocks(num_blocks_needed) if new_block_ids is None or len(new_block_ids) != num_blocks_needed: raise BlockOverflowError(req.request_id) @@ -1271,13 +1315,22 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] else: current_id = self.total_request_count - if current_id >= self.max_requests: + if current_id >= self.max_active_requests: raise RequestOverflowError(req.request_id) if self.active_token_count + chunk_length > self.max_tokens: raise TokenOverflowError(req.request_id) self.request_ids[current_id] = req.request_id + # Handle request metadata. + metadata = req.tracked_metadata + assert ( + len(metadata) == self.num_request_metadata + ), "Request added to context with invalid metadata length" + self.request_metadata[current_id] = torch.tensor( + metadata, dtype=torch.float32, device=self.request_metadata.device + ) + # Handle length and block assignments. self.request_query_lengths[current_id] = chunk_length self.request_output_lengths[current_id] = ( req.finished_chunk_token_count @@ -1342,6 +1395,7 @@ def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): self.request_kv_length_offsets[dst_idxs] = self.request_kv_length_offsets[src_idxs] self.request_query_lengths[dst_idxs] = self.request_query_lengths[src_idxs] self.request_output_lengths[dst_idxs] = self.request_output_lengths[src_idxs] + self.request_metadata[dst_idxs] = self.request_metadata[src_idxs] self.request_ids[dst_idxs] = self.request_ids[src_idxs] next_tokens[dst_idxs] = next_tokens[src_idxs] @@ -1362,6 +1416,7 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): tensor_swap(self.request_kv_length_offsets, src_idxs, dst_idxs) tensor_swap(self.request_query_lengths, src_idxs, dst_idxs) tensor_swap(self.request_output_lengths, src_idxs, dst_idxs) + tensor_swap(self.request_metadata, src_idxs, dst_idxs) tensor_swap(self.request_ids, src_idxs, dst_idxs) tensor_swap(next_tokens, src_idxs, dst_idxs) tensor_swap(self.request_to_kv_block_ids, src_idxs, dst_idxs) @@ -1372,6 +1427,14 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): if self.is_hybrid_model: tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs) + def get_index_of_chunked_prefill_request(self) -> int: + """Get the index of the chunked prefill request in the context. + + Return: + (int) Index of the chunked prefill request, or -1 if none exists. + """ + return torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] + # TODO: see if we can compile this function def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor: """Update context state after calling engine.step(). @@ -1389,7 +1452,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T between these request groups. - 0:paused_request_count -> paused requests - paused_request_count:total_request_count -> active requests - - total_request_count:max_requests -> completed requests are moved here. + - total_request_count:max_active_requests -> completed requests are moved here. The reason for maintaining contiguous tensors rather than multiple smaller (e.g., per-group or per-request) tensors is for both 1) speed (avoid unnecessary tensor allocations), and 2) compatibility with the @@ -1413,6 +1476,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T Return: (Tensor) Newly paused request IDs. """ + # 1. The active token mask tells us which requests are still active and which are completed # active_request_count -> This corresponds to requests that have not reached EOD or max length # finished_request_count are requests that have reached the termination criterion @@ -1432,6 +1496,9 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T # Reset attention state. self.reset_attention_state() + # Update total_request_count. + self.total_request_count = active_request_count + self.paused_request_count + # 2. If no paused requests are present and no active requests we release memory and reset. if active_request_count + self.paused_request_count == 0: if finished_request_count > 0: @@ -1524,13 +1591,19 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T if self.chunked_prefill_request_id != -1: # find the id in request_ids that is the chunked_prefill_request_id. Only one request should be chunked. - pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] - active_requests_requiring_new_block[pos] = 0 # chunked prefill should not be paused + active_requests_requiring_new_block[self.get_index_of_chunked_prefill_request()] = ( + 0 # chunked prefill should not be paused + ) active_requests_requiring_new_block_count = ( (active_requests_requiring_new_block == 1).sum().item() ) + if active_requests_requiring_new_block_count > 0: + newly_paused_request_ids = self.request_ids[ + torch.nonzero(active_requests_requiring_new_block) + self.paused_request_count + ] + # Swap unfinished active requests on the left side with paused requests on the right side # NOTE : We add paused request count because we concatenate # paused tokens to the left at the beginning of update requests @@ -1563,7 +1636,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T self._move_book_keeping_tensors( src_idxs=src_idxs, dst_idxs=dst_idxs, next_tokens=next_tokens ) - newly_paused_request_ids = self.request_ids[dst_idxs] self.paused_request_count += active_requests_requiring_new_block_count active_request_count -= active_requests_requiring_new_block_count @@ -1572,26 +1644,26 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T # We determine how many requests we can resume and resume them # Assign released blocks to paused requests. # todo: @shanmugamr, un-pause requests using FIFO, rather than LIFO. - num_non_gtd_blocks = max(0, self.block_allocator.block_count_avail - self.gtd_block_count) - if num_non_gtd_blocks: - # if we have non-gtd blocks, use them. Do not dip into the gtd-block pool - resume_request_count = min(num_non_gtd_blocks, self.paused_request_count) - else: - # only dip into the gtd-block pool if we have run out of non-gtd-blocks and the active - # request count has fallen below a certain threshold. + resume_request_count = 0 + if self.paused_request_count > 0: + active_block_count_avail = self.block_allocator.get_active_avail() + paused_block_counts = self.request_kv_block_counts[: self.paused_request_count] + paused_block_counts = paused_block_counts.flip(dims=[0]) + paused_block_counts += 1 # +1 for newly added block + paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0) resume_request_count = min( - max(self.gtd_request_count - active_request_count, 0), self.paused_request_count + torch.nonzero(paused_block_counts_cumsum <= active_block_count_avail).numel(), + self.block_allocator.total_avail, ) self.paused_request_count -= resume_request_count active_request_count += resume_request_count assert active_request_count > 0, "active_request_count == %d." % active_request_count - # finally, swap the chunked prefill to the end of the active requests to obey the invariant + # finally, swap the chunked prefill to the end of the active requests to obey the invariance if self.chunked_prefill_request_id != -1: - pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] self._swap_book_keeping_tensors( - src_idxs=torch.tensor([pos]), + src_idxs=torch.tensor([self.get_index_of_chunked_prefill_request()]), dst_idxs=torch.tensor([active_request_count + self.paused_request_count - 1]), next_tokens=next_tokens, ) @@ -1640,6 +1712,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T == 0 ), "The request_last_kv_block_offset should be 0 for the requests that just got resumed this step. " + assert resume_request_count <= self.block_allocator.total_avail block_ids = self.block_allocator.allocate_memory_blocks(resume_request_count) row_idx = torch.arange( self.paused_request_count, @@ -1761,11 +1834,11 @@ def get_kvcache_utilization_stats(self) -> dict: } """ # Total usable blocks exclude the reserved dummy block. - total_blocks = max(self.block_allocator.block_count_total - 1, 1) - block_count_avail = int(self.block_allocator.block_count_avail) + total_blocks = max(self.block_allocator.total_count - 1, 1) + block_count_avail = int(self.block_allocator.total_avail) # Overall allocated blocks in the buffer right now. - allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail + allocated_blocks = (self.block_allocator.total_count - 1) - block_count_avail allocated_blocks = int(max(0, allocated_blocks)) # Active unique blocks referenced by current active requests only. @@ -1787,7 +1860,6 @@ def get_kvcache_utilization_stats(self) -> dict: active_utilization = float(active_unique_blocks) / float(total_blocks) # Diagnostic helpers - num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count)) total_request_count = int(self.total_request_count) return { 'total_blocks': int(total_blocks), @@ -1797,10 +1869,9 @@ def get_kvcache_utilization_stats(self) -> dict: 'active_utilization': active_utilization, 'active_request_count': int(self.get_active_request_count()), 'paused_request_count': int(self.paused_request_count), - 'gtd_block_count': int(self.gtd_block_count), 'block_count_avail': int(block_count_avail), - 'num_non_gtd_blocks': int(num_non_gtd_blocks), 'active_token_count': int(self.active_token_count), 'total_request_count': int(total_request_count), - 'max_requests': int(self.max_requests), + 'max_total_requests': int(self.max_total_requests), + 'max_active_requests': int(self.max_active_requests), } diff --git a/megatron/core/inference/data_parallel_inference_coordinator.py b/megatron/core/inference/data_parallel_inference_coordinator.py index 0045d5947a1..e1fe7b21566 100644 --- a/megatron/core/inference/data_parallel_inference_coordinator.py +++ b/megatron/core/inference/data_parallel_inference_coordinator.py @@ -9,7 +9,7 @@ import torch -from megatron.core.inference.headers import Headers +from megatron.core.inference.headers import Headers, UnknownHeaderError try: import zmq @@ -109,6 +109,8 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int): self.identities_of_data_parallel_ranks.append(identity) logging.info("Inference Coordinator: Connected with data parallel ranks...") self.data_parallel_rank_iterator = cycle(self.identities_of_data_parallel_ranks) + self.data_parallel_pause_acks = set() + self.data_parallel_stop_acks = set() self.request_id_to_client_id = {} self.request_id_to_client_request_id = {} @@ -151,7 +153,7 @@ def start(self): # print(f"New client connected: {sender_identity}") known_clients.add(sender_identity) self.router_socket.send_multipart( - [sender_identity, msgpack.packb([Headers.ACK.value], use_bin_type=True)] + [sender_identity, msgpack.packb([Headers.CONNECT_ACK.value], use_bin_type=True)] ) elif header == Headers.SUBMIT_REQUEST: @@ -193,7 +195,13 @@ def start(self): ), ] ) - elif header in [Headers.PAUSE, Headers.UNPAUSE, Headers.STOP]: + elif header in [ + Headers.PAUSE, + Headers.UNPAUSE, + Headers.SUSPEND, + Headers.RESUME, + Headers.STOP, + ]: # control signals for the engine # broadcast to all data parallel ranks if sender_identity not in known_clients: @@ -202,13 +210,57 @@ def start(self): self.router_socket.send_multipart( [data_parallel_rank_id, msgpack.packb([header.value], use_bin_type=True)] ) + if header == Headers.UNPAUSE: + self.data_parallel_pause_acks = set() + elif header == Headers.PAUSE_ACK: + # control signal ack from the engine + assert sender_identity in self.identities_of_data_parallel_ranks + assert sender_identity not in self.data_parallel_pause_acks + self.data_parallel_pause_acks.add(sender_identity) + # route to all clients only once we have gotten an ack from all data parallel ranks + if len(self.data_parallel_pause_acks) == self.data_parallel_size: + for client_id in known_clients: + self.router_socket.send_multipart( + [ + client_id, + msgpack.packb([header.value, sender_identity], use_bin_type=True), + ] + ) + for data_parallel_rank_id in self.identities_of_data_parallel_ranks: + self.router_socket.send_multipart( + [ + data_parallel_rank_id, + msgpack.packb([Headers.PAUSE_ACK.value], use_bin_type=True), + ] + ) + elif header == Headers.STOP_ACK: + # control signal ack from the engine + assert sender_identity in self.identities_of_data_parallel_ranks + assert sender_identity not in self.data_parallel_stop_acks + self.data_parallel_stop_acks.add(sender_identity) + # route to all clients only once we have gotten an ack from all data parallel ranks + if len(self.data_parallel_stop_acks) == self.data_parallel_size: + for client_id in known_clients: + self.router_socket.send_multipart( + [ + client_id, + msgpack.packb([header.value, sender_identity], use_bin_type=True), + ] + ) + for data_parallel_rank_id in self.identities_of_data_parallel_ranks: + self.router_socket.send_multipart( + [ + data_parallel_rank_id, + msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True), + ] + ) elif header == Headers.ENGINE_REPLY: # This is the output of a single engine step on some data parallel rank. assert sender_identity in self.identities_of_data_parallel_ranks - finished_requests = deserialized_payload[1] + finished_request_records = deserialized_payload[1] - for finished_request in finished_requests: - fid = finished_request["request_id"] + for finished_request_record in finished_request_records: + fid = finished_request_record["requests"][0]["request_id"] client_identity = self.request_id_to_client_id[fid] client_request_identity = self.request_id_to_client_request_id[fid] del self.request_id_to_client_id[fid] @@ -218,11 +270,15 @@ def start(self): [ client_identity, msgpack.packb( - [client_request_identity, finished_request], use_bin_type=True + [header.value, client_request_identity, finished_request_record], + use_bin_type=True, ), ] ) + else: + raise UnknownHeaderError(header) + @classmethod def entrypoint( cls, ready_event: Event, inference_coordinator_port: int, data_parallel_size: int diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py index 9cd902d9d63..d6a4f6eb694 100644 --- a/megatron/core/inference/engines/__init__.py +++ b/megatron/core/inference/engines/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from .abstract_engine import AbstractEngine -from .dynamic_engine import DynamicInferenceEngine +from .dynamic_engine import DynamicInferenceEngine, EngineSuspendedError from .static_engine import StaticInferenceEngine diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 4bff4f85fa8..5fad1369308 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -4,10 +4,13 @@ import logging import multiprocessing import os +import socket import struct import time import warnings from collections import deque +from contextlib import contextmanager +from dataclasses import dataclass from datetime import datetime from itertools import repeat from typing import Dict, List, Optional, Tuple, Union @@ -27,14 +30,19 @@ DataParallelInferenceCoordinator, ) from megatron.core.inference.engines.abstract_engine import AbstractEngine -from megatron.core.inference.headers import Headers -from megatron.core.inference.inference_request import DynamicInferenceRequest, Status +from megatron.core.inference.headers import Headers, UnknownHeaderError +from megatron.core.inference.inference_request import ( + DynamicInferenceRequest, + DynamicInferenceRequestRecord, + Status, +) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.inference.utils import Counter, await_process_event -from megatron.core.utils import get_asyncio_loop, trace_async_exceptions +from megatron.core.transformer.cuda_graphs import delete_cuda_graphs +from megatron.core.utils import get_asyncio_loop, internal_api, trace_async_exceptions try: from tqdm import tqdm @@ -65,6 +73,19 @@ HAVE_WANDB = False wandb = None +try: + import psutil + + HAVE_PSUTIL = True +except ImportError: + HAVE_PSUTIL = False + + +class EngineSuspendedError(Exception): + """Engine is currently suspended and not performing steps.""" + + pass + def format_mem_bytes(mem_bytes): """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes.""" @@ -75,6 +96,14 @@ def format_mem_bytes(mem_bytes): return "%d bytes" % mem_bytes +@dataclass(kw_only=True) +class RequestEntry: + """Entry in the engine's `self.requests` dict.""" + + record: DynamicInferenceRequestRecord + future: asyncio.Future + + # pylint: disable=line-too-long class DynamicInferenceEngine(AbstractEngine): """The dynamic inference engine. @@ -94,9 +123,6 @@ class DynamicInferenceEngine(AbstractEngine): batching and a dynamic block-level KV cache (similar to paged attention). random_seed (Optional[int]): Use a random seed if you want deterministic results. Defaults to None. - static_sampling (bool): If True, all requests are assumed to have the same - sampling parameters. This avoids needing to loop through all requests and - their sampling parameters every generation step, improving latency. inference_logging_step_interval (int): The step interval at which to log inference metrics to wandb. Defaults to 0, which means no logging. """ @@ -110,17 +136,9 @@ def __init__( *, track_paused_request_events: bool = False, enable_chunked_prefill: bool = True, - static_sampling: bool = False, inference_logging_step_interval: int = 0, ): - if enable_cuda_graph is not None: - warnings.warn( - "The `enable_cuda_graph` argument is deprecated and will be " - "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " - "read directly from the transformer config object." - ) - assert isinstance( controller, TextGenerationController ), f"controller must be a TextGenerationController, got {type(controller)}" @@ -129,31 +147,41 @@ def __init__( ), f"context must be a DynamicInferenceContext, got {type(context)}" assert isinstance(random_seed, int), f"random_seed must be an int, got {type(random_seed)}" - self.request_counter = Counter() + # Deprecate `enable_cuda_graph`. + if enable_cuda_graph is not None: + warnings.warn( + "The `enable_cuda_graph` argument is deprecated and will be " + "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " + "read directly from the transformer config object." + ) + self.enable_cuda_graph = enable_cuda_graph + else: + self.enable_cuda_graph = ( + controller.inference_wrapped_model.model.config.enable_cuda_graph + ) + + # Initialization options. self.controller = controller self.context = context self.random_seed = random_seed self.track_paused_request_events = track_paused_request_events - self.step_count = 0 - self.finished_request_count = 0 - self.waiting_request_ids = deque() - self.failed_request_ids = [] # deque() - self.request_counter = Counter() - self.requests: Dict[int, DynamicInferenceRequest] = {} - self.request_completion_futures: Dict[int, asyncio.Future] = {} - self.step_start_event = torch.cuda.Event(enable_timing=True) - self.step_end_event = torch.cuda.Event(enable_timing=True) - self.paused = False - self.stopped = False self.enable_chunked_prefill = enable_chunked_prefill - self.static_sampling = static_sampling - self.inference_logging_step_interval = inference_logging_step_interval + self.unified_memory_level = context.unified_memory_level + + if enable_cuda_graph is not None: + self.cuda_graph_impl = "local" if enable_cuda_graph else "none" + else: + self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl + + # Initialize engine. + self.reset() + # Configure wandb to use separate step counter for inference metrics (only once) if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None: logging.info( f"\033[1;93m[INFERENCE]\033[0m " - f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m" + f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m" ) if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": # Make all inference/* metrics use inference_step as their x-axis @@ -174,21 +202,43 @@ def __init__( max_step = int(val) self.inference_step_offset = int(max_step) - # Initialize the asyncio loop if it has not already been initialized. - # TODO: Start the engine loop here. - self._loop = get_asyncio_loop() - self._cond = asyncio.Condition() + # Create cuda graphs. + self.create_cuda_graphs() - # Capture cuda graph. - self.capture_stats = None + def reset(self) -> None: + """Reset by removing all requests and reset all state.""" - if enable_cuda_graph is not None: - self.cuda_graph_impl = "local" if enable_cuda_graph else "none" - else: - self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl + self.context.reset() - if self.cuda_graph_impl == "local": - self.create_cuda_graphs() + # Request state. + self.request_counter = Counter() + self.finished_request_count = 0 + + self.requests: Dict[int, RequestEntry] = {} + self.waiting_request_ids = deque() + self.failed_request_ids = [] + + # Timing and logging variables. + self.rank = torch.distributed.get_rank() + self.step_count = 0 + self.step_start_event = torch.cuda.Event(enable_timing=True) + self.step_end_event = torch.cuda.Event(enable_timing=True) + self.capture_stats = None + + # Runtime state. + self._loop = get_asyncio_loop(getattr(self, "_loop", None)) + self._cond = asyncio.Condition() + self.running = asyncio.Event() + self.paused = asyncio.Event() + self.stopped = asyncio.Event() + self.received_pause: bool = False + self.received_stop: bool = False + self.suspend_signal = False + self.is_suspended = False + self.resume_request_ids = None + + # Coordinator state. + self.use_coordinator = False def create_cuda_graphs(self, reset_context: bool = True): """Create cuda graphs. @@ -199,6 +249,10 @@ def create_cuda_graphs(self, reset_context: bool = True): Args: reset_context (bool): Whether to reset the context after building cuda graphs. """ + + if self.cuda_graph_impl != "local": + return + context = self.context controller = self.controller @@ -207,7 +261,7 @@ def create_cuda_graphs(self, reset_context: bool = True): if moe_pad_experts and context.non_decode_cuda_graphs: context.non_decode_cuda_graphs = False - if torch.distributed.get_rank() == 0: + if self.rank == 0: warnings.warn( "MoE models do not support non-decode cuda graphs. " "Forcing non_decode_cuda_graphs to False." @@ -292,10 +346,12 @@ def create_cuda_graphs(self, reset_context: bool = True): self.capture_stats = capture_stats + @internal_api async def start_listening_to_data_parallel_coordinator( self, inference_coordinator_port: int, launch_inference_coordinator: bool = True, + verbose: bool = False, *, loop: Optional[asyncio.AbstractEventLoop] = None, ): @@ -306,16 +362,18 @@ async def start_listening_to_data_parallel_coordinator( `InferenceCoordinator`. It configures different ZMQ socket patterns based on the rank's role within the distributed topology. + Note that this method must be called on all ranks, as it uses blocking torch broadcasts. + The setup involves two primary roles within each data-parallel group: - 1. **TP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly + 1. **MP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly to the central coordinator via a ZMQ `DEALER` socket. It receives requests and uses a ZMQ `PUB` (publisher) socket to broadcast them - to all other ranks within its tensor-parallel (TP) group. - 2. **TP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber) - sockets to listen for requests broadcast by their local TP Coordinator. + to all other ranks within its model-parallel (MP) group. + 2. **MP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber) + sockets to listen for requests broadcast by their local MP Coordinator. - This architecture uses fast Inter-Process Communication (`ipc`) sockets for - intra-node broadcasts within a TP group. + This architecture uses TCP sockets for both inter-node and intra-node broadcasts + within an MP group. Finally, after setting up the communication channels and ensuring all ranks are synchronized, this method starts the main engine processing loop @@ -327,12 +385,7 @@ async def start_listening_to_data_parallel_coordinator( launch_inference_coordinator (bool, optional): If True, the global rank 0 process will spawn and manage the `InferenceCoordinator` process. Defaults to True. - - Note: - The current implementation uses `ipc` sockets for broadcasting requests - within a Tensor Parallel group, which limits each TP group to a single - physical node. For example, if you have 8 GPUs per node, then this will only - work with TP=[1,2,4,8] + verbose (bool): Whether to run in verbose mode. """ assert HAVE_ZMQ, ( @@ -343,7 +396,25 @@ async def start_listening_to_data_parallel_coordinator( "pip install msgpack" ) - if launch_inference_coordinator and torch.distributed.get_rank() == 0: + self.zmq_context = zmq.Context().instance() + self.zmq_sockets = [] # keep track of all sockets created by this engine + + # Get world info. + dp_group = parallel_state.get_data_parallel_group() + dp_src = parallel_state.get_data_parallel_src_rank() + dp_size = parallel_state.get_data_parallel_world_size() + dp_rank = parallel_state.get_data_parallel_rank() + + mp_group = parallel_state.get_model_parallel_group() + mp_src = parallel_state.get_model_parallel_src_rank() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + + self.is_mp_coordinator = tp_rank == 0 and pp_rank == 0 + self.is_dp_coordinator = (dp_rank == 0) and self.is_mp_coordinator + + # Spawn a DP coordinator process and get the connection info. + if launch_inference_coordinator and self.is_dp_coordinator: spawn_context = multiprocessing.get_context('spawn') coordinator_ready_event = spawn_context.Event() self.inference_coordinator_process = spawn_context.Process( @@ -356,67 +427,223 @@ async def start_listening_to_data_parallel_coordinator( ) self.inference_coordinator_process.start() - # Todo [Siddharth]: can we move this code to another file? - self.zmq_context = zmq.Context() - self.zmq_sockets = [] # keep track of all sockets created by this engine + # Find available ports for MP and bind to them. + if self.is_mp_coordinator: + local_ip = socket.gethostname() + mp_req_sock = self.zmq_context.socket(zmq.PUB) + mp_req_sock.bind_to_random_port(f"tcp://{local_ip}") + mp_req_addr = mp_req_sock.getsockopt_string(zmq.LAST_ENDPOINT) + + mp_len_sock = self.zmq_context.socket(zmq.PUB) + mp_len_sock.bind_to_random_port(f"tcp://{local_ip}") + mp_len_addr = mp_len_sock.getsockopt_string(zmq.LAST_ENDPOINT) + else: + mp_req_addr = None + mp_len_addr = None + + # Broadcast addresses to respective ranks. + bcast = [mp_req_addr, mp_len_addr] + torch.distributed.broadcast_object_list(bcast, src=mp_src, group=mp_group) + [mp_req_addr, mp_len_addr] = bcast + ip_address_of_dp_coordinator = os.getenv('MASTER_ADDR', '127.0.0.1') - identity = f'tp-coord-{parallel_state.get_data_parallel_rank()}' - if ( - parallel_state.get_tensor_model_parallel_rank() == 0 - and parallel_state.get_pipeline_model_parallel_rank() == 0 - ): + dp_addr = f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}" + identity = f'mp-coord-{dp_rank}' + if self.is_mp_coordinator: # 1. Create dealer sockets where tp_rank = 0 and pp_rank = 0 # These will receive requests from an InferenceCoordinator. self.socket_for_receiving_requests = self.zmq_context.socket(zmq.DEALER) self.socket_for_receiving_requests.setsockopt(zmq.IDENTITY, identity.encode('utf-8')) - self.socket_for_receiving_requests.connect( - f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}" - ) + self.socket_for_receiving_requests.connect(dp_addr) # send empty string. this is used to register with the coordinator. self.socket_for_receiving_requests.send(b"") # 2. Create a publisher socket. This is used to publish or broadcast - # requests within the tensor parallel group - self.tensor_parallel_publisher_socket = self.zmq_context.socket(zmq.PUB) - self.tensor_parallel_publisher_socket.bind(f"ipc:///tmp/{identity}-tp-bcast-socket-req") + # requests within the model parallel group + self.model_parallel_publisher_socket = mp_req_sock # 3. Create another publisher socket to broadcast the number of messages to receive. - self.tensor_parallel_num_msgs_publisher_socket = self.zmq_context.socket(zmq.PUB) - self.tensor_parallel_num_msgs_publisher_socket.bind( - f"ipc:///tmp/{identity}-tp-bcast-socket-len" - ) + self.model_parallel_num_msgs_publisher_socket = mp_len_sock self.zmq_sockets += [ self.socket_for_receiving_requests, - self.tensor_parallel_num_msgs_publisher_socket, - self.tensor_parallel_publisher_socket, + self.model_parallel_num_msgs_publisher_socket, + self.model_parallel_publisher_socket, ] - # All TP ranks subscribe to the two publisher sockets - self.tensor_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB) - self.tensor_parallel_subscriber_socket.connect(f"ipc:///tmp/{identity}-tp-bcast-socket-req") - self.tensor_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") - - self.tensor_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB) - self.tensor_parallel_num_msgs_subscriber_socket.connect( - f"ipc:///tmp/{identity}-tp-bcast-socket-len" - ) - self.tensor_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") + # All MP ranks subscribe to the two publisher sockets + self.model_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB) + self.model_parallel_subscriber_socket.connect(mp_req_addr) + self.model_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") + + self.model_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB) + self.model_parallel_num_msgs_subscriber_socket.connect(mp_len_addr) + self.model_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") self.zmq_sockets += [ - self.tensor_parallel_subscriber_socket, - self.tensor_parallel_num_msgs_subscriber_socket, + self.model_parallel_subscriber_socket, + self.model_parallel_num_msgs_subscriber_socket, ] - torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group()) + torch.distributed.barrier(mp_group) - if launch_inference_coordinator and torch.distributed.get_rank() == 0: + if launch_inference_coordinator and self.is_dp_coordinator: await await_process_event(coordinator_ready_event, self.inference_coordinator_process) logging.info("Inference co-ordinator is ready to receive requests!") # Finally run the engine infinite loop loop = get_asyncio_loop(loop) - self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop)) + self.engine_loop_task = loop.create_task( + self.run_engine_with_coordinator(loop=loop, verbose=verbose) + ) + + @contextmanager + @staticmethod + def suspend_resume_ctx(key: str, *, unified_memory_level: int) -> None: + """Context manager for of suspending and resuming the engine. + + This context manager records the time and memory usage when suspending + and resuming the context. TODO(@lmcafee): add argument to optionally + return nullcontext, to avoid overhead. + + Args: + key (str): Key that identifies caller (e.g., 'suspend' or 'resume'). + + Return: + None. + """ + + try: + + start_mem = torch.cuda.memory_stats() + start_time = time.time() + torch.cuda.synchronize() + + yield + + finally: + + end_time = time.time() + + end_mem = torch.cuda.memory_stats() + start_mem_alloc = start_mem["allocated_bytes.all.current"] + end_mem_alloc = end_mem["allocated_bytes.all.current"] + start_mem_res = start_mem["reserved_bytes.all.current"] + end_mem_res = end_mem["reserved_bytes.all.current"] + + rank_str = torch.distributed.get_rank() + dir_str = "deallocating" if end_mem_alloc <= start_mem_alloc else "allocating" + relative_time_str = f"{end_time - start_time:.3f} sec" + relative_mem_str = f"{abs(start_mem_alloc - end_mem_alloc) / 1024**3:.1f} gb" + + if HAVE_PSUTIL: + process = psutil.Process() + mem_info = process.memory_info() + cpu_mem_str = f"{mem_info.rss / 1024**3:.1f} gb" + else: + cpu_mem_str = "--" + + total_mem_str = ", ".join( + ( + f"cpu: {cpu_mem_str}", + f"gpu: alloc {end_mem_alloc / 1024**3:.1f} gb", + f"res {end_mem_res / 1024**3:.1f} gb", + ) + ) + logging.info( + f"[rank {rank_str}] dynamic engine {key}, " + f"unified {unified_memory_level}, " + f"{dir_str} " + f"{relative_mem_str} in {relative_time_str} ... " + f"abs mem usage: {total_mem_str}" + ) + + def suspend(self): + """Suspend engine by deallocating context's GPU state.""" + + # Skip if already suspended, which can happen when using the inference + # coordinator. + if self.is_suspended: + return + self.is_suspended = True + + # Deallocate context tensors. + with self.__class__.suspend_resume_ctx( + "suspended", unified_memory_level=self.unified_memory_level + ): + self.context.deallocate_all_tensors() + + # Delete cuda graphs when not using unified memory at all (level 0). For + # levels 1 and 2, the context's tensors maintain static memory addresses, + # so the cuda graphs are re-used. + if self.unified_memory_level == 0: + delete_cuda_graphs() + + # Maintain references to requests before reset. + waiting_request_ids = list(self.waiting_request_ids) + active_request_ids = set(self.requests.keys()) - set(waiting_request_ids) + self.resume_request_ids = [*active_request_ids, *waiting_request_ids] + self.waiting_request_ids.clear() + + # Suspend requests objects. + for request_id in active_request_ids: + self.requests[request_id].record.suspend(self.controller.tokenizer) + + def resume(self): + """Resume engine by reallocating context's GPU state.""" + + # Skip if not suspended, which can happen when using the inference + # coordinator. + if not self.is_suspended: + return + self.is_suspended = False + + # Resume. + with self.__class__.suspend_resume_ctx( + "resumed", unified_memory_level=self.unified_memory_level + ): + + # Allocate context tensors. + alloc_time = time.time() + torch.cuda.synchronize() + self.context.allocate_all_tensors(is_init=False) + torch.cuda.synchronize() + alloc_time = time.time() - alloc_time + + # Reset context and request data. + self.context.reset() + + # Create cuda graphs (before adding requests, to be in decode mode). + # Only create cuda graphs when not using unified memory at all (level + # 0). For levels 1 and 2, the context's tensors maintain static + # memory addresses, so the cuda graphs are re-used. + capture_time = time.time() + if self.unified_memory_level == 0: + self.create_cuda_graphs() + capture_time = time.time() - capture_time + + # Add requests. + add_time = time.time() + torch.cuda.synchronize() + for request_id in self.resume_request_ids: + self._add_request(self.get_request(request_id)) + torch.cuda.synchronize() + add_time = time.time() - add_time + + # Print inner timing (must be outside context manager above for correct formatting). + logging.info( + " > " + + ", ".join( + ( + f"inner timing: alloc {alloc_time:.3f}", + f"add {add_time:.3f}", + f"capture {capture_time:.3f}.", + ) + ) + ) + + # Notify event loop. + self._loop.call_soon_threadsafe(asyncio.create_task, self._notify_cond_for_new_request()) @trace_async_exceptions async def _notify_cond_for_new_request(self): @@ -428,19 +655,31 @@ def has_unfinished_requests(self) -> bool: """Test if context contains unfinished requests.""" return self.context.has_unfinished_requests() or len(self.waiting_request_ids) > 0 - def reset(self) -> None: - """Reset by removing all requests and reset all state.""" - self.context.reset() - self.waiting_request_ids.clear() - self.step_count = 0 - self.finished_request_count = 0 + def get_request(self, request_id: int) -> DynamicInferenceRequest: + """Get most recent request from a request record. + + Args: + request_id (int): Request id. + + Returns: + (DynamicInferenceRequest) The most recent request in the record. + """ + return self.requests[request_id].record[-1] def _add_request( self, request: DynamicInferenceRequest ) -> asyncio.Future[DynamicInferenceRequest]: request_id = request.request_id - self.requests[request_id] = request + + # Add request to self.requests. If the engine has previously been + # suspended, then the request may already exist. + if request_id not in self.requests: + self.requests[request_id] = RequestEntry( + record=DynamicInferenceRequestRecord.from_request(request), + future=self._loop.create_future(), + ) + if request.status is None: request.status = Status.ACTIVE_AND_GENERATING_TOKENS @@ -456,6 +695,17 @@ def _add_request( request.sampling_params.num_tokens_to_generate = self.context.max_sequence_length - len( request.prompt_tokens ) + if request.sampling_params.termination_id is None: + try: + eod = self.controller.tokenizer.eod + except AttributeError: + if self.rank == 0: + warnings.warn( + "Termination ID not specified, and tokenizer does not define eod." + "Defaulting to not using termination id." + ) + eod = -1 + request.sampling_params.termination_id = eod if ( len(request.prompt_tokens) + request.sampling_params.num_tokens_to_generate @@ -470,10 +720,10 @@ def _add_request( if request.status != Status.FAILED: self.waiting_request_ids.append(request_id) + else: + self.failed_request_ids.append(request_id) - # Create a new asyncio Future to notify the user when the request has completed. - self.request_completion_futures[request_id] = self._loop.create_future() - return self.request_completion_futures[request_id] + return self.requests[request_id].future def add_request( self, @@ -491,7 +741,6 @@ def add_request( Return: Returns an asyncio `Future[DynamicInferenceRequest]` for the user to wait on. """ - prompt_str = None # Tokenize prompt if text. if isinstance(prompt, str): @@ -520,8 +769,8 @@ def add_request( # Initialize request. request = DynamicInferenceRequest( - prompt=prompt_str, request_id=request_id, + prompt=prompt_str, prompt_tokens=tokens, sampling_params=sampling_params, ) @@ -550,9 +799,9 @@ def post_process_requests( Returns: A list of active requests and completed requests as `DynamicInferenceRequest` objects """ - active_requests: List[DynamicInferenceRequest] = [] - finished_requests: List[DynamicInferenceRequest] = [] + active_request_ids: list[int] = [] finished_request_ids = set(finished_request_ids.tolist()) + finished_request_records: list[DynamicInferenceRequestRecord] = [] self.finished_request_count += len(finished_request_ids) log_probs_iter = log_probs if log_probs else repeat(None) @@ -560,7 +809,7 @@ def post_process_requests( for request_id, token, request_log_probs in zip( request_ids.tolist(), sample.tolist(), log_probs_iter ): - request: DynamicInferenceRequest = self.requests[request_id] + request: DynamicInferenceRequest = self.get_request(request_id) if request_id != self.context.chunked_prefill_request_id: request.generated_tokens.append(token) if request.tpot is None: @@ -594,19 +843,20 @@ def post_process_requests( if request_id in finished_request_ids: request.generated_length = len(request.generated_tokens) request.status = Status.COMPLETED - finished_request = self.requests.pop(request_id) + finished_entry = self.requests.pop(request_id) + finished_request = finished_entry.record[-1] if finished_request.prompt is None: finished_request.prompt = self.controller.tokenizer.detokenize( finished_request.prompt_tokens.tolist() ) finished_request.generated_length = len(finished_request.generated_tokens) - finished_requests.append(finished_request) finished_request.generated_text = self.controller.tokenizer.detokenize( finished_request.generated_tokens ) - self.request_completion_futures[request_id].set_result(finished_request) + finished_request_records.append(finished_entry.record) + finished_entry.future.set_result(finished_entry.record) else: - active_requests.append(request) + active_request_ids.append(request_id) else: # The chunked prefill produces useless tokens # so we are not appending them to the generated tokens. @@ -624,9 +874,9 @@ def post_process_requests( request.prompt_log_probs = [] request.prompt_log_probs.extend(request_log_probs) request.generated_log_probs = [] - active_requests.append(request) + active_request_ids.append(request_id) - return active_requests, finished_requests + return active_request_ids, finished_request_records def schedule_waiting_requests(self): """Tries to schedule any requests in the waiting pool.""" @@ -640,9 +890,9 @@ def schedule_non_chunked_prefill(self): Perform the same original scheduling logic for non-chunked runs """ while self.waiting_request_ids: - req = self.requests[self.waiting_request_ids[0]] + req = self.get_request(self.waiting_request_ids[0]) request_can_be_added, request_tokens_can_be_added, kv_cache_available = ( - self.context.check_availability(req, safe=True) + self.context.check_availability(req) ) if request_can_be_added and request_tokens_can_be_added and kv_cache_available: self.context.add_request(req) @@ -655,37 +905,6 @@ def schedule_non_chunked_prefill(self): else: break - def get_active_sampling_map(self) -> List[Tuple[SamplingParams, List[int]]]: - """Gets a map of sampling methods to active requests indices in the context.""" - # Get all active request IDs. - active_request_ids = self.context.request_ids[ - self.context.paused_request_count : self.context.total_request_count - ].tolist() - if self.static_sampling: - return [(next(iter(self.requests.values())).sampling_params, active_request_ids)] - - # Get a map from request_id to context array index. - context_id_map = {r: i for i, r in enumerate(active_request_ids)} - - # Create map of sampling methods to context array indices. - sampling_map: List[Tuple[SamplingParams, List[int]]] = [] - for request_id, request in self.requests.items(): - if request_id not in context_id_map: - continue - context_id = context_id_map[request_id] - sp = request.sampling_params - - # Look for a pre-existing group with these sampling parameters. - for sampling, indices in sampling_map: - if sampling == sp: - indices.append(context_id) - break - # If no group exists, create a new one. - else: - sampling_map.append((sp, [context_id])) - - return sampling_map - def schedule_chunked_prefill(self): """ This function schedules chunked prefill requests. @@ -704,7 +923,7 @@ def schedule_chunked_prefill(self): can_schedule = True while self.waiting_request_ids and can_schedule: can_schedule = False - req = self.requests[self.waiting_request_ids[0]] + req = self.get_request(self.waiting_request_ids[0]) # is_continuing_chunked_prefill is True if we are scheduling next # chunk of a existing chunked prefill request @@ -716,9 +935,7 @@ def schedule_chunked_prefill(self): self.context.active_token_count + remaining_len <= self.context.max_tokens ) token_partially_can_be_added = self.context.active_token_count < self.context.max_tokens - request_can_be_added, _, kv_cache_available = self.context.check_availability( - req, safe=not is_continuing_chunked_prefill - ) + request_can_be_added, _, kv_cache_available = self.context.check_availability(req) request_can_be_added = is_continuing_chunked_prefill or request_can_be_added if request_can_be_added and kv_cache_available: @@ -747,104 +964,157 @@ def schedule_chunked_prefill(self): # chunked prefill request at the head of the waiting queue # Note that we do not need to continue check the queue, as the tokens are full - async def async_step( - self, *, verbose: Optional[bool] = False - ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: - """ - Wrapper for controller.generate_output_tokens_dynamic_batch(), to - match vLLM API. Uses `asyncio` for continuous generation which allows this - method to sleep and wake up when new requests are available. - - Args: - sampling_params (SamplingParams): The sampling parameters. - verbose (bool): Whether to run in verbose mode. + async def async_forward(self) -> Tuple[Dict, Dict, float, int]: + """Uses `asyncio` for continuous generation. + Sleeps when no requests are available, until new requests have been added. Returns: A tuple comprised of: - 1. Requests that ran in the last step and are still active. - 2. Requests that ran in the last step and have now finished. - 3. The step time in seconds. + step_result (Optional[Dict]): The result of the step. + context_state (Dict): A tuple consisting of the state of the context. + is_decode_only, total/paused request count, active token count. + step_time (float): How long this step took. """ + + # If suspended, no stepping. + if self.is_suspended: + raise EngineSuspendedError(self.step_count) + # schedule requests self.schedule_waiting_requests() - # Previous context state, for printing output below. - prev_is_decode_only = self.context.is_decode_only() - prev_total_request_count = self.context.total_request_count - prev_paused_request_count = self.context.paused_request_count - prev_active_token_count = self.context.active_token_count - - range_push("Prefill" if not prev_is_decode_only else "Decode") + # Saving pre-step state, for printing output below. + is_decode_only = self.context.is_decode_only() + pre_step_context_state = { + "is_decode_only": is_decode_only, + "total_request_count": self.context.total_request_count, + "paused_request_count": self.context.paused_request_count, + "active_token_count": self.context.active_token_count, + } # Generate tokens. - is_decode_only = self.context.is_decode_only() - # save the is_decode_only AFTER scheduling, BEFORE update + range_push("Prefill" if not is_decode_only else "Decode") + # TODO @TDE: Account for this line when overlapping forward and bookkeep. self.is_decode_only = is_decode_only + self.step_start_event.record() - sampling_map = self.get_active_sampling_map() - result = await self.controller.async_generate_output_tokens_dynamic_batch(sampling_map) + result = await self.controller.async_generate_output_tokens_dynamic_batch() self.step_end_event.record() self.step_end_event.synchronize() step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3 + self.step_count += 1 + + range_pop() + + if ( + self.inference_logging_step_interval > 0 + and step_count > 0 + and step_count % self.inference_logging_step_interval == 0 + and self.context.metrics_writer is not None + ): + kvcache_util_stats = self.context.get_kvcache_utilization_stats() + else: + kvcache_util_stats = None + + post_step_context_state = { + "waiting_request_count": len(self.waiting_request_ids), + "finished_request_count": self.finished_request_count, + "kv_stats": kvcache_util_stats, + "padded_active_token_count": self.context.padded_active_token_count, + "using_cuda_graph_this_step": self.context.using_cuda_graph_this_step(), + "total_active_block_count": self.context.block_allocator.active_count, + "total_paused_block_count": self.context.block_allocator.paused_count, + "total_active_used_blocks": self.context.block_allocator.get_active_used(), + "total_paused_used_blocks": self.context.block_allocator.get_paused_used(), + } + + context_state = {**pre_step_context_state, **post_step_context_state} + + return result, context_state, step_time, self.step_count + + async def async_bookkeep( + self, + step_result: Optional[Dict], + context_state: Dict, + step_time: float, + step_count: int, + *, + verbose: bool = False, + ): + """Uses `asyncio` for continuous bookkeeping. + + Args: + step_result (Optional[Dict]): The result of the step. + context_state (Dict): is_decode_only, total/paused request count, active token count. + step_time (float): How long this step took. + step_count (int): The count of the step. + verbose (bool): Whether to run in verbose mode. + Returns: + A dictionary containing: + active_requests (List): Requests that ran in the last step and are still active. + finished_requests (List): Requests that ran in the last step and have now finished. + step_time (float): The step time in seconds. + cuda_graph_request_count (int): The CUDA graph batch size matching this step. + """ # Increment finished_request_count. cuda_graph_request_count = None - if result is not None: - active_request_ids = result["active_request_ids"] - newly_paused_request_ids = result["newly_paused_request_ids"] - finished_request_ids = result["finished_request_ids"] - sample = result["sample"] - log_probs = result["log_probs"] - cuda_graph_request_count = result["cuda_graph_request_count"] + if step_result is not None: + active_request_ids = step_result["active_request_ids"] + newly_paused_request_ids = step_result["newly_paused_request_ids"] + finished_request_ids = step_result["finished_request_ids"] + sample = step_result["sample"] + log_probs = step_result["log_probs"] + cuda_graph_request_count = step_result["cuda_graph_request_count"] # Add paused events. if newly_paused_request_ids is not None and self.track_paused_request_events: newly_paused_request_ids = newly_paused_request_ids.tolist() - [self.requests[i].add_event_pause() for i in newly_paused_request_ids] + [self.get_request(i).add_event_pause() for i in newly_paused_request_ids] # Mark requests finished. - [self.requests[i].add_event_finish() for i in finished_request_ids.tolist()] + [self.get_request(i).add_event_finish() for i in finished_request_ids.tolist()] # Add finished events. - (active_requests, finished_requests) = self.post_process_requests( + active_request_ids, finished_request_records = self.post_process_requests( active_request_ids, finished_request_ids, step_time, sample, log_probs ) else: - active_requests: List[DynamicInferenceRequest] = [] - finished_requests: List[DynamicInferenceRequest] = [] + active_request_ids: list[int] = [] + finished_request_records: list[DynamicInferenceRequestRecord] = [] # Failed requests. for failed_request_id in self.failed_request_ids: - failed_request = self.requests.pop(failed_request_id) + failed_entry = self.requests.pop(failed_request_id) + failed_request = failed_entry.record[-1] failed_request.status = Status.FAILED failed_request.add_event_fail() - finished_requests.append(failed_request) - self.request_completion_futures[failed_request_id].set_result(failed_request) + finished_request_records.append(failed_entry.record) + failed_entry.future.set_result(failed_entry.record) self.failed_request_ids.clear() - # Log KV cache utilization stats to W&B - if ( - self.inference_logging_step_interval > 0 - and self.step_count > 0 - and self.step_count % self.inference_logging_step_interval == 0 - and self.context.metrics_writer is not None - ): - - # Get KV cache utilization stats from dynamic context - kv_stats = self.context.get_kvcache_utilization_stats() + # Handle necessary ZMQ DP coordinator communication. + if self.use_coordinator and self.is_mp_coordinator and finished_request_records: + payload = msgpack.packb( + [Headers.ENGINE_REPLY.value, [r.serialize() for r in finished_request_records]], + use_bin_type=True, + ) + self.socket_for_receiving_requests.send(payload) + # Log KV cache utilization stats to W&B + if context_state["kv_stats"] is not None: # Prepare metrics dictionary with all stats # Use 'inference/' prefix for all metrics to separate from training metrics metrics = { - 'inference/inference_step': int(self.inference_step_offset + int(self.step_count)), + 'inference/inference_step': int(self.inference_step_offset + int(step_count)), 'inference/step_time_s': float(step_time), 'inference/waiting_queue_len': int(len(self.waiting_request_ids)), 'inference/total_requests_dict_size': int(len(self.requests)), } # Add KV stats with inference/ prefix # Convert utilization metrics from 0-1 range to 0-100 percentage range for better visualization - for key, value in kv_stats.items(): + for key, value in context_state["kv_stats"].items(): if 'utilization' in key: # Convert to percentage (0-100) and group under kvcache_utilization metrics[f'inference/{key}'] = float(value * 100.0) @@ -860,15 +1130,16 @@ async def async_step( # Print context state. if verbose: - context = self.context mem = torch.cuda.memory_stats() - step_type = "decode" if is_decode_only else "non-decode" + step_type = "decode" if context_state["is_decode_only"] else "non-decode" output_str = ( - "* step %d | %s ... time: %.3f%s ... " - "reqs: %d [ gtd %d, active %d, paused %d, finished %d ] ... " + "* rank %d | step %d | %s ... time: %.3f%s ... " + "reqs: a %d/%d, p %d/%d, w %d, f %d ... " + "blocks: a %d/%d, p %d/%d ... " "mem: tensors %d, alloc %.1f gb, res %.1f gb." % ( - self.step_count, + self.rank, + step_count, datetime.now().strftime("%H:%M:%S"), step_time, ( @@ -877,44 +1148,71 @@ async def async_step( step_type, ( "DIM %d:%d" - % (context.padded_active_token_count, prev_active_token_count) - if self.context.using_cuda_graph_this_step() + % ( + context_state["padded_active_token_count"], + context_state["active_token_count"], + ) + if context_state["using_cuda_graph_this_step"] else "OFF" ), ) ), - prev_total_request_count, - context.gtd_request_count, - prev_total_request_count - prev_paused_request_count, - prev_paused_request_count, - self.finished_request_count, + context_state["total_request_count"] - context_state["paused_request_count"], + context_state["total_active_block_count"], + context_state["paused_request_count"], + context_state["total_paused_block_count"], + context_state["waiting_request_count"], + context_state["finished_request_count"], + context_state["total_active_used_blocks"], + context_state["total_active_block_count"], + context_state["total_paused_used_blocks"], + context_state["total_paused_block_count"], mem["allocation.all.current"], mem["allocated_bytes.all.current"] / (1024**3), mem["reserved_bytes.all.current"] / (1024**3), ) ) - if prev_is_decode_only: + if context_state["is_decode_only"]: output_str = f"\033[94m{output_str}\033[0m" logging.info(output_str) - self.step_count += 1 - - range_pop() return { - "active_requests": active_requests, - "finished_requests": finished_requests, + "active_request_ids": active_request_ids, + "finished_request_records": finished_request_records, "step_time": step_time, "cuda_graph_request_count": cuda_graph_request_count, } + async def async_step( + self, *, verbose: bool = False + ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: + """ + Wrapper for controller.generate_output_tokens_dynamic_batch(), to + match vLLM API. Uses `asyncio` for continuous generation which allows this + method to sleep and wake up when new requests are available. + + Args: + verbose (bool): Whether to run in verbose mode. + + Returns: + A tuple comprised of: + 1. Requests that ran in the last step and are still active. + 2. Requests that ran in the last step and have now finished. + 3. The step time in seconds. + """ + last_step_data = await self.async_forward() + ret = await self.async_bookkeep(*last_step_data, verbose=verbose) + # Keep for compatibility with current test suite. + return ret + def step_modern( - self, *, verbose: Optional[bool] = False + self, *, verbose: bool = False ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: """Synchronous wrapper for `self.async_step`.""" return self._loop.run_until_complete(self.async_step(verbose=verbose)) def step_legacy( - self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False + self, sampling_params: SamplingParams, *, verbose: bool = False ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: """Synchronous wrapper for `self.async_step`.""" warnings.warn( @@ -922,10 +1220,10 @@ def step_legacy( "0.16. Please use `step_modern()` going forward, which will eventually " "be renamed to `step()`." ) - result = self._loop.run_until_complete( - self.async_step(sampling_params=sampling_params, verbose=verbose) - ) - return (result["active_requests"], result["finished_requests"], result["step_time"]) + result = self._loop.run_until_complete(self.async_step(verbose=verbose)) + active_requests = [self.get_request(i) for i in result["active_request_ids"]] + finished_requests = [r.merge() for r in result["finished_request_records"]] + return active_requests, finished_requests, result["step_time"] # For backwards compatibility, point `step()` to `step_legacy()`. Starting in # `megatron-core` 0.16, `step_modern()` will be renamed to `step()`. @@ -940,39 +1238,40 @@ def generate( request_id = int(next(self.request_counter)) _ = self.add_request(request_id, prompt, sampling_params) - finished_requests_list = [] + finished_request_records_list = [] while self.has_unfinished_requests(): result = self.step_modern() - finished_requests_list.extend(result["finished_requests"]) + finished_request_records_list.extend(result["finished_request_records"]) - # Ensure requests are returned in the same order they were passed in - finished_requests_list.sort(key=lambda x: x.request_id) + # Ensure requests are returned in the same order they were passed in. + finished_request_records_list.sort(key=lambda r: r.request_id) - return finished_requests_list + return finished_request_records_list def schedule_requests(self) -> int: """Drains the ZMQ socket for a batch of requests and adds them to the engine. This method is a collective and synchronous operation that must be called - by all ranks in a Tensor Parallel (TP) group at the same time. It ensures + by all ranks in a Model Parallel (MP) group at the same time. It ensures that all ranks process the exact same batch of incoming requests and control signals. The synchronization works as follows: - 1. The TP rank 0 drains all pending messages from its subscriber socket + 1. The MP rank 0 drains all pending messages from its subscriber socket in a non-blocking manner. - 2. TP rank 0 then broadcasts the number of messages it received to all other - ranks in its TP group using a dedicated publisher socket. - 3. The other TP ranks wait to receive this count, and then receive exactly + 2. MP rank 0 then broadcasts the number of messages it received to all other + ranks in its MP group using a dedicated publisher socket. + 3. The other MP ranks wait to receive this count, and then receive exactly that many messages from their subscriber sockets. Once all ranks have the same batch of messages, they are unpacked and processed. New requests are added to the engine's queue, and control - signals (PAUSE, STOP, UNPAUSE) update the engine's internal state. + signals (PAUSE, UNPAUSE, SUSPEND, RESUME, STOP) update the engine's + internal state. Note: This function is synchronous and must be called collectively by all - ranks in a TP group. It should not be launched in a separate coroutine + ranks in a MP group. It should not be launched in a separate coroutine to ensure all ranks execute it in lockstep before proceeding to the next engine step. @@ -980,10 +1279,9 @@ def schedule_requests(self) -> int: int: The number of messages that were received and processed in this batch. """ - rank = parallel_state.get_tensor_model_parallel_rank() torch.cuda.nvtx.range_push("drain_zmq_socket") all_messages = [] - if rank == 0: + if self.is_mp_coordinator: while True: try: # Receive messages in a non-blocking way. @@ -995,37 +1293,72 @@ def schedule_requests(self) -> int: # First publish the number of messages to dequeue. # This is important because we want all tensor parallel ranks # to dequeue the same number of messages. - self.tensor_parallel_num_msgs_publisher_socket.send( + self.model_parallel_num_msgs_publisher_socket.send( struct.pack('!i', messages_to_dequeue) ) - # Now publish the actual messages to all tensor parallel ranks - for message in all_messages: - self.tensor_parallel_publisher_socket.send(message) + # Now publish the actual messages to all model parallel ranks + if messages_to_dequeue > 0: + self.model_parallel_publisher_socket.send_multipart(all_messages) else: - # First, receive the number of messages to dequeue from tp-rank 0 + # First, receive the number of messages to dequeue from mp-rank 0 messages_to_dequeue = struct.unpack( - '!i', self.tensor_parallel_num_msgs_subscriber_socket.recv() + '!i', self.model_parallel_num_msgs_subscriber_socket.recv() )[0] # Now, dequeue the same number of messages from the subscriber socket. # Note that these receives are blocking, because the messages # are guaranteed to be available after the tp-rank 0 has sent them. - for _ in range(messages_to_dequeue): - all_messages.append(self.tensor_parallel_subscriber_socket.recv()) + if messages_to_dequeue > 0: + all_messages = self.model_parallel_subscriber_socket.recv_multipart() + else: + all_messages = [] torch.cuda.nvtx.range_pop() for message in all_messages: data = msgpack.unpackb(message, raw=False) header = Headers(data[0]) + + if self.received_stop: + assert ( + header == Headers.STOP_ACK + ), "Engine is shutting down. No other messages allowed except STOP_ACK." + if header == Headers.SUBMIT_REQUEST: request_id, prompt, sampling_params = data[1:] sampling_params = SamplingParams.deserialize(sampling_params) self.add_request(request_id, prompt, sampling_params) elif header == Headers.PAUSE: - self.paused = True + # Pause thyself. + self.received_pause = True + self.running.clear() + # Send PAUSE_ACK back to coordinator. + if self.is_mp_coordinator: + payload = msgpack.packb([Headers.PAUSE_ACK.value], use_bin_type=True) + self.socket_for_receiving_requests.send(payload) elif header == Headers.STOP: - self.stopped = True + # Stop thyself. + self.received_stop = True + self.running.clear() + # Send STOP_ACK back to coordinator. + if self.is_mp_coordinator: + payload = msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True) + self.socket_for_receiving_requests.send(payload) + elif header == Headers.PAUSE_ACK: + self.paused.set() + self.received_pause = False + elif header == Headers.STOP_ACK: + self.stopped.set() + self.stop() elif header == Headers.UNPAUSE: - self.paused = False + self.paused.clear() + self.running.set() + elif header == Headers.SUSPEND: + self.suspend_signal = True + elif header == Headers.RESUME: + self.suspend_signal = False + elif header == Headers.STOP: + self.stopped = True + else: + raise UnknownHeaderError(header) return len(all_messages) @@ -1043,7 +1376,6 @@ def stop(self): for socket in self.zmq_sockets: socket.close() self.zmq_context.term() - parallel_state.destroy_model_parallel() @trace_async_exceptions async def run_engine( @@ -1051,15 +1383,20 @@ async def run_engine( ): """Continually steps the engine asynchronously.""" self._loop = get_asyncio_loop(loop) + self.use_coordinator = False try: while True: # Wait until there are active requests before proceeding. async with self._cond: await self._cond.wait_for( - lambda: self.context.get_active_request_count() > 0 - or self.waiting_request_ids + lambda: ( + not self.is_suspended + and ( + self.context.get_active_request_count() > 0 + or self.waiting_request_ids + ) + ) ) - await self.async_step(verbose=verbose) except asyncio.CancelledError: pass @@ -1070,14 +1407,14 @@ async def run_engine_with_coordinator( ): """Continually steps the engine asynchronously.""" self._loop = get_asyncio_loop(loop) + self.use_coordinator = True try: while True: self.schedule_requests() - if self.stopped: - self.stop() - return + if self.stopped.is_set(): + break - # for the cases below (engine is paused or no active requests), + # for the cases below (no active requests, or undergoing a state-change) # do not use asyncio.sleep(0) # as tp-rank=0 will flood the num_messages publisher # with "0" repeatedly. This causes some packets to drop. @@ -1089,10 +1426,20 @@ async def run_engine_with_coordinator( # todo [Siddharth]: Can this hardcoded sleep be avoided # with asyncio zmq sockets? - if self.paused: + if self.paused.is_set() or self.received_pause or self.received_stop: + await asyncio.sleep(0.02) + continue + + # Suspend, resume. + if self.suspend_signal: + self.suspend() await asyncio.sleep(0.02) continue + else: + self.resume() + + # No requests. if ( self.context.get_active_request_count() == 0 and len(self.waiting_request_ids) == 0 @@ -1100,25 +1447,7 @@ async def run_engine_with_coordinator( await asyncio.sleep(0.02) continue - engine_output = await self.async_step(verbose=verbose) - - is_tp0_and_pp0 = ( - parallel_state.get_tensor_model_parallel_rank() == 0 - and parallel_state.get_pipeline_model_parallel_rank() == 0 - ) - if ( - is_tp0_and_pp0 - and engine_output is not None - and engine_output["finished_requests"] - ): - payload = msgpack.packb( - [ - Headers.ENGINE_REPLY.value, - [r.serializable() for r in engine_output["finished_requests"]], - ], - use_bin_type=True, - ) - self.socket_for_receiving_requests.send(payload) + await self.async_step(verbose=verbose) except asyncio.CancelledError: pass diff --git a/megatron/core/inference/engines/static_engine.py b/megatron/core/inference/engines/static_engine.py index dc86eb775f9..d4c61965d2b 100644 --- a/megatron/core/inference/engines/static_engine.py +++ b/megatron/core/inference/engines/static_engine.py @@ -17,7 +17,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.utils import get_asyncio_loop +from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model try: from tqdm import tqdm @@ -93,6 +93,10 @@ def __init__( # Store original context in case we need to fall back to legacy static engine original_context = text_generation_controller.inference_wrapped_model.inference_context + mamba_inference_state_config = get_mamba_inference_state_config_from_model( + text_generation_controller.inference_wrapped_model.model + ) + try: if not legacy: dynamic_context = DynamicInferenceContext.from_config( @@ -101,16 +105,17 @@ def __init__( max_batch_size=max_batch_size, buffer_size_gb=buffer_size_gb, num_cuda_graphs=1, + mamba_inference_state_config=mamba_inference_state_config, ) self.controller.inference_wrapped_model.inference_context = dynamic_context self.controller.inference_wrapped_model.prep_model_for_inference() + self.controller._init_dynamic_sampling_tensors() self.dynamic_engine = DynamicInferenceEngine( controller=self.controller, random_seed=self.random_seed, context=dynamic_context, enable_cuda_graph=True, - static_sampling=True, ) except Exception as e: # Get exception details for better debugging diff --git a/megatron/core/inference/headers.py b/megatron/core/inference/headers.py index ff894cc1918..a22d1328679 100644 --- a/megatron/core/inference/headers.py +++ b/megatron/core/inference/headers.py @@ -1,6 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from enum import Enum +from enum import Enum, auto class Headers(Enum): @@ -8,10 +8,21 @@ class Headers(Enum): Enum representing headers used for communication with the inference-coordinator. """ - CONNECT = 0 - ACK = 1 - SUBMIT_REQUEST = 2 - ENGINE_REPLY = 3 - PAUSE = 4 - UNPAUSE = 5 - STOP = 6 + CONNECT = auto() + CONNECT_ACK = auto() + SUBMIT_REQUEST = auto() + ENGINE_REPLY = auto() + PAUSE = auto() + PAUSE_ACK = auto() + UNPAUSE = auto() + SUSPEND = auto() + RESUME = auto() + STOP = auto() + STOP_ACK = auto() + + +class UnknownHeaderError(Exception): + """A signal with an unrecognized header was received by the coordinator.""" + + def __init_(self, header): + super().__init__(f"specialize for {header}.") diff --git a/megatron/core/inference/inference_client.py b/megatron/core/inference/inference_client.py index 53daac091b0..8a19e226c46 100644 --- a/megatron/core/inference/inference_client.py +++ b/megatron/core/inference/inference_client.py @@ -4,9 +4,9 @@ import logging import os import time -from typing import List, Union +from typing import Awaitable, List, Optional, Union -from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.inference_request import DynamicInferenceRequestRecord from megatron.core.inference.sampling_params import SamplingParams from megatron.core.utils import get_asyncio_loop, trace_async_exceptions @@ -73,6 +73,11 @@ def __init__(self, inference_coordinator_port: int): inference_coordinator_address = os.getenv('MASTER_ADDR', '127.0.0.1') socket.connect(f"tcp://{inference_coordinator_address}:{inference_coordinator_port}") + self._loop = None + self.running = asyncio.Event() + self.paused = asyncio.Event() + self.stopped = asyncio.Event() + self.socket = socket self.completion_futures = {} self.request_submission_times = {} @@ -92,41 +97,55 @@ def add_request( prompt (str): The input prompt to send to the language model. sampling_params: An object containing the sampling parameters for text generation (e.g., temperature, top_p). It must have a - `serializable()` method. + `serialize()` method. Returns: asyncio.Future: A future that will be resolved with a - `DynamicInferenceRequest` object containing the completed result. + `DynamicInferenceRequestRecord` object containing the completed result. """ + if not self.running.is_set(): + raise RuntimeError("InferenceClient is not currently running.") request_id = self.next_request_id self.next_request_id += 1 - payload = [Headers.SUBMIT_REQUEST.value, request_id, prompt, sampling_params.serializable()] + payload = [Headers.SUBMIT_REQUEST.value, request_id, prompt, sampling_params.serialize()] payload_serialized = msgpack.packb(payload, use_bin_type=True) self.socket.send(payload_serialized) assert request_id not in self.completion_futures - self.completion_futures[request_id] = get_asyncio_loop().create_future() + self.completion_futures[request_id] = self._loop.create_future() self.request_submission_times[request_id] = time.perf_counter() return self.completion_futures[request_id] @trace_async_exceptions - async def _listen_for_completed_requests(self): + async def _recv_task(self): """ Listens for completed inference requests from the coordinator. This coroutine runs in an infinite loop, continuously polling the socket - for replies. When a reply is received, it unpacks the message, finds the + for data. + When a request reply is received, it unpacks the message, finds the corresponding Future using the request ID, and sets the result. + Other control packets are handled appropriately. This method is started as a background task by the `start()` method. """ while True: try: - request_id, reply = msgpack.unpackb(self.socket.recv(flags=zmq.NOBLOCK), raw=False) - reply['latency'] = time.perf_counter() - self.request_submission_times.pop( - request_id - ) - completion_future = self.completion_futures.pop(request_id) - completion_future.set_result(DynamicInferenceRequest.deserialize(reply)) + data = msgpack.unpackb(self.socket.recv(flags=zmq.NOBLOCK), raw=False) + header = Headers(data[0]) + if header == Headers.ENGINE_REPLY: + request_id, reply = data[1:] + reply['latency'] = time.perf_counter() - self.request_submission_times.pop( + request_id + ) + completion_future = self.completion_futures.pop(request_id) + if completion_future.done(): + logging.warning(f"Client: The future for {request_id} has been cancelled!") + continue + completion_future.set_result(DynamicInferenceRequestRecord.deserialize(reply)) + elif header == Headers.PAUSE_ACK: + self.paused.set() + elif header == Headers.STOP_ACK: + self.stopped.set() except zmq.Again: await asyncio.sleep(0.005) continue @@ -137,15 +156,15 @@ def _connect_with_inference_coordinator(self): """ Performs the initial handshake with the inference coordinator. - Sends a CONNECT signal and waits for an ACK reply to ensure the + Sends a CONNECT signal and waits for a CONNECT_ACK reply to ensure the connection is established and acknowledged by the coordinator. """ payload = [Headers.CONNECT.value] self.socket.send(msgpack.packb(payload, use_bin_type=True)) reply = msgpack.unpackb(self.socket.recv(), raw=False)[0] - assert Headers(reply) == Headers.ACK + assert Headers(reply) == Headers.CONNECT_ACK - async def start(self): + async def start(self, loop: Optional[asyncio.AbstractEventLoop] = None): """ Connects to the coordinator and starts the background listener task. @@ -154,8 +173,12 @@ async def start(self): coroutine. """ logging.info("Client: Connecting to InferenceCoordinator...") + self._loop = get_asyncio_loop(loop) + self.running.set() + self.paused.clear() + self.stopped.clear() self._connect_with_inference_coordinator() - self.listener_task = asyncio.create_task(self._listen_for_completed_requests()) + self.listener_task = self._loop.create_task(self._recv_task()) def _send_signal_to_engines(self, signal): """ @@ -168,17 +191,52 @@ def _send_signal_to_engines(self, signal): payload_serialized = msgpack.packb(payload, use_bin_type=True) self.socket.send(payload_serialized) - def pause_engines(self): + def pause_engines(self) -> Awaitable: + """Sends a signal to pause all inference engines. + + The signal first propagates thru the coordinator to all engines. + All engines acknowledge this signal and clear their `running` flags. + The coordinator awaits all acknowledgements before forwarding the ACK + back to the client, as well as to the engines. + The engines set their `paused` flags upon seeing the ACK. + + Returns: + Awaitable: An awaitable that resolves when all engines have paused. + """ + self._send_signal_to_engines(Headers.PAUSE) + return self.paused.wait() + + def unpause_engines(self) -> None: + """Sends a signal to unpause all inference engines.""" + self.paused.clear() + self.running.set() + self._send_signal_to_engines(Headers.UNPAUSE) + + def suspend_engines(self): """Sends a signal to pause all inference engines.""" self._send_signal_to_engines(Headers.PAUSE) + self._send_signal_to_engines(Headers.SUSPEND) - def unpause_engines(self): + def resume_engines(self): """Sends a signal to unpause all inference engines.""" + self._send_signal_to_engines(Headers.RESUME) self._send_signal_to_engines(Headers.UNPAUSE) - def stop_engines(self): - """Sends a signal to gracefully stop all inference engines.""" + def stop_engines(self) -> Awaitable: + """Sends a signal to gracefully stop all inference engines. + + The signal first propagates thru the coordinator to all engines. + All engines acknowledge this signal and clear their `running` flags. + The coordinator awaits all acknowledgements before forwarding the ACK + back to the client, as well as to the engines. + The engines set their `stopped` flags upon seeing the ACK. + + Returns: + Awaitable: An awaitable that resolves when all engines have stopped. + """ self._send_signal_to_engines(Headers.STOP) + self.running.clear() + return self.stopped.wait() def stop(self): """ diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py index 21ff7786d6a..b58fac1b281 100644 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -11,10 +11,18 @@ import torch from megatron.core.inference.sampling_params import SamplingParams +from megatron.core.tokenizers import MegatronTokenizer -def serialize_tensor(tensor): - """Serialize tensor to bytes.""" +def serialize_tensor(tensor: torch.Tensor) -> bytes: + """Serialize tensor to bytes. + + Args: + tensor (Tensor): Tensor. + + Returns: + (bytes) Byte representation of tensor. + """ buffer = io.BytesIO() torch.save(tensor, buffer) buffer.seek(0) @@ -22,8 +30,15 @@ def serialize_tensor(tensor): return tensor_bytes -def deserialize_tensor(tensor_bytes): - """Deserialize tensor from bytes.""" +def deserialize_tensor(tensor_bytes: bytes) -> torch.Tensor: + """Deserialize tensor from bytes. + + Args: + tensor_bytes (bytes): Byte representation of tensor. + + Returns: + (Tensor) Tensor. + """ buffer = io.BytesIO(tensor_bytes) tensor = torch.load(buffer) return tensor @@ -76,11 +91,12 @@ def __post_init__(self): ) self.sampling_params = self.inference_parameters - def serializable(self): - """ - Converts the instance into a serializable dictionary. + def serialize(self) -> dict: + """Converts the instance into a serializable dictionary. + Returns: - dict: A dictionary representation of the instance suitable for serialization. + (dict) A dictionary representation of the instance suitable for + serialization. """ # Dataclass to dict. @@ -169,11 +185,12 @@ def __str__(self): payload_str = "" if self.payload is None else f", {type(self.payload).__name__}" return f"[{self.timestamp:.3f}] {self.type.name}{payload_str}" - def serialize(self): - """ - Converts the instance into a serializable dictionary. + def serialize(self) -> dict: + """Converts the instance into a serializable dictionary. + Returns: - dict: A dictionary representation of the instance suitable for serialization. + (dict) A dictionary representation of the instance suitable for + serialization. """ # Dataclass to dict. @@ -253,13 +270,14 @@ def __str__(self): ) ) - def serializable(self): - """ - Converts the instance into a serializable dictionary. + def serialize(self): + """Converts the instance into a serializable dictionary. + Returns: - dict: A dictionary representation of the instance suitable for serialization. + (dict) A dictionary representation of the instance suitable for + serialization. """ - obj = super().serializable() + obj = super().serialize() obj["events"] = [e.serialize() for e in self.events] return obj @@ -277,6 +295,39 @@ def deserialize(cls, obj: dict) -> "DynamicInferenceRequest": request.events = [DynamicInferenceEvent.deserialize(e) for e in obj["events"]] return request + @property + def tracked_metadata(self) -> List[Any]: + """Obtain an ordered list of all request metadata to be tracked by the context. + + This consists of metadata that is used to inform text generation. + The values of such fields are tensorized and kept aligned with the current active batch. + + Note that while the general request object is mutable, this metadata is + inherently assumed to remain immutable once the request becomes active. + """ + sp = self.sampling_params + if sp.termination_id is None: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + warnings.warn( + f"DynamicInferenceRequest {self.request_id} has no termination_id set " + "in its sampling_params. Defaulting to -1." + ) + sp.termination_id = -1 + return [getattr(sp, field) for field in self.get_metadata_labels().keys()] + + @staticmethod + def get_metadata_labels() -> Dict[str, int]: + """Provides human-readable labels for the tracked metadata fields.""" + ret = [ + "temperature", + "top_k", + "top_p", + "termination_id", + "return_log_probs", + "skip_prompt_log_probs", + ] + return {k: v for v, k in enumerate(ret)} + def add_event(self, type: DynamicInferenceEventType, payload: Optional[Any] = None) -> None: """Add event.""" self.events.append(DynamicInferenceEvent(type=type, payload=payload)) @@ -314,6 +365,158 @@ def failed(self) -> bool: return self.status == Status.FAILED +@dataclass(kw_only=True) +class DynamicInferenceRequestRecord: + """History of DynamicInferenceRequest objects over multiple suspend and + resumes.""" + + requests: list[DynamicInferenceRequest] = field(default_factory=list) + latency: Optional[float] = None + + @classmethod + def from_request(cls, request: DynamicInferenceRequest) -> "DynamicInferenceRequestRecord": + """Initialize record from a single request. + + Args: + request (DynamicInferenceRequest): Initial request. + + Returns: + (DynamicInferenceRequestRecord) A record. + """ + record = cls() + record.requests.append(request) + return record + + def __getitem__(self, idx: int) -> DynamicInferenceRequest: + """Get request by index. + + Args: + idx (int): Request index. + + Returns: + (DynamicInferenceRequest) Request object. + """ + return self.requests[idx] + + @property + def request_id(self) -> int: + """Get request id. + + Returns: + (int) Request id. + """ + return self.requests[0].request_id + + def suspend(self, tokenizer: MegatronTokenizer): + """Suspend request by storing references to previous prompt, generations, + and sampling params. + + Args: + tokenizer (MegatronTokenizer): The tokenizer. + """ + + old_request = self[-1] + + # New prompt (concatenate prompt + generated tokens). + new_prompt_tokens = torch.cat( + ( + old_request.prompt_tokens, + torch.tensor( + old_request.generated_tokens, + dtype=old_request.prompt_tokens.dtype, + device=old_request.prompt_tokens.device, + ), + ), + dim=0, + ) + new_prompt_str = tokenizer.detokenize(new_prompt_tokens.tolist()) + + # New sampling params. + new_sampling_params = SamplingParams( + **{ + **asdict(old_request.sampling_params), + "num_tokens_to_generate": ( + old_request.sampling_params.num_tokens_to_generate + - len(old_request.generated_tokens) + ), + } + ) + + # New request. + new_request = DynamicInferenceRequest( + request_id=old_request.request_id, + prompt=new_prompt_str, + prompt_tokens=new_prompt_tokens, + sampling_params=new_sampling_params, + ) + self.requests.append(new_request) + + def merge(self, tokenizer: MegatronTokenizer) -> DynamicInferenceRequest: + """Merge requests into a single suspend-agnostic request object. + + Args: + tokenizer (MegatronTokenizer): The tokenizer. + + Returns: + (DynamicInferenceRequest) Merged request. + """ + + def merge_lists(key): + if getattr(self.requests[0], key) is None: + return None + else: + return [v for r in self.requests for v in getattr(r, key)] + + prompt_tokens = self.requests[0].prompt_tokens + generated_tokens = merge_lists("generated_tokens") + + # Merged request. + request = DynamicInferenceRequest( + request_id=self.requests[0].request_id, + prompt=tokenizer.detokenize(prompt_tokens.tolist()), + prompt_tokens=prompt_tokens, + prompt_log_probs=self.requests[0].prompt_log_probs, + prompt_top_n_logprobs=self.requests[0].prompt_top_n_logprobs, + generated_text=tokenizer.detokenize(generated_tokens), + generated_tokens=generated_tokens, + generated_length=len(generated_tokens), + generated_log_probs=merge_lists("generated_log_probs"), + generated_top_n_logprobs=merge_lists("generated_top_n_logprobs"), + sampling_params=self.requests[0].sampling_params, + tpot=merge_lists("tpot"), + status=self.requests[-1].status, + latency=self.latency, + events=merge_lists("events"), + ) + + return request + + def serialize(self) -> dict: + """Converts the instance into a serializable dictionary. + + Returns: + (dict) A dictionary representation of the instance suitable for + serialization. + """ + obj = asdict(self) + obj["requests"] = [r.serialize() for r in self.requests] + return obj + + @classmethod + def deserialize(cls, obj: dict) -> "DynamicInferenceRequestRecord": + """Deserialize record. + + Args: + obj (dict): Serialized record data. + + Returns: + (DynamicInferenceRequestRecord) Deserialized record. + """ + request = cls(**obj) + request.requests = [DynamicInferenceRequest.deserialize(r) for r in obj["requests"]] + return request + + @dataclass(kw_only=True) class VLMInferenceRequest(InferenceRequest): """Class for a VLM inference request""" diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py index e215b3f134b..d85b2816c80 100644 --- a/megatron/core/inference/sampling_params.py +++ b/megatron/core/inference/sampling_params.py @@ -44,7 +44,7 @@ def add_attributes(self, attribute_value_pair: dict): for key, value in attribute_value_pair.items(): setattr(self, key, value) - def serializable(self) -> dict: + def serialize(self) -> dict: """Return a dictionary that is msgpack-serializable.""" return self.__dict__.copy() diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 2bda1425710..0aed3df079e 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -23,7 +23,11 @@ MaxSequenceLengthOverflowError, WarmupEngineMode, ) -from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.inference_request import ( + DynamicInferenceRequest, + InferenceRequest, + Status, +) from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) @@ -74,6 +78,35 @@ def __init__( self.sampling_rng = torch.Generator(device=torch.cuda.current_device()) self.sampling_rng.manual_seed(model_config.inference_sampling_seed) + if self.inference_wrapped_model.inference_context.is_dynamic_batching(): + self._init_dynamic_sampling_tensors() + + def _init_dynamic_sampling_tensors(self): + """Initialize tensors needed for dynamic sampling.""" + context = self.inference_wrapped_model.inference_context + max_requests = context.max_total_requests + + device = torch.cuda.current_device() + logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype + # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. + vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size + + # Initialize bookkeeping tensors. + self.sampling_logits_cuda = torch.empty( + max_requests, vocab_size, dtype=logits_dtype, device=device + ) + self.sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) + + self.temperature_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float) + self.top_k_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.int32) + self.top_p_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float) + self.termination_id_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) + self.return_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device) + self.skip_prompt_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device) + + # Used for inefficient torch sampling. + self.torch_sampling_buckets: List[Tensor] = [] + def tokenize_prompt(self, prompt: str, add_BOS: bool = False) -> List[int]: """Utility to tokenize the input prompts. @@ -177,16 +210,14 @@ def detokenize_generations( return text, prompts_plus_generations_segments - def sample_from_logits( + def _torch_sampling_func( self, last_token_logits: torch.Tensor, - sampling_params: Optional[SamplingParams] = None, + temperature: float, + top_k: int, + top_p: float, vocab_size: Optional[int] = None, - generation_started: Optional[torch.Tensor] = None, - top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None, - logits: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: + ): """Samples the logits to generate outputs Given the logits of the last token, this function samples it @@ -196,26 +227,15 @@ def sample_from_logits( Args: last_token_logits (torch.Tensor): The last token logits. A tensor of - size [batch_size, vocab_size] - sampling_params (SamplingParams): The parameters to use for inference. - vocab_size (int): Obtained from the tokenizer. Defaults to None - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True - indicates the prompt at that index has started generating tokens. - top_n_logprobs_dict (top_n_logprobs_dict): The dict to be updated + size [batch_size, vocab_size]. + temperature (float): The temperature to use for sampling. + top_k (int): The top-k value to use for sampling. + top_p (float): The top-p value to use for sampling. + vocab_size (int): Obtained from the tokenizer. Defaults to None. Returns: sampled_logits (torch.Tensor): 1D tensor with [batch_size] elements - top_n_logprobs_this_step (torch.return_types.topk): a topk tensor with values as logits - and indices as the top k elements. None if sampling params top_n_logprobs is 0. """ - - if kwargs.get("common_inference_params"): - sampling_params = kwargs["common_inference_params"] - - top_p = sampling_params.top_p - top_k = sampling_params.top_k - temperature = sampling_params.temperature - assert isinstance(top_p, float) assert isinstance(top_k, int) assert not (top_k > 0 and top_p > 0.0), "Cannot have top-p and top-k both greater than zero" @@ -246,53 +266,6 @@ def modify_logits_for_top_p_filtering(logits, top_p): filter_ = filter_.scatter(1, sorted_indices, filter_) logits.masked_fill_(filter_, float("-Inf")) - if sampling_params.top_n_logprobs > 0: - # NOTE : This thing can also be clubbed with where we compute log probs - # when --return-log-probs is enabled. This is just more efficient - assert generation_started is not None - if logits is None: - batch_size = last_token_logits.shape[0] - last_token_log_probs = F.log_softmax(last_token_logits, dim=1).to(torch.float32) - top_n_logits_this_step = torch.topk( - last_token_log_probs, k=sampling_params.top_n_logprobs - ) - top_n_logprobs_this_step = top_n_logits_this_step.values.cpu() - top_n_logprobs_indices = top_n_logits_this_step.indices.cpu() - - # If we return prompt top_n_log_probs then we always append to the - # logprobs dict. Otherwise we only append for generated tokens. - if sampling_params.return_prompt_top_n_logprobs: - mask = torch.ones(batch_size, dtype=torch.bool) - else: - mask = generation_started.cpu() - - self._update_top_n_logprobs_dict( - top_n_logprobs_this_step, top_n_logprobs_indices, mask, top_n_logprobs_dict - ) - else: - assert sampling_params.return_prompt_top_n_logprobs - - # Compute the prompt logprobs - batch_size, seq_length, _ = logits.shape - log_probs = F.log_softmax(logits, dim=2).to(torch.float32) - top_n_logits_this_step = torch.topk(log_probs, k=sampling_params.top_n_logprobs) - - # Move the token dimension to the front and then add each token logprobs - # individually for every request in the batch - top_n_logprobs_this_step = top_n_logits_this_step.values.permute(1, 0, 2).cpu() - top_n_logprobs_indices = top_n_logits_this_step.indices.permute(1, 0, 2).cpu() - - # We append to the logprobs dict for every prompt token - mask = torch.ones(batch_size, dtype=torch.bool) - - for i in range(seq_length): - self._update_top_n_logprobs_dict( - top_n_logprobs_this_step[i], - top_n_logprobs_indices[i], - mask, - top_n_logprobs_dict, - ) - # Greedy sampling if top_k == 1: sampled_logits = torch.argmax(last_token_logits, dim=-1) @@ -322,10 +295,10 @@ def modify_logits_for_top_p_filtering(logits, top_p): return sampled_logits - def sample_from_dynamic_logits( + def sample_from_logits( self, last_token_logits: torch.Tensor, - active_sampling_map: List[Tuple[SamplingParams, List[int]]], + sampling_params: Optional[SamplingParams] = None, vocab_size: Optional[int] = None, generation_started: Optional[torch.Tensor] = None, top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None, @@ -335,16 +308,14 @@ def sample_from_dynamic_logits( """Samples the logits to generate outputs Given the logits of the last token, this function samples it - according to the parameters defined in active_sampling_map + according to the parameters defined in sampling_params and returns the samples. If sampling parameters top_n_logprobs > 0 at each step it also updates the top_n_logprobs dict. Args: last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] - active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples - matching each unique set of sampling params to the context array indices - of the corresponding active requests. + sampling_params (SamplingParams): The parameters to use for inference. vocab_size (int): Obtained from the tokenizer. Defaults to None generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. @@ -352,29 +323,65 @@ def sample_from_dynamic_logits( Returns: sampled_logits (torch.Tensor): 1D tensor with [batch_size] elements - termination_id (torch.Tensor): Tensor of shape [batch_size] with termination ids top_n_logprobs_this_step (torch.return_types.topk): a topk tensor with values as logits and indices as the top k elements. None if sampling params top_n_logprobs is 0. """ - batch_size = last_token_logits.size(0) - new_sample = torch.zeros(batch_size, dtype=torch.int64, device=last_token_logits.device) - termination_id = torch.zeros_like(new_sample, dtype=torch.int64) - - for sampling_params, mask in active_sampling_map: - # Filter out indices that are out of bounds for the current batch - valid_mask = [i for i in mask if i < batch_size] - if valid_mask: - new_sample[valid_mask] = self.sample_from_logits( - last_token_logits[valid_mask], - sampling_params=sampling_params, - vocab_size=vocab_size, + + if kwargs.get("common_inference_params"): + sampling_params = kwargs["common_inference_params"] + + if sampling_params.top_n_logprobs > 0: + # NOTE : This thing can also be clubbed with where we compute log probs + # when --return-log-probs is enabled. This is just more efficient + assert generation_started is not None + if logits is None: + batch_size = last_token_logits.shape[0] + last_token_log_probs = F.log_softmax(last_token_logits, dim=1).to(torch.float32) + top_n_logits_this_step = torch.topk( + last_token_log_probs, k=sampling_params.top_n_logprobs ) - if sampling_params.termination_id is not None: - termination_id[valid_mask] = sampling_params.termination_id + top_n_logprobs_this_step = top_n_logits_this_step.values.cpu() + top_n_logprobs_indices = top_n_logits_this_step.indices.cpu() + + # If we return prompt top_n_log_probs then we always append to the + # logprobs dict. Otherwise we only append for generated tokens. + if sampling_params.return_prompt_top_n_logprobs: + mask = torch.ones(batch_size, dtype=torch.bool) else: - termination_id[valid_mask] = self.tokenizer.eod + mask = generation_started.cpu() + + self._update_top_n_logprobs_dict( + top_n_logprobs_this_step, top_n_logprobs_indices, mask, top_n_logprobs_dict + ) + else: + assert sampling_params.return_prompt_top_n_logprobs + + # Compute the prompt logprobs + batch_size, seq_length, _ = logits.shape + log_probs = F.log_softmax(logits, dim=2).to(torch.float32) + top_n_logits_this_step = torch.topk(log_probs, k=sampling_params.top_n_logprobs) + + # Move the token dimension to the front and then add each token logprobs + # individually for every request in the batch + top_n_logprobs_this_step = top_n_logits_this_step.values.permute(1, 0, 2).cpu() + top_n_logprobs_indices = top_n_logits_this_step.indices.permute(1, 0, 2).cpu() - return new_sample, termination_id + # We append to the logprobs dict for every prompt token + mask = torch.ones(batch_size, dtype=torch.bool) + + for i in range(seq_length): + self._update_top_n_logprobs_dict( + top_n_logprobs_this_step[i], + top_n_logprobs_indices[i], + mask, + top_n_logprobs_dict, + ) + + top_p = sampling_params.top_p + top_k = sampling_params.top_k + temperature = sampling_params.temperature + + return self._torch_sampling_func(last_token_logits, temperature, top_k, top_p, vocab_size) def update_generation_status( self, @@ -535,10 +542,12 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) input_ids (Tensor): The input token IDs. position_ids (Tensor): The position IDs. """ + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config + context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config + active_request_count = context.total_request_count - context.paused_request_count with torch.inference_mode(): logits = self.inference_wrapped_model.run_one_forward_step( @@ -546,9 +555,8 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) ) if self.model_is_pipeline_parallel: - batch_size = context.total_request_count - context.paused_request_count logits_seq_len = ( - batch_size if materialize_only_last_token_logits else input_ids.shape[1] + active_request_count if materialize_only_last_token_logits else input_ids.shape[1] ) vocab_size = inference_wrapper_config.padded_vocab_size logits_shape = [1, logits_seq_len, vocab_size] @@ -556,8 +564,6 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape - # TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank - # and then broadcast the sampled tokens rather than broadcasting the raw logits. logits = broadcast_from_last_pipeline_stage( logits_shape, dtype=inference_wrapper_config.params_dtype, @@ -567,31 +573,95 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) return logits def _dynamic_step_sample_bookkeeping( - self, active_sampling_map: List[Tuple[SamplingParams, List[int]]] + self, + *, + backend: str = "torch", + request_metadata: Optional[Tensor] = None, + request_metadata_labels: Dict[str, int] = None, ): - """Perform bookkeeping necessary to sample logits for dynamic batching.""" - pass + """Perform bookkeeping necessary to sample logits for dynamic batching. - def _dynamic_step_sample_logits( - self, logits: Tensor, active_sampling_map: List[Tuple[SamplingParams, List[int]]] - ) -> Tensor: - """Sample logits for dynamic batching. + The ability to override the context's data is solely intended for + standalone use or testing, and should never be used in a running system. Args: - logits (Tensor): The logits from the forward step. - active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples - matching each unique set of sampling params to the context array indices - of the corresponding active requests. + backend (str): The sampling backend to use. + request_metadata (Optional[Tensor]): An override for the tensor that manages all + request metadata, such as sampling parameters. By default, this metadata is + retrieved from the context. + request_metadata_labels (Optional[Dict]): An override for the map of metadata labels + to their index in the request_metadata tensor. By default, this metadata is + retrieved from the request object. + """ + assert backend in ["torch"] + context = self.inference_wrapped_model.inference_context + + if request_metadata is None: + request_metadata = context.request_metadata[ + context.paused_request_count : context.total_request_count, : + ] + if request_metadata_labels is None: + request_metadata_labels = DynamicInferenceRequest.get_metadata_labels() + active_request_count = request_metadata.size(0) + + # Shorthand these, because the torch backend needs them. + temp = request_metadata[:, request_metadata_labels["temperature"]] + top_k = request_metadata[:, request_metadata_labels["top_k"]] + top_p = request_metadata[:, request_metadata_labels["top_p"]] + + # Copy data into relevant tensors. + self.temperature_cuda[:active_request_count].copy_(temp, non_blocking=True) + self.top_k_cuda[:active_request_count] = top_k.to( + dtype=torch.int32, copy=True, non_blocking=True + ) + self.top_p_cuda[:active_request_count].copy_(top_p, non_blocking=True) + self.termination_id_cuda[:active_request_count] = request_metadata[ + :, request_metadata_labels["termination_id"] + ].to(dtype=torch.int64, copy=True, non_blocking=True) + self.return_log_probs_cuda[:active_request_count] = request_metadata[ + :, request_metadata_labels["return_log_probs"] + ].to(dtype=torch.bool, copy=True, non_blocking=True) + self.skip_prompt_log_probs_cuda[:active_request_count] = request_metadata[ + :, request_metadata_labels["skip_prompt_log_probs"] + ].to(dtype=torch.bool, copy=True, non_blocking=True) + + if backend == "torch": + # Bucketize the core sampling parameters. + core_params = torch.stack((temp, top_k, top_p), dim=1) + _, inv_indices, cnts = torch.unique( + core_params, dim=0, return_inverse=True, return_counts=True + ) + order = torch.argsort(inv_indices, stable=True) + sampling_buckets = torch.split(order, cnts.tolist()) + # Perform the D2H sync needed by `_torch_sampling_func` here. + group_reps = torch.stack([indices[0] for indices in sampling_buckets], dim=0) + core_params_reps = core_params[group_reps].detach().cpu() + temp_reps = core_params_reps[:, 0].tolist() + top_k_reps = core_params_reps[:, 1].to(torch.int32).tolist() + top_p_reps = core_params_reps[:, 2].tolist() + # Store the buckets and their equivalence class representatives. + self.torch_sampling_buckets = ( + (sampling_buckets[idx], temp_reps[idx], top_k_reps[idx], top_p_reps[idx]) + for idx in range(len(sampling_buckets)) + ) + + def _dynamic_step_sample_logits(self, logits: Tensor, backend: str = "torch") -> Tensor: + """Sample tokens from logits for dynamic batching. + + Args: + logits (Tensor): The logits to sample from. + backend (str): The sampling backend to use. Returns: - new_sample (Tensor): The sampled tokens for each active request. - termination_id (int): The termination token IDs of each active request. + new_sample (Tensor): The sampled tokens. """ + # TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank + # and then broadcast the sampled tokens rather than broadcasting the raw logits. + assert backend in ["torch"] + context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - # Last token logits. if materialize_only_last_token_logits: # When materialize_only_last_token_logits is true, last_token_logits is @@ -599,60 +669,72 @@ def _dynamic_step_sample_logits( last_token_logits = logits.squeeze(0) else: last_token_logits = context.last_token_logits(logits) + active_request_count = last_token_logits.size(0) + # Copy last_token_logits to contiguous buffer. + self.sampling_logits_cuda[:active_request_count].copy_(last_token_logits, non_blocking=True) + + if backend == "torch": + # Concatenate the outputs once to prevent repeated small writes. + token_list = [] + indices_list = [] + + for indices, temp, top_k, top_p in self.torch_sampling_buckets: + token_list.append( + self._torch_sampling_func( + self.sampling_logits_cuda[indices, :], temp, top_k, top_p + ) + ) + indices_list.append(indices) - # Sample. - # Use padded vocab size because tokenizer vocab size might not include padding - # to nearest power of 2. - vocab_size = inference_wrapper_config.padded_vocab_size - new_sample, termination_id = self.sample_from_dynamic_logits( - last_token_logits, active_sampling_map, vocab_size=vocab_size - ) - return new_sample, termination_id + # Single write to the output tensor. + sampled_tokens = torch.cat(token_list, dim=0) + sampled_indices = torch.cat(indices_list, dim=0) + self.sampled_tokens_cuda.index_copy_(0, sampled_indices, sampled_tokens) + return self.sampled_tokens_cuda[:active_request_count].clone() - def _dynamic_step_log_probs_bookkeeping(self): + def _dynamic_step_log_probs_bookkeeping(self) -> bool: """Perform bookkeeping necessary to compute log probs for dynamic batching.""" - pass - - def _dynamic_step_calculate_log_probs( - self, - logits: Tensor, - new_sample: Tensor, - active_sampling_map: List[Tuple[SamplingParams, List[int]]], - ) -> Optional[Tensor]: context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits - log_probs = None - return_log_probs = False - for sampling_params, mask in active_sampling_map: - if sampling_params.return_log_probs: - assert ( - sampling_params.skip_prompt_log_probs - or materialize_only_last_token_logits is False - ), "Materialize only last token logits must be false for returning log probs" - return_log_probs = True + active_request_count = context.total_request_count - context.paused_request_count - if return_log_probs: - log_probs = context.calculate_log_probs( - logits, new_sample, only_last_token_logits=materialize_only_last_token_logits - ) + to_check = self.return_log_probs_cuda[:active_request_count] + to_check &= ~self.skip_prompt_log_probs_cuda[:active_request_count] - return log_probs + assert not ( + to_check.any() and materialize_only_last_token_logits + ), "Prompt log probs cannot be calculated if only last token logits are materialized." - def _dynamic_step_context_bookkeeping( - self, new_sample: Tensor, termination_id: int - ) -> Tuple[Tensor, Tensor, Tensor]: - """Update the dynamic inference context after sampling. + return self.return_log_probs_cuda[:active_request_count].any() - Args: - new_sample (Tensor): The newly sampled tokens for each active request. - termination_id (int): The token ID that indicates termination. + def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]: + """Calculate log probs from logits.""" + context = self.inference_wrapped_model.inference_context + materialize_only_last_token_logits = context.materialize_only_last_token_logits + + active_request_count = context.total_request_count - context.paused_request_count + + ret = context.calculate_log_probs( + logits, + self.sampled_tokens_cuda[:active_request_count], + only_last_token_logits=materialize_only_last_token_logits, + ) + return ret + + def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]: + """Update the dynamic inference context after sampling. Return: - Tuple[Tensor, Tensor, Tensor]: active / paused / finished request IDs. + Dict [str, Tensor]: A dictionary containing: + active_request_ids (Tensor): Current active request IDs. + newly_paused_request_ids (Tensor): Newly paused request IDs. + finished_request_ids (Tensor): Finished request IDs. """ context = self.inference_wrapped_model.inference_context + active_request_count = context.total_request_count - context.paused_request_count + # Active sequence lengths. active_request_ids = context.request_ids[ context.paused_request_count : context.total_request_count @@ -663,9 +745,10 @@ def _dynamic_step_context_bookkeeping( # Request finished if termination_id or length >= max_sequence_length. # Note: termination_id tensor has per-request termination IDs from mixed sampling - active_request_mask = (new_sample != termination_id).byte() & torch.less( - active_sequence_lengths, max_sequence_lengths - ).byte() + active_request_mask = ( + self.sampled_tokens_cuda[:active_request_count] + != self.termination_id_cuda[:active_request_count] + ).byte() & torch.less(active_sequence_lengths, max_sequence_lengths).byte() finished_idxs = ( torch.nonzero(active_request_mask == 0, as_tuple=True)[0] + context.paused_request_count ) @@ -685,16 +768,11 @@ def _dynamic_step_context_bookkeeping( @torch.inference_mode() async def async_generate_output_tokens_dynamic_batch( - self, - active_sampling_map: List[Tuple[SamplingParams, List[int]]], - skip_bookkeeping: Optional[bool] = False, + self, skip_bookkeeping: Optional[bool] = False ) -> Optional[Dict]: """Forward step the model and update the inference context. Args: - active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples - matching each unique set of sampling params to the context array indices - of the corresponding active requests. skip_bookkeeping (Optional[bool]): If true, skip the context bookkeeping step. Return: @@ -715,13 +793,12 @@ async def async_generate_output_tokens_dynamic_batch( if context.active_token_count == 0: return None - # This method only performs computations using CPU tensors. input_ids, position_ids = self._dynamic_step_context_init() + cuda_graph_request_count = ( context.padded_active_request_count if context.is_decode_only() else None ) - # This method only performs computations using GPU tensors. logits = self._dynamic_step_forward_logits(input_ids, position_ids) # This is the best place to yield control back to event loop. @@ -733,41 +810,35 @@ async def async_generate_output_tokens_dynamic_batch( # NOTE [TDE]: This will be moved once CPU and GPU methods are separated. await asyncio.sleep(0) - # This method will only perform computations using CPU tensors in the future. - self._dynamic_step_sample_bookkeeping(active_sampling_map) - # This method will only perform computations using GPU tensors in the future. - new_sample, termination_id = self._dynamic_step_sample_logits(logits, active_sampling_map) + self._dynamic_step_sample_bookkeeping() + new_sample = self._dynamic_step_sample_logits(logits) - # This method will only perform computations using CPU tensors in the future. - self._dynamic_step_log_probs_bookkeeping() - # This method will only perform computations using GPU tensors in the future. - log_probs = self._dynamic_step_calculate_log_probs(logits, new_sample, active_sampling_map) + return_log_probs = self._dynamic_step_log_probs_bookkeeping() + if return_log_probs: + log_probs = self._dynamic_step_calculate_log_probs(logits) + else: + log_probs = None - # This method only performs computations using CPU tensors. if skip_bookkeeping: - request_bookeeping = {} + request_bookkeeping = {} else: - request_bookeeping = self._dynamic_step_context_bookkeeping(new_sample, termination_id) + request_bookkeeping = self._dynamic_step_context_bookkeeping(new_sample) ret = { "sample": new_sample, "log_probs": log_probs, "cuda_graph_request_count": cuda_graph_request_count, } - ret.update(request_bookeeping) + ret.update(request_bookkeeping) return ret @torch.inference_mode() def generate_output_tokens_dynamic_batch( - self, - active_sampling_map: List[Tuple[SamplingParams, List[int]]], - loop: Optional[asyncio.AbstractEventLoop] = None, + self, loop: Optional[asyncio.AbstractEventLoop] = None ) -> Optional[Dict]: """Synchronous wrapper for `self.async_generate_output_tokens_dynamic_batch.""" loop = get_asyncio_loop(loop) - return loop.run_until_complete( - self.async_generate_output_tokens_dynamic_batch(active_sampling_map) - ) + return loop.run_until_complete(self.async_generate_output_tokens_dynamic_batch()) def _update_top_n_logprobs_dict( self, diff --git a/megatron/core/inference/unified_memory.py b/megatron/core/inference/unified_memory.py index 6e5e85ed668..e06e3022561 100644 --- a/megatron/core/inference/unified_memory.py +++ b/megatron/core/inference/unified_memory.py @@ -56,9 +56,9 @@ def compile_allocator(): EXPORT void* managed_malloc(size_t size, int device, void* stream) { (void)stream; - int cur = -1; - cudaGetDevice(&cur); - if (device != cur && device >= 0) cudaSetDevice(device); + int prev_device = -1; + cudaGetDevice(&prev_device); + if (device != prev_device && device >= 0) cudaSetDevice(device); // cudaMallocManaged allows for more memory to be allocated than the device memory size. // The cudaMemAttachGlobal flag makes the memory accessible from both host and device. @@ -69,13 +69,32 @@ def compile_allocator(): if (device >= 0) { // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory. // This is a hint that tries to prevent data from being migrated away from the device. - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device); - // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. - // Even if the memory has to be migrated away from the device, it still does not page fault. - // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, - // but there is no harm in adding this flag as well for future-proofing. - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device); + + #if CUDART_VERSION >= 13000 + // For CUDA >= 13, the cudaMemAdvise device arg is type cudaMemLocation + // instead of an int, so we setup the location and conditionally use it + // in calls to cudaMemAdvise. + cudaMemLocation location; + location.type = cudaMemLocationTypeDevice; + location.id = device; + + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, location); + + // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. + // Even if the memory has to be migrated away from the device, it still does not page fault. + // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, + // but there is no harm in adding this flag as well for future-proofing. + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, location); + #else + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device); + // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. + // Even if the memory has to be migrated away from the device, it still does not page fault. + // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, + // but there is no harm in adding this flag as well for future-proofing. + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device); + #endif } + if (device != prev_device && prev_device >= 0) cudaSetDevice(prev_device); return ptr; } @@ -100,13 +119,29 @@ def compile_allocator(): functions=[], with_cuda=True, extra_ldflags=_extra_ldflags, - verbose=False, + verbose=True, ) _so_path = Path(_mod.__file__).as_posix() _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator() _compilation_state = CompilationState.SUCCESS - except (RuntimeError, ImportError, OSError): - warnings.warn("Failed to create unified memory mempool.") + except (RuntimeError, ImportError, OSError) as e: + warnings.warn(f"Failed to create unified memory mempool: '{e}'.") + _compilation_state = CompilationState.FAILURE + + # Synchronize failure state across ranks. (For currently unknown reasons, + # one rank can show as FAILURE while the remaining ranks show as SUCCESS.) + import torch + + local_state = torch.tensor( + [_compilation_state.value], dtype=torch.uint8, device=torch.cuda.current_device() + ) + world_states = [ + torch.empty(1, dtype=torch.uint8, device=torch.cuda.current_device()) + for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(world_states, local_state) + world_states = set(s.item() for s in world_states) + if CompilationState.FAILURE.value in world_states: _compilation_state = CompilationState.FAILURE diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py index d58f3c3a652..55536a52088 100644 --- a/megatron/core/inference/utils.py +++ b/megatron/core/inference/utils.py @@ -2,6 +2,7 @@ import asyncio import multiprocessing +import sys import torch @@ -161,3 +162,57 @@ async def await_process_event( raise RuntimeError( f"Process {process.name} (pid {process.pid}) has exited unexpectedly." ) + + +# Compatibility for Python < 3.13 asyncio Queue functionality. +# This is necessary because asyncio Queues are broken in Python < 3.13. +if sys.version_info < (3, 13): + + _SHUTDOWN_SENTINEL = object() + + class asyncio_QueueShutDown(Exception): + """Compatibility exception for Python < 3.13.""" + + pass + + class asyncio_Queue(asyncio.Queue): + """An asyncio.Queue with Python 3.13 compatibility features for Python < 3.13.""" + + def __init__(self, maxsize: int = 0): + super().__init__(maxsize) + self._is_shutdown = False + + async def get(self): + """Get an item from the queue with Python < 3.13 compatibility.""" + if self._is_shutdown and self.empty(): + raise asyncio_QueueShutDown + ret = await super().get() + if ret is _SHUTDOWN_SENTINEL: + super().put_nowait(_SHUTDOWN_SENTINEL) + super().task_done() + raise asyncio_QueueShutDown + return ret + + def put_nowait(self, item): + """Put an item into the queue without blocking""" + if self._is_shutdown: + raise asyncio_QueueShutDown + if item is _SHUTDOWN_SENTINEL: + raise ValueError(f"{item} is reserved for shutdown purposes for Python < 3.13") + super().put_nowait(item) + + def shutdown(self): + """Shutdown the queue for Python < 3.13. + + Note that the listening side of the queue can continue to get old data + off the queue even after it has already been shutdown. The listener only + shutdowns when the queue is BOTH shutdown AND empty. + """ + if not self._is_shutdown: + super().put_nowait(_SHUTDOWN_SENTINEL) + super().task_done() + self._is_shutdown = True + +else: + asyncio_QueueShutDown = asyncio.QueueShutDown + asyncio_Queue = asyncio.Queue diff --git a/megatron/core/models/backends.py b/megatron/core/models/backends.py index abda7c47787..29169285b3e 100644 --- a/megatron/core/models/backends.py +++ b/megatron/core/models/backends.py @@ -22,6 +22,19 @@ LNImpl = WrappedTorchNorm HAVE_APEX = False +from megatron.core.extensions.transformer_engine import ( + TEActivationOp, + TEColumnParallelLinear, + TEDotProductAttention, + TELinear, + TENorm, +) +from megatron.core.tensor_parallel.inference_layers import ( + InferenceLayerNormColumnParallelLinear, + InferenceRowParallelLinear, +) +from megatron.core.utils import is_te_min_version + class BackendSpecProvider(Protocol): """A protocol for providing the submodules used in Spec building.""" @@ -119,3 +132,51 @@ def grouped_mlp_modules( def activation_func(self) -> type: """Which module to use for activation function""" return None + + +class InferenceSpecProvider(BackendSpecProvider): + """A protocol for providing the submodules used in Spec building.""" + + def linear(self) -> type: + """Which linear module TE backend uses""" + return TELinear + + def column_parallel_linear(self) -> type: + """Which column parallel linear module TE backend uses""" + return TEColumnParallelLinear + + def row_parallel_linear(self) -> type: + """Which row parallel linear module TE backend uses""" + return InferenceRowParallelLinear + + def fuse_layernorm_and_linear(self) -> bool: + """TE backend chooses a single module for layernorm and linear""" + return True + + def column_parallel_layer_norm_linear(self) -> Optional[type]: + """Which module for sequential layernorm and linear""" + return InferenceLayerNormColumnParallelLinear + + def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + """Which module to use for layer norm""" + if for_qk and not is_te_min_version("1.9.0"): + # TENorm significantly harms convergence when used + # for QKLayerNorm if TE Version < 1.9; + # we instead use the Apex implementation. + return FusedLayerNorm + return TENorm + + def core_attention(self) -> type: + """Which module to use for attention""" + return TEDotProductAttention + + def activation_func(self) -> type: + """Which module to use for activation function""" + return TEActivationOp + + def grouped_mlp_modules( + self, moe_use_grouped_gemm: bool, moe_use_legacy_grouped_gemm: bool + ) -> Tuple[type, Optional[MLPSubmodules]]: + raise NotImplementedError( + "MOE is not supported with inference optimized transformer implementation." + ) diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index c5c9caa3d67..7405150c4b3 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -4,7 +4,11 @@ from typing import Optional, Union from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider +from megatron.core.models.backends import ( + BackendSpecProvider, + InferenceSpecProvider, + LocalSpecProvider, +) from megatron.core.models.gpt.linear_attention_module_specs import ( get_linear_attention_module_spec_for_backend, ) @@ -73,6 +77,102 @@ HAVE_APEX = False +def get_gpt_layer_with_inference_spec( + qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, + qk_l2_norm: Optional[bool] = False, +) -> ModuleSpec: + """Use this spec to use inference optimized linear layers. + Args: + qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + multi_latent_attention (bool, optional): To use MLA. Defaults to False. + qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. + """ + assert HAVE_TE, "--transformer-impl inference_optimized requires transformer engine" + backend = InferenceSpecProvider() + + mlp = get_mlp_module_spec_for_backend( + backend=backend, + num_experts=None, + moe_grouped_gemm=False, + moe_use_legacy_grouped_gemm=False, + use_te_op_fuser=False, + use_te_activation_func=False, + ) + + if multi_latent_attention: + assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." + linear_q_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm + else backend.column_parallel_linear() + ) + linear_kv_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm + else backend.column_parallel_linear() + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=backend.layer_norm(), + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=backend.column_parallel_linear(), + linear_q_down_proj=backend.linear(), + linear_q_up_proj=linear_q_up_proj, + linear_kv_down_proj=backend.linear(), + linear_kv_up_proj=linear_kv_up_proj, + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=IdentityOp, + kv_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + else: + qk_norm = backend.layer_norm(for_qk=True) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=backend.column_parallel_layer_norm_linear(), + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + k_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", + "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", + "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", + "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", + "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", + "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", + }, + ), + ) + + def get_gpt_layer_with_transformer_engine_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py index 1de0f14efcd..62ee4537cfc 100755 --- a/megatron/core/models/gpt/moe_module_specs.py +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -2,21 +2,13 @@ from typing import Optional +from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.transformer.mlp import MLPSubmodules from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.spec_utils import ModuleSpec -try: - import transformer_engine as te # pylint: disable=unused-import - - from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider - - HAVE_TE = True -except ImportError: - HAVE_TE = False - def get_moe_module_spec( use_te: Optional[bool] = True, diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py index 8ef4a2ab3e4..bfe38c2bbc8 100755 --- a/megatron/core/models/mamba/mamba_layer_specs.py +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -3,9 +3,11 @@ from megatron.core.extensions.transformer_engine import ( TEDotProductAttention, TELayerNormColumnParallelLinear, + TENorm, TERowParallelLinear, ) from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules @@ -16,6 +18,13 @@ from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +moe = get_moe_module_spec( + use_te=True, + num_experts=8, # Can be any positive integer (must not be None). + moe_grouped_gemm=True, + moe_use_legacy_grouped_gemm=False, +) + mamba_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -64,5 +73,12 @@ mlp_bda=get_bias_dropout_add, ), ), + moe_layer=ModuleSpec( + # TODO (rwaleffe): change this to be an "MoELayer" to work with CudaGraphs? + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add + ), + ), ), ) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index c254b2f6882..061cb25f5b8 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -1,7 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import copy import logging import warnings -from typing import Callable, Dict, List, Optional, Tuple +from dataclasses import astuple +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from torch.optim import SGD as CPUSGD @@ -48,100 +50,114 @@ MegatronOptimizer, param_group_identifier_keys, ) -from .optimizer_config import OptimizerConfig +from .optimizer_config import AdamOptimizerConfig, OptimizerConfig, ParamKey, SGDOptimizerConfig logger = logging.getLogger(__name__) +def _matches(param: torch.nn.Parameter, param_name: str, param_key: ParamKey) -> bool: + """Returns true if passed-in parameter (with name) matches `param_key`. + + Args: + param (torch.nn.Parameter): Handle to parameter object. + param_name (str): Name of parameter in underlying PyTorch module. + param_key (ParamKey): ParamKey object. + + Returns: + bool: True if parameter matches passed-in param_key. + """ + + # Check if name matches. + if isinstance(param_key.name, str): + target_names = [param_key.name] + else: + target_names = list(param_key.name) + for target_name in target_names: + if param_name in target_name: + return True + + # Check if attribute matches. + if isinstance(param_key.attr, str): + target_attrs = [param_key.attr] + else: + target_attrs = list(param_key.attr) + for target_attr in target_attrs: + if getattr(param, target_attr, False): + return True + + return False + + def _get_param_groups( model_chunks: List[MegatronModule], - no_weight_decay_cond: Optional[Callable], - scale_lr_cond: Optional[Callable], - lr_mult: float, - lr: float, - min_lr: float, - decoupled_lr: Optional[float], - decoupled_min_lr: Optional[float], - default_skip_embedding_weight_decay: bool = False, + config: OptimizerConfig, + config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], ) -> List[Dict]: """Create parameter groups for optimizer. - Creates parameter groups based on weight decay condition (regularized vs - non regularized), learning rate scale condition (lr vs lr_mult * lr), - and whether it is expert parameters. scale_lr_cond is used during finetuning - where head of the network requires a scaled version of the base learning rate. + Creates parameter groups from provided optimizer config object. Args: model_chunks (List[MegatronModule]): model chunks to create parameter groups for. - no_weight_decay_cond (func, optional): function to determine whether a - parameter should not perform weight decay. - scale_lr_cond (func, optional): function to determine whether a parameter - should have a scaled learning rate. - lr_mult (float): learning rate multiplier for parameters that - satisfy scale_lr_cond. - lr (float): learning rate. - min_lr (float): minimum learning rate. - decoupled_lr (Optional[float]): optional decoupled learning rate. - decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. - default_skip_embedding_weight_decay (bool): whether to skip weight decay for embedding - parameters by default, if no_weight_decay_cond is not provided. - + config (OptimizerConfig): optimizer configuration object. + config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, + specified on a per-layer basis. Returns: List of parameter groups. """ - use_decoupled_learning_rate = decoupled_lr is not None - - # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params. + # Map (wd_mult, is_expert_parallel, param_group_hyperparameters_config) to params. params_map = {} + configs_map = {} + for model_chunk in model_chunks: for name, param in model_chunk.named_parameters(): if not param.requires_grad: continue - is_expert_parallel = not getattr(param, 'allreduce', True) - - if no_weight_decay_cond is not None: - no_wd: bool = no_weight_decay_cond(name, param) + uses_default_config = False + # Get optimizer config for this parameter. + if config_overrides is None: + config_for_param = config + uses_default_config = True else: - # Do not regularize biases and norm parameters. - # optionally, also skip weight decay for embedding parameters if requested - # (useful if you do not want embeddings to shrink to zero in training - # https://arxiv.org/abs/2312.16903) - no_wd = ( - name.endswith(".bias") - or len(param.shape) == 1 - or (default_skip_embedding_weight_decay and "embedding" in name) - ) + config_for_param = None + for param_key in config_overrides: + if _matches(param, name, param_key): + config_for_param = config_overrides[param_key] + break + # Fall back to default config. + if config_for_param is None: + config_for_param = config + uses_default_config = True - if scale_lr_cond is not None: - scale_lr = scale_lr_cond(name, param) - else: - scale_lr = False - - if not no_wd and not scale_lr: - wd_mult, _lr_mult = 1.0, 1.0 - elif not no_wd and scale_lr: - wd_mult, _lr_mult = 1.0, lr_mult - elif no_wd and not scale_lr: - wd_mult, _lr_mult = 0.0, 1.0 - else: - wd_mult, _lr_mult = 0.0, lr_mult - - is_decoupled_lr = False - # For input/embedding and output layer: embedding.word_embeddings.weight / - # output_layer.weight. - if use_decoupled_learning_rate and getattr( - param, 'is_embedding_or_output_parameter', False - ): - is_decoupled_lr = True + is_expert_parallel = not getattr(param, 'allreduce', True) - key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr) + # TODO: Make sure there is a way to support old no_weight_decay_func functionality + # and default_skip_embedding_weight_decay: + # or (default_skip_embedding_weight_decay and "embedding" in name) + no_wd = name.endswith(".bias") or len(param.shape) == 1 + if not no_wd: + wd_mult = 1.0 + else: + wd_mult = 0.0 + + # Create config_tuple that is hash-able. Remove timers object before + # creating config_tuple. + config_for_param_copy = copy.deepcopy(config_for_param) + config_for_param_copy.timers = None + config_tuple = astuple(config_for_param_copy) + key = (wd_mult, is_expert_parallel, config_tuple) if key not in params_map: params_map[key] = [] params_map[key].append(param) + if key in configs_map: + assert (config_for_param, uses_default_config) == configs_map[key] + else: + configs_map[key] = (config_for_param, uses_default_config) + # Distributed checkpoint requires all ranks to have the same param groups, # so we need to align the param groups across ranks, otherwise we may have # runtime error when loading the checkpoint or numerical error when resuming training. @@ -155,67 +171,33 @@ def _get_param_groups( param_groups = [] for key in params_key: - wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr = key + wd_mult, is_expert_parallel, _ = key params = params_map[key] if key in params_map else [] + config, uses_default_config = None, True + if key not in configs_map: + assert params == [] + else: + config, uses_default_config = configs_map[key] + assert config is not None + + # TODO: Remove "backwards compatible" fields below eventually. param_group = { 'params': params, - 'wd_mult': wd_mult, - 'lr_mult': _lr_mult, + 'wd_mult': wd_mult, # For backwards compatibility. + 'lr_mult': 1.0, # For backwards compatibility. 'is_expert_parallel': is_expert_parallel, - 'is_decoupled_lr': is_decoupled_lr, + 'is_decoupled_lr': False, # For backwards compatibility. + 'default_config': uses_default_config, } - # Ensure param_group has required keys for matching when loading optimizer state - # See MegatronOptimizer._filter_and_reorder_param_groups. - assert set(param_group.keys()) - set(param_group_identifier_keys) == {'params'} - param_groups.append(param_group) - - param_groups = _update_min_and_max_lr_in_param_groups( - param_groups, - lr=lr, - min_lr=min_lr, - decoupled_lr=decoupled_lr, - decoupled_min_lr=decoupled_min_lr, - ) - - return param_groups - -def _update_min_and_max_lr_in_param_groups( - param_groups: List[Dict], - lr: float, - min_lr: float, - decoupled_lr: Optional[float], - decoupled_min_lr: Optional[float], -) -> List[Dict]: - """ - Updates `max_lr` and `min_lr` values in each parameter group, and returns new list. - By default, each group will use `lr` / `min_lr` as `max_lr` / `min_lr`. - If `decoupled_lr` is provided, then `decoupled_lr` / `decoupled_min_lr` will be used - as `max_lr` / `min_lr` for the input and output layer. - - Args: - param_groups (List): parameter groups whose 'max_lr' and `min_lr` fields need to - be adjusted. - lr (float): learning rate. - min_lr (float): minimum learning rate. - decoupled_lr (Optional[float]): optional decoupled learning rate. - decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. - - Returns: - List of adjusted parameter groups. - """ - - if decoupled_min_lr is None: - decoupled_min_lr = min_lr + # Stick relevant fields into param_group from config object. + if config is not None: + param_group['max_lr'] = config.lr + param_group['min_lr'] = config.min_lr + # TODO: Add other relevant arguments (e.g., weight decay, optimizer) + # here as well. + param_groups.append(param_group) - for param_group in param_groups: - if param_group['is_decoupled_lr']: - assert decoupled_lr is not None - param_group['max_lr'] = decoupled_lr - param_group['min_lr'] = decoupled_min_lr - else: - param_group['max_lr'] = lr - param_group['min_lr'] = min_lr return param_groups @@ -223,12 +205,9 @@ def _get_param_groups_and_buffers( model_chunks: List[MegatronModule], model_chunk_offset: int, config: OptimizerConfig, - no_weight_decay_cond: Optional[Callable], - scale_lr_cond: Optional[Callable], - lr_mult: float, + config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], filter_fn: Callable, buffer_name: str, - default_skip_embedding_weight_decay: bool = False, ) -> Tuple[List[Dict], Dict[int, List[_ParamAndGradBuffer]]]: """Returns parameter groups and buffer for optimizer. @@ -237,33 +216,17 @@ def _get_param_groups_and_buffers( groups for. model_chunk_offset (int): offset of model_chunks in global model_chunks list. config (OptimizerConfig): optimizer configuration object. - no_weight_decay_cond (func, optional): function to determine whether a - parameter should not perform weight decay. - scale_lr_cond (func, optional): function to determine whether a parameter - should have a scaled learning rate. - lr_mult (float): learning rate multiplier for parameters that - satisfy scale_lr_cond. + config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, + specified on a per-layer basis. lr (float): learning rate. min_lr (float): minimum learning rate. filter_fn (callable): filtering function for param_groups. buffer_name (str): name of buffer. - default_skip_embedding_weight_decay (bool): whether to skip weight decay for - embedding parameters by default, if no_weight_decay_cond is not provided. Returns: List of parameter groups and dictionary of model chunk IDs to buffers. """ - param_groups = _get_param_groups( - model_chunks, - no_weight_decay_cond, - scale_lr_cond, - lr_mult, - lr=config.lr, - min_lr=config.min_lr, - decoupled_lr=config.decoupled_lr, - decoupled_min_lr=config.decoupled_min_lr, - default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, - ) + param_groups = _get_param_groups(model_chunks, config, config_overrides) param_groups = list(filter(filter_fn, param_groups)) buffers = {} for model_chunk_idx, model_chunk in enumerate(model_chunks): @@ -304,9 +267,12 @@ def _get_megatron_optimizer_based_on_param_groups( Returns: Instance of MegatronOptimizer. """ - # when freezing sub-models we may have no trainable parameters on a rank and + # TODO: Logic needs to be updated to handle different optimizer types (i.e., param_groups + # passed into this function need to correspond to the same optimizer). + + # When freezing sub-models we may have no trainable parameters on a rank and # hence an empty param_groups. However, we still need to create an optimizer - # for the purposes of grad stats reductions + # for the purposes of grad stats reductions. if param_groups: if config.optimizer_cpu_offload: if torch.__version__ < '2.3.0': @@ -476,11 +442,8 @@ def init_state_fn(opt, config=None): def get_megatron_optimizer( config: OptimizerConfig, model_chunks: List[MegatronModule], - no_weight_decay_cond: Optional[Callable] = None, - scale_lr_cond: Optional[Callable] = None, - lr_mult: float = 1.0, + config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None, use_gloo_process_groups: bool = True, - default_skip_embedding_weight_decay: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, dump_param_to_param_group_map: Optional[str] = None, ) -> MegatronOptimizer: @@ -491,18 +454,11 @@ def get_megatron_optimizer( Args: config (OptimizerConfig): optimizer configuration object. model_chunks (List[MegatronModule]): model chunks to get optimizer for. - no_weight_decay_cond (func, optional): function to determine whether a parameter - should not perform weight decay. Defaults to None. - scale_lr_cond (func, optional): function to determine whether a parameter - should have a scaled learning rate. Defaults to None. - lr_mult (float, optional): learning rate multiplier for parameters that - satisfy scale_lr_cond. Defaults to 1.0. + config_overrides (Optional[Dict[ParamKey, OptimizerConfig]]): optional dictionary of + optimizer configuration objects to override default optimizer behavior for different + subsets of parameters (identified by ParamKey). use_gloo_process_groups (bool): if false, disable use of Gloo process groups in underlying Megatron optimizers. - default_skip_embedding_weight_decay (bool): whether to skip weight decay for - embedding parameters by default, if no_weight_decay_cond is not provided. - This is useful if you do not want embeddings to shrink to zero in training - as recommended in https://arxiv.org/abs/2312.16903 pg_collection: Optional unified process group for distributed training. dump_param_to_param_group_map (Optional[str]): path to dump parameter to param group map. @@ -512,6 +468,20 @@ def get_megatron_optimizer( log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') + # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and + # Adam for other layers). This would need some more refactoring to work though (param_groups + # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups). + fields_to_check_for_consistency = [ + 'overlap_param_gather_with_optimizer_step', + 'optimizer', + 'optimizer_cpu_offload', + ] + for field_name in fields_to_check_for_consistency: + field = getattr(config, field_name, None) + if config_overrides is not None: + all_configs = list(config_overrides.values()) + assert all([getattr(x, field_name, None) == field for x in all_configs]) + # Separate out first model chunk if overlapping param AG with optimizer step. if config.overlap_param_gather_with_optimizer_step: all_dense_model_chunks = [[model_chunks[0]], model_chunks[1:]] @@ -553,17 +523,14 @@ def get_megatron_optimizer( model_chunk, model_chunk_offset=model_chunk_offset, config=config, - no_weight_decay_cond=no_weight_decay_cond, - scale_lr_cond=scale_lr_cond, - lr_mult=lr_mult, + config_overrides=config_overrides, filter_fn=lambda g: True, buffer_name='buffers', - default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config, + config=config, model_chunks=model_chunk, param_groups=param_groups, per_model_buffers=buffers, @@ -592,12 +559,9 @@ def get_megatron_optimizer( dense_model_chunks, model_chunk_offset=model_chunk_offset, config=config, - no_weight_decay_cond=no_weight_decay_cond, - scale_lr_cond=scale_lr_cond, - lr_mult=lr_mult, + config_overrides=config_overrides, filter_fn=lambda g: not g['is_expert_parallel'], buffer_name='buffers', - default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) for model_chunk in dense_model_chunks: model_chunk.overlap_param_gather_with_optimizer_step = ( @@ -613,7 +577,7 @@ def get_megatron_optimizer( # Pass Gloo process groups into optimizer only if needed. optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config, + config=config, model_chunks=dense_model_chunks, param_groups=param_groups, per_model_buffers=buffers, @@ -631,12 +595,9 @@ def get_megatron_optimizer( model_chunks, model_chunk_offset=0, config=config, - no_weight_decay_cond=no_weight_decay_cond, - scale_lr_cond=scale_lr_cond, - lr_mult=lr_mult, + config_overrides=config_overrides, filter_fn=lambda g: g['is_expert_parallel'], buffer_name='expert_parallel_buffers', - default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) if dump_param_to_param_group_map is not None: for param_group in moe_param_groups: @@ -653,7 +614,7 @@ def get_megatron_optimizer( expt_data_parallel_group_gloo = None optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config, + config=config, model_chunks=model_chunks, param_groups=moe_param_groups, per_model_buffers=moe_buffers, diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py index ddf20b0abb8..2b1f0502e46 100644 --- a/megatron/core/optimizer/muon.py +++ b/megatron/core/optimizer/muon.py @@ -3,7 +3,7 @@ """Megatron muon optimizer wrapper to handle tensor-parallel.""" import logging -from typing import Any, Callable, List, Literal, Optional +from typing import Any, Callable, Dict, List, Literal, Optional import torch from torch.optim.optimizer import ParamsT @@ -21,7 +21,7 @@ FP32Optimizer, MegatronOptimizer, ) -from .optimizer_config import OptimizerConfig +from .optimizer_config import OptimizerConfig, ParamKey try: from emerging_optimizers.orthogonalized_optimizers import ( @@ -166,9 +166,7 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> t def get_megatron_muon_optimizer( config: OptimizerConfig, model_chunks: List[MegatronModule], - no_weight_decay_cond: Optional[Callable] = None, - scale_lr_cond: Optional[Callable] = None, - lr_mult: float = 1.0, + config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None, use_gloo_process_groups: bool = True, layer_wise_distributed_optimizer: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, @@ -179,17 +177,15 @@ def get_megatron_muon_optimizer( Args: config (OptimizerConfig): optimizer configuration object. model_chunks (List[MegatronModule]): model chunks to get optimizer for. - no_weight_decay_cond (func, optional): function to determine whether a parameter - should not perform weight decay. Defaults to None. - scale_lr_cond (func, optional): function to determine whether a parameter - should have a scaled learning rate. Defaults to None. - lr_mult (float, optional): learning rate multiplier for parameters that - satisfy scale_lr_cond. Defaults to 1.0. use_gloo_process_groups (bool): if false, disable use of Gloo process groups in underlying Megatron optimizers. layer_wise_distributed_optimizer (bool): if true, use layer-wise distributed optimizer. Defaults to False. """ + # Muon currently use adam config. setting str here to call regular get for adam creation + # side effect is muon optimizer will have wrong name, i.e. config.optimizer == 'adam' + config.optimizer = 'adam' + assert HAVE_EMERGING_OPTIMIZERS, "Emerging Optimizers is not installed." # dist-optim is not supported due to strong coupling with how DDP init grad buffer @@ -246,16 +242,7 @@ def get_megatron_muon_optimizer( for param in nonlinear_params: param.requires_grad = False - linear_param_groups = _get_param_groups( - model_chunks, - no_weight_decay_cond, - scale_lr_cond, - lr_mult, - lr=config.lr, - min_lr=config.min_lr, - decoupled_lr=config.decoupled_lr, - decoupled_min_lr=config.decoupled_min_lr, - ) + linear_param_groups = _get_param_groups(model_chunks, config, config_overrides) optimizer = TensorParallelMuon( linear_param_groups, @@ -274,13 +261,6 @@ def get_megatron_muon_optimizer( mode=config.muon_tp_mode, ) - # set config here to: - # 1. get adam for rest of layer - # 2. avoid ChainedOptimizer check fail that assert all optimizers are same kind - # side effect is muon optimizer will have wrong name str, i.e. config.optimizer == 'adam' - # TODO(deyuf): allow user to select optimizer mix and relax ChainedOptimizer design - config.optimizer = 'adam' - # Needed for torch_dist ckpt_format, unlike torch ckpt_format # For other emerging optimizers, need to implement init_state_fn as well # TODO(boxiangw): Improve usability after optimizer refactor @@ -331,7 +311,10 @@ def adam_init_state_fn(opt, config=None): # call original get. linear params will be skipped since they're freezed chained_adam = get_megatron_optimizer( - config, model_chunks, no_weight_decay_cond, scale_lr_cond, lr_mult, use_gloo_process_groups + config, + model_chunks, + config_overrides=config_overrides, + use_gloo_process_groups=use_gloo_process_groups, ) # unfreeze everything diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 1829cb424f1..54e7f67c629 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -3,6 +3,7 @@ """Megatron optimizer.""" import copy +import logging import math import warnings from abc import ABC, abstractmethod diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 8692d1e9b52..6a4199a1f7a 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -1,23 +1,34 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from dataclasses import dataclass -from typing import Callable, Optional +from dataclasses import dataclass, field +from typing import Callable, Optional, Tuple, Union import torch from ..utils import is_te_min_version +@dataclass(frozen=True, slots=True) +class ParamKey: + """Key to group parameters by. All such grouped parameters can share an + optimizer config specification.""" + + # TODO: Can add layer_id here later. + + name: Union[str, Tuple[str]] = field(default_factory=tuple) + """Parameter name(s).""" + + attr: Union[str, Tuple[str]] = field(default_factory=tuple) + """Parameter attribute(s).""" + + @dataclass class OptimizerConfig: - """Configuration for optimizer.""" + """Base optimizer configuration object.""" ############## # General ############## - optimizer: str = 'adam' - """Optimizer to use (one of Adam, SGD, or Muon).""" - lr: Optional[float] = None """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each iteration would be different. @@ -26,14 +37,6 @@ class OptimizerConfig: min_lr: Optional[float] = None """Minumum value for learning rate. The scheduler clip values below this threshold.""" - decoupled_lr: Optional[float] = None - """Separate learning rate for the input and output layer.""" - - decoupled_min_lr: Optional[float] = None - """Minimum value for learning rate for the input and output layer. The scheduler clip values - below this threshold. - """ - weight_decay: float = 0.01 """Weight decay coefficient for L2 regularization.""" @@ -78,6 +81,9 @@ class OptimizerConfig: exp_avg_sq_dtype: torch.dtype = torch.float32 """dtype of exp_avg_sq when enabling precision-aware-optimizer""" + optimizer: str = 'adam' + """Optimizer name. NOTE: Deprecated, use individual optimizer classes instead.""" + ############### # Loss scaling ############### @@ -98,10 +104,10 @@ class OptimizerConfig: hysteresis: int = 2 """Hysteresis for dynamic loss scaling.""" - ############## - # Optimizer - ############## - # Adam + ################################################################################### + # Optimizer (NOTE: Deprecated, use individual optimizer classes instead.). + ################################################################################### + # Adam. adam_beta1: float = 0.9 """First coefficient for computing running averages of gradient and its square in Adam optimizer. @@ -259,6 +265,7 @@ def __post_init__(self): try: import inspect + # TODO: Move this below? from transformer_engine.pytorch.optimizers import FusedAdam as Adam adam_args = inspect.signature(Adam).parameters @@ -291,3 +298,35 @@ def __post_init__(self): assert ( self.exp_avg_sq_dtype == torch.float32 ), "exp_avg_sq_dtype can only be fp32 when not using precision-aware optimizer" + + +@dataclass +class AdamOptimizerConfig(OptimizerConfig): + """Adam optimizer configuration object.""" + + optimizer: str = 'adam' + """Optimizer name.""" + + adam_beta1: float = 0.9 + """First coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_beta2: float = 0.999 + """Second coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_eps: float = 1e-08 + """Term added to the denominator to improve numerical stability in Adam optimizer.""" + + +@dataclass +class SGDOptimizerConfig(OptimizerConfig): + """SGD optimizer configuration object.""" + + optimizer: str = 'sgd' + """Optimizer name.""" + + sgd_momentum: float = 0.9 + """Momentum factor for SGD optimizer.""" diff --git a/megatron/core/optimizer_param_scheduler.py b/megatron/core/optimizer_param_scheduler.py index da7e0787676..9f771c612e8 100644 --- a/megatron/core/optimizer_param_scheduler.py +++ b/megatron/core/optimizer_param_scheduler.py @@ -95,19 +95,30 @@ def __init__( self.step(0) log_single_rank(logger, logging.INFO, f"> learning rate decay style: {self.lr_decay_style}") - def get_wd(self) -> float: - """Weight decay incr functions""" + def get_wd(self, param_group: Optional[dict] = None) -> float: + """Weight decay incr functions + + Args: + param_group (dict): parameter group from the optimizer.""" + + if param_group is not None: + start_wd = param_group.get('start_wd', self.start_wd) + end_wd = param_group.get('end_wd', self.end_wd) + else: + start_wd = self.start_wd + end_wd = self.end_wd + if self.num_steps > self.wd_incr_steps: - return self.end_wd + return end_wd if self.wd_incr_style == 'constant': - assert self.start_wd == self.end_wd - return self.end_wd + assert start_wd == end_wd + return end_wd incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) assert incr_ratio >= 0.0 assert incr_ratio <= 1.0 - delta_wd = self.end_wd - self.start_wd + delta_wd = end_wd - start_wd if self.wd_incr_style == 'linear': coeff = incr_ratio @@ -116,7 +127,7 @@ def get_wd(self) -> float: else: raise Exception(f'{self.wd_incr_style} weight decay increment style is not supported.') - return self.start_wd + coeff * delta_wd + return start_wd + coeff * delta_wd def get_lr(self, param_group: dict) -> float: """Learning rate decay functions from: @@ -191,11 +202,9 @@ def step(self, increment: int) -> None: increment (int): number of steps to increment """ self.num_steps += increment - new_wd = self.get_wd() for param_group in self.optimizer.param_groups: - new_lr = self.get_lr(param_group) - param_group['lr'] = new_lr * param_group.get('lr_mult', 1.0) - param_group['weight_decay'] = new_wd * param_group.get('wd_mult', 1.0) + param_group['lr'] = self.get_lr(param_group) + param_group['weight_decay'] = self.get_wd(param_group) * param_group.get('wd_mult', 1.0) def state_dict(self) -> dict: """Return the state dict.""" diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index 1e41bf9d8c2..1916bfff079 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -1122,6 +1122,7 @@ def initialize_model_parallel( for ranks in expert_decoder_rank_generator.get_ranks('ep'): group = create_group( ranks, + timeout=timeout, pg_options=get_nccl_options("ep", nccl_comm_cfgs), group_desc="EXPERT_MODEL_PARALLEL_GROUP", ) diff --git a/megatron/core/process_groups_config.py b/megatron/core/process_groups_config.py index 07c922ea685..ef8f31ea150 100644 --- a/megatron/core/process_groups_config.py +++ b/megatron/core/process_groups_config.py @@ -140,6 +140,23 @@ def __init__(self, **kwargs): else: raise ValueError(f"Unknown attribute: {key}") + def __repr__(self): + """Return a concise representation showing which process groups exist and their sizes.""" + active_pgs = [] + for field_info in fields(self): + if hasattr(self, field_info.name): + pg = getattr(self, field_info.name) + if pg is not None: + active_pgs.append(f"{field_info.name}({pg.size()})") + else: + # Field exists but is None + active_pgs.append(f"{field_info.name}(None)") + return ( + f"ProcessGroupCollection({', '.join(active_pgs)})" + if active_pgs + else "ProcessGroupCollection(empty)" + ) + @classmethod def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None): """ diff --git a/megatron/core/safe_globals.py b/megatron/core/safe_globals.py index d2baed2a4a0..cc5eb8809e8 100755 --- a/megatron/core/safe_globals.py +++ b/megatron/core/safe_globals.py @@ -11,6 +11,7 @@ from numpy.dtypes import UInt32DType from megatron.core.enums import ModelType +from megatron.core.optimizer import OptimizerConfig from megatron.core.rerun_state_machine import RerunDiagnostic, RerunMode, RerunState from megatron.core.transformer.enums import AttnBackend @@ -24,6 +25,7 @@ Namespace, AttnBackend, ModelType, + OptimizerConfig, RerunDiagnostic, RerunMode, RerunState, diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index 1bcadd0af10..de27bb89d2e 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -5,10 +5,8 @@ # This source code is licensed under the Apache license found in the # LICENSE file in the root directory of this source tree. -import math from contextlib import nullcontext from dataclasses import dataclass -from functools import partial from typing import Optional, Tuple, Union import torch @@ -23,7 +21,6 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers -from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule @@ -33,50 +30,6 @@ from megatron.core.utils import WrappedTensor, deprecate_inference_params, make_viewless_tensor -# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 -def _init_weights( - module, - n_layer, - initializer_range=0.02, # Now only used for embedding layer. - rescale_prenorm_residual=True, - n_residuals_per_layer=1, # Change to 2 if we have MLP -): - with get_cuda_rng_tracker().fork(): - if isinstance(module, nn.Linear): - if not getattr(module.weight, "_no_reinit", False): - nn.init.normal_(module.weight, std=initializer_range) - if module.bias is not None: - if not getattr(module.bias, "_no_reinit", False): - nn.init.zeros_(module.bias) - elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=initializer_range) - - for name, p in module.named_parameters(): - if name in ["conv1d.weight", "out_proj.weight"]: - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - if name in ["in_proj.weight"]: - nn.init.normal_(p, mean=0.0, std=initializer_range) - - if rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the - # > residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of - # > 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): - # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name in ["out_proj.weight", "fc2.weight"]: - # Special Scaled Initialization - nn.init.normal_( - p, - mean=0.0, - std=initializer_range / math.sqrt(n_residuals_per_layer * n_layer), - ) - - @dataclass class MambaStackSubmodules: """ @@ -86,6 +39,7 @@ class MambaStackSubmodules: mamba_layer: Union[ModuleSpec, type] = IdentityOp attention_layer: Union[ModuleSpec, type] = IdentityOp mlp_layer: Union[ModuleSpec, type] = IdentityOp + moe_layer: Union[ModuleSpec, type] = IdentityOp class MambaStack(MegatronModule): @@ -171,6 +125,7 @@ def __init__( config=self.config, residual_in_fp32=residual_in_fp32, layer_number=i + 1 + pp_layer_offset, + pp_layer_offset=pp_layer_offset, pg_collection=pg_collection, ) elif layer_type == LayerSymbols.ATTENTION: @@ -189,6 +144,11 @@ def __init__( layer_number=i + 1, pg_collection=pg_collection, ) + elif layer_type == LayerSymbols.MOE: + # Transformer layers apply their own pp_layer_offset + layer = build_module( + submodules.moe_layer, config=self.config, layer_number=i + 1 + ) else: assert False, "unexpected layer_type" self.layers.append(layer) @@ -204,15 +164,6 @@ def __init__( eps=self.config.layernorm_epsilon, ) - if self.config.perform_initialization: - self.apply( - partial( - _init_weights, - n_layer=self.config.num_layers, - initializer_range=self.config.init_method_std, - ) - ) - def _select_layers_for_pipeline_parallel(self, layer_type_list): num_layers_per_pipeline_rank = self.config.num_layers // self.pp_group.size() diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py index 7407bfe899f..fe997e2249a 100644 --- a/megatron/core/ssm/mamba_hybrid_layer_allocation.py +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -28,7 +28,8 @@ class Symbols: MAMBA = "M" ATTENTION = "*" MLP = "-" - VALID = {MAMBA, ATTENTION, MLP} + MOE = 'E' + VALID = {MAMBA, ATTENTION, MLP, MOE} def _allocate_auto( @@ -172,9 +173,9 @@ def get_layer_maps_from_layer_type_list( ) -> Tuple[Dict[int, int], Dict[int, int], Dict[int, int]]: """ Returns maps from global layer index to the corresponding layer index - for each layer type in [Attention, Mamba, MLP] given a layer type list. + for each layer type in [Attention, Mamba, MLP, MoE] given a layer type list. """ - layer_types = [Symbols.ATTENTION, Symbols.MAMBA, Symbols.MLP] + layer_types = [Symbols.ATTENTION, Symbols.MAMBA, Symbols.MLP, Symbols.MOE] layer_maps = {layer_type: {} for layer_type in layer_types} for global_layer_idx, layer_type in enumerate(layer_type_list): layer_map = layer_maps[layer_type] diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index 69d5ef21c81..6514050ac63 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -61,6 +61,7 @@ def __init__( layer_number: int = 1, residual_in_fp32=False, pg_collection: ProcessGroupCollection = None, + pp_layer_offset: int = 0, ): """Initialize Mamba Layer.""" super().__init__(config) @@ -77,6 +78,7 @@ def __init__( d_model=self.config.hidden_size, layer_number=layer_number, pg_collection=pg_collection, + pp_layer_offset=pp_layer_offset, ) self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) self.mamba_bda = build_module(submodules.mamba_bda) diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py index b792f8a2f1f..91dc266e590 100644 --- a/megatron/core/ssm/mamba_mixer.py +++ b/megatron/core/ssm/mamba_mixer.py @@ -162,6 +162,7 @@ def __init__( headdim=None, ngroups=None, pg_collection: ProcessGroupCollection = None, + pp_layer_offset: int = 0, ): if not HAVE_MAMBA_SSM: raise ImportError( @@ -183,6 +184,7 @@ def __init__( self.norm_before_gate = norm_before_gate self.chunk_size = chunk_size self.layer_number = layer_number + self.pp_layer_offset = pp_layer_offset self.cached_batch_size = None assert pg_collection is not None, "pg_collection must be provided for MambaMixer" self.pg_collection = pg_collection @@ -297,9 +299,12 @@ def __init__( setattr(self.conv1d.weight, "tensor_model_parallel", True) setattr(self.conv1d.bias, "tensor_model_parallel", True) - if self.config.perform_initialization and self.conv_init is not None: + if self.config.perform_initialization: with get_cuda_rng_tracker().fork(): - nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) + if self.conv_init is not None: + nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) + else: + nn.init.kaiming_uniform_(self.conv1d.weight, a=math.sqrt(5)) self.activation = "silu" self.act = nn.SiLU() @@ -324,13 +329,6 @@ def __init__( ) self.dt_bias = nn.Parameter(inv_dt) - # Our initialization would set all Linear.bias to zero, - # need to mark this one as _no_reinit - self.dt_bias._no_reinit = True - # Just to be explicit. Without this we already don't - # put wd on dt_bias because of the check - # name.endswith("bias") in param_grouping.py - self.dt_bias._no_weight_decay = True setattr(self.dt_bias, "tensor_model_parallel", True) # A parameter @@ -342,7 +340,6 @@ def __init__( A = A.uniform_(*A_init_range) A_log = torch.log(A) # Keep A_log in fp32 self.A_log = nn.Parameter(A_log) - self.A_log._no_weight_decay = True setattr(self.A_log, "tensor_model_parallel", True) # D "skip" parameter @@ -352,7 +349,6 @@ def __init__( device=torch.cuda.current_device(), ) ) # Keep in fp32 - self.D._no_weight_decay = True setattr(self.D, "tensor_model_parallel", True) if self.rmsnorm: @@ -365,6 +361,7 @@ def __init__( device=torch.cuda.current_device(), dtype=config.params_dtype, ) + setattr(self.norm.weight, "tensor_model_parallel", True) # Assume sequence parallelism: input is partitioned along d_inner and # output is partitioned along the sequence dimension @@ -458,7 +455,7 @@ def dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferen ) assert sequence_packing_available, reason_for_no_sequence_packing - conv_state, ssm_state = context.mamba_states_cache(self.layer_number) + conv_state, ssm_state = context.mamba_states_cache(self.layer_number - self.pp_layer_offset) # Fast path: decode-only if context.is_decode_only(): @@ -504,7 +501,10 @@ def dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferen zxBCdt_chunked_prefill = zxBCdt[ active_token_count - chunked_prefill_request_token_count : active_token_count ] - batch_index_chunked_prefill = batch_indices[context.chunked_prefill_request_id] + + batch_index_chunked_prefill = batch_indices[ + context.get_index_of_chunked_prefill_request() + ] y_prefill_chunked = self.ssm_prefill( zxBCdt_chunked_prefill, @@ -941,6 +941,12 @@ def ssm_decode( x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim) if not self.rmsnorm: z = rearrange(z, "b (h p) -> b h p", p=self.headdim) + + # Upcast the batch_indices to prevent integer overflow errors in the case of + # large max request counts. + if batch_indices is not None: + batch_indices = batch_indices.to(torch.int64) + y = selective_state_update( ssm_state, x_reshaped, diff --git a/megatron/core/tensor_parallel/inference_layers.py b/megatron/core/tensor_parallel/inference_layers.py new file mode 100644 index 00000000000..05f7b88d095 --- /dev/null +++ b/megatron/core/tensor_parallel/inference_layers.py @@ -0,0 +1,151 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + +from typing import Callable, Optional + +import torch +import torch.distributed as dist + +from megatron.core.extensions.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import get_tensor_model_parallel_group_if_none + +try: + import transformer_engine.pytorch.cpp_extensions as tex + from transformer_engine.pytorch.constants import TE_DType + from transformer_engine.pytorch.distributed import ( + gather_along_first_dim, + reduce_scatter_along_first_dim, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def _te_rms_norm_kernel(x: torch.Tensor, weight: torch.Tensor, eps: float): + x_shape = x.shape + x = x.view(-1, x.size(-1)) + out, _, _ = tex.rmsnorm_fwd( + x, weight, eps, None, None, TE_DType[x.dtype], 16, False # sm-margin # zero centered gamma + ) + out = out.view(*x_shape[:-1], -1) + return out.to(x.dtype) + + +class InferenceLayerNormColumnParallelLinear(TELayerNormColumnParallelLinear): + """ + Inference optimized version of TELayerNormColumnParallelLinear. + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + init_method: Callable, + gather_output: bool, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + skip_weight_param_allocation: bool = False, + tp_comm_buffer_name: Optional[str] = None, + tp_group: Optional[torch.distributed.ProcessGroup] = None, + ): + assert HAVE_TE, "--transformer-impl=inference_optimized requires transformer engine" + super().__init__( + input_size, + output_size, + config=config, + init_method=init_method, + gather_output=gather_output, + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + skip_weight_param_allocation=skip_weight_param_allocation, + tp_comm_buffer_name=tp_comm_buffer_name, + tp_group=tp_group, + ) + self.tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) + self.tp_size = dist.get_world_size(self.tp_group) + + assert ( + output_size % self.tp_size == 0 + ), f"output_size ({output_size}) must be divisible by tp_size ({self.tp_size})" + + self.eps = config.layernorm_epsilon + + if self.tp_size > 1: + assert ( + config.sequence_parallel + ), "--transformer-impl=inference_optimized requires --sequence-parallel" + + @torch.no_grad() + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward pass. + """ + x = _te_rms_norm_kernel(x=x, weight=self.layer_norm_weight, eps=self.eps) + if self.tp_size > 1: + x, _ = gather_along_first_dim(x, process_group=self.tp_group) + x = torch.matmul(x, self.weight.t()) + return x, None + + +class InferenceRowParallelLinear(TERowParallelLinear): + """ + Inference optimized version of TERowParallelLinear. + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: Optional[str] = None, + tp_group: Optional[torch.distributed.ProcessGroup] = None, + ): + assert HAVE_TE, "--transformer-impl=inference_optimized requires transformer engine" + super().__init__( + input_size, + output_size, + config=config, + init_method=init_method, + bias=bias, + input_is_parallel=input_is_parallel, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + tp_group=tp_group, + ) + self.tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) + self.tp_size = dist.get_world_size(self.tp_group) + assert ( + input_size % self.tp_size == 0 + ), f"input_size ({input_size}) must be divisible by tp_size ({self.tp_size})" + + if self.tp_size > 1: + assert ( + config.sequence_parallel + ), "--transformer-impl=inference_optimized requires --sequence-parallel" + + @torch.no_grad() + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward pass. + """ + x = torch.matmul(x, self.weight.t()) + if self.tp_size > 1: + x, _ = reduce_scatter_along_first_dim(x, tp_group=self.tp_group) + return x, None diff --git a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py index c68b0ef89b1..458689fa1f4 100644 --- a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +++ b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py @@ -69,7 +69,6 @@ def __init__( pretrained_model_name_or_path=tokenizer_path, use_fast=use_fast, trust_remote_code=trust_remote_code, - chat_template=chat_template, ) elif merges_file is None: self.tokenizer = AutoTokenizer.from_pretrained( @@ -77,7 +76,6 @@ def __init__( vocab_file=vocab_file, use_fast=use_fast, trust_remote_code=trust_remote_code, - chat_template=chat_template, ) else: self.tokenizer = AutoTokenizer.from_pretrained( @@ -86,7 +84,6 @@ def __init__( merge_files=merges_file, use_fast=use_fast, trust_remote_code=trust_remote_code, - chat_template=chat_template, ) except Exception as e: raise ValueError( @@ -94,6 +91,14 @@ def __init__( f'for {tokenizer_path}. Exception: {e}' ) + # Store the tokenizer's existing chat template if the user does not provide + # a custom chat template. Otherwise, override the default chat template with + # the user-provided template. + if chat_template is None: + chat_template = self.tokenizer.chat_template + else: + self.tokenizer.chat_template = chat_template + self.include_special_tokens = include_special_tokens self.original_vocab_size = len(self.tokenizer) self.chat_template = chat_template diff --git a/megatron/core/tokenizers/text/libraries/null_tokenizer.py b/megatron/core/tokenizers/text/libraries/null_tokenizer.py index 13d56436192..4ddf77fc774 100644 --- a/megatron/core/tokenizers/text/libraries/null_tokenizer.py +++ b/megatron/core/tokenizers/text/libraries/null_tokenizer.py @@ -25,6 +25,14 @@ def ids_to_text(self, ids): text = [str(x) for x in ids] return ' '.join(text) + def tokens_to_ids(self, tokens): + """Converts tokens to ids.""" + return [int(x) for x in tokens] + + def ids_to_tokens(self, ids): + """Converts ids to tokens.""" + return [str(x) for x in ids] + def offsets(self, ids: list[int], text: str) -> list[int]: """Returns offsets.""" offsets, start_idx = [], 0 diff --git a/megatron/core/tokenizers/text/text_tokenizer.py b/megatron/core/tokenizers/text/text_tokenizer.py index 2107cf9dce4..4e0c624e006 100644 --- a/megatron/core/tokenizers/text/text_tokenizer.py +++ b/megatron/core/tokenizers/text/text_tokenizer.py @@ -37,13 +37,17 @@ def __init__(self, path: str, config: dict, **kwargs) -> None: self._tokenizer = self._restore_model(**kwargs) self.additional_args = kwargs self.path = path - if ( - config.get("chat_template", None) is None - and kwargs.get("chat_template", None) is not None - ): - self.chat_template = kwargs.get("chat_template", None) + + config_template = config.get("chat_template", None) + tokenizer_template = getattr(self._tokenizer, "chat_template", None) + kwargs_template = kwargs.get("chat_template", None) + + if config_template is not None: + self.chat_template = config_template + elif tokenizer_template is not None: + self.chat_template = tokenizer_template else: - self.chat_template = config.get("chat_template", None) + self.chat_template = kwargs_template def _restore_model(self, **kwargs) -> MegatronTokenizerTextAbstract: """Returns tokenizer library object.""" diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 74031f38219..7bb9a12c697 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -35,6 +35,7 @@ from megatron.core.utils import ( deprecate_inference_params, divide, + get_pg_rank, get_pg_size, is_fa_min_version, is_te_min_version, @@ -158,6 +159,7 @@ def __init__( self.config = config self.layer_number = layer_number + self.attn_mask_type = attn_mask_type self.attention_type = attention_type @@ -306,6 +308,19 @@ def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype device=torch.cuda.current_device(), ) + def _get_pp_layer_offset_for_inference(self): + """Return the pipeline parallel layer offset for inference.""" + assert ( + self.config.virtual_pipeline_model_parallel_size is None + ), "Virtual pipeline parallelism is not supported for inference" + + # Import here to avoid circular imports + from megatron.core.transformer.transformer_layer import get_transformer_layer_offset + + return get_transformer_layer_offset( + self.config, vp_stage=None, pp_rank=get_pg_rank(self.pg_collection.pp) + ) + def _adjust_key_value_for_inference( self, inference_context: BaseInferenceContext, @@ -371,9 +386,15 @@ def _adjust_key_value_for_inference( inference_context.key_value_memory_dict[self.layer_number] ) - if not inference_context.is_static_batching() or inference_context.sequence_len_offset > 0: + if ( + not inference_context.is_static_batching() or inference_context.sequence_len_offset > 0 + ) and (not self.training or not is_te_min_version("2.2.0")): # This should mean that we are past the prompt forward_step # and so we need to turn off masking + # Note: in ModelOpt, we may use inference_context for speculative decoding + # in training. In that case, we do not want to turn off masking as we need + # customized attention mask for speculative decoding. + attn_mask_type = AttnMaskType.no_mask if inference_context.is_static_batching(): @@ -444,6 +465,8 @@ def _adjust_key_value_for_inference( key = inference_key_memory[:sequence_end, batch_start:batch_end, ...] value = inference_value_memory[:sequence_end, batch_start:batch_end, ...] else: + pp_layer_offset = self._get_pp_layer_offset_for_inference() + # Apply rotary embeddings before appending KV cache. if inference_context.use_flashinfer_fused_rope and (rotary_pos_cos_sin is not None): query, key = inference_context.apply_fused_qk_rotary_emb( @@ -458,17 +481,23 @@ def _adjust_key_value_for_inference( rotary_pos_emb = (q_pos_emb, None) # key rotary emb has been applied # Append key/value data tensors to cache. - inference_context.append_key_value_cache(self.layer_number, key, value) + inference_context.append_key_value_cache( + self.layer_number - pp_layer_offset, key, value + ) _, max_seqlen_q = inference_context.cu_query_lengths() if getattr(self.config, "cache_mla_latents", None) and max_seqlen_q > 1: # Doing unabsorbed MLA Attention with cached mla latents (prefill/mixed mode) - kv_cache, _, block_table = inference_context.key_value_cache(self.layer_number) + kv_cache, _, block_table = inference_context.key_value_cache( + self.layer_number - pp_layer_offset + ) # Uncompress the KV cache for prefill/mixed mode key, value = self.uncompress_kv_from_cache(kv_cache) else: # Read key/value *pointer* tensors from cache. - key, value, block_table = inference_context.key_value_cache(self.layer_number) + key, value, block_table = inference_context.key_value_cache( + self.layer_number - pp_layer_offset + ) return query, key, value, rotary_pos_emb, attn_mask_type, block_table @abstractmethod diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 12f15ee980a..10a739e11c0 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -368,9 +368,26 @@ def create_cudagraphs(): def delete_cuda_graphs(): """Delete all CUDA graphs.""" + # Reset runners. + for record in [ + *_CudagraphGlobalRecord.cudagraph_record, + *_CudagraphGlobalRecord.cudagraph_inference_record, + ]: + runner = record[0] + assert isinstance(runner, _CudaGraphRunner) + + runner.cudagraph_created = False + runner.fwd_graph_recorded = False + runner.bwd_graph_recorded = False + runner.fwd_graph = None + runner.bwd_graph = None + runner.fwd_mempool = None + runner.bwd_mempool = None + # Reset global tracking state _CudagraphGlobalRecord.cudagraph_created = False _CudagraphGlobalRecord.cudagraph_record = [] + _CudagraphGlobalRecord.cudagraph_inference_record = [] # TODO: Optional?: Force garbage collection to clean up memory gc.collect() diff --git a/megatron/core/transformer/fsdp_dtensor_checkpoint.py b/megatron/core/transformer/fsdp_dtensor_checkpoint.py index 65e2f5f9dff..04ec982e6ff 100644 --- a/megatron/core/transformer/fsdp_dtensor_checkpoint.py +++ b/megatron/core/transformer/fsdp_dtensor_checkpoint.py @@ -484,6 +484,6 @@ def get_global_unique_param_name(model_chunks, param): # Get EP unique parameter name num_experts = model_chunks[0].config.num_moe_experts if model_chunks else None - param_name = list(handle_experts_in_state_dict({param_name: None}, num_experts).keys())[0] + param_name = next(iter(handle_experts_in_state_dict({param_name: None}, num_experts).keys())) return param_name diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index b2135fdb00d..8754e938348 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -48,6 +48,8 @@ num_global_tokens: num_local_tokens*TP*EP """ +logger = logging.getLogger(__name__) + class MoETokenDispatcher: """ @@ -1270,7 +1272,6 @@ def _pad_routing_map( # Check if there are enough tokens to pad enough_tokens_to_pad = torch.all(target_tokens_per_expert <= num_input_tokens) if not enough_tokens_to_pad: - logger = logging.getLogger(__name__) logger.warning( "Not enough tokens to pad. The total number of tokens received in this rank " "is smaller than the target number of tokens for each expert. " diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index fae2e2f5d4d..3f8c97099da 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -749,6 +749,9 @@ class TransformerConfig(ModelParallelConfig): symmetric_ar_type: Optional[str] = None """Type of symmetric all reduce to use""" + use_inference_optimized_layers: bool = False + """If True, use inference optimized transformer layers during inference.""" + mrope_section: Optional[List[int]] = None """ Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. """ @@ -1874,6 +1877,13 @@ def __post_init__(self): f"for context parallelism, but got {self.cp_comm_type=} instead." ) + if self.transformer_impl == "inference_optimized": + assert self.normalization == "RMSNorm" + assert not self.layernorm_zero_centered_gamma + assert not self.add_bias_linear + assert not self.add_qkv_bias + assert not self.use_kitchen + @dataclass class MLATransformerConfig(TransformerConfig): diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 9b62b18d400..77a004a6845 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -24,7 +24,7 @@ from functools import lru_cache, reduce, wraps from importlib.metadata import version from types import TracebackType -from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy import torch @@ -2140,23 +2140,28 @@ def maybe_cat(a, b, dim=0, *, required=False): return xs[0] if len(xs) == 1 else torch.cat(xs, dim=dim) +_ASYNC_IO_LOOP: asyncio.AbstractEventLoop | None = None + + def get_asyncio_loop(loop: asyncio.AbstractEventLoop | None = None) -> asyncio.AbstractEventLoop: """Creates an asyncio loop if necessary and then returns the current asyncio loop.""" + global _ASYNC_IO_LOOP if loop is None: try: loop = asyncio.get_running_loop() except RuntimeError as e: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + if _ASYNC_IO_LOOP is not None: + return _ASYNC_IO_LOOP + else: + _ASYNC_IO_LOOP = loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) return loop _ASYNC_TASK_STATS = defaultdict(lambda: [0, 0.0]) # cnt, total_time -def trace_async_exceptions( - func: Optional[Callable[..., Coroutine]], *, verbose: bool = False -) -> Callable[..., Coroutine]: +def trace_async_exceptions(func: Optional[Callable] = None, *, verbose: bool = False): """Decorator to be applied to every coroutine that runs in a separate task. This is needed because asyncio tasks do not propagate exceptions. @@ -2171,41 +2176,81 @@ async def my_coroutine(...): ``` """ - def _decorate(fn): - if not asyncio.iscoroutinefunction(fn): - raise TypeError("trace_async_exceptions can only be used with async functions") - - @functools.wraps(fn) - async def wrapper(*args, **kwargs): - if verbose: - start = time.perf_counter() - try: - return await fn(*args, **kwargs) - except Exception as e: - logger.error(f"Exception in async function {fn.__name__}: {e}") - traceback.print_exc() - sys.exit(1) - finally: + def _log_verbose(name: str, start: float) -> None: + elapsed = (time.perf_counter() - start) * 1000.0 + cnt, tot = _ASYNC_TASK_STATS[name] + _ASYNC_TASK_STATS[name] = [cnt + 1, tot + elapsed] + avg = _ASYNC_TASK_STATS[name][1] / _ASYNC_TASK_STATS[name][0] + + log10 = numpy.log10(max(cnt, 1)) + if numpy.isclose(log10, round(log10)): + logger.info( + f"{name} completed in {elapsed:.3f} ms, " + f"lifetime avg: {avg:.3f} ms, " + f"lifetime cnt: {cnt + 1}" + ) + + def _decorate(fn: Callable): + if asyncio.iscoroutinefunction(fn): + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): if verbose: - elapsed = (time.perf_counter() - start) * 1000.0 - name = fn.__qualname__ - cnt, tot = _ASYNC_TASK_STATS[name] - _ASYNC_TASK_STATS[name] = [cnt + 1, tot + elapsed] - avg = _ASYNC_TASK_STATS[name][1] / _ASYNC_TASK_STATS[name][0] - - log10 = numpy.log10(max(cnt, 1)) - if numpy.isclose(log10, round(log10)): - logger.info( - f"{name} completed in {elapsed:.3f} ms, " - f"lifetime avg: {avg:.3f} ms, " - f"lifetime cnt: {cnt + 1}" - ) + start = time.perf_counter() + try: + return await fn(*args, **kwargs) + except Exception as e: + logger.error(f"Exception in async function {fn.__name__}: {e}") + traceback.print_exc() + sys.exit(1) + finally: + if verbose: + _log_verbose(fn.__qualname__, start) + + elif inspect.isasyncgenfunction(fn): + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + if verbose: + start = time.perf_counter() + agen = fn(*args, **kwargs) + try: + async for item in agen: + yield item + except Exception as e: + logger.error(f"Exception in async generator {fn.__name__}: {e}") + traceback.print_exc() + sys.exit(1) + finally: + if verbose: + _log_verbose(fn.__qualname__, start) + else: + raise TypeError("trace_async_exceptions must be used on async functions or generators") return wrapper return _decorate if func is None else _decorate(func) +def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]: + """Returns Mamba inference state config from the model if it is a hybrid model.""" + from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, + ) + from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols + + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() + return MambaInferenceStateConfig( + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, + ) + return None + + # ============================================================================ # Backward Compatibility Decorators # ============================================================================ diff --git a/megatron/legacy/data/biencoder_dataset_utils.py b/megatron/legacy/data/biencoder_dataset_utils.py index 6fa391c8a22..6d69fabbe48 100644 --- a/megatron/legacy/data/biencoder_dataset_utils.py +++ b/megatron/legacy/data/biencoder_dataset_utils.py @@ -5,11 +5,14 @@ import numpy as np import torch -from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.core import mpu, tensor_parallel -from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \ - pad_and_convert_to_numpy -from megatron.legacy.data.data_samplers import MegatronPretrainingSampler +from megatron.legacy.data.dataset_utils import ( + create_masked_lm_predictions, + pad_and_convert_to_numpy, +) +from megatron.training import get_args, get_tokenizer, print_rank_0 +from megatron.training.datasets.data_samplers import MegatronPretrainingSampler + def make_attention_mask(source_block, target_block): """ diff --git a/megatron/legacy/data/vit_dataset.py b/megatron/legacy/data/vit_dataset.py index e65c536c897..504075a5506 100644 --- a/megatron/legacy/data/vit_dataset.py +++ b/megatron/legacy/data/vit_dataset.py @@ -1,15 +1,17 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import os import random + import numpy as np import torch import torchvision.transforms as T +from PIL import Image, ImageFilter, ImageOps from torchvision import datasets -from megatron.training import get_args -from megatron.legacy.data.image_folder import ImageFolder + from megatron.legacy.data.autoaugment import ImageNetPolicy -from megatron.legacy.data.data_samplers import RandomSeedDataset -from PIL import Image, ImageFilter, ImageOps +from megatron.legacy.data.image_folder import ImageFolder +from megatron.training import get_args +from megatron.training.datasets.data_samplers import RandomSeedDataset class GaussianBlur(object): @@ -236,7 +238,7 @@ def build_train_valid_datasets(data_path, image_size=224): classes_fraction=args.classes_fraction, data_per_class_fraction=args.data_per_class_fraction ) - train_data = RandomSeedDataset(train_data) + train_data = RandomSeedDataset(train_data, args.seed) # validation dataset val_data_path = data_path[1] @@ -244,6 +246,6 @@ def build_train_valid_datasets(data_path, image_size=224): root=val_data_path, transform=val_transform ) - val_data = RandomSeedDataset(val_data) + val_data = RandomSeedDataset(val_data, args.seed) return train_data, val_data diff --git a/megatron/post_training/algos/__init__.py b/megatron/post_training/algos/__init__.py deleted file mode 100644 index f8011007a50..00000000000 --- a/megatron/post_training/algos/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/post_training/algos/distillation.py b/megatron/post_training/algos/distillation.py deleted file mode 100644 index c54add0a8d7..00000000000 --- a/megatron/post_training/algos/distillation.py +++ /dev/null @@ -1,601 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - -"""Distillation loss function(s).""" - -import logging -import re -import types -from abc import ABCMeta -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import modelopt.torch.distill as mtd -import modelopt.torch.opt as mto -import torch -import torch.nn as nn -import torch.nn.functional as F -import yaml -from torch import Tensor -from torch.nn.modules.loss import _Loss - -from megatron.core.dist_checkpointing.mapping import ShardedStateDict -from megatron.core.parallel_state import ( - get_context_parallel_group, - get_pipeline_model_parallel_world_size, - get_tensor_and_context_parallel_rank, - get_tensor_model_parallel_group, - get_virtual_pipeline_model_parallel_world_size, - is_pipeline_last_stage, -) -from megatron.core.pipeline_parallel.schedules import get_tensor_shapes -from megatron.core.transformer import MegatronModule, TransformerConfig, TransformerLayer -from megatron.core.utils import get_model_config - -logger = logging.getLogger(__name__) - - -def load_distillation_config( - config_path: Optional[str], student_cfg: TransformerConfig, teacher_cfg: TransformerConfig -) -> Dict[str, Any]: - """Read the distillation yaml config file specified by ``args.export_kd_cfg``. - - Args: - config_path: Path to user-defined distillation settings yaml file. - If `None`, uses default logits-only distillation mode for GPT models. - student_cfg: Model config for student model. - teacher_cfg: Model config for teacher model. - - WARNING: Assumes intermediate hidden sizes are always that found in the model config's ``hidden_size`` attribute. - """ - if not config_path: - logger.warning("Distillation config not provided. Using default.") - cfg = { - "logit_layers": ["output_layer", "output_layer"], - "intermediate_layer_pairs": [], - "skip_lm_loss": True, - "kd_loss_scale": 1.0, - } - else: - with open(config_path) as f: - cfg = yaml.safe_load(f) - - intermediate_pairs = cfg.get("intermediate_layer_pairs", []) - logit_pair = cfg["logit_layers"] - skip_lm_loss = cfg["skip_lm_loss"] - loss_scale = cfg["kd_loss_scale"] - - criterion = {} - if student_cfg.pipeline_model_parallel_size == 1 or is_pipeline_last_stage(): - criterion[tuple(logit_pair)] = LogitsKLLoss(student_cfg) - # NOTE: Projection layer shared among intermediate layer pairs. - projection_layer = ProjectionLayer(student_cfg, teacher_cfg) - - for entry in intermediate_pairs: - if len(entry) == 2: - student_layer, teacher_layer = entry - loss = "hidden_cosine" - elif len(entry) == 3: - student_layer, teacher_layer, loss = entry - - loss_fn = None - - if loss == "mse": - loss_fn = MSELoss - elif loss == "hidden_cosine": - loss_fn = HiddenStateCosineLoss - else: - assert False, f"loss passed was {loss=}" - - if get_tensor_and_context_parallel_rank() == 0: - print( - "Distillation: Adding intermediate loss between" - f" `{student_layer}` of student (hidden size {student_cfg.hidden_size}) and" - f" `{teacher_layer}` of teacher (hidden size {teacher_cfg.hidden_size})." - ) - student_layer = _adjust_layer_index_for_pp(student_layer, student_cfg) - teacher_layer = _adjust_layer_index_for_pp(teacher_layer, teacher_cfg) - criterion[(student_layer, teacher_layer)] = loss_fn( - student_cfg, projection_layer=projection_layer - ) - - loss_balancer = LogitsAndIntermediatesLossBalancer( - kd_loss_scale=loss_scale, skip_original_loss=skip_lm_loss - ) - - cfg["criterion"] = criterion - cfg["loss_balancer"] = loss_balancer - - return cfg - - -def _adjust_layer_index_for_pp(submodule_name, model_cfg): - """Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.""" - - match = re.search(r'(?<=\.)\d+(?=\.)', submodule_name) - if not match: - return submodule_name - - offset = TransformerLayer._get_layer_offset(model_cfg) - new_layer_idx = int(match.group(0)) - offset - if new_layer_idx < 0: - raise ValueError(f"Layer {submodule_name} does not fall on final PP rank.") - - new_submodule_name = submodule_name.replace(match.group(0), str(new_layer_idx)) - if get_tensor_and_context_parallel_rank() == 0: - print( - f'Distillation: Renamed layer "{submodule_name}" on final PP rank to "{new_submodule_name}"' - ) - return new_submodule_name - - -######################################################## - - -class BaseLoss(_Loss, metaclass=ABCMeta): - """Abstract base class for Megatron distillation losses.""" - - def __init__( - self, model_config: TransformerConfig, projection_layer: Optional[nn.Module] = None - ): - """ - Constructor. - - Args: - model_config: MCore transformer config. - projection_layer: Module which projects student activations to teacher's hidden dim. - """ - super().__init__() - self._config = model_config - self._projection = projection_layer - - def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]: - """Performs projection of student tensor to match teacher's size if necessary.""" - if isinstance(predictions, tuple): - # `ColumnParallelLinear` returns bias too - predictions, targets = predictions[0], targets[0] - - if self._projection is not None: - predictions = self._projection(predictions) - targets = targets.detach() - - return predictions, targets - - def post_forward(self, loss: Tensor, tp_reduce: bool = False, is_sequence_parallel: bool = False) -> Tensor: - """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.""" - loss = loss.transpose(0, 1).contiguous() - return (loss, tp_reduce, is_sequence_parallel) - - -class HiddenStateCosineLoss(BaseLoss): - """ - Calculates Cosine loss between two tensors without reducing the sequence dim. - - The tensors are assumed to be intermediate activations, so extra restrictions are in place. - """ - - def __init__( - self, model_config: TransformerConfig, projection_layer: Optional[nn.Module] = None - ): - """ - Constructor. - - Args: - model_config: MCore transformer config. - projection_layer: Module which projects student activations to teacher's hidden dim. - """ - super().__init__(model_config, projection_layer=projection_layer) - - if self._config.tensor_model_parallel_size > 1 and not self._config.sequence_parallel: - logger.warning( - "``HiddenStateCosineLoss`` only works with tensors with full hidden dim. Ensure the " - "tensor inputs meet this requirement or use `--sequence_parallel` if tensor parallel is enabled." - ) - - def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: - """ - Forward function. - - Args: - predictions: Student model tensors (size [s, b, h]) - targets: Teacher model tensors (size [s, b, h]) - - Returns: - Cosine loss of tensors (size [b, s]) - """ - predictions, targets = self.pre_forward(predictions, targets) - - loss = F.cosine_embedding_loss( - predictions.view(-1, predictions.size(-1)), - targets.view(-1, targets.size(-1)), - targets.new_ones(1), - reduction="none", - ) - loss = loss.view(*predictions.shape[:2]) - - # NOTE: Tensor sequence length is still split among TP ranks. - return self.post_forward(loss, is_sequence_parallel=self._config.sequence_parallel) - - -class MSELoss(BaseLoss): - """Calculates MSE loss between two tensors without reducing the sequence dim.""" - - def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: - """Forward function. - - Args: - predictions: Student model tensors (size [s, b, h]) - targets: Teacher model tensors (size [s, b, h]) - - Returns: - MSE loss of tensors (size [b, s]) - """ - predictions, targets = self.pre_forward(predictions, targets) - - loss = F.mse_loss(predictions, targets, reduction="none") - loss = loss.mean(dim=-1) - - return self.post_forward(loss, is_sequence_parallel=self._config.sequence_parallel) - - -class LogitsKLLoss(BaseLoss): - """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.""" - - def __init__( - self, model_config: TransformerConfig, temperature: float = 1.0, reverse: bool = False - ): - """ - Constructor. - - Args: - model_config: MCore transformer config. - temperature: Divide tensors by this value prior to calculating loss. - reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher) - """ - super().__init__(model_config) - self._temperature = temperature - self._reverse = reverse - - def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: - """ - Forward function. - - Args: - predictions: Student model tensors (size [s, b, h]) - targets: Teacher model tensors (size [s, b, h]) - - Returns: - KLD loss of tensors (size [b, s]) - """ - predictions, targets = self.pre_forward(predictions, targets) - - # Division by temp should happen prior to finding max for both student and teacher. - # Currently we don't use temperature in any of ours runs (temp=1.0) - output_teacher = targets.float() / self._temperature - output_student = predictions.float() / self._temperature - - # Compute local softmax, and the reweight to compute global softmax. - if self._config.tensor_model_parallel_size > 1: - - # Maximum value along vocab dimension across all GPUs. - teacher_logits_max, _ = torch.max(output_teacher, dim=-1) - torch.distributed.all_reduce( - teacher_logits_max, - op=torch.distributed.ReduceOp.MAX, - group=get_tensor_model_parallel_group(), - ) - output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1) - - denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1) - # We can't use standard reduction function here since the computation - # that follows it isn't identical across TP ranks. - denom_teacher = all_reduce_autograd( - denom_teacher, group=get_tensor_model_parallel_group() - ) - - # Maximum value along vocab dimension across all GPUs. - student_logits_max, _ = torch.max(output_student, dim=-1) - torch.distributed.all_reduce( - student_logits_max, - op=torch.distributed.ReduceOp.MAX, - group=get_tensor_model_parallel_group(), - ) - output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach() - - denom_student = torch.sum(torch.exp(output_student), dim=-1) - denom_student = all_reduce_autograd( - denom_student, group=get_tensor_model_parallel_group() - ) - - slen, bsz, sharded_vocab_size = output_student.shape - student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand( - slen, bsz, sharded_vocab_size - ) - teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand( - slen, bsz, sharded_vocab_size - ) - - if self._reverse: - loss = torch.sum( - F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True), - dim=-1, - ) - else: - loss = torch.sum( - F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True), - dim=-1, - ) - - else: - if self._reverse: - loss = torch.sum( - F.kl_div( - F.log_softmax(output_teacher, dim=-1), - F.softmax(output_student, dim=-1), - reduction="none", - ), - dim=-1, - ) - else: - loss = torch.sum( - F.kl_div( - F.log_softmax(output_student, dim=-1), - F.softmax(output_teacher, dim=-1), - reduction="none", - ), - dim=-1, - ) - - return self.post_forward(loss, tp_reduce=True) - - -######################################################## - - -class LogitsAndIntermediatesLossBalancer(mtd.DistillationLossBalancer): - """ - LossBalancer implementation for Logit and Intermediate losses. - - Dynamically weighs distillation and original losses to balance during training. - """ - - def __init__(self, kd_loss_scale: float = 1.0, skip_original_loss: bool = False): - """Constructor. - - Args: - kd_loss_scale: Multiply distillation losses by this before weighing. - (Not used when `skip_original_loss` is True.) - skip_original_loss: Used to signal whether the original loss should be used, regardless - of whether it was passed into ``mtd.DistillationModel.compute_kd_loss()`` or not. - """ - super().__init__() - self._kd_loss_scale = kd_loss_scale - self._skip_original_loss = skip_original_loss - - def forward(self, loss_dict: Dict[str, Tensor]) -> Tensor: - """Forward function. - - Args: - loss_dict: All individual scalar losses, passed in during ``mtd.DistillationModel.compute_kd_loss()`` - - Returns: - Aggregate total scalar loss. - """ - original_loss = loss_dict.pop(mtd.loss_balancers.STUDENT_LOSS_KEY) - for _key in loss_dict: - if _key.startswith(LogitsKLLoss.__name__): - logits_key = _key # should only be one - logits_loss = loss_dict.pop(logits_key) - intermediate_loss = sum(loss_dict.values()) / max(len(loss_dict), 1) - - if intermediate_loss > 0: - dynamic_scale = logits_loss.item() / intermediate_loss.item() - intermediate_loss_scaled = intermediate_loss * dynamic_scale - kd_loss_scale = self._kd_loss_scale / 2.0 - else: - kd_loss_scale = self._kd_loss_scale - intermediate_loss = logits_loss.new_tensor(intermediate_loss) - intermediate_loss_scaled = intermediate_loss - - if self._skip_original_loss: - total_loss = logits_loss + intermediate_loss_scaled - else: - kd_loss = (logits_loss + intermediate_loss_scaled) * kd_loss_scale - dynamic_scale = original_loss.item() / kd_loss.item() - total_loss = original_loss + kd_loss * dynamic_scale - - out_dict = { - "kd_loss": total_loss, - "logits_loss": logits_loss, - "intermediate_loss": intermediate_loss, - } - return out_dict - - -######################################################## - - -class ProjectionLayer(MegatronModule): - """Module to project student layer activations to teacher's size.""" - - def __init__(self, student_config: TransformerConfig, teacher_config: TransformerConfig): - """ - Constructor. - - Args: - student_config: Student's MCore transformer config. - teacher_config: Teacher's MCore transformer config. - """ - super().__init__(config=student_config) - if student_config.hidden_size == teacher_config.hidden_size: - self._fit = nn.Identity() - else: - self._fit = nn.Linear(student_config.hidden_size, teacher_config.hidden_size) - self.apply(self._init_weights) - # Attribute below needed to reduce gradients during backward properly. - setattr(self._fit.weight, "sequence_parallel", self.config.sequence_parallel) - setattr(self._fit.bias, "sequence_parallel", self.config.sequence_parallel) - - def forward(self, student_tensor: Tensor): - """ - Forward function. - - Args: - student_tensor: Tensor to be fit to teacher size. - """ - return self._fit(student_tensor) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=0.01) - if module.bias is not None: - module.bias.data.zero_() - - -class _AllReduce(torch.autograd.Function): - """Implementation from old PyTorch `torch.distributed.nn.parallel`.""" - - @staticmethod - def forward(ctx, op, group, tensor): - ctx.group, ctx.op = group, op - tensor = tensor.clone() - torch.distributed.all_reduce(tensor, op=op, group=group) - return tensor - - @staticmethod - def backward(ctx, grad_output): - return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output)) - - -def all_reduce_autograd( - tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD -): - """Custom all-reduce function. - - Needed instead of other all-reduce functions available when the computation following - the all-reduce call differs per rank. In KL loss, this corresponds to the different numerators. - """ - return _AllReduce.apply(op, group, tensor) - - -######################################################## - - -def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]): - """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core.""" - - # HACK: Get rid of ModelOpt Distillation state - # NOTE: If re-placed, above losses need modifcation as `TransformerConfig` has non-pickleable elements. - mto.ModeloptStateManager(model)._state.pop() - - # HACK: Hide teacher during `sharded_state_dict` method. - def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict: - with self.hide_teacher_model(): - return type(self).sharded_state_dict(self, *args, **kwargs) - - model.sharded_state_dict = types.MethodType(_sharded_state_dict, model) - - # HACK: Skip `lm_loss` bypassing it when training if not needed for backprop. - def _compute_language_model_loss(self, labels, logits) -> Tensor: - if distill_cfg["skip_lm_loss"] and self.training: - return torch.zeros_like(labels) - return type(self).compute_language_model_loss(self, labels, logits) - - model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model) - - # HACK: Skip `lm_loss` always for teacher. - def _compute_language_model_loss(self, labels, logits) -> Tensor: - return torch.zeros_like(labels) - - model.teacher_model.compute_language_model_loss = types.MethodType( - _compute_language_model_loss, model.teacher_model - ) - - # HACK: Pipeline-parallel Distillation requires splitting input tensor into student and teacher parts. - def _set_student_input_tensor_shape(self, shapes: List[Tuple[int]]): - self._tensor_split_idx = shapes[0][-1] - - def _set_input_tensor(self, input_tensors: List[Tensor]): - teacher_inputs = [t[..., self._tensor_split_idx:] if t is not None else t for t in input_tensors] - student_inputs = [t[..., :self._tensor_split_idx] if t is not None else t for t in input_tensors] - type(self).set_input_tensor(self.teacher_model, teacher_inputs) - type(self).set_input_tensor(self, student_inputs) - - model.set_student_input_tensor_shape = types.MethodType(_set_student_input_tensor_shape, model) - model.set_input_tensor = types.MethodType(_set_input_tensor, model) - - # HACK: Concatenate output tensors when PP>1 so they can be passed between ranks. - def _forward(self, *args, **kwargs): - if not self.training: - with self.only_student_forward(): - return type(self).forward(self, *args, **kwargs) - - with torch.no_grad(): - self._teacher_model.eval() - teacher_output = self._teacher_model(*args, **kwargs) - with self.only_student_forward(): - student_output = type(self).forward(self, *args, **kwargs) - - if not is_pipeline_last_stage(): - return torch.cat([student_output, teacher_output], dim=-1) - else: - return student_output - - model.forward = types.MethodType(_forward, model) - - -def get_tensor_shapes_adjust_fn_for_distillation( - model: Union[torch.nn.Module, List[torch.nn.Module]], - seq_length: int, - micro_batch_size: int, - decoder_seq_length: Optional[int] = None, - forward_only: bool = False, -) -> Union[Callable, None]: - if ( - forward_only - or get_pipeline_model_parallel_world_size() == 1 - or get_virtual_pipeline_model_parallel_world_size() is not None - ): - return None - # Unwrap - if isinstance(model, list): - model = model[0] - while hasattr(model, "module"): - model = model.module - if not isinstance(model, mtd.DistillationModel): - return None - - def adjust_tensor_shapes(recv_tensor_shapes: List[Tuple[int, ...]], send_tensor_shapes: List[Tuple[int, ...]]): - teacher_config = get_model_config(model.teacher_model) - tp_group = get_tensor_model_parallel_group() - cp_group = get_context_parallel_group() - - teacher_recv_tensor_shapes = get_tensor_shapes( - seq_length=seq_length, - micro_batch_size=micro_batch_size, - decoder_seq_length=decoder_seq_length, - config=teacher_config, - tp_group=tp_group, - cp_group=cp_group, - ) - teacher_send_tensor_shapes = get_tensor_shapes( - seq_length=seq_length, - micro_batch_size=micro_batch_size, - decoder_seq_length=decoder_seq_length, - config=teacher_config, - tp_group=tp_group, - cp_group=cp_group, - ) - model.set_student_input_tensor_shape(recv_tensor_shapes) - - for i, shape in enumerate(recv_tensor_shapes): - shape = list(shape) - shape[-1] += teacher_recv_tensor_shapes[0][-1] - recv_tensor_shapes[i] = tuple(shape) - for i, shape in enumerate(send_tensor_shapes): - shape = list(shape) - shape[-1] += teacher_send_tensor_shapes[0][-1] - send_tensor_shapes[i] = tuple(shape) - - return recv_tensor_shapes, send_tensor_shapes - - return adjust_tensor_shapes diff --git a/megatron/post_training/checkpointing.py b/megatron/post_training/checkpointing.py index aac59341e37..143cbb9c6ab 100644 --- a/megatron/post_training/checkpointing.py +++ b/megatron/post_training/checkpointing.py @@ -183,14 +183,7 @@ def _remove_prefix_state_dict_pre_hook( logger.warning(f"PyTorch version {get_torch_version()} below 2.6 detected." f" Forcing dist_ckpt_save_pre_mcore_014 behavior.") - # NOTE: singleton_local_shards only take care of the weight and bias. There are be issue when linear_fc1._amax - # is a matrix such as NVFP4 real quant, awq, and blockwise 128. - if args.dist_ckpt_save_pre_mcore_014 or force_pre_mcore_014: - metadata = {"singleton_local_shards": False} - else: - metadata = {"singleton_local_shards": True} - - sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix, metadata=metadata) + sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix) if additional_sharded_prefix: unwrapped_model[0]._register_load_state_dict_pre_hook( diff --git a/megatron/post_training/docs/distillation.md b/megatron/post_training/docs/distillation.md index 6ca1ec18417..9f0d5524176 100644 --- a/megatron/post_training/docs/distillation.md +++ b/megatron/post_training/docs/distillation.md @@ -75,7 +75,7 @@ Model Optimizer modifies the model using the loss criterion present in the disti defines a loss function between two module attribute names of the teacher and student model, respectively. Default loss function used between logits is a KL-Divergence Loss and loss used among intermediate tensors is Cosine-Similarity, -both defined in `megatron/inference/algos/distillation.py`. +both defined in `modelopt.torch.distill.plugins.megatron`. ## Restrictions diff --git a/megatron/post_training/generate.py b/megatron/post_training/generate.py index 0c5be3eceab..2a124734a30 100644 --- a/megatron/post_training/generate.py +++ b/megatron/post_training/generate.py @@ -104,7 +104,7 @@ def simple_speculative_generate( input_ids: torch.Tensor, images: Optional[torch.Tensor] = None, osl: int = 32, - draft_length: int = 0, + steps: int = 0, eos_token_id: List[int] = [], disable_tqdm: bool = False, ): @@ -127,7 +127,7 @@ def simple_speculative_generate( # Speculative decoding forward # NOTE: PP is not yet supported. - new_token, draft_tokens = model.pseudo_speculative_generate(input_ids, steps=draft_length) + new_token, draft_tokens = model.pseudo_speculative_generate(input_ids, steps=steps) # Always accept the first token. input_ids = output_ids[:, : offset] @@ -138,6 +138,8 @@ def simple_speculative_generate( for i in range(draft_tokens.shape[-1]): if torch.equal(draft_tokens[:, i : i + 1], output_ids[:, offset: offset + 1]): offset += 1 + else: + break # Broadcast the accepted offset from the last rank. offset = [offset] diff --git a/megatron/post_training/loss_func.py b/megatron/post_training/loss_func.py index eb8dbca1c6a..9c99529172d 100644 --- a/megatron/post_training/loss_func.py +++ b/megatron/post_training/loss_func.py @@ -55,16 +55,18 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: GPTMo num_tokens = loss_mask.sum().clone().detach().to(torch.int) report = {'lm loss': torch.cat([loss_lm.clone().detach().view(1), num_tokens.view(1)])} - if model.training and args.export_kd_teacher_load: + if args.export_kd_teacher_load: # [ModelOpt]: Handle knowledge distillation losses = model.compute_kd_loss( student_loss=loss_lm, loss_reduction_fn=lambda x: _mask_loss(x, loss_mask), ) - loss = losses["kd_loss"] report["total loss"] = torch.cat([losses["kd_loss"].clone().detach().view(1), num_tokens.view(1)]) report["logits distillation loss"] = torch.cat([losses["logits_loss"].clone().detach().view(1), num_tokens.view(1)]) report["intermediate distillation loss"] = torch.cat([losses["intermediate_loss"].clone().detach().view(1), num_tokens.view(1)]) + if model.training: + loss = losses["kd_loss"] + return loss, num_tokens, report diff --git a/megatron/post_training/model_builder.py b/megatron/post_training/model_builder.py index 34daa279651..cb2654e7107 100644 --- a/megatron/post_training/model_builder.py +++ b/megatron/post_training/model_builder.py @@ -7,6 +7,8 @@ from typing import Any, Dict import modelopt.torch.distill as mtd +import modelopt.torch.distill.plugins.megatron as mtd_mcore +import modelopt.torch.opt as mto import yaml from megatron.core.models.gpt import GPTModel as MCoreGPTModel @@ -18,7 +20,6 @@ from megatron.core.post_training.modelopt.gpt.state_dict_hooks import ( mcore_gpt_load_te_state_dict_pre_hook, ) -from megatron.post_training.algos import distillation from megatron.post_training.checkpointing import load_modelopt_checkpoint, load_modelopt_state from megatron.training import get_args, print_rank_0 from megatron.training.arguments import core_transformer_config_from_args @@ -285,7 +286,7 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c ), "ModelOpt Distillation currently incompatible with interleaved pipeline schedule." teacher_config = _load_teacher_model_config(args.export_kd_teacher_load) - distill_cfg = distillation.load_distillation_config( + distill_cfg = mtd_mcore.setup_distillation_config( args.export_kd_cfg, student_cfg=config, teacher_cfg=core_transformer_config_from_args(teacher_config) ) if "hybrid_override_pattern" in teacher_config and args.is_hybrid_model: @@ -297,14 +298,15 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c kd_config = { "teacher_model": (_teacher_provider, [teacher_config, model_kwargs], {}), - "criterion": distill_cfg["criterion"], - "loss_balancer": distill_cfg["loss_balancer"], + "criterion": distill_cfg.criterion, + "loss_balancer": distill_cfg.loss_balancer, } model = mtd.convert(model, mode=[("kd_loss", kd_config)]) - # Additional tweaks needed for MCore/Nemo. - # NOTE: Distillation state manually removed in this function. - # ModelOpt state restoration above will not return a `mtd.DistillationModel` for simplicity reasons. - distillation.adjust_distillation_model_for_mcore(model, distill_cfg) + # Additional tweaks needed for MCore. + # (accounts for sharded state, pipeline parallel, and potentially skipping LM loss) + mtd_mcore.adjust_distillation_model_for_mcore(model, distill_cfg) + # Also remove KD mode state to prevent issues with re-conversion after restore. + mto.ModeloptStateManager(model).state_dict().pop() # TODO(aanoosheh): remove once fixed in ModelOpt return model diff --git a/megatron/post_training/non_loss_data_func.py b/megatron/post_training/non_loss_data_func.py index 49fb9220258..49c29b4912c 100644 --- a/megatron/post_training/non_loss_data_func.py +++ b/megatron/post_training/non_loss_data_func.py @@ -8,10 +8,11 @@ from megatron.training.utils import unwrap_model -def report_draft_acceptance_length(model, osl: int = 64, draft_length: int = 7): +def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7): """Report MTBench acceptance length.""" tokenizer = get_tokenizer()._tokenizer unwrapped_model = unwrap_model(model)[0] + parallel_draft_step = unwrapped_model.eagle_config.parallel_draft_step if hasattr(unwrapped_model, "eagle_config") else 1 if unwrapped_model.training: return @@ -33,15 +34,15 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_length: int = 7): conversations, return_tensors="pt", add_generation_prompt=True ).to(torch.cuda.current_device()) output_ids, actual_osl, steps = simple_speculative_generate( - unwrapped_model, input_ids, osl=osl, draft_length=draft_length, disable_tqdm=True + unwrapped_model, input_ids, osl=osl, steps=draft_steps, disable_tqdm=True ) total_osl += actual_osl total_steps += steps if torch.distributed.get_rank() == 0: al = actual_osl / steps - ar = al / draft_length + ar = al / (draft_steps + parallel_draft_step - 1) print( - "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2}".format( + "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format( torch.distributed.get_rank(), torch.distributed.get_world_size(), category, @@ -49,15 +50,16 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_length: int = 7): ar, steps, actual_osl, - draft_length, + draft_steps, + parallel_draft_step, ), flush=True, ) if torch.distributed.get_rank() == 0: al = total_osl / total_steps - ar = al / draft_length + ar = al / (draft_steps + parallel_draft_step - 1) print( - "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2}".format( + "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format( torch.distributed.get_rank(), torch.distributed.get_world_size(), "average", @@ -65,7 +67,8 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_length: int = 7): ar, total_steps, total_osl, - draft_length, + draft_steps, + parallel_draft_step, ), flush=True, ) diff --git a/megatron/post_training/utils.py b/megatron/post_training/utils.py index 5d9f301cd41..4bec8c96cf1 100644 --- a/megatron/post_training/utils.py +++ b/megatron/post_training/utils.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import os import torch from datasets import load_dataset @@ -34,7 +35,7 @@ def mtbench_to_oai_chat(example): example["conversations"] = conversations return example - dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train") + dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train", token=os.environ.get("HF_TOKEN", None)) return dataset.map(mtbench_to_oai_chat) def to_empty_if_meta(module: torch.nn.Module, *, device: torch.device, recurse=True): diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py index 58613b364a6..ad22bd14ac9 100644 --- a/megatron/rl/inference/megatron.py +++ b/megatron/rl/inference/megatron.py @@ -5,10 +5,11 @@ from argparse import Namespace from pydantic import PrivateAttr +import torch.distributed as dist from megatron.core import parallel_state +from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext -from megatron.core.inference.coordinator import DynamicEngineCoordinator from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine @@ -23,9 +24,11 @@ SimpleTextGenerationController, ) from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import log_single_rank +from megatron.core.utils import get_mamba_inference_state_config_from_model, log_single_rank from megatron.training.global_vars import get_args, get_tokenizer +from megatron.training import get_wandb_writer from ..inference.inference_interface import ( ChatInferenceInterface, @@ -102,38 +105,36 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen """ tokenizer = get_tokenizer() - num_cuda_graphs = None - if args.enable_cuda_graph: - num_cuda_graphs = args.inference_dynamic_batching_num_cuda_graphs + enable_cuda_graph = args.cuda_graph_impl == "local" - module = model.module.module if hasattr(model.module, "module") else model.module + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) # Inference context. inference_context = DynamicInferenceContext( params_dtype=args.params_dtype, - num_layers=args.num_layers, + num_layers=args.num_layers // args.pipeline_model_parallel_size, kv_channels=args.kv_channels, num_attention_heads=( args.num_query_groups if args.group_query_attention else args.num_attention_heads ), max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=num_cuda_graphs, + num_cuda_graphs=( + args.inference_dynamic_batching_num_cuda_graphs + if enable_cuda_graph + else None + ), + block_size_tokens=args.inference_dynamic_batching_block_size, buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, - chunk_size_tokens=args.inference_dynamic_batching_chunk_size, - buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, - max_requests_override=args.inference_dynamic_batching_max_requests_override, - max_tokens_override=args.inference_dynamic_batching_max_tokens_override, + max_tokens=args.inference_dynamic_batching_max_tokens, tensor_model_parallel_size=args.tensor_model_parallel_size, materialize_only_last_token_logits=True, - unified_memory_kvcache=args.inference_dynamic_batching_unified_memory_kvcache, - is_hybrid_model=args.is_hybrid_model, - layer_type_list=module.decoder.layer_type_list if args.is_hybrid_model else None, - mamba_head_dim=args.mamba_head_dim, - mamba_num_groups=args.mamba_num_groups, - mamba_d_model=args.hidden_size, - mamba_d_conv=4 if args.is_hybrid_model else None, - mamba_d_state=args.mamba_state_dim, + mamba_inference_state_config=mamba_inference_state_config, + cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, + kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, + qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, + use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, + use_flashinfer_fused_rope=None, + unified_memory_level=args.inference_dynamic_batching_unified_memory_level, metrics_writer=metrics_writer, ) @@ -150,7 +151,7 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen return DynamicInferenceEngine( controller=text_generation_controller, context=inference_context, - enable_cuda_graph=args.enable_cuda_graph, + enable_cuda_graph=enable_cuda_graph, random_seed=args.seed, inference_logging_step_interval=inference_logging_step_interval, ) @@ -159,9 +160,8 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): """Interface to use MCoreEngine directly as an inference engine.""" - _coordinator: DynamicEngineCoordinator = PrivateAttr(None) - _engine_task: asyncio.Task = PrivateAttr(None) - _kill_engine: bool = PrivateAttr(False) + _client: InferenceClient = PrivateAttr(None) + _inference_engine: DynamicInferenceEngine = PrivateAttr(None) async def base_generate(self, request: InferenceRequest): @@ -174,25 +174,29 @@ async def base_generate(self, request: InferenceRequest): isinstance(p, str) for p in request.prompt ), "MegatronLocal only supports string prompts." + assert self._client is not None, "Client is not initialized" + tokenizer = get_tokenizer() sampling_params = SamplingParams( - num_tokens_to_generate=request.generation_args.max_tokens or 1024, + num_tokens_to_generate=None, + num_tokens_total=request.generation_args.max_tokens, temperature=request.generation_args.temperature or 1.0, top_k=request.generation_args.top_k or 0, top_p=request.generation_args.top_p or 0.0, - termination_id=self._coordinator.engine.controller.tokenizer.eod, + termination_id=self._inference_engine.controller.tokenizer.eod, return_log_probs=True, skip_prompt_log_probs=True, add_BOS=tokenizer.bos is not None, ) - request_ids = [ - self._coordinator.schedule_request(prompt=prompt, sampling_params=sampling_params) + requests = [ + self._client.add_request(prompt=prompt, sampling_params=sampling_params) for prompt in request.prompt ] - responses = await asyncio.gather( - *[self._coordinator.get_response(id) for id in request_ids] + records = await asyncio.gather( + *requests ) + responses = [record[-1] for record in records] return [ InferenceResponse( response=r.generated_text, @@ -229,28 +233,32 @@ async def launch(cls, model: GPTModel, **kwargs): "wandb module is available. Inference logging will be disabled.") inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(args, model, inference_logging_step_interval, metrics_writer) - coordinator = DynamicEngineCoordinator( - inference_engine, - inference_max_requests=inference_engine.context.max_requests, - log_level=0, - ) + await inference_engine.start_listening_to_data_parallel_coordinator(inference_coordinator_port=41521, launch_inference_coordinator=True) + if dist.get_rank() == 0: + # TODO: We have to do this only on the rank 0 process, should be fixed in the future when we have support for multiple inference clients. !2278 + client = InferenceClient(inference_coordinator_port=41521) + await client.start() + else: + client = None launched_server = cls(**kwargs) - launched_server._coordinator = coordinator - - loop = asyncio.get_running_loop() - - coordinator.startup(loop) + launched_server._client = client + launched_server._inference_engine = inference_engine return launched_server async def kill(self): - await self._coordinator.shutdown() + if dist.get_rank() == 0: + await self._client.stop_engines() + await self._inference_engine.stopped.wait() async def suspend(self): - await self._coordinator.suspend_engine() - - def resume(self): - self._coordinator.resume_engine() - + if dist.get_rank() == 0: + await self._client.pause_engines() + await self._inference_engine.paused.wait() + + async def resume(self): + if dist.get_rank() == 0: + self._client.unpause_engines() + await self._inference_engine.running.wait() class MegatronChatLocal(ChatInferenceInterface, MegatronLocal): ... diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index c0992778d57..11e005f74af 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -24,7 +24,7 @@ from megatron.core import mpu from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer -from megatron.core.inference.utils import get_event_loop +from megatron.core.utils import get_asyncio_loop from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.optimizer import MegatronOptimizer @@ -607,11 +607,11 @@ def get_environment_rollouts( ), "n_prompts must be divisible by data_parallel_world_size" with nvtx_range("rollout-collection"): - loop = get_event_loop() + loop = get_asyncio_loop() with megatron_rl_inference_mode( model, optimizer, - args.enable_cuda_graph, + args.cuda_graph_impl, args.rl_reset_cuda_graphs, args.rl_offload_optimizer_during_inference, args.rl_offload_kv_cache_during_training, @@ -1006,7 +1006,7 @@ def prepare_trajectories( args = get_args() # Only process if we have inference_logprobs if inference_logprobs and any(lp is not None for lp in inference_logprobs): - if args.use_sequence_packing: + if args.rl_use_sequence_packing: # For sequence packing, we need to pad all logprobs to the same size padded_logprobs = [] for logprobs in inference_logprobs: @@ -1207,14 +1207,14 @@ def prepare_data_for_update( # [g, group_size] # Making an assumption that all groups are of the same size! # For packing mode, use all rollouts to compute rewards - rollouts_for_rewards = all_rollouts if args.use_sequence_packing else rollouts + rollouts_for_rewards = all_rollouts if args.rl_use_sequence_packing else rollouts rewards = torch.tensor( [[rollout.reward for rollout in group] for group in rollouts_for_rewards], device='cpu' ) # We flatten them for logging. with nvtx_range("prepare_trajectories"): - if args.use_sequence_packing: + if args.rl_use_sequence_packing: trajs, generation_masks, inference_logprobs = prepare_packed_trajectories( all_rollouts, tokenizer, args ) @@ -1228,14 +1228,14 @@ def prepare_data_for_update( # Sequence packing or standard processing packing_context = {} # Store all packing-related data - if args.use_sequence_packing: + if args.rl_use_sequence_packing: with nvtx_range("sequence_packing"): timers('sequence-packing-overhead', log_level=1).start() - bin_size = args.sequence_packing_bin_size + bin_size = args.rl_sequence_packing_bin_size # Create packer with max sequences per bin limit to prevent extreme imbalance - max_sequences_per_bin = getattr(args, 'sequence_packing_max_sequences_per_bin', 100) + max_sequences_per_bin = getattr(args, 'rl_sequence_packing_max_sequences_per_bin', 100) packer = SequencePacker( bin_size=bin_size, pad_token=tokenizer.pad, @@ -1276,7 +1276,7 @@ def prepare_data_for_update( world_size = mpu.get_expert_data_parallel_world_size() # Choose distribution algorithm based on args.sequence_packing_algo - packing_algo = getattr(args, 'sequence_packing_algo', 'fifo') + packing_algo = getattr(args, 'rl_sequence_packing_algo', 'fifo') if packing_algo == 'round-robin': # Round-robin assignment: rank i gets bins [i, i+world_size, i+2*world_size, ...] @@ -1596,7 +1596,7 @@ def prepare_data_for_update( ) original_loss_mask[~generation_masks] = 0.0 - if not args.use_sequence_packing: + if not args.rl_use_sequence_packing: # Use original masks if not packing attention_mask = original_attention_mask loss_mask = original_loss_mask @@ -1606,7 +1606,7 @@ def prepare_data_for_update( timers('compute-logprobs', log_level=0).start() # Before we can update the model, we need to get the logprobs for the \pi_{old} model. # Use packed sequences if packing is enabled for performance benefits - if args.use_sequence_packing and 'packed_trajs' in packing_context: + if args.rl_use_sequence_packing and 'packed_trajs' in packing_context: compute_trajs = packing_context['packed_trajs'] compute_position_ids = packing_context['packed_position_ids'] compute_attention_mask = packing_context['packed_attention_mask'] @@ -1661,7 +1661,7 @@ def prepare_data_for_update( if ( inference_logprobs is not None and args.rl_inference_logprobs_is_correction - and not args.use_sequence_packing + and not args.rl_use_sequence_packing ): inference_logprobs = align_unpacked_inference_logprobs( inference_logprobs=inference_logprobs, @@ -1670,14 +1670,14 @@ def prepare_data_for_update( group_stats=group_stats, ) else: - if not args.use_sequence_packing: + if not args.rl_use_sequence_packing: # Keep inference_logprobs as None instead of zeros inference_logprobs = None # For sequence packing, inference_logprobs will be handled separately # Handle packing of inference_logprobs for sequence packing mode if ( - args.use_sequence_packing + args.rl_use_sequence_packing and inference_logprobs is not None and args.rl_inference_logprobs_is_correction ): @@ -1687,7 +1687,7 @@ def prepare_data_for_update( inference_logprobs=inference_logprobs, packing_info=packing_context['packing_info'], generation_masks=generation_masks, - bin_size=args.sequence_packing_bin_size, + bin_size=args.rl_sequence_packing_bin_size, ) # Store packed inference logprobs in packing context @@ -1754,7 +1754,7 @@ def prepare_data_for_update( timers('prepare-advantages').stop() with nvtx_range("create_dataloader"): - if args.use_sequence_packing: + if args.rl_use_sequence_packing: # Store packing context in runtime state for forward_step runtime_state = get_rl_runtime_state() runtime_state.packing_context = packing_context @@ -2049,14 +2049,14 @@ def evaluate_and_print_results_rl( with megatron_rl_inference_mode( model, optimizer, - args.enable_cuda_graph, + args.cuda_graph_impl, args.rl_reset_cuda_graphs, args.rl_offload_optimizer_during_inference, args.rl_offload_kv_cache_during_training, args.rl_remove_kv_cache_during_training, ) as inference_interface: - loop = get_event_loop() + loop = get_asyncio_loop() rank = torch.distributed.get_rank() if rank == 0: @@ -2230,7 +2230,7 @@ def calculate_grpo_loss( def megatron_rl_inference_mode( model: list[LanguageModule], optimizer: MegatronOptimizer, - enable_cuda_graph: bool, + cuda_graph_impl: str, reset_cuda_graphs: bool, offload_optimizer_during_inference: bool, offload_kv_cache_during_training: bool, @@ -2241,7 +2241,7 @@ def megatron_rl_inference_mode( Args: model: model to prepare. optimizer: optimizer used to train the model. - enable_cuda_graph: use cuda graphs or not. + cuda_graph_impl: which cuda graph implementation to use. reset_cuda_graphs: rebuild cuda graphs for each inference stage or not. offload_optimizer_during_inference: move optimizer to cpu during inference or not. offload_kv_cache_during_training: manually offload kv cache to host before training or not. @@ -2252,7 +2252,7 @@ def megatron_rl_inference_mode( """ args = get_args() - loop = get_event_loop() + loop = get_asyncio_loop() nvtx_range = get_nvtx_range() print(f"[{dist.get_rank()}:DP] Entering inference mode") @@ -2275,8 +2275,9 @@ def megatron_rl_inference_mode( with nvtx_range("offload-optimizer-before-inference"): optimizer.offload_to_cpu() - if enable_cuda_graph: - toggle_cuda_graphs(lang_module, True, reset_cuda_graphs=reset_cuda_graphs) + # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to. + if cuda_graph_impl != "none": + toggle_cuda_graphs(lang_module, cuda_graph_impl, reset_cuda_graphs=reset_cuda_graphs) inference_interface = get_inference_interface(args, loop, model) @@ -2286,25 +2287,28 @@ def megatron_rl_inference_mode( reset_cuda_graphs ), "reset_cuda_graphs must be True when offloading kv cache during training" print( - f"[{dist.get_rank()}:DP] Restoring kv cache ({inference_interface._coordinator.engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU" + f"[{dist.get_rank()}:DP] Restoring kv cache ({inference_interface._inference_engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU" ) - kv_cache = inference_interface._coordinator.engine.context.memory_buffer - inference_interface._coordinator.engine.context.memory_buffer = kv_cache.cuda() + kv_cache = inference_interface._inference_engine.context.memory_buffer + inference_interface._inference_engine.context.memory_buffer = kv_cache.cuda() elif remove_kv_cache_during_training: - if inference_interface._coordinator.engine.context.memory_buffer is None: - inference_interface._coordinator.engine.context.build_memory_buffer() + if inference_interface._inference_engine.context.memory_buffer is None: + inference_interface._inference_engine.context.build_memory_buffer() - if enable_cuda_graph and not _CudagraphGlobalRecord.cudagraph_created: + # TODO: Improve this if statement once a change is made to CUDA graph handling. + cuda_graph_exists = len(_CudagraphGlobalRecord.cudagraph_inference_record) != 0 + if cuda_graph_impl != "none" and not cuda_graph_exists: with nvtx_range("wait-for-decode-only"): - while not inference_interface._coordinator.engine.context.is_decode_only(): + while not inference_interface._inference_engine.context.is_decode_only(): active_requests, finished_requests, step_time = loop.run_until_complete( - inference_interface._coordinator.engine.async_step() + inference_interface._inference_engine.async_step() ) with nvtx_range("build-cuda-graphs"): - inference_interface._coordinator.engine.build_cuda_graphs(reset_context=False) + inference_interface._inference_engine.create_cuda_graphs(reset_context=True) - inference_interface.resume() + loop.run_until_complete(inference_interface.resume()) + print(f"[{dist.get_rank()}:DP] Entered inference mode") yield inference_interface with nvtx_range("suspend-engine"): @@ -2312,16 +2316,17 @@ def megatron_rl_inference_mode( with nvtx_range("offload-kv-cache-after-inference"): if offload_kv_cache_during_training: - kv_cache = inference_interface._coordinator.engine.context.memory_buffer + kv_cache = inference_interface._inference_engine.context.memory_buffer print( f"[{dist.get_rank()}:DP] Offloading kv cache ({kv_cache.numel() * kv_cache.element_size() / 1024**3:.2f} GB) to CPU" ) - inference_interface._coordinator.engine.context.memory_buffer = kv_cache.cpu() + inference_interface._inference_engine.context.memory_buffer = kv_cache.cpu() elif remove_kv_cache_during_training: - inference_interface._coordinator.engine.context.memory_buffer = None + inference_interface._inference_engine.context.memory_buffer = None - if enable_cuda_graph: - toggle_cuda_graphs(lang_module, False, reset_cuda_graphs=reset_cuda_graphs) + # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to. + if cuda_graph_impl != "none": + toggle_cuda_graphs(lang_module, 'none', reset_cuda_graphs=reset_cuda_graphs) if offload_optimizer_during_inference: with nvtx_range("onload-optimizer-after-inference"): @@ -2348,7 +2353,7 @@ def get_iteration_sequence_count(args): def update_sequence_packing_metrics(args): """Update bin tracking for sequence packing mode.""" - if args.use_sequence_packing: + if args.rl_use_sequence_packing: bin_count = ( mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches() ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index bb1b17e9ba2..be667e32419 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -9,7 +9,6 @@ from pathlib import Path import re import types -import warnings import torch import torch.nn.functional as F @@ -35,6 +34,7 @@ ) from megatron.core.activations import squared_relu from megatron.core.fusions.fused_bias_geglu import quick_gelu +from megatron.training.dist_signal_handler import SIGNAL_MAP from megatron.training.utils import ( get_device_arch_version, update_use_dist_ckpt, @@ -1062,8 +1062,6 @@ def validate_args(args, defaults={}): # MoE Spec check if args.num_experts == 0: args.num_experts = None - if args.num_experts is not None: - assert args.spec is None, "Model Spec must be None when using MoEs" if args.num_experts is not None and args.moe_ffn_hidden_size is None: args.moe_ffn_hidden_size = args.ffn_hidden_size print("Warning: moe_ffn_hidden_size is not set, using ffn_hidden_size for MoE instead.") @@ -1108,6 +1106,20 @@ def validate_args(args, defaults={}): any([args.train_data_path, args.valid_data_path, args.test_data_path]) \ <= 1, "A single data source must be provided in training mode, else None" + if args.fim_data: + extra_tokens = [ + args.fim_prefix_token, + args.fim_middle_token, + args.fim_suffix_token, + args.fim_pad_token, + args.fim_eod_token, + ] + assert not args.mock_data, "Mock dataset is not supported with FIM dataset." + assert not args.legacy_tokenizer, "FIM dataset is not supported with legacy tokenizers." + assert args.fim_rate, "--fim-rate should be specified." + assert args.fim_spm_rate, "--fim-spm-rate should be specified." + assert all(token is not None for token in extra_tokens), "FIM extra tokens should be specified." + # Deterministic mode if args.deterministic_mode: assert not args.use_flash_attn, "Flash attention can not be used in deterministic mode." @@ -1182,7 +1194,6 @@ def validate_args(args, defaults={}): if args.inference_dynamic_batching: assert args.inference_dynamic_batching_buffer_size_gb is not None assert args.inference_dynamic_batching_block_size % 256 == 0, "block size should be a multiple of 256" - assert args.inference_dynamic_batching_buffer_guaranteed_fraction is not None # MoE upcycling check if args.moe_use_upcycling: @@ -1407,7 +1418,7 @@ def _add_transformer_engine_args(parser): help='Execute wgrad in higher precision even for FP8 runs', dest='fp8_wgrad') group.add_argument('--transformer-impl', default='transformer_engine', - choices=['local', 'transformer_engine'], + choices=['local', 'transformer_engine', 'inference_optimized'], help='Which Transformer implementation to use.') group.add_argument('--fallback-to-eager-attn', action='store_true', help='Fallback to eager attention in TE implementation. ' @@ -1516,34 +1527,22 @@ def _add_inference_args(parser): help='Enable dynamic batching mode.') group.add_argument('--inference-dynamic-batching-buffer-size-gb', type=float, default=40., - help='Total buffer size (GB) allocated for the block-level KV ' - 'memory.') + help='Amount of on-GPU memory allocated for the KV cache. ' + 'The total amount of memory allocated for the KV cache ' + '(CPU + GPU memory) depends on the value set for the ' + 'unified virtual memory (UVM) level (via ' + '`--inference-dynamic-batching-unified-memory-level`).' + 'If the UVM level is 0, then only GPU memory is used and ' + 'the total memory equals `buffer_size_gb`. If the UVM ' + 'level is 1, then additional memory is utilized on the ' + 'CPU and the total memory equals `2 * buffer_size_gb`.') group.add_argument('--inference-dynamic-batching-block-size', type=int, default=256, help='KV cache block size. ' 'It should be a multiple of 256') - group.add_argument('--inference-dynamic-batching-buffer-guaranteed-fraction', - type=float, default=0.2, - help='Space is reserved within the inference context ' - 'memory buffer to guarantee that a minimum number of ' - 'active requests will always be able to run to ' - 'completion. This is to avoid the context being deadlocked ' - 'by paused requests.') - group.add_argument('--inference-dynamic-batching-buffer-overflow-factor', - type=float, default=None, - help='Scaling factor over the memory buffer size for auto ' - 'computing `max_requests` and `max_tokens`. This scaling ' - 'factor is used for fitting more requests and tokens in ' - 'the memory buffer than it can safely hold, which in turn ' - 'increases throughput.') - group.add_argument('--inference-dynamic-batching-max-requests-override', - type=int, default=None, - help='If set, this overrides the max requests as computed ' - 'from `--inference-dynamic-batching-buffer-overflow-factor`.') - group.add_argument('--inference-dynamic-batching-max-tokens-override', + group.add_argument('--inference-dynamic-batching-max-tokens', type=int, default=None, - help='If set, this overrides the max tokens as computed ' - 'from `--inference-dynamic-batching-buffer-overflow-factor`.') + help='Override the inference context\'s default `max_tokens`.') group.add_argument('--inference-dynamic-batching-num-cuda-graphs', type=int, default=16, help='Maximum number of cuda graphs to capture, where the ' @@ -1560,7 +1559,7 @@ def _add_inference_args(parser): action='store_true', default=False, help='Only use cuda graphs for decode-only steps, not prefill and mixed steps.') group.add_argument('--inference-dynamic-batching-unified-memory-level', - type=int, default=0, choices=[0, 1], + type=int, default=1, choices=[0, 1], help='Set unified memory usage within the dynamic ' 'inference context. The levels are: 0) no unified memory, ' '1) allocate `memory_buffer` in unified memory. ' @@ -1580,7 +1579,8 @@ def _add_inference_args(parser): group.add_argument('--inference-wandb-logging-step-interval', type=int, default=0, help='Step interval for logging inference metrics to wandb. ' 'Default to 0 to disable inference wandb logging.') - + group.add_argument("--inference-coordinator-port", type=int, default=12346, + help="This port will be used to setup the inference coordinator on node-0") return parser @@ -2273,7 +2273,10 @@ def _add_training_args(parser): help='Exit the program after this many minutes.') group.add_argument('--exit-signal-handler', action='store_true', help='Dynamically save the checkpoint and shutdown the ' - 'training if SIGTERM is received') + 'training if signal is received') + group.add_argument('--exit-signal', type=str, default='SIGTERM', + choices=list(SIGNAL_MAP.keys()), + help='Signal to use for exit signal handler. If not specified, defaults to SIGTERM.') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') group.add_argument('--no-masked-softmax-fusion', @@ -3043,6 +3046,27 @@ def _add_data_args(parser): 'If instead this argument is set, the training flow will treat all tokens ' 'that share the same id as the pad token as true pad tokens, potentially ' 'causing severe training instability.') + group.add_argument('--fim-data', action='store_true', help='Whether to use the FIM dataset.') + group.add_argument('--fim-rate', type=float, default=0.5, + help='Probability to convert a training sample into a FIM format.') + group.add_argument('--fim-spm-rate', type=float, default=0.5, + help='Probability that the a FIM sample uses the SPM format over the PSM format.') + group.add_argument('--fim-split-sample', type=str, default=None, + help='String around which to split the sample for FIM.') + group.add_argument('--fim-fragment-rate', type=float, default=None, + help='Rate of FIM on each fragment when --fim-split-sample is not None.') + group.add_argument('--fim-no-prefix', type=str, default=None, + help='Do not apply FIM to fragments that start with this prefix') + group.add_argument('--fim-prefix-token', type=str, default='', + help='FIM prefix token') + group.add_argument('--fim-middle-token', type=str, default='', + help='FIM middle token') + group.add_argument('--fim-suffix-token', type=str, default='', + help='FIM suffix token') + group.add_argument('--fim-pad-token', type=str, default='', + help='FIM PAD token') + group.add_argument('--fim-eod-token', type=str, default='<|endoftext|>', + help='FIM EOD token') return parser diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index feacccba162..48a2025fa63 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -270,7 +270,7 @@ def checkpoint_exists(checkpoints_path): def read_metadata(tracker_filename): # Read the tracker file and either set the iteration or # mark it as a release checkpoint. - iteration = 0 + iteration = -1 release = False with open_file(tracker_filename, 'r') as f: @@ -283,7 +283,10 @@ def read_metadata(tracker_filename): print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( tracker_filename)) sys.exit() - assert iteration > 0 or release, 'error parsing metadata file {}'.format( + else: + # Set iteration to 0 for release checkpoints + iteration = 0 + assert iteration > -1 or release, 'error parsing metadata file {}'.format( tracker_filename) # Get the max iteration retrieved across the ranks. @@ -1828,6 +1831,16 @@ def load_model_state_dict(module, state_dict, strict: bool): is_local_chkpt = (ckpt_type == CheckpointType.LOCAL) ft_integration.on_checkpoint_loaded(is_local_chkpt=is_local_chkpt) + # Patch checkpoint as needed if required field is not found. + if optimizer is not None: + log_printed = False + for param_group in optimizer.param_groups: + if 'default_config' not in param_group: + param_group['default_config'] = True + if not log_printed: + print_rank_0(">>> Inserting 'default_config' field into optimizer.param_groups...") + log_printed = True + return iteration, num_floating_point_operations_so_far diff --git a/megatron/training/datasets/README.md b/megatron/training/datasets/README.md new file mode 100644 index 00000000000..d5543c3d1b5 --- /dev/null +++ b/megatron/training/datasets/README.md @@ -0,0 +1,34 @@ +# Data Pipeline + +## FIM dataset + +`GPTFIMDataset` extends Megatron-Core’s `GPTDataset` to support **Fill-in-the-Middle (FIM)** data augmentation. +It probabilistically converts samples into FIM format using configurable rates, with support for both PSM and SPM patterns, fragment-level splitting, and length-preserving output. + +`GPTFIMDatasetConfig` provides the configuration needed to enable this behavior. +`GPTFIMDatasetConfig` configuration object extending `GPTDatasetConfig` to enable FIM preprocessing. + +**Attributes** + +- `rate`: Probability of converting a sample into a FIM example. A value of `1.0` means FIM is always applied. a value of `0.0` means FIM is never applied. +- `spm_rate`: Probability of using the SPM FIM pattern (vs PSM). The remaining probability (`1 - spm_rate`) selects the PSM (prefix-suffix-middle) pattern instead. For example, if `spm_rate = 0.3`: 30% SPM, 70% PSM. +- `extra_tokens`: Dictionary containing the FIM special tokens: {"prefix", "middle", "suffix", "pad", "eod"}. +- `split_sample`: Optional token around which samples are split before applying FIM. If provided, the input sequence is divided at every occurrence of this token, and FIM is applied independently to each fragment. `A B C D E F G H` -> `FIM(Fragment 1) FIM(Fragment 2) FIM(Fragment 3)`. +- `fragment_rate`: Probability of applying FIM to each fragment when split_sample is used. +- `no_prefix`: If the decoded sequence starts with this prefix, FIM is skipped. +`GPTFIMDataset` dataset class that loads token sequences from an `IndexedDataset` and applies FIM transformations before returning each sample. + +**PSM Format** +``` +[prefix_tok] prefix [suffix_tok] suffix [middle_tok] middle +``` + +**SPM Format** +``` +[prefix_tok, suffix_tok] suffix [middle_tok] prefix middle +``` + +**Special cases:** + +- If the sequence starts with no_prefix, FIM is skipped. +- If FIM is not applied, the sample is returned unchanged. \ No newline at end of file diff --git a/megatron/legacy/data/data_samplers.py b/megatron/training/datasets/data_samplers.py similarity index 56% rename from megatron/legacy/data/data_samplers.py rename to megatron/training/datasets/data_samplers.py index 1bf1bf5ee91..1e7f47510d1 100644 --- a/megatron/legacy/data/data_samplers.py +++ b/megatron/training/datasets/data_samplers.py @@ -4,13 +4,17 @@ import random -import torch + import numpy as np +import torch from torch.utils.data import Dataset -from megatron.training import get_args + from megatron.core import mpu from megatron.core.datasets.utils import Split +from megatron.training import get_args +from megatron.training.dist_signal_handler import DistributedSignalHandler + def build_pretraining_data_loader(dataset, consumed_samples): """Build dataloader given an input dataset.""" @@ -18,10 +22,10 @@ def build_pretraining_data_loader(dataset, consumed_samples): if dataset is None: return None args = get_args() - - if hasattr(dataset,'split'): + + if hasattr(dataset, 'split'): split = dataset.split - elif hasattr(dataset,'index_split'): + elif hasattr(dataset, 'index_split'): split = dataset.index_split else: split = None @@ -32,7 +36,8 @@ def build_pretraining_data_loader(dataset, consumed_samples): consumed_samples=0, micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size()) + data_parallel_size=mpu.get_data_parallel_world_size(), + ) elif args.dataloader_type == 'single': # Megatron sampler batch_sampler = MegatronPretrainingSampler( @@ -40,7 +45,8 @@ def build_pretraining_data_loader(dataset, consumed_samples): consumed_samples=consumed_samples, micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size()) + data_parallel_size=mpu.get_data_parallel_world_size(), + ) elif args.dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomSampler( dataset, @@ -49,52 +55,82 @@ def build_pretraining_data_loader(dataset, consumed_samples): micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size(), - data_sharding=args.data_sharding) + data_sharding=args.data_sharding, + ) elif args.dataloader_type == "external": # External dataloaders are passed through. User is expected to provide a # torch-compatible dataloader and define samplers, if needed. return dataset else: - raise Exception('{} dataloader type is not supported.'.format( - args.dataloader_type)) + raise Exception('{} dataloader type is not supported.'.format(args.dataloader_type)) + + def worker_init_fn(_): + DistributedSignalHandler(args.exit_signal).__enter__() + maybe_worker_init_fn = ( + worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None + ) # Torch dataloader. - return torch.utils.data.DataLoader(dataset, - batch_sampler=batch_sampler, - num_workers=args.num_workers, - pin_memory=True, - persistent_workers=True if args.num_workers > 0 else False, - ) + return torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + persistent_workers=True if args.num_workers > 0 else False, + worker_init_fn=maybe_worker_init_fn, + ) + class MegatronPretrainingSampler: + """ + Sampler for Megatron pretraining dataloaders that divides data samples across + data parallel workers. Each worker receives a contiguous chunk of data determined by + its rank and the micro batch size. Supports dropping the last incomplete batch if + specified, and keeps track of total and consumed samples. Designed to work with + distributed training using Megatron's data parallelism. + """ - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, drop_last=True): + def __init__( + self, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + drop_last=True, + ): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ - self.micro_batch_size * data_parallel_size + self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size self.drop_last = drop_last # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) + assert self.total_samples > 0, 'no sample to consume: {}'.format(self.total_samples) + assert ( + self.consumed_samples < self.total_samples + ), 'no samples left to consume: {}, {}'.format(self.consumed_samples, self.total_samples) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert ( + self.data_parallel_rank < data_parallel_size + ), 'data_parallel_rank should be smaller than data size: {}, ' '{}'.format( + self.data_parallel_rank, data_parallel_size + ) def __len__(self): return self.total_samples def get_start_end_idx(self): + """ + Calculate the start and end indices for the current data parallel worker's + chunk within a batch. + + Returns: + tuple: (start_idx, end_idx) indicating the slice of the batch for this worker. + """ start_idx = self.data_parallel_rank * self.micro_batch_size end_idx = start_idx + self.micro_batch_size return start_idx, end_idx @@ -116,17 +152,37 @@ def __iter__(self): class RandomSeedDataset(Dataset): + """ + A dataset wrapper that resets the random seed before each sample. - def __init__(self, dataset): - args = get_args() - self.base_seed = args.seed - self.curr_seed = args.seed + This ensures deterministic behavior per sample by setting the RNG state + for torch, numpy, and random before accessing each underlying data sample. + The base seed is retrieved from training arguments, and can be varied per epoch + using the set_epoch method to ensure different shuffling or augmentation each epoch. + + Args: + dataset: The underlying dataset to wrap. + + Methods: + set_epoch(epoch): Change the seed offset so each epoch produces different randomization. + __getitem__(idx): Sets the seed based on the sample index and current epoch. + """ + + def __init__(self, dataset, seed): + self.base_seed = seed + self.curr_seed = seed self.dataset = dataset def __len__(self): return len(self.dataset) def set_epoch(self, epoch): + """ + Change the seed offset so each epoch produces different randomization. + + Args: + epoch: The epoch number to use as the seed offset. + """ self.curr_seed = self.base_seed + epoch def __getitem__(self, idx): @@ -138,9 +194,23 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: + """ + Sampler for Megatron pretraining dataloaders that performs random sampling + across data parallel workers. Supports data sharding to divide the dataset + into buckets and shuffle within each bucket. Designed to work with distributed + training using Megatron's data parallelism. + """ - def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, data_sharding): + def __init__( + self, + dataset, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_sharding, + ): # Keep a copy of input params for later use. self.dataset = dataset self.total_samples = total_samples @@ -149,19 +219,18 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = \ - self.micro_batch_size * data_parallel_size - self.last_batch_size = \ - self.total_samples % self.micro_batch_times_data_parallel_size + self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size + self.last_batch_size = self.total_samples % self.micro_batch_times_data_parallel_size # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, 'no sample to consume: {}'.format(self.total_samples) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert ( + self.data_parallel_rank < data_parallel_size + ), 'data_parallel_rank should be smaller than data size: {}, ' '{}'.format( + self.data_parallel_rank, data_parallel_size + ) def __len__(self): return self.total_samples @@ -177,8 +246,9 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + bucket_size = ( + self.total_samples // self.micro_batch_times_data_parallel_size + ) * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size @@ -187,15 +257,13 @@ def __iter__(self): random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + full_bucket_size = (self.total_samples // self.micro_batch_size) * self.micro_batch_size full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = \ - torch.randperm(full_bucket_size, generator=g).tolist() + idx_range_total = torch.randperm(full_bucket_size, generator=g).tolist() idx_range_active = idx_range_total[full_bucket_offset:] - idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] + idx_range = idx_range_active[self.data_parallel_rank :: self.data_parallel_size] batch = [] # Last batch if not complete will be dropped. diff --git a/megatron/training/datasets/fim_dataset.py b/megatron/training/datasets/fim_dataset.py new file mode 100644 index 00000000000..730b7e033a1 --- /dev/null +++ b/megatron/training/datasets/fim_dataset.py @@ -0,0 +1,308 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from typing import Dict, Tuple, Optional +from dataclasses import dataclass, field + +import numpy as np +import logging +from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.utils import Split + +logger = logging.getLogger(__name__) + + +@dataclass +class GPTFIMDatasetConfig(GPTDatasetConfig): + """Configuration object for Megatron Core GPT FIM datasets""" + + fim_rate: float = None + """Probability to convert a training sample into a FIM format""" + + fim_spm_rate: float = None + """Probability that the a FIM sample uses the SPM format over the PSM format""" + + fim_extra_tokens: Dict = None + """FIM extra tokens. Should consist of prefix, middle, suffix, PAD, and EOD tokens.""" + + fim_split_sample: Optional[str] = None + """String around which to split the sample for FIM""" + + fim_fragment_rate: Optional[float] = None + """Rate of FIM on each fragment when split_sample is not None""" + + fim_no_prefix: Optional[str] = None + """Do not apply FIM to fragments that start with this prefix""" + + +class GPTFIMDataset(GPTDataset): + """The base GPT dataset + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the + MegatronDataset + + indexed_indices (np.ndarray): The set of the documents indices to expose + + num_samples (int): The number of samples to draw from the indexed dataset + + index_split (Split): The indexed_indices Split + + config (GPTFIMDatasetConfig): The GPT-specific container for all config sourced parameters + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: np.ndarray, + num_samples: int, + index_split: Split, + config: GPTFIMDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + self.np_rng = np.random.RandomState(seed=self.config.random_seed) + logger.info(f"Initialized FIM RNG with seed = {self.config.random_seed}") + # get FIM params + self.fim_rate = self.config.fim_rate + self.fim_spm_rate = self.config.fim_spm_rate + self.fragment_fim_rate = self.config.fim_fragment_rate + fim_split_sample = self.config.fim_split_sample + self.no_fim_prefix = self.config.fim_no_prefix + if fim_split_sample: + fim_split_sample_ids = self.config.tokenizer._tokenizer.tokens_to_ids(fim_split_sample) + assert isinstance(fim_split_sample_ids, int) or len(fim_split_sample_ids) == 1 + self.fim_split_sample = ( + fim_split_sample_ids + if isinstance(fim_split_sample_ids, int) + else fim_split_sample_ids[0] + ) + else: + self.fim_split_sample = None + + # get extra tokens ids + fim_tokens = self.config.fim_extra_tokens + fim_tokens = [ + fim_tokens["prefix"], + fim_tokens["middle"], + fim_tokens["suffix"], + fim_tokens["pad"], + fim_tokens["eod"], + ] + fim_tokens_ids = self.config.tokenizer._tokenizer.tokens_to_ids(fim_tokens) + ( + self.prefix_tok_id, + self.middle_tok_id, + self.suffix_tok_id, + self.pad_tok_id, + self.eod_tok_id, + ) = fim_tokens_ids + + def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: + """Get the text (token ids) and document ids for a given index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[np.ndarray, np.ndarray]: The text ids and document ids + """ + # Do the shuffle mapping + idx = self.shuffle_index[idx] + + # Get the beginning and end documents and offsets + doc_index_beg, doc_index_beg_offset = self.sample_index[idx] + doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] + + document_ids = [] + sample_parts = [] + + # Sample spans a single document + if doc_index_beg == doc_index_end: + # Add the document id + document_ids.append(self.document_index[doc_index_beg]) + + # Add the entire sample + sample_parts.append( + self.dataset.get( + self.document_index[doc_index_beg], + offset=doc_index_beg_offset, + length=doc_index_end_offset - doc_index_beg_offset + 1, + ) + ) + + # Sample spans multiple documents + else: + for i in range(doc_index_beg, doc_index_end + 1): + # Add the document id + document_ids.append(self.document_index[i]) + + # Add the sample part + offset = 0 if i > doc_index_beg else doc_index_beg_offset + length = None if i < doc_index_end else doc_index_end_offset + 1 + sample_parts.append( + self.dataset.get(self.document_index[i], offset=offset, length=length) + ) + + sample = np.concatenate(sample_parts) + + sample_len = sample.shape[0] + segment_breaks = np.argwhere(sample == self.eod_tok_id) + + if segment_breaks.shape != (0, 1): # then there is an EOD token in this example + curr_start_position = 0 + new_samples = [] + for loc in np.nditer(segment_breaks): + # Only permute non-empty segments. + if loc - curr_start_position > 0: + # permute {prefix, suffix, middle} or {suffix, prefix, middle} + permuted = self._fim_split_and_permute_sequence(sample[curr_start_position:loc]) + new_samples += [permuted, [self.eod_tok_id]] + + curr_start_position = loc + 1 # jump over the EOD token + # Permute the segment after the last EOD + permuted = self._fim_split_and_permute_sequence(sample[curr_start_position:]) + new_samples.append(permuted) + + sample = np.concatenate(new_samples) + else: + sample = self._fim_split_and_permute_sequence(sample) + + diff = sample.shape[0] - sample_len + if diff > 0: # too long + sample = sample[:sample_len] + elif diff < 0: # too short + sample = np.concatenate([sample, np.full((-1 * diff), self.pad_tok_id)]) + + assert sample.shape[0] == sample_len + + return (np.array(sample, dtype=np.int64), np.array(document_ids, dtype=np.int64)) + + def _fim_permute_sequence(self, sequence, rate): + return self._permute( + sequence, + rate, + self.fim_spm_rate, + self.config.tokenizer, + truncate_or_pad=False, + suffix_tok_id=self.suffix_tok_id, + prefix_tok_id=self.prefix_tok_id, + middle_tok_id=self.middle_tok_id, + pad_tok_id=self.pad_tok_id, + no_fim_prefix=self.no_fim_prefix, + ) + + def _fim_split_and_permute_sequence(self, sequence): + """ + If self.fim_split_sample is not None, split the sequence. + Then apply FIM on the fragments, or the whole sequence if self.fim_split_sample is None. + """ + if self.fim_split_sample is None: + return self._fim_permute_sequence(sequence, self.fim_rate) + # fim_split_sample is set: split the sample on this token and permute each fragment separately. + # Typically, if each sample is a repository, then we split again on the file level. + # Each fragment is a file, and we permute the files. + fragment_breaks = np.argwhere(sequence == self.fim_split_sample) + if fragment_breaks.shape == (0, 1): + # no split token in this sample + return self._fim_permute_sequence(sequence, self.fim_rate) + if not self.np_rng.binomial(1, self.fim_rate): + # don't do FIM preproc + return sequence + # Do FIM on each fragment + curr_start_position = 0 + new_samples = [] + for loc in np.nditer(fragment_breaks): + if loc - curr_start_position > 0: + permuted = self._fim_permute_sequence( + sequence[curr_start_position:loc], self.fragment_fim_rate + ) + new_samples += [permuted, [self.fim_split_sample]] + curr_start_position = loc + 1 # Jump over the split token + # Permute the segment after the last split token + permuted = self._fim_permute_sequence( + sequence[curr_start_position:], self.fragment_fim_rate + ) + new_samples.append(permuted) + + return np.concatenate(new_samples) + + def _permute( + self, + sample, + fim_rate, + fim_spm_rate, + tokenizer, + truncate_or_pad=True, + suffix_tok_id=None, + prefix_tok_id=None, + middle_tok_id=None, + pad_tok_id=None, + no_fim_prefix=None, + ): + """ + Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. + Maintain the same sample length (if transform creates a few extra tokens, drop them). + """ + if self.np_rng.binomial(1, fim_rate): # sample bernoulli dist + + contents = tokenizer._tokenizer.ids_to_text(sample) + + # Do not apply FIM if the sample starts with no_fim_prefix + if no_fim_prefix is not None and contents.startswith(no_fim_prefix): + return sample + + try: + # A boundary can be =0 (prefix will be empty) + # a boundary can be =len(contents) (suffix will be empty) + # The two boundaries can be equal (middle will be empty) + boundaries = list(self.np_rng.randint(low=0, high=len(contents) + 1, size=2)) + boundaries.sort() + except ValueError as e: + print(len(contents), contents) + print(e) + raise e + + prefix = contents[: boundaries[0]] + middle = contents[boundaries[0] : boundaries[1]] + suffix = contents[boundaries[1] :] + + prefix = np.array([*tokenizer._tokenizer.text_to_ids(prefix)], dtype=np.int64) + middle = np.array([*tokenizer._tokenizer.text_to_ids(middle)], dtype=np.int64) + suffix = np.array([*tokenizer._tokenizer.text_to_ids(suffix)], dtype=np.int64) + + # here we truncate each given segment to fit the same length as it was before + # A consequence is that we never reach the end of a file? + # we should rather truncate at the context-level + if truncate_or_pad: + # need to make same length as the input. Take the 3 sentinel tokens into account + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 + diff = new_length - sample.shape[0] + if diff > 0: # too long + if ( + suffix.shape[0] <= diff + ): # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening + return sample + suffix = suffix[: suffix.shape[0] - diff] + elif diff < 0: # too short + suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) + + if self.np_rng.binomial(1, fim_spm_rate): + # SPM (variant 2 from FIM paper) + new_sample = np.concatenate( + [[prefix_tok_id, suffix_tok_id], suffix, [middle_tok_id], prefix, middle] + ) + else: + # PSM + new_sample = np.concatenate( + [[prefix_tok_id], prefix, [suffix_tok_id], suffix, [middle_tok_id], middle] + ) + + else: + # don't do FIM preproc + new_sample = sample + + return new_sample diff --git a/megatron/training/dist_signal_handler.py b/megatron/training/dist_signal_handler.py index f4b4fbf5c0d..f1f3725c8a9 100644 --- a/megatron/training/dist_signal_handler.py +++ b/megatron/training/dist_signal_handler.py @@ -3,6 +3,12 @@ import torch +SIGNAL_MAP = { + 'SIGTERM': signal.SIGTERM, + 'SIGINT': signal.SIGINT, + 'SIGUSR1': signal.SIGUSR1, + 'SIGUSR2': signal.SIGUSR2 +} def get_world_size(): if torch.distributed.is_available() and torch.distributed.is_initialized(): @@ -49,8 +55,8 @@ def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): class DistributedSignalHandler: - def __init__(self, sig=signal.SIGTERM): - self.sig = sig + def __init__(self, sig: str = 'SIGTERM'): + self.sig = SIGNAL_MAP.get(sig, signal.SIGTERM) def signals_received(self): all_received = all_gather_item( diff --git a/megatron/training/global_vars.py b/megatron/training/global_vars.py index ec402263d29..a718877b40c 100644 --- a/megatron/training/global_vars.py +++ b/megatron/training/global_vars.py @@ -11,7 +11,7 @@ from megatron.core.energy_monitor import EnergyMonitor from megatron.core.jit import disable_jit_fuser from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator, unset_num_microbatches_calculator -from megatron.training import dist_signal_handler +from megatron.training.dist_signal_handler import DistributedSignalHandler from megatron.training.tokenizer import build_tokenizer _GLOBAL_ARGS = None @@ -74,10 +74,11 @@ def get_signal_handler(): return _GLOBAL_SIGNAL_HANDLER -def _set_signal_handler(): +def _set_signal_handler(exit_signal): + global _GLOBAL_SIGNAL_HANDLER _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') - _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__() + _GLOBAL_SIGNAL_HANDLER = DistributedSignalHandler(exit_signal).__enter__() @@ -110,7 +111,7 @@ def set_global_variables(args, build_tokenizer=True): set_experimental_flag(True) if args.exit_signal_handler: - _set_signal_handler() + _set_signal_handler(args.exit_signal) if args.disable_jit_fuser: disable_jit_fuser() diff --git a/megatron/training/training.py b/megatron/training/training.py index 9986f931641..58dcfbde734 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -2,6 +2,7 @@ """Pretrain utilities.""" +import copy import dataclasses from datetime import datetime, timedelta import functools @@ -11,7 +12,7 @@ import math import os import sys -from typing import List, Optional +from typing import Any, Optional import torch.distributed @@ -33,7 +34,7 @@ except ImportError: has_rl_utils = False try: - from megatron.post_training.algos.distillation import ( + from modelopt.torch.distill.plugins.megatron import ( get_tensor_shapes_adjust_fn_for_distillation, ) @@ -75,7 +76,7 @@ from megatron.core.distributed import finalize_model_grads from megatron.core.enums import ModelType -from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig +from megatron.core.optimizer import get_megatron_optimizer, AdamOptimizerConfig, SGDOptimizerConfig, OptimizerConfig, ParamKey from megatron.core.optimizer.muon import get_megatron_muon_optimizer from megatron.core.rerun_state_machine import ( get_rerun_state_machine, @@ -87,7 +88,7 @@ from megatron.training.initialize import write_args_to_tensorboard from megatron.training.initialize import set_jit_fusion_options from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank -from megatron.legacy.data.data_samplers import build_pretraining_data_loader +from megatron.training.datasets.data_samplers import build_pretraining_data_loader from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler from megatron.core.transformer.moe import upcycling_utils from megatron.core.transformer.moe.moe_utils import track_moe_metrics @@ -161,22 +162,32 @@ def num_floating_point_operations(args, batch_size): def calculate_layer_counts(): """Calculate the number of attention, Mamba, and MLP layers.""" if args.hybrid_override_pattern: - counts = {'M': 0, '*': 0, '-': 0} + counts = {'M': 0, '*': 0, '-': 0, 'E':0} for layer_type in args.hybrid_override_pattern: if layer_type in counts: counts[layer_type] += 1 - return counts['*'], counts['M'], counts['-'] + return counts['*'], counts['M'], counts['-'], counts['E'] else: num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio) num_mlp_layers = round(args.num_layers * args.hybrid_mlp_ratio) num_mamba_layers = args.num_layers - num_attn_layers - num_mlp_layers - return num_attn_layers, num_mamba_layers, num_mlp_layers + num_moe_layers = 0 + return num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers def mlp_layer_flops(batch_size, seq_len, hidden_size, expansion=4.0, swiglu=False): """Calculate FLOPs for an MLP layer.""" scale_factor = 3.0 / 2.0 if swiglu else 1.0 return 4 * expansion * scale_factor * batch_size * seq_len * hidden_size**2 + def moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, + shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu=False): + """Calculate FLOPs for an MoE layer.""" + scale_factor = 3.0 / 2.0 if swiglu else 1.0 + routed_flops = (4 * batch_size * seq_len * hidden_size * + moe_ffn_hidden_size * num_experts_routed_to * scale_factor) + shared_flops = 4 * batch_size * seq_len * hidden_size * shared_expert_ffn_hidden_size * scale_factor + return routed_flops + shared_flops + def attn_layer_flops( batch_size, seq_len, hidden_size, num_heads, gqa=True, gqa_groups=8, kv_channels=None ): @@ -215,12 +226,13 @@ def mamba_layer_flops(batch_size, seq_len, hidden_size, state_dim=16, ) def hybrid_flops(batch_size, seq_len, hidden_size, - num_attn_layers, num_mamba_layers, num_mlp_layers, + num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8, mamba_num_heads=128, - num_attn_heads=32,gqa=True, + num_attn_heads=32, gqa=True, gqa_groups=8, kv_channels=None, mlp_expansion=4.0, swiglu=False, + moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1, vocab_size=256000): """Calculate total FLOPs for the hybrid model.""" flops_fwd = ( @@ -231,6 +243,8 @@ def hybrid_flops(batch_size, seq_len, hidden_size, num_mamba_layers * mamba_layer_flops(batch_size, seq_len, hidden_size, mamba_state_dim, mamba_head_dim, mamba_num_groups, mamba_num_heads) + + num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, + shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu) + (2 * batch_size * seq_len * hidden_size * vocab_size) # logits computation ) return flops_fwd * 3 @@ -479,7 +493,7 @@ def transformer_flops(): # Main entrypoint for FLOPs calculation. if args.is_hybrid_model: # Calculate the number of each type of layer. - num_attn_layers, num_mamba_layers, num_mlp_layers = calculate_layer_counts() + num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers = calculate_layer_counts() # Compute hybrid model FLOPs. return hybrid_flops( @@ -489,6 +503,7 @@ def transformer_flops(): num_attn_layers=num_attn_layers, num_mamba_layers=num_mamba_layers, num_mlp_layers=num_mlp_layers, + num_moe_layers=num_moe_layers, mamba_state_dim=args.mamba_state_dim, mamba_head_dim=args.mamba_head_dim, mamba_num_groups=args.mamba_num_groups, @@ -499,6 +514,11 @@ def transformer_flops(): kv_channels=args.kv_channels, mlp_expansion=args.ffn_hidden_size / args.hidden_size, swiglu=args.swiglu, + moe_ffn_hidden_size=(args.moe_ffn_hidden_size if args.moe_ffn_hidden_size is not None + else args.ffn_hidden_size), + shared_expert_ffn_hidden_size=(0 if args.moe_shared_expert_intermediate_size is None + else args.moe_shared_expert_intermediate_size), + num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, ) else: @@ -594,30 +614,6 @@ def reorder_inner_param_groups(optimizer_state_dict): return preprocessed_common_state_dict -def get_no_weight_decay_cond(no_weight_decay_cond_type, default_skip_embedding_weight_decay): - """Get the no weight decay condition function.""" - - # Default case: no_weight_decay_cond_type is None - no_weight_decay_cond_fn = None - - if no_weight_decay_cond_type == 'apply_wd_to_qk_layernorm': - # Qwen3-Next applies weight decay to qk layernorm as a special case - def apply_wd_to_qk_layernorm_fn(name, param): - if "q_layernorm" in name or "k_layernorm" in name: - no_wd = False - else: - no_wd = ( - name.endswith(".bias") - or len(param.shape) == 1 - or (default_skip_embedding_weight_decay and "embedding" in name) - ) - return no_wd - no_weight_decay_cond_fn = apply_wd_to_qk_layernorm_fn - elif no_weight_decay_cond_type is not None: - raise ValueError(f"Invalid no_weight_decay_cond_type: {no_weight_decay_cond_type}") - - return no_weight_decay_cond_fn - def pretrain( train_valid_test_dataset_provider, model_provider, @@ -754,15 +750,8 @@ def pretrain( # Model, optimizer, and learning rate. timers('model-and-optimizer-setup', log_level=0).start(barrier=True) - no_weight_decay_cond = get_no_weight_decay_cond( - args.no_weight_decay_cond_type, - default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, - ) model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, - model_type, - checkpointing_context=checkpointing_context, - no_weight_decay_cond=no_weight_decay_cond, + model_provider, model_type, checkpointing_context=checkpointing_context ) timers('model-and-optimizer-setup').stop() @@ -1178,12 +1167,45 @@ def get_optimizer_param_scheduler(optimizer): return opt_param_scheduler +def get_megatron_optimizer_config(args: Any) -> OptimizerConfig: + """Return a Megatron optimizer config object from Megatron's arguments.""" + + config = None + if args.optimizer == 'adam' or 'muon' in args.optimizer: + # TODO(deyuf): Muon needs both adam + muon but get() only receive one config + # So for now we keep using adam config that's back compat with old way + kwargs = {} + for f in dataclasses.fields(AdamOptimizerConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + config = AdamOptimizerConfig(**kwargs) + elif args.optimizer == 'sgd': + kwargs = {} + for f in dataclasses.fields(SGDOptimizerConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + config = SGDOptimizerConfig(**kwargs) + else: + raise ValueError("Invalid optimizer type!") + + # Construct the appropriate config_overrides object. + # TODO: add more logic here as needed down the road. + if args.decoupled_lr is not None: + decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") + decoupled_optimizer_config = copy.deepcopy(config) + decoupled_optimizer_config.lr = args.decoupled_lr + if args.decoupled_min_lr is not None: + decoupled_optimizer_config.min_lr = args.decoupled_min_lr + config_overrides = {decoupled_param_key: decoupled_optimizer_config} + else: + config_overrides = None + + return config, config_overrides + + def setup_model_and_optimizer( model_provider_func, model_type, - no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0, checkpointing_context=None, ): """Setup model and optimizer.""" @@ -1195,33 +1217,25 @@ def setup_model_and_optimizer( unwrapped_model = unwrap_model(model) one_logger and one_logger.log_metrics({"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()}) - kwargs = {} - for f in dataclasses.fields(OptimizerConfig): - if hasattr(args, f.name): - kwargs[f.name] = getattr(args, f.name) - config = OptimizerConfig(**kwargs) + config, config_overrides = get_megatron_optimizer_config(args) config.timers = timers if 'muon' not in config.optimizer: + # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings + # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 + # default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, optimizer = get_megatron_optimizer( config, model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult, + config_overrides=config_overrides, use_gloo_process_groups=args.enable_gloo_process_groups, - # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings - # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 - default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, dump_param_to_param_group_map=args.dump_param_to_param_group_map, ) else: optimizer = get_megatron_muon_optimizer( config, model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult, + config_overrides=config_overrides, use_gloo_process_groups=args.enable_gloo_process_groups, layer_wise_distributed_optimizer='dist' in config.optimizer, ) @@ -1365,7 +1379,10 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch if has_nvidia_modelopt: # [ModelOpt]: Pipeline-parallel Distillation stacks student and teacher tensors adjust_tensor_shapes_fn = get_tensor_shapes_adjust_fn_for_distillation( - model, args.seq_length, args.micro_batch_size, args.decoder_seq_length + model, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, ) else: adjust_tensor_shapes_fn = None @@ -1494,7 +1511,6 @@ def training_log( loss_dict, total_loss_dict, learning_rate, - decoupled_learning_rate, iteration, loss_scale, report_memory_flag, @@ -1599,8 +1615,6 @@ def training_log( writer.add_scalar('learning-rate vs samples', learning_rate, args.consumed_train_samples) if wandb_writer: wandb_writer.log({'learning-rate': learning_rate}, iteration) - if args.decoupled_lr is not None: - writer.add_scalar('decoupled-learning-rate', decoupled_learning_rate, iteration) if args.skipped_train_samples > 0: writer.add_scalar('skipped-train-samples', args.skipped_train_samples, iteration) if wandb_writer: @@ -1680,6 +1694,12 @@ def training_log( track_names.append("global_load_balancing_loss") if args.moe_z_loss_coeff is not None: track_names.append("z_loss") + + if args.is_hybrid_model: + layers = args.hybrid_override_pattern.count('E') + else: + layers = args.num_layers + track_moe_metrics( loss_scale=moe_loss_scale, iteration=iteration, @@ -1689,7 +1709,7 @@ def training_log( per_layer_logging=args.moe_per_layer_logging, force_initialize=True, track_names=track_names, - num_layers=args.num_layers, + num_layers=layers, moe_layer_freq=args.moe_layer_freq, mtp_num_layers=args.mtp_num_layers, ) @@ -1750,14 +1770,6 @@ def training_log( wandb_writer.log({'power/gpu': power}, iteration) # Decoupled_learning_rate should be not None only on first and last pipeline stage. log_string += f' learning rate: {learning_rate:.6E} |' - if args.decoupled_lr is not None and ( - mpu.is_pipeline_first_stage(ignore_virtual=True) - or mpu.is_pipeline_last_stage(ignore_virtual=True) - ): - assert decoupled_learning_rate is not None - log_string += f' decoupled learning rate: {decoupled_learning_rate:.6E} |' - else: - assert decoupled_learning_rate is None log_string += f' global batch size: {batch_size:5d} |' for key in total_loss_dict: if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: @@ -2523,19 +2535,15 @@ def get_e2e_base_metrics(): if args.log_params_norm: params_norm = calc_params_l2_norm(model) learning_rate = None - decoupled_learning_rate = None for param_group in optimizer.param_groups: if len(param_group['params']) == 0: continue - if param_group['is_decoupled_lr']: - decoupled_learning_rate = param_group['lr'] - else: + if param_group['default_config']: learning_rate = param_group['lr'] report_memory_flag = training_log( loss_dict, total_loss_dict, learning_rate, - decoupled_learning_rate, iteration, loss_scale, report_memory_flag, diff --git a/pretrain_gpt.py b/pretrain_gpt.py index ecb7163ff70..9b13d66c7a7 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -20,6 +20,7 @@ from megatron.training.arguments import core_transformer_config_from_args from megatron.training import get_args, get_timers, get_tokenizer, inprocess_restart, pretrain, print_rank_0 from megatron.training.datasets.sft_dataset import SFTDataset +from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig from megatron.training.utils import ( get_batch_on_this_cp_rank, get_batch_on_this_tp_rank, @@ -185,26 +186,49 @@ def core_gpt_dataset_config_from_args(args): blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] blend, blend_per_split = get_blend_and_blend_per_split(args) - return GPTDatasetConfig( - random_seed=args.seed, - sequence_length=args.seq_length, - blend=blend, - blend_per_split=blend_per_split, - split=args.split, - multiple_validation_sets=args.multiple_validation_sets, - full_validation=args.full_validation, - num_dataset_builder_threads=args.num_dataset_builder_threads, - path_to_cache=args.data_cache_path, - mmap_bin_files=args.mmap_bin_files, - tokenizer=tokenizer, - reset_position_ids=args.reset_position_ids, - reset_attention_mask=args.reset_attention_mask, - eod_mask_loss=args.eod_mask_loss, - create_attention_mask=args.create_attention_mask_in_dataloader, - object_storage_cache_path=args.object_storage_cache_path, - mid_level_dataset_surplus=args.mid_level_dataset_surplus, - allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, - ) + data_args = { + "random_seed": args.seed, + "sequence_length": args.seq_length, + "blend": blend, + "blend_per_split": blend_per_split, + "split": args.split, + "multiple_validation_sets": args.multiple_validation_sets, + "full_validation": args.full_validation, + "num_dataset_builder_threads": args.num_dataset_builder_threads, + "path_to_cache": args.data_cache_path, + "mmap_bin_files": args.mmap_bin_files, + "tokenizer": tokenizer, + "reset_position_ids": args.reset_position_ids, + "reset_attention_mask": args.reset_attention_mask, + "eod_mask_loss": args.eod_mask_loss, + "create_attention_mask": args.create_attention_mask_in_dataloader, + "object_storage_cache_path": args.object_storage_cache_path, + "mid_level_dataset_surplus": args.mid_level_dataset_surplus, + "allow_ambiguous_pad_tokens": args.allow_ambiguous_pad_tokens, + } + + # add FIM args to the config + if args.fim_data: + extra_tokens = { + "prefix": args.fim_prefix_token, + "middle": args.fim_middle_token, + "suffix": args.fim_suffix_token, + "pad": args.fim_pad_token, + "eod": args.fim_eod_token, + } + data_args.update( + { + "fim_rate": args.fim_rate, + "fim_spm_rate": args.fim_spm_rate, + "fim_extra_tokens": extra_tokens, + "fim_split_sample": args.fim_split_sample, + "fim_fragment_rate": args.fim_fragment_rate, + "fim_no_prefix": args.fim_no_prefix, + } + ) + return GPTFIMDatasetConfig(**data_args) + + return GPTDatasetConfig(**data_args) def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None): @@ -222,6 +246,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None else: if args.mock_data: dataset_type = MockGPTDataset + elif args.fim_data: + dataset_type = GPTFIMDataset else: dataset_type = GPTDataset diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..cd90888e65d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.89074, + "2": 10.89234, + "3": 10.89032, + "4": 10.89221, + "5": 10.89416, + "6": 10.90226, + "7": 10.8884, + "8": 10.90211, + "9": 10.90202, + "10": 10.88512, + "11": 10.87636, + "12": 10.89499, + "13": 10.89837, + "14": 10.89182, + "15": 10.85125, + "16": 10.8534, + "17": 10.82862, + "18": 10.83653, + "19": 10.82847, + "20": 10.74583, + "21": 10.73117, + "22": 10.61256, + "23": 10.72616, + "24": 10.62932, + "25": 10.59394, + "26": 10.63357, + "27": 10.63137, + "28": 10.58201, + "29": 10.58671, + "30": 10.40936, + "31": 10.15873, + "32": 10.48319, + "33": 10.46977, + "34": 10.23978, + "35": 10.28144, + "36": 10.23894, + "37": 10.35198, + "38": 10.20565, + "39": 10.40496, + "40": 10.09271, + "41": 10.16148, + "42": 10.2231, + "43": 9.84152, + "44": 9.97329, + "45": 9.84544, + "46": 9.82102, + "47": 10.14261, + "48": 9.86553, + "49": 9.54033, + "50": 9.9169 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1544.0, + "2": 1729.0, + "3": 1672.0, + "4": 1807.0, + "5": 1942.0, + "6": 1736.0, + "7": 1956.0, + "8": 1716.0, + "9": 2011.0, + "10": 1385.0, + "11": 1864.0, + "12": 1767.0, + "13": 2019.0, + "14": 1787.0, + "15": 1828.0, + "16": 1908.0, + "17": 1718.0, + "18": 1602.0, + "19": 1785.0, + "20": 1679.0, + "21": 1917.0, + "22": 1712.0, + "23": 2034.0, + "24": 1752.0, + "25": 1645.0, + "26": 1820.0, + "27": 1915.0, + "28": 1996.0, + "29": 2051.0, + "30": 1890.0, + "31": 1577.0, + "32": 1886.0, + "33": 2116.0, + "34": 1912.0, + "35": 2037.0, + "36": 1924.0, + "37": 2462.0, + "38": 2241.0, + "39": 2321.0, + "40": 2221.0, + "41": 2345.0, + "42": 2386.0, + "43": 2027.0, + "44": 2211.0, + "45": 2096.0, + "46": 2285.0, + "47": 2536.0, + "48": 2289.0, + "49": 2270.0, + "50": 2421.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 581489664.0, + "2": 581489664.0, + "3": 581489664.0, + "4": 581489664.0, + "5": 581489664.0, + "6": 581489664.0, + "7": 581489664.0, + "8": 581489664.0, + "9": 581489664.0, + "10": 581489664.0, + "11": 581489664.0, + "12": 581489664.0, + "13": 581489664.0, + "14": 581489664.0, + "15": 581489664.0, + "16": 581489664.0, + "17": 581489664.0, + "18": 581489664.0, + "19": 581489664.0, + "20": 581489664.0, + "21": 581489664.0, + "22": 581489664.0, + "23": 581489664.0, + "24": 581489664.0, + "25": 581489664.0, + "26": 581489664.0, + "27": 581489664.0, + "28": 581489664.0, + "29": 581489664.0, + "30": 581489664.0, + "31": 581489664.0, + "32": 581489664.0, + "33": 581489664.0, + "34": 581489664.0, + "35": 581489664.0, + "36": 581489664.0, + "37": 581489664.0, + "38": 581489664.0, + "39": 581489664.0, + "40": 581489664.0, + "41": 581489664.0, + "42": 581489664.0, + "43": 581489664.0, + "44": 581489664.0, + "45": 581489664.0, + "46": 581489664.0, + "47": 581489664.0, + "48": 581489664.0, + "49": 581489664.0, + "50": 581489664.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 4605814272.0, + "2": 4702430720.0, + "3": 4702430720.0, + "4": 4702430720.0, + "5": 4702430720.0, + "6": 4702430720.0, + "7": 4702430720.0, + "8": 4702430720.0, + "9": 4702430720.0, + "10": 4702430720.0, + "11": 4702430720.0, + "12": 4702430720.0, + "13": 4702430720.0, + "14": 4702430720.0, + "15": 4702430720.0, + "16": 4702430720.0, + "17": 4702430720.0, + "18": 4702430720.0, + "19": 4702430720.0, + "20": 4702430720.0, + "21": 4702430720.0, + "22": 4702430720.0, + "23": 4702430720.0, + "24": 4702430720.0, + "25": 4702430720.0, + "26": 4702430720.0, + "27": 4702430720.0, + "28": 4702430720.0, + "29": 4702430720.0, + "30": 4702430720.0, + "31": 4702430720.0, + "32": 4702430720.0, + "33": 4702430720.0, + "34": 4702430720.0, + "35": 4702430720.0, + "36": 4702430720.0, + "37": 4702430720.0, + "38": 4702430720.0, + "39": 4702430720.0, + "40": 4702430720.0, + "41": 4702430720.0, + "42": 4702430720.0, + "43": 4702430720.0, + "44": 4702430720.0, + "45": 4702430720.0, + "46": 4702430720.0, + "47": 4702430720.0, + "48": 4702430720.0, + "49": 4702430720.0, + "50": 4702430720.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 6.95394, + "2": 0.0878, + "3": 0.06953, + "4": 0.07916, + "5": 0.06775, + "6": 0.07681, + "7": 0.06695, + "8": 0.0786, + "9": 0.0664, + "10": 0.08059, + "11": 0.06554, + "12": 0.07501, + "13": 0.06663, + "14": 0.06608, + "15": 0.06585, + "16": 0.06738, + "17": 0.067, + "18": 0.06553, + "19": 0.06755, + "20": 0.06723, + "21": 0.06559, + "22": 0.0664, + "23": 0.06722, + "24": 0.06553, + "25": 0.06829, + "26": 0.06873, + "27": 0.06733, + "28": 0.06731, + "29": 0.06824, + "30": 0.06696, + "31": 0.06661, + "32": 0.06587, + "33": 0.06588, + "34": 0.06564, + "35": 0.06761, + "36": 0.06655, + "37": 0.06712, + "38": 0.06601, + "39": 0.06661, + "40": 0.06632, + "41": 0.0691, + "42": 0.06551, + "43": 0.06839, + "44": 0.06528, + "45": 0.06744, + "46": 0.0675, + "47": 0.06698, + "48": 0.0649, + "49": 0.06596, + "50": 0.06581 + } + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml new file mode 100644 index 00000000000..ddc8286573b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml @@ -0,0 +1,56 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 0 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --dist-ckpt-strictness: log_all # backward compatibility for TE changes + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --attention-backend: unfused + --log-memory-to-tensorboard: true + --fim-data: true + --fim-rate: 0.5 + --fim-spm-rate: 0.5 +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json index 12a9b70df83..cbc5f4fa3ae 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json @@ -1,178 +1,187 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.29413437843322754, - "cuda_graph_request_count_map": { - "372": 0, - "360": 0, - "336": 0, - "312": 0, - "288": 0, - "264": 0, - "240": 0, - "216": 0, - "192": 0, - "168": 0, - "144": 0, - "120": 0, - "96": 0, - "72": 0, - "48": 0, - "24": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.331681251525879, - -1.5606917142868042, - -2.454296588897705, - -1.5334703922271729, - -1.2631131410598755, - -2.657367706298828, - -0.6480202078819275, - -0.4550393521785736, - -1.3625166416168213, - -0.8142069578170776, - -0.4496593475341797, - -0.9312890768051147, - -1.732723355293274, - -0.44613128900527954, - -1.6895122528076172, - -0.6082233190536499, - -1.0978344678878784, - -1.1122435331344604, - -0.002520838286727667, - -1.4072327613830566, - -0.007462364621460438, - -0.7548662424087524, - -0.9937503337860107, - -0.0675487294793129, - -0.9595617055892944, - -0.029961343854665756, - -2.205785036087036, - -1.2615025043487549, - -0.7878209352493286 - ] - }, - "throughput": [104.98559493782837, 104.98559493782837] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.2963709831237793, + "cuda_graph_request_count_map": { + "852": 0, + "840": 0, + "784": 0, + "728": 0, + "672": 0, + "616": 0, + "560": 0, + "504": 0, + "448": 0, + "392": 0, + "336": 0, + "280": 0, + "224": 0, + "168": 0, + "112": 0, + "56": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.354729652404785, + -1.474542498588562, + -2.48478364944458, + -1.7641210556030273, + -1.1853944063186646, + -2.8624324798583984, + -0.5740103125572205, + -0.4542185962200165, + -1.4300930500030518, + -0.8807456493377686, + -0.4597663879394531, + -0.9252307415008545, + -1.648141860961914, + -0.44453874230384827, + -1.818476915359497, + -0.5714479088783264, + -1.2115143537521362, + -1.0910619497299194, + -0.0023161747958511114, + -1.3206473588943481, + -0.008621376007795334, + -0.7551823854446411, + -0.9404395818710327, + -0.07279698550701141, + -0.9365248680114746, + -0.03344438225030899, + -1.9720849990844727, + -1.3928067684173584, + -0.7453650832176208 + ] + }, + "throughput": [ + 5.425516447410972, + 95.53889537647129, + 98.64633360458717, + 100.31860128598137, + 100.41338716203114, + 100.2318180695741, + 100.30260782227111, + 100.30996418216475 + ] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml index 0675b047464..15a4a655049 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml @@ -46,8 +46,6 @@ MODEL_ARGS: --return-log-probs: true --num-tokens-to-generate: 30 --enable-cuda-graph: true - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json index 8e07dfee229..c22bb604f94 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json @@ -1,178 +1,187 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.3712351322174072, - "cuda_graph_request_count_map": { - "372": 0, - "360": 0, - "336": 0, - "312": 0, - "288": 0, - "264": 0, - "240": 0, - "216": 0, - "192": 0, - "168": 0, - "144": 0, - "120": 0, - "96": 0, - "72": 0, - "48": 0, - "24": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.331681251525879, - -1.5606917142868042, - -2.454296588897705, - -1.5334703922271729, - -1.2631131410598755, - -2.657367706298828, - -0.6480202078819275, - -0.4550393521785736, - -1.3625166416168213, - -0.8142069578170776, - -0.4496593475341797, - -0.9312890768051147, - -1.732723355293274, - -0.44613128900527954, - -1.6895122528076172, - -0.6082233190536499, - -1.0978344678878784, - -1.1122435331344604, - -0.002520838286727667, - -1.4072327613830566, - -0.007462364621460438, - -0.7548662424087524, - -0.9937503337860107, - -0.0675487294793129, - -0.9595617055892944, - -0.029961343854665756, - -2.205785036087036, - -1.2615025043487549, - -0.7878209352493286 - ] - }, - "throughput": [79.88988160240554, 79.88988160240554] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.38181447982788086, + "cuda_graph_request_count_map": { + "852": 0, + "840": 0, + "784": 0, + "728": 0, + "672": 0, + "616": 0, + "560": 0, + "504": 0, + "448": 0, + "392": 0, + "336": 0, + "280": 0, + "224": 0, + "168": 0, + "112": 0, + "56": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.354729652404785, + -1.474542498588562, + -2.48478364944458, + -1.7641210556030273, + -1.1853944063186646, + -2.8624324798583984, + -0.5740103125572205, + -0.4542185962200165, + -1.4300930500030518, + -0.8807456493377686, + -0.4597663879394531, + -0.9252307415008545, + -1.648141860961914, + -0.44453874230384827, + -1.818476915359497, + -0.5714479088783264, + -1.2115143537521362, + -1.0910619497299194, + -0.0023161747958511114, + -1.3206473588943481, + -0.008621376007795334, + -0.7551823854446411, + -0.9404395818710327, + -0.07279698550701141, + -0.9365248680114746, + -0.03344438225030899, + -1.9720849990844727, + -1.3928067684173584, + -0.7453650832176208 + ] + }, + "throughput": [ + 3.896181563640281, + 77.1287764739343, + 77.17674536709352, + 76.8666671960972, + 77.944911028325, + 77.95118832563914, + 78.13236085816422, + 78.0046829173943 + ] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml index 2ba9050ceaf..b368242b9af 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml @@ -47,8 +47,6 @@ MODEL_ARGS: --num-tokens-to-generate: 30 --enable-cuda-graph: true --decode-only-cuda-graphs: true - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml index a4f47d3705f..7fcf9e9cf81 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -22,7 +22,8 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 - --transformer-impl: transformer_engine + --transformer-impl: inference_optimized + --sequence-parallel: true --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 1 --deterministic-mode: true @@ -41,9 +42,6 @@ MODEL_ARGS: --top_k: 1 --return-log-probs: true --num-tokens-to-generate: 30 - --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..9be8a9dc0ca --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json @@ -0,0 +1,1028 @@ +{ + "throughput": [ + 94.6087716527102, + 115.85992244026639, + 138.9562527069375, + 133.18726531918395, + 81.97861561771212, + 134.30726469422635, + 86.456140428456, + 114.99456351298251, + 147.3101800153954, + 3.0364623744653003, + 124.7590786954667, + 134.2276982994434, + 3.0580463134110167, + 117.03969654341354, + 130.92134521286803, + 48.493091604204935, + 1.4498729599486508, + 128.01470907994928, + 1.8330770354872434, + 66.31842482241125, + 82.24189975425459, + 1.07058112939944, + 1.8815468970982412, + 0.9373246942729808, + 134.9963160815443, + 2.285771114682068, + 43.068220270070434, + 134.9677086822377, + 82.44946740133796, + 47.71839155542011, + 114.4199568886962, + 29.67621576315833, + 144.1589742491705, + 95.8164720809401, + 122.80562228460093, + 39.21436814433054, + 3.041180292262413, + 3.2867844729646842, + 72.43808226229888, + 0.8371525937296347, + 1.2212635079980698, + 145.6869075644325, + 42.317711349146016, + 109.1196064871946, + 73.6281770453198, + 140.4495689387567, + 1.219834296561022, + 138.66856497329005, + 23.33818821323391, + 67.82342558671365, + 130.09683254313987, + 147.60199288178146, + 0.9427431720755464, + 3.2856495013162523, + 79.12426666101076, + 86.41557345094756, + 120.17346279825053, + 137.16615251640926, + 108.93291864542198, + 110.10504114490513, + 46.19253755421628, + 0.950218846923012, + 136.50642826951463, + 142.73168666846448, + 1.2206786818073785, + 1.898581377105612, + 131.72636154091063, + 2.2842414327001976, + 89.76521170090028, + 114.66053545744656, + 58.64474290044525, + 0.8367865961030284, + 128.01767795820945, + 60.87292097103301, + 124.20016865241587, + 119.59336898055426, + 0.9425820346281929, + 93.70053305431952, + 1.0728113870213674, + 135.7596767309971, + 112.89357243644062, + 89.2743296587299, + 137.86411291342458, + 135.6974706051771, + 102.59633828443238, + 129.82058179399326, + 139.57672703148444, + 140.5642311163746, + 78.49182953675201, + 123.40912657074227, + 82.74099904578694, + 75.5490641626476, + 93.38596238341951, + 141.19058076067225, + 1.072254167577298, + 100.8669047802279, + 132.77382347347034, + 92.29086179175866, + 137.20301032384705, + 89.57723938765776, + 67.5465256589703, + 0.9498935124108836, + 1.0716887464650027, + 0.8365472180547067, + 137.902625307774, + 132.67132600219722, + 1.45201860416265, + 1.8366476879619427, + 88.65095604379363, + 132.1806036761347, + 126.0481874394642, + 127.43750324083169, + 93.27238135265156, + 109.83884164204308, + 102.30516355984702, + 141.10387096377744, + 0.9425154448032942, + 95.04281981148903, + 103.11525529548061, + 0.8361762901534399, + 135.3171561172067, + 123.30032998064965, + 118.75691144485415, + 82.21375599642211, + 66.37216333263251, + 120.02349229491865, + 27.339414655466246, + 133.1312422227687, + 123.02377779863252, + 111.0798894329, + 58.88405247768833, + 131.31767475108893, + 40.19076958615912, + 123.58362152151858, + 130.6541142941889, + 61.39555613504246, + 43.92154495664044, + 1.037012527495492, + 127.16052127606021, + 137.06554800183082, + 85.67161160523041, + 1.0253417447981334, + 139.20903624514017, + 140.19068787455728, + 117.67416498245059, + 23.410837515725987, + 130.73052473972666, + 22.561824695346466, + 1.028901717647808, + 119.30712483977753, + 117.77548263464804, + 135.2959098119142, + 142.10193821260228, + 1.0366044325624144, + 1.0350271698893887, + 132.8943567509843, + 51.50353963446039, + 113.39559408843714, + 124.25424103796537, + 129.60407993083075, + 136.8566687186031, + 1.036163010240988, + 1.0345739017743927, + 118.72350056844492, + 32.453707095990595, + 43.851925176925825, + 139.39206855448938, + 141.0979597861742, + 132.81461728578432, + 80.95956255477945, + 133.42483643501154, + 57.27721135575491, + 81.47649794801364, + 79.39765285063396, + 56.40255861789973, + 0.8890603607397893, + 137.59325887086797, + 118.03982850100024, + 53.04390121587005, + 88.31177924841927, + 1.0287550608831881, + 54.67393025836421, + 54.73556135447348, + 129.6143036059356, + 123.57095756116274, + 146.05184555314386, + 55.506024155977386, + 84.40666358740559, + 62.68531518105107, + 147.42894642823578, + 1.0274253590993496, + 145.9063526676371, + 76.36231256557768, + 1.035808949157935, + 136.1858098182613, + 93.13144140533397, + 54.57886608953819, + 1.0251956490815057, + 1.0270063804838983, + 67.96952180390161, + 136.90103479290272, + 78.62986077133174, + 129.97235998681177, + 70.57784076609056, + 1.028567312218149, + 69.64434330087829, + 1.0266016363366386, + 25.142311727265525, + 139.54750333578679, + 118.80547132463877, + 1.0342055876192149, + 132.79991800938092, + 88.25494664060619, + 132.4600307114398, + 1.026200775415348, + 111.33264788932784, + 1.031301270403004, + 104.45912302410692, + 1.0337771723701492, + 124.53550504281608, + 1.0283501183885058, + 126.53361938982871, + 139.83512785200963, + 102.28350299734186, + 122.68389734539087, + 139.27095111763788, + 1.0333552237490158, + 97.04945381465573, + 60.63422077140298, + 1.0248694052483192, + 96.77644543721476, + 118.38370846079931, + 1.0309087229819596, + 136.0487423665781, + 1.032932214377732, + 104.96525711514936, + 50.75370028394122, + 125.67617176346853, + 125.47392048276225, + 101.59371483024698, + 119.1183231384482, + 134.24568445137294, + 1.0323996653747745, + 119.28563313083153, + 50.183581144589674, + 107.50817556608582, + 127.4693561344537, + 116.0234844098742, + 149.0429439759437, + 127.77855747904051, + 1.0319900690130652, + 129.7400124946839, + 60.27584011696136, + 1.0245534026749026, + 113.8687773549026, + 129.9927880985222, + 41.55332067297356, + 12.991853549713621, + 144.9384518471586, + 127.77570879015505, + 79.09214991388126, + 1.0326234729165304, + 144.50618896622706, + 44.461452482592826, + 145.75357879817352, + 150.5618330832813, + 123.17802281879979, + 147.0133924731902, + 57.07203337285457, + 140.17944630269687, + 44.5066568841284, + 150.2834791394652, + 146.37106237628518, + 135.59553639884948, + 21.91845075979551, + 1.0391172002596458, + 92.42182316100705, + 14.98578222593142, + 19.944740287073653, + 32.75622847272977, + 58.94666795839769, + 1.0428676908165904, + 97.94938911630567, + 140.5399781540016, + 36.397689902912774, + 1.0322919875583962, + 33.76444948259586, + 147.54902815924785, + 51.316830076622495, + 153.55703202636914, + 46.423895018386204, + 140.271682540213, + 1.0340651759548871, + 85.22971449383292, + 141.80480996358014, + 1.0234621691055457, + 1.0355322329825165, + 136.96321865236195, + 138.2293990177049, + 136.89440582973347, + 96.94919171687799, + 54.992986423891566, + 142.91167590864902, + 138.73615931624403, + 86.32837448704223, + 1.0424247604140402, + 127.58052889290863, + 138.2472241943501, + 1.0338260095695477, + 1.0317372756221133, + 150.59249576769173, + 1.0229533138894364, + 149.1711141084735, + 1.0419379125129562, + 1.040305113121658, + 150.13261057757276, + 62.47975017460808, + 70.20443057037575, + 76.88821624674898, + 1.0225242667788867, + 136.83301633777177, + 1.0414381555227956, + 131.6044067829552, + 1.038902005769604, + 1.0335832618537684, + 83.38230404797935, + 3.047737981863063, + 140.9843162162637, + 1.0352264324041114, + 1.0409374510445146, + 103.17228299164871, + 1.0383219913492376, + 67.5151836065632, + 126.94018489907108, + 95.29974174831813, + 1.022161551972834, + 1.0348032799350415, + 93.24855217625235, + 140.00831851627856, + 142.46553219867087, + 80.52507876480331, + 149.47939431741142, + 125.60095189608528, + 92.57991472689042, + 153.09192667088175, + 98.78787611117323, + 136.9802701171813, + 1.0378200246498124, + 79.05370338483348, + 145.63143231877774, + 107.86253722014555, + 113.1390555766259, + 150.4596904971142, + 6.010262757833046, + 138.11675690694213, + 1.0371929842524894, + 55.1702723554103, + 148.4142582794926, + 108.62464742566522, + 142.2515578682958, + 149.5588988951372, + 1.0310870179234204, + 32.798276334675066, + 145.8363475163408, + 82.52497836005318, + 144.77105210255448, + 140.95035733017403, + 145.4844811663436, + 145.0646083055648, + 139.1641494303434, + 1.0401220454548914, + 146.10598185112948, + 1.0335329080843159, + 1.0316085392161136, + 133.98012837767038, + 129.62059667226987, + 151.2681266565858, + 1.030719335336581, + 135.9600336007384, + 1.0366589924031362, + 107.70864165999221, + 118.06361914834272, + 148.4615541738592, + 135.1206190516379, + 1.0788915925864082, + 1.0662361391973343, + 1.0784094142292293, + 145.5492563111853, + 100.1745158858024, + 89.97448812790176, + 140.13008352060388, + 8.378443606045758, + 19.841723966559687, + 31.11972559764219, + 127.75589035167928, + 144.649118240912, + 83.40454687650907, + 13.609558087727212, + 144.14916775068022, + 143.0831699051951, + 144.53789580070173, + 129.35689525213576, + 126.54760361436873, + 136.72725454688293, + 83.66753329456253, + 35.238850690537326, + 138.73588075606074, + 148.39285997484404, + 141.43706957675556, + 35.20788617289704, + 140.22918428708584, + 141.42288954532623, + 80.8071906111917, + 53.480908541665116, + 96.60869116876205, + 138.83030943256392, + 146.89537016655746, + 1.0659353965573166, + 138.66041009897964, + 138.0783824554628, + 54.95061283513892, + 1.0688789370964418, + 145.4981195236156, + 107.91672388693667, + 147.39387423946786, + 143.49840246862203, + 1.0781871694837721, + 125.37215873599833, + 46.390553110182545, + 1.0683430650310588, + 60.55314896188811, + 128.32962060837178, + 142.6648214311374, + 1.065532502621677, + 145.06202945295232, + 149.5985088362253, + 43.61426254132819, + 139.2120402464869, + 138.80120892663803, + 142.59390751862693, + 147.27000174003754, + 139.5980537408405, + 142.37081759892675, + 76.47257166426981, + 0.8663971721944621, + 1.067847671923619, + 1.0752972325757186, + 139.11225337731244, + 154.1012640338781, + 91.85315813315137, + 7.34066705730821, + 1.0763437477764217, + 56.03391448680589, + 1.067309924884827, + 1.0747789028833068, + 1.057667310022394, + 146.4284745539176, + 142.32867288307636, + 132.81801172672715, + 142.5746724111237, + 43.178263922620026, + 140.19958418325498, + 1.0742201855279276, + 139.95237701874325, + 124.69044225989671, + 89.93275546978569, + 1.0778110524743836, + 108.03753008375865, + 0.8649825661375887, + 101.22782607000799, + 138.6615942910557, + 1.0572642952018412, + 143.509260845593, + 1.0651693329533294, + 97.454990956795, + 1.075960473594851, + 104.89429761368234, + 153.46849816095335, + 143.28204379991922, + 112.57923589922926, + 145.35468060283986, + 119.53338040876814, + 132.53105489182144, + 146.60735281445733, + 0.8648000721123511, + 132.61504628627392, + 140.81953388748138, + 1.05684091289561, + 147.29646966899597, + 1.0646855258714663, + 1.0772400203863821, + 137.87592499226204, + 101.79954304062817, + 134.45893707567646, + 1.0737967838723397, + 147.3289039421509, + 142.95955673278567, + 123.11846557585149, + 139.7223884224781, + 5.274894457437767, + 0.8646226703470901, + 135.27010135142623, + 134.53222451904563, + 140.4520894166607, + 148.6784682726068, + 148.83999547746723, + 144.76059628877204, + 146.09818079047014, + 0.8644123666240657, + 133.05795012757028, + 141.21253159110282, + 147.08086640702987, + 153.13511211461227, + 147.72437078211334, + 53.87242850230838, + 61.34701685378028, + 74.50771860339175, + 16.40780504974564, + 16.448796993269678, + 144.08505364828036, + 143.78069847853888, + 145.08382905436133, + 139.4144567792124, + 1.113422304912727, + 23.732299099149245, + 146.716938504402, + 1.1150428401994323, + 1.1070863332993708, + 147.462815334713, + 15.300506166735937, + 142.89311901203018, + 35.881455163220174, + 0.8959120615185874, + 134.50389621984408, + 79.91603718165896, + 145.31776951960734, + 153.19384567886857, + 142.494036234602, + 130.58249312188119, + 1.1128817603274543, + 56.157995916719756, + 35.81413980204931, + 116.5213087641768, + 63.30354399512571, + 55.0117106848875, + 47.52954249314361, + 153.04709230401787, + 1.112276523473745, + 80.1523559974256, + 136.20373724941714, + 1.114673225365626, + 1.1067132158651183, + 149.29883052073288, + 145.10950784560325, + 130.53765167080937, + 1.111788125890117, + 0.8957719496064405, + 1.1050775451489783, + 17.522300994030367, + 154.45472111064055, + 152.07616582090188, + 1.1020107149905272, + 138.6808068419634, + 76.87873177159636, + 51.43702839643221, + 138.95045176064437, + 138.64177504011988, + 140.72197385602811, + 132.80947742972836, + 149.78872816785005, + 139.94034036065392, + 154.2632802491591, + 55.57148538150843, + 1.1044580058296936, + 147.1712801496827, + 77.84198065949245, + 142.38330204183904, + 151.76812011990265, + 145.19131540821485, + 147.26566215388425, + 87.12413393605841, + 1.1038403429439656, + 141.4935550752979, + 145.7397470598185, + 3.3080164659931235, + 123.0327553358976, + 146.24080278853327, + 148.10448175245884, + 29.234562433775857, + 151.30177873039895, + 135.4653748135468, + 144.3293913931314, + 148.16163203136404, + 1.1015876034201657, + 1.1114790318458536, + 136.68047783885697, + 77.72584511329579, + 125.73692105352463, + 106.98755729483561, + 96.25926845246491, + 1.109721323323522, + 141.71073652156545, + 130.22006710827588, + 145.24478945746003, + 80.67459353439743, + 1.1033551544760267, + 150.03177939272493, + 154.12875534463626, + 150.04771421074818, + 1.1010813815407388, + 1.1110434127990452, + 145.385699877379, + 86.86487551811825, + 130.16687493633253, + 143.8726181331947, + 111.91340621077623, + 146.0394914387852, + 1.1006353022455784, + 134.47903589563677, + 148.6907436994389, + 102.87151097507036, + 137.41724911494663, + 1.1146766644704549, + 143.85952373403495, + 146.92280951248307, + 1.100156488603178, + 144.04783334738536, + 148.53630346113712, + 58.74848466983248, + 147.0485685726298, + 141.32891699761203, + 142.8441702922343, + 131.04366253726744, + 128.6305301075303, + 1.1106412111686195, + 147.90025888582002, + 0.8959265584913588, + 149.5194069726666, + 137.43649451567626, + 1.1068068376551545, + 68.05269425995475, + 138.94056631255367, + 138.43818227469507, + 69.60391199895408, + 114.83395091462887, + 151.34107787433956, + 141.57237630997332, + 146.07433910500515, + 9.941778754980154, + 131.297822968639, + 10.386636719874664, + 10.545636067043365, + 114.58677137445733, + 75.28902943071078, + 90.63452059810655, + 143.58694736923238, + 9.901118804514459, + 144.5206530902411, + 144.78737732574044, + 79.81136215142409, + 84.9314508821071, + 120.18939827456474, + 10.225253542151219, + 9.702822548173124, + 103.1188517219872, + 138.5008491242522, + 92.02238700298246, + 151.99592340131602, + 9.807595290716304, + 150.0447954775559, + 134.2614008494909, + 149.38544573345007, + 149.62298116309924, + 124.32358754465251, + 132.817456221544, + 10.50607995390264, + 9.78317681034783, + 151.07916494121415, + 146.93545537009487, + 118.45851163082196, + 145.03008316360754, + 154.4449202186591, + 146.86002069809945, + 150.6932855951215, + 110.74803327496042, + 127.40788523389726, + 150.81323854197058, + 150.0047673310006, + 149.6063654551971, + 133.87244996538675, + 10.329695475492791, + 9.414695716712222, + 106.77032789813472, + 118.34636653947105, + 123.44441062862572, + 144.9015592115516, + 153.74652990582067, + 10.065713405335144, + 129.38998560194165, + 117.69087049838025, + 99.15650839997046, + 127.90462338199198, + 147.3574863739125, + 9.696544883885949, + 9.8853852911422, + 128.35872796896587, + 145.2939860705264, + 128.72081963712404, + 94.09935653689803, + 142.8780531031409, + 130.5213122981276, + 126.89288883528536, + 153.36107852781166, + 149.17239657923582, + 9.177632630803961, + 9.387171298727486, + 109.68196882316985, + 148.55536204011432, + 152.61730207818772, + 9.648922236946333, + 132.805446535875, + 138.74295200738652, + 141.66118217831166, + 124.0399127789103, + 113.05005278683446, + 149.71230902297984, + 25.727698431920004, + 129.56419655827216, + 130.40687823665095, + 128.46470366050013, + 150.46298369674685, + 9.22073843893938, + 110.36443029340542, + 148.23878821929193, + 10.219508495480236, + 9.615051521185155, + 9.8723813087942, + 149.91378148843256, + 9.149056684599877, + 130.37704092008303, + 114.86611671621016, + 134.53633480709703, + 131.11593468604048, + 149.74665952988033, + 136.60701891253495, + 146.50864617645632, + 9.094221140419737, + 149.69902295915708, + 126.93245475406366, + 141.2463933703881, + 10.18172163650932, + 136.76582155059438, + 155.5823388453975, + 144.68082947663285, + 142.0128061769988, + 116.20800508912414, + 101.13756407758095, + 10.050927550768915, + 10.14139856150474, + 9.573219645146107, + 146.33874064646594, + 137.22302119976462, + 132.14965518046, + 148.08190796641483, + 117.6843964457568, + 153.04352772565807, + 146.79238076404926, + 9.522740968586977, + 145.93484469600287, + 13.925952420322696, + 12.697420287309185, + 146.39122941822845, + 113.94298610788566, + 13.844109957456581, + 154.57922917096633, + 13.525210269101805, + 103.83976095796662, + 97.75660804271413, + 135.83818209343426, + 158.60060111529293, + 111.57793188874757, + 13.768524263105455, + 154.2203592546867, + 108.85242762118563, + 111.15752259030245, + 149.5942138872604, + 119.77102605185765, + 120.68065341205389, + 105.29698904913548, + 151.41465167808087, + 138.90606724001483, + 13.437371194424983, + 119.97194649055415, + 144.6223725248399, + 146.9934910169238, + 149.45319992777343, + 121.48260402443249, + 13.662736071688842, + 14.448955892498802, + 144.5545360346381, + 154.00382983055897, + 151.8635735223181, + 137.2321484611102, + 119.71487519948164, + 88.24978714231261, + 147.74815341218743, + 142.1113258863455, + 132.08775922189477, + 124.63351274554526, + 145.72256212355262, + 100.50708502243579, + 139.16363846809003, + 114.82662827063822, + 154.78307253831395, + 149.22879563842886, + 152.6744734255461, + 145.81022434241217, + 152.68018782123758, + 116.75549006136289, + 12.968595875688791, + 6.824624970615158, + 125.05116103474757, + 147.66072487793718, + 147.5735120742967, + 139.1302141298083, + 146.48542990069834, + 12.674865288395944, + 147.88858853602966, + 6.8124480142416175, + 137.54766974463703, + 130.89979405333307, + 13.364169845161861, + 14.116086127002273, + 130.3002929300388, + 116.98398239487472, + 152.70827610346095, + 98.51470626500011, + 135.1252373635164, + 14.405992358855888, + 154.13709739001223, + 146.28661687368685, + 137.87827066214206, + 12.621081453489012, + 154.04574874294514, + 6.802625211185703, + 152.18661864386252, + 149.30257880598677, + 13.244501725269068, + 138.34068638798834, + 150.95140747506372, + 141.8441899037163, + 152.99022366652198, + 103.95004802425926, + 140.28144756248412, + 154.51222806007945, + 85.40777548962518, + 154.7067128296305, + 120.47843952303268, + 12.568053995018431, + 12.916583075889136, + 105.92477484543576, + 137.92878859711615, + 135.13853669037294, + 137.88549737290148, + 157.83019925734393, + 145.48927689323145, + 12.509532718065461, + 150.6233829715981, + 119.23669844460764, + 138.49099023171033, + 154.0870149904812, + 140.1862744667834, + 148.860174031694, + 147.54629689336036, + 12.448861769003683, + 152.4711466483636, + 102.47079224461186, + 152.40864885890767, + 156.21773232766026, + 13.139291580904986, + 150.30653960489693, + 145.43571147072188, + 132.8965387342577, + 144.85972103961666, + 125.5438694385711, + 158.07457773478276, + 14.359506122440205, + 137.7658155977229, + 153.68125116011197, + 156.57780724945528, + 12.394708947912125, + 12.874702780202174, + 110.61518572692995, + 149.4338565730422, + 149.67552030435513, + 146.20909415912828, + 9.308833539527914, + 26.176147260970783, + 8.701217384742513, + 66.92241449340185, + 105.12940849136734, + 145.25326276553395, + 139.68219350261262, + 131.60335890332783, + 150.53420884400245, + 17.552483447968918, + 99.60476667168517, + 9.003208512207522, + 8.539560747895454, + 9.946172723540226, + 150.55644446784382, + 9.608936841972842, + 104.80864366760326, + 25.95068644438624, + 99.42592550150236, + 108.35979254469888, + 113.9171427720856, + 9.905905876631499, + 131.1684982861573, + 154.7989292174601, + 151.34753888952145, + 150.11816141981262, + 143.00557828542912, + 126.2310299151925, + 113.53830001728545, + 148.13405630794878, + 150.7564429392251, + 155.252325076404, + 18.20048176554747, + 25.725436761645142, + 8.678711562613207, + 143.3683328827327, + 127.0294451168928, + 137.50119476282134, + 10.068367539846923, + 155.64822784014916, + 153.2789382926615, + 25.46950813818654, + 142.9138107220956, + 155.10510899417167, + 107.40557834412083, + 9.871948602847068, + 144.4712732194919, + 140.17802930301565, + 9.286026243902361, + 129.1488895575147, + 124.35586045151207, + 140.1410811550992, + 96.63692877337894, + 153.62093095799207, + 156.05800033315097, + 9.587609950939838, + 140.09721428165886, + 134.898750425008, + 8.652809034763463, + 8.989448046931262, + 107.64260577858933, + 9.825071080298192, + 150.6237132142087, + 143.76058852986372, + 154.01627264735168, + 140.85322298632985, + 143.63714834446708, + 149.7259575806535, + 8.53942846683121, + 157.02635815805976, + 150.83913162907433, + 154.0283691261865, + 9.246842209481716, + 154.5851361854829, + 133.4662155767381, + 137.55396410787307, + 105.77910782321499, + 148.97953057255376, + 111.3041581371634, + 9.543858351726714, + 142.71996301994741, + 144.2417836324451, + 148.5293262803374, + 8.95331376662564, + 105.2724164655814, + 149.16646109060707, + 151.1947852118465, + 9.503293907683512, + 133.40055362812345, + 8.776394391795916, + 148.3675722527084, + 154.66946641450528, + 122.71674068416665, + 149.62192317697068, + 153.40159484208397, + 9.46860898864519, + 146.10526710538994, + 143.96020057925128, + 8.62472208077336, + 8.906885562515198, + 105.7754218686014, + 150.17957794387223, + 144.0451331512576, + 149.95461039551162, + 151.46311089131117, + 142.22104279807664, + 147.3679944003333, + 140.5394711174869, + 123.62157744638432, + 152.32796921399395, + 156.6603241829257, + 9.43621164630811, + 158.2241383954169, + 149.33346139426692, + 144.12074054746773, + 143.1977521817863, + 8.536662624511228, + 9.785635570067782, + 147.61880087321424, + 9.402323265876474, + 159.1161790596516, + 146.56796834276156, + 147.64890403285438, + 157.70847517328534, + 114.64282143770687, + 148.5000942425868, + 10.052761003641129, + 147.38801074409378 + ] +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml new file mode 100644 index 00000000000..2d65c154a0e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml @@ -0,0 +1,59 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --seed: 42 + --return-log-probs: true + --num-tokens-from-file: true + --inference-dynamic-batching-buffer-size-gb: 20 + --cuda-graph-impl: local + --cuda-graph-scope: full_iteration + --disable-chunked-prefill: true + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${TENSORBOARD_PATH} + --output-every-n-results: 32 + --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl + --prompt-file-num-truncate: 1024 + --incoming-requests-per-step: 128 + --use-flashinfer-fused-rope: true + --throughput-check-only: true +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..07adf271434 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -0,0 +1,158 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 2.020272731781006, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml new file mode 100644 index 00000000000..96d3fd0fc0c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: inference_optimized + --sequence-parallel: true + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 8 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --num-tokens-to-generate: 30 + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${TENSORBOARD_PATH} + --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." + --incoming-requests-per-step: 32 + --use-flashinfer-fused-rope: true + +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..55d6955055a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -0,0 +1,158 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 44.73653959017247, + "logprobs": [ + -9.358970642089844, + -2.7523813247680664, + -4.628502368927002, + -1.4058877229690552, + -0.6050865054130554, + -1.7354254722595215, + -2.4828507900238037, + -2.0520384311676025, + -2.4089853763580322, + -6.2649126052856445, + -1.5644135475158691, + -3.4096615314483643, + -4.358163833618164, + -3.866471767425537, + -2.0575876235961914, + -1.904883623123169, + -3.7622976303100586, + -6.835415363311768, + -0.2829523980617523, + -0.9827429056167603, + -6.655940055847168, + -7.188957214355469, + -12.757233619689941, + -2.1933951377868652, + -3.808887481689453, + -0.515199601650238, + -4.323916912078857, + -0.067625492811203, + -0.09976530075073242, + -3.228640556335449, + -10.129311561584473, + -1.1787357330322266, + -5.97692346572876, + -5.036575794219971, + -3.8267176151275635, + -2.6010468006134033, + -3.366438865661621, + -5.553505897521973, + -1.6046268939971924, + -5.442874908447266, + -12.218503952026367, + -12.597894668579102, + -0.0976092740893364, + -2.530579090118408, + -1.4139617681503296, + -2.8606526851654053, + -1.1690009832382202, + -0.0066696410067379475, + -3.361189365386963, + -13.191482543945312, + -4.413737773895264, + -2.639688491821289, + -6.0114641189575195, + -0.7672993540763855, + -0.047326065599918365, + -1.550362467765808, + -1.137772798538208, + -5.627618789672852, + -0.40103790163993835, + -4.908735275268555, + -0.5704602599143982, + -0.6625558733940125, + -2.364135503768921, + -13.609526634216309, + -0.08865148574113846, + -3.5251970291137695, + -1.3791766166687012, + -6.395696640014648, + -0.588782787322998, + -3.566770076751709, + -0.8742034435272217, + -1.5827170610427856, + -5.3912353515625, + -17.150842666625977, + -6.6234588623046875, + -0.885993242263794, + -4.162992477416992, + -1.1942744255065918, + -2.281689405441284, + -1.7708709239959717, + -0.22030864655971527, + -9.292593955993652, + -0.1258234828710556, + -7.346449851989746, + -2.5470826625823975, + -4.115433692932129, + -3.5646262168884277, + -1.9410749673843384, + -2.3247878551483154, + -1.523364543914795, + -2.360647678375244, + -1.708706021308899, + -1.131014108657837, + -2.944424867630005, + -0.5273782014846802, + -0.44912564754486084, + -1.753378987312317, + -0.8341047167778015, + -0.4124295711517334, + -0.9006240367889404, + -1.4890273809432983, + -0.4379286766052246, + -1.6497018337249756, + -0.5444425344467163, + -1.2305881977081299, + -1.164027214050293, + -0.002498721005395055, + -1.165798544883728, + -0.007112303748726845, + -0.718407154083252, + -0.7442683577537537, + -0.04299728572368622, + -0.8688321113586426, + -0.021008115261793137, + -2.033963680267334, + -1.2936673164367676, + -0.78721684217453 + ] + } +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml new file mode 100644 index 00000000000..306c12bd653 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: inference_optimized + --sequence-parallel: true + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --num-tokens-to-generate: 30 + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${TENSORBOARD_PATH} + --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." + --incoming-requests-per-step: 32 + --use-flashinfer-fused-rope: true + +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json index 6ef98105cbd..f32580e937f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json @@ -157,5 +157,5 @@ -0.0585334412753582 ] }, - "throughput": [13.93210545115292, 13.93210545115292] -} \ No newline at end of file + "throughput": [12.319796866345767, 12.319796866345767] +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml index 59186f8d532..e6b659cf46f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml @@ -41,10 +41,7 @@ MODEL_ARGS: --top_k: 1 --return-log-probs: true --num-tokens-to-generate: 30 - --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 - --inference-dynamic-batching-buffer-size-gb: 20 + --inference-dynamic-batching-buffer-size-gb: 10 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors --output-path: ${TENSORBOARD_PATH} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json index 07adf271434..4ebaf72f5e7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -1,158 +1,158 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", - "generated_tokens": [ - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710, - 1402, - 14019, - 1044, - 1321, - 1402, - 14019, - 1294, - 1278, - 2725, - 15568, - 3039, - 1046, - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710 - ], - "latency": 2.020272731781006, - "logprobs": [ - -9.358587265014648, - -2.7594826221466064, - -4.608366012573242, - -1.4093360900878906, - -0.6152952313423157, - -1.7217562198638916, - -2.496668815612793, - -2.0547454357147217, - -2.441960573196411, - -6.280838966369629, - -1.5643692016601562, - -3.462346076965332, - -4.428728103637695, - -3.8633861541748047, - -1.9936373233795166, - -1.8929449319839478, - -3.796365737915039, - -6.8360137939453125, - -0.2901247441768646, - -0.9246833324432373, - -6.633338928222656, - -7.166708469390869, - -12.771251678466797, - -2.198296308517456, - -3.7778120040893555, - -0.4983733296394348, - -4.381269454956055, - -0.0666784718632698, - -0.09580295532941818, - -3.2437636852264404, - -10.079947471618652, - -1.172220230102539, - -5.977442741394043, - -5.046236038208008, - -3.855658531188965, - -2.5585858821868896, - -3.356245994567871, - -5.557229518890381, - -1.6787731647491455, - -5.483290672302246, - -12.218501091003418, - -12.61402702331543, - -0.09662941098213196, - -2.5431432723999023, - -1.4071024656295776, - -2.9154715538024902, - -1.1964417695999146, - -0.006458481773734093, - -3.3625335693359375, - -13.262511253356934, - -4.314079761505127, - -2.617699146270752, - -5.987792015075684, - -0.778266429901123, - -0.048888545483350754, - -1.548882007598877, - -1.1381981372833252, - -5.627166748046875, - -0.4078553318977356, - -4.958505630493164, - -0.6187160611152649, - -0.7174848914146423, - -2.469533920288086, - -13.620073318481445, - -0.09088654816150665, - -3.526974678039551, - -1.4195809364318848, - -6.402483940124512, - -0.5898402333259583, - -3.565917491912842, - -0.8561318516731262, - -1.6140165328979492, - -5.370549201965332, - -17.159223556518555, - -6.583524703979492, - -0.8855001926422119, - -4.19431209564209, - -1.2012220621109009, - -2.2563133239746094, - -1.7674944400787354, - -0.22064533829689026, - -9.292220115661621, - -0.12445646524429321, - -7.29617977142334, - -2.526529312133789, - -4.071560859680176, - -3.5568013191223145, - -1.926215410232544, - -2.349026918411255, - -2.2132363319396973, - -0.3125414550304413, - -1.4718132019042969, - -2.149106740951538, - -1.0855519771575928, - -1.631832242012024, - -1.3751734495162964, - -1.9396103620529175, - -1.5293723344802856, - -0.8444125056266785, - -1.2414811849594116, - -1.9522171020507812, - -2.4338042736053467, - -1.5651824474334717, - -0.9498789310455322, - -1.8044980764389038, - -2.356677770614624, - -1.247452974319458, - -1.550165057182312, - -0.5635553598403931, - -0.6177330017089844, - -0.4778785705566406, - -0.020452087745070457, - -0.48500269651412964, - -0.23854275047779083, - -0.06543659418821335, - -0.11837350577116013, - -0.0585334412753582 - ] - } -} \ No newline at end of file + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 42.63835311005823, + "logprobs": [ + -9.358713150024414, + -2.724055767059326, + -4.5792131423950195, + -1.4844143390655518, + -0.6546129584312439, + -1.7303215265274048, + -2.4795279502868652, + -2.0776171684265137, + -2.4553134441375732, + -6.219150066375732, + -1.566371202468872, + -3.486889362335205, + -4.418787479400635, + -3.8580172061920166, + -2.0664010047912598, + -1.843908667564392, + -3.744598627090454, + -6.82543420791626, + -0.2880207300186157, + -0.9257857799530029, + -6.612694263458252, + -7.218401908874512, + -12.827808380126953, + -2.1861495971679688, + -3.8218231201171875, + -0.5008565187454224, + -4.383245468139648, + -0.06934759020805359, + -0.09667497128248215, + -3.2640299797058105, + -10.102912902832031, + -1.1498218774795532, + -5.979549407958984, + -5.0192108154296875, + -3.8367133140563965, + -2.581653356552124, + -3.4087462425231934, + -5.545716285705566, + -1.6541939973831177, + -5.547749996185303, + -12.21850872039795, + -12.582784652709961, + -0.09534379839897156, + -2.522055149078369, + -1.4054086208343506, + -2.8758127689361572, + -1.1866405010223389, + -0.005799253936856985, + -3.3871712684631348, + -13.193516731262207, + -4.389392852783203, + -2.520228862762451, + -6.023908615112305, + -0.7408540844917297, + -0.04526234790682793, + -1.5508661270141602, + -1.1332746744155884, + -5.653256416320801, + -0.4028852581977844, + -4.9457244873046875, + -0.618165135383606, + -0.6616490483283997, + -2.36385178565979, + -13.6455078125, + -0.08668932318687439, + -3.5266754627227783, + -1.3801541328430176, + -6.351947784423828, + -0.5434023141860962, + -3.5673093795776367, + -0.871107816696167, + -1.618450403213501, + -5.378700256347656, + -17.17119026184082, + -6.662005424499512, + -0.9221409559249878, + -4.141905784606934, + -1.2047083377838135, + -2.227570056915283, + -1.7645721435546875, + -0.21892313659191132, + -9.296550750732422, + -0.11995092779397964, + -7.402207851409912, + -2.512965679168701, + -4.100971221923828, + -3.580245018005371, + -1.9462040662765503, + -2.347074031829834, + -1.5288957357406616, + -2.4033043384552, + -1.7311294078826904, + -1.1686863899230957, + -2.938558340072632, + -0.5278136730194092, + -0.4748117923736572, + -1.749883770942688, + -0.8397680521011353, + -0.4109693169593811, + -0.9552587270736694, + -1.5238327980041504, + -0.4656376838684082, + -1.6448218822479248, + -0.5414345264434814, + -1.2422380447387695, + -1.1426063776016235, + -0.002245525596663356, + -1.252556562423706, + -0.007873333990573883, + -0.7185167670249939, + -0.7521701455116272, + -0.042445242404937744, + -0.8852499723434448, + -0.02266514115035534, + -2.0951969623565674, + -1.348037838935852, + -0.8296748399734497 + ] + } +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml index 612e621534d..551ba8115cb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml @@ -22,8 +22,9 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 - --transformer-impl: transformer_engine - --tensor-model-parallel-size: 1 + --transformer-impl: inference_optimized + --sequence-parallel: true + --tensor-model-parallel-size: 8 --pipeline-model-parallel-size: 1 --deterministic-mode: true --ckpt-format: torch_dist @@ -51,6 +52,7 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-step: 32 --use-flashinfer-fused-rope: true + METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..dccdd34a5e7 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json @@ -0,0 +1,135 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " Then, when you're ready, go home and watch the movie again.", + "generated_tokens": [ + 6830, + 1044, + 2200, + 1636, + 6185, + 11831, + 1044, + 1974, + 4590, + 1321, + 9951, + 1278, + 16070, + 2790, + 1046, + 2 + ], + "latency": 22.701347589492798, + "cuda_graph_request_count_map": null, + "step_count": 16, + "logprobs": [ + -9.498085021972656, + -3.787536859512329, + -3.0404648780822754, + -1.7445809841156006, + -0.29672086238861084, + -1.3661342859268188, + -2.3458175659179688, + -1.83931303024292, + -1.4894113540649414, + -6.440437316894531, + -0.8176816701889038, + -1.790361762046814, + -3.6521127223968506, + -3.7014482021331787, + -1.5858951807022095, + -1.5492421388626099, + -2.844204902648926, + -6.694585800170898, + -0.06552714854478836, + -1.333437204360962, + -6.077418327331543, + -9.448220252990723, + -10.46927261352539, + -1.4987666606903076, + -4.727880001068115, + -0.7596290111541748, + -2.152517795562744, + -0.013758113607764244, + -0.040566492825746536, + -3.1010313034057617, + -8.735280990600586, + -1.5446771383285522, + -5.841436862945557, + -3.0970406532287598, + -4.0269670486450195, + -3.769413948059082, + -2.466399669647217, + -2.3482255935668945, + -0.47234833240509033, + -1.114174723625183, + -5.310229778289795, + -8.236719131469727, + -0.015452657826244831, + -2.854970932006836, + -1.2198810577392578, + -3.923705577850342, + -0.9644856452941895, + -0.0026721982285380363, + -3.096668243408203, + -11.110801696777344, + -3.688267230987549, + -2.3297765254974365, + -4.670788764953613, + -0.09854680299758911, + -0.06234245002269745, + -1.3255000114440918, + -2.169330596923828, + -4.490111827850342, + -0.4412422776222229, + -3.9356117248535156, + -0.5775455832481384, + -0.2409835010766983, + -2.9197134971618652, + -13.475022315979004, + -0.10248012840747833, + -3.5023770332336426, + -0.8544933795928955, + -5.194520473480225, + -0.32954925298690796, + -2.3026833534240723, + -0.5346049070358276, + -1.2862977981567383, + -4.881562232971191, + -15.555293083190918, + -4.919404029846191, + -0.22008435428142548, + -6.644532680511475, + -0.8938115239143372, + -2.1304054260253906, + -1.8866363763809204, + -0.20106904208660126, + -5.917205810546875, + -0.0056310598738491535, + -7.453446388244629, + -3.1677205562591553, + -3.706507682800293, + -2.136584520339966, + -2.9287283420562744, + -1.4792609214782715, + -2.4399306774139404, + -1.2330785989761353, + -1.9715899229049683, + -1.9578948020935059, + -0.23143476247787476, + -2.052696466445923, + -1.0413113832473755, + -1.1709030866622925, + -2.825991630554199, + -1.6848523616790771, + -2.2008259296417236, + -1.5216114521026611, + -1.2439141273498535, + -1.412055253982544 + ] + }, + "throughput": [ + 13.750125804204401, 13.955213632130931 + ] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml new file mode 100644 index 00000000000..4ae5c719291 --- /dev/null +++ b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml @@ -0,0 +1,72 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --tokenizer-type: TikTokenizer + --tiktoken-pattern: v2 + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 1 + --use-mcore-models: true + --is-hybrid-model: true + --model-provider: mamba + --init-method-std: 0.0198 + --untie-embeddings-and-output-weights: true + --disable-bias-linear: true + --init-method-std: 0.014 + --position-embedding-type: none + --num-layers: 50 + --hidden-size: 2048 + --ffn-hidden-size: 11264 + --num-attention-heads: 16 + --kv-channels: 128 + --hybrid-override-pattern: M-M-M-M*-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- + --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec + --normalization: RMSNorm + --swiglu: true + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --seq-length: 4096 + --max-position-embeddings: 4096 + --micro-batch-size: 1 + --ckpt-format: torch_dist + --ckpt-fully-parallel-save: true + --ckpt-fully-parallel-load: true + --ckpt-assume-constant-structure: true + --dist-ckpt-strictness: log_unexpected + --bf16: true + --attention-backend: flash + --no-create-attention-mask-in-dataloader: true + --num-workers: 8 + --use-checkpoint-args: true + --no-use-tokenizer-model-from-checkpoint-args: true + --no-load-optim: true + --deterministic-mode: true + --save-interval: 2000 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --num-tokens-to-generate: 30 + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --output-path: ${TENSORBOARD_PATH} + --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." + --incoming-requests-per-step: 32 + --inference-repeat-n: 3 +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json index 1a9705f8181..d9a60d1ae11 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json @@ -174,5 +174,5 @@ -0.5394397377967834 ] }, - "throughput": [25.35687538450034, 25.35687538450034] + "throughput": [34.95064017365726, 34.95064017365726] } diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 0e1f9110793..e97dc0b56a4 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -80,6 +80,7 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 --inference-repeat-n: 8 + --inference-dynamic-batching-buffer-size-gb: 20 METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 1b9eaaf1f65..6c119cc548b 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -76,6 +76,7 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. --inference-repeat-n: 8 + --inference-dynamic-batching-buffer-size-gb: 20 METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/test_utils/python_scripts/auto_reminder_github.py b/tests/test_utils/python_scripts/auto_reminder_github.py index df75ec0542c..7484244b717 100644 --- a/tests/test_utils/python_scripts/auto_reminder_github.py +++ b/tests/test_utils/python_scripts/auto_reminder_github.py @@ -58,27 +58,42 @@ def get_user_email(self, username: str): try: user = self.github.get_user(username) + public_email = None # 1. Try public profile email first if user.email and not user.email.endswith("@users.noreply.github.com"): - self.email_cache[username] = user.email - return user.email + if user.email.endswith("@nvidia.com"): + self.email_cache[username] = user.email + return user.email + else: + public_email = user.email # 2. If no public email, check recent commits on the main repo try: # Use get_commits(author=...) which is more direct than search_commits for commit in self.repo.get_commits(author=user)[:10]: email = commit.commit.author.email - if email and not email.endswith("@users.noreply.github.com"): + if ( + email + and not email.endswith("@users.noreply.github.com") + and email.endswith("@nvidia.com") + ): self.email_cache[username] = email return email + elif ( + email + and not email.endswith("@users.noreply.github.com") + and public_email is None + ): + public_email = email except Exception as e: logger.debug(f"Could not check commits for {username}: {e}") - # 3. Fallback to public email (even if noreply) or a constructed noreply - email = user.email or f"{username}@users.noreply.github.com" - self.email_cache[username] = email - return email + if public_email is None: + public_email = f"{username}@users.noreply.github.com" + + self.email_cache[username] = public_email + return public_email except Exception as e: logger.warning(f"Could not get user object for {username}: {e}") diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index c61128aaca2..6a3d582d3ae 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -39,7 +39,7 @@ spec: ARGUMENTS=( "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=null" + "DATA_PATH=/mnt/artifacts/" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -59,8 +59,23 @@ products: - environment: [dev] scope: [flaky] platforms: [dgx_h100] + - test_case: [gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq] + products: + - environment: [dev] + scope: [flaky] + platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq] products: - environment: [dev] scope: [mr, mr-github] platforms: [dgx_h100] + - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq] + products: + - environment: [dev] + scope: [mr, mr-github] + platforms: [dgx_h100] + - test_case: [gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq] + products: + - environment: [dev] + scope: [flaky] + diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 0b3606fd702..34030e4923a 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -114,6 +114,11 @@ products: platforms: [dgx_h100] - environment: [lts] scope: [nightly] + - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset] + products: + - environment: [dev] + scope: [mr, mr-github] + platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: - environment: [dev] diff --git a/tests/test_utils/recipes/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/mamba-dynamic-inference.yaml new file mode 100644 index 00000000000..9ca1bab4402 --- /dev/null +++ b/tests/test_utils/recipes/mamba-dynamic-inference.yaml @@ -0,0 +1,61 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}_{environment}_{platforms}" + model: hybrid + build: mcore-pyt-{environment} + nodes: 1 + gpus: 1 + n_repeat: 1 + platforms: dgx_a100 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" + "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" + "DATA_PATH=null" + "DATA_CACHE_PATH=/workspace/data/cache" + "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json" + "N_REPEAT={n_repeat}" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" + "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - test_case: [hybrid_dynamic_inference_tp1_pp1_dp8_583m] + products: + - environment: [dev] + scope: [mr, mr-github] + platforms: [dgx_h100] diff --git a/tests/unit_tests/data/test_fim_dataset.py b/tests/unit_tests/data/test_fim_dataset.py new file mode 100644 index 00000000000..7022a4b5fa9 --- /dev/null +++ b/tests/unit_tests/data/test_fim_dataset.py @@ -0,0 +1,87 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import compile_helpers, get_blend_from_list +from megatron.core.tokenizers import MegatronTokenizer +from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig +from tests.unit_tests.test_utilities import Utils + + +@pytest.mark.parametrize("spm_rate", [0.0, 1.0]) +@pytest.mark.parametrize("split_sample", [None, "python"]) +def test_fim_gpt_dataset(spm_rate, split_sample): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + tokenizer = MegatronTokenizer.from_pretrained( + tokenizer_path="/opt/data/tokenizers/huggingface", + metadata_path={"library": "huggingface"}, + additional_special_tokens=["", "", "", "", ""], + include_special_tokens=True, + ) + blend = get_blend_from_list(["/opt/data/datasets/fim/fim_text_document"]) + extra_tokens = { + "prefix": "", + "middle": "", + "suffix": "", + "pad": "", + "eod": "", + } + seq_length = 32 + rate = 1.0 + fragment_rate = 1.0 + config = GPTFIMDatasetConfig( + blend=blend, + random_seed=1234, + sequence_length=seq_length, + split="990,9,1", + tokenizer=tokenizer, + reset_position_ids=True, + reset_attention_mask=True, + eod_mask_loss=True, + fim_extra_tokens=extra_tokens, + fim_rate=rate, + fim_spm_rate=spm_rate, + fim_fragment_rate=fragment_rate, + fim_split_sample=split_sample, + ) + + datasets = BlendedMegatronDatasetBuilder( + GPTFIMDataset, [10, 10, 10], lambda: True, config + ).build() + + prefix_id = tokenizer.tokenize("")[1] + suffix_id = tokenizer.tokenize("")[1] + middle_id = tokenizer.tokenize("")[1] + + dataset = datasets[0] + assert dataset.fim_rate == rate + assert dataset.fim_spm_rate == spm_rate + assert dataset.fragment_fim_rate == fragment_rate + + tokens = dataset[0]["tokens"].tolist() + if split_sample: + split_sample_id = tokenizer.tokenize(split_sample)[1] + split_sample_index = tokens.index(split_sample_id) + assert prefix_id == tokens[split_sample_index + 1] + if spm_rate == 0.0: + assert prefix_id == tokens[0] + assert suffix_id in tokens + assert middle_id in tokens + assert tokens.index(suffix_id) < tokens.index(middle_id) + else: + assert prefix_id == tokens[0] + assert suffix_id == tokens[1] + assert middle_id in tokens + + +if __name__ == "__main__": + test_fim_gpt_dataset() diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index 0674cdfcabd..1baf9034c9d 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -5,6 +5,9 @@ import pytest import torch +from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, +) from megatron.core.inference.contexts.dynamic_context import ( DynamicInferenceContext, RequestOverflowError, @@ -28,6 +31,8 @@ class TestDynamicContext: def _setup_model_parallel_group(self, tensor_parallel_size, pipeline_parallel_size): + self.pp_size = pipeline_parallel_size + Utils.initialize_model_parallel( tensor_model_parallel_size=tensor_parallel_size, pipeline_model_parallel_size=pipeline_parallel_size, @@ -43,38 +48,39 @@ def _get_dynamic_context( max_sequence_length, buffer_size_gb, block_size_tokens, - buffer_guaranteed_fraction, - buffer_overflow_factor, - max_requests_override, - max_tokens_override, + max_tokens, is_hybrid_model=False, layer_type_list=None, rounder=64, ): set_rounder(rounder) - if is_hybrid_model and layer_type_list is None: - layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] + if is_hybrid_model: + if layer_type_list is None: + layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] + mamba_conv_states_shape = (544, 4) + mamba_ssm_states_shape = (8, 64, 16) + mamba_inference_state_config = MambaInferenceStateConfig( + layer_type_list, mamba_conv_states_shape, mamba_ssm_states_shape + ) + else: + mamba_inference_state_config = None dynamic_context = DynamicInferenceContext( params_dtype=params_dtype, - num_layers=num_layers, + num_layers=num_layers // self.pp_size, kv_channels=kv_channels, num_attention_heads=num_attention_heads, max_sequence_length=max_sequence_length, num_cuda_graphs=None, use_cuda_graphs_for_non_decode_steps=not is_hybrid_model, buffer_size_gb=buffer_size_gb, - buffer_guaranteed_fraction=buffer_guaranteed_fraction, block_size_tokens=block_size_tokens, - buffer_overflow_factor=buffer_overflow_factor, - max_requests_override=max_requests_override, - max_tokens_override=max_tokens_override, - layer_type_list=layer_type_list, - mamba_conv_states_shape=(544, 4), - mamba_ssm_states_shape=(8, 64, 16), + max_tokens=max_tokens, + mamba_inference_state_config=mamba_inference_state_config, use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) return dynamic_context @@ -93,28 +99,25 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) if not is_hybrid_model: - assert dynamic_context.gtd_block_count == 48 - assert dynamic_context.gtd_request_count == 12 - assert dynamic_context.block_allocator.block_count_total == 491 - assert dynamic_context.max_requests == 128 - assert dynamic_context.max_tokens == 62848 + assert dynamic_context.block_allocator.total_count == 491 + assert dynamic_context.block_allocator.active_count == 245 + assert dynamic_context.max_total_requests == 490 + assert dynamic_context.max_active_requests == 245 + assert dynamic_context.max_tokens == 16384 assert dynamic_context.num_mamba_layers == 0 assert dynamic_context.mamba_metadata is None else: - assert dynamic_context.gtd_block_count == 112 - assert dynamic_context.gtd_request_count == 28 - assert dynamic_context.block_allocator.block_count_total == 1156 - assert dynamic_context.max_requests == 320 - assert dynamic_context.max_tokens == 154176 + assert dynamic_context.block_allocator.total_count == 555 + assert dynamic_context.block_allocator.active_count == 277 + assert dynamic_context.max_total_requests == 554 + assert dynamic_context.max_active_requests == 277 + assert dynamic_context.max_tokens == 16384 assert dynamic_context.num_mamba_layers == 1 assert dynamic_context.mamba_metadata is not None @@ -131,11 +134,8 @@ def test_is_static_batching(self): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=1.0, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, ) assert not dynamic_context.is_static_batching() @@ -150,26 +150,18 @@ def test_is_memory_available(self, is_hybrid_model): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=1.0, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) - dynamic_context.block_allocator.block_count_avail = 10 + dynamic_context.block_allocator.active_count = 10 assert dynamic_context.block_allocator.is_memory_available(10) assert not dynamic_context.block_allocator.is_memory_available(11) assert dynamic_context.block_allocator.is_memory_available(1) - dynamic_context.block_allocator.block_count_avail = 0 + dynamic_context.block_allocator.active_count = 0 assert not dynamic_context.block_allocator.is_memory_available(1) - dynamic_context.block_allocator.block_count_avail = 10 - dynamic_context.gtd_block_count = 5 - assert dynamic_context.block_allocator.is_memory_available(6) - assert not dynamic_context.block_allocator.is_memory_available(6, safe=True) - @pytest.mark.internal @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_request_overflow(self, is_hybrid_model: bool): @@ -182,16 +174,14 @@ def test_request_overflow(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=128, buffer_size_gb=0.01, - buffer_guaranteed_fraction=0.1, block_size_tokens=32, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, rounder=1, is_hybrid_model=is_hybrid_model, ) + dynamic_context.max_active_requests //= 2 with pytest.raises(RequestOverflowError): - for i in range(dynamic_context.max_requests + 1): + for i in range(dynamic_context.max_active_requests + 1): dynamic_context.add_request( DynamicInferenceRequest( request_id=i, @@ -214,11 +204,8 @@ def test_token_overflow_error(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=0.1, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - buffer_overflow_factor=1.0, - max_requests_override=2, - max_tokens_override=20, # Setting a very low token limit + max_tokens=200, # setting low, but >= context.max_active_requests. rounder=1, is_hybrid_model=is_hybrid_model, ) @@ -227,7 +214,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool): dynamic_context.add_request( DynamicInferenceRequest( request_id=1, - prompt_tokens=torch.arange(0, 25, device='cuda'), + prompt_tokens=torch.arange(0, 225, device='cuda'), sampling_params=SamplingParams( num_tokens_to_generate=dynamic_context.max_tokens - 25 ), @@ -246,11 +233,8 @@ def test_reset(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=128, buffer_size_gb=1.0, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) @@ -273,7 +257,6 @@ def test_reset(self, is_hybrid_model: bool): dynamic_context.token_to_position_in_request.fill_(1) dynamic_context.token_to_block_idx.fill_(1) dynamic_context.token_to_local_position_within_kv_block.fill_(1) - dynamic_context.block_allocator.block_count_avail = 5 dynamic_context.memory_buffer.fill_(1) dynamic_context.request_to_kv_block_ids.fill_(1) if is_hybrid_model: @@ -303,8 +286,8 @@ def test_reset(self, is_hybrid_model: bool): assert torch.all(dynamic_context.token_to_block_idx == -1) assert torch.all(dynamic_context.token_to_local_position_within_kv_block == 0) assert ( - dynamic_context.block_allocator.block_count_avail - == dynamic_context.block_allocator.block_count_total - 1 + dynamic_context.block_allocator.active_count + == dynamic_context.block_allocator.total_count // 2 ) assert torch.all(dynamic_context.request_to_kv_block_ids == -1) if is_hybrid_model: @@ -323,16 +306,13 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) if is_hybrid_model: - expected_memory_blocks = [1151, 1152, 1153, 1154] + expected_memory_blocks = [550, 551, 552, 553] else: expected_memory_blocks = [486, 487, 488, 489] expected_block_count_avail = expected_memory_blocks[0] @@ -345,20 +325,20 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): .tolist() == expected_memory_blocks ) - assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail + assert dynamic_context.block_allocator.total_avail == expected_block_count_avail dynamic_context.block_allocator.release_memory_blocks( torch.tensor(expected_memory_blocks[-2:], device='cuda') ) - assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail + 2 + assert dynamic_context.block_allocator.total_avail == expected_block_count_avail + 2 assert ( dynamic_context.block_allocator.allocate_memory_blocks(1).item() == expected_memory_blocks[-1] ) - assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail + 1 + assert dynamic_context.block_allocator.total_avail == expected_block_count_avail + 1 # Should return None since we allocate more blocks than what we have. assert ( dynamic_context.block_allocator.allocate_memory_blocks( - dynamic_context.block_allocator.block_count_avail + 100 + dynamic_context.block_allocator.total_avail + 100 ) == None ) @@ -375,11 +355,8 @@ def test_add_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) assert dynamic_context.block_size_tokens == 128 @@ -401,7 +378,7 @@ def test_add_request(self, is_hybrid_model: bool): assert dynamic_context.request_kv_length_offsets[0] == 0 assert dynamic_context.request_kv_block_counts[0] == 2 assert dynamic_context.request_last_kv_block_id[0].item() == ( - 1154 if is_hybrid_model else 489 + 553 if is_hybrid_model else 489 ) assert dynamic_context.request_last_kv_block_offset[0].item() == 15 assert torch.all( @@ -451,11 +428,8 @@ def test_update_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) @@ -464,7 +438,7 @@ def test_update_request(self, is_hybrid_model: bool): dynamic_context.paused_request_count = 0 dynamic_context.total_request_count = 3 dynamic_context.request_kv_block_counts[0:3] = 1 - new_block_ids = dynamic_context.block_allocator.allocate_memory_blocks(3, safe=True) + new_block_ids = dynamic_context.block_allocator.allocate_memory_blocks(3) dynamic_context.request_to_kv_block_ids[0:3, 0] = new_block_ids if is_hybrid_model: @@ -498,11 +472,8 @@ def test_update_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) @@ -520,18 +491,16 @@ def test_update_request(self, is_hybrid_model: bool): ) total_request_count = 10 - dynamic_context.block_allocator.block_count_avail -= 11 # We align 11 blocks to the 10 requests we have. 3rd request alone we setup like it requires 2 blocks + dynamic_context.block_allocator.total_avail -= 11 # We align 11 blocks to the 10 requests we have. 3rd request alone we setup like it requires 2 blocks dynamic_context.total_request_count = total_request_count dynamic_context.request_to_kv_block_ids[0:total_request_count, 0] = torch.arange( - dynamic_context.block_allocator.block_count_avail, - dynamic_context.block_allocator.block_count_avail + 10, + dynamic_context.block_allocator.total_avail, + dynamic_context.block_allocator.total_avail + 10, ) dynamic_context.request_to_kv_block_ids[3][ 1 - ] = ( - dynamic_context.block_allocator.block_count_avail - ) # Assign one extra block to request 3. + ] = dynamic_context.block_allocator.total_avail # Assign one extra block to request 3. dynamic_context.request_kv_length_offsets[0:total_request_count] = 10 # For 0, 1, 5, 6, the total number of tokens in last block is block size -1, so that they will all need extra blocks dynamic_context.request_kv_length_offsets[0:2] = dynamic_context.block_size_tokens - 1 @@ -617,13 +586,13 @@ def test_update_request(self, is_hybrid_model: bool): dynamic_context.request_to_kv_block_ids[0:10].cpu() == torch.tensor( [ - [1144, 1147, -1, -1], - [1145, 1144, -1, -1], - [1149, 1151, -1, -1], - [1150, 1152, -1, -1], - [1148, -1, -1, -1], - [1146, -1, -1, -1], - [1153, -1, -1, -1], + [543, 546, -1, -1], + [544, 543, -1, -1], + [548, 550, -1, -1], + [549, 551, -1, -1], + [547, -1, -1, -1], + [545, -1, -1, -1], + [552, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1], @@ -662,22 +631,19 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) # Set up the initial state with 5 requests # Allocate 5 blocks for 5 requests - initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(5, safe=True) + initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(5) dynamic_context.total_request_count = 5 dynamic_context.paused_request_count = 0 # Record the available blocks before releasing memory - initial_available_blocks = dynamic_context.block_allocator.block_count_avail + initial_available_blocks = dynamic_context.block_allocator.total_avail # Assign blocks to the requests (one block per request) for i in range(5): @@ -708,7 +674,7 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): assert dynamic_context.active_token_count == 2 # Verify that 3 blocks were released by checking the available blocks - assert dynamic_context.block_allocator.block_count_avail == initial_available_blocks + 3 + assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 3 if is_hybrid_model: # Request at position 3 now moves into finished request position 0 @@ -737,22 +703,19 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, ) # Set up the initial state with 3 requests, where some use multiple blocks # Allocate 6 blocks in total for the requests - initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(6, safe=True) + initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(6) dynamic_context.total_request_count = 3 dynamic_context.paused_request_count = 0 # Record the available blocks before releasing memory - initial_available_blocks = dynamic_context.block_allocator.block_count_avail + initial_available_blocks = dynamic_context.block_allocator.total_avail # Assign blocks to the requests: # - Request 0: 1 block @@ -792,7 +755,7 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): assert dynamic_context.active_token_count == 0 # Verify that all 6 blocks were released by checking the available blocks - assert dynamic_context.block_allocator.block_count_avail == initial_available_blocks + 6 + assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 6 if is_hybrid_model: # All mamba states should be zeroed out @@ -813,11 +776,8 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=False, ) with pytest.raises(AssertionError) as error: @@ -831,11 +791,8 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, is_hybrid_model=is_hybrid_model, layer_type_list=[Symbols.MAMBA, Symbols.ATTENTION, Symbols.MAMBA, Symbols.ATTENTION], ) @@ -890,11 +847,8 @@ def test_calculate_and_store_log_probs(self): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, - buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_requests_override=None, - max_tokens_override=None, - buffer_overflow_factor=None, + max_tokens=None, ) # Add a few requests to the context @@ -1097,56 +1051,3 @@ def test_calculate_and_store_log_probs(self): ) current_global_token_offset += expected_len - - @pytest.mark.internal - def test_unified_memory(self): - - from megatron.core.inference.unified_memory import ( - UnifiedMemoryUnsupportedError, - create_unified_mempool, - ) - - # Check UVM support. - try: - create_unified_mempool() - except UnifiedMemoryUnsupportedError: - pytest.skip("Unified memory not available due to bad environment.") - - # Setup. - self._setup_model_parallel_group(1, 1) - - # Compute number of contexts needed to fill GPU memory. - gpu_size_gb = ( - torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory / 1024**3 - ) - buffer_size_gb = 20 - num_contexts = math.ceil(gpu_size_gb / buffer_size_gb) + 1 - - # Allocate enough contexts to fill GPU memory. - def init_contexts(*, unified_memory_level): - contexts = [] - for i in range(num_contexts): - contexts.append( - DynamicInferenceContext( - params_dtype=torch.float32, - num_layers=4, - kv_channels=8, - num_attention_heads=2, - max_sequence_length=512, - buffer_size_gb=buffer_size_gb, - buffer_overflow_factor=1, - buffer_guaranteed_fraction=0, - unified_memory_level=unified_memory_level, - ) - ) - - # Pure GPU memory test should OOM. - try: - init_contexts(unified_memory_level=0) - except torch.OutOfMemoryError: - pass - else: - raise Exception("expected OOM.") - - # Unified memory test should succeed. - init_contexts(unified_memory_level=1) diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 0ac4b296746..174bf89350b 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -1,9 +1,10 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import asyncio +import math import random import types -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import pytest @@ -12,6 +13,9 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state +from megatron.core.inference.contexts.attention_context.mamba_metadata import ( + MambaInferenceStateConfig, +) from megatron.core.inference.contexts.dynamic_context import ( ActiveRequestCountOverflowError, BlockOverflowError, @@ -34,6 +38,7 @@ ) from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, + get_gpt_layer_with_inference_spec, get_gpt_layer_with_transformer_engine_spec, ) from megatron.core.models.gpt.gpt_model import GPTModel @@ -44,7 +49,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import ( check_mamba_sequence_packing_support, - get_attr_wrapped_model, + get_mamba_inference_state_config_from_model, is_fa_min_version, is_te_min_version, ) @@ -86,10 +91,7 @@ class DynamicEngineTestConfig: context_buffer_size_gb: float = 0.1 # enough room for all tokens. context_block_size_tokens: int = 256 - context_buffer_guaranteed_fraction: float = 0.01 - context_buffer_overflow_factor: Optional[float] = None - context_max_requests_override: Optional[int] = None - context_max_tokens_override: Optional[int] = None + context_max_tokens: Optional[int] = None tensor_model_parallel_size: int = 1 pipeline_model_parallel_size: int = 1 expert_model_parallel_size: int = 1 @@ -105,12 +107,14 @@ class DynamicEngineTestConfig: skip_prompt_log_probs: bool = False cuda_graph_scope: List[str] = None force_build_cuda_graphs: bool = False + transformer_impl: str = "local" # If False, do not build cuda graphs in the tests, even if # num_cuda_graphs is set. # For tests concerning cuda-graph warmups, we set this to False # to avoid the overhead of building the graphs, which is not # relevant to the test. The tests only check if the required # context attributes are set correctly. + suspend_resume_interval: Optional[int] = None fp8: bool = False @@ -125,17 +129,6 @@ def __post_init__(self): assert self.num_tokens_total is not None self.max_sequence_length = self.num_tokens_total - # Update overrides if not using overflow factor. - if self.context_buffer_overflow_factor is None: - - # Enough room for all requests. - if self.context_max_requests_override is None: - self.context_max_requests_override = self.num_requests - - # Enough room for all tokens. - if self.context_max_tokens_override is None: - self.context_max_tokens_override = self.num_requests * self.max_sequence_length - if self.cuda_graph_scope is None: self.cuda_graph_scope = ["full_iteration"] @@ -147,6 +140,9 @@ class DynamicEngineTestEnv: config: DynamicEngineTestConfig requests: List[DynamicInferenceRequest] engine: DynamicInferenceEngine + mem_usage: dict = field( + default_factory=lambda: {"start": None, "end": None, "suspend_resume": {}} + ) class TestDynamicInferenceEngine: @@ -215,34 +211,29 @@ def _build_inference_context( test_config: DynamicEngineTestConfig, transformer_config: TransformerConfig, requests: List[DynamicInferenceRequest], - layer_type_list: Optional[List[str]], - mamba_conv_states_shape: Optional[Tuple[int]] = None, - mamba_ssm_states_shape: Optional[Tuple[int]] = None, + mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, ): """The inference context manages the KV cache and other inference state.""" # Inference context. context = DynamicInferenceContext( params_dtype=transformer_config.params_dtype, - num_layers=transformer_config.num_layers, + num_layers=transformer_config.num_layers + // transformer_config.pipeline_model_parallel_size, kv_channels=transformer_config.kv_channels, num_attention_heads=transformer_config.num_query_groups, max_sequence_length=test_config.max_sequence_length, num_cuda_graphs=test_config.num_cuda_graphs, use_cuda_graphs_for_non_decode_steps=not test_config.model_provider == "mamba", buffer_size_gb=test_config.context_buffer_size_gb, - buffer_guaranteed_fraction=test_config.context_buffer_guaranteed_fraction, block_size_tokens=test_config.context_block_size_tokens, - buffer_overflow_factor=test_config.context_buffer_overflow_factor, - max_requests_override=test_config.context_max_requests_override, - max_tokens_override=test_config.context_max_tokens_override, + max_tokens=test_config.context_max_tokens, tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, + mamba_inference_state_config=mamba_inference_state_config, materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) return context @@ -295,16 +286,26 @@ def _build_test_env(cls, test_config): ), sequence_parallel=test_config.sequence_parallel, pipeline_dtype=torch.bfloat16, - add_bias_linear=test_config.expert_model_parallel_size == 1, + add_bias_linear=test_config.expert_model_parallel_size == 1 + and not (test_config.transformer_impl == "inference_optimized"), fp8="hybrid" if test_config.fp8 else None, fp8_recipe="tensorwise" if test_config.fp8 else None, inference_sampling_seed=test_config.random_seed, cuda_graph_scope=test_config.cuda_graph_scope, + transformer_impl=test_config.transformer_impl, + normalization=( + "RMSNorm" + if test_config.transformer_impl == "inference_optimized" + else "LayerNorm" + ), + # inference optimized currently only supports RMS Norm ) - if test_config.fp8: + if test_config.fp8 or test_config.transformer_impl == "transformer_engine": layer_spec = get_gpt_layer_with_transformer_engine_spec() - else: + elif test_config.transformer_impl == "local": layer_spec = get_gpt_layer_local_spec() + elif test_config.transformer_impl == "inference_optimized": + layer_spec = get_gpt_layer_with_inference_spec() # GPT model. model = GPTModel( @@ -317,10 +318,13 @@ def _build_test_env(cls, test_config): post_process=parallel_state.is_pipeline_last_stage(), ).cuda() elif test_config.model_provider == "mamba": + pp_size = test_config.pipeline_model_parallel_size # Transformer config. transformer_config = TransformerConfig( params_dtype=torch.bfloat16, - num_layers=3, # 1 Mamba layer, 1 attention layer, 1 MLP layer + num_layers=( + 3 if pp_size == 1 else 6 + ), # 1 Mamba layer, 1 attention layer, 1 MLP layer hidden_size=256, # The Mamba layer places several constraints on this mamba_num_heads=16, num_attention_heads=16, @@ -333,7 +337,7 @@ def _build_test_env(cls, test_config): ), inference_rng_tracker=True, tensor_model_parallel_size=test_config.tensor_model_parallel_size, - pipeline_model_parallel_size=test_config.pipeline_model_parallel_size, + pipeline_model_parallel_size=pp_size, expert_model_parallel_size=test_config.expert_model_parallel_size, num_moe_experts=( None @@ -346,6 +350,7 @@ def _build_test_env(cls, test_config): fp8="hybrid" if test_config.fp8 else None, fp8_recipe="tensorwise" if test_config.fp8 else None, cuda_graph_scope=test_config.cuda_graph_scope, + is_hybrid_model=True, # Needs to be set for correct out_proj init ) # Mamba model. @@ -368,22 +373,7 @@ def _build_test_env(cls, test_config): model.eval() - # Layer type list for hybrid models - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if test_config.model_provider == "mamba": - mamba_states_shapes = decoder.mamba_state_shapes_per_request() - if mamba_states_shapes is not None: - (mamba_conv_states_shape, mamba_ssm_states_shape) = mamba_states_shapes - else: - # A `MambaBlock` can only not have a `MambaLayer` if using pipeline parallelism - # and a particular pipeline stage was not assigned a `MambaLayer`. - assert test_config.pipeline_model_parallel_size > 1 - mamba_conv_states_shape = None - mamba_ssm_states_shape = None - else: - mamba_conv_states_shape = None - mamba_ssm_states_shape = None + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) # Inference config. inference_config = InferenceWrapperConfig( @@ -400,9 +390,7 @@ def _build_test_env(cls, test_config): test_config=test_config, transformer_config=transformer_config, requests=requests, - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, + mamba_inference_state_config=mamba_inference_state_config, ) # Inference model wrapper. @@ -416,7 +404,9 @@ def _build_test_env(cls, test_config): # Text generation controller. text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, - tokenizer=types.SimpleNamespace(vocab_size=test_config.vocab_size), + tokenizer=types.SimpleNamespace( + vocab_size=test_config.vocab_size, detokenize=lambda tokens: "tokenized_prompt" + ), ) # Reset global cuda graph state. @@ -435,12 +425,6 @@ def _build_test_env(cls, test_config): # Test env. env = DynamicEngineTestEnv(config=test_config, requests=requests, engine=engine) - # Mock the detokenize method to return predictable result - def mock_detokenize_prompt(tokens): - return "tokenized_prompt" - - env.engine.controller.tokenizer.detokenize = mock_detokenize_prompt - return env @classmethod @@ -453,7 +437,31 @@ def _run_step(cls, env): # and engine.async_step() doesn't use this sampling param's # num_tokens_to_generate. result = env.engine.step_modern(verbose=False) - finished_requests = result["finished_requests"] + + # Suspend + resume. + if ( + env.config.suspend_resume_interval is not None + and env.engine.step_count % env.config.suspend_resume_interval == 0 + ): + suspend_resume_mems = {} + suspend_resume_mems["start"] = torch.cuda.memory_stats() + env.engine.suspend() # suspend. + suspend_resume_mems["mid"] = torch.cuda.memory_stats() + env.engine.resume() # resume. + suspend_resume_mems["end"] = torch.cuda.memory_stats() + env.mem_usage["suspend_resume"][env.engine.step_count] = suspend_resume_mems + + # Nothing done? + finished_request_records = result["finished_request_records"] + if len(finished_request_records) == 0: + return + + # Append output tokens. + for finished_request_record in finished_request_records: + finished_request = finished_request_record.merge(env.engine.controller.tokenizer) + request = env.requests[finished_request.request_id] + request.output = finished_request.generated_tokens + request.status = finished_request.status @classmethod @torch.inference_mode() @@ -463,10 +471,12 @@ def _run_test(cls, **test_config_kwargs): env = cls._build_test_env(test_config) # Add requests to engine. + env.mem_usage["start"] = torch.cuda.memory_stats() for request in tqdm(env.requests, "add requests"): # Add request. env.engine._add_request(request) + request.state = "pending" # Insert gap steps between adding requests. for _ in range(test_config.num_gap_steps): @@ -493,14 +503,20 @@ def _run_test(cls, **test_config_kwargs): if num_tokens_total is None else num_tokens_total - len(request.prompt_tokens) ) - assert ( - (num_tokens_to_generate is None and num_tokens_total is None) - or len(request.generated_tokens) == num_tokens_expected - or request.status == Status.FAILED - ), ( - f"Request {request.request_id} expected to generate {num_tokens_to_generate} " - f"tokens but generated {len(request.generated_tokens)}" - ) + + # Validate the output length only if suspend_resume_interval is None. + # If it is not None, then the output length could be anything in the + # range [1, num_tokens_to_generate]. + if test_config.suspend_resume_interval is None: + assert ( + (num_tokens_to_generate is None and num_tokens_total is None) + or len(request.generated_tokens) <= num_tokens_expected + or request.status == Status.FAILED + ), ( + f"Request {request.request_id} expected to generate {num_tokens_to_generate} " + f"tokens but generated {len(request.generated_tokens)}" + ) + env.mem_usage["end"] = torch.cuda.memory_stats() return env @@ -518,40 +534,40 @@ def teardown_method(self, method): def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None: """Simple test that runs without errors, and validates output.""" skip_if_mamba_sequence_packing_not_available(model_provider) + num_tokens_to_generate = 16 # Run test. env = self._run_test( + num_tokens_to_generate=num_tokens_to_generate, model_provider=model_provider, num_cuda_graphs=num_cuda_graphs, - context_max_requests_override=32, cuda_graph_scope=cuda_graph_scope, force_build_cuda_graphs=True, ) # Validate max_requests, max_tokens. - assert env.engine.context.max_requests == 32 - assert env.engine.context.max_tokens == 160 + assert env.engine.context.max_tokens == DynamicInferenceContext.DEFAULT_MAX_TOKENS - # Validate output tokens. + # Validate generated tokens. gpt_expected_generated_tokens = [ - [69, 85, 55, 74], - [29, 54, 85, 89], - [33, 30, 64, 59], - [45, 76, 33, 67], - [41, 56, 15, 58], - [28, 17, 6, 37], - [17, 2, 54, 47], - [], # this request is failed due to max sequence length overflow + [69, 85, 55, 74, 56, 89, 64, 59, 55, 67, 15, 58, 6, 37, 54, 47], + [29, 54, 33, 72, 45, 76, 41, 56, 28, 25, 17, 2, 61, 6, 98, 76], + [35, 78, 54, 16, 79, 98, 22, 5, 60, 0, 1, 76, 77, 11, 25, 7], + [25, 75, 57, 85, 81, 37, 88, 17, 71, 15, 70, 64, 50, 0, 64, 45], + [32, 5, 85, 75, 30, 68, 23, 33, 20, 26, 89, 20, 92, 97, 38, 81], + [33, 69, 32, 49, 93, 24, 33, 6, 97, 36, 37, 99], + [82, 78, 78, 65, 22, 1, 87, 42, 36, 26, 27, 56, 82, 32, 8, 80], + [], ] mamba_expected_generated_tokens = [ - [74, 72, 83, 59], - [25, 54, 1, 70], - [28, 14, 15, 89], - [87, 27, 30, 52], - [44, 13, 82, 70], - [28, 74, 64, 16], - [8, 4, 83, 5], + [74, 72, 9, 59, 1, 70, 15, 89, 30, 52, 82, 70, 64, 16, 83, 5], + [25, 54, 28, 14, 87, 27, 60, 92, 28, 74, 8, 63, 60, 68, 87, 82], + [31, 21, 87, 25, 96, 13, 32, 49, 40, 54, 55, 68, 73, 2, 64, 96], + [72, 80, 35, 72, 77, 85, 98, 36, 4, 97, 37, 46, 79, 95, 83, 25], + [8, 80, 56, 4, 87, 1, 43, 98, 85, 7, 50, 38, 24, 28, 18, 80], + [9, 94, 36, 16, 87, 57, 25, 76, 64, 92, 47, 86, 73, 72, 71, 97], + [17, 5, 62, 66, 15, 52, 32, 75, 66, 18, 90, 14, 67, 37, 94, 33], [], ] @@ -562,6 +578,10 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None else: raise ValueError(f"Invalid model_provider {model_provider}") + print(f"Validating {len(env.requests)} requests.") + print(f"Expected generated tokens: {expected_generated_tokens_list}") + print(f"Actual generated tokens: {[request.generated_tokens for request in env.requests]}") + assert len(env.requests) == len(expected_generated_tokens_list) for request, expected_generated_tokens in zip(env.requests, expected_generated_tokens_list): @@ -571,41 +591,6 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None f"expected ({expected_generated_tokens})." ) - @pytest.mark.internal - @pytest.mark.skipif( - not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" - ) - def test_overflow_factor(self, model_provider: str = "gpt") -> None: - """Test overflow factor arg.""" - skip_if_mamba_sequence_packing_not_available(model_provider) - - # Run test. - env = self._run_test( - context_buffer_overflow_factor=0.1, - context_max_requests_override=None, - context_max_tokens_override=None, - model_provider=model_provider, - ) - - # Validate max_requests, max_tokens. - if model_provider == "gpt": - assert env.engine.context.max_requests == 420 - assert env.engine.context.max_tokens == 420 - elif model_provider == "mamba": - assert env.engine.context.max_requests == 16 - assert env.engine.context.max_tokens == 16 - - @pytest.mark.internal - @pytest.mark.skipif( - not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" - ) - @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) - def test_request_overflow(self, model_provider: str) -> None: - """Test request overflow.""" - skip_if_mamba_sequence_packing_not_available(model_provider) - - self._run_test(context_max_requests_override=4, model_provider=model_provider) - @pytest.mark.skipif( not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" ) @@ -613,7 +598,11 @@ def test_request_overflow(self, model_provider: str) -> None: def test_token_overflow_transient(self) -> None: """Test token overflow.""" test_config = DynamicEngineTestConfig( - num_requests=2, min_prompt_length=8, max_prompt_length=8, context_max_tokens_override=12 + num_requests=2, + min_prompt_length=512, + max_prompt_length=512, + num_tokens_to_generate=2, + context_max_tokens=900, ) env = self._build_test_env(test_config) env.engine._add_request(env.requests[0]) @@ -632,7 +621,7 @@ def test_token_overflow_transient(self) -> None: ) def test_token_overflow_nontransient(self) -> None: """Test token overflow (non-transient).""" - test_config = DynamicEngineTestConfig(context_max_tokens_override=8) + test_config = DynamicEngineTestConfig(context_max_tokens=8) env = self._build_test_env(test_config) try: env.engine._add_request(env.requests[0]) @@ -689,19 +678,21 @@ def test_cuda_graph_token_counts(self) -> None: # Test num_cuda_graphs. for num_cuda_graphs, expected_cuda_graph_token_counts in [ - (0, [64]), - (1, [64]), - (2, [64, 32]), - (4, [64, 48, 32, 16]), - (8, [64, 56, 48, 40, 32, 24, 16, 8]), - (16, [64, 56, 48, 40, 32, 24, 16, 8]), - (64, [64, 56, 48, 40, 32, 24, 16, 8]), - (1024, [64, 56, 48, 40, 32, 24, 16, 8]), + (0, [40]), + (1, [40]), + (2, [40, 24]), + (4, [40, 32, 16]), + (8, [40, 32, 24, 16, 8]), + (16, [40, 32, 24, 16, 8]), + (64, [40, 32, 24, 16, 8]), + (1024, [40, 32, 24, 16, 8]), ]: # Build cuda graphs (inside dynamic engine). env = self._build_test_env( - DynamicEngineTestConfig(num_requests=64, num_cuda_graphs=num_cuda_graphs) + DynamicEngineTestConfig( + context_buffer_size_gb=0.01, num_cuda_graphs=num_cuda_graphs + ) ) actual_cuda_graph_token_counts = env.engine.context.cuda_graph_token_counts assert ( @@ -721,19 +712,7 @@ def test_cuda_graph_token_counts(self) -> None: ) @pytest.mark.parametrize( "num_warmup_tokens, expected_cuda_graph_token_count", - [ - (1, 8), - (2, 8), - (4, 8), - (8, 8), - (10, 16), - (12, 16), - (16, 16), - (20, 24), - (24, 24), - (28, 32), - (32, 32), - ], + [(1, 8), (2, 8), (4, 8), (8, 8), (10, 16), (12, 16), (16, 16)], ) @torch.inference_mode() def test_cuda_graph_warmup( @@ -748,17 +727,16 @@ def test_cuda_graph_warmup( # Initialize context. env = self._build_test_env( - DynamicEngineTestConfig(num_requests=32, num_cuda_graphs=8, num_tokens_to_generate=1) + DynamicEngineTestConfig( + context_buffer_size_gb=0.0041, num_cuda_graphs=8, num_tokens_to_generate=1 + ) ) context = env.engine.context assert context.is_decode_only() - assert context.cuda_graph_token_counts == [ - 32, - 24, - 16, - 8, - ], "cuda_graph_token_counts: %s." % str(context.cuda_graph_token_counts) + assert context.cuda_graph_token_counts == [16, 8], "cuda_graph_token_counts: %s." % str( + context.cuda_graph_token_counts + ) context.initialize_attention_state( num_warmup_tokens=num_warmup_tokens, warmup_engine_mode=warmup_engine_mode @@ -851,7 +829,10 @@ def mock_tokenize_prompt(prompt, add_BOS=False): # Call the generate function. # It's safe to use request 0's sampling params here because all sampling # params are identical as long as use_fixed_output_lengths == False. - finished_requests = env.engine.generate(prompts, env.requests[0].sampling_params) + finished_request_records = env.engine.generate(prompts, env.requests[0].sampling_params) + finished_requests = [ + r.merge(env.engine.controller.tokenizer) for r in finished_request_records + ] # Verify results assert len(finished_requests) == len( @@ -901,10 +882,11 @@ async def test_run_engine(self): num_tokens_to_generate = env.requests[ request_id ].sampling_params.num_tokens_to_generate - result = fut.result() - assert result.generated_length == num_tokens_to_generate, ( + request_record = fut.result() + request = request_record.merge(env.engine.controller.tokenizer) + assert request.generated_length == num_tokens_to_generate, ( f"Request {request_id} expected to generate {num_tokens_to_generate} " - f"tokens but generated {result.generated_length}" + f"tokens but generated {request.generated_length}" ) engine_task.cancel() @@ -951,6 +933,7 @@ def test_return_log_probs(self): @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) + @pytest.mark.parametrize("transformer_impl", ["local", "inference_optimized"]) @torch.inference_mode() def test_parallel_inference( self, @@ -960,6 +943,7 @@ def test_parallel_inference( ep_size, sequence_parallel, materialize_only_last_token_logits, + transformer_impl, ): skip_if_mamba_sequence_packing_not_available(model_provider) @@ -975,13 +959,22 @@ def test_parallel_inference( pytest.skip(reason="Sequence parallelism requires tp_size > 1") elif tp_size > 1 and ep_size > 1 and not sequence_parallel: pytest.skip(reason="Sequence parallelism must be used with tp_size > 1 and ep_size > 1") - elif pp_size > 1 and model_provider == "mamba": - pytest.skip( - reason=( - "Running hybrid models with pp_size > 1 and no attention on some " - "pipeline stages is not supported yet." + elif transformer_impl == "inference_optimized": + if ep_size > 1: + pytest.skip( + reason="MoE models are not supported with the inference optimized transformer." + ) + if tp_size > 1 and not sequence_parallel: + pytest.skip( + reason=( + "The inference optimized transformer requires sequence parallelism " + "when tp_size > 1." + ) + ) + if model_provider == "mamba": + pytest.skip( + reason="Mamba model is not supported with the inference optimized transformer." ) - ) env = self._run_test( model_provider=model_provider, @@ -990,6 +983,7 @@ def test_parallel_inference( expert_model_parallel_size=ep_size, sequence_parallel=sequence_parallel, materialize_only_last_token_logits=materialize_only_last_token_logits, + transformer_impl=transformer_impl, ) @pytest.mark.internal @@ -1038,8 +1032,7 @@ def test_events(self): max_prompt_length=10, num_tokens_to_generate=32, context_buffer_size_gb=0.001, # 0.001, # 8 blocks - context_max_requests_override=8, - context_max_tokens_override=8, + context_max_tokens=8, num_gap_steps=1, ) @@ -1088,27 +1081,58 @@ def test_chunked_prefill(self, model_provider: str): materialize_only_last_token_logits=False, model_provider=model_provider, context_block_size_tokens=256, - context_max_tokens_override=300, + context_max_tokens=1000, ) - -if __name__ == "__main__": - test = TestDynamicInferenceEngine() - test.test_simple(4) - test.test_overflow_factor() - test.test_request_overflow() - test.test_token_overflow_transient() - # test.test_token_overflow_nontransient() # uncomment in megatron-core 0.16 - test.test_block_overflow() - test.test_multi_add() - test.test_fixed_output_lengths() - test.test_cuda_graph_request_counts() - test.test_cuda_graph_warmup(WarmupEngineMode.DECODE, 1, 8) - test.test_generate_function() - asyncio.run(test.test_run_engine()) - test.test_return_log_probs() - test.test_parallel_inference() - # test.test_events() # uncomment in megatron-core 0.16 - test.teardown_method(None) - print("~~~") - print("success.") + @pytest.mark.internal + @pytest.mark.skipif( + not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" + ) + @pytest.mark.skip( + reason="test works in isolation, but memory dynamics change when run " + "within unt test suite." + ) + def test_suspend_resume_memory(self): + + # Run tests. + mem_usages = {} + for suspend_resume_interval in None, 8, 4, 2: # interval 1 acts funny. + + # Run test. + env = self._run_test(suspend_resume_interval=suspend_resume_interval, num_gap_steps=1) + + # Record memory usage. + mem_usages[suspend_resume_interval] = env.mem_usage + + # Clear memory to make recorded memories consistent between tests. + # TODO(@lmcafee): why is memory not automatically cleared? + # env.engine.suspend() # TODO(@lmcafee): useful? + del env + + # Utility methods. + get_alloc = lambda mem_stats: mem_stats["allocated_bytes.all.current"] + + # Validate overall 'end' memory usage. + golden_end_bytes = get_alloc(mem_usages[None]["end"]) + for interval, mem_usage in mem_usages.items(): + current_end_bytes = get_alloc(mem_usage["end"]) + assert math.isclose( + golden_end_bytes, current_end_bytes, rel_tol=0.01 + ), f"{current_end_bytes} != {golden_end_bytes}." + + # Validate 'suspend/resume' memory usage. + get_suspend_resume_bytes = lambda key: list( + get_alloc(list(d["suspend_resume"].values())[-1][key]) + for i, d in mem_usages.items() + if i is not None + ) + suspend_resume_mid_bytes = get_suspend_resume_bytes("mid") + suspend_resume_end_bytes = get_suspend_resume_bytes("end") + for mid_bytes in suspend_resume_mid_bytes: + assert math.isclose( + suspend_resume_mid_bytes[0], mid_bytes, rel_tol=0.01 + ), f"{mid_bytes} != {suspend_resume_mid_bytes[0]}." + for end_bytes in suspend_resume_end_bytes: + assert math.isclose( + suspend_resume_end_bytes[0], end_bytes, rel_tol=0.01 + ), f"{end_bytes} != {suspend_resume_end_bytes[0]}." diff --git a/tests/unit_tests/inference/engines/test_static_engine.py b/tests/unit_tests/inference/engines/test_static_engine.py index 699a4d1f473..40187a5eff9 100644 --- a/tests/unit_tests/inference/engines/test_static_engine.py +++ b/tests/unit_tests/inference/engines/test_static_engine.py @@ -12,7 +12,11 @@ from megatron.core import parallel_state from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine -from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.inference_request import ( + DynamicInferenceRequestRecord, + InferenceRequest, + Status, +) from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) @@ -188,12 +192,19 @@ def test_generate_dynamic(self, batch_size: int, num_trials: int, empty_prompt: prompts = ["" for i in range(batch_size)] else: prompts = ["sample" * (i + 1) for i in range(batch_size)] - results: List[InferenceRequest] = self.static_engine.generate( - prompts, sampling_params=SamplingParams(num_tokens_to_generate=10) + results: List[Union[InferenceRequest, DynamicInferenceRequestRecord]] = ( + self.static_engine.generate( + prompts, sampling_params=SamplingParams(num_tokens_to_generate=10) + ) ) assert len(results) == batch_size for result in results: + if isinstance(result, DynamicInferenceRequestRecord): + result = result.merge(self.static_engine.controller.tokenizer) + assert isinstance(result, InferenceRequest), ( + "expected ; found <%s>." % type(result).__name__ + ) assert ( result.status == Status.COMPLETED ), f"Status should be completed but its {result.status}" diff --git a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py new file mode 100644 index 00000000000..7b4fb4b4250 --- /dev/null +++ b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py @@ -0,0 +1,471 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import asyncio +import random +import time +from collections import deque +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +import pytest +import torch.distributed as dist +from tqdm import tqdm + +from megatron.core.inference.data_parallel_inference_coordinator import ( + DataParallelInferenceCoordinator, +) +from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine, RequestEntry +from megatron.core.inference.inference_client import InferenceClient +from megatron.core.inference.inference_request import ( + DynamicInferenceRequest, + DynamicInferenceRequestRecord, + Status, +) +from megatron.core.inference.sampling_params import SamplingParams +from megatron.core.utils import get_asyncio_loop +from tests.unit_tests.test_utilities import Utils + +try: + import zmq + + HAVE_ZMQ = True +except Exception: + HAVE_ZMQ = False + +IS_ZMQ_FLAKY = True + + +class DummyContext: + """Dummy inference context.""" + + def __init__(self): + self.active_cnt = 0 + + def get_active_request_count(self) -> int: + return self.active_cnt + + +class DummyEngine(DynamicInferenceEngine): + """Dummy inference engine that only implements coordinator-related methods.""" + + def __init__(self): + """We cannot call super().__init__() because it requires complex setup.""" + self.waiting_request_ids = deque() + self.requests: Dict[int, RequestEntry] = {} + self.suspend_signal = False + self.is_suspended = False + self._loop = get_asyncio_loop() + self.context = DummyContext() + self.running = asyncio.Event() + self.paused = asyncio.Event() + self.stopped = asyncio.Event() + self.pending_microbatch = deque() + self.received_pause: bool = False + self.received_stop: bool = False + + def add_request( + self, request_id: int, prompt: str, sampling_params: Optional[SamplingParams] = None + ) -> asyncio.Future[DynamicInferenceRequestRecord]: + """Dummy add_request.""" + + self.requests[request_id] = RequestEntry( + record=DynamicInferenceRequestRecord.from_request( + DynamicInferenceRequest( + prompt=prompt, + request_id=request_id, + sampling_params=sampling_params, + status=Status.WAITING_IN_QUEUE, + ) + ), + future=self._loop.create_future(), + ) + self.waiting_request_ids.append(request_id) + + return self.requests[request_id].future + + async def async_step(self, *, verbose: Optional[bool] = False) -> Dict: + """Dummy async_step.""" + # Finish "active" requests. + finished_request_records = [] + to_remove = [] + for request_id, entry in self.requests.items(): + request = entry.record[-1] + if request.status == Status.ACTIVE_AND_GENERATING_TOKENS: + request.sampling_params.num_tokens_to_generate -= 1 + if request.sampling_params.num_tokens_to_generate > 0: + continue + request.status = Status.COMPLETED + self.context.active_cnt -= 1 + finished_request_records.append(entry.record) + entry.future.set_result(entry.record) + to_remove.append(request_id) + for request_id in to_remove: + del self.requests[request_id] + + # Activate queued requests. They will "process" for 1 step. + active_request_ids = [] + while self.waiting_request_ids: + request_id = self.waiting_request_ids.popleft() + record = self.requests[request_id].record + record[-1].status = Status.ACTIVE_AND_GENERATING_TOKENS + self.context.active_cnt += 1 + active_request_ids.append(request_id) + + return { + "active_request_ids": active_request_ids, + "finished_request_records": finished_request_records, + "step_time": 0.01, + "cuda_graph_request_count": 1, + } + + +@dataclass +class CoordinatorTestConfig: + """Test configuration args.""" + + port: int = 46581 + mp_port: int = 49581 + launch_inference_coordinator: bool = True + stop_engines: bool = True + verify_results: bool = True + + num_requests: int = 10**1 + min_time_offset: float = 10 ** (-4) + max_time_offset: float = 10 ** (-3) + num_steps_to_finish: int = 1 + num_iterations: int = 1 + + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + + +@dataclass +class CoordinatorTestEnv: + """Test environment, including requests.""" + + config: CoordinatorTestConfig + requests: List[Tuple] + engine: DummyEngine + responses: List[List[DynamicInferenceRequest]] = field(default_factory=list) + timing_data: Dict[str, Optional[float]] = field( + default_factory=lambda: { + "start_time": None, + "init_time": None, + "done_time": None, + "stop_time": None, + } + ) + + +class TestCoordinator: + + @classmethod + def _build_requests(cls, test_config: CoordinatorTestConfig) -> List[Tuple]: + ret = [] + + for _ in range(test_config.num_requests): + arrival_delta = random.uniform(test_config.min_time_offset, test_config.max_time_offset) + num_tokens = test_config.num_steps_to_finish + ret.append( + ("Hello world!", SamplingParams(num_tokens_to_generate=num_tokens), arrival_delta) + ) + return ret + + @classmethod + def _build_test_env(cls, test_config): + Utils.initialize_model_parallel( + tensor_model_parallel_size=test_config.tensor_model_parallel_size, + pipeline_model_parallel_size=test_config.pipeline_model_parallel_size, + ) + requests = cls._build_requests(test_config) + engine = DummyEngine() + engine.num_steps_to_finish = test_config.num_steps_to_finish + return CoordinatorTestEnv(config=test_config, requests=requests, engine=engine) + + @classmethod + async def _run_test(cls, **test_config_kwargs): + # Test environment. + test_config = CoordinatorTestConfig(**test_config_kwargs) + env = cls._build_test_env(test_config) + + # Connect each engine to their respective processes. + env.timing_data["start_time"] = time.time() + await env.engine.start_listening_to_data_parallel_coordinator( + inference_coordinator_port=test_config.port, + launch_inference_coordinator=test_config.launch_inference_coordinator, + ) + + results_success = False + shutdown_success = False + try: + if dist.get_rank() == 0: + client = InferenceClient(test_config.port) + await client.start() + env.timing_data["init_time"] = time.time() + + all_results = [] + for _ in range(test_config.num_iterations): + futures = [] + for request in tqdm(env.requests, "add_requests"): + prompt, sampling_params, arrival_delta = request + await asyncio.sleep(arrival_delta) + fut = client.add_request(prompt=prompt, sampling_params=sampling_params) + futures.append(fut) + results = await asyncio.wait_for(asyncio.gather(*futures), timeout=10.0) + all_results.append(results) + env.timing_data["done_time"] = time.time() + results_success = True + finally: + try: + if dist.get_rank() == 0: + if test_config.stop_engines: + await asyncio.wait_for(client.stop_engines(), timeout=10.0) + client.stop() + if test_config.stop_engines: + await asyncio.wait_for(env.engine.engine_loop_task, timeout=10.0) + shutdown_success = True + except: + env.engine.engine_loop_task.cancel() + + env.timing_data["stop_time"] = time.time() + + assert results_success, "Did not receive all results successfully." + assert shutdown_success, "Did not shutdown successfully." + if dist.get_rank() == 0: + env.responses = all_results + if test_config.verify_results: + for batch in all_results: + for record in batch: + request = record[-1] + assert request.status == Status.COMPLETED + + return env + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.asyncio + async def test_simple(self): + """Simple test with no TP or PP.""" + env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + + @pytest.mark.internal + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.asyncio + async def test_tp(self): + """Simple test with TP, but no PP.""" + env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + + @pytest.mark.internal + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.asyncio + async def test_pp(self): + """Simple test with no TP, but PP.""" + env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2) + + @pytest.mark.internal + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.asyncio + async def test_tp_pp(self): + """Simple test with both TP and PP.""" + env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2) + + @pytest.mark.internal + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.asyncio + async def test_pp(self): + """Simple test with no TP, but PP.""" + env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2) + + @pytest.mark.internal + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.asyncio + async def test_tp_pp(self): + """Simple test with both TP and PP.""" + env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2) + + @pytest.mark.internal + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.asyncio + async def test_pause(self): + """Pause/resume test.""" + test_config = CoordinatorTestConfig( + tensor_model_parallel_size=2, pipeline_model_parallel_size=1, num_requests=32 + ) + env = self._build_test_env(test_config) + + await env.engine.start_listening_to_data_parallel_coordinator( + inference_coordinator_port=test_config.port, launch_inference_coordinator=True + ) + + success = False + try: + if dist.get_rank() == 0: + # Start client as usual. + client = InferenceClient(test_config.port) + await client.start() + + ### TEST 1: Pause after all requests have finished. + futures = [] + for i, request in enumerate(env.requests[:2]): + prompt, sampling_params, _ = request + fut = client.add_request(prompt=prompt, sampling_params=sampling_params) + futures.append(fut) + # Wait a sufficient time for the requests to complete. + await asyncio.sleep(0.1) + # Get a pause awaitable. + to_pause = client.pause_engines() + awaitables = futures + [to_pause] + # Gather all awaitables; assert that the requests actually complete. + try: + await asyncio.wait_for(asyncio.gather(*awaitables), timeout=0.1) + except asyncio.TimeoutError: + pytest.fail("Simple pause did not succeed.") + + ### TEST 2: Ensure that requests can be added while paused. + prompt, sampling_params, _ = env.requests[2] + paused_fut = client.add_request(prompt=prompt, sampling_params=sampling_params) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(paused_fut, timeout=0.1) + + ### TEST 3: Resume after pause and drain the queued requests. + client.unpause_engines() + # TODO: The system should not be incorrectly raising a cancelled error here. + with pytest.raises(asyncio.CancelledError): + await paused_fut + + ### TEST 4: Add new requests after resume. + futures = [] + for i, request in enumerate(env.requests[3:4]): + prompt, sampling_params, _ = request + fut = client.add_request(prompt=prompt, sampling_params=sampling_params) + futures.append(fut) + # Wait a sufficient time for the requests to complete. + await asyncio.sleep(0.1) + # Gather all awaitables; assert that the requests actually complete. + try: + await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1) + except asyncio.TimeoutError: + pytest.fail("Simple resume did not succeed.") + + ### TEST 5: Pause while requests are being processed. + ### Note: this situation cannot occur in a synchronous system. + if False: + for request in env.engine.requests[4:6]: + request.sampling_params.num_tokens_to_generate = 100 + futures = [] + for i, request in enumerate(env.requests[4:6]): + prompt, sampling_params, _ = request + fut = client.add_request(prompt=prompt, sampling_params=sampling_params) + futures.append(fut) + # Do not wait for the requests to complete. + await client.pause_engines() + # Gather all awaitables; assert that the requests do not complete. + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1) + success = True + finally: + try: + if dist.get_rank() == 0: + await asyncio.wait_for(client.stop_engines(), timeout=5.0) + client.stop() + await asyncio.wait_for(env.engine.engine_loop_task, timeout=5.0) + except asyncio.TimeoutError: + env.engine.engine_loop_task.cancel() + assert success, "Pause/resume test did not complete successfully." + + @pytest.mark.internal + @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") + @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") + @pytest.mark.asyncio + async def test_throughput(self): + """Throughput test with no TP or PP.""" + import torch + import torch.distributed as dist + + env = await self._run_test( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + num_requests=10**4, + num_iterations=10, + min_time_offset=0.0, + max_time_offset=0.0, + ) + + flags = torch.tensor([1, 1, 1], dtype=torch.int, device=torch.cuda.current_device()) + + init_duration = golden_init_duration = None + run_duration = golden_run_duration = None + stop_duration = golden_stop_duration = None + + if dist.get_rank() == 0: + init_duration = (env.timing_data["init_time"] - env.timing_data["start_time"]) * 10**3 + golden_init_duration = 4445.64 # ms + run_duration = (env.timing_data["done_time"] - env.timing_data["init_time"]) * 10**3 + golden_run_duration = 2906.29 # ms + stop_duration = (env.timing_data["stop_time"] - env.timing_data["done_time"]) * 10**3 + golden_stop_duration = 33.17 # ms + + def clamp_to_golden_value(value, golden_value, delta=0.1): + return value > golden_value * (1 - delta) and value < golden_value * (1 + delta) + + if not clamp_to_golden_value(init_duration, golden_init_duration, delta=0.5): + flags[0] = 0 + if not clamp_to_golden_value(run_duration, golden_run_duration, delta=0.2): + flags[1] = 0 + if not clamp_to_golden_value(stop_duration, golden_stop_duration, delta=1.0): + flags[2] = 0 + + # Synchronize results + dist.broadcast(flags, src=0) + + if dist.get_rank() == 0: + # Print current results. + print(f"Initialization time: {init_duration:.2f} ms") + print(f"Run time: {run_duration:.2f} ms") + print(f"Stop time: {stop_duration:.2f} ms") + + assert flags[0].item() == 1, ( + f"WARNING: Init duration {init_duration:.2f}s deviates from " + f"golden value {golden_init_duration:.2f}s" + ) + assert flags[1].item() == 1, ( + f"WARNING: Run duration {run_duration:.2f}s deviates from " + f"golden value {golden_run_duration:.2f}s" + ) + assert flags[2].item() == 1, ( + f"WARNING: Stop duration {stop_duration:.2f}s deviates from " + f"golden value {golden_stop_duration:.2f}s" + ) + + print( + f"ZMQ throughput is approximately " + f"{env.config.num_requests * env.config.num_iterations / (run_duration):.2f} " + f"requests/ms" + ) + else: + assert flags[0].item() == 1 + assert flags[1].item() == 1 + assert flags[2].item() == 1 + + +if __name__ == "__main__": + test = TestCoordinator() + asyncio.run(test.test_simple()) + asyncio.run(test.test_tp()) + asyncio.run(test.test_pp()) + asyncio.run(test.test_tp_pp()) + asyncio.run(test.test_pause()) + asyncio.run(test.test_throughput()) + test.teardown_method(None) + print("~~~") + print("success.") diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py index 1512e805f9c..1d5d054b80e 100644 --- a/tests/unit_tests/inference/test_wandb_logging.py +++ b/tests/unit_tests/inference/test_wandb_logging.py @@ -50,7 +50,6 @@ def _get_dynamic_context( max_sequence_length=512, buffer_size_gb=0.03, block_size_tokens=128, - buffer_guaranteed_fraction=0.1, metrics_writer=None, ): """Helper to create a DynamicInferenceContext.""" @@ -62,9 +61,9 @@ def _get_dynamic_context( max_sequence_length=max_sequence_length, num_cuda_graphs=None, buffer_size_gb=buffer_size_gb, - buffer_guaranteed_fraction=buffer_guaranteed_fraction, block_size_tokens=block_size_tokens, metrics_writer=metrics_writer, + unified_memory_level=0, # unit tests currently broken with UVM ) @pytest.mark.internal @@ -83,12 +82,11 @@ def test_get_kvcache_utilization_stats_with_requests(self): assert 'active_utilization' in stats assert 'active_request_count' in stats assert 'paused_request_count' in stats - assert 'gtd_block_count' in stats assert 'block_count_avail' in stats - assert 'num_non_gtd_blocks' in stats assert 'active_token_count' in stats assert 'total_request_count' in stats - assert 'max_requests' in stats + assert 'max_total_requests' in stats + assert 'max_active_requests' in stats # Verify values for empty context assert stats['allocated_blocks'] == 0 @@ -134,12 +132,11 @@ def test_get_kvcache_utilization_stats_with_requests(self): assert stats_after['total_blocks'] == stats['total_blocks'] assert stats_after['total_blocks'] > 0 - # Verify that gtd_block_count remains constant - assert stats_after['gtd_block_count'] == stats['gtd_block_count'] - # Verify that max_requests remains constant - assert stats_after['max_requests'] == stats['max_requests'] - assert stats_after['max_requests'] > 0 + assert stats_after['max_total_requests'] == stats['max_total_requests'] + assert stats_after['max_total_requests'] > 0 + assert stats_after['max_active_requests'] == stats['max_active_requests'] + assert stats_after['max_active_requests'] > 0 # Verify block availability decreased after allocation assert stats_after['block_count_avail'] < stats['block_count_avail'] @@ -147,7 +144,7 @@ def test_get_kvcache_utilization_stats_with_requests(self): # Verify relationship: allocated_blocks + block_count_avail + 1 (dummy) = total assert ( stats_after['allocated_blocks'] + stats_after['block_count_avail'] + 1 - == dynamic_context.block_allocator.block_count_total + == dynamic_context.block_allocator.total_count ) # Verify utilization bounds [0, 1] @@ -180,12 +177,11 @@ def test_kvcache_utilization_stats_types(self): 'active_unique_blocks', 'active_request_count', 'paused_request_count', - 'gtd_block_count', 'block_count_avail', - 'num_non_gtd_blocks', 'active_token_count', 'total_request_count', - 'max_requests', + 'max_total_requests', + 'max_active_requests', ] for field in int_fields: @@ -240,8 +236,8 @@ def test_paused_requests_in_stats(self): max_sequence_length=128, num_cuda_graphs=None, buffer_size_gb=0.01, # Small buffer to force pausing - buffer_guaranteed_fraction=0.1, block_size_tokens=32, + unified_memory_level=0, # unit tests currently broken with UVM ) # Add multiple requests to potentially trigger pausing diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index 10ffe2fdd40..ee6bc5b2468 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -80,6 +80,9 @@ def setup_model( fp8="hybrid" if fp8 else None, fp8_recipe="tensorwise" if fp8 else None, fp8_param=fp8, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + pipeline_dtype=dtype, ) if dtype == torch.bfloat16: transformer_config.bf16 = True @@ -112,15 +115,15 @@ def setup_model( else: inference_context = DynamicInferenceContext( params_dtype=dtype, - num_layers=transformer_config.num_layers, + num_layers=transformer_config.num_layers // pipeline_model_parallel_size, kv_channels=transformer_config.kv_channels, num_attention_heads=transformer_config.num_attention_heads, max_sequence_length=2048, - buffer_size_gb=1, - buffer_guaranteed_fraction=0.1, + buffer_size_gb=0.2, materialize_only_last_token_logits=False, use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment + unified_memory_level=0, # unit tests currently broken with UVM ) inference_wrapped_model = GPTInferenceWrapper( @@ -228,41 +231,75 @@ def detokenize(self, inp, skip_special_tokens=False): sampled_logits >= expected_min_value ), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}" - def test_sample_from_dynamic_logits(self): + @pytest.mark.parametrize("backend", ["torch"]) + def test_sample_from_dynamic_logits(self, backend): batch_size = 12 self.setup_model(torch.float32, batch_size=batch_size, static=False) self.mock_tokenizer.eod = self.vocab_size - active_sampling_map: List[Tuple[SamplingParams, List[int]]] = [ - (SamplingParams(top_k=3), [0, 3, 2]), + context = self.text_generation_controller.inference_wrapped_model.inference_context + context.materialize_only_last_token_logits = True + + # Prepare sampling params in human-readable format, to aid with test maintenance. + sampling_test_cases: List[Tuple[SamplingParams, List[int]]] = [ + (SamplingParams(temperature=0.1, top_p=0.01), [9, 6, 10]), + (SamplingParams(temperature=5.0, top_k=15), [0, 3, 2]), (SamplingParams(top_p=0.8), [4, 1, 7]), - (SamplingParams(top_k=5), [11, 5, 8]), - # (SamplingParams(top_k=5, top_p=0.7), [11, 5, 8]), # uncomment for FlashInfer sampling - (SamplingParams(temperature=2.0), [9, 6, 10]), + (SamplingParams(temperature=10.0, top_k=5), [11, 5, 8]), ] - rev_sampling_map: List[SamplingParams] = [None] * batch_size - for sampling_params, indices in active_sampling_map: + # For non-torch backends, test simultaneous top_k and top_p sampling. + if backend != "torch": + sampling_test_cases[3][0].top_p = 0.8 + + # Convert sampling params to non-readable format. + rev_sampling_dict: List[SamplingParams] = [None] * batch_size + for sampling_params, indices in sampling_test_cases: for idx in indices: - rev_sampling_map[idx] = sampling_params + rev_sampling_dict[idx] = sampling_params - last_token_logits = torch.arange(0, self.vocab_size).repeat(batch_size, 1).float().cuda() - sampled_logits, _ = self.text_generation_controller.sample_from_dynamic_logits( - last_token_logits, active_sampling_map, vocab_size=self.vocab_size + # Prepare metadata for sample bookkeeping. + request_metadata_labels = DynamicInferenceRequest.get_metadata_labels() + request_metadata = torch.empty( + (batch_size, len(request_metadata_labels)), dtype=torch.float32 + ).cuda() + top_k_values = torch.Tensor([s.top_k for s in rev_sampling_dict]).cuda() + request_metadata[:, request_metadata_labels["top_k"]] = top_k_values + top_p_values = torch.Tensor([s.top_p for s in rev_sampling_dict]).cuda() + request_metadata[:, request_metadata_labels["top_p"]] = top_p_values + temp_values = torch.Tensor([s.temperature for s in rev_sampling_dict]).cuda() + request_metadata[:, request_metadata_labels["temperature"]] = temp_values + + # Bookkeeping. + self.text_generation_controller._dynamic_step_sample_bookkeeping( + request_metadata=request_metadata + ) + + # Sampling. + logits = torch.arange(0, self.vocab_size).repeat(batch_size, 1).unsqueeze(0).float().cuda() + sampled_logits = self.text_generation_controller._dynamic_step_sample_logits( + logits, backend=backend ) - top_k_values = torch.Tensor([s.top_k for s in rev_sampling_map]).cuda().unsqueeze(1) - top_k_values[top_k_values == 0] = self.vocab_size - top_p_values = torch.Tensor([s.top_p for s in rev_sampling_map]).cuda().unsqueeze(1) - temp_values = torch.Tensor([s.temperature for s in rev_sampling_map]).cuda().unsqueeze(1) vocab_indices = torch.arange(self.vocab_size).cuda() + top_k_values[top_k_values == 0] = self.vocab_size assert torch.all( sampled_logits >= self.vocab_size - top_k_values ), f"The sampled logits should all be greater than {self.vocab_size - top_k_values} but its {sampled_logits}" - l = last_token_logits[0] - sampled_l = l.div(temp_values).softmax(dim=-1) - top_k_mask = vocab_indices.unsqueeze(0) < (self.vocab_size - top_k_values) + l = logits.squeeze(0) + sampled_l = l.div(temp_values.unsqueeze(1)).softmax(dim=-1) + top_k_mask = vocab_indices.unsqueeze(0) < (self.vocab_size - top_k_values.unsqueeze(1)) sampled_l.masked_fill_(top_k_mask, 0.0) - expected_min_values = sampled_l[sampled_l.cumsum(dim=-1) > top_p_values].amax(dim=-1) + top_p_mask = sampled_l.cumsum(dim=-1) > top_p_values.unsqueeze(1) + + first_excluded = torch.where( + top_p_mask.any(dim=-1), + top_p_mask.float().argmax(dim=-1), + torch.full((batch_size,), self.vocab_size, device=top_p_mask.device), + ) + last_included = torch.clamp(first_excluded - 1, min=0) + start_idx = torch.clamp(self.vocab_size - top_k_values, min=0).long() + last_included = torch.max(last_included, start_idx) + expected_min_values = l.gather(1, last_included.unsqueeze(1)).squeeze(1) assert torch.all( sampled_logits >= expected_min_values ), f"The sampled logits should all be greater than {expected_min_values} but its {sampled_logits}" @@ -773,14 +810,15 @@ def test_sampled_tokens_match_with_parallelism(self, static, tp_size, pp_size): ), ) ) - sampling_params = SamplingParams(top_k=10, return_log_probs=True, termination_id=-1) - sampling_map = [(sampling_params, list(range(len(active_requests))))] + expected_active_requests = set(int(x) for x in active_requests.keys()) while context.has_unfinished_requests(): - result = self.text_generation_controller.generate_output_tokens_dynamic_batch( - active_sampling_map=sampling_map - ) + result = self.text_generation_controller.generate_output_tokens_dynamic_batch() new_tokens = result["sample"] - assert len(new_tokens) == len(active_requests) + active_ids = result["active_request_ids"].tolist() + finished_ids = result["finished_request_ids"].tolist() + assert len(new_tokens) == len(expected_active_requests) + assert set(active_ids) == expected_active_requests + expected_active_requests -= set(finished_ids) for i, token in enumerate(new_tokens.tolist()): all_generated_tokens[i].append(token) diff --git a/tests/unit_tests/test_checkpointing.py b/tests/unit_tests/test_checkpointing.py index 194f9721300..4bbf54301f5 100644 --- a/tests/unit_tests/test_checkpointing.py +++ b/tests/unit_tests/test_checkpointing.py @@ -9,6 +9,8 @@ import torch import torch.distributed.checkpoint +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel from megatron.core.num_microbatches_calculator import ( init_num_microbatches_calculator, unset_num_microbatches_calculator, @@ -23,6 +25,7 @@ _load_base_checkpoint, get_checkpoint_tracker_filename, load_checkpoint, + read_metadata, save_checkpoint, ) from megatron.training.global_vars import set_args @@ -51,6 +54,9 @@ def __init__(self, state_dict): self.is_stub_optimizer = False self._called_metadata = [] + # Optimizers are expected to have this attribute for checkpointing. + self.param_groups = [] + def state_dict(self, is_loading=False): return self._state_dict @@ -111,6 +117,8 @@ def create_args(): args.retro_add_retriever = False args.ckpt_convert_update_legacy_dist_opt_format = False args.ckpt_step = None + args.swiglu = True + args.num_experts = 1 yield args @@ -191,7 +199,7 @@ def test_load_base_checkpoint( assert ckpt_type == expected_ckpt_type -@pytest.mark.parametrize("ckpt_format", ["torch", "torch_dcp"]) +@pytest.mark.parametrize("ckpt_format", ["torch", "torch_dcp", "fsdp_dtensor"]) def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, ckpt_format): """Test save_checkpoint.""" args = create_args @@ -207,6 +215,15 @@ def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, c config = TransformerConfig(num_layers=1, kv_channels=1) model = MockModel(config) optimizer = MockState({"optimizer": "optimizer_state"}) + if ckpt_format == "fsdp_dtensor": + model = FullyShardedDataParallel( + config=config, + ddp_config=DistributedDataParallelConfig( + use_distributed_optimizer=True, use_megatron_fsdp=True + ), + module=model, + ) + optimizer = MockState({"state": {}}) opt_param_scheduler = MockState({"opt_param_scheduler": "scheduler_state"}) num_floating_point_operations_so_far = 456 @@ -226,7 +243,7 @@ def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, c expected_ckpt_path = None if ckpt_format == "torch": expected_ckpt_path = ckpt_dir / "mp_rank_00" / "model_optim_rng.pt" - elif ckpt_format == "torch_dcp": + elif ckpt_format in ["torch_dcp", "fsdp_dtensor"]: expected_ckpt_path = ckpt_dir / ".metadata" assert os.path.exists(expected_ckpt_path) @@ -337,3 +354,27 @@ def test_dist_checkpoint_versioning(init_model_parallel, tmp_path_dist_ckpt, cre first_job_mock_metadata, second_job_mock_metadata, ] + + +@pytest.mark.parametrize( + "metadata_content,expected_iter,expected_release", + [ + ("456", 456, False), # Normal iteration + ("release", 0, True), # Release checkpoint should return iteration=1 + ("123", 123, False), # Another normal iteration + ], +) +def test_read_metadata_non_distributed(tmp_path, metadata_content, expected_iter, expected_release): + """Test read_metadata without torch.distributed initialized.""" + test_dir = tmp_path / "test_read_metadata_non_distributed" + test_dir.mkdir(parents=True, exist_ok=True) + tracker_file = test_dir / "latest_checkpointed_iteration.txt" + + with open(tracker_file, "w") as f: + f.write(metadata_content) + + with mock.patch('torch.distributed.is_initialized', return_value=False): + max_iter, release = read_metadata(str(tracker_file)) + + assert max_iter == expected_iter, f"Expected iteration {expected_iter}, got {max_iter}" + assert release == expected_release, f"Expected release={expected_release}, got {release}" diff --git a/tests/unit_tests/test_process_groups_config.py b/tests/unit_tests/test_process_groups_config.py index 032de47e951..013bc6746d4 100644 --- a/tests/unit_tests/test_process_groups_config.py +++ b/tests/unit_tests/test_process_groups_config.py @@ -67,6 +67,29 @@ def test_hierarchical_context_parallel_groups(self, mocker): assert model_pgs.hcp[0] == mock_pg1 assert model_pgs.hcp[1] == mock_pg2 + def test_repr(self, mocker): + """Test __repr__ shows active process groups and their sizes.""" + tp_size = 4 + pp_size = 2 + mock_tp = mocker.Mock(spec=dist.ProcessGroup) + mock_tp.size.return_value = tp_size + mock_pp = mocker.Mock(spec=dist.ProcessGroup) + mock_pp.size.return_value = pp_size + + # Test empty collection + empty_pgs = ProcessGroupCollection() + assert repr(empty_pgs) == "ProcessGroupCollection(empty)" + + # Test collection with process groups + model_pgs = ProcessGroupCollection() + model_pgs.tp = mock_tp + model_pgs.pp = mock_pp + + repr_str = repr(model_pgs) + assert "ProcessGroupCollection(" in repr_str + assert f"tp({tp_size})" in repr_str + assert f"pp({pp_size})" in repr_str + class TestPGConfigDefaultInitialization: diff --git a/tests/unit_tests/test_rl_utils.py b/tests/unit_tests/test_rl_utils.py new file mode 100644 index 00000000000..5ea89ff2a02 --- /dev/null +++ b/tests/unit_tests/test_rl_utils.py @@ -0,0 +1,656 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +from unittest.mock import patch + +import torch + +from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.module import Float16Module +from megatron.rl import rl_utils +from megatron.rl.agent.api import TokenRollout +from megatron.training import arguments, global_vars +from tests.unit_tests.test_utilities import Utils + +BATCH = 2 +SEQ = 4 +VOCAB = 754 + + +class MockModel(LanguageModule): + def __init__(self, batch=BATCH, seq=SEQ, vocab=VOCAB): + self.batch = batch + self.seq = seq + self.vocab = vocab + self.config = TransformerConfig(num_attention_heads=1, num_layers=1) + + def __call__(self, x, position_ids, attention_mask, **kwargs): + del position_ids + del attention_mask + batch, seq = x.shape + mock_model_outputs = torch.ones((batch, seq, self.vocab), device=x.device) + return mock_model_outputs + + def load_state_dict(self, params): + del params + + def train(self, mode=True): + del mode + + def state_dict(self): + return {} + + +class MockTokenizer: + def __init__(self): + self.pad = 42 + self.eod = 43 + self.vocab_size = VOCAB + self.bos = None + + def detokenize(self, tokens): + return [str(tok) for tok in tokens] + + +def test_get_logprobs(): + """Test that getting logprobs at least does not crash.""" + # We use args inside of get_logprobs, we need to initialize them. + args = arguments.parse_args(ignore_unknown_args=True) + global_vars.set_args(args) + + tokens = torch.ones((BATCH, SEQ), dtype=torch.long) + logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None) + # We chop off 1 element from the sequence dimension. + assert logprobs.shape == (BATCH, SEQ - 1) + # As we return ones as logits, all logprobs should be the same. + assert torch.all(logprobs == logprobs[0, 0]).item() + + +def test_get_logprobs_with_sequence_packing(): + """Test that getting logprobs at least does not crash.""" + # We use args inside of get_logprobs, we need to initialize them. + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'rl_use_sequence_packing', True) + global_vars.set_args(args) + + tokens = torch.ones((BATCH, SEQ), dtype=torch.long) + logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None) + # We chop off 1 element from the sequence dimension. + assert logprobs.shape == (BATCH, SEQ - 1) + # As we return ones as logits, all logprobs should be the same. + assert torch.all(logprobs == logprobs[0, 0]).item() + + +def test_prepare_trajectories(): + # Make sure sequence packing is disabled for this test + import megatron.training.global_vars as global_vars + + old_args = global_vars.get_args() if global_vars.get_args() is not None else None + + # Create minimal args without sequence packing + args = type('Args', (), {})() + args.rl_use_sequence_packing = False + args.rl_inference_logprobs_is_correction = True + global_vars.set_args(args) + + tokenizer = MockTokenizer() + r1 = TokenRollout( + trajectory=[1, 2, tokenizer.eod], + reward=3.14, + generation_mask=[False, True, True], + logprobs=[0.1, 0.2, 0.3], + env_id='MEGAENV', + problem_id="2", + ) + r2 = TokenRollout( + trajectory=[1, 2, tokenizer.eod], + reward=0.14, + generation_mask=[False, True, True], + logprobs=[0.1, 0.2, 0.3], + env_id='MEGAENV', + problem_id="2", + ) + rollouts = [[r1, r2]] + seq_len = 7 + + trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) + + # Check that inference logprobs are being returned. + torch.testing.assert_close(inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3])) + torch.testing.assert_close(inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3])) + + expected_mask = torch.tensor( + [ + [False, True, True, False, False, False, False], + [False, True, True, False, False, False, False], + ] + ) + torch.testing.assert_close(genmask, expected_mask) + + expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 43, 42, 42, 42, 42]]) + torch.testing.assert_close(trajs, expected_trajs) + + +def test_prepare_trajectories_with_packing(): + """Test that rollouts data is properly prepared with sequence packing enabled.""" + # Initialize args for sequence packing + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'micro_batch_size', 1) + setattr(args, 'global_batch_size', 1) + setattr(args, 'rl_use_sequence_packing', True) + global_vars.set_args(args) + + tokenizer = MockTokenizer() + r1 = TokenRollout( + trajectory=[1, 2, tokenizer.eod], + reward=3.14, + generation_mask=[False, True, True], + logprobs=[0.1, 0.2, 0.3], + env_id='MEGAENV', + problem_id="2", + ) + r2 = TokenRollout( + trajectory=[1, 2, 3, tokenizer.eod], + reward=0.14, + generation_mask=[False, True, True, True], + logprobs=[0.1, 0.2, 0.3, -1.2], + env_id='MEGAENV', + problem_id="2", + ) + rollouts = [[r1, r2]] + seq_len = 7 + + trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) + + # With sequence packing, inference logprobs should be padded to same length + assert isinstance(inference_logprobs, torch.Tensor) + assert inference_logprobs.shape == (2, 7) # 2 sequences, each padded to seq_len + + # Check values (padded with zeros) + torch.testing.assert_close( + inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3, 0.0, 0.0, 0.0, 0.0]) + ) + torch.testing.assert_close( + inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3, -1.2, 0.0, 0.0, 0.0]) + ) + + expected_mask = torch.tensor( + [ + [False, True, True, False, False, False, False], + [False, True, True, True, False, False, False], + ] + ) + torch.testing.assert_close(genmask, expected_mask) + + expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 3, 43, 42, 42, 42]]) + torch.testing.assert_close(trajs, expected_trajs) + + +def test_grpo_loss_calculation_all_pi_eq(): + # All policies are equal: clamping is inactive, ratios are ones. + current_logprobs = torch.ones(BATCH, SEQ) + old_logprobs = torch.ones(BATCH, SEQ) + ref_logprobs = torch.ones(BATCH, SEQ) + advantages = torch.zeros(BATCH) + loss, kl_term, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss( + current_logprobs=current_logprobs, + old_logprobs=old_logprobs, + ref_logprobs=ref_logprobs, + advantages=advantages, + clamp_eps_lower=0.1, + clamp_eps_upper=0.1, + kl_beta=0.1, + entropy_weight=0.0, + ) + torch.testing.assert_close(loss, torch.zeros_like(loss)) + torch.testing.assert_close(kl_term, torch.zeros_like(kl_term)) + torch.testing.assert_close(ratios, torch.ones_like(ratios)) + torch.testing.assert_close(entropy_term, torch.ones_like(ratios) * torch.e) + + +def test_grpo_loss_calculation_2x_ratios(): + # All policies are equal: clamping is inactive, ratios are ones. + current_logprobs = torch.ones(BATCH, SEQ) + old_logprobs = torch.ones(BATCH, SEQ) - torch.log(torch.Tensor([2])) + ref_logprobs = torch.ones(BATCH, SEQ) + advantages = torch.ones(BATCH) + loss, kl_term, ratios, _, _, _ = rl_utils.calculate_grpo_loss( + current_logprobs=current_logprobs, + old_logprobs=old_logprobs, + ref_logprobs=ref_logprobs, + advantages=advantages, + clamp_eps_lower=2.1, + clamp_eps_upper=2.1, + kl_beta=0.0, + entropy_weight=0.0, + ) + # Clamping does not affect us, as 2.1 [eps] > 2 [ratio]. + # kl_beta = 0 -> we only have the non-kl term of the loss active. + torch.testing.assert_close(loss, -torch.ones_like(loss) * 2) + # pi and pi_{ref} are the same here. + torch.testing.assert_close(kl_term, torch.zeros_like(kl_term)) + # Current probs are 2x more probable than old pi. + torch.testing.assert_close(ratios, torch.ones_like(ratios) * 2) + + +def test_entropy_calculation(): + # All policies are equal: clamping is inactive, ratios are ones. + current_logprobs = torch.ones(BATCH, SEQ) + old_logprobs = torch.ones(BATCH, SEQ) + ref_logprobs = torch.ones(BATCH, SEQ) + advantages = torch.zeros(BATCH) + loss, _, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss( + current_logprobs=current_logprobs, + old_logprobs=old_logprobs, + ref_logprobs=ref_logprobs, + advantages=advantages, + clamp_eps_lower=0.1, + clamp_eps_upper=0.1, + kl_beta=0.0, + entropy_weight=1.0, + ) + torch.testing.assert_close(loss, torch.ones_like(ratios) * torch.e) + torch.testing.assert_close(entropy_term, torch.ones_like(ratios) * torch.e) + + +def test_grpo_loss_truncation(): + + # All ratios are 2 + _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( + current_logprobs=torch.ones(BATCH, SEQ), + old_logprobs=0.5 * torch.ones(BATCH, SEQ), + ref_logprobs=torch.ones(BATCH, SEQ), + advantages=torch.zeros(BATCH), + clamp_eps_lower=0.1, + clamp_eps_upper=0.1, + kl_beta=0.1, + entropy_weight=0.0, + ) + assert truncated_from_above.float().mean() == 1 + assert truncated_from_below.float().sum() == 0 + + # All ratios are 0.01 + _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( + current_logprobs=0.01 * torch.ones(BATCH, SEQ), + old_logprobs=torch.ones(BATCH, SEQ), + ref_logprobs=torch.ones(BATCH, SEQ), + advantages=torch.zeros(BATCH), + clamp_eps_lower=0.1, + clamp_eps_upper=0.1, + kl_beta=0.1, + entropy_weight=0.0, + ) + assert truncated_from_above.float().sum() == 0 + assert truncated_from_below.float().mean() == 1 + + current_logprobs = torch.tensor([[1.0, 1.0], [1.0, 1.0]]) + old_logprobs = torch.tensor([[0.5, 2.0], [0.05, 1.0]]) + _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( + current_logprobs=current_logprobs, + old_logprobs=old_logprobs, + ref_logprobs=old_logprobs, + advantages=torch.zeros(BATCH), + clamp_eps_lower=0.1, + clamp_eps_upper=0.1, + kl_beta=0.1, + entropy_weight=0.0, + ) + # ratios: [[2., 0.5],[20., 1.]] + torch.testing.assert_close(truncated_from_above, torch.tensor([[True, False], [True, False]])) + torch.testing.assert_close(truncated_from_below, torch.tensor([[False, True], [False, False]])) + + +@patch('megatron.rl.rl_utils.mpu') +def test_prepare_data_for_update(mock_mpu): + """Test that getting logprobs at least does not crash.""" + mock_mpu.get_expert_data_parallel_world_size.return_value = 0 + # We use args inside of get_logprobs, we need to initialize them. + + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'data_parallel_size', 1) + setattr(args, 'micro_batch_size', 2) + setattr(args, 'global_batch_size', 2) + setattr(args, 'seq_length', 4) + setattr(args, 'curr_iteration', 1) + global_vars.unset_global_variables() + global_vars.set_global_variables(args, build_tokenizer=False) + + model = MockModel() + tokenizer = MockTokenizer() + + r1 = TokenRollout( + trajectory=[1, 2, 3], + reward=3.14, + generation_mask=[False, True, True], + logprobs=[0.1, 0.2, 0.3], + env_id='MEGAENV', + problem_id="2", + ) + r2 = TokenRollout( + trajectory=[1, 2, 3, 4], + reward=0.14, + generation_mask=[False, True, True, True], + logprobs=[0.1, 0.2, 0.3, -1.2], + env_id='MEGAENV', + problem_id="2", + ) + rollouts = [[r1, r2]] + try: + data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) + except AssertionError as e: + # We expect trajectories to come padded there. + assert str(e).startswith('Rollout is not the correct length') + + r1 = TokenRollout( + trajectory=torch.Tensor([1, 2, 3, tokenizer.eod]).cuda(), + reward=3.14, + generation_mask=torch.Tensor([False, True, True, True]).cuda(), + logprobs=torch.Tensor([-0.2, -0.3, -3.2]).cuda(), + env_id='MEGAENV', + problem_id="2", + ) + r2 = TokenRollout( + trajectory=torch.Tensor([1, 2, 234, tokenizer.eod]).cuda(), + reward=0.14, + generation_mask=torch.Tensor([False, True, True, True]).cuda(), + logprobs=torch.Tensor([-0.2, -0.3, -1.2]), + env_id='MEGAENV', + problem_id="2", + ) + rollouts = [[r1, r2]] + data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) + + _, _, old_logprobs, _, _, _, _ = next(data_iter) + # All logits are ones in the MockModel. + # All probabilities should be uniform. + torch.testing.assert_close(old_logprobs.exp(), torch.ones_like(old_logprobs) / VOCAB) + + +def test_sequence_packing_basic(): + """Test basic sequence packing functionality.""" + # Initialize args as required by SequencePacker + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'seq_length', 16) + global_vars.set_args(args) + + tokenizer = MockTokenizer() + bin_size = 16 + packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) + + # Create test sequences of varying lengths, all padded to same length + max_len = 5 + sequences = [ + torch.cat( + [ + torch.tensor([1, 2, 3, tokenizer.eod]), + torch.full((1,), tokenizer.pad, dtype=torch.long), + ] + ), # length 4 -> 5 + torch.cat( + [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)] + ), # length 3 -> 5 + torch.tensor([6, 7, 8, 9, tokenizer.eod]), # length 5 + torch.cat( + [torch.tensor([10, tokenizer.eod]), torch.full((3,), tokenizer.pad, dtype=torch.long)] + ), # length 2 -> 5 + ] + + generation_masks = torch.tensor( + [ + [False, True, True, True, False], # Matches padded length + [False, True, True, False, False], + [False, True, True, True, True], + [False, True, False, False, False], + ] + ) + + rewards = torch.tensor([1.0, 2.0, 3.0, 4.0]) + + # Pack sequences + packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( + packer.pack_sequences(sequences, generation_masks) + ) + + # Verify packed data structure + assert packed_trajs is not None + assert packed_position_ids is not None + assert packed_attention_mask is not None + assert packed_loss_mask is not None + assert packing_info is not None + + # Check that sequences fit in bins properly + # The packer trims sequences to their actual length (removing padding) + # Actual lengths: 4, 3, 5, 2 = 14 total tokens + # With bin_size=16, this should fit in 1 bin + assert packed_trajs.shape[0] >= 1 # At least one bin + assert packed_trajs.shape[1] == bin_size + + # Verify position_ids are correct + for bin_idx in range(packed_trajs.shape[0]): + # Check that position_ids reset for each sequence in the bin + for i in range(packed_trajs.shape[1]): + if i == 0 or packed_trajs[bin_idx, i - 1] == tokenizer.eod: + # Start of a new sequence + if packed_trajs[bin_idx, i] != tokenizer.pad: + assert packed_position_ids[bin_idx, i] == 0 + + +def test_sequence_packing_with_generation_masks(): + """Test sequence packing with generation masks.""" + # Initialize args as required by SequencePacker + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'seq_length', 20) + global_vars.set_args(args) + + tokenizer = MockTokenizer() + bin_size = 20 + packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) + + # Create test data with generation masks + sequences = [torch.tensor([1, 2, 3, tokenizer.eod]), torch.tensor([4, 5, 6, 7, tokenizer.eod])] + + # Pad sequences to same length for stacking + max_len = max(len(s) for s in sequences) + padded_sequences = [] + for seq in sequences: + padded = torch.cat([seq, torch.full((max_len - len(seq),), tokenizer.pad, dtype=seq.dtype)]) + padded_sequences.append(padded) + + generation_masks = torch.tensor( + [ + [False, True, True, True, False], # Padded to match max_len + [False, True, True, True, True], + ] + ) + + # Pack sequences + packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( + packer.pack_sequences(padded_sequences, generation_masks) + ) + + # Verify packed tensors + assert packed_trajs.shape[0] == 1 # One bin + assert packed_trajs.shape[1] == bin_size + + # Check that loss mask is set correctly for generation tokens + # The loss mask should be 1 for generation tokens and 0 for padding/prompt + + +def test_sequence_packing_empty_bins(): + """Test that empty bins are created correctly.""" + # Initialize args if needed + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'seq_length', 8) + global_vars.set_args(args) + + tokenizer = MockTokenizer() + bin_size = 8 + num_empty_bins = 3 + + # Create a simple packed data structure + packed_trajs = torch.tensor( + [[1, 2, 3, tokenizer.eod, tokenizer.pad, tokenizer.pad, tokenizer.pad, tokenizer.pad]] + ) + packed_position_ids = torch.tensor([[0, 1, 2, 3, 0, 0, 0, 0]]) + packed_loss_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float) + packed_attention_mask = torch.ones(1, bin_size, bin_size) # Simple full attention mask + + # Create empty bins + empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info = ( + rl_utils.create_empty_bins( + num_empty_bins=num_empty_bins, + bin_size=bin_size, + packed_trajs=packed_trajs, + packed_position_ids=packed_position_ids, + packed_loss_mask=packed_loss_mask, + packed_attention_mask=packed_attention_mask, + tokenizer=tokenizer, + ) + ) + + # Verify shapes + assert empty_trajs.shape[0] == num_empty_bins + assert empty_trajs.shape[1] == bin_size + + # Check that empty bins are filled with padding + for i in range(num_empty_bins): + assert torch.all(empty_trajs[i] == tokenizer.pad) + assert torch.all(empty_position_ids[i] == 0) + assert torch.all(empty_loss_mask[i] == 0) + + # Verify packing info for empty bins + assert len(empty_packing_info) == num_empty_bins + for info in empty_packing_info: + assert len(info['bin_seq_indices']) == 0 # No sequences in empty bins + assert len(info['seq_starts']) == 0 # No sequence starts + + +def test_prepare_trajectories_with_sequence_packing(): + """Test prepare_trajectories with sequence packing enabled.""" + # Set up args with sequence packing + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'rl_use_sequence_packing', True) + setattr(args, 'rl_sequence_packing_bin_size', 16) + setattr(args, 'data_parallel_size', 1) + setattr(args, 'micro_batch_size', 2) + setattr(args, 'global_batch_size', 2) + setattr(args, 'seq_length', 16) + setattr(args, 'curr_iteration', 1) + global_vars.unset_global_variables() + global_vars.set_global_variables(args, build_tokenizer=False) + + tokenizer = MockTokenizer() + + # Create rollouts of varying lengths + r1 = TokenRollout( + trajectory=[1, 2, tokenizer.eod], + reward=3.14, + generation_mask=[False, True, True], + logprobs=[0.1, 0.2, 0.3], + env_id='MEGAENV', + problem_id="1", + ) + r2 = TokenRollout( + trajectory=[4, 5, 6, 7, tokenizer.eod], + reward=0.14, + generation_mask=[False, True, True, True, True], + logprobs=[0.4, 0.5, 0.6, 0.7, 0.8], + env_id='MEGAENV', + problem_id="2", + ) + r3 = TokenRollout( + trajectory=[8, 9, tokenizer.eod], + reward=2.71, + generation_mask=[False, True, True], + logprobs=[0.9, 1.0, 1.1], + env_id='MEGAENV', + problem_id="3", + ) + + rollouts = [[r1, r2, r3]] + seq_len = 16 + + # Call prepare_trajectories with sequence packing + trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) + + # With sequence packing enabled but called from prepare_trajectories, + # it might still return individual sequences (not packed into bins yet) + # because the actual packing happens later in prepare_data_for_update + assert trajs.shape[0] == 3 # Three sequences + assert trajs.shape[1] == seq_len + + # Verify that each sequence is properly padded + # Sequence 1: [1, 2, eod, pad] + padding + assert trajs[0, 0] == 1 + assert trajs[0, 1] == 2 + assert trajs[0, 2] == tokenizer.eod + assert trajs[0, 3] == tokenizer.pad + + # Sequence 2: [4, 5, 6, 7, eod, pad] + padding + assert trajs[1, 0] == 4 + assert trajs[1, 1] == 5 + assert trajs[1, 4] == tokenizer.eod + assert trajs[1, 5] == tokenizer.pad + + +def test_sequence_packing_integration(): + """Simple integration test for sequence packing - just verifies the packing works.""" + # Initialize minimal args needed for SequencePacker + args = arguments.parse_args(ignore_unknown_args=True) + setattr(args, 'seq_length', 16) + global_vars.set_args(args) + + tokenizer = MockTokenizer() + bin_size = 16 + + # Test that we can pack sequences and get expected outputs + packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) + + # Create test data - need to pad to same length for stacking + max_len = 5 + sequences = [ + torch.cat( + [ + torch.tensor([1, 2, 3, tokenizer.eod]), + torch.full((1,), tokenizer.pad, dtype=torch.long), + ] + ), # length 4 -> 5 + torch.cat( + [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)] + ), # length 3 -> 5 + torch.tensor([6, 7, 8, 9, tokenizer.eod]), # length 5 + ] + generation_masks = [ + torch.tensor([False, True, True, True, False]), + torch.tensor([False, True, True, False, False]), + torch.tensor([False, True, True, True, True]), + ] + + # Pack sequences + packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( + packer.pack_sequences(sequences, generation_masks) + ) + + # Basic assertions + assert packed_trajs is not None + assert packed_trajs.shape[1] == bin_size # Each bin should be bin_size + assert packed_position_ids.shape == packed_trajs.shape + assert packed_loss_mask.shape == packed_trajs.shape + + # Verify the sequences are packed correctly + # Total length: 4 + 3 + 5 = 12, should fit in 1 bin + assert packed_trajs.shape[0] == 1 + + # The packer sorts sequences by length (descending), so order is: seq3 (len 5), seq1 (len 4), seq2 (len 3) + expected_start = torch.tensor( + [6, 7, 8, 9, tokenizer.eod, 1, 2, 3, tokenizer.eod, 4, 5, tokenizer.eod] + ) + assert torch.all(packed_trajs[0, :12] == expected_start) + + # Rest should be padding + assert torch.all(packed_trajs[0, 12:] == tokenizer.pad) diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 4b4cfa567c5..6a155920e2f 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -417,7 +417,10 @@ def is_hybrid_ep_available(): return HAVE_HYBRIDEP -@pytest.mark.skipif(True, reason="Deep EP and Hybrid EP are not available") +@pytest.mark.skipif( + not is_deep_ep_available() and not is_hybrid_ep_available(), + reason="Deep EP and Hybrid EP are not available", +) class TestFlexDispatcher: def setup_method(self, method): pass diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py index 01e5ab58898..dda2b8284b3 100644 --- a/tools/run_inference_performance_test.py +++ b/tools/run_inference_performance_test.py @@ -24,9 +24,8 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_attr_wrapped_model +from megatron.core.utils import get_mamba_inference_state_config_from_model from model_provider import model_provider sys.path.append( @@ -89,14 +88,7 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs moe_pad_experts_for_cuda_graph_inference=args.moe_pad_experts_for_cuda_graph_inference, ) - # Layer type list for hybrid models - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if layer_type_list is not None and Symbols.MAMBA in layer_type_list: - (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() - else: - mamba_conv_states_shape = None - mamba_ssm_states_shape = None + mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) if args.engine_type == "static": inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) @@ -129,9 +121,7 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs block_size_tokens=args.inference_dynamic_batching_block_size, tensor_model_parallel_size=args.tensor_model_parallel_size, materialize_only_last_token_logits=not args.return_log_probs, - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, + mamba_inference_state_config=mamba_inference_state_config, cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, diff --git a/train_rl.py b/train_rl.py index 479498d392a..bf632d81e2c 100644 --- a/train_rl.py +++ b/train_rl.py @@ -191,7 +191,7 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False): seq_lengths = None attention_mask = None - if args.use_sequence_packing: + if args.rl_use_sequence_packing: # Get bin index from data iterator bin_tensor = batch_data[0] bin_idx = bin_tensor.item() From b9c48ecb99af17c659d6409c50ff2c81c81216e3 Mon Sep 17 00:00:00 2001 From: Michael Wojcikiewicz Date: Tue, 25 Nov 2025 17:12:23 -0500 Subject: [PATCH 155/248] adding action for checking whether PR author is nvidia employee or not for selecting ephemeral ci hosts (#2402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../check-nvidia-sso-membership/action.yml | 139 ++++++++++++++++++ .github/workflows/cicd-main.yml | 66 ++++----- 2 files changed, 166 insertions(+), 39 deletions(-) create mode 100644 .github/actions/check-nvidia-sso-membership/action.yml diff --git a/.github/actions/check-nvidia-sso-membership/action.yml b/.github/actions/check-nvidia-sso-membership/action.yml new file mode 100644 index 00000000000..71926c4547d --- /dev/null +++ b/.github/actions/check-nvidia-sso-membership/action.yml @@ -0,0 +1,139 @@ +name: 'Check NVIDIA SSO Membership' +description: 'Check if a GitHub username exists in the NVIDIA SSO users list from github-audits' +author: 'NVIDIA' + +inputs: + username: + description: 'GitHub username to check' + required: true + github_audits_repo: + description: 'Repository containing SSO users file' + required: false + default: 'NVIDIA-GitHub-Management/github-audits' + github_audits_version: + description: 'Release version tag' + required: false + default: 'v0.1.0' + sso_users_filename: + description: 'Filename of SSO users JSON' + required: false + default: 'users_sso.json' + github_token: + description: 'GitHub token with access to github-audits repo' + required: true + +outputs: + is_member: + description: 'Boolean - true if user is in NVIDIA SSO list, false otherwise' + value: ${{ steps.check-membership.outputs.is_member }} + is_org_member: + description: 'Boolean - true if user has NVIDIA or NVIDIA-NeMo in org_roles' + value: ${{ steps.check-membership.outputs.is_org_member }} + user_orgs: + description: 'Comma-separated list of orgs user is member of' + value: ${{ steps.check-membership.outputs.user_orgs }} + sso_file_available: + description: 'Boolean - true if SSO file was successfully downloaded' + value: ${{ steps.download-sso.outputs.sso_file_available }} + user_count: + description: 'Number of users in the SSO file (0 if download failed)' + value: ${{ steps.download-sso.outputs.user_count }} + +runs: + using: 'composite' + steps: + - name: Download NVIDIA SSO users from github-audits + id: download-sso + shell: bash + env: + GH_TOKEN: ${{ inputs.github_token }} + run: | + echo "Downloading ${{ inputs.sso_users_filename }} from ${{ inputs.github_audits_repo }} ${{ inputs.github_audits_version }} release..." + + # Download the release asset using gh CLI + gh release download ${{ inputs.github_audits_version }} \ + --repo ${{ inputs.github_audits_repo }} \ + --pattern ${{ inputs.sso_users_filename }} \ + --clobber 2>&1 || { + echo "ERROR: Failed to download ${{ inputs.sso_users_filename }} from github-audits release" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + } + + # Verify file was downloaded and is valid JSON + if [ ! -f ${{ inputs.sso_users_filename }} ]; then + echo "ERROR: ${{ inputs.sso_users_filename }} file not found after download" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + fi + + # Validate JSON structure + if ! jq -e 'type == "object"' ${{ inputs.sso_users_filename }} > /dev/null 2>&1; then + echo "ERROR: ${{ inputs.sso_users_filename }} is not a valid JSON object" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + fi + + USER_COUNT=$(jq 'length' ${{ inputs.sso_users_filename }}) + echo "Successfully downloaded ${{ inputs.sso_users_filename }} with $USER_COUNT NVIDIA SSO users" + echo "sso_file_available=true" >> $GITHUB_OUTPUT + echo "user_count=$USER_COUNT" >> $GITHUB_OUTPUT + + - name: Check if user is in SSO list + id: check-membership + shell: bash + run: | + USERNAME="${{ inputs.username }}" + SSO_FILE="${{ inputs.sso_users_filename }}" + + echo "Checking if $USERNAME is in NVIDIA SSO users list..." + + # Check if SSO file is available + if [ "${{ steps.download-sso.outputs.sso_file_available }}" != "true" ] || [ ! -f "$SSO_FILE" ]; then + echo "ERROR: $SSO_FILE not available - cannot check membership" + echo "is_member=false" >> $GITHUB_OUTPUT + echo "is_org_member=false" >> $GITHUB_OUTPUT + echo "user_orgs=" >> $GITHUB_OUTPUT + exit 0 + fi + + # Check if username exists as a key in the JSON object + if jq -e --arg user "$USERNAME" 'has($user)' "$SSO_FILE" > /dev/null 2>&1; then + echo "$USERNAME found in NVIDIA SSO users" + echo "is_member=true" >> $GITHUB_OUTPUT + + # Extract and check org membership + IS_ORG_MEMBER=$(jq -r --arg user "$USERNAME" ' + .[$user].org_roles // [] | + map(select(test("^(NVIDIA|NVIDIA-NeMo):Member$"))) | + length > 0 + ' "$SSO_FILE") + + USER_ORGS=$(jq -r --arg user "$USERNAME" ' + .[$user].org_roles // [] | + map(split(":")[0]) | + unique | + join(",") + ' "$SSO_FILE") + + echo "is_org_member=$IS_ORG_MEMBER" >> $GITHUB_OUTPUT + echo "user_orgs=$USER_ORGS" >> $GITHUB_OUTPUT + + if [ "$IS_ORG_MEMBER" == "true" ]; then + echo "$USERNAME is a member of NVIDIA or NVIDIA-NeMo org" + else + echo "$USERNAME has @nvidia.com email but is not in NVIDIA or NVIDIA-NeMo org (orgs: $USER_ORGS)" + fi + else + echo "$USERNAME NOT found in NVIDIA SSO users" + echo "is_member=false" >> $GITHUB_OUTPUT + echo "is_org_member=false" >> $GITHUB_OUTPUT + echo "user_orgs=" >> $GITHUB_OUTPUT + fi + +branding: + icon: 'shield' + color: 'green' diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b05b6c55b84..d76d68e463e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -20,8 +20,8 @@ on: branches: - dev - main - - "pull-request/[0-9]+" - - "deploy-release/*" + - 'pull-request/[0-9]+' + - 'deploy-release/*' merge_group: types: [checks_requested] workflow_dispatch: @@ -43,6 +43,8 @@ jobs: if: github.repository == 'NVIDIA/Megatron-LM' outputs: is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} + is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }} + selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }} permissions: issues: write pull-requests: write @@ -60,7 +62,14 @@ jobs: if: startsWith(github.ref, 'refs/heads/pull-request/') uses: nv-gha-runners/get-pr-info@main - - name: Check membership + - name: Check NVIDIA SSO membership + id: check-sso + uses: ./.github/actions/check-nvidia-sso-membership + with: + username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} + github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} + + - name: Set maintainer status id: check-membership env: IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} @@ -68,38 +77,15 @@ jobs: IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} run: | - PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} - + # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi - echo "Checking if $PR_AUTHOR is a repo collaborator..." - API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" - REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - $API_URL) - - echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." - API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" - ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - $API_URL) - - echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." - API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" - ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - $API_URL) - - if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then + # Use SSO membership check result + IS_MEMBER="${{ steps.check-sso.outputs.is_member }}" + if [ "$IS_MEMBER" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT else echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT @@ -112,7 +98,7 @@ jobs: with: issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} repository: ${{ github.repository }} - body-includes: "" + body-includes: '' - name: Delete comment uses: actions/github-script@v7 @@ -212,8 +198,8 @@ jobs: echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}" cicd-container-build: - needs: [pre-flight, cicd-wait-in-queue] - runs-on: nvidia-ci-aws-gpu-x8 + needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue] + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} if: | ( success() @@ -362,12 +348,13 @@ jobs: matrix: include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }} needs: + - is-not-external-contributor - pre-flight - cicd-wait-in-queue - cicd-container-build - cicd-parse-unit-tests - runs-on: nvidia-ci-aws-gpu-x8 - name: "${{ matrix.bucket }} - latest" + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + name: '${{ matrix.bucket }} - latest' if: | ( success() @@ -389,7 +376,7 @@ jobs: test_case: ${{ matrix.bucket }} tag: latest timeout: ${{ matrix.timeout || 30 }} - is_unit_test: "true" + is_unit_test: 'true' PAT: ${{ secrets.PAT }} container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} @@ -474,12 +461,13 @@ jobs: matrix: include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }} needs: + - is-not-external-contributor - pre-flight - cicd-wait-in-queue - cicd-parse-integration-tests - cicd-unit-tests-latest - runs-on: nvidia-ci-aws-gpu-x8 - name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + name: '${{ matrix.model }}/${{ matrix.test_case }} - latest' env: PIP_DISABLE_PIP_VERSION_CHECK: 1 PIP_NO_PYTHON_VERSION_WARNING: 1 @@ -502,7 +490,7 @@ jobs: model: ${{ matrix.model }} tag: latest timeout: ${{ matrix.timeout || 30 }} - is_unit_test: "false" + is_unit_test: 'false' PAT: ${{ secrets.PAT }} container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} From 3aa0c4e9e99c7f48517f41072cabcf1229259df9 Mon Sep 17 00:00:00 2001 From: Michael Wojcikiewicz Date: Wed, 26 Nov 2025 10:16:10 -0500 Subject: [PATCH 156/248] fix: exit failure when PR author is external contributor removed (#2410) --- .github/workflows/cicd-main.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index d76d68e463e..fe4da54df4f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -127,14 +127,6 @@ jobs: Thank you for your understanding. - - name: exit - run: | - if [ "${{ steps.check-membership.outputs.is_maintainer }}" == "true" ]; then - exit 0 - else - exit 1 - fi - pre-flight: needs: [is-not-external-contributor] if: github.repository == 'NVIDIA/Megatron-LM' From b750bdba73b87741c1d49c86f5cfb5c1015b86ce Mon Sep 17 00:00:00 2001 From: Michael Wojcikiewicz Date: Thu, 27 Nov 2025 15:57:44 -0500 Subject: [PATCH 157/248] fix: adding k8s taints for ephermeral jobs (#2420) --- .github/workflows/cicd-main.yml | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index fe4da54df4f..ef37210cea3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -201,6 +201,34 @@ jobs: && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: + - name: Taint node for job isolation + if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') + shell: bash + run: | + # Verify prerequisites + if [ -z "$NODE_NAME" ]; then + echo "ERROR: NODE_NAME not set" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not found" + exit 1 + fi + + # Apply taint + JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" + echo "=== Adding node taint for job isolation ===" + echo "Node: $NODE_NAME" + echo "Job ID: $JOB_ID" + + kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true + kubectl label node "$NODE_NAME" \ + "github.com/workflow=${GITHUB_WORKFLOW}" \ + "github.com/run-id=${GITHUB_RUN_ID}" \ + "github.com/job=${GITHUB_JOB}" \ + --overwrite=true + - name: Checkout uses: actions/checkout@v4 @@ -360,6 +388,34 @@ jobs: PIP_NO_PYTHON_VERSION_WARNING: 1 PIP_ROOT_USER_ACTION: ignore steps: + - name: Taint node for job isolation + if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') + shell: bash + run: | + # Verify prerequisites + if [ -z "$NODE_NAME" ]; then + echo "ERROR: NODE_NAME not set" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not found" + exit 1 + fi + + # Apply taint + JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" + echo "=== Adding node taint for job isolation ===" + echo "Node: $NODE_NAME" + echo "Job ID: $JOB_ID" + + kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true + kubectl label node "$NODE_NAME" \ + "github.com/workflow=${GITHUB_WORKFLOW}" \ + "github.com/run-id=${GITHUB_RUN_ID}" \ + "github.com/job=${GITHUB_JOB}" \ + --overwrite=true + - name: Checkout uses: actions/checkout@v4 - name: main @@ -473,6 +529,34 @@ jobs: && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: + - name: Taint node for job isolation + if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') + shell: bash + run: | + # Verify prerequisites + if [ -z "$NODE_NAME" ]; then + echo "ERROR: NODE_NAME not set" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + echo "ERROR: kubectl not found" + exit 1 + fi + + # Apply taint + JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" + echo "=== Adding node taint for job isolation ===" + echo "Node: $NODE_NAME" + echo "Job ID: $JOB_ID" + + kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true + kubectl label node "$NODE_NAME" \ + "github.com/workflow=${GITHUB_WORKFLOW}" \ + "github.com/run-id=${GITHUB_RUN_ID}" \ + "github.com/job=${GITHUB_JOB}" \ + --overwrite=true + - name: Checkout uses: actions/checkout@v4 - name: main From c12909b7b589d125bbcea88e07218404747d185f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 27 Nov 2025 23:10:12 +0100 Subject: [PATCH 158/248] ci: Enable functional tests (#2419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 57 ++++---- .github/workflows/cicd-main.yml | 19 ++- ...pt-dynamic-inference-with-coordinator.yaml | 7 +- .../recipes/gpt-dynamic-inference.yaml | 8 +- .../recipes/gpt-static-inference.yaml | 10 +- tests/test_utils/recipes/gpt.yaml | 124 +++++++++--------- .../recipes/mamba-dynamic-inference.yaml | 4 +- .../recipes/mamba-static-inference.yaml | 6 +- tests/test_utils/recipes/mamba.yaml | 10 +- .../recipes/moe-dynamic-inference.yaml | 6 +- .../recipes/moe-static-inference.yaml | 8 +- tests/test_utils/recipes/moe.yaml | 24 ++-- .../test_utils/recipes/multimodal-llava.yaml | 6 +- 13 files changed, 156 insertions(+), 133 deletions(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 8c6ca3a6865..5c35385b036 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -11,28 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -name: "Test Template" -description: "Template for running NeMo tests in a containerized environment" +name: 'Test Template' +description: 'Template for running NeMo tests in a containerized environment' inputs: container-image: - description: "Container image to use for test" + description: 'Container image to use for test' required: true timeout: - description: "Max runtime of test in minutes" + description: 'Max runtime of test in minutes' required: false - default: "30" + default: '30' script: - description: "Test script to execute" + description: 'Test script to execute' required: true is-optional: - description: "Pass this job on failure." + description: 'Pass this job on failure.' required: false - default: "false" + default: 'false' is_unit_test: - description: "Upload coverage as unit test" + description: 'Upload coverage as unit test' required: false - default: "false" + default: 'false' tag: description: Latest or legacy test suite required: true @@ -43,11 +43,11 @@ inputs: description: Model to launch required: false PAT: - description: "GitHub Personal Access Token" + description: 'GitHub Personal Access Token' required: true runs: - using: "composite" + using: 'composite' steps: - name: Checkout repository uses: actions/checkout@v2 @@ -114,6 +114,16 @@ runs: HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false" echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Has Run functional tests label + shell: bash -x -e -u -o pipefail {0} + id: has-run-functional-tests-label + env: + GH_TOKEN: ${{ github.token }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false" + echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Create run-script (e2e test) shell: bash -x -e -u -o pipefail {0} if: inputs.is_unit_test == 'false' @@ -126,16 +136,19 @@ runs: set -euxo pipefail if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then - ARGS=( - --scope mr-github - --enable-lightweight-mode - ) - else - ARGS=( - --scope mr-slim - --enable-lightweight-mode - ) - fi + ARGS=( + --scope mr-github + --enable-lightweight-mode + ) + elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then + ARGS=( + --scope mr-github + ) + else + ARGS=( + --scope mr-github-slim + ) + fi export PYTHONPATH=$(pwd) export NEMORUN_HOME=$(pwd) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ef37210cea3..2fb08030686 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -259,8 +259,6 @@ jobs: - name: Download test data shell: bash - env: - GH_TOKEN: ${{ secrets.PAT }} run: | echo "::group::Download test data" pip install --no-cache-dir pygithub click @@ -463,10 +461,20 @@ jobs: HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false" echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Has Run functional tests label + id: has-run-functional-tests-label + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false" + echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT + - name: Parse functional tests id: main env: HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }} + HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main }} run: | export PYTHONPATH=$(pwd) @@ -475,10 +483,13 @@ jobs: --scope mr-github --enable-lightweight-mode ) + elif [ "$HAS_RUN_FUNCTIONAL_TESTS_LABEL" == "true" ]; then + ARGS=( + --scope mr-github + ) else ARGS=( - --scope mr-slim - --enable-lightweight-mode + --scope mr-github-slim ) fi diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index 6a3d582d3ae..e882d721860 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: gpt build: mcore-pyt-{environment} nodes: 1 @@ -67,15 +67,14 @@ products: - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq] products: - environment: [dev] scope: [flaky] - diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml index 66fa6887de8..a3853c3d9e1 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: gpt build: mcore-pyt-{environment} nodes: 1 @@ -62,15 +62,15 @@ products: - test_case: [gpt_dynamic_inference_tp8_pp1_583m_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/gpt-static-inference.yaml index 033c6c35116..39c2c3c934e 100644 --- a/tests/test_utils/recipes/gpt-static-inference.yaml +++ b/tests/test_utils/recipes/gpt-static-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: gpt build: mcore-pyt-{environment} nodes: 1 @@ -57,20 +57,20 @@ products: - test_case: [gpt_static_inference_tp1_pp1_583m_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_583m_cudagraphs] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 34030e4923a..eae09a6e16a 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -110,14 +110,14 @@ products: - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: @@ -129,201 +129,201 @@ products: - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # - environment: [lts] # scope: [nightly] # Non-deterministic: #487 - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # outdated TE: #501 - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #436 - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # non-determinism: #437 - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # Hangs: #513 # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # Hangs: #513 - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied] products: # - environment: [dev] - # scope: [mr, mr-github] # Hangs: #513 + # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap] products: # - environment: [dev] - # scope: [mr, mr-github] # Hangs: #513 + # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_cp2] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -331,14 +331,14 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] @@ -350,96 +350,96 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_mla] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader] products: # - environment: [dev] - # scope: [mr, mr-github] # Hangs: #513 + # scope: [mr] # Hangs: #513 # platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_mcore_tp2_pp2_uninstall_te] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] scope: [nightly] - test_case: [gpt3_7b_tp1_pp4_memory_speed] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 - test_case: [gpt3_7b_tp4_pp1_memory_speed] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # OOM: #434 - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] # Outdated: #502 # - test_case: [gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist] # products: # - environment: [dev] - # scope: [mr, mr-github] # Broken: #484 + # scope: [mr] # Broken: #484 # - environment: [lts] # scope: [nightly] # Requires PyT 2.4: #481 ####################################################################### @@ -455,57 +455,57 @@ products: # - test_case: [gpt3_mcore_reruns_persistent_2] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # - environment: [lts] # scope: [nightly] - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [lts] - scope: [mr, mr-github] + scope: [mr] - environment: [dev] - scope: [mr, mr-github, mr-slim] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] - scope: [mr, mr-github] + scope: [mr] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [lts] - scope: [mr, mr-github] + scope: [mr] - environment: [dev] - scope: [mr, mr-github, mr-slim] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - environment: [lts] - scope: [mr, mr-github] + scope: [mr] # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_a100, dgx_h100] # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap] # products: @@ -555,4 +555,4 @@ products: # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te] # products: # - environment: [dev, lts] - # scope: [mr, mr-github] # Non-deterministic: #483 + # scope: [mr] # Non-deterministic: #483 diff --git a/tests/test_utils/recipes/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/mamba-dynamic-inference.yaml index 9ca1bab4402..0d02ce29a54 100644 --- a/tests/test_utils/recipes/mamba-dynamic-inference.yaml +++ b/tests/test_utils/recipes/mamba-dynamic-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: hybrid build: mcore-pyt-{environment} nodes: 1 @@ -57,5 +57,5 @@ products: - test_case: [hybrid_dynamic_inference_tp1_pp1_dp8_583m] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml index 06107618916..9645b1b0b8a 100644 --- a/tests/test_utils/recipes/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/mamba-static-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: hybrid build: mcore-pyt-{environment} nodes: 1 @@ -57,10 +57,10 @@ products: - test_case: [hybrid_static_inference_tp1_pp1_2B_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/mamba.yaml index bb742200d26..92b799d3d1c 100644 --- a/tests/test_utils/recipes/mamba.yaml +++ b/tests/test_utils/recipes/mamba.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: hybrid build: mcore-pyt-{environment} nodes: 1 @@ -58,7 +58,7 @@ products: - test_case: [hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] @@ -67,14 +67,14 @@ products: # - test_case: [hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] @@ -82,7 +82,7 @@ products: - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - environment: [lts] # disabled until triton is bumped # scope: [nightly] diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/moe-dynamic-inference.yaml index 9bb23f8a322..6d8fdc533e1 100644 --- a/tests/test_utils/recipes/moe-dynamic-inference.yaml +++ b/tests/test_utils/recipes/moe-dynamic-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: moe build: mcore-pyt-{environment} nodes: 1 @@ -57,10 +57,10 @@ products: - test_case: [gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr-broken, mr-github] + scope: [mr-broken] platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml index 136606d0955..9cebb66f2e2 100644 --- a/tests/test_utils/recipes/moe-static-inference.yaml +++ b/tests/test_utils/recipes/moe-static-inference.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: moe build: mcore-pyt-{environment} nodes: 1 @@ -57,15 +57,15 @@ products: - test_case: [gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 2d4e8c4c94c..285d16c99f3 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: moe build: mcore-pyt-{environment} nodes: 1 @@ -84,27 +84,27 @@ products: - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # hang: #513 # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # hang: #513 - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] products: @@ -114,12 +114,12 @@ products: - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] # - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon] # products: @@ -152,12 +152,12 @@ products: # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM] # products: # - environment: [dev] - # scope: [mr, mr-github] + # scope: [mr] # platforms: [dgx_h100] ########################### # Merge train tests # @@ -165,12 +165,12 @@ products: - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer] products: - environment: [dev] - scope: [mr, mr-github, mr-slim] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] - scope: [mr, mr-github, mr-slim] + scope: [mr] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] products: diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml index 0e199764c09..72702de33c5 100644 --- a/tests/test_utils/recipes/multimodal-llava.yaml +++ b/tests/test_utils/recipes/multimodal-llava.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: "{test_case}_{environment}_{platforms}" + name: '{test_case}_{environment}_{platforms}' model: multimodal-llava build: mcore-pyt-{environment} nodes: 1 @@ -61,10 +61,10 @@ products: - test_case: [multimodal_llava_mcore_te_tp1_pp1] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] - test_case: [multimodal_llava_mcore_te_tp4_sp_cp2] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr] platforms: [dgx_h100] From 44933d7cc202e0eb197936231ceaf9c6f3d8518c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 28 Nov 2025 00:24:49 +0100 Subject: [PATCH 159/248] Reapply "build: Upgrade deps (NVIDIA#2289)" (#2408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 +- .gitlab/scripts/build.sh | 5 +- docker/Dockerfile.ci.dev | 1 + .../core/dist_checkpointing/exchange_utils.py | 2 +- megatron/core/dist_checkpointing/mapping.py | 2 +- .../core/dist_checkpointing/validation.py | 2 +- pyproject.toml | 35 +- .../download_unit_tests_dataset.py | 205 +- tests/unit_tests/conftest.py | 9 +- uv.lock | 2832 ++++++++--------- 10 files changed, 1376 insertions(+), 1719 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2fb08030686..7043e022c95 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -261,7 +261,7 @@ jobs: shell: bash run: | echo "::group::Download test data" - pip install --no-cache-dir pygithub click + pip install --no-cache-dir click requests python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets echo "::endgroup::" diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh index 960af104628..e64434e834d 100644 --- a/.gitlab/scripts/build.sh +++ b/.gitlab/scripts/build.sh @@ -7,9 +7,9 @@ eval "IMAGE=\$$IMAGE" # Start a named container in detached mode docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity' docker cp tests/. download_test_data:/workdir/tests -docker exec -e GH_TOKEN=$GH_TOKEN download_test_data bash -c ' +docker exec download_test_data bash -c ' ls -al /workdir/ - pip install --no-cache-dir pygithub click + pip install --no-cache-dir click requests python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets ' docker cp download_test_data:/workdir/assets ./ @@ -50,6 +50,7 @@ DOCKER_BUILDKIT=1 docker build \ --builder=container \ --build-arg JET_API_VERSION=$JET_API_VERSION \ --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:dev \ --cache-from type=registry,ref=${IMAGE}-buildcache:main \ --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ --push \ diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 6596fc01aaf..482c6af460c 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -36,6 +36,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --no-install-package torch \ --no-install-package torchvision \ --no-install-package triton \ + --no-install-package transformer-engine-cu12 \ --no-install-package nvidia-cublas-cu12 \ --no-install-package nvidia-cuda-cupti-cu12 \ --no-install-package nvidia-cuda-nvrtc-cu12 \ diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py index def79fb778e..2f791449057 100644 --- a/megatron/core/dist_checkpointing/exchange_utils.py +++ b/megatron/core/dist_checkpointing/exchange_utils.py @@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple): def _shard_size(sh_ten: ShardedTensor): """Returns size in bytes of a given sharded tensor.""" if sh_ten.flattened_range is None: - numel = np.product(sh_ten.local_shape) + numel = np.prod(sh_ten.local_shape) else: numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start return numel * torch._utils._element_size(sh_ten.dtype) diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py index d38ea57eee0..45a105666ab 100644 --- a/megatron/core/dist_checkpointing/mapping.py +++ b/megatron/core/dist_checkpointing/mapping.py @@ -216,7 +216,7 @@ def local_coordinates(self) -> Tuple[np.ndarray, ...]: ) # TODO: np.unravel_index? - mask = np.zeros(np.product(self.local_shape), dtype=bool) + mask = np.zeros(np.prod(self.local_shape), dtype=bool) mask[self.flattened_range] = True return np.nonzero(mask.reshape(self.local_shape)) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py index 96945055319..9bcb59bdbf4 100644 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard): all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) starts, stops = map(np.asarray, zip(*sorted(all_slices))) - expected_size = np.product(local_shape) + expected_size = np.prod(local_shape) if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]): raise CheckpointingException( f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}" diff --git a/pyproject.toml b/pyproject.toml index 7f734927c1a..553f898ae6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dynamic = ["version", "readme"] description = "Megatron Core - a library for efficient and scalable training of transformer based models" requires-python = ">=3.10" license = { text = "Apache 2.0" } -dependencies = ["torch", "numpy<2.0.0", "packaging>=24.2"] +dependencies = ["torch", "numpy", "packaging>=24.2"] authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }] maintainers = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }] keywords = [ @@ -67,37 +67,44 @@ Homepage = "https://github.com/NVIDIA/Megatron-LM/megatron/core" mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"] dev = [ - "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'", - "transformer-engine[pytorch]>=2.9.0a0,<2.10.0", - "nvidia-resiliency-ext>=0.4.0a0,<0.5.0", + "nvidia-modelopt[torch]; sys_platform != 'darwin'", + "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.10.0", + "nvidia-resiliency-ext", "tqdm", "einops~=0.8", "tensorstore~=0.1,!=0.1.46,!=0.1.72", "nvtx~=0.2", "multi-storage-client~=0.27", "opentelemetry-api~=1.33.1", - "setuptools<80.0.0", "mamba-ssm~=2.2", "causal-conv1d~=1.5", "nv-grouped-gemm~=1.1", "megatron-energon[av_decode]~=6.0", - "av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12 + "av", "flashinfer-python", "wget", "onnxscript", "flash-linear-attention~=0.3.2", "emerging_optimizers", + "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0 ] lts = [ "tqdm", - "einops", - "tensorstore!=0.1.46,!=0.1.72", - "nvtx", - "transformers", - "zarr", - "setuptools<80.0.0", + "einops~=0.8", + "tensorstore~=0.1,!=0.1.46,!=0.1.72", + "nvtx~=0.2", + "multi-storage-client~=0.27", + "opentelemetry-api~=1.33.1", + "mamba-ssm~=2.2", + "causal-conv1d~=1.5", + "nv-grouped-gemm~=1.1", + "megatron-energon[av_decode]~=6.0", + "av", + "flashinfer-python", "wget", + "onnxscript", + "fastapi~=0.50", # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0 ] [dependency-groups] @@ -141,7 +148,7 @@ linting = [ "pylint==3.2.6", ] ci = ["python-gitlab", "slack-sdk", "pandas"] -flash_mla = ["flash_mla"] +no_pypi_wheels = ["flash_mla", "emerging_optimizers"] [tool.uv] default-groups = ["linting", "build", "test"] @@ -168,7 +175,7 @@ override-dependencies = [ flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] -transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9` +# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9` nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" } emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "v0.1.0" } diff --git a/tests/test_utils/python_scripts/download_unit_tests_dataset.py b/tests/test_utils/python_scripts/download_unit_tests_dataset.py index 04470c2f820..a29394c29de 100644 --- a/tests/test_utils/python_scripts/download_unit_tests_dataset.py +++ b/tests/test_utils/python_scripts/download_unit_tests_dataset.py @@ -1,21 +1,35 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + #!/usr/bin/env python3 """ Script to fetch the oldest release of NVIDIA/Megatron-LM on GitHub and list its assets. Uses the PyGithub SDK to interact with the GitHub API. """ -import os -import sys +import logging import tarfile import zipfile from pathlib import Path import click import requests -from github import Github +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +ASSETS = [ + { + "name": "datasets.zip", + "url": "https://github.com/NVIDIA/Megatron-LM/releases/download/v2.5/datasets.zip", + }, + { + "name": "tokenizers.zip", + "url": "https://github.com/NVIDIA/Megatron-LM/releases/download/v2.5/tokenizers.zip", + }, +] -def download_and_extract_asset(asset_url: str, asset_name: str, assets_dir: Path) -> bool: + +def download_and_extract_asset(assets_dir: Path) -> bool: """ Download and extract an asset to the assets directory. @@ -27,144 +41,43 @@ def download_and_extract_asset(asset_url: str, asset_name: str, assets_dir: Path Returns: bool: True if successful, False otherwise """ - try: - # Download the asset - print(f" Downloading {asset_name}...") - response = requests.get(asset_url, stream=True) - response.raise_for_status() - - # Save to temporary file - temp_file = assets_dir / asset_name - with open(temp_file, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - - print(f" Extracting {asset_name} to {assets_dir}...") - - # Extract based on file type - if asset_name.endswith('.zip'): - with zipfile.ZipFile(temp_file, 'r') as zip_ref: - zip_ref.extractall(assets_dir) - elif asset_name.endswith(('.tar.gz', '.tgz')): - with tarfile.open(temp_file, 'r:gz') as tar_ref: - tar_ref.extractall(assets_dir) - elif asset_name.endswith('.tar'): - with tarfile.open(temp_file, 'r') as tar_ref: - tar_ref.extractall(assets_dir) - else: - print(f" Warning: Unknown file type for {asset_name}, skipping extraction") - return False - - # Clean up temporary file - temp_file.unlink() - print(f" Successfully extracted to {assets_dir}") - return True - - except Exception as e: - print(f" Error downloading/extracting {asset_name}: {e}") - return False - - -def get_oldest_release_and_assets( - repo_name: str = "NVIDIA/Megatron-LM", assets_dir: str = "assets" -) -> None: - """ - Fetch the oldest release of a GitHub repository and list its assets. - - Args: - repo_name: The repository name in format "owner/repo" - assets_dir: Directory to extract assets to - """ - try: - # Initialize GitHub client - g = Github(login_or_token=os.getenv('GH_TOKEN', None)) - - # Get the repository - repo = g.get_repo(repo_name) - print(f"Repository: {repo.full_name}") - print(f"Description: {repo.description}") - print(f"URL: {repo.html_url}") - print("-" * 80) - - # Get all releases - releases = list(repo.get_releases()) - - if not releases: - print("No releases found for this repository.") - return - - # Sort releases by creation date to find the oldest - releases.sort(key=lambda x: x.created_at) - oldest_release = releases[0] - - print(f"Oldest Release:") - print(f" Tag: {oldest_release.tag_name}") - print(f" Title: {oldest_release.title}") - print(f" Created: {oldest_release.created_at}") - print(f" Published: {oldest_release.published_at}") - print(f" Draft: {oldest_release.draft}") - print(f" Prerelease: {oldest_release.prerelease}") - print(f" URL: {oldest_release.html_url}") - - if oldest_release.body: - print(f" Description: {oldest_release.body[:200]}...") - - print("-" * 80) - - # List assets - assets = list(oldest_release.get_assets()) - - if not assets: - print("No assets found for this release.") - return - - print(f"Assets ({len(assets)} total):") - print("-" * 80) - - for i, asset in enumerate(assets, 1): - print(f"{i}. {asset.name}") - print(f" Size: {asset.size} bytes ({asset.size / 1024 / 1024:.2f} MB)") - print(f" Downloads: {asset.download_count}") - print(f" Content Type: {asset.content_type}") - print(f" URL: {asset.browser_download_url}") - print(f" Created: {asset.created_at}") - print(f" Updated: {asset.updated_at}") - print() - - # Summary - total_size = sum(asset.size for asset in assets) - total_downloads = sum(asset.download_count for asset in assets) - - print(f"Summary:") - print(f" Total assets: {len(assets)}") - print(f" Total size: {total_size} bytes ({total_size / 1024 / 1024:.2f} MB)") - print(f" Total downloads: {total_downloads}") - - # Download and extract assets if requested - if assets: - print("-" * 80) - print("Downloading and extracting assets...") - - # Create assets directory - assets_path = Path(assets_dir) - assets_path.mkdir(parents=True, exist_ok=True) - print(f"Created assets directory: {assets_path.absolute()}") - - successful_downloads = 0 - for asset in assets: - print(f"\nProcessing asset: {asset.name}") - if download_and_extract_asset(asset.browser_download_url, asset.name, assets_path): - successful_downloads += 1 - - print(f"\nDownload Summary:") - print( - f" Successfully downloaded and extracted: {successful_downloads}/{len(assets)} assets" - ) - print(f" Assets directory: {assets_path.absolute()}") - - except Exception as e: - print(f"Error: {e}") - sys.exit(1) + for asset in ASSETS: + asset_name, asset_url = asset.values() + try: + # Download the asset + logger.info(f" Downloading {asset_name}...") + response = requests.get(asset_url, stream=True) + response.raise_for_status() + + # Save to temporary file + temp_file = assets_dir / asset_name + with open(temp_file, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + logger.info(f" Extracting {asset_name} to {assets_dir}...") + + # Extract based on file type + if asset_name.endswith('.zip'): + with zipfile.ZipFile(temp_file, 'r') as zip_ref: + zip_ref.extractall(assets_dir) + elif asset_name.endswith(('.tar.gz', '.tgz')): + with tarfile.open(temp_file, 'r:gz') as tar_ref: + tar_ref.extractall(assets_dir) + elif asset_name.endswith('.tar'): + with tarfile.open(temp_file, 'r') as tar_ref: + tar_ref.extractall(assets_dir) + else: + logger.warning( + f" Warning: Unknown file type for {asset_name}, skipping extraction" + ) + + # Clean up temporary file + temp_file.unlink() + logger.info(f" Successfully extracted to {assets_dir}") + + except Exception as e: + logger.error(f" Error downloading/extracting {asset_name}: {e}") @click.command() @@ -174,10 +87,12 @@ def get_oldest_release_and_assets( @click.option('--assets-dir', default='assets', help='Directory to extract assets to') def main(repo, assets_dir): """Fetch the oldest release of a GitHub repository and download its assets.""" - print(f"Fetching oldest release of {repo}...") - print("=" * 80) + logger.info(f"Fetching oldest release of {repo}...") + logger.info("=" * 80) + + Path(assets_dir).mkdir(parents=True, exist_ok=True) - get_oldest_release_and_assets(repo_name=repo, assets_dir=assets_dir) + download_and_extract_asset(Path(assets_dir)) if __name__ == "__main__": diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index 611f9ae6098..e251a3c1e7e 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -1,5 +1,6 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import os -import sys from pathlib import Path import pytest @@ -8,9 +9,7 @@ from megatron.core import config from megatron.core.utils import is_te_min_version -from tests.test_utils.python_scripts.download_unit_tests_dataset import ( - get_oldest_release_and_assets, -) +from tests.test_utils.python_scripts.download_unit_tests_dataset import download_and_extract_asset from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils @@ -83,7 +82,7 @@ def ensure_test_data(): try: # Download assets to /opt/data - get_oldest_release_and_assets(assets_dir=str(data_path)) + download_and_extract_asset(assets_dir=str(data_path)) print("Test data downloaded successfully.") diff --git a/uv.lock b/uv.lock index f636a791f12..af8e548b625 100644 --- a/uv.lock +++ b/uv.lock @@ -2,50 +2,16 @@ version = 1 revision = 2 requires-python = ">=3.10" resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", + "python_full_version < '3.11' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform != 'linux'", ] conflicts = [[ { package = "megatron-core", extra = "dev" }, @@ -82,7 +48,7 @@ wheels = [ [[package]] name = "aiobotocore" -version = "2.25.1" +version = "2.26.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -91,11 +57,11 @@ dependencies = [ { name = "jmespath" }, { name = "multidict" }, { name = "python-dateutil" }, - { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" } }, + { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/62/94/2e4ec48cf1abb89971cb2612d86f979a6240520f0a659b53a43116d344dc/aiobotocore-2.25.1.tar.gz", hash = "sha256:ea9be739bfd7ece8864f072ec99bb9ed5c7e78ebb2b0b15f29781fbe02daedbc", size = 120560, upload-time = "2025-10-28T22:33:21.787Z" } +sdist = { url = "https://files.pythonhosted.org/packages/4d/f8/99fa90d9c25b78292899fd4946fce97b6353838b5ecc139ad8ba1436e70c/aiobotocore-2.26.0.tar.gz", hash = "sha256:50567feaf8dfe2b653570b4491f5bc8c6e7fb9622479d66442462c021db4fadc", size = 122026, upload-time = "2025-11-28T07:54:59.956Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/2a/d275ec4ce5cd0096665043995a7d76f5d0524853c76a3d04656de49f8808/aiobotocore-2.25.1-py3-none-any.whl", hash = "sha256:eb6daebe3cbef5b39a0bb2a97cffbe9c7cb46b2fcc399ad141f369f3c2134b1f", size = 86039, upload-time = "2025-10-28T22:33:19.949Z" }, + { url = "https://files.pythonhosted.org/packages/b7/58/3bf0b7d474607dc7fd67dd1365c4e0f392c8177eaf4054e5ddee3ebd53b5/aiobotocore-2.26.0-py3-none-any.whl", hash = "sha256:a793db51c07930513b74ea7a95bd79aaa42f545bdb0f011779646eafa216abec", size = 87333, upload-time = "2025-11-28T07:54:58.457Z" }, ] [[package]] @@ -229,11 +195,11 @@ wheels = [ [[package]] name = "aioitertools" -version = "0.12.0" +version = "0.13.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/06/de/38491a84ab323b47c7f86e94d2830e748780525f7a10c8600b67ead7e9ea/aioitertools-0.12.0.tar.gz", hash = "sha256:c2a9055b4fbb7705f561b9d86053e8af5d10cc845d22c32008c43490b2d8dd6b", size = 19369, upload-time = "2024-09-02T03:33:40.349Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/3c/53c4a17a05fb9ea2313ee1777ff53f5e001aefd5cc85aa2f4c2d982e1e38/aioitertools-0.13.0.tar.gz", hash = "sha256:620bd241acc0bbb9ec819f1ab215866871b4bbd1f73836a55f799200ee86950c", size = 19322, upload-time = "2025-11-06T22:17:07.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl", hash = "sha256:fc1f5fac3d737354de8831cbba3eb04f79dd649d8f3afb4c5b114925e662a796", size = 24345, upload-time = "2024-09-02T03:34:59.454Z" }, + { url = "https://files.pythonhosted.org/packages/10/a1/510b0a7fadc6f43a6ce50152e69dbd86415240835868bb0bd9b5b88b1e06/aioitertools-0.13.0-py3-none-any.whl", hash = "sha256:0be0292b856f08dfac90e31f4739432f4cb6d7520ab9eb73e143f4f2fa5259be", size = 24182, upload-time = "2025-11-06T22:17:06.502Z" }, ] [[package]] @@ -269,11 +235,11 @@ wheels = [ [[package]] name = "annotated-doc" -version = "0.0.3" +version = "0.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/a6/dc46877b911e40c00d395771ea710d5e77b6de7bacd5fdcd78d70cc5a48f/annotated_doc-0.0.3.tar.gz", hash = "sha256:e18370014c70187422c33e945053ff4c286f453a984eba84d0dbfa0c935adeda", size = 5535, upload-time = "2025-10-24T14:57:10.718Z" } +sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/b7/cf592cb5de5cb3bade3357f8d2cf42bf103bbe39f459824b4939fd212911/annotated_doc-0.0.3-py3-none-any.whl", hash = "sha256:348ec6664a76f1fd3be81f43dffbee4c7e8ce931ba71ec67cc7f4ade7fbbb580", size = 5488, upload-time = "2025-10-24T14:57:09.462Z" }, + { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, ] [[package]] @@ -308,44 +274,38 @@ wheels = [ [[package]] name = "apache-tvm-ffi" -version = "0.1.1" +version = "0.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d8/e8/7db1ca6db40877d190a8538cc378f740aae247c6fe063815898607c2d2ca/apache_tvm_ffi-0.1.1.tar.gz", hash = "sha256:728ce3f4ae02b89a7147b718f7f670afac3c6d1f96df38d488757274643709fc", size = 1259223, upload-time = "2025-11-04T02:43:38.154Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/ad/550aff4c9652ee8297f90a04c3ab4143ece1d373101010d85b5c9a9a2e7d/apache_tvm_ffi-0.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0de7bb9581ac9e090276cba37c4e7ffaeed601a2b2b546bf0e2daed3810cec", size = 1723658, upload-time = "2025-11-04T02:42:37.628Z" }, - { url = "https://files.pythonhosted.org/packages/48/5a/01e65f4a6c2b146f7c40f6d8d663d76b60c3be324159f8fb8223ea505738/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb7d6828652803cb8c0e13d1f06d01fc6bfb8e79e77e3de7e6fd4b5fae5ee9d2", size = 1882437, upload-time = "2025-11-04T02:42:39.647Z" }, - { url = "https://files.pythonhosted.org/packages/6b/bd/b52b71d03637d7a82388c2e90d48dddec2c46121be1333c9851d6a135824/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1fe072b55a7949720a792a9d455c0659aa097825e709a16a4667d720137b8b5c", size = 1954949, upload-time = "2025-11-04T02:42:41.119Z" }, - { url = "https://files.pythonhosted.org/packages/ac/ef/ff85926928694785f2399a4c5b793bcfecf8c3cf806dedf9202b7db73b8b/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b25178b265903dabd9a35bd767db26928be3b7869f681fe1d6e1aed93d7c0799", size = 1837395, upload-time = "2025-11-04T02:42:42.954Z" }, - { url = "https://files.pythonhosted.org/packages/de/69/f048bda5e5445a89200737062a202cb39097d3b1902e886654de9cd6b624/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5552af3c625750361d1b7d646d499a28caf94858967e74c9cce6ed7d4629b28", size = 1947740, upload-time = "2025-11-04T02:42:44.49Z" }, - { url = "https://files.pythonhosted.org/packages/dc/df/295f71613502edeb39a39b30c8bbb9ec8fcc06bd95b3043dd99b55fa98a8/apache_tvm_ffi-0.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:c102ba5899ce106c8068a3f21155c106790b5b0141fba52a52ed6e9aeb286aff", size = 1710966, upload-time = "2025-11-04T02:42:46.037Z" }, - { url = "https://files.pythonhosted.org/packages/8f/a9/544767d7058f825c0ceb5bc25760ad3a821b2efcc6a3dbe2e3988a3aee86/apache_tvm_ffi-0.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7cbf31c472920cdc5b3f75f2d2720b8a6b37ddbdb11d573fa94524815ea5a144", size = 1725662, upload-time = "2025-11-04T02:42:47.528Z" }, - { url = "https://files.pythonhosted.org/packages/54/c3/fe1a9f8968d5ce2d3b674e397c2bf01961e32a72b723817478c67c9780e3/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7602bc37019387a4705677b6e742059c7e1973a899b6918af235febcb3d3b47", size = 1884278, upload-time = "2025-11-04T02:42:48.998Z" }, - { url = "https://files.pythonhosted.org/packages/24/b9/80cbba18b2d7d9013031d8c13671986912275b9ca6aaea70a1dd9b361c39/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7941f82a2ae4549f55c07d82d37c5765628d70f29dace98628393fcea525e870", size = 1957018, upload-time = "2025-11-04T02:42:50.538Z" }, - { url = "https://files.pythonhosted.org/packages/b4/0c/d27beb98d6841a3929468648433ed2c53e4da953fadb73c754b9372b2356/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e0d6d8e0888ee3a3defd2cbe1eff7a65c05900b4e8fa0e18c890048fc6a44a6", size = 1839279, upload-time = "2025-11-04T02:42:52.438Z" }, - { url = "https://files.pythonhosted.org/packages/0f/10/d7cf7779c65047ad2ca652234a174c2908d936cb69bc4f5156e17382fa91/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:549c2150e1c2d7ca7912cad173f62a192aec90cd981c024bd246161283ea5d78", size = 1950476, upload-time = "2025-11-04T02:42:54.159Z" }, - { url = "https://files.pythonhosted.org/packages/53/71/bb5ee4bca52a37a8f9580ab1f1de1be5366808a194981c324a756dabbe15/apache_tvm_ffi-0.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:3fbcfe526b458bc8edeafdc769388782d3bb4321c46a987e50bcece93ae78af8", size = 1711278, upload-time = "2025-11-04T02:42:55.56Z" }, - { url = "https://files.pythonhosted.org/packages/d1/1e/f8d16dbe2303d1e7348037b4207d6c1093c554573484c97c8f3cde61a060/apache_tvm_ffi-0.1.1-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:f2c0164a5c6286f9c333ddedeb448b855cbc1225688d0a4c9aeab006ddfa1180", size = 1701072, upload-time = "2025-11-04T02:42:57.28Z" }, - { url = "https://files.pythonhosted.org/packages/3d/47/f7a55e9b5b741f901ed9101a3ef46fd250f2c1519a6479e055432ff4f308/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:33cc35738e0c44f2a67e550457b6b7dc7de9109ca64422a9e7063b1ba43c336e", size = 1854467, upload-time = "2025-11-04T02:43:00.158Z" }, - { url = "https://files.pythonhosted.org/packages/f2/db/f3adbe1e2d092fbb18908971a25ceb5496669ec65d01a28b7dd57f471ae0/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9db6484259120b1bdc600f736084ee3d574775b1f4a3e8fef110323e3a9d2b6", size = 1930968, upload-time = "2025-11-04T02:43:01.96Z" }, - { url = "https://files.pythonhosted.org/packages/3b/da/7f678675ccc8af1c7d313322f3875e2c829f1faaa58c0d982431beeb3b3e/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7bd812058ce9046cb69fd7b3e18538d1d0eefa1719822a1441b00bb841f7af4", size = 1811173, upload-time = "2025-11-04T02:43:03.404Z" }, - { url = "https://files.pythonhosted.org/packages/e1/11/c8b3b7d69ceebd219dcb06f5e4a3997edea3bc2e0bbdd8f57ae65bba4f2f/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:807def3039fb336a228c120ca8c32eb794bdfd2d7aff218c8611f287ad913736", size = 1922690, upload-time = "2025-11-04T02:43:04.846Z" }, - { url = "https://files.pythonhosted.org/packages/fd/0b/f816735d761049e53eb388264238655f58fcb42a31e0d1848a4fb6a6556b/apache_tvm_ffi-0.1.1-cp312-abi3-win_amd64.whl", hash = "sha256:624b4430ca3949f85fffd9ef498ebaf1155ff0ac659fc764eec6c6fd66ec7986", size = 1690969, upload-time = "2025-11-04T02:43:06.581Z" }, - { url = "https://files.pythonhosted.org/packages/12/aa/df81df8f8b39d3c41fbac41b1e6661d192d9987a3ef317fabcefecf727a6/apache_tvm_ffi-0.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c93d9de81c1ba9560fcc696cf84d777f88016eb53f05ee2d6288ddcb95a5e72f", size = 1732582, upload-time = "2025-11-04T02:43:08.042Z" }, - { url = "https://files.pythonhosted.org/packages/a8/55/861090532e4accd855e119f0e67e0e482b42abb866c9505edd8956148ebc/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f9e0227179a0ce83384132b34757fd05f492270f1c031eae615870a5641b5039", size = 1870196, upload-time = "2025-11-04T02:43:09.911Z" }, - { url = "https://files.pythonhosted.org/packages/2a/c6/470493934559e371ad699e1764649176efc5e022267c6dd0a565217177ad/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:78e75e193d675b9639e6fd0c33c60c3a4259d4c9f848f60baa6a3194df7e1fea", size = 1941999, upload-time = "2025-11-04T02:43:11.467Z" }, - { url = "https://files.pythonhosted.org/packages/85/b8/84eba0d266c9b10beae59a6863ef5c68044e20a6f12d46a42116e80db774/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49792622720421525a18e378d848411731d32fcb05a00b6e54b84d05ff46cc22", size = 1823965, upload-time = "2025-11-04T02:43:12.941Z" }, - { url = "https://files.pythonhosted.org/packages/64/73/ca73a43260a1374b1f34d0e6fcf6f8af16f66867a89dfd562b26184af1bd/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:039293086d44e7f601bf8231e369198afe7ad38986330969ddb1a5fc7622976b", size = 1933779, upload-time = "2025-11-04T02:43:14.543Z" }, - { url = "https://files.pythonhosted.org/packages/5b/91/687c3b9ff3313addeebc1188ac50b299a82944ef1784b91890fc6f250ebd/apache_tvm_ffi-0.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:3f6cbd214bee2e52719d5264f05a2685c955ae7b096980f0361d917a5a9f47a6", size = 1751905, upload-time = "2025-11-04T02:43:16.286Z" }, -] - -[[package]] -name = "asciitree" -version = "0.3.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/885bc91484e1aa8f618f6f0228d76d0e67000b0fdd6090673b777e311913/asciitree-0.3.3.tar.gz", hash = "sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e", size = 3951, upload-time = "2016-09-05T19:10:42.681Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/f0/af641a18833f35b37f01ecbdbf9baa0095805475adf8cd52ebeb7698fa8c/apache_tvm_ffi-0.1.3.tar.gz", hash = "sha256:d33f0bc0d028cddf321d69724c916504272a7f03dfc1d8e507d9d0f88b6f7cbf", size = 1276869, upload-time = "2025-11-21T05:11:00.562Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/13/ad0af6fb5203df6c92e404c5465d44a60bae7de0741a93fb1a3b4829692e/apache_tvm_ffi-0.1.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8999f431b3acd04a2d79f38e2ebfbb089d0f43ed87528674d7bda6d3f796ddc", size = 1743043, upload-time = "2025-11-21T05:10:05.255Z" }, + { url = "https://files.pythonhosted.org/packages/3d/64/f362d0010daacea93a928de0c31df6b7d40ef8cd57e9117535ee0adc2704/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:81f187d08d9040ec98b22fb6906c68b1df60b41567f2b507293f53f630b0136f", size = 1895551, upload-time = "2025-11-21T05:10:07.223Z" }, + { url = "https://files.pythonhosted.org/packages/f1/98/daa0f491312ebe4dccc7d84799c0b5b1bc5eee6b1093208a4fbb98175579/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dacfd2974a60a6b531a5fe8a3985f60368fc88a8ab3872c381fc1a80315d3d24", size = 1969790, upload-time = "2025-11-21T05:10:09.032Z" }, + { url = "https://files.pythonhosted.org/packages/87/9c/68e30812874e60b141b99202dd3c4e4de964a7cb62cf6455de170b3a5111/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff65bf8a96dbbd2725937ff1502e52571e7a90d81d355a21a303328dd06449cc", size = 1844888, upload-time = "2025-11-21T05:10:10.871Z" }, + { url = "https://files.pythonhosted.org/packages/49/97/ffe70c4679aebef0c1e32eec3970dc7e35113995d318aeb8c2ef0e4a3eb9/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:48ad3df2224f1b0943344895c6cba2f3f0a53bc67ddafdd3e9d7a34f56100aa9", size = 1953886, upload-time = "2025-11-21T05:10:12.55Z" }, + { url = "https://files.pythonhosted.org/packages/a6/f3/e03e5716a4e025d060585a9ca3123ce76e13dff8f464cda4d5e48ef9a26a/apache_tvm_ffi-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:6d56b2026aa614bd56d20375e5062ddb8d4baebd7a6b93476bbe3f0339cfa095", size = 1725820, upload-time = "2025-11-21T05:10:14.043Z" }, + { url = "https://files.pythonhosted.org/packages/8f/f0/d19a0b8e97e102f8376e18cd8234cc0a5f37d5c935ce74bf587e15f8450e/apache_tvm_ffi-0.1.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fae211bb8693c118109e106b73393164e3ca878823185cfd6e03765e04056f37", size = 1742398, upload-time = "2025-11-21T05:10:15.384Z" }, + { url = "https://files.pythonhosted.org/packages/5b/0c/699e26a3b7db2c1627ac87335deccf8a8b6cb2e218766fe9acd5aadb5f78/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:79ff39b5d6a2ed8665f4b91282391a052e8c7c76ac0f12f776ad0747f212f201", size = 1895272, upload-time = "2025-11-21T05:10:17.164Z" }, + { url = "https://files.pythonhosted.org/packages/22/39/f64a1f1a23dc3298d3f50ceb275eb9b98b6898ea3df52e6d95fed756610c/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e2cc20f00d98e263ca35fef9a139fe65992988deddd570498ff77c11780ce22e", size = 1969033, upload-time = "2025-11-21T05:10:18.855Z" }, + { url = "https://files.pythonhosted.org/packages/51/dc/fb9e25b83a57ae7b4df7308d839febf13d2e77b481ea79800e89f1eee470/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b2d1c8c421aaa0685fcc77347566da68e45d8d2dc150c2ee957906b1186d62", size = 1844972, upload-time = "2025-11-21T05:10:20.201Z" }, + { url = "https://files.pythonhosted.org/packages/63/f2/ef1521e617254c2fe38b2f60440694de426b2402b225e1cc4ae04e9a22c2/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:adbc2f3b496d67199adaa999baecb9a3c9137cf1fc32163a4834950062bd0dd7", size = 1954220, upload-time = "2025-11-21T05:10:21.571Z" }, + { url = "https://files.pythonhosted.org/packages/96/7c/1cadf17119f75b4d22761f8c003a767e63d456aac3f738ae42403ef7d990/apache_tvm_ffi-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:d797b29f70ea8c1843f4141a6b12b9770579a2b770f76898a96b721d2f987a23", size = 1725528, upload-time = "2025-11-21T05:10:23.043Z" }, + { url = "https://files.pythonhosted.org/packages/21/b4/9983c1df90d239cc15055469c795a894bab85ffd75f9325d2f5e392dbf09/apache_tvm_ffi-0.1.3-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:71d1de0c139cae3824c1e8b511acf6b2bfd37deccfc640cb83b80ba17b33d6e3", size = 1719369, upload-time = "2025-11-21T05:10:24.768Z" }, + { url = "https://files.pythonhosted.org/packages/01/e3/1b47af4391863351d9db42ab1ed116e3eba2c4ef49c1e161e4cd0ba379d9/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b0bc38da581c54c862840960c5bf0da5bb78aa007630d6f026675d1d4b1df898", size = 1867353, upload-time = "2025-11-21T05:10:26.481Z" }, + { url = "https://files.pythonhosted.org/packages/0a/6e/0d12246b90534be733accdfbfe6e2d5bde8d7c722293c21821fe10b09412/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:48160e8fa0235e8f3fad45102c4e856edb798c8b2954603f80f6721e3c0fd7ef", size = 1945829, upload-time = "2025-11-21T05:10:27.831Z" }, + { url = "https://files.pythonhosted.org/packages/2d/89/c4ad96b76a6e2d38795871bfb048c74aa60d1a7c01fab48cbe4e8c10f1a2/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1c215d4608e17d7f2382f3c6b2903a4696255727ac905041f3a005c50a98afc", size = 1817481, upload-time = "2025-11-21T05:10:29.543Z" }, + { url = "https://files.pythonhosted.org/packages/7b/c7/2f6bc83fcc987c2eb00037c3f27f1d182c2f0d8976a16807ef1395a8ece1/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b75cc773bc29db64bb69f11d260ec66e88ad0a4a951d25650f69d3b2c9f9a186", size = 1927595, upload-time = "2025-11-21T05:10:30.882Z" }, + { url = "https://files.pythonhosted.org/packages/12/a0/597c522588abef7fcf3fe38492cf832eed8ba9123f01d3c33dfaec174dcc/apache_tvm_ffi-0.1.3-cp312-abi3-win_amd64.whl", hash = "sha256:86fd1e1012ec2ec25213f714f5f28e6f6b897360776872d5f71c4be8cae8aeb8", size = 1706236, upload-time = "2025-11-21T05:10:32.25Z" }, + { url = "https://files.pythonhosted.org/packages/3e/76/8404875ee3fb61a3c97026e2eaab8d97e7f974601e444d5abb37a765c686/apache_tvm_ffi-0.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0ef290a792d6e3734e2fe1ff19b2b82e6bd3af6714216c7fe32d0a39c0d0e8df", size = 1750006, upload-time = "2025-11-21T05:10:33.594Z" }, + { url = "https://files.pythonhosted.org/packages/98/98/7989ccb343044f97491cb1e46e675da75defc82a56495c320dcb1e31583b/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7b137ab0c7ec6507f61e88885ddbd3541d7d14d8ca25938f5fa106ca06996d3", size = 1880792, upload-time = "2025-11-21T05:10:35.239Z" }, + { url = "https://files.pythonhosted.org/packages/64/2e/f772e75f947ebfa2faa305980ba2c172ae26a53f66c8f0c1f8915c4fa690/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5187a90cf1c0663b8071f34f621f49ba83866412298deed9c4a94d1d991711b", size = 1953343, upload-time = "2025-11-21T05:10:36.879Z" }, + { url = "https://files.pythonhosted.org/packages/c2/a8/7d1d75f70d5a2cd283ded60784d9657c59fa7516f4b3c32437f70901d117/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54001ceab111e708a1638fd9e40713d9d55f6a073037a2d4a9f1982f8dda3c69", size = 1829560, upload-time = "2025-11-21T05:10:38.421Z" }, + { url = "https://files.pythonhosted.org/packages/21/3a/6bee12cf517ace0bb8fd83bb72f6ca227743a49bab0c30918f523b5428df/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:996d87d6f180250e734ce7b7cce39f234e3ad3369fffb3882c8f29c79d280db4", size = 1937457, upload-time = "2025-11-21T05:10:40.505Z" }, + { url = "https://files.pythonhosted.org/packages/5c/99/107f082536447dba2a628e1571dd423b577df6bd8e441896e3f8b0929001/apache_tvm_ffi-0.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:6010c918c62fb19995e70c4f149dfc5c248783da0d22d5c40e84649bd89a9357", size = 1766053, upload-time = "2025-11-21T05:10:41.859Z" }, +] [[package]] name = "astroid" @@ -379,52 +339,59 @@ wheels = [ [[package]] name = "av" -version = "15.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e9/c3/83e6e73d1592bc54436eae0bc61704ae0cff0c3cfbde7b58af9ed67ebb49/av-15.1.0.tar.gz", hash = "sha256:39cda2dc810e11c1938f8cb5759c41d6b630550236b3365790e67a313660ec85", size = 3774192, upload-time = "2025-08-30T04:41:56.076Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/6a/91e3e68ae0d1b53b480ec69a96f2ae820fb007bc60e6b821741f31c7ba4e/av-15.1.0-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:cf067b66cee2248220b29df33b60eb4840d9e7b9b75545d6b922f9c41d88c4ee", size = 21781685, upload-time = "2025-08-30T04:39:13.118Z" }, - { url = "https://files.pythonhosted.org/packages/bc/6d/afa951b9cb615c3bc6d95c4eed280c6cefb52c006f4e15e79043626fab39/av-15.1.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:26426163d96fc3bde9a015ba4d60da09ef848d9284fe79b4ca5e60965a008fc5", size = 26962481, upload-time = "2025-08-30T04:39:16.875Z" }, - { url = "https://files.pythonhosted.org/packages/3c/42/0c384884235c42c439cef28cbd129e4624ad60229119bf3c6c6020805119/av-15.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:92f524541ce74b8a12491d8934164a5c57e983da24826547c212f60123de400b", size = 37571839, upload-time = "2025-08-30T04:39:20.325Z" }, - { url = "https://files.pythonhosted.org/packages/25/c0/5c967b0872fce1add80a8f50fa7ce11e3e3e5257c2b079263570bc854699/av-15.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:659f9d6145fb2c58e8b31907283b6ba876570f5dd6e7e890d74c09614c436c8e", size = 39070227, upload-time = "2025-08-30T04:39:24.079Z" }, - { url = "https://files.pythonhosted.org/packages/e2/81/e333056d49363c35a74b828ed5f87c96dfbcc1a506b49d79a31ac773b94d/av-15.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:07a8ae30c0cfc3132eff320a6b27d18a5e0dda36effd0ae28892888f4ee14729", size = 39619362, upload-time = "2025-08-30T04:39:27.7Z" }, - { url = "https://files.pythonhosted.org/packages/d5/ae/50cc2af1bf68452cbfec8d1b2554c18f6d167c8ba6d7ad7707797dfd1541/av-15.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e33a76e38f03bb5de026b9f66ccf23dc01ddd2223221096992cb52ac22e62538", size = 40371627, upload-time = "2025-08-30T04:39:31.207Z" }, - { url = "https://files.pythonhosted.org/packages/50/e6/381edf1779106dd31c9ef1ac9842f643af4465b8a87cbc278d3eaa76229a/av-15.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa4bf12bdce20edc2a3b13a2776c474c5ab63e1817d53793714504476eeba82e", size = 31340369, upload-time = "2025-08-30T04:39:34.774Z" }, - { url = "https://files.pythonhosted.org/packages/47/58/4e44cf6939be7aba96a4abce024e1be11ba7539ecac74d09369b8c03aa05/av-15.1.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b785948762a8d45fc58fc24a20251496829ace1817e9a7a508a348d6de2182c3", size = 21767323, upload-time = "2025-08-30T04:39:37.989Z" }, - { url = "https://files.pythonhosted.org/packages/9b/f6/a946544cdb49f6d892d2761b1d61a8bc6ce912fe57ba06769bdc640c0a7f/av-15.1.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c7131494a3a318612b4ee4db98fe5bc50eb705f6b6536127c7ab776c524fd8b", size = 26946268, upload-time = "2025-08-30T04:39:40.601Z" }, - { url = "https://files.pythonhosted.org/packages/70/7c/b33513c0af73d0033af59a98f035b521c5b93445a6af7e9efbf41a6e8383/av-15.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2b9623ae848625c59213b610c8665817924f913580c7c5c91e0dc18936deb00d", size = 38062118, upload-time = "2025-08-30T04:39:43.928Z" }, - { url = "https://files.pythonhosted.org/packages/5e/95/31b7fb34f9fea7c7389240364194f4f56ad2d460095038cc720f50a90bb3/av-15.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c8ef597087db560514617143532b1fafc4825ebb2dda9a22418f548b113a0cc7", size = 39571086, upload-time = "2025-08-30T04:39:47.109Z" }, - { url = "https://files.pythonhosted.org/packages/e7/b0/7b0b45474a4e90c35c11d0032947d8b3c7386872957ce29c6f12add69a74/av-15.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:08eac47a90ebae1e2bd5935f400dd515166019bab4ff5b03c4625fa6ac3a0a5e", size = 40112634, upload-time = "2025-08-30T04:39:50.981Z" }, - { url = "https://files.pythonhosted.org/packages/aa/04/038b94bc9a1ee10a451c867d4a2fc91e845f83bfc2dae9df25893abcb57f/av-15.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d3f66ff200ea166e606cb3c5cb1bd2fc714effbec2e262a5d67ce60450c8234a", size = 40878695, upload-time = "2025-08-30T04:39:54.493Z" }, - { url = "https://files.pythonhosted.org/packages/1d/3d/9f8f96c0deeaaf648485a3dbd1699b2f0580f2ce8a36cb616c0138ba7615/av-15.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:57b99544d91121b8bea570e4ddf61700f679a6b677c1f37966bc1a22e1d4cd5c", size = 31335683, upload-time = "2025-08-30T04:39:57.861Z" }, - { url = "https://files.pythonhosted.org/packages/d1/58/de78b276d20db6ffcd4371283df771721a833ba525a3d57e753d00a9fe79/av-15.1.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:40c5df37f4c354ab8190c6fd68dab7881d112f527906f64ca73da4c252a58cee", size = 21760991, upload-time = "2025-08-30T04:40:00.801Z" }, - { url = "https://files.pythonhosted.org/packages/56/cc/45f85775304ae60b66976360d82ba5b152ad3fd91f9267d5020a51e9a828/av-15.1.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:af455ce65ada3d361f80c90c810d9bced4db5655ab9aa513024d6c71c5c476d5", size = 26953097, upload-time = "2025-08-30T04:40:03.998Z" }, - { url = "https://files.pythonhosted.org/packages/f3/f8/2d781e5e71d02fc829487e775ccb1185e72f95340d05f2e84eb57a11e093/av-15.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86226d2474c80c3393fa07a9c366106029ae500716098b72b3ec3f67205524c3", size = 38319710, upload-time = "2025-08-30T04:40:07.701Z" }, - { url = "https://files.pythonhosted.org/packages/ac/13/37737ef2193e83862ccacff23580c39de251da456a1bf0459e762cca273c/av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:11326f197e7001c4ca53a83b2dbc67fd39ddff8cdf62ce6be3b22d9f3f9338bd", size = 39915519, upload-time = "2025-08-30T04:40:11.066Z" }, - { url = "https://files.pythonhosted.org/packages/26/e9/e8032c7b8f2a4129a03f63f896544f8b7cf068e2db2950326fa2400d5c47/av-15.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a631ea879cc553080ee62874f4284765c42ba08ee0279851a98a85e2ceb3cc8d", size = 40286166, upload-time = "2025-08-30T04:40:14.561Z" }, - { url = "https://files.pythonhosted.org/packages/e2/23/612c0fd809444d04b8387a2dfd942ccc77829507bd78a387ff65a9d98c24/av-15.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8f383949b010c3e731c245f80351d19dc0c08f345e194fc46becb1cb279be3ff", size = 41150592, upload-time = "2025-08-30T04:40:17.951Z" }, - { url = "https://files.pythonhosted.org/packages/15/74/6f8e38a3b0aea5f28e72813672ff45b64615f2c69e6a4a558718c95edb9f/av-15.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d5921aa45f4c1f8c1a8d8185eb347e02aa4c3071278a2e2dd56368d54433d643", size = 31336093, upload-time = "2025-08-30T04:40:21.393Z" }, - { url = "https://files.pythonhosted.org/packages/2e/bc/78b2ffa8235eeffc29aa4a8cc47b02e660cfec32f601f39a00975fb06d0e/av-15.1.0-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2f77853c3119c59d1bff4214ccbe46e3133eccff85ed96adee51c68684443f4e", size = 21726244, upload-time = "2025-08-30T04:40:24.14Z" }, - { url = "https://files.pythonhosted.org/packages/1a/99/66d69453a2dce028e6e8ebea085d90e880aac03d3a3ab7d8ec16755ffd75/av-15.1.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:c0bc4471c156a0a1c70a607502434f477bc8dfe085eef905e55b4b0d66bcd3a5", size = 26918663, upload-time = "2025-08-30T04:40:27.557Z" }, - { url = "https://files.pythonhosted.org/packages/fa/51/1a7dfbeda71f2772bc46d758af0e7fab1cc8388ce4bc7f24aecbc4bfd764/av-15.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:37839d4fa1407f047af82560dfc0f94d8d6266071eff49e1cbe16c4483054621", size = 38041408, upload-time = "2025-08-30T04:40:30.811Z" }, - { url = "https://files.pythonhosted.org/packages/d7/97/2c4e0288ad4359b6064cb06ae79c2ff3a84ac73d27e91f2161b75fcd86fa/av-15.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:729179cd8622815e8b6f6854d13a806fe710576e08895c77e5e4ad254609de9a", size = 39642563, upload-time = "2025-08-30T04:40:34.617Z" }, - { url = "https://files.pythonhosted.org/packages/ea/94/2362502149e276d00957edabcc201a5f4d5109a8a7b4fd30793714a532f3/av-15.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4abdf085bfa4eec318efccff567831b361ea56c045cc38366811552e3127c665", size = 40022119, upload-time = "2025-08-30T04:40:37.703Z" }, - { url = "https://files.pythonhosted.org/packages/df/58/1a0ce1b3835d9728da0a7a54aeffaa0a2b1a88405eaed9322efd55212a54/av-15.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f985661644879e4520d28a995fcb2afeb951bc15a1d51412eb8e5f36da85b6fe", size = 40885158, upload-time = "2025-08-30T04:40:40.952Z" }, - { url = "https://files.pythonhosted.org/packages/30/e6/054bb64e424d90b77ed5fc6a7358e4013fb436154c998fc90a89a186313f/av-15.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:7d7804a44c8048bb4b014a99353dd124663a12cd1d4613ba2bd3b457c3b1d539", size = 31312256, upload-time = "2025-08-30T04:40:44.224Z" }, - { url = "https://files.pythonhosted.org/packages/6f/8b/89eae6dca10d7d2b83c131025a31ccc750be78699ac0304439faa1d1df99/av-15.1.0-cp314-cp314-macosx_13_0_arm64.whl", hash = "sha256:5dd73c6447947edcb82e5fecf96e1f146aeda0f169c7ad4c54df4d9f66f63fde", size = 21730645, upload-time = "2025-08-30T04:40:47.259Z" }, - { url = "https://files.pythonhosted.org/packages/a3/f0/abffaf69405ed68041524be12a1e294faf396971d6a0e70eb00e93687df7/av-15.1.0-cp314-cp314-macosx_13_0_x86_64.whl", hash = "sha256:a81cd515934a5d51290aa66b059b7ed29c4a212e704f3c5e99e32877ff1c312c", size = 26913753, upload-time = "2025-08-30T04:40:50.445Z" }, - { url = "https://files.pythonhosted.org/packages/37/9e/7af078bcfc3cd340c981ac5d613c090ab007023d2ac13b05acd52f22f069/av-15.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:57cc7a733a7e7d7a153682f35c9cf5d01e8269367b049c954779de36fc3d0b10", size = 38027048, upload-time = "2025-08-30T04:40:54.076Z" }, - { url = "https://files.pythonhosted.org/packages/02/76/1f9dac11ad713e3619288993ea04e9c9cf4ec0f04e5ee81e83b8129dd8f3/av-15.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:a77b75bdb6899a64302ff923a5246e0747b3f0a3ecee7d61118db407a22c3f53", size = 39565396, upload-time = "2025-08-30T04:40:57.84Z" }, - { url = "https://files.pythonhosted.org/packages/8b/32/2188c46e2747247458ffc26b230c57dd28e61f65ff7b9e6223a411af5e98/av-15.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d0a1154ce081f1720082a133cfe12356c59f62dad2b93a7a1844bf1dcd010d85", size = 40015050, upload-time = "2025-08-30T04:41:01.091Z" }, - { url = "https://files.pythonhosted.org/packages/1e/41/b57fbce9994580619d7574817ece0fe0e7b822cde2af57904549d0150b8d/av-15.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8a7bf5a34dee15c86790414fa86a144e6d0dcc788bc83b565fdcbc080b4fbc90", size = 40821225, upload-time = "2025-08-30T04:41:04.349Z" }, - { url = "https://files.pythonhosted.org/packages/b1/36/e85cd1f0d3369c6764ad422882895d082f7ececb66d3df8aeae3234ef7a6/av-15.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:e30c9a6fd9734784941384a2e25fad3c22881a7682f378914676aa7e795acdb7", size = 31311750, upload-time = "2025-08-30T04:41:07.744Z" }, - { url = "https://files.pythonhosted.org/packages/80/d8/08a681758a4e49adfda409a6a35eff533f42654c6a6cfa102bc5cae1a728/av-15.1.0-cp314-cp314t-macosx_13_0_arm64.whl", hash = "sha256:60666833d7e65ebcfc48034a072de74349edbb62c9aaa3e6722fef31ca028eb6", size = 21828343, upload-time = "2025-08-30T04:41:10.81Z" }, - { url = "https://files.pythonhosted.org/packages/4a/52/29bec3fe68669b21f7d1ab5d94e21f597b8dfd37f50a3e3c9af6a8da925c/av-15.1.0-cp314-cp314t-macosx_13_0_x86_64.whl", hash = "sha256:53fbdae45aa2a49a22e864ff4f4017416ef62c060a172085d3247ba0a101104e", size = 27001666, upload-time = "2025-08-30T04:41:13.822Z" }, - { url = "https://files.pythonhosted.org/packages/9d/54/2c1d1faced66d708f5df328e800997cb47f90b500a214130c3a0f2ad601e/av-15.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:e6c51061667983dc801502aff9140bbc4f0e0d97f879586f17fb2f9a7e49c381", size = 39496753, upload-time = "2025-08-30T04:41:16.759Z" }, - { url = "https://files.pythonhosted.org/packages/c3/76/06ded5e52c4dcc2d9b5184c6da8de5ea77bd7ecb79a59a2b9700f1984949/av-15.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:2f80ec387f04aa34868662b11018b5f09654ae1530a61e24e92a142a24b10b62", size = 40784729, upload-time = "2025-08-30T04:41:20.491Z" }, - { url = "https://files.pythonhosted.org/packages/52/ef/797b76f3b39c99a96e387f501bbc07dca340b27d3dda12862fe694066b63/av-15.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4975e03177d37d8165c99c8d494175675ba8acb72458fb5d7e43f746a53e0374", size = 41284953, upload-time = "2025-08-30T04:41:23.949Z" }, - { url = "https://files.pythonhosted.org/packages/31/47/e4656f00e62fd059ea5a40b492dea784f5aecfe1dfac10c0d7a0664ce200/av-15.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8f78f3dad11780b4cdd024cdb92ce43cb170929297c00f2f4555c2b103f51e55", size = 41985340, upload-time = "2025-08-30T04:41:27.561Z" }, - { url = "https://files.pythonhosted.org/packages/b1/c9/15bb4fd7a1f39d70db35af2b9c20a0ae19e4220eb58a8b8446e903b98d72/av-15.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9a20c5eba3ec49c2f4b281797021923fc68a86aeb66c5cda4fd0252fa8004951", size = 31487337, upload-time = "2025-08-30T04:41:30.591Z" }, +version = "16.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/c3/fd72a0315bc6c943ced1105aaac6e0ec1be57c70d8a616bd05acaa21ffee/av-16.0.1.tar.gz", hash = "sha256:dd2ce779fa0b5f5889a6d9e00fbbbc39f58e247e52d31044272648fe16ff1dbf", size = 3904030, upload-time = "2025-10-13T12:28:51.082Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/3c/eefa29b7d0f5afdf7af9197bbecad8ec2ad06bcb5ac7e909c05a624b00a6/av-16.0.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:8b141aaa29a3afc96a1d467d106790782c1914628b57309eaadb8c10c299c9c0", size = 27206679, upload-time = "2025-10-13T12:24:41.145Z" }, + { url = "https://files.pythonhosted.org/packages/ac/89/a474feb07d5b94aa5af3771b0fe328056e2e0a840039b329f4fa2a1fd13a/av-16.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b8a08a59a5be0082af063d3f4b216e3950340121c6ea95b505a3f5f5cc8f21d", size = 21774556, upload-time = "2025-10-13T12:24:44.332Z" }, + { url = "https://files.pythonhosted.org/packages/be/e5/4361010dcac398bc224823e4b2a47803845e159af9f95164662c523770dc/av-16.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:792e7fc3c08eae005ff36486983966476e553cbb55aaeb0ec99adc4909377320", size = 38176763, upload-time = "2025-10-13T12:24:46.98Z" }, + { url = "https://files.pythonhosted.org/packages/d4/db/b27bdd20c9dc80de5b8792dae16dd6f4edf16408c0c7b28070c6228a8057/av-16.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:4e8ef5df76d8d0ee56139789f80bb90ad1a82a7e6df6e080e2e95c06fa22aea7", size = 39696277, upload-time = "2025-10-13T12:24:50.951Z" }, + { url = "https://files.pythonhosted.org/packages/4e/c8/dd48e6a3ac1e922c141475a0dc30e2b6dfdef9751b3274829889a9281cce/av-16.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4f7a6985784a7464f078e419c71f5528c3e550ee5d605e7149b4a37a111eb136", size = 39576660, upload-time = "2025-10-13T12:24:55.773Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/223d047e2e60672a2fb5e51e28913de8d52195199f3e949cbfda1e6cd64b/av-16.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3f45c8d7b803b6faa2a25a26de5964a0a897de68298d9c9672c7af9d65d8b48a", size = 40752775, upload-time = "2025-10-13T12:25:00.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/73/73acad21c9203bc63d806e8baf42fe705eb5d36dafd1996b71ab5861a933/av-16.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:58e6faf1d9328d8cc6be14c5aadacb7d2965ed6d6ae1af32696993096543ff00", size = 32302328, upload-time = "2025-10-13T12:25:06.042Z" }, + { url = "https://files.pythonhosted.org/packages/49/d3/f2a483c5273fccd556dfa1fce14fab3b5d6d213b46e28e54e254465a2255/av-16.0.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:e310d1fb42879df9bad2152a8db6d2ff8bf332c8c36349a09d62cc122f5070fb", size = 27191982, upload-time = "2025-10-13T12:25:10.622Z" }, + { url = "https://files.pythonhosted.org/packages/e0/39/dff28bd252131b3befd09d8587992fe18c09d5125eaefc83a6434d5f56ff/av-16.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:2f4b357e5615457a84e6b6290916b22864b76b43d5079e1a73bc27581a5b9bac", size = 21760305, upload-time = "2025-10-13T12:25:14.882Z" }, + { url = "https://files.pythonhosted.org/packages/4a/4d/2312d50a09c84a9b4269f7fea5de84f05dd2b7c7113dd961d31fad6c64c4/av-16.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:286665c77034c3a98080169b8b5586d5568a15da81fbcdaf8099252f2d232d7c", size = 38691616, upload-time = "2025-10-13T12:25:20.063Z" }, + { url = "https://files.pythonhosted.org/packages/15/9a/3d2d30b56252f998e53fced13720e2ce809c4db477110f944034e0fa4c9f/av-16.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f88de8e5b8ea29e41af4d8d61df108323d050ccfbc90f15b13ec1f99ce0e841e", size = 40216464, upload-time = "2025-10-13T12:25:24.848Z" }, + { url = "https://files.pythonhosted.org/packages/98/cb/3860054794a47715b4be0006105158c7119a57be58d9e8882b72e4d4e1dd/av-16.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0cdb71ebe4d1b241cf700f8f0c44a7d2a6602b921e16547dd68c0842113736e1", size = 40094077, upload-time = "2025-10-13T12:25:30.238Z" }, + { url = "https://files.pythonhosted.org/packages/41/58/79830fb8af0a89c015250f7864bbd427dff09c70575c97847055f8a302f7/av-16.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:28c27a65d40e8cf82b6db2543f8feeb8b56d36c1938f50773494cd3b073c7223", size = 41279948, upload-time = "2025-10-13T12:25:35.24Z" }, + { url = "https://files.pythonhosted.org/packages/83/79/6e1463b04382f379f857113b851cf5f9d580a2f7bd794211cd75352f4e04/av-16.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ffea39ac7574f234f5168f9b9602e8d4ecdd81853238ec4d661001f03a6d3f64", size = 32297586, upload-time = "2025-10-13T12:25:39.826Z" }, + { url = "https://files.pythonhosted.org/packages/44/78/12a11d7a44fdd8b26a65e2efa1d8a5826733c8887a989a78306ec4785956/av-16.0.1-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:e41a8fef85dfb2c717349f9ff74f92f9560122a9f1a94b1c6c9a8a9c9462ba71", size = 27206375, upload-time = "2025-10-13T12:25:44.423Z" }, + { url = "https://files.pythonhosted.org/packages/27/19/3a4d3882852a0ee136121979ce46f6d2867b974eb217a2c9a070939f55ad/av-16.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:6352a64b25c9f985d4f279c2902db9a92424e6f2c972161e67119616f0796cb9", size = 21752603, upload-time = "2025-10-13T12:25:49.122Z" }, + { url = "https://files.pythonhosted.org/packages/cb/6e/f7abefba6e008e2f69bebb9a17ba38ce1df240c79b36a5b5fcacf8c8fcfd/av-16.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5201f7b4b5ed2128118cb90c2a6d64feedb0586ca7c783176896c78ffb4bbd5c", size = 38931978, upload-time = "2025-10-13T12:25:55.021Z" }, + { url = "https://files.pythonhosted.org/packages/b2/7a/1305243ab47f724fdd99ddef7309a594e669af7f0e655e11bdd2c325dfae/av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:daecc2072b82b6a942acbdaa9a2e00c05234c61fef976b22713983c020b07992", size = 40549383, upload-time = "2025-10-13T12:26:00.897Z" }, + { url = "https://files.pythonhosted.org/packages/32/b2/357cc063185043eb757b4a48782bff780826103bcad1eb40c3ddfc050b7e/av-16.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6573da96e8bebc3536860a7def108d7dbe1875c86517072431ced702447e6aea", size = 40241993, upload-time = "2025-10-13T12:26:06.993Z" }, + { url = "https://files.pythonhosted.org/packages/20/bb/ced42a4588ba168bf0ef1e9d016982e3ba09fde6992f1dda586fd20dcf71/av-16.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4bc064e48a8de6c087b97dd27cf4ef8c13073f0793108fbce3ecd721201b2502", size = 41532235, upload-time = "2025-10-13T12:26:12.488Z" }, + { url = "https://files.pythonhosted.org/packages/15/37/c7811eca0f318d5fd3212f7e8c3d8335f75a54907c97a89213dc580b8056/av-16.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0c669b6b6668c8ae74451c15ec6d6d8a36e4c3803dc5d9910f607a174dd18f17", size = 32296912, upload-time = "2025-10-13T12:26:19.187Z" }, + { url = "https://files.pythonhosted.org/packages/86/59/972f199ccc4f8c9e51f59e0f8962a09407396b3f6d11355e2c697ba555f9/av-16.0.1-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:4c61c6c120f5c5d95c711caf54e2c4a9fb2f1e613ac0a9c273d895f6b2602e44", size = 27170433, upload-time = "2025-10-13T12:26:24.673Z" }, + { url = "https://files.pythonhosted.org/packages/53/9d/0514cbc185fb20353ab25da54197fbd169a233e39efcbb26533c36a9dbb9/av-16.0.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ecc2e41320c69095f44aff93470a0d32c30892b2dbad0a08040441c81efa379", size = 21717654, upload-time = "2025-10-13T12:26:29.12Z" }, + { url = "https://files.pythonhosted.org/packages/32/8c/881409dd124b4e07d909d2b70568acb21126fc747656390840a2238651c9/av-16.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:036f0554d6faef3f4a94acaeb0cedd388e3ab96eb0eb5a14ec27c17369c466c9", size = 38651601, upload-time = "2025-10-13T12:26:33.919Z" }, + { url = "https://files.pythonhosted.org/packages/35/fd/867ba4cc3ab504442dc89b0c117e6a994fc62782eb634c8f31304586f93e/av-16.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:876415470a62e4a3550cc38db2fc0094c25e64eea34d7293b7454125d5958190", size = 40278604, upload-time = "2025-10-13T12:26:39.2Z" }, + { url = "https://files.pythonhosted.org/packages/b3/87/63cde866c0af09a1fa9727b4f40b34d71b0535785f5665c27894306f1fbc/av-16.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:56902a06bd0828d13f13352874c370670882048267191ff5829534b611ba3956", size = 39984854, upload-time = "2025-10-13T12:26:44.581Z" }, + { url = "https://files.pythonhosted.org/packages/71/3b/8f40a708bff0e6b0f957836e2ef1f4d4429041cf8d99a415a77ead8ac8a3/av-16.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe988c2bf0fc2d952858f791f18377ea4ae4e19ba3504793799cd6c2a2562edf", size = 41270352, upload-time = "2025-10-13T12:26:50.817Z" }, + { url = "https://files.pythonhosted.org/packages/1e/b5/c114292cb58a7269405ae13b7ba48c7d7bfeebbb2e4e66c8073c065a4430/av-16.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:708a66c248848029bf518f0482b81c5803846f1b597ef8013b19c014470b620f", size = 32273242, upload-time = "2025-10-13T12:26:55.788Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e9/a5b714bc078fdcca8b46c8a0b38484ae5c24cd81d9c1703d3e8ae2b57259/av-16.0.1-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:79a77ee452537030c21a0b41139bedaf16629636bf764b634e93b99c9d5f4558", size = 27248984, upload-time = "2025-10-13T12:27:00.564Z" }, + { url = "https://files.pythonhosted.org/packages/06/ef/ff777aaf1f88e3f6ce94aca4c5806a0c360e68d48f9d9f0214e42650f740/av-16.0.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:080823a6ff712f81e7089ae9756fb1512ca1742a138556a852ce50f58e457213", size = 21828098, upload-time = "2025-10-13T12:27:05.433Z" }, + { url = "https://files.pythonhosted.org/packages/34/d7/a484358d24a42bedde97f61f5d6ee568a7dd866d9df6e33731378db92d9e/av-16.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:04e00124afa8b46a850ed48951ddda61de874407fb8307d6a875bba659d5727e", size = 40051697, upload-time = "2025-10-13T12:27:10.525Z" }, + { url = "https://files.pythonhosted.org/packages/73/87/6772d6080837da5d5c810a98a95bde6977e1f5a6e2e759e8c9292af9ec69/av-16.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:bc098c1c6dc4e7080629a7e9560e67bd4b5654951e17e5ddfd2b1515cfcd37db", size = 41352596, upload-time = "2025-10-13T12:27:16.217Z" }, + { url = "https://files.pythonhosted.org/packages/bd/58/fe448c60cf7f85640a0ed8936f16bac874846aa35e1baa521028949c1ea3/av-16.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ffd3559a72c46a76aa622630751a821499ba5a780b0047ecc75105d43a6b61", size = 41183156, upload-time = "2025-10-13T12:27:21.574Z" }, + { url = "https://files.pythonhosted.org/packages/85/c6/a039a0979d0c278e1bed6758d5a6186416c3ccb8081970df893fdf9a0d99/av-16.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7a3f1a36b550adadd7513f4f5ee956f9e06b01a88e59f3150ef5fec6879d6f79", size = 42302331, upload-time = "2025-10-13T12:27:26.953Z" }, + { url = "https://files.pythonhosted.org/packages/18/7b/2ca4a9e3609ff155436dac384e360f530919cb1e328491f7df294be0f0dc/av-16.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c6de794abe52b8c0be55d8bb09ade05905efa74b1a5ab4860b4b9c2bfb6578bf", size = 32462194, upload-time = "2025-10-13T12:27:32.942Z" }, + { url = "https://files.pythonhosted.org/packages/14/9a/6d17e379906cf53a7a44dfac9cf7e4b2e7df2082ba2dbf07126055effcc1/av-16.0.1-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:4b55ba69a943ae592ad7900da67129422954789de9dc384685d6b529925f542e", size = 27167101, upload-time = "2025-10-13T12:27:38.886Z" }, + { url = "https://files.pythonhosted.org/packages/6c/34/891816cd82d5646cb5a51d201d20be0a578232536d083b7d939734258067/av-16.0.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:d4a0c47b6c9bbadad8909b82847f5fe64a608ad392f0b01704e427349bcd9a47", size = 21722708, upload-time = "2025-10-13T12:27:43.29Z" }, + { url = "https://files.pythonhosted.org/packages/1d/20/c24ad34038423ab8c9728cef3301e0861727c188442dcfd70a4a10834c63/av-16.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:8bba52f3035708456f6b1994d10b0371b45cfd8f917b5e84ff81aef4ec2f08bf", size = 38638842, upload-time = "2025-10-13T12:27:49.776Z" }, + { url = "https://files.pythonhosted.org/packages/d7/32/034412309572ba3ad713079d07a3ffc13739263321aece54a3055d7a4f1f/av-16.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:08e34c7e7b5e55e29931180bbe21095e1874ac120992bf6b8615d39574487617", size = 40197789, upload-time = "2025-10-13T12:27:55.688Z" }, + { url = "https://files.pythonhosted.org/packages/fb/9c/40496298c32f9094e7df28641c5c58aa6fb07554dc232a9ac98a9894376f/av-16.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0d6250ab9db80c641b299987027c987f14935ea837ea4c02c5f5182f6b69d9e5", size = 39980829, upload-time = "2025-10-13T12:28:01.507Z" }, + { url = "https://files.pythonhosted.org/packages/4a/7e/5c38268ac1d424f309b13b2de4597ad28daea6039ee5af061e62918b12a8/av-16.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7b621f28d8bcbb07cdcd7b18943ddc040739ad304545715ae733873b6e1b739d", size = 41205928, upload-time = "2025-10-13T12:28:08.431Z" }, + { url = "https://files.pythonhosted.org/packages/e3/07/3176e02692d8753a6c4606021c60e4031341afb56292178eee633b6760a4/av-16.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:92101f49082392580c9dba4ba2fe5b931b3bb0fb75a1a848bfb9a11ded68be91", size = 32272836, upload-time = "2025-10-13T12:28:13.405Z" }, + { url = "https://files.pythonhosted.org/packages/8a/47/10e03b88de097385d1550cbb6d8de96159131705c13adb92bd9b7e677425/av-16.0.1-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:07c464bf2bc362a154eccc82e235ef64fd3aaf8d76fc8ed63d0ae520943c6d3f", size = 27248864, upload-time = "2025-10-13T12:28:17.467Z" }, + { url = "https://files.pythonhosted.org/packages/b1/60/7447f206bec3e55e81371f1989098baa2fe9adb7b46c149e6937b7e7c1ca/av-16.0.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:750da0673864b669c95882c7b25768cd93ece0e47010d74ebcc29dbb14d611f8", size = 21828185, upload-time = "2025-10-13T12:28:21.461Z" }, + { url = "https://files.pythonhosted.org/packages/68/48/ee2680e7a01bc4911bbe902b814346911fa2528697a44f3043ee68e0f07e/av-16.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0b7c0d060863b2e341d07cd26851cb9057b7979814148b028fb7ee5d5eb8772d", size = 40040572, upload-time = "2025-10-13T12:28:26.585Z" }, + { url = "https://files.pythonhosted.org/packages/da/68/2c43d28871721ae07cde432d6e36ae2f7035197cbadb43764cc5bf3d4b33/av-16.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e67c2eca6023ca7d76b0709c5f392b23a5defba499f4c262411f8155b1482cbd", size = 41344288, upload-time = "2025-10-13T12:28:32.512Z" }, + { url = "https://files.pythonhosted.org/packages/ec/7f/1d801bff43ae1af4758c45eee2eaae64f303bbb460e79f352f08587fd179/av-16.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e3243d54d84986e8fbdc1946db634b0c41fe69b6de35a99fa8b763e18503d040", size = 41175142, upload-time = "2025-10-13T12:28:38.356Z" }, + { url = "https://files.pythonhosted.org/packages/e4/06/bb363138687066bbf8997c1433dbd9c81762bae120955ea431fb72d69d26/av-16.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bcf73efab5379601e6510abd7afe5f397d0f6defe69b1610c2f37a4a17996b", size = 42293932, upload-time = "2025-10-13T12:28:43.442Z" }, + { url = "https://files.pythonhosted.org/packages/92/15/5e713098a085f970ccf88550194d277d244464d7b3a7365ad92acb4b6dc1/av-16.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6368d4ff153d75469d2a3217bc403630dc870a72fe0a014d9135de550d731a86", size = 32460624, upload-time = "2025-10-13T12:28:48.767Z" }, ] [[package]] @@ -667,16 +634,16 @@ wheels = [ [[package]] name = "botocore" -version = "1.40.61" +version = "1.41.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/28/a3/81d3a47c2dbfd76f185d3b894f2ad01a75096c006a2dd91f237dca182188/botocore-1.40.61.tar.gz", hash = "sha256:a2487ad69b090f9cccd64cf07c7021cd80ee9c0655ad974f87045b02f3ef52cd", size = 14393956, upload-time = "2025-10-28T19:26:46.108Z" } +sdist = { url = "https://files.pythonhosted.org/packages/90/22/7fe08c726a2e3b11a0aef8bf177e83891c9cb2dc1809d35c9ed91a9e60e6/botocore-1.41.5.tar.gz", hash = "sha256:0367622b811597d183bfcaab4a350f0d3ede712031ce792ef183cabdee80d3bf", size = 14668152, upload-time = "2025-11-26T20:27:38.026Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/c5/f6ce561004db45f0b847c2cd9b19c67c6bf348a82018a48cb718be6b58b0/botocore-1.40.61-py3-none-any.whl", hash = "sha256:17ebae412692fd4824f99cde0f08d50126dc97954008e5ba2b522eb049238aa7", size = 14055973, upload-time = "2025-10-28T19:26:42.15Z" }, + { url = "https://files.pythonhosted.org/packages/4e/4e/21cd0b8f365449f1576f93de1ec8718ed18a7a3bc086dfbdeb79437bba7a/botocore-1.41.5-py3-none-any.whl", hash = "sha256:3fef7fcda30c82c27202d232cfdbd6782cb27f20f8e7e21b20606483e66ee73a", size = 14337008, upload-time = "2025-11-26T20:27:35.208Z" }, ] [[package]] @@ -719,11 +686,11 @@ sdist = { url = "https://files.pythonhosted.org/packages/64/cb/104778c728dc3d5ea [[package]] name = "certifi" -version = "2025.10.5" +version = "2025.11.12" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4c/5b/b6ce21586237c77ce67d01dc5507039d444b630dd76611bbca2d8e5dcd91/certifi-2025.10.5.tar.gz", hash = "sha256:47c09d31ccf2acf0be3f701ea53595ee7e0b8fa08801c6624be771df09ae7b43", size = 164519, upload-time = "2025-10-05T04:12:15.808Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/37/af0d2ef3967ac0d6113837b44a4f0bfe1328c2b9763bd5b1744520e5cfed/certifi-2025.10.5-py3-none-any.whl", hash = "sha256:0f212c2744a9bb6de0c56639a6f68afe01ecd92d91f14ae897c4fe7bbeeef0de", size = 163286, upload-time = "2025-10-05T04:12:14.03Z" }, + { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, ] [[package]] @@ -899,14 +866,14 @@ wheels = [ [[package]] name = "click" -version = "8.3.0" +version = "8.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] [[package]] @@ -938,101 +905,101 @@ wheels = [ [[package]] name = "coverage" -version = "7.11.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1c/38/ee22495420457259d2f3390309505ea98f98a5eed40901cf62196abad006/coverage-7.11.0.tar.gz", hash = "sha256:167bd504ac1ca2af7ff3b81d245dfea0292c5032ebef9d66cc08a7d28c1b8050", size = 811905, upload-time = "2025-10-15T15:15:08.542Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/12/95/c49df0aceb5507a80b9fe5172d3d39bf23f05be40c23c8d77d556df96cec/coverage-7.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eb53f1e8adeeb2e78962bade0c08bfdc461853c7969706ed901821e009b35e31", size = 215800, upload-time = "2025-10-15T15:12:19.824Z" }, - { url = "https://files.pythonhosted.org/packages/dc/c6/7bb46ce01ed634fff1d7bb53a54049f539971862cc388b304ff3c51b4f66/coverage-7.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d9a03ec6cb9f40a5c360f138b88266fd8f58408d71e89f536b4f91d85721d075", size = 216198, upload-time = "2025-10-15T15:12:22.549Z" }, - { url = "https://files.pythonhosted.org/packages/94/b2/75d9d8fbf2900268aca5de29cd0a0fe671b0f69ef88be16767cc3c828b85/coverage-7.11.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0d7f0616c557cbc3d1c2090334eddcbb70e1ae3a40b07222d62b3aa47f608fab", size = 242953, upload-time = "2025-10-15T15:12:24.139Z" }, - { url = "https://files.pythonhosted.org/packages/65/ac/acaa984c18f440170525a8743eb4b6c960ace2dbad80dc22056a437fc3c6/coverage-7.11.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e44a86a47bbdf83b0a3ea4d7df5410d6b1a0de984fbd805fa5101f3624b9abe0", size = 244766, upload-time = "2025-10-15T15:12:25.974Z" }, - { url = "https://files.pythonhosted.org/packages/d8/0d/938d0bff76dfa4a6b228c3fc4b3e1c0e2ad4aa6200c141fcda2bd1170227/coverage-7.11.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:596763d2f9a0ee7eec6e643e29660def2eef297e1de0d334c78c08706f1cb785", size = 246625, upload-time = "2025-10-15T15:12:27.387Z" }, - { url = "https://files.pythonhosted.org/packages/38/54/8f5f5e84bfa268df98f46b2cb396b1009734cfb1e5d6adb663d284893b32/coverage-7.11.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ef55537ff511b5e0a43edb4c50a7bf7ba1c3eea20b4f49b1490f1e8e0e42c591", size = 243568, upload-time = "2025-10-15T15:12:28.799Z" }, - { url = "https://files.pythonhosted.org/packages/68/30/8ba337c2877fe3f2e1af0ed7ff4be0c0c4aca44d6f4007040f3ca2255e99/coverage-7.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cbabd8f4d0d3dc571d77ae5bdbfa6afe5061e679a9d74b6797c48d143307088", size = 244665, upload-time = "2025-10-15T15:12:30.297Z" }, - { url = "https://files.pythonhosted.org/packages/cc/fb/c6f1d6d9a665536b7dde2333346f0cc41dc6a60bd1ffc10cd5c33e7eb000/coverage-7.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e24045453384e0ae2a587d562df2a04d852672eb63051d16096d3f08aa4c7c2f", size = 242681, upload-time = "2025-10-15T15:12:32.326Z" }, - { url = "https://files.pythonhosted.org/packages/be/38/1b532319af5f991fa153c20373291dc65c2bf532af7dbcffdeef745c8f79/coverage-7.11.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:7161edd3426c8d19bdccde7d49e6f27f748f3c31cc350c5de7c633fea445d866", size = 242912, upload-time = "2025-10-15T15:12:34.079Z" }, - { url = "https://files.pythonhosted.org/packages/67/3d/f39331c60ef6050d2a861dc1b514fa78f85f792820b68e8c04196ad733d6/coverage-7.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d4ed4de17e692ba6415b0587bc7f12bc80915031fc9db46a23ce70fc88c9841", size = 243559, upload-time = "2025-10-15T15:12:35.809Z" }, - { url = "https://files.pythonhosted.org/packages/4b/55/cb7c9df9d0495036ce582a8a2958d50c23cd73f84a23284bc23bd4711a6f/coverage-7.11.0-cp310-cp310-win32.whl", hash = "sha256:765c0bc8fe46f48e341ef737c91c715bd2a53a12792592296a095f0c237e09cf", size = 218266, upload-time = "2025-10-15T15:12:37.429Z" }, - { url = "https://files.pythonhosted.org/packages/68/a8/b79cb275fa7bd0208767f89d57a1b5f6ba830813875738599741b97c2e04/coverage-7.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:24d6f3128f1b2d20d84b24f4074475457faedc3d4613a7e66b5e769939c7d969", size = 219169, upload-time = "2025-10-15T15:12:39.25Z" }, - { url = "https://files.pythonhosted.org/packages/49/3a/ee1074c15c408ddddddb1db7dd904f6b81bc524e01f5a1c5920e13dbde23/coverage-7.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d58ecaa865c5b9fa56e35efc51d1014d4c0d22838815b9fce57a27dd9576847", size = 215912, upload-time = "2025-10-15T15:12:40.665Z" }, - { url = "https://files.pythonhosted.org/packages/70/c4/9f44bebe5cb15f31608597b037d78799cc5f450044465bcd1ae8cb222fe1/coverage-7.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b679e171f1c104a5668550ada700e3c4937110dbdd153b7ef9055c4f1a1ee3cc", size = 216310, upload-time = "2025-10-15T15:12:42.461Z" }, - { url = "https://files.pythonhosted.org/packages/42/01/5e06077cfef92d8af926bdd86b84fb28bf9bc6ad27343d68be9b501d89f2/coverage-7.11.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca61691ba8c5b6797deb221a0d09d7470364733ea9c69425a640f1f01b7c5bf0", size = 246706, upload-time = "2025-10-15T15:12:44.001Z" }, - { url = "https://files.pythonhosted.org/packages/40/b8/7a3f1f33b35cc4a6c37e759137533119560d06c0cc14753d1a803be0cd4a/coverage-7.11.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:aef1747ede4bd8ca9cfc04cc3011516500c6891f1b33a94add3253f6f876b7b7", size = 248634, upload-time = "2025-10-15T15:12:45.768Z" }, - { url = "https://files.pythonhosted.org/packages/7a/41/7f987eb33de386bc4c665ab0bf98d15fcf203369d6aacae74f5dd8ec489a/coverage-7.11.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1839d08406e4cba2953dcc0ffb312252f14d7c4c96919f70167611f4dee2623", size = 250741, upload-time = "2025-10-15T15:12:47.222Z" }, - { url = "https://files.pythonhosted.org/packages/23/c1/a4e0ca6a4e83069fb8216b49b30a7352061ca0cb38654bd2dc96b7b3b7da/coverage-7.11.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e0eb0a2dcc62478eb5b4cbb80b97bdee852d7e280b90e81f11b407d0b81c4287", size = 246837, upload-time = "2025-10-15T15:12:48.904Z" }, - { url = "https://files.pythonhosted.org/packages/5d/03/ced062a17f7c38b4728ff76c3acb40d8465634b20b4833cdb3cc3a74e115/coverage-7.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bc1fbea96343b53f65d5351d8fd3b34fd415a2670d7c300b06d3e14a5af4f552", size = 248429, upload-time = "2025-10-15T15:12:50.73Z" }, - { url = "https://files.pythonhosted.org/packages/97/af/a7c6f194bb8c5a2705ae019036b8fe7f49ea818d638eedb15fdb7bed227c/coverage-7.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:214b622259dd0cf435f10241f1333d32caa64dbc27f8790ab693428a141723de", size = 246490, upload-time = "2025-10-15T15:12:52.646Z" }, - { url = "https://files.pythonhosted.org/packages/ab/c3/aab4df02b04a8fde79068c3c41ad7a622b0ef2b12e1ed154da986a727c3f/coverage-7.11.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:258d9967520cca899695d4eb7ea38be03f06951d6ca2f21fb48b1235f791e601", size = 246208, upload-time = "2025-10-15T15:12:54.586Z" }, - { url = "https://files.pythonhosted.org/packages/30/d8/e282ec19cd658238d60ed404f99ef2e45eed52e81b866ab1518c0d4163cf/coverage-7.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cf9e6ff4ca908ca15c157c409d608da77a56a09877b97c889b98fb2c32b6465e", size = 247126, upload-time = "2025-10-15T15:12:56.485Z" }, - { url = "https://files.pythonhosted.org/packages/d1/17/a635fa07fac23adb1a5451ec756216768c2767efaed2e4331710342a3399/coverage-7.11.0-cp311-cp311-win32.whl", hash = "sha256:fcc15fc462707b0680cff6242c48625da7f9a16a28a41bb8fd7a4280920e676c", size = 218314, upload-time = "2025-10-15T15:12:58.365Z" }, - { url = "https://files.pythonhosted.org/packages/2a/29/2ac1dfcdd4ab9a70026edc8d715ece9b4be9a1653075c658ee6f271f394d/coverage-7.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:865965bf955d92790f1facd64fe7ff73551bd2c1e7e6b26443934e9701ba30b9", size = 219203, upload-time = "2025-10-15T15:12:59.902Z" }, - { url = "https://files.pythonhosted.org/packages/03/21/5ce8b3a0133179115af4c041abf2ee652395837cb896614beb8ce8ddcfd9/coverage-7.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:5693e57a065760dcbeb292d60cc4d0231a6d4b6b6f6a3191561e1d5e8820b745", size = 217879, upload-time = "2025-10-15T15:13:01.35Z" }, - { url = "https://files.pythonhosted.org/packages/c4/db/86f6906a7c7edc1a52b2c6682d6dd9be775d73c0dfe2b84f8923dfea5784/coverage-7.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9c49e77811cf9d024b95faf86c3f059b11c0c9be0b0d61bc598f453703bd6fd1", size = 216098, upload-time = "2025-10-15T15:13:02.916Z" }, - { url = "https://files.pythonhosted.org/packages/21/54/e7b26157048c7ba555596aad8569ff903d6cd67867d41b75287323678ede/coverage-7.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a61e37a403a778e2cda2a6a39abcc895f1d984071942a41074b5c7ee31642007", size = 216331, upload-time = "2025-10-15T15:13:04.403Z" }, - { url = "https://files.pythonhosted.org/packages/b9/19/1ce6bf444f858b83a733171306134a0544eaddf1ca8851ede6540a55b2ad/coverage-7.11.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:c79cae102bb3b1801e2ef1511fb50e91ec83a1ce466b2c7c25010d884336de46", size = 247825, upload-time = "2025-10-15T15:13:05.92Z" }, - { url = "https://files.pythonhosted.org/packages/71/0b/d3bcbbc259fcced5fb67c5d78f6e7ee965f49760c14afd931e9e663a83b2/coverage-7.11.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:16ce17ceb5d211f320b62df002fa7016b7442ea0fd260c11cec8ce7730954893", size = 250573, upload-time = "2025-10-15T15:13:07.471Z" }, - { url = "https://files.pythonhosted.org/packages/58/8d/b0ff3641a320abb047258d36ed1c21d16be33beed4152628331a1baf3365/coverage-7.11.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:80027673e9d0bd6aef86134b0771845e2da85755cf686e7c7c59566cf5a89115", size = 251706, upload-time = "2025-10-15T15:13:09.4Z" }, - { url = "https://files.pythonhosted.org/packages/59/c8/5a586fe8c7b0458053d9c687f5cff515a74b66c85931f7fe17a1c958b4ac/coverage-7.11.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4d3ffa07a08657306cd2215b0da53761c4d73cb54d9143b9303a6481ec0cd415", size = 248221, upload-time = "2025-10-15T15:13:10.964Z" }, - { url = "https://files.pythonhosted.org/packages/d0/ff/3a25e3132804ba44cfa9a778cdf2b73dbbe63ef4b0945e39602fc896ba52/coverage-7.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a3b6a5f8b2524fd6c1066bc85bfd97e78709bb5e37b5b94911a6506b65f47186", size = 249624, upload-time = "2025-10-15T15:13:12.5Z" }, - { url = "https://files.pythonhosted.org/packages/c5/12/ff10c8ce3895e1b17a73485ea79ebc1896a9e466a9d0f4aef63e0d17b718/coverage-7.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fcc0a4aa589de34bc56e1a80a740ee0f8c47611bdfb28cd1849de60660f3799d", size = 247744, upload-time = "2025-10-15T15:13:14.554Z" }, - { url = "https://files.pythonhosted.org/packages/16/02/d500b91f5471b2975947e0629b8980e5e90786fe316b6d7299852c1d793d/coverage-7.11.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:dba82204769d78c3fd31b35c3d5f46e06511936c5019c39f98320e05b08f794d", size = 247325, upload-time = "2025-10-15T15:13:16.438Z" }, - { url = "https://files.pythonhosted.org/packages/77/11/dee0284fbbd9cd64cfce806b827452c6df3f100d9e66188e82dfe771d4af/coverage-7.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:81b335f03ba67309a95210caf3eb43bd6fe75a4e22ba653ef97b4696c56c7ec2", size = 249180, upload-time = "2025-10-15T15:13:17.959Z" }, - { url = "https://files.pythonhosted.org/packages/59/1b/cdf1def928f0a150a057cab03286774e73e29c2395f0d30ce3d9e9f8e697/coverage-7.11.0-cp312-cp312-win32.whl", hash = "sha256:037b2d064c2f8cc8716fe4d39cb705779af3fbf1ba318dc96a1af858888c7bb5", size = 218479, upload-time = "2025-10-15T15:13:19.608Z" }, - { url = "https://files.pythonhosted.org/packages/ff/55/e5884d55e031da9c15b94b90a23beccc9d6beee65e9835cd6da0a79e4f3a/coverage-7.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:d66c0104aec3b75e5fd897e7940188ea1892ca1d0235316bf89286d6a22568c0", size = 219290, upload-time = "2025-10-15T15:13:21.593Z" }, - { url = "https://files.pythonhosted.org/packages/23/a8/faa930cfc71c1d16bc78f9a19bb73700464f9c331d9e547bfbc1dbd3a108/coverage-7.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:d91ebeac603812a09cf6a886ba6e464f3bbb367411904ae3790dfe28311b15ad", size = 217924, upload-time = "2025-10-15T15:13:23.39Z" }, - { url = "https://files.pythonhosted.org/packages/60/7f/85e4dfe65e400645464b25c036a26ac226cf3a69d4a50c3934c532491cdd/coverage-7.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cc3f49e65ea6e0d5d9bd60368684fe52a704d46f9e7fc413918f18d046ec40e1", size = 216129, upload-time = "2025-10-15T15:13:25.371Z" }, - { url = "https://files.pythonhosted.org/packages/96/5d/dc5fa98fea3c175caf9d360649cb1aa3715e391ab00dc78c4c66fabd7356/coverage-7.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f39ae2f63f37472c17b4990f794035c9890418b1b8cca75c01193f3c8d3e01be", size = 216380, upload-time = "2025-10-15T15:13:26.976Z" }, - { url = "https://files.pythonhosted.org/packages/b2/f5/3da9cc9596708273385189289c0e4d8197d37a386bdf17619013554b3447/coverage-7.11.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7db53b5cdd2917b6eaadd0b1251cf4e7d96f4a8d24e174bdbdf2f65b5ea7994d", size = 247375, upload-time = "2025-10-15T15:13:28.923Z" }, - { url = "https://files.pythonhosted.org/packages/65/6c/f7f59c342359a235559d2bc76b0c73cfc4bac7d61bb0df210965cb1ecffd/coverage-7.11.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10ad04ac3a122048688387828b4537bc9cf60c0bf4869c1e9989c46e45690b82", size = 249978, upload-time = "2025-10-15T15:13:30.525Z" }, - { url = "https://files.pythonhosted.org/packages/e7/8c/042dede2e23525e863bf1ccd2b92689692a148d8b5fd37c37899ba882645/coverage-7.11.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4036cc9c7983a2b1f2556d574d2eb2154ac6ed55114761685657e38782b23f52", size = 251253, upload-time = "2025-10-15T15:13:32.174Z" }, - { url = "https://files.pythonhosted.org/packages/7b/a9/3c58df67bfa809a7bddd786356d9c5283e45d693edb5f3f55d0986dd905a/coverage-7.11.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7ab934dd13b1c5e94b692b1e01bd87e4488cb746e3a50f798cb9464fd128374b", size = 247591, upload-time = "2025-10-15T15:13:34.147Z" }, - { url = "https://files.pythonhosted.org/packages/26/5b/c7f32efd862ee0477a18c41e4761305de6ddd2d49cdeda0c1116227570fd/coverage-7.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59a6e5a265f7cfc05f76e3bb53eca2e0dfe90f05e07e849930fecd6abb8f40b4", size = 249411, upload-time = "2025-10-15T15:13:38.425Z" }, - { url = "https://files.pythonhosted.org/packages/76/b5/78cb4f1e86c1611431c990423ec0768122905b03837e1b4c6a6f388a858b/coverage-7.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:df01d6c4c81e15a7c88337b795bb7595a8596e92310266b5072c7e301168efbd", size = 247303, upload-time = "2025-10-15T15:13:40.464Z" }, - { url = "https://files.pythonhosted.org/packages/87/c9/23c753a8641a330f45f221286e707c427e46d0ffd1719b080cedc984ec40/coverage-7.11.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:8c934bd088eed6174210942761e38ee81d28c46de0132ebb1801dbe36a390dcc", size = 247157, upload-time = "2025-10-15T15:13:42.087Z" }, - { url = "https://files.pythonhosted.org/packages/c5/42/6e0cc71dc8a464486e944a4fa0d85bdec031cc2969e98ed41532a98336b9/coverage-7.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a03eaf7ec24078ad64a07f02e30060aaf22b91dedf31a6b24d0d98d2bba7f48", size = 248921, upload-time = "2025-10-15T15:13:43.715Z" }, - { url = "https://files.pythonhosted.org/packages/e8/1c/743c2ef665e6858cccb0f84377dfe3a4c25add51e8c7ef19249be92465b6/coverage-7.11.0-cp313-cp313-win32.whl", hash = "sha256:695340f698a5f56f795b2836abe6fb576e7c53d48cd155ad2f80fd24bc63a040", size = 218526, upload-time = "2025-10-15T15:13:45.336Z" }, - { url = "https://files.pythonhosted.org/packages/ff/d5/226daadfd1bf8ddbccefbd3aa3547d7b960fb48e1bdac124e2dd13a2b71a/coverage-7.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:2727d47fce3ee2bac648528e41455d1b0c46395a087a229deac75e9f88ba5a05", size = 219317, upload-time = "2025-10-15T15:13:47.401Z" }, - { url = "https://files.pythonhosted.org/packages/97/54/47db81dcbe571a48a298f206183ba8a7ba79200a37cd0d9f4788fcd2af4a/coverage-7.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:0efa742f431529699712b92ecdf22de8ff198df41e43aeaaadf69973eb93f17a", size = 217948, upload-time = "2025-10-15T15:13:49.096Z" }, - { url = "https://files.pythonhosted.org/packages/e5/8b/cb68425420154e7e2a82fd779a8cc01549b6fa83c2ad3679cd6c088ebd07/coverage-7.11.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:587c38849b853b157706407e9ebdca8fd12f45869edb56defbef2daa5fb0812b", size = 216837, upload-time = "2025-10-15T15:13:51.09Z" }, - { url = "https://files.pythonhosted.org/packages/33/55/9d61b5765a025685e14659c8d07037247de6383c0385757544ffe4606475/coverage-7.11.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b971bdefdd75096163dd4261c74be813c4508477e39ff7b92191dea19f24cd37", size = 217061, upload-time = "2025-10-15T15:13:52.747Z" }, - { url = "https://files.pythonhosted.org/packages/52/85/292459c9186d70dcec6538f06ea251bc968046922497377bf4a1dc9a71de/coverage-7.11.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:269bfe913b7d5be12ab13a95f3a76da23cf147be7fa043933320ba5625f0a8de", size = 258398, upload-time = "2025-10-15T15:13:54.45Z" }, - { url = "https://files.pythonhosted.org/packages/1f/e2/46edd73fb8bf51446c41148d81944c54ed224854812b6ca549be25113ee0/coverage-7.11.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:dadbcce51a10c07b7c72b0ce4a25e4b6dcb0c0372846afb8e5b6307a121eb99f", size = 260574, upload-time = "2025-10-15T15:13:56.145Z" }, - { url = "https://files.pythonhosted.org/packages/07/5e/1df469a19007ff82e2ca8fe509822820a31e251f80ee7344c34f6cd2ec43/coverage-7.11.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ed43fa22c6436f7957df036331f8fe4efa7af132054e1844918866cd228af6c", size = 262797, upload-time = "2025-10-15T15:13:58.635Z" }, - { url = "https://files.pythonhosted.org/packages/f9/50/de216b31a1434b94d9b34a964c09943c6be45069ec704bfc379d8d89a649/coverage-7.11.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9516add7256b6713ec08359b7b05aeff8850c98d357784c7205b2e60aa2513fa", size = 257361, upload-time = "2025-10-15T15:14:00.409Z" }, - { url = "https://files.pythonhosted.org/packages/82/1e/3f9f8344a48111e152e0fd495b6fff13cc743e771a6050abf1627a7ba918/coverage-7.11.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb92e47c92fcbcdc692f428da67db33337fa213756f7adb6a011f7b5a7a20740", size = 260349, upload-time = "2025-10-15T15:14:02.188Z" }, - { url = "https://files.pythonhosted.org/packages/65/9b/3f52741f9e7d82124272f3070bbe316006a7de1bad1093f88d59bfc6c548/coverage-7.11.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d06f4fc7acf3cabd6d74941d53329e06bab00a8fe10e4df2714f0b134bfc64ef", size = 258114, upload-time = "2025-10-15T15:14:03.907Z" }, - { url = "https://files.pythonhosted.org/packages/0b/8b/918f0e15f0365d50d3986bbd3338ca01178717ac5678301f3f547b6619e6/coverage-7.11.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:6fbcee1a8f056af07ecd344482f711f563a9eb1c2cad192e87df00338ec3cdb0", size = 256723, upload-time = "2025-10-15T15:14:06.324Z" }, - { url = "https://files.pythonhosted.org/packages/44/9e/7776829f82d3cf630878a7965a7d70cc6ca94f22c7d20ec4944f7148cb46/coverage-7.11.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dbbf012be5f32533a490709ad597ad8a8ff80c582a95adc8d62af664e532f9ca", size = 259238, upload-time = "2025-10-15T15:14:08.002Z" }, - { url = "https://files.pythonhosted.org/packages/9a/b8/49cf253e1e7a3bedb85199b201862dd7ca4859f75b6cf25ffa7298aa0760/coverage-7.11.0-cp313-cp313t-win32.whl", hash = "sha256:cee6291bb4fed184f1c2b663606a115c743df98a537c969c3c64b49989da96c2", size = 219180, upload-time = "2025-10-15T15:14:09.786Z" }, - { url = "https://files.pythonhosted.org/packages/ac/e1/1a541703826be7ae2125a0fb7f821af5729d56bb71e946e7b933cc7a89a4/coverage-7.11.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a386c1061bf98e7ea4758e4313c0ab5ecf57af341ef0f43a0bf26c2477b5c268", size = 220241, upload-time = "2025-10-15T15:14:11.471Z" }, - { url = "https://files.pythonhosted.org/packages/d5/d1/5ee0e0a08621140fd418ec4020f595b4d52d7eb429ae6a0c6542b4ba6f14/coverage-7.11.0-cp313-cp313t-win_arm64.whl", hash = "sha256:f9ea02ef40bb83823b2b04964459d281688fe173e20643870bb5d2edf68bc836", size = 218510, upload-time = "2025-10-15T15:14:13.46Z" }, - { url = "https://files.pythonhosted.org/packages/f4/06/e923830c1985ce808e40a3fa3eb46c13350b3224b7da59757d37b6ce12b8/coverage-7.11.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c770885b28fb399aaf2a65bbd1c12bf6f307ffd112d6a76c5231a94276f0c497", size = 216110, upload-time = "2025-10-15T15:14:15.157Z" }, - { url = "https://files.pythonhosted.org/packages/42/82/cdeed03bfead45203fb651ed756dfb5266028f5f939e7f06efac4041dad5/coverage-7.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a3d0e2087dba64c86a6b254f43e12d264b636a39e88c5cc0a01a7c71bcfdab7e", size = 216395, upload-time = "2025-10-15T15:14:16.863Z" }, - { url = "https://files.pythonhosted.org/packages/fc/ba/e1c80caffc3199aa699813f73ff097bc2df7b31642bdbc7493600a8f1de5/coverage-7.11.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:73feb83bb41c32811973b8565f3705caf01d928d972b72042b44e97c71fd70d1", size = 247433, upload-time = "2025-10-15T15:14:18.589Z" }, - { url = "https://files.pythonhosted.org/packages/80/c0/5b259b029694ce0a5bbc1548834c7ba3db41d3efd3474489d7efce4ceb18/coverage-7.11.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c6f31f281012235ad08f9a560976cc2fc9c95c17604ff3ab20120fe480169bca", size = 249970, upload-time = "2025-10-15T15:14:20.307Z" }, - { url = "https://files.pythonhosted.org/packages/8c/86/171b2b5e1aac7e2fd9b43f7158b987dbeb95f06d1fbecad54ad8163ae3e8/coverage-7.11.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9570ad567f880ef675673992222746a124b9595506826b210fbe0ce3f0499cd", size = 251324, upload-time = "2025-10-15T15:14:22.419Z" }, - { url = "https://files.pythonhosted.org/packages/1a/7e/7e10414d343385b92024af3932a27a1caf75c6e27ee88ba211221ff1a145/coverage-7.11.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8badf70446042553a773547a61fecaa734b55dc738cacf20c56ab04b77425e43", size = 247445, upload-time = "2025-10-15T15:14:24.205Z" }, - { url = "https://files.pythonhosted.org/packages/c4/3b/e4f966b21f5be8c4bf86ad75ae94efa0de4c99c7bbb8114476323102e345/coverage-7.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a09c1211959903a479e389685b7feb8a17f59ec5a4ef9afde7650bd5eabc2777", size = 249324, upload-time = "2025-10-15T15:14:26.234Z" }, - { url = "https://files.pythonhosted.org/packages/00/a2/8479325576dfcd909244d0df215f077f47437ab852ab778cfa2f8bf4d954/coverage-7.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:5ef83b107f50db3f9ae40f69e34b3bd9337456c5a7fe3461c7abf8b75dd666a2", size = 247261, upload-time = "2025-10-15T15:14:28.42Z" }, - { url = "https://files.pythonhosted.org/packages/7b/d8/3a9e2db19d94d65771d0f2e21a9ea587d11b831332a73622f901157cc24b/coverage-7.11.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f91f927a3215b8907e214af77200250bb6aae36eca3f760f89780d13e495388d", size = 247092, upload-time = "2025-10-15T15:14:30.784Z" }, - { url = "https://files.pythonhosted.org/packages/b3/b1/bbca3c472544f9e2ad2d5116b2379732957048be4b93a9c543fcd0207e5f/coverage-7.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cdbcd376716d6b7fbfeedd687a6c4be019c5a5671b35f804ba76a4c0a778cba4", size = 248755, upload-time = "2025-10-15T15:14:32.585Z" }, - { url = "https://files.pythonhosted.org/packages/89/49/638d5a45a6a0f00af53d6b637c87007eb2297042186334e9923a61aa8854/coverage-7.11.0-cp314-cp314-win32.whl", hash = "sha256:bab7ec4bb501743edc63609320aaec8cd9188b396354f482f4de4d40a9d10721", size = 218793, upload-time = "2025-10-15T15:14:34.972Z" }, - { url = "https://files.pythonhosted.org/packages/30/cc/b675a51f2d068adb3cdf3799212c662239b0ca27f4691d1fff81b92ea850/coverage-7.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:3d4ba9a449e9364a936a27322b20d32d8b166553bfe63059bd21527e681e2fad", size = 219587, upload-time = "2025-10-15T15:14:37.047Z" }, - { url = "https://files.pythonhosted.org/packages/93/98/5ac886876026de04f00820e5094fe22166b98dcb8b426bf6827aaf67048c/coverage-7.11.0-cp314-cp314-win_arm64.whl", hash = "sha256:ce37f215223af94ef0f75ac68ea096f9f8e8c8ec7d6e8c346ee45c0d363f0479", size = 218168, upload-time = "2025-10-15T15:14:38.861Z" }, - { url = "https://files.pythonhosted.org/packages/14/d1/b4145d35b3e3ecf4d917e97fc8895bcf027d854879ba401d9ff0f533f997/coverage-7.11.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:f413ce6e07e0d0dc9c433228727b619871532674b45165abafe201f200cc215f", size = 216850, upload-time = "2025-10-15T15:14:40.651Z" }, - { url = "https://files.pythonhosted.org/packages/ca/d1/7f645fc2eccd318369a8a9948acc447bb7c1ade2911e31d3c5620544c22b/coverage-7.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:05791e528a18f7072bf5998ba772fe29db4da1234c45c2087866b5ba4dea710e", size = 217071, upload-time = "2025-10-15T15:14:42.755Z" }, - { url = "https://files.pythonhosted.org/packages/54/7d/64d124649db2737ceced1dfcbdcb79898d5868d311730f622f8ecae84250/coverage-7.11.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cacb29f420cfeb9283b803263c3b9a068924474ff19ca126ba9103e1278dfa44", size = 258570, upload-time = "2025-10-15T15:14:44.542Z" }, - { url = "https://files.pythonhosted.org/packages/6c/3f/6f5922f80dc6f2d8b2c6f974835c43f53eb4257a7797727e6ca5b7b2ec1f/coverage-7.11.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:314c24e700d7027ae3ab0d95fbf8d53544fca1f20345fd30cd219b737c6e58d3", size = 260738, upload-time = "2025-10-15T15:14:46.436Z" }, - { url = "https://files.pythonhosted.org/packages/0e/5f/9e883523c4647c860b3812b417a2017e361eca5b635ee658387dc11b13c1/coverage-7.11.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:630d0bd7a293ad2fc8b4b94e5758c8b2536fdf36c05f1681270203e463cbfa9b", size = 262994, upload-time = "2025-10-15T15:14:48.3Z" }, - { url = "https://files.pythonhosted.org/packages/07/bb/43b5a8e94c09c8bf51743ffc65c4c841a4ca5d3ed191d0a6919c379a1b83/coverage-7.11.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e89641f5175d65e2dbb44db15fe4ea48fade5d5bbb9868fdc2b4fce22f4a469d", size = 257282, upload-time = "2025-10-15T15:14:50.236Z" }, - { url = "https://files.pythonhosted.org/packages/aa/e5/0ead8af411411330b928733e1d201384b39251a5f043c1612970310e8283/coverage-7.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c9f08ea03114a637dab06cedb2e914da9dc67fa52c6015c018ff43fdde25b9c2", size = 260430, upload-time = "2025-10-15T15:14:52.413Z" }, - { url = "https://files.pythonhosted.org/packages/ae/66/03dd8bb0ba5b971620dcaac145461950f6d8204953e535d2b20c6b65d729/coverage-7.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce9f3bde4e9b031eaf1eb61df95c1401427029ea1bfddb8621c1161dcb0fa02e", size = 258190, upload-time = "2025-10-15T15:14:54.268Z" }, - { url = "https://files.pythonhosted.org/packages/45/ae/28a9cce40bf3174426cb2f7e71ee172d98e7f6446dff936a7ccecee34b14/coverage-7.11.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:e4dc07e95495923d6fd4d6c27bf70769425b71c89053083843fd78f378558996", size = 256658, upload-time = "2025-10-15T15:14:56.436Z" }, - { url = "https://files.pythonhosted.org/packages/5c/7c/3a44234a8599513684bfc8684878fd7b126c2760f79712bb78c56f19efc4/coverage-7.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:424538266794db2861db4922b05d729ade0940ee69dcf0591ce8f69784db0e11", size = 259342, upload-time = "2025-10-15T15:14:58.538Z" }, - { url = "https://files.pythonhosted.org/packages/e1/e6/0108519cba871af0351725ebdb8660fd7a0fe2ba3850d56d32490c7d9b4b/coverage-7.11.0-cp314-cp314t-win32.whl", hash = "sha256:4c1eeb3fb8eb9e0190bebafd0462936f75717687117339f708f395fe455acc73", size = 219568, upload-time = "2025-10-15T15:15:00.382Z" }, - { url = "https://files.pythonhosted.org/packages/c9/76/44ba876e0942b4e62fdde23ccb029ddb16d19ba1bef081edd00857ba0b16/coverage-7.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b56efee146c98dbf2cf5cffc61b9829d1e94442df4d7398b26892a53992d3547", size = 220687, upload-time = "2025-10-15T15:15:02.322Z" }, - { url = "https://files.pythonhosted.org/packages/b9/0c/0df55ecb20d0d0ed5c322e10a441775e1a3a5d78c60f0c4e1abfe6fcf949/coverage-7.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:b5c2705afa83f49bd91962a4094b6b082f94aef7626365ab3f8f4bd159c5acf3", size = 218711, upload-time = "2025-10-15T15:15:04.575Z" }, - { url = "https://files.pythonhosted.org/packages/5f/04/642c1d8a448ae5ea1369eac8495740a79eb4e581a9fb0cbdce56bbf56da1/coverage-7.11.0-py3-none-any.whl", hash = "sha256:4b7589765348d78fb4e5fb6ea35d07564e387da2fc5efff62e0222971f155f68", size = 207761, upload-time = "2025-10-15T15:15:06.439Z" }, +version = "7.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/89/26/4a96807b193b011588099c3b5c89fbb05294e5b90e71018e065465f34eb6/coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c", size = 819341, upload-time = "2025-11-18T13:34:20.766Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/4a/0dc3de1c172d35abe512332cfdcc43211b6ebce629e4cc42e6cd25ed8f4d/coverage-7.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:32b75c2ba3f324ee37af3ccee5b30458038c50b349ad9b88cee85096132a575b", size = 217409, upload-time = "2025-11-18T13:31:53.122Z" }, + { url = "https://files.pythonhosted.org/packages/01/c3/086198b98db0109ad4f84241e8e9ea7e5fb2db8c8ffb787162d40c26cc76/coverage-7.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb2a1b6ab9fe833714a483a915de350abc624a37149649297624c8d57add089c", size = 217927, upload-time = "2025-11-18T13:31:54.458Z" }, + { url = "https://files.pythonhosted.org/packages/5d/5f/34614dbf5ce0420828fc6c6f915126a0fcb01e25d16cf141bf5361e6aea6/coverage-7.12.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5734b5d913c3755e72f70bf6cc37a0518d4f4745cde760c5d8e12005e62f9832", size = 244678, upload-time = "2025-11-18T13:31:55.805Z" }, + { url = "https://files.pythonhosted.org/packages/55/7b/6b26fb32e8e4a6989ac1d40c4e132b14556131493b1d06bc0f2be169c357/coverage-7.12.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b527a08cdf15753279b7afb2339a12073620b761d79b81cbe2cdebdb43d90daa", size = 246507, upload-time = "2025-11-18T13:31:57.05Z" }, + { url = "https://files.pythonhosted.org/packages/06/42/7d70e6603d3260199b90fb48b537ca29ac183d524a65cc31366b2e905fad/coverage-7.12.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9bb44c889fb68004e94cab71f6a021ec83eac9aeabdbb5a5a88821ec46e1da73", size = 248366, upload-time = "2025-11-18T13:31:58.362Z" }, + { url = "https://files.pythonhosted.org/packages/2d/4a/d86b837923878424c72458c5b25e899a3c5ca73e663082a915f5b3c4d749/coverage-7.12.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4b59b501455535e2e5dde5881739897967b272ba25988c89145c12d772810ccb", size = 245366, upload-time = "2025-11-18T13:31:59.572Z" }, + { url = "https://files.pythonhosted.org/packages/e6/c2/2adec557e0aa9721875f06ced19730fdb7fc58e31b02b5aa56f2ebe4944d/coverage-7.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d8842f17095b9868a05837b7b1b73495293091bed870e099521ada176aa3e00e", size = 246408, upload-time = "2025-11-18T13:32:00.784Z" }, + { url = "https://files.pythonhosted.org/packages/5a/4b/8bd1f1148260df11c618e535fdccd1e5aaf646e55b50759006a4f41d8a26/coverage-7.12.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c5a6f20bf48b8866095c6820641e7ffbe23f2ac84a2efc218d91235e404c7777", size = 244416, upload-time = "2025-11-18T13:32:01.963Z" }, + { url = "https://files.pythonhosted.org/packages/0e/13/3a248dd6a83df90414c54a4e121fd081fb20602ca43955fbe1d60e2312a9/coverage-7.12.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5f3738279524e988d9da2893f307c2093815c623f8d05a8f79e3eff3a7a9e553", size = 244681, upload-time = "2025-11-18T13:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/76/30/aa833827465a5e8c938935f5d91ba055f70516941078a703740aaf1aa41f/coverage-7.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0d68c1f7eabbc8abe582d11fa393ea483caf4f44b0af86881174769f185c94d", size = 245300, upload-time = "2025-11-18T13:32:04.686Z" }, + { url = "https://files.pythonhosted.org/packages/38/24/f85b3843af1370fb3739fa7571819b71243daa311289b31214fe3e8c9d68/coverage-7.12.0-cp310-cp310-win32.whl", hash = "sha256:7670d860e18b1e3ee5930b17a7d55ae6287ec6e55d9799982aa103a2cc1fa2ef", size = 220008, upload-time = "2025-11-18T13:32:05.806Z" }, + { url = "https://files.pythonhosted.org/packages/3a/a2/c7da5b9566f7164db9eefa133d17761ecb2c2fde9385d754e5b5c80f710d/coverage-7.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:f999813dddeb2a56aab5841e687b68169da0d3f6fc78ccf50952fa2463746022", size = 220943, upload-time = "2025-11-18T13:32:07.166Z" }, + { url = "https://files.pythonhosted.org/packages/5a/0c/0dfe7f0487477d96432e4815537263363fb6dd7289743a796e8e51eabdf2/coverage-7.12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa124a3683d2af98bd9d9c2bfa7a5076ca7e5ab09fdb96b81fa7d89376ae928f", size = 217535, upload-time = "2025-11-18T13:32:08.812Z" }, + { url = "https://files.pythonhosted.org/packages/9b/f5/f9a4a053a5bbff023d3bec259faac8f11a1e5a6479c2ccf586f910d8dac7/coverage-7.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d93fbf446c31c0140208dcd07c5d882029832e8ed7891a39d6d44bd65f2316c3", size = 218044, upload-time = "2025-11-18T13:32:10.329Z" }, + { url = "https://files.pythonhosted.org/packages/95/c5/84fc3697c1fa10cd8571919bf9693f693b7373278daaf3b73e328d502bc8/coverage-7.12.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:52ca620260bd8cd6027317bdd8b8ba929be1d741764ee765b42c4d79a408601e", size = 248440, upload-time = "2025-11-18T13:32:12.536Z" }, + { url = "https://files.pythonhosted.org/packages/f4/36/2d93fbf6a04670f3874aed397d5a5371948a076e3249244a9e84fb0e02d6/coverage-7.12.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f3433ffd541380f3a0e423cff0f4926d55b0cc8c1d160fdc3be24a4c03aa65f7", size = 250361, upload-time = "2025-11-18T13:32:13.852Z" }, + { url = "https://files.pythonhosted.org/packages/5d/49/66dc65cc456a6bfc41ea3d0758c4afeaa4068a2b2931bf83be6894cf1058/coverage-7.12.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f7bbb321d4adc9f65e402c677cd1c8e4c2d0105d3ce285b51b4d87f1d5db5245", size = 252472, upload-time = "2025-11-18T13:32:15.068Z" }, + { url = "https://files.pythonhosted.org/packages/35/1f/ebb8a18dffd406db9fcd4b3ae42254aedcaf612470e8712f12041325930f/coverage-7.12.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:22a7aade354a72dff3b59c577bfd18d6945c61f97393bc5fb7bd293a4237024b", size = 248592, upload-time = "2025-11-18T13:32:16.328Z" }, + { url = "https://files.pythonhosted.org/packages/da/a8/67f213c06e5ea3b3d4980df7dc344d7fea88240b5fe878a5dcbdfe0e2315/coverage-7.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3ff651dcd36d2fea66877cd4a82de478004c59b849945446acb5baf9379a1b64", size = 250167, upload-time = "2025-11-18T13:32:17.687Z" }, + { url = "https://files.pythonhosted.org/packages/f0/00/e52aef68154164ea40cc8389c120c314c747fe63a04b013a5782e989b77f/coverage-7.12.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:31b8b2e38391a56e3cea39d22a23faaa7c3fc911751756ef6d2621d2a9daf742", size = 248238, upload-time = "2025-11-18T13:32:19.2Z" }, + { url = "https://files.pythonhosted.org/packages/1f/a4/4d88750bcf9d6d66f77865e5a05a20e14db44074c25fd22519777cb69025/coverage-7.12.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:297bc2da28440f5ae51c845a47c8175a4db0553a53827886e4fb25c66633000c", size = 247964, upload-time = "2025-11-18T13:32:21.027Z" }, + { url = "https://files.pythonhosted.org/packages/a7/6b/b74693158899d5b47b0bf6238d2c6722e20ba749f86b74454fac0696bb00/coverage-7.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ff7651cc01a246908eac162a6a86fc0dbab6de1ad165dfb9a1e2ec660b44984", size = 248862, upload-time = "2025-11-18T13:32:22.304Z" }, + { url = "https://files.pythonhosted.org/packages/18/de/6af6730227ce0e8ade307b1cc4a08e7f51b419a78d02083a86c04ccceb29/coverage-7.12.0-cp311-cp311-win32.whl", hash = "sha256:313672140638b6ddb2c6455ddeda41c6a0b208298034544cfca138978c6baed6", size = 220033, upload-time = "2025-11-18T13:32:23.714Z" }, + { url = "https://files.pythonhosted.org/packages/e2/a1/e7f63021a7c4fe20994359fcdeae43cbef4a4d0ca36a5a1639feeea5d9e1/coverage-7.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:a1783ed5bd0d5938d4435014626568dc7f93e3cb99bc59188cc18857c47aa3c4", size = 220966, upload-time = "2025-11-18T13:32:25.599Z" }, + { url = "https://files.pythonhosted.org/packages/77/e8/deae26453f37c20c3aa0c4433a1e32cdc169bf415cce223a693117aa3ddd/coverage-7.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:4648158fd8dd9381b5847622df1c90ff314efbfc1df4550092ab6013c238a5fc", size = 219637, upload-time = "2025-11-18T13:32:27.265Z" }, + { url = "https://files.pythonhosted.org/packages/02/bf/638c0427c0f0d47638242e2438127f3c8ee3cfc06c7fdeb16778ed47f836/coverage-7.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:29644c928772c78512b48e14156b81255000dcfd4817574ff69def189bcb3647", size = 217704, upload-time = "2025-11-18T13:32:28.906Z" }, + { url = "https://files.pythonhosted.org/packages/08/e1/706fae6692a66c2d6b871a608bbde0da6281903fa0e9f53a39ed441da36a/coverage-7.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8638cbb002eaa5d7c8d04da667813ce1067080b9a91099801a0053086e52b736", size = 218064, upload-time = "2025-11-18T13:32:30.161Z" }, + { url = "https://files.pythonhosted.org/packages/a9/8b/eb0231d0540f8af3ffda39720ff43cb91926489d01524e68f60e961366e4/coverage-7.12.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:083631eeff5eb9992c923e14b810a179798bb598e6a0dd60586819fc23be6e60", size = 249560, upload-time = "2025-11-18T13:32:31.835Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/67fb52af642e974d159b5b379e4d4c59d0ebe1288677fbd04bbffe665a82/coverage-7.12.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:99d5415c73ca12d558e07776bd957c4222c687b9f1d26fa0e1b57e3598bdcde8", size = 252318, upload-time = "2025-11-18T13:32:33.178Z" }, + { url = "https://files.pythonhosted.org/packages/41/e5/38228f31b2c7665ebf9bdfdddd7a184d56450755c7e43ac721c11a4b8dab/coverage-7.12.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e949ebf60c717c3df63adb4a1a366c096c8d7fd8472608cd09359e1bd48ef59f", size = 253403, upload-time = "2025-11-18T13:32:34.45Z" }, + { url = "https://files.pythonhosted.org/packages/ec/4b/df78e4c8188f9960684267c5a4897836f3f0f20a20c51606ee778a1d9749/coverage-7.12.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d907ddccbca819afa2cd014bc69983b146cca2735a0b1e6259b2a6c10be1e70", size = 249984, upload-time = "2025-11-18T13:32:35.747Z" }, + { url = "https://files.pythonhosted.org/packages/ba/51/bb163933d195a345c6f63eab9e55743413d064c291b6220df754075c2769/coverage-7.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b1518ecbad4e6173f4c6e6c4a46e49555ea5679bf3feda5edb1b935c7c44e8a0", size = 251339, upload-time = "2025-11-18T13:32:37.352Z" }, + { url = "https://files.pythonhosted.org/packages/15/40/c9b29cdb8412c837cdcbc2cfa054547dd83affe6cbbd4ce4fdb92b6ba7d1/coverage-7.12.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:51777647a749abdf6f6fd8c7cffab12de68ab93aab15efc72fbbb83036c2a068", size = 249489, upload-time = "2025-11-18T13:32:39.212Z" }, + { url = "https://files.pythonhosted.org/packages/c8/da/b3131e20ba07a0de4437a50ef3b47840dfabf9293675b0cd5c2c7f66dd61/coverage-7.12.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:42435d46d6461a3b305cdfcad7cdd3248787771f53fe18305548cba474e6523b", size = 249070, upload-time = "2025-11-18T13:32:40.598Z" }, + { url = "https://files.pythonhosted.org/packages/70/81/b653329b5f6302c08d683ceff6785bc60a34be9ae92a5c7b63ee7ee7acec/coverage-7.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5bcead88c8423e1855e64b8057d0544e33e4080b95b240c2a355334bb7ced937", size = 250929, upload-time = "2025-11-18T13:32:42.915Z" }, + { url = "https://files.pythonhosted.org/packages/a3/00/250ac3bca9f252a5fb1338b5ad01331ebb7b40223f72bef5b1b2cb03aa64/coverage-7.12.0-cp312-cp312-win32.whl", hash = "sha256:dcbb630ab034e86d2a0f79aefd2be07e583202f41e037602d438c80044957baa", size = 220241, upload-time = "2025-11-18T13:32:44.665Z" }, + { url = "https://files.pythonhosted.org/packages/64/1c/77e79e76d37ce83302f6c21980b45e09f8aa4551965213a10e62d71ce0ab/coverage-7.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:2fd8354ed5d69775ac42986a691fbf68b4084278710cee9d7c3eaa0c28fa982a", size = 221051, upload-time = "2025-11-18T13:32:46.008Z" }, + { url = "https://files.pythonhosted.org/packages/31/f5/641b8a25baae564f9e52cac0e2667b123de961985709a004e287ee7663cc/coverage-7.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:737c3814903be30695b2de20d22bcc5428fdae305c61ba44cdc8b3252984c49c", size = 219692, upload-time = "2025-11-18T13:32:47.372Z" }, + { url = "https://files.pythonhosted.org/packages/b8/14/771700b4048774e48d2c54ed0c674273702713c9ee7acdfede40c2666747/coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941", size = 217725, upload-time = "2025-11-18T13:32:49.22Z" }, + { url = "https://files.pythonhosted.org/packages/17/a7/3aa4144d3bcb719bf67b22d2d51c2d577bf801498c13cb08f64173e80497/coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a", size = 218098, upload-time = "2025-11-18T13:32:50.78Z" }, + { url = "https://files.pythonhosted.org/packages/fc/9c/b846bbc774ff81091a12a10203e70562c91ae71badda00c5ae5b613527b1/coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d", size = 249093, upload-time = "2025-11-18T13:32:52.554Z" }, + { url = "https://files.pythonhosted.org/packages/76/b6/67d7c0e1f400b32c883e9342de4a8c2ae7c1a0b57c5de87622b7262e2309/coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211", size = 251686, upload-time = "2025-11-18T13:32:54.862Z" }, + { url = "https://files.pythonhosted.org/packages/cc/75/b095bd4b39d49c3be4bffbb3135fea18a99a431c52dd7513637c0762fecb/coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d", size = 252930, upload-time = "2025-11-18T13:32:56.417Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f3/466f63015c7c80550bead3093aacabf5380c1220a2a93c35d374cae8f762/coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c", size = 249296, upload-time = "2025-11-18T13:32:58.074Z" }, + { url = "https://files.pythonhosted.org/packages/27/86/eba2209bf2b7e28c68698fc13437519a295b2d228ba9e0ec91673e09fa92/coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9", size = 251068, upload-time = "2025-11-18T13:32:59.646Z" }, + { url = "https://files.pythonhosted.org/packages/ec/55/ca8ae7dbba962a3351f18940b359b94c6bafdd7757945fdc79ec9e452dc7/coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0", size = 249034, upload-time = "2025-11-18T13:33:01.481Z" }, + { url = "https://files.pythonhosted.org/packages/7a/d7/39136149325cad92d420b023b5fd900dabdd1c3a0d1d5f148ef4a8cedef5/coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508", size = 248853, upload-time = "2025-11-18T13:33:02.935Z" }, + { url = "https://files.pythonhosted.org/packages/fe/b6/76e1add8b87ef60e00643b0b7f8f7bb73d4bf5249a3be19ebefc5793dd25/coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc", size = 250619, upload-time = "2025-11-18T13:33:04.336Z" }, + { url = "https://files.pythonhosted.org/packages/95/87/924c6dc64f9203f7a3c1832a6a0eee5a8335dbe5f1bdadcc278d6f1b4d74/coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8", size = 220261, upload-time = "2025-11-18T13:33:06.493Z" }, + { url = "https://files.pythonhosted.org/packages/91/77/dd4aff9af16ff776bf355a24d87eeb48fc6acde54c907cc1ea89b14a8804/coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07", size = 221072, upload-time = "2025-11-18T13:33:07.926Z" }, + { url = "https://files.pythonhosted.org/packages/70/49/5c9dc46205fef31b1b226a6e16513193715290584317fd4df91cdaf28b22/coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc", size = 219702, upload-time = "2025-11-18T13:33:09.631Z" }, + { url = "https://files.pythonhosted.org/packages/9b/62/f87922641c7198667994dd472a91e1d9b829c95d6c29529ceb52132436ad/coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87", size = 218420, upload-time = "2025-11-18T13:33:11.153Z" }, + { url = "https://files.pythonhosted.org/packages/85/dd/1cc13b2395ef15dbb27d7370a2509b4aee77890a464fb35d72d428f84871/coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6", size = 218773, upload-time = "2025-11-18T13:33:12.569Z" }, + { url = "https://files.pythonhosted.org/packages/74/40/35773cc4bb1e9d4658d4fb669eb4195b3151bef3bbd6f866aba5cd5dac82/coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7", size = 260078, upload-time = "2025-11-18T13:33:14.037Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/231bb1a6ffc2905e396557585ebc6bdc559e7c66708376d245a1f1d330fc/coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560", size = 262144, upload-time = "2025-11-18T13:33:15.601Z" }, + { url = "https://files.pythonhosted.org/packages/28/be/32f4aa9f3bf0b56f3971001b56508352c7753915345d45fab4296a986f01/coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12", size = 264574, upload-time = "2025-11-18T13:33:17.354Z" }, + { url = "https://files.pythonhosted.org/packages/68/7c/00489fcbc2245d13ab12189b977e0cf06ff3351cb98bc6beba8bd68c5902/coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296", size = 259298, upload-time = "2025-11-18T13:33:18.958Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/f0760d65d56c3bea95b449e02570d4abd2549dc784bf39a2d4721a2d8ceb/coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507", size = 262150, upload-time = "2025-11-18T13:33:20.644Z" }, + { url = "https://files.pythonhosted.org/packages/c5/71/9a9314df00f9326d78c1e5a910f520d599205907432d90d1c1b7a97aa4b1/coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d", size = 259763, upload-time = "2025-11-18T13:33:22.189Z" }, + { url = "https://files.pythonhosted.org/packages/10/34/01a0aceed13fbdf925876b9a15d50862eb8845454301fe3cdd1df08b2182/coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2", size = 258653, upload-time = "2025-11-18T13:33:24.239Z" }, + { url = "https://files.pythonhosted.org/packages/8d/04/81d8fd64928acf1574bbb0181f66901c6c1c6279c8ccf5f84259d2c68ae9/coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455", size = 260856, upload-time = "2025-11-18T13:33:26.365Z" }, + { url = "https://files.pythonhosted.org/packages/f2/76/fa2a37bfaeaf1f766a2d2360a25a5297d4fb567098112f6517475eee120b/coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d", size = 220936, upload-time = "2025-11-18T13:33:28.165Z" }, + { url = "https://files.pythonhosted.org/packages/f9/52/60f64d932d555102611c366afb0eb434b34266b1d9266fc2fe18ab641c47/coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c", size = 222001, upload-time = "2025-11-18T13:33:29.656Z" }, + { url = "https://files.pythonhosted.org/packages/77/df/c303164154a5a3aea7472bf323b7c857fed93b26618ed9fc5c2955566bb0/coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d", size = 220273, upload-time = "2025-11-18T13:33:31.415Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2e/fc12db0883478d6e12bbd62d481210f0c8daf036102aa11434a0c5755825/coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92", size = 217777, upload-time = "2025-11-18T13:33:32.86Z" }, + { url = "https://files.pythonhosted.org/packages/1f/c1/ce3e525d223350c6ec16b9be8a057623f54226ef7f4c2fee361ebb6a02b8/coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360", size = 218100, upload-time = "2025-11-18T13:33:34.532Z" }, + { url = "https://files.pythonhosted.org/packages/15/87/113757441504aee3808cb422990ed7c8bcc2d53a6779c66c5adef0942939/coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac", size = 249151, upload-time = "2025-11-18T13:33:36.135Z" }, + { url = "https://files.pythonhosted.org/packages/d9/1d/9529d9bd44049b6b05bb319c03a3a7e4b0a8a802d28fa348ad407e10706d/coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d", size = 251667, upload-time = "2025-11-18T13:33:37.996Z" }, + { url = "https://files.pythonhosted.org/packages/11/bb/567e751c41e9c03dc29d3ce74b8c89a1e3396313e34f255a2a2e8b9ebb56/coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c", size = 253003, upload-time = "2025-11-18T13:33:39.553Z" }, + { url = "https://files.pythonhosted.org/packages/e4/b3/c2cce2d8526a02fb9e9ca14a263ca6fc074449b33a6afa4892838c903528/coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434", size = 249185, upload-time = "2025-11-18T13:33:42.086Z" }, + { url = "https://files.pythonhosted.org/packages/0e/a7/967f93bb66e82c9113c66a8d0b65ecf72fc865adfba5a145f50c7af7e58d/coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc", size = 251025, upload-time = "2025-11-18T13:33:43.634Z" }, + { url = "https://files.pythonhosted.org/packages/b9/b2/f2f6f56337bc1af465d5b2dc1ee7ee2141b8b9272f3bf6213fcbc309a836/coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc", size = 248979, upload-time = "2025-11-18T13:33:46.04Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7a/bf4209f45a4aec09d10a01a57313a46c0e0e8f4c55ff2965467d41a92036/coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e", size = 248800, upload-time = "2025-11-18T13:33:47.546Z" }, + { url = "https://files.pythonhosted.org/packages/b8/b7/1e01b8696fb0521810f60c5bbebf699100d6754183e6cc0679bf2ed76531/coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17", size = 250460, upload-time = "2025-11-18T13:33:49.537Z" }, + { url = "https://files.pythonhosted.org/packages/71/ae/84324fb9cb46c024760e706353d9b771a81b398d117d8c1fe010391c186f/coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933", size = 220533, upload-time = "2025-11-18T13:33:51.16Z" }, + { url = "https://files.pythonhosted.org/packages/e2/71/1033629deb8460a8f97f83e6ac4ca3b93952e2b6f826056684df8275e015/coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe", size = 221348, upload-time = "2025-11-18T13:33:52.776Z" }, + { url = "https://files.pythonhosted.org/packages/0a/5f/ac8107a902f623b0c251abdb749be282dc2ab61854a8a4fcf49e276fce2f/coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d", size = 219922, upload-time = "2025-11-18T13:33:54.316Z" }, + { url = "https://files.pythonhosted.org/packages/79/6e/f27af2d4da367f16077d21ef6fe796c874408219fa6dd3f3efe7751bd910/coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d", size = 218511, upload-time = "2025-11-18T13:33:56.343Z" }, + { url = "https://files.pythonhosted.org/packages/67/dd/65fd874aa460c30da78f9d259400d8e6a4ef457d61ab052fd248f0050558/coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03", size = 218771, upload-time = "2025-11-18T13:33:57.966Z" }, + { url = "https://files.pythonhosted.org/packages/55/e0/7c6b71d327d8068cb79c05f8f45bf1b6145f7a0de23bbebe63578fe5240a/coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9", size = 260151, upload-time = "2025-11-18T13:33:59.597Z" }, + { url = "https://files.pythonhosted.org/packages/49/ce/4697457d58285b7200de6b46d606ea71066c6e674571a946a6ea908fb588/coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6", size = 262257, upload-time = "2025-11-18T13:34:01.166Z" }, + { url = "https://files.pythonhosted.org/packages/2f/33/acbc6e447aee4ceba88c15528dbe04a35fb4d67b59d393d2e0d6f1e242c1/coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339", size = 264671, upload-time = "2025-11-18T13:34:02.795Z" }, + { url = "https://files.pythonhosted.org/packages/87/ec/e2822a795c1ed44d569980097be839c5e734d4c0c1119ef8e0a073496a30/coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e", size = 259231, upload-time = "2025-11-18T13:34:04.397Z" }, + { url = "https://files.pythonhosted.org/packages/72/c5/a7ec5395bb4a49c9b7ad97e63f0c92f6bf4a9e006b1393555a02dae75f16/coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13", size = 262137, upload-time = "2025-11-18T13:34:06.068Z" }, + { url = "https://files.pythonhosted.org/packages/67/0c/02c08858b764129f4ecb8e316684272972e60777ae986f3865b10940bdd6/coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f", size = 259745, upload-time = "2025-11-18T13:34:08.04Z" }, + { url = "https://files.pythonhosted.org/packages/5a/04/4fd32b7084505f3829a8fe45c1a74a7a728cb251aaadbe3bec04abcef06d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1", size = 258570, upload-time = "2025-11-18T13:34:09.676Z" }, + { url = "https://files.pythonhosted.org/packages/48/35/2365e37c90df4f5342c4fa202223744119fe31264ee2924f09f074ea9b6d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b", size = 260899, upload-time = "2025-11-18T13:34:11.259Z" }, + { url = "https://files.pythonhosted.org/packages/05/56/26ab0464ca733fa325e8e71455c58c1c374ce30f7c04cebb88eabb037b18/coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a", size = 221313, upload-time = "2025-11-18T13:34:12.863Z" }, + { url = "https://files.pythonhosted.org/packages/da/1c/017a3e1113ed34d998b27d2c6dba08a9e7cb97d362f0ec988fcd873dcf81/coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291", size = 222423, upload-time = "2025-11-18T13:34:15.14Z" }, + { url = "https://files.pythonhosted.org/packages/4c/36/bcc504fdd5169301b52568802bb1b9cdde2e27a01d39fbb3b4b508ab7c2c/coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384", size = 220459, upload-time = "2025-11-18T13:34:17.222Z" }, + { url = "https://files.pythonhosted.org/packages/ce/a3/43b749004e3c09452e39bb56347a008f0a0668aad37324a99b5c8ca91d9e/coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a", size = 209503, upload-time = "2025-11-18T13:34:18.892Z" }, ] [package.optional-dependencies] @@ -1040,82 +1007,6 @@ toml = [ { name = "tomli", marker = "python_full_version <= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -[[package]] -name = "crc32c" -version = "2.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/66/7e97aa77af7cf6afbff26e3651b564fe41932599bc2d3dce0b2f73d4829a/crc32c-2.8.tar.gz", hash = "sha256:578728964e59c47c356aeeedee6220e021e124b9d3e8631d95d9a5e5f06e261c", size = 48179, upload-time = "2025-10-17T06:20:13.61Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/a0/28b4686a8db0bb0f77970f4c6ccede90d1d5740a1d4b4703bd54c3e75655/crc32c-2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2c0f4eb01fe7c0a3e3f973a418e04d52101bb077dd77626fd80c658ec60aaf95", size = 66321, upload-time = "2025-10-17T06:18:53.543Z" }, - { url = "https://files.pythonhosted.org/packages/76/1f/1697f5b8b770f715ed9b264d79e36b4f77ae0527f81f3c749ef08937a32e/crc32c-2.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6baefcfbca82b1a9678455416da24f18629769a76920c640d5a538620a7d12bb", size = 62985, upload-time = "2025-10-17T06:18:54.97Z" }, - { url = "https://files.pythonhosted.org/packages/e0/e5/333cfa5ffa8d5779733aced2b984b5e5139b4a8ceaa2c6bc563e9a1092f3/crc32c-2.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7f959fcf6c5aad1c4a653ee1a50f05760dab1d1c35d98ec4d7f0f68643f7612", size = 61517, upload-time = "2025-10-17T06:18:55.795Z" }, - { url = "https://files.pythonhosted.org/packages/e1/d8/362a009e8140dd926a153b44d56753e3aa7cb50aca243779a84adadbff11/crc32c-2.8-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9bb678507a4e4cf3f0506607b046ecc4ed1c58a19e08a3fb3c2d25441c480bf1", size = 79385, upload-time = "2025-10-17T06:18:56.598Z" }, - { url = "https://files.pythonhosted.org/packages/4a/9f/0d4ea3aa71ffb15f1285669d23024cc40779388ce32157d339dc2584491c/crc32c-2.8-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1a16f7ffa4c242a909558565567cbba95148603717b53538ea299c98da68e7a9", size = 80965, upload-time = "2025-10-17T06:18:57.384Z" }, - { url = "https://files.pythonhosted.org/packages/20/44/d77657aaca4a2c0283f2356a3da6f8e91b003567bb8f09daaf540cbf192f/crc32c-2.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0184369aad562d801f91f454c81f56b9ecb966f6b96684c4d6cf82fc8741d2ad", size = 79993, upload-time = "2025-10-17T06:18:58.503Z" }, - { url = "https://files.pythonhosted.org/packages/ab/c0/07017a93ebf85d9408028b7e03ef96d5c6bfb14cb77cfe90d35eedcc1501/crc32c-2.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:86d2eeb5f0189bd803720abe7387019328ea34c4acde62999e5723f789bc316b", size = 79243, upload-time = "2025-10-17T06:18:59.273Z" }, - { url = "https://files.pythonhosted.org/packages/c7/1a/b3c5ac4cf2fd1f82395173d0bd8e1a15d09f0bc1eccdf10ea7f8caaccd67/crc32c-2.8-cp310-cp310-win32.whl", hash = "sha256:51da61904a9e753780a2e6011885677d601db1fa840be4b68799643a113e6f08", size = 64888, upload-time = "2025-10-17T06:19:00.089Z" }, - { url = "https://files.pythonhosted.org/packages/b6/f2/60c45fc7bb2221d3c93c7a872e921be591f40d45228fe46f879b1d8c0424/crc32c-2.8-cp310-cp310-win_amd64.whl", hash = "sha256:b2d6a1f2500daaf2e4b08f97ad0349aa2eff5faaaa5fd3350314a26eade334cd", size = 66639, upload-time = "2025-10-17T06:19:00.974Z" }, - { url = "https://files.pythonhosted.org/packages/dc/0b/5e03b22d913698e9cc563f39b9f6bbd508606bf6b8e9122cd6bf196b87ea/crc32c-2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e560a97fbb96c9897cb1d9b5076ef12fc12e2e25622530a1afd0de4240f17e1f", size = 66329, upload-time = "2025-10-17T06:19:01.771Z" }, - { url = "https://files.pythonhosted.org/packages/6b/38/2fe0051ffe8c6a650c8b1ac0da31b8802d1dbe5fa40a84e4b6b6f5583db5/crc32c-2.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6762d276d90331a490ef7e71ffee53b9c0eb053bd75a272d786f3b08d3fe3671", size = 62988, upload-time = "2025-10-17T06:19:02.953Z" }, - { url = "https://files.pythonhosted.org/packages/3e/30/5837a71c014be83aba1469c58820d287fc836512a0cad6b8fdd43868accd/crc32c-2.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:60670569f5ede91e39f48fb0cb4060e05b8d8704dd9e17ede930bf441b2f73ef", size = 61522, upload-time = "2025-10-17T06:19:03.796Z" }, - { url = "https://files.pythonhosted.org/packages/ca/29/63972fc1452778e2092ae998c50cbfc2fc93e3fa9798a0278650cd6169c5/crc32c-2.8-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:711743da6ccc70b3c6718c328947b0b6f34a1fe6a6c27cc6c1d69cc226bf70e9", size = 80200, upload-time = "2025-10-17T06:19:04.617Z" }, - { url = "https://files.pythonhosted.org/packages/cb/3a/60eb49d7bdada4122b3ffd45b0df54bdc1b8dd092cda4b069a287bdfcff4/crc32c-2.8-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5eb4094a2054774f13b26f21bf56792bb44fa1fcee6c6ad099387a43ffbfb4fa", size = 81757, upload-time = "2025-10-17T06:19:05.496Z" }, - { url = "https://files.pythonhosted.org/packages/f5/63/6efc1b64429ef7d23bd58b75b7ac24d15df327e3ebbe9c247a0f7b1c2ed1/crc32c-2.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fff15bf2bd3e95780516baae935ed12be88deaa5ebe6143c53eb0d26a7bdc7b7", size = 80830, upload-time = "2025-10-17T06:19:06.621Z" }, - { url = "https://files.pythonhosted.org/packages/e1/eb/0ae9f436f8004f1c88f7429e659a7218a3879bd11a6b18ed1257aad7e98b/crc32c-2.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c0e11e3826668121fa53e0745635baf5e4f0ded437e8ff63ea56f38fc4f970a", size = 80095, upload-time = "2025-10-17T06:19:07.381Z" }, - { url = "https://files.pythonhosted.org/packages/9e/81/4afc9d468977a4cd94a2eb62908553345009a7c0d30e74463a15d4b48ec3/crc32c-2.8-cp311-cp311-win32.whl", hash = "sha256:38f915336715d1f1353ab07d7d786f8a789b119e273aea106ba55355dfc9101d", size = 64886, upload-time = "2025-10-17T06:19:08.497Z" }, - { url = "https://files.pythonhosted.org/packages/d6/e8/94e839c9f7e767bf8479046a207afd440a08f5c59b52586e1af5e64fa4a0/crc32c-2.8-cp311-cp311-win_amd64.whl", hash = "sha256:60e0a765b1caab8d31b2ea80840639253906a9351d4b861551c8c8625ea20f86", size = 66639, upload-time = "2025-10-17T06:19:09.338Z" }, - { url = "https://files.pythonhosted.org/packages/b6/36/fd18ef23c42926b79c7003e16cb0f79043b5b179c633521343d3b499e996/crc32c-2.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:572ffb1b78cce3d88e8d4143e154d31044a44be42cb3f6fbbf77f1e7a941c5ab", size = 66379, upload-time = "2025-10-17T06:19:10.115Z" }, - { url = "https://files.pythonhosted.org/packages/7f/b8/c584958e53f7798dd358f5bdb1bbfc97483134f053ee399d3eeb26cca075/crc32c-2.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cf827b3758ee0c4aacd21ceca0e2da83681f10295c38a10bfeb105f7d98f7a68", size = 63042, upload-time = "2025-10-17T06:19:10.946Z" }, - { url = "https://files.pythonhosted.org/packages/62/e6/6f2af0ec64a668a46c861e5bc778ea3ee42171fedfc5440f791f470fd783/crc32c-2.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:106fbd79013e06fa92bc3b51031694fcc1249811ed4364ef1554ee3dd2c7f5a2", size = 61528, upload-time = "2025-10-17T06:19:11.768Z" }, - { url = "https://files.pythonhosted.org/packages/17/8b/4a04bd80a024f1a23978f19ae99407783e06549e361ab56e9c08bba3c1d3/crc32c-2.8-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6dde035f91ffbfe23163e68605ee5a4bb8ceebd71ed54bb1fb1d0526cdd125a2", size = 80028, upload-time = "2025-10-17T06:19:12.554Z" }, - { url = "https://files.pythonhosted.org/packages/21/8f/01c7afdc76ac2007d0e6a98e7300b4470b170480f8188475b597d1f4b4c6/crc32c-2.8-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e41ebe7c2f0fdcd9f3a3fd206989a36b460b4d3f24816d53e5be6c7dba72c5e1", size = 81531, upload-time = "2025-10-17T06:19:13.406Z" }, - { url = "https://files.pythonhosted.org/packages/32/2b/8f78c5a8cc66486be5f51b6f038fc347c3ba748d3ea68be17a014283c331/crc32c-2.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ecf66cf90266d9c15cea597d5cc86c01917cd1a238dc3c51420c7886fa750d7e", size = 80608, upload-time = "2025-10-17T06:19:14.223Z" }, - { url = "https://files.pythonhosted.org/packages/db/86/fad1a94cdeeeb6b6e2323c87f970186e74bfd6fbfbc247bf5c88ad0873d5/crc32c-2.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:59eee5f3a69ad0793d5fa9cdc9b9d743b0cd50edf7fccc0a3988a821fef0208c", size = 79886, upload-time = "2025-10-17T06:19:15.345Z" }, - { url = "https://files.pythonhosted.org/packages/d5/db/1a7cb6757a1e32376fa2dfce00c815ea4ee614a94f9bff8228e37420c183/crc32c-2.8-cp312-cp312-win32.whl", hash = "sha256:a73d03ce3604aa5d7a2698e9057a0eef69f529c46497b27ee1c38158e90ceb76", size = 64896, upload-time = "2025-10-17T06:19:16.457Z" }, - { url = "https://files.pythonhosted.org/packages/bf/8e/2024de34399b2e401a37dcb54b224b56c747b0dc46de4966886827b4d370/crc32c-2.8-cp312-cp312-win_amd64.whl", hash = "sha256:56b3b7d015247962cf58186e06d18c3d75a1a63d709d3233509e1c50a2d36aa2", size = 66645, upload-time = "2025-10-17T06:19:17.235Z" }, - { url = "https://files.pythonhosted.org/packages/e8/d8/3ae227890b3be40955a7144106ef4dd97d6123a82c2a5310cdab58ca49d8/crc32c-2.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:36f1e03ee9e9c6938e67d3bcb60e36f260170aa5f37da1185e04ef37b56af395", size = 66380, upload-time = "2025-10-17T06:19:18.009Z" }, - { url = "https://files.pythonhosted.org/packages/bd/8b/178d3f987cd0e049b484615512d3f91f3d2caeeb8ff336bb5896ae317438/crc32c-2.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b2f3226b94b85a8dd9b3533601d7a63e9e3e8edf03a8a169830ee8303a199aeb", size = 63048, upload-time = "2025-10-17T06:19:18.853Z" }, - { url = "https://files.pythonhosted.org/packages/f2/a1/48145ae2545ebc0169d3283ebe882da580ea4606bfb67cf4ca922ac3cfc3/crc32c-2.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6e08628bc72d5b6bc8e0730e8f142194b610e780a98c58cb6698e665cb885a5b", size = 61530, upload-time = "2025-10-17T06:19:19.974Z" }, - { url = "https://files.pythonhosted.org/packages/06/4b/cf05ed9d934cc30e5ae22f97c8272face420a476090e736615d9a6b53de0/crc32c-2.8-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:086f64793c5ec856d1ab31a026d52ad2b895ac83d7a38fce557d74eb857f0a82", size = 80001, upload-time = "2025-10-17T06:19:20.784Z" }, - { url = "https://files.pythonhosted.org/packages/15/ab/4b04801739faf36345f6ba1920be5b1c70282fec52f8280afd3613fb13e2/crc32c-2.8-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bcf72ee7e0135b3d941c34bb2c26c3fc6bc207106b49fd89aaafaeae223ae209", size = 81543, upload-time = "2025-10-17T06:19:21.557Z" }, - { url = "https://files.pythonhosted.org/packages/a9/1b/6e38dde5bfd2ea69b7f2ab6ec229fcd972a53d39e2db4efe75c0ac0382ce/crc32c-2.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8a717dd9c3fd777d9bc6603717eae172887d402c4ab589d124ebd0184a83f89e", size = 80644, upload-time = "2025-10-17T06:19:22.325Z" }, - { url = "https://files.pythonhosted.org/packages/ce/45/012176ffee90059ae8ec7131019c71724ea472aa63e72c0c8edbd1fad1d7/crc32c-2.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0450bb845b3c3c7b9bdc0b4e95620ec9a40824abdc8c86d6285c919a90743c1a", size = 79919, upload-time = "2025-10-17T06:19:23.101Z" }, - { url = "https://files.pythonhosted.org/packages/f0/2b/f557629842f9dec2b3461cb3a0d854bb586ec45b814cea58b082c32f0dde/crc32c-2.8-cp313-cp313-win32.whl", hash = "sha256:765d220bfcbcffa6598ac11eb1e10af0ee4802b49fe126aa6bf79f8ddb9931d1", size = 64896, upload-time = "2025-10-17T06:19:23.88Z" }, - { url = "https://files.pythonhosted.org/packages/d0/db/fd0f698c15d1e21d47c64181a98290665a08fcbb3940cd559e9c15bda57e/crc32c-2.8-cp313-cp313-win_amd64.whl", hash = "sha256:171ff0260d112c62abcce29332986950a57bddee514e0a2418bfde493ea06bb3", size = 66646, upload-time = "2025-10-17T06:19:24.702Z" }, - { url = "https://files.pythonhosted.org/packages/db/b9/8e5d7054fe8e7eecab10fd0c8e7ffb01439417bdb6de1d66a81c38fc4a20/crc32c-2.8-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b977a32a3708d6f51703c8557008f190aaa434d7347431efb0e86fcbe78c2a50", size = 66203, upload-time = "2025-10-17T06:19:25.872Z" }, - { url = "https://files.pythonhosted.org/packages/55/5f/cc926c70057a63cc0c98a3c8a896eb15fc7e74d3034eadd53c94917c6cc3/crc32c-2.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7399b01db4adaf41da2fb36fe2408e75a8d82a179a9564ed7619412e427b26d6", size = 62956, upload-time = "2025-10-17T06:19:26.652Z" }, - { url = "https://files.pythonhosted.org/packages/a1/8a/0660c44a2dd2cb6ccbb529eb363b9280f5c766f1017bc8355ed8d695bd94/crc32c-2.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4379f73f9cdad31958a673d11a332ec725ca71572401ca865867229f5f15e853", size = 61442, upload-time = "2025-10-17T06:19:27.74Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5a/6108d2dfc0fe33522ce83ba07aed4b22014911b387afa228808a278e27cd/crc32c-2.8-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2e68264555fab19bab08331550dab58573e351a63ed79c869d455edd3b0aa417", size = 79109, upload-time = "2025-10-17T06:19:28.535Z" }, - { url = "https://files.pythonhosted.org/packages/84/1e/c054f9e390090c197abf3d2936f4f9effaf0c6ee14569ae03d6ddf86958a/crc32c-2.8-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b48f2486727b8d0e7ccbae4a34cb0300498433d2a9d6b49cb13cb57c2e3f19cb", size = 80987, upload-time = "2025-10-17T06:19:29.305Z" }, - { url = "https://files.pythonhosted.org/packages/c8/ad/1650e5c3341e4a485f800ea83116d72965030c5d48ccc168fcc685756e4d/crc32c-2.8-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ecf123348934a086df8c8fde7f9f2d716d523ca0707c5a1367b8bb00d8134823", size = 79994, upload-time = "2025-10-17T06:19:30.109Z" }, - { url = "https://files.pythonhosted.org/packages/d7/3b/f2ed924b177729cbb2ab30ca2902abff653c31d48c95e7b66717a9ca9fcc/crc32c-2.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e636ac60f76de538f7a2c0d0f3abf43104ee83a8f5e516f6345dc283ed1a4df7", size = 79046, upload-time = "2025-10-17T06:19:30.894Z" }, - { url = "https://files.pythonhosted.org/packages/4b/80/413b05ee6ace613208b31b3670c3135ee1cf451f0e72a9c839b4946acc04/crc32c-2.8-cp313-cp313t-win32.whl", hash = "sha256:8dd4a19505e0253892e1b2f1425cc3bd47f79ae5a04cb8800315d00aad7197f2", size = 64837, upload-time = "2025-10-17T06:19:32.03Z" }, - { url = "https://files.pythonhosted.org/packages/3b/1b/85eddb6ac5b38496c4e35c20298aae627970c88c3c624a22ab33e84f16c7/crc32c-2.8-cp313-cp313t-win_amd64.whl", hash = "sha256:4bb18e4bd98fb266596523ffc6be9c5b2387b2fa4e505ec56ca36336f49cb639", size = 66574, upload-time = "2025-10-17T06:19:33.143Z" }, - { url = "https://files.pythonhosted.org/packages/aa/df/50e9079b532ff53dbfc0e66eed781374bd455af02ed5df8b56ad538de4ff/crc32c-2.8-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3a3b2e4bcf7b3ee333050e7d3ff38e2ba46ea205f1d73d8949b248aaffe937ac", size = 66399, upload-time = "2025-10-17T06:19:34.279Z" }, - { url = "https://files.pythonhosted.org/packages/5a/2e/67e3b0bc3d30e46ea5d16365cc81203286387671e22f2307eb41f19abb9c/crc32c-2.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:445e559e66dff16be54f8a4ef95aa6b01db799a639956d995c5498ba513fccc2", size = 63044, upload-time = "2025-10-17T06:19:35.062Z" }, - { url = "https://files.pythonhosted.org/packages/36/ea/1723b17437e4344ed8d067456382ecb1f5b535d83fdc5aaebab676c6d273/crc32c-2.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bf3040919e17afa5782e01b1875d6a05f44b8f19c05f211d8b9f8a1deb8bbd9c", size = 61541, upload-time = "2025-10-17T06:19:36.204Z" }, - { url = "https://files.pythonhosted.org/packages/4c/6a/cbec8a235c5b46a01f319939b538958662159aec0ed3a74944e3a6de21f1/crc32c-2.8-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5607ab8221e1ffd411f64aa40dbb6850cf06dd2908c9debd05d371e1acf62ff3", size = 80139, upload-time = "2025-10-17T06:19:37.351Z" }, - { url = "https://files.pythonhosted.org/packages/21/31/d096722fe74b692d6e8206c27da1ea5f6b2a12ff92c54a62a6ba2f376254/crc32c-2.8-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f5db4f16816926986d3c94253314920689706ae13a9bf4888b47336c6735ce", size = 81736, upload-time = "2025-10-17T06:19:38.16Z" }, - { url = "https://files.pythonhosted.org/packages/f6/a2/f75ef716ff7e3c22f385ba6ef30c5de80c19a21ebe699dc90824a1903275/crc32c-2.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:70b0153c4d418b673309d3529334d117e1074c4a3b2d7f676e430d72c14de67b", size = 80795, upload-time = "2025-10-17T06:19:38.948Z" }, - { url = "https://files.pythonhosted.org/packages/d8/94/6d647a12d96ab087d9b8eacee3da073f981987827d57c7072f89ffc7b6cd/crc32c-2.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5c8933531442042438753755a5c8a9034e4d88b01da9eb796f7e151b31a7256c", size = 80042, upload-time = "2025-10-17T06:19:39.725Z" }, - { url = "https://files.pythonhosted.org/packages/cd/dc/32b8896b40a0afee7a3c040536d0da5a73e68df2be9fadd21770fd158e16/crc32c-2.8-cp314-cp314-win32.whl", hash = "sha256:cdc83a3fe6c4e5df9457294cfd643de7d95bd4e9382c1dd6ed1e0f0f9169172c", size = 64914, upload-time = "2025-10-17T06:19:40.527Z" }, - { url = "https://files.pythonhosted.org/packages/f2/b4/4308b27d307e8ecaf8dd1dcc63bbb0e47ae1826d93faa3e62d1ee00ee2d5/crc32c-2.8-cp314-cp314-win_amd64.whl", hash = "sha256:509e10035106df66770fe24b9eb8d9e32b6fb967df17744402fb67772d8b2bc7", size = 66723, upload-time = "2025-10-17T06:19:42.449Z" }, - { url = "https://files.pythonhosted.org/packages/90/d5/a19d2489fa997a143bfbbf971a5c9a43f8b1ba9e775b1fb362d8fb15260c/crc32c-2.8-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:864359a39777a07b09b28eb31337c0cc603d5c1bf0fc328c3af736a8da624ec0", size = 66201, upload-time = "2025-10-17T06:19:43.273Z" }, - { url = "https://files.pythonhosted.org/packages/98/c2/5f82f22d2c1242cb6f6fe92aa9a42991ebea86de994b8f9974d9c1d128e2/crc32c-2.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:14511d7cfc5d9f5e1a6c6b64caa6225c2bdc1ed00d725e9a374a3e84073ce180", size = 62956, upload-time = "2025-10-17T06:19:44.099Z" }, - { url = "https://files.pythonhosted.org/packages/9b/61/3d43d33489cf974fb78bfb3500845770e139ae6d1d83473b660bd8f79a6c/crc32c-2.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:918b7999b52b5dcbcea34081e9a02d46917d571921a3f209956a9a429b2e06e5", size = 61443, upload-time = "2025-10-17T06:19:44.89Z" }, - { url = "https://files.pythonhosted.org/packages/52/6d/f306ce64a352a3002f76b0fc88a1373f4541f9d34fad3668688610bab14b/crc32c-2.8-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc445da03fc012a5a03b71da1df1b40139729e6a5571fd4215ab40bfb39689c7", size = 79106, upload-time = "2025-10-17T06:19:45.688Z" }, - { url = "https://files.pythonhosted.org/packages/a5/b7/1f74965dd7ea762954a69d172dfb3a706049c84ffa45d31401d010a4a126/crc32c-2.8-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e3dde2ec59a8a830511d72a086ead95c0b0b7f0d418f93ea106244c5e77e350", size = 80983, upload-time = "2025-10-17T06:19:46.792Z" }, - { url = "https://files.pythonhosted.org/packages/1b/50/af93f0d91ccd61833ce77374ebfbd16f5805f5c17d18c6470976d9866d76/crc32c-2.8-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:61d51681a08b6a2a2e771b7f0cd1947fb87cb28f38ed55a01cb7c40b2ac4cdd8", size = 80009, upload-time = "2025-10-17T06:19:47.619Z" }, - { url = "https://files.pythonhosted.org/packages/ee/fa/94f394beb68a88258af694dab2f1284f55a406b615d7900bdd6235283bc4/crc32c-2.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:67c0716c3b1a02d5235be649487b637eed21f2d070f2b3f63f709dcd2fefb4c7", size = 79066, upload-time = "2025-10-17T06:19:48.409Z" }, - { url = "https://files.pythonhosted.org/packages/91/c6/a6050e0c64fd73c67a97da96cb59f08b05111e00b958fb87ecdce99f17ac/crc32c-2.8-cp314-cp314t-win32.whl", hash = "sha256:2e8fe863fbbd8bdb6b414a2090f1b0f52106e76e9a9c96a413495dbe5ebe492a", size = 64869, upload-time = "2025-10-17T06:19:49.197Z" }, - { url = "https://files.pythonhosted.org/packages/08/1f/c7735034e401cb1ea14f996a224518e3a3fa9987cb13680e707328a7d779/crc32c-2.8-cp314-cp314t-win_amd64.whl", hash = "sha256:20a9cfb897693eb6da19e52e2a7be2026fd4d9fc8ae318f086c0d71d5dd2d8e0", size = 66633, upload-time = "2025-10-17T06:19:50.003Z" }, - { url = "https://files.pythonhosted.org/packages/a7/1d/dd926c68eb8aac8b142a1a10b8eb62d95212c1cf81775644373fe7cceac2/crc32c-2.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5833f4071da7ea182c514ba17d1eee8aec3c5be927d798222fbfbbd0f5eea02c", size = 62345, upload-time = "2025-10-17T06:20:09.39Z" }, - { url = "https://files.pythonhosted.org/packages/51/be/803404e5abea2ef2c15042edca04bbb7f625044cca879e47f186b43887c2/crc32c-2.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:1dc4da036126ac07b39dd9d03e93e585ec615a2ad28ff12757aef7de175295a8", size = 61229, upload-time = "2025-10-17T06:20:10.236Z" }, - { url = "https://files.pythonhosted.org/packages/fc/3a/00cc578cd27ed0b22c9be25cef2c24539d92df9fa80ebd67a3fc5419724c/crc32c-2.8-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:15905fa78344654e241371c47e6ed2411f9eeb2b8095311c68c88eccf541e8b4", size = 64108, upload-time = "2025-10-17T06:20:11.072Z" }, - { url = "https://files.pythonhosted.org/packages/6b/bc/0587ef99a1c7629f95dd0c9d4f3d894de383a0df85831eb16c48a6afdae4/crc32c-2.8-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c596f918688821f796434e89b431b1698396c38bf0b56de873621528fe3ecb1e", size = 64815, upload-time = "2025-10-17T06:20:11.919Z" }, - { url = "https://files.pythonhosted.org/packages/73/42/94f2b8b92eae9064fcfb8deef2b971514065bd606231f8857ff8ae02bebd/crc32c-2.8-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8d23c4fe01b3844cb6e091044bc1cebdef7d16472e058ce12d9fadf10d2614af", size = 66659, upload-time = "2025-10-17T06:20:12.766Z" }, -] - [[package]] name = "cryptography" version = "42.0.8" @@ -1207,40 +1098,40 @@ wheels = [ [[package]] name = "cython" -version = "3.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/52/82/01f0b63287cb922e5ba96c5147c30f1e51f541ce91bd178025bb3518b1ba/cython-3.2.0.tar.gz", hash = "sha256:41fdce8237baee2d961c292ed0386903dfe126f131e450a62de0fd7a5280d4b2", size = 3267264, upload-time = "2025-11-05T13:35:04.231Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/8d/b2e9578d960d38b1b04a278bf66e13008486aa73e73967186f2015d63d1c/cython-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee408125b2d218ec7d7a061e09d24715fcab9bf7ea1a4ac01907c3f8ec8730b3", size = 2953775, upload-time = "2025-11-05T13:35:22.291Z" }, - { url = "https://files.pythonhosted.org/packages/19/dd/cfd684f98bac9e0f505af1cbb7998498c59d713275e920a72b40dab03bfa/cython-3.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c93ce307b05fcd86a5bb0e4a7d7fab238e2f0e9936636097a60bc0e21f2def30", size = 3361627, upload-time = "2025-11-05T13:35:24.519Z" }, - { url = "https://files.pythonhosted.org/packages/9c/c1/75acdbe9f6292514f0bb92ab1b78df5eedd7049235f4cbd194d2c6c46bfc/cython-3.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:191cfc2fa84642ad41a52d5abaacfb330d9a6653a465e4bf0a5681f66197a967", size = 3529751, upload-time = "2025-11-05T13:35:26.341Z" }, - { url = "https://files.pythonhosted.org/packages/f2/ce/d0468eb6d87b956902b02909f5007ad61e3839d4c07ab235b514911d869b/cython-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:a259053037ef82959b743b7fde238bd191ee43f88eb8e51101d5f3d8849f1e32", size = 2758839, upload-time = "2025-11-05T13:35:28.36Z" }, - { url = "https://files.pythonhosted.org/packages/ff/2b/904493fceda95747ba83971b40a66c8cc29ff009313429903f38ee620140/cython-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9e4b2248dc3a98b86aeba65e9862d2cc881d072c163c0fb31b511d4d72e93c8", size = 2946248, upload-time = "2025-11-05T13:35:30.406Z" }, - { url = "https://files.pythonhosted.org/packages/89/fe/abe926699fe6c580967e30bc4035da54b5e31355ba9b1f4c0cf574228a84/cython-3.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02fb4990a83d5d6f780dda18ed8baa8d587cb6523f57b4d72bc0b41ad3766c96", size = 3236384, upload-time = "2025-11-05T13:35:32.233Z" }, - { url = "https://files.pythonhosted.org/packages/1b/36/6b6266549802234286438298d494152deb19922a94928d9dcd256659ebd1/cython-3.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a98925517819d62ea25d2cf40057df60a9bcf75fdd1d6ed3882e6ae0730d82f", size = 3372915, upload-time = "2025-11-05T13:35:34.082Z" }, - { url = "https://files.pythonhosted.org/packages/29/fa/5cf15466b428f9248e38a28515cf0fd98078ae869aa395cfb300315964c4/cython-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:4c959a5d4cd6331e8498822ba47200bd2ff4bf74517c0c91475d5bc21da3b4d5", size = 2762735, upload-time = "2025-11-05T13:35:35.806Z" }, - { url = "https://files.pythonhosted.org/packages/57/d3/2e6f5f2552c860bb9c00653d092103521846114f6a2ae0648ecf84c0816c/cython-3.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:511d823d9f8a1b850178ec355d6df0a1731b9c20b08ee6d1a780f68215e9013f", size = 2959932, upload-time = "2025-11-05T13:35:37.518Z" }, - { url = "https://files.pythonhosted.org/packages/dd/bf/7bdc7f231fff6780f78586f939c1740475adecaa03bf256fcb62b2353952/cython-3.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbadeedcb2d135655bcce7380fb28c9e2a75b6810426c12b6e5a6fe6106fafb4", size = 3218588, upload-time = "2025-11-05T13:35:39.642Z" }, - { url = "https://files.pythonhosted.org/packages/be/81/7d7a81010897dc5abee59691f5fc85849dcc4c8a7687b22ed01bc8d86a7a/cython-3.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92d2394a3e3fe704210b5324eb8118333b514af72c98b1e02a6503945825b231", size = 3381940, upload-time = "2025-11-05T13:35:41.886Z" }, - { url = "https://files.pythonhosted.org/packages/4f/9d/35e7fb7b591bd9912685a772fcc773d7bb951a8feb6fb9be20addbc38928/cython-3.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:73435e56654a34ece57d4c3304a4556a8402cc4ae2d0e30f71c237a985dc5246", size = 2750886, upload-time = "2025-11-05T13:35:43.629Z" }, - { url = "https://files.pythonhosted.org/packages/5d/d0/dc4b260e8fde81b23ab4dca56948b3e69617ef470247ec6a3e09370a9849/cython-3.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d900e58e826f9a5a27b0e2b50e33473e9986a5bae375c39b0f2e19f2c545fa23", size = 2950437, upload-time = "2025-11-05T13:35:45.427Z" }, - { url = "https://files.pythonhosted.org/packages/c8/53/c322bf0486a938ad954a645866b67e978777d79183cf0a042bda6bea11de/cython-3.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9d38cd3aab720d21fa6d6ee168228352f69aea0a95bd4fb84e8879c6ed38fbb", size = 3209331, upload-time = "2025-11-05T13:35:47.278Z" }, - { url = "https://files.pythonhosted.org/packages/cd/48/55d02dba0606768d3450afd088e2bbcd6f8a54977dce041c2c3c1894631c/cython-3.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92b31d0b7b0a49b3d2aa94faaf75d44a03174cff2616b341a8853c919e511d51", size = 3370974, upload-time = "2025-11-05T13:35:49.534Z" }, - { url = "https://files.pythonhosted.org/packages/ce/bd/6dab19652b68464572b7a137d07a91ebe86db2a81c35842ff5e49ef23403/cython-3.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:2847b74e76dbad612f6fc7182c12a5f78cffb0d05808fd2c4b638cf02d1aade6", size = 2746274, upload-time = "2025-11-05T13:35:51.522Z" }, - { url = "https://files.pythonhosted.org/packages/e2/db/de5331ca6489da1761078825709257e1f24e543b4040f86a2502a4b841f9/cython-3.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a0a8274959d538d12f865193dcd67bb5630906e020190c890d2b7c13d31713c6", size = 2961164, upload-time = "2025-11-05T13:35:53.826Z" }, - { url = "https://files.pythonhosted.org/packages/54/3e/64e37e419331f7c4c540ad25c0b3e6d8f44d597f21ab8861afbc66aa7e02/cython-3.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a1c800833c25195833805c7c3626a2c30b3baaaa9ba361a1af3bbc379662a8d", size = 3249627, upload-time = "2025-11-05T13:35:55.524Z" }, - { url = "https://files.pythonhosted.org/packages/9b/fc/9faedfcc2de807f77115d97a4910c260dd4693f4fa9e0e3be0d9ae89e260/cython-3.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df15af08c21c18a2e848df5954d6fd3310735089b60405132fa4111e2cf7482a", size = 3375458, upload-time = "2025-11-05T13:35:57.279Z" }, - { url = "https://files.pythonhosted.org/packages/31/e0/30d449cd97ee0d6395aba18f2646b61b52ab3dc5a3851a346e2d363a7d85/cython-3.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:9d6876af2132757fff1b42a2f4eaa72482f991863160e3f0dc8f2c812b300ebf", size = 2783210, upload-time = "2025-11-05T13:35:59.54Z" }, - { url = "https://files.pythonhosted.org/packages/dd/6b/9e1e171fe19274465d84dffa4610d46f434b1ae945e946802db396695d67/cython-3.2.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:04821ce06598a3aa5c9e0270d98960cfe6556dedbd1418c65e4479162b8ae74a", size = 2869249, upload-time = "2025-11-05T13:36:08.944Z" }, - { url = "https://files.pythonhosted.org/packages/c4/f1/f461726f664668a96072b2a245bdfae566d68e2eb1393ec72780cc59c21e/cython-3.2.0-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:54b5b1c72a63da822b3f4739a0e31546c0a19f8e834b174906bf817ed5f9d65f", size = 3204332, upload-time = "2025-11-05T13:36:11.386Z" }, - { url = "https://files.pythonhosted.org/packages/78/d8/73c07ce64cae496e5f5a6dfe3e53574af1a8ef777e2a834d10dae8b67a4e/cython-3.2.0-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6155a6c360e32af1aaa16fa10b0119b49deeadff42a1958973324150870af1b5", size = 2851317, upload-time = "2025-11-05T13:36:13.14Z" }, - { url = "https://files.pythonhosted.org/packages/bc/d9/d9f321637b8034b5028fa5fe7d1085ffa9351fea350af6510d5cb924c014/cython-3.2.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:861258ac3878b76c57b9b5a379787d772a0bc47fec9167b43986777de542c474", size = 2987155, upload-time = "2025-11-05T13:36:15.018Z" }, - { url = "https://files.pythonhosted.org/packages/f8/b5/9f9e7d261f083b4066d734b27a7872b0c584fd4c3578196652dbf72b3f62/cython-3.2.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:85dbf955e3193893d0288105afa0fa5f4e835ff587061681f240a4f0487c44fb", size = 2884219, upload-time = "2025-11-05T13:36:17.334Z" }, - { url = "https://files.pythonhosted.org/packages/88/64/5aeb6e43e0ded9efedc5a516f87a487fdca8e434491cc352e5a805380459/cython-3.2.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3b3f13822526726bac43275c0e92916bbcc2c30e9f559edc4c1132670b70498d", size = 3218067, upload-time = "2025-11-05T13:36:19.493Z" }, - { url = "https://files.pythonhosted.org/packages/c4/a0/1958f54cd79d8251a330b9c9652b2a5ceba6a3fcec10782dd03e2a23c74f/cython-3.2.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ab18d09673d219008be5b6174bcbb6dbfd50904e66371f104a8a4698b791472d", size = 3108277, upload-time = "2025-11-05T13:36:21.203Z" }, - { url = "https://files.pythonhosted.org/packages/9c/84/9b8112160cab922b97edef00616ed18771567d88b5ba9d30d1736880c345/cython-3.2.0-cp39-abi3-win32.whl", hash = "sha256:c9fd986413fc52929b916187630a9abab9f876299951488c4b905ad5346afee6", size = 2430852, upload-time = "2025-11-05T13:36:23.049Z" }, - { url = "https://files.pythonhosted.org/packages/8f/57/65d3de140b51c45dd6892846bfabdfaaa032e2418f1cb1a2f46058c1fe42/cython-3.2.0-cp39-abi3-win_arm64.whl", hash = "sha256:ee2ea79ddeb721f912e7efea039b9db059c81767ff04fbf9a995f64e1187df99", size = 2435793, upload-time = "2025-11-05T13:36:25.139Z" }, - { url = "https://files.pythonhosted.org/packages/20/58/1f798ddb7fe6bfddf85f4f97d2d4ad63a491a7b643e85c1e274d0f09138e/cython-3.2.0-py3-none-any.whl", hash = "sha256:73f7f4c75acde5b5b4df05b11fdc2705ec637b99241d1bc2f4ebf345f7a2ea90", size = 1252818, upload-time = "2025-11-05T13:35:00.391Z" }, +version = "3.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/36/cce2972e13e83ffe58bc73bfd9d37340b5e5113e8243841a57511c7ae1c2/cython-3.2.1.tar.gz", hash = "sha256:2be1e4d0cbdf7f4cd4d9b8284a034e1989b59fd060f6bd4d24bf3729394d2ed8", size = 3270455, upload-time = "2025-11-12T19:02:59.847Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/74/f9fe9e7034f24aef407e7816880c012d8e863bedaa6b42b9ff33e79ea139/cython-3.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f1d10b3731171a33563ba81fdcba39c229e45087269dfbe07a1c00e7dcb2537f", size = 2957374, upload-time = "2025-11-12T19:03:10.132Z" }, + { url = "https://files.pythonhosted.org/packages/65/47/f9dd519117f520aaf4d723c88fd9e9139262a0379edc01e71a1e9825e082/cython-3.2.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92b814b6066d178a5057b557d372e2a03854e947e41cb9dec21db732fbd14c3c", size = 3366838, upload-time = "2025-11-12T19:03:11.742Z" }, + { url = "https://files.pythonhosted.org/packages/5d/3e/d967acfafef00056c3ba832692b9bb358ede2919f641e4a2d24828adacc6/cython-3.2.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9fc6abd0532007827d8c6143b2bfedf80c7cb89a3c1c12f058336663489ed2e", size = 3535901, upload-time = "2025-11-12T19:03:13.545Z" }, + { url = "https://files.pythonhosted.org/packages/68/79/bc46e714ecb010f80a8aa7f7eaf412c53cbabbe7489590d6aba5f4478ba5/cython-3.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:14f1ed135347587cfddcd3c3219667cac4f0ea0b66aa1c4c0187d50a1b92c222", size = 2764043, upload-time = "2025-11-12T19:03:15.584Z" }, + { url = "https://files.pythonhosted.org/packages/48/d4/ba7b9f341ec168de78bd659600e04bb7de3b2d069bf98b2178a135e88ea4/cython-3.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cb32c650e7f4476941d1f735cae75a2067d5e3279576273bb8802e8ea907222", size = 2949720, upload-time = "2025-11-12T19:03:17.492Z" }, + { url = "https://files.pythonhosted.org/packages/ad/47/c42417f424c0b928361f48d7dd0ae72716ee21f647b73ceb16f66b98663e/cython-3.2.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a2b306813d7f28aa0a2c3e4e63ada1427a8109917532df942cd5429db228252", size = 3242127, upload-time = "2025-11-12T19:03:19.227Z" }, + { url = "https://files.pythonhosted.org/packages/e6/fc/1040460889129551649ec35be45e05169871fbcf71bd8e13c533e86f9468/cython-3.2.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0959d9a36d4f004ce63acc1474b3c606745af98b65e8ae709efd0c10988e9d6b", size = 3377094, upload-time = "2025-11-12T19:03:21.25Z" }, + { url = "https://files.pythonhosted.org/packages/f8/f2/8c754298eefa40e21af0ae3592837c6e71254900d5aea1c8859e96b11de5/cython-3.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:60c62e734421365135cc2842013d883136054a26c617c001be494235edfc447a", size = 2767824, upload-time = "2025-11-12T19:03:23.317Z" }, + { url = "https://files.pythonhosted.org/packages/ee/0e/19d5041b87f98ed19c94c388607cd27c1f7458078c3bad5de2dead55b2e1/cython-3.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ea5097d97afd2ab14e98637b7033eba5146de29a5dedf89f5e946076396ab891", size = 2966736, upload-time = "2025-11-12T19:03:25.064Z" }, + { url = "https://files.pythonhosted.org/packages/84/b8/bcc36d9d2464348106984956608a52a42a01ab44ea64031207dffdebc078/cython-3.2.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4bf12de0475bb6a21e2336a4a04dc4a2b4dd0507a2a3c703e045f3484266605", size = 3221633, upload-time = "2025-11-12T19:03:26.754Z" }, + { url = "https://files.pythonhosted.org/packages/79/20/7d4807fe4ebcef9f20f2e5f93312d0f5d02f9f76524fd4e37706d04e83f7/cython-3.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18c64a0f69a1b8164de70ec7efc72250c589fec21519170de21582300f6aaed9", size = 3389542, upload-time = "2025-11-12T19:03:28.656Z" }, + { url = "https://files.pythonhosted.org/packages/2a/92/b06ba6721299293bc41e89732070132c453bdbaaeabb8f8cc76851b75345/cython-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:5ba14907d5826d8010e82306ce279a0d3650f5b50a4813c80836a17b2213c520", size = 2755307, upload-time = "2025-11-12T19:03:30.684Z" }, + { url = "https://files.pythonhosted.org/packages/40/28/c6e36c214baeb27ae45b518552e74457536c7c964b1a55b5900b047fa467/cython-3.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b4e850fc7a2f72d19679dd083fe4d20bf66860fceabb4f3207112f240249d708", size = 2957307, upload-time = "2025-11-12T19:03:32.471Z" }, + { url = "https://files.pythonhosted.org/packages/c8/c8/b0b9ba64f81f2875c42aab5c0979d6454cd1ac6b3c1e2373ad552701565d/cython-3.2.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d20ca4afe993f7dccad3aeddbf4c3536cb0fd3ad6dc7a225935a666a5655af2", size = 3210919, upload-time = "2025-11-12T19:03:34.274Z" }, + { url = "https://files.pythonhosted.org/packages/f9/33/5d9ca6abba0e77e1851b843dd1b3c4095fbc6373166935e83c4414f80e88/cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5a54a757d01ca6a260b02ce5baf17d9db1c2253566ab5844ee4966ff2a69c19", size = 3373350, upload-time = "2025-11-12T19:03:35.927Z" }, + { url = "https://files.pythonhosted.org/packages/e4/29/4408c3486ff380a2d6ae0d4b71da5195efcef3c4360017113ee7d1cb7335/cython-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:1b81e56584727a328e00d91c164f8f0f2c59b02bf6857c3f000cd830fa571453", size = 2753425, upload-time = "2025-11-12T19:03:38.157Z" }, + { url = "https://files.pythonhosted.org/packages/f0/32/c1aa03ccadda89487ff31b90d8651c3706ce2744bf4f2c2ae213147e89bd/cython-3.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7af6ad01c0fe1965d1d3badaeb6df53c1f37383ebae1ccb405b73f628f87713", size = 2967833, upload-time = "2025-11-12T19:03:40.233Z" }, + { url = "https://files.pythonhosted.org/packages/ff/dc/3488d3ade0635408a2ebb05561a3009e2f54616bfefd1f107088dfeb2c4c/cython-3.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3ea7cd085b62acb67c0fbde5cd17a7d9e47992c965e81ec977cf9ea7c59cd65", size = 3256237, upload-time = "2025-11-12T19:03:42.005Z" }, + { url = "https://files.pythonhosted.org/packages/7b/ba/f3d35d3803c9a424fa8812893847114deb9e2440c1bc67a31ab9ec4b9355/cython-3.2.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:986aea38fdf231e78d73745f83271c5654852c822dc5141a1d3fba64429a6aa6", size = 3383100, upload-time = "2025-11-12T19:03:43.675Z" }, + { url = "https://files.pythonhosted.org/packages/86/dc/d72dbb2f8e7ca95d2d18fd86f32b2e385996576230e7ecddd7d250786825/cython-3.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:4960e26cd34c1385f21646339f2e0361fcdd2ed3c01cdb50fe734add577ec56a", size = 2790322, upload-time = "2025-11-12T19:03:45.373Z" }, + { url = "https://files.pythonhosted.org/packages/5a/7e/1194f4ba98b981bbdca945a292e4f49e87ea09d69516b24445409e7cf611/cython-3.2.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:4e9167316bf6ecfea33dcca62f074605648fb93cc053ef46b5deb3e5d12fc0d3", size = 2872858, upload-time = "2025-11-12T19:03:55.074Z" }, + { url = "https://files.pythonhosted.org/packages/6b/1a/393ca8ffec7ad3f02b8e4bffaba3dba4fb62c4a1c4c0b6dbf3b80e709fe3/cython-3.2.1-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3095df6cd470064742f428c937bed7200c5123b9e19ee04aa09ec61281e565a3", size = 3209664, upload-time = "2025-11-12T19:03:56.771Z" }, + { url = "https://files.pythonhosted.org/packages/37/57/f209f64c609d3d8fac60a572e56da2f621dc1789e399c58db61d5645a31f/cython-3.2.1-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db3f53b2d9afb206075a2605f1150aa019f0733c7795a38eccc6119c2e9c3f7b", size = 2854607, upload-time = "2025-11-12T19:03:59.413Z" }, + { url = "https://files.pythonhosted.org/packages/fc/af/1e5c73fe52423f40776130b0be914fd9f9f8dc26c4f6ea4c2ed04772d558/cython-3.2.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0fc5e7687ac8f8e2b2fb95648f43e9e074ebaa72fd5cb3d8e20e5f1e8b8e02d9", size = 2991567, upload-time = "2025-11-12T19:04:02.209Z" }, + { url = "https://files.pythonhosted.org/packages/39/2c/3ea175b6b1fdfb429f9e9c395240d894155b3c0615caced05fef43264cba/cython-3.2.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:bbb3bc152bc0de82b031c8d355418fa4890a92424209d59366c2c0bc9e6cf53c", size = 2889178, upload-time = "2025-11-12T19:04:05.272Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/b2ab22a3a3feac78c62354a823c5c0c33659909e9918f53aa05904532b4b/cython-3.2.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:a2022bc48ad0c2c0e0485bf0b54902913a3d81086b7d435f4437620c667799f6", size = 3223755, upload-time = "2025-11-12T19:04:07.262Z" }, + { url = "https://files.pythonhosted.org/packages/0b/56/9ba58629a03cbffb5965a3c65ccd91fa683d95d588c21a875da72fdc249b/cython-3.2.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99fdd4ffc2dcb513f4be9ce71c6fedd895b96b1f814655b6bbab196df497b090", size = 3113456, upload-time = "2025-11-12T19:04:09.175Z" }, + { url = "https://files.pythonhosted.org/packages/56/5b/148c1a7ea5aebe460a70cad716a77e5fd0205be2de9fc5250491eb13ad8c/cython-3.2.1-cp39-abi3-win32.whl", hash = "sha256:06071f85bd5ce040464d43b2f9f287742a79f905e81b709fe904567230f1ed51", size = 2434223, upload-time = "2025-11-12T19:04:11.294Z" }, + { url = "https://files.pythonhosted.org/packages/7a/54/bb9b0c9db2a92a5e93747ca3027cfc645741411f8f1c6af2fb2a7b82df5d/cython-3.2.1-cp39-abi3-win_arm64.whl", hash = "sha256:e87c131d59480aee1ebac622b64f287c0e1d665ad1a1b7d498ac48accdb36c6b", size = 2439268, upload-time = "2025-11-12T19:04:12.931Z" }, + { url = "https://files.pythonhosted.org/packages/aa/30/373775b8d933d781d055c1dd0f110f275a101f320dab724c8c63a7c1b945/cython-3.2.1-py3-none-any.whl", hash = "sha256:cd72c46e7bffe8250c52d400e72c8d5d3086437b6aeec5b0eca99ccd337f5834", size = 1254219, upload-time = "2025-11-12T19:02:56.14Z" }, ] [[package]] @@ -1254,7 +1145,8 @@ dependencies = [ { name = "httpx" }, { name = "huggingface-hub" }, { name = "multiprocess" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pandas" }, { name = "pyarrow" }, @@ -1291,8 +1183,7 @@ name = "deprecated" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" }, - { name = "wrapt", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" }, + { name = "wrapt" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } wheels = [ @@ -1340,18 +1231,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, ] -[[package]] -name = "donfig" -version = "0.8.1.post1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyyaml", marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" }, -] - [[package]] name = "ebmlite" version = "3.4.1" @@ -1382,14 +1261,14 @@ dependencies = [ [[package]] name = "exceptiongroup" -version = "1.3.0" +version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } +sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, + { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] [[package]] @@ -1409,7 +1288,7 @@ wheels = [ [[package]] name = "fastapi" -version = "0.121.0" +version = "0.122.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-doc" }, @@ -1417,18 +1296,9 @@ dependencies = [ { name = "starlette" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/e3/77a2df0946703973b9905fd0cde6172c15e0781984320123b4f5079e7113/fastapi-0.121.0.tar.gz", hash = "sha256:06663356a0b1ee93e875bbf05a31fb22314f5bed455afaaad2b2dad7f26e98fa", size = 342412, upload-time = "2025-11-03T10:25:54.818Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/2c/42277afc1ba1a18f8358561eee40785d27becab8f80a1f945c0a3051c6eb/fastapi-0.121.0-py3-none-any.whl", hash = "sha256:8bdf1b15a55f4e4b0d6201033da9109ea15632cb76cf156e7b8b4019f2172106", size = 109183, upload-time = "2025-11-03T10:25:53.27Z" }, -] - -[[package]] -name = "fasteners" -version = "0.20" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2d/18/7881a99ba5244bfc82f06017316ffe93217dbbbcfa52b887caa1d4f2a6d3/fasteners-0.20.tar.gz", hash = "sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8", size = 25087, upload-time = "2025-08-11T10:19:37.785Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/de/3ee97a4f6ffef1fb70bf20561e4f88531633bb5045dc6cebc0f8471f764d/fastapi-0.122.0.tar.gz", hash = "sha256:cd9b5352031f93773228af8b4c443eedc2ac2aa74b27780387b853c3726fb94b", size = 346436, upload-time = "2025-11-24T19:17:47.95Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/ac/e5d886f892666d2d1e5cb8c1a41146e1d79ae8896477b1153a21711d3b44/fasteners-0.20-py3-none-any.whl", hash = "sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7", size = 18702, upload-time = "2025-08-11T10:19:35.716Z" }, + { url = "https://files.pythonhosted.org/packages/7a/93/aa8072af4ff37b795f6bbf43dcaf61115f40f49935c7dbb180c9afc3f421/fastapi-0.122.0-py3-none-any.whl", hash = "sha256:a456e8915dfc6c8914a50d9651133bd47ec96d331c5b44600baa635538a30d67", size = 110671, upload-time = "2025-11-24T19:17:45.96Z" }, ] [[package]] @@ -1513,14 +1383,15 @@ source = { git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd093814 [[package]] name = "flashinfer-python" -version = "0.5.1" +version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "apache-tvm-ffi" }, { name = "click" }, { name = "einops" }, { name = "ninja" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "nvidia-cudnn-frontend" }, { name = "nvidia-cutlass-dsl" }, { name = "nvidia-ml-py" }, @@ -1530,9 +1401,9 @@ dependencies = [ { name = "torch", marker = "sys_platform == 'never'" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6c/bb/897c3b9d683dcf6490f70e468efb585eebcd673970b13a04ed947b491982/flashinfer_python-0.5.1.tar.gz", hash = "sha256:f12b32d88d8cc10a396456df8ab017f1c4661fbf257e14f4d2461961ec0d090e", size = 4627606, upload-time = "2025-11-04T05:55:02.376Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/91/cca69baeff24bb3efd12c7479a026432c8717ee47193694010494c528b22/flashinfer_python-0.5.3.tar.gz", hash = "sha256:100d59b0ede47878d2808cd3a1b9039d7a952d66338bc9f68dac192ae1b2e3f1", size = 4682367, upload-time = "2025-11-20T21:22:46.976Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/f1/33dedad087a2bc3d66244126bd5d1c79721ea22d1f2124299f9e5bdaf3b1/flashinfer_python-0.5.1-py3-none-any.whl", hash = "sha256:ec8434d21e53a0ec333734a3c61946a0f7d2f972e344aefa99ba5b87e63aa76a", size = 6932706, upload-time = "2025-11-04T05:55:00.335Z" }, + { url = "https://files.pythonhosted.org/packages/76/78/6dc7e7da8cb87c9965644ea0d2439457a1bc9256c45ceda0044595be4143/flashinfer_python-0.5.3-py3-none-any.whl", hash = "sha256:b601293b72f9138bad173edc28df84b9f239a013be974e2e79d4ba98aeb38cf5", size = 6998069, upload-time = "2025-11-20T21:22:45.104Z" }, ] [[package]] @@ -1820,7 +1691,7 @@ wheels = [ [[package]] name = "hatchling" -version = "1.27.0" +version = "1.28.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "packaging" }, @@ -1829,9 +1700,9 @@ dependencies = [ { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "trove-classifiers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/8a/cc1debe3514da292094f1c3a700e4ca25442489731ef7c0814358816bb03/hatchling-1.27.0.tar.gz", hash = "sha256:971c296d9819abb3811112fc52c7a9751c8d381898f36533bb16f9791e941fd6", size = 54983, upload-time = "2024-12-15T17:08:11.894Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/8e/e480359492affde4119a131da729dd26da742c2c9b604dff74836e47eef9/hatchling-1.28.0.tar.gz", hash = "sha256:4d50b02aece6892b8cd0b3ce6c82cb218594d3ec5836dbde75bf41a21ab004c8", size = 55365, upload-time = "2025-11-27T00:31:13.766Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/08/e7/ae38d7a6dfba0533684e0b2136817d667588ae3ec984c1a4e5df5eb88482/hatchling-1.27.0-py3-none-any.whl", hash = "sha256:d3a2f3567c4f926ea39849cdf924c7e99e6686c9c8e288ae1037c8fa2a5d937b", size = 75794, upload-time = "2024-12-15T17:08:10.364Z" }, + { url = "https://files.pythonhosted.org/packages/0d/a5/48cb7efb8b4718b1a4c0c331e3364a3a33f614ff0d6afd2b93ee883d3c47/hatchling-1.28.0-py3-none-any.whl", hash = "sha256:dc48722b68b3f4bbfa3ff618ca07cdea6750e7d03481289ffa8be1521d18a961", size = 76075, upload-time = "2025-11-27T00:31:12.544Z" }, ] [[package]] @@ -1956,74 +1827,14 @@ wheels = [ name = "importlib-metadata" version = "8.6.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] dependencies = [ - { name = "zipp", marker = "extra == 'extra-13-megatron-core-dev'" }, + { name = "zipp" }, ] sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767, upload-time = "2025-01-20T22:21:30.429Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971, upload-time = "2025-01-20T22:21:29.177Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", -] -dependencies = [ - { name = "zipp", marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, -] - [[package]] name = "iniconfig" version = "2.3.0" @@ -2150,7 +1961,7 @@ wheels = [ [[package]] name = "leptonai" -version = "0.26.6" +version = "0.26.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2175,7 +1986,7 @@ dependencies = [ { name = "uvicorn" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/68/b4/e29dfe5a6e63a0e55fc26115a8eef55fbbc004c7677544bbd88798e1c003/leptonai-0.26.6-py3-none-any.whl", hash = "sha256:e76846b52d6ffc186b26a1fa40ebf0432eb1d8108dda1fb2f7785a1f25c803c2", size = 2443372, upload-time = "2025-09-23T08:04:27.984Z" }, + { url = "https://files.pythonhosted.org/packages/c2/4d/2b5ab13294b23326ba1d8ef6ad703b1d9535bf72a0617030ddd6238eb925/leptonai-0.26.7-py3-none-any.whl", hash = "sha256:74996da36bf177d2b148887dd349627ab8cd78b94623d543bc91ed9ad65ba0e2", size = 2452890, upload-time = "2025-11-07T20:07:14.99Z" }, ] [[package]] @@ -2414,7 +2225,8 @@ wheels = [ name = "megatron-core" source = { editable = "." } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] @@ -2425,6 +2237,7 @@ dev = [ { name = "causal-conv1d" }, { name = "einops" }, { name = "emerging-optimizers" }, + { name = "fastapi" }, { name = "flash-linear-attention" }, { name = "flashinfer-python" }, { name = "mamba-ssm" }, @@ -2434,27 +2247,31 @@ dev = [ { name = "nvidia-modelopt", marker = "(sys_platform != 'darwin' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "nvidia-resiliency-ext" }, { name = "nvtx" }, - { name = "onnxscript", version = "0.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "onnxscript", version = "0.5.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "onnxscript" }, { name = "opentelemetry-api" }, - { name = "setuptools" }, - { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "tqdm" }, - { name = "transformer-engine", marker = "extra == 'extra-13-megatron-core-dev'" }, + { name = "transformer-engine", extra = ["core-cu13", "pytorch"], marker = "extra == 'extra-13-megatron-core-dev'" }, { name = "wget" }, ] lts = [ + { name = "av" }, + { name = "causal-conv1d" }, { name = "einops" }, + { name = "fastapi" }, + { name = "flashinfer-python" }, + { name = "mamba-ssm" }, + { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-lts'" }, + { name = "multi-storage-client" }, + { name = "nv-grouped-gemm" }, { name = "nvtx" }, - { name = "setuptools" }, - { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "onnxscript" }, + { name = "opentelemetry-api" }, + { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "tqdm" }, - { name = "transformers" }, { name = "wget" }, - { name = "zarr", version = "2.18.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "zarr", version = "3.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] mlm = [ { name = "flask-restful" }, @@ -2489,9 +2306,6 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] -flash-mla = [ - { name = "flash-mla" }, -] linting = [ { name = "black" }, { name = "flake8" }, @@ -2499,6 +2313,10 @@ linting = [ { name = "pylint" }, { name = "ruff" }, ] +no-pypi-wheels = [ + { name = "emerging-optimizers" }, + { name = "flash-mla" }, +] test = [ { name = "coverage" }, { name = "nemo-run" }, @@ -2512,48 +2330,54 @@ test = [ { name = "pytest-random-order" }, { name = "pyyaml" }, { name = "tensorboard" }, - { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" }, - { name = "wrapt", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" }, + { name = "wrapt" }, ] [package.metadata] requires-dist = [ - { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" }, + { name = "av", marker = "extra == 'dev'" }, + { name = "av", marker = "extra == 'lts'" }, { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, + { name = "causal-conv1d", marker = "extra == 'lts'", specifier = "~=1.5" }, { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" }, - { name = "einops", marker = "extra == 'lts'" }, + { name = "einops", marker = "extra == 'lts'", specifier = "~=0.8" }, { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" }, + { name = "fastapi", marker = "extra == 'dev'", specifier = "~=0.50" }, + { name = "fastapi", marker = "extra == 'lts'", specifier = "~=0.50" }, { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" }, { name = "flashinfer-python", marker = "extra == 'dev'" }, + { name = "flashinfer-python", marker = "extra == 'lts'" }, { name = "flask-restful", marker = "extra == 'mlm'" }, { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" }, + { name = "mamba-ssm", marker = "extra == 'lts'", specifier = "~=2.2" }, { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" }, + { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'lts'", specifier = "~=6.0" }, { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" }, - { name = "numpy", specifier = "<2.0.0" }, + { name = "multi-storage-client", marker = "extra == 'lts'", specifier = "~=0.27" }, + { name = "numpy" }, { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" }, - { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" }, - { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" }, + { name = "nv-grouped-gemm", marker = "extra == 'lts'", specifier = "~=1.1" }, + { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'" }, + { name = "nvidia-resiliency-ext", marker = "extra == 'dev'" }, { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" }, - { name = "nvtx", marker = "extra == 'lts'" }, + { name = "nvtx", marker = "extra == 'lts'", specifier = "~=0.2" }, { name = "onnxscript", marker = "extra == 'dev'" }, + { name = "onnxscript", marker = "extra == 'lts'" }, { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" }, + { name = "opentelemetry-api", marker = "extra == 'lts'", specifier = "~=1.33.1" }, { name = "packaging", specifier = ">=24.2" }, { name = "sentencepiece", marker = "extra == 'mlm'" }, - { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" }, - { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" }, { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" }, - { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" }, + { name = "tensorstore", marker = "extra == 'lts'", specifier = "~=0.1,!=0.1.46,!=0.1.72" }, { name = "tiktoken", marker = "extra == 'mlm'" }, { name = "torch" }, { name = "tqdm", marker = "extra == 'dev'" }, { name = "tqdm", marker = "extra == 'lts'" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, - { name = "transformers", marker = "extra == 'lts'" }, + { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", specifier = ">=2.9.0a0,<2.10.0" }, { name = "transformers", marker = "extra == 'mlm'" }, { name = "wandb", marker = "extra == 'mlm'" }, { name = "wget", marker = "extra == 'dev'" }, { name = "wget", marker = "extra == 'lts'" }, - { name = "zarr", marker = "extra == 'lts'" }, ] provides-extras = ["mlm", "dev", "lts"] @@ -2580,7 +2404,6 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] -flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }] linting = [ { name = "black", specifier = "==24.4.2" }, { name = "flake8", specifier = "==7.1.0" }, @@ -2588,6 +2411,10 @@ linting = [ { name = "pylint", specifier = "==3.2.6" }, { name = "ruff", specifier = "~=0.9.0" }, ] +no-pypi-wheels = [ + { name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" }, + { name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }, +] test = [ { name = "coverage" }, { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" }, @@ -2612,7 +2439,8 @@ dependencies = [ { name = "braceexpand" }, { name = "click" }, { name = "multi-storage-client" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pillow" }, { name = "pyyaml" }, { name = "s3fs" }, @@ -2637,84 +2465,48 @@ av-decode = [ [[package]] name = "ml-dtypes" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", -] -dependencies = [ - { name = "numpy", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fd/15/76f86faa0902836cc133939732f7611ace68cf54148487a99c539c272dc8/ml_dtypes-0.4.1.tar.gz", hash = "sha256:fad5f2de464fd09127e49b7fd1252b9006fb43d2edc1ff112d390c324af5ca7a", size = 692594, upload-time = "2024-09-13T19:07:11.624Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/56/9e/76b84f77c7afee3b116dc8407903a2d5004ba3059a8f3dcdcfa6ebf33fff/ml_dtypes-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1fe8b5b5e70cd67211db94b05cfd58dace592f24489b038dc6f9fe347d2e07d5", size = 397975, upload-time = "2024-09-13T19:06:44.265Z" }, - { url = "https://files.pythonhosted.org/packages/03/7b/32650e1b2a2713a5923a0af2a8503d0d4a8fc99d1e1e0a1c40e996634460/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c09a6d11d8475c2a9fd2bc0695628aec105f97cab3b3a3fb7c9660348ff7d24", size = 2182570, upload-time = "2024-09-13T19:06:46.189Z" }, - { url = "https://files.pythonhosted.org/packages/16/86/a9f7569e7e4f5395f927de38a13b92efa73f809285d04f2923b291783dd2/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5e8f75fa371020dd30f9196e7d73babae2abd51cf59bdd56cb4f8de7e13354", size = 2160365, upload-time = "2024-09-13T19:06:48.198Z" }, - { url = "https://files.pythonhosted.org/packages/04/1b/9a3afb437702503514f3934ec8d7904270edf013d28074f3e700e5dfbb0f/ml_dtypes-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:15fdd922fea57e493844e5abb930b9c0bd0af217d9edd3724479fc3d7ce70e3f", size = 126633, upload-time = "2024-09-13T19:06:50.656Z" }, - { url = "https://files.pythonhosted.org/packages/d1/76/9835c8609c29f2214359e88f29255fc4aad4ea0f613fb48aa8815ceda1b6/ml_dtypes-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2d55b588116a7085d6e074cf0cdb1d6fa3875c059dddc4d2c94a4cc81c23e975", size = 397973, upload-time = "2024-09-13T19:06:51.748Z" }, - { url = "https://files.pythonhosted.org/packages/7e/99/e68c56fac5de973007a10254b6e17a0362393724f40f66d5e4033f4962c2/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e138a9b7a48079c900ea969341a5754019a1ad17ae27ee330f7ebf43f23877f9", size = 2185134, upload-time = "2024-09-13T19:06:53.197Z" }, - { url = "https://files.pythonhosted.org/packages/28/bc/6a2344338ea7b61cd7b46fb24ec459360a5a0903b57c55b156c1e46c644a/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74c6cfb5cf78535b103fde9ea3ded8e9f16f75bc07789054edc7776abfb3d752", size = 2163661, upload-time = "2024-09-13T19:06:54.519Z" }, - { url = "https://files.pythonhosted.org/packages/e8/d3/ddfd9878b223b3aa9a930c6100a99afca5cfab7ea703662e00323acb7568/ml_dtypes-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:274cc7193dd73b35fb26bef6c5d40ae3eb258359ee71cd82f6e96a8c948bdaa6", size = 126727, upload-time = "2024-09-13T19:06:55.897Z" }, - { url = "https://files.pythonhosted.org/packages/ba/1a/99e924f12e4b62139fbac87419698c65f956d58de0dbfa7c028fa5b096aa/ml_dtypes-0.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:827d3ca2097085cf0355f8fdf092b888890bb1b1455f52801a2d7756f056f54b", size = 405077, upload-time = "2024-09-13T19:06:57.538Z" }, - { url = "https://files.pythonhosted.org/packages/8f/8c/7b610bd500617854c8cc6ed7c8cfb9d48d6a5c21a1437a36a4b9bc8a3598/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:772426b08a6172a891274d581ce58ea2789cc8abc1c002a27223f314aaf894e7", size = 2181554, upload-time = "2024-09-13T19:06:59.196Z" }, - { url = "https://files.pythonhosted.org/packages/c7/c6/f89620cecc0581dc1839e218c4315171312e46c62a62da6ace204bda91c0/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:126e7d679b8676d1a958f2651949fbfa182832c3cd08020d8facd94e4114f3e9", size = 2160488, upload-time = "2024-09-13T19:07:03.131Z" }, - { url = "https://files.pythonhosted.org/packages/ae/11/a742d3c31b2cc8557a48efdde53427fd5f9caa2fa3c9c27d826e78a66f51/ml_dtypes-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:df0fb650d5c582a9e72bb5bd96cfebb2cdb889d89daff621c8fbc60295eba66c", size = 127462, upload-time = "2024-09-13T19:07:04.916Z" }, -] - -[[package]] -name = "ml-dtypes" -version = "0.5.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] -dependencies = [ - { name = "numpy", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/bb/1f32124ab6d3a279ea39202fe098aea95b2d81ef0ce1d48612b6bf715e82/ml_dtypes-0.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0a1d68a7cb53e3f640b2b6a34d12c0542da3dd935e560fdf463c0c77f339fc20", size = 667409, upload-time = "2025-07-29T18:38:17.321Z" }, - { url = "https://files.pythonhosted.org/packages/1d/ac/e002d12ae19136e25bb41c7d14d7e1a1b08f3c0e99a44455ff6339796507/ml_dtypes-0.5.3-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cd5a6c711b5350f3cbc2ac28def81cd1c580075ccb7955e61e9d8f4bfd40d24", size = 4960702, upload-time = "2025-07-29T18:38:19.616Z" }, - { url = "https://files.pythonhosted.org/packages/dd/12/79e9954e6b3255a4b1becb191a922d6e2e94d03d16a06341ae9261963ae8/ml_dtypes-0.5.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdcf26c2dbc926b8a35ec8cbfad7eff1a8bd8239e12478caca83a1fc2c400dc2", size = 4933471, upload-time = "2025-07-29T18:38:21.809Z" }, - { url = "https://files.pythonhosted.org/packages/d5/aa/d1eff619e83cd1ddf6b561d8240063d978e5d887d1861ba09ef01778ec3a/ml_dtypes-0.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:aecbd7c5272c82e54d5b99d8435fd10915d1bc704b7df15e4d9ca8dc3902be61", size = 206330, upload-time = "2025-07-29T18:38:23.663Z" }, - { url = "https://files.pythonhosted.org/packages/af/f1/720cb1409b5d0c05cff9040c0e9fba73fa4c67897d33babf905d5d46a070/ml_dtypes-0.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4a177b882667c69422402df6ed5c3428ce07ac2c1f844d8a1314944651439458", size = 667412, upload-time = "2025-07-29T18:38:25.275Z" }, - { url = "https://files.pythonhosted.org/packages/6a/d5/05861ede5d299f6599f86e6bc1291714e2116d96df003cfe23cc54bcc568/ml_dtypes-0.5.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9849ce7267444c0a717c80c6900997de4f36e2815ce34ac560a3edb2d9a64cd2", size = 4964606, upload-time = "2025-07-29T18:38:27.045Z" }, - { url = "https://files.pythonhosted.org/packages/db/dc/72992b68de367741bfab8df3b3fe7c29f982b7279d341aa5bf3e7ef737ea/ml_dtypes-0.5.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c3f5ae0309d9f888fd825c2e9d0241102fadaca81d888f26f845bc8c13c1e4ee", size = 4938435, upload-time = "2025-07-29T18:38:29.193Z" }, - { url = "https://files.pythonhosted.org/packages/81/1c/d27a930bca31fb07d975a2d7eaf3404f9388114463b9f15032813c98f893/ml_dtypes-0.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:58e39349d820b5702bb6f94ea0cb2dc8ec62ee81c0267d9622067d8333596a46", size = 206334, upload-time = "2025-07-29T18:38:30.687Z" }, - { url = "https://files.pythonhosted.org/packages/1a/d8/6922499effa616012cb8dc445280f66d100a7ff39b35c864cfca019b3f89/ml_dtypes-0.5.3-cp311-cp311-win_arm64.whl", hash = "sha256:66c2756ae6cfd7f5224e355c893cfd617fa2f747b8bbd8996152cbdebad9a184", size = 157584, upload-time = "2025-07-29T18:38:32.187Z" }, - { url = "https://files.pythonhosted.org/packages/0d/eb/bc07c88a6ab002b4635e44585d80fa0b350603f11a2097c9d1bfacc03357/ml_dtypes-0.5.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:156418abeeda48ea4797db6776db3c5bdab9ac7be197c1233771e0880c304057", size = 663864, upload-time = "2025-07-29T18:38:33.777Z" }, - { url = "https://files.pythonhosted.org/packages/cf/89/11af9b0f21b99e6386b6581ab40fb38d03225f9de5f55cf52097047e2826/ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1db60c154989af253f6c4a34e8a540c2c9dce4d770784d426945e09908fbb177", size = 4951313, upload-time = "2025-07-29T18:38:36.45Z" }, - { url = "https://files.pythonhosted.org/packages/d8/a9/b98b86426c24900b0c754aad006dce2863df7ce0bb2bcc2c02f9cc7e8489/ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b255acada256d1fa8c35ed07b5f6d18bc21d1556f842fbc2d5718aea2cd9e55", size = 4928805, upload-time = "2025-07-29T18:38:38.29Z" }, - { url = "https://files.pythonhosted.org/packages/50/c1/85e6be4fc09c6175f36fb05a45917837f30af9a5146a5151cb3a3f0f9e09/ml_dtypes-0.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:da65e5fd3eea434ccb8984c3624bc234ddcc0d9f4c81864af611aaebcc08a50e", size = 208182, upload-time = "2025-07-29T18:38:39.72Z" }, - { url = "https://files.pythonhosted.org/packages/9e/17/cf5326d6867be057f232d0610de1458f70a8ce7b6290e4b4a277ea62b4cd/ml_dtypes-0.5.3-cp312-cp312-win_arm64.whl", hash = "sha256:8bb9cd1ce63096567f5f42851f5843b5a0ea11511e50039a7649619abfb4ba6d", size = 161560, upload-time = "2025-07-29T18:38:41.072Z" }, - { url = "https://files.pythonhosted.org/packages/2d/87/1bcc98a66de7b2455dfb292f271452cac9edc4e870796e0d87033524d790/ml_dtypes-0.5.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5103856a225465371fe119f2fef737402b705b810bd95ad5f348e6e1a6ae21af", size = 663781, upload-time = "2025-07-29T18:38:42.984Z" }, - { url = "https://files.pythonhosted.org/packages/fd/2c/bd2a79ba7c759ee192b5601b675b180a3fd6ccf48ffa27fe1782d280f1a7/ml_dtypes-0.5.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cae435a68861660af81fa3c5af16b70ca11a17275c5b662d9c6f58294e0f113", size = 4956217, upload-time = "2025-07-29T18:38:44.65Z" }, - { url = "https://files.pythonhosted.org/packages/14/f3/091ba84e5395d7fe5b30c081a44dec881cd84b408db1763ee50768b2ab63/ml_dtypes-0.5.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6936283b56d74fbec431ca57ce58a90a908fdbd14d4e2d22eea6d72bb208a7b7", size = 4933109, upload-time = "2025-07-29T18:38:46.405Z" }, - { url = "https://files.pythonhosted.org/packages/bc/24/054036dbe32c43295382c90a1363241684c4d6aaa1ecc3df26bd0c8d5053/ml_dtypes-0.5.3-cp313-cp313-win_amd64.whl", hash = "sha256:d0f730a17cf4f343b2c7ad50cee3bd19e969e793d2be6ed911f43086460096e4", size = 208187, upload-time = "2025-07-29T18:38:48.24Z" }, - { url = "https://files.pythonhosted.org/packages/a6/3d/7dc3ec6794a4a9004c765e0c341e32355840b698f73fd2daff46f128afc1/ml_dtypes-0.5.3-cp313-cp313-win_arm64.whl", hash = "sha256:2db74788fc01914a3c7f7da0763427280adfc9cd377e9604b6b64eb8097284bd", size = 161559, upload-time = "2025-07-29T18:38:50.493Z" }, - { url = "https://files.pythonhosted.org/packages/12/91/e6c7a0d67a152b9330445f9f0cf8ae6eee9b83f990b8c57fe74631e42a90/ml_dtypes-0.5.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:93c36a08a6d158db44f2eb9ce3258e53f24a9a4a695325a689494f0fdbc71770", size = 689321, upload-time = "2025-07-29T18:38:52.03Z" }, - { url = "https://files.pythonhosted.org/packages/9e/6c/b7b94b84a104a5be1883305b87d4c6bd6ae781504474b4cca067cb2340ec/ml_dtypes-0.5.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0e44a3761f64bc009d71ddb6d6c71008ba21b53ab6ee588dadab65e2fa79eafc", size = 5274495, upload-time = "2025-07-29T18:38:53.797Z" }, - { url = "https://files.pythonhosted.org/packages/5b/38/6266604dffb43378055394ea110570cf261a49876fc48f548dfe876f34cc/ml_dtypes-0.5.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdf40d2aaabd3913dec11840f0d0ebb1b93134f99af6a0a4fd88ffe924928ab4", size = 5285422, upload-time = "2025-07-29T18:38:56.603Z" }, - { url = "https://files.pythonhosted.org/packages/7c/88/8612ff177d043a474b9408f0382605d881eeb4125ba89d4d4b3286573a83/ml_dtypes-0.5.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:aec640bd94c4c85c0d11e2733bd13cbb10438fb004852996ec0efbc6cacdaf70", size = 661182, upload-time = "2025-07-29T18:38:58.414Z" }, - { url = "https://files.pythonhosted.org/packages/6f/2b/0569a5e88b29240d373e835107c94ae9256fb2191d3156b43b2601859eff/ml_dtypes-0.5.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bda32ce212baa724e03c68771e5c69f39e584ea426bfe1a701cb01508ffc7035", size = 4956187, upload-time = "2025-07-29T18:39:00.611Z" }, - { url = "https://files.pythonhosted.org/packages/51/66/273c2a06ae44562b104b61e6b14444da00061fd87652506579d7eb2c40b1/ml_dtypes-0.5.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c205cac07d24a29840c163d6469f61069ce4b065518519216297fc2f261f8db9", size = 4930911, upload-time = "2025-07-29T18:39:02.405Z" }, - { url = "https://files.pythonhosted.org/packages/93/ab/606be3e87dc0821bd360c8c1ee46108025c31a4f96942b63907bb441b87d/ml_dtypes-0.5.3-cp314-cp314-win_amd64.whl", hash = "sha256:cd7c0bb22d4ff86d65ad61b5dd246812e8993fbc95b558553624c33e8b6903ea", size = 216664, upload-time = "2025-07-29T18:39:03.927Z" }, - { url = "https://files.pythonhosted.org/packages/30/a2/e900690ca47d01dffffd66375c5de8c4f8ced0f1ef809ccd3b25b3e6b8fa/ml_dtypes-0.5.3-cp314-cp314-win_arm64.whl", hash = "sha256:9d55ea7f7baf2aed61bf1872116cefc9d0c3693b45cae3916897ee27ef4b835e", size = 160203, upload-time = "2025-07-29T18:39:05.671Z" }, - { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" }, - { url = "https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" }, - { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" }, +version = "0.5.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/3a/c5b855752a70267ff729c349e650263adb3c206c29d28cc8ea7ace30a1d5/ml_dtypes-0.5.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b95e97e470fe60ed493fd9ae3911d8da4ebac16bd21f87ffa2b7c588bf22ea2c", size = 679735, upload-time = "2025-11-17T22:31:31.367Z" }, + { url = "https://files.pythonhosted.org/packages/41/79/7433f30ee04bd4faa303844048f55e1eb939131c8e5195a00a96a0939b64/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4b801ebe0b477be666696bda493a9be8356f1f0057a57f1e35cd26928823e5a", size = 5051883, upload-time = "2025-11-17T22:31:33.658Z" }, + { url = "https://files.pythonhosted.org/packages/10/b1/8938e8830b0ee2e167fc75a094dea766a1152bde46752cd9bfc57ee78a82/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:388d399a2152dd79a3f0456a952284a99ee5c93d3e2f8dfe25977511e0515270", size = 5030369, upload-time = "2025-11-17T22:31:35.595Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a3/51886727bd16e2f47587997b802dd56398692ce8c6c03c2e5bb32ecafe26/ml_dtypes-0.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:4ff7f3e7ca2972e7de850e7b8fcbb355304271e2933dd90814c1cb847414d6e2", size = 210738, upload-time = "2025-11-17T22:31:37.43Z" }, + { url = "https://files.pythonhosted.org/packages/c6/5e/712092cfe7e5eb667b8ad9ca7c54442f21ed7ca8979745f1000e24cf8737/ml_dtypes-0.5.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6c7ecb74c4bd71db68a6bea1edf8da8c34f3d9fe218f038814fd1d310ac76c90", size = 679734, upload-time = "2025-11-17T22:31:39.223Z" }, + { url = "https://files.pythonhosted.org/packages/4f/cf/912146dfd4b5c0eea956836c01dcd2fce6c9c844b2691f5152aca196ce4f/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc11d7e8c44a65115d05e2ab9989d1e045125d7be8e05a071a48bc76eb6d6040", size = 5056165, upload-time = "2025-11-17T22:31:41.071Z" }, + { url = "https://files.pythonhosted.org/packages/a9/80/19189ea605017473660e43762dc853d2797984b3c7bf30ce656099add30c/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19b9a53598f21e453ea2fbda8aa783c20faff8e1eeb0d7ab899309a0053f1483", size = 5034975, upload-time = "2025-11-17T22:31:42.758Z" }, + { url = "https://files.pythonhosted.org/packages/b4/24/70bd59276883fdd91600ca20040b41efd4902a923283c4d6edcb1de128d2/ml_dtypes-0.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:7c23c54a00ae43edf48d44066a7ec31e05fdc2eee0be2b8b50dd1903a1db94bb", size = 210742, upload-time = "2025-11-17T22:31:44.068Z" }, + { url = "https://files.pythonhosted.org/packages/a0/c9/64230ef14e40aa3f1cb254ef623bf812735e6bec7772848d19131111ac0d/ml_dtypes-0.5.4-cp311-cp311-win_arm64.whl", hash = "sha256:557a31a390b7e9439056644cb80ed0735a6e3e3bb09d67fd5687e4b04238d1de", size = 160709, upload-time = "2025-11-17T22:31:46.557Z" }, + { url = "https://files.pythonhosted.org/packages/a8/b8/3c70881695e056f8a32f8b941126cf78775d9a4d7feba8abcb52cb7b04f2/ml_dtypes-0.5.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a174837a64f5b16cab6f368171a1a03a27936b31699d167684073ff1c4237dac", size = 676927, upload-time = "2025-11-17T22:31:48.182Z" }, + { url = "https://files.pythonhosted.org/packages/54/0f/428ef6881782e5ebb7eca459689448c0394fa0a80bea3aa9262cba5445ea/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a7f7c643e8b1320fd958bf098aa7ecf70623a42ec5154e3be3be673f4c34d900", size = 5028464, upload-time = "2025-11-17T22:31:50.135Z" }, + { url = "https://files.pythonhosted.org/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ad459e99793fa6e13bd5b7e6792c8f9190b4e5a1b45c63aba14a4d0a7f1d5ff", size = 5009002, upload-time = "2025-11-17T22:31:52.001Z" }, + { url = "https://files.pythonhosted.org/packages/f5/f0/0cfadd537c5470378b1b32bd859cf2824972174b51b873c9d95cfd7475a5/ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:c1a953995cccb9e25a4ae19e34316671e4e2edaebe4cf538229b1fc7109087b7", size = 212222, upload-time = "2025-11-17T22:31:53.742Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/9acc86985bfad8f2c2d30291b27cd2bb4c74cea08695bd540906ed744249/ml_dtypes-0.5.4-cp312-cp312-win_arm64.whl", hash = "sha256:9bad06436568442575beb2d03389aa7456c690a5b05892c471215bfd8cf39460", size = 160793, upload-time = "2025-11-17T22:31:55.358Z" }, + { url = "https://files.pythonhosted.org/packages/d9/a1/4008f14bbc616cfb1ac5b39ea485f9c63031c4634ab3f4cf72e7541f816a/ml_dtypes-0.5.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c760d85a2f82e2bed75867079188c9d18dae2ee77c25a54d60e9cc79be1bc48", size = 676888, upload-time = "2025-11-17T22:31:56.907Z" }, + { url = "https://files.pythonhosted.org/packages/d3/b7/dff378afc2b0d5a7d6cd9d3209b60474d9819d1189d347521e1688a60a53/ml_dtypes-0.5.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce756d3a10d0c4067172804c9cc276ba9cc0ff47af9078ad439b075d1abdc29b", size = 5036993, upload-time = "2025-11-17T22:31:58.497Z" }, + { url = "https://files.pythonhosted.org/packages/eb/33/40cd74219417e78b97c47802037cf2d87b91973e18bb968a7da48a96ea44/ml_dtypes-0.5.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:533ce891ba774eabf607172254f2e7260ba5f57bdd64030c9a4fcfbd99815d0d", size = 5010956, upload-time = "2025-11-17T22:31:59.931Z" }, + { url = "https://files.pythonhosted.org/packages/e1/8b/200088c6859d8221454825959df35b5244fa9bdf263fd0249ac5fb75e281/ml_dtypes-0.5.4-cp313-cp313-win_amd64.whl", hash = "sha256:f21c9219ef48ca5ee78402d5cc831bd58ea27ce89beda894428bc67a52da5328", size = 212224, upload-time = "2025-11-17T22:32:01.349Z" }, + { url = "https://files.pythonhosted.org/packages/8f/75/dfc3775cb36367816e678f69a7843f6f03bd4e2bcd79941e01ea960a068e/ml_dtypes-0.5.4-cp313-cp313-win_arm64.whl", hash = "sha256:35f29491a3e478407f7047b8a4834e4640a77d2737e0b294d049746507af5175", size = 160798, upload-time = "2025-11-17T22:32:02.864Z" }, + { url = "https://files.pythonhosted.org/packages/4f/74/e9ddb35fd1dd43b1106c20ced3f53c2e8e7fc7598c15638e9f80677f81d4/ml_dtypes-0.5.4-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:304ad47faa395415b9ccbcc06a0350800bc50eda70f0e45326796e27c62f18b6", size = 702083, upload-time = "2025-11-17T22:32:04.08Z" }, + { url = "https://files.pythonhosted.org/packages/74/f5/667060b0aed1aa63166b22897fdf16dca9eb704e6b4bbf86848d5a181aa7/ml_dtypes-0.5.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6a0df4223b514d799b8a1629c65ddc351b3efa833ccf7f8ea0cf654a61d1e35d", size = 5354111, upload-time = "2025-11-17T22:32:05.546Z" }, + { url = "https://files.pythonhosted.org/packages/40/49/0f8c498a28c0efa5f5c95a9e374c83ec1385ca41d0e85e7cf40e5d519a21/ml_dtypes-0.5.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531eff30e4d368cb6255bc2328d070e35836aa4f282a0fb5f3a0cd7260257298", size = 5366453, upload-time = "2025-11-17T22:32:07.115Z" }, + { url = "https://files.pythonhosted.org/packages/8c/27/12607423d0a9c6bbbcc780ad19f1f6baa2b68b18ce4bddcdc122c4c68dc9/ml_dtypes-0.5.4-cp313-cp313t-win_amd64.whl", hash = "sha256:cb73dccfc991691c444acc8c0012bee8f2470da826a92e3a20bb333b1a7894e6", size = 225612, upload-time = "2025-11-17T22:32:08.615Z" }, + { url = "https://files.pythonhosted.org/packages/e5/80/5a5929e92c72936d5b19872c5fb8fc09327c1da67b3b68c6a13139e77e20/ml_dtypes-0.5.4-cp313-cp313t-win_arm64.whl", hash = "sha256:3bbbe120b915090d9dd1375e4684dd17a20a2491ef25d640a908281da85e73f1", size = 164145, upload-time = "2025-11-17T22:32:09.782Z" }, + { url = "https://files.pythonhosted.org/packages/72/4e/1339dc6e2557a344f5ba5590872e80346f76f6cb2ac3dd16e4666e88818c/ml_dtypes-0.5.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2b857d3af6ac0d39db1de7c706e69c7f9791627209c3d6dedbfca8c7e5faec22", size = 673781, upload-time = "2025-11-17T22:32:11.364Z" }, + { url = "https://files.pythonhosted.org/packages/04/f9/067b84365c7e83bda15bba2b06c6ca250ce27b20630b1128c435fb7a09aa/ml_dtypes-0.5.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:805cef3a38f4eafae3a5bf9ebdcdb741d0bcfd9e1bd90eb54abd24f928cd2465", size = 5036145, upload-time = "2025-11-17T22:32:12.783Z" }, + { url = "https://files.pythonhosted.org/packages/c6/bb/82c7dcf38070b46172a517e2334e665c5bf374a262f99a283ea454bece7c/ml_dtypes-0.5.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14a4fd3228af936461db66faccef6e4f41c1d82fcc30e9f8d58a08916b1d811f", size = 5010230, upload-time = "2025-11-17T22:32:14.38Z" }, + { url = "https://files.pythonhosted.org/packages/e9/93/2bfed22d2498c468f6bcd0d9f56b033eaa19f33320389314c19ef6766413/ml_dtypes-0.5.4-cp314-cp314-win_amd64.whl", hash = "sha256:8c6a2dcebd6f3903e05d51960a8058d6e131fe69f952a5397e5dbabc841b6d56", size = 221032, upload-time = "2025-11-17T22:32:15.763Z" }, + { url = "https://files.pythonhosted.org/packages/76/a3/9c912fe6ea747bb10fe2f8f54d027eb265db05dfb0c6335e3e063e74e6e8/ml_dtypes-0.5.4-cp314-cp314-win_arm64.whl", hash = "sha256:5a0f68ca8fd8d16583dfa7793973feb86f2fbb56ce3966daf9c9f748f52a2049", size = 163353, upload-time = "2025-11-17T22:32:16.932Z" }, + { url = "https://files.pythonhosted.org/packages/cd/02/48aa7d84cc30ab4ee37624a2fd98c56c02326785750cd212bc0826c2f15b/ml_dtypes-0.5.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:bfc534409c5d4b0bf945af29e5d0ab075eae9eecbb549ff8a29280db822f34f9", size = 702085, upload-time = "2025-11-17T22:32:18.175Z" }, + { url = "https://files.pythonhosted.org/packages/5a/e7/85cb99fe80a7a5513253ec7faa88a65306be071163485e9a626fce1b6e84/ml_dtypes-0.5.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2314892cdc3fcf05e373d76d72aaa15fda9fb98625effa73c1d646f331fcecb7", size = 5355358, upload-time = "2025-11-17T22:32:19.7Z" }, + { url = "https://files.pythonhosted.org/packages/79/2b/a826ba18d2179a56e144aef69e57fb2ab7c464ef0b2111940ee8a3a223a2/ml_dtypes-0.5.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d2ffd05a2575b1519dc928c0b93c06339eb67173ff53acb00724502cda231cf", size = 5366332, upload-time = "2025-11-17T22:32:21.193Z" }, + { url = "https://files.pythonhosted.org/packages/84/44/f4d18446eacb20ea11e82f133ea8f86e2bf2891785b67d9da8d0ab0ef525/ml_dtypes-0.5.4-cp314-cp314t-win_amd64.whl", hash = "sha256:4381fe2f2452a2d7589689693d3162e876b3ddb0a832cde7a414f8e1adf7eab1", size = 236612, upload-time = "2025-11-17T22:32:22.579Z" }, + { url = "https://files.pythonhosted.org/packages/ad/3f/3d42e9a78fe5edf792a83c074b13b9b770092a4fbf3462872f4303135f09/ml_dtypes-0.5.4-cp314-cp314t-win_arm64.whl", hash = "sha256:11942cbf2cf92157db91e5022633c0d9474d4dfd813a909383bd23ce828a4b7d", size = 168825, upload-time = "2025-11-17T22:32:23.766Z" }, ] [[package]] @@ -2789,7 +2581,7 @@ wheels = [ [[package]] name = "multi-storage-client" -version = "0.33.0" +version = "0.36.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2802,26 +2594,27 @@ dependencies = [ { name = "python-dateutil" }, { name = "pyyaml" }, { name = "tqdm" }, + { name = "tzdata" }, { name = "wcmatch" }, { name = "xattr" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/c4/6279fb7d4b8b0a7af060047d592f00f8d49c547adfebe50bcd8d0d2dc8a5/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:df52b3040ef5698c6388fa589bd63812ae0d2f967d358a792abcad5638686590", size = 5282006, upload-time = "2025-10-23T03:45:37.761Z" }, - { url = "https://files.pythonhosted.org/packages/22/3b/23d8beccd73b887c4552bf884275611255b5028388fa3317365cd56c2a93/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:370da04b1e56a601ba505a29d42fcabc19b583e10d725a37bc0c11ba3573d211", size = 5403083, upload-time = "2025-10-23T03:53:11.998Z" }, - { url = "https://files.pythonhosted.org/packages/b0/ad/dc355d05fd369da0d800e5f7de24da0393f542c5a6f775f6bcee7edcacb1/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c57749a28ec5d49440f465fd73e4e2feaab18ece9b6e57c73395308b41950f66", size = 3178432, upload-time = "2025-10-23T04:07:00.543Z" }, - { url = "https://files.pythonhosted.org/packages/e0/ad/97b54419d8a58f696b85504568391a627641152f80650d7d2697fc2702ed/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7d95f5fe094aab00a240bf6aa11dfe85bec293b76b3688ec3a9c33d86c751d2", size = 3351102, upload-time = "2025-10-23T03:47:47.622Z" }, - { url = "https://files.pythonhosted.org/packages/52/28/1038a68b9df1b179a61967ce9f7d2e80b9954cdb289801afecde5f7660db/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4b5a0f5a0b7684835be20ae6782070884982a86665e9bab317375a56a20294d1", size = 5281523, upload-time = "2025-10-23T04:06:36.671Z" }, - { url = "https://files.pythonhosted.org/packages/6c/c5/e18de5e2a2671efdc0a12383b8d63f523044ca453525725b3450d0179c0e/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:0db694311f90f44ee8f6f7734a14a0857738a467f2ae201649218a3ecf1f6ab2", size = 5403353, upload-time = "2025-10-23T04:07:25.941Z" }, - { url = "https://files.pythonhosted.org/packages/7e/c9/d9f65eb2370151dbbb06925f4216ee017e6cdbf7657263fd98e60944e52b/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cbe3a0b856f0b968f9fc693670a521b5a995b625351241ca008f866fdfff62a", size = 3180052, upload-time = "2025-10-23T03:57:32.797Z" }, - { url = "https://files.pythonhosted.org/packages/e7/38/08b9d84c93b19ae87caf542ae77f17dfa44a85281ba09de660ffcf3a7718/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:018e7e82255feeff973ff02563f11a30f5e507e4cbc87a2167a9568740144ef2", size = 3351389, upload-time = "2025-10-23T04:02:07.348Z" }, - { url = "https://files.pythonhosted.org/packages/6a/31/c95634a27723b5ba9d2d74158444cc5e40b151b51ae59ca196fc9993f039/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:030b3a592c6352605e9ebdb8d9303dd42daf5d171ffa684f3283d4a5c6e2edfe", size = 5273976, upload-time = "2025-10-23T04:04:35.99Z" }, - { url = "https://files.pythonhosted.org/packages/8c/cf/82d1778d73c3baaec331da4ae8d01fa7934bcd73336aa88a08d86d080347/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:14dc0ace16d3830917427d6376d14ef62bd053fb2509f893998555ca1e9c4dcb", size = 5400735, upload-time = "2025-10-23T03:58:37.149Z" }, - { url = "https://files.pythonhosted.org/packages/fc/34/a6194ec725ef80c02de58b5ed3520bb1711807df75a27f7214effd22df34/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2821765d5c6de365b5b1dcdc7cf2ebba719ff4061fd02975639629f8aa319f6", size = 3182623, upload-time = "2025-10-23T04:03:29.551Z" }, - { url = "https://files.pythonhosted.org/packages/8f/36/7ec85178fd1dd69c278407a82acaccfb806449deda13f3dbd41f653d73bd/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f92f89480c58067fa53c178785b86e7650e16f277a61a732a8a7019173b16129", size = 3352104, upload-time = "2025-10-23T04:08:51.005Z" }, - { url = "https://files.pythonhosted.org/packages/88/ef/f2eb2efefb0e0588b29ed573b8354ecd72c38e6143da7ed5ecf53e859bf8/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed9af7e77e3cbac1f614816062b36975dcbc610bd3f8c86741d48aa18c718781", size = 5272154, upload-time = "2025-10-23T04:07:49.572Z" }, - { url = "https://files.pythonhosted.org/packages/1e/49/050aa4fccb2579d2ef5bd0d27169ec98fe85c92bba7a2c31154c491a4f75/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:c9d75e95a266ee858cf20c88ed255021552de67a40af9c8884d2fc22037dcd2b", size = 5399474, upload-time = "2025-10-23T04:09:14.545Z" }, - { url = "https://files.pythonhosted.org/packages/f6/4b/70c2df3b60c28360f185188d351e9c3958b702614963a09ffb1dc251c1ca/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48195a2ab9e6e9a2763bde17184cad2bdef82684353e210d0d325f20cea18869", size = 3181788, upload-time = "2025-10-23T04:03:10.404Z" }, - { url = "https://files.pythonhosted.org/packages/9b/96/5008852677fdad10eb9d8dd08a6ea58c6f7e820199a3b2c56607186ac6d5/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd64403efdcee2a6efcf7bfdb01422dd174c146014563b09f44590346fd835e6", size = 3351269, upload-time = "2025-10-23T04:00:34.714Z" }, + { url = "https://files.pythonhosted.org/packages/be/5f/8011fd041f695670b339c25f059b68207c315250ccc25a08f190bff78318/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:763cdb5e24b78adf33882b1d1c0d15021cc2c0088ffc6e7b0269259f0cd45fd2", size = 5299321, upload-time = "2025-11-26T20:03:58.147Z" }, + { url = "https://files.pythonhosted.org/packages/51/06/cfd17d307fe29fbbce9f196ec1d8dda3f93fd44711c0adb282d9c393a2b2/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:eb84ea0bdffcfddf9beb7239c6d0b1950a67a0afe36ef970da70ba4ab373c0c9", size = 5420867, upload-time = "2025-11-26T20:05:32.445Z" }, + { url = "https://files.pythonhosted.org/packages/7c/7f/bf22f9c67c70d5ec2f6a7a4798cb106f3023bf25ba6c21b0ade1a53fa5b3/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff03a0213ce1377abee61e8deb87607f0ccd35c245fbaab2fee51d2e591e833e", size = 3188237, upload-time = "2025-11-26T20:01:51.354Z" }, + { url = "https://files.pythonhosted.org/packages/fb/20/c0c019b3dc7719f79c1826364fc9c3e1bbe9b00246b1d7414ce2b4defd0b/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16e577ef4ee6f8ac481b3f2290e7b0525676efd82c71fb694ba4e6c65a8facd", size = 3363259, upload-time = "2025-11-26T20:00:10.679Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f8/eea6be7f4258c811373dc989e8eaa23a404499c2574059f6fd876d6904e4/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6c913b132573fbd7a5ada63086d3ce2669b913b79206f86867cc674d57b9164d", size = 5299844, upload-time = "2025-11-26T20:00:32.46Z" }, + { url = "https://files.pythonhosted.org/packages/df/aa/b73441dc17097ee92e7efac5080e2cfb8fe4515dd4dc91ca351829e6b7a9/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:4dd2ccf67deae403098a5e867ce33d35ce348d2acd1a743c9ef485b3b1eea65c", size = 5424007, upload-time = "2025-11-26T19:55:30.305Z" }, + { url = "https://files.pythonhosted.org/packages/54/d6/850550de6b0dc740ced2f8fbf83f13f757860b5fdaa652e477c567c01f34/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04b31b6a5d6a3c90a592b23a4b90368fa1dcca8cb03f76a862d307f8b072c1d3", size = 3188451, upload-time = "2025-11-26T19:56:32.191Z" }, + { url = "https://files.pythonhosted.org/packages/a3/c5/93e038c0cce46cb9b1b8e19f7215ce3e7fa1af5e0a9662f36dfe47062f7e/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:252f84116f674962eabd066e16040f0304f6191c06ab09ef2ec02dbfd2c4d2ea", size = 3366554, upload-time = "2025-11-26T19:58:37.742Z" }, + { url = "https://files.pythonhosted.org/packages/28/a2/46320db394150a2f0547930b902e8ad045a084fb519f408e2c9b4ca673a0/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2058e8e8f8fd9eef033171b0bf1966596e9862c7f20c2886101ad979996c453b", size = 5293778, upload-time = "2025-11-26T20:07:11.731Z" }, + { url = "https://files.pythonhosted.org/packages/00/2d/658af3b4104c4f2aa2621469482dca8270490601e98d8f7997361499adaa/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:22b69c7f3c9ffa166f38bafa7e08f6b664a5dbee8c88d5d740bed719e6f410a1", size = 5418642, upload-time = "2025-11-26T19:58:15.717Z" }, + { url = "https://files.pythonhosted.org/packages/09/2f/6441794bf8dc195d614d63ad2b7068ad7703972fd6f960d43202d29748b1/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b384fb326637e79706ff706e60f384b24fdbcc824420bb66ef615a9ef5ffb4ec", size = 3194133, upload-time = "2025-11-26T20:05:54.618Z" }, + { url = "https://files.pythonhosted.org/packages/0e/ba/b07361ff84e5bd263e299b03776382f59bd92862573c915dd705a09f3c1d/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7111567b971a68719c0eb68245d49a0a3c3bf5af2f609351446f20ac3e83c0d5", size = 3364563, upload-time = "2025-11-26T20:04:20.3Z" }, + { url = "https://files.pythonhosted.org/packages/f9/4a/cbd61589a457e2f4fbacd08b7e7dd11cdb74690857f4b40042844b1ff894/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a8137558d5f05e4722c54540e2d6067ea61e9ce3d736fa9cb5c541c7f94d1b48", size = 5293550, upload-time = "2025-11-26T20:03:36.459Z" }, + { url = "https://files.pythonhosted.org/packages/a7/3d/7499a9d537fa950a9acf11604b1f9372ed2cadd582b55f1c7cb885ce6f40/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:5394c5e040c32433b42e902d9fcf03f8a475c5c9ff1cca80743b2cb944c8af9e", size = 5417538, upload-time = "2025-11-26T20:06:16.782Z" }, + { url = "https://files.pythonhosted.org/packages/d7/c3/1b1adc3b3b8569d258a34dbedb6a8c51fc94b947b2df276e251f0f1e23a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:195e8c8d57d812b73efd41b96cd60825c484d317ec86379fad3e435e9365a4a6", size = 3193426, upload-time = "2025-11-26T20:00:56.034Z" }, + { url = "https://files.pythonhosted.org/packages/60/f5/f8b97a87d928057b493733760f37de70ae5ffff84b86f6efae101cdd57a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8402d0e1cefedf38ad9eefe8b3c56d3a44cfec7775ef711da18e7dbf72669444", size = 3363531, upload-time = "2025-11-26T20:02:35.296Z" }, ] [[package]] @@ -3025,7 +2818,7 @@ dependencies = [ { name = "jinja2" }, { name = "leptonai" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "omegaconf" }, { name = "packaging" }, { name = "rich" }, @@ -3049,51 +2842,21 @@ wheels = [ [[package]] name = "networkx" -version = "3.5" +version = "3.6" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", -] -sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", +] +sdist = { url = "https://files.pythonhosted.org/packages/e8/fc/7b6fd4d22c8c4dc5704430140d8b3f520531d4fe7328b8f8d03f5a7950e8/networkx-3.6.tar.gz", hash = "sha256:285276002ad1f7f7da0f7b42f004bcba70d381e936559166363707fdad3d72ad", size = 2511464, upload-time = "2025-11-24T03:03:47.158Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c7/d64168da60332c17d24c0d2f08bdf3987e8d1ae9d84b5bbd0eec2eb26a55/networkx-3.6-py3-none-any.whl", hash = "sha256:cdb395b105806062473d3be36458d8f1459a4e4b98e236a66c3a48996e07684f", size = 2063713, upload-time = "2025-11-24T03:03:45.21Z" }, ] [[package]] @@ -3138,170 +2901,373 @@ wheels = [ ] [[package]] -name = "numcodecs" -version = "0.13.1" +name = "numpy" +version = "2.2.6" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'linux'", "python_full_version < '3.11' and sys_platform != 'linux'", ] -dependencies = [ - { name = "numpy", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/85/56/8895a76abe4ec94ebd01eeb6d74f587bc4cddd46569670e1402852a5da13/numcodecs-0.13.1.tar.gz", hash = "sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc", size = 5955215, upload-time = "2024-10-09T16:28:00.188Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/c0/6d72cde772bcec196b7188731d41282993b2958440f77fdf0db216f722da/numcodecs-0.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b", size = 1580012, upload-time = "2024-10-09T16:27:19.069Z" }, - { url = "https://files.pythonhosted.org/packages/94/1d/f81fc1fa9210bbea97258242393a1f9feab4f6d8fb201f81f76003005e4b/numcodecs-0.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15", size = 1176919, upload-time = "2024-10-09T16:27:21.634Z" }, - { url = "https://files.pythonhosted.org/packages/16/e4/b9ec2f4dfc34ecf724bc1beb96a9f6fa9b91801645688ffadacd485089da/numcodecs-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28", size = 8625842, upload-time = "2024-10-09T16:27:24.168Z" }, - { url = "https://files.pythonhosted.org/packages/fe/90/299952e1477954ec4f92813fa03e743945e3ff711bb4f6c9aace431cb3da/numcodecs-0.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab", size = 828638, upload-time = "2024-10-09T16:27:27.063Z" }, - { url = "https://files.pythonhosted.org/packages/f0/78/34b8e869ef143e88d62e8231f4dbfcad85e5c41302a11fc5bd2228a13df5/numcodecs-0.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666", size = 1580199, upload-time = "2024-10-09T16:27:29.336Z" }, - { url = "https://files.pythonhosted.org/packages/3b/cf/f70797d86bb585d258d1e6993dced30396f2044725b96ce8bcf87a02be9c/numcodecs-0.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc", size = 1177203, upload-time = "2024-10-09T16:27:31.011Z" }, - { url = "https://files.pythonhosted.org/packages/a8/b5/d14ad69b63fde041153dfd05d7181a49c0d4864de31a7a1093c8370da957/numcodecs-0.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f", size = 8868743, upload-time = "2024-10-09T16:27:32.833Z" }, - { url = "https://files.pythonhosted.org/packages/13/d4/27a7b5af0b33f6d61e198faf177fbbf3cb83ff10d9d1a6857b7efc525ad5/numcodecs-0.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b", size = 829603, upload-time = "2024-10-09T16:27:35.415Z" }, - { url = "https://files.pythonhosted.org/packages/37/3a/bc09808425e7d3df41e5fc73fc7a802c429ba8c6b05e55f133654ade019d/numcodecs-0.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917", size = 1575806, upload-time = "2024-10-09T16:27:37.804Z" }, - { url = "https://files.pythonhosted.org/packages/3a/cc/dc74d0bfdf9ec192332a089d199f1e543e747c556b5659118db7a437dcca/numcodecs-0.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6", size = 1178233, upload-time = "2024-10-09T16:27:40.169Z" }, - { url = "https://files.pythonhosted.org/packages/d4/ce/434e8e3970b8e92ae9ab6d9db16cb9bc7aa1cd02e17c11de6848224100a1/numcodecs-0.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53", size = 8857827, upload-time = "2024-10-09T16:27:42.743Z" }, - { url = "https://files.pythonhosted.org/packages/83/e7/1d8b1b266a92f9013c755b1c146c5ad71a2bff147ecbc67f86546a2e4d6a/numcodecs-0.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca", size = 826539, upload-time = "2024-10-09T16:27:44.808Z" }, - { url = "https://files.pythonhosted.org/packages/83/8b/06771dead2cc4a8ae1ea9907737cf1c8d37a323392fa28f938a586373468/numcodecs-0.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43", size = 1571660, upload-time = "2024-10-09T16:27:47.125Z" }, - { url = "https://files.pythonhosted.org/packages/f9/ea/d925bf85f92dfe4635356018da9fe4bfecb07b1c72f62b01c1bc47f936b1/numcodecs-0.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf", size = 1169925, upload-time = "2024-10-09T16:27:49.512Z" }, - { url = "https://files.pythonhosted.org/packages/0f/d6/643a3839d571d8e439a2c77dc4b0b8cab18d96ac808e4a81dbe88e959ab6/numcodecs-0.13.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701", size = 8814257, upload-time = "2024-10-09T16:27:52.059Z" }, - { url = "https://files.pythonhosted.org/packages/a6/c5/f3e56bc9b4e438a287fff738993d6d11abef368c0328a612ac2842ba9fca/numcodecs-0.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176", size = 821887, upload-time = "2024-10-09T16:27:55.039Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time = "2025-05-17T21:27:58.555Z" }, + { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" }, + { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" }, + { url = "https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" }, + { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" }, + { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" }, + { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" }, + { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" }, + { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" }, + { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" }, + { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, + { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, + { url = "https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, + { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, + { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, + { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, + { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, + { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, + { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" }, + { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, + { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, + { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, + { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, + { url = "https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" }, + { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" }, + { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" }, + { url = "https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" }, + { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" }, + { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" }, + { url = "https://files.pythonhosted.org/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828, upload-time = "2025-05-17T21:37:56.699Z" }, + { url = "https://files.pythonhosted.org/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006, upload-time = "2025-05-17T21:38:18.291Z" }, + { url = "https://files.pythonhosted.org/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765, upload-time = "2025-05-17T21:38:27.319Z" }, + { url = "https://files.pythonhosted.org/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736, upload-time = "2025-05-17T21:38:38.141Z" }, + { url = "https://files.pythonhosted.org/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719, upload-time = "2025-05-17T21:38:58.433Z" }, + { url = "https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072, upload-time = "2025-05-17T21:39:22.638Z" }, + { url = "https://files.pythonhosted.org/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213, upload-time = "2025-05-17T21:39:45.865Z" }, + { url = "https://files.pythonhosted.org/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632, upload-time = "2025-05-17T21:40:13.331Z" }, + { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" }, + { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" }, + { url = "https://files.pythonhosted.org/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467, upload-time = "2025-05-17T21:40:44Z" }, + { url = "https://files.pythonhosted.org/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144, upload-time = "2025-05-17T21:41:05.695Z" }, + { url = "https://files.pythonhosted.org/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217, upload-time = "2025-05-17T21:41:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014, upload-time = "2025-05-17T21:41:27.321Z" }, + { url = "https://files.pythonhosted.org/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935, upload-time = "2025-05-17T21:41:49.738Z" }, + { url = "https://files.pythonhosted.org/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122, upload-time = "2025-05-17T21:42:14.046Z" }, + { url = "https://files.pythonhosted.org/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143, upload-time = "2025-05-17T21:42:37.464Z" }, + { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" }, + { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" }, + { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" }, + { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" }, + { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" }, ] [[package]] -name = "numcodecs" -version = "0.16.3" +name = "numpy" +version = "2.3.5" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", "python_full_version == '3.12.*' and sys_platform != 'linux'", "python_full_version == '3.11.*' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform != 'linux'", ] +sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/77/84dd1d2e34d7e2792a236ba180b5e8fcc1e3e414e761ce0253f63d7f572e/numpy-2.3.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de5672f4a7b200c15a4127042170a694d4df43c992948f5e1af57f0174beed10", size = 17034641, upload-time = "2025-11-16T22:49:19.336Z" }, + { url = "https://files.pythonhosted.org/packages/2a/ea/25e26fa5837106cde46ae7d0b667e20f69cbbc0efd64cba8221411ab26ae/numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218", size = 12528324, upload-time = "2025-11-16T22:49:22.582Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1a/e85f0eea4cf03d6a0228f5c0256b53f2df4bc794706e7df019fc622e47f1/numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d", size = 5356872, upload-time = "2025-11-16T22:49:25.408Z" }, + { url = "https://files.pythonhosted.org/packages/5c/bb/35ef04afd567f4c989c2060cde39211e4ac5357155c1833bcd1166055c61/numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5", size = 6893148, upload-time = "2025-11-16T22:49:27.549Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2b/05bbeb06e2dff5eab512dfc678b1cc5ee94d8ac5956a0885c64b6b26252b/numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7", size = 14557282, upload-time = "2025-11-16T22:49:30.964Z" }, + { url = "https://files.pythonhosted.org/packages/65/fb/2b23769462b34398d9326081fad5655198fcf18966fcb1f1e49db44fbf31/numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4", size = 16897903, upload-time = "2025-11-16T22:49:34.191Z" }, + { url = "https://files.pythonhosted.org/packages/ac/14/085f4cf05fc3f1e8aa95e85404e984ffca9b2275a5dc2b1aae18a67538b8/numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e", size = 16341672, upload-time = "2025-11-16T22:49:37.2Z" }, + { url = "https://files.pythonhosted.org/packages/6f/3b/1f73994904142b2aa290449b3bb99772477b5fd94d787093e4f24f5af763/numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748", size = 18838896, upload-time = "2025-11-16T22:49:39.727Z" }, + { url = "https://files.pythonhosted.org/packages/cd/b9/cf6649b2124f288309ffc353070792caf42ad69047dcc60da85ee85fea58/numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c", size = 6563608, upload-time = "2025-11-16T22:49:42.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/44/9fe81ae1dcc29c531843852e2874080dc441338574ccc4306b39e2ff6e59/numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c", size = 13078442, upload-time = "2025-11-16T22:49:43.99Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a7/f99a41553d2da82a20a2f22e93c94f928e4490bb447c9ff3c4ff230581d3/numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa", size = 10458555, upload-time = "2025-11-16T22:49:47.092Z" }, + { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" }, + { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" }, + { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" }, + { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" }, + { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" }, + { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" }, + { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" }, + { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" }, + { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" }, + { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" }, + { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" }, + { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" }, + { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" }, + { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" }, + { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" }, + { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" }, + { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" }, + { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" }, + { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" }, + { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" }, + { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" }, + { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" }, + { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" }, + { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" }, + { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" }, + { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" }, + { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" }, + { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" }, + { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" }, + { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" }, + { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" }, + { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" }, + { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" }, + { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" }, + { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" }, + { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" }, + { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" }, + { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" }, + { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" }, + { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" }, + { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" }, + { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" }, + { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" }, + { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" }, + { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" }, + { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" }, + { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" }, + { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" }, + { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" }, + { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" }, + { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" }, + { url = "https://files.pythonhosted.org/packages/c6/65/f9dea8e109371ade9c782b4e4756a82edf9d3366bca495d84d79859a0b79/numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310", size = 16910689, upload-time = "2025-11-16T22:52:23.247Z" }, + { url = "https://files.pythonhosted.org/packages/00/4f/edb00032a8fb92ec0a679d3830368355da91a69cab6f3e9c21b64d0bb986/numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c", size = 12457053, upload-time = "2025-11-16T22:52:26.367Z" }, + { url = "https://files.pythonhosted.org/packages/16/a4/e8a53b5abd500a63836a29ebe145fc1ab1f2eefe1cfe59276020373ae0aa/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18", size = 5285635, upload-time = "2025-11-16T22:52:29.266Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2f/37eeb9014d9c8b3e9c55bc599c68263ca44fdbc12a93e45a21d1d56df737/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff", size = 6801770, upload-time = "2025-11-16T22:52:31.421Z" }, + { url = "https://files.pythonhosted.org/packages/7d/e4/68d2f474df2cb671b2b6c2986a02e520671295647dad82484cde80ca427b/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb", size = 14391768, upload-time = "2025-11-16T22:52:33.593Z" }, + { url = "https://files.pythonhosted.org/packages/b8/50/94ccd8a2b141cb50651fddd4f6a48874acb3c91c8f0842b08a6afc4b0b21/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7", size = 16729263, upload-time = "2025-11-16T22:52:36.369Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" }, +] + +[[package]] +name = "nv-grouped-gemm" +version = "1.1.4.post6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "absl-py" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "torch", marker = "sys_platform == 'never'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/79/87c45f32e661b25e0aaa1e325ba166511f57be5dff8f0fcabc12d3e73b64/nv_grouped_gemm-1.1.4.post6.tar.gz", hash = "sha256:dad6115f4b4ff7ceb0bc40ad44e923c13a24fc88cfe1e20b1a6b4c9cf24c445c", size = 26508, upload-time = "2025-10-10T18:52:29.508Z" } + +[[package]] +name = "nv-one-logger-core" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.11'" }, + { name = "overrides" }, + { name = "pydantic" }, + { name = "strenum" }, + { name = "toml" }, + { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f6/48/6188e359b90a9d8a1850f2bc888c023e66f4a8b2b496820babbea414f008/numcodecs-0.16.3.tar.gz", hash = "sha256:53d705865faaf0a7927c973af3777532001c8fbb653de119c1e844608614d799", size = 6275704, upload-time = "2025-09-18T18:54:57.221Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/37/963095797035f371e0db6ea761f5aaccb624fc786af217115b423baeb0e2/nv_one_logger_core-2.3.1.tar.gz", hash = "sha256:cbb2f87604c78b96a302f32d87199902129d76153a73a20f8455a250b3246c1d", size = 52640, upload-time = "2025-10-29T21:11:55.812Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d4/cc/917a85972537498f2bbd7914047efc98babc8667587ceb9dcb228378978a/numcodecs-0.16.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:95c9f2a49bef10cf91ad614a761cba9bfe96656b60c12540e1080de5d909b4ca", size = 1642356, upload-time = "2025-09-18T18:54:36.402Z" }, - { url = "https://files.pythonhosted.org/packages/3b/6a/64c25a089e8537441fe67c09ecb7f3f7fb5d98cd04faf01f605d43aca41c/numcodecs-0.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2afe73d5ebaf9ca0cd5c83aad945da80d29a33d860a80d43a7248491d8813ff", size = 1169186, upload-time = "2025-09-18T18:54:37.838Z" }, - { url = "https://files.pythonhosted.org/packages/d8/a0/0de627baeb43e2045a3d4b3de99bf8b69af329a33df1ed4cda468d70c1fb/numcodecs-0.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:913f08194d82dcb37594e6705e6d4ae6ccd4b6571500b832fb3e4a155de1dfe8", size = 8341668, upload-time = "2025-09-18T18:54:39.444Z" }, - { url = "https://files.pythonhosted.org/packages/b6/0f/49d1f74a216149240c4b9403218111f11670bd11af0919fda357bb056bf2/numcodecs-0.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85a7f1cae9eb18b85709af46570bf9c60056e7155c4c8f610e8080c68124d0e5", size = 8866611, upload-time = "2025-09-18T18:54:41.168Z" }, - { url = "https://files.pythonhosted.org/packages/aa/51/03aece765108fe247717105b5131856546e5428f22a56a14ffdebd017424/numcodecs-0.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:f7bb7f2c46eb7ec8a1c5f8d8fe1a72c222256dd6d6df5af9eaac7a6b905f3575", size = 806787, upload-time = "2025-09-18T18:54:42.78Z" }, - { url = "https://files.pythonhosted.org/packages/0d/78/e4b34803a3aa1d0769919695de4b133266c18c80c474d32ebc462fa1a9bd/numcodecs-0.16.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c77454d92941a335d148b0b822f5d4783103f392774d5d76283bbf7f21b49529", size = 1681108, upload-time = "2025-09-18T18:54:43.856Z" }, - { url = "https://files.pythonhosted.org/packages/25/cf/ca36f463b03a4097767d2a1c1b72f31810e8c6384e9449dd9b925203783c/numcodecs-0.16.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:270e7a33ee96bdf5c957acf25a2487002a233811a125a155c400c2f036b69c73", size = 1165589, upload-time = "2025-09-18T18:54:44.954Z" }, - { url = "https://files.pythonhosted.org/packages/ed/ae/670260c3c4b5ed34a0674561355f3d4ce7fcbdf09a667e5bc841526d271c/numcodecs-0.16.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12f43fa4a347d1dba775c4506a1c9b15b90144c258433b81f79f1c1b1a990db5", size = 8316365, upload-time = "2025-09-18T18:54:46.073Z" }, - { url = "https://files.pythonhosted.org/packages/bb/fa/94e022419c751a60ff0f53642ebae5ef81ed3cc3640f958588e3ad3dc18d/numcodecs-0.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44869ef564a50aa545215c6a0d42ba5bbc34e9715523fb2336ada3d1fb2b331d", size = 8846228, upload-time = "2025-09-18T18:54:47.858Z" }, - { url = "https://files.pythonhosted.org/packages/71/60/f23733589f3e059bf8589508acd23ffeec230bdf179f138a54f5ab16e0a6/numcodecs-0.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:9aae6996172ba10c5f5111b2998709071b5aeba6b58b1ee0b26b61ed6aa7f2f4", size = 806260, upload-time = "2025-09-18T18:54:49.41Z" }, - { url = "https://files.pythonhosted.org/packages/3c/d5/d3536d06ac1e5fb848a3186958204082b68b106364c9a3669652dd786731/numcodecs-0.16.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:947406b01c20f2ce7ce2e631e7f21b782e8a9d4b57b374a41c9e7b1341a8f3a2", size = 1677129, upload-time = "2025-09-18T18:54:50.5Z" }, - { url = "https://files.pythonhosted.org/packages/e1/fd/b0513a3428dc2b38ec85eea771703ae69c49f09b9650d6c44c9105c80073/numcodecs-0.16.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7cf50e351398a34b45817974c411527629e88937b7683695e276afd65da6ed6f", size = 1159058, upload-time = "2025-09-18T18:54:51.675Z" }, - { url = "https://files.pythonhosted.org/packages/98/05/b7c127283cfb154a97abb284363825401b69302d71a28608af66f73257cc/numcodecs-0.16.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7938502fcc060ed9543814f38ca67048b33d7bd2667756e36e6b1060455b17e", size = 8260987, upload-time = "2025-09-18T18:54:52.883Z" }, - { url = "https://files.pythonhosted.org/packages/ff/46/320d960aff884bc63abaaf846ffa3de4803e83e8070b6f84c5688464839c/numcodecs-0.16.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:010d628c95be1214536fb22c0df4ced58da954b404b1fcb25ddebf64e4a3f7f3", size = 8805295, upload-time = "2025-09-18T18:54:54.698Z" }, - { url = "https://files.pythonhosted.org/packages/31/ae/acc2e0f1f49ba32afa2174578f170673139248ef86f77e334f2619133867/numcodecs-0.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:e83115e3c32de798c7b7164503e06aae9f9746c1cef564d029616eb44bd6cd90", size = 803204, upload-time = "2025-09-18T18:54:56.192Z" }, + { url = "https://files.pythonhosted.org/packages/ee/c4/ea91554c4fcbff66057f667690101d7a4b965605741350ac661b03fa6c46/nv_one_logger_core-2.3.1-py3-none-any.whl", hash = "sha256:0c8b77bcdac4daa1ea913bf8d4afd2a057bd5526e3654ac39f67caba157341a6", size = 63066, upload-time = "2025-10-29T21:11:52.753Z" }, ] -[package.optional-dependencies] -crc32c = [ - { name = "crc32c", marker = "python_full_version >= '3.11'" }, +[[package]] +name = "nv-one-logger-training-telemetry" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nv-one-logger-core" }, + { name = "strenum" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/21/016fa067967734d52f1ccf5a2a37a1a65216f2d7053bc2b85872cce956ca/nv_one_logger_training_telemetry-2.3.1.tar.gz", hash = "sha256:8c67940ea71799afaf1f46df3ba2f52f93aea26321c6f1c1d54aae02efc2a4af", size = 44435, upload-time = "2025-10-29T21:21:42.035Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/15/97e6e4ddfe5fc35bcee74a45b7c33fb73abb83713c7dfa26420b971a86c3/nv_one_logger_training_telemetry-2.3.1-py3-none-any.whl", hash = "sha256:5319443829b59378a498c3c62ac98973e14f31be675c229ff2b14e2fe109aa0b", size = 44140, upload-time = "2025-10-29T21:21:40.72Z" }, ] [[package]] -name = "numpy" -version = "1.26.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/94/ace0fdea5241a27d13543ee117cbc65868e82213fb31a8eb7fe9ff23f313/numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0", size = 20631468, upload-time = "2024-02-05T23:48:01.194Z" }, - { url = "https://files.pythonhosted.org/packages/20/f7/b24208eba89f9d1b58c1668bc6c8c4fd472b20c45573cb767f59d49fb0f6/numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a", size = 13966411, upload-time = "2024-02-05T23:48:29.038Z" }, - { url = "https://files.pythonhosted.org/packages/fc/a5/4beee6488160798683eed5bdb7eead455892c3b4e1f78d79d8d3f3b084ac/numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4", size = 14219016, upload-time = "2024-02-05T23:48:54.098Z" }, - { url = "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f", size = 18240889, upload-time = "2024-02-05T23:49:25.361Z" }, - { url = "https://files.pythonhosted.org/packages/24/03/6f229fe3187546435c4f6f89f6d26c129d4f5bed40552899fcf1f0bf9e50/numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a", size = 13876746, upload-time = "2024-02-05T23:49:51.983Z" }, - { url = "https://files.pythonhosted.org/packages/39/fe/39ada9b094f01f5a35486577c848fe274e374bbf8d8f472e1423a0bbd26d/numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2", size = 18078620, upload-time = "2024-02-05T23:50:22.515Z" }, - { url = "https://files.pythonhosted.org/packages/d5/ef/6ad11d51197aad206a9ad2286dc1aac6a378059e06e8cf22cd08ed4f20dc/numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07", size = 5972659, upload-time = "2024-02-05T23:50:35.834Z" }, - { url = "https://files.pythonhosted.org/packages/19/77/538f202862b9183f54108557bfda67e17603fc560c384559e769321c9d92/numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5", size = 15808905, upload-time = "2024-02-05T23:51:03.701Z" }, - { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, - { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, - { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, - { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, - { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, - { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, - { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, - { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, - { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, - { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, - { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, - { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, - { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, - { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, - { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, + { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, + { url = "https://files.pythonhosted.org/packages/70/61/7d7b3c70186fb651d0fbd35b01dbfc8e755f69fd58f817f3d0f642df20c3/nvidia_cublas_cu12-12.8.4.1-py3-none-win_amd64.whl", hash = "sha256:47e9b82132fa8d2b4944e708049229601448aaad7e6f296f630f2d1a32de35af", size = 567544208, upload-time = "2025-03-07T01:53:30.535Z" }, ] [[package]] -name = "nv-grouped-gemm" -version = "1.1.4.post6" +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, + { url = "https://files.pythonhosted.org/packages/41/bc/83f5426095d93694ae39fe1311431b5d5a9bb82e48bf0dd8e19be2765942/nvidia_cuda_cupti_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:bb479dcdf7e6d4f8b0b01b115260399bf34154a1a2e9fe11c85c517d87efd98e", size = 7015759, upload-time = "2025-03-07T01:51:11.355Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, + { url = "https://files.pythonhosted.org/packages/45/51/52a3d84baa2136cc8df15500ad731d74d3a1114d4c123e043cb608d4a32b/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:7a4b6b2904850fe78e0bd179c4b655c404d4bb799ef03ddc60804247099ae909", size = 73586838, upload-time = "2025-03-07T01:52:13.483Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, + { url = "https://files.pythonhosted.org/packages/30/a5/a515b7600ad361ea14bfa13fb4d6687abf500adc270f19e89849c0590492/nvidia_cuda_runtime_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:c0c6027f01505bfed6c3b21ec546f69c687689aad5f1a377554bc6ca4aa993a8", size = 944318, upload-time = "2025-03-07T01:51:01.794Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "absl-py" }, - { name = "numpy" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, + { url = "https://files.pythonhosted.org/packages/3d/90/0bd6e586701b3a890fd38aa71c387dab4883d619d6e5ad912ccbd05bfd67/nvidia_cudnn_cu12-9.10.2.21-py3-none-win_amd64.whl", hash = "sha256:c6288de7d63e6cf62988f0923f96dc339cea362decb1bf5b3141883392a7d65e", size = 692992268, upload-time = "2025-06-06T21:55:18.114Z" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/05/79/87c45f32e661b25e0aaa1e325ba166511f57be5dff8f0fcabc12d3e73b64/nv_grouped_gemm-1.1.4.post6.tar.gz", hash = "sha256:dad6115f4b4ff7ceb0bc40ad44e923c13a24fc88cfe1e20b1a6b4c9cf24c445c", size = 26508, upload-time = "2025-10-10T18:52:29.508Z" } [[package]] name = "nvidia-cudnn-frontend" -version = "1.15.0" +version = "1.16.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/cf/3cd3cc682df5488288c6043fc0977090497ff015a082ab160076fecb080a/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83ecbe6d1145dc208a9ae82aa0b45b2c8f74ed8a43d3a102a13eef2117e2fedd", size = 1835542, upload-time = "2025-11-07T01:28:20.133Z" }, + { url = "https://files.pythonhosted.org/packages/92/45/87f3f2d94a928be21459949b03b0b8bcea13531d30094ad84a8ae4fca761/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77cb06b91877c8489363867434ba1d9936f3e10bf7ed98d82e98f5f578611920", size = 1950339, upload-time = "2025-11-07T01:31:41.69Z" }, + { url = "https://files.pythonhosted.org/packages/be/f5/1662f18084ef4441bfb3a01383cbf77194905b53474dcb51c0d0f373c74b/nvidia_cudnn_frontend-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:ee3f3886f107919dad48cbc905fa6ae9207c8d7d5a24165e55625ea96f0fe40f", size = 1367883, upload-time = "2025-11-07T01:25:17.791Z" }, + { url = "https://files.pythonhosted.org/packages/10/b7/d0a3a337f5e83f26ff79a7fd63a859181ff2911f1d905d6fbab5fc80170d/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c360d5840d6eb597aade9e9c8780e24aec283b8e6bc97d52881c821a35c92aa9", size = 1837573, upload-time = "2025-11-07T01:29:05.507Z" }, + { url = "https://files.pythonhosted.org/packages/95/dc/465a14f2d235778405f2e84fce336d07ab045bf1c7df6404bdf8033e06a8/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c4a8fc573d85a86e08b15d9bf37f729e2487298781867a492a59cde6ac295e2", size = 1952630, upload-time = "2025-11-07T01:32:00.242Z" }, + { url = "https://files.pythonhosted.org/packages/3b/89/f14435f616603a999975930c4456d6140127f6acb19a877c752beccad837/nvidia_cudnn_frontend-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:a257f10a932ffde9741f644efd3611acf77e2fd89d493d81bc6a8353c48f1ec2", size = 1368775, upload-time = "2025-11-07T01:25:42.252Z" }, + { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" }, + { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" }, + { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" }, + { url = "https://files.pythonhosted.org/packages/32/2c/b4376afef0a6342c56e82e3465c1f8f5c719f588293a50dd04019a22ae6e/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6bcb3a2fbff80538958e21e2227520f082a961164865aaeedaac527f61084f9", size = 1839805, upload-time = "2025-11-07T01:30:31.056Z" }, + { url = "https://files.pythonhosted.org/packages/71/13/836b90354036154ab82db3861210e5736983fe1fc44bb39c146ad93b333b/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cbdad88b2bec5dde837f8fa7632022334cddb4756f923b5421c06a712cb59d31", size = 1953953, upload-time = "2025-11-07T01:33:03.781Z" }, + { url = "https://files.pythonhosted.org/packages/e5/30/3025f34f2c86ceef85134dc1f323f8cf2a26d3ffddc5ada48528c80bfae1/nvidia_cudnn_frontend-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:138de2bc4697fabb2eb2f0f601a7e31f8fe97874908e26e33d737276f335473c", size = 1368359, upload-time = "2025-11-07T01:26:51.561Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, + { url = "https://files.pythonhosted.org/packages/7d/ec/ce1629f1e478bb5ccd208986b5f9e0316a78538dd6ab1d0484f012f8e2a1/nvidia_cufft_cu12-11.3.3.83-py3-none-win_amd64.whl", hash = "sha256:7a64a98ef2a7c47f905aaf8931b69a3a43f27c55530c698bb2ed7c75c0b42cb7", size = 192216559, upload-time = "2025-03-07T01:53:57.106Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, + { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, + { url = "https://files.pythonhosted.org/packages/b9/75/70c05b2f3ed5be3bb30b7102b6eb78e100da4bbf6944fd6725c012831cab/nvidia_curand_cu12-10.3.9.90-py3-none-win_amd64.whl", hash = "sha256:f149a8ca457277da854f89cf282d6ef43176861926c7ac85b2a0fbd237c587ec", size = 62765309, upload-time = "2025-03-07T01:54:20.478Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/3f/d7bf811f4a76f4e9aa4ef390b11217562bba06f0c77f9e14c765681ccba6/nvidia_cudnn_frontend-1.15.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b4e8c77e848502ad79f8aef6b6c699613a6b5139572aba1f55f626d7bf31b44", size = 1743761, upload-time = "2025-10-10T18:54:15.142Z" }, - { url = "https://files.pythonhosted.org/packages/3e/b8/286f7fb3f1068acf0014a851f86863ed9fec69aff79a10dcc0dfbffe0523/nvidia_cudnn_frontend-1.15.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64a926602e52268e09127cf7a227e6b3d7c6e9e2a97fb57eebe88132aec8d9c8", size = 1859188, upload-time = "2025-10-10T18:56:59.386Z" }, - { url = "https://files.pythonhosted.org/packages/e8/f7/6e55b0122ca5924f0cdbd717392d35a92f43c6ed4b6d64c7d378ee01f301/nvidia_cudnn_frontend-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:7a21ec041fa4009cc8b76b2d26ad73010ab5e005804e4df8b1c1abdba5e23cd5", size = 1296575, upload-time = "2025-10-10T18:45:45.04Z" }, - { url = "https://files.pythonhosted.org/packages/80/b8/d0f1ab5c309c513fe1e4235e860872fc7ee60876e69b30eb0a20fe8c35d8/nvidia_cudnn_frontend-1.15.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:570c2e028ff9b8293f9625b31484084a638de6fb685802194b8dfe16db5a44b4", size = 1747611, upload-time = "2025-10-10T18:54:51.427Z" }, - { url = "https://files.pythonhosted.org/packages/0e/52/5b77edb810063c10040ac34e1517ee62690c4f030f0cf68298a4608552bc/nvidia_cudnn_frontend-1.15.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21ac16e4add264839a8db570d5378bb6583bf9539649d80bc8802ded00098a20", size = 1860815, upload-time = "2025-10-10T18:57:17.393Z" }, - { url = "https://files.pythonhosted.org/packages/de/2b/1fa26eee0479ae0b40582679c1bd08eb78a0b49bb5893ec3edce2a606e9f/nvidia_cudnn_frontend-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:c1be7480e3200606c2f2f49263cc13adc72c2a38e38f31f18e9b3727d99618b2", size = 1297355, upload-time = "2025-10-10T18:46:10.171Z" }, - { url = "https://files.pythonhosted.org/packages/cb/9c/0c2340454f8c9cc4143fdbccef8218dad1e49042d62b26c1781915617c40/nvidia_cudnn_frontend-1.15.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c2cfe2a0f94bff71614bd3add0ae077f513f7d14909c223afca01ac8056ff84", size = 1749017, upload-time = "2025-10-10T18:55:29.412Z" }, - { url = "https://files.pythonhosted.org/packages/19/b4/c35104b8fc32986111b611b3080bbcf35fd3fd6794d4aec4e068136ea628/nvidia_cudnn_frontend-1.15.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aab1098ad4c79935b6e8dc251e9145129a04a8dc6ff75eb30871aacdd1487946", size = 1865629, upload-time = "2025-10-10T18:57:35.941Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d7/6534807d209a27817d101cf86745e335896e96379bf2d207195cfe9f24ab/nvidia_cudnn_frontend-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:13e58a5b001154899f0744165716a7ad24cd7567d759a8229a9ada730a1046b2", size = 1297335, upload-time = "2025-10-10T18:46:35.069Z" }, - { url = "https://files.pythonhosted.org/packages/9b/75/5a75942aae2bb3a0c1cc44378e9f80c1213a6d7b952c8df19b8845836a34/nvidia_cudnn_frontend-1.15.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fda240405eba3c04866e30b3c1beae26ea7775af4fa4d555cd598695067d32ac", size = 1750048, upload-time = "2025-10-10T18:56:06.057Z" }, - { url = "https://files.pythonhosted.org/packages/79/70/2ed9802725cb305189dac906a67c799eeb47e4f395b97df0249a750c56fe/nvidia_cudnn_frontend-1.15.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14941c05a6484d3f05f3089cd290c9b1e6614298f37e07cd01789933932c9f28", size = 1867440, upload-time = "2025-10-10T18:57:53.964Z" }, - { url = "https://files.pythonhosted.org/packages/d1/04/519fd6e3ea12fe7fe98c497c4d51f6c5c87763d02e90ea3102cef32a6ef1/nvidia_cudnn_frontend-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:7c8c6f12534b73b0cd55956c5e9419b7840a01e4c260837606112450ce1ca0d9", size = 1297324, upload-time = "2025-10-10T18:46:53.104Z" }, + { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, + { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, + { url = "https://files.pythonhosted.org/packages/13/c0/76ca8551b8a84146ffa189fec81c26d04adba4bc0dbe09cd6e6fd9b7de04/nvidia_cusolver_cu12-11.7.3.90-py3-none-win_amd64.whl", hash = "sha256:4a550db115fcabc4d495eb7d39ac8b58d4ab5d8e63274d3754df1c0ad6a22d34", size = 256720438, upload-time = "2025-03-07T01:54:39.898Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, + { url = "https://files.pythonhosted.org/packages/62/07/f3b2ad63f8e3d257a599f422ae34eb565e70c41031aecefa3d18b62cabd1/nvidia_cusparse_cu12-12.5.8.93-py3-none-win_amd64.whl", hash = "sha256:9a33604331cb2cac199f2e7f5104dfbb8a5a898c367a53dfda9ff2acb6b6b4dd", size = 284937404, upload-time = "2025-03-07T01:55:07.742Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, + { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, + { url = "https://files.pythonhosted.org/packages/2f/d8/a6b0d0d0c2435e9310f3e2bb0d9c9dd4c33daef86aa5f30b3681defd37ea/nvidia_cusparselt_cu12-0.7.1-py3-none-win_amd64.whl", hash = "sha256:f67fbb5831940ec829c9117b7f33807db9f9678dc2a617fbe781cac17b4e1075", size = 271020911, upload-time = "2025-02-26T00:14:47.204Z" }, ] [[package]] name = "nvidia-cutlass-dsl" -version = "4.2.1" +version = "4.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cuda-python" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/0f/1e96ce9fbe07e8c39484fae4d2cf36e328bdf434b311d88ccedccbfed7db/nvidia_cutlass_dsl-4.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1628bacedde042c60c7ebb1aeccce5a82501197f5e5c4fbbf803712fa45fba59", size = 58540319, upload-time = "2025-09-23T14:38:00.634Z" }, - { url = "https://files.pythonhosted.org/packages/7c/e3/bc6071743d0ad43d837bf633139bfe1202260c28d893e30f247cf0aa8019/nvidia_cutlass_dsl-4.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aec74b50f700a8ef455f15863de4cb5f1486f72b7bd4becea88624c58c555a13", size = 62233601, upload-time = "2025-09-23T14:39:50.44Z" }, - { url = "https://files.pythonhosted.org/packages/1d/2a/e65312728338e5bb00b592ce0be12b51e7594a3ef288cd8c99bc1c456968/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:04e605417773957405cad0ac6c2d46139a88aca07a783b4f66e1363f3a91a835", size = 58540069, upload-time = "2025-09-23T14:38:56.002Z" }, - { url = "https://files.pythonhosted.org/packages/be/f3/20eacdf9876abd892668c191003edc5d7100e45fabfa027d9f3f99d21871/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:874aa3620b3d3dc6598af2226fa3b78f2e7998b8656929b492259e0c9f778786", size = 62233009, upload-time = "2025-09-23T14:39:23.308Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1d/f168a3dbd8570e5dbbe0deca217d7b374c977b4a4970ebadf3b6d0f1174f/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:10ace6e2005cb0bc04d158c7660f8ec104ab29aeffb26f1ed3bb0b5a577ccc34", size = 58535504, upload-time = "2025-09-23T14:38:29.028Z" }, - { url = "https://files.pythonhosted.org/packages/02/ab/5bcc0c8c620af5d4acbc71abce10e3eb3023e50342e6bc29b6461f72530e/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d7ddc9c1f5bb803718d736c907fac857fc606f1fce630c0b1d741935a72723b9", size = 62230361, upload-time = "2025-09-23T14:40:18.156Z" }, - { url = "https://files.pythonhosted.org/packages/cf/d5/9b79faaec3fa12c52b7de1e727af94c54184b00f280c79b667ab045550db/nvidia_cutlass_dsl-4.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c0985124a74ba435e1f756aa78e89f64c6d01e4f54de1d5a5d218ebbc1c92eff", size = 58535424, upload-time = "2025-09-23T14:37:33.064Z" }, - { url = "https://files.pythonhosted.org/packages/43/86/78c8cd3fa1a684f3976535d7ac69e54f4ede165b5abca7979fd0820f74f2/nvidia_cutlass_dsl-4.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9356604afc8f62aac46634b3a12baf8cb3f3a6f2e44e398dcfe6ec98ff1a8d1b", size = 62230122, upload-time = "2025-09-23T14:40:46.621Z" }, + { url = "https://files.pythonhosted.org/packages/75/c3/3cd4c440f386a24c348c7c67adff5e38bb2405d08579ae3ac9312fa14ee4/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:29d6ccb56955e6528c818591fe752a820305951a73fbb69f9a816b3e228d57f8", size = 58726035, upload-time = "2025-11-28T00:59:03.749Z" }, + { url = "https://files.pythonhosted.org/packages/35/b5/854b713e2355e6211624dfc9df65aca5ebc2a8aaae97a696def34a4b9c9a/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f54d98339d4fca37d39390933186c4a7987291b57129da9bf45c7746d47786af", size = 58591793, upload-time = "2025-11-28T01:03:01.473Z" }, + { url = "https://files.pythonhosted.org/packages/45/24/432ab11c9da47742518e008f61c58166b3cced5d39df987155d103d5e18e/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c7b27b3faf2d3cb4e9504ad55129ac58c09aa59f3af6eaabb88f4bda010a2792", size = 58725123, upload-time = "2025-11-28T00:58:11.337Z" }, + { url = "https://files.pythonhosted.org/packages/a2/07/59509304cac496275a0a7bdae436c267829611b38e4500b2622424c9f737/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:24cfbf55aad55b3dd06ddaa340d13028b4e49b15e0e557105187a9d0bbc260db", size = 58592193, upload-time = "2025-11-28T00:59:54.448Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c5/f1586c64fcf569b890da776d08a32836a3ef2450cbe9e3ac2971dbecbcce/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:025a8c7a0fb80626e2a893954ea19b2e1ece8d131078c7da12b7fabc2634d04d", size = 58726236, upload-time = "2025-11-28T00:59:29.376Z" }, + { url = "https://files.pythonhosted.org/packages/dc/5b/fe6a2db1688a690a94f8ad03706fa6db2055d82fab0c4fab764e8c89640f/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b95ce5633e09f12c8d1fcd30c5db06b8325d41b3da0875d3e8a4c110ed5b5cdf", size = 58591826, upload-time = "2025-11-28T01:00:19.559Z" }, + { url = "https://files.pythonhosted.org/packages/40/fe/5e48c63ff5a510c0edbac5167921a819c70f71daf3b6ead0e0e5346b2a42/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c8e816cc061b34e016906fa87948f2b0fa836a95f27732c14097f3ddda8286e2", size = 58725695, upload-time = "2025-11-28T01:01:32.1Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ef/34b1bdd375226b818cd810145e207cceb50fd12eaa87e88a6e67820574d4/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f71adcfb56607fc86ea621edcf9503eaa31f66f70efd7ab719c33683db082183", size = 58592065, upload-time = "2025-11-28T01:02:35.83Z" }, ] [[package]] name = "nvidia-mathdx" -version = "25.1.1" +version = "25.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/59/00/f1a73ac224d466b31b6eb09794656112e896185678720b05668777e87db3/nvidia_mathdx-25.1.1-py3-none-any.whl", hash = "sha256:4fb948fe4842d24e679f3d0c140c8a0e8e24c3c7ae5eb6e08584253ad94a198b", size = 39894902, upload-time = "2025-05-06T22:58:32.29Z" }, + { url = "https://files.pythonhosted.org/packages/20/1a/a418b8c1adc58abd87fd69414c19883af5c1b10514e3dbfcc27cde831b13/nvidia_mathdx-25.6.0-py3-none-any.whl", hash = "sha256:22e6ad5d0d005f836be5cbd14e836cf2e9ea42c82deb602707246ce8198eaa96", size = 23013087, upload-time = "2025-11-13T18:25:11.228Z" }, ] [[package]] @@ -3315,13 +3281,13 @@ wheels = [ [[package]] name = "nvidia-modelopt" -version = "0.33.1" +version = "0.39.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ninja" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "nvidia-ml-py" }, - { name = "nvidia-modelopt-core" }, { name = "packaging" }, { name = "pulp" }, { name = "pydantic" }, @@ -3332,52 +3298,76 @@ dependencies = [ { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "torch", marker = "sys_platform == 'never'" }, { name = "torchprofile" }, - { name = "torchvision", marker = "sys_platform == 'never'" }, { name = "tqdm" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/ca/cb/4af39357792a96f334c7877ea0380c9337aec210ff4794a7dd95beb7c349/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6c51091683a117cd40fdb96a0ec28579f2276f6b627db7ccddc370df544e1dd7", size = 751683, upload-time = "2025-08-12T18:37:48.832Z" }, - { url = "https://files.pythonhosted.org/packages/0a/b1/fc2f468d140ef58e90fac584759d0cc449db9bc4f64668cdff750ef38fef/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ef78a98901890f265596ec413dffac177d4a1865201d89a14f29f4fa0cf8e710", size = 751683, upload-time = "2025-08-12T18:36:59.964Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d5/b03ad3ffa28984b629a72da678fa98f912fc45bac3b514c4a70cf2a82fe3/nvidia_modelopt-0.39.0-py3-none-any.whl", hash = "sha256:32f05317c81be1ff2ffeab749e5258b7bea8e4c6e60a09c760584f25ad03f648", size = 864981, upload-time = "2025-11-13T07:35:42.761Z" }, ] [[package]] -name = "nvidia-modelopt-core" -version = "0.33.1" +name = "nvidia-nccl-cu12" +version = "2.27.5" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/21/d12ca11f5554340684d11958aae6c6e7755cf0aaae10a2d2c9db217228cf/nvidia_modelopt_core-0.33.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:f25f6a817609c693ee39d1bcf2d3aeef462b9769f971590133de8b1b0310885b", size = 1307716, upload-time = "2025-08-12T18:41:12.086Z" }, - { url = "https://files.pythonhosted.org/packages/eb/df/7bead24d4854274d9f2818f1ae780fc24260aab60b7b6f73e1af4f056ce5/nvidia_modelopt_core-0.33.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:195f32f06d19bc9f9d858811f1864bddcc1db6278974d98ea6309cb3553427f1", size = 1326896, upload-time = "2025-08-12T18:39:48.243Z" }, - { url = "https://files.pythonhosted.org/packages/a1/36/3318980c670292d827ace5ac6110ab6054d0f2d87e507382842ea9e7c78f/nvidia_modelopt_core-0.33.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ffd008a90d8867660ae41c98002156b526e368a4cdf39e225fe20f478adce8b2", size = 1376104, upload-time = "2025-08-12T18:41:47.358Z" }, - { url = "https://files.pythonhosted.org/packages/27/97/99d1ddabe01ab262c18621619c996e1c2c119bc058607d2bc9ce7eb85fe7/nvidia_modelopt_core-0.33.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be49121b2f74db4cb73955396a7bb83935d92232c5a20bcfd7b8e7cae68e482f", size = 1393729, upload-time = "2025-08-12T18:40:07.86Z" }, - { url = "https://files.pythonhosted.org/packages/9b/b5/ba79b1c52b634b24e45dca409f133f947217a5c7ec5c256266e4ec5fa3eb/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1ddd9279d8312f8e972b302692a26e6180f1c9fd277232f5925a5589f42b1b76", size = 1338081, upload-time = "2025-08-12T18:40:36.156Z" }, - { url = "https://files.pythonhosted.org/packages/13/40/4427583475dfd8eb1b8c7522d75d4d059f0512ff03dcc62d6986a22ab918/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:69d5ace564f2b056c916117be2023f2b7fc01cd1501073915e6b2ced2b8a5394", size = 1363366, upload-time = "2025-08-12T18:39:28.854Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d7/34f02dad2e30c31b10a51f6b04e025e5dd60e5f936af9045a9b858a05383/nvidia_nvjitlink_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:bd93fbeeee850917903583587f4fc3a4eafa022e34572251368238ab5e6bd67f", size = 268553710, upload-time = "2025-03-07T01:56:24.13Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.3.20" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/9d/3dd98852568fb845ec1f7902c90a22b240fe1cbabda411ccedf2fd737b7b/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0", size = 124484616, upload-time = "2025-08-04T20:24:59.172Z" }, + { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, + { url = "https://files.pythonhosted.org/packages/9f/99/4c9c0c329bf9fc125008c3b54c7c94c0023518d06fc025ae36431375e1fe/nvidia_nvtx_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:619c8304aedc69f02ea82dd244541a83c3d9d40993381b3b590f1adaed3db41e", size = 56492, upload-time = "2025-03-07T01:52:24.69Z" }, ] [[package]] name = "nvidia-resiliency-ext" -version = "0.4.1" +version = "0.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "defusedxml" }, + { name = "nv-one-logger-core" }, + { name = "nv-one-logger-training-telemetry" }, { name = "nvidia-ml-py" }, { name = "packaging" }, { name = "psutil" }, - { name = "pynvml" }, { name = "pyyaml" }, { name = "torch", marker = "sys_platform == 'never'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/8c/6547d9fdea9730d4f69a19ca492ccbe221768f8473b82502a78a824acc3d/nvidia_resiliency_ext-0.4.1-cp310-cp310-manylinux_2_31_aarch64.whl", hash = "sha256:cf80599411018ebbf03da64769527dee6b37746b72b8606f919b7999633770b8", size = 442891, upload-time = "2025-07-17T03:53:38.878Z" }, - { url = "https://files.pythonhosted.org/packages/34/0d/520cab980949ad11bd5291784fea309bcd6654a9c97943a3a87644c1d111/nvidia_resiliency_ext-0.4.1-cp310-cp310-manylinux_2_31_x86_64.whl", hash = "sha256:0c23e621d598ba436549db83deeb3569c19df0194b89fe6169d62b6ead711be3", size = 448044, upload-time = "2025-07-17T03:48:30.851Z" }, - { url = "https://files.pythonhosted.org/packages/46/77/8cda264b262e2868a4e6ebcddaea112200b1e34b8d5a35a2fe3b4978d137/nvidia_resiliency_ext-0.4.1-cp311-cp311-manylinux_2_31_aarch64.whl", hash = "sha256:d8ca454a8b8abef72e0ff0e33914686c263414e8891471c02a9f6af9d2d6b925", size = 443649, upload-time = "2025-07-17T03:49:16.183Z" }, - { url = "https://files.pythonhosted.org/packages/3a/53/029cc7493b5833cb8dfa201f15a1e422e2e1cc6308d34c5b0a90028a73fd/nvidia_resiliency_ext-0.4.1-cp311-cp311-manylinux_2_31_x86_64.whl", hash = "sha256:dde6034f29350ac6326cdd861ceec641bdd93be0eddbf034739f4cd9452a4dd9", size = 449189, upload-time = "2025-07-17T03:52:15.24Z" }, - { url = "https://files.pythonhosted.org/packages/70/05/38d491962273c7905708762279f440520eb79f3c00b67a023497215ad023/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_aarch64.whl", hash = "sha256:b3bd5f01535574b16d0f38bca6e39afe3806c4a2896eee1b321cd944e00025a7", size = 444570, upload-time = "2025-07-17T03:50:58.877Z" }, - { url = "https://files.pythonhosted.org/packages/18/8b/4cb8aa2bbdf3705d3034c3f3dacdadb03b3b7dd3dc7f5200e64663fb477f/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_x86_64.whl", hash = "sha256:ca9f8de465af345952bedbea53c90c0e2323d88cfd830ded0e806fad91845c0e", size = 450280, upload-time = "2025-07-17T03:49:55.327Z" }, + { url = "https://files.pythonhosted.org/packages/df/18/1898cad3bdd643c6bfa5f7aee125a5ef308ab1701ab15106e3e9c66bb416/nvidia_resiliency_ext-0.5.0-cp310-cp310-manylinux_2_39_aarch64.whl", hash = "sha256:97d4b68d3949f3b8370addb474d8662d6ac5008c3c1296420cdeb93a88d6a804", size = 402915, upload-time = "2025-11-13T21:28:34.578Z" }, + { url = "https://files.pythonhosted.org/packages/fa/48/10fc3f278898e3b2aacc3bea65f0ac4b579e6e0e8447b467742d75adeec1/nvidia_resiliency_ext-0.5.0-cp310-cp310-manylinux_2_39_x86_64.whl", hash = "sha256:ceb04ec5a7bc9301fd6f14449bda6b0d1f37ead4fbe37aa3bf1d7b2ad5b662d4", size = 406483, upload-time = "2025-11-13T21:28:58.732Z" }, + { url = "https://files.pythonhosted.org/packages/14/17/c19dfed8d4aced307a1c1404f0917ee6c1b319db8092b3cfe2af4e76de6d/nvidia_resiliency_ext-0.5.0-cp311-cp311-manylinux_2_39_aarch64.whl", hash = "sha256:62d396356adcf898cb86a54956eeece29017a41b5872db0b364c8449d23f2f66", size = 404062, upload-time = "2025-11-13T21:29:46.873Z" }, + { url = "https://files.pythonhosted.org/packages/7f/99/b4324595171c3cdffb03cef070006ab9a3de7fca90a22403576ec6423b69/nvidia_resiliency_ext-0.5.0-cp311-cp311-manylinux_2_39_x86_64.whl", hash = "sha256:c4fcd006ef69300f753bb30d17efbb6bcee6699f044e3532209b2825d22e9977", size = 407027, upload-time = "2025-11-13T21:30:09.124Z" }, + { url = "https://files.pythonhosted.org/packages/8c/73/232d9f25558f3c6165ff1d15c980a434b47c13e8f527f999cd265859abcf/nvidia_resiliency_ext-0.5.0-cp312-cp312-manylinux_2_39_aarch64.whl", hash = "sha256:81e3d827885e90bed369e67f76dda6709dd4073c2e5fa1228df85d6987cee495", size = 403317, upload-time = "2025-11-13T21:31:24.603Z" }, + { url = "https://files.pythonhosted.org/packages/44/89/4d7f39416aa3be72ee9f1260a7af56af40f2570f5add1e039d96279a8764/nvidia_resiliency_ext-0.5.0-cp312-cp312-manylinux_2_39_x86_64.whl", hash = "sha256:eb720cd25feabef07f971d4051c7bcac2f9ec73642a9031953d2663307950cb9", size = 407963, upload-time = "2025-11-13T21:30:28.998Z" }, ] [[package]] name = "nvidia-sphinx-theme" -version = "0.0.8" +version = "0.0.9.post1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydata-sphinx-theme" }, @@ -3385,27 +3375,26 @@ dependencies = [ { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/74/996dbc314da8ed670cd5e040d0b4b5be79ff1fc3db3fe25e63134deebe9a/nvidia_sphinx_theme-0.0.8-py3-none-any.whl", hash = "sha256:18f117aa154a3a156251a75647279c541464f3e75f7df2ae283e720cc7d0bc2c", size = 140678, upload-time = "2025-03-24T21:56:25.621Z" }, + { url = "https://files.pythonhosted.org/packages/8c/79/017fab2f7167a9a9795665f894d04f77aafceca80821b51589bb4b23ff5c/nvidia_sphinx_theme-0.0.9.post1-py3-none-any.whl", hash = "sha256:21ca60206dff2f380d7783d64bbaf71a5b9cacae53c7d0686f089c16b5a3d45a", size = 143816, upload-time = "2025-11-09T23:16:55.719Z" }, ] [[package]] name = "nvtx" -version = "0.2.13" +version = "0.2.14" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/97/02/b3fd3da4ba51764cfc0e4d2b22d5a61511fa79d825344d4704f8429c0bd6/nvtx-0.2.13.tar.gz", hash = "sha256:9db7ba135168e14e1f038866100bf8ed42d3e00b404e9bc7b6280ee3af828b92", size = 112104, upload-time = "2025-08-05T03:27:16.383Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/64/d27e344632116da937100a81054c88b0fd6a259de09d6778e03e8231216b/nvtx-0.2.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:462bdcc65a12b53bfa3e7df564ddfb72092a030a923dccd1cf88c4b771ecae3f", size = 470534, upload-time = "2025-08-04T19:36:19.389Z" }, - { url = "https://files.pythonhosted.org/packages/34/15/0b56e9b3020613d7d167bc4cdee3ba8686f6320c6aa62e85ed17b54c4dcb/nvtx-0.2.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7874534af889ab7c2c63554c73119d193d2beb7671b551b7f43de5b97ceb5971", size = 474158, upload-time = "2025-08-04T19:39:39.801Z" }, - { url = "https://files.pythonhosted.org/packages/2b/be/e00ab0d21f4fb46ad66b0eae89d9e9f7d53af65a37c3db2414a590e05e97/nvtx-0.2.13-cp310-cp310-win_amd64.whl", hash = "sha256:4f26d04b5ea5b96096941cb9a7115a73454e9e9d5c247bfcd34ec584559cf9dd", size = 99104, upload-time = "2025-08-04T19:24:01.775Z" }, - { url = "https://files.pythonhosted.org/packages/22/02/f74e26cedbdb136440d1234a646cedfddf9a43d19586e1ee466d6275e6b6/nvtx-0.2.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ad794a0c046ef268b2fb3b6812a35bb3bce5cd19207d164689943f0031ac45f", size = 522330, upload-time = "2025-08-04T19:34:49.075Z" }, - { url = "https://files.pythonhosted.org/packages/1d/55/e1e43201959dd854005c72b8a13ec86b775c349cdcb1d23423d841bbad58/nvtx-0.2.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5640ca4b8be2c19a8fc4ca8403d3c2598165ea27541940b4897138a7b0a717fe", size = 522841, upload-time = "2025-08-04T19:38:27.819Z" }, - { url = "https://files.pythonhosted.org/packages/a9/8c/89d1f499a4880e30e0b5bdf429cbd1d8c612d09c49c13016384ce9cd156d/nvtx-0.2.13-cp311-cp311-win_amd64.whl", hash = "sha256:be6d53143cb2bd44e04aecdb7f3b34b48ded96f3673ae41362239d9f54bcfe27", size = 99106, upload-time = "2025-08-04T19:22:49.181Z" }, - { url = "https://files.pythonhosted.org/packages/c5/73/ad21e09dc2534f1e9723bbe5871fa5f03361ac51ca4d411fea6f765b5b6a/nvtx-0.2.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3435cbbffa132f6aaba3abdb01e71a1b961a20858b4cb791883895a25b9305d6", size = 539358, upload-time = "2025-08-04T19:33:16.494Z" }, - { url = "https://files.pythonhosted.org/packages/12/ab/762da984e7671f7c34ae87e5b70523c3eeb4563759268bfaea07c97f32a6/nvtx-0.2.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453d838dd1424a04303281ee57a73e2b8dca0e03039bc609a945861b8fe7d7d9", size = 545588, upload-time = "2025-08-04T19:37:40.64Z" }, - { url = "https://files.pythonhosted.org/packages/2a/b6/55bc5916386db70b93cbf543b1e880ead786d9ff0cdcfa262f5a2af46c74/nvtx-0.2.13-cp312-cp312-win_amd64.whl", hash = "sha256:0722d743e0e41e1fb866ebe6446e0cd0d268ca8671313f8da4f8c969956b74d3", size = 99123, upload-time = "2025-08-04T19:24:24.391Z" }, - { url = "https://files.pythonhosted.org/packages/41/73/98c0669d5f9387a36d56b0e62ea3919124dd8dd7582d896ed1cae2998f57/nvtx-0.2.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1561d2111c698b1b1075899ff9c3fa7ba83603fc27c2e8ef567de6bbbe85ce1", size = 519840, upload-time = "2025-08-04T19:34:00.877Z" }, - { url = "https://files.pythonhosted.org/packages/14/4b/21e975997def8a387543ba2bbe227551ad466781c39fc67f37f53555f37e/nvtx-0.2.13-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd7b729ed0211350258a21dd13422f59bc521de2b2fd21feb6c177af492f4e1", size = 524711, upload-time = "2025-08-04T19:38:03.559Z" }, - { url = "https://files.pythonhosted.org/packages/21/d7/0ca146afd875f1e02636323840960071f768b5d8ba3e7d37f2ac9192bfd9/nvtx-0.2.13-cp313-cp313-win_amd64.whl", hash = "sha256:f0524bb71443d5a1f19a6409a9a81405fc437e53c5edfc4c44b6f4504ccf46e3", size = 97317, upload-time = "2025-08-04T19:24:46.391Z" }, + { url = "https://files.pythonhosted.org/packages/ed/ca/fa76ea4985fd8f3d8c437bffec2580b1cac7f2401671089ac842610ae466/nvtx-0.2.14-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b70b2415ab97edf19514be226d5058898922c6b6bb1d7fdd5ef92d1e086f3e0f", size = 695204, upload-time = "2025-11-27T17:28:52.688Z" }, + { url = "https://files.pythonhosted.org/packages/b9/1f/0aa62d52062d700dbed36dd2ebfddf5133c72180d448cce66545e5ccbe5d/nvtx-0.2.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23ab874f9c70e5433f39e40ca318ffcfc14fb43ed6798e6be5a30f74e4ca831f", size = 686698, upload-time = "2025-11-27T17:23:19.335Z" }, + { url = "https://files.pythonhosted.org/packages/18/c9/a12d48157221a8e939f3f7ec8f8a543e232fb9248820afb164ff9eb3eaa7/nvtx-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:3a22be895546ca609e83e54614b56739200ab6f4d13e15f5685544082b1b7908", size = 119654, upload-time = "2025-11-27T17:32:08.536Z" }, + { url = "https://files.pythonhosted.org/packages/87/a6/4d473abd7c07a6d1060c0f708e21ddf46a960258532ffc897681db5c0f46/nvtx-0.2.14-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:227f6406d2fe1a4b890be17eb1f4c1f5bd4df8f7032dd1cb8c7651d379f35541", size = 732764, upload-time = "2025-11-27T17:26:21.853Z" }, + { url = "https://files.pythonhosted.org/packages/94/06/3ab72e5a463af1b95934638cb8377e99f58e5ef21a47cbf69b92267d6602/nvtx-0.2.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0664aa75b24e2ad0abdd0fa52c49e9c8a120652f2194289c85dc2d93cbc6017f", size = 724555, upload-time = "2025-11-27T17:22:36.402Z" }, + { url = "https://files.pythonhosted.org/packages/18/1d/64f6078a5ab4134af91ba294035ee1ebb3512edaaa9d60d8f0f023178620/nvtx-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:10f5971661d61c1a90cd36c3069240452c904ecec4b3a08d0d6fdba1e5398165", size = 119660, upload-time = "2025-11-27T17:32:30.406Z" }, + { url = "https://files.pythonhosted.org/packages/8a/de/2cc15bb805b1b18317b60837b853ed023757730d0db82de291635fc88bc3/nvtx-0.2.14-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ece46f555e725db879df06549980744f89db5923a77e6f7a5aecda75292421a", size = 727708, upload-time = "2025-11-27T17:25:20.836Z" }, + { url = "https://files.pythonhosted.org/packages/81/94/b37d634fef8677ce525b5bfd2886737ea2c064bc3576fc84423973ff5b97/nvtx-0.2.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17efe5d903996bceb0c8a12cae80fa9b66bee7ee895923bd9d8ec2a5af1aabd8", size = 737691, upload-time = "2025-11-27T17:21:27.87Z" }, + { url = "https://files.pythonhosted.org/packages/ad/c1/f633aa32003050ff83626a19402f03c83990a15b4df658a7bf1b590ee83e/nvtx-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:f40db4746714d525d3020c702a0df866c2335efd6a27c41e869e577402a53a4b", size = 119193, upload-time = "2025-11-27T17:31:42.943Z" }, + { url = "https://files.pythonhosted.org/packages/04/a3/603ecdfd5cd97feee59c7e51da4929e22eac8dbe68ac78df53e74152813f/nvtx-0.2.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8cd1f2b464675b4d3c2036b7bbaf975baa9307f0795107dc69c556c0c8d191d", size = 710057, upload-time = "2025-11-27T17:28:08.127Z" }, + { url = "https://files.pythonhosted.org/packages/97/29/945dd440e6bd459e6064f321ed425dbae7d03d39ffa97a38e5434fbcda27/nvtx-0.2.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6532556d81f782e24eb12c5e0c75e297493d6ab0431177c93c12bb29c523ea9e", size = 717825, upload-time = "2025-11-27T17:22:57.556Z" }, + { url = "https://files.pythonhosted.org/packages/16/3e/5d7872f2a0809237e3d524f81a7a3c7fbeb98bdc9dcec4723b75a45cd552/nvtx-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:cd86f78ed56aede301b03e5ab8cb1aaeb8ba0b5ed683f98f87fbe474996d73f2", size = 118546, upload-time = "2025-11-27T17:30:32.549Z" }, ] [[package]] @@ -3423,141 +3412,75 @@ wheels = [ [[package]] name = "onnx" -version = "1.19.0" +version = "1.19.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "protobuf" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5b/bf/b0a63ee9f3759dcd177b28c6f2cb22f2aecc6d9b3efecaabc298883caa5f/onnx-1.19.0.tar.gz", hash = "sha256:aa3f70b60f54a29015e41639298ace06adf1dd6b023b9b30f1bca91bb0db9473", size = 11949859, upload-time = "2025-08-27T02:34:27.107Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/b3/8a6f3b05d18dffdc7c18839bd829587c826c8513f4bdbe21ddf37dacce50/onnx-1.19.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e927d745939d590f164e43c5aec7338c5a75855a15130ee795f492fc3a0fa565", size = 18310869, upload-time = "2025-08-27T02:32:47.346Z" }, - { url = "https://files.pythonhosted.org/packages/b9/92/550d6155ab3f2c00e95add1726397c95b4b79d6eb4928d049ff591ad4c84/onnx-1.19.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c6cdcb237c5c4202463bac50417c5a7f7092997a8469e8b7ffcd09f51de0f4a9", size = 18028144, upload-time = "2025-08-27T02:32:50.306Z" }, - { url = "https://files.pythonhosted.org/packages/79/21/9bcc715ea6d9aab3f6c583bfc59504a14777e39e0591030e7345f4e40315/onnx-1.19.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed0b85a33deacb65baffe6ca4ce91adf2bb906fa2dee3856c3c94e163d2eb563", size = 18200923, upload-time = "2025-08-27T02:32:54.325Z" }, - { url = "https://files.pythonhosted.org/packages/c8/90/3a6f0741ff22270e2f4b741f440ab68ba5525ebc94775cd6f2c01f531374/onnx-1.19.0-cp310-cp310-win32.whl", hash = "sha256:89a9cefe75547aec14a796352c2243e36793bbbcb642d8897118595ab0c2395b", size = 16332097, upload-time = "2025-08-27T02:32:56.997Z" }, - { url = "https://files.pythonhosted.org/packages/4c/4c/ef61d359865712803d488672607023d36bfcd21fa008d8dc1d6ee8e8b23c/onnx-1.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:a16a82bfdf4738691c0a6eda5293928645ab8b180ab033df84080817660b5e66", size = 16451402, upload-time = "2025-08-27T02:33:00.534Z" }, - { url = "https://files.pythonhosted.org/packages/db/5c/b959b17608cfb6ccf6359b39fe56a5b0b7d965b3d6e6a3c0add90812c36e/onnx-1.19.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:206f00c47b85b5c7af79671e3307147407991a17994c26974565aadc9e96e4e4", size = 18312580, upload-time = "2025-08-27T02:33:03.081Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ee/ac052bbbc832abe0debb784c2c57f9582444fb5f51d63c2967fd04432444/onnx-1.19.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4d7bee94abaac28988b50da675ae99ef8dd3ce16210d591fbd0b214a5930beb3", size = 18029165, upload-time = "2025-08-27T02:33:05.771Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c9/8687ba0948d46fd61b04e3952af9237883bbf8f16d716e7ed27e688d73b8/onnx-1.19.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7730b96b68c0c354bbc7857961bb4909b9aaa171360a8e3708d0a4c749aaadeb", size = 18202125, upload-time = "2025-08-27T02:33:09.325Z" }, - { url = "https://files.pythonhosted.org/packages/e2/16/6249c013e81bd689f46f96c7236d7677f1af5dd9ef22746716b48f10e506/onnx-1.19.0-cp311-cp311-win32.whl", hash = "sha256:7cb7a3ad8059d1a0dfdc5e0a98f71837d82002e441f112825403b137227c2c97", size = 16332738, upload-time = "2025-08-27T02:33:12.448Z" }, - { url = "https://files.pythonhosted.org/packages/6a/28/34a1e2166e418c6a78e5c82e66f409d9da9317832f11c647f7d4e23846a6/onnx-1.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:d75452a9be868bd30c3ef6aa5991df89bbfe53d0d90b2325c5e730fbd91fff85", size = 16452303, upload-time = "2025-08-27T02:33:15.176Z" }, - { url = "https://files.pythonhosted.org/packages/e6/b7/639664626e5ba8027860c4d2a639ee02b37e9c322215c921e9222513c3aa/onnx-1.19.0-cp311-cp311-win_arm64.whl", hash = "sha256:23c7959370d7b3236f821e609b0af7763cff7672a758e6c1fc877bac099e786b", size = 16425340, upload-time = "2025-08-27T02:33:17.78Z" }, - { url = "https://files.pythonhosted.org/packages/0d/94/f56f6ca5e2f921b28c0f0476705eab56486b279f04e1d568ed64c14e7764/onnx-1.19.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:61d94e6498ca636756f8f4ee2135708434601b2892b7c09536befb19bc8ca007", size = 18322331, upload-time = "2025-08-27T02:33:20.373Z" }, - { url = "https://files.pythonhosted.org/packages/c8/00/8cc3f3c40b54b28f96923380f57c9176872e475face726f7d7a78bd74098/onnx-1.19.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:224473354462f005bae985c72028aaa5c85ab11de1b71d55b06fdadd64a667dd", size = 18027513, upload-time = "2025-08-27T02:33:23.44Z" }, - { url = "https://files.pythonhosted.org/packages/61/90/17c4d2566fd0117a5e412688c9525f8950d467f477fbd574e6b32bc9cb8d/onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae475c85c89bc4d1f16571006fd21a3e7c0e258dd2c091f6e8aafb083d1ed9b", size = 18202278, upload-time = "2025-08-27T02:33:26.103Z" }, - { url = "https://files.pythonhosted.org/packages/bc/6e/a9383d9cf6db4ac761a129b081e9fa5d0cd89aad43cf1e3fc6285b915c7d/onnx-1.19.0-cp312-cp312-win32.whl", hash = "sha256:323f6a96383a9cdb3960396cffea0a922593d221f3929b17312781e9f9b7fb9f", size = 16333080, upload-time = "2025-08-27T02:33:28.559Z" }, - { url = "https://files.pythonhosted.org/packages/a7/2e/3ff480a8c1fa7939662bdc973e41914add2d4a1f2b8572a3c39c2e4982e5/onnx-1.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:50220f3499a499b1a15e19451a678a58e22ad21b34edf2c844c6ef1d9febddc2", size = 16453927, upload-time = "2025-08-27T02:33:31.177Z" }, - { url = "https://files.pythonhosted.org/packages/57/37/ad500945b1b5c154fe9d7b826b30816ebd629d10211ea82071b5bcc30aa4/onnx-1.19.0-cp312-cp312-win_arm64.whl", hash = "sha256:efb768299580b786e21abe504e1652ae6189f0beed02ab087cd841cb4bb37e43", size = 16426022, upload-time = "2025-08-27T02:33:33.515Z" }, - { url = "https://files.pythonhosted.org/packages/be/29/d7b731f63d243f815d9256dce0dca3c151dcaa1ac59f73e6ee06c9afbe91/onnx-1.19.0-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:9aed51a4b01acc9ea4e0fe522f34b2220d59e9b2a47f105ac8787c2e13ec5111", size = 18322412, upload-time = "2025-08-27T02:33:36.723Z" }, - { url = "https://files.pythonhosted.org/packages/58/f5/d3106becb42cb374f0e17ff4c9933a97f1ee1d6a798c9452067f7d3ff61b/onnx-1.19.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce2cdc3eb518bb832668c4ea9aeeda01fbaa59d3e8e5dfaf7aa00f3d37119404", size = 18026565, upload-time = "2025-08-27T02:33:39.493Z" }, - { url = "https://files.pythonhosted.org/packages/83/fa/b086d17bab3900754c7ffbabfb244f8e5e5da54a34dda2a27022aa2b373b/onnx-1.19.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b546bd7958734b6abcd40cfede3d025e9c274fd96334053a288ab11106bd0aa", size = 18202077, upload-time = "2025-08-27T02:33:42.115Z" }, - { url = "https://files.pythonhosted.org/packages/35/f2/5e2dfb9d4cf873f091c3f3c6d151f071da4295f9893fbf880f107efe3447/onnx-1.19.0-cp313-cp313-win32.whl", hash = "sha256:03086bffa1cf5837430cf92f892ca0cd28c72758d8905578c2bf8ffaf86c6743", size = 16333198, upload-time = "2025-08-27T02:33:45.172Z" }, - { url = "https://files.pythonhosted.org/packages/79/67/b3751a35c2522f62f313156959575619b8fa66aa883db3adda9d897d8eb2/onnx-1.19.0-cp313-cp313-win_amd64.whl", hash = "sha256:1715b51eb0ab65272e34ef51cb34696160204b003566cd8aced2ad20a8f95cb8", size = 16453836, upload-time = "2025-08-27T02:33:47.779Z" }, - { url = "https://files.pythonhosted.org/packages/14/b9/1df85effc960fbbb90bb7bc36eb3907c676b104bc2f88bce022bcfdaef63/onnx-1.19.0-cp313-cp313-win_arm64.whl", hash = "sha256:6bf5acdb97a3ddd6e70747d50b371846c313952016d0c41133cbd8f61b71a8d5", size = 16425877, upload-time = "2025-08-27T02:33:50.357Z" }, - { url = "https://files.pythonhosted.org/packages/23/2b/089174a1427be9149f37450f8959a558ba20f79fca506ba461d59379d3a1/onnx-1.19.0-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:46cf29adea63e68be0403c68de45ba1b6acc9bb9592c5ddc8c13675a7c71f2cb", size = 18348546, upload-time = "2025-08-27T02:33:56.132Z" }, - { url = "https://files.pythonhosted.org/packages/c0/d6/3458f0e3a9dc7677675d45d7d6528cb84ad321c8670cc10c69b32c3e03da/onnx-1.19.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:246f0de1345498d990a443d55a5b5af5101a3e25a05a2c3a5fe8b7bd7a7d0707", size = 18033067, upload-time = "2025-08-27T02:33:58.661Z" }, - { url = "https://files.pythonhosted.org/packages/e4/16/6e4130e1b4b29465ee1fb07d04e8d6f382227615c28df8f607ba50909e2a/onnx-1.19.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae0d163ffbc250007d984b8dd692a4e2e4506151236b50ca6e3560b612ccf9ff", size = 18205741, upload-time = "2025-08-27T02:34:01.538Z" }, - { url = "https://files.pythonhosted.org/packages/fe/d8/f64d010fd024b2a2b11ce0c4ee179e4f8f6d4ccc95f8184961c894c22af1/onnx-1.19.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7c151604c7cca6ae26161c55923a7b9b559df3344938f93ea0074d2d49e7fe78", size = 16453839, upload-time = "2025-08-27T02:34:06.515Z" }, - { url = "https://files.pythonhosted.org/packages/67/ec/8761048eabef4dad55af4c002c672d139b9bd47c3616abaed642a1710063/onnx-1.19.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:236bc0e60d7c0f4159300da639953dd2564df1c195bce01caba172a712e75af4", size = 18027605, upload-time = "2025-08-27T02:34:08.962Z" }, -] - -[[package]] -name = "onnx-ir" -version = "0.1.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", -] -dependencies = [ - { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "numpy", marker = "python_full_version >= '3.13'" }, - { name = "onnx", marker = "python_full_version >= '3.13'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/af/4a/7ea3952e556e7281b8bfe7f7fce016a13fdac85544d6d6af8ebca5cae160/onnx_ir-0.1.8.tar.gz", hash = "sha256:85ea59eaf165b2b107788193480a260e2723cfc7a1dac1bde7085fd0b7e380d7", size = 108961, upload-time = "2025-09-05T15:45:33.887Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/1c/3bb51fa9e278cbc655a1943c8016163d76a6e24137e73e5198ebc20fc965/onnx_ir-0.1.8-py3-none-any.whl", hash = "sha256:61a42021b6249e566ff3b89a03342bc88dce4dc2d984b97cfb060f33ef179f8a", size = 125316, upload-time = "2025-09-05T15:45:31.211Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/27/2f/c619eb65769357e9b6de9212c9a821ab39cd484448e5d6b3fb5fb0a64c6d/onnx-1.19.1.tar.gz", hash = "sha256:737524d6eb3907d3499ea459c6f01c5a96278bb3a0f2ff8ae04786fb5d7f1ed5", size = 12033525, upload-time = "2025-10-10T04:01:34.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/f3/892eea0206ed13a986239bd508c82b974387ef1b0ffd83ece0ce0725aaf6/onnx-1.19.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:7343250cc5276cf439fe623b8f92e11cf0d1eebc733ae4a8b2e86903bb72ae68", size = 18319433, upload-time = "2025-10-10T03:59:47.236Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f3/c7ea4a1dfda9b9ddeff914a601ffaf5ed151b3352529f223eae74c03c8d1/onnx-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1fb8f79de7f3920bb82b537f3c6ac70c0ce59f600471d9c3eed2b5f8b079b748", size = 18043327, upload-time = "2025-10-10T03:59:50.854Z" }, + { url = "https://files.pythonhosted.org/packages/8d/eb/30159bb6a108b03f2b7521410369a5bd8d296be3fbf0b30ab7acd9ef42ad/onnx-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92b9d2dece41cc84213dbbfd1acbc2a28c27108c53bd28ddb6d1043fbfcbd2d5", size = 18216877, upload-time = "2025-10-10T03:59:54.512Z" }, + { url = "https://files.pythonhosted.org/packages/0c/86/dc034e5a723a20ca45aa8dd76dda53c358a5f955908e1436f42c21bdfb3a/onnx-1.19.1-cp310-cp310-win32.whl", hash = "sha256:c0b1a2b6bb19a0fc9f5de7661a547136d082c03c169a5215e18ff3ececd2a82f", size = 16344116, upload-time = "2025-10-10T03:59:57.991Z" }, + { url = "https://files.pythonhosted.org/packages/b6/60/537f2c19050f71445ee00ed91e78a396b6189dd1fce61b29ac6a0d651c7e/onnx-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:1c0498c00db05fcdb3426697d330dcecc3f60020015065e2c76fa795f2c9a605", size = 16462819, upload-time = "2025-10-10T04:00:01.157Z" }, + { url = "https://files.pythonhosted.org/packages/36/07/0019c72924909e4f64b9199770630ab7b8d7914b912b03230e68f5eda7ae/onnx-1.19.1-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:17aaf5832126de0a5197a5864e4f09a764dd7681d3035135547959b4b6b77a09", size = 18320936, upload-time = "2025-10-10T04:00:04.235Z" }, + { url = "https://files.pythonhosted.org/packages/af/2f/5c47acf740dc35f0decc640844260fbbdc0efa0565657c93fd7ff30f13f3/onnx-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01b292a4d0b197c45d8184545bbc8ae1df83466341b604187c1b05902cb9c920", size = 18044269, upload-time = "2025-10-10T04:00:07.449Z" }, + { url = "https://files.pythonhosted.org/packages/d5/61/6c457ee8c3a62a3cad0a4bfa4c5436bb3ac4df90c3551d40bee1224b5b51/onnx-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1839af08ab4a909e4af936b8149c27f8c64b96138981024e251906e0539d8bf9", size = 18218092, upload-time = "2025-10-10T04:00:11.135Z" }, + { url = "https://files.pythonhosted.org/packages/54/d5/ab832e1369505e67926a70e9a102061f89ad01f91aa296c4b1277cb81b25/onnx-1.19.1-cp311-cp311-win32.whl", hash = "sha256:0bdbb676e3722bd32f9227c465d552689f49086f986a696419d865cb4e70b989", size = 16344809, upload-time = "2025-10-10T04:00:14.634Z" }, + { url = "https://files.pythonhosted.org/packages/8b/b5/6eb4611d24b85002f878ba8476b4cecbe6f9784c0236a3c5eff85236cc0a/onnx-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:1346853df5c1e3ebedb2e794cf2a51e0f33759affd655524864ccbcddad7035b", size = 16464319, upload-time = "2025-10-10T04:00:18.235Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ff/f0e1f06420c70e20d497fec7c94a864d069943b6312bedd4224c0ab946f8/onnx-1.19.1-cp311-cp311-win_arm64.whl", hash = "sha256:2d69c280c0e665b7f923f499243b9bb84fe97970b7a4668afa0032045de602c8", size = 16437503, upload-time = "2025-10-10T04:00:21.247Z" }, + { url = "https://files.pythonhosted.org/packages/50/07/f6c5b2cffef8c29e739616d1415aea22f7b7ef1f19c17f02b7cff71f5498/onnx-1.19.1-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:3612193a89ddbce5c4e86150869b9258780a82fb8c4ca197723a4460178a6ce9", size = 18327840, upload-time = "2025-10-10T04:00:24.259Z" }, + { url = "https://files.pythonhosted.org/packages/93/20/0568ebd52730287ae80cac8ac893a7301c793ea1630984e2519ee92b02a9/onnx-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6c2fd2f744e7a3880ad0c262efa2edf6d965d0bd02b8f327ec516ad4cb0f2f15", size = 18042539, upload-time = "2025-10-10T04:00:27.693Z" }, + { url = "https://files.pythonhosted.org/packages/14/fd/cd7a0fd10a04f8cc5ae436b63e0022e236fe51b9dbb8ee6317fd48568c72/onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:485d3674d50d789e0ee72fa6f6e174ab81cb14c772d594f992141bd744729d8a", size = 18218271, upload-time = "2025-10-10T04:00:30.495Z" }, + { url = "https://files.pythonhosted.org/packages/65/68/cc8b8c05469fe08384b446304ad7e6256131ca0463bf6962366eebec98c0/onnx-1.19.1-cp312-cp312-win32.whl", hash = "sha256:638bc56ff1a5718f7441e887aeb4e450f37a81c6eac482040381b140bd9ba601", size = 16345111, upload-time = "2025-10-10T04:00:34.982Z" }, + { url = "https://files.pythonhosted.org/packages/c7/5e/d1cb16693598a512c2cf9ffe0841d8d8fd2c83ae8e889efd554f5aa427cf/onnx-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:bc7e2e4e163e679721e547958b5a7db875bf822cad371b7c1304aa4401a7c7a4", size = 16465621, upload-time = "2025-10-10T04:00:39.107Z" }, + { url = "https://files.pythonhosted.org/packages/90/32/da116cc61fdef334782aa7f87a1738431dd1af1a5d1a44bd95d6d51ad260/onnx-1.19.1-cp312-cp312-win_arm64.whl", hash = "sha256:17c215b1c0f20fe93b4cbe62668247c1d2294b9bc7f6be0ca9ced28e980c07b7", size = 16437505, upload-time = "2025-10-10T04:00:42.255Z" }, + { url = "https://files.pythonhosted.org/packages/b4/b8/ab1fdfe2e8502f4dc4289fc893db35816bd20d080d8370f86e74dda5f598/onnx-1.19.1-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:4e5f938c68c4dffd3e19e4fd76eb98d298174eb5ebc09319cdd0ec5fe50050dc", size = 18327815, upload-time = "2025-10-10T04:00:45.682Z" }, + { url = "https://files.pythonhosted.org/packages/04/40/eb875745a4b92aea10e5e32aa2830f409c4d7b6f7b48ca1c4eaad96636c5/onnx-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:86e20a5984b017feeef2dbf4ceff1c7c161ab9423254968dd77d3696c38691d0", size = 18041464, upload-time = "2025-10-10T04:00:48.557Z" }, + { url = "https://files.pythonhosted.org/packages/cf/8e/8586135f40dbe4989cec4d413164bc8fc5c73d37c566f33f5ea3a7f2b6f6/onnx-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d9c467f0f29993c12f330736af87972f30adb8329b515f39d63a0db929cb2c", size = 18218244, upload-time = "2025-10-10T04:00:51.891Z" }, + { url = "https://files.pythonhosted.org/packages/51/b5/4201254b8683129db5da3fb55aa1f7e56d0a8d45c66ce875dec21ca1ff25/onnx-1.19.1-cp313-cp313-win32.whl", hash = "sha256:65eee353a51b4e4ca3e797784661e5376e2b209f17557e04921eac9166a8752e", size = 16345330, upload-time = "2025-10-10T04:00:54.858Z" }, + { url = "https://files.pythonhosted.org/packages/69/67/c6d239afbcdbeb6805432969b908b5c9f700c96d332b34e3f99518d76caf/onnx-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:c3bc87e38b53554b1fc9ef7b275c81c6f5c93c90a91935bb0aa8d4d498a6d48e", size = 16465567, upload-time = "2025-10-10T04:00:57.893Z" }, + { url = "https://files.pythonhosted.org/packages/99/fe/89f1e40f5bc54595ff0dcf5391ce19e578b528973ccc74dd99800196d30d/onnx-1.19.1-cp313-cp313-win_arm64.whl", hash = "sha256:e41496f400afb980ec643d80d5164753a88a85234fa5c06afdeebc8b7d1ec252", size = 16437562, upload-time = "2025-10-10T04:01:00.703Z" }, + { url = "https://files.pythonhosted.org/packages/86/43/b186ccbc8fe7e93643a6a6d40bbf2bb6ce4fb9469bbd3453c77e270c50ad/onnx-1.19.1-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:5f6274abf0fd74e80e78ecbb44bd44509409634525c89a9b38276c8af47dc0a2", size = 18355703, upload-time = "2025-10-10T04:01:03.735Z" }, + { url = "https://files.pythonhosted.org/packages/60/f1/22ee4d8b8f9fa4cb1d1b9579da3b4b5187ddab33846ec5ac744af02c0e2b/onnx-1.19.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:07dcd4d83584eb4bf8f21ac04c82643712e5e93ac2a0ed10121ec123cb127e1e", size = 18047830, upload-time = "2025-10-10T04:01:06.552Z" }, + { url = "https://files.pythonhosted.org/packages/8e/a4/8f3d51e3a095d42cdf2039a590cff06d024f2a10efbd0b1a2a6b3825f019/onnx-1.19.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1975860c3e720db25d37f1619976582828264bdcc64fa7511c321ac4fc01add3", size = 18221126, upload-time = "2025-10-10T04:01:09.77Z" }, + { url = "https://files.pythonhosted.org/packages/4f/0d/f9d6c2237083f1aac14b37f0b03b0d81f1147a8e2af0c3828165e0a6a67b/onnx-1.19.1-cp313-cp313t-win_amd64.whl", hash = "sha256:9807d0e181f6070ee3a6276166acdc571575d1bd522fc7e89dba16fd6e7ffed9", size = 16465560, upload-time = "2025-10-10T04:01:13.212Z" }, + { url = "https://files.pythonhosted.org/packages/36/70/8418a58faa7d606d6a92cab69ae8d361b3b3969bf7e7e9a65a86d5d1b674/onnx-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6ee83e6929d75005482d9f304c502ac7c9b8d6db153aa6b484dae74d0f28570", size = 18042812, upload-time = "2025-10-10T04:01:15.919Z" }, ] [[package]] name = "onnx-ir" version = "0.1.12" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] dependencies = [ - { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "numpy", marker = "python_full_version < '3.13'" }, - { name = "onnx", marker = "python_full_version < '3.13'" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "onnx" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6c/1a/2a94112a39d01a9d1490f5ef3c205d8a17fe1ca27f307b026c40d62d8e9f/onnx_ir-0.1.12.tar.gz", hash = "sha256:742e0bff875d0547724187560b3f441833191c8aa939c05f14176f4892784deb", size = 112699, upload-time = "2025-10-28T23:43:54.129Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c8/36/c4df116f5dcaa82ec7944e5d25624a3811f6603fd190660b0b079ea759fb/onnx_ir-0.1.12-py3-none-any.whl", hash = "sha256:17f86faf8a53b979430bde1bc6022c7a162b0d1534550ddb17a1d37eb993e765", size = 129277, upload-time = "2025-10-28T23:43:52.493Z" }, ] -[[package]] -name = "onnxscript" -version = "0.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", -] -dependencies = [ - { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "numpy", marker = "python_full_version >= '3.13'" }, - { name = "onnx", marker = "python_full_version >= '3.13'" }, - { name = "onnx-ir", version = "0.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "packaging", marker = "python_full_version >= '3.13'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f5/2f/0bb2b6ca727e4d5173f640527f402ab4225def4bc8d667269b83047be8c4/onnxscript-0.5.0.tar.gz", hash = "sha256:4aba215e1f80fbcd07ba0d97d6bca96797fc3e9639eacb5434d35317ce1406aa", size = 588762, upload-time = "2025-09-12T16:57:46.484Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/f7/f0eb0b10771637a8c176a3b0594c65c5ba3cea440847741297901cef2c5e/onnxscript-0.5.0-py3-none-any.whl", hash = "sha256:da33715ac8ec80e0263a5200f1ad1b3532225804c05a13a0d6ea83712b5b4a8f", size = 684685, upload-time = "2025-09-12T16:57:48.869Z" }, -] - [[package]] name = "onnxscript" version = "0.5.6" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] dependencies = [ - { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "numpy", marker = "python_full_version < '3.13'" }, - { name = "onnx", marker = "python_full_version < '3.13'" }, - { name = "onnx-ir", version = "0.1.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "packaging", marker = "python_full_version < '3.13'" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "onnx" }, + { name = "onnx-ir" }, + { name = "packaging" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fb/4b/eed2199327bbf12c3443d7835893e3c4c23b1c1a4aa13efe0f7fbe0a6bf9/onnxscript-0.5.6.tar.gz", hash = "sha256:cc3338b2976daffd2af0bb6ac4866a4dca76aefface1666a0d7bc65ad9850822", size = 587017, upload-time = "2025-10-31T03:50:38.656Z" } wheels = [ @@ -3570,13 +3493,22 @@ version = "1.33.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, - { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" } }, + { name = "importlib-metadata" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9a/8d/1f5a45fbcb9a7d87809d460f09dc3399e3fbd31d7f3e14888345e9d29951/opentelemetry_api-1.33.1.tar.gz", hash = "sha256:1c6055fc0a2d3f23a50c7e17e16ef75ad489345fd3df1f8b8af7c0bbf8a109e8", size = 65002, upload-time = "2025-05-16T18:52:41.146Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/05/44/4c45a34def3506122ae61ad684139f0bbc4e00c39555d4f7e20e0e001c8a/opentelemetry_api-1.33.1-py3-none-any.whl", hash = "sha256:4db83ebcf7ea93e64637ec6ee6fabee45c5cbe4abd9cf3da95c43828ddb50b83", size = 65771, upload-time = "2025-05-16T18:52:17.419Z" }, ] +[[package]] +name = "overrides" +version = "7.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/36/86/b585f53236dec60aba864e050778b25045f857e17f6e5ea0ae95fe80edd2/overrides-7.7.0.tar.gz", hash = "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a", size = 22812, upload-time = "2024-01-27T21:01:33.423Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/ab/fc8290c6a4c722e5514d80f62b2dc4c4df1a68a41d1364e625c35990fcf3/overrides-7.7.0-py3-none-any.whl", hash = "sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49", size = 17832, upload-time = "2024-01-27T21:01:31.393Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -3591,7 +3523,8 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -3798,14 +3731,14 @@ wheels = [ [[package]] name = "prettytable" -version = "3.16.0" +version = "3.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "wcwidth" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/99/b1/85e18ac92afd08c533603e3393977b6bc1443043115a47bb094f3b98f94f/prettytable-3.16.0.tar.gz", hash = "sha256:3c64b31719d961bf69c9a7e03d0c1e477320906a98da63952bc6698d6164ff57", size = 66276, upload-time = "2025-03-24T19:39:04.008Z" } +sdist = { url = "https://files.pythonhosted.org/packages/79/45/b0847d88d6cfeb4413566738c8bbf1e1995fad3d42515327ff32cc1eb578/prettytable-3.17.0.tar.gz", hash = "sha256:59f2590776527f3c9e8cf9fe7b66dd215837cca96a9c39567414cbc632e8ddb0", size = 67892, upload-time = "2025-11-14T17:33:20.212Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/c7/5613524e606ea1688b3bdbf48aa64bafb6d0a4ac3750274c43b6158a390f/prettytable-3.16.0-py3-none-any.whl", hash = "sha256:b5eccfabb82222f5aa46b798ff02a8452cf530a352c31bddfa29be41242863aa", size = 33863, upload-time = "2025-03-24T19:39:02.359Z" }, + { url = "https://files.pythonhosted.org/packages/ee/8c/83087ebc47ab0396ce092363001fa37c17153119ee282700c0713a195853/prettytable-3.17.0-py3-none-any.whl", hash = "sha256:aad69b294ddbe3e1f95ef8886a060ed1666a0b83018bbf56295f6f226c43d287", size = 34433, upload-time = "2025-11-14T17:33:19.093Z" }, ] [[package]] @@ -3958,17 +3891,17 @@ wheels = [ [[package]] name = "protobuf" -version = "6.33.0" +version = "6.33.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/ff/64a6c8f420818bb873713988ca5492cba3a7946be57e027ac63495157d97/protobuf-6.33.0.tar.gz", hash = "sha256:140303d5c8d2037730c548f8c7b93b20bb1dc301be280c378b82b8894589c954", size = 443463, upload-time = "2025-10-15T20:39:52.159Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/ee/52b3fa8feb6db4a833dfea4943e175ce645144532e8a90f72571ad85df4e/protobuf-6.33.0-cp310-abi3-win32.whl", hash = "sha256:d6101ded078042a8f17959eccd9236fb7a9ca20d3b0098bbcb91533a5680d035", size = 425593, upload-time = "2025-10-15T20:39:40.29Z" }, - { url = "https://files.pythonhosted.org/packages/7b/c6/7a465f1825872c55e0341ff4a80198743f73b69ce5d43ab18043699d1d81/protobuf-6.33.0-cp310-abi3-win_amd64.whl", hash = "sha256:9a031d10f703f03768f2743a1c403af050b6ae1f3480e9c140f39c45f81b13ee", size = 436882, upload-time = "2025-10-15T20:39:42.841Z" }, - { url = "https://files.pythonhosted.org/packages/e1/a9/b6eee662a6951b9c3640e8e452ab3e09f117d99fc10baa32d1581a0d4099/protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:905b07a65f1a4b72412314082c7dbfae91a9e8b68a0cc1577515f8df58ecf455", size = 427521, upload-time = "2025-10-15T20:39:43.803Z" }, - { url = "https://files.pythonhosted.org/packages/10/35/16d31e0f92c6d2f0e77c2a3ba93185130ea13053dd16200a57434c882f2b/protobuf-6.33.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e0697ece353e6239b90ee43a9231318302ad8353c70e6e45499fa52396debf90", size = 324445, upload-time = "2025-10-15T20:39:44.932Z" }, - { url = "https://files.pythonhosted.org/packages/e6/eb/2a981a13e35cda8b75b5585aaffae2eb904f8f351bdd3870769692acbd8a/protobuf-6.33.0-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:e0a1715e4f27355afd9570f3ea369735afc853a6c3951a6afe1f80d8569ad298", size = 339159, upload-time = "2025-10-15T20:39:46.186Z" }, - { url = "https://files.pythonhosted.org/packages/21/51/0b1cbad62074439b867b4e04cc09b93f6699d78fd191bed2bbb44562e077/protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:35be49fd3f4fefa4e6e2aacc35e8b837d6703c37a2168a55ac21e9b1bc7559ef", size = 323172, upload-time = "2025-10-15T20:39:47.465Z" }, - { url = "https://files.pythonhosted.org/packages/07/d1/0a28c21707807c6aacd5dc9c3704b2aa1effbf37adebd8caeaf68b17a636/protobuf-6.33.0-py3-none-any.whl", hash = "sha256:25c9e1963c6734448ea2d308cfa610e692b801304ba0908d7bfa564ac5132995", size = 170477, upload-time = "2025-10-15T20:39:51.311Z" }, + { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" }, + { url = "https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" }, + { url = "https://files.pythonhosted.org/packages/cd/93/26213ff72b103ae55bb0d73e7fb91ea570ef407c3ab4fd2f1f27cac16044/protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:fe34575f2bdde76ac429ec7b570235bf0c788883e70aee90068e9981806f2490", size = 427522, upload-time = "2025-11-13T16:44:10.475Z" }, + { url = "https://files.pythonhosted.org/packages/c2/32/df4a35247923393aa6b887c3b3244a8c941c32a25681775f96e2b418f90e/protobuf-6.33.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:f8adba2e44cde2d7618996b3fc02341f03f5bc3f2748be72dc7b063319276178", size = 324445, upload-time = "2025-11-13T16:44:11.869Z" }, + { url = "https://files.pythonhosted.org/packages/8e/d0/d796e419e2ec93d2f3fa44888861c3f88f722cde02b7c3488fcc6a166820/protobuf-6.33.1-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:0f4cf01222c0d959c2b399142deb526de420be8236f22c71356e2a544e153c53", size = 339161, upload-time = "2025-11-13T16:44:12.778Z" }, + { url = "https://files.pythonhosted.org/packages/1d/2a/3c5f05a4af06649547027d288747f68525755de692a26a7720dced3652c0/protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:8fd7d5e0eb08cd5b87fd3df49bc193f5cfd778701f47e11d127d0afc6c39f1d1", size = 323171, upload-time = "2025-11-13T16:44:14.035Z" }, + { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" }, ] [[package]] @@ -4092,7 +4025,7 @@ wheels = [ [[package]] name = "pydantic" -version = "2.12.4" +version = "2.12.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, @@ -4100,9 +4033,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/96/ad/a17bc283d7d81837c061c49e3eaa27a45991759a1b7eae1031921c6bd924/pydantic-2.12.4.tar.gz", hash = "sha256:0f8cb9555000a4b5b617f66bfd2566264c4984b27589d3b845685983e8ea85ac", size = 821038, upload-time = "2025-11-05T10:50:08.59Z" } +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/82/2f/e68750da9b04856e2a7ec56fc6f034a5a79775e9b9a81882252789873798/pydantic-2.12.4-py3-none-any.whl", hash = "sha256:92d3d202a745d46f9be6df459ac5a064fdaa3c1c4cd8adcfa332ccf3c05f871e", size = 463400, upload-time = "2025-11-05T10:50:06.732Z" }, + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, ] [[package]] @@ -4311,51 +4244,39 @@ wheels = [ [[package]] name = "pynacl" -version = "1.6.0" +version = "1.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/06/c6/a3124dee667a423f2c637cfd262a54d67d8ccf3e160f3c50f622a85b7723/pynacl-1.6.0.tar.gz", hash = "sha256:cb36deafe6e2bce3b286e5d1f3e1c246e0ccdb8808ddb4550bb2792f2df298f2", size = 3505641, upload-time = "2025-09-10T23:39:22.308Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/70/24/1b639176401255605ba7c2b93a7b1eb1e379e0710eca62613633eb204201/pynacl-1.6.0-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:f46386c24a65383a9081d68e9c2de909b1834ec74ff3013271f1bca9c2d233eb", size = 384141, upload-time = "2025-09-10T23:38:28.675Z" }, - { url = "https://files.pythonhosted.org/packages/5e/7b/874efdf57d6bf172db0df111b479a553c3d9e8bb4f1f69eb3ffff772d6e8/pynacl-1.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:dea103a1afcbc333bc0e992e64233d360d393d1e63d0bc88554f572365664348", size = 808132, upload-time = "2025-09-10T23:38:38.995Z" }, - { url = "https://files.pythonhosted.org/packages/f3/61/9b53f5913f3b75ac3d53170cdb897101b2b98afc76f4d9d3c8de5aa3ac05/pynacl-1.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:04f20784083014e265ad58c1b2dd562c3e35864b5394a14ab54f5d150ee9e53e", size = 1407253, upload-time = "2025-09-10T23:38:40.492Z" }, - { url = "https://files.pythonhosted.org/packages/7c/0a/b138916b22bbf03a1bdbafecec37d714e7489dd7bcaf80cd17852f8b67be/pynacl-1.6.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbcc4452a1eb10cd5217318c822fde4be279c9de8567f78bad24c773c21254f8", size = 843719, upload-time = "2025-09-10T23:38:30.87Z" }, - { url = "https://files.pythonhosted.org/packages/01/3b/17c368197dfb2c817ce033f94605a47d0cc27901542109e640cef263f0af/pynacl-1.6.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51fed9fe1bec9e7ff9af31cd0abba179d0e984a2960c77e8e5292c7e9b7f7b5d", size = 1445441, upload-time = "2025-09-10T23:38:33.078Z" }, - { url = "https://files.pythonhosted.org/packages/35/3c/f79b185365ab9be80cd3cd01dacf30bf5895f9b7b001e683b369e0bb6d3d/pynacl-1.6.0-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:10d755cf2a455d8c0f8c767a43d68f24d163b8fe93ccfaabfa7bafd26be58d73", size = 825691, upload-time = "2025-09-10T23:38:34.832Z" }, - { url = "https://files.pythonhosted.org/packages/f7/1f/8b37d25e95b8f2a434a19499a601d4d272b9839ab8c32f6b0fc1e40c383f/pynacl-1.6.0-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:536703b8f90e911294831a7fbcd0c062b837f3ccaa923d92a6254e11178aaf42", size = 1410726, upload-time = "2025-09-10T23:38:36.893Z" }, - { url = "https://files.pythonhosted.org/packages/bd/93/5a4a4cf9913014f83d615ad6a2df9187330f764f606246b3a744c0788c03/pynacl-1.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6b08eab48c9669d515a344fb0ef27e2cbde847721e34bba94a343baa0f33f1f4", size = 801035, upload-time = "2025-09-10T23:38:42.109Z" }, - { url = "https://files.pythonhosted.org/packages/bf/60/40da6b0fe6a4d5fd88f608389eb1df06492ba2edca93fca0b3bebff9b948/pynacl-1.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5789f016e08e5606803161ba24de01b5a345d24590a80323379fc4408832d290", size = 1371854, upload-time = "2025-09-10T23:38:44.16Z" }, - { url = "https://files.pythonhosted.org/packages/44/b2/37ac1d65008f824cba6b5bf68d18b76d97d0f62d7a032367ea69d4a187c8/pynacl-1.6.0-cp314-cp314t-win32.whl", hash = "sha256:4853c154dc16ea12f8f3ee4b7e763331876316cc3a9f06aeedf39bcdca8f9995", size = 230345, upload-time = "2025-09-10T23:38:48.276Z" }, - { url = "https://files.pythonhosted.org/packages/f4/5a/9234b7b45af890d02ebee9aae41859b9b5f15fb4a5a56d88e3b4d1659834/pynacl-1.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:347dcddce0b4d83ed3f32fd00379c83c425abee5a9d2cd0a2c84871334eaff64", size = 243103, upload-time = "2025-09-10T23:38:45.503Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2c/c1a0f19d720ab0af3bc4241af2bdf4d813c3ecdcb96392b5e1ddf2d8f24f/pynacl-1.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2d6cd56ce4998cb66a6c112fda7b1fdce5266c9f05044fa72972613bef376d15", size = 187778, upload-time = "2025-09-10T23:38:46.731Z" }, - { url = "https://files.pythonhosted.org/packages/63/37/87c72df19857c5b3b47ace6f211a26eb862ada495cc96daa372d96048fca/pynacl-1.6.0-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:f4b3824920e206b4f52abd7de621ea7a44fd3cb5c8daceb7c3612345dfc54f2e", size = 382610, upload-time = "2025-09-10T23:38:49.459Z" }, - { url = "https://files.pythonhosted.org/packages/0c/64/3ce958a5817fd3cc6df4ec14441c43fd9854405668d73babccf77f9597a3/pynacl-1.6.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:16dd347cdc8ae0b0f6187a2608c0af1c8b7ecbbe6b4a06bff8253c192f696990", size = 798744, upload-time = "2025-09-10T23:38:58.531Z" }, - { url = "https://files.pythonhosted.org/packages/e4/8a/3f0dd297a0a33fa3739c255feebd0206bb1df0b44c52fbe2caf8e8bc4425/pynacl-1.6.0-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:16c60daceee88d04f8d41d0a4004a7ed8d9a5126b997efd2933e08e93a3bd850", size = 1397879, upload-time = "2025-09-10T23:39:00.44Z" }, - { url = "https://files.pythonhosted.org/packages/41/94/028ff0434a69448f61348d50d2c147dda51aabdd4fbc93ec61343332174d/pynacl-1.6.0-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:25720bad35dfac34a2bcdd61d9e08d6bfc6041bebc7751d9c9f2446cf1e77d64", size = 833907, upload-time = "2025-09-10T23:38:50.936Z" }, - { url = "https://files.pythonhosted.org/packages/52/bc/a5cff7f8c30d5f4c26a07dfb0bcda1176ab8b2de86dda3106c00a02ad787/pynacl-1.6.0-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bfaa0a28a1ab718bad6239979a5a57a8d1506d0caf2fba17e524dbb409441cf", size = 1436649, upload-time = "2025-09-10T23:38:52.783Z" }, - { url = "https://files.pythonhosted.org/packages/7a/20/c397be374fd5d84295046e398de4ba5f0722dc14450f65db76a43c121471/pynacl-1.6.0-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:ef214b90556bb46a485b7da8258e59204c244b1b5b576fb71848819b468c44a7", size = 817142, upload-time = "2025-09-10T23:38:54.4Z" }, - { url = "https://files.pythonhosted.org/packages/12/30/5efcef3406940cda75296c6d884090b8a9aad2dcc0c304daebb5ae99fb4a/pynacl-1.6.0-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:49c336dd80ea54780bcff6a03ee1a476be1612423010472e60af83452aa0f442", size = 1401794, upload-time = "2025-09-10T23:38:56.614Z" }, - { url = "https://files.pythonhosted.org/packages/be/e1/a8fe1248cc17ccb03b676d80fa90763760a6d1247da434844ea388d0816c/pynacl-1.6.0-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f3482abf0f9815e7246d461fab597aa179b7524628a4bc36f86a7dc418d2608d", size = 772161, upload-time = "2025-09-10T23:39:01.93Z" }, - { url = "https://files.pythonhosted.org/packages/a3/76/8a62702fb657d6d9104ce13449db221a345665d05e6a3fdefb5a7cafd2ad/pynacl-1.6.0-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:140373378e34a1f6977e573033d1dd1de88d2a5d90ec6958c9485b2fd9f3eb90", size = 1370720, upload-time = "2025-09-10T23:39:03.531Z" }, - { url = "https://files.pythonhosted.org/packages/6d/38/9e9e9b777a1c4c8204053733e1a0269672c0bd40852908c9ad6b6eaba82c/pynacl-1.6.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6b393bc5e5a0eb86bb85b533deb2d2c815666665f840a09e0aa3362bb6088736", size = 791252, upload-time = "2025-09-10T23:39:05.058Z" }, - { url = "https://files.pythonhosted.org/packages/63/ef/d972ce3d92ae05c9091363cf185e8646933f91c376e97b8be79ea6e96c22/pynacl-1.6.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4a25cfede801f01e54179b8ff9514bd7b5944da560b7040939732d1804d25419", size = 1362910, upload-time = "2025-09-10T23:39:06.924Z" }, - { url = "https://files.pythonhosted.org/packages/35/2c/ee0b373a1861f66a7ca8bdb999331525615061320dd628527a50ba8e8a60/pynacl-1.6.0-cp38-abi3-win32.whl", hash = "sha256:dcdeb41c22ff3c66eef5e63049abf7639e0db4edee57ba70531fc1b6b133185d", size = 226461, upload-time = "2025-09-10T23:39:11.894Z" }, - { url = "https://files.pythonhosted.org/packages/75/f7/41b6c0b9dd9970173b6acc026bab7b4c187e4e5beef2756d419ad65482da/pynacl-1.6.0-cp38-abi3-win_amd64.whl", hash = "sha256:cf831615cc16ba324240de79d925eacae8265b7691412ac6b24221db157f6bd1", size = 238802, upload-time = "2025-09-10T23:39:08.966Z" }, - { url = "https://files.pythonhosted.org/packages/8e/0f/462326910c6172fa2c6ed07922b22ffc8e77432b3affffd9e18f444dbfbb/pynacl-1.6.0-cp38-abi3-win_arm64.whl", hash = "sha256:84709cea8f888e618c21ed9a0efdb1a59cc63141c403db8bf56c469b71ad56f2", size = 183846, upload-time = "2025-09-10T23:39:10.552Z" }, -] - -[[package]] -name = "pynvml" -version = "13.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-ml-py" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5c/57/da7dc63a79f59e082e26a66ac02d87d69ea316b35b35b7a00d82f3ce3d2f/pynvml-13.0.1.tar.gz", hash = "sha256:1245991d9db786b4d2f277ce66869bd58f38ac654e38c9397d18f243c8f6e48f", size = 35226, upload-time = "2025-09-05T20:33:25.377Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/4a/cac76c174bb439a0c46c9a4413fcbea5c6cabfb01879f7bbdb9fdfaed76c/pynvml-13.0.1-py3-none-any.whl", hash = "sha256:e2b20e0a501eeec951e2455b7ab444759cf048e0e13a57b08049fa2775266aa8", size = 28810, upload-time = "2025-09-05T20:33:24.13Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" }, + { url = "https://files.pythonhosted.org/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" }, + { url = "https://files.pythonhosted.org/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" }, + { url = "https://files.pythonhosted.org/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" }, + { url = "https://files.pythonhosted.org/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" }, + { url = "https://files.pythonhosted.org/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" }, + { url = "https://files.pythonhosted.org/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" }, + { url = "https://files.pythonhosted.org/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" }, + { url = "https://files.pythonhosted.org/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" }, + { url = "https://files.pythonhosted.org/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" }, + { url = "https://files.pythonhosted.org/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" }, + { url = "https://files.pythonhosted.org/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" }, + { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" }, + { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" }, + { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" }, + { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" }, + { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" }, + { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" }, + { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" }, + { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" }, + { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" }, + { url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" }, + { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" }, + { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" }, ] [[package]] @@ -4390,16 +4311,16 @@ wheels = [ [[package]] name = "pytest-asyncio" -version = "1.2.0" +version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backports-asyncio-runner", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pytest" }, { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/42/86/9e3c5f48f7b7b638b216e4b9e645f54d199d7abbbab7a64a13b4e12ba10f/pytest_asyncio-1.2.0.tar.gz", hash = "sha256:c609a64a2a8768462d0c99811ddb8bd2583c33fd33cf7f21af1c142e824ffb57", size = 50119, upload-time = "2025-09-12T07:33:53.816Z" } +sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" }, + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, ] [[package]] @@ -4595,7 +4516,7 @@ wheels = [ [[package]] name = "ray" -version = "2.49.2" +version = "2.51.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -4608,25 +4529,21 @@ dependencies = [ { name = "requests" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/99/517f224ffd073689c4905bdb185c21d9d8936d75066a96d454878f9e1e47/ray-2.49.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:08bec467576bc030d8bd0638004e1b8e075588929349112988a4bd4928684e8c", size = 66869076, upload-time = "2025-09-19T19:14:37.371Z" }, - { url = "https://files.pythonhosted.org/packages/61/c5/c2ceba832fe3f47cfd7e11cd7cc7a1bbc2c028424c5bca70435aa4ca1dec/ray-2.49.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3e441bf2acd7f368cf45132752066c5c3b83d88cd5f85762e703774bba4f2b6d", size = 69263514, upload-time = "2025-09-19T19:14:45.519Z" }, - { url = "https://files.pythonhosted.org/packages/63/0e/830df5a0f7e2b582422ee8ad0cdf2a2a9563aa63bb8e60be9ceec494981c/ray-2.49.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:eae07b3fed45f5b041a8bf9795cd26fad2464be5126efd447e4484905a29b677", size = 69125462, upload-time = "2025-09-19T19:14:51.029Z" }, - { url = "https://files.pythonhosted.org/packages/c0/85/a340eba596db3f66d3a338aff43942d8bac32732fb4cf4a20ed4bbbd07eb/ray-2.49.2-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:74566876af7bf4e48ea4b9b3b75b34db053d1064cc4d4b1670dc4ce78f6894af", size = 69935752, upload-time = "2025-09-19T19:14:56.191Z" }, - { url = "https://files.pythonhosted.org/packages/ac/e6/809730d87cdf762e76728ea6bb3f96e38fa2dc7ef7d572a49c0d7ebcde95/ray-2.49.2-cp310-cp310-win_amd64.whl", hash = "sha256:e6becc2026d900ca0ba07eff12a130c9d651a91290bb24d43594842b575cc4e5", size = 26246695, upload-time = "2025-09-19T19:15:00.9Z" }, - { url = "https://files.pythonhosted.org/packages/b5/63/27c7fb49513c816b825c809dd33a8570b35d511d1b5e568a4b33b0557997/ray-2.49.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:4fb9f9bf62fd5c92d22da20cd2aacb4ade1fb23033765fa9274f0a0c50bc42f6", size = 66869606, upload-time = "2025-09-19T19:15:05.838Z" }, - { url = "https://files.pythonhosted.org/packages/52/9a/9728d1e9dc5473acf0e4f67081dc323d3333c8c87a1e9260ea8878720017/ray-2.49.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:9ece957a13985f7bbf4077f4ff0204314d7e99a941f95dff2a16b453d5376dc3", size = 69273124, upload-time = "2025-09-19T19:15:11.348Z" }, - { url = "https://files.pythonhosted.org/packages/38/67/93f0d6d558874a730581059eb6dfa8860991a5410502ea0685dba5e788e4/ray-2.49.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:eada9dd89ccda643a3c6c2cba7016b59898432d126e10b38fed52d74165364f4", size = 69266231, upload-time = "2025-09-19T19:15:16.92Z" }, - { url = "https://files.pythonhosted.org/packages/c1/2b/f2efd0e7bcef06d51422db1af48cc5695a3f9b40a444f9d270a2d4663252/ray-2.49.2-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:54077dde338c5ffba349a4ab61b72352a3c3be69ea5b4f1b436d98d40b312763", size = 70070382, upload-time = "2025-09-19T19:15:22.048Z" }, - { url = "https://files.pythonhosted.org/packages/d7/b5/dfe1240e13d88dc68de03ee7c617f7578ef026e8569a42f7eeeb4729c5e3/ray-2.49.2-cp311-cp311-win_amd64.whl", hash = "sha256:41e11802ebbc487380e6c21dc041cb405e69fdda717a4eafdfeea294c6c3f9ca", size = 26243798, upload-time = "2025-09-19T19:15:26.405Z" }, - { url = "https://files.pythonhosted.org/packages/01/66/0d4e518d611486244b357a6cf58a31d7d184f5558e03d5e482c335749616/ray-2.49.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:d6d612de5c6341b776fc75edeee5b698bb4af7ee84a2ff30552b32a9e6e4a772", size = 66857495, upload-time = "2025-09-19T19:15:31.427Z" }, - { url = "https://files.pythonhosted.org/packages/1a/4c/76f2c7c0946645fdd8d286a3e00e2c42130d676286de206be5d60d271218/ray-2.49.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:6784e076e4418222ef8ee3b6a8bfeb867d8797803b25bcfcce3bf3bc5414bef1", size = 69262599, upload-time = "2025-09-19T19:15:36.732Z" }, - { url = "https://files.pythonhosted.org/packages/da/99/23b732c0b7b2ee2ffd28bf632257fb98924a03251d251810cb637512fcab/ray-2.49.2-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:dd0d8d8641d142fafe6d83e87d3c19bd5637d21e34608d3ff69ad71ea3e2f462", size = 69287193, upload-time = "2025-09-19T19:15:42.093Z" }, - { url = "https://files.pythonhosted.org/packages/69/ca/94791be5c3b68ed0df85589a8ca558334818a47bf2978000f85533245aed/ray-2.49.2-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:2ecaaa51f588ccdda2b61563a8be3843bf65dfaaa83a240588a307f4ebb82471", size = 70114942, upload-time = "2025-09-19T19:15:47.536Z" }, - { url = "https://files.pythonhosted.org/packages/e0/22/3f4b77498eefb3152a5946f9f544fcf336e7b9970c5c8af8e2d5eed13f0b/ray-2.49.2-cp312-cp312-win_amd64.whl", hash = "sha256:cba59684f031c9e778c588bc925777967e1b49bab3f00c638e4980bfdab07aec", size = 26223595, upload-time = "2025-09-19T19:15:51.803Z" }, - { url = "https://files.pythonhosted.org/packages/99/dc/a7e569bf7030e0ec50163aed731189e744ca857d74f51b24361ce426697a/ray-2.49.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:2e2fe20fa90562e73630da9ff7932d3ed6507e73291c4d9bdf566537ae9deddf", size = 66803846, upload-time = "2025-09-19T19:15:56.928Z" }, - { url = "https://files.pythonhosted.org/packages/4e/cf/6667e01f39cd28637f082273e9147f16d5f8fff34e2fb0ca60cc5da76e22/ray-2.49.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:b2f4f0fed936faf688e87ffdcc9356c034513c00259a2f1a8589e345fcfbdbc0", size = 69208426, upload-time = "2025-09-19T19:16:02.085Z" }, - { url = "https://files.pythonhosted.org/packages/c5/84/5361bcdc9c9fb9f4abbf836801803b7df75c76c16a56493413eb154b8a34/ray-2.49.2-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:b4c7869688c518e902f7b6288edec2365ab4d28a464291e6d0a7040c7d01b5f7", size = 69198140, upload-time = "2025-09-19T19:16:07.413Z" }, - { url = "https://files.pythonhosted.org/packages/b0/0c/9e49c3da7502f18483e4deb3273a3104d501c5e9cf1664a136b8ea36df48/ray-2.49.2-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:b7d8214cff86df044fec727eeeabccc3bfc9b0271d28d61ba92c09f0d127d01d", size = 70027331, upload-time = "2025-09-19T19:16:12.968Z" }, + { url = "https://files.pythonhosted.org/packages/72/4b/8ded0ecb0ed08b75af47340fac4b14b15196a76a6d733f3945cc5cb77354/ray-2.51.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e8ce218c85e9f4043c37136fc90b41343bdb844fcdc9520f21c000d1d8d49f89", size = 68039113, upload-time = "2025-11-01T03:23:30.619Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a7/aba274bd1e1014cb232ee04548cc3d7aab9b84eb13c44d71b72d189421f9/ray-2.51.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:36feb519f31c52d3b4dbcd68ffb2baf93195ceec06ea711e21559096bab95fed", size = 70340511, upload-time = "2025-11-01T03:23:38.217Z" }, + { url = "https://files.pythonhosted.org/packages/fa/42/a5712f4f8c911ea5b8b3cb406ceef18a1c1bc98490c66fa902cb72391af3/ray-2.51.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:8a21f5914baa3deefcb4fa5f3878e03b589c190b864fe1b80e6dc0cbfba26004", size = 71166513, upload-time = "2025-11-01T03:23:44.123Z" }, + { url = "https://files.pythonhosted.org/packages/91/1e/eeae1da4ffac6eeeeafce2d11c0b6133fd4df1b3e53bc44d61c30c05b6d9/ray-2.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:a82417b89260ed751a76e9cfaef6d11392ab0da464cde1a9d07a0bb7dc272a7b", size = 26695587, upload-time = "2025-11-01T03:23:49.739Z" }, + { url = "https://files.pythonhosted.org/packages/43/66/f1e11291d9fdf0634ea763cfb167cf449773d13918bb04390e6263b7129b/ray-2.51.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bd8211fc033be1bce9c039e474e97a9077be593020978fdcfba1d770bdc40ba5", size = 68043927, upload-time = "2025-11-01T03:23:59.655Z" }, + { url = "https://files.pythonhosted.org/packages/be/89/9a11d0addbba6143f5a34929ed1fdef51159328b9b76a877c0c7f98b2848/ray-2.51.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d2d7c8af45441ff50bc002352d31e0afec5c85dd5075bf527027178931497bce", size = 70460551, upload-time = "2025-11-01T03:24:05.77Z" }, + { url = "https://files.pythonhosted.org/packages/f7/67/40a8d63e4cb3ff1a1a5a12db77ca655e21cb13f10e024a9513f24ed11d98/ray-2.51.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:dd353010d2548bc345e46c45795f70291bb460c236aa6a3393b51a9cd861b56f", size = 71280610, upload-time = "2025-11-01T03:24:11.981Z" }, + { url = "https://files.pythonhosted.org/packages/62/97/90bcfed6b8c986f9ea24def19bbb81480575dd5fa87630eeaa4c92652507/ray-2.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:606c6e0733eb18fc307c9645ea84ccbd1aad8a5ba8bad764bed54b94e926d33c", size = 26691238, upload-time = "2025-11-01T03:24:16.978Z" }, + { url = "https://files.pythonhosted.org/packages/f6/95/51e44ce79e42f02ca1c4d4c5501e6dd49f3a384c5f6324aceb4e0015988a/ray-2.51.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ef847b025ca758baee4571a1ca001d973897cad772f8e95d7f303d24c38b649e", size = 68029226, upload-time = "2025-11-01T03:24:21.928Z" }, + { url = "https://files.pythonhosted.org/packages/e2/b5/a93e39e131067edb7cba3385a609f61aaaf7aa54728cd3a7474bfbf3b0fc/ray-2.51.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0bed9408712bad1511e65683a455302f88d94e5e5cb6a58cc4a154b61d8a0b4a", size = 70502423, upload-time = "2025-11-01T03:24:27.398Z" }, + { url = "https://files.pythonhosted.org/packages/ee/59/69b7a653ed8176fc7fd894d462ed34bb1477e7fa71700324de99179b5b7e/ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4e786da7862cf73664977d0212a505d6d5a585beadf63e7dc1e1c129259bee20", size = 71353730, upload-time = "2025-11-01T03:24:33.495Z" }, + { url = "https://files.pythonhosted.org/packages/38/91/0c4fe7aed34baa14d9c050c88f39ff16083d555bd6dcd6c4ffb4332a6f8a/ray-2.51.1-cp312-cp312-win_amd64.whl", hash = "sha256:198fda93074a6863555f4003e9013bb2ba0cd50b59b18c02affdc294b28a2eef", size = 26674921, upload-time = "2025-11-01T03:24:38.394Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/3ebf7277d8ae5f99150a5890bff4bdc627021e3a1be7caacd075d2996c7a/ray-2.51.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d81547886435142dbd79bff1d4e4edf578a5f20e3b11bbd4ced49cfafbd37d27", size = 67974221, upload-time = "2025-11-01T03:24:44.118Z" }, + { url = "https://files.pythonhosted.org/packages/f6/47/13ba6c4d0e97aff94dcf8537f2832d1101c2080a0aea5c973a4de1d4d8bd/ray-2.51.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:3f2bd2acf9b7f4738c17d08592caaad26eafb7a4fc380ad9ab42d5f0a78f73ad", size = 70410610, upload-time = "2025-11-01T03:24:50.075Z" }, + { url = "https://files.pythonhosted.org/packages/ac/87/3cdf6d0504659d8192baa6576dd7a17ea395a4d969010274f7cc0e894281/ray-2.51.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:265ecd6fd6d4a695b09c686e17d58fca0c09e7198c073628ae7bf4974b03e9ca", size = 71269225, upload-time = "2025-11-01T03:24:55.929Z" }, ] [[package]] @@ -4801,124 +4718,124 @@ wheels = [ [[package]] name = "rpds-py" -version = "0.28.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/48/dc/95f074d43452b3ef5d06276696ece4b3b5d696e7c9ad7173c54b1390cd70/rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea", size = 27419, upload-time = "2025-10-22T22:24:29.327Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/f8/13bb772dc7cbf2c3c5b816febc34fa0cb2c64a08e0569869585684ce6631/rpds_py-0.28.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7b6013db815417eeb56b2d9d7324e64fcd4fa289caeee6e7a78b2e11fc9b438a", size = 362820, upload-time = "2025-10-22T22:21:15.074Z" }, - { url = "https://files.pythonhosted.org/packages/84/91/6acce964aab32469c3dbe792cb041a752d64739c534e9c493c701ef0c032/rpds_py-0.28.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a4c6b05c685c0c03f80dabaeb73e74218c49deea965ca63f76a752807397207", size = 348499, upload-time = "2025-10-22T22:21:17.658Z" }, - { url = "https://files.pythonhosted.org/packages/f1/93/c05bb1f4f5e0234db7c4917cb8dd5e2e0a9a7b26dc74b1b7bee3c9cfd477/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4794c6c3fbe8f9ac87699b131a1f26e7b4abcf6d828da46a3a52648c7930eba", size = 379356, upload-time = "2025-10-22T22:21:19.847Z" }, - { url = "https://files.pythonhosted.org/packages/5c/37/e292da436f0773e319753c567263427cdf6c645d30b44f09463ff8216cda/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e8456b6ee5527112ff2354dd9087b030e3429e43a74f480d4a5ca79d269fd85", size = 390151, upload-time = "2025-10-22T22:21:21.569Z" }, - { url = "https://files.pythonhosted.org/packages/76/87/a4e3267131616e8faf10486dc00eaedf09bd61c87f01e5ef98e782ee06c9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:beb880a9ca0a117415f241f66d56025c02037f7c4efc6fe59b5b8454f1eaa50d", size = 524831, upload-time = "2025-10-22T22:21:23.394Z" }, - { url = "https://files.pythonhosted.org/packages/e1/c8/4a4ca76f0befae9515da3fad11038f0fce44f6bb60b21fe9d9364dd51fb0/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6897bebb118c44b38c9cb62a178e09f1593c949391b9a1a6fe777ccab5934ee7", size = 404687, upload-time = "2025-10-22T22:21:25.201Z" }, - { url = "https://files.pythonhosted.org/packages/6a/65/118afe854424456beafbbebc6b34dcf6d72eae3a08b4632bc4220f8240d9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b553dd06e875249fd43efd727785efb57a53180e0fde321468222eabbeaafa", size = 382683, upload-time = "2025-10-22T22:21:26.536Z" }, - { url = "https://files.pythonhosted.org/packages/f7/bc/0625064041fb3a0c77ecc8878c0e8341b0ae27ad0f00cf8f2b57337a1e63/rpds_py-0.28.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:f0b2044fdddeea5b05df832e50d2a06fe61023acb44d76978e1b060206a8a476", size = 398927, upload-time = "2025-10-22T22:21:27.864Z" }, - { url = "https://files.pythonhosted.org/packages/5d/1a/fed7cf2f1ee8a5e4778f2054153f2cfcf517748875e2f5b21cf8907cd77d/rpds_py-0.28.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05cf1e74900e8da73fa08cc76c74a03345e5a3e37691d07cfe2092d7d8e27b04", size = 411590, upload-time = "2025-10-22T22:21:29.474Z" }, - { url = "https://files.pythonhosted.org/packages/c1/64/a8e0f67fa374a6c472dbb0afdaf1ef744724f165abb6899f20e2f1563137/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:efd489fec7c311dae25e94fe7eeda4b3d06be71c68f2cf2e8ef990ffcd2cd7e8", size = 559843, upload-time = "2025-10-22T22:21:30.917Z" }, - { url = "https://files.pythonhosted.org/packages/a9/ea/e10353f6d7c105be09b8135b72787a65919971ae0330ad97d87e4e199880/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ada7754a10faacd4f26067e62de52d6af93b6d9542f0df73c57b9771eb3ba9c4", size = 584188, upload-time = "2025-10-22T22:21:32.827Z" }, - { url = "https://files.pythonhosted.org/packages/18/b0/a19743e0763caf0c89f6fc6ba6fbd9a353b24ffb4256a492420c5517da5a/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c2a34fd26588949e1e7977cfcbb17a9a42c948c100cab890c6d8d823f0586457", size = 550052, upload-time = "2025-10-22T22:21:34.702Z" }, - { url = "https://files.pythonhosted.org/packages/de/bc/ec2c004f6c7d6ab1e25dae875cdb1aee087c3ebed5b73712ed3000e3851a/rpds_py-0.28.0-cp310-cp310-win32.whl", hash = "sha256:f9174471d6920cbc5e82a7822de8dfd4dcea86eb828b04fc8c6519a77b0ee51e", size = 215110, upload-time = "2025-10-22T22:21:36.645Z" }, - { url = "https://files.pythonhosted.org/packages/6c/de/4ce8abf59674e17187023933547d2018363e8fc76ada4f1d4d22871ccb6e/rpds_py-0.28.0-cp310-cp310-win_amd64.whl", hash = "sha256:6e32dd207e2c4f8475257a3540ab8a93eff997abfa0a3fdb287cae0d6cd874b8", size = 223850, upload-time = "2025-10-22T22:21:38.006Z" }, - { url = "https://files.pythonhosted.org/packages/a6/34/058d0db5471c6be7bef82487ad5021ff8d1d1d27794be8730aad938649cf/rpds_py-0.28.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:03065002fd2e287725d95fbc69688e0c6daf6c6314ba38bdbaa3895418e09296", size = 362344, upload-time = "2025-10-22T22:21:39.713Z" }, - { url = "https://files.pythonhosted.org/packages/5d/67/9503f0ec8c055a0782880f300c50a2b8e5e72eb1f94dfc2053da527444dd/rpds_py-0.28.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28ea02215f262b6d078daec0b45344c89e161eab9526b0d898221d96fdda5f27", size = 348440, upload-time = "2025-10-22T22:21:41.056Z" }, - { url = "https://files.pythonhosted.org/packages/68/2e/94223ee9b32332a41d75b6f94b37b4ce3e93878a556fc5f152cbd856a81f/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dbade8fbf30bcc551cb352376c0ad64b067e4fc56f90e22ba70c3ce205988c", size = 379068, upload-time = "2025-10-22T22:21:42.593Z" }, - { url = "https://files.pythonhosted.org/packages/b4/25/54fd48f9f680cfc44e6a7f39a5fadf1d4a4a1fd0848076af4a43e79f998c/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c03002f54cc855860bfdc3442928ffdca9081e73b5b382ed0b9e8efe6e5e205", size = 390518, upload-time = "2025-10-22T22:21:43.998Z" }, - { url = "https://files.pythonhosted.org/packages/1b/85/ac258c9c27f2ccb1bd5d0697e53a82ebcf8088e3186d5d2bf8498ee7ed44/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9699fa7990368b22032baf2b2dce1f634388e4ffc03dfefaaac79f4695edc95", size = 525319, upload-time = "2025-10-22T22:21:45.645Z" }, - { url = "https://files.pythonhosted.org/packages/40/cb/c6734774789566d46775f193964b76627cd5f42ecf246d257ce84d1912ed/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9b06fe1a75e05e0713f06ea0c89ecb6452210fd60e2f1b6ddc1067b990e08d9", size = 404896, upload-time = "2025-10-22T22:21:47.544Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/14e37ce83202c632c89b0691185dca9532288ff9d390eacae3d2ff771bae/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9f83e7b326a3f9ec3ef84cda98fb0a74c7159f33e692032233046e7fd15da2", size = 382862, upload-time = "2025-10-22T22:21:49.176Z" }, - { url = "https://files.pythonhosted.org/packages/6a/83/f3642483ca971a54d60caa4449f9d6d4dbb56a53e0072d0deff51b38af74/rpds_py-0.28.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:0d3259ea9ad8743a75a43eb7819324cdab393263c91be86e2d1901ee65c314e0", size = 398848, upload-time = "2025-10-22T22:21:51.024Z" }, - { url = "https://files.pythonhosted.org/packages/44/09/2d9c8b2f88e399b4cfe86efdf2935feaf0394e4f14ab30c6c5945d60af7d/rpds_py-0.28.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a7548b345f66f6695943b4ef6afe33ccd3f1b638bd9afd0f730dd255c249c9e", size = 412030, upload-time = "2025-10-22T22:21:52.665Z" }, - { url = "https://files.pythonhosted.org/packages/dd/f5/e1cec473d4bde6df1fd3738be8e82d64dd0600868e76e92dfeaebbc2d18f/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9a40040aa388b037eb39416710fbcce9443498d2eaab0b9b45ae988b53f5c67", size = 559700, upload-time = "2025-10-22T22:21:54.123Z" }, - { url = "https://files.pythonhosted.org/packages/8d/be/73bb241c1649edbf14e98e9e78899c2c5e52bbe47cb64811f44d2cc11808/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f60c7ea34e78c199acd0d3cda37a99be2c861dd2b8cf67399784f70c9f8e57d", size = 584581, upload-time = "2025-10-22T22:21:56.102Z" }, - { url = "https://files.pythonhosted.org/packages/9c/9c/ffc6e9218cd1eb5c2c7dbd276c87cd10e8c2232c456b554169eb363381df/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1571ae4292649100d743b26d5f9c63503bb1fedf538a8f29a98dce2d5ba6b4e6", size = 549981, upload-time = "2025-10-22T22:21:58.253Z" }, - { url = "https://files.pythonhosted.org/packages/5f/50/da8b6d33803a94df0149345ee33e5d91ed4d25fc6517de6a25587eae4133/rpds_py-0.28.0-cp311-cp311-win32.whl", hash = "sha256:5cfa9af45e7c1140af7321fa0bef25b386ee9faa8928c80dc3a5360971a29e8c", size = 214729, upload-time = "2025-10-22T22:21:59.625Z" }, - { url = "https://files.pythonhosted.org/packages/12/fd/b0f48c4c320ee24c8c20df8b44acffb7353991ddf688af01eef5f93d7018/rpds_py-0.28.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd8d86b5d29d1b74100982424ba53e56033dc47720a6de9ba0259cf81d7cecaa", size = 223977, upload-time = "2025-10-22T22:22:01.092Z" }, - { url = "https://files.pythonhosted.org/packages/b4/21/c8e77a2ac66e2ec4e21f18a04b4e9a0417ecf8e61b5eaeaa9360a91713b4/rpds_py-0.28.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e27d3a5709cc2b3e013bf93679a849213c79ae0573f9b894b284b55e729e120", size = 217326, upload-time = "2025-10-22T22:22:02.944Z" }, - { url = "https://files.pythonhosted.org/packages/b8/5c/6c3936495003875fe7b14f90ea812841a08fca50ab26bd840e924097d9c8/rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f", size = 366439, upload-time = "2025-10-22T22:22:04.525Z" }, - { url = "https://files.pythonhosted.org/packages/56/f9/a0f1ca194c50aa29895b442771f036a25b6c41a35e4f35b1a0ea713bedae/rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424", size = 348170, upload-time = "2025-10-22T22:22:06.397Z" }, - { url = "https://files.pythonhosted.org/packages/18/ea/42d243d3a586beb72c77fa5def0487daf827210069a95f36328e869599ea/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628", size = 378838, upload-time = "2025-10-22T22:22:07.932Z" }, - { url = "https://files.pythonhosted.org/packages/e7/78/3de32e18a94791af8f33601402d9d4f39613136398658412a4e0b3047327/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd", size = 393299, upload-time = "2025-10-22T22:22:09.435Z" }, - { url = "https://files.pythonhosted.org/packages/13/7e/4bdb435afb18acea2eb8a25ad56b956f28de7c59f8a1d32827effa0d4514/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e", size = 518000, upload-time = "2025-10-22T22:22:11.326Z" }, - { url = "https://files.pythonhosted.org/packages/31/d0/5f52a656875cdc60498ab035a7a0ac8f399890cc1ee73ebd567bac4e39ae/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a", size = 408746, upload-time = "2025-10-22T22:22:13.143Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cd/49ce51767b879cde77e7ad9fae164ea15dce3616fe591d9ea1df51152706/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84", size = 386379, upload-time = "2025-10-22T22:22:14.602Z" }, - { url = "https://files.pythonhosted.org/packages/6a/99/e4e1e1ee93a98f72fc450e36c0e4d99c35370220e815288e3ecd2ec36a2a/rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66", size = 401280, upload-time = "2025-10-22T22:22:16.063Z" }, - { url = "https://files.pythonhosted.org/packages/61/35/e0c6a57488392a8b319d2200d03dad2b29c0db9996f5662c3b02d0b86c02/rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28", size = 412365, upload-time = "2025-10-22T22:22:17.504Z" }, - { url = "https://files.pythonhosted.org/packages/ff/6a/841337980ea253ec797eb084665436007a1aad0faac1ba097fb906c5f69c/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a", size = 559573, upload-time = "2025-10-22T22:22:19.108Z" }, - { url = "https://files.pythonhosted.org/packages/e7/5e/64826ec58afd4c489731f8b00729c5f6afdb86f1df1df60bfede55d650bb/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5", size = 583973, upload-time = "2025-10-22T22:22:20.768Z" }, - { url = "https://files.pythonhosted.org/packages/b6/ee/44d024b4843f8386a4eeaa4c171b3d31d55f7177c415545fd1a24c249b5d/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c", size = 553800, upload-time = "2025-10-22T22:22:22.25Z" }, - { url = "https://files.pythonhosted.org/packages/7d/89/33e675dccff11a06d4d85dbb4d1865f878d5020cbb69b2c1e7b2d3f82562/rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08", size = 216954, upload-time = "2025-10-22T22:22:24.105Z" }, - { url = "https://files.pythonhosted.org/packages/af/36/45f6ebb3210887e8ee6dbf1bc710ae8400bb417ce165aaf3024b8360d999/rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c", size = 227844, upload-time = "2025-10-22T22:22:25.551Z" }, - { url = "https://files.pythonhosted.org/packages/57/91/f3fb250d7e73de71080f9a221d19bd6a1c1eb0d12a1ea26513f6c1052ad6/rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd", size = 217624, upload-time = "2025-10-22T22:22:26.914Z" }, - { url = "https://files.pythonhosted.org/packages/d3/03/ce566d92611dfac0085c2f4b048cd53ed7c274a5c05974b882a908d540a2/rpds_py-0.28.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e9e184408a0297086f880556b6168fa927d677716f83d3472ea333b42171ee3b", size = 366235, upload-time = "2025-10-22T22:22:28.397Z" }, - { url = "https://files.pythonhosted.org/packages/00/34/1c61da1b25592b86fd285bd7bd8422f4c9d748a7373b46126f9ae792a004/rpds_py-0.28.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:edd267266a9b0448f33dc465a97cfc5d467594b600fe28e7fa2f36450e03053a", size = 348241, upload-time = "2025-10-22T22:22:30.171Z" }, - { url = "https://files.pythonhosted.org/packages/fc/00/ed1e28616848c61c493a067779633ebf4b569eccaacf9ccbdc0e7cba2b9d/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85beb8b3f45e4e32f6802fb6cd6b17f615ef6c6a52f265371fb916fae02814aa", size = 378079, upload-time = "2025-10-22T22:22:31.644Z" }, - { url = "https://files.pythonhosted.org/packages/11/b2/ccb30333a16a470091b6e50289adb4d3ec656fd9951ba8c5e3aaa0746a67/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2412be8d00a1b895f8ad827cc2116455196e20ed994bb704bf138fe91a42724", size = 393151, upload-time = "2025-10-22T22:22:33.453Z" }, - { url = "https://files.pythonhosted.org/packages/8c/d0/73e2217c3ee486d555cb84920597480627d8c0240ff3062005c6cc47773e/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf128350d384b777da0e68796afdcebc2e9f63f0e9f242217754e647f6d32491", size = 517520, upload-time = "2025-10-22T22:22:34.949Z" }, - { url = "https://files.pythonhosted.org/packages/c4/91/23efe81c700427d0841a4ae7ea23e305654381831e6029499fe80be8a071/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2036d09b363aa36695d1cc1a97b36865597f4478470b0697b5ee9403f4fe399", size = 408699, upload-time = "2025-10-22T22:22:36.584Z" }, - { url = "https://files.pythonhosted.org/packages/ca/ee/a324d3198da151820a326c1f988caaa4f37fc27955148a76fff7a2d787a9/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8e1e9be4fa6305a16be628959188e4fd5cd6f1b0e724d63c6d8b2a8adf74ea6", size = 385720, upload-time = "2025-10-22T22:22:38.014Z" }, - { url = "https://files.pythonhosted.org/packages/19/ad/e68120dc05af8b7cab4a789fccd8cdcf0fe7e6581461038cc5c164cd97d2/rpds_py-0.28.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0a403460c9dd91a7f23fc3188de6d8977f1d9603a351d5db6cf20aaea95b538d", size = 401096, upload-time = "2025-10-22T22:22:39.869Z" }, - { url = "https://files.pythonhosted.org/packages/99/90/c1e070620042459d60df6356b666bb1f62198a89d68881816a7ed121595a/rpds_py-0.28.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7366b6553cdc805abcc512b849a519167db8f5e5c3472010cd1228b224265cb", size = 411465, upload-time = "2025-10-22T22:22:41.395Z" }, - { url = "https://files.pythonhosted.org/packages/68/61/7c195b30d57f1b8d5970f600efee72a4fad79ec829057972e13a0370fd24/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b43c6a3726efd50f18d8120ec0551241c38785b68952d240c45ea553912ac41", size = 558832, upload-time = "2025-10-22T22:22:42.871Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3d/06f3a718864773f69941d4deccdf18e5e47dd298b4628062f004c10f3b34/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0cb7203c7bc69d7c1585ebb33a2e6074492d2fc21ad28a7b9d40457ac2a51ab7", size = 583230, upload-time = "2025-10-22T22:22:44.877Z" }, - { url = "https://files.pythonhosted.org/packages/66/df/62fc783781a121e77fee9a21ead0a926f1b652280a33f5956a5e7833ed30/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a52a5169c664dfb495882adc75c304ae1d50df552fbd68e100fdc719dee4ff9", size = 553268, upload-time = "2025-10-22T22:22:46.441Z" }, - { url = "https://files.pythonhosted.org/packages/84/85/d34366e335140a4837902d3dea89b51f087bd6a63c993ebdff59e93ee61d/rpds_py-0.28.0-cp313-cp313-win32.whl", hash = "sha256:2e42456917b6687215b3e606ab46aa6bca040c77af7df9a08a6dcfe8a4d10ca5", size = 217100, upload-time = "2025-10-22T22:22:48.342Z" }, - { url = "https://files.pythonhosted.org/packages/3c/1c/f25a3f3752ad7601476e3eff395fe075e0f7813fbb9862bd67c82440e880/rpds_py-0.28.0-cp313-cp313-win_amd64.whl", hash = "sha256:e0a0311caedc8069d68fc2bf4c9019b58a2d5ce3cd7cb656c845f1615b577e1e", size = 227759, upload-time = "2025-10-22T22:22:50.219Z" }, - { url = "https://files.pythonhosted.org/packages/e0/d6/5f39b42b99615b5bc2f36ab90423ea404830bdfee1c706820943e9a645eb/rpds_py-0.28.0-cp313-cp313-win_arm64.whl", hash = "sha256:04c1b207ab8b581108801528d59ad80aa83bb170b35b0ddffb29c20e411acdc1", size = 217326, upload-time = "2025-10-22T22:22:51.647Z" }, - { url = "https://files.pythonhosted.org/packages/5c/8b/0c69b72d1cee20a63db534be0df271effe715ef6c744fdf1ff23bb2b0b1c/rpds_py-0.28.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f296ea3054e11fc58ad42e850e8b75c62d9a93a9f981ad04b2e5ae7d2186ff9c", size = 355736, upload-time = "2025-10-22T22:22:53.211Z" }, - { url = "https://files.pythonhosted.org/packages/f7/6d/0c2ee773cfb55c31a8514d2cece856dd299170a49babd50dcffb15ddc749/rpds_py-0.28.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5a7306c19b19005ad98468fcefeb7100b19c79fc23a5f24a12e06d91181193fa", size = 342677, upload-time = "2025-10-22T22:22:54.723Z" }, - { url = "https://files.pythonhosted.org/packages/e2/1c/22513ab25a27ea205144414724743e305e8153e6abe81833b5e678650f5a/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d9b86aa501fed9862a443c5c3116f6ead8bc9296185f369277c42542bd646b", size = 371847, upload-time = "2025-10-22T22:22:56.295Z" }, - { url = "https://files.pythonhosted.org/packages/60/07/68e6ccdb4b05115ffe61d31afc94adef1833d3a72f76c9632d4d90d67954/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e5bbc701eff140ba0e872691d573b3d5d30059ea26e5785acba9132d10c8c31d", size = 381800, upload-time = "2025-10-22T22:22:57.808Z" }, - { url = "https://files.pythonhosted.org/packages/73/bf/6d6d15df80781d7f9f368e7c1a00caf764436518c4877fb28b029c4624af/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5690671cd672a45aa8616d7374fdf334a1b9c04a0cac3c854b1136e92374fe", size = 518827, upload-time = "2025-10-22T22:22:59.826Z" }, - { url = "https://files.pythonhosted.org/packages/7b/d3/2decbb2976cc452cbf12a2b0aaac5f1b9dc5dd9d1f7e2509a3ee00421249/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f1d92ecea4fa12f978a367c32a5375a1982834649cdb96539dcdc12e609ab1a", size = 399471, upload-time = "2025-10-22T22:23:01.968Z" }, - { url = "https://files.pythonhosted.org/packages/b1/2c/f30892f9e54bd02e5faca3f6a26d6933c51055e67d54818af90abed9748e/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d252db6b1a78d0a3928b6190156042d54c93660ce4d98290d7b16b5296fb7cc", size = 377578, upload-time = "2025-10-22T22:23:03.52Z" }, - { url = "https://files.pythonhosted.org/packages/f0/5d/3bce97e5534157318f29ac06bf2d279dae2674ec12f7cb9c12739cee64d8/rpds_py-0.28.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d61b355c3275acb825f8777d6c4505f42b5007e357af500939d4a35b19177259", size = 390482, upload-time = "2025-10-22T22:23:05.391Z" }, - { url = "https://files.pythonhosted.org/packages/e3/f0/886bd515ed457b5bd93b166175edb80a0b21a210c10e993392127f1e3931/rpds_py-0.28.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:acbe5e8b1026c0c580d0321c8aae4b0a1e1676861d48d6e8c6586625055b606a", size = 402447, upload-time = "2025-10-22T22:23:06.93Z" }, - { url = "https://files.pythonhosted.org/packages/42/b5/71e8777ac55e6af1f4f1c05b47542a1eaa6c33c1cf0d300dca6a1c6e159a/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8aa23b6f0fc59b85b4c7d89ba2965af274346f738e8d9fc2455763602e62fd5f", size = 552385, upload-time = "2025-10-22T22:23:08.557Z" }, - { url = "https://files.pythonhosted.org/packages/5d/cb/6ca2d70cbda5a8e36605e7788c4aa3bea7c17d71d213465a5a675079b98d/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7b14b0c680286958817c22d76fcbca4800ddacef6f678f3a7c79a1fe7067fe37", size = 575642, upload-time = "2025-10-22T22:23:10.348Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d4/407ad9960ca7856d7b25c96dcbe019270b5ffdd83a561787bc682c797086/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bcf1d210dfee61a6c86551d67ee1031899c0fdbae88b2d44a569995d43797712", size = 544507, upload-time = "2025-10-22T22:23:12.434Z" }, - { url = "https://files.pythonhosted.org/packages/51/31/2f46fe0efcac23fbf5797c6b6b7e1c76f7d60773e525cb65fcbc582ee0f2/rpds_py-0.28.0-cp313-cp313t-win32.whl", hash = "sha256:3aa4dc0fdab4a7029ac63959a3ccf4ed605fee048ba67ce89ca3168da34a1342", size = 205376, upload-time = "2025-10-22T22:23:13.979Z" }, - { url = "https://files.pythonhosted.org/packages/92/e4/15947bda33cbedfc134490a41841ab8870a72a867a03d4969d886f6594a2/rpds_py-0.28.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7b7d9d83c942855e4fdcfa75d4f96f6b9e272d42fffcb72cd4bb2577db2e2907", size = 215907, upload-time = "2025-10-22T22:23:15.5Z" }, - { url = "https://files.pythonhosted.org/packages/08/47/ffe8cd7a6a02833b10623bf765fbb57ce977e9a4318ca0e8cf97e9c3d2b3/rpds_py-0.28.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:dcdcb890b3ada98a03f9f2bb108489cdc7580176cb73b4f2d789e9a1dac1d472", size = 353830, upload-time = "2025-10-22T22:23:17.03Z" }, - { url = "https://files.pythonhosted.org/packages/f9/9f/890f36cbd83a58491d0d91ae0db1702639edb33fb48eeb356f80ecc6b000/rpds_py-0.28.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f274f56a926ba2dc02976ca5b11c32855cbd5925534e57cfe1fda64e04d1add2", size = 341819, upload-time = "2025-10-22T22:23:18.57Z" }, - { url = "https://files.pythonhosted.org/packages/09/e3/921eb109f682aa24fb76207698fbbcf9418738f35a40c21652c29053f23d/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fe0438ac4a29a520ea94c8c7f1754cdd8feb1bc490dfda1bfd990072363d527", size = 373127, upload-time = "2025-10-22T22:23:20.216Z" }, - { url = "https://files.pythonhosted.org/packages/23/13/bce4384d9f8f4989f1a9599c71b7a2d877462e5fd7175e1f69b398f729f4/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a358a32dd3ae50e933347889b6af9a1bdf207ba5d1a3f34e1a38cd3540e6733", size = 382767, upload-time = "2025-10-22T22:23:21.787Z" }, - { url = "https://files.pythonhosted.org/packages/23/e1/579512b2d89a77c64ccef5a0bc46a6ef7f72ae0cf03d4b26dcd52e57ee0a/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e80848a71c78aa328fefaba9c244d588a342c8e03bda518447b624ea64d1ff56", size = 517585, upload-time = "2025-10-22T22:23:23.699Z" }, - { url = "https://files.pythonhosted.org/packages/62/3c/ca704b8d324a2591b0b0adcfcaadf9c862375b11f2f667ac03c61b4fd0a6/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f586db2e209d54fe177e58e0bc4946bea5fb0102f150b1b2f13de03e1f0976f8", size = 399828, upload-time = "2025-10-22T22:23:25.713Z" }, - { url = "https://files.pythonhosted.org/packages/da/37/e84283b9e897e3adc46b4c88bb3f6ec92a43bd4d2f7ef5b13459963b2e9c/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae8ee156d6b586e4292491e885d41483136ab994e719a13458055bec14cf370", size = 375509, upload-time = "2025-10-22T22:23:27.32Z" }, - { url = "https://files.pythonhosted.org/packages/1a/c2/a980beab869d86258bf76ec42dec778ba98151f253a952b02fe36d72b29c/rpds_py-0.28.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:a805e9b3973f7e27f7cab63a6b4f61d90f2e5557cff73b6e97cd5b8540276d3d", size = 392014, upload-time = "2025-10-22T22:23:29.332Z" }, - { url = "https://files.pythonhosted.org/packages/da/b5/b1d3c5f9d3fa5aeef74265f9c64de3c34a0d6d5cd3c81c8b17d5c8f10ed4/rpds_py-0.28.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5d3fd16b6dc89c73a4da0b4ac8b12a7ecc75b2864b95c9e5afed8003cb50a728", size = 402410, upload-time = "2025-10-22T22:23:31.14Z" }, - { url = "https://files.pythonhosted.org/packages/74/ae/cab05ff08dfcc052afc73dcb38cbc765ffc86f94e966f3924cd17492293c/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6796079e5d24fdaba6d49bda28e2c47347e89834678f2bc2c1b4fc1489c0fb01", size = 553593, upload-time = "2025-10-22T22:23:32.834Z" }, - { url = "https://files.pythonhosted.org/packages/70/80/50d5706ea2a9bfc9e9c5f401d91879e7c790c619969369800cde202da214/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:76500820c2af232435cbe215e3324c75b950a027134e044423f59f5b9a1ba515", size = 576925, upload-time = "2025-10-22T22:23:34.47Z" }, - { url = "https://files.pythonhosted.org/packages/ab/12/85a57d7a5855a3b188d024b099fd09c90db55d32a03626d0ed16352413ff/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bbdc5640900a7dbf9dd707fe6388972f5bbd883633eb68b76591044cfe346f7e", size = 542444, upload-time = "2025-10-22T22:23:36.093Z" }, - { url = "https://files.pythonhosted.org/packages/6c/65/10643fb50179509150eb94d558e8837c57ca8b9adc04bd07b98e57b48f8c/rpds_py-0.28.0-cp314-cp314-win32.whl", hash = "sha256:adc8aa88486857d2b35d75f0640b949759f79dc105f50aa2c27816b2e0dd749f", size = 207968, upload-time = "2025-10-22T22:23:37.638Z" }, - { url = "https://files.pythonhosted.org/packages/b4/84/0c11fe4d9aaea784ff4652499e365963222481ac647bcd0251c88af646eb/rpds_py-0.28.0-cp314-cp314-win_amd64.whl", hash = "sha256:66e6fa8e075b58946e76a78e69e1a124a21d9a48a5b4766d15ba5b06869d1fa1", size = 218876, upload-time = "2025-10-22T22:23:39.179Z" }, - { url = "https://files.pythonhosted.org/packages/0f/e0/3ab3b86ded7bb18478392dc3e835f7b754cd446f62f3fc96f4fe2aca78f6/rpds_py-0.28.0-cp314-cp314-win_arm64.whl", hash = "sha256:a6fe887c2c5c59413353b7c0caff25d0e566623501ccfff88957fa438a69377d", size = 212506, upload-time = "2025-10-22T22:23:40.755Z" }, - { url = "https://files.pythonhosted.org/packages/51/ec/d5681bb425226c3501eab50fc30e9d275de20c131869322c8a1729c7b61c/rpds_py-0.28.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7a69df082db13c7070f7b8b1f155fa9e687f1d6aefb7b0e3f7231653b79a067b", size = 355433, upload-time = "2025-10-22T22:23:42.259Z" }, - { url = "https://files.pythonhosted.org/packages/be/ec/568c5e689e1cfb1ea8b875cffea3649260955f677fdd7ddc6176902d04cd/rpds_py-0.28.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b1cde22f2c30ebb049a9e74c5374994157b9b70a16147d332f89c99c5960737a", size = 342601, upload-time = "2025-10-22T22:23:44.372Z" }, - { url = "https://files.pythonhosted.org/packages/32/fe/51ada84d1d2a1d9d8f2c902cfddd0133b4a5eb543196ab5161d1c07ed2ad/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5338742f6ba7a51012ea470bd4dc600a8c713c0c72adaa0977a1b1f4327d6592", size = 372039, upload-time = "2025-10-22T22:23:46.025Z" }, - { url = "https://files.pythonhosted.org/packages/07/c1/60144a2f2620abade1a78e0d91b298ac2d9b91bc08864493fa00451ef06e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1460ebde1bcf6d496d80b191d854adedcc619f84ff17dc1c6d550f58c9efbba", size = 382407, upload-time = "2025-10-22T22:23:48.098Z" }, - { url = "https://files.pythonhosted.org/packages/45/ed/091a7bbdcf4038a60a461df50bc4c82a7ed6d5d5e27649aab61771c17585/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e3eb248f2feba84c692579257a043a7699e28a77d86c77b032c1d9fbb3f0219c", size = 518172, upload-time = "2025-10-22T22:23:50.16Z" }, - { url = "https://files.pythonhosted.org/packages/54/dd/02cc90c2fd9c2ef8016fd7813bfacd1c3a1325633ec8f244c47b449fc868/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3bbba5def70b16cd1c1d7255666aad3b290fbf8d0fe7f9f91abafb73611a91", size = 399020, upload-time = "2025-10-22T22:23:51.81Z" }, - { url = "https://files.pythonhosted.org/packages/ab/81/5d98cc0329bbb911ccecd0b9e19fbf7f3a5de8094b4cda5e71013b2dd77e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3114f4db69ac5a1f32e7e4d1cbbe7c8f9cf8217f78e6e002cedf2d54c2a548ed", size = 377451, upload-time = "2025-10-22T22:23:53.711Z" }, - { url = "https://files.pythonhosted.org/packages/b4/07/4d5bcd49e3dfed2d38e2dcb49ab6615f2ceb9f89f5a372c46dbdebb4e028/rpds_py-0.28.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4b0cb8a906b1a0196b863d460c0222fb8ad0f34041568da5620f9799b83ccf0b", size = 390355, upload-time = "2025-10-22T22:23:55.299Z" }, - { url = "https://files.pythonhosted.org/packages/3f/79/9f14ba9010fee74e4f40bf578735cfcbb91d2e642ffd1abe429bb0b96364/rpds_py-0.28.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf681ac76a60b667106141e11a92a3330890257e6f559ca995fbb5265160b56e", size = 403146, upload-time = "2025-10-22T22:23:56.929Z" }, - { url = "https://files.pythonhosted.org/packages/39/4c/f08283a82ac141331a83a40652830edd3a4a92c34e07e2bbe00baaea2f5f/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1e8ee6413cfc677ce8898d9cde18cc3a60fc2ba756b0dec5b71eb6eb21c49fa1", size = 552656, upload-time = "2025-10-22T22:23:58.62Z" }, - { url = "https://files.pythonhosted.org/packages/61/47/d922fc0666f0dd8e40c33990d055f4cc6ecff6f502c2d01569dbed830f9b/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b3072b16904d0b5572a15eb9d31c1954e0d3227a585fc1351aa9878729099d6c", size = 576782, upload-time = "2025-10-22T22:24:00.312Z" }, - { url = "https://files.pythonhosted.org/packages/d3/0c/5bafdd8ccf6aa9d3bfc630cfece457ff5b581af24f46a9f3590f790e3df2/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b670c30fd87a6aec281c3c9896d3bae4b205fd75d79d06dc87c2503717e46092", size = 544671, upload-time = "2025-10-22T22:24:02.297Z" }, - { url = "https://files.pythonhosted.org/packages/2c/37/dcc5d8397caa924988693519069d0beea077a866128719351a4ad95e82fc/rpds_py-0.28.0-cp314-cp314t-win32.whl", hash = "sha256:8014045a15b4d2b3476f0a287fcc93d4f823472d7d1308d47884ecac9e612be3", size = 205749, upload-time = "2025-10-22T22:24:03.848Z" }, - { url = "https://files.pythonhosted.org/packages/d7/69/64d43b21a10d72b45939a28961216baeb721cc2a430f5f7c3bfa21659a53/rpds_py-0.28.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a4e59c90d9c27c561eb3160323634a9ff50b04e4f7820600a2beb0ac90db578", size = 216233, upload-time = "2025-10-22T22:24:05.471Z" }, - { url = "https://files.pythonhosted.org/packages/ae/bc/b43f2ea505f28119bd551ae75f70be0c803d2dbcd37c1b3734909e40620b/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f5e7101145427087e493b9c9b959da68d357c28c562792300dd21a095118ed16", size = 363913, upload-time = "2025-10-22T22:24:07.129Z" }, - { url = "https://files.pythonhosted.org/packages/28/f2/db318195d324c89a2c57dc5195058cbadd71b20d220685c5bd1da79ee7fe/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:31eb671150b9c62409a888850aaa8e6533635704fe2b78335f9aaf7ff81eec4d", size = 350452, upload-time = "2025-10-22T22:24:08.754Z" }, - { url = "https://files.pythonhosted.org/packages/ae/f2/1391c819b8573a4898cedd6b6c5ec5bc370ce59e5d6bdcebe3c9c1db4588/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b55c1f64482f7d8bd39942f376bfdf2f6aec637ee8c805b5041e14eeb771db", size = 380957, upload-time = "2025-10-22T22:24:10.826Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5c/e5de68ee7eb7248fce93269833d1b329a196d736aefb1a7481d1e99d1222/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24743a7b372e9a76171f6b69c01aedf927e8ac3e16c474d9fe20d552a8cb45c7", size = 391919, upload-time = "2025-10-22T22:24:12.559Z" }, - { url = "https://files.pythonhosted.org/packages/fb/4f/2376336112cbfeb122fd435d608ad8d5041b3aed176f85a3cb32c262eb80/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:389c29045ee8bbb1627ea190b4976a310a295559eaf9f1464a1a6f2bf84dde78", size = 528541, upload-time = "2025-10-22T22:24:14.197Z" }, - { url = "https://files.pythonhosted.org/packages/68/53/5ae232e795853dd20da7225c5dd13a09c0a905b1a655e92bdf8d78a99fd9/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23690b5827e643150cf7b49569679ec13fe9a610a15949ed48b85eb7f98f34ec", size = 405629, upload-time = "2025-10-22T22:24:16.001Z" }, - { url = "https://files.pythonhosted.org/packages/b9/2d/351a3b852b683ca9b6b8b38ed9efb2347596973849ba6c3a0e99877c10aa/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0c9266c26580e7243ad0d72fc3e01d6b33866cfab5084a6da7576bcf1c4f72", size = 384123, upload-time = "2025-10-22T22:24:17.585Z" }, - { url = "https://files.pythonhosted.org/packages/e0/15/870804daa00202728cc91cb8e2385fa9f1f4eb49857c49cfce89e304eae6/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4c6c4db5d73d179746951486df97fd25e92396be07fc29ee8ff9a8f5afbdfb27", size = 400923, upload-time = "2025-10-22T22:24:19.512Z" }, - { url = "https://files.pythonhosted.org/packages/53/25/3706b83c125fa2a0bccceac951de3f76631f6bd0ee4d02a0ed780712ef1b/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3b695a8fa799dd2cfdb4804b37096c5f6dba1ac7f48a7fbf6d0485bcd060316", size = 413767, upload-time = "2025-10-22T22:24:21.316Z" }, - { url = "https://files.pythonhosted.org/packages/ef/f9/ce43dbe62767432273ed2584cef71fef8411bddfb64125d4c19128015018/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:6aa1bfce3f83baf00d9c5fcdbba93a3ab79958b4c7d7d1f55e7fe68c20e63912", size = 561530, upload-time = "2025-10-22T22:24:22.958Z" }, - { url = "https://files.pythonhosted.org/packages/46/c9/ffe77999ed8f81e30713dd38fd9ecaa161f28ec48bb80fa1cd9118399c27/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b0f9dceb221792b3ee6acb5438eb1f02b0cb2c247796a72b016dcc92c6de829", size = 585453, upload-time = "2025-10-22T22:24:24.779Z" }, - { url = "https://files.pythonhosted.org/packages/ed/d2/4a73b18821fd4669762c855fd1f4e80ceb66fb72d71162d14da58444a763/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f", size = 552199, upload-time = "2025-10-22T22:24:26.54Z" }, +version = "0.29.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/33/23b3b3419b6a3e0f559c7c0d2ca8fc1b9448382b25245033788785921332/rpds_py-0.29.0.tar.gz", hash = "sha256:fe55fe686908f50154d1dc599232016e50c243b438c3b7432f24e2895b0e5359", size = 69359, upload-time = "2025-11-16T14:50:39.532Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/7a/c5b2ff381b74bc742768e8d870f26babac4ef256ba160bdbf8d57af56461/rpds_py-0.29.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4ae4b88c6617e1b9e5038ab3fccd7bac0842fdda2b703117b2aa99bc85379113", size = 372385, upload-time = "2025-11-16T14:47:36.287Z" }, + { url = "https://files.pythonhosted.org/packages/28/36/531f1eb4d5bed4a9c150f363a7ec4a98d2dc746151bba5473bc38ee85dec/rpds_py-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7d9128ec9d8cecda6f044001fde4fb71ea7c24325336612ef8179091eb9596b9", size = 362869, upload-time = "2025-11-16T14:47:38.196Z" }, + { url = "https://files.pythonhosted.org/packages/54/df/7e9c0493a2015d9c82807a2d5f023ea9774e27a4c15b33ef1cdb7456138d/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d37812c3da8e06f2bb35b3cf10e4a7b68e776a706c13058997238762b4e07f4f", size = 391582, upload-time = "2025-11-16T14:47:39.746Z" }, + { url = "https://files.pythonhosted.org/packages/15/38/42a981c3592ef46fbd7e17adbf8730cc5ec87e6aa1770c658c44bbb52960/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:66786c3fb1d8de416a7fa8e1cb1ec6ba0a745b2b0eee42f9b7daa26f1a495545", size = 405685, upload-time = "2025-11-16T14:47:41.472Z" }, + { url = "https://files.pythonhosted.org/packages/12/45/628b8c15856c3849c3f52ec6dac93c046ed5faeed4a435af03b70525fd29/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58f5c77f1af888b5fd1876c9a0d9858f6f88a39c9dd7c073a88e57e577da66d", size = 527067, upload-time = "2025-11-16T14:47:43.036Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ba/6b56d09badeabd95098016d72a437d4a0fd82d4672ce92a7607df5d70a42/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:799156ef1f3529ed82c36eb012b5d7a4cf4b6ef556dd7cc192148991d07206ae", size = 412532, upload-time = "2025-11-16T14:47:44.484Z" }, + { url = "https://files.pythonhosted.org/packages/f1/39/2f1f3db92888314b50b8f9641f679188bd24b3665a8cb9923b7201ae8011/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453783477aa4f2d9104c4b59b08c871431647cb7af51b549bbf2d9eb9c827756", size = 392736, upload-time = "2025-11-16T14:47:46.053Z" }, + { url = "https://files.pythonhosted.org/packages/60/43/3c3b1dcd827e50f2ae28786d846b8a351080d8a69a3b49bc10ae44cc39b1/rpds_py-0.29.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:24a7231493e3c4a4b30138b50cca089a598e52c34cf60b2f35cebf62f274fdea", size = 406300, upload-time = "2025-11-16T14:47:47.268Z" }, + { url = "https://files.pythonhosted.org/packages/da/02/bc96021b67f8525e6bcdd68935c4543ada61e1f3dcb067ed037d68b8c6d2/rpds_py-0.29.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7033c1010b1f57bb44d8067e8c25aa6fa2e944dbf46ccc8c92b25043839c3fd2", size = 423641, upload-time = "2025-11-16T14:47:48.878Z" }, + { url = "https://files.pythonhosted.org/packages/38/e9/c435ddb602ced19a80b8277a41371734f33ad3f91cc4ceb4d82596800a3c/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0248b19405422573621172ab8e3a1f29141362d13d9f72bafa2e28ea0cdca5a2", size = 574153, upload-time = "2025-11-16T14:47:50.435Z" }, + { url = "https://files.pythonhosted.org/packages/84/82/dc3c32e1f89ecba8a59600d4cd65fe0ad81b6c636ccdbf6cd177fd6a7bac/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f9f436aee28d13b9ad2c764fc273e0457e37c2e61529a07b928346b219fcde3b", size = 600304, upload-time = "2025-11-16T14:47:51.599Z" }, + { url = "https://files.pythonhosted.org/packages/35/98/785290e0b7142470735dc1b1f68fb33aae29e5296f062c88396eedf796c8/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24a16cb7163933906c62c272de20ea3c228e4542c8c45c1d7dc2b9913e17369a", size = 562211, upload-time = "2025-11-16T14:47:53.094Z" }, + { url = "https://files.pythonhosted.org/packages/30/58/4eeddcb0737c6875f3e30c65dc9d7e7a10dfd5779646a990fa602c6d56c5/rpds_py-0.29.0-cp310-cp310-win32.whl", hash = "sha256:1a409b0310a566bfd1be82119891fefbdce615ccc8aa558aff7835c27988cbef", size = 221803, upload-time = "2025-11-16T14:47:54.404Z" }, + { url = "https://files.pythonhosted.org/packages/54/77/b35a8dbdcbeb32505500547cdafaa9f8863e85f8faac50ef34464ec5a256/rpds_py-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5523b0009e7c3c1263471b69d8da1c7d41b3ecb4cb62ef72be206b92040a950", size = 235530, upload-time = "2025-11-16T14:47:56.061Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/7fb95163a53ab122c74a7c42d2d2f012819af2cf3deb43fb0d5acf45cc1a/rpds_py-0.29.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b9c764a11fd637e0322a488560533112837f5334ffeb48b1be20f6d98a7b437", size = 372344, upload-time = "2025-11-16T14:47:57.279Z" }, + { url = "https://files.pythonhosted.org/packages/b3/45/f3c30084c03b0d0f918cb4c5ae2c20b0a148b51ba2b3f6456765b629bedd/rpds_py-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fd2164d73812026ce970d44c3ebd51e019d2a26a4425a5dcbdfa93a34abc383", size = 363041, upload-time = "2025-11-16T14:47:58.908Z" }, + { url = "https://files.pythonhosted.org/packages/e3/e9/4d044a1662608c47a87cbb37b999d4d5af54c6d6ebdda93a4d8bbf8b2a10/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a097b7f7f7274164566ae90a221fd725363c0e9d243e2e9ed43d195ccc5495c", size = 391775, upload-time = "2025-11-16T14:48:00.197Z" }, + { url = "https://files.pythonhosted.org/packages/50/c9/7616d3ace4e6731aeb6e3cd85123e03aec58e439044e214b9c5c60fd8eb1/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7cdc0490374e31cedefefaa1520d5fe38e82fde8748cbc926e7284574c714d6b", size = 405624, upload-time = "2025-11-16T14:48:01.496Z" }, + { url = "https://files.pythonhosted.org/packages/c2/e2/6d7d6941ca0843609fd2d72c966a438d6f22617baf22d46c3d2156c31350/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89ca2e673ddd5bde9b386da9a0aac0cab0e76f40c8f0aaf0d6311b6bbf2aa311", size = 527894, upload-time = "2025-11-16T14:48:03.167Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f7/aee14dc2db61bb2ae1e3068f134ca9da5f28c586120889a70ff504bb026f/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5d9da3ff5af1ca1249b1adb8ef0573b94c76e6ae880ba1852f033bf429d4588", size = 412720, upload-time = "2025-11-16T14:48:04.413Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e2/2293f236e887c0360c2723d90c00d48dee296406994d6271faf1712e94ec/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8238d1d310283e87376c12f658b61e1ee23a14c0e54c7c0ce953efdbdc72deed", size = 392945, upload-time = "2025-11-16T14:48:06.252Z" }, + { url = "https://files.pythonhosted.org/packages/14/cd/ceea6147acd3bd1fd028d1975228f08ff19d62098078d5ec3eed49703797/rpds_py-0.29.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2d6fb2ad1c36f91c4646989811e84b1ea5e0c3cf9690b826b6e32b7965853a63", size = 406385, upload-time = "2025-11-16T14:48:07.575Z" }, + { url = "https://files.pythonhosted.org/packages/52/36/fe4dead19e45eb77a0524acfdbf51e6cda597b26fc5b6dddbff55fbbb1a5/rpds_py-0.29.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:534dc9df211387547267ccdb42253aa30527482acb38dd9b21c5c115d66a96d2", size = 423943, upload-time = "2025-11-16T14:48:10.175Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7b/4551510803b582fa4abbc8645441a2d15aa0c962c3b21ebb380b7e74f6a1/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d456e64724a075441e4ed648d7f154dc62e9aabff29bcdf723d0c00e9e1d352f", size = 574204, upload-time = "2025-11-16T14:48:11.499Z" }, + { url = "https://files.pythonhosted.org/packages/64/ba/071ccdd7b171e727a6ae079f02c26f75790b41555f12ca8f1151336d2124/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a738f2da2f565989401bd6fd0b15990a4d1523c6d7fe83f300b7e7d17212feca", size = 600587, upload-time = "2025-11-16T14:48:12.822Z" }, + { url = "https://files.pythonhosted.org/packages/03/09/96983d48c8cf5a1e03c7d9cc1f4b48266adfb858ae48c7c2ce978dbba349/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a110e14508fd26fd2e472bb541f37c209409876ba601cf57e739e87d8a53cf95", size = 562287, upload-time = "2025-11-16T14:48:14.108Z" }, + { url = "https://files.pythonhosted.org/packages/40/f0/8c01aaedc0fa92156f0391f39ea93b5952bc0ec56b897763858f95da8168/rpds_py-0.29.0-cp311-cp311-win32.whl", hash = "sha256:923248a56dd8d158389a28934f6f69ebf89f218ef96a6b216a9be6861804d3f4", size = 221394, upload-time = "2025-11-16T14:48:15.374Z" }, + { url = "https://files.pythonhosted.org/packages/7e/a5/a8b21c54c7d234efdc83dc034a4d7cd9668e3613b6316876a29b49dece71/rpds_py-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:539eb77eb043afcc45314d1be09ea6d6cafb3addc73e0547c171c6d636957f60", size = 235713, upload-time = "2025-11-16T14:48:16.636Z" }, + { url = "https://files.pythonhosted.org/packages/a7/1f/df3c56219523947b1be402fa12e6323fe6d61d883cf35d6cb5d5bb6db9d9/rpds_py-0.29.0-cp311-cp311-win_arm64.whl", hash = "sha256:bdb67151ea81fcf02d8f494703fb728d4d34d24556cbff5f417d74f6f5792e7c", size = 229157, upload-time = "2025-11-16T14:48:17.891Z" }, + { url = "https://files.pythonhosted.org/packages/3c/50/bc0e6e736d94e420df79be4deb5c9476b63165c87bb8f19ef75d100d21b3/rpds_py-0.29.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a0891cfd8db43e085c0ab93ab7e9b0c8fee84780d436d3b266b113e51e79f954", size = 376000, upload-time = "2025-11-16T14:48:19.141Z" }, + { url = "https://files.pythonhosted.org/packages/3e/3a/46676277160f014ae95f24de53bed0e3b7ea66c235e7de0b9df7bd5d68ba/rpds_py-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3897924d3f9a0361472d884051f9a2460358f9a45b1d85a39a158d2f8f1ad71c", size = 360575, upload-time = "2025-11-16T14:48:20.443Z" }, + { url = "https://files.pythonhosted.org/packages/75/ba/411d414ed99ea1afdd185bbabeeaac00624bd1e4b22840b5e9967ade6337/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a21deb8e0d1571508c6491ce5ea5e25669b1dd4adf1c9d64b6314842f708b5d", size = 392159, upload-time = "2025-11-16T14:48:22.12Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b1/e18aa3a331f705467a48d0296778dc1fea9d7f6cf675bd261f9a846c7e90/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9efe71687d6427737a0a2de9ca1c0a216510e6cd08925c44162be23ed7bed2d5", size = 410602, upload-time = "2025-11-16T14:48:23.563Z" }, + { url = "https://files.pythonhosted.org/packages/2f/6c/04f27f0c9f2299274c76612ac9d2c36c5048bb2c6c2e52c38c60bf3868d9/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:40f65470919dc189c833e86b2c4bd21bd355f98436a2cef9e0a9a92aebc8e57e", size = 515808, upload-time = "2025-11-16T14:48:24.949Z" }, + { url = "https://files.pythonhosted.org/packages/83/56/a8412aa464fb151f8bc0d91fb0bb888adc9039bd41c1c6ba8d94990d8cf8/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:def48ff59f181130f1a2cb7c517d16328efac3ec03951cca40c1dc2049747e83", size = 416015, upload-time = "2025-11-16T14:48:26.782Z" }, + { url = "https://files.pythonhosted.org/packages/04/4c/f9b8a05faca3d9e0a6397c90d13acb9307c9792b2bff621430c58b1d6e76/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7bd570be92695d89285a4b373006930715b78d96449f686af422debb4d3949", size = 395325, upload-time = "2025-11-16T14:48:28.055Z" }, + { url = "https://files.pythonhosted.org/packages/34/60/869f3bfbf8ed7b54f1ad9a5543e0fdffdd40b5a8f587fe300ee7b4f19340/rpds_py-0.29.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:5a572911cd053137bbff8e3a52d31c5d2dba51d3a67ad902629c70185f3f2181", size = 410160, upload-time = "2025-11-16T14:48:29.338Z" }, + { url = "https://files.pythonhosted.org/packages/91/aa/e5b496334e3aba4fe4c8a80187b89f3c1294c5c36f2a926da74338fa5a73/rpds_py-0.29.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d583d4403bcbf10cffc3ab5cee23d7643fcc960dff85973fd3c2d6c86e8dbb0c", size = 425309, upload-time = "2025-11-16T14:48:30.691Z" }, + { url = "https://files.pythonhosted.org/packages/85/68/4e24a34189751ceb6d66b28f18159922828dd84155876551f7ca5b25f14f/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:070befbb868f257d24c3bb350dbd6e2f645e83731f31264b19d7231dd5c396c7", size = 574644, upload-time = "2025-11-16T14:48:31.964Z" }, + { url = "https://files.pythonhosted.org/packages/8c/cf/474a005ea4ea9c3b4f17b6108b6b13cebfc98ebaff11d6e1b193204b3a93/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fc935f6b20b0c9f919a8ff024739174522abd331978f750a74bb68abd117bd19", size = 601605, upload-time = "2025-11-16T14:48:33.252Z" }, + { url = "https://files.pythonhosted.org/packages/f4/b1/c56f6a9ab8c5f6bb5c65c4b5f8229167a3a525245b0773f2c0896686b64e/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8c5a8ecaa44ce2d8d9d20a68a2483a74c07f05d72e94a4dff88906c8807e77b0", size = 564593, upload-time = "2025-11-16T14:48:34.643Z" }, + { url = "https://files.pythonhosted.org/packages/b3/13/0494cecce4848f68501e0a229432620b4b57022388b071eeff95f3e1e75b/rpds_py-0.29.0-cp312-cp312-win32.whl", hash = "sha256:ba5e1aeaf8dd6d8f6caba1f5539cddda87d511331714b7b5fc908b6cfc3636b7", size = 223853, upload-time = "2025-11-16T14:48:36.419Z" }, + { url = "https://files.pythonhosted.org/packages/1f/6a/51e9aeb444a00cdc520b032a28b07e5f8dc7bc328b57760c53e7f96997b4/rpds_py-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:b5f6134faf54b3cb83375db0f113506f8b7770785be1f95a631e7e2892101977", size = 239895, upload-time = "2025-11-16T14:48:37.956Z" }, + { url = "https://files.pythonhosted.org/packages/d1/d4/8bce56cdad1ab873e3f27cb31c6a51d8f384d66b022b820525b879f8bed1/rpds_py-0.29.0-cp312-cp312-win_arm64.whl", hash = "sha256:b016eddf00dca7944721bf0cd85b6af7f6c4efaf83ee0b37c4133bd39757a8c7", size = 230321, upload-time = "2025-11-16T14:48:39.71Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d9/c5de60d9d371bbb186c3e9bf75f4fc5665e11117a25a06a6b2e0afb7380e/rpds_py-0.29.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1585648d0760b88292eecab5181f5651111a69d90eff35d6b78aa32998886a61", size = 375710, upload-time = "2025-11-16T14:48:41.063Z" }, + { url = "https://files.pythonhosted.org/packages/b3/b3/0860cdd012291dc21272895ce107f1e98e335509ba986dd83d72658b82b9/rpds_py-0.29.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:521807963971a23996ddaf764c682b3e46459b3c58ccd79fefbe16718db43154", size = 360582, upload-time = "2025-11-16T14:48:42.423Z" }, + { url = "https://files.pythonhosted.org/packages/92/8a/a18c2f4a61b3407e56175f6aab6deacdf9d360191a3d6f38566e1eaf7266/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a8896986efaa243ab713c69e6491a4138410f0fe36f2f4c71e18bd5501e8014", size = 391172, upload-time = "2025-11-16T14:48:43.75Z" }, + { url = "https://files.pythonhosted.org/packages/fd/49/e93354258508c50abc15cdcd5fcf7ac4117f67bb6233ad7859f75e7372a0/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d24564a700ef41480a984c5ebed62b74e6ce5860429b98b1fede76049e953e6", size = 409586, upload-time = "2025-11-16T14:48:45.498Z" }, + { url = "https://files.pythonhosted.org/packages/5a/8d/a27860dae1c19a6bdc901f90c81f0d581df1943355802961a57cdb5b6cd1/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6596b93c010d386ae46c9fba9bfc9fc5965fa8228edeac51576299182c2e31c", size = 516339, upload-time = "2025-11-16T14:48:47.308Z" }, + { url = "https://files.pythonhosted.org/packages/fc/ad/a75e603161e79b7110c647163d130872b271c6b28712c803c65d492100f7/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5cc58aac218826d054c7da7f95821eba94125d88be673ff44267bb89d12a5866", size = 416201, upload-time = "2025-11-16T14:48:48.615Z" }, + { url = "https://files.pythonhosted.org/packages/b9/42/555b4ee17508beafac135c8b450816ace5a96194ce97fefc49d58e5652ea/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de73e40ebc04dd5d9556f50180395322193a78ec247e637e741c1b954810f295", size = 395095, upload-time = "2025-11-16T14:48:50.027Z" }, + { url = "https://files.pythonhosted.org/packages/cd/f0/c90b671b9031e800ec45112be42ea9f027f94f9ac25faaac8770596a16a1/rpds_py-0.29.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:295ce5ac7f0cf69a651ea75c8f76d02a31f98e5698e82a50a5f4d4982fbbae3b", size = 410077, upload-time = "2025-11-16T14:48:51.515Z" }, + { url = "https://files.pythonhosted.org/packages/3d/80/9af8b640b81fe21e6f718e9dec36c0b5f670332747243130a5490f292245/rpds_py-0.29.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1ea59b23ea931d494459c8338056fe7d93458c0bf3ecc061cd03916505369d55", size = 424548, upload-time = "2025-11-16T14:48:53.237Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0b/b5647446e991736e6a495ef510e6710df91e880575a586e763baeb0aa770/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f49d41559cebd608042fdcf54ba597a4a7555b49ad5c1c0c03e0af82692661cd", size = 573661, upload-time = "2025-11-16T14:48:54.769Z" }, + { url = "https://files.pythonhosted.org/packages/f7/b3/1b1c9576839ff583d1428efbf59f9ee70498d8ce6c0b328ac02f1e470879/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:05a2bd42768ea988294ca328206efbcc66e220d2d9b7836ee5712c07ad6340ea", size = 600937, upload-time = "2025-11-16T14:48:56.247Z" }, + { url = "https://files.pythonhosted.org/packages/6c/7b/b6cfca2f9fee4c4494ce54f7fb1b9f578867495a9aa9fc0d44f5f735c8e0/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:33ca7bdfedd83339ca55da3a5e1527ee5870d4b8369456b5777b197756f3ca22", size = 564496, upload-time = "2025-11-16T14:48:57.691Z" }, + { url = "https://files.pythonhosted.org/packages/b9/fb/ba29ec7f0f06eb801bac5a23057a9ff7670623b5e8013bd59bec4aa09de8/rpds_py-0.29.0-cp313-cp313-win32.whl", hash = "sha256:20c51ae86a0bb9accc9ad4e6cdeec58d5ebb7f1b09dd4466331fc65e1766aae7", size = 223126, upload-time = "2025-11-16T14:48:59.058Z" }, + { url = "https://files.pythonhosted.org/packages/3c/6b/0229d3bed4ddaa409e6d90b0ae967ed4380e4bdd0dad6e59b92c17d42457/rpds_py-0.29.0-cp313-cp313-win_amd64.whl", hash = "sha256:6410e66f02803600edb0b1889541f4b5cc298a5ccda0ad789cc50ef23b54813e", size = 239771, upload-time = "2025-11-16T14:49:00.872Z" }, + { url = "https://files.pythonhosted.org/packages/e4/38/d2868f058b164f8efd89754d85d7b1c08b454f5c07ac2e6cc2e9bd4bd05b/rpds_py-0.29.0-cp313-cp313-win_arm64.whl", hash = "sha256:56838e1cd9174dc23c5691ee29f1d1be9eab357f27efef6bded1328b23e1ced2", size = 229994, upload-time = "2025-11-16T14:49:02.673Z" }, + { url = "https://files.pythonhosted.org/packages/52/91/5de91c5ec7d41759beec9b251630824dbb8e32d20c3756da1a9a9d309709/rpds_py-0.29.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:37d94eadf764d16b9a04307f2ab1d7af6dc28774bbe0535c9323101e14877b4c", size = 365886, upload-time = "2025-11-16T14:49:04.133Z" }, + { url = "https://files.pythonhosted.org/packages/85/7c/415d8c1b016d5f47ecec5145d9d6d21002d39dce8761b30f6c88810b455a/rpds_py-0.29.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d472cf73efe5726a067dce63eebe8215b14beabea7c12606fd9994267b3cfe2b", size = 355262, upload-time = "2025-11-16T14:49:05.543Z" }, + { url = "https://files.pythonhosted.org/packages/3d/14/bf83e2daa4f980e4dc848aed9299792a8b84af95e12541d9e7562f84a6ef/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72fdfd5ff8992e4636621826371e3ac5f3e3b8323e9d0e48378e9c13c3dac9d0", size = 384826, upload-time = "2025-11-16T14:49:07.301Z" }, + { url = "https://files.pythonhosted.org/packages/33/b8/53330c50a810ae22b4fbba5e6cf961b68b9d72d9bd6780a7c0a79b070857/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2549d833abdf8275c901313b9e8ff8fba57e50f6a495035a2a4e30621a2f7cc4", size = 394234, upload-time = "2025-11-16T14:49:08.782Z" }, + { url = "https://files.pythonhosted.org/packages/cc/32/01e2e9645cef0e584f518cfde4567563e57db2257244632b603f61b40e50/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4448dad428f28a6a767c3e3b80cde3446a22a0efbddaa2360f4bb4dc836d0688", size = 520008, upload-time = "2025-11-16T14:49:10.253Z" }, + { url = "https://files.pythonhosted.org/packages/98/c3/0d1b95a81affae2b10f950782e33a1fd2edd6ce2a479966cac98c9a66f57/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:115f48170fd4296a33938d8c11f697f5f26e0472e43d28f35624764173a60e4d", size = 409569, upload-time = "2025-11-16T14:49:12.478Z" }, + { url = "https://files.pythonhosted.org/packages/fa/60/aa3b8678f3f009f675b99174fa2754302a7fbfe749162e8043d111de2d88/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e5bb73ffc029820f4348e9b66b3027493ae00bca6629129cd433fd7a76308ee", size = 385188, upload-time = "2025-11-16T14:49:13.88Z" }, + { url = "https://files.pythonhosted.org/packages/92/02/5546c1c8aa89c18d40c1fcffdcc957ba730dee53fb7c3ca3a46f114761d2/rpds_py-0.29.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b1581fcde18fcdf42ea2403a16a6b646f8eb1e58d7f90a0ce693da441f76942e", size = 398587, upload-time = "2025-11-16T14:49:15.339Z" }, + { url = "https://files.pythonhosted.org/packages/6c/e0/ad6eeaf47e236eba052fa34c4073078b9e092bd44da6bbb35aaae9580669/rpds_py-0.29.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16e9da2bda9eb17ea318b4c335ec9ac1818e88922cbe03a5743ea0da9ecf74fb", size = 416641, upload-time = "2025-11-16T14:49:16.832Z" }, + { url = "https://files.pythonhosted.org/packages/1a/93/0acedfd50ad9cdd3879c615a6dc8c5f1ce78d2fdf8b87727468bb5bb4077/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:28fd300326dd21198f311534bdb6d7e989dd09b3418b3a91d54a0f384c700967", size = 566683, upload-time = "2025-11-16T14:49:18.342Z" }, + { url = "https://files.pythonhosted.org/packages/62/53/8c64e0f340a9e801459fc6456821abc15b3582cb5dc3932d48705a9d9ac7/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2aba991e041d031c7939e1358f583ae405a7bf04804ca806b97a5c0e0af1ea5e", size = 592730, upload-time = "2025-11-16T14:49:19.767Z" }, + { url = "https://files.pythonhosted.org/packages/85/ef/3109b6584f8c4b0d2490747c916df833c127ecfa82be04d9a40a376f2090/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f437026dbbc3f08c99cc41a5b2570c6e1a1ddbe48ab19a9b814254128d4ea7a", size = 557361, upload-time = "2025-11-16T14:49:21.574Z" }, + { url = "https://files.pythonhosted.org/packages/ff/3b/61586475e82d57f01da2c16edb9115a618afe00ce86fe1b58936880b15af/rpds_py-0.29.0-cp313-cp313t-win32.whl", hash = "sha256:6e97846e9800a5d0fe7be4d008f0c93d0feeb2700da7b1f7528dabafb31dfadb", size = 211227, upload-time = "2025-11-16T14:49:23.03Z" }, + { url = "https://files.pythonhosted.org/packages/3b/3a/12dc43f13594a54ea0c9d7e9d43002116557330e3ad45bc56097ddf266e2/rpds_py-0.29.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f49196aec7c4b406495f60e6f947ad71f317a765f956d74bbd83996b9edc0352", size = 225248, upload-time = "2025-11-16T14:49:24.841Z" }, + { url = "https://files.pythonhosted.org/packages/89/b1/0b1474e7899371d9540d3bbb2a499a3427ae1fc39c998563fe9035a1073b/rpds_py-0.29.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:394d27e4453d3b4d82bb85665dc1fcf4b0badc30fc84282defed71643b50e1a1", size = 363731, upload-time = "2025-11-16T14:49:26.683Z" }, + { url = "https://files.pythonhosted.org/packages/28/12/3b7cf2068d0a334ed1d7b385a9c3c8509f4c2bcba3d4648ea71369de0881/rpds_py-0.29.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:55d827b2ae95425d3be9bc9a5838b6c29d664924f98146557f7715e331d06df8", size = 354343, upload-time = "2025-11-16T14:49:28.24Z" }, + { url = "https://files.pythonhosted.org/packages/eb/73/5afcf8924bc02a749416eda64e17ac9c9b28f825f4737385295a0e99b0c1/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc31a07ed352e5462d3ee1b22e89285f4ce97d5266f6d1169da1142e78045626", size = 385406, upload-time = "2025-11-16T14:49:29.943Z" }, + { url = "https://files.pythonhosted.org/packages/c8/37/5db736730662508535221737a21563591b6f43c77f2e388951c42f143242/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c4695dd224212f6105db7ea62197144230b808d6b2bba52238906a2762f1d1e7", size = 396162, upload-time = "2025-11-16T14:49:31.833Z" }, + { url = "https://files.pythonhosted.org/packages/70/0d/491c1017d14f62ce7bac07c32768d209a50ec567d76d9f383b4cfad19b80/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcae1770b401167f8b9e1e3f566562e6966ffa9ce63639916248a9e25fa8a244", size = 517719, upload-time = "2025-11-16T14:49:33.804Z" }, + { url = "https://files.pythonhosted.org/packages/d7/25/b11132afcb17cd5d82db173f0c8dab270ffdfaba43e5ce7a591837ae9649/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:90f30d15f45048448b8da21c41703b31c61119c06c216a1bf8c245812a0f0c17", size = 409498, upload-time = "2025-11-16T14:49:35.222Z" }, + { url = "https://files.pythonhosted.org/packages/0f/7d/e6543cedfb2e6403a1845710a5ab0e0ccf8fc288e0b5af9a70bfe2c12053/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a91e0ab77bdc0004b43261a4b8cd6d6b451e8d443754cfda830002b5745b32", size = 382743, upload-time = "2025-11-16T14:49:36.704Z" }, + { url = "https://files.pythonhosted.org/packages/75/11/a4ebc9f654293ae9fefb83b2b6be7f3253e85ea42a5db2f77d50ad19aaeb/rpds_py-0.29.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:4aa195e5804d32c682e453b34474f411ca108e4291c6a0f824ebdc30a91c973c", size = 400317, upload-time = "2025-11-16T14:49:39.132Z" }, + { url = "https://files.pythonhosted.org/packages/52/18/97677a60a81c7f0e5f64e51fb3f8271c5c8fcabf3a2df18e97af53d7c2bf/rpds_py-0.29.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7971bdb7bf4ee0f7e6f67fa4c7fbc6019d9850cc977d126904392d363f6f8318", size = 416979, upload-time = "2025-11-16T14:49:40.575Z" }, + { url = "https://files.pythonhosted.org/packages/f0/69/28ab391a9968f6c746b2a2db181eaa4d16afaa859fedc9c2f682d19f7e18/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8ae33ad9ce580c7a47452c3b3f7d8a9095ef6208e0a0c7e4e2384f9fc5bf8212", size = 567288, upload-time = "2025-11-16T14:49:42.24Z" }, + { url = "https://files.pythonhosted.org/packages/3b/d3/0c7afdcdb830eee94f5611b64e71354ffe6ac8df82d00c2faf2bfffd1d4e/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c661132ab2fb4eeede2ef69670fd60da5235209874d001a98f1542f31f2a8a94", size = 593157, upload-time = "2025-11-16T14:49:43.782Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ac/a0fcbc2feed4241cf26d32268c195eb88ddd4bd862adfc9d4b25edfba535/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bb78b3a0d31ac1bde132c67015a809948db751cb4e92cdb3f0b242e430b6ed0d", size = 554741, upload-time = "2025-11-16T14:49:45.557Z" }, + { url = "https://files.pythonhosted.org/packages/0f/f1/fcc24137c470df8588674a677f33719d5800ec053aaacd1de8a5d5d84d9e/rpds_py-0.29.0-cp314-cp314-win32.whl", hash = "sha256:f475f103488312e9bd4000bc890a95955a07b2d0b6e8884aef4be56132adbbf1", size = 215508, upload-time = "2025-11-16T14:49:47.562Z" }, + { url = "https://files.pythonhosted.org/packages/7b/c7/1d169b2045512eac019918fc1021ea07c30e84a4343f9f344e3e0aa8c788/rpds_py-0.29.0-cp314-cp314-win_amd64.whl", hash = "sha256:b9cf2359a4fca87cfb6801fae83a76aedf66ee1254a7a151f1341632acf67f1b", size = 228125, upload-time = "2025-11-16T14:49:49.064Z" }, + { url = "https://files.pythonhosted.org/packages/be/36/0cec88aaba70ec4a6e381c444b0d916738497d27f0c30406e3d9fcbd3bc2/rpds_py-0.29.0-cp314-cp314-win_arm64.whl", hash = "sha256:9ba8028597e824854f0f1733d8b964e914ae3003b22a10c2c664cb6927e0feb9", size = 221992, upload-time = "2025-11-16T14:49:50.777Z" }, + { url = "https://files.pythonhosted.org/packages/b1/fa/a2e524631717c9c0eb5d90d30f648cfba6b731047821c994acacb618406c/rpds_py-0.29.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:e71136fd0612556b35c575dc2726ae04a1669e6a6c378f2240312cf5d1a2ab10", size = 366425, upload-time = "2025-11-16T14:49:52.691Z" }, + { url = "https://files.pythonhosted.org/packages/a2/a4/6d43ebe0746ff694a30233f63f454aed1677bd50ab7a59ff6b2bb5ac61f2/rpds_py-0.29.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:76fe96632d53f3bf0ea31ede2f53bbe3540cc2736d4aec3b3801b0458499ef3a", size = 355282, upload-time = "2025-11-16T14:49:54.292Z" }, + { url = "https://files.pythonhosted.org/packages/fa/a7/52fd8270e0320b09eaf295766ae81dd175f65394687906709b3e75c71d06/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9459a33f077130dbb2c7c3cea72ee9932271fb3126404ba2a2661e4fe9eb7b79", size = 384968, upload-time = "2025-11-16T14:49:55.857Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7d/e6bc526b7a14e1ef80579a52c1d4ad39260a058a51d66c6039035d14db9d/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c9546cfdd5d45e562cc0444b6dddc191e625c62e866bf567a2c69487c7ad28a", size = 394714, upload-time = "2025-11-16T14:49:57.343Z" }, + { url = "https://files.pythonhosted.org/packages/c0/3f/f0ade3954e7db95c791e7eaf978aa7e08a756d2046e8bdd04d08146ed188/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12597d11d97b8f7e376c88929a6e17acb980e234547c92992f9f7c058f1a7310", size = 520136, upload-time = "2025-11-16T14:49:59.162Z" }, + { url = "https://files.pythonhosted.org/packages/87/b3/07122ead1b97009715ab9d4082be6d9bd9546099b2b03fae37c3116f72be/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28de03cf48b8a9e6ec10318f2197b83946ed91e2891f651a109611be4106ac4b", size = 409250, upload-time = "2025-11-16T14:50:00.698Z" }, + { url = "https://files.pythonhosted.org/packages/c9/c6/dcbee61fd1dc892aedcb1b489ba661313101aa82ec84b1a015d4c63ebfda/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7951c964069039acc9d67a8ff1f0a7f34845ae180ca542b17dc1456b1f1808", size = 384940, upload-time = "2025-11-16T14:50:02.312Z" }, + { url = "https://files.pythonhosted.org/packages/47/11/914ecb6f3574cf9bf8b38aced4063e0f787d6e1eb30b181a7efbc6c1da9a/rpds_py-0.29.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:c07d107b7316088f1ac0177a7661ca0c6670d443f6fe72e836069025e6266761", size = 399392, upload-time = "2025-11-16T14:50:03.829Z" }, + { url = "https://files.pythonhosted.org/packages/f5/fd/2f4bd9433f58f816434bb934313584caa47dbc6f03ce5484df8ac8980561/rpds_py-0.29.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1de2345af363d25696969befc0c1688a6cb5e8b1d32b515ef84fc245c6cddba3", size = 416796, upload-time = "2025-11-16T14:50:05.558Z" }, + { url = "https://files.pythonhosted.org/packages/79/a5/449f0281af33efa29d5c71014399d74842342ae908d8cd38260320167692/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:00e56b12d2199ca96068057e1ae7f9998ab6e99cda82431afafd32f3ec98cca9", size = 566843, upload-time = "2025-11-16T14:50:07.243Z" }, + { url = "https://files.pythonhosted.org/packages/ab/32/0a6a1ccee2e37fcb1b7ba9afde762b77182dbb57937352a729c6cd3cf2bb/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3919a3bbecee589300ed25000b6944174e07cd20db70552159207b3f4bbb45b8", size = 593956, upload-time = "2025-11-16T14:50:09.029Z" }, + { url = "https://files.pythonhosted.org/packages/4a/3d/eb820f95dce4306f07a495ede02fb61bef36ea201d9137d4fcd5ab94ec1e/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e7fa2ccc312bbd91e43aa5e0869e46bc03278a3dddb8d58833150a18b0f0283a", size = 557288, upload-time = "2025-11-16T14:50:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/e9/f8/b8ff786f40470462a252918e0836e0db903c28e88e3eec66bc4a7856ee5d/rpds_py-0.29.0-cp314-cp314t-win32.whl", hash = "sha256:97c817863ffc397f1e6a6e9d2d89fe5408c0a9922dac0329672fb0f35c867ea5", size = 211382, upload-time = "2025-11-16T14:50:12.827Z" }, + { url = "https://files.pythonhosted.org/packages/c9/7f/1a65ae870bc9d0576aebb0c501ea5dccf1ae2178fe2821042150ebd2e707/rpds_py-0.29.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2023473f444752f0f82a58dfcbee040d0a1b3d1b3c2ec40e884bd25db6d117d2", size = 225919, upload-time = "2025-11-16T14:50:14.734Z" }, + { url = "https://files.pythonhosted.org/packages/f2/ac/b97e80bf107159e5b9ba9c91df1ab95f69e5e41b435f27bdd737f0d583ac/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:acd82a9e39082dc5f4492d15a6b6c8599aa21db5c35aaf7d6889aea16502c07d", size = 373963, upload-time = "2025-11-16T14:50:16.205Z" }, + { url = "https://files.pythonhosted.org/packages/40/5a/55e72962d5d29bd912f40c594e68880d3c7a52774b0f75542775f9250712/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:715b67eac317bf1c7657508170a3e011a1ea6ccb1c9d5f296e20ba14196be6b3", size = 364644, upload-time = "2025-11-16T14:50:18.22Z" }, + { url = "https://files.pythonhosted.org/packages/99/2a/6b6524d0191b7fc1351c3c0840baac42250515afb48ae40c7ed15499a6a2/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b1b87a237cb2dba4db18bcfaaa44ba4cd5936b91121b62292ff21df577fc43", size = 393847, upload-time = "2025-11-16T14:50:20.012Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b8/c5692a7df577b3c0c7faed7ac01ee3c608b81750fc5d89f84529229b6873/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c3c3e8101bb06e337c88eb0c0ede3187131f19d97d43ea0e1c5407ea74c0cbf", size = 407281, upload-time = "2025-11-16T14:50:21.64Z" }, + { url = "https://files.pythonhosted.org/packages/f0/57/0546c6f84031b7ea08b76646a8e33e45607cc6bd879ff1917dc077bb881e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8e54d6e61f3ecd3abe032065ce83ea63417a24f437e4a3d73d2f85ce7b7cfe", size = 529213, upload-time = "2025-11-16T14:50:23.219Z" }, + { url = "https://files.pythonhosted.org/packages/fa/c1/01dd5f444233605555bc11fe5fed6a5c18f379f02013870c176c8e630a23/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3fbd4e9aebf110473a420dea85a238b254cf8a15acb04b22a5a6b5ce8925b760", size = 413808, upload-time = "2025-11-16T14:50:25.262Z" }, + { url = "https://files.pythonhosted.org/packages/aa/0a/60f98b06156ea2a7af849fb148e00fbcfdb540909a5174a5ed10c93745c7/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fdf53d36e6c72819993e35d1ebeeb8e8fc688d0c6c2b391b55e335b3afba5a", size = 394600, upload-time = "2025-11-16T14:50:26.956Z" }, + { url = "https://files.pythonhosted.org/packages/37/f1/dc9312fc9bec040ece08396429f2bd9e0977924ba7a11c5ad7056428465e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:ea7173df5d86f625f8dde6d5929629ad811ed8decda3b60ae603903839ac9ac0", size = 408634, upload-time = "2025-11-16T14:50:28.989Z" }, + { url = "https://files.pythonhosted.org/packages/ed/41/65024c9fd40c89bb7d604cf73beda4cbdbcebe92d8765345dd65855b6449/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:76054d540061eda273274f3d13a21a4abdde90e13eaefdc205db37c05230efce", size = 426064, upload-time = "2025-11-16T14:50:30.674Z" }, + { url = "https://files.pythonhosted.org/packages/a2/e0/cf95478881fc88ca2fdbf56381d7df36567cccc39a05394beac72182cd62/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:9f84c549746a5be3bc7415830747a3a0312573afc9f95785eb35228bb17742ec", size = 575871, upload-time = "2025-11-16T14:50:33.428Z" }, + { url = "https://files.pythonhosted.org/packages/ea/c0/df88097e64339a0218b57bd5f9ca49898e4c394db756c67fccc64add850a/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:0ea962671af5cb9a260489e311fa22b2e97103e3f9f0caaea6f81390af96a9ed", size = 601702, upload-time = "2025-11-16T14:50:36.051Z" }, + { url = "https://files.pythonhosted.org/packages/87/f4/09ffb3ebd0cbb9e2c7c9b84d252557ecf434cd71584ee1e32f66013824df/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:f7728653900035fb7b8d06e1e5900545d8088efc9d5d4545782da7df03ec803f", size = 564054, upload-time = "2025-11-16T14:50:37.733Z" }, ] [[package]] @@ -4962,24 +4879,28 @@ wheels = [ [[package]] name = "safetensors" -version = "0.6.2" +version = "0.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, - { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, - { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, - { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, - { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, - { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, - { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, - { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, - { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, - { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, - { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, + { url = "https://files.pythonhosted.org/packages/a7/6a/4d08d89a6fcbe905c5ae68b8b34f0791850882fc19782d0d02c65abbdf3b/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4729811a6640d019a4b7ba8638ee2fd21fa5ca8c7e7bdf0fed62068fcaac737", size = 492430, upload-time = "2025-11-19T15:18:11.884Z" }, + { url = "https://files.pythonhosted.org/packages/dd/29/59ed8152b30f72c42d00d241e58eaca558ae9dbfa5695206e2e0f54c7063/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12f49080303fa6bb424b362149a12949dfbbf1e06811a88f2307276b0c131afd", size = 503977, upload-time = "2025-11-19T15:18:17.523Z" }, + { url = "https://files.pythonhosted.org/packages/d3/0b/4811bfec67fa260e791369b16dab105e4bae82686120554cc484064e22b4/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0071bffba4150c2f46cae1432d31995d77acfd9f8db598b5d1a2ce67e8440ad2", size = 623890, upload-time = "2025-11-19T15:18:22.666Z" }, + { url = "https://files.pythonhosted.org/packages/58/5b/632a58724221ef03d78ab65062e82a1010e1bef8e8e0b9d7c6d7b8044841/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473b32699f4200e69801bf5abf93f1a4ecd432a70984df164fc22ccf39c4a6f3", size = 531885, upload-time = "2025-11-19T15:18:27.146Z" }, ] [[package]] @@ -4991,7 +4912,7 @@ resolution-markers = [ "python_full_version < '3.11' and sys_platform != 'linux'", ] dependencies = [ - { name = "numpy", marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" } wheels = [ @@ -5047,21 +4968,17 @@ name = "scipy" version = "1.16.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", "python_full_version == '3.12.*' and sys_platform != 'linux'", "python_full_version == '3.11.*' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform != 'linux'", ] dependencies = [ - { name = "numpy", marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } wheels = [ @@ -5193,15 +5110,15 @@ wheels = [ [[package]] name = "sentry-sdk" -version = "2.43.0" +version = "2.46.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b3/18/09875b4323b03ca9025bae7e6539797b27e4fc032998a466b4b9c3d24653/sentry_sdk-2.43.0.tar.gz", hash = "sha256:52ed6e251c5d2c084224d73efee56b007ef5c2d408a4a071270e82131d336e20", size = 368953, upload-time = "2025-10-29T11:26:08.156Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/d7/c140a5837649e2bf2ec758494fde1d9a016c76777eab64e75ef38d685bbb/sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91", size = 374761, upload-time = "2025-11-24T09:34:13.932Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/69/31/8228fa962f7fd8814d634e4ebece8780e2cdcfbdf0cd2e14d4a6861a7cd5/sentry_sdk-2.43.0-py2.py3-none-any.whl", hash = "sha256:4aacafcf1756ef066d359ae35030881917160ba7f6fc3ae11e0e58b09edc2d5d", size = 400997, upload-time = "2025-10-29T11:26:05.77Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b6/ce7c502a366f4835b1f9c057753f6989a92d3c70cbadb168193f5fb7499b/sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1", size = 406266, upload-time = "2025-11-24T09:34:12.114Z" }, ] [[package]] @@ -5233,11 +5150,11 @@ wheels = [ [[package]] name = "slack-sdk" -version = "3.37.0" +version = "3.39.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8e/c2/0a174a155623d7dc3ed4d1360cdf755590acdc2c3fc9ce0d2340f468909f/slack_sdk-3.37.0.tar.gz", hash = "sha256:242d6cffbd9e843af807487ff04853189b812081aeaa22f90a8f159f20220ed9", size = 241612, upload-time = "2025-10-06T23:07:20.856Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b6/dd/645f3eb93fce38eadbb649e85684730b1fc3906c2674ca59bddc2ca2bd2e/slack_sdk-3.39.0.tar.gz", hash = "sha256:6a56be10dc155c436ff658c6b776e1c082e29eae6a771fccf8b0a235822bbcb1", size = 247207, upload-time = "2025-11-20T15:27:57.556Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/07/fd/a502ee24d8c7d12a8f749878ae0949b8eeb50aeac22dc5a613d417a256d0/slack_sdk-3.37.0-py2.py3-none-any.whl", hash = "sha256:e108a0836eafda74d8a95e76c12c2bcb010e645d504d8497451e4c7ebb229c87", size = 302751, upload-time = "2025-10-06T23:07:19.542Z" }, + { url = "https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl", hash = "sha256:b1556b2f5b8b12b94e5ea3f56c4f2c7f04462e4e1013d325c5764ff118044fa8", size = 309850, upload-time = "2025-11-20T15:27:55.729Z" }, ] [[package]] @@ -5282,7 +5199,8 @@ version = "0.13.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } wheels = [ @@ -5341,44 +5259,14 @@ name = "sphinx" version = "8.2.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", ] dependencies = [ { name = "alabaster", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, @@ -5430,44 +5318,14 @@ name = "sphinx-autobuild" version = "2025.8.25" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", ] dependencies = [ { name = "colorama", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, @@ -5565,15 +5423,24 @@ wheels = [ [[package]] name = "starlette" -version = "0.49.3" +version = "0.50.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, - { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/de/1a/608df0b10b53b0beb96a37854ee05864d182ddd4b1156a22f1ad3860425a/starlette-0.49.3.tar.gz", hash = "sha256:1c14546f299b5901a1ea0e34410575bc33bbd741377a10484a54445588d00284", size = 2655031, upload-time = "2025-11-01T15:12:26.13Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/e0/021c772d6a662f43b63044ab481dc6ac7592447605b5b35a957785363122/starlette-0.49.3-py3-none-any.whl", hash = "sha256:b579b99715fdc2980cf88c8ec96d3bf1ce16f5a8051a7c2b84ef9b1cdecaea2f", size = 74340, upload-time = "2025-11-01T15:12:24.387Z" }, + { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" }, +] + +[[package]] +name = "strenum" +version = "0.4.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ad/430fb60d90e1d112a62ff57bdd1f286ec73a2a0331272febfddd21f330e1/StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff", size = 23384, upload-time = "2023-06-29T22:02:58.399Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" }, ] [[package]] @@ -5581,7 +5448,7 @@ name = "sympy" version = "1.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mpmath", marker = "sys_platform != 'linux'" }, + { name = "mpmath" }, ] sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } wheels = [ @@ -5605,7 +5472,8 @@ dependencies = [ { name = "absl-py" }, { name = "grpcio" }, { name = "markdown" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pillow" }, { name = "protobuf" }, @@ -5627,63 +5495,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, ] -[[package]] -name = "tensorstore" -version = "0.1.74" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", -] -dependencies = [ - { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "numpy", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3c/b9/ea25aba62c688a87d7d7d9cc5926d602e2f9e84fa72586825486fb180b7e/tensorstore-0.1.74.tar.gz", hash = "sha256:a062875f27283d30ce4959c408c253ecb336fce8e3f9837c064e3d30cda79203", size = 6795605, upload-time = "2025-04-24T15:42:18.829Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f2/20/1e7e776dc30f2f07416223c12f9ad244ec539af5fa1fbef9320812a9a3b6/tensorstore-0.1.74-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:edfae80aceb05640ac2209a11a4b76cecd5d9c4a95c01ede8c89c8edaa90f9d5", size = 15292660, upload-time = "2025-04-24T15:41:18.253Z" }, - { url = "https://files.pythonhosted.org/packages/76/cc/81bf2d6a4caa239d38905b439864d3a8bf06b27d6d31bb2396e3f4f5cc55/tensorstore-0.1.74-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab985d767d53e9478987c23dc7aea8f7e8aed2ef90ec8f7f939e8b399667feb1", size = 13260438, upload-time = "2025-04-24T15:41:22.596Z" }, - { url = "https://files.pythonhosted.org/packages/88/4c/a26c4c8b8e7573d2b552505cd46a658b9a68a80d88e9d3c68f16d10e4d62/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d16d1181c292ea065ebd203e823420c65e365d0407eea8f0a3dd82995da0cc65", size = 17041531, upload-time = "2025-04-24T15:41:25.492Z" }, - { url = "https://files.pythonhosted.org/packages/e4/a9/3859b1b497dacf2093e196e1d4ed3b95e8553c7d7c9fe1f88216c72253a9/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f327e813152705b5297f251824a91106e17a06fd2f6b5f6e94c6401c5937da8c", size = 18392852, upload-time = "2025-04-24T15:41:28.136Z" }, - { url = "https://files.pythonhosted.org/packages/2d/3b/b7494ea0a37dd4cd3721f104fc52d4c953354b801eb1adf08e40bc08aaa0/tensorstore-0.1.74-cp310-cp310-win_amd64.whl", hash = "sha256:e56e9690cc20463951a52a6908e18056a93ce5bcd4a881834e2b5962801a1125", size = 12429998, upload-time = "2025-04-24T15:41:30.794Z" }, - { url = "https://files.pythonhosted.org/packages/0d/3e/d67bb3d9bb7409469d15fb90ef5756e6ac8b835af7f27c02fc542c4b4059/tensorstore-0.1.74-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8353e619d9140ca50fc0cb5b846e07c68462dd5015b4714752a0a664e48a03d3", size = 15294582, upload-time = "2025-04-24T15:41:33.794Z" }, - { url = "https://files.pythonhosted.org/packages/01/f4/49cb5ea8e63303fcb0a6ebf0ed546aaec63982a4abca0e9801da5e3a24e3/tensorstore-0.1.74-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3ad1bfbb257ab84de1a5c9b79a60cebb5fbb7a411ddb1c246c21c9795789ba1", size = 13261395, upload-time = "2025-04-24T15:41:36.372Z" }, - { url = "https://files.pythonhosted.org/packages/ad/7b/9c12d4687e6ff19222f12719286c13a546f1714e5dbed75d52a4267534ed/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3ad9daf4c757db41ad091a1a5502807baeb848be0937986d8766049c39c8466", size = 17042621, upload-time = "2025-04-24T15:41:39.284Z" }, - { url = "https://files.pythonhosted.org/packages/b5/07/cf0dc4540a78bc715fbcf4417c5dc708f3d12ed1664bf117f22463f411fc/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a35364804e7d71bf5e86d2dae4de04c90249b61ff71448b9713b4e72b2389bd", size = 18393581, upload-time = "2025-04-24T15:41:42.554Z" }, - { url = "https://files.pythonhosted.org/packages/ac/42/edf004c5a101e021f052ea3564250d773d7cf6458f92934456ffa967383f/tensorstore-0.1.74-cp311-cp311-win_amd64.whl", hash = "sha256:15dcb6ce282e32d005caad34d595b0be070947578448a2861c63fdd608fc7394", size = 12431849, upload-time = "2025-04-24T15:41:45.263Z" }, - { url = "https://files.pythonhosted.org/packages/a1/14/2e6d1cad744af9e9a1a78d881a908a859ad95b61b15de10397069f55fbd8/tensorstore-0.1.74-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:7218722ee5d74e4d01f357917d3b1b7b1d6b1c068aa73e3d801cb3d58fc45116", size = 15334307, upload-time = "2025-04-24T15:41:48.315Z" }, - { url = "https://files.pythonhosted.org/packages/b2/ac/8d572b8c6d689eb50db0252e9d35ee6278a6aed481b64d7e025cf51e32c4/tensorstore-0.1.74-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6926554a8633d0210bdba619d3996fff6a6af4214237fbca626e6ddfcc8ea39", size = 13288669, upload-time = "2025-04-24T15:41:50.808Z" }, - { url = "https://files.pythonhosted.org/packages/9d/6c/3e76d614ad70b61670686d91abaa3ddee6b01255bf2b40f050beb15b7970/tensorstore-0.1.74-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d584e468eb4ef8195f5d21a9da4780cf96c6074b87ef219b43a89efce3d503ca", size = 17031720, upload-time = "2025-04-24T15:41:55.092Z" }, - { url = "https://files.pythonhosted.org/packages/31/f3/09d7c3ad7c9517f89b5be9b4460b83333e98dce1c9ab0a52464ded0bab67/tensorstore-0.1.74-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0af2225431d59f8a2bb4db4c1519252f10ee407e6550875d78212d3d34ee743", size = 18378829, upload-time = "2025-04-24T15:41:58.167Z" }, - { url = "https://files.pythonhosted.org/packages/a7/f2/45ece38705280ed9ebf4ccaf084ed1e76e35b1eeec8c510e589978ac8dcd/tensorstore-0.1.74-cp312-cp312-win_amd64.whl", hash = "sha256:4e35f3679873cdc488aae20b9ae2cea4589c7b147a80edb07eb3f09eba47d43d", size = 12432300, upload-time = "2025-04-24T15:42:00.761Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e9/a08c6a6eb7d6b4b26053d4575196a06c6fccf4e89f9bc625f81e7c91bb5d/tensorstore-0.1.74-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:f7d2c80de9ab352ca14aeca798d6650c5670725e6f8eac73f4fcc8f3147ca614", size = 15334469, upload-time = "2025-04-24T15:42:03.731Z" }, - { url = "https://files.pythonhosted.org/packages/9a/a9/64b90c6e66e0b8043e641090144c6614b0c78d9a719b9110d953d13a516d/tensorstore-0.1.74-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ceef7d2dcfd1caf61356f7eeb9a37896b4825b4be2750b00615cf5fb1ae47a8b", size = 13288791, upload-time = "2025-04-24T15:42:06.145Z" }, - { url = "https://files.pythonhosted.org/packages/62/e8/226cfc25d7eac00e783ff2ee4994830c4a42cd8690e207c4a8b93210f3d9/tensorstore-0.1.74-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e71637002a806bc1b0f0f05556d1c33493a43f3ab35f9632b3d48855677d93dc", size = 17031815, upload-time = "2025-04-24T15:42:09.239Z" }, - { url = "https://files.pythonhosted.org/packages/9a/09/dce8a0942d84f6bb039b5ea3e8bc6a479b1a9535cd216b0d42dd03c4f761/tensorstore-0.1.74-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c799edf9000aee68d6676e3d2f73d4e1a56fc817c47e150732f6d3bd2b1ef46d", size = 18378091, upload-time = "2025-04-24T15:42:13.546Z" }, - { url = "https://files.pythonhosted.org/packages/a6/23/5218575d25de9d8debfb3faf290a1e3b9a7b6be9e77ba07ff3a63a0bc899/tensorstore-0.1.74-cp313-cp313-win_amd64.whl", hash = "sha256:5da86437ffa1ee0f0c590c38daa2f4b548890ce66b1f470ac98714cb0eabdbf5", size = 12432635, upload-time = "2025-04-24T15:42:16.275Z" }, -] - [[package]] name = "tensorstore" version = "0.1.78" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", "python_full_version < '3.11' and sys_platform == 'linux'", "python_full_version < '3.11' and sys_platform != 'linux'", ] dependencies = [ - { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "numpy", marker = "python_full_version < '3.13'" }, + { name = "ml-dtypes", marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9f/ee/05eb424437f4db63331c90e4605025eedc0f71da3faff97161d5d7b405af/tensorstore-0.1.78.tar.gz", hash = "sha256:e26074ffe462394cf54197eb76d6569b500f347573cd74da3f4dd5f510a4ad7c", size = 6913502, upload-time = "2025-10-06T17:44:29.649Z" } wheels = [ @@ -5709,6 +5531,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/a2/dbd1af0e97d5d549051309d72c6e3f2fe81fae636f9db3692d21adc9c731/tensorstore-0.1.78-cp313-cp313-win_amd64.whl", hash = "sha256:e0073de8fa3074bc4cc92ced0210310fd89851899faf42a5ba256f0ba87d095c", size = 12711250, upload-time = "2025-10-06T17:44:27.926Z" }, ] +[[package]] +name = "tensorstore" +version = "0.1.79" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform != 'linux'", +] +dependencies = [ + { name = "ml-dtypes", marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/2c/50ab489a0862ca88d2d766130a6fec45ccd5174f0e04081d8b7b07a8aedd/tensorstore-0.1.79.tar.gz", hash = "sha256:8dad44a8a7f2952a5d0030a8bd868b3cfdff048bd40ab53e7226f3d8b0881c5e", size = 7075782, upload-time = "2025-11-11T22:05:23.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/a9/1695d7ea197c4568c2f02f34b203eef702ec8080422331f00a65c6fb2a37/tensorstore-0.1.79-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:11a2c62694ea9c21770bc5a09938d3d15c4b9662b738ae6e1e513c26ed96251a", size = 16466511, upload-time = "2025-11-11T22:04:18.614Z" }, + { url = "https://files.pythonhosted.org/packages/db/0e/5ce8a615c7f9ad7cf8ed4ac6e182fe0ef46fd06fef89757e49ba84a6ba9e/tensorstore-0.1.79-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e152d334bf34fbabdfe8e5bc35b87d1f9947065924ff83c29e659308b36e948", size = 14499810, upload-time = "2025-11-11T22:04:21.725Z" }, + { url = "https://files.pythonhosted.org/packages/c0/29/2cb9552138fe84ab29421489121350e4af0502eafff31ccd9017490be0d8/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4230b8fd29795e88e441f749d881973eca8dadf33c5262b367839fb8891f79b", size = 18937510, upload-time = "2025-11-11T22:04:24.221Z" }, + { url = "https://files.pythonhosted.org/packages/42/70/d2a672a93faebdd176cd8541405cd5614b14d3d8dc812fbeaf2cf46d390a/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83072ee0e551d6dca582e154b64c8b8066d276ec0759784e3149c28212a61f18", size = 20910324, upload-time = "2025-11-11T22:04:26.769Z" }, + { url = "https://files.pythonhosted.org/packages/91/d5/7958cbfb614c4ffa5070ae9575874d46937067c0d81a7739e67fb1d62de5/tensorstore-0.1.79-cp311-cp311-win_amd64.whl", hash = "sha256:6c98c6b74c00e00eba7969292144e471d5c45d67088f0dc08e3a4c60a15ee191", size = 13206191, upload-time = "2025-11-11T22:04:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/f1/a2/a77be16b4a882ace36da0748305795f35306bdad568472f208bd89b96b9d/tensorstore-0.1.79-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:71aa9b45436d888c37b965f7b71195916d15438119b7dccb66a3b0776bfba367", size = 16485740, upload-time = "2025-11-11T22:04:33.478Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e4/7fe268ec41aa70b71a1c56b1ec83346fbcbf12f4bfbefc79d14fb9c03408/tensorstore-0.1.79-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:108c0e867aa2c87d4982cc6325a2de0c4f5bd63c2bea18adb193a370c40594ce", size = 14508736, upload-time = "2025-11-11T22:04:38.613Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f1/b1248dae02598ce534834413e841f915a32ab185c36ecd05e4c67bdc8d19/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:debd435042c00be68ba1fb3cf59325a7babb3f4a3cf4744c87dde346802cbbb4", size = 18947817, upload-time = "2025-11-11T22:04:40.768Z" }, + { url = "https://files.pythonhosted.org/packages/87/4a/60e234147570e21bbab4ac70ab79dd794a5ef9a4945d36c34c1914a73205/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:608f7178ec6e4e4a3c26545b0a44f44bf83438d04bf2d960cd0e7699eaa99ef6", size = 20929832, upload-time = "2025-11-11T22:04:43.613Z" }, + { url = "https://files.pythonhosted.org/packages/f8/48/0531868bce12a2f520002e810d4200ec6f01ba33a2f27b6bd7289fbc197b/tensorstore-0.1.79-cp312-cp312-win_amd64.whl", hash = "sha256:a071c6c255b7e412957a6aa563bc4250242c7894edad06ae6358e3d30b7d88ce", size = 13211970, upload-time = "2025-11-11T22:04:46.179Z" }, + { url = "https://files.pythonhosted.org/packages/fa/0b/54a44e55836d8e8f576343134c0e3db71c6c837d39a0ac44699aba5b01df/tensorstore-0.1.79-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:1e8e2d098829919caac6a62cf568902e34789069ceddb28497d6e36ebcb95c0b", size = 16485855, upload-time = "2025-11-11T22:04:48.734Z" }, + { url = "https://files.pythonhosted.org/packages/04/59/cadb9a45896d480882476df4759cda1659c70669aff87a4d5a4a07ded084/tensorstore-0.1.79-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:29cf4336153af136ac8ac528e2ed46df19367edae7e14e37bca1a8b7c4848ef2", size = 14508277, upload-time = "2025-11-11T22:04:50.775Z" }, + { url = "https://files.pythonhosted.org/packages/e6/cb/3647bdd03c7692882ebc10c19df9ede49f290c216b2906f785edbdb53ef1/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94d8fc9df1721b0287046aca7209fd5040889cad4202e7b73a1fdb77cd9b71c6", size = 18949307, upload-time = "2025-11-11T22:04:53.145Z" }, + { url = "https://files.pythonhosted.org/packages/20/a0/f91ac492cf2ee9f7541aefaaed4ad1258e73e33f3cd3e06cdce5859431db/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9f2dc3342e4686af98f6e259dc9fb377f1bf657b649c247bf6647bbe4f98090", size = 20930427, upload-time = "2025-11-11T22:04:55.353Z" }, + { url = "https://files.pythonhosted.org/packages/69/a6/752fd11747eb9fead715b02d389da7fb180a56172b885de0b48b20237d1e/tensorstore-0.1.79-cp313-cp313-win_amd64.whl", hash = "sha256:0fd6165f3df49abc7c9de029b2b72d74bebd2ff2481a5ced003607eb61c56d3e", size = 13212196, upload-time = "2025-11-11T22:05:00.451Z" }, + { url = "https://files.pythonhosted.org/packages/46/57/1649019893accb3f195780fec55b8bf6793343faf140040bc73f1c28d6a5/tensorstore-0.1.79-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6f8f5a940eab434a951c2dadcc7c0516c7bef6d8b7a7144054f7a0c56152b5f5", size = 16488849, upload-time = "2025-11-11T22:05:03.014Z" }, + { url = "https://files.pythonhosted.org/packages/bf/23/2668cb120e855a6a7a8a5eb0eba30e2e7020da932a4d3fa13c6ee3c41f9f/tensorstore-0.1.79-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:97756d2cba3c5ce21e15602c2af5a02521cc0ecda7f9fb6d18da2f3bd51827f4", size = 14511448, upload-time = "2025-11-11T22:05:05.58Z" }, + { url = "https://files.pythonhosted.org/packages/6a/0e/c38f079f3933cc284aab53d52976f6cb4f1ad43bb6a704ac27e0b710f176/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:847982652273fb7b2d694b789205747aaf3e50ae64738c5cb7b5eb03d86a9947", size = 18949282, upload-time = "2025-11-11T22:05:07.562Z" }, + { url = "https://files.pythonhosted.org/packages/6f/99/03479deea5bfd27a0d8a8c75d5f1d85417a7bbc9c6c7a90fb85b4a4e347a/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7af9422269c2bfcdecf9dd55309060665ab9c2d7f6c892377ed32c032400feea", size = 20931601, upload-time = "2025-11-11T22:05:10.098Z" }, + { url = "https://files.pythonhosted.org/packages/26/36/2617edf6c6d6fc73b3ff96d9d0b97332adf0d0c56fa2014a226bf4f7dfa6/tensorstore-0.1.79-cp314-cp314-win_amd64.whl", hash = "sha256:bbd8c1ab7d2e3c03ded3d40bb373ee9a67668e33a564484927865ce43b210386", size = 13599766, upload-time = "2025-11-11T22:05:12.265Z" }, +] + [[package]] name = "tiktoken" version = "0.12.0" @@ -5864,48 +5728,63 @@ wheels = [ [[package]] name = "torch" -version = "2.9.0" +version = "2.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "sys_platform != 'linux'" }, - { name = "fsspec", marker = "sys_platform != 'linux'" }, - { name = "jinja2", marker = "sys_platform != 'linux'" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux'" }, - { name = "sympy", marker = "sys_platform != 'linux'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, { name = "triton", marker = "sys_platform == 'never'" }, - { name = "typing-extensions", marker = "sys_platform != 'linux'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/86/245c240d2138c17ed572c943c289056c2721abab70810d772c6bf5495b28/torch-2.9.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:030bbfe367379ae6a4ae4042b6c44da25383343b8b3c68abaa9c7231efbaf2dd", size = 104213554, upload-time = "2025-10-15T15:45:59.798Z" }, - { url = "https://files.pythonhosted.org/packages/58/1d/fd1e88ae0948825efcab7dd66d12bec23f05d4d38ed81573c8d453c14c06/torch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:51cb63902182a78e90886e8068befd8ea102af4b00e420263591a3d70c7d3c6c", size = 899795167, upload-time = "2025-10-15T15:47:12.695Z" }, - { url = "https://files.pythonhosted.org/packages/63/5a/496197b45c14982bef4e079b24c61dc108e3ab0d0cc9718dba9f54f45a46/torch-2.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:3f6aad4d2f0ee2248bac25339d74858ff846c3969b27d14ac235821f055af83d", size = 109310314, upload-time = "2025-10-15T15:46:16.633Z" }, - { url = "https://files.pythonhosted.org/packages/58/b0/2b4e647b0fc706e88eb6c253d05511865578f5f67b55fad639bf3272a4a1/torch-2.9.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:413e1654c9203733138858780e184d9fc59442f0b3b209e16f39354eb893db9b", size = 74452019, upload-time = "2025-10-15T15:46:04.296Z" }, - { url = "https://files.pythonhosted.org/packages/58/fe/334225e6330e672b36aef23d77451fa906ea12881570c08638a91331a212/torch-2.9.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c596708b5105d0b199215acf0c9be7c1db5f1680d88eddadf4b75a299259a677", size = 104230578, upload-time = "2025-10-15T15:46:08.182Z" }, - { url = "https://files.pythonhosted.org/packages/05/cc/49566caaa218872ec9a2912456f470ff92649894a4bc2e5274aa9ef87c4a/torch-2.9.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:51de31219c97c51cf4bf2be94d622e3deb5dcc526c6dc00e97c17eaec0fc1d67", size = 899815990, upload-time = "2025-10-15T15:48:03.336Z" }, - { url = "https://files.pythonhosted.org/packages/74/25/e9ab21d5925b642d008f139d4a3c9664fc9ee1faafca22913c080cc4c0a5/torch-2.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd515c70059afd95f48b8192733764c08ca37a1d19803af6401b5ecad7c8676e", size = 109313698, upload-time = "2025-10-15T15:46:12.425Z" }, - { url = "https://files.pythonhosted.org/packages/b3/b7/205ef3e94de636feffd64b28bb59a0dfac0771221201b9871acf9236f5ca/torch-2.9.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:614a185e4986326d526a91210c8fc1397e76e8cfafa78baf6296a790e53a9eec", size = 74463678, upload-time = "2025-10-15T15:46:29.779Z" }, - { url = "https://files.pythonhosted.org/packages/d1/d3/3985739f3b8e88675127bf70f82b3a48ae083e39cda56305dbd90398fec0/torch-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e5f7af1dc4c0a7c4a260c2534f41ddaf209714f7c89145e644c44712fbd6b642", size = 104107898, upload-time = "2025-10-15T15:46:20.883Z" }, - { url = "https://files.pythonhosted.org/packages/a5/4b/f4bb2e6c25d0272f798cd6d7a04ed315da76cec68c602d87040c7847287f/torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:01cff95ecd9a212ea2f141db28acccdceb6a4c54f64e6c51091146f5e2a772c6", size = 899738273, upload-time = "2025-10-15T15:50:04.188Z" }, - { url = "https://files.pythonhosted.org/packages/66/11/c1c5ba6691cda6279087c35bd626536e4fd29521fe740abf5008377a9a02/torch-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4582b162f541651f0cb184d3e291c05c2f556c7117c64a9873e2ee158d40062b", size = 109280887, upload-time = "2025-10-15T15:46:26.228Z" }, - { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/c2/1c/90eb13833cdf4969ea9707586d7b57095c3b6e2b223a7256bf111689bcb8/torch-2.9.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c30a17fc83eeab346913e237c64b15b5ba6407fff812f6c541e322e19bc9ea0e", size = 104111330, upload-time = "2025-10-15T15:46:35.238Z" }, - { url = "https://files.pythonhosted.org/packages/0e/21/2254c54b8d523592c25ef4434769aa23e29b1e6bf5f4c0ad9e27bf442927/torch-2.9.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8f25033b8667b57857dfd01458fbf2a9e6a6df1f8def23aef0dc46292f6aa642", size = 899750243, upload-time = "2025-10-15T15:48:57.459Z" }, - { url = "https://files.pythonhosted.org/packages/b7/a5/5cb94fa4fd1e78223455c23c200f30f6dc10c6d4a2bcc8f6e7f2a2588370/torch-2.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:d037f1b4ffd25013be4a7bf3651a0a910c68554956c7b2c92ebe87c76475dece", size = 109284513, upload-time = "2025-10-15T15:46:45.061Z" }, - { url = "https://files.pythonhosted.org/packages/66/e8/fc414d8656250ee46120b44836ffbb3266343db424b3e18ca79ebbf69d4f/torch-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e4e5b5cba837a2a8d1a497ba9a58dae46fa392593eaa13b871c42f71847503a5", size = 74830362, upload-time = "2025-10-15T15:46:48.983Z" }, - { url = "https://files.pythonhosted.org/packages/ed/5f/9474c98fc5ae0cd04b9466035428cd360e6611a86b8352a0fc2fa504acdc/torch-2.9.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:64693568f5dc4dbd5f880a478b1cea0201cc6b510d91d1bc54fea86ac5d1a637", size = 104144940, upload-time = "2025-10-15T15:47:29.076Z" }, - { url = "https://files.pythonhosted.org/packages/2d/5a/8e0c1cf57830172c109d4bd6be2708cabeaf550983eee7029291322447a0/torch-2.9.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:f8ed31ddd7d10bfb3fbe0b9fe01b1243577f13d75e6f4a0839a283915ce3791e", size = 899744054, upload-time = "2025-10-15T15:48:29.864Z" }, - { url = "https://files.pythonhosted.org/packages/6d/28/82c28b30fcb4b7c9cdd995763d18bbb830d6521356712faebbad92ffa61d/torch-2.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:eff527d4e4846e6f70d2afd8058b73825761203d66576a7e04ea2ecfebcb4ab8", size = 109517546, upload-time = "2025-10-15T15:47:33.395Z" }, - { url = "https://files.pythonhosted.org/packages/ff/c3/a91f96ec74347fa5fd24453fa514bc61c61ecc79196fa760b012a1873d96/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:f8877779cf56d1ce431a7636703bdb13307f5960bb1af49716d8b179225e0e6a", size = 74480732, upload-time = "2025-10-15T15:47:38.002Z" }, - { url = "https://files.pythonhosted.org/packages/5c/73/9f70af34b334a7e0ef496ceec96b7ec767bd778ea35385ce6f77557534d1/torch-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7e614fae699838038d888729f82b687c03413c5989ce2a9481f9a7e7a396e0bb", size = 74433037, upload-time = "2025-10-15T15:47:41.894Z" }, - { url = "https://files.pythonhosted.org/packages/b7/84/37cf88625901934c97109e583ecc21777d21c6f54cda97a7e5bbad1ee2f2/torch-2.9.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:dfb5b8cd310ba3436c7e14e8b7833ef658cf3045e50d2bdaed23c8fc517065eb", size = 104116482, upload-time = "2025-10-15T15:47:46.266Z" }, - { url = "https://files.pythonhosted.org/packages/56/8e/ca8b17866943a8d4f4664d402ea84210aa274588b4c5d89918f5caa24eec/torch-2.9.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b3d29524993a478e46f5d598b249cd824b7ed98d7fba538bd9c4cde6c803948f", size = 899746916, upload-time = "2025-10-15T15:50:40.294Z" }, - { url = "https://files.pythonhosted.org/packages/43/65/3b17c0fbbdab6501c5b320a52a648628d0d44e7379f64e27d9eef701b6bf/torch-2.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:71c7578984f5ec0eb645eb4816ac8435fcf3e3e2ae1901bcd2f519a9cafb5125", size = 109275151, upload-time = "2025-10-15T15:49:20.715Z" }, - { url = "https://files.pythonhosted.org/packages/83/36/74f8c051f785500396e42f93542422422dfd874a174f21f8d955d36e5d64/torch-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:71d9309aee457bbe0b164bce2111cd911c4ed4e847e65d5077dbbcd3aba6befc", size = 74823353, upload-time = "2025-10-15T15:49:16.59Z" }, - { url = "https://files.pythonhosted.org/packages/62/51/dc3b4e2f9ba98ae27238f0153ca098bf9340b2dafcc67fde645d496dfc2a/torch-2.9.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c08fb654d783899e204a32cca758a7ce8a45b2d78eeb89517cc937088316f78e", size = 104140340, upload-time = "2025-10-15T15:50:19.67Z" }, - { url = "https://files.pythonhosted.org/packages/c0/8d/b00657f8141ac16af7bb6cda2e67de18499a3263b78d516b9a93fcbc98e3/torch-2.9.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ec8feb0099b2daa5728fbc7abb0b05730fd97e0f359ff8bda09865aaa7bd7d4b", size = 899731750, upload-time = "2025-10-15T15:49:36.673Z" }, - { url = "https://files.pythonhosted.org/packages/fc/29/bd361e0cbb2c79ce6450f42643aaf6919956f89923a50571b0ebfe92d142/torch-2.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:695ba920f234ad4170c9c50e28d56c848432f8f530e6bc7f88fcb15ddf338e75", size = 109503850, upload-time = "2025-10-15T15:50:24.118Z" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/56/9577683b23072075ed2e40d725c52c2019d71a972fab8e083763da8e707e/torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e", size = 104207681, upload-time = "2025-11-12T15:19:56.48Z" }, + { url = "https://files.pythonhosted.org/packages/38/45/be5a74f221df8f4b609b78ff79dc789b0cc9017624544ac4dd1c03973150/torch-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:9fd35c68b3679378c11f5eb73220fdcb4e6f4592295277fbb657d31fd053237c", size = 899794036, upload-time = "2025-11-12T15:21:01.886Z" }, + { url = "https://files.pythonhosted.org/packages/67/95/a581e8a382596b69385a44bab2733f1273d45c842f5d4a504c0edc3133b6/torch-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:2af70e3be4a13becba4655d6cc07dcfec7ae844db6ac38d6c1dafeb245d17d65", size = 110969861, upload-time = "2025-11-12T15:21:30.145Z" }, + { url = "https://files.pythonhosted.org/packages/ad/51/1756dc128d2bf6ea4e0a915cb89ea5e730315ff33d60c1ff56fd626ba3eb/torch-2.9.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a83b0e84cc375e3318a808d032510dde99d696a85fe9473fc8575612b63ae951", size = 74452222, upload-time = "2025-11-12T15:20:46.223Z" }, + { url = "https://files.pythonhosted.org/packages/15/db/c064112ac0089af3d2f7a2b5bfbabf4aa407a78b74f87889e524b91c5402/torch-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:62b3fd888277946918cba4478cf849303da5359f0fb4e3bfb86b0533ba2eaf8d", size = 104220430, upload-time = "2025-11-12T15:20:31.705Z" }, + { url = "https://files.pythonhosted.org/packages/56/be/76eaa36c9cd032d3b01b001e2c5a05943df75f26211f68fae79e62f87734/torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d033ff0ac3f5400df862a51bdde9bad83561f3739ea0046e68f5401ebfa67c1b", size = 899821446, upload-time = "2025-11-12T15:20:15.544Z" }, + { url = "https://files.pythonhosted.org/packages/47/cc/7a2949e38dfe3244c4df21f0e1c27bce8aedd6c604a587dd44fc21017cb4/torch-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:0d06b30a9207b7c3516a9e0102114024755a07045f0c1d2f2a56b1819ac06bcb", size = 110973074, upload-time = "2025-11-12T15:21:39.958Z" }, + { url = "https://files.pythonhosted.org/packages/1e/ce/7d251155a783fb2c1bb6837b2b7023c622a2070a0a72726ca1df47e7ea34/torch-2.9.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:52347912d868653e1528b47cafaf79b285b98be3f4f35d5955389b1b95224475", size = 74463887, upload-time = "2025-11-12T15:20:36.611Z" }, + { url = "https://files.pythonhosted.org/packages/0f/27/07c645c7673e73e53ded71705045d6cb5bae94c4b021b03aa8d03eee90ab/torch-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:da5f6f4d7f4940a173e5572791af238cb0b9e21b1aab592bd8b26da4c99f1cd6", size = 104126592, upload-time = "2025-11-12T15:20:41.62Z" }, + { url = "https://files.pythonhosted.org/packages/19/17/e377a460603132b00760511299fceba4102bd95db1a0ee788da21298ccff/torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:27331cd902fb4322252657f3902adf1c4f6acad9dcad81d8df3ae14c7c4f07c4", size = 899742281, upload-time = "2025-11-12T15:22:17.602Z" }, + { url = "https://files.pythonhosted.org/packages/b1/1a/64f5769025db846a82567fa5b7d21dba4558a7234ee631712ee4771c436c/torch-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:81a285002d7b8cfd3fdf1b98aa8df138d41f1a8334fd9ea37511517cedf43083", size = 110940568, upload-time = "2025-11-12T15:21:18.689Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ab/07739fd776618e5882661d04c43f5b5586323e2f6a2d7d84aac20d8f20bd/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:c0d25d1d8e531b8343bea0ed811d5d528958f1dcbd37e7245bc686273177ad7e", size = 74479191, upload-time = "2025-11-12T15:21:25.816Z" }, + { url = "https://files.pythonhosted.org/packages/20/60/8fc5e828d050bddfab469b3fe78e5ab9a7e53dda9c3bdc6a43d17ce99e63/torch-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c29455d2b910b98738131990394da3e50eea8291dfeb4b12de71ecf1fdeb21cb", size = 104135743, upload-time = "2025-11-12T15:21:34.936Z" }, + { url = "https://files.pythonhosted.org/packages/f2/b7/6d3f80e6918213babddb2a37b46dbb14c15b14c5f473e347869a51f40e1f/torch-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:524de44cd13931208ba2c4bde9ec7741fd4ae6bfd06409a604fc32f6520c2bc9", size = 899749493, upload-time = "2025-11-12T15:24:36.356Z" }, + { url = "https://files.pythonhosted.org/packages/a6/47/c7843d69d6de8938c1cbb1eba426b1d48ddf375f101473d3e31a5fc52b74/torch-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:545844cc16b3f91e08ce3b40e9c2d77012dd33a48d505aed34b7740ed627a1b2", size = 110944162, upload-time = "2025-11-12T15:21:53.151Z" }, + { url = "https://files.pythonhosted.org/packages/28/0e/2a37247957e72c12151b33a01e4df651d9d155dd74d8cfcbfad15a79b44a/torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5be4bf7496f1e3ffb1dd44b672adb1ac3f081f204c5ca81eba6442f5f634df8e", size = 74830751, upload-time = "2025-11-12T15:21:43.792Z" }, + { url = "https://files.pythonhosted.org/packages/4b/f7/7a18745edcd7b9ca2381aa03353647bca8aace91683c4975f19ac233809d/torch-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:30a3e170a84894f3652434b56d59a64a2c11366b0ed5776fab33c2439396bf9a", size = 104142929, upload-time = "2025-11-12T15:21:48.319Z" }, + { url = "https://files.pythonhosted.org/packages/f4/dd/f1c0d879f2863ef209e18823a988dc7a1bf40470750e3ebe927efdb9407f/torch-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8301a7b431e51764629208d0edaa4f9e4c33e6df0f2f90b90e261d623df6a4e2", size = 899748978, upload-time = "2025-11-12T15:23:04.568Z" }, + { url = "https://files.pythonhosted.org/packages/1f/9f/6986b83a53b4d043e36f3f898b798ab51f7f20fdf1a9b01a2720f445043d/torch-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2e1c42c0ae92bf803a4b2409fdfed85e30f9027a66887f5e7dcdbc014c7531db", size = 111176995, upload-time = "2025-11-12T15:22:01.618Z" }, + { url = "https://files.pythonhosted.org/packages/40/60/71c698b466dd01e65d0e9514b5405faae200c52a76901baf6906856f17e4/torch-2.9.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:2c14b3da5df416cf9cb5efab83aa3056f5b8cd8620b8fde81b4987ecab730587", size = 74480347, upload-time = "2025-11-12T15:21:57.648Z" }, + { url = "https://files.pythonhosted.org/packages/48/50/c4b5112546d0d13cc9eaa1c732b823d676a9f49ae8b6f97772f795874a03/torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1edee27a7c9897f4e0b7c14cfc2f3008c571921134522d5b9b5ec4ebbc69041a", size = 74433245, upload-time = "2025-11-12T15:22:39.027Z" }, + { url = "https://files.pythonhosted.org/packages/81/c9/2628f408f0518b3bae49c95f5af3728b6ab498c8624ab1e03a43dd53d650/torch-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:19d144d6b3e29921f1fc70503e9f2fc572cde6a5115c0c0de2f7ca8b1483e8b6", size = 104134804, upload-time = "2025-11-12T15:22:35.222Z" }, + { url = "https://files.pythonhosted.org/packages/28/fc/5bc91d6d831ae41bf6e9e6da6468f25330522e92347c9156eb3f1cb95956/torch-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:c432d04376f6d9767a9852ea0def7b47a7bbc8e7af3b16ac9cf9ce02b12851c9", size = 899747132, upload-time = "2025-11-12T15:23:36.068Z" }, + { url = "https://files.pythonhosted.org/packages/63/5d/e8d4e009e52b6b2cf1684bde2a6be157b96fb873732542fb2a9a99e85a83/torch-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:d187566a2cdc726fc80138c3cdb260970fab1c27e99f85452721f7759bbd554d", size = 110934845, upload-time = "2025-11-12T15:22:48.367Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b2/2d15a52516b2ea3f414643b8de68fa4cb220d3877ac8b1028c83dc8ca1c4/torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cb10896a1f7fedaddbccc2017ce6ca9ecaaf990f0973bdfcf405439750118d2c", size = 74823558, upload-time = "2025-11-12T15:22:43.392Z" }, + { url = "https://files.pythonhosted.org/packages/86/5c/5b2e5d84f5b9850cd1e71af07524d8cbb74cba19379800f1f9f7c997fc70/torch-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0a2bd769944991c74acf0c4ef23603b9c777fdf7637f115605a4b2d8023110c7", size = 104145788, upload-time = "2025-11-12T15:23:52.109Z" }, + { url = "https://files.pythonhosted.org/packages/a9/8c/3da60787bcf70add986c4ad485993026ac0ca74f2fc21410bc4eb1bb7695/torch-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:07c8a9660bc9414c39cac530ac83b1fb1b679d7155824144a40a54f4a47bfa73", size = 899735500, upload-time = "2025-11-12T15:24:08.788Z" }, + { url = "https://files.pythonhosted.org/packages/db/2b/f7818f6ec88758dfd21da46b6cd46af9d1b3433e53ddbb19ad1e0da17f9b/torch-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c88d3299ddeb2b35dcc31753305612db485ab6f1823e37fb29451c8b2732b87e", size = 111163659, upload-time = "2025-11-12T15:23:20.009Z" }, ] [[package]] @@ -5913,7 +5792,8 @@ name = "torchprofile" version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "torch", marker = "sys_platform == 'never'" }, { name = "torchvision", marker = "sys_platform == 'never'" }, ] @@ -5924,42 +5804,43 @@ wheels = [ [[package]] name = "torchvision" -version = "0.24.0" +version = "0.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", marker = "sys_platform != 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pillow", marker = "sys_platform != 'linux'" }, { name = "torch", marker = "sys_platform == 'never'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/63/5b/1404eeab00819df71a30e916c2081654366741f7838fcc4fff86b7bd9e7e/torchvision-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e8d5e667deff87bd66d26df6d225f46224bb0782d4f3f8f5d2f3068b5fd4492", size = 1891723, upload-time = "2025-10-15T15:51:08.5Z" }, - { url = "https://files.pythonhosted.org/packages/88/e3/1b003ecd52bd721f8304aeb66691edfbc2002747ec83d36188ad6abab506/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a110a51c75e89807a8382b0d8034f5e180fb9319570be3389ffd3d4ac4fd57a9", size = 2418988, upload-time = "2025-10-15T15:51:25.195Z" }, - { url = "https://files.pythonhosted.org/packages/56/2e/3c19a35e62da0f606baf8f6e2ceeab1eb66aaa2f84c6528538b06b416d54/torchvision-0.24.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:81d5b12a6df1bb2cc8bdbad837b637d6ea446f2866e6d94f1b5d478856331be3", size = 8046769, upload-time = "2025-10-15T15:51:15.221Z" }, - { url = "https://files.pythonhosted.org/packages/e0/1d/e7ab614a1ace820a2366eab1532679fbe81bd9501ffd6a1b7be14936366d/torchvision-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:0839dbb305d34671f5a64f558782095134b04bbeff8b90f11eb80515d7d50092", size = 3686529, upload-time = "2025-10-15T15:51:20.982Z" }, - { url = "https://files.pythonhosted.org/packages/a3/17/54ed2ec6944ea972b461a86424c8c7f98835982c90cbc45bf59bd962863a/torchvision-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f771cf918351ad509a28488be475f3e9cc71a750d6b1467842bfb64863a5e986", size = 1891719, upload-time = "2025-10-15T15:51:10.384Z" }, - { url = "https://files.pythonhosted.org/packages/f8/07/0cd6776eee784742ad3cb2bfd3295383d84cb2f9e87386119333d1587f0f/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbd63bf4ebff84c48c50123eba90526cc9f794fe45bc9f5dd07cec19e8c62bce", size = 2420513, upload-time = "2025-10-15T15:51:18.087Z" }, - { url = "https://files.pythonhosted.org/packages/1a/f4/6026c08011ddcefcbc14161c5aa9dce55c35c6b045e04ef0952e88bf4594/torchvision-0.24.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:78fe414b3bb6dbf7e6f6da6f733ba96881f6b29a9b997228de7c5f603e5ed940", size = 8048018, upload-time = "2025-10-15T15:51:13.579Z" }, - { url = "https://files.pythonhosted.org/packages/2f/b4/362b4e67ed87cee0fb4f8f0363a852eaeef527968bf62c07ed56f764d729/torchvision-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:629584b94e52f32a6278f2a35d85eeaae95fcc38730fcb765064f26c3c96df5d", size = 4027686, upload-time = "2025-10-15T15:51:19.189Z" }, - { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" }, - { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" }, - { url = "https://files.pythonhosted.org/packages/3e/ad/3c56fcd2a0d6e8afa80e115b5ade4302232ec99655220a51d05709819523/torchvision-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:060b7c50ed4b3fb0316b08e2e31bfd874ec2f63ef5ae02f81e54341ca4e88703", size = 4292225, upload-time = "2025-10-15T15:51:27.699Z" }, - { url = "https://files.pythonhosted.org/packages/4f/b5/b2008e4b77a8d6aada828dd0f6a438d8f94befa23fdd2d62fa0ac6e60113/torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84d79cfc6457310107ce4d712de7a3d388b24484bc9aeded4a76d8f8e3a2813d", size = 1891722, upload-time = "2025-10-15T15:51:28.854Z" }, - { url = "https://files.pythonhosted.org/packages/8f/02/e2f6b0ff93ca4db5751ac9c5be43f13d5e53d9e9412324f464dca1775027/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:fec12a269cf80f6b0b71471c8d498cd3bdd9d8e892c425bf39fecb604852c3b0", size = 2371478, upload-time = "2025-10-15T15:51:37.842Z" }, - { url = "https://files.pythonhosted.org/packages/77/85/42e5fc4f716ec7b73cf1f32eeb5c77961be4d4054b26cd6a5ff97f20c966/torchvision-0.24.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7323a9be5e3da695605753f501cdc87824888c5655d27735cdeaa9986b45884c", size = 8050200, upload-time = "2025-10-15T15:51:46.276Z" }, - { url = "https://files.pythonhosted.org/packages/93/c2/48cb0b6b26276d2120b1e0dbc877579a748eae02b4091a7522ce54f6d5e1/torchvision-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:08cad8b204196e945f0b2d73adee952d433db1c03645851d52b22a45f1015b13", size = 4309939, upload-time = "2025-10-15T15:51:39.002Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/3dd10830b047eeb46ae6b465474258d7b4fbb7d8872dca69bd42449f5c82/torchvision-0.24.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ab956a6e588623353e0f20d4b03eb1656cb4a3c75ca4dd8b4e32e01bc43271a", size = 2028355, upload-time = "2025-10-15T15:51:22.384Z" }, - { url = "https://files.pythonhosted.org/packages/f7/cf/2d7e43409089ce7070f5336161f9216d58653ee1cb26bcb5d6c84cc2de36/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:b1b3db80609c32a088554e8e94b4fc31f1033fe5bb4ac0673ec49c3eb03fb4da", size = 2374466, upload-time = "2025-10-15T15:51:35.382Z" }, - { url = "https://files.pythonhosted.org/packages/e9/30/8f7c328fd7e0a9665da4b6b56b1c627665c18470bfe62f3729ad3eda9aec/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:e6635f100d455c80b43f297df4b8585a76c6a2e114802f6567ddd28d7b5479b0", size = 8217068, upload-time = "2025-10-15T15:51:36.623Z" }, - { url = "https://files.pythonhosted.org/packages/55/a2/b6f9e40e2904574c80b3bb872c66af20bbd642053e7c8e1b9e99ab396535/torchvision-0.24.0-cp313-cp313t-win_amd64.whl", hash = "sha256:4ce158bbdc3a9086034bced0b5212888bd5b251fee6d08a9eff151d30b4b228a", size = 4273912, upload-time = "2025-10-15T15:51:33.866Z" }, - { url = "https://files.pythonhosted.org/packages/1b/24/790a39645cc8c71bf442d54a76da9bda5caeb2a44c5f7e02498649cd99d4/torchvision-0.24.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4bdfc85a5ed706421555f32cdc5e3ddb6d40bf65ef03a274ce3c176393e2904b", size = 2028335, upload-time = "2025-10-15T15:51:26.252Z" }, - { url = "https://files.pythonhosted.org/packages/b0/d7/69479a066ea773653e88eda99031e38681e9094046f87cb957af5036db0e/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:73576a9c4a593223fbae85a64e8bbd77049abd1101893ecf3c5e981284fd58b4", size = 2371609, upload-time = "2025-10-15T15:51:29.859Z" }, - { url = "https://files.pythonhosted.org/packages/46/64/3c7fdb3771ec992b9445a1f7a969466b23ce2cdb14e09303b3db351a0655/torchvision-0.24.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:dd565b1b06666ff399d0801d4d1824fa570c0167a179ca700a5be232527b3c62", size = 8214918, upload-time = "2025-10-15T15:51:41.465Z" }, - { url = "https://files.pythonhosted.org/packages/58/51/abc416bc34d574ad479af738e413d9ebf93027ee92d0f4ae38f966b818f7/torchvision-0.24.0-cp314-cp314-win_amd64.whl", hash = "sha256:eb45d12ac48d757738788fd3fb8e88e647d6b2ab2424134ca87556efc72d81b5", size = 4257776, upload-time = "2025-10-15T15:51:42.642Z" }, - { url = "https://files.pythonhosted.org/packages/08/f7/261d1353c611820541ecd43046b89da3f1ae998dc786e4288b890a009883/torchvision-0.24.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:68120e7e03c31900e499a10bb7fdd63cfd67f0054c9fa108e7e27f9cd372f315", size = 2028359, upload-time = "2025-10-15T15:51:32.119Z" }, - { url = "https://files.pythonhosted.org/packages/a2/fd/615d8a86db1578345de7fa1edaf476fbcf4f057bf7e4fd898306b620c487/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:64e54494043eecf9f57a9881c6fdea49c62282782e737c002ae8b1639e6ea80e", size = 2374469, upload-time = "2025-10-15T15:51:40.19Z" }, - { url = "https://files.pythonhosted.org/packages/04/98/bac11e8fdbf00d6c398246ff2781370aa72c99f2ac685c01ce79354c9a32/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:75ef9546323b321a451239d886f0cb528f7e98bb294da47a3200effd4e572064", size = 8217060, upload-time = "2025-10-15T15:51:45.033Z" }, - { url = "https://files.pythonhosted.org/packages/47/6f/9fba8abc468c904570699eceeb51588f9622172b8fffa4ab11bcf15598c2/torchvision-0.24.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2efb617667950814fc8bb9437e5893861b3616e214285be33cbc364a3f42c599", size = 4358490, upload-time = "2025-10-15T15:51:43.884Z" }, + { url = "https://files.pythonhosted.org/packages/f7/09/d51aadf8591138e08b74c64a6eb783630c7a31ca2634416277115a9c3a2b/torchvision-0.24.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ded5e625788572e4e1c4d155d1bbc48805c113794100d70e19c76e39e4d53465", size = 1891441, upload-time = "2025-11-12T15:25:01.687Z" }, + { url = "https://files.pythonhosted.org/packages/6b/49/a35df863e7c153aad82af7505abd8264a5b510306689712ef86bea862822/torchvision-0.24.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:54ed17c3d30e718e08d8da3fd5b30ea44b0311317e55647cb97077a29ecbc25b", size = 2386226, upload-time = "2025-11-12T15:25:05.449Z" }, + { url = "https://files.pythonhosted.org/packages/49/20/f2d7cd1eea052887c1083afff0b8df5228ec93b53e03759f20b1a3c6d22a/torchvision-0.24.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f476da4e085b7307aaab6f540219617d46d5926aeda24be33e1359771c83778f", size = 8046093, upload-time = "2025-11-12T15:25:09.425Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cf/0ff4007c09903199307da5f53a192ff5d62b45447069e9ef3a19bdc5ff12/torchvision-0.24.1-cp310-cp310-win_amd64.whl", hash = "sha256:fbdbdae5e540b868a681240b7dbd6473986c862445ee8a138680a6a97d6c34ff", size = 3696202, upload-time = "2025-11-12T15:25:10.657Z" }, + { url = "https://files.pythonhosted.org/packages/e7/69/30f5f03752aa1a7c23931d2519b31e557f3f10af5089d787cddf3b903ecf/torchvision-0.24.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:056c525dc875f18fe8e9c27079ada166a7b2755cea5a2199b0bc7f1f8364e600", size = 1891436, upload-time = "2025-11-12T15:25:04.3Z" }, + { url = "https://files.pythonhosted.org/packages/0c/69/49aae86edb75fe16460b59a191fcc0f568c2378f780bb063850db0fe007a/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1e39619de698e2821d71976c92c8a9e50cdfd1e993507dfb340f2688bfdd8283", size = 2387757, upload-time = "2025-11-12T15:25:06.795Z" }, + { url = "https://files.pythonhosted.org/packages/11/c9/1dfc3db98797b326f1d0c3f3bb61c83b167a813fc7eab6fcd2edb8c7eb9d/torchvision-0.24.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a0f106663e60332aa4fcb1ca2159ef8c3f2ed266b0e6df88de261048a840e0df", size = 8047682, upload-time = "2025-11-12T15:25:21.125Z" }, + { url = "https://files.pythonhosted.org/packages/fa/bb/cfc6a6f6ccc84a534ed1fdf029ae5716dd6ff04e57ed9dc2dab38bf652d5/torchvision-0.24.1-cp311-cp311-win_amd64.whl", hash = "sha256:a9308cdd37d8a42e14a3e7fd9d271830c7fecb150dd929b642f3c1460514599a", size = 4037588, upload-time = "2025-11-12T15:25:14.402Z" }, + { url = "https://files.pythonhosted.org/packages/f0/af/18e2c6b9538a045f60718a0c5a058908ccb24f88fde8e6f0fc12d5ff7bd3/torchvision-0.24.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e48bf6a8ec95872eb45763f06499f87bd2fb246b9b96cb00aae260fda2f96193", size = 1891433, upload-time = "2025-11-12T15:25:03.232Z" }, + { url = "https://files.pythonhosted.org/packages/9d/43/600e5cfb0643d10d633124f5982d7abc2170dfd7ce985584ff16edab3e76/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7fb7590c737ebe3e1c077ad60c0e5e2e56bb26e7bccc3b9d04dbfc34fd09f050", size = 2386737, upload-time = "2025-11-12T15:25:08.288Z" }, + { url = "https://files.pythonhosted.org/packages/93/b1/db2941526ecddd84884132e2742a55c9311296a6a38627f9e2627f5ac889/torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:66a98471fc18cad9064123106d810a75f57f0838eee20edc56233fd8484b0cc7", size = 8049868, upload-time = "2025-11-12T15:25:13.058Z" }, + { url = "https://files.pythonhosted.org/packages/69/98/16e583f59f86cd59949f59d52bfa8fc286f86341a229a9d15cbe7a694f0c/torchvision-0.24.1-cp312-cp312-win_amd64.whl", hash = "sha256:4aa6cb806eb8541e92c9b313e96192c6b826e9eb0042720e2fa250d021079952", size = 4302006, upload-time = "2025-11-12T15:25:16.184Z" }, + { url = "https://files.pythonhosted.org/packages/e4/97/ab40550f482577f2788304c27220e8ba02c63313bd74cf2f8920526aac20/torchvision-0.24.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:8a6696db7fb71eadb2c6a48602106e136c785642e598eb1533e0b27744f2cce6", size = 1891435, upload-time = "2025-11-12T15:25:28.642Z" }, + { url = "https://files.pythonhosted.org/packages/30/65/ac0a3f9be6abdbe4e1d82c915d7e20de97e7fd0e9a277970508b015309f3/torchvision-0.24.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:db2125c46f9cb25dc740be831ce3ce99303cfe60439249a41b04fd9f373be671", size = 2338718, upload-time = "2025-11-12T15:25:26.19Z" }, + { url = "https://files.pythonhosted.org/packages/10/b5/5bba24ff9d325181508501ed7f0c3de8ed3dd2edca0784d48b144b6c5252/torchvision-0.24.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f035f0cacd1f44a8ff6cb7ca3627d84c54d685055961d73a1a9fb9827a5414c8", size = 8049661, upload-time = "2025-11-12T15:25:22.558Z" }, + { url = "https://files.pythonhosted.org/packages/5c/ec/54a96ae9ab6a0dd66d4bba27771f892e36478a9c3489fa56e51c70abcc4d/torchvision-0.24.1-cp313-cp313-win_amd64.whl", hash = "sha256:16274823b93048e0a29d83415166a2e9e0bf4e1b432668357b657612a4802864", size = 4319808, upload-time = "2025-11-12T15:25:17.318Z" }, + { url = "https://files.pythonhosted.org/packages/d5/f3/a90a389a7e547f3eb8821b13f96ea7c0563cdefbbbb60a10e08dda9720ff/torchvision-0.24.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e3f96208b4bef54cd60e415545f5200346a65024e04f29a26cd0006dbf9e8e66", size = 2005342, upload-time = "2025-11-12T15:25:11.871Z" }, + { url = "https://files.pythonhosted.org/packages/a9/fe/ff27d2ed1b524078164bea1062f23d2618a5fc3208e247d6153c18c91a76/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f231f6a4f2aa6522713326d0d2563538fa72d613741ae364f9913027fa52ea35", size = 2341708, upload-time = "2025-11-12T15:25:25.08Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b9/d6c903495cbdfd2533b3ef6f7b5643ff589ea062f8feb5c206ee79b9d9e5/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:1540a9e7f8cf55fe17554482f5a125a7e426347b71de07327d5de6bfd8d17caa", size = 8177239, upload-time = "2025-11-12T15:25:18.554Z" }, + { url = "https://files.pythonhosted.org/packages/4f/2b/ba02e4261369c3798310483028495cf507e6cb3f394f42e4796981ecf3a7/torchvision-0.24.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d83e16d70ea85d2f196d678bfb702c36be7a655b003abed84e465988b6128938", size = 4251604, upload-time = "2025-11-12T15:25:34.069Z" }, + { url = "https://files.pythonhosted.org/packages/42/84/577b2cef8f32094add5f52887867da4c2a3e6b4261538447e9b48eb25812/torchvision-0.24.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cccf4b4fec7fdfcd3431b9ea75d1588c0a8596d0333245dafebee0462abe3388", size = 2005319, upload-time = "2025-11-12T15:25:23.827Z" }, + { url = "https://files.pythonhosted.org/packages/5f/34/ecb786bffe0159a3b49941a61caaae089853132f3cd1e8f555e3621f7e6f/torchvision-0.24.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:1b495edd3a8f9911292424117544f0b4ab780452e998649425d1f4b2bed6695f", size = 2338844, upload-time = "2025-11-12T15:25:32.625Z" }, + { url = "https://files.pythonhosted.org/packages/51/99/a84623786a6969504c87f2dc3892200f586ee13503f519d282faab0bb4f0/torchvision-0.24.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ab211e1807dc3e53acf8f6638df9a7444c80c0ad050466e8d652b3e83776987b", size = 8175144, upload-time = "2025-11-12T15:25:31.355Z" }, + { url = "https://files.pythonhosted.org/packages/6d/ba/8fae3525b233e109317ce6a9c1de922ab2881737b029a7e88021f81e068f/torchvision-0.24.1-cp314-cp314-win_amd64.whl", hash = "sha256:18f9cb60e64b37b551cd605a3d62c15730c086362b40682d23e24b616a697d41", size = 4234459, upload-time = "2025-11-12T15:25:19.859Z" }, + { url = "https://files.pythonhosted.org/packages/50/33/481602c1c72d0485d4b3a6b48c9534b71c2957c9d83bf860eb837bf5a620/torchvision-0.24.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ec9d7379c519428395e4ffda4dbb99ec56be64b0a75b95989e00f9ec7ae0b2d7", size = 2005336, upload-time = "2025-11-12T15:25:27.225Z" }, + { url = "https://files.pythonhosted.org/packages/d0/7f/372de60bf3dd8f5593bd0d03f4aecf0d1fd58f5bc6943618d9d913f5e6d5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:af9201184c2712d808bd4eb656899011afdfce1e83721c7cb08000034df353fe", size = 2341704, upload-time = "2025-11-12T15:25:29.857Z" }, + { url = "https://files.pythonhosted.org/packages/36/9b/0f3b9ff3d0225ee2324ec663de0e7fb3eb855615ca958ac1875f22f1f8e5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9ef95d819fd6df81bc7cc97b8f21a15d2c0d3ac5dbfaab5cbc2d2ce57114b19e", size = 8177422, upload-time = "2025-11-12T15:25:37.357Z" }, + { url = "https://files.pythonhosted.org/packages/d6/ab/e2bcc7c2f13d882a58f8b30ff86f794210b075736587ea50f8c545834f8a/torchvision-0.24.1-cp314-cp314t-win_amd64.whl", hash = "sha256:480b271d6edff83ac2e8d69bbb4cf2073f93366516a50d48f140ccfceedb002e", size = 4335190, upload-time = "2025-11-12T15:25:35.745Z" }, ] [[package]] @@ -5971,8 +5852,7 @@ dependencies = [ { name = "docstring-parser" }, { name = "filelock" }, { name = "fsspec" }, - { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" }, - { name = "importlib-metadata", version = "8.7.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" }, + { name = "importlib-metadata" }, { name = "pyre-extensions" }, { name = "pyyaml" }, { name = "tabulate" }, @@ -5997,27 +5877,70 @@ wheels = [ [[package]] name = "transformer-engine" -version = "2.9.0+70f53666" -source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9#70f536662ae10a62a54f4ed1ba92e3314c5cfd69" } +version = "2.9.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/5c/21152e73aa46ac7c969d694ce86cdeb199024c7810b2d700e900ea4efb1a/transformer_engine-2.9.0-py3-none-any.whl", hash = "sha256:953147ed4c490e54c9884bb0d876a1341f05c5c5b7d304bf61f4740f6faee5af", size = 662107, upload-time = "2025-11-11T15:50:49.167Z" }, +] + +[package.optional-dependencies] +core-cu13 = [ + { name = "transformer-engine-cu13" }, +] +pytorch = [ + { name = "transformer-engine-torch" }, +] + +[[package]] +name = "transformer-engine-cu12" +version = "2.9.0" +source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "einops" }, - { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" } }, - { name = "onnx" }, - { name = "onnxscript", version = "0.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "onnxscript", version = "0.5.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "importlib-metadata" }, { name = "packaging" }, { name = "pydantic" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/af/1c449ad0c43d3d6b5c529c812a4e8338b20965ae5361a9b612c7dce21e4d/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:81162874c0618f3e62eb5ffba0bb1b608b4e56d70238205b1dced7ee965d82b3", size = 303669451, upload-time = "2025-11-11T15:54:12.008Z" }, + { url = "https://files.pythonhosted.org/packages/82/21/aa351994d8ade95681763df2b10770c768900ecc7f1cedbfa4e89fe1935a/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ad14981cbbd964f8e4446c35199d1bc5349ea30244e76bc57c1cceb5d469dd24", size = 304164366, upload-time = "2025-11-11T15:50:22.169Z" }, +] + +[[package]] +name = "transformer-engine-cu13" +version = "2.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "packaging" }, + { name = "pydantic" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/b9/c1c788875848bf50faa22749107d91e92e9c0c78bb1878b99939209e40f9/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:590aaeb3a4d552fe9ebc7019d43315f3e61153fcd1c5a07dc0c90bd8b278316e", size = 185010342, upload-time = "2025-11-13T22:35:04.742Z" }, + { url = "https://files.pythonhosted.org/packages/95/7f/3019c21565f63eeb79d24fa7d3bae39b5b73f21c72d7d5123d21d7ce945a/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:4e869f5a0fd74aaa05a5d801a96688ed21827d23efe9774bd3038d5f2802ef46", size = 185669069, upload-time = "2025-11-13T22:35:13.709Z" }, +] + +[[package]] +name = "transformer-engine-torch" +version = "2.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "onnx" }, + { name = "onnxscript" }, { name = "torch", marker = "sys_platform == 'never'" }, + { name = "transformer-engine-cu12" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/a2/a3/401d741eceb8f402595e63ee0b1828d60cae988b22f2f23c9cfcc24185bd/transformer_engine_torch-2.9.0.tar.gz", hash = "sha256:abbc59f6acf635abf865085ecdf90e7d4ca9a3782bc91a9845e38adb2655a547", size = 215138, upload-time = "2025-11-11T15:49:04.258Z" } [[package]] name = "transformers" -version = "4.57.1" +version = "4.57.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "huggingface-hub" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, @@ -6026,39 +5949,39 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" }, ] [[package]] name = "triton" -version = "3.5.0" +version = "3.5.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/22/507b6f58a35e05e84381630b2dc2a3cee1a7a2a7eaf4cba857c638a18a24/triton-3.5.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f90de6a6566bb619b4c0adc9855729e1b1b5e26533fca1bf6206e96b6d277a3", size = 159827599, upload-time = "2025-10-15T19:15:43.87Z" }, - { url = "https://files.pythonhosted.org/packages/0b/eb/09e31d107a5d00eb281aa7e6635ca463e9bca86515944e399480eadb71f8/triton-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5d3b3d480debf24eaa739623c9a42446b0b77f95593d30eb1f64cd2278cc1f0", size = 170333110, upload-time = "2025-10-13T16:37:49.588Z" }, - { url = "https://files.pythonhosted.org/packages/79/f9/b6f60f978397c616fd8dacca2305759fe4f80d397b20ef72534803244bd5/triton-3.5.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8457b22148defefdcb7fa8144b05ce211b9faefad650a1ce85b23df488d5549c", size = 159926731, upload-time = "2025-10-15T19:15:49.682Z" }, - { url = "https://files.pythonhosted.org/packages/3d/78/949a04391c21956c816523678f0e5fa308eb5b1e7622d88c4e4ef5fceca0/triton-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f34bfa21c5b3a203c0f0eab28dcc1e49bd1f67d22724e77fb6665a659200a4ec", size = 170433488, upload-time = "2025-10-13T16:37:57.132Z" }, - { url = "https://files.pythonhosted.org/packages/87/9b/30988039e1e84df7554fba24e6a734d2d0e847af33cabdf9b532b3c51456/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da21fccceafc163e3a5e857abe34351ef76345af06cabf9637a914742671f0b", size = 159946647, upload-time = "2025-10-15T19:15:56.325Z" }, - { url = "https://files.pythonhosted.org/packages/f5/3a/e991574f3102147b642e49637e0281e9bb7c4ba254edb2bab78247c85e01/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9e71db82261c4ffa3921cd050cd5faa18322d2d405c30eb56084afaff3b0833", size = 170476535, upload-time = "2025-10-13T16:38:05.18Z" }, - { url = "https://files.pythonhosted.org/packages/cd/85/e37f1197acb04c8f3d83851d23d5d6ed5060ef74580668b112e23fdfa203/triton-3.5.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:188da5b81fa2f8322c27fec1627703eac24cb9bb7ab0dfbe9925973bc1b070d3", size = 159958970, upload-time = "2025-10-15T19:16:01.717Z" }, - { url = "https://files.pythonhosted.org/packages/6c/29/10728de8a6e932e517c10773486b8e99f85d1b1d9dd87d9a9616e1fef4a1/triton-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e6bb9aa5519c084a333acdba443789e50012a4b851cd486c54f0b8dc2a8d3a12", size = 170487289, upload-time = "2025-10-13T16:38:11.662Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1d/38258f05010ac17a7b058c022911c9cae6526e149b7397134a048cf5a6c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03127d9b33aaf979c856676b394bc059ec1d68cb6da68ae03f62dd8ad77a04ae", size = 160073012, upload-time = "2025-10-15T19:16:07.477Z" }, - { url = "https://files.pythonhosted.org/packages/5c/38/db80e48b9220c9bce872b0f616ad0446cdf554a40b85c7865cbca99ab3c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c83f2343e1a220a716c7b3ab9fccfcbe3ad4020d189549200e2d2e8d5868bed9", size = 170577179, upload-time = "2025-10-13T16:38:17.865Z" }, - { url = "https://files.pythonhosted.org/packages/91/fe/8f5771d00227f4eb1ee034f218ed427102b989366d2275fe3b3c105a3921/triton-3.5.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468936651d383f4a6d10068d34a627505e13af55be5d002b9f27b987e7a5f0ac", size = 159957460, upload-time = "2025-10-15T19:16:12.626Z" }, - { url = "https://files.pythonhosted.org/packages/ff/60/1810655d1d856c9a4fcc90ee8966d85f552d98c53a6589f95ab2cbe27bb8/triton-3.5.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da0fa67ccd76c3dcfb0bffe1b1c57c685136a6bd33d141c24d9655d4185b1289", size = 170487949, upload-time = "2025-10-13T16:38:24.881Z" }, - { url = "https://files.pythonhosted.org/packages/78/59/99edd103958fe6e42b50b9ad8ce4f223ddf4ccf475259cf7d2b53381dc6c/triton-3.5.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7ceef21410229ac23173a28eee5cfc0e37c1dfdb8b4bc11ecda2e3ecec7c686", size = 160075629, upload-time = "2025-10-15T19:16:18.746Z" }, - { url = "https://files.pythonhosted.org/packages/fb/b7/1dec8433ac604c061173d0589d99217fe7bf90a70bdc375e745d044b8aad/triton-3.5.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:317fe477ea8fd4524a6a8c499fb0a36984a56d0b75bf9c9cb6133a1c56d5a6e7", size = 170580176, upload-time = "2025-10-13T16:38:31.14Z" }, + { url = "https://files.pythonhosted.org/packages/d9/2e/f95e673222afa2c7f0c687d8913e98fcf2589ef0b1405de76894e37fe18f/triton-3.5.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f63e34dcb32d7bd3a1d0195f60f30d2aee8b08a69a0424189b71017e23dfc3d2", size = 159821655, upload-time = "2025-11-11T17:51:44.09Z" }, + { url = "https://files.pythonhosted.org/packages/fd/6e/676ab5019b4dde8b9b7bab71245102fc02778ef3df48218b298686b9ffd6/triton-3.5.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fc53d849f879911ea13f4a877243afc513187bc7ee92d1f2c0f1ba3169e3c94", size = 170320692, upload-time = "2025-11-11T17:40:46.074Z" }, + { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" }, + { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" }, + { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" }, + { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ba/805684a992ee32d486b7948d36aed2f5e3c643fc63883bf8bdca1c3f3980/triton-3.5.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56765ffe12c554cd560698398b8a268db1f616c120007bfd8829d27139abd24a", size = 159955460, upload-time = "2025-11-11T17:52:01.861Z" }, + { url = "https://files.pythonhosted.org/packages/27/46/8c3bbb5b0a19313f50edcaa363b599e5a1a5ac9683ead82b9b80fe497c8d/triton-3.5.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3f4346b6ebbd4fad18773f5ba839114f4826037c9f2f34e0148894cd5dd3dba", size = 170470410, upload-time = "2025-11-11T17:41:06.319Z" }, + { url = "https://files.pythonhosted.org/packages/84/1e/7df59baef41931e21159371c481c31a517ff4c2517343b62503d0cd2be99/triton-3.5.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02c770856f5e407d24d28ddc66e33cf026e6f4d360dcb8b2fabe6ea1fc758621", size = 160072799, upload-time = "2025-11-11T17:52:07.293Z" }, + { url = "https://files.pythonhosted.org/packages/37/92/e97fcc6b2c27cdb87ce5ee063d77f8f26f19f06916aa680464c8104ef0f6/triton-3.5.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b4d2c70127fca6a23e247f9348b8adde979d2e7a20391bfbabaac6aebc7e6a8", size = 170579924, upload-time = "2025-11-11T17:41:12.455Z" }, + { url = "https://files.pythonhosted.org/packages/14/f9/0430e879c1e63a1016cb843261528fd3187c872c3a9539132efc39514753/triton-3.5.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f617aa7925f9ea9968ec2e1adaf93e87864ff51549c8f04ce658f29bbdb71e2d", size = 159956163, upload-time = "2025-11-11T17:52:12.999Z" }, + { url = "https://files.pythonhosted.org/packages/a4/e6/c595c35e5c50c4bc56a7bac96493dad321e9e29b953b526bbbe20f9911d0/triton-3.5.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0637b1efb1db599a8e9dc960d53ab6e4637db7d4ab6630a0974705d77b14b60", size = 170480488, upload-time = "2025-11-11T17:41:18.222Z" }, + { url = "https://files.pythonhosted.org/packages/41/1e/63d367c576c75919e268e4fbc33c1cb33b6dc12bb85e8bfe531c2a8bd5d3/triton-3.5.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8932391d7f93698dfe5bc9bead77c47a24f97329e9f20c10786bb230a9083f56", size = 160073620, upload-time = "2025-11-11T17:52:18.403Z" }, + { url = "https://files.pythonhosted.org/packages/16/b5/b0d3d8b901b6a04ca38df5e24c27e53afb15b93624d7fd7d658c7cd9352a/triton-3.5.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bac7f7d959ad0f48c0e97d6643a1cc0fd5786fe61cb1f83b537c6b2d54776478", size = 170582192, upload-time = "2025-11-11T17:41:23.963Z" }, ] [[package]] name = "trove-classifiers" -version = "2025.9.11.17" +version = "2025.11.14.15" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/9a/778622bc06632529817c3c524c82749a112603ae2bbcf72ee3eb33a2c4f1/trove_classifiers-2025.9.11.17.tar.gz", hash = "sha256:931ca9841a5e9c9408bc2ae67b50d28acf85bef56219b56860876dd1f2d024dd", size = 16975, upload-time = "2025-09-11T17:07:50.97Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/a9/880cccf76af9e7b322112f52e4e2dbb3534cbe671197b8f443a42189dfc7/trove_classifiers-2025.11.14.15.tar.gz", hash = "sha256:6b60f49d40bbd895bc61d8dc414fc2f2286d70eb72ed23548db8cf94f62804ca", size = 16995, upload-time = "2025-11-14T15:23:13.78Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/85/a4ff8758c66f1fc32aa5e9a145908394bf9cf1c79ffd1113cfdeb77e74e4/trove_classifiers-2025.9.11.17-py3-none-any.whl", hash = "sha256:5d392f2d244deb1866556457d6f3516792124a23d1c3a463a2e8668a5d1c15dd", size = 14158, upload-time = "2025-09-11T17:07:49.886Z" }, + { url = "https://files.pythonhosted.org/packages/49/f6/73c4aa003d1237ee9bea8a46f49dc38c45dfe95af4f0da7e60678d388011/trove_classifiers-2025.11.14.15-py3-none-any.whl", hash = "sha256:d1dac259c1e908939862e3331177931c6df0a37af2c1a8debcc603d9115fcdd9", size = 14191, upload-time = "2025-11-14T15:23:12.467Z" }, ] [[package]] @@ -6144,7 +6067,7 @@ wheels = [ [[package]] name = "wandb" -version = "0.22.3" +version = "0.23.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -6158,17 +6081,17 @@ dependencies = [ { name = "sentry-sdk" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c1/d1/6b70f365ed86bd69debba8ad55dec8606fc21006e7ca703a5a091bd3b719/wandb-0.22.3.tar.gz", hash = "sha256:04468a8ab2769a46f5e384c9c4ada5da0dced005ca689a8424e4b8b5cb2a0291", size = 44337368, upload-time = "2025-10-28T23:59:10.275Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/8b/db2d44395c967cd452517311fd6ede5d1e07310769f448358d4874248512/wandb-0.23.0.tar.gz", hash = "sha256:e5f98c61a8acc3ee84583ca78057f64344162ce026b9f71cb06eea44aec27c93", size = 44413921, upload-time = "2025-11-11T21:06:30.737Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/23/02/87fb60f587ec249f784a40bd91c30de1b2b24d691ee72675d5b66c3d0728/wandb-0.22.3-py3-none-macosx_12_0_arm64.whl", hash = "sha256:81b3b6e405f38342b0a080898b7d00c5b9375432f5ba358942a09e65cdcfe781", size = 18758047, upload-time = "2025-10-28T23:58:46.56Z" }, - { url = "https://files.pythonhosted.org/packages/26/88/64081740ef2b2efc7fbcb2139a07a849e42bcb09ae0c56ae50c41bd0ad63/wandb-0.22.3-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:d29c16817cca6401b4919069ec7570c781eacb67dc0b1ff2e0096a9a59581720", size = 19798011, upload-time = "2025-10-28T23:58:49.718Z" }, - { url = "https://files.pythonhosted.org/packages/19/72/c4f922b33dbb84d1c81ee045ff8791dd14e26d79e1e9bbafff964b7043e2/wandb-0.22.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb955d73a4ba55df9adc61fafbabef5556784d33fc39c7b5c8165d2694ddeb3b", size = 18542713, upload-time = "2025-10-28T23:58:51.927Z" }, - { url = "https://files.pythonhosted.org/packages/ad/98/3ce5f6e2086d91b0c51b38ae7ff591109e7da2bb25fe1a12eec0cdbaa494/wandb-0.22.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f3ebe41a26506117a098fdfd2706ed0e50b37899bfbefe3a0628fcbd70c69d", size = 19984910, upload-time = "2025-10-28T23:58:54.641Z" }, - { url = "https://files.pythonhosted.org/packages/5e/57/e68cb38427b60490d6ddf1b992e6c7f36be83be1079d291ce87a8d347f48/wandb-0.22.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2973462bed5d4a653b1a97cf9fc350673bb200fb356a2f4eba34beae9b87e0aa", size = 18581776, upload-time = "2025-10-28T23:58:56.975Z" }, - { url = "https://files.pythonhosted.org/packages/66/6d/543f907ce0c6b6da13628b23d19ca7282c559fd73eb47b04977b9a61d0c6/wandb-0.22.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c5c2bd18f95c1639863c527da0a5818ac6b0e5194f9c691426b265908ddd8b2c", size = 20078800, upload-time = "2025-10-28T23:58:59.217Z" }, - { url = "https://files.pythonhosted.org/packages/da/91/1decaf1a6ac2017481c782e0fad7f90bc9ae4057f3d76d478cb6527f3dd3/wandb-0.22.3-py3-none-win32.whl", hash = "sha256:09ca1edfe0fd6dc30447d368acddb825668e60ee705c98594a6bbfd30d34d47e", size = 19160297, upload-time = "2025-10-28T23:59:01.536Z" }, - { url = "https://files.pythonhosted.org/packages/4c/ba/3b092634279994b0c79fe05220532822be09f3a353ae95c54e7142769db8/wandb-0.22.3-py3-none-win_amd64.whl", hash = "sha256:55403bf93872c9978433d101324f51e43e78c70c809bf6d06ca7b2760e39f497", size = 19160300, upload-time = "2025-10-28T23:59:04.06Z" }, - { url = "https://files.pythonhosted.org/packages/7f/80/4662fce9eebcc8c71f5083e9152ccaf7d43d4ca9c446e1422f9aa784a51c/wandb-0.22.3-py3-none-win_arm64.whl", hash = "sha256:49f66b05882abfa53816cc8d01b3c2435a89c5a090176802fa6928b5979d34d9", size = 17461959, upload-time = "2025-10-28T23:59:07.059Z" }, + { url = "https://files.pythonhosted.org/packages/41/61/a3220c7fa4cadfb2b2a5c09e3fa401787326584ade86d7c1f58bf1cd43bd/wandb-0.23.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:b682ec5e38fc97bd2e868ac7615a0ab4fc6a15220ee1159e87270a5ebb7a816d", size = 18992250, upload-time = "2025-11-11T21:06:03.412Z" }, + { url = "https://files.pythonhosted.org/packages/90/16/e69333cf3d11e7847f424afc6c8ae325e1f6061b2e5118d7a17f41b6525d/wandb-0.23.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:ec094eb71b778e77db8c188da19e52c4f96cb9d5b4421d7dc05028afc66fd7e7", size = 20045616, upload-time = "2025-11-11T21:06:07.109Z" }, + { url = "https://files.pythonhosted.org/packages/62/79/42dc6c7bb0b425775fe77f1a3f1a22d75d392841a06b43e150a3a7f2553a/wandb-0.23.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e43f1f04b98c34f407dcd2744cec0a590abce39bed14a61358287f817514a7b", size = 18758848, upload-time = "2025-11-11T21:06:09.832Z" }, + { url = "https://files.pythonhosted.org/packages/b8/94/d6ddb78334996ccfc1179444bfcfc0f37ffd07ee79bb98940466da6f68f8/wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5847f98cbb3175caf5291932374410141f5bb3b7c25f9c5e562c1988ce0bf5", size = 20231493, upload-time = "2025-11-11T21:06:12.323Z" }, + { url = "https://files.pythonhosted.org/packages/52/4d/0ad6df0e750c19dabd24d2cecad0938964f69a072f05fbdab7281bec2b64/wandb-0.23.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6151355fd922539926e870be811474238c9614b96541773b990f1ce53368aef6", size = 18793473, upload-time = "2025-11-11T21:06:14.967Z" }, + { url = "https://files.pythonhosted.org/packages/f8/da/c2ba49c5573dff93dafc0acce691bb1c3d57361bf834b2f2c58e6193439b/wandb-0.23.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df62e426e448ebc44269140deb7240df474e743b12d4b1f53b753afde4aa06d4", size = 20332882, upload-time = "2025-11-11T21:06:17.865Z" }, + { url = "https://files.pythonhosted.org/packages/40/65/21bfb10ee5cd93fbcaf794958863c7e05bac4bbeb1cc1b652094aa3743a5/wandb-0.23.0-py3-none-win32.whl", hash = "sha256:6c21d3eadda17aef7df6febdffdddfb0b4835c7754435fc4fe27631724269f5c", size = 19433198, upload-time = "2025-11-11T21:06:21.913Z" }, + { url = "https://files.pythonhosted.org/packages/f1/33/cbe79e66c171204e32cf940c7fdfb8b5f7d2af7a00f301c632f3a38aa84b/wandb-0.23.0-py3-none-win_amd64.whl", hash = "sha256:b50635fa0e16e528bde25715bf446e9153368428634ca7a5dbd7a22c8ae4e915", size = 19433201, upload-time = "2025-11-11T21:06:24.607Z" }, + { url = "https://files.pythonhosted.org/packages/1c/a0/5ecfae12d78ea036a746c071e4c13b54b28d641efbba61d2947c73b3e6f9/wandb-0.23.0-py3-none-win_arm64.whl", hash = "sha256:fa0181b02ce4d1993588f4a728d8b73ae487eb3cb341e6ce01c156be7a98ec72", size = 17678649, upload-time = "2025-11-11T21:06:27.289Z" }, ] [[package]] @@ -6301,7 +6224,8 @@ version = "1.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "braceexpand" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pyyaml" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" } @@ -6399,22 +6323,6 @@ wheels = [ name = "wrapt" version = "1.17.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3f/23/bb82321b86411eb51e5a5db3fb8f8032fd30bd7c2d74bfe936136b2fa1d6/wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04", size = 53482, upload-time = "2025-08-12T05:51:44.467Z" }, @@ -6480,131 +6388,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, ] -[[package]] -name = "wrapt" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", - "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'", -] -sdist = { url = "https://files.pythonhosted.org/packages/49/19/5e5bcd855d808892fe02d49219f97a50f64cd6d8313d75df3494ee97b1a3/wrapt-2.0.0.tar.gz", hash = "sha256:35a542cc7a962331d0279735c30995b024e852cf40481e384fd63caaa391cbb9", size = 81722, upload-time = "2025-10-19T23:47:54.07Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ee/db/ac9546e89b645e525686727f8749847485e3b45ffc4507b61c4669358638/wrapt-2.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a7cebcee61f21b1e46aa32db8d9d93826d0fbf1ad85defc2ccfb93b4adef1435", size = 77431, upload-time = "2025-10-19T23:45:25.177Z" }, - { url = "https://files.pythonhosted.org/packages/74/bc/3b57c8012bbd0d02eec5ae838681c1a819df6c5e765ebc897f52623b5eb1/wrapt-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:827e6e3a3a560f6ec1f5ee92d4319c21a0549384f896ec692f3201eda31ebd11", size = 60644, upload-time = "2025-10-19T23:45:27.511Z" }, - { url = "https://files.pythonhosted.org/packages/b8/6e/b5e7d47713e3d46c30ec6ae83fafd369bc34de8148668c6e3168d9301863/wrapt-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a91075a5383a7cbfe46aed1845ef7c3f027e8e20e7d9a8a75e36ebc9b0dd15e", size = 61526, upload-time = "2025-10-19T23:45:28.789Z" }, - { url = "https://files.pythonhosted.org/packages/28/8d/d5df2af58ae479785473607a3b25726c295640cdcaee830847cee339eff9/wrapt-2.0.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b6a18c813196e18146b8d041e20875bdb0cb09b94ac1d1e1146e0fa87b2deb0d", size = 113638, upload-time = "2025-10-19T23:45:31.977Z" }, - { url = "https://files.pythonhosted.org/packages/f9/b7/9501c45ab93b4d6ba396ef02fcfb55867866bc8579fff045bb54cae58423/wrapt-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec5028d26011a53c76bd91bb6198b30b438c6e0f7adb45f2ad84fe2655b6a104", size = 115651, upload-time = "2025-10-19T23:45:33.257Z" }, - { url = "https://files.pythonhosted.org/packages/5e/3a/bfebe2ba51cf98ae80c5dbb6fa5892ae75d1acf1a4c404eda88e28f5ab06/wrapt-2.0.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bed9b04900204721a24bcefc652ca267b01c1e8ad8bc8c0cff81558a45a3aadc", size = 112060, upload-time = "2025-10-19T23:45:30.298Z" }, - { url = "https://files.pythonhosted.org/packages/00/e7/cd50a32bed022d98f61a90e57faf782aa063f7930f57eb67eb105d3189be/wrapt-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:03442f2b45fa3f2b98a94a1917f52fb34670de8f96c0a009c02dbd512d855a3d", size = 114829, upload-time = "2025-10-19T23:45:34.23Z" }, - { url = "https://files.pythonhosted.org/packages/9d/2c/c709578271df0c70a27ab8f797c44c258650f24a32b452f03d7afedc070d/wrapt-2.0.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:17d0b5c42495ba142a1cee52b76414f9210591c84aae94dffda70240753bfb3c", size = 111249, upload-time = "2025-10-19T23:45:35.554Z" }, - { url = "https://files.pythonhosted.org/packages/60/ef/cb58f6eea41f129600bda68d1ae4c80b14d4e0663eec1d5220cbffe50be5/wrapt-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ee44215e7d13e112a8fc74e12ed1a1f41cab2bc07b11cc703f2398cd114b261c", size = 113312, upload-time = "2025-10-19T23:45:36.66Z" }, - { url = "https://files.pythonhosted.org/packages/59/55/97e6c4e1c175fb27f8dec717a3e36493ff0c4e50173a95f439496556910f/wrapt-2.0.0-cp310-cp310-win32.whl", hash = "sha256:fe6eafac3bc3c957ab6597a0c0654a0a308868458d00d218743e5b5fae51951c", size = 57961, upload-time = "2025-10-19T23:45:40.958Z" }, - { url = "https://files.pythonhosted.org/packages/3b/0a/898b1d81ae1f3dd9a79fd2e0330a7c8dd793982f815a318548777cb21ee5/wrapt-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9e070c3491397fba0445b8977900271eca9656570cca7c900d9b9352186703a0", size = 60311, upload-time = "2025-10-19T23:45:38.033Z" }, - { url = "https://files.pythonhosted.org/packages/44/f1/e7e92f9535f5624ee22879f09456df9d1f1ae9bb338eef711077b48e456a/wrapt-2.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:806e2e73186eb5e3546f39fb5d0405040e0088db0fc8b2f667fd1863de2b3c99", size = 58822, upload-time = "2025-10-19T23:45:39.785Z" }, - { url = "https://files.pythonhosted.org/packages/12/8f/8e4c8b6da60b4205191d588cbac448fb9ff4f5ed89f4e555dc4813ab30cf/wrapt-2.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b7e221abb6c5387819db9323dac3c875b459695057449634f1111955d753c621", size = 77433, upload-time = "2025-10-19T23:45:42.543Z" }, - { url = "https://files.pythonhosted.org/packages/22/9a/01a29ccb029aa8e78241f8b53cb89ae8826c240129abbbb6ebba3416eff9/wrapt-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1147a84c8fc852426580af8b6e33138461ddbc65aa459a25ea539374d32069fa", size = 60641, upload-time = "2025-10-19T23:45:43.866Z" }, - { url = "https://files.pythonhosted.org/packages/3d/ec/e058997971428b7665b5c3665a55b18bb251ea7e08d002925e3ca017c020/wrapt-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6691d4a711504a0bc10de789842ad6ac627bed22937b10f37a1211a8ab7bb3", size = 61526, upload-time = "2025-10-19T23:45:44.839Z" }, - { url = "https://files.pythonhosted.org/packages/70/c3/c82263503f554715aa1847e85dc75a69631a54e9d7ab0f1a55e34a22d44a/wrapt-2.0.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f460e1eb8e75a17c3918c8e35ba57625721eef2439ef0bcf05304ac278a65e1d", size = 114069, upload-time = "2025-10-19T23:45:47.223Z" }, - { url = "https://files.pythonhosted.org/packages/dc/97/d95e88a3a1bc2890a1aa47880c2762cf0eb6d231b5a64048e351cec6f071/wrapt-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12c37784b77bf043bf65cc96c7195a5db474b8e54173208af076bdbb61df7b3e", size = 116109, upload-time = "2025-10-19T23:45:48.252Z" }, - { url = "https://files.pythonhosted.org/packages/dc/36/cba0bf954f2303897b80fa5342499b43f8c5201110dddf0d578d6841b149/wrapt-2.0.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:75e5c049eb583835f7a0e0e311d9dde9bfbaac723a6dd89d052540f9b2809977", size = 112500, upload-time = "2025-10-19T23:45:45.838Z" }, - { url = "https://files.pythonhosted.org/packages/d7/2b/8cb88e63bec989f641d208acb3fd198bfdbbb4ef7dfb71f0cac3c90b07a9/wrapt-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e50bcbd5b65dac21b82319fcf18486e6ac439947e9305034b00704eb7405f553", size = 115356, upload-time = "2025-10-19T23:45:49.249Z" }, - { url = "https://files.pythonhosted.org/packages/bb/60/a6d5fb94648cd430648705bef9f4241bd22ead123ead552b6d2873ad5240/wrapt-2.0.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:06b78cb6b9320f57737a52fede882640d93cface98332d1a3df0c5696ec9ae9f", size = 111754, upload-time = "2025-10-19T23:45:51.21Z" }, - { url = "https://files.pythonhosted.org/packages/d0/44/1963854edf0592ae806307899dc7bf891e76cec19e598f55845c94603a65/wrapt-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8c8349ebfc3cd98bc9105e0112dd8c8ac1f3c7cb5601f9d02248cae83a63f748", size = 113789, upload-time = "2025-10-19T23:45:52.473Z" }, - { url = "https://files.pythonhosted.org/packages/62/ec/4b1d76cb6d96ac511aaaa92efc57f528e57f06082a595b8b2663fcdb0f20/wrapt-2.0.0-cp311-cp311-win32.whl", hash = "sha256:028f19ec29e204fe725139d4a8b09f77ecfb64f8f02b7ab5ee822c85e330b68b", size = 57954, upload-time = "2025-10-19T23:45:57.03Z" }, - { url = "https://files.pythonhosted.org/packages/d4/cf/df8ff9bd64d4a75f9a9f6c1c93480a51904d0c9bd71c11994301c47d8a33/wrapt-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:c6961f05e58d919153ba311b397b7b904b907132b7b8344dde47865d4bb5ec89", size = 60308, upload-time = "2025-10-19T23:45:54.314Z" }, - { url = "https://files.pythonhosted.org/packages/69/d8/61e245fe387d58d84b3f913d5da9d909c4f239b887db692a05105aaf2a1b/wrapt-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:be7e316c2accd5a31dbcc230de19e2a846a325f8967fdea72704d00e38e6af06", size = 58822, upload-time = "2025-10-19T23:45:55.772Z" }, - { url = "https://files.pythonhosted.org/packages/3c/28/7f266b5bf50c3ad0c99c524d99faa0f7d6eecb045d950e7d2c9e1f0e1338/wrapt-2.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73c6f734aecb1a030d9a265c13a425897e1ea821b73249bb14471445467ca71c", size = 78078, upload-time = "2025-10-19T23:45:58.855Z" }, - { url = "https://files.pythonhosted.org/packages/06/0c/bbdcad7eb535fae9d6b0fcfa3995c364797cd8e2b423bba5559ab2d88dcf/wrapt-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b4a7f8023b8ce8a36370154733c747f8d65c8697cb977d8b6efeb89291fff23e", size = 61158, upload-time = "2025-10-19T23:46:00.096Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8a/bba3e7a4ebf4d1624103ee59d97b78a1fbb08fb5753ff5d1b69f5ef5e863/wrapt-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a1cb62f686c50e9dab5983c68f6c8e9cbf14a6007935e683662898a7d892fa69", size = 61646, upload-time = "2025-10-19T23:46:01.279Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0c/0f565294897a72493dbafe7b46229b5f09f3776795a894d6b737e98387de/wrapt-2.0.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:43dc0550ae15e33e6bb45a82a5e1b5495be2587fbaa996244b509921810ee49f", size = 121442, upload-time = "2025-10-19T23:46:04.287Z" }, - { url = "https://files.pythonhosted.org/packages/da/80/7f03501a8a078ad79b19b1a888f9192a9494e62ddf8985267902766a4f30/wrapt-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39c5b45b056d630545e40674d1f5e1b51864b3546f25ab6a4a331943de96262e", size = 123018, upload-time = "2025-10-19T23:46:06.052Z" }, - { url = "https://files.pythonhosted.org/packages/37/6b/ad0e1ff98359f13b4b0c2c52848e792841146fe79ac5f56899b9a028fc0d/wrapt-2.0.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:804e88f824b76240a1b670330637ccfd2d18b9efa3bb4f02eb20b2f64880b324", size = 117369, upload-time = "2025-10-19T23:46:02.53Z" }, - { url = "https://files.pythonhosted.org/packages/ac/6c/a90437bba8cb1ce2ed639af979515e09784678c2a7f4ffc79f2cf7de809e/wrapt-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c2c476aa3fc2b9899c3f7b20963fac4f952e7edb74a31fc92f7745389a2e3618", size = 121453, upload-time = "2025-10-19T23:46:07.747Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a9/b3982f9bd15bd45857a23c48b7c36e47d05db4a4dcc5061c31f169238845/wrapt-2.0.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8d851e526891216f89fcb7a1820dad9bd503ba3468fb9635ee28e93c781aa98e", size = 116250, upload-time = "2025-10-19T23:46:09.385Z" }, - { url = "https://files.pythonhosted.org/packages/73/e2/b7a8b1afac9f791d8f5eac0d9726559f1d7ec4a2b5a6b4e67ac145b007a5/wrapt-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b95733c2360c4a8656ee93c7af78e84c0bd617da04a236d7a456c8faa34e7a2d", size = 120575, upload-time = "2025-10-19T23:46:11.882Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0f/37920eeea96094f450ae35505d39f1135df951a2cdee0d4e01d4f843396a/wrapt-2.0.0-cp312-cp312-win32.whl", hash = "sha256:ea56817176834edf143df1109ae8fdaa087be82fdad3492648de0baa8ae82bf2", size = 58175, upload-time = "2025-10-19T23:46:15.678Z" }, - { url = "https://files.pythonhosted.org/packages/f0/db/b395f3b0c7f2c60d9219afacc54ceb699801ccf2d3d969ba556dc6d3af20/wrapt-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:3c7d3bee7be7a2665286103f4d1f15405c8074e6e1f89dac5774f9357c9a3809", size = 60415, upload-time = "2025-10-19T23:46:12.913Z" }, - { url = "https://files.pythonhosted.org/packages/86/22/33d660214548af47fc59d9eec8c0e0693bcedc5b3a0b52e8cbdd61f3b646/wrapt-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:680f707e1d26acbc60926659799b15659f077df5897a6791c7c598a5d4a211c4", size = 58911, upload-time = "2025-10-19T23:46:13.889Z" }, - { url = "https://files.pythonhosted.org/packages/18/0a/dd88abfe756b1aa79f0777e5ee4ce9e4b5dc4999bd805e9b04b52efc7b18/wrapt-2.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e2ea096db28d5eb64d381af0e93464621ace38a7003a364b6b5ffb7dd713aabe", size = 78083, upload-time = "2025-10-19T23:46:16.937Z" }, - { url = "https://files.pythonhosted.org/packages/7f/b9/8afebc1655a863bb2178b23c2d699b8743f3a7dab466904adc6155f3c858/wrapt-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c92b5a82d28491e3f14f037e1aae99a27a5e6e0bb161e65f52c0445a3fa7c940", size = 61156, upload-time = "2025-10-19T23:46:17.927Z" }, - { url = "https://files.pythonhosted.org/packages/bb/8b/f710a6528ccc52e21943f42c8cf64814cde90f9adbd3bcd58c7c274b4f75/wrapt-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81d234718aabe632d179fac52c7f69f0f99fbaac4d4bcd670e62462bbcbfcad7", size = 61641, upload-time = "2025-10-19T23:46:19.229Z" }, - { url = "https://files.pythonhosted.org/packages/e4/5f/e4eabd0cc6684c5b208c2abc5c3459449c4d15be1694a9bbcf51e0e135fd/wrapt-2.0.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:db2eea83c43f84e4e41dbbb4c1de371a53166e55f900a6b130c3ef51c6345c1a", size = 121454, upload-time = "2025-10-19T23:46:21.808Z" }, - { url = "https://files.pythonhosted.org/packages/6f/c4/ec31ee17cc7866960d323609ba7402be786d211a6d713a59f776c4270bb3/wrapt-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65f50e356c425c061e1e17fe687ff30e294fed9bf3441dc1f13ef73859c2a817", size = 123063, upload-time = "2025-10-19T23:46:23.545Z" }, - { url = "https://files.pythonhosted.org/packages/b0/2b/a4b10c3c0022e40aeae9bec009bafb049f440493f0575ebb27ecf61c32f8/wrapt-2.0.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:887f2a667e3cbfb19e204032d42ad7dedaa43972e4861dc7a3d51ae951d9b578", size = 117401, upload-time = "2025-10-19T23:46:20.433Z" }, - { url = "https://files.pythonhosted.org/packages/2a/4a/ade23a76967e1f148e461076a4d0e24a7950a5f18b394c9107fe60224ae2/wrapt-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9054829da4be461e3ad3192e4b6bbf1fc18af64c9975ce613aec191924e004dc", size = 121485, upload-time = "2025-10-19T23:46:24.85Z" }, - { url = "https://files.pythonhosted.org/packages/cb/ba/33b5f3e2edede4e1cfd259f0d9c203cf370f259bb9b215dd58fc6cbb94e9/wrapt-2.0.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:b952ffd77133a5a2798ee3feb18e51b0a299d2f440961e5bb7737dbb02e57289", size = 116276, upload-time = "2025-10-19T23:46:27.006Z" }, - { url = "https://files.pythonhosted.org/packages/eb/bf/b7f95bb4529a35ca11eb95d48f9d1a563b495471f7cf404c644566fb4293/wrapt-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e25fde03c480061b8234d8ee4863eb5f40a9be4fb258ce105b364de38fc6bcf9", size = 120578, upload-time = "2025-10-19T23:46:28.679Z" }, - { url = "https://files.pythonhosted.org/packages/f8/71/984849df6f052592474a44aafd6b847e1cffad39b0debc5390a04aa46331/wrapt-2.0.0-cp313-cp313-win32.whl", hash = "sha256:49e982b7860d325094978292a49e0418833fc7fc42c0dc7cd0b7524d7d06ee74", size = 58178, upload-time = "2025-10-19T23:46:32.372Z" }, - { url = "https://files.pythonhosted.org/packages/f9/3b/4e1fc0f2e1355fbc55ab248311bf4c958dbbd96bd9183b9e96882cc16213/wrapt-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:6e5c86389d9964050ce50babe247d172a5e3911d59a64023b90db2b4fa00ae7c", size = 60423, upload-time = "2025-10-19T23:46:30.041Z" }, - { url = "https://files.pythonhosted.org/packages/20/0a/9384e0551f56fe361f41bb8f209a13bb9ef689c3a18264225b249849b12c/wrapt-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:b96fdaa4611e05c7231937930567d3c16782be9dbcf03eb9f60d83e57dd2f129", size = 58918, upload-time = "2025-10-19T23:46:31.056Z" }, - { url = "https://files.pythonhosted.org/packages/68/70/37b90d3ee5bf0d0dc4859306383da08b685c9a51abff6fd6b0a7c052e117/wrapt-2.0.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:f2c7b7fead096dbf1dcc455b7f59facb05de3f5bfb04f60a69f98cdfe6049e5f", size = 81980, upload-time = "2025-10-19T23:46:33.368Z" }, - { url = "https://files.pythonhosted.org/packages/95/23/0ce69cc90806b90b3ee4cfd9ad8d2ee9becc3a1aab7df3c3bfc7d0904cb6/wrapt-2.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:04c7c8393f25b11c0faa5d907dd9eb462e87e4e7ba55e308a046d7ed37f4bbe2", size = 62900, upload-time = "2025-10-19T23:46:34.415Z" }, - { url = "https://files.pythonhosted.org/packages/54/76/03ec08170c02f38f3be3646977920976b968e0b704a0693a98f95d02f4d2/wrapt-2.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a93e0f8b376c0735b2f4daf58018b4823614d2b896cb72b6641c4d3dbdca1d75", size = 63636, upload-time = "2025-10-19T23:46:35.643Z" }, - { url = "https://files.pythonhosted.org/packages/75/c1/04ce0511e504cdcd84cdb6980bc7d4efa38ac358e8103d6dd0cd278bfc6d/wrapt-2.0.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b42d13603da4416c43c430dbc6313c8d7ff745c40942f146ed4f6dd02c7d2547", size = 152650, upload-time = "2025-10-19T23:46:38.717Z" }, - { url = "https://files.pythonhosted.org/packages/17/06/cd2e32b5f744701189c954f9ab5eee449c86695b13f414bb8ea7a83f6d48/wrapt-2.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8bbd2472abf8c33480ad2314b1f8fac45d592aba6cc093e8839a7b2045660e6", size = 158811, upload-time = "2025-10-19T23:46:40.875Z" }, - { url = "https://files.pythonhosted.org/packages/7d/a2/a6d920695cca62563c1b969064e5cd2051344a6e330c184b6f80383d87e4/wrapt-2.0.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e64a3a1fd9a308ab9b815a2ad7a65b679730629dbf85f8fc3f7f970d634ee5df", size = 146033, upload-time = "2025-10-19T23:46:37.351Z" }, - { url = "https://files.pythonhosted.org/packages/c6/90/7fd2abe4ec646bc43cb6b0d05086be6fcf15e64f06f51fc4198804396d68/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d61214525eaf88e0d0edf3d1ad5b5889863c6f88e588c6cdc6aa4ee5d1f10a4a", size = 155673, upload-time = "2025-10-19T23:46:42.582Z" }, - { url = "https://files.pythonhosted.org/packages/5f/8d/6cce7f8c41633e677ac8aa34e84b53a22a645ec2a680deb991785ca2798d/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:04f7a5f92c5f7324a1735043cc467b1295a1c5b4e0c1395472b7c44706e3dc61", size = 144364, upload-time = "2025-10-19T23:46:44.381Z" }, - { url = "https://files.pythonhosted.org/packages/72/42/9570349e03afa9d83daf7f33ffb17e8cdc62d7e84c0d09005d0f51912efa/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2356f76cb99b3de5b4e5b8210367fbbb81c7309fe39b622f5d199dd88eb7f765", size = 150275, upload-time = "2025-10-19T23:46:45.662Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d8/448728e6fe030e5c4f1022c82cd3af1de1c672fa53d2d5b36b32a55ce7bf/wrapt-2.0.0-cp313-cp313t-win32.whl", hash = "sha256:0a921b657a224e40e4bc161b5d33934583b34f0c9c5bdda4e6ac66f9d2fcb849", size = 59867, upload-time = "2025-10-19T23:46:49.593Z" }, - { url = "https://files.pythonhosted.org/packages/8f/b1/ad812b1fe1cd85f6498dc3a3c9809a1e880d6108283b1735119bec217041/wrapt-2.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:c16f6d4eea98080f6659a8a7fc559d4a0a337ee66960659265cad2c8a40f7c0f", size = 63170, upload-time = "2025-10-19T23:46:46.87Z" }, - { url = "https://files.pythonhosted.org/packages/7f/29/c105b1e76650c82823c491952a7a8eafe09b78944f7a43f22d37ed860229/wrapt-2.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:52878edc13dc151c58a9966621d67163a80654bc6cff4b2e1c79fa62d0352b26", size = 60339, upload-time = "2025-10-19T23:46:47.862Z" }, - { url = "https://files.pythonhosted.org/packages/f8/38/0dd39f83163fd28326afba84e3e416656938df07e60a924ac4d992b30220/wrapt-2.0.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:79a53d86c2aff7b32cc77267e3a308365d1fcb881e74bc9cbe26f63ee90e37f0", size = 78242, upload-time = "2025-10-19T23:46:51.096Z" }, - { url = "https://files.pythonhosted.org/packages/08/ef/fa7a5c1d73f8690c712f9d2e4615700c6809942536dd3f441b9ba650a310/wrapt-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d731a4f22ed6ffa4cb551b4d2b0c24ff940c27a88edaf8e3490a5ee3a05aef71", size = 61207, upload-time = "2025-10-19T23:46:52.558Z" }, - { url = "https://files.pythonhosted.org/packages/23/d9/67cb93da492eb0a1cb17b7ed18220d059e58f00467ce6728b674d3441b3d/wrapt-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3e02ab8c0ac766a5a6e81cd3b6cc39200c69051826243182175555872522bd5a", size = 61748, upload-time = "2025-10-19T23:46:54.468Z" }, - { url = "https://files.pythonhosted.org/packages/e5/be/912bbd70cc614f491b526a1d7fe85695b283deed19287b9f32460178c54d/wrapt-2.0.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:895870602d65d7338edb3b6a717d856632ad9f14f7ff566214e4fb11f0816649", size = 120424, upload-time = "2025-10-19T23:46:57.575Z" }, - { url = "https://files.pythonhosted.org/packages/b2/e1/10df8937e7da2aa9bc3662a4b623e51a323c68f42cad7b13f0e61a700ce2/wrapt-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b9ad4fab76a0086dc364c4f17f39ad289600e73ef5c6e9ab529aff22cac1ac3", size = 122804, upload-time = "2025-10-19T23:46:59.308Z" }, - { url = "https://files.pythonhosted.org/packages/f3/60/576751b1919adab9f63168e3b5fd46c0d1565871b1cc4c2569503ccf4be6/wrapt-2.0.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7ca0562606d7bad2736b2c18f61295d61f50cd3f4bfc51753df13614dbcce1b", size = 117398, upload-time = "2025-10-19T23:46:55.814Z" }, - { url = "https://files.pythonhosted.org/packages/ec/55/243411f360cc27bae5f8e21c16f1a8d87674c5534f4558e8a97c1e0d1c6f/wrapt-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fe089d9f5a4a3dea0108a8ae34bced114d0c4cca417bada1c5e8f42d98af9050", size = 121230, upload-time = "2025-10-19T23:47:01.347Z" }, - { url = "https://files.pythonhosted.org/packages/d6/23/2f21f692c3b3f0857cb82708ce0c341fbac55a489d4025ae4e3fd5d5de8c/wrapt-2.0.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e761f2d2f8dbc80384af3d547b522a80e67db3e319c7b02e7fd97aded0a8a678", size = 116296, upload-time = "2025-10-19T23:47:02.659Z" }, - { url = "https://files.pythonhosted.org/packages/bd/ed/678957fad212cfb1b65b2359d62f5619f5087d1d1cf296c6a996be45171c/wrapt-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:17ba1bdc52d0c783481850996aa26cea5237720769197335abea2ae6b4c23bc0", size = 119602, upload-time = "2025-10-19T23:47:03.775Z" }, - { url = "https://files.pythonhosted.org/packages/dc/e3/aeb4c3b052d3eed95e61babc20dcb1a512651e098cca4b84a6896585c06a/wrapt-2.0.0-cp314-cp314-win32.whl", hash = "sha256:f73318741b141223a4674ba96992aa2291b1b3f7a5e85cb3c2c964f86171eb45", size = 58649, upload-time = "2025-10-19T23:47:07.382Z" }, - { url = "https://files.pythonhosted.org/packages/aa/2a/a71c51cb211798405b59172c7df5789a5b934b18317223cf22e0c6f852de/wrapt-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8e08d4edb13cafe7b3260f31d4de033f73d3205774540cf583bffaa4bec97db9", size = 60897, upload-time = "2025-10-19T23:47:04.862Z" }, - { url = "https://files.pythonhosted.org/packages/f8/a5/acc5628035d06f69e9144cca543ca54c33b42a5a23b6f1e8fa131026db89/wrapt-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:af01695c2b7bbd8d67b869d8e3de2b123a7bfbee0185bdd138c2775f75373b83", size = 59306, upload-time = "2025-10-19T23:47:05.883Z" }, - { url = "https://files.pythonhosted.org/packages/a7/e6/1318ca07d7fcee57e4592a78dacd9d5493b8ddd971c553a62904fb2c0cf2/wrapt-2.0.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:057f02c13cce7b26c79624c06a3e1c2353e6dc9708525232232f6768118042ca", size = 81987, upload-time = "2025-10-19T23:47:08.7Z" }, - { url = "https://files.pythonhosted.org/packages/e7/bf/ffac358ddf61c3923d94a8b0e7620f2af1cd1b637a0fe4963a3919aa62b7/wrapt-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:79bdd84570267f3f43d609c892ae2d30b91ee4b8614c2cbfd311a2965f1c9bdb", size = 62902, upload-time = "2025-10-19T23:47:10.248Z" }, - { url = "https://files.pythonhosted.org/packages/b5/af/387c51f9e7b544fe95d852fc94f9f3866e3f7d7d39c2ee65041752f90bc2/wrapt-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:93c8b4f4d54fd401a817abbfc9bf482aa72fd447f8adf19ce81d035b3f5c762c", size = 63635, upload-time = "2025-10-19T23:47:11.746Z" }, - { url = "https://files.pythonhosted.org/packages/7c/99/d38d8c80b9cc352531d4d539a17e3674169a5cc25a7e6e5e3c27bc29893e/wrapt-2.0.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5e09ffd31001dce71c2c2a4fc201bdba9a2f9f62b23700cf24af42266e784741", size = 152659, upload-time = "2025-10-19T23:47:15.344Z" }, - { url = "https://files.pythonhosted.org/packages/5a/2a/e154432f274e22ecf2465583386c5ceffa5e0bab3947c1c5b26cc8e7b275/wrapt-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d87c285ff04e26083c4b03546e7b74df7ba4f1f32f1dcb92e9ac13c2dbb4c379", size = 158818, upload-time = "2025-10-19T23:47:17.569Z" }, - { url = "https://files.pythonhosted.org/packages/c5/7a/3a40c453300e2898e99c27495b8109ff7cd526997d12cfb8ebd1843199a4/wrapt-2.0.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e52e50ea0a72ea48d1291cf8b8aaedcc99072d9dc5baba6b820486dcf4c67da8", size = 146113, upload-time = "2025-10-19T23:47:13.026Z" }, - { url = "https://files.pythonhosted.org/packages/9e/e2/3116a9eade8bea2bf5eedba3fa420e3c7d193d4b047440330d8eaf1098de/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fd4c95536975895f32571073446e614d5e2810b666b64955586dcddfd438fd3", size = 155689, upload-time = "2025-10-19T23:47:19.397Z" }, - { url = "https://files.pythonhosted.org/packages/43/1c/277d3fbe9d177830ab9e54fe9253f38455b75a22d639a4bd9fa092d55ae5/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d6ebfe9283209220ed9de80a3e9442aab8fc2be5a9bbf8491b99e02ca9349a89", size = 144403, upload-time = "2025-10-19T23:47:20.779Z" }, - { url = "https://files.pythonhosted.org/packages/d8/37/ab6ddaf182248aac5ed925725ef4c69a510594764665ecbd95bdd4481f16/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5d3ebd784804f146b7ea55359beb138e23cc18e5a5cc2cf26ad438723c00ce3a", size = 150307, upload-time = "2025-10-19T23:47:22.604Z" }, - { url = "https://files.pythonhosted.org/packages/f6/d7/df9e2d8040a3af618ff9496261cf90ca4f886fd226af0f4a69ac0c020c3b/wrapt-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:9b15940ae9debc8b40b15dc57e1ce4433f7fb9d3f8761c7fab1ddd94cb999d99", size = 60557, upload-time = "2025-10-19T23:47:26.73Z" }, - { url = "https://files.pythonhosted.org/packages/b4/c2/502bd4557a3a9199ea73cc5932cf83354bd362682162f0b14164d2e90216/wrapt-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a0efbbc06d3e2077476a04f55859819d23206600b4c33f791359a8e6fa3c362", size = 63988, upload-time = "2025-10-19T23:47:23.826Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/632b13942f45db7af709f346ff38b8992c8c21b004e61ab320b0dec525fe/wrapt-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:7fec8a9455c029c8cf4ff143a53b6e7c463268d42be6c17efa847ebd2f809965", size = 60584, upload-time = "2025-10-19T23:47:25.396Z" }, - { url = "https://files.pythonhosted.org/packages/00/5c/c34575f96a0a038579683c7f10fca943c15c7946037d1d254ab9db1536ec/wrapt-2.0.0-py3-none-any.whl", hash = "sha256:02482fb0df89857e35427dfb844319417e14fae05878f295ee43fa3bf3b15502", size = 43998, upload-time = "2025-10-19T23:47:52.858Z" }, -] - [[package]] name = "xattr" version = "1.3.0" @@ -6902,55 +6685,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" }, ] -[[package]] -name = "zarr" -version = "2.18.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform != 'linux'", -] -dependencies = [ - { name = "asciitree", marker = "python_full_version < '3.11'" }, - { name = "fasteners", marker = "python_full_version < '3.11' and sys_platform != 'emscripten'" }, - { name = "numcodecs", version = "0.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/23/c4/187a21ce7cf7c8f00c060dd0e04c2a81139bb7b1ab178bba83f2e1134ce2/zarr-2.18.3.tar.gz", hash = "sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce", size = 3603224, upload-time = "2024-09-04T23:20:16.595Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/c9/142095e654c2b97133ff71df60979422717b29738b08bc8a1709a5d5e0d0/zarr-2.18.3-py3-none-any.whl", hash = "sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd", size = 210723, upload-time = "2024-09-04T23:20:14.491Z" }, -] - -[[package]] -name = "zarr" -version = "3.1.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", - "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform != 'linux'", -] -dependencies = [ - { name = "donfig", marker = "python_full_version >= '3.11'" }, - { name = "numcodecs", version = "0.16.3", source = { registry = "https://pypi.org/simple" }, extra = ["crc32c"], marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", marker = "python_full_version >= '3.11'" }, - { name = "packaging", marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d6/67/14be68a7bad15eecda09b1e81fca2420f7533645fe187bf4d6104c1aad52/zarr-3.1.3.tar.gz", hash = "sha256:01342f3e26a02ed5670db608a5576fbdb8d76acb5c280bd2d0082454b1ba6f79", size = 349125, upload-time = "2025-09-18T19:32:41.688Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/71/9de7229515a53d1cc5705ca9c411530f711a2242f962214d9dbfe2741aa4/zarr-3.1.3-py3-none-any.whl", hash = "sha256:45f67f87f65f14fa453f99dd8110a5936b7ac69f3a21981d33e90407c80c302a", size = 276427, upload-time = "2025-09-18T19:32:40.042Z" }, -] - [[package]] name = "zipp" version = "3.23.0" From 98c64b29d6a2cf2a55436bb17cc0595f022bbcba Mon Sep 17 00:00:00 2001 From: Michael Wojcikiewicz Date: Thu, 27 Nov 2025 18:21:58 -0500 Subject: [PATCH 160/248] fix: use a script to do node tainting in the cicd workflow (#2421) --- .github/workflows/cicd-main.yml | 75 ++------------------------------- 1 file changed, 3 insertions(+), 72 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7043e022c95..eff0ad2e3fe 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -204,30 +204,7 @@ jobs: - name: Taint node for job isolation if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') shell: bash - run: | - # Verify prerequisites - if [ -z "$NODE_NAME" ]; then - echo "ERROR: NODE_NAME not set" - exit 1 - fi - - if ! command -v kubectl &> /dev/null; then - echo "ERROR: kubectl not found" - exit 1 - fi - - # Apply taint - JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" - echo "=== Adding node taint for job isolation ===" - echo "Node: $NODE_NAME" - echo "Job ID: $JOB_ID" - - kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true - kubectl label node "$NODE_NAME" \ - "github.com/workflow=${GITHUB_WORKFLOW}" \ - "github.com/run-id=${GITHUB_RUN_ID}" \ - "github.com/job=${GITHUB_JOB}" \ - --overwrite=true + run: taint-node.sh - name: Checkout uses: actions/checkout@v4 @@ -389,30 +366,7 @@ jobs: - name: Taint node for job isolation if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') shell: bash - run: | - # Verify prerequisites - if [ -z "$NODE_NAME" ]; then - echo "ERROR: NODE_NAME not set" - exit 1 - fi - - if ! command -v kubectl &> /dev/null; then - echo "ERROR: kubectl not found" - exit 1 - fi - - # Apply taint - JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" - echo "=== Adding node taint for job isolation ===" - echo "Node: $NODE_NAME" - echo "Job ID: $JOB_ID" - - kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true - kubectl label node "$NODE_NAME" \ - "github.com/workflow=${GITHUB_WORKFLOW}" \ - "github.com/run-id=${GITHUB_RUN_ID}" \ - "github.com/job=${GITHUB_JOB}" \ - --overwrite=true + run: taint-node.sh - name: Checkout uses: actions/checkout@v4 @@ -543,30 +497,7 @@ jobs: - name: Taint node for job isolation if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') shell: bash - run: | - # Verify prerequisites - if [ -z "$NODE_NAME" ]; then - echo "ERROR: NODE_NAME not set" - exit 1 - fi - - if ! command -v kubectl &> /dev/null; then - echo "ERROR: kubectl not found" - exit 1 - fi - - # Apply taint - JOB_ID="${GITHUB_RUN_ID}-${GITHUB_JOB}" - echo "=== Adding node taint for job isolation ===" - echo "Node: $NODE_NAME" - echo "Job ID: $JOB_ID" - - kubectl taint node "$NODE_NAME" "github.com/job-id=${JOB_ID}:NoSchedule" --overwrite=true - kubectl label node "$NODE_NAME" \ - "github.com/workflow=${GITHUB_WORKFLOW}" \ - "github.com/run-id=${GITHUB_RUN_ID}" \ - "github.com/job=${GITHUB_JOB}" \ - --overwrite=true + run: taint-node.sh - name: Checkout uses: actions/checkout@v4 From 03150b48272d5fc28e03cf75ff29a1286909ed5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 28 Nov 2025 16:30:50 +0000 Subject: [PATCH 161/248] Revert "[DEV] pull main Nov 25 (#2395)" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 56682f80b0db4492afeee013a07187eadfa9dc8f. Signed-off-by: oliver könig --- .github/copy-pr-bot.yaml | 2 +- .github/workflows/auto-update-copy-pr-bot.yml | 6 +- .github/workflows/cicd-main.yml | 3 + .github/workflows/community-bot.yml | 3 +- .../inference/gpt/gpt_dynamic_inference.py | 238 ++-- .../gpt/gpt_dynamic_inference_12b.sh | 10 +- .../gpt/gpt_dynamic_inference_357m.sh | 10 +- .../gpt_dynamic_inference_with_coordinator.py | 206 +--- examples/inference/gpt/utils.py | 74 +- examples/post_training/modelopt/.gitignore | 1 - examples/post_training/modelopt/ADVANCED.md | 93 +- examples/post_training/modelopt/Dockerfile | 2 +- examples/post_training/modelopt/README.md | 97 +- .../post_training/modelopt/conf/arguments.sh | 3 - .../conf/moonshotai/kimi_k2_instruct.sh | 7 - .../moonshotai/kimi_k2_instruct_export.sh | 15 - .../nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh | 42 +- .../conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh | 41 - .../{Qwen => qwen}/Qwen2.5-0.5B-Instruct.sh | 0 .../{Qwen => qwen}/Qwen2.5-7B-Instruct.sh | 0 .../conf/{Qwen => qwen}/Qwen3-0.6B.sh | 0 .../conf/{Qwen => qwen}/Qwen3-235B-A22B.sh | 0 .../conf/{Qwen => qwen}/Qwen3-30B-A3B.sh | 0 .../modelopt/conf/{Qwen => qwen}/Qwen3-8B.sh | 0 .../post_training/modelopt/convert_model.py | 12 +- examples/post_training/modelopt/finetune.py | 7 +- examples/post_training/modelopt/finetune.sh | 3 - examples/post_training/modelopt/prune.py | 38 +- examples/post_training/modelopt/prune.sh | 36 +- .../modelopt/slurm/env_setup_template.sh | 7 - .../post_training/modelopt/slurm/sbatch.sh | 63 - examples/post_training/modelopt/validate.sh | 8 +- gpt_builders.py | 10 - .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 7 +- .../core/fusions/fused_pad_routing_map.py | 3 +- .../core/inference/communication_utils.py | 3 +- .../attention_context/mamba_metadata.py | 22 +- .../contexts/dynamic_block_allocator.py | 86 +- .../inference/contexts/dynamic_context.py | 643 +++++------ .../data_parallel_inference_coordinator.py | 70 +- megatron/core/inference/engines/__init__.py | 2 +- .../core/inference/engines/dynamic_engine.py | 883 +++++--------- .../core/inference/engines/static_engine.py | 9 +- megatron/core/inference/headers.py | 27 +- megatron/core/inference/inference_client.py | 102 +- megatron/core/inference/inference_request.py | 237 +--- megatron/core/inference/sampling_params.py | 2 +- .../text_generation_controller.py | 435 +++---- megatron/core/inference/unified_memory.py | 59 +- megatron/core/inference/utils.py | 55 - megatron/core/models/backends.py | 61 - megatron/core/models/gpt/gpt_layer_specs.py | 102 +- megatron/core/models/gpt/moe_module_specs.py | 10 +- .../core/models/mamba/mamba_layer_specs.py | 16 - megatron/core/optimizer/__init__.py | 307 ++--- megatron/core/optimizer/muon.py | 41 +- megatron/core/optimizer/optimizer.py | 1 - megatron/core/optimizer/optimizer_config.py | 75 +- megatron/core/optimizer_param_scheduler.py | 31 +- megatron/core/parallel_state.py | 1 - megatron/core/process_groups_config.py | 17 - megatron/core/safe_globals.py | 2 - megatron/core/ssm/mamba_block.py | 63 +- .../core/ssm/mamba_hybrid_layer_allocation.py | 7 +- megatron/core/ssm/mamba_layer.py | 2 - megatron/core/ssm/mamba_mixer.py | 32 +- .../core/tensor_parallel/inference_layers.py | 151 --- .../text/libraries/huggingface_tokenizer.py | 11 +- .../text/libraries/null_tokenizer.py | 8 - .../core/tokenizers/text/text_tokenizer.py | 16 +- megatron/core/transformer/attention.py | 37 +- megatron/core/transformer/cuda_graphs.py | 17 - .../transformer/fsdp_dtensor_checkpoint.py | 2 +- .../core/transformer/moe/token_dispatcher.py | 3 +- .../core/transformer/transformer_config.py | 10 - megatron/core/utils.py | 113 +- .../legacy/data/biencoder_dataset_utils.py | 11 +- .../datasets => legacy/data}/data_samplers.py | 168 +-- megatron/legacy/data/vit_dataset.py | 14 +- megatron/post_training/algos/__init__.py | 1 + megatron/post_training/algos/distillation.py | 601 ++++++++++ megatron/post_training/checkpointing.py | 9 +- megatron/post_training/docs/distillation.md | 2 +- megatron/post_training/generate.py | 6 +- megatron/post_training/loss_func.py | 6 +- megatron/post_training/model_builder.py | 18 +- megatron/post_training/non_loss_data_func.py | 19 +- megatron/post_training/utils.py | 3 +- megatron/rl/inference/megatron.py | 102 +- megatron/rl/rl_utils.py | 85 +- megatron/training/arguments.py | 86 +- megatron/training/checkpointing.py | 17 +- megatron/training/datasets/README.md | 34 - megatron/training/datasets/fim_dataset.py | 308 ----- megatron/training/dist_signal_handler.py | 10 +- megatron/training/global_vars.py | 9 +- megatron/training/training.py | 162 ++- pretrain_gpt.py | 66 +- .../golden_values_dev_dgx_h100.json | 287 ----- .../model_config.yaml | 56 - .../golden_values_dev_dgx_h100.json | 361 +++--- .../model_config.yaml | 2 + .../golden_values_dev_dgx_h100.json | 361 +++--- .../model_config.yaml | 2 + .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 1028 ----------------- .../model_config.yaml | 59 - .../golden_values_dev_dgx_h100.json | 158 --- .../model_config.yaml | 58 - .../golden_values_dev_dgx_h100.json | 158 --- .../model_config.yaml | 58 - .../golden_values_dev_dgx_h100.json | 4 +- .../model_config.yaml | 5 +- .../golden_values_dev_dgx_h100.json | 314 ++--- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 135 --- .../model_config.yaml | 72 -- .../golden_values_dev_dgx_h100.json | 2 +- .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../python_scripts/auto_reminder_github.py | 29 +- ...pt-dynamic-inference-with-coordinator.yaml | 16 +- tests/test_utils/recipes/gpt.yaml | 5 - .../recipes/mamba-dynamic-inference.yaml | 61 - tests/unit_tests/data/test_fim_dataset.py | 87 -- .../contexts/test_dynamic_context.py | 251 ++-- .../inference/engines/test_dynamic_engine.py | 398 +++---- .../inference/engines/test_static_engine.py | 17 +- ...est_data_parallel_inference_coordinator.py | 471 -------- .../inference/test_wandb_logging.py | 26 +- .../test_simple_text_generation_controller.py | 96 +- tests/unit_tests/test_checkpointing.py | 45 +- .../unit_tests/test_process_groups_config.py | 23 - tests/unit_tests/test_rl_utils.py | 656 ----------- .../transformer/moe/test_token_dispatcher.py | 5 +- tools/run_inference_performance_test.py | 16 +- train_rl.py | 2 +- 137 files changed, 3400 insertions(+), 8493 deletions(-) delete mode 100644 examples/post_training/modelopt/.gitignore delete mode 100644 examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh delete mode 100644 examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh mode change 120000 => 100644 examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh delete mode 100644 examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen2.5-0.5B-Instruct.sh (100%) rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen2.5-7B-Instruct.sh (100%) rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen3-0.6B.sh (100%) rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen3-235B-A22B.sh (100%) rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen3-30B-A3B.sh (100%) rename examples/post_training/modelopt/conf/{Qwen => qwen}/Qwen3-8B.sh (100%) delete mode 100644 examples/post_training/modelopt/slurm/env_setup_template.sh delete mode 100644 examples/post_training/modelopt/slurm/sbatch.sh delete mode 100644 megatron/core/tensor_parallel/inference_layers.py rename megatron/{training/datasets => legacy/data}/data_samplers.py (56%) create mode 100644 megatron/post_training/algos/__init__.py create mode 100644 megatron/post_training/algos/distillation.py delete mode 100644 megatron/training/datasets/README.md delete mode 100644 megatron/training/datasets/fim_dataset.py delete mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml delete mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json delete mode 100644 tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml delete mode 100644 tests/test_utils/recipes/mamba-dynamic-inference.yaml delete mode 100644 tests/unit_tests/data/test_fim_dataset.py delete mode 100644 tests/unit_tests/inference/test_data_parallel_inference_coordinator.py delete mode 100644 tests/unit_tests/test_rl_utils.py diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index 8e703301ca7..7013df60dc2 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,4 +1,4 @@ enabled: true auto_sync_draft: false auto_sync_ready: true -trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "guyueh1", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"] diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml index b04d34251f0..969c46e3fdd 100644 --- a/.github/workflows/auto-update-copy-pr-bot.yml +++ b/.github/workflows/auto-update-copy-pr-bot.yml @@ -48,10 +48,8 @@ jobs: mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml - name: Commit changes - env: - GH_TOKEN: ${{ secrets.PAT }} run: | - git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git + git remote set-url origin https://x-access-token:${{ secrets.PAT }}@github.com/NVIDIA/Megatron-LM.git git config --global user.name "GitHub Actions" git config --global user.email "github-actions[bot]@users.noreply.github.com" git add .github/copy-pr-bot.yaml @@ -60,4 +58,4 @@ jobs: exit 0 fi git commit -m "Update copy-pr-bot.yaml [skip ci]" - git push -u origin main + git push diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index eff0ad2e3fe..a5a7a82287e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -40,6 +40,7 @@ env: jobs: is-not-external-contributor: runs-on: ubuntu-latest + environment: nemo-ci if: github.repository == 'NVIDIA/Megatron-LM' outputs: is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} @@ -387,6 +388,7 @@ jobs: - cicd-wait-in-queue - cicd-container-build - cicd-unit-tests-latest + environment: nemo-ci if: | ( success() @@ -566,6 +568,7 @@ jobs: && needs.pre-flight.outputs.is_ci_workload == 'false' && !cancelled() && github.repository == 'NVIDIA/Megatron-LM' + environment: nemo-ci steps: - name: Generate fake coverage report uses: actions/github-script@v6 diff --git a/.github/workflows/community-bot.yml b/.github/workflows/community-bot.yml index 1a98ece0f85..3b102894e1f 100644 --- a/.github/workflows/community-bot.yml +++ b/.github/workflows/community-bot.yml @@ -22,8 +22,7 @@ on: jobs: community-bot: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10 - with: - community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }} if: github.repository == 'NVIDIA/Megatron-LM' secrets: GH_TOKEN: ${{ secrets.PAT }} + environment: main diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 1a537870020..251aa100cba 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -1,7 +1,6 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import hashlib -import io import json import math import os @@ -14,26 +13,14 @@ from tqdm import tqdm from typing import Dict, List, Tuple, Optional -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) -) +import torch +from tqdm import tqdm -import megatron -from examples.inference.gpt.utils import ( - Request, - add_common_inference_args, - build_dynamic_engine_setup_prefix, - build_requests, - get_curr_time, -) from megatron.core.inference.contexts.dynamic_context import ( ContextOverflowError, DynamicInferenceContext, ) -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) -from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError +from megatron.core.inference.engines import DynamicInferenceEngine from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) @@ -41,9 +28,10 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.core.utils import get_attr_wrapped_model sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) @@ -65,14 +53,14 @@ build_requests, get_curr_time, ) +from megatron.training import get_args +from megatron.training import get_model as _get_model +from megatron.training import get_tokenizer, initialize_megatron from megatron.training.checkpointing import load_checkpoint -from model_provider import model_provider -from gpt_builders import gpt_builder - -torch.serialization.add_safe_globals([io.BytesIO]) -torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState]) -torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic]) +import torch +import io +import megatron def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: @@ -88,24 +76,9 @@ def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser: ) group.add_argument( "--termination-id", type=int, default=None, - help="Termination ID that overrides `tokenizer.eod`.", - ) - group.add_argument( - "--suspend-resume-interval", type=int, default=None, - help="Suspend and resume the dynamic engine every " - "`suspend_resume_interval` steps. This is used to tet the suspend/resume " - "system.", - ) - group.add_argument( - "--inference-repeat-n", type=int, default=1, - help="Repeat inference iterations N times for benchmarking." - ) - group.add_argument( - "--throughput-check-only", - action='store_true', - default=False, - help="If true, only run throughput check without verifying outputs." + help="Termination ID that overrides `tokenizer.eod`." ) + group.add_argument('--inference-repeat-n', type=int, default=1, help="Repeat inference iterations N times for benchmarking.") return parser @@ -152,12 +125,13 @@ def get_inference_context( requests: List[Request], sampling_params: Optional[SamplingParams] = None, calculate_max_sequence_length_from_requests: bool = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, + layer_type_list: Optional[List[str]] = None, + mamba_conv_states_shape: Optional[Tuple[int]] = None, + mamba_ssm_states_shape: Optional[Tuple[int]] = None, ): """The inference context manages the KV cache and other inference state.""" args = get_args() - # Max sequence length. if calculate_max_sequence_length_from_requests: max_gen_length = sampling_params.num_tokens_to_generate @@ -173,7 +147,7 @@ def get_inference_context( # Inference context. context = DynamicInferenceContext( params_dtype=args.params_dtype, - num_layers=args.num_layers // args.pipeline_model_parallel_size, + num_layers=args.num_layers, kv_channels=args.kv_channels, num_attention_heads=( args.num_query_groups if args.group_query_attention else args.num_attention_heads @@ -186,10 +160,15 @@ def get_inference_context( ), block_size_tokens=args.inference_dynamic_batching_block_size, buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - max_tokens=args.inference_dynamic_batching_max_tokens, + buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, + buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, + max_requests_override=args.inference_dynamic_batching_max_requests_override, + max_tokens_override=args.inference_dynamic_batching_max_tokens_override, tensor_model_parallel_size=args.tensor_model_parallel_size, materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, @@ -271,12 +250,12 @@ def run_inference( num_requests_total = len(requests) num_requests_added = 0 num_requests_finished = 0 + step_id = 0 step_times = {"prefill": [], "decode": []} add_times = [] output_times = [] tbar = tqdm(total=num_requests_total) total_output_tokens = 0 - attempted_step_count = 0 if args.cuda_graph_impl == "local": cuda_graph_request_count_map = {r:0 for r in engine.context.cuda_graph_request_counts} else: @@ -319,37 +298,10 @@ def _add_request(): # Step inference engine (i.e., generate a token for each active request). # Before step, we haven't done the scheduling, so we cannot know the is_decode_only - try: - result = engine.step_modern(verbose=True) - except EngineSuspendedError as e: - result = e - pass # ignore error in order to call 'engine.resume()' below. - attempted_step_count += 1 - + result = engine.step_modern(verbose=True) # After step, we lost track of last iteration's is_decode_only, so we need to get it from the engine is_decode_only = engine.is_decode_only - - # Test suspending and resuming engine. - if args.suspend_resume_interval is not None: - - # Suspend. - if attempted_step_count % args.suspend_resume_interval == 0: - print("**** step %d/%d ... suspend." % (engine.step_count, attempted_step_count)) - engine.suspend() - - # Resume, 0+ attempted steps later. - if ( - attempted_step_count > 0 - and - (attempted_step_count - args.suspend_resume_interval // 2) - % args.suspend_resume_interval == 0 - ): - print("**** step %d/%d ... resume." % (engine.step_count, attempted_step_count)) - engine.resume() - - # If engine suspended, continue to next iter. - if isinstance(result, EngineSuspendedError): - continue + step_id += 1 # Record cuda_graph_request_count. cuda_graph_request_count = result["cuda_graph_request_count"] @@ -357,10 +309,10 @@ def _add_request(): cuda_graph_request_count_map[cuda_graph_request_count] += 1 # Update requests. - active_request_ids = result["active_request_ids"] - finished_request_records = result["finished_request_records"] + active_requests = result["active_requests"] + finished_requests = result["finished_requests"] step_time = result["step_time"] - if len(active_request_ids) > 0 or len(finished_request_records) > 0: + if len(active_requests) > 0 or len(finished_requests) > 0: if is_decode_only: step_times["decode"].append(step_time) else: @@ -368,26 +320,14 @@ def _add_request(): # Append output tokens. output_start = get_curr_time() - for finished_request_record in finished_request_records: - - finished_request = finished_request_record.merge(engine.controller.tokenizer) - - # Update local request object. + for finished_request in finished_requests: request = requests[finished_request.request_id] + request.output_tokens = finished_request.generated_tokens + total_output_tokens += len(request.output_tokens) request.time_end = get_curr_time() + request.output_text = finished_request.generated_text request.state = "finished" request.request_id = finished_request.request_id - - # Update prompt, in case engine has been suspended and resumed. - request.prompt_tokens = finished_request.prompt_tokens - request.prompt_text = finished_request.prompt - - # Get output tokens and text. - request.output_tokens = finished_request.generated_tokens - request.output_text = finished_request.generated_text - total_output_tokens += len(request.output_tokens) - - # Log probs. if finished_request.sampling_params.return_log_probs: request.log_probs = ( finished_request.prompt_log_probs + finished_request.generated_log_probs @@ -441,14 +381,23 @@ def main(): model = get_model() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + # Layer type list for hybrid models + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() + else: + mamba_conv_states_shape = None + mamba_ssm_states_shape = None # Requests, context, controller. requests = build_requests(args, tokenizer, sampling_params) context = get_inference_context( requests, sampling_params, - mamba_inference_state_config=mamba_inference_state_config, + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, ) controller = get_inference_controller(model, context) @@ -514,9 +463,7 @@ def escape_str(s): unique_prompt_map[request.prompt_text].append(request_idx) # Print unique prompts + outputs. - text_hashes = [] for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()): - # ---- Prompt summary line ---- prompt_len = len(requests[request_idxs[0]].prompt_tokens) escaped_prompt_text = escape_str(prompt_text) @@ -531,20 +478,15 @@ def escape_str(s): # ---- Print each unique output ---- for output_text, output_request_idxs in output_map.items(): if output_text is not None: - # Use hash of prompt + generated text in case engine was - # suspended and resumed, which misaligns boundary between - # prompt and generated tokens. - o_hash = hashlib.sha256( - (prompt_text + output_text).encode() - ).hexdigest()[:6] + o_hash = hashlib.sha256(output_text.encode()).hexdigest()[:6] o_len = len(requests[output_request_idxs[0]].output_tokens) escaped_output_text = escape_str(output_text) + print(f" >>>> [n {len(output_request_idxs)}, l {o_len}, hash {o_hash}] {escaped_output_text}") else: o_hash = "--" o_len = 0 escaped_output_text = "--" - print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}") - text_hashes.append(o_hash) + print(f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}] {escaped_output_text}") # Write results to JSON. Primarily used for functional testing. if args.output_path: @@ -572,49 +514,47 @@ def escape_str(s): with open(args.output_path, "w") as fp: json.dump(json_results, fp, indent=1) - # Timing results. - stats = torch.cuda.memory_stats() - throughput = total_output_tokens / total_time - print("~~~") - peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3 - peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3 - - p_times = step_times["prefill"] - d_times = step_times["decode"] - - p_total = sum(p_times) - d_total = sum(d_times) - - p_count = len(p_times) - d_count = len(d_times) - - p_mean = p_total / p_count - d_mean = d_total / d_count if d_count != 0 else 0. - - # Commented out for now as the step/add/output times are not calculated correctly. - # print( - # f"{setup_prefix} … " - # f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " - # f"total time: {step_total:.3f}s … " - # f"step time: total {step_total:.3f}s " - # f"[ p {p_total:.3f}s, d {d_total:.3f}s ], " - # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " - # f"count [ p {p_count}, d {d_count} ]." - # ) - capture_str = ( - f"{engine.capture_stats['time']:.2f} sec" - if engine.capture_stats else - "--" - ) - print( - f"{setup_prefix} … " - f"throughput: {throughput:.3f} tok/s", - f"total time: {total_time:.3f}s … " - f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " - f"steps: {engine.step_count:d} … " - f"capture {capture_str} … " - ) - print("~~~") + # Timing results. + print("~~~") + peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3 + peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3 + + p_times = step_times["prefill"] + d_times = step_times["decode"] + + p_total = sum(p_times) + d_total = sum(d_times) + + p_count = len(p_times) + d_count = len(d_times) + + p_mean = p_total / p_count + d_mean = d_total / d_count + + # Commented out for now as the step/add/output times are not calculated correctly. + # print( + # f"{setup_prefix} … " + # f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " + # f"total time: {step_total:.3f}s … " + # f"step time: total {step_total:.3f}s " + # f"[ p {p_total:.3f}s, d {d_total:.3f}s ], " + # f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], " + # f"count [ p {p_count}, d {d_count} ]." + # ) + capture_str = ( + f"{engine.capture_stats['time']:.2f} sec" + if engine.capture_stats else + "--" + ) + print( + f"{setup_prefix} … " + f"capture {capture_str} … " + f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … " + f"total time: {total_time:.3f}s … " + f"steps: {engine.step_count:d} … " + f"throughput: {throughput:.3f} tok/s" + ) + print("~~~") # Stop Nsight profiler. if os.environ.get("NSIGHT_PREFIX"): diff --git a/examples/inference/gpt/gpt_dynamic_inference_12b.sh b/examples/inference/gpt/gpt_dynamic_inference_12b.sh index 20f1a29cb5b..a16fe5176d5 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_12b.sh +++ b/examples/inference/gpt/gpt_dynamic_inference_12b.sh @@ -24,9 +24,13 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 # Dynamic context. : ${BUFFER_SIZE_GB=50.} +: ${BUFFER_OVERFLOW_FACTOR=1.} +: ${BUFFER_GUARANTEED_FRACTION=0.05} # Cuda graphs. +: ${CUDA_GRAPH_IMPL=local} : ${NUM_CUDA_GRAPHS=16} +: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1} # Miscellaneous. : ${USE_COORDINATOR=0} @@ -75,6 +79,8 @@ ARGS=" \ \ --inference-dynamic-batching \ --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \ + --inference-dynamic-batching-buffer-overflow-factor ${BUFFER_OVERFLOW_FACTOR} \ + --inference-dynamic-batching-buffer-guaranteed-fraction ${BUFFER_GUARANTEED_FRACTION} \ \ ${EXTRA_ARGS} \ " @@ -85,10 +91,6 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then --cuda-graph-impl local \ --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \ " -else - ARGS+=" \ - --cuda-graph-impl none \ - " fi # Prompts. diff --git a/examples/inference/gpt/gpt_dynamic_inference_357m.sh b/examples/inference/gpt/gpt_dynamic_inference_357m.sh index 215cc2bac8f..c095371714f 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_357m.sh +++ b/examples/inference/gpt/gpt_dynamic_inference_357m.sh @@ -25,9 +25,13 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 # Dynamic context. : ${BUFFER_SIZE_GB=50.} +: ${BUFFER_OVERFLOW_FACTOR=1.} +: ${BUFFER_GUARANTEED_FRACTION=0.05} # Cuda graphs. +: ${CUDA_GRAPH_IMPL=local} : ${NUM_CUDA_GRAPHS=16} +: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1} # Miscellaneous. : ${USE_COORDINATOR=0} @@ -61,6 +65,8 @@ ARGS=" \ \ --inference-dynamic-batching \ --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \ + --inference-dynamic-batching-buffer-overflow-factor ${BUFFER_OVERFLOW_FACTOR} \ + --inference-dynamic-batching-buffer-guaranteed-fraction ${BUFFER_GUARANTEED_FRACTION} \ \ ${EXTRA_ARGS} \ " @@ -71,10 +77,6 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then --cuda-graph-impl local \ --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \ " -else - ARGS+=" \ - --cuda-graph-impl none \ - " fi # Prompts. diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index 7869002fff3..9e2b6bfa983 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -1,41 +1,26 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +from megatron.core.inference.inference_client import InferenceClient +from examples.inference.gpt.utils import add_common_inference_args import asyncio -import json -import os -import time -import torch import torch.distributed as dist -from collections import defaultdict +from examples.inference.gpt.gpt_dynamic_inference import get_model, get_inference_context, get_inference_controller, add_dynamic_inference_args +from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.training import initialize_megatron +import torch +import os +from megatron.training import get_args, get_tokenizer +from megatron.core.inference.sampling_params import SamplingParams +from examples.inference.gpt.utils import build_requests, build_dynamic_engine_setup_prefix, Request +from megatron.core.inference.engines import DynamicInferenceEngine +import time from tqdm import tqdm from typing import List -import warnings -import logging - -from examples.inference.gpt.gpt_dynamic_inference import ( - add_dynamic_inference_args, - get_inference_context, - get_inference_controller, - get_model, -) -from examples.inference.gpt.utils import ( - Request, - build_dynamic_engine_setup_prefix, - build_requests, - add_common_inference_args -) - -from megatron.core import parallel_state -from megatron.core.inference.engines import DynamicInferenceEngine -from megatron.core.inference.inference_client import InferenceClient -from megatron.core.inference.inference_request import DynamicInferenceRequestRecord -from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.utils import get_mamba_inference_state_config_from_model - -from megatron.training import get_args, get_tokenizer, initialize_megatron +import json from megatron.training.arguments import parse_args +from megatron.core import parallel_state -# pylint: disable=line-too-long +import logging logging.basicConfig(level=logging.INFO, force=True) @@ -53,45 +38,19 @@ async def main( ) # once you call engine.start_listening_to_data_parallel_coordinator, # the engine will start accepting requests from the data parallel coordinator. - # and processing them in an asyncio coroutine. - - await engine.start_listening_to_data_parallel_coordinator( - inference_coordinator_port=port, - launch_inference_coordinator=True, - verbose=True, + # and processing them in an asyncio coroutine. + await engine.start_listening_to_data_parallel_coordinator( + inference_coordinator_port=port, launch_inference_coordinator=True ) - - # if you want to use your own inference coordinator - + # if you want to use your own inference coordinator - # 1. set launch_inference_coordinator to False # 2. setup a router socket at tcp://MASTER_ADDR:PORT # 3. wait for data parallel groups to establish connection (BasicInferenceCoordinator.__init__) # 4. look at InferenceCoordinator.start() to see how we can route requests from users <-> data parallel groups - # based on headers. - # 5. look at InferenceClient to see how we create requests with headers. - - args = get_args() - - # Test suspend/resume intervals. - if args.suspend_resume_interval is not None: - # Since the client doesn't directly call engine.async_step here, we test - # the suspend-resume system ~4 times. - suspend_resume_interval = max(1, len(requests) // 4) - suspend_idxs = set(range( - suspend_resume_interval, - len(requests) + 1, - suspend_resume_interval, - )) - resume_idxs = set( - min(len(requests), i + suspend_resume_interval // 2) - for i in suspend_idxs - ) - else: - suspend_idxs = set() - resume_idxs = set() - - # Create client and run example. - if dist.get_rank() == 0: - client = InferenceClient(port) # submits requests to the inference coordinator + # based on headers. + # 5. look at InferenceClient to see how we create requests with headers. + if dist.get_rank() == 0: + client = InferenceClient(port) # submits requests to the inference coordinator await client.start() base_arrival_time = time.time_ns() / 10**9 for request in requests: @@ -99,104 +58,61 @@ async def main( futures = [] num_requests_total = len(requests) num_requests_added = 0 - + #tbar = tqdm(total=num_requests_total) while True: current_time = time.time_ns() / 10**9 - if args.incoming_requests_per_step is None: - # Only add requests that have arrived at the current time. - while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: - request = requests[num_requests_added] - # These add-request calls will queue up the request on a zmq socket and return - # instantaneously. They will return an asyncio future which can be awaited for - # request completion. - futures.append(client.add_request(request.prompt_text, request.sampling_params)) - num_requests_added += 1 - - # Test suspend/resume. - if num_requests_added in suspend_idxs: - client.suspend_engines() - if num_requests_added in resume_idxs: - client.resume_engines() - - else: - # Add deterministic number of requests (generally used for debugging). - for i in range(min( - args.incoming_requests_per_step, - num_requests_total - num_requests_added - )): - # Change sampling parameters to force different generation lengths. - request = requests[num_requests_added] - n = request.sampling_params.num_tokens_to_generate - request.sampling_params.num_tokens_to_generate = n + i - futures.append(client.add_request(request.prompt_text, request.sampling_params)) - num_requests_added += 1 - - # Test suspend/resume. - if num_requests_added in suspend_idxs: - client.suspend_engines() - if num_requests_added in resume_idxs: - client.resume_engines() - + # Only add requests that have arrived at the current time. + while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time: + request = requests[num_requests_added] + # These add-request calls will queue up the request on a zmq socket and return + # instantaneously. They will return an asyncio future which can be awaited for + # request completion. + futures.append(client.add_request(request.prompt_text, request.sampling_params)) + num_requests_added += 1 + #tbar.update(1) if num_requests_added == num_requests_total: break - # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. + # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. await asyncio.sleep(0) - # While we wait for the requests to complete, the engine runs in the background. - results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) + results: List[DynamicInferenceRequest] = await asyncio.gather(*futures) + if dist.get_rank() == 0: # Write results to JSON. Primarily used for functional testing. if args.output_path: json_results = {} - throughputs = [] - for record in results: - req = record.merge(engine.controller.tokenizer) + for req in results: result_dict = { "input_prompt": req.prompt, "generated_text": req.generated_text.replace("\n", "\\n"), "generated_tokens": req.generated_tokens, - "latency": req.latency, # InferenceClient populates this field in the returned future. + "latency": req.latency, #InferenceClient populates this field in the returned future. } if req.sampling_params["return_log_probs"]: result_dict["logprobs"] = req.prompt_log_probs + req.generated_log_probs - throughput = len(req.generated_tokens) / req.latency - throughputs.append(throughput) json_results[req.request_id] = result_dict - throughput_dict = {"throughput": throughputs} - if args.throughput_check_only: - json_results = throughput_dict with open(args.output_path, "w") as fp: json.dump(json_results, fp, indent=4) else: print("Results:") - unique_prompt_map = defaultdict(list) - for record in results: - req = record.merge(engine.controller.tokenizer) - unique_prompt_map[req.prompt].append(req) - for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): - print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % ( - idx, - len(unique_prompt_map), - prompt_text.replace("\n", "\\n"), - len(reqs), - reqs[0].generated_text.replace("\n", "\\n"), - )) - + for req in results: + print(f"rid: {req.request_id}\nprompt: {req.prompt!r}\noutput: {req.generated_text!r}\n\n") + # kill the engines and suspend the client client.stop_engines() client.stop() - + # once the stop signal eventually makes its way to each GPU, the engines will stop. await asyncio.gather(engine.engine_loop_task) - if __name__ == "__main__": - # enable inference mode in the very beginning as some fp-8 optimizations + # enable inference mode in the very beginning as some fp-8 optimizations # check for it. with torch.inference_mode(): initialize_megatron( + #parsed_args=args extra_args_provider=add_dynamic_inference_args, args_defaults={'no_load_rng': True, 'no_load_optim': True}, ) @@ -215,25 +131,17 @@ async def main( top_p=args.top_p, return_log_probs=args.return_log_probs, num_tokens_to_generate=args.num_tokens_to_generate, - termination_id=( - args.termination_id if args.termination_id is not None else tokenizer.eod - ), + termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod, ) # Requests, context, conroller. model = get_model() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) - requests = ( - build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None - ) - - context = get_inference_context( - None, - None, - calculate_max_sequence_length_from_requests=False, - mamba_inference_state_config=mamba_inference_state_config, - ) + requests = build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None + context = get_inference_context(None, + None, + calculate_max_sequence_length_from_requests=False) + controller = get_inference_controller(model, context) # Inference engine. @@ -242,19 +150,17 @@ async def main( context, enable_cuda_graph=args.cuda_graph_impl == "local", random_seed=args.seed, - enable_chunked_prefill=not args.disable_chunked_prefill, + enable_chunked_prefill=not args.disable_chunked_prefill ) + if dist.get_rank() == 0: setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests) print("~~~") print(setup_prefix) print("~~~") + + asyncio.run(main(engine, + requests, + args.inference_coordinator_port)) - asyncio.run( - main( - engine, - requests, - args.inference_coordinator_port, - ) - ) diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py index efd4fdab4fc..0ea1f5a3df0 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/gpt/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -import copy import json import itertools import random @@ -12,12 +11,12 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.contexts import DynamicInferenceContext -from megatron.core.inference.contexts.dynamic_context import get_mem_size_str from megatron.core.transformer.module import MegatronModule from megatron.core.inference.sampling_params import SamplingParams + def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: """Common inference arguments.""" @@ -54,12 +53,6 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: default=30, help='Number of tokens to generate for each prompt', ) - group.add_argument( - "--num-tokens-from-file", - action='store_true', - default=False, - help='Use per-prompt num_tokens_to_generate from prompt file', - ) group.add_argument( "--top-n-logprobs", type=int, @@ -72,7 +65,7 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: help="Add a deterministic number of requests per step. This arg is " "prioritized over `--incoming-requests-per-sec` below (which is non-" "deterministic). Note that the number of requests added per step is " - "additionally limited by the inference context's `max_active_requests`, " + "additionally limited by the inference context's `max_requests`, " "`max_tokens`, and KV buffer size.", ) group.add_argument( @@ -123,6 +116,12 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser: '`--prompt-file` above). The first `--prompt-file-num-truncate` samples ' 'will be used, in order.', ) + group.add_argument( + "--inference-coordinator-port", + type=int, + help="This port will be used to setup the inference co-ordinator on node-0", + default=12346 + ) group.add_argument( "--use-flashinfer-fused-rope", action='store_true', @@ -177,7 +176,6 @@ def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any, samplin self.time_end = None self.state = "not-started" self.sampling_params: SamplingParams = sampling_params if sampling_params is not None else get_default_sampling_params(tokenizer.eod) - self.sampling_params = copy.deepcopy(self.sampling_params) def __str__(self) -> str: return "state '%s'; toffset %.1e; prompt len %d; output len %d; '%s'" % ( @@ -264,27 +262,10 @@ def get_synthetic_requests( int(args.incoming_requests_per_sec * args.incoming_requests_duration), ) - # Build prompts with expected lengths. - assert ( - len(args.num_tokens_to_prompt) == 2 - and - args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0] - ) - max_prompt_length = args.num_tokens_to_prompt[1] - max_prompt_text = "hi " * max_prompt_length - max_prompt_tokens = tokenizer.tokenize(max_prompt_text) - prompt_lengths = [ - random.randint(*args.num_tokens_to_prompt) - for _ in time_offsets - ] - prompt_tokens_list = [ max_prompt_tokens[:l] for l in prompt_lengths ] - prompt_texts = [ tokenizer.detokenize(tt) for tt in prompt_tokens_list ] - # Init requests. - assert len(prompt_texts) == len(time_offsets) requests = [ - Request(t, o, tokenizer, sampling_params=sampling_params) - for t, o in zip(prompt_texts, time_offsets) + Request("hi " * random.randint(*args.num_tokens_to_prompt), t, tokenizer, sampling_params) + for t in time_offsets ] return requests @@ -300,18 +281,9 @@ def get_requests_from_file( # Load prompts. n_prompts = sum(1 for _ in open(args.prompt_file)) prompts = [] - sampling_params = get_default_sampling_params(tokenizer.eod) - sampling_params_list = [] with open(args.prompt_file) as f: for line in tqdm(f.readlines(), "read prompt file", total=n_prompts): - line_dict = json.loads(line) - prompts.append(line_dict["text"]) - - sp = copy.deepcopy(sampling_params) - if args.num_tokens_from_file: - sp.num_tokens_to_generate = line_dict["chatgpt_output_token_length"] - sampling_params_list.append(sp) - + prompts.append(json.loads(line)["text"]) if len(prompts) == args.prompt_file_num_truncate: break @@ -325,8 +297,8 @@ def get_requests_from_file( # Init requests. requests = [ - Request(p, t, tokenizer, sp) - for p, t, sp in tqdm(zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts)) + Request(p, t, tokenizer, sampling_params) + for p, t in tqdm(zip(prompts, time_offsets), "init requests", total=len(prompts)) ] return requests @@ -370,7 +342,7 @@ def build_dynamic_engine_setup_prefix( Args: args (Namespace): Command-line arguments for this run. - context (DynamicInferenceContext): Stores limits such as `max_active_requests`, + context (DynamicInferenceContext): Stores limits such as `max_requests`, `max_tokens`, and `gtd_request_count`. requests (List[DynamicInferenceRequest]): List of inference requests. @@ -380,9 +352,7 @@ def build_dynamic_engine_setup_prefix( # CUDA graph config if args.cuda_graph_impl == "local": cg_str = ( - "graphs " - f"[{len(context.cuda_graph_token_counts)}] " - f"{context.cuda_graph_token_counts[0]}:" + f"graphs {context.cuda_graph_token_counts[0]}:" f"{context.cuda_graph_token_counts[-1]}" ) else: @@ -409,10 +379,17 @@ def build_dynamic_engine_setup_prefix( ) # Buffer limits config + flw = args.inference_dynamic_batching_buffer_overflow_factor + flw_str = "no overflow" if flw is None else f"{flw:.1f}" buffer_limits_str = ( - f"bf: {get_mem_size_str(args.inference_dynamic_batching_buffer_size_gb*1024**3)}, " - f"{context.block_allocator.active_count} chunks " - f"[r {context.max_active_requests}, t {context.max_tokens}]" + f"bf {args.inference_dynamic_batching_buffer_size_gb:.0f}, {flw_str} " + f"[r {context.max_requests}, t {context.max_tokens}]" + ) + + # Guaranteed request config + guaranteed_fraction_str = ( + f"gtd {args.inference_dynamic_batching_buffer_guaranteed_fraction:.2f} " + f"[r {context.gtd_request_count}]" ) parts = [ @@ -422,6 +399,7 @@ def build_dynamic_engine_setup_prefix( uvm_str, request_str, buffer_limits_str, + guaranteed_fraction_str, ] return " | ".join(parts) diff --git a/examples/post_training/modelopt/.gitignore b/examples/post_training/modelopt/.gitignore deleted file mode 100644 index b9272bd3eb2..00000000000 --- a/examples/post_training/modelopt/.gitignore +++ /dev/null @@ -1 +0,0 @@ -!slurm* diff --git a/examples/post_training/modelopt/ADVANCED.md b/examples/post_training/modelopt/ADVANCED.md index 28aad7d7964..20b17831b70 100644 --- a/examples/post_training/modelopt/ADVANCED.md +++ b/examples/post_training/modelopt/ADVANCED.md @@ -1,93 +1,12 @@
    -# Advanced Usage +# TensorRT Model Optimizer Integration Advanced Topics -[Advanced Configuration](#advanced-configuration) | -[Slurm Examples](#slurm-examples) | -[Checkpoint Resume](#checkpoint-resume) | +[Local Examples](#getting-started-in-a-local-environment) | +[Configuration](#learn-more-about-configuration) | +[Slurm Examples](ADVANCED.md#slurm-examples) | +[Advanced Topics](ADVANCED.md) | +[Megatron-LM Integration](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt)
    -## Advanced Configuration - -### Understanding Configuration Variables - -For simplicity, we use `shell` scripts and variables as arguments. Each script has at least 1 positional -argument `[model_conf]`. Some scripts may require more such as `[qformat]` is needed for -quantization. - -```sh -\ - HF_MODEL_CKPT= \ - bash quantize.sh [model_conf] [qformat] -``` - -> **❗ IMPORTANT:** `model_conf` is used to get the corresponding Megatron-LM `${MODEL_ARGS}`. For example, -> `meta-llama/Llama-3.1-8B-Instruct` or `deepseek-ai/DeepSeek-R1` are both supported. -> -> Provide the pretrained checkpoint through variable `${HF_MODEL_CKPT}` in commandline or -> in a configuration shell script. More variables (e.g. `${TP}`, `${EP}`, ...) can be provided through -> commandline but we recommend passing all variables in a separate `shell` script. - -### Using Configuration Scripts - -When `${HF_MODEL_CKPT}` is not set through the commandline, `./env_setup_template.sh` can be used -to pass all variables instead. If you have your own script, use `${SANDBOX_ENV_SETUP}`. - -```sh -\ - SANDBOX_ENV_SETUP= \ - bash quantize.sh [model_conf] [qformat] -``` - -**For Slurm execution**, you **MUST USE** `${SANDBOX_ENV_SETUP}` (default: `./env_setup_template.sh`). -Other variables are not passed through `sbatch` and `srun` automatically. - -### Common Configuration Variables - -- `HF_MODEL_CKPT`: Path to pretrained model checkpoint -- `TP`: Tensor parallelism degree -- `PP`: Pipeline parallelism degree -- `EP`: Expert parallelism degree (for MoE models) -- `ETP`: Expert tensor parallelism degree (for MoE models) -- `MLM_MODEL_SAVE`: Path to save Megatron-LM checkpoint -- `MLM_MODEL_LOAD`: Path to load Megatron-LM checkpoint -- `MLM_EXTRA_ARGS`: Additional Megatron-LM arguments (e.g., for uneven PP) - -## Slurm Examples - -For models that require multi-node, our scripts in Megatron-LM examples also support `slurm` with a sbatch wrapper. -Start with the example `slurm/sbatch.sh` with some minor modification or use your existing `sbatch` -script. - -Different from local environment, we only allow passing variables through a shell script (default: `env_setup_template.sh`). -Commandline variable passthrough is not supported. - -
    - -### ⭐ BF16 Kimi-K2-Instruct EAGLE3 Training - - `conf/moonshotai/kimi_k2_instruct.sh` is a config that has been tested -with 8 nodes of DGX H100 (TP=8, ETP=1, EP=64, overall 64 H100 GPUs in total). Update `HF_MODEL_CKPT` to the exact -checkpoint path in the container to start: - -```sh -export USER_FSW= -export CONTAINER_IMAGE= -export SANDBOX_ENV_SETUP=./conf/moonshotai/kimi_k2_instruct.sh -sbatch --nodes=8 slurm/sbatch.sh "eagle3.sh moonshotai/Kimi-K2-Instruct" -``` - -To export the trained EAGLE3 model, switch to `kimi_k2_instruct_export.sh`. -**We only support pipeline-parallel (PP) export.** In this case, 2 nodes are used (PP=16). - -```sh -export USER_FSW= -export CONTAINER_IMAGE= -export SANDBOX_ENV_SETUP=./conf/moonshotai/kimi_k2_instruct_export.sh -sbatch --nodes=2 slurm/sbatch.sh "export.sh moonshotai/Kimi-K2-Instruct" -``` - -## Checkpoint Resume - -WIP diff --git a/examples/post_training/modelopt/Dockerfile b/examples/post_training/modelopt/Dockerfile index e127215904d..e0b4f00021e 100644 --- a/examples/post_training/modelopt/Dockerfile +++ b/examples/post_training/modelopt/Dockerfile @@ -4,7 +4,7 @@ ARG PIP_CONSTRAINT= WORKDIR /workspace/nmm-sandbox -RUN pip install jsonlines omegaconf +RUN pip install jsonlines omegaconf pulp torchprofile RUN pip install flask flask_restful fire nltk RUN pip install tiktoken blobfile diff --git a/examples/post_training/modelopt/README.md b/examples/post_training/modelopt/README.md index 33528c30097..be455019096 100644 --- a/examples/post_training/modelopt/README.md +++ b/examples/post_training/modelopt/README.md @@ -5,21 +5,22 @@ [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) | [Local Examples](#getting-started-in-a-local-environment) | -[Configuration](./ADVANCED.md#advanced-configuration) | -[Slurm Examples](./ADVANCED.md#slurm-examples) | -[Speculative Decoding](./speculative.md) | -[Advanced Topics](./ADVANCED.md) +[Configuration](ADVANCED.md#learn-more-about-configuration) | +[Slurm Examples](ADVANCED.md#slurm-examples) | +[Speculative Decoding](speculative.md) | +[Advanced Topics](ADVANCED.md) [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (**ModelOpt**, `nvidia-modelopt`) -provides end-to-end model optimization for NVIDIA hardware including quantization (real or simulated), -knowledge distillation, pruning, speculative decoding, and more. +provides end-to-end model optimization for +NVIDIA hardware including quantization (real or simulated), sparsity, knowledge distillation, pruning, +neural architecture search, and speulative decoding. ## Major Features -- Start from Hugging Face pretrained model checkpoint with on-the-fly conversion to Megatron-LM checkpoint format. +- Start from Hugging Face pretrained model checkpoint with on-the-fly conversion. - Support all kinds of model parallelism (TP, EP, ETP, PP). - Export to TensorRT-LLM, vLLM, and SGLang ready unified checkpoint. @@ -27,14 +28,11 @@ knowledge distillation, pruning, speculative decoding, and more. | Model (`conf/`) | Quantization | EAGLE3 | Pruning (PP only) | Distillation | | :---: | :---: | :---: | :---: | :---: | -| `deepseek-ai/DeepSeek-R1` | ✅ | ✅ | - | - | -| `meta-llama/Llama-{3.1-8B, 3.1-405B, 3.2-1B}-Instruct` | ✅ | ✅ | ✅ | ✅ | -| `meta-llama/Llama-4-{Scout,Maverick}-17B-{16,128}E-Instruct` | ✅ | ✅ | - | - | | `moonshotai/Kimi-K2-Instruct` | ✅ | ✅ | - | - | -| `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | ✅ | - | ✅ | ✅ | -| `openai/gpt-oss-{20b, 120b}` | ✅ | **Online** | ✅ | ✅ | +| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | ✅ | - | - | | `Qwen/Qwen3-{0.6B, 8B}` | ✅ | ✅ | ✅ | ✅ | -| `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** | ✅ | ✅ | ✅ | +| `deepseek-ai/DeepSeek-R1` | ✅ | ✅ | - | - | +| `meta-llama/Llama-{3.1-8B, 3.1-405B, 3.2-1B}-Instruct` | ✅ | ✅ | ✅ | ✅ | ## Getting Started in a Local Environment @@ -45,10 +43,6 @@ pip install -U nvidia-modelopt Alternatively, you can install from [source](https://github.com/NVIDIA/TensorRT-Model-Optimizer) to try our latest features. -> **❗ IMPORTANT:** The first positional argument (e.g. `meta-llama/Llama-3.2-1B-Instruct`) of each script -> is the config name used to match the supported model config in `conf/`. The pretrained HF checkpoint should -> be downloaded and provided through `${HF_MODEL_CKPT}`. - ### ⭐ NVFP4 Quantization, Qauntization-Aware Training, and Model Export @@ -61,7 +55,7 @@ provide `${EXPORT_DIR}` to `export.sh`. > low-precision numerical behavior (fake-quant) which can be run on GPUs with compute > 80. > Real low-precision paramters (e.g. `E4M3` or `E2M1`) > and low-precision compute (e.g. `FP8Linear`) are also supported depending on GPU compute capability. -> **See [Adanvanced Topics](./ADVANCED.md) for details**. +> **See [Adanvanced Topics](advanced.md) for details**. ```sh \ @@ -78,6 +72,31 @@ provide `${EXPORT_DIR}` to `export.sh`. ./export.sh meta-llama/Llama-3.2-1B-Instruct ``` +> **❗ IMPORTANT:** The first positional arugment (e.g. `meta-llama/Llama-3.2-1B-Instruct`) of each script +> is the config name used to match the supported model config in `conf/`. The pretrained checkpoint should +> be downloaded and provided through `${HF_MODEL_CKPT}`. + +Loading the saved distributed checkpoint, the quantized Megatron model can be resumed for inference +(generate or evaluate) or training (SFT or PEFT). To read more about these features, see +[Adanvanced Topics](advanced.md). To learn more about the design, see our [Design]() document [WIP]. + +```sh +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./generate.sh meta-llama/Llama-3.2-1B-Instruct + +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./mmlu.sh meta-llama/Llama-3.2-1B-Instruct + +\ + TP=1 \ + MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ + ./finetune.sh meta-llama/Llama-3.2-1B-Instruct +``` + ### ⭐ Online BF16 EAGLE3 Training Online EAGLE3 training has both the target (frozen) and draft models in the memory where the `hidden_states` @@ -100,23 +119,19 @@ deployment. ./export.sh meta-llama/Llama-3.2-1B-Instruct ``` -See [Adanvanced Topics](./ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`. +See [Adanvanced Topics](ADVANCED.md) for a `moonshotai/Kimi-K2-Instruct` EAGLE3 training example using `slurm`. ### ⭐ Pruning Checkout pruning getting started section and guidelines for configuring pruning parameters in the [ModelOpt pruning README](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/pruning). -Pruning is supported for GPT and Mamba models in Pipeline Parallel mode. Available pruning dimensions are: - +Pruning is supported for GPT and Mamba models. Available pruning options are: - `TARGET_FFN_HIDDEN_SIZE` - `TARGET_HIDDEN_SIZE` - `TARGET_NUM_ATTENTION_HEADS` - `TARGET_NUM_QUERY_GROUPS` - `TARGET_MAMBA_NUM_HEADS` - `TARGET_MAMBA_HEAD_DIM` -- `TARGET_NUM_MOE_EXPERTS` -- `TARGET_MOE_FFN_HIDDEN_SIZE` -- `TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE` - `TARGET_NUM_LAYERS` - `LAYERS_TO_DROP` (comma separated, 1-indexed list of layer numbers to directly drop) @@ -127,44 +142,12 @@ PP=1 \ TARGET_NUM_LAYERS=24 \ HF_MODEL_CKPT= \ MLM_MODEL_SAVE=Qwen3-8B-Pruned \ -./prune.sh Qwen/Qwen3-8B +./prune.sh qwen/Qwen3-8B ``` > [!TIP] > If number of layers in the model is not divisible by pipeline parallel size (PP), you can configure uneven > PP by setting `MLM_EXTRA_ARGS="--decoder-first-pipeline-num-layers --decoder-last-pipeline-num-layers "` -> [!TIP] -> You can reuse pruning scores for pruning same model again to different architectures by setting -> `PRUNE_ARGS="--pruning-scores-path "` - -> [!NOTE] -> When loading pruned M-LM checkpoint for subsequent steps, make sure overwrite the pruned parameters in the -> default `conf/` by setting `MLM_EXTRA_ARGS`. E.g.: for loading above pruned Qwen3-8B checkpoint for mmlu, set: -> `MLM_EXTRA_ARGS="--num-layers 24"` - -### ⭐ Inference and Training - -The saved Megatron-LM distributed checkpoint (output of above scripts) can be resumed for inference -(generate or evaluate) or training (SFT or PEFT). To read more about these features, see -[Advanced Topics](./ADVANCED.md). - -```sh -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./generate.sh meta-llama/Llama-3.2-1B-Instruct - -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./mmlu.sh meta-llama/Llama-3.2-1B-Instruct - -\ - TP=1 \ - MLM_MODEL_CKPT=/tmp/Llama-3.2-1B-Instruct_quant \ - ./finetune.sh meta-llama/Llama-3.2-1B-Instruct -``` - ## Advanced Usage TBD diff --git a/examples/post_training/modelopt/conf/arguments.sh b/examples/post_training/modelopt/conf/arguments.sh index 0193bf8b643..f29e0a9d989 100644 --- a/examples/post_training/modelopt/conf/arguments.sh +++ b/examples/post_training/modelopt/conf/arguments.sh @@ -1,6 +1,3 @@ -#!/bin/bash -set -e - MLM_MODEL_CFG=$1 # Bash coloring diff --git a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh deleted file mode 100644 index 4f301f31c1d..00000000000 --- a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -HF_MODEL_CKPT=/workspace/scratch/moonshotai/Kimi-K2-Instruct -TP=8 -ETP=1 -EP=64 - diff --git a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh b/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh deleted file mode 100644 index 73ee80a6d93..00000000000 --- a/examples/post_training/modelopt/conf/moonshotai/kimi_k2_instruct_export.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -HF_MODEL_CKPT=/workspace/scratch/moonshotai/Kimi-K2-Instruct - -MLM_EXTRA_ARGS=" \ - --decoder-first-pipeline-num-layers 3 \ - --decoder-last-pipeline-num-layers 2 \ - --init-model-with-meta-device \ - --use-cpu-initialization \ - -" - -# Layer distribution over PP: 3, [4] * 14, 2. -PP=16 - diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh deleted file mode 120000 index 3771c930263..00000000000 --- a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh +++ /dev/null @@ -1 +0,0 @@ -NVIDIA-Nemotron-Nano-9B-v2.sh \ No newline at end of file diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh new file mode 100644 index 00000000000..d6ba1e1dcc4 --- /dev/null +++ b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +if [ -z ${HF_MODEL_CKPT} ]; then + HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base + TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base +else + TOKENIZER_MODEL=${HF_MODEL_CKPT} +fi + +MODEL_ARGS=" \ + --save-interval 100000 \ + --micro-batch-size 1 \ + --bf16 \ + --no-masked-softmax-fusion \ + --disable-bias-linear \ + --untie-embeddings-and-output-weights \ + --position-embedding-type none \ + --no-rope-fusion \ + --normalization RMSNorm \ + --squared-relu \ + --num-layers 56 \ + --hidden-size 4480 \ + --ffn-hidden-size 15680 \ + --num-attention-heads 40 \ + --kv-channels 128 \ + --group-query-attention \ + --num-query-groups 8 \ + --hybrid-override-pattern M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M- \ + --is-hybrid-model \ + --mamba-head-dim 80 \ + --mamba-num-heads 128 \ + --mamba-num-groups 8 \ + --mamba-state-dim 128 \ + --seq-length 4096 \ + --max-position-embeddings 131072 \ + --tokenizer-type HuggingFaceTokenizer \ + --make-vocab-size-divisible-by 1 \ + --use-mcore-models \ + --export-model-type MambaModel \ + --padded-vocab-size 131072 \ +" diff --git a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh b/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh deleted file mode 100644 index d6ba1e1dcc4..00000000000 --- a/examples/post_training/modelopt/conf/nvidia/NVIDIA-Nemotron-Nano-9B-v2.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -if [ -z ${HF_MODEL_CKPT} ]; then - HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base - TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base -else - TOKENIZER_MODEL=${HF_MODEL_CKPT} -fi - -MODEL_ARGS=" \ - --save-interval 100000 \ - --micro-batch-size 1 \ - --bf16 \ - --no-masked-softmax-fusion \ - --disable-bias-linear \ - --untie-embeddings-and-output-weights \ - --position-embedding-type none \ - --no-rope-fusion \ - --normalization RMSNorm \ - --squared-relu \ - --num-layers 56 \ - --hidden-size 4480 \ - --ffn-hidden-size 15680 \ - --num-attention-heads 40 \ - --kv-channels 128 \ - --group-query-attention \ - --num-query-groups 8 \ - --hybrid-override-pattern M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M- \ - --is-hybrid-model \ - --mamba-head-dim 80 \ - --mamba-num-heads 128 \ - --mamba-num-groups 8 \ - --mamba-state-dim 128 \ - --seq-length 4096 \ - --max-position-embeddings 131072 \ - --tokenizer-type HuggingFaceTokenizer \ - --make-vocab-size-divisible-by 1 \ - --use-mcore-models \ - --export-model-type MambaModel \ - --padded-vocab-size 131072 \ -" diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen2.5-0.5B-Instruct.sh b/examples/post_training/modelopt/conf/qwen/Qwen2.5-0.5B-Instruct.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen2.5-0.5B-Instruct.sh rename to examples/post_training/modelopt/conf/qwen/Qwen2.5-0.5B-Instruct.sh diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen2.5-7B-Instruct.sh b/examples/post_training/modelopt/conf/qwen/Qwen2.5-7B-Instruct.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen2.5-7B-Instruct.sh rename to examples/post_training/modelopt/conf/qwen/Qwen2.5-7B-Instruct.sh diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen3-0.6B.sh b/examples/post_training/modelopt/conf/qwen/Qwen3-0.6B.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen3-0.6B.sh rename to examples/post_training/modelopt/conf/qwen/Qwen3-0.6B.sh diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen3-235B-A22B.sh b/examples/post_training/modelopt/conf/qwen/Qwen3-235B-A22B.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen3-235B-A22B.sh rename to examples/post_training/modelopt/conf/qwen/Qwen3-235B-A22B.sh diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen3-30B-A3B.sh b/examples/post_training/modelopt/conf/qwen/Qwen3-30B-A3B.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen3-30B-A3B.sh rename to examples/post_training/modelopt/conf/qwen/Qwen3-30B-A3B.sh diff --git a/examples/post_training/modelopt/conf/Qwen/Qwen3-8B.sh b/examples/post_training/modelopt/conf/qwen/Qwen3-8B.sh similarity index 100% rename from examples/post_training/modelopt/conf/Qwen/Qwen3-8B.sh rename to examples/post_training/modelopt/conf/qwen/Qwen3-8B.sh diff --git a/examples/post_training/modelopt/convert_model.py b/examples/post_training/modelopt/convert_model.py index 20ee59a2fe0..9790d73fc4c 100644 --- a/examples/post_training/modelopt/convert_model.py +++ b/examples/post_training/modelopt/convert_model.py @@ -162,7 +162,17 @@ def check_arguments(): if eagle_module is not None: mcore_eagle_state_dict = torch.load(args.extra_model_path) eagle_module.load_state_dict(mcore_eagle_state_dict, strict=False) - + + # Add mask tokens for parallel draft + if unwrapped_model.eagle_config.parallel_draft_step > 1: + assert unwrapped_model.eagle_config.parallel_draft_step <= 4, "Parallel draft only supports steps less than or equal to 4." + tokenizer = get_tokenizer() + for i in range(unwrapped_model.eagle_config.parallel_draft_step - 1): + mask_token = "[MASK_{}]".format(i) + tokenizer._tokenizer.add_tokens([mask_token], special_tokens=True) + token_id = tokenizer._tokenizer.convert_tokens_to_ids(mask_token) + setattr(unwrapped_model, "mask_token_{}".format(i), torch.tensor(token_id)) + elif args.algorithm == "medusa": config = {"medusa_num_heads": args.export_num_medusa_heads, "medusa_num_layers": 1} unwrapped_model = mtsp.convert(unwrapped_model, [("medusa", config)]) diff --git a/examples/post_training/modelopt/finetune.py b/examples/post_training/modelopt/finetune.py index 6489d394392..bd0569bb513 100755 --- a/examples/post_training/modelopt/finetune.py +++ b/examples/post_training/modelopt/finetune.py @@ -167,7 +167,7 @@ def __init__( hf_dataset_kwargs = SFTDataset.hf_dataset_to_kwargs.get( self.hf_dataset, {"split": "train"} ) - self._raw_samples = datasets.load_dataset(self.hf_dataset, token=os.environ.get("HF_TOKEN", None), **hf_dataset_kwargs) + self._raw_samples = datasets.load_dataset(self.hf_dataset, **hf_dataset_kwargs) self._raw_samples = self._raw_samples.shard( num_shards=self.num_shards, index=shard_index ) @@ -455,10 +455,7 @@ def non_loss_data_func(model: GPTModel): """Callback to compute the acceptance length.""" args = get_args() if not args.export_offline_model: - try: - report_draft_acceptance_length(model) - except Exception as e: - print(e) + report_draft_acceptance_length(model) diff --git a/examples/post_training/modelopt/finetune.sh b/examples/post_training/modelopt/finetune.sh index 21493697374..0579dd69157 100755 --- a/examples/post_training/modelopt/finetune.sh +++ b/examples/post_training/modelopt/finetune.sh @@ -14,7 +14,6 @@ MLM_DEFAULT_ARGS=" \ --distributed-timeout-minutes 30 \ --auto-detect-ckpt-format \ --export-te-mcore-model \ - --finetune \ " @@ -68,8 +67,6 @@ if [ -z ${MLM_EVAL_ARGS} ]; then " fi -export HF_TOKEN=${HF_TOKEN} - ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/finetune.py \ ${MODEL_ARGS} \ --tensor-model-parallel-size ${TP} \ diff --git a/examples/post_training/modelopt/prune.py b/examples/post_training/modelopt/prune.py index 6a0178a1420..7819b2ed2af 100644 --- a/examples/post_training/modelopt/prune.py +++ b/examples/post_training/modelopt/prune.py @@ -20,7 +20,6 @@ from modelopt.torch.export import import_mcore_gpt_from_hf from modelopt.torch.prune.plugins.mcore_minitron import SUPPORTED_HPARAMS -from megatron.core.parallel_state import get_pipeline_model_parallel_group, get_tensor_model_parallel_group from megatron.post_training.arguments import add_modelopt_args from megatron.post_training.checkpointing import load_modelopt_checkpoint from megatron.post_training.generate import simple_generate @@ -92,21 +91,6 @@ def add_prune_args(parser): type=int, help="Prune dimension of Mamba attention heads to this value", ) - group.add_argument( - "--target-num-moe-experts", - type=int, - help="Prune number of MoE experts to this value", - ) - group.add_argument( - "--target-moe-ffn-hidden-size", - type=int, - help="Prune MoE FFN hidden size to this value", - ) - group.add_argument( - "--target-moe-shared-expert-intermediate-size", - type=int, - help="Prune MoE shared expert intermediate size to this value", - ) group.add_argument( "--target-num-layers", type=int, @@ -120,12 +104,6 @@ def add_prune_args(parser): nargs="*", help="Drop specific model layers (1-indexed). Cannot be used with rest of the pruning options", ) - group.add_argument( - "--pruning-scores-path", - type=str, - default=None, - help="Path to the cache and reuse pruning scores for pruning again to different params", - ) add_modelopt_args(parser) return parser @@ -147,14 +125,6 @@ def get_calib_dataloader(calib_size=1024, max_sequence_length=512): yield dataset[i][text_column][:max_sequence_length] -def get_params(model): - params = sum(p.numel() for p in model.parameters()) - reduced_params = torch.Tensor([params]).to(device=next(model.parameters()).device) - torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group()) - torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group()) - return reduced_params.item() - - if __name__ == "__main__": initialize_megatron( extra_args_provider=add_prune_args, @@ -211,7 +181,7 @@ def _hf_dataset_forword_loop_func(model): simple_generate(model, tokens.input_ids.cuda(), osl=1) if args.layers_to_drop: - mtp.mcore_minitron.drop_mcore_language_model_layers(model, layers_to_drop=args.layers_to_drop) + mtp.plugins.drop_mcore_language_model_layers(model, layers_to_drop=args.layers_to_drop) else: print_rank_0("Pruning model...") export_config = { @@ -219,22 +189,18 @@ def _hf_dataset_forword_loop_func(model): for k in SUPPORTED_HPARAMS if getattr(args, f"target_{k}", None) is not None } - config = {"forward_loop": _hf_dataset_forword_loop_func} - if args.pruning_scores_path is not None: - config["scores_path"] = args.pruning_scores_path mtp.prune( unwrapped_model, mode="mcore_minitron", constraints={"export_config": export_config}, dummy_input=None, # Not used - config=config, + config={"forward_loop": _hf_dataset_forword_loop_func}, ) # [WAR till modelopt 0.39]: Remove prune state to avoid converting again on restore which forces TP=1. if mto.ModeloptStateManager.has_state_for_mode_type("prune", model=unwrapped_model): mto.ModeloptStateManager.remove_state(unwrapped_model) print_rank_0(f"Pruned Model:\n {unwrapped_model}") - print_rank_0(f"Pruned Model Params: {get_params(unwrapped_model)/1e9:.2f}B") _custom_prompt_forward_loop_func(unwrapped_model) diff --git a/examples/post_training/modelopt/prune.sh b/examples/post_training/modelopt/prune.sh index 33f3e615e96..ef86260b062 100755 --- a/examples/post_training/modelopt/prune.sh +++ b/examples/post_training/modelopt/prune.sh @@ -23,27 +23,23 @@ MLM_DEFAULT_ARGS=" # Example: export LAYERS_TO_DROP="1 5 10" # Define pruning argument mappings: "env_var:cli_arg" -# List of environment variables we want to check for pruning CLI args -PRUNE_ENV_VARS=( - TARGET_FFN_HIDDEN_SIZE - TARGET_HIDDEN_SIZE - TARGET_NUM_ATTENTION_HEADS - TARGET_NUM_QUERY_GROUPS - TARGET_MAMBA_NUM_HEADS - TARGET_MAMBA_HEAD_DIM - TARGET_NUM_MOE_EXPERTS - TARGET_MOE_FFN_HIDDEN_SIZE - TARGET_MOE_SHARED_EXPERT_INTERMEDIATE_SIZE - TARGET_NUM_LAYERS - LAYERS_TO_DROP +PRUNE_ARG_MAPPINGS=( + "TARGET_FFN_HIDDEN_SIZE:--target-ffn-hidden-size" + "TARGET_HIDDEN_SIZE:--target-hidden-size" + "TARGET_NUM_ATTENTION_HEADS:--target-num-attention-heads" + "TARGET_NUM_QUERY_GROUPS:--target-num-query-groups" + "TARGET_MAMBA_NUM_HEADS:--target-mamba-num-heads" + "TARGET_MAMBA_HEAD_DIM:--target-mamba-head-dim" + "TARGET_NUM_LAYERS:--target-num-layers" + "LAYERS_TO_DROP:--layers-to-drop" ) -# Build arguments from environment variables (TARGET_NUM_LAYERS -> --target-num-layers, etc.) -PRUNE_ARGS=${PRUNE_ARGS:-""} -for env_var in "${PRUNE_ENV_VARS[@]}"; do +# Build arguments from environment variables +PRUNE_ARGS="" +for mapping in "${PRUNE_ARG_MAPPINGS[@]}"; do + env_var="${mapping%%:*}" + cli_arg="${mapping##*:}" if [ ! -z "${!env_var}" ]; then - # prepend --, convert to lowercase, replace _ with - - cli_arg="--$(echo "${env_var}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')" PRUNE_ARGS="${PRUNE_ARGS} ${cli_arg} ${!env_var}" fi done @@ -63,9 +59,6 @@ else LOAD_ARGS="--load ${MLM_MODEL_CKPT}" fi - -set -ex - ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/prune.py \ ${MODEL_ARGS} \ ${LOAD_ARGS} \ @@ -74,5 +67,6 @@ ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/prune.py \ --tokenizer-model ${TOKENIZER_MODEL} \ --save ${MLM_MODEL_SAVE} \ --references "${MLM_REF_LABEL}" \ + --calib-size 1024 \ ${PRUNE_ARGS} \ ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} diff --git a/examples/post_training/modelopt/slurm/env_setup_template.sh b/examples/post_training/modelopt/slurm/env_setup_template.sh deleted file mode 100644 index 12b59f06eed..00000000000 --- a/examples/post_training/modelopt/slurm/env_setup_template.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -HF_MODEL_CKPT=/workspace/scratch/meta-llama/Llama-3.2-1B-Instruct -TP=1 -ETP=1 -EP=1 -PP=1 diff --git a/examples/post_training/modelopt/slurm/sbatch.sh b/examples/post_training/modelopt/slurm/sbatch.sh deleted file mode 100644 index 3916c5de2b5..00000000000 --- a/examples/post_training/modelopt/slurm/sbatch.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -#SBATCH -A -#SBATCH -p -#SBATCH --job-name= -#SBATCH --nodes=1 --ntasks-per-node=8 --gpus-per-node=8 -#SBATCH -t 04:00:00 -#SBATCH --exclusive --mem=0 --overcommit - -# Bash coloring -RED='\033[0;31m' -YELLOW='\033[0;33m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -PURPLE='\033[0;35m' -WHITE='\033[0;37m' - -# Predefined logging -MLM_ERROR="${RED}ERROR: ${WHITE}" -MLM_WARNING="${YELLOW}WARNING:${WHITE}" - -# CHANGE THE FOLLOWING TO YOUR DATA, MEGATRON, and CHECKPOINT DIR -if [[ -z ${USER_FSW} ]]; then - printf "${MLM_ERROR} Variable USER_FSW (read/write scratch space) must be set!\n" - exit 1 -fi - -if [ -z ${SANDBOX_DIR} ]; then - SANDBOX_DIR="$(pwd)" - printf "${MLM_WARNING} Variable SANDBOX_DIR not set! (default: ${SANDBOX_DIR})\n" -fi - -if [ -z ${SANDBOX_ENV_SETUP} ]; then - SANDBOX_ENV_SETUP=./env_setup_template.sh - printf "${MLM_WARNING} Variable SANDBOX_ENV_SETUP not set! (default: ${SANDBOX_ENV_SETUP})\n" -fi - -if [ -z ${CONTAINER_IMAGE} ]; then - CONTAINER_IMAGE="nvidia-modelopt-megatron:latest" - printf "${MLM_WARNING} Variable CONTAINER_IMAGE not set! (default: ${CONTAINER_IMAGE})\n" -fi - -if [ -z ${LAUNCH_SCRIPT} ]; then - LAUNCH_SCRIPT="python" - printf "${MLM_WARNING} Variable LAUNCH_SCRIPT not set! (default: ${LAUNCH_SCRIPT})\n" -fi - -# DO NOT MODIFY THE VALUES BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!! -DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` - -CONTAINER_MOUNT="${SANDBOX_DIR}:/workspace/nmm-sandbox,${USER_FSW}:/workspace/scratch" - -srun -l \ - --mpi=pmix \ - --output=%x_%j_$DATETIME.log \ - --container-image ${CONTAINER_IMAGE} \ - --container-workdir "/workspace/nmm-sandbox" \ - --container-mounts ${CONTAINER_MOUNT} \ - --export "HF_MODEL_CKPT=${HF_MODEL_CKPT},SANDBOX_ENV_SETUP=${SANDBOX_ENV_SETUP},LAUNCH_SCRIPT=${LAUNCH_SCRIPT}" \ - bash ${1} - -set +x - diff --git a/examples/post_training/modelopt/validate.sh b/examples/post_training/modelopt/validate.sh index 796231e508e..90ff4810117 100644 --- a/examples/post_training/modelopt/validate.sh +++ b/examples/post_training/modelopt/validate.sh @@ -16,9 +16,8 @@ if [ -z ${MLM_MODEL_CKPT} ]; then fi if [ -z ${PROMPTS_PATH} ]; then - PROMPT_ARGS="" -else - PROMPT_ARGS="--prompts-path ${PROMPTS_PATH}" + printf "${MLM_ERROR} Variable ${PURPLE}PROMPTS_PATH${WHITE} must be set!\n" + exit 1 fi if [ -z ${STEPS} ]; then @@ -41,7 +40,6 @@ if [ -z ${OSL} ]; then STEPS=64 fi -export HF_TOKEN=${HF_TOKEN} ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/validate.py \ ${MODEL_ARGS} \ @@ -51,9 +49,9 @@ ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/validate.py \ --pipeline-model-parallel-size ${PP} \ --tokenizer-model ${TOKENIZER_MODEL} \ --load ${MLM_MODEL_CKPT} \ + --prompts-path ${PROMPTS_PATH} \ --steps ${STEPS} \ --osl ${OSL} \ - ${PROMPT_ARGS} \ ${GT_ARGS} \ ${SAVE_ARGS} \ ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} diff --git a/gpt_builders.py b/gpt_builders.py index 2ef41846f2c..9fa1aff72c7 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -5,7 +5,6 @@ get_gpt_decoder_block_spec, get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, - get_gpt_layer_with_inference_spec, get_gpt_mtp_block_spec, get_gpt_decoder_layer_specs, ) @@ -44,7 +43,6 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): use_te = args.transformer_impl == "transformer_engine" if args.num_experts or (args.linear_attention_type is not None): - assert not (config.transformer_impl == "inference_optimized") # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( config, @@ -54,14 +52,12 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): vp_stage=vp_stage, ) elif args.heterogeneous_layers_config_path is not None: - assert not (config.transformer_impl == "inference_optimized") transformer_layer_spec = get_gpt_heterogeneous_layer_spec(config, use_te) else: # Define the decoder layer spec transformer_layer_spec = _get_transformer_layer_spec(use_te, config) mtp_block_spec = None if args.mtp_num_layers is not None: - assert not (config.transformer_impl == "inference_optimized") # Get GPT decoder layer specs for the model. if args.spec is not None: mtp_transformer_layer_spec = import_module(args.spec) @@ -124,12 +120,6 @@ def _get_transformer_layer_spec(use_te, config): use_kitchen=config.use_kitchen, fallback_to_eager_attn=config.fallback_to_eager_attn, ) - elif config.transformer_impl == "inference_optimized": - return get_gpt_layer_with_inference_spec( - args.qk_layernorm, - args.multi_latent_attention, - qk_l2_norm=args.qk_l2_norm, - ) else: return get_gpt_layer_local_spec( args.num_experts, diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 8a63e0f5cf7..d6ef5f6210e 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -898,10 +898,9 @@ def forward_hook(_module, inputs, output): # Register pre state_dict hook to ensure that the module parameters are # distributed before saving the state_dict. - for name, module in self.named_modules(): - module.register_state_dict_pre_hook( - lambda *args, **kwargs: self._replace_param_with_distributed_if_needed() - ) + self._state_dict_pre_hook = self.module.register_state_dict_pre_hook( + lambda *args, **kwargs: self._replace_param_with_distributed_if_needed() + ) @contextmanager def no_sync(self): diff --git a/megatron/core/fusions/fused_pad_routing_map.py b/megatron/core/fusions/fused_pad_routing_map.py index 8e4d1763270..c382178b6c9 100644 --- a/megatron/core/fusions/fused_pad_routing_map.py +++ b/megatron/core/fusions/fused_pad_routing_map.py @@ -6,7 +6,7 @@ from packaging import version from megatron.core.jit import jit_fuser -from megatron.core.utils import experimental_fn, null_decorator +from megatron.core.utils import null_decorator try: import triton @@ -70,7 +70,6 @@ def _pad_routing_map_kernel( tl.store(output_row_ptr + token_indices, output_row, mask=token_mask) -@experimental_fn(introduced_with_version="0.13.0") @jit_fuser def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> torch.Tensor: """Fused version of pad_routing_map. diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py index a5bfe75fbb6..18fbb18f2f0 100644 --- a/megatron/core/inference/communication_utils.py +++ b/megatron/core/inference/communication_utils.py @@ -71,7 +71,8 @@ def broadcast_from_last_pipeline_stage( tensor.shape ), f"Expected tensor of shape {size} but got {list(tensor.shape)}" assert dtype == tensor.dtype, f"Expected tensor of type {dtype} but got {tensor.dtype}" - _is_cuda_contiguous(tensor) + _is_cuda(tensor) + assert tensor.is_contiguous() else: tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device()) diff --git a/megatron/core/inference/contexts/attention_context/mamba_metadata.py b/megatron/core/inference/contexts/attention_context/mamba_metadata.py index ecb0296559f..e9cd99a6c48 100644 --- a/megatron/core/inference/contexts/attention_context/mamba_metadata.py +++ b/megatron/core/inference/contexts/attention_context/mamba_metadata.py @@ -1,28 +1,8 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from dataclasses import dataclass -from typing import List, Optional, Tuple - import torch -@dataclass -class MambaInferenceStateConfig: - """Config for initializing Mamba model inference state tensors.""" - - layer_type_list: List[str] - """ - A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer. - See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols. - """ - - mamba_conv_states_shape: Tuple[int] - """Mamba conv states shape per request.""" - - mamba_ssm_states_shape: Tuple[int] - """Mamba ssm states shape per request.""" - - class MambaMetadata: """Manages the metadata tensors required for Mamba layers during inference.""" @@ -84,7 +64,7 @@ def update_cudagraph_mapping( """ self.request_to_mamba_state_idx_cudagraph_only[0:num_active_requests] = active_mamba_indices - def allocate_slot(self) -> Optional[int]: + def allocate_slot(self) -> int: """ Allocates a new slot for a request in the Mamba state buffers. diff --git a/megatron/core/inference/contexts/dynamic_block_allocator.py b/megatron/core/inference/contexts/dynamic_block_allocator.py index 026ee47d094..4baa3f5212c 100644 --- a/megatron/core/inference/contexts/dynamic_block_allocator.py +++ b/megatron/core/inference/contexts/dynamic_block_allocator.py @@ -13,86 +13,60 @@ class BlockAllocator: - Initializing a pool of block IDs - Allocating blocks from the pool - Releasing blocks back to the pool + - Managing the guaranteed block count for active requests Args: - context (DynamicInferenceContext): Dynamic inference context. - active_count (int): Total number of active blocks available in the buffer. - The full buffer size is 2*active_count, to accommodate an equal-size - space for paused requests that live on the CPU. + block_count_total (int): Total number of blocks available in the buffer. + gtd_block_count (int): Number of blocks reserved for guaranteed requests. """ - def __init__(self, context: "DynamicInferenceContext", total_count: int): + def __init__(self, block_count_total: int, gtd_block_count: int): + self.block_count_total = block_count_total + self.gtd_block_count = gtd_block_count - self.context = context - - active_count = (total_count - 1) // 2 # -1 for dummy_block_idx (see below) - active_count = max(1, active_count) # need at least one block - self.total_count = 2 * active_count + 1 # +1 for dummy_block_idx - self.total_avail = self.total_count - 1 # -1 for dummy_block_idx - self.active_count = active_count - self.paused_count = self.total_count - self.active_count - 1 # -1 for dummy_block_idx - self.dummy_block_idx = self.total_count - 1 + # Reserve last block ID as dummy block for decode-only inference steps + self.block_count_avail = self.block_count_total - 1 + self.dummy_block_idx = self.block_count_total - 1 # Initialize block pool as a "stack" data structure self.block_bag = torch.arange( - self.total_count, dtype=torch.int32, device=torch.cuda.current_device() - ) - - def __str__(self): - return ( - f"total avail {self.total_avail} / {self.total_count - 1}" - f"; active {self.active_count}" + self.block_count_total, dtype=torch.int32, device=torch.cuda.current_device() ) - def get_active_used(self): - """Compute number of active blocks used.""" - return ( - self.context.request_kv_block_counts[ - self.context.paused_request_count : self.context.total_request_count - ] - .sum() - .item() - ) - - def get_paused_used(self): - """Compute number of paused blocks used.""" - return ( - self.context.request_kv_block_counts[: self.context.paused_request_count].sum().item() - ) - - def get_active_avail(self): - """Compute number of active blocks available.""" - return self.active_count - self.get_active_used() - - def get_paused_avail(self): - """Compute number of paused blocks available.""" - return self.paused_count - self.get_paused_used() - - def is_memory_available(self, num_blocks: int) -> bool: + def is_memory_available(self, num_blocks: int, safe: bool = False) -> bool: """Check if memory blocks are available. + Use 'safe' to avoid all requests being deadlocked. A fraction of the KV cache + memory buffer is reserved to guarantee that a minimum number of active + requests can run on any given step. + Args: num_blocks (int): Number of blocks to check. + safe (bool): Include extra space for guaranteeing ability to run + requests to completion. Return: (bool) Is memory available? """ - return self.get_active_avail() >= num_blocks + if safe: + return self.block_count_avail >= num_blocks + self.gtd_block_count + else: + return self.block_count_avail >= num_blocks - def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]: + def allocate_memory_blocks(self, num_blocks: int = 1, safe: bool = False) -> Optional[Tensor]: """Allocate memory blocks if available, else return None. Args: num_blocks (int): Number of blocks to allocate. + safe (bool): Include extra space for guaranteeing ability to run + requests to completion. Return: (Optional[Tensor]) Allocated block IDs. """ - if self.is_memory_available(num_blocks): - self.total_avail -= num_blocks - block_ids = self.block_bag[self.total_avail : (self.total_avail + num_blocks)] - assert num_blocks == block_ids.numel() - return block_ids + if self.is_memory_available(num_blocks, safe): + self.block_count_avail -= num_blocks + return self.block_bag[self.block_count_avail : (self.block_count_avail + num_blocks)] else: return None @@ -106,8 +80,8 @@ def release_memory_blocks(self, blocks: Tensor) -> None: None """ num_blocks = blocks.size(dim=0) - self.block_bag[self.total_avail : (self.total_avail + num_blocks)] = blocks - self.total_avail += num_blocks + self.block_bag[self.block_count_avail : (self.block_count_avail + num_blocks)] = blocks + self.block_count_avail += num_blocks def reset(self) -> None: """Reset the allocator to initial state. @@ -115,4 +89,4 @@ def reset(self) -> None: This resets the available block count to the entire memory pool (except for the dummy block). """ - self.total_avail = self.total_count - 1 + self.block_count_avail = self.block_count_total - 1 diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py index d15daa90d10..000b58200f8 100644 --- a/megatron/core/inference/contexts/dynamic_context.py +++ b/megatron/core/inference/contexts/dynamic_context.py @@ -1,6 +1,5 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -import logging import math import warnings from contextlib import nullcontext @@ -24,11 +23,14 @@ from megatron.core.inference.utils import tensor_swap from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb from megatron.core.package_info import __version__ as mcore_version -from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list +from megatron.core.ssm.mamba_hybrid_layer_allocation import ( + Symbols, + get_layer_maps_from_layer_type_list, +) from megatron.core.transformer import TransformerConfig from megatron.core.utils import divide as core_divide -from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata +from .attention_context.mamba_metadata import MambaMetadata from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata from .base_context import BaseInferenceContext from .dynamic_block_allocator import BlockAllocator @@ -111,7 +113,7 @@ class BlockOverflowError(ContextOverflowError): class ActiveRequestCountOverflowError(ContextOverflowError): '''Used when `initialize_attention_state()` is called with - `num_warmup_requests > max_active_requests.''' + `num_warmup_requests > max_requests.''' def __init__(self, max_request_count, active_request_count): assert active_request_count > max_request_count @@ -122,13 +124,6 @@ def __init__(self, max_request_count, active_request_count): ) -class TensorStateDeallocatedError(ContextOverflowError): - """Context's tensor state is currently deallocated, such as when the engine - has been suspended.""" - - pass - - class ContextErrorFactory: """Factory class for serializing/deserializing context errors.""" @@ -180,15 +175,6 @@ class WarmupEngineMode(Enum): NON_DECODE = "non_decode" -def get_mem_size_str(n_bytes: int) -> str: - """Convert number of bytes to human-readable string.""" - for exp, suffix in ((4, "TB"), (3, "GB"), (2, "MB"), (3, "KB"), (0, "bytes")): - nquery = int(1024**exp) - if round(n_bytes / nquery) >= 1: - return "%.3g %s" % (n_bytes / nquery, suffix) - raise Exception(f"something went wrong, n_bytes={n_bytes}.") - - # pylint: disable=line-too-long class DynamicInferenceContext(BaseInferenceContext): """Inference context that is passed to the main model in order @@ -199,37 +185,64 @@ class DynamicInferenceContext(BaseInferenceContext): arbitrary sequence length may be added, paused, or removed from the context at any step. The only constraint is the maximum number of requests or tokens that the context is defined to support. For the block-level KV cache, a memory - buffer is allocated up front (size `buffer_size_gb` if `unified_memory_level` - == 0, or `2 * buffer_size_gb` if `unified_memory_level` == 1), that is - divided into blocks and dynamically assigned to requests. At any given step, - any unassigned blocks equate to unused space. + buffer is allocated up front (size `buffer_size_gb`), that is divided into + blocks and dynamically assigned to requests. At any given step, any unassigned + blocks equate to unused space. + + Additionally, a fraction of the memory buffer (`gtd_request_fraction`, i.e., + the 'guaranteed' request fraction) is reserved for guaranteeing that a + minimum number of active requests may continue to generate tokens on any step. + The reason for this is that the context manages two pools of requests: 1) + active requests, and 2) paused requests. Paused requests are requests where + insufficient memory blocks remain for future assignment, and these requests + are set aside until enough memory blocks are available. Active requests are + requests that have sufficient memory blocks to proceed with their generations. + + The situation can arise where all requests eventually become paused due to all + memory blocks being assigned. In this case, there are no active requests and + thus no progress can be made. To handle this case, a fraction of the memory + buffer is reserved that only allows active requests, and no paused requests. + This fraction must be carefully tuned, as it can have an order of magnitude + impact on overall latency. Args: params_dtype (torch.dtype): Dtype used for KV cache. - num_layers (int): Number of layers on this pipeline parallel rank. + num_layers (int): Number of layers. kv_channels (int): Hidden dimension per attention head. num_attention_heads (int): Number of attention heads. max_sequence_length (int): Max possible sequence length (prompt + output) that will occur. - buffer_size_gb (float): Buffer size reserved on the GPU for the KV cache. - if `unified_memory_level` >= 1, then CPU memory is additionally - utilized, resulting in a total buffer size of `2 * buffer_size_gb`. - Regardless of total buffer size, the KV cache is conceptually divided - into 50% active requests and 50% paused requests. - max_tokens (int): Max number of tokens to use for forward passes. This is - primarily limited by prefill activation memory usage. (Defaults to - 16384). + buffer_size_gb (float): Total buffer size (GB), shared by main and + fallback contexts. block_size_tokens (int): Size of KV cache block size. + buffer_guaranteed_fraction (float): Fraction of the memory buffer that is + reserved to guarantee that one or more active requests are able to + run to completion. Without reserving this memory, paused requests are + able to fill the memory buffer and block execution of any requests. + buffer_overflow_factor (Optional[float]): Scaling factor over the buffer + size for auto computing `max_requests` and `max_tokens`. This scaling + factor is used for fitting more requests and tokens in the memory + buffer than it can safely hold, which in turn increases throughput. + max_requests_override (Optional[int]): If set, overrides value computed + from `buffer_overflow_factor`. + max_tokens_override (Optional[int]): If set, overrides value computed + from `buffer_overflow_factor`. tensor_model_parallel_size (Optional[int]): Tensor model parallel size. num_cuda_graphs (Optional[int]): Maximum number of cuda graphs to capture, - where the cuda graph batch sizes range from 1 to `max_active_requests` - (as computed below). Due to rounding, the actual number of cuda graphs - may not equal this argument. + where the cuda graph batch sizes range from 1 to `max_requests` (as + computed below). Due to rounding, the actual number of cuda graphs may + not equal this argument. materialize_only_last_token_logits (Optional[bool]): Whether to only materialize logits for the last token. This should be set to False if returning log probs. - mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba - inference state config if the model is a hybrid model. + layer_type_list (Optional[List[str]]): A list of strings that indicates + the layer type (Mamba / Attention / MLP) for each layer. + See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list + of symbols. This must be provided for hybrid models. + mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request. + This must be provided for hybrid models. + mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request. + This must be provided for hybrid models. use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode engine steps. unified_memory_level (Optional[int]): Set unified memory usage within the @@ -237,17 +250,10 @@ class DynamicInferenceContext(BaseInferenceContext): allocate `memory_buffer` in unified memory. Eventually, additional levels will be included to control other tensors within the context. use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation. - If None, defaults to using flash-infer if available. + If None, defaults to using flash-infer if available. metrics_writer (Optional['WandbModule']): Wandb module for writing metrics. - num_request_metadata (Optional[int]): Number of metadata fields to track per request. - These represent metadata that is needed by the text generation controller, - and that must be kept in sync with active requests through update_requests. """ - DEFAULT_MAX_TOKENS = 16384 - TOKEN_ROUNDER = 64 - REQUEST_ROUNDER = 4 - def __init__( self, *, @@ -257,20 +263,24 @@ def __init__( num_attention_heads: int, max_sequence_length: int, buffer_size_gb: float, - max_tokens: int = DEFAULT_MAX_TOKENS, + buffer_guaranteed_fraction: float, block_size_tokens: int = 256, + buffer_overflow_factor: Optional[float] = None, + max_requests_override: Optional[int] = None, + max_tokens_override: Optional[int] = None, tensor_model_parallel_size: Optional[int] = None, cache_mla_latent: bool = False, kv_lora_rank: Optional[int] = None, qk_pos_emb_head_dim: Optional[int] = None, num_cuda_graphs: Optional[int] = None, materialize_only_last_token_logits: Optional[bool] = True, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, + layer_type_list: Optional[List[str]] = None, + mamba_conv_states_shape: Optional[Tuple[int]] = None, + mamba_ssm_states_shape: Optional[Tuple[int]] = None, use_cuda_graphs_for_non_decode_steps: bool = True, use_flashinfer_fused_rope: bool = False, - unified_memory_level: Optional[int] = 1, + unified_memory_level: Optional[int] = 0, metrics_writer: Optional['WandbModule'] = None, - num_request_metadata: Optional[int] = None, ): super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits) @@ -288,40 +298,36 @@ def __init__( tp_size = parallel_state.get_tensor_model_parallel_world_size() else: tp_size = tensor_model_parallel_size - self.hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) - self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) + hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads) + num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size) # Mamba states. - self.is_hybrid_model = mamba_inference_state_config is not None + self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list if self.is_hybrid_model: - mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape - mamba_ssm_states_shape = mamba_inference_state_config.mamba_ssm_states_shape assert ( mamba_conv_states_shape is not None ), "`mamba_conv_states_shape` must be specified for hybrid models" assert ( mamba_ssm_states_shape is not None ), "`mamba_ssm_states_shape` must be specified for hybrid models" - assert not ( - num_cuda_graphs is not None and use_cuda_graphs_for_non_decode_steps + assert ( + not use_cuda_graphs_for_non_decode_steps ), "Non-decode CUDA graphs not yet supported for hybrid models" # For hybrid models, the layer map converts the global layer index to the # corresponding attention layer index or Mamba layer index depending on the # layer type. - attention_layer_map, mamba_layer_map, _, _ = get_layer_maps_from_layer_type_list( - mamba_inference_state_config.layer_type_list + attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list( + layer_type_list ) self.num_attention_layers = len(attention_layer_map) self.num_mamba_layers = len(mamba_layer_map) - self.mamba_conv_states_shape = mamba_conv_states_shape - self.mamba_ssm_states_shape = mamba_ssm_states_shape self.layer_map = attention_layer_map | mamba_layer_map else: # The layer map is the identity function for pure Transformer models. self.num_attention_layers = num_layers self.num_mamba_layers = 0 - (self.mamba_conv_states_shape, self.mamba_ssm_states_shape) = (None, None) + (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None) self.layer_map = {i: i for i in range(self.num_attention_layers)} if self.num_attention_layers == 0: @@ -334,12 +340,10 @@ def __init__( self.block_size_tokens = block_size_tokens if self.cache_mla_latent: # one vector c_t (rank) + optional RoPE phase slice - self.kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim + kv_reduced_dim = kv_lora_rank + qk_pos_emb_head_dim + self.kv_reduced_dim = kv_reduced_dim self.block_size_bytes = ( - dtype_size_bytes - * self.num_attention_layers - * self.block_size_tokens - * self.kv_reduced_dim + dtype_size_bytes * num_layers * self.block_size_tokens * kv_reduced_dim ) else: self.block_size_bytes = ( @@ -347,18 +351,62 @@ def __init__( * 2 # key, value * self.num_attention_layers * self.block_size_tokens - * self.num_attention_heads_per_partition - * self.hidden_size_per_attention_head + * num_attention_heads_per_partition + * hidden_size_per_attention_head ) assert self.block_size_bytes > 0 + # Adjust buffer to be a multiple of block size. + buffer_size_bytes = int(buffer_size_gb * 1024**3) + buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes + buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem + mamba_states_memory_per_request = 0 if self.is_hybrid_model: - mamba_states_memory_per_request += math.prod(self.mamba_conv_states_shape) - mamba_states_memory_per_request += math.prod(self.mamba_ssm_states_shape) + mamba_states_memory_per_request += math.prod(mamba_conv_states_shape) + mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape) mamba_states_memory_per_request *= self.num_mamba_layers mamba_states_memory_per_request *= dtype_size_bytes + # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size. + def bytes_to_max_requests_and_tokens(n_bytes): + bytes_per_token = self.block_size_bytes / self.block_size_tokens + cost_per_request_bytes = ( + mamba_states_memory_per_request + max_sequence_length * bytes_per_token + ) + # TODO(ksanthanam): Leave room for an extra request in the event of padding + # for non-decode CUDA graphs + n_requests = n_bytes / cost_per_request_bytes + n_tokens = n_requests * max_sequence_length + n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size) + n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size) + return n_requests, n_tokens + + self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes) + if buffer_overflow_factor is not None: + self.max_requests = self.round_up_requests( + int(self.max_requests * buffer_overflow_factor), tp_size=tp_size + ) + self.max_tokens = self.round_up_tokens( + int(self.max_tokens * buffer_overflow_factor / 50.0), tp_size=tp_size + ) + + if max_requests_override is not None: + self.max_requests = ( + max_requests_override + if max_requests_override < self.REQUEST_ROUNDER + else self.round_up_requests(max_requests_override, tp_size=tp_size) + ) + + if max_tokens_override is not None: + self.max_tokens = self.round_up_tokens(max_tokens_override, tp_size=tp_size) + + self.max_requests = min(self.max_requests, self.max_tokens) # e.g., decode only. + + # Initialize context state. + self.params_dtype = params_dtype + self.max_sequence_length = max_sequence_length + # Unified memory. self.unified_memory_level = unified_memory_level if unified_memory_level > 0: @@ -371,38 +419,6 @@ def __init__( ) self.unified_memory_level = 0 - # Initialize block allocator. - buffer_size_bytes = int(buffer_size_gb * 1024**3) - block_count_total = buffer_size_bytes // ( - self.block_size_bytes + mamba_states_memory_per_request - ) - self.block_allocator = BlockAllocator( - context=self, - total_count=( - block_count_total if self.unified_memory_level == 0 else 2 * block_count_total - ), - ) - - # Set max_total_requests, max_active_requests, max_tokens. - self.max_total_requests = self.block_allocator.total_count - 1 # -1 for dummy block - self.max_active_requests = self.block_allocator.active_count - self.max_tokens = max_tokens or self.DEFAULT_MAX_TOKENS - - assert self.max_tokens >= self.max_active_requests, ( - f"max_tokens ({self.max_tokens}) must be >= " - f"max_active_requests ({self.max_active_requests}), " - "to have consistency between cuda graph sizes and the block table size." - ) - - # Track request metadata. - if num_request_metadata is None: - num_request_metadata = len(DynamicInferenceRequest.get_metadata_labels()) - self.num_request_metadata = num_request_metadata - - # Initialize context state. - self.params_dtype = params_dtype - self.max_sequence_length = max_sequence_length - # Request and token counts. self.total_request_count = 0 self.active_token_count = 0 @@ -411,19 +427,93 @@ def __init__( self.padded_active_request_count = None self.paused_tokens = None + # Per-request state. + self.request_ids = torch.full( + (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device() + ) + # request_query_lengths is the input prompt tokens length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) + self.request_query_lengths = torch.empty_like(self.request_ids) + # request_output_lengths is len(input_prompt_tokens) + num_tokens_to_generate + self.request_output_lengths = torch.empty_like(self.request_ids) + # request_kv_length_offsets is the same as query length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) + self.request_kv_length_offsets = torch.empty_like(self.request_ids) + self.request_kv_block_counts = torch.empty_like(self.request_ids) + self.request_last_kv_block_id = torch.empty_like(self.request_ids) + # request_last_kv_block_offset represents number of tokens in the last kv block + self.request_last_kv_block_offset = torch.empty_like(self.request_ids) + + # Per-token state. + self.token_to_input_ids = torch.full( + (self.max_tokens,), 0, dtype=torch.long, device=torch.cuda.current_device() + ) + self.token_to_pos_ids = torch.full_like(self.token_to_input_ids, 0) + self.token_to_request_idx = torch.empty_like(self.token_to_input_ids) + self.token_to_block_idx = torch.empty_like(self.token_to_input_ids) + # i.e For a set of tokens A B C D E F .. and block_size 4: + # token_to_position_in_request is [0, 1, 2, 3, 4, 5] + # token_to_local_position_within_kv_block is [0 , 1, 2, 3, 0, 1, 2] + self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids) + self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids) + + # Calculate the total number of chunks available in the buffer + total_mamba_states_memory = mamba_states_memory_per_request * self.max_requests + block_count_total = ( + max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes + ) + + # Memory buffer. + ctx_manager = ( + torch.cuda.use_mem_pool(self.unified_memory_mempool) + if self.unified_memory_level > 0 + else nullcontext() + ) + with ctx_manager: + if cache_mla_latent: + self.memory_buffer = torch.full( + ( + self.num_attention_layers, + block_count_total, + self.block_size_tokens, + kv_reduced_dim, + ), + -1, + dtype=self.params_dtype, + device=torch.cuda.current_device(), + ) + else: + self.memory_buffer = torch.full( + ( + 2, # key and value + self.num_attention_layers, + block_count_total, + self.block_size_tokens, + num_attention_heads_per_partition, + hidden_size_per_attention_head, + ), + -1, + dtype=self.params_dtype, + device=torch.cuda.current_device(), + ) + # Block ids. self.max_kv_block_count = math.ceil(self.max_sequence_length / self.block_size_tokens) + self.request_to_kv_block_ids = torch.full( + (self.max_requests, self.max_kv_block_count), + -1, + dtype=torch.int, + device=torch.cuda.current_device(), + ) # Cuda graph token-counts (i.e., token counts used by cuda-graph steps, both decode and non-decode). self.cuda_graph_token_counts = None if num_cuda_graphs is not None: # Ensure valid num_cuda_graphs. - num_cuda_graphs = min(max(num_cuda_graphs, 1), self.max_active_requests) + num_cuda_graphs = min(max(num_cuda_graphs, 1), self.max_requests) # Cuda graph step size. cuda_graph_rounder = 8 - self.cuda_graph_step_size = self.max_active_requests / num_cuda_graphs + self.cuda_graph_step_size = self.max_requests / num_cuda_graphs self.cuda_graph_step_size = ( math.ceil(self.cuda_graph_step_size / cuda_graph_rounder) * cuda_graph_rounder ) @@ -432,17 +522,13 @@ def __init__( # Cuda graph token counts. if num_cuda_graphs == 1: - self.cuda_graph_token_counts = [self.max_active_requests] + self.cuda_graph_token_counts = [self.max_requests] else: self.cuda_graph_token_counts = list( - range( - self.cuda_graph_step_size, - self.max_active_requests, - self.cuda_graph_step_size, - ) + range(self.cuda_graph_step_size, self.max_requests, self.cuda_graph_step_size) ) - if self.cuda_graph_token_counts[-1] != self.max_active_requests: - self.cuda_graph_token_counts.append(self.max_active_requests) + if self.cuda_graph_token_counts[-1] != self.max_requests: + self.cuda_graph_token_counts.append(self.max_requests) self.cuda_graph_token_counts.reverse() # Set used for validating active cuda graph token count. @@ -464,205 +550,82 @@ def __init__( self.active_attn_metadata = None self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata( - block_count_total=self.block_allocator.total_count, + block_count_total=block_count_total, max_kv_block_count=self.max_kv_block_count, - max_requests=self.max_total_requests, + max_requests=self.max_requests, block_size_tokens=self.block_size_tokens, max_seqlen=self.max_sequence_length, ) self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata( - block_count_total=self.block_allocator.total_count, + block_count_total=block_count_total, max_kv_block_count=self.max_kv_block_count, - max_requests=self.max_total_requests, + max_requests=self.max_requests, block_size_tokens=self.block_size_tokens, max_seqlen=self.max_sequence_length, ) - # Deal with chunked prefill - self.chunked_prefill_request_id = -1 - - # FlashInfer. - if use_flashinfer_fused_rope is True: - assert HAVE_FLASHINFER, "flashinfer is not installed" - elif use_flashinfer_fused_rope is None: - use_flashinfer_fused_rope = HAVE_FLASHINFER - self.use_flashinfer_fused_rope = use_flashinfer_fused_rope - - # Allocate GPU state. - self.is_tensor_state_allocated = False - self.allocate_all_tensors(is_init=True) - - # Print info. - logging.info( - "DynamicInferenceContext: allocated context with active buffer size %s (%d blocks)." - % ( - get_mem_size_str(self.block_allocator.active_count * self.block_size_bytes), - self.block_allocator.active_count, - ) - ) - - def allocate_all_tensors(self, *, is_init: bool) -> None: - """Allocate GPU state. - - This method is used for both 1) initial allocation, and 2) resuming the - GPU state after a suspend. - - Args: - is_init (bool): True if this is being called from `__init__()`. - """ - - # Only allocate tensors when not using unified memory at all (level 0), - # or for initial allocation during `__init__()`. For levels 1 and 2, we do - # not perform any explicit allocations or deallocations after the initial - # call to `__init__()`. - if self.unified_memory_level != 0 and not is_init: - return - - # Mark allocated. - if self.is_tensor_state_allocated: - return - self.is_tensor_state_allocated = True - - # Validate no tensors allocated prior to this method. - for key in vars(self).keys(): - value = getattr(self, key) - assert not isinstance(value, torch.Tensor), ( - "All tensors should be allocated within `allocate_all_tensors()." - f"Please move tensor '{key}'." - ) - - # Per-request state. - self.request_ids = torch.full( - (self.max_total_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device() - ) - # request_query_lengths is the input prompt tokens length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) - self.request_query_lengths = torch.empty_like(self.request_ids) - # request_output_lengths is len(input_prompt_tokens) + num_tokens_to_generate - self.request_output_lengths = torch.empty_like(self.request_ids) - # request_kv_length_offsets is the same as query length during prefill phase (1st step) and then 1 for the decode phase (i.e During generation) - self.request_kv_length_offsets = torch.empty_like(self.request_ids) - self.request_kv_block_counts = torch.empty_like(self.request_ids) - self.request_last_kv_block_id = torch.empty_like(self.request_ids) - # request_last_kv_block_offset represents number of tokens in the last kv block - self.request_last_kv_block_offset = torch.empty_like(self.request_ids) - self.request_to_kv_block_ids = torch.full( - (self.max_total_requests, self.max_kv_block_count), - -1, - dtype=torch.int, - device=torch.cuda.current_device(), - ) - - # Track request metadata. - self.request_metadata = torch.empty( - (self.max_total_requests, self.num_request_metadata), - dtype=torch.float32, - device=torch.cuda.current_device(), + # Guaranteed active requests. + # * See details in the class docstring above. `gtd_request_fraction` is + # the fraction of blocks in the memory buffer that are reserved for + # guaranteeing that some number of active requests can always proceed + # with their generations. The number of blocks defined by + # `buffer_guaranteed_fraction * block_count_total` is converted to a + # number of requests that this reserved space can safely handle + # (`gtd_request_count`). + # * Note: computing the size of this guaranteed space from blocks rather + # than bytes is safer due to the non-linear impacts of a large + # `block_size_tokens` or `max_kv_block_count`. When computing from + # blocks, this space will always be less than `block_count_total`. When + # computing from bytes, this space can unexpectedly be much larger than + # `block_count_total`, resulting in stalled generations. + gtd_block_count = int(buffer_guaranteed_fraction * block_count_total) + gtd_block_count = min(gtd_block_count, block_count_total) + self.gtd_request_count = max(1, gtd_block_count // self.max_kv_block_count) + self.gtd_block_count = self.gtd_request_count * self.max_kv_block_count + + # Initialize allocator for KV memory blocks + self.block_allocator = BlockAllocator( + block_count_total=block_count_total, gtd_block_count=self.gtd_block_count ) - # Per-token state. - self.token_to_input_ids = torch.full( - (self.max_tokens,), 0, dtype=torch.long, device=torch.cuda.current_device() - ) - self.token_to_pos_ids = torch.full_like(self.token_to_input_ids, 0) - self.token_to_request_idx = torch.empty_like(self.token_to_input_ids) - self.token_to_block_idx = torch.empty_like(self.token_to_input_ids) - # i.e For a set of tokens A B C D E F .. and block_size 4: - # token_to_position_in_request is [0, 1, 2, 3, 4, 5] - # token_to_local_position_within_kv_block is [0 , 1, 2, 3, 0, 1, 2] - self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids) - self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids) - - # Memory buffer. - def allocate_memory_buffer(): - """Allocate the memory buffer. This function is called below within - `with ctx_manager:`.""" - if self.cache_mla_latent: - self.memory_buffer = torch.full( - ( - self.num_attention_layers, - self.block_allocator.total_count, - self.block_size_tokens, - self.kv_reduced_dim, - ), - -1, - dtype=self.params_dtype, - device=torch.cuda.current_device(), - ) - else: - self.memory_buffer = torch.full( - ( - 2, # key and value - self.num_attention_layers, - self.block_allocator.total_count, - self.block_size_tokens, - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ), - -1, - dtype=self.params_dtype, - device=torch.cuda.current_device(), - ) - # Optional state tensors for hybrid models - def allocate_mamba_states(): - """Allocate Mamba states. This function is called below within - `with ctx_manager:`.""" - if self.is_hybrid_model: - self.mamba_metadata = MambaMetadata(max_requests=self.max_total_requests) + if self.is_hybrid_model: + self.mamba_metadata = MambaMetadata(max_requests=self.max_requests) + + with ctx_manager: self.mamba_conv_states = torch.zeros( - (self.num_mamba_layers, self.max_total_requests) + self.mamba_conv_states_shape, + (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape, dtype=self.params_dtype, device=torch.cuda.current_device(), ) self.mamba_ssm_states = torch.zeros( - (self.num_mamba_layers, self.max_total_requests) + self.mamba_ssm_states_shape, + (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape, dtype=self.params_dtype, device=torch.cuda.current_device(), ) - else: - self.mamba_metadata = None + else: + self.mamba_metadata = None - # Allocate `ctx_manager`-managed buffers. (For currently unknown reasons, - # `ctx_manager` can only be used once.) - ctx_manager = ( - torch.cuda.use_mem_pool(self.unified_memory_mempool) - if self.unified_memory_level > 0 - else nullcontext() - ) - with ctx_manager: - allocate_memory_buffer() - allocate_mamba_states() + # Store the dummy block idx reference for convenience + self.dummy_block_idx = self.block_allocator.dummy_block_idx + + # Deal with chunked prefill + self.chunked_prefill_request_id = -1 # Reset attention and Mamba state. self.reset_attention_state() self.reset_mamba_state() - def deallocate_all_tensors(self): - """Deallocate GPU state. - - This method is used for suspending the dynamic engine. - """ - - # Only deallocate tensors when not using unified memory at all (level 0). - # For levels 1 and 2, we do not perform any explicit allocations or - # deallocations after the initial call to `__init__()`. - if self.unified_memory_level != 0: - return - - # Mark deallocated. - if not self.is_tensor_state_allocated: - return - self.is_tensor_state_allocated = False + if use_flashinfer_fused_rope is True: + assert HAVE_FLASHINFER, "flashinfer is not installed" + elif use_flashinfer_fused_rope is None: + use_flashinfer_fused_rope = HAVE_FLASHINFER + self.use_flashinfer_fused_rope = use_flashinfer_fused_rope - # Delete all tensor attributes. - # TODO(@lmcafee): check that device == 'cuda'? - keys = list(vars(self).keys()) - for key in keys: - value = getattr(self, key) - if isinstance(value, torch.Tensor): - delattr(self, key) + TOKEN_ROUNDER = 64 + REQUEST_ROUNDER = 4 @classmethod def round_up_tokens(cls, value, tp_size=None): @@ -693,13 +656,13 @@ def from_config( max_batch_size: int, buffer_size_gb: float = 40, num_cuda_graphs: int = None, - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, ): """ Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`. """ # TODO: Add other necessary configs from inference_config + buffer_guaranteed_fraction = 0.1 model_config = model.config max_sequence_length = ( inference_config.inference_max_seq_length or model_config.max_sequence_length @@ -707,15 +670,16 @@ def from_config( max_sequence_length = max(max_sequence_length, max_batch_size) return cls( params_dtype=inference_config.params_dtype, - num_layers=model_config.num_layers // model_config.pipeline_model_parallel_size, + num_layers=model_config.num_layers, kv_channels=model_config.kv_channels, num_attention_heads=model_config.num_query_groups, max_sequence_length=inference_config.inference_max_seq_length, buffer_size_gb=buffer_size_gb, + buffer_guaranteed_fraction=buffer_guaranteed_fraction, materialize_only_last_token_logits=False, + max_requests_override=max_batch_size, num_cuda_graphs=num_cuda_graphs, use_flashinfer_fused_rope=None, - mamba_inference_state_config=mamba_inference_state_config, ) @classmethod @@ -856,7 +820,6 @@ def key_value_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]: to blocks within the block-level memory buffer. """ attention_layer_number = self.layer_map[layer_number - 1] - if self.cache_mla_latent: return ( self.memory_buffer[attention_layer_number], @@ -1025,7 +988,7 @@ def initialize_attention_state( Args: num_warmup_tokens (Optional[int]): Number of tokens to use for warming up cuda graphs. Must be less than or equal to - `max_active_requests`. + `max_requests`. warmup_engine_mode (WarmupEngineMode): Denote whether to setup for a decode or a non-decode cuda-graph warmup. num_warmup_requests (Optional[int]): [DEPRECATED] Use num_warmup_tokens instead. @@ -1045,8 +1008,8 @@ def initialize_attention_state( # warmup both decode and non-decode engine steps if num_warmup_tokens is not None: - if num_warmup_tokens > self.max_active_requests: - raise ActiveRequestCountOverflowError(self.max_active_requests, num_warmup_tokens) + if num_warmup_tokens > self.max_requests: + raise ActiveRequestCountOverflowError(self.max_requests, num_warmup_tokens) if warmup_engine_mode == WarmupEngineMode.NON_DECODE: assert self.non_decode_cuda_graphs, "Set non-decode cuda graphs to True" @@ -1065,9 +1028,7 @@ def initialize_attention_state( math.ceil(active_token_count / self.cuda_graph_step_size) * self.cuda_graph_step_size ) - self.padded_active_token_count = min( - self.padded_active_token_count, self.max_active_requests - ) + self.padded_active_token_count = min(self.padded_active_token_count, self.max_requests) assert ( self.padded_active_token_count in self.cuda_graph_token_counts_set ), f"padded_active_token_count: {self.padded_active_token_count} not in cuda_graph_token_counts_set: {self.cuda_graph_token_counts_set}" @@ -1077,7 +1038,7 @@ def initialize_attention_state( if self.is_decode_only(): # For decode-only, the padded active token count cannot exceed max-requests. self.padded_active_token_count = min( - self.padded_active_token_count, self.max_active_requests + self.padded_active_token_count, self.max_requests ) # How are we calculating the padded active request count? @@ -1095,7 +1056,7 @@ def initialize_attention_state( # Update token position indexes. self.token_to_block_idx[self.active_token_count : self.padded_active_token_count] = ( - self.block_allocator.dummy_block_idx + self.dummy_block_idx ) self.token_to_local_position_within_kv_block[ self.active_token_count : self.padded_active_token_count @@ -1170,7 +1131,6 @@ def reset(self) -> None: self.request_last_kv_block_id.fill_(-1) self.request_last_kv_block_offset.fill_(0) self.request_to_kv_block_ids.fill_(-1) - self.request_metadata.fill_(0) # Reset token indexes. self.token_to_input_ids.fill_(0) @@ -1238,20 +1198,20 @@ def last_token_logits(self, logits: Tensor) -> Tensor: return last_token_logits - def check_availability(self, req: DynamicInferenceRequest) -> (bool, bool, bool): + def check_availability( + self, req: DynamicInferenceRequest, safe: bool = False + ) -> (bool, bool, bool): """ Check if the request can be added to the context. """ - request_can_be_added = ( - self.total_request_count - self.paused_request_count < self.max_active_requests - ) + request_can_be_added = self.total_request_count < self.max_requests request_tokens_can_be_added = ( self.active_token_count + req.remaining_prompt_length <= self.max_tokens ) blocks = math.ceil( (req.remaining_prompt_length + req.finished_chunk_token_count) / self.block_size_tokens ) - math.ceil(req.finished_chunk_token_count / self.block_size_tokens) - kv_cache_available = self.block_allocator.is_memory_available(blocks) + kv_cache_available = self.block_allocator.is_memory_available(blocks, safe=safe) return request_can_be_added, request_tokens_can_be_added, kv_cache_available def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] = None) -> None: @@ -1264,12 +1224,6 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] Return: None """ - - # If tensor state is deallocated, do not add request. - if not self.is_tensor_state_allocated: - raise TensorStateDeallocatedError(req.request_id) - - # Chunk length. if chunk_length is None: chunk_length = req.remaining_prompt_length @@ -1297,7 +1251,9 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] num_blocks_needed = overall_required_blocks - already_allocated_blocks if num_blocks_needed > 0: - new_block_ids = self.block_allocator.allocate_memory_blocks(num_blocks_needed) + new_block_ids = self.block_allocator.allocate_memory_blocks( + num_blocks_needed, safe=not is_chunked_prefill + ) if new_block_ids is None or len(new_block_ids) != num_blocks_needed: raise BlockOverflowError(req.request_id) @@ -1315,22 +1271,13 @@ def add_request(self, req: DynamicInferenceRequest, chunk_length: Optional[int] else: current_id = self.total_request_count - if current_id >= self.max_active_requests: + if current_id >= self.max_requests: raise RequestOverflowError(req.request_id) if self.active_token_count + chunk_length > self.max_tokens: raise TokenOverflowError(req.request_id) self.request_ids[current_id] = req.request_id - # Handle request metadata. - metadata = req.tracked_metadata - assert ( - len(metadata) == self.num_request_metadata - ), "Request added to context with invalid metadata length" - self.request_metadata[current_id] = torch.tensor( - metadata, dtype=torch.float32, device=self.request_metadata.device - ) - # Handle length and block assignments. self.request_query_lengths[current_id] = chunk_length self.request_output_lengths[current_id] = ( req.finished_chunk_token_count @@ -1395,7 +1342,6 @@ def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): self.request_kv_length_offsets[dst_idxs] = self.request_kv_length_offsets[src_idxs] self.request_query_lengths[dst_idxs] = self.request_query_lengths[src_idxs] self.request_output_lengths[dst_idxs] = self.request_output_lengths[src_idxs] - self.request_metadata[dst_idxs] = self.request_metadata[src_idxs] self.request_ids[dst_idxs] = self.request_ids[src_idxs] next_tokens[dst_idxs] = next_tokens[src_idxs] @@ -1416,7 +1362,6 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): tensor_swap(self.request_kv_length_offsets, src_idxs, dst_idxs) tensor_swap(self.request_query_lengths, src_idxs, dst_idxs) tensor_swap(self.request_output_lengths, src_idxs, dst_idxs) - tensor_swap(self.request_metadata, src_idxs, dst_idxs) tensor_swap(self.request_ids, src_idxs, dst_idxs) tensor_swap(next_tokens, src_idxs, dst_idxs) tensor_swap(self.request_to_kv_block_ids, src_idxs, dst_idxs) @@ -1427,14 +1372,6 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens): if self.is_hybrid_model: tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs) - def get_index_of_chunked_prefill_request(self) -> int: - """Get the index of the chunked prefill request in the context. - - Return: - (int) Index of the chunked prefill request, or -1 if none exists. - """ - return torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] - # TODO: see if we can compile this function def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor: """Update context state after calling engine.step(). @@ -1452,7 +1389,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T between these request groups. - 0:paused_request_count -> paused requests - paused_request_count:total_request_count -> active requests - - total_request_count:max_active_requests -> completed requests are moved here. + - total_request_count:max_requests -> completed requests are moved here. The reason for maintaining contiguous tensors rather than multiple smaller (e.g., per-group or per-request) tensors is for both 1) speed (avoid unnecessary tensor allocations), and 2) compatibility with the @@ -1476,7 +1413,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T Return: (Tensor) Newly paused request IDs. """ - # 1. The active token mask tells us which requests are still active and which are completed # active_request_count -> This corresponds to requests that have not reached EOD or max length # finished_request_count are requests that have reached the termination criterion @@ -1496,9 +1432,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T # Reset attention state. self.reset_attention_state() - # Update total_request_count. - self.total_request_count = active_request_count + self.paused_request_count - # 2. If no paused requests are present and no active requests we release memory and reset. if active_request_count + self.paused_request_count == 0: if finished_request_count > 0: @@ -1591,19 +1524,13 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T if self.chunked_prefill_request_id != -1: # find the id in request_ids that is the chunked_prefill_request_id. Only one request should be chunked. - active_requests_requiring_new_block[self.get_index_of_chunked_prefill_request()] = ( - 0 # chunked prefill should not be paused - ) + pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] + active_requests_requiring_new_block[pos] = 0 # chunked prefill should not be paused active_requests_requiring_new_block_count = ( (active_requests_requiring_new_block == 1).sum().item() ) - if active_requests_requiring_new_block_count > 0: - newly_paused_request_ids = self.request_ids[ - torch.nonzero(active_requests_requiring_new_block) + self.paused_request_count - ] - # Swap unfinished active requests on the left side with paused requests on the right side # NOTE : We add paused request count because we concatenate # paused tokens to the left at the beginning of update requests @@ -1636,6 +1563,7 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T self._move_book_keeping_tensors( src_idxs=src_idxs, dst_idxs=dst_idxs, next_tokens=next_tokens ) + newly_paused_request_ids = self.request_ids[dst_idxs] self.paused_request_count += active_requests_requiring_new_block_count active_request_count -= active_requests_requiring_new_block_count @@ -1644,26 +1572,26 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T # We determine how many requests we can resume and resume them # Assign released blocks to paused requests. # todo: @shanmugamr, un-pause requests using FIFO, rather than LIFO. - resume_request_count = 0 - if self.paused_request_count > 0: - active_block_count_avail = self.block_allocator.get_active_avail() - paused_block_counts = self.request_kv_block_counts[: self.paused_request_count] - paused_block_counts = paused_block_counts.flip(dims=[0]) - paused_block_counts += 1 # +1 for newly added block - paused_block_counts_cumsum = paused_block_counts.cumsum(dim=0) + num_non_gtd_blocks = max(0, self.block_allocator.block_count_avail - self.gtd_block_count) + if num_non_gtd_blocks: + # if we have non-gtd blocks, use them. Do not dip into the gtd-block pool + resume_request_count = min(num_non_gtd_blocks, self.paused_request_count) + else: + # only dip into the gtd-block pool if we have run out of non-gtd-blocks and the active + # request count has fallen below a certain threshold. resume_request_count = min( - torch.nonzero(paused_block_counts_cumsum <= active_block_count_avail).numel(), - self.block_allocator.total_avail, + max(self.gtd_request_count - active_request_count, 0), self.paused_request_count ) self.paused_request_count -= resume_request_count active_request_count += resume_request_count assert active_request_count > 0, "active_request_count == %d." % active_request_count - # finally, swap the chunked prefill to the end of the active requests to obey the invariance + # finally, swap the chunked prefill to the end of the active requests to obey the invariant if self.chunked_prefill_request_id != -1: + pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0] self._swap_book_keeping_tensors( - src_idxs=torch.tensor([self.get_index_of_chunked_prefill_request()]), + src_idxs=torch.tensor([pos]), dst_idxs=torch.tensor([active_request_count + self.paused_request_count - 1]), next_tokens=next_tokens, ) @@ -1712,7 +1640,6 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T == 0 ), "The request_last_kv_block_offset should be 0 for the requests that just got resumed this step. " - assert resume_request_count <= self.block_allocator.total_avail block_ids = self.block_allocator.allocate_memory_blocks(resume_request_count) row_idx = torch.arange( self.paused_request_count, @@ -1834,11 +1761,11 @@ def get_kvcache_utilization_stats(self) -> dict: } """ # Total usable blocks exclude the reserved dummy block. - total_blocks = max(self.block_allocator.total_count - 1, 1) - block_count_avail = int(self.block_allocator.total_avail) + total_blocks = max(self.block_allocator.block_count_total - 1, 1) + block_count_avail = int(self.block_allocator.block_count_avail) # Overall allocated blocks in the buffer right now. - allocated_blocks = (self.block_allocator.total_count - 1) - block_count_avail + allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail allocated_blocks = int(max(0, allocated_blocks)) # Active unique blocks referenced by current active requests only. @@ -1860,6 +1787,7 @@ def get_kvcache_utilization_stats(self) -> dict: active_utilization = float(active_unique_blocks) / float(total_blocks) # Diagnostic helpers + num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count)) total_request_count = int(self.total_request_count) return { 'total_blocks': int(total_blocks), @@ -1869,9 +1797,10 @@ def get_kvcache_utilization_stats(self) -> dict: 'active_utilization': active_utilization, 'active_request_count': int(self.get_active_request_count()), 'paused_request_count': int(self.paused_request_count), + 'gtd_block_count': int(self.gtd_block_count), 'block_count_avail': int(block_count_avail), + 'num_non_gtd_blocks': int(num_non_gtd_blocks), 'active_token_count': int(self.active_token_count), 'total_request_count': int(total_request_count), - 'max_total_requests': int(self.max_total_requests), - 'max_active_requests': int(self.max_active_requests), + 'max_requests': int(self.max_requests), } diff --git a/megatron/core/inference/data_parallel_inference_coordinator.py b/megatron/core/inference/data_parallel_inference_coordinator.py index e1fe7b21566..0045d5947a1 100644 --- a/megatron/core/inference/data_parallel_inference_coordinator.py +++ b/megatron/core/inference/data_parallel_inference_coordinator.py @@ -9,7 +9,7 @@ import torch -from megatron.core.inference.headers import Headers, UnknownHeaderError +from megatron.core.inference.headers import Headers try: import zmq @@ -109,8 +109,6 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int): self.identities_of_data_parallel_ranks.append(identity) logging.info("Inference Coordinator: Connected with data parallel ranks...") self.data_parallel_rank_iterator = cycle(self.identities_of_data_parallel_ranks) - self.data_parallel_pause_acks = set() - self.data_parallel_stop_acks = set() self.request_id_to_client_id = {} self.request_id_to_client_request_id = {} @@ -153,7 +151,7 @@ def start(self): # print(f"New client connected: {sender_identity}") known_clients.add(sender_identity) self.router_socket.send_multipart( - [sender_identity, msgpack.packb([Headers.CONNECT_ACK.value], use_bin_type=True)] + [sender_identity, msgpack.packb([Headers.ACK.value], use_bin_type=True)] ) elif header == Headers.SUBMIT_REQUEST: @@ -195,13 +193,7 @@ def start(self): ), ] ) - elif header in [ - Headers.PAUSE, - Headers.UNPAUSE, - Headers.SUSPEND, - Headers.RESUME, - Headers.STOP, - ]: + elif header in [Headers.PAUSE, Headers.UNPAUSE, Headers.STOP]: # control signals for the engine # broadcast to all data parallel ranks if sender_identity not in known_clients: @@ -210,57 +202,13 @@ def start(self): self.router_socket.send_multipart( [data_parallel_rank_id, msgpack.packb([header.value], use_bin_type=True)] ) - if header == Headers.UNPAUSE: - self.data_parallel_pause_acks = set() - elif header == Headers.PAUSE_ACK: - # control signal ack from the engine - assert sender_identity in self.identities_of_data_parallel_ranks - assert sender_identity not in self.data_parallel_pause_acks - self.data_parallel_pause_acks.add(sender_identity) - # route to all clients only once we have gotten an ack from all data parallel ranks - if len(self.data_parallel_pause_acks) == self.data_parallel_size: - for client_id in known_clients: - self.router_socket.send_multipart( - [ - client_id, - msgpack.packb([header.value, sender_identity], use_bin_type=True), - ] - ) - for data_parallel_rank_id in self.identities_of_data_parallel_ranks: - self.router_socket.send_multipart( - [ - data_parallel_rank_id, - msgpack.packb([Headers.PAUSE_ACK.value], use_bin_type=True), - ] - ) - elif header == Headers.STOP_ACK: - # control signal ack from the engine - assert sender_identity in self.identities_of_data_parallel_ranks - assert sender_identity not in self.data_parallel_stop_acks - self.data_parallel_stop_acks.add(sender_identity) - # route to all clients only once we have gotten an ack from all data parallel ranks - if len(self.data_parallel_stop_acks) == self.data_parallel_size: - for client_id in known_clients: - self.router_socket.send_multipart( - [ - client_id, - msgpack.packb([header.value, sender_identity], use_bin_type=True), - ] - ) - for data_parallel_rank_id in self.identities_of_data_parallel_ranks: - self.router_socket.send_multipart( - [ - data_parallel_rank_id, - msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True), - ] - ) elif header == Headers.ENGINE_REPLY: # This is the output of a single engine step on some data parallel rank. assert sender_identity in self.identities_of_data_parallel_ranks - finished_request_records = deserialized_payload[1] + finished_requests = deserialized_payload[1] - for finished_request_record in finished_request_records: - fid = finished_request_record["requests"][0]["request_id"] + for finished_request in finished_requests: + fid = finished_request["request_id"] client_identity = self.request_id_to_client_id[fid] client_request_identity = self.request_id_to_client_request_id[fid] del self.request_id_to_client_id[fid] @@ -270,15 +218,11 @@ def start(self): [ client_identity, msgpack.packb( - [header.value, client_request_identity, finished_request_record], - use_bin_type=True, + [client_request_identity, finished_request], use_bin_type=True ), ] ) - else: - raise UnknownHeaderError(header) - @classmethod def entrypoint( cls, ready_event: Event, inference_coordinator_port: int, data_parallel_size: int diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py index d6a4f6eb694..9cd902d9d63 100644 --- a/megatron/core/inference/engines/__init__.py +++ b/megatron/core/inference/engines/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from .abstract_engine import AbstractEngine -from .dynamic_engine import DynamicInferenceEngine, EngineSuspendedError +from .dynamic_engine import DynamicInferenceEngine from .static_engine import StaticInferenceEngine diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py index 5fad1369308..4bff4f85fa8 100644 --- a/megatron/core/inference/engines/dynamic_engine.py +++ b/megatron/core/inference/engines/dynamic_engine.py @@ -4,13 +4,10 @@ import logging import multiprocessing import os -import socket import struct import time import warnings from collections import deque -from contextlib import contextmanager -from dataclasses import dataclass from datetime import datetime from itertools import repeat from typing import Dict, List, Optional, Tuple, Union @@ -30,19 +27,14 @@ DataParallelInferenceCoordinator, ) from megatron.core.inference.engines.abstract_engine import AbstractEngine -from megatron.core.inference.headers import Headers, UnknownHeaderError -from megatron.core.inference.inference_request import ( - DynamicInferenceRequest, - DynamicInferenceRequestRecord, - Status, -) +from megatron.core.inference.headers import Headers +from megatron.core.inference.inference_request import DynamicInferenceRequest, Status from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) from megatron.core.inference.utils import Counter, await_process_event -from megatron.core.transformer.cuda_graphs import delete_cuda_graphs -from megatron.core.utils import get_asyncio_loop, internal_api, trace_async_exceptions +from megatron.core.utils import get_asyncio_loop, trace_async_exceptions try: from tqdm import tqdm @@ -73,19 +65,6 @@ HAVE_WANDB = False wandb = None -try: - import psutil - - HAVE_PSUTIL = True -except ImportError: - HAVE_PSUTIL = False - - -class EngineSuspendedError(Exception): - """Engine is currently suspended and not performing steps.""" - - pass - def format_mem_bytes(mem_bytes): """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes.""" @@ -96,14 +75,6 @@ def format_mem_bytes(mem_bytes): return "%d bytes" % mem_bytes -@dataclass(kw_only=True) -class RequestEntry: - """Entry in the engine's `self.requests` dict.""" - - record: DynamicInferenceRequestRecord - future: asyncio.Future - - # pylint: disable=line-too-long class DynamicInferenceEngine(AbstractEngine): """The dynamic inference engine. @@ -123,6 +94,9 @@ class DynamicInferenceEngine(AbstractEngine): batching and a dynamic block-level KV cache (similar to paged attention). random_seed (Optional[int]): Use a random seed if you want deterministic results. Defaults to None. + static_sampling (bool): If True, all requests are assumed to have the same + sampling parameters. This avoids needing to loop through all requests and + their sampling parameters every generation step, improving latency. inference_logging_step_interval (int): The step interval at which to log inference metrics to wandb. Defaults to 0, which means no logging. """ @@ -136,9 +110,17 @@ def __init__( *, track_paused_request_events: bool = False, enable_chunked_prefill: bool = True, + static_sampling: bool = False, inference_logging_step_interval: int = 0, ): + if enable_cuda_graph is not None: + warnings.warn( + "The `enable_cuda_graph` argument is deprecated and will be " + "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " + "read directly from the transformer config object." + ) + assert isinstance( controller, TextGenerationController ), f"controller must be a TextGenerationController, got {type(controller)}" @@ -147,41 +129,31 @@ def __init__( ), f"context must be a DynamicInferenceContext, got {type(context)}" assert isinstance(random_seed, int), f"random_seed must be an int, got {type(random_seed)}" - # Deprecate `enable_cuda_graph`. - if enable_cuda_graph is not None: - warnings.warn( - "The `enable_cuda_graph` argument is deprecated and will be " - "removed in `megatron-core 0.15`. `enable_cuda_graph` is now " - "read directly from the transformer config object." - ) - self.enable_cuda_graph = enable_cuda_graph - else: - self.enable_cuda_graph = ( - controller.inference_wrapped_model.model.config.enable_cuda_graph - ) - - # Initialization options. + self.request_counter = Counter() self.controller = controller self.context = context self.random_seed = random_seed self.track_paused_request_events = track_paused_request_events + self.step_count = 0 + self.finished_request_count = 0 + self.waiting_request_ids = deque() + self.failed_request_ids = [] # deque() + self.request_counter = Counter() + self.requests: Dict[int, DynamicInferenceRequest] = {} + self.request_completion_futures: Dict[int, asyncio.Future] = {} + self.step_start_event = torch.cuda.Event(enable_timing=True) + self.step_end_event = torch.cuda.Event(enable_timing=True) + self.paused = False + self.stopped = False self.enable_chunked_prefill = enable_chunked_prefill - self.inference_logging_step_interval = inference_logging_step_interval - self.unified_memory_level = context.unified_memory_level - - if enable_cuda_graph is not None: - self.cuda_graph_impl = "local" if enable_cuda_graph else "none" - else: - self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl - - # Initialize engine. - self.reset() + self.static_sampling = static_sampling + self.inference_logging_step_interval = inference_logging_step_interval # Configure wandb to use separate step counter for inference metrics (only once) if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None: logging.info( f"\033[1;93m[INFERENCE]\033[0m " - f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m" + f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m" ) if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb": # Make all inference/* metrics use inference_step as their x-axis @@ -202,43 +174,21 @@ def __init__( max_step = int(val) self.inference_step_offset = int(max_step) - # Create cuda graphs. - self.create_cuda_graphs() - - def reset(self) -> None: - """Reset by removing all requests and reset all state.""" - - self.context.reset() - - # Request state. - self.request_counter = Counter() - self.finished_request_count = 0 - - self.requests: Dict[int, RequestEntry] = {} - self.waiting_request_ids = deque() - self.failed_request_ids = [] + # Initialize the asyncio loop if it has not already been initialized. + # TODO: Start the engine loop here. + self._loop = get_asyncio_loop() + self._cond = asyncio.Condition() - # Timing and logging variables. - self.rank = torch.distributed.get_rank() - self.step_count = 0 - self.step_start_event = torch.cuda.Event(enable_timing=True) - self.step_end_event = torch.cuda.Event(enable_timing=True) + # Capture cuda graph. self.capture_stats = None - # Runtime state. - self._loop = get_asyncio_loop(getattr(self, "_loop", None)) - self._cond = asyncio.Condition() - self.running = asyncio.Event() - self.paused = asyncio.Event() - self.stopped = asyncio.Event() - self.received_pause: bool = False - self.received_stop: bool = False - self.suspend_signal = False - self.is_suspended = False - self.resume_request_ids = None - - # Coordinator state. - self.use_coordinator = False + if enable_cuda_graph is not None: + self.cuda_graph_impl = "local" if enable_cuda_graph else "none" + else: + self.cuda_graph_impl = controller.inference_wrapped_model.model.config.cuda_graph_impl + + if self.cuda_graph_impl == "local": + self.create_cuda_graphs() def create_cuda_graphs(self, reset_context: bool = True): """Create cuda graphs. @@ -249,10 +199,6 @@ def create_cuda_graphs(self, reset_context: bool = True): Args: reset_context (bool): Whether to reset the context after building cuda graphs. """ - - if self.cuda_graph_impl != "local": - return - context = self.context controller = self.controller @@ -261,7 +207,7 @@ def create_cuda_graphs(self, reset_context: bool = True): if moe_pad_experts and context.non_decode_cuda_graphs: context.non_decode_cuda_graphs = False - if self.rank == 0: + if torch.distributed.get_rank() == 0: warnings.warn( "MoE models do not support non-decode cuda graphs. " "Forcing non_decode_cuda_graphs to False." @@ -346,12 +292,10 @@ def create_cuda_graphs(self, reset_context: bool = True): self.capture_stats = capture_stats - @internal_api async def start_listening_to_data_parallel_coordinator( self, inference_coordinator_port: int, launch_inference_coordinator: bool = True, - verbose: bool = False, *, loop: Optional[asyncio.AbstractEventLoop] = None, ): @@ -362,18 +306,16 @@ async def start_listening_to_data_parallel_coordinator( `InferenceCoordinator`. It configures different ZMQ socket patterns based on the rank's role within the distributed topology. - Note that this method must be called on all ranks, as it uses blocking torch broadcasts. - The setup involves two primary roles within each data-parallel group: - 1. **MP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly + 1. **TP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly to the central coordinator via a ZMQ `DEALER` socket. It receives requests and uses a ZMQ `PUB` (publisher) socket to broadcast them - to all other ranks within its model-parallel (MP) group. - 2. **MP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber) - sockets to listen for requests broadcast by their local MP Coordinator. + to all other ranks within its tensor-parallel (TP) group. + 2. **TP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber) + sockets to listen for requests broadcast by their local TP Coordinator. - This architecture uses TCP sockets for both inter-node and intra-node broadcasts - within an MP group. + This architecture uses fast Inter-Process Communication (`ipc`) sockets for + intra-node broadcasts within a TP group. Finally, after setting up the communication channels and ensuring all ranks are synchronized, this method starts the main engine processing loop @@ -385,7 +327,12 @@ async def start_listening_to_data_parallel_coordinator( launch_inference_coordinator (bool, optional): If True, the global rank 0 process will spawn and manage the `InferenceCoordinator` process. Defaults to True. - verbose (bool): Whether to run in verbose mode. + + Note: + The current implementation uses `ipc` sockets for broadcasting requests + within a Tensor Parallel group, which limits each TP group to a single + physical node. For example, if you have 8 GPUs per node, then this will only + work with TP=[1,2,4,8] """ assert HAVE_ZMQ, ( @@ -396,25 +343,7 @@ async def start_listening_to_data_parallel_coordinator( "pip install msgpack" ) - self.zmq_context = zmq.Context().instance() - self.zmq_sockets = [] # keep track of all sockets created by this engine - - # Get world info. - dp_group = parallel_state.get_data_parallel_group() - dp_src = parallel_state.get_data_parallel_src_rank() - dp_size = parallel_state.get_data_parallel_world_size() - dp_rank = parallel_state.get_data_parallel_rank() - - mp_group = parallel_state.get_model_parallel_group() - mp_src = parallel_state.get_model_parallel_src_rank() - tp_rank = parallel_state.get_tensor_model_parallel_rank() - pp_rank = parallel_state.get_pipeline_model_parallel_rank() - - self.is_mp_coordinator = tp_rank == 0 and pp_rank == 0 - self.is_dp_coordinator = (dp_rank == 0) and self.is_mp_coordinator - - # Spawn a DP coordinator process and get the connection info. - if launch_inference_coordinator and self.is_dp_coordinator: + if launch_inference_coordinator and torch.distributed.get_rank() == 0: spawn_context = multiprocessing.get_context('spawn') coordinator_ready_event = spawn_context.Event() self.inference_coordinator_process = spawn_context.Process( @@ -427,223 +356,67 @@ async def start_listening_to_data_parallel_coordinator( ) self.inference_coordinator_process.start() - # Find available ports for MP and bind to them. - if self.is_mp_coordinator: - local_ip = socket.gethostname() - mp_req_sock = self.zmq_context.socket(zmq.PUB) - mp_req_sock.bind_to_random_port(f"tcp://{local_ip}") - mp_req_addr = mp_req_sock.getsockopt_string(zmq.LAST_ENDPOINT) - - mp_len_sock = self.zmq_context.socket(zmq.PUB) - mp_len_sock.bind_to_random_port(f"tcp://{local_ip}") - mp_len_addr = mp_len_sock.getsockopt_string(zmq.LAST_ENDPOINT) - else: - mp_req_addr = None - mp_len_addr = None - - # Broadcast addresses to respective ranks. - bcast = [mp_req_addr, mp_len_addr] - torch.distributed.broadcast_object_list(bcast, src=mp_src, group=mp_group) - [mp_req_addr, mp_len_addr] = bcast - + # Todo [Siddharth]: can we move this code to another file? + self.zmq_context = zmq.Context() + self.zmq_sockets = [] # keep track of all sockets created by this engine ip_address_of_dp_coordinator = os.getenv('MASTER_ADDR', '127.0.0.1') - dp_addr = f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}" - identity = f'mp-coord-{dp_rank}' - if self.is_mp_coordinator: + identity = f'tp-coord-{parallel_state.get_data_parallel_rank()}' + if ( + parallel_state.get_tensor_model_parallel_rank() == 0 + and parallel_state.get_pipeline_model_parallel_rank() == 0 + ): # 1. Create dealer sockets where tp_rank = 0 and pp_rank = 0 # These will receive requests from an InferenceCoordinator. self.socket_for_receiving_requests = self.zmq_context.socket(zmq.DEALER) self.socket_for_receiving_requests.setsockopt(zmq.IDENTITY, identity.encode('utf-8')) - self.socket_for_receiving_requests.connect(dp_addr) + self.socket_for_receiving_requests.connect( + f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}" + ) # send empty string. this is used to register with the coordinator. self.socket_for_receiving_requests.send(b"") # 2. Create a publisher socket. This is used to publish or broadcast - # requests within the model parallel group - self.model_parallel_publisher_socket = mp_req_sock + # requests within the tensor parallel group + self.tensor_parallel_publisher_socket = self.zmq_context.socket(zmq.PUB) + self.tensor_parallel_publisher_socket.bind(f"ipc:///tmp/{identity}-tp-bcast-socket-req") # 3. Create another publisher socket to broadcast the number of messages to receive. - self.model_parallel_num_msgs_publisher_socket = mp_len_sock + self.tensor_parallel_num_msgs_publisher_socket = self.zmq_context.socket(zmq.PUB) + self.tensor_parallel_num_msgs_publisher_socket.bind( + f"ipc:///tmp/{identity}-tp-bcast-socket-len" + ) self.zmq_sockets += [ self.socket_for_receiving_requests, - self.model_parallel_num_msgs_publisher_socket, - self.model_parallel_publisher_socket, + self.tensor_parallel_num_msgs_publisher_socket, + self.tensor_parallel_publisher_socket, ] - # All MP ranks subscribe to the two publisher sockets - self.model_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB) - self.model_parallel_subscriber_socket.connect(mp_req_addr) - self.model_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") - - self.model_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB) - self.model_parallel_num_msgs_subscriber_socket.connect(mp_len_addr) - self.model_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") + # All TP ranks subscribe to the two publisher sockets + self.tensor_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB) + self.tensor_parallel_subscriber_socket.connect(f"ipc:///tmp/{identity}-tp-bcast-socket-req") + self.tensor_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") + + self.tensor_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB) + self.tensor_parallel_num_msgs_subscriber_socket.connect( + f"ipc:///tmp/{identity}-tp-bcast-socket-len" + ) + self.tensor_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "") self.zmq_sockets += [ - self.model_parallel_subscriber_socket, - self.model_parallel_num_msgs_subscriber_socket, + self.tensor_parallel_subscriber_socket, + self.tensor_parallel_num_msgs_subscriber_socket, ] - torch.distributed.barrier(mp_group) + torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group()) - if launch_inference_coordinator and self.is_dp_coordinator: + if launch_inference_coordinator and torch.distributed.get_rank() == 0: await await_process_event(coordinator_ready_event, self.inference_coordinator_process) logging.info("Inference co-ordinator is ready to receive requests!") # Finally run the engine infinite loop loop = get_asyncio_loop(loop) - self.engine_loop_task = loop.create_task( - self.run_engine_with_coordinator(loop=loop, verbose=verbose) - ) - - @contextmanager - @staticmethod - def suspend_resume_ctx(key: str, *, unified_memory_level: int) -> None: - """Context manager for of suspending and resuming the engine. - - This context manager records the time and memory usage when suspending - and resuming the context. TODO(@lmcafee): add argument to optionally - return nullcontext, to avoid overhead. - - Args: - key (str): Key that identifies caller (e.g., 'suspend' or 'resume'). - - Return: - None. - """ - - try: - - start_mem = torch.cuda.memory_stats() - start_time = time.time() - torch.cuda.synchronize() - - yield - - finally: - - end_time = time.time() - - end_mem = torch.cuda.memory_stats() - start_mem_alloc = start_mem["allocated_bytes.all.current"] - end_mem_alloc = end_mem["allocated_bytes.all.current"] - start_mem_res = start_mem["reserved_bytes.all.current"] - end_mem_res = end_mem["reserved_bytes.all.current"] - - rank_str = torch.distributed.get_rank() - dir_str = "deallocating" if end_mem_alloc <= start_mem_alloc else "allocating" - relative_time_str = f"{end_time - start_time:.3f} sec" - relative_mem_str = f"{abs(start_mem_alloc - end_mem_alloc) / 1024**3:.1f} gb" - - if HAVE_PSUTIL: - process = psutil.Process() - mem_info = process.memory_info() - cpu_mem_str = f"{mem_info.rss / 1024**3:.1f} gb" - else: - cpu_mem_str = "--" - - total_mem_str = ", ".join( - ( - f"cpu: {cpu_mem_str}", - f"gpu: alloc {end_mem_alloc / 1024**3:.1f} gb", - f"res {end_mem_res / 1024**3:.1f} gb", - ) - ) - logging.info( - f"[rank {rank_str}] dynamic engine {key}, " - f"unified {unified_memory_level}, " - f"{dir_str} " - f"{relative_mem_str} in {relative_time_str} ... " - f"abs mem usage: {total_mem_str}" - ) - - def suspend(self): - """Suspend engine by deallocating context's GPU state.""" - - # Skip if already suspended, which can happen when using the inference - # coordinator. - if self.is_suspended: - return - self.is_suspended = True - - # Deallocate context tensors. - with self.__class__.suspend_resume_ctx( - "suspended", unified_memory_level=self.unified_memory_level - ): - self.context.deallocate_all_tensors() - - # Delete cuda graphs when not using unified memory at all (level 0). For - # levels 1 and 2, the context's tensors maintain static memory addresses, - # so the cuda graphs are re-used. - if self.unified_memory_level == 0: - delete_cuda_graphs() - - # Maintain references to requests before reset. - waiting_request_ids = list(self.waiting_request_ids) - active_request_ids = set(self.requests.keys()) - set(waiting_request_ids) - self.resume_request_ids = [*active_request_ids, *waiting_request_ids] - self.waiting_request_ids.clear() - - # Suspend requests objects. - for request_id in active_request_ids: - self.requests[request_id].record.suspend(self.controller.tokenizer) - - def resume(self): - """Resume engine by reallocating context's GPU state.""" - - # Skip if not suspended, which can happen when using the inference - # coordinator. - if not self.is_suspended: - return - self.is_suspended = False - - # Resume. - with self.__class__.suspend_resume_ctx( - "resumed", unified_memory_level=self.unified_memory_level - ): - - # Allocate context tensors. - alloc_time = time.time() - torch.cuda.synchronize() - self.context.allocate_all_tensors(is_init=False) - torch.cuda.synchronize() - alloc_time = time.time() - alloc_time - - # Reset context and request data. - self.context.reset() - - # Create cuda graphs (before adding requests, to be in decode mode). - # Only create cuda graphs when not using unified memory at all (level - # 0). For levels 1 and 2, the context's tensors maintain static - # memory addresses, so the cuda graphs are re-used. - capture_time = time.time() - if self.unified_memory_level == 0: - self.create_cuda_graphs() - capture_time = time.time() - capture_time - - # Add requests. - add_time = time.time() - torch.cuda.synchronize() - for request_id in self.resume_request_ids: - self._add_request(self.get_request(request_id)) - torch.cuda.synchronize() - add_time = time.time() - add_time - - # Print inner timing (must be outside context manager above for correct formatting). - logging.info( - " > " - + ", ".join( - ( - f"inner timing: alloc {alloc_time:.3f}", - f"add {add_time:.3f}", - f"capture {capture_time:.3f}.", - ) - ) - ) - - # Notify event loop. - self._loop.call_soon_threadsafe(asyncio.create_task, self._notify_cond_for_new_request()) + self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop)) @trace_async_exceptions async def _notify_cond_for_new_request(self): @@ -655,31 +428,19 @@ def has_unfinished_requests(self) -> bool: """Test if context contains unfinished requests.""" return self.context.has_unfinished_requests() or len(self.waiting_request_ids) > 0 - def get_request(self, request_id: int) -> DynamicInferenceRequest: - """Get most recent request from a request record. - - Args: - request_id (int): Request id. - - Returns: - (DynamicInferenceRequest) The most recent request in the record. - """ - return self.requests[request_id].record[-1] + def reset(self) -> None: + """Reset by removing all requests and reset all state.""" + self.context.reset() + self.waiting_request_ids.clear() + self.step_count = 0 + self.finished_request_count = 0 def _add_request( self, request: DynamicInferenceRequest ) -> asyncio.Future[DynamicInferenceRequest]: request_id = request.request_id - - # Add request to self.requests. If the engine has previously been - # suspended, then the request may already exist. - if request_id not in self.requests: - self.requests[request_id] = RequestEntry( - record=DynamicInferenceRequestRecord.from_request(request), - future=self._loop.create_future(), - ) - + self.requests[request_id] = request if request.status is None: request.status = Status.ACTIVE_AND_GENERATING_TOKENS @@ -695,17 +456,6 @@ def _add_request( request.sampling_params.num_tokens_to_generate = self.context.max_sequence_length - len( request.prompt_tokens ) - if request.sampling_params.termination_id is None: - try: - eod = self.controller.tokenizer.eod - except AttributeError: - if self.rank == 0: - warnings.warn( - "Termination ID not specified, and tokenizer does not define eod." - "Defaulting to not using termination id." - ) - eod = -1 - request.sampling_params.termination_id = eod if ( len(request.prompt_tokens) + request.sampling_params.num_tokens_to_generate @@ -720,10 +470,10 @@ def _add_request( if request.status != Status.FAILED: self.waiting_request_ids.append(request_id) - else: - self.failed_request_ids.append(request_id) - return self.requests[request_id].future + # Create a new asyncio Future to notify the user when the request has completed. + self.request_completion_futures[request_id] = self._loop.create_future() + return self.request_completion_futures[request_id] def add_request( self, @@ -741,6 +491,7 @@ def add_request( Return: Returns an asyncio `Future[DynamicInferenceRequest]` for the user to wait on. """ + prompt_str = None # Tokenize prompt if text. if isinstance(prompt, str): @@ -769,8 +520,8 @@ def add_request( # Initialize request. request = DynamicInferenceRequest( - request_id=request_id, prompt=prompt_str, + request_id=request_id, prompt_tokens=tokens, sampling_params=sampling_params, ) @@ -799,9 +550,9 @@ def post_process_requests( Returns: A list of active requests and completed requests as `DynamicInferenceRequest` objects """ - active_request_ids: list[int] = [] + active_requests: List[DynamicInferenceRequest] = [] + finished_requests: List[DynamicInferenceRequest] = [] finished_request_ids = set(finished_request_ids.tolist()) - finished_request_records: list[DynamicInferenceRequestRecord] = [] self.finished_request_count += len(finished_request_ids) log_probs_iter = log_probs if log_probs else repeat(None) @@ -809,7 +560,7 @@ def post_process_requests( for request_id, token, request_log_probs in zip( request_ids.tolist(), sample.tolist(), log_probs_iter ): - request: DynamicInferenceRequest = self.get_request(request_id) + request: DynamicInferenceRequest = self.requests[request_id] if request_id != self.context.chunked_prefill_request_id: request.generated_tokens.append(token) if request.tpot is None: @@ -843,20 +594,19 @@ def post_process_requests( if request_id in finished_request_ids: request.generated_length = len(request.generated_tokens) request.status = Status.COMPLETED - finished_entry = self.requests.pop(request_id) - finished_request = finished_entry.record[-1] + finished_request = self.requests.pop(request_id) if finished_request.prompt is None: finished_request.prompt = self.controller.tokenizer.detokenize( finished_request.prompt_tokens.tolist() ) finished_request.generated_length = len(finished_request.generated_tokens) + finished_requests.append(finished_request) finished_request.generated_text = self.controller.tokenizer.detokenize( finished_request.generated_tokens ) - finished_request_records.append(finished_entry.record) - finished_entry.future.set_result(finished_entry.record) + self.request_completion_futures[request_id].set_result(finished_request) else: - active_request_ids.append(request_id) + active_requests.append(request) else: # The chunked prefill produces useless tokens # so we are not appending them to the generated tokens. @@ -874,9 +624,9 @@ def post_process_requests( request.prompt_log_probs = [] request.prompt_log_probs.extend(request_log_probs) request.generated_log_probs = [] - active_request_ids.append(request_id) + active_requests.append(request) - return active_request_ids, finished_request_records + return active_requests, finished_requests def schedule_waiting_requests(self): """Tries to schedule any requests in the waiting pool.""" @@ -890,9 +640,9 @@ def schedule_non_chunked_prefill(self): Perform the same original scheduling logic for non-chunked runs """ while self.waiting_request_ids: - req = self.get_request(self.waiting_request_ids[0]) + req = self.requests[self.waiting_request_ids[0]] request_can_be_added, request_tokens_can_be_added, kv_cache_available = ( - self.context.check_availability(req) + self.context.check_availability(req, safe=True) ) if request_can_be_added and request_tokens_can_be_added and kv_cache_available: self.context.add_request(req) @@ -905,6 +655,37 @@ def schedule_non_chunked_prefill(self): else: break + def get_active_sampling_map(self) -> List[Tuple[SamplingParams, List[int]]]: + """Gets a map of sampling methods to active requests indices in the context.""" + # Get all active request IDs. + active_request_ids = self.context.request_ids[ + self.context.paused_request_count : self.context.total_request_count + ].tolist() + if self.static_sampling: + return [(next(iter(self.requests.values())).sampling_params, active_request_ids)] + + # Get a map from request_id to context array index. + context_id_map = {r: i for i, r in enumerate(active_request_ids)} + + # Create map of sampling methods to context array indices. + sampling_map: List[Tuple[SamplingParams, List[int]]] = [] + for request_id, request in self.requests.items(): + if request_id not in context_id_map: + continue + context_id = context_id_map[request_id] + sp = request.sampling_params + + # Look for a pre-existing group with these sampling parameters. + for sampling, indices in sampling_map: + if sampling == sp: + indices.append(context_id) + break + # If no group exists, create a new one. + else: + sampling_map.append((sp, [context_id])) + + return sampling_map + def schedule_chunked_prefill(self): """ This function schedules chunked prefill requests. @@ -923,7 +704,7 @@ def schedule_chunked_prefill(self): can_schedule = True while self.waiting_request_ids and can_schedule: can_schedule = False - req = self.get_request(self.waiting_request_ids[0]) + req = self.requests[self.waiting_request_ids[0]] # is_continuing_chunked_prefill is True if we are scheduling next # chunk of a existing chunked prefill request @@ -935,7 +716,9 @@ def schedule_chunked_prefill(self): self.context.active_token_count + remaining_len <= self.context.max_tokens ) token_partially_can_be_added = self.context.active_token_count < self.context.max_tokens - request_can_be_added, _, kv_cache_available = self.context.check_availability(req) + request_can_be_added, _, kv_cache_available = self.context.check_availability( + req, safe=not is_continuing_chunked_prefill + ) request_can_be_added = is_continuing_chunked_prefill or request_can_be_added if request_can_be_added and kv_cache_available: @@ -964,157 +747,104 @@ def schedule_chunked_prefill(self): # chunked prefill request at the head of the waiting queue # Note that we do not need to continue check the queue, as the tokens are full - async def async_forward(self) -> Tuple[Dict, Dict, float, int]: - """Uses `asyncio` for continuous generation. - Sleeps when no requests are available, until new requests have been added. + async def async_step( + self, *, verbose: Optional[bool] = False + ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: + """ + Wrapper for controller.generate_output_tokens_dynamic_batch(), to + match vLLM API. Uses `asyncio` for continuous generation which allows this + method to sleep and wake up when new requests are available. + + Args: + sampling_params (SamplingParams): The sampling parameters. + verbose (bool): Whether to run in verbose mode. Returns: A tuple comprised of: - step_result (Optional[Dict]): The result of the step. - context_state (Dict): A tuple consisting of the state of the context. - is_decode_only, total/paused request count, active token count. - step_time (float): How long this step took. + 1. Requests that ran in the last step and are still active. + 2. Requests that ran in the last step and have now finished. + 3. The step time in seconds. """ - - # If suspended, no stepping. - if self.is_suspended: - raise EngineSuspendedError(self.step_count) - # schedule requests self.schedule_waiting_requests() - # Saving pre-step state, for printing output below. - is_decode_only = self.context.is_decode_only() - pre_step_context_state = { - "is_decode_only": is_decode_only, - "total_request_count": self.context.total_request_count, - "paused_request_count": self.context.paused_request_count, - "active_token_count": self.context.active_token_count, - } + # Previous context state, for printing output below. + prev_is_decode_only = self.context.is_decode_only() + prev_total_request_count = self.context.total_request_count + prev_paused_request_count = self.context.paused_request_count + prev_active_token_count = self.context.active_token_count + + range_push("Prefill" if not prev_is_decode_only else "Decode") # Generate tokens. - range_push("Prefill" if not is_decode_only else "Decode") - # TODO @TDE: Account for this line when overlapping forward and bookkeep. + is_decode_only = self.context.is_decode_only() + # save the is_decode_only AFTER scheduling, BEFORE update self.is_decode_only = is_decode_only - self.step_start_event.record() - result = await self.controller.async_generate_output_tokens_dynamic_batch() + sampling_map = self.get_active_sampling_map() + result = await self.controller.async_generate_output_tokens_dynamic_batch(sampling_map) self.step_end_event.record() self.step_end_event.synchronize() step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3 - self.step_count += 1 - - range_pop() - - if ( - self.inference_logging_step_interval > 0 - and step_count > 0 - and step_count % self.inference_logging_step_interval == 0 - and self.context.metrics_writer is not None - ): - kvcache_util_stats = self.context.get_kvcache_utilization_stats() - else: - kvcache_util_stats = None - - post_step_context_state = { - "waiting_request_count": len(self.waiting_request_ids), - "finished_request_count": self.finished_request_count, - "kv_stats": kvcache_util_stats, - "padded_active_token_count": self.context.padded_active_token_count, - "using_cuda_graph_this_step": self.context.using_cuda_graph_this_step(), - "total_active_block_count": self.context.block_allocator.active_count, - "total_paused_block_count": self.context.block_allocator.paused_count, - "total_active_used_blocks": self.context.block_allocator.get_active_used(), - "total_paused_used_blocks": self.context.block_allocator.get_paused_used(), - } - - context_state = {**pre_step_context_state, **post_step_context_state} - - return result, context_state, step_time, self.step_count - - async def async_bookkeep( - self, - step_result: Optional[Dict], - context_state: Dict, - step_time: float, - step_count: int, - *, - verbose: bool = False, - ): - """Uses `asyncio` for continuous bookkeeping. - - Args: - step_result (Optional[Dict]): The result of the step. - context_state (Dict): is_decode_only, total/paused request count, active token count. - step_time (float): How long this step took. - step_count (int): The count of the step. - verbose (bool): Whether to run in verbose mode. - Returns: - A dictionary containing: - active_requests (List): Requests that ran in the last step and are still active. - finished_requests (List): Requests that ran in the last step and have now finished. - step_time (float): The step time in seconds. - cuda_graph_request_count (int): The CUDA graph batch size matching this step. - """ # Increment finished_request_count. cuda_graph_request_count = None - if step_result is not None: - active_request_ids = step_result["active_request_ids"] - newly_paused_request_ids = step_result["newly_paused_request_ids"] - finished_request_ids = step_result["finished_request_ids"] - sample = step_result["sample"] - log_probs = step_result["log_probs"] - cuda_graph_request_count = step_result["cuda_graph_request_count"] + if result is not None: + active_request_ids = result["active_request_ids"] + newly_paused_request_ids = result["newly_paused_request_ids"] + finished_request_ids = result["finished_request_ids"] + sample = result["sample"] + log_probs = result["log_probs"] + cuda_graph_request_count = result["cuda_graph_request_count"] # Add paused events. if newly_paused_request_ids is not None and self.track_paused_request_events: newly_paused_request_ids = newly_paused_request_ids.tolist() - [self.get_request(i).add_event_pause() for i in newly_paused_request_ids] + [self.requests[i].add_event_pause() for i in newly_paused_request_ids] # Mark requests finished. - [self.get_request(i).add_event_finish() for i in finished_request_ids.tolist()] + [self.requests[i].add_event_finish() for i in finished_request_ids.tolist()] # Add finished events. - active_request_ids, finished_request_records = self.post_process_requests( + (active_requests, finished_requests) = self.post_process_requests( active_request_ids, finished_request_ids, step_time, sample, log_probs ) else: - active_request_ids: list[int] = [] - finished_request_records: list[DynamicInferenceRequestRecord] = [] + active_requests: List[DynamicInferenceRequest] = [] + finished_requests: List[DynamicInferenceRequest] = [] # Failed requests. for failed_request_id in self.failed_request_ids: - failed_entry = self.requests.pop(failed_request_id) - failed_request = failed_entry.record[-1] + failed_request = self.requests.pop(failed_request_id) failed_request.status = Status.FAILED failed_request.add_event_fail() - finished_request_records.append(failed_entry.record) - failed_entry.future.set_result(failed_entry.record) + finished_requests.append(failed_request) + self.request_completion_futures[failed_request_id].set_result(failed_request) self.failed_request_ids.clear() - # Handle necessary ZMQ DP coordinator communication. - if self.use_coordinator and self.is_mp_coordinator and finished_request_records: - payload = msgpack.packb( - [Headers.ENGINE_REPLY.value, [r.serialize() for r in finished_request_records]], - use_bin_type=True, - ) - self.socket_for_receiving_requests.send(payload) - # Log KV cache utilization stats to W&B - if context_state["kv_stats"] is not None: + if ( + self.inference_logging_step_interval > 0 + and self.step_count > 0 + and self.step_count % self.inference_logging_step_interval == 0 + and self.context.metrics_writer is not None + ): + + # Get KV cache utilization stats from dynamic context + kv_stats = self.context.get_kvcache_utilization_stats() + # Prepare metrics dictionary with all stats # Use 'inference/' prefix for all metrics to separate from training metrics metrics = { - 'inference/inference_step': int(self.inference_step_offset + int(step_count)), + 'inference/inference_step': int(self.inference_step_offset + int(self.step_count)), 'inference/step_time_s': float(step_time), 'inference/waiting_queue_len': int(len(self.waiting_request_ids)), 'inference/total_requests_dict_size': int(len(self.requests)), } # Add KV stats with inference/ prefix # Convert utilization metrics from 0-1 range to 0-100 percentage range for better visualization - for key, value in context_state["kv_stats"].items(): + for key, value in kv_stats.items(): if 'utilization' in key: # Convert to percentage (0-100) and group under kvcache_utilization metrics[f'inference/{key}'] = float(value * 100.0) @@ -1130,16 +860,15 @@ async def async_bookkeep( # Print context state. if verbose: + context = self.context mem = torch.cuda.memory_stats() - step_type = "decode" if context_state["is_decode_only"] else "non-decode" + step_type = "decode" if is_decode_only else "non-decode" output_str = ( - "* rank %d | step %d | %s ... time: %.3f%s ... " - "reqs: a %d/%d, p %d/%d, w %d, f %d ... " - "blocks: a %d/%d, p %d/%d ... " + "* step %d | %s ... time: %.3f%s ... " + "reqs: %d [ gtd %d, active %d, paused %d, finished %d ] ... " "mem: tensors %d, alloc %.1f gb, res %.1f gb." % ( - self.rank, - step_count, + self.step_count, datetime.now().strftime("%H:%M:%S"), step_time, ( @@ -1148,71 +877,44 @@ async def async_bookkeep( step_type, ( "DIM %d:%d" - % ( - context_state["padded_active_token_count"], - context_state["active_token_count"], - ) - if context_state["using_cuda_graph_this_step"] + % (context.padded_active_token_count, prev_active_token_count) + if self.context.using_cuda_graph_this_step() else "OFF" ), ) ), - context_state["total_request_count"] - context_state["paused_request_count"], - context_state["total_active_block_count"], - context_state["paused_request_count"], - context_state["total_paused_block_count"], - context_state["waiting_request_count"], - context_state["finished_request_count"], - context_state["total_active_used_blocks"], - context_state["total_active_block_count"], - context_state["total_paused_used_blocks"], - context_state["total_paused_block_count"], + prev_total_request_count, + context.gtd_request_count, + prev_total_request_count - prev_paused_request_count, + prev_paused_request_count, + self.finished_request_count, mem["allocation.all.current"], mem["allocated_bytes.all.current"] / (1024**3), mem["reserved_bytes.all.current"] / (1024**3), ) ) - if context_state["is_decode_only"]: + if prev_is_decode_only: output_str = f"\033[94m{output_str}\033[0m" logging.info(output_str) + self.step_count += 1 + + range_pop() return { - "active_request_ids": active_request_ids, - "finished_request_records": finished_request_records, + "active_requests": active_requests, + "finished_requests": finished_requests, "step_time": step_time, "cuda_graph_request_count": cuda_graph_request_count, } - async def async_step( - self, *, verbose: bool = False - ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: - """ - Wrapper for controller.generate_output_tokens_dynamic_batch(), to - match vLLM API. Uses `asyncio` for continuous generation which allows this - method to sleep and wake up when new requests are available. - - Args: - verbose (bool): Whether to run in verbose mode. - - Returns: - A tuple comprised of: - 1. Requests that ran in the last step and are still active. - 2. Requests that ran in the last step and have now finished. - 3. The step time in seconds. - """ - last_step_data = await self.async_forward() - ret = await self.async_bookkeep(*last_step_data, verbose=verbose) - # Keep for compatibility with current test suite. - return ret - def step_modern( - self, *, verbose: bool = False + self, *, verbose: Optional[bool] = False ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: """Synchronous wrapper for `self.async_step`.""" return self._loop.run_until_complete(self.async_step(verbose=verbose)) def step_legacy( - self, sampling_params: SamplingParams, *, verbose: bool = False + self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]: """Synchronous wrapper for `self.async_step`.""" warnings.warn( @@ -1220,10 +922,10 @@ def step_legacy( "0.16. Please use `step_modern()` going forward, which will eventually " "be renamed to `step()`." ) - result = self._loop.run_until_complete(self.async_step(verbose=verbose)) - active_requests = [self.get_request(i) for i in result["active_request_ids"]] - finished_requests = [r.merge() for r in result["finished_request_records"]] - return active_requests, finished_requests, result["step_time"] + result = self._loop.run_until_complete( + self.async_step(sampling_params=sampling_params, verbose=verbose) + ) + return (result["active_requests"], result["finished_requests"], result["step_time"]) # For backwards compatibility, point `step()` to `step_legacy()`. Starting in # `megatron-core` 0.16, `step_modern()` will be renamed to `step()`. @@ -1238,40 +940,39 @@ def generate( request_id = int(next(self.request_counter)) _ = self.add_request(request_id, prompt, sampling_params) - finished_request_records_list = [] + finished_requests_list = [] while self.has_unfinished_requests(): result = self.step_modern() - finished_request_records_list.extend(result["finished_request_records"]) + finished_requests_list.extend(result["finished_requests"]) - # Ensure requests are returned in the same order they were passed in. - finished_request_records_list.sort(key=lambda r: r.request_id) + # Ensure requests are returned in the same order they were passed in + finished_requests_list.sort(key=lambda x: x.request_id) - return finished_request_records_list + return finished_requests_list def schedule_requests(self) -> int: """Drains the ZMQ socket for a batch of requests and adds them to the engine. This method is a collective and synchronous operation that must be called - by all ranks in a Model Parallel (MP) group at the same time. It ensures + by all ranks in a Tensor Parallel (TP) group at the same time. It ensures that all ranks process the exact same batch of incoming requests and control signals. The synchronization works as follows: - 1. The MP rank 0 drains all pending messages from its subscriber socket + 1. The TP rank 0 drains all pending messages from its subscriber socket in a non-blocking manner. - 2. MP rank 0 then broadcasts the number of messages it received to all other - ranks in its MP group using a dedicated publisher socket. - 3. The other MP ranks wait to receive this count, and then receive exactly + 2. TP rank 0 then broadcasts the number of messages it received to all other + ranks in its TP group using a dedicated publisher socket. + 3. The other TP ranks wait to receive this count, and then receive exactly that many messages from their subscriber sockets. Once all ranks have the same batch of messages, they are unpacked and processed. New requests are added to the engine's queue, and control - signals (PAUSE, UNPAUSE, SUSPEND, RESUME, STOP) update the engine's - internal state. + signals (PAUSE, STOP, UNPAUSE) update the engine's internal state. Note: This function is synchronous and must be called collectively by all - ranks in a MP group. It should not be launched in a separate coroutine + ranks in a TP group. It should not be launched in a separate coroutine to ensure all ranks execute it in lockstep before proceeding to the next engine step. @@ -1279,9 +980,10 @@ def schedule_requests(self) -> int: int: The number of messages that were received and processed in this batch. """ + rank = parallel_state.get_tensor_model_parallel_rank() torch.cuda.nvtx.range_push("drain_zmq_socket") all_messages = [] - if self.is_mp_coordinator: + if rank == 0: while True: try: # Receive messages in a non-blocking way. @@ -1293,72 +995,37 @@ def schedule_requests(self) -> int: # First publish the number of messages to dequeue. # This is important because we want all tensor parallel ranks # to dequeue the same number of messages. - self.model_parallel_num_msgs_publisher_socket.send( + self.tensor_parallel_num_msgs_publisher_socket.send( struct.pack('!i', messages_to_dequeue) ) - # Now publish the actual messages to all model parallel ranks - if messages_to_dequeue > 0: - self.model_parallel_publisher_socket.send_multipart(all_messages) + # Now publish the actual messages to all tensor parallel ranks + for message in all_messages: + self.tensor_parallel_publisher_socket.send(message) else: - # First, receive the number of messages to dequeue from mp-rank 0 + # First, receive the number of messages to dequeue from tp-rank 0 messages_to_dequeue = struct.unpack( - '!i', self.model_parallel_num_msgs_subscriber_socket.recv() + '!i', self.tensor_parallel_num_msgs_subscriber_socket.recv() )[0] # Now, dequeue the same number of messages from the subscriber socket. # Note that these receives are blocking, because the messages # are guaranteed to be available after the tp-rank 0 has sent them. - if messages_to_dequeue > 0: - all_messages = self.model_parallel_subscriber_socket.recv_multipart() - else: - all_messages = [] + for _ in range(messages_to_dequeue): + all_messages.append(self.tensor_parallel_subscriber_socket.recv()) torch.cuda.nvtx.range_pop() for message in all_messages: data = msgpack.unpackb(message, raw=False) header = Headers(data[0]) - - if self.received_stop: - assert ( - header == Headers.STOP_ACK - ), "Engine is shutting down. No other messages allowed except STOP_ACK." - if header == Headers.SUBMIT_REQUEST: request_id, prompt, sampling_params = data[1:] sampling_params = SamplingParams.deserialize(sampling_params) self.add_request(request_id, prompt, sampling_params) elif header == Headers.PAUSE: - # Pause thyself. - self.received_pause = True - self.running.clear() - # Send PAUSE_ACK back to coordinator. - if self.is_mp_coordinator: - payload = msgpack.packb([Headers.PAUSE_ACK.value], use_bin_type=True) - self.socket_for_receiving_requests.send(payload) - elif header == Headers.STOP: - # Stop thyself. - self.received_stop = True - self.running.clear() - # Send STOP_ACK back to coordinator. - if self.is_mp_coordinator: - payload = msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True) - self.socket_for_receiving_requests.send(payload) - elif header == Headers.PAUSE_ACK: - self.paused.set() - self.received_pause = False - elif header == Headers.STOP_ACK: - self.stopped.set() - self.stop() - elif header == Headers.UNPAUSE: - self.paused.clear() - self.running.set() - elif header == Headers.SUSPEND: - self.suspend_signal = True - elif header == Headers.RESUME: - self.suspend_signal = False + self.paused = True elif header == Headers.STOP: self.stopped = True - else: - raise UnknownHeaderError(header) + elif header == Headers.UNPAUSE: + self.paused = False return len(all_messages) @@ -1376,6 +1043,7 @@ def stop(self): for socket in self.zmq_sockets: socket.close() self.zmq_context.term() + parallel_state.destroy_model_parallel() @trace_async_exceptions async def run_engine( @@ -1383,20 +1051,15 @@ async def run_engine( ): """Continually steps the engine asynchronously.""" self._loop = get_asyncio_loop(loop) - self.use_coordinator = False try: while True: # Wait until there are active requests before proceeding. async with self._cond: await self._cond.wait_for( - lambda: ( - not self.is_suspended - and ( - self.context.get_active_request_count() > 0 - or self.waiting_request_ids - ) - ) + lambda: self.context.get_active_request_count() > 0 + or self.waiting_request_ids ) + await self.async_step(verbose=verbose) except asyncio.CancelledError: pass @@ -1407,14 +1070,14 @@ async def run_engine_with_coordinator( ): """Continually steps the engine asynchronously.""" self._loop = get_asyncio_loop(loop) - self.use_coordinator = True try: while True: self.schedule_requests() - if self.stopped.is_set(): - break + if self.stopped: + self.stop() + return - # for the cases below (no active requests, or undergoing a state-change) + # for the cases below (engine is paused or no active requests), # do not use asyncio.sleep(0) # as tp-rank=0 will flood the num_messages publisher # with "0" repeatedly. This causes some packets to drop. @@ -1426,20 +1089,10 @@ async def run_engine_with_coordinator( # todo [Siddharth]: Can this hardcoded sleep be avoided # with asyncio zmq sockets? - if self.paused.is_set() or self.received_pause or self.received_stop: - await asyncio.sleep(0.02) - continue - - # Suspend, resume. - if self.suspend_signal: - self.suspend() + if self.paused: await asyncio.sleep(0.02) continue - else: - self.resume() - - # No requests. if ( self.context.get_active_request_count() == 0 and len(self.waiting_request_ids) == 0 @@ -1447,7 +1100,25 @@ async def run_engine_with_coordinator( await asyncio.sleep(0.02) continue - await self.async_step(verbose=verbose) + engine_output = await self.async_step(verbose=verbose) + + is_tp0_and_pp0 = ( + parallel_state.get_tensor_model_parallel_rank() == 0 + and parallel_state.get_pipeline_model_parallel_rank() == 0 + ) + if ( + is_tp0_and_pp0 + and engine_output is not None + and engine_output["finished_requests"] + ): + payload = msgpack.packb( + [ + Headers.ENGINE_REPLY.value, + [r.serializable() for r in engine_output["finished_requests"]], + ], + use_bin_type=True, + ) + self.socket_for_receiving_requests.send(payload) except asyncio.CancelledError: pass diff --git a/megatron/core/inference/engines/static_engine.py b/megatron/core/inference/engines/static_engine.py index d4c61965d2b..dc86eb775f9 100644 --- a/megatron/core/inference/engines/static_engine.py +++ b/megatron/core/inference/engines/static_engine.py @@ -17,7 +17,7 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) -from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model +from megatron.core.utils import get_asyncio_loop try: from tqdm import tqdm @@ -93,10 +93,6 @@ def __init__( # Store original context in case we need to fall back to legacy static engine original_context = text_generation_controller.inference_wrapped_model.inference_context - mamba_inference_state_config = get_mamba_inference_state_config_from_model( - text_generation_controller.inference_wrapped_model.model - ) - try: if not legacy: dynamic_context = DynamicInferenceContext.from_config( @@ -105,17 +101,16 @@ def __init__( max_batch_size=max_batch_size, buffer_size_gb=buffer_size_gb, num_cuda_graphs=1, - mamba_inference_state_config=mamba_inference_state_config, ) self.controller.inference_wrapped_model.inference_context = dynamic_context self.controller.inference_wrapped_model.prep_model_for_inference() - self.controller._init_dynamic_sampling_tensors() self.dynamic_engine = DynamicInferenceEngine( controller=self.controller, random_seed=self.random_seed, context=dynamic_context, enable_cuda_graph=True, + static_sampling=True, ) except Exception as e: # Get exception details for better debugging diff --git a/megatron/core/inference/headers.py b/megatron/core/inference/headers.py index a22d1328679..ff894cc1918 100644 --- a/megatron/core/inference/headers.py +++ b/megatron/core/inference/headers.py @@ -1,6 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from enum import Enum, auto +from enum import Enum class Headers(Enum): @@ -8,21 +8,10 @@ class Headers(Enum): Enum representing headers used for communication with the inference-coordinator. """ - CONNECT = auto() - CONNECT_ACK = auto() - SUBMIT_REQUEST = auto() - ENGINE_REPLY = auto() - PAUSE = auto() - PAUSE_ACK = auto() - UNPAUSE = auto() - SUSPEND = auto() - RESUME = auto() - STOP = auto() - STOP_ACK = auto() - - -class UnknownHeaderError(Exception): - """A signal with an unrecognized header was received by the coordinator.""" - - def __init_(self, header): - super().__init__(f"specialize for {header}.") + CONNECT = 0 + ACK = 1 + SUBMIT_REQUEST = 2 + ENGINE_REPLY = 3 + PAUSE = 4 + UNPAUSE = 5 + STOP = 6 diff --git a/megatron/core/inference/inference_client.py b/megatron/core/inference/inference_client.py index 8a19e226c46..53daac091b0 100644 --- a/megatron/core/inference/inference_client.py +++ b/megatron/core/inference/inference_client.py @@ -4,9 +4,9 @@ import logging import os import time -from typing import Awaitable, List, Optional, Union +from typing import List, Union -from megatron.core.inference.inference_request import DynamicInferenceRequestRecord +from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams from megatron.core.utils import get_asyncio_loop, trace_async_exceptions @@ -73,11 +73,6 @@ def __init__(self, inference_coordinator_port: int): inference_coordinator_address = os.getenv('MASTER_ADDR', '127.0.0.1') socket.connect(f"tcp://{inference_coordinator_address}:{inference_coordinator_port}") - self._loop = None - self.running = asyncio.Event() - self.paused = asyncio.Event() - self.stopped = asyncio.Event() - self.socket = socket self.completion_futures = {} self.request_submission_times = {} @@ -97,55 +92,41 @@ def add_request( prompt (str): The input prompt to send to the language model. sampling_params: An object containing the sampling parameters for text generation (e.g., temperature, top_p). It must have a - `serialize()` method. + `serializable()` method. Returns: asyncio.Future: A future that will be resolved with a - `DynamicInferenceRequestRecord` object containing the completed result. + `DynamicInferenceRequest` object containing the completed result. """ - if not self.running.is_set(): - raise RuntimeError("InferenceClient is not currently running.") request_id = self.next_request_id self.next_request_id += 1 - payload = [Headers.SUBMIT_REQUEST.value, request_id, prompt, sampling_params.serialize()] + payload = [Headers.SUBMIT_REQUEST.value, request_id, prompt, sampling_params.serializable()] payload_serialized = msgpack.packb(payload, use_bin_type=True) self.socket.send(payload_serialized) assert request_id not in self.completion_futures - self.completion_futures[request_id] = self._loop.create_future() + self.completion_futures[request_id] = get_asyncio_loop().create_future() self.request_submission_times[request_id] = time.perf_counter() return self.completion_futures[request_id] @trace_async_exceptions - async def _recv_task(self): + async def _listen_for_completed_requests(self): """ Listens for completed inference requests from the coordinator. This coroutine runs in an infinite loop, continuously polling the socket - for data. - When a request reply is received, it unpacks the message, finds the + for replies. When a reply is received, it unpacks the message, finds the corresponding Future using the request ID, and sets the result. - Other control packets are handled appropriately. This method is started as a background task by the `start()` method. """ while True: try: - data = msgpack.unpackb(self.socket.recv(flags=zmq.NOBLOCK), raw=False) - header = Headers(data[0]) - if header == Headers.ENGINE_REPLY: - request_id, reply = data[1:] - reply['latency'] = time.perf_counter() - self.request_submission_times.pop( - request_id - ) - completion_future = self.completion_futures.pop(request_id) - if completion_future.done(): - logging.warning(f"Client: The future for {request_id} has been cancelled!") - continue - completion_future.set_result(DynamicInferenceRequestRecord.deserialize(reply)) - elif header == Headers.PAUSE_ACK: - self.paused.set() - elif header == Headers.STOP_ACK: - self.stopped.set() + request_id, reply = msgpack.unpackb(self.socket.recv(flags=zmq.NOBLOCK), raw=False) + reply['latency'] = time.perf_counter() - self.request_submission_times.pop( + request_id + ) + completion_future = self.completion_futures.pop(request_id) + completion_future.set_result(DynamicInferenceRequest.deserialize(reply)) except zmq.Again: await asyncio.sleep(0.005) continue @@ -156,15 +137,15 @@ def _connect_with_inference_coordinator(self): """ Performs the initial handshake with the inference coordinator. - Sends a CONNECT signal and waits for a CONNECT_ACK reply to ensure the + Sends a CONNECT signal and waits for an ACK reply to ensure the connection is established and acknowledged by the coordinator. """ payload = [Headers.CONNECT.value] self.socket.send(msgpack.packb(payload, use_bin_type=True)) reply = msgpack.unpackb(self.socket.recv(), raw=False)[0] - assert Headers(reply) == Headers.CONNECT_ACK + assert Headers(reply) == Headers.ACK - async def start(self, loop: Optional[asyncio.AbstractEventLoop] = None): + async def start(self): """ Connects to the coordinator and starts the background listener task. @@ -173,12 +154,8 @@ async def start(self, loop: Optional[asyncio.AbstractEventLoop] = None): coroutine. """ logging.info("Client: Connecting to InferenceCoordinator...") - self._loop = get_asyncio_loop(loop) - self.running.set() - self.paused.clear() - self.stopped.clear() self._connect_with_inference_coordinator() - self.listener_task = self._loop.create_task(self._recv_task()) + self.listener_task = asyncio.create_task(self._listen_for_completed_requests()) def _send_signal_to_engines(self, signal): """ @@ -191,52 +168,17 @@ def _send_signal_to_engines(self, signal): payload_serialized = msgpack.packb(payload, use_bin_type=True) self.socket.send(payload_serialized) - def pause_engines(self) -> Awaitable: - """Sends a signal to pause all inference engines. - - The signal first propagates thru the coordinator to all engines. - All engines acknowledge this signal and clear their `running` flags. - The coordinator awaits all acknowledgements before forwarding the ACK - back to the client, as well as to the engines. - The engines set their `paused` flags upon seeing the ACK. - - Returns: - Awaitable: An awaitable that resolves when all engines have paused. - """ - self._send_signal_to_engines(Headers.PAUSE) - return self.paused.wait() - - def unpause_engines(self) -> None: - """Sends a signal to unpause all inference engines.""" - self.paused.clear() - self.running.set() - self._send_signal_to_engines(Headers.UNPAUSE) - - def suspend_engines(self): + def pause_engines(self): """Sends a signal to pause all inference engines.""" self._send_signal_to_engines(Headers.PAUSE) - self._send_signal_to_engines(Headers.SUSPEND) - def resume_engines(self): + def unpause_engines(self): """Sends a signal to unpause all inference engines.""" - self._send_signal_to_engines(Headers.RESUME) self._send_signal_to_engines(Headers.UNPAUSE) - def stop_engines(self) -> Awaitable: - """Sends a signal to gracefully stop all inference engines. - - The signal first propagates thru the coordinator to all engines. - All engines acknowledge this signal and clear their `running` flags. - The coordinator awaits all acknowledgements before forwarding the ACK - back to the client, as well as to the engines. - The engines set their `stopped` flags upon seeing the ACK. - - Returns: - Awaitable: An awaitable that resolves when all engines have stopped. - """ + def stop_engines(self): + """Sends a signal to gracefully stop all inference engines.""" self._send_signal_to_engines(Headers.STOP) - self.running.clear() - return self.stopped.wait() def stop(self): """ diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py index b58fac1b281..21ff7786d6a 100644 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -11,18 +11,10 @@ import torch from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.tokenizers import MegatronTokenizer -def serialize_tensor(tensor: torch.Tensor) -> bytes: - """Serialize tensor to bytes. - - Args: - tensor (Tensor): Tensor. - - Returns: - (bytes) Byte representation of tensor. - """ +def serialize_tensor(tensor): + """Serialize tensor to bytes.""" buffer = io.BytesIO() torch.save(tensor, buffer) buffer.seek(0) @@ -30,15 +22,8 @@ def serialize_tensor(tensor: torch.Tensor) -> bytes: return tensor_bytes -def deserialize_tensor(tensor_bytes: bytes) -> torch.Tensor: - """Deserialize tensor from bytes. - - Args: - tensor_bytes (bytes): Byte representation of tensor. - - Returns: - (Tensor) Tensor. - """ +def deserialize_tensor(tensor_bytes): + """Deserialize tensor from bytes.""" buffer = io.BytesIO(tensor_bytes) tensor = torch.load(buffer) return tensor @@ -91,12 +76,11 @@ def __post_init__(self): ) self.sampling_params = self.inference_parameters - def serialize(self) -> dict: - """Converts the instance into a serializable dictionary. - + def serializable(self): + """ + Converts the instance into a serializable dictionary. Returns: - (dict) A dictionary representation of the instance suitable for - serialization. + dict: A dictionary representation of the instance suitable for serialization. """ # Dataclass to dict. @@ -185,12 +169,11 @@ def __str__(self): payload_str = "" if self.payload is None else f", {type(self.payload).__name__}" return f"[{self.timestamp:.3f}] {self.type.name}{payload_str}" - def serialize(self) -> dict: - """Converts the instance into a serializable dictionary. - + def serialize(self): + """ + Converts the instance into a serializable dictionary. Returns: - (dict) A dictionary representation of the instance suitable for - serialization. + dict: A dictionary representation of the instance suitable for serialization. """ # Dataclass to dict. @@ -270,14 +253,13 @@ def __str__(self): ) ) - def serialize(self): - """Converts the instance into a serializable dictionary. - + def serializable(self): + """ + Converts the instance into a serializable dictionary. Returns: - (dict) A dictionary representation of the instance suitable for - serialization. + dict: A dictionary representation of the instance suitable for serialization. """ - obj = super().serialize() + obj = super().serializable() obj["events"] = [e.serialize() for e in self.events] return obj @@ -295,39 +277,6 @@ def deserialize(cls, obj: dict) -> "DynamicInferenceRequest": request.events = [DynamicInferenceEvent.deserialize(e) for e in obj["events"]] return request - @property - def tracked_metadata(self) -> List[Any]: - """Obtain an ordered list of all request metadata to be tracked by the context. - - This consists of metadata that is used to inform text generation. - The values of such fields are tensorized and kept aligned with the current active batch. - - Note that while the general request object is mutable, this metadata is - inherently assumed to remain immutable once the request becomes active. - """ - sp = self.sampling_params - if sp.termination_id is None: - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: - warnings.warn( - f"DynamicInferenceRequest {self.request_id} has no termination_id set " - "in its sampling_params. Defaulting to -1." - ) - sp.termination_id = -1 - return [getattr(sp, field) for field in self.get_metadata_labels().keys()] - - @staticmethod - def get_metadata_labels() -> Dict[str, int]: - """Provides human-readable labels for the tracked metadata fields.""" - ret = [ - "temperature", - "top_k", - "top_p", - "termination_id", - "return_log_probs", - "skip_prompt_log_probs", - ] - return {k: v for v, k in enumerate(ret)} - def add_event(self, type: DynamicInferenceEventType, payload: Optional[Any] = None) -> None: """Add event.""" self.events.append(DynamicInferenceEvent(type=type, payload=payload)) @@ -365,158 +314,6 @@ def failed(self) -> bool: return self.status == Status.FAILED -@dataclass(kw_only=True) -class DynamicInferenceRequestRecord: - """History of DynamicInferenceRequest objects over multiple suspend and - resumes.""" - - requests: list[DynamicInferenceRequest] = field(default_factory=list) - latency: Optional[float] = None - - @classmethod - def from_request(cls, request: DynamicInferenceRequest) -> "DynamicInferenceRequestRecord": - """Initialize record from a single request. - - Args: - request (DynamicInferenceRequest): Initial request. - - Returns: - (DynamicInferenceRequestRecord) A record. - """ - record = cls() - record.requests.append(request) - return record - - def __getitem__(self, idx: int) -> DynamicInferenceRequest: - """Get request by index. - - Args: - idx (int): Request index. - - Returns: - (DynamicInferenceRequest) Request object. - """ - return self.requests[idx] - - @property - def request_id(self) -> int: - """Get request id. - - Returns: - (int) Request id. - """ - return self.requests[0].request_id - - def suspend(self, tokenizer: MegatronTokenizer): - """Suspend request by storing references to previous prompt, generations, - and sampling params. - - Args: - tokenizer (MegatronTokenizer): The tokenizer. - """ - - old_request = self[-1] - - # New prompt (concatenate prompt + generated tokens). - new_prompt_tokens = torch.cat( - ( - old_request.prompt_tokens, - torch.tensor( - old_request.generated_tokens, - dtype=old_request.prompt_tokens.dtype, - device=old_request.prompt_tokens.device, - ), - ), - dim=0, - ) - new_prompt_str = tokenizer.detokenize(new_prompt_tokens.tolist()) - - # New sampling params. - new_sampling_params = SamplingParams( - **{ - **asdict(old_request.sampling_params), - "num_tokens_to_generate": ( - old_request.sampling_params.num_tokens_to_generate - - len(old_request.generated_tokens) - ), - } - ) - - # New request. - new_request = DynamicInferenceRequest( - request_id=old_request.request_id, - prompt=new_prompt_str, - prompt_tokens=new_prompt_tokens, - sampling_params=new_sampling_params, - ) - self.requests.append(new_request) - - def merge(self, tokenizer: MegatronTokenizer) -> DynamicInferenceRequest: - """Merge requests into a single suspend-agnostic request object. - - Args: - tokenizer (MegatronTokenizer): The tokenizer. - - Returns: - (DynamicInferenceRequest) Merged request. - """ - - def merge_lists(key): - if getattr(self.requests[0], key) is None: - return None - else: - return [v for r in self.requests for v in getattr(r, key)] - - prompt_tokens = self.requests[0].prompt_tokens - generated_tokens = merge_lists("generated_tokens") - - # Merged request. - request = DynamicInferenceRequest( - request_id=self.requests[0].request_id, - prompt=tokenizer.detokenize(prompt_tokens.tolist()), - prompt_tokens=prompt_tokens, - prompt_log_probs=self.requests[0].prompt_log_probs, - prompt_top_n_logprobs=self.requests[0].prompt_top_n_logprobs, - generated_text=tokenizer.detokenize(generated_tokens), - generated_tokens=generated_tokens, - generated_length=len(generated_tokens), - generated_log_probs=merge_lists("generated_log_probs"), - generated_top_n_logprobs=merge_lists("generated_top_n_logprobs"), - sampling_params=self.requests[0].sampling_params, - tpot=merge_lists("tpot"), - status=self.requests[-1].status, - latency=self.latency, - events=merge_lists("events"), - ) - - return request - - def serialize(self) -> dict: - """Converts the instance into a serializable dictionary. - - Returns: - (dict) A dictionary representation of the instance suitable for - serialization. - """ - obj = asdict(self) - obj["requests"] = [r.serialize() for r in self.requests] - return obj - - @classmethod - def deserialize(cls, obj: dict) -> "DynamicInferenceRequestRecord": - """Deserialize record. - - Args: - obj (dict): Serialized record data. - - Returns: - (DynamicInferenceRequestRecord) Deserialized record. - """ - request = cls(**obj) - request.requests = [DynamicInferenceRequest.deserialize(r) for r in obj["requests"]] - return request - - @dataclass(kw_only=True) class VLMInferenceRequest(InferenceRequest): """Class for a VLM inference request""" diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py index d85b2816c80..e215b3f134b 100644 --- a/megatron/core/inference/sampling_params.py +++ b/megatron/core/inference/sampling_params.py @@ -44,7 +44,7 @@ def add_attributes(self, attribute_value_pair: dict): for key, value in attribute_value_pair.items(): setattr(self, key, value) - def serialize(self) -> dict: + def serializable(self) -> dict: """Return a dictionary that is msgpack-serializable.""" return self.__dict__.copy() diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 0aed3df079e..2bda1425710 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -23,11 +23,7 @@ MaxSequenceLengthOverflowError, WarmupEngineMode, ) -from megatron.core.inference.inference_request import ( - DynamicInferenceRequest, - InferenceRequest, - Status, -) +from megatron.core.inference.inference_request import InferenceRequest, Status from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( AbstractModelInferenceWrapper, ) @@ -78,35 +74,6 @@ def __init__( self.sampling_rng = torch.Generator(device=torch.cuda.current_device()) self.sampling_rng.manual_seed(model_config.inference_sampling_seed) - if self.inference_wrapped_model.inference_context.is_dynamic_batching(): - self._init_dynamic_sampling_tensors() - - def _init_dynamic_sampling_tensors(self): - """Initialize tensors needed for dynamic sampling.""" - context = self.inference_wrapped_model.inference_context - max_requests = context.max_total_requests - - device = torch.cuda.current_device() - logits_dtype = self.inference_wrapped_model.inference_wrapper_config.params_dtype - # Use padded vocab size because tokenizer vocab size might pad to nearest power of 2. - vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size - - # Initialize bookkeeping tensors. - self.sampling_logits_cuda = torch.empty( - max_requests, vocab_size, dtype=logits_dtype, device=device - ) - self.sampled_tokens_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) - - self.temperature_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float) - self.top_k_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.int32) - self.top_p_cuda = torch.empty_like(self.sampled_tokens_cuda, dtype=torch.float) - self.termination_id_cuda = torch.empty(max_requests, dtype=torch.int64, device=device) - self.return_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device) - self.skip_prompt_log_probs_cuda = torch.empty(max_requests, dtype=torch.bool, device=device) - - # Used for inefficient torch sampling. - self.torch_sampling_buckets: List[Tensor] = [] - def tokenize_prompt(self, prompt: str, add_BOS: bool = False) -> List[int]: """Utility to tokenize the input prompts. @@ -210,14 +177,16 @@ def detokenize_generations( return text, prompts_plus_generations_segments - def _torch_sampling_func( + def sample_from_logits( self, last_token_logits: torch.Tensor, - temperature: float, - top_k: int, - top_p: float, + sampling_params: Optional[SamplingParams] = None, vocab_size: Optional[int] = None, - ): + generation_started: Optional[torch.Tensor] = None, + top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None, + logits: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: """Samples the logits to generate outputs Given the logits of the last token, this function samples it @@ -227,15 +196,26 @@ def _torch_sampling_func( Args: last_token_logits (torch.Tensor): The last token logits. A tensor of - size [batch_size, vocab_size]. - temperature (float): The temperature to use for sampling. - top_k (int): The top-k value to use for sampling. - top_p (float): The top-p value to use for sampling. - vocab_size (int): Obtained from the tokenizer. Defaults to None. + size [batch_size, vocab_size] + sampling_params (SamplingParams): The parameters to use for inference. + vocab_size (int): Obtained from the tokenizer. Defaults to None + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + top_n_logprobs_dict (top_n_logprobs_dict): The dict to be updated Returns: sampled_logits (torch.Tensor): 1D tensor with [batch_size] elements + top_n_logprobs_this_step (torch.return_types.topk): a topk tensor with values as logits + and indices as the top k elements. None if sampling params top_n_logprobs is 0. """ + + if kwargs.get("common_inference_params"): + sampling_params = kwargs["common_inference_params"] + + top_p = sampling_params.top_p + top_k = sampling_params.top_k + temperature = sampling_params.temperature + assert isinstance(top_p, float) assert isinstance(top_k, int) assert not (top_k > 0 and top_p > 0.0), "Cannot have top-p and top-k both greater than zero" @@ -266,6 +246,53 @@ def modify_logits_for_top_p_filtering(logits, top_p): filter_ = filter_.scatter(1, sorted_indices, filter_) logits.masked_fill_(filter_, float("-Inf")) + if sampling_params.top_n_logprobs > 0: + # NOTE : This thing can also be clubbed with where we compute log probs + # when --return-log-probs is enabled. This is just more efficient + assert generation_started is not None + if logits is None: + batch_size = last_token_logits.shape[0] + last_token_log_probs = F.log_softmax(last_token_logits, dim=1).to(torch.float32) + top_n_logits_this_step = torch.topk( + last_token_log_probs, k=sampling_params.top_n_logprobs + ) + top_n_logprobs_this_step = top_n_logits_this_step.values.cpu() + top_n_logprobs_indices = top_n_logits_this_step.indices.cpu() + + # If we return prompt top_n_log_probs then we always append to the + # logprobs dict. Otherwise we only append for generated tokens. + if sampling_params.return_prompt_top_n_logprobs: + mask = torch.ones(batch_size, dtype=torch.bool) + else: + mask = generation_started.cpu() + + self._update_top_n_logprobs_dict( + top_n_logprobs_this_step, top_n_logprobs_indices, mask, top_n_logprobs_dict + ) + else: + assert sampling_params.return_prompt_top_n_logprobs + + # Compute the prompt logprobs + batch_size, seq_length, _ = logits.shape + log_probs = F.log_softmax(logits, dim=2).to(torch.float32) + top_n_logits_this_step = torch.topk(log_probs, k=sampling_params.top_n_logprobs) + + # Move the token dimension to the front and then add each token logprobs + # individually for every request in the batch + top_n_logprobs_this_step = top_n_logits_this_step.values.permute(1, 0, 2).cpu() + top_n_logprobs_indices = top_n_logits_this_step.indices.permute(1, 0, 2).cpu() + + # We append to the logprobs dict for every prompt token + mask = torch.ones(batch_size, dtype=torch.bool) + + for i in range(seq_length): + self._update_top_n_logprobs_dict( + top_n_logprobs_this_step[i], + top_n_logprobs_indices[i], + mask, + top_n_logprobs_dict, + ) + # Greedy sampling if top_k == 1: sampled_logits = torch.argmax(last_token_logits, dim=-1) @@ -295,10 +322,10 @@ def modify_logits_for_top_p_filtering(logits, top_p): return sampled_logits - def sample_from_logits( + def sample_from_dynamic_logits( self, last_token_logits: torch.Tensor, - sampling_params: Optional[SamplingParams] = None, + active_sampling_map: List[Tuple[SamplingParams, List[int]]], vocab_size: Optional[int] = None, generation_started: Optional[torch.Tensor] = None, top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None, @@ -308,14 +335,16 @@ def sample_from_logits( """Samples the logits to generate outputs Given the logits of the last token, this function samples it - according to the parameters defined in sampling_params + according to the parameters defined in active_sampling_map and returns the samples. If sampling parameters top_n_logprobs > 0 at each step it also updates the top_n_logprobs dict. Args: last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] - sampling_params (SamplingParams): The parameters to use for inference. + active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples + matching each unique set of sampling params to the context array indices + of the corresponding active requests. vocab_size (int): Obtained from the tokenizer. Defaults to None generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. @@ -323,65 +352,29 @@ def sample_from_logits( Returns: sampled_logits (torch.Tensor): 1D tensor with [batch_size] elements + termination_id (torch.Tensor): Tensor of shape [batch_size] with termination ids top_n_logprobs_this_step (torch.return_types.topk): a topk tensor with values as logits and indices as the top k elements. None if sampling params top_n_logprobs is 0. """ - - if kwargs.get("common_inference_params"): - sampling_params = kwargs["common_inference_params"] - - if sampling_params.top_n_logprobs > 0: - # NOTE : This thing can also be clubbed with where we compute log probs - # when --return-log-probs is enabled. This is just more efficient - assert generation_started is not None - if logits is None: - batch_size = last_token_logits.shape[0] - last_token_log_probs = F.log_softmax(last_token_logits, dim=1).to(torch.float32) - top_n_logits_this_step = torch.topk( - last_token_log_probs, k=sampling_params.top_n_logprobs + batch_size = last_token_logits.size(0) + new_sample = torch.zeros(batch_size, dtype=torch.int64, device=last_token_logits.device) + termination_id = torch.zeros_like(new_sample, dtype=torch.int64) + + for sampling_params, mask in active_sampling_map: + # Filter out indices that are out of bounds for the current batch + valid_mask = [i for i in mask if i < batch_size] + if valid_mask: + new_sample[valid_mask] = self.sample_from_logits( + last_token_logits[valid_mask], + sampling_params=sampling_params, + vocab_size=vocab_size, ) - top_n_logprobs_this_step = top_n_logits_this_step.values.cpu() - top_n_logprobs_indices = top_n_logits_this_step.indices.cpu() - - # If we return prompt top_n_log_probs then we always append to the - # logprobs dict. Otherwise we only append for generated tokens. - if sampling_params.return_prompt_top_n_logprobs: - mask = torch.ones(batch_size, dtype=torch.bool) + if sampling_params.termination_id is not None: + termination_id[valid_mask] = sampling_params.termination_id else: - mask = generation_started.cpu() - - self._update_top_n_logprobs_dict( - top_n_logprobs_this_step, top_n_logprobs_indices, mask, top_n_logprobs_dict - ) - else: - assert sampling_params.return_prompt_top_n_logprobs - - # Compute the prompt logprobs - batch_size, seq_length, _ = logits.shape - log_probs = F.log_softmax(logits, dim=2).to(torch.float32) - top_n_logits_this_step = torch.topk(log_probs, k=sampling_params.top_n_logprobs) - - # Move the token dimension to the front and then add each token logprobs - # individually for every request in the batch - top_n_logprobs_this_step = top_n_logits_this_step.values.permute(1, 0, 2).cpu() - top_n_logprobs_indices = top_n_logits_this_step.indices.permute(1, 0, 2).cpu() + termination_id[valid_mask] = self.tokenizer.eod - # We append to the logprobs dict for every prompt token - mask = torch.ones(batch_size, dtype=torch.bool) - - for i in range(seq_length): - self._update_top_n_logprobs_dict( - top_n_logprobs_this_step[i], - top_n_logprobs_indices[i], - mask, - top_n_logprobs_dict, - ) - - top_p = sampling_params.top_p - top_k = sampling_params.top_k - temperature = sampling_params.temperature - - return self._torch_sampling_func(last_token_logits, temperature, top_k, top_p, vocab_size) + return new_sample, termination_id def update_generation_status( self, @@ -542,12 +535,10 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) input_ids (Tensor): The input token IDs. position_ids (Tensor): The position IDs. """ - inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config - context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits - active_request_count = context.total_request_count - context.paused_request_count + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config with torch.inference_mode(): logits = self.inference_wrapped_model.run_one_forward_step( @@ -555,8 +546,9 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) ) if self.model_is_pipeline_parallel: + batch_size = context.total_request_count - context.paused_request_count logits_seq_len = ( - active_request_count if materialize_only_last_token_logits else input_ids.shape[1] + batch_size if materialize_only_last_token_logits else input_ids.shape[1] ) vocab_size = inference_wrapper_config.padded_vocab_size logits_shape = [1, logits_seq_len, vocab_size] @@ -564,6 +556,8 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) if is_pipeline_last_stage(self.pp_group): assert logits is not None and torch.Size(logits_shape) == logits.shape + # TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank + # and then broadcast the sampled tokens rather than broadcasting the raw logits. logits = broadcast_from_last_pipeline_stage( logits_shape, dtype=inference_wrapper_config.params_dtype, @@ -573,95 +567,31 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor) return logits def _dynamic_step_sample_bookkeeping( - self, - *, - backend: str = "torch", - request_metadata: Optional[Tensor] = None, - request_metadata_labels: Dict[str, int] = None, + self, active_sampling_map: List[Tuple[SamplingParams, List[int]]] ): - """Perform bookkeeping necessary to sample logits for dynamic batching. + """Perform bookkeeping necessary to sample logits for dynamic batching.""" + pass - The ability to override the context's data is solely intended for - standalone use or testing, and should never be used in a running system. + def _dynamic_step_sample_logits( + self, logits: Tensor, active_sampling_map: List[Tuple[SamplingParams, List[int]]] + ) -> Tensor: + """Sample logits for dynamic batching. Args: - backend (str): The sampling backend to use. - request_metadata (Optional[Tensor]): An override for the tensor that manages all - request metadata, such as sampling parameters. By default, this metadata is - retrieved from the context. - request_metadata_labels (Optional[Dict]): An override for the map of metadata labels - to their index in the request_metadata tensor. By default, this metadata is - retrieved from the request object. - """ - assert backend in ["torch"] - context = self.inference_wrapped_model.inference_context - - if request_metadata is None: - request_metadata = context.request_metadata[ - context.paused_request_count : context.total_request_count, : - ] - if request_metadata_labels is None: - request_metadata_labels = DynamicInferenceRequest.get_metadata_labels() - active_request_count = request_metadata.size(0) - - # Shorthand these, because the torch backend needs them. - temp = request_metadata[:, request_metadata_labels["temperature"]] - top_k = request_metadata[:, request_metadata_labels["top_k"]] - top_p = request_metadata[:, request_metadata_labels["top_p"]] - - # Copy data into relevant tensors. - self.temperature_cuda[:active_request_count].copy_(temp, non_blocking=True) - self.top_k_cuda[:active_request_count] = top_k.to( - dtype=torch.int32, copy=True, non_blocking=True - ) - self.top_p_cuda[:active_request_count].copy_(top_p, non_blocking=True) - self.termination_id_cuda[:active_request_count] = request_metadata[ - :, request_metadata_labels["termination_id"] - ].to(dtype=torch.int64, copy=True, non_blocking=True) - self.return_log_probs_cuda[:active_request_count] = request_metadata[ - :, request_metadata_labels["return_log_probs"] - ].to(dtype=torch.bool, copy=True, non_blocking=True) - self.skip_prompt_log_probs_cuda[:active_request_count] = request_metadata[ - :, request_metadata_labels["skip_prompt_log_probs"] - ].to(dtype=torch.bool, copy=True, non_blocking=True) - - if backend == "torch": - # Bucketize the core sampling parameters. - core_params = torch.stack((temp, top_k, top_p), dim=1) - _, inv_indices, cnts = torch.unique( - core_params, dim=0, return_inverse=True, return_counts=True - ) - order = torch.argsort(inv_indices, stable=True) - sampling_buckets = torch.split(order, cnts.tolist()) - # Perform the D2H sync needed by `_torch_sampling_func` here. - group_reps = torch.stack([indices[0] for indices in sampling_buckets], dim=0) - core_params_reps = core_params[group_reps].detach().cpu() - temp_reps = core_params_reps[:, 0].tolist() - top_k_reps = core_params_reps[:, 1].to(torch.int32).tolist() - top_p_reps = core_params_reps[:, 2].tolist() - # Store the buckets and their equivalence class representatives. - self.torch_sampling_buckets = ( - (sampling_buckets[idx], temp_reps[idx], top_k_reps[idx], top_p_reps[idx]) - for idx in range(len(sampling_buckets)) - ) - - def _dynamic_step_sample_logits(self, logits: Tensor, backend: str = "torch") -> Tensor: - """Sample tokens from logits for dynamic batching. - - Args: - logits (Tensor): The logits to sample from. - backend (str): The sampling backend to use. + logits (Tensor): The logits from the forward step. + active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples + matching each unique set of sampling params to the context array indices + of the corresponding active requests. Returns: - new_sample (Tensor): The sampled tokens. + new_sample (Tensor): The sampled tokens for each active request. + termination_id (int): The termination token IDs of each active request. """ - # TODO(ksanthanam): Evaluate whether it makes more sense to sample on 1 rank - # and then broadcast the sampled tokens rather than broadcasting the raw logits. - assert backend in ["torch"] - context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits + inference_wrapper_config = self.inference_wrapped_model.inference_wrapper_config + # Last token logits. if materialize_only_last_token_logits: # When materialize_only_last_token_logits is true, last_token_logits is @@ -669,72 +599,60 @@ def _dynamic_step_sample_logits(self, logits: Tensor, backend: str = "torch") -> last_token_logits = logits.squeeze(0) else: last_token_logits = context.last_token_logits(logits) - active_request_count = last_token_logits.size(0) - # Copy last_token_logits to contiguous buffer. - self.sampling_logits_cuda[:active_request_count].copy_(last_token_logits, non_blocking=True) - - if backend == "torch": - # Concatenate the outputs once to prevent repeated small writes. - token_list = [] - indices_list = [] - - for indices, temp, top_k, top_p in self.torch_sampling_buckets: - token_list.append( - self._torch_sampling_func( - self.sampling_logits_cuda[indices, :], temp, top_k, top_p - ) - ) - indices_list.append(indices) - # Single write to the output tensor. - sampled_tokens = torch.cat(token_list, dim=0) - sampled_indices = torch.cat(indices_list, dim=0) - self.sampled_tokens_cuda.index_copy_(0, sampled_indices, sampled_tokens) - return self.sampled_tokens_cuda[:active_request_count].clone() + # Sample. + # Use padded vocab size because tokenizer vocab size might not include padding + # to nearest power of 2. + vocab_size = inference_wrapper_config.padded_vocab_size + new_sample, termination_id = self.sample_from_dynamic_logits( + last_token_logits, active_sampling_map, vocab_size=vocab_size + ) + return new_sample, termination_id - def _dynamic_step_log_probs_bookkeeping(self) -> bool: + def _dynamic_step_log_probs_bookkeeping(self): """Perform bookkeeping necessary to compute log probs for dynamic batching.""" - context = self.inference_wrapped_model.inference_context - materialize_only_last_token_logits = context.materialize_only_last_token_logits + pass - active_request_count = context.total_request_count - context.paused_request_count - - to_check = self.return_log_probs_cuda[:active_request_count] - to_check &= ~self.skip_prompt_log_probs_cuda[:active_request_count] - - assert not ( - to_check.any() and materialize_only_last_token_logits - ), "Prompt log probs cannot be calculated if only last token logits are materialized." - - return self.return_log_probs_cuda[:active_request_count].any() - - def _dynamic_step_calculate_log_probs(self, logits: Tensor) -> Optional[Tensor]: - """Calculate log probs from logits.""" + def _dynamic_step_calculate_log_probs( + self, + logits: Tensor, + new_sample: Tensor, + active_sampling_map: List[Tuple[SamplingParams, List[int]]], + ) -> Optional[Tensor]: context = self.inference_wrapped_model.inference_context materialize_only_last_token_logits = context.materialize_only_last_token_logits - active_request_count = context.total_request_count - context.paused_request_count + log_probs = None + return_log_probs = False + for sampling_params, mask in active_sampling_map: + if sampling_params.return_log_probs: + assert ( + sampling_params.skip_prompt_log_probs + or materialize_only_last_token_logits is False + ), "Materialize only last token logits must be false for returning log probs" + return_log_probs = True - ret = context.calculate_log_probs( - logits, - self.sampled_tokens_cuda[:active_request_count], - only_last_token_logits=materialize_only_last_token_logits, - ) - return ret + if return_log_probs: + log_probs = context.calculate_log_probs( + logits, new_sample, only_last_token_logits=materialize_only_last_token_logits + ) - def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]: + return log_probs + + def _dynamic_step_context_bookkeeping( + self, new_sample: Tensor, termination_id: int + ) -> Tuple[Tensor, Tensor, Tensor]: """Update the dynamic inference context after sampling. + Args: + new_sample (Tensor): The newly sampled tokens for each active request. + termination_id (int): The token ID that indicates termination. + Return: - Dict [str, Tensor]: A dictionary containing: - active_request_ids (Tensor): Current active request IDs. - newly_paused_request_ids (Tensor): Newly paused request IDs. - finished_request_ids (Tensor): Finished request IDs. + Tuple[Tensor, Tensor, Tensor]: active / paused / finished request IDs. """ context = self.inference_wrapped_model.inference_context - active_request_count = context.total_request_count - context.paused_request_count - # Active sequence lengths. active_request_ids = context.request_ids[ context.paused_request_count : context.total_request_count @@ -745,10 +663,9 @@ def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]: # Request finished if termination_id or length >= max_sequence_length. # Note: termination_id tensor has per-request termination IDs from mixed sampling - active_request_mask = ( - self.sampled_tokens_cuda[:active_request_count] - != self.termination_id_cuda[:active_request_count] - ).byte() & torch.less(active_sequence_lengths, max_sequence_lengths).byte() + active_request_mask = (new_sample != termination_id).byte() & torch.less( + active_sequence_lengths, max_sequence_lengths + ).byte() finished_idxs = ( torch.nonzero(active_request_mask == 0, as_tuple=True)[0] + context.paused_request_count ) @@ -768,11 +685,16 @@ def _dynamic_step_context_bookkeeping(self, new_sample) -> Dict[str, Tensor]: @torch.inference_mode() async def async_generate_output_tokens_dynamic_batch( - self, skip_bookkeeping: Optional[bool] = False + self, + active_sampling_map: List[Tuple[SamplingParams, List[int]]], + skip_bookkeeping: Optional[bool] = False, ) -> Optional[Dict]: """Forward step the model and update the inference context. Args: + active_sampling_map (List[Tuple[SamplingParams, List[int]]]): A list of tuples + matching each unique set of sampling params to the context array indices + of the corresponding active requests. skip_bookkeeping (Optional[bool]): If true, skip the context bookkeeping step. Return: @@ -793,12 +715,13 @@ async def async_generate_output_tokens_dynamic_batch( if context.active_token_count == 0: return None + # This method only performs computations using CPU tensors. input_ids, position_ids = self._dynamic_step_context_init() - cuda_graph_request_count = ( context.padded_active_request_count if context.is_decode_only() else None ) + # This method only performs computations using GPU tensors. logits = self._dynamic_step_forward_logits(input_ids, position_ids) # This is the best place to yield control back to event loop. @@ -810,35 +733,41 @@ async def async_generate_output_tokens_dynamic_batch( # NOTE [TDE]: This will be moved once CPU and GPU methods are separated. await asyncio.sleep(0) - self._dynamic_step_sample_bookkeeping() - new_sample = self._dynamic_step_sample_logits(logits) + # This method will only perform computations using CPU tensors in the future. + self._dynamic_step_sample_bookkeeping(active_sampling_map) + # This method will only perform computations using GPU tensors in the future. + new_sample, termination_id = self._dynamic_step_sample_logits(logits, active_sampling_map) - return_log_probs = self._dynamic_step_log_probs_bookkeeping() - if return_log_probs: - log_probs = self._dynamic_step_calculate_log_probs(logits) - else: - log_probs = None + # This method will only perform computations using CPU tensors in the future. + self._dynamic_step_log_probs_bookkeeping() + # This method will only perform computations using GPU tensors in the future. + log_probs = self._dynamic_step_calculate_log_probs(logits, new_sample, active_sampling_map) + # This method only performs computations using CPU tensors. if skip_bookkeeping: - request_bookkeeping = {} + request_bookeeping = {} else: - request_bookkeeping = self._dynamic_step_context_bookkeeping(new_sample) + request_bookeeping = self._dynamic_step_context_bookkeeping(new_sample, termination_id) ret = { "sample": new_sample, "log_probs": log_probs, "cuda_graph_request_count": cuda_graph_request_count, } - ret.update(request_bookkeeping) + ret.update(request_bookeeping) return ret @torch.inference_mode() def generate_output_tokens_dynamic_batch( - self, loop: Optional[asyncio.AbstractEventLoop] = None + self, + active_sampling_map: List[Tuple[SamplingParams, List[int]]], + loop: Optional[asyncio.AbstractEventLoop] = None, ) -> Optional[Dict]: """Synchronous wrapper for `self.async_generate_output_tokens_dynamic_batch.""" loop = get_asyncio_loop(loop) - return loop.run_until_complete(self.async_generate_output_tokens_dynamic_batch()) + return loop.run_until_complete( + self.async_generate_output_tokens_dynamic_batch(active_sampling_map) + ) def _update_top_n_logprobs_dict( self, diff --git a/megatron/core/inference/unified_memory.py b/megatron/core/inference/unified_memory.py index e06e3022561..6e5e85ed668 100644 --- a/megatron/core/inference/unified_memory.py +++ b/megatron/core/inference/unified_memory.py @@ -56,9 +56,9 @@ def compile_allocator(): EXPORT void* managed_malloc(size_t size, int device, void* stream) { (void)stream; - int prev_device = -1; - cudaGetDevice(&prev_device); - if (device != prev_device && device >= 0) cudaSetDevice(device); + int cur = -1; + cudaGetDevice(&cur); + if (device != cur && device >= 0) cudaSetDevice(device); // cudaMallocManaged allows for more memory to be allocated than the device memory size. // The cudaMemAttachGlobal flag makes the memory accessible from both host and device. @@ -69,32 +69,13 @@ def compile_allocator(): if (device >= 0) { // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory. // This is a hint that tries to prevent data from being migrated away from the device. - - #if CUDART_VERSION >= 13000 - // For CUDA >= 13, the cudaMemAdvise device arg is type cudaMemLocation - // instead of an int, so we setup the location and conditionally use it - // in calls to cudaMemAdvise. - cudaMemLocation location; - location.type = cudaMemLocationTypeDevice; - location.id = device; - - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, location); - - // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. - // Even if the memory has to be migrated away from the device, it still does not page fault. - // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, - // but there is no harm in adding this flag as well for future-proofing. - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, location); - #else - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device); - // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. - // Even if the memory has to be migrated away from the device, it still does not page fault. - // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, - // but there is no harm in adding this flag as well for future-proofing. - cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device); - #endif + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device); + // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table. + // Even if the memory has to be migrated away from the device, it still does not page fault. + // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag, + // but there is no harm in adding this flag as well for future-proofing. + cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device); } - if (device != prev_device && prev_device >= 0) cudaSetDevice(prev_device); return ptr; } @@ -119,29 +100,13 @@ def compile_allocator(): functions=[], with_cuda=True, extra_ldflags=_extra_ldflags, - verbose=True, + verbose=False, ) _so_path = Path(_mod.__file__).as_posix() _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator() _compilation_state = CompilationState.SUCCESS - except (RuntimeError, ImportError, OSError) as e: - warnings.warn(f"Failed to create unified memory mempool: '{e}'.") - _compilation_state = CompilationState.FAILURE - - # Synchronize failure state across ranks. (For currently unknown reasons, - # one rank can show as FAILURE while the remaining ranks show as SUCCESS.) - import torch - - local_state = torch.tensor( - [_compilation_state.value], dtype=torch.uint8, device=torch.cuda.current_device() - ) - world_states = [ - torch.empty(1, dtype=torch.uint8, device=torch.cuda.current_device()) - for _ in range(torch.distributed.get_world_size()) - ] - torch.distributed.all_gather(world_states, local_state) - world_states = set(s.item() for s in world_states) - if CompilationState.FAILURE.value in world_states: + except (RuntimeError, ImportError, OSError): + warnings.warn("Failed to create unified memory mempool.") _compilation_state = CompilationState.FAILURE diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py index 55536a52088..d58f3c3a652 100644 --- a/megatron/core/inference/utils.py +++ b/megatron/core/inference/utils.py @@ -2,7 +2,6 @@ import asyncio import multiprocessing -import sys import torch @@ -162,57 +161,3 @@ async def await_process_event( raise RuntimeError( f"Process {process.name} (pid {process.pid}) has exited unexpectedly." ) - - -# Compatibility for Python < 3.13 asyncio Queue functionality. -# This is necessary because asyncio Queues are broken in Python < 3.13. -if sys.version_info < (3, 13): - - _SHUTDOWN_SENTINEL = object() - - class asyncio_QueueShutDown(Exception): - """Compatibility exception for Python < 3.13.""" - - pass - - class asyncio_Queue(asyncio.Queue): - """An asyncio.Queue with Python 3.13 compatibility features for Python < 3.13.""" - - def __init__(self, maxsize: int = 0): - super().__init__(maxsize) - self._is_shutdown = False - - async def get(self): - """Get an item from the queue with Python < 3.13 compatibility.""" - if self._is_shutdown and self.empty(): - raise asyncio_QueueShutDown - ret = await super().get() - if ret is _SHUTDOWN_SENTINEL: - super().put_nowait(_SHUTDOWN_SENTINEL) - super().task_done() - raise asyncio_QueueShutDown - return ret - - def put_nowait(self, item): - """Put an item into the queue without blocking""" - if self._is_shutdown: - raise asyncio_QueueShutDown - if item is _SHUTDOWN_SENTINEL: - raise ValueError(f"{item} is reserved for shutdown purposes for Python < 3.13") - super().put_nowait(item) - - def shutdown(self): - """Shutdown the queue for Python < 3.13. - - Note that the listening side of the queue can continue to get old data - off the queue even after it has already been shutdown. The listener only - shutdowns when the queue is BOTH shutdown AND empty. - """ - if not self._is_shutdown: - super().put_nowait(_SHUTDOWN_SENTINEL) - super().task_done() - self._is_shutdown = True - -else: - asyncio_QueueShutDown = asyncio.QueueShutDown - asyncio_Queue = asyncio.Queue diff --git a/megatron/core/models/backends.py b/megatron/core/models/backends.py index 29169285b3e..abda7c47787 100644 --- a/megatron/core/models/backends.py +++ b/megatron/core/models/backends.py @@ -22,19 +22,6 @@ LNImpl = WrappedTorchNorm HAVE_APEX = False -from megatron.core.extensions.transformer_engine import ( - TEActivationOp, - TEColumnParallelLinear, - TEDotProductAttention, - TELinear, - TENorm, -) -from megatron.core.tensor_parallel.inference_layers import ( - InferenceLayerNormColumnParallelLinear, - InferenceRowParallelLinear, -) -from megatron.core.utils import is_te_min_version - class BackendSpecProvider(Protocol): """A protocol for providing the submodules used in Spec building.""" @@ -132,51 +119,3 @@ def grouped_mlp_modules( def activation_func(self) -> type: """Which module to use for activation function""" return None - - -class InferenceSpecProvider(BackendSpecProvider): - """A protocol for providing the submodules used in Spec building.""" - - def linear(self) -> type: - """Which linear module TE backend uses""" - return TELinear - - def column_parallel_linear(self) -> type: - """Which column parallel linear module TE backend uses""" - return TEColumnParallelLinear - - def row_parallel_linear(self) -> type: - """Which row parallel linear module TE backend uses""" - return InferenceRowParallelLinear - - def fuse_layernorm_and_linear(self) -> bool: - """TE backend chooses a single module for layernorm and linear""" - return True - - def column_parallel_layer_norm_linear(self) -> Optional[type]: - """Which module for sequential layernorm and linear""" - return InferenceLayerNormColumnParallelLinear - - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: - """Which module to use for layer norm""" - if for_qk and not is_te_min_version("1.9.0"): - # TENorm significantly harms convergence when used - # for QKLayerNorm if TE Version < 1.9; - # we instead use the Apex implementation. - return FusedLayerNorm - return TENorm - - def core_attention(self) -> type: - """Which module to use for attention""" - return TEDotProductAttention - - def activation_func(self) -> type: - """Which module to use for activation function""" - return TEActivationOp - - def grouped_mlp_modules( - self, moe_use_grouped_gemm: bool, moe_use_legacy_grouped_gemm: bool - ) -> Tuple[type, Optional[MLPSubmodules]]: - raise NotImplementedError( - "MOE is not supported with inference optimized transformer implementation." - ) diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 7405150c4b3..c5c9caa3d67 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -4,11 +4,7 @@ from typing import Optional, Union from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.models.backends import ( - BackendSpecProvider, - InferenceSpecProvider, - LocalSpecProvider, -) +from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.models.gpt.linear_attention_module_specs import ( get_linear_attention_module_spec_for_backend, ) @@ -77,102 +73,6 @@ HAVE_APEX = False -def get_gpt_layer_with_inference_spec( - qk_layernorm: Optional[bool] = False, - multi_latent_attention: Optional[bool] = False, - qk_l2_norm: Optional[bool] = False, -) -> ModuleSpec: - """Use this spec to use inference optimized linear layers. - Args: - qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - multi_latent_attention (bool, optional): To use MLA. Defaults to False. - qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. - """ - assert HAVE_TE, "--transformer-impl inference_optimized requires transformer engine" - backend = InferenceSpecProvider() - - mlp = get_mlp_module_spec_for_backend( - backend=backend, - num_experts=None, - moe_grouped_gemm=False, - moe_use_legacy_grouped_gemm=False, - use_te_op_fuser=False, - use_te_activation_func=False, - ) - - if multi_latent_attention: - assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - linear_q_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm - else backend.column_parallel_linear() - ) - linear_kv_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm - else backend.column_parallel_linear() - ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=backend.layer_norm(), - self_attention=ModuleSpec( - module=MLASelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=MLASelfAttentionSubmodules( - linear_q_proj=backend.column_parallel_linear(), - linear_q_down_proj=backend.linear(), - linear_q_up_proj=linear_q_up_proj, - linear_kv_down_proj=backend.linear(), - linear_kv_up_proj=linear_kv_up_proj, - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=IdentityOp, - kv_layernorm=IdentityOp, - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - ), - ) - else: - qk_norm = backend.layer_norm(for_qk=True) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=backend.column_parallel_layer_norm_linear(), - core_attention=backend.core_attention(), - linear_proj=backend.row_parallel_linear(), - q_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - k_layernorm=( - L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) - ), - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", - "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", - "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", - "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", - "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", - "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", - }, - ), - ) - - def get_gpt_layer_with_transformer_engine_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py index 62ee4537cfc..1de0f14efcd 100755 --- a/megatron/core/models/gpt/moe_module_specs.py +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -2,13 +2,21 @@ from typing import Optional -from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.transformer.mlp import MLPSubmodules from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.spec_utils import ModuleSpec +try: + import transformer_engine as te # pylint: disable=unused-import + + from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider + + HAVE_TE = True +except ImportError: + HAVE_TE = False + def get_moe_module_spec( use_te: Optional[bool] = True, diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py index bfe38c2bbc8..8ef4a2ab3e4 100755 --- a/megatron/core/models/mamba/mamba_layer_specs.py +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -3,11 +3,9 @@ from megatron.core.extensions.transformer_engine import ( TEDotProductAttention, TELayerNormColumnParallelLinear, - TENorm, TERowParallelLinear, ) from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules @@ -18,13 +16,6 @@ from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules -moe = get_moe_module_spec( - use_te=True, - num_experts=8, # Can be any positive integer (must not be None). - moe_grouped_gemm=True, - moe_use_legacy_grouped_gemm=False, -) - mamba_stack_spec = ModuleSpec( module=MambaStack, submodules=MambaStackSubmodules( @@ -73,12 +64,5 @@ mlp_bda=get_bias_dropout_add, ), ), - moe_layer=ModuleSpec( - # TODO (rwaleffe): change this to be an "MoELayer" to work with CudaGraphs? - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - pre_mlp_layernorm=TENorm, mlp=moe, mlp_bda=get_bias_dropout_add - ), - ), ), ) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 061cb25f5b8..c254b2f6882 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -1,9 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -import copy import logging import warnings -from dataclasses import astuple -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple import torch from torch.optim import SGD as CPUSGD @@ -50,114 +48,100 @@ MegatronOptimizer, param_group_identifier_keys, ) -from .optimizer_config import AdamOptimizerConfig, OptimizerConfig, ParamKey, SGDOptimizerConfig +from .optimizer_config import OptimizerConfig logger = logging.getLogger(__name__) -def _matches(param: torch.nn.Parameter, param_name: str, param_key: ParamKey) -> bool: - """Returns true if passed-in parameter (with name) matches `param_key`. - - Args: - param (torch.nn.Parameter): Handle to parameter object. - param_name (str): Name of parameter in underlying PyTorch module. - param_key (ParamKey): ParamKey object. - - Returns: - bool: True if parameter matches passed-in param_key. - """ - - # Check if name matches. - if isinstance(param_key.name, str): - target_names = [param_key.name] - else: - target_names = list(param_key.name) - for target_name in target_names: - if param_name in target_name: - return True - - # Check if attribute matches. - if isinstance(param_key.attr, str): - target_attrs = [param_key.attr] - else: - target_attrs = list(param_key.attr) - for target_attr in target_attrs: - if getattr(param, target_attr, False): - return True - - return False - - def _get_param_groups( model_chunks: List[MegatronModule], - config: OptimizerConfig, - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], + no_weight_decay_cond: Optional[Callable], + scale_lr_cond: Optional[Callable], + lr_mult: float, + lr: float, + min_lr: float, + decoupled_lr: Optional[float], + decoupled_min_lr: Optional[float], + default_skip_embedding_weight_decay: bool = False, ) -> List[Dict]: """Create parameter groups for optimizer. - Creates parameter groups from provided optimizer config object. + Creates parameter groups based on weight decay condition (regularized vs + non regularized), learning rate scale condition (lr vs lr_mult * lr), + and whether it is expert parameters. scale_lr_cond is used during finetuning + where head of the network requires a scaled version of the base learning rate. Args: model_chunks (List[MegatronModule]): model chunks to create parameter groups for. - config (OptimizerConfig): optimizer configuration object. - config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, - specified on a per-layer basis. + no_weight_decay_cond (func, optional): function to determine whether a + parameter should not perform weight decay. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. + lr_mult (float): learning rate multiplier for parameters that + satisfy scale_lr_cond. + lr (float): learning rate. + min_lr (float): minimum learning rate. + decoupled_lr (Optional[float]): optional decoupled learning rate. + decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. + default_skip_embedding_weight_decay (bool): whether to skip weight decay for embedding + parameters by default, if no_weight_decay_cond is not provided. + Returns: List of parameter groups. """ - # Map (wd_mult, is_expert_parallel, param_group_hyperparameters_config) to params. - params_map = {} - configs_map = {} + use_decoupled_learning_rate = decoupled_lr is not None + # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params. + params_map = {} for model_chunk in model_chunks: for name, param in model_chunk.named_parameters(): if not param.requires_grad: continue - uses_default_config = False - # Get optimizer config for this parameter. - if config_overrides is None: - config_for_param = config - uses_default_config = True - else: - config_for_param = None - for param_key in config_overrides: - if _matches(param, name, param_key): - config_for_param = config_overrides[param_key] - break - # Fall back to default config. - if config_for_param is None: - config_for_param = config - uses_default_config = True - is_expert_parallel = not getattr(param, 'allreduce', True) - # TODO: Make sure there is a way to support old no_weight_decay_func functionality - # and default_skip_embedding_weight_decay: - # or (default_skip_embedding_weight_decay and "embedding" in name) - no_wd = name.endswith(".bias") or len(param.shape) == 1 - if not no_wd: - wd_mult = 1.0 + if no_weight_decay_cond is not None: + no_wd: bool = no_weight_decay_cond(name, param) else: - wd_mult = 0.0 - - # Create config_tuple that is hash-able. Remove timers object before - # creating config_tuple. - config_for_param_copy = copy.deepcopy(config_for_param) - config_for_param_copy.timers = None - config_tuple = astuple(config_for_param_copy) - key = (wd_mult, is_expert_parallel, config_tuple) + # Do not regularize biases and norm parameters. + # optionally, also skip weight decay for embedding parameters if requested + # (useful if you do not want embeddings to shrink to zero in training + # https://arxiv.org/abs/2312.16903) + no_wd = ( + name.endswith(".bias") + or len(param.shape) == 1 + or (default_skip_embedding_weight_decay and "embedding" in name) + ) + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_mult, _lr_mult = 1.0, 1.0 + elif not no_wd and scale_lr: + wd_mult, _lr_mult = 1.0, lr_mult + elif no_wd and not scale_lr: + wd_mult, _lr_mult = 0.0, 1.0 + else: + wd_mult, _lr_mult = 0.0, lr_mult + + is_decoupled_lr = False + # For input/embedding and output layer: embedding.word_embeddings.weight / + # output_layer.weight. + if use_decoupled_learning_rate and getattr( + param, 'is_embedding_or_output_parameter', False + ): + is_decoupled_lr = True + + key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr) if key not in params_map: params_map[key] = [] params_map[key].append(param) - if key in configs_map: - assert (config_for_param, uses_default_config) == configs_map[key] - else: - configs_map[key] = (config_for_param, uses_default_config) - # Distributed checkpoint requires all ranks to have the same param groups, # so we need to align the param groups across ranks, otherwise we may have # runtime error when loading the checkpoint or numerical error when resuming training. @@ -171,33 +155,67 @@ def _get_param_groups( param_groups = [] for key in params_key: - wd_mult, is_expert_parallel, _ = key + wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr = key params = params_map[key] if key in params_map else [] - config, uses_default_config = None, True - if key not in configs_map: - assert params == [] - else: - config, uses_default_config = configs_map[key] - assert config is not None - - # TODO: Remove "backwards compatible" fields below eventually. param_group = { 'params': params, - 'wd_mult': wd_mult, # For backwards compatibility. - 'lr_mult': 1.0, # For backwards compatibility. + 'wd_mult': wd_mult, + 'lr_mult': _lr_mult, 'is_expert_parallel': is_expert_parallel, - 'is_decoupled_lr': False, # For backwards compatibility. - 'default_config': uses_default_config, + 'is_decoupled_lr': is_decoupled_lr, } - - # Stick relevant fields into param_group from config object. - if config is not None: - param_group['max_lr'] = config.lr - param_group['min_lr'] = config.min_lr - # TODO: Add other relevant arguments (e.g., weight decay, optimizer) - # here as well. + # Ensure param_group has required keys for matching when loading optimizer state + # See MegatronOptimizer._filter_and_reorder_param_groups. + assert set(param_group.keys()) - set(param_group_identifier_keys) == {'params'} param_groups.append(param_group) + param_groups = _update_min_and_max_lr_in_param_groups( + param_groups, + lr=lr, + min_lr=min_lr, + decoupled_lr=decoupled_lr, + decoupled_min_lr=decoupled_min_lr, + ) + + return param_groups + + +def _update_min_and_max_lr_in_param_groups( + param_groups: List[Dict], + lr: float, + min_lr: float, + decoupled_lr: Optional[float], + decoupled_min_lr: Optional[float], +) -> List[Dict]: + """ + Updates `max_lr` and `min_lr` values in each parameter group, and returns new list. + By default, each group will use `lr` / `min_lr` as `max_lr` / `min_lr`. + If `decoupled_lr` is provided, then `decoupled_lr` / `decoupled_min_lr` will be used + as `max_lr` / `min_lr` for the input and output layer. + + Args: + param_groups (List): parameter groups whose 'max_lr' and `min_lr` fields need to + be adjusted. + lr (float): learning rate. + min_lr (float): minimum learning rate. + decoupled_lr (Optional[float]): optional decoupled learning rate. + decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. + + Returns: + List of adjusted parameter groups. + """ + + if decoupled_min_lr is None: + decoupled_min_lr = min_lr + + for param_group in param_groups: + if param_group['is_decoupled_lr']: + assert decoupled_lr is not None + param_group['max_lr'] = decoupled_lr + param_group['min_lr'] = decoupled_min_lr + else: + param_group['max_lr'] = lr + param_group['min_lr'] = min_lr return param_groups @@ -205,9 +223,12 @@ def _get_param_groups_and_buffers( model_chunks: List[MegatronModule], model_chunk_offset: int, config: OptimizerConfig, - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], + no_weight_decay_cond: Optional[Callable], + scale_lr_cond: Optional[Callable], + lr_mult: float, filter_fn: Callable, buffer_name: str, + default_skip_embedding_weight_decay: bool = False, ) -> Tuple[List[Dict], Dict[int, List[_ParamAndGradBuffer]]]: """Returns parameter groups and buffer for optimizer. @@ -216,17 +237,33 @@ def _get_param_groups_and_buffers( groups for. model_chunk_offset (int): offset of model_chunks in global model_chunks list. config (OptimizerConfig): optimizer configuration object. - config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, - specified on a per-layer basis. + no_weight_decay_cond (func, optional): function to determine whether a + parameter should not perform weight decay. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. + lr_mult (float): learning rate multiplier for parameters that + satisfy scale_lr_cond. lr (float): learning rate. min_lr (float): minimum learning rate. filter_fn (callable): filtering function for param_groups. buffer_name (str): name of buffer. + default_skip_embedding_weight_decay (bool): whether to skip weight decay for + embedding parameters by default, if no_weight_decay_cond is not provided. Returns: List of parameter groups and dictionary of model chunk IDs to buffers. """ - param_groups = _get_param_groups(model_chunks, config, config_overrides) + param_groups = _get_param_groups( + model_chunks, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, + lr=config.lr, + min_lr=config.min_lr, + decoupled_lr=config.decoupled_lr, + decoupled_min_lr=config.decoupled_min_lr, + default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, + ) param_groups = list(filter(filter_fn, param_groups)) buffers = {} for model_chunk_idx, model_chunk in enumerate(model_chunks): @@ -267,12 +304,9 @@ def _get_megatron_optimizer_based_on_param_groups( Returns: Instance of MegatronOptimizer. """ - # TODO: Logic needs to be updated to handle different optimizer types (i.e., param_groups - # passed into this function need to correspond to the same optimizer). - - # When freezing sub-models we may have no trainable parameters on a rank and + # when freezing sub-models we may have no trainable parameters on a rank and # hence an empty param_groups. However, we still need to create an optimizer - # for the purposes of grad stats reductions. + # for the purposes of grad stats reductions if param_groups: if config.optimizer_cpu_offload: if torch.__version__ < '2.3.0': @@ -442,8 +476,11 @@ def init_state_fn(opt, config=None): def get_megatron_optimizer( config: OptimizerConfig, model_chunks: List[MegatronModule], - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None, + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, use_gloo_process_groups: bool = True, + default_skip_embedding_weight_decay: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, dump_param_to_param_group_map: Optional[str] = None, ) -> MegatronOptimizer: @@ -454,11 +491,18 @@ def get_megatron_optimizer( Args: config (OptimizerConfig): optimizer configuration object. model_chunks (List[MegatronModule]): model chunks to get optimizer for. - config_overrides (Optional[Dict[ParamKey, OptimizerConfig]]): optional dictionary of - optimizer configuration objects to override default optimizer behavior for different - subsets of parameters (identified by ParamKey). + no_weight_decay_cond (func, optional): function to determine whether a parameter + should not perform weight decay. Defaults to None. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. Defaults to None. + lr_mult (float, optional): learning rate multiplier for parameters that + satisfy scale_lr_cond. Defaults to 1.0. use_gloo_process_groups (bool): if false, disable use of Gloo process groups in underlying Megatron optimizers. + default_skip_embedding_weight_decay (bool): whether to skip weight decay for + embedding parameters by default, if no_weight_decay_cond is not provided. + This is useful if you do not want embeddings to shrink to zero in training + as recommended in https://arxiv.org/abs/2312.16903 pg_collection: Optional unified process group for distributed training. dump_param_to_param_group_map (Optional[str]): path to dump parameter to param group map. @@ -468,20 +512,6 @@ def get_megatron_optimizer( log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') - # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and - # Adam for other layers). This would need some more refactoring to work though (param_groups - # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups). - fields_to_check_for_consistency = [ - 'overlap_param_gather_with_optimizer_step', - 'optimizer', - 'optimizer_cpu_offload', - ] - for field_name in fields_to_check_for_consistency: - field = getattr(config, field_name, None) - if config_overrides is not None: - all_configs = list(config_overrides.values()) - assert all([getattr(x, field_name, None) == field for x in all_configs]) - # Separate out first model chunk if overlapping param AG with optimizer step. if config.overlap_param_gather_with_optimizer_step: all_dense_model_chunks = [[model_chunks[0]], model_chunks[1:]] @@ -523,14 +553,17 @@ def get_megatron_optimizer( model_chunk, model_chunk_offset=model_chunk_offset, config=config, - config_overrides=config_overrides, + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, filter_fn=lambda g: True, buffer_name='buffers', + default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config=config, + config, model_chunks=model_chunk, param_groups=param_groups, per_model_buffers=buffers, @@ -559,9 +592,12 @@ def get_megatron_optimizer( dense_model_chunks, model_chunk_offset=model_chunk_offset, config=config, - config_overrides=config_overrides, + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, filter_fn=lambda g: not g['is_expert_parallel'], buffer_name='buffers', + default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) for model_chunk in dense_model_chunks: model_chunk.overlap_param_gather_with_optimizer_step = ( @@ -577,7 +613,7 @@ def get_megatron_optimizer( # Pass Gloo process groups into optimizer only if needed. optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config=config, + config, model_chunks=dense_model_chunks, param_groups=param_groups, per_model_buffers=buffers, @@ -595,9 +631,12 @@ def get_megatron_optimizer( model_chunks, model_chunk_offset=0, config=config, - config_overrides=config_overrides, + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, filter_fn=lambda g: g['is_expert_parallel'], buffer_name='expert_parallel_buffers', + default_skip_embedding_weight_decay=default_skip_embedding_weight_decay, ) if dump_param_to_param_group_map is not None: for param_group in moe_param_groups: @@ -614,7 +653,7 @@ def get_megatron_optimizer( expt_data_parallel_group_gloo = None optimizers.append( _get_megatron_optimizer_based_on_param_groups( - config=config, + config, model_chunks=model_chunks, param_groups=moe_param_groups, per_model_buffers=moe_buffers, diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py index 2b1f0502e46..ddf20b0abb8 100644 --- a/megatron/core/optimizer/muon.py +++ b/megatron/core/optimizer/muon.py @@ -3,7 +3,7 @@ """Megatron muon optimizer wrapper to handle tensor-parallel.""" import logging -from typing import Any, Callable, Dict, List, Literal, Optional +from typing import Any, Callable, List, Literal, Optional import torch from torch.optim.optimizer import ParamsT @@ -21,7 +21,7 @@ FP32Optimizer, MegatronOptimizer, ) -from .optimizer_config import OptimizerConfig, ParamKey +from .optimizer_config import OptimizerConfig try: from emerging_optimizers.orthogonalized_optimizers import ( @@ -166,7 +166,9 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> t def get_megatron_muon_optimizer( config: OptimizerConfig, model_chunks: List[MegatronModule], - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None, + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, use_gloo_process_groups: bool = True, layer_wise_distributed_optimizer: bool = False, pg_collection: Optional[ProcessGroupCollection] = None, @@ -177,15 +179,17 @@ def get_megatron_muon_optimizer( Args: config (OptimizerConfig): optimizer configuration object. model_chunks (List[MegatronModule]): model chunks to get optimizer for. + no_weight_decay_cond (func, optional): function to determine whether a parameter + should not perform weight decay. Defaults to None. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. Defaults to None. + lr_mult (float, optional): learning rate multiplier for parameters that + satisfy scale_lr_cond. Defaults to 1.0. use_gloo_process_groups (bool): if false, disable use of Gloo process groups in underlying Megatron optimizers. layer_wise_distributed_optimizer (bool): if true, use layer-wise distributed optimizer. Defaults to False. """ - # Muon currently use adam config. setting str here to call regular get for adam creation - # side effect is muon optimizer will have wrong name, i.e. config.optimizer == 'adam' - config.optimizer = 'adam' - assert HAVE_EMERGING_OPTIMIZERS, "Emerging Optimizers is not installed." # dist-optim is not supported due to strong coupling with how DDP init grad buffer @@ -242,7 +246,16 @@ def get_megatron_muon_optimizer( for param in nonlinear_params: param.requires_grad = False - linear_param_groups = _get_param_groups(model_chunks, config, config_overrides) + linear_param_groups = _get_param_groups( + model_chunks, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, + lr=config.lr, + min_lr=config.min_lr, + decoupled_lr=config.decoupled_lr, + decoupled_min_lr=config.decoupled_min_lr, + ) optimizer = TensorParallelMuon( linear_param_groups, @@ -261,6 +274,13 @@ def get_megatron_muon_optimizer( mode=config.muon_tp_mode, ) + # set config here to: + # 1. get adam for rest of layer + # 2. avoid ChainedOptimizer check fail that assert all optimizers are same kind + # side effect is muon optimizer will have wrong name str, i.e. config.optimizer == 'adam' + # TODO(deyuf): allow user to select optimizer mix and relax ChainedOptimizer design + config.optimizer = 'adam' + # Needed for torch_dist ckpt_format, unlike torch ckpt_format # For other emerging optimizers, need to implement init_state_fn as well # TODO(boxiangw): Improve usability after optimizer refactor @@ -311,10 +331,7 @@ def adam_init_state_fn(opt, config=None): # call original get. linear params will be skipped since they're freezed chained_adam = get_megatron_optimizer( - config, - model_chunks, - config_overrides=config_overrides, - use_gloo_process_groups=use_gloo_process_groups, + config, model_chunks, no_weight_decay_cond, scale_lr_cond, lr_mult, use_gloo_process_groups ) # unfreeze everything diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 54e7f67c629..1829cb424f1 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -3,7 +3,6 @@ """Megatron optimizer.""" import copy -import logging import math import warnings from abc import ABC, abstractmethod diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 6a4199a1f7a..8692d1e9b52 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -1,34 +1,23 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -from dataclasses import dataclass, field -from typing import Callable, Optional, Tuple, Union +from dataclasses import dataclass +from typing import Callable, Optional import torch from ..utils import is_te_min_version -@dataclass(frozen=True, slots=True) -class ParamKey: - """Key to group parameters by. All such grouped parameters can share an - optimizer config specification.""" - - # TODO: Can add layer_id here later. - - name: Union[str, Tuple[str]] = field(default_factory=tuple) - """Parameter name(s).""" - - attr: Union[str, Tuple[str]] = field(default_factory=tuple) - """Parameter attribute(s).""" - - @dataclass class OptimizerConfig: - """Base optimizer configuration object.""" + """Configuration for optimizer.""" ############## # General ############## + optimizer: str = 'adam' + """Optimizer to use (one of Adam, SGD, or Muon).""" + lr: Optional[float] = None """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each iteration would be different. @@ -37,6 +26,14 @@ class OptimizerConfig: min_lr: Optional[float] = None """Minumum value for learning rate. The scheduler clip values below this threshold.""" + decoupled_lr: Optional[float] = None + """Separate learning rate for the input and output layer.""" + + decoupled_min_lr: Optional[float] = None + """Minimum value for learning rate for the input and output layer. The scheduler clip values + below this threshold. + """ + weight_decay: float = 0.01 """Weight decay coefficient for L2 regularization.""" @@ -81,9 +78,6 @@ class OptimizerConfig: exp_avg_sq_dtype: torch.dtype = torch.float32 """dtype of exp_avg_sq when enabling precision-aware-optimizer""" - optimizer: str = 'adam' - """Optimizer name. NOTE: Deprecated, use individual optimizer classes instead.""" - ############### # Loss scaling ############### @@ -104,10 +98,10 @@ class OptimizerConfig: hysteresis: int = 2 """Hysteresis for dynamic loss scaling.""" - ################################################################################### - # Optimizer (NOTE: Deprecated, use individual optimizer classes instead.). - ################################################################################### - # Adam. + ############## + # Optimizer + ############## + # Adam adam_beta1: float = 0.9 """First coefficient for computing running averages of gradient and its square in Adam optimizer. @@ -265,7 +259,6 @@ def __post_init__(self): try: import inspect - # TODO: Move this below? from transformer_engine.pytorch.optimizers import FusedAdam as Adam adam_args = inspect.signature(Adam).parameters @@ -298,35 +291,3 @@ def __post_init__(self): assert ( self.exp_avg_sq_dtype == torch.float32 ), "exp_avg_sq_dtype can only be fp32 when not using precision-aware optimizer" - - -@dataclass -class AdamOptimizerConfig(OptimizerConfig): - """Adam optimizer configuration object.""" - - optimizer: str = 'adam' - """Optimizer name.""" - - adam_beta1: float = 0.9 - """First coefficient for computing running averages of gradient and its square in Adam - optimizer. - """ - - adam_beta2: float = 0.999 - """Second coefficient for computing running averages of gradient and its square in Adam - optimizer. - """ - - adam_eps: float = 1e-08 - """Term added to the denominator to improve numerical stability in Adam optimizer.""" - - -@dataclass -class SGDOptimizerConfig(OptimizerConfig): - """SGD optimizer configuration object.""" - - optimizer: str = 'sgd' - """Optimizer name.""" - - sgd_momentum: float = 0.9 - """Momentum factor for SGD optimizer.""" diff --git a/megatron/core/optimizer_param_scheduler.py b/megatron/core/optimizer_param_scheduler.py index 9f771c612e8..da7e0787676 100644 --- a/megatron/core/optimizer_param_scheduler.py +++ b/megatron/core/optimizer_param_scheduler.py @@ -95,30 +95,19 @@ def __init__( self.step(0) log_single_rank(logger, logging.INFO, f"> learning rate decay style: {self.lr_decay_style}") - def get_wd(self, param_group: Optional[dict] = None) -> float: - """Weight decay incr functions - - Args: - param_group (dict): parameter group from the optimizer.""" - - if param_group is not None: - start_wd = param_group.get('start_wd', self.start_wd) - end_wd = param_group.get('end_wd', self.end_wd) - else: - start_wd = self.start_wd - end_wd = self.end_wd - + def get_wd(self) -> float: + """Weight decay incr functions""" if self.num_steps > self.wd_incr_steps: - return end_wd + return self.end_wd if self.wd_incr_style == 'constant': - assert start_wd == end_wd - return end_wd + assert self.start_wd == self.end_wd + return self.end_wd incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) assert incr_ratio >= 0.0 assert incr_ratio <= 1.0 - delta_wd = end_wd - start_wd + delta_wd = self.end_wd - self.start_wd if self.wd_incr_style == 'linear': coeff = incr_ratio @@ -127,7 +116,7 @@ def get_wd(self, param_group: Optional[dict] = None) -> float: else: raise Exception(f'{self.wd_incr_style} weight decay increment style is not supported.') - return start_wd + coeff * delta_wd + return self.start_wd + coeff * delta_wd def get_lr(self, param_group: dict) -> float: """Learning rate decay functions from: @@ -202,9 +191,11 @@ def step(self, increment: int) -> None: increment (int): number of steps to increment """ self.num_steps += increment + new_wd = self.get_wd() for param_group in self.optimizer.param_groups: - param_group['lr'] = self.get_lr(param_group) - param_group['weight_decay'] = self.get_wd(param_group) * param_group.get('wd_mult', 1.0) + new_lr = self.get_lr(param_group) + param_group['lr'] = new_lr * param_group.get('lr_mult', 1.0) + param_group['weight_decay'] = new_wd * param_group.get('wd_mult', 1.0) def state_dict(self) -> dict: """Return the state dict.""" diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index 1916bfff079..1e41bf9d8c2 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -1122,7 +1122,6 @@ def initialize_model_parallel( for ranks in expert_decoder_rank_generator.get_ranks('ep'): group = create_group( ranks, - timeout=timeout, pg_options=get_nccl_options("ep", nccl_comm_cfgs), group_desc="EXPERT_MODEL_PARALLEL_GROUP", ) diff --git a/megatron/core/process_groups_config.py b/megatron/core/process_groups_config.py index ef8f31ea150..07c922ea685 100644 --- a/megatron/core/process_groups_config.py +++ b/megatron/core/process_groups_config.py @@ -140,23 +140,6 @@ def __init__(self, **kwargs): else: raise ValueError(f"Unknown attribute: {key}") - def __repr__(self): - """Return a concise representation showing which process groups exist and their sizes.""" - active_pgs = [] - for field_info in fields(self): - if hasattr(self, field_info.name): - pg = getattr(self, field_info.name) - if pg is not None: - active_pgs.append(f"{field_info.name}({pg.size()})") - else: - # Field exists but is None - active_pgs.append(f"{field_info.name}(None)") - return ( - f"ProcessGroupCollection({', '.join(active_pgs)})" - if active_pgs - else "ProcessGroupCollection(empty)" - ) - @classmethod def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None): """ diff --git a/megatron/core/safe_globals.py b/megatron/core/safe_globals.py index cc5eb8809e8..d2baed2a4a0 100755 --- a/megatron/core/safe_globals.py +++ b/megatron/core/safe_globals.py @@ -11,7 +11,6 @@ from numpy.dtypes import UInt32DType from megatron.core.enums import ModelType -from megatron.core.optimizer import OptimizerConfig from megatron.core.rerun_state_machine import RerunDiagnostic, RerunMode, RerunState from megatron.core.transformer.enums import AttnBackend @@ -25,7 +24,6 @@ Namespace, AttnBackend, ModelType, - OptimizerConfig, RerunDiagnostic, RerunMode, RerunState, diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index de27bb89d2e..1bcadd0af10 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -5,8 +5,10 @@ # This source code is licensed under the Apache license found in the # LICENSE file in the root directory of this source tree. +import math from contextlib import nullcontext from dataclasses import dataclass +from functools import partial from typing import Optional, Tuple, Union import torch @@ -21,6 +23,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers +from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule @@ -30,6 +33,50 @@ from megatron.core.utils import WrappedTensor, deprecate_inference_params, make_viewless_tensor +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + with get_cuda_rng_tracker().fork(): + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + for name, p in module.named_parameters(): + if name in ["conv1d.weight", "out_proj.weight"]: + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + if name in ["in_proj.weight"]: + nn.init.normal_(p, mean=0.0, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the + # > residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of + # > 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization + nn.init.normal_( + p, + mean=0.0, + std=initializer_range / math.sqrt(n_residuals_per_layer * n_layer), + ) + + @dataclass class MambaStackSubmodules: """ @@ -39,7 +86,6 @@ class MambaStackSubmodules: mamba_layer: Union[ModuleSpec, type] = IdentityOp attention_layer: Union[ModuleSpec, type] = IdentityOp mlp_layer: Union[ModuleSpec, type] = IdentityOp - moe_layer: Union[ModuleSpec, type] = IdentityOp class MambaStack(MegatronModule): @@ -125,7 +171,6 @@ def __init__( config=self.config, residual_in_fp32=residual_in_fp32, layer_number=i + 1 + pp_layer_offset, - pp_layer_offset=pp_layer_offset, pg_collection=pg_collection, ) elif layer_type == LayerSymbols.ATTENTION: @@ -144,11 +189,6 @@ def __init__( layer_number=i + 1, pg_collection=pg_collection, ) - elif layer_type == LayerSymbols.MOE: - # Transformer layers apply their own pp_layer_offset - layer = build_module( - submodules.moe_layer, config=self.config, layer_number=i + 1 - ) else: assert False, "unexpected layer_type" self.layers.append(layer) @@ -164,6 +204,15 @@ def __init__( eps=self.config.layernorm_epsilon, ) + if self.config.perform_initialization: + self.apply( + partial( + _init_weights, + n_layer=self.config.num_layers, + initializer_range=self.config.init_method_std, + ) + ) + def _select_layers_for_pipeline_parallel(self, layer_type_list): num_layers_per_pipeline_rank = self.config.num_layers // self.pp_group.size() diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py index fe997e2249a..7407bfe899f 100644 --- a/megatron/core/ssm/mamba_hybrid_layer_allocation.py +++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py @@ -28,8 +28,7 @@ class Symbols: MAMBA = "M" ATTENTION = "*" MLP = "-" - MOE = 'E' - VALID = {MAMBA, ATTENTION, MLP, MOE} + VALID = {MAMBA, ATTENTION, MLP} def _allocate_auto( @@ -173,9 +172,9 @@ def get_layer_maps_from_layer_type_list( ) -> Tuple[Dict[int, int], Dict[int, int], Dict[int, int]]: """ Returns maps from global layer index to the corresponding layer index - for each layer type in [Attention, Mamba, MLP, MoE] given a layer type list. + for each layer type in [Attention, Mamba, MLP] given a layer type list. """ - layer_types = [Symbols.ATTENTION, Symbols.MAMBA, Symbols.MLP, Symbols.MOE] + layer_types = [Symbols.ATTENTION, Symbols.MAMBA, Symbols.MLP] layer_maps = {layer_type: {} for layer_type in layer_types} for global_layer_idx, layer_type in enumerate(layer_type_list): layer_map = layer_maps[layer_type] diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py index 6514050ac63..69d5ef21c81 100644 --- a/megatron/core/ssm/mamba_layer.py +++ b/megatron/core/ssm/mamba_layer.py @@ -61,7 +61,6 @@ def __init__( layer_number: int = 1, residual_in_fp32=False, pg_collection: ProcessGroupCollection = None, - pp_layer_offset: int = 0, ): """Initialize Mamba Layer.""" super().__init__(config) @@ -78,7 +77,6 @@ def __init__( d_model=self.config.hidden_size, layer_number=layer_number, pg_collection=pg_collection, - pp_layer_offset=pp_layer_offset, ) self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) self.mamba_bda = build_module(submodules.mamba_bda) diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py index 91dc266e590..b792f8a2f1f 100644 --- a/megatron/core/ssm/mamba_mixer.py +++ b/megatron/core/ssm/mamba_mixer.py @@ -162,7 +162,6 @@ def __init__( headdim=None, ngroups=None, pg_collection: ProcessGroupCollection = None, - pp_layer_offset: int = 0, ): if not HAVE_MAMBA_SSM: raise ImportError( @@ -184,7 +183,6 @@ def __init__( self.norm_before_gate = norm_before_gate self.chunk_size = chunk_size self.layer_number = layer_number - self.pp_layer_offset = pp_layer_offset self.cached_batch_size = None assert pg_collection is not None, "pg_collection must be provided for MambaMixer" self.pg_collection = pg_collection @@ -299,12 +297,9 @@ def __init__( setattr(self.conv1d.weight, "tensor_model_parallel", True) setattr(self.conv1d.bias, "tensor_model_parallel", True) - if self.config.perform_initialization: + if self.config.perform_initialization and self.conv_init is not None: with get_cuda_rng_tracker().fork(): - if self.conv_init is not None: - nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) - else: - nn.init.kaiming_uniform_(self.conv1d.weight, a=math.sqrt(5)) + nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) self.activation = "silu" self.act = nn.SiLU() @@ -329,6 +324,13 @@ def __init__( ) self.dt_bias = nn.Parameter(inv_dt) + # Our initialization would set all Linear.bias to zero, + # need to mark this one as _no_reinit + self.dt_bias._no_reinit = True + # Just to be explicit. Without this we already don't + # put wd on dt_bias because of the check + # name.endswith("bias") in param_grouping.py + self.dt_bias._no_weight_decay = True setattr(self.dt_bias, "tensor_model_parallel", True) # A parameter @@ -340,6 +342,7 @@ def __init__( A = A.uniform_(*A_init_range) A_log = torch.log(A) # Keep A_log in fp32 self.A_log = nn.Parameter(A_log) + self.A_log._no_weight_decay = True setattr(self.A_log, "tensor_model_parallel", True) # D "skip" parameter @@ -349,6 +352,7 @@ def __init__( device=torch.cuda.current_device(), ) ) # Keep in fp32 + self.D._no_weight_decay = True setattr(self.D, "tensor_model_parallel", True) if self.rmsnorm: @@ -361,7 +365,6 @@ def __init__( device=torch.cuda.current_device(), dtype=config.params_dtype, ) - setattr(self.norm.weight, "tensor_model_parallel", True) # Assume sequence parallelism: input is partitioned along d_inner and # output is partitioned along the sequence dimension @@ -455,7 +458,7 @@ def dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferen ) assert sequence_packing_available, reason_for_no_sequence_packing - conv_state, ssm_state = context.mamba_states_cache(self.layer_number - self.pp_layer_offset) + conv_state, ssm_state = context.mamba_states_cache(self.layer_number) # Fast path: decode-only if context.is_decode_only(): @@ -501,10 +504,7 @@ def dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferen zxBCdt_chunked_prefill = zxBCdt[ active_token_count - chunked_prefill_request_token_count : active_token_count ] - - batch_index_chunked_prefill = batch_indices[ - context.get_index_of_chunked_prefill_request() - ] + batch_index_chunked_prefill = batch_indices[context.chunked_prefill_request_id] y_prefill_chunked = self.ssm_prefill( zxBCdt_chunked_prefill, @@ -941,12 +941,6 @@ def ssm_decode( x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim) if not self.rmsnorm: z = rearrange(z, "b (h p) -> b h p", p=self.headdim) - - # Upcast the batch_indices to prevent integer overflow errors in the case of - # large max request counts. - if batch_indices is not None: - batch_indices = batch_indices.to(torch.int64) - y = selective_state_update( ssm_state, x_reshaped, diff --git a/megatron/core/tensor_parallel/inference_layers.py b/megatron/core/tensor_parallel/inference_layers.py deleted file mode 100644 index 05f7b88d095..00000000000 --- a/megatron/core/tensor_parallel/inference_layers.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - - -from typing import Callable, Optional - -import torch -import torch.distributed as dist - -from megatron.core.extensions.transformer_engine import ( - TELayerNormColumnParallelLinear, - TERowParallelLinear, -) -from megatron.core.model_parallel_config import ModelParallelConfig -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.core.utils import get_tensor_model_parallel_group_if_none - -try: - import transformer_engine.pytorch.cpp_extensions as tex - from transformer_engine.pytorch.constants import TE_DType - from transformer_engine.pytorch.distributed import ( - gather_along_first_dim, - reduce_scatter_along_first_dim, - ) - - HAVE_TE = True -except ImportError: - HAVE_TE = False - - -def _te_rms_norm_kernel(x: torch.Tensor, weight: torch.Tensor, eps: float): - x_shape = x.shape - x = x.view(-1, x.size(-1)) - out, _, _ = tex.rmsnorm_fwd( - x, weight, eps, None, None, TE_DType[x.dtype], 16, False # sm-margin # zero centered gamma - ) - out = out.view(*x_shape[:-1], -1) - return out.to(x.dtype) - - -class InferenceLayerNormColumnParallelLinear(TELayerNormColumnParallelLinear): - """ - Inference optimized version of TELayerNormColumnParallelLinear. - """ - - def __init__( - self, - input_size: int, - output_size: int, - *, - config: TransformerConfig, - init_method: Callable, - gather_output: bool, - bias: bool, - skip_bias_add: bool, - is_expert: bool, - skip_weight_param_allocation: bool = False, - tp_comm_buffer_name: Optional[str] = None, - tp_group: Optional[torch.distributed.ProcessGroup] = None, - ): - assert HAVE_TE, "--transformer-impl=inference_optimized requires transformer engine" - super().__init__( - input_size, - output_size, - config=config, - init_method=init_method, - gather_output=gather_output, - bias=bias, - skip_bias_add=skip_bias_add, - is_expert=is_expert, - skip_weight_param_allocation=skip_weight_param_allocation, - tp_comm_buffer_name=tp_comm_buffer_name, - tp_group=tp_group, - ) - self.tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) - self.tp_size = dist.get_world_size(self.tp_group) - - assert ( - output_size % self.tp_size == 0 - ), f"output_size ({output_size}) must be divisible by tp_size ({self.tp_size})" - - self.eps = config.layernorm_epsilon - - if self.tp_size > 1: - assert ( - config.sequence_parallel - ), "--transformer-impl=inference_optimized requires --sequence-parallel" - - @torch.no_grad() - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Forward pass. - """ - x = _te_rms_norm_kernel(x=x, weight=self.layer_norm_weight, eps=self.eps) - if self.tp_size > 1: - x, _ = gather_along_first_dim(x, process_group=self.tp_group) - x = torch.matmul(x, self.weight.t()) - return x, None - - -class InferenceRowParallelLinear(TERowParallelLinear): - """ - Inference optimized version of TERowParallelLinear. - """ - - def __init__( - self, - input_size: int, - output_size: int, - *, - config: ModelParallelConfig, - init_method: Callable, - bias: bool, - input_is_parallel: bool, - skip_bias_add: bool, - is_expert: bool, - tp_comm_buffer_name: Optional[str] = None, - tp_group: Optional[torch.distributed.ProcessGroup] = None, - ): - assert HAVE_TE, "--transformer-impl=inference_optimized requires transformer engine" - super().__init__( - input_size, - output_size, - config=config, - init_method=init_method, - bias=bias, - input_is_parallel=input_is_parallel, - skip_bias_add=skip_bias_add, - is_expert=is_expert, - tp_comm_buffer_name=tp_comm_buffer_name, - tp_group=tp_group, - ) - self.tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) - self.tp_size = dist.get_world_size(self.tp_group) - assert ( - input_size % self.tp_size == 0 - ), f"input_size ({input_size}) must be divisible by tp_size ({self.tp_size})" - - if self.tp_size > 1: - assert ( - config.sequence_parallel - ), "--transformer-impl=inference_optimized requires --sequence-parallel" - - @torch.no_grad() - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Forward pass. - """ - x = torch.matmul(x, self.weight.t()) - if self.tp_size > 1: - x, _ = reduce_scatter_along_first_dim(x, tp_group=self.tp_group) - return x, None diff --git a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py index 458689fa1f4..c68b0ef89b1 100644 --- a/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +++ b/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py @@ -69,6 +69,7 @@ def __init__( pretrained_model_name_or_path=tokenizer_path, use_fast=use_fast, trust_remote_code=trust_remote_code, + chat_template=chat_template, ) elif merges_file is None: self.tokenizer = AutoTokenizer.from_pretrained( @@ -76,6 +77,7 @@ def __init__( vocab_file=vocab_file, use_fast=use_fast, trust_remote_code=trust_remote_code, + chat_template=chat_template, ) else: self.tokenizer = AutoTokenizer.from_pretrained( @@ -84,6 +86,7 @@ def __init__( merge_files=merges_file, use_fast=use_fast, trust_remote_code=trust_remote_code, + chat_template=chat_template, ) except Exception as e: raise ValueError( @@ -91,14 +94,6 @@ def __init__( f'for {tokenizer_path}. Exception: {e}' ) - # Store the tokenizer's existing chat template if the user does not provide - # a custom chat template. Otherwise, override the default chat template with - # the user-provided template. - if chat_template is None: - chat_template = self.tokenizer.chat_template - else: - self.tokenizer.chat_template = chat_template - self.include_special_tokens = include_special_tokens self.original_vocab_size = len(self.tokenizer) self.chat_template = chat_template diff --git a/megatron/core/tokenizers/text/libraries/null_tokenizer.py b/megatron/core/tokenizers/text/libraries/null_tokenizer.py index 4ddf77fc774..13d56436192 100644 --- a/megatron/core/tokenizers/text/libraries/null_tokenizer.py +++ b/megatron/core/tokenizers/text/libraries/null_tokenizer.py @@ -25,14 +25,6 @@ def ids_to_text(self, ids): text = [str(x) for x in ids] return ' '.join(text) - def tokens_to_ids(self, tokens): - """Converts tokens to ids.""" - return [int(x) for x in tokens] - - def ids_to_tokens(self, ids): - """Converts ids to tokens.""" - return [str(x) for x in ids] - def offsets(self, ids: list[int], text: str) -> list[int]: """Returns offsets.""" offsets, start_idx = [], 0 diff --git a/megatron/core/tokenizers/text/text_tokenizer.py b/megatron/core/tokenizers/text/text_tokenizer.py index 4e0c624e006..2107cf9dce4 100644 --- a/megatron/core/tokenizers/text/text_tokenizer.py +++ b/megatron/core/tokenizers/text/text_tokenizer.py @@ -37,17 +37,13 @@ def __init__(self, path: str, config: dict, **kwargs) -> None: self._tokenizer = self._restore_model(**kwargs) self.additional_args = kwargs self.path = path - - config_template = config.get("chat_template", None) - tokenizer_template = getattr(self._tokenizer, "chat_template", None) - kwargs_template = kwargs.get("chat_template", None) - - if config_template is not None: - self.chat_template = config_template - elif tokenizer_template is not None: - self.chat_template = tokenizer_template + if ( + config.get("chat_template", None) is None + and kwargs.get("chat_template", None) is not None + ): + self.chat_template = kwargs.get("chat_template", None) else: - self.chat_template = kwargs_template + self.chat_template = config.get("chat_template", None) def _restore_model(self, **kwargs) -> MegatronTokenizerTextAbstract: """Returns tokenizer library object.""" diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 7bb9a12c697..74031f38219 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -35,7 +35,6 @@ from megatron.core.utils import ( deprecate_inference_params, divide, - get_pg_rank, get_pg_size, is_fa_min_version, is_te_min_version, @@ -159,7 +158,6 @@ def __init__( self.config = config self.layer_number = layer_number - self.attn_mask_type = attn_mask_type self.attention_type = attention_type @@ -308,19 +306,6 @@ def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype device=torch.cuda.current_device(), ) - def _get_pp_layer_offset_for_inference(self): - """Return the pipeline parallel layer offset for inference.""" - assert ( - self.config.virtual_pipeline_model_parallel_size is None - ), "Virtual pipeline parallelism is not supported for inference" - - # Import here to avoid circular imports - from megatron.core.transformer.transformer_layer import get_transformer_layer_offset - - return get_transformer_layer_offset( - self.config, vp_stage=None, pp_rank=get_pg_rank(self.pg_collection.pp) - ) - def _adjust_key_value_for_inference( self, inference_context: BaseInferenceContext, @@ -386,15 +371,9 @@ def _adjust_key_value_for_inference( inference_context.key_value_memory_dict[self.layer_number] ) - if ( - not inference_context.is_static_batching() or inference_context.sequence_len_offset > 0 - ) and (not self.training or not is_te_min_version("2.2.0")): + if not inference_context.is_static_batching() or inference_context.sequence_len_offset > 0: # This should mean that we are past the prompt forward_step # and so we need to turn off masking - # Note: in ModelOpt, we may use inference_context for speculative decoding - # in training. In that case, we do not want to turn off masking as we need - # customized attention mask for speculative decoding. - attn_mask_type = AttnMaskType.no_mask if inference_context.is_static_batching(): @@ -465,8 +444,6 @@ def _adjust_key_value_for_inference( key = inference_key_memory[:sequence_end, batch_start:batch_end, ...] value = inference_value_memory[:sequence_end, batch_start:batch_end, ...] else: - pp_layer_offset = self._get_pp_layer_offset_for_inference() - # Apply rotary embeddings before appending KV cache. if inference_context.use_flashinfer_fused_rope and (rotary_pos_cos_sin is not None): query, key = inference_context.apply_fused_qk_rotary_emb( @@ -481,23 +458,17 @@ def _adjust_key_value_for_inference( rotary_pos_emb = (q_pos_emb, None) # key rotary emb has been applied # Append key/value data tensors to cache. - inference_context.append_key_value_cache( - self.layer_number - pp_layer_offset, key, value - ) + inference_context.append_key_value_cache(self.layer_number, key, value) _, max_seqlen_q = inference_context.cu_query_lengths() if getattr(self.config, "cache_mla_latents", None) and max_seqlen_q > 1: # Doing unabsorbed MLA Attention with cached mla latents (prefill/mixed mode) - kv_cache, _, block_table = inference_context.key_value_cache( - self.layer_number - pp_layer_offset - ) + kv_cache, _, block_table = inference_context.key_value_cache(self.layer_number) # Uncompress the KV cache for prefill/mixed mode key, value = self.uncompress_kv_from_cache(kv_cache) else: # Read key/value *pointer* tensors from cache. - key, value, block_table = inference_context.key_value_cache( - self.layer_number - pp_layer_offset - ) + key, value, block_table = inference_context.key_value_cache(self.layer_number) return query, key, value, rotary_pos_emb, attn_mask_type, block_table @abstractmethod diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 10a739e11c0..12f15ee980a 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -368,26 +368,9 @@ def create_cudagraphs(): def delete_cuda_graphs(): """Delete all CUDA graphs.""" - # Reset runners. - for record in [ - *_CudagraphGlobalRecord.cudagraph_record, - *_CudagraphGlobalRecord.cudagraph_inference_record, - ]: - runner = record[0] - assert isinstance(runner, _CudaGraphRunner) - - runner.cudagraph_created = False - runner.fwd_graph_recorded = False - runner.bwd_graph_recorded = False - runner.fwd_graph = None - runner.bwd_graph = None - runner.fwd_mempool = None - runner.bwd_mempool = None - # Reset global tracking state _CudagraphGlobalRecord.cudagraph_created = False _CudagraphGlobalRecord.cudagraph_record = [] - _CudagraphGlobalRecord.cudagraph_inference_record = [] # TODO: Optional?: Force garbage collection to clean up memory gc.collect() diff --git a/megatron/core/transformer/fsdp_dtensor_checkpoint.py b/megatron/core/transformer/fsdp_dtensor_checkpoint.py index 04ec982e6ff..65e2f5f9dff 100644 --- a/megatron/core/transformer/fsdp_dtensor_checkpoint.py +++ b/megatron/core/transformer/fsdp_dtensor_checkpoint.py @@ -484,6 +484,6 @@ def get_global_unique_param_name(model_chunks, param): # Get EP unique parameter name num_experts = model_chunks[0].config.num_moe_experts if model_chunks else None - param_name = next(iter(handle_experts_in_state_dict({param_name: None}, num_experts).keys())) + param_name = list(handle_experts_in_state_dict({param_name: None}, num_experts).keys())[0] return param_name diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 8754e938348..b2135fdb00d 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -48,8 +48,6 @@ num_global_tokens: num_local_tokens*TP*EP """ -logger = logging.getLogger(__name__) - class MoETokenDispatcher: """ @@ -1272,6 +1270,7 @@ def _pad_routing_map( # Check if there are enough tokens to pad enough_tokens_to_pad = torch.all(target_tokens_per_expert <= num_input_tokens) if not enough_tokens_to_pad: + logger = logging.getLogger(__name__) logger.warning( "Not enough tokens to pad. The total number of tokens received in this rank " "is smaller than the target number of tokens for each expert. " diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 3f8c97099da..fae2e2f5d4d 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -749,9 +749,6 @@ class TransformerConfig(ModelParallelConfig): symmetric_ar_type: Optional[str] = None """Type of symmetric all reduce to use""" - use_inference_optimized_layers: bool = False - """If True, use inference optimized transformer layers during inference.""" - mrope_section: Optional[List[int]] = None """ Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. """ @@ -1877,13 +1874,6 @@ def __post_init__(self): f"for context parallelism, but got {self.cp_comm_type=} instead." ) - if self.transformer_impl == "inference_optimized": - assert self.normalization == "RMSNorm" - assert not self.layernorm_zero_centered_gamma - assert not self.add_bias_linear - assert not self.add_qkv_bias - assert not self.use_kitchen - @dataclass class MLATransformerConfig(TransformerConfig): diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 77a004a6845..9b62b18d400 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -24,7 +24,7 @@ from functools import lru_cache, reduce, wraps from importlib.metadata import version from types import TracebackType -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Type, Union import numpy import torch @@ -2140,28 +2140,23 @@ def maybe_cat(a, b, dim=0, *, required=False): return xs[0] if len(xs) == 1 else torch.cat(xs, dim=dim) -_ASYNC_IO_LOOP: asyncio.AbstractEventLoop | None = None - - def get_asyncio_loop(loop: asyncio.AbstractEventLoop | None = None) -> asyncio.AbstractEventLoop: """Creates an asyncio loop if necessary and then returns the current asyncio loop.""" - global _ASYNC_IO_LOOP if loop is None: try: loop = asyncio.get_running_loop() except RuntimeError as e: - if _ASYNC_IO_LOOP is not None: - return _ASYNC_IO_LOOP - else: - _ASYNC_IO_LOOP = loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) return loop _ASYNC_TASK_STATS = defaultdict(lambda: [0, 0.0]) # cnt, total_time -def trace_async_exceptions(func: Optional[Callable] = None, *, verbose: bool = False): +def trace_async_exceptions( + func: Optional[Callable[..., Coroutine]], *, verbose: bool = False +) -> Callable[..., Coroutine]: """Decorator to be applied to every coroutine that runs in a separate task. This is needed because asyncio tasks do not propagate exceptions. @@ -2176,81 +2171,41 @@ async def my_coroutine(...): ``` """ - def _log_verbose(name: str, start: float) -> None: - elapsed = (time.perf_counter() - start) * 1000.0 - cnt, tot = _ASYNC_TASK_STATS[name] - _ASYNC_TASK_STATS[name] = [cnt + 1, tot + elapsed] - avg = _ASYNC_TASK_STATS[name][1] / _ASYNC_TASK_STATS[name][0] - - log10 = numpy.log10(max(cnt, 1)) - if numpy.isclose(log10, round(log10)): - logger.info( - f"{name} completed in {elapsed:.3f} ms, " - f"lifetime avg: {avg:.3f} ms, " - f"lifetime cnt: {cnt + 1}" - ) - - def _decorate(fn: Callable): - if asyncio.iscoroutinefunction(fn): - - @functools.wraps(fn) - async def wrapper(*args, **kwargs): + def _decorate(fn): + if not asyncio.iscoroutinefunction(fn): + raise TypeError("trace_async_exceptions can only be used with async functions") + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + if verbose: + start = time.perf_counter() + try: + return await fn(*args, **kwargs) + except Exception as e: + logger.error(f"Exception in async function {fn.__name__}: {e}") + traceback.print_exc() + sys.exit(1) + finally: if verbose: - start = time.perf_counter() - try: - return await fn(*args, **kwargs) - except Exception as e: - logger.error(f"Exception in async function {fn.__name__}: {e}") - traceback.print_exc() - sys.exit(1) - finally: - if verbose: - _log_verbose(fn.__qualname__, start) - - elif inspect.isasyncgenfunction(fn): - - @functools.wraps(fn) - async def wrapper(*args, **kwargs): - if verbose: - start = time.perf_counter() - agen = fn(*args, **kwargs) - try: - async for item in agen: - yield item - except Exception as e: - logger.error(f"Exception in async generator {fn.__name__}: {e}") - traceback.print_exc() - sys.exit(1) - finally: - if verbose: - _log_verbose(fn.__qualname__, start) + elapsed = (time.perf_counter() - start) * 1000.0 + name = fn.__qualname__ + cnt, tot = _ASYNC_TASK_STATS[name] + _ASYNC_TASK_STATS[name] = [cnt + 1, tot + elapsed] + avg = _ASYNC_TASK_STATS[name][1] / _ASYNC_TASK_STATS[name][0] + + log10 = numpy.log10(max(cnt, 1)) + if numpy.isclose(log10, round(log10)): + logger.info( + f"{name} completed in {elapsed:.3f} ms, " + f"lifetime avg: {avg:.3f} ms, " + f"lifetime cnt: {cnt + 1}" + ) - else: - raise TypeError("trace_async_exceptions must be used on async functions or generators") return wrapper return _decorate if func is None else _decorate(func) -def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]: - """Returns Mamba inference state config from the model if it is a hybrid model.""" - from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, - ) - from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols - - decoder = get_attr_wrapped_model(model, "decoder") - layer_type_list = getattr(decoder, "layer_type_list", None) - if layer_type_list is not None and Symbols.MAMBA in layer_type_list: - (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() - return MambaInferenceStateConfig( - layer_type_list=layer_type_list, - mamba_conv_states_shape=mamba_conv_states_shape, - mamba_ssm_states_shape=mamba_ssm_states_shape, - ) - return None - - # ============================================================================ # Backward Compatibility Decorators # ============================================================================ diff --git a/megatron/legacy/data/biencoder_dataset_utils.py b/megatron/legacy/data/biencoder_dataset_utils.py index 6d69fabbe48..6fa391c8a22 100644 --- a/megatron/legacy/data/biencoder_dataset_utils.py +++ b/megatron/legacy/data/biencoder_dataset_utils.py @@ -5,14 +5,11 @@ import numpy as np import torch -from megatron.core import mpu, tensor_parallel -from megatron.legacy.data.dataset_utils import ( - create_masked_lm_predictions, - pad_and_convert_to_numpy, -) from megatron.training import get_args, get_tokenizer, print_rank_0 -from megatron.training.datasets.data_samplers import MegatronPretrainingSampler - +from megatron.core import mpu, tensor_parallel +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \ + pad_and_convert_to_numpy +from megatron.legacy.data.data_samplers import MegatronPretrainingSampler def make_attention_mask(source_block, target_block): """ diff --git a/megatron/training/datasets/data_samplers.py b/megatron/legacy/data/data_samplers.py similarity index 56% rename from megatron/training/datasets/data_samplers.py rename to megatron/legacy/data/data_samplers.py index 1e7f47510d1..1bf1bf5ee91 100644 --- a/megatron/training/datasets/data_samplers.py +++ b/megatron/legacy/data/data_samplers.py @@ -4,17 +4,13 @@ import random - -import numpy as np import torch +import numpy as np from torch.utils.data import Dataset - +from megatron.training import get_args from megatron.core import mpu from megatron.core.datasets.utils import Split -from megatron.training import get_args -from megatron.training.dist_signal_handler import DistributedSignalHandler - def build_pretraining_data_loader(dataset, consumed_samples): """Build dataloader given an input dataset.""" @@ -22,10 +18,10 @@ def build_pretraining_data_loader(dataset, consumed_samples): if dataset is None: return None args = get_args() - - if hasattr(dataset, 'split'): + + if hasattr(dataset,'split'): split = dataset.split - elif hasattr(dataset, 'index_split'): + elif hasattr(dataset,'index_split'): split = dataset.index_split else: split = None @@ -36,8 +32,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): consumed_samples=0, micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size(), - ) + data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'single': # Megatron sampler batch_sampler = MegatronPretrainingSampler( @@ -45,8 +40,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): consumed_samples=consumed_samples, micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size(), - ) + data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomSampler( dataset, @@ -55,82 +49,52 @@ def build_pretraining_data_loader(dataset, consumed_samples): micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size(), - data_sharding=args.data_sharding, - ) + data_sharding=args.data_sharding) elif args.dataloader_type == "external": # External dataloaders are passed through. User is expected to provide a # torch-compatible dataloader and define samplers, if needed. return dataset else: - raise Exception('{} dataloader type is not supported.'.format(args.dataloader_type)) - - def worker_init_fn(_): - DistributedSignalHandler(args.exit_signal).__enter__() + raise Exception('{} dataloader type is not supported.'.format( + args.dataloader_type)) - maybe_worker_init_fn = ( - worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None - ) # Torch dataloader. - return torch.utils.data.DataLoader( - dataset, - batch_sampler=batch_sampler, - num_workers=args.num_workers, - pin_memory=True, - persistent_workers=True if args.num_workers > 0 else False, - worker_init_fn=maybe_worker_init_fn, - ) - + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + persistent_workers=True if args.num_workers > 0 else False, + ) class MegatronPretrainingSampler: - """ - Sampler for Megatron pretraining dataloaders that divides data samples across - data parallel workers. Each worker receives a contiguous chunk of data determined by - its rank and the micro batch size. Supports dropping the last incomplete batch if - specified, and keeps track of total and consumed samples. Designed to work with - distributed training using Megatron's data parallelism. - """ - def __init__( - self, - total_samples, - consumed_samples, - micro_batch_size, - data_parallel_rank, - data_parallel_size, - drop_last=True, - ): + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size self.drop_last = drop_last # Sanity checks. - assert self.total_samples > 0, 'no sample to consume: {}'.format(self.total_samples) - assert ( - self.consumed_samples < self.total_samples - ), 'no samples left to consume: {}, {}'.format(self.consumed_samples, self.total_samples) + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert ( - self.data_parallel_rank < data_parallel_size - ), 'data_parallel_rank should be smaller than data size: {}, ' '{}'.format( - self.data_parallel_rank, data_parallel_size - ) + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) def __len__(self): return self.total_samples def get_start_end_idx(self): - """ - Calculate the start and end indices for the current data parallel worker's - chunk within a batch. - - Returns: - tuple: (start_idx, end_idx) indicating the slice of the batch for this worker. - """ start_idx = self.data_parallel_rank * self.micro_batch_size end_idx = start_idx + self.micro_batch_size return start_idx, end_idx @@ -152,37 +116,17 @@ def __iter__(self): class RandomSeedDataset(Dataset): - """ - A dataset wrapper that resets the random seed before each sample. - This ensures deterministic behavior per sample by setting the RNG state - for torch, numpy, and random before accessing each underlying data sample. - The base seed is retrieved from training arguments, and can be varied per epoch - using the set_epoch method to ensure different shuffling or augmentation each epoch. - - Args: - dataset: The underlying dataset to wrap. - - Methods: - set_epoch(epoch): Change the seed offset so each epoch produces different randomization. - __getitem__(idx): Sets the seed based on the sample index and current epoch. - """ - - def __init__(self, dataset, seed): - self.base_seed = seed - self.curr_seed = seed + def __init__(self, dataset): + args = get_args() + self.base_seed = args.seed + self.curr_seed = args.seed self.dataset = dataset def __len__(self): return len(self.dataset) def set_epoch(self, epoch): - """ - Change the seed offset so each epoch produces different randomization. - - Args: - epoch: The epoch number to use as the seed offset. - """ self.curr_seed = self.base_seed + epoch def __getitem__(self, idx): @@ -194,23 +138,9 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: - """ - Sampler for Megatron pretraining dataloaders that performs random sampling - across data parallel workers. Supports data sharding to divide the dataset - into buckets and shuffle within each bucket. Designed to work with distributed - training using Megatron's data parallelism. - """ - def __init__( - self, - dataset, - total_samples, - consumed_samples, - micro_batch_size, - data_parallel_rank, - data_parallel_size, - data_sharding, - ): + def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, data_sharding): # Keep a copy of input params for later use. self.dataset = dataset self.total_samples = total_samples @@ -219,18 +149,19 @@ def __init__( self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size - self.last_batch_size = self.total_samples % self.micro_batch_times_data_parallel_size + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size # Sanity checks. - assert self.total_samples > 0, 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert ( - self.data_parallel_rank < data_parallel_size - ), 'data_parallel_rank should be smaller than data size: {}, ' '{}'.format( - self.data_parallel_rank, data_parallel_size - ) + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) def __len__(self): return self.total_samples @@ -246,9 +177,8 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = ( - self.total_samples // self.micro_batch_times_data_parallel_size - ) * self.micro_batch_size + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size @@ -257,13 +187,15 @@ def __iter__(self): random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) * self.micro_batch_size + full_bucket_size = (self.total_samples // self.micro_batch_size) \ + * self.micro_batch_size full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = torch.randperm(full_bucket_size, generator=g).tolist() + idx_range_total = \ + torch.randperm(full_bucket_size, generator=g).tolist() idx_range_active = idx_range_total[full_bucket_offset:] - idx_range = idx_range_active[self.data_parallel_rank :: self.data_parallel_size] + idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] batch = [] # Last batch if not complete will be dropped. diff --git a/megatron/legacy/data/vit_dataset.py b/megatron/legacy/data/vit_dataset.py index 504075a5506..e65c536c897 100644 --- a/megatron/legacy/data/vit_dataset.py +++ b/megatron/legacy/data/vit_dataset.py @@ -1,17 +1,15 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import os import random - import numpy as np import torch import torchvision.transforms as T -from PIL import Image, ImageFilter, ImageOps from torchvision import datasets - -from megatron.legacy.data.autoaugment import ImageNetPolicy -from megatron.legacy.data.image_folder import ImageFolder from megatron.training import get_args -from megatron.training.datasets.data_samplers import RandomSeedDataset +from megatron.legacy.data.image_folder import ImageFolder +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.data_samplers import RandomSeedDataset +from PIL import Image, ImageFilter, ImageOps class GaussianBlur(object): @@ -238,7 +236,7 @@ def build_train_valid_datasets(data_path, image_size=224): classes_fraction=args.classes_fraction, data_per_class_fraction=args.data_per_class_fraction ) - train_data = RandomSeedDataset(train_data, args.seed) + train_data = RandomSeedDataset(train_data) # validation dataset val_data_path = data_path[1] @@ -246,6 +244,6 @@ def build_train_valid_datasets(data_path, image_size=224): root=val_data_path, transform=val_transform ) - val_data = RandomSeedDataset(val_data, args.seed) + val_data = RandomSeedDataset(val_data) return train_data, val_data diff --git a/megatron/post_training/algos/__init__.py b/megatron/post_training/algos/__init__.py new file mode 100644 index 00000000000..f8011007a50 --- /dev/null +++ b/megatron/post_training/algos/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/post_training/algos/distillation.py b/megatron/post_training/algos/distillation.py new file mode 100644 index 00000000000..c54add0a8d7 --- /dev/null +++ b/megatron/post_training/algos/distillation.py @@ -0,0 +1,601 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Distillation loss function(s).""" + +import logging +import re +import types +from abc import ABCMeta +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import modelopt.torch.distill as mtd +import modelopt.torch.opt as mto +import torch +import torch.nn as nn +import torch.nn.functional as F +import yaml +from torch import Tensor +from torch.nn.modules.loss import _Loss + +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.parallel_state import ( + get_context_parallel_group, + get_pipeline_model_parallel_world_size, + get_tensor_and_context_parallel_rank, + get_tensor_model_parallel_group, + get_virtual_pipeline_model_parallel_world_size, + is_pipeline_last_stage, +) +from megatron.core.pipeline_parallel.schedules import get_tensor_shapes +from megatron.core.transformer import MegatronModule, TransformerConfig, TransformerLayer +from megatron.core.utils import get_model_config + +logger = logging.getLogger(__name__) + + +def load_distillation_config( + config_path: Optional[str], student_cfg: TransformerConfig, teacher_cfg: TransformerConfig +) -> Dict[str, Any]: + """Read the distillation yaml config file specified by ``args.export_kd_cfg``. + + Args: + config_path: Path to user-defined distillation settings yaml file. + If `None`, uses default logits-only distillation mode for GPT models. + student_cfg: Model config for student model. + teacher_cfg: Model config for teacher model. + + WARNING: Assumes intermediate hidden sizes are always that found in the model config's ``hidden_size`` attribute. + """ + if not config_path: + logger.warning("Distillation config not provided. Using default.") + cfg = { + "logit_layers": ["output_layer", "output_layer"], + "intermediate_layer_pairs": [], + "skip_lm_loss": True, + "kd_loss_scale": 1.0, + } + else: + with open(config_path) as f: + cfg = yaml.safe_load(f) + + intermediate_pairs = cfg.get("intermediate_layer_pairs", []) + logit_pair = cfg["logit_layers"] + skip_lm_loss = cfg["skip_lm_loss"] + loss_scale = cfg["kd_loss_scale"] + + criterion = {} + if student_cfg.pipeline_model_parallel_size == 1 or is_pipeline_last_stage(): + criterion[tuple(logit_pair)] = LogitsKLLoss(student_cfg) + # NOTE: Projection layer shared among intermediate layer pairs. + projection_layer = ProjectionLayer(student_cfg, teacher_cfg) + + for entry in intermediate_pairs: + if len(entry) == 2: + student_layer, teacher_layer = entry + loss = "hidden_cosine" + elif len(entry) == 3: + student_layer, teacher_layer, loss = entry + + loss_fn = None + + if loss == "mse": + loss_fn = MSELoss + elif loss == "hidden_cosine": + loss_fn = HiddenStateCosineLoss + else: + assert False, f"loss passed was {loss=}" + + if get_tensor_and_context_parallel_rank() == 0: + print( + "Distillation: Adding intermediate loss between" + f" `{student_layer}` of student (hidden size {student_cfg.hidden_size}) and" + f" `{teacher_layer}` of teacher (hidden size {teacher_cfg.hidden_size})." + ) + student_layer = _adjust_layer_index_for_pp(student_layer, student_cfg) + teacher_layer = _adjust_layer_index_for_pp(teacher_layer, teacher_cfg) + criterion[(student_layer, teacher_layer)] = loss_fn( + student_cfg, projection_layer=projection_layer + ) + + loss_balancer = LogitsAndIntermediatesLossBalancer( + kd_loss_scale=loss_scale, skip_original_loss=skip_lm_loss + ) + + cfg["criterion"] = criterion + cfg["loss_balancer"] = loss_balancer + + return cfg + + +def _adjust_layer_index_for_pp(submodule_name, model_cfg): + """Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.""" + + match = re.search(r'(?<=\.)\d+(?=\.)', submodule_name) + if not match: + return submodule_name + + offset = TransformerLayer._get_layer_offset(model_cfg) + new_layer_idx = int(match.group(0)) - offset + if new_layer_idx < 0: + raise ValueError(f"Layer {submodule_name} does not fall on final PP rank.") + + new_submodule_name = submodule_name.replace(match.group(0), str(new_layer_idx)) + if get_tensor_and_context_parallel_rank() == 0: + print( + f'Distillation: Renamed layer "{submodule_name}" on final PP rank to "{new_submodule_name}"' + ) + return new_submodule_name + + +######################################################## + + +class BaseLoss(_Loss, metaclass=ABCMeta): + """Abstract base class for Megatron distillation losses.""" + + def __init__( + self, model_config: TransformerConfig, projection_layer: Optional[nn.Module] = None + ): + """ + Constructor. + + Args: + model_config: MCore transformer config. + projection_layer: Module which projects student activations to teacher's hidden dim. + """ + super().__init__() + self._config = model_config + self._projection = projection_layer + + def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]: + """Performs projection of student tensor to match teacher's size if necessary.""" + if isinstance(predictions, tuple): + # `ColumnParallelLinear` returns bias too + predictions, targets = predictions[0], targets[0] + + if self._projection is not None: + predictions = self._projection(predictions) + targets = targets.detach() + + return predictions, targets + + def post_forward(self, loss: Tensor, tp_reduce: bool = False, is_sequence_parallel: bool = False) -> Tensor: + """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.""" + loss = loss.transpose(0, 1).contiguous() + return (loss, tp_reduce, is_sequence_parallel) + + +class HiddenStateCosineLoss(BaseLoss): + """ + Calculates Cosine loss between two tensors without reducing the sequence dim. + + The tensors are assumed to be intermediate activations, so extra restrictions are in place. + """ + + def __init__( + self, model_config: TransformerConfig, projection_layer: Optional[nn.Module] = None + ): + """ + Constructor. + + Args: + model_config: MCore transformer config. + projection_layer: Module which projects student activations to teacher's hidden dim. + """ + super().__init__(model_config, projection_layer=projection_layer) + + if self._config.tensor_model_parallel_size > 1 and not self._config.sequence_parallel: + logger.warning( + "``HiddenStateCosineLoss`` only works with tensors with full hidden dim. Ensure the " + "tensor inputs meet this requirement or use `--sequence_parallel` if tensor parallel is enabled." + ) + + def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: + """ + Forward function. + + Args: + predictions: Student model tensors (size [s, b, h]) + targets: Teacher model tensors (size [s, b, h]) + + Returns: + Cosine loss of tensors (size [b, s]) + """ + predictions, targets = self.pre_forward(predictions, targets) + + loss = F.cosine_embedding_loss( + predictions.view(-1, predictions.size(-1)), + targets.view(-1, targets.size(-1)), + targets.new_ones(1), + reduction="none", + ) + loss = loss.view(*predictions.shape[:2]) + + # NOTE: Tensor sequence length is still split among TP ranks. + return self.post_forward(loss, is_sequence_parallel=self._config.sequence_parallel) + + +class MSELoss(BaseLoss): + """Calculates MSE loss between two tensors without reducing the sequence dim.""" + + def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: + """Forward function. + + Args: + predictions: Student model tensors (size [s, b, h]) + targets: Teacher model tensors (size [s, b, h]) + + Returns: + MSE loss of tensors (size [b, s]) + """ + predictions, targets = self.pre_forward(predictions, targets) + + loss = F.mse_loss(predictions, targets, reduction="none") + loss = loss.mean(dim=-1) + + return self.post_forward(loss, is_sequence_parallel=self._config.sequence_parallel) + + +class LogitsKLLoss(BaseLoss): + """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.""" + + def __init__( + self, model_config: TransformerConfig, temperature: float = 1.0, reverse: bool = False + ): + """ + Constructor. + + Args: + model_config: MCore transformer config. + temperature: Divide tensors by this value prior to calculating loss. + reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher) + """ + super().__init__(model_config) + self._temperature = temperature + self._reverse = reverse + + def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: + """ + Forward function. + + Args: + predictions: Student model tensors (size [s, b, h]) + targets: Teacher model tensors (size [s, b, h]) + + Returns: + KLD loss of tensors (size [b, s]) + """ + predictions, targets = self.pre_forward(predictions, targets) + + # Division by temp should happen prior to finding max for both student and teacher. + # Currently we don't use temperature in any of ours runs (temp=1.0) + output_teacher = targets.float() / self._temperature + output_student = predictions.float() / self._temperature + + # Compute local softmax, and the reweight to compute global softmax. + if self._config.tensor_model_parallel_size > 1: + + # Maximum value along vocab dimension across all GPUs. + teacher_logits_max, _ = torch.max(output_teacher, dim=-1) + torch.distributed.all_reduce( + teacher_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1) + + denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1) + # We can't use standard reduction function here since the computation + # that follows it isn't identical across TP ranks. + denom_teacher = all_reduce_autograd( + denom_teacher, group=get_tensor_model_parallel_group() + ) + + # Maximum value along vocab dimension across all GPUs. + student_logits_max, _ = torch.max(output_student, dim=-1) + torch.distributed.all_reduce( + student_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach() + + denom_student = torch.sum(torch.exp(output_student), dim=-1) + denom_student = all_reduce_autograd( + denom_student, group=get_tensor_model_parallel_group() + ) + + slen, bsz, sharded_vocab_size = output_student.shape + student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + + if self._reverse: + loss = torch.sum( + F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True), + dim=-1, + ) + + else: + if self._reverse: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_teacher, dim=-1), + F.softmax(output_student, dim=-1), + reduction="none", + ), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_student, dim=-1), + F.softmax(output_teacher, dim=-1), + reduction="none", + ), + dim=-1, + ) + + return self.post_forward(loss, tp_reduce=True) + + +######################################################## + + +class LogitsAndIntermediatesLossBalancer(mtd.DistillationLossBalancer): + """ + LossBalancer implementation for Logit and Intermediate losses. + + Dynamically weighs distillation and original losses to balance during training. + """ + + def __init__(self, kd_loss_scale: float = 1.0, skip_original_loss: bool = False): + """Constructor. + + Args: + kd_loss_scale: Multiply distillation losses by this before weighing. + (Not used when `skip_original_loss` is True.) + skip_original_loss: Used to signal whether the original loss should be used, regardless + of whether it was passed into ``mtd.DistillationModel.compute_kd_loss()`` or not. + """ + super().__init__() + self._kd_loss_scale = kd_loss_scale + self._skip_original_loss = skip_original_loss + + def forward(self, loss_dict: Dict[str, Tensor]) -> Tensor: + """Forward function. + + Args: + loss_dict: All individual scalar losses, passed in during ``mtd.DistillationModel.compute_kd_loss()`` + + Returns: + Aggregate total scalar loss. + """ + original_loss = loss_dict.pop(mtd.loss_balancers.STUDENT_LOSS_KEY) + for _key in loss_dict: + if _key.startswith(LogitsKLLoss.__name__): + logits_key = _key # should only be one + logits_loss = loss_dict.pop(logits_key) + intermediate_loss = sum(loss_dict.values()) / max(len(loss_dict), 1) + + if intermediate_loss > 0: + dynamic_scale = logits_loss.item() / intermediate_loss.item() + intermediate_loss_scaled = intermediate_loss * dynamic_scale + kd_loss_scale = self._kd_loss_scale / 2.0 + else: + kd_loss_scale = self._kd_loss_scale + intermediate_loss = logits_loss.new_tensor(intermediate_loss) + intermediate_loss_scaled = intermediate_loss + + if self._skip_original_loss: + total_loss = logits_loss + intermediate_loss_scaled + else: + kd_loss = (logits_loss + intermediate_loss_scaled) * kd_loss_scale + dynamic_scale = original_loss.item() / kd_loss.item() + total_loss = original_loss + kd_loss * dynamic_scale + + out_dict = { + "kd_loss": total_loss, + "logits_loss": logits_loss, + "intermediate_loss": intermediate_loss, + } + return out_dict + + +######################################################## + + +class ProjectionLayer(MegatronModule): + """Module to project student layer activations to teacher's size.""" + + def __init__(self, student_config: TransformerConfig, teacher_config: TransformerConfig): + """ + Constructor. + + Args: + student_config: Student's MCore transformer config. + teacher_config: Teacher's MCore transformer config. + """ + super().__init__(config=student_config) + if student_config.hidden_size == teacher_config.hidden_size: + self._fit = nn.Identity() + else: + self._fit = nn.Linear(student_config.hidden_size, teacher_config.hidden_size) + self.apply(self._init_weights) + # Attribute below needed to reduce gradients during backward properly. + setattr(self._fit.weight, "sequence_parallel", self.config.sequence_parallel) + setattr(self._fit.bias, "sequence_parallel", self.config.sequence_parallel) + + def forward(self, student_tensor: Tensor): + """ + Forward function. + + Args: + student_tensor: Tensor to be fit to teacher size. + """ + return self._fit(student_tensor) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + +class _AllReduce(torch.autograd.Function): + """Implementation from old PyTorch `torch.distributed.nn.parallel`.""" + + @staticmethod + def forward(ctx, op, group, tensor): + ctx.group, ctx.op = group, op + tensor = tensor.clone() + torch.distributed.all_reduce(tensor, op=op, group=group) + return tensor + + @staticmethod + def backward(ctx, grad_output): + return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output)) + + +def all_reduce_autograd( + tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD +): + """Custom all-reduce function. + + Needed instead of other all-reduce functions available when the computation following + the all-reduce call differs per rank. In KL loss, this corresponds to the different numerators. + """ + return _AllReduce.apply(op, group, tensor) + + +######################################################## + + +def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]): + """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core.""" + + # HACK: Get rid of ModelOpt Distillation state + # NOTE: If re-placed, above losses need modifcation as `TransformerConfig` has non-pickleable elements. + mto.ModeloptStateManager(model)._state.pop() + + # HACK: Hide teacher during `sharded_state_dict` method. + def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict: + with self.hide_teacher_model(): + return type(self).sharded_state_dict(self, *args, **kwargs) + + model.sharded_state_dict = types.MethodType(_sharded_state_dict, model) + + # HACK: Skip `lm_loss` bypassing it when training if not needed for backprop. + def _compute_language_model_loss(self, labels, logits) -> Tensor: + if distill_cfg["skip_lm_loss"] and self.training: + return torch.zeros_like(labels) + return type(self).compute_language_model_loss(self, labels, logits) + + model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model) + + # HACK: Skip `lm_loss` always for teacher. + def _compute_language_model_loss(self, labels, logits) -> Tensor: + return torch.zeros_like(labels) + + model.teacher_model.compute_language_model_loss = types.MethodType( + _compute_language_model_loss, model.teacher_model + ) + + # HACK: Pipeline-parallel Distillation requires splitting input tensor into student and teacher parts. + def _set_student_input_tensor_shape(self, shapes: List[Tuple[int]]): + self._tensor_split_idx = shapes[0][-1] + + def _set_input_tensor(self, input_tensors: List[Tensor]): + teacher_inputs = [t[..., self._tensor_split_idx:] if t is not None else t for t in input_tensors] + student_inputs = [t[..., :self._tensor_split_idx] if t is not None else t for t in input_tensors] + type(self).set_input_tensor(self.teacher_model, teacher_inputs) + type(self).set_input_tensor(self, student_inputs) + + model.set_student_input_tensor_shape = types.MethodType(_set_student_input_tensor_shape, model) + model.set_input_tensor = types.MethodType(_set_input_tensor, model) + + # HACK: Concatenate output tensors when PP>1 so they can be passed between ranks. + def _forward(self, *args, **kwargs): + if not self.training: + with self.only_student_forward(): + return type(self).forward(self, *args, **kwargs) + + with torch.no_grad(): + self._teacher_model.eval() + teacher_output = self._teacher_model(*args, **kwargs) + with self.only_student_forward(): + student_output = type(self).forward(self, *args, **kwargs) + + if not is_pipeline_last_stage(): + return torch.cat([student_output, teacher_output], dim=-1) + else: + return student_output + + model.forward = types.MethodType(_forward, model) + + +def get_tensor_shapes_adjust_fn_for_distillation( + model: Union[torch.nn.Module, List[torch.nn.Module]], + seq_length: int, + micro_batch_size: int, + decoder_seq_length: Optional[int] = None, + forward_only: bool = False, +) -> Union[Callable, None]: + if ( + forward_only + or get_pipeline_model_parallel_world_size() == 1 + or get_virtual_pipeline_model_parallel_world_size() is not None + ): + return None + # Unwrap + if isinstance(model, list): + model = model[0] + while hasattr(model, "module"): + model = model.module + if not isinstance(model, mtd.DistillationModel): + return None + + def adjust_tensor_shapes(recv_tensor_shapes: List[Tuple[int, ...]], send_tensor_shapes: List[Tuple[int, ...]]): + teacher_config = get_model_config(model.teacher_model) + tp_group = get_tensor_model_parallel_group() + cp_group = get_context_parallel_group() + + teacher_recv_tensor_shapes = get_tensor_shapes( + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=teacher_config, + tp_group=tp_group, + cp_group=cp_group, + ) + teacher_send_tensor_shapes = get_tensor_shapes( + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=teacher_config, + tp_group=tp_group, + cp_group=cp_group, + ) + model.set_student_input_tensor_shape(recv_tensor_shapes) + + for i, shape in enumerate(recv_tensor_shapes): + shape = list(shape) + shape[-1] += teacher_recv_tensor_shapes[0][-1] + recv_tensor_shapes[i] = tuple(shape) + for i, shape in enumerate(send_tensor_shapes): + shape = list(shape) + shape[-1] += teacher_send_tensor_shapes[0][-1] + send_tensor_shapes[i] = tuple(shape) + + return recv_tensor_shapes, send_tensor_shapes + + return adjust_tensor_shapes diff --git a/megatron/post_training/checkpointing.py b/megatron/post_training/checkpointing.py index 143cbb9c6ab..aac59341e37 100644 --- a/megatron/post_training/checkpointing.py +++ b/megatron/post_training/checkpointing.py @@ -183,7 +183,14 @@ def _remove_prefix_state_dict_pre_hook( logger.warning(f"PyTorch version {get_torch_version()} below 2.6 detected." f" Forcing dist_ckpt_save_pre_mcore_014 behavior.") - sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix) + # NOTE: singleton_local_shards only take care of the weight and bias. There are be issue when linear_fc1._amax + # is a matrix such as NVFP4 real quant, awq, and blockwise 128. + if args.dist_ckpt_save_pre_mcore_014 or force_pre_mcore_014: + metadata = {"singleton_local_shards": False} + else: + metadata = {"singleton_local_shards": True} + + sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix, metadata=metadata) if additional_sharded_prefix: unwrapped_model[0]._register_load_state_dict_pre_hook( diff --git a/megatron/post_training/docs/distillation.md b/megatron/post_training/docs/distillation.md index 9f0d5524176..6ca1ec18417 100644 --- a/megatron/post_training/docs/distillation.md +++ b/megatron/post_training/docs/distillation.md @@ -75,7 +75,7 @@ Model Optimizer modifies the model using the loss criterion present in the disti defines a loss function between two module attribute names of the teacher and student model, respectively. Default loss function used between logits is a KL-Divergence Loss and loss used among intermediate tensors is Cosine-Similarity, -both defined in `modelopt.torch.distill.plugins.megatron`. +both defined in `megatron/inference/algos/distillation.py`. ## Restrictions diff --git a/megatron/post_training/generate.py b/megatron/post_training/generate.py index 2a124734a30..0c5be3eceab 100644 --- a/megatron/post_training/generate.py +++ b/megatron/post_training/generate.py @@ -104,7 +104,7 @@ def simple_speculative_generate( input_ids: torch.Tensor, images: Optional[torch.Tensor] = None, osl: int = 32, - steps: int = 0, + draft_length: int = 0, eos_token_id: List[int] = [], disable_tqdm: bool = False, ): @@ -127,7 +127,7 @@ def simple_speculative_generate( # Speculative decoding forward # NOTE: PP is not yet supported. - new_token, draft_tokens = model.pseudo_speculative_generate(input_ids, steps=steps) + new_token, draft_tokens = model.pseudo_speculative_generate(input_ids, steps=draft_length) # Always accept the first token. input_ids = output_ids[:, : offset] @@ -138,8 +138,6 @@ def simple_speculative_generate( for i in range(draft_tokens.shape[-1]): if torch.equal(draft_tokens[:, i : i + 1], output_ids[:, offset: offset + 1]): offset += 1 - else: - break # Broadcast the accepted offset from the last rank. offset = [offset] diff --git a/megatron/post_training/loss_func.py b/megatron/post_training/loss_func.py index 9c99529172d..eb8dbca1c6a 100644 --- a/megatron/post_training/loss_func.py +++ b/megatron/post_training/loss_func.py @@ -55,18 +55,16 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: GPTMo num_tokens = loss_mask.sum().clone().detach().to(torch.int) report = {'lm loss': torch.cat([loss_lm.clone().detach().view(1), num_tokens.view(1)])} - if args.export_kd_teacher_load: + if model.training and args.export_kd_teacher_load: # [ModelOpt]: Handle knowledge distillation losses = model.compute_kd_loss( student_loss=loss_lm, loss_reduction_fn=lambda x: _mask_loss(x, loss_mask), ) + loss = losses["kd_loss"] report["total loss"] = torch.cat([losses["kd_loss"].clone().detach().view(1), num_tokens.view(1)]) report["logits distillation loss"] = torch.cat([losses["logits_loss"].clone().detach().view(1), num_tokens.view(1)]) report["intermediate distillation loss"] = torch.cat([losses["intermediate_loss"].clone().detach().view(1), num_tokens.view(1)]) - if model.training: - loss = losses["kd_loss"] - return loss, num_tokens, report diff --git a/megatron/post_training/model_builder.py b/megatron/post_training/model_builder.py index cb2654e7107..34daa279651 100644 --- a/megatron/post_training/model_builder.py +++ b/megatron/post_training/model_builder.py @@ -7,8 +7,6 @@ from typing import Any, Dict import modelopt.torch.distill as mtd -import modelopt.torch.distill.plugins.megatron as mtd_mcore -import modelopt.torch.opt as mto import yaml from megatron.core.models.gpt import GPTModel as MCoreGPTModel @@ -20,6 +18,7 @@ from megatron.core.post_training.modelopt.gpt.state_dict_hooks import ( mcore_gpt_load_te_state_dict_pre_hook, ) +from megatron.post_training.algos import distillation from megatron.post_training.checkpointing import load_modelopt_checkpoint, load_modelopt_state from megatron.training import get_args, print_rank_0 from megatron.training.arguments import core_transformer_config_from_args @@ -286,7 +285,7 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c ), "ModelOpt Distillation currently incompatible with interleaved pipeline schedule." teacher_config = _load_teacher_model_config(args.export_kd_teacher_load) - distill_cfg = mtd_mcore.setup_distillation_config( + distill_cfg = distillation.load_distillation_config( args.export_kd_cfg, student_cfg=config, teacher_cfg=core_transformer_config_from_args(teacher_config) ) if "hybrid_override_pattern" in teacher_config and args.is_hybrid_model: @@ -298,15 +297,14 @@ def modelopt_gpt_mamba_builder(args, pre_process, post_process, vp_stage=None, c kd_config = { "teacher_model": (_teacher_provider, [teacher_config, model_kwargs], {}), - "criterion": distill_cfg.criterion, - "loss_balancer": distill_cfg.loss_balancer, + "criterion": distill_cfg["criterion"], + "loss_balancer": distill_cfg["loss_balancer"], } model = mtd.convert(model, mode=[("kd_loss", kd_config)]) - # Additional tweaks needed for MCore. - # (accounts for sharded state, pipeline parallel, and potentially skipping LM loss) - mtd_mcore.adjust_distillation_model_for_mcore(model, distill_cfg) - # Also remove KD mode state to prevent issues with re-conversion after restore. - mto.ModeloptStateManager(model).state_dict().pop() # TODO(aanoosheh): remove once fixed in ModelOpt + # Additional tweaks needed for MCore/Nemo. + # NOTE: Distillation state manually removed in this function. + # ModelOpt state restoration above will not return a `mtd.DistillationModel` for simplicity reasons. + distillation.adjust_distillation_model_for_mcore(model, distill_cfg) return model diff --git a/megatron/post_training/non_loss_data_func.py b/megatron/post_training/non_loss_data_func.py index 49c29b4912c..49fb9220258 100644 --- a/megatron/post_training/non_loss_data_func.py +++ b/megatron/post_training/non_loss_data_func.py @@ -8,11 +8,10 @@ from megatron.training.utils import unwrap_model -def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7): +def report_draft_acceptance_length(model, osl: int = 64, draft_length: int = 7): """Report MTBench acceptance length.""" tokenizer = get_tokenizer()._tokenizer unwrapped_model = unwrap_model(model)[0] - parallel_draft_step = unwrapped_model.eagle_config.parallel_draft_step if hasattr(unwrapped_model, "eagle_config") else 1 if unwrapped_model.training: return @@ -34,15 +33,15 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7): conversations, return_tensors="pt", add_generation_prompt=True ).to(torch.cuda.current_device()) output_ids, actual_osl, steps = simple_speculative_generate( - unwrapped_model, input_ids, osl=osl, steps=draft_steps, disable_tqdm=True + unwrapped_model, input_ids, osl=osl, draft_length=draft_length, disable_tqdm=True ) total_osl += actual_osl total_steps += steps if torch.distributed.get_rank() == 0: al = actual_osl / steps - ar = al / (draft_steps + parallel_draft_step - 1) + ar = al / draft_length print( - "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format( + "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2}".format( torch.distributed.get_rank(), torch.distributed.get_world_size(), category, @@ -50,16 +49,15 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7): ar, steps, actual_osl, - draft_steps, - parallel_draft_step, + draft_length, ), flush=True, ) if torch.distributed.get_rank() == 0: al = total_osl / total_steps - ar = al / (draft_steps + parallel_draft_step - 1) + ar = al / draft_length print( - "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2} PARALLEL {:2}".format( + "Rank {:3}/{:3} {:12} AL {:.1f} AR {:.2f} STEPS {:5}/{:5} DRAFT {:2}".format( torch.distributed.get_rank(), torch.distributed.get_world_size(), "average", @@ -67,8 +65,7 @@ def report_draft_acceptance_length(model, osl: int = 64, draft_steps: int = 7): ar, total_steps, total_osl, - draft_steps, - parallel_draft_step, + draft_length, ), flush=True, ) diff --git a/megatron/post_training/utils.py b/megatron/post_training/utils.py index 4bec8c96cf1..5d9f301cd41 100644 --- a/megatron/post_training/utils.py +++ b/megatron/post_training/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -import os import torch from datasets import load_dataset @@ -35,7 +34,7 @@ def mtbench_to_oai_chat(example): example["conversations"] = conversations return example - dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train", token=os.environ.get("HF_TOKEN", None)) + dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train") return dataset.map(mtbench_to_oai_chat) def to_empty_if_meta(module: torch.nn.Module, *, device: torch.device, recurse=True): diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py index ad22bd14ac9..58613b364a6 100644 --- a/megatron/rl/inference/megatron.py +++ b/megatron/rl/inference/megatron.py @@ -5,11 +5,10 @@ from argparse import Namespace from pydantic import PrivateAttr -import torch.distributed as dist from megatron.core import parallel_state -from megatron.core.inference.inference_client import InferenceClient from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext +from megatron.core.inference.coordinator import DynamicEngineCoordinator from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine @@ -24,11 +23,9 @@ SimpleTextGenerationController, ) from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model, log_single_rank +from megatron.core.utils import log_single_rank from megatron.training.global_vars import get_args, get_tokenizer -from megatron.training import get_wandb_writer from ..inference.inference_interface import ( ChatInferenceInterface, @@ -105,36 +102,38 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen """ tokenizer = get_tokenizer() - enable_cuda_graph = args.cuda_graph_impl == "local" + num_cuda_graphs = None + if args.enable_cuda_graph: + num_cuda_graphs = args.inference_dynamic_batching_num_cuda_graphs - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + module = model.module.module if hasattr(model.module, "module") else model.module # Inference context. inference_context = DynamicInferenceContext( params_dtype=args.params_dtype, - num_layers=args.num_layers // args.pipeline_model_parallel_size, + num_layers=args.num_layers, kv_channels=args.kv_channels, num_attention_heads=( args.num_query_groups if args.group_query_attention else args.num_attention_heads ), max_sequence_length=args.inference_max_seq_length, - num_cuda_graphs=( - args.inference_dynamic_batching_num_cuda_graphs - if enable_cuda_graph - else None - ), - block_size_tokens=args.inference_dynamic_batching_block_size, + num_cuda_graphs=num_cuda_graphs, buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb, - max_tokens=args.inference_dynamic_batching_max_tokens, + buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction, + chunk_size_tokens=args.inference_dynamic_batching_chunk_size, + buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor, + max_requests_override=args.inference_dynamic_batching_max_requests_override, + max_tokens_override=args.inference_dynamic_batching_max_tokens_override, tensor_model_parallel_size=args.tensor_model_parallel_size, materialize_only_last_token_logits=True, - mamba_inference_state_config=mamba_inference_state_config, - cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, - kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, - qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, - use_cuda_graphs_for_non_decode_steps=not args.decode_only_cuda_graphs, - use_flashinfer_fused_rope=None, - unified_memory_level=args.inference_dynamic_batching_unified_memory_level, + unified_memory_kvcache=args.inference_dynamic_batching_unified_memory_kvcache, + is_hybrid_model=args.is_hybrid_model, + layer_type_list=module.decoder.layer_type_list if args.is_hybrid_model else None, + mamba_head_dim=args.mamba_head_dim, + mamba_num_groups=args.mamba_num_groups, + mamba_d_model=args.hidden_size, + mamba_d_conv=4 if args.is_hybrid_model else None, + mamba_d_state=args.mamba_state_dim, metrics_writer=metrics_writer, ) @@ -151,7 +150,7 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen return DynamicInferenceEngine( controller=text_generation_controller, context=inference_context, - enable_cuda_graph=enable_cuda_graph, + enable_cuda_graph=args.enable_cuda_graph, random_seed=args.seed, inference_logging_step_interval=inference_logging_step_interval, ) @@ -160,8 +159,9 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw): """Interface to use MCoreEngine directly as an inference engine.""" - _client: InferenceClient = PrivateAttr(None) - _inference_engine: DynamicInferenceEngine = PrivateAttr(None) + _coordinator: DynamicEngineCoordinator = PrivateAttr(None) + _engine_task: asyncio.Task = PrivateAttr(None) + _kill_engine: bool = PrivateAttr(False) async def base_generate(self, request: InferenceRequest): @@ -174,29 +174,25 @@ async def base_generate(self, request: InferenceRequest): isinstance(p, str) for p in request.prompt ), "MegatronLocal only supports string prompts." - assert self._client is not None, "Client is not initialized" - tokenizer = get_tokenizer() sampling_params = SamplingParams( - num_tokens_to_generate=None, - num_tokens_total=request.generation_args.max_tokens, + num_tokens_to_generate=request.generation_args.max_tokens or 1024, temperature=request.generation_args.temperature or 1.0, top_k=request.generation_args.top_k or 0, top_p=request.generation_args.top_p or 0.0, - termination_id=self._inference_engine.controller.tokenizer.eod, + termination_id=self._coordinator.engine.controller.tokenizer.eod, return_log_probs=True, skip_prompt_log_probs=True, add_BOS=tokenizer.bos is not None, ) - requests = [ - self._client.add_request(prompt=prompt, sampling_params=sampling_params) + request_ids = [ + self._coordinator.schedule_request(prompt=prompt, sampling_params=sampling_params) for prompt in request.prompt ] - records = await asyncio.gather( - *requests + responses = await asyncio.gather( + *[self._coordinator.get_response(id) for id in request_ids] ) - responses = [record[-1] for record in records] return [ InferenceResponse( response=r.generated_text, @@ -233,32 +229,28 @@ async def launch(cls, model: GPTModel, **kwargs): "wandb module is available. Inference logging will be disabled.") inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(args, model, inference_logging_step_interval, metrics_writer) - await inference_engine.start_listening_to_data_parallel_coordinator(inference_coordinator_port=41521, launch_inference_coordinator=True) - if dist.get_rank() == 0: - # TODO: We have to do this only on the rank 0 process, should be fixed in the future when we have support for multiple inference clients. !2278 - client = InferenceClient(inference_coordinator_port=41521) - await client.start() - else: - client = None + coordinator = DynamicEngineCoordinator( + inference_engine, + inference_max_requests=inference_engine.context.max_requests, + log_level=0, + ) launched_server = cls(**kwargs) - launched_server._client = client - launched_server._inference_engine = inference_engine + launched_server._coordinator = coordinator + + loop = asyncio.get_running_loop() + + coordinator.startup(loop) return launched_server async def kill(self): - if dist.get_rank() == 0: - await self._client.stop_engines() - await self._inference_engine.stopped.wait() + await self._coordinator.shutdown() async def suspend(self): - if dist.get_rank() == 0: - await self._client.pause_engines() - await self._inference_engine.paused.wait() - - async def resume(self): - if dist.get_rank() == 0: - self._client.unpause_engines() - await self._inference_engine.running.wait() + await self._coordinator.suspend_engine() + + def resume(self): + self._coordinator.resume_engine() + class MegatronChatLocal(ChatInferenceInterface, MegatronLocal): ... diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index 11e005f74af..c0992778d57 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -24,7 +24,7 @@ from megatron.core import mpu from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer -from megatron.core.utils import get_asyncio_loop +from megatron.core.inference.utils import get_event_loop from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.optimizer import MegatronOptimizer @@ -607,11 +607,11 @@ def get_environment_rollouts( ), "n_prompts must be divisible by data_parallel_world_size" with nvtx_range("rollout-collection"): - loop = get_asyncio_loop() + loop = get_event_loop() with megatron_rl_inference_mode( model, optimizer, - args.cuda_graph_impl, + args.enable_cuda_graph, args.rl_reset_cuda_graphs, args.rl_offload_optimizer_during_inference, args.rl_offload_kv_cache_during_training, @@ -1006,7 +1006,7 @@ def prepare_trajectories( args = get_args() # Only process if we have inference_logprobs if inference_logprobs and any(lp is not None for lp in inference_logprobs): - if args.rl_use_sequence_packing: + if args.use_sequence_packing: # For sequence packing, we need to pad all logprobs to the same size padded_logprobs = [] for logprobs in inference_logprobs: @@ -1207,14 +1207,14 @@ def prepare_data_for_update( # [g, group_size] # Making an assumption that all groups are of the same size! # For packing mode, use all rollouts to compute rewards - rollouts_for_rewards = all_rollouts if args.rl_use_sequence_packing else rollouts + rollouts_for_rewards = all_rollouts if args.use_sequence_packing else rollouts rewards = torch.tensor( [[rollout.reward for rollout in group] for group in rollouts_for_rewards], device='cpu' ) # We flatten them for logging. with nvtx_range("prepare_trajectories"): - if args.rl_use_sequence_packing: + if args.use_sequence_packing: trajs, generation_masks, inference_logprobs = prepare_packed_trajectories( all_rollouts, tokenizer, args ) @@ -1228,14 +1228,14 @@ def prepare_data_for_update( # Sequence packing or standard processing packing_context = {} # Store all packing-related data - if args.rl_use_sequence_packing: + if args.use_sequence_packing: with nvtx_range("sequence_packing"): timers('sequence-packing-overhead', log_level=1).start() - bin_size = args.rl_sequence_packing_bin_size + bin_size = args.sequence_packing_bin_size # Create packer with max sequences per bin limit to prevent extreme imbalance - max_sequences_per_bin = getattr(args, 'rl_sequence_packing_max_sequences_per_bin', 100) + max_sequences_per_bin = getattr(args, 'sequence_packing_max_sequences_per_bin', 100) packer = SequencePacker( bin_size=bin_size, pad_token=tokenizer.pad, @@ -1276,7 +1276,7 @@ def prepare_data_for_update( world_size = mpu.get_expert_data_parallel_world_size() # Choose distribution algorithm based on args.sequence_packing_algo - packing_algo = getattr(args, 'rl_sequence_packing_algo', 'fifo') + packing_algo = getattr(args, 'sequence_packing_algo', 'fifo') if packing_algo == 'round-robin': # Round-robin assignment: rank i gets bins [i, i+world_size, i+2*world_size, ...] @@ -1596,7 +1596,7 @@ def prepare_data_for_update( ) original_loss_mask[~generation_masks] = 0.0 - if not args.rl_use_sequence_packing: + if not args.use_sequence_packing: # Use original masks if not packing attention_mask = original_attention_mask loss_mask = original_loss_mask @@ -1606,7 +1606,7 @@ def prepare_data_for_update( timers('compute-logprobs', log_level=0).start() # Before we can update the model, we need to get the logprobs for the \pi_{old} model. # Use packed sequences if packing is enabled for performance benefits - if args.rl_use_sequence_packing and 'packed_trajs' in packing_context: + if args.use_sequence_packing and 'packed_trajs' in packing_context: compute_trajs = packing_context['packed_trajs'] compute_position_ids = packing_context['packed_position_ids'] compute_attention_mask = packing_context['packed_attention_mask'] @@ -1661,7 +1661,7 @@ def prepare_data_for_update( if ( inference_logprobs is not None and args.rl_inference_logprobs_is_correction - and not args.rl_use_sequence_packing + and not args.use_sequence_packing ): inference_logprobs = align_unpacked_inference_logprobs( inference_logprobs=inference_logprobs, @@ -1670,14 +1670,14 @@ def prepare_data_for_update( group_stats=group_stats, ) else: - if not args.rl_use_sequence_packing: + if not args.use_sequence_packing: # Keep inference_logprobs as None instead of zeros inference_logprobs = None # For sequence packing, inference_logprobs will be handled separately # Handle packing of inference_logprobs for sequence packing mode if ( - args.rl_use_sequence_packing + args.use_sequence_packing and inference_logprobs is not None and args.rl_inference_logprobs_is_correction ): @@ -1687,7 +1687,7 @@ def prepare_data_for_update( inference_logprobs=inference_logprobs, packing_info=packing_context['packing_info'], generation_masks=generation_masks, - bin_size=args.rl_sequence_packing_bin_size, + bin_size=args.sequence_packing_bin_size, ) # Store packed inference logprobs in packing context @@ -1754,7 +1754,7 @@ def prepare_data_for_update( timers('prepare-advantages').stop() with nvtx_range("create_dataloader"): - if args.rl_use_sequence_packing: + if args.use_sequence_packing: # Store packing context in runtime state for forward_step runtime_state = get_rl_runtime_state() runtime_state.packing_context = packing_context @@ -2049,14 +2049,14 @@ def evaluate_and_print_results_rl( with megatron_rl_inference_mode( model, optimizer, - args.cuda_graph_impl, + args.enable_cuda_graph, args.rl_reset_cuda_graphs, args.rl_offload_optimizer_during_inference, args.rl_offload_kv_cache_during_training, args.rl_remove_kv_cache_during_training, ) as inference_interface: - loop = get_asyncio_loop() + loop = get_event_loop() rank = torch.distributed.get_rank() if rank == 0: @@ -2230,7 +2230,7 @@ def calculate_grpo_loss( def megatron_rl_inference_mode( model: list[LanguageModule], optimizer: MegatronOptimizer, - cuda_graph_impl: str, + enable_cuda_graph: bool, reset_cuda_graphs: bool, offload_optimizer_during_inference: bool, offload_kv_cache_during_training: bool, @@ -2241,7 +2241,7 @@ def megatron_rl_inference_mode( Args: model: model to prepare. optimizer: optimizer used to train the model. - cuda_graph_impl: which cuda graph implementation to use. + enable_cuda_graph: use cuda graphs or not. reset_cuda_graphs: rebuild cuda graphs for each inference stage or not. offload_optimizer_during_inference: move optimizer to cpu during inference or not. offload_kv_cache_during_training: manually offload kv cache to host before training or not. @@ -2252,7 +2252,7 @@ def megatron_rl_inference_mode( """ args = get_args() - loop = get_asyncio_loop() + loop = get_event_loop() nvtx_range = get_nvtx_range() print(f"[{dist.get_rank()}:DP] Entering inference mode") @@ -2275,9 +2275,8 @@ def megatron_rl_inference_mode( with nvtx_range("offload-optimizer-before-inference"): optimizer.offload_to_cpu() - # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to. - if cuda_graph_impl != "none": - toggle_cuda_graphs(lang_module, cuda_graph_impl, reset_cuda_graphs=reset_cuda_graphs) + if enable_cuda_graph: + toggle_cuda_graphs(lang_module, True, reset_cuda_graphs=reset_cuda_graphs) inference_interface = get_inference_interface(args, loop, model) @@ -2287,28 +2286,25 @@ def megatron_rl_inference_mode( reset_cuda_graphs ), "reset_cuda_graphs must be True when offloading kv cache during training" print( - f"[{dist.get_rank()}:DP] Restoring kv cache ({inference_interface._inference_engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU" + f"[{dist.get_rank()}:DP] Restoring kv cache ({inference_interface._coordinator.engine.context.memory_buffer.numel() / 1024**3:.2f} GB) to GPU" ) - kv_cache = inference_interface._inference_engine.context.memory_buffer - inference_interface._inference_engine.context.memory_buffer = kv_cache.cuda() + kv_cache = inference_interface._coordinator.engine.context.memory_buffer + inference_interface._coordinator.engine.context.memory_buffer = kv_cache.cuda() elif remove_kv_cache_during_training: - if inference_interface._inference_engine.context.memory_buffer is None: - inference_interface._inference_engine.context.build_memory_buffer() + if inference_interface._coordinator.engine.context.memory_buffer is None: + inference_interface._coordinator.engine.context.build_memory_buffer() - # TODO: Improve this if statement once a change is made to CUDA graph handling. - cuda_graph_exists = len(_CudagraphGlobalRecord.cudagraph_inference_record) != 0 - if cuda_graph_impl != "none" and not cuda_graph_exists: + if enable_cuda_graph and not _CudagraphGlobalRecord.cudagraph_created: with nvtx_range("wait-for-decode-only"): - while not inference_interface._inference_engine.context.is_decode_only(): + while not inference_interface._coordinator.engine.context.is_decode_only(): active_requests, finished_requests, step_time = loop.run_until_complete( - inference_interface._inference_engine.async_step() + inference_interface._coordinator.engine.async_step() ) with nvtx_range("build-cuda-graphs"): - inference_interface._inference_engine.create_cuda_graphs(reset_context=True) + inference_interface._coordinator.engine.build_cuda_graphs(reset_context=False) - loop.run_until_complete(inference_interface.resume()) + inference_interface.resume() - print(f"[{dist.get_rank()}:DP] Entered inference mode") yield inference_interface with nvtx_range("suspend-engine"): @@ -2316,17 +2312,16 @@ def megatron_rl_inference_mode( with nvtx_range("offload-kv-cache-after-inference"): if offload_kv_cache_during_training: - kv_cache = inference_interface._inference_engine.context.memory_buffer + kv_cache = inference_interface._coordinator.engine.context.memory_buffer print( f"[{dist.get_rank()}:DP] Offloading kv cache ({kv_cache.numel() * kv_cache.element_size() / 1024**3:.2f} GB) to CPU" ) - inference_interface._inference_engine.context.memory_buffer = kv_cache.cpu() + inference_interface._coordinator.engine.context.memory_buffer = kv_cache.cpu() elif remove_kv_cache_during_training: - inference_interface._inference_engine.context.memory_buffer = None + inference_interface._coordinator.engine.context.memory_buffer = None - # TODO: Remove this if statement once a change to `toggle_cuda_graphs` makes it safe to. - if cuda_graph_impl != "none": - toggle_cuda_graphs(lang_module, 'none', reset_cuda_graphs=reset_cuda_graphs) + if enable_cuda_graph: + toggle_cuda_graphs(lang_module, False, reset_cuda_graphs=reset_cuda_graphs) if offload_optimizer_during_inference: with nvtx_range("onload-optimizer-after-inference"): @@ -2353,7 +2348,7 @@ def get_iteration_sequence_count(args): def update_sequence_packing_metrics(args): """Update bin tracking for sequence packing mode.""" - if args.rl_use_sequence_packing: + if args.use_sequence_packing: bin_count = ( mpu.get_data_parallel_world_size() * args.micro_batch_size * get_num_microbatches() ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index be667e32419..bb1b17e9ba2 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -9,6 +9,7 @@ from pathlib import Path import re import types +import warnings import torch import torch.nn.functional as F @@ -34,7 +35,6 @@ ) from megatron.core.activations import squared_relu from megatron.core.fusions.fused_bias_geglu import quick_gelu -from megatron.training.dist_signal_handler import SIGNAL_MAP from megatron.training.utils import ( get_device_arch_version, update_use_dist_ckpt, @@ -1062,6 +1062,8 @@ def validate_args(args, defaults={}): # MoE Spec check if args.num_experts == 0: args.num_experts = None + if args.num_experts is not None: + assert args.spec is None, "Model Spec must be None when using MoEs" if args.num_experts is not None and args.moe_ffn_hidden_size is None: args.moe_ffn_hidden_size = args.ffn_hidden_size print("Warning: moe_ffn_hidden_size is not set, using ffn_hidden_size for MoE instead.") @@ -1106,20 +1108,6 @@ def validate_args(args, defaults={}): any([args.train_data_path, args.valid_data_path, args.test_data_path]) \ <= 1, "A single data source must be provided in training mode, else None" - if args.fim_data: - extra_tokens = [ - args.fim_prefix_token, - args.fim_middle_token, - args.fim_suffix_token, - args.fim_pad_token, - args.fim_eod_token, - ] - assert not args.mock_data, "Mock dataset is not supported with FIM dataset." - assert not args.legacy_tokenizer, "FIM dataset is not supported with legacy tokenizers." - assert args.fim_rate, "--fim-rate should be specified." - assert args.fim_spm_rate, "--fim-spm-rate should be specified." - assert all(token is not None for token in extra_tokens), "FIM extra tokens should be specified." - # Deterministic mode if args.deterministic_mode: assert not args.use_flash_attn, "Flash attention can not be used in deterministic mode." @@ -1194,6 +1182,7 @@ def validate_args(args, defaults={}): if args.inference_dynamic_batching: assert args.inference_dynamic_batching_buffer_size_gb is not None assert args.inference_dynamic_batching_block_size % 256 == 0, "block size should be a multiple of 256" + assert args.inference_dynamic_batching_buffer_guaranteed_fraction is not None # MoE upcycling check if args.moe_use_upcycling: @@ -1418,7 +1407,7 @@ def _add_transformer_engine_args(parser): help='Execute wgrad in higher precision even for FP8 runs', dest='fp8_wgrad') group.add_argument('--transformer-impl', default='transformer_engine', - choices=['local', 'transformer_engine', 'inference_optimized'], + choices=['local', 'transformer_engine'], help='Which Transformer implementation to use.') group.add_argument('--fallback-to-eager-attn', action='store_true', help='Fallback to eager attention in TE implementation. ' @@ -1527,22 +1516,34 @@ def _add_inference_args(parser): help='Enable dynamic batching mode.') group.add_argument('--inference-dynamic-batching-buffer-size-gb', type=float, default=40., - help='Amount of on-GPU memory allocated for the KV cache. ' - 'The total amount of memory allocated for the KV cache ' - '(CPU + GPU memory) depends on the value set for the ' - 'unified virtual memory (UVM) level (via ' - '`--inference-dynamic-batching-unified-memory-level`).' - 'If the UVM level is 0, then only GPU memory is used and ' - 'the total memory equals `buffer_size_gb`. If the UVM ' - 'level is 1, then additional memory is utilized on the ' - 'CPU and the total memory equals `2 * buffer_size_gb`.') + help='Total buffer size (GB) allocated for the block-level KV ' + 'memory.') group.add_argument('--inference-dynamic-batching-block-size', type=int, default=256, help='KV cache block size. ' 'It should be a multiple of 256') - group.add_argument('--inference-dynamic-batching-max-tokens', + group.add_argument('--inference-dynamic-batching-buffer-guaranteed-fraction', + type=float, default=0.2, + help='Space is reserved within the inference context ' + 'memory buffer to guarantee that a minimum number of ' + 'active requests will always be able to run to ' + 'completion. This is to avoid the context being deadlocked ' + 'by paused requests.') + group.add_argument('--inference-dynamic-batching-buffer-overflow-factor', + type=float, default=None, + help='Scaling factor over the memory buffer size for auto ' + 'computing `max_requests` and `max_tokens`. This scaling ' + 'factor is used for fitting more requests and tokens in ' + 'the memory buffer than it can safely hold, which in turn ' + 'increases throughput.') + group.add_argument('--inference-dynamic-batching-max-requests-override', + type=int, default=None, + help='If set, this overrides the max requests as computed ' + 'from `--inference-dynamic-batching-buffer-overflow-factor`.') + group.add_argument('--inference-dynamic-batching-max-tokens-override', type=int, default=None, - help='Override the inference context\'s default `max_tokens`.') + help='If set, this overrides the max tokens as computed ' + 'from `--inference-dynamic-batching-buffer-overflow-factor`.') group.add_argument('--inference-dynamic-batching-num-cuda-graphs', type=int, default=16, help='Maximum number of cuda graphs to capture, where the ' @@ -1559,7 +1560,7 @@ def _add_inference_args(parser): action='store_true', default=False, help='Only use cuda graphs for decode-only steps, not prefill and mixed steps.') group.add_argument('--inference-dynamic-batching-unified-memory-level', - type=int, default=1, choices=[0, 1], + type=int, default=0, choices=[0, 1], help='Set unified memory usage within the dynamic ' 'inference context. The levels are: 0) no unified memory, ' '1) allocate `memory_buffer` in unified memory. ' @@ -1579,8 +1580,7 @@ def _add_inference_args(parser): group.add_argument('--inference-wandb-logging-step-interval', type=int, default=0, help='Step interval for logging inference metrics to wandb. ' 'Default to 0 to disable inference wandb logging.') - group.add_argument("--inference-coordinator-port", type=int, default=12346, - help="This port will be used to setup the inference coordinator on node-0") + return parser @@ -2273,10 +2273,7 @@ def _add_training_args(parser): help='Exit the program after this many minutes.') group.add_argument('--exit-signal-handler', action='store_true', help='Dynamically save the checkpoint and shutdown the ' - 'training if signal is received') - group.add_argument('--exit-signal', type=str, default='SIGTERM', - choices=list(SIGNAL_MAP.keys()), - help='Signal to use for exit signal handler. If not specified, defaults to SIGTERM.') + 'training if SIGTERM is received') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') group.add_argument('--no-masked-softmax-fusion', @@ -3046,27 +3043,6 @@ def _add_data_args(parser): 'If instead this argument is set, the training flow will treat all tokens ' 'that share the same id as the pad token as true pad tokens, potentially ' 'causing severe training instability.') - group.add_argument('--fim-data', action='store_true', help='Whether to use the FIM dataset.') - group.add_argument('--fim-rate', type=float, default=0.5, - help='Probability to convert a training sample into a FIM format.') - group.add_argument('--fim-spm-rate', type=float, default=0.5, - help='Probability that the a FIM sample uses the SPM format over the PSM format.') - group.add_argument('--fim-split-sample', type=str, default=None, - help='String around which to split the sample for FIM.') - group.add_argument('--fim-fragment-rate', type=float, default=None, - help='Rate of FIM on each fragment when --fim-split-sample is not None.') - group.add_argument('--fim-no-prefix', type=str, default=None, - help='Do not apply FIM to fragments that start with this prefix') - group.add_argument('--fim-prefix-token', type=str, default='', - help='FIM prefix token') - group.add_argument('--fim-middle-token', type=str, default='', - help='FIM middle token') - group.add_argument('--fim-suffix-token', type=str, default='', - help='FIM suffix token') - group.add_argument('--fim-pad-token', type=str, default='', - help='FIM PAD token') - group.add_argument('--fim-eod-token', type=str, default='<|endoftext|>', - help='FIM EOD token') return parser diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 48a2025fa63..feacccba162 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -270,7 +270,7 @@ def checkpoint_exists(checkpoints_path): def read_metadata(tracker_filename): # Read the tracker file and either set the iteration or # mark it as a release checkpoint. - iteration = -1 + iteration = 0 release = False with open_file(tracker_filename, 'r') as f: @@ -283,10 +283,7 @@ def read_metadata(tracker_filename): print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( tracker_filename)) sys.exit() - else: - # Set iteration to 0 for release checkpoints - iteration = 0 - assert iteration > -1 or release, 'error parsing metadata file {}'.format( + assert iteration > 0 or release, 'error parsing metadata file {}'.format( tracker_filename) # Get the max iteration retrieved across the ranks. @@ -1831,16 +1828,6 @@ def load_model_state_dict(module, state_dict, strict: bool): is_local_chkpt = (ckpt_type == CheckpointType.LOCAL) ft_integration.on_checkpoint_loaded(is_local_chkpt=is_local_chkpt) - # Patch checkpoint as needed if required field is not found. - if optimizer is not None: - log_printed = False - for param_group in optimizer.param_groups: - if 'default_config' not in param_group: - param_group['default_config'] = True - if not log_printed: - print_rank_0(">>> Inserting 'default_config' field into optimizer.param_groups...") - log_printed = True - return iteration, num_floating_point_operations_so_far diff --git a/megatron/training/datasets/README.md b/megatron/training/datasets/README.md deleted file mode 100644 index d5543c3d1b5..00000000000 --- a/megatron/training/datasets/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Data Pipeline - -## FIM dataset - -`GPTFIMDataset` extends Megatron-Core’s `GPTDataset` to support **Fill-in-the-Middle (FIM)** data augmentation. -It probabilistically converts samples into FIM format using configurable rates, with support for both PSM and SPM patterns, fragment-level splitting, and length-preserving output. - -`GPTFIMDatasetConfig` provides the configuration needed to enable this behavior. -`GPTFIMDatasetConfig` configuration object extending `GPTDatasetConfig` to enable FIM preprocessing. - -**Attributes** - -- `rate`: Probability of converting a sample into a FIM example. A value of `1.0` means FIM is always applied. a value of `0.0` means FIM is never applied. -- `spm_rate`: Probability of using the SPM FIM pattern (vs PSM). The remaining probability (`1 - spm_rate`) selects the PSM (prefix-suffix-middle) pattern instead. For example, if `spm_rate = 0.3`: 30% SPM, 70% PSM. -- `extra_tokens`: Dictionary containing the FIM special tokens: {"prefix", "middle", "suffix", "pad", "eod"}. -- `split_sample`: Optional token around which samples are split before applying FIM. If provided, the input sequence is divided at every occurrence of this token, and FIM is applied independently to each fragment. `A B C D E F G H` -> `FIM(Fragment 1) FIM(Fragment 2) FIM(Fragment 3)`. -- `fragment_rate`: Probability of applying FIM to each fragment when split_sample is used. -- `no_prefix`: If the decoded sequence starts with this prefix, FIM is skipped. -`GPTFIMDataset` dataset class that loads token sequences from an `IndexedDataset` and applies FIM transformations before returning each sample. - -**PSM Format** -``` -[prefix_tok] prefix [suffix_tok] suffix [middle_tok] middle -``` - -**SPM Format** -``` -[prefix_tok, suffix_tok] suffix [middle_tok] prefix middle -``` - -**Special cases:** - -- If the sequence starts with no_prefix, FIM is skipped. -- If FIM is not applied, the sample is returned unchanged. \ No newline at end of file diff --git a/megatron/training/datasets/fim_dataset.py b/megatron/training/datasets/fim_dataset.py deleted file mode 100644 index 730b7e033a1..00000000000 --- a/megatron/training/datasets/fim_dataset.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -from typing import Dict, Tuple, Optional -from dataclasses import dataclass, field - -import numpy as np -import logging -from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig -from megatron.core.datasets.indexed_dataset import IndexedDataset -from megatron.core.datasets.utils import Split - -logger = logging.getLogger(__name__) - - -@dataclass -class GPTFIMDatasetConfig(GPTDatasetConfig): - """Configuration object for Megatron Core GPT FIM datasets""" - - fim_rate: float = None - """Probability to convert a training sample into a FIM format""" - - fim_spm_rate: float = None - """Probability that the a FIM sample uses the SPM format over the PSM format""" - - fim_extra_tokens: Dict = None - """FIM extra tokens. Should consist of prefix, middle, suffix, PAD, and EOD tokens.""" - - fim_split_sample: Optional[str] = None - """String around which to split the sample for FIM""" - - fim_fragment_rate: Optional[float] = None - """Rate of FIM on each fragment when split_sample is not None""" - - fim_no_prefix: Optional[str] = None - """Do not apply FIM to fragments that start with this prefix""" - - -class GPTFIMDataset(GPTDataset): - """The base GPT dataset - - Args: - indexed_dataset (IndexedDataset): The IndexedDataset around which to build the - MegatronDataset - - indexed_indices (np.ndarray): The set of the documents indices to expose - - num_samples (int): The number of samples to draw from the indexed dataset - - index_split (Split): The indexed_indices Split - - config (GPTFIMDatasetConfig): The GPT-specific container for all config sourced parameters - """ - - def __init__( - self, - indexed_dataset: IndexedDataset, - dataset_path: str, - indexed_indices: np.ndarray, - num_samples: int, - index_split: Split, - config: GPTFIMDatasetConfig, - ) -> None: - super().__init__( - indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config - ) - - self.np_rng = np.random.RandomState(seed=self.config.random_seed) - logger.info(f"Initialized FIM RNG with seed = {self.config.random_seed}") - # get FIM params - self.fim_rate = self.config.fim_rate - self.fim_spm_rate = self.config.fim_spm_rate - self.fragment_fim_rate = self.config.fim_fragment_rate - fim_split_sample = self.config.fim_split_sample - self.no_fim_prefix = self.config.fim_no_prefix - if fim_split_sample: - fim_split_sample_ids = self.config.tokenizer._tokenizer.tokens_to_ids(fim_split_sample) - assert isinstance(fim_split_sample_ids, int) or len(fim_split_sample_ids) == 1 - self.fim_split_sample = ( - fim_split_sample_ids - if isinstance(fim_split_sample_ids, int) - else fim_split_sample_ids[0] - ) - else: - self.fim_split_sample = None - - # get extra tokens ids - fim_tokens = self.config.fim_extra_tokens - fim_tokens = [ - fim_tokens["prefix"], - fim_tokens["middle"], - fim_tokens["suffix"], - fim_tokens["pad"], - fim_tokens["eod"], - ] - fim_tokens_ids = self.config.tokenizer._tokenizer.tokens_to_ids(fim_tokens) - ( - self.prefix_tok_id, - self.middle_tok_id, - self.suffix_tok_id, - self.pad_tok_id, - self.eod_tok_id, - ) = fim_tokens_ids - - def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: - """Get the text (token ids) and document ids for a given index - - Args: - idx (int): The index into the dataset - - Returns: - Tuple[np.ndarray, np.ndarray]: The text ids and document ids - """ - # Do the shuffle mapping - idx = self.shuffle_index[idx] - - # Get the beginning and end documents and offsets - doc_index_beg, doc_index_beg_offset = self.sample_index[idx] - doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] - - document_ids = [] - sample_parts = [] - - # Sample spans a single document - if doc_index_beg == doc_index_end: - # Add the document id - document_ids.append(self.document_index[doc_index_beg]) - - # Add the entire sample - sample_parts.append( - self.dataset.get( - self.document_index[doc_index_beg], - offset=doc_index_beg_offset, - length=doc_index_end_offset - doc_index_beg_offset + 1, - ) - ) - - # Sample spans multiple documents - else: - for i in range(doc_index_beg, doc_index_end + 1): - # Add the document id - document_ids.append(self.document_index[i]) - - # Add the sample part - offset = 0 if i > doc_index_beg else doc_index_beg_offset - length = None if i < doc_index_end else doc_index_end_offset + 1 - sample_parts.append( - self.dataset.get(self.document_index[i], offset=offset, length=length) - ) - - sample = np.concatenate(sample_parts) - - sample_len = sample.shape[0] - segment_breaks = np.argwhere(sample == self.eod_tok_id) - - if segment_breaks.shape != (0, 1): # then there is an EOD token in this example - curr_start_position = 0 - new_samples = [] - for loc in np.nditer(segment_breaks): - # Only permute non-empty segments. - if loc - curr_start_position > 0: - # permute {prefix, suffix, middle} or {suffix, prefix, middle} - permuted = self._fim_split_and_permute_sequence(sample[curr_start_position:loc]) - new_samples += [permuted, [self.eod_tok_id]] - - curr_start_position = loc + 1 # jump over the EOD token - # Permute the segment after the last EOD - permuted = self._fim_split_and_permute_sequence(sample[curr_start_position:]) - new_samples.append(permuted) - - sample = np.concatenate(new_samples) - else: - sample = self._fim_split_and_permute_sequence(sample) - - diff = sample.shape[0] - sample_len - if diff > 0: # too long - sample = sample[:sample_len] - elif diff < 0: # too short - sample = np.concatenate([sample, np.full((-1 * diff), self.pad_tok_id)]) - - assert sample.shape[0] == sample_len - - return (np.array(sample, dtype=np.int64), np.array(document_ids, dtype=np.int64)) - - def _fim_permute_sequence(self, sequence, rate): - return self._permute( - sequence, - rate, - self.fim_spm_rate, - self.config.tokenizer, - truncate_or_pad=False, - suffix_tok_id=self.suffix_tok_id, - prefix_tok_id=self.prefix_tok_id, - middle_tok_id=self.middle_tok_id, - pad_tok_id=self.pad_tok_id, - no_fim_prefix=self.no_fim_prefix, - ) - - def _fim_split_and_permute_sequence(self, sequence): - """ - If self.fim_split_sample is not None, split the sequence. - Then apply FIM on the fragments, or the whole sequence if self.fim_split_sample is None. - """ - if self.fim_split_sample is None: - return self._fim_permute_sequence(sequence, self.fim_rate) - # fim_split_sample is set: split the sample on this token and permute each fragment separately. - # Typically, if each sample is a repository, then we split again on the file level. - # Each fragment is a file, and we permute the files. - fragment_breaks = np.argwhere(sequence == self.fim_split_sample) - if fragment_breaks.shape == (0, 1): - # no split token in this sample - return self._fim_permute_sequence(sequence, self.fim_rate) - if not self.np_rng.binomial(1, self.fim_rate): - # don't do FIM preproc - return sequence - # Do FIM on each fragment - curr_start_position = 0 - new_samples = [] - for loc in np.nditer(fragment_breaks): - if loc - curr_start_position > 0: - permuted = self._fim_permute_sequence( - sequence[curr_start_position:loc], self.fragment_fim_rate - ) - new_samples += [permuted, [self.fim_split_sample]] - curr_start_position = loc + 1 # Jump over the split token - # Permute the segment after the last split token - permuted = self._fim_permute_sequence( - sequence[curr_start_position:], self.fragment_fim_rate - ) - new_samples.append(permuted) - - return np.concatenate(new_samples) - - def _permute( - self, - sample, - fim_rate, - fim_spm_rate, - tokenizer, - truncate_or_pad=True, - suffix_tok_id=None, - prefix_tok_id=None, - middle_tok_id=None, - pad_tok_id=None, - no_fim_prefix=None, - ): - """ - Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. - Maintain the same sample length (if transform creates a few extra tokens, drop them). - """ - if self.np_rng.binomial(1, fim_rate): # sample bernoulli dist - - contents = tokenizer._tokenizer.ids_to_text(sample) - - # Do not apply FIM if the sample starts with no_fim_prefix - if no_fim_prefix is not None and contents.startswith(no_fim_prefix): - return sample - - try: - # A boundary can be =0 (prefix will be empty) - # a boundary can be =len(contents) (suffix will be empty) - # The two boundaries can be equal (middle will be empty) - boundaries = list(self.np_rng.randint(low=0, high=len(contents) + 1, size=2)) - boundaries.sort() - except ValueError as e: - print(len(contents), contents) - print(e) - raise e - - prefix = contents[: boundaries[0]] - middle = contents[boundaries[0] : boundaries[1]] - suffix = contents[boundaries[1] :] - - prefix = np.array([*tokenizer._tokenizer.text_to_ids(prefix)], dtype=np.int64) - middle = np.array([*tokenizer._tokenizer.text_to_ids(middle)], dtype=np.int64) - suffix = np.array([*tokenizer._tokenizer.text_to_ids(suffix)], dtype=np.int64) - - # here we truncate each given segment to fit the same length as it was before - # A consequence is that we never reach the end of a file? - # we should rather truncate at the context-level - if truncate_or_pad: - # need to make same length as the input. Take the 3 sentinel tokens into account - new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 - diff = new_length - sample.shape[0] - if diff > 0: # too long - if ( - suffix.shape[0] <= diff - ): # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening - return sample - suffix = suffix[: suffix.shape[0] - diff] - elif diff < 0: # too short - suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) - - if self.np_rng.binomial(1, fim_spm_rate): - # SPM (variant 2 from FIM paper) - new_sample = np.concatenate( - [[prefix_tok_id, suffix_tok_id], suffix, [middle_tok_id], prefix, middle] - ) - else: - # PSM - new_sample = np.concatenate( - [[prefix_tok_id], prefix, [suffix_tok_id], suffix, [middle_tok_id], middle] - ) - - else: - # don't do FIM preproc - new_sample = sample - - return new_sample diff --git a/megatron/training/dist_signal_handler.py b/megatron/training/dist_signal_handler.py index f1f3725c8a9..f4b4fbf5c0d 100644 --- a/megatron/training/dist_signal_handler.py +++ b/megatron/training/dist_signal_handler.py @@ -3,12 +3,6 @@ import torch -SIGNAL_MAP = { - 'SIGTERM': signal.SIGTERM, - 'SIGINT': signal.SIGINT, - 'SIGUSR1': signal.SIGUSR1, - 'SIGUSR2': signal.SIGUSR2 -} def get_world_size(): if torch.distributed.is_available() and torch.distributed.is_initialized(): @@ -55,8 +49,8 @@ def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): class DistributedSignalHandler: - def __init__(self, sig: str = 'SIGTERM'): - self.sig = SIGNAL_MAP.get(sig, signal.SIGTERM) + def __init__(self, sig=signal.SIGTERM): + self.sig = sig def signals_received(self): all_received = all_gather_item( diff --git a/megatron/training/global_vars.py b/megatron/training/global_vars.py index a718877b40c..ec402263d29 100644 --- a/megatron/training/global_vars.py +++ b/megatron/training/global_vars.py @@ -11,7 +11,7 @@ from megatron.core.energy_monitor import EnergyMonitor from megatron.core.jit import disable_jit_fuser from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator, unset_num_microbatches_calculator -from megatron.training.dist_signal_handler import DistributedSignalHandler +from megatron.training import dist_signal_handler from megatron.training.tokenizer import build_tokenizer _GLOBAL_ARGS = None @@ -74,11 +74,10 @@ def get_signal_handler(): return _GLOBAL_SIGNAL_HANDLER -def _set_signal_handler(exit_signal): - +def _set_signal_handler(): global _GLOBAL_SIGNAL_HANDLER _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') - _GLOBAL_SIGNAL_HANDLER = DistributedSignalHandler(exit_signal).__enter__() + _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__() @@ -111,7 +110,7 @@ def set_global_variables(args, build_tokenizer=True): set_experimental_flag(True) if args.exit_signal_handler: - _set_signal_handler(args.exit_signal) + _set_signal_handler() if args.disable_jit_fuser: disable_jit_fuser() diff --git a/megatron/training/training.py b/megatron/training/training.py index 58dcfbde734..9986f931641 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -2,7 +2,6 @@ """Pretrain utilities.""" -import copy import dataclasses from datetime import datetime, timedelta import functools @@ -12,7 +11,7 @@ import math import os import sys -from typing import Any, Optional +from typing import List, Optional import torch.distributed @@ -34,7 +33,7 @@ except ImportError: has_rl_utils = False try: - from modelopt.torch.distill.plugins.megatron import ( + from megatron.post_training.algos.distillation import ( get_tensor_shapes_adjust_fn_for_distillation, ) @@ -76,7 +75,7 @@ from megatron.core.distributed import finalize_model_grads from megatron.core.enums import ModelType -from megatron.core.optimizer import get_megatron_optimizer, AdamOptimizerConfig, SGDOptimizerConfig, OptimizerConfig, ParamKey +from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig from megatron.core.optimizer.muon import get_megatron_muon_optimizer from megatron.core.rerun_state_machine import ( get_rerun_state_machine, @@ -88,7 +87,7 @@ from megatron.training.initialize import write_args_to_tensorboard from megatron.training.initialize import set_jit_fusion_options from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank -from megatron.training.datasets.data_samplers import build_pretraining_data_loader +from megatron.legacy.data.data_samplers import build_pretraining_data_loader from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler from megatron.core.transformer.moe import upcycling_utils from megatron.core.transformer.moe.moe_utils import track_moe_metrics @@ -162,32 +161,22 @@ def num_floating_point_operations(args, batch_size): def calculate_layer_counts(): """Calculate the number of attention, Mamba, and MLP layers.""" if args.hybrid_override_pattern: - counts = {'M': 0, '*': 0, '-': 0, 'E':0} + counts = {'M': 0, '*': 0, '-': 0} for layer_type in args.hybrid_override_pattern: if layer_type in counts: counts[layer_type] += 1 - return counts['*'], counts['M'], counts['-'], counts['E'] + return counts['*'], counts['M'], counts['-'] else: num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio) num_mlp_layers = round(args.num_layers * args.hybrid_mlp_ratio) num_mamba_layers = args.num_layers - num_attn_layers - num_mlp_layers - num_moe_layers = 0 - return num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers + return num_attn_layers, num_mamba_layers, num_mlp_layers def mlp_layer_flops(batch_size, seq_len, hidden_size, expansion=4.0, swiglu=False): """Calculate FLOPs for an MLP layer.""" scale_factor = 3.0 / 2.0 if swiglu else 1.0 return 4 * expansion * scale_factor * batch_size * seq_len * hidden_size**2 - def moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, - shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu=False): - """Calculate FLOPs for an MoE layer.""" - scale_factor = 3.0 / 2.0 if swiglu else 1.0 - routed_flops = (4 * batch_size * seq_len * hidden_size * - moe_ffn_hidden_size * num_experts_routed_to * scale_factor) - shared_flops = 4 * batch_size * seq_len * hidden_size * shared_expert_ffn_hidden_size * scale_factor - return routed_flops + shared_flops - def attn_layer_flops( batch_size, seq_len, hidden_size, num_heads, gqa=True, gqa_groups=8, kv_channels=None ): @@ -226,13 +215,12 @@ def mamba_layer_flops(batch_size, seq_len, hidden_size, state_dim=16, ) def hybrid_flops(batch_size, seq_len, hidden_size, - num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers, + num_attn_layers, num_mamba_layers, num_mlp_layers, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8, mamba_num_heads=128, - num_attn_heads=32, gqa=True, + num_attn_heads=32,gqa=True, gqa_groups=8, kv_channels=None, mlp_expansion=4.0, swiglu=False, - moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1, vocab_size=256000): """Calculate total FLOPs for the hybrid model.""" flops_fwd = ( @@ -243,8 +231,6 @@ def hybrid_flops(batch_size, seq_len, hidden_size, num_mamba_layers * mamba_layer_flops(batch_size, seq_len, hidden_size, mamba_state_dim, mamba_head_dim, mamba_num_groups, mamba_num_heads) + - num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size, - shared_expert_ffn_hidden_size, num_experts_routed_to, swiglu) + (2 * batch_size * seq_len * hidden_size * vocab_size) # logits computation ) return flops_fwd * 3 @@ -493,7 +479,7 @@ def transformer_flops(): # Main entrypoint for FLOPs calculation. if args.is_hybrid_model: # Calculate the number of each type of layer. - num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers = calculate_layer_counts() + num_attn_layers, num_mamba_layers, num_mlp_layers = calculate_layer_counts() # Compute hybrid model FLOPs. return hybrid_flops( @@ -503,7 +489,6 @@ def transformer_flops(): num_attn_layers=num_attn_layers, num_mamba_layers=num_mamba_layers, num_mlp_layers=num_mlp_layers, - num_moe_layers=num_moe_layers, mamba_state_dim=args.mamba_state_dim, mamba_head_dim=args.mamba_head_dim, mamba_num_groups=args.mamba_num_groups, @@ -514,11 +499,6 @@ def transformer_flops(): kv_channels=args.kv_channels, mlp_expansion=args.ffn_hidden_size / args.hidden_size, swiglu=args.swiglu, - moe_ffn_hidden_size=(args.moe_ffn_hidden_size if args.moe_ffn_hidden_size is not None - else args.ffn_hidden_size), - shared_expert_ffn_hidden_size=(0 if args.moe_shared_expert_intermediate_size is None - else args.moe_shared_expert_intermediate_size), - num_experts_routed_to=args.moe_router_topk, vocab_size=args.padded_vocab_size, ) else: @@ -614,6 +594,30 @@ def reorder_inner_param_groups(optimizer_state_dict): return preprocessed_common_state_dict +def get_no_weight_decay_cond(no_weight_decay_cond_type, default_skip_embedding_weight_decay): + """Get the no weight decay condition function.""" + + # Default case: no_weight_decay_cond_type is None + no_weight_decay_cond_fn = None + + if no_weight_decay_cond_type == 'apply_wd_to_qk_layernorm': + # Qwen3-Next applies weight decay to qk layernorm as a special case + def apply_wd_to_qk_layernorm_fn(name, param): + if "q_layernorm" in name or "k_layernorm" in name: + no_wd = False + else: + no_wd = ( + name.endswith(".bias") + or len(param.shape) == 1 + or (default_skip_embedding_weight_decay and "embedding" in name) + ) + return no_wd + no_weight_decay_cond_fn = apply_wd_to_qk_layernorm_fn + elif no_weight_decay_cond_type is not None: + raise ValueError(f"Invalid no_weight_decay_cond_type: {no_weight_decay_cond_type}") + + return no_weight_decay_cond_fn + def pretrain( train_valid_test_dataset_provider, model_provider, @@ -750,8 +754,15 @@ def pretrain( # Model, optimizer, and learning rate. timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + no_weight_decay_cond = get_no_weight_decay_cond( + args.no_weight_decay_cond_type, + default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, + ) model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, model_type, checkpointing_context=checkpointing_context + model_provider, + model_type, + checkpointing_context=checkpointing_context, + no_weight_decay_cond=no_weight_decay_cond, ) timers('model-and-optimizer-setup').stop() @@ -1167,45 +1178,12 @@ def get_optimizer_param_scheduler(optimizer): return opt_param_scheduler -def get_megatron_optimizer_config(args: Any) -> OptimizerConfig: - """Return a Megatron optimizer config object from Megatron's arguments.""" - - config = None - if args.optimizer == 'adam' or 'muon' in args.optimizer: - # TODO(deyuf): Muon needs both adam + muon but get() only receive one config - # So for now we keep using adam config that's back compat with old way - kwargs = {} - for f in dataclasses.fields(AdamOptimizerConfig): - if hasattr(args, f.name): - kwargs[f.name] = getattr(args, f.name) - config = AdamOptimizerConfig(**kwargs) - elif args.optimizer == 'sgd': - kwargs = {} - for f in dataclasses.fields(SGDOptimizerConfig): - if hasattr(args, f.name): - kwargs[f.name] = getattr(args, f.name) - config = SGDOptimizerConfig(**kwargs) - else: - raise ValueError("Invalid optimizer type!") - - # Construct the appropriate config_overrides object. - # TODO: add more logic here as needed down the road. - if args.decoupled_lr is not None: - decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") - decoupled_optimizer_config = copy.deepcopy(config) - decoupled_optimizer_config.lr = args.decoupled_lr - if args.decoupled_min_lr is not None: - decoupled_optimizer_config.min_lr = args.decoupled_min_lr - config_overrides = {decoupled_param_key: decoupled_optimizer_config} - else: - config_overrides = None - - return config, config_overrides - - def setup_model_and_optimizer( model_provider_func, model_type, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0, checkpointing_context=None, ): """Setup model and optimizer.""" @@ -1217,25 +1195,33 @@ def setup_model_and_optimizer( unwrapped_model = unwrap_model(model) one_logger and one_logger.log_metrics({"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()}) - config, config_overrides = get_megatron_optimizer_config(args) + kwargs = {} + for f in dataclasses.fields(OptimizerConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + config = OptimizerConfig(**kwargs) config.timers = timers if 'muon' not in config.optimizer: - # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings - # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 - # default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, optimizer = get_megatron_optimizer( config, model, - config_overrides=config_overrides, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, use_gloo_process_groups=args.enable_gloo_process_groups, + # If the user is asking for a non-zero embedding init std, skip weight decay for embeddings + # to avoid embeddings from shrinking to zero as recommended in https://arxiv.org/abs/2312.16903 + default_skip_embedding_weight_decay=args.embedding_init_method_std is not None, dump_param_to_param_group_map=args.dump_param_to_param_group_map, ) else: optimizer = get_megatron_muon_optimizer( config, model, - config_overrides=config_overrides, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, use_gloo_process_groups=args.enable_gloo_process_groups, layer_wise_distributed_optimizer='dist' in config.optimizer, ) @@ -1379,10 +1365,7 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch if has_nvidia_modelopt: # [ModelOpt]: Pipeline-parallel Distillation stacks student and teacher tensors adjust_tensor_shapes_fn = get_tensor_shapes_adjust_fn_for_distillation( - model, - seq_length=args.seq_length, - micro_batch_size=args.micro_batch_size, - decoder_seq_length=args.decoder_seq_length, + model, args.seq_length, args.micro_batch_size, args.decoder_seq_length ) else: adjust_tensor_shapes_fn = None @@ -1511,6 +1494,7 @@ def training_log( loss_dict, total_loss_dict, learning_rate, + decoupled_learning_rate, iteration, loss_scale, report_memory_flag, @@ -1615,6 +1599,8 @@ def training_log( writer.add_scalar('learning-rate vs samples', learning_rate, args.consumed_train_samples) if wandb_writer: wandb_writer.log({'learning-rate': learning_rate}, iteration) + if args.decoupled_lr is not None: + writer.add_scalar('decoupled-learning-rate', decoupled_learning_rate, iteration) if args.skipped_train_samples > 0: writer.add_scalar('skipped-train-samples', args.skipped_train_samples, iteration) if wandb_writer: @@ -1694,12 +1680,6 @@ def training_log( track_names.append("global_load_balancing_loss") if args.moe_z_loss_coeff is not None: track_names.append("z_loss") - - if args.is_hybrid_model: - layers = args.hybrid_override_pattern.count('E') - else: - layers = args.num_layers - track_moe_metrics( loss_scale=moe_loss_scale, iteration=iteration, @@ -1709,7 +1689,7 @@ def training_log( per_layer_logging=args.moe_per_layer_logging, force_initialize=True, track_names=track_names, - num_layers=layers, + num_layers=args.num_layers, moe_layer_freq=args.moe_layer_freq, mtp_num_layers=args.mtp_num_layers, ) @@ -1770,6 +1750,14 @@ def training_log( wandb_writer.log({'power/gpu': power}, iteration) # Decoupled_learning_rate should be not None only on first and last pipeline stage. log_string += f' learning rate: {learning_rate:.6E} |' + if args.decoupled_lr is not None and ( + mpu.is_pipeline_first_stage(ignore_virtual=True) + or mpu.is_pipeline_last_stage(ignore_virtual=True) + ): + assert decoupled_learning_rate is not None + log_string += f' decoupled learning rate: {decoupled_learning_rate:.6E} |' + else: + assert decoupled_learning_rate is None log_string += f' global batch size: {batch_size:5d} |' for key in total_loss_dict: if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: @@ -2535,15 +2523,19 @@ def get_e2e_base_metrics(): if args.log_params_norm: params_norm = calc_params_l2_norm(model) learning_rate = None + decoupled_learning_rate = None for param_group in optimizer.param_groups: if len(param_group['params']) == 0: continue - if param_group['default_config']: + if param_group['is_decoupled_lr']: + decoupled_learning_rate = param_group['lr'] + else: learning_rate = param_group['lr'] report_memory_flag = training_log( loss_dict, total_loss_dict, learning_rate, + decoupled_learning_rate, iteration, loss_scale, report_memory_flag, diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 9b13d66c7a7..ecb7163ff70 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -20,7 +20,6 @@ from megatron.training.arguments import core_transformer_config_from_args from megatron.training import get_args, get_timers, get_tokenizer, inprocess_restart, pretrain, print_rank_0 from megatron.training.datasets.sft_dataset import SFTDataset -from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig from megatron.training.utils import ( get_batch_on_this_cp_rank, get_batch_on_this_tp_rank, @@ -186,49 +185,26 @@ def core_gpt_dataset_config_from_args(args): blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] blend, blend_per_split = get_blend_and_blend_per_split(args) - data_args = { - "random_seed": args.seed, - "sequence_length": args.seq_length, - "blend": blend, - "blend_per_split": blend_per_split, - "split": args.split, - "multiple_validation_sets": args.multiple_validation_sets, - "full_validation": args.full_validation, - "num_dataset_builder_threads": args.num_dataset_builder_threads, - "path_to_cache": args.data_cache_path, - "mmap_bin_files": args.mmap_bin_files, - "tokenizer": tokenizer, - "reset_position_ids": args.reset_position_ids, - "reset_attention_mask": args.reset_attention_mask, - "eod_mask_loss": args.eod_mask_loss, - "create_attention_mask": args.create_attention_mask_in_dataloader, - "object_storage_cache_path": args.object_storage_cache_path, - "mid_level_dataset_surplus": args.mid_level_dataset_surplus, - "allow_ambiguous_pad_tokens": args.allow_ambiguous_pad_tokens, - } - - # add FIM args to the config - if args.fim_data: - extra_tokens = { - "prefix": args.fim_prefix_token, - "middle": args.fim_middle_token, - "suffix": args.fim_suffix_token, - "pad": args.fim_pad_token, - "eod": args.fim_eod_token, - } - data_args.update( - { - "fim_rate": args.fim_rate, - "fim_spm_rate": args.fim_spm_rate, - "fim_extra_tokens": extra_tokens, - "fim_split_sample": args.fim_split_sample, - "fim_fragment_rate": args.fim_fragment_rate, - "fim_no_prefix": args.fim_no_prefix, - } - ) - return GPTFIMDatasetConfig(**data_args) - - return GPTDatasetConfig(**data_args) + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=blend, + blend_per_split=blend_per_split, + split=args.split, + multiple_validation_sets=args.multiple_validation_sets, + full_validation=args.full_validation, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + object_storage_cache_path=args.object_storage_cache_path, + mid_level_dataset_surplus=args.mid_level_dataset_surplus, + allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, + ) def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None): @@ -246,8 +222,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None else: if args.mock_data: dataset_type = MockGPTDataset - elif args.fim_data: - dataset_type = GPTFIMDataset else: dataset_type = GPTDataset diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json deleted file mode 100644 index cd90888e65d..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.89074, - "2": 10.89234, - "3": 10.89032, - "4": 10.89221, - "5": 10.89416, - "6": 10.90226, - "7": 10.8884, - "8": 10.90211, - "9": 10.90202, - "10": 10.88512, - "11": 10.87636, - "12": 10.89499, - "13": 10.89837, - "14": 10.89182, - "15": 10.85125, - "16": 10.8534, - "17": 10.82862, - "18": 10.83653, - "19": 10.82847, - "20": 10.74583, - "21": 10.73117, - "22": 10.61256, - "23": 10.72616, - "24": 10.62932, - "25": 10.59394, - "26": 10.63357, - "27": 10.63137, - "28": 10.58201, - "29": 10.58671, - "30": 10.40936, - "31": 10.15873, - "32": 10.48319, - "33": 10.46977, - "34": 10.23978, - "35": 10.28144, - "36": 10.23894, - "37": 10.35198, - "38": 10.20565, - "39": 10.40496, - "40": 10.09271, - "41": 10.16148, - "42": 10.2231, - "43": 9.84152, - "44": 9.97329, - "45": 9.84544, - "46": 9.82102, - "47": 10.14261, - "48": 9.86553, - "49": 9.54033, - "50": 9.9169 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1544.0, - "2": 1729.0, - "3": 1672.0, - "4": 1807.0, - "5": 1942.0, - "6": 1736.0, - "7": 1956.0, - "8": 1716.0, - "9": 2011.0, - "10": 1385.0, - "11": 1864.0, - "12": 1767.0, - "13": 2019.0, - "14": 1787.0, - "15": 1828.0, - "16": 1908.0, - "17": 1718.0, - "18": 1602.0, - "19": 1785.0, - "20": 1679.0, - "21": 1917.0, - "22": 1712.0, - "23": 2034.0, - "24": 1752.0, - "25": 1645.0, - "26": 1820.0, - "27": 1915.0, - "28": 1996.0, - "29": 2051.0, - "30": 1890.0, - "31": 1577.0, - "32": 1886.0, - "33": 2116.0, - "34": 1912.0, - "35": 2037.0, - "36": 1924.0, - "37": 2462.0, - "38": 2241.0, - "39": 2321.0, - "40": 2221.0, - "41": 2345.0, - "42": 2386.0, - "43": 2027.0, - "44": 2211.0, - "45": 2096.0, - "46": 2285.0, - "47": 2536.0, - "48": 2289.0, - "49": 2270.0, - "50": 2421.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 581489664.0, - "2": 581489664.0, - "3": 581489664.0, - "4": 581489664.0, - "5": 581489664.0, - "6": 581489664.0, - "7": 581489664.0, - "8": 581489664.0, - "9": 581489664.0, - "10": 581489664.0, - "11": 581489664.0, - "12": 581489664.0, - "13": 581489664.0, - "14": 581489664.0, - "15": 581489664.0, - "16": 581489664.0, - "17": 581489664.0, - "18": 581489664.0, - "19": 581489664.0, - "20": 581489664.0, - "21": 581489664.0, - "22": 581489664.0, - "23": 581489664.0, - "24": 581489664.0, - "25": 581489664.0, - "26": 581489664.0, - "27": 581489664.0, - "28": 581489664.0, - "29": 581489664.0, - "30": 581489664.0, - "31": 581489664.0, - "32": 581489664.0, - "33": 581489664.0, - "34": 581489664.0, - "35": 581489664.0, - "36": 581489664.0, - "37": 581489664.0, - "38": 581489664.0, - "39": 581489664.0, - "40": 581489664.0, - "41": 581489664.0, - "42": 581489664.0, - "43": 581489664.0, - "44": 581489664.0, - "45": 581489664.0, - "46": 581489664.0, - "47": 581489664.0, - "48": 581489664.0, - "49": 581489664.0, - "50": 581489664.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 4605814272.0, - "2": 4702430720.0, - "3": 4702430720.0, - "4": 4702430720.0, - "5": 4702430720.0, - "6": 4702430720.0, - "7": 4702430720.0, - "8": 4702430720.0, - "9": 4702430720.0, - "10": 4702430720.0, - "11": 4702430720.0, - "12": 4702430720.0, - "13": 4702430720.0, - "14": 4702430720.0, - "15": 4702430720.0, - "16": 4702430720.0, - "17": 4702430720.0, - "18": 4702430720.0, - "19": 4702430720.0, - "20": 4702430720.0, - "21": 4702430720.0, - "22": 4702430720.0, - "23": 4702430720.0, - "24": 4702430720.0, - "25": 4702430720.0, - "26": 4702430720.0, - "27": 4702430720.0, - "28": 4702430720.0, - "29": 4702430720.0, - "30": 4702430720.0, - "31": 4702430720.0, - "32": 4702430720.0, - "33": 4702430720.0, - "34": 4702430720.0, - "35": 4702430720.0, - "36": 4702430720.0, - "37": 4702430720.0, - "38": 4702430720.0, - "39": 4702430720.0, - "40": 4702430720.0, - "41": 4702430720.0, - "42": 4702430720.0, - "43": 4702430720.0, - "44": 4702430720.0, - "45": 4702430720.0, - "46": 4702430720.0, - "47": 4702430720.0, - "48": 4702430720.0, - "49": 4702430720.0, - "50": 4702430720.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 6.95394, - "2": 0.0878, - "3": 0.06953, - "4": 0.07916, - "5": 0.06775, - "6": 0.07681, - "7": 0.06695, - "8": 0.0786, - "9": 0.0664, - "10": 0.08059, - "11": 0.06554, - "12": 0.07501, - "13": 0.06663, - "14": 0.06608, - "15": 0.06585, - "16": 0.06738, - "17": 0.067, - "18": 0.06553, - "19": 0.06755, - "20": 0.06723, - "21": 0.06559, - "22": 0.0664, - "23": 0.06722, - "24": 0.06553, - "25": 0.06829, - "26": 0.06873, - "27": 0.06733, - "28": 0.06731, - "29": 0.06824, - "30": 0.06696, - "31": 0.06661, - "32": 0.06587, - "33": 0.06588, - "34": 0.06564, - "35": 0.06761, - "36": 0.06655, - "37": 0.06712, - "38": 0.06601, - "39": 0.06661, - "40": 0.06632, - "41": 0.0691, - "42": 0.06551, - "43": 0.06839, - "44": 0.06528, - "45": 0.06744, - "46": 0.0675, - "47": 0.06698, - "48": 0.0649, - "49": 0.06596, - "50": 0.06581 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml deleted file mode 100644 index ddc8286573b..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset/model_config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 12 - --hidden-size: 512 - --num-attention-heads: 8 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 32 - --seq-length: 1024 - --max-position-embeddings: 1024 - --train-iters: 50 - --timing-log-level: 0 - --lr-decay-iters: 320000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document - --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json - --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.00015 - --lr-decay-style: cosine - --min-lr: 1.0e-5 - --weight-decay: 1e-2 - --clip-grad: 1.0 - --lr-warmup-fraction: .01 - --log-interval: 1 - --save-interval: 10000 - --eval-interval: 1000 - --eval-iters: 10 - --transformer-impl: transformer_engine - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 1 - --use-distributed-optimizer: true - --deterministic-mode: true - --no-gradient-accumulation-fusion: true - --attention-softmax-in-fp32: true - --use-mcore-models: true - --ckpt-format: torch_dist - --dist-ckpt-strictness: log_all # backward compatibility for TE changes - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --attention-backend: unfused - --log-memory-to-tensorboard: true - --fim-data: true - --fim-rate: 0.5 - --fim-spm-rate: 0.5 -TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json index cbc5f4fa3ae..12a9b70df83 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/golden_values_dev_dgx_h100.json @@ -1,187 +1,178 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.2963709831237793, - "cuda_graph_request_count_map": { - "852": 0, - "840": 0, - "784": 0, - "728": 0, - "672": 0, - "616": 0, - "560": 0, - "504": 0, - "448": 0, - "392": 0, - "336": 0, - "280": 0, - "224": 0, - "168": 0, - "112": 0, - "56": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.354729652404785, - -1.474542498588562, - -2.48478364944458, - -1.7641210556030273, - -1.1853944063186646, - -2.8624324798583984, - -0.5740103125572205, - -0.4542185962200165, - -1.4300930500030518, - -0.8807456493377686, - -0.4597663879394531, - -0.9252307415008545, - -1.648141860961914, - -0.44453874230384827, - -1.818476915359497, - -0.5714479088783264, - -1.2115143537521362, - -1.0910619497299194, - -0.0023161747958511114, - -1.3206473588943481, - -0.008621376007795334, - -0.7551823854446411, - -0.9404395818710327, - -0.07279698550701141, - -0.9365248680114746, - -0.03344438225030899, - -1.9720849990844727, - -1.3928067684173584, - -0.7453650832176208 - ] - }, - "throughput": [ - 5.425516447410972, - 95.53889537647129, - 98.64633360458717, - 100.31860128598137, - 100.41338716203114, - 100.2318180695741, - 100.30260782227111, - 100.30996418216475 - ] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.29413437843322754, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": [104.98559493782837, 104.98559493782837] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml index 15a4a655049..0675b047464 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch/model_config.yaml @@ -46,6 +46,8 @@ MODEL_ARGS: --return-log-probs: true --num-tokens-to-generate: 30 --enable-cuda-graph: true + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json index c22bb604f94..8e07dfee229 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/golden_values_dev_dgx_h100.json @@ -1,187 +1,178 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 0.38181447982788086, - "cuda_graph_request_count_map": { - "852": 0, - "840": 0, - "784": 0, - "728": 0, - "672": 0, - "616": 0, - "560": 0, - "504": 0, - "448": 0, - "392": 0, - "336": 0, - "280": 0, - "224": 0, - "168": 0, - "112": 0, - "56": 29 - }, - "step_count": 240, - "logprobs": [ - -9.362494468688965, - -2.827894449234009, - -4.557381629943848, - -1.4968647956848145, - -0.717312216758728, - -1.7262351512908936, - -2.522736072540283, - -2.1782360076904297, - -2.3603432178497314, - -6.136383533477783, - -1.4676916599273682, - -3.468963384628296, - -4.424870491027832, - -3.7345848083496094, - -2.012619972229004, - -1.8833301067352295, - -3.5708768367767334, - -6.8197832107543945, - -0.3122292757034302, - -0.9820290207862854, - -6.532033443450928, - -7.498172760009766, - -12.615165710449219, - -2.409003496170044, - -3.8550546169281006, - -0.5105050802230835, - -4.2802581787109375, - -0.06971167027950287, - -0.054025799036026, - -3.319596767425537, - -9.703240394592285, - -1.0997297763824463, - -6.224854469299316, - -5.234503269195557, - -3.934987783432007, - -2.5263679027557373, - -3.1843955516815186, - -5.880871295928955, - -1.8436813354492188, - -5.906496047973633, - -12.15787410736084, - -12.5841064453125, - -0.0819428563117981, - -2.6212656497955322, - -1.4329369068145752, - -2.885145425796509, - -1.2901865243911743, - -0.006647023372352123, - -3.5115818977355957, - -12.945953369140625, - -3.793078899383545, - -3.0094375610351562, - -5.966838836669922, - -0.8998424410820007, - -0.040962252765893936, - -1.5467679500579834, - -1.0785343647003174, - -5.73494815826416, - -0.38491737842559814, - -5.017007827758789, - -0.5568072199821472, - -0.5968841910362244, - -2.3609962463378906, - -13.582086563110352, - -0.09050048142671585, - -3.7264108657836914, - -1.1208789348602295, - -6.052675247192383, - -0.5848909616470337, - -3.5906238555908203, - -0.9494907855987549, - -1.5676641464233398, - -5.127577781677246, - -17.19189453125, - -6.698403835296631, - -1.0449178218841553, - -4.365664958953857, - -1.1243419647216797, - -2.2092156410217285, - -1.8081634044647217, - -0.23330983519554138, - -9.439546585083008, - -0.2947109341621399, - -7.253565788269043, - -2.3855936527252197, - -4.629369258880615, - -3.4186267852783203, - -1.9727531671524048, - -2.354729652404785, - -1.474542498588562, - -2.48478364944458, - -1.7641210556030273, - -1.1853944063186646, - -2.8624324798583984, - -0.5740103125572205, - -0.4542185962200165, - -1.4300930500030518, - -0.8807456493377686, - -0.4597663879394531, - -0.9252307415008545, - -1.648141860961914, - -0.44453874230384827, - -1.818476915359497, - -0.5714479088783264, - -1.2115143537521362, - -1.0910619497299194, - -0.0023161747958511114, - -1.3206473588943481, - -0.008621376007795334, - -0.7551823854446411, - -0.9404395818710327, - -0.07279698550701141, - -0.9365248680114746, - -0.03344438225030899, - -1.9720849990844727, - -1.3928067684173584, - -0.7453650832176208 - ] - }, - "throughput": [ - 3.896181563640281, - 77.1287764739343, - 77.17674536709352, - 76.8666671960972, - 77.944911028325, - 77.95118832563914, - 78.13236085816422, - 78.0046829173943 - ] + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", + "generated_tokens": [ + 3060, + 2430, + 1636, + 2012, + 1317, + 1278, + 2362, + 1307, + 1278, + 16070, + 1044, + 1321, + 1636, + 23067, + 1455, + 1593, + 1395, + 1605, + 3140, + 5152, + 1513, + 1747, + 1046, + 2409, + 1395, + 3140, + 5152, + 1513, + 1278, + 2362 + ], + "latency": 0.3712351322174072, + "cuda_graph_request_count_map": { + "372": 0, + "360": 0, + "336": 0, + "312": 0, + "288": 0, + "264": 0, + "240": 0, + "216": 0, + "192": 0, + "168": 0, + "144": 0, + "120": 0, + "96": 0, + "72": 0, + "48": 0, + "24": 29 + }, + "step_count": 240, + "logprobs": [ + -9.362494468688965, + -2.827894449234009, + -4.557381629943848, + -1.4968647956848145, + -0.717312216758728, + -1.7262351512908936, + -2.522736072540283, + -2.1782360076904297, + -2.3603432178497314, + -6.136383533477783, + -1.4676916599273682, + -3.468963384628296, + -4.424870491027832, + -3.7345848083496094, + -2.012619972229004, + -1.8833301067352295, + -3.5708768367767334, + -6.8197832107543945, + -0.3122292757034302, + -0.9820290207862854, + -6.532033443450928, + -7.498172760009766, + -12.615165710449219, + -2.409003496170044, + -3.8550546169281006, + -0.5105050802230835, + -4.2802581787109375, + -0.06971167027950287, + -0.054025799036026, + -3.319596767425537, + -9.703240394592285, + -1.0997297763824463, + -6.224854469299316, + -5.234503269195557, + -3.934987783432007, + -2.5263679027557373, + -3.1843955516815186, + -5.880871295928955, + -1.8436813354492188, + -5.906496047973633, + -12.15787410736084, + -12.5841064453125, + -0.0819428563117981, + -2.6212656497955322, + -1.4329369068145752, + -2.885145425796509, + -1.2901865243911743, + -0.006647023372352123, + -3.5115818977355957, + -12.945953369140625, + -3.793078899383545, + -3.0094375610351562, + -5.966838836669922, + -0.8998424410820007, + -0.040962252765893936, + -1.5467679500579834, + -1.0785343647003174, + -5.73494815826416, + -0.38491737842559814, + -5.017007827758789, + -0.5568072199821472, + -0.5968841910362244, + -2.3609962463378906, + -13.582086563110352, + -0.09050048142671585, + -3.7264108657836914, + -1.1208789348602295, + -6.052675247192383, + -0.5848909616470337, + -3.5906238555908203, + -0.9494907855987549, + -1.5676641464233398, + -5.127577781677246, + -17.19189453125, + -6.698403835296631, + -1.0449178218841553, + -4.365664958953857, + -1.1243419647216797, + -2.2092156410217285, + -1.8081634044647217, + -0.23330983519554138, + -9.439546585083008, + -0.2947109341621399, + -7.253565788269043, + -2.3855936527252197, + -4.629369258880615, + -3.4186267852783203, + -1.9727531671524048, + -2.331681251525879, + -1.5606917142868042, + -2.454296588897705, + -1.5334703922271729, + -1.2631131410598755, + -2.657367706298828, + -0.6480202078819275, + -0.4550393521785736, + -1.3625166416168213, + -0.8142069578170776, + -0.4496593475341797, + -0.9312890768051147, + -1.732723355293274, + -0.44613128900527954, + -1.6895122528076172, + -0.6082233190536499, + -1.0978344678878784, + -1.1122435331344604, + -0.002520838286727667, + -1.4072327613830566, + -0.007462364621460438, + -0.7548662424087524, + -0.9937503337860107, + -0.0675487294793129, + -0.9595617055892944, + -0.029961343854665756, + -2.205785036087036, + -1.2615025043487549, + -0.7878209352493286 + ] + }, + "throughput": [79.88988160240554, 79.88988160240554] } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml index b368242b9af..2ba9050ceaf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only/model_config.yaml @@ -47,6 +47,8 @@ MODEL_ARGS: --num-tokens-to-generate: 30 --enable-cuda-graph: true --decode-only-cuda-graphs: true + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml index 7fcf9e9cf81..a4f47d3705f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -22,8 +22,7 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 - --transformer-impl: inference_optimized - --sequence-parallel: true + --transformer-impl: transformer_engine --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 1 --deterministic-mode: true @@ -42,6 +41,9 @@ MODEL_ARGS: --top_k: 1 --return-log-probs: true --num-tokens-to-generate: 30 + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json deleted file mode 100644 index 9be8a9dc0ca..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,1028 +0,0 @@ -{ - "throughput": [ - 94.6087716527102, - 115.85992244026639, - 138.9562527069375, - 133.18726531918395, - 81.97861561771212, - 134.30726469422635, - 86.456140428456, - 114.99456351298251, - 147.3101800153954, - 3.0364623744653003, - 124.7590786954667, - 134.2276982994434, - 3.0580463134110167, - 117.03969654341354, - 130.92134521286803, - 48.493091604204935, - 1.4498729599486508, - 128.01470907994928, - 1.8330770354872434, - 66.31842482241125, - 82.24189975425459, - 1.07058112939944, - 1.8815468970982412, - 0.9373246942729808, - 134.9963160815443, - 2.285771114682068, - 43.068220270070434, - 134.9677086822377, - 82.44946740133796, - 47.71839155542011, - 114.4199568886962, - 29.67621576315833, - 144.1589742491705, - 95.8164720809401, - 122.80562228460093, - 39.21436814433054, - 3.041180292262413, - 3.2867844729646842, - 72.43808226229888, - 0.8371525937296347, - 1.2212635079980698, - 145.6869075644325, - 42.317711349146016, - 109.1196064871946, - 73.6281770453198, - 140.4495689387567, - 1.219834296561022, - 138.66856497329005, - 23.33818821323391, - 67.82342558671365, - 130.09683254313987, - 147.60199288178146, - 0.9427431720755464, - 3.2856495013162523, - 79.12426666101076, - 86.41557345094756, - 120.17346279825053, - 137.16615251640926, - 108.93291864542198, - 110.10504114490513, - 46.19253755421628, - 0.950218846923012, - 136.50642826951463, - 142.73168666846448, - 1.2206786818073785, - 1.898581377105612, - 131.72636154091063, - 2.2842414327001976, - 89.76521170090028, - 114.66053545744656, - 58.64474290044525, - 0.8367865961030284, - 128.01767795820945, - 60.87292097103301, - 124.20016865241587, - 119.59336898055426, - 0.9425820346281929, - 93.70053305431952, - 1.0728113870213674, - 135.7596767309971, - 112.89357243644062, - 89.2743296587299, - 137.86411291342458, - 135.6974706051771, - 102.59633828443238, - 129.82058179399326, - 139.57672703148444, - 140.5642311163746, - 78.49182953675201, - 123.40912657074227, - 82.74099904578694, - 75.5490641626476, - 93.38596238341951, - 141.19058076067225, - 1.072254167577298, - 100.8669047802279, - 132.77382347347034, - 92.29086179175866, - 137.20301032384705, - 89.57723938765776, - 67.5465256589703, - 0.9498935124108836, - 1.0716887464650027, - 0.8365472180547067, - 137.902625307774, - 132.67132600219722, - 1.45201860416265, - 1.8366476879619427, - 88.65095604379363, - 132.1806036761347, - 126.0481874394642, - 127.43750324083169, - 93.27238135265156, - 109.83884164204308, - 102.30516355984702, - 141.10387096377744, - 0.9425154448032942, - 95.04281981148903, - 103.11525529548061, - 0.8361762901534399, - 135.3171561172067, - 123.30032998064965, - 118.75691144485415, - 82.21375599642211, - 66.37216333263251, - 120.02349229491865, - 27.339414655466246, - 133.1312422227687, - 123.02377779863252, - 111.0798894329, - 58.88405247768833, - 131.31767475108893, - 40.19076958615912, - 123.58362152151858, - 130.6541142941889, - 61.39555613504246, - 43.92154495664044, - 1.037012527495492, - 127.16052127606021, - 137.06554800183082, - 85.67161160523041, - 1.0253417447981334, - 139.20903624514017, - 140.19068787455728, - 117.67416498245059, - 23.410837515725987, - 130.73052473972666, - 22.561824695346466, - 1.028901717647808, - 119.30712483977753, - 117.77548263464804, - 135.2959098119142, - 142.10193821260228, - 1.0366044325624144, - 1.0350271698893887, - 132.8943567509843, - 51.50353963446039, - 113.39559408843714, - 124.25424103796537, - 129.60407993083075, - 136.8566687186031, - 1.036163010240988, - 1.0345739017743927, - 118.72350056844492, - 32.453707095990595, - 43.851925176925825, - 139.39206855448938, - 141.0979597861742, - 132.81461728578432, - 80.95956255477945, - 133.42483643501154, - 57.27721135575491, - 81.47649794801364, - 79.39765285063396, - 56.40255861789973, - 0.8890603607397893, - 137.59325887086797, - 118.03982850100024, - 53.04390121587005, - 88.31177924841927, - 1.0287550608831881, - 54.67393025836421, - 54.73556135447348, - 129.6143036059356, - 123.57095756116274, - 146.05184555314386, - 55.506024155977386, - 84.40666358740559, - 62.68531518105107, - 147.42894642823578, - 1.0274253590993496, - 145.9063526676371, - 76.36231256557768, - 1.035808949157935, - 136.1858098182613, - 93.13144140533397, - 54.57886608953819, - 1.0251956490815057, - 1.0270063804838983, - 67.96952180390161, - 136.90103479290272, - 78.62986077133174, - 129.97235998681177, - 70.57784076609056, - 1.028567312218149, - 69.64434330087829, - 1.0266016363366386, - 25.142311727265525, - 139.54750333578679, - 118.80547132463877, - 1.0342055876192149, - 132.79991800938092, - 88.25494664060619, - 132.4600307114398, - 1.026200775415348, - 111.33264788932784, - 1.031301270403004, - 104.45912302410692, - 1.0337771723701492, - 124.53550504281608, - 1.0283501183885058, - 126.53361938982871, - 139.83512785200963, - 102.28350299734186, - 122.68389734539087, - 139.27095111763788, - 1.0333552237490158, - 97.04945381465573, - 60.63422077140298, - 1.0248694052483192, - 96.77644543721476, - 118.38370846079931, - 1.0309087229819596, - 136.0487423665781, - 1.032932214377732, - 104.96525711514936, - 50.75370028394122, - 125.67617176346853, - 125.47392048276225, - 101.59371483024698, - 119.1183231384482, - 134.24568445137294, - 1.0323996653747745, - 119.28563313083153, - 50.183581144589674, - 107.50817556608582, - 127.4693561344537, - 116.0234844098742, - 149.0429439759437, - 127.77855747904051, - 1.0319900690130652, - 129.7400124946839, - 60.27584011696136, - 1.0245534026749026, - 113.8687773549026, - 129.9927880985222, - 41.55332067297356, - 12.991853549713621, - 144.9384518471586, - 127.77570879015505, - 79.09214991388126, - 1.0326234729165304, - 144.50618896622706, - 44.461452482592826, - 145.75357879817352, - 150.5618330832813, - 123.17802281879979, - 147.0133924731902, - 57.07203337285457, - 140.17944630269687, - 44.5066568841284, - 150.2834791394652, - 146.37106237628518, - 135.59553639884948, - 21.91845075979551, - 1.0391172002596458, - 92.42182316100705, - 14.98578222593142, - 19.944740287073653, - 32.75622847272977, - 58.94666795839769, - 1.0428676908165904, - 97.94938911630567, - 140.5399781540016, - 36.397689902912774, - 1.0322919875583962, - 33.76444948259586, - 147.54902815924785, - 51.316830076622495, - 153.55703202636914, - 46.423895018386204, - 140.271682540213, - 1.0340651759548871, - 85.22971449383292, - 141.80480996358014, - 1.0234621691055457, - 1.0355322329825165, - 136.96321865236195, - 138.2293990177049, - 136.89440582973347, - 96.94919171687799, - 54.992986423891566, - 142.91167590864902, - 138.73615931624403, - 86.32837448704223, - 1.0424247604140402, - 127.58052889290863, - 138.2472241943501, - 1.0338260095695477, - 1.0317372756221133, - 150.59249576769173, - 1.0229533138894364, - 149.1711141084735, - 1.0419379125129562, - 1.040305113121658, - 150.13261057757276, - 62.47975017460808, - 70.20443057037575, - 76.88821624674898, - 1.0225242667788867, - 136.83301633777177, - 1.0414381555227956, - 131.6044067829552, - 1.038902005769604, - 1.0335832618537684, - 83.38230404797935, - 3.047737981863063, - 140.9843162162637, - 1.0352264324041114, - 1.0409374510445146, - 103.17228299164871, - 1.0383219913492376, - 67.5151836065632, - 126.94018489907108, - 95.29974174831813, - 1.022161551972834, - 1.0348032799350415, - 93.24855217625235, - 140.00831851627856, - 142.46553219867087, - 80.52507876480331, - 149.47939431741142, - 125.60095189608528, - 92.57991472689042, - 153.09192667088175, - 98.78787611117323, - 136.9802701171813, - 1.0378200246498124, - 79.05370338483348, - 145.63143231877774, - 107.86253722014555, - 113.1390555766259, - 150.4596904971142, - 6.010262757833046, - 138.11675690694213, - 1.0371929842524894, - 55.1702723554103, - 148.4142582794926, - 108.62464742566522, - 142.2515578682958, - 149.5588988951372, - 1.0310870179234204, - 32.798276334675066, - 145.8363475163408, - 82.52497836005318, - 144.77105210255448, - 140.95035733017403, - 145.4844811663436, - 145.0646083055648, - 139.1641494303434, - 1.0401220454548914, - 146.10598185112948, - 1.0335329080843159, - 1.0316085392161136, - 133.98012837767038, - 129.62059667226987, - 151.2681266565858, - 1.030719335336581, - 135.9600336007384, - 1.0366589924031362, - 107.70864165999221, - 118.06361914834272, - 148.4615541738592, - 135.1206190516379, - 1.0788915925864082, - 1.0662361391973343, - 1.0784094142292293, - 145.5492563111853, - 100.1745158858024, - 89.97448812790176, - 140.13008352060388, - 8.378443606045758, - 19.841723966559687, - 31.11972559764219, - 127.75589035167928, - 144.649118240912, - 83.40454687650907, - 13.609558087727212, - 144.14916775068022, - 143.0831699051951, - 144.53789580070173, - 129.35689525213576, - 126.54760361436873, - 136.72725454688293, - 83.66753329456253, - 35.238850690537326, - 138.73588075606074, - 148.39285997484404, - 141.43706957675556, - 35.20788617289704, - 140.22918428708584, - 141.42288954532623, - 80.8071906111917, - 53.480908541665116, - 96.60869116876205, - 138.83030943256392, - 146.89537016655746, - 1.0659353965573166, - 138.66041009897964, - 138.0783824554628, - 54.95061283513892, - 1.0688789370964418, - 145.4981195236156, - 107.91672388693667, - 147.39387423946786, - 143.49840246862203, - 1.0781871694837721, - 125.37215873599833, - 46.390553110182545, - 1.0683430650310588, - 60.55314896188811, - 128.32962060837178, - 142.6648214311374, - 1.065532502621677, - 145.06202945295232, - 149.5985088362253, - 43.61426254132819, - 139.2120402464869, - 138.80120892663803, - 142.59390751862693, - 147.27000174003754, - 139.5980537408405, - 142.37081759892675, - 76.47257166426981, - 0.8663971721944621, - 1.067847671923619, - 1.0752972325757186, - 139.11225337731244, - 154.1012640338781, - 91.85315813315137, - 7.34066705730821, - 1.0763437477764217, - 56.03391448680589, - 1.067309924884827, - 1.0747789028833068, - 1.057667310022394, - 146.4284745539176, - 142.32867288307636, - 132.81801172672715, - 142.5746724111237, - 43.178263922620026, - 140.19958418325498, - 1.0742201855279276, - 139.95237701874325, - 124.69044225989671, - 89.93275546978569, - 1.0778110524743836, - 108.03753008375865, - 0.8649825661375887, - 101.22782607000799, - 138.6615942910557, - 1.0572642952018412, - 143.509260845593, - 1.0651693329533294, - 97.454990956795, - 1.075960473594851, - 104.89429761368234, - 153.46849816095335, - 143.28204379991922, - 112.57923589922926, - 145.35468060283986, - 119.53338040876814, - 132.53105489182144, - 146.60735281445733, - 0.8648000721123511, - 132.61504628627392, - 140.81953388748138, - 1.05684091289561, - 147.29646966899597, - 1.0646855258714663, - 1.0772400203863821, - 137.87592499226204, - 101.79954304062817, - 134.45893707567646, - 1.0737967838723397, - 147.3289039421509, - 142.95955673278567, - 123.11846557585149, - 139.7223884224781, - 5.274894457437767, - 0.8646226703470901, - 135.27010135142623, - 134.53222451904563, - 140.4520894166607, - 148.6784682726068, - 148.83999547746723, - 144.76059628877204, - 146.09818079047014, - 0.8644123666240657, - 133.05795012757028, - 141.21253159110282, - 147.08086640702987, - 153.13511211461227, - 147.72437078211334, - 53.87242850230838, - 61.34701685378028, - 74.50771860339175, - 16.40780504974564, - 16.448796993269678, - 144.08505364828036, - 143.78069847853888, - 145.08382905436133, - 139.4144567792124, - 1.113422304912727, - 23.732299099149245, - 146.716938504402, - 1.1150428401994323, - 1.1070863332993708, - 147.462815334713, - 15.300506166735937, - 142.89311901203018, - 35.881455163220174, - 0.8959120615185874, - 134.50389621984408, - 79.91603718165896, - 145.31776951960734, - 153.19384567886857, - 142.494036234602, - 130.58249312188119, - 1.1128817603274543, - 56.157995916719756, - 35.81413980204931, - 116.5213087641768, - 63.30354399512571, - 55.0117106848875, - 47.52954249314361, - 153.04709230401787, - 1.112276523473745, - 80.1523559974256, - 136.20373724941714, - 1.114673225365626, - 1.1067132158651183, - 149.29883052073288, - 145.10950784560325, - 130.53765167080937, - 1.111788125890117, - 0.8957719496064405, - 1.1050775451489783, - 17.522300994030367, - 154.45472111064055, - 152.07616582090188, - 1.1020107149905272, - 138.6808068419634, - 76.87873177159636, - 51.43702839643221, - 138.95045176064437, - 138.64177504011988, - 140.72197385602811, - 132.80947742972836, - 149.78872816785005, - 139.94034036065392, - 154.2632802491591, - 55.57148538150843, - 1.1044580058296936, - 147.1712801496827, - 77.84198065949245, - 142.38330204183904, - 151.76812011990265, - 145.19131540821485, - 147.26566215388425, - 87.12413393605841, - 1.1038403429439656, - 141.4935550752979, - 145.7397470598185, - 3.3080164659931235, - 123.0327553358976, - 146.24080278853327, - 148.10448175245884, - 29.234562433775857, - 151.30177873039895, - 135.4653748135468, - 144.3293913931314, - 148.16163203136404, - 1.1015876034201657, - 1.1114790318458536, - 136.68047783885697, - 77.72584511329579, - 125.73692105352463, - 106.98755729483561, - 96.25926845246491, - 1.109721323323522, - 141.71073652156545, - 130.22006710827588, - 145.24478945746003, - 80.67459353439743, - 1.1033551544760267, - 150.03177939272493, - 154.12875534463626, - 150.04771421074818, - 1.1010813815407388, - 1.1110434127990452, - 145.385699877379, - 86.86487551811825, - 130.16687493633253, - 143.8726181331947, - 111.91340621077623, - 146.0394914387852, - 1.1006353022455784, - 134.47903589563677, - 148.6907436994389, - 102.87151097507036, - 137.41724911494663, - 1.1146766644704549, - 143.85952373403495, - 146.92280951248307, - 1.100156488603178, - 144.04783334738536, - 148.53630346113712, - 58.74848466983248, - 147.0485685726298, - 141.32891699761203, - 142.8441702922343, - 131.04366253726744, - 128.6305301075303, - 1.1106412111686195, - 147.90025888582002, - 0.8959265584913588, - 149.5194069726666, - 137.43649451567626, - 1.1068068376551545, - 68.05269425995475, - 138.94056631255367, - 138.43818227469507, - 69.60391199895408, - 114.83395091462887, - 151.34107787433956, - 141.57237630997332, - 146.07433910500515, - 9.941778754980154, - 131.297822968639, - 10.386636719874664, - 10.545636067043365, - 114.58677137445733, - 75.28902943071078, - 90.63452059810655, - 143.58694736923238, - 9.901118804514459, - 144.5206530902411, - 144.78737732574044, - 79.81136215142409, - 84.9314508821071, - 120.18939827456474, - 10.225253542151219, - 9.702822548173124, - 103.1188517219872, - 138.5008491242522, - 92.02238700298246, - 151.99592340131602, - 9.807595290716304, - 150.0447954775559, - 134.2614008494909, - 149.38544573345007, - 149.62298116309924, - 124.32358754465251, - 132.817456221544, - 10.50607995390264, - 9.78317681034783, - 151.07916494121415, - 146.93545537009487, - 118.45851163082196, - 145.03008316360754, - 154.4449202186591, - 146.86002069809945, - 150.6932855951215, - 110.74803327496042, - 127.40788523389726, - 150.81323854197058, - 150.0047673310006, - 149.6063654551971, - 133.87244996538675, - 10.329695475492791, - 9.414695716712222, - 106.77032789813472, - 118.34636653947105, - 123.44441062862572, - 144.9015592115516, - 153.74652990582067, - 10.065713405335144, - 129.38998560194165, - 117.69087049838025, - 99.15650839997046, - 127.90462338199198, - 147.3574863739125, - 9.696544883885949, - 9.8853852911422, - 128.35872796896587, - 145.2939860705264, - 128.72081963712404, - 94.09935653689803, - 142.8780531031409, - 130.5213122981276, - 126.89288883528536, - 153.36107852781166, - 149.17239657923582, - 9.177632630803961, - 9.387171298727486, - 109.68196882316985, - 148.55536204011432, - 152.61730207818772, - 9.648922236946333, - 132.805446535875, - 138.74295200738652, - 141.66118217831166, - 124.0399127789103, - 113.05005278683446, - 149.71230902297984, - 25.727698431920004, - 129.56419655827216, - 130.40687823665095, - 128.46470366050013, - 150.46298369674685, - 9.22073843893938, - 110.36443029340542, - 148.23878821929193, - 10.219508495480236, - 9.615051521185155, - 9.8723813087942, - 149.91378148843256, - 9.149056684599877, - 130.37704092008303, - 114.86611671621016, - 134.53633480709703, - 131.11593468604048, - 149.74665952988033, - 136.60701891253495, - 146.50864617645632, - 9.094221140419737, - 149.69902295915708, - 126.93245475406366, - 141.2463933703881, - 10.18172163650932, - 136.76582155059438, - 155.5823388453975, - 144.68082947663285, - 142.0128061769988, - 116.20800508912414, - 101.13756407758095, - 10.050927550768915, - 10.14139856150474, - 9.573219645146107, - 146.33874064646594, - 137.22302119976462, - 132.14965518046, - 148.08190796641483, - 117.6843964457568, - 153.04352772565807, - 146.79238076404926, - 9.522740968586977, - 145.93484469600287, - 13.925952420322696, - 12.697420287309185, - 146.39122941822845, - 113.94298610788566, - 13.844109957456581, - 154.57922917096633, - 13.525210269101805, - 103.83976095796662, - 97.75660804271413, - 135.83818209343426, - 158.60060111529293, - 111.57793188874757, - 13.768524263105455, - 154.2203592546867, - 108.85242762118563, - 111.15752259030245, - 149.5942138872604, - 119.77102605185765, - 120.68065341205389, - 105.29698904913548, - 151.41465167808087, - 138.90606724001483, - 13.437371194424983, - 119.97194649055415, - 144.6223725248399, - 146.9934910169238, - 149.45319992777343, - 121.48260402443249, - 13.662736071688842, - 14.448955892498802, - 144.5545360346381, - 154.00382983055897, - 151.8635735223181, - 137.2321484611102, - 119.71487519948164, - 88.24978714231261, - 147.74815341218743, - 142.1113258863455, - 132.08775922189477, - 124.63351274554526, - 145.72256212355262, - 100.50708502243579, - 139.16363846809003, - 114.82662827063822, - 154.78307253831395, - 149.22879563842886, - 152.6744734255461, - 145.81022434241217, - 152.68018782123758, - 116.75549006136289, - 12.968595875688791, - 6.824624970615158, - 125.05116103474757, - 147.66072487793718, - 147.5735120742967, - 139.1302141298083, - 146.48542990069834, - 12.674865288395944, - 147.88858853602966, - 6.8124480142416175, - 137.54766974463703, - 130.89979405333307, - 13.364169845161861, - 14.116086127002273, - 130.3002929300388, - 116.98398239487472, - 152.70827610346095, - 98.51470626500011, - 135.1252373635164, - 14.405992358855888, - 154.13709739001223, - 146.28661687368685, - 137.87827066214206, - 12.621081453489012, - 154.04574874294514, - 6.802625211185703, - 152.18661864386252, - 149.30257880598677, - 13.244501725269068, - 138.34068638798834, - 150.95140747506372, - 141.8441899037163, - 152.99022366652198, - 103.95004802425926, - 140.28144756248412, - 154.51222806007945, - 85.40777548962518, - 154.7067128296305, - 120.47843952303268, - 12.568053995018431, - 12.916583075889136, - 105.92477484543576, - 137.92878859711615, - 135.13853669037294, - 137.88549737290148, - 157.83019925734393, - 145.48927689323145, - 12.509532718065461, - 150.6233829715981, - 119.23669844460764, - 138.49099023171033, - 154.0870149904812, - 140.1862744667834, - 148.860174031694, - 147.54629689336036, - 12.448861769003683, - 152.4711466483636, - 102.47079224461186, - 152.40864885890767, - 156.21773232766026, - 13.139291580904986, - 150.30653960489693, - 145.43571147072188, - 132.8965387342577, - 144.85972103961666, - 125.5438694385711, - 158.07457773478276, - 14.359506122440205, - 137.7658155977229, - 153.68125116011197, - 156.57780724945528, - 12.394708947912125, - 12.874702780202174, - 110.61518572692995, - 149.4338565730422, - 149.67552030435513, - 146.20909415912828, - 9.308833539527914, - 26.176147260970783, - 8.701217384742513, - 66.92241449340185, - 105.12940849136734, - 145.25326276553395, - 139.68219350261262, - 131.60335890332783, - 150.53420884400245, - 17.552483447968918, - 99.60476667168517, - 9.003208512207522, - 8.539560747895454, - 9.946172723540226, - 150.55644446784382, - 9.608936841972842, - 104.80864366760326, - 25.95068644438624, - 99.42592550150236, - 108.35979254469888, - 113.9171427720856, - 9.905905876631499, - 131.1684982861573, - 154.7989292174601, - 151.34753888952145, - 150.11816141981262, - 143.00557828542912, - 126.2310299151925, - 113.53830001728545, - 148.13405630794878, - 150.7564429392251, - 155.252325076404, - 18.20048176554747, - 25.725436761645142, - 8.678711562613207, - 143.3683328827327, - 127.0294451168928, - 137.50119476282134, - 10.068367539846923, - 155.64822784014916, - 153.2789382926615, - 25.46950813818654, - 142.9138107220956, - 155.10510899417167, - 107.40557834412083, - 9.871948602847068, - 144.4712732194919, - 140.17802930301565, - 9.286026243902361, - 129.1488895575147, - 124.35586045151207, - 140.1410811550992, - 96.63692877337894, - 153.62093095799207, - 156.05800033315097, - 9.587609950939838, - 140.09721428165886, - 134.898750425008, - 8.652809034763463, - 8.989448046931262, - 107.64260577858933, - 9.825071080298192, - 150.6237132142087, - 143.76058852986372, - 154.01627264735168, - 140.85322298632985, - 143.63714834446708, - 149.7259575806535, - 8.53942846683121, - 157.02635815805976, - 150.83913162907433, - 154.0283691261865, - 9.246842209481716, - 154.5851361854829, - 133.4662155767381, - 137.55396410787307, - 105.77910782321499, - 148.97953057255376, - 111.3041581371634, - 9.543858351726714, - 142.71996301994741, - 144.2417836324451, - 148.5293262803374, - 8.95331376662564, - 105.2724164655814, - 149.16646109060707, - 151.1947852118465, - 9.503293907683512, - 133.40055362812345, - 8.776394391795916, - 148.3675722527084, - 154.66946641450528, - 122.71674068416665, - 149.62192317697068, - 153.40159484208397, - 9.46860898864519, - 146.10526710538994, - 143.96020057925128, - 8.62472208077336, - 8.906885562515198, - 105.7754218686014, - 150.17957794387223, - 144.0451331512576, - 149.95461039551162, - 151.46311089131117, - 142.22104279807664, - 147.3679944003333, - 140.5394711174869, - 123.62157744638432, - 152.32796921399395, - 156.6603241829257, - 9.43621164630811, - 158.2241383954169, - 149.33346139426692, - 144.12074054746773, - 143.1977521817863, - 8.536662624511228, - 9.785635570067782, - 147.61880087321424, - 9.402323265876474, - 159.1161790596516, - 146.56796834276156, - 147.64890403285438, - 157.70847517328534, - 114.64282143770687, - 148.5000942425868, - 10.052761003641129, - 147.38801074409378 - ] -} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml deleted file mode 100644 index 2d65c154a0e..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -TEST_TYPE: frozen-start -MODE: inference -MODEL_ARGS: - --tiktoken-pattern: v2 - --use-mcore-models: true - --tokenizer-type: TikTokenizer - --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json - --auto-detect-ckpt-format: true - --max-tokens-to-oom: 3600000 - --inference-max-seq-length: 4096 - --attention-backend: flash - --use-checkpoint-args: true - --micro-batch-size: 1 - --no-load-optim: true - --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 0 - --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ - --distributed-backend: nccl - --log-interval: 1 - --transformer-impl: transformer_engine - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 1 - --ckpt-format: torch_dist - --bf16: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --num-layers: 24 - --hidden-size: 1152 - --num-attention-heads: 16 - --max-position-embeddings: 1024 - --seq-length: 1024 - --temperature: 1.0 - --top_k: 1 - --seed: 42 - --return-log-probs: true - --num-tokens-from-file: true - --inference-dynamic-batching-buffer-size-gb: 20 - --cuda-graph-impl: local - --cuda-graph-scope: full_iteration - --disable-chunked-prefill: true - --dist-ckpt-strictness: log_unexpected - --inference-ckpt-non-strict: true # To handle the extra_state errors - --output-path: ${TENSORBOARD_PATH} - --output-every-n-results: 32 - --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl - --prompt-file-num-truncate: 1024 - --incoming-requests-per-step: 128 - --use-flashinfer-fused-rope: true - --throughput-check-only: true -METRICS: - - "generated_tokens" - - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json deleted file mode 100644 index 07adf271434..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", - "generated_tokens": [ - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710, - 1402, - 14019, - 1044, - 1321, - 1402, - 14019, - 1294, - 1278, - 2725, - 15568, - 3039, - 1046, - 3060, - 1455, - 1593, - 1395, - 1278, - 3535, - 2478, - 1636, - 1710 - ], - "latency": 2.020272731781006, - "logprobs": [ - -9.358587265014648, - -2.7594826221466064, - -4.608366012573242, - -1.4093360900878906, - -0.6152952313423157, - -1.7217562198638916, - -2.496668815612793, - -2.0547454357147217, - -2.441960573196411, - -6.280838966369629, - -1.5643692016601562, - -3.462346076965332, - -4.428728103637695, - -3.8633861541748047, - -1.9936373233795166, - -1.8929449319839478, - -3.796365737915039, - -6.8360137939453125, - -0.2901247441768646, - -0.9246833324432373, - -6.633338928222656, - -7.166708469390869, - -12.771251678466797, - -2.198296308517456, - -3.7778120040893555, - -0.4983733296394348, - -4.381269454956055, - -0.0666784718632698, - -0.09580295532941818, - -3.2437636852264404, - -10.079947471618652, - -1.172220230102539, - -5.977442741394043, - -5.046236038208008, - -3.855658531188965, - -2.5585858821868896, - -3.356245994567871, - -5.557229518890381, - -1.6787731647491455, - -5.483290672302246, - -12.218501091003418, - -12.61402702331543, - -0.09662941098213196, - -2.5431432723999023, - -1.4071024656295776, - -2.9154715538024902, - -1.1964417695999146, - -0.006458481773734093, - -3.3625335693359375, - -13.262511253356934, - -4.314079761505127, - -2.617699146270752, - -5.987792015075684, - -0.778266429901123, - -0.048888545483350754, - -1.548882007598877, - -1.1381981372833252, - -5.627166748046875, - -0.4078553318977356, - -4.958505630493164, - -0.6187160611152649, - -0.7174848914146423, - -2.469533920288086, - -13.620073318481445, - -0.09088654816150665, - -3.526974678039551, - -1.4195809364318848, - -6.402483940124512, - -0.5898402333259583, - -3.565917491912842, - -0.8561318516731262, - -1.6140165328979492, - -5.370549201965332, - -17.159223556518555, - -6.583524703979492, - -0.8855001926422119, - -4.19431209564209, - -1.2012220621109009, - -2.2563133239746094, - -1.7674944400787354, - -0.22064533829689026, - -9.292220115661621, - -0.12445646524429321, - -7.29617977142334, - -2.526529312133789, - -4.071560859680176, - -3.5568013191223145, - -1.926215410232544, - -2.349026918411255, - -2.2132363319396973, - -0.3125414550304413, - -1.4718132019042969, - -2.149106740951538, - -1.0855519771575928, - -1.631832242012024, - -1.3751734495162964, - -1.9396103620529175, - -1.5293723344802856, - -0.8444125056266785, - -1.2414811849594116, - -1.9522171020507812, - -2.4338042736053467, - -1.5651824474334717, - -0.9498789310455322, - -1.8044980764389038, - -2.356677770614624, - -1.247452974319458, - -1.550165057182312, - -0.5635553598403931, - -0.6177330017089844, - -0.4778785705566406, - -0.020452087745070457, - -0.48500269651412964, - -0.23854275047779083, - -0.06543659418821335, - -0.11837350577116013, - -0.0585334412753582 - ] - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml deleted file mode 100644 index 96d3fd0fc0c..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq/model_config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -TEST_TYPE: frozen-start -MODE: inference -MODEL_ARGS: - --tiktoken-pattern: v2 - --use-mcore-models: true - --tokenizer-type: TikTokenizer - --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json - --auto-detect-ckpt-format: true - --max-tokens-to-oom: 3600000 - --inference-max-seq-length: 4096 - --attention-backend: flash - --use-checkpoint-args: true - --micro-batch-size: 1 - --no-load-optim: true - --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 0 - --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ - --distributed-backend: nccl - --log-interval: 1 - --transformer-impl: inference_optimized - --sequence-parallel: true - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 8 - --deterministic-mode: true - --ckpt-format: torch_dist - --bf16: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --num-layers: 24 - --hidden-size: 1152 - --num-attention-heads: 16 - --max-position-embeddings: 1024 - --seq-length: 1024 - --temperature: 1.0 - --top_k: 1 - --return-log-probs: true - --num-tokens-to-generate: 30 - --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 - --inference-dynamic-batching-buffer-size-gb: 20 - --dist-ckpt-strictness: log_unexpected - --inference-ckpt-non-strict: true # To handle the extra_state errors - --output-path: ${TENSORBOARD_PATH} - --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." - --incoming-requests-per-step: 32 - --use-flashinfer-fused-rope: true - -METRICS: - - "generated_tokens" - - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json deleted file mode 100644 index 55d6955055a..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 44.73653959017247, - "logprobs": [ - -9.358970642089844, - -2.7523813247680664, - -4.628502368927002, - -1.4058877229690552, - -0.6050865054130554, - -1.7354254722595215, - -2.4828507900238037, - -2.0520384311676025, - -2.4089853763580322, - -6.2649126052856445, - -1.5644135475158691, - -3.4096615314483643, - -4.358163833618164, - -3.866471767425537, - -2.0575876235961914, - -1.904883623123169, - -3.7622976303100586, - -6.835415363311768, - -0.2829523980617523, - -0.9827429056167603, - -6.655940055847168, - -7.188957214355469, - -12.757233619689941, - -2.1933951377868652, - -3.808887481689453, - -0.515199601650238, - -4.323916912078857, - -0.067625492811203, - -0.09976530075073242, - -3.228640556335449, - -10.129311561584473, - -1.1787357330322266, - -5.97692346572876, - -5.036575794219971, - -3.8267176151275635, - -2.6010468006134033, - -3.366438865661621, - -5.553505897521973, - -1.6046268939971924, - -5.442874908447266, - -12.218503952026367, - -12.597894668579102, - -0.0976092740893364, - -2.530579090118408, - -1.4139617681503296, - -2.8606526851654053, - -1.1690009832382202, - -0.0066696410067379475, - -3.361189365386963, - -13.191482543945312, - -4.413737773895264, - -2.639688491821289, - -6.0114641189575195, - -0.7672993540763855, - -0.047326065599918365, - -1.550362467765808, - -1.137772798538208, - -5.627618789672852, - -0.40103790163993835, - -4.908735275268555, - -0.5704602599143982, - -0.6625558733940125, - -2.364135503768921, - -13.609526634216309, - -0.08865148574113846, - -3.5251970291137695, - -1.3791766166687012, - -6.395696640014648, - -0.588782787322998, - -3.566770076751709, - -0.8742034435272217, - -1.5827170610427856, - -5.3912353515625, - -17.150842666625977, - -6.6234588623046875, - -0.885993242263794, - -4.162992477416992, - -1.1942744255065918, - -2.281689405441284, - -1.7708709239959717, - -0.22030864655971527, - -9.292593955993652, - -0.1258234828710556, - -7.346449851989746, - -2.5470826625823975, - -4.115433692932129, - -3.5646262168884277, - -1.9410749673843384, - -2.3247878551483154, - -1.523364543914795, - -2.360647678375244, - -1.708706021308899, - -1.131014108657837, - -2.944424867630005, - -0.5273782014846802, - -0.44912564754486084, - -1.753378987312317, - -0.8341047167778015, - -0.4124295711517334, - -0.9006240367889404, - -1.4890273809432983, - -0.4379286766052246, - -1.6497018337249756, - -0.5444425344467163, - -1.2305881977081299, - -1.164027214050293, - -0.002498721005395055, - -1.165798544883728, - -0.007112303748726845, - -0.718407154083252, - -0.7442683577537537, - -0.04299728572368622, - -0.8688321113586426, - -0.021008115261793137, - -2.033963680267334, - -1.2936673164367676, - -0.78721684217453 - ] - } -} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml deleted file mode 100644 index 306c12bd653..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq/model_config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -TEST_TYPE: frozen-start -MODE: inference -MODEL_ARGS: - --tiktoken-pattern: v2 - --use-mcore-models: true - --tokenizer-type: TikTokenizer - --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json - --auto-detect-ckpt-format: true - --max-tokens-to-oom: 3600000 - --inference-max-seq-length: 4096 - --attention-backend: flash - --use-checkpoint-args: true - --micro-batch-size: 1 - --no-load-optim: true - --no-use-tokenizer-model-from-checkpoint-args: true - --timing-log-level: 0 - --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ - --distributed-backend: nccl - --log-interval: 1 - --transformer-impl: inference_optimized - --sequence-parallel: true - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --deterministic-mode: true - --ckpt-format: torch_dist - --bf16: true - --log-memory-to-tensorboard: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --num-layers: 24 - --hidden-size: 1152 - --num-attention-heads: 16 - --max-position-embeddings: 1024 - --seq-length: 1024 - --temperature: 1.0 - --top_k: 1 - --return-log-probs: true - --num-tokens-to-generate: 30 - --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility - --inference-dynamic-batching-buffer-guaranteed-fraction: 0 - --inference-dynamic-batching-buffer-overflow-factor: 0.2 - --inference-dynamic-batching-buffer-size-gb: 20 - --dist-ckpt-strictness: log_unexpected - --inference-ckpt-non-strict: true # To handle the extra_state errors - --output-path: ${TENSORBOARD_PATH} - --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." - --incoming-requests-per-step: 32 - --use-flashinfer-fused-rope: true - -METRICS: - - "generated_tokens" - - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json index f32580e937f..6ef98105cbd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json @@ -157,5 +157,5 @@ -0.0585334412753582 ] }, - "throughput": [12.319796866345767, 12.319796866345767] -} + "throughput": [13.93210545115292, 13.93210545115292] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml index e6b659cf46f..59186f8d532 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_583m_logitsmatch/model_config.yaml @@ -41,7 +41,10 @@ MODEL_ARGS: --top_k: 1 --return-log-probs: true --num-tokens-to-generate: 30 - --inference-dynamic-batching-buffer-size-gb: 10 + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0 + --inference-dynamic-batching-buffer-overflow-factor: 0.2 + --inference-dynamic-batching-buffer-size-gb: 20 --dist-ckpt-strictness: log_unexpected --inference-ckpt-non-strict: true # To handle the extra_state errors --output-path: ${TENSORBOARD_PATH} diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json index 4ebaf72f5e7..07adf271434 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -1,158 +1,158 @@ { - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " And then you get to the end of the movie, and you realize that this is not New York at all. This is New York at the end", - "generated_tokens": [ - 3060, - 2430, - 1636, - 2012, - 1317, - 1278, - 2362, - 1307, - 1278, - 16070, - 1044, - 1321, - 1636, - 23067, - 1455, - 1593, - 1395, - 1605, - 3140, - 5152, - 1513, - 1747, - 1046, - 2409, - 1395, - 3140, - 5152, - 1513, - 1278, - 2362 - ], - "latency": 42.63835311005823, - "logprobs": [ - -9.358713150024414, - -2.724055767059326, - -4.5792131423950195, - -1.4844143390655518, - -0.6546129584312439, - -1.7303215265274048, - -2.4795279502868652, - -2.0776171684265137, - -2.4553134441375732, - -6.219150066375732, - -1.566371202468872, - -3.486889362335205, - -4.418787479400635, - -3.8580172061920166, - -2.0664010047912598, - -1.843908667564392, - -3.744598627090454, - -6.82543420791626, - -0.2880207300186157, - -0.9257857799530029, - -6.612694263458252, - -7.218401908874512, - -12.827808380126953, - -2.1861495971679688, - -3.8218231201171875, - -0.5008565187454224, - -4.383245468139648, - -0.06934759020805359, - -0.09667497128248215, - -3.2640299797058105, - -10.102912902832031, - -1.1498218774795532, - -5.979549407958984, - -5.0192108154296875, - -3.8367133140563965, - -2.581653356552124, - -3.4087462425231934, - -5.545716285705566, - -1.6541939973831177, - -5.547749996185303, - -12.21850872039795, - -12.582784652709961, - -0.09534379839897156, - -2.522055149078369, - -1.4054086208343506, - -2.8758127689361572, - -1.1866405010223389, - -0.005799253936856985, - -3.3871712684631348, - -13.193516731262207, - -4.389392852783203, - -2.520228862762451, - -6.023908615112305, - -0.7408540844917297, - -0.04526234790682793, - -1.5508661270141602, - -1.1332746744155884, - -5.653256416320801, - -0.4028852581977844, - -4.9457244873046875, - -0.618165135383606, - -0.6616490483283997, - -2.36385178565979, - -13.6455078125, - -0.08668932318687439, - -3.5266754627227783, - -1.3801541328430176, - -6.351947784423828, - -0.5434023141860962, - -3.5673093795776367, - -0.871107816696167, - -1.618450403213501, - -5.378700256347656, - -17.17119026184082, - -6.662005424499512, - -0.9221409559249878, - -4.141905784606934, - -1.2047083377838135, - -2.227570056915283, - -1.7645721435546875, - -0.21892313659191132, - -9.296550750732422, - -0.11995092779397964, - -7.402207851409912, - -2.512965679168701, - -4.100971221923828, - -3.580245018005371, - -1.9462040662765503, - -2.347074031829834, - -1.5288957357406616, - -2.4033043384552, - -1.7311294078826904, - -1.1686863899230957, - -2.938558340072632, - -0.5278136730194092, - -0.4748117923736572, - -1.749883770942688, - -0.8397680521011353, - -0.4109693169593811, - -0.9552587270736694, - -1.5238327980041504, - -0.4656376838684082, - -1.6448218822479248, - -0.5414345264434814, - -1.2422380447387695, - -1.1426063776016235, - -0.002245525596663356, - -1.252556562423706, - -0.007873333990573883, - -0.7185167670249939, - -0.7521701455116272, - -0.042445242404937744, - -0.8852499723434448, - -0.02266514115035534, - -2.0951969623565674, - -1.348037838935852, - -0.8296748399734497 - ] - } -} + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 2.020272731781006, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml index 551ba8115cb..612e621534d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp8_pp1_dp1_583m_logitsmatch_zmq/model_config.yaml @@ -22,9 +22,8 @@ MODEL_ARGS: --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ --distributed-backend: nccl --log-interval: 1 - --transformer-impl: inference_optimized - --sequence-parallel: true - --tensor-model-parallel-size: 8 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 1 --deterministic-mode: true --ckpt-format: torch_dist @@ -52,7 +51,6 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-step: 32 --use-flashinfer-fused-rope: true - METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json deleted file mode 100644 index dccdd34a5e7..00000000000 --- a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,135 +0,0 @@ -{ - "0": { - "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", - "generated_text": " Then, when you're ready, go home and watch the movie again.
    ", - "generated_tokens": [ - 6830, - 1044, - 2200, - 1636, - 6185, - 11831, - 1044, - 1974, - 4590, - 1321, - 9951, - 1278, - 16070, - 2790, - 1046, - 2 - ], - "latency": 22.701347589492798, - "cuda_graph_request_count_map": null, - "step_count": 16, - "logprobs": [ - -9.498085021972656, - -3.787536859512329, - -3.0404648780822754, - -1.7445809841156006, - -0.29672086238861084, - -1.3661342859268188, - -2.3458175659179688, - -1.83931303024292, - -1.4894113540649414, - -6.440437316894531, - -0.8176816701889038, - -1.790361762046814, - -3.6521127223968506, - -3.7014482021331787, - -1.5858951807022095, - -1.5492421388626099, - -2.844204902648926, - -6.694585800170898, - -0.06552714854478836, - -1.333437204360962, - -6.077418327331543, - -9.448220252990723, - -10.46927261352539, - -1.4987666606903076, - -4.727880001068115, - -0.7596290111541748, - -2.152517795562744, - -0.013758113607764244, - -0.040566492825746536, - -3.1010313034057617, - -8.735280990600586, - -1.5446771383285522, - -5.841436862945557, - -3.0970406532287598, - -4.0269670486450195, - -3.769413948059082, - -2.466399669647217, - -2.3482255935668945, - -0.47234833240509033, - -1.114174723625183, - -5.310229778289795, - -8.236719131469727, - -0.015452657826244831, - -2.854970932006836, - -1.2198810577392578, - -3.923705577850342, - -0.9644856452941895, - -0.0026721982285380363, - -3.096668243408203, - -11.110801696777344, - -3.688267230987549, - -2.3297765254974365, - -4.670788764953613, - -0.09854680299758911, - -0.06234245002269745, - -1.3255000114440918, - -2.169330596923828, - -4.490111827850342, - -0.4412422776222229, - -3.9356117248535156, - -0.5775455832481384, - -0.2409835010766983, - -2.9197134971618652, - -13.475022315979004, - -0.10248012840747833, - -3.5023770332336426, - -0.8544933795928955, - -5.194520473480225, - -0.32954925298690796, - -2.3026833534240723, - -0.5346049070358276, - -1.2862977981567383, - -4.881562232971191, - -15.555293083190918, - -4.919404029846191, - -0.22008435428142548, - -6.644532680511475, - -0.8938115239143372, - -2.1304054260253906, - -1.8866363763809204, - -0.20106904208660126, - -5.917205810546875, - -0.0056310598738491535, - -7.453446388244629, - -3.1677205562591553, - -3.706507682800293, - -2.136584520339966, - -2.9287283420562744, - -1.4792609214782715, - -2.4399306774139404, - -1.2330785989761353, - -1.9715899229049683, - -1.9578948020935059, - -0.23143476247787476, - -2.052696466445923, - -1.0413113832473755, - -1.1709030866622925, - -2.825991630554199, - -1.6848523616790771, - -2.2008259296417236, - -1.5216114521026611, - -1.2439141273498535, - -1.412055253982544 - ] - }, - "throughput": [ - 13.750125804204401, 13.955213632130931 - ] -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml b/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml deleted file mode 100644 index 4ae5c719291..00000000000 --- a/tests/functional_tests/test_cases/hybrid/hybrid_dynamic_inference_tp1_pp1_dp8_583m/model_config.yaml +++ /dev/null @@ -1,72 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -TEST_TYPE: frozen-start -MODE: inference -MODEL_ARGS: - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --timing-log-level: 0 - --load: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/checkpoint - --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mamba_hybrid_2b/dcp/mcore-v1_bf16/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json - --tokenizer-type: TikTokenizer - --tiktoken-pattern: v2 - --distributed-backend: nccl - --log-interval: 1 - --transformer-impl: transformer_engine - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 1 - --expert-model-parallel-size: 1 - --use-mcore-models: true - --is-hybrid-model: true - --model-provider: mamba - --init-method-std: 0.0198 - --untie-embeddings-and-output-weights: true - --disable-bias-linear: true - --init-method-std: 0.014 - --position-embedding-type: none - --num-layers: 50 - --hidden-size: 2048 - --ffn-hidden-size: 11264 - --num-attention-heads: 16 - --kv-channels: 128 - --hybrid-override-pattern: M-M-M-M*-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- - --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec - --normalization: RMSNorm - --swiglu: true - --attention-dropout: 0.0 - --hidden-dropout: 0.0 - --seq-length: 4096 - --max-position-embeddings: 4096 - --micro-batch-size: 1 - --ckpt-format: torch_dist - --ckpt-fully-parallel-save: true - --ckpt-fully-parallel-load: true - --ckpt-assume-constant-structure: true - --dist-ckpt-strictness: log_unexpected - --bf16: true - --attention-backend: flash - --no-create-attention-mask-in-dataloader: true - --num-workers: 8 - --use-checkpoint-args: true - --no-use-tokenizer-model-from-checkpoint-args: true - --no-load-optim: true - --deterministic-mode: true - --save-interval: 2000 - --temperature: 1.0 - --top_k: 1 - --return-log-probs: true - --num-tokens-to-generate: 30 - --max-tokens-to-oom: 3600000 - --inference-max-seq-length: 4096 - --output-path: ${TENSORBOARD_PATH} - --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." - --incoming-requests-per-step: 32 - --inference-repeat-n: 3 -METRICS: - - "generated_tokens" - - "logprobs" diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json index d9a60d1ae11..1a9705f8181 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/golden_values_dev_dgx_h100.json @@ -174,5 +174,5 @@ -0.5394397377967834 ] }, - "throughput": [34.95064017365726, 34.95064017365726] + "throughput": [25.35687538450034, 25.35687538450034] } diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index e97dc0b56a4..0e1f9110793 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -80,7 +80,6 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 --inference-repeat-n: 8 - --inference-dynamic-batching-buffer-size-gb: 20 METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml index 6c119cc548b..1b9eaaf1f65 100644 --- a/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch/model_config.yaml @@ -76,7 +76,6 @@ MODEL_ARGS: --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." --incoming-requests-per-sec: -1 # all requests arrive up front. --inference-repeat-n: 8 - --inference-dynamic-batching-buffer-size-gb: 20 METRICS: - "generated_tokens" - "logprobs" diff --git a/tests/test_utils/python_scripts/auto_reminder_github.py b/tests/test_utils/python_scripts/auto_reminder_github.py index 7484244b717..df75ec0542c 100644 --- a/tests/test_utils/python_scripts/auto_reminder_github.py +++ b/tests/test_utils/python_scripts/auto_reminder_github.py @@ -58,42 +58,27 @@ def get_user_email(self, username: str): try: user = self.github.get_user(username) - public_email = None # 1. Try public profile email first if user.email and not user.email.endswith("@users.noreply.github.com"): - if user.email.endswith("@nvidia.com"): - self.email_cache[username] = user.email - return user.email - else: - public_email = user.email + self.email_cache[username] = user.email + return user.email # 2. If no public email, check recent commits on the main repo try: # Use get_commits(author=...) which is more direct than search_commits for commit in self.repo.get_commits(author=user)[:10]: email = commit.commit.author.email - if ( - email - and not email.endswith("@users.noreply.github.com") - and email.endswith("@nvidia.com") - ): + if email and not email.endswith("@users.noreply.github.com"): self.email_cache[username] = email return email - elif ( - email - and not email.endswith("@users.noreply.github.com") - and public_email is None - ): - public_email = email except Exception as e: logger.debug(f"Could not check commits for {username}: {e}") - if public_email is None: - public_email = f"{username}@users.noreply.github.com" - - self.email_cache[username] = public_email - return public_email + # 3. Fallback to public email (even if noreply) or a constructed noreply + email = user.email or f"{username}@users.noreply.github.com" + self.email_cache[username] = email + return email except Exception as e: logger.warning(f"Could not get user object for {username}: {e}") diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml index e882d721860..1b4786e8230 100644 --- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml @@ -39,7 +39,7 @@ spec: ARGUMENTS=( "CHECKPOINT_LOAD_PATH=/mnt/artifacts" "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=/mnt/artifacts/" + "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" @@ -59,22 +59,8 @@ products: - environment: [dev] scope: [flaky] platforms: [dgx_h100] - - test_case: [gpt_dynamic_inference_tp1_pp8_dp1_583m_logitsmatch_zmq] - products: - - environment: [dev] - scope: [flaky] - platforms: [dgx_h100] - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq] products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - - test_case: [gpt_dynamic_inference_tp2_pp2_dp2_583m_logitsmatch_zmq] - products: - - environment: [dev] - scope: [flaky] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index eae09a6e16a..0b068c55220 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -114,11 +114,6 @@ products: platforms: [dgx_h100] - environment: [lts] scope: [nightly] - - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer] products: - environment: [dev] diff --git a/tests/test_utils/recipes/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/mamba-dynamic-inference.yaml deleted file mode 100644 index 0d02ce29a54..00000000000 --- a/tests/test_utils/recipes/mamba-dynamic-inference.yaml +++ /dev/null @@ -1,61 +0,0 @@ -type: basic -format_version: 1 -maintainers: [mcore] -loggers: [stdout] -spec: - name: '{test_case}_{environment}_{platforms}' - model: hybrid - build: mcore-pyt-{environment} - nodes: 1 - gpus: 1 - n_repeat: 1 - platforms: dgx_a100 - script_setup: | - unset https_proxy - echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc - - # Checkout latest - cd /opt - rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm - git init - git remote add origin $MCORE_REPO - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' - git fetch origin $MCORE_MR_COMMIT - git checkout $MCORE_MR_COMMIT - git rev-parse HEAD - # Checkout backwards-ref - cd /opt - rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy - git init - git remote add origin $MCORE_REPO - git fetch origin $MCORE_BACKWARDS_COMMIT - git checkout $MCORE_BACKWARDS_COMMIT - git rev-parse HEAD - rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ - script: |- - ls - cd /opt/megatron-lm - - ARGUMENTS=( - "CHECKPOINT_LOAD_PATH=/mnt/artifacts" - "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" - "DATA_PATH=null" - "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" - "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" - "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" - "OUTPUT_PATH={assets_dir}" - "TENSORBOARD_PATH={assets_dir}/generations_{environment}_{platforms}.json" - "N_REPEAT={n_repeat}" - "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" - "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" - ) - - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} - -products: - - test_case: [hybrid_dynamic_inference_tp1_pp1_dp8_583m] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] diff --git a/tests/unit_tests/data/test_fim_dataset.py b/tests/unit_tests/data/test_fim_dataset.py deleted file mode 100644 index 7022a4b5fa9..00000000000 --- a/tests/unit_tests/data/test_fim_dataset.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -import pytest -import torch - -from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder -from megatron.core.datasets.utils import compile_helpers, get_blend_from_list -from megatron.core.tokenizers import MegatronTokenizer -from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig -from tests.unit_tests.test_utilities import Utils - - -@pytest.mark.parametrize("spm_rate", [0.0, 1.0]) -@pytest.mark.parametrize("split_sample", [None, "python"]) -def test_fim_gpt_dataset(spm_rate, split_sample): - if torch.distributed.is_available(): - Utils.initialize_distributed() - if torch.distributed.get_rank() == 0: - compile_helpers() - torch.distributed.barrier() - else: - compile_helpers() - - tokenizer = MegatronTokenizer.from_pretrained( - tokenizer_path="/opt/data/tokenizers/huggingface", - metadata_path={"library": "huggingface"}, - additional_special_tokens=["", "", "", "", ""], - include_special_tokens=True, - ) - blend = get_blend_from_list(["/opt/data/datasets/fim/fim_text_document"]) - extra_tokens = { - "prefix": "", - "middle": "", - "suffix": "", - "pad": "", - "eod": "", - } - seq_length = 32 - rate = 1.0 - fragment_rate = 1.0 - config = GPTFIMDatasetConfig( - blend=blend, - random_seed=1234, - sequence_length=seq_length, - split="990,9,1", - tokenizer=tokenizer, - reset_position_ids=True, - reset_attention_mask=True, - eod_mask_loss=True, - fim_extra_tokens=extra_tokens, - fim_rate=rate, - fim_spm_rate=spm_rate, - fim_fragment_rate=fragment_rate, - fim_split_sample=split_sample, - ) - - datasets = BlendedMegatronDatasetBuilder( - GPTFIMDataset, [10, 10, 10], lambda: True, config - ).build() - - prefix_id = tokenizer.tokenize("")[1] - suffix_id = tokenizer.tokenize("")[1] - middle_id = tokenizer.tokenize("")[1] - - dataset = datasets[0] - assert dataset.fim_rate == rate - assert dataset.fim_spm_rate == spm_rate - assert dataset.fragment_fim_rate == fragment_rate - - tokens = dataset[0]["tokens"].tolist() - if split_sample: - split_sample_id = tokenizer.tokenize(split_sample)[1] - split_sample_index = tokens.index(split_sample_id) - assert prefix_id == tokens[split_sample_index + 1] - if spm_rate == 0.0: - assert prefix_id == tokens[0] - assert suffix_id in tokens - assert middle_id in tokens - assert tokens.index(suffix_id) < tokens.index(middle_id) - else: - assert prefix_id == tokens[0] - assert suffix_id == tokens[1] - assert middle_id in tokens - - -if __name__ == "__main__": - test_fim_gpt_dataset() diff --git a/tests/unit_tests/inference/contexts/test_dynamic_context.py b/tests/unit_tests/inference/contexts/test_dynamic_context.py index 1baf9034c9d..0674cdfcabd 100644 --- a/tests/unit_tests/inference/contexts/test_dynamic_context.py +++ b/tests/unit_tests/inference/contexts/test_dynamic_context.py @@ -5,9 +5,6 @@ import pytest import torch -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) from megatron.core.inference.contexts.dynamic_context import ( DynamicInferenceContext, RequestOverflowError, @@ -31,8 +28,6 @@ class TestDynamicContext: def _setup_model_parallel_group(self, tensor_parallel_size, pipeline_parallel_size): - self.pp_size = pipeline_parallel_size - Utils.initialize_model_parallel( tensor_model_parallel_size=tensor_parallel_size, pipeline_model_parallel_size=pipeline_parallel_size, @@ -48,39 +43,38 @@ def _get_dynamic_context( max_sequence_length, buffer_size_gb, block_size_tokens, - max_tokens, + buffer_guaranteed_fraction, + buffer_overflow_factor, + max_requests_override, + max_tokens_override, is_hybrid_model=False, layer_type_list=None, rounder=64, ): set_rounder(rounder) - if is_hybrid_model: - if layer_type_list is None: - layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] - mamba_conv_states_shape = (544, 4) - mamba_ssm_states_shape = (8, 64, 16) - mamba_inference_state_config = MambaInferenceStateConfig( - layer_type_list, mamba_conv_states_shape, mamba_ssm_states_shape - ) - else: - mamba_inference_state_config = None + if is_hybrid_model and layer_type_list is None: + layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP] dynamic_context = DynamicInferenceContext( params_dtype=params_dtype, - num_layers=num_layers // self.pp_size, + num_layers=num_layers, kv_channels=kv_channels, num_attention_heads=num_attention_heads, max_sequence_length=max_sequence_length, num_cuda_graphs=None, use_cuda_graphs_for_non_decode_steps=not is_hybrid_model, buffer_size_gb=buffer_size_gb, + buffer_guaranteed_fraction=buffer_guaranteed_fraction, block_size_tokens=block_size_tokens, - max_tokens=max_tokens, - mamba_inference_state_config=mamba_inference_state_config, + buffer_overflow_factor=buffer_overflow_factor, + max_requests_override=max_requests_override, + max_tokens_override=max_tokens_override, + layer_type_list=layer_type_list, + mamba_conv_states_shape=(544, 4), + mamba_ssm_states_shape=(8, 64, 16), use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM ) return dynamic_context @@ -99,25 +93,28 @@ def test_initialize_dynamic_context(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) if not is_hybrid_model: - assert dynamic_context.block_allocator.total_count == 491 - assert dynamic_context.block_allocator.active_count == 245 - assert dynamic_context.max_total_requests == 490 - assert dynamic_context.max_active_requests == 245 - assert dynamic_context.max_tokens == 16384 + assert dynamic_context.gtd_block_count == 48 + assert dynamic_context.gtd_request_count == 12 + assert dynamic_context.block_allocator.block_count_total == 491 + assert dynamic_context.max_requests == 128 + assert dynamic_context.max_tokens == 62848 assert dynamic_context.num_mamba_layers == 0 assert dynamic_context.mamba_metadata is None else: - assert dynamic_context.block_allocator.total_count == 555 - assert dynamic_context.block_allocator.active_count == 277 - assert dynamic_context.max_total_requests == 554 - assert dynamic_context.max_active_requests == 277 - assert dynamic_context.max_tokens == 16384 + assert dynamic_context.gtd_block_count == 112 + assert dynamic_context.gtd_request_count == 28 + assert dynamic_context.block_allocator.block_count_total == 1156 + assert dynamic_context.max_requests == 320 + assert dynamic_context.max_tokens == 154176 assert dynamic_context.num_mamba_layers == 1 assert dynamic_context.mamba_metadata is not None @@ -134,8 +131,11 @@ def test_is_static_batching(self): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=1.0, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, ) assert not dynamic_context.is_static_batching() @@ -150,18 +150,26 @@ def test_is_memory_available(self, is_hybrid_model): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=1.0, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) - dynamic_context.block_allocator.active_count = 10 + dynamic_context.block_allocator.block_count_avail = 10 assert dynamic_context.block_allocator.is_memory_available(10) assert not dynamic_context.block_allocator.is_memory_available(11) assert dynamic_context.block_allocator.is_memory_available(1) - dynamic_context.block_allocator.active_count = 0 + dynamic_context.block_allocator.block_count_avail = 0 assert not dynamic_context.block_allocator.is_memory_available(1) + dynamic_context.block_allocator.block_count_avail = 10 + dynamic_context.gtd_block_count = 5 + assert dynamic_context.block_allocator.is_memory_available(6) + assert not dynamic_context.block_allocator.is_memory_available(6, safe=True) + @pytest.mark.internal @pytest.mark.parametrize("is_hybrid_model", [False, True]) def test_request_overflow(self, is_hybrid_model: bool): @@ -174,14 +182,16 @@ def test_request_overflow(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=128, buffer_size_gb=0.01, + buffer_guaranteed_fraction=0.1, block_size_tokens=32, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, rounder=1, is_hybrid_model=is_hybrid_model, ) - dynamic_context.max_active_requests //= 2 with pytest.raises(RequestOverflowError): - for i in range(dynamic_context.max_active_requests + 1): + for i in range(dynamic_context.max_requests + 1): dynamic_context.add_request( DynamicInferenceRequest( request_id=i, @@ -204,8 +214,11 @@ def test_token_overflow_error(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=512, buffer_size_gb=0.1, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=200, # setting low, but >= context.max_active_requests. + buffer_overflow_factor=1.0, + max_requests_override=2, + max_tokens_override=20, # Setting a very low token limit rounder=1, is_hybrid_model=is_hybrid_model, ) @@ -214,7 +227,7 @@ def test_token_overflow_error(self, is_hybrid_model: bool): dynamic_context.add_request( DynamicInferenceRequest( request_id=1, - prompt_tokens=torch.arange(0, 225, device='cuda'), + prompt_tokens=torch.arange(0, 25, device='cuda'), sampling_params=SamplingParams( num_tokens_to_generate=dynamic_context.max_tokens - 25 ), @@ -233,8 +246,11 @@ def test_reset(self, is_hybrid_model: bool): num_attention_heads=8, max_sequence_length=128, buffer_size_gb=1.0, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) @@ -257,6 +273,7 @@ def test_reset(self, is_hybrid_model: bool): dynamic_context.token_to_position_in_request.fill_(1) dynamic_context.token_to_block_idx.fill_(1) dynamic_context.token_to_local_position_within_kv_block.fill_(1) + dynamic_context.block_allocator.block_count_avail = 5 dynamic_context.memory_buffer.fill_(1) dynamic_context.request_to_kv_block_ids.fill_(1) if is_hybrid_model: @@ -286,8 +303,8 @@ def test_reset(self, is_hybrid_model: bool): assert torch.all(dynamic_context.token_to_block_idx == -1) assert torch.all(dynamic_context.token_to_local_position_within_kv_block == 0) assert ( - dynamic_context.block_allocator.active_count - == dynamic_context.block_allocator.total_count // 2 + dynamic_context.block_allocator.block_count_avail + == dynamic_context.block_allocator.block_count_total - 1 ) assert torch.all(dynamic_context.request_to_kv_block_ids == -1) if is_hybrid_model: @@ -306,13 +323,16 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) if is_hybrid_model: - expected_memory_blocks = [550, 551, 552, 553] + expected_memory_blocks = [1151, 1152, 1153, 1154] else: expected_memory_blocks = [486, 487, 488, 489] expected_block_count_avail = expected_memory_blocks[0] @@ -325,20 +345,20 @@ def test_allocate_and_release_memory_blocks(self, is_hybrid_model): .tolist() == expected_memory_blocks ) - assert dynamic_context.block_allocator.total_avail == expected_block_count_avail + assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail dynamic_context.block_allocator.release_memory_blocks( torch.tensor(expected_memory_blocks[-2:], device='cuda') ) - assert dynamic_context.block_allocator.total_avail == expected_block_count_avail + 2 + assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail + 2 assert ( dynamic_context.block_allocator.allocate_memory_blocks(1).item() == expected_memory_blocks[-1] ) - assert dynamic_context.block_allocator.total_avail == expected_block_count_avail + 1 + assert dynamic_context.block_allocator.block_count_avail == expected_block_count_avail + 1 # Should return None since we allocate more blocks than what we have. assert ( dynamic_context.block_allocator.allocate_memory_blocks( - dynamic_context.block_allocator.total_avail + 100 + dynamic_context.block_allocator.block_count_avail + 100 ) == None ) @@ -355,8 +375,11 @@ def test_add_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) assert dynamic_context.block_size_tokens == 128 @@ -378,7 +401,7 @@ def test_add_request(self, is_hybrid_model: bool): assert dynamic_context.request_kv_length_offsets[0] == 0 assert dynamic_context.request_kv_block_counts[0] == 2 assert dynamic_context.request_last_kv_block_id[0].item() == ( - 553 if is_hybrid_model else 489 + 1154 if is_hybrid_model else 489 ) assert dynamic_context.request_last_kv_block_offset[0].item() == 15 assert torch.all( @@ -428,8 +451,11 @@ def test_update_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) @@ -438,7 +464,7 @@ def test_update_request(self, is_hybrid_model: bool): dynamic_context.paused_request_count = 0 dynamic_context.total_request_count = 3 dynamic_context.request_kv_block_counts[0:3] = 1 - new_block_ids = dynamic_context.block_allocator.allocate_memory_blocks(3) + new_block_ids = dynamic_context.block_allocator.allocate_memory_blocks(3, safe=True) dynamic_context.request_to_kv_block_ids[0:3, 0] = new_block_ids if is_hybrid_model: @@ -472,8 +498,11 @@ def test_update_request(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) @@ -491,16 +520,18 @@ def test_update_request(self, is_hybrid_model: bool): ) total_request_count = 10 - dynamic_context.block_allocator.total_avail -= 11 # We align 11 blocks to the 10 requests we have. 3rd request alone we setup like it requires 2 blocks + dynamic_context.block_allocator.block_count_avail -= 11 # We align 11 blocks to the 10 requests we have. 3rd request alone we setup like it requires 2 blocks dynamic_context.total_request_count = total_request_count dynamic_context.request_to_kv_block_ids[0:total_request_count, 0] = torch.arange( - dynamic_context.block_allocator.total_avail, - dynamic_context.block_allocator.total_avail + 10, + dynamic_context.block_allocator.block_count_avail, + dynamic_context.block_allocator.block_count_avail + 10, ) dynamic_context.request_to_kv_block_ids[3][ 1 - ] = dynamic_context.block_allocator.total_avail # Assign one extra block to request 3. + ] = ( + dynamic_context.block_allocator.block_count_avail + ) # Assign one extra block to request 3. dynamic_context.request_kv_length_offsets[0:total_request_count] = 10 # For 0, 1, 5, 6, the total number of tokens in last block is block size -1, so that they will all need extra blocks dynamic_context.request_kv_length_offsets[0:2] = dynamic_context.block_size_tokens - 1 @@ -586,13 +617,13 @@ def test_update_request(self, is_hybrid_model: bool): dynamic_context.request_to_kv_block_ids[0:10].cpu() == torch.tensor( [ - [543, 546, -1, -1], - [544, 543, -1, -1], - [548, 550, -1, -1], - [549, 551, -1, -1], - [547, -1, -1, -1], - [545, -1, -1, -1], - [552, -1, -1, -1], + [1144, 1147, -1, -1], + [1145, 1144, -1, -1], + [1149, 1151, -1, -1], + [1150, 1152, -1, -1], + [1148, -1, -1, -1], + [1146, -1, -1, -1], + [1153, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1], @@ -631,19 +662,22 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) # Set up the initial state with 5 requests # Allocate 5 blocks for 5 requests - initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(5) + initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(5, safe=True) dynamic_context.total_request_count = 5 dynamic_context.paused_request_count = 0 # Record the available blocks before releasing memory - initial_available_blocks = dynamic_context.block_allocator.total_avail + initial_available_blocks = dynamic_context.block_allocator.block_count_avail # Assign blocks to the requests (one block per request) for i in range(5): @@ -674,7 +708,7 @@ def test_release_memory_blocks_for_finished_requests(self, is_hybrid_model): assert dynamic_context.active_token_count == 2 # Verify that 3 blocks were released by checking the available blocks - assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 3 + assert dynamic_context.block_allocator.block_count_avail == initial_available_blocks + 3 if is_hybrid_model: # Request at position 3 now moves into finished request position 0 @@ -703,19 +737,22 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, ) # Set up the initial state with 3 requests, where some use multiple blocks # Allocate 6 blocks in total for the requests - initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(6) + initial_blocks = dynamic_context.block_allocator.allocate_memory_blocks(6, safe=True) dynamic_context.total_request_count = 3 dynamic_context.paused_request_count = 0 # Record the available blocks before releasing memory - initial_available_blocks = dynamic_context.block_allocator.total_avail + initial_available_blocks = dynamic_context.block_allocator.block_count_avail # Assign blocks to the requests: # - Request 0: 1 block @@ -755,7 +792,7 @@ def test_finished_requests_with_multiple_blocks(self, is_hybrid_model): assert dynamic_context.active_token_count == 0 # Verify that all 6 blocks were released by checking the available blocks - assert dynamic_context.block_allocator.total_avail == initial_available_blocks + 6 + assert dynamic_context.block_allocator.block_count_avail == initial_available_blocks + 6 if is_hybrid_model: # All mamba states should be zeroed out @@ -776,8 +813,11 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=False, ) with pytest.raises(AssertionError) as error: @@ -791,8 +831,11 @@ def test_mamba_states_cache(self, is_hybrid_model: bool): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, is_hybrid_model=is_hybrid_model, layer_type_list=[Symbols.MAMBA, Symbols.ATTENTION, Symbols.MAMBA, Symbols.ATTENTION], ) @@ -847,8 +890,11 @@ def test_calculate_and_store_log_probs(self): num_attention_heads=2, max_sequence_length=512, buffer_size_gb=0.03, + buffer_guaranteed_fraction=0.1, block_size_tokens=128, - max_tokens=None, + max_requests_override=None, + max_tokens_override=None, + buffer_overflow_factor=None, ) # Add a few requests to the context @@ -1051,3 +1097,56 @@ def test_calculate_and_store_log_probs(self): ) current_global_token_offset += expected_len + + @pytest.mark.internal + def test_unified_memory(self): + + from megatron.core.inference.unified_memory import ( + UnifiedMemoryUnsupportedError, + create_unified_mempool, + ) + + # Check UVM support. + try: + create_unified_mempool() + except UnifiedMemoryUnsupportedError: + pytest.skip("Unified memory not available due to bad environment.") + + # Setup. + self._setup_model_parallel_group(1, 1) + + # Compute number of contexts needed to fill GPU memory. + gpu_size_gb = ( + torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory / 1024**3 + ) + buffer_size_gb = 20 + num_contexts = math.ceil(gpu_size_gb / buffer_size_gb) + 1 + + # Allocate enough contexts to fill GPU memory. + def init_contexts(*, unified_memory_level): + contexts = [] + for i in range(num_contexts): + contexts.append( + DynamicInferenceContext( + params_dtype=torch.float32, + num_layers=4, + kv_channels=8, + num_attention_heads=2, + max_sequence_length=512, + buffer_size_gb=buffer_size_gb, + buffer_overflow_factor=1, + buffer_guaranteed_fraction=0, + unified_memory_level=unified_memory_level, + ) + ) + + # Pure GPU memory test should OOM. + try: + init_contexts(unified_memory_level=0) + except torch.OutOfMemoryError: + pass + else: + raise Exception("expected OOM.") + + # Unified memory test should succeed. + init_contexts(unified_memory_level=1) diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 174bf89350b..0ac4b296746 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -1,10 +1,9 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import asyncio -import math import random import types -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import pytest @@ -13,9 +12,6 @@ from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state -from megatron.core.inference.contexts.attention_context.mamba_metadata import ( - MambaInferenceStateConfig, -) from megatron.core.inference.contexts.dynamic_context import ( ActiveRequestCountOverflowError, BlockOverflowError, @@ -38,7 +34,6 @@ ) from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, - get_gpt_layer_with_inference_spec, get_gpt_layer_with_transformer_engine_spec, ) from megatron.core.models.gpt.gpt_model import GPTModel @@ -49,7 +44,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import ( check_mamba_sequence_packing_support, - get_mamba_inference_state_config_from_model, + get_attr_wrapped_model, is_fa_min_version, is_te_min_version, ) @@ -91,7 +86,10 @@ class DynamicEngineTestConfig: context_buffer_size_gb: float = 0.1 # enough room for all tokens. context_block_size_tokens: int = 256 - context_max_tokens: Optional[int] = None + context_buffer_guaranteed_fraction: float = 0.01 + context_buffer_overflow_factor: Optional[float] = None + context_max_requests_override: Optional[int] = None + context_max_tokens_override: Optional[int] = None tensor_model_parallel_size: int = 1 pipeline_model_parallel_size: int = 1 expert_model_parallel_size: int = 1 @@ -107,14 +105,12 @@ class DynamicEngineTestConfig: skip_prompt_log_probs: bool = False cuda_graph_scope: List[str] = None force_build_cuda_graphs: bool = False - transformer_impl: str = "local" # If False, do not build cuda graphs in the tests, even if # num_cuda_graphs is set. # For tests concerning cuda-graph warmups, we set this to False # to avoid the overhead of building the graphs, which is not # relevant to the test. The tests only check if the required # context attributes are set correctly. - suspend_resume_interval: Optional[int] = None fp8: bool = False @@ -129,6 +125,17 @@ def __post_init__(self): assert self.num_tokens_total is not None self.max_sequence_length = self.num_tokens_total + # Update overrides if not using overflow factor. + if self.context_buffer_overflow_factor is None: + + # Enough room for all requests. + if self.context_max_requests_override is None: + self.context_max_requests_override = self.num_requests + + # Enough room for all tokens. + if self.context_max_tokens_override is None: + self.context_max_tokens_override = self.num_requests * self.max_sequence_length + if self.cuda_graph_scope is None: self.cuda_graph_scope = ["full_iteration"] @@ -140,9 +147,6 @@ class DynamicEngineTestEnv: config: DynamicEngineTestConfig requests: List[DynamicInferenceRequest] engine: DynamicInferenceEngine - mem_usage: dict = field( - default_factory=lambda: {"start": None, "end": None, "suspend_resume": {}} - ) class TestDynamicInferenceEngine: @@ -211,29 +215,34 @@ def _build_inference_context( test_config: DynamicEngineTestConfig, transformer_config: TransformerConfig, requests: List[DynamicInferenceRequest], - mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None, + layer_type_list: Optional[List[str]], + mamba_conv_states_shape: Optional[Tuple[int]] = None, + mamba_ssm_states_shape: Optional[Tuple[int]] = None, ): """The inference context manages the KV cache and other inference state.""" # Inference context. context = DynamicInferenceContext( params_dtype=transformer_config.params_dtype, - num_layers=transformer_config.num_layers - // transformer_config.pipeline_model_parallel_size, + num_layers=transformer_config.num_layers, kv_channels=transformer_config.kv_channels, num_attention_heads=transformer_config.num_query_groups, max_sequence_length=test_config.max_sequence_length, num_cuda_graphs=test_config.num_cuda_graphs, use_cuda_graphs_for_non_decode_steps=not test_config.model_provider == "mamba", buffer_size_gb=test_config.context_buffer_size_gb, + buffer_guaranteed_fraction=test_config.context_buffer_guaranteed_fraction, block_size_tokens=test_config.context_block_size_tokens, - max_tokens=test_config.context_max_tokens, + buffer_overflow_factor=test_config.context_buffer_overflow_factor, + max_requests_override=test_config.context_max_requests_override, + max_tokens_override=test_config.context_max_tokens_override, tensor_model_parallel_size=transformer_config.tensor_model_parallel_size, - mamba_inference_state_config=mamba_inference_state_config, + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, materialize_only_last_token_logits=test_config.materialize_only_last_token_logits, use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM ) return context @@ -286,26 +295,16 @@ def _build_test_env(cls, test_config): ), sequence_parallel=test_config.sequence_parallel, pipeline_dtype=torch.bfloat16, - add_bias_linear=test_config.expert_model_parallel_size == 1 - and not (test_config.transformer_impl == "inference_optimized"), + add_bias_linear=test_config.expert_model_parallel_size == 1, fp8="hybrid" if test_config.fp8 else None, fp8_recipe="tensorwise" if test_config.fp8 else None, inference_sampling_seed=test_config.random_seed, cuda_graph_scope=test_config.cuda_graph_scope, - transformer_impl=test_config.transformer_impl, - normalization=( - "RMSNorm" - if test_config.transformer_impl == "inference_optimized" - else "LayerNorm" - ), - # inference optimized currently only supports RMS Norm ) - if test_config.fp8 or test_config.transformer_impl == "transformer_engine": + if test_config.fp8: layer_spec = get_gpt_layer_with_transformer_engine_spec() - elif test_config.transformer_impl == "local": + else: layer_spec = get_gpt_layer_local_spec() - elif test_config.transformer_impl == "inference_optimized": - layer_spec = get_gpt_layer_with_inference_spec() # GPT model. model = GPTModel( @@ -318,13 +317,10 @@ def _build_test_env(cls, test_config): post_process=parallel_state.is_pipeline_last_stage(), ).cuda() elif test_config.model_provider == "mamba": - pp_size = test_config.pipeline_model_parallel_size # Transformer config. transformer_config = TransformerConfig( params_dtype=torch.bfloat16, - num_layers=( - 3 if pp_size == 1 else 6 - ), # 1 Mamba layer, 1 attention layer, 1 MLP layer + num_layers=3, # 1 Mamba layer, 1 attention layer, 1 MLP layer hidden_size=256, # The Mamba layer places several constraints on this mamba_num_heads=16, num_attention_heads=16, @@ -337,7 +333,7 @@ def _build_test_env(cls, test_config): ), inference_rng_tracker=True, tensor_model_parallel_size=test_config.tensor_model_parallel_size, - pipeline_model_parallel_size=pp_size, + pipeline_model_parallel_size=test_config.pipeline_model_parallel_size, expert_model_parallel_size=test_config.expert_model_parallel_size, num_moe_experts=( None @@ -350,7 +346,6 @@ def _build_test_env(cls, test_config): fp8="hybrid" if test_config.fp8 else None, fp8_recipe="tensorwise" if test_config.fp8 else None, cuda_graph_scope=test_config.cuda_graph_scope, - is_hybrid_model=True, # Needs to be set for correct out_proj init ) # Mamba model. @@ -373,7 +368,22 @@ def _build_test_env(cls, test_config): model.eval() - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + # Layer type list for hybrid models + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if test_config.model_provider == "mamba": + mamba_states_shapes = decoder.mamba_state_shapes_per_request() + if mamba_states_shapes is not None: + (mamba_conv_states_shape, mamba_ssm_states_shape) = mamba_states_shapes + else: + # A `MambaBlock` can only not have a `MambaLayer` if using pipeline parallelism + # and a particular pipeline stage was not assigned a `MambaLayer`. + assert test_config.pipeline_model_parallel_size > 1 + mamba_conv_states_shape = None + mamba_ssm_states_shape = None + else: + mamba_conv_states_shape = None + mamba_ssm_states_shape = None # Inference config. inference_config = InferenceWrapperConfig( @@ -390,7 +400,9 @@ def _build_test_env(cls, test_config): test_config=test_config, transformer_config=transformer_config, requests=requests, - mamba_inference_state_config=mamba_inference_state_config, + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, ) # Inference model wrapper. @@ -404,9 +416,7 @@ def _build_test_env(cls, test_config): # Text generation controller. text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, - tokenizer=types.SimpleNamespace( - vocab_size=test_config.vocab_size, detokenize=lambda tokens: "tokenized_prompt" - ), + tokenizer=types.SimpleNamespace(vocab_size=test_config.vocab_size), ) # Reset global cuda graph state. @@ -425,6 +435,12 @@ def _build_test_env(cls, test_config): # Test env. env = DynamicEngineTestEnv(config=test_config, requests=requests, engine=engine) + # Mock the detokenize method to return predictable result + def mock_detokenize_prompt(tokens): + return "tokenized_prompt" + + env.engine.controller.tokenizer.detokenize = mock_detokenize_prompt + return env @classmethod @@ -437,31 +453,7 @@ def _run_step(cls, env): # and engine.async_step() doesn't use this sampling param's # num_tokens_to_generate. result = env.engine.step_modern(verbose=False) - - # Suspend + resume. - if ( - env.config.suspend_resume_interval is not None - and env.engine.step_count % env.config.suspend_resume_interval == 0 - ): - suspend_resume_mems = {} - suspend_resume_mems["start"] = torch.cuda.memory_stats() - env.engine.suspend() # suspend. - suspend_resume_mems["mid"] = torch.cuda.memory_stats() - env.engine.resume() # resume. - suspend_resume_mems["end"] = torch.cuda.memory_stats() - env.mem_usage["suspend_resume"][env.engine.step_count] = suspend_resume_mems - - # Nothing done? - finished_request_records = result["finished_request_records"] - if len(finished_request_records) == 0: - return - - # Append output tokens. - for finished_request_record in finished_request_records: - finished_request = finished_request_record.merge(env.engine.controller.tokenizer) - request = env.requests[finished_request.request_id] - request.output = finished_request.generated_tokens - request.status = finished_request.status + finished_requests = result["finished_requests"] @classmethod @torch.inference_mode() @@ -471,12 +463,10 @@ def _run_test(cls, **test_config_kwargs): env = cls._build_test_env(test_config) # Add requests to engine. - env.mem_usage["start"] = torch.cuda.memory_stats() for request in tqdm(env.requests, "add requests"): # Add request. env.engine._add_request(request) - request.state = "pending" # Insert gap steps between adding requests. for _ in range(test_config.num_gap_steps): @@ -503,20 +493,14 @@ def _run_test(cls, **test_config_kwargs): if num_tokens_total is None else num_tokens_total - len(request.prompt_tokens) ) - - # Validate the output length only if suspend_resume_interval is None. - # If it is not None, then the output length could be anything in the - # range [1, num_tokens_to_generate]. - if test_config.suspend_resume_interval is None: - assert ( - (num_tokens_to_generate is None and num_tokens_total is None) - or len(request.generated_tokens) <= num_tokens_expected - or request.status == Status.FAILED - ), ( - f"Request {request.request_id} expected to generate {num_tokens_to_generate} " - f"tokens but generated {len(request.generated_tokens)}" - ) - env.mem_usage["end"] = torch.cuda.memory_stats() + assert ( + (num_tokens_to_generate is None and num_tokens_total is None) + or len(request.generated_tokens) == num_tokens_expected + or request.status == Status.FAILED + ), ( + f"Request {request.request_id} expected to generate {num_tokens_to_generate} " + f"tokens but generated {len(request.generated_tokens)}" + ) return env @@ -534,40 +518,40 @@ def teardown_method(self, method): def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None: """Simple test that runs without errors, and validates output.""" skip_if_mamba_sequence_packing_not_available(model_provider) - num_tokens_to_generate = 16 # Run test. env = self._run_test( - num_tokens_to_generate=num_tokens_to_generate, model_provider=model_provider, num_cuda_graphs=num_cuda_graphs, + context_max_requests_override=32, cuda_graph_scope=cuda_graph_scope, force_build_cuda_graphs=True, ) # Validate max_requests, max_tokens. - assert env.engine.context.max_tokens == DynamicInferenceContext.DEFAULT_MAX_TOKENS + assert env.engine.context.max_requests == 32 + assert env.engine.context.max_tokens == 160 - # Validate generated tokens. + # Validate output tokens. gpt_expected_generated_tokens = [ - [69, 85, 55, 74, 56, 89, 64, 59, 55, 67, 15, 58, 6, 37, 54, 47], - [29, 54, 33, 72, 45, 76, 41, 56, 28, 25, 17, 2, 61, 6, 98, 76], - [35, 78, 54, 16, 79, 98, 22, 5, 60, 0, 1, 76, 77, 11, 25, 7], - [25, 75, 57, 85, 81, 37, 88, 17, 71, 15, 70, 64, 50, 0, 64, 45], - [32, 5, 85, 75, 30, 68, 23, 33, 20, 26, 89, 20, 92, 97, 38, 81], - [33, 69, 32, 49, 93, 24, 33, 6, 97, 36, 37, 99], - [82, 78, 78, 65, 22, 1, 87, 42, 36, 26, 27, 56, 82, 32, 8, 80], - [], + [69, 85, 55, 74], + [29, 54, 85, 89], + [33, 30, 64, 59], + [45, 76, 33, 67], + [41, 56, 15, 58], + [28, 17, 6, 37], + [17, 2, 54, 47], + [], # this request is failed due to max sequence length overflow ] mamba_expected_generated_tokens = [ - [74, 72, 9, 59, 1, 70, 15, 89, 30, 52, 82, 70, 64, 16, 83, 5], - [25, 54, 28, 14, 87, 27, 60, 92, 28, 74, 8, 63, 60, 68, 87, 82], - [31, 21, 87, 25, 96, 13, 32, 49, 40, 54, 55, 68, 73, 2, 64, 96], - [72, 80, 35, 72, 77, 85, 98, 36, 4, 97, 37, 46, 79, 95, 83, 25], - [8, 80, 56, 4, 87, 1, 43, 98, 85, 7, 50, 38, 24, 28, 18, 80], - [9, 94, 36, 16, 87, 57, 25, 76, 64, 92, 47, 86, 73, 72, 71, 97], - [17, 5, 62, 66, 15, 52, 32, 75, 66, 18, 90, 14, 67, 37, 94, 33], + [74, 72, 83, 59], + [25, 54, 1, 70], + [28, 14, 15, 89], + [87, 27, 30, 52], + [44, 13, 82, 70], + [28, 74, 64, 16], + [8, 4, 83, 5], [], ] @@ -578,10 +562,6 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None else: raise ValueError(f"Invalid model_provider {model_provider}") - print(f"Validating {len(env.requests)} requests.") - print(f"Expected generated tokens: {expected_generated_tokens_list}") - print(f"Actual generated tokens: {[request.generated_tokens for request in env.requests]}") - assert len(env.requests) == len(expected_generated_tokens_list) for request, expected_generated_tokens in zip(env.requests, expected_generated_tokens_list): @@ -591,6 +571,41 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None f"expected ({expected_generated_tokens})." ) + @pytest.mark.internal + @pytest.mark.skipif( + not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" + ) + def test_overflow_factor(self, model_provider: str = "gpt") -> None: + """Test overflow factor arg.""" + skip_if_mamba_sequence_packing_not_available(model_provider) + + # Run test. + env = self._run_test( + context_buffer_overflow_factor=0.1, + context_max_requests_override=None, + context_max_tokens_override=None, + model_provider=model_provider, + ) + + # Validate max_requests, max_tokens. + if model_provider == "gpt": + assert env.engine.context.max_requests == 420 + assert env.engine.context.max_tokens == 420 + elif model_provider == "mamba": + assert env.engine.context.max_requests == 16 + assert env.engine.context.max_tokens == 16 + + @pytest.mark.internal + @pytest.mark.skipif( + not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" + ) + @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) + def test_request_overflow(self, model_provider: str) -> None: + """Test request overflow.""" + skip_if_mamba_sequence_packing_not_available(model_provider) + + self._run_test(context_max_requests_override=4, model_provider=model_provider) + @pytest.mark.skipif( not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" ) @@ -598,11 +613,7 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None def test_token_overflow_transient(self) -> None: """Test token overflow.""" test_config = DynamicEngineTestConfig( - num_requests=2, - min_prompt_length=512, - max_prompt_length=512, - num_tokens_to_generate=2, - context_max_tokens=900, + num_requests=2, min_prompt_length=8, max_prompt_length=8, context_max_tokens_override=12 ) env = self._build_test_env(test_config) env.engine._add_request(env.requests[0]) @@ -621,7 +632,7 @@ def test_token_overflow_transient(self) -> None: ) def test_token_overflow_nontransient(self) -> None: """Test token overflow (non-transient).""" - test_config = DynamicEngineTestConfig(context_max_tokens=8) + test_config = DynamicEngineTestConfig(context_max_tokens_override=8) env = self._build_test_env(test_config) try: env.engine._add_request(env.requests[0]) @@ -678,21 +689,19 @@ def test_cuda_graph_token_counts(self) -> None: # Test num_cuda_graphs. for num_cuda_graphs, expected_cuda_graph_token_counts in [ - (0, [40]), - (1, [40]), - (2, [40, 24]), - (4, [40, 32, 16]), - (8, [40, 32, 24, 16, 8]), - (16, [40, 32, 24, 16, 8]), - (64, [40, 32, 24, 16, 8]), - (1024, [40, 32, 24, 16, 8]), + (0, [64]), + (1, [64]), + (2, [64, 32]), + (4, [64, 48, 32, 16]), + (8, [64, 56, 48, 40, 32, 24, 16, 8]), + (16, [64, 56, 48, 40, 32, 24, 16, 8]), + (64, [64, 56, 48, 40, 32, 24, 16, 8]), + (1024, [64, 56, 48, 40, 32, 24, 16, 8]), ]: # Build cuda graphs (inside dynamic engine). env = self._build_test_env( - DynamicEngineTestConfig( - context_buffer_size_gb=0.01, num_cuda_graphs=num_cuda_graphs - ) + DynamicEngineTestConfig(num_requests=64, num_cuda_graphs=num_cuda_graphs) ) actual_cuda_graph_token_counts = env.engine.context.cuda_graph_token_counts assert ( @@ -712,7 +721,19 @@ def test_cuda_graph_token_counts(self) -> None: ) @pytest.mark.parametrize( "num_warmup_tokens, expected_cuda_graph_token_count", - [(1, 8), (2, 8), (4, 8), (8, 8), (10, 16), (12, 16), (16, 16)], + [ + (1, 8), + (2, 8), + (4, 8), + (8, 8), + (10, 16), + (12, 16), + (16, 16), + (20, 24), + (24, 24), + (28, 32), + (32, 32), + ], ) @torch.inference_mode() def test_cuda_graph_warmup( @@ -727,16 +748,17 @@ def test_cuda_graph_warmup( # Initialize context. env = self._build_test_env( - DynamicEngineTestConfig( - context_buffer_size_gb=0.0041, num_cuda_graphs=8, num_tokens_to_generate=1 - ) + DynamicEngineTestConfig(num_requests=32, num_cuda_graphs=8, num_tokens_to_generate=1) ) context = env.engine.context assert context.is_decode_only() - assert context.cuda_graph_token_counts == [16, 8], "cuda_graph_token_counts: %s." % str( - context.cuda_graph_token_counts - ) + assert context.cuda_graph_token_counts == [ + 32, + 24, + 16, + 8, + ], "cuda_graph_token_counts: %s." % str(context.cuda_graph_token_counts) context.initialize_attention_state( num_warmup_tokens=num_warmup_tokens, warmup_engine_mode=warmup_engine_mode @@ -829,10 +851,7 @@ def mock_tokenize_prompt(prompt, add_BOS=False): # Call the generate function. # It's safe to use request 0's sampling params here because all sampling # params are identical as long as use_fixed_output_lengths == False. - finished_request_records = env.engine.generate(prompts, env.requests[0].sampling_params) - finished_requests = [ - r.merge(env.engine.controller.tokenizer) for r in finished_request_records - ] + finished_requests = env.engine.generate(prompts, env.requests[0].sampling_params) # Verify results assert len(finished_requests) == len( @@ -882,11 +901,10 @@ async def test_run_engine(self): num_tokens_to_generate = env.requests[ request_id ].sampling_params.num_tokens_to_generate - request_record = fut.result() - request = request_record.merge(env.engine.controller.tokenizer) - assert request.generated_length == num_tokens_to_generate, ( + result = fut.result() + assert result.generated_length == num_tokens_to_generate, ( f"Request {request_id} expected to generate {num_tokens_to_generate} " - f"tokens but generated {request.generated_length}" + f"tokens but generated {result.generated_length}" ) engine_task.cancel() @@ -933,7 +951,6 @@ def test_return_log_probs(self): @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) - @pytest.mark.parametrize("transformer_impl", ["local", "inference_optimized"]) @torch.inference_mode() def test_parallel_inference( self, @@ -943,7 +960,6 @@ def test_parallel_inference( ep_size, sequence_parallel, materialize_only_last_token_logits, - transformer_impl, ): skip_if_mamba_sequence_packing_not_available(model_provider) @@ -959,22 +975,13 @@ def test_parallel_inference( pytest.skip(reason="Sequence parallelism requires tp_size > 1") elif tp_size > 1 and ep_size > 1 and not sequence_parallel: pytest.skip(reason="Sequence parallelism must be used with tp_size > 1 and ep_size > 1") - elif transformer_impl == "inference_optimized": - if ep_size > 1: - pytest.skip( - reason="MoE models are not supported with the inference optimized transformer." - ) - if tp_size > 1 and not sequence_parallel: - pytest.skip( - reason=( - "The inference optimized transformer requires sequence parallelism " - "when tp_size > 1." - ) - ) - if model_provider == "mamba": - pytest.skip( - reason="Mamba model is not supported with the inference optimized transformer." + elif pp_size > 1 and model_provider == "mamba": + pytest.skip( + reason=( + "Running hybrid models with pp_size > 1 and no attention on some " + "pipeline stages is not supported yet." ) + ) env = self._run_test( model_provider=model_provider, @@ -983,7 +990,6 @@ def test_parallel_inference( expert_model_parallel_size=ep_size, sequence_parallel=sequence_parallel, materialize_only_last_token_logits=materialize_only_last_token_logits, - transformer_impl=transformer_impl, ) @pytest.mark.internal @@ -1032,7 +1038,8 @@ def test_events(self): max_prompt_length=10, num_tokens_to_generate=32, context_buffer_size_gb=0.001, # 0.001, # 8 blocks - context_max_tokens=8, + context_max_requests_override=8, + context_max_tokens_override=8, num_gap_steps=1, ) @@ -1081,58 +1088,27 @@ def test_chunked_prefill(self, model_provider: str): materialize_only_last_token_logits=False, model_provider=model_provider, context_block_size_tokens=256, - context_max_tokens=1000, + context_max_tokens_override=300, ) - @pytest.mark.internal - @pytest.mark.skipif( - not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching" - ) - @pytest.mark.skip( - reason="test works in isolation, but memory dynamics change when run " - "within unt test suite." - ) - def test_suspend_resume_memory(self): - - # Run tests. - mem_usages = {} - for suspend_resume_interval in None, 8, 4, 2: # interval 1 acts funny. - - # Run test. - env = self._run_test(suspend_resume_interval=suspend_resume_interval, num_gap_steps=1) - - # Record memory usage. - mem_usages[suspend_resume_interval] = env.mem_usage - - # Clear memory to make recorded memories consistent between tests. - # TODO(@lmcafee): why is memory not automatically cleared? - # env.engine.suspend() # TODO(@lmcafee): useful? - del env - - # Utility methods. - get_alloc = lambda mem_stats: mem_stats["allocated_bytes.all.current"] - - # Validate overall 'end' memory usage. - golden_end_bytes = get_alloc(mem_usages[None]["end"]) - for interval, mem_usage in mem_usages.items(): - current_end_bytes = get_alloc(mem_usage["end"]) - assert math.isclose( - golden_end_bytes, current_end_bytes, rel_tol=0.01 - ), f"{current_end_bytes} != {golden_end_bytes}." - - # Validate 'suspend/resume' memory usage. - get_suspend_resume_bytes = lambda key: list( - get_alloc(list(d["suspend_resume"].values())[-1][key]) - for i, d in mem_usages.items() - if i is not None - ) - suspend_resume_mid_bytes = get_suspend_resume_bytes("mid") - suspend_resume_end_bytes = get_suspend_resume_bytes("end") - for mid_bytes in suspend_resume_mid_bytes: - assert math.isclose( - suspend_resume_mid_bytes[0], mid_bytes, rel_tol=0.01 - ), f"{mid_bytes} != {suspend_resume_mid_bytes[0]}." - for end_bytes in suspend_resume_end_bytes: - assert math.isclose( - suspend_resume_end_bytes[0], end_bytes, rel_tol=0.01 - ), f"{end_bytes} != {suspend_resume_end_bytes[0]}." + +if __name__ == "__main__": + test = TestDynamicInferenceEngine() + test.test_simple(4) + test.test_overflow_factor() + test.test_request_overflow() + test.test_token_overflow_transient() + # test.test_token_overflow_nontransient() # uncomment in megatron-core 0.16 + test.test_block_overflow() + test.test_multi_add() + test.test_fixed_output_lengths() + test.test_cuda_graph_request_counts() + test.test_cuda_graph_warmup(WarmupEngineMode.DECODE, 1, 8) + test.test_generate_function() + asyncio.run(test.test_run_engine()) + test.test_return_log_probs() + test.test_parallel_inference() + # test.test_events() # uncomment in megatron-core 0.16 + test.teardown_method(None) + print("~~~") + print("success.") diff --git a/tests/unit_tests/inference/engines/test_static_engine.py b/tests/unit_tests/inference/engines/test_static_engine.py index 40187a5eff9..699a4d1f473 100644 --- a/tests/unit_tests/inference/engines/test_static_engine.py +++ b/tests/unit_tests/inference/engines/test_static_engine.py @@ -12,11 +12,7 @@ from megatron.core import parallel_state from megatron.core.inference.contexts import StaticInferenceContext from megatron.core.inference.engines import StaticInferenceEngine -from megatron.core.inference.inference_request import ( - DynamicInferenceRequestRecord, - InferenceRequest, - Status, -) +from megatron.core.inference.inference_request import InferenceRequest, Status from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( GPTInferenceWrapper, ) @@ -192,19 +188,12 @@ def test_generate_dynamic(self, batch_size: int, num_trials: int, empty_prompt: prompts = ["" for i in range(batch_size)] else: prompts = ["sample" * (i + 1) for i in range(batch_size)] - results: List[Union[InferenceRequest, DynamicInferenceRequestRecord]] = ( - self.static_engine.generate( - prompts, sampling_params=SamplingParams(num_tokens_to_generate=10) - ) + results: List[InferenceRequest] = self.static_engine.generate( + prompts, sampling_params=SamplingParams(num_tokens_to_generate=10) ) assert len(results) == batch_size for result in results: - if isinstance(result, DynamicInferenceRequestRecord): - result = result.merge(self.static_engine.controller.tokenizer) - assert isinstance(result, InferenceRequest), ( - "expected ; found <%s>." % type(result).__name__ - ) assert ( result.status == Status.COMPLETED ), f"Status should be completed but its {result.status}" diff --git a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py deleted file mode 100644 index 7b4fb4b4250..00000000000 --- a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -import asyncio -import random -import time -from collections import deque -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple - -import pytest -import torch.distributed as dist -from tqdm import tqdm - -from megatron.core.inference.data_parallel_inference_coordinator import ( - DataParallelInferenceCoordinator, -) -from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine, RequestEntry -from megatron.core.inference.inference_client import InferenceClient -from megatron.core.inference.inference_request import ( - DynamicInferenceRequest, - DynamicInferenceRequestRecord, - Status, -) -from megatron.core.inference.sampling_params import SamplingParams -from megatron.core.utils import get_asyncio_loop -from tests.unit_tests.test_utilities import Utils - -try: - import zmq - - HAVE_ZMQ = True -except Exception: - HAVE_ZMQ = False - -IS_ZMQ_FLAKY = True - - -class DummyContext: - """Dummy inference context.""" - - def __init__(self): - self.active_cnt = 0 - - def get_active_request_count(self) -> int: - return self.active_cnt - - -class DummyEngine(DynamicInferenceEngine): - """Dummy inference engine that only implements coordinator-related methods.""" - - def __init__(self): - """We cannot call super().__init__() because it requires complex setup.""" - self.waiting_request_ids = deque() - self.requests: Dict[int, RequestEntry] = {} - self.suspend_signal = False - self.is_suspended = False - self._loop = get_asyncio_loop() - self.context = DummyContext() - self.running = asyncio.Event() - self.paused = asyncio.Event() - self.stopped = asyncio.Event() - self.pending_microbatch = deque() - self.received_pause: bool = False - self.received_stop: bool = False - - def add_request( - self, request_id: int, prompt: str, sampling_params: Optional[SamplingParams] = None - ) -> asyncio.Future[DynamicInferenceRequestRecord]: - """Dummy add_request.""" - - self.requests[request_id] = RequestEntry( - record=DynamicInferenceRequestRecord.from_request( - DynamicInferenceRequest( - prompt=prompt, - request_id=request_id, - sampling_params=sampling_params, - status=Status.WAITING_IN_QUEUE, - ) - ), - future=self._loop.create_future(), - ) - self.waiting_request_ids.append(request_id) - - return self.requests[request_id].future - - async def async_step(self, *, verbose: Optional[bool] = False) -> Dict: - """Dummy async_step.""" - # Finish "active" requests. - finished_request_records = [] - to_remove = [] - for request_id, entry in self.requests.items(): - request = entry.record[-1] - if request.status == Status.ACTIVE_AND_GENERATING_TOKENS: - request.sampling_params.num_tokens_to_generate -= 1 - if request.sampling_params.num_tokens_to_generate > 0: - continue - request.status = Status.COMPLETED - self.context.active_cnt -= 1 - finished_request_records.append(entry.record) - entry.future.set_result(entry.record) - to_remove.append(request_id) - for request_id in to_remove: - del self.requests[request_id] - - # Activate queued requests. They will "process" for 1 step. - active_request_ids = [] - while self.waiting_request_ids: - request_id = self.waiting_request_ids.popleft() - record = self.requests[request_id].record - record[-1].status = Status.ACTIVE_AND_GENERATING_TOKENS - self.context.active_cnt += 1 - active_request_ids.append(request_id) - - return { - "active_request_ids": active_request_ids, - "finished_request_records": finished_request_records, - "step_time": 0.01, - "cuda_graph_request_count": 1, - } - - -@dataclass -class CoordinatorTestConfig: - """Test configuration args.""" - - port: int = 46581 - mp_port: int = 49581 - launch_inference_coordinator: bool = True - stop_engines: bool = True - verify_results: bool = True - - num_requests: int = 10**1 - min_time_offset: float = 10 ** (-4) - max_time_offset: float = 10 ** (-3) - num_steps_to_finish: int = 1 - num_iterations: int = 1 - - tensor_model_parallel_size: int = 1 - pipeline_model_parallel_size: int = 1 - - -@dataclass -class CoordinatorTestEnv: - """Test environment, including requests.""" - - config: CoordinatorTestConfig - requests: List[Tuple] - engine: DummyEngine - responses: List[List[DynamicInferenceRequest]] = field(default_factory=list) - timing_data: Dict[str, Optional[float]] = field( - default_factory=lambda: { - "start_time": None, - "init_time": None, - "done_time": None, - "stop_time": None, - } - ) - - -class TestCoordinator: - - @classmethod - def _build_requests(cls, test_config: CoordinatorTestConfig) -> List[Tuple]: - ret = [] - - for _ in range(test_config.num_requests): - arrival_delta = random.uniform(test_config.min_time_offset, test_config.max_time_offset) - num_tokens = test_config.num_steps_to_finish - ret.append( - ("Hello world!", SamplingParams(num_tokens_to_generate=num_tokens), arrival_delta) - ) - return ret - - @classmethod - def _build_test_env(cls, test_config): - Utils.initialize_model_parallel( - tensor_model_parallel_size=test_config.tensor_model_parallel_size, - pipeline_model_parallel_size=test_config.pipeline_model_parallel_size, - ) - requests = cls._build_requests(test_config) - engine = DummyEngine() - engine.num_steps_to_finish = test_config.num_steps_to_finish - return CoordinatorTestEnv(config=test_config, requests=requests, engine=engine) - - @classmethod - async def _run_test(cls, **test_config_kwargs): - # Test environment. - test_config = CoordinatorTestConfig(**test_config_kwargs) - env = cls._build_test_env(test_config) - - # Connect each engine to their respective processes. - env.timing_data["start_time"] = time.time() - await env.engine.start_listening_to_data_parallel_coordinator( - inference_coordinator_port=test_config.port, - launch_inference_coordinator=test_config.launch_inference_coordinator, - ) - - results_success = False - shutdown_success = False - try: - if dist.get_rank() == 0: - client = InferenceClient(test_config.port) - await client.start() - env.timing_data["init_time"] = time.time() - - all_results = [] - for _ in range(test_config.num_iterations): - futures = [] - for request in tqdm(env.requests, "add_requests"): - prompt, sampling_params, arrival_delta = request - await asyncio.sleep(arrival_delta) - fut = client.add_request(prompt=prompt, sampling_params=sampling_params) - futures.append(fut) - results = await asyncio.wait_for(asyncio.gather(*futures), timeout=10.0) - all_results.append(results) - env.timing_data["done_time"] = time.time() - results_success = True - finally: - try: - if dist.get_rank() == 0: - if test_config.stop_engines: - await asyncio.wait_for(client.stop_engines(), timeout=10.0) - client.stop() - if test_config.stop_engines: - await asyncio.wait_for(env.engine.engine_loop_task, timeout=10.0) - shutdown_success = True - except: - env.engine.engine_loop_task.cancel() - - env.timing_data["stop_time"] = time.time() - - assert results_success, "Did not receive all results successfully." - assert shutdown_success, "Did not shutdown successfully." - if dist.get_rank() == 0: - env.responses = all_results - if test_config.verify_results: - for batch in all_results: - for record in batch: - request = record[-1] - assert request.status == Status.COMPLETED - - return env - - def teardown_method(self, method): - Utils.destroy_model_parallel() - - @pytest.mark.internal - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.asyncio - async def test_simple(self): - """Simple test with no TP or PP.""" - env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) - - @pytest.mark.internal - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.asyncio - async def test_tp(self): - """Simple test with TP, but no PP.""" - env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) - - @pytest.mark.internal - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.asyncio - async def test_pp(self): - """Simple test with no TP, but PP.""" - env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2) - - @pytest.mark.internal - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.asyncio - async def test_tp_pp(self): - """Simple test with both TP and PP.""" - env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2) - - @pytest.mark.internal - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.asyncio - async def test_pp(self): - """Simple test with no TP, but PP.""" - env = await self._run_test(tensor_model_parallel_size=1, pipeline_model_parallel_size=2) - - @pytest.mark.internal - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.asyncio - async def test_tp_pp(self): - """Simple test with both TP and PP.""" - env = await self._run_test(tensor_model_parallel_size=2, pipeline_model_parallel_size=2) - - @pytest.mark.internal - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.asyncio - async def test_pause(self): - """Pause/resume test.""" - test_config = CoordinatorTestConfig( - tensor_model_parallel_size=2, pipeline_model_parallel_size=1, num_requests=32 - ) - env = self._build_test_env(test_config) - - await env.engine.start_listening_to_data_parallel_coordinator( - inference_coordinator_port=test_config.port, launch_inference_coordinator=True - ) - - success = False - try: - if dist.get_rank() == 0: - # Start client as usual. - client = InferenceClient(test_config.port) - await client.start() - - ### TEST 1: Pause after all requests have finished. - futures = [] - for i, request in enumerate(env.requests[:2]): - prompt, sampling_params, _ = request - fut = client.add_request(prompt=prompt, sampling_params=sampling_params) - futures.append(fut) - # Wait a sufficient time for the requests to complete. - await asyncio.sleep(0.1) - # Get a pause awaitable. - to_pause = client.pause_engines() - awaitables = futures + [to_pause] - # Gather all awaitables; assert that the requests actually complete. - try: - await asyncio.wait_for(asyncio.gather(*awaitables), timeout=0.1) - except asyncio.TimeoutError: - pytest.fail("Simple pause did not succeed.") - - ### TEST 2: Ensure that requests can be added while paused. - prompt, sampling_params, _ = env.requests[2] - paused_fut = client.add_request(prompt=prompt, sampling_params=sampling_params) - with pytest.raises(asyncio.TimeoutError): - await asyncio.wait_for(paused_fut, timeout=0.1) - - ### TEST 3: Resume after pause and drain the queued requests. - client.unpause_engines() - # TODO: The system should not be incorrectly raising a cancelled error here. - with pytest.raises(asyncio.CancelledError): - await paused_fut - - ### TEST 4: Add new requests after resume. - futures = [] - for i, request in enumerate(env.requests[3:4]): - prompt, sampling_params, _ = request - fut = client.add_request(prompt=prompt, sampling_params=sampling_params) - futures.append(fut) - # Wait a sufficient time for the requests to complete. - await asyncio.sleep(0.1) - # Gather all awaitables; assert that the requests actually complete. - try: - await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1) - except asyncio.TimeoutError: - pytest.fail("Simple resume did not succeed.") - - ### TEST 5: Pause while requests are being processed. - ### Note: this situation cannot occur in a synchronous system. - if False: - for request in env.engine.requests[4:6]: - request.sampling_params.num_tokens_to_generate = 100 - futures = [] - for i, request in enumerate(env.requests[4:6]): - prompt, sampling_params, _ = request - fut = client.add_request(prompt=prompt, sampling_params=sampling_params) - futures.append(fut) - # Do not wait for the requests to complete. - await client.pause_engines() - # Gather all awaitables; assert that the requests do not complete. - with pytest.raises(asyncio.TimeoutError): - await asyncio.wait_for(asyncio.gather(*futures), timeout=0.1) - success = True - finally: - try: - if dist.get_rank() == 0: - await asyncio.wait_for(client.stop_engines(), timeout=5.0) - client.stop() - await asyncio.wait_for(env.engine.engine_loop_task, timeout=5.0) - except asyncio.TimeoutError: - env.engine.engine_loop_task.cancel() - assert success, "Pause/resume test did not complete successfully." - - @pytest.mark.internal - @pytest.mark.skipif(not HAVE_ZMQ, reason="pyzmq is required for this test") - @pytest.mark.skipif(IS_ZMQ_FLAKY, reason="pyzmq is flaky in CI") - @pytest.mark.asyncio - async def test_throughput(self): - """Throughput test with no TP or PP.""" - import torch - import torch.distributed as dist - - env = await self._run_test( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - num_requests=10**4, - num_iterations=10, - min_time_offset=0.0, - max_time_offset=0.0, - ) - - flags = torch.tensor([1, 1, 1], dtype=torch.int, device=torch.cuda.current_device()) - - init_duration = golden_init_duration = None - run_duration = golden_run_duration = None - stop_duration = golden_stop_duration = None - - if dist.get_rank() == 0: - init_duration = (env.timing_data["init_time"] - env.timing_data["start_time"]) * 10**3 - golden_init_duration = 4445.64 # ms - run_duration = (env.timing_data["done_time"] - env.timing_data["init_time"]) * 10**3 - golden_run_duration = 2906.29 # ms - stop_duration = (env.timing_data["stop_time"] - env.timing_data["done_time"]) * 10**3 - golden_stop_duration = 33.17 # ms - - def clamp_to_golden_value(value, golden_value, delta=0.1): - return value > golden_value * (1 - delta) and value < golden_value * (1 + delta) - - if not clamp_to_golden_value(init_duration, golden_init_duration, delta=0.5): - flags[0] = 0 - if not clamp_to_golden_value(run_duration, golden_run_duration, delta=0.2): - flags[1] = 0 - if not clamp_to_golden_value(stop_duration, golden_stop_duration, delta=1.0): - flags[2] = 0 - - # Synchronize results - dist.broadcast(flags, src=0) - - if dist.get_rank() == 0: - # Print current results. - print(f"Initialization time: {init_duration:.2f} ms") - print(f"Run time: {run_duration:.2f} ms") - print(f"Stop time: {stop_duration:.2f} ms") - - assert flags[0].item() == 1, ( - f"WARNING: Init duration {init_duration:.2f}s deviates from " - f"golden value {golden_init_duration:.2f}s" - ) - assert flags[1].item() == 1, ( - f"WARNING: Run duration {run_duration:.2f}s deviates from " - f"golden value {golden_run_duration:.2f}s" - ) - assert flags[2].item() == 1, ( - f"WARNING: Stop duration {stop_duration:.2f}s deviates from " - f"golden value {golden_stop_duration:.2f}s" - ) - - print( - f"ZMQ throughput is approximately " - f"{env.config.num_requests * env.config.num_iterations / (run_duration):.2f} " - f"requests/ms" - ) - else: - assert flags[0].item() == 1 - assert flags[1].item() == 1 - assert flags[2].item() == 1 - - -if __name__ == "__main__": - test = TestCoordinator() - asyncio.run(test.test_simple()) - asyncio.run(test.test_tp()) - asyncio.run(test.test_pp()) - asyncio.run(test.test_tp_pp()) - asyncio.run(test.test_pause()) - asyncio.run(test.test_throughput()) - test.teardown_method(None) - print("~~~") - print("success.") diff --git a/tests/unit_tests/inference/test_wandb_logging.py b/tests/unit_tests/inference/test_wandb_logging.py index 1d5d054b80e..1512e805f9c 100644 --- a/tests/unit_tests/inference/test_wandb_logging.py +++ b/tests/unit_tests/inference/test_wandb_logging.py @@ -50,6 +50,7 @@ def _get_dynamic_context( max_sequence_length=512, buffer_size_gb=0.03, block_size_tokens=128, + buffer_guaranteed_fraction=0.1, metrics_writer=None, ): """Helper to create a DynamicInferenceContext.""" @@ -61,9 +62,9 @@ def _get_dynamic_context( max_sequence_length=max_sequence_length, num_cuda_graphs=None, buffer_size_gb=buffer_size_gb, + buffer_guaranteed_fraction=buffer_guaranteed_fraction, block_size_tokens=block_size_tokens, metrics_writer=metrics_writer, - unified_memory_level=0, # unit tests currently broken with UVM ) @pytest.mark.internal @@ -82,11 +83,12 @@ def test_get_kvcache_utilization_stats_with_requests(self): assert 'active_utilization' in stats assert 'active_request_count' in stats assert 'paused_request_count' in stats + assert 'gtd_block_count' in stats assert 'block_count_avail' in stats + assert 'num_non_gtd_blocks' in stats assert 'active_token_count' in stats assert 'total_request_count' in stats - assert 'max_total_requests' in stats - assert 'max_active_requests' in stats + assert 'max_requests' in stats # Verify values for empty context assert stats['allocated_blocks'] == 0 @@ -132,11 +134,12 @@ def test_get_kvcache_utilization_stats_with_requests(self): assert stats_after['total_blocks'] == stats['total_blocks'] assert stats_after['total_blocks'] > 0 + # Verify that gtd_block_count remains constant + assert stats_after['gtd_block_count'] == stats['gtd_block_count'] + # Verify that max_requests remains constant - assert stats_after['max_total_requests'] == stats['max_total_requests'] - assert stats_after['max_total_requests'] > 0 - assert stats_after['max_active_requests'] == stats['max_active_requests'] - assert stats_after['max_active_requests'] > 0 + assert stats_after['max_requests'] == stats['max_requests'] + assert stats_after['max_requests'] > 0 # Verify block availability decreased after allocation assert stats_after['block_count_avail'] < stats['block_count_avail'] @@ -144,7 +147,7 @@ def test_get_kvcache_utilization_stats_with_requests(self): # Verify relationship: allocated_blocks + block_count_avail + 1 (dummy) = total assert ( stats_after['allocated_blocks'] + stats_after['block_count_avail'] + 1 - == dynamic_context.block_allocator.total_count + == dynamic_context.block_allocator.block_count_total ) # Verify utilization bounds [0, 1] @@ -177,11 +180,12 @@ def test_kvcache_utilization_stats_types(self): 'active_unique_blocks', 'active_request_count', 'paused_request_count', + 'gtd_block_count', 'block_count_avail', + 'num_non_gtd_blocks', 'active_token_count', 'total_request_count', - 'max_total_requests', - 'max_active_requests', + 'max_requests', ] for field in int_fields: @@ -236,8 +240,8 @@ def test_paused_requests_in_stats(self): max_sequence_length=128, num_cuda_graphs=None, buffer_size_gb=0.01, # Small buffer to force pausing + buffer_guaranteed_fraction=0.1, block_size_tokens=32, - unified_memory_level=0, # unit tests currently broken with UVM ) # Add multiple requests to potentially trigger pausing diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index ee6bc5b2468..10ffe2fdd40 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -80,9 +80,6 @@ def setup_model( fp8="hybrid" if fp8 else None, fp8_recipe="tensorwise" if fp8 else None, fp8_param=fp8, - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - pipeline_dtype=dtype, ) if dtype == torch.bfloat16: transformer_config.bf16 = True @@ -115,15 +112,15 @@ def setup_model( else: inference_context = DynamicInferenceContext( params_dtype=dtype, - num_layers=transformer_config.num_layers // pipeline_model_parallel_size, + num_layers=transformer_config.num_layers, kv_channels=transformer_config.kv_channels, num_attention_heads=transformer_config.num_attention_heads, max_sequence_length=2048, - buffer_size_gb=0.2, + buffer_size_gb=1, + buffer_guaranteed_fraction=0.1, materialize_only_last_token_logits=False, use_flashinfer_fused_rope=None, # default to using flash-infer if available # this is for compatibility with the LTS environment - unified_memory_level=0, # unit tests currently broken with UVM ) inference_wrapped_model = GPTInferenceWrapper( @@ -231,75 +228,41 @@ def detokenize(self, inp, skip_special_tokens=False): sampled_logits >= expected_min_value ), f"The sampled logits should all be greater than {expected_min_value} but its {sampled_logits}" - @pytest.mark.parametrize("backend", ["torch"]) - def test_sample_from_dynamic_logits(self, backend): + def test_sample_from_dynamic_logits(self): batch_size = 12 self.setup_model(torch.float32, batch_size=batch_size, static=False) self.mock_tokenizer.eod = self.vocab_size - context = self.text_generation_controller.inference_wrapped_model.inference_context - context.materialize_only_last_token_logits = True - - # Prepare sampling params in human-readable format, to aid with test maintenance. - sampling_test_cases: List[Tuple[SamplingParams, List[int]]] = [ - (SamplingParams(temperature=0.1, top_p=0.01), [9, 6, 10]), - (SamplingParams(temperature=5.0, top_k=15), [0, 3, 2]), + active_sampling_map: List[Tuple[SamplingParams, List[int]]] = [ + (SamplingParams(top_k=3), [0, 3, 2]), (SamplingParams(top_p=0.8), [4, 1, 7]), - (SamplingParams(temperature=10.0, top_k=5), [11, 5, 8]), + (SamplingParams(top_k=5), [11, 5, 8]), + # (SamplingParams(top_k=5, top_p=0.7), [11, 5, 8]), # uncomment for FlashInfer sampling + (SamplingParams(temperature=2.0), [9, 6, 10]), ] - # For non-torch backends, test simultaneous top_k and top_p sampling. - if backend != "torch": - sampling_test_cases[3][0].top_p = 0.8 - - # Convert sampling params to non-readable format. - rev_sampling_dict: List[SamplingParams] = [None] * batch_size - for sampling_params, indices in sampling_test_cases: + rev_sampling_map: List[SamplingParams] = [None] * batch_size + for sampling_params, indices in active_sampling_map: for idx in indices: - rev_sampling_dict[idx] = sampling_params - - # Prepare metadata for sample bookkeeping. - request_metadata_labels = DynamicInferenceRequest.get_metadata_labels() - request_metadata = torch.empty( - (batch_size, len(request_metadata_labels)), dtype=torch.float32 - ).cuda() - top_k_values = torch.Tensor([s.top_k for s in rev_sampling_dict]).cuda() - request_metadata[:, request_metadata_labels["top_k"]] = top_k_values - top_p_values = torch.Tensor([s.top_p for s in rev_sampling_dict]).cuda() - request_metadata[:, request_metadata_labels["top_p"]] = top_p_values - temp_values = torch.Tensor([s.temperature for s in rev_sampling_dict]).cuda() - request_metadata[:, request_metadata_labels["temperature"]] = temp_values - - # Bookkeeping. - self.text_generation_controller._dynamic_step_sample_bookkeeping( - request_metadata=request_metadata - ) + rev_sampling_map[idx] = sampling_params - # Sampling. - logits = torch.arange(0, self.vocab_size).repeat(batch_size, 1).unsqueeze(0).float().cuda() - sampled_logits = self.text_generation_controller._dynamic_step_sample_logits( - logits, backend=backend + last_token_logits = torch.arange(0, self.vocab_size).repeat(batch_size, 1).float().cuda() + sampled_logits, _ = self.text_generation_controller.sample_from_dynamic_logits( + last_token_logits, active_sampling_map, vocab_size=self.vocab_size ) + top_k_values = torch.Tensor([s.top_k for s in rev_sampling_map]).cuda().unsqueeze(1) + top_k_values[top_k_values == 0] = self.vocab_size + top_p_values = torch.Tensor([s.top_p for s in rev_sampling_map]).cuda().unsqueeze(1) + temp_values = torch.Tensor([s.temperature for s in rev_sampling_map]).cuda().unsqueeze(1) vocab_indices = torch.arange(self.vocab_size).cuda() - top_k_values[top_k_values == 0] = self.vocab_size assert torch.all( sampled_logits >= self.vocab_size - top_k_values ), f"The sampled logits should all be greater than {self.vocab_size - top_k_values} but its {sampled_logits}" - l = logits.squeeze(0) - sampled_l = l.div(temp_values.unsqueeze(1)).softmax(dim=-1) - top_k_mask = vocab_indices.unsqueeze(0) < (self.vocab_size - top_k_values.unsqueeze(1)) + l = last_token_logits[0] + sampled_l = l.div(temp_values).softmax(dim=-1) + top_k_mask = vocab_indices.unsqueeze(0) < (self.vocab_size - top_k_values) sampled_l.masked_fill_(top_k_mask, 0.0) - top_p_mask = sampled_l.cumsum(dim=-1) > top_p_values.unsqueeze(1) - - first_excluded = torch.where( - top_p_mask.any(dim=-1), - top_p_mask.float().argmax(dim=-1), - torch.full((batch_size,), self.vocab_size, device=top_p_mask.device), - ) - last_included = torch.clamp(first_excluded - 1, min=0) - start_idx = torch.clamp(self.vocab_size - top_k_values, min=0).long() - last_included = torch.max(last_included, start_idx) - expected_min_values = l.gather(1, last_included.unsqueeze(1)).squeeze(1) + expected_min_values = sampled_l[sampled_l.cumsum(dim=-1) > top_p_values].amax(dim=-1) assert torch.all( sampled_logits >= expected_min_values ), f"The sampled logits should all be greater than {expected_min_values} but its {sampled_logits}" @@ -810,15 +773,14 @@ def test_sampled_tokens_match_with_parallelism(self, static, tp_size, pp_size): ), ) ) - expected_active_requests = set(int(x) for x in active_requests.keys()) + sampling_params = SamplingParams(top_k=10, return_log_probs=True, termination_id=-1) + sampling_map = [(sampling_params, list(range(len(active_requests))))] while context.has_unfinished_requests(): - result = self.text_generation_controller.generate_output_tokens_dynamic_batch() + result = self.text_generation_controller.generate_output_tokens_dynamic_batch( + active_sampling_map=sampling_map + ) new_tokens = result["sample"] - active_ids = result["active_request_ids"].tolist() - finished_ids = result["finished_request_ids"].tolist() - assert len(new_tokens) == len(expected_active_requests) - assert set(active_ids) == expected_active_requests - expected_active_requests -= set(finished_ids) + assert len(new_tokens) == len(active_requests) for i, token in enumerate(new_tokens.tolist()): all_generated_tokens[i].append(token) diff --git a/tests/unit_tests/test_checkpointing.py b/tests/unit_tests/test_checkpointing.py index 4bbf54301f5..194f9721300 100644 --- a/tests/unit_tests/test_checkpointing.py +++ b/tests/unit_tests/test_checkpointing.py @@ -9,8 +9,6 @@ import torch import torch.distributed.checkpoint -from megatron.core.distributed import DistributedDataParallelConfig -from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel from megatron.core.num_microbatches_calculator import ( init_num_microbatches_calculator, unset_num_microbatches_calculator, @@ -25,7 +23,6 @@ _load_base_checkpoint, get_checkpoint_tracker_filename, load_checkpoint, - read_metadata, save_checkpoint, ) from megatron.training.global_vars import set_args @@ -54,9 +51,6 @@ def __init__(self, state_dict): self.is_stub_optimizer = False self._called_metadata = [] - # Optimizers are expected to have this attribute for checkpointing. - self.param_groups = [] - def state_dict(self, is_loading=False): return self._state_dict @@ -117,8 +111,6 @@ def create_args(): args.retro_add_retriever = False args.ckpt_convert_update_legacy_dist_opt_format = False args.ckpt_step = None - args.swiglu = True - args.num_experts = 1 yield args @@ -199,7 +191,7 @@ def test_load_base_checkpoint( assert ckpt_type == expected_ckpt_type -@pytest.mark.parametrize("ckpt_format", ["torch", "torch_dcp", "fsdp_dtensor"]) +@pytest.mark.parametrize("ckpt_format", ["torch", "torch_dcp"]) def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, ckpt_format): """Test save_checkpoint.""" args = create_args @@ -215,15 +207,6 @@ def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, c config = TransformerConfig(num_layers=1, kv_channels=1) model = MockModel(config) optimizer = MockState({"optimizer": "optimizer_state"}) - if ckpt_format == "fsdp_dtensor": - model = FullyShardedDataParallel( - config=config, - ddp_config=DistributedDataParallelConfig( - use_distributed_optimizer=True, use_megatron_fsdp=True - ), - module=model, - ) - optimizer = MockState({"state": {}}) opt_param_scheduler = MockState({"opt_param_scheduler": "scheduler_state"}) num_floating_point_operations_so_far = 456 @@ -243,7 +226,7 @@ def test_save_checkpoint(init_model_parallel, create_args, tmp_path_dist_ckpt, c expected_ckpt_path = None if ckpt_format == "torch": expected_ckpt_path = ckpt_dir / "mp_rank_00" / "model_optim_rng.pt" - elif ckpt_format in ["torch_dcp", "fsdp_dtensor"]: + elif ckpt_format == "torch_dcp": expected_ckpt_path = ckpt_dir / ".metadata" assert os.path.exists(expected_ckpt_path) @@ -354,27 +337,3 @@ def test_dist_checkpoint_versioning(init_model_parallel, tmp_path_dist_ckpt, cre first_job_mock_metadata, second_job_mock_metadata, ] - - -@pytest.mark.parametrize( - "metadata_content,expected_iter,expected_release", - [ - ("456", 456, False), # Normal iteration - ("release", 0, True), # Release checkpoint should return iteration=1 - ("123", 123, False), # Another normal iteration - ], -) -def test_read_metadata_non_distributed(tmp_path, metadata_content, expected_iter, expected_release): - """Test read_metadata without torch.distributed initialized.""" - test_dir = tmp_path / "test_read_metadata_non_distributed" - test_dir.mkdir(parents=True, exist_ok=True) - tracker_file = test_dir / "latest_checkpointed_iteration.txt" - - with open(tracker_file, "w") as f: - f.write(metadata_content) - - with mock.patch('torch.distributed.is_initialized', return_value=False): - max_iter, release = read_metadata(str(tracker_file)) - - assert max_iter == expected_iter, f"Expected iteration {expected_iter}, got {max_iter}" - assert release == expected_release, f"Expected release={expected_release}, got {release}" diff --git a/tests/unit_tests/test_process_groups_config.py b/tests/unit_tests/test_process_groups_config.py index 013bc6746d4..032de47e951 100644 --- a/tests/unit_tests/test_process_groups_config.py +++ b/tests/unit_tests/test_process_groups_config.py @@ -67,29 +67,6 @@ def test_hierarchical_context_parallel_groups(self, mocker): assert model_pgs.hcp[0] == mock_pg1 assert model_pgs.hcp[1] == mock_pg2 - def test_repr(self, mocker): - """Test __repr__ shows active process groups and their sizes.""" - tp_size = 4 - pp_size = 2 - mock_tp = mocker.Mock(spec=dist.ProcessGroup) - mock_tp.size.return_value = tp_size - mock_pp = mocker.Mock(spec=dist.ProcessGroup) - mock_pp.size.return_value = pp_size - - # Test empty collection - empty_pgs = ProcessGroupCollection() - assert repr(empty_pgs) == "ProcessGroupCollection(empty)" - - # Test collection with process groups - model_pgs = ProcessGroupCollection() - model_pgs.tp = mock_tp - model_pgs.pp = mock_pp - - repr_str = repr(model_pgs) - assert "ProcessGroupCollection(" in repr_str - assert f"tp({tp_size})" in repr_str - assert f"pp({pp_size})" in repr_str - class TestPGConfigDefaultInitialization: diff --git a/tests/unit_tests/test_rl_utils.py b/tests/unit_tests/test_rl_utils.py deleted file mode 100644 index 5ea89ff2a02..00000000000 --- a/tests/unit_tests/test_rl_utils.py +++ /dev/null @@ -1,656 +0,0 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - -from unittest.mock import patch - -import torch - -from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig -from megatron.core.models.common.language_module.language_module import LanguageModule -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec -from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.transformer import TransformerConfig -from megatron.core.transformer.module import Float16Module -from megatron.rl import rl_utils -from megatron.rl.agent.api import TokenRollout -from megatron.training import arguments, global_vars -from tests.unit_tests.test_utilities import Utils - -BATCH = 2 -SEQ = 4 -VOCAB = 754 - - -class MockModel(LanguageModule): - def __init__(self, batch=BATCH, seq=SEQ, vocab=VOCAB): - self.batch = batch - self.seq = seq - self.vocab = vocab - self.config = TransformerConfig(num_attention_heads=1, num_layers=1) - - def __call__(self, x, position_ids, attention_mask, **kwargs): - del position_ids - del attention_mask - batch, seq = x.shape - mock_model_outputs = torch.ones((batch, seq, self.vocab), device=x.device) - return mock_model_outputs - - def load_state_dict(self, params): - del params - - def train(self, mode=True): - del mode - - def state_dict(self): - return {} - - -class MockTokenizer: - def __init__(self): - self.pad = 42 - self.eod = 43 - self.vocab_size = VOCAB - self.bos = None - - def detokenize(self, tokens): - return [str(tok) for tok in tokens] - - -def test_get_logprobs(): - """Test that getting logprobs at least does not crash.""" - # We use args inside of get_logprobs, we need to initialize them. - args = arguments.parse_args(ignore_unknown_args=True) - global_vars.set_args(args) - - tokens = torch.ones((BATCH, SEQ), dtype=torch.long) - logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None) - # We chop off 1 element from the sequence dimension. - assert logprobs.shape == (BATCH, SEQ - 1) - # As we return ones as logits, all logprobs should be the same. - assert torch.all(logprobs == logprobs[0, 0]).item() - - -def test_get_logprobs_with_sequence_packing(): - """Test that getting logprobs at least does not crash.""" - # We use args inside of get_logprobs, we need to initialize them. - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'rl_use_sequence_packing', True) - global_vars.set_args(args) - - tokens = torch.ones((BATCH, SEQ), dtype=torch.long) - logprobs = rl_utils.get_logprobs(MockModel(), tokens, position_ids=None, attention_mask=None) - # We chop off 1 element from the sequence dimension. - assert logprobs.shape == (BATCH, SEQ - 1) - # As we return ones as logits, all logprobs should be the same. - assert torch.all(logprobs == logprobs[0, 0]).item() - - -def test_prepare_trajectories(): - # Make sure sequence packing is disabled for this test - import megatron.training.global_vars as global_vars - - old_args = global_vars.get_args() if global_vars.get_args() is not None else None - - # Create minimal args without sequence packing - args = type('Args', (), {})() - args.rl_use_sequence_packing = False - args.rl_inference_logprobs_is_correction = True - global_vars.set_args(args) - - tokenizer = MockTokenizer() - r1 = TokenRollout( - trajectory=[1, 2, tokenizer.eod], - reward=3.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], - env_id='MEGAENV', - problem_id="2", - ) - r2 = TokenRollout( - trajectory=[1, 2, tokenizer.eod], - reward=0.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], - env_id='MEGAENV', - problem_id="2", - ) - rollouts = [[r1, r2]] - seq_len = 7 - - trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) - - # Check that inference logprobs are being returned. - torch.testing.assert_close(inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3])) - torch.testing.assert_close(inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3])) - - expected_mask = torch.tensor( - [ - [False, True, True, False, False, False, False], - [False, True, True, False, False, False, False], - ] - ) - torch.testing.assert_close(genmask, expected_mask) - - expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 43, 42, 42, 42, 42]]) - torch.testing.assert_close(trajs, expected_trajs) - - -def test_prepare_trajectories_with_packing(): - """Test that rollouts data is properly prepared with sequence packing enabled.""" - # Initialize args for sequence packing - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'micro_batch_size', 1) - setattr(args, 'global_batch_size', 1) - setattr(args, 'rl_use_sequence_packing', True) - global_vars.set_args(args) - - tokenizer = MockTokenizer() - r1 = TokenRollout( - trajectory=[1, 2, tokenizer.eod], - reward=3.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], - env_id='MEGAENV', - problem_id="2", - ) - r2 = TokenRollout( - trajectory=[1, 2, 3, tokenizer.eod], - reward=0.14, - generation_mask=[False, True, True, True], - logprobs=[0.1, 0.2, 0.3, -1.2], - env_id='MEGAENV', - problem_id="2", - ) - rollouts = [[r1, r2]] - seq_len = 7 - - trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) - - # With sequence packing, inference logprobs should be padded to same length - assert isinstance(inference_logprobs, torch.Tensor) - assert inference_logprobs.shape == (2, 7) # 2 sequences, each padded to seq_len - - # Check values (padded with zeros) - torch.testing.assert_close( - inference_logprobs[0], torch.tensor([0.1, 0.2, 0.3, 0.0, 0.0, 0.0, 0.0]) - ) - torch.testing.assert_close( - inference_logprobs[1], torch.tensor([0.1, 0.2, 0.3, -1.2, 0.0, 0.0, 0.0]) - ) - - expected_mask = torch.tensor( - [ - [False, True, True, False, False, False, False], - [False, True, True, True, False, False, False], - ] - ) - torch.testing.assert_close(genmask, expected_mask) - - expected_trajs = torch.tensor([[1, 2, 43, 42, 42, 42, 42], [1, 2, 3, 43, 42, 42, 42]]) - torch.testing.assert_close(trajs, expected_trajs) - - -def test_grpo_loss_calculation_all_pi_eq(): - # All policies are equal: clamping is inactive, ratios are ones. - current_logprobs = torch.ones(BATCH, SEQ) - old_logprobs = torch.ones(BATCH, SEQ) - ref_logprobs = torch.ones(BATCH, SEQ) - advantages = torch.zeros(BATCH) - loss, kl_term, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss( - current_logprobs=current_logprobs, - old_logprobs=old_logprobs, - ref_logprobs=ref_logprobs, - advantages=advantages, - clamp_eps_lower=0.1, - clamp_eps_upper=0.1, - kl_beta=0.1, - entropy_weight=0.0, - ) - torch.testing.assert_close(loss, torch.zeros_like(loss)) - torch.testing.assert_close(kl_term, torch.zeros_like(kl_term)) - torch.testing.assert_close(ratios, torch.ones_like(ratios)) - torch.testing.assert_close(entropy_term, torch.ones_like(ratios) * torch.e) - - -def test_grpo_loss_calculation_2x_ratios(): - # All policies are equal: clamping is inactive, ratios are ones. - current_logprobs = torch.ones(BATCH, SEQ) - old_logprobs = torch.ones(BATCH, SEQ) - torch.log(torch.Tensor([2])) - ref_logprobs = torch.ones(BATCH, SEQ) - advantages = torch.ones(BATCH) - loss, kl_term, ratios, _, _, _ = rl_utils.calculate_grpo_loss( - current_logprobs=current_logprobs, - old_logprobs=old_logprobs, - ref_logprobs=ref_logprobs, - advantages=advantages, - clamp_eps_lower=2.1, - clamp_eps_upper=2.1, - kl_beta=0.0, - entropy_weight=0.0, - ) - # Clamping does not affect us, as 2.1 [eps] > 2 [ratio]. - # kl_beta = 0 -> we only have the non-kl term of the loss active. - torch.testing.assert_close(loss, -torch.ones_like(loss) * 2) - # pi and pi_{ref} are the same here. - torch.testing.assert_close(kl_term, torch.zeros_like(kl_term)) - # Current probs are 2x more probable than old pi. - torch.testing.assert_close(ratios, torch.ones_like(ratios) * 2) - - -def test_entropy_calculation(): - # All policies are equal: clamping is inactive, ratios are ones. - current_logprobs = torch.ones(BATCH, SEQ) - old_logprobs = torch.ones(BATCH, SEQ) - ref_logprobs = torch.ones(BATCH, SEQ) - advantages = torch.zeros(BATCH) - loss, _, ratios, entropy_term, _, _ = rl_utils.calculate_grpo_loss( - current_logprobs=current_logprobs, - old_logprobs=old_logprobs, - ref_logprobs=ref_logprobs, - advantages=advantages, - clamp_eps_lower=0.1, - clamp_eps_upper=0.1, - kl_beta=0.0, - entropy_weight=1.0, - ) - torch.testing.assert_close(loss, torch.ones_like(ratios) * torch.e) - torch.testing.assert_close(entropy_term, torch.ones_like(ratios) * torch.e) - - -def test_grpo_loss_truncation(): - - # All ratios are 2 - _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( - current_logprobs=torch.ones(BATCH, SEQ), - old_logprobs=0.5 * torch.ones(BATCH, SEQ), - ref_logprobs=torch.ones(BATCH, SEQ), - advantages=torch.zeros(BATCH), - clamp_eps_lower=0.1, - clamp_eps_upper=0.1, - kl_beta=0.1, - entropy_weight=0.0, - ) - assert truncated_from_above.float().mean() == 1 - assert truncated_from_below.float().sum() == 0 - - # All ratios are 0.01 - _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( - current_logprobs=0.01 * torch.ones(BATCH, SEQ), - old_logprobs=torch.ones(BATCH, SEQ), - ref_logprobs=torch.ones(BATCH, SEQ), - advantages=torch.zeros(BATCH), - clamp_eps_lower=0.1, - clamp_eps_upper=0.1, - kl_beta=0.1, - entropy_weight=0.0, - ) - assert truncated_from_above.float().sum() == 0 - assert truncated_from_below.float().mean() == 1 - - current_logprobs = torch.tensor([[1.0, 1.0], [1.0, 1.0]]) - old_logprobs = torch.tensor([[0.5, 2.0], [0.05, 1.0]]) - _, _, _, _, truncated_from_above, truncated_from_below = rl_utils.calculate_grpo_loss( - current_logprobs=current_logprobs, - old_logprobs=old_logprobs, - ref_logprobs=old_logprobs, - advantages=torch.zeros(BATCH), - clamp_eps_lower=0.1, - clamp_eps_upper=0.1, - kl_beta=0.1, - entropy_weight=0.0, - ) - # ratios: [[2., 0.5],[20., 1.]] - torch.testing.assert_close(truncated_from_above, torch.tensor([[True, False], [True, False]])) - torch.testing.assert_close(truncated_from_below, torch.tensor([[False, True], [False, False]])) - - -@patch('megatron.rl.rl_utils.mpu') -def test_prepare_data_for_update(mock_mpu): - """Test that getting logprobs at least does not crash.""" - mock_mpu.get_expert_data_parallel_world_size.return_value = 0 - # We use args inside of get_logprobs, we need to initialize them. - - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'data_parallel_size', 1) - setattr(args, 'micro_batch_size', 2) - setattr(args, 'global_batch_size', 2) - setattr(args, 'seq_length', 4) - setattr(args, 'curr_iteration', 1) - global_vars.unset_global_variables() - global_vars.set_global_variables(args, build_tokenizer=False) - - model = MockModel() - tokenizer = MockTokenizer() - - r1 = TokenRollout( - trajectory=[1, 2, 3], - reward=3.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], - env_id='MEGAENV', - problem_id="2", - ) - r2 = TokenRollout( - trajectory=[1, 2, 3, 4], - reward=0.14, - generation_mask=[False, True, True, True], - logprobs=[0.1, 0.2, 0.3, -1.2], - env_id='MEGAENV', - problem_id="2", - ) - rollouts = [[r1, r2]] - try: - data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) - except AssertionError as e: - # We expect trajectories to come padded there. - assert str(e).startswith('Rollout is not the correct length') - - r1 = TokenRollout( - trajectory=torch.Tensor([1, 2, 3, tokenizer.eod]).cuda(), - reward=3.14, - generation_mask=torch.Tensor([False, True, True, True]).cuda(), - logprobs=torch.Tensor([-0.2, -0.3, -3.2]).cuda(), - env_id='MEGAENV', - problem_id="2", - ) - r2 = TokenRollout( - trajectory=torch.Tensor([1, 2, 234, tokenizer.eod]).cuda(), - reward=0.14, - generation_mask=torch.Tensor([False, True, True, True]).cuda(), - logprobs=torch.Tensor([-0.2, -0.3, -1.2]), - env_id='MEGAENV', - problem_id="2", - ) - rollouts = [[r1, r2]] - data_iter = rl_utils.prepare_data_for_update([model], {}, rollouts, tokenizer) - - _, _, old_logprobs, _, _, _, _ = next(data_iter) - # All logits are ones in the MockModel. - # All probabilities should be uniform. - torch.testing.assert_close(old_logprobs.exp(), torch.ones_like(old_logprobs) / VOCAB) - - -def test_sequence_packing_basic(): - """Test basic sequence packing functionality.""" - # Initialize args as required by SequencePacker - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'seq_length', 16) - global_vars.set_args(args) - - tokenizer = MockTokenizer() - bin_size = 16 - packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) - - # Create test sequences of varying lengths, all padded to same length - max_len = 5 - sequences = [ - torch.cat( - [ - torch.tensor([1, 2, 3, tokenizer.eod]), - torch.full((1,), tokenizer.pad, dtype=torch.long), - ] - ), # length 4 -> 5 - torch.cat( - [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)] - ), # length 3 -> 5 - torch.tensor([6, 7, 8, 9, tokenizer.eod]), # length 5 - torch.cat( - [torch.tensor([10, tokenizer.eod]), torch.full((3,), tokenizer.pad, dtype=torch.long)] - ), # length 2 -> 5 - ] - - generation_masks = torch.tensor( - [ - [False, True, True, True, False], # Matches padded length - [False, True, True, False, False], - [False, True, True, True, True], - [False, True, False, False, False], - ] - ) - - rewards = torch.tensor([1.0, 2.0, 3.0, 4.0]) - - # Pack sequences - packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( - packer.pack_sequences(sequences, generation_masks) - ) - - # Verify packed data structure - assert packed_trajs is not None - assert packed_position_ids is not None - assert packed_attention_mask is not None - assert packed_loss_mask is not None - assert packing_info is not None - - # Check that sequences fit in bins properly - # The packer trims sequences to their actual length (removing padding) - # Actual lengths: 4, 3, 5, 2 = 14 total tokens - # With bin_size=16, this should fit in 1 bin - assert packed_trajs.shape[0] >= 1 # At least one bin - assert packed_trajs.shape[1] == bin_size - - # Verify position_ids are correct - for bin_idx in range(packed_trajs.shape[0]): - # Check that position_ids reset for each sequence in the bin - for i in range(packed_trajs.shape[1]): - if i == 0 or packed_trajs[bin_idx, i - 1] == tokenizer.eod: - # Start of a new sequence - if packed_trajs[bin_idx, i] != tokenizer.pad: - assert packed_position_ids[bin_idx, i] == 0 - - -def test_sequence_packing_with_generation_masks(): - """Test sequence packing with generation masks.""" - # Initialize args as required by SequencePacker - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'seq_length', 20) - global_vars.set_args(args) - - tokenizer = MockTokenizer() - bin_size = 20 - packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) - - # Create test data with generation masks - sequences = [torch.tensor([1, 2, 3, tokenizer.eod]), torch.tensor([4, 5, 6, 7, tokenizer.eod])] - - # Pad sequences to same length for stacking - max_len = max(len(s) for s in sequences) - padded_sequences = [] - for seq in sequences: - padded = torch.cat([seq, torch.full((max_len - len(seq),), tokenizer.pad, dtype=seq.dtype)]) - padded_sequences.append(padded) - - generation_masks = torch.tensor( - [ - [False, True, True, True, False], # Padded to match max_len - [False, True, True, True, True], - ] - ) - - # Pack sequences - packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( - packer.pack_sequences(padded_sequences, generation_masks) - ) - - # Verify packed tensors - assert packed_trajs.shape[0] == 1 # One bin - assert packed_trajs.shape[1] == bin_size - - # Check that loss mask is set correctly for generation tokens - # The loss mask should be 1 for generation tokens and 0 for padding/prompt - - -def test_sequence_packing_empty_bins(): - """Test that empty bins are created correctly.""" - # Initialize args if needed - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'seq_length', 8) - global_vars.set_args(args) - - tokenizer = MockTokenizer() - bin_size = 8 - num_empty_bins = 3 - - # Create a simple packed data structure - packed_trajs = torch.tensor( - [[1, 2, 3, tokenizer.eod, tokenizer.pad, tokenizer.pad, tokenizer.pad, tokenizer.pad]] - ) - packed_position_ids = torch.tensor([[0, 1, 2, 3, 0, 0, 0, 0]]) - packed_loss_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float) - packed_attention_mask = torch.ones(1, bin_size, bin_size) # Simple full attention mask - - # Create empty bins - empty_trajs, empty_position_ids, empty_loss_mask, empty_attention_mask, empty_packing_info = ( - rl_utils.create_empty_bins( - num_empty_bins=num_empty_bins, - bin_size=bin_size, - packed_trajs=packed_trajs, - packed_position_ids=packed_position_ids, - packed_loss_mask=packed_loss_mask, - packed_attention_mask=packed_attention_mask, - tokenizer=tokenizer, - ) - ) - - # Verify shapes - assert empty_trajs.shape[0] == num_empty_bins - assert empty_trajs.shape[1] == bin_size - - # Check that empty bins are filled with padding - for i in range(num_empty_bins): - assert torch.all(empty_trajs[i] == tokenizer.pad) - assert torch.all(empty_position_ids[i] == 0) - assert torch.all(empty_loss_mask[i] == 0) - - # Verify packing info for empty bins - assert len(empty_packing_info) == num_empty_bins - for info in empty_packing_info: - assert len(info['bin_seq_indices']) == 0 # No sequences in empty bins - assert len(info['seq_starts']) == 0 # No sequence starts - - -def test_prepare_trajectories_with_sequence_packing(): - """Test prepare_trajectories with sequence packing enabled.""" - # Set up args with sequence packing - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'rl_use_sequence_packing', True) - setattr(args, 'rl_sequence_packing_bin_size', 16) - setattr(args, 'data_parallel_size', 1) - setattr(args, 'micro_batch_size', 2) - setattr(args, 'global_batch_size', 2) - setattr(args, 'seq_length', 16) - setattr(args, 'curr_iteration', 1) - global_vars.unset_global_variables() - global_vars.set_global_variables(args, build_tokenizer=False) - - tokenizer = MockTokenizer() - - # Create rollouts of varying lengths - r1 = TokenRollout( - trajectory=[1, 2, tokenizer.eod], - reward=3.14, - generation_mask=[False, True, True], - logprobs=[0.1, 0.2, 0.3], - env_id='MEGAENV', - problem_id="1", - ) - r2 = TokenRollout( - trajectory=[4, 5, 6, 7, tokenizer.eod], - reward=0.14, - generation_mask=[False, True, True, True, True], - logprobs=[0.4, 0.5, 0.6, 0.7, 0.8], - env_id='MEGAENV', - problem_id="2", - ) - r3 = TokenRollout( - trajectory=[8, 9, tokenizer.eod], - reward=2.71, - generation_mask=[False, True, True], - logprobs=[0.9, 1.0, 1.1], - env_id='MEGAENV', - problem_id="3", - ) - - rollouts = [[r1, r2, r3]] - seq_len = 16 - - # Call prepare_trajectories with sequence packing - trajs, genmask, inference_logprobs = rl_utils.prepare_trajectories(rollouts, tokenizer, seq_len) - - # With sequence packing enabled but called from prepare_trajectories, - # it might still return individual sequences (not packed into bins yet) - # because the actual packing happens later in prepare_data_for_update - assert trajs.shape[0] == 3 # Three sequences - assert trajs.shape[1] == seq_len - - # Verify that each sequence is properly padded - # Sequence 1: [1, 2, eod, pad] + padding - assert trajs[0, 0] == 1 - assert trajs[0, 1] == 2 - assert trajs[0, 2] == tokenizer.eod - assert trajs[0, 3] == tokenizer.pad - - # Sequence 2: [4, 5, 6, 7, eod, pad] + padding - assert trajs[1, 0] == 4 - assert trajs[1, 1] == 5 - assert trajs[1, 4] == tokenizer.eod - assert trajs[1, 5] == tokenizer.pad - - -def test_sequence_packing_integration(): - """Simple integration test for sequence packing - just verifies the packing works.""" - # Initialize minimal args needed for SequencePacker - args = arguments.parse_args(ignore_unknown_args=True) - setattr(args, 'seq_length', 16) - global_vars.set_args(args) - - tokenizer = MockTokenizer() - bin_size = 16 - - # Test that we can pack sequences and get expected outputs - packer = rl_utils.SequencePacker(bin_size=bin_size, pad_token=tokenizer.pad) - - # Create test data - need to pad to same length for stacking - max_len = 5 - sequences = [ - torch.cat( - [ - torch.tensor([1, 2, 3, tokenizer.eod]), - torch.full((1,), tokenizer.pad, dtype=torch.long), - ] - ), # length 4 -> 5 - torch.cat( - [torch.tensor([4, 5, tokenizer.eod]), torch.full((2,), tokenizer.pad, dtype=torch.long)] - ), # length 3 -> 5 - torch.tensor([6, 7, 8, 9, tokenizer.eod]), # length 5 - ] - generation_masks = [ - torch.tensor([False, True, True, True, False]), - torch.tensor([False, True, True, False, False]), - torch.tensor([False, True, True, True, True]), - ] - - # Pack sequences - packed_trajs, packed_position_ids, packed_attention_mask, packed_loss_mask, packing_info = ( - packer.pack_sequences(sequences, generation_masks) - ) - - # Basic assertions - assert packed_trajs is not None - assert packed_trajs.shape[1] == bin_size # Each bin should be bin_size - assert packed_position_ids.shape == packed_trajs.shape - assert packed_loss_mask.shape == packed_trajs.shape - - # Verify the sequences are packed correctly - # Total length: 4 + 3 + 5 = 12, should fit in 1 bin - assert packed_trajs.shape[0] == 1 - - # The packer sorts sequences by length (descending), so order is: seq3 (len 5), seq1 (len 4), seq2 (len 3) - expected_start = torch.tensor( - [6, 7, 8, 9, tokenizer.eod, 1, 2, 3, tokenizer.eod, 4, 5, tokenizer.eod] - ) - assert torch.all(packed_trajs[0, :12] == expected_start) - - # Rest should be padding - assert torch.all(packed_trajs[0, 12:] == tokenizer.pad) diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 6a155920e2f..4b4cfa567c5 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -417,10 +417,7 @@ def is_hybrid_ep_available(): return HAVE_HYBRIDEP -@pytest.mark.skipif( - not is_deep_ep_available() and not is_hybrid_ep_available(), - reason="Deep EP and Hybrid EP are not available", -) +@pytest.mark.skipif(True, reason="Deep EP and Hybrid EP are not available") class TestFlexDispatcher: def setup_method(self, method): pass diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py index dda2b8284b3..01e5ab58898 100644 --- a/tools/run_inference_performance_test.py +++ b/tools/run_inference_performance_test.py @@ -24,8 +24,9 @@ from megatron.core.inference.text_generation_controllers.text_generation_controller import ( TextGenerationController, ) +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols from megatron.core.transformer.module import MegatronModule -from megatron.core.utils import get_mamba_inference_state_config_from_model +from megatron.core.utils import get_attr_wrapped_model from model_provider import model_provider sys.path.append( @@ -88,7 +89,14 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs moe_pad_experts_for_cuda_graph_inference=args.moe_pad_experts_for_cuda_graph_inference, ) - mamba_inference_state_config = get_mamba_inference_state_config_from_model(model) + # Layer type list for hybrid models + decoder = get_attr_wrapped_model(model, "decoder") + layer_type_list = getattr(decoder, "layer_type_list", None) + if layer_type_list is not None and Symbols.MAMBA in layer_type_list: + (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request() + else: + mamba_conv_states_shape = None + mamba_ssm_states_shape = None if args.engine_type == "static": inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) @@ -121,7 +129,9 @@ def get_inference_engine(args: argparse.Namespace, model: MegatronModule) -> Abs block_size_tokens=args.inference_dynamic_batching_block_size, tensor_model_parallel_size=args.tensor_model_parallel_size, materialize_only_last_token_logits=not args.return_log_probs, - mamba_inference_state_config=mamba_inference_state_config, + layer_type_list=layer_type_list, + mamba_conv_states_shape=mamba_conv_states_shape, + mamba_ssm_states_shape=mamba_ssm_states_shape, cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents, kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None, qk_pos_emb_head_dim=args.qk_pos_emb_head_dim, diff --git a/train_rl.py b/train_rl.py index bf632d81e2c..479498d392a 100644 --- a/train_rl.py +++ b/train_rl.py @@ -191,7 +191,7 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False): seq_lengths = None attention_mask = None - if args.rl_use_sequence_packing: + if args.use_sequence_packing: # Get bin index from data iterator bin_tensor = batch_data[0] bin_idx = bin_tensor.item() From 6ca67bc4a345d56fc047998b32b8c807d84c7402 Mon Sep 17 00:00:00 2001 From: Li Tao Date: Mon, 1 Dec 2025 11:45:40 +0800 Subject: [PATCH 162/248] [Dev] Support packed seq in MTP (#2043) Signed-off-by: Li Tao Signed-off-by: lit --- megatron/core/models/gpt/gpt_model.py | 14 +- .../transformer/multi_token_prediction.py | 118 +++++++++- .../test_multi_token_prediction.py | 208 +++++++++++++++++- 3 files changed, 331 insertions(+), 9 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index e840fca99b3..ce1e8e76bd9 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -575,9 +575,19 @@ def _postprocess( runtime_gather_output=runtime_gather_output, ) # Calc loss for the current Multi-Token Prediction (MTP) layers. - mtp_labels, _ = roll_tensor(mtp_labels, shifts=-1, dims=-1, cp_group=self.cp_group) + mtp_labels, _ = roll_tensor( + mtp_labels, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, + ) loss_mask, num_tokens = roll_tensor( - loss_mask, shifts=-1, dims=-1, cp_group=self.cp_group + loss_mask, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, ) mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) mtp_loss = loss_mask * mtp_loss diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index e79af23ef04..a8f4abfcdd3 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -126,7 +126,7 @@ def tie_output_layer_state_dict( ) -def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None): +def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None, packed_seq_params=None): """Roll the tensor input along the sequence dimension with Context Parallelism (CP) support. This function extends the original roll_tensor to support Context Parallelism, which allows @@ -138,15 +138,24 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None): For CP>1: Splits tensor into chunks, performs rolling within each chunk, then exchanges boundary elements between adjacent CP ranks to maintain sequence continuity. + For packed sequences: Respects sequence boundaries when rolling to avoid mixing tokens + from different sequences. + Args: tensor (Tensor): The input tensor to roll. shifts (int): The shift of the tensor (typically -1 for MTP). dims (int): The dimension to roll (typically -1 for sequence dimension). cp_group (ProcessGroup): The context parallelism process group. If None or size=1, falls back to standard rolling behavior. + packed_seq_params (PackedSeqParams): Parameters for packed sequence processing. + If provided, respects sequence boundaries. Returns: tuple: (rolled_tensor, sum_of_rolled_tensor) """ + # Handle packed sequences cases + if packed_seq_params is not None: + return _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group) + # Standard rolling behavior when CP is not enabled (cp_group is None or size=1) if cp_group is None or cp_group.size() == 1: rolled_tensor = torch.roll(tensor, shifts=shifts, dims=dims) @@ -215,6 +224,91 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None): return rolled_tensor, rolled_tensor.sum() +def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=None): + """Roll tensor with packed sequence support. + This function handles rolling for packed sequences by respecting sequence boundaries + """ + + # Notice: This is a naive implementation to test the correctness, + # a better solution will only sync the boundary tokens once. + assert ( + dims == -1 or dims == tensor.dim() - 1 + ), "Packed sequence roll only supports the last dimension." + assert shifts == -1, "Packed sequence roll only supports a single-token left shift." + cu_seqlens = packed_seq_params.cu_seqlens_q + assert cu_seqlens is not None, "Packed sequence parameters must provide cu_seqlens_q." + + rolled_tensor = tensor.clone() + + cp_size = cp_group.size() if cp_group is not None else 1 + if cp_size == 1: + # CP disabled: roll each packed sequence independently within its boundaries + for i in range(len(cu_seqlens) - 1): + start_idx = cu_seqlens[i] + end_idx = cu_seqlens[i + 1] + seq_slice = tensor[..., start_idx:end_idx] + rolled_seq = torch.roll(seq_slice, shifts=shifts, dims=dims) + # Zero out the last position(s) that would cross sequence boundaries + rolled_seq[..., shifts:] = 0 + rolled_tensor[..., start_idx:end_idx] = rolled_seq + return rolled_tensor, rolled_tensor.sum() + + # CP enabled: each rank owns two chunks per sequence (front and mirrored tail). + local_rank = torch.distributed.get_rank(group=cp_group) + global_ranks = torch.distributed.get_process_group_ranks(group=cp_group) + next_rank = global_ranks[(local_rank + 1) % cp_size] + prev_rank = global_ranks[(local_rank - 1) % cp_size] + + # Iterate over each sequence individually + for i in range(len(cu_seqlens) - 1): + start_idx = cu_seqlens[i] + end_idx = cu_seqlens[i + 1] + + # the idx has been multiplied by cp_size, need to divide it by cp_size to get the local idx + local_start_idx = start_idx // cp_size + local_end_idx = end_idx // cp_size + tensor_slice = rolled_tensor[..., local_start_idx:local_end_idx].clone() + + # The following code is very similar as the code in roll_tensor function + local_chunks = tensor_slice.chunk(2, dim=dims) + rolled_chunks = [torch.roll(chunk, shifts=shifts, dims=dims) for chunk in local_chunks] + + tensor_send_list = [] + tensor_recv_list = [] + for chunk in rolled_chunks: + boundary = chunk.select(dims, shifts).contiguous().clone() + tensor_send_list.append(boundary) + tensor_recv_list.append(torch.empty_like(boundary)) + + ops = [] + if local_rank != 0: + ops.append(torch.distributed.isend(tensor=tensor_send_list[0], dst=prev_rank)) + ops.append(torch.distributed.irecv(tensor=tensor_recv_list[1], src=prev_rank)) + else: + tensor_recv_list[1].zero_() + + if local_rank != cp_size - 1: + ops.append(torch.distributed.irecv(tensor=tensor_recv_list[0], src=next_rank)) + ops.append(torch.distributed.isend(tensor=tensor_send_list[1], dst=next_rank)) + else: + tensor_recv_list[0].copy_(tensor_send_list[1]) + + for op in ops: + op.wait() + + index = [slice(None)] * rolled_chunks[0].dim() + index[dims] = shifts + for chunk, recv in zip(rolled_chunks, tensor_recv_list): + chunk[tuple(index)] = recv + + seq_result = torch.cat(rolled_chunks, dim=dims) + + # update the rolled tensor + rolled_tensor[..., local_start_idx:local_end_idx] = seq_result + + return rolled_tensor, rolled_tensor.sum() + + class MTPLossLoggingHelper: """Helper class for logging MTP losses.""" @@ -595,6 +689,7 @@ def _get_embeddings( position_ids: torch.Tensor, embedding: Callable, hidden_states: torch.Tensor, + packed_seq_params: Optional[PackedSeqParams] = None, ): """ Preprocesses input data for the Multi-Token Prediction (MTP) layers. @@ -609,10 +704,23 @@ def _get_embeddings( from gpt model to compute the decoder input. hidden_states (torch.Tensor): hidden states tensor of shape [s, b, h] where s is the sequence length, b is the batch size, and h is the hidden size. + packed_seq_params (PackedSeqParams): Parameters for packed sequence processing. """ # Calc logits for the current Multi-Token Prediction (MTP) layers. - input_ids, _ = roll_tensor(input_ids, shifts=-1, dims=-1, cp_group=self.cp_group) - position_ids, _ = roll_tensor(position_ids, shifts=-1, dims=-1, cp_group=self.cp_group) + input_ids, _ = roll_tensor( + input_ids, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, + ) + position_ids, _ = roll_tensor( + position_ids, + shifts=-1, + dims=-1, + cp_group=self.cp_group, + packed_seq_params=packed_seq_params, + ) # embedding decoder_input = embedding(input_ids=input_ids, position_ids=position_ids) @@ -795,15 +903,13 @@ def forward( [s, b, h], and optionally the updated context tensor if cross-attention is used. """ assert context is None, f"multi token prediction + cross attention is not yet supported." - assert ( - packed_seq_params is None - ), f"multi token prediction + sequence packing is not yet supported." input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings( input_ids=input_ids, position_ids=position_ids, embedding=embedding, hidden_states=hidden_states, + packed_seq_params=packed_seq_params, ) if self.config.recompute_granularity == 'full' and self.training: diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py index 9b9d2c67881..ddfa9bfba16 100644 --- a/tests/unit_tests/transformer/test_multi_token_prediction.py +++ b/tests/unit_tests/transformer/test_multi_token_prediction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os import sys @@ -14,11 +14,14 @@ ) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.parallel_state import get_context_parallel_group from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.multi_token_prediction import ( MTPLossLoggingHelper, MultiTokenPredictionBlock, + roll_tensor, ) from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import is_te_min_version @@ -245,6 +248,66 @@ def get_batch(self, seq_length, micro_batch_size): } return batch + def get_packed_batch(self, seq_lengths, micro_batch_size): + """ + Create a packed sequence batch with multiple sequences of varying lengths. + + Args: + seq_lengths: List of sequence lengths (e.g., [10, 15, 8] for 3 sequences) + micro_batch_size: Batch size (typically 1 for packed sequences) + + Returns: + batch: Dictionary containing packed sequences and PackedSeqParams + """ + total_seq_length = sum(seq_lengths) + + # Create packed input_ids, labels, and position_ids + input_ids_list = [] + labels_list = [] + position_ids_list = [] + + for seq_len in seq_lengths: + data = list(range(seq_len)) + input_ids_list.extend(data) + labels_list.extend([x + 1 for x in data]) + position_ids_list.extend(data) + + # Convert to tensors with shape [batch, total_seq_length] + input_ids = torch.tensor(input_ids_list, dtype=torch.int64).unsqueeze(0).cuda() + labels = torch.tensor(labels_list, dtype=torch.int64).unsqueeze(0).cuda() + position_ids = torch.tensor(position_ids_list, dtype=torch.int64).unsqueeze(0).cuda() + + # Create attention mask for packed sequences (all ones for simplicity) + attention_mask = torch.ones( + (micro_batch_size, 1, total_seq_length, total_seq_length), dtype=bool + ).cuda() + + # Create loss mask with shape [batch, total_seq_length] + loss_mask = torch.ones(micro_batch_size, total_seq_length).cuda() + + # Create cumulative sequence lengths for PackedSeqParams + cu_seqlens = torch.tensor( + [0] + [sum(seq_lengths[: i + 1]) for i in range(len(seq_lengths))], dtype=torch.int32 + ).cuda() + + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max(seq_lengths), + max_seqlen_kv=max(seq_lengths), + qkv_format='thd', + ) + + batch = { + 'tokens': input_ids, + 'labels': labels, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'packed_seq_params': packed_seq_params, + } + return batch + @pytest.mark.skipif( not HAVE_TE or not is_te_min_version("2.1.0"), reason="grouped_gemm requires TransformerEngine >= 2.1.0", @@ -404,6 +467,149 @@ def test_fp8_support(self, full_recompute): loss = output.mean() loss.backward() + @pytest.mark.skipif( + not HAVE_TE or not is_te_min_version("2.1.0"), + reason="grouped_gemm requires TransformerEngine >= 2.1.0", + ) + @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1), (2, 2)]) + def test_packed_sequences(self, tp, cp): + """Test MTP with packed sequences.""" + # Create args with packed sequences support + seq_lengths = [16, 24, 12] # Three sequences of different lengths + total_seq_length = sum(seq_lengths) + + args = self.create_test_args(tp, cp, total_seq_length, micro_batch_size=1) + set_args(args) + + torch.manual_seed(_SEED) + Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp) + + # Get packed batch + batch = self.get_packed_batch(seq_lengths, micro_batch_size=1) + tokens = batch['tokens'] + labels = batch['labels'] + loss_mask = batch['loss_mask'] + attention_mask = batch['attention_mask'] + position_ids = batch['position_ids'] + packed_seq_params = batch['packed_seq_params'] + + # Create model + gpt_model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + + # Forward pass with packed sequences + output = gpt_model[0].forward( + input_ids=tokens, + position_ids=position_ids, + attention_mask=attention_mask, + labels=labels, + loss_mask=loss_mask, + packed_seq_params=packed_seq_params, + ) + + # Verify output shape + assert output.shape[0] == 1 # batch size + assert output.shape[1] == total_seq_length + + # Verify MTP loss was computed + tracker = MTPLossLoggingHelper.tracker + assert "values" in tracker + mtp_loss = tracker['values'].clone() + assert mtp_loss.shape[0] == args.mtp_num_layers + MTPLossLoggingHelper.clean_loss_in_tracker() + + # Backward pass + loss = output.mean() + loss.backward() + + # Verify gradients exist + for name, param in gpt_model[0].named_parameters(): + assert param.main_grad is not None, f"Gradient missing for {name}" + + @pytest.mark.parametrize("cp", [1, 2]) + def test_roll_tensor_with_packed_sequences(self, cp): + """Test roll_tensor function with packed sequences, with and without CP. + + For CP=1: Tests standard packed sequence rolling with verified expected values + For CP=2: Tests CP-enabled rolling executes without errors + """ + Utils.initialize_model_parallel(tensor_model_parallel_size=1, context_parallel_size=cp) + cp_group = get_context_parallel_group() if cp > 1 else None + cp_rank = torch.distributed.get_rank(group=cp_group) if cp_group is not None else 0 + + if cp == 1: + # Test case: Simple packed sequences (CP disabled) + tensor = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32).cuda() + cu_seqlens = torch.tensor([0, 3, 5], dtype=torch.int32).cuda() + + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=3, + max_seqlen_kv=3, + qkv_format='thd', + ) + + # Roll by -1 (shift left) + rolled, sum_val = roll_tensor( + tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + + # Expected: [2, 3, 0, 5, 0] - boundaries at indices 2 and 4 are zeroed + expected = torch.tensor([2, 3, 0, 5, 0], dtype=torch.float32).cuda() + assert torch.equal(rolled, expected), f"Expected {expected}, got {rolled}" + else: + # Test case: Packed sequences with CP=2 + # Two sequences: + # seq1 = [1, 2, 3, 4, 5, 6, 7, 8] + # seq2 = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] + + if cp_rank == 0: + # CP Rank 0: first half of each sequence + tensor = torch.tensor( + [1, 2, 7, 8, 11, 12, 13, 20, 21, 22], dtype=torch.float32 + ).cuda() + expected = torch.tensor( + [2, 3, 8, 0, 12, 13, 14, 21, 22, 0], dtype=torch.float32 + ).cuda() + else: + # CP Rank 1: second half of each sequence + tensor = torch.tensor( + [3, 4, 5, 6, 14, 15, 16, 17, 18, 19], dtype=torch.float32 + ).cuda() + expected = torch.tensor( + [4, 5, 6, 7, 15, 16, 17, 18, 19, 20], dtype=torch.float32 + ).cuda() + + cu_seqlens = torch.tensor([0, 8, 20], dtype=torch.int32).cuda() + + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=6, # max(4, 6) - max local seq length per sequence + max_seqlen_kv=6, + qkv_format='thd', + ) + + # Roll by -1 (shift left) with CP communication + rolled, sum_val = roll_tensor( + tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params + ) + + # Verify the rolled tensor matches expected values + assert ( + rolled.shape == expected.shape + ), f"Shape mismatch: expected {expected.shape}, got {rolled.shape}" + assert torch.equal( + rolled, expected + ), f"CP Rank {cp_rank}: Expected\n{expected}\nbut got\n{rolled}\nDiff:\n{rolled - expected}" + + # Verify sum is correct + assert sum_val.numel() == 1, "Sum should be a scalar" + + Utils.destroy_model_parallel() + class TestMTPLossLoggingHelper: def setup_method(self, method): From 11caf01283f4b3e17f12807099a1aad04ff3a9c2 Mon Sep 17 00:00:00 2001 From: Santosh Bhavani Date: Sun, 30 Nov 2025 20:49:11 -0800 Subject: [PATCH 163/248] Fix runaway Etpt in straggler detector by resetting FLOPs accumulator (#2128) Signed-off-by: Santosh Bhavani Co-authored-by: Li Ruixiao --- megatron/training/training.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/megatron/training/training.py b/megatron/training/training.py index 9986f931641..9fe372a3780 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1929,6 +1929,7 @@ def post_training_step_callbacks( # Straggler detector. if iteration % args.log_interval == 0 and args.log_straggler: + # Use FLOPs accumulated since last log event and then reset the counter stimer.report(num_floating_point_operations_since_last_log_event, args.log_interval) num_floating_point_operations_since_last_log_event = 0.0 @@ -1970,6 +1971,9 @@ def post_training_step_callbacks( if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0: gc.collect() + # Return updated FLOPs accumulator so caller can persist the reset + return num_floating_point_operations_since_last_log_event + def checkpoint_and_decide_exit( model, @@ -2585,8 +2589,9 @@ def get_e2e_base_metrics(): energy_monitor.resume() # Miscellaneous post-training-step functions (e.g., FT heartbeats, GC). - # Some of these only happen at specific iterations. - post_training_step_callbacks( + # Some of these only happen at specific iterations. Capture updated FLOPs accumulator + # (it is reset inside the callback after logging). + num_floating_point_operations_since_last_log_event = post_training_step_callbacks( model, optimizer, opt_param_scheduler, From 92c8482e6dcd11c3666c61bb8d1f7e8d0730ed13 Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Mon, 1 Dec 2025 13:09:36 +0800 Subject: [PATCH 164/248] [Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353) Signed-off-by: Robin Zhang --- .../text_generation_controller.py | 3 +- .../common/language_module/language_module.py | 5 +- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/pipeline_parallel/schedules.py | 7 +- megatron/core/ssm/mamba_block.py | 3 +- megatron/core/transformer/attention.py | 4 +- megatron/core/transformer/cuda_graphs.py | 47 +++++-- megatron/core/transformer/enums.py | 12 ++ megatron/core/transformer/moe/fused_a2a.py | 8 ++ megatron/core/transformer/moe/moe_utils.py | 7 +- .../core/transformer/moe/token_dispatcher.py | 12 +- .../core/transformer/transformer_block.py | 4 +- .../core/transformer/transformer_config.py | 112 +++++++++-------- .../core/transformer/transformer_layer.py | 47 +++---- megatron/training/arguments.py | 18 ++- megatron/training/training.py | 9 +- .../inference/engines/test_dynamic_engine.py | 12 +- tests/unit_tests/test_fp8_param.py | 24 ++-- .../transformer/test_cuda_graphs.py | 117 ++++++++++++------ 19 files changed, 302 insertions(+), 153 deletions(-) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 2bda1425710..6e00f58ac23 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -29,6 +29,7 @@ ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel from megatron.core.utils import get_asyncio_loop, get_model_config, unwrap_model @@ -851,7 +852,7 @@ def generate_all_output_tokens_static_batch( # Check whether CUDA graphs are enabled enable_cuda_graph = ( model_config.cuda_graph_impl == "local" - and "full_iteration" not in model_config.cuda_graph_scope + and CudaGraphScope.full_iteration not in model_config.cuda_graph_scope ) # Pad batch tokens if necessary diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index de2ecfb8011..259bb716a93 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -21,7 +21,7 @@ is_vp_last_stage, ) from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group @@ -144,8 +144,7 @@ def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: # Use is_cg_capturable=True for full iteration CUDA graphs to avoid torch.equal checks is_cg_capturable = ( hasattr(self.config, 'cuda_graph_scope') - and self.config.cuda_graph_scope - and 'full_iteration' in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration in self.config.cuda_graph_scope ) if is_cg_capturable and not is_te_min_version("2.7.0"): from megatron.core.utils import get_te_version diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index ce1e8e76bd9..a3d1a8bfc00 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -24,7 +24,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region -from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.enums import CudaGraphScope, ModelType from megatron.core.transformer.multi_token_prediction import ( MTPLossAutoScaler, MTPLossLoggingHelper, @@ -374,7 +374,7 @@ def _preprocess( and ( ( self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index d0b912349b4..18344429c45 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -21,6 +21,7 @@ ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import create_cudagraphs +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler from megatron.core.utils import ( drain_embedding_wgrad_compute, @@ -656,7 +657,7 @@ def forward_backward_no_pipelining( if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() @@ -1923,7 +1924,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() nvtx_range_pop(suffix="misc") @@ -2310,7 +2311,7 @@ def enable_grad_sync(): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index 1bcadd0af10..3201a8bfb28 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -25,6 +25,7 @@ from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module @@ -294,7 +295,7 @@ def forward( ( ( self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 74031f38219..57ba494742b 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -45,7 +45,7 @@ from ..models.common.embeddings.yarn_rotary_pos_embedding import ( _yarn_get_concentration_factor_from_config, ) -from .enums import AttnMaskType +from .enums import AttnMaskType, CudaGraphScope from .transformer_config import TransformerConfig try: @@ -828,7 +828,7 @@ def forward( if ( in_decode_mode and self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope and inference_context.is_static_batching() ): raise ValueError(f"CUDA graphs must use flash decode with static batching!") diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 12f15ee980a..5b0a0333d9e 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -21,6 +21,7 @@ get_all_rng_states, get_cuda_rng_tracker, ) +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig @@ -1344,24 +1345,24 @@ def _layer_is_graphable(layer, config): from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_layer import TransformerLayer - if isinstance(layer, MambaLayer) and 'mamba' in config.cuda_graph_scope: + if isinstance(layer, MambaLayer) and CudaGraphScope.mamba in config.cuda_graph_scope: # mamba layer. return True if isinstance(layer, TransformerLayer): - if 'attn' in config.cuda_graph_scope and not ( + if CudaGraphScope.attn in config.cuda_graph_scope and not ( isinstance(layer.self_attention, IdentityOp) and isinstance(layer.cross_attention, IdentityOp) ): # attn layer. return True if ( - 'moe' in config.cuda_graph_scope - or 'moe_router' in config.cuda_graph_scope - or 'moe_preprocess' in config.cuda_graph_scope + CudaGraphScope.moe in config.cuda_graph_scope + or CudaGraphScope.moe_router in config.cuda_graph_scope + or CudaGraphScope.moe_preprocess in config.cuda_graph_scope ) and isinstance(layer.mlp, MoELayer): # moe layer. return True - if 'mlp' in config.cuda_graph_scope and isinstance(layer.mlp, MLP): + if CudaGraphScope.mlp in config.cuda_graph_scope and isinstance(layer.mlp, MLP): # mlp layer. return True return False @@ -1388,7 +1389,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using " "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True." ) - assert "full_iteration" not in config.cuda_graph_scope, ( + assert CudaGraphScope.full_iteration not in config.cuda_graph_scope, ( "full_iteration cuda graph is not supported for cuda_graph_impl=transformer_engine. " "Please use cuda_graph_impl=local instead." ) @@ -1529,7 +1530,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): and not isinstance(layer.self_attention, IdentityOp) and ( not self.config.cuda_graph_scope - or 'attn' in self.config.cuda_graph_scope + or CudaGraphScope.attn in self.config.cuda_graph_scope ) ) if is_te_min_version("1.10.0"): @@ -1712,3 +1713,33 @@ def cuda_graph_set_manual_hooks(self): model_chunk = self.model[chunk_number] for layer in layers: layer.setup_manual_hooks(model_chunk._make_forward_pre_hook) + + def delete_cuda_graphs(self): + """ + Delete all CUDA graphs. + """ + assert self._graphs_created, "CUDA Graphs have not been created." + + graph_resettable = is_te_min_version("2.10.0") + graphs_reset, graphs_not_reset = 0, 0 + for layers in self.callables_per_chunk: + for layer in layers: + for graph in layer.cuda_graphs: + if graph_resettable: + graph.reset() + graphs_reset += 1 + else: + graphs_not_reset += 1 + layer.cuda_graphs = [] + layer.cuda_graph_manual_hooks = [] + + log_on_each_pipeline_stage( + logger=logger, + tp_group=None, + dp_cp_group=None, + level=logging.INFO, + msg=f'Rank {torch.distributed.get_rank()}: ' + f'{graphs_reset} graphs deleted with explicit reset, ' + f'{graphs_not_reset} graphs deleted without explicit reset.', + ) + self._graphs_created = False diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py index 52b82029f90..d06d58d65f2 100644 --- a/megatron/core/transformer/enums.py +++ b/megatron/core/transformer/enums.py @@ -65,3 +65,15 @@ class AttnBackend(enum.Enum): unfused = 3 local = 4 auto = 5 + + +class CudaGraphScope(enum.Enum): + """Cuda Graph Scope - defines which parts of the model to capture.""" + + full_iteration = 1 # Captures the entire training/inference iteration + attn = 2 # Captures attention layers + mlp = 3 # Captures MLP layers (dense layers only) + moe = 4 # Captures MoE layers (drop-and-pad MoE layers only) + moe_router = 5 # Captures MoE router part + moe_preprocess = 6 # Captures MoE preprocessing part (requires moe_router) + mamba = 7 # Captures Mamba layers diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 60b0b11a32c..045a93039b3 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -320,6 +320,14 @@ def init_hybrid_ep_buffer( ) +def reset_hybrid_ep_buffer(): + ''' + Reset the HybridEP buffer + ''' + global _hybrid_ep_buffer + _hybrid_ep_buffer = None + + class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index d28cbfea3fe..3ed31d375e2 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -11,6 +11,7 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig try: @@ -1205,13 +1206,13 @@ def maybe_raise_signal(moe_layer, **kwargs): ): if ( step_condition == "route" - and 'moe_router' in moe_layer.config.cuda_graph_scope - and 'moe_preprocess' not in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_router in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_preprocess not in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "route", **kwargs) elif ( step_condition == "preprocess" - and 'moe_preprocess' in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_preprocess in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "preprocess", **kwargs) diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index b2135fdb00d..af8ae572adb 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -16,6 +16,7 @@ gather_from_sequence_parallel_region, reduce_scatter_to_sequence_parallel_region, ) +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.fused_a2a import ( fused_combine, fused_dispatch, @@ -436,7 +437,7 @@ def __init__( } if ( config.cuda_graph_impl == "transformer_engine" - and 'moe_preprocess' in config.cuda_graph_scope + and CudaGraphScope.moe_preprocess in config.cuda_graph_scope ): self.cuda_dtoh_point = "before_ep_alltoall" else: @@ -1075,10 +1076,13 @@ def combine( num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) - # Release the used handle/num_permuted_tokens which could change in each iteration + # Release the used handle/num_permuted_tokens which could change in each iteration. + # For drop_and_pad mode, we don't need to reset the num_permuted_tokens and + # num_dispatched_tokens, because their values never change. self.handle = None - self.num_permuted_tokens = None - self.num_dispatched_tokens = None + if not self.drop_and_pad: + self.num_permuted_tokens = None + self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 6f69927e9e8..023db1fe75a 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -21,7 +21,7 @@ ) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import LayerType +from megatron.core.transformer.enums import CudaGraphScope, LayerType from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig @@ -555,7 +555,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): kwargs.get('inference_context') is not None or kwargs.get('inference_params') is not None ) - and 'full_iteration' in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index fae2e2f5d4d..cc714e9ac15 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -9,7 +9,7 @@ from megatron.core.enums import Fp4Recipe, Fp8Recipe from megatron.core.quantization.quant_config import RecipeConfig -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout from ..fusions.fused_bias_geglu import quick_gelu @@ -711,7 +711,7 @@ class TransformerConfig(ModelParallelConfig): excluding optimizer) is enabled. "transformer_engine": capture the CUDA graph using TE make_graphed_callables().""" - cuda_graph_scope: Optional[List[str]] = None + cuda_graph_scope: Optional[List[CudaGraphScope]] = None """Determines the CUDA graphs capturing scope. When cuda_graph_impl is set to "transformer_engine", valid values are "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba". None means the full layer. @@ -1593,65 +1593,76 @@ def __post_init__(self): 'use cuda_graph_impl=transformer_engine instead.' ) self.cuda_graph_impl = "transformer_engine" + if self.cuda_graph_scope is None: self.cuda_graph_scope = [] + elif not isinstance(self.cuda_graph_scope, list): + if isinstance(self.cuda_graph_scope, CudaGraphScope): + self.cuda_graph_scope = [self.cuda_graph_scope] + else: + assert isinstance(self.cuda_graph_scope, str), ( + "cuda_graph_scope must be a string that can be converted to a list of " + f"CudaGraphScope, got {self.cuda_graph_scope}." + ) + self.cuda_graph_scope = self.cuda_graph_scope.split(',') + if all(isinstance(scope, str) for scope in self.cuda_graph_scope): + # Backward compatibility for "full" scope. Now we use an empty list instead. + if "full" in self.cuda_graph_scope: + assert self.cuda_graph_scope == [ + "full" + ], "full scope cannot be used with other scopes." + warnings.warn( + "full scope is deprecated. " + "Use empty cuda_graph_scope to capture the whole layer." + ) + self.cuda_graph_scope = [] + else: + self.cuda_graph_scope = [CudaGraphScope[scope] for scope in self.cuda_graph_scope] + assert all( + isinstance(scope, CudaGraphScope) for scope in self.cuda_graph_scope + ), f"cuda_graph_scope must be a list of CudaGraphScope, got {self.cuda_graph_scope}." + if self.cuda_graph_impl != "none": assert self.cuda_graph_impl in [ "transformer_engine", "local", ], f"Invalid cuda graph implementation: {self.cuda_graph_impl}" + if self.cpu_offloading: raise ValueError("CUDA graphs not supported with CPU offloading.") - elif not isinstance(self.cuda_graph_scope, list): - assert isinstance(self.cuda_graph_scope, str), ( - "cuda_graph_scope must be a string or a list of strings, " - f"got {self.cuda_graph_scope}." - ) - self.cuda_graph_scope = [self.cuda_graph_scope] - if self.cuda_graph_impl == "local": - assert not self.cuda_graph_scope or self.cuda_graph_scope == ["full_iteration"], ( - "For local cuda graph implementation, the only valid value " - "for cuda_graph_scope is full_iteration. " - "To use other scopes, use cuda_graph_impl=transformer_engine." + assert not self.cuda_graph_scope or self.cuda_graph_scope == [ + CudaGraphScope.full_iteration + ], ( + "For local cuda graph implementation, the only valid value for " + "cuda_graph_scope is full_iteration, or an empty list to denote layerwise " + "graphs. To use other scopes, use cuda_graph_impl=transformer_engine." ) if self.cuda_graph_impl == "transformer_engine": - assert "full_iteration" not in self.cuda_graph_scope, ( + assert CudaGraphScope.full_iteration not in self.cuda_graph_scope, ( "To use full iteration cuda graph, please use " - "cuda_graph_impl=transformer_engine instead of cuda_graph_impl=local." + "cuda_graph_impl=local instead of cuda_graph_impl=transformer_engine." ) - for scope in self.cuda_graph_scope: - assert scope in [ - 'attn', - 'mlp', - 'moe', - 'moe_router', - 'moe_preprocess', - 'mamba', - ], ( - "--cuda-graph-scope should be attn, mlp, moe, moe_router, moe_preprocess, " - f"or mamba, got {self.cuda_graph_scope}." - ) - assert ( - 'moe' not in self.cuda_graph_scope or 'moe_router' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope + or CudaGraphScope.moe_router not in self.cuda_graph_scope ), 'cuda_graph_scope must not contain both moe and moe_router.' - if 'moe_preprocess' in self.cuda_graph_scope: + if CudaGraphScope.moe_preprocess in self.cuda_graph_scope: assert ( - 'moe_router' in self.cuda_graph_scope + CudaGraphScope.moe_router in self.cuda_graph_scope ), 'moe_preprocess cuda graph is only supported with moe_router cuda graph.' if self.num_moe_experts is None or self.num_moe_experts <= 1: assert ( - 'moe' not in self.cuda_graph_scope - and 'moe_router' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope + and CudaGraphScope.moe_router not in self.cuda_graph_scope ), 'moe cuda graph is only supported for MoE.' else: if self.moe_layer_freq == 1 or ( isinstance(self.moe_layer_freq, list) and 0 not in self.moe_layer_freq ): - assert 'mlp' not in self.cuda_graph_scope, ( + assert CudaGraphScope.mlp not in self.cuda_graph_scope, ( 'mlp cuda graph is only supported for dense layers, ' 'but not found in the model.' ) @@ -1660,13 +1671,13 @@ def __post_init__(self): or not self.moe_pad_expert_input_to_capacity ): assert ( - 'moe' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope ), 'moe cuda graph is only supported with drop-padding MoE.' if self.moe_token_dispatcher_type == 'alltoall' and ( self.moe_expert_capacity_factor is not None or self.moe_router_padding_for_quantization ): - assert 'moe_preprocess' not in self.cuda_graph_scope, ( + assert CudaGraphScope.moe_preprocess not in self.cuda_graph_scope, ( 'moe_preprocess cuda graph is not supported when there are ' 'DtoH copies and synchronizations in the preprocess step.' ) @@ -1676,25 +1687,28 @@ def __post_init__(self): raise ValueError( "Full-layer CUDA graphs not supported with activation recomputation." ) - elif self.cuda_graph_scope != ['full_iteration']: + elif self.cuda_graph_scope != [CudaGraphScope.full_iteration]: # For scoped CUDA graphs, only the non-graphed parts of the layer can be # recomputed. So check if there are overlaps between the recomputed parts # and the graphed parts. - if "attn" in self.cuda_graph_scope: + if CudaGraphScope.attn in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['core_attn', 'mla_up_proj']: raise ValueError( f'attn cuda graph is not supported with {module} recompute.' ) - if "mlp" in self.cuda_graph_scope and "mlp" in self.recompute_modules: + if ( + CudaGraphScope.mlp in self.cuda_graph_scope + and "mlp" in self.recompute_modules + ): raise ValueError(f'mlp cuda graph is not supported with mlp recompute.') - if "moe" in self.cuda_graph_scope: + if CudaGraphScope.moe in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe_act', 'moe', 'shared_experts']: raise ValueError( f'moe cuda graph is not supported with {module} recompute.' ) - if "moe_router" in self.cuda_graph_scope: + if CudaGraphScope.moe_router in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe', 'shared_experts']: raise ValueError( @@ -1703,25 +1717,25 @@ def __post_init__(self): ) if "layernorm" in self.recompute_modules: if ( - "attn" in self.cuda_graph_scope - and "mlp" in self.cuda_graph_scope + CudaGraphScope.attn in self.cuda_graph_scope + and CudaGraphScope.mlp in self.cuda_graph_scope and ( - "moe" in self.cuda_graph_scope - or "moe_router" in self.cuda_graph_scope + CudaGraphScope.moe in self.cuda_graph_scope + or CudaGraphScope.moe_router in self.cuda_graph_scope ) ): raise ValueError( 'cuda graph is not supported with layernorm recompute.' ) - if "attn" in self.cuda_graph_scope: + if CudaGraphScope.attn in self.cuda_graph_scope: warnings.warn( "input_layernorm recompute is not supported with attention " "cudagraph. Will only recompute the pre_mlp_layernorm." ) if ( - "mlp" in self.cuda_graph_scope - or "moe" in self.cuda_graph_scope - or "moe_router" in self.cuda_graph_scope + CudaGraphScope.mlp in self.cuda_graph_scope + or CudaGraphScope.moe in self.cuda_graph_scope + or CudaGraphScope.moe_router in self.cuda_graph_scope ): warnings.warn( "pre_mlp_layernorm recompute is not supported with mlp/moe " diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index f89678e6216..3ea40577009 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -16,7 +16,7 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing -from megatron.core.transformer.enums import LayerType +from megatron.core.transformer.enums import CudaGraphScope, LayerType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp from megatron.core.transformer.mlp import MLP from megatron.core.transformer.module import GraphableMegatronModule @@ -382,18 +382,21 @@ def __init__( if "layernorm" in self.config.recompute_modules: if not isinstance(self.input_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or 'attn' not in self.config.cuda_graph_scope + or CudaGraphScope.attn not in self.config.cuda_graph_scope ): self.recompute_input_layernorm = True if self.config.fp8 or self.config.fp4: self.self_attention.set_for_recompute_input_layernorm() if not isinstance(self.pre_mlp_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or (not self.is_moe_layer and 'mlp' not in self.config.cuda_graph_scope) + or ( + not self.is_moe_layer + and CudaGraphScope.mlp not in self.config.cuda_graph_scope + ) or ( self.is_moe_layer - and 'moe' not in self.config.cuda_graph_scope - and 'moe_router' not in self.config.cuda_graph_scope + and CudaGraphScope.moe not in self.config.cuda_graph_scope + and CudaGraphScope.moe_router not in self.config.cuda_graph_scope ) ): self.recompute_pre_mlp_layernorm = True @@ -634,12 +637,13 @@ def _forward_mlp(self, hidden_states, inference_context=None): and self.config.cuda_graph_impl == "transformer_engine" and self.training and is_graph_capturing() - and 'moe_router' in self.config.cuda_graph_scope + and CudaGraphScope.moe_router in self.config.cuda_graph_scope ): assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) + nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: if self.config.fp8 or self.config.fp4: @@ -694,6 +698,7 @@ def _forward_post_mlp(self, mlp_output_with_bias, residual): Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, ) @@ -757,7 +762,7 @@ def get_layer_static_inputs(self, seq_length, micro_batch_size): static_inputs = super().get_layer_static_inputs(seq_length, micro_batch_size) if not isinstance(self.self_attention, IdentityOp) and ( - not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope + not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope ): slen_per_cp = seq_length // self.config.context_parallel_size static_inputs["attention_mask"] = ( @@ -776,18 +781,18 @@ def _get_submodules_under_cudagraphs(self): return super()._get_submodules_under_cudagraphs() submodules = [] - if 'attn' in self.config.cuda_graph_scope: + if CudaGraphScope.attn in self.config.cuda_graph_scope: submodules += [ self.input_layernorm, self.self_attention, self.pre_cross_attn_layernorm, self.cross_attention, ] - if (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) or ( - self.is_moe_layer and 'moe' in self.config.cuda_graph_scope + if (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or ( + self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope ): submodules += [self.pre_mlp_layernorm, self.mlp] - elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: + elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: submodules += [self.pre_mlp_layernorm, self.mlp.router] if ( self.config.moe_shared_expert_intermediate_size is not None @@ -805,7 +810,7 @@ def _te_cuda_graph_capture(self, *args, **kwargs): 2. If context is None, it cannot be returned as output. """ context = None - if not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope: + if not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) else: if len(args) > 0: @@ -815,12 +820,12 @@ def _te_cuda_graph_capture(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) + or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or ( self.is_moe_layer and ( - 'moe' in self.config.cuda_graph_scope - or 'moe_router' in self.config.cuda_graph_scope + CudaGraphScope.moe in self.config.cuda_graph_scope + or CudaGraphScope.moe_router in self.config.cuda_graph_scope ) ) ): @@ -841,7 +846,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): Hence, `inference_context` and `packed_seq_params` are excluded from input list. """ context = None - if self.config.cuda_graph_scope and 'attn' not in self.config.cuda_graph_scope: + if self.config.cuda_graph_scope and CudaGraphScope.attn not in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) args = (hidden_states,) kwargs = {} @@ -861,13 +866,13 @@ def _te_cuda_graph_replay(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) - or (self.is_moe_layer and 'moe' in self.config.cuda_graph_scope) + or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) + or (self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope) ): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() - elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: + elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. shared_expert_output, routing_map, residual = None, None, None @@ -882,7 +887,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # Split cudagraph outputs into function outputs and attribute outputs, and # process them separately. Function outputs should have three tensors. func_output, attr_outputs = cuda_graph_output[:3], cuda_graph_output[3:] - if 'moe_preprocess' in self.config.cuda_graph_scope: + if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope: hidden_states, probs, residual = func_output valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs assert len(attr_outputs) == len( @@ -989,7 +994,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): (kwargs.get('inference_context') is not None) or (kwargs.get('inference_params') is not None) ) - and 'full_iteration' not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index bb1b17e9ba2..15576e2ceac 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -23,7 +23,7 @@ from megatron.core.rerun_state_machine import RerunStateMachine from megatron.core.transformer import MLATransformerConfig, TransformerConfig from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.heterogeneous.heterogeneous_config import ( HeterogeneousTransformerConfig, MLPConfig, @@ -772,7 +772,7 @@ def validate_args(args, defaults={}): if args.rank == 0: print('accumulate and all-reduce gradients in fp32 for ' 'bfloat16 data type.', flush=True) - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: if not args.inference_dynamic_batching: assert not args.check_for_nan_in_loss_and_grad, \ "--no-check-for-nan-in-loss-and-grad should be set with full_iteration CUDA graph" @@ -1265,6 +1265,15 @@ def validate_args(args, defaults={}): assert ( args.recompute_granularity != 'full' ), 'recompute_granularity must not be full when CUDA Graphs are enabled.' + if args.cuda_graph_scope == "full" or ( + isinstance(args.cuda_graph_scope, list) and "full" in args.cuda_graph_scope + ): + if isinstance(args.cuda_graph_scope, list): + assert args.cuda_graph_scope == ["full"], "full scope cannot be used with other scopes." + args.cuda_graph_scope = [] + warn_rank_0( + 'full scope is deprecated. Use empty cuda_graph_scope to capture the whole layer.' + ) if args.multi_latent_attention: assert not args.group_query_attention, "Group query attention is mutually exclusive with multi latent attention." @@ -1486,7 +1495,7 @@ def _add_inference_args(parser): '"none": no CUDA graph. ' '"local": capture the CUDA graph using MCore local implementation. --cuda-graph-scope=\"full_iteration\" enables whole iteration CUDA graph. ' '"transformer_engine": capture the CUDA graph using TE make_graphed_callables().') - group.add_argument('--cuda-graph-scope', nargs='+', type=str, default=[], + group.add_argument('--cuda-graph-scope', nargs='+', type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope, default=[], help='Determines the CUDA graphs capturing scope. ' 'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". ' '"attn": captures operations in TransformerLayer._forward_attention(). ' @@ -1498,7 +1507,8 @@ def _add_inference_args(parser): '"mamba": captures the mamba layer. ' '"full_iteration": captures a whole iteration. ' 'full_iteration scope is only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. ' - 'If not specified, the default scope is to capture the whole Transformer layer.') + 'If not specified, the default scope is to capture the whole Transformer layer. ' + 'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.') group.add_argument('--use-legacy-static-engine', action='store_true', default=False, help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)', dest='use_legacy_static_engine') diff --git a/megatron/training/training.py b/megatron/training/training.py index 9fe372a3780..555cc0ecfee 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -59,6 +59,7 @@ from megatron.training.checkpointing import checkpoint_exists from megatron.core.full_cuda_graph import FullCudaGraphWrapper from megatron.core.transformer.cuda_graphs import TECudaGraphHelper +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import Float16Module from megatron.core.distributed import DistributedDataParallelConfig, TorchFullyShardedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -2265,7 +2266,7 @@ def train( eval_iterations = 0 # Wrap forward_backward_func for Full iteration CUDA graph forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) def get_e2e_base_metrics(): @@ -2614,6 +2615,10 @@ def get_e2e_base_metrics(): if should_exit: break + # Destroy CUDA Graphs. + if args.cuda_graph_impl == "transformer_engine" and cuda_graph_helper.graphs_created(): + cuda_graph_helper.delete_cuda_graphs() + one_logger_utils.track_e2e_metrics() # Flush TensorBoard, WandB writers and one-logger. @@ -2687,7 +2692,7 @@ def evaluate( eval_batch_size = args.global_batch_size eval_num_microbatches = eval_batch_size // (args.micro_batch_size * args.data_parallel_size) forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) if eval_iters is None: diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 0ac4b296746..26d3dcfbd6d 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -3,7 +3,7 @@ import asyncio import random import types -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import pytest @@ -41,6 +41,7 @@ from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import ( check_mamba_sequence_packing_support, @@ -103,7 +104,9 @@ class DynamicEngineTestConfig: return_log_probs: bool = False materialize_only_last_token_logits: bool = True skip_prompt_log_probs: bool = False - cuda_graph_scope: List[str] = None + cuda_graph_scope: List[CudaGraphScope] = field( + default_factory=lambda: [CudaGraphScope.full_iteration] + ) force_build_cuda_graphs: bool = False # If False, do not build cuda graphs in the tests, even if # num_cuda_graphs is set. @@ -136,9 +139,6 @@ def __post_init__(self): if self.context_max_tokens_override is None: self.context_max_tokens_override = self.num_requests * self.max_sequence_length - if self.cuda_graph_scope is None: - self.cuda_graph_scope = ["full_iteration"] - @dataclass class DynamicEngineTestEnv: @@ -514,7 +514,7 @@ def teardown_method(self, method): ) @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4]) - @pytest.mark.parametrize("cuda_graph_scope", [[], ["full_iteration"]]) + @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration]]) def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None: """Simple test that runs without errors, and validates output.""" skip_if_mamba_sequence_packing_not_available(model_provider) diff --git a/tests/unit_tests/test_fp8_param.py b/tests/unit_tests/test_fp8_param.py index 0b8d41769ec..361698f7127 100644 --- a/tests/unit_tests/test_fp8_param.py +++ b/tests/unit_tests/test_fp8_param.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import contextlib import gc @@ -36,7 +36,10 @@ try: from transformer_engine.pytorch.tensor.utils import post_all_gather_processing - cuda_graph_supported = True + if is_te_min_version("2.10.0"): + cuda_graph_supported = True + else: + reason_for_no_cuda_graph = "Need newer TransformerEngine" except ImportError: reason_for_no_cuda_graph = "Need newer TransformerEngine" @@ -65,12 +68,16 @@ class TestFP8Param: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 + self.cuda_graph_helper = None os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' def teardown_method(self, method): Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None gc.collect() def model_provider( @@ -209,13 +216,12 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. - cuda_graph_helper = None # Hard coded to use cuda_graph_impl="transformer_engine" cuda_graph_impl = "transformer_engine" if use_cuda_graph and cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - cuda_graph_helper = TECudaGraphHelper( + self.cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -250,13 +256,13 @@ def _run_test_helper( # Capture CUDA graphs after warmup if helper is provided. # Hard coded cuda_graph_warmup_steps = 0. cuda_graph_warmup_steps = 0 - if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: if should_disable_forward_pre_hook(args): disable_forward_pre_hook(gpt_model, param_sync=False) - cuda_graph_helper.create_cudagraphs() + self.cuda_graph_helper.create_cudagraphs() if should_disable_forward_pre_hook(args): enable_forward_pre_hook(gpt_model) - cuda_graph_helper.cuda_graph_set_manual_hooks() + self.cuda_graph_helper.cuda_graph_set_manual_hooks() # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap, # we need to call the _copy_main_params_to_param_buffer() after the grad buffer @@ -297,6 +303,10 @@ def _run_test_helper( loss_list.append(loss.item()) + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + return torch.tensor(loss_list) def run_test(self, tp_size, recipe, inference: bool = False, **kwargs): diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index 3ad0262a1cf..cee75171560 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -9,6 +9,7 @@ import pytest import torch +from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state from megatron.core.enums import ModelType @@ -25,6 +26,7 @@ TextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, get_gpt_mtp_block_spec, @@ -41,6 +43,8 @@ model_parallel_cuda_manual_seed, ) from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord +from megatron.core.transformer.enums import CudaGraphScope +from megatron.core.transformer.moe.fused_a2a import reset_hybrid_ep_buffer from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import is_fa_min_version, is_te_min_version @@ -54,6 +58,8 @@ from megatron.training.training import setup_model_and_optimizer from tests.unit_tests.test_utilities import Utils +fp8_available, _ = check_fp8_support() + class TestParallelTransformerBlockCudagraphs: def setup_method(self, method): @@ -747,6 +753,9 @@ class TestPartialCudaGraph: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 + self.tp_size = 2 + self.cp_size = 2 + self.cuda_graph_helper = None # Store original environment variable values self.original_env = { 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), @@ -762,22 +771,28 @@ def teardown_method(self, method): os.environ.pop(key, None) else: os.environ[key] = value - Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None gc.collect() def model_provider( self, pre_process=True, post_process=True, - layer_spec_fn=get_gpt_layer_with_transformer_engine_spec, + layer_spec_fn=get_gpt_decoder_block_spec, **config_kwargs, ): - model_parallel_cuda_manual_seed(123) args = get_args() config = core_transformer_config_from_args(args) - transformer_layer_spec = layer_spec_fn() + transformer_layer_spec = layer_spec_fn( + config, + use_transformer_engine=True, + normalization=args.normalization, + qk_l2_norm=args.qk_l2_norm, + ) if args.mtp_num_layers: mtp_block_spec = get_gpt_mtp_block_spec( config, transformer_layer_spec, use_transformer_engine=True @@ -810,18 +825,17 @@ def create_test_args( args.num_layers = 4 args.mtp_num_layers = 1 args.vocab_size = 1024 - args.hidden_size = 128 + args.hidden_size = 512 args.num_attention_heads = 8 args.max_position_embeddings = 512 - args.global_batch_size = self.micro_batch_size * 8 + args.global_batch_size = self.micro_batch_size * 8 // self.tp_size // self.cp_size args.micro_batch_size = self.micro_batch_size args.create_attention_mask_in_dataloader = True args.seq_length = self.seq_length - args.tensor_model_parallel_size = 2 - args.sequence_parallel = True + args.tensor_model_parallel_size = self.tp_size + args.sequence_parallel = True if self.tp_size > 1 else False args.pipeline_model_parallel_size = 1 - args.context_parallel_size = 1 - args.expert_model_parallel_size = ep_size + args.context_parallel_size = self.cp_size args.train_iters = 10 args.lr = 3e-5 args.bf16 = True @@ -836,17 +850,26 @@ def create_test_args( # MoE settings args.num_experts = 4 args.expert_model_parallel_size = ep_size + args.expert_tensor_parallel_size = 1 if ep_size > 1 else self.tp_size args.moe_shared_expert_intermediate_size = 1024 - args.moe_layer_freq = "[0,0,1,1]" + args.moe_layer_freq = [0, 0, 1, 1] args.moe_permute_fusion = True args.moe_router_fusion = True args.moe_router_topk = 2 + args.moe_router_dtype = "fp32" # CUDA graph settings args.cuda_graph_impl = cuda_graph_impl args.cuda_graph_scope = cuda_graph_scope args.cuda_graph_warmup_steps = cuda_graph_warmup_steps - args.use_te_rng_tracker = cuda_graph_impl != "none" + + # fp8 settings + if fp8_available: + args.fp8 = "e4m3" + args.fp8_recipe = "tensorwise" + args.first_last_layers_bf16 = True + args.num_layers_at_start_in_bf16 = 1 + args.num_layers_at_end_in_bf16 = 1 for key, value in kwargs.items(): assert hasattr(args, key) @@ -856,15 +879,15 @@ def create_test_args( set_global_variables(args, False) return args - def get_batch(self, seq_length, micro_batch_size): - data = list(range(seq_length)) + def get_batch(self, seq_length, micro_batch_size, cp_size): + data = list(range(seq_length // cp_size)) input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() attention_mask = torch.ones( - (micro_batch_size, 1, seq_length, seq_length), dtype=bool + (micro_batch_size, 1, seq_length // cp_size, seq_length), dtype=bool ).cuda() - loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + loss_mask = torch.ones(seq_length // cp_size).repeat((micro_batch_size, 1)).cuda() return input_ids, labels, position_ids, attention_mask, loss_mask def _run_test_helper( @@ -877,12 +900,10 @@ def _run_test_helper( set_args(args) torch.manual_seed(123) - Utils.initialize_model_parallel( - tensor_model_parallel_size=2, expert_model_parallel_size=ep_size - ) + model_parallel_cuda_manual_seed(123) input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( - self.seq_length, self.micro_batch_size + self.seq_length, self.micro_batch_size, self.cp_size ) gpt_model, optimizer, _ = setup_model_and_optimizer( @@ -890,13 +911,10 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. - loss_list = [] - - cuda_graph_helper = None if cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - cuda_graph_helper = TECudaGraphHelper( + self.cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -904,14 +922,17 @@ def _run_test_helper( optimizers=[optimizer], ) + loss_list = [] + for i in range(100): gpt_model[0].zero_grad_buffer() optimizer.zero_grad() # Capture CUDA graphs after warmup if helper is provided - if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: - cuda_graph_helper.create_cudagraphs() + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + gpt_model[0].set_is_first_microbatch() output = gpt_model[0].forward( input_ids=input_ids, position_ids=position_ids, @@ -922,7 +943,7 @@ def _run_test_helper( # Check output shapes assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length + assert output.shape[1] == self.seq_length // self.cp_size # Verify gradients loss = output.mean() @@ -936,16 +957,29 @@ def _run_test_helper( loss_list.append(loss.item()) + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + return torch.tensor(loss_list) @pytest.mark.skipif( - not (HAVE_TE and is_te_min_version("1.14.0")), - reason="Partial CUDA graph support requires TransformerEngine version >= 1.14.0", + not (HAVE_TE and is_te_min_version("2.10.0")), + reason="Partial CUDA graph UT support requires TransformerEngine version >= 2.10.0", ) @pytest.mark.parametrize("ep_size", [1, 4]) @pytest.mark.parametrize("moe_dropless_dispatcher", [False, True]) @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep", "hybridep"]) def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispatcher_type): + initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) + Utils.initialize_model_parallel( + tensor_model_parallel_size=self.tp_size, + context_parallel_size=self.cp_size, + pipeline_model_parallel_size=1, + expert_tensor_parallel_size=1 if ep_size > 1 else self.tp_size, + expert_model_parallel_size=ep_size, + ) + extra_kwargs = {} if moe_dispatcher_type == "deepep": if not is_deep_ep_available(): @@ -962,19 +996,28 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa if not moe_dropless_dispatcher: if moe_dispatcher_type == "deepep": pytest.skip("Deep EP doesn't support drop&pad MoE") + if moe_dispatcher_type == "hybridep" and ep_size == 1: + pytest.skip("Hybrid EP doesn't support drop&pad MoE with ep_size == 1") extra_kwargs["moe_expert_capacity_factor"] = 1.0 extra_kwargs["moe_pad_expert_input_to_capacity"] = True loss_list_ref = self._run_test_helper(ep_size, "none", None, 0, **extra_kwargs) for cuda_graph_scope in [ None, - ["attn"], - ["moe"], - ["mlp", "moe_router"], - ["attn", "mlp", "moe_router", "moe_preprocess"], + [CudaGraphScope.attn], + [CudaGraphScope.moe], + [CudaGraphScope.mlp, CudaGraphScope.moe_router], + [ + CudaGraphScope.attn, + CudaGraphScope.mlp, + CudaGraphScope.moe_router, + CudaGraphScope.moe_preprocess, + ], ]: - if moe_dropless_dispatcher and (cuda_graph_scope is None or "moe" in cuda_graph_scope): - # Dropless MoE doesn't work with "moe" scope cudagraph. Skip. + if (moe_dropless_dispatcher or moe_dispatcher_type == "hybridep") and ( + cuda_graph_scope is None or CudaGraphScope.moe in cuda_graph_scope + ): + # Dropless MoE or Hybrid EP doesn't work with "moe" scope cudagraph. Skip. continue cuda_graph_warmup_steps = 3 loss_list = self._run_test_helper( @@ -986,6 +1029,10 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa ) assert torch.equal(loss_list, loss_list_ref) + if moe_dispatcher_type == "hybridep": + reset_hybrid_ep_buffer() + Utils.destroy_model_parallel() + if __name__ == "__main__": From b0c96b3c99dcb4037a638f0f2a35128786a11939 Mon Sep 17 00:00:00 2001 From: Kunlun Li <94586211+kunlunl@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:30:28 +0800 Subject: [PATCH 165/248] [dev] DeepSeek V3.2 support (#2154) Signed-off-by: kunlunl --- gpt_builders.py | 7 +- ...rimental_attention_variant_module_specs.py | 132 ++ megatron/core/models/gpt/gpt_layer_specs.py | 52 +- .../gpt/linear_attention_module_specs.py | 27 - megatron/core/transformer/attention.py | 1 + .../experimental_attention_variant/dsa.py | 822 +++++++++++ .../transformer/multi_latent_attention.py | 87 +- .../core/transformer/transformer_config.py | 42 +- megatron/training/arguments.py | 35 +- megatron/training/training.py | 16 +- tests/unit_tests/ssm/test_gated_delta_net.py | 4 +- .../transformer/test_attention_variant_dsa.py | 1271 +++++++++++++++++ 12 files changed, 2404 insertions(+), 92 deletions(-) create mode 100644 megatron/core/models/gpt/experimental_attention_variant_module_specs.py delete mode 100644 megatron/core/models/gpt/linear_attention_module_specs.py create mode 100644 megatron/core/transformer/experimental_attention_variant/dsa.py create mode 100644 tests/unit_tests/transformer/test_attention_variant_dsa.py diff --git a/gpt_builders.py b/gpt_builders.py index 9fa1aff72c7..61d159b9967 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -42,7 +42,8 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): else: use_te = args.transformer_impl == "transformer_engine" - if args.num_experts or (args.linear_attention_type is not None): + linear_attention_variants = ["gated_delta_net"] + if args.num_experts or args.experimental_attention_variant in linear_attention_variants: # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( config, @@ -114,7 +115,7 @@ def _get_transformer_layer_spec(use_te, config): args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, - args.linear_attention_type, + args.experimental_attention_variant, moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm, qk_l2_norm=args.qk_l2_norm, use_kitchen=config.use_kitchen, @@ -126,7 +127,7 @@ def _get_transformer_layer_spec(use_te, config): args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, - args.linear_attention_type, + args.experimental_attention_variant, moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm, normalization=args.normalization, use_kitchen=config.use_kitchen, diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py new file mode 100644 index 00000000000..cbe59618baf --- /dev/null +++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py @@ -0,0 +1,132 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from typing import Optional + +from megatron.core.models.backends import BackendSpecProvider +from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.experimental_attention_variant.dsa import ( + DSAIndexer, + DSAIndexerSubmodules, + DSAttention, + DSAttentionSubmodules, +) +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.multi_latent_attention import ( + MLASelfAttention, + MLASelfAttentionSubmodules, +) +from megatron.core.transformer.spec_utils import ModuleSpec + + +def get_gated_delta_net_module_spec_for_backend( + backend: BackendSpecProvider, normalization: Optional[str] = None +) -> ModuleSpec: + """Helper function to get module spec for Linear Attention""" + rms_norm = normalization == "RMSNorm" + attention = ModuleSpec( + module=GatedDeltaNet, + submodules=GatedDeltaNetSubmodules( + in_proj=backend.column_parallel_layer_norm_linear(), + out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False), + out_proj=backend.row_parallel_linear(), + ), + metainfo={"fuse_input_layernorm": True}, + ) + return attention + + +def get_dsa_module_spec_for_backend( + backend: BackendSpecProvider, + qk_layernorm: Optional[bool] = False, + qk_l2_norm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, + mla_down_proj_use_column_parallel: Optional[bool] = False, + normalization: Optional[str] = None, + fallback_to_eager_attn: Optional[bool] = False, +) -> ModuleSpec: + """Helper function to get module spec for Sparse Attention.""" + assert multi_latent_attention, "Currently only MLA supports sparse attention." + assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." + assert fallback_to_eager_attn is False, "Fallback to eager attention is not supported with DSA." + + linear_q_down_proj = ( + backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear() + ) + linear_kv_down_proj = ( + backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear() + ) + linear_q_up_proj = backend.column_parallel_linear() + linear_kv_up_proj = backend.column_parallel_linear() + + # Because TransformerEngine does not support sparse attention yet, we use local + # implementation whether the backend is TransformerEngine or not. + core_attention = ModuleSpec( + module=DSAttention, + submodules=DSAttentionSubmodules( + indexer=ModuleSpec( + module=DSAIndexer, + submodules=DSAIndexerSubmodules( + linear_wq_b=backend.linear(), + linear_wk=backend.linear(), + k_norm=backend.layer_norm(rms_norm=False, for_qk=True), + linear_weights_proj=backend.linear(), + ), + ) + ), + ) + + # Adjust for RMS norm. + rms_norm = normalization == "RMSNorm" + qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) if qk_layernorm else IdentityOp + + attention = ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=backend.column_parallel_linear(), + linear_q_down_proj=linear_q_down_proj, + linear_q_up_proj=linear_q_up_proj, + linear_kv_down_proj=linear_kv_down_proj, + linear_kv_up_proj=linear_kv_up_proj, + core_attention=core_attention, + linear_proj=backend.row_parallel_linear(), + q_layernorm=qk_norm, + kv_layernorm=qk_norm, + ), + metainfo={"fuse_input_layernorm": False}, + ) + + return attention + + +def get_experimental_attention_variant_module_spec_for_backend( + backend: BackendSpecProvider, + sharded_state_dict_keys_map: dict, + experimental_attention_variant: Optional[str] = None, + qk_layernorm: Optional[bool] = False, + qk_l2_norm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, + mla_down_proj_use_column_parallel: Optional[bool] = False, + normalization: Optional[str] = None, + fallback_to_eager_attn: Optional[bool] = False, +) -> ModuleSpec: + """Helper function to get module spec for Attention""" + if experimental_attention_variant == "gated_delta_net": + return get_gated_delta_net_module_spec_for_backend( + backend=backend, normalization=normalization + ) + elif experimental_attention_variant == "dsa": + return get_dsa_module_spec_for_backend( + backend=backend, + qk_layernorm=qk_layernorm, + qk_l2_norm=qk_l2_norm, + multi_latent_attention=multi_latent_attention, + mla_down_proj_use_column_parallel=mla_down_proj_use_column_parallel, + normalization=normalization, + fallback_to_eager_attn=fallback_to_eager_attn, + ) + else: + raise ValueError( + f"Invalid experimental attention variant: {experimental_attention_variant}" + ) diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index c5c9caa3d67..5395b158749 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -5,8 +5,8 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider -from megatron.core.models.gpt.linear_attention_module_specs import ( - get_linear_attention_module_spec_for_backend, +from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( + get_experimental_attention_variant_module_spec_for_backend, ) from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -78,7 +78,7 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - linear_attention_type: Optional[str] = None, + experimental_attention_variant: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, normalization: Optional[str] = None, @@ -96,7 +96,8 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. - linear_attention_type (str, optional): The type of linear attention. Defaults to None. + experimental_attention_variant (str, optional): The type of experimental attention variant. + Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. @@ -133,7 +134,7 @@ def get_gpt_layer_with_transformer_engine_spec( attention = get_attention_module_spec_for_backend( backend=backend, sharded_state_dict_keys_map=sharded_state_dict_keys_map, - linear_attention_type=linear_attention_type, + experimental_attention_variant=experimental_attention_variant, qk_layernorm=qk_layernorm, qk_l2_norm=qk_l2_norm, multi_latent_attention=multi_latent_attention, @@ -166,7 +167,7 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - linear_attention_type: Optional[str] = None, + experimental_attention_variant: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, normalization: Optional[str] = None, @@ -181,7 +182,8 @@ def get_gpt_layer_local_spec( moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. - linear_attention_type (str, optional): The type of linear attention. Defaults to None. + experimental_attention_variant (str, optional): The type of experimental attention variant. + Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. @@ -205,15 +207,17 @@ def get_gpt_layer_local_spec( " and will be removed soon. Please update your code accordingly." ) - if linear_attention_type is not None: - raise NotImplementedError("Linear attention is not supported with local spec yet.") + if experimental_attention_variant is not None: + raise NotImplementedError( + "Experimental attention variant is not supported with local spec yet." + ) sharded_state_dict_keys_map = {} attention = get_attention_module_spec_for_backend( backend=backend, sharded_state_dict_keys_map=sharded_state_dict_keys_map, - linear_attention_type=linear_attention_type, + experimental_attention_variant=experimental_attention_variant, qk_layernorm=qk_layernorm, qk_l2_norm=qk_l2_norm, multi_latent_attention=multi_latent_attention, @@ -278,7 +282,7 @@ def get_transformer_layer_spec_for_backend( def get_attention_module_spec_for_backend( backend: BackendSpecProvider, sharded_state_dict_keys_map: dict, - linear_attention_type: Optional[str] = None, + experimental_attention_variant: Optional[str] = None, qk_layernorm: Optional[bool] = False, qk_l2_norm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, @@ -288,11 +292,17 @@ def get_attention_module_spec_for_backend( ) -> ModuleSpec: """Helper function to get module spec for Attention""" - if linear_attention_type is not None: - return get_linear_attention_module_spec_for_backend( - backend=backend, - linear_attention_type=linear_attention_type, - normalization=normalization, + if experimental_attention_variant is not None: + return get_experimental_attention_variant_module_spec_for_backend( + backend, + sharded_state_dict_keys_map, + experimental_attention_variant, + qk_layernorm, + qk_l2_norm, + multi_latent_attention, + mla_down_proj_use_column_parallel, + normalization, + fallback_to_eager_attn, ) # Adjust for RMS norm. @@ -526,13 +536,12 @@ def get_gpt_decoder_layer_specs( num_experts = None moe_grouped_gemm = None if attention_type == "linear_attention": - if config.linear_attention_type is None: + linear_attention_variants = ["gated_delta_net"] + if config.experimental_attention_variant not in linear_attention_variants: # Skip if there is no linear attention layer in the model. continue - linear_attention_type = config.linear_attention_type multi_latent_attention = None else: - linear_attention_type = None multi_latent_attention = config.multi_latent_attention layer_spec_key = f"{mlp_type}_{attention_type}" @@ -540,7 +549,7 @@ def get_gpt_decoder_layer_specs( num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, multi_latent_attention=multi_latent_attention, - linear_attention_type=linear_attention_type, + experimental_attention_variant=config.experimental_attention_variant, **get_layer_spec_kwargs, ) @@ -583,7 +592,8 @@ def get_gpt_decoder_layer_specs( f"current linear attention pattern: {config.linear_attention_freq}" ) elif config.linear_attention_freq is None: - if config.linear_attention_type is None: + linear_attention_variants = ["gated_delta_net"] + if config.experimental_attention_variant not in linear_attention_variants: linear_attention_pattern = [0] * config.num_layers else: linear_attention_pattern = [1] * config.num_layers diff --git a/megatron/core/models/gpt/linear_attention_module_specs.py b/megatron/core/models/gpt/linear_attention_module_specs.py deleted file mode 100644 index 7e76d845cff..00000000000 --- a/megatron/core/models/gpt/linear_attention_module_specs.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -from typing import Optional - -from megatron.core.models.backends import BackendSpecProvider -from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules -from megatron.core.transformer.spec_utils import ModuleSpec - - -def get_linear_attention_module_spec_for_backend( - backend: BackendSpecProvider, linear_attention_type: str, normalization: Optional[str] = None -) -> ModuleSpec: - """Helper function to get module spec for Linear Attention""" - rms_norm = normalization == "RMSNorm" - if linear_attention_type == "gated_delta_net": - attention = ModuleSpec( - module=GatedDeltaNet, - submodules=GatedDeltaNetSubmodules( - in_proj=backend.column_parallel_layer_norm_linear(), - out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False), - out_proj=backend.row_parallel_linear(), - ), - metainfo={"fuse_input_layernorm": True}, - ) - else: - raise ValueError(f"Invalid linear attention type: {linear_attention_type}") - return attention diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 57ba494742b..5cf22d25a4b 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -190,6 +190,7 @@ def __init__( self.key_hidden_size = self.hidden_size_per_attention_head self.val_hidden_size = self.hidden_size_per_attention_head + # TODO: This is built twice when using MLA, should be refactored. self.core_attention = build_module( submodules.core_attention, config=self.config, diff --git a/megatron/core/transformer/experimental_attention_variant/dsa.py b/megatron/core/transformer/experimental_attention_variant/dsa.py new file mode 100644 index 00000000000..fc994490b1b --- /dev/null +++ b/megatron/core/transformer/experimental_attention_variant/dsa.py @@ -0,0 +1,822 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import copy +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings import ( + RotaryEmbedding, + YarnRotaryEmbedding, + apply_rotary_pos_emb, +) +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + +try: + from fast_hadamard_transform import hadamard_transform +except ImportError: + hadamard_transform = None + + +def rotate_activation(x: torch.Tensor) -> torch.Tensor: + """Apply Hadamard rotation activation. + Reference: + https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L424-L428 + + Args: + x: Input tensor (must be bfloat16). + + Returns: + Rotated tensor. + """ + assert ( + x.dtype == torch.bfloat16 + ), f"rotate_activation only support bf16 input, but got {x.dtype}" + assert hadamard_transform is not None, "fast_hadamard_transform is not installed." + hidden_size = x.size(-1) + return hadamard_transform(x, scale=hidden_size**-0.5) + + +class DSAIndexerLossLoggingHelper: + """Helper class for logging sparse attention indexer losses.""" + + tracker = {} + + @staticmethod + def save_loss_to_tracker( + loss: torch.Tensor, + layer_number: int, + num_layers: int, + reduce_group: torch.distributed.ProcessGroup = None, + avg_group: torch.distributed.ProcessGroup = None, + ): + """Save the indexer loss for logging. + + Args: + loss: The loss tensor. + layer_number: Layer index of the loss, 1-indexed. + num_layers: The number of total layers. + reduce_group: The group for reducing the loss. + avg_group: The group for averaging the loss. + """ + # Skip indexer loss logging if layer_number is None. + if layer_number is None: + return + + tracker = DSAIndexerLossLoggingHelper.tracker + if "values" not in tracker: + tracker["values"] = torch.zeros(num_layers, device=torch.cuda.current_device()) + tracker["values"][layer_number - 1] += loss.detach() + tracker["reduce_group"] = reduce_group + tracker["avg_group"] = avg_group + + @staticmethod + def clean_loss_in_tracker(): + """Clear the indexer losses.""" + tracker = DSAIndexerLossLoggingHelper.tracker + if "values" in tracker: + tracker["values"].zero_() + tracker["reduce_group"] = None + tracker["avg_group"] = None + + @staticmethod + def reduce_loss_in_tracker(): + """Collect and reduce the indexer losses across ranks.""" + tracker = DSAIndexerLossLoggingHelper.tracker + if "values" not in tracker: + return + values = tracker["values"] + + torch.distributed.all_reduce( + values, group=parallel_state.get_pipeline_model_parallel_group() + ) + # Reduce indexer losses across ranks. + if tracker.get('reduce_group') is not None: + torch.distributed.all_reduce(values, group=tracker.get('reduce_group')) + if tracker.get('avg_group') is not None: + torch.distributed.all_reduce( + values, group=tracker['avg_group'], op=torch.distributed.ReduceOp.AVG + ) + torch.distributed.all_reduce( + values, + group=parallel_state.get_data_parallel_group(with_context_parallel=False), + op=torch.distributed.ReduceOp.AVG, + ) + + @staticmethod + def track_indexer_metrics( + loss_scale: float, + iteration: int, + writer, + wandb_writer=None, + total_loss_dict=None, + per_layer_logging: bool = False, + ): + """Track the sparse attention indexer metrics for logging. + + Args: + loss_scale: Scale factor for the loss. + iteration: Current training iteration. + writer: TensorBoard writer. + wandb_writer: Weights & Biases writer. + total_loss_dict: Dictionary to accumulate total losses. + per_layer_logging: Whether to log per-layer losses. + """ + DSAIndexerLossLoggingHelper.reduce_loss_in_tracker() + tracker = DSAIndexerLossLoggingHelper.tracker + if "values" not in tracker: + return + + indexer_loss_values = tracker["values"] * loss_scale + num_layers = indexer_loss_values.shape[0] + + # Average across all layers (assuming all layers have sparse attention) + avg_indexer_loss = indexer_loss_values.sum() / num_layers + + # Log average loss + if total_loss_dict is not None: + if "indexer loss" in total_loss_dict: + total_loss_dict["indexer loss"] += avg_indexer_loss + else: + total_loss_dict["indexer loss"] = avg_indexer_loss + + if writer is not None: + writer.add_scalar("indexer loss", avg_indexer_loss, iteration) + + if wandb_writer is not None: + wandb_writer.log({"indexer loss": avg_indexer_loss}, iteration) + + DSAIndexerLossLoggingHelper.clean_loss_in_tracker() + + +def compute_dsa_indexer_loss( + index_scores: torch.Tensor, + topk_indices: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + softmax_scale: float, + loss_coeff: float, + sparse_loss: bool, + pg_collection: ProcessGroupCollection, +) -> torch.Tensor: + """ + Compute KL divergence loss between index_scores and true attention_scores. + + This loss trains the indexer to predict which tokens are important by matching the distribution + of true attention scores. + + Reference: Section 2.1 of + https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf + + Args: + index_scores: Scores predicted by indexer [batch, seqlen_q, seqlen_k]. + topk_indices: Top-k indices [batch, seqlen_q, index_topk]. + query: Query tensor [seqlen_q, batch, heads, dim]. + key: Key tensor [seqlen_k, batch, heads, dim]. + softmax_scale: Scale coefficient after q @ k^T. + loss_coeff: Coefficient for the indexer KL divergence loss. + sparse_loss: bool, whether to use sparse indexer loss. If True, only the topk + indices will be used to compute the loss. + pg_collection: Process group collection, must have TP process group. + + Returns: + index_loss: KL divergence loss (scalar). + """ + sq, b, np, hn = query.size() + sk = key.size(0) + + # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn] + query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn) + # [sk, b, np, hn] -> [b, np, hn, sk] -> [b * np, hn, sk] + key = key.permute(1, 2, 3, 0).reshape(b * np, hn, sk) + # Compute attention scores [b * np, sq, sk] + attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale + # Reshape to [b, np, sq, sk] + attention_scores = attention_scores.reshape(b, np, sq, sk) + + # causal_mask [sq, sk] + causal_mask = torch.triu( + torch.full((sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device), + diagonal=1, + ) + # index_mask [b, sq, sk] + index_mask = torch.full( + (b, sq, sk), float("-inf"), dtype=torch.float32, device=causal_mask.device + ).scatter_(-1, topk_indices, 0) + + # [b, np, sq, skv] + [1, 1, sq, skv] -> [b, np, sq, skv] + attention_scores += causal_mask.view(1, 1, sq, sk) + if sparse_loss: + # [b, np, sq, sk] + [b, 1, sq, sk] -> [b, np, sq, sk] + attention_scores += index_mask.view(b, 1, sq, sk) + # [b, sq, sk] + [b, sq, sk] -> [b, sq, sk] + index_scores += index_mask + + # [b, np, sq, sk] -> [b, np, sq, sk] + attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32) + # [b, sq, sk] -> [b, sq, sk] + index_scores = torch.nn.functional.softmax(index_scores, dim=-1, dtype=torch.float32) + + # Sum attention scores across heads. + # [batch, heads, seqlen_q, seqlen_k] -> [batch, seqlen_q, seqlen_k] + attention_scores = attention_scores.sum(dim=1) + if pg_collection.tp.size() > 1: + # attention scores are scattered to TP ranks in head dimension. + torch.distributed.all_reduce(attention_scores.contiguous(), group=pg_collection.tp) + # L1 normalize target on the last dimension. Doesn't use abs() because attention_scores are + # obtained from softmax so they are already non-negative. + attention_scores = attention_scores / attention_scores.sum(dim=-1, keepdim=True) + + # Compute KL divergence: KL(target || index) = target(x) * log(target(x) / index(x)) + # kl_per_element [b, sq, sk] + kl_per_element = attention_scores * ( + torch.log(attention_scores + 1e-10) - torch.log(index_scores + 1e-10) + ) + + # [b, sq, sk] -> [b, sq] -> [1] + # Each token has same weight in the loss. + kl_div = kl_per_element.sum(dim=-1).mean() + + # Scale by coefficient. + indexer_loss = kl_div * loss_coeff + + return indexer_loss + + +class DSAIndexerLossAutoScaler(torch.autograd.Function): + """An AutoScaler that triggers the backward pass and scales the grad for indexer loss. + + This custom autograd function attaches a KL divergence loss to the activation + to train the indexer to predict attention scores without affecting the forward pass. + """ + + main_loss_backward_scale: torch.Tensor = None + + @staticmethod + def forward(ctx, output: torch.Tensor, indexer_loss: torch.Tensor): + """Preserve the indexer_loss by storing it in the context to avoid garbage collection. + + Args: + output: The output tensor (activation). + indexer_loss: The indexer KL divergence loss tensor. + + Returns: + torch.Tensor: The output tensor unchanged. + """ + ctx.save_for_backward(indexer_loss) + return output + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + """Compute and scale the gradient for indexer loss. + + Args: + grad_output: The gradient of the output. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled indexer loss + gradient. + """ + (indexer_loss,) = ctx.saved_tensors + if DSAIndexerLossAutoScaler.main_loss_backward_scale is None: + DSAIndexerLossAutoScaler.main_loss_backward_scale = torch.tensor( + 1.0, device=indexer_loss.device + ) + indexer_loss_backward_scale = DSAIndexerLossAutoScaler.main_loss_backward_scale + scaled_indexer_loss_grad = torch.ones_like(indexer_loss) * indexer_loss_backward_scale + return grad_output, scaled_indexer_loss_grad + + @staticmethod + def set_loss_scale(scale: torch.Tensor): + """Set the scale of the indexer loss. + + Args: + scale: The scale value to set. + """ + if DSAIndexerLossAutoScaler.main_loss_backward_scale is None: + DSAIndexerLossAutoScaler.main_loss_backward_scale = scale + else: + DSAIndexerLossAutoScaler.main_loss_backward_scale.copy_(scale) + + +@dataclass +class DSAIndexerSubmodules: + """ + Configuration class for specifying the submodules of an DSA Indexer. + + Args: + linear_wq_b: Linear projection for query bottleneck expansion. + linear_wk: Linear projection for key. + k_norm: Layer normalization for key. + linear_weights_proj: Linear projection for attention weights. + """ + + linear_wq_b: Union[ModuleSpec, type] = None + linear_wk: Union[ModuleSpec, type] = None + k_norm: Union[ModuleSpec, type] = None + linear_weights_proj: Union[ModuleSpec, type] = None + + +@dataclass +class DSAttentionSubmodules: + """ + Configuration class for specifying the submodules of DSAttention. + + Args: + indexer: DSA Indexer module for computing sparse attention indices. + """ + + indexer: Union[ModuleSpec, type] = None + + +class DSAIndexer(MegatronModule): + """ + DSA Lightning Indexer for DeepSeek Sparse Attention. + + Computes index scores to identify the top-k most relevant key-value pairs for each query in + sparse attention. + + Reference: + https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L431-L480 + """ + + def __init__( + self, + config: TransformerConfig, + submodules: DSAIndexerSubmodules, + pg_collection: Optional[ProcessGroupCollection] = None, + ) -> None: + """Initialize the indexer. + + Args: + config (TransformerConfig): The configuration for the transformer model. + submodules (DSAIndexerSubmodules): Indexer submodules specification. + pg_collection (ProcessGroupCollection, optional): Process groups for the indexer. + """ + super().__init__(config=config) + self.hidden_size = self.config.hidden_size + self.qk_pos_emb_head_dim = self.config.qk_pos_emb_head_dim + self.q_lora_rank = ( + self.config.q_lora_rank + if self.config.q_lora_rank is not None + else self.config.hidden_size + ) + + self.index_n_heads = self.config.dsa_indexer_n_heads + self.index_head_dim = self.config.dsa_indexer_head_dim + self.index_topk = self.config.dsa_indexer_topk + + self.softmax_scale: float = self.index_head_dim**-0.5 + + if pg_collection is None: + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + self.pg_collection = pg_collection + + # Initialize Position Embedding. + if self.config.rope_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + self.qk_pos_emb_head_dim, + rotary_percent=self.config.rotary_percent, + rotary_base=self.config.rotary_base, + cp_group=self.pg_collection.cp, + ) + elif self.config.rope_type == 'yarn': + self.rotary_pos_emb = YarnRotaryEmbedding( + self.qk_pos_emb_head_dim, + rotary_base=self.config.rotary_base, + scaling_factor=self.config.rotary_scaling_factor, + original_max_position_embeddings=self.config.original_max_position_embeddings, + beta_fast=self.config.beta_fast, + beta_slow=self.config.beta_slow, + mscale=self.config.mscale, + mscale_all_dim=self.config.mscale_all_dim, + cp_group=self.pg_collection.cp, + ) + else: + raise ValueError( + f'Unsupported RoPE type: {self.config.rope_type}, supported types are "rope" and ' + f'"yarn"' + ) + + self.linear_wq_b = build_module( + submodules.linear_wq_b, + self.q_lora_rank, + self.index_n_heads * self.index_head_dim, + config=self.config, + init_method=self.config.init_method, + bias=False, + skip_bias_add=False, + skip_weight_param_allocation=False, + parallel_mode="duplicated", + ) + + self.linear_wk = build_module( + submodules.linear_wk, + self.hidden_size, + self.index_head_dim, + config=self.config, + init_method=self.config.init_method, + bias=False, + skip_bias_add=False, + skip_weight_param_allocation=False, + parallel_mode="duplicated", + ) + + k_norm_config = copy.copy(self.config) + k_norm_config.normalization = "LayerNorm" + self.k_norm = build_module( + submodules.k_norm, + config=k_norm_config, + hidden_size=self.index_head_dim, + eps=self.config.layernorm_epsilon, + ) + + self.linear_weights_proj = build_module( + submodules.linear_weights_proj, + self.hidden_size, + self.index_n_heads, + config=self.config, + init_method=self.config.init_method, + bias=False, + skip_bias_add=False, + skip_weight_param_allocation=False, + parallel_mode="duplicated", + ) + + def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, mscale: float): + """Apply RoPE to the input tensor.""" + # x_nope [seqlen, batch, *, index_head_dim - qk_pos_emb_head_dim] + # x_pe [seqlen, batch, *, qk_pos_emb_head_dim] + x_nope, x_pe = torch.split( + x, [self.index_head_dim - self.qk_pos_emb_head_dim, self.qk_pos_emb_head_dim], dim=-1 + ) + x_pe = apply_rotary_pos_emb( + x_pe, + rotary_pos_emb, + config=self.config, + cu_seqlens=None, + mscale=mscale, + cp_group=self.pg_collection.cp, + ) + # [seqlen, batch, *, index_head_dim] + x = torch.cat([x_nope, x_pe], dim=-1) + return x + + def _compute_index_scores( + self, q: torch.Tensor, weights: torch.Tensor, k: torch.Tensor + ) -> torch.Tensor: + """ + Perform index score using BF16 precision. + + Reference: + https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/kernel.py#L254-L274 + This is a BF16 implementation of the `fp8_index` logic: + 1. Compute attention scores: q @ k^T; + 2. Apply ReLU activation; + 3. Weight by attention weights; + 4. Sum across attention heads. + + Args: + q: BF16 [seqlen_q, batch, index_n_heads, index_head_dim], the query tensor. + weights: BF16 [seqlen_q, batch, index_n_heads], the attention weights. + k: BF16 [seqlen_k, batch, index_head_dim], the key tensor. + + Returns: + index_scores: FP32 [batch, seqlen_q, seqlen_k], the index scores. + """ + # Compute attention scores: q @ k^T + # [seqlen_q, batch, index_n_heads, index_head_dim] @ [seqlen_k, batch, index_head_dim]^T + # -> [seqlen_q, batch, index_n_heads, seqlen_k] + index_scores = torch.einsum('sbhd,tbd->sbht', q.float(), k.float()) + + # Apply ReLU activation. + index_scores = torch.relu(index_scores) + + # Weight each head by attention weights. + # [seqlen_q, batch, index_n_heads, seqlen_k] * [seqlen_q, batch, index_n_heads, 1] + # -> [seqlen_q, batch, index_n_heads, seqlen_k] + index_scores = index_scores * weights.unsqueeze(-1) + + # Sum across attention heads. + # [seqlen_q, batch, index_n_heads, seqlen_k] -> [seqlen_q, batch, seqlen_k] + index_scores = index_scores.sum(dim=2) + + # Transpose to [batch, seqlen_q, seqlen_k]. + index_scores = index_scores.transpose(0, 1) + + return index_scores + + def forward_with_scores( + self, + x: torch.Tensor, + qr: torch.Tensor, + mask: Optional[torch.Tensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Forward pass for DSA Indexer that returns both index scores and top-k indices. + + This is used when KL loss is enabled to compare indexer scores with true attention scores. + + Args: + x: hidden states [seqlen, batch, hidden_size]. + qr: Low-rank query tensor [seqlen, batch, q_lora_rank]. + mask: Attention mask [batch, seqlen, seqlen]. + packed_seq_params: Packed sequence parameters for variable length sequences. + + Returns: + index_scores: Index scores [batch, seqlen, seqlen]. + topk_indices: Top-k indices [batch, seqlen, index_topk]. + """ + assert packed_seq_params is None, "Packed sequence is not supported for DSAttention" + + # ========================================= + # Prepare RoPE params + # ========================================= + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + None, None, x, self.config, packed_seq_params + ) + if self.config.rope_type == "rope": + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=False) + mscale = 1.0 + else: + rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=False) + + # ========================================= + # Gather inputs if sp is enabled + # ========================================= + if self.config.sequence_parallel and self.pg_collection.tp.size() > 1: + x = gather_from_sequence_parallel_region(x, group=self.pg_collection.tp) + qr = gather_from_sequence_parallel_region(qr, group=self.pg_collection.tp) + + # ========================================= + # Get sequence length and batch size + # ========================================= + seqlen, bsz, _ = x.size() + + # ========================================= + # q linear and apply rope to q + # ========================================= + # [seqlen, batch, q_lora_rank] -> [seqlen, batch, index_n_heads * index_head_dim] + q, _ = self.linear_wq_b(qr) + # [seqlen, batch, index_n_heads * index_head_dim] + # -> [seqlen, batch, index_n_heads, index_head_dim] + q = q.reshape(seqlen, bsz, self.index_n_heads, self.index_head_dim) + q = self._apply_rope(q, rotary_pos_emb, mscale) + + # ========================================= + # k linear and apply rope to k + # ========================================= + # [seqlen, batch, hidden_size] -> [seqlen, batch, index_head_dim] + k, _ = self.linear_wk(x) + k = self.k_norm(k) + # [seqlen, batch, index_head_dim] -> [seqlen, batch, 1, index_head_dim] + k = k.reshape(seqlen, bsz, 1, self.index_head_dim) + k = self._apply_rope(k, rotary_pos_emb, mscale) + # [seqlen, batch, 1, index_head_dim] -> [seqlen, batch, index_head_dim] + k = k.reshape(seqlen, bsz, self.index_head_dim) + + # ========================================= + # Rotate activation + # ========================================= + q = rotate_activation(q) + k = rotate_activation(k) + + # ========================================= + # Compute index scores + # ========================================= + # [seqlen, batch, hidden_size] -> [seqlen, batch, index_n_heads] + weights, _ = self.linear_weights_proj(x) + weights = weights * (self.index_n_heads**-0.5) * self.softmax_scale + # [batch, seqlen, seqlen] + index_scores = self._compute_index_scores(q, weights, k) + if mask is not None: + assert mask.dtype == index_scores.dtype, "Mask dtype must match index scores dtype" + index_scores = index_scores + mask + + # ========================================= + # Select top-k indices + # ========================================= + topk_k = min(self.index_topk, seqlen) + # [batch, seqlen, index_topk] + topk_indices = index_scores.topk(topk_k, dim=-1)[1] + + return index_scores, topk_indices + + def forward( + self, + x: torch.Tensor, + qr: torch.Tensor, + mask: Optional[torch.Tensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + """ + Forward pass for DSA Indexer. + + Args: + x: hidden states [seqlen, batch, hidden_size]. + qr: Low-rank query tensor [seqlen, batch, q_lora_rank]. + mask: Attention mask [batch, seqlen, seqlen]. + packed_seq_params: Packed sequence parameters for variable length sequences. + + Returns: + topk_indices: Top-k indices for sparse attention [batch, seqlen, index_topk]. + """ + _, topk_indices = self.forward_with_scores(x, qr, mask, packed_seq_params) + return topk_indices + + +def unfused_dsa_fn(query, key, value, topk_indices, softmax_scale): + """ + Unfused sparse attention implementation. + """ + sq, b, np, hn = query.size() + skv = key.size(0) + hnv = value.size(3) + + # =================================== + # Raw attention scores [b, np, sq, skv] + # =================================== + # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn] + query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn) + # [skv, b, np, hn] -> [b, np, hn, skv] -> [b * np, hn, skv] + key = key.permute(1, 2, 3, 0).reshape(b * np, hn, skv) + # Compute attention scores [b * np, sq, skv] + attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale + # Reshape to [b, np, sq, skv] + attention_scores = attention_scores.reshape(b, np, sq, skv) + + # =================================== + # Apply sparse mask from indexer + # =================================== + # index_mask [b, sq, skv] + index_mask = torch.full((b, sq, skv), float("-inf"), device=attention_scores.device) + index_mask.scatter_(-1, topk_indices, 0) + # causal_mask [sq, skv] + causal_mask = torch.triu( + torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=index_mask.device), + diagonal=1, + ) + # [b, sq, skv] + [1, sq, skv] -> [b, sq, skv] + index_mask += causal_mask.view(1, sq, skv) + # [b, np, sq, skv] + [b, 1, sq, skv] -> [b, np, sq, skv] + attention_scores += index_mask.unsqueeze(1) + attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32) + + # =================================== + # Output + # =================================== + # [skv, b, np, hnv] -> [b, np, skv, hnv] -> [b * np, skv, hnv] + value = value.permute(1, 2, 0, 3).reshape(b * np, skv, hnv) + # Reshape attention_scores: [b, np, sq, skv] -> [b * np, sq, skv] + attention_scores = attention_scores.reshape(b * np, sq, skv) + # Compute output: [b * np, sq, hnv] + output = torch.bmm(attention_scores.to(value.dtype), value) + # Reshape output: [b * np, sq, hnv] -> [b, np, sq, hnv] -> [sq, b, np, hnv] + output = output.reshape(b, np, sq, hnv).permute(2, 0, 1, 3).contiguous() + # Flatten: [sq, b, np, hnv] -> [sq, b, np * hnv] + output = output.reshape(sq, b, np * hnv) + return output + + +class DSAttention(MegatronModule): + """ + This module implements sparse attention mechanism using an DSA Indexer to compute top-k + attention indices for reducing computational complexity. + + Reference: + https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L491-L597 + """ + + def __init__( + self, + config: TransformerConfig, + submodules: DSAttentionSubmodules, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: Optional[float] = None, + softmax_scale: Optional[float] = None, + k_channels: Optional[int] = None, + v_channels: Optional[int] = None, + cp_comm_type: str = "p2p", + pg_collection: ProcessGroupCollection = None, + ): + super().__init__(config=config) + + self.layer_number = layer_number + + self.indexer = build_module( + submodules.indexer, config=self.config, pg_collection=pg_collection + ) + + if softmax_scale is None: + softmax_scale = 1.0 / math.sqrt( + k_channels if k_channels is not None else config.kv_channels + ) + self.softmax_scale = softmax_scale + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + x: torch.Tensor, + qr: torch.Tensor, + attention_mask: torch.Tensor, + attn_mask_type: AttnMaskType = None, + attention_bias: torch.Tensor = None, + packed_seq_params: PackedSeqParams = None, + ): + """ + Forward pass for Sparse Attention. + + Args: + query: Query tensor [sq, b, np, hn]. + key: Key tensor [skv, b, np, hn]. + value: Value tensor [skv, b, np, hnv]. + x: Original hidden states [sq, b, hidden_size]. + qr: Low-rank query representation [sq, b, q_lora_rank]. + attention_mask: Attention mask tensor [b, 1, sq, sk]. + attn_mask_type: Type of attention mask. + attention_bias: Optional attention bias. + packed_seq_params: Packed sequence parameters. + + Returns: + output: Output tensor [sq, b, hidden_size] + """ + sq, b, np, hn = query.size() + skv = key.size(0) + hnv = value.size(3) + + # Detach x and qr to prevent gradients of indexer from flowing back to the main model. + x = x.detach() + qr = qr.detach() + + # Get a FP32 mask with -inf for masked positions. + if attn_mask_type is not None: + assert attn_mask_type == AttnMaskType.causal, 'Only causal mask is supported for now' + # Generate upper triangular mask with -inf above diagonal, 0 elsewhere + # torch.triu with diagonal=1 creates upper triangular matrix (excluding main diagonal) + # float_mask [sq, skv] + float_mask = torch.triu( + torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=x.device), + diagonal=1, + ) + else: + assert attention_mask.shape == (b, 1, sq, skv), 'attention_mask shape mismatch' + # [b, 1, sq, skv] -> [b, sq, skv] + mask = attention_mask.squeeze() + # float_mask [b, sq, skv] + float_mask = torch.zeros_like(mask, dtype=torch.float32).masked_fill( + mask, float('-inf') + ) + + # =================================== + # Get index scores and top-k indices + # =================================== + index_scores, topk_indices = self.indexer.forward_with_scores( + x, qr, mask=float_mask, packed_seq_params=packed_seq_params + ) + + # =================================== + # Run sparse attention kernel + # =================================== + output = unfused_dsa_fn(query, key, value, topk_indices, self.softmax_scale) + + # =================================== + # Attach indexer loss + # =================================== + if self.training and torch.is_grad_enabled(): + # Compute KL divergence loss between indexer scores and true attention scores + indexer_loss_coeff = getattr(self.config, 'dsa_indexer_loss_coeff', 0.0) + indexer_loss = compute_dsa_indexer_loss( + index_scores, + topk_indices, + query.detach(), + key.detach(), + self.softmax_scale, + indexer_loss_coeff, + getattr(self.config, "dsa_indexer_use_sparse_loss", False), + self.indexer.pg_collection, + ) + # Save indexer loss for logging + if indexer_loss_coeff > 0: + DSAIndexerLossLoggingHelper.save_loss_to_tracker( + loss=indexer_loss, + layer_number=self.layer_number, + num_layers=self.config.num_layers, + ) + # Attach loss to output + output = DSAIndexerLossAutoScaler.apply(output, indexer_loss) + + return output diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index 074523afd7b..3953d933b45 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -243,13 +243,28 @@ def forward( # Get the query, key and value tensors based on the type of attention - # self or cross attn. # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] - query, key, value = self.get_query_key_value_tensors( - hidden_states, - key_value_states, - position_ids, - packed_seq_params, - inference_context=inference_context, - ) + if self.config.experimental_attention_variant is None: + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_context=inference_context, + ) + elif self.config.experimental_attention_variant == "dsa": + query, key, value, q_compressed, _ = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_context=inference_context, + return_compressed_tensors=True, + ) + else: + raise ValueError( + f"Unsupported experimental attention variant: " + f"{self.config.experimental_attention_variant}" + ) # =================================================== # Adjust key, value for inference @@ -281,14 +296,34 @@ def forward( if inference_context is None or inference_context.is_static_batching(): with get_fine_grained_offloading_context(self.offload_core_attention): - core_attn_out = self.core_attention( - query, - key, - value, - attention_mask, - packed_seq_params=packed_seq_params, - attn_mask_type=attn_mask_type, - ) + if self.config.experimental_attention_variant is None: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) + elif self.config.experimental_attention_variant == "dsa": + # For dsa we need to pass in the original hidden states and the compressed + # query representation. + core_attn_out = self.core_attention( + query, + key, + value, + x=hidden_states, + qr=q_compressed, + attention_mask=attention_mask, + attn_mask_type=attn_mask_type, + attention_bias=None, + packed_seq_params=packed_seq_params, + ) + else: + raise ValueError( + f"Unsupported attention variant: " + f"{self.config.experimental_attention_variant}" + ) elif self.cache_mla_latents: # Dynamic batching attention kernel. q, k, v = (query, key, value) @@ -494,6 +529,7 @@ def get_query_key_value_tensors( inference_context=None, *, inference_params=None, + return_compressed_tensors=False, ): """ Derives `query`, `key` and `value` tensors from `hidden_states`. @@ -603,6 +639,16 @@ def get_query_key_value_tensors( kv_compressed = kv_compressed.squeeze(1) k_pos_emb = k_pos_emb.squeeze(1) + # ========================================= + # Apply norm + # ========================================= + + if self.config.q_lora_rank is not None: + # q_compressed: [num_tokens, q_lora_rank] + q_compressed = self.q_layernorm(q_compressed) + + kv_compressed = self.kv_layernorm(kv_compressed) + # ========================================= # QKV up projection and RoPE apply # ========================================= @@ -613,7 +659,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv( if self.config.q_lora_rank is not None: # q_compressed: [num_tokens, q_lora_rank] # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)] - q_compressed = self.q_layernorm(q_compressed) q, _ = self.linear_q_up_proj(q_compressed) else: # q_compressed: [num_tokens, hidden_size] @@ -623,8 +668,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv( # q: [num_tokens, n, q_head_dim] q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim) - kv_compressed = self.kv_layernorm(kv_compressed) - # [num_tokens, qk_pos_emb_head_dim] -> [num_tokens, 1, qk_pos_emb_head_dim] k_pos_emb = torch.unsqueeze(k_pos_emb, -2) @@ -688,7 +731,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po if self.config.q_lora_rank is not None: # q_compressed: [num_tokens, q_lora_rank] # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)] - q_compressed = self.q_layernorm(q_compressed) q, _ = self.linear_q_up_proj(q_compressed) else: # q_compressed: [num_tokens, hidden_size] @@ -698,8 +740,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po # q: [num_tokens, n, q_head_dim] q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim) - kv_compressed = self.kv_layernorm(kv_compressed) - # kv: [num_tokens, n * (qk_head_dim + v_head_dim)] kv, _ = self.linear_kv_up_proj(kv_compressed) @@ -824,7 +864,10 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb ) - return query, key, value + if return_compressed_tensors: + return query, key, value, q_compressed, kv_compressed + else: + return query, key, value def uncompress_kv_from_cache(self, kv_cached): """ diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index cc714e9ac15..a3a16754977 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -233,11 +233,14 @@ class TransformerConfig(ModelParallelConfig): 16 SMs can generally achieve good bandwidth.""" #################### - # linear attention + # attention variant #################### - linear_attention_type: Optional[str] = None - """Type of linear attention to use. Currently support gated_delta_net.""" + experimental_attention_variant: Optional[str] = None + """Type of attention variant to use. Currently support gated_delta_net and dsa.""" + #################### + # attention variant: gated_delta_net + #################### linear_attention_freq: Optional[Union[int, List[int]]] = None """Frequency between LA (linear attention) layers and SDPA (scaled dot-product attention) layers. @@ -260,6 +263,25 @@ class TransformerConfig(ModelParallelConfig): linear_num_value_heads: Optional[int] = None """Number of value and gate heads for the gated delta net.""" + #################### + # attention variant: dsa + #################### + dsa_indexer_n_heads: Optional[int] = None + """Number of DSA indexer heads.""" + + dsa_indexer_head_dim: Optional[int] = None + """Dimension per DSA indexer head.""" + + dsa_indexer_topk: Optional[int] = None + """Number of top-k tokens to select in DSA indexer.""" + + dsa_indexer_loss_coeff: Optional[float] = None + """Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss.""" + + dsa_indexer_use_sparse_loss: Optional[bool] = None + """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the + top-k indices.""" + #################### # initialization #################### @@ -855,17 +877,12 @@ def __post_init__(self): f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." ) - if self.linear_attention_type is not None: - supported_la_types = ["gated_delta_net"] - assert self.linear_attention_type in supported_la_types, ( - f"linear_attention_type ({self.linear_attention_type}) only support" - f" one of {supported_la_types}." - ) + if self.experimental_attention_variant in ["gated_delta_net"]: assert ( self.linear_attention_freq is not None ), f"linear_attention_freq must be set for linear attention." - if self.linear_attention_type == "gated_delta_net": + if self.experimental_attention_variant == "gated_delta_net": # Check required parameters assert ( self.linear_conv_kernel_dim is not None @@ -900,6 +917,11 @@ def __post_init__(self): f"Gated delta net does not support context parallel for now," f" but got {self.context_parallel_size=}." ) + elif self.experimental_attention_variant == "dsa": + assert ( + self.context_parallel_size == 1 + ), "Currently context parallelism is not supported by DSAttention!" + assert not self.apply_rope_fusion, "RoPE fusion is not supported for DSAttention" if self.fp8: # cannot support first last layer bf16 with delayed scaling diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 15576e2ceac..0cf2d006863 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -69,7 +69,7 @@ def add_megatron_arguments(parser: argparse.ArgumentParser): parser = _add_vision_args(parser) parser = _add_moe_args(parser) parser = _add_mla_args(parser) - parser = _add_linear_attention_args(parser) + parser = _add_experimental_attention_variant_args(parser) parser = _add_heterogeneous_args(parser) parser = _add_logging_args(parser) parser = _add_straggler_detector_args(parser) @@ -1194,13 +1194,21 @@ def validate_args(args, defaults={}): args.no_load_rng = True print('Warning: disabling --no-load-rng for upcycling.') + if args.linear_attention_type is not None: + print_rank_0( + '--linear-attention-type is deprecated, use --experimental-attention-variant instead.', + args.rank, + ) + args.experimental_attention_variant = args.linear_attention_type + del args.linear_attention_type + # Muon optimizercheck if 'muon' in args.optimizer: assert not args.use_distributed_optimizer, "Muon optimizer does not support distributed optimizer for now." assert not args.use_torch_fsdp2, "Muon optimizer does not support Torch-FSDP2 for now." assert not args.use_megatron_fsdp, "Muon optimizer does not support Megatron-FSDP for now." assert args.ckpt_format in ["torch", "torch_dist"], "Muon optimizer supports torch and torch_dist checkpoint format." - assert args.linear_attention_type is None, "Muon optimizer does not support linear attention type for now." + assert args.experimental_attention_variant is None, "Muon optimizer does not support attention variant for now." assert not args.attention_output_gate, "Muon optimizer does not support attention output gate for now." # Optimizer CPU offload check @@ -3361,10 +3369,14 @@ def _add_mla_args(parser): return parser -def _add_linear_attention_args(parser): - group = parser.add_argument_group(title="la") +def _add_experimental_attention_variant_args(parser): + group = parser.add_argument_group(title="experimental_attention_variant") + group.add_argument('--experimental-attention-variant', default=None, choices=['gated_delta_net', 'dsa'], type=str, + help='Type of attention variant to use. Currently support gated_delta_net and dsa.') + + # Linear attention group.add_argument('--linear-attention-type', default=None, choices=['gated_delta_net'], type=str, - help='Type of linear attention to use. Currently support gated_delta_net.') + help='(Deprecated, use --experimental-attention-variant instead) Type of linear attention to use. Currently support gated_delta_net.') group.add_argument('--linear-attention-freq', type=la_freq_type, default=None, help='Frequency between LA (linear attention) layers and' ' SDPA (scaled dot-product attention) layers. Accepts either: ' @@ -3384,6 +3396,19 @@ def _add_linear_attention_args(parser): help='Number of query and key heads for the gated delta net.') group.add_argument('--linear-num-value-heads', default=32, type=int, help='Number of value and gate heads for the gated delta net.') + + # DSA + group.add_argument('--dsa-indexer-n-heads', default=None, type=int, + help='Number of indexer heads for sparse attention. If not set, defaults to num-attention-heads.') + group.add_argument('--dsa-indexer-head-dim', default=None, type=int, + help='Dimension per indexer head for sparse attention. If not set, defaults to kv-channels.') + group.add_argument('--dsa-indexer-topk', default=None, type=int, + help='Number of top-k tokens to select in sparse attention indexer.') + group.add_argument('--dsa-indexer-loss-coeff', default=0.0, type=float, + help='Coefficient for the indexer KL divergence loss. Set to 0 to disable indexer loss.') + group.add_argument('--dsa-indexer-use-sparse-loss', action='store_true', + help='Use sparse indexer loss. If set, the indexer loss will be computed using the top-k indices.') + return parser def _add_heterogeneous_args(parser): diff --git a/megatron/training/training.py b/megatron/training/training.py index 555cc0ecfee..e88b9839d28 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -92,6 +92,7 @@ from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler from megatron.core.transformer.moe import upcycling_utils from megatron.core.transformer.moe.moe_utils import track_moe_metrics +from megatron.core.transformer.experimental_attention_variant.dsa import DSAIndexerLossLoggingHelper from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper from megatron.core.parallel_state import ( destroy_global_memory_buffer, @@ -376,7 +377,8 @@ def transformer_flops(): ) ) - if args.linear_attention_type is not None: + linear_attention_variants = ["gated_delta_net"] + if args.experimental_attention_variant in linear_attention_variants: # Calculate number of dense and MoE Transformer MLPs. if isinstance(args.linear_attention_freq, int): linear_attention_pattern = [ @@ -401,7 +403,7 @@ def transformer_flops(): num_linear_attention_layers = sum(linear_attention_pattern) num_standard_attention_layers = num_layers - num_linear_attention_layers - if args.linear_attention_type == "gated_delta_net": + if args.experimental_attention_variant == "gated_delta_net": # Calculate the FLOPs for the gated delta net attention. qk_head_dim = args.linear_key_head_dim v_head_dim = args.linear_value_head_dim @@ -1699,6 +1701,16 @@ def training_log( MTPLossLoggingHelper.track_mtp_metrics( mtp_loss_scale, iteration, writer, wandb_writer, total_loss_dict ) + # Track sparse attention indexer loss + if args.dsa_indexer_loss_coeff is not None and args.dsa_indexer_loss_coeff > 0: + indexer_loss_scale = 1 / get_num_microbatches() + DSAIndexerLossLoggingHelper.track_indexer_metrics( + loss_scale=indexer_loss_scale, + iteration=iteration, + writer=writer, + wandb_writer=wandb_writer, + total_loss_dict=total_loss_dict, + ) if iteration % args.log_interval == 0: if args.record_memory_history and (is_last_rank() or torch.distributed.get_backend() == 'fake'): snapshot = torch.cuda.memory._snapshot() diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py index dbf8d203634..89a185e3755 100644 --- a/tests/unit_tests/ssm/test_gated_delta_net.py +++ b/tests/unit_tests/ssm/test_gated_delta_net.py @@ -88,7 +88,7 @@ def setup_method(self, tp_size, sp, cp_size): context_parallel_size=cp_size, ) gdn_submodules = get_gpt_layer_with_transformer_engine_spec( - linear_attention_type="gated_delta_net", normalization="RMSNorm" + experimental_attention_variant="gated_delta_net", normalization="RMSNorm" ).submodules.self_attention.submodules self.gdn = GatedDeltaNet( @@ -157,7 +157,7 @@ def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp): # Model initialization function def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=None): layer_spec = get_gpt_layer_with_transformer_engine_spec( - linear_attention_type="gated_delta_net", normalization=normalization + experimental_attention_variant="gated_delta_net", normalization=normalization ) gpt_model = GPTModel( config=config, diff --git a/tests/unit_tests/transformer/test_attention_variant_dsa.py b/tests/unit_tests/transformer/test_attention_variant_dsa.py new file mode 100644 index 00000000000..bd106aa6f0e --- /dev/null +++ b/tests/unit_tests/transformer/test_attention_variant_dsa.py @@ -0,0 +1,1271 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from unittest.mock import patch + +import pytest +import torch + +import megatron.core.parallel_state as parallel_state +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.experimental_attention_variant.dsa import ( + DSAIndexer, + DSAIndexerLossAutoScaler, + DSAIndexerSubmodules, + DSAttention, + DSAttentionSubmodules, + compute_dsa_indexer_loss, + rotate_activation, +) +from megatron.core.transformer.transformer_config import MLATransformerConfig +from tests.unit_tests.test_utilities import Utils + +try: + from fast_hadamard_transform import hadamard_transform as _hadamard_transform + + HAVE_HADAMARD = True +except ImportError: + HAVE_HADAMARD = False + _hadamard_transform = None + + +def mock_hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + """Mock implementation of hadamard_transform for testing without the library installed. + + This is a simple identity-like transformation that preserves shape and applies scaling. + """ + return x * scale + + +@pytest.fixture(autouse=True) +def patch_hadamard_if_needed(): + """Automatically patch hadamard_transform in dsa module if not installed.""" + if not HAVE_HADAMARD: + with patch( + 'megatron.core.transformer.experimental_attention_variant.dsa.hadamard_transform', + mock_hadamard_transform, + ): + yield + else: + yield + + +class TestRotateActivation: + """Test rotate_activation function.""" + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + yield + Utils.destroy_model_parallel() + + def test_rotate_activation_shape(self): + """Test that rotate_activation preserves shape.""" + batch_size = 2 + seq_len = 16 + hidden_size = 128 + + x = torch.randn(seq_len, batch_size, hidden_size, dtype=torch.bfloat16).cuda() + output = rotate_activation(x) + + assert output.shape == x.shape + assert output.dtype == torch.bfloat16 + + def test_rotate_activation_dtype_check(self): + """Test that rotate_activation only accepts bfloat16.""" + x = torch.randn(16, 2, 128, dtype=torch.float32).cuda() + + with pytest.raises(AssertionError, match="only support bf16"): + rotate_activation(x) + + +@pytest.mark.parametrize("seqlen_and_topk", [[16, 32], [64, 32]]) +class TestComputeDSAIndexerLoss: + """Test compute_dsa_indexer_loss function.""" + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp']) + yield + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_loss_shape(self, seqlen_and_topk): + """Test that indexer loss returns a scalar.""" + batch_size = 2 + seqlen = seqlen_and_topk[0] + num_heads = 4 + head_dim = 128 + index_topk = seqlen_and_topk[1] + + # Create dummy index scores + index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda() + + # Apply causal mask to index_scores before computing topk + causal_mask = torch.triu( + torch.full( + (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device + ), + diagonal=1, + ) + # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen] + masked_index_scores = index_scores + causal_mask + + # Get topk indices from masked index_scores + topk_k = min(index_topk, seqlen) + topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1] + + query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + softmax_scale = head_dim**-0.5 + + loss = compute_dsa_indexer_loss( + index_scores=index_scores, + topk_indices=topk_indices, + query=query, + key=key, + softmax_scale=softmax_scale, + loss_coeff=1.0, + sparse_loss=False, + pg_collection=self.pg_collection, + ) + + assert loss.shape == torch.Size([]) + assert loss.dtype == torch.float32 + assert loss >= 0 # KL divergence should be non-negative + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_loss_sparse(self, seqlen_and_topk): + """Test sparse indexer loss computation.""" + batch_size = 2 + seqlen = seqlen_and_topk[0] + num_heads = 4 + head_dim = 128 + index_topk = seqlen_and_topk[1] + + # Create dummy index scores + index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda() + + # Apply causal mask to index_scores before computing topk + causal_mask = torch.triu( + torch.full( + (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device + ), + diagonal=1, + ) + # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen] + masked_index_scores = index_scores + causal_mask + + # Get topk indices from masked index_scores + topk_k = min(index_topk, seqlen) + topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1] + + query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + softmax_scale = head_dim**-0.5 + + loss_sparse = compute_dsa_indexer_loss( + index_scores=index_scores, + topk_indices=topk_indices, + query=query, + key=key, + softmax_scale=softmax_scale, + loss_coeff=1.0, + sparse_loss=True, + pg_collection=self.pg_collection, + ) + + loss_dense = compute_dsa_indexer_loss( + index_scores=index_scores, + topk_indices=topk_indices, + query=query, + key=key, + softmax_scale=softmax_scale, + loss_coeff=1.0, + sparse_loss=False, + pg_collection=self.pg_collection, + ) + + # Sparse loss should be different from dense loss + if seqlen > index_topk: + assert loss_sparse != loss_dense + else: + assert loss_sparse == loss_dense + assert loss_sparse >= 0 + assert loss_dense >= 0 + + +class TestDSAIndexerLossAutoScaler: + """Test DSAIndexerLossAutoScaler autograd function.""" + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + yield + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_forward_pass(self): + """Test that forward pass preserves output.""" + output = torch.randn(16, 2, 128).cuda() + output.requires_grad_(True) + indexer_loss = torch.tensor(0.5).cuda() + indexer_loss.requires_grad_(True) + + result = DSAIndexerLossAutoScaler.apply(output, indexer_loss) + + assert torch.allclose(result, output, atol=0, rtol=0) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_backward_pass(self): + """Test that backward pass triggers indexer loss backward and scales gradient correctly.""" + output = torch.randn(16, 2, 128).cuda() + output.requires_grad_(True) + + # Create indexer_loss with computation graph + # This simulates compute_dsa_indexer_loss which computes KL divergence + dummy_input = torch.randn(10).cuda() + dummy_input.requires_grad_(True) + indexer_loss = dummy_input.mean() + + # Set loss scale + scale = torch.tensor(2.0).cuda() + DSAIndexerLossAutoScaler.set_loss_scale(scale) + + # Apply the autograd function + result = DSAIndexerLossAutoScaler.apply(output, indexer_loss) + + # Trigger backward + main_loss = result.sum() + main_loss.backward() + + # Check that gradients flow back to output + assert output.grad is not None, "Gradient should flow back to parameters" + + # Check that indexer_loss backward was triggered + assert dummy_input.grad is not None, "Indexer loss backward should be triggered" + + # Verify the gradient is scaled correctly + expected_grad_per_element = scale.item() / len(dummy_input) + assert torch.allclose( + dummy_input.grad, + torch.full_like(dummy_input, expected_grad_per_element), + rtol=0, + atol=0, + ), f"Gradient should be scaled by loss scale, expected {expected_grad_per_element}, got {dummy_input.grad[0].item()}" + + +@pytest.mark.parametrize("seqlen", [16, 64]) +class TestDSAIndexer: + """Test DSA Indexer module basic functionality with TP=1.""" + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + # Create MLA config with sparse attention parameters + self.index_topk = 32 + self.config = MLATransformerConfig( + num_layers=2, + hidden_size=256, + num_attention_heads=16, + use_cpu_initialization=True, + bf16=True, + params_dtype=torch.bfloat16, + # MLA specific configs + q_lora_rank=64, + kv_lora_rank=64, + qk_head_dim=64, + qk_pos_emb_head_dim=32, + v_head_dim=64, + rope_type='rope', + rotary_base=10000, + rotary_percent=1.0, + # Sparse attention specific configs + dsa_indexer_n_heads=8, + dsa_indexer_head_dim=64, + dsa_indexer_topk=self.index_topk, + ) + + # Create indexer submodules spec + from megatron.core.extensions.transformer_engine import TELinear, TENorm + from megatron.core.transformer.spec_utils import ModuleSpec + + indexer_submodules = DSAIndexerSubmodules( + linear_wq_b=ModuleSpec(module=TELinear), + linear_wk=ModuleSpec(module=TELinear), + k_norm=ModuleSpec(module=TENorm), + linear_weights_proj=ModuleSpec(module=TELinear), + ) + + self.pg_collection = ProcessGroupCollection.use_mpu_process_groups( + required_pgs=['tp', 'cp'] + ) + self.indexer = DSAIndexer(self.config, indexer_submodules, self.pg_collection) + + yield + Utils.destroy_model_parallel() + + def test_dsa_indexer_constructor(self, seqlen): + """Test indexer initialization.""" + assert isinstance(self.indexer, DSAIndexer) + assert self.indexer.hidden_size == 256 + assert self.indexer.index_n_heads == 8 + assert self.indexer.index_head_dim == 64 + assert self.indexer.index_topk == 32 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_forward(self, seqlen): + """Test indexer forward pass.""" + batch_size = 2 + + self.indexer.cuda() + + # Create input tensors + x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Forward pass + topk_indices = self.indexer(x, qr) + + # Check output shape + assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen)) + assert topk_indices.dtype == torch.long + assert torch.all((topk_indices >= 0) & (topk_indices < seqlen)) + # Make sure no duplicate indices are selected + assert torch.all( + torch.sort(topk_indices, dim=-1).values[:, :, 1:] + != torch.sort(topk_indices, dim=-1).values[:, :, :-1] + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_forward_with_scores(self, seqlen): + """Test indexer forward pass with scores.""" + batch_size = 2 + + self.indexer.cuda() + + # Create input tensors + x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Forward pass with scores + index_scores, topk_indices = self.indexer.forward_with_scores(x, qr) + + # Check output shapes + assert index_scores.shape == (batch_size, seqlen, seqlen) + assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen)) + assert index_scores.dtype == torch.float32 + assert topk_indices.dtype == torch.long + assert torch.all((topk_indices >= 0) & (topk_indices < seqlen)) + # Make sure no duplicate indices are selected + assert torch.all( + torch.sort(topk_indices, dim=-1).values[:, :, 1:] + != torch.sort(topk_indices, dim=-1).values[:, :, :-1] + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_with_mask(self, seqlen): + """Test indexer with attention mask.""" + batch_size = 2 + + self.indexer.cuda() + + # Create input tensors + x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + mask = torch.triu( + torch.full((batch_size, seqlen, seqlen), float('-inf'), dtype=torch.float32).cuda(), + diagonal=1, + ) + + # Forward pass with mask + index_scores, topk_indices = self.indexer.forward_with_scores(x, qr, mask=mask) + + # Check that masked positions are not selected + # For causal mask, topk_indices[b, i, :] should all be <= i (except for the case that + # i < index_topk). + for b in range(batch_size): + for i in range(seqlen): + assert torch.all(topk_indices[b, i] <= max(self.index_topk, i)) + + +class TestDSAttention: + """Test DSAttention module basic functionality with TP=1.""" + + @pytest.fixture(scope='function', autouse=True) + def setup_method(self): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + # Create MLA config with sparse attention parameters + self.config = MLATransformerConfig( + num_layers=2, + hidden_size=256, + num_attention_heads=16, + use_cpu_initialization=True, + bf16=True, + params_dtype=torch.bfloat16, + # MLA specific configs + q_lora_rank=64, + kv_lora_rank=64, + qk_head_dim=64, + qk_pos_emb_head_dim=32, + v_head_dim=64, + rope_type='rope', + rotary_base=10000, + rotary_percent=1.0, + # Sparse attention specific configs + dsa_indexer_n_heads=8, + dsa_indexer_head_dim=64, + dsa_indexer_topk=32, + dsa_indexer_loss_coeff=1.0, + dsa_indexer_use_sparse_loss=False, + ) + + # Create sparse attention submodules spec + from megatron.core.extensions.transformer_engine import TELinear, TENorm + from megatron.core.transformer.spec_utils import ModuleSpec + + indexer_submodules = DSAIndexerSubmodules( + linear_wq_b=ModuleSpec(module=TELinear), + linear_wk=ModuleSpec(module=TELinear), + k_norm=ModuleSpec(module=TENorm), + linear_weights_proj=ModuleSpec(module=TELinear), + ) + indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules) + sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec) + + self.pg_collection = ProcessGroupCollection.use_mpu_process_groups( + required_pgs=['tp', 'cp'] + ) + + self.sparse_attention = DSAttention( + config=self.config, + submodules=sparse_attention_submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal, + attention_type='self', + pg_collection=self.pg_collection, + ) + + yield + Utils.destroy_model_parallel() + + def test_dsa_constructor(self): + """Test sparse attention initialization.""" + assert isinstance(self.sparse_attention, DSAttention) + assert hasattr(self.sparse_attention, 'indexer') + assert isinstance(self.sparse_attention.indexer, DSAIndexer) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_forward(self): + """Test sparse attention forward pass.""" + seq_len = 16 + batch_size = 2 + num_heads = self.config.num_attention_heads + head_dim = self.config.hidden_size // num_heads + + self.sparse_attention.cuda() + + # Create input tensors [seq_len, batch, num_heads, head_dim] + query = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + key = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + value = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + + # Original hidden states and low-rank query + x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Create causal attention mask + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + # Forward pass + output = self.sparse_attention( + query=query, + key=key, + value=value, + x=x, + qr=qr, + attention_mask=attention_mask, + attn_mask_type=AttnMaskType.causal, + ) + + # Check output shape + assert output.shape == (seq_len, batch_size, self.config.hidden_size) + assert output.dtype == torch.bfloat16 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_backward(self): + """Test sparse attention backward pass with indexer loss.""" + seq_len = 16 + batch_size = 2 + num_heads = self.config.num_attention_heads + head_dim = self.config.hidden_size // num_heads + + self.sparse_attention.train() + self.sparse_attention.cuda() + + # Create input tensors + query = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + key = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + value = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + + # Original hidden states and low-rank query + x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Create causal attention mask + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + # Forward pass + output = self.sparse_attention( + query=query, + key=key, + value=value, + x=x, + qr=qr, + attention_mask=attention_mask, + attn_mask_type=AttnMaskType.causal, + ) + + # Backward pass + loss = output.sum() + loss.backward() + + # Check that gradients are computed for inputs + assert query.grad is not None + assert key.grad is not None + assert value.grad is not None + + # Check that indexer parameters have gradients + for name, param in self.sparse_attention.indexer.named_parameters(): + if param.requires_grad: + assert param.grad is not None, f"Indexer parameter {name} has no gradient" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_topk_selection(self): + """Test that sparse attention correctly selects top-k indices.""" + seq_len = 16 + batch_size = 2 + num_heads = self.config.num_attention_heads + head_dim = self.config.hidden_size // num_heads + + self.sparse_attention.eval() + self.sparse_attention.cuda() + + # Create input tensors + query = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + key = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + value = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda() + + # Original hidden states and low-rank query + x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda() + qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Create causal attention mask + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + with torch.no_grad(): + # Get topk indices from indexer + _, topk_indices = self.sparse_attention.indexer.forward_with_scores(x, qr) + + # Forward pass + output = self.sparse_attention( + query=query, + key=key, + value=value, + x=x, + qr=qr, + attention_mask=attention_mask, + attn_mask_type=AttnMaskType.causal, + ) + + # Check that topk_indices are valid + assert torch.all(topk_indices >= 0) + assert torch.all(topk_indices < seq_len) + assert topk_indices.shape[2] == min(self.config.dsa_indexer_topk, seq_len) + + +# ====================================================================================== +# Tensor Parallel Consistency Tests +# ====================================================================================== + + +@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4, 8]) +@pytest.mark.parametrize("sequence_parallel", [False, True]) +class TestIndexerTensorParallel: + """Test DSA Indexer with different TP sizes and SP settings, compare with TP=1 baseline.""" + + def _create_config(self, sequence_parallel=False): + """Helper to create MLA config.""" + # Get TP size from parallel_state + tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size() + + return MLATransformerConfig( + num_layers=2, + hidden_size=256, + num_attention_heads=16, + use_cpu_initialization=True, + bf16=True, + params_dtype=torch.bfloat16, + tensor_model_parallel_size=tensor_model_parallel_size, + sequence_parallel=sequence_parallel, + # MLA specific configs + q_lora_rank=64, + kv_lora_rank=64, + qk_head_dim=64, + qk_pos_emb_head_dim=32, + v_head_dim=64, + rope_type='rope', + rotary_base=10000, + rotary_percent=1.0, + # Sparse attention specific configs + dsa_indexer_n_heads=8, + dsa_indexer_head_dim=64, + dsa_indexer_topk=32, + ) + + def _create_indexer(self, config, pg_collection): + """Helper to create indexer.""" + from megatron.core.extensions.transformer_engine import TELinear, TENorm + from megatron.core.transformer.spec_utils import ModuleSpec + + indexer_submodules = DSAIndexerSubmodules( + linear_wq_b=ModuleSpec(module=TELinear), + linear_wk=ModuleSpec(module=TELinear), + k_norm=ModuleSpec(module=TENorm), + linear_weights_proj=ModuleSpec(module=TELinear), + ) + + return DSAIndexer(config, indexer_submodules, pg_collection) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_weight_consistency(self, tensor_model_parallel_size, sequence_parallel): + """Test that indexer weights are identical across ALL GPUs.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config = self._create_config(sequence_parallel=sequence_parallel) + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + indexer = self._create_indexer(config, pg_collection).cuda() + + # Check that all weights are identical across ALL ranks (not just TP group) + world_size = torch.distributed.get_world_size() + world_rank = torch.distributed.get_rank() + + if world_size > 1: + for name, param in indexer.named_parameters(): + # Gather weights from ALL ranks in WORLD group + param_list = [torch.zeros_like(param.data) for _ in range(world_size)] + torch.distributed.all_gather(param_list, param.data) + + # All weights should be identical across all GPUs + for i in range(1, world_size): + assert torch.allclose( + param_list[0], param_list[i], rtol=0, atol=0 + ), f"Parameter {name} differs between rank 0 and rank {i} (world)" + + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_forward_consistency(self, tensor_model_parallel_size, sequence_parallel): + """Test that indexer gives consistent results across different TP sizes and SP settings.""" + # First run with TP=1 to get baseline + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config_tp1 = self._create_config(sequence_parallel=False) # TP=1 doesn't use SP + pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + indexer_tp1 = self._create_indexer(config_tp1, pg_collection_tp1).cuda() + + seq_len = 64 + batch_size = 2 + + # Create one common input (all ranks create same input with same seed) + x_input = torch.randn( + seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16 + ).cuda() + qr_input = torch.randn( + seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16 + ).cuda() + + # Forward pass with gradients enabled + index_scores_tp1, topk_indices_tp1 = indexer_tp1.forward_with_scores(x_input, qr_input) + + # Backward pass + loss_tp1 = index_scores_tp1.sum() + loss_tp1.backward() + + # Save gradients from TP=1 + indexer_tp1_grads = { + name: param.grad.clone().cpu() + for name, param in indexer_tp1.named_parameters() + if param.grad is not None + } + + Utils.destroy_model_parallel() + + # Now run with target TP size + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config_tpn = self._create_config(sequence_parallel=sequence_parallel) + pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + indexer_tpn = self._create_indexer(config_tpn, pg_collection_tpn).cuda() + + # Prepare input: split along seqlen if SP is enabled + if sequence_parallel: + tp_rank = parallel_state.get_tensor_model_parallel_rank() + seq_per_rank = seq_len // tensor_model_parallel_size + start_idx = tp_rank * seq_per_rank + end_idx = (tp_rank + 1) * seq_per_rank + x_tpn = x_input[start_idx:end_idx] + qr_tpn = qr_input[start_idx:end_idx] + else: + # No SP: all TP ranks see full input + x_tpn = x_input + qr_tpn = qr_input + + # Forward pass with gradients enabled + index_scores_tpn, topk_indices_tpn = indexer_tpn.forward_with_scores(x_tpn, qr_tpn) + + # Backward pass + loss_tpn = index_scores_tpn.sum() + loss_tpn.backward() + + # Compare forward outputs + assert index_scores_tpn.shape == index_scores_tp1.shape + assert topk_indices_tpn.shape == topk_indices_tp1.shape + + # Check that index scores are close (allow for floating point accumulation errors) + assert torch.allclose( + index_scores_tpn, index_scores_tp1, rtol=0, atol=0 + ), f"Index scores mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}" + + # Check that topk indices are exactly the same + assert torch.equal( + topk_indices_tpn, topk_indices_tp1 + ), f"Top-k indices mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}" + + # Compare gradients - indexer grads should be identical (duplicated weights) + for name, param in indexer_tpn.named_parameters(): + if param.grad is not None and name in indexer_tp1_grads: + assert torch.allclose( + param.grad.cpu(), indexer_tp1_grads[name], rtol=0, atol=0 + ), f"Indexer gradient {name} mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}" + + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_indexer_gradient_sync(self, tensor_model_parallel_size, sequence_parallel): + """Test that gradients are properly synchronized within TP group.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config = self._create_config(sequence_parallel=sequence_parallel) + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + indexer = self._create_indexer(config, pg_collection).cuda() + + seq_len = 64 + batch_size = 2 + + # Create one common input (all ranks create same input with same seed) + x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda() + qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Prepare input: split along seqlen if SP is enabled + if sequence_parallel: + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + seq_per_rank = seq_len // tp_size + start_idx = tp_rank * seq_per_rank + end_idx = (tp_rank + 1) * seq_per_rank + x = x_input[start_idx:end_idx] + qr = qr_input[start_idx:end_idx] + else: + # No SP: all TP ranks see full input + x = x_input + qr = qr_input + + # Forward and backward + index_scores, topk_indices = indexer.forward_with_scores(x, qr) + loss = index_scores.sum() + loss.backward() + + # Check that all parameters have gradients + for name, param in indexer.named_parameters(): + if param.requires_grad: + assert param.grad is not None, f"Parameter {name} has no gradient" + + # After TP sync, check that gradients are identical within TP group + # Note: We only check TP group because DDP sync happens separately + tp_size = parallel_state.get_tensor_model_parallel_world_size() + if tp_size > 1: + for name, param in indexer.named_parameters(): + if param.requires_grad and param.grad is not None: + # Gather gradients from all ranks in TP group only + grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)] + torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp) + + # All gradients should be identical within TP group after sync + for i in range(1, tp_size): + assert torch.allclose( + grad_list[0], grad_list[i], rtol=0, atol=0 + ), f"Gradient for {name} differs between TP rank 0 and rank {i} after TP sync" + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4]) +@pytest.mark.parametrize("sequence_parallel", [False, True]) +@pytest.mark.parametrize("use_sparse_indexer_loss", [False, True]) +class TestDSAttentionTensorParallel: + """Test DSAttention with different TP sizes, SP settings, and sparse indexer loss.""" + + def _create_config(self, sequence_parallel=False, use_sparse_indexer_loss=False): + """Helper to create MLA config.""" + # Get TP size from parallel_state + tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size() + + return MLATransformerConfig( + num_layers=2, + hidden_size=256, + num_attention_heads=16, + use_cpu_initialization=True, + bf16=True, + params_dtype=torch.bfloat16, + tensor_model_parallel_size=tensor_model_parallel_size, + sequence_parallel=sequence_parallel, + # MLA specific configs + q_lora_rank=64, + kv_lora_rank=64, + qk_head_dim=64, + qk_pos_emb_head_dim=32, + v_head_dim=64, + rope_type='rope', + rotary_base=10000, + rotary_percent=1.0, + # Sparse attention specific configs + dsa_indexer_n_heads=8, + dsa_indexer_head_dim=64, + dsa_indexer_topk=32, + dsa_indexer_loss_coeff=1.0, + dsa_indexer_use_sparse_loss=use_sparse_indexer_loss, + ) + + def _create_sparse_attention(self, config, pg_collection): + """Helper to create sparse attention.""" + from megatron.core.extensions.transformer_engine import TELinear, TENorm + from megatron.core.transformer.spec_utils import ModuleSpec + + indexer_submodules = DSAIndexerSubmodules( + linear_wq_b=ModuleSpec(module=TELinear), + linear_wk=ModuleSpec(module=TELinear), + k_norm=ModuleSpec(module=TENorm), + linear_weights_proj=ModuleSpec(module=TELinear), + ) + indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules) + sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec) + + return DSAttention( + config=config, + submodules=sparse_attention_submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal, + attention_type='self', + pg_collection=pg_collection, + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_weight_consistency( + self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss + ): + """Test that sparse attention indexer weights are identical across ALL GPUs.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config = self._create_config( + sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss + ) + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + sparse_attention = self._create_sparse_attention(config, pg_collection).cuda() + + # Check that all indexer weights are identical across ALL ranks + world_size = torch.distributed.get_world_size() + world_rank = torch.distributed.get_rank() + + if world_size > 1: + for name, param in sparse_attention.indexer.named_parameters(): + # Gather weights from ALL ranks in WORLD group + param_list = [torch.zeros_like(param.data) for _ in range(world_size)] + torch.distributed.all_gather(param_list, param.data) + + # All weights should be identical across all GPUs + for i in range(1, world_size): + torch.testing.assert_close(param_list[0], param_list[i], rtol=0, atol=0) + + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_forward_consistency( + self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss + ): + """Test that sparse attention gives consistent results across different TP, SP, and sparse loss settings.""" + # First run with TP=1 to get baseline + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config_tp1 = self._create_config( + sequence_parallel=False, use_sparse_indexer_loss=use_sparse_indexer_loss + ) # TP=1 doesn't use SP + pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + sparse_attention_tp1 = self._create_sparse_attention(config_tp1, pg_collection_tp1).cuda() + + seq_len = 64 + batch_size = 2 + num_heads = config_tp1.num_attention_heads + head_dim = config_tp1.hidden_size // num_heads + + # Create one common input (all ranks create same input with same seed) + query_input = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + key_input = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + value_input = ( + torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16) + .cuda() + .requires_grad_(True) + ) + x_input = torch.randn( + seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16 + ).cuda() + qr_input = torch.randn( + seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16 + ).cuda() + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + # Forward pass with gradients enabled + sparse_attention_tp1.train() + output_tp1 = sparse_attention_tp1( + query=query_input, + key=key_input, + value=value_input, + x=x_input, + qr=qr_input, + attention_mask=attention_mask, + attn_mask_type=AttnMaskType.causal, + ) + + # Backward pass + loss_tp1 = output_tp1.sum() + loss_tp1.backward() + + # Save gradients from TP=1 + indexer_tp1_grads = { + name: param.grad.clone() + for name, param in sparse_attention_tp1.indexer.named_parameters() + if param.grad is not None + } + query_tp1_grad = query_input.grad.clone().cpu() + key_tp1_grad = key_input.grad.clone().cpu() + value_tp1_grad = value_input.grad.clone().cpu() + + Utils.destroy_model_parallel() + + # Now run with target TP size + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config_tpn = self._create_config( + sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss + ) + pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + sparse_attention_tpn = self._create_sparse_attention(config_tpn, pg_collection_tpn).cuda() + + # Create one common input (all ranks create same input with same seed) + query_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + key_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + value_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + x_input = torch.randn( + seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16 + ).cuda() + qr_input = torch.randn( + seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16 + ).cuda() + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + # Prepare input: split along seqlen if SP is enabled + tp_rank = parallel_state.get_tensor_model_parallel_rank() + if sequence_parallel: + seq_per_rank = seq_len // tensor_model_parallel_size + start_idx = tp_rank * seq_per_rank + end_idx = (tp_rank + 1) * seq_per_rank + x_tpn = x_input[start_idx:end_idx] + qr_tpn = qr_input[start_idx:end_idx] + else: + x_tpn = x_input + qr_tpn = qr_input + + query_input = query_input.detach() + key_input = key_input.detach() + value_input = value_input.detach() + head_per_rank = num_heads // tensor_model_parallel_size + start_head = tp_rank * head_per_rank + end_head = (tp_rank + 1) * head_per_rank + query_tpn = query_input[:, :, start_head:end_head, :].clone().requires_grad_(True) + key_tpn = key_input[:, :, start_head:end_head, :].clone().requires_grad_(True) + value_tpn = value_input[:, :, start_head:end_head, :].clone().requires_grad_(True) + attention_mask_tpn = attention_mask + + # Forward pass with gradients enabled + sparse_attention_tpn.train() + output_tpn = sparse_attention_tpn( + query=query_tpn, + key=key_tpn, + value=value_tpn, + x=x_tpn, + qr=qr_tpn, + attention_mask=attention_mask_tpn, + attn_mask_type=AttnMaskType.causal, + ) + + # Backward pass + loss_tpn = output_tpn.sum() + loss_tpn.backward() + + from megatron.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region + + output_tpn_gathered = gather_from_tensor_model_parallel_region( + output_tpn, group=pg_collection_tpn.tp + ) + assert output_tpn_gathered.shape == output_tp1.shape + assert torch.allclose( + output_tpn_gathered.detach(), output_tp1.detach(), rtol=0, atol=0 + ), f"Sparse attention outputs mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}, sparse_loss={use_sparse_indexer_loss}" + + # 1. Check indexer gradients. + for name, param in sparse_attention_tpn.indexer.named_parameters(): + if param.grad is not None and name in indexer_tp1_grads: + torch.testing.assert_close( + param.grad, indexer_tp1_grads[name], rtol=1e-5, atol=1e-5 + ) + + # 2. Query/Key/Value gradients need to be gathered along num_heads dim (dim 2) if SP is enabled + # Flatten last two dims: [seq_len, batch, num_heads, head_dim] -> [seq_len, batch, num_heads * head_dim] + sq, b, nh, hd = query_tpn.grad.shape + query_grad_flat = query_tpn.grad.reshape(sq, b, nh * hd) + key_grad_flat = key_tpn.grad.reshape(sq, b, nh * hd) + value_grad_flat = value_tpn.grad.reshape(sq, b, nh * hd) + + # Gather along last dim + query_grad_gathered_flat = gather_from_tensor_model_parallel_region( + query_grad_flat, group=pg_collection_tpn.tp + ) + key_grad_gathered_flat = gather_from_tensor_model_parallel_region( + key_grad_flat, group=pg_collection_tpn.tp + ) + value_grad_gathered_flat = gather_from_tensor_model_parallel_region( + value_grad_flat, group=pg_collection_tpn.tp + ) + + # Reshape back: [seq_len, batch, num_heads * head_dim] -> [seq_len, batch, num_heads, head_dim] + query_tpn_grad_gathered = query_grad_gathered_flat.reshape(sq, b, num_heads, hd) + key_tpn_grad_gathered = key_grad_gathered_flat.reshape(sq, b, num_heads, hd) + value_tpn_grad_gathered = value_grad_gathered_flat.reshape(sq, b, num_heads, hd) + + assert torch.allclose( + query_tpn_grad_gathered.cpu(), query_tp1_grad, rtol=0, atol=0 + ), f"Query gradient mismatch between TP=1 and TP={tensor_model_parallel_size}" + assert torch.allclose( + key_tpn_grad_gathered.cpu(), key_tp1_grad, rtol=0, atol=0 + ), f"Key gradient mismatch between TP=1 and TP={tensor_model_parallel_size}" + assert torch.allclose( + value_tpn_grad_gathered.cpu(), value_tp1_grad, rtol=0, atol=0 + ), f"Value gradient mismatch between TP=1 and TP={tensor_model_parallel_size}" + + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_dsa_gradient_sync( + self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss + ): + """Test that indexer gradients are properly synchronized within TP group.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1 + ) + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + config = self._create_config( + sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss + ) + pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp']) + sparse_attention = self._create_sparse_attention(config, pg_collection).cuda() + sparse_attention.train() + + seq_len = 64 + batch_size = 2 + num_heads = config.num_attention_heads + head_dim = config.hidden_size // num_heads + + # Create one common input (all ranks create same input with same seed) + query_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + key_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + value_input = torch.randn( + seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16 + ).cuda() + x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda() + qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda() + + # Prepare input: split along seqlen if SP is enabled + tp_rank = parallel_state.get_tensor_model_parallel_rank() + if sequence_parallel: + tp_size = parallel_state.get_tensor_model_parallel_world_size() + seq_per_rank = seq_len // tp_size + start_idx = tp_rank * seq_per_rank + end_idx = (tp_rank + 1) * seq_per_rank + x = x_input[start_idx:end_idx] + qr = qr_input[start_idx:end_idx] + else: + x = x_input + qr = qr_input + + # query, key, value should be split along num_heads dim + head_per_rank = num_heads // tensor_model_parallel_size + start_head = tp_rank * head_per_rank + end_head = (tp_rank + 1) * head_per_rank + query = query_input[:, :, start_head:end_head, :] + key = key_input[:, :, start_head:end_head, :] + value = value_input[:, :, start_head:end_head, :] + + attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda() + attention_mask = torch.tril(attention_mask) + + query.requires_grad_(True) + key.requires_grad_(True) + value.requires_grad_(True) + + # Forward and backward + output = sparse_attention( + query=query, + key=key, + value=value, + x=x, + qr=qr, + attention_mask=attention_mask, + attn_mask_type=AttnMaskType.causal, + ) + + loss = output.sum() + loss.backward() + + # Check that gradients exist before sync + assert query.grad is not None + assert key.grad is not None + assert value.grad is not None + + # Check that indexer parameters have gradients + for name, param in sparse_attention.indexer.named_parameters(): + if param.requires_grad: + assert param.grad is not None, f"Indexer parameter {name} has no gradient" + + # Check that indexer gradients are identical within TP group + tp_size = parallel_state.get_tensor_model_parallel_world_size() + if tp_size > 1: + for name, param in sparse_attention.indexer.named_parameters(): + if param.requires_grad and param.grad is not None: + # Gather gradients from all ranks in TP group only + grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)] + torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp) + + # All gradients should be identical within TP group after sync + for i in range(1, tp_size): + assert torch.allclose( + grad_list[0], grad_list[i], rtol=0, atol=0 + ), f"Indexer gradient for {name} differs between TP rank 0 and rank {i} after TP sync" + + Utils.destroy_model_parallel() From 71357e2ba87c012245fd018eb987a59edffcf222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Dec 2025 18:27:16 +0000 Subject: [PATCH 166/248] Revert "[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)" This reverts commit 92c8482e6dcd11c3666c61bb8d1f7e8d0730ed13. --- .../text_generation_controller.py | 3 +- .../common/language_module/language_module.py | 5 +- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/pipeline_parallel/schedules.py | 7 +- megatron/core/ssm/mamba_block.py | 3 +- megatron/core/transformer/attention.py | 4 +- megatron/core/transformer/cuda_graphs.py | 47 ++----- megatron/core/transformer/enums.py | 12 -- megatron/core/transformer/moe/fused_a2a.py | 8 -- megatron/core/transformer/moe/moe_utils.py | 7 +- .../core/transformer/moe/token_dispatcher.py | 12 +- .../core/transformer/transformer_block.py | 4 +- .../core/transformer/transformer_config.py | 112 ++++++++--------- .../core/transformer/transformer_layer.py | 47 ++++--- megatron/training/arguments.py | 18 +-- megatron/training/training.py | 9 +- .../inference/engines/test_dynamic_engine.py | 12 +- tests/unit_tests/test_fp8_param.py | 24 ++-- .../transformer/test_cuda_graphs.py | 117 ++++++------------ 19 files changed, 153 insertions(+), 302 deletions(-) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 6e00f58ac23..2bda1425710 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -29,7 +29,6 @@ ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel from megatron.core.utils import get_asyncio_loop, get_model_config, unwrap_model @@ -852,7 +851,7 @@ def generate_all_output_tokens_static_batch( # Check whether CUDA graphs are enabled enable_cuda_graph = ( model_config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in model_config.cuda_graph_scope + and "full_iteration" not in model_config.cuda_graph_scope ) # Pad batch tokens if necessary diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index 259bb716a93..de2ecfb8011 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -21,7 +21,7 @@ is_vp_last_stage, ) from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import AttnBackend, CudaGraphScope +from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group @@ -144,7 +144,8 @@ def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: # Use is_cg_capturable=True for full iteration CUDA graphs to avoid torch.equal checks is_cg_capturable = ( hasattr(self.config, 'cuda_graph_scope') - and CudaGraphScope.full_iteration in self.config.cuda_graph_scope + and self.config.cuda_graph_scope + and 'full_iteration' in self.config.cuda_graph_scope ) if is_cg_capturable and not is_te_min_version("2.7.0"): from megatron.core.utils import get_te_version diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index a3d1a8bfc00..ce1e8e76bd9 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -24,7 +24,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region -from megatron.core.transformer.enums import CudaGraphScope, ModelType +from megatron.core.transformer.enums import ModelType from megatron.core.transformer.multi_token_prediction import ( MTPLossAutoScaler, MTPLossLoggingHelper, @@ -374,7 +374,7 @@ def _preprocess( and ( ( self.config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope + and "full_iteration" not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 18344429c45..d0b912349b4 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -21,7 +21,6 @@ ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import create_cudagraphs -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler from megatron.core.utils import ( drain_embedding_wgrad_compute, @@ -657,7 +656,7 @@ def forward_backward_no_pipelining( if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in config.cuda_graph_scope + and "full_iteration" not in config.cuda_graph_scope ): create_cudagraphs() @@ -1924,7 +1923,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in config.cuda_graph_scope + and "full_iteration" not in config.cuda_graph_scope ): create_cudagraphs() nvtx_range_pop(suffix="misc") @@ -2311,7 +2310,7 @@ def enable_grad_sync(): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in config.cuda_graph_scope + and "full_iteration" not in config.cuda_graph_scope ): create_cudagraphs() diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index 3201a8bfb28..1bcadd0af10 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -25,7 +25,6 @@ from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module @@ -295,7 +294,7 @@ def forward( ( ( self.config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope + and "full_iteration" not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 5cf22d25a4b..f6f40027789 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -45,7 +45,7 @@ from ..models.common.embeddings.yarn_rotary_pos_embedding import ( _yarn_get_concentration_factor_from_config, ) -from .enums import AttnMaskType, CudaGraphScope +from .enums import AttnMaskType from .transformer_config import TransformerConfig try: @@ -829,7 +829,7 @@ def forward( if ( in_decode_mode and self.config.cuda_graph_impl == "local" - and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope + and "full_iteration" not in self.config.cuda_graph_scope and inference_context.is_static_batching() ): raise ValueError(f"CUDA graphs must use flash decode with static batching!") diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 5b0a0333d9e..12f15ee980a 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -21,7 +21,6 @@ get_all_rng_states, get_cuda_rng_tracker, ) -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig @@ -1345,24 +1344,24 @@ def _layer_is_graphable(layer, config): from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_layer import TransformerLayer - if isinstance(layer, MambaLayer) and CudaGraphScope.mamba in config.cuda_graph_scope: + if isinstance(layer, MambaLayer) and 'mamba' in config.cuda_graph_scope: # mamba layer. return True if isinstance(layer, TransformerLayer): - if CudaGraphScope.attn in config.cuda_graph_scope and not ( + if 'attn' in config.cuda_graph_scope and not ( isinstance(layer.self_attention, IdentityOp) and isinstance(layer.cross_attention, IdentityOp) ): # attn layer. return True if ( - CudaGraphScope.moe in config.cuda_graph_scope - or CudaGraphScope.moe_router in config.cuda_graph_scope - or CudaGraphScope.moe_preprocess in config.cuda_graph_scope + 'moe' in config.cuda_graph_scope + or 'moe_router' in config.cuda_graph_scope + or 'moe_preprocess' in config.cuda_graph_scope ) and isinstance(layer.mlp, MoELayer): # moe layer. return True - if CudaGraphScope.mlp in config.cuda_graph_scope and isinstance(layer.mlp, MLP): + if 'mlp' in config.cuda_graph_scope and isinstance(layer.mlp, MLP): # mlp layer. return True return False @@ -1389,7 +1388,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using " "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True." ) - assert CudaGraphScope.full_iteration not in config.cuda_graph_scope, ( + assert "full_iteration" not in config.cuda_graph_scope, ( "full_iteration cuda graph is not supported for cuda_graph_impl=transformer_engine. " "Please use cuda_graph_impl=local instead." ) @@ -1530,7 +1529,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): and not isinstance(layer.self_attention, IdentityOp) and ( not self.config.cuda_graph_scope - or CudaGraphScope.attn in self.config.cuda_graph_scope + or 'attn' in self.config.cuda_graph_scope ) ) if is_te_min_version("1.10.0"): @@ -1713,33 +1712,3 @@ def cuda_graph_set_manual_hooks(self): model_chunk = self.model[chunk_number] for layer in layers: layer.setup_manual_hooks(model_chunk._make_forward_pre_hook) - - def delete_cuda_graphs(self): - """ - Delete all CUDA graphs. - """ - assert self._graphs_created, "CUDA Graphs have not been created." - - graph_resettable = is_te_min_version("2.10.0") - graphs_reset, graphs_not_reset = 0, 0 - for layers in self.callables_per_chunk: - for layer in layers: - for graph in layer.cuda_graphs: - if graph_resettable: - graph.reset() - graphs_reset += 1 - else: - graphs_not_reset += 1 - layer.cuda_graphs = [] - layer.cuda_graph_manual_hooks = [] - - log_on_each_pipeline_stage( - logger=logger, - tp_group=None, - dp_cp_group=None, - level=logging.INFO, - msg=f'Rank {torch.distributed.get_rank()}: ' - f'{graphs_reset} graphs deleted with explicit reset, ' - f'{graphs_not_reset} graphs deleted without explicit reset.', - ) - self._graphs_created = False diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py index d06d58d65f2..52b82029f90 100644 --- a/megatron/core/transformer/enums.py +++ b/megatron/core/transformer/enums.py @@ -65,15 +65,3 @@ class AttnBackend(enum.Enum): unfused = 3 local = 4 auto = 5 - - -class CudaGraphScope(enum.Enum): - """Cuda Graph Scope - defines which parts of the model to capture.""" - - full_iteration = 1 # Captures the entire training/inference iteration - attn = 2 # Captures attention layers - mlp = 3 # Captures MLP layers (dense layers only) - moe = 4 # Captures MoE layers (drop-and-pad MoE layers only) - moe_router = 5 # Captures MoE router part - moe_preprocess = 6 # Captures MoE preprocessing part (requires moe_router) - mamba = 7 # Captures Mamba layers diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 045a93039b3..60b0b11a32c 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -320,14 +320,6 @@ def init_hybrid_ep_buffer( ) -def reset_hybrid_ep_buffer(): - ''' - Reset the HybridEP buffer - ''' - global _hybrid_ep_buffer - _hybrid_ep_buffer = None - - class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 3ed31d375e2..d28cbfea3fe 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -11,7 +11,6 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig try: @@ -1206,13 +1205,13 @@ def maybe_raise_signal(moe_layer, **kwargs): ): if ( step_condition == "route" - and CudaGraphScope.moe_router in moe_layer.config.cuda_graph_scope - and CudaGraphScope.moe_preprocess not in moe_layer.config.cuda_graph_scope + and 'moe_router' in moe_layer.config.cuda_graph_scope + and 'moe_preprocess' not in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "route", **kwargs) elif ( step_condition == "preprocess" - and CudaGraphScope.moe_preprocess in moe_layer.config.cuda_graph_scope + and 'moe_preprocess' in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "preprocess", **kwargs) diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index af8ae572adb..b2135fdb00d 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -16,7 +16,6 @@ gather_from_sequence_parallel_region, reduce_scatter_to_sequence_parallel_region, ) -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.fused_a2a import ( fused_combine, fused_dispatch, @@ -437,7 +436,7 @@ def __init__( } if ( config.cuda_graph_impl == "transformer_engine" - and CudaGraphScope.moe_preprocess in config.cuda_graph_scope + and 'moe_preprocess' in config.cuda_graph_scope ): self.cuda_dtoh_point = "before_ep_alltoall" else: @@ -1076,13 +1075,10 @@ def combine( num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) - # Release the used handle/num_permuted_tokens which could change in each iteration. - # For drop_and_pad mode, we don't need to reset the num_permuted_tokens and - # num_dispatched_tokens, because their values never change. + # Release the used handle/num_permuted_tokens which could change in each iteration self.handle = None - if not self.drop_and_pad: - self.num_permuted_tokens = None - self.num_dispatched_tokens = None + self.num_permuted_tokens = None + self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 023db1fe75a..6f69927e9e8 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -21,7 +21,7 @@ ) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import CudaGraphScope, LayerType +from megatron.core.transformer.enums import LayerType from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig @@ -555,7 +555,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): kwargs.get('inference_context') is not None or kwargs.get('inference_params') is not None ) - and CudaGraphScope.full_iteration in self.config.cuda_graph_scope + and 'full_iteration' in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index a3a16754977..656699ea2a2 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -9,7 +9,7 @@ from megatron.core.enums import Fp4Recipe, Fp8Recipe from megatron.core.quantization.quant_config import RecipeConfig -from megatron.core.transformer.enums import AttnBackend, CudaGraphScope +from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout from ..fusions.fused_bias_geglu import quick_gelu @@ -733,7 +733,7 @@ class TransformerConfig(ModelParallelConfig): excluding optimizer) is enabled. "transformer_engine": capture the CUDA graph using TE make_graphed_callables().""" - cuda_graph_scope: Optional[List[CudaGraphScope]] = None + cuda_graph_scope: Optional[List[str]] = None """Determines the CUDA graphs capturing scope. When cuda_graph_impl is set to "transformer_engine", valid values are "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba". None means the full layer. @@ -1615,76 +1615,65 @@ def __post_init__(self): 'use cuda_graph_impl=transformer_engine instead.' ) self.cuda_graph_impl = "transformer_engine" - if self.cuda_graph_scope is None: self.cuda_graph_scope = [] - elif not isinstance(self.cuda_graph_scope, list): - if isinstance(self.cuda_graph_scope, CudaGraphScope): - self.cuda_graph_scope = [self.cuda_graph_scope] - else: - assert isinstance(self.cuda_graph_scope, str), ( - "cuda_graph_scope must be a string that can be converted to a list of " - f"CudaGraphScope, got {self.cuda_graph_scope}." - ) - self.cuda_graph_scope = self.cuda_graph_scope.split(',') - if all(isinstance(scope, str) for scope in self.cuda_graph_scope): - # Backward compatibility for "full" scope. Now we use an empty list instead. - if "full" in self.cuda_graph_scope: - assert self.cuda_graph_scope == [ - "full" - ], "full scope cannot be used with other scopes." - warnings.warn( - "full scope is deprecated. " - "Use empty cuda_graph_scope to capture the whole layer." - ) - self.cuda_graph_scope = [] - else: - self.cuda_graph_scope = [CudaGraphScope[scope] for scope in self.cuda_graph_scope] - assert all( - isinstance(scope, CudaGraphScope) for scope in self.cuda_graph_scope - ), f"cuda_graph_scope must be a list of CudaGraphScope, got {self.cuda_graph_scope}." - if self.cuda_graph_impl != "none": assert self.cuda_graph_impl in [ "transformer_engine", "local", ], f"Invalid cuda graph implementation: {self.cuda_graph_impl}" - if self.cpu_offloading: raise ValueError("CUDA graphs not supported with CPU offloading.") + elif not isinstance(self.cuda_graph_scope, list): + assert isinstance(self.cuda_graph_scope, str), ( + "cuda_graph_scope must be a string or a list of strings, " + f"got {self.cuda_graph_scope}." + ) + self.cuda_graph_scope = [self.cuda_graph_scope] + if self.cuda_graph_impl == "local": - assert not self.cuda_graph_scope or self.cuda_graph_scope == [ - CudaGraphScope.full_iteration - ], ( - "For local cuda graph implementation, the only valid value for " - "cuda_graph_scope is full_iteration, or an empty list to denote layerwise " - "graphs. To use other scopes, use cuda_graph_impl=transformer_engine." + assert not self.cuda_graph_scope or self.cuda_graph_scope == ["full_iteration"], ( + "For local cuda graph implementation, the only valid value " + "for cuda_graph_scope is full_iteration. " + "To use other scopes, use cuda_graph_impl=transformer_engine." ) if self.cuda_graph_impl == "transformer_engine": - assert CudaGraphScope.full_iteration not in self.cuda_graph_scope, ( + assert "full_iteration" not in self.cuda_graph_scope, ( "To use full iteration cuda graph, please use " - "cuda_graph_impl=local instead of cuda_graph_impl=transformer_engine." + "cuda_graph_impl=transformer_engine instead of cuda_graph_impl=local." ) + for scope in self.cuda_graph_scope: + assert scope in [ + 'attn', + 'mlp', + 'moe', + 'moe_router', + 'moe_preprocess', + 'mamba', + ], ( + "--cuda-graph-scope should be attn, mlp, moe, moe_router, moe_preprocess, " + f"or mamba, got {self.cuda_graph_scope}." + ) + assert ( - CudaGraphScope.moe not in self.cuda_graph_scope - or CudaGraphScope.moe_router not in self.cuda_graph_scope + 'moe' not in self.cuda_graph_scope or 'moe_router' not in self.cuda_graph_scope ), 'cuda_graph_scope must not contain both moe and moe_router.' - if CudaGraphScope.moe_preprocess in self.cuda_graph_scope: + if 'moe_preprocess' in self.cuda_graph_scope: assert ( - CudaGraphScope.moe_router in self.cuda_graph_scope + 'moe_router' in self.cuda_graph_scope ), 'moe_preprocess cuda graph is only supported with moe_router cuda graph.' if self.num_moe_experts is None or self.num_moe_experts <= 1: assert ( - CudaGraphScope.moe not in self.cuda_graph_scope - and CudaGraphScope.moe_router not in self.cuda_graph_scope + 'moe' not in self.cuda_graph_scope + and 'moe_router' not in self.cuda_graph_scope ), 'moe cuda graph is only supported for MoE.' else: if self.moe_layer_freq == 1 or ( isinstance(self.moe_layer_freq, list) and 0 not in self.moe_layer_freq ): - assert CudaGraphScope.mlp not in self.cuda_graph_scope, ( + assert 'mlp' not in self.cuda_graph_scope, ( 'mlp cuda graph is only supported for dense layers, ' 'but not found in the model.' ) @@ -1693,13 +1682,13 @@ def __post_init__(self): or not self.moe_pad_expert_input_to_capacity ): assert ( - CudaGraphScope.moe not in self.cuda_graph_scope + 'moe' not in self.cuda_graph_scope ), 'moe cuda graph is only supported with drop-padding MoE.' if self.moe_token_dispatcher_type == 'alltoall' and ( self.moe_expert_capacity_factor is not None or self.moe_router_padding_for_quantization ): - assert CudaGraphScope.moe_preprocess not in self.cuda_graph_scope, ( + assert 'moe_preprocess' not in self.cuda_graph_scope, ( 'moe_preprocess cuda graph is not supported when there are ' 'DtoH copies and synchronizations in the preprocess step.' ) @@ -1709,28 +1698,25 @@ def __post_init__(self): raise ValueError( "Full-layer CUDA graphs not supported with activation recomputation." ) - elif self.cuda_graph_scope != [CudaGraphScope.full_iteration]: + elif self.cuda_graph_scope != ['full_iteration']: # For scoped CUDA graphs, only the non-graphed parts of the layer can be # recomputed. So check if there are overlaps between the recomputed parts # and the graphed parts. - if CudaGraphScope.attn in self.cuda_graph_scope: + if "attn" in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['core_attn', 'mla_up_proj']: raise ValueError( f'attn cuda graph is not supported with {module} recompute.' ) - if ( - CudaGraphScope.mlp in self.cuda_graph_scope - and "mlp" in self.recompute_modules - ): + if "mlp" in self.cuda_graph_scope and "mlp" in self.recompute_modules: raise ValueError(f'mlp cuda graph is not supported with mlp recompute.') - if CudaGraphScope.moe in self.cuda_graph_scope: + if "moe" in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe_act', 'moe', 'shared_experts']: raise ValueError( f'moe cuda graph is not supported with {module} recompute.' ) - if CudaGraphScope.moe_router in self.cuda_graph_scope: + if "moe_router" in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe', 'shared_experts']: raise ValueError( @@ -1739,25 +1725,25 @@ def __post_init__(self): ) if "layernorm" in self.recompute_modules: if ( - CudaGraphScope.attn in self.cuda_graph_scope - and CudaGraphScope.mlp in self.cuda_graph_scope + "attn" in self.cuda_graph_scope + and "mlp" in self.cuda_graph_scope and ( - CudaGraphScope.moe in self.cuda_graph_scope - or CudaGraphScope.moe_router in self.cuda_graph_scope + "moe" in self.cuda_graph_scope + or "moe_router" in self.cuda_graph_scope ) ): raise ValueError( 'cuda graph is not supported with layernorm recompute.' ) - if CudaGraphScope.attn in self.cuda_graph_scope: + if "attn" in self.cuda_graph_scope: warnings.warn( "input_layernorm recompute is not supported with attention " "cudagraph. Will only recompute the pre_mlp_layernorm." ) if ( - CudaGraphScope.mlp in self.cuda_graph_scope - or CudaGraphScope.moe in self.cuda_graph_scope - or CudaGraphScope.moe_router in self.cuda_graph_scope + "mlp" in self.cuda_graph_scope + or "moe" in self.cuda_graph_scope + or "moe_router" in self.cuda_graph_scope ): warnings.warn( "pre_mlp_layernorm recompute is not supported with mlp/moe " diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 3ea40577009..f89678e6216 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -16,7 +16,7 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing -from megatron.core.transformer.enums import CudaGraphScope, LayerType +from megatron.core.transformer.enums import LayerType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp from megatron.core.transformer.mlp import MLP from megatron.core.transformer.module import GraphableMegatronModule @@ -382,21 +382,18 @@ def __init__( if "layernorm" in self.config.recompute_modules: if not isinstance(self.input_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or CudaGraphScope.attn not in self.config.cuda_graph_scope + or 'attn' not in self.config.cuda_graph_scope ): self.recompute_input_layernorm = True if self.config.fp8 or self.config.fp4: self.self_attention.set_for_recompute_input_layernorm() if not isinstance(self.pre_mlp_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or ( - not self.is_moe_layer - and CudaGraphScope.mlp not in self.config.cuda_graph_scope - ) + or (not self.is_moe_layer and 'mlp' not in self.config.cuda_graph_scope) or ( self.is_moe_layer - and CudaGraphScope.moe not in self.config.cuda_graph_scope - and CudaGraphScope.moe_router not in self.config.cuda_graph_scope + and 'moe' not in self.config.cuda_graph_scope + and 'moe_router' not in self.config.cuda_graph_scope ) ): self.recompute_pre_mlp_layernorm = True @@ -637,13 +634,12 @@ def _forward_mlp(self, hidden_states, inference_context=None): and self.config.cuda_graph_impl == "transformer_engine" and self.training and is_graph_capturing() - and CudaGraphScope.moe_router in self.config.cuda_graph_scope + and 'moe_router' in self.config.cuda_graph_scope ): assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) - nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: if self.config.fp8 or self.config.fp4: @@ -698,7 +694,6 @@ def _forward_post_mlp(self, mlp_output_with_bias, residual): Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. """ - from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, ) @@ -762,7 +757,7 @@ def get_layer_static_inputs(self, seq_length, micro_batch_size): static_inputs = super().get_layer_static_inputs(seq_length, micro_batch_size) if not isinstance(self.self_attention, IdentityOp) and ( - not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope + not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope ): slen_per_cp = seq_length // self.config.context_parallel_size static_inputs["attention_mask"] = ( @@ -781,18 +776,18 @@ def _get_submodules_under_cudagraphs(self): return super()._get_submodules_under_cudagraphs() submodules = [] - if CudaGraphScope.attn in self.config.cuda_graph_scope: + if 'attn' in self.config.cuda_graph_scope: submodules += [ self.input_layernorm, self.self_attention, self.pre_cross_attn_layernorm, self.cross_attention, ] - if (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or ( - self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope + if (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) or ( + self.is_moe_layer and 'moe' in self.config.cuda_graph_scope ): submodules += [self.pre_mlp_layernorm, self.mlp] - elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: + elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: submodules += [self.pre_mlp_layernorm, self.mlp.router] if ( self.config.moe_shared_expert_intermediate_size is not None @@ -810,7 +805,7 @@ def _te_cuda_graph_capture(self, *args, **kwargs): 2. If context is None, it cannot be returned as output. """ context = None - if not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope: + if not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) else: if len(args) > 0: @@ -820,12 +815,12 @@ def _te_cuda_graph_capture(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) + or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) or ( self.is_moe_layer and ( - CudaGraphScope.moe in self.config.cuda_graph_scope - or CudaGraphScope.moe_router in self.config.cuda_graph_scope + 'moe' in self.config.cuda_graph_scope + or 'moe_router' in self.config.cuda_graph_scope ) ) ): @@ -846,7 +841,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): Hence, `inference_context` and `packed_seq_params` are excluded from input list. """ context = None - if self.config.cuda_graph_scope and CudaGraphScope.attn not in self.config.cuda_graph_scope: + if self.config.cuda_graph_scope and 'attn' not in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) args = (hidden_states,) kwargs = {} @@ -866,13 +861,13 @@ def _te_cuda_graph_replay(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) - or (self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope) + or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) + or (self.is_moe_layer and 'moe' in self.config.cuda_graph_scope) ): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() - elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: + elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. shared_expert_output, routing_map, residual = None, None, None @@ -887,7 +882,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # Split cudagraph outputs into function outputs and attribute outputs, and # process them separately. Function outputs should have three tensors. func_output, attr_outputs = cuda_graph_output[:3], cuda_graph_output[3:] - if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope: + if 'moe_preprocess' in self.config.cuda_graph_scope: hidden_states, probs, residual = func_output valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs assert len(attr_outputs) == len( @@ -994,7 +989,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): (kwargs.get('inference_context') is not None) or (kwargs.get('inference_params') is not None) ) - and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope + and 'full_iteration' not in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 0cf2d006863..8be173c75a0 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -23,7 +23,7 @@ from megatron.core.rerun_state_machine import RerunStateMachine from megatron.core.transformer import MLATransformerConfig, TransformerConfig from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout -from megatron.core.transformer.enums import AttnBackend, CudaGraphScope +from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.heterogeneous.heterogeneous_config import ( HeterogeneousTransformerConfig, MLPConfig, @@ -772,7 +772,7 @@ def validate_args(args, defaults={}): if args.rank == 0: print('accumulate and all-reduce gradients in fp32 for ' 'bfloat16 data type.', flush=True) - if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: if not args.inference_dynamic_batching: assert not args.check_for_nan_in_loss_and_grad, \ "--no-check-for-nan-in-loss-and-grad should be set with full_iteration CUDA graph" @@ -1273,15 +1273,6 @@ def validate_args(args, defaults={}): assert ( args.recompute_granularity != 'full' ), 'recompute_granularity must not be full when CUDA Graphs are enabled.' - if args.cuda_graph_scope == "full" or ( - isinstance(args.cuda_graph_scope, list) and "full" in args.cuda_graph_scope - ): - if isinstance(args.cuda_graph_scope, list): - assert args.cuda_graph_scope == ["full"], "full scope cannot be used with other scopes." - args.cuda_graph_scope = [] - warn_rank_0( - 'full scope is deprecated. Use empty cuda_graph_scope to capture the whole layer.' - ) if args.multi_latent_attention: assert not args.group_query_attention, "Group query attention is mutually exclusive with multi latent attention." @@ -1503,7 +1494,7 @@ def _add_inference_args(parser): '"none": no CUDA graph. ' '"local": capture the CUDA graph using MCore local implementation. --cuda-graph-scope=\"full_iteration\" enables whole iteration CUDA graph. ' '"transformer_engine": capture the CUDA graph using TE make_graphed_callables().') - group.add_argument('--cuda-graph-scope', nargs='+', type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope, default=[], + group.add_argument('--cuda-graph-scope', nargs='+', type=str, default=[], help='Determines the CUDA graphs capturing scope. ' 'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". ' '"attn": captures operations in TransformerLayer._forward_attention(). ' @@ -1515,8 +1506,7 @@ def _add_inference_args(parser): '"mamba": captures the mamba layer. ' '"full_iteration": captures a whole iteration. ' 'full_iteration scope is only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. ' - 'If not specified, the default scope is to capture the whole Transformer layer. ' - 'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.') + 'If not specified, the default scope is to capture the whole Transformer layer.') group.add_argument('--use-legacy-static-engine', action='store_true', default=False, help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)', dest='use_legacy_static_engine') diff --git a/megatron/training/training.py b/megatron/training/training.py index e88b9839d28..5c9de623ce5 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -59,7 +59,6 @@ from megatron.training.checkpointing import checkpoint_exists from megatron.core.full_cuda_graph import FullCudaGraphWrapper from megatron.core.transformer.cuda_graphs import TECudaGraphHelper -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import Float16Module from megatron.core.distributed import DistributedDataParallelConfig, TorchFullyShardedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -2278,7 +2277,7 @@ def train( eval_iterations = 0 # Wrap forward_backward_func for Full iteration CUDA graph forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) def get_e2e_base_metrics(): @@ -2627,10 +2626,6 @@ def get_e2e_base_metrics(): if should_exit: break - # Destroy CUDA Graphs. - if args.cuda_graph_impl == "transformer_engine" and cuda_graph_helper.graphs_created(): - cuda_graph_helper.delete_cuda_graphs() - one_logger_utils.track_e2e_metrics() # Flush TensorBoard, WandB writers and one-logger. @@ -2704,7 +2699,7 @@ def evaluate( eval_batch_size = args.global_batch_size eval_num_microbatches = eval_batch_size // (args.micro_batch_size * args.data_parallel_size) forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) if eval_iters is None: diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 26d3dcfbd6d..0ac4b296746 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -3,7 +3,7 @@ import asyncio import random import types -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import pytest @@ -41,7 +41,6 @@ from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import ( check_mamba_sequence_packing_support, @@ -104,9 +103,7 @@ class DynamicEngineTestConfig: return_log_probs: bool = False materialize_only_last_token_logits: bool = True skip_prompt_log_probs: bool = False - cuda_graph_scope: List[CudaGraphScope] = field( - default_factory=lambda: [CudaGraphScope.full_iteration] - ) + cuda_graph_scope: List[str] = None force_build_cuda_graphs: bool = False # If False, do not build cuda graphs in the tests, even if # num_cuda_graphs is set. @@ -139,6 +136,9 @@ def __post_init__(self): if self.context_max_tokens_override is None: self.context_max_tokens_override = self.num_requests * self.max_sequence_length + if self.cuda_graph_scope is None: + self.cuda_graph_scope = ["full_iteration"] + @dataclass class DynamicEngineTestEnv: @@ -514,7 +514,7 @@ def teardown_method(self, method): ) @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4]) - @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration]]) + @pytest.mark.parametrize("cuda_graph_scope", [[], ["full_iteration"]]) def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None: """Simple test that runs without errors, and validates output.""" skip_if_mamba_sequence_packing_not_available(model_provider) diff --git a/tests/unit_tests/test_fp8_param.py b/tests/unit_tests/test_fp8_param.py index 361698f7127..0b8d41769ec 100644 --- a/tests/unit_tests/test_fp8_param.py +++ b/tests/unit_tests/test_fp8_param.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. import contextlib import gc @@ -36,10 +36,7 @@ try: from transformer_engine.pytorch.tensor.utils import post_all_gather_processing - if is_te_min_version("2.10.0"): - cuda_graph_supported = True - else: - reason_for_no_cuda_graph = "Need newer TransformerEngine" + cuda_graph_supported = True except ImportError: reason_for_no_cuda_graph = "Need newer TransformerEngine" @@ -68,16 +65,12 @@ class TestFP8Param: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 - self.cuda_graph_helper = None os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' def teardown_method(self, method): Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() - if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): - self.cuda_graph_helper.delete_cuda_graphs() - self.cuda_graph_helper = None gc.collect() def model_provider( @@ -216,12 +209,13 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. + cuda_graph_helper = None # Hard coded to use cuda_graph_impl="transformer_engine" cuda_graph_impl = "transformer_engine" if use_cuda_graph and cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - self.cuda_graph_helper = TECudaGraphHelper( + cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -256,13 +250,13 @@ def _run_test_helper( # Capture CUDA graphs after warmup if helper is provided. # Hard coded cuda_graph_warmup_steps = 0. cuda_graph_warmup_steps = 0 - if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: if should_disable_forward_pre_hook(args): disable_forward_pre_hook(gpt_model, param_sync=False) - self.cuda_graph_helper.create_cudagraphs() + cuda_graph_helper.create_cudagraphs() if should_disable_forward_pre_hook(args): enable_forward_pre_hook(gpt_model) - self.cuda_graph_helper.cuda_graph_set_manual_hooks() + cuda_graph_helper.cuda_graph_set_manual_hooks() # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap, # we need to call the _copy_main_params_to_param_buffer() after the grad buffer @@ -303,10 +297,6 @@ def _run_test_helper( loss_list.append(loss.item()) - if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): - self.cuda_graph_helper.delete_cuda_graphs() - self.cuda_graph_helper = None - return torch.tensor(loss_list) def run_test(self, tp_size, recipe, inference: bool = False, **kwargs): diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index cee75171560..3ad0262a1cf 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -9,7 +9,6 @@ import pytest import torch -from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state from megatron.core.enums import ModelType @@ -26,7 +25,6 @@ TextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import ( - get_gpt_decoder_block_spec, get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, get_gpt_mtp_block_spec, @@ -43,8 +41,6 @@ model_parallel_cuda_manual_seed, ) from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord -from megatron.core.transformer.enums import CudaGraphScope -from megatron.core.transformer.moe.fused_a2a import reset_hybrid_ep_buffer from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import is_fa_min_version, is_te_min_version @@ -58,8 +54,6 @@ from megatron.training.training import setup_model_and_optimizer from tests.unit_tests.test_utilities import Utils -fp8_available, _ = check_fp8_support() - class TestParallelTransformerBlockCudagraphs: def setup_method(self, method): @@ -753,9 +747,6 @@ class TestPartialCudaGraph: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 - self.tp_size = 2 - self.cp_size = 2 - self.cuda_graph_helper = None # Store original environment variable values self.original_env = { 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), @@ -771,28 +762,22 @@ def teardown_method(self, method): os.environ.pop(key, None) else: os.environ[key] = value + Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() - if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): - self.cuda_graph_helper.delete_cuda_graphs() - self.cuda_graph_helper = None gc.collect() def model_provider( self, pre_process=True, post_process=True, - layer_spec_fn=get_gpt_decoder_block_spec, + layer_spec_fn=get_gpt_layer_with_transformer_engine_spec, **config_kwargs, ): + model_parallel_cuda_manual_seed(123) args = get_args() config = core_transformer_config_from_args(args) - transformer_layer_spec = layer_spec_fn( - config, - use_transformer_engine=True, - normalization=args.normalization, - qk_l2_norm=args.qk_l2_norm, - ) + transformer_layer_spec = layer_spec_fn() if args.mtp_num_layers: mtp_block_spec = get_gpt_mtp_block_spec( config, transformer_layer_spec, use_transformer_engine=True @@ -825,17 +810,18 @@ def create_test_args( args.num_layers = 4 args.mtp_num_layers = 1 args.vocab_size = 1024 - args.hidden_size = 512 + args.hidden_size = 128 args.num_attention_heads = 8 args.max_position_embeddings = 512 - args.global_batch_size = self.micro_batch_size * 8 // self.tp_size // self.cp_size + args.global_batch_size = self.micro_batch_size * 8 args.micro_batch_size = self.micro_batch_size args.create_attention_mask_in_dataloader = True args.seq_length = self.seq_length - args.tensor_model_parallel_size = self.tp_size - args.sequence_parallel = True if self.tp_size > 1 else False + args.tensor_model_parallel_size = 2 + args.sequence_parallel = True args.pipeline_model_parallel_size = 1 - args.context_parallel_size = self.cp_size + args.context_parallel_size = 1 + args.expert_model_parallel_size = ep_size args.train_iters = 10 args.lr = 3e-5 args.bf16 = True @@ -850,26 +836,17 @@ def create_test_args( # MoE settings args.num_experts = 4 args.expert_model_parallel_size = ep_size - args.expert_tensor_parallel_size = 1 if ep_size > 1 else self.tp_size args.moe_shared_expert_intermediate_size = 1024 - args.moe_layer_freq = [0, 0, 1, 1] + args.moe_layer_freq = "[0,0,1,1]" args.moe_permute_fusion = True args.moe_router_fusion = True args.moe_router_topk = 2 - args.moe_router_dtype = "fp32" # CUDA graph settings args.cuda_graph_impl = cuda_graph_impl args.cuda_graph_scope = cuda_graph_scope args.cuda_graph_warmup_steps = cuda_graph_warmup_steps - - # fp8 settings - if fp8_available: - args.fp8 = "e4m3" - args.fp8_recipe = "tensorwise" - args.first_last_layers_bf16 = True - args.num_layers_at_start_in_bf16 = 1 - args.num_layers_at_end_in_bf16 = 1 + args.use_te_rng_tracker = cuda_graph_impl != "none" for key, value in kwargs.items(): assert hasattr(args, key) @@ -879,15 +856,15 @@ def create_test_args( set_global_variables(args, False) return args - def get_batch(self, seq_length, micro_batch_size, cp_size): - data = list(range(seq_length // cp_size)) + def get_batch(self, seq_length, micro_batch_size): + data = list(range(seq_length)) input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() attention_mask = torch.ones( - (micro_batch_size, 1, seq_length // cp_size, seq_length), dtype=bool + (micro_batch_size, 1, seq_length, seq_length), dtype=bool ).cuda() - loss_mask = torch.ones(seq_length // cp_size).repeat((micro_batch_size, 1)).cuda() + loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() return input_ids, labels, position_ids, attention_mask, loss_mask def _run_test_helper( @@ -900,10 +877,12 @@ def _run_test_helper( set_args(args) torch.manual_seed(123) - model_parallel_cuda_manual_seed(123) + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, expert_model_parallel_size=ep_size + ) input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( - self.seq_length, self.micro_batch_size, self.cp_size + self.seq_length, self.micro_batch_size ) gpt_model, optimizer, _ = setup_model_and_optimizer( @@ -911,10 +890,13 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. + loss_list = [] + + cuda_graph_helper = None if cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - self.cuda_graph_helper = TECudaGraphHelper( + cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -922,17 +904,14 @@ def _run_test_helper( optimizers=[optimizer], ) - loss_list = [] - for i in range(100): gpt_model[0].zero_grad_buffer() optimizer.zero_grad() # Capture CUDA graphs after warmup if helper is provided - if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: - self.cuda_graph_helper.create_cudagraphs() + if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + cuda_graph_helper.create_cudagraphs() - gpt_model[0].set_is_first_microbatch() output = gpt_model[0].forward( input_ids=input_ids, position_ids=position_ids, @@ -943,7 +922,7 @@ def _run_test_helper( # Check output shapes assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length // self.cp_size + assert output.shape[1] == self.seq_length # Verify gradients loss = output.mean() @@ -957,29 +936,16 @@ def _run_test_helper( loss_list.append(loss.item()) - if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): - self.cuda_graph_helper.delete_cuda_graphs() - self.cuda_graph_helper = None - return torch.tensor(loss_list) @pytest.mark.skipif( - not (HAVE_TE and is_te_min_version("2.10.0")), - reason="Partial CUDA graph UT support requires TransformerEngine version >= 2.10.0", + not (HAVE_TE and is_te_min_version("1.14.0")), + reason="Partial CUDA graph support requires TransformerEngine version >= 1.14.0", ) @pytest.mark.parametrize("ep_size", [1, 4]) @pytest.mark.parametrize("moe_dropless_dispatcher", [False, True]) @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep", "hybridep"]) def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispatcher_type): - initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) - Utils.initialize_model_parallel( - tensor_model_parallel_size=self.tp_size, - context_parallel_size=self.cp_size, - pipeline_model_parallel_size=1, - expert_tensor_parallel_size=1 if ep_size > 1 else self.tp_size, - expert_model_parallel_size=ep_size, - ) - extra_kwargs = {} if moe_dispatcher_type == "deepep": if not is_deep_ep_available(): @@ -996,28 +962,19 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa if not moe_dropless_dispatcher: if moe_dispatcher_type == "deepep": pytest.skip("Deep EP doesn't support drop&pad MoE") - if moe_dispatcher_type == "hybridep" and ep_size == 1: - pytest.skip("Hybrid EP doesn't support drop&pad MoE with ep_size == 1") extra_kwargs["moe_expert_capacity_factor"] = 1.0 extra_kwargs["moe_pad_expert_input_to_capacity"] = True loss_list_ref = self._run_test_helper(ep_size, "none", None, 0, **extra_kwargs) for cuda_graph_scope in [ None, - [CudaGraphScope.attn], - [CudaGraphScope.moe], - [CudaGraphScope.mlp, CudaGraphScope.moe_router], - [ - CudaGraphScope.attn, - CudaGraphScope.mlp, - CudaGraphScope.moe_router, - CudaGraphScope.moe_preprocess, - ], + ["attn"], + ["moe"], + ["mlp", "moe_router"], + ["attn", "mlp", "moe_router", "moe_preprocess"], ]: - if (moe_dropless_dispatcher or moe_dispatcher_type == "hybridep") and ( - cuda_graph_scope is None or CudaGraphScope.moe in cuda_graph_scope - ): - # Dropless MoE or Hybrid EP doesn't work with "moe" scope cudagraph. Skip. + if moe_dropless_dispatcher and (cuda_graph_scope is None or "moe" in cuda_graph_scope): + # Dropless MoE doesn't work with "moe" scope cudagraph. Skip. continue cuda_graph_warmup_steps = 3 loss_list = self._run_test_helper( @@ -1029,10 +986,6 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa ) assert torch.equal(loss_list, loss_list_ref) - if moe_dispatcher_type == "hybridep": - reset_hybrid_ep_buffer() - Utils.destroy_model_parallel() - if __name__ == "__main__": From fdcb0a400c9967eb2c8d6803c7dd4fbc8d3ab12c Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Tue, 2 Dec 2025 11:15:30 +0800 Subject: [PATCH 167/248] Replay "[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)" (#2447) --- .../text_generation_controller.py | 3 +- .../common/language_module/language_module.py | 5 +- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/pipeline_parallel/schedules.py | 7 +- megatron/core/safe_globals.py | 3 +- megatron/core/ssm/mamba_block.py | 3 +- megatron/core/transformer/attention.py | 4 +- megatron/core/transformer/cuda_graphs.py | 47 +++++-- megatron/core/transformer/enums.py | 12 ++ megatron/core/transformer/moe/fused_a2a.py | 8 ++ megatron/core/transformer/moe/moe_utils.py | 7 +- .../core/transformer/moe/token_dispatcher.py | 12 +- .../core/transformer/transformer_block.py | 4 +- .../core/transformer/transformer_config.py | 112 +++++++++-------- .../core/transformer/transformer_layer.py | 47 +++---- megatron/training/arguments.py | 18 ++- megatron/training/training.py | 9 +- .../inference/engines/test_dynamic_engine.py | 12 +- tests/unit_tests/test_fp8_param.py | 24 ++-- .../transformer/test_cuda_graphs.py | 117 ++++++++++++------ 20 files changed, 304 insertions(+), 154 deletions(-) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py index 2bda1425710..6e00f58ac23 100644 --- a/megatron/core/inference/text_generation_controllers/text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -29,6 +29,7 @@ ) from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import get_attention_mask, set_decode_expert_padding +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.moe_layer import BaseMoELayer from megatron.core.transformer.utils import set_model_to_sequence_parallel from megatron.core.utils import get_asyncio_loop, get_model_config, unwrap_model @@ -851,7 +852,7 @@ def generate_all_output_tokens_static_batch( # Check whether CUDA graphs are enabled enable_cuda_graph = ( model_config.cuda_graph_impl == "local" - and "full_iteration" not in model_config.cuda_graph_scope + and CudaGraphScope.full_iteration not in model_config.cuda_graph_scope ) # Pad batch tokens if necessary diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index de2ecfb8011..259bb716a93 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -21,7 +21,7 @@ is_vp_last_stage, ) from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group @@ -144,8 +144,7 @@ def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: # Use is_cg_capturable=True for full iteration CUDA graphs to avoid torch.equal checks is_cg_capturable = ( hasattr(self.config, 'cuda_graph_scope') - and self.config.cuda_graph_scope - and 'full_iteration' in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration in self.config.cuda_graph_scope ) if is_cg_capturable and not is_te_min_version("2.7.0"): from megatron.core.utils import get_te_version diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index ce1e8e76bd9..a3d1a8bfc00 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -24,7 +24,7 @@ from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none from megatron.core.tensor_parallel import gather_from_sequence_parallel_region -from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.enums import CudaGraphScope, ModelType from megatron.core.transformer.multi_token_prediction import ( MTPLossAutoScaler, MTPLossLoggingHelper, @@ -374,7 +374,7 @@ def _preprocess( and ( ( self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index d0b912349b4..18344429c45 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -21,6 +21,7 @@ ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import create_cudagraphs +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler from megatron.core.utils import ( drain_embedding_wgrad_compute, @@ -656,7 +657,7 @@ def forward_backward_no_pipelining( if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() @@ -1923,7 +1924,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() nvtx_range_pop(suffix="misc") @@ -2310,7 +2311,7 @@ def enable_grad_sync(): if ( hasattr(config, 'cuda_graph_impl') and config.cuda_graph_impl == "local" - and "full_iteration" not in config.cuda_graph_scope + and CudaGraphScope.full_iteration not in config.cuda_graph_scope ): create_cudagraphs() diff --git a/megatron/core/safe_globals.py b/megatron/core/safe_globals.py index d2baed2a4a0..41239c310b0 100755 --- a/megatron/core/safe_globals.py +++ b/megatron/core/safe_globals.py @@ -12,7 +12,7 @@ from megatron.core.enums import ModelType from megatron.core.rerun_state_machine import RerunDiagnostic, RerunMode, RerunState -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope SAFE_GLOBALS = [ SimpleNamespace, @@ -23,6 +23,7 @@ UInt32DType, Namespace, AttnBackend, + CudaGraphScope, ModelType, RerunDiagnostic, RerunMode, diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py index 1bcadd0af10..3201a8bfb28 100644 --- a/megatron/core/ssm/mamba_block.py +++ b/megatron/core/ssm/mamba_block.py @@ -25,6 +25,7 @@ from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module @@ -294,7 +295,7 @@ def forward( ( ( self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ) or self.config.flash_decode ) diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index f6f40027789..5cf22d25a4b 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -45,7 +45,7 @@ from ..models.common.embeddings.yarn_rotary_pos_embedding import ( _yarn_get_concentration_factor_from_config, ) -from .enums import AttnMaskType +from .enums import AttnMaskType, CudaGraphScope from .transformer_config import TransformerConfig try: @@ -829,7 +829,7 @@ def forward( if ( in_decode_mode and self.config.cuda_graph_impl == "local" - and "full_iteration" not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope and inference_context.is_static_batching() ): raise ValueError(f"CUDA graphs must use flash decode with static batching!") diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 12f15ee980a..5b0a0333d9e 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -21,6 +21,7 @@ get_all_rng_states, get_cuda_rng_tracker, ) +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.transformer_config import TransformerConfig @@ -1344,24 +1345,24 @@ def _layer_is_graphable(layer, config): from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_layer import TransformerLayer - if isinstance(layer, MambaLayer) and 'mamba' in config.cuda_graph_scope: + if isinstance(layer, MambaLayer) and CudaGraphScope.mamba in config.cuda_graph_scope: # mamba layer. return True if isinstance(layer, TransformerLayer): - if 'attn' in config.cuda_graph_scope and not ( + if CudaGraphScope.attn in config.cuda_graph_scope and not ( isinstance(layer.self_attention, IdentityOp) and isinstance(layer.cross_attention, IdentityOp) ): # attn layer. return True if ( - 'moe' in config.cuda_graph_scope - or 'moe_router' in config.cuda_graph_scope - or 'moe_preprocess' in config.cuda_graph_scope + CudaGraphScope.moe in config.cuda_graph_scope + or CudaGraphScope.moe_router in config.cuda_graph_scope + or CudaGraphScope.moe_preprocess in config.cuda_graph_scope ) and isinstance(layer.mlp, MoELayer): # moe layer. return True - if 'mlp' in config.cuda_graph_scope and isinstance(layer.mlp, MLP): + if CudaGraphScope.mlp in config.cuda_graph_scope and isinstance(layer.mlp, MLP): # mlp layer. return True return False @@ -1388,7 +1389,7 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using " "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True." ) - assert "full_iteration" not in config.cuda_graph_scope, ( + assert CudaGraphScope.full_iteration not in config.cuda_graph_scope, ( "full_iteration cuda graph is not supported for cuda_graph_impl=transformer_engine. " "Please use cuda_graph_impl=local instead." ) @@ -1529,7 +1530,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): and not isinstance(layer.self_attention, IdentityOp) and ( not self.config.cuda_graph_scope - or 'attn' in self.config.cuda_graph_scope + or CudaGraphScope.attn in self.config.cuda_graph_scope ) ) if is_te_min_version("1.10.0"): @@ -1712,3 +1713,33 @@ def cuda_graph_set_manual_hooks(self): model_chunk = self.model[chunk_number] for layer in layers: layer.setup_manual_hooks(model_chunk._make_forward_pre_hook) + + def delete_cuda_graphs(self): + """ + Delete all CUDA graphs. + """ + assert self._graphs_created, "CUDA Graphs have not been created." + + graph_resettable = is_te_min_version("2.10.0") + graphs_reset, graphs_not_reset = 0, 0 + for layers in self.callables_per_chunk: + for layer in layers: + for graph in layer.cuda_graphs: + if graph_resettable: + graph.reset() + graphs_reset += 1 + else: + graphs_not_reset += 1 + layer.cuda_graphs = [] + layer.cuda_graph_manual_hooks = [] + + log_on_each_pipeline_stage( + logger=logger, + tp_group=None, + dp_cp_group=None, + level=logging.INFO, + msg=f'Rank {torch.distributed.get_rank()}: ' + f'{graphs_reset} graphs deleted with explicit reset, ' + f'{graphs_not_reset} graphs deleted without explicit reset.', + ) + self._graphs_created = False diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py index 52b82029f90..d06d58d65f2 100644 --- a/megatron/core/transformer/enums.py +++ b/megatron/core/transformer/enums.py @@ -65,3 +65,15 @@ class AttnBackend(enum.Enum): unfused = 3 local = 4 auto = 5 + + +class CudaGraphScope(enum.Enum): + """Cuda Graph Scope - defines which parts of the model to capture.""" + + full_iteration = 1 # Captures the entire training/inference iteration + attn = 2 # Captures attention layers + mlp = 3 # Captures MLP layers (dense layers only) + moe = 4 # Captures MoE layers (drop-and-pad MoE layers only) + moe_router = 5 # Captures MoE router part + moe_preprocess = 6 # Captures MoE preprocessing part (requires moe_router) + mamba = 7 # Captures Mamba layers diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 60b0b11a32c..045a93039b3 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -320,6 +320,14 @@ def init_hybrid_ep_buffer( ) +def reset_hybrid_ep_buffer(): + ''' + Reset the HybridEP buffer + ''' + global _hybrid_ep_buffer + _hybrid_ep_buffer = None + + class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index d28cbfea3fe..3ed31d375e2 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -11,6 +11,7 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig try: @@ -1205,13 +1206,13 @@ def maybe_raise_signal(moe_layer, **kwargs): ): if ( step_condition == "route" - and 'moe_router' in moe_layer.config.cuda_graph_scope - and 'moe_preprocess' not in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_router in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_preprocess not in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "route", **kwargs) elif ( step_condition == "preprocess" - and 'moe_preprocess' in moe_layer.config.cuda_graph_scope + and CudaGraphScope.moe_preprocess in moe_layer.config.cuda_graph_scope ): raise MoECudaGraphPartialCaptureSignal(moe_layer, "preprocess", **kwargs) diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index b2135fdb00d..af8ae572adb 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -16,6 +16,7 @@ gather_from_sequence_parallel_region, reduce_scatter_to_sequence_parallel_region, ) +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.fused_a2a import ( fused_combine, fused_dispatch, @@ -436,7 +437,7 @@ def __init__( } if ( config.cuda_graph_impl == "transformer_engine" - and 'moe_preprocess' in config.cuda_graph_scope + and CudaGraphScope.moe_preprocess in config.cuda_graph_scope ): self.cuda_dtoh_point = "before_ep_alltoall" else: @@ -1075,10 +1076,13 @@ def combine( num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) - # Release the used handle/num_permuted_tokens which could change in each iteration + # Release the used handle/num_permuted_tokens which could change in each iteration. + # For drop_and_pad mode, we don't need to reset the num_permuted_tokens and + # num_dispatched_tokens, because their values never change. self.handle = None - self.num_permuted_tokens = None - self.num_dispatched_tokens = None + if not self.drop_and_pad: + self.num_permuted_tokens = None + self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 6f69927e9e8..023db1fe75a 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -21,7 +21,7 @@ ) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import LayerType +from megatron.core.transformer.enums import CudaGraphScope, LayerType from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig @@ -555,7 +555,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): kwargs.get('inference_context') is not None or kwargs.get('inference_params') is not None ) - and 'full_iteration' in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 656699ea2a2..a3a16754977 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -9,7 +9,7 @@ from megatron.core.enums import Fp4Recipe, Fp8Recipe from megatron.core.quantization.quant_config import RecipeConfig -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout from ..fusions.fused_bias_geglu import quick_gelu @@ -733,7 +733,7 @@ class TransformerConfig(ModelParallelConfig): excluding optimizer) is enabled. "transformer_engine": capture the CUDA graph using TE make_graphed_callables().""" - cuda_graph_scope: Optional[List[str]] = None + cuda_graph_scope: Optional[List[CudaGraphScope]] = None """Determines the CUDA graphs capturing scope. When cuda_graph_impl is set to "transformer_engine", valid values are "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba". None means the full layer. @@ -1615,65 +1615,76 @@ def __post_init__(self): 'use cuda_graph_impl=transformer_engine instead.' ) self.cuda_graph_impl = "transformer_engine" + if self.cuda_graph_scope is None: self.cuda_graph_scope = [] + elif not isinstance(self.cuda_graph_scope, list): + if isinstance(self.cuda_graph_scope, CudaGraphScope): + self.cuda_graph_scope = [self.cuda_graph_scope] + else: + assert isinstance(self.cuda_graph_scope, str), ( + "cuda_graph_scope must be a string that can be converted to a list of " + f"CudaGraphScope, got {self.cuda_graph_scope}." + ) + self.cuda_graph_scope = self.cuda_graph_scope.split(',') + if all(isinstance(scope, str) for scope in self.cuda_graph_scope): + # Backward compatibility for "full" scope. Now we use an empty list instead. + if "full" in self.cuda_graph_scope: + assert self.cuda_graph_scope == [ + "full" + ], "full scope cannot be used with other scopes." + warnings.warn( + "full scope is deprecated. " + "Use empty cuda_graph_scope to capture the whole layer." + ) + self.cuda_graph_scope = [] + else: + self.cuda_graph_scope = [CudaGraphScope[scope] for scope in self.cuda_graph_scope] + assert all( + isinstance(scope, CudaGraphScope) for scope in self.cuda_graph_scope + ), f"cuda_graph_scope must be a list of CudaGraphScope, got {self.cuda_graph_scope}." + if self.cuda_graph_impl != "none": assert self.cuda_graph_impl in [ "transformer_engine", "local", ], f"Invalid cuda graph implementation: {self.cuda_graph_impl}" + if self.cpu_offloading: raise ValueError("CUDA graphs not supported with CPU offloading.") - elif not isinstance(self.cuda_graph_scope, list): - assert isinstance(self.cuda_graph_scope, str), ( - "cuda_graph_scope must be a string or a list of strings, " - f"got {self.cuda_graph_scope}." - ) - self.cuda_graph_scope = [self.cuda_graph_scope] - if self.cuda_graph_impl == "local": - assert not self.cuda_graph_scope or self.cuda_graph_scope == ["full_iteration"], ( - "For local cuda graph implementation, the only valid value " - "for cuda_graph_scope is full_iteration. " - "To use other scopes, use cuda_graph_impl=transformer_engine." + assert not self.cuda_graph_scope or self.cuda_graph_scope == [ + CudaGraphScope.full_iteration + ], ( + "For local cuda graph implementation, the only valid value for " + "cuda_graph_scope is full_iteration, or an empty list to denote layerwise " + "graphs. To use other scopes, use cuda_graph_impl=transformer_engine." ) if self.cuda_graph_impl == "transformer_engine": - assert "full_iteration" not in self.cuda_graph_scope, ( + assert CudaGraphScope.full_iteration not in self.cuda_graph_scope, ( "To use full iteration cuda graph, please use " - "cuda_graph_impl=transformer_engine instead of cuda_graph_impl=local." + "cuda_graph_impl=local instead of cuda_graph_impl=transformer_engine." ) - for scope in self.cuda_graph_scope: - assert scope in [ - 'attn', - 'mlp', - 'moe', - 'moe_router', - 'moe_preprocess', - 'mamba', - ], ( - "--cuda-graph-scope should be attn, mlp, moe, moe_router, moe_preprocess, " - f"or mamba, got {self.cuda_graph_scope}." - ) - assert ( - 'moe' not in self.cuda_graph_scope or 'moe_router' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope + or CudaGraphScope.moe_router not in self.cuda_graph_scope ), 'cuda_graph_scope must not contain both moe and moe_router.' - if 'moe_preprocess' in self.cuda_graph_scope: + if CudaGraphScope.moe_preprocess in self.cuda_graph_scope: assert ( - 'moe_router' in self.cuda_graph_scope + CudaGraphScope.moe_router in self.cuda_graph_scope ), 'moe_preprocess cuda graph is only supported with moe_router cuda graph.' if self.num_moe_experts is None or self.num_moe_experts <= 1: assert ( - 'moe' not in self.cuda_graph_scope - and 'moe_router' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope + and CudaGraphScope.moe_router not in self.cuda_graph_scope ), 'moe cuda graph is only supported for MoE.' else: if self.moe_layer_freq == 1 or ( isinstance(self.moe_layer_freq, list) and 0 not in self.moe_layer_freq ): - assert 'mlp' not in self.cuda_graph_scope, ( + assert CudaGraphScope.mlp not in self.cuda_graph_scope, ( 'mlp cuda graph is only supported for dense layers, ' 'but not found in the model.' ) @@ -1682,13 +1693,13 @@ def __post_init__(self): or not self.moe_pad_expert_input_to_capacity ): assert ( - 'moe' not in self.cuda_graph_scope + CudaGraphScope.moe not in self.cuda_graph_scope ), 'moe cuda graph is only supported with drop-padding MoE.' if self.moe_token_dispatcher_type == 'alltoall' and ( self.moe_expert_capacity_factor is not None or self.moe_router_padding_for_quantization ): - assert 'moe_preprocess' not in self.cuda_graph_scope, ( + assert CudaGraphScope.moe_preprocess not in self.cuda_graph_scope, ( 'moe_preprocess cuda graph is not supported when there are ' 'DtoH copies and synchronizations in the preprocess step.' ) @@ -1698,25 +1709,28 @@ def __post_init__(self): raise ValueError( "Full-layer CUDA graphs not supported with activation recomputation." ) - elif self.cuda_graph_scope != ['full_iteration']: + elif self.cuda_graph_scope != [CudaGraphScope.full_iteration]: # For scoped CUDA graphs, only the non-graphed parts of the layer can be # recomputed. So check if there are overlaps between the recomputed parts # and the graphed parts. - if "attn" in self.cuda_graph_scope: + if CudaGraphScope.attn in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['core_attn', 'mla_up_proj']: raise ValueError( f'attn cuda graph is not supported with {module} recompute.' ) - if "mlp" in self.cuda_graph_scope and "mlp" in self.recompute_modules: + if ( + CudaGraphScope.mlp in self.cuda_graph_scope + and "mlp" in self.recompute_modules + ): raise ValueError(f'mlp cuda graph is not supported with mlp recompute.') - if "moe" in self.cuda_graph_scope: + if CudaGraphScope.moe in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe_act', 'moe', 'shared_experts']: raise ValueError( f'moe cuda graph is not supported with {module} recompute.' ) - if "moe_router" in self.cuda_graph_scope: + if CudaGraphScope.moe_router in self.cuda_graph_scope: for module in self.recompute_modules: if module in ['moe', 'shared_experts']: raise ValueError( @@ -1725,25 +1739,25 @@ def __post_init__(self): ) if "layernorm" in self.recompute_modules: if ( - "attn" in self.cuda_graph_scope - and "mlp" in self.cuda_graph_scope + CudaGraphScope.attn in self.cuda_graph_scope + and CudaGraphScope.mlp in self.cuda_graph_scope and ( - "moe" in self.cuda_graph_scope - or "moe_router" in self.cuda_graph_scope + CudaGraphScope.moe in self.cuda_graph_scope + or CudaGraphScope.moe_router in self.cuda_graph_scope ) ): raise ValueError( 'cuda graph is not supported with layernorm recompute.' ) - if "attn" in self.cuda_graph_scope: + if CudaGraphScope.attn in self.cuda_graph_scope: warnings.warn( "input_layernorm recompute is not supported with attention " "cudagraph. Will only recompute the pre_mlp_layernorm." ) if ( - "mlp" in self.cuda_graph_scope - or "moe" in self.cuda_graph_scope - or "moe_router" in self.cuda_graph_scope + CudaGraphScope.mlp in self.cuda_graph_scope + or CudaGraphScope.moe in self.cuda_graph_scope + or CudaGraphScope.moe_router in self.cuda_graph_scope ): warnings.warn( "pre_mlp_layernorm recompute is not supported with mlp/moe " diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index f89678e6216..3ea40577009 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -16,7 +16,7 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.cuda_graphs import is_graph_capturing -from megatron.core.transformer.enums import LayerType +from megatron.core.transformer.enums import CudaGraphScope, LayerType from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp from megatron.core.transformer.mlp import MLP from megatron.core.transformer.module import GraphableMegatronModule @@ -382,18 +382,21 @@ def __init__( if "layernorm" in self.config.recompute_modules: if not isinstance(self.input_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or 'attn' not in self.config.cuda_graph_scope + or CudaGraphScope.attn not in self.config.cuda_graph_scope ): self.recompute_input_layernorm = True if self.config.fp8 or self.config.fp4: self.self_attention.set_for_recompute_input_layernorm() if not isinstance(self.pre_mlp_layernorm, IdentityOp) and ( self.config.cuda_graph_impl == "none" - or (not self.is_moe_layer and 'mlp' not in self.config.cuda_graph_scope) + or ( + not self.is_moe_layer + and CudaGraphScope.mlp not in self.config.cuda_graph_scope + ) or ( self.is_moe_layer - and 'moe' not in self.config.cuda_graph_scope - and 'moe_router' not in self.config.cuda_graph_scope + and CudaGraphScope.moe not in self.config.cuda_graph_scope + and CudaGraphScope.moe_router not in self.config.cuda_graph_scope ) ): self.recompute_pre_mlp_layernorm = True @@ -634,12 +637,13 @@ def _forward_mlp(self, hidden_states, inference_context=None): and self.config.cuda_graph_impl == "transformer_engine" and self.training and is_graph_capturing() - and 'moe_router' in self.config.cuda_graph_scope + and CudaGraphScope.moe_router in self.config.cuda_graph_scope ): assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) + nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: if self.config.fp8 or self.config.fp4: @@ -694,6 +698,7 @@ def _forward_post_mlp(self, mlp_output_with_bias, residual): Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. """ + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, ) @@ -757,7 +762,7 @@ def get_layer_static_inputs(self, seq_length, micro_batch_size): static_inputs = super().get_layer_static_inputs(seq_length, micro_batch_size) if not isinstance(self.self_attention, IdentityOp) and ( - not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope + not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope ): slen_per_cp = seq_length // self.config.context_parallel_size static_inputs["attention_mask"] = ( @@ -776,18 +781,18 @@ def _get_submodules_under_cudagraphs(self): return super()._get_submodules_under_cudagraphs() submodules = [] - if 'attn' in self.config.cuda_graph_scope: + if CudaGraphScope.attn in self.config.cuda_graph_scope: submodules += [ self.input_layernorm, self.self_attention, self.pre_cross_attn_layernorm, self.cross_attention, ] - if (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) or ( - self.is_moe_layer and 'moe' in self.config.cuda_graph_scope + if (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or ( + self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope ): submodules += [self.pre_mlp_layernorm, self.mlp] - elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: + elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: submodules += [self.pre_mlp_layernorm, self.mlp.router] if ( self.config.moe_shared_expert_intermediate_size is not None @@ -805,7 +810,7 @@ def _te_cuda_graph_capture(self, *args, **kwargs): 2. If context is None, it cannot be returned as output. """ context = None - if not self.config.cuda_graph_scope or 'attn' in self.config.cuda_graph_scope: + if not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) else: if len(args) > 0: @@ -815,12 +820,12 @@ def _te_cuda_graph_capture(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) + or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or ( self.is_moe_layer and ( - 'moe' in self.config.cuda_graph_scope - or 'moe_router' in self.config.cuda_graph_scope + CudaGraphScope.moe in self.config.cuda_graph_scope + or CudaGraphScope.moe_router in self.config.cuda_graph_scope ) ) ): @@ -841,7 +846,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): Hence, `inference_context` and `packed_seq_params` are excluded from input list. """ context = None - if self.config.cuda_graph_scope and 'attn' not in self.config.cuda_graph_scope: + if self.config.cuda_graph_scope and CudaGraphScope.attn not in self.config.cuda_graph_scope: hidden_states, context = self._forward_attention(*args, **kwargs) args = (hidden_states,) kwargs = {} @@ -861,13 +866,13 @@ def _te_cuda_graph_replay(self, *args, **kwargs): if ( not self.config.cuda_graph_scope - or (not self.is_moe_layer and 'mlp' in self.config.cuda_graph_scope) - or (self.is_moe_layer and 'moe' in self.config.cuda_graph_scope) + or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) + or (self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope) ): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() - elif self.is_moe_layer and 'moe_router' in self.config.cuda_graph_scope: + elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. shared_expert_output, routing_map, residual = None, None, None @@ -882,7 +887,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # Split cudagraph outputs into function outputs and attribute outputs, and # process them separately. Function outputs should have three tensors. func_output, attr_outputs = cuda_graph_output[:3], cuda_graph_output[3:] - if 'moe_preprocess' in self.config.cuda_graph_scope: + if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope: hidden_states, probs, residual = func_output valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs assert len(attr_outputs) == len( @@ -989,7 +994,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs): (kwargs.get('inference_context') is not None) or (kwargs.get('inference_params') is not None) ) - and 'full_iteration' not in self.config.cuda_graph_scope + and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope ): if kwargs['inference_context'].is_static_batching(): using_cuda_graph = kwargs['inference_context'].is_decode_only() diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 8be173c75a0..0cf2d006863 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -23,7 +23,7 @@ from megatron.core.rerun_state_machine import RerunStateMachine from megatron.core.transformer import MLATransformerConfig, TransformerConfig from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout -from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.heterogeneous.heterogeneous_config import ( HeterogeneousTransformerConfig, MLPConfig, @@ -772,7 +772,7 @@ def validate_args(args, defaults={}): if args.rank == 0: print('accumulate and all-reduce gradients in fp32 for ' 'bfloat16 data type.', flush=True) - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: if not args.inference_dynamic_batching: assert not args.check_for_nan_in_loss_and_grad, \ "--no-check-for-nan-in-loss-and-grad should be set with full_iteration CUDA graph" @@ -1273,6 +1273,15 @@ def validate_args(args, defaults={}): assert ( args.recompute_granularity != 'full' ), 'recompute_granularity must not be full when CUDA Graphs are enabled.' + if args.cuda_graph_scope == "full" or ( + isinstance(args.cuda_graph_scope, list) and "full" in args.cuda_graph_scope + ): + if isinstance(args.cuda_graph_scope, list): + assert args.cuda_graph_scope == ["full"], "full scope cannot be used with other scopes." + args.cuda_graph_scope = [] + warn_rank_0( + 'full scope is deprecated. Use empty cuda_graph_scope to capture the whole layer.' + ) if args.multi_latent_attention: assert not args.group_query_attention, "Group query attention is mutually exclusive with multi latent attention." @@ -1494,7 +1503,7 @@ def _add_inference_args(parser): '"none": no CUDA graph. ' '"local": capture the CUDA graph using MCore local implementation. --cuda-graph-scope=\"full_iteration\" enables whole iteration CUDA graph. ' '"transformer_engine": capture the CUDA graph using TE make_graphed_callables().') - group.add_argument('--cuda-graph-scope', nargs='+', type=str, default=[], + group.add_argument('--cuda-graph-scope', nargs='+', type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope, default=[], help='Determines the CUDA graphs capturing scope. ' 'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". ' '"attn": captures operations in TransformerLayer._forward_attention(). ' @@ -1506,7 +1515,8 @@ def _add_inference_args(parser): '"mamba": captures the mamba layer. ' '"full_iteration": captures a whole iteration. ' 'full_iteration scope is only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. ' - 'If not specified, the default scope is to capture the whole Transformer layer.') + 'If not specified, the default scope is to capture the whole Transformer layer. ' + 'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.') group.add_argument('--use-legacy-static-engine', action='store_true', default=False, help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)', dest='use_legacy_static_engine') diff --git a/megatron/training/training.py b/megatron/training/training.py index 5c9de623ce5..e88b9839d28 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -59,6 +59,7 @@ from megatron.training.checkpointing import checkpoint_exists from megatron.core.full_cuda_graph import FullCudaGraphWrapper from megatron.core.transformer.cuda_graphs import TECudaGraphHelper +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import Float16Module from megatron.core.distributed import DistributedDataParallelConfig, TorchFullyShardedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -2277,7 +2278,7 @@ def train( eval_iterations = 0 # Wrap forward_backward_func for Full iteration CUDA graph forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) def get_e2e_base_metrics(): @@ -2626,6 +2627,10 @@ def get_e2e_base_metrics(): if should_exit: break + # Destroy CUDA Graphs. + if args.cuda_graph_impl == "transformer_engine" and cuda_graph_helper.graphs_created(): + cuda_graph_helper.delete_cuda_graphs() + one_logger_utils.track_e2e_metrics() # Flush TensorBoard, WandB writers and one-logger. @@ -2699,7 +2704,7 @@ def evaluate( eval_batch_size = args.global_batch_size eval_num_microbatches = eval_batch_size // (args.micro_batch_size * args.data_parallel_size) forward_backward_func = get_forward_backward_func() - if args.cuda_graph_impl == "local" and "full_iteration" in args.cuda_graph_scope: + if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope: forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps) if eval_iters is None: diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py index 0ac4b296746..26d3dcfbd6d 100644 --- a/tests/unit_tests/inference/engines/test_dynamic_engine.py +++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py @@ -3,7 +3,7 @@ import asyncio import random import types -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import pytest @@ -41,6 +41,7 @@ from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import ( check_mamba_sequence_packing_support, @@ -103,7 +104,9 @@ class DynamicEngineTestConfig: return_log_probs: bool = False materialize_only_last_token_logits: bool = True skip_prompt_log_probs: bool = False - cuda_graph_scope: List[str] = None + cuda_graph_scope: List[CudaGraphScope] = field( + default_factory=lambda: [CudaGraphScope.full_iteration] + ) force_build_cuda_graphs: bool = False # If False, do not build cuda graphs in the tests, even if # num_cuda_graphs is set. @@ -136,9 +139,6 @@ def __post_init__(self): if self.context_max_tokens_override is None: self.context_max_tokens_override = self.num_requests * self.max_sequence_length - if self.cuda_graph_scope is None: - self.cuda_graph_scope = ["full_iteration"] - @dataclass class DynamicEngineTestEnv: @@ -514,7 +514,7 @@ def teardown_method(self, method): ) @pytest.mark.parametrize("model_provider", ["gpt", "mamba"]) @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4]) - @pytest.mark.parametrize("cuda_graph_scope", [[], ["full_iteration"]]) + @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration]]) def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None: """Simple test that runs without errors, and validates output.""" skip_if_mamba_sequence_packing_not_available(model_provider) diff --git a/tests/unit_tests/test_fp8_param.py b/tests/unit_tests/test_fp8_param.py index 0b8d41769ec..361698f7127 100644 --- a/tests/unit_tests/test_fp8_param.py +++ b/tests/unit_tests/test_fp8_param.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import contextlib import gc @@ -36,7 +36,10 @@ try: from transformer_engine.pytorch.tensor.utils import post_all_gather_processing - cuda_graph_supported = True + if is_te_min_version("2.10.0"): + cuda_graph_supported = True + else: + reason_for_no_cuda_graph = "Need newer TransformerEngine" except ImportError: reason_for_no_cuda_graph = "Need newer TransformerEngine" @@ -65,12 +68,16 @@ class TestFP8Param: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 + self.cuda_graph_helper = None os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' def teardown_method(self, method): Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None gc.collect() def model_provider( @@ -209,13 +216,12 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. - cuda_graph_helper = None # Hard coded to use cuda_graph_impl="transformer_engine" cuda_graph_impl = "transformer_engine" if use_cuda_graph and cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - cuda_graph_helper = TECudaGraphHelper( + self.cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -250,13 +256,13 @@ def _run_test_helper( # Capture CUDA graphs after warmup if helper is provided. # Hard coded cuda_graph_warmup_steps = 0. cuda_graph_warmup_steps = 0 - if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: if should_disable_forward_pre_hook(args): disable_forward_pre_hook(gpt_model, param_sync=False) - cuda_graph_helper.create_cudagraphs() + self.cuda_graph_helper.create_cudagraphs() if should_disable_forward_pre_hook(args): enable_forward_pre_hook(gpt_model) - cuda_graph_helper.cuda_graph_set_manual_hooks() + self.cuda_graph_helper.cuda_graph_set_manual_hooks() # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap, # we need to call the _copy_main_params_to_param_buffer() after the grad buffer @@ -297,6 +303,10 @@ def _run_test_helper( loss_list.append(loss.item()) + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + return torch.tensor(loss_list) def run_test(self, tp_size, recipe, inference: bool = False, **kwargs): diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index 3ad0262a1cf..cee75171560 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -9,6 +9,7 @@ import pytest import torch +from transformer_engine.pytorch.fp8 import check_fp8_support from megatron.core import parallel_state from megatron.core.enums import ModelType @@ -25,6 +26,7 @@ TextGenerationController, ) from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, get_gpt_mtp_block_spec, @@ -41,6 +43,8 @@ model_parallel_cuda_manual_seed, ) from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord +from megatron.core.transformer.enums import CudaGraphScope +from megatron.core.transformer.moe.fused_a2a import reset_hybrid_ep_buffer from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import is_fa_min_version, is_te_min_version @@ -54,6 +58,8 @@ from megatron.training.training import setup_model_and_optimizer from tests.unit_tests.test_utilities import Utils +fp8_available, _ = check_fp8_support() + class TestParallelTransformerBlockCudagraphs: def setup_method(self, method): @@ -747,6 +753,9 @@ class TestPartialCudaGraph: def setup_method(self, method): self.seq_length = 512 self.micro_batch_size = 2 + self.tp_size = 2 + self.cp_size = 2 + self.cuda_graph_helper = None # Store original environment variable values self.original_env = { 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), @@ -762,22 +771,28 @@ def teardown_method(self, method): os.environ.pop(key, None) else: os.environ[key] = value - Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None gc.collect() def model_provider( self, pre_process=True, post_process=True, - layer_spec_fn=get_gpt_layer_with_transformer_engine_spec, + layer_spec_fn=get_gpt_decoder_block_spec, **config_kwargs, ): - model_parallel_cuda_manual_seed(123) args = get_args() config = core_transformer_config_from_args(args) - transformer_layer_spec = layer_spec_fn() + transformer_layer_spec = layer_spec_fn( + config, + use_transformer_engine=True, + normalization=args.normalization, + qk_l2_norm=args.qk_l2_norm, + ) if args.mtp_num_layers: mtp_block_spec = get_gpt_mtp_block_spec( config, transformer_layer_spec, use_transformer_engine=True @@ -810,18 +825,17 @@ def create_test_args( args.num_layers = 4 args.mtp_num_layers = 1 args.vocab_size = 1024 - args.hidden_size = 128 + args.hidden_size = 512 args.num_attention_heads = 8 args.max_position_embeddings = 512 - args.global_batch_size = self.micro_batch_size * 8 + args.global_batch_size = self.micro_batch_size * 8 // self.tp_size // self.cp_size args.micro_batch_size = self.micro_batch_size args.create_attention_mask_in_dataloader = True args.seq_length = self.seq_length - args.tensor_model_parallel_size = 2 - args.sequence_parallel = True + args.tensor_model_parallel_size = self.tp_size + args.sequence_parallel = True if self.tp_size > 1 else False args.pipeline_model_parallel_size = 1 - args.context_parallel_size = 1 - args.expert_model_parallel_size = ep_size + args.context_parallel_size = self.cp_size args.train_iters = 10 args.lr = 3e-5 args.bf16 = True @@ -836,17 +850,26 @@ def create_test_args( # MoE settings args.num_experts = 4 args.expert_model_parallel_size = ep_size + args.expert_tensor_parallel_size = 1 if ep_size > 1 else self.tp_size args.moe_shared_expert_intermediate_size = 1024 - args.moe_layer_freq = "[0,0,1,1]" + args.moe_layer_freq = [0, 0, 1, 1] args.moe_permute_fusion = True args.moe_router_fusion = True args.moe_router_topk = 2 + args.moe_router_dtype = "fp32" # CUDA graph settings args.cuda_graph_impl = cuda_graph_impl args.cuda_graph_scope = cuda_graph_scope args.cuda_graph_warmup_steps = cuda_graph_warmup_steps - args.use_te_rng_tracker = cuda_graph_impl != "none" + + # fp8 settings + if fp8_available: + args.fp8 = "e4m3" + args.fp8_recipe = "tensorwise" + args.first_last_layers_bf16 = True + args.num_layers_at_start_in_bf16 = 1 + args.num_layers_at_end_in_bf16 = 1 for key, value in kwargs.items(): assert hasattr(args, key) @@ -856,15 +879,15 @@ def create_test_args( set_global_variables(args, False) return args - def get_batch(self, seq_length, micro_batch_size): - data = list(range(seq_length)) + def get_batch(self, seq_length, micro_batch_size, cp_size): + data = list(range(seq_length // cp_size)) input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() attention_mask = torch.ones( - (micro_batch_size, 1, seq_length, seq_length), dtype=bool + (micro_batch_size, 1, seq_length // cp_size, seq_length), dtype=bool ).cuda() - loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + loss_mask = torch.ones(seq_length // cp_size).repeat((micro_batch_size, 1)).cuda() return input_ids, labels, position_ids, attention_mask, loss_mask def _run_test_helper( @@ -877,12 +900,10 @@ def _run_test_helper( set_args(args) torch.manual_seed(123) - Utils.initialize_model_parallel( - tensor_model_parallel_size=2, expert_model_parallel_size=ep_size - ) + model_parallel_cuda_manual_seed(123) input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( - self.seq_length, self.micro_batch_size + self.seq_length, self.micro_batch_size, self.cp_size ) gpt_model, optimizer, _ = setup_model_and_optimizer( @@ -890,13 +911,10 @@ def _run_test_helper( ) assert len(gpt_model) == 1 # Assume only one model in the model provider. - loss_list = [] - - cuda_graph_helper = None if cuda_graph_impl == "transformer_engine": from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - cuda_graph_helper = TECudaGraphHelper( + self.cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, seq_length=self.seq_length, @@ -904,14 +922,17 @@ def _run_test_helper( optimizers=[optimizer], ) + loss_list = [] + for i in range(100): gpt_model[0].zero_grad_buffer() optimizer.zero_grad() # Capture CUDA graphs after warmup if helper is provided - if cuda_graph_helper is not None and i == cuda_graph_warmup_steps: - cuda_graph_helper.create_cudagraphs() + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + gpt_model[0].set_is_first_microbatch() output = gpt_model[0].forward( input_ids=input_ids, position_ids=position_ids, @@ -922,7 +943,7 @@ def _run_test_helper( # Check output shapes assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length + assert output.shape[1] == self.seq_length // self.cp_size # Verify gradients loss = output.mean() @@ -936,16 +957,29 @@ def _run_test_helper( loss_list.append(loss.item()) + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + return torch.tensor(loss_list) @pytest.mark.skipif( - not (HAVE_TE and is_te_min_version("1.14.0")), - reason="Partial CUDA graph support requires TransformerEngine version >= 1.14.0", + not (HAVE_TE and is_te_min_version("2.10.0")), + reason="Partial CUDA graph UT support requires TransformerEngine version >= 2.10.0", ) @pytest.mark.parametrize("ep_size", [1, 4]) @pytest.mark.parametrize("moe_dropless_dispatcher", [False, True]) @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep", "hybridep"]) def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispatcher_type): + initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) + Utils.initialize_model_parallel( + tensor_model_parallel_size=self.tp_size, + context_parallel_size=self.cp_size, + pipeline_model_parallel_size=1, + expert_tensor_parallel_size=1 if ep_size > 1 else self.tp_size, + expert_model_parallel_size=ep_size, + ) + extra_kwargs = {} if moe_dispatcher_type == "deepep": if not is_deep_ep_available(): @@ -962,19 +996,28 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa if not moe_dropless_dispatcher: if moe_dispatcher_type == "deepep": pytest.skip("Deep EP doesn't support drop&pad MoE") + if moe_dispatcher_type == "hybridep" and ep_size == 1: + pytest.skip("Hybrid EP doesn't support drop&pad MoE with ep_size == 1") extra_kwargs["moe_expert_capacity_factor"] = 1.0 extra_kwargs["moe_pad_expert_input_to_capacity"] = True loss_list_ref = self._run_test_helper(ep_size, "none", None, 0, **extra_kwargs) for cuda_graph_scope in [ None, - ["attn"], - ["moe"], - ["mlp", "moe_router"], - ["attn", "mlp", "moe_router", "moe_preprocess"], + [CudaGraphScope.attn], + [CudaGraphScope.moe], + [CudaGraphScope.mlp, CudaGraphScope.moe_router], + [ + CudaGraphScope.attn, + CudaGraphScope.mlp, + CudaGraphScope.moe_router, + CudaGraphScope.moe_preprocess, + ], ]: - if moe_dropless_dispatcher and (cuda_graph_scope is None or "moe" in cuda_graph_scope): - # Dropless MoE doesn't work with "moe" scope cudagraph. Skip. + if (moe_dropless_dispatcher or moe_dispatcher_type == "hybridep") and ( + cuda_graph_scope is None or CudaGraphScope.moe in cuda_graph_scope + ): + # Dropless MoE or Hybrid EP doesn't work with "moe" scope cudagraph. Skip. continue cuda_graph_warmup_steps = 3 loss_list = self._run_test_helper( @@ -986,6 +1029,10 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa ) assert torch.equal(loss_list, loss_list_ref) + if moe_dispatcher_type == "hybridep": + reset_hybrid_ep_buffer() + Utils.destroy_model_parallel() + if __name__ == "__main__": From 14b19b1a9f347cb860064dc40291e9de79d99e4b Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Tue, 2 Dec 2025 21:37:05 +0800 Subject: [PATCH 168/248] [Dev] Optimize TE cudagraph input memory (#2391) Signed-off-by: Robin Zhang --- megatron/core/transformer/cuda_graphs.py | 245 +++++++++++++---- .../transformer/test_cuda_graphs.py | 258 +++++++++++++++++- 2 files changed, 444 insertions(+), 59 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 5b0a0333d9e..f0fb39e6500 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1485,72 +1485,204 @@ def graphs_created(self): """ return self._graphs_created - def _get_cuda_graph_input_data(self): + def _get_sample_arguments(self, order): """ - Create the CUDA Graph capturing input data. - The data is organized per-chunk per-microbatch per-layer. + Generate sample arguments and keyword arguments for CUDA Graph capturing with + memory-optimized buffer reuse. + + This method creates static input tensors for each (layer, microbatch) pair needed + by TE's make_graphed_callables(). It optimizes memory usage by reusing input buffers + across non-overlapping forward passes based on the pipeline parallel schedule. + This optimization is essential for reducing peak memory during CUDA Graph capturing with + many microbatches, as it allows buffers to be reused instead of allocating new ones for + later microbatches. + + Memory Optimization Strategy: + The 1F1B (one-forward-one-backward) interleaved schedule in pipeline parallelism + means that once a microbatch's backward pass completes, its input buffers are no + longer needed. This method tracks buffer lifecycle and reuses "consumed" buffers + (those whose backward has completed) for new forward passes with matching tensor + signatures (shape, dtype, layout). + + Example schedule: [1, 1, 1, 2, 2, 2, -2, 1, -2, 1, -2, 2, -1, 2, -1, -1, -2, -2, -1, -1] + - Positive values indicate forward passes (chunk_id = value) + - Negative values indicate backward passes (chunk_id = -value) + - When processing -2 (backward of chunk 2), its buffers become available for reuse + - The next forward with matching signature can reuse those buffers + + Args: + order (List[int]): The forward/backward execution order from + convert_schedule_table_to_order(). Positive integers represent forward passes + (1-indexed chunk ID), negative integers represent backward passes. + + Returns: + Tuple[List[Tuple], List[Dict]]: A tuple containing: + - sample_args: List of positional argument tuples for each (layer, microbatch). + Length = num_layers * num_microbatches. Elements with the same tensor + signature may share references to reduce memory allocation. + - sample_kwargs: List of keyword argument dicts for each (layer, microbatch). + Length = num_layers * num_microbatches. Elements with the same tensor + signature may share references to reduce memory allocation. + + Data Structures: + - fwd_sample_queues: Dict[chunk_id, List[Tuple[sample_keys, fwd_idx]]] + Queue of forward samples per chunk awaiting their backward pass. + - consumed_sample_queue: Dict[sample_keys, List[fwd_idx]] + Pool of buffer indices whose backward is complete, keyed by tensor signature. + - sample_keys: Tuple of (shape, dtype, layout) for args + (key, shape, dtype, layout) + for kwargs, used to match compatible buffers for reuse. """ + assert self.num_model_chunks == max( + order + ), "num_model_chunks must match the max chunk id in order." + assert ( + get_num_microbatches() == len(order) // self.num_model_chunks // 2 + ), "num_microbatches must match the number of microbatches in order." + + # Generate sample arguments and keyword arguments for capturing. + sample_args = [None] * (len(self.flattened_callables) * get_num_microbatches()) + sample_kwargs = [None] * (len(self.flattened_callables) * get_num_microbatches()) rotary_pos_emb_cache = {} - def get_rotary_pos_emb(transformer_module, transformer_input): - if ( - transformer_module.position_embedding_type == 'rope' - and not self.config.multi_latent_attention - ): - rotary_seq_len = transformer_module.rotary_pos_emb.get_rotary_seq_len( - None, transformer_module.decoder, transformer_input, self.config, None - ) - if rotary_seq_len not in rotary_pos_emb_cache: - rotary_pos_emb_cache[rotary_seq_len] = transformer_module.rotary_pos_emb( - rotary_seq_len + def _get_layer_static_inputs(layer, chunk_of_the_layer): + """ + Get the static inputs for a layer. + """ + assert layer in chunk_of_the_layer.decoder.layers or any( + layer is mtp_layer.transformer_layer for mtp_layer in chunk_of_the_layer.mtp.layers + ), "Layer is not in the chunk" + + def get_rotary_pos_emb(transformer_module, transformer_input): + if ( + transformer_module.position_embedding_type == 'rope' + and not self.config.multi_latent_attention + ): + rotary_seq_len = transformer_module.rotary_pos_emb.get_rotary_seq_len( + None, transformer_module.decoder, transformer_input, self.config, None ) - return rotary_pos_emb_cache[rotary_seq_len] - else: - return None + if rotary_seq_len not in rotary_pos_emb_cache: + rotary_pos_emb_cache[rotary_seq_len] = transformer_module.rotary_pos_emb( + rotary_seq_len + ) + return rotary_pos_emb_cache[rotary_seq_len] + else: + return None - # Generate sample arguments and keyword arguments for capturing. - sample_args = [] - sample_kwargs = [] - for chunk_number, chunk_with_decoder in enumerate(self.chunks_with_decoder): - if chunk_with_decoder is None: - continue - layers = self.callables_per_chunk[chunk_number] - for _ in range(get_num_microbatches()): - for layer in layers: - static_inputs = layer.get_layer_static_inputs( - self.seq_length, self.micro_batch_size - ) + static_inputs = layer.get_layer_static_inputs(self.seq_length, self.micro_batch_size) - from megatron.core.transformer.identity_op import IdentityOp - from megatron.core.transformer.transformer_layer import TransformerLayer + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.transformer_layer import TransformerLayer - contains_self_attn = ( - isinstance(layer, TransformerLayer) - and not isinstance(layer.self_attention, IdentityOp) - and ( - not self.config.cuda_graph_scope - or CudaGraphScope.attn in self.config.cuda_graph_scope - ) - ) - if is_te_min_version("1.10.0"): - # te.make_graphed_callables() accepts keyword arguments since 1.10.0. - hidden_states = static_inputs.pop("hidden_states") - sample_args.append((hidden_states,)) - if contains_self_attn: - rotary_pos_emb = get_rotary_pos_emb(chunk_with_decoder, hidden_states) - if rotary_pos_emb is not None: - static_inputs["rotary_pos_emb"] = rotary_pos_emb - sample_kwargs.append(static_inputs) - elif contains_self_attn: - sample_args.append( - ( - static_inputs.pop("hidden_states"), - static_inputs.pop("attention_mask"), + contains_self_attn = ( + isinstance(layer, TransformerLayer) + and not isinstance(layer.self_attention, IdentityOp) + and ( + not self.config.cuda_graph_scope + or CudaGraphScope.attn in self.config.cuda_graph_scope + ) + ) + + _sample_kwargs = {} + if is_te_min_version("1.10.0"): + # te.make_graphed_callables() accepts keyword arguments since 1.10.0. + hidden_states = static_inputs.pop("hidden_states") + _sample_args = (hidden_states,) + if contains_self_attn: + rotary_pos_emb = get_rotary_pos_emb(chunk_of_the_layer, hidden_states) + if rotary_pos_emb is not None: + static_inputs["rotary_pos_emb"] = rotary_pos_emb + _sample_kwargs = static_inputs + elif contains_self_attn: + _sample_args = ( + static_inputs.pop("hidden_states"), + static_inputs.pop("attention_mask"), + ) + else: + _sample_args = (static_inputs.pop("hidden_states"),) + return _sample_args, _sample_kwargs + + # Calculate the starting index of each chunk in callables for future use. + prefix_num_layers = [0] + for model_chunk_idx in range(self.num_model_chunks): + num_layers = self.num_layers_per_chunk[model_chunk_idx] + prefix_num_layers.append(prefix_num_layers[-1] + num_layers) + + # Reorganize args and kwargs for input tensor reuse. + # fwd_sample_queues is keyed by model chunk index. The value is a queue of tuples. + # Each tuple contains the sample key signature and its fwd_idx. When we finish a backward + # chunk, we pop the corresponding fwd_idx and push to the consumed_sample_queue. + # consumed_sample_queue is keyed by the sample key signature. The value is a queue of the + # fwd_idx whose backward has been called so that we can reuse the same static buffers. + # In this way, we can reuse the same static input buffers for the non-overlapping samples + # with the same input signature. + fwd_sample_queues = {} + consumed_sample_queue = {} + fwd_idx = [0] * self.num_model_chunks + for chunk_id in order: + model_chunk_idx = abs(chunk_id) - 1 + + if chunk_id > 0: + sample_start_idx = (prefix_num_layers[model_chunk_idx] * get_num_microbatches()) + ( + fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] + ) + fwd_sample_idx = [ + sample_start_idx + i for i in range(self.num_layers_per_chunk[model_chunk_idx]) + ] + if model_chunk_idx not in fwd_sample_queues: + fwd_sample_queues[model_chunk_idx] = [] + for per_callable_fwd_idx in fwd_sample_idx: + if sample_args[per_callable_fwd_idx] is None: + sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( + _get_layer_static_inputs( + self.callables_per_chunk[model_chunk_idx][ + per_callable_fwd_idx - sample_start_idx + ], + self.chunks_with_decoder[model_chunk_idx], ) ) - else: - sample_args.append((static_inputs.pop("hidden_states"),)) + + sample_args_keys = tuple( + (t.shape, t.dtype, t.layout) for t in sample_args[per_callable_fwd_idx] + ) + sample_kwargs_keys = tuple( + (k, v.shape, v.dtype, v.layout) + for k, v in sorted(sample_kwargs[per_callable_fwd_idx].items()) + ) + sample_keys = sample_args_keys + sample_kwargs_keys + + fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx)) + if consumed_sample_queue.get(sample_keys, []): + reuse_fwd_idx = consumed_sample_queue[sample_keys].pop(0) + assert ( + sample_args[reuse_fwd_idx] is not None + and sample_kwargs[reuse_fwd_idx] is not None + ), "sample_args and sample_kwargs must not be None when reusing." + sample_args[per_callable_fwd_idx] = sample_args[reuse_fwd_idx] + sample_kwargs[per_callable_fwd_idx] = sample_kwargs[reuse_fwd_idx] + fwd_idx[model_chunk_idx] += 1 + else: + num_consumed_samples = min( + len(fwd_sample_queues[model_chunk_idx]), + self.num_layers_per_chunk[model_chunk_idx], + ) + for sample_keys, per_callable_fwd_idx in fwd_sample_queues[model_chunk_idx][ + :num_consumed_samples + ]: + if sample_keys not in consumed_sample_queue: + consumed_sample_queue[sample_keys] = [] + consumed_sample_queue[sample_keys].append(per_callable_fwd_idx) + fwd_sample_queues[model_chunk_idx] = fwd_sample_queues[model_chunk_idx][ + num_consumed_samples: + ] + + return sample_args, sample_kwargs + + def _get_cuda_graph_input_data(self): + """ + Create the CUDA Graph capturing input data. + The data is organized per-chunk per-microbatch per-layer. + """ # Get the PP and VPP scheduling order. from megatron.core.pipeline_parallel.schedules import ( @@ -1581,6 +1713,9 @@ def get_rotary_pos_emb(transformer_module, transformer_input): msg=f'Rank {torch.distributed.get_rank()}: ORDER {order}', ) + # Generate sample arguments and keyword arguments for capturing. + sample_args, sample_kwargs = self._get_sample_arguments(order) + def get_make_graphed_callables_kwargs(): kwargs = {'num_warmup_iters': 11, 'allow_unused_input': True, '_order': order} diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index cee75171560..0eac7c28c6d 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -33,7 +33,10 @@ ) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec -from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator +from megatron.core.num_microbatches_calculator import ( + destroy_num_microbatches_calculator, + init_num_microbatches_calculator, +) from megatron.core.pipeline_parallel.schedules import set_current_microbatch from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.mamba_block import MambaStack @@ -42,7 +45,11 @@ initialize_rng_tracker, model_parallel_cuda_manual_seed, ) -from megatron.core.transformer.cuda_graphs import CudaGraphManager, _CudagraphGlobalRecord +from megatron.core.transformer.cuda_graphs import ( + CudaGraphManager, + TECudaGraphHelper, + _CudagraphGlobalRecord, +) from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.moe.fused_a2a import reset_hybrid_ep_buffer from megatron.core.transformer.transformer_block import TransformerBlock @@ -735,6 +742,251 @@ def test_capture_freeze_gc(self): ) +# Global storage for comparing unique buffer counts across different num_microbatches +_unique_buffer_counts = None + + +class TestTECudaGraphHelper: + def setup_method(self, method): + # Initialize parallel state + initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + # Note: _unique_buffer_counts is intentionally NOT cleared here so we can + # compare values across parametrized test runs + + @pytest.mark.parametrize("num_microbatches", [4, 16, 64, 256]) + def test_get_cuda_graph_input_data(self, num_microbatches): + """Test _get_cuda_graph_input_data function in TECudaGraphHelper.""" + + # Set up test configuration + seq_length = 128 + micro_batch_size = 2 + num_layers = 4 + vocab_size = 1024 + hidden_size = 64 + num_attention_heads = 4 + + # Initialize num_microbatches calculator + init_num_microbatches_calculator( + rank=0, + rampup_batch_size=None, + global_batch_size=micro_batch_size * num_microbatches, + micro_batch_size=micro_batch_size, + data_parallel_size=1, + decrease_batch_size_if_needed=False, + ) + + # Create transformer config directly + transformer_config = TransformerConfig( + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + use_cpu_initialization=True, + cuda_graph_impl="transformer_engine", + use_te_rng_tracker=True, + bf16=True, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + ) + + # Create model + torch.manual_seed(123) + model_parallel_cuda_manual_seed(123) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=vocab_size, + max_sequence_length=seq_length, + parallel_output=True, + position_embedding_type="rope", + ) + + # Move model to CUDA + gpt_model.cuda() + + # Initialize TECudaGraphHelper + cuda_graph_helper = TECudaGraphHelper( + model=[gpt_model], + config=transformer_config, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + optimizers=[], + ) + + # Call _get_cuda_graph_input_data (which internally calls _get_sample_arguments) + sample_args, make_graphed_callables_kwargs = cuda_graph_helper._get_cuda_graph_input_data() + + # Extract sample_kwargs from the kwargs dict + # For TE >= 1.10.0, sample_kwargs should always be present + assert ( + 'sample_kwargs' in make_graphed_callables_kwargs + ), "sample_kwargs should be present in make_graphed_callables_kwargs for TE >= 1.10.0" + sample_kwargs = make_graphed_callables_kwargs['sample_kwargs'] + + # Basic checks + num_graphable_layers = len(cuda_graph_helper.flattened_callables) + expected_length = num_graphable_layers * num_microbatches + assert len(sample_args) == expected_length, ( + f"sample_args length mismatch: expected {expected_length}, " f"got {len(sample_args)}" + ) + assert len(sample_kwargs) == expected_length, ( + f"sample_kwargs length mismatch: expected {expected_length}, " + f"got {len(sample_kwargs)}" + ) + + # Check that all elements are not None + for i, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)): + assert args_item is not None, f"sample_args[{i}] is None" + assert kwargs_item is not None, f"sample_kwargs[{i}] is None" + assert isinstance(args_item, tuple), f"sample_args[{i}] should be a tuple" + assert isinstance(kwargs_item, dict), f"sample_kwargs[{i}] should be a dict" + assert len(args_item) > 0, f"sample_args[{i}] should not be empty" + # Check that hidden_states is present + assert "hidden_states" in kwargs_item or ( + len(args_item) > 0 and torch.is_tensor(args_item[0]) + ), f"sample_args[{i}] or sample_kwargs[{i}] should contain hidden_states" + + # Check tensor properties + for i, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)): + # Get hidden_states from args or kwargs + if len(args_item) > 0 and torch.is_tensor(args_item[0]): + hidden_states = args_item[0] + elif "hidden_states" in kwargs_item: + hidden_states = kwargs_item["hidden_states"] + else: + continue + + assert torch.is_tensor(hidden_states), f"hidden_states at index {i} should be a tensor" + # Check shape matches expected (accounting for TP/CP) + expected_seq_len = seq_length // transformer_config.context_parallel_size + if transformer_config.sequence_parallel: + expected_seq_len = expected_seq_len // transformer_config.tensor_model_parallel_size + assert hidden_states.shape[0] == expected_seq_len, ( + f"hidden_states seq_len mismatch at index {i}: " + f"expected {expected_seq_len}, got {hidden_states.shape[0]}" + ) + assert hidden_states.shape[1] == micro_batch_size, ( + f"hidden_states batch_size mismatch at index {i}: " + f"expected {micro_batch_size}, got {hidden_states.shape[1]}" + ) + assert hidden_states.shape[2] == transformer_config.hidden_size, ( + f"hidden_states hidden_size mismatch at index {i}: " + f"expected {transformer_config.hidden_size}, got {hidden_states.shape[2]}" + ) + + # Memory optimization check: verify that buffers with same signature are reused + # Create a mapping of sample_keys to indices + sample_keys_to_indices = {} + for idx, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)): + # Create sample_keys similar to the function + args_keys = tuple((t.shape, t.dtype, t.layout) for t in args_item if torch.is_tensor(t)) + kwargs_keys = tuple( + (k, v.shape, v.dtype, v.layout) + for k, v in sorted(kwargs_item.items()) + if torch.is_tensor(v) + ) + sample_keys = args_keys + kwargs_keys + + if sample_keys not in sample_keys_to_indices: + sample_keys_to_indices[sample_keys] = [] + sample_keys_to_indices[sample_keys].append(idx) + + # Check that buffers with same signature share references (memory optimization) + # The optimization reuses buffers when: + # 1. They have the same signature (shape, dtype, layout) + # 2. The backward pass of the original buffer has completed + # 3. A new forward pass with matching signature needs a buffer + # Count how many times each tensor is reused + unique_tensors = set() + tensor_reuse_count = {} + for idx, (args_item, kwargs_item) in enumerate(zip(sample_args, sample_kwargs)): + # Get the first tensor from args (hidden_states) + if len(args_item) > 0 and torch.is_tensor(args_item[0]): + tensor_ptr = args_item[0].data_ptr() + unique_tensors.add(tensor_ptr) + tensor_reuse_count[tensor_ptr] = tensor_reuse_count.get(tensor_ptr, 0) + 1 + + # With memory optimization, we should see some buffers reused + # (i.e., some tensors should appear multiple times) + max_reuse = max(tensor_reuse_count.values()) if tensor_reuse_count else 0 + total_entries = len(sample_args) + unique_buffer_count = len(unique_tensors) + + # Verify that memory optimization is working: + # - The number of unique buffers should be <= total entries + # - With the 1F1B schedule and multiple microbatches, we should see some buffer reuse + # - The number of unique buffers should be bounded as num_microbatches grows. + assert unique_buffer_count <= total_entries, ( + f"Memory optimization check: unique_buffer_count ({unique_buffer_count}) " + f"should be <= total_entries ({total_entries})" + ) + global _unique_buffer_counts + if _unique_buffer_counts is None: + _unique_buffer_counts = unique_buffer_count + else: + assert unique_buffer_count == _unique_buffer_counts, ( + f"Unique buffer count mismatch: expected {_unique_buffer_counts}, " + f"got {unique_buffer_count}" + ) + + # Verify that buffers with the same signature can potentially be reused + # (the actual reuse depends on the schedule, but the mechanism should work) + if num_microbatches > 1 and num_graphable_layers > 0: + # Check that we have multiple entries with the same signature + has_duplicate_signatures = any( + len(indices) > 1 for indices in sample_keys_to_indices.values() + ) + assert has_duplicate_signatures, ( + "Memory optimization: expected duplicate signatures for buffer reuse, " + "but all signatures are unique" + ) + + # If we have duplicate signatures and the schedule allows it, + # some buffers should be reused (max_reuse > 1) + # Note: The exact amount of reuse depends on the schedule order + # With 1F1B interleaved schedule, we should see some reuse + if max_reuse > 1: + # Verify that reused buffers have the same signature + reused_tensors = [ptr for ptr, count in tensor_reuse_count.items() if count > 1] + assert len(reused_tensors) > 0, "Expected some reused tensors" + + # Verify that make_graphed_callables_kwargs contains expected keys + assert ( + '_order' in make_graphed_callables_kwargs + ), "make_graphed_callables_kwargs should contain '_order'" + assert ( + 'num_warmup_iters' in make_graphed_callables_kwargs + ), "make_graphed_callables_kwargs should contain 'num_warmup_iters'" + assert ( + 'allow_unused_input' in make_graphed_callables_kwargs + ), "make_graphed_callables_kwargs should contain 'allow_unused_input'" + + # Verify the order in kwargs matches expectations + order = make_graphed_callables_kwargs['_order'] + num_model_chunks = cuda_graph_helper.num_model_chunks + expected_order_length = num_microbatches * num_model_chunks * 2 + assert ( + len(order) == expected_order_length + ), f"Order length mismatch: expected {expected_order_length}, got {len(order)}" + + # Verify that all forward passes in order have corresponding entries in sample_args + forward_count = sum(1 for chunk_id in order if chunk_id > 0) + assert forward_count == num_microbatches * num_model_chunks, ( + f"Forward count mismatch: expected {num_microbatches * num_model_chunks}, " + f"got {forward_count}" + ) + + def is_deep_ep_available(): from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP @@ -912,8 +1164,6 @@ def _run_test_helper( assert len(gpt_model) == 1 # Assume only one model in the model provider. if cuda_graph_impl == "transformer_engine": - from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - self.cuda_graph_helper = TECudaGraphHelper( model=gpt_model, config=gpt_model[0].config, From b0f5746735a965e67852d936a8fd0ef8928e9a81 Mon Sep 17 00:00:00 2001 From: Lifu Zhang Date: Tue, 2 Dec 2025 06:14:02 -0800 Subject: [PATCH 169/248] Fix HSDP Registering Device Mesh (#2388) Signed-off-by: Lifu Zhang Co-authored-by: Lifu Zhang Co-authored-by: Zijie Yan --- megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index b94a332bb0d..2b8eccb69d3 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -772,6 +772,8 @@ def register_submesh(device_mesh, submesh, is_expert_parallel): # Register EP submeshes if self.expt_device_mesh is not None: + register_submesh(self.device_mesh, hsdp_submesh, True) + register_submesh(self.device_mesh, hsdp_tp_submesh, True) register_submesh(self.expt_device_mesh, tp_submesh, True) register_submesh(self.expt_device_mesh, fsdp_tp_submesh, True) register_submesh(self.expt_device_mesh, fsdp_submesh, True) From 5375ad418ba3362d720badfa7f495b34ba49b962 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 2 Dec 2025 10:31:32 -0800 Subject: [PATCH 170/248] fix: update baseline (#2468) Signed-off-by: Pablo Garay --- .../workflows/check_api_backwards_compatibility_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 5f6adec4c91..c8f247b8439 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -66,7 +66,7 @@ jobs: # Default baseline for automatic PR checks # Can be: branch name (e.g., 'main'), commit hash, or tag # Will be resolved to commit hash during execution - DEFAULT_BASELINE: 'c6f277a7f869274c19aace594582d9938b06abac' + DEFAULT_BASELINE: 'b0f5746735a965e67852d936a8fd0ef8928e9a81' # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') TAG_PATTERN: 'core_v*' # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) From 79660b7bedd8ab18f36a712ed4c3de3d3fbc4e6a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 2 Dec 2025 11:37:29 -0800 Subject: [PATCH 171/248] fix: Add merge_group support with pre-flight pattern (#2469) Signed-off-by: Pablo Garay --- ...k_api_backwards_compatibility_workflow.yml | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index c8f247b8439..707d5f76316 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -3,7 +3,12 @@ name: API Compatibility Check on: push: branches: - - "pull-request/[0-9]+" + - dev + - main + - 'pull-request/[0-9]+' + - 'deploy-release/*' + merge_group: + types: [checks_requested] # Allow manual trigger workflow_dispatch: @@ -33,17 +38,35 @@ jobs: echo "Manual trigger - will run compatibility check" exit 0 fi - - # Check if any relevant files changed - # Use merge-base to find common ancestor with dev - # This ensures we only detect changes actually made in this PR branch, - # not changes that happened in dev after the branch was created - BASE_SHA=$(git merge-base origin/dev HEAD) - echo "Comparing against merge-base: $BASE_SHA" - + + # Determine base SHA based on event type + if [ "${{ github.event_name }}" == "merge_group" ]; then + BASE_SHA="${{ github.event.merge_group.base_sha }}" + echo "Merge group event - comparing against base: $BASE_SHA" + else + # For push events, use merge-base to find common ancestor + # This ensures we only detect changes actually made in this PR branch, + # not changes that happened in dev after the branch was created + BASE_SHA=$(git merge-base origin/dev HEAD 2>/dev/null || echo "") + if [ -z "$BASE_SHA" ]; then + # Fallback for branches targeting main + BASE_SHA=$(git merge-base origin/main HEAD 2>/dev/null || echo "") + fi + echo "Push event - comparing against merge-base: $BASE_SHA" + fi + + if [ -z "$BASE_SHA" ]; then + echo "Could not determine base SHA - will run compatibility check" + echo "should_skip=false" >> $GITHUB_OUTPUT + exit 0 + fi + # Check for changes in megatron/core Python files (excluding tests and legacy) - CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- 'megatron/core/**/*.py' ':!megatron/core/tests/**' ':!megatron/legacy/**' || echo "") - + CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ + 'megatron/core/**/*.py' \ + ':!megatron/core/tests/**' \ + ':!megatron/legacy/**' 2>/dev/null || echo "") + if [ -z "$CHANGED_FILES" ]; then echo "should_skip=true" >> $GITHUB_OUTPUT echo "No relevant megatron/core files changed - will skip compatibility check" From d72b218d45e0ef7964331f06498b688f6dcf5227 Mon Sep 17 00:00:00 2001 From: Lifu Zhang Date: Wed, 3 Dec 2025 00:44:55 -0800 Subject: [PATCH 172/248] DeepSeek V3 FSDP Fix for Precision-Aware Optimizer (#2204) Signed-off-by: Lifu Zhang Co-authored-by: Lifu Zhang Co-authored-by: Jianbin Chang --- .../fsdp/src/megatron_fsdp/param_and_grad_buffer.py | 5 +++-- megatron/training/training.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index 6a294b69602..88254d89988 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -2474,8 +2474,9 @@ def update_main_grads(self): item_id, only_shard=sharded_optimizer_state ) if group.main_weight_buffer is not None: - # Convert the gradient to the main weight buffer dtype. - optimizer_grad = optimizer_grad.to(param.dtype) + if not getattr(self, "use_precision_aware_optimizer", False): + # Convert the gradient to the main weight buffer dtype. + optimizer_grad = optimizer_grad.to(param.dtype) if name not in self.dist_main_grad: # Register the gradient as a distributed tensor. diff --git a/megatron/training/training.py b/megatron/training/training.py index e88b9839d28..d47a8abd20e 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1086,8 +1086,6 @@ def build_model(): kwargs['pad_buckets_for_high_nccl_busbw'] = args.ddp_pad_buckets_for_high_nccl_busbw kwargs['reduce_scatter_with_fp32_accumulation'] = args.ddp_reduce_scatter_with_fp32_accumulation kwargs['average_in_collective'] = args.ddp_average_in_collective - if args.use_megatron_fsdp and args.use_precision_aware_optimizer: - kwargs["preserve_fp32_weights"] = False ddp_config = DistributedDataParallelConfig(**kwargs) # In the Megatron FSDP and DDP use path, we need to initialize the bucket size. From 436065a86b749ca3b50eebca68f55c9e690a9f63 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 3 Dec 2025 21:31:57 +0800 Subject: [PATCH 173/248] [Dev] fix(moe): minor refactor for fine-grained activation offloading (#2285) Signed-off-by: Hongbin Liu Co-authored-by: Zijie Yan --- .../core/extensions/transformer_engine.py | 10 +++- .../fine_grained_activation_offload.py | 48 +++---------------- megatron/core/pipeline_parallel/utils.py | 33 +++++++++++++ 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 85732c0f7ea..9da6e85d8e9 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -2187,8 +2187,14 @@ def set_save_original_input(module): try: # pylint: disable=unused-import - from transformer_engine.pytorch import cpu_offload + from transformer_engine.pytorch import cpu_offload_v1 as cpu_offload +except ImportError: + try: + from transformer_engine.pytorch import cpu_offload + except ImportError: + cpu_offload = None +try: + # pylint: disable=unused-import from transformer_engine.pytorch.float8_tensor import Float8Tensor except ImportError: Float8Tensor = None - cpu_offload = None diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 1e280a09d35..138dcd8f7b1 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -1,12 +1,13 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -import warnings from collections import deque from contextlib import nullcontext from typing import Any import torch +from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu + # CPU offload implementation for pipeline parallelism DEBUG = False DEBUG_RANK = 0 @@ -22,39 +23,6 @@ def debug_rank(message): print(message) -def set_ideal_affinity_for_current_gpu(): - """Set CPU affinity for the current GPU to optimize host-device transfers.""" - import uuid - - try: - import cuda.bindings.driver as cuda_driver - import cuda.bindings.runtime as cuda_runtime - except ImportError: - try: - import cuda.cuda as cuda_driver - import cuda.cudart as cuda_runtime - except ImportError: - # print("cuda-python may not be installed, skipping GPU affinity setting") - warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") - return - try: - import pynvml - except ImportError: - warnings.warn("pynvml is not installed, skipping GPU affinity setting") - return - - # Get current CUDA device ID - err, device_id = cuda_runtime.cudaGetDevice() - assert err == cuda_runtime.cudaError_t.cudaSuccess - # Get device UUID - err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) - assert err == cuda_driver.CUresult.CUDA_SUCCESS - # Set CPU affinity based on GPU's NUMA node - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) - pynvml.nvmlDeviceSetCpuAffinity(handle) - - class PipelineOffloadManager: """ Singleton manager for coordinating activation offloading across pipeline stages. @@ -200,6 +168,8 @@ def __enter__(self): if cpu_offload is not None: cpu_offload.CPUOffloadEnabled = True + else: + raise RuntimeError("TE CPU offload is not available") self.inside_context = True torch._C._autograd._push_saved_tensors_default_hooks( @@ -213,6 +183,8 @@ def __exit__(self, *args: Any): if cpu_offload is not None: cpu_offload.CPUOffloadEnabled = False + else: + raise RuntimeError("TE CPU offload is not available") self.inside_context = False torch._C._autograd._pop_saved_tensors_default_hooks() @@ -244,24 +216,18 @@ class ChunkOffloadHandler: def offload(src_tensor, pin_memory=True): """Offload.""" debug_rank("--------offload") - from megatron.core.extensions.transformer_engine import Float8Tensor - - fp8_offload = isinstance(src_tensor, Float8Tensor) if Float8Tensor is not None else False if not src_tensor.is_contiguous(): src_tensor = src_tensor.contiguous() cpu_backup = torch.empty( src_tensor.size(), - dtype=torch.uint8 if fp8_offload else src_tensor.dtype, + dtype=src_tensor.dtype, layout=src_tensor.layout, device="cpu", pin_memory=pin_memory, ) - if fp8_offload: - cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) - cpu_backup.copy_(src_tensor, non_blocking=pin_memory) state = (src_tensor.device, cpu_backup) return state diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index fae8e5466da..c50c6ac7964 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -80,6 +80,39 @@ def make_viewless(e): return e +def set_ideal_affinity_for_current_gpu(): + """Set CPU affinity for the current GPU to optimize host-device transfers.""" + import uuid + + try: + import cuda.bindings.driver as cuda_driver + import cuda.bindings.runtime as cuda_runtime + except ImportError: + try: + import cuda.cuda as cuda_driver + import cuda.cudart as cuda_runtime + except ImportError: + # print("cuda-python may not be installed, skipping GPU affinity setting") + warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") + return + try: + import pynvml + except ImportError: + warnings.warn("pynvml is not installed, skipping GPU affinity setting") + return + + # Get current CUDA device ID + err, device_id = cuda_runtime.cudaGetDevice() + assert err == cuda_runtime.cudaError_t.cudaSuccess + # Get device UUID + err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) + assert err == cuda_driver.CUresult.CUDA_SUCCESS + # Set CPU affinity based on GPU's NUMA node + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) + pynvml.nvmlDeviceSetCpuAffinity(handle) + + @contextmanager def stream_acquire_context(stream, event): """Stream acquire context""" From a4bee49f1460f7831e88e04e95e2b86f95185709 Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:38:54 -1000 Subject: [PATCH 174/248] [Dev] feat: m4 leftover changes (#2226) Signed-off-by: dimapihtar Signed-off-by: yaoyu-33 Co-authored-by: dimapihtar --- .../distributed/distributed_data_parallel.py | 64 ++++++------------- .../core/extensions/transformer_engine.py | 37 ++++++----- megatron/core/hyper_comm_grid.py | 1 - megatron/core/optimizer/__init__.py | 31 ++++++--- megatron/core/optimizer/clip_grads.py | 3 +- megatron/core/optimizer/optimizer.py | 5 +- megatron/core/pipeline_parallel/schedules.py | 17 +++-- megatron/core/tensor_parallel/layers.py | 12 ++-- megatron/core/transformer/module.py | 18 +++++- megatron/core/transformer/moe/experts.py | 13 ++-- megatron/core/transformer/moe/moe_utils.py | 28 +++++--- .../core/transformer/moe/shared_experts.py | 4 +- .../transformer/multi_latent_attention.py | 27 ++++++-- megatron/core/utils.py | 24 ++++++- megatron/training/training.py | 6 ++ 15 files changed, 181 insertions(+), 109 deletions(-) diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py index df1d7ae94db..e831d7cf4ec 100644 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -6,7 +6,6 @@ import torch -from .. import parallel_state from ..config_logger import has_config_logger_enabled, log_config_to_disk from ..fp8_utils import is_float8tensor, post_all_gather_processing from ..process_groups_config import ProcessGroupCollection @@ -55,10 +54,15 @@ def __init__( # If using very large dp_sizes, make buckets larger to ensure that chunks used in NCCL # ring-reduce implementations are large enough to remain bandwidth-bound rather than # latency-bound. + # Setup process groups, handling both None and provided pg_collection values. + process_group_dict = ProcessGroupCollection.setup_process_groups_for_ddp( + pg_collection, config, ddp_config + ) + + # If bucket_size is not provided as an input, use sane default based on dp_group size. + dp_group = process_group_dict['dp_group'] if ddp_config.bucket_size is None: - ddp_config.bucket_size = max( - 40000000, 1000000 * parallel_state.get_data_parallel_world_size() - ) + ddp_config.bucket_size = max(40000000, 1000000 * dp_group.size()) # Set bucket_size to infinity if overlap_grad_reduce is False. if not ddp_config.overlap_grad_reduce: ddp_config.bucket_size = None @@ -70,45 +74,19 @@ def __init__( f'Setting up DistributedDataParallel with config {self.ddp_config}', ) - if pg_collection is None: - self.dp_group = parallel_state.get_data_parallel_group( - with_context_parallel=False, partial_data_parallel=False - ) - self.dp_cp_group = parallel_state.get_data_parallel_group( - with_context_parallel=True, partial_data_parallel=False - ) - self.intra_dp_cp_group = parallel_state.get_data_parallel_group( - with_context_parallel=True, partial_data_parallel=True - ) - self.expt_dp_group = parallel_state.get_expert_data_parallel_group() - self.intra_expt_dp_group = parallel_state.get_expert_data_parallel_group( - partial_expert_data_parallel=True - ) - if self.ddp_config.num_distributed_optimizer_instances > 1: - self.inter_dist_opt_group = ( - parallel_state.get_inter_distributed_optimizer_instance_group() - ) - self.tp_group = parallel_state.get_tensor_model_parallel_group() - self.pp_group = parallel_state.get_pipeline_model_parallel_group() - self.ep_group = parallel_state.get_expert_model_parallel_group() - else: - # Setup process groups using DDP-specific helper method - process_groups = ProcessGroupCollection.setup_process_groups_for_ddp( - pg_collection, config, self.ddp_config - ) - - self.dp_group = process_groups['dp_group'] - self.dp_cp_group = process_groups['dp_cp_group'] - self.intra_dp_cp_group = process_groups['intra_dp_cp_group'] - self.expt_dp_group = process_groups['expt_dp_group'] - self.intra_expt_dp_group = process_groups['intra_expt_dp_group'] - self.tp_group = process_groups['tp_group'] - self.pp_group = process_groups['pp_group'] - self.ep_group = process_groups['ep_group'] - - # Set inter_dist_opt_group if multiple optimizer instances - if self.ddp_config.num_distributed_optimizer_instances > 1: - self.inter_dist_opt_group = process_groups['inter_dist_opt_group'] + # Assign all required process groups + self.dp_group = process_group_dict['dp_group'] + self.dp_cp_group = process_group_dict['dp_cp_group'] + self.intra_dp_cp_group = process_group_dict['intra_dp_cp_group'] + self.expt_dp_group = process_group_dict['expt_dp_group'] + self.intra_expt_dp_group = process_group_dict['intra_expt_dp_group'] + self.tp_group = process_group_dict['tp_group'] + self.pp_group = process_group_dict['pp_group'] + self.ep_group = process_group_dict['ep_group'] + + # Set inter_dist_opt_group if multiple optimizer instances + if self.ddp_config.num_distributed_optimizer_instances > 1: + self.inter_dist_opt_group = process_group_dict['inter_dist_opt_group'] # Turn off bucketing if we are on a pipeline stage that is not the first (since # data-parallel communication on these stages is not on the critical path), or if diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 9da6e85d8e9..ab9962cfb1c 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -20,9 +20,6 @@ from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import ( get_context_parallel_group, - get_expert_data_parallel_rank, - get_expert_model_parallel_rank, - get_expert_model_parallel_world_size, get_hierarchical_context_parallel_groups, get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, @@ -372,9 +369,10 @@ def __init__( extra_kwargs["rng_tracker_name"] = rng_tracker_name te_parallel_mode = parallel_mode + tp_group_for_te = tp_group if parallel_mode == "duplicated": # Handle non-parallel case - tp_group = None + tp_group_for_te = None tp_size = 1 explicit_expert_comm = False te_parallel_mode = None @@ -389,7 +387,7 @@ def __init__( input_size = divide(input_size, tp_size) te_parallel_mode = None tp_size = 1 - tp_group = None + tp_group_for_te = None super().__init__( in_features=input_size, @@ -397,7 +395,7 @@ def __init__( sequence_parallel=self.config.sequence_parallel, fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, # Pass None if not initialized for backward compatibility with the ckpt converter. - tp_group=tp_group if torch.distributed.is_initialized() else None, + tp_group=tp_group_for_te if torch.distributed.is_initialized() else None, tp_size=tp_size, get_rng_state_tracker=( get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None @@ -1166,7 +1164,7 @@ def __init__( skip_bias_add: bool, is_expert: bool = False, tp_comm_buffer_name: Optional[str] = None, - tp_group: Optional[torch.distributed.ProcessGroup] = None, + pg_collection: Optional[ProcessGroupCollection] = None, ): self.config = config @@ -1197,9 +1195,14 @@ def __init__( # The comms between TP and EP group is explicitly handled by MoE token dispatcher. # So we disable comms by making TE agnostic of model parallel. - tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) + if pg_collection is None: + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + self._pg_collection = pg_collection + assert is_expert, "TEGroupedLinear only supports expert parallelism" + tp_group = pg_collection.expt_tp self._tp_group = tp_group tp_size = get_pg_size(tp_group) + tp_group_for_te = tp_group self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel) @@ -1210,7 +1213,7 @@ def __init__( input_size = divide(input_size, tp_size) parallel_mode = None tp_size = 1 - tp_group = None + tp_group_for_te = None super().__init__( num_gemms=num_gemms, @@ -1218,7 +1221,7 @@ def __init__( out_features=output_size, sequence_parallel=self.config.sequence_parallel, fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, - tp_group=tp_group if torch.distributed.is_initialized() else None, + tp_group=tp_group_for_te if torch.distributed.is_initialized() else None, tp_size=tp_size, get_rng_state_tracker=( get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None @@ -1411,8 +1414,8 @@ def _sharded_state_dict_grouped( singleton_local_shards = (metadata or {}).get('singleton_local_shards', False) sharded_state_dict = {} full_state_dict = self.state_dict(prefix="", keep_vars=True) - num_global_experts = get_expert_model_parallel_world_size() * self.num_gemms - local_expert_indices_offset = get_expert_model_parallel_rank() * self.num_gemms + num_global_experts = get_pg_size(self._pg_collection.ep) * self.num_gemms + local_expert_indices_offset = get_pg_rank(self._pg_collection.ep) * self.num_gemms ep_axis = len(sharded_offsets) extra_states = self._split_extra_state(full_state_dict["_extra_state"]) for gemm_idx in range(self.num_gemms): @@ -1461,7 +1464,7 @@ def _sharded_state_dict_grouped( if getattr(sh_ten, "is_data_parallel_fully_shard", False): edp_replica_id = 0 else: - edp_replica_id = get_expert_data_parallel_rank() + edp_replica_id = get_pg_rank(self._pg_collection.expt_dp) sh_ten.replica_id = (*replica_id[:2], edp_replica_id) return sharded_state_dict @@ -1491,7 +1494,7 @@ def __init__( skip_bias_add: bool, is_expert: bool, tp_comm_buffer_name: Optional[str] = None, - tp_group: Optional[torch.distributed.ProcessGroup] = None, + pg_collection: Optional[ProcessGroupCollection] = None, ): super().__init__( num_gemms=num_gemms, @@ -1504,7 +1507,7 @@ def __init__( skip_bias_add=skip_bias_add, is_expert=is_expert, tp_comm_buffer_name=tp_comm_buffer_name, - tp_group=tp_group, + pg_collection=pg_collection, ) def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): @@ -1537,7 +1540,7 @@ def __init__( skip_bias_add: bool, is_expert: bool, tp_comm_buffer_name: Optional[str] = None, - tp_group: Optional[torch.distributed.ProcessGroup] = None, + pg_collection: Optional[ProcessGroupCollection] = None, ): super().__init__( num_gemms=num_gemms, @@ -1550,7 +1553,7 @@ def __init__( skip_bias_add=skip_bias_add, is_expert=is_expert, tp_comm_buffer_name=tp_comm_buffer_name, - tp_group=tp_group, + pg_collection=pg_collection, ) def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): diff --git a/megatron/core/hyper_comm_grid.py b/megatron/core/hyper_comm_grid.py index dce2aa16a7f..379bca69f74 100644 --- a/megatron/core/hyper_comm_grid.py +++ b/megatron/core/hyper_comm_grid.py @@ -160,7 +160,6 @@ def create_pg(self, dims: Union[str, list[str]], **kwargs: Any) -> dist.ProcessG logging.info(f"Generated process group for {unique_group_key} with enumeration {rank_enum}") self._pgs[unique_group_key] = pg - return pg def get_pg(self, dims: Union[str, list[str]]) -> dist.ProcessGroup: diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index c254b2f6882..1496cc7d17a 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -284,6 +284,7 @@ def _get_megatron_optimizer_based_on_param_groups( data_parallel_group_idx: Optional[int] = None, intra_dist_opt_group: Optional[torch.distributed.ProcessGroup] = None, distributed_optimizer_instance_id: Optional[int] = 0, + pg_collection: Optional[ProcessGroupCollection] = None, ) -> MegatronOptimizer: """Get Megatron optimizer based on parameter groups. @@ -470,6 +471,13 @@ def init_state_fn(opt, config=None): optimizer = FP32Optimizer(optimizer, config, init_state_fn) setattr(optimizer, 'grad_stats_parallel_group', model_parallel_group) + if pg_collection is None or not hasattr(pg_collection, 'tp'): + tp_group = parallel_state.get_tensor_model_parallel_group() + else: + tp_group = pg_collection.tp + # TODO(M4): plumb tp_group through optimizer constructors so this setattr disappears. + setattr(optimizer, 'tp_group', tp_group) + return optimizer @@ -521,23 +529,23 @@ def get_megatron_optimizer( overlap_param_gather_with_optimizer_step_flags = [False] # Setup process groups using helper method - process_groups = ProcessGroupCollection.setup_process_groups_for_optimizer( + process_groups_dict = ProcessGroupCollection.setup_process_groups_for_optimizer( pg_collection, model_chunks, use_gloo_process_groups ) - dp_cp_group = process_groups['dp_cp_group'] - intra_dp_cp_group = process_groups['intra_dp_cp_group'] - intra_expt_dp_group = process_groups['intra_expt_dp_group'] - mp_group = process_groups['mp_group'] - expt_tp_pp_group = process_groups['expt_tp_pp_group'] - intra_dp_cp_group_gloo = process_groups['intra_dp_cp_group_gloo'] - intra_expt_dp_group_gloo = process_groups['intra_expt_dp_group_gloo'] - intra_dist_opt_group = process_groups['intra_dist_opt_group'] + dp_cp_group = process_groups_dict['dp_cp_group'] + intra_dp_cp_group = process_groups_dict['intra_dp_cp_group'] + intra_expt_dp_group = process_groups_dict['intra_expt_dp_group'] + mp_group = process_groups_dict['mp_group'] + expt_tp_pp_group = process_groups_dict['expt_tp_pp_group'] + intra_dp_cp_group_gloo = process_groups_dict['intra_dp_cp_group_gloo'] + intra_expt_dp_group_gloo = process_groups_dict['intra_expt_dp_group_gloo'] + intra_dist_opt_group = process_groups_dict['intra_dist_opt_group'] model_parallel_rank = get_pg_rank(mp_group) if get_pg_size(dp_cp_group) > get_pg_size(intra_dp_cp_group): - inter_dist_opt_group = process_groups['inter_dist_opt_group'] + inter_dist_opt_group = process_groups_dict['inter_dist_opt_group'] distributed_optimizer_instance_id = get_pg_rank(inter_dist_opt_group) else: distributed_optimizer_instance_id = 0 @@ -573,6 +581,7 @@ def get_megatron_optimizer( data_parallel_group_idx=model_parallel_rank, intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, + pg_collection=pg_collection, ) ) model_chunk_offset += 1 @@ -623,6 +632,7 @@ def get_megatron_optimizer( data_parallel_group_idx=model_parallel_rank, intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, + pg_collection=pg_collection, ) ) model_chunk_offset += 1 @@ -663,6 +673,7 @@ def get_megatron_optimizer( data_parallel_group_idx=expt_model_parallel_rank, intra_dist_opt_group=intra_dist_opt_group, distributed_optimizer_instance_id=distributed_optimizer_instance_id, + pg_collection=pg_collection, ) ) diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py index 70117858b77..cb2f23a685f 100644 --- a/megatron/core/optimizer/clip_grads.py +++ b/megatron/core/optimizer/clip_grads.py @@ -181,6 +181,7 @@ def count_zeros_fp32( parameters: Union[List[torch.Tensor], torch.Tensor], grad_stats_parallel_group: torch.distributed.ProcessGroup, use_decoupled_grad: bool = False, + tp_group: Optional[torch.distributed.ProcessGroup] = None, ) -> float: """Counts the number of zeros in gradients associated with the passed-in list of parameters. @@ -218,7 +219,7 @@ def count_zeros_fp32( grad_attr = "decoupled_grad" if use_decoupled_grad else "grad" grad_not_none = hasattr(param, grad_attr) and getattr(param, grad_attr) is not None is_not_shared = param_is_not_shared(param) - is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param, tp_group=tp_group) if grad_not_none and is_not_shared and is_not_tp_duplicate: grad_obj = getattr(param, grad_attr) data_parallel_group = get_data_parallel_group_if_dtensor(grad_obj, data_parallel_group) diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py index 1829cb424f1..8d6fb65136b 100644 --- a/megatron/core/optimizer/optimizer.py +++ b/megatron/core/optimizer/optimizer.py @@ -152,7 +152,9 @@ def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]: grad = param.grad grad_not_none = grad is not None is_not_shared = param_is_not_shared(param) - is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param) + is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate( + param, getattr(self, 'tp_group', None) + ) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) @@ -224,6 +226,7 @@ def count_zeros(self) -> float: params, grad_stats_parallel_group=self.get_grad_stats_parallel_group(), use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8, + tp_group=getattr(self, 'tp_group', None), ) @abstractmethod diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 18344429c45..97d8aefad85 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -41,7 +41,7 @@ Shape = Union[List[int], torch.Size] -def get_forward_backward_func(): +def get_forward_backward_func(pp_size: Optional[int] = None, vp_size: Optional[int] = None): """Retrieves the appropriate forward_backward function given the configuration of parallel_state. @@ -124,10 +124,18 @@ def forward_step(data_iterator, model): respective list of shapes. Thus it is not used in the other forward-backward functions which have different shape handling. + Args: + pp_size (Optional[int]): Pipeline model parallel size to use. + vp_size (Optional[int]): Virtual pipeline model parallel size to use. + If both pp_size and vp_size are None, both values fall back to parallel_state. + Otherwise, provided values are used as-is and None is treated as an explicit input. """ - pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() - if pipeline_model_parallel_size > 1: - if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + if pp_size is None and vp_size is None: + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + if pp_size > 1: + if vp_size is not None: forward_backward_func = forward_backward_pipelining_with_interleaving else: forward_backward_func = forward_backward_pipelining_without_interleaving @@ -513,6 +521,7 @@ def forward_backward_no_pipelining( collect_non_loss_data: bool = False, first_val_step: Optional[bool] = None, adjust_tensor_shapes_fn: Optional[Callable] = None, # unused + p2p_communicator: Optional[P2PCommunicator] = None, # unused pg_collection: Optional[ProcessGroupCollection] = None, ): """Run forward and backward passes with no pipeline parallelism""" diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 221f3327e50..d3ec11aaf5c 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -86,12 +86,16 @@ dist_reduce_scatter_func = torch.distributed._reduce_scatter_base -def param_is_not_tensor_parallel_duplicate(param): +def param_is_not_tensor_parallel_duplicate(param, tp_group=None): """Returns true if the passed-in parameter is not a duplicate parameter on another TP rank.""" - return (hasattr(param, "tensor_model_parallel") and param.tensor_model_parallel) or ( - get_tensor_model_parallel_rank() == 0 - ) + if hasattr(param, "tensor_model_parallel") and param.tensor_model_parallel: + return True + # Prefer provided tp_group when available (new explicit path). + if tp_group is not None: + return tp_group.rank() == 0 + # Fallback to legacy global state (back-compat). + return get_tensor_model_parallel_rank() == 0 def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py index 1058a207b12..2330df91b52 100644 --- a/megatron/core/transformer/module.py +++ b/megatron/core/transformer/module.py @@ -393,7 +393,9 @@ def __init__(self, config: TransformerConfig, module: torch.nn.Module): self.config = config self.fp16 = config.fp16 self.bf16 = config.bf16 + self.vp_size = config.virtual_pipeline_model_parallel_size self.vp_stage = getattr(module, 'vp_stage', None) + self.pg_collection = getattr(module, 'pg_collection', None) if self.fp16: self.add_module('module', module.half()) @@ -438,11 +440,23 @@ def forward(self, *inputs, fp32_output=True, **kwargs): The wrapped module's outputs, potentially upcast to fp32 depending on pipeline stage and ``fp32_output``. """ - if parallel_state.is_pipeline_first_stage(ignore_virtual=False, vp_stage=self.vp_stage): + from megatron.core.pipeline_parallel.utils import ( + is_pp_first_stage, + is_pp_last_stage, + is_vp_first_stage, + is_vp_last_stage, + ) + + if self.pg_collection is None: + pp_group = parallel_state.get_pipeline_model_parallel_group() + else: + pp_group = self.pg_collection.pp + if is_vp_first_stage(self.vp_stage, self.vp_size) and is_pp_first_stage(pp_group): inputs = fp32_to_float16(inputs, self.float16_convertor) outputs = self.module(*inputs, **kwargs) if ( - parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=self.vp_stage) + is_vp_last_stage(self.vp_stage, self.vp_size) + and is_pp_last_stage(pp_group) and fp32_output is True ): outputs = float16_to_fp32(outputs) diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index 7391bcaf123..83cf5b51ffc 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -50,6 +50,7 @@ make_sharded_object_for_checkpoint, sharded_state_dict_default, ) +from megatron.core.utils import internal_api try: import transformer_engine as te # pylint: disable=unused-import @@ -69,6 +70,8 @@ class GroupedMLP(MegatronModule): Executes multiple experts in parallel to maximize computational efficiency. """ + # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection. + @internal_api def __init__( self, num_local_experts: int, @@ -732,6 +735,8 @@ class TEGroupedMLP(MegatronModule): Executes multiple experts in parallel to maximize computational efficiency. """ + # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection. + @internal_api def __init__( self, num_local_experts, @@ -754,7 +759,6 @@ def __init__( if self.config.gated_linear_unit: ffn_hidden_size *= 2 - # TODO(Hepteract): pass pg_collection to submodule after refactoring Linear modules self.linear_fc1 = build_module( submodules.linear_fc1, self.num_local_experts, @@ -766,7 +770,7 @@ def __init__( skip_bias_add=False, is_expert=True, tp_comm_buffer_name='fc1', - tp_group=pg_collection.expt_tp, + pg_collection=pg_collection, ) if self.config.use_te_activation_func and not (submodules.activation_func is None): @@ -774,7 +778,6 @@ def __init__( else: self.activation_func = self.config.activation_func - # TODO(Hepteract): pass pg_collection to submodule after refactoring Linear modules self.linear_fc2 = build_module( submodules.linear_fc2, self.num_local_experts, @@ -786,7 +789,7 @@ def __init__( skip_bias_add=True, is_expert=True, tp_comm_buffer_name='fc2', - tp_group=pg_collection.expt_tp, + pg_collection=pg_collection, ) self.offload_expert_fc1 = ( @@ -1040,6 +1043,8 @@ class SequentialMLP(MegatronModule): This class executes each expert sequentially. """ + # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection. + @internal_api def __init__( self, num_local_experts, diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 3ed31d375e2..8bab8d70065 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -755,18 +755,29 @@ def clear_aux_losses_tracker(): tracker[name]["values"].zero_() -def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = None): +def reduce_aux_losses_tracker_across_ranks( + track_names: Optional[List[str]] = None, pg_collection: Optional[ProcessGroupCollection] = None +): """Collect and reduce the auxiliary losses across ranks.""" tracker = get_moe_layer_wise_logging_tracker() if track_names is None: track_names = tracker.keys() + + if pg_collection is None: + # Use parallel_state groups + pp_group = parallel_state.get_pipeline_model_parallel_group() + dp_group = parallel_state.get_data_parallel_group( + with_context_parallel=False, partial_data_parallel=False + ) + else: + pp_group = pg_collection.pp + dp_group = pg_collection.dp + for name in track_names: values = tracker[name]["values"] # TODO(Hepteract): delete the usage of the global parallel_state. # Collect aux losses across PP. - torch.distributed.all_reduce( - values, group=parallel_state.get_pipeline_model_parallel_group() - ) + torch.distributed.all_reduce(values, group=pp_group) # Reduce aux losses across ranks. if tracker[name].get('reduce_group') is not None: torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group')) @@ -778,11 +789,7 @@ def reduce_aux_losses_tracker_across_ranks(track_names: Optional[List[str]] = No # The `global_load_balancing_loss` already uses `tp_dp_cp_group` in `reduce_group`, # so we don't need to reduce it again. Others use `tp_cp_group` in `reduce_group`. if name != "global_load_balancing_loss": - torch.distributed.all_reduce( - values, - group=parallel_state.get_data_parallel_group(with_context_parallel=False), - op=torch.distributed.ReduceOp.AVG, - ) + torch.distributed.all_reduce(values, group=dp_group, op=torch.distributed.ReduceOp.AVG) def track_moe_metrics( @@ -797,6 +804,7 @@ def track_moe_metrics( num_layers: Optional[int] = None, moe_layer_freq: Optional[Union[int, List[int]]] = None, mtp_num_layers: Optional[int] = None, + pg_collection: Optional[ProcessGroupCollection] = None, ): """Track the MoE metrics for logging.""" # Aux loss logging @@ -810,7 +818,7 @@ def track_moe_metrics( tracker[key]["values"] = torch.zeros(num_layers, device="cuda") tracker[key]["reduce_group"] = None tracker[key]["avg_group"] = None - reduce_aux_losses_tracker_across_ranks(track_names) + reduce_aux_losses_tracker_across_ranks(track_names, pg_collection=pg_collection) # Get number of MoE layers if moe_layer_freq is None: diff --git a/megatron/core/transformer/moe/shared_experts.py b/megatron/core/transformer/moe/shared_experts.py index bf2c2072af9..ab075d94e52 100644 --- a/megatron/core/transformer/moe/shared_experts.py +++ b/megatron/core/transformer/moe/shared_experts.py @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import warnings -from copy import deepcopy +from copy import copy from typing import Optional import torch @@ -43,7 +43,7 @@ def __init__( gate: bool, pg_collection: Optional[ProcessGroupCollection] = None, ): - config = deepcopy(config) + config = copy(config) assert config.add_bias_linear == False, "bias is not supported in the shared experts, " "please set '--disable-bias-linear' instead." diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index 3953d933b45..b65294fcc10 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -15,7 +15,7 @@ HAVE_EINOPS = False -from megatron.core import parallel_state, tensor_parallel +from megatron.core import tensor_parallel from megatron.core.models.common.embeddings import ( RotaryEmbedding, YarnRotaryEmbedding, @@ -41,7 +41,7 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import MLATransformerConfig -from megatron.core.utils import deprecate_inference_params, is_te_min_version +from megatron.core.utils import deprecate_inference_params, get_pg_size, is_te_min_version try: from megatron.core.fusions.fused_mla_yarn_rope_apply import ( @@ -178,6 +178,7 @@ def __init__( skip_bias_add=True, is_expert=False, tp_comm_buffer_name='proj', + tp_group=self.pg_collection.tp, ) if ( @@ -401,6 +402,9 @@ def __init__( cp_comm_type: Optional[str] = None, pg_collection: ProcessGroupCollection = None, ): + if pg_collection is None: + pg_collection = ProcessGroupCollection.use_mpu_process_groups() + super().__init__( config=config, submodules=submodules, @@ -450,6 +454,11 @@ def __init__( is_expert=False, tp_comm_buffer_name='q_down_proj', skip_weight_param_allocation=False, + tp_group=( + pg_collection.tp + if q_down_proj_kwargs.get('parallel_mode') != 'duplicated' + else None + ), **q_down_proj_kwargs, ) @@ -464,6 +473,7 @@ def __init__( skip_bias_add=False, is_expert=False, tp_comm_buffer_name='q_up_proj', + tp_group=pg_collection.tp, ) kv_down_proj_kwargs = {} @@ -489,6 +499,11 @@ def __init__( is_expert=False, tp_comm_buffer_name='kv_down_proj', skip_weight_param_allocation=False, + tp_group=( + pg_collection.tp + if kv_down_proj_kwargs.get('parallel_mode') != 'duplicated' + else None + ), **kv_down_proj_kwargs, ) @@ -503,6 +518,7 @@ def __init__( skip_bias_add=False, is_expert=False, tp_comm_buffer_name='kv_up_proj', + tp_group=pg_collection.tp, ) if self.config.q_lora_rank is not None: @@ -624,12 +640,9 @@ def get_query_key_value_tensors( kv_compressed, k_pos_emb = torch.split( kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1 ) - if ( - parallel_state.get_tensor_model_parallel_world_size() > 1 - and self.config.sequence_parallel - ): + if get_pg_size(self.tp_group) > 1 and self.config.sequence_parallel: # k_pos_emb: [s, b, qk_pos_emb_head_dim] - k_pos_emb = gather_from_sequence_parallel_region(k_pos_emb) + k_pos_emb = gather_from_sequence_parallel_region(k_pos_emb, group=self.tp_group) if packed_seq_params is not None: # If sequence packing, TE expect [t, h, d] shaped qkv input. diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 9b62b18d400..431b56bd002 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -494,6 +494,10 @@ def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_ini if not torch.distributed.is_initialized(): return None + # if parallel_state is not initialized, pass `tp_group` thru + if not parallel_state.is_initialized(): + return tp_group + if tp_group is None: if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: warnings.warn( @@ -1942,9 +1946,17 @@ def is_submodule(module, parent_module, strict=True): ######################## -def get_batch_on_this_cp_rank(batch: Dict[str, Any]): +def get_batch_on_this_cp_rank( + batch: Dict[str, Any], cp_group: Optional[torch.distributed.ProcessGroup] = None +): """Slice batch input along sequence dimension into multiple chunks, which are parallelized across GPUs in a context parallel group. + + Args: + batch (Dict[str, Any]): Input batch tensors. + cp_group (Optional[torch.distributed.ProcessGroup]): Context-parallel process group. + If provided, uses this group's size and rank. Otherwise, falls back to + the current context-parallel settings from parallel_state. """ # With causal masking, each token only attends to its prior tokens. Simply split @@ -1953,9 +1965,15 @@ def get_batch_on_this_cp_rank(batch: Dict[str, Any]): # we split sequence into 2*CP ranks. Assuming CP=2, we then get 4 chunks, chunk_0 # and chunk_3 are assigned to GPU0, chunk_1 and chunk_2 are assigned to GPU1, so # that we can get balanced workload among GPUs in a context parallel group. - cp_size = parallel_state.get_context_parallel_world_size() - if cp_size > 1: + # Determine CP topology either from provided group or from current context parallel state + if cp_group is not None: + cp_size = get_pg_size(cp_group) + cp_rank = get_pg_rank(cp_group) + else: + cp_size = parallel_state.get_context_parallel_world_size() cp_rank = parallel_state.get_context_parallel_rank() + + if cp_size > 1: for key, val in batch.items(): if val is not None: seq_dim = 1 if key != "attention_mask" else 2 diff --git a/megatron/training/training.py b/megatron/training/training.py index d47a8abd20e..99fbd453426 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -50,6 +50,7 @@ from megatron.core import mpu, tensor_parallel from megatron.core.utils import ( check_param_hashes_across_dp_replicas, + get_attr_wrapped_model, get_model_config, StragglerDetector, ) @@ -1504,6 +1505,7 @@ def training_log( params_norm, num_zeros_in_grad, max_attention_logit, + pg_collection=None, ): """Log training information such as losses, timing, ....""" args = get_args() @@ -1693,6 +1695,7 @@ def training_log( num_layers=args.num_layers, moe_layer_freq=args.moe_layer_freq, mtp_num_layers=args.mtp_num_layers, + pg_collection=pg_collection, ) if args.mtp_num_layers is not None: mtp_loss_scale = 1 / get_num_microbatches() @@ -2188,6 +2191,8 @@ def train( for model_module in model: model_module.train() + model_pg_collection = get_attr_wrapped_model(model[0], "pg_collection") + # Tracking loss. total_loss_dict = {} @@ -2559,6 +2564,7 @@ def get_e2e_base_metrics(): params_norm, num_zeros_in_grad, max_attention_logit, + pg_collection=model_pg_collection, ) # Evaluation. From ad5a222b2ea9727b15fed108ace31c8bbd7b5c80 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 4 Dec 2025 15:44:30 -0800 Subject: [PATCH 175/248] feat: add decorator: experimental_api (#2546) Signed-off-by: Pablo Garay --- ...k_api_backwards_compatibility_workflow.yml | 12 ++++++- docs/api-backwards-compatibility-check.md | 31 ++++++++++++++--- megatron/core/utils.py | 33 +++++++++++++++++++ scripts/check_api_backwards_compatibility.py | 4 +-- 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 707d5f76316..551978cb84a 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -197,7 +197,17 @@ jobs: echo " def internal_helper_function():" echo " pass" echo "" - echo "3️⃣ USE DEPRECATION (For gradual API changes)" + echo "3️⃣ MARK AS EXPERIMENTAL API (If this is experimental code)" + echo " → Add @experimental_api decorator from megatron.core.utils" + echo "" + echo " Example:" + echo " from megatron.core.utils import experimental_api" + echo "" + echo " @experimental_api" + echo " class ExperimentalFeature:" + echo " pass" + echo "" + echo "4️⃣ USE DEPRECATION (For gradual API changes)" echo " → Add @deprecated decorator for transition period" echo " → Example:" echo " from megatron.core.utils import deprecated" diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md index e2fabbf4cd2..0e78eaec669 100644 --- a/docs/api-backwards-compatibility-check.md +++ b/docs/api-backwards-compatibility-check.md @@ -26,7 +26,7 @@ The compatibility checker: ### ⏭️ What Gets Skipped - **Test functions** - Functions starting with `test_` -- **Exempt decorators** - Functions marked with `@internal_api` or `@deprecated` +- **Exempt decorators** - Functions marked with `@internal_api`, `@experimental_api`, or `@deprecated` - **Excluded paths** - Code in `tests/`, `experimental/`, `legacy/` ### ✅ Allowed Changes @@ -57,6 +57,8 @@ python scripts/check_api_backwards_compatibility.py --baseline core_r0.8.0 --cur If you need to make breaking changes to internal or experimental APIs: +#### Internal API (for internal implementation details) + ```python from megatron.core.utils import internal_api @@ -69,11 +71,29 @@ def experimental_feature(x, y): pass ``` -**When to use:** +**When to use `@internal_api`:** - Internal APIs not documented for external use - Experimental features explicitly marked as unstable - Functions in development that haven't been released yet +#### Experimental API (for experimental features) + +```python +from megatron.core.utils import experimental_api + +@experimental_api +def new_experimental_feature(x, y): + """ + This API is experimental and may change without notice. + """ + pass +``` + +**When to use `@experimental_api`:** +- Experimental features explicitly marked as unstable +- New APIs under active development +- Features that haven't been stabilized yet + ### Deprecating APIs For planned API changes, use the deprecation workflow: @@ -196,7 +216,7 @@ Script loads code via griffe: • Current: PR branch ↓ Apply filtering: - • Skip @internal_api and @deprecated + • Skip @internal_api, @experimental_api, and @deprecated • Skip private functions (_prefix) • Skip test/experimental paths ↓ @@ -223,6 +243,7 @@ Edit `scripts/check_api_backwards_compatibility.py`: # Add more exempt decorators EXEMPT_DECORATORS = [ "internal_api", + "experimental_api", "deprecated", ] @@ -255,11 +276,11 @@ The workflow auto-detects the latest `core_r*` tag. To manually specify: ### Q: Can I disable the check for my PR? -**A:** No, but you can mark specific functions as exempt using `@internal_api`. +**A:** No, but you can mark specific functions as exempt using `@internal_api` or `@experimental_api`. ### Q: What if I need to make a breaking change? -**A:** Use the `@deprecated` decorator for a gradual transition, or mark the function as exempt if it's internal/experimental. +**A:** Use the `@deprecated` decorator for a gradual transition, or mark the function as exempt using `@internal_api` (for internal code) or `@experimental_api` (for experimental features). ### Q: Does this check all of Megatron-LM? diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 431b56bd002..91b15dabf74 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -2325,3 +2325,36 @@ class ExperimentalFeature: """ func._internal_api = True return func + + +def experimental_api(func: Callable) -> Callable: + """ + Mark a function or class as experimental API. + + Use this decorator for: + - Experimental features that may change without notice + - New APIs under active development + - Features that are not yet stable + + Objects marked with this decorator will be exempt from backward + compatibility checks, allowing rapid iteration during development. + + Args: + func: The function or class to mark as experimental + + Returns: + The original function/class with an experimental API marker + + Example: + @experimental_api + def new_experimental_feature(): + '''This API is experimental and may change''' + pass + + @experimental_api + class ExperimentalModel: + '''This model is under active development''' + pass + """ + func._experimental_api = True + return func diff --git a/scripts/check_api_backwards_compatibility.py b/scripts/check_api_backwards_compatibility.py index 9c1f29ca890..bf5492c2962 100644 --- a/scripts/check_api_backwards_compatibility.py +++ b/scripts/check_api_backwards_compatibility.py @@ -4,7 +4,7 @@ Megatron Core API Compatibility Checker Simple checker using Griffe to find breaking changes between two versions. -Objects decorated with @internal_api or @deprecated are excluded from checks. +Objects decorated with @internal_api, @experimental_api, or @deprecated are excluded from checks. Usage: python scripts/check_api_backwards_compatibility.py --baseline core_v0.14.0 @@ -44,7 +44,7 @@ # Decorators that exempt objects from compatibility checks -EXEMPT_DECORATORS = ['internal_api', 'deprecated'] +EXEMPT_DECORATORS = ['internal_api', 'deprecated', 'experimental_api'] def has_exempt_decorator(obj: Object) -> bool: From 7d17116bf409059e20df998732b29022a8dae406 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 4 Dec 2025 15:45:04 -0800 Subject: [PATCH 176/248] feat: API compat: ignore AttributeChangedValueBreakage (not a signature change) - dev (#2547) Signed-off-by: Pablo Garay --- ...check_api_backwards_compatibility_workflow.yml | 2 ++ scripts/check_api_backwards_compatibility.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 551978cb84a..002a18194a3 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -62,7 +62,9 @@ jobs: fi # Check for changes in megatron/core Python files (excluding tests and legacy) + # Note: Using both *.py and **/*.py to match files at root and in subdirectories CHANGED_FILES=$(git diff --name-only "$BASE_SHA" HEAD -- \ + 'megatron/core/*.py' \ 'megatron/core/**/*.py' \ ':!megatron/core/tests/**' \ ':!megatron/legacy/**' 2>/dev/null || echo "") diff --git a/scripts/check_api_backwards_compatibility.py b/scripts/check_api_backwards_compatibility.py index bf5492c2962..4977b806433 100644 --- a/scripts/check_api_backwards_compatibility.py +++ b/scripts/check_api_backwards_compatibility.py @@ -46,6 +46,13 @@ # Decorators that exempt objects from compatibility checks EXEMPT_DECORATORS = ['internal_api', 'deprecated', 'experimental_api'] +# Breakage kinds to ignore (not actual API signature changes) +# AttributeChangedValueBreakage: Changing constant values (e.g., VERSION = "1.0" -> "2.0") +# is not a breaking API change - the constant still exists with the same name +IGNORED_BREAKAGE_KINDS = [ + 'AttributeChangedValueBreakage', +] + def has_exempt_decorator(obj: Object) -> bool: """Check if a Griffe object has any exempt decorator. @@ -206,9 +213,10 @@ def get_object_path(change) -> str: def should_skip_change(change, filtered_paths: set) -> bool: - """Determine if a breaking change should be skipped based on exempt decorators. + """Determine if a breaking change should be skipped. A change is skipped if: + - The change kind is in IGNORED_BREAKAGE_KINDS (not a signature change) - The changed object itself is in filtered_paths (exact match) - The changed object is a child of an exempt object (prefix match) @@ -219,6 +227,11 @@ def should_skip_change(change, filtered_paths: set) -> bool: Returns: bool: True if the change should be skipped (filtered out) """ + # Check if this breakage kind should be ignored (not a signature change) + change_kind = type(change).__name__ + if change_kind in IGNORED_BREAKAGE_KINDS: + return True + path = get_object_path(change) if not path: return False From 274e04d21fbcb7f53f63de992ee1217f275f1cf2 Mon Sep 17 00:00:00 2001 From: Parth Mannan <38387286+parthmannan@users.noreply.github.com> Date: Thu, 4 Dec 2025 15:49:09 -0800 Subject: [PATCH 177/248] [Dev] Hybrid Data x Context Parallelism Feature (#2054) Signed-off-by: tailaim Signed-off-by: Parth Mannan Co-authored-by: Mcore Bot Co-authored-by: tailaim Co-authored-by: kunlunl Co-authored-by: Kunlun Li <94586211+kunlunl@users.noreply.github.com> --- megatron/core/datasets/data_schedule.py | 301 ++++++++ megatron/core/datasets/gpt_dataset.py | 18 + .../core/extensions/transformer_engine.py | 20 + megatron/core/model_parallel_config.py | 19 + .../common/embeddings/rotary_pos_embedding.py | 61 +- .../embeddings/yarn_rotary_pos_embedding.py | 44 +- megatron/core/models/gpt/gpt_model.py | 12 +- megatron/core/packed_seq_params.py | 3 + megatron/core/parallel_state.py | 54 ++ .../pipeline_parallel/hybrid_cp_schedule.py | 660 ++++++++++++++++++ megatron/core/pipeline_parallel/schedules.py | 19 + megatron/core/transformer/attention.py | 4 +- .../experimental_attention_variant/dsa.py | 8 +- .../transformer/multi_latent_attention.py | 17 +- megatron/core/utils.py | 108 ++- megatron/legacy/data/data_samplers.py | 71 +- megatron/training/arguments.py | 14 + megatron/training/initialize.py | 1 + megatron/training/training.py | 34 +- megatron/training/utils.py | 96 ++- pretrain_gpt.py | 34 +- pretrain_mamba.py | 7 + tests/unit_tests/test_parallel_state.py | 31 + 23 files changed, 1558 insertions(+), 78 deletions(-) create mode 100644 megatron/core/datasets/data_schedule.py create mode 100644 megatron/core/pipeline_parallel/hybrid_cp_schedule.py diff --git a/megatron/core/datasets/data_schedule.py b/megatron/core/datasets/data_schedule.py new file mode 100644 index 00000000000..0f016473b6a --- /dev/null +++ b/megatron/core/datasets/data_schedule.py @@ -0,0 +1,301 @@ +# Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. + +from typing import Any, List, Optional + +import torch + +from megatron.core import parallel_state +from megatron.core.pipeline_parallel.hybrid_cp_schedule import BalancedCPScheduler +from megatron.core.process_groups_config import ProcessGroupCollection + + +class HybridCPDataLoaderWrapper: + """ + A wrapper class that wraps around an existing data_iterator. + For every __next__ call, + 1. Each DP rank pulls a batch of packed samples. + 2. Extracts the sequence lengths of each sub-sample and all-gathers across the DP group. + 3. Schedules the sub-samples to the DPxCP ranks using the BalancedCPScheduler. + 4. Based on the schedule, reroutes the sub-samples to the correct rank using all-to-all. + 5. Returns the assigned sub-samples to this rank. + + Args: + data_iterator: The original data_iterator to wrap around + config: The config object containing the max_seqlen_per_dp_cp_rank + dp_cp_group: Data parallel context parallel group. + """ + + def __init__( + self, data_iterator, config, pg_collection: Optional[ProcessGroupCollection] = None + ): + self.data_iterator = data_iterator + self.config = config + if pg_collection is None: + self.dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True) + self.dp_group = parallel_state.get_data_parallel_group() + self.tp_group = parallel_state.get_tensor_model_parallel_group() + else: + self.dp_cp_group = pg_collection.dp_cp + self.dp_group = pg_collection.dp + self.tp_group = pg_collection.tp + assert ( + self.dp_cp_group is not None and self.dp_group is not None and self.tp_group is not None + ), "dp_cp_group, dp_group, tp_group must not be None when using hybrid context parallel" + + self.cp_balancing_scheduler = BalancedCPScheduler( + max_seq_len_per_rank=self.config.max_seqlen_per_dp_cp_rank, dp_cp_group=self.dp_cp_group + ) + + self.total_hdp_gpus = self.dp_cp_group.size() + + def __iter__(self): + """Return self as an iterator.""" + return self + + def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[int]: + """ + Gathers the sequence lengths of all subsamples from all DP ranks. + Each DP rank loads the same number of microbatches but each microbatch + may have a different number of subsamples. + + We find the number of subsamples each rank holds and then gather the + sequence lengths of all subsamples from all ranks. + """ + # Collect the number of subsamples from all ranks + local_len = torch.tensor([subsample_seqlens.shape[0]], dtype=torch.int32).cuda() + dp_subsample_count = [torch.zeros_like(local_len) for _ in range(self.dp_group.size())] + torch.distributed.all_gather(dp_subsample_count, local_len, group=self.dp_group) + + # Find the max number of subsamples across all ranks and pad subsample_seqlens to max length + dp_subsample_counts = torch.stack(dp_subsample_count, dim=0).cpu().view(-1) + max_sub_samples = int(dp_subsample_counts.max().item()) + + if local_len.item() < max_sub_samples: + subsample_seqlens_padded = torch.cat( + [ + subsample_seqlens, + torch.zeros(max_sub_samples - local_len.item(), dtype=torch.int32).cuda(), + ], + dim=0, + ) + else: + subsample_seqlens_padded = subsample_seqlens + + # Gather the subsample_seqlens from all ranks + seqlens_gathered = [ + torch.empty_like(subsample_seqlens_padded) for _ in range(self.dp_group.size()) + ] + torch.distributed.all_gather( + seqlens_gathered, subsample_seqlens_padded, group=self.dp_group + ) + + # Trim each seqlens_gathered to the length of the correct sample + for dp_rank, seqlen in enumerate(seqlens_gathered): + seqlens_gathered[dp_rank] = seqlen[: dp_subsample_counts[dp_rank]] + + seqlens_gathered = torch.cat(seqlens_gathered, dim=0) + seqlens_gathered = seqlens_gathered.cpu().tolist() + + # Calculate the offsets to assign unique global ID to each subsample. + csum = torch.cumsum(dp_subsample_counts, dim=0, dtype=torch.int32) + offsets = torch.cat([torch.zeros(1, dtype=torch.int32), csum[:-1]], dim=0) + + return seqlens_gathered, offsets + + def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens_gathered): + """ + Calculates the global ID for each subsample. + + We assign a unique global ID to each subsample. + + Returns: + global_id_seqlens: list of (global_id, seqlen) tuples for scheduling. + global_ids_this_rank: list of global IDs locally present on this rank. + """ + dp_rank = self.dp_group.rank() + global_ids = torch.arange(len(seqlens_gathered), dtype=torch.int32).cuda() + # Create a list of (global_id, seqlen) tuples for scheduling + global_id_seqlens = [(i, seqlens_gathered[i]) for i in range(len(global_ids))] + # Get the global IDs locally present on this rank + global_ids_this_rank = global_ids[ + offsets[dp_rank] : offsets[dp_rank] + num_local_subsamples + ] + + return global_id_seqlens, global_ids_this_rank + + def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int: + dp_src_rank = torch.bucketize(gid, offsets[1:] - 1) + # Since the torch.distributed.get_process_group_ranks + # provides the global rank, we need to consider TP + hdp_rank = ( + torch.distributed.get_process_group_ranks(self.dp_group)[dp_src_rank] + // self.tp_group.size() + ) + return hdp_rank + + def reroute_samples_to_hdp_ranks( + self, batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets + ): + """ + Reroutes the sub-samples to the correct rank after scheduling. + + For each key in the batch dict, we perform an all-to-all communication + to transfer the data to the correct ranks. + Since all CP ranks within a DP group have the same data, we only need + to transfer data between matching CP ranks. + """ + gid2local_id = {int(gid): i for i, gid in enumerate(global_ids_this_rank)} + hdp_rank = self.dp_cp_group.rank() + dp_ranks = torch.distributed.get_process_group_ranks(self.dp_group) + # Here we actually want to get the DP group's rank within the HDP group, + # we need to consider TP + dp_ranks = [r // self.tp_group.size() for r in dp_ranks] + + data_keys = batch[0].keys() + + # Create the send plan + combined_sample_id_groups: List[List[int]] = [[] for _ in range(self.total_hdp_gpus)] + + for d in range(self.total_hdp_gpus): + for sample_id_group in sample_id_groups: + combined_sample_id_groups[d].extend(sample_id_group[d]) + + for dest_rank in range(self.total_hdp_gpus): + combined_sample_id_groups[dest_rank].sort() + + # Filter out samples that are not present on this rank + send_ids_sorted = [ + gid + for d in dp_ranks + for gid in combined_sample_id_groups[d] + if gid in global_ids_this_rank + ] + # send_counts = [len(combined_sample_id_groups[d]) for d in range(self.total_hdp_gpus)] + + send_lens_split = [0] * self.total_hdp_gpus + for dest_rank in range(self.total_hdp_gpus): + if dest_rank in dp_ranks: + send_lens_split[dest_rank] = sum( + [ + global_id_seqlens[gid][1] + for gid in combined_sample_id_groups[dest_rank] + if gid in global_ids_this_rank + ] + ) + else: + # We only need to share local data with DP ranks that have different data. + send_lens_split[dest_rank] = 0 + + # Create the recv plan + recv_sample_id_groups = [[] for _ in range(self.total_hdp_gpus)] + for gid in combined_sample_id_groups[hdp_rank]: + src_rank = self._gid_to_src_rank(gid, offsets) + recv_sample_id_groups[src_rank].append(gid) + + recv_lens_split = [0] * self.total_hdp_gpus + for src_rank in range(self.total_hdp_gpus): + recv_lens_split[src_rank] = sum( + [global_id_seqlens[gid][1] for gid in recv_sample_id_groups[src_rank]] + ) + + recv_ids_sorted = [ + gid for d in range(self.total_hdp_gpus) for gid in recv_sample_id_groups[d] + ] + recv_counts = [len(recv_sample_id_groups[d]) for d in range(self.total_hdp_gpus)] + + recv_samples = [{k: None for k in data_keys} for _ in range(sum(recv_counts))] + + def _pack_sample_by_key(key: str) -> torch.Tensor: + flattened_tensors = [] + for gid in send_ids_sorted: + t = batch[gid2local_id[gid]][key].to(torch.cuda.current_device(), non_blocking=True) + flattened_tensors.append(t) + return ( + torch.cat(flattened_tensors, dim=0) + if flattened_tensors + else torch.empty(0, device=torch.cuda.current_device(), dtype=batch[0][key].dtype) + ) + + def _unpack_sample_by_key(key: str, recv_tensor: torch.Tensor): + cursor = 0 + for i, gid in enumerate(recv_ids_sorted): + sample_len = global_id_seqlens[gid][1] + recv_samples[i][key] = recv_tensor[cursor : cursor + sample_len] + cursor += sample_len + + for key in data_keys: + send_tensor = _pack_sample_by_key(key) + recv_tensor = torch.empty( + sum(recv_lens_split), device=torch.cuda.current_device(), dtype=send_tensor.dtype + ) + torch.distributed.all_to_all_single( + output=recv_tensor, + input=send_tensor, + output_split_sizes=recv_lens_split, + input_split_sizes=send_lens_split, + group=self.dp_cp_group, + ) + _unpack_sample_by_key(key, recv_tensor) + + recv_sample_with_id = { + recv_id: recv_samples[i] for i, recv_id in enumerate(recv_ids_sorted) + } + return recv_sample_with_id + + def unpack_batch(self, batch): + """ + Unpacks the packed samples into a list of sub-samples. + Since each sub-sample may be routed to different DPxCP ranks, + we unpack the sample here to avoid unnecessarily transferring + the entire packed sample. + """ + batch_unpacked = [] + for sample in batch: + for sub_sample in range(sample["cu_seqlens"].shape[0] - 1): + sub_sample_dict = {} + start_idx = sample["cu_seqlens"][sub_sample] + end_idx = sample["cu_seqlens"][sub_sample + 1] + if end_idx - start_idx == 0: + continue + for key in sample.keys(): + if key in ["cu_seqlens", "batch_idx", "max_seqlen"]: + continue + sub_sample_dict[key] = sample[key][start_idx:end_idx] + batch_unpacked.append(sub_sample_dict) + return batch_unpacked + + def __next__(self) -> Any: + """ + Get the next item from the dataset, pull scheduling metadata and return it. + """ + if self.data_iterator is None: + # TP0 reads from data_iterator, others receive via broadcast. + return None, None + else: + batch = next(self.data_iterator) + subsample_seqlens = [] + for sample in batch: + subsample_seqlens.extend( + [ + int(sample["cu_seqlens"][i + 1] - sample["cu_seqlens"][i]) + for i in range(0, sample["cu_seqlens"].shape[0] - 1) + ] + ) + subsample_seqlens = torch.tensor(subsample_seqlens, dtype=torch.int32).cuda() + subsample_seqlens = subsample_seqlens[subsample_seqlens != 0] + + seqlens_gathered, offsets = self.get_global_seqlens(subsample_seqlens) + + global_id_seqlens, global_ids_this_rank = self.get_global_id_seqlens( + subsample_seqlens.shape[0], offsets, seqlens_gathered + ) + + groups, sample_id_groups = self.cp_balancing_scheduler.get_groups_and_subsamples( + global_id_seqlens, self.config + ) + + batch = self.unpack_batch(batch) + samples_this_rank_with_id = self.reroute_samples_to_hdp_ranks( + batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets + ) + return samples_this_rank_with_id, sample_id_groups diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py index 710a4c684ff..f50a6a77f57 100644 --- a/megatron/core/datasets/gpt_dataset.py +++ b/megatron/core/datasets/gpt_dataset.py @@ -49,6 +49,24 @@ class GPTDatasetConfig(BlendedMegatronDatasetConfig): object_storage_cache_path: Optional[str] = None """Path for caching indices for s3 or msc dataloading.""" + context_parallel_size: int = 1 + """Option to enable context parallelism""" + + data_parallel_size: int = 1 + """Option to enable data parallelism""" + + sequence_parallel_size: int = 0 + """Option to indicate the sequence parallelism size when using TP + Set to 0 if sequence parallel is not enabled regardless of TP size. + """ + + hybrid_context_parallel: bool = False + """Option to enable hybrid context parallelism. When setting this to True, + each sample should be divisible by the data parallel size * context parallel size * 2. + If sequence parallel is enabled, it should be divisible by the + data parallel size * context parallel size * sequence parallel size * 2. + """ + def __post_init__(self) -> None: """Do asserts and set fields post init""" super().__post_init__() diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index ab9962cfb1c..acb93ef7853 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1005,6 +1005,7 @@ def __init__( self.kept_packed_seq_params = set( field.name for field in dataclasses.fields(PackedSeqParams) ) + if get_te_version() < PkgVersion("1.3.0"): # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H # copies (#555) @@ -1055,6 +1056,25 @@ def forward( packed_seq_params: PackedSeqParams = None, ): """Forward.""" + if packed_seq_params is not None: + # If Dynamic CP group is provided, update TE DPA CP group + if packed_seq_params.cp_group is not None: + self.cp_group = packed_seq_params.cp_group + super().set_context_parallel_group( + self.cp_group, + torch.distributed.get_process_group_ranks(self.cp_group), + TEDotProductAttention.cp_stream, + self.cp_comm_type, + ) + # If cp_group is None but local_cp_size is provided, + # Indicates to turn off CP dynamically + elif packed_seq_params.local_cp_size is not None: + assert ( + packed_seq_params.local_cp_size == 1 + ), "local_cp_size must be == 1 if provided without cp_group" + super().set_context_parallel_group(None, None, None, self.cp_comm_type) + self.kept_packed_seq_params.discard("cp_group") + self.kept_packed_seq_params.discard("local_cp_size") packed_seq_kwargs = ( {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params} if packed_seq_params is not None diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index e31fcd2577e..e75ff4a0273 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -6,8 +6,11 @@ import torch +from megatron.core.utils import internal_api + @dataclass +@internal_api class ModelParallelConfig: """Base configuration for Megatron Core @@ -53,6 +56,22 @@ class ModelParallelConfig: type. """ + max_seqlen_per_dp_cp_rank: Optional[int] = None + """ + Maximum sequence length per DPxCP rank. This is the maximum sequence length each rank + can handle without overflowing the memory. Typically, a good starting point is to set this + to maximum sequence length / context parallel size. + This is used to calculate the number and length of sub-samples assigned to + each rank when using hybrid_context_parallel. + """ + + hybrid_context_parallel: bool = False + """ + If true, enables hybrid context parallel. This is used to balance the workload of + each CP rank when we use packed samples with variable sequence lengths. + Please set max_seqlen_per_dp_cp_rank when using hybrid_context_parallel. + """ + expert_model_parallel_size: int = 1 """Distributes Moe Experts across sub data parallel dimension.""" diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 0d7d5e626d0..5d7b69cd34e 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -25,7 +25,7 @@ apply_rotary_pos_emb, get_pos_emb_on_this_cp_rank, ) -from megatron.core.utils import deprecate_inference_params +from megatron.core.utils import deprecate_inference_params, internal_api logger = logging.getLogger(__name__) @@ -148,13 +148,12 @@ def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> (Tensor, Tensor): return cos, sin @lru_cache(maxsize=32) - def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor: - """Forward pass of RoPE embedding. + def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor: + """Forward pass of RoPE embedding before CP sharding. Args: max_seq_len (int): Maximum size of sequence offset (int, optional): RoPE offset. Defaults to 0. - packed_seq (bool, optional): Whether to use packed sequence. Defaults to False. Returns: Tensor: Embeddings after applying RoPE. @@ -174,10 +173,35 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) - ) # emb [seq_length, .., dim] emb = emb[:, None, None, :] - if self.cp_group is not None and self.cp_group.size() > 1 and not packed_seq: - # slice rotary_pos_emb along sequence dimension and select the parition of the current - # CP rank - emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group) + return emb + + @internal_api + def forward( + self, max_seq_len: int, offset: int = 0, packed_seq_params: Optional[PackedSeqParams] = None + ) -> Tensor: + """Forward pass of RoPE embedding. + + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): RoPE offset. Defaults to 0. + packed_seq_params (PackedSeqParams, optional): Packed sequence params. Defaults to None. + + Returns: + Tensor: Embeddings after applying RoPE. + """ + emb = self.get_emb(max_seq_len, offset) + packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd' + if packed_seq_params is not None and packed_seq_params.local_cp_size is not None: + # Set CP group to dynamic CP group for CP slicing + cp_group = packed_seq_params.cp_group + else: + cp_group = self.cp_group + + if cp_group is not None and cp_group.size() > 1 and not packed_seq: + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group) + return emb def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): @@ -279,13 +303,19 @@ def __init__( else parallel_state.get_context_parallel_group(check_initialized=False) ) - def forward(self, position_ids: torch.Tensor, mrope_section: List[int]) -> Tensor: + def forward( + self, + position_ids: torch.Tensor, + mrope_section: List[int], + packed_seq_params: Optional[PackedSeqParams] = None, + ) -> Tensor: """Forward pass of multimodal RoPE embedding. Args: position_ids (torch.Tensor): A postion_id tensor with shape [3, batchsize, seqlens] mrope_section (list[int]): Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. + packed_seq_params (PackedSeqParams, optional): Packed sequence params. Defaults to None. Returns: Tensor: Embeddings after applying RoPE. @@ -318,8 +348,17 @@ def forward(self, position_ids: torch.Tensor, mrope_section: List[int]) -> Tenso # shape (seq_length, bs, 1, 2 * dim) emb = emb[..., None, :].transpose(0, 1).contiguous() - if self.cp_group is not None and self.cp_group.size() > 1: + if packed_seq_params is not None and packed_seq_params.local_cp_size is not None: + if packed_seq_params.local_cp_size > 1: + # Set CP group to dynamic CP group for CP slicing + cp_group = packed_seq_params.cp_group + else: + # Set CP group to None to avoid CP slicing + cp_group = None + else: + cp_group = self.cp_group + if cp_group is not None and cp_group.size() > 1: # slice rotary_pos_emb along sequence dimension and select the parition of the current # CP rank - emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group) + emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group) return emb diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index bcbb74b0dff..c2ef638050c 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -13,6 +13,7 @@ from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding from megatron.core.transformer import TransformerConfig +from megatron.core.utils import internal_api logger = logging.getLogger(__name__) @@ -99,13 +100,12 @@ def __init__( ) @lru_cache(maxsize=32) - def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor: + def get_emb(self, max_seq_len: int, offset: int = 0) -> Tensor: """Forward pass of Yarn Rotary Embedding. Args: max_seq_len (int): Maximum size of sequence offset (int, optional): RoPE offset. Defaults to 0. - packed_seq (bool, optional): Whether to use packed sequence. Defaults to False. Returns: Tensor: Embeddings after applying Yarn RoPE. @@ -151,19 +151,44 @@ def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) - emb = torch.cat((freqs, freqs), dim=-1) # emb [seq_length, .., dim] emb = emb[:, None, None, :] - if self.cp_group is not None and self.cp_group.size() > 1 and not packed_seq: + return emb, _mscale + + @internal_api + def forward( + self, max_seq_len: int, offset: int = 0, packed_seq_params: Optional[PackedSeqParams] = None + ) -> Tensor: + """Forward pass of Yarn Rotary Embedding. + + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): RoPE offset. Defaults to 0. + packed_seq_params (PackedSeqParams, optional): Packed sequence params. Defaults to None. + + Returns: + Tensor: Embeddings after applying Yarn RoPE. + """ + emb, _mscale = self.get_emb(max_seq_len, offset) + packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd' + if packed_seq_params is not None and packed_seq_params.local_cp_size is not None: + # Set CP group to dynamic CP group for CP slicing + cp_group = packed_seq_params.cp_group + else: + cp_group = self.cp_group + if cp_group is not None and cp_group.size() > 1 and not packed_seq: # slice rotary_pos_emb along sequence dimension # and select the parition of the current CP rank - emb = get_pos_emb_on_this_cp_rank(emb, 0, self.cp_group) + emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group) return emb, _mscale - def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False): + def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq_params=None): self.max_seq_len_cached = seq_len self.offset_cached = offset self.dtype_cached = dtype - self.packed_seq_cached = packed_seq + self.packed_seq_cached = ( + packed_seq_params is not None and packed_seq_params.qkv_format == 'thd' + ) - emb, _mscale = self.forward(seq_len, offset, packed_seq) + emb, _mscale = self.forward(seq_len, offset, packed_seq_params) self.register_buffer( "cos_cached", (emb.cos() * _mscale).to(dtype).contiguous(), persistent=False ) @@ -172,16 +197,17 @@ def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False): ) def get_cached_cos_sin( - self, seq_len, offset=0, dtype=torch.get_default_dtype(), packed_seq=False + self, seq_len, offset=0, dtype=torch.get_default_dtype(), packed_seq_params=None ): """Get cached cos and sin values.""" + packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd' if ( seq_len > self.max_seq_len_cached or offset != self.offset_cached or dtype != self.dtype_cached or packed_seq != self.packed_seq_cached ): - self._set_cos_sin_cache(seq_len, offset, dtype, packed_seq) + self._set_cos_sin_cache(seq_len, offset, dtype, packed_seq_params) return (self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index a3d1a8bfc00..70eea932683 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -344,16 +344,16 @@ def _preprocess( inference_context, self.decoder, decoder_input, self.config, packed_seq_params ) rotary_pos_emb = self.rotary_pos_emb( - rotary_seq_len, - packed_seq=packed_seq_params is not None - and packed_seq_params.qkv_format == 'thd', + rotary_seq_len, packed_seq_params=packed_seq_params ) elif self.position_embedding_type == 'yarn': if self.training or not self.config.flash_decode: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( inference_context, self.decoder, decoder_input, self.config, packed_seq_params ) - rotary_pos_emb, _ = self.rotary_pos_emb(rotary_seq_len) + rotary_pos_emb, _ = self.rotary_pos_emb( + rotary_seq_len, packed_seq_params=packed_seq_params + ) else: raise NotImplementedError( "Flash decoding uses precomputed cos and sin for RoPE, not implemented in " @@ -361,7 +361,9 @@ def _preprocess( ) elif self.position_embedding_type == 'mrope' and not self.config.multi_latent_attention: if self.training or not self.config.flash_decode: - rotary_pos_emb = self.rotary_pos_emb(position_ids, self.mrope_section) + rotary_pos_emb = self.rotary_pos_emb( + position_ids, self.mrope_section, packed_seq_params=packed_seq_params + ) else: # Flash decoding uses precomputed cos and sin for RoPE raise NotImplementedError( diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py index 330d0e03471..08ebdac67d8 100644 --- a/megatron/core/packed_seq_params.py +++ b/megatron/core/packed_seq_params.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from dataclasses import dataclass +import torch.distributed as dist from torch import Tensor @@ -18,3 +19,5 @@ class PackedSeqParams: cu_seqlens_kv_padded: Tensor = None max_seqlen_q: int = None max_seqlen_kv: int = None + local_cp_size: int = None + cp_group: dist.ProcessGroup = None diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index 1e41bf9d8c2..fd0d0d9b9d9 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -6,6 +6,7 @@ import os import warnings from datetime import timedelta +from math import log2 from typing import Callable, List, Optional import numpy as np @@ -110,6 +111,8 @@ _CONTEXT_PARALLEL_GLOBAL_RANKS = None # Hierarchical context parallel groups _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS = None +# Hybrid context parallel groups +_HYBRID_DP_CP_GROUPS = {} # Data parallel group information with context parallel combined. _DATA_PARALLEL_GROUP_WITH_CP = None @@ -410,6 +413,31 @@ def create_hierarchical_groups( return hierarchical_groups, hierarchical_groups_gloo +def create_hybrid_dp_cp_groups(rank, ranks, pg_options): + """ + Creates groups required for hybrid DPxCP. + Creates a new group for every power of 2 up to the number of DPxCP ranks. + Returns a dictionary indexed by group size. + """ + hybrid_dp_cp_groups = {} + # Generate group for every power of 2 up to the number of CP ranks + # We limit the allowed group sizes in order to avoid excessive overhead. + group_sizes = [2**i for i in range(int(log2(len(ranks))))][1:] + for group_size in group_sizes: + for i in range(0, len(ranks), group_size): + group = create_group( + ranks[i : i + group_size], + pg_options=pg_options, + group_desc=f"HYBRID_DP_CP_GROUP_{group_size}", + ) + if rank in ranks[i : i + group_size]: + assert ( + group_size not in hybrid_dp_cp_groups + ), f"Rank {rank} appears in multiple Hybrid DP CP groups of size {group_size}" + hybrid_dp_cp_groups[group_size] = group + return hybrid_dp_cp_groups + + class RankGenerator(object): """A class for generating rank groups for different modes of parallelism.""" @@ -530,6 +558,7 @@ def initialize_model_parallel( create_gloo_process_groups: bool = True, high_priority_stream_groups: Optional[List[str]] = None, sharp_enabled_group: Optional[str] = None, + hybrid_context_parallel: bool = False, ) -> None: """Initialize model data parallel groups. @@ -881,6 +910,19 @@ def initialize_model_parallel( if "NCCL_COLLNET_ENABLE" in os.environ: del os.environ["NCCL_COLLNET_ENABLE"] + if hybrid_context_parallel: + global _HYBRID_DP_CP_GROUPS + for ranks_with_cp in decoder_rank_generator.get_ranks('dp-cp'): + assert ( + len(ranks_with_cp) % 2 == 0 + ), "Hybrid context parallel requires an even number of ranks" + _HYBRID_DP_CP_GROUPS.update( + create_hybrid_dp_cp_groups( + rank, ranks_with_cp, get_nccl_options("dp_cp", nccl_comm_cfgs) + ) + ) + # TODO: Are gloo groups needed for hybrid cp? + for ranks in decoder_rank_generator.get_ranks('dp'): group = create_group( ranks, @@ -1395,6 +1437,18 @@ def get_hierarchical_context_parallel_groups(check_initialized=True): return _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS +def get_hybrid_data_context_parallel_groups(check_initialized=True, group_size=None): + """Get the hybrid context parallel groups the caller rank belongs to.""" + # If the group size is the same as the entire DPxCP group, return the original group + if get_data_parallel_world_size(with_context_parallel=True) == group_size: + if check_initialized: + assert _DATA_PARALLEL_GROUP_WITH_CP is not None + return _DATA_PARALLEL_GROUP_WITH_CP + if check_initialized: + assert _HYBRID_DP_CP_GROUPS is not None + return _HYBRID_DP_CP_GROUPS[group_size] + + def get_embedding_group(check_initialized=True): """Get the embedding group the caller rank belongs to.""" if check_initialized: diff --git a/megatron/core/pipeline_parallel/hybrid_cp_schedule.py b/megatron/core/pipeline_parallel/hybrid_cp_schedule.py new file mode 100644 index 00000000000..27b5fc87945 --- /dev/null +++ b/megatron/core/pipeline_parallel/hybrid_cp_schedule.py @@ -0,0 +1,660 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +from collections import deque +from functools import lru_cache +from math import ceil, log2 +from typing import Callable, List, Optional, Tuple + +import torch + +from megatron.core import parallel_state +from megatron.core.rerun_state_machine import RerunDataIterator + + +class BalancedCPScheduler: + """ + This class provides the functionality to form groups of sub-samples + such that all DPxCP ranks have a roughly balanced workload in the group. + """ + + def __init__(self, max_seq_len_per_rank: int, dp_cp_group: torch.distributed.ProcessGroup): + self.max_seq_len_per_rank = max_seq_len_per_rank + self.num_subsamples = 0 + self.num_subsamples_processed = 0 + self.free_resources = [] + self.total_hdp_gpus = dp_cp_group.size() + + @lru_cache(maxsize=128) + def get_total_workload(self, seq_length: int, cp_size: Optional[int] = None): + """ + seq_length: sequence length of a sub-sample + cp_size: total number of CP ranks working on this sub-sample + + Note: + This function is used to estimate the relative workload intensity + of a sub-sample. This is not meant to be an accurate flops calculator. + + Returns: workload of a sub-sample + """ + if cp_size is None: + cp_size = self.gpus_needed(seq_length) + return (seq_length * seq_length) / cp_size + + @lru_cache(maxsize=128) + def gpus_needed(self, seq_len: int) -> int: + """ + Calculates the number of GPUs needed for a given sequence length + and max sequence length per CP rank. + This is used to determine the CP size of a sub-sample. + + The number is rounded up to the next power of 2 to match the available + hybrid context parallel process group sizes. + """ + return max(1, 2 ** ceil(log2((seq_len / self.max_seq_len_per_rank)))) + + def make_buckets_equal( + self, + sample_seqlens: List[Tuple[int, int]], # List of (sample_id, sequence_length) tuples + compute_estimator: Callable[[int], float], + ) -> List[deque]: + """ + Makes as many buckets as unique CP sizes needed. + This keeps sample IDs tethered to their sequence lengths throughout the bucketing process. + """ + # Extract just the sequence lengths for determining k + seqlens = [seq_len for _, seq_len in sample_seqlens] + + # Determine k based on unique GPU categories needed + k = len({self.gpus_needed(L) for L in seqlens}) + + # Create a work target for each bucket + # This is the total work divided by the number of buckets + work = [] + for _, s in sample_seqlens: + cp_size = self.gpus_needed(s) + work.append(compute_estimator(s, cp_size)) + total_work = sum(work) + target = total_work / k + buckets, cur, cur_work = [], [], 0.0 + remaining_work = total_work + remaining_k = k + + for i, (sample_id, seq_len) in enumerate(sample_seqlens): + work = compute_estimator(seq_len) + projected = cur_work + work + + # Check if we should close this bucket + if cur and ( + projected > target * 1.1 # Too much work + or len(sample_seqlens) - i <= remaining_k - len(buckets) + ): # Need to save sequences for remaining buckets + buckets.append(deque(cur)) + cur, cur_work = [], 0.0 + remaining_work -= sum(compute_estimator(seq_len) for _, seq_len in cur) + remaining_k -= 1 + + cur.append((sample_id, seq_len)) + cur_work += work + + if cur: + buckets.append(deque(cur)) + + return buckets + + def next_hdp_group( + self, + sample_seqlens: List[Tuple[int, int]], # List of (sample_id, sequence_length) tuples + compute_estimator: Callable[[int], float], + total_gpus: int, + delta: float = 0.05, # balance slack (e.g. 5 %) + strategy: str = "dp", # "dp" or "pp" + eps_bucket: float = 0.10, # ε target for bucket balance + ) -> Tuple[List[List[int]], List[Tuple[int, int]], List[float], List[List[int]]]: + """ + Given a list of (sample_id, sequence_length) tuples, this function aims to assign + sequences in a group such that all GPUs in the DPxCP group have a roughly balanced + workload. Once each group is roughly balanced, we exit and return the + group and the leftover sequences. + + The function performs the following passes in order to form a balanced microbatch: + 1. We create buckets of sequences that are roughly balanced. + We try to create as many buckets as possible CP sizes. + 2. Given a bucket has sequences available, we assign the sample + a. To a new set of GPUs if there are enough free GPUs. + b. To an existing set of GPUs with the lowest load. + 3. We check if the group is balanced whenever we need to move onto a new CP size + in the same set of GPUs. + 4. We trim the group if removing the last added sequence helps improve balance. + 5. If we run out of sequences to assign and there are empty GPUs, + we redistribute work to empty GPUs by recursively increasing the CP size of a + sample until no empty GPUs are left. + + Returns (micro_batches, leftover_sample_seqlens, exec_times, sample_ids_per_gpu). + """ + if not sample_seqlens: + return ( + [[] for _ in range(total_gpus)], + [], + [0.0 for _ in range(total_gpus)], + [[] for _ in range(total_gpus)], + ) + + # Get buckets of sequences with balanced work + buckets = self.make_buckets_equal(sample_seqlens, compute_estimator) + + # Initialize tracking structures + micro_batches = [[] for _ in range(total_gpus)] + exec_times = [0.0 for _ in range(total_gpus)] + sample_ids_per_gpu = [[] for _ in range(total_gpus)] + + gpu_group_id = [None] * total_gpus + group_members = {} + group_size = {} + next_gid = 0 + + pp_cursor = 0 + prev_needed = None + check_balance = False + + while buckets: + # ---- Step 1 – pick the next sequence we COULD place ------------------ + sample_seq_tuple = bucket_idx = None + needed = None + + scan_order = ( + range(len(buckets)) + if strategy == "dp" + else [(pp_cursor + i) % len(buckets) for i in range(len(buckets))] + ) + + for idx in scan_order: + if not buckets[idx]: + continue + cand_tuple = buckets[idx][0] # This is now (sample_id, seq_len) + cand_seq_len = cand_tuple[1] + needed = self.gpus_needed(cand_seq_len) + + # (a) Do we have an *existing* group of size `needed`? + candidate_gids = [gid for gid, sz in group_size.items() if sz == needed] + + # (b) Or enough completely free GPUs to start a new group? + free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None] + if candidate_gids or len(free_ranks) >= needed: + sample_seq_tuple, bucket_idx = cand_tuple, idx + break + + # No place to put any remaining sequence – finish this micro‑batch + if sample_seq_tuple is None: + break + + # TODO[pmannan]: PP not yet supported. Add PP scheduling. + if strategy == "pp": + pp_cursor = (bucket_idx + 1) % len(buckets) + + sample_id, seq_len = sample_seq_tuple + needed = self.gpus_needed(seq_len) + if prev_needed is None: + prev_needed = needed + + # (a) Existing groups of exactly this size + candidate_gids = [gid for gid, sz in group_size.items() if sz == needed] + if candidate_gids: + best_gid, best_load = min( + ( + (gid, max(exec_times[r] for r in group_members[gid])) + for gid in candidate_gids + ), + key=lambda t: t[1], + ) + else: + best_gid, best_load = None, float("inf") + + # (b) Hypothetical **new** group from completely free GPUs + free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None] + if len(free_ranks) >= needed: + free_sorted = sorted(free_ranks, key=lambda r: exec_times[r]) + new_members = free_sorted[:needed] + new_load = exec_times[new_members[-1]] + + if new_load < best_load: + best_gid = None + chosen_members = new_members + else: + chosen_members = group_members[best_gid] + else: + chosen_members = group_members[best_gid] + + # ---- Step 2 – if we decided to create a fresh group ---------------- + if best_gid is None: + best_gid = next_gid + next_gid += 1 + group_members[best_gid] = chosen_members + group_size[best_gid] = needed + for r in chosen_members: + gpu_group_id[r] = best_gid + + # ---- Step 3 – assign the sequence to every member of that group ------ + per_gpu_cost = compute_estimator(seq_len) + + for r in chosen_members: + micro_batches[r].append(seq_len) + exec_times[r] += per_gpu_cost + sample_ids_per_gpu[r].append(sample_id) + + # Remove the sequence definitively from its bucket + buckets[bucket_idx].popleft() + + # ---- Step 4 – tidy, balance‑check, maybe early‑exit ------------------ + while buckets and not buckets[0]: + buckets.pop(0) + pp_cursor %= max(1, len(buckets)) + + # TODO: Removing this helps reduce the number of groups when we have + # lots of samples with same CP size. + # But because we don't exit as soon as we get balanced, + # even if there is one group available that can take the next sample, + # we will keep adding samples to the same group. + # trim_overload() does not help because it only checks if removing the + # last added sample helps. + # We cannot check after adding every sample because there will always be imbalance + # if we don't wait for future scheduling. + + # IMPORTANT: So we need a solution here + if needed < prev_needed: + # When we get into a lower CP size in the same group, + # we can start checking for balance. There is still a gotcha here. + # Let's say we have a group of 3 GPU 0-2, then we move onto group of 2. + # We keep assigning group of 2 as we do in descending order but GPU 7/15 + # never sees a microbatch assigned to it + # until we run out of samples with CP2. + # This means we are never balanced as min(exec_times) will always be 0. + # We need a smart way of identifying that we have run out of big samples + # and if we are having to assign work to a GPU already working, + # is it because there are empty GPUs? + # Would assigning work to empty GPUs first by moving onto next CP bucket help? + # But we need to remember to come back to this CP size bucket and then + # check for balance. Maybe the scheduling algorithm should look at empty + # GPUs and find work rather than going sequence by sequence. + check_balance = True + + if ( + check_balance + and buckets + and max(exec_times) - min(exec_times) <= delta * max(exec_times) + ): + break + + # Gather leftovers (flatten remaining buckets, preserve order) + leftovers = [] + for b in buckets: + for sample_seq_tuple in b: + leftovers.append(sample_seq_tuple) + + # --------------------------------------------------------------------------- + def trim_overload(): + """ + Iteratively pop the most‑recent sequence from the *most‑loaded group* + whenever doing so reduces the global slack. + """ + while True: + cur_max = max(exec_times) + cur_min = min(exec_times) + cur_slack = cur_max - cur_min + if cur_slack <= delta * cur_max: + # Slack is already within limit. + break + if cur_min == 0: + # There are empty GPUs that will be + # handled in the next step. + break + + max_r = exec_times.index(cur_max) + gid = gpu_group_id[max_r] + members = group_members[gid] + + if not micro_batches[max_r] or len(micro_batches[max_r]) <= 1: + break + + seq = micro_batches[max_r][-1] + need = group_size[gid] + per_gpu_cost = compute_estimator(seq) + + proj_times = exec_times[:] + for r in members: + proj_times[r] -= per_gpu_cost + + proj_slack = max(proj_times) - min(proj_times) + + # Check if trimming the workload helps imbalance + if proj_slack < cur_slack: + sample_id_to_remove = sample_ids_per_gpu[max_r][-1] + for r in members: + micro_batches[r].pop() + exec_times[r] -= per_gpu_cost + sample_ids_per_gpu[r].pop() + leftovers.append((sample_id_to_remove, seq)) + else: + break + + trim_overload() + + # Track samples in this group before redistribution to empty GPUs + total_work_before = sum(len(mb) for mb in micro_batches) + + # Check for empty GPUs and redistribute work + def fill_empty_gpus( + micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size + ): + """ + Recursively check for empty GPUs and redistribute work by increasing + the number of GPUs sharing samples. This ensures all GPUs have work. + GPUs must be allocated consecutively so we may need to push existing + work to other ranks in order to expand samples. + """ + # Find empty GPUs + empty_gpus = [i for i in range(total_gpus) if not micro_batches[i]] + if not empty_gpus: + return ( + micro_batches, + exec_times, + sample_ids_per_gpu, + group_members, + group_size, + ) # No empty GPUs, we're done + + # Find the smallest group size that exists + existing_group_sizes = set(group_size.values()) + assert ( + existing_group_sizes + ), "There should be at least one group existing, cannot reditribute, " + "try to increase 'max-seqlen-per-cp-rank'." + + min_group_size = min(existing_group_sizes) + # We have Hybrid DPxCP groups for every power of 2 of GPUs or the entire DPxCP group. + next_power = min(min_group_size * 2, total_gpus) + + # Find the first group of min_group_size that can be expanded + expandable_gid = None + expandable_members = None + expandable_new_gpus = None + + for gid, size in group_size.items(): + if size == min_group_size: + members = group_members[gid] + needed_count = next_power - min_group_size + group_start_gpu = members[0] + group_end_gpu = members[-1] + empty_gpu = [idx for idx, work in enumerate(micro_batches) if not work][0] + assert not all( + work for work in micro_batches[empty_gpu : empty_gpu + needed_count] + ), f"Empty GPUs were detected but not enough to expand." + work_to_push = micro_batches[ + group_end_gpu + 1 : empty_gpu + ] # This is work of all other subsequent sub-samples + exec_times_to_push = exec_times[group_end_gpu + 1 : empty_gpu] + sample_ids_to_push = sample_ids_per_gpu[group_end_gpu + 1 : empty_gpu] + + new_micro_batches = [[]] * len(micro_batches) + new_exec_times = [0.0] * len(exec_times) + new_sample_ids_per_gpu = [[]] * len(sample_ids_per_gpu) + + # No change in work until the group selected for expansion + for i in range(group_start_gpu): + new_micro_batches[i] = micro_batches[i] + new_exec_times[i] = exec_times[i] + new_sample_ids_per_gpu[i] = sample_ids_per_gpu[i] + + # The work is distributed across the expanded group + for i in range(group_start_gpu, group_end_gpu + needed_count + 1): + new_micro_batches[i] = micro_batches[group_end_gpu] + new_exec_times[i] = self.get_total_workload( + micro_batches[group_end_gpu][0], next_power + ) + new_sample_ids_per_gpu[i] = sample_ids_per_gpu[group_end_gpu] + + # Any assigned work on expanded GPUs is pushed + for i, work in enumerate(work_to_push): + new_micro_batches[group_end_gpu + needed_count + 1 + i] = work + new_exec_times[group_end_gpu + needed_count + 1 + i] = exec_times_to_push[i] + new_sample_ids_per_gpu[group_end_gpu + needed_count + 1 + i] = ( + sample_ids_to_push[i] + ) + + group_size[gid] = next_power + group_members[gid] = list(range(members[0], members[-1] + needed_count + 1)) + for pushed_gid in group_size.keys(): + if pushed_gid > gid: + group_members[pushed_gid] = [ + x + needed_count for x in group_members[pushed_gid] + ] + + return ( + new_micro_batches, + new_exec_times, + new_sample_ids_per_gpu, + group_members, + group_size, + ) + + empty_gpus = any([not micro_batches[i] for i in range(total_gpus)]) + while empty_gpus: + micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size = ( + fill_empty_gpus( + micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size + ) + ) + empty_gpus = any([not micro_batches[i] for i in range(total_gpus)]) + + # Assert that no sample has been completely removed + total_work_after = sum(len(mb) for mb in micro_batches) + assert ( + total_work_after >= total_work_before + ), f"Samples were removed: {total_work_before} -> {total_work_after}" + + return micro_batches, leftovers, exec_times, sample_ids_per_gpu + + def get_groups_and_subsamples(self, sample_id_seqlens, config): + """ + This function recursively forms groups of sub-samples such that all DPxCP ranks + have a roughly balanced workload in the group. + """ + groups = [] + sample_id_groups = [] + # We assign a sample_id to each sub-sample in order to track assignment to each GPU. + sample_id_seqlens = sorted(sample_id_seqlens, key=lambda x: x[1], reverse=True) + while sample_id_seqlens: + mb, sample_id_seqlens, exec_times, sample_ids = self.next_hdp_group( + sample_id_seqlens, self.get_total_workload, self.total_hdp_gpus + ) + groups.append(mb) + if len(sample_ids) < self.total_hdp_gpus: + sample_ids.extend([] * (self.total_hdp_gpus - len(sample_ids))) + sample_id_groups.append(sample_ids) + + return groups, sample_id_groups + + +def hybrid_context_parallel_forward_backward( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + output_tensor_grad, + forward_data_store, + config, + collect_non_loss_data, + first_val_step, + forward_only, + no_sync_func, + total_num_tokens, + check_first_val_step, + model_type, +): + """ + Scheduler for Hybrid Context Parallel. + + This function performs the packed sample scheduling and determines + 1. The number of microbatches to schedule for each CP rank + 2. The number of groups each CP rank should execute + 3. The number of sub-samples per group each CP rank should execute + + A group is defined by a set of samples that can run across the CP domain without any barrier. + There are many reasons why we may not be able to run endless samples within a single group. + For example, if we have 8 GPUs, + if GPU 0-5 are assigned a long sample that requires CP6, + GPU 6-7 are assigned a short sample that requires CP2, + The next sample which requires CP4 can be assigned GPU 4-7. + But GPU 6-7 will finish first and get deadlocked if GPU 4-5 are not participating in the group. + """ + from .schedules import backward_step, forward_step + + def _broadcast(item): + if item is not None: + torch.distributed.broadcast( + item, + parallel_state.get_tensor_model_parallel_src_rank(), + group=parallel_state.get_tensor_model_parallel_group(), + ) + + def _broadcast_num_samples_this_group(num_samples_this_group): + dev = torch.cuda.current_device() + torch.distributed.barrier() + + n = 0 if num_samples_this_group is None else int(num_samples_this_group.numel()) + n = torch.tensor([n], dtype=torch.int64, device=dev) + + _broadcast(n) + n = int(n.item()) + + assert n > 0, "there should be at least 1 sub samples in the group" + num_samples_this_group_broadcast = ( + torch.empty(n, dtype=torch.int32, device=dev) + if num_samples_this_group is None + else num_samples_this_group + ) + _broadcast(num_samples_this_group_broadcast) + return num_samples_this_group_broadcast + + def _get_new_data_iterator(sample_id_in_group, group_id): + if is_first_tp_rank: + sub_sample_id = sample_ids_this_group[sample_id_in_group] + sample = batch[sub_sample_id] + partner_cp_size = len( + [True for sample_ids in sample_id_groups[group_id] if sub_sample_id in sample_ids] + ) + sample["local_cp_size"] = torch.tensor(partner_cp_size, dtype=torch.int32) + new_data_iterator = RerunDataIterator(iter([sample])) + return new_data_iterator + else: + return None + + # We get data once per global batch and schedule the sub-samples. + # TODO(pmannan): Should we wrap the data_iterator here instead of the training.py file? + hdp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True) + is_first_tp_rank = parallel_state.get_tensor_model_parallel_rank() == 0 + + if is_first_tp_rank: + data = next(data_iterator) + sample_id_groups = data[1] + batch = data[0] + else: + data, sample_id_groups, batch = None, None, None + + num_samples_this_group = None + if is_first_tp_rank: + num_samples_this_group = torch.tensor( + [len(group[hdp_rank]) for group in sample_id_groups], dtype=torch.int32, device='cuda' + ) + + num_samples_this_group = _broadcast_num_samples_this_group(num_samples_this_group) + num_samples_this_group = num_samples_this_group.cpu().numpy() + num_total_groups = num_samples_this_group.shape[0] + + current_microbatch = 0 + + # Upto last group, we don't need any sync. + with no_sync_func(): + for j in range(num_total_groups - 1): + sample_ids_this_group = sample_id_groups[j][hdp_rank] if is_first_tp_rank else None + for i in range(num_samples_this_group[j]): + # Call forward step for each sub-sample + new_data_iterator = _get_new_data_iterator(i, j) + # TODO: Find the usage of current_microbatch and is_first_microbatch and + # how that may affect my usage. + output_tensor, num_tokens = forward_step( + forward_step_func, + new_data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step( + first_val_step, forward_only, current_microbatch == 0 + ), + current_microbatch=current_microbatch, + ) + current_microbatch += 1 + total_num_tokens += num_tokens.item() + if not forward_only: + backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + # Create a barrier at end of each group. + # This barrier ensures that all ranks are prepared to change assigned CP group sizes and + # no rank is starting a sub-sample ahead of it's partner ranks. + torch.distributed.barrier( + parallel_state.get_data_parallel_group(with_context_parallel=True) + ) + + # For the last group, we need to run the last sub-sample out of the context handler. + with no_sync_func(): + sample_ids_this_group = sample_id_groups[-1][hdp_rank] if is_first_tp_rank else None + for i in range(num_samples_this_group[-1] - 1): + new_data_iterator = _get_new_data_iterator(i, -1) + # Call forward step for each sub-sample + output_tensor, num_tokens = forward_step( + forward_step_func, + new_data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step( + first_val_step, forward_only, current_microbatch == 0 + ), + current_microbatch=current_microbatch, + ) + current_microbatch += 1 + total_num_tokens += num_tokens.item() + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + # The last sub-sample of the last group of the last microbatch is + # run out of the context handler. + new_data_iterator = _get_new_data_iterator(-1, -1) + # Call forward step for each sub-sample + output_tensor, num_tokens = forward_step( + forward_step_func, + new_data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step( + first_val_step, forward_only, current_microbatch == 0 + ), + current_microbatch=current_microbatch, + ) + total_num_tokens += num_tokens.item() + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + return forward_data_store, total_num_tokens diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 97d8aefad85..a8fdf2324f2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -36,6 +36,7 @@ combined_1f1b_schedule_for_interleaved_pipelining, combined_1f1b_schedule_for_no_pipelining, ) +from .hybrid_cp_schedule import hybrid_context_parallel_forward_backward # Types Shape = Union[List[int], torch.Size] @@ -607,6 +608,24 @@ def forward_backward_no_pipelining( total_num_tokens, partial(check_first_val_step, first_val_step, forward_only), ) + elif config.hybrid_context_parallel: + forward_data_store, total_num_tokens = hybrid_context_parallel_forward_backward( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + output_tensor_grad, + forward_data_store, + config, + collect_non_loss_data, + first_val_step, + forward_only, + no_sync_func, + total_num_tokens, + check_first_val_step, + model_type, + ) else: with no_sync_func(): for i in range(num_microbatches - 1): diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 5cf22d25a4b..3c1c05f8c86 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -849,7 +849,7 @@ def forward( ) ) - if packed_seq_params is not None: + if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': query = query.squeeze(1) key = key.squeeze(1) value = value.squeeze(1) @@ -864,7 +864,7 @@ def forward( ): q_pos_emb, k_pos_emb = rotary_pos_emb - if packed_seq_params is not None: + if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': if packed_seq_params.cu_seqlens_q_padded is not None: cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded else: diff --git a/megatron/core/transformer/experimental_attention_variant/dsa.py b/megatron/core/transformer/experimental_attention_variant/dsa.py index fc994490b1b..353b31e9bcd 100644 --- a/megatron/core/transformer/experimental_attention_variant/dsa.py +++ b/megatron/core/transformer/experimental_attention_variant/dsa.py @@ -546,10 +546,14 @@ def forward_with_scores( None, None, x, self.config, packed_seq_params ) if self.config.rope_type == "rope": - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=False) + rotary_pos_emb = self.rotary_pos_emb( + rotary_seq_len, packed_seq_params=packed_seq_params + ) mscale = 1.0 else: - rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=False) + rotary_pos_emb, mscale = self.rotary_pos_emb( + rotary_seq_len, packed_seq_params=packed_seq_params + ) # ========================================= # Gather inputs if sp is enabled diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index b65294fcc10..ed90fdffa97 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -555,6 +555,11 @@ def get_query_key_value_tensors( assert ( hidden_states.ndim == 3 ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D" + if packed_seq_params is not None: + assert ( + packed_seq_params.local_cp_size is None + ), "hybrid_context_parallel is not supported with MLA yet and is planned for future. \ + Please disable hybrid_context_parallel." inference_context = deprecate_inference_params(inference_context, inference_params) @@ -571,11 +576,13 @@ def get_query_key_value_tensors( rotary_pos_sin = None packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd' if self.config.rope_type == "rope": - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=packed_seq) + rotary_pos_emb = self.rotary_pos_emb( + rotary_seq_len, packed_seq_params=packed_seq_params + ) else: if self.config.apply_rope_fusion: rotary_pos_cos, rotary_pos_sin = self.rotary_pos_emb.get_cached_cos_sin( - rotary_seq_len, dtype=hidden_states.dtype, packed_seq=packed_seq + rotary_seq_len, dtype=hidden_states.dtype, packed_seq_params=packed_seq_params ) rotary_pos_emb = None assert inference_context is None, "Inference with MLA RoPE fusion is not supported" @@ -584,9 +591,11 @@ def get_query_key_value_tensors( and fused_apply_mla_rope_for_kv is not None ), "Fused MLA RoPE apply is not imported successfully" else: - rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=packed_seq) + rotary_pos_emb, mscale = self.rotary_pos_emb( + rotary_seq_len, packed_seq_params=packed_seq_params + ) - if packed_seq_params is not None: + if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd': if packed_seq_params.cu_seqlens_q_padded is not None: cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded else: diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 91b15dabf74..3a153468ae6 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -59,6 +59,15 @@ logger = logging.getLogger(__name__) +try: + # Register the TE CUDA kernels + import transformer_engine # pylint: disable=unused-import + + # Alias the PyTorch wrapper so we can call tex.* APIs + import transformer_engine_torch as tex +except ImportError: + # TE isn’t installed or the torch wrapper is missing + tex = None try: _torch_version = PkgVersion(torch.__version__) @@ -1976,7 +1985,7 @@ def get_batch_on_this_cp_rank( if cp_size > 1: for key, val in batch.items(): if val is not None: - seq_dim = 1 if key != "attention_mask" else 2 + seq_dim = 1 if key != 'attention_mask' else 2 val = val.view( *val.shape[0:seq_dim], 2 * cp_size, @@ -1993,6 +2002,103 @@ def get_batch_on_this_cp_rank( return batch +def get_thd_batch_on_this_cp_rank( + batch: Dict[str, Any], + cu_seqlens: torch.Tensor, + cu_seqlens_padded: torch.Tensor, + max_seqlen: torch.Tensor, + cp_group: Optional[torch.distributed.ProcessGroup] = None, +): + """Slice each sub-sample in a packed sample batch input along + sequence dimension into multiple chunks, which are parallelized + across GPUs in a context parallel group. + """ + packed_seq_params = PackedSeqParams( + qkv_format="thd", + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=int(max_seqlen[0].item()), + max_seqlen_kv=int(max_seqlen[0].item()), + ) + + if cp_group is not None: + cp_size = get_pg_size(cp_group) + cp_rank = get_pg_rank(cp_group) + else: + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + if cp_size > 1: # slice batch along sequence dimension for context parallelism + assert tex is not None and is_te_min_version("1.10.0"), ( + "Please update Transformer Engine to >= 1.10 to use " + "Context Parallel with THD format data" + ) + index = tex.thd_get_partitioned_indices( + cu_seqlens_padded, batch['tokens'].size(1), cp_size, cp_rank + ) + for key, data in batch.items(): + if key in {'attention_mask', 'cu_seqlens', 'cu_seqlens_padded', 'max_seqlen'}: + continue + batch[key] = data.index_select(1, index) + + return batch, packed_seq_params + + +################################ +### hybrid context parallel ### +################################ + + +def get_batch_on_this_hybrid_cp_rank( + batch: Dict[str, Any], + local_cp_size: int, + cp_group: Optional[torch.distributed.ProcessGroup] = None, +): + """Slice batch input along sequence dimension into multiple chunks, + which are parallelized across GPUs in a context parallel group. + """ + assert local_cp_size is not None + if cp_group is None: + # Get the local cp group required for as defined by the HybridCPDataLoaderWrapper + if local_cp_size > 1: + cp_group = parallel_state.get_hybrid_data_context_parallel_groups( + group_size=local_cp_size + ) + else: + # If cp group is provided, it must match the local cp size + # as defined by the HybridCPDataLoaderWrapper + assert cp_group.size() == local_cp_size + + # Convert [seqlen] to [1, seqlen] similar to default collate_fn + # as hybrid_context_parallel dataloader wrapper does not go through default collate_fn + for key, data in batch.items(): + if key in ['attention_mask']: + continue + batch[key] = torch.stack([data], 0) + sample_length = batch['tokens'].shape[1] + # TODO(pmannan): Take care of padding tokens here if not divisible by cp_size*2 + # Create packed_seq_params for SBHD format with cp group information. + packed_seq_params = PackedSeqParams( + qkv_format="sbhd", + cu_seqlens_q=torch.tensor([0, sample_length], device="cuda", pin_memory=True), + cu_seqlens_kv=torch.tensor([0, sample_length], device="cuda", pin_memory=True), + cu_seqlens_q_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True), + cu_seqlens_kv_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True), + max_seqlen_q=sample_length, + max_seqlen_kv=sample_length, + local_cp_size=local_cp_size, + cp_group=cp_group, + ) + + if cp_group is not None and cp_group.size() > 1: + # When using hybrid_context_parallel, each sub-sample of a packed sample is + # required to be divisible by CP*DP*2 or CP*DP*TP*2 (if using sequence parallel) + batch = get_batch_on_this_cp_rank(batch, cp_group) + + return batch, packed_seq_params + + ###################### ### NVTX profiling ### ###################### diff --git a/megatron/legacy/data/data_samplers.py b/megatron/legacy/data/data_samplers.py index 1bf1bf5ee91..79bdc7b193f 100644 --- a/megatron/legacy/data/data_samplers.py +++ b/megatron/legacy/data/data_samplers.py @@ -34,13 +34,22 @@ def build_pretraining_data_loader(dataset, consumed_samples): data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'single': - # Megatron sampler - batch_sampler = MegatronPretrainingSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=args.micro_batch_size, - data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size()) + if args.hybrid_context_parallel: + batch_sampler = HybridCPMegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + global_batch_size=args.global_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) + else: + # Megatron sampler + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomSampler( dataset, @@ -59,11 +68,16 @@ def build_pretraining_data_loader(dataset, consumed_samples): args.dataloader_type)) # Torch dataloader. + if args.hybrid_context_parallel: + extra_kwargs = {"collate_fn": lambda x: x,} + else: + extra_kwargs = {} return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True, persistent_workers=True if args.num_workers > 0 else False, + **extra_kwargs, ) class MegatronPretrainingSampler: @@ -114,6 +128,49 @@ def __iter__(self): start_idx, end_idx = self.get_start_end_idx() yield batch[start_idx:end_idx] +class HybridCPMegatronPretrainingSampler(MegatronPretrainingSampler): + """ + Data sampler for hybrid context parallel (Hybrid CP) format. + This data sampler pulls in the entire global batch at once across all data parallel ranks. + This helps provide the Hybrid CP Dataloader Wrapper to schedule and load balance sub-samples + of the entire global batch. + """ + + def __init__(self, total_samples, consumed_samples, micro_batch_size, global_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + super().__init__(total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size, drop_last) + self.global_batch_size = global_batch_size + self.data_parallel_size = data_parallel_size + self.num_micro_batches = self.global_batch_size // self.micro_batch_times_data_parallel_size + + def __len__(self): + return self.total_samples + + def get_start_end_idx_global_batch(self): + start_idx = [self.data_parallel_rank * self.micro_batch_size + i * self.micro_batch_size * self.data_parallel_size for i in range(self.num_micro_batches)] + end_idx = [start_idx[i] + self.micro_batch_size for i in range(self.num_micro_batches)] + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size * self.num_micro_batches: + start_idx, end_idx = self.get_start_end_idx_global_batch() + global_batch_idx = [] + for i in range(self.num_micro_batches): + global_batch_idx.extend(batch[start_idx[i]:end_idx[i]]) + yield global_batch_idx + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx_global_batch() + global_batch_idx = [] + for i in range(self.num_micro_batches): + global_batch_idx.extend(batch[start_idx[i]:end_idx[i]]) + yield global_batch_idx class RandomSeedDataset(Dataset): diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 0cf2d006863..c413c346b69 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -956,6 +956,13 @@ def validate_args(args, defaults={}): if args.tp_comm_overlap: assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled' + if args.hybrid_context_parallel: + assert not args.pipeline_model_parallel_size > 1, 'Hybrid context parallelism not supported with pipeline parallelism' + assert not args.enable_cuda_graph, 'Hybrid context parallelism not supported with CUDA Graph' + assert not args.use_megatron_fsdp, 'Hybrid context parallelism not supported with Megatron FSDP' + assert args.dataloader_type == 'single', 'Hybrid context parallelism only supported with single dataloader type' + assert args.calculate_per_token_loss, 'Hybrid context parallelism must be used with --calculate-per-token-loss' + # disable async_tensor_model_parallel_allreduce when # model parallel memory optimization is enabled if (args.tensor_model_parallel_size > 1 or args.context_parallel_size > 1) \ @@ -2876,6 +2883,13 @@ def _add_distributed_args(parser): '--hierarchical-context-parallel-sizes 2 4 indicates every two adjacent gpus ' 'forms the first level of cp groups and the cp ranks with the same odevity ' 'forms the second level of cp groups.') + group.add_argument('--max-seqlen-per-cp-rank', type=int, default=None, + help='Maximum sequence length per CP rank. This is used to calculate the ' + 'number of sub-samples assigned to each CP rank when using heterogeneous context parallel.') + group.add_argument('--hybrid-context-parallel', action='store_true', default=False, + help='Enables hybrid context parallel. This is used to balance the workload ' + 'of each CP rank when we use packed samples with variable sequence lengths. ' + 'Requires --max-seqlen-per-cp-rank to be set.') group.add_argument('--nccl-communicator-config-path', type=str, default=None, help='Path to the yaml file with NCCL communicator ' 'configurations. The number of min/max thread groups and thread ' diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index 8b585fdd87b..fb9a3aa273b 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -369,6 +369,7 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s use_sharp=args.use_sharp, context_parallel_size=args.context_parallel_size, hierarchical_context_parallel_sizes=args.hierarchical_context_parallel_sizes, + hybrid_context_parallel=args.hybrid_context_parallel, expert_model_parallel_size=args.expert_model_parallel_size, num_distributed_optimizer_instances=args.num_distributed_optimizer_instances, expert_tensor_parallel_size=args.expert_tensor_parallel_size, diff --git a/megatron/training/training.py b/megatron/training/training.py index 99fbd453426..a732e3917e5 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -90,6 +90,7 @@ from megatron.training.initialize import set_jit_fusion_options from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank from megatron.legacy.data.data_samplers import build_pretraining_data_loader +from megatron.core.datasets.data_schedule import HybridCPDataLoaderWrapper from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler from megatron.core.transformer.moe import upcycling_utils from megatron.core.transformer.moe.moe_utils import track_moe_metrics @@ -1451,28 +1452,14 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch for key in losses_reduced[0].keys(): val = [x[key].view(-1) for x in losses_reduced] if val[0].numel() == 2: - if args.sft: - # in mcore the normalization happens on micro batch instead of global - val = torch.vstack(val) - val = val[:, 0] / val[:, 1] - val = val.mean() - torch.distributed.all_reduce( - val, - group=mpu.get_data_parallel_group(with_context_parallel=True) - ) - val /= torch.distributed.get_world_size( - group=mpu.get_data_parallel_group(with_context_parallel=True) - ) - loss_reduced[key] = val - else: - # there is one dict per microbatch. in new reporting, we average - # over the total number of tokens across the global batch. - val = torch.vstack(val).sum(dim=0) - torch.distributed.all_reduce( - val, - group=mpu.get_data_parallel_group(with_context_parallel=True) - ) - loss_reduced[key] = val[0] / val[1] + # there is one dict per microbatch. in new reporting, we average + # over the total number of tokens across the global batch. + val = torch.vstack(val).sum(dim=0) + torch.distributed.all_reduce( + val, + group=mpu.get_data_parallel_group(with_context_parallel=True) + ) + loss_reduced[key] = val[0] / val[1] elif val[0].numel() == 1: # legacy behavior, we average over the number of microbatches val = torch.cat(val).mean() @@ -2173,6 +2160,9 @@ def train( energy_monitor = get_energy_monitor() one_logger = get_one_logger() + if args.hybrid_context_parallel: + train_data_iterator = iter(HybridCPDataLoaderWrapper(train_data_iterator, config)) + if args.run_workload_inspector_server: try: from workload_inspector.utils.webserver import run_server diff --git a/megatron/training/utils.py b/megatron/training/utils.py index 52a3bf36d88..4730a525271 100644 --- a/megatron/training/utils.py +++ b/megatron/training/utils.py @@ -541,19 +541,58 @@ def _broadcast(item): else data["attention_mask"].cuda(non_blocking=True) ), 'position_ids': data["position_ids"].cuda(non_blocking=True), + 'cu_seqlens': ( + None + if "cu_seqlens" not in data + else data["cu_seqlens"].cuda(non_blocking=True) + ), + 'max_seqlen': ( + None + if "max_seqlen" not in data + else data["max_seqlen"].cuda(non_blocking=True) + ), + 'local_cp_size': ( + None + if "local_cp_size" not in data + else data["local_cp_size"].cuda(non_blocking=True) + ), } + def _broadcast_cu_seqlens(cu_seqlens): + dev = torch.cuda.current_device() + n = 0 if cu_seqlens is None else int(cu_seqlens.numel()) + n_tensor = torch.tensor(n, dtype=torch.int64, device=dev) + _broadcast(n_tensor) + + if n == 0: + buf = torch.empty(0, dtype=torch.int32, device=dev) + else: + assert isinstance(cu_seqlens, torch.Tensor) + assert cu_seqlens.dtype == torch.int32 + assert cu_seqlens.shape[0] == 1, "micro-batch-size must be 1 for packing" + buf = cu_seqlens.to(device=dev, non_blocking=True).contiguous() + _broadcast(buf) + + if args.hybrid_context_parallel: + seq_len = torch.tensor(batch['tokens'].shape[0], dtype=torch.int32, device=torch.cuda.current_device()) + _broadcast(seq_len) + if args.pipeline_model_parallel_size == 1 or mtp_on_this_rank: _broadcast(batch['tokens']) _broadcast(batch['labels']) _broadcast(batch['loss_mask']) _broadcast(batch['attention_mask']) _broadcast(batch['position_ids']) + _broadcast_cu_seqlens(batch['cu_seqlens']) + _broadcast(batch['max_seqlen']) + _broadcast(batch['local_cp_size']) elif mpu.is_pipeline_first_stage(): _broadcast(batch['tokens']) _broadcast(batch['attention_mask']) _broadcast(batch['position_ids']) + _broadcast_cu_seqlens(batch['cu_seqlens']) + _broadcast(batch['max_seqlen']) elif mpu.is_pipeline_last_stage(): # Multi-Token Prediction (MTP) layers need tokens and position_ids to calculate embedding. @@ -564,42 +603,79 @@ def _broadcast(item): _broadcast(batch['attention_mask']) else: - + if args.hybrid_context_parallel: + seq_len = torch.tensor(0, dtype=torch.int32, device=torch.cuda.current_device()) + _broadcast(seq_len) + shape = (seq_len.item()) + else: + shape = (args.micro_batch_size, args.seq_length) + tokens = torch.empty( - (args.micro_batch_size, args.seq_length), + shape, dtype=torch.int64, device=torch.cuda.current_device(), ) labels = torch.empty( - (args.micro_batch_size, args.seq_length), + shape, dtype=torch.int64, device=torch.cuda.current_device(), ) loss_mask = torch.empty( - (args.micro_batch_size, args.seq_length), + shape, dtype=torch.float32, device=torch.cuda.current_device(), ) if args.create_attention_mask_in_dataloader: + shape_attention_mask = (args.micro_batch_size, 1, args.seq_length, args.seq_length) if not args.hybrid_context_parallel else (1, 1, shape[0], shape[0]) attention_mask = torch.empty( - (args.micro_batch_size, 1, args.seq_length, args.seq_length), + shape_attention_mask, dtype=torch.bool, device=torch.cuda.current_device(), ) else: attention_mask = None position_ids = torch.empty( - (args.micro_batch_size, args.seq_length), + shape, dtype=torch.int64, device=torch.cuda.current_device(), ) + cu_seqlens = None + max_seqlen = torch.empty( + 1, + dtype=torch.int32, + device=torch.cuda.current_device(), + ) if args.hybrid_context_parallel else None + local_cp_size = torch.empty( + 1, + dtype=torch.int32, + device=torch.cuda.current_device(), + ) if args.hybrid_context_parallel else None + + def _broadcast_cu_seqlens(): + dev = torch.cuda.current_device() + + n = torch.empty((), dtype=torch.int64, device=dev) + _broadcast(n) + n = int(n.item()) + + if n == 0: + cu_seqlens = torch.empty(0, dtype=torch.int32, device=dev) + else: + cu_seqlens = torch.empty((args.micro_batch_size, n), dtype=torch.int32, device=dev) + _broadcast(cu_seqlens) + + return cu_seqlens if n > 0 else None + if args.pipeline_model_parallel_size == 1 or mtp_on_this_rank: _broadcast(tokens) _broadcast(labels) _broadcast(loss_mask) _broadcast(attention_mask) _broadcast(position_ids) + cu_seqlens = _broadcast_cu_seqlens() + _broadcast(max_seqlen) + _broadcast(local_cp_size) elif mpu.is_pipeline_first_stage(): labels = None @@ -608,6 +684,8 @@ def _broadcast(item): _broadcast(tokens) _broadcast(attention_mask) _broadcast(position_ids) + cu_seqlens = _broadcast_cu_seqlens() + _broadcast(max_seqlen) elif mpu.is_pipeline_last_stage(): # Multi-Token Prediction (MTP) layers need tokens and position_ids to calculate embedding. @@ -615,7 +693,8 @@ def _broadcast(item): # to broadcast tokens and position_ids to all of the tensor parallel ranks on the last stage. tokens = None position_ids = None - + cu_seqlens = None + max_seqlen = None _broadcast(labels) _broadcast(loss_mask) _broadcast(attention_mask) @@ -626,6 +705,9 @@ def _broadcast(item): 'loss_mask': loss_mask, 'attention_mask': attention_mask, 'position_ids': position_ids, + 'cu_seqlens': cu_seqlens, + 'max_seqlen': max_seqlen, + 'local_cp_size': local_cp_size, } return batch diff --git a/pretrain_gpt.py b/pretrain_gpt.py index ecb7163ff70..e976f5aff79 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -14,9 +14,9 @@ from megatron.core.enums import ModelType from megatron.core.models.gpt import GPTModel from megatron.core.rerun_state_machine import get_rerun_state_machine +from megatron.core.utils import get_attr_wrapped_model, get_thd_batch_on_this_cp_rank, get_batch_on_this_hybrid_cp_rank, StragglerDetector from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.multi_token_prediction import mtp_on_this_rank, get_mtp_ranks -from megatron.core.utils import StragglerDetector, get_attr_wrapped_model from megatron.training.arguments import core_transformer_config_from_args from megatron.training import get_args, get_timers, get_tokenizer, inprocess_restart, pretrain, print_rank_0 from megatron.training.datasets.sft_dataset import SFTDataset @@ -46,7 +46,7 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None): # TODO: this is pretty hacky, find a better way if not is_first_or_last_pipeline_stage(vp_stage) and ( (not mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage))): - return None, None, None, None, None + return None, None, None, None, None, None # get batches based on the TP rank you are on batch = get_batch_on_this_tp_rank( @@ -54,10 +54,24 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None): mtp_on_this_rank=mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage) ) - # slice batch along sequence dimension for context parallelism - batch = get_batch_on_this_cp_rank(batch) - - return batch.values() + cu_seqlens = batch.pop('cu_seqlens', None) + cu_seqlens_padded = batch.pop('cu_seqlens_padded', None) + max_seqlen = batch.pop('max_seqlen', None) + local_cp_size = batch.pop('local_cp_size', None) + if local_cp_size is not None: + local_cp_size = int(local_cp_size.item()) + + if cu_seqlens is None and local_cp_size is None: + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) # The implementation of this function is in MCore + packed_seq_params = None + elif local_cp_size is None: # Packed THD format + assert max_seqlen.dim() == 1 + batch, packed_seq_params = get_thd_batch_on_this_cp_rank(batch, cu_seqlens, cu_seqlens_padded, max_seqlen) + else: # Hybrid CP format + batch, packed_seq_params = get_batch_on_this_hybrid_cp_rank(batch, local_cp_size) + + return (*batch.values(), packed_seq_params) # define spiky loss as a loss that's 10x the max loss observed @@ -142,7 +156,7 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa global stimer with stimer(bdata=True): vp_stage = get_attr_wrapped_model(model, "vp_stage") - tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator, vp_stage) + tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(data_iterator, vp_stage) timers('batch-generator').stop() with stimer: @@ -158,7 +172,7 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa return schedule_plan, partial(loss_func, loss_mask, model=model) else: output_tensor = model( - tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask + tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask, packed_seq_params=packed_seq_params ) # [ModelOpt]: model is needed to access ModelOpt distillation losses @@ -204,6 +218,10 @@ def core_gpt_dataset_config_from_args(args): object_storage_cache_path=args.object_storage_cache_path, mid_level_dataset_surplus=args.mid_level_dataset_surplus, allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens, + context_parallel_size=args.context_parallel_size, + data_parallel_size=args.data_parallel_size, + sequence_parallel_size=args.tensor_model_parallel_size*args.sequence_parallel, + hybrid_context_parallel=args.hybrid_context_parallel, ) diff --git a/pretrain_mamba.py b/pretrain_mamba.py index 45b646a6cc0..ca2008620be 100644 --- a/pretrain_mamba.py +++ b/pretrain_mamba.py @@ -44,6 +44,13 @@ def get_batch(data_iterator, vp_stage=None): # get batches based on the TP rank you are on batch = get_batch_on_this_tp_rank(data_iterator) + + # Support for Packed Sequence (Unused in this script) + cu_seqlens = batch.pop('cu_seqlens', None) + cu_seqlens_padded = batch.pop('cu_seqlens_padded', None) + max_seqlen = batch.pop('max_seqlen', None) + # Support for Hybrid Context Parallel (Unused in this script) + local_cp_size = batch.pop('local_cp_size', None) # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py index 7218ed5b6e1..0c722ee0257 100644 --- a/tests/unit_tests/test_parallel_state.py +++ b/tests/unit_tests/test_parallel_state.py @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from math import log2 + import pytest import torch @@ -499,3 +501,32 @@ def golden_rank_result_from_past_code( assert expert_dp_group == expert_rank_generator.get_ranks( "dp" ), f"{expert_dp_group} != {expert_rank_generator.get_ranks('dp')}." + + +@pytest.mark.parametrize( + "world_size, tp_size, cp_size, dp_size", + [(8, 1, 2, 4), (8, 1, 1, 8)], # 8 GPUs, 1 TP, 2 CP, 4 DP # 8 GPUs, 1 TP, 1 CP, 8 DP +) +def test_hybrid_dp_cp_groups(world_size, tp_size, cp_size, dp_size): + """ + Test that hybrid DPxCP groups are created correctly. + """ + Utils.destroy_model_parallel() + + # Skip if world size doesn't match + actual_world_size = torch.cuda.device_count() + if actual_world_size != world_size: + pytest.skip(f"Test requires world_size={world_size}, but got {actual_world_size}") + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + context_parallel_size=cp_size, + hybrid_context_parallel=True, + ) + + dp_cp_size = ps.get_data_parallel_world_size(with_context_parallel=True) + group_sizes = [2**i for i in range(int(log2(dp_cp_size)))][1:] + for group_size in group_sizes: + group = ps.get_hybrid_data_context_parallel_groups(group_size=group_size) + assert group.size() == group_size + + Utils.destroy_model_parallel() From 87ac13dbe71bfbf88dff81f6cfe87f9dcf8a88db Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 4 Dec 2025 15:57:03 -0800 Subject: [PATCH 178/248] update API compat check baseline to 274e04d (#2548) Signed-off-by: Pablo Garay --- .../workflows/check_api_backwards_compatibility_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 002a18194a3..0ccaa8ccc5e 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -91,7 +91,7 @@ jobs: # Default baseline for automatic PR checks # Can be: branch name (e.g., 'main'), commit hash, or tag # Will be resolved to commit hash during execution - DEFAULT_BASELINE: 'b0f5746735a965e67852d936a8fd0ef8928e9a81' + DEFAULT_BASELINE: '274e04d21fbcb7f53f63de992ee1217f275f1cf2' # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') TAG_PATTERN: 'core_v*' # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) From f0c1b55eee7dd9dd208d6b0c7b33a45dc1e9cba8 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 4 Dec 2025 16:35:46 -0800 Subject: [PATCH 179/248] feat: mcore trigger mbridge (#2340) (#2552) Signed-off-by: Pablo Garay --- .github/workflows/trigger-mbridge-tests.yml | 183 ++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 .github/workflows/trigger-mbridge-tests.yml diff --git a/.github/workflows/trigger-mbridge-tests.yml b/.github/workflows/trigger-mbridge-tests.yml new file mode 100644 index 00000000000..b1a3aa0089d --- /dev/null +++ b/.github/workflows/trigger-mbridge-tests.yml @@ -0,0 +1,183 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Trigger MBridge Tests +# Remote testing of MBridge from MCore +# Triggers MBridge CI tests with current MCore commit to verify backward compatibility + +on: + # Manual trigger only + workflow_dispatch: + inputs: + mbridge_ref: + description: 'MBridge branch/ref to trigger' + required: false + type: string + default: 'main' + run_cicd_main: + description: 'Run cicd-main.yml (full CI/CD)' + required: false + type: boolean + default: true + run_install_test: + description: 'Run install-test.yml (quick install check)' + required: false + type: boolean + default: true + test_suite: + description: 'Test suite to run (for cicd-main)' + required: false + type: choice + options: + - 'all' + - 'unit-only' + - 'functional-only' + default: 'all' + +jobs: + # First job: Get MCore commit info (shared by all matrix jobs) + get-mcore-info: + runs-on: ubuntu-latest + outputs: + sha: ${{ steps.mcore_info.outputs.sha }} + short_sha: ${{ steps.mcore_info.outputs.short_sha }} + branch: ${{ steps.mcore_info.outputs.branch }} + repo_url: ${{ steps.mcore_info.outputs.repo_url }} + steps: + - name: Checkout MCore + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get MCore commit info + id: mcore_info + run: | + echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + echo "short_sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + echo "branch=${GITHUB_REF#refs/heads/}" >> $GITHUB_OUTPUT + + # Get repo URL from origin remote, fallback to constructing from github context + REPO_URL=$(git remote get-url origin 2>/dev/null || echo "${{ github.server_url }}/${{ github.repository }}.git") + echo "repo_url=${REPO_URL}" >> $GITHUB_OUTPUT + + echo "📦 MCore commit: $(git rev-parse --short HEAD)" + echo "🌿 Branch: ${GITHUB_REF#refs/heads/}" + echo "📍 Repo: ${REPO_URL}" + + # Matrix job: Trigger and monitor MBridge workflows in parallel + trigger-and-monitor: + needs: [get-mcore-info] + runs-on: ubuntu-latest + continue-on-error: true # Don't fail workflow if monitoring times out + strategy: + fail-fast: false # Continue other matrix jobs even if one fails + matrix: + include: + - workflow: install-test.yml + name: Install Test + - workflow: cicd-main.yml + name: CI/CD Main + + name: ${{ matrix.name }} + + steps: + - name: Check if workflow should run + id: should_run + run: | + if [[ "${{ matrix.workflow }}" == "install-test.yml" && "${{ inputs.run_install_test }}" == "true" ]]; then + echo "run=true" >> $GITHUB_OUTPUT + elif [[ "${{ matrix.workflow }}" == "cicd-main.yml" && "${{ inputs.run_cicd_main }}" == "true" ]]; then + echo "run=true" >> $GITHUB_OUTPUT + else + echo "run=false" >> $GITHUB_OUTPUT + echo "⏭️ Skipping ${{ matrix.workflow }} (not enabled)" + fi + + - name: Trigger ${{ matrix.workflow }} + if: steps.should_run.outputs.run == 'true' + id: trigger + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + echo "🚀 Triggering ${{ matrix.workflow }} | MCore: ${{ needs.get-mcore-info.outputs.short_sha }} | MBridge: ${{ inputs.mbridge_ref }}" + + gh workflow run ${{ matrix.workflow }} \ + --repo NVIDIA-NeMo/Megatron-Bridge --ref ${{ inputs.mbridge_ref }} \ + --field mcore_commit=${{ needs.get-mcore-info.outputs.sha }} \ + --field mcore_branch=${{ needs.get-mcore-info.outputs.branch }} \ + --field mcore_repo=${{ needs.get-mcore-info.outputs.repo_url }} \ + --field test_suite=${{ inputs.test_suite }} \ + --field triggered_by=mcore-ci + + - name: Get run ID + if: steps.should_run.outputs.run == 'true' + id: get_run_id + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + sleep 10 # Wait for run to appear + RUN_ID=$(gh run list \ + --repo NVIDIA-NeMo/Megatron-Bridge \ + --workflow=${{ matrix.workflow }} \ + --limit 5 \ + --json databaseId,createdAt \ + --jq "sort_by(.createdAt) | reverse | .[0] | .databaseId") + + echo "run_id=${RUN_ID}" >> $GITHUB_OUTPUT + echo "📋 Run ID: ${RUN_ID}" + + cat >> $GITHUB_STEP_SUMMARY << EOF + ## 🔄 ${{ matrix.name }} Triggered + + **MCore:** \`${{ needs.get-mcore-info.outputs.short_sha }}\` | **MBridge:** \`${{ inputs.mbridge_ref }}\` | **Suite:** \`${{ inputs.test_suite }}\` + + - 🔄 [${{ matrix.workflow }}](https://github.com/NVIDIA-NeMo/Megatron-Bridge/actions/runs/${RUN_ID}) - Running... + - ⏳ Monitoring every 5 minutes until completion + + > **Note:** Tests run without approval when triggered from MCore + EOF + + - name: Monitor workflow + if: steps.should_run.outputs.run == 'true' + id: monitor + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + RUN_ID="${{ steps.get_run_id.outputs.run_id }}" + echo "📊 Monitoring ${{ matrix.workflow }} (Run ID: ${RUN_ID})" + + gh run watch ${RUN_ID} --repo NVIDIA-NeMo/Megatron-Bridge --exit-status + + CONCLUSION=$(gh run view ${RUN_ID} --repo NVIDIA-NeMo/Megatron-Bridge --json conclusion --jq -r .conclusion) + echo "workflow_status=${CONCLUSION}" >> $GITHUB_ENV + echo "✅ Completed: ${CONCLUSION}" + + - name: Report results + if: always() && steps.should_run.outputs.run == 'true' + run: | + CONCLUSION="${{ env.workflow_status || 'unknown' }}" + RUN_ID="${{ steps.get_run_id.outputs.run_id }}" + + case "$CONCLUSION" in + "success") ICON="✅"; MSG="passed" ;; + "failure") ICON="❌"; MSG="failed"; EXIT_CODE=1 ;; + "cancelled") ICON="🚫"; MSG="cancelled"; EXIT_CODE=0 ;; + *) ICON="⏳"; MSG="still running or timed out"; EXIT_CODE=0 ;; + esac + + cat >> $GITHUB_STEP_SUMMARY << EOF + ## 📊 ${{ matrix.name }} Results + + ### ${ICON} ${{ matrix.workflow }} + **Status:** \`${CONCLUSION}\` + + [View full results →](https://github.com/NVIDIA-NeMo/Megatron-Bridge/actions/runs/${RUN_ID}) + + --- + *Triggered from MCore \`${{ needs.get-mcore-info.outputs.short_sha }}\`* + EOF + + echo "${ICON} ${{ matrix.name }} ${MSG}" + exit ${EXIT_CODE:-0} + From 8de5a7f192d7e63b10af3677330e0f4f6e3fbb5d Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Fri, 5 Dec 2025 09:58:26 +0800 Subject: [PATCH 180/248] [Dev] Optimize TE CUDA Graph capturing time (#2483) Signed-off-by: Robin Zhang --- megatron/core/transformer/cuda_graphs.py | 50 +++++++++++++--- .../transformer/test_cuda_graphs.py | 59 +++++++++++-------- 2 files changed, 74 insertions(+), 35 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index f0fb39e6500..9f2bb2dd5f2 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -3,6 +3,7 @@ import gc import inspect import logging +import math import os import time from collections import defaultdict @@ -1401,6 +1402,9 @@ def __init__(self, model, config, seq_length, micro_batch_size, optimizers=[]): self.optimizers = optimizers self.num_model_chunks = len(model) + # Number of microbatches to capture. The value will be set in _get_cuda_graph_input_data(). + self.num_microbatches = None + # Get callables with captureable layers. self.chunks_with_decoder = [] self.num_layers_per_chunk = [] @@ -1536,12 +1540,12 @@ def _get_sample_arguments(self, order): order ), "num_model_chunks must match the max chunk id in order." assert ( - get_num_microbatches() == len(order) // self.num_model_chunks // 2 + self.num_microbatches == len(order) // self.num_model_chunks // 2 ), "num_microbatches must match the number of microbatches in order." # Generate sample arguments and keyword arguments for capturing. - sample_args = [None] * (len(self.flattened_callables) * get_num_microbatches()) - sample_kwargs = [None] * (len(self.flattened_callables) * get_num_microbatches()) + sample_args = [None] * (len(self.flattened_callables) * self.num_microbatches) + sample_kwargs = [None] * (len(self.flattened_callables) * self.num_microbatches) rotary_pos_emb_cache = {} @@ -1623,7 +1627,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): model_chunk_idx = abs(chunk_id) - 1 if chunk_id > 0: - sample_start_idx = (prefix_num_layers[model_chunk_idx] * get_num_microbatches()) + ( + sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + ( fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] ) fwd_sample_idx = [ @@ -1691,14 +1695,23 @@ def _get_cuda_graph_input_data(self): get_schedule_table, ) + # If PP is not enabled, we only need to capture one microbatch. + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + assert ( + self.num_model_chunks == 1 + ), "If PP is not enabled, there should be only one model chunk." + self.num_microbatches = 1 + else: + self.num_microbatches = get_num_microbatches() + _, _, num_warmup_microbatches, _ = get_pp_rank_microbatches( - get_num_microbatches(), + self.num_microbatches, self.num_model_chunks, self.config.microbatch_group_size_per_vp_stage, False, ) schedule_table = get_schedule_table( - get_num_microbatches(), + self.num_microbatches, self.num_model_chunks, self.config.microbatch_group_size_per_vp_stage, ) @@ -1717,7 +1730,21 @@ def _get_cuda_graph_input_data(self): sample_args, sample_kwargs = self._get_sample_arguments(order) def get_make_graphed_callables_kwargs(): - kwargs = {'num_warmup_iters': 11, 'allow_unused_input': True, '_order': order} + kwargs = {'allow_unused_input': True, '_order': order} + + # Calculate the number of warmup iterations per layer per microbatch inside TE + # make_graphed_callables(). There are two rules: + # 1. There should be at least 1 warmup iteration per layer per microbatch inside TE + # make_graphed_callables(). + # 2. There should be at least 10 warmup iterations per layer, counting the MCore warmup + # steps before going into this capture routine. + kwargs['num_warmup_iters'] = max( + 1, + math.ceil( + (10 - self.config.cuda_graph_warmup_steps * get_num_microbatches()) + / self.num_microbatches + ), + ) if is_te_min_version("2.6.0"): # Starting from TE 2.6.0, make_graphed_callables() accepts different number @@ -1780,6 +1807,8 @@ def _start_capturing(self): torch.distributed.barrier() gc.collect() torch.cuda.empty_cache() + if FREEZE_GC: + gc.freeze() _set_capture_start() log_single_rank(logger, logging.INFO, f'Start CUDA Graphs capture...') @@ -1807,6 +1836,9 @@ def _finish_capturing(self, start_time): optimizer.zero_grad() clear_aux_losses_tracker() reset_model_temporary_tensors(self.config, self.model) + + if FREEZE_GC: + gc.unfreeze() gc.collect() torch.cuda.empty_cache() @@ -1827,10 +1859,10 @@ def create_cudagraphs(self): for layers in self.callables_per_chunk: for layer_number, layer in enumerate(layers): layer.cuda_graphs = [] - for batch_number in range(get_num_microbatches()): + for batch_number in range(self.num_microbatches): layer.cuda_graphs.append( graphs[ - num_layers_accumulated * get_num_microbatches() + num_layers_accumulated * self.num_microbatches + batch_number * len(layers) + layer_number ] diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index 0eac7c28c6d..8133a3d2db0 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -742,18 +742,14 @@ def test_capture_freeze_gc(self): ) -# Global storage for comparing unique buffer counts across different num_microbatches -_unique_buffer_counts = None +# Global storage for comparing unique buffer counts across different num_microbatches, keyed by pp_size +_unique_buffer_counts = {} class TestTECudaGraphHelper: def setup_method(self, method): # Initialize parallel state initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) - Utils.initialize_model_parallel( - tensor_model_parallel_size=1, pipeline_model_parallel_size=1 - ) - model_parallel_cuda_manual_seed(123) def teardown_method(self, method): Utils.destroy_model_parallel() @@ -763,9 +759,14 @@ def teardown_method(self, method): # compare values across parametrized test runs @pytest.mark.parametrize("num_microbatches", [4, 16, 64, 256]) - def test_get_cuda_graph_input_data(self, num_microbatches): + @pytest.mark.parametrize("pp_size", [1, 2, 4]) + def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): """Test _get_cuda_graph_input_data function in TECudaGraphHelper.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=pp_size + ) + # Set up test configuration seq_length = 128 micro_batch_size = 2 @@ -794,7 +795,8 @@ def test_get_cuda_graph_input_data(self, num_microbatches): use_te_rng_tracker=True, bf16=True, tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + pipeline_model_parallel_size=pp_size, + pipeline_dtype=torch.bfloat16, context_parallel_size=1, ) @@ -835,7 +837,10 @@ def test_get_cuda_graph_input_data(self, num_microbatches): # Basic checks num_graphable_layers = len(cuda_graph_helper.flattened_callables) - expected_length = num_graphable_layers * num_microbatches + if pp_size > 1: + expected_length = num_graphable_layers * num_microbatches + else: + expected_length = num_graphable_layers assert len(sample_args) == expected_length, ( f"sample_args length mismatch: expected {expected_length}, " f"got {len(sample_args)}" ) @@ -931,17 +936,17 @@ def test_get_cuda_graph_input_data(self, num_microbatches): f"should be <= total_entries ({total_entries})" ) global _unique_buffer_counts - if _unique_buffer_counts is None: - _unique_buffer_counts = unique_buffer_count + if pp_size not in _unique_buffer_counts: + _unique_buffer_counts[pp_size] = unique_buffer_count else: - assert unique_buffer_count == _unique_buffer_counts, ( - f"Unique buffer count mismatch: expected {_unique_buffer_counts}, " + assert unique_buffer_count == _unique_buffer_counts[pp_size], ( + f"Unique buffer count mismatch: expected {_unique_buffer_counts[pp_size]}, " f"got {unique_buffer_count}" ) # Verify that buffers with the same signature can potentially be reused # (the actual reuse depends on the schedule, but the mechanism should work) - if num_microbatches > 1 and num_graphable_layers > 0: + if expected_length > 1: # Check that we have multiple entries with the same signature has_duplicate_signatures = any( len(indices) > 1 for indices in sample_keys_to_indices.values() @@ -955,10 +960,8 @@ def test_get_cuda_graph_input_data(self, num_microbatches): # some buffers should be reused (max_reuse > 1) # Note: The exact amount of reuse depends on the schedule order # With 1F1B interleaved schedule, we should see some reuse - if max_reuse > 1: - # Verify that reused buffers have the same signature - reused_tensors = [ptr for ptr, count in tensor_reuse_count.items() if count > 1] - assert len(reused_tensors) > 0, "Expected some reused tensors" + if pp_size > num_microbatches: + assert max_reuse > 1, "Expected some buffer reuse" # Verify that make_graphed_callables_kwargs contains expected keys assert ( @@ -974,18 +977,22 @@ def test_get_cuda_graph_input_data(self, num_microbatches): # Verify the order in kwargs matches expectations order = make_graphed_callables_kwargs['_order'] num_model_chunks = cuda_graph_helper.num_model_chunks - expected_order_length = num_microbatches * num_model_chunks * 2 + forward_count = sum(1 for chunk_id in order if chunk_id > 0) + if pp_size > 1: + # Verify that all forward passes in order have corresponding entries in sample_args + assert forward_count == num_microbatches * num_model_chunks, ( + f"Forward count mismatch: expected {num_microbatches * num_model_chunks}, " + f"got {forward_count}" + ) + expected_order_length = num_microbatches * num_model_chunks * 2 + else: + assert num_model_chunks == 1, "Expected only one model chunk for pp_size == 1" + assert forward_count == 1, "Expected only one forward pass for pp_size == 1" + expected_order_length = 2 assert ( len(order) == expected_order_length ), f"Order length mismatch: expected {expected_order_length}, got {len(order)}" - # Verify that all forward passes in order have corresponding entries in sample_args - forward_count = sum(1 for chunk_id in order if chunk_id > 0) - assert forward_count == num_microbatches * num_model_chunks, ( - f"Forward count mismatch: expected {num_microbatches * num_model_chunks}, " - f"got {forward_count}" - ) - def is_deep_ep_available(): from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP From 1f08cebac2f7e63159ad2966b3ebc6c9b7da3689 Mon Sep 17 00:00:00 2001 From: Jianbing Date: Fri, 5 Dec 2025 10:21:13 +0800 Subject: [PATCH 181/248] [Dev] Feature: linear cross entropy fusion (#2256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jianbing Dong Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: oliver könig Signed-off-by: Ananth Subramaniam Signed-off-by: dimapihtar Signed-off-by: Youngeun Kwon Signed-off-by: Youngeun Signed-off-by: Maanu Grover Signed-off-by: ykarnati Signed-off-by: Deepak Narayanan Signed-off-by: GitHub Actions Signed-off-by: Charlie Truong Signed-off-by: Zhongbo Zhu Signed-off-by: Xiaowei Ren Signed-off-by: Xin Yao Signed-off-by: Keshav Santhanam Signed-off-by: Pablo Garay Signed-off-by: Asha Anoosheh Signed-off-by: Chen Cui Signed-off-by: Li Tao Signed-off-by: lit Signed-off-by: Santosh Bhavani Signed-off-by: Robin Zhang Signed-off-by: kunlunl Co-authored-by: Jianbin Chang Co-authored-by: Deyu Fu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com> Co-authored-by: Jared Casper <155158+jaredcasper@users.noreply.github.com> Co-authored-by: Antoni-Joan Solergibert Co-authored-by: oliver könig Co-authored-by: Ananth Subramaniam Co-authored-by: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com> Co-authored-by: Siddharth Singh <136645615+sidsingh-nvidia@users.noreply.github.com> Co-authored-by: Mcore Bot Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Youngeun Kwon Co-authored-by: Lawrence McAfee <85179052+lmcafee-nvidia@users.noreply.github.com> Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Co-authored-by: Lawrence McAfee Co-authored-by: AJ Schmidt Co-authored-by: Deepak Narayanan <2724038+deepakn94@users.noreply.github.com> Co-authored-by: helen ngo Co-authored-by: GitHub Actions Co-authored-by: Aaron Gokaslan Co-authored-by: Robert Kirby Co-authored-by: Teodor-Dumitru Ene Co-authored-by: yeyu-nvidia Co-authored-by: Abhinav Khattar Co-authored-by: Roger Waleffe Co-authored-by: Charlie Truong Co-authored-by: Tong Liu Co-authored-by: Zhongbo Zhu <42691305+zhongbozhu@users.noreply.github.com> Co-authored-by: Xiaowei Ren Co-authored-by: Xin Yao Co-authored-by: Teodor-Dumitru Ene Co-authored-by: Zijie Yan Co-authored-by: root Co-authored-by: Keshav Santhanam Co-authored-by: Pablo Garay Co-authored-by: Asha Anoosheh Co-authored-by: Kan Zhu Co-authored-by: Robert Kirby Co-authored-by: Jorge Albericio Co-authored-by: Jon Barker <19699370+jon-barker@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Pablo Garay Co-authored-by: Tong Liu Co-authored-by: Michael Wojcikiewicz Co-authored-by: Li Tao Co-authored-by: Santosh Bhavani Co-authored-by: Li Ruixiao Co-authored-by: Robin Zhang Co-authored-by: Kunlun Li <94586211+kunlunl@users.noreply.github.com> --- .../fusions/fused_linear_cross_entropy.py | 242 +++ .../fusions/linear_cross_entropy/__init__.py | 1 + .../blackwell/__init__.py | 1 + .../blackwell/bwd_partial_dlogits.py | 667 ++++++++ .../linear_cross_entropy/blackwell/entry.py | 475 ++++++ .../blackwell/fwd_mainloop.py | 693 ++++++++ .../linear_cross_entropy/blackwell/triton.py | 248 +++ .../fusions/linear_cross_entropy/utils.py | 43 + .../common/language_module/language_module.py | 65 +- megatron/core/models/gpt/gpt_model.py | 42 +- megatron/core/models/mamba/mamba_model.py | 19 +- megatron/training/arguments.py | 2 +- .../test_fused_linear_cross_entropy.py | 1509 +++++++++++++++++ 13 files changed, 3990 insertions(+), 17 deletions(-) create mode 100644 megatron/core/fusions/fused_linear_cross_entropy.py create mode 100644 megatron/core/fusions/linear_cross_entropy/__init__.py create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/entry.py create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/triton.py create mode 100644 megatron/core/fusions/linear_cross_entropy/utils.py create mode 100644 tests/unit_tests/fusions/test_fused_linear_cross_entropy.py diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py new file mode 100644 index 00000000000..b533fef7aa3 --- /dev/null +++ b/megatron/core/fusions/fused_linear_cross_entropy.py @@ -0,0 +1,242 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +""" +Linear Cross Entropy API +Fuse cross entropy with linear layer. +""" + +import typing +from functools import lru_cache + +import torch + + +class Platform: + """ + Singleton class for targeted GPU platform. + """ + + _instance: typing.Optional["Platform"] = None + + def __new__(cls) -> "Platform": + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self) -> None: + if getattr(self, "_initialized", False): + return + + assert torch.cuda.is_available(), "CUDA is not available" + device = torch.cuda.current_device() + cc = torch.cuda.get_device_capability(device) + + if cc[0] == 10: + from .linear_cross_entropy.blackwell import entry as gpu_entry + + self.forward_func: typing.Callable[..., typing.Any] = gpu_entry.forward + self.backward_func: typing.Callable[..., typing.Any] = gpu_entry.backward + else: + raise ValueError(f"Unsupported architecture: {cc[0]}") + + self._initialized = True + + +@lru_cache(maxsize=1) +def _get_platform() -> Platform: + """ + Helper function to lazy initialize the platform. + """ + return Platform() + + +class LinearCrossEntropy(torch.autograd.Function): + """ + This class implements a custom autograd function for linear and cross entropy, + whose equivalent logic in PyTorch is: + ```python + def torch_entropy(hidden, weight, labels): + logits = torch.matmul(hidden, weight) + logprobs = torch.nn.functional.cross_entropy(logits, labels) + return logprobs + ``` + """ + + @staticmethod + def forward( + ctx, + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + tp_group: typing.Optional[torch.distributed.ProcessGroup] = None, + reduction: typing.Literal["none", "sum", "mean"] = "mean", + ignore_index: int = -100, + sequence_parallel: bool = False, + ) -> torch.Tensor: + """ + The forward pass of the Linear Cross Entropy. + If tp_group is not None, the weight tensor to each TP rank should be + (global_vocab_size // world_size, dim). + Note that each of the ranks should get equal shards along the vocab_size dimension. + + Args: + @param hidden: the input tensor with shape (num_tokens, dim) + @param weight: the lm_head weight tensor with shape (local_vocab_size, dim) + @param labels: the labels tensor with shape (num_tokens,) + @param tp_group: the distributed process group for TP. + @param reduction: Default to "mean", and can be one of "none", "sum", "mean". + @param ignore_index: The index to ignore. Default to -100. + @param sequence_parallel: Whether to use sequence parallel. Default to False. + Returns: + @return: logprobs with shape + - either (num_tokens,) when reduction is "none" + - or (1,) when reduction is "mean" or "sum" + + tp_group is None ----------------------------------> DP + B + A C + tp_group is not None & sequence_parallel is False -> TP + B0 B1 + A C0 C1 + tp_group is not None & sequence_parallel is True --> SP + B0 B1 + A0 C0 XX + A1 XX C1 + + When tp_group is not None, the weight tensor will be split along the vocab_size + dimension, which means each rank will get equal shards along the global_vocab_size + dimension. Specifically, the weight tensor to each rank will be (local_vocab_size, dim). + And there is an assumption that each rank will get the same local_vocab_size. + + When sequence_parallel is True, the hidden tensor will be split along the + sequence length dimension, which means each rank will get equal shards along + the sequence length dimension. Specifically, the hidden tensor to each rank + will be (local_num_tokens, dim). And there is an assumption that each rank + will get the same local_num_tokens. + + In TP forward pass, the hidden tensor and label tensor shall be identical + among all TP ranks, and it's user's responsibility to ensure the hidden tensor + is identical among all TP ranks. Then this operation will produce identical + logprobs among all TP ranks. + + In TP backward pass, the gradient of the logprobs shall be identical among all + TP ranks, and it's user's responsibility to ensure the gradient of the logprobs + is identical among all TP ranks. Then this operation will produce distinct gradients + for the local weight tensor, and identical gradients for the hidden tensor. + + ```python + # ------------ forward pass ------------ # + hidden = tp_group.broadcast(hidden, src=0) # handled by framework + labels = tp_group.broadcast(labels, src=0) # handled by framework + logprobs = linear_cross_entropy(...) + # each rank will get the same logprobs + + # ------------ backward pass ------------ # + g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework + d_hidden, d_weight = torch.autograd.grad(...) + # each rank will get the same d_hidden, + # and distinct d_weight for local weight shard + ``` + + In SP forward pass, the hidden tensor shall be split along the sequence length dimension, + and the label tensor shall be identical among all TP ranks. + Then this operation will produce identical logprobs among all TP ranks. + + In SP backward pass, the gradient of the logprobs shall be identical among all TP ranks, + Then this operation will produce distinct gradients for the local hidden tensor + and local weight tensor. + ```python + # ------------ forward pass ------------ # + hidden = global_hidden[tp_rank] # handled by framework + labels = tp_group.broadcast(labels, src=0) # handled by framework + logprobs = linear_cross_entropy(...) + # each rank will get the same logprobs + + # ------------ backward pass ------------ # + g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework + d_hidden, d_weight = torch.autograd.grad(...) + # each rank will get distinct local d_hidden and d_weight + ``` + """ + with torch.cuda.nvtx.range("LinearCrossEntropy-forward"): + ( + logprobs, + _maximum, + _acc, + _num_valid_tokens, + tp_rank, + tp_world_size, + global_hidden, + ) = _get_platform().forward_func( + hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel + ) + ctx.save_for_backward(global_hidden, weight, labels, _maximum, _acc, _num_valid_tokens) + ctx.tp_group = tp_group + ctx.ignore_index = ignore_index + ctx.reduction = reduction + ctx.tp_rank = tp_rank + ctx.tp_world_size = tp_world_size + ctx.sequence_parallel = sequence_parallel + + return logprobs + + @staticmethod + def backward( + ctx, dlogprobs: torch.Tensor + ) -> typing.Tuple[torch.Tensor, torch.Tensor, None, None, None, None, None]: + """ + The backward pass of the Linear Cross Entropy. + Args: + dlogprobs (torch.Tensor): The gradient of the cross entropy, with shape + - either (num_tokens,) when reduction is "none" + - or (1,) when reduction is "mean" or "sum" + Returns: + dhidden (torch.Tensor): The gradient of the hidden. + dweight (torch.Tensor): The gradient of the weight. + """ + with torch.cuda.nvtx.range("LinearCrossEntropy-backward"): + (global_hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors + + tp_group = ctx.tp_group + ignore_index = ctx.ignore_index + reduction = ctx.reduction + tp_rank = ctx.tp_rank + tp_world_size = ctx.tp_world_size + sequence_parallel = ctx.sequence_parallel + + d_hidden, d_weight = _get_platform().backward_func( + dlogprobs, + global_hidden, + weight, + labels, + _maximum, + _accu, + _num_valid_tokens, + reduction, + ignore_index, + tp_group, + tp_rank, + tp_world_size, + sequence_parallel, + ) + + return d_hidden, d_weight, None, None, None, None, None + + +def linear_cross_entropy( + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + tp_group: typing.Optional[torch.distributed.ProcessGroup] = None, + reduction: typing.Literal["none", "sum", "mean"] = "mean", + ignore_index: int = -100, + sequence_parallel: bool = False, +) -> torch.Tensor: + """ + helper function for linear cross entropy. + """ + _impl = LinearCrossEntropy.apply + return _impl(hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel) + + +__all__ = ["linear_cross_entropy", "LinearCrossEntropy"] diff --git a/megatron/core/fusions/linear_cross_entropy/__init__.py b/megatron/core/fusions/linear_cross_entropy/__init__.py new file mode 100644 index 00000000000..b9a9591fa69 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py new file mode 100644 index 00000000000..b9a9591fa69 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py new file mode 100644 index 00000000000..3178e8c6909 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py @@ -0,0 +1,667 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import logging +from typing import Optional, Tuple, Type + +try: + import cuda.bindings.driver as cuda # type: ignore + import cutlass + import cutlass.cute as cute + import cutlass.pipeline as pipeline # type: ignore + import cutlass.utils as utils # type: ignore + import cutlass.utils.blackwell_helpers as sm100_utils # type: ignore + from cutlass.cute.nvgpu import cpasync, tcgen05 + + SM100_TMEM_CAPACITY_COLUMNS: int = 512 + + def make_thread_cooperative_group(size: int, alignment: Optional[int] = None): + """ + Create a thread cooperative group. + """ + return pipeline.CooperativeGroup( + pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size + ) + + class BwdPartialDlogits: + """ + This class implements the backward kernel for partial d_logits. + """ + + def __init__( + self, + reduction: int, + acc_dtype: Type[cutlass.Numeric] = cutlass.Float32, + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 256), + vocab_per_split: int = 512, + ): + self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction) + self.acc_dtype = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.mma_tiler = (*mma_tiler_mn, 1) + self.vocab_per_split = vocab_per_split + + self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE + self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1) + + self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100") + + self.threads_per_warp: int = 32 + + self.epi_warp_ids = (0, 1, 2, 3) + self.load_warp_ids = 4 + self.mma_warp_ids = 5 + self.empty_warp_ids = (6, 7) + + self.threads_per_cta: int = self.threads_per_warp * len( + (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids) + ) + self.cta_sync_barrier = pipeline.NamedBarrier( + barrier_id=1, num_threads=self.threads_per_cta + ) + + self.buffer_align_bytes: int = 1024 + self.num_regs_other: int = 32 + self.num_regs_epi: int = 192 + + def _compute_grid( + self, + problem_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + cta_tiler: Tuple[int, int, int], + ) -> Tuple[int, int, int]: + cluster_shape_mnk = (*cluster_shape_mn, 1) + + grid = cute.round_up( + ( + cute.ceil_div(problem_mnk[0], cta_tiler[0]), + cute.ceil_div(self.vocab_per_split, cta_tiler[1]), + 1, + ), + cluster_shape_mnk, + ) + return grid + + def _compute_stages( + self, + tiled_mma: cute.TiledMma, + mma_tiler: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + ): + num_acc_stage = 1 + num_ab_stage = 4 + num_epi_stage_per_tile = 4 + return num_acc_stage, num_ab_stage, num_epi_stage_per_tile + + def _setup_attributes( + self, + tiled_mma: cute.TiledMma, + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + ): + self.cluster_shape_mnk = (*self.cluster_shape_mn, 1) + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,) + ) + + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + # it requires k-mode to be 128B aligned + mma_inst_tile_k: int = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + + self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = ( + self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype) + ) + self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1] + assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS + + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + @cute.kernel + def kernel( + self, + split_idx: cutlass.Int32, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB: cute.Tensor, + mLabels: cute.Tensor, + mDlogprobs: cute.Tensor, + mMaximum: cute.Tensor, + mAccu: cute.Tensor, + mDlogits_partial: cute.Tensor, + scalarNumValidTokens: cute.Pointer, + ignore_index: cutlass.Int64, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + cluster_layout_vmnk: cute.Layout, + problem_mnk: Tuple[int, int, int], + rank: cutlass.Int32, + ) -> None: + """ + The backward kernel for partial d_logits. + """ + warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) + tidx, _, _ = cute.arch.thread_idx() + bidx, bidy, _ = cute.arch.block_idx() + # FIXME: block swizzling applied here + pidm, pidn = bidx, bidy + + # FIXME: if 2 CTAs, modify here + cta_rank_in_cluster = 0 + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + + # prefetch tma descriptors + if warp_idx == self.load_warp_ids: + cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a) + cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b) + + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + ab_pipeline = pipeline.PipelineTmaUmma.create( + num_stages=self.num_ab_stage, + producer_group=make_thread_cooperative_group(len([self.load_warp_ids])), + consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])), + tx_count=self.tma_copy_ab_bytes, + barrier_storage=storage.load_ab_mbar_ptr.data_ptr(), + ) + ab_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_ab_stage + ) + ab_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_ab_stage + ) + + mma_pipeline = pipeline.PipelineUmmaAsync.create( + num_stages=self.num_acc_stage, + producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])), + consumer_group=make_thread_cooperative_group( + self.threads_per_warp * len(self.epi_warp_ids) + ), + barrier_storage=storage.mma_mbar_ptr.data_ptr(), + ) + mma_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_acc_stage + ) + mma_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_acc_stage + ) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr() + if warp_idx == self.empty_warp_ids[0]: + with cute.arch.elect_one(): + cute.arch.mbarrier_init( + tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids) + ) + cute.arch.mbarrier_init_fence() + + # -------- tensor partition ------------ # + # swizzle o [(tileM, tileK), loopM, loopK, stage] + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # swizzle o [(tileN, tileK), loopN, loopK, stage] + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # FIXME: if 2 CTAs, modify here + thr_mma = tiled_mma.get_slice(0) + # [MMA, loopM, loopK, stage] + tCsA = thr_mma.make_fragment_A(sA) + # [MMA, loopN, loopK, stage] + tCsB = thr_mma.make_fragment_B(sB) + + # [tileM, tileK, loopK] + gA = cute.local_tile( + mA, (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]), (pidm, None) + ) + # [vocab_per_split, dim] + mB_n = cute.local_tile( + mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (split_idx, 0) + ) + # [tileN, tileK, loopK] + gB = cute.local_tile( + mB_n, (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]), (pidn, None) + ) + + a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape) + # just to make sure SMEM and GMEM tensor has the same size in the first rank + tCgA = thr_mma.partition_A(gA) + tCgB = thr_mma.partition_B(gB) + # [CPY, stage] & [CPY, loopK] + tTMAsA, tTMAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], # cta_coord, + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape) + # [CPY, stage] & [CPY, loopK] + tTMAsB, tTMAgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], # cta_coord + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # ------ Allocate TMEM ------ # + tmem_holding_buf = storage.tmem_holding_buf + if warp_idx == self.empty_warp_ids[0]: + cute.arch.alloc_tmem( + self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs + ) + self.cta_sync_barrier.arrive_and_wait() + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf + ) + + tmem_shape = (128, self.tmem_alloc_cols) + acc_shape = thr_mma.partition_shape_C(tmem_shape) + tCtC_fake = thr_mma.make_fragment_C(acc_shape) + # [(tileM, tileN), loopM, loopN] + tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout) + + # ------ Empty ------ # + if warp_idx in self.empty_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + # ------ Load ------ # + if warp_idx == self.load_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + for k in cutlass.range(cute.size(gA, mode=[2])): + ab_pipeline.producer_acquire(ab_producer_state) + cute.copy( + tma_atom_a, + tTMAgA[(None, k)], + tTMAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + ) + cute.copy( + tma_atom_b, + tTMAgB[(None, k)], + tTMAsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + ) + ab_pipeline.producer_commit(ab_producer_state) + ab_producer_state.advance() + + # ------ MMA ------ # + if warp_idx == self.mma_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + mma_pipeline.producer_acquire(mma_producer_state) + + for k in cutlass.range(cute.size(gA, mode=[2])): + ab_pipeline.consumer_wait(ab_consumer_state) + + for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True): + cute.gemm( + tiled_mma, + cute.append_ones(tCtC[(None, None, mma_producer_state.index)]), + tCsA[(None, None, kblock_idx, ab_consumer_state.index)], + tCsB[(None, None, kblock_idx, ab_consumer_state.index)], + cute.append_ones(tCtC[(None, None, mma_producer_state.index)]), + ) + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + ab_pipeline.consumer_release(ab_consumer_state) + ab_consumer_state.advance() + + mma_pipeline.producer_commit(mma_producer_state) + mma_producer_state.advance() + + # ------ EPI ------ # + if warp_idx in self.epi_warp_ids: + cute.arch.warpgroup_reg_alloc(self.num_regs_epi) + + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + utils.LayoutEnum.ROW_MAJOR, + self.acc_dtype, + self.acc_dtype, + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + self.use_2cta_instrs, + ) + # [tileM, subTileN, loopM, CntSubTileN, loopN] + tAcc_epi = cute.flat_divide( + tCtC[((None, None), 0, None)], + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + ) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + tTMEM_load_tAcc = cute.group_modes( + tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1 + ) + + # predicates + cAcc = cute.make_identity_tensor(self.mma_tiler[:2]) + tCcAcc = thr_mma.partition_C(cAcc) + tCcAcc_epi = cute.flat_divide( + tCcAcc[((None, None), 0, None)], + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + ) + tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi) + tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2]) + tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype) + + copy_atom_g2r_int64 = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), mLabels.element_type + ) + copy_atom_g2r_fp32 = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), mDlogprobs.element_type + ) + epilogue_thread_layout = cute.make_layout((128, 1), stride=(1, 1)) + tiled_copy_g2r_int64 = cute.make_tiled_copy_tv( + copy_atom_g2r_int64, epilogue_thread_layout, cute.make_layout((1, 1)) + ) + tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv( + copy_atom_g2r_fp32, epilogue_thread_layout, cute.make_layout((1, 1)) + ) + thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx) + thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx) + + # [tileM] + gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,)) + gMaximum = cute.local_tile(mMaximum, (self.epi_tile[0],), (pidm,)) + gAccu = cute.local_tile(mAccu, (self.epi_tile[0],), (pidm,)) + + # slice along M direction + tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)] + # [(1, 1), 1] + tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean) + # to align shape with gMax and gAccu + tMCAcc_mask = cute.append_ones(tMCAcc_mask) + tMCAcc_mask[0] = cute.elem_less( + pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0]) + ) + # [(1, 1), 1, 1] + tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels)) + tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type) + cute.copy(tiled_copy_g2r_int64, tMgLabels, tMrLabels, pred=tMCAcc_mask) + tMgMaximum = thr_copy_g2r_fp32.partition_S(cute.append_ones(gMaximum)) + tMrMaximum = cute.make_fragment(tMgMaximum.layout, tMgMaximum.element_type) + cute.copy(tiled_copy_g2r_fp32, tMgMaximum, tMrMaximum, pred=tMCAcc_mask) + tMgAccu = thr_copy_g2r_fp32.partition_S(cute.append_ones(gAccu)) + tMrAccu = cute.make_fragment(tMgAccu.layout, tMgAccu.element_type) + cute.copy(tiled_copy_g2r_fp32, tMgAccu, tMrAccu, pred=tMCAcc_mask) + + tMrDlogprobs = cute.make_fragment(tMgAccu.layout, mDlogprobs.element_type) + if cutlass.const_expr(self.REDUCTION == 2): + # mean reduction + num_valid_tokens = cute.make_tensor(scalarNumValidTokens, layout=(1,)) + tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32) + elif cutlass.const_expr(self.REDUCTION == 1): + # sum reduction + tMrDlogprobs[0] = mDlogprobs[0] + else: + # no reduction + gDlogprobs = cute.local_tile(mDlogprobs, (self.epi_tile[0],), (pidm,)) + tMgDlogprobs = thr_copy_g2r_fp32.partition_S(cute.append_ones(gDlogprobs)) + cute.copy(tiled_copy_g2r_fp32, tMgDlogprobs, tMrDlogprobs, pred=tMCAcc_mask) + + tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0]) + tMrDlogprobs[0] *= tMrLabels[0] != ignore_index + tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0] + + # ------ Partial output ------ # + # [tileM, tileN] + gDlogits_partial = cute.local_tile( + mDlogits_partial, (self.epi_tile[0], self.epi_tile[1]), (pidm, pidn) + ) + # blackwell supports STG.256 + copy_atom_r2g = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + gDlogits_partial.element_type, + num_bits_per_copy=256, + ) + tiled_copy_r2g = cute.make_tiled_copy_tv( + copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv + ) + thr_copy_r2g = tiled_copy_r2g.get_slice(tidx) + + # [CPY, loopM, loopN] + tR2GCAcc = thr_copy_r2g.partition_S(cAcc) + tR2GCAcc_pred = cute.make_fragment(tR2GCAcc.shape, cutlass.Boolean) + for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])): + for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])): + for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])): + tR2GCAcc_pred[elem, row, col] = cute.elem_less( + pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0], + problem_mnk[0], + ) and cute.elem_less( + split_idx * self.vocab_per_split + + pidn * self.epi_tile[1] + + tR2GCAcc[elem, row, col][1], + problem_mnk[1], + ) + + tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial) + + # for type conversion + dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type) + dLogits_half = cute.tiled_divide( + dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1) + ) + dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half)) + + mma_pipeline.consumer_wait(mma_consumer_state) + + block_vocab_left_idx: cutlass.Int64 = ( + split_idx * self.vocab_per_split + pidn * self.epi_tile[1] + ) + block_vocab_right_idx: cutlass.Int64 = min( + split_idx * self.vocab_per_split + (pidn + 1) * self.epi_tile[1], + min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]), + ) + num_n_subtiles: cutlass.Int64 = cute.ceil_div( + (block_vocab_right_idx - block_vocab_left_idx), + cute.size(tTMEM_load_rAcc, mode=[0]), + ) + for n_subtile in cutlass.range(num_n_subtiles): + cute.copy( + tiled_copy_t2r, + tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)], + tTMEM_load_rAcc, + ) + + for idx in cutlass.range( + cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True + ): + # exp_logits + tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0]) + + position: cutlass.Int64 = ( + rank * problem_mnk[1] + + split_idx * self.vocab_per_split + + pidn * self.epi_tile[1] + + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0]) + + idx + ) + mask: cutlass.Boolean = ( + position == tMrLabels[0] and tMrLabels[0] != ignore_index + ) + # d_logits + tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits + tTMEM_load_rAcc[idx] += mask * -tMrDlogprobs[0] + dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type) + + for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True): + copy_id = n_subtile * cute.size(dLogits_half, mode=[1]) + idx + cute.copy( + tiled_copy_r2g, + dLogits_half[(None, idx, None)], + tR2GgDlogits[(None, None, copy_id)], + pred=tR2GCAcc_pred[((0, None), None, copy_id)], + ) + + mma_pipeline.consumer_release(mma_consumer_state) + mma_consumer_state.advance() + + # ------ Deallocate TMEM ------ # + self.cta_sync_barrier.arrive_and_wait() + if warp_idx == self.empty_warp_ids[0]: + cute.arch.relinquish_tmem_alloc_permit() + cute.arch.dealloc_tmem( + tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs + ) + + @cute.jit + def __call__( + self, + split_idx: cutlass.Int32, + hidden: cute.Tensor, + weight: cute.Tensor, + labels: cute.Tensor, + dlogprobs: cute.Tensor, + maximum: cute.Tensor, + accu: cute.Tensor, + dlogits_partial: cute.Tensor, + scalarNumValidTokens: cute.Pointer, + ignore_index: cutlass.Int64, + rank: cutlass.Int32, + stream: cuda.CUstream, + ) -> None: + a_dtype: Type[cutlass.Numeric] = hidden.element_type + b_dtype: Type[cutlass.Numeric] = weight.element_type + + if cutlass.const_expr(hidden.element_type != weight.element_type): + raise RuntimeError( + f"data type don't match: {hidden.element_type} v.s. {weight.element_type}" + ) + if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]): + raise RuntimeError("hidden can only be FP16 or BF16") + if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]): + raise RuntimeError("K dimension doesn't match") + + problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1]) + if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0): + raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}") + if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0): + raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}") + + grid = self._compute_grid( + problem_mnk=problem_mnk, + cluster_shape_mn=self.cluster_shape_mn, + cta_tiler=self.mma_tiler, + ) + + a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode() + b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + a_dtype, + a_major_mode, + b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + self._setup_attributes(tiled_mma, a_dtype, b_dtype) + + self.epi_tile = self.cta_tile_shape_mnk[:2] + + # Swizzle o [(tileM, tileK), loopM, loopK, stage] + a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, self.mma_tiler, a_dtype, self.num_ab_stage + ) + # Swizzle o [(tileN, tileK), loopN, loopK, stage] + b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, self.mma_tiler, b_dtype, self.num_ab_stage + ) + tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group) + tma_store_op = cpasync.CopyBulkTensorTileS2GOp() + + # Swizzle o [(tileM, tileK), loopM, loopK] + a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A( + tma_load_op, + hidden, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + # Swizzle o [(tileN, tileK), loopN, loopK] + b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B( + tma_load_op, + weight, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout) + self.tma_copy_ab_bytes = a_copy_size + b_copy_size + + @cute.struct + class SharedStorage: + """ + The shared storage for the backward kernel. + """ + + load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2] + mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2] + + tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + tmem_holding_buf: cutlass.Int32 + + sA: cute.struct.Align[ + cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)], + self.buffer_align_bytes, + ] + sB: cute.struct.Align[ + cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + self.kernel( + split_idx, + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + labels, + dlogprobs, + maximum, + accu, + dlogits_partial, + scalarNumValidTokens, + ignore_index, + a_smem_layout_staged, + b_smem_layout_staged, + self.cluster_layout_vmnk, + problem_mnk, + rank, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=self.cluster_shape_mnk, + stream=stream, + ) + +except ImportError: + logging.warning("Cutlass or CUDA bindings not found. BwdPartialDlogits will not be available.") diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py new file mode 100644 index 00000000000..dc369a7c558 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py @@ -0,0 +1,475 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import logging +import os +import typing +from dataclasses import dataclass, field +from functools import lru_cache + +try: + import cuda.bindings.driver as cuda # type: ignore + import cutlass + import cutlass.cute as cute + import torch + import torch.distributed as dist + import triton # type: ignore + from cutlass.cute.runtime import from_dlpack + + import megatron.core.fusions.linear_cross_entropy.utils as utils + from megatron.core.fusions.linear_cross_entropy.blackwell import ( + bwd_partial_dlogits as bwd_partial_dlogits, + ) + from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop + from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels + + @dataclass + class FwdConfig: + """ + The configuration for the forward pass. + """ + + _dedicated_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream) + _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list) + _initialized: bool = field(default=False) + _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict) + _vocab_per_split: int = field( + default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6)) + ) + + @dataclass + class BwdConfig: + """ + The configuration for the backward pass. + """ + + _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict) + _vocab_per_split: int = field( + default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6)) + ) + _backward_method: utils.BackwardMethodEnum = field( + default=utils.BackwardMethodEnum.kDlogitsSplitN + ) + + @lru_cache(maxsize=1) + def _get_fwd_config() -> FwdConfig: + """ + Helper function to lazy initialize the forward configuration. + """ + return FwdConfig() + + @lru_cache(maxsize=1) + def _get_bwd_config() -> BwdConfig: + """ + Helper function to lazy initialize the backward configuration. + """ + return BwdConfig() + + def forward( + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + tp_group: typing.Optional[torch.distributed.ProcessGroup] = None, + reduction: typing.Literal["none", "sum", "mean"] = "mean", + ignore_index: int = -100, + sequence_parallel: bool = False, + ) -> typing.Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor + ]: + """ + forward host function + """ + tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group) + tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group) + in_tp_mode = (tp_group is not None) and (tp_world_size > 1) + + assert hidden.is_cuda and weight.is_cuda and labels.is_cuda + assert weight.device == hidden.device and labels.device == hidden.device + + # hidden could be [batch, seqlen, dim] or [seqlen, batch, dim] or [tokens, dim] + assert hidden.dim() == 2 or hidden.dim() == 3 + # weight must be [vocab_size, dim] + assert weight.dim() == 2 + # labels could be [batch, seqlen] or [seqlen, batch] or [tokens] + assert (hidden.dim() == 2 and labels.dim() == 1) or ( + hidden.dim() == 3 and labels.dim() == 2 + ) + assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous() + + hidden_view = hidden.view(-1, hidden.shape[-1]) + labels_view = labels.view(-1) + + assert ( + sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0] + ) or (not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]) + assert hidden_view.shape[1] == weight.shape[1] + + global_hidden = hidden + if in_tp_mode and sequence_parallel: + partial_hidden_shape = hidden.shape + global_hidden_shape = ( + partial_hidden_shape[0] * tp_world_size, + *partial_hidden_shape[1:], + ) + global_hidden = torch.empty( + global_hidden_shape, dtype=hidden.dtype, device=hidden.device + ) + dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group) + assert global_hidden.is_contiguous() + hidden_view = global_hidden.view(-1, global_hidden.shape[-1]) + + num_tokens, dim = hidden_view.shape + vocab_size, _ = weight.shape + + if not _get_fwd_config()._initialized: + _get_fwd_config()._dedicated_stream = torch.cuda.Stream(hidden.device) + _get_fwd_config()._dedicated_events = [torch.cuda.Event() for _ in range(2)] + _get_fwd_config()._initialized = True + + REDUCTION = utils.str_to_reduction_enum(reduction) + # declare logprobs + if REDUCTION == utils.EntropyReductionEnum.kNone: + logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32) + if in_tp_mode: + logprobs.zero_() + else: + logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32) + # declare auxiliary tensors + maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32) + accumulate = torch.empty_like(maximum, dtype=torch.float32) + num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64) + assert ( + maximum.is_contiguous() + and accumulate.is_contiguous() + and num_valid_tokens.is_contiguous() + ) + # declare intermediate tensors + # NOTE: this is a parameter for tuning + num_splits = ( + vocab_size + _get_fwd_config()._vocab_per_split - 1 + ) // _get_fwd_config()._vocab_per_split + _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32) + _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32) + if REDUCTION == utils.EntropyReductionEnum.kNone: + _logprobs = logprobs + else: + _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32) + if in_tp_mode: + _logprobs.zero_() + assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous() + + triton_kernels.get_num_valid_tokens[(1,)]( + num_tokens, ignore_index, labels_view, labels_view.stride(0), num_valid_tokens + ) + + # need to compile the kernel for the first time + hidden_packed = from_dlpack( + hidden_view.detach(), assumed_align=16 + ).mark_compact_shape_dynamic(mode=0) + weight_packed = from_dlpack(weight.detach(), assumed_align=16) + labels_packed = from_dlpack( + labels_view.detach(), assumed_align=8 + ).mark_compact_shape_dynamic(mode=0) + logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic( + mode=0 + ) + _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic( + mode=0, stride_order=(0, 1) + ) + _accu_packed = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic( + mode=0, stride_order=(0, 1) + ) + cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream) + + # VocabSize and Dim are fixed for a given model, + # only the number of tokens can vary + key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}" + if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None: + fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop( + vocab_per_split=_get_fwd_config()._vocab_per_split + ) + fwd_mainloop_compiled_kernel = cute.compile( + fwd_mainloop_kernel, + hidden_packed, + weight_packed, + labels_packed, + logprobs_packed, + _max_packed, + _accu_packed, + ignore_index, + tp_rank, + cuda_stream, + ) + _get_fwd_config()._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel + else: + fwd_mainloop_compiled_kernel = _get_fwd_config()._fwd_mainloop_kernels[key] + fwd_mainloop_compiled_kernel( + hidden_packed, + weight_packed, + labels_packed, + logprobs_packed, + _max_packed, + _accu_packed, + ignore_index, + tp_rank, + cuda_stream, + ) + + if not in_tp_mode: + + def grid(meta): + return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),) + + triton_kernels.forward_dp_epilogue[grid]( + num_tokens, + num_splits, + ignore_index, + labels_view, + labels_view.stride(0), + num_valid_tokens, + _max, + _max.stride(0), + _max.stride(1), + _accu, + _accu.stride(0), + _accu.stride(1), + maximum, + maximum.stride(0), + accumulate, + maximum.stride(0), + _logprobs, + _logprobs.stride(0), + logprobs, + triton.language.constexpr(REDUCTION.value), + ) + else: + _max_backup = _max.clone() + dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group) + + torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0]) + with torch.cuda.stream(_get_fwd_config()._dedicated_stream): + _get_fwd_config()._dedicated_stream.wait_event( + _get_fwd_config()._dedicated_events[0] + ) + dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group) + _get_fwd_config()._dedicated_stream.record_event( + _get_fwd_config()._dedicated_events[1] + ) + + def grid(meta): + return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),) + + triton_kernels.forward_tp_epilogue[grid]( + num_tokens, + num_splits, + _max, + _max.stride(0), + _max.stride(1), + _max_backup, + _max_backup.stride(0), + _max_backup.stride(1), + _accu, + _accu.stride(0), + _accu.stride(1), + maximum, + maximum.stride(0), + accumulate, + maximum.stride(0), + ) + # reduce accumulate + dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group) + + # update logprobs + torch.cuda.current_stream().wait_event(_get_fwd_config()._dedicated_events[1]) + triton_kernels.forward_tp_epilogue_update_logprobs[grid]( + num_tokens, + ignore_index, + num_valid_tokens, + labels_view, + labels_view.stride(0), + _logprobs, + _logprobs.stride(0), + maximum, + maximum.stride(0), + accumulate, + accumulate.stride(0), + logprobs, + REDUCTION.value, + ) + + return ( + logprobs, + maximum, + accumulate, + num_valid_tokens, + tp_rank, + tp_world_size, + global_hidden, + ) + + def backward( + dlogprobs: torch.Tensor, + global_hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + maximum: torch.Tensor, + accu: torch.Tensor, + num_valid_tokens: torch.Tensor, + reduction: typing.Literal["none", "sum", "mean"] = "mean", + ignore_index: int = -100, + tp_group: typing.Optional[dist.ProcessGroup] = None, + tp_rank: int = 0, + tp_world_size: int = 1, + sequence_parallel: bool = False, + ) -> typing.Tuple[torch.Tensor, torch.Tensor]: + """ + backward host function + """ + in_tp_mode = (tp_group is not None) and (tp_world_size > 1) + + hidden_view = global_hidden.view(-1, global_hidden.shape[-1]) + labels_view = labels.view(-1) + + num_tokens, dim = hidden_view.shape + vocab_size, _ = weight.shape + + REDUCTION = utils.str_to_reduction_enum(reduction) + dlogprobs_view = dlogprobs.view(-1) + assert ( + REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,) + ) or (REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0) + assert dlogprobs.is_contiguous() and dlogprobs.is_cuda + + assert ( + num_valid_tokens.dim() == 0 + and num_valid_tokens.is_cuda + and num_valid_tokens.dtype == torch.int64 + ) + + d_hidden = torch.empty_like(global_hidden) + d_weight = torch.empty_like(weight) + assert d_hidden.is_contiguous() and d_weight.is_contiguous() + + # FIXME: implement different backward methods + _backward_method = _get_bwd_config()._backward_method + if _backward_method == utils.BackwardMethodEnum.kDlogitsSplitN: + vocab_per_split = _get_bwd_config()._vocab_per_split + num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split + + _d_logits = torch.empty( + (num_tokens, vocab_per_split), + device=global_hidden.device, + dtype=global_hidden.dtype, + ) + + hidden_packed = from_dlpack( + hidden_view.detach(), assumed_align=16 + ).mark_compact_shape_dynamic(mode=0) + weight_packed = from_dlpack(weight.detach(), assumed_align=16) + labels_packed = from_dlpack( + labels_view.detach(), assumed_align=8 + ).mark_compact_shape_dynamic(mode=0) + dlogprobs_packed = from_dlpack( + dlogprobs_view.detach(), assumed_align=8 + ).mark_compact_shape_dynamic(mode=0) + maximum_packed = from_dlpack( + maximum.detach(), assumed_align=8 + ).mark_compact_shape_dynamic(mode=0) + accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic( + mode=0 + ) + dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic( + mode=0 + ) + scalarNumValidTokens_packed = cute.runtime.make_ptr( + cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8 + ) + + stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream) + + key = ( + f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}" + ) + if _get_bwd_config()._bwd_kernel.get(key) is None: + bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits( + reduction=REDUCTION.value, vocab_per_split=vocab_per_split + ) + bwd_kernel_compiled = cute.compile( + bwd_kernel, + 0, # split_idx + hidden_packed, + weight_packed, + labels_packed, + dlogprobs_packed, + maximum_packed, + accu_packed, + dlogits_packed, + scalarNumValidTokens_packed, + ignore_index, + tp_rank, + stream, + ) + _get_bwd_config()._bwd_kernel[key] = bwd_kernel_compiled + else: + bwd_kernel_compiled = _get_bwd_config()._bwd_kernel.get(key) + + for split_idx in range(num_splits): + bwd_kernel_compiled( + split_idx, + hidden_packed, + weight_packed, + labels_packed, + dlogprobs_packed, + maximum_packed, + accu_packed, + dlogits_packed, + scalarNumValidTokens_packed, + ignore_index, + tp_rank, + stream, + ) + # remove padding areas + # cublas can handle non-contiguous tensors + # therefore, we do not need to contiguous the tensor + vocab_right_bound = ( + min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split + ) + valid_d_logits = _d_logits[:, :vocab_right_bound] + + torch.addmm( + input=d_hidden.view(-1, dim), + mat1=valid_d_logits, + mat2=weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :], + beta=(split_idx != 0), + alpha=1.0, + out=d_hidden.view(-1, dim), + ) + torch.matmul( + valid_d_logits.T, + hidden_view, + out=d_weight[ + split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, : + ], + ) + else: + raise NotImplementedError(f"Unsupported backward method: {_backward_method}") + + if in_tp_mode: + dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group) + if sequence_parallel: + partial_hidden_shape = ( + global_hidden.shape[0] // tp_world_size, + *global_hidden.shape[1:], + ) + partial_num_tokens = num_tokens // tp_world_size + d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[ + tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, : + ] + d_hidden = d_hidden.view(partial_hidden_shape).clone() + + return d_hidden, d_weight + +except ImportError: + logging.warning( + "Cutlass or CUDA bindings not found. LinearCrossEntropy Blackwell entry " + "points will not be available." + ) diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py new file mode 100644 index 00000000000..93f5b9523e7 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py @@ -0,0 +1,693 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +""" +Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel +""" + +import logging +from typing import Tuple, Type + +try: + import cuda.bindings.driver as cuda # type: ignore + import cutlass + import cutlass.cute as cute + import cutlass.pipeline as pipeline # type: ignore + import cutlass.utils as utils # type: ignore + import cutlass.utils.blackwell_helpers as sm100_utils # type: ignore + from cutlass.cute.nvgpu import cpasync, tcgen05 + + SM100_TMEM_CAPACITY_COLUMNS: int = 512 + + def make_thread_cooperative_group(size: int): + """ + Create a thread cooperative group. + """ + return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size) + + class FwdMainLoop: + """ + This class implements the mainloop for forward process. + + Traits stored as attributes. + + :param acc_dtype: + """ + + def __init__( + self, + acc_dtype: Type[cutlass.Numeric] = cutlass.Float32, + use_2cta_instrs: bool = False, + mma_tiler_mn: Tuple[int, int] = (128, 256), + vocab_per_split: int = 512, + ): + """ + Configuration including: + - MMA instruction settings + - Cluster Shape + """ + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + # This is the shape covered by tiledMMA, not just single MMA instruction + self.mma_tiler = (*mma_tiler_mn, 1) + self.cta_tiler = (self.mma_tiler[0], vocab_per_split, self.mma_tiler[2]) + self.vocab_per_split = vocab_per_split + + self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE + self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1) + + self.occupancy = 1 + # query SMEM capacity + self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100") + + # the maximum columns per MMA is 256, and there is only one GEMM, so we can fully + # assign TMEM for that GEMM of different tiles. + # so 512 = 2 * 256 + + self.threads_per_warp: int = 32 + # 1 warp for loading, 1 warp for issuing MMA, 1 WG for storing + self.epi_warp_ids = (0, 1, 2, 3) + self.load_warp_ids = 4 + self.mma_warp_ids = 5 + self.empty_warp_ids = (6, 7) + + self.threads_per_cta: int = self.threads_per_warp * len( + (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids) + ) + + self.cta_sync_barrier = pipeline.NamedBarrier( + barrier_id=1, num_threads=self.threads_per_cta + ) + self.tmem_alloc_barrier = pipeline.NamedBarrier( + barrier_id=2, num_threads=self.threads_per_cta + ) + + self.buffer_align_bytes: int = 1024 + self.num_regs_other: int = 32 + self.num_regs_epi: int = 192 + + def _compute_stages( + self, + tiled_mma: cute.TiledMma, + mma_tiler: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + ): + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, mma_tiler, a_dtype, 1 # only single stage + ) + b_smem_layout_stage_one = sm100_utils.make_smem_layout_b( + tiled_mma, mma_tiler, b_dtype, 1 + ) + a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one) + b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one) + num_acc_stage = 2 + num_a_stage = 4 + num_b_stage = 4 + num_epi_stage_per_tile = 4 + + return num_acc_stage, num_a_stage, num_b_stage, num_epi_stage_per_tile + + def _setup_attributes( + self, + tiled_mma: cute.TiledMma, + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + ): + self.cluster_shape_mnk = (*self.cluster_shape_mn, 1) + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,) + ) + + # this is fixed for dense MMA, k=16 + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes + mma_inst_tile_k: int = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + + self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = ( + self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype) + ) + self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1] + assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS + + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB: cute.Tensor, + mLabels: cute.Tensor, + mMax: cute.Tensor, + mAccu: cute.Tensor, + mLogprobs: cute.Tensor, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + cluster_layout_vmnk: cute.Layout, + problem_mnk: Tuple[int, int, int], + ignore_index: cutlass.Int64, + rank: cutlass.Int32, + ): + """ + The forward kernel for the mainloop. + """ + warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) + tidx, _, _ = cute.arch.thread_idx() + bidx, bidy, _ = cute.arch.block_idx() + # FIXME: block swizzling applied here + pidm, pidn = bidx, bidy + + # prefetch tma descriptors + if warp_idx == self.load_warp_ids: + cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a) + cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b) + + # declare SMEM + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + ab_pipeline = pipeline.PipelineTmaUmma.create( + num_stages=self.num_a_stage, + producer_group=make_thread_cooperative_group(len([self.load_warp_ids])), + consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])), + tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes, + barrier_storage=storage.load_ab_mbar_ptr.data_ptr(), + ) + ab_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_a_stage + ) + ab_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_a_stage + ) + + mma_pipeline = pipeline.PipelineUmmaAsync.create( + num_stages=self.num_acc_stage, + producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])), + consumer_group=make_thread_cooperative_group( + self.threads_per_warp * len(self.epi_warp_ids) + ), + barrier_storage=storage.mma_mbar_ptr.data_ptr(), + ) + mma_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_acc_stage + ) + mma_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_acc_stage + ) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr() + if warp_idx == self.empty_warp_ids[0]: + with cute.arch.elect_one(): + cute.arch.mbarrier_init( + tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids) + ) + cute.arch.mbarrier_init_fence() + + # -------- SMEM partition ------------ # + # swizzle o [(tileM, tileK), loopM, loopK, Stage] + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # swizzle o [(tileN, tileK), loopN, loopK, stage] + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # FIXME: if 2 CTAs, modify here + thr_mma = tiled_mma.get_slice(0) + # [MMA, loopM, loopK, stage] + tCsA = thr_mma.make_fragment_A(sA) + # [MMA, loopN, loopK, stage] + tCsB = thr_mma.make_fragment_B(sB) + + # ---------- GMEM partition ----------- # + # [tileM, tileK, loopK] + gA = cute.local_tile(mA, (self.mma_tiler[0], self.mma_tiler[2]), (pidm, None)) + + # [vocab_size_per_split, dim] + mB_n = cute.local_tile( + mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (pidn, 0) + ) + + # [tileN, tileK, loopN, loopK] + gB = cute.local_tile(mB_n, (self.mma_tiler[1], self.mma_tiler[2]), (None, None)) + + # [MMA, tileCntM, tileCntK, loopK] + tCgA = thr_mma.partition_A(gA) + # [MMA, tileCntN, tileCntK, loopN, loopK] + tCgB = thr_mma.partition_B(gB) + + a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape) + # FIXME: if 2 CTAs, modify here + cta_rank_in_cluster = 0 + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + tTMAsA, tTMAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], # cta_coord, + a_cta_layout, + cute.group_modes(sA, 0, 3), # SMEM tensor + cute.group_modes(tCgA, 0, 3), # GMEM tensor + ) + b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape) + tTMAsB, tTMAgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], # cta_coord + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # Allocate TMEM + tmem_holding_buf = storage.tmem_holding_buf + if warp_idx == self.empty_warp_ids[0]: + cute.arch.alloc_tmem( + self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs + ) + self.cta_sync_barrier.arrive_and_wait() + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf + ) + + # [(tileM, tileN), loopM, loopN] + tmem_shape = (128, self.tmem_alloc_cols) + acc_shape = thr_mma.partition_shape_C(tmem_shape) + tCtC_fake = thr_mma.make_fragment_C(acc_shape) + tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout) + + block_vocab_left_idx: cutlass.Int64 = pidn * self.vocab_per_split + block_vocab_right_idx: cutlass.Int64 = min( + (pidn + 1) * self.vocab_per_split, problem_mnk[1] + ) + num_n_tiles: cutlass.Int64 = cute.ceil_div( + (block_vocab_right_idx - block_vocab_left_idx), self.mma_tiler[1] + ) + + # /////// + # empty + # /////// + if warp_idx in self.empty_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + # /////// + # load + # /////// + if warp_idx == self.load_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + for n in cutlass.range(num_n_tiles): + for k in cutlass.range(cute.size(gA, mode=[2])): + ab_pipeline.producer_acquire(ab_producer_state) + cute.copy( + tma_atom_a, + tTMAgA[(None, k)], + tTMAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + ) + cute.copy( + tma_atom_b, + tTMAgB[(None, n, k)], + tTMAsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + ) + ab_pipeline.producer_commit(ab_producer_state) + ab_producer_state.advance() + + # /////// + # mma + # /////// + if warp_idx == self.mma_warp_ids: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + + for n in cutlass.range(num_n_tiles): + # disable accumulate for the first tile + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + mma_pipeline.producer_acquire(mma_producer_state) + + for k in cutlass.range(cute.size(gA, mode=[2])): + ab_pipeline.consumer_wait(ab_consumer_state) + + for kblock_idx in cutlass.range( + cute.size(tCsA, mode=[2]), unroll_full=True + ): + cute.gemm( + tiled_mma, + cute.append_ones(tCtC[(None, None, mma_producer_state.index)]), + tCsA[(None, None, kblock_idx, ab_consumer_state.index)], + tCsB[(None, None, kblock_idx, ab_consumer_state.index)], + cute.append_ones(tCtC[(None, None, mma_producer_state.index)]), + ) + # enable accumulate for the next tile + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + ab_pipeline.consumer_release(ab_consumer_state) + ab_consumer_state.advance() + + mma_pipeline.producer_commit(mma_producer_state) + mma_producer_state.advance() + + # ////////// + # epilogue + # ////////// + if warp_idx in self.epi_warp_ids: + cute.arch.warpgroup_reg_alloc(self.num_regs_epi) + + # epilog TMEM copy and partition + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + utils.LayoutEnum.ROW_MAJOR, # This is hard-coded + self.acc_dtype, + self.acc_dtype, + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + self.use_2cta_instrs, + ) + # [tileM, subTileN, loopM, CntSubTileN, loopN] + tAcc_epi = cute.flat_divide( + tCtC[((None, None), 0, None)], + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + ) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + # [(pattern), loopM, loopN, CntTileM, CntTileN] + tTMEM_load_tAcc = cute.group_modes( + tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1 + ) + + cAcc = cute.make_identity_tensor(self.mma_tiler[:2]) + tCcAcc = thr_mma.partition_C(cAcc) + # [tileM, subTileN, loopM, CntSubTileN, CntTileN] + tCcAcc_epi = cute.flat_divide( + tCcAcc[((None, None), 0, None)], + (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile), + ) + tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi) + tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2]) + + # epilogue layouts + epilogue_thread_layout = cute.make_layout((128, 1)) + copy_atom_g2r = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), mLabels.element_type + ) + tiled_copy_g2r = cute.make_tiled_copy( + copy_atom_g2r, epilogue_thread_layout, (128, 1) + ) + thr_copy_g2r = tiled_copy_g2r.get_slice(tidx) + + copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32) + tiled_copy_r2g = cute.make_tiled_copy( + copy_atom_r2g, epilogue_thread_layout, (128, 1) + ) + thr_copy_r2g = tiled_copy_r2g.get_slice(tidx) + + # auxiliary tensors + # [tileM] + gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,)) + + tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)] + tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean) + # [(1, 1), 1] + tLabelsCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, problem_mnk[0]) + # to align shape with gMax and gAccu + tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask) + + # [(1, 1), 1, 1] + tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels)) + tLabelsrLabels = cute.make_fragment( + tLabelsgLabels.shape, tLabelsgLabels.element_type + ) + cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask) + valid_mask: cutlass.Boolean = ( + tLabelsrLabels[0] != ignore_index + ) and tLabelsCAcc_mask[0] + + # [tileM, 1] + gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn)) + # [(CPYM, CPYN), loopM, loopN] + tR2GgMax = thr_copy_r2g.partition_D(gMax) + tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type) + tR2GrMax.fill(-1e30) + + # [tileM, 1] + gAccu = cute.local_tile(mAccu, (self.epi_tile[0], 1), (pidm, pidn)) + # [(CPYM, CPYN), loopM, loopN] + tR2GgAccu = thr_copy_r2g.partition_D(gAccu) + tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type) + tR2GrAccu.fill(0.0) + + # [tileM, 1] + gLogprobs = cute.append_ones( + cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,)) + ) + # [(CPYM, CPYN), loopM, loopN] + tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs) + tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type) + tR2GrLogprobs.fill(0.0) + + # [(tileN // num_epi_stage_per_tile, 1), 1, 1] + tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype) + + for n in cutlass.range(num_n_tiles): + mma_pipeline.consumer_wait(mma_consumer_state) + + left: cutlass.Int64 = block_vocab_left_idx + n * self.epi_tile[1] + right: cutlass.Int64 = min( + (n + 1) * self.epi_tile[1] + block_vocab_left_idx, block_vocab_right_idx + ) + num_n_subtiles: cutlass.Int64 = cute.ceil_div( + (right - left), cute.size(tTMEM_load_rAcc, mode=[0]) + ) + for n_subtile in cutlass.range(num_n_subtiles): + cute.copy( + tiled_copy_t2r, + tTMEM_load_tAcc[ + (None, None, None, n_subtile, mma_consumer_state.index) + ], + tTMEM_load_rAcc, + ) + + for idx in cutlass.range( + cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True + ): + local_position: cutlass.Int64 = ( + n * self.epi_tile[1] + + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0]) + + idx + ) + if (block_vocab_left_idx + local_position) < block_vocab_right_idx: + _max_old = tR2GrMax[0] + tR2GrMax[0] = cute.arch.fmax(tR2GrMax[0], tTMEM_load_rAcc[idx]) + exp_logits = cute.exp(tTMEM_load_rAcc[idx] - tR2GrMax[0]) + coeff = cute.exp(_max_old - tR2GrMax[0]) + tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits + + position: cutlass.Int64 = ( + rank * problem_mnk[1] + + pidn * self.vocab_per_split + + local_position + ) + mask: cutlass.Boolean = valid_mask and ( + position == tLabelsrLabels[0] + ) + tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx] + + mma_pipeline.consumer_release(mma_consumer_state) + mma_consumer_state.advance() + + cute.copy(tiled_copy_r2g, tR2GrMax, tR2GgMax, pred=tLabelsCAcc_mask) + cute.copy(tiled_copy_r2g, tR2GrAccu, tR2GgAccu, pred=tLabelsCAcc_mask) + + vocab_left_idx: cutlass.Int64 = rank * problem_mnk[1] + pidn * self.vocab_per_split + vocab_right_idx: cutlass.Int64 = rank * problem_mnk[1] + min( + (pidn + 1) * self.vocab_per_split, problem_mnk[1] + ) + valid: cutlass.Boolean = ( + tLabelsrLabels[0] >= vocab_left_idx and tLabelsrLabels[0] < vocab_right_idx + ) + tLabelsCAcc_mask[0] &= valid + + cute.copy(tiled_copy_r2g, tR2GrLogprobs, tR2GgLogprobs, pred=tLabelsCAcc_mask) + + # Dealloc TMEM + self.cta_sync_barrier.arrive_and_wait() + if warp_idx == self.empty_warp_ids[0]: + cute.arch.relinquish_tmem_alloc_permit() + cute.arch.dealloc_tmem( + tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs + ) + + @staticmethod + def _compute_grid( + problem_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + cta_tiler: Tuple[int, int, int], + num_splits: int, + ) -> Tuple[int, int, int]: + + cluster_shape = (*cluster_shape_mn, 1) + + grid = cute.round_up( + (cute.ceil_div(problem_mnk[0], cta_tiler[0]), num_splits, 1), cluster_shape + ) + return grid + + @cute.jit + def __call__( + self, + hidden: cute.Tensor, + weight: cute.Tensor, + labels: cute.Tensor, + _logprobs: cute.Tensor, + _max: cute.Tensor, + _accu: cute.Tensor, + ignore_index: cutlass.Int64, + rank: cutlass.Int32, + stream: cuda.CUstream, + ) -> None: + a_dtype: Type[cutlass.Numeric] = hidden.element_type + b_dtype: Type[cutlass.Numeric] = weight.element_type + + if cutlass.const_expr(hidden.element_type != weight.element_type): + raise RuntimeError( + f"data type don't match: {hidden.element_type} v.s. {weight.element_type}" + ) + if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]): + raise RuntimeError("hidden can only be FP16 or BF16") + if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]): + raise RuntimeError("K dimension doesn't match") + + problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1]) + if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0): + raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}") + + num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split) + + grid = self._compute_grid( + problem_mnk=problem_mnk, + cluster_shape_mn=self.cluster_shape_mn, + cta_tiler=self.cta_tiler, + num_splits=num_splits, + ) + a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode() + b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + a_dtype, + a_major_mode, + b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + self._setup_attributes(tiled_mma, a_dtype, b_dtype) + if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 128 != 0): + raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}") + + self.epi_tile = self.mma_tiler[:2] + + # Swizzle o [(tileM, tileK), loopM, loopK, stage] + a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, self.mma_tiler, a_dtype, self.num_a_stage + ) + # Swizzle o [(tileN, tileK), loopN, loopK, stage] + b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, self.mma_tiler, b_dtype, self.num_b_stage + ) + + # TMA loading + tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group) + tma_store_op = cpasync.CopyBulkTensorTileS2GOp() + + # Swizzle o [(tileM, tileK), loopM, loopK] + a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2]) + # create tma copy atom for hidden, + # and the cooresponding tma descriptor tensor + tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A( + tma_load_op, + hidden, # gmem_tensor + a_smem_layout, # SMEM layout + self.mma_tiler, # MMA tiler + tiled_mma, # TiledMMA + self.cluster_layout_vmnk.shape, # cluster_shape_vmnk + ) + # Swizzle o [(tileN, tileK), loopN, loopK] + b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B( + tma_load_op, + weight, # gmem_tensor + b_smem_layout, # SMEM layout + self.mma_tiler, # MMA tiler + tiled_mma, # TiledMMA + self.cluster_layout_vmnk.shape, # cluster_shape_vmnk + ) + a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout) + self.tma_copy_a_bytes = a_copy_size + self.tma_copy_b_bytes = b_copy_size + + assert self.num_a_stage == self.num_b_stage + + @cute.struct + class SharedStorage: + """ + The shared storage for the forward kernel. + """ + + # pipeline barriers, 2 = producer + consumer + load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2] + mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2] + tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + # tmem holding buffer + tmem_holding_buf: cutlass.Int32 + # SMEM tensors + sA: cute.struct.Align[ + cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)], + self.buffer_align_bytes, + ] + sB: cute.struct.Align[ + cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # launch kernel + self.kernel( + tiled_mma, + tma_atom_a, + tma_desc_a, + tma_atom_b, + tma_desc_b, + labels, + _max, + _accu, + _logprobs, + a_smem_layout_staged, + b_smem_layout_staged, + self.cluster_layout_vmnk, + problem_mnk, + ignore_index, + rank, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=self.cluster_shape_mnk, + stream=stream, + ) + return None + +except ImportError: + logging.warning("Cutlass or CUDA Python bindings not found. FwdMainLoop will not be available.") diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py new file mode 100644 index 00000000000..e025cc046f4 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py @@ -0,0 +1,248 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import triton # type: ignore +import triton.language as tl # type: ignore + +# NOTE: tl.pointer_type() is not available in Triton 3.3.0 + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE_M": 1024}, num_stages=3, num_warps=32), + triton.Config({"BLOCK_SIZE_M": 2048}, num_stages=3, num_warps=32), + ], + key=["num_tokens"], +) +@triton.jit +def get_num_valid_tokens( + num_tokens: tl.int64, + ignore_index: tl.int64, + labels_ptr, #: tl.pointer_type(tl.int64), + stride_labels: tl.int64, + num_valid_tokens_ptr, #: tl.pointer_type(tl.int64), + BLOCK_SIZE_M: tl.constexpr, +): + """ + Calculate the number of valid tokens in the labels tensor. + """ + num_pid_m: tl.int64 = tl.cdiv(num_tokens, BLOCK_SIZE_M) + + num_valid_tokens: tl.int64 = tl.zeros((), dtype=tl.int64) + for m in range(0, num_pid_m): + offs_am = m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + + labels = tl.load( + labels_ptr + offs_am * stride_labels, mask=offs_am < num_tokens, other=ignore_index + ) + + valid_labels_mask = labels != ignore_index + num_valid_tokens += (tl.sum(valid_labels_mask.to(tl.int32), axis=0)).to(tl.int64) + tl.store(num_valid_tokens_ptr, num_valid_tokens) + + +@triton.autotune( + configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})], + key=["num_tokens", "num_splits"], +) +@triton.jit +def forward_dp_epilogue( + num_tokens: tl.int64, + num_splits: tl.int64, # TODO: maybe this could be a constexpr + ignore_index: tl.int64, + labels_ptr, #: tl.pointer_type(tl.int64), + stride_labels: tl.int64, + num_valid_tokens_ptr, #: tl.pointer_type(tl.int64), + max_ptr, #: tl.pointer_type(tl.float32), + stride_max_m: tl.int64, + stride_max_n: tl.int64, + accu_ptr, #: tl.pointer_type(tl.float32), + stride_accu_m: tl.int64, + stride_accu_n: tl.int64, + global_max_ptr, #: tl.pointer_type(tl.float32), + stride_global_max: tl.int64, + global_accu_ptr, #: tl.pointer_type(tl.float32), + stride_global_accu: tl.int64, + global_logprobs_ptr, #: tl.pointer_type(tl.float32), + stride_global_logprobs: tl.int64, + global_logprobs_scalar_ptr, #: tl.pointer_type(tl.float32), + REDUCTION: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + forward epilogue in dp + """ + pid_m = tl.program_id(axis=0) + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + + for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)): + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + _max = tl.load( + max_ptr + offs_m[:, None] * stride_max_m + offs_n[None, :] * stride_max_n, + mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits), + other=0.0, + ) + _accu = tl.load( + accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n, + mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits), + other=0.0, + ) + + # local reduction + _max_old = global_max + _local_max = tl.max(_max, axis=1, return_indices=False) + global_max = tl.maximum(global_max, _local_max) + + _scale = tl.exp(_max - global_max[:, None]) + _coeff = tl.exp(_max_old - global_max) + global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1) + + # store maximum + tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens) + # store accumulate + tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens) + # update logprobs + labels = tl.load( + labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index + ) + global_logprobs_ptrs = global_logprobs_ptr + offs_m * stride_global_logprobs + global_logprobs = tl.load(global_logprobs_ptrs, mask=offs_m < num_tokens) + global_logprobs = global_max + tl.log(global_accu) - global_logprobs + label_mask = labels != ignore_index + global_logprobs = tl.where(label_mask, global_logprobs, 0.0) + + if REDUCTION == 0: # no-reduction + tl.store(global_logprobs_ptrs, global_logprobs, mask=offs_m < num_tokens) + elif REDUCTION == 1: # sum + global_logprobs_scalar = tl.sum(global_logprobs, axis=0) + tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar) + elif REDUCTION == 2: # mean + num_valid_tokens = tl.load(num_valid_tokens_ptr) + global_logprobs_scalar = tl.fdiv( + tl.sum(global_logprobs, axis=0), num_valid_tokens.to(tl.float32) + ) + tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar) + + +@triton.autotune( + configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})], + key=["num_tokens", "num_splits"], +) +@triton.jit +def forward_tp_epilogue( + num_tokens: tl.int64, + num_splits: tl.int64, + reduced_max_ptr, #: tl.pointer_type(tl.float32), + stride_reduced_max_m: tl.int64, + stride_reduced_max_n: tl.int64, + original_max_ptr, #: tl.pointer_type(tl.float32), + stride_original_max_m: tl.int64, + stride_original_max_n: tl.int64, + accu_ptr, #: tl.pointer_type(tl.float32), + stride_accu_m: tl.int64, + stride_accu_n: tl.int64, + global_max_ptr, #: tl.pointer_type(tl.float32), + stride_global_max: tl.int64, + global_accu_ptr, #: tl.pointer_type(tl.float32), + stride_global_accu: tl.int64, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + forward epilogue in tp + """ + pid_m = tl.program_id(axis=0) + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + + global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + + for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)): + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + _reduced_max = tl.load( + reduced_max_ptr + + offs_m[:, None] * stride_reduced_max_m + + offs_n[None, :] * stride_reduced_max_n, + mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits), + other=0.0, + ) + _original_max = tl.load( + original_max_ptr + + offs_m[:, None] * stride_original_max_m + + offs_n[None, :] * stride_original_max_n, + mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits), + other=0.0, + ) + _accu = tl.load( + accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n, + mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits), + other=0.0, + ) + + # local reduction + _max_old = global_max + _local_max = tl.max(_reduced_max, axis=1) + global_max = tl.maximum(global_max, _local_max) + + # update accumulate + _coeff = tl.exp(_max_old - global_max) + _scale = tl.exp(_original_max - global_max[:, None]) + global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1) + + # store + tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens) + tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens) + + +@triton.autotune(configs=[triton.Config({"BLOCK_SIZE_M": 16})], key=["num_tokens"]) +@triton.jit +def forward_tp_epilogue_update_logprobs( + num_tokens: tl.int64, + ignore_index: tl.int64, + num_valid_tokens_ptr, #: tl.pointer_type(tl.int64), + labels_ptr, #: tl.pointer_type(tl.int64), + stride_labels: tl.int64, + logprobs_ptr, #: tl.pointer_type(tl.float32), + stride_logprobs: tl.int64, + maximum_ptr, #: tl.pointer_type(tl.float32), + stride_maximum: tl.int64, + accumulate_ptr, #: tl.pointer_type(tl.float32), + stride_accumulate: tl.int64, + logprobs_scalar_ptr, #: tl.pointer_type(tl.float32), + REDUCTION: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, +): + """ + update logprobs in tp + """ + pid_m = tl.program_id(axis=0) + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + + logprobs = tl.load(logprobs_ptr + offs_m * stride_logprobs, mask=offs_m < num_tokens) + maximum = tl.load(maximum_ptr + offs_m * stride_maximum, mask=offs_m < num_tokens) + accumulate = tl.load(accumulate_ptr + offs_m * stride_accumulate, mask=offs_m < num_tokens) + + labels = tl.load( + labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index + ) + label_mask = labels != ignore_index + + logprobs = maximum + tl.log(accumulate) - logprobs + logprobs = tl.where(label_mask, logprobs, 0.0) + + if REDUCTION == 0: # no-reduction + tl.store(logprobs_ptr + offs_m * stride_logprobs, logprobs, mask=offs_m < num_tokens) + elif REDUCTION == 1: # sum + logprobs_scalar = tl.sum(logprobs, axis=0) + tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar) + elif REDUCTION == 2: # mean + num_valid_tokens = tl.load(num_valid_tokens_ptr) + logprobs_scalar = tl.fdiv(tl.sum(logprobs, axis=0), num_valid_tokens.to(tl.float32)) + tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar) diff --git a/megatron/core/fusions/linear_cross_entropy/utils.py b/megatron/core/fusions/linear_cross_entropy/utils.py new file mode 100644 index 00000000000..d077d64ab17 --- /dev/null +++ b/megatron/core/fusions/linear_cross_entropy/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import typing +from enum import Enum + + +class EntropyReductionEnum(Enum): + """ + Enum for the reduction method of cross entropy. + """ + + kNone = 0 + kSum = 1 + kMean = 2 + + +def str_to_reduction_enum(reduction: typing.Literal["none", "sum", "mean"]) -> EntropyReductionEnum: + """ + str -> EntropyReductionEnum + """ + _enum = EntropyReductionEnum.kNone + if reduction == "none": + _enum = EntropyReductionEnum.kNone + elif reduction == "sum": + _enum = EntropyReductionEnum.kSum + elif reduction == "mean": + _enum = EntropyReductionEnum.kMean + else: + raise ValueError(f"Invalid reduction: {reduction}") + return _enum + + +class BackwardMethodEnum(Enum): + """ + Enum for the backward method of linear cross entropy. + """ + + # two separate kernels for d_hidden and d_weight, respectively + kTwoKernels = 0 + # calculate partial d_logits along its N dimension + kDlogitsSplitN = 1 + # fuse d_hidden and d_weight into a single kernel + kFused = 2 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py index 259bb716a93..13d74aa5271 100644 --- a/megatron/core/models/common/language_module/language_module.py +++ b/megatron/core/models/common/language_module/language_module.py @@ -1,7 +1,7 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import logging import os -from typing import Optional, Tuple +from typing import Any, Dict, Literal, Optional, Tuple import torch from torch import Tensor @@ -14,6 +14,7 @@ except: te_parallel_cross_entropy = None from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy +from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy from megatron.core.pipeline_parallel.utils import ( is_pp_first_stage, is_pp_last_stage, @@ -125,6 +126,68 @@ def check_and_set_env_variable( check_and_set_env_variable("NVTE_FUSED_ATTN", 1, AttnBackend.auto) check_and_set_env_variable("NVTE_UNFUSED_ATTN", 1, AttnBackend.auto) + def compute_output_layer_and_language_model_loss( + self, + hidden: Tensor, + labels: Optional[Tensor], + weight: Tensor = None, + sequence_parallel_enabled: bool = False, + column_parallel_linear: torch.nn.Module = None, + col_linear_kwargs: Dict[str, Any] = {}, + reduction: Literal["none", "sum", "mean"] = "none", + ignore_index: int = -100, + ) -> Tensor: + """Computes the language model logits and loss (Cross entropy across vocabulary) + + Args: + hidden (Tensor): The hidden states from the transformer model + labels (Optional[Tensor]): The labels of dimension [batch size, seq length] + weight (Tensor): The weight tensor of shape [vocab size, hidden size]. + Required if using fused linear cross entropy. + column_parallel_linear (torch.nn.Module): The column parallel linear + layer to use for computing logits when not using fused linear cross entropy. + col_linear_kwargs (Dict[str, Any]): Additional kwargs for column parallel linear layer + reduction (Optional[str]): The reduction method. Defaults to "none", and can be + one of "none", "sum", "mean". + ignore_index (Optional[int]): The index to ignore in the loss calculation. + Defaults to -100. + + Returns: + Tensor: Loss tensor of dimensions [batch size, sequence_length]. + """ + if ( + self.config.cross_entropy_loss_fusion + and self.config.cross_entropy_fusion_impl == 'linear' + ): + assert ( + weight is not None + ), "weight cannot be None when using fused linear cross entropy." + assert ( + labels is not None + ), "labels cannot be None when using fused linear cross entropy." + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = linear_cross_entropy( + hidden, + weight, + labels, + tp_group=self.pg_collection.tp, + sequence_parallel=sequence_parallel_enabled, + reduction=reduction, + ignore_index=ignore_index, + ) + + # [s b] => [b, s] + loss = loss.view_as(labels).transpose(0, 1).contiguous() + return loss + else: + assert ( + column_parallel_linear is not None + ), "column_parallel_linear cannot be None when not using fused linear cross entropy." + logits, _ = column_parallel_linear(hidden, **col_linear_kwargs) + + return self.compute_language_model_loss(labels, logits) + def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: """Computes the language model loss (Cross entropy across vocabulary) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 70eea932683..4a6370bc49d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -570,12 +570,6 @@ def _postprocess( # if loss_mask is not provided, use all ones as loss_mask loss_mask = torch.ones_like(mtp_labels) for mtp_layer_number in range(self.config.mtp_num_layers): - # output - mtp_logits, _ = self.output_layer( - hidden_states_list[mtp_layer_number + 1], - weight=output_weight, - runtime_gather_output=runtime_gather_output, - ) # Calc loss for the current Multi-Token Prediction (MTP) layers. mtp_labels, _ = roll_tensor( mtp_labels, @@ -591,7 +585,20 @@ def _postprocess( cp_group=self.cp_group, packed_seq_params=packed_seq_params, ) - mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) + + # Compute mtp loss without storing logits to save memory. + mtp_loss = self.compute_output_layer_and_language_model_loss( + hidden_states_list[mtp_layer_number + 1], + labels=mtp_labels, + weight=self.shared_embedding_or_output_weight(), + sequence_parallel_enabled=self.output_layer.sequence_parallel, + column_parallel_linear=self.output_layer, + col_linear_kwargs={ + 'weight': output_weight, + 'runtime_gather_output': runtime_gather_output, + }, + ) + mtp_loss = loss_mask * mtp_loss if self.training: # TODO(shifangx): remove the use of parallel_state here @@ -636,9 +643,12 @@ def _postprocess( hidden_states.squeeze(1).unsqueeze(0) ).unsqueeze(1) - logits, _ = self.output_layer( - hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output - ) + if has_config_logger_enabled(self.config) or labels is None: + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) + else: + logits = None # Restore sequence parallel execution to the output layer if necessary. if sequence_parallel_override: @@ -665,7 +675,17 @@ def _postprocess( # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() - loss = self.compute_language_model_loss(labels, logits) + loss = self.compute_output_layer_and_language_model_loss( + hidden_states, + labels=labels, + weight=self.shared_embedding_or_output_weight(), + sequence_parallel_enabled=self.output_layer.sequence_parallel, + column_parallel_linear=self.output_layer, + col_linear_kwargs={ + 'weight': output_weight, + 'runtime_gather_output': runtime_gather_output, + }, + ) return loss diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py index 378cf7e47d6..e4074eda806 100644 --- a/megatron/core/models/mamba/mamba_model.py +++ b/megatron/core/models/mamba/mamba_model.py @@ -267,9 +267,10 @@ def forward( hidden_states.squeeze(1).unsqueeze(0) ).unsqueeze(1) - logits, _ = self.output_layer( - hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output - ) + if labels is None: + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) # Restore sequence parallel execution to the output layer if necessary. if sequence_parallel_override: @@ -284,6 +285,16 @@ def forward( # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() - loss = self.compute_language_model_loss(labels, logits) + loss = self.compute_output_layer_and_language_model_loss( + hidden_states, + labels, + weight=self.shared_embedding_or_output_weight(), + sequence_parallel_enabled=self.output_layer.sequence_parallel, + column_parallel_linear=self.output_layer, + col_linear_kwargs={ + "weight": output_weight, + "runtime_gather_output": runtime_gather_output, + }, + ) return loss diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index c413c346b69..2c87532c919 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2330,7 +2330,7 @@ def _add_training_args(parser): help='Enabled fusion of cross entropy loss calculation.', dest='cross_entropy_loss_fusion') group.add_argument('--cross-entropy-fusion-impl', type=str, default='native', - choices=['native', 'te'], + choices=['native', 'te', 'linear'], help='Implementation of cross entropy loss calculation.') group.add_argument('--use-flash-attn', action='store_true', help='use FlashAttention implementation of attention. ' diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py new file mode 100644 index 00000000000..3ac8e7f6200 --- /dev/null +++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py @@ -0,0 +1,1509 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import contextlib +import os +import typing +from contextlib import ExitStack +from dataclasses import dataclass + +import numpy as np +import pytest +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler + +import megatron.core.parallel_state as ps +from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, + get_gpt_mtp_block_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.training.utils import get_device_arch_version +from tests.unit_tests.a2a_overlap.utils import ( + deterministic_mode, + get_test_config, + get_valid_fp8_flags, + get_valid_token_dispatcher_types, +) +from tests.unit_tests.test_utilities import Utils + + +# 1. Define a standardized context to hold your distributed info +@dataclass +class DistContext: + rank: int + world_size: int + group: dist.ProcessGroup + is_chief: bool + + +# 2. Create a module-scoped fixture +# This runs ONE time per file, no matter how many test classes you have. +@pytest.fixture(scope="module") +def distributed_context(): + # --- PRE-CHECK --- + if "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2: + pytest.skip("Requires torchrun with multiple GPUs (WORLD_SIZE >= 2)") + + # --- SETUP --- + is_external_init = dist.is_initialized() + + if not is_external_init: + # Initialize only if not already done (e.g., by another test runner) + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=int(os.environ["WORLD_SIZE"]), + rank=int(os.environ["RANK"]), + ) + + # Set device immediately to avoid cross-device pollution + local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"])) + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + + # Gather context data + rank = dist.get_rank() + world_size = dist.get_world_size() + group = dist.group.WORLD + + print(f"[INFO]: Initialized Rank: {rank} / {world_size}") + + context = DistContext(rank=rank, world_size=world_size, group=group, is_chief=(rank == 0)) + + # Yield control to the tests + yield context + + # --- TEARDOWN --- + # Only destroy if we were the ones who initialized it + if not is_external_init: + dist.destroy_process_group() + + +class MockDataset(Dataset): + """ + Mock dataset for torchtitan GPT training tests + Generates synthetic tokenized sequences on-the-fly + """ + + def __init__( + self, + num_samples=10000, + micro_batch_size=4, + sequence_length=2048, + vocab_size=128256, + seed=42, + ): + """ + Initialize mock dataset + + Args: + num_samples: Total number of samples + sequence_length: Length of each sequence + vocab_size: Size of vocabulary + seed: Random seed for reproducibility + """ + self.num_samples = num_samples + self.micro_batch_size = micro_batch_size + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.seed = seed + + # Set numpy seed for deterministic generation + np.random.seed(seed) + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + """ + Generate a single training sample + + Returns: + dict with 'tokens' and 'labels' + """ + # Use idx as seed for reproducible but varied samples + rng = np.random.RandomState(self.seed + idx) + + # Generate random token sequence + tokens = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64) + + # Labels are tokens shifted by 1 (next token prediction) + labels = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64) + + return { + 'input_ids': torch.from_numpy(tokens.copy()), + 'labels': torch.from_numpy(labels.copy()), + "attention_mask": torch.ones( + (1, self.sequence_length, self.sequence_length), dtype=bool + ), + } + + +def build_model(config): + max_seq_len = 300 + + # build layer spec + transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True) + mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True) + + # build model + gpt_model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + mtp_block_spec=mtp_block_spec, + vocab_size=100, + pre_process=True, + post_process=True, + max_sequence_length=max_seq_len, + ) + return gpt_model + + +# Define a reusable context manager +@contextlib.contextmanager +def init_model_parallel(tp=1, pp=1, ep=1): + try: + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + expert_model_parallel_size=ep, + ) + yield + finally: + Utils.destroy_model_parallel() + + +def init_gpt_dataloader( + dp_group, micro_batch_size=1, vocab_size=50257, sequence_length=128, batch_size=8 +): + dataset = MockDataset( + num_samples=1000, + micro_batch_size=micro_batch_size, + sequence_length=sequence_length, + vocab_size=vocab_size, + seed=42, + ) + sampler = DistributedSampler(dataset, num_replicas=dp_group.size(), rank=dp_group.rank()) + dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler) + return dataloader + + +# skip it for good +@pytest.mark.skipif( + ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2) or True, + reason="Requires torchrun with multiple GPUs", +) +class TestFusedLinearCrossEntropyOnGptModel: + @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags()) + @pytest.mark.parametrize("mtp_layers", [0, 1]) + @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types()) + @pytest.mark.parametrize("layer_num", [2]) + def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num): + with ExitStack() as stack: + gpu_count = torch.cuda.device_count() + tp = min(2, gpu_count) + ep = gpu_count // tp + stack.enter_context(init_model_parallel(tp=tp, ep=ep)) + stack.enter_context(deterministic_mode()) + + # create TransformerConfig + extra_kwargs = { + "moe_token_dispatcher_type": dispatcher_type, + "sequence_parallel": tp > 1, + "tensor_model_parallel_size": tp, + } + if dispatcher_type == "flex": + extra_kwargs["moe_enable_deepep"] = True + extra_kwargs["moe_router_dtype"] = "fp32" + if fp8_flag is not None: + extra_kwargs["fp8"] = fp8_flag[0] + extra_kwargs["fp8_recipe"] = fp8_flag[1] + if mtp_layers > 0: + extra_kwargs["mtp_num_layers"] = mtp_layers + extra_kwargs["mtp_loss_scaling_factor"] = 1.1 + + # build config + config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs) + config.expert_model_parallel_size = ep + + # build model + gpt_model = build_model(config) + gpt_model.cuda() + + dataloader = init_gpt_dataloader( + ps.get_data_parallel_group(), + vocab_size=gpt_model.vocab_size, + micro_batch_size=1, + sequence_length=gpt_model.max_sequence_length, + batch_size=4, + ) + # for batch in dataloder: + for batch in dataloader: + batch["position_ids"] = torch.arange( + gpt_model.max_sequence_length, dtype=torch.int64 + ) + batch = {k: v.cuda() for k, v in batch.items()} + gpt_model.zero_grad() + output = gpt_model(**batch) + loss = output.sum() + loss.backward() + + +@pytest.mark.skipif( + "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1", reason="Requires single GPU" +) +@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10") +class TestFusedLinearCrossEntropyDataParallel: + def cleanup(self): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + import gc + + gc.collect() + torch.cuda.synchronize() + + @staticmethod + def torch_linear_cross_entropy( + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + reduction: str, + ignore_index: int, + ): + # NOTE: need to convert to fp32 to fp32 accumulation, + # thus assure accuracy + logits = hidden.to(torch.float32) @ weight.T.to(torch.float32) + logprobs = torch.nn.functional.cross_entropy( + logits.view(-1, logits.shape[-1]), + labels.view(-1), + reduction=reduction, + ignore_index=ignore_index, + ) + return logprobs.to(torch.float32) + + @staticmethod + def get_problems(): + return [ + (80, 125, 64), + (80, 152064, 64), + (1024, 152064, 4096), + (4096, 152063, 8192), + ((1, 4096), 152064, 8192), + ((2, 4096), 152064, 8192), + ] + + @staticmethod + def get_ignore_index(): + return [-100, 4] + + def test_kernel_launch(self): + """ + Check if the compiled kernel can be + launched with different problem sizes + """ + self.cleanup() + + num_tokens = [15, 26, 128, 513, 2048, 8192] + vocab_size = 152064 + dim = 4096 + dtype = torch.bfloat16 + reduction = "mean" + ignore_index = -100 + + weight = torch.randn(vocab_size, dim, dtype=dtype, device="cuda").requires_grad_() + for num_token in num_tokens: + hidden = torch.randn(num_token, dim, dtype=dtype, device="cuda").requires_grad_() + labels = torch.randint(0, vocab_size, (num_token,), dtype=torch.long, device="cuda") + + logprobs = linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + assert not torch.isnan(logprobs).any() + + gLogprobs = torch.randn_like(logprobs) + (d_hidden, d_weight) = torch.autograd.grad( + (logprobs,), (hidden, weight), (gLogprobs,), retain_graph=False + ) + assert not torch.isnan(d_hidden).any() + assert not torch.isnan(d_weight).any() + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("problem", get_problems()) + @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) + @pytest.mark.parametrize("ignore_index", get_ignore_index()) + def test_correctness(self, dtype, problem, reduction, ignore_index): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + if ignore_index >= 0 and ignore_index < vocabsize: + pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index) + labels = pad_labels[..., 1:].contiguous() + + # forward + torch_logprobs = self.torch_linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + + torch.testing.assert_close(torch_logprobs, custom_logprobs) + + # backward + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + + (d_torch_hidden, d_torch_weight) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + + (d_custom_hidden, d_custom_weight) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + + torch.testing.assert_close(d_torch_hidden, d_custom_hidden, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(d_torch_weight, d_custom_weight, atol=1e-3, rtol=1e-3) + + @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + @pytest.mark.parametrize("ignore_index", [-100]) + def test_performance(self, problem, dtype, reduction, ignore_index): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + torch_fwd_latency = list() + torch_bwd_latency = list() + custom_fwd_latency = list() + custom_bwd_latency = list() + + iterations = 5 + for i in range(iterations): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + if ignore_index >= 0 and ignore_index < vocabsize: + pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index) + labels = pad_labels[..., 1:].contiguous() + + # -------- forward -------- # + start_event.record() + torch_logprobs = self.torch_linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + end_event.record() + torch.cuda.synchronize() + torch_fwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + end_event.record() + torch.cuda.synchronize() + custom_fwd_latency.append(start_event.elapsed_time(end_event)) + + # -------- backward -------- # + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + + start_event.record() + (d_torch_hidden, d_torch_weight) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + torch_bwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + (d_custom_hidden, d_custom_weight) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + custom_bwd_latency.append(start_event.elapsed_time(end_event)) + + # --- remove first latency due to warmup --- # + torch_fwd_latency = torch_fwd_latency[1:] + torch_bwd_latency = torch_bwd_latency[1:] + custom_fwd_latency = custom_fwd_latency[1:] + custom_bwd_latency = custom_bwd_latency[1:] + + print() + print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:") + print( + f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms" + ) + + @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + @pytest.mark.parametrize("ignore_index", [-100]) + def test_storage(self, problem, dtype, reduction, ignore_index): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + print() + print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:") + + def torch_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + if ignore_index >= 0 and ignore_index < vocabsize: + pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index) + labels = pad_labels[..., 1:].contiguous() + + torch.cuda.reset_peak_memory_stats() + torch_logprobs = self.torch_linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + torch.cuda.synchronize() + torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + print(f"[INFO]: Torch Forward pass peak memory: {torch_max_memory:.2f} MB") + + torch.cuda.reset_peak_memory_stats() + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + (d_torch_hidden, d_torch_weight) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + torch_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + print(f"[INFO]: Torch Backward pass peak memory: {torch_backward_max_memory:.2f} MB") + + def custom_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + if ignore_index >= 0 and ignore_index < vocabsize: + pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index) + labels = pad_labels[..., 1:].contiguous() + + torch.cuda.reset_peak_memory_stats() + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, reduction=reduction, ignore_index=ignore_index + ) + torch.cuda.synchronize() + custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + print(f"[INFO]: Custom Forward pass peak memory: {custom_max_memory:.2f} MB") + + torch.cuda.reset_peak_memory_stats() + g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1) + (d_custom_hidden, d_custom_weight) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + custom_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + print(f"[INFO]: Custom Backward pass peak memory: {custom_backward_max_memory:.2f} MB") + + self.cleanup() + torch_storage() + self.cleanup() + custom_storage() + + +@pytest.mark.skipif( + ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2), # or True, + reason="Requires torchrun with multiple GPUs", +) +@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10") +@pytest.mark.usefixtures("distributed_context") +class TestFusedLinearCrossEntropyTensorParallel: + @pytest.fixture(autouse=True) + def setup_attrs(self, distributed_context): + """ + Setup attributes for the test class. + """ + self.tp_group = distributed_context.group + self.tp_rank = distributed_context.rank + self.tp_world_size = distributed_context.world_size + self.is_chief = distributed_context.is_chief + + def cleanup(self): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + import gc + + gc.collect() + torch.cuda.synchronize() + + @staticmethod + def torch_linear_cross_entropy_single_gpu( + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + reduction: typing.Optional[str] = "mean", + ): + logits = hidden.to(torch.float32) @ weight.T.to(torch.float32) + logprobs = torch.nn.functional.cross_entropy( + logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction + ) + return logprobs.to(torch.float32) + + class TorchLinearCrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + tp_group: torch.distributed.ProcessGroup, + reduction: typing.Optional[str] = "mean", + ): + tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group) + tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group) + + logits = hidden.to(torch.float32) @ weight.T.to(torch.float32) + + whole_logits = torch.empty( + (logits.shape[0], logits.shape[-1] * tp_world_size), + dtype=logits.dtype, + device=logits.device, + ) + whole_logits_ref = [ + whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]] + for i in range(tp_world_size) + ] + dist.all_gather(whole_logits_ref, logits, group=tp_group) + + logprobs = torch.nn.functional.cross_entropy( + whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction + ) + + # If we don't preserve whole_logits, + # we need to re-compute it in the backward pass + ctx.save_for_backward(hidden, weight, labels) + ctx.tp_group = tp_group + ctx.reduction = reduction + ctx.tp_rank = tp_rank + ctx.tp_world_size = tp_world_size + + return logprobs.to(torch.float32) + + @staticmethod + def backward(ctx, g_logprobs: torch.Tensor): + hidden, weight, labels = ctx.saved_tensors + tp_group = ctx.tp_group + reduction = ctx.reduction + tp_rank = ctx.tp_rank + tp_world_size = ctx.tp_world_size + + num_tokens, dim = hidden.shape + + if reduction == "mean": + _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,)) + elif reduction == "sum": + _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,)) + else: + _g_logprobs = g_logprobs + + # re-compute whole_logits + logits = hidden.to(torch.float32) @ weight.T.to(torch.float32) + whole_logits = torch.empty( + (logits.shape[0], logits.shape[-1] * tp_world_size), + dtype=logits.dtype, + device=logits.device, + ) + whole_logits_ref = [ + whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]] + for i in range(tp_world_size) + ] + dist.all_gather(whole_logits_ref, logits, group=tp_group) + + one_hot = torch.zeros_like(whole_logits) + one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1) + + pd = torch.nn.functional.softmax(whole_logits, dim=-1) + d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1) + d_logits = d_logits.to(hidden.dtype) + + local_size = weight.size(0) + local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size] + + local_d_hidden = local_d_logits @ weight + local_d_weight = local_d_logits.T @ hidden + + dist.all_reduce(local_d_hidden, op=dist.ReduceOp.SUM, group=tp_group) + + return local_d_hidden, local_d_weight, None, None, None + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("reduction", ["mean", "sum", "none"]) + @pytest.mark.parametrize("problem", [(4096, 129280, 8192)]) + def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem): + num_tokens, vocabsize, dim = problem + vocabsize = vocabsize // self.tp_world_size + + hidden = ( + torch.empty((num_tokens, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, (num_tokens,), dtype=torch.long, device="cuda") + + # ------------ forward pass ------------ # + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + # single GPU + whole_weight = torch.empty( + (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda" + ) + whole_weight_view = [ + whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size) + ] + dist.all_gather(whole_weight_view, weight, group=self.tp_group) + whole_weight = whole_weight.clone().requires_grad_() + logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu( + hidden, whole_weight, labels, reduction=reduction + ) + + # TP + logprobs_tp = self.TorchLinearCrossEntropy.apply( + hidden, weight, labels, self.tp_group, reduction + ) + torch.testing.assert_close(logprobs_single_gpu, logprobs_tp) + + # ------------ backward pass ------------ # + g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + # single GPU + (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad( + (logprobs_single_gpu,), (hidden, whole_weight), (g_logprobs,), retain_graph=False + ) + + # TP + (d_hidden_tp, d_weight_tp) = torch.autograd.grad( + (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.testing.assert_close(d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3) + local_d_weight_single_gpu = d_weight_single_gpu[ + self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], : + ] + torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3) + + @staticmethod + def get_problems(): + return [ + (80, 125, 64), + (80, 152064, 64), + (1024, 152064, 4096), + (4096, 152063, 8192), + ((1, 4096), 152064, 8192), + ((2, 4096), 152064, 8192), + ] + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("reduction", ["mean", "sum", "none"]) + @pytest.mark.parametrize("problem", get_problems()) + def test_correctness(self, dtype, reduction, problem): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + # ------ forward pass ------ # + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, tp_group=self.tp_group, reduction=reduction + ) + + torch.testing.assert_close(torch_logprobs, custom_logprobs) + + # ------- backward pass ------- # + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-4, rtol=1e-4) + + @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + def test_performance(self, problem, dtype, reduction): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + torch_fwd_latency = list() + torch_bwd_latency = list() + custom_fwd_latency = list() + custom_bwd_latency = list() + + iterations = 5 + for i in range(iterations): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + # ------ forward pass ------ # + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + start_event.record() + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + end_event.record() + torch.cuda.synchronize() + torch_fwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, tp_group=self.tp_group, reduction=reduction + ) + end_event.record() + torch.cuda.synchronize() + custom_fwd_latency.append(start_event.elapsed_time(end_event)) + + # ------- backward pass ------- # + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + start_event.record() + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + torch_bwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + custom_bwd_latency.append(start_event.elapsed_time(end_event)) + + # --- remove first latency due to warmup --- # + torch_fwd_latency = torch_fwd_latency[1:] + torch_bwd_latency = torch_bwd_latency[1:] + custom_fwd_latency = custom_fwd_latency[1:] + custom_bwd_latency = custom_bwd_latency[1:] + + if self.is_chief: + print() + print( + f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:" + ) + print( + f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms" + ) + + @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + def test_storage(self, problem, dtype, reduction): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens + + if self.is_chief: + print() + print( + f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:" + ) + + def torch_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + torch.cuda.synchronize() + torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB" + ) + + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB" + ) + + def custom_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + custom_logprobs = linear_cross_entropy( + hidden, weight, labels, tp_group=self.tp_group, reduction=reduction + ) + torch.cuda.synchronize() + custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB" + ) + + g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB" + ) + + self.cleanup() + torch_storage() + self.cleanup() + custom_storage() + + +@pytest.mark.skipif( + "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2, + reason="Requires torchrun with multiple GPUs", +) +@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10") +@pytest.mark.usefixtures("distributed_context") +class TestFusedLinearCrossEntropySequenceParallel: + @pytest.fixture(autouse=True) + def setup_attrs(self, distributed_context): + """ + Setup attributes for the test class. + """ + self.tp_group = distributed_context.group + self.tp_rank = distributed_context.rank + self.tp_world_size = distributed_context.world_size + self.is_chief = distributed_context.is_chief + + @staticmethod + def timed_barrier(timeout_s=10): + import time + + work = torch.distributed.barrier(async_op=True) + t0 = time.time() + while not work.is_completed(): + if time.time() - t0 > timeout_s: + exit(1) + time.sleep(0.05) + work.wait() + + def cleanup(self): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + import gc + + gc.collect() + torch.cuda.synchronize() + + @staticmethod + def torch_linear_cross_entropy_single_gpu( + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + reduction: typing.Optional[str] = "mean", + ): + logits = hidden.to(torch.float32) @ weight.T.to(torch.float32) + logprobs = torch.nn.functional.cross_entropy( + logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction + ) + return logprobs.to(torch.float32) + + class TorchLinearCrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + hidden: torch.Tensor, + weight: torch.Tensor, + labels: torch.Tensor, + tp_group: torch.distributed.ProcessGroup, + reduction: typing.Optional[str] = "mean", + ): + tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group) + tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group) + + whole_hidden = torch.empty( + (hidden.shape[0] * tp_world_size, hidden.shape[-1]), + dtype=hidden.dtype, + device=hidden.device, + ) + dist.all_gather_into_tensor(whole_hidden, hidden, group=tp_group) + + logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32) + + whole_logits = torch.empty( + (logits.shape[0], logits.shape[-1] * tp_world_size), + dtype=logits.dtype, + device=logits.device, + ) + whole_logits_ref = [ + whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]] + for i in range(tp_world_size) + ] + dist.all_gather(whole_logits_ref, logits, group=tp_group) + + logprobs = torch.nn.functional.cross_entropy( + whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction + ) + + # If we don't preserve whole_logits, + # we need to re-compute it in the backward pass + ctx.save_for_backward(whole_hidden, weight, labels) + ctx.tp_group = tp_group + ctx.reduction = reduction + ctx.tp_rank = tp_rank + ctx.tp_world_size = tp_world_size + + return logprobs.to(torch.float32) + + @staticmethod + def backward(ctx, g_logprobs: torch.Tensor): + whole_hidden, weight, labels = ctx.saved_tensors + tp_group = ctx.tp_group + reduction = ctx.reduction + tp_rank = ctx.tp_rank + tp_world_size = ctx.tp_world_size + + num_tokens, dim = whole_hidden.shape + + if reduction == "mean": + _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,)) + elif reduction == "sum": + _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,)) + else: + _g_logprobs = g_logprobs + + # re-compute whole_logits + logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32) + whole_logits = torch.empty( + (logits.shape[0], logits.shape[-1] * tp_world_size), + dtype=logits.dtype, + device=logits.device, + ) + whole_logits_ref = [ + whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]] + for i in range(tp_world_size) + ] + dist.all_gather(whole_logits_ref, logits, group=tp_group) + + one_hot = torch.zeros_like(whole_logits) + one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1) + + pd = torch.nn.functional.softmax(whole_logits, dim=-1) + d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1) + d_logits = d_logits.to(whole_hidden.dtype) + + local_size = weight.size(0) + local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size] + + d_hidden = local_d_logits @ weight + local_d_weight = local_d_logits.T @ whole_hidden + + # dist.all_reduce( + # local_d_hidden, + # op=dist.ReduceOp.SUM, + # group=tp_group + # ) + + # split the local_d_hidden along the sequence length dimension + local_num_tokens = num_tokens // tp_world_size + # local_d_hidden = local_d_hidden[tp_rank * local_num_tokens : (tp_rank + 1) * local_num_tokens, :] + + local_d_hidden = torch.empty( + (local_num_tokens, dim), dtype=weight.dtype, device=weight.device + ) + dist.reduce_scatter_tensor( + local_d_hidden, d_hidden, op=dist.ReduceOp.SUM, group=tp_group + ) + return local_d_hidden, local_d_weight, None, None, None + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("reduction", ["mean", "sum", "none"]) + @pytest.mark.parametrize("problem", [(256, 129280, 8192)]) + def test_torch_sp_vs_single_gpu(self, dtype, reduction, problem): + num_tokens, vocabsize, dim = problem + vocabsize = vocabsize // self.tp_world_size + + hidden = ( + torch.empty((num_tokens, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint( + 0, vocabsize, (num_tokens * self.tp_world_size,), dtype=torch.long, device="cuda" + ) + + # ------------ forward pass ------------ # + dist.broadcast(labels, src=0, group=self.tp_group) + + # single GPU + whole_hidden = torch.empty( + (num_tokens * self.tp_world_size, dim), dtype=dtype, device="cuda" + ) + dist.all_gather_into_tensor(whole_hidden, hidden, group=self.tp_group) + whole_hidden = whole_hidden.clone().requires_grad_() + + whole_weight = torch.empty( + (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda" + ) + whole_weight_view = [ + whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size) + ] + dist.all_gather(whole_weight_view, weight, group=self.tp_group) + whole_weight = whole_weight.clone().requires_grad_() + logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu( + whole_hidden, whole_weight, labels, reduction=reduction + ) + + # TP + logprobs_tp = self.TorchLinearCrossEntropy.apply( + hidden, weight, labels, self.tp_group, reduction + ) + torch.testing.assert_close(logprobs_single_gpu, logprobs_tp) + + # ------------ backward pass ------------ # + g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + # single GPU + (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad( + (logprobs_single_gpu,), (whole_hidden, whole_weight), (g_logprobs,), retain_graph=False + ) + + # TP + (d_hidden_tp, d_weight_tp) = torch.autograd.grad( + (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + + local_d_hidden_single_gpu = d_hidden_single_gpu[ + self.tp_rank * hidden.shape[0] : (self.tp_rank + 1) * hidden.shape[0], : + ] + torch.testing.assert_close(local_d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3) + local_d_weight_single_gpu = d_weight_single_gpu[ + self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], : + ] + torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3) + + self.cleanup() + + @staticmethod + def get_problems(): + return [ + (80, 125, 64), + (80, 152064, 64), + (1024, 152064, 4096), + (4096, 15206, 1024), + ((1, 4096), 15206, 1024), + ((4, 1024), 15206, 1024), + ] + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) + @pytest.mark.parametrize("reduction", ["mean", "sum", "none"]) + @pytest.mark.parametrize("problem", get_problems()) + def test_correctness(self, dtype, reduction, problem): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = ( + (num_tokens * self.tp_world_size,) + if isinstance(num_tokens, int) + else (num_tokens[0] * self.tp_world_size, *num_tokens[1:]) + ) + + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + # ------ forward pass ------ # + dist.broadcast(labels, src=0, group=self.tp_group) + + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + + custom_logprobs = linear_cross_entropy( + hidden, + weight, + labels, + tp_group=self.tp_group, + reduction=reduction, + sequence_parallel=True, + ) + + torch.testing.assert_close(torch_logprobs, custom_logprobs) + + # ------- backward pass ------- # + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + + # in case one GPU failed, and leading to hang + torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-3, rtol=1e-3) + self.timed_barrier() + + self.cleanup() + + @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + def test_performance(self, problem, dtype, reduction): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = ( + (num_tokens * self.tp_world_size,) + if isinstance(num_tokens, int) + else (num_tokens[0] * self.tp_world_size, *num_tokens[1:]) + ) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + torch_fwd_latency = list() + torch_bwd_latency = list() + custom_fwd_latency = list() + custom_bwd_latency = list() + + iterations = 5 + for i in range(iterations): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + # ------ forward pass ------ # + dist.broadcast(labels, src=0, group=self.tp_group) + + start_event.record() + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + end_event.record() + torch.cuda.synchronize() + torch_fwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + custom_logprobs = linear_cross_entropy( + hidden, + weight, + labels, + tp_group=self.tp_group, + reduction=reduction, + sequence_parallel=True, + ) + end_event.record() + torch.cuda.synchronize() + custom_fwd_latency.append(start_event.elapsed_time(end_event)) + + # ------- backward pass ------- # + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + start_event.record() + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + torch_bwd_latency.append(start_event.elapsed_time(end_event)) + + start_event.record() + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + end_event.record() + torch.cuda.synchronize() + custom_bwd_latency.append(start_event.elapsed_time(end_event)) + + # --- remove first latency due to warmup --- # + torch_fwd_latency = torch_fwd_latency[1:] + torch_bwd_latency = torch_bwd_latency[1:] + custom_fwd_latency = custom_fwd_latency[1:] + custom_bwd_latency = custom_bwd_latency[1:] + + if self.is_chief: + print() + print( + f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:" + ) + print( + f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms" + ) + print( + f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms" + ) + print( + f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms" + ) + + @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("reduction", ["mean"]) + def test_storage(self, problem, dtype, reduction): + num_tokens, vocabsize, dim = problem + hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim) + labels_shape = ( + (num_tokens * self.tp_world_size,) + if isinstance(num_tokens, int) + else (num_tokens[0] * self.tp_world_size, *num_tokens[1:]) + ) + + if self.is_chief: + print() + print( + f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:" + ) + + def torch_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + torch_logprobs = self.TorchLinearCrossEntropy.apply( + hidden.view(-1, dim), weight, labels, self.tp_group, reduction + ) + torch.cuda.synchronize() + torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB" + ) + + g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + (d_hidden_torch, d_weight_torch) = torch.autograd.grad( + (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB" + ) + + def custom_storage(): + hidden = ( + torch.empty(hidden_shape, dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + weight = ( + torch.empty((vocabsize, dim), dtype=dtype, device="cuda") + .uniform_(-0.1, 0.1) + .requires_grad_() + ) + labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda") + + dist.broadcast(hidden, src=0, group=self.tp_group) + dist.broadcast(labels, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + custom_logprobs = linear_cross_entropy( + hidden, + weight, + labels, + tp_group=self.tp_group, + reduction=reduction, + sequence_parallel=True, + ) + torch.cuda.synchronize() + custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB" + ) + + g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1) + dist.broadcast(g_logprobs, src=0, group=self.tp_group) + + torch.cuda.reset_peak_memory_stats() + (d_hidden_custom, d_weight_custom) = torch.autograd.grad( + (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False + ) + torch.cuda.synchronize() + custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 + if self.is_chief: + print( + f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB" + ) + + self.cleanup() + torch_storage() + self.cleanup() + custom_storage() From 9cf6838aec19fd17be4f0c975c38e9b95621fc9c Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Fri, 5 Dec 2025 11:40:37 +0800 Subject: [PATCH 182/248] Fix gpt_layer_spec for frequently linear attention (#2481) Co-authored-by: Kunlun Li <94586211+kunlunl@users.noreply.github.com> --- gpt_builders.py | 6 +++-- megatron/core/datasets/retro/config/config.py | 3 +++ megatron/core/model_parallel_config.py | 4 ++-- ...rimental_attention_variant_module_specs.py | 6 +++++ megatron/core/models/gpt/gpt_layer_specs.py | 24 +++++++++++++------ megatron/core/models/retro/config.py | 3 ++- .../core/transformer/transformer_config.py | 15 ++++++++++++ megatron/training/arguments.py | 3 ++- megatron/training/training.py | 11 ++++++--- 9 files changed, 59 insertions(+), 16 deletions(-) diff --git a/gpt_builders.py b/gpt_builders.py index 61d159b9967..2850354553b 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -8,6 +8,9 @@ get_gpt_mtp_block_spec, get_gpt_decoder_layer_specs, ) +from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( + is_linear_attention_variant, +) from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import ( get_gpt_heterogeneous_layer_spec, ) @@ -42,8 +45,7 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None): else: use_te = args.transformer_impl == "transformer_engine" - linear_attention_variants = ["gated_delta_net"] - if args.num_experts or args.experimental_attention_variant in linear_attention_variants: + if args.num_experts or is_linear_attention_variant(args.experimental_attention_variant): # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( config, diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py index ac9ca841242..73f34a47545 100644 --- a/megatron/core/datasets/retro/config/config.py +++ b/megatron/core/datasets/retro/config/config.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from megatron.core.transformer import TransformerConfig +from megatron.core.utils import experimental_api from .bert_embedders import RetroBertEmbedders from .gpt_chunk_datasets import RetroGPTChunkDatasets @@ -12,7 +13,9 @@ @dataclass +@experimental_api class RetroPreprocessingConfig(TransformerConfig): + # pylint: disable=line-too-long """Configuration object for Retro preprocessing. *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index e75ff4a0273..129135c4cc0 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -6,11 +6,11 @@ import torch -from megatron.core.utils import internal_api +from megatron.core.utils import experimental_api @dataclass -@internal_api +@experimental_api class ModelParallelConfig: """Base configuration for Megatron Core diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py index cbe59618baf..e6d6fa03ce7 100644 --- a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py +++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py @@ -19,6 +19,12 @@ from megatron.core.transformer.spec_utils import ModuleSpec +def is_linear_attention_variant(experimental_attention_variant: str) -> bool: + """Check if the experimental attention variant is a linear attention variant.""" + linear_attention_variants = ["gated_delta_net"] + return experimental_attention_variant in linear_attention_variants + + def get_gated_delta_net_module_spec_for_backend( backend: BackendSpecProvider, normalization: Optional[str] = None ) -> ModuleSpec: diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 5395b158749..f25408e9553 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -7,6 +7,7 @@ from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( get_experimental_attention_variant_module_spec_for_backend, + is_linear_attention_variant, ) from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -536,20 +537,29 @@ def get_gpt_decoder_layer_specs( num_experts = None moe_grouped_gemm = None if attention_type == "linear_attention": - linear_attention_variants = ["gated_delta_net"] - if config.experimental_attention_variant not in linear_attention_variants: + multi_latent_attention = None + if is_linear_attention_variant(config.experimental_attention_variant): + # There exists linear attention layer in the model. + experimental_attention_variant = config.experimental_attention_variant + else: # Skip if there is no linear attention layer in the model. continue - multi_latent_attention = None else: multi_latent_attention = config.multi_latent_attention + if is_linear_attention_variant(config.experimental_attention_variant): + # experimental_attention_variant is a linear attention variant, + # so softmax attention is regular attention layer. + experimental_attention_variant = None + else: + # Softmax attention is an experimental attention variant. + experimental_attention_variant = config.experimental_attention_variant layer_spec_key = f"{mlp_type}_{attention_type}" layer_spec_dict[layer_spec_key] = get_layer_spec_fn( num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, multi_latent_attention=multi_latent_attention, - experimental_attention_variant=config.experimental_attention_variant, + experimental_attention_variant=experimental_attention_variant, **get_layer_spec_kwargs, ) @@ -592,13 +602,13 @@ def get_gpt_decoder_layer_specs( f"current linear attention pattern: {config.linear_attention_freq}" ) elif config.linear_attention_freq is None: - linear_attention_variants = ["gated_delta_net"] - if config.experimental_attention_variant not in linear_attention_variants: + if not is_linear_attention_variant(config.experimental_attention_variant): linear_attention_pattern = [0] * config.num_layers else: linear_attention_pattern = [1] * config.num_layers warnings.warn( - "Linear attention type is specified but linear_attention_freq is None. " + f"Linear attention type {config.experimental_attention_variant} is specified " + "but linear_attention_freq is None. " "Setting linear_attention_pattern to [1] * config.num_layers as default." ) else: diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py index 1b486767264..4e45be30b2e 100644 --- a/megatron/core/models/retro/config.py +++ b/megatron/core/models/retro/config.py @@ -7,10 +7,11 @@ from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend -from megatron.core.utils import is_te_min_version +from megatron.core.utils import experimental_api, is_te_min_version @dataclass +@experimental_api class RetroConfig(TransformerConfig): """Configuration object for Retro models.""" diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index a3a16754977..31dd5a98a58 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -11,6 +11,7 @@ from megatron.core.quantization.quant_config import RecipeConfig from megatron.core.transformer.enums import AttnBackend, CudaGraphScope from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout +from megatron.core.utils import experimental_api from ..fusions.fused_bias_geglu import quick_gelu from ..model_parallel_config import ModelParallelConfig @@ -31,6 +32,7 @@ @dataclass +@experimental_api class TransformerConfig(ModelParallelConfig): """Configuration object for megatron-core transformers. @@ -241,6 +243,10 @@ class TransformerConfig(ModelParallelConfig): #################### # attention variant: gated_delta_net #################### + linear_attention_type: Optional[str] = None + """Type of linear attention to use. + Deprecated. Use experimental_attention_variant instead.""" + linear_attention_freq: Optional[Union[int, List[int]]] = None """Frequency between LA (linear attention) layers and SDPA (scaled dot-product attention) layers. @@ -877,6 +883,14 @@ def __post_init__(self): f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." ) + if self.linear_attention_type is not None: + warnings.warn( + "linear_attention_type is deprecated, " + "use experimental_attention_variant instead." + ) + self.experimental_attention_variant = self.linear_attention_type + self.linear_attention_type = None + if self.experimental_attention_variant in ["gated_delta_net"]: assert ( self.linear_attention_freq is not None @@ -1912,6 +1926,7 @@ def __post_init__(self): @dataclass +@experimental_api class MLATransformerConfig(TransformerConfig): """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers. diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 2c87532c919..757f2b63de4 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1201,6 +1201,7 @@ def validate_args(args, defaults={}): args.no_load_rng = True print('Warning: disabling --no-load-rng for upcycling.') + # Experimental attention variant check if args.linear_attention_type is not None: print_rank_0( '--linear-attention-type is deprecated, use --experimental-attention-variant instead.', @@ -1209,7 +1210,7 @@ def validate_args(args, defaults={}): args.experimental_attention_variant = args.linear_attention_type del args.linear_attention_type - # Muon optimizercheck + # Muon optimizer check if 'muon' in args.optimizer: assert not args.use_distributed_optimizer, "Muon optimizer does not support distributed optimizer for now." assert not args.use_torch_fsdp2, "Muon optimizer does not support Torch-FSDP2 for now." diff --git a/megatron/training/training.py b/megatron/training/training.py index a732e3917e5..f7731ab3c1a 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -48,6 +48,9 @@ from megatron.core import mpu, tensor_parallel +from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( + is_linear_attention_variant, +) from megatron.core.utils import ( check_param_hashes_across_dp_replicas, get_attr_wrapped_model, @@ -379,8 +382,7 @@ def transformer_flops(): ) ) - linear_attention_variants = ["gated_delta_net"] - if args.experimental_attention_variant in linear_attention_variants: + if is_linear_attention_variant(args.experimental_attention_variant): # Calculate number of dense and MoE Transformer MLPs. if isinstance(args.linear_attention_freq, int): linear_attention_pattern = [ @@ -433,7 +435,10 @@ def transformer_flops(): ) ) else: - raise ValueError(f"Invalid linear_attention_type: {args.linear_attention_type}") + raise ValueError( + "Invalid experimental_attention_variant: " + f"{args.experimental_attention_variant}" + ) else: num_linear_attention_layers = 0 linear_self_attn_term = 0 From 89fe8953cd0f46cb1f59cdfbb8647e73a7dcbdd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niccol=C3=B2=20Ajroldi?= <61059403+Niccolo-Ajroldi@users.noreply.github.com> Date: Fri, 5 Dec 2025 07:16:38 +0100 Subject: [PATCH 183/248] Skip trainloader when `args.skip_train` is True (#2501) --- megatron/training/training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/training/training.py b/megatron/training/training.py index f7731ab3c1a..c29c48d4c9f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -2990,7 +2990,8 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider valid_ds = [valid_ds] if not isinstance(valid_ds, list) else valid_ds # Build dataloders. - train_dataloader = build_pretraining_data_loader(train_ds, args.consumed_train_samples) + if not args.skip_train: + train_dataloader = build_pretraining_data_loader(train_ds, args.consumed_train_samples) valid_dataloaders = [] for valid_d in valid_ds: From a6d86a6da6591fd27b77e5e732690ab65632a8a0 Mon Sep 17 00:00:00 2001 From: Deyu Fu Date: Fri, 5 Dec 2025 15:40:40 +0800 Subject: [PATCH 184/248] [DEV] fixes for muon(qwen3-next, ep multi-adam) (#2564) Signed-off-by: Deyu Fu --- megatron/core/optimizer/muon.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py index ddf20b0abb8..b6af7a3e188 100644 --- a/megatron/core/optimizer/muon.py +++ b/megatron/core/optimizer/muon.py @@ -234,9 +234,10 @@ def get_megatron_muon_optimizer( # TODO(deyuf): support MLA if 'linear_qkv.weight' in name and len(param.shape) == 2: param.is_qkv = True - # TODO(deyuf): might not be sufficient for future algorithm. revisit this conditioning - if not getattr(param, 'is_embedding_or_output_parameter', False) and not ( - len(param.shape) == 1 + # TODO(deyuf): currently only allow 2D non-embedding weight to avoid breaking + if ( + not getattr(param, 'is_embedding_or_output_parameter', False) + and len(param.shape) == 2 ): linear_params.append(param) else: @@ -339,6 +340,7 @@ def adam_init_state_fn(opt, config=None): param.requires_grad = True # chain everything together + init_fns = [muon_init_state_fn] + len(chained_adam.chained_optimizers) * [adam_init_state_fn] optimizers += chained_adam.chained_optimizers if layer_wise_distributed_optimizer: @@ -346,9 +348,6 @@ def adam_init_state_fn(opt, config=None): if reset_config_bf16: config.bf16 = True return LayerWiseDistributedOptimizer( - optimizers, - config, - pg_collection, - init_state_fn_list=[muon_init_state_fn, adam_init_state_fn], + optimizers, config, pg_collection, init_state_fn_list=init_fns ) return ChainedOptimizer(optimizers) From aee4a74bb69838c08c2b251b143bb9b3d5795874 Mon Sep 17 00:00:00 2001 From: HaochenYuan <106647990+HaochenYuan@users.noreply.github.com> Date: Mon, 8 Dec 2025 18:20:58 +0800 Subject: [PATCH 185/248] [Dev] remove fp16 assert in moe_grouped_gemm & EP (#2494) --- megatron/core/transformer/moe/experts.py | 1 + megatron/training/arguments.py | 3 - .../transformer/moe/test_moe_layer.py | 84 +++++++++++++++++++ 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index 83cf5b51ffc..5eeafdd8d1d 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -238,6 +238,7 @@ def forward( permuted_probs: torch.Tensor, ): """Forward step of the GroupedMLP.""" + assert self.config.bf16, "Currently GroupedGEMM for MoE only supports bf16." if self.activation_recompute: self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput() diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 757f2b63de4..682bd94bdf9 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -900,7 +900,6 @@ def validate_args(args, defaults={}): 'residual connection in fp32 only supported when using fp16 or bf16.' if args.moe_grouped_gemm: - assert args.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.' dc = torch.cuda.get_device_capability() assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels." @@ -1084,8 +1083,6 @@ def validate_args(args, defaults={}): assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism" assert args.num_experts % args.expert_model_parallel_size == 0, \ "Number of experts should be a multiple of expert model parallel_size." - assert not args.fp16, \ - "Expert parallelism is not supported with fp16 training." # MoE router check if isinstance(args.moe_router_load_balancing_type, list) and len(args.moe_router_load_balancing_type) == 1: diff --git a/tests/unit_tests/transformer/moe/test_moe_layer.py b/tests/unit_tests/transformer/moe/test_moe_layer.py index 59385f757b3..2a2c995257e 100644 --- a/tests/unit_tests/transformer/moe/test_moe_layer.py +++ b/tests/unit_tests/transformer/moe/test_moe_layer.py @@ -192,3 +192,87 @@ def test_interleave_transformer_block(self, moe_layer_freq): def teardown_method(self, method): Utils.destroy_model_parallel() + + +class TestMoELayerFP16: + """Test MoE layer with FP16 precision.""" + + def setup_method(self, method): + pass + + @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"]) + @pytest.mark.parametrize("num_moe_experts", [2, 4]) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 1), (2, 2), (4, 2)]) + def test_moe_layer_fp16_forward_backward( + self, num_moe_experts, moe_token_dispatcher_type, tp_size, ep_size + ): + """Test MoE layer forward and backward pass with fp16 params and inputs.""" + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, expert_model_parallel_size=ep_size + ) + _set_random_seed(seed_=123, data_parallel_random_init=False) + + hidden_size = 64 + sequence_length = 32 + micro_batch_size = 2 + + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=hidden_size, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=False, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=2, + moe_aux_loss_coeff=0.01, + moe_grouped_gemm=False, # Use SequentialMLP for fp16 test + moe_ffn_hidden_size=256, + add_bias_linear=False, + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + sequence_parallel=tp_size > 1, + fp16=True, + params_dtype=torch.float16, + ) + + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + + moe_layer = MoELayer( + transformer_config, transformer_layer_spec.submodules.mlp.submodules + ).cuda() + + hidden_states = torch.randn( + sequence_length, + micro_batch_size, + hidden_size, + device=torch.cuda.current_device(), + dtype=torch.float16, + requires_grad=True, + ) + + # Forward pass + output, _ = moe_layer(hidden_states) + + assert output.dtype == torch.float16, f"Expected fp16 output, got {output.dtype}" + assert output.shape == hidden_states.shape, f"Output shape mismatch" + + # Backward pass + loss = output.sum() + loss.backward() + + assert hidden_states.grad is not None, "Input gradients should exist" + assert ( + hidden_states.grad.dtype == torch.float16 + ), f"Expected fp16 gradients, got {hidden_states.grad.dtype}" + + for name, param in moe_layer.named_parameters(): + if param.requires_grad: + assert param.grad is not None, f"Gradient for {name} should exist" + + Utils.destroy_model_parallel() + + def teardown_method(self, method): + Utils.destroy_model_parallel() From dfe4da21527a58ce7790e5310c40c8d1fe0eb664 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 8 Dec 2025 08:03:54 -0800 Subject: [PATCH 186/248] Update tp support in muon (#2385) Signed-off-by: Hao Wu --- megatron/core/optimizer/muon.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/megatron/core/optimizer/muon.py b/megatron/core/optimizer/muon.py index b6af7a3e188..ca7c8563b6f 100644 --- a/megatron/core/optimizer/muon.py +++ b/megatron/core/optimizer/muon.py @@ -8,7 +8,6 @@ import torch from torch.optim.optimizer import ParamsT -from megatron.core import parallel_state from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.module import MegatronModule from megatron.core.utils import get_pg_size, log_single_rank @@ -76,7 +75,7 @@ def scaled_orthogonalize_fn( f'{scale_mode} scale mode, extra_scale_factor={extra_scale_factor}', ) size = [grad.size(-2), grad.size(-1)] - if partition_dim: + if partition_dim is not None: size[partition_dim] *= get_pg_size(tp_group) orth_grad = newton_schulz_tp( grad, @@ -130,8 +129,7 @@ def orthogonalize(self, p: torch.Tensor, grad: torch.Tensor, **kwargs: Any) -> t tp_group = None partition_dim = None if self.mode == "blockwise" else getattr(p, "partition_dim", None) if partition_dim == -1: - # llm-shower use different default value for partition_dim than TE. - # Because -1 is a valid index for ndarray, we decided to not overload it. + # emerging-optimizers use None instead of -1 to indicate no tensor parallel partition_dim = None if self.split_qkv and self.is_qkv_fn(p): # type: ignore[misc] @@ -201,8 +199,6 @@ def get_megatron_muon_optimizer( # before this function receive properly created collection if pg_collection is None: pg_collection = ProcessGroupCollection.use_mpu_process_groups() - pg_collection.dp_cp = parallel_state.get_data_parallel_group(with_context_parallel=True) - pg_collection.expt_dp = parallel_state.get_expert_data_parallel_group() log_single_rank(logger, logging.INFO, f'Setting up emerging optimizer with config {config}') From 1d462bd37dac21cfa14177405d4921eedb987052 Mon Sep 17 00:00:00 2001 From: "Dennis(Zhenhuan) Liu" Date: Mon, 8 Dec 2025 14:55:24 -0800 Subject: [PATCH 187/248] [DEV] Update GitHub MoE functional test cases (#2449) --- .../model_config.yaml | 2 +- tests/test_utils/recipes/moe.yaml | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml index 81b023bd86e..d3e3baa9f14 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_ep8_resume_torch_dist_muon/model_config.yaml @@ -64,4 +64,4 @@ MODEL_ARGS: --muon-momentum: 0.9 --muon-extra-scale-factor: 0.2 --muon-scale-mode: spectral -TEST_TYPE: ckpt-resume +TEST_TYPE: regular diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index 285d16c99f3..aea3ec97597 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -109,7 +109,7 @@ products: - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective] products: @@ -121,30 +121,30 @@ products: - environment: [dev] scope: [mr] platforms: [dgx_h100] - # - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon] - # products: - # - environment: [dev] - # scope: [mr, mr-github, mr-slim] - # platforms: [dgx_h100] - # - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon] - # products: - # - environment: [dev] - # scope: [mr, mr-github, mr-slim] - # platforms: [dgx_h100] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon] + products: + - environment: [dev] + scope: [mr, mr-github] + platforms: [dgx_h100] + - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon] + products: + - environment: [dev] + scope: [mr, mr-github] + platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph] products: - environment: [dev] - scope: [mr] + scope: [mr, mr-github] platforms: [dgx_h100] ####################################################################### # Super important mr, mr-github tests that run for both DEV and LTS per mr, mr-github # From 23e092f41ec8bc659020e401ddac9576c1cfed7e Mon Sep 17 00:00:00 2001 From: rj42 Date: Tue, 9 Dec 2025 13:50:31 +0300 Subject: [PATCH 188/248] Fix: don't enter branch if mtp_num_layers == 0 (#2581) Co-authored-by: Xin Yao --- megatron/core/models/gpt/gpt_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 4a6370bc49d..a1230568cbd 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -562,7 +562,8 @@ def _postprocess( if not self.post_process: return hidden_states - if self.config.mtp_num_layers is not None: + # Skip when mtp_num_layers is None or 0 + if self.config.mtp_num_layers: mtp_labels = labels.clone() hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0) hidden_states = hidden_states_list[0] From c60d5c2b7ff564c9cfbaf928d182cee7a887d87c Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 10 Dec 2025 17:27:45 +0800 Subject: [PATCH 189/248] [Dev] fix(moe): Support HybridEP and reduce memory overhead for 1F1B A2A overlap (#2201) Signed-off-by: Hongbin Liu Signed-off-by: Pingtian Li Co-authored-by: root Co-authored-by: Zijie Yan Co-authored-by: Pingtian Li --- megatron/core/model_parallel_config.py | 13 ++++ .../common/model_chunk_schedule_plan.py | 63 ++++++++++++++++--- .../core/models/gpt/fine_grained_callables.py | 48 ++++++++++---- megatron/core/pipeline_parallel/utils.py | 7 +++ .../core/transformer/transformer_config.py | 11 ++++ megatron/training/arguments.py | 2 + .../a2a_overlap/test_schedule_layer_1f1b.py | 52 +++++++++++++++ 7 files changed, 174 insertions(+), 22 deletions(-) diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index 129135c4cc0..4452bdf360b 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -265,6 +265,19 @@ class ModelParallelConfig: delay_wgrad_compute: bool = False """Delay the weight gradient computation to improve batch-level communication overlapping""" + ep_overlap_early_attn_memory_release: bool = False + """Enable early memory release of attention activations during EP overlap. + EP overlap can increase peak memory usage when the overlapped forward module allocates + more memory than what is freed by the backward module. This flag addresses this by + reordering the attention backward pass to occur earlier in the schedule. + Specifically: + - Without this flag: attn_bwd executes after moe_combine_fwd + - With this flag: attn_bwd executes before mlp_fwd + The earlier execution releases attention activations sooner, reducing peak memory. + Note: This may impact performance as moe_combine_fwd and moe_dispatch_bwd become + exposed (not overlapped with other computation). + """ + ################### # Pipeline Parallel ################### diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 401d9a81a97..486a498dd73 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -77,6 +77,7 @@ def __init__(self, layer, event, chunk_state, comp_stream, comm_stream, extra_ar """ from megatron.core.models.gpt.fine_grained_callables import TransformerLayerState + self.config = layer.config self.layer_state = TransformerLayerState() self.chunk_state = chunk_state self.layer = layer @@ -87,6 +88,32 @@ def __init__(self, layer, event, chunk_state, comp_stream, comm_stream, extra_ar # get callable nodes for transformer/mtp layer self._build_callable_nodes(event, comp_stream, comm_stream, extra_args) + def release_state(self): + """Release reference, this helps avoid memory leak.""" + if hasattr(self, 'attn') and self.attn is not None: + del self.attn + self.attn = None + if hasattr(self, 'post_attn') and self.post_attn is not None: + del self.post_attn + self.post_attn = None + if hasattr(self, 'moe_dispatch') and self.moe_dispatch is not None: + del self.moe_dispatch + self.moe_dispatch = None + if hasattr(self, 'mlp') and self.mlp is not None: + del self.mlp + self.mlp = None + if hasattr(self, 'moe_combine') and self.moe_combine is not None: + del self.moe_combine + self.moe_combine = None + if hasattr(self, 'mtp_post_process') and self.mtp_post_process is not None: + del self.mtp_post_process + self.mtp_post_process = None + if hasattr(self, 'layer_state') and self.layer_state is not None: + del self.layer_state + self.layer_state = None + if hasattr(self, 'layer'): + del self.layer + def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): """ Builds the callable nodes for the transformer/mtp layer: @@ -114,7 +141,12 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): self.layer.config.moe_token_dispatcher_type == "flex" and self.layer.config.moe_flex_dispatcher_backend == "deepep" ) + enable_hybridep = ( + self.layer.config.moe_token_dispatcher_type == "flex" + and self.layer.config.moe_flex_dispatcher_backend == "hybridep" + ) extra_args["enable_deepep"] = enable_deepep + extra_args["enable_hybridep"] = enable_hybridep extra_args["is_moe"] = is_moe extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute extra_args["is_mtp"] = is_mtp @@ -221,6 +253,10 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) b_layer.mlp.backward_dw() b_grad = b_layer.moe_dispatch.backward(b_grad) + if b_layer is not None and b_layer.config.ep_overlap_early_attn_memory_release: + b_grad = b_layer.post_attn.backward(b_grad) + b_grad = b_layer.attn.backward(b_grad) + if f_layer is not None: with f_layer.get_fp8_context(): f_input = f_layer.mlp.forward(f_input) @@ -230,7 +266,7 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) f_input = f_layer.moe_combine.forward(f_input) f_input = f_layer.mtp_post_process.forward(f_input) - if b_layer is not None: + if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release: b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) @@ -372,6 +408,10 @@ def get_layer(self, i): assert i < self.num_layers() return self._transformer_layers[i] + def pop_layer(self): + """Pops the transformer layer in FILO order.""" + return self._transformer_layers.pop() + def num_layers(self): """Gets the number of transformer layers.""" return len(self._transformer_layers) @@ -450,13 +490,14 @@ def run( b_num_layers = b_schedule_plan.num_layers() if b_schedule_plan is not None else 0 overlapped_layers = min(f_num_layers, b_num_layers) + f_layer = b_layer = None # combined forward and backward pass for overlapped layers for i in range(overlapped_layers): f_layer = f_schedule_plan.get_layer(i) - b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i) - torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_num_layers - 1 - i}b") if f_layer.layer.config.fine_grained_activation_offloading: fine_grained_offloading_set_last_layer(i == f_num_layers - 1) + b_layer = b_schedule_plan.pop_layer() + torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_schedule_plan.num_layers()}b") f_input, b_grad = TransformerLayerSchedulePlan.run( f_layer, b_layer, @@ -464,15 +505,19 @@ def run( b_grad=b_grad, is_last_layer_in_bwd=(i == b_num_layers - 1), ) + if i < b_num_layers - 1: + b_layer.release_state() torch.cuda.nvtx.range_pop() # backward pass for the remaining layers for i in range(overlapped_layers, b_num_layers): - b_layer = b_schedule_plan.get_layer(b_num_layers - 1 - i) - torch.cuda.nvtx.range_push(f"layer_{b_num_layers - 1 - i}b") + b_layer = b_schedule_plan.pop_layer() + torch.cuda.nvtx.range_push(f"layer_{b_schedule_plan.num_layers()}b") _, b_grad = TransformerLayerSchedulePlan.run( None, b_layer, b_grad=b_grad, is_last_layer_in_bwd=(i == b_num_layers - 1) ) + if i < b_num_layers - 1: + b_layer.release_state() torch.cuda.nvtx.range_pop() # forward pass for the remaining layers @@ -500,7 +545,9 @@ def run( # Delay the last attn_dw in backward pass (attn_dw of the first layer) # for overlapping with the p2p comm if b_num_layers > 0: - b_schedule_plan.get_layer(0).attn.backward_dw() + assert b_layer is not None + b_layer.attn.backward_dw() + b_layer.release_state() # post process forward if f_schedule_plan is not None and f_schedule_plan.post_process is not None: @@ -513,9 +560,7 @@ def run( f_schedule_plan.wait_current_stream() if b_schedule_plan: b_schedule_plan.wait_current_stream() - - # Release reference as early as possible, this helps avoid memory leak. - if b_schedule_plan is not None: + # Release reference as early as possible, this helps avoid memory leak. b_schedule_plan.release_state() return f_input diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 952b83f95fb..60094976a9a 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -21,6 +21,7 @@ get_mtp_layer_offset, ) from megatron.core.transformer.transformer_layer import TransformerLayer, make_viewless_tensor +from megatron.core.utils import internal_api def weak_method(method): @@ -40,13 +41,15 @@ def wrapped_func(*args, **kwarg): return wrapped_func -def should_free_input(name, is_moe, is_deepep): +@internal_api +def should_free_input(name, is_moe, enable_deepep, enable_hybridep): """Determine if the node should free its input memory. Args: name: Node name is_moe: Whether it's a MoE model - is_deepep: Whether it's a DeepEP model + enable_deepep: Whether to use DeepEP dispatcher + enable_hybridep: Whether to use HybridEP dispatcher Returns: bool: Whether to free input memory @@ -60,12 +63,13 @@ def should_free_input(name, is_moe, is_deepep): # The input and output of A2A are not needed anymore after the forward pass, # so we can free the input memory after the forward pass. free_input_nodes = { - "mlp": True, + "mlp": not enable_hybridep, "moe_combine": True, - # For non-deepep mode, the input is the un-dispatched tokens and probs before dispatch A2A - # and it's not needed anymore after the forward pass - # For deepep mode, they are both needed in backward pass, so they cannot be freed. - "moe_dispatch": not is_deepep, + # For non-DeepEP and non-HybridEP dispatcher mode, the input is the un-dispatched tokens + # and probs before dispatch A2A and it's not needed anymore after the forward pass + # For DeepEP and HybridEP dispatcher mode, they are both needed in backward pass + # and cannot be freed. + "moe_dispatch": not (enable_deepep or enable_hybridep), } return free_input_nodes.get(name, False) @@ -223,12 +227,13 @@ def __init__( it's the per_batch_state_context, o.w. nullcontext name (str): Node name, also used to determine memory strategy bwd_dw_callables (list): List of weight gradient functions for the layer. - extra_args (dict): Extra arguments for the node: is_moe, enable_deepep. + extra_args (dict): Extra arguments for nodes: is_moe, enable_deepep, enable_hybridep. """ # determine whether to free input memory is_moe = extra_args.get("is_moe", False) enable_deepep = extra_args.get("enable_deepep", False) - free_input = should_free_input(name, is_moe, enable_deepep) + enable_hybridep = extra_args.get("enable_hybridep", False) + free_input = should_free_input(name, is_moe, enable_deepep, enable_hybridep) self.delay_wgrad_compute = extra_args.get("delay_wgrad_compute", False) super().__init__( @@ -274,7 +279,13 @@ def backward_impl(self, outputs, output_grad): detached_grad = tuple([e.grad for e in self.detached]) grads = output_grad + detached_grad self.default_backward_func(outputs + self.before_detached, grads) - self._release_state() + # release the output grad memory after backward finishes, + # except when delay_wgrad_comptue is enabled, the grad should be + # kept until all modules' backward_dw has been invoked. + if self.delay_wgrad_compute: + self.output_grads = grads + self.delay_grads_release = len(self.bwd_dw_callables) > 0 + # return grads for record stream return grads @@ -285,9 +296,16 @@ def backward_dw(self): with torch.cuda.nvtx.range(f"{self.name} wgrad"): for module in self.bwd_dw_callables: module.backward_dw() + + # the output grad memory is last used in wgrad compute, should be safe to release. + assert self.delay_grads_release, "output grad memory should be valid before wgrad." + for tensor in self.output_grads: + tensor.untyped_storage().resize_(0) + self.output_grads = None + self.bwd_dw_callables = None - def _release_state(self): + def __del__(self): # Release reference as early as possible, this helps avoid memory leak. self.before_detached = None self.detached = None @@ -328,6 +346,10 @@ def build_transformer_layer_callables(layer: TransformerLayer): layer.config.moe_token_dispatcher_type == "flex" and layer.config.moe_flex_dispatcher_backend == "deepep" ) + enable_hybridep = ( + layer.config.moe_token_dispatcher_type == "flex" + and layer.config.moe_flex_dispatcher_backend == "hybridep" + ) def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): """ @@ -379,7 +401,7 @@ def submodule_dispatch_forward( Dispatches tokens to the experts based on the router output. """ token_dispatcher = layer.mlp.token_dispatcher - if enable_deepep: + if enable_deepep or enable_hybridep: # update token_probs to be the detached version, prevents # backward graph from connecting to attn submodule token_dispatcher._comm_manager.token_probs = probs @@ -396,7 +418,7 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): shared_expert_output = None dispatched_probs = node.layer_state.dispatched_probs token_dispatcher = layer.mlp.token_dispatcher - if enable_deepep: + if enable_deepep or enable_hybridep: # update dispatched_probs to be detached version, prevents # backward graph from connecting to dispatch submodule token_dispatcher._comm_manager.dispatched_probs = dispatched_probs diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index c50c6ac7964..52d401c79f9 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -182,6 +182,7 @@ def __init__( self.free_input = free_input self.inputs = None self.outputs = None + self.delay_grads_release = False def default_backward_func(self, outputs, output_grad): """Default backward function""" @@ -263,6 +264,12 @@ def _backward(self, *output_grad): for g in output_grad: if g is not None: g.record_stream(self.stream) + # Manually trigger the memory release of dgrad tensor + # to avoid delayed garbage collection. If + # delay_grads_release is True, dgrad is last used in + # wgrad compute and skip the release here. + if not self.delay_grads_release: + g.untyped_storage().resize_(0) grads = self.get_grad() self._release_state() diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 31dd5a98a58..fcc45a54c87 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1843,6 +1843,11 @@ def __post_init__(self): assert ( self.mtp_num_layers is None or self.mtp_num_layers == 1 ), 'MTP layernum only supports 1 when enabling overlap_moe_expert_parallel_comm.' + if self.mtp_num_layers == 1: + assert self.pipeline_model_parallel_size > 1, ( + 'Pipeline model parallel size must be larger than 1 ' + 'when enabling overlap_moe_expert_parallel_comm with MTP layer.' + ) # Check delay_wgrad_compute compatibility if self.delay_wgrad_compute: @@ -1853,6 +1858,12 @@ def __post_init__(self): not self.moe_use_legacy_grouped_gemm ), 'delay_wgrad_compute is not supported with legacy groupedgemm implementation' + if self.ep_overlap_early_attn_memory_release: + assert self.overlap_moe_expert_parallel_comm, ( + 'overlap_moe_expert_parallel_comm must be enabled when enabling ' + 'ep_overlap_early_attn_memory_release' + ) + if self.context_parallel_size > 1 and self.cp_comm_type is not None: if isinstance(self.cp_comm_type, list): assert len(self.cp_comm_type) == self.num_layers, ( diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 682bd94bdf9..847f1531767 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -3348,6 +3348,8 @@ def _add_moe_args(parser): help='Overlap the EP A2A communication by batch-level overlapping in 1f1b stage.') group.add_argument('--delay-wgrad-compute', action='store_true', help='Delay the wgrad compute for batch-level overlapping') + group.add_argument('--ep-overlap-early-attn-memory-release', action='store_true', + help='Release the memory of the attention module early in EP overlap.') group.add_argument('--moe-upcycling-granularity', type=int, default=1, help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. ' diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 3ebffb810e5..7fb97f6e586 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -347,6 +347,58 @@ def test_transformer_layer_overlap_shared_expert(self): comp_res = compare_captures(capture_ref, capture_a2a_overlap, True) assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}" + @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0") + def test_transformer_layer_overlap_early_attn_memory_release(self): + """ + Verifies all-to-all overlap optimization in transformer layer with early attn memory release + produces the same results as the reference implementation. + """ + extra_kwargs = { + "moe_token_dispatcher_type": "alltoall", + "ep_overlap_early_attn_memory_release": True, + "overlap_moe_expert_parallel_comm": True, + } + overlap_config = get_test_config(extra_kwargs=extra_kwargs) + ref_config = get_test_config(extra_kwargs=extra_kwargs) + microbatches = 4 + with deterministic_mode(): + transformer_layer_spec = get_gpt_decoder_block_spec( + config=ref_config, use_transformer_engine=True + ) + gpt_model = GPTModel( + config=ref_config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=100, + pre_process=True, + post_process=True, + max_sequence_length=300, + ) + + params = reset_model(gpt_model) + input_tensors = [build_data() for _ in range(microbatches)] + + fp8_context = get_fp8_context(ref_config, 0) if ref_config.fp8 else nullcontext() + with fp8_context: + capture_ref = run_transformer_layer_ref_with_capture( + gpt_model, input_tensors, microbatches + ) + del gpt_model + + gpt_model = GPTModel( + config=overlap_config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=100, + pre_process=True, + post_process=True, + max_sequence_length=300, + ) + reset_model(gpt_model, params) + capture_a2a_overlap = run_transformer_layer_a2a_overlap_with_capture( + gpt_model, input_tensors, microbatches + ) + comp_res = compare_captures(capture_ref, capture_a2a_overlap, True) + assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}" + @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0") @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types()) @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags()) From 2d398b42fd4237fffb553109563d73ac099751c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 10 Dec 2025 20:28:35 -0800 Subject: [PATCH 190/248] chore: Bump baseline (#2626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- ...k_api_backwards_compatibility_workflow.yml | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 0ccaa8ccc5e..42db9486cac 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -28,7 +28,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 - + - name: Check if relevant files changed id: check_files run: | @@ -83,7 +83,7 @@ jobs: if: needs.pre-flight.outputs.should_skip != 'true' name: Check API Backward Compatibility runs-on: ubuntu-latest - + # ============================================================================ # Configuration Parameters (modify here) # ============================================================================ @@ -91,24 +91,24 @@ jobs: # Default baseline for automatic PR checks # Can be: branch name (e.g., 'main'), commit hash, or tag # Will be resolved to commit hash during execution - DEFAULT_BASELINE: '274e04d21fbcb7f53f63de992ee1217f275f1cf2' + DEFAULT_BASELINE: 'ed804b49860201e7103ce0f9c1129a330a384a65' # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*') TAG_PATTERN: 'core_v*' # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only) TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$' # ============================================================================ - + steps: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 # Need full history to access baseline ref - + fetch-depth: 0 # Need full history to access baseline ref + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.12' - + - name: Install griffe run: | python -m pip install --upgrade pip @@ -116,7 +116,7 @@ jobs: python -c "import griffe; print('Griffe installed successfully')" python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed" python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed" - + - name: Determine baseline reference id: baseline run: | @@ -134,13 +134,13 @@ jobs: # BASELINE_REF="${{ env.DEFAULT_BASELINE }}" # fi fi - + # Resolve baseline to commit hash (works for branches, tags, or commit hashes) BASELINE_HASH=$(git rev-parse "$BASELINE_REF") - + echo "baseline=$BASELINE_HASH" >> $GITHUB_OUTPUT echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)" - + - name: Run compatibility check id: compat_check run: | @@ -148,13 +148,13 @@ jobs: python scripts/check_api_backwards_compatibility.py \ --baseline ${{ steps.baseline.outputs.baseline }} \ --verbose 2>&1 | tee compat_check_output.txt - + # Capture exit code EXIT_CODE=${PIPESTATUS[0]} echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT exit $EXIT_CODE continue-on-error: true - + - name: Fail job if breaking changes detected if: steps.compat_check.outcome == 'failure' run: | @@ -233,10 +233,10 @@ jobs: echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py" echo "❓ Questions? Check the docs or ask in #megatron-core" echo "" - + echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy." exit 1 - + - name: Success message if: steps.compat_check.outcome == 'success' run: | @@ -271,4 +271,3 @@ jobs: gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "API Backward Compatibility Check Summary") | .name' exit 1 fi - From e8a927578d0fdeb98db5d40ab7bdc81d123795f7 Mon Sep 17 00:00:00 2001 From: Tong Liu Date: Fri, 12 Dec 2025 11:48:39 +0800 Subject: [PATCH 191/248] [Dev] Use the latest Hybrid-EP (#2424) --- docker/Dockerfile.ci.dev | 2 +- megatron/core/transformer/moe/fused_a2a.py | 51 +++++-------------- .../core/transformer/moe/token_dispatcher.py | 15 ++---- 3 files changed, 18 insertions(+), 50 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 482c6af460c..5caa6003630 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -62,7 +62,7 @@ RUN bash -ex <<"EOF" git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git pushd DeepEP - git checkout 1dddd194c26911c35b4f53a148617dd73de0ffc9 + git checkout 83e0d156807f31abed4ea55c2fa6eb4b62a11b82 patch -p1 < /workspace/deepep.patch popd TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 045a93039b3..aa13b9b5b5b 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -3,6 +3,7 @@ # Copyright (c) 2025 DeepSeek # Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE +from megatron.core.utils import internal_api try: from deep_ep import Buffer @@ -328,6 +329,7 @@ def reset_hybrid_ep_buffer(): _hybrid_ep_buffer = None +@internal_api class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend @@ -343,7 +345,6 @@ def forward( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, - num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -362,11 +363,9 @@ def forward( num_sms_combine_api, fp8_dispatch, ) - # Defaultly, the output token_per_expert and num_dispatched_tokens_tensor - # will be put on the CPU to avoid the potential sync in combine/backward pass, - # but if we provide the num_dispatched_tokens and num_permuted_tokens on CPU, - # we do not need to the D2H here. - use_host_meta = num_dispatched_tokens is None or num_permuted_tokens is None + # If we provide the num_permuted_tokens, we do not need to use sync to + # wait for the data in pinned memory ready + non_blocking = num_permuted_tokens is not None # Process the dispatch ( dispatched_hidden, @@ -381,14 +380,12 @@ def forward( scaling_factor=None, num_of_experts_per_rank=num_local_experts, pad_multiple=pad_multiple, - num_dispatched_tokens=num_dispatched_tokens, num_permuted_tokens=num_permuted_tokens, - use_host_meta=use_host_meta, + non_blocking=non_blocking, ) ctx.handle = handle ctx.pad_multiple = pad_multiple - ctx.num_dispatched_tokens = num_dispatched_tokens return ( dispatched_hidden, dispatched_probs, @@ -404,36 +401,27 @@ def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_exper ''' handle = ctx.handle combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute( - hidden=grad_x, - probs=grad_probs, - handle=handle, - pad_multiple=ctx.pad_multiple, - num_dispatched_tokens=ctx.num_dispatched_tokens, + hidden=grad_x, probs=grad_probs, handle=handle, pad_multiple=ctx.pad_multiple ) return combined_hidden, None, combined_probs, None, None, None, None, None, None, None +@internal_api class HybridEPCombine(torch.autograd.Function): ''' Fused combine operation for permute + combine a2a + permute using the HybridEP backend ''' @staticmethod - def forward( - ctx, x, handle, num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None - ): + def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None): ''' Forward pass of fused combine of the HybridEP backend ''' combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute( - hidden=x, - handle=handle, - pad_multiple=pad_multiple, - num_dispatched_tokens=num_dispatched_tokens, + hidden=x, handle=handle, pad_multiple=pad_multiple ) ctx.handle = handle ctx.pad_multiple = pad_multiple - ctx.num_dispatched_tokens = num_dispatched_tokens ctx.num_permuted_tokens = num_permuted_tokens return combined_hidden @@ -448,7 +436,6 @@ def backward(ctx, grad_x): scaling_factor=None, handle=handle, pad_multiple=ctx.pad_multiple, - num_dispatched_tokens=ctx.num_dispatched_tokens, num_permuted_tokens=ctx.num_permuted_tokens, ) return dispatched_hidden, None, None, None, None @@ -456,6 +443,7 @@ def backward(ctx, grad_x): if HAVE_HYBRIDEP: + @internal_api def hybrid_ep_dispatch( x, routing_map, @@ -464,7 +452,6 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, - num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -487,10 +474,6 @@ def hybrid_ep_dispatch( Number of SMs used by the dispatch API. num_sms_combine_api (int): Number of SMs used by the combine API. - num_dispatched_tokens (int): - Number of tokens after dispatch but before permute. HybridEP uses this - to allocate buffers. If not provided, HybridEP obtains the size from - a GPU tensor, which causes a D2H synchronization. num_permuted_tokens (int): Number of tokens after permute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, @@ -507,12 +490,12 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api, num_sms_combine_api, - num_dispatched_tokens, num_permuted_tokens, pad_multiple, ) - def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple): + @internal_api + def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): ''' Perform fused combine operation for unpermute + combine a2a + unpermute using the HybridEP backend @@ -522,10 +505,6 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad Input hidden states to combine handle (EventHandle): Communication handle from dispatch operation - num_dispatched_tokens (int): - The number of tokens after unpermute but before combine. HybridEP uses this - to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, - which causes a D2H synchronization. num_permuted_tokens (int): The number of tokens before unpermute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, which causes a D2H synchronization. @@ -533,9 +512,7 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad The alignment multiple required for FP8 GEMM. If not provided, no padding is performed. ''' - return HybridEPCombine.apply( - x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple - ) + return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple) else: hybrid_ep_dispatch = None diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 61ef0b5f084..d0da38d6322 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -985,11 +985,8 @@ def __init__( if self.drop_and_pad: assert self.capacity_factor is not None self.capacity = None - # The up-bound for the number of tokens after dispatch op, -1 means no up-bound, - # which will cause a CPU sync - self.num_dispatched_tokens = None - # Actually the sum of tokens_per_expert, the up-bound for the number of tokens - # after permute op, -1 means no up-bound, will cause a CPU sync + # Actually the the up-bound for the number of tokens + # after permute op, None means no up-bound, will cause a CPU sync self.num_permuted_tokens = None # Metadata @@ -1018,12 +1015,9 @@ def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): num_experts=self.num_experts, capacity_factor=self.capacity_factor, ) - # We cannot predict the actual number of tokens after the dispatch op, - # so we set it to the worst case in drop_and_pad mode - self.num_dispatched_tokens = self.capacity * self.group.size() * self.num_local_experts # In drop_and_pad mode, the number of tokens after the permute op # can be computed on the CPU - self.num_permuted_tokens = self.num_dispatched_tokens + self.num_permuted_tokens = self.capacity * self.group.size() * self.num_local_experts self.tokens_per_expert = torch.full( (self.num_local_experts,), self.capacity * self.group.size(), dtype=torch.long ) @@ -1052,7 +1046,6 @@ def dispatch( num_local_experts=self.num_local_experts, num_sms_dispatch_api=self.config.moe_hybridep_num_sms, num_sms_combine_api=self.config.moe_hybridep_num_sms, - num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1074,7 +1067,6 @@ def combine( hidden_states = hybrid_ep_combine( x=hidden_states, handle=self.handle, - num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1084,7 +1076,6 @@ def combine( self.handle = None if not self.drop_and_pad: self.num_permuted_tokens = None - self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: From 305957aa065b65d07bd5c876dd74a571c3eca409 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 12 Dec 2025 10:04:50 -0800 Subject: [PATCH 192/248] API compat: ignore ParameterMovedBreakage for __init__ methods (#2649) Signed-off-by: Pablo Garay --- scripts/check_api_backwards_compatibility.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/scripts/check_api_backwards_compatibility.py b/scripts/check_api_backwards_compatibility.py index 4977b806433..3c66f00b619 100644 --- a/scripts/check_api_backwards_compatibility.py +++ b/scripts/check_api_backwards_compatibility.py @@ -46,13 +46,22 @@ # Decorators that exempt objects from compatibility checks EXEMPT_DECORATORS = ['internal_api', 'deprecated', 'experimental_api'] -# Breakage kinds to ignore (not actual API signature changes) +# Breakage kinds to ignore globally (not actual API signature changes) # AttributeChangedValueBreakage: Changing constant values (e.g., VERSION = "1.0" -> "2.0") # is not a breaking API change - the constant still exists with the same name IGNORED_BREAKAGE_KINDS = [ 'AttributeChangedValueBreakage', ] +# Breakage kinds to ignore only for __init__ methods +# ParameterMovedBreakage: Reordering parameters in __init__ is generally safe because: +# - Config dataclasses should always be initialized with keyword arguments +# - Adding fields to parent dataclasses shifts child __init__ params (inheritance artifact) +# - Nobody should call Config(4096, 32, ...) with positional args +IGNORED_FOR_INIT_METHODS = [ + 'ParameterMovedBreakage', +] + def has_exempt_decorator(obj: Object) -> bool: """Check if a Griffe object has any exempt decorator. @@ -217,6 +226,7 @@ def should_skip_change(change, filtered_paths: set) -> bool: A change is skipped if: - The change kind is in IGNORED_BREAKAGE_KINDS (not a signature change) + - The change kind is in IGNORED_FOR_INIT_METHODS and affects an __init__ method - The changed object itself is in filtered_paths (exact match) - The changed object is a child of an exempt object (prefix match) @@ -227,7 +237,7 @@ def should_skip_change(change, filtered_paths: set) -> bool: Returns: bool: True if the change should be skipped (filtered out) """ - # Check if this breakage kind should be ignored (not a signature change) + # Check if this breakage kind should be ignored globally (not a signature change) change_kind = type(change).__name__ if change_kind in IGNORED_BREAKAGE_KINDS: return True @@ -240,6 +250,12 @@ def should_skip_change(change, filtered_paths: set) -> bool: # e.g., "Class.__init__(param)" -> "Class.__init__" clean_path = path.split('(')[0] if '(' in path else path + # Check if this is a breakage kind we ignore for __init__ methods + # Config dataclasses should use keyword args, so parameter reordering is safe + if change_kind in IGNORED_FOR_INIT_METHODS: + if '.__init__' in clean_path: + return True + # Check exact match if clean_path in filtered_paths or path in filtered_paths: return True From e93814b4c6965c3f8639abdf690416c08937f370 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 18:01:42 -0800 Subject: [PATCH 193/248] [training migration] add training config dataclass and arg generation utility (#2651) Signed-off-by: Maanu Grover Co-authored-by: Eric Harper --- megatron/core/safe_globals.py | 2 + megatron/training/argument_utils.py | 250 +++++++++ megatron/training/arguments.py | 102 +--- megatron/training/config.py | 116 ++++ megatron/training/dist_signal_handler.py | 11 +- tests/unit_tests/test_argument_utils.py | 643 +++++++++++++++++++++++ 6 files changed, 1023 insertions(+), 101 deletions(-) create mode 100644 megatron/training/argument_utils.py create mode 100644 megatron/training/config.py create mode 100644 tests/unit_tests/test_argument_utils.py diff --git a/megatron/core/safe_globals.py b/megatron/core/safe_globals.py index ddb1dd25399..8bcfe788f60 100755 --- a/megatron/core/safe_globals.py +++ b/megatron/core/safe_globals.py @@ -3,6 +3,7 @@ from argparse import Namespace from io import BytesIO from pathlib import PosixPath +from signal import Signals from types import SimpleNamespace import torch @@ -31,6 +32,7 @@ RerunMode, RerunState, BytesIO, + Signals, ] diff --git a/megatron/training/argument_utils.py b/megatron/training/argument_utils.py new file mode 100644 index 00000000000..b9f7c7b22d1 --- /dev/null +++ b/megatron/training/argument_utils.py @@ -0,0 +1,250 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import dataclasses +import typing +import types +from typing import Any, Optional +from argparse import ArgumentParser, _ArgumentGroup +import inspect +import itertools +import builtins +import ast +import enum +from dataclasses import Field, fields + +# TODO: support arg renames + +class TypeInferenceError(Exception): + """Custom exception type to be conditionally handled by ArgumentGroupFactory.""" + pass + +class ArgumentGroupFactory: + """Utility that adds an argument group to an ArgumentParser based on the attributes of a dataclass. + + This utility uses dataclass metadata including type annotations and docstrings to automatically + infer the type, default, and other argparse keyword arguments. + + You can override or supplement the automatically inferred argparse kwargs for any + dataclass field by providing an "argparse_meta" key in the field's metadata dict. + The value should be a dict of kwargs that will be passed to ArgumentParser.add_argument(). + These metadata kwargs take precedence over the automatically inferred values. + + Example: + @dataclass + class YourConfig: + your_attribute: int | str | None = field( + default=None, + metadata={ + "argparse_meta": { + "arg_names": ["--your-arg-name1", "--your-arg-name2"], + "type": str, + "nargs": "+", + "default": "foo", + } + }, + ) + + In this example, inferring the type automatically would fail, as Unions are + not supported. However the metadata is present, so that takes precedence. + Any keyword arguments to `ArgumentParser.add_argument()` can be included in + the "argparse_meta" dict, as well as "arg_names" for the argument flag name. + + This class can also be used as a base class and extended as needed to support dataclasses + that require some customized or additional handling. + + Args: + src_cfg_class: The source dataclass type (not instance) whose fields will be + converted into command-line arguments. Each field's type annotation determines + the argument type, default values become argument defaults, and field-level + docstrings are extracted to populate argument help text. + exclude: Optional list of attribute names from `src_cfg_class` to exclude from + argument generation. Useful for omitting internal fields, computed properties, + or attributes that should be configured through other means. If None, all + dataclass fields will be converted to command-line arguments. Default: None. + """ + + def __init__(self, src_cfg_class: type, exclude: Optional[list[str]] = None) -> None: + self.src_cfg_class = src_cfg_class + self.field_docstrings = self._get_field_docstrings(src_cfg_class) + self.exclude = set(exclude) if exclude is not None else set() + + def _format_arg_name(self, config_attr_name: str, prefix: Optional[str] = None) -> str: + """Convert dataclass name into appropriate argparse flag name. + + Args: + config_attr_name: dataclass attribute name + prefix: prefix string to add to the dataclass attribute name. e.g. 'no' for bool + settings that are default True. A hyphen is added after the prefix. Default: None + """ + arg_name = config_attr_name + if prefix: + arg_name = prefix + '_' + arg_name + arg_name = "--" + arg_name.replace("_", "-") + return arg_name + + def _get_enum_kwargs(self, config_type: enum.EnumMeta) -> dict[str, Any]: + """Build kwargs for Enums. + + With these settings, the user must provide a valid enum value, e.g. + 'flash', for `AttnBackend.flash`. + """ + def enum_type_handler(cli_arg): + return config_type[cli_arg] + + return {"type": enum_type_handler, "choices": list(config_type)} + + def _extract_type(self, config_type: type) -> dict[str, Any]: + """Determine the type, nargs, and choices settings for this argument. + + Args: + config_type: attribute type from dataclass + """ + origin = typing.get_origin(config_type) + type_tuple = typing.get_args(config_type) + + if isinstance(config_type, type) and issubclass(config_type, enum.Enum): + return self._get_enum_kwargs(config_type) + + # Primitive type + if origin is None: + return {"type": config_type} + + if origin in [types.UnionType, typing.Union]: + # Handle Optional and Union + if type_tuple[1] == type(None): # Optional type. First element is value inside Optional[] + return self._extract_type(type_tuple[0]) + else: + raise TypeInferenceError(f"Unions not supported by argparse: {config_type}") + + elif origin is list: + if len(type_tuple) == 1: + kwargs = self._extract_type(type_tuple[0]) + kwargs["nargs"] = "+" + return kwargs + else: + raise TypeInferenceError(f"Multi-type lists not supported by argparse: {config_type}") + + elif origin is typing.Literal: + choices_types = [type(choice) for choice in type_tuple] + assert all([t == choices_types[0] for t in choices_types]), "Type of each choice in a Literal type should all be the same." + kwargs = {"type": choices_types[0], "choices": type_tuple} + return kwargs + else: + raise TypeInferenceError(f"Unsupported type: {config_type}") + + + def _build_argparse_kwargs_from_field(self, attribute: Field) -> dict[str, Any]: + """Assemble kwargs for add_argument(). + + Args: + attribute: dataclass attribute + """ + argparse_kwargs = {} + argparse_kwargs["arg_names"] = [self._format_arg_name(attribute.name)] + argparse_kwargs["dest"] = attribute.name + argparse_kwargs["help"] = self.field_docstrings[attribute.name] if attribute.name in self.field_docstrings else "" + + # dataclasses specifies that both should not be set + if isinstance(attribute.default, type(dataclasses.MISSING)): + # dataclasses specified default_factory must be a zero-argument callable + argparse_kwargs["default"] = attribute.default_factory() + else: + argparse_kwargs["default"] = attribute.default + + attr_argparse_meta = None + if attribute.metadata != {} and "argparse_meta" in attribute.metadata: + # save metadata here, but update at the end so the metadata has highest precedence + attr_argparse_meta = attribute.metadata["argparse_meta"] + + + # if we cannot infer the argparse type, all of this logic may fail. we try to defer + # to the developer-specified metadata if present + try: + argparse_kwargs.update(self._extract_type(attribute.type)) + + # use store_true or store_false action for enable/disable flags, which doesn't accept a 'type' + if argparse_kwargs["type"] == bool: + argparse_kwargs["action"] = "store_true" if attribute.default == False else "store_false" + argparse_kwargs.pop("type") + + # add '--no-*' and '--disable-*' prefix if this is a store_false argument + if argparse_kwargs["action"] == "store_false": + argparse_kwargs["arg_names"] = [self._format_arg_name(attribute.name, prefix="no"), self._format_arg_name(attribute.name, prefix="disable")] + except TypeInferenceError as e: + if attr_argparse_meta is not None: + print( + f"WARNING: Inferring the appropriate argparse argument type from {self.src_cfg_class} " + f"failed for {attribute.name}: {attribute.type}.\n" + "Deferring to attribute metadata. If the metadata is incomplete, 'parser.add_argument()' may fail.\n" + f"Original failure: {e}" + ) + else: + raise e + + # metadata provided by field takes precedence + if attr_argparse_meta is not None: + argparse_kwargs.update(attr_argparse_meta) + + return argparse_kwargs + + def build_group(self, parser: ArgumentParser, title: Optional[str] = None) -> _ArgumentGroup: + """Entrypoint method that adds the argument group to the parser. + + Args: + parser: The parser to add arguments to + title: Title for the argument group + """ + arg_group = parser.add_argument_group(title=title, description=self.src_cfg_class.__doc__) + for attr in fields(self.src_cfg_class): + if attr.name in self.exclude or attr.init is False: + continue + + add_arg_kwargs = self._build_argparse_kwargs_from_field(attr) + + arg_names = add_arg_kwargs.pop("arg_names") + arg_group.add_argument(*arg_names, **add_arg_kwargs) + + return arg_group + + def _get_field_docstrings(self, src_cfg_class: type) -> dict[str, str]: + """Extract field-level docstrings from a dataclass by inspecting its AST. + + Recurses on parent classes of `src_cfg_class`. + + Args: + src_cfg_class: Dataclass to get docstrings from. + """ + source = inspect.getsource(src_cfg_class) + tree = ast.parse(source) + root_node = tree.body[0] + + assert isinstance(root_node, ast.ClassDef), "Provided object must be a class." + + field_docstrings = {} + + # Iterate over body of the dataclass using 2-width sliding window. + # When 'a' is an assignment expression and 'b' is a constant, the window is + # lined up with an attribute-docstring pair. The pair can be saved to our dict. + for a, b in itertools.pairwise(root_node.body): + a_cond = isinstance(a, ast.AnnAssign) and isinstance(a.target, ast.Name) + b_cond = isinstance(b, ast.Expr) and isinstance(b.value, ast.Constant) + + if a_cond and b_cond: + # These should be guaranteed by typechecks above, but assert just in case + assert isinstance(a.target.id, str), "Dataclass attribute not in the expected format. Name is not a string." + assert isinstance(b.value.value, str), "Dataclass attribute docstring is not a string." + + # Formatting + docstring = inspect.cleandoc(b.value.value) + docstring = ' '.join(docstring.split()) + + field_docstrings[a.target.id] = docstring + + # recurse on parent class + base_classes = src_cfg_class.__bases__ + if len(base_classes) > 0: + parent_class = base_classes[0] + if parent_class.__name__ not in builtins.__dict__: + field_docstrings.update(self._get_field_docstrings(base_classes[0])) + + return field_docstrings diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 7c9e4531c6d..70d1e4b1306 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -34,7 +34,6 @@ ) from megatron.core.activations import squared_relu from megatron.core.fusions.fused_bias_geglu import quick_gelu -from megatron.training.dist_signal_handler import SIGNAL_MAP from megatron.training.utils import ( get_device_arch_version, update_use_dist_ckpt, @@ -48,6 +47,8 @@ load_quantization_recipe, ) +from megatron.training.argument_utils import ArgumentGroupFactory + def add_megatron_arguments(parser: argparse.ArgumentParser): """"Add Megatron-LM arguments to the given parser.""" @@ -2118,41 +2119,14 @@ def _add_rl_args(parser): return parser def _add_training_args(parser): - group = parser.add_argument_group(title='training') + from megatron.training.config import TrainingConfig + + train_factory = ArgumentGroupFactory(TrainingConfig) + group = train_factory.build_group(parser, "training") - group.add_argument('--micro-batch-size', type=int, default=None, - help='Batch size per model instance (local batch size). ' - 'Global batch size is local batch size times data ' - 'parallel size times number of micro batches.') group.add_argument('--batch-size', type=int, default=None, help='Old batch size parameter, do not use. ' 'Use --micro-batch-size instead') - group.add_argument('--global-batch-size', type=int, default=None, - help='Training batch size. If set, it should be a ' - 'multiple of micro-batch-size times data-parallel-size. ' - 'If this value is None, then ' - 'use micro-batch-size * data-parallel-size as the ' - 'global batch size. This choice will result in 1 for ' - 'number of micro-batches.') - group.add_argument('--rampup-batch-size', nargs='*', default=None, - help='Batch size ramp up with the following values:' - ' --rampup-batch-size ' - ' ' - ' ' - 'For example:' - ' --rampup-batch-size 16 8 300000 \\ ' - ' --global-batch-size 1024' - 'will start with global batch size 16 and over ' - ' (1024 - 16) / 8 = 126 intervals will increase' - 'the batch size linearly to 1024. In each interval' - 'we will use approximately 300000 / 126 = 2380 samples.') - group.add_argument('--decrease-batch-size-if-needed', action='store_true', default=False, - help='If set, decrease batch size if microbatch_size * dp_size' - 'does not divide batch_size. Useful for KSO (Keep Soldiering On)' - 'to continue making progress if number of healthy GPUs (and' - 'corresponding dp_size) does not support current batch_size.' - 'Old batch_size will be restored if training is re-started with' - 'dp_size that divides batch_size // microbatch_size.') group.add_argument('--recompute-activations', action='store_true', help='recompute activation to allow for training ' 'with larger models, sequences, and batch sizes.') @@ -2221,8 +2195,6 @@ def _add_training_args(parser): help='Global step to start profiling.') group.add_argument('--profile-step-end', type=int, default=12, help='Global step to stop profiling.') - group.add_argument('--iterations-to-skip', nargs='+', type=int, default=[], - help='List of iterations to skip, empty by default.') group.add_argument('--result-rejected-tracker-filename', type=str, default=None, help='Optional name of file tracking `result_rejected` events.') group.add_argument('--disable-gloo-process-groups', action='store_false', @@ -2265,47 +2237,19 @@ def _add_training_args(parser): group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.') - group.add_argument('--empty-unused-memory-level', default=0, type=int, - choices=[0, 1, 2], - help='Call torch.cuda.empty_cache() each iteration ' - '(training and eval), to reduce fragmentation.' - '0=off, 1=moderate, 2=aggressive.') group.add_argument('--deterministic-mode', action='store_true', help='Choose code that has deterministic execution. This usually ' 'means slower execution, but is good for debugging and testing.') - group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None, - help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.') group.add_argument('--calculate-per-token-loss', action='store_true', help=('Scale cross entropy loss by the number of non-padded tokens in the ' 'global batch, versus the default behavior of assuming all tokens are non-padded.')) - group.add_argument('--train-sync-interval', type=int, default=None, - help='Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU.') # deprecated group.add_argument('--checkpoint-activations', action='store_true', help='Checkpoint activation to allow for training ' 'with larger models, sequences, and batch sizes.') - group.add_argument('--train-iters', type=int, default=None, - help='Total number of iterations to train over all ' - 'training runs. Note that either train-iters or ' - 'train-samples should be provided.') - group.add_argument('--train-samples', type=int, default=None, - help='Total number of samples to train over all ' - 'training runs. Note that either train-iters or ' - 'train-samples should be provided.') group.add_argument('--log-interval', type=int, default=100, help='Report loss and timing interval.') - group.add_argument('--exit-interval', type=int, default=None, - help='Exit the program after the iteration is divisible ' - 'by this value.') - group.add_argument('--exit-duration-in-mins', type=int, default=None, - help='Exit the program after this many minutes.') - group.add_argument('--exit-signal-handler', action='store_true', - help='Dynamically save the checkpoint and shutdown the ' - 'training if signal is received') - group.add_argument('--exit-signal', type=str, default='SIGTERM', - choices=list(SIGNAL_MAP.keys()), - help='Signal to use for exit signal handler. If not specified, defaults to SIGTERM.') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') group.add_argument('--no-masked-softmax-fusion', @@ -2399,22 +2343,6 @@ def _add_training_args(parser): '--use-legacy-models to not use core models.') group.add_argument('--use-legacy-models', action='store_true', help='Use the legacy Megatron models, not Megatron-Core models.') - group.add_argument('--manual-gc', action='store_true', - help='Disable the threshold-based default garbage ' - 'collector and trigger the garbage collection manually. ' - 'Manual garbage collection helps to align the timing of ' - 'the collection across ranks which mitigates the impact ' - 'of CPU-associated jitters. When the manual gc is enabled, ' - 'garbage collection is performed only at the start and the ' - 'end of the validation routine by default.') - group.add_argument('--manual-gc-interval', type=int, default=0, - help='Training step interval to trigger manual garbage ' - 'collection. When the value is set to 0, garbage ' - 'collection is not triggered between training steps.') - group.add_argument('--no-manual-gc-eval', action='store_false', - help='When using manual garbage collection, disable ' - 'garbage collection at the start and the end of each ' - 'evaluation run.', dest='manual_gc_eval') group.add_argument('--disable-tp-comm-split-ag', action='store_false', help='Disables the All-Gather overlap with fprop GEMM.', dest='tp_comm_split_ag') @@ -2923,20 +2851,10 @@ def _add_distributed_args(parser): def _add_validation_args(parser): - group = parser.add_argument_group(title='validation') - - group.add_argument('--full-validation', action='store_true', help='If set, each time validation occurs it uses the full validation dataset(s). This currently only works for GPT datasets!') - group.add_argument('--multiple-validation-sets', action='store_true', help='If set, multiple datasets listed in the validation split are evaluated independently with a separate loss for each dataset in the list. This argument requires that no weights are included in the list') - group.add_argument('--eval-iters', type=int, default=100, - help='Number of iterations to run for evaluation' - 'validation/test for.') - group.add_argument('--eval-interval', type=int, default=1000, - help='Interval between running evaluation on ' - 'validation set.') - group.add_argument("--test-mode", action="store_true", help='Run all real-time test alongside the experiment.') - group.add_argument('--skip-train', action='store_true', - default=False, help='If set, bypass the training loop, ' - 'optionally do evaluation for validation/test, and exit.') + from megatron.training.config import ValidationConfig + + val_factory = ArgumentGroupFactory(ValidationConfig) + group = val_factory.build_group(parser, "validation") return parser diff --git a/megatron/training/config.py b/megatron/training/config.py new file mode 100644 index 00000000000..d978083372d --- /dev/null +++ b/megatron/training/config.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass, field +import signal +from typing import Literal + +@dataclass(kw_only=True) +class TrainingConfig: + """Configuration settings related to the training loop.""" + + micro_batch_size: int | None = None + """Batch size per model instance (local batch size). Global batch size is local batch size times + data parallel size times number of micro batches.""" + + global_batch_size: int | None = None + """Training batch size. If set, it should be a multiple of micro-batch-size times + data-parallel-size. If this value is None, then use micro-batch-size * data-parallel-size + as the global batch size. This choice will result in 1 for number of micro-batches.""" + + rampup_batch_size: list[int] | None = field(default=None, metadata={"argparse_meta": {"nargs": 3}}) + """Batch size ramp up with the following values: , , + + For example: + rampup-batch-size = [16, 8, 300000] + global-batch-size 1024 + will start with global batch size 16 and over (1024 - 16) / 8 = 126 intervals will increase + the batch size linearly to 1024. In each interval we will use approximately + 300000 / 126 = 2380 samples. + """ + + decrease_batch_size_if_needed: bool = False + """If set, decrease batch size if microbatch_size * dp_size does not + divide batch_size. Old batch_size will be restored if training is re-started + with dp_size that divides batch_size // microbatch_size.""" + + empty_unused_memory_level: Literal[0, 1, 2] = 0 + """Call torch.cuda.empty_cache() each iteration (training and eval), to reduce fragmentation. + 0=off, 1=moderate, 2=aggressive. + """ + + check_weight_hash_across_dp_replicas_interval: int | None = None + """Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.""" + + train_sync_interval: int | None = None + """Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU.""" + + train_iters: int | None = None + """Total number of iterations to train over all training runs. + Note that either train_iters or train_samples should be provided. + """ + + train_samples: int | None = None + """Total number of samples to train over all training runs. + Note that either train_iters or train_samples should be provided.""" + + exit_interval: int | None = None + """Exit the program after the iteration is divisible by this value.""" + + exit_duration_in_mins: int | None = None + """Exit the program after this many minutes.""" + + exit_signal_handler: bool = False + """Dynamically save the checkpoint and shutdown the training if SIGTERM is received""" + + exit_signal: signal.Signals = signal.SIGTERM + """Signal for the signal handler to detect.""" + + exit_signal_handler_for_dataloader: bool = False + """Use signal handler for dataloader workers""" + + manual_gc: bool = False + """Disable the threshold-based default garbage collector and trigger the garbage collection + manually. Manual garbage collection helps to align the timing of the collection across ranks + which mitigates the impact of CPU-associated jitters. When the manual gc is enabled, garbage + collection is performed only at the start and the end of the validation routine by default.""" + + manual_gc_interval: int = 0 + """Training step interval to trigger manual garbage collection. Values > 0 will trigger garbage + collections between training steps. + """ + + manual_gc_eval: bool = True + """When using manual garbage collection, this controls garbage collection at the start and the + end of each evaluation run. + """ + + iterations_to_skip: list[int] = field(default_factory=list) + """List of iterations to skip during training, empty by default.""" + + +@dataclass(kw_only=True) +class ValidationConfig: + """Configuration settings related to validation during or after model training.""" + + eval_iters: int | None = 100 + """Number of iterations to run for evaluation. Used for both validation and test. If not set, + evaluation will not run.""" + + eval_interval: int | None = None + """Interval between running evaluation on validation set. If not set, evaluation will not run + during training. + """ + + skip_train: bool = False + """If set, bypass the training loop, perform evaluation for validation/test, and exit.""" + + test_mode: bool = False + """Run all real-time test alongside the experiment.""" + + full_validation: bool = False + """If set, each time validation occurs it uses the full validation dataset(s). This currently only works for GPT datasets!""" + + multiple_validation_sets: bool = False + """If set, multiple datasets listed in the validation split are evaluated independently with a + separate loss for each dataset in the list. This argument requires that no weights are + included in the list. + """ diff --git a/megatron/training/dist_signal_handler.py b/megatron/training/dist_signal_handler.py index f1f3725c8a9..0ecd706fdc7 100644 --- a/megatron/training/dist_signal_handler.py +++ b/megatron/training/dist_signal_handler.py @@ -3,13 +3,6 @@ import torch -SIGNAL_MAP = { - 'SIGTERM': signal.SIGTERM, - 'SIGINT': signal.SIGINT, - 'SIGUSR1': signal.SIGUSR1, - 'SIGUSR2': signal.SIGUSR2 -} - def get_world_size(): if torch.distributed.is_available() and torch.distributed.is_initialized(): world_size = torch.distributed.get_world_size() @@ -55,8 +48,8 @@ def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): class DistributedSignalHandler: - def __init__(self, sig: str = 'SIGTERM'): - self.sig = SIGNAL_MAP.get(sig, signal.SIGTERM) + def __init__(self, sig: signal.Signals = signal.SIGTERM): + self.sig = sig def signals_received(self): all_received = all_gather_item( diff --git a/tests/unit_tests/test_argument_utils.py b/tests/unit_tests/test_argument_utils.py new file mode 100644 index 00000000000..e5744c3b074 --- /dev/null +++ b/tests/unit_tests/test_argument_utils.py @@ -0,0 +1,643 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +import signal +from argparse import ArgumentError, ArgumentParser +from dataclasses import dataclass, field +from typing import Callable, Literal, Optional, Union + +import pytest + +from megatron.training.argument_utils import ArgumentGroupFactory, TypeInferenceError + + +@dataclass +class DummyConfig: + """A dummy configuration for testing.""" + + name: str = "default_name" + """Name of the configuration""" + + count: int = 42 + """Number of items""" + + learning_rate: float = 0.001 + """Learning rate for training""" + + enabled: bool = False + """Whether feature is enabled""" + + disabled_feature: bool = True + """Feature that is disabled by default""" + + enum_setting: signal.Signals = signal.SIGTERM + """Setting with enum type to test enum handling""" + + +@dataclass +class ConfigWithOptional: + """Config with optional fields.""" + + required_field: str = "required" + """A required field""" + + optional_field: Optional[int] = None + """An optional integer field""" + + optional_str: Optional[str] = "default" + """An optional string with default""" + + int_new_form: int | None = None + """Optional using new syntax""" + + str_new_form: str | None = "default" + """Optional string using new syntax""" + + +@dataclass +class ConfigWithList: + """Config with list fields.""" + + tags: list[str] = field(default_factory=list) + """List of tags""" + + numbers: list[int] = field(default_factory=lambda: [1, 2, 3]) + """List of numbers with default""" + + +@dataclass +class ConfigWithLiteral: + """Config with Literal types.""" + + mode: Literal["train", "eval", "test"] = "train" + """Operating mode""" + + precision: Literal[16, 32] = 32 + """Precision level""" + + +class TestArgumentGroupFactoryBasic: + """Test basic functionality of ArgumentGroupFactory.""" + + def test_creates_argument_group(self): + """Test that build_group creates an argument group.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + arg_group = factory.build_group(parser, title="Test Group") + + assert arg_group is not None + assert arg_group.title == "Test Group" + assert arg_group.description == DummyConfig.__doc__ + + def test_all_fields_added(self): + """Test that all dataclass fields are added as arguments.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + + # Parse empty args to get all defaults + args = parser.parse_args([]) + + # Check all fields exist + assert hasattr(args, 'name') + assert hasattr(args, 'count') + assert hasattr(args, 'learning_rate') + assert hasattr(args, 'enabled') + assert hasattr(args, 'disabled_feature') + + def test_default_values_preserved(self): + """Test that default values from dataclass are preserved.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + args = parser.parse_args([]) + + assert args.name == "default_name" + assert args.count == 42 + assert args.learning_rate == 0.001 + assert args.enabled == False + assert args.disabled_feature == True + + def test_argument_types(self): + """Test that argument types are correctly inferred.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + + # Parse with actual values + args = parser.parse_args( + ['--name', 'test_name', '--count', '100', '--learning-rate', '0.01'] + ) + + assert isinstance(args.name, str) + assert args.name == 'test_name' + assert isinstance(args.count, int) + assert args.count == 100 + assert isinstance(args.learning_rate, float) + assert args.learning_rate == 0.01 + + def test_boolean_store_true(self): + """Test that boolean fields with default False use store_true.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + + # Without flag, should be False + args = parser.parse_args([]) + assert args.enabled == False + + # With flag, should be True + args = parser.parse_args(['--enabled']) + assert args.enabled == True + + def test_boolean_store_false(self): + """Test that boolean fields with default True use store_false with no- prefix.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + + # Without flag, should be True + args = parser.parse_args([]) + assert args.disabled_feature == True + + # With --no- flag, should be False + args = parser.parse_args(['--no-disabled-feature']) + assert args.disabled_feature == False + + # With --disable- flag, should also be False + args = parser.parse_args(['--disable-disabled-feature']) + assert args.disabled_feature == False + + def test_field_docstrings_as_help(self): + """Test that field docstrings are extracted and used as help text.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig) + + # Check that field_docstrings were extracted + assert 'name' in factory.field_docstrings + assert factory.field_docstrings['name'] == "Name of the configuration" + assert factory.field_docstrings['count'] == "Number of items" + assert factory.field_docstrings['learning_rate'] == "Learning rate for training" + + def test_enum_handling(self): + """Test that enum types are handled correctly.""" + parser = ArgumentParser(exit_on_error=False) + factory = ArgumentGroupFactory(DummyConfig) + + factory.build_group(parser, title="Test Group") + + args = parser.parse_args([]) + assert args.enum_setting == signal.SIGTERM + + # test a different valid enum value + args = parser.parse_args(["--enum-setting", "SIGINT"]) + assert args.enum_setting == signal.SIGINT + + # test an invalid enum value + with pytest.raises(KeyError, match="sigbar"): + parser.parse_args(["--enum-setting", "sigbar"]) + + +class TestArgumentGroupFactoryExclusion: + """Test exclusion functionality.""" + + def test_exclude_single_field(self): + """Test excluding a single field.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig, exclude=['count']) + + factory.build_group(parser, title="Test Group") + args = parser.parse_args([]) + + # Excluded field should not exist + assert hasattr(args, 'name') + assert not hasattr(args, 'count') + assert hasattr(args, 'learning_rate') + + def test_exclude_multiple_fields(self): + """Test excluding multiple fields.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(DummyConfig, exclude=['count', 'learning_rate']) + + factory.build_group(parser, title="Test Group") + args = parser.parse_args([]) + + assert hasattr(args, 'name') + assert not hasattr(args, 'count') + assert not hasattr(args, 'learning_rate') + assert hasattr(args, 'enabled') + + +class TestArgumentGroupFactoryOptional: + """Test handling of Optional types.""" + + def test_optional_fields(self): + """Test that Optional fields are handled correctly.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithOptional) + + factory.build_group(parser, title="Test Group") + + # Default values + args = parser.parse_args([]) + assert args.required_field == "required" + assert args.optional_field is None + assert args.optional_str == "default" + + # Provided values + args = parser.parse_args( + ['--required-field', 'new_value', '--optional-field', '123', '--optional-str', 'custom'] + ) + assert args.required_field == "new_value" + assert args.optional_field == 123 + assert args.optional_str == "custom" + + +class TestArgumentGroupFactoryList: + """Test handling of list types.""" + + def test_list_fields_with_default_factory(self): + """Test that list fields use nargs='+'.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithList) + + factory.build_group(parser, title="Test Group") + + # Default values + args = parser.parse_args([]) + assert args.tags == [] + assert args.numbers == [1, 2, 3] + + # Provided values + args = parser.parse_args(['--tags', 'tag1', 'tag2', 'tag3', '--numbers', '10', '20', '30']) + assert args.tags == ['tag1', 'tag2', 'tag3'] + assert args.numbers == [10, 20, 30] + + +class TestArgumentGroupFactoryLiteral: + """Test handling of Literal types.""" + + def test_literal_fields_have_choices(self): + """Test that Literal types create choice constraints.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithLiteral) + + factory.build_group(parser, title="Test Group") + + # Default values + args = parser.parse_args([]) + assert args.mode == "train" + assert args.precision == 32 + + # Valid choices + args = parser.parse_args(['--mode', 'eval', '--precision', '16']) + assert args.mode == "eval" + assert args.precision == 16 + + def test_literal_fields_reject_invalid_choices(self): + """Test that invalid Literal choices are rejected.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithLiteral) + + factory.build_group(parser, title="Test Group") + + # Invalid choice should raise error + with pytest.raises(SystemExit): + parser.parse_args(['--mode', 'invalid']) + + with pytest.raises(SystemExit): + parser.parse_args(['--precision', '64']) + + +class TestArgumentGroupFactoryHelpers: + """Test helper methods.""" + + def test_format_arg_name_basic(self): + """Test basic argument name formatting.""" + factory = ArgumentGroupFactory(DummyConfig) + + assert factory._format_arg_name("simple") == "--simple" + assert factory._format_arg_name("with_underscore") == "--with-underscore" + assert factory._format_arg_name("multiple_under_scores") == "--multiple-under-scores" + + def test_format_arg_name_with_prefix(self): + """Test argument name formatting with prefix.""" + factory = ArgumentGroupFactory(DummyConfig) + + assert factory._format_arg_name("feature", prefix="no") == "--no-feature" + assert factory._format_arg_name("feature", prefix="disable") == "--disable-feature" + assert factory._format_arg_name("multi_word", prefix="no") == "--no-multi-word" + + def test_extract_type_primitive(self): + """Test type extraction for primitive types.""" + factory = ArgumentGroupFactory(DummyConfig) + + assert factory._extract_type(int) == {"type": int} + assert factory._extract_type(str) == {"type": str} + assert factory._extract_type(float) == {"type": float} + + def test_extract_type_optional(self): + """Test type extraction for Optional types.""" + factory = ArgumentGroupFactory(DummyConfig) + + result = factory._extract_type(Optional[int]) + assert result == {"type": int} + + result = factory._extract_type(Optional[str]) + assert result == {"type": str} + + def test_extract_type_list(self): + """Test type extraction for list types.""" + factory = ArgumentGroupFactory(DummyConfig) + + result = factory._extract_type(list[int]) + assert result == {"type": int, "nargs": "+"} + + result = factory._extract_type(list[str]) + assert result == {"type": str, "nargs": "+"} + + def test_extract_type_literal(self): + """Test type extraction for Literal types.""" + factory = ArgumentGroupFactory(DummyConfig) + + result = factory._extract_type(Literal["a", "b", "c"]) + assert result == {"type": str, "choices": ("a", "b", "c")} + + result = factory._extract_type(Literal[1, 2, 3]) + assert result == {"type": int, "choices": (1, 2, 3)} + + +@dataclass +class ConfigWithArgparseMeta: + """Config with argparse_meta metadata for testing overrides.""" + + custom_help: str = field( + default="default_value", + metadata={"argparse_meta": {"help": "Custom help text from metadata"}}, + ) + """Original help text""" + + custom_type: str = field(default="100", metadata={"argparse_meta": {"type": int}}) + """Field with type override""" + + custom_default: str = field( + default="original_default", metadata={"argparse_meta": {"default": "overridden_default"}} + ) + """Field with default override""" + + custom_choices: str = field( + default="option1", + metadata={"argparse_meta": {"choices": ["option1", "option2", "option3"]}}, + ) + """Field with choices override""" + + custom_dest: str = field( + default="value", metadata={"argparse_meta": {"dest": "renamed_destination"}} + ) + """Field with dest override""" + + custom_action: bool = field( + default=False, + metadata={"argparse_meta": {"action": "store_const", "const": "special_value"}}, + ) + """Field with custom action override""" + + multiple_overrides: int = field( + default=42, + metadata={ + "argparse_meta": { + "type": str, + "help": "Multiple overrides applied", + "default": "999", + "dest": "multi_override_dest", + } + }, + ) + """Field with multiple metadata overrides""" + + nargs_override: str = field(default="single", metadata={"argparse_meta": {"nargs": "?"}}) + """Field with nargs override""" + + +@dataclass +class ConfigWithUnsupportedCallables: + """Config with argparse_meta metadata for testing overrides.""" + + unsupported_type: Optional[Callable] = None + """Cannot take a callable over CLI""" + + unsupported_with_metadata: Optional[Callable] = field( + default=None, metadata={"argparse_meta": {"type": int, "choices": (0, 1, 2)}} + ) + """This argument should be 0, 1, or 2. The appropriate + Callable will be set by some other logic. + """ + + +@dataclass +class ConfigWithUnsupportedUnions: + """Config with argparse_meta metadata for testing overrides.""" + + unsupported_type: Union[int, str] = 0 + """Cannot infer type of a Union""" + + unsupported_with_metadata: Union[int, str] = field( + default=0, metadata={"argparse_meta": {"type": str, "choices": ("foo", "bar")}} + ) + """Metadata should take precedence over the exception caused by Union""" + + +class TestArgumentGroupFactoryArgparseMeta: + """Test argparse_meta metadata override functionality.""" + + def test_help_override(self): + """Test that argparse_meta can override help text.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # Find the action for this argument + for action in parser._actions: + if hasattr(action, 'dest') and action.dest == 'custom_help': + assert action.help == "Custom help text from metadata" + return + + pytest.fail("custom_help argument not found") + + def test_type_override(self): + """Test that argparse_meta can override argument type.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # Parse with integer value (metadata overrides type to int) + args = parser.parse_args(['--custom-type', '42']) + + # Should be parsed as int, not str + assert isinstance(args.custom_type, int) + assert args.custom_type == 42 + + def test_default_override(self): + """Test that argparse_meta can override default value.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # Parse with no arguments + args = parser.parse_args([]) + + # Should use metadata default, not field default + assert args.custom_default == "overridden_default" + + def test_choices_override(self): + """Test that argparse_meta can override choices.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # Valid choice from metadata + args = parser.parse_args(['--custom-choices', 'option2']) + assert args.custom_choices == "option2" + + # Invalid choice should fail + with pytest.raises(SystemExit): + parser.parse_args(['--custom-choices', 'invalid_option']) + + def test_dest_override(self): + """Test that argparse_meta can override destination name.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + args = parser.parse_args(['--custom-dest', 'test_value']) + + # Should be stored in renamed destination + assert hasattr(args, 'renamed_destination') + assert args.renamed_destination == "test_value" + + def test_action_override(self): + """Test that argparse_meta can override action.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # With custom action=store_const and const="special_value" + args = parser.parse_args(['--custom-action']) + assert args.custom_action == "special_value" + + # Without flag, should use default + args = parser.parse_args([]) + assert args.custom_action == False + + def test_multiple_overrides(self): + """Test that multiple argparse_meta overrides work together.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # Parse with no arguments to check default override + args = parser.parse_args([]) + + # Check all overrides applied + assert hasattr(args, 'multi_override_dest') + assert args.multi_override_dest == "999" # default override + + # Parse with value to check type override + args = parser.parse_args(['--multiple-overrides', 'text_value']) + assert isinstance(args.multi_override_dest, str) # type override + assert args.multi_override_dest == "text_value" + + # Check help override was applied + for action in parser._actions: + if hasattr(action, 'dest') and action.dest == 'multi_override_dest': + assert action.help == "Multiple overrides applied" + break + + def test_nargs_override(self): + """Test that argparse_meta can override nargs.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + factory.build_group(parser, title="Test Group") + + # With nargs='?', argument is optional + args = parser.parse_args(['--nargs-override']) + assert args.nargs_override is None # No value provided with '?' + + # With value + args = parser.parse_args(['--nargs-override', 'provided_value']) + assert args.nargs_override == "provided_value" + + # Without flag at all, should use default + args = parser.parse_args([]) + assert args.nargs_override == "single" + + def test_metadata_takes_precedence_over_inference(self): + """Test that metadata has highest precedence over type inference.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithArgparseMeta) + + # Build kwargs for custom_type field which is str but metadata says int + from dataclasses import fields as dc_fields + + for f in dc_fields(ConfigWithArgparseMeta): + if f.name == 'custom_type': + kwargs = factory._build_argparse_kwargs_from_field(f) + # Metadata type should override inferred type + assert kwargs['type'] == int + break + + def test_unhandled_unsupported_callables(self): + """Test that an unsupported type produces a TypInferenceError.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory( + ConfigWithUnsupportedCallables, exclude=["unsupported_with_metadata"] + ) + + with pytest.raises(TypeInferenceError, match="Unsupported type"): + factory.build_group(parser, title="Test Group") + + def test_handled_unsupported_callables(self): + """Test an attribute with an unsupported type that has type info in the metadata.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory(ConfigWithUnsupportedCallables, exclude=["unsupported_type"]) + + factory.build_group(parser, title="Test Group") + + args = parser.parse_args(['--unsupported-with-metadata', '0']) + assert args.unsupported_with_metadata == 0 + + def test_unhandled_unsupported_unions(self): + """Test that an unsupported type produces a TypInferenceError.""" + parser = ArgumentParser() + factory = ArgumentGroupFactory( + ConfigWithUnsupportedUnions, exclude=["unsupported_with_metadata"] + ) + + with pytest.raises(TypeInferenceError, match="Unions not supported by argparse"): + factory.build_group(parser, title="Test Group") + + def test_handled_unsupported_unions(self): + """Test an attribute with an unsupported type that has type info in the metadata.""" + parser = ArgumentParser(exit_on_error=False) + factory = ArgumentGroupFactory(ConfigWithUnsupportedUnions, exclude=["unsupported_type"]) + + factory.build_group(parser, title="Test Group") + + args = parser.parse_args(['--unsupported-with-metadata', 'foo']) + assert args.unsupported_with_metadata == 'foo' + + with pytest.raises(ArgumentError, match="invalid choice"): + args = parser.parse_args(['--unsupported-with-metadata', 'baz']) From 288b8ea985221e6dc6dead2fa088b1899419f537 Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Wed, 17 Dec 2025 12:01:13 +0800 Subject: [PATCH 194/248] [Dev] Optimize TE CUDA Graph _get_sample_arguments() Time (#2568) Signed-off-by: Robin Zhang --- megatron/core/transformer/cuda_graphs.py | 76 ++++++++++++++----- .../transformer/test_cuda_graphs.py | 60 ++++++++------- 2 files changed, 89 insertions(+), 47 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index bcc90dc1240..6f75d67549e 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1643,48 +1643,82 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # with the same input signature. fwd_sample_queues = {} consumed_sample_queue = {} + layer_sample_keys_cache = {} fwd_idx = [0] * self.num_model_chunks for chunk_id in order: model_chunk_idx = abs(chunk_id) - 1 if chunk_id > 0: + if model_chunk_idx not in fwd_sample_queues: + fwd_sample_queues[model_chunk_idx] = [] + sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + ( fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] ) - fwd_sample_idx = [ - sample_start_idx + i for i in range(self.num_layers_per_chunk[model_chunk_idx]) - ] - if model_chunk_idx not in fwd_sample_queues: - fwd_sample_queues[model_chunk_idx] = [] - for per_callable_fwd_idx in fwd_sample_idx: - if sample_args[per_callable_fwd_idx] is None: + for layer_idx, layer in enumerate(self.callables_per_chunk[model_chunk_idx]): + per_callable_fwd_idx = sample_start_idx + layer_idx + + # Get sample_args and sample_kwargs for index per_callable_fwd_idx. + assert ( + sample_args[per_callable_fwd_idx] is None + and sample_kwargs[per_callable_fwd_idx] is None + ), ( + f"sample_args and sample_kwargs must be None before assigning static data, " + f"but got sample_args[{per_callable_fwd_idx}] = " + f"{sample_args[per_callable_fwd_idx]} and " + f"sample_kwargs[{per_callable_fwd_idx}] = " + f"{sample_kwargs[per_callable_fwd_idx]}." + ) + if id(layer) not in layer_sample_keys_cache: + # Have not generated the static inputs for this layer yet. So we don't + # know the input signature of this layer. Generate the static inputs, and + # cache the signature. sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( _get_layer_static_inputs( - self.callables_per_chunk[model_chunk_idx][ - per_callable_fwd_idx - sample_start_idx - ], - self.chunks_with_decoder[model_chunk_idx], + layer, self.chunks_with_decoder[model_chunk_idx] ) ) - - sample_args_keys = tuple( - (t.shape, t.dtype, t.layout) for t in sample_args[per_callable_fwd_idx] - ) - sample_kwargs_keys = tuple( - (k, v.shape, v.dtype, v.layout) - for k, v in sorted(sample_kwargs[per_callable_fwd_idx].items()) - ) - sample_keys = sample_args_keys + sample_kwargs_keys + sample_args_keys = tuple( + (t.shape, t.dtype, t.layout) for t in sample_args[per_callable_fwd_idx] + ) + sample_kwargs_keys = tuple( + (k, v.shape, v.dtype, v.layout) + for k, v in sorted(sample_kwargs[per_callable_fwd_idx].items()) + ) + sample_keys = sample_args_keys + sample_kwargs_keys + layer_sample_keys_cache[id(layer)] = sample_keys + else: + # Get signature from cache. This signature will be used to see if we can + # reuse the static inputs of a previous forward pass for this forward pass. + # If not, we still need to generate the new static inputs. + sample_keys = layer_sample_keys_cache[id(layer)] fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx)) if consumed_sample_queue.get(sample_keys, []): + # We can reuse the static inputs of a previous forward pass for this + # forward pass, because they are of the same input signature and the + # backward pass of the previous forward pass has completed. reuse_fwd_idx = consumed_sample_queue[sample_keys].pop(0) assert ( sample_args[reuse_fwd_idx] is not None and sample_kwargs[reuse_fwd_idx] is not None - ), "sample_args and sample_kwargs must not be None when reusing." + ), ( + f"sample_args and sample_kwargs must not be None when reusing, but got " + f"sample_args[{reuse_fwd_idx}] = {sample_args[reuse_fwd_idx]} and " + f"sample_kwargs[{reuse_fwd_idx}] = {sample_kwargs[reuse_fwd_idx]}.", + ) sample_args[per_callable_fwd_idx] = sample_args[reuse_fwd_idx] sample_kwargs[per_callable_fwd_idx] = sample_kwargs[reuse_fwd_idx] + + if sample_args[per_callable_fwd_idx] is None: + # Unfortunately, no previous static inputs are available for reuse, + # sample_args is still None. Last attempt: generate the new static inputs + # for this forward pass. + sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( + _get_layer_static_inputs( + layer, self.chunks_with_decoder[model_chunk_idx] + ) + ) fwd_idx[model_chunk_idx] += 1 else: num_consumed_samples = min( diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py index 8133a3d2db0..7f49a559f32 100644 --- a/tests/unit_tests/transformer/test_cuda_graphs.py +++ b/tests/unit_tests/transformer/test_cuda_graphs.py @@ -742,7 +742,8 @@ def test_capture_freeze_gc(self): ) -# Global storage for comparing unique buffer counts across different num_microbatches, keyed by pp_size +# Global storage for comparing unique buffer counts across different num_microbatches, +# keyed by (pp_size, vpp_size) _unique_buffer_counts = {} @@ -758,19 +759,25 @@ def teardown_method(self, method): # Note: _unique_buffer_counts is intentionally NOT cleared here so we can # compare values across parametrized test runs - @pytest.mark.parametrize("num_microbatches", [4, 16, 64, 256]) + @pytest.mark.parametrize("num_microbatches", [16, 64, 256]) @pytest.mark.parametrize("pp_size", [1, 2, 4]) - def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): + @pytest.mark.parametrize("vpp_size", [None, 2]) + def test_get_cuda_graph_input_data(self, num_microbatches, pp_size, vpp_size): """Test _get_cuda_graph_input_data function in TECudaGraphHelper.""" + if vpp_size and pp_size == 1: + pytest.skip("vpp_size must be None when pp_size is 1") + Utils.initialize_model_parallel( - tensor_model_parallel_size=1, pipeline_model_parallel_size=pp_size + tensor_model_parallel_size=1, + pipeline_model_parallel_size=pp_size, + virtual_pipeline_model_parallel_size=vpp_size, ) # Set up test configuration seq_length = 128 micro_batch_size = 2 - num_layers = 4 + num_layers = 8 vocab_size = 1024 hidden_size = 64 num_attention_heads = 4 @@ -796,6 +803,7 @@ def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): bf16=True, tensor_model_parallel_size=1, pipeline_model_parallel_size=pp_size, + virtual_pipeline_model_parallel_size=vpp_size, pipeline_dtype=torch.bfloat16, context_parallel_size=1, ) @@ -804,21 +812,22 @@ def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): torch.manual_seed(123) model_parallel_cuda_manual_seed(123) - gpt_model = GPTModel( - config=transformer_config, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), - vocab_size=vocab_size, - max_sequence_length=seq_length, - parallel_output=True, - position_embedding_type="rope", - ) - - # Move model to CUDA - gpt_model.cuda() + model = [] + for i in range(vpp_size or 1): + this_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=vocab_size, + max_sequence_length=seq_length, + parallel_output=True, + position_embedding_type="rope", + vp_stage=i if vpp_size else None, + ).cuda() + model.append(this_model) # Initialize TECudaGraphHelper cuda_graph_helper = TECudaGraphHelper( - model=[gpt_model], + model=model, config=transformer_config, seq_length=seq_length, micro_batch_size=micro_batch_size, @@ -936,11 +945,13 @@ def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): f"should be <= total_entries ({total_entries})" ) global _unique_buffer_counts - if pp_size not in _unique_buffer_counts: - _unique_buffer_counts[pp_size] = unique_buffer_count + # Use (pp_size, vpp_size) as key to track unique buffer counts per configuration + config_key = (pp_size, vpp_size) + if config_key not in _unique_buffer_counts: + _unique_buffer_counts[config_key] = unique_buffer_count else: - assert unique_buffer_count == _unique_buffer_counts[pp_size], ( - f"Unique buffer count mismatch: expected {_unique_buffer_counts[pp_size]}, " + assert unique_buffer_count == _unique_buffer_counts[config_key], ( + f"Unique buffer count mismatch: expected {_unique_buffer_counts[config_key]}, " f"got {unique_buffer_count}" ) @@ -956,11 +967,8 @@ def test_get_cuda_graph_input_data(self, num_microbatches, pp_size): "but all signatures are unique" ) - # If we have duplicate signatures and the schedule allows it, - # some buffers should be reused (max_reuse > 1) - # Note: The exact amount of reuse depends on the schedule order - # With 1F1B interleaved schedule, we should see some reuse - if pp_size > num_microbatches: + # We tested with a large number of microbatches, so we should see some buffer reuse. + if pp_size > 1: assert max_reuse > 1, "Expected some buffer reuse" # Verify that make_graphed_callables_kwargs contains expected keys From 0eec631b2ea4e2ed3cb3ab847bcccf749a881d4b Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Wed, 17 Dec 2025 12:03:49 +0800 Subject: [PATCH 195/248] Reopen qwen3next functional test in lightweight mode (#2493) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig Co-authored-by: oliver könig --- .gitlab/stages/00.pre.yml | 10 +- .../shell_test_utils/run_ci_test.sh | 2 + .../golden_values_dev_dgx_h100.json | 287 ------------------ .../model_config.yaml | 12 +- tests/test_utils/recipes/gpt.yaml | 2 +- 5 files changed, 19 insertions(+), 294 deletions(-) delete mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/golden_values_dev_dgx_h100.json diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 2210ddd7d02..ff9e4e5178b 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -49,7 +49,7 @@ pre:create_ci_branches: stage: .pre image: python:3.10 variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: 'clone' script: - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git" - git switch --force-create $branch @@ -80,7 +80,7 @@ pre:create_ci_branches_dev: stage: .pre image: python:3.10 variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: 'clone' script: - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git" - git switch --force-create $branch @@ -103,7 +103,7 @@ pre:label_merge_request: - cd gitlab-mr-labeler - go install . - cd .. - - go install github.com/itchyny/gojq/cmd/gojq@latest + - go install github.com/itchyny/gojq/cmd/gojq@v0.12.17 script: - set -x - | @@ -137,7 +137,7 @@ pre:maybe_cherry_pick_to_main: stage: .pre image: nentangso/alpine-git-curl-jq variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: 'clone' script: - | set -x @@ -202,7 +202,7 @@ pre:maybe_cherry_pick_commit: stage: .pre image: nentangso/alpine-git-curl-jq variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: 'clone' script: - set -x - set +e diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 5a6ea64f42d..968d7dafeec 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -51,6 +51,8 @@ set -exo pipefail # Extract settings from params file TEST_TYPE=$(cat $TRAINING_PARAMS_PATH | /usr/local/bin/yq '.TEST_TYPE') +ENABLE_LIGHTWEIGHT_MODE=$(cat $TRAINING_PARAMS_PATH | + /usr/local/bin/yq '.ENV_VARS.ENABLE_LIGHTWEIGHT_MODE // "false"') MODE=$(cat $TRAINING_PARAMS_PATH | /usr/local/bin/yq '.MODE // "pretraining"') diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/golden_values_dev_dgx_h100.json deleted file mode 100644 index e836165b1af..00000000000 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.94549, - "2": 10.94266, - "3": 10.95029, - "4": 10.92935, - "5": 10.94226, - "6": 10.94118, - "7": 10.92599, - "8": 10.93843, - "9": 10.92667, - "10": 10.95239, - "11": 10.9316, - "12": 10.93754, - "13": 10.92806, - "14": 10.93106, - "15": 10.92268, - "16": 10.93309, - "17": 10.92783, - "18": 10.93162, - "19": 10.92174, - "20": 10.9222, - "21": 10.91749, - "22": 10.89939, - "23": 10.91334, - "24": 10.90584, - "25": 10.89761, - "26": 10.90421, - "27": 10.90329, - "28": 10.87234, - "29": 10.89828, - "30": 10.85482, - "31": 10.74433, - "32": 10.85937, - "33": 10.87082, - "34": 10.78866, - "35": 10.80404, - "36": 10.78603, - "37": 10.83611, - "38": 10.77081, - "39": 10.85659, - "40": 10.72227, - "41": 10.72701, - "42": 10.78348, - "43": 10.58371, - "44": 10.69609, - "45": 10.60756, - "46": 10.55935, - "47": 10.72505, - "48": 10.58391, - "49": 10.40808, - "50": 10.63209 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 22806516.0, - "2": 23004070.0, - "3": 22675828.0, - "4": 23298692.0, - "5": 22793918.0, - "6": 23100284.0, - "7": 22849388.0, - "8": 23004824.0, - "9": 22919836.0, - "10": 22997154.0, - "11": 22579508.0, - "12": 22537754.0, - "13": 22996688.0, - "14": 22467402.0, - "15": 22900118.0, - "16": 22909232.0, - "17": 22897812.0, - "18": 22661628.0, - "19": 22697360.0, - "20": 22773234.0, - "21": 22818520.0, - "22": 22878406.0, - "23": 22618508.0, - "24": 22849596.0, - "25": 22897480.0, - "26": 22626820.0, - "27": 22547392.0, - "28": 22531804.0, - "29": 22606952.0, - "30": 22710502.0, - "31": 23033192.0, - "32": 22663120.0, - "33": 22637648.0, - "34": 22914116.0, - "35": 22866052.0, - "36": 22667304.0, - "37": 22575802.0, - "38": 22974080.0, - "39": 22879488.0, - "40": 22736406.0, - "41": 22737628.0, - "42": 22745946.0, - "43": 23054018.0, - "44": 22825168.0, - "45": 22753408.0, - "46": 22962704.0, - "47": 22712868.0, - "48": 23007200.0, - "49": 22805320.0, - "50": 22983010.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 739501056.0, - "2": 739501056.0, - "3": 739501056.0, - "4": 739501056.0, - "5": 739501056.0, - "6": 739501056.0, - "7": 739501056.0, - "8": 739501056.0, - "9": 739501056.0, - "10": 739501056.0, - "11": 739501056.0, - "12": 739501056.0, - "13": 739501056.0, - "14": 739501056.0, - "15": 739501056.0, - "16": 739501056.0, - "17": 739501056.0, - "18": 739501056.0, - "19": 739501056.0, - "20": 739501056.0, - "21": 739501056.0, - "22": 739501056.0, - "23": 739501056.0, - "24": 739501056.0, - "25": 739501056.0, - "26": 739501056.0, - "27": 739501056.0, - "28": 739501056.0, - "29": 739501056.0, - "30": 739501056.0, - "31": 739501056.0, - "32": 739501056.0, - "33": 739501056.0, - "34": 739501056.0, - "35": 739501056.0, - "36": 739501056.0, - "37": 739501056.0, - "38": 739501056.0, - "39": 739501056.0, - "40": 739501056.0, - "41": 739501056.0, - "42": 739501056.0, - "43": 739501056.0, - "44": 739501056.0, - "45": 739501056.0, - "46": 739501056.0, - "47": 739501056.0, - "48": 739501056.0, - "49": 739501056.0, - "50": 739501056.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2185745408.0, - "2": 2467083264.0, - "3": 2467083264.0, - "4": 2467083264.0, - "5": 2467083264.0, - "6": 2467083264.0, - "7": 2467083264.0, - "8": 2467083264.0, - "9": 2467083264.0, - "10": 2467083264.0, - "11": 2467083264.0, - "12": 2467083264.0, - "13": 2467083264.0, - "14": 2467083264.0, - "15": 2467083264.0, - "16": 2467083264.0, - "17": 2467083264.0, - "18": 2467083264.0, - "19": 2467083264.0, - "20": 2467083264.0, - "21": 2467083264.0, - "22": 2467083264.0, - "23": 2467083264.0, - "24": 2467083264.0, - "25": 2467083264.0, - "26": 2467083264.0, - "27": 2467083264.0, - "28": 2467083264.0, - "29": 2467083264.0, - "30": 2467083264.0, - "31": 2467083264.0, - "32": 2467083264.0, - "33": 2467083264.0, - "34": 2467083264.0, - "35": 2467083264.0, - "36": 2467083264.0, - "37": 2467083264.0, - "38": 2467083264.0, - "39": 2467083264.0, - "40": 2467083264.0, - "41": 2467083264.0, - "42": 2467083264.0, - "43": 2467083264.0, - "44": 2467083264.0, - "45": 2467083264.0, - "46": 2467083264.0, - "47": 2467083264.0, - "48": 2467083264.0, - "49": 2467083264.0, - "50": 2467083264.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 37.98779, - "2": 0.44183, - "3": 0.41794, - "4": 0.41574, - "5": 0.41502, - "6": 0.41403, - "7": 0.41636, - "8": 0.41731, - "9": 0.41907, - "10": 0.41341, - "11": 0.41278, - "12": 0.41269, - "13": 0.41248, - "14": 0.4133, - "15": 0.4156, - "16": 0.41652, - "17": 0.41625, - "18": 0.41902, - "19": 0.41584, - "20": 0.41729, - "21": 0.42212, - "22": 0.41334, - "23": 0.41588, - "24": 0.41641, - "25": 0.41859, - "26": 0.41721, - "27": 0.40783, - "28": 0.40735, - "29": 0.4046, - "30": 0.40445, - "31": 0.41196, - "32": 0.40703, - "33": 0.40362, - "34": 0.4043, - "35": 0.40787, - "36": 0.4094, - "37": 0.40514, - "38": 0.40653, - "39": 0.40616, - "40": 0.40471, - "41": 0.40633, - "42": 0.40318, - "43": 0.40362, - "44": 0.40095, - "45": 0.40173, - "46": 0.4018, - "47": 0.40121, - "48": 0.3989, - "49": 0.39861, - "50": 0.39894 - } - } -} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml index 8c5838748d1..5f63de867d9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 + ENABLE_LIGHTWEIGHT_MODE: true MODEL_ARGS: # Add network size args --untie-embeddings-and-output-weights: true @@ -18,13 +19,22 @@ MODEL_ARGS: --apply-layernorm-1p: true --attention-output-gate: true --no-weight-decay-cond-type: apply_wd_to_qk_layernorm - --linear-attention-type: gated_delta_net + --experimental-attention-variant: gated_delta_net --linear-attention-freq: 3 --linear-conv-kernel-dim: 4 --linear-key-head-dim: 64 --linear-value-head-dim: 64 --linear-num-key-heads: 4 --linear-num-value-heads: 8 + # Add MoE args + --num-experts: 32 + --moe-ffn-hidden-size: 64 + --moe-shared-expert-intermediate-size: 64 + --moe-shared-expert-gate: true + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 8 + --disable-bias-linear: true + --moe-router-dtype: fp32 # Add logging args --log-params-norm: true --log-num-zeros-in-grad: true diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index eae09a6e16a..f403ac20e3f 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -345,7 +345,7 @@ products: - test_case: [gpt3_mcore_te_tp2_pp1_gdn] products: - environment: [dev] - scope: [mr-broken, mr-github-broken] + scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_mla] products: From 2ebff670288b28dd42dbd048e5e98ddbd19e89d5 Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Wed, 17 Dec 2025 19:51:32 +0800 Subject: [PATCH 196/248] [Dev] Fix CUDA RNG Tracker (#2640) Signed-off-by: Robin Zhang --- megatron/core/tensor_parallel/__init__.py | 4 + megatron/core/tensor_parallel/random.py | 78 +++++++++- megatron/core/transformer/cuda_graphs.py | 7 +- megatron/core/transformer/moe/moe_utils.py | 21 +-- megatron/training/arguments.py | 5 +- megatron/training/checkpointing.py | 15 +- .../unit_tests/tensor_parallel/test_random.py | 145 ++++++++++++++++++ 7 files changed, 249 insertions(+), 26 deletions(-) diff --git a/megatron/core/tensor_parallel/__init__.py b/megatron/core/tensor_parallel/__init__.py index afa53bdc6e1..e629e5982b1 100644 --- a/megatron/core/tensor_parallel/__init__.py +++ b/megatron/core/tensor_parallel/__init__.py @@ -28,9 +28,11 @@ from .random import ( CheckpointWithoutOutput, checkpoint, + convert_cuda_rng_state, get_cuda_rng_tracker, get_data_parallel_rng_tracker_name, get_expert_parallel_rng_tracker_name, + is_graph_safe_cuda_rng_tracker, model_parallel_cuda_manual_seed, ) from .utils import ( @@ -63,9 +65,11 @@ "scatter_to_sequence_parallel_region", # random.py "checkpoint", + "convert_cuda_rng_state", "get_cuda_rng_tracker", "model_parallel_cuda_manual_seed", "get_expert_parallel_rng_tracker_name", + "is_graph_safe_cuda_rng_tracker", "CheckpointWithoutOutput", # utils.py "split_tensor_along_last_dim", diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 396e5c54a2d..617d2803c12 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -111,6 +111,41 @@ def cb(): _lazy_call(cb) +def convert_cuda_rng_state( + state: Union[torch.Tensor, torch.Generator], to_graphable: bool = False +) -> Union[torch.Tensor, torch.Generator]: + """ + Convert the cuda rng state tensor to the graphable version, + or from the graphable version to the non-graphable tensor version. + """ + if to_graphable: + if isinstance(state, torch.Tensor): + # Convert to the graphable version. + # Store current rng state. + orig_cuda_rng_state = _get_cuda_rng_state(graph_safe=False) + # Set rng state to the desired one + _set_cuda_rng_state(state, graph_safe=False) + # Get the graphable state + graphable_state = _get_cuda_rng_state(clone=True, graph_safe=True) + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state, graph_safe=False) + return graphable_state + elif isinstance(state, torch.Generator): + # already graphable, just return it. + return state + else: + raise ValueError(f"Invalid state type: {type(state)}") + else: + if isinstance(state, torch.Tensor): + # already non-graphable, just return it. + return state + elif isinstance(state, torch.Generator): + # Convert to the non-graphable tensor version. + return state.get_state() + else: + raise ValueError(f"Invalid state type: {type(state)}") + + def get_expert_parallel_rng_tracker_name(): """Get the expert parallel rng tracker name""" global _EXPERT_PARALLEL_RNG_TRACKER_NAME @@ -161,6 +196,10 @@ def reset(self): # Seeds are just for book keeping and ensure no seed is set twice. self.seeds_ = set() + # Name of the rng state currently being used in the generator. + # The default one is "default-rng" and won't be pushed to the self.states_ dictionary. + self._current_state_name = "default-rng" + def get_states(self): """Get rng states. Copy the dictionary so we have direct pointers to the states, not just a pointer to the dictionary.""" @@ -207,10 +246,14 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): # Check if we have added the state if name not in self.states_: raise Exception('cuda rng state {} is not added'.format(name)) - # Store current rng state. + # Store current rng state and name. Store in self.states_ if it's not the default state. orig_cuda_rng_state = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng) - # Set rng state to the desired one + orig_state_name = self._current_state_name + if orig_state_name != "default-rng": + self.states_[orig_state_name] = orig_cuda_rng_state + # Set rng state and name to the desired one. _set_cuda_rng_state(self.states_[name], graph_safe=self.use_cudagraphable_rng) + self._current_state_name = name # Record cpu RNG state cpu_rng_state = torch.get_rng_state() # Do the stuff we wanted to do. @@ -220,10 +263,19 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): # Throw a warning if cpu RNG state changed if not torch.all(cpu_rng_state == torch.get_rng_state()).item(): logging.getLogger(__name__).warning('CPU RNG state changed within GPU RNG context') + # Check if the current state name is the same as the desired state name. + if self._current_state_name != name: + raise Exception( + f'current state name {self._current_state_name} is not the same as the desired ' + f'state name {name}.' + ) # Update the current rng state for later use. self.states_[name] = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng) - # And set the state to the original state we started with. + # And set the state and name to the original state we started with. + if orig_state_name != "default-rng": + orig_cuda_rng_state = self.states_[orig_state_name] _set_cuda_rng_state(orig_cuda_rng_state, graph_safe=self.use_cudagraphable_rng) + self._current_state_name = orig_state_name # RNG tracker object. @@ -377,10 +429,24 @@ def model_parallel_cuda_manual_seed( _CUDA_RNG_STATE_TRACKER.add(_EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed) +def is_graph_safe_cuda_rng_tracker(cuda_rng_tracker): + """Check if the cuda rng tracker is graph safe version.""" + if HAVE_TE and is_te_min_version("1.5.0"): + from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker + + if isinstance(cuda_rng_tracker, TECudaRNGStatesTracker): + return True + if getattr(cuda_rng_tracker, "use_cudagraphable_rng", False): + return True + return False + + def _get_all_rng_states(): """Get all the rng states.""" cpu_rng_state = torch.get_rng_state() - cuda_rng_state = _get_cuda_rng_state() + cuda_rng_state = _get_cuda_rng_state( + graph_safe=is_graph_safe_cuda_rng_tracker(get_cuda_rng_tracker()) + ) cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() return cpu_rng_state, cuda_rng_state, cuda_rng_state_tracker @@ -388,7 +454,9 @@ def _get_all_rng_states(): def _set_all_rng_states(cpu_rng_state, cuda_rng_state, cuda_rng_state_tracker): """Set all the rng states.""" torch.set_rng_state(cpu_rng_state) - _set_cuda_rng_state(cuda_rng_state) + _set_cuda_rng_state( + cuda_rng_state, graph_safe=is_graph_safe_cuda_rng_tracker(get_cuda_rng_tracker()) + ) get_cuda_rng_tracker().set_states(cuda_rng_state_tracker) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 6f75d67549e..27e6c65c738 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1907,7 +1907,12 @@ def create_cudagraphs(self): # Prepare CUDA Graph capturing input data and call `make_graphed_callables`. sample_args, kwargs = self._get_cuda_graph_input_data() - graphs = make_graphed_callables(tuple(self.flattened_callables), sample_args, **kwargs) + if self.config.sequence_parallel: + rng_context = get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + with rng_context: + graphs = make_graphed_callables(tuple(self.flattened_callables), sample_args, **kwargs) # Push the captured graphs to the corresponding TransformerBlock. num_layers_accumulated = 0 diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 8bab8d70065..28cff06f5ec 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -10,9 +10,11 @@ from megatron.core.fp4_utils import get_fp4_align_size from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import internal_api try: import transformer_engine as te # pylint: disable=unused-import @@ -913,6 +915,7 @@ def get_moe_layer_wise_logging_tracker(): return _MOE_LAYER_WISE_LOGGING_TRACKER +@internal_api class RandomSTE(torch.autograd.Function): """ Straight-Through Estimator(STE) function that returns random values @@ -921,26 +924,14 @@ class RandomSTE(torch.autograd.Function): This is used to generate random logits of router for load-balanced benchmark. """ - generator = None - random_logits = None - @staticmethod def forward(ctx, logits): """ Forward pass returns random logits with rank-specific seed. """ - if is_graph_capturing() and RandomSTE.random_logits is not None: - return RandomSTE.random_logits - - if RandomSTE.generator is None: - global_rank = torch.distributed.get_rank() - base_seed = 42 - seed = base_seed + global_rank - RandomSTE.generator = torch.Generator(device=logits.device) - RandomSTE.generator.manual_seed(seed) - - RandomSTE.random_logits = logits.clone().normal_(generator=RandomSTE.generator) - return RandomSTE.random_logits + with get_cuda_rng_tracker().fork(get_expert_parallel_rng_tracker_name()): + random_logits = logits.clone().normal_() + return random_logits @staticmethod def backward(ctx, grad_output): diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 70d1e4b1306..c157d062c53 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1277,7 +1277,10 @@ def validate_args(args, defaults={}): # CUDA Graphs if args.cuda_graph_impl != "none": - if args.transformer_impl == 'transformer_engine' and not args.te_rng_tracker: + if ( + "transformer_engine" in (args.transformer_impl, args.cuda_graph_impl) + and not args.te_rng_tracker + ): args.te_rng_tracker = True warn_rank_0("te_rng_tracker is not enabled, enabling it for CUDA graphs.", args.rank) assert ( diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 48a2025fa63..19206312b67 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1766,6 +1766,8 @@ def load_model_state_dict(module, state_dict, strict: bool): # rng states. if not release and not args.finetune and not args.no_load_rng and not ignore_rng_state: try: + cuda_rng_tracker = tensor_parallel.get_cuda_rng_tracker() + graph_safe_rng = tensor_parallel.is_graph_safe_cuda_rng_tracker(cuda_rng_tracker) if 'rng_state' in state_dict: if args.ckpt_format == "fsdp_dtensor": # FSDP DTensor checkpoints store rng_state in a different format. @@ -1791,8 +1793,10 @@ def load_model_state_dict(module, state_dict, strict: bool): # Check for empty states array if not rng_state['rng_tracker_states']: raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( - rng_state['rng_tracker_states']) + rng_tracker_states = { + k: tensor_parallel.convert_cuda_rng_state(v, to_graphable=graph_safe_rng) + for k, v in rng_state['rng_tracker_states'].items() + } else: # backward compatability random.setstate(state_dict['random_rng_state']) np.random.set_state(state_dict['np_rng_state']) @@ -1801,8 +1805,11 @@ def load_model_state_dict(module, state_dict, strict: bool): # Check for empty states array if not state_dict['rng_tracker_states']: raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( - state_dict['rng_tracker_states']) + rng_tracker_states = { + k: tensor_parallel.convert_cuda_rng_state(v, to_graphable=graph_safe_rng) + for k, v in state_dict['rng_tracker_states'].items() + } + cuda_rng_tracker.set_states(rng_tracker_states) except KeyError: print_rank_0('Unable to load rng state from checkpoint {}. ' 'Specify --no-load-rng or --finetune to prevent ' diff --git a/tests/unit_tests/tensor_parallel/test_random.py b/tests/unit_tests/tensor_parallel/test_random.py index 47b607b8795..a15ad83cb90 100644 --- a/tests/unit_tests/tensor_parallel/test_random.py +++ b/tests/unit_tests/tensor_parallel/test_random.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import pytest import torch @@ -5,6 +7,7 @@ CheckpointWithoutOutput, CudaRNGStatesTracker, checkpoint, + convert_cuda_rng_state, get_cuda_rng_tracker, model_parallel_cuda_manual_seed, ) @@ -33,6 +36,148 @@ def test_cuda_rng_states_tracker(): assert torch.equal(rng_tracker.get_states()['state2'], rng_state) +@pytest.mark.parametrize("use_cudagraphable_rng", [True, False]) +def test_double_fork_cuda_rng_states_tracker(use_cudagraphable_rng): + rng_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=use_cudagraphable_rng) + rng_tracker.add("state1", 1234) + rng_tracker.add("state2", 5678) + randn_double_fork_1 = [] + randn_double_fork_2 = [] + with rng_tracker.fork("state1"): + randn_double_fork_1.append(torch.randn(10, device="cuda")) + with rng_tracker.fork("state2"): + randn_double_fork_2.append(torch.randn(10, device="cuda")) + with rng_tracker.fork("state1"): + randn_double_fork_1.append(torch.randn(10, device="cuda")) + randn_double_fork_2.append(torch.randn(10, device="cuda")) + randn_double_fork_1.append(torch.randn(10, device="cuda")) + if use_cudagraphable_rng: + double_fork_state1 = rng_tracker.get_states()["state1"].get_state() + double_fork_state2 = rng_tracker.get_states()["state2"].get_state() + else: + double_fork_state1 = rng_tracker.get_states()["state1"] + double_fork_state2 = rng_tracker.get_states()["state2"] + + rng_tracker.reset() + rng_tracker.add("state1", 1234) + rng_tracker.add("state2", 5678) + randn_single_fork_1 = [] + randn_single_fork_2 = [] + with rng_tracker.fork("state1"): + randn_single_fork_1.append(torch.randn(10, device="cuda")) + randn_single_fork_1.append(torch.randn(10, device="cuda")) + randn_single_fork_1.append(torch.randn(10, device="cuda")) + with rng_tracker.fork("state2"): + randn_single_fork_2.append(torch.randn(10, device="cuda")) + randn_single_fork_2.append(torch.randn(10, device="cuda")) + if use_cudagraphable_rng: + single_fork_state1 = rng_tracker.get_states()["state1"].get_state() + single_fork_state2 = rng_tracker.get_states()["state2"].get_state() + else: + single_fork_state1 = rng_tracker.get_states()["state1"] + single_fork_state2 = rng_tracker.get_states()["state2"] + + assert torch.equal(randn_double_fork_1[0], randn_single_fork_1[0]) + assert torch.equal(randn_double_fork_1[1], randn_single_fork_1[1]) + assert torch.equal(randn_double_fork_1[2], randn_single_fork_1[2]) + assert torch.equal(randn_double_fork_2[0], randn_single_fork_2[0]) + assert torch.equal(randn_double_fork_2[1], randn_single_fork_2[1]) + assert torch.equal(double_fork_state1, single_fork_state1) + assert torch.equal(double_fork_state2, single_fork_state2) + + +def test_convert_cuda_rng_state(): + ## Get the default rng state + torch.cuda.manual_seed(999) + randn = torch.randn(10, device="cuda") + rng_state = torch.cuda.get_rng_state() + + try: + from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker + except ImportError: + TECudaRNGStatesTracker = None + + ## from non-graphable RNG to graphable RNG + # get state from non-graphable RNG + tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False) + tracker.add("state1", 123) + for i in range(3): + with tracker.fork("state1"): + randn = torch.randn(10, device="cuda") + state = convert_cuda_rng_state(tracker.states_["state1"], to_graphable=True) + rand_tensors = [] + for i in range(3): + with tracker.fork("state1"): + randn = torch.randn(10, device="cuda") + rand_tensors.append(randn) + + # set state to local graph RNG + cudagraphable_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=True) + cudagraphable_tracker.set_states({"state1": state.clone_state()}) + for i in range(3): + with cudagraphable_tracker.fork("state1"): + randn = torch.randn(10, device="cuda") + assert torch.equal(randn, rand_tensors[i]) + + # set state to TE RNG + if TECudaRNGStatesTracker is not None: + te_tracker = TECudaRNGStatesTracker() + te_tracker.set_states({"state1": state}) + for i in range(3): + with te_tracker.fork("state1"): + randn = torch.randn(10, device="cuda") + assert torch.equal(randn, rand_tensors[i]) + + ## from graphable RNG to non-graphable RNG + # get state from graphable RNG + cudagraphable_tracker = CudaRNGStatesTracker(use_cudagraphable_rng=True) + cudagraphable_tracker.add("state2", 123) + for i in range(3): + with cudagraphable_tracker.fork("state2"): + randn = torch.randn(10, device="cuda") + state = convert_cuda_rng_state(cudagraphable_tracker.states_["state2"], to_graphable=False) + rand_tensors = [] + for i in range(3): + with cudagraphable_tracker.fork("state2"): + randn = torch.randn(10, device="cuda") + rand_tensors.append(randn) + + # set state to non-graphable RNG + tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False) + tracker.set_states({"state2": state}) + for i in range(3): + with tracker.fork("state2"): + randn = torch.randn(10, device="cuda") + assert torch.equal(randn, rand_tensors[i]) + + ## from TE RNG to non-graphable RNG + if TECudaRNGStatesTracker is not None: + # get state from TE RNG + cudagraphable_tracker = TECudaRNGStatesTracker() + cudagraphable_tracker.add("state3", 123) + for i in range(3): + with cudagraphable_tracker.fork("state3"): + randn = torch.randn(10, device="cuda") + state = convert_cuda_rng_state(cudagraphable_tracker.states_["state3"], to_graphable=False) + rand_tensors = [] + for i in range(3): + with cudagraphable_tracker.fork("state3"): + randn = torch.randn(10, device="cuda") + rand_tensors.append(randn) + + # set state to non-graphable RNG + tracker = CudaRNGStatesTracker(use_cudagraphable_rng=False) + tracker.set_states({"state3": state}) + for i in range(3): + with tracker.fork("state3"): + randn = torch.randn(10, device="cuda") + assert torch.equal(randn, rand_tensors[i]) + + ## After all tests, check if the default rng state is still the same. + rng_state_final = torch.cuda.get_rng_state() + assert torch.equal(rng_state, rng_state_final) + + def test_model_parallel_cuda_manual_seed(): Utils.initialize_model_parallel(4, 2) model_parallel_cuda_manual_seed(0, force_reset_rng=True) From 368e580b7ad04fa5c6bfdaaf4ac05de9dbc96c07 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 17 Dec 2025 10:25:39 -0800 Subject: [PATCH 197/248] [Dev] Mark API backwards compatibility checks as OPTIONAL (non-blocking) (#2699) --- .../check_api_backwards_compatibility_workflow.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/check_api_backwards_compatibility_workflow.yml b/.github/workflows/check_api_backwards_compatibility_workflow.yml index 42db9486cac..4ba0ed2780c 100644 --- a/.github/workflows/check_api_backwards_compatibility_workflow.yml +++ b/.github/workflows/check_api_backwards_compatibility_workflow.yml @@ -81,7 +81,7 @@ jobs: check-compatibility: needs: [pre-flight] if: needs.pre-flight.outputs.should_skip != 'true' - name: Check API Backward Compatibility + name: "OPTIONAL: Check API Backward Compatibility" runs-on: ubuntu-latest # ============================================================================ @@ -245,7 +245,7 @@ jobs: api-backward-compatibility-summary: needs: [pre-flight, check-compatibility] runs-on: ubuntu-latest - name: API Backward Compatibility Check Summary + name: "OPTIONAL: API Backward Compatibility Check Summary" if: always() && !cancelled() steps: - name: Checkout @@ -257,7 +257,7 @@ jobs: GH_TOKEN: ${{ github.token }} SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.should_skip == 'true' }} run: | - FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "API Backward Compatibility Check Summary")] | length') || echo 0 + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then if [ "$SKIPPING_IS_ALLOWED" == "true" ]; then @@ -268,6 +268,6 @@ jobs: exit 0 else echo "❌ Found $FAILED_JOBS failed job(s)" - gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "API Backward Compatibility Check Summary") | .name' + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .name != "OPTIONAL: API Backward Compatibility Check Summary") | .name' exit 1 fi From 3714d81d418c9f1bca4594fc35f9e8289f652862 Mon Sep 17 00:00:00 2001 From: Kunlun Li <94586211+kunlunl@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:05:09 +0800 Subject: [PATCH 198/248] [Dev] FP8 params support for megatron-fsdp (MXFP8/Blockwise) (#2086) Signed-off-by: kunlunl Co-authored-by: jianbinc --- .../distributed/fsdp/mcore_fsdp_adapter.py | 4 + .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 157 +++--- .../fsdp/src/megatron_fsdp/mixed_precision.py | 331 +++++++++++++ .../megatron_fsdp/param_and_grad_buffer.py | 450 +++++++++++++----- .../fsdp/src/megatron_fsdp/utils.py | 252 +--------- megatron/training/arguments.py | 3 + 6 files changed, 776 insertions(+), 421 deletions(-) create mode 100644 megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index 7432a7f9a36..d6384e70488 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -111,6 +111,9 @@ def __init__( dist_index=self.megatron_fsdp_dist_index, calculate_per_token_loss=config.calculate_per_token_loss, init_model_with_meta_device=config.init_model_with_meta_device, + enable_fine_grained_param_gather_hook=( + config.fp8_recipe == "mxfp8" and ddp_config.fp8_param_gather + ), ), ) self.param_and_grad_buffer = self.module.param_and_grad_buffer @@ -123,6 +126,7 @@ def __init__( self.broadcast_params = self.module.broadcast_params self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict + self.module.config = config self.sync_rng_states_across_tp_group() diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 8a63e0f5cf7..17f7f4d1c05 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -23,6 +23,20 @@ import torch.nn as nn from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten +from .mixed_precision import ( + fp8_create_transpose_cache, + fp8_discard_transpose_cache, + is_float8tensor, +) +from .param_and_grad_buffer import ( + AllGatherPipeline, + BucketingPolicy, + GradReducePipeline, + ParamAndGradBuffer, + PrefetchOrder, + override_sharded_param_methods_with_safety_checks, + to_local_if_dtensor, +) from .utils import FSDPDistributedIndex logger = logging.getLogger(__name__) @@ -34,23 +48,12 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) - from megatron.core.fp8_utils import is_float8tensor from megatron.core.utils import is_submodule except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import is_float8tensor, is_submodule - -from .param_and_grad_buffer import ( - AllGatherPipeline, - BucketingPolicy, - GradReducePipeline, - ParamAndGradBuffer, - PrefetchOrder, - override_sharded_param_methods_with_safety_checks, - to_local_if_dtensor, -) + from .utils import is_submodule class TrainingState(Enum): @@ -168,6 +171,7 @@ def __init__( nccl_ub: bool = False, fsdp_double_buffer: bool = False, disable_symmetric_registration: bool = False, + enable_fine_grained_param_gather_hook: bool = False, ): super().__init__() # If device is not specified, use the current device. @@ -217,6 +221,7 @@ def __init__( self.calculate_per_token_loss = calculate_per_token_loss self.init_model_with_meta_device = init_model_with_meta_device + self.enable_fine_grained_param_gather_hook = enable_fine_grained_param_gather_hook # Whether to constantly synchronize the model every training iteration, # which defaults to False to overlap communication with computation @@ -400,6 +405,7 @@ def all_gather_and_wait_parameters_ready( prefetch=True, prefetch_order=PrefetchOrder.FORWARD_PASS_ORDER, wait_bucket_ready=True, + bwd=False, ): """ All-gather parameters across the data parallel group and wait for @@ -426,11 +432,14 @@ def all_gather_and_wait_parameters_ready( and self.ddp_config.outer_dp_sharding_strategy != "no_shard" and (self.microbatch_count == 0 or self.model_auto_sync) ), + bwd=bwd, ) if wait_bucket_ready: for param in params: bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - ag_pipeline.wait_bucket_ready(bucket_id) + ag_pipeline.wait_bucket_ready(bucket_id, bwd) + if bwd and is_float8tensor(param): + fp8_create_transpose_cache(param) for param in params: # This setting is needed to make FSDP store the weight object when used @@ -489,19 +498,17 @@ def _register_fsdp_hooks(self, root_module): """ fsdp_unit_modules = self.fsdp_unit_modules - def release_module_parameters(module, *unused): + def release_module_parameters(module, bwd, *unused): for param in module.parameters(): bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - self.all_gather_pipeline.release_bucket(bucket_id) - + self.all_gather_pipeline.release_bucket(bucket_id, bwd) if not self.ddp_config.keep_fp8_transpose_cache: release_params_fp8_transpose_cache(module.parameters()) def release_params_fp8_transpose_cache(params): for param in params: if is_float8tensor(param): - param._transpose_invalid = True - param._transpose = None + fp8_discard_transpose_cache(param) def _grad_acc(param): """ @@ -558,12 +565,15 @@ def _post_backward(module, *unused): if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params": # Deallocate the module parameters after the backward pass, # because we have our data-parallel gradients computed. - release_module_parameters(module) + release_module_parameters(module, bwd=True) module._training_state = TrainingState.IDLE param_list = list(module.parameters()) else: param_list = list(module.parameters(recurse=False)) + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + # If the parameter is shared, we do not accumulate gradients # here, as the gradients will be accumulated in the # root post-backward hook. @@ -615,6 +625,9 @@ def _pre_forward_param_unshard( # to allocate as little memory as possible for this forward pass. param_list = list(module.parameters(recurse=False)) + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + # All-gather the parameters before the forward pass. self.all_gather_and_wait_parameters_ready( params=param_list, @@ -714,7 +727,7 @@ def _root_post_backward(*unused): if self.model_auto_sync: self.finish_grad_sync() - def _pre_backward(module: nn.Module, *unused): + def _pre_backward_param_unshard(module: nn.Module, *unused): """ Sub-module pre-backward hook to all-gather the module parameters before the backward pass. @@ -723,11 +736,19 @@ def _pre_backward(module: nn.Module, *unused): # and unsharding operations when performing activation recomputation # / gradient checkpointing. module._training_state = TrainingState.PRE_BACKWARD + if isinstance(module, tuple(fsdp_unit_modules)): - # All-gather / unshard the module parameters before the backward pass. - self.all_gather_and_wait_parameters_ready( - list(module.parameters()), prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER - ) + param_list = list(module.parameters()) + else: + param_list = list(module.parameters(recurse=False)) + + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + + # All-gather / unshard the module parameters before the backward pass. + self.all_gather_and_wait_parameters_ready( + param_list, prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER, bwd=True + ) self._root_pre_backward_hook_issued = False @@ -754,7 +775,9 @@ def _root_pre_backward(module: nn.Module, *unused): for bucket_id in range(ag_pipeline.num_buckets): group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.fsdp_unit_id is not None: - ag_pipeline.bucket_can_be_released[bucket_id] = True + ag_pipeline.bucket_can_be_released[ + ag_pipeline.get_bucket_key(bucket_id, bwd=False) + ] = True # Track parameters that require gradient reduction and optimization. self._params_require_handle_grad = set() for param_group in self.param_and_grad_buffer.parameter_groups: @@ -776,8 +799,12 @@ def _post_forward(module: nn.Module, input: Any, output: Any): # during activation recomputation / gradient checkpointing. return output + assert isinstance( + module, tuple(fsdp_unit_modules) + ), "_post_forward hook should only be registered on FSDP unit modules." + # Release the module parameters after the forward pass to save memory. - release_module_parameters(module) + release_module_parameters(module, bwd=False) module._training_state = TrainingState.IDLE return output @@ -818,21 +845,55 @@ def forward_hook(_module, inputs, output): # on the output tensor(s). return module.register_forward_hook(forward_hook) + def _register_pre_forward_param_unshard_hook(module): + """ + Register the forward pre-hook to unshard parameters before the forward pass. + If we are not sharding anything, we do not have a model weight buffer and thus + have nothing to all-gather / un-shard. + """ + if self.ddp_config.data_parallel_sharding_strategy != "no_shard": + self.forward_pre_hooks[f"{module._get_name()} parameter unshard"] = ( + module.register_forward_pre_hook( + _pre_forward_param_unshard, prepend=True, with_kwargs=True + ) + ) + + def _register_pre_backward_param_unshard_hook(module): + """ + Register the backward pre-hook to unshard FSDP unit module parameters + immediately before the backward pass via attaching a gradient-triggered + hook to the output tensor(s) of a module during a post-forward hook. + """ + self.backward_pre_hooks[f"all-gather {module._get_name()} parameters"] = ( + create_custom_backward_hook(module, _pre_backward_param_unshard) + ) + + def _register_grad_acc_and_reduce_hook(module): + """ + Register the post-backward hook to deallocate model parameters and + reduce-scatter gradients immediately after the module backward pass + has completed to conserve memory for the subsequent backward pass. + """ + self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( + module.register_forward_pre_hook( + functools.partial(_register_post_backward_hook, _post_backward), + with_kwargs=True, + ) + ) + fsdp_modules = [] for name, module in root_module.named_modules(): + if self.enable_fine_grained_param_gather_hook: + _register_pre_forward_param_unshard_hook(module) + _register_pre_backward_param_unshard_hook(module) + _register_grad_acc_and_reduce_hook(module) + # Skip if the module is already registered in fsdp_modules. if any(is_submodule(module, fsdp_module) for fsdp_module in fsdp_modules): continue - # Register the forward pre-hook to unshard parameters before the forward pass. - # If we are not sharding anything, we do not have a model weight buffer and thus - # have nothing to all-gather / un-shard. - if self.ddp_config.data_parallel_sharding_strategy != "no_shard": - self.forward_pre_hooks[f"module {name} parameter unshard"] = ( - module.register_forward_pre_hook( - _pre_forward_param_unshard, prepend=True, with_kwargs=True - ) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_pre_forward_param_unshard_hook(module) if isinstance(module, tuple(fsdp_unit_modules)): fsdp_modules.append(module) @@ -843,12 +904,8 @@ def forward_hook(_module, inputs, output): module.register_forward_hook(_post_forward, prepend=False) ) - # Register the backward pre-hook to unshard FSDP unit module parameters - # immediately before the backward pass via attaching a gradient-triggered - # hook to the output tensor(s) of a module during a post-forward hook. - self.backward_pre_hooks[f"all-gather module {name} parameters"] = ( - create_custom_backward_hook(module, _pre_backward) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_pre_backward_param_unshard_hook(module) elif ( not self.ddp_config.keep_fp8_transpose_cache and self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params" @@ -861,15 +918,8 @@ def forward_hook(_module, inputs, output): module.register_forward_hook(_release_module_fp8_transpose_cache, prepend=False) ) - # Register the post-backward hook to deallocate model parameters and - # reduce-scatter gradients immediately after the module backward pass - # has completed to conserve memory for the subsequent backward pass. - self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( - module.register_forward_pre_hook( - functools.partial(_register_post_backward_hook, _post_backward), - with_kwargs=True, - ) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_grad_acc_and_reduce_hook(module) # Register root module pre- and post-backward hooks in cases where the # forward function of root module is not called, but rather the forward @@ -986,7 +1036,7 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo else: self.synchronize_param_gather() for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id) + self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id, bwd=False) group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.model_weight_buffer is None: continue @@ -994,9 +1044,10 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo if group.model_weight_buffer.is_data_distributed: # If model weight is sharded, we wait for the all-gather to complete and # then release the bucket immediately to save memory usage. - self.all_gather_pipeline.wait_bucket_ready(bucket_id) + self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) + for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.wait_bucket_ready(bucket_id) + self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) def start_grad_sync(self, *unused): """ diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py new file mode 100644 index 00000000000..69a049ad955 --- /dev/null +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py @@ -0,0 +1,331 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from importlib.metadata import version +from typing import List, Optional, Tuple + +import torch +from packaging.version import Version as PkgVersion + +logger = logging.getLogger(__name__) + +# Detect if Transformer Engine is installed +try: + import transformer_engine # pylint: disable=W0611 + from transformer_engine.pytorch.module.base import TransformerEngineBaseModule + + HAVE_TE = True +except (ImportError, ModuleNotFoundError): + TransformerEngineBaseModule = None + HAVE_TE = False + logger.info("Using Megatron-FSDP without Transformer Engine.") + +# Detect the Transformer Engine version +try: + import transformer_engine as te + + if hasattr(te, "__version__"): + TE_VERSION = PkgVersion(str(te.__version__)) + else: + TE_VERSION = PkgVersion(version("transformer-engine")) +except: + TE_VERSION = None + +# Detect the FP8 tensor class +try: + from transformer_engine.pytorch.tensor import QuantizedTensor + + HAVE_TE_FP8_TENSOR_CLASS = True + FP8_TENSOR_CLASS = QuantizedTensor +except: + try: + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE_FP8_TENSOR_CLASS = True + FP8_TENSOR_CLASS = Float8Tensor + except: + HAVE_TE_FP8_TENSOR_CLASS = False + +# Detect the MXFP8 tensor class +try: + from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor + + HAVE_TE_MXFP8TENSOR = True +except: + HAVE_TE_MXFP8TENSOR = False + +# Detect the Blockwise FP8 tensor class +try: + from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockwiseQTensor + + HAVE_TE_BLOCKWISE_FP8TENSOR = True +except: + HAVE_TE_BLOCKWISE_FP8TENSOR = False + +# Detect the "cast_master_weights_to_fp8" function of Transformer Engine +try: + from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 + + HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = True +except: + HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = False + + # Try to import multi_tensor_apply, used in the fallback of fp8 quantization. + try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale + + multi_tensor_scale_impl = multi_tensor_scale + except ImportError: + try: + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier + + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + import warnings + + warnings.warn( + "Transformer Engine and Apex are not installed. " + "Falling back to local implementations of " + "multi_tensor_applier and multi_tensor_scale" + ) + + def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): + """Multi tensor op applier""" + return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) + + def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): + """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" + for src, dst in zip(tensor_lists[0], tensor_lists[1]): + dst.copy_(src * scale) + + multi_tensor_applier = local_multi_tensor_applier + multi_tensor_scale_impl = local_multi_tensor_scale + + def _multi_tensor_copy_this_to_that( + this: List[torch.Tensor], + that: List[torch.Tensor], + overflow_buf: Optional[torch.Tensor] = None, + ): + """ + Use multi-tensor-applier to copy values from one list to another. + We don't have a bfloat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16. + """ + if overflow_buf is not None: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + +# Detect the "post_all_gather_processing" function of Transformer Engine +try: + from transformer_engine.pytorch.tensor.utils import post_all_gather_processing + + HAVE_TE_POST_ALL_GATHER_PROCESSING = True +except: + HAVE_TE_POST_ALL_GATHER_PROCESSING = False + + +def is_te_min_version(vers, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if not isinstance(TE_VERSION, PkgVersion): + return False + + if check_equality: + return TE_VERSION >= PkgVersion(vers) + else: + return TE_VERSION > PkgVersion(vers) + + +def is_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a FP8 tensor.""" + return HAVE_TE and isinstance(tensor, FP8_TENSOR_CLASS) + + +def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a Blockwise FP8 tensor.""" + return HAVE_TE_BLOCKWISE_FP8TENSOR and isinstance(tensor, Float8BlockwiseQTensor) + + +def fp8_need_transpose_data(tensor: torch.Tensor) -> bool: + """Check if a FP8 tensor needs transpose data.""" + return HAVE_TE_MXFP8TENSOR and isinstance(tensor, MXFP8Tensor) + + +def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngineBaseModule) -> bool: + """Check if a FP8 tensor needs transpose data, for meta device init scenario.""" + return HAVE_TE_MXFP8TENSOR and module.fp8_meta["recipe"].mxfp8() + + +def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None: + """Discard the transpose cache of a FP8 tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if hasattr(tensor, "_transpose_invalid"): + tensor._transpose_invalid = True + tensor._transpose = None + elif not fp8_need_transpose_data(tensor): + tensor.update_usage(rowwise_usage=True, columnwise_usage=False) + + +def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None: + """Create the transpose cache of a FP8 tensor.""" + if HAVE_TE_POST_ALL_GATHER_PROCESSING: + post_all_gather_processing(tensors) + else: + _fp8_create_transpose_cache_fallback(tensors) + + +def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) -> None: + if not isinstance(tensors, list): + tensors = [tensors] + for tensor in tensors: + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + if hasattr(tensor, "_create_transpose"): + tensor._create_transpose() + else: + tensor._create_columnwise() + + +def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_transpose: bool = False) -> None: + """Set the raw data of a Transformer Engine Float8Tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if set_transpose: + assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" + data_attr = "_columnwise_data" + else: + data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" + + old_data = getattr(tensor, data_attr) + assert old_data.dtype == data.dtype, "The data types of raw data don't match" + assert ( + old_data.shape == data.shape + ), f"Shape {old_data.shape} of old_data doesn't match {data.shape} of new_data" + setattr(tensor, data_attr, data) + + +def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) -> torch.Tensor: + """Get the underlying raw storage of a FP8 tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if get_transpose: + assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" + data_attr = "_columnwise_data" + else: + data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" + + return getattr(tensor, data_attr) + + +def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor: + """Dequantize a FP8 tensor to a higher precision.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + assert is_te_min_version( + "2.0" + ), "Transformer Engine >= 2.0 is required for dequantizing parameters." + return tensor.dequantize() + + +def fp8_quantize( + model_params: List[torch.Tensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: torch.distributed.ProcessGroup, + fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], +) -> None: + """Quantize sharded parameters to FP8.""" + if len(model_params) == 0: + return + fsdp_shard_model_params = [x[0] if x[1] is None else x for x in fsdp_shard_model_params] + + if HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8: + cast_master_weights_to_fp8( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params + ) + else: + _fp8_quantize_fallback( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params + ) + + +def _fp8_quantize_fallback( + model_params: List[torch.Tensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: torch.distributed.ProcessGroup, + fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], +) -> None: + for model_param, main_param, start_offset, fsdp_shard_model_param in zip( + model_params, main_params, start_offsets, fsdp_shard_model_params + ): + if main_param is None: + continue + + if fsdp_shard_model_param is not None: + shard_model_param = fsdp_shard_model_param + else: + shard_model_param = model_param._data.view(-1)[ + start_offset : start_offset + main_param.numel() + ] + + quantizer = model_param._quantizer + # When not using fp8 params, the main_param (fp32) is first cast to bf16/fp16, and then + # cast to fp8 during forward. This logic keeps numerical consistency with bf16 params. + main_param = main_param.to(model_param.dtype) + out = Float8Tensor( + shape=main_param.size(), + dtype=model_param.dtype, + requires_grad=False, + data=shard_model_param, + fp8_scale_inv=model_param._scale_inv, + fp8_dtype=model_param._fp8_dtype, + quantizer=quantizer, + ) + quantizer.update_quantized(main_param, out) + + amaxes = [] + scales = [] + scale_invs = [] + for model_param in model_params: + quantizer = model_param._quantizer + amaxes.append(quantizer.amax.view(1)) + scales.append(quantizer.scale.view(1)) + scale_invs.append(model_param._scale_inv.view(1)) + model_param._reset_caches() + + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") + + # Update scaling factors. + packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) + packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] + _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) + torch.reciprocal(packed_scales, out=packed_scales) + _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) + + # Reduce amaxes. + # Note: Assume each param has a separate amax. + packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) + packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] + _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) + torch.distributed.all_reduce( + packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group + ) + _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index 88254d89988..b0154cb94e9 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -33,6 +33,17 @@ from torch.distributed.tensor import DTensor, Replicate, Shard from torch.distributed.tensor.device_mesh import _mesh_resources +from .mixed_precision import ( + fp8_discard_transpose_cache, + fp8_get_raw_data, + fp8_need_transpose_data, + fp8_need_transpose_data_for_meta_device_init, + fp8_quantize, + fp8_set_raw_data, + is_blockwise_float8tensor, + is_float8tensor, + is_te_min_version, +) from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor from .utils import ( _MODEL_PARALLEL_RNG_TRACKER_NAME, @@ -51,27 +62,15 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) - from megatron.core.fp8_utils import ( - is_float8tensor, - modify_underlying_storage, - quantize_param_shard, - ) from megatron.core.tensor_parallel import get_cuda_rng_tracker - from megatron.core.utils import is_submodule, is_te_min_version + from megatron.core.utils import is_submodule logger.info("Detected Megatron Core, using Megatron-FSDP with Megatron.") except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import ( - get_cuda_rng_tracker, - is_float8tensor, - is_submodule, - is_te_min_version, - modify_underlying_storage, - quantize_param_shard, - ) + from .utils import get_cuda_rng_tracker, is_submodule logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") @@ -817,7 +816,7 @@ def __init__( data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, dp_rank: Optional[int] = None, temporary_bucket_allocator: Optional[TemporaryBucketAllocator] = None, - is_dtype_float8: bool = False, + is_transpose_buffer: bool = False, gradient_scaling_factor: Optional[float] = None, chunk_size_factor: int = 1, mem_alloc_context: Optional[Callable] = None, @@ -850,7 +849,7 @@ def __init__( self.temporary_bucket_allocator = ( temporary_bucket_allocator if temporary_bucket_allocator else TemporaryBucketAllocator() ) - self.is_dtype_float8 = is_dtype_float8 + self.is_transpose_buffer = is_transpose_buffer self.gradient_scaling_factor = gradient_scaling_factor self.mem_alloc_context = mem_alloc_context if mem_alloc_context else nullcontext @@ -946,11 +945,11 @@ def fetch_bucket( for p in self.params: item_id = self.param_idx[p] p = to_local_if_dtensor(p) + data = self.get_item_from_bucket(bucket, item_id).view(p.shape) if is_float8tensor(p): - p._data = self.get_item_from_bucket(bucket, item_id).view(p.shape) + fp8_set_raw_data(p, data, self.is_transpose_buffer) else: - p.data = self.get_item_from_bucket(bucket, item_id).view(p.shape) - + p.data = data return bucket def free_bucket_storage(self): @@ -1119,6 +1118,9 @@ def set_item(self, item_id: int, item_data: torch.Tensor) -> None: # When fully sharded, we need to get the slice of the item to be stored in this shard. # Otherwise, we can just flatten the entire item since this buffer contains # the entire bucket. + if is_float8tensor(item_data): + item_data = fp8_get_raw_data(item_data, self.is_transpose_buffer) + if self.is_data_distributed: # Get the coordinates of the slice of the item that is contained in this shard. slice_start, slice_end = self._get_item_slice_in_shard(item_id) @@ -1225,6 +1227,8 @@ class ParameterGroup: Factor determining chunk size for grouped parameter processing. model_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store model weights for data-parallel operations. + transpose_weight_buffer (Optional[DataParallelBuffer]): + Buffer used to store transpose weights for data-parallel operations. main_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store main model weights for data-parallel operations. main_grad_buffer (Optional[DataParallelBuffer]): @@ -1244,6 +1248,7 @@ class ParameterGroup: fsdp_unit_id: Optional[int] = None chunk_size_factor: int = 1 model_weight_buffer: Optional[DataParallelBuffer] = None + transpose_weight_buffer: Optional[DataParallelBuffer] = None main_weight_buffer: Optional[DataParallelBuffer] = None main_grad_buffer: Optional[DataParallelBuffer] = None hsdp_wbuf: Optional[DataParallelBuffer] = None @@ -1314,12 +1319,10 @@ def _does_param_require_new_bucket(param): parameter_groups = [] for name, param in module.named_parameters(): # We need this information to correctly dynamically allocate Tensors! + is_fp8 = is_float8tensor(param) + is_fp8_meta_device_init = meta_device_init_fp8_params.get(name, (False, False))[0] param_attrs = dict( - dtype=( - "float8" - if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) - else param.dtype - ), + dtype="float8" if (is_fp8 or is_fp8_meta_device_init) else param.dtype, is_expert_param=is_expert_parameter(name, param), requires_grad=param.requires_grad, fsdp_unit_id=None, @@ -1641,7 +1644,10 @@ def __init__( # to determine whether this parameter is fp8 or not. fp8_meta_index = m.param_init_meta[name].fp8_meta_index if m.primary_weights_in_fp8 and fp8_meta_index is not None: - meta_device_init_fp8_params[self.param_to_name[param]] = True + meta_device_init_fp8_params[self.param_to_name[param]] = ( + True, + fp8_need_transpose_data_for_meta_device_init(m), + ) # Get the parameter groups. (self.parameter_groups, self.param_to_param_group, self.bucket_to_bucket_group) = ( @@ -1725,6 +1731,7 @@ def _bytes_to_mb(bytes_val: int) -> str: numel = sum(to_local_if_dtensor(p).shape.numel() for p in group.params) buffers = { "weight": group.model_weight_buffer, + "transpose_weight": group.transpose_weight_buffer, "main_weight": group.main_weight_buffer, "grad": group.main_grad_buffer, } @@ -1794,12 +1801,18 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): self.weight_alloc = FixedPoolAllocator( name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) + self.transpose_weight_alloc = FixedPoolAllocator( + name="fsdp_fp8_transpose_params", + fsdp_param_groups=self.parameter_groups, + size=UB_BUFFER_NUM, + ) self.main_grad_alloc = FixedPoolAllocator( name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units else: self.weight_alloc = StorageResizeBasedBucketAllocator() + self.transpose_weight_alloc = StorageResizeBasedBucketAllocator() self.main_grad_alloc = None self.double_buf_units = [] @@ -1839,8 +1852,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ) # Check if the parameter group is FP8. one_param = group.params[0] - is_dtype_float8 = is_float8tensor(one_param) or meta_device_init_fp8_params.get( - self.param_to_name[one_param], False + is_dtype_float8 = ( + is_float8tensor(one_param) + or meta_device_init_fp8_params.get(self.param_to_name[one_param], (False, False))[0] ) if is_dtype_float8: param_dtype = torch.uint8 @@ -1849,6 +1863,16 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): param_dtype = group.params[0].dtype grad_dtype = param_dtype + # Check if the parameter group needs a transpose buffer for model weights. + # Currently, only mxfp8 needs it. + need_transpose_data = is_float8tensor(one_param) and fp8_need_transpose_data(one_param) + need_transpose_data_for_meta_device_init = meta_device_init_fp8_params.get( + self.param_to_name[one_param], (False, False) + )[1] + should_create_transpose_weight_buffer = ( + need_transpose_data or need_transpose_data_for_meta_device_init + ) + # Check if the parameter group requires a grad buffer or main weight buffer. should_create_grad_buffer_or_main_weight_buffer = ( not self.only_create_grad_buffer_and_main_weight_buffer_for_param_requires_grad @@ -1865,13 +1889,29 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=param_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_dtype_float8=is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, mem_alloc_context=self.mem_alloc_context, **main_buf_extra_kwargs, ) + if should_create_transpose_weight_buffer: + group.transpose_weight_buffer = DataParallelBuffer( + self.ddp_config, + group.params, + is_data_distributed=is_model_weight_buffer_distributed + and main_buf_dp_group.size() > 1, + dtype=param_dtype, + device=self.device, + data_parallel_group=main_buf_dp_group, + is_transpose_buffer=True, + temporary_bucket_allocator=self.transpose_weight_alloc, + bucket_id=group_id, + chunk_size_factor=group.chunk_size_factor, + mem_alloc_context=self.mem_alloc_context, + **main_buf_extra_kwargs, + ) # Initialize the main weight buffer. if should_create_grad_buffer_or_main_weight_buffer and preserve_fp32_weights: @@ -1903,7 +1943,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=torch.float32 if grad_reduce_in_fp32 else grad_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_dtype_float8=False, + is_transpose_buffer=False, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -1927,7 +1967,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=wbuf.dtype, device=wbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_dtype_float8=wbuf.is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, @@ -1943,6 +1983,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ), ) + if group.transpose_weight_buffer is not None: + raise NotImplementedError("HSDP for transpose buffer is not implemented yet") + if should_create_grad_buffer_or_main_weight_buffer: # Initialize the HSDP grad buffer. gbuf = group.main_grad_buffer @@ -1954,7 +1997,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=gbuf.dtype, device=gbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_dtype_float8=gbuf.is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -2037,6 +2080,20 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): torch.empty(wbuf.data_size, dtype=wbuf.dtype, device=self.device) ) bucket = wbuf.fetch_bucket() + + tbuf = group.transpose_weight_buffer + if tbuf: + with self.mem_alloc_context(): + if group.hsdp_wbuf: + raise NotImplementedError( + "HSDP for transpose buffer is not implemented yet" + ) + else: + tbuf.init_data( + torch.empty(tbuf.data_size, dtype=tbuf.dtype, device=self.device) + ) + transpose_bucket = tbuf.fetch_bucket() + mbuf = group.main_weight_buffer if mbuf: # Manually instantiate an empty tensor into the main weight buffer. @@ -2090,25 +2147,41 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): if not self.ddp_config.keep_fp8_transpose_cache: for _param in m.parameters(recurse=False): if is_float8tensor(_param): - _param._transpose_invalid = True - _param._transpose = None + fp8_discard_transpose_cache(_param) # Raise error if a meta parameter still exists after initialization. assert not p.is_meta, (self.param_to_name[p], module_reset_flag) + p_local = to_local_if_dtensor(p) + # Copy the model weight parameter tensor into the buffer. # When distributed, this shards and preserves the data across all ranks. - wbuf.set_item(item_id, to_local_if_dtensor(p)) + wbuf.set_item(item_id, p_local) + if tbuf: + tbuf.set_item(item_id, p_local) # Retrieve the newly allocated parameter data from the global bucket. # Attach the bucket-allocated parameter data to the module parameter, # to use the bucket-allocated data for autograd and NCCL. - new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view( - to_local_if_dtensor(p).shape - ) - if is_float8tensor(p): - # Needed to instantiate FP8 parameters. Requires installing - # TransformerEngine. - modify_underlying_storage(p, new_param_data) + new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view(p_local.shape) + if tbuf: + new_transpose_data = tbuf.get_item_from_bucket( + transpose_bucket, item_id + ).view(p_local.shape) + else: + new_transpose_data = None + + if is_float8tensor(p_local): + old_param_data = fp8_get_raw_data(p_local) + assert old_param_data._base is None + new_param_data.detach().copy_(old_param_data) + fp8_set_raw_data(p_local, new_param_data) + del old_param_data + if new_transpose_data is not None: + old_transpose_data = fp8_get_raw_data(p_local, True) + assert old_transpose_data._base is None + new_transpose_data.detach().copy_(old_transpose_data) + fp8_set_raw_data(p_local, new_transpose_data, True) + del old_transpose_data elif isinstance(p, DTensor): old_param_data = p._local_tensor.data p._local_tensor.data = new_param_data @@ -2146,7 +2219,12 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # the (high-precision) main weight buffer. # Nothing else needs to be done, because the main weights # do not require autograd operations, only possibly sharding. - mbuf.set_item(item_id, to_local_if_dtensor(p)) + p_local = to_local_if_dtensor(p) + assert not is_float8tensor(p_local), ( + self.param_to_name[p], + "fp8 param should use get_high_precision_init_val method.", + ) + mbuf.set_item(item_id, p_local) if wbuf and wbuf.is_data_distributed: # Free the memory backing the temporarily-allocated bucket associated @@ -2158,6 +2236,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # before forward activations and gradients are allocated in training. wbuf.free_bucket_storage() + if tbuf and tbuf.is_data_distributed: + tbuf.free_bucket_storage() + # Allocate the main_weight buffer and main_grad buffer data in one buffer. if self.buffer_all_in_one: with self.mem_alloc_context(): @@ -2281,6 +2362,7 @@ def _reset_parameters(self, old_params, new_params): group.params[item_id] = new_p for buf in [ group.model_weight_buffer, + group.transpose_weight_buffer, group.main_weight_buffer, group.main_grad_buffer, group.hsdp_wbuf, @@ -2328,6 +2410,7 @@ def _init_distributed_params(self): dist_main_weight = {} for pg in self.parameter_groups: wbuf = pg.model_weight_buffer + tbuf = pg.transpose_weight_buffer mbuf = pg.main_weight_buffer for item_id, orig_param in enumerate(pg.params): param_name = self.param_to_name[orig_param] @@ -2355,6 +2438,7 @@ def _init_distributed_params(self): ) dist_main_weight[param_name] = dist_param elif wbuf: + assert tbuf is None, "Transpose buffer should only exist when main params exist" dist_param = make_fsdp_dtensor( local_tensor=wbuf.get_item(item_id, only_shard=sharded_optimizer_state), param=orig_param, @@ -2524,9 +2608,54 @@ def copy_main_weights_to_model_weights(self): expert_param_quantize_kwargs = copy.deepcopy(dense_param_quantize_kwargs) data_parallel_group = None expert_data_parallel_group = None + clear_quantize_kwargs = lambda kwargs: [d.clear() for d in kwargs.values()] + + def _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs): + if len(dense_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 parameters, we need to quantize them. + fp8_quantize(data_parallel_group=data_parallel_group, **dense_param_quantize_kwargs) + + if len(expert_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 expert parameters, we need to quantize them. + fp8_quantize( + data_parallel_group=expert_data_parallel_group, **expert_param_quantize_kwargs + ) + + clear_quantize_kwargs(dense_param_quantize_kwargs) + clear_quantize_kwargs(expert_param_quantize_kwargs) + + # Special handling of blockwise FP8 + BATCH_QUANT_MEMORY_LIMIT_BYTES = 5 * 1024**3 # 5 GB + blockwise_fp8_weight_buffers = [] + blockwise_fp8_param_buffers = [] + + def _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers + ): + if len(blockwise_fp8_param_buffers) == 0: + return + + # Copy original param shards into their blockwise FP8 working buffers + for bufs in blockwise_fp8_param_buffers: + bufs["bucket_param"].copy_(bufs["param"]) + + # Apply FP8 quantization to blockwise FP8 parameters + _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) + + # Copy quantized params back from working buffers to original param tensors + for bufs in blockwise_fp8_param_buffers: + bufs["param"].copy_(bufs["bucket_param"]) + blockwise_fp8_param_buffers.clear() + + # Free bucket storage for blockwise FP8 weight buffers + for wbuf in blockwise_fp8_weight_buffers: + wbuf.free_bucket_storage() + blockwise_fp8_weight_buffers.clear() + for pg in self.parameter_groups: mbuf = pg.main_weight_buffer wbuf = pg.model_weight_buffer + tbuf = pg.transpose_weight_buffer if mbuf is None: continue @@ -2542,44 +2671,88 @@ def copy_main_weights_to_model_weights(self): shard_offsets_in_fp8 = quantize_func_kwargs["start_offsets"] shard_model_params = quantize_func_kwargs["fsdp_shard_model_params"] + has_blockwise_fp8_param = False for param in pg.params: item_id = mbuf.param_idx[param] if wbuf: if wbuf.is_data_distributed or mbuf.is_data_distributed: model_param = wbuf.get_item(item_id, only_shard=True) + if tbuf: + transpose_param = tbuf.get_item(item_id, only_shard=True) + else: + transpose_param = None main_weight = mbuf.get_item(item_id, only_shard=True) else: model_param = wbuf.get_item(item_id) + if tbuf: + transpose_param = tbuf.get_item(item_id) + else: + transpose_param = None main_weight = mbuf.get_item(item_id) else: assert not mbuf.is_data_distributed model_param = to_local_if_dtensor(param) main_weight = mbuf.get_item(item_id) + if is_blockwise_float8tensor(param): + fp8_params.append(param) + if model_param.numel() == 0: + shard_fp32_from_fp8.append(None) + shard_offsets_in_fp8.append(None) + shard_model_params.append([None, None]) + else: + shard_fp32_from_fp8.append(main_weight) + shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) + bucket = wbuf.fetch_bucket() + b_model_param = wbuf.get_item_from_bucket(bucket, item_id)[ + slice(*wbuf.locate_item_in_global_item(item_id)) + ] + assert ( + transpose_param is None + ), "Blockwise FP8 does not support transpose param." + shard_model_params.append([b_model_param, None]) + assert b_model_param.numel() == model_param.numel(), ( + f"Blockwise FP8 bucket param numel {b_model_param.numel()} does" + f" not match model param numel {model_param.numel()}" + f" name: {self.param_to_name[param]}" + ) + blockwise_fp8_param_buffers.append( + {"bucket_param": b_model_param, "param": model_param} + ) + has_blockwise_fp8_param = True + continue + if is_float8tensor(param): fp8_params.append(param) if model_param.numel() == 0: shard_fp32_from_fp8.append(None) shard_offsets_in_fp8.append(None) - shard_model_params.append(None) + shard_model_params.append([None, None]) else: shard_fp32_from_fp8.append(main_weight) shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) - shard_model_params.append(model_param) + shard_model_params.append([model_param, transpose_param]) continue if model_param.numel() > 0: model_param.data.copy_(main_weight.view(model_param.shape)) - if len(dense_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 parameters, we need to quantize them. - dense_param_quantize_kwargs["data_parallel_group"] = data_parallel_group - quantize_param_shard(**dense_param_quantize_kwargs) + if has_blockwise_fp8_param: + blockwise_fp8_weight_buffers.append(wbuf) + if ( + sum([wbuf.bucket_index.size for wbuf in blockwise_fp8_weight_buffers]) + > BATCH_QUANT_MEMORY_LIMIT_BYTES + ): + _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, + expert_param_quantize_kwargs, + blockwise_fp8_param_buffers, + ) - if len(expert_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 expert parameters, we need to quantize them. - expert_param_quantize_kwargs["data_parallel_group"] = expert_data_parallel_group - quantize_param_shard(**expert_param_quantize_kwargs) + _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers + ) + _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) @torch.no_grad() def copy_model_weights_to_main_weights(self): @@ -2597,6 +2770,7 @@ def copy_model_weights_to_main_weights(self): f"Master weight buffer size {mbuf.data.numel()} does not match " f"model weight buffer size {copyin_data.numel()}" ) + # TODO(mxfp8): Make sure it's not a fp8 buf? mbuf.data.copy_(copyin_data.data) def all_gather_parameters(self, async_op: bool = True): @@ -2614,15 +2788,18 @@ def all_gather_parameters(self, async_op: bool = True): all_gather_ops = [] for g in self.parameter_groups: - shard = g.model_weight_buffer.get_shard_from_local_buffer() - all_gather_handler = torch.distributed.all_gather_into_tensor( - output_tensor=g.model_weight_buffer.data, - input_tensor=shard, - group=g.model_weight_buffer.data_parallel_group, - async_op=async_op, - ) - if async_op: - all_gather_ops.append(all_gather_handler) + for buf in [g.model_weight_buffer, g.transpose_weight_buffer]: + if buf is None: + continue + shard = buf.get_shard_from_local_buffer() + all_gather_handler = torch.distributed.all_gather_into_tensor( + output_tensor=buf.data, + input_tensor=shard, + group=buf.data_parallel_group, + async_op=async_op, + ) + if async_op: + all_gather_ops.append(all_gather_handler) for op in all_gather_ops: op.wait() @@ -2643,7 +2820,7 @@ def reduce_scatter_gradients(self, async_op: bool = True): reduce_scatter_ops = [] for g in self.parameter_groups: gbuf = g.main_grad_buffer - if gbuf is not None: + if gbuf is None: continue scaling_factor = gbuf.gradient_scaling_factor reduce_op = gradient_reduce_preprocessing(gbuf.data, scaling_factor, self.ddp_config) @@ -3093,9 +3270,16 @@ def __init__( # Track the status of all-gather operations for each bucket. self.param_gather_event_map = {} # All buckets are initially deallocated / empty after initialization of ParamAndGradBuffer. - self.bucket_status = {i: BucketStatus.EMPTY for i in range(self.buffer.num_buckets)} + self.bucket_status = {} + for i in range(self.buffer.num_buckets): + for bwd in [False, True]: + self.bucket_status[self.get_bucket_key(i, bwd)] = BucketStatus.EMPTY + # Track whether each bucket can be deallocated. - self.bucket_can_be_released = {i: False for i in range(self.buffer.num_buckets)} + self.bucket_can_be_released = {} + for i in range(self.buffer.num_buckets): + for bwd in [False, True]: + self.bucket_can_be_released[self.get_bucket_key(i, bwd)] = False # Map each bucket to the bucket group it belongs to by enumerated ID. # Made to collect a subset of buckets in the same bucket group. @@ -3120,6 +3304,13 @@ def __init__( # all-gather parameters across groups. self.outer_fsdp_group_param_gather_stream = torch.cuda.Stream() + def get_bucket_key(self, bucket_id, bwd): + """Get the key for the bucket.""" + has_transpose_buffer = ( + self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None + ) + return (bucket_id, has_transpose_buffer and bwd) + @property def num_buckets(self): """Return the number of buckets.""" @@ -3136,10 +3327,11 @@ def reset(self): UserWarning, ) while len(self.param_gather_event_map) > 0: - bucket_id = next(iter(self.param_gather_event_map)) - self.wait_bucket_ready(bucket_id) + (bucket_id, bwd) = next(iter(self.param_gather_event_map)) + self.wait_bucket_ready(bucket_id, bwd) for bucket_id in range(self.num_buckets): - self.bucket_can_be_released[bucket_id] = True + for bwd in [False, True]: + self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = True self.recycle_unused_buckets() assert all([status is BucketStatus.EMPTY for status in self.bucket_status.values()]), ( @@ -3161,6 +3353,7 @@ def all_gather_params( suggested_AG_prefetch_size: Optional[int] = None, async_param_gather: bool = True, outer_fsdp_group_param_gather: bool = False, + bwd: bool = False, ): """All-gather the params. If prefetch is enabled, prefetch next buckets in the order of `prefetch_order`. @@ -3195,7 +3388,7 @@ def all_gather_params( # Do not release the buckets that are being all-gathered. for bucket_id in ag_buckets: - self.bucket_can_be_released[bucket_id] = False + self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = False # If prefetch is enabled, we will add prefetch buckets to ag_buckets. if prefetch: @@ -3267,7 +3460,11 @@ def need_skip_prefetch(bucket_id): bucket_id = next_bucket_id(ag_buckets) # Only all-gather on buckets that have not been allocated yet. - ag_buckets = [i for i in ag_buckets if self.bucket_status[i] == BucketStatus.EMPTY] + ag_buckets = [ + bucket_id + for bucket_id in ag_buckets + if self.bucket_status[self.get_bucket_key(bucket_id, bwd)] == BucketStatus.EMPTY + ] if len(ag_buckets) == 0: return @@ -3286,6 +3483,7 @@ def need_skip_prefetch(bucket_id): self.ag_stream if self.ag_stream is not None else torch.cuda.current_stream() ) if outer_fsdp_group_param_gather: + # TODO(mxfp8): Support hsdp self.outer_fsdp_group_param_gather_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self.outer_fsdp_group_param_gather_stream): outer_fsdp_group = self.buffer.dist_index.get_outer_fsdp_group() @@ -3313,12 +3511,13 @@ def need_skip_prefetch(bucket_id): for bucket_id in buckets: # All-gather the module weights from each FSDP buffer shard # into an allocated bucket containing unsharded weights. - self.async_bucket_gather(bucket_id) + self.async_bucket_gather(bucket_id, bwd) # Replace the parameter all-gather event with coalescing event. for bucket_id in buckets: - _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_id] - self.param_gather_event_map[bucket_id] = ( + bucket_key = self.get_bucket_key(bucket_id, bwd) + _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_key] + self.param_gather_event_map[bucket_key] = ( coalescing_event, mark_bucket_ready_to_use, ) @@ -3326,14 +3525,16 @@ def need_skip_prefetch(bucket_id): # Wait for all-gather to finish if not async_param_gather: for bucket_id in buckets: - self.wait_bucket_ready(bucket_id) + self.wait_bucket_ready(bucket_id, bwd) - def wait_bucket_ready(self, bucket_id, empty_ok=False): + def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False): """Wait for the bucket to be ready.""" - if self.bucket_status[bucket_id] == BucketStatus.READY_TO_USE: + bucket_key = self.get_bucket_key(bucket_id, bwd) + + if self.bucket_status[bucket_key] == BucketStatus.READY_TO_USE: # Already ready to use. return - if self.bucket_status[bucket_id] == BucketStatus.EMPTY: + if self.bucket_status[bucket_key] == BucketStatus.EMPTY: if empty_ok: return # Bucket shouldn't be empty, this implies that the bucket @@ -3341,48 +3542,64 @@ def wait_bucket_ready(self, bucket_id, empty_ok=False): raise ValueError(f"Bucket {bucket_id} is empty.") # Wait for asynchronous / overlapped NCCL operations to complete. - param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_id) + param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_key) param_gather_event.wait() mark_bucket_ready_to_use() @torch.no_grad() - def release_bucket(self, bucket_id: int): + def release_bucket(self, bucket_id, bwd): """Release the bucket.""" - if self.bucket_status[bucket_id] == BucketStatus.EMPTY: + # TODO(mxfp8): In some cases, there won't be ag before bwd? + bucket_key = self.get_bucket_key(bucket_id, bwd) + + if self.bucket_status[bucket_key] == BucketStatus.EMPTY: return - self.wait_bucket_ready(bucket_id, empty_ok=True) - if self.bucket_status[bucket_id] == BucketStatus.COMMUNICATING: + self.wait_bucket_ready(bucket_id, bwd, empty_ok=True) + if self.bucket_status[bucket_key] == BucketStatus.COMMUNICATING: raise ValueError(f"Bucket {bucket_id} is communicating.") - wbuf = self.buffer.parameter_groups[bucket_id].model_weight_buffer - wbuf.free_bucket_storage() - self.bucket_status[bucket_id] = BucketStatus.EMPTY + if bwd and self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None: + buf = self.buffer.parameter_groups[bucket_id].transpose_weight_buffer + else: + buf = self.buffer.parameter_groups[bucket_id].model_weight_buffer + + buf.free_bucket_storage() + self.bucket_status[bucket_key] = BucketStatus.EMPTY def recycle_unused_buckets(self): """Recycle the unused buckets.""" - for bucket_id, can_be_released in self.bucket_can_be_released.items(): + for bucket_key, can_be_released in self.bucket_can_be_released.items(): if can_be_released: - self.release_bucket(bucket_id) - self.bucket_can_be_released[bucket_id] = False + bucket_id, is_transpose_weight = bucket_key[0], bucket_key[1] + self.release_bucket(bucket_id, is_transpose_weight) + self.bucket_can_be_released[bucket_key] = False - def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer: + def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBuffer: """Get the FSDP buffer with the given bucket ID.""" param_group = self.buffer.parameter_groups[bucket_id] if self.buffer.ddp_config.outer_dp_sharding_strategy != "no_shard": - return param_group.hsdp_wbuf - return param_group.model_weight_buffer + if bwd and param_group.transpose_weight_buffer is not None: + raise RuntimeError("Transpose buffer is not supported for HSDP") + else: + return param_group.hsdp_wbuf + if bwd and param_group.transpose_weight_buffer is not None: + return param_group.transpose_weight_buffer + else: + return param_group.model_weight_buffer @torch.no_grad() - def async_bucket_gather(self, bucket_id: int) -> None: + def async_bucket_gather(self, bucket_id, bwd) -> None: """All-gather the bucket and set the items.""" - self.bucket_can_be_released[bucket_id] = False - if self.bucket_status[bucket_id] != BucketStatus.EMPTY: + bucket_key = self.get_bucket_key(bucket_id, bwd) + + self.bucket_can_be_released[bucket_key] = False + if self.bucket_status[bucket_key] != BucketStatus.EMPTY: return - self.bucket_status[bucket_id] = BucketStatus.COMMUNICATING + self.bucket_status[bucket_key] = BucketStatus.COMMUNICATING - wbuf = self.get_fsdp_buffer(bucket_id) + wbuf = self.get_fsdp_buffer(bucket_id, bwd) # Lazy release the unused buckets. self.recycle_unused_buckets() @@ -3397,18 +3614,21 @@ def async_bucket_gather(self, bucket_id: int) -> None: async_op=True, ) - def get_closure(bucket_id): + def get_closure(bucket_id, bwd): @torch.no_grad() def mark_bucket_ready_to_use(): # Mark the bucket as ready to use - all NCCL operations are complete. - self.bucket_status[bucket_id] = BucketStatus.READY_TO_USE + self.bucket_status[self.get_bucket_key(bucket_id, bwd)] = BucketStatus.READY_TO_USE return mark_bucket_ready_to_use - mark_bucket_ready_to_use = get_closure(bucket_id) + mark_bucket_ready_to_use = get_closure(bucket_id, bwd) # Track the async all-gather operation for the bucket. - self.param_gather_event_map[bucket_id] = (param_gather_event, mark_bucket_ready_to_use) + self.param_gather_event_map[self.get_bucket_key(bucket_id, bwd)] = ( + param_gather_event, + mark_bucket_ready_to_use, + ) @torch.no_grad() @@ -3501,15 +3721,13 @@ def override_sharded_param_methods_with_safety_checks(params, all_gather_pipelin def override_sharded_param_to_function_closure(p, to_function): def override_sharded_param_to_function(*args, **kwargs): - bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] - status = all_gather_pipeline.bucket_status[bucket_id] - if status == BucketStatus.READY_TO_USE: - return to_function(*args, **kwargs) - raise RuntimeError( - "This parameter is already shard by MCore FSDP and the " - "shared-state parameter does not support 'to' function." - "please define the dtype and device of the parameter before FSDP wrap." - ) + if p._typed_storage()._size() == 0: + warnings.warn( + "The parameter may be sharded by Megatron-FSDP, " + "no actual 'to' operation is performed." + ) + return torch.empty([]) + return to_function(*args, **kwargs) return override_sharded_param_to_function @@ -3517,15 +3735,13 @@ def override_sharded_param_to_function(*args, **kwargs): def override_sharded_param_cpu_function_closure(p, cpu_function): def override_sharded_param_cpu_function(*args, **kwargs): - bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] - status = all_gather_pipeline.bucket_status[bucket_id] - if status == BucketStatus.READY_TO_USE: - return cpu_function(*args, **kwargs) - warnings.warn( - "The parameters are sharded by MCore FSDP, and no actual cpu " - "operation is performed." - ) - return torch.empty([], device="cpu") + if p._typed_storage()._size() == 0: + warnings.warn( + "The parameter may be sharded by Megatron-FSDP, " + "no actual 'cpu' operation is performed." + ) + return torch.empty([], device="cpu") + return cpu_function(*args, **kwargs) return override_sharded_param_cpu_function diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index c9679494737..3d15711275f 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -19,7 +19,7 @@ from contextlib import nullcontext from functools import reduce from importlib.metadata import version -from typing import Callable, List, Optional, Sequence, Union +from typing import Callable, Optional, Sequence, Union try: import einops @@ -79,52 +79,6 @@ def is_te_min_version(vers, check_equality=True): return te_version > PkgVersion(vers) -# Check if Transformer Engine has class for fp8 tensors. -try: - if is_te_min_version("2.0"): - # In TE2.x, QuantizedTensor is the base class for all different type of fp8 tensors, - # including fp8 tensor for delayed scaling, current scaling and mxfp8, etc. - from transformer_engine.pytorch.tensor import QuantizedTensor as FP8_TENSOR_CLASS - else: - from transformer_engine.pytorch.float8_tensor import Float8Tensor as FP8_TENSOR_CLASS - - HAVE_TE_FP8_TENSOR_CLASS = True -except (ImportError, ModuleNotFoundError): - # FP8 tensor class not found - HAVE_TE_FP8_TENSOR_CLASS = False - -try: - from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale - - multi_tensor_scale_impl = multi_tensor_scale -except ImportError: - try: - import amp_C - from apex.multi_tensor_apply import multi_tensor_applier - - multi_tensor_scale_impl = amp_C.multi_tensor_scale - except ImportError: - import warnings - - warnings.warn( - "Transformer Engine and Apex are not installed. " - "Falling back to local implementations of " - "multi_tensor_applier and multi_tensor_scale" - ) - - def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): - """Multi tensor op applier""" - return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) - - def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): - """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" - for src, dst in zip(tensor_lists[0], tensor_lists[1]): - dst.copy_(src * scale) - - multi_tensor_applier = local_multi_tensor_applier - multi_tensor_scale_impl = local_multi_tensor_scale - - def is_submodule(module, parent_module, strict=True): """ Check if a module is a submodule of another module. @@ -138,18 +92,6 @@ def is_submodule(module, parent_module, strict=True): return False -def is_float8tensor(tensor: torch.Tensor) -> bool: - """Check if a tensor is a Transformer Engine Float8Tensor. - - Note that in TE2.x, in order to support more recipes, the design of the fp8 tensor class has - changed. Now Float8Tensor is only used for current scaling and delayed scaling. And mxfp8 - and blockwise scaling have their own fp8 tensor classes. These different fp8 tensor classes - are both inherited from QuantizedTensor. So, for TE1.x, FP8_TENSOR_CLASS is Float8Tensor, - and for TE2.x, FP8_TENSOR_CLASS is QuantizedTensor. - """ - return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS) - - def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]: """ Get all the sub-mesh names in the DeviceMesh. @@ -188,198 +130,6 @@ def contains_submesh( return all(submesh_name in device_mesh_names for submesh_name in submesh_names) -def _multi_tensor_copy_this_to_that( - this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None -): - """ - Use multi-tensor-applier to copy values from one list to another. - We don't have a bfloat16 implementation so for now if the overflow_buf - is not provided, we default back to simple loop copy to be compatible - with bfloat16. - """ - if overflow_buf is not None: - overflow_buf.fill_(0) - # Scaling with factor `1.0` is equivalent to copy. - multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) - else: - for this_, that_ in zip(this, that): - that_.copy_(this_) - - -""" -The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into -several functions. It provides different implementations for each function based on different -versions of TE, ensuring compatibility across various TE versions. - -Currently, there are three functions: - - modify_underlying_storage - This function is used in DDP to place all parameters into a contiguous buffer. For - non-fp8 tensors, replacing their data is simple, just using code like - "tensor.data = new_data". However, for fp8 tensors, their raw data is not stored in the - ".data" attribute, and it varies with different TE versions and different recipes. This - function provides a unified interface to replace the underlying storage of a fp8 tensor. - - quantize_param_shard - This function is used in dist-opt to cast fp32 main params to fp8 params. For non-fp8 - params, this casting is as simple as "bf16_params.copy_(fp32_main_params)"; but for fp8 - params, the casting logic varies with different TE versions and different recipes. This - function provides a unified interface to cast fp32 main params to fp8 params, and also - updates the necessary attributes (like amax, scale, scale_inv or transpose cache) of the - fp8 model params. - - correct_amax_history_if_needed - This function is used to correct the amax history of fp8 tensors. In TE1.x, some inplace - copy operations will write unwanted values to the amax_history of fp8 tensors. This function - corrects the amax_history back. For TE2.x, it's an empty function. - Only useful for delayed scaling. -""" -if HAVE_TE and is_te_min_version("2.2"): - # Supported TE versions: 2.2+ - from transformer_engine.pytorch.tensor import QuantizedTensor - - def _modify_underlying_storage_impl( - fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor - ) -> None: - from transformer_engine.pytorch.tensor.utils import replace_raw_data - - replace_raw_data(fp8_tensor, new_raw_data) - - def _quantize_param_shard_impl( - model_params: List[QuantizedTensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: ProcessGroup, - fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, - ) -> None: - if len(model_params) == 0: - return - - from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 - - args = [model_params, main_params, start_offsets, data_parallel_group] - if fsdp_shard_model_params is not None: - if get_te_version() == PkgVersion("2.3.0.dev0+5fdd7bb") or is_te_min_version("2.3.0"): - args.append(fsdp_shard_model_params) - else: - raise NotImplementedError( - f"FSDP with --fp8-param-gather is not supported in TE v{get_te_version()}" - ) - cast_master_weights_to_fp8(*args) - -elif HAVE_TE and is_te_min_version("2.0"): - # Supported TE versions: 2.0 - from transformer_engine.pytorch.tensor import QuantizedTensor - from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor - - def _modify_underlying_storage_impl( - fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor - ) -> None: - old_raw_data = fp8_tensor._data - assert old_raw_data.dtype == new_raw_data.dtype - new_raw_data.detach().copy_(old_raw_data) - fp8_tensor._data = new_raw_data - del old_raw_data - - def _quantize_param_shard_impl( - model_params: List[QuantizedTensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: ProcessGroup, - fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, - ) -> None: - if len(model_params) == 0: - return - - if fsdp_shard_model_params is None: - fsdp_shard_model_params = [None] * len(model_params) - - for model_param, main_param, start_offset, fsdp_shard_model_param in zip( - model_params, main_params, start_offsets, fsdp_shard_model_params - ): - if main_param is None: - continue - - if fsdp_shard_model_param is not None: - shard_model_param = fsdp_shard_model_param - else: - shard_model_param = model_param._data.view(-1)[ - start_offset : start_offset + main_param.numel() - ] - - quantizer = model_param._quantizer - # When not using --fp8-param-gather, the main_param (fp32) is first cast to bf16/fp16, - # and then cast to fp8 during forward. - # Although it's not necessary when --fp8-param-gather is enabled, we still keep this - # logic to keep numerical consistency. So here cast the main_param to model_param.dtype. - main_param = main_param.to(model_param.dtype) - out = Float8Tensor( - shape=main_param.size(), - dtype=model_param.dtype, - requires_grad=False, - data=shard_model_param, - fp8_scale_inv=model_param._scale_inv, - fp8_dtype=model_param._fp8_dtype, - quantizer=quantizer, - ) - quantizer.update_quantized(main_param, out) - - amaxes = [] - scales = [] - scale_invs = [] - for model_param in model_params: - quantizer = model_param._quantizer - amaxes.append(quantizer.amax.view(1)) - scales.append(quantizer.scale.view(1)) - scale_invs.append(model_param._scale_inv.view(1)) - model_param._reset_caches() - - dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") - - # Update scaling factors. - packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) - packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] - _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) - torch.reciprocal(packed_scales, out=packed_scales) - _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) - - # Reduce amaxes. - # Note: Assume each param has a separate amax. - packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) - packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] - _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) - torch.distributed.all_reduce( - packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group - ) - _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) - -else: - # Fallback impl if TE version is invalid or TE is not installed. - def _modify_underlying_storage_impl(*args, **kwargs): - raise RuntimeError( - "Invalid Transformer Engine version for FP8 distributed optimizer, " - "please install Transformer Engine 2.0+ or install Megatron-Core" - ) - - def _quantize_param_shard_impl(*args, **kwargs): - raise RuntimeError( - "Invalid Transformer Engine version for FP8 distributed optimizer, " - "please install Transformer Engine 2.0+ or install Megatron-Core" - ) - - -def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch.Tensor): - """Replace the underlying raw data of a tensor with new data.""" - _modify_underlying_storage_impl(tensor, new_raw_data) - - -def quantize_param_shard( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params=None -): - """Cast shard fp32 main params to fp8 model params.""" - assert HAVE_TE, "Transformer Engine is required for quantizing parameters." - _quantize_param_shard_impl( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params - ) - - def _get_cuda_rng_state( device: Union[int, str, torch.device] = "cuda", clone: bool = False, graph_safe: bool = False ) -> torch.Tensor: diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index c157d062c53..b267c8a8170 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -745,6 +745,9 @@ def validate_args(args, defaults={}): assert args.ckpt_format == "fsdp_dtensor", \ "Megatron FSDP only supports fsdp_dtensor checkpoint format" + if args.use_megatron_fsdp: + args.reuse_grad_buf_for_mxfp8_param_ag = False + # Parameters dtype. args.params_dtype = torch.float if args.fp16: From a935008a5fa775e8bd5a03fb9081ddceeeaa0d13 Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Fri, 19 Dec 2025 12:35:45 +0800 Subject: [PATCH 199/248] [Dev] Feat(moe): Gated delta net context parallel (CP) (#2614) --- megatron/core/ssm/gated_delta_net.py | 303 +++++++++++++++--- .../core/transformer/transformer_config.py | 19 +- tests/unit_tests/ssm/test_gated_delta_net.py | 178 +--------- 3 files changed, 291 insertions(+), 209 deletions(-) diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py index dfa6e4c35e4..2b0a18b433b 100644 --- a/megatron/core/ssm/gated_delta_net.py +++ b/megatron/core/ssm/gated_delta_net.py @@ -21,6 +21,12 @@ from megatron.core.jit import jit_fuser from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.ssm.mamba_context_parallel import ( + _all_to_all_cp2hp, + _all_to_all_hp2cp, + _redo_attention_load_balancing, + _undo_attention_load_balancing, +) from megatron.core.tensor_parallel import get_cuda_rng_tracker from megatron.core.transformer import TransformerConfig from megatron.core.transformer.identity_op import IdentityOp @@ -33,9 +39,6 @@ ) from megatron.core.utils import deprecate_inference_params, nvtx_range_pop, nvtx_range_push -# TODO: Implement GatedDeltaNetContextParallel -# from .gated_delta_net_context_parallel import GatedDeltaNetContextParallel - try: from fla.modules.l2norm import l2norm from fla.ops.gated_delta_rule import chunk_gated_delta_rule @@ -84,6 +87,7 @@ def __init__( use_qk_l2norm: bool = True, A_init_range: Tuple[float, float] = (1, 16), pg_collection: ProcessGroupCollection = None, + **kwargs, ): """ Args: @@ -114,6 +118,7 @@ def __init__( self.use_qk_l2norm = use_qk_l2norm assert pg_collection is not None, "pg_collection must be provided for GatedDeltaNet" self.pg_collection = pg_collection + self.cp_size = self.pg_collection.cp.size() self.tp_size = self.pg_collection.tp.size() self.sp_size = self.tp_size if config.sequence_parallel else 1 @@ -129,6 +134,8 @@ def __init__( self.num_value_heads = config.linear_num_value_heads self.qk_dim = self.key_head_dim * self.num_key_heads self.v_dim = self.value_head_dim * self.num_value_heads + self.qk_dim_local_tp = self.qk_dim // self.tp_size + self.v_dim_local_tp = self.v_dim // self.tp_size # Input projection (hidden_states -> q, k, v, gate, beta, alpha) # TODO: for now, output gate is forced for GDN. @@ -217,8 +224,6 @@ def __init__( tp_group=self.pg_collection.tp, ) - # TODO: support CP - self.reset_parameters() def reset_parameters(self): @@ -247,17 +252,12 @@ def forward( self, hidden_states: Tensor, attention_mask: Tensor, - key_value_states: Optional[Tensor] = None, inference_context: Optional[BaseInferenceContext] = None, - rotary_pos_emb: Optional[Union[Tensor, Tuple[Tensor, Tensor]]] = None, - rotary_pos_cos: Optional[Tensor] = None, - rotary_pos_sin: Optional[Tensor] = None, - rotary_pos_cos_sin: Optional[Tensor] = None, - attention_bias: Optional[Tensor] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[int] = None, *, inference_params: Optional[BaseInferenceContext] = None, + **kwargs, ): """ Perform a forward pass through the GDN module. @@ -265,15 +265,8 @@ def forward( Args: hidden_states (Tensor): Hidden states. attention_mask (Tensor): Attention mask. - key_value_states (Optional[Tensor]): Key/value states (for cross attention). inference_context (Optional[BaseInferenceContext]): Inference context that manages KV cache. - rotary_pos_emb (Optional[Union[Tensor, Tuple[Tensor, Tensor]]]): Rotary - embedding tensor(s). - rotary_pos_cos (Optional[Tensor]): Rotary embedding cosine. - rotary_pos_sin (Optional[Tensor]): Rotary embedding sine. - rotary_pos_cos_sin (Optional[Tensor]): Combined rotary embedding cosine and sine. - attention_bias (Optional[Tensor]): Attention bias. packed_seq_params (Optional[PackedSeqparams]): Parameters used for THD format. sequence_len_offset (Optional[int]): Sequence length offset used for inference CUDA graphs. @@ -287,7 +280,7 @@ def forward( inference_context = deprecate_inference_params(inference_context, inference_params) seq_len, batch, _ = hidden_states.shape - seq_len = seq_len * self.sp_size + seq_len = seq_len * self.sp_size * self.cp_size if inference_context is not None: assert ( @@ -306,6 +299,22 @@ def forward( qkvzba, _ = self.in_proj(hidden_states) nvtx_range_pop(suffix="in_proj") + # CP All to All: CP to HP + qkvzba = tensor_a2a_cp2hp( + qkvzba, + seq_dim=0, + head_dim=-1, + cp_group=self.pg_collection.cp, + split_sections=[ + self.qk_dim_local_tp, + self.qk_dim_local_tp, + self.v_dim_local_tp, + self.v_dim_local_tp, + self.num_value_heads // self.tp_size, + self.num_value_heads // self.tp_size, + ], + ) + # Transpose: s b x --> b s x # From sbhd to bshd format qkvzba = qkvzba.transpose(0, 1) @@ -314,10 +323,10 @@ def forward( qkv, gate, beta, alpha = torch.split( qkvzba, [ - (self.qk_dim * 2 + self.v_dim) // self.tp_size, - self.v_dim // self.tp_size, - self.num_value_heads // self.tp_size, - self.num_value_heads // self.tp_size, + (self.qk_dim_local_tp * 2 + self.v_dim_local_tp) // self.cp_size, + self.v_dim_local_tp // self.cp_size, + self.num_value_heads // self.tp_size // self.cp_size, + self.num_value_heads // self.tp_size // self.cp_size, ], dim=-1, ) @@ -328,14 +337,44 @@ def forward( # Convolution on qkv qkv = qkv.transpose(1, 2).contiguous() # b, s, d -> b, d, s nvtx_range_push(suffix="conv1d") + qkv_channels_split_sections = [ + self.qk_dim_local_tp, + self.qk_dim_local_tp, + self.v_dim_local_tp, + ] + conv1d_weight = get_parameter_local_cp( + self.conv1d.weight, + dim=0, + cp_group=self.pg_collection.cp, + split_sections=qkv_channels_split_sections, + ) + conv1d_bias = ( + get_parameter_local_cp( + self.conv1d.bias, + dim=0, + cp_group=self.pg_collection.cp, + split_sections=qkv_channels_split_sections, + ) + if self.conv_bias + else None + ) if (causal_conv1d_fn is None) or self.config.deterministic_mode: - qkv = self.act_fn(self.conv1d(qkv)[..., :seq_len]) + conv_out = F.conv1d( + input=qkv, + weight=conv1d_weight, + bias=conv1d_bias, + stride=self.conv1d.stride, + padding=self.conv1d.padding, + dilation=self.conv1d.dilation, + groups=self.conv_dim_local_tp // self.cp_size, + ) + qkv = self.act_fn(conv_out[..., :seq_len]) else: assert self.activation in ["silu", "swish"] qkv = causal_conv1d_fn( x=qkv, - weight=self.conv1d.weight.squeeze(1), # d, 1, w -> d, w - bias=self.conv1d.bias, + weight=conv1d_weight.squeeze(1), # d, 1, w -> d, w + bias=conv1d_bias, activation=self.activation, ) nvtx_range_pop(suffix="conv1d") @@ -343,7 +382,11 @@ def forward( qkv = qkv.transpose(1, 2) # b, d, s -> b, s, d query, key, value = torch.split( qkv, - [self.qk_dim // self.tp_size, self.qk_dim // self.tp_size, self.v_dim // self.tp_size], + [ + self.qk_dim_local_tp // self.cp_size, + self.qk_dim_local_tp // self.cp_size, + self.v_dim_local_tp // self.cp_size, + ], dim=-1, ) query = query.reshape(batch, seq_len, -1, self.key_head_dim) @@ -367,7 +410,11 @@ def forward( # Calculate g and beta nvtx_range_push(suffix="g_and_beta") - g = -self.A_log.exp() * F.softplus(alpha.float() + self.dt_bias) # In fp32 + A_log_local_cp = get_parameter_local_cp(self.A_log, dim=0, cp_group=self.pg_collection.cp) + dt_bias_local_cp = get_parameter_local_cp( + self.dt_bias, dim=0, cp_group=self.pg_collection.cp + ) + g = -A_log_local_cp.exp() * F.softplus(alpha.float() + dt_bias_local_cp) # In fp32 beta = beta.sigmoid() nvtx_range_pop(suffix="g_and_beta") @@ -406,6 +453,11 @@ def forward( norm_out = norm_out.reshape(batch, seq_len, -1) norm_out = norm_out.transpose(0, 1).contiguous() + # CP all to all: HP to CP + norm_out = tensor_a2a_hp2cp( + norm_out, seq_dim=0, head_dim=-1, cp_group=self.pg_collection.cp + ) + # Output projection nvtx_range_push(suffix="out_proj") out, out_bias = self.out_proj(norm_out) @@ -479,10 +531,10 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_gr sharded_state_dict[f"{prefix}in_proj.weight"] = _split_tensor_factory( sharded_state_dict[f"{prefix}in_proj.weight"], [ - self.qk_dim // self.tp_size, - self.qk_dim // self.tp_size, - self.v_dim // self.tp_size, - self.v_dim // self.tp_size, + self.qk_dim_local_tp, + self.qk_dim_local_tp, + self.v_dim_local_tp, + self.v_dim_local_tp, self.num_value_heads // self.tp_size, self.num_value_heads // self.tp_size, ], @@ -502,11 +554,7 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_gr for conv_layer_name in conv_layer_name_list: sharded_state_dict[f"{prefix}{conv_layer_name}"] = _split_tensor_factory( sharded_state_dict[f"{prefix}{conv_layer_name}"], - [ - self.qk_dim // self.tp_size, - self.qk_dim // self.tp_size, - self.v_dim // self.tp_size, - ], + [self.qk_dim_local_tp, self.qk_dim_local_tp, self.v_dim_local_tp], ["query", "key", "value"], 0, ) @@ -514,6 +562,9 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_gr return sharded_state_dict +#################### +# Sharded state dict utilities +#################### def _split_tensor_factory( orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int ) -> ShardedTensorFactory: @@ -574,6 +625,184 @@ def sh_ten_merge_fn(sub_state_dict): ) +#################### +# Context parallel utilities +#################### +def get_parameter_local_cp( + param: torch.Tensor, + dim: int, + cp_group: torch.distributed.ProcessGroup, + split_sections: Optional[List[int]] = None, +) -> torch.Tensor: + """Get the local parameter for the current context parallel rank. + + Args: + param (torch.Tensor): The entire parameter to get the local parameter for. + dim (int): The dimension to split the parameter along. Usually the dimension of head. + cp_group (torch.distributed.ProcessGroup): The context parallel group. + split_sections (Optional[List[int]]): If not None, + first split the parameter along the dimension dim into sections, + then get the local hidden parallel weights separately, + finally concatenate the local hidden parallel weights along the dimension dim. + + Returns: + torch.Tensor: The local parameter for the current context parallel rank. + """ + + cp_size = cp_group.size() + cp_rank = cp_group.rank() + + # No need to split if CP size is 1. + if cp_size == 1: + return param + + # Split first if needed. + if split_sections is not None: + inputs = torch.split(param, split_sections, dim=dim) + outputs = [] + for p in inputs: + p = get_parameter_local_cp(p, dim, cp_group) + outputs.append(p) + return torch.cat(outputs, dim=dim) + + # Slice the parameter. + slices = [slice(None)] * param.dim() + dim_size = param.size(dim=dim) + slices[dim] = slice(cp_rank * dim_size // cp_size, (cp_rank + 1) * dim_size // cp_size) + param = param[slices] + return param + + +def tensor_a2a_cp2hp( + tensor: torch.Tensor, + seq_dim: int, + head_dim: int, + cp_group: torch.distributed.ProcessGroup, + split_sections: Optional[List[int]] = None, + undo_attention_load_balancing: bool = True, +): + """All-to-all context parallel to hidden parallel. + + Args: + tensor (torch.Tensor): The tensor to all-to-all. + Currently only support (seq_len, batch, head_dim) shaped tensor. + seq_dim (int): The dimension of sequence length. Currently only supports seq_dim == 0. + head_dim (int): The dimension of head. Currently only supports head_dim == -1 or 2. + cp_group (torch.distributed.ProcessGroup): The context parallel group. + split_sections (Optional[List[int]]): If not None, split the tensor along the dimension + head_dim into sections first, then do all-to-all for each section separately, + finally concatenate the separated tensors along the dimension head_dim. + undo_attention_load_balancing (bool): Whether to undo the attention load balancing of CP. + + Returns: + torch.Tensor: The all-to-all tensor. + """ + + cp_size = cp_group.size() + + # No need to all-to-all if CP size is 1. + if cp_size == 1: + return tensor + + # Limitations of mamba_context_parallel._all_to_all_cp2hp. + assert seq_dim == 0, f"tensor_a2a_cp2hp only supports seq_dim == 0 for now, but got {seq_dim=}" + assert ( + head_dim == -1 or head_dim == 2 + ), f"tensor_a2a_cp2hp only supports head_dim == -1 or 2 for now, but got {head_dim=}" + assert ( + tensor.dim() == 3 + ), f"tensor_a2a_cp2hp only supports 3-d input tensor for now, but got {tensor.dim()=}" + + # Split first if needed. + if split_sections is not None: + inputs = torch.split(tensor, split_sections, dim=head_dim) + outputs = [] + for x in inputs: + x = tensor_a2a_cp2hp( + x, + seq_dim=seq_dim, + head_dim=head_dim, + cp_group=cp_group, + undo_attention_load_balancing=False, + ) + outputs.append(x) + tensor = torch.cat(outputs, dim=head_dim) + else: + tensor = _all_to_all_cp2hp(tensor, cp_group) + + # Undo attention load balancing last if needed. + if undo_attention_load_balancing: + tensor = _undo_attention_load_balancing(tensor, cp_size) + return tensor + + +def tensor_a2a_hp2cp( + tensor: torch.Tensor, + seq_dim: int, + head_dim: int, + cp_group: torch.distributed.ProcessGroup, + split_sections: Optional[List[int]] = None, + redo_attention_load_balancing: bool = True, +): + """All-to-all hidden parallel to context parallel. + + Args: + tensor (torch.Tensor): The tensor to all-to-all. + Currently only support (seq_len, batch, head_dim) shaped tensor. + seq_dim (int): The dimension of sequence length. Currently only supports seq_dim == 0. + head_dim (int): The dimension of head. Currently only supports head_dim == -1 or 2. + cp_group (torch.distributed.ProcessGroup): The context parallel group. + split_sections (Optional[List[int]]): If not None, first split the tensor along the + dimension head_dim into sections, then do all-to-all for each section separately, + finally concatenate the separated tensors along the dimension head_dim. + redo_attention_load_balancing (bool): Whether to redo the attention load balancing of HP. + + Returns: + torch.Tensor: The all-to-all tensor. + """ + + cp_size = cp_group.size() + + # No need to all-to-all if CP size is 1. + if cp_size == 1: + return tensor + + # Limitations of mamba_context_parallel._all_to_all_hp2cp. + assert seq_dim == 0, f"tensor_a2a_cp2hp only supports seq_dim == 0 for now, but got {seq_dim=}" + assert ( + head_dim == -1 or head_dim == 2 + ), f"tensor_a2a_cp2hp only supports head_dim == -1 or 2 for now, but got {head_dim=}" + assert ( + tensor.dim() == 3 + ), f"tensor_a2a_cp2hp only supports 3-d input tensor for now, but got {tensor.dim()=}" + + # Redo attention load balancing first if needed. + if redo_attention_load_balancing: + tensor = _redo_attention_load_balancing(tensor, cp_size) + + # Split first if needed. + if split_sections is not None: + inputs = torch.split(tensor, split_sections, dim=head_dim) + outputs = [] + for x in inputs: + x = tensor_a2a_hp2cp( + x, + seq_dim=seq_dim, + head_dim=head_dim, + cp_group=cp_group, + redo_attention_load_balancing=False, + ) + outputs.append(x) + tensor = torch.cat(outputs, dim=head_dim) + else: + tensor = _all_to_all_hp2cp(tensor, cp_group) + + return tensor + + +#################### +# Torch native gated delta rule +#################### def torch_chunk_gated_delta_rule( query, key, diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index e2705bd9f51..6493a4bcce1 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -922,17 +922,14 @@ def __post_init__(self): ) # Check tensor parallelism compatibility - assert ( - self.linear_num_key_heads % self.tensor_model_parallel_size == 0 - ), "linear_num_key_heads must be a multiple of tensor_model_parallel_size." - assert ( - self.linear_num_value_heads % self.tensor_model_parallel_size == 0 - ), "linear_num_value_heads must be a multiple of tensor_model_parallel_size." - - # Do not support yet, but coming soon. - assert self.context_parallel_size == 1, ( - f"Gated delta net does not support context parallel for now," - f" but got {self.context_parallel_size=}." + tp_cp_size = self.tensor_model_parallel_size * self.context_parallel_size + assert self.linear_num_key_heads % tp_cp_size == 0, ( + f"{self.linear_num_key_heads=} must be a multiple of " + f"({self.tensor_model_parallel_size=} * {self.context_parallel_size=})." + ) + assert self.linear_num_value_heads % tp_cp_size == 0, ( + f"{self.linear_num_value_heads=} must be a multiple of " + f"({self.tensor_model_parallel_size=} * {self.context_parallel_size=})." ) elif self.experimental_attention_variant == "dsa": assert ( diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py index 89a185e3755..725d18fbc06 100644 --- a/tests/unit_tests/ssm/test_gated_delta_net.py +++ b/tests/unit_tests/ssm/test_gated_delta_net.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from functools import partial from unittest import mock @@ -28,6 +28,7 @@ init_checkpointing_mock_args, ) from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.transformer.test_attention import _test_parallel_attention_correctness try: import fla @@ -39,12 +40,7 @@ @pytest.mark.parametrize( ("tp_size", "sp", "cp_size"), - [ - (1, False, 1), - (2, False, 1), - (2, True, 1), - # GDN does not support CP for now. Leave it for future work. - ], + [(1, False, 1), (2, False, 1), (2, True, 1), (1, False, 2), (2, False, 2), (2, True, 2)], ) @pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.") @pytest.mark.internal @@ -142,50 +138,13 @@ def test_gpu_forward(self): [ (4, False, 1), # TP w/o SP (4, True, 1), # TP w/ SP - # CP does not support GDN for now. Add it once it is supported. + (1, False, 2), # CP + (2, False, 2), # TP w/o SP + CP + (2, True, 2), # TP w/ SP + CP ], ) @pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.") def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp): - # Constants - seed = 123 - sequence_length = 256 - micro_batch_size = 4 - hidden_size = 128 - normalization = "RMSNorm" - - # Model initialization function - def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=None): - layer_spec = get_gpt_layer_with_transformer_engine_spec( - experimental_attention_variant="gated_delta_net", normalization=normalization - ) - gpt_model = GPTModel( - config=config, - transformer_layer_spec=layer_spec, - vocab_size=128, - max_sequence_length=sequence_length, - pre_process=pre_process, - post_process=post_process, - vp_stage=vp_stage, - ) - return gpt_model - - # Initialize baseline parallel state - Utils.initialize_model_parallel( - tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1 - ) - - # Initialize input hidden states - torch.manual_seed(seed) - model_parallel_cuda_manual_seed(seed) - input_hidden_states = ( - torch.rand((sequence_length, micro_batch_size, hidden_size)) - .cuda() - .bfloat16() - .requires_grad_(True) - ) - - # Initialize transformer config transformer_config = TransformerConfig( hidden_size=128, linear_conv_kernel_dim=2, @@ -194,7 +153,7 @@ def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=N linear_num_key_heads=4, linear_num_value_heads=8, num_layers=1, - normalization=normalization, + normalization="RMSNorm", use_cpu_initialization=True, layernorm_zero_centered_gamma=True, num_attention_heads=8, @@ -202,118 +161,15 @@ def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=N bf16=True, ) - with TempNamedDir(tmp_path_dist_ckpt / 'test_parallel_gdn', sync=True) as ckpt_dir: - # Set argument - mock_args = parse_args(ignore_unknown_args=True) - set_args(mock_args) - - # Initialize baseline model - init_basic_mock_args(mock_args, 1, 1, bf16=True) - mock_args.context_parallel_size = 1 - mock_args.sequence_parallel = 1 - gpt_model = unwrap_model( - get_model(partial(initialize_gpt_model, config=transformer_config)) - ) - - # Initialize args and save checkpoint - init_checkpointing_mock_args(mock_args, ckpt_dir, False) - mock_args.no_save_optim = True - mock_args.no_save_rng = True - mock_args.no_load_optim = True - mock_args.no_load_rng = True - save_checkpoint(10, gpt_model, None, None, 0) - - # Calculate baseline output - attention = gpt_model[0].decoder.layers[0].self_attention - output_hidden_states_baseline, bias_hidden_states_baseline = attention( - input_hidden_states, attention_mask=None - ) - output_hidden_states_baseline.sum().backward() - - # Save baseline output - input_grad_baseline = input_hidden_states.grad.detach() - output_hidden_states_baseline = output_hidden_states_baseline.detach() - - # Initialize parallel model - Utils.destroy_model_parallel() - Utils.initialize_model_parallel( - tensor_model_parallel_size=tp, pipeline_model_parallel_size=1, context_parallel_size=cp - ) - torch.manual_seed(seed) - model_parallel_cuda_manual_seed(seed) - transformer_config.context_parallel_size = cp - transformer_config.tensor_model_parallel_size = tp - transformer_config.sequence_parallel = sp - init_basic_mock_args(mock_args, tp, 1, bf16=True) - mock_args.context_parallel_size = cp - mock_args.sequence_parallel = sp - gpt_model = unwrap_model( - get_model(partial(initialize_gpt_model, config=transformer_config)) - ) - with mock.patch('megatron.training.checkpointing.check_checkpoint_args'): - with mock.patch('megatron.training.checkpointing.update_num_microbatches'): - load_checkpoint(gpt_model, None, None) - - # Function to get tensor on this tp and cp rank - cp_group = parallel_state.get_context_parallel_group() - tp_rank = parallel_state.get_tensor_model_parallel_rank() - - def get_tensor_on_this_rank(tensor): - if cp > 1: - tensor = get_tensor_on_this_cp_rank(tensor, 0, cp_group) - if tp > 1 and sp: - sp_seg = sequence_length // tp // cp - tensor = tensor[tp_rank * sp_seg : (tp_rank + 1) * sp_seg] - return tensor - - # Calculate parallel model output - input_hidden_states = get_tensor_on_this_rank(input_hidden_states) - input_hidden_states = input_hidden_states.detach().requires_grad_(True) - parallel_attention = gpt_model[0].decoder.layers[0].self_attention - output_hidden_states_parallel, bias_hidden_states_parallel = parallel_attention( - input_hidden_states, attention_mask=None - ) - output_hidden_states_parallel.sum().backward() - input_grad_parallel = input_hidden_states.grad.detach() - - # Check if the output is the same - if cp: - atol, rtol = 5e-3, 5e-3 - else: - atol, rtol = 5e-4, 5e-4 - output_hidden_states_baseline = get_tensor_on_this_rank(output_hidden_states_baseline) - input_grad_baseline = get_tensor_on_this_rank(input_grad_baseline) - - assert torch.all( - ~torch.isnan(output_hidden_states_baseline) - ), "output_hidden_states_baseline contains nan" - assert torch.all( - ~torch.isinf(output_hidden_states_baseline) - ), "output_hidden_states_baseline contains inf" - assert torch.all(~torch.isnan(input_grad_baseline)), "input_grad_baseline contains nan" - assert torch.all(~torch.isinf(input_grad_baseline)), "input_grad_baseline contains inf" - assert torch.all( - ~torch.isnan(output_hidden_states_parallel) - ), "output_hidden_states_parallel contains nan" - assert torch.all( - ~torch.isinf(output_hidden_states_parallel) - ), "output_hidden_states_parallel contains inf" - assert torch.all(~torch.isnan(input_grad_parallel)), "input_grad_parallel contains nan" - assert torch.all(~torch.isinf(input_grad_parallel)), "input_grad_parallel contains inf" + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + experimental_attention_variant="gated_delta_net", normalization="RMSNorm" + ) - torch.testing.assert_close( - output_hidden_states_baseline, - output_hidden_states_parallel, - atol=atol, - rtol=rtol, - msg=lambda msg: f"Mismatch in output_hidden_states: {msg}", - ) - torch.testing.assert_close( - input_grad_baseline, - input_grad_parallel, - atol=atol, - rtol=rtol, - msg=lambda msg: f"Mismatch in input_grad: {msg}", - ) + if cp: + atol, rtol = 5e-3, 5e-3 + else: + atol, rtol = 5e-4, 5e-4 - Utils.destroy_model_parallel() + _test_parallel_attention_correctness( + transformer_config, transformer_layer_spec, tmp_path_dist_ckpt, tp, sp, cp + ) From fd932c9df547ec9364b6edcc58983f8ddfedea64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 19 Dec 2025 15:33:49 +0100 Subject: [PATCH 200/248] ci: Gridify test configs (#2707) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig Co-authored-by: Dennis Liu --- .../test_cases/ci_base_config.yml | 14 +++ .../golden_values_dev_dgx_h100.json | 0 .../golden_values_dev_dgx_h100.json | 0 .../moe2.0/model_configs/dsv3_proxy.yaml | 85 ++++++++++++++++ .../moe2.0/model_configs/qwen3_proxy.yaml | 74 ++++++++++++++ .../moe2.0/runtime_configs/tp1pp1ep8.yaml | 41 ++++++++ .../moe2.0/runtime_configs/tp2pp2ep4.yaml | 55 +++++++++++ .../test_utils/python_scripts/merge_config.py | 92 ++++++++++++++++++ .../python_scripts/recipe_parser.py | 41 ++++++-- tests/test_utils/recipes/moe2.0.yaml | 97 +++++++++++++++++++ 10 files changed, 491 insertions(+), 8 deletions(-) create mode 100644 tests/functional_tests/test_cases/ci_base_config.yml create mode 100644 tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp1pp1ep8/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp2pp2ep4/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml create mode 100644 tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml create mode 100644 tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml create mode 100644 tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml create mode 100644 tests/test_utils/python_scripts/merge_config.py create mode 100644 tests/test_utils/recipes/moe2.0.yaml diff --git a/tests/functional_tests/test_cases/ci_base_config.yml b/tests/functional_tests/test_cases/ci_base_config.yml new file mode 100644 index 00000000000..739f343da9d --- /dev/null +++ b/tests/functional_tests/test_cases/ci_base_config.yml @@ -0,0 +1,14 @@ +MODEL_ARGS: + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} diff --git a/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp1pp1ep8/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp1pp1ep8/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp2pp2ep4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp2pp2ep4/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml b/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml new file mode 100644 index 00000000000..70924aed0cc --- /dev/null +++ b/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml @@ -0,0 +1,85 @@ +MODEL_ARGS: + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt + --split: 949,50,1 + # Add transformer base args + --num-layers: 16 + --hidden-size: 1024 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --disable-bias-linear: true + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 3232 + --untie-embeddings-and-output-weights: true + # Add attention related args + --multi-latent-attention: true + --num-attention-heads: 32 + --kv-channels: 128 + --qk-layernorm: true + --position-embedding-type: rope + --rotary-base: 10000 + --q-lora-rank: 1536 + --kv-lora-rank: 512 + --qk-head-dim: 128 + --qk-pos-emb-head-dim: 64 + --v-head-dim: 128 + --rotary-scaling-factor: 40 + --mscale: 1.0 + --mscale-all-dim: 1.0 + # Add MLP related args + --swiglu: true + --ffn-hidden-size: 4096 + # Add MoE args + --num-experts: 32 + --moe-layer-freq: ([0]*1+[1]*15) + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: seq_aux_loss + --moe-router-topk: 4 + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + # Comment out the following MTP args to disable MTP + --mtp-num-layers: 1 + --mtp-loss-scaling-factor: 0.1 + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add initialization args + --init-method-std: 0.02 + # Training args + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + +METRICS: + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "mtp_1 loss" + - "seq_load_balancing_loss" diff --git a/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml b/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml new file mode 100644 index 00000000000..46e298ec971 --- /dev/null +++ b/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml @@ -0,0 +1,74 @@ +MODEL_ARGS: + # Data args + --seq-length: 4096 + --data-cache-path: ${DATA_CACHE_PATH} + --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json + --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt + --split: 949,50,1 + # Add transformer base args + --num-layers: 16 + --hidden-size: 1024 + --normalization: RMSNorm + --norm-epsilon: 1e-6 + --disable-bias-linear: true + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 3232 + --untie-embeddings-and-output-weights: true + # Add attention related args + --group-query-attention: true + --num-query-groups: 4 + --kv-channels: 128 + --qk-layernorm: true + --position-embedding-type: rope + --rotary-percent: 1.0 + --rotary-base: 1000000 + # Add MLP related args + --swiglu: true + --ffn-hidden-size: 4096 + # Add MoE args + --num-experts: 32 + --moe-layer-freq: ([0]*1+[1]*15) + --moe-ffn-hidden-size: 1024 + --moe-shared-expert-intermediate-size: 1024 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 4 + --moe-router-pre-softmax: true + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-4 + --moe-router-group-topk: 2 + --moe-router-num-groups: 4 + --moe-router-topk-scaling-factor: 2.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-bias-update-rate: 1e-3 + --moe-router-dtype: fp32 + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + # Add learning rate args + --lr-warmup-fraction: .01 + --lr: 0.00015 + --min-lr: 1.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + # Add initialization args + --init-method-std: 0.02 + # Training args + --global-batch-size: 32 + --train-iters: 50 + --exit-duration-in-mins: 230 + --no-check-for-nan-in-loss-and-grad: true + +METRICS: + - "lm loss" + - "num-zeros" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" + - "load_balancing_loss" diff --git a/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml new file mode 100644 index 00000000000..305e2847305 --- /dev/null +++ b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml @@ -0,0 +1,41 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION + +MODEL_ARGS: + # Transformer Engine args + --transformer-impl: transformer_engine + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 8 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + --use-mcore-models: true + --sequence-parallel: true + --micro-batch-size: 4 + # MoE training related args + --moe-token-dispatcher-type: alltoall + --moe-permute-fusion: true + --save-interval: 25 + # Add mixed precision args + --bf16: true + --exit-interval: 50 + # kernel fusion related args + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + # MISC + --manual-gc: true + --manual-gc-interval: 100 +TEST_TYPE: resume-ckpt diff --git a/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml new file mode 100644 index 00000000000..b93862aff8c --- /dev/null +++ b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + PYTHONWARNINGS: ignore + NCCL_DEBUG: VERSION + +MODEL_ARGS: + # Transformer Engine args + --transformer-impl: transformer_engine + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --num-virtual-stages-per-pipeline-rank: 4 + --expert-model-parallel-size: 4 + --context-parallel-size: 1 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN + --attention-backend: unfused # TODO: switch back to fused attention after fix + --use-mcore-models: true + --sequence-parallel: true + --micro-batch-size: 4 + # MoE training related args + --moe-token-dispatcher-type: alltoall + --moe-permute-fusion: true + # Add checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --save-interval: 25 + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --logging-level: 40 + --tensorboard-dir: ${TENSORBOARD_PATH} + # Add mixed precision args + --bf16: true + --exit-interval: 50 + # kernel fusion related args + --no-rope-fusion: true + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + # MISC + --manual-gc: true + --manual-gc-interval: 100 +TEST_TYPE: resume-ckpt \ No newline at end of file diff --git a/tests/test_utils/python_scripts/merge_config.py b/tests/test_utils/python_scripts/merge_config.py new file mode 100644 index 00000000000..176706038b7 --- /dev/null +++ b/tests/test_utils/python_scripts/merge_config.py @@ -0,0 +1,92 @@ +""" +Merges base_config, runtime_config and model_config into one final config that the CI can launch. + +Starting Dec 19th 2025 MCore CI supports a new format of defining tests. We are decoupling the test +config into a modular system of base_config, model_config and runtime_config. This allows us to +re-use and parametrize a given model easily with multiple runtime configs, like parallelism settings. + +With this DRY principle, we simplify test maintenance and reduce the amount of code duplication. + +This refactoring is fully compliant with the original CI system as we merge the three configs into one +final config that the CI can launch. + +Precendence: Base config > Model config > Runtime config. + +Usage: + +python merge_config.py \ + --model_config model_config.yaml \ + --base_config base_config.yaml \ + --runtime_config runtime_config.yaml \ + --output_config output_config.yaml +""" + +import logging + +import click +import yaml + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +@click.command() +@click.option("--model_config", type=str, help="Model config to merge") +@click.option("--base_config", type=str, help="Base config to merge") +@click.option("--runtime_config", type=str, help="Run time config to merge") +@click.option("--output_config", type=str, help="Output config to merge") +def main(model_config, base_config, runtime_config, output_config): + + with open(model_config, "r") as f: + model_config = yaml.safe_load(f) + with open(base_config, "r") as f: + base_config = yaml.safe_load(f) + with open(runtime_config, "r") as f: + runtime_config = yaml.safe_load(f) + + config = {} + + # Collect all top-level keys (ENV_VARS, MODEL_ARGS, etc.) + all_keys = set(base_config.keys()) | set(model_config.keys()) | set(runtime_config.keys()) + + for key in all_keys: + base_val = base_config.get(key) + model_val = model_config.get(key) + runtime_val = runtime_config.get(key) + + # Get first non-None value to check type + first_val = base_val or model_val or runtime_val + + if isinstance(first_val, dict): + # Merge dicts + config[key] = {} + for val in [base_val, model_val, runtime_val]: + if val: + config[key].update(val) + elif isinstance(first_val, list): + # Concatenate lists (deduplicate while preserving order) + config[key] = [] + seen = set() + for val in [base_val, model_val, runtime_val]: + if val: + for item in val: + if item not in seen: + config[key].append(item) + seen.add(item) + else: + # Scalar value (string, int, bool, etc.) - use last defined + if runtime_val is not None: + config[key] = runtime_val + elif model_val is not None: + config[key] = model_val + else: + config[key] = base_val + + with open(output_config, "w") as f: + yaml.dump(config, f) + + logger.info(f"Config merged and saved to {output_config}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index e26d04d6f20..a497bdbd9de 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -48,14 +48,34 @@ def resolve_artifact_config(cluster: str) -> str: def flatten_products(workload_manifest: dotdict) -> dotdict: """Flattens a nested dict of products""" - workload_manifest.products = [ - dict(**dict(zip(inp.keys(), values)), **{"test_case": product["test_case"][0]}) - for product in (workload_manifest.products or []) - if "products" in product - for inp in product["products"] - for values in itertools.product(*inp.values()) - ] - + expanded_products = [] + + for product in workload_manifest.products or []: + # Skip products that don't have nested product specifications + if "products" not in product: + continue + + test_case = product["test_case"][0] + + # Iterate over each input specification in the product + for inp in product["products"]: + # Generate all combinations of the input values (Cartesian product) + model_config = inp.pop("model_config", None) + runtime_config = inp.pop("runtime_config", None) + keys = inp.keys() + value_combinations = itertools.product(*inp.values()) + + # Create a flattened product dict for each combination + for values in value_combinations: + product_dict = dict(zip(keys, values)) + product_dict["test_case"] = test_case + if model_config: + product_dict["model_config"] = model_config + if runtime_config: + product_dict["runtime_config"] = runtime_config + expanded_products.append(product_dict) + + workload_manifest.products = expanded_products return workload_manifest @@ -98,11 +118,16 @@ def load_and_flatten(config_path: str) -> List[dotdict]: def filter_by_test_case(workload_manifests: List[dotdict], test_case: str) -> Optional[dotdict]: """Returns a workload with matching name. Raises an error if there no or more than a single workload.""" + print(len(workload_manifests)) workload_manifests = list( workload_manifest for workload_manifest in workload_manifests if workload_manifest["spec"]["test_case"] == test_case ) + print(len(workload_manifests)) + + for w in workload_manifests: + print(w["spec"]["test_case"]) if len(workload_manifests) > 1: logger.info("Duplicate test_case found!") diff --git a/tests/test_utils/recipes/moe2.0.yaml b/tests/test_utils/recipes/moe2.0.yaml new file mode 100644 index 00000000000..e3249dd6ad1 --- /dev/null +++ b/tests/test_utils/recipes/moe2.0.yaml @@ -0,0 +1,97 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: '{test_case}_{environment}_{platforms}' + model: moe2.0 + build: mcore-pyt-{environment} + nodes: 1 + gpus: 8 + n_repeat: 5 + platforms: dgx_a100 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + ls + cd /opt/megatron-lm + + NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') + + mkdir -p $(dirname ./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml) + python ./tests/test_utils/python_scripts/merge_config.py \ + --base_config ./tests/functional_tests/test_cases/ci_base_config.yml \ + --model_config ./tests/functional_tests/test_cases/{model}/model_configs/{model_config}.yaml \ + --runtime_config ./tests/functional_tests/test_cases/{model}/runtime_configs/{runtime_config}.yaml \ + --output_config ./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml + + ARGUMENTS=( + "DATA_PATH=/mnt/artifacts" + "DATA_CACHE_PATH=/workspace/data/cache" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" + "TRAINING_SCRIPT_PATH=pretrain_gpt.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "N_REPEAT={n_repeat}" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" + "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + ########################### + # Merge train tests # + ########################### + - test_case: [dsv3_tp1pp1ep8] + products: + - model_config: dsv3_proxy + runtime_config: tp1pp1ep8 + environment: [dev] + scope: [broken] + platforms: [dgx_h100] + - test_case: [dsv3_tp2pp2ep4] + products: + - model_config: dsv3_proxy + runtime_config: tp2pp2ep4 + environment: [dev] + scope: [broken] + platforms: [dgx_h100] + - test_case: [qwen3_tp1pp1ep1] + products: + - model_config: qwen3_proxy + runtime_config: tp1pp1ep1 + environment: [dev] + scope: [broken] + platforms: [dgx_h100] + - test_case: [qwen3_tp2pp2ep4] + products: + - model_config: qwen3_proxy + runtime_config: tp2pp2ep4 + environment: [dev] + scope: [broken] + platforms: [dgx_h100] From 2b1fc70891cd1b45b6a02a588430253a78bdb4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 22 Dec 2025 08:49:00 +0000 Subject: [PATCH 201/248] Revert "[dev] Add assertion for mxfp8 params without dp overlap (#2270)" This reverts commit 7968d5f98f8457297d4a73f96d8a086d84a8fa67. --- .../core/distributed/distributed_data_parallel_config.py | 8 -------- .../src/megatron_fsdp/distributed_data_parallel_config.py | 8 -------- 2 files changed, 16 deletions(-) diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py index e2a026d836f..3f97beab825 100644 --- a/megatron/core/distributed/distributed_data_parallel_config.py +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -146,14 +146,6 @@ def __post_init__(self): """Check the validity of the config.""" if self.reuse_grad_buf_for_mxfp8_param_ag: assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8." - # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN. - # TODO: Remove this assertion when the issue is fixed. - assert ( - self.overlap_param_gather - ), "--overlap-param-gather is required when using mxfp8 params" - assert ( - self.overlap_grad_reduce - ), "--overlap-grad-reduce is required when using mxfp8 params" if self.nccl_ub: if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','): diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py index 5151ecabfb5..86826758498 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py @@ -137,14 +137,6 @@ def __post_init__(self): """Check the validity of the config.""" if self.reuse_grad_buf_for_mxfp8_param_ag: assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8." - # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN. - # TODO: Remove this assertion when the issue is fixed. - assert ( - self.overlap_param_gather - ), "--overlap-param-gather is required when using mxfp8 params" - assert ( - self.overlap_grad_reduce - ), "--overlap-grad-reduce is required when using mxfp8 params" if self.nccl_ub: if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','): From 4665be4dec0cd26f32e91d7fc4e1be4f1ea2132d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 22 Dec 2025 11:18:06 +0100 Subject: [PATCH 202/248] Revert "[Dev] Use the latest Hybrid-EP (#2424)" (#2732) --- docker/Dockerfile.ci.dev | 2 +- megatron/core/transformer/moe/fused_a2a.py | 51 ++++++++++++++----- .../core/transformer/moe/token_dispatcher.py | 15 ++++-- 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 5caa6003630..482c6af460c 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -62,7 +62,7 @@ RUN bash -ex <<"EOF" git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git pushd DeepEP - git checkout 83e0d156807f31abed4ea55c2fa6eb4b62a11b82 + git checkout 1dddd194c26911c35b4f53a148617dd73de0ffc9 patch -p1 < /workspace/deepep.patch popd TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index aa13b9b5b5b..045a93039b3 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -3,7 +3,6 @@ # Copyright (c) 2025 DeepSeek # Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE -from megatron.core.utils import internal_api try: from deep_ep import Buffer @@ -329,7 +328,6 @@ def reset_hybrid_ep_buffer(): _hybrid_ep_buffer = None -@internal_api class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend @@ -345,6 +343,7 @@ def forward( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, + num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -363,9 +362,11 @@ def forward( num_sms_combine_api, fp8_dispatch, ) - # If we provide the num_permuted_tokens, we do not need to use sync to - # wait for the data in pinned memory ready - non_blocking = num_permuted_tokens is not None + # Defaultly, the output token_per_expert and num_dispatched_tokens_tensor + # will be put on the CPU to avoid the potential sync in combine/backward pass, + # but if we provide the num_dispatched_tokens and num_permuted_tokens on CPU, + # we do not need to the D2H here. + use_host_meta = num_dispatched_tokens is None or num_permuted_tokens is None # Process the dispatch ( dispatched_hidden, @@ -380,12 +381,14 @@ def forward( scaling_factor=None, num_of_experts_per_rank=num_local_experts, pad_multiple=pad_multiple, + num_dispatched_tokens=num_dispatched_tokens, num_permuted_tokens=num_permuted_tokens, - non_blocking=non_blocking, + use_host_meta=use_host_meta, ) ctx.handle = handle ctx.pad_multiple = pad_multiple + ctx.num_dispatched_tokens = num_dispatched_tokens return ( dispatched_hidden, dispatched_probs, @@ -401,27 +404,36 @@ def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_exper ''' handle = ctx.handle combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute( - hidden=grad_x, probs=grad_probs, handle=handle, pad_multiple=ctx.pad_multiple + hidden=grad_x, + probs=grad_probs, + handle=handle, + pad_multiple=ctx.pad_multiple, + num_dispatched_tokens=ctx.num_dispatched_tokens, ) return combined_hidden, None, combined_probs, None, None, None, None, None, None, None -@internal_api class HybridEPCombine(torch.autograd.Function): ''' Fused combine operation for permute + combine a2a + permute using the HybridEP backend ''' @staticmethod - def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None): + def forward( + ctx, x, handle, num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None + ): ''' Forward pass of fused combine of the HybridEP backend ''' combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute( - hidden=x, handle=handle, pad_multiple=pad_multiple + hidden=x, + handle=handle, + pad_multiple=pad_multiple, + num_dispatched_tokens=num_dispatched_tokens, ) ctx.handle = handle ctx.pad_multiple = pad_multiple + ctx.num_dispatched_tokens = num_dispatched_tokens ctx.num_permuted_tokens = num_permuted_tokens return combined_hidden @@ -436,6 +448,7 @@ def backward(ctx, grad_x): scaling_factor=None, handle=handle, pad_multiple=ctx.pad_multiple, + num_dispatched_tokens=ctx.num_dispatched_tokens, num_permuted_tokens=ctx.num_permuted_tokens, ) return dispatched_hidden, None, None, None, None @@ -443,7 +456,6 @@ def backward(ctx, grad_x): if HAVE_HYBRIDEP: - @internal_api def hybrid_ep_dispatch( x, routing_map, @@ -452,6 +464,7 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, + num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -474,6 +487,10 @@ def hybrid_ep_dispatch( Number of SMs used by the dispatch API. num_sms_combine_api (int): Number of SMs used by the combine API. + num_dispatched_tokens (int): + Number of tokens after dispatch but before permute. HybridEP uses this + to allocate buffers. If not provided, HybridEP obtains the size from + a GPU tensor, which causes a D2H synchronization. num_permuted_tokens (int): Number of tokens after permute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, @@ -490,12 +507,12 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api, num_sms_combine_api, + num_dispatched_tokens, num_permuted_tokens, pad_multiple, ) - @internal_api - def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): + def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple): ''' Perform fused combine operation for unpermute + combine a2a + unpermute using the HybridEP backend @@ -505,6 +522,10 @@ def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): Input hidden states to combine handle (EventHandle): Communication handle from dispatch operation + num_dispatched_tokens (int): + The number of tokens after unpermute but before combine. HybridEP uses this + to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, + which causes a D2H synchronization. num_permuted_tokens (int): The number of tokens before unpermute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, which causes a D2H synchronization. @@ -512,7 +533,9 @@ def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): The alignment multiple required for FP8 GEMM. If not provided, no padding is performed. ''' - return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple) + return HybridEPCombine.apply( + x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple + ) else: hybrid_ep_dispatch = None diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index d0da38d6322..61ef0b5f084 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -985,8 +985,11 @@ def __init__( if self.drop_and_pad: assert self.capacity_factor is not None self.capacity = None - # Actually the the up-bound for the number of tokens - # after permute op, None means no up-bound, will cause a CPU sync + # The up-bound for the number of tokens after dispatch op, -1 means no up-bound, + # which will cause a CPU sync + self.num_dispatched_tokens = None + # Actually the sum of tokens_per_expert, the up-bound for the number of tokens + # after permute op, -1 means no up-bound, will cause a CPU sync self.num_permuted_tokens = None # Metadata @@ -1015,9 +1018,12 @@ def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): num_experts=self.num_experts, capacity_factor=self.capacity_factor, ) + # We cannot predict the actual number of tokens after the dispatch op, + # so we set it to the worst case in drop_and_pad mode + self.num_dispatched_tokens = self.capacity * self.group.size() * self.num_local_experts # In drop_and_pad mode, the number of tokens after the permute op # can be computed on the CPU - self.num_permuted_tokens = self.capacity * self.group.size() * self.num_local_experts + self.num_permuted_tokens = self.num_dispatched_tokens self.tokens_per_expert = torch.full( (self.num_local_experts,), self.capacity * self.group.size(), dtype=torch.long ) @@ -1046,6 +1052,7 @@ def dispatch( num_local_experts=self.num_local_experts, num_sms_dispatch_api=self.config.moe_hybridep_num_sms, num_sms_combine_api=self.config.moe_hybridep_num_sms, + num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1067,6 +1074,7 @@ def combine( hidden_states = hybrid_ep_combine( x=hidden_states, handle=self.handle, + num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1076,6 +1084,7 @@ def combine( self.handle = None if not self.drop_and_pad: self.num_permuted_tokens = None + self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: From 46b550591ad4765a447980ff0ca615929cf8fb78 Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:15:53 +0800 Subject: [PATCH 203/248] [Dev] Fix ep overlap missing final layernorm (#2691) --- megatron/core/models/gpt/fine_grained_callables.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 60094976a9a..741a25326fb 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -170,11 +170,16 @@ def forward_impl(self, hidden_states): Returns: The logits or loss depending on whether labels are provided. - - Note: - Final layernorm now has been moved from the post-process stage to the - last decoder layer, so we don't need to run the final layer norm here. """ + + empty_decoder = len(self.gpt_model.decoder.layers) == 0 + layer_norm = self.gpt_model.decoder.final_layernorm + if not self.gpt_model.config.mtp_num_layers and empty_decoder and layer_norm: + hidden_states = layer_norm(hidden_states) + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True + ) + # Run GPTModel._postprocess loss = self.gpt_model._postprocess( hidden_states=hidden_states, From 0b6714ec87ec256aca0bc9400985247d26f98ef0 Mon Sep 17 00:00:00 2001 From: HaochenYuan <106647990+HaochenYuan@users.noreply.github.com> Date: Wed, 24 Dec 2025 10:34:10 +0800 Subject: [PATCH 204/248] [Dev] Remove calculation of padding token in moe routing loss (#2121) Co-authored-by: Li Tao --- .../core/extensions/transformer_engine.py | 2 +- .../common/model_chunk_schedule_plan.py | 2 + .../core/models/gpt/fine_grained_callables.py | 21 +- megatron/core/models/gpt/gpt_model.py | 37 +++- megatron/core/transformer/mlp.py | 2 +- megatron/core/transformer/moe/moe_layer.py | 27 ++- megatron/core/transformer/moe/moe_utils.py | 83 ++++++-- megatron/core/transformer/moe/router.py | 167 ++++++++++++---- .../core/transformer/transformer_block.py | 15 +- .../core/transformer/transformer_layer.py | 23 ++- .../python_scripts/recipe_parser.py | 1 + .../a2a_overlap/test_schedule_chunk_1f1b.py | 116 ++++++++++- .../a2a_overlap/test_schedule_layer_1f1b.py | 4 +- .../transformer/moe/test_aux_loss.py | 189 ++++++++++++++++++ .../transformer/moe/test_routers.py | 47 +++++ 15 files changed, 646 insertions(+), 90 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index acb93ef7853..546f8a59318 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1851,7 +1851,7 @@ def forward_post_hook(module, *_) -> None: "TEFusedMLP module does not support submodules with post-backward hooks" ) - def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]: + def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]: """Forward.""" # Construct fused impl if needed diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 486a498dd73..07bab1cb486 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -305,6 +305,7 @@ def __init__( extra_block_kwargs=None, runtime_gather_output: Optional[bool] = None, loss_mask: Optional[Tensor] = None, + padding_mask=None, ): """Initialize the schedule plan of all Transformer layers' sub-modules. @@ -347,6 +348,7 @@ def __init__( self._model_chunk_state.mtp_hidden_states = None self._model_chunk_state.loss_mask = loss_mask self._model_chunk_state.packed_seq_params = packed_seq_params + self._model_chunk_state.padding_mask = padding_mask self._model_chunk_state.extra_block_kwargs = extra_block_kwargs self._model_chunk_state.runtime_gather_output = runtime_gather_output self._model_chunk_state.model = model diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 741a25326fb..b0923a37b80 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -120,13 +120,19 @@ def forward_impl(self): if not self.gpt_model.pre_process: self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor # Run GPTModel._preprocess - decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = ( - self.gpt_model._preprocess( - input_ids=self.chunk_state.input_ids, - position_ids=self.chunk_state.position_ids, - decoder_input=self.chunk_state.decoder_input, - packed_seq_params=self.chunk_state.packed_seq_params, - ) + ( + decoder_input, + rotary_pos_emb, + rotary_pos_cos, + rotary_pos_sin, + sequence_len_offset, + padding_mask, + ) = self.gpt_model._preprocess( + input_ids=self.chunk_state.input_ids, + position_ids=self.chunk_state.position_ids, + decoder_input=self.chunk_state.decoder_input, + packed_seq_params=self.chunk_state.packed_seq_params, + padding_mask=self.chunk_state.padding_mask, ) # Saved for later use @@ -135,6 +141,7 @@ def forward_impl(self): self.chunk_state.rotary_pos_cos = rotary_pos_cos self.chunk_state.rotary_pos_sin = rotary_pos_sin self.chunk_state.sequence_len_offset = sequence_len_offset + self.chunk_state.padding_mask = padding_mask return decoder_input diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index a1230568cbd..9e70c677226 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -284,6 +284,7 @@ def _preprocess( decoder_input: Tensor = None, inference_context: BaseInferenceContext = None, packed_seq_params: PackedSeqParams = None, + padding_mask: Optional[Tensor] = None, ): """Preprocesses inputs for the transformer decoder. @@ -300,7 +301,20 @@ def _preprocess( if decoder_input is not None: pass elif self.pre_process: + if padding_mask is not None: + assert padding_mask.shape == input_ids.shape, ( + f"padding_mask shape {padding_mask.shape} does not match " + f"input_ids shape {input_ids.shape}" + ) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + if padding_mask is not None and self.config.sequence_parallel: + padding_mask = ( + tensor_parallel.scatter_to_sequence_parallel_region( + padding_mask.transpose(0, 1).contiguous() + ) + .transpose(0, 1) + .contiguous() + ) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor @@ -403,6 +417,7 @@ def _preprocess( rotary_pos_cos, rotary_pos_sin, sequence_len_offset, + padding_mask, ) if rotary_pos_cos_sin is not None: # only in the case of flashinfer fused rope will we @@ -446,6 +461,7 @@ def forward( *, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoder and finally into the post @@ -456,6 +472,9 @@ def forward( Args: runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. + padding_mask (Tensor, optional): Padding mask for MoE routing. + Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). + Only used for MoE layers to exclude padding tokens from routing computations. """ if self.config.fine_grained_activation_offloading: self.preprocess_for_fine_grained_offloading() @@ -468,13 +487,19 @@ def forward( decoder_input=decoder_input, inference_context=inference_context, packed_seq_params=packed_seq_params, + padding_mask=padding_mask, ) - (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = ( - preproc_output[:5] - ) + ( + decoder_input, + rotary_pos_emb, + rotary_pos_cos, + rotary_pos_sin, + sequence_len_offset, + padding_mask, + ) = preproc_output[:6] - rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None + rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None # Run decoder. hidden_states = self.decoder( @@ -487,6 +512,7 @@ def forward( rotary_pos_cos_sin=rotary_pos_cos_sin, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, + padding_mask=padding_mask, **(extra_block_kwargs or {}), ) @@ -724,6 +750,7 @@ def build_schedule_plan( runtime_gather_output: Optional[bool] = None, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, ): """Builds a computation schedule plan for the model. @@ -749,6 +776,7 @@ def build_schedule_plan( inference_params (InferenceParams, optional): Parameters for inference. Defaults to None. loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None. + padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None. Returns: TransformerModelChunkSchedulePlan: The model chunk schedule plan. @@ -770,6 +798,7 @@ def build_schedule_plan( extra_block_kwargs, runtime_gather_output, loss_mask, + padding_mask, ) def sharded_state_dict( diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 8dcf196da94..fbb960f4be9 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -137,7 +137,7 @@ def __init__( tp_group=tp_group, ) - def forward(self, hidden_states, per_token_scale=None): + def forward(self, hidden_states, per_token_scale=None, **kwargs): """Perform the forward pass through the MLP block.""" # [s, b, 4 * h/p] nvtx_range_push(suffix="linear_fc1") diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 10d10f667fe..153bac00ec1 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -178,13 +178,13 @@ def __init__( self.cudagraph_tensor_store = MoECudaGraphTensorStore() @maybe_skip_or_early_return_by_cudagraph("route") - def route(self, hidden_states: torch.Tensor): + def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Compute token routing for preprocessing. This method uses the router to determine which experts to send each token to, producing routing probabilities and a mapping. """ - probs, routing_map = self.router(hidden_states) + probs, routing_map = self.router(hidden_states, padding_mask=padding_mask) return probs, routing_map @maybe_skip_or_early_return_by_cudagraph("preprocess") @@ -270,7 +270,7 @@ def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Ten output = output + shared_expert_output return output - def forward(self, hidden_states: torch.Tensor): + def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Forward pass for the MoE layer. The forward pass comprises four main steps: @@ -280,7 +280,11 @@ def forward(self, hidden_states: torch.Tensor): 4. Combine: The outputs from the experts are combined and returned. Args: - hidden_states (torch.Tensor): The input tensor to the MoE layer. + hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size]. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + used for correct auxiliary loss computation for packed sequence. + Shape = [bsz, seq_length]. True = padding (exclude), False = valid (include). + Defaults to None (all tokens are valid). Returns: A tuple containing the output tensor and the MLP bias, if any. @@ -291,11 +295,15 @@ def forward(self, hidden_states: torch.Tensor): "are enabled without also enabling sequence parallelism." ) + # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states + if padding_mask is not None: + padding_mask = padding_mask.transpose(0, 1).bool() + # MoE forward: route -> dispatch -> compute -> combine - def custom_forward(hidden_states): + def custom_forward(hidden_states, padding_mask=None): try: shared_expert_output = self.shared_experts_compute(hidden_states) - probs, routing_map = self.route(hidden_states) + probs, routing_map = self.route(hidden_states, padding_mask=padding_mask) hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map) except MoECudaGraphPartialCaptureSignal as e: # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator. @@ -318,11 +326,14 @@ def custom_forward(hidden_states): tensor_parallel.random.get_cuda_rng_tracker, parallel_state.get_tensor_model_parallel_group(), hidden_states, + padding_mask, ) else: - outputs = tensor_parallel.checkpoint(custom_forward, False, hidden_states) + outputs = tensor_parallel.checkpoint( + custom_forward, False, hidden_states, padding_mask + ) else: - outputs = custom_forward(hidden_states) + outputs = custom_forward(hidden_states, padding_mask) return outputs diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 28cff06f5ec..f44d441c765 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -1,5 +1,4 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - import math from dataclasses import dataclass from typing import List, Optional, Union @@ -11,6 +10,7 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig @@ -120,18 +120,34 @@ def switch_load_balancing_loss_func( return aux_loss -def z_loss_func(logits, z_loss_coeff): +def z_loss_func(logits, z_loss_coeff, padding_mask: Optional[torch.Tensor] = None): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. + z_loss_coeff (float): The coefficient for the z-loss. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. """ + logsum = torch.logsumexp(logits, dim=-1) + z_loss_values = torch.square(logsum) + + if padding_mask is not None: + # Invert padding_mask: True (padding) -> 0, False (valid) -> 1 + valid_mask = ~padding_mask + # Only compute z_loss for valid (non-padding) tokens + z_loss_values = z_loss_values * valid_mask + # Compute mean over valid tokens only + num_valid_tokens = valid_mask.sum() + z_loss = z_loss_values.sum() / torch.clamp(num_valid_tokens, min=1.0) * z_loss_coeff + else: + z_loss = torch.mean(z_loss_values) * z_loss_coeff - z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff return z_loss @@ -171,6 +187,28 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_ return capacity +def get_tokens_per_expert_and_token_count( + routing_map: torch.Tensor, + reduce_group: torch.distributed.ProcessGroup, + topk: int = None, + with_padding_mask: bool = False, +) -> torch.Tensor: + """ + Compute global_tokens_per_expert, local_num_tokens and total_num_tokens with padding mask. + """ + local_tokens_per_expert = routing_map.sum(dim=0) + global_tokens_per_expert = reduce_from_tensor_model_parallel_region( + local_tokens_per_expert, reduce_group + ) + if with_padding_mask: + local_num_tokens = local_tokens_per_expert.sum() / topk + total_num_tokens = global_tokens_per_expert.sum() / topk + else: + local_num_tokens = routing_map.shape[0] + total_num_tokens = local_num_tokens * reduce_group.size() + return global_tokens_per_expert, local_num_tokens, total_num_tokens + + class MoEAuxLossAutoScaler(torch.autograd.Function): """An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss.""" @@ -629,35 +667,48 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None): def compute_routing_scores_for_aux_loss( - logits: torch.Tensor, topk: int, score_function: str, fused: bool = False + logits: torch.Tensor, + topk: int, + score_function: str, + fused: bool = False, + padding_mask: Optional[torch.Tensor] = None, ): """Compute routing scores based on the score function. Args: logits (torch.Tensor): The logits tensor after gating, shape: [num_tokens, num_experts]. - + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: - torch.Tensor: The normalized routing scores. + Tuple[torch.Tensor, torch.Tensor]: routing_map and scores. """ if fused: if not HAVE_TE or fused_compute_score_for_moe_aux_loss is None: raise ValueError( "fused_compute_score_for_moe_aux_loss is not available. Please install TE >= 2.6.0." ) - return fused_compute_score_for_moe_aux_loss( + routing_map, scores = fused_compute_score_for_moe_aux_loss( logits=logits, topk=topk, score_function=score_function ) - - if score_function == "softmax": - scores = torch.softmax(logits, dim=-1, dtype=torch.float32) - elif score_function == "sigmoid": - scores = torch.sigmoid(logits) - scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) else: - raise ValueError(f"Invalid score_function: {score_function}") + if score_function == "softmax": + scores = torch.softmax(logits, dim=-1, dtype=torch.float32) + elif score_function == "sigmoid": + scores = torch.sigmoid(logits) + scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) + else: + raise ValueError(f"Invalid score_function: {score_function}") + + _, top_indices = torch.topk(scores, k=topk, dim=1) + routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() - _, top_indices = torch.topk(scores, k=topk, dim=1) - routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() + # Apply padding mask to scores if provided + if padding_mask is not None: + # Invert padding_mask and make True indicates valid tokens + valid_mask = (~padding_mask).unsqueeze(-1) + routing_map = routing_map * valid_mask + scores = scores * valid_mask return routing_map, scores diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 16fc9d9af8f..1c502e212ad 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -1,12 +1,11 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Union import torch from megatron.core.jit import jit_fuser -from megatron.core.tensor_parallel import reduce_from_tensor_model_parallel_region from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.moe.moe_utils import ( MoEAuxLossAutoScaler, @@ -14,6 +13,7 @@ apply_random_logits, apply_router_token_dropping, compute_routing_scores_for_aux_loss, + get_tokens_per_expert_and_token_count, router_gating_linear, save_to_aux_losses_tracker, sinkhorn, @@ -268,22 +268,28 @@ def is_aux_loss_enabled(self) -> bool: return False def _apply_aux_loss( - self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor + self, + probs: torch.Tensor, + scores_for_aux_loss: torch.Tensor, + routing_map: torch.Tensor, + with_padding_mask: bool = False, ): """Apply the auxiliary loss for the given scores and routing map.""" aux_loss_coeff = self.get_aux_loss_coeff("aux_loss") if aux_loss_coeff == 0: return probs - tokens_per_expert = routing_map.sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_cp_group - ) - num_tokens = routing_map.shape[0] - total_num_tokens = num_tokens * self.tp_cp_group.size() + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_cp_group, + topk=self.topk, + with_padding_mask=with_padding_mask, + ) + ) aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=tokens_per_expert, + tokens_per_expert=global_tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -291,7 +297,12 @@ def _apply_aux_loss( fused=self.config.moe_router_fusion, ) probs = self.attach_and_log_load_balancing_loss( - probs, aux_loss_coeff, aux_loss, "load_balancing_loss", self.tp_cp_group + probs, + aux_loss_coeff, + aux_loss, + "load_balancing_loss", + self.tp_cp_group, + valid_token_count=local_num_tokens, ) return probs @@ -302,6 +313,7 @@ def _apply_seq_aux_loss( routing_map: torch.Tensor, seq_length: int, bsz: int, + with_padding_mask: bool = False, ): """Apply the sequence-level auxiliary loss for the given scores and routing map. @@ -315,17 +327,21 @@ def _apply_seq_aux_loss( return probs scores_for_aux_loss = scores_for_aux_loss.reshape(seq_length, -1) - tokens_per_expert = routing_map.reshape(seq_length, -1).sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_cp_group + routing_map = routing_map.reshape(seq_length, -1) + + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_cp_group, + with_padding_mask=with_padding_mask, + topk=self.topk * bsz, + ) ) - total_num_tokens = seq_length * self.tp_cp_group.size() - aux_loss = ( switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=tokens_per_expert, + tokens_per_expert=global_tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -334,31 +350,42 @@ def _apply_seq_aux_loss( ) / bsz ) + probs = self.attach_and_log_load_balancing_loss( - probs, seq_aux_loss_coeff, aux_loss, "seq_load_balancing_loss", self.tp_cp_group + probs, + seq_aux_loss_coeff, + aux_loss, + "seq_load_balancing_loss", + self.tp_cp_group, + valid_token_count=local_num_tokens, ) return probs def _apply_global_aux_loss( - self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor + self, + probs: torch.Tensor, + scores_for_aux_loss: torch.Tensor, + routing_map: torch.Tensor, + with_padding_mask: bool = False, ): """Apply the global auxiliary loss for the given scores and routing map.""" global_aux_loss_coeff = self.get_aux_loss_coeff("global_aux_loss") if global_aux_loss_coeff == 0: return probs - tokens_per_expert = routing_map.sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_dp_cp_group + # Use unified function to compute tokens_per_expert and num_tokens + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_dp_cp_group, + with_padding_mask=with_padding_mask, + topk=self.topk, + ) ) - - self.global_tokens_per_expert += tokens_per_expert + self.global_tokens_per_expert += global_tokens_per_expert self.ga_steps += 1 averated_tokens_per_expert = self.global_tokens_per_expert / self.ga_steps - num_tokens = scores_for_aux_loss.shape[0] - total_num_tokens = num_tokens * self.tp_dp_cp_group.size() - global_aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, tokens_per_expert=averated_tokens_per_expert, @@ -374,6 +401,7 @@ def _apply_global_aux_loss( global_aux_loss, "global_load_balancing_loss", self.tp_dp_cp_group, + valid_token_count=local_num_tokens, ) return probs @@ -384,8 +412,20 @@ def attach_and_log_load_balancing_loss( aux_loss: torch.Tensor, aux_loss_name: str, reduce_group: torch.distributed.ProcessGroup, + valid_token_count: Optional[Union[int, torch.Tensor]] = None, ): - """Attach aux loss function to activation and add to logging.""" + """Attach aux loss function to activation and add to logging. + + Args: + activation (torch.Tensor): Activation tensor to attach the aux loss to. + aux_loss_coeff (float): Coefficient for the aux loss. + aux_loss (torch.Tensor): Computed aux loss. + aux_loss_name (str): Name of the aux loss for logging. + reduce_group (torch.distributed.ProcessGroup): Process group for reduction. + valid_token_count (int or torch.Tensor, optional): Number of valid tokens excluding + padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). + If None, uses activation.shape[0]. Defaults to None. + """ # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly # add the aux loss logging value to other layer's since it is difficult to get the # correct layer_number for MTP. It does not affect the correctness of the calculation @@ -408,17 +448,22 @@ def attach_and_log_load_balancing_loss( # which scales both the main_loss gradient and aux_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads function. # To correct this scaling, we need to scale the aux_loss by num_local_tokens here. - activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * activation.shape[0]) + # Use valid_token_count (excluding padding) if provided, otherwise use total tokens. + num_tokens = valid_token_count if valid_token_count is not None else activation.shape[0] + activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * num_tokens) else: activation = MoEAuxLossAutoScaler.apply(activation, aux_loss) return activation - def apply_z_loss(self, logits): + def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. @@ -426,7 +471,7 @@ def apply_z_loss(self, logits): if self.config.moe_z_loss_coeff is not None and self.training and torch.is_grad_enabled(): # Skip Z loss calculations when using torch.no_grad() or checkpointing. moe_z_loss_coeff = self.config.moe_z_loss_coeff / self.tp_cp_group.size() - z_loss = z_loss_func(logits, moe_z_loss_coeff) + z_loss = z_loss_func(logits, moe_z_loss_coeff, padding_mask=padding_mask) scale_up = 1.0 if self.calculate_per_token_loss: # The expected final scaling for z_loss gradients is @@ -436,7 +481,9 @@ def apply_z_loss(self, logits): # which scales both the main_loss gradient and z_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads(). # To correct this scaling, we need to scale the z_loss by num_local_tokens here. - logits = MoEAuxLossAutoScaler.apply(logits, z_loss * logits.shape[0]) + # Count valid tokens: sum of inverted mask (False -> True = valid) + num_tokens = (~padding_mask).sum() if padding_mask is not None else logits.shape[0] + logits = MoEAuxLossAutoScaler.apply(logits, z_loss * num_tokens) else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) @@ -470,20 +517,32 @@ def apply_input_jitter(self, input: torch.Tensor): return input @jit_fuser - def _apply_expert_bias(self, routing_map: torch.Tensor): + def _apply_expert_bias( + self, routing_map: torch.Tensor, padding_mask: Optional[torch.Tensor] = None + ): """ Update expert bias and tokens_per_expert Prevent extra local tokens accumulation on evaluation or activation recomputation + + Args: + routing_map (torch.Tensor): Token to expert routing map, [num_tokens, num_experts]. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), False = valid (include). """ if self.enable_expert_bias and torch.is_grad_enabled(): with torch.no_grad(): + if padding_mask is not None: + routing_map = routing_map & (~padding_mask) self.local_tokens_per_expert += routing_map.sum(dim=0) - def routing(self, logits: torch.Tensor): + def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Top-k routing function Args: logits (torch.Tensor): Logits tensor after gating. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape = [seq_length, bsz]. True=padding(exclude), + False=valid(include). Defaults to None. Returns: probs (torch.Tensor): The probabilities of token to experts assignment. @@ -493,8 +552,12 @@ def routing(self, logits: torch.Tensor): seq_length, bsz = logits.shape[:2] logits = logits.view(-1, self.config.num_moe_experts) + # Flatten padding_mask to [num_tokens] if provided + if padding_mask is not None: + padding_mask = padding_mask.reshape(-1) + # Apply Z-Loss - logits = self.apply_z_loss(logits) + logits = self.apply_z_loss(logits, padding_mask=padding_mask) # Calculate probs and routing_map for token dispatching if self.routing_type == "sinkhorn": @@ -527,18 +590,35 @@ def routing(self, logits: torch.Tensor): if self.training and torch.is_grad_enabled() and self.is_aux_loss_enabled(): # Calculate scores and routing_map for aux loss routing_map_for_aux_loss, scores_for_aux_loss = compute_routing_scores_for_aux_loss( - logits, self.topk, self.score_function, fused=self.config.moe_router_fusion + logits, + self.topk, + self.score_function, + fused=self.config.moe_router_fusion, + padding_mask=padding_mask, + ) + probs = self._apply_aux_loss( + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + with_padding_mask=padding_mask is not None, ) - probs = self._apply_aux_loss(probs, scores_for_aux_loss, routing_map_for_aux_loss) probs = self._apply_seq_aux_loss( - probs, scores_for_aux_loss, routing_map_for_aux_loss, seq_length, bsz + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + seq_length, + bsz, + with_padding_mask=padding_mask is not None, ) probs = self._apply_global_aux_loss( - probs, scores_for_aux_loss, routing_map_for_aux_loss + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + with_padding_mask=padding_mask is not None, ) # Optionally apply expert bias - self._apply_expert_bias(routing_map) + self._apply_expert_bias(routing_map, padding_mask=padding_mask) return probs, routing_map @@ -548,12 +628,15 @@ def reset_global_aux_loss_tracker(self): self.global_tokens_per_expert.zero_() self.ga_steps.zero_() - def forward(self, input: torch.Tensor): + def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """ Forward pass of the router. Args: input (torch.Tensor): Input tensor. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape = [seq_length, bsz]. True=padding(exclude), + False=valid(include). Defaults to None. """ self._maintain_float32_expert_bias() @@ -565,7 +648,7 @@ def forward(self, input: torch.Tensor): # Apply force load balancing with random logits for benchmark logits = apply_random_logits(logits) - probs, routing_map = self.routing(logits) + probs, routing_map = self.routing(logits, padding_mask=padding_mask) return probs, routing_map diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 023db1fe75a..cbbd7ec00eb 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -390,7 +390,6 @@ def build_layer(layer_spec, layer_number): def has_final_layernorm_in_this_stage(self): """ Check if this vpp stage contains the final layernorm. - Note: Final layernorm now has been moved from the post-process stage to the last decoder layer by using this function. @@ -429,12 +428,18 @@ def _checkpointed_forward( attention_bias: Tensor, packed_seq_params: PackedSeqParams, use_inner_quantization_context: bool, + padding_mask: Optional[Tensor] = None, ): """Forward method with activation checkpointing.""" def custom(start: int, end: int): def custom_forward( - hidden_states, attention_mask, context, context_mask, rotary_pos_emb + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + padding_mask=None, ): for index in range(start, end): layer = self._get_layer(index) @@ -465,6 +470,7 @@ def custom_forward( attention_bias=attention_bias, inference_context=None, packed_seq_params=packed_seq_params, + padding_mask=padding_mask, ) return hidden_states, context @@ -484,6 +490,7 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, + padding_mask, ) else: return tensor_parallel.checkpoint( @@ -494,6 +501,7 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, + padding_mask, ) if self.config.recompute_method == 'uniform': @@ -599,6 +607,7 @@ def forward( inference_context: Optional[BaseInferenceContext] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, *, inference_params: Optional[BaseInferenceContext] = None, dynamic_inference_decode_only: Optional[bool] = None, @@ -708,6 +717,7 @@ def forward( attention_bias=attention_bias, packed_seq_params=packed_seq_params, use_inner_quantization_context=use_inner_quantization_context, + padding_mask=padding_mask, ) else: for l_no, layer in enumerate(self.layers): @@ -745,6 +755,7 @@ def forward( inference_context=inference_context, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, + padding_mask=padding_mask, ) if ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 3ea40577009..21f38b06f30 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -457,7 +457,12 @@ def forward(self, *args, **kwargs): # runners in the cuda graph manager kwargs.pop("dynamic_inference_decode_only", None) hidden_states, context = self._forward_attention(*args, **kwargs) - output = self._forward_mlp(hidden_states, kwargs.get("inference_context", None)) + + output = self._forward_mlp( + hidden_states, + kwargs.get("inference_context", None), + padding_mask=kwargs.get("padding_mask", None), + ) return output, context def _forward_attention( @@ -474,6 +479,7 @@ def _forward_attention( inference_context: Optional[Any] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, *, inference_params: Optional[Any] = None, ): @@ -591,12 +597,18 @@ def _forward_attention( return hidden_states, context - def _forward_mlp(self, hidden_states, inference_context=None): + def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None): """ Perform a forward pass through the feed-forward layer. Args: hidden_states (Tensor): Transformed hidden states before the MLP layernorm. + Shape [seq_length, batch_size, hidden_size]. + inference_context: Inference context for optimizations. + padding_mask (Tensor, optional): Padding mask for MoE routing. + Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). + Only used for MoE layers to exclude padding tokens from aux loss computations. + The MoELayer will internally transform this to [seq_length, bsz] format. Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. @@ -642,7 +654,7 @@ def _forward_mlp(self, hidden_states, inference_context=None): assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." - cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) + cudagraph_outputs = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: @@ -656,10 +668,11 @@ def _forward_mlp(self, hidden_states, inference_context=None): tensor_parallel.random.get_cuda_rng_tracker, self.pg_collection.tp, pre_mlp_layernorm_output, + padding_mask=padding_mask, ) else: mlp_output_with_bias = tensor_parallel.checkpoint( - self.mlp, False, pre_mlp_layernorm_output + self.mlp, False, pre_mlp_layernorm_output, padding_mask=padding_mask ) elif should_chunk_mlp_for_prefill: # Chunk input along sequence dimension @@ -675,7 +688,7 @@ def _forward_mlp(self, hidden_states, inference_context=None): bias_output = torch.stack(bias_chunks, dim=0).sum(dim=0) if bias_chunks else None mlp_output_with_bias = (mlp_output, bias_output) else: - mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) if self.recompute_pre_mlp_layernorm: # discard the output of the pre-mlp layernorm and register the recompute diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index a497bdbd9de..b866fbbf5c2 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -1,3 +1,4 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy import itertools import logging diff --git a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py index 81e61a3404a..6c59dd3f9e3 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py @@ -23,7 +23,7 @@ from tests.unit_tests.test_utilities import Utils -def build_model(config): +def build_model(config, use_padding_mask=False): seq_len = 32 max_seq_len = 300 # ids = random.sample([i for i in range(max_seq_len)], seq_len) @@ -39,6 +39,12 @@ def build_model(config): "attention_mask": torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda(), } + # Optionally add padding_mask with same shape as input_ids + if use_padding_mask: + padding_mask = torch.zeros((1, seq_len), dtype=torch.bool).cuda() + padding_mask[0, -8:] = True + data["padding_mask"] = padding_mask + # build layer spec transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True) mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True) @@ -48,7 +54,7 @@ def build_model(config): config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=100, + vocab_size=128, pre_process=True, post_process=True, max_sequence_length=max_seq_len, @@ -174,3 +180,109 @@ def test_1f1b_schedule_model_chunk(self, mtp_layers, dispatcher_type, fp8_flag, gpt_models[i] = None gc.collect() torch.cuda.empty_cache() + + @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0") + @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types()) + @pytest.mark.parametrize("layers", [[2, 1], [1, 1]]) + @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) + def test_1f1b_schedule_model_chunk_with_padding_mask(self, dispatcher_type, layers, tp_size): + """ + Verifies all-to-all overlap optimization with padding_mask produces + the same results as the reference implementation with various TP/EP/CP combinations. + """ + # Re-initialize model parallel with the specified configuration + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=1, + expert_model_parallel_size=4, + expert_tensor_parallel_size=1, + ) + set_streams() + + microbatches = 1 + + gpt_models = [] + schedule_plans = [] + ref_captures = [] + datas = [] + + # create TransformerConfig + extra_kwargs = { + "moe_token_dispatcher_type": dispatcher_type, + "tensor_model_parallel_size": tp_size, + "sequence_parallel": tp_size > 1, + } + if dispatcher_type == "flex": + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" + extra_kwargs["moe_router_dtype"] = "fp32" + with deterministic_mode(): + for layer_num in layers: + output_tensors = [] + # build config + config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs) + # build model with padding_mask + gpt_model, schedule_plan, data = build_model(config, use_padding_mask=True) + gpt_model.cuda() + gpt_models.append(gpt_model) + datas.append(data) + schedule_plans.append(schedule_plan) + + # run reference + for _ in range(microbatches): + loss = gpt_model.forward(**data) + loss = float16_to_fp32(loss) + loss.backward(torch.ones_like(loss)) + output_tensors.append(loss) + + capture = {"outputs": output_tensors} + for name, param in gpt_model.named_parameters(): + capture[name] = param.grad + ref_captures.append(capture) + gpt_model.zero_grad() + assert gpt_models[0].embedding is not None + assert gpt_models[1].embedding is not None + # run a2a overlap + capture_0 = {"outputs": []} + capture_1 = {"outputs": []} + a2a_captures = [capture_0, capture_1] + for i in range(microbatches): + # 1st forward + if i > 0: + assert ( + schedule_plans[0].pre_process is None + ), "pre_process should be released after backward" + schedule_plans[0] = gpt_models[0].build_schedule_plan(**datas[0]) + schedule_plans[1] = gpt_models[1].build_schedule_plan(**datas[1]) + f_input_0 = TransformerModelChunkSchedulePlan.run(schedule_plans[0], None) + capture_0["outputs"].append(f_input_0) + # overlap + f_input_1 = TransformerModelChunkSchedulePlan.run( + schedule_plans[1], schedule_plans[0], b_grad=torch.ones_like(f_input_0) + ) + capture_1["outputs"].append(f_input_1) + # last backward + TransformerModelChunkSchedulePlan.run( + None, schedule_plans[1], b_grad=torch.ones_like(f_input_1) + ) + for i in range(len(gpt_models)): + for name, param in gpt_models[i].named_parameters(): + a2a_captures[i][name] = param.grad + + # compare results + for i in range(len(ref_captures)): + comp_res = compare_captures(ref_captures[i], a2a_captures[i], True, True) + assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}" + + # release resources is necessary, otherwise later testcases will oom + for i in range(len(schedule_plans)): + schedule_plans[i] = None + ref_captures[i] = None + a2a_captures[i] = None + for k in datas[i]: + datas[i][k] = None + datas[i] = None + gpt_models[i].zero_grad() + gpt_models[i] = None + gc.collect() + torch.cuda.empty_cache() diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 7fb97f6e586..5ec096e5a04 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -502,8 +502,8 @@ def test_mtp_layer_overlap(self, dispatcher_type, fp8_flag): position_ids = torch.tensor(data, dtype=torch.int64).repeat((1, 1)).cuda() attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda() # get rotary pos emb - _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _ = gpt_model._preprocess( - input_ids, position_ids + _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _, _padding_mask = ( + gpt_model._preprocess(input_ids, position_ids) ) # reset model params = reset_model(gpt_model) diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py index b1f78582383..f5726777383 100644 --- a/tests/unit_tests/transformer/moe/test_aux_loss.py +++ b/tests/unit_tests/transformer/moe/test_aux_loss.py @@ -576,3 +576,192 @@ def test_force_balanced_aux_loss(self, tp_size, ep_size, cp_size): reduce_from_tensor_model_parallel_region(aux_loss, router.tp_cp_group) assert aux_loss.item() == 1, f"{aux_loss_type}: {aux_loss.item()}" clear_aux_losses_tracker() + + +class TestPaddingMaskAuxLoss: + """Test padding mask support in various aux loss types.""" + + def setup_model_parallel(self, tp_size=1, ep_size=1, cp_size=1, sequence_parallel=False): + """Initialize model parallel with given configuration. + + Args: + tp_size: Tensor parallel size. + ep_size: Expert parallel size. + cp_size: Context parallel size. + """ + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=1, + context_parallel_size=cp_size, + expert_model_parallel_size=ep_size, + ) + _set_random_seed(seed_=123, data_parallel_random_init=False) + + # Store parallel configuration + self.tp_size = tp_size + self.ep_size = ep_size + self.cp_size = cp_size + + # Default configuration + self.default_transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=8, + num_moe_experts=32, + use_cpu_initialization=True, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=8, + moe_aux_loss_coeff=1.0, + bf16=True, + params_dtype=torch.bfloat16, + add_bias_linear=False, + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + context_parallel_size=cp_size, + sequence_parallel=sequence_parallel and tp_size > 1, + ) + + def new_router(self, **kwargs): + """Create a new router with updated configuration.""" + pg_collection = get_default_pg_collection() + new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs) + router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection) + router.set_layer_number(0) + return router + + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("sequence_parallel", [True, False]) + @pytest.mark.parametrize("aux_loss_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"]) + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_padding_mask_removes_padding_tokens( + self, aux_loss_type, tp_size, ep_size, cp_size, sequence_parallel + ): + """Test that padding tokens are correctly excluded from aux loss calculation.""" + # Initialize model parallel with given configuration + self.setup_model_parallel( + tp_size=tp_size, ep_size=ep_size, cp_size=cp_size, sequence_parallel=sequence_parallel + ) + + try: + clear_aux_losses_tracker() + + router = self.new_router( + moe_router_load_balancing_type=aux_loss_type, + moe_aux_loss_coeff=1.0, + moe_router_dtype="fp64", + ).cuda() + + seq_len = 32 + batch_size = 2 + hidden_size = router.config.hidden_size + + # Create input with padding + hidden_states_full = torch.randn( + (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' + ) + + # Create padding mask: first half valid (False), second half padding (True) + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True + + # Test with padding mask + router.weight.grad = None + scores_with_mask, routing_map_with_mask = router( + hidden_states_full, padding_mask=padding_mask + ) + scores_with_mask.backward(torch.zeros_like(scores_with_mask)) + + loss_name = { + "aux_loss": "load_balancing_loss", + "seq_aux_loss": "seq_load_balancing_loss", + "global_aux_loss": "global_load_balancing_loss", + }[aux_loss_type] + + tracker = get_moe_layer_wise_logging_tracker() + aux_loss_with_mask = tracker[loss_name]["values"][0].clone() + grad_with_mask = router.weight.grad.clone() + + # Test without padding (with only half of the tokens) + clear_aux_losses_tracker() + router.weight.grad = None + hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] + scores_without_mask, routing_map_without_mask = router(hidden_states_valid) + scores_without_mask.backward(torch.zeros_like(scores_without_mask)) + + aux_loss_without_mask = tracker[loss_name]["values"][0].clone() + grad_without_mask = router.weight.grad.clone() + + # The aux loss with mask should be close to the aux loss without mask + assert torch.equal(aux_loss_with_mask, aux_loss_without_mask) + assert torch.equal(grad_with_mask, grad_without_mask) + + clear_aux_losses_tracker() + finally: + # Always cleanup model parallel + Utils.destroy_model_parallel() + + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_padding_mask_with_z_loss(self, tp_size, ep_size, cp_size): + """Test that padding mask works correctly with z_loss.""" + # Initialize model parallel with given configuration + self.setup_model_parallel(tp_size=tp_size, ep_size=ep_size, cp_size=cp_size) + + try: + clear_aux_losses_tracker() + + router = self.new_router( + moe_router_load_balancing_type="aux_loss", + moe_aux_loss_coeff=0.0, + moe_z_loss_coeff=1.0, + moe_router_dtype="fp32", + ).cuda() + + seq_len = 32 + batch_size = 2 + hidden_size = router.config.hidden_size + + # Create input + hidden_states_full = torch.randn( + (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' + ) + + # Create padding mask: first half valid (False), second half padding (True) + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True + + # Test with padding mask + router.weight.grad = None + scores_with_mask, _ = router(hidden_states_full, padding_mask=padding_mask) + scores_with_mask.sum().backward() + + tracker = get_moe_layer_wise_logging_tracker() + z_loss_with_mask = tracker["z_loss"]["values"][0].clone() + grad_with_mask = router.weight.grad.clone() + + # Test without padding (with only half of the tokens) + clear_aux_losses_tracker() + router.weight.grad = None + hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] + scores_without_mask, _ = router(hidden_states_valid) + scores_without_mask.sum().backward() + + z_loss_without_mask = tracker["z_loss"]["values"][0].clone() + grad_without_mask = router.weight.grad.clone() + + # The z_loss with mask should be close to the z_loss without mask + assert torch.equal(z_loss_with_mask, z_loss_without_mask) + assert torch.equal(grad_with_mask, grad_without_mask) + + clear_aux_losses_tracker() + finally: + # Always cleanup model parallel + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py index 677d938cdc7..abd1a4db2dc 100644 --- a/tests/unit_tests/transformer/moe/test_routers.py +++ b/tests/unit_tests/transformer/moe/test_routers.py @@ -125,6 +125,53 @@ def test_aux_loss(self): out.sum().mul_(0).backward() assert self.sequential_mlp.router.weight.grad.abs().sum() > 0 + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_router_with_padding_mask(self): + """Test that padding mask correctly excludes padding tokens from routing.""" + self.router = self.router.cuda() + seq_len = 32 + batch_size = 2 + hidden_size = self.router.config.hidden_size + + # Create input with shape [seq_len, batch_size, hidden_size] + hidden_states = torch.randn((seq_len, batch_size, hidden_size)).cuda().bfloat16() + + # Create padding mask: first half valid (False), second half padding (True) + # padding_mask shape: [seq_len, batch_size] + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True # Second half is padding + + # Test forward pass with padding mask + with torch.no_grad(): + probs_with_mask, routing_map_with_mask = self.router( + hidden_states, padding_mask=padding_mask + ) + + # Test forward pass without padding mask (only valid tokens) + hidden_states_valid = hidden_states[: seq_len // 2, :, :] + probs_without_mask, routing_map_without_mask = self.router(hidden_states_valid) + + # The valid part of routing with mask should match routing without mask + probs_valid_part = probs_with_mask.reshape(seq_len, batch_size, -1)[ + : seq_len // 2, :, : + ] + probs_valid_part = probs_valid_part.reshape(-1, probs_valid_part.shape[-1]) + + # Check that shapes are as expected + assert probs_with_mask.shape == ( + seq_len * batch_size, + self.router.config.num_moe_experts, + ) + assert routing_map_with_mask.shape == ( + seq_len * batch_size, + self.router.config.num_moe_experts, + ) + + # Verify that probs for valid tokens are similar + assert torch.equal(probs_valid_part, probs_without_mask) + @pytest.mark.internal @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_router_dtype(self): From 1068d775d665b9629193c5c8ec60813c4ec2b118 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 23 Dec 2025 23:04:37 -0600 Subject: [PATCH 205/248] Revert "[Dev] Remove calculation of padding token in moe routing loss (#2121)" (#2747) Signed-off-by: Charlie Truong --- .../core/extensions/transformer_engine.py | 2 +- .../common/model_chunk_schedule_plan.py | 2 - .../core/models/gpt/fine_grained_callables.py | 21 +- megatron/core/models/gpt/gpt_model.py | 37 +--- megatron/core/transformer/mlp.py | 2 +- megatron/core/transformer/moe/moe_layer.py | 27 +-- megatron/core/transformer/moe/moe_utils.py | 83 ++------ megatron/core/transformer/moe/router.py | 167 ++++------------ .../core/transformer/transformer_block.py | 15 +- .../core/transformer/transformer_layer.py | 23 +-- .../python_scripts/recipe_parser.py | 1 - .../a2a_overlap/test_schedule_chunk_1f1b.py | 116 +---------- .../a2a_overlap/test_schedule_layer_1f1b.py | 4 +- .../transformer/moe/test_aux_loss.py | 189 ------------------ .../transformer/moe/test_routers.py | 47 ----- 15 files changed, 90 insertions(+), 646 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 546f8a59318..acb93ef7853 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -1851,7 +1851,7 @@ def forward_post_hook(module, *_) -> None: "TEFusedMLP module does not support submodules with post-backward hooks" ) - def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]: + def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]: """Forward.""" # Construct fused impl if needed diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 07bab1cb486..486a498dd73 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -305,7 +305,6 @@ def __init__( extra_block_kwargs=None, runtime_gather_output: Optional[bool] = None, loss_mask: Optional[Tensor] = None, - padding_mask=None, ): """Initialize the schedule plan of all Transformer layers' sub-modules. @@ -348,7 +347,6 @@ def __init__( self._model_chunk_state.mtp_hidden_states = None self._model_chunk_state.loss_mask = loss_mask self._model_chunk_state.packed_seq_params = packed_seq_params - self._model_chunk_state.padding_mask = padding_mask self._model_chunk_state.extra_block_kwargs = extra_block_kwargs self._model_chunk_state.runtime_gather_output = runtime_gather_output self._model_chunk_state.model = model diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index b0923a37b80..741a25326fb 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -120,19 +120,13 @@ def forward_impl(self): if not self.gpt_model.pre_process: self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor # Run GPTModel._preprocess - ( - decoder_input, - rotary_pos_emb, - rotary_pos_cos, - rotary_pos_sin, - sequence_len_offset, - padding_mask, - ) = self.gpt_model._preprocess( - input_ids=self.chunk_state.input_ids, - position_ids=self.chunk_state.position_ids, - decoder_input=self.chunk_state.decoder_input, - packed_seq_params=self.chunk_state.packed_seq_params, - padding_mask=self.chunk_state.padding_mask, + decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = ( + self.gpt_model._preprocess( + input_ids=self.chunk_state.input_ids, + position_ids=self.chunk_state.position_ids, + decoder_input=self.chunk_state.decoder_input, + packed_seq_params=self.chunk_state.packed_seq_params, + ) ) # Saved for later use @@ -141,7 +135,6 @@ def forward_impl(self): self.chunk_state.rotary_pos_cos = rotary_pos_cos self.chunk_state.rotary_pos_sin = rotary_pos_sin self.chunk_state.sequence_len_offset = sequence_len_offset - self.chunk_state.padding_mask = padding_mask return decoder_input diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 9e70c677226..a1230568cbd 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -284,7 +284,6 @@ def _preprocess( decoder_input: Tensor = None, inference_context: BaseInferenceContext = None, packed_seq_params: PackedSeqParams = None, - padding_mask: Optional[Tensor] = None, ): """Preprocesses inputs for the transformer decoder. @@ -301,20 +300,7 @@ def _preprocess( if decoder_input is not None: pass elif self.pre_process: - if padding_mask is not None: - assert padding_mask.shape == input_ids.shape, ( - f"padding_mask shape {padding_mask.shape} does not match " - f"input_ids shape {input_ids.shape}" - ) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - if padding_mask is not None and self.config.sequence_parallel: - padding_mask = ( - tensor_parallel.scatter_to_sequence_parallel_region( - padding_mask.transpose(0, 1).contiguous() - ) - .transpose(0, 1) - .contiguous() - ) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor @@ -417,7 +403,6 @@ def _preprocess( rotary_pos_cos, rotary_pos_sin, sequence_len_offset, - padding_mask, ) if rotary_pos_cos_sin is not None: # only in the case of flashinfer fused rope will we @@ -461,7 +446,6 @@ def forward( *, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, - padding_mask: Optional[Tensor] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoder and finally into the post @@ -472,9 +456,6 @@ def forward( Args: runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. - padding_mask (Tensor, optional): Padding mask for MoE routing. - Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). - Only used for MoE layers to exclude padding tokens from routing computations. """ if self.config.fine_grained_activation_offloading: self.preprocess_for_fine_grained_offloading() @@ -487,19 +468,13 @@ def forward( decoder_input=decoder_input, inference_context=inference_context, packed_seq_params=packed_seq_params, - padding_mask=padding_mask, ) - ( - decoder_input, - rotary_pos_emb, - rotary_pos_cos, - rotary_pos_sin, - sequence_len_offset, - padding_mask, - ) = preproc_output[:6] + (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = ( + preproc_output[:5] + ) - rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None + rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None # Run decoder. hidden_states = self.decoder( @@ -512,7 +487,6 @@ def forward( rotary_pos_cos_sin=rotary_pos_cos_sin, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, - padding_mask=padding_mask, **(extra_block_kwargs or {}), ) @@ -750,7 +724,6 @@ def build_schedule_plan( runtime_gather_output: Optional[bool] = None, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, - padding_mask: Optional[Tensor] = None, ): """Builds a computation schedule plan for the model. @@ -776,7 +749,6 @@ def build_schedule_plan( inference_params (InferenceParams, optional): Parameters for inference. Defaults to None. loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None. - padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None. Returns: TransformerModelChunkSchedulePlan: The model chunk schedule plan. @@ -798,7 +770,6 @@ def build_schedule_plan( extra_block_kwargs, runtime_gather_output, loss_mask, - padding_mask, ) def sharded_state_dict( diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index fbb960f4be9..8dcf196da94 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -137,7 +137,7 @@ def __init__( tp_group=tp_group, ) - def forward(self, hidden_states, per_token_scale=None, **kwargs): + def forward(self, hidden_states, per_token_scale=None): """Perform the forward pass through the MLP block.""" # [s, b, 4 * h/p] nvtx_range_push(suffix="linear_fc1") diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 153bac00ec1..10d10f667fe 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -178,13 +178,13 @@ def __init__( self.cudagraph_tensor_store = MoECudaGraphTensorStore() @maybe_skip_or_early_return_by_cudagraph("route") - def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + def route(self, hidden_states: torch.Tensor): """Compute token routing for preprocessing. This method uses the router to determine which experts to send each token to, producing routing probabilities and a mapping. """ - probs, routing_map = self.router(hidden_states, padding_mask=padding_mask) + probs, routing_map = self.router(hidden_states) return probs, routing_map @maybe_skip_or_early_return_by_cudagraph("preprocess") @@ -270,7 +270,7 @@ def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Ten output = output + shared_expert_output return output - def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + def forward(self, hidden_states: torch.Tensor): """Forward pass for the MoE layer. The forward pass comprises four main steps: @@ -280,11 +280,7 @@ def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tens 4. Combine: The outputs from the experts are combined and returned. Args: - hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size]. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - used for correct auxiliary loss computation for packed sequence. - Shape = [bsz, seq_length]. True = padding (exclude), False = valid (include). - Defaults to None (all tokens are valid). + hidden_states (torch.Tensor): The input tensor to the MoE layer. Returns: A tuple containing the output tensor and the MLP bias, if any. @@ -295,15 +291,11 @@ def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tens "are enabled without also enabling sequence parallelism." ) - # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states - if padding_mask is not None: - padding_mask = padding_mask.transpose(0, 1).bool() - # MoE forward: route -> dispatch -> compute -> combine - def custom_forward(hidden_states, padding_mask=None): + def custom_forward(hidden_states): try: shared_expert_output = self.shared_experts_compute(hidden_states) - probs, routing_map = self.route(hidden_states, padding_mask=padding_mask) + probs, routing_map = self.route(hidden_states) hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map) except MoECudaGraphPartialCaptureSignal as e: # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator. @@ -326,14 +318,11 @@ def custom_forward(hidden_states, padding_mask=None): tensor_parallel.random.get_cuda_rng_tracker, parallel_state.get_tensor_model_parallel_group(), hidden_states, - padding_mask, ) else: - outputs = tensor_parallel.checkpoint( - custom_forward, False, hidden_states, padding_mask - ) + outputs = tensor_parallel.checkpoint(custom_forward, False, hidden_states) else: - outputs = custom_forward(hidden_states, padding_mask) + outputs = custom_forward(hidden_states) return outputs diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index f44d441c765..28cff06f5ec 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -1,4 +1,5 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import math from dataclasses import dataclass from typing import List, Optional, Union @@ -10,7 +11,6 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name -from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig @@ -120,34 +120,18 @@ def switch_load_balancing_loss_func( return aux_loss -def z_loss_func(logits, z_loss_coeff, padding_mask: Optional[torch.Tensor] = None): +def z_loss_func(logits, z_loss_coeff): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. - z_loss_coeff (float): The coefficient for the z-loss. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape [num_tokens]. True = padding (exclude), - False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. """ - logsum = torch.logsumexp(logits, dim=-1) - z_loss_values = torch.square(logsum) - - if padding_mask is not None: - # Invert padding_mask: True (padding) -> 0, False (valid) -> 1 - valid_mask = ~padding_mask - # Only compute z_loss for valid (non-padding) tokens - z_loss_values = z_loss_values * valid_mask - # Compute mean over valid tokens only - num_valid_tokens = valid_mask.sum() - z_loss = z_loss_values.sum() / torch.clamp(num_valid_tokens, min=1.0) * z_loss_coeff - else: - z_loss = torch.mean(z_loss_values) * z_loss_coeff + z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff return z_loss @@ -187,28 +171,6 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_ return capacity -def get_tokens_per_expert_and_token_count( - routing_map: torch.Tensor, - reduce_group: torch.distributed.ProcessGroup, - topk: int = None, - with_padding_mask: bool = False, -) -> torch.Tensor: - """ - Compute global_tokens_per_expert, local_num_tokens and total_num_tokens with padding mask. - """ - local_tokens_per_expert = routing_map.sum(dim=0) - global_tokens_per_expert = reduce_from_tensor_model_parallel_region( - local_tokens_per_expert, reduce_group - ) - if with_padding_mask: - local_num_tokens = local_tokens_per_expert.sum() / topk - total_num_tokens = global_tokens_per_expert.sum() / topk - else: - local_num_tokens = routing_map.shape[0] - total_num_tokens = local_num_tokens * reduce_group.size() - return global_tokens_per_expert, local_num_tokens, total_num_tokens - - class MoEAuxLossAutoScaler(torch.autograd.Function): """An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss.""" @@ -667,48 +629,35 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None): def compute_routing_scores_for_aux_loss( - logits: torch.Tensor, - topk: int, - score_function: str, - fused: bool = False, - padding_mask: Optional[torch.Tensor] = None, + logits: torch.Tensor, topk: int, score_function: str, fused: bool = False ): """Compute routing scores based on the score function. Args: logits (torch.Tensor): The logits tensor after gating, shape: [num_tokens, num_experts]. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape [num_tokens]. True = padding (exclude), - False = valid (include). Defaults to None. + Returns: - Tuple[torch.Tensor, torch.Tensor]: routing_map and scores. + torch.Tensor: The normalized routing scores. """ if fused: if not HAVE_TE or fused_compute_score_for_moe_aux_loss is None: raise ValueError( "fused_compute_score_for_moe_aux_loss is not available. Please install TE >= 2.6.0." ) - routing_map, scores = fused_compute_score_for_moe_aux_loss( + return fused_compute_score_for_moe_aux_loss( logits=logits, topk=topk, score_function=score_function ) - else: - if score_function == "softmax": - scores = torch.softmax(logits, dim=-1, dtype=torch.float32) - elif score_function == "sigmoid": - scores = torch.sigmoid(logits) - scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) - else: - raise ValueError(f"Invalid score_function: {score_function}") - _, top_indices = torch.topk(scores, k=topk, dim=1) - routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() + if score_function == "softmax": + scores = torch.softmax(logits, dim=-1, dtype=torch.float32) + elif score_function == "sigmoid": + scores = torch.sigmoid(logits) + scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) + else: + raise ValueError(f"Invalid score_function: {score_function}") - # Apply padding mask to scores if provided - if padding_mask is not None: - # Invert padding_mask and make True indicates valid tokens - valid_mask = (~padding_mask).unsqueeze(-1) - routing_map = routing_map * valid_mask - scores = scores * valid_mask + _, top_indices = torch.topk(scores, k=topk, dim=1) + routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() return routing_map, scores diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 1c502e212ad..16fc9d9af8f 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -1,11 +1,12 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional import torch from megatron.core.jit import jit_fuser +from megatron.core.tensor_parallel import reduce_from_tensor_model_parallel_region from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.moe.moe_utils import ( MoEAuxLossAutoScaler, @@ -13,7 +14,6 @@ apply_random_logits, apply_router_token_dropping, compute_routing_scores_for_aux_loss, - get_tokens_per_expert_and_token_count, router_gating_linear, save_to_aux_losses_tracker, sinkhorn, @@ -268,28 +268,22 @@ def is_aux_loss_enabled(self) -> bool: return False def _apply_aux_loss( - self, - probs: torch.Tensor, - scores_for_aux_loss: torch.Tensor, - routing_map: torch.Tensor, - with_padding_mask: bool = False, + self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor ): """Apply the auxiliary loss for the given scores and routing map.""" aux_loss_coeff = self.get_aux_loss_coeff("aux_loss") if aux_loss_coeff == 0: return probs - - global_tokens_per_expert, local_num_tokens, total_num_tokens = ( - get_tokens_per_expert_and_token_count( - routing_map=routing_map, - reduce_group=self.tp_cp_group, - topk=self.topk, - with_padding_mask=with_padding_mask, - ) + tokens_per_expert = routing_map.sum(dim=0) + tokens_per_expert = reduce_from_tensor_model_parallel_region( + tokens_per_expert, self.tp_cp_group ) + num_tokens = routing_map.shape[0] + total_num_tokens = num_tokens * self.tp_cp_group.size() + aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=global_tokens_per_expert, + tokens_per_expert=tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -297,12 +291,7 @@ def _apply_aux_loss( fused=self.config.moe_router_fusion, ) probs = self.attach_and_log_load_balancing_loss( - probs, - aux_loss_coeff, - aux_loss, - "load_balancing_loss", - self.tp_cp_group, - valid_token_count=local_num_tokens, + probs, aux_loss_coeff, aux_loss, "load_balancing_loss", self.tp_cp_group ) return probs @@ -313,7 +302,6 @@ def _apply_seq_aux_loss( routing_map: torch.Tensor, seq_length: int, bsz: int, - with_padding_mask: bool = False, ): """Apply the sequence-level auxiliary loss for the given scores and routing map. @@ -327,21 +315,17 @@ def _apply_seq_aux_loss( return probs scores_for_aux_loss = scores_for_aux_loss.reshape(seq_length, -1) - routing_map = routing_map.reshape(seq_length, -1) - - global_tokens_per_expert, local_num_tokens, total_num_tokens = ( - get_tokens_per_expert_and_token_count( - routing_map=routing_map, - reduce_group=self.tp_cp_group, - with_padding_mask=with_padding_mask, - topk=self.topk * bsz, - ) + tokens_per_expert = routing_map.reshape(seq_length, -1).sum(dim=0) + tokens_per_expert = reduce_from_tensor_model_parallel_region( + tokens_per_expert, self.tp_cp_group ) + total_num_tokens = seq_length * self.tp_cp_group.size() + aux_loss = ( switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=global_tokens_per_expert, + tokens_per_expert=tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -350,42 +334,31 @@ def _apply_seq_aux_loss( ) / bsz ) - probs = self.attach_and_log_load_balancing_loss( - probs, - seq_aux_loss_coeff, - aux_loss, - "seq_load_balancing_loss", - self.tp_cp_group, - valid_token_count=local_num_tokens, + probs, seq_aux_loss_coeff, aux_loss, "seq_load_balancing_loss", self.tp_cp_group ) return probs def _apply_global_aux_loss( - self, - probs: torch.Tensor, - scores_for_aux_loss: torch.Tensor, - routing_map: torch.Tensor, - with_padding_mask: bool = False, + self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor ): """Apply the global auxiliary loss for the given scores and routing map.""" global_aux_loss_coeff = self.get_aux_loss_coeff("global_aux_loss") if global_aux_loss_coeff == 0: return probs - # Use unified function to compute tokens_per_expert and num_tokens - global_tokens_per_expert, local_num_tokens, total_num_tokens = ( - get_tokens_per_expert_and_token_count( - routing_map=routing_map, - reduce_group=self.tp_dp_cp_group, - with_padding_mask=with_padding_mask, - topk=self.topk, - ) + tokens_per_expert = routing_map.sum(dim=0) + tokens_per_expert = reduce_from_tensor_model_parallel_region( + tokens_per_expert, self.tp_dp_cp_group ) - self.global_tokens_per_expert += global_tokens_per_expert + + self.global_tokens_per_expert += tokens_per_expert self.ga_steps += 1 averated_tokens_per_expert = self.global_tokens_per_expert / self.ga_steps + num_tokens = scores_for_aux_loss.shape[0] + total_num_tokens = num_tokens * self.tp_dp_cp_group.size() + global_aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, tokens_per_expert=averated_tokens_per_expert, @@ -401,7 +374,6 @@ def _apply_global_aux_loss( global_aux_loss, "global_load_balancing_loss", self.tp_dp_cp_group, - valid_token_count=local_num_tokens, ) return probs @@ -412,20 +384,8 @@ def attach_and_log_load_balancing_loss( aux_loss: torch.Tensor, aux_loss_name: str, reduce_group: torch.distributed.ProcessGroup, - valid_token_count: Optional[Union[int, torch.Tensor]] = None, ): - """Attach aux loss function to activation and add to logging. - - Args: - activation (torch.Tensor): Activation tensor to attach the aux loss to. - aux_loss_coeff (float): Coefficient for the aux loss. - aux_loss (torch.Tensor): Computed aux loss. - aux_loss_name (str): Name of the aux loss for logging. - reduce_group (torch.distributed.ProcessGroup): Process group for reduction. - valid_token_count (int or torch.Tensor, optional): Number of valid tokens excluding - padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). - If None, uses activation.shape[0]. Defaults to None. - """ + """Attach aux loss function to activation and add to logging.""" # TODO (zijiey): fix the per_layer_logging for MTP, currently it will incorrectly # add the aux loss logging value to other layer's since it is difficult to get the # correct layer_number for MTP. It does not affect the correctness of the calculation @@ -448,22 +408,17 @@ def attach_and_log_load_balancing_loss( # which scales both the main_loss gradient and aux_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads function. # To correct this scaling, we need to scale the aux_loss by num_local_tokens here. - # Use valid_token_count (excluding padding) if provided, otherwise use total tokens. - num_tokens = valid_token_count if valid_token_count is not None else activation.shape[0] - activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * num_tokens) + activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * activation.shape[0]) else: activation = MoEAuxLossAutoScaler.apply(activation, aux_loss) return activation - def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): + def apply_z_loss(self, logits): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape [num_tokens]. True = padding (exclude), - False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. @@ -471,7 +426,7 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): if self.config.moe_z_loss_coeff is not None and self.training and torch.is_grad_enabled(): # Skip Z loss calculations when using torch.no_grad() or checkpointing. moe_z_loss_coeff = self.config.moe_z_loss_coeff / self.tp_cp_group.size() - z_loss = z_loss_func(logits, moe_z_loss_coeff, padding_mask=padding_mask) + z_loss = z_loss_func(logits, moe_z_loss_coeff) scale_up = 1.0 if self.calculate_per_token_loss: # The expected final scaling for z_loss gradients is @@ -481,9 +436,7 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): # which scales both the main_loss gradient and z_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads(). # To correct this scaling, we need to scale the z_loss by num_local_tokens here. - # Count valid tokens: sum of inverted mask (False -> True = valid) - num_tokens = (~padding_mask).sum() if padding_mask is not None else logits.shape[0] - logits = MoEAuxLossAutoScaler.apply(logits, z_loss * num_tokens) + logits = MoEAuxLossAutoScaler.apply(logits, z_loss * logits.shape[0]) else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) @@ -517,32 +470,20 @@ def apply_input_jitter(self, input: torch.Tensor): return input @jit_fuser - def _apply_expert_bias( - self, routing_map: torch.Tensor, padding_mask: Optional[torch.Tensor] = None - ): + def _apply_expert_bias(self, routing_map: torch.Tensor): """ Update expert bias and tokens_per_expert Prevent extra local tokens accumulation on evaluation or activation recomputation - - Args: - routing_map (torch.Tensor): Token to expert routing map, [num_tokens, num_experts]. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape [num_tokens]. True = padding (exclude), False = valid (include). """ if self.enable_expert_bias and torch.is_grad_enabled(): with torch.no_grad(): - if padding_mask is not None: - routing_map = routing_map & (~padding_mask) self.local_tokens_per_expert += routing_map.sum(dim=0) - def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + def routing(self, logits: torch.Tensor): """Top-k routing function Args: logits (torch.Tensor): Logits tensor after gating. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape = [seq_length, bsz]. True=padding(exclude), - False=valid(include). Defaults to None. Returns: probs (torch.Tensor): The probabilities of token to experts assignment. @@ -552,12 +493,8 @@ def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = N seq_length, bsz = logits.shape[:2] logits = logits.view(-1, self.config.num_moe_experts) - # Flatten padding_mask to [num_tokens] if provided - if padding_mask is not None: - padding_mask = padding_mask.reshape(-1) - # Apply Z-Loss - logits = self.apply_z_loss(logits, padding_mask=padding_mask) + logits = self.apply_z_loss(logits) # Calculate probs and routing_map for token dispatching if self.routing_type == "sinkhorn": @@ -590,35 +527,18 @@ def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = N if self.training and torch.is_grad_enabled() and self.is_aux_loss_enabled(): # Calculate scores and routing_map for aux loss routing_map_for_aux_loss, scores_for_aux_loss = compute_routing_scores_for_aux_loss( - logits, - self.topk, - self.score_function, - fused=self.config.moe_router_fusion, - padding_mask=padding_mask, - ) - probs = self._apply_aux_loss( - probs, - scores_for_aux_loss, - routing_map_for_aux_loss, - with_padding_mask=padding_mask is not None, + logits, self.topk, self.score_function, fused=self.config.moe_router_fusion ) + probs = self._apply_aux_loss(probs, scores_for_aux_loss, routing_map_for_aux_loss) probs = self._apply_seq_aux_loss( - probs, - scores_for_aux_loss, - routing_map_for_aux_loss, - seq_length, - bsz, - with_padding_mask=padding_mask is not None, + probs, scores_for_aux_loss, routing_map_for_aux_loss, seq_length, bsz ) probs = self._apply_global_aux_loss( - probs, - scores_for_aux_loss, - routing_map_for_aux_loss, - with_padding_mask=padding_mask is not None, + probs, scores_for_aux_loss, routing_map_for_aux_loss ) # Optionally apply expert bias - self._apply_expert_bias(routing_map, padding_mask=padding_mask) + self._apply_expert_bias(routing_map) return probs, routing_map @@ -628,15 +548,12 @@ def reset_global_aux_loss_tracker(self): self.global_tokens_per_expert.zero_() self.ga_steps.zero_() - def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): + def forward(self, input: torch.Tensor): """ Forward pass of the router. Args: input (torch.Tensor): Input tensor. - padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. - Shape = [seq_length, bsz]. True=padding(exclude), - False=valid(include). Defaults to None. """ self._maintain_float32_expert_bias() @@ -648,7 +565,7 @@ def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = No # Apply force load balancing with random logits for benchmark logits = apply_random_logits(logits) - probs, routing_map = self.routing(logits, padding_mask=padding_mask) + probs, routing_map = self.routing(logits) return probs, routing_map diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index cbbd7ec00eb..023db1fe75a 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -390,6 +390,7 @@ def build_layer(layer_spec, layer_number): def has_final_layernorm_in_this_stage(self): """ Check if this vpp stage contains the final layernorm. + Note: Final layernorm now has been moved from the post-process stage to the last decoder layer by using this function. @@ -428,18 +429,12 @@ def _checkpointed_forward( attention_bias: Tensor, packed_seq_params: PackedSeqParams, use_inner_quantization_context: bool, - padding_mask: Optional[Tensor] = None, ): """Forward method with activation checkpointing.""" def custom(start: int, end: int): def custom_forward( - hidden_states, - attention_mask, - context, - context_mask, - rotary_pos_emb, - padding_mask=None, + hidden_states, attention_mask, context, context_mask, rotary_pos_emb ): for index in range(start, end): layer = self._get_layer(index) @@ -470,7 +465,6 @@ def custom_forward( attention_bias=attention_bias, inference_context=None, packed_seq_params=packed_seq_params, - padding_mask=padding_mask, ) return hidden_states, context @@ -490,7 +484,6 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, - padding_mask, ) else: return tensor_parallel.checkpoint( @@ -501,7 +494,6 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, - padding_mask, ) if self.config.recompute_method == 'uniform': @@ -607,7 +599,6 @@ def forward( inference_context: Optional[BaseInferenceContext] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, - padding_mask: Optional[Tensor] = None, *, inference_params: Optional[BaseInferenceContext] = None, dynamic_inference_decode_only: Optional[bool] = None, @@ -717,7 +708,6 @@ def forward( attention_bias=attention_bias, packed_seq_params=packed_seq_params, use_inner_quantization_context=use_inner_quantization_context, - padding_mask=padding_mask, ) else: for l_no, layer in enumerate(self.layers): @@ -755,7 +745,6 @@ def forward( inference_context=inference_context, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, - padding_mask=padding_mask, ) if ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 21f38b06f30..3ea40577009 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -457,12 +457,7 @@ def forward(self, *args, **kwargs): # runners in the cuda graph manager kwargs.pop("dynamic_inference_decode_only", None) hidden_states, context = self._forward_attention(*args, **kwargs) - - output = self._forward_mlp( - hidden_states, - kwargs.get("inference_context", None), - padding_mask=kwargs.get("padding_mask", None), - ) + output = self._forward_mlp(hidden_states, kwargs.get("inference_context", None)) return output, context def _forward_attention( @@ -479,7 +474,6 @@ def _forward_attention( inference_context: Optional[Any] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, - padding_mask: Optional[Tensor] = None, *, inference_params: Optional[Any] = None, ): @@ -597,18 +591,12 @@ def _forward_attention( return hidden_states, context - def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None): + def _forward_mlp(self, hidden_states, inference_context=None): """ Perform a forward pass through the feed-forward layer. Args: hidden_states (Tensor): Transformed hidden states before the MLP layernorm. - Shape [seq_length, batch_size, hidden_size]. - inference_context: Inference context for optimizations. - padding_mask (Tensor, optional): Padding mask for MoE routing. - Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). - Only used for MoE layers to exclude padding tokens from aux loss computations. - The MoELayer will internally transform this to [seq_length, bsz] format. Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. @@ -654,7 +642,7 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." - cudagraph_outputs = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) + cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: @@ -668,11 +656,10 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) tensor_parallel.random.get_cuda_rng_tracker, self.pg_collection.tp, pre_mlp_layernorm_output, - padding_mask=padding_mask, ) else: mlp_output_with_bias = tensor_parallel.checkpoint( - self.mlp, False, pre_mlp_layernorm_output, padding_mask=padding_mask + self.mlp, False, pre_mlp_layernorm_output ) elif should_chunk_mlp_for_prefill: # Chunk input along sequence dimension @@ -688,7 +675,7 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) bias_output = torch.stack(bias_chunks, dim=0).sum(dim=0) if bias_chunks else None mlp_output_with_bias = (mlp_output, bias_output) else: - mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) if self.recompute_pre_mlp_layernorm: # discard the output of the pre-mlp layernorm and register the recompute diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index b866fbbf5c2..a497bdbd9de 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -1,4 +1,3 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy import itertools import logging diff --git a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py index 6c59dd3f9e3..81e61a3404a 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py @@ -23,7 +23,7 @@ from tests.unit_tests.test_utilities import Utils -def build_model(config, use_padding_mask=False): +def build_model(config): seq_len = 32 max_seq_len = 300 # ids = random.sample([i for i in range(max_seq_len)], seq_len) @@ -39,12 +39,6 @@ def build_model(config, use_padding_mask=False): "attention_mask": torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda(), } - # Optionally add padding_mask with same shape as input_ids - if use_padding_mask: - padding_mask = torch.zeros((1, seq_len), dtype=torch.bool).cuda() - padding_mask[0, -8:] = True - data["padding_mask"] = padding_mask - # build layer spec transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True) mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True) @@ -54,7 +48,7 @@ def build_model(config, use_padding_mask=False): config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=128, + vocab_size=100, pre_process=True, post_process=True, max_sequence_length=max_seq_len, @@ -180,109 +174,3 @@ def test_1f1b_schedule_model_chunk(self, mtp_layers, dispatcher_type, fp8_flag, gpt_models[i] = None gc.collect() torch.cuda.empty_cache() - - @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0") - @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types()) - @pytest.mark.parametrize("layers", [[2, 1], [1, 1]]) - @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) - def test_1f1b_schedule_model_chunk_with_padding_mask(self, dispatcher_type, layers, tp_size): - """ - Verifies all-to-all overlap optimization with padding_mask produces - the same results as the reference implementation with various TP/EP/CP combinations. - """ - # Re-initialize model parallel with the specified configuration - Utils.destroy_model_parallel() - Utils.initialize_model_parallel( - tensor_model_parallel_size=tp_size, - pipeline_model_parallel_size=1, - expert_model_parallel_size=4, - expert_tensor_parallel_size=1, - ) - set_streams() - - microbatches = 1 - - gpt_models = [] - schedule_plans = [] - ref_captures = [] - datas = [] - - # create TransformerConfig - extra_kwargs = { - "moe_token_dispatcher_type": dispatcher_type, - "tensor_model_parallel_size": tp_size, - "sequence_parallel": tp_size > 1, - } - if dispatcher_type == "flex": - extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" - extra_kwargs["moe_router_dtype"] = "fp32" - with deterministic_mode(): - for layer_num in layers: - output_tensors = [] - # build config - config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs) - # build model with padding_mask - gpt_model, schedule_plan, data = build_model(config, use_padding_mask=True) - gpt_model.cuda() - gpt_models.append(gpt_model) - datas.append(data) - schedule_plans.append(schedule_plan) - - # run reference - for _ in range(microbatches): - loss = gpt_model.forward(**data) - loss = float16_to_fp32(loss) - loss.backward(torch.ones_like(loss)) - output_tensors.append(loss) - - capture = {"outputs": output_tensors} - for name, param in gpt_model.named_parameters(): - capture[name] = param.grad - ref_captures.append(capture) - gpt_model.zero_grad() - assert gpt_models[0].embedding is not None - assert gpt_models[1].embedding is not None - # run a2a overlap - capture_0 = {"outputs": []} - capture_1 = {"outputs": []} - a2a_captures = [capture_0, capture_1] - for i in range(microbatches): - # 1st forward - if i > 0: - assert ( - schedule_plans[0].pre_process is None - ), "pre_process should be released after backward" - schedule_plans[0] = gpt_models[0].build_schedule_plan(**datas[0]) - schedule_plans[1] = gpt_models[1].build_schedule_plan(**datas[1]) - f_input_0 = TransformerModelChunkSchedulePlan.run(schedule_plans[0], None) - capture_0["outputs"].append(f_input_0) - # overlap - f_input_1 = TransformerModelChunkSchedulePlan.run( - schedule_plans[1], schedule_plans[0], b_grad=torch.ones_like(f_input_0) - ) - capture_1["outputs"].append(f_input_1) - # last backward - TransformerModelChunkSchedulePlan.run( - None, schedule_plans[1], b_grad=torch.ones_like(f_input_1) - ) - for i in range(len(gpt_models)): - for name, param in gpt_models[i].named_parameters(): - a2a_captures[i][name] = param.grad - - # compare results - for i in range(len(ref_captures)): - comp_res = compare_captures(ref_captures[i], a2a_captures[i], True, True) - assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}" - - # release resources is necessary, otherwise later testcases will oom - for i in range(len(schedule_plans)): - schedule_plans[i] = None - ref_captures[i] = None - a2a_captures[i] = None - for k in datas[i]: - datas[i][k] = None - datas[i] = None - gpt_models[i].zero_grad() - gpt_models[i] = None - gc.collect() - torch.cuda.empty_cache() diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 5ec096e5a04..7fb97f6e586 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -502,8 +502,8 @@ def test_mtp_layer_overlap(self, dispatcher_type, fp8_flag): position_ids = torch.tensor(data, dtype=torch.int64).repeat((1, 1)).cuda() attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda() # get rotary pos emb - _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _, _padding_mask = ( - gpt_model._preprocess(input_ids, position_ids) + _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _ = gpt_model._preprocess( + input_ids, position_ids ) # reset model params = reset_model(gpt_model) diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py index f5726777383..b1f78582383 100644 --- a/tests/unit_tests/transformer/moe/test_aux_loss.py +++ b/tests/unit_tests/transformer/moe/test_aux_loss.py @@ -576,192 +576,3 @@ def test_force_balanced_aux_loss(self, tp_size, ep_size, cp_size): reduce_from_tensor_model_parallel_region(aux_loss, router.tp_cp_group) assert aux_loss.item() == 1, f"{aux_loss_type}: {aux_loss.item()}" clear_aux_losses_tracker() - - -class TestPaddingMaskAuxLoss: - """Test padding mask support in various aux loss types.""" - - def setup_model_parallel(self, tp_size=1, ep_size=1, cp_size=1, sequence_parallel=False): - """Initialize model parallel with given configuration. - - Args: - tp_size: Tensor parallel size. - ep_size: Expert parallel size. - cp_size: Context parallel size. - """ - Utils.initialize_model_parallel( - tensor_model_parallel_size=tp_size, - pipeline_model_parallel_size=1, - context_parallel_size=cp_size, - expert_model_parallel_size=ep_size, - ) - _set_random_seed(seed_=123, data_parallel_random_init=False) - - # Store parallel configuration - self.tp_size = tp_size - self.ep_size = ep_size - self.cp_size = cp_size - - # Default configuration - self.default_transformer_config = TransformerConfig( - num_layers=1, - hidden_size=12, - num_attention_heads=8, - num_moe_experts=32, - use_cpu_initialization=True, - moe_router_load_balancing_type="aux_loss", - moe_router_topk=8, - moe_aux_loss_coeff=1.0, - bf16=True, - params_dtype=torch.bfloat16, - add_bias_linear=False, - tensor_model_parallel_size=tp_size, - expert_model_parallel_size=ep_size, - context_parallel_size=cp_size, - sequence_parallel=sequence_parallel and tp_size > 1, - ) - - def new_router(self, **kwargs): - """Create a new router with updated configuration.""" - pg_collection = get_default_pg_collection() - new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs) - router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection) - router.set_layer_number(0) - return router - - @pytest.mark.internal - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - @pytest.mark.parametrize("sequence_parallel", [True, False]) - @pytest.mark.parametrize("aux_loss_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"]) - @pytest.mark.parametrize( - "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] - ) - def test_padding_mask_removes_padding_tokens( - self, aux_loss_type, tp_size, ep_size, cp_size, sequence_parallel - ): - """Test that padding tokens are correctly excluded from aux loss calculation.""" - # Initialize model parallel with given configuration - self.setup_model_parallel( - tp_size=tp_size, ep_size=ep_size, cp_size=cp_size, sequence_parallel=sequence_parallel - ) - - try: - clear_aux_losses_tracker() - - router = self.new_router( - moe_router_load_balancing_type=aux_loss_type, - moe_aux_loss_coeff=1.0, - moe_router_dtype="fp64", - ).cuda() - - seq_len = 32 - batch_size = 2 - hidden_size = router.config.hidden_size - - # Create input with padding - hidden_states_full = torch.randn( - (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' - ) - - # Create padding mask: first half valid (False), second half padding (True) - # Convention: True = padding (exclude), False = valid (include) - padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') - padding_mask[seq_len // 2 :, :] = True - - # Test with padding mask - router.weight.grad = None - scores_with_mask, routing_map_with_mask = router( - hidden_states_full, padding_mask=padding_mask - ) - scores_with_mask.backward(torch.zeros_like(scores_with_mask)) - - loss_name = { - "aux_loss": "load_balancing_loss", - "seq_aux_loss": "seq_load_balancing_loss", - "global_aux_loss": "global_load_balancing_loss", - }[aux_loss_type] - - tracker = get_moe_layer_wise_logging_tracker() - aux_loss_with_mask = tracker[loss_name]["values"][0].clone() - grad_with_mask = router.weight.grad.clone() - - # Test without padding (with only half of the tokens) - clear_aux_losses_tracker() - router.weight.grad = None - hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] - scores_without_mask, routing_map_without_mask = router(hidden_states_valid) - scores_without_mask.backward(torch.zeros_like(scores_without_mask)) - - aux_loss_without_mask = tracker[loss_name]["values"][0].clone() - grad_without_mask = router.weight.grad.clone() - - # The aux loss with mask should be close to the aux loss without mask - assert torch.equal(aux_loss_with_mask, aux_loss_without_mask) - assert torch.equal(grad_with_mask, grad_without_mask) - - clear_aux_losses_tracker() - finally: - # Always cleanup model parallel - Utils.destroy_model_parallel() - - @pytest.mark.internal - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - @pytest.mark.parametrize( - "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] - ) - def test_padding_mask_with_z_loss(self, tp_size, ep_size, cp_size): - """Test that padding mask works correctly with z_loss.""" - # Initialize model parallel with given configuration - self.setup_model_parallel(tp_size=tp_size, ep_size=ep_size, cp_size=cp_size) - - try: - clear_aux_losses_tracker() - - router = self.new_router( - moe_router_load_balancing_type="aux_loss", - moe_aux_loss_coeff=0.0, - moe_z_loss_coeff=1.0, - moe_router_dtype="fp32", - ).cuda() - - seq_len = 32 - batch_size = 2 - hidden_size = router.config.hidden_size - - # Create input - hidden_states_full = torch.randn( - (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' - ) - - # Create padding mask: first half valid (False), second half padding (True) - # Convention: True = padding (exclude), False = valid (include) - padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') - padding_mask[seq_len // 2 :, :] = True - - # Test with padding mask - router.weight.grad = None - scores_with_mask, _ = router(hidden_states_full, padding_mask=padding_mask) - scores_with_mask.sum().backward() - - tracker = get_moe_layer_wise_logging_tracker() - z_loss_with_mask = tracker["z_loss"]["values"][0].clone() - grad_with_mask = router.weight.grad.clone() - - # Test without padding (with only half of the tokens) - clear_aux_losses_tracker() - router.weight.grad = None - hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] - scores_without_mask, _ = router(hidden_states_valid) - scores_without_mask.sum().backward() - - z_loss_without_mask = tracker["z_loss"]["values"][0].clone() - grad_without_mask = router.weight.grad.clone() - - # The z_loss with mask should be close to the z_loss without mask - assert torch.equal(z_loss_with_mask, z_loss_without_mask) - assert torch.equal(grad_with_mask, grad_without_mask) - - clear_aux_losses_tracker() - finally: - # Always cleanup model parallel - Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py index abd1a4db2dc..677d938cdc7 100644 --- a/tests/unit_tests/transformer/moe/test_routers.py +++ b/tests/unit_tests/transformer/moe/test_routers.py @@ -125,53 +125,6 @@ def test_aux_loss(self): out.sum().mul_(0).backward() assert self.sequential_mlp.router.weight.grad.abs().sum() > 0 - @pytest.mark.internal - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - def test_router_with_padding_mask(self): - """Test that padding mask correctly excludes padding tokens from routing.""" - self.router = self.router.cuda() - seq_len = 32 - batch_size = 2 - hidden_size = self.router.config.hidden_size - - # Create input with shape [seq_len, batch_size, hidden_size] - hidden_states = torch.randn((seq_len, batch_size, hidden_size)).cuda().bfloat16() - - # Create padding mask: first half valid (False), second half padding (True) - # padding_mask shape: [seq_len, batch_size] - # Convention: True = padding (exclude), False = valid (include) - padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') - padding_mask[seq_len // 2 :, :] = True # Second half is padding - - # Test forward pass with padding mask - with torch.no_grad(): - probs_with_mask, routing_map_with_mask = self.router( - hidden_states, padding_mask=padding_mask - ) - - # Test forward pass without padding mask (only valid tokens) - hidden_states_valid = hidden_states[: seq_len // 2, :, :] - probs_without_mask, routing_map_without_mask = self.router(hidden_states_valid) - - # The valid part of routing with mask should match routing without mask - probs_valid_part = probs_with_mask.reshape(seq_len, batch_size, -1)[ - : seq_len // 2, :, : - ] - probs_valid_part = probs_valid_part.reshape(-1, probs_valid_part.shape[-1]) - - # Check that shapes are as expected - assert probs_with_mask.shape == ( - seq_len * batch_size, - self.router.config.num_moe_experts, - ) - assert routing_map_with_mask.shape == ( - seq_len * batch_size, - self.router.config.num_moe_experts, - ) - - # Verify that probs for valid tokens are similar - assert torch.equal(probs_valid_part, probs_without_mask) - @pytest.mark.internal @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_router_dtype(self): From 9885ddb8e08e05786d88b28ee4698739d38a91ae Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Tue, 30 Dec 2025 11:26:53 +0800 Subject: [PATCH 206/248] [Dev] Disable ep overlap memory optimization (#2750) --- megatron/core/models/gpt/fine_grained_callables.py | 5 +++-- megatron/core/pipeline_parallel/utils.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 741a25326fb..a0be55c4ca1 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -304,8 +304,9 @@ def backward_dw(self): # the output grad memory is last used in wgrad compute, should be safe to release. assert self.delay_grads_release, "output grad memory should be valid before wgrad." - for tensor in self.output_grads: - tensor.untyped_storage().resize_(0) + if self.manual_release_grads: + for tensor in self.output_grads: + tensor.untyped_storage().resize_(0) self.output_grads = None self.bwd_dw_callables = None diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index 52d401c79f9..e7e416f99bd 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -183,6 +183,7 @@ def __init__( self.inputs = None self.outputs = None self.delay_grads_release = False + self.manual_release_grads = False def default_backward_func(self, outputs, output_grad): """Default backward function""" @@ -268,7 +269,7 @@ def _backward(self, *output_grad): # to avoid delayed garbage collection. If # delay_grads_release is True, dgrad is last used in # wgrad compute and skip the release here. - if not self.delay_grads_release: + if self.manual_release_grads and not self.delay_grads_release: g.untyped_storage().resize_(0) grads = self.get_grad() From 929e77f76585668b2dcfcf4c5ff4160831a14235 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Tue, 30 Dec 2025 13:19:28 -0800 Subject: [PATCH 207/248] feat: Cherry-pick PR of PR!2661 for dev branch (#2757) Signed-off-by: Youngeun Kwon --- .../distributed_data_parallel_config.py | 8 ++++ megatron/core/distributed/fsdp/src/README.md | 7 ++- .../distributed_data_parallel_config.py | 8 ++++ .../megatron_fsdp/param_and_grad_buffer.py | 44 +++++++++++++++++ megatron/core/nccl_allocator.py | 48 +++++++++++++++++++ megatron/training/arguments.py | 9 +++- megatron/training/training.py | 14 ++++++ .../test_mcore_fully_sharded_data_parallel.py | 20 ++++++-- 8 files changed, 151 insertions(+), 7 deletions(-) diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py index 3f97beab825..eaec971c79c 100644 --- a/megatron/core/distributed/distributed_data_parallel_config.py +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -137,6 +137,14 @@ class DistributedDataParallelConfig: when nccl_ub is set. """ + fsdp_manual_registration: bool = False + """If true, manually register the FSDP communication buffers to NCCL user buffer. + This option is only effective when use_megatron_fsdp and nccl_ub is set. + For symmetric registration with large models, the registration itself can take + a significant amount of time. This option minimizes the number of registration calls + to minimize the registration time. + """ + delay_wgrad_compute: bool = False """Delay the weight gradient computation to improve batch-level communication overlapping""" diff --git a/megatron/core/distributed/fsdp/src/README.md b/megatron/core/distributed/fsdp/src/README.md index 9e036f22f67..b4d81b2b368 100644 --- a/megatron/core/distributed/fsdp/src/README.md +++ b/megatron/core/distributed/fsdp/src/README.md @@ -220,13 +220,16 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"]) - **Only effective when using Megatron-LM.** - Defaults to `False`. - `nccl_ub` will allocate and register the NCCL userbuffer for param and grad buffers. This option enables an SM-efficient NCCL algorithm that could improve the performance of overlapped computations. This flag will be much more effective when used together with SHARP if the FSDP communication includes both NVL and IB domains. Enabling this option will cause additional memory overhead due to the requirement to enable the `fsdp_double_buffer` option. - - **Only effective when using Megatron-LM.** + - **Only effective when using with Megatron-Core.** - Defaults to `False`. - By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registraion. +- `fsdp_manual_registration` will manually register the FSDP communication buffers with the NCCL user buffer. For symmetric registration with large models, the registration itself can take a significant amount of time. This option minimizes the number of registration calls to reduce the registration time. However, with this option enabled, you need to manually call the `ParamAndGradBuffer.manual_buffer_registration()` function after the first iteration. This is already implemented in the Megatron-LM training loop. In other use cases, users are expected to call this function themselves. + - **Only effective when using with Megatron-Core.** + - This option is only effective when `nccl_ub` is enabled. + - Defaults to `False`. - `disable_symmetric_registration` will disable NCCL window (i.e. symmetric) registraion when using `nccl_ub`. - Dafaults to `False`. - `fsdp_double_buffer` will use persistently allocated double buffers for temporarily-defined memory needed in `MegatronFSDP` communications. Having persistent double buffers may increase peak VRAM utilization, but is required to register NCCL user buffers (`nccl_ub=True`) for `MegatronFSDP`. Currently, this is only supported for simple repetitive model structures such as GPT. - - **Only effective when using Megatron-LM.** - Defaults to `False`. Automatically overridden to `True` when `nccl_ub` is enabled. - `preproc_state_dict_for_dcp_ckpt` adds `model.state_dict()` and `optimizer.state_dict()` post-hooks that modify the model and optimizer state in preparation for `torch.distributed.checkpoint.{save,load}` ([Torch DCP](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html)) checkpointing. Specifically, it adds `__create_write_items__` and `__create_chunk_list__` methods to Tensors utilized by Torch DCP to redistribute parameters when saving and loading model and optimizer checkpoints. Can be deactivated should the user need a custom distributed checkpointing strategy. - Defaults to `True`. diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py index 86826758498..f0c817e1f80 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py @@ -131,6 +131,14 @@ class DistributedDataParallelConfig: when nccl_ub is set. """ + fsdp_manual_registration: bool = False + """If true, manually register the FSDP communication buffers to NCCL user buffer. + This option is only effective when use_megatron_fsdp and nccl_ub is set. + For symmetric registration with large models, the registration itself can take + a significant amount of time. This option minimizes the number of registration calls + to minimize the registration time. + """ + def __post_init__(self): import os diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index b0154cb94e9..46b97743385 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -1570,6 +1570,7 @@ def __init__( reset_parameters_for_meta_device_init_module ) self.ubr_groups = None + self.already_registered = False # User buffer registration related settings if self.ddp_config.nccl_ub: assert nccl_allocator is not None, ( @@ -1676,6 +1677,10 @@ def get_mem_alloc_context(self, groups=None, symmetric=True): groups = [self.dist_index.get_fsdp_group(is_expert_parallel=False)] if NCCL_ALLOCATOR == "MCORE": + if self.ddp_config.fsdp_manual_registration: + return functools.partial( + nccl_allocator.MemPoolAllocatorWithoutRegistration, NCCL_MEMORY_POOL + ) if len(groups) == 1: # register buffers to the default group directly using nccl memory allocator mem_alloc_context = functools.partial( @@ -1692,6 +1697,12 @@ def get_mem_alloc_context(self, groups=None, symmetric=True): symmetric=symmetric, ) elif NCCL_ALLOCATOR == "APEX": + if self.ddp_config.fsdp_manual_registration: + logging.warning( + "FSDP manual registration is not supported for APEX NCCL allocator." + "falling back to default registration. " + "Please use Megatron Core NCCL allocator for manual registration." + ) if symmetric: logging.warning( "Symmetric registration is not supported for APEX NCCL allocator." @@ -1715,6 +1726,39 @@ def get_mem_alloc_context(self, groups=None, symmetric=True): else: return nullcontext + def manual_buffer_registration(self): + """ + Manually register the FSDP communication buffers to NCCL user buffer. + """ + assert self.ddp_config.nccl_ub, "NCCL UBR is not enabled" + assert self.ddp_config.fsdp_double_buffer, "FSDP double buffer is not enabled" + assert self.ddp_config.fsdp_manual_registration, "FSDP manual registration is not enabled" + assert not self.already_registered, "Mem pool is already registered" + + self.already_registered = True + + global NCCL_MEMORY_POOL + torch.cuda.synchronize() + torch.distributed.barrier(async_op=False) + torch.cuda.synchronize() + + for group in self.ubr_groups: + if torch.distributed.get_rank() == 0: + logging.info( + f"[MCORE][FSDP][Manual REG] Registering mem pool to group {group}," + f"group.group_desc:{group.group_desc}, group.size(): {group.size()}" + ) + nccl_allocator.register_mem_pool( + NCCL_MEMORY_POOL, + group, + symmetric=not self.ddp_config.disable_symmetric_registration, + ) + if torch.distributed.get_rank() == 0: + logging.info( + f"[MCORE][FSDP][Manual REG] Registered mem pool to group {group}," + f"group.group_desc:{group.group_desc}, group.size(): {group.size()}" + ) + def _log_parameter_groups(self): """Compact log of FSDP parameter groups and their parameters.""" diff --git a/megatron/core/nccl_allocator.py b/megatron/core/nccl_allocator.py index b46157e9d00..8eb4047634c 100644 --- a/megatron/core/nccl_allocator.py +++ b/megatron/core/nccl_allocator.py @@ -156,6 +156,37 @@ def init() -> None: logging.info(f"[MCORE][NCCL_ALLOCATOR] Initialized NCCL Allocator") +# register_mem_pool/deregister_mem_pool are used for manual (de)registration of the memory pool. +# They are used in the case of FSDP manual registration. +def register_mem_pool(pool, group, symmetric=True): + """ + Register a memory pool to a group. + symmetric: bool, this is for future use. + """ + backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) + if symmetric: + try: + backend.register_mem_pool(pool, symm=symmetric) + except TypeError: + # Older PyTorch/APIs without 'symm' keyword. + logging.warning( + f"[MCORE][NCCL_ALLOCATOR] Failed in symmetric registration." + f"Falling back to registration api without 'symm' keyword!!" + ) + backend.register_mem_pool(pool) + else: + backend.register_mem_pool(pool) + + +def deregister_mem_pool(pool, group): + """ + Deregister a memory pool from a group. + """ + backend = group._get_backend(torch.device("cuda", torch.cuda.current_device())) + if pool.snapshot(): + backend.deregister_mem_pool(pool) + + # Preserve the original APEX NCCL allocator interface for backward compatibility class nccl_mem: """ @@ -314,3 +345,20 @@ def __exit__(self, *args): f"{repr(group)}({desc}) group!!" ) self.mem_context.__exit__(*args) + + +class MemPoolAllocatorWithoutRegistration: + """ + An allocator class that uses allocates memory without registering to any communication group. + Users are expected to register the memory manually to the communication groups. + """ + + def __init__(self, pool): + self.pool = pool + self.mem_context = torch.cuda.use_mem_pool(self.pool) + + def __enter__(self): + self.mem_context.__enter__() + + def __exit__(self, *args): + self.mem_context.__exit__(*args) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index b267c8a8170..0fc00bd91be 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -744,10 +744,14 @@ def validate_args(args, defaults={}): assert args.ckpt_format == "fsdp_dtensor", \ "Megatron FSDP only supports fsdp_dtensor checkpoint format" - + if args.use_megatron_fsdp: args.reuse_grad_buf_for_mxfp8_param_ag = False + if args.fsdp_manual_registration: + assert args.use_megatron_fsdp, "FSDP manual registration is only supported with Megatron FSDP" + assert args.nccl_ub, "FSDP manual registration is only supported with nccl-ub option" + # Parameters dtype. args.params_dtype = torch.float if args.fp16: @@ -2773,6 +2777,9 @@ def _add_distributed_args(parser): group.add_argument('--disable-symmetric-registration', action='store_true', dest='disable_symmetric_registration', default=False, help='Disable symmetric (window) registration for NCCL userbuffer registration.' 'This option will force to use conventional (local) userbuffer registration when use-nccl-ub is set.') + group.add_argument('--fsdp-manual-registration', action='store_true', dest='fsdp_manual_registration', + default=False, help='Manually register the FSDP communication buffers to NCCL user buffer.' + 'This option is only effective when use-megatron-fsdp and use-nccl-ub is set.') group.add_argument('--use-sharp', action='store_true', help='Required to enable SHARP communication.') group.add_argument('--sharp-enabled-group', type=str, default=None, diff --git a/megatron/training/training.py b/megatron/training/training.py index 459e77e6c81..f006772bbdd 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -2517,6 +2517,20 @@ def get_e2e_base_metrics(): iteration += 1 + # If requested, manually register FSDP communication buffers after a short warmup. + if ( + getattr(args, "fsdp_manual_registration", False) + and getattr(args, "use_megatron_fsdp", False) + and iteration == start_iteration + 1 + ): + for model_chunk in model: + if isinstance(model_chunk, megatron_FSDP) and getattr( + model_chunk.ddp_config, "fsdp_manual_registration", False + ): + pad_buf = getattr(model_chunk, "param_and_grad_buffer", None) + if pad_buf is not None: + pad_buf.manual_buffer_registration() + if getattr(args, 'perform_rl_step', False) and args.rl_use_sequence_packing: iteration_sequences = rl_utils.get_iteration_sequence_count(args) # Track bins separately for packed mode diff --git a/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py b/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py index 3b41daf58ef..3f0cce4e40b 100644 --- a/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py +++ b/tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py @@ -220,13 +220,16 @@ def train_step(model, optimizer, inputs): # Testing fsdp_double_buffer with and without nccl_ub @pytest.mark.parametrize( - ("dp_size", "nccl_ub", "fsdp_double_buffer"), [(8, False, True), (8, True, True)] + ("dp_size", "nccl_ub", "fsdp_double_buffer", "fsdp_manual_registration"), + [(8, False, True, False), (8, True, True, False), (8, True, True, True)], ) - def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffer): + def test_fsdp_user_buffer_registration( + self, dp_size, nccl_ub, fsdp_double_buffer, fsdp_manual_registration + ): """Test that FSDP works correctly with user buffer registration. This test compares the training results of the baseline fsdp with the target fsdp config. - Baseline fsdp: nccl_ub=False, fsdp_double_buffer=False - Target fsdp: nccl_ub=[True, False], fsdp_double_buffer=[True, False] + Baseline fsdp: nccl_ub=False, fsdp_double_buffer=False, fsdp_manual_registration=False + Target fsdp: nccl_ub=[True, False], fsdp_double_buffer=[True, False], fsdp_manual_registration=[True, False] """ if not is_torch_min_version("2.4.0"): pytest.skip("Megatron FSDP requires torch >= 2.4.0") @@ -264,6 +267,7 @@ def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffe use_megatron_fsdp=True, nccl_ub=False, fsdp_double_buffer=False, + fsdp_manual_registration=False, ) # Setup FSDP config - target fsdp config @@ -275,6 +279,7 @@ def test_fsdp_user_buffer_registration(self, dp_size, nccl_ub, fsdp_double_buffe use_megatron_fsdp=True, nccl_ub=nccl_ub, fsdp_double_buffer=fsdp_double_buffer, + fsdp_manual_registration=fsdp_manual_registration, ) # Create two identical models @@ -354,6 +359,13 @@ def train_step(model, optimizer, inputs): out1, loss1 = train_step(baseline_fsdp_model, optimizer1, input_data) out2, loss2 = train_step(target_fsdp_model, optimizer2, input_data) + # In case of manual registration, we need to manually register the buffer + # And proceed one more step to check the results + if fsdp_manual_registration: + out1, loss1 = train_step(baseline_fsdp_model, optimizer1, input_data) + target_fsdp_model.manual_buffer_registration() + out2, loss2 = train_step(target_fsdp_model, optimizer2, input_data) + testing.assert_close(out1, out2, rtol=0, atol=0) testing.assert_close(loss1, loss2, rtol=0, atol=0) From 922e8e9080611d6432276115666659301f4f874f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 30 Dec 2025 22:49:53 -0600 Subject: [PATCH 208/248] cp: Allow disabling external contributors (#2784) (#2786) Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a5a7a82287e..1ce96750a36 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -52,6 +52,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} REPO: ${{ github.repository }} + DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }} steps: - name: Checkout repository uses: actions/checkout@v4 @@ -86,6 +87,43 @@ jobs: # Use SSO membership check result IS_MEMBER="${{ steps.check-sso.outputs.is_member }}" + + # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo + if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then + PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} + + echo "Checking if $PR_AUTHOR is a repo collaborator..." + API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" + REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." + API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" + ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." + API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" + ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then + IS_MEMBER="true" + else + exit 1 + fi + fi + + # Use SSO membership check result if [ "$IS_MEMBER" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT else From 5455f0a010eadc81d2de48b0b94dccafd7c08a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 3 Jan 2026 18:00:06 +0100 Subject: [PATCH 209/248] build: Pin down `nvidia-nvshmem-cu13` (#2798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- docker/Dockerfile.ci.dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 482c6af460c..fa4d84bcad0 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -55,7 +55,7 @@ EOF COPY docker/patches/deepep.patch /workspace/deepep.patch RUN bash -ex <<"EOF" cd /workspace - uv pip install nvidia-nvshmem-cu13 + uv pip install nvidia-nvshmem-cu13==3.4.5 pushd /opt/venv/lib/python3.12/site-packages/nvidia/nvshmem/lib/ ln -s libnvshmem_host.so.3 libnvshmem_host.so popd From 71d5c84980aecd3be48ed4df368c70302f5560e3 Mon Sep 17 00:00:00 2001 From: Kunlun Li <94586211+kunlunl@users.noreply.github.com> Date: Mon, 5 Jan 2026 14:07:54 +0800 Subject: [PATCH 210/248] [dev] Fix bug of reuse_grad_buf_for_mxfp8_param_ag (#2801) Signed-off-by: kunlunl --- megatron/training/training.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/megatron/training/training.py b/megatron/training/training.py index f006772bbdd..91cd420c214 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1401,10 +1401,19 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch # For the mxfp8_param with reuse_grad_buf_for_mxfp8_param_ag and dp_ag_overlap, # we need to call the _copy_main_params_to_param_buffer() after the grad buffer # is zeroed by zero_grad_buffer() because param and grad buffer are shared. + # + # However, we should skip this on the first iteration when forward_pre_hook is disabled, + # because: + # 1. The first iteration's params are already in param.data (from init or checkpoint). + # 2. Without forward_pre_hook, finish_param_sync() won't be called to zero the grad buffer, + # so the main grads will be polluted by the main params. if args.reuse_grad_buf_for_mxfp8_param_ag and args.overlap_param_gather: - for optim_instance in optimizer.chained_optimizers: - if isinstance(optim_instance, DistributedOptimizer): - optim_instance._copy_main_params_to_param_buffer() + # Check if forward_pre_hook is enabled by checking if hooks are registered. + forward_pre_hook_enabled = len(model[0].remove_forward_pre_hook_handles) > 0 + if forward_pre_hook_enabled: + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance._copy_main_params_to_param_buffer() # Forward pass. losses_reduced = forward_backward_func( From 8b93e0d6ef0a5ca6ef3c1993b0728447a8ddc4b8 Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Mon, 5 Jan 2026 16:08:58 +0800 Subject: [PATCH 211/248] [Dev] Partial CUDA Graph support for EP Overlap (#2168) --- .../common/model_chunk_schedule_plan.py | 40 +- .../core/models/gpt/fine_grained_callables.py | 204 ++++++---- megatron/core/pipeline_parallel/schedules.py | 105 +++++ megatron/core/pipeline_parallel/utils.py | 4 +- megatron/core/transformer/cuda_graphs.py | 84 +++- megatron/core/transformer/moe/moe_layer.py | 7 +- .../core/transformer/transformer_config.py | 15 + .../core/transformer/transformer_layer.py | 36 ++ .../test_cuda_graphed_schedule_chunk_1f1b.py | 372 ++++++++++++++++++ .../a2a_overlap/test_schedule_layer_1f1b.py | 2 +- tests/unit_tests/a2a_overlap/utils.py | 1 + .../pipeline_parallel/test_schedules.py | 48 +++ .../transformer/test_submodule_callables.py | 16 +- 13 files changed, 804 insertions(+), 130 deletions(-) create mode 100644 tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 486a498dd73..04ca580eeaa 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -17,6 +17,7 @@ get_comm_stream, get_comp_stream, ) +from megatron.core.transformer.enums import CudaGraphScope class ModelChunkState: @@ -37,23 +38,20 @@ class TransformerLayerSchedulePlan: mtp post process nodes. layer (TransformerLayerSchedulePlan) - ├── attn (TransformerLayerNode): attention module - ├── post_attn (TransformerLayerNode): layernorm -> router -> dispatch preprocess + ├── attn (TransformerLayerNode): attention -> router -> dispatch preprocess ├── moe_dispatch (TransformerLayerNode): dispatch All2All ├── mlp (TransformerLayerNode): mlp module ├── moe_combine (TransformerLayerNode): combine All2All └── mtp_post_process (PostProcessNode): mtp post process Note that MTP layer has the same operation and execution order with TransformerLayer regarding - post_attn, moe_dispatch, mlp, moe_combine, but contains extra operations in attn and - mtp_post_process: + moe_dispatch, mlp, moe_combine, but contains extra operations in attn and mtp_post_process: * mtp.attn wraps around transformer_layer.attn with extra norm, proj and embedding operations. * mtp.mtp_post_process contains output_layer, mtp loss operations, whereas transformer_layer.mtp_post_process is empty. """ attn = None - post_attn = None moe_dispatch = None mlp = None moe_combine = None @@ -117,7 +115,7 @@ def release_state(self): def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): """ Builds the callable nodes for the transformer/mtp layer: - attn, post_attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. + attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. """ from megatron.core.models.gpt.fine_grained_callables import ( TransformerLayerNode, @@ -137,16 +135,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): else isinstance(self.layer.mlp, MoELayer) ) - enable_deepep = ( - self.layer.config.moe_token_dispatcher_type == "flex" - and self.layer.config.moe_flex_dispatcher_backend == "deepep" - ) - enable_hybridep = ( - self.layer.config.moe_token_dispatcher_type == "flex" - and self.layer.config.moe_flex_dispatcher_backend == "hybridep" - ) - extra_args["enable_deepep"] = enable_deepep - extra_args["enable_hybridep"] = enable_hybridep + extra_args["config"] = self.layer.config extra_args["is_moe"] = is_moe extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute extra_args["is_mtp"] = is_mtp @@ -167,7 +156,6 @@ def create_node(stream, module, name): ( attn_module, - post_attn_module, moe_dispatch_module, mlp_module, moe_combine_module, @@ -179,11 +167,9 @@ def create_node(stream, module, name): self.attn = create_node(comp_stream, attn_module, "attn") self.mlp = create_node(comp_stream, mlp_module, "mlp") if is_moe: - self.post_attn = create_node(comp_stream, post_attn_module, "post_attn") self.moe_dispatch = create_node(comm_stream, moe_dispatch_module, "moe_dispatch") self.moe_combine = create_node(comm_stream, moe_combine_module, "moe_combine") else: - self.post_attn = NoopScheduleNode() self.moe_dispatch = NoopScheduleNode() self.moe_combine = NoopScheduleNode() @@ -194,6 +180,11 @@ def create_node(stream, module, name): else: self.mtp_post_process = NoopScheduleNode() + # mlp and combine may receive dgrad from attn, which is managed by cuda graph. + if CudaGraphScope.attn in self.config.cuda_graph_scope: + self.mlp.manual_grads_release = False + self.moe_combine.manual_grads_release = False + def get_fp8_context(self): """ Get the fp8 context for the transformer layer. @@ -216,8 +207,8 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) to maximize parallelism and efficiency. When f_layer and b_layer are not None, forward and backward pass are overlapped as follows: - comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd - comp_stream: attn_fwd->post_attn_fwd| mlp_bwd->mlp_bwd_dw->mlp_fwd| post_attn_bwd->attn_bwd + comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd + comp_stream: attn_fwd | mlp_bwd->mlp_bwd_dw->mlp_fwd| attn_bwd For MTP, mtp_post_process_fwd is executed after the combine_fwd in the comp_stream, and mtp_post_process_bwd is executed before the combine_bwd in the comp_stream. @@ -240,7 +231,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) if f_layer is not None: with f_layer.get_fp8_context(): f_input = f_layer.attn.forward(f_input) - f_input = f_layer.post_attn.forward(f_input) if b_layer is not None: b_grad = b_layer.mlp.backward(b_grad) @@ -254,7 +244,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) b_grad = b_layer.moe_dispatch.backward(b_grad) if b_layer is not None and b_layer.config.ep_overlap_early_attn_memory_release: - b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) if f_layer is not None: @@ -267,7 +256,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) f_input = f_layer.mtp_post_process.forward(f_input) if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release: - b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) # Delay the last attn_dw in backward pass (attn_dw of the first layer) @@ -369,6 +357,10 @@ def __init__( model, self._model_chunk_state, self._event, comp_stream ) + # preprocess may receive dgrad from attn, which is managed by cuda graph. + if CudaGraphScope.attn in model.config.cuda_graph_scope: + self.pre_process.manual_grads_release = False + def _build_layer_schedule_plan(self, module, comp_stream, comm_stream): if module is None: return diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index a0be55c4ca1..ab76659d01b 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -6,14 +6,15 @@ from typing import Optional import torch +from torch import Tensor from megatron.core import tensor_parallel +from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, ) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.multi_token_prediction import ( @@ -42,14 +43,13 @@ def wrapped_func(*args, **kwarg): @internal_api -def should_free_input(name, is_moe, enable_deepep, enable_hybridep): +def should_free_input(name, is_moe, config): """Determine if the node should free its input memory. Args: name: Node name is_moe: Whether it's a MoE model - enable_deepep: Whether to use DeepEP dispatcher - enable_hybridep: Whether to use HybridEP dispatcher + config: TransformerConfig object Returns: bool: Whether to free input memory @@ -57,6 +57,14 @@ def should_free_input(name, is_moe, enable_deepep, enable_hybridep): # For dense layers [attn, fake, mlp, fake], the input is needed during backward pass if not is_moe: return False + enable_deepep = ( + config.moe_token_dispatcher_type == "flex" + and config.moe_flex_dispatcher_backend == "deepep" + ) + enable_hybridep = ( + config.moe_token_dispatcher_type == "flex" + and config.moe_flex_dispatcher_backend == "hybridep" + ) # Define which nodes should free input memory # Since we split the computing graph into multiple nodes, we can manually control # when and how to free the input memory. @@ -69,7 +77,10 @@ def should_free_input(name, is_moe, enable_deepep, enable_hybridep): # and probs before dispatch A2A and it's not needed anymore after the forward pass # For DeepEP and HybridEP dispatcher mode, they are both needed in backward pass # and cannot be freed. - "moe_dispatch": not (enable_deepep or enable_hybridep), + # If moe_preprocess is in cuda graph scope, tokens and probs are fixed size tensors, + # so they cannot be freed. + "moe_dispatch": not (enable_deepep or enable_hybridep) + and (CudaGraphScope.moe_preprocess not in config.cuda_graph_scope), } return free_input_nodes.get(name, False) @@ -232,13 +243,13 @@ def __init__( it's the per_batch_state_context, o.w. nullcontext name (str): Node name, also used to determine memory strategy bwd_dw_callables (list): List of weight gradient functions for the layer. - extra_args (dict): Extra arguments for nodes: is_moe, enable_deepep, enable_hybridep. + extra_args (dict): Extra arguments for the node: is_moe, config. """ # determine whether to free input memory + config = extra_args.get("config", None) + assert config is not None, "model config must be passed to TransformerLayerNode." is_moe = extra_args.get("is_moe", False) - enable_deepep = extra_args.get("enable_deepep", False) - enable_hybridep = extra_args.get("enable_hybridep", False) - free_input = should_free_input(name, is_moe, enable_deepep, enable_hybridep) + free_input = should_free_input(name, is_moe, config) self.delay_wgrad_compute = extra_args.get("delay_wgrad_compute", False) super().__init__( @@ -303,8 +314,8 @@ def backward_dw(self): module.backward_dw() # the output grad memory is last used in wgrad compute, should be safe to release. - assert self.delay_grads_release, "output grad memory should be valid before wgrad." - if self.manual_release_grads: + if self.manual_grads_release: + assert self.delay_grads_release, "output grad memory should be valid before wgrad." for tensor in self.output_grads: tensor.untyped_storage().resize_(0) self.output_grads = None @@ -357,11 +368,95 @@ def build_transformer_layer_callables(layer: TransformerLayer): and layer.config.moe_flex_dispatcher_backend == "hybridep" ) + class _BackwardDWWrapper: + def __init__(self): + self.graphed_backward_dw_callable = None + self.attn_dw_callable = layer.self_attention.backward_dw + if isinstance(layer.mlp, MoELayer): + self.shared_expert_dw_callable = partial( + layer.mlp.backward_dw, routed_experts=False, shared_experts=True + ) + else: + self.shared_expert_dw_callable = None + self.cuda_graph_scope = layer.config.cuda_graph_scope + + def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable): + """Store the CUDA graphed backward weight gradient callable.""" + self.graphed_backward_dw_callable = graphed_backward_dw_callable + + def backward_dw(self): + """Execute weight gradients, skipping CUDA graphed components during replay.""" + is_replay = hasattr(layer, 'cuda_graphs') and layer.cuda_graphs + if self.shared_expert_dw_callable is not None and ( + not is_replay or CudaGraphScope.moe_router not in self.cuda_graph_scope + ): + self.shared_expert_dw_callable() + if not is_replay or CudaGraphScope.attn not in self.cuda_graph_scope: + self.attn_dw_callable() + if is_replay and self.graphed_backward_dw_callable is not None: + self.graphed_backward_dw_callable() + + attn_backward_dw_wrapper = _BackwardDWWrapper() + def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): """ - Performs same attnention forward logic as GPT Model. + Performs same attnention forward logic as GPT Model and forward pass for + computations between attention and dispatch: + pre mlp layernorm->router->dispatch preprocess """ - hidden_states, _ = layer._forward_attention( + + if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: + assert ( + CudaGraphScope.mlp not in layer.config.cuda_graph_scope + and CudaGraphScope.moe not in layer.config.cuda_graph_scope + ), ( + "Supported CUDA graph scope with EP overlap: " + "attn, moe_router, moe_preprocess, mlp, got {}".format( + layer.config.cuda_graph_scope + ) + ) + forward_func = layer._te_cuda_graph_replay + attn_backward_dw_wrapper.set_graphed_backward_dw_callable( + partial(layer.backward_dw_cudagraph, layer.current_microbatch) + ) + else: + # wrapper function that keeps consistent api with cuda graph replay + def forward_func( + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + rotary_pos_emb: Optional[Tensor] = None, + rotary_pos_cos: Optional[Tensor] = None, + rotary_pos_sin: Optional[Tensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + sequence_len_offset: Optional[Tensor] = None, + ): + hidden_states, _ = layer._forward_attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) + if not isinstance(layer.mlp, MoELayer): + return hidden_states, None, None, None + if layer.recompute_pre_mlp_layernorm: + layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) + else: + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + + shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) + probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) + local_tokens, probs, _ = layer.mlp.preprocess( + pre_mlp_layernorm_output, probs, routing_map + ) + return hidden_states, local_tokens, probs, shared_expert_output + + hidden_states, local_tokens, probs, shared_expert_output = forward_func( hidden_states=hidden_states, attention_mask=node.chunk_state.attention_mask, rotary_pos_emb=node.chunk_state.rotary_pos_emb, @@ -370,33 +465,14 @@ def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): packed_seq_params=node.chunk_state.packed_seq_params, sequence_len_offset=node.chunk_state.sequence_len_offset, ) - return hidden_states - - def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): - """ - Run forward pass for computations between attention and dispatch: - pre mlp layernorm->router->dispatch preprocess - """ - if layer.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") - if layer.recompute_pre_mlp_layernorm: - layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) - else: - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) - - probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) - local_tokens, probs, _ = layer.mlp.preprocess(pre_mlp_layernorm_output, probs, routing_map) + if not isinstance(layer.mlp, MoELayer): + return hidden_states # Detach here for mlp_bda residual connection node.layer_state.residual = node.detach(hidden_states) if layer.mlp.use_shared_expert and not layer.mlp.shared_expert_overlap: - # Detach here for shared expert connection - node.layer_state.pre_mlp_layernorm_output = node.detach(pre_mlp_layernorm_output) + # Detach here for shared expert connection in moe_combine + node.layer_state.shared_expert_output = node.detach(shared_expert_output) return local_tokens, probs @@ -421,7 +497,6 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): Run forward pass for computations between dispatch and combine: post dispatch->experts->combine preprocess """ - shared_expert_output = None dispatched_probs = node.layer_state.dispatched_probs token_dispatcher = layer.mlp.token_dispatcher if enable_deepep or enable_hybridep: @@ -429,10 +504,8 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # backward graph from connecting to dispatch submodule token_dispatcher._comm_manager.dispatched_probs = dispatched_probs - pre_mlp_layernorm_output = getattr(node.layer_state, 'pre_mlp_layernorm_output', None) - shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) - expert_output, mlp_bias = layer.mlp.routed_experts_compute( - dispatched_tokens, dispatched_probs, pre_mlp_layernorm_output + expert_output, _ = layer.mlp.routed_experts_compute( + dispatched_tokens, dispatched_probs, None ) if layer.recompute_pre_mlp_layernorm: @@ -442,16 +515,10 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # release tensor reference after use node.layer_state.dispatched_probs = None node.layer_state.pre_mlp_layernorm_output = None - if shared_expert_output is None: - # Return only expert_output, since shared_expert_output causes backward on None - return expert_output - return expert_output, shared_expert_output - - def submodule_combine_forward( - node: ScheduleNode, - output: torch.Tensor, - shared_expert_output: Optional[torch.Tensor] = None, - ): + + return expert_output + + def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor): """ # Triggers token combine and the remaining computation in the transformer layer. # The `mlp_bda` computation is placed after `mlp.combine` due to data dependency. @@ -461,10 +528,11 @@ def submodule_combine_forward( # with another microbatch's computation and expose the communication. """ residual = node.layer_state.residual - + shared_expert_output = getattr(node.layer_state, 'shared_expert_output', None) output = layer.mlp.combine(output, shared_expert_output) mlp_output_with_bias = (output, None) - + if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: + layer.mlp.cudagraph_tensor_store.clear() with layer.bias_dropout_add_exec_handler(): hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout @@ -500,13 +568,12 @@ def raise_not_implemented(*args): # Build forward and backward callable functions attn_func = submodule_attn_forward - post_attn_func = submodule_post_attn_forward if is_moe else raise_not_implemented dispatch_func = submodule_dispatch_forward if is_moe else raise_not_implemented mlp_func = submodule_moe_forward if is_moe else mlp_wrapper combine_func = submodule_combine_forward if is_moe else raise_not_implemented - forward_funcs = [attn_func, post_attn_func, dispatch_func, mlp_func, combine_func, None] - backward_dw = {"attn": layer.self_attention, "mlp": layer.mlp} + forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, None] + backward_dw = {"attn": attn_backward_dw_wrapper, "mlp": layer.mlp} return forward_funcs, backward_dw @@ -518,9 +585,7 @@ def build_mtp_layer_callables(layer): """ forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) - attn_forward, post_attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = ( - forward_funcs - ) + attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." @@ -581,24 +646,17 @@ def rng_context_wrapper(func, *args, **kwargs): # Build forward and backward callable functions # attn_forward already has rng context, no need to wrap attn_func = submodule_mtp_attn_forward - post_attn_func = partial(rng_context_wrapper, post_attn_forward) dispatch_func = partial(rng_context_wrapper, dispatch_forward) mlp_func = partial(rng_context_wrapper, mlp_forward) combine_func = partial(rng_context_wrapper, combine_forward) mtp_post_process_func = submodule_mtp_postprocess_forward - forward_funcs = [ - attn_func, - post_attn_func, - dispatch_func, - mlp_func, - combine_func, - mtp_post_process_func, - ] - backward_dw = { - "attn": [layer.transformer_layer.self_attention, layer.eh_proj], - "mlp": layer.transformer_layer.mlp, - } + forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, mtp_post_process_func] + if isinstance(backward_dw["attn"], list): + backward_dw["attn"].append(layer.eh_proj) + else: + backward_dw["attn"] = [backward_dw["attn"], layer.eh_proj] + return forward_funcs, backward_dw diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index a8fdf2324f2..c41a09ea594 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -2,6 +2,7 @@ import contextlib from functools import partial +from itertools import zip_longest from typing import Callable, Iterator, List, Optional, Union import torch @@ -843,6 +844,110 @@ def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, s return order +def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chunk, capture_wgrad_graph): + """ + This functions gets the order for overlap_moe_expert_parallel_comm schedule for the original + chunk-wise order list. Each chunk is transformered to chunks with only 1 layer so that + layers between 2 chunks can now overlap with each other while following the graph order. + If capture_wgrad_graph is True, the wgrad backward graph is also added to the order by + decreasing the layer id by 0.5. + + Args: + order (List[int]): The original chunk-wise order list. Positive values represent forward + passes for chunks, negative values represent backward passes. The absolute value + indicates the chunk ID (1-indexed). + num_layers_per_chunk (List[int]): Number of graphable layers in each chunk. The length + of this list equals the number of chunks. + capture_wgrad_graph (bool): If True, weight gradient computation graphs are added to the + order by appending entries with layer_id - 0.5. + + Returns: + Tuple[List[float], List[Optional[List[int]]]]: A tuple containing: + - new_order: The layer-wise order list where each chunk is expanded to individual + layers. Positive values are forward passes, negative values are backward passes. + Values with .5 suffix indicate weight gradient computations. + - chunk_id_list: A list parallel to new_order. For forward passes, contains + [chunk_id, layer_index_within_chunk]. For backward passes, contains None. + + Example: + original_order: [1, 2, -2, 1, -1, -1] + num_layers_per_chunk: [1, 2] + capture_wgrad_graph=True: + new_order: [1, 2, 3, 1, -3, -3.5, -2, -2.5, -1, -1.5, -1, -1.5] + chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, + None, None, None, None, None, None, None] + capture_wgrad_graph=False: + new_order: [1, 2, 3, 1, -3, -2, -1, -1] + chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, None, None, None] + """ + + def _add_order(new_order, chunk_id_list, c_id, layer_id, is_wgrad=False, index=None): + if is_wgrad: + new_order.append(layer_id - 0.5) + else: + new_order.append(layer_id) + if c_id > 0: + chunk_id_list.append([abs(c_id) - 1, index]) + else: + chunk_id_list.append(None) + + new_order = [] + chunk_id_list = [] + add_order = partial(_add_order, new_order, chunk_id_list) + first_backward_idx, last_forward_idx = None, None + for idx, c_id in enumerate(order): + if first_backward_idx is None and c_id < 0: + first_backward_idx = idx + if c_id > 0: + last_forward_idx = idx + + def get_layer_range(c_id): + num_layers = num_layers_per_chunk[abs(c_id) - 1] + num_layers_previous_chunks = sum(num_layers_per_chunk[: abs(c_id) - 1]) + if c_id > 0: + return list( + range(num_layers_previous_chunks + 1, num_layers_previous_chunks + num_layers + 1) + ) + return list(range(-num_layers_previous_chunks - num_layers, -num_layers_previous_chunks)) + + # warmup stage + for c_id in order[:first_backward_idx]: + layer_range = get_layer_range(c_id) + new_order += layer_range + chunk_id_list.extend([abs(c_id) - 1, i] for i in range(len(layer_range))) + + # 1f1b overlap stage + if first_backward_idx < last_forward_idx: + for c_id_b, c_id_f in zip( + order[first_backward_idx : last_forward_idx + 1 : 2], + order[first_backward_idx + 1 : last_forward_idx + 1 : 2], + ): + layer_range_f = get_layer_range(c_id_f) + layer_range_b = get_layer_range(c_id_b) + index = 0 + for l_b, l_f in zip_longest(layer_range_b, layer_range_f, fillvalue=0): + # always forward graph before backward graph + if l_f != 0: + add_order(c_id_f, l_f, index=index) + if l_b != 0: + add_order(c_id_b, l_b) + if capture_wgrad_graph and index < len(layer_range_b) - 1: + add_order(c_id_b, l_b, is_wgrad=True) + index += 1 + # last wgrad backward + if capture_wgrad_graph and layer_range_b: + add_order(c_id_b, layer_range_b[-1], is_wgrad=True) + + # cool down stage, backward graphs only + for c_id in order[last_forward_idx + 1 :]: + for l_b in get_layer_range(c_id): + add_order(c_id, l_b) + if capture_wgrad_graph: + add_order(c_id, l_b, is_wgrad=True) + + return new_order, chunk_id_list + + def forward_backward_pipelining_with_interleaving( *, forward_step_func, diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index e7e416f99bd..d38f6d702c0 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -182,8 +182,8 @@ def __init__( self.free_input = free_input self.inputs = None self.outputs = None + self.manual_grads_release = False self.delay_grads_release = False - self.manual_release_grads = False def default_backward_func(self, outputs, output_grad): """Default backward function""" @@ -269,7 +269,7 @@ def _backward(self, *output_grad): # to avoid delayed garbage collection. If # delay_grads_release is True, dgrad is last used in # wgrad compute and skip the release here. - if self.manual_release_grads and not self.delay_grads_release: + if self.manual_grads_release and not self.delay_grads_release: g.untyped_storage().resize_(0) grads = self.get_grad() diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 27e6c65c738..b566c1830dc 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -10,6 +10,7 @@ from contextlib import nullcontext from dataclasses import fields, is_dataclass from enum import Enum +from math import ceil from typing import Any, Dict, List, Optional import torch @@ -1510,7 +1511,7 @@ def graphs_created(self): """ return self._graphs_created - def _get_sample_arguments(self, order): + def _get_sample_arguments(self, order, chunk_id_list=None): """ Generate sample arguments and keyword arguments for CUDA Graph capturing with memory-optimized buffer reuse. @@ -1539,6 +1540,9 @@ def _get_sample_arguments(self, order): order (List[int]): The forward/backward execution order from convert_schedule_table_to_order(). Positive integers represent forward passes (1-indexed chunk ID), negative integers represent backward passes. + chunk_id_list (List[Tuple[int, int]]): The list of chunk IDs and layer IDs in the + order. This is useful only when overlap_moe_expert_parallel_comm is enabled, + the order maps each layers' idx to their original chunk id. Returns: Tuple[List[Tuple], List[Dict]]: A tuple containing: @@ -1560,9 +1564,11 @@ def _get_sample_arguments(self, order): assert self.num_model_chunks == max( order ), "num_model_chunks must match the max chunk id in order." - assert ( - self.num_microbatches == len(order) // self.num_model_chunks // 2 - ), "num_microbatches must match the number of microbatches in order." + if chunk_id_list is None: + # check only if 1f1b overlap is disabled. + assert ( + self.num_microbatches == len(order) // self.num_model_chunks // 2 + ), "num_microbatches must match the number of microbatches in order." # Generate sample arguments and keyword arguments for capturing. sample_args = [None] * (len(self.flattened_callables) * self.num_microbatches) @@ -1645,8 +1651,8 @@ def get_rotary_pos_emb(transformer_module, transformer_input): consumed_sample_queue = {} layer_sample_keys_cache = {} fwd_idx = [0] * self.num_model_chunks - for chunk_id in order: - model_chunk_idx = abs(chunk_id) - 1 + for idx, chunk_id in enumerate(order): + model_chunk_idx = abs(ceil(chunk_id)) - 1 if chunk_id > 0: if model_chunk_idx not in fwd_sample_queues: @@ -1655,7 +1661,14 @@ def get_rotary_pos_emb(transformer_module, transformer_input): sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + ( fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] ) - for layer_idx, layer in enumerate(self.callables_per_chunk[model_chunk_idx]): + if chunk_id_list: + model_chunk_idx = chunk_id_list[idx][0] + callables_curr_chunk = [ + self.callables_per_chunk[model_chunk_idx][chunk_id_list[idx][1]] + ] + else: + callables_curr_chunk = self.callables_per_chunk[model_chunk_idx] + for layer_idx, layer in enumerate(callables_curr_chunk): per_callable_fwd_idx = sample_start_idx + layer_idx # Get sample_args and sample_kwargs for index per_callable_fwd_idx. @@ -1692,7 +1705,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # reuse the static inputs of a previous forward pass for this forward pass. # If not, we still need to generate the new static inputs. sample_keys = layer_sample_keys_cache[id(layer)] - + model_chunk_idx = abs(chunk_id) - 1 fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx)) if consumed_sample_queue.get(sample_keys, []): # We can reuse the static inputs of a previous forward pass for this @@ -1714,13 +1727,16 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # Unfortunately, no previous static inputs are available for reuse, # sample_args is still None. Last attempt: generate the new static inputs # for this forward pass. + if chunk_id_list: + model_chunk_idx = chunk_id_list[idx][0] sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( _get_layer_static_inputs( layer, self.chunks_with_decoder[model_chunk_idx] ) ) + model_chunk_idx = abs(chunk_id) - 1 fwd_idx[model_chunk_idx] += 1 - else: + elif ceil(chunk_id) == chunk_id: num_consumed_samples = min( len(fwd_sample_queues[model_chunk_idx]), self.num_layers_per_chunk[model_chunk_idx], @@ -1734,6 +1750,9 @@ def get_rotary_pos_emb(transformer_module, transformer_input): fwd_sample_queues[model_chunk_idx] = fwd_sample_queues[model_chunk_idx][ num_consumed_samples: ] + else: + # skip register static inputs for wgrad backward graphs + continue return sample_args, sample_kwargs @@ -1746,12 +1765,16 @@ def _get_cuda_graph_input_data(self): # Get the PP and VPP scheduling order. from megatron.core.pipeline_parallel.schedules import ( convert_schedule_table_to_order, + get_overlap_moe_expert_parallel_comm_order, get_pp_rank_microbatches, get_schedule_table, ) # If PP is not enabled, we only need to capture one microbatch. - if parallel_state.get_pipeline_model_parallel_world_size() == 1: + if ( + parallel_state.get_pipeline_model_parallel_world_size() == 1 + and not self.config.overlap_moe_expert_parallel_comm + ): assert ( self.num_model_chunks == 1 ), "If PP is not enabled, there should be only one model chunk." @@ -1780,9 +1803,36 @@ def _get_cuda_graph_input_data(self): level=logging.DEBUG, msg=f'Rank {torch.distributed.get_rank()}: ORDER {order}', ) + chunk_id_list = None + if self.config.overlap_moe_expert_parallel_comm: + wgrad_in_graph_scope = CudaGraphScope.attn in self.config.cuda_graph_scope or ( + CudaGraphScope.moe_router in self.config.cuda_graph_scope + and self.config.moe_shared_expert_intermediate_size is not None + and not self.config.moe_shared_expert_overlap + ) + capture_wgrad_graph = self.config.delay_wgrad_compute and wgrad_in_graph_scope + order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order( + order, self.num_layers_per_chunk, capture_wgrad_graph + ) + self.num_layers_per_chunk = [1] * sum(self.num_layers_per_chunk) + self.num_model_chunks = max(order) + _order_without_wgrad = [] + for c_id in order: + if ceil(c_id) != c_id: + continue + _order_without_wgrad.append(c_id) + self.num_microbatches = len(_order_without_wgrad) // self.num_model_chunks // 2 + log_on_each_pipeline_stage( + logger=logger, + tp_group=None, + dp_cp_group=None, + level=logging.DEBUG, + msg=f'Rank {torch.distributed.get_rank()}: ' + f'ORDER after overlap_moe_expert_parallel_comm {order}', + ) # Generate sample arguments and keyword arguments for capturing. - sample_args, sample_kwargs = self._get_sample_arguments(order) + sample_args, sample_kwargs = self._get_sample_arguments(order, chunk_id_list) def get_make_graphed_callables_kwargs(): kwargs = {'allow_unused_input': True, '_order': order} @@ -1920,13 +1970,17 @@ def create_cudagraphs(self): for layer_number, layer in enumerate(layers): layer.cuda_graphs = [] for batch_number in range(self.num_microbatches): - layer.cuda_graphs.append( - graphs[ + if self.config.overlap_moe_expert_parallel_comm: + graph_idx = ( + num_layers_accumulated + layer_number + ) * self.num_microbatches + batch_number + else: + graph_idx = ( num_layers_accumulated * self.num_microbatches + batch_number * len(layers) + layer_number - ] - ) + ) + layer.cuda_graphs.append(graphs[graph_idx]) num_layers_accumulated += len(layers) self._finish_capturing(start_time) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 10d10f667fe..c8438bb2c8a 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -326,10 +326,11 @@ def custom_forward(hidden_states): return outputs - def backward_dw(self): + def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False): """Compute weight gradients for experts and shared experts.""" - self.experts.backward_dw() - if self.use_shared_expert and not self.shared_expert_overlap: + if routed_experts: + self.experts.backward_dw() + if shared_experts and self.use_shared_expert and not self.shared_expert_overlap: self.shared_experts.backward_dw() def set_for_recompute_pre_mlp_layernorm(self): diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 6493a4bcce1..a5636d94e26 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1849,6 +1849,16 @@ def __post_init__(self): 'when enabling overlap_moe_expert_parallel_comm with MTP layer.' ) + if self.cuda_graph_impl != "none": + assert ( + self.cuda_graph_impl == "transformer_engine" + and CudaGraphScope.moe not in self.cuda_graph_scope + and CudaGraphScope.mlp not in self.cuda_graph_scope + ), ( + 'CUDA graph scope on moe and mlp is not ' + 'supported with overlap_moe_expert_parallel_comm' + ) + # Check delay_wgrad_compute compatibility if self.delay_wgrad_compute: assert ( @@ -1857,6 +1867,11 @@ def __post_init__(self): assert ( not self.moe_use_legacy_grouped_gemm ), 'delay_wgrad_compute is not supported with legacy groupedgemm implementation' + if self.cuda_graph_impl == "transformer_engine": + assert is_te_min_version("2.10.0"), ( + 'TE version >= 2.10.0 is required for delay_wgrad_compute with ' + 'partial cuda graph' + ) if self.ep_overlap_early_attn_memory_release: assert self.overlap_moe_expert_parallel_comm, ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 3ea40577009..db57e21c891 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -872,6 +872,10 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() + assert ( + not self.config.overlap_moe_expert_parallel_comm + ), "EP overlap must be \ + disabled when CUDA graph captures the whole MLP/MoE part." elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. @@ -914,12 +918,35 @@ def _te_cuda_graph_replay(self, *args, **kwargs): residual=residual, shared_expert_output=shared_expert_output, ) + # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables + # and should be skipped here. + if self.config.overlap_moe_expert_parallel_comm: + probs, routing_map = self.mlp.route(hidden_states) + hidden_states, probs, residual = self.mlp.preprocess( + hidden_states, probs, routing_map + ) + nvtx_range_pop(suffix="mlp") + return mlp_residual, hidden_states, probs, shared_expert_output mlp_output_with_bias = self.mlp(hidden_states) self.mlp.cudagraph_tensor_store.clear() nvtx_range_pop(suffix="mlp") output = self._forward_post_mlp(mlp_output_with_bias, mlp_residual) else: + # If EP overlap is enabled, needs to return same outputs as submodule.attn + if self.config.overlap_moe_expert_parallel_comm: + assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." + mlp_residual = cuda_graph_output.pop() + if not self.is_moe_layer: + return mlp_residual, None, None, None + hidden_states = self.pre_mlp_layernorm(mlp_residual) + shared_expert_output = self.mlp.shared_experts_compute(hidden_states) + probs, routing_map = self.mlp.route(hidden_states) + hidden_states, probs, residual = self.mlp.preprocess( + hidden_states, probs, routing_map + ) + return mlp_residual, hidden_states, probs, shared_expert_output + # CUDA Graph does not capture the MLP/MoE part at all. output = self._forward_mlp(*cuda_graph_output) return output, context @@ -1007,6 +1034,15 @@ def _should_call_local_cudagraph(self, *args, **kwargs): return True return False + def backward_dw_cudagraph(self, microbatch_idx): + """ + CUDA Graph backward weight gradient computation for this layer. + """ + cg_index = microbatch_idx % len(self.cuda_graphs) + if not hasattr(self.cuda_graphs[cg_index], 'backward_dw'): + return + self.cuda_graphs[cg_index].backward_dw() + def __call__(self, *args, **kwargs): if self._should_call_local_cudagraph(*args, **kwargs): # Inference mode. diff --git a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py new file mode 100644 index 00000000000..91c74fe1bb6 --- /dev/null +++ b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py @@ -0,0 +1,372 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import gc +import os +import sys + +import pytest +import torch + +from megatron.core.enums import ModelType +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, + get_gpt_mtp_block_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator +from megatron.core.pipeline_parallel.utils import set_streams +from megatron.core.tensor_parallel.random import HAVE_TE, model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import CudaGraphScope +from megatron.core.transformer.module import float16_to_fp32 +from megatron.core.utils import is_te_min_version, unwrap_model +from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args +from megatron.training.global_vars import ( + destroy_global_vars, + get_args, + set_args, + set_global_variables, +) +from megatron.training.training import setup_model_and_optimizer +from tests.unit_tests.test_utilities import Utils + + +def is_deep_ep_available(): + from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP + + return HAVE_DEEP_EP + + +def is_hybrid_ep_available(): + from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP + + return HAVE_HYBRIDEP + + +def save(fn, message): + with open(fn, 'w') as f: + f.write(message) + + +class TestPartialCudaGraphedA2AOverlap: + """Test that CUDA graph outputs match ep-overlapped CUDA graph outputs for various scopes.""" + + def setup_method(self, method): + self.seq_length = 512 + self.micro_batch_size = 2 + # Store original environment variable values + self.original_env = { + 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), + 'NVTE_ALLOW_NONDETERMINISTIC_ALGO': os.environ.get('NVTE_ALLOW_NONDETERMINISTIC_ALGO'), + } + self.cuda_graph_helper = None + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0' + + def teardown_method(self, method): + # Restore original environment variable values + for key, value in self.original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + + gc.collect() + + def model_provider( + self, + pre_process=True, + post_process=True, + layer_spec_fn=get_gpt_decoder_block_spec, + **config_kwargs, + ): + model_parallel_cuda_manual_seed(123) + args = get_args() + config = core_transformer_config_from_args(args) + transformer_layer_spec = layer_spec_fn( + config, + use_transformer_engine=True, + normalization=args.normalization, + qk_l2_norm=args.qk_l2_norm, + ) + if args.mtp_num_layers: + mtp_block_spec = get_gpt_mtp_block_spec( + config, transformer_layer_spec, use_transformer_engine=True + ) + else: + mtp_block_spec = None + return GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + mtp_block_spec=mtp_block_spec, + ) + + def create_test_args( + self, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs + ): + destroy_global_vars() + destroy_num_microbatches_calculator() + + sys.argv = ['test_cuda_graphs.py'] + args = parse_args() + args.num_layers = 1 + args.mtp_num_layers = None + args.vocab_size = 1024 + args.hidden_size = 128 + args.num_attention_heads = 8 + args.max_position_embeddings = 512 + args.global_batch_size = self.micro_batch_size * 8 + args.micro_batch_size = self.micro_batch_size + args.create_attention_mask_in_dataloader = True + args.seq_length = self.seq_length + args.tensor_model_parallel_size = 2 + args.sequence_parallel = True + args.pipeline_model_parallel_size = 1 + args.context_parallel_size = 1 + args.expert_model_parallel_size = ep_size + args.train_iters = 10 + args.lr = 3e-5 + args.bf16 = True + args.add_bias_linear = False + args.swiglu = True + args.use_distributed_optimizer = True + args.position_embedding_type = "rope" + args.rotary_percent = 1.0 + args.hidden_dropout = 0.0 + args.attention_dropout = 0.0 + args.untie_embeddings_and_output_weights = True + + # MoE settings + args.num_experts = 16 + args.expert_model_parallel_size = ep_size + args.moe_shared_expert_intermediate_size = 1024 + args.moe_layer_freq = kwargs.get("moe_layer_freq", "[0,0,1,1]") + args.moe_permute_fusion = True + args.moe_router_fusion = True + args.moe_router_topk = 2 + + # CUDA graph settings + args.cuda_graph_impl = cuda_graph_impl + args.cuda_graph_scope = cuda_graph_scope + args.cuda_graph_warmup_steps = cuda_graph_warmup_steps + args.use_te_rng_tracker = cuda_graph_impl != "none" + + for key, value in kwargs.items(): + assert hasattr(args, key) + setattr(args, key, value) + + validate_args(args) + set_global_variables(args, False) + return args + + def get_batch(self, seq_length, micro_batch_size): + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, seq_length, seq_length), dtype=bool + ).cuda() + loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + return input_ids, labels, position_ids, attention_mask, loss_mask + + def _run_1f1b_helper(self, gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps): + from megatron.core.models.common.model_chunk_schedule_plan import ( + TransformerModelChunkSchedulePlan, + ) + from megatron.core.pipeline_parallel.schedules import set_current_microbatch + + schedule_plans = [] + losses = [] + set_current_microbatch(gpt_model[0], 1) + + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + assert cuda_graph_warmup_steps > 0, "cuda_graph_warmup_steps must be greater than 0" + for fwd_mb_idx in range(num_iters + 1): + # Capture CUDA graphs after warmup if helper is provided + if self.cuda_graph_helper is not None and fwd_mb_idx == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + + if fwd_mb_idx < cuda_graph_warmup_steps: + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + output = gpt_model[0].forward(**data) + schedule_plans.append(None) + else: + if fwd_mb_idx == cuda_graph_warmup_steps: + extra_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) + TransformerModelChunkSchedulePlan.run(extra_schedule_plan, None) + schedule_plans[-1] = extra_schedule_plan + f_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) + b_schedule_plan = schedule_plans[-1] + schedule_plans.append(f_schedule_plan) + if b_schedule_plan is not None: + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + output = TransformerModelChunkSchedulePlan.run( + f_schedule_plan, + b_schedule_plan, + b_grad=torch.ones_like(output) if fwd_mb_idx > 0 else None, + ) + # Check output shapes + if fwd_mb_idx < num_iters: + assert output is not None + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length + losses.append(output) + + if fwd_mb_idx < cuda_graph_warmup_steps: + output.backward(torch.ones_like(output)) + + for param in gpt_model[0].parameters(): + assert param.main_grad is not None + + update_successful, _, _ = optimizer.step() + assert update_successful + + return losses + + def _run_test_helper( + self, + ep_size, + cuda_graph_impl, + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_overlap=False, + **kwargs, + ): + """Test fp8_param with gpt_model.""" + args = self.create_test_args( + cuda_graph_impl, + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_size, + overlap_moe_expert_parallel_comm=ep_overlap, + **kwargs, + ) + if ep_overlap: + set_streams() + set_args(args) + torch.manual_seed(123) + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, expert_model_parallel_size=ep_size + ) + + input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( + self.seq_length, self.micro_batch_size + ) + + gpt_model, optimizer, _ = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + assert len(gpt_model) == 1 # Assume only one model in the model provider. + + loss_list = [] + + if cuda_graph_impl == "transformer_engine": + from megatron.core.transformer.cuda_graphs import TECudaGraphHelper + + self.cuda_graph_helper = TECudaGraphHelper( + model=gpt_model, + config=gpt_model[0].config, + seq_length=self.seq_length, + micro_batch_size=self.micro_batch_size, + optimizers=[optimizer], + ) + + num_iters = cuda_graph_warmup_steps + 2 + data = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "labels": labels, + "loss_mask": loss_mask, + } + if not ep_overlap: + for i in range(num_iters): + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + + # Capture CUDA graphs after warmup if helper is provided + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + + output = unwrap_model(gpt_model[0]).forward(**data) + output = float16_to_fp32(output) + + # Check output shapes + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length + + # Verify gradients + output.backward(torch.ones_like(output)) + for param in gpt_model[0].parameters(): + assert param.main_grad is not None + + update_successful, _, _ = optimizer.step() + assert update_successful + + loss_list.append(output) + else: + loss_list = self._run_1f1b_helper( + gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps + ) + + return loss_list + + @pytest.mark.skipif( + not (HAVE_TE and is_te_min_version("2.10.0")), + reason="Partial CUDA graph support requires TransformerEngine version >= 2.10.0", + ) + @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep"]) + def test_moe_partial_cudagraph_with_ep_overlap(self, moe_dispatcher_type): + extra_kwargs = {"moe_layer_freq": 1} + if moe_dispatcher_type == "deepep": + if not is_deep_ep_available(): + pytest.skip("Deep EP is not available") + extra_kwargs["moe_token_dispatcher_type"] = "flex" + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" + extra_kwargs["moe_router_dtype"] = "fp32" + elif moe_dispatcher_type == "hybridep": + if not is_hybrid_ep_available(): + pytest.skip("Hybrid EP is not available") + extra_kwargs["moe_token_dispatcher_type"] = "flex" + extra_kwargs["moe_flex_dispatcher_backend"] = "hybridep" + else: + extra_kwargs["moe_token_dispatcher_type"] = moe_dispatcher_type + + loss_list_ref = self._run_test_helper(4, "none", None, 3, **extra_kwargs) + for cuda_graph_scope in [ + [CudaGraphScope.attn], + [CudaGraphScope.attn, CudaGraphScope.moe_router], + [CudaGraphScope.attn, CudaGraphScope.moe_router, CudaGraphScope.moe_preprocess], + ]: + cuda_graph_warmup_steps = 3 + loss_list = self._run_test_helper( + 4, + "transformer_engine", + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_overlap=True, + **extra_kwargs, + ) + assert len(loss_list) == len(loss_list_ref) + for i in range(len(loss_list)): + assert torch.equal( + loss_list[i].mean(), loss_list_ref[i].mean() + ), f"scope={cuda_graph_scope}, i={i},loss_list={loss_list[i]}, loss_list_ref={loss_list_ref[i]}" + print(f"[DEBUG] Pass {cuda_graph_scope}") diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 7fb97f6e586..0fd2c445c9f 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -306,7 +306,7 @@ def test_transformer_layer_overlap_shared_expert(self): "moe_shared_expert_intermediate_size": 512, } overlap_config = get_test_config(extra_kwargs=extra_kwargs) - extra_kwargs["moe_shared_expert_overlap"] = True + extra_kwargs["moe_shared_expert_overlap"] = False ref_config = get_test_config(extra_kwargs=extra_kwargs) microbatches = 4 with deterministic_mode(): diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py index 7db4256a849..a52843956df 100644 --- a/tests/unit_tests/a2a_overlap/utils.py +++ b/tests/unit_tests/a2a_overlap/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os from contextlib import contextmanager from dataclasses import dataclass diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py index b861aa2df49..86b9219fe0f 100644 --- a/tests/unit_tests/pipeline_parallel/test_schedules.py +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + import os import pytest @@ -127,6 +129,52 @@ def test_get_pipeline_parallel_order( for k, v in order_cnt.items(): assert -k in order_cnt and order_cnt[-k] == v + layers_per_chunk = 2 + num_layers_per_chunk = [layers_per_chunk] * num_model_chunks + # disable wgrad compute + overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( + order, num_layers_per_chunk, False + ) + assert max(overlapped_order) == num_model_chunks * layers_per_chunk + assert len(overlapped_order) == len(order) * layers_per_chunk + assert len(chunk_id_list) == len(overlapped_order) + order_cnt = {} + accumulated_order = 0 + for o in overlapped_order: + order_cnt[o] = order_cnt.get(o, 0) + 1 + if o < 0: + assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] + elif -o in order_cnt: + assert order_cnt[-o] < order_cnt[o] + accumulated_order += o + assert accumulated_order >= 0 + assert accumulated_order == 0 + + # enable wgrad compute + overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( + order, num_layers_per_chunk, True + ) + assert max(overlapped_order) == num_model_chunks * layers_per_chunk + assert len(overlapped_order) == len(order) * layers_per_chunk * 3 // 2 + assert len(chunk_id_list) == len(overlapped_order) + from math import ceil + + order_cnt = {} + accumulated_order = 0 + prev_o = 0 + for o in overlapped_order: + if ceil(o) != o: + assert prev_o - 0.5 == o + else: + order_cnt[o] = order_cnt.get(o, 0) + 1 + if o < 0: + assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] + elif -o in order_cnt: + assert order_cnt[-o] < order_cnt[o] + accumulated_order += o + prev_o = o + assert accumulated_order < 0 + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/test_submodule_callables.py b/tests/unit_tests/transformer/test_submodule_callables.py index 1ccb6fd5be8..73059495c06 100644 --- a/tests/unit_tests/transformer/test_submodule_callables.py +++ b/tests/unit_tests/transformer/test_submodule_callables.py @@ -64,7 +64,7 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): output_tensors = [] # get callables callables, dw = build_layer_callables(model) - attn, post_attn, dispatch, moe, combine, post_process = callables + attn, dispatch, moe, combine, post_process = callables assert post_process is None dummy_model = DummyState() dummy_model.decoder = DummyState() @@ -76,24 +76,16 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): node.chunk_state.model = dummy_model # attn fwd - hidden_states = attn(node, input_tensors[i]) - - # post attn fwd - local_tokens, probs = post_attn(node, hidden_states) + local_tokens, probs = attn(node, input_tensors[i]) # dispatch fwd dispatched_tokens = dispatch(node, local_tokens, probs) # moe fwd - expert_outputs = moe(node, dispatched_tokens) - if model.mlp.use_shared_expert: - expert_output, shared_expert_output = expert_outputs - else: - expert_output = expert_outputs - shared_expert_output = None + expert_output = moe(node, dispatched_tokens) # combine fwd - hidden_states = combine(node, expert_output, shared_expert_output) + hidden_states = combine(node, expert_output) # loss output_tensors.append(hidden_states) From c1045f6954a68599c0447f35310f80e94a07ff1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 5 Jan 2026 11:59:40 +0100 Subject: [PATCH 212/248] =?UTF-8?q?Revert=20"[Dev]=20FP8=20params=20suppor?= =?UTF-8?q?t=20for=20megatron-fsdp=20(MXFP8/Blockwise)=20=E2=80=A6=20(#280?= =?UTF-8?q?4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../distributed/fsdp/mcore_fsdp_adapter.py | 4 - .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 157 +++--- .../fsdp/src/megatron_fsdp/mixed_precision.py | 331 ------------- .../megatron_fsdp/param_and_grad_buffer.py | 450 +++++------------- .../fsdp/src/megatron_fsdp/utils.py | 252 +++++++++- megatron/training/arguments.py | 7 - 6 files changed, 421 insertions(+), 780 deletions(-) delete mode 100644 megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index d6384e70488..7432a7f9a36 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -111,9 +111,6 @@ def __init__( dist_index=self.megatron_fsdp_dist_index, calculate_per_token_loss=config.calculate_per_token_loss, init_model_with_meta_device=config.init_model_with_meta_device, - enable_fine_grained_param_gather_hook=( - config.fp8_recipe == "mxfp8" and ddp_config.fp8_param_gather - ), ), ) self.param_and_grad_buffer = self.module.param_and_grad_buffer @@ -126,7 +123,6 @@ def __init__( self.broadcast_params = self.module.broadcast_params self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict - self.module.config = config self.sync_rng_states_across_tp_group() diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 17f7f4d1c05..8a63e0f5cf7 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -23,20 +23,6 @@ import torch.nn as nn from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten -from .mixed_precision import ( - fp8_create_transpose_cache, - fp8_discard_transpose_cache, - is_float8tensor, -) -from .param_and_grad_buffer import ( - AllGatherPipeline, - BucketingPolicy, - GradReducePipeline, - ParamAndGradBuffer, - PrefetchOrder, - override_sharded_param_methods_with_safety_checks, - to_local_if_dtensor, -) from .utils import FSDPDistributedIndex logger = logging.getLogger(__name__) @@ -48,12 +34,23 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) + from megatron.core.fp8_utils import is_float8tensor from megatron.core.utils import is_submodule except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import is_submodule + from .utils import is_float8tensor, is_submodule + +from .param_and_grad_buffer import ( + AllGatherPipeline, + BucketingPolicy, + GradReducePipeline, + ParamAndGradBuffer, + PrefetchOrder, + override_sharded_param_methods_with_safety_checks, + to_local_if_dtensor, +) class TrainingState(Enum): @@ -171,7 +168,6 @@ def __init__( nccl_ub: bool = False, fsdp_double_buffer: bool = False, disable_symmetric_registration: bool = False, - enable_fine_grained_param_gather_hook: bool = False, ): super().__init__() # If device is not specified, use the current device. @@ -221,7 +217,6 @@ def __init__( self.calculate_per_token_loss = calculate_per_token_loss self.init_model_with_meta_device = init_model_with_meta_device - self.enable_fine_grained_param_gather_hook = enable_fine_grained_param_gather_hook # Whether to constantly synchronize the model every training iteration, # which defaults to False to overlap communication with computation @@ -405,7 +400,6 @@ def all_gather_and_wait_parameters_ready( prefetch=True, prefetch_order=PrefetchOrder.FORWARD_PASS_ORDER, wait_bucket_ready=True, - bwd=False, ): """ All-gather parameters across the data parallel group and wait for @@ -432,14 +426,11 @@ def all_gather_and_wait_parameters_ready( and self.ddp_config.outer_dp_sharding_strategy != "no_shard" and (self.microbatch_count == 0 or self.model_auto_sync) ), - bwd=bwd, ) if wait_bucket_ready: for param in params: bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - ag_pipeline.wait_bucket_ready(bucket_id, bwd) - if bwd and is_float8tensor(param): - fp8_create_transpose_cache(param) + ag_pipeline.wait_bucket_ready(bucket_id) for param in params: # This setting is needed to make FSDP store the weight object when used @@ -498,17 +489,19 @@ def _register_fsdp_hooks(self, root_module): """ fsdp_unit_modules = self.fsdp_unit_modules - def release_module_parameters(module, bwd, *unused): + def release_module_parameters(module, *unused): for param in module.parameters(): bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - self.all_gather_pipeline.release_bucket(bucket_id, bwd) + self.all_gather_pipeline.release_bucket(bucket_id) + if not self.ddp_config.keep_fp8_transpose_cache: release_params_fp8_transpose_cache(module.parameters()) def release_params_fp8_transpose_cache(params): for param in params: if is_float8tensor(param): - fp8_discard_transpose_cache(param) + param._transpose_invalid = True + param._transpose = None def _grad_acc(param): """ @@ -565,15 +558,12 @@ def _post_backward(module, *unused): if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params": # Deallocate the module parameters after the backward pass, # because we have our data-parallel gradients computed. - release_module_parameters(module, bwd=True) + release_module_parameters(module) module._training_state = TrainingState.IDLE param_list = list(module.parameters()) else: param_list = list(module.parameters(recurse=False)) - if self.enable_fine_grained_param_gather_hook: - param_list = list(module.parameters(recurse=False)) - # If the parameter is shared, we do not accumulate gradients # here, as the gradients will be accumulated in the # root post-backward hook. @@ -625,9 +615,6 @@ def _pre_forward_param_unshard( # to allocate as little memory as possible for this forward pass. param_list = list(module.parameters(recurse=False)) - if self.enable_fine_grained_param_gather_hook: - param_list = list(module.parameters(recurse=False)) - # All-gather the parameters before the forward pass. self.all_gather_and_wait_parameters_ready( params=param_list, @@ -727,7 +714,7 @@ def _root_post_backward(*unused): if self.model_auto_sync: self.finish_grad_sync() - def _pre_backward_param_unshard(module: nn.Module, *unused): + def _pre_backward(module: nn.Module, *unused): """ Sub-module pre-backward hook to all-gather the module parameters before the backward pass. @@ -736,19 +723,11 @@ def _pre_backward_param_unshard(module: nn.Module, *unused): # and unsharding operations when performing activation recomputation # / gradient checkpointing. module._training_state = TrainingState.PRE_BACKWARD - if isinstance(module, tuple(fsdp_unit_modules)): - param_list = list(module.parameters()) - else: - param_list = list(module.parameters(recurse=False)) - - if self.enable_fine_grained_param_gather_hook: - param_list = list(module.parameters(recurse=False)) - - # All-gather / unshard the module parameters before the backward pass. - self.all_gather_and_wait_parameters_ready( - param_list, prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER, bwd=True - ) + # All-gather / unshard the module parameters before the backward pass. + self.all_gather_and_wait_parameters_ready( + list(module.parameters()), prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER + ) self._root_pre_backward_hook_issued = False @@ -775,9 +754,7 @@ def _root_pre_backward(module: nn.Module, *unused): for bucket_id in range(ag_pipeline.num_buckets): group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.fsdp_unit_id is not None: - ag_pipeline.bucket_can_be_released[ - ag_pipeline.get_bucket_key(bucket_id, bwd=False) - ] = True + ag_pipeline.bucket_can_be_released[bucket_id] = True # Track parameters that require gradient reduction and optimization. self._params_require_handle_grad = set() for param_group in self.param_and_grad_buffer.parameter_groups: @@ -799,12 +776,8 @@ def _post_forward(module: nn.Module, input: Any, output: Any): # during activation recomputation / gradient checkpointing. return output - assert isinstance( - module, tuple(fsdp_unit_modules) - ), "_post_forward hook should only be registered on FSDP unit modules." - # Release the module parameters after the forward pass to save memory. - release_module_parameters(module, bwd=False) + release_module_parameters(module) module._training_state = TrainingState.IDLE return output @@ -845,55 +818,21 @@ def forward_hook(_module, inputs, output): # on the output tensor(s). return module.register_forward_hook(forward_hook) - def _register_pre_forward_param_unshard_hook(module): - """ - Register the forward pre-hook to unshard parameters before the forward pass. - If we are not sharding anything, we do not have a model weight buffer and thus - have nothing to all-gather / un-shard. - """ - if self.ddp_config.data_parallel_sharding_strategy != "no_shard": - self.forward_pre_hooks[f"{module._get_name()} parameter unshard"] = ( - module.register_forward_pre_hook( - _pre_forward_param_unshard, prepend=True, with_kwargs=True - ) - ) - - def _register_pre_backward_param_unshard_hook(module): - """ - Register the backward pre-hook to unshard FSDP unit module parameters - immediately before the backward pass via attaching a gradient-triggered - hook to the output tensor(s) of a module during a post-forward hook. - """ - self.backward_pre_hooks[f"all-gather {module._get_name()} parameters"] = ( - create_custom_backward_hook(module, _pre_backward_param_unshard) - ) - - def _register_grad_acc_and_reduce_hook(module): - """ - Register the post-backward hook to deallocate model parameters and - reduce-scatter gradients immediately after the module backward pass - has completed to conserve memory for the subsequent backward pass. - """ - self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( - module.register_forward_pre_hook( - functools.partial(_register_post_backward_hook, _post_backward), - with_kwargs=True, - ) - ) - fsdp_modules = [] for name, module in root_module.named_modules(): - if self.enable_fine_grained_param_gather_hook: - _register_pre_forward_param_unshard_hook(module) - _register_pre_backward_param_unshard_hook(module) - _register_grad_acc_and_reduce_hook(module) - # Skip if the module is already registered in fsdp_modules. if any(is_submodule(module, fsdp_module) for fsdp_module in fsdp_modules): continue - if not self.enable_fine_grained_param_gather_hook: - _register_pre_forward_param_unshard_hook(module) + # Register the forward pre-hook to unshard parameters before the forward pass. + # If we are not sharding anything, we do not have a model weight buffer and thus + # have nothing to all-gather / un-shard. + if self.ddp_config.data_parallel_sharding_strategy != "no_shard": + self.forward_pre_hooks[f"module {name} parameter unshard"] = ( + module.register_forward_pre_hook( + _pre_forward_param_unshard, prepend=True, with_kwargs=True + ) + ) if isinstance(module, tuple(fsdp_unit_modules)): fsdp_modules.append(module) @@ -904,8 +843,12 @@ def _register_grad_acc_and_reduce_hook(module): module.register_forward_hook(_post_forward, prepend=False) ) - if not self.enable_fine_grained_param_gather_hook: - _register_pre_backward_param_unshard_hook(module) + # Register the backward pre-hook to unshard FSDP unit module parameters + # immediately before the backward pass via attaching a gradient-triggered + # hook to the output tensor(s) of a module during a post-forward hook. + self.backward_pre_hooks[f"all-gather module {name} parameters"] = ( + create_custom_backward_hook(module, _pre_backward) + ) elif ( not self.ddp_config.keep_fp8_transpose_cache and self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params" @@ -918,8 +861,15 @@ def _register_grad_acc_and_reduce_hook(module): module.register_forward_hook(_release_module_fp8_transpose_cache, prepend=False) ) - if not self.enable_fine_grained_param_gather_hook: - _register_grad_acc_and_reduce_hook(module) + # Register the post-backward hook to deallocate model parameters and + # reduce-scatter gradients immediately after the module backward pass + # has completed to conserve memory for the subsequent backward pass. + self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( + module.register_forward_pre_hook( + functools.partial(_register_post_backward_hook, _post_backward), + with_kwargs=True, + ) + ) # Register root module pre- and post-backward hooks in cases where the # forward function of root module is not called, but rather the forward @@ -1036,7 +986,7 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo else: self.synchronize_param_gather() for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id, bwd=False) + self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id) group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.model_weight_buffer is None: continue @@ -1044,10 +994,9 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo if group.model_weight_buffer.is_data_distributed: # If model weight is sharded, we wait for the all-gather to complete and # then release the bucket immediately to save memory usage. - self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) - + self.all_gather_pipeline.wait_bucket_ready(bucket_id) for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) + self.all_gather_pipeline.wait_bucket_ready(bucket_id) def start_grad_sync(self, *unused): """ diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py deleted file mode 100644 index 69a049ad955..00000000000 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from importlib.metadata import version -from typing import List, Optional, Tuple - -import torch -from packaging.version import Version as PkgVersion - -logger = logging.getLogger(__name__) - -# Detect if Transformer Engine is installed -try: - import transformer_engine # pylint: disable=W0611 - from transformer_engine.pytorch.module.base import TransformerEngineBaseModule - - HAVE_TE = True -except (ImportError, ModuleNotFoundError): - TransformerEngineBaseModule = None - HAVE_TE = False - logger.info("Using Megatron-FSDP without Transformer Engine.") - -# Detect the Transformer Engine version -try: - import transformer_engine as te - - if hasattr(te, "__version__"): - TE_VERSION = PkgVersion(str(te.__version__)) - else: - TE_VERSION = PkgVersion(version("transformer-engine")) -except: - TE_VERSION = None - -# Detect the FP8 tensor class -try: - from transformer_engine.pytorch.tensor import QuantizedTensor - - HAVE_TE_FP8_TENSOR_CLASS = True - FP8_TENSOR_CLASS = QuantizedTensor -except: - try: - from transformer_engine.pytorch.float8_tensor import Float8Tensor - - HAVE_TE_FP8_TENSOR_CLASS = True - FP8_TENSOR_CLASS = Float8Tensor - except: - HAVE_TE_FP8_TENSOR_CLASS = False - -# Detect the MXFP8 tensor class -try: - from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor - - HAVE_TE_MXFP8TENSOR = True -except: - HAVE_TE_MXFP8TENSOR = False - -# Detect the Blockwise FP8 tensor class -try: - from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockwiseQTensor - - HAVE_TE_BLOCKWISE_FP8TENSOR = True -except: - HAVE_TE_BLOCKWISE_FP8TENSOR = False - -# Detect the "cast_master_weights_to_fp8" function of Transformer Engine -try: - from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 - - HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = True -except: - HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = False - - # Try to import multi_tensor_apply, used in the fallback of fp8 quantization. - try: - from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale - - multi_tensor_scale_impl = multi_tensor_scale - except ImportError: - try: - import amp_C - from apex.multi_tensor_apply import multi_tensor_applier - - multi_tensor_scale_impl = amp_C.multi_tensor_scale - except ImportError: - import warnings - - warnings.warn( - "Transformer Engine and Apex are not installed. " - "Falling back to local implementations of " - "multi_tensor_applier and multi_tensor_scale" - ) - - def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): - """Multi tensor op applier""" - return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) - - def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): - """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" - for src, dst in zip(tensor_lists[0], tensor_lists[1]): - dst.copy_(src * scale) - - multi_tensor_applier = local_multi_tensor_applier - multi_tensor_scale_impl = local_multi_tensor_scale - - def _multi_tensor_copy_this_to_that( - this: List[torch.Tensor], - that: List[torch.Tensor], - overflow_buf: Optional[torch.Tensor] = None, - ): - """ - Use multi-tensor-applier to copy values from one list to another. - We don't have a bfloat16 implementation so for now if the overflow_buf - is not provided, we default back to simple loop copy to be compatible - with bfloat16. - """ - if overflow_buf is not None: - overflow_buf.fill_(0) - # Scaling with factor `1.0` is equivalent to copy. - multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) - else: - for this_, that_ in zip(this, that): - that_.copy_(this_) - - -# Detect the "post_all_gather_processing" function of Transformer Engine -try: - from transformer_engine.pytorch.tensor.utils import post_all_gather_processing - - HAVE_TE_POST_ALL_GATHER_PROCESSING = True -except: - HAVE_TE_POST_ALL_GATHER_PROCESSING = False - - -def is_te_min_version(vers, check_equality=True): - """Check if minimum version of `transformer-engine` is installed.""" - if not isinstance(TE_VERSION, PkgVersion): - return False - - if check_equality: - return TE_VERSION >= PkgVersion(vers) - else: - return TE_VERSION > PkgVersion(vers) - - -def is_float8tensor(tensor: torch.Tensor) -> bool: - """Check if a tensor is a FP8 tensor.""" - return HAVE_TE and isinstance(tensor, FP8_TENSOR_CLASS) - - -def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool: - """Check if a tensor is a Blockwise FP8 tensor.""" - return HAVE_TE_BLOCKWISE_FP8TENSOR and isinstance(tensor, Float8BlockwiseQTensor) - - -def fp8_need_transpose_data(tensor: torch.Tensor) -> bool: - """Check if a FP8 tensor needs transpose data.""" - return HAVE_TE_MXFP8TENSOR and isinstance(tensor, MXFP8Tensor) - - -def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngineBaseModule) -> bool: - """Check if a FP8 tensor needs transpose data, for meta device init scenario.""" - return HAVE_TE_MXFP8TENSOR and module.fp8_meta["recipe"].mxfp8() - - -def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None: - """Discard the transpose cache of a FP8 tensor.""" - assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" - - if hasattr(tensor, "_transpose_invalid"): - tensor._transpose_invalid = True - tensor._transpose = None - elif not fp8_need_transpose_data(tensor): - tensor.update_usage(rowwise_usage=True, columnwise_usage=False) - - -def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None: - """Create the transpose cache of a FP8 tensor.""" - if HAVE_TE_POST_ALL_GATHER_PROCESSING: - post_all_gather_processing(tensors) - else: - _fp8_create_transpose_cache_fallback(tensors) - - -def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) -> None: - if not isinstance(tensors, list): - tensors = [tensors] - for tensor in tensors: - assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" - if hasattr(tensor, "_create_transpose"): - tensor._create_transpose() - else: - tensor._create_columnwise() - - -def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_transpose: bool = False) -> None: - """Set the raw data of a Transformer Engine Float8Tensor.""" - assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" - - if set_transpose: - assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" - data_attr = "_columnwise_data" - else: - data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" - - old_data = getattr(tensor, data_attr) - assert old_data.dtype == data.dtype, "The data types of raw data don't match" - assert ( - old_data.shape == data.shape - ), f"Shape {old_data.shape} of old_data doesn't match {data.shape} of new_data" - setattr(tensor, data_attr, data) - - -def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) -> torch.Tensor: - """Get the underlying raw storage of a FP8 tensor.""" - assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" - - if get_transpose: - assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" - data_attr = "_columnwise_data" - else: - data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" - - return getattr(tensor, data_attr) - - -def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor: - """Dequantize a FP8 tensor to a higher precision.""" - assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" - assert is_te_min_version( - "2.0" - ), "Transformer Engine >= 2.0 is required for dequantizing parameters." - return tensor.dequantize() - - -def fp8_quantize( - model_params: List[torch.Tensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: torch.distributed.ProcessGroup, - fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], -) -> None: - """Quantize sharded parameters to FP8.""" - if len(model_params) == 0: - return - fsdp_shard_model_params = [x[0] if x[1] is None else x for x in fsdp_shard_model_params] - - if HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8: - cast_master_weights_to_fp8( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params - ) - else: - _fp8_quantize_fallback( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params - ) - - -def _fp8_quantize_fallback( - model_params: List[torch.Tensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: torch.distributed.ProcessGroup, - fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], -) -> None: - for model_param, main_param, start_offset, fsdp_shard_model_param in zip( - model_params, main_params, start_offsets, fsdp_shard_model_params - ): - if main_param is None: - continue - - if fsdp_shard_model_param is not None: - shard_model_param = fsdp_shard_model_param - else: - shard_model_param = model_param._data.view(-1)[ - start_offset : start_offset + main_param.numel() - ] - - quantizer = model_param._quantizer - # When not using fp8 params, the main_param (fp32) is first cast to bf16/fp16, and then - # cast to fp8 during forward. This logic keeps numerical consistency with bf16 params. - main_param = main_param.to(model_param.dtype) - out = Float8Tensor( - shape=main_param.size(), - dtype=model_param.dtype, - requires_grad=False, - data=shard_model_param, - fp8_scale_inv=model_param._scale_inv, - fp8_dtype=model_param._fp8_dtype, - quantizer=quantizer, - ) - quantizer.update_quantized(main_param, out) - - amaxes = [] - scales = [] - scale_invs = [] - for model_param in model_params: - quantizer = model_param._quantizer - amaxes.append(quantizer.amax.view(1)) - scales.append(quantizer.scale.view(1)) - scale_invs.append(model_param._scale_inv.view(1)) - model_param._reset_caches() - - dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") - - # Update scaling factors. - packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) - packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] - _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) - torch.reciprocal(packed_scales, out=packed_scales) - _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) - - # Reduce amaxes. - # Note: Assume each param has a separate amax. - packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) - packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] - _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) - torch.distributed.all_reduce( - packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group - ) - _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index 46b97743385..cdd9d8bf0a1 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -33,17 +33,6 @@ from torch.distributed.tensor import DTensor, Replicate, Shard from torch.distributed.tensor.device_mesh import _mesh_resources -from .mixed_precision import ( - fp8_discard_transpose_cache, - fp8_get_raw_data, - fp8_need_transpose_data, - fp8_need_transpose_data_for_meta_device_init, - fp8_quantize, - fp8_set_raw_data, - is_blockwise_float8tensor, - is_float8tensor, - is_te_min_version, -) from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor from .utils import ( _MODEL_PARALLEL_RNG_TRACKER_NAME, @@ -62,15 +51,27 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) + from megatron.core.fp8_utils import ( + is_float8tensor, + modify_underlying_storage, + quantize_param_shard, + ) from megatron.core.tensor_parallel import get_cuda_rng_tracker - from megatron.core.utils import is_submodule + from megatron.core.utils import is_submodule, is_te_min_version logger.info("Detected Megatron Core, using Megatron-FSDP with Megatron.") except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import get_cuda_rng_tracker, is_submodule + from .utils import ( + get_cuda_rng_tracker, + is_float8tensor, + is_submodule, + is_te_min_version, + modify_underlying_storage, + quantize_param_shard, + ) logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") @@ -816,7 +817,7 @@ def __init__( data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, dp_rank: Optional[int] = None, temporary_bucket_allocator: Optional[TemporaryBucketAllocator] = None, - is_transpose_buffer: bool = False, + is_dtype_float8: bool = False, gradient_scaling_factor: Optional[float] = None, chunk_size_factor: int = 1, mem_alloc_context: Optional[Callable] = None, @@ -849,7 +850,7 @@ def __init__( self.temporary_bucket_allocator = ( temporary_bucket_allocator if temporary_bucket_allocator else TemporaryBucketAllocator() ) - self.is_transpose_buffer = is_transpose_buffer + self.is_dtype_float8 = is_dtype_float8 self.gradient_scaling_factor = gradient_scaling_factor self.mem_alloc_context = mem_alloc_context if mem_alloc_context else nullcontext @@ -945,11 +946,11 @@ def fetch_bucket( for p in self.params: item_id = self.param_idx[p] p = to_local_if_dtensor(p) - data = self.get_item_from_bucket(bucket, item_id).view(p.shape) if is_float8tensor(p): - fp8_set_raw_data(p, data, self.is_transpose_buffer) + p._data = self.get_item_from_bucket(bucket, item_id).view(p.shape) else: - p.data = data + p.data = self.get_item_from_bucket(bucket, item_id).view(p.shape) + return bucket def free_bucket_storage(self): @@ -1118,9 +1119,6 @@ def set_item(self, item_id: int, item_data: torch.Tensor) -> None: # When fully sharded, we need to get the slice of the item to be stored in this shard. # Otherwise, we can just flatten the entire item since this buffer contains # the entire bucket. - if is_float8tensor(item_data): - item_data = fp8_get_raw_data(item_data, self.is_transpose_buffer) - if self.is_data_distributed: # Get the coordinates of the slice of the item that is contained in this shard. slice_start, slice_end = self._get_item_slice_in_shard(item_id) @@ -1227,8 +1225,6 @@ class ParameterGroup: Factor determining chunk size for grouped parameter processing. model_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store model weights for data-parallel operations. - transpose_weight_buffer (Optional[DataParallelBuffer]): - Buffer used to store transpose weights for data-parallel operations. main_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store main model weights for data-parallel operations. main_grad_buffer (Optional[DataParallelBuffer]): @@ -1248,7 +1244,6 @@ class ParameterGroup: fsdp_unit_id: Optional[int] = None chunk_size_factor: int = 1 model_weight_buffer: Optional[DataParallelBuffer] = None - transpose_weight_buffer: Optional[DataParallelBuffer] = None main_weight_buffer: Optional[DataParallelBuffer] = None main_grad_buffer: Optional[DataParallelBuffer] = None hsdp_wbuf: Optional[DataParallelBuffer] = None @@ -1319,10 +1314,12 @@ def _does_param_require_new_bucket(param): parameter_groups = [] for name, param in module.named_parameters(): # We need this information to correctly dynamically allocate Tensors! - is_fp8 = is_float8tensor(param) - is_fp8_meta_device_init = meta_device_init_fp8_params.get(name, (False, False))[0] param_attrs = dict( - dtype="float8" if (is_fp8 or is_fp8_meta_device_init) else param.dtype, + dtype=( + "float8" + if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) + else param.dtype + ), is_expert_param=is_expert_parameter(name, param), requires_grad=param.requires_grad, fsdp_unit_id=None, @@ -1645,10 +1642,7 @@ def __init__( # to determine whether this parameter is fp8 or not. fp8_meta_index = m.param_init_meta[name].fp8_meta_index if m.primary_weights_in_fp8 and fp8_meta_index is not None: - meta_device_init_fp8_params[self.param_to_name[param]] = ( - True, - fp8_need_transpose_data_for_meta_device_init(m), - ) + meta_device_init_fp8_params[self.param_to_name[param]] = True # Get the parameter groups. (self.parameter_groups, self.param_to_param_group, self.bucket_to_bucket_group) = ( @@ -1775,7 +1769,6 @@ def _bytes_to_mb(bytes_val: int) -> str: numel = sum(to_local_if_dtensor(p).shape.numel() for p in group.params) buffers = { "weight": group.model_weight_buffer, - "transpose_weight": group.transpose_weight_buffer, "main_weight": group.main_weight_buffer, "grad": group.main_grad_buffer, } @@ -1845,18 +1838,12 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): self.weight_alloc = FixedPoolAllocator( name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) - self.transpose_weight_alloc = FixedPoolAllocator( - name="fsdp_fp8_transpose_params", - fsdp_param_groups=self.parameter_groups, - size=UB_BUFFER_NUM, - ) self.main_grad_alloc = FixedPoolAllocator( name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units else: self.weight_alloc = StorageResizeBasedBucketAllocator() - self.transpose_weight_alloc = StorageResizeBasedBucketAllocator() self.main_grad_alloc = None self.double_buf_units = [] @@ -1896,9 +1883,8 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ) # Check if the parameter group is FP8. one_param = group.params[0] - is_dtype_float8 = ( - is_float8tensor(one_param) - or meta_device_init_fp8_params.get(self.param_to_name[one_param], (False, False))[0] + is_dtype_float8 = is_float8tensor(one_param) or meta_device_init_fp8_params.get( + self.param_to_name[one_param], False ) if is_dtype_float8: param_dtype = torch.uint8 @@ -1907,16 +1893,6 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): param_dtype = group.params[0].dtype grad_dtype = param_dtype - # Check if the parameter group needs a transpose buffer for model weights. - # Currently, only mxfp8 needs it. - need_transpose_data = is_float8tensor(one_param) and fp8_need_transpose_data(one_param) - need_transpose_data_for_meta_device_init = meta_device_init_fp8_params.get( - self.param_to_name[one_param], (False, False) - )[1] - should_create_transpose_weight_buffer = ( - need_transpose_data or need_transpose_data_for_meta_device_init - ) - # Check if the parameter group requires a grad buffer or main weight buffer. should_create_grad_buffer_or_main_weight_buffer = ( not self.only_create_grad_buffer_and_main_weight_buffer_for_param_requires_grad @@ -1933,29 +1909,13 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=param_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_transpose_buffer=False, + is_dtype_float8=is_dtype_float8, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, mem_alloc_context=self.mem_alloc_context, **main_buf_extra_kwargs, ) - if should_create_transpose_weight_buffer: - group.transpose_weight_buffer = DataParallelBuffer( - self.ddp_config, - group.params, - is_data_distributed=is_model_weight_buffer_distributed - and main_buf_dp_group.size() > 1, - dtype=param_dtype, - device=self.device, - data_parallel_group=main_buf_dp_group, - is_transpose_buffer=True, - temporary_bucket_allocator=self.transpose_weight_alloc, - bucket_id=group_id, - chunk_size_factor=group.chunk_size_factor, - mem_alloc_context=self.mem_alloc_context, - **main_buf_extra_kwargs, - ) # Initialize the main weight buffer. if should_create_grad_buffer_or_main_weight_buffer and preserve_fp32_weights: @@ -1987,7 +1947,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=torch.float32 if grad_reduce_in_fp32 else grad_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_transpose_buffer=False, + is_dtype_float8=False, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -2011,7 +1971,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=wbuf.dtype, device=wbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_transpose_buffer=False, + is_dtype_float8=wbuf.is_dtype_float8, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, @@ -2027,9 +1987,6 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ), ) - if group.transpose_weight_buffer is not None: - raise NotImplementedError("HSDP for transpose buffer is not implemented yet") - if should_create_grad_buffer_or_main_weight_buffer: # Initialize the HSDP grad buffer. gbuf = group.main_grad_buffer @@ -2041,7 +1998,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=gbuf.dtype, device=gbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_transpose_buffer=False, + is_dtype_float8=gbuf.is_dtype_float8, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -2124,20 +2081,6 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): torch.empty(wbuf.data_size, dtype=wbuf.dtype, device=self.device) ) bucket = wbuf.fetch_bucket() - - tbuf = group.transpose_weight_buffer - if tbuf: - with self.mem_alloc_context(): - if group.hsdp_wbuf: - raise NotImplementedError( - "HSDP for transpose buffer is not implemented yet" - ) - else: - tbuf.init_data( - torch.empty(tbuf.data_size, dtype=tbuf.dtype, device=self.device) - ) - transpose_bucket = tbuf.fetch_bucket() - mbuf = group.main_weight_buffer if mbuf: # Manually instantiate an empty tensor into the main weight buffer. @@ -2191,41 +2134,25 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): if not self.ddp_config.keep_fp8_transpose_cache: for _param in m.parameters(recurse=False): if is_float8tensor(_param): - fp8_discard_transpose_cache(_param) + _param._transpose_invalid = True + _param._transpose = None # Raise error if a meta parameter still exists after initialization. assert not p.is_meta, (self.param_to_name[p], module_reset_flag) - p_local = to_local_if_dtensor(p) - # Copy the model weight parameter tensor into the buffer. # When distributed, this shards and preserves the data across all ranks. - wbuf.set_item(item_id, p_local) - if tbuf: - tbuf.set_item(item_id, p_local) + wbuf.set_item(item_id, to_local_if_dtensor(p)) # Retrieve the newly allocated parameter data from the global bucket. # Attach the bucket-allocated parameter data to the module parameter, # to use the bucket-allocated data for autograd and NCCL. - new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view(p_local.shape) - if tbuf: - new_transpose_data = tbuf.get_item_from_bucket( - transpose_bucket, item_id - ).view(p_local.shape) - else: - new_transpose_data = None - - if is_float8tensor(p_local): - old_param_data = fp8_get_raw_data(p_local) - assert old_param_data._base is None - new_param_data.detach().copy_(old_param_data) - fp8_set_raw_data(p_local, new_param_data) - del old_param_data - if new_transpose_data is not None: - old_transpose_data = fp8_get_raw_data(p_local, True) - assert old_transpose_data._base is None - new_transpose_data.detach().copy_(old_transpose_data) - fp8_set_raw_data(p_local, new_transpose_data, True) - del old_transpose_data + new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view( + to_local_if_dtensor(p).shape + ) + if is_float8tensor(p): + # Needed to instantiate FP8 parameters. Requires installing + # TransformerEngine. + modify_underlying_storage(p, new_param_data) elif isinstance(p, DTensor): old_param_data = p._local_tensor.data p._local_tensor.data = new_param_data @@ -2263,12 +2190,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # the (high-precision) main weight buffer. # Nothing else needs to be done, because the main weights # do not require autograd operations, only possibly sharding. - p_local = to_local_if_dtensor(p) - assert not is_float8tensor(p_local), ( - self.param_to_name[p], - "fp8 param should use get_high_precision_init_val method.", - ) - mbuf.set_item(item_id, p_local) + mbuf.set_item(item_id, to_local_if_dtensor(p)) if wbuf and wbuf.is_data_distributed: # Free the memory backing the temporarily-allocated bucket associated @@ -2280,9 +2202,6 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # before forward activations and gradients are allocated in training. wbuf.free_bucket_storage() - if tbuf and tbuf.is_data_distributed: - tbuf.free_bucket_storage() - # Allocate the main_weight buffer and main_grad buffer data in one buffer. if self.buffer_all_in_one: with self.mem_alloc_context(): @@ -2406,7 +2325,6 @@ def _reset_parameters(self, old_params, new_params): group.params[item_id] = new_p for buf in [ group.model_weight_buffer, - group.transpose_weight_buffer, group.main_weight_buffer, group.main_grad_buffer, group.hsdp_wbuf, @@ -2454,7 +2372,6 @@ def _init_distributed_params(self): dist_main_weight = {} for pg in self.parameter_groups: wbuf = pg.model_weight_buffer - tbuf = pg.transpose_weight_buffer mbuf = pg.main_weight_buffer for item_id, orig_param in enumerate(pg.params): param_name = self.param_to_name[orig_param] @@ -2482,7 +2399,6 @@ def _init_distributed_params(self): ) dist_main_weight[param_name] = dist_param elif wbuf: - assert tbuf is None, "Transpose buffer should only exist when main params exist" dist_param = make_fsdp_dtensor( local_tensor=wbuf.get_item(item_id, only_shard=sharded_optimizer_state), param=orig_param, @@ -2652,54 +2568,9 @@ def copy_main_weights_to_model_weights(self): expert_param_quantize_kwargs = copy.deepcopy(dense_param_quantize_kwargs) data_parallel_group = None expert_data_parallel_group = None - clear_quantize_kwargs = lambda kwargs: [d.clear() for d in kwargs.values()] - - def _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs): - if len(dense_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 parameters, we need to quantize them. - fp8_quantize(data_parallel_group=data_parallel_group, **dense_param_quantize_kwargs) - - if len(expert_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 expert parameters, we need to quantize them. - fp8_quantize( - data_parallel_group=expert_data_parallel_group, **expert_param_quantize_kwargs - ) - - clear_quantize_kwargs(dense_param_quantize_kwargs) - clear_quantize_kwargs(expert_param_quantize_kwargs) - - # Special handling of blockwise FP8 - BATCH_QUANT_MEMORY_LIMIT_BYTES = 5 * 1024**3 # 5 GB - blockwise_fp8_weight_buffers = [] - blockwise_fp8_param_buffers = [] - - def _batch_quantize_blockwise_fp8_params( - dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers - ): - if len(blockwise_fp8_param_buffers) == 0: - return - - # Copy original param shards into their blockwise FP8 working buffers - for bufs in blockwise_fp8_param_buffers: - bufs["bucket_param"].copy_(bufs["param"]) - - # Apply FP8 quantization to blockwise FP8 parameters - _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) - - # Copy quantized params back from working buffers to original param tensors - for bufs in blockwise_fp8_param_buffers: - bufs["param"].copy_(bufs["bucket_param"]) - blockwise_fp8_param_buffers.clear() - - # Free bucket storage for blockwise FP8 weight buffers - for wbuf in blockwise_fp8_weight_buffers: - wbuf.free_bucket_storage() - blockwise_fp8_weight_buffers.clear() - for pg in self.parameter_groups: mbuf = pg.main_weight_buffer wbuf = pg.model_weight_buffer - tbuf = pg.transpose_weight_buffer if mbuf is None: continue @@ -2715,88 +2586,44 @@ def _batch_quantize_blockwise_fp8_params( shard_offsets_in_fp8 = quantize_func_kwargs["start_offsets"] shard_model_params = quantize_func_kwargs["fsdp_shard_model_params"] - has_blockwise_fp8_param = False for param in pg.params: item_id = mbuf.param_idx[param] if wbuf: if wbuf.is_data_distributed or mbuf.is_data_distributed: model_param = wbuf.get_item(item_id, only_shard=True) - if tbuf: - transpose_param = tbuf.get_item(item_id, only_shard=True) - else: - transpose_param = None main_weight = mbuf.get_item(item_id, only_shard=True) else: model_param = wbuf.get_item(item_id) - if tbuf: - transpose_param = tbuf.get_item(item_id) - else: - transpose_param = None main_weight = mbuf.get_item(item_id) else: assert not mbuf.is_data_distributed model_param = to_local_if_dtensor(param) main_weight = mbuf.get_item(item_id) - if is_blockwise_float8tensor(param): - fp8_params.append(param) - if model_param.numel() == 0: - shard_fp32_from_fp8.append(None) - shard_offsets_in_fp8.append(None) - shard_model_params.append([None, None]) - else: - shard_fp32_from_fp8.append(main_weight) - shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) - bucket = wbuf.fetch_bucket() - b_model_param = wbuf.get_item_from_bucket(bucket, item_id)[ - slice(*wbuf.locate_item_in_global_item(item_id)) - ] - assert ( - transpose_param is None - ), "Blockwise FP8 does not support transpose param." - shard_model_params.append([b_model_param, None]) - assert b_model_param.numel() == model_param.numel(), ( - f"Blockwise FP8 bucket param numel {b_model_param.numel()} does" - f" not match model param numel {model_param.numel()}" - f" name: {self.param_to_name[param]}" - ) - blockwise_fp8_param_buffers.append( - {"bucket_param": b_model_param, "param": model_param} - ) - has_blockwise_fp8_param = True - continue - if is_float8tensor(param): fp8_params.append(param) if model_param.numel() == 0: shard_fp32_from_fp8.append(None) shard_offsets_in_fp8.append(None) - shard_model_params.append([None, None]) + shard_model_params.append(None) else: shard_fp32_from_fp8.append(main_weight) shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) - shard_model_params.append([model_param, transpose_param]) + shard_model_params.append(model_param) continue if model_param.numel() > 0: model_param.data.copy_(main_weight.view(model_param.shape)) - if has_blockwise_fp8_param: - blockwise_fp8_weight_buffers.append(wbuf) - if ( - sum([wbuf.bucket_index.size for wbuf in blockwise_fp8_weight_buffers]) - > BATCH_QUANT_MEMORY_LIMIT_BYTES - ): - _batch_quantize_blockwise_fp8_params( - dense_param_quantize_kwargs, - expert_param_quantize_kwargs, - blockwise_fp8_param_buffers, - ) + if len(dense_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 parameters, we need to quantize them. + dense_param_quantize_kwargs["data_parallel_group"] = data_parallel_group + quantize_param_shard(**dense_param_quantize_kwargs) - _batch_quantize_blockwise_fp8_params( - dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers - ) - _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) + if len(expert_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 expert parameters, we need to quantize them. + expert_param_quantize_kwargs["data_parallel_group"] = expert_data_parallel_group + quantize_param_shard(**expert_param_quantize_kwargs) @torch.no_grad() def copy_model_weights_to_main_weights(self): @@ -2814,7 +2641,6 @@ def copy_model_weights_to_main_weights(self): f"Master weight buffer size {mbuf.data.numel()} does not match " f"model weight buffer size {copyin_data.numel()}" ) - # TODO(mxfp8): Make sure it's not a fp8 buf? mbuf.data.copy_(copyin_data.data) def all_gather_parameters(self, async_op: bool = True): @@ -2832,18 +2658,15 @@ def all_gather_parameters(self, async_op: bool = True): all_gather_ops = [] for g in self.parameter_groups: - for buf in [g.model_weight_buffer, g.transpose_weight_buffer]: - if buf is None: - continue - shard = buf.get_shard_from_local_buffer() - all_gather_handler = torch.distributed.all_gather_into_tensor( - output_tensor=buf.data, - input_tensor=shard, - group=buf.data_parallel_group, - async_op=async_op, - ) - if async_op: - all_gather_ops.append(all_gather_handler) + shard = g.model_weight_buffer.get_shard_from_local_buffer() + all_gather_handler = torch.distributed.all_gather_into_tensor( + output_tensor=g.model_weight_buffer.data, + input_tensor=shard, + group=g.model_weight_buffer.data_parallel_group, + async_op=async_op, + ) + if async_op: + all_gather_ops.append(all_gather_handler) for op in all_gather_ops: op.wait() @@ -2864,7 +2687,7 @@ def reduce_scatter_gradients(self, async_op: bool = True): reduce_scatter_ops = [] for g in self.parameter_groups: gbuf = g.main_grad_buffer - if gbuf is None: + if gbuf is not None: continue scaling_factor = gbuf.gradient_scaling_factor reduce_op = gradient_reduce_preprocessing(gbuf.data, scaling_factor, self.ddp_config) @@ -3314,16 +3137,9 @@ def __init__( # Track the status of all-gather operations for each bucket. self.param_gather_event_map = {} # All buckets are initially deallocated / empty after initialization of ParamAndGradBuffer. - self.bucket_status = {} - for i in range(self.buffer.num_buckets): - for bwd in [False, True]: - self.bucket_status[self.get_bucket_key(i, bwd)] = BucketStatus.EMPTY - + self.bucket_status = {i: BucketStatus.EMPTY for i in range(self.buffer.num_buckets)} # Track whether each bucket can be deallocated. - self.bucket_can_be_released = {} - for i in range(self.buffer.num_buckets): - for bwd in [False, True]: - self.bucket_can_be_released[self.get_bucket_key(i, bwd)] = False + self.bucket_can_be_released = {i: False for i in range(self.buffer.num_buckets)} # Map each bucket to the bucket group it belongs to by enumerated ID. # Made to collect a subset of buckets in the same bucket group. @@ -3348,13 +3164,6 @@ def __init__( # all-gather parameters across groups. self.outer_fsdp_group_param_gather_stream = torch.cuda.Stream() - def get_bucket_key(self, bucket_id, bwd): - """Get the key for the bucket.""" - has_transpose_buffer = ( - self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None - ) - return (bucket_id, has_transpose_buffer and bwd) - @property def num_buckets(self): """Return the number of buckets.""" @@ -3371,11 +3180,10 @@ def reset(self): UserWarning, ) while len(self.param_gather_event_map) > 0: - (bucket_id, bwd) = next(iter(self.param_gather_event_map)) - self.wait_bucket_ready(bucket_id, bwd) + bucket_id = next(iter(self.param_gather_event_map)) + self.wait_bucket_ready(bucket_id) for bucket_id in range(self.num_buckets): - for bwd in [False, True]: - self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = True + self.bucket_can_be_released[bucket_id] = True self.recycle_unused_buckets() assert all([status is BucketStatus.EMPTY for status in self.bucket_status.values()]), ( @@ -3397,7 +3205,6 @@ def all_gather_params( suggested_AG_prefetch_size: Optional[int] = None, async_param_gather: bool = True, outer_fsdp_group_param_gather: bool = False, - bwd: bool = False, ): """All-gather the params. If prefetch is enabled, prefetch next buckets in the order of `prefetch_order`. @@ -3432,7 +3239,7 @@ def all_gather_params( # Do not release the buckets that are being all-gathered. for bucket_id in ag_buckets: - self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = False + self.bucket_can_be_released[bucket_id] = False # If prefetch is enabled, we will add prefetch buckets to ag_buckets. if prefetch: @@ -3504,11 +3311,7 @@ def need_skip_prefetch(bucket_id): bucket_id = next_bucket_id(ag_buckets) # Only all-gather on buckets that have not been allocated yet. - ag_buckets = [ - bucket_id - for bucket_id in ag_buckets - if self.bucket_status[self.get_bucket_key(bucket_id, bwd)] == BucketStatus.EMPTY - ] + ag_buckets = [i for i in ag_buckets if self.bucket_status[i] == BucketStatus.EMPTY] if len(ag_buckets) == 0: return @@ -3527,7 +3330,6 @@ def need_skip_prefetch(bucket_id): self.ag_stream if self.ag_stream is not None else torch.cuda.current_stream() ) if outer_fsdp_group_param_gather: - # TODO(mxfp8): Support hsdp self.outer_fsdp_group_param_gather_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self.outer_fsdp_group_param_gather_stream): outer_fsdp_group = self.buffer.dist_index.get_outer_fsdp_group() @@ -3555,13 +3357,12 @@ def need_skip_prefetch(bucket_id): for bucket_id in buckets: # All-gather the module weights from each FSDP buffer shard # into an allocated bucket containing unsharded weights. - self.async_bucket_gather(bucket_id, bwd) + self.async_bucket_gather(bucket_id) # Replace the parameter all-gather event with coalescing event. for bucket_id in buckets: - bucket_key = self.get_bucket_key(bucket_id, bwd) - _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_key] - self.param_gather_event_map[bucket_key] = ( + _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_id] + self.param_gather_event_map[bucket_id] = ( coalescing_event, mark_bucket_ready_to_use, ) @@ -3569,16 +3370,14 @@ def need_skip_prefetch(bucket_id): # Wait for all-gather to finish if not async_param_gather: for bucket_id in buckets: - self.wait_bucket_ready(bucket_id, bwd) + self.wait_bucket_ready(bucket_id) - def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False): + def wait_bucket_ready(self, bucket_id, empty_ok=False): """Wait for the bucket to be ready.""" - bucket_key = self.get_bucket_key(bucket_id, bwd) - - if self.bucket_status[bucket_key] == BucketStatus.READY_TO_USE: + if self.bucket_status[bucket_id] == BucketStatus.READY_TO_USE: # Already ready to use. return - if self.bucket_status[bucket_key] == BucketStatus.EMPTY: + if self.bucket_status[bucket_id] == BucketStatus.EMPTY: if empty_ok: return # Bucket shouldn't be empty, this implies that the bucket @@ -3586,64 +3385,48 @@ def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False): raise ValueError(f"Bucket {bucket_id} is empty.") # Wait for asynchronous / overlapped NCCL operations to complete. - param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_key) + param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_id) param_gather_event.wait() mark_bucket_ready_to_use() @torch.no_grad() - def release_bucket(self, bucket_id, bwd): + def release_bucket(self, bucket_id: int): """Release the bucket.""" - # TODO(mxfp8): In some cases, there won't be ag before bwd? - bucket_key = self.get_bucket_key(bucket_id, bwd) - - if self.bucket_status[bucket_key] == BucketStatus.EMPTY: + if self.bucket_status[bucket_id] == BucketStatus.EMPTY: return - self.wait_bucket_ready(bucket_id, bwd, empty_ok=True) - if self.bucket_status[bucket_key] == BucketStatus.COMMUNICATING: + self.wait_bucket_ready(bucket_id, empty_ok=True) + if self.bucket_status[bucket_id] == BucketStatus.COMMUNICATING: raise ValueError(f"Bucket {bucket_id} is communicating.") - if bwd and self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None: - buf = self.buffer.parameter_groups[bucket_id].transpose_weight_buffer - else: - buf = self.buffer.parameter_groups[bucket_id].model_weight_buffer - - buf.free_bucket_storage() - self.bucket_status[bucket_key] = BucketStatus.EMPTY + wbuf = self.buffer.parameter_groups[bucket_id].model_weight_buffer + wbuf.free_bucket_storage() + self.bucket_status[bucket_id] = BucketStatus.EMPTY def recycle_unused_buckets(self): """Recycle the unused buckets.""" - for bucket_key, can_be_released in self.bucket_can_be_released.items(): + for bucket_id, can_be_released in self.bucket_can_be_released.items(): if can_be_released: - bucket_id, is_transpose_weight = bucket_key[0], bucket_key[1] - self.release_bucket(bucket_id, is_transpose_weight) - self.bucket_can_be_released[bucket_key] = False + self.release_bucket(bucket_id) + self.bucket_can_be_released[bucket_id] = False - def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBuffer: + def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer: """Get the FSDP buffer with the given bucket ID.""" param_group = self.buffer.parameter_groups[bucket_id] if self.buffer.ddp_config.outer_dp_sharding_strategy != "no_shard": - if bwd and param_group.transpose_weight_buffer is not None: - raise RuntimeError("Transpose buffer is not supported for HSDP") - else: - return param_group.hsdp_wbuf - if bwd and param_group.transpose_weight_buffer is not None: - return param_group.transpose_weight_buffer - else: - return param_group.model_weight_buffer + return param_group.hsdp_wbuf + return param_group.model_weight_buffer @torch.no_grad() - def async_bucket_gather(self, bucket_id, bwd) -> None: + def async_bucket_gather(self, bucket_id: int) -> None: """All-gather the bucket and set the items.""" - bucket_key = self.get_bucket_key(bucket_id, bwd) - - self.bucket_can_be_released[bucket_key] = False - if self.bucket_status[bucket_key] != BucketStatus.EMPTY: + self.bucket_can_be_released[bucket_id] = False + if self.bucket_status[bucket_id] != BucketStatus.EMPTY: return - self.bucket_status[bucket_key] = BucketStatus.COMMUNICATING + self.bucket_status[bucket_id] = BucketStatus.COMMUNICATING - wbuf = self.get_fsdp_buffer(bucket_id, bwd) + wbuf = self.get_fsdp_buffer(bucket_id) # Lazy release the unused buckets. self.recycle_unused_buckets() @@ -3658,21 +3441,18 @@ def async_bucket_gather(self, bucket_id, bwd) -> None: async_op=True, ) - def get_closure(bucket_id, bwd): + def get_closure(bucket_id): @torch.no_grad() def mark_bucket_ready_to_use(): # Mark the bucket as ready to use - all NCCL operations are complete. - self.bucket_status[self.get_bucket_key(bucket_id, bwd)] = BucketStatus.READY_TO_USE + self.bucket_status[bucket_id] = BucketStatus.READY_TO_USE return mark_bucket_ready_to_use - mark_bucket_ready_to_use = get_closure(bucket_id, bwd) + mark_bucket_ready_to_use = get_closure(bucket_id) # Track the async all-gather operation for the bucket. - self.param_gather_event_map[self.get_bucket_key(bucket_id, bwd)] = ( - param_gather_event, - mark_bucket_ready_to_use, - ) + self.param_gather_event_map[bucket_id] = (param_gather_event, mark_bucket_ready_to_use) @torch.no_grad() @@ -3765,13 +3545,15 @@ def override_sharded_param_methods_with_safety_checks(params, all_gather_pipelin def override_sharded_param_to_function_closure(p, to_function): def override_sharded_param_to_function(*args, **kwargs): - if p._typed_storage()._size() == 0: - warnings.warn( - "The parameter may be sharded by Megatron-FSDP, " - "no actual 'to' operation is performed." - ) - return torch.empty([]) - return to_function(*args, **kwargs) + bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] + status = all_gather_pipeline.bucket_status[bucket_id] + if status == BucketStatus.READY_TO_USE: + return to_function(*args, **kwargs) + raise RuntimeError( + "This parameter is already shard by MCore FSDP and the " + "shared-state parameter does not support 'to' function." + "please define the dtype and device of the parameter before FSDP wrap." + ) return override_sharded_param_to_function @@ -3779,13 +3561,15 @@ def override_sharded_param_to_function(*args, **kwargs): def override_sharded_param_cpu_function_closure(p, cpu_function): def override_sharded_param_cpu_function(*args, **kwargs): - if p._typed_storage()._size() == 0: - warnings.warn( - "The parameter may be sharded by Megatron-FSDP, " - "no actual 'cpu' operation is performed." - ) - return torch.empty([], device="cpu") - return cpu_function(*args, **kwargs) + bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] + status = all_gather_pipeline.bucket_status[bucket_id] + if status == BucketStatus.READY_TO_USE: + return cpu_function(*args, **kwargs) + warnings.warn( + "The parameters are sharded by MCore FSDP, and no actual cpu " + "operation is performed." + ) + return torch.empty([], device="cpu") return override_sharded_param_cpu_function diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index 3d15711275f..c9679494737 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -19,7 +19,7 @@ from contextlib import nullcontext from functools import reduce from importlib.metadata import version -from typing import Callable, Optional, Sequence, Union +from typing import Callable, List, Optional, Sequence, Union try: import einops @@ -79,6 +79,52 @@ def is_te_min_version(vers, check_equality=True): return te_version > PkgVersion(vers) +# Check if Transformer Engine has class for fp8 tensors. +try: + if is_te_min_version("2.0"): + # In TE2.x, QuantizedTensor is the base class for all different type of fp8 tensors, + # including fp8 tensor for delayed scaling, current scaling and mxfp8, etc. + from transformer_engine.pytorch.tensor import QuantizedTensor as FP8_TENSOR_CLASS + else: + from transformer_engine.pytorch.float8_tensor import Float8Tensor as FP8_TENSOR_CLASS + + HAVE_TE_FP8_TENSOR_CLASS = True +except (ImportError, ModuleNotFoundError): + # FP8 tensor class not found + HAVE_TE_FP8_TENSOR_CLASS = False + +try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale + + multi_tensor_scale_impl = multi_tensor_scale +except ImportError: + try: + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier + + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + import warnings + + warnings.warn( + "Transformer Engine and Apex are not installed. " + "Falling back to local implementations of " + "multi_tensor_applier and multi_tensor_scale" + ) + + def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): + """Multi tensor op applier""" + return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) + + def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): + """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" + for src, dst in zip(tensor_lists[0], tensor_lists[1]): + dst.copy_(src * scale) + + multi_tensor_applier = local_multi_tensor_applier + multi_tensor_scale_impl = local_multi_tensor_scale + + def is_submodule(module, parent_module, strict=True): """ Check if a module is a submodule of another module. @@ -92,6 +138,18 @@ def is_submodule(module, parent_module, strict=True): return False +def is_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a Transformer Engine Float8Tensor. + + Note that in TE2.x, in order to support more recipes, the design of the fp8 tensor class has + changed. Now Float8Tensor is only used for current scaling and delayed scaling. And mxfp8 + and blockwise scaling have their own fp8 tensor classes. These different fp8 tensor classes + are both inherited from QuantizedTensor. So, for TE1.x, FP8_TENSOR_CLASS is Float8Tensor, + and for TE2.x, FP8_TENSOR_CLASS is QuantizedTensor. + """ + return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS) + + def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]: """ Get all the sub-mesh names in the DeviceMesh. @@ -130,6 +188,198 @@ def contains_submesh( return all(submesh_name in device_mesh_names for submesh_name in submesh_names) +def _multi_tensor_copy_this_to_that( + this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None +): + """ + Use multi-tensor-applier to copy values from one list to another. + We don't have a bfloat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16. + """ + if overflow_buf is not None: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + +""" +The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into +several functions. It provides different implementations for each function based on different +versions of TE, ensuring compatibility across various TE versions. + +Currently, there are three functions: + - modify_underlying_storage + This function is used in DDP to place all parameters into a contiguous buffer. For + non-fp8 tensors, replacing their data is simple, just using code like + "tensor.data = new_data". However, for fp8 tensors, their raw data is not stored in the + ".data" attribute, and it varies with different TE versions and different recipes. This + function provides a unified interface to replace the underlying storage of a fp8 tensor. + - quantize_param_shard + This function is used in dist-opt to cast fp32 main params to fp8 params. For non-fp8 + params, this casting is as simple as "bf16_params.copy_(fp32_main_params)"; but for fp8 + params, the casting logic varies with different TE versions and different recipes. This + function provides a unified interface to cast fp32 main params to fp8 params, and also + updates the necessary attributes (like amax, scale, scale_inv or transpose cache) of the + fp8 model params. + - correct_amax_history_if_needed + This function is used to correct the amax history of fp8 tensors. In TE1.x, some inplace + copy operations will write unwanted values to the amax_history of fp8 tensors. This function + corrects the amax_history back. For TE2.x, it's an empty function. + Only useful for delayed scaling. +""" +if HAVE_TE and is_te_min_version("2.2"): + # Supported TE versions: 2.2+ + from transformer_engine.pytorch.tensor import QuantizedTensor + + def _modify_underlying_storage_impl( + fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor + ) -> None: + from transformer_engine.pytorch.tensor.utils import replace_raw_data + + replace_raw_data(fp8_tensor, new_raw_data) + + def _quantize_param_shard_impl( + model_params: List[QuantizedTensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: ProcessGroup, + fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, + ) -> None: + if len(model_params) == 0: + return + + from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 + + args = [model_params, main_params, start_offsets, data_parallel_group] + if fsdp_shard_model_params is not None: + if get_te_version() == PkgVersion("2.3.0.dev0+5fdd7bb") or is_te_min_version("2.3.0"): + args.append(fsdp_shard_model_params) + else: + raise NotImplementedError( + f"FSDP with --fp8-param-gather is not supported in TE v{get_te_version()}" + ) + cast_master_weights_to_fp8(*args) + +elif HAVE_TE and is_te_min_version("2.0"): + # Supported TE versions: 2.0 + from transformer_engine.pytorch.tensor import QuantizedTensor + from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor + + def _modify_underlying_storage_impl( + fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor + ) -> None: + old_raw_data = fp8_tensor._data + assert old_raw_data.dtype == new_raw_data.dtype + new_raw_data.detach().copy_(old_raw_data) + fp8_tensor._data = new_raw_data + del old_raw_data + + def _quantize_param_shard_impl( + model_params: List[QuantizedTensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: ProcessGroup, + fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, + ) -> None: + if len(model_params) == 0: + return + + if fsdp_shard_model_params is None: + fsdp_shard_model_params = [None] * len(model_params) + + for model_param, main_param, start_offset, fsdp_shard_model_param in zip( + model_params, main_params, start_offsets, fsdp_shard_model_params + ): + if main_param is None: + continue + + if fsdp_shard_model_param is not None: + shard_model_param = fsdp_shard_model_param + else: + shard_model_param = model_param._data.view(-1)[ + start_offset : start_offset + main_param.numel() + ] + + quantizer = model_param._quantizer + # When not using --fp8-param-gather, the main_param (fp32) is first cast to bf16/fp16, + # and then cast to fp8 during forward. + # Although it's not necessary when --fp8-param-gather is enabled, we still keep this + # logic to keep numerical consistency. So here cast the main_param to model_param.dtype. + main_param = main_param.to(model_param.dtype) + out = Float8Tensor( + shape=main_param.size(), + dtype=model_param.dtype, + requires_grad=False, + data=shard_model_param, + fp8_scale_inv=model_param._scale_inv, + fp8_dtype=model_param._fp8_dtype, + quantizer=quantizer, + ) + quantizer.update_quantized(main_param, out) + + amaxes = [] + scales = [] + scale_invs = [] + for model_param in model_params: + quantizer = model_param._quantizer + amaxes.append(quantizer.amax.view(1)) + scales.append(quantizer.scale.view(1)) + scale_invs.append(model_param._scale_inv.view(1)) + model_param._reset_caches() + + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") + + # Update scaling factors. + packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) + packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] + _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) + torch.reciprocal(packed_scales, out=packed_scales) + _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) + + # Reduce amaxes. + # Note: Assume each param has a separate amax. + packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) + packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] + _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) + torch.distributed.all_reduce( + packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group + ) + _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) + +else: + # Fallback impl if TE version is invalid or TE is not installed. + def _modify_underlying_storage_impl(*args, **kwargs): + raise RuntimeError( + "Invalid Transformer Engine version for FP8 distributed optimizer, " + "please install Transformer Engine 2.0+ or install Megatron-Core" + ) + + def _quantize_param_shard_impl(*args, **kwargs): + raise RuntimeError( + "Invalid Transformer Engine version for FP8 distributed optimizer, " + "please install Transformer Engine 2.0+ or install Megatron-Core" + ) + + +def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch.Tensor): + """Replace the underlying raw data of a tensor with new data.""" + _modify_underlying_storage_impl(tensor, new_raw_data) + + +def quantize_param_shard( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params=None +): + """Cast shard fp32 main params to fp8 model params.""" + assert HAVE_TE, "Transformer Engine is required for quantizing parameters." + _quantize_param_shard_impl( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params + ) + + def _get_cuda_rng_state( device: Union[int, str, torch.device] = "cuda", clone: bool = False, graph_safe: bool = False ) -> torch.Tensor: diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 0fc00bd91be..dd0281e61b1 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -744,13 +744,6 @@ def validate_args(args, defaults={}): assert args.ckpt_format == "fsdp_dtensor", \ "Megatron FSDP only supports fsdp_dtensor checkpoint format" - - if args.use_megatron_fsdp: - args.reuse_grad_buf_for_mxfp8_param_ag = False - - if args.fsdp_manual_registration: - assert args.use_megatron_fsdp, "FSDP manual registration is only supported with Megatron FSDP" - assert args.nccl_ub, "FSDP manual registration is only supported with nccl-ub option" # Parameters dtype. args.params_dtype = torch.float From bd0694574f82dcafc1b552214fd1937917f45b30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 5 Jan 2026 12:30:18 +0000 Subject: [PATCH 213/248] Revert "[Dev] Partial CUDA Graph support for EP Overlap (#2168)" This reverts commit 8b93e0d6ef0a5ca6ef3c1993b0728447a8ddc4b8. --- .../common/model_chunk_schedule_plan.py | 40 +- .../core/models/gpt/fine_grained_callables.py | 204 ++++------ megatron/core/pipeline_parallel/schedules.py | 105 ----- megatron/core/pipeline_parallel/utils.py | 4 +- megatron/core/transformer/cuda_graphs.py | 84 +--- megatron/core/transformer/moe/moe_layer.py | 7 +- .../core/transformer/transformer_config.py | 15 - .../core/transformer/transformer_layer.py | 36 -- .../test_cuda_graphed_schedule_chunk_1f1b.py | 372 ------------------ .../a2a_overlap/test_schedule_layer_1f1b.py | 2 +- tests/unit_tests/a2a_overlap/utils.py | 1 - .../pipeline_parallel/test_schedules.py | 48 --- .../transformer/test_submodule_callables.py | 16 +- 13 files changed, 130 insertions(+), 804 deletions(-) delete mode 100644 tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 04ca580eeaa..486a498dd73 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -17,7 +17,6 @@ get_comm_stream, get_comp_stream, ) -from megatron.core.transformer.enums import CudaGraphScope class ModelChunkState: @@ -38,20 +37,23 @@ class TransformerLayerSchedulePlan: mtp post process nodes. layer (TransformerLayerSchedulePlan) - ├── attn (TransformerLayerNode): attention -> router -> dispatch preprocess + ├── attn (TransformerLayerNode): attention module + ├── post_attn (TransformerLayerNode): layernorm -> router -> dispatch preprocess ├── moe_dispatch (TransformerLayerNode): dispatch All2All ├── mlp (TransformerLayerNode): mlp module ├── moe_combine (TransformerLayerNode): combine All2All └── mtp_post_process (PostProcessNode): mtp post process Note that MTP layer has the same operation and execution order with TransformerLayer regarding - moe_dispatch, mlp, moe_combine, but contains extra operations in attn and mtp_post_process: + post_attn, moe_dispatch, mlp, moe_combine, but contains extra operations in attn and + mtp_post_process: * mtp.attn wraps around transformer_layer.attn with extra norm, proj and embedding operations. * mtp.mtp_post_process contains output_layer, mtp loss operations, whereas transformer_layer.mtp_post_process is empty. """ attn = None + post_attn = None moe_dispatch = None mlp = None moe_combine = None @@ -115,7 +117,7 @@ def release_state(self): def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): """ Builds the callable nodes for the transformer/mtp layer: - attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. + attn, post_attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. """ from megatron.core.models.gpt.fine_grained_callables import ( TransformerLayerNode, @@ -135,7 +137,16 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): else isinstance(self.layer.mlp, MoELayer) ) - extra_args["config"] = self.layer.config + enable_deepep = ( + self.layer.config.moe_token_dispatcher_type == "flex" + and self.layer.config.moe_flex_dispatcher_backend == "deepep" + ) + enable_hybridep = ( + self.layer.config.moe_token_dispatcher_type == "flex" + and self.layer.config.moe_flex_dispatcher_backend == "hybridep" + ) + extra_args["enable_deepep"] = enable_deepep + extra_args["enable_hybridep"] = enable_hybridep extra_args["is_moe"] = is_moe extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute extra_args["is_mtp"] = is_mtp @@ -156,6 +167,7 @@ def create_node(stream, module, name): ( attn_module, + post_attn_module, moe_dispatch_module, mlp_module, moe_combine_module, @@ -167,9 +179,11 @@ def create_node(stream, module, name): self.attn = create_node(comp_stream, attn_module, "attn") self.mlp = create_node(comp_stream, mlp_module, "mlp") if is_moe: + self.post_attn = create_node(comp_stream, post_attn_module, "post_attn") self.moe_dispatch = create_node(comm_stream, moe_dispatch_module, "moe_dispatch") self.moe_combine = create_node(comm_stream, moe_combine_module, "moe_combine") else: + self.post_attn = NoopScheduleNode() self.moe_dispatch = NoopScheduleNode() self.moe_combine = NoopScheduleNode() @@ -180,11 +194,6 @@ def create_node(stream, module, name): else: self.mtp_post_process = NoopScheduleNode() - # mlp and combine may receive dgrad from attn, which is managed by cuda graph. - if CudaGraphScope.attn in self.config.cuda_graph_scope: - self.mlp.manual_grads_release = False - self.moe_combine.manual_grads_release = False - def get_fp8_context(self): """ Get the fp8 context for the transformer layer. @@ -207,8 +216,8 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) to maximize parallelism and efficiency. When f_layer and b_layer are not None, forward and backward pass are overlapped as follows: - comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd - comp_stream: attn_fwd | mlp_bwd->mlp_bwd_dw->mlp_fwd| attn_bwd + comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd + comp_stream: attn_fwd->post_attn_fwd| mlp_bwd->mlp_bwd_dw->mlp_fwd| post_attn_bwd->attn_bwd For MTP, mtp_post_process_fwd is executed after the combine_fwd in the comp_stream, and mtp_post_process_bwd is executed before the combine_bwd in the comp_stream. @@ -231,6 +240,7 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) if f_layer is not None: with f_layer.get_fp8_context(): f_input = f_layer.attn.forward(f_input) + f_input = f_layer.post_attn.forward(f_input) if b_layer is not None: b_grad = b_layer.mlp.backward(b_grad) @@ -244,6 +254,7 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) b_grad = b_layer.moe_dispatch.backward(b_grad) if b_layer is not None and b_layer.config.ep_overlap_early_attn_memory_release: + b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) if f_layer is not None: @@ -256,6 +267,7 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) f_input = f_layer.mtp_post_process.forward(f_input) if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release: + b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) # Delay the last attn_dw in backward pass (attn_dw of the first layer) @@ -357,10 +369,6 @@ def __init__( model, self._model_chunk_state, self._event, comp_stream ) - # preprocess may receive dgrad from attn, which is managed by cuda graph. - if CudaGraphScope.attn in model.config.cuda_graph_scope: - self.pre_process.manual_grads_release = False - def _build_layer_schedule_plan(self, module, comp_stream, comm_stream): if module is None: return diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index ab76659d01b..a0be55c4ca1 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -6,15 +6,14 @@ from typing import Optional import torch -from torch import Tensor from megatron.core import tensor_parallel -from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, + fine_grained_offloading_group_start, + get_fine_grained_offloading_context, ) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless -from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.multi_token_prediction import ( @@ -43,13 +42,14 @@ def wrapped_func(*args, **kwarg): @internal_api -def should_free_input(name, is_moe, config): +def should_free_input(name, is_moe, enable_deepep, enable_hybridep): """Determine if the node should free its input memory. Args: name: Node name is_moe: Whether it's a MoE model - config: TransformerConfig object + enable_deepep: Whether to use DeepEP dispatcher + enable_hybridep: Whether to use HybridEP dispatcher Returns: bool: Whether to free input memory @@ -57,14 +57,6 @@ def should_free_input(name, is_moe, config): # For dense layers [attn, fake, mlp, fake], the input is needed during backward pass if not is_moe: return False - enable_deepep = ( - config.moe_token_dispatcher_type == "flex" - and config.moe_flex_dispatcher_backend == "deepep" - ) - enable_hybridep = ( - config.moe_token_dispatcher_type == "flex" - and config.moe_flex_dispatcher_backend == "hybridep" - ) # Define which nodes should free input memory # Since we split the computing graph into multiple nodes, we can manually control # when and how to free the input memory. @@ -77,10 +69,7 @@ def should_free_input(name, is_moe, config): # and probs before dispatch A2A and it's not needed anymore after the forward pass # For DeepEP and HybridEP dispatcher mode, they are both needed in backward pass # and cannot be freed. - # If moe_preprocess is in cuda graph scope, tokens and probs are fixed size tensors, - # so they cannot be freed. - "moe_dispatch": not (enable_deepep or enable_hybridep) - and (CudaGraphScope.moe_preprocess not in config.cuda_graph_scope), + "moe_dispatch": not (enable_deepep or enable_hybridep), } return free_input_nodes.get(name, False) @@ -243,13 +232,13 @@ def __init__( it's the per_batch_state_context, o.w. nullcontext name (str): Node name, also used to determine memory strategy bwd_dw_callables (list): List of weight gradient functions for the layer. - extra_args (dict): Extra arguments for the node: is_moe, config. + extra_args (dict): Extra arguments for nodes: is_moe, enable_deepep, enable_hybridep. """ # determine whether to free input memory - config = extra_args.get("config", None) - assert config is not None, "model config must be passed to TransformerLayerNode." is_moe = extra_args.get("is_moe", False) - free_input = should_free_input(name, is_moe, config) + enable_deepep = extra_args.get("enable_deepep", False) + enable_hybridep = extra_args.get("enable_hybridep", False) + free_input = should_free_input(name, is_moe, enable_deepep, enable_hybridep) self.delay_wgrad_compute = extra_args.get("delay_wgrad_compute", False) super().__init__( @@ -314,8 +303,8 @@ def backward_dw(self): module.backward_dw() # the output grad memory is last used in wgrad compute, should be safe to release. - if self.manual_grads_release: - assert self.delay_grads_release, "output grad memory should be valid before wgrad." + assert self.delay_grads_release, "output grad memory should be valid before wgrad." + if self.manual_release_grads: for tensor in self.output_grads: tensor.untyped_storage().resize_(0) self.output_grads = None @@ -368,95 +357,11 @@ def build_transformer_layer_callables(layer: TransformerLayer): and layer.config.moe_flex_dispatcher_backend == "hybridep" ) - class _BackwardDWWrapper: - def __init__(self): - self.graphed_backward_dw_callable = None - self.attn_dw_callable = layer.self_attention.backward_dw - if isinstance(layer.mlp, MoELayer): - self.shared_expert_dw_callable = partial( - layer.mlp.backward_dw, routed_experts=False, shared_experts=True - ) - else: - self.shared_expert_dw_callable = None - self.cuda_graph_scope = layer.config.cuda_graph_scope - - def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable): - """Store the CUDA graphed backward weight gradient callable.""" - self.graphed_backward_dw_callable = graphed_backward_dw_callable - - def backward_dw(self): - """Execute weight gradients, skipping CUDA graphed components during replay.""" - is_replay = hasattr(layer, 'cuda_graphs') and layer.cuda_graphs - if self.shared_expert_dw_callable is not None and ( - not is_replay or CudaGraphScope.moe_router not in self.cuda_graph_scope - ): - self.shared_expert_dw_callable() - if not is_replay or CudaGraphScope.attn not in self.cuda_graph_scope: - self.attn_dw_callable() - if is_replay and self.graphed_backward_dw_callable is not None: - self.graphed_backward_dw_callable() - - attn_backward_dw_wrapper = _BackwardDWWrapper() - def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): """ - Performs same attnention forward logic as GPT Model and forward pass for - computations between attention and dispatch: - pre mlp layernorm->router->dispatch preprocess + Performs same attnention forward logic as GPT Model. """ - - if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: - assert ( - CudaGraphScope.mlp not in layer.config.cuda_graph_scope - and CudaGraphScope.moe not in layer.config.cuda_graph_scope - ), ( - "Supported CUDA graph scope with EP overlap: " - "attn, moe_router, moe_preprocess, mlp, got {}".format( - layer.config.cuda_graph_scope - ) - ) - forward_func = layer._te_cuda_graph_replay - attn_backward_dw_wrapper.set_graphed_backward_dw_callable( - partial(layer.backward_dw_cudagraph, layer.current_microbatch) - ) - else: - # wrapper function that keeps consistent api with cuda graph replay - def forward_func( - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, - rotary_pos_emb: Optional[Tensor] = None, - rotary_pos_cos: Optional[Tensor] = None, - rotary_pos_sin: Optional[Tensor] = None, - packed_seq_params: Optional[PackedSeqParams] = None, - sequence_len_offset: Optional[Tensor] = None, - ): - hidden_states, _ = layer._forward_attention( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - rotary_pos_cos=rotary_pos_cos, - rotary_pos_sin=rotary_pos_sin, - packed_seq_params=packed_seq_params, - sequence_len_offset=sequence_len_offset, - ) - if not isinstance(layer.mlp, MoELayer): - return hidden_states, None, None, None - if layer.recompute_pre_mlp_layernorm: - layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) - else: - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) - - shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) - probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) - local_tokens, probs, _ = layer.mlp.preprocess( - pre_mlp_layernorm_output, probs, routing_map - ) - return hidden_states, local_tokens, probs, shared_expert_output - - hidden_states, local_tokens, probs, shared_expert_output = forward_func( + hidden_states, _ = layer._forward_attention( hidden_states=hidden_states, attention_mask=node.chunk_state.attention_mask, rotary_pos_emb=node.chunk_state.rotary_pos_emb, @@ -465,14 +370,33 @@ def forward_func( packed_seq_params=node.chunk_state.packed_seq_params, sequence_len_offset=node.chunk_state.sequence_len_offset, ) - if not isinstance(layer.mlp, MoELayer): - return hidden_states + return hidden_states + + def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): + """ + Run forward pass for computations between attention and dispatch: + pre mlp layernorm->router->dispatch preprocess + """ + if layer.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") + if layer.recompute_pre_mlp_layernorm: + layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) + else: + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + + probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) + local_tokens, probs, _ = layer.mlp.preprocess(pre_mlp_layernorm_output, probs, routing_map) # Detach here for mlp_bda residual connection node.layer_state.residual = node.detach(hidden_states) if layer.mlp.use_shared_expert and not layer.mlp.shared_expert_overlap: - # Detach here for shared expert connection in moe_combine - node.layer_state.shared_expert_output = node.detach(shared_expert_output) + # Detach here for shared expert connection + node.layer_state.pre_mlp_layernorm_output = node.detach(pre_mlp_layernorm_output) return local_tokens, probs @@ -497,6 +421,7 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): Run forward pass for computations between dispatch and combine: post dispatch->experts->combine preprocess """ + shared_expert_output = None dispatched_probs = node.layer_state.dispatched_probs token_dispatcher = layer.mlp.token_dispatcher if enable_deepep or enable_hybridep: @@ -504,8 +429,10 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # backward graph from connecting to dispatch submodule token_dispatcher._comm_manager.dispatched_probs = dispatched_probs - expert_output, _ = layer.mlp.routed_experts_compute( - dispatched_tokens, dispatched_probs, None + pre_mlp_layernorm_output = getattr(node.layer_state, 'pre_mlp_layernorm_output', None) + shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) + expert_output, mlp_bias = layer.mlp.routed_experts_compute( + dispatched_tokens, dispatched_probs, pre_mlp_layernorm_output ) if layer.recompute_pre_mlp_layernorm: @@ -515,10 +442,16 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # release tensor reference after use node.layer_state.dispatched_probs = None node.layer_state.pre_mlp_layernorm_output = None - - return expert_output - - def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor): + if shared_expert_output is None: + # Return only expert_output, since shared_expert_output causes backward on None + return expert_output + return expert_output, shared_expert_output + + def submodule_combine_forward( + node: ScheduleNode, + output: torch.Tensor, + shared_expert_output: Optional[torch.Tensor] = None, + ): """ # Triggers token combine and the remaining computation in the transformer layer. # The `mlp_bda` computation is placed after `mlp.combine` due to data dependency. @@ -528,11 +461,10 @@ def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor): # with another microbatch's computation and expose the communication. """ residual = node.layer_state.residual - shared_expert_output = getattr(node.layer_state, 'shared_expert_output', None) + output = layer.mlp.combine(output, shared_expert_output) mlp_output_with_bias = (output, None) - if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: - layer.mlp.cudagraph_tensor_store.clear() + with layer.bias_dropout_add_exec_handler(): hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout @@ -568,12 +500,13 @@ def raise_not_implemented(*args): # Build forward and backward callable functions attn_func = submodule_attn_forward + post_attn_func = submodule_post_attn_forward if is_moe else raise_not_implemented dispatch_func = submodule_dispatch_forward if is_moe else raise_not_implemented mlp_func = submodule_moe_forward if is_moe else mlp_wrapper combine_func = submodule_combine_forward if is_moe else raise_not_implemented - forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, None] - backward_dw = {"attn": attn_backward_dw_wrapper, "mlp": layer.mlp} + forward_funcs = [attn_func, post_attn_func, dispatch_func, mlp_func, combine_func, None] + backward_dw = {"attn": layer.self_attention, "mlp": layer.mlp} return forward_funcs, backward_dw @@ -585,7 +518,9 @@ def build_mtp_layer_callables(layer): """ forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) - attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs + attn_forward, post_attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = ( + forward_funcs + ) is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." @@ -646,17 +581,24 @@ def rng_context_wrapper(func, *args, **kwargs): # Build forward and backward callable functions # attn_forward already has rng context, no need to wrap attn_func = submodule_mtp_attn_forward + post_attn_func = partial(rng_context_wrapper, post_attn_forward) dispatch_func = partial(rng_context_wrapper, dispatch_forward) mlp_func = partial(rng_context_wrapper, mlp_forward) combine_func = partial(rng_context_wrapper, combine_forward) mtp_post_process_func = submodule_mtp_postprocess_forward - forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, mtp_post_process_func] - if isinstance(backward_dw["attn"], list): - backward_dw["attn"].append(layer.eh_proj) - else: - backward_dw["attn"] = [backward_dw["attn"], layer.eh_proj] - + forward_funcs = [ + attn_func, + post_attn_func, + dispatch_func, + mlp_func, + combine_func, + mtp_post_process_func, + ] + backward_dw = { + "attn": [layer.transformer_layer.self_attention, layer.eh_proj], + "mlp": layer.transformer_layer.mlp, + } return forward_funcs, backward_dw diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index c41a09ea594..a8fdf2324f2 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -2,7 +2,6 @@ import contextlib from functools import partial -from itertools import zip_longest from typing import Callable, Iterator, List, Optional, Union import torch @@ -844,110 +843,6 @@ def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, s return order -def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chunk, capture_wgrad_graph): - """ - This functions gets the order for overlap_moe_expert_parallel_comm schedule for the original - chunk-wise order list. Each chunk is transformered to chunks with only 1 layer so that - layers between 2 chunks can now overlap with each other while following the graph order. - If capture_wgrad_graph is True, the wgrad backward graph is also added to the order by - decreasing the layer id by 0.5. - - Args: - order (List[int]): The original chunk-wise order list. Positive values represent forward - passes for chunks, negative values represent backward passes. The absolute value - indicates the chunk ID (1-indexed). - num_layers_per_chunk (List[int]): Number of graphable layers in each chunk. The length - of this list equals the number of chunks. - capture_wgrad_graph (bool): If True, weight gradient computation graphs are added to the - order by appending entries with layer_id - 0.5. - - Returns: - Tuple[List[float], List[Optional[List[int]]]]: A tuple containing: - - new_order: The layer-wise order list where each chunk is expanded to individual - layers. Positive values are forward passes, negative values are backward passes. - Values with .5 suffix indicate weight gradient computations. - - chunk_id_list: A list parallel to new_order. For forward passes, contains - [chunk_id, layer_index_within_chunk]. For backward passes, contains None. - - Example: - original_order: [1, 2, -2, 1, -1, -1] - num_layers_per_chunk: [1, 2] - capture_wgrad_graph=True: - new_order: [1, 2, 3, 1, -3, -3.5, -2, -2.5, -1, -1.5, -1, -1.5] - chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, - None, None, None, None, None, None, None] - capture_wgrad_graph=False: - new_order: [1, 2, 3, 1, -3, -2, -1, -1] - chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, None, None, None] - """ - - def _add_order(new_order, chunk_id_list, c_id, layer_id, is_wgrad=False, index=None): - if is_wgrad: - new_order.append(layer_id - 0.5) - else: - new_order.append(layer_id) - if c_id > 0: - chunk_id_list.append([abs(c_id) - 1, index]) - else: - chunk_id_list.append(None) - - new_order = [] - chunk_id_list = [] - add_order = partial(_add_order, new_order, chunk_id_list) - first_backward_idx, last_forward_idx = None, None - for idx, c_id in enumerate(order): - if first_backward_idx is None and c_id < 0: - first_backward_idx = idx - if c_id > 0: - last_forward_idx = idx - - def get_layer_range(c_id): - num_layers = num_layers_per_chunk[abs(c_id) - 1] - num_layers_previous_chunks = sum(num_layers_per_chunk[: abs(c_id) - 1]) - if c_id > 0: - return list( - range(num_layers_previous_chunks + 1, num_layers_previous_chunks + num_layers + 1) - ) - return list(range(-num_layers_previous_chunks - num_layers, -num_layers_previous_chunks)) - - # warmup stage - for c_id in order[:first_backward_idx]: - layer_range = get_layer_range(c_id) - new_order += layer_range - chunk_id_list.extend([abs(c_id) - 1, i] for i in range(len(layer_range))) - - # 1f1b overlap stage - if first_backward_idx < last_forward_idx: - for c_id_b, c_id_f in zip( - order[first_backward_idx : last_forward_idx + 1 : 2], - order[first_backward_idx + 1 : last_forward_idx + 1 : 2], - ): - layer_range_f = get_layer_range(c_id_f) - layer_range_b = get_layer_range(c_id_b) - index = 0 - for l_b, l_f in zip_longest(layer_range_b, layer_range_f, fillvalue=0): - # always forward graph before backward graph - if l_f != 0: - add_order(c_id_f, l_f, index=index) - if l_b != 0: - add_order(c_id_b, l_b) - if capture_wgrad_graph and index < len(layer_range_b) - 1: - add_order(c_id_b, l_b, is_wgrad=True) - index += 1 - # last wgrad backward - if capture_wgrad_graph and layer_range_b: - add_order(c_id_b, layer_range_b[-1], is_wgrad=True) - - # cool down stage, backward graphs only - for c_id in order[last_forward_idx + 1 :]: - for l_b in get_layer_range(c_id): - add_order(c_id, l_b) - if capture_wgrad_graph: - add_order(c_id, l_b, is_wgrad=True) - - return new_order, chunk_id_list - - def forward_backward_pipelining_with_interleaving( *, forward_step_func, diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index d38f6d702c0..e7e416f99bd 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -182,8 +182,8 @@ def __init__( self.free_input = free_input self.inputs = None self.outputs = None - self.manual_grads_release = False self.delay_grads_release = False + self.manual_release_grads = False def default_backward_func(self, outputs, output_grad): """Default backward function""" @@ -269,7 +269,7 @@ def _backward(self, *output_grad): # to avoid delayed garbage collection. If # delay_grads_release is True, dgrad is last used in # wgrad compute and skip the release here. - if self.manual_grads_release and not self.delay_grads_release: + if self.manual_release_grads and not self.delay_grads_release: g.untyped_storage().resize_(0) grads = self.get_grad() diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index b566c1830dc..27e6c65c738 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -10,7 +10,6 @@ from contextlib import nullcontext from dataclasses import fields, is_dataclass from enum import Enum -from math import ceil from typing import Any, Dict, List, Optional import torch @@ -1511,7 +1510,7 @@ def graphs_created(self): """ return self._graphs_created - def _get_sample_arguments(self, order, chunk_id_list=None): + def _get_sample_arguments(self, order): """ Generate sample arguments and keyword arguments for CUDA Graph capturing with memory-optimized buffer reuse. @@ -1540,9 +1539,6 @@ def _get_sample_arguments(self, order, chunk_id_list=None): order (List[int]): The forward/backward execution order from convert_schedule_table_to_order(). Positive integers represent forward passes (1-indexed chunk ID), negative integers represent backward passes. - chunk_id_list (List[Tuple[int, int]]): The list of chunk IDs and layer IDs in the - order. This is useful only when overlap_moe_expert_parallel_comm is enabled, - the order maps each layers' idx to their original chunk id. Returns: Tuple[List[Tuple], List[Dict]]: A tuple containing: @@ -1564,11 +1560,9 @@ def _get_sample_arguments(self, order, chunk_id_list=None): assert self.num_model_chunks == max( order ), "num_model_chunks must match the max chunk id in order." - if chunk_id_list is None: - # check only if 1f1b overlap is disabled. - assert ( - self.num_microbatches == len(order) // self.num_model_chunks // 2 - ), "num_microbatches must match the number of microbatches in order." + assert ( + self.num_microbatches == len(order) // self.num_model_chunks // 2 + ), "num_microbatches must match the number of microbatches in order." # Generate sample arguments and keyword arguments for capturing. sample_args = [None] * (len(self.flattened_callables) * self.num_microbatches) @@ -1651,8 +1645,8 @@ def get_rotary_pos_emb(transformer_module, transformer_input): consumed_sample_queue = {} layer_sample_keys_cache = {} fwd_idx = [0] * self.num_model_chunks - for idx, chunk_id in enumerate(order): - model_chunk_idx = abs(ceil(chunk_id)) - 1 + for chunk_id in order: + model_chunk_idx = abs(chunk_id) - 1 if chunk_id > 0: if model_chunk_idx not in fwd_sample_queues: @@ -1661,14 +1655,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + ( fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] ) - if chunk_id_list: - model_chunk_idx = chunk_id_list[idx][0] - callables_curr_chunk = [ - self.callables_per_chunk[model_chunk_idx][chunk_id_list[idx][1]] - ] - else: - callables_curr_chunk = self.callables_per_chunk[model_chunk_idx] - for layer_idx, layer in enumerate(callables_curr_chunk): + for layer_idx, layer in enumerate(self.callables_per_chunk[model_chunk_idx]): per_callable_fwd_idx = sample_start_idx + layer_idx # Get sample_args and sample_kwargs for index per_callable_fwd_idx. @@ -1705,7 +1692,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # reuse the static inputs of a previous forward pass for this forward pass. # If not, we still need to generate the new static inputs. sample_keys = layer_sample_keys_cache[id(layer)] - model_chunk_idx = abs(chunk_id) - 1 + fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx)) if consumed_sample_queue.get(sample_keys, []): # We can reuse the static inputs of a previous forward pass for this @@ -1727,16 +1714,13 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # Unfortunately, no previous static inputs are available for reuse, # sample_args is still None. Last attempt: generate the new static inputs # for this forward pass. - if chunk_id_list: - model_chunk_idx = chunk_id_list[idx][0] sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( _get_layer_static_inputs( layer, self.chunks_with_decoder[model_chunk_idx] ) ) - model_chunk_idx = abs(chunk_id) - 1 fwd_idx[model_chunk_idx] += 1 - elif ceil(chunk_id) == chunk_id: + else: num_consumed_samples = min( len(fwd_sample_queues[model_chunk_idx]), self.num_layers_per_chunk[model_chunk_idx], @@ -1750,9 +1734,6 @@ def get_rotary_pos_emb(transformer_module, transformer_input): fwd_sample_queues[model_chunk_idx] = fwd_sample_queues[model_chunk_idx][ num_consumed_samples: ] - else: - # skip register static inputs for wgrad backward graphs - continue return sample_args, sample_kwargs @@ -1765,16 +1746,12 @@ def _get_cuda_graph_input_data(self): # Get the PP and VPP scheduling order. from megatron.core.pipeline_parallel.schedules import ( convert_schedule_table_to_order, - get_overlap_moe_expert_parallel_comm_order, get_pp_rank_microbatches, get_schedule_table, ) # If PP is not enabled, we only need to capture one microbatch. - if ( - parallel_state.get_pipeline_model_parallel_world_size() == 1 - and not self.config.overlap_moe_expert_parallel_comm - ): + if parallel_state.get_pipeline_model_parallel_world_size() == 1: assert ( self.num_model_chunks == 1 ), "If PP is not enabled, there should be only one model chunk." @@ -1803,36 +1780,9 @@ def _get_cuda_graph_input_data(self): level=logging.DEBUG, msg=f'Rank {torch.distributed.get_rank()}: ORDER {order}', ) - chunk_id_list = None - if self.config.overlap_moe_expert_parallel_comm: - wgrad_in_graph_scope = CudaGraphScope.attn in self.config.cuda_graph_scope or ( - CudaGraphScope.moe_router in self.config.cuda_graph_scope - and self.config.moe_shared_expert_intermediate_size is not None - and not self.config.moe_shared_expert_overlap - ) - capture_wgrad_graph = self.config.delay_wgrad_compute and wgrad_in_graph_scope - order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order( - order, self.num_layers_per_chunk, capture_wgrad_graph - ) - self.num_layers_per_chunk = [1] * sum(self.num_layers_per_chunk) - self.num_model_chunks = max(order) - _order_without_wgrad = [] - for c_id in order: - if ceil(c_id) != c_id: - continue - _order_without_wgrad.append(c_id) - self.num_microbatches = len(_order_without_wgrad) // self.num_model_chunks // 2 - log_on_each_pipeline_stage( - logger=logger, - tp_group=None, - dp_cp_group=None, - level=logging.DEBUG, - msg=f'Rank {torch.distributed.get_rank()}: ' - f'ORDER after overlap_moe_expert_parallel_comm {order}', - ) # Generate sample arguments and keyword arguments for capturing. - sample_args, sample_kwargs = self._get_sample_arguments(order, chunk_id_list) + sample_args, sample_kwargs = self._get_sample_arguments(order) def get_make_graphed_callables_kwargs(): kwargs = {'allow_unused_input': True, '_order': order} @@ -1970,17 +1920,13 @@ def create_cudagraphs(self): for layer_number, layer in enumerate(layers): layer.cuda_graphs = [] for batch_number in range(self.num_microbatches): - if self.config.overlap_moe_expert_parallel_comm: - graph_idx = ( - num_layers_accumulated + layer_number - ) * self.num_microbatches + batch_number - else: - graph_idx = ( + layer.cuda_graphs.append( + graphs[ num_layers_accumulated * self.num_microbatches + batch_number * len(layers) + layer_number - ) - layer.cuda_graphs.append(graphs[graph_idx]) + ] + ) num_layers_accumulated += len(layers) self._finish_capturing(start_time) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index c8438bb2c8a..10d10f667fe 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -326,11 +326,10 @@ def custom_forward(hidden_states): return outputs - def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False): + def backward_dw(self): """Compute weight gradients for experts and shared experts.""" - if routed_experts: - self.experts.backward_dw() - if shared_experts and self.use_shared_expert and not self.shared_expert_overlap: + self.experts.backward_dw() + if self.use_shared_expert and not self.shared_expert_overlap: self.shared_experts.backward_dw() def set_for_recompute_pre_mlp_layernorm(self): diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index a5636d94e26..6493a4bcce1 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1849,16 +1849,6 @@ def __post_init__(self): 'when enabling overlap_moe_expert_parallel_comm with MTP layer.' ) - if self.cuda_graph_impl != "none": - assert ( - self.cuda_graph_impl == "transformer_engine" - and CudaGraphScope.moe not in self.cuda_graph_scope - and CudaGraphScope.mlp not in self.cuda_graph_scope - ), ( - 'CUDA graph scope on moe and mlp is not ' - 'supported with overlap_moe_expert_parallel_comm' - ) - # Check delay_wgrad_compute compatibility if self.delay_wgrad_compute: assert ( @@ -1867,11 +1857,6 @@ def __post_init__(self): assert ( not self.moe_use_legacy_grouped_gemm ), 'delay_wgrad_compute is not supported with legacy groupedgemm implementation' - if self.cuda_graph_impl == "transformer_engine": - assert is_te_min_version("2.10.0"), ( - 'TE version >= 2.10.0 is required for delay_wgrad_compute with ' - 'partial cuda graph' - ) if self.ep_overlap_early_attn_memory_release: assert self.overlap_moe_expert_parallel_comm, ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index db57e21c891..3ea40577009 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -872,10 +872,6 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() - assert ( - not self.config.overlap_moe_expert_parallel_comm - ), "EP overlap must be \ - disabled when CUDA graph captures the whole MLP/MoE part." elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. @@ -918,35 +914,12 @@ def _te_cuda_graph_replay(self, *args, **kwargs): residual=residual, shared_expert_output=shared_expert_output, ) - # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables - # and should be skipped here. - if self.config.overlap_moe_expert_parallel_comm: - probs, routing_map = self.mlp.route(hidden_states) - hidden_states, probs, residual = self.mlp.preprocess( - hidden_states, probs, routing_map - ) - nvtx_range_pop(suffix="mlp") - return mlp_residual, hidden_states, probs, shared_expert_output mlp_output_with_bias = self.mlp(hidden_states) self.mlp.cudagraph_tensor_store.clear() nvtx_range_pop(suffix="mlp") output = self._forward_post_mlp(mlp_output_with_bias, mlp_residual) else: - # If EP overlap is enabled, needs to return same outputs as submodule.attn - if self.config.overlap_moe_expert_parallel_comm: - assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." - mlp_residual = cuda_graph_output.pop() - if not self.is_moe_layer: - return mlp_residual, None, None, None - hidden_states = self.pre_mlp_layernorm(mlp_residual) - shared_expert_output = self.mlp.shared_experts_compute(hidden_states) - probs, routing_map = self.mlp.route(hidden_states) - hidden_states, probs, residual = self.mlp.preprocess( - hidden_states, probs, routing_map - ) - return mlp_residual, hidden_states, probs, shared_expert_output - # CUDA Graph does not capture the MLP/MoE part at all. output = self._forward_mlp(*cuda_graph_output) return output, context @@ -1034,15 +1007,6 @@ def _should_call_local_cudagraph(self, *args, **kwargs): return True return False - def backward_dw_cudagraph(self, microbatch_idx): - """ - CUDA Graph backward weight gradient computation for this layer. - """ - cg_index = microbatch_idx % len(self.cuda_graphs) - if not hasattr(self.cuda_graphs[cg_index], 'backward_dw'): - return - self.cuda_graphs[cg_index].backward_dw() - def __call__(self, *args, **kwargs): if self._should_call_local_cudagraph(*args, **kwargs): # Inference mode. diff --git a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py deleted file mode 100644 index 91c74fe1bb6..00000000000 --- a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py +++ /dev/null @@ -1,372 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -import gc -import os -import sys - -import pytest -import torch - -from megatron.core.enums import ModelType -from megatron.core.models.gpt.gpt_layer_specs import ( - get_gpt_decoder_block_spec, - get_gpt_mtp_block_spec, -) -from megatron.core.models.gpt.gpt_model import GPTModel -from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator -from megatron.core.pipeline_parallel.utils import set_streams -from megatron.core.tensor_parallel.random import HAVE_TE, model_parallel_cuda_manual_seed -from megatron.core.transformer.enums import CudaGraphScope -from megatron.core.transformer.module import float16_to_fp32 -from megatron.core.utils import is_te_min_version, unwrap_model -from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args -from megatron.training.global_vars import ( - destroy_global_vars, - get_args, - set_args, - set_global_variables, -) -from megatron.training.training import setup_model_and_optimizer -from tests.unit_tests.test_utilities import Utils - - -def is_deep_ep_available(): - from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP - - return HAVE_DEEP_EP - - -def is_hybrid_ep_available(): - from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP - - return HAVE_HYBRIDEP - - -def save(fn, message): - with open(fn, 'w') as f: - f.write(message) - - -class TestPartialCudaGraphedA2AOverlap: - """Test that CUDA graph outputs match ep-overlapped CUDA graph outputs for various scopes.""" - - def setup_method(self, method): - self.seq_length = 512 - self.micro_batch_size = 2 - # Store original environment variable values - self.original_env = { - 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), - 'NVTE_ALLOW_NONDETERMINISTIC_ALGO': os.environ.get('NVTE_ALLOW_NONDETERMINISTIC_ALGO'), - } - self.cuda_graph_helper = None - os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' - os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0' - - def teardown_method(self, method): - # Restore original environment variable values - for key, value in self.original_env.items(): - if value is None: - os.environ.pop(key, None) - else: - os.environ[key] = value - Utils.destroy_model_parallel() - destroy_global_vars() - destroy_num_microbatches_calculator() - if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): - self.cuda_graph_helper.delete_cuda_graphs() - self.cuda_graph_helper = None - - gc.collect() - - def model_provider( - self, - pre_process=True, - post_process=True, - layer_spec_fn=get_gpt_decoder_block_spec, - **config_kwargs, - ): - model_parallel_cuda_manual_seed(123) - args = get_args() - config = core_transformer_config_from_args(args) - transformer_layer_spec = layer_spec_fn( - config, - use_transformer_engine=True, - normalization=args.normalization, - qk_l2_norm=args.qk_l2_norm, - ) - if args.mtp_num_layers: - mtp_block_spec = get_gpt_mtp_block_spec( - config, transformer_layer_spec, use_transformer_engine=True - ) - else: - mtp_block_spec = None - return GPTModel( - config=config, - transformer_layer_spec=transformer_layer_spec, - vocab_size=args.vocab_size, - max_sequence_length=args.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, - parallel_output=True, - share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, - position_embedding_type=args.position_embedding_type, - rotary_percent=args.rotary_percent, - mtp_block_spec=mtp_block_spec, - ) - - def create_test_args( - self, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs - ): - destroy_global_vars() - destroy_num_microbatches_calculator() - - sys.argv = ['test_cuda_graphs.py'] - args = parse_args() - args.num_layers = 1 - args.mtp_num_layers = None - args.vocab_size = 1024 - args.hidden_size = 128 - args.num_attention_heads = 8 - args.max_position_embeddings = 512 - args.global_batch_size = self.micro_batch_size * 8 - args.micro_batch_size = self.micro_batch_size - args.create_attention_mask_in_dataloader = True - args.seq_length = self.seq_length - args.tensor_model_parallel_size = 2 - args.sequence_parallel = True - args.pipeline_model_parallel_size = 1 - args.context_parallel_size = 1 - args.expert_model_parallel_size = ep_size - args.train_iters = 10 - args.lr = 3e-5 - args.bf16 = True - args.add_bias_linear = False - args.swiglu = True - args.use_distributed_optimizer = True - args.position_embedding_type = "rope" - args.rotary_percent = 1.0 - args.hidden_dropout = 0.0 - args.attention_dropout = 0.0 - args.untie_embeddings_and_output_weights = True - - # MoE settings - args.num_experts = 16 - args.expert_model_parallel_size = ep_size - args.moe_shared_expert_intermediate_size = 1024 - args.moe_layer_freq = kwargs.get("moe_layer_freq", "[0,0,1,1]") - args.moe_permute_fusion = True - args.moe_router_fusion = True - args.moe_router_topk = 2 - - # CUDA graph settings - args.cuda_graph_impl = cuda_graph_impl - args.cuda_graph_scope = cuda_graph_scope - args.cuda_graph_warmup_steps = cuda_graph_warmup_steps - args.use_te_rng_tracker = cuda_graph_impl != "none" - - for key, value in kwargs.items(): - assert hasattr(args, key) - setattr(args, key, value) - - validate_args(args) - set_global_variables(args, False) - return args - - def get_batch(self, seq_length, micro_batch_size): - data = list(range(seq_length)) - input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() - attention_mask = torch.ones( - (micro_batch_size, 1, seq_length, seq_length), dtype=bool - ).cuda() - loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() - return input_ids, labels, position_ids, attention_mask, loss_mask - - def _run_1f1b_helper(self, gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps): - from megatron.core.models.common.model_chunk_schedule_plan import ( - TransformerModelChunkSchedulePlan, - ) - from megatron.core.pipeline_parallel.schedules import set_current_microbatch - - schedule_plans = [] - losses = [] - set_current_microbatch(gpt_model[0], 1) - - gpt_model[0].zero_grad_buffer() - optimizer.zero_grad() - assert cuda_graph_warmup_steps > 0, "cuda_graph_warmup_steps must be greater than 0" - for fwd_mb_idx in range(num_iters + 1): - # Capture CUDA graphs after warmup if helper is provided - if self.cuda_graph_helper is not None and fwd_mb_idx == cuda_graph_warmup_steps: - self.cuda_graph_helper.create_cudagraphs() - - if fwd_mb_idx < cuda_graph_warmup_steps: - gpt_model[0].zero_grad_buffer() - optimizer.zero_grad() - output = gpt_model[0].forward(**data) - schedule_plans.append(None) - else: - if fwd_mb_idx == cuda_graph_warmup_steps: - extra_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) - TransformerModelChunkSchedulePlan.run(extra_schedule_plan, None) - schedule_plans[-1] = extra_schedule_plan - f_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) - b_schedule_plan = schedule_plans[-1] - schedule_plans.append(f_schedule_plan) - if b_schedule_plan is not None: - gpt_model[0].zero_grad_buffer() - optimizer.zero_grad() - output = TransformerModelChunkSchedulePlan.run( - f_schedule_plan, - b_schedule_plan, - b_grad=torch.ones_like(output) if fwd_mb_idx > 0 else None, - ) - # Check output shapes - if fwd_mb_idx < num_iters: - assert output is not None - assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length - losses.append(output) - - if fwd_mb_idx < cuda_graph_warmup_steps: - output.backward(torch.ones_like(output)) - - for param in gpt_model[0].parameters(): - assert param.main_grad is not None - - update_successful, _, _ = optimizer.step() - assert update_successful - - return losses - - def _run_test_helper( - self, - ep_size, - cuda_graph_impl, - cuda_graph_scope, - cuda_graph_warmup_steps, - ep_overlap=False, - **kwargs, - ): - """Test fp8_param with gpt_model.""" - args = self.create_test_args( - cuda_graph_impl, - cuda_graph_scope, - cuda_graph_warmup_steps, - ep_size, - overlap_moe_expert_parallel_comm=ep_overlap, - **kwargs, - ) - if ep_overlap: - set_streams() - set_args(args) - torch.manual_seed(123) - Utils.initialize_model_parallel( - tensor_model_parallel_size=2, expert_model_parallel_size=ep_size - ) - - input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( - self.seq_length, self.micro_batch_size - ) - - gpt_model, optimizer, _ = setup_model_and_optimizer( - self.model_provider, ModelType.encoder_or_decoder - ) - assert len(gpt_model) == 1 # Assume only one model in the model provider. - - loss_list = [] - - if cuda_graph_impl == "transformer_engine": - from megatron.core.transformer.cuda_graphs import TECudaGraphHelper - - self.cuda_graph_helper = TECudaGraphHelper( - model=gpt_model, - config=gpt_model[0].config, - seq_length=self.seq_length, - micro_batch_size=self.micro_batch_size, - optimizers=[optimizer], - ) - - num_iters = cuda_graph_warmup_steps + 2 - data = { - "input_ids": input_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "labels": labels, - "loss_mask": loss_mask, - } - if not ep_overlap: - for i in range(num_iters): - gpt_model[0].zero_grad_buffer() - optimizer.zero_grad() - - # Capture CUDA graphs after warmup if helper is provided - if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: - self.cuda_graph_helper.create_cudagraphs() - - output = unwrap_model(gpt_model[0]).forward(**data) - output = float16_to_fp32(output) - - # Check output shapes - assert output.shape[0] == self.micro_batch_size - assert output.shape[1] == self.seq_length - - # Verify gradients - output.backward(torch.ones_like(output)) - for param in gpt_model[0].parameters(): - assert param.main_grad is not None - - update_successful, _, _ = optimizer.step() - assert update_successful - - loss_list.append(output) - else: - loss_list = self._run_1f1b_helper( - gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps - ) - - return loss_list - - @pytest.mark.skipif( - not (HAVE_TE and is_te_min_version("2.10.0")), - reason="Partial CUDA graph support requires TransformerEngine version >= 2.10.0", - ) - @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep"]) - def test_moe_partial_cudagraph_with_ep_overlap(self, moe_dispatcher_type): - extra_kwargs = {"moe_layer_freq": 1} - if moe_dispatcher_type == "deepep": - if not is_deep_ep_available(): - pytest.skip("Deep EP is not available") - extra_kwargs["moe_token_dispatcher_type"] = "flex" - extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" - extra_kwargs["moe_router_dtype"] = "fp32" - elif moe_dispatcher_type == "hybridep": - if not is_hybrid_ep_available(): - pytest.skip("Hybrid EP is not available") - extra_kwargs["moe_token_dispatcher_type"] = "flex" - extra_kwargs["moe_flex_dispatcher_backend"] = "hybridep" - else: - extra_kwargs["moe_token_dispatcher_type"] = moe_dispatcher_type - - loss_list_ref = self._run_test_helper(4, "none", None, 3, **extra_kwargs) - for cuda_graph_scope in [ - [CudaGraphScope.attn], - [CudaGraphScope.attn, CudaGraphScope.moe_router], - [CudaGraphScope.attn, CudaGraphScope.moe_router, CudaGraphScope.moe_preprocess], - ]: - cuda_graph_warmup_steps = 3 - loss_list = self._run_test_helper( - 4, - "transformer_engine", - cuda_graph_scope, - cuda_graph_warmup_steps, - ep_overlap=True, - **extra_kwargs, - ) - assert len(loss_list) == len(loss_list_ref) - for i in range(len(loss_list)): - assert torch.equal( - loss_list[i].mean(), loss_list_ref[i].mean() - ), f"scope={cuda_graph_scope}, i={i},loss_list={loss_list[i]}, loss_list_ref={loss_list_ref[i]}" - print(f"[DEBUG] Pass {cuda_graph_scope}") diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 0fd2c445c9f..7fb97f6e586 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -306,7 +306,7 @@ def test_transformer_layer_overlap_shared_expert(self): "moe_shared_expert_intermediate_size": 512, } overlap_config = get_test_config(extra_kwargs=extra_kwargs) - extra_kwargs["moe_shared_expert_overlap"] = False + extra_kwargs["moe_shared_expert_overlap"] = True ref_config = get_test_config(extra_kwargs=extra_kwargs) microbatches = 4 with deterministic_mode(): diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py index a52843956df..7db4256a849 100644 --- a/tests/unit_tests/a2a_overlap/utils.py +++ b/tests/unit_tests/a2a_overlap/utils.py @@ -1,4 +1,3 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os from contextlib import contextmanager from dataclasses import dataclass diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py index 86b9219fe0f..b861aa2df49 100644 --- a/tests/unit_tests/pipeline_parallel/test_schedules.py +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - import os import pytest @@ -129,52 +127,6 @@ def test_get_pipeline_parallel_order( for k, v in order_cnt.items(): assert -k in order_cnt and order_cnt[-k] == v - layers_per_chunk = 2 - num_layers_per_chunk = [layers_per_chunk] * num_model_chunks - # disable wgrad compute - overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( - order, num_layers_per_chunk, False - ) - assert max(overlapped_order) == num_model_chunks * layers_per_chunk - assert len(overlapped_order) == len(order) * layers_per_chunk - assert len(chunk_id_list) == len(overlapped_order) - order_cnt = {} - accumulated_order = 0 - for o in overlapped_order: - order_cnt[o] = order_cnt.get(o, 0) + 1 - if o < 0: - assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] - elif -o in order_cnt: - assert order_cnt[-o] < order_cnt[o] - accumulated_order += o - assert accumulated_order >= 0 - assert accumulated_order == 0 - - # enable wgrad compute - overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( - order, num_layers_per_chunk, True - ) - assert max(overlapped_order) == num_model_chunks * layers_per_chunk - assert len(overlapped_order) == len(order) * layers_per_chunk * 3 // 2 - assert len(chunk_id_list) == len(overlapped_order) - from math import ceil - - order_cnt = {} - accumulated_order = 0 - prev_o = 0 - for o in overlapped_order: - if ceil(o) != o: - assert prev_o - 0.5 == o - else: - order_cnt[o] = order_cnt.get(o, 0) + 1 - if o < 0: - assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] - elif -o in order_cnt: - assert order_cnt[-o] < order_cnt[o] - accumulated_order += o - prev_o = o - assert accumulated_order < 0 - Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/test_submodule_callables.py b/tests/unit_tests/transformer/test_submodule_callables.py index 73059495c06..1ccb6fd5be8 100644 --- a/tests/unit_tests/transformer/test_submodule_callables.py +++ b/tests/unit_tests/transformer/test_submodule_callables.py @@ -64,7 +64,7 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): output_tensors = [] # get callables callables, dw = build_layer_callables(model) - attn, dispatch, moe, combine, post_process = callables + attn, post_attn, dispatch, moe, combine, post_process = callables assert post_process is None dummy_model = DummyState() dummy_model.decoder = DummyState() @@ -76,16 +76,24 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): node.chunk_state.model = dummy_model # attn fwd - local_tokens, probs = attn(node, input_tensors[i]) + hidden_states = attn(node, input_tensors[i]) + + # post attn fwd + local_tokens, probs = post_attn(node, hidden_states) # dispatch fwd dispatched_tokens = dispatch(node, local_tokens, probs) # moe fwd - expert_output = moe(node, dispatched_tokens) + expert_outputs = moe(node, dispatched_tokens) + if model.mlp.use_shared_expert: + expert_output, shared_expert_output = expert_outputs + else: + expert_output = expert_outputs + shared_expert_output = None # combine fwd - hidden_states = combine(node, expert_output) + hidden_states = combine(node, expert_output, shared_expert_output) # loss output_tensors.append(hidden_states) From dfa6cc12d3a246d55f4c45847d73c9127099327b Mon Sep 17 00:00:00 2001 From: HaochenYuan <106647990+HaochenYuan@users.noreply.github.com> Date: Tue, 6 Jan 2026 15:35:49 +0800 Subject: [PATCH 214/248] [Dev] Remove calculation of padding token in moe routing loss (#2754) Co-authored-by: Li Tao Co-authored-by: Dennis(Zhenhuan) Liu --- .../core/extensions/transformer_engine.py | 2 +- .../common/model_chunk_schedule_plan.py | 2 + .../core/models/gpt/fine_grained_callables.py | 21 +- megatron/core/models/gpt/gpt_model.py | 37 +++- megatron/core/transformer/mlp.py | 2 +- megatron/core/transformer/moe/moe_layer.py | 27 ++- megatron/core/transformer/moe/moe_utils.py | 83 ++++++-- megatron/core/transformer/moe/router.py | 157 +++++++++++---- .../core/transformer/transformer_block.py | 15 +- .../core/transformer/transformer_layer.py | 26 ++- .../python_scripts/recipe_parser.py | 1 + .../a2a_overlap/test_schedule_chunk_1f1b.py | 116 ++++++++++- .../a2a_overlap/test_schedule_layer_1f1b.py | 4 +- .../transformer/moe/test_aux_loss.py | 189 ++++++++++++++++++ .../transformer/moe/test_routers.py | 47 +++++ 15 files changed, 640 insertions(+), 89 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 151b8ad27fa..d823e42b0bc 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -2139,7 +2139,7 @@ def forward_post_hook(module, *_) -> None: "TEFusedMLP module does not support submodules with post-backward hooks" ) - def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]: + def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]: """Forward.""" # Construct fused impl if needed diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 486a498dd73..07bab1cb486 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -305,6 +305,7 @@ def __init__( extra_block_kwargs=None, runtime_gather_output: Optional[bool] = None, loss_mask: Optional[Tensor] = None, + padding_mask=None, ): """Initialize the schedule plan of all Transformer layers' sub-modules. @@ -347,6 +348,7 @@ def __init__( self._model_chunk_state.mtp_hidden_states = None self._model_chunk_state.loss_mask = loss_mask self._model_chunk_state.packed_seq_params = packed_seq_params + self._model_chunk_state.padding_mask = padding_mask self._model_chunk_state.extra_block_kwargs = extra_block_kwargs self._model_chunk_state.runtime_gather_output = runtime_gather_output self._model_chunk_state.model = model diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index a0be55c4ca1..5913dfaba33 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -120,13 +120,19 @@ def forward_impl(self): if not self.gpt_model.pre_process: self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor # Run GPTModel._preprocess - decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = ( - self.gpt_model._preprocess( - input_ids=self.chunk_state.input_ids, - position_ids=self.chunk_state.position_ids, - decoder_input=self.chunk_state.decoder_input, - packed_seq_params=self.chunk_state.packed_seq_params, - ) + ( + decoder_input, + rotary_pos_emb, + rotary_pos_cos, + rotary_pos_sin, + sequence_len_offset, + padding_mask, + ) = self.gpt_model._preprocess( + input_ids=self.chunk_state.input_ids, + position_ids=self.chunk_state.position_ids, + decoder_input=self.chunk_state.decoder_input, + packed_seq_params=self.chunk_state.packed_seq_params, + padding_mask=self.chunk_state.padding_mask, ) # Saved for later use @@ -135,6 +141,7 @@ def forward_impl(self): self.chunk_state.rotary_pos_cos = rotary_pos_cos self.chunk_state.rotary_pos_sin = rotary_pos_sin self.chunk_state.sequence_len_offset = sequence_len_offset + self.chunk_state.padding_mask = padding_mask return decoder_input diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index a1230568cbd..9e70c677226 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -284,6 +284,7 @@ def _preprocess( decoder_input: Tensor = None, inference_context: BaseInferenceContext = None, packed_seq_params: PackedSeqParams = None, + padding_mask: Optional[Tensor] = None, ): """Preprocesses inputs for the transformer decoder. @@ -300,7 +301,20 @@ def _preprocess( if decoder_input is not None: pass elif self.pre_process: + if padding_mask is not None: + assert padding_mask.shape == input_ids.shape, ( + f"padding_mask shape {padding_mask.shape} does not match " + f"input_ids shape {input_ids.shape}" + ) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + if padding_mask is not None and self.config.sequence_parallel: + padding_mask = ( + tensor_parallel.scatter_to_sequence_parallel_region( + padding_mask.transpose(0, 1).contiguous() + ) + .transpose(0, 1) + .contiguous() + ) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor @@ -403,6 +417,7 @@ def _preprocess( rotary_pos_cos, rotary_pos_sin, sequence_len_offset, + padding_mask, ) if rotary_pos_cos_sin is not None: # only in the case of flashinfer fused rope will we @@ -446,6 +461,7 @@ def forward( *, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoder and finally into the post @@ -456,6 +472,9 @@ def forward( Args: runtime_gather_output (bool): Gather output at runtime. Default None means `parallel_output` arg in the constructor will be used. + padding_mask (Tensor, optional): Padding mask for MoE routing. + Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). + Only used for MoE layers to exclude padding tokens from routing computations. """ if self.config.fine_grained_activation_offloading: self.preprocess_for_fine_grained_offloading() @@ -468,13 +487,19 @@ def forward( decoder_input=decoder_input, inference_context=inference_context, packed_seq_params=packed_seq_params, + padding_mask=padding_mask, ) - (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = ( - preproc_output[:5] - ) + ( + decoder_input, + rotary_pos_emb, + rotary_pos_cos, + rotary_pos_sin, + sequence_len_offset, + padding_mask, + ) = preproc_output[:6] - rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None + rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None # Run decoder. hidden_states = self.decoder( @@ -487,6 +512,7 @@ def forward( rotary_pos_cos_sin=rotary_pos_cos_sin, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, + padding_mask=padding_mask, **(extra_block_kwargs or {}), ) @@ -724,6 +750,7 @@ def build_schedule_plan( runtime_gather_output: Optional[bool] = None, inference_params: Optional[BaseInferenceContext] = None, loss_mask: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, ): """Builds a computation schedule plan for the model. @@ -749,6 +776,7 @@ def build_schedule_plan( inference_params (InferenceParams, optional): Parameters for inference. Defaults to None. loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None. + padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None. Returns: TransformerModelChunkSchedulePlan: The model chunk schedule plan. @@ -770,6 +798,7 @@ def build_schedule_plan( extra_block_kwargs, runtime_gather_output, loss_mask, + padding_mask, ) def sharded_state_dict( diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 5d765484709..98e30887e7b 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -142,7 +142,7 @@ def __init__( tp_group=tp_group, ) - def forward(self, hidden_states, per_token_scale=None): + def forward(self, hidden_states, per_token_scale=None, **kwargs): """Perform the forward pass through the MLP block.""" # [s, b, 4 * h/p] nvtx_range_push(suffix="linear_fc1") diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 12ca61b64c1..3742d064508 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -206,13 +206,13 @@ def __init__( self.cudagraph_tensor_store = MoECudaGraphTensorStore() @maybe_skip_or_early_return_by_cudagraph("route") - def route(self, hidden_states: torch.Tensor): + def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Compute token routing for preprocessing. This method uses the router to determine which experts to send each token to, producing routing probabilities and a mapping. """ - probs, routing_map = self.router(hidden_states) + probs, routing_map = self.router(hidden_states, padding_mask=padding_mask) return probs, routing_map @maybe_skip_or_early_return_by_cudagraph("preprocess") @@ -308,7 +308,7 @@ def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Ten output = output + shared_expert_output return output - def forward(self, hidden_states: torch.Tensor): + def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Forward pass for the MoE layer. The forward pass comprises four main steps: @@ -318,7 +318,11 @@ def forward(self, hidden_states: torch.Tensor): 4. Combine: The outputs from the experts are combined and returned. Args: - hidden_states (torch.Tensor): The input tensor to the MoE layer. + hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size]. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + used for correct auxiliary loss computation for packed sequence. + Shape = [bsz, seq_length]. True = padding (exclude), False = valid (include). + Defaults to None (all tokens are valid). Returns: A tuple containing the output tensor and the MLP bias, if any. @@ -329,11 +333,15 @@ def forward(self, hidden_states: torch.Tensor): "are enabled without also enabling sequence parallelism." ) + # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states + if padding_mask is not None: + padding_mask = padding_mask.transpose(0, 1).bool() + # MoE forward: route -> dispatch -> compute -> combine - def custom_forward(hidden_states): + def custom_forward(hidden_states, padding_mask=None): try: shared_expert_output = self.shared_experts_compute(hidden_states) - probs, routing_map = self.route(hidden_states) + probs, routing_map = self.route(hidden_states, padding_mask=padding_mask) hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map) except MoECudaGraphPartialCaptureSignal as e: # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator. @@ -358,11 +366,14 @@ def custom_forward(hidden_states): tensor_parallel.random.get_cuda_rng_tracker, parallel_state.get_tensor_model_parallel_group(), hidden_states, + padding_mask, ) else: - outputs = tensor_parallel.checkpoint(custom_forward, False, hidden_states) + outputs = tensor_parallel.checkpoint( + custom_forward, False, hidden_states, padding_mask + ) else: - outputs = custom_forward(hidden_states) + outputs = custom_forward(hidden_states, padding_mask) return outputs diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 0837675507d..d915cfabb26 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -1,5 +1,4 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - import math from dataclasses import dataclass from typing import List, Optional, Union @@ -11,6 +10,7 @@ from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig @@ -120,18 +120,34 @@ def switch_load_balancing_loss_func( return aux_loss -def z_loss_func(logits, z_loss_coeff): +def z_loss_func(logits, z_loss_coeff, padding_mask: Optional[torch.Tensor] = None): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. + z_loss_coeff (float): The coefficient for the z-loss. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. """ + logsum = torch.logsumexp(logits, dim=-1) + z_loss_values = torch.square(logsum) + + if padding_mask is not None: + # Invert padding_mask: True (padding) -> 0, False (valid) -> 1 + valid_mask = ~padding_mask + # Only compute z_loss for valid (non-padding) tokens + z_loss_values = z_loss_values * valid_mask + # Compute mean over valid tokens only + num_valid_tokens = valid_mask.sum() + z_loss = z_loss_values.sum() / torch.clamp(num_valid_tokens, min=1.0) * z_loss_coeff + else: + z_loss = torch.mean(z_loss_values) * z_loss_coeff - z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff return z_loss @@ -171,6 +187,28 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_ return capacity +def get_tokens_per_expert_and_token_count( + routing_map: torch.Tensor, + reduce_group: torch.distributed.ProcessGroup, + topk: int = None, + with_padding_mask: bool = False, +) -> torch.Tensor: + """ + Compute global_tokens_per_expert, local_num_tokens and total_num_tokens with padding mask. + """ + local_tokens_per_expert = routing_map.sum(dim=0) + global_tokens_per_expert = reduce_from_tensor_model_parallel_region( + local_tokens_per_expert, reduce_group + ) + if with_padding_mask: + local_num_tokens = local_tokens_per_expert.sum() / topk + total_num_tokens = global_tokens_per_expert.sum() / topk + else: + local_num_tokens = routing_map.shape[0] + total_num_tokens = local_num_tokens * reduce_group.size() + return global_tokens_per_expert, local_num_tokens, total_num_tokens + + class MoEAuxLossAutoScaler(torch.autograd.Function): """An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss.""" @@ -629,35 +667,48 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None): def compute_routing_scores_for_aux_loss( - logits: torch.Tensor, topk: int, score_function: str, fused: bool = False + logits: torch.Tensor, + topk: int, + score_function: str, + fused: bool = False, + padding_mask: Optional[torch.Tensor] = None, ): """Compute routing scores based on the score function. Args: logits (torch.Tensor): The logits tensor after gating, shape: [num_tokens, num_experts]. - + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: - torch.Tensor: The normalized routing scores. + Tuple[torch.Tensor, torch.Tensor]: routing_map and scores. """ if fused: if not HAVE_TE or fused_compute_score_for_moe_aux_loss is None: raise ValueError( "fused_compute_score_for_moe_aux_loss is not available. Please install TE >= 2.6.0." ) - return fused_compute_score_for_moe_aux_loss( + routing_map, scores = fused_compute_score_for_moe_aux_loss( logits=logits, topk=topk, score_function=score_function ) - - if score_function == "softmax": - scores = torch.softmax(logits, dim=-1, dtype=torch.float32) - elif score_function == "sigmoid": - scores = torch.sigmoid(logits) - scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) else: - raise ValueError(f"Invalid score_function: {score_function}") + if score_function == "softmax": + scores = torch.softmax(logits, dim=-1, dtype=torch.float32) + elif score_function == "sigmoid": + scores = torch.sigmoid(logits) + scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) + else: + raise ValueError(f"Invalid score_function: {score_function}") + + _, top_indices = torch.topk(scores, k=topk, dim=1) + routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() - _, top_indices = torch.topk(scores, k=topk, dim=1) - routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool() + # Apply padding mask to scores if provided + if padding_mask is not None: + # Invert padding_mask and make True indicates valid tokens + valid_mask = (~padding_mask).unsqueeze(-1) + routing_map = routing_map * valid_mask + scores = scores * valid_mask return routing_map, scores diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 34d81a21ffa..bbfb01fec8b 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -1,12 +1,11 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Union import torch from megatron.core.jit import jit_fuser -from megatron.core.tensor_parallel import reduce_from_tensor_model_parallel_region from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.moe.moe_utils import ( MoEAuxLossAutoScaler, @@ -14,6 +13,7 @@ apply_random_logits, apply_router_token_dropping, compute_routing_scores_for_aux_loss, + get_tokens_per_expert_and_token_count, router_gating_linear, save_to_aux_losses_tracker, sinkhorn, @@ -268,22 +268,28 @@ def is_aux_loss_enabled(self) -> bool: return False def _apply_aux_loss( - self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor + self, + probs: torch.Tensor, + scores_for_aux_loss: torch.Tensor, + routing_map: torch.Tensor, + with_padding_mask: bool = False, ): """Apply the auxiliary loss for the given scores and routing map.""" aux_loss_coeff = self.get_aux_loss_coeff("aux_loss") if aux_loss_coeff == 0: return probs - tokens_per_expert = routing_map.sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_cp_group - ) - num_tokens = routing_map.shape[0] - total_num_tokens = num_tokens * self.tp_cp_group.size() + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_cp_group, + topk=self.topk, + with_padding_mask=with_padding_mask, + ) + ) aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=tokens_per_expert, + tokens_per_expert=global_tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -291,7 +297,12 @@ def _apply_aux_loss( fused=self.config.moe_router_fusion, ) probs = self.attach_and_log_load_balancing_loss( - probs, aux_loss_coeff, aux_loss, "load_balancing_loss", self.tp_cp_group + probs, + aux_loss_coeff, + aux_loss, + "load_balancing_loss", + self.tp_cp_group, + valid_token_count=local_num_tokens, ) return probs @@ -302,6 +313,7 @@ def _apply_seq_aux_loss( routing_map: torch.Tensor, seq_length: int, bsz: int, + with_padding_mask: bool = False, ): """Apply the sequence-level auxiliary loss for the given scores and routing map. @@ -315,17 +327,21 @@ def _apply_seq_aux_loss( return probs scores_for_aux_loss = scores_for_aux_loss.reshape(seq_length, -1) - tokens_per_expert = routing_map.reshape(seq_length, -1).sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_cp_group + routing_map = routing_map.reshape(seq_length, -1) + + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_cp_group, + with_padding_mask=with_padding_mask, + topk=self.topk * bsz, + ) ) - total_num_tokens = seq_length * self.tp_cp_group.size() - aux_loss = ( switch_load_balancing_loss_func( probs=scores_for_aux_loss, - tokens_per_expert=tokens_per_expert, + tokens_per_expert=global_tokens_per_expert, total_num_tokens=total_num_tokens, topk=self.topk, num_experts=self.config.num_moe_experts, @@ -334,31 +350,42 @@ def _apply_seq_aux_loss( ) / bsz ) + probs = self.attach_and_log_load_balancing_loss( - probs, seq_aux_loss_coeff, aux_loss, "seq_load_balancing_loss", self.tp_cp_group + probs, + seq_aux_loss_coeff, + aux_loss, + "seq_load_balancing_loss", + self.tp_cp_group, + valid_token_count=local_num_tokens, ) return probs def _apply_global_aux_loss( - self, probs: torch.Tensor, scores_for_aux_loss: torch.Tensor, routing_map: torch.Tensor + self, + probs: torch.Tensor, + scores_for_aux_loss: torch.Tensor, + routing_map: torch.Tensor, + with_padding_mask: bool = False, ): """Apply the global auxiliary loss for the given scores and routing map.""" global_aux_loss_coeff = self.get_aux_loss_coeff("global_aux_loss") if global_aux_loss_coeff == 0: return probs - tokens_per_expert = routing_map.sum(dim=0) - tokens_per_expert = reduce_from_tensor_model_parallel_region( - tokens_per_expert, self.tp_dp_cp_group + # Use unified function to compute tokens_per_expert and num_tokens + global_tokens_per_expert, local_num_tokens, total_num_tokens = ( + get_tokens_per_expert_and_token_count( + routing_map=routing_map, + reduce_group=self.tp_dp_cp_group, + with_padding_mask=with_padding_mask, + topk=self.topk, + ) ) - - self.global_tokens_per_expert += tokens_per_expert + self.global_tokens_per_expert += global_tokens_per_expert self.ga_steps += 1 averated_tokens_per_expert = self.global_tokens_per_expert / self.ga_steps - num_tokens = scores_for_aux_loss.shape[0] - total_num_tokens = num_tokens * self.tp_dp_cp_group.size() - global_aux_loss = switch_load_balancing_loss_func( probs=scores_for_aux_loss, tokens_per_expert=averated_tokens_per_expert, @@ -374,6 +401,7 @@ def _apply_global_aux_loss( global_aux_loss, "global_load_balancing_loss", self.tp_dp_cp_group, + valid_token_count=local_num_tokens, reduce_group_has_dp=True, ) return probs @@ -385,6 +413,7 @@ def attach_and_log_load_balancing_loss( aux_loss: torch.Tensor, aux_loss_name: str, reduce_group: torch.distributed.ProcessGroup, + valid_token_count: Optional[Union[int, torch.Tensor]] = None, reduce_group_has_dp: bool = False, ): """Attach aux loss function to activation and add to logging. @@ -395,6 +424,9 @@ def attach_and_log_load_balancing_loss( aux_loss (torch.Tensor): The auxiliary loss tensor. aux_loss_name (str): The name of the auxiliary loss for logging. reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss. + valid_token_count (int or torch.Tensor, optional): Number of valid tokens excluding + padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor). + If None, uses activation.shape[0]. Defaults to None. reduce_group_has_dp (bool): Whether the reduce group has data parallel ranks. Set this to True if the reduce group has data parallel ranks. This flag is used to ensure the correct reduction in aux loss tracking. @@ -422,17 +454,22 @@ def attach_and_log_load_balancing_loss( # which scales both the main_loss gradient and aux_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads function. # To correct this scaling, we need to scale the aux_loss by num_local_tokens here. - activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * activation.shape[0]) + # Use valid_token_count (excluding padding) if provided, otherwise use total tokens. + num_tokens = valid_token_count if valid_token_count is not None else activation.shape[0] + activation = MoEAuxLossAutoScaler.apply(activation, aux_loss * num_tokens) else: activation = MoEAuxLossAutoScaler.apply(activation, aux_loss) return activation - def apply_z_loss(self, logits): + def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None): """Encourages the router's logits to remain small to enhance stability. Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. Args: logits (torch.Tensor): The logits of the router. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), + False = valid (include). Defaults to None. Returns: torch.Tensor: The logits after applying the z-loss. @@ -440,7 +477,7 @@ def apply_z_loss(self, logits): if self.config.moe_z_loss_coeff is not None and self.training and torch.is_grad_enabled(): # Skip Z loss calculations when using torch.no_grad() or checkpointing. moe_z_loss_coeff = self.config.moe_z_loss_coeff / self.tp_cp_group.size() - z_loss = z_loss_func(logits, moe_z_loss_coeff) + z_loss = z_loss_func(logits, moe_z_loss_coeff, padding_mask=padding_mask) scale_up = 1.0 if self.calculate_per_token_loss: # The expected final scaling for z_loss gradients is @@ -450,7 +487,9 @@ def apply_z_loss(self, logits): # which scales both the main_loss gradient and z_loss gradient by # 1/(num_local_tokens * dp_size * num_micro_batches) in finalize_model_grads(). # To correct this scaling, we need to scale the z_loss by num_local_tokens here. - logits = MoEAuxLossAutoScaler.apply(logits, z_loss * logits.shape[0]) + # Count valid tokens: sum of inverted mask (False -> True = valid) + num_tokens = (~padding_mask).sum() if padding_mask is not None else logits.shape[0] + logits = MoEAuxLossAutoScaler.apply(logits, z_loss * num_tokens) else: logits = MoEAuxLossAutoScaler.apply(logits, z_loss) @@ -484,20 +523,32 @@ def apply_input_jitter(self, input: torch.Tensor): return input @jit_fuser - def _apply_expert_bias(self, routing_map: torch.Tensor): + def _apply_expert_bias( + self, routing_map: torch.Tensor, padding_mask: Optional[torch.Tensor] = None + ): """ Update expert bias and tokens_per_expert Prevent extra local tokens accumulation on evaluation or activation recomputation + + Args: + routing_map (torch.Tensor): Token to expert routing map, [num_tokens, num_experts]. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape [num_tokens]. True = padding (exclude), False = valid (include). """ if self.enable_expert_bias and torch.is_grad_enabled(): with torch.no_grad(): + if padding_mask is not None: + routing_map = routing_map & (~padding_mask) self.local_tokens_per_expert += routing_map.sum(dim=0) - def routing(self, logits: torch.Tensor): + def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """Top-k routing function Args: logits (torch.Tensor): Logits tensor after gating. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape = [seq_length, bsz]. True=padding(exclude), + False=valid(include). Defaults to None. Returns: probs (torch.Tensor): The probabilities of token to experts assignment. @@ -507,8 +558,12 @@ def routing(self, logits: torch.Tensor): seq_length, bsz = logits.shape[:2] logits = logits.view(-1, self.config.num_moe_experts) + # Flatten padding_mask to [num_tokens] if provided + if padding_mask is not None: + padding_mask = padding_mask.reshape(-1) + # Apply Z-Loss - logits = self.apply_z_loss(logits) + logits = self.apply_z_loss(logits, padding_mask=padding_mask) # Calculate probs and routing_map for token dispatching if self.routing_type == "sinkhorn": @@ -541,18 +596,35 @@ def routing(self, logits: torch.Tensor): if self.training and torch.is_grad_enabled() and self.is_aux_loss_enabled(): # Calculate scores and routing_map for aux loss routing_map_for_aux_loss, scores_for_aux_loss = compute_routing_scores_for_aux_loss( - logits, self.topk, self.score_function, fused=self.config.moe_router_fusion + logits, + self.topk, + self.score_function, + fused=self.config.moe_router_fusion, + padding_mask=padding_mask, + ) + probs = self._apply_aux_loss( + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + with_padding_mask=padding_mask is not None, ) - probs = self._apply_aux_loss(probs, scores_for_aux_loss, routing_map_for_aux_loss) probs = self._apply_seq_aux_loss( - probs, scores_for_aux_loss, routing_map_for_aux_loss, seq_length, bsz + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + seq_length, + bsz, + with_padding_mask=padding_mask is not None, ) probs = self._apply_global_aux_loss( - probs, scores_for_aux_loss, routing_map_for_aux_loss + probs, + scores_for_aux_loss, + routing_map_for_aux_loss, + with_padding_mask=padding_mask is not None, ) # Optionally apply expert bias - self._apply_expert_bias(routing_map) + self._apply_expert_bias(routing_map, padding_mask=padding_mask) return probs, routing_map @@ -562,12 +634,15 @@ def reset_global_aux_loss_tracker(self): self.global_tokens_per_expert.zero_() self.ga_steps.zero_() - def forward(self, input: torch.Tensor): + def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = None): """ Forward pass of the router. Args: input (torch.Tensor): Input tensor. + padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions. + Shape = [seq_length, bsz]. True=padding(exclude), + False=valid(include). Defaults to None. """ self._maintain_float32_expert_bias() @@ -579,7 +654,7 @@ def forward(self, input: torch.Tensor): # Apply force load balancing with random logits for benchmark logits = apply_random_logits(logits) - probs, routing_map = self.routing(logits) + probs, routing_map = self.routing(logits, padding_mask=padding_mask) return probs, routing_map diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 023db1fe75a..cbbd7ec00eb 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -390,7 +390,6 @@ def build_layer(layer_spec, layer_number): def has_final_layernorm_in_this_stage(self): """ Check if this vpp stage contains the final layernorm. - Note: Final layernorm now has been moved from the post-process stage to the last decoder layer by using this function. @@ -429,12 +428,18 @@ def _checkpointed_forward( attention_bias: Tensor, packed_seq_params: PackedSeqParams, use_inner_quantization_context: bool, + padding_mask: Optional[Tensor] = None, ): """Forward method with activation checkpointing.""" def custom(start: int, end: int): def custom_forward( - hidden_states, attention_mask, context, context_mask, rotary_pos_emb + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + padding_mask=None, ): for index in range(start, end): layer = self._get_layer(index) @@ -465,6 +470,7 @@ def custom_forward( attention_bias=attention_bias, inference_context=None, packed_seq_params=packed_seq_params, + padding_mask=padding_mask, ) return hidden_states, context @@ -484,6 +490,7 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, + padding_mask, ) else: return tensor_parallel.checkpoint( @@ -494,6 +501,7 @@ def checkpoint_handler(forward_func): context, context_mask, rotary_pos_emb, + padding_mask, ) if self.config.recompute_method == 'uniform': @@ -599,6 +607,7 @@ def forward( inference_context: Optional[BaseInferenceContext] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, *, inference_params: Optional[BaseInferenceContext] = None, dynamic_inference_decode_only: Optional[bool] = None, @@ -708,6 +717,7 @@ def forward( attention_bias=attention_bias, packed_seq_params=packed_seq_params, use_inner_quantization_context=use_inner_quantization_context, + padding_mask=padding_mask, ) else: for l_no, layer in enumerate(self.layers): @@ -745,6 +755,7 @@ def forward( inference_context=inference_context, packed_seq_params=packed_seq_params, sequence_len_offset=sequence_len_offset, + padding_mask=padding_mask, ) if ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 3ea40577009..5c310cc81e4 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -1,5 +1,6 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import functools import logging import warnings from abc import ABC @@ -457,7 +458,12 @@ def forward(self, *args, **kwargs): # runners in the cuda graph manager kwargs.pop("dynamic_inference_decode_only", None) hidden_states, context = self._forward_attention(*args, **kwargs) - output = self._forward_mlp(hidden_states, kwargs.get("inference_context", None)) + + output = self._forward_mlp( + hidden_states, + kwargs.get("inference_context", None), + padding_mask=kwargs.get("padding_mask", None), + ) return output, context def _forward_attention( @@ -474,6 +480,7 @@ def _forward_attention( inference_context: Optional[Any] = None, packed_seq_params: Optional[PackedSeqParams] = None, sequence_len_offset: Optional[Tensor] = None, + padding_mask: Optional[Tensor] = None, *, inference_params: Optional[Any] = None, ): @@ -591,12 +598,18 @@ def _forward_attention( return hidden_states, context - def _forward_mlp(self, hidden_states, inference_context=None): + def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None): """ Perform a forward pass through the feed-forward layer. Args: hidden_states (Tensor): Transformed hidden states before the MLP layernorm. + Shape [seq_length, batch_size, hidden_size]. + inference_context: Inference context for optimizations. + padding_mask (Tensor, optional): Padding mask for MoE routing. + Shape [bsz, seq_length]. True = padding (exclude), False = valid (include). + Only used for MoE layers to exclude padding tokens from aux loss computations. + The MoELayer will internally transform this to [seq_length, bsz] format. Returns: output (Tensor): Transformed hidden states of shape [s, b, h]. @@ -642,7 +655,7 @@ def _forward_mlp(self, hidden_states, inference_context=None): assert ( not self.recompute_pre_mlp_layernorm ), "Recomputation is not supported for CUDA graph." - cudagraph_outputs = self.mlp(pre_mlp_layernorm_output) + cudagraph_outputs = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) nvtx_range_pop(suffix="mlp") return cudagraph_outputs + [residual] elif self.recompute_mlp: @@ -656,10 +669,13 @@ def _forward_mlp(self, hidden_states, inference_context=None): tensor_parallel.random.get_cuda_rng_tracker, self.pg_collection.tp, pre_mlp_layernorm_output, + padding_mask=padding_mask, ) else: mlp_output_with_bias = tensor_parallel.checkpoint( - self.mlp, False, pre_mlp_layernorm_output + functools.partial(self.mlp, padding_mask=padding_mask), + False, + pre_mlp_layernorm_output, ) elif should_chunk_mlp_for_prefill: # Chunk input along sequence dimension @@ -675,7 +691,7 @@ def _forward_mlp(self, hidden_states, inference_context=None): bias_output = torch.stack(bias_chunks, dim=0).sum(dim=0) if bias_chunks else None mlp_output_with_bias = (mlp_output, bias_output) else: - mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) if self.recompute_pre_mlp_layernorm: # discard the output of the pre-mlp layernorm and register the recompute diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index a497bdbd9de..b866fbbf5c2 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -1,3 +1,4 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy import itertools import logging diff --git a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py index 81e61a3404a..6c59dd3f9e3 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py @@ -23,7 +23,7 @@ from tests.unit_tests.test_utilities import Utils -def build_model(config): +def build_model(config, use_padding_mask=False): seq_len = 32 max_seq_len = 300 # ids = random.sample([i for i in range(max_seq_len)], seq_len) @@ -39,6 +39,12 @@ def build_model(config): "attention_mask": torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda(), } + # Optionally add padding_mask with same shape as input_ids + if use_padding_mask: + padding_mask = torch.zeros((1, seq_len), dtype=torch.bool).cuda() + padding_mask[0, -8:] = True + data["padding_mask"] = padding_mask + # build layer spec transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True) mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True) @@ -48,7 +54,7 @@ def build_model(config): config=config, transformer_layer_spec=transformer_layer_spec, mtp_block_spec=mtp_block_spec, - vocab_size=100, + vocab_size=128, pre_process=True, post_process=True, max_sequence_length=max_seq_len, @@ -174,3 +180,109 @@ def test_1f1b_schedule_model_chunk(self, mtp_layers, dispatcher_type, fp8_flag, gpt_models[i] = None gc.collect() torch.cuda.empty_cache() + + @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), reason="Requires TE >= 1.9.0.dev0") + @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types()) + @pytest.mark.parametrize("layers", [[2, 1], [1, 1]]) + @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) + def test_1f1b_schedule_model_chunk_with_padding_mask(self, dispatcher_type, layers, tp_size): + """ + Verifies all-to-all overlap optimization with padding_mask produces + the same results as the reference implementation with various TP/EP/CP combinations. + """ + # Re-initialize model parallel with the specified configuration + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=1, + expert_model_parallel_size=4, + expert_tensor_parallel_size=1, + ) + set_streams() + + microbatches = 1 + + gpt_models = [] + schedule_plans = [] + ref_captures = [] + datas = [] + + # create TransformerConfig + extra_kwargs = { + "moe_token_dispatcher_type": dispatcher_type, + "tensor_model_parallel_size": tp_size, + "sequence_parallel": tp_size > 1, + } + if dispatcher_type == "flex": + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" + extra_kwargs["moe_router_dtype"] = "fp32" + with deterministic_mode(): + for layer_num in layers: + output_tensors = [] + # build config + config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs) + # build model with padding_mask + gpt_model, schedule_plan, data = build_model(config, use_padding_mask=True) + gpt_model.cuda() + gpt_models.append(gpt_model) + datas.append(data) + schedule_plans.append(schedule_plan) + + # run reference + for _ in range(microbatches): + loss = gpt_model.forward(**data) + loss = float16_to_fp32(loss) + loss.backward(torch.ones_like(loss)) + output_tensors.append(loss) + + capture = {"outputs": output_tensors} + for name, param in gpt_model.named_parameters(): + capture[name] = param.grad + ref_captures.append(capture) + gpt_model.zero_grad() + assert gpt_models[0].embedding is not None + assert gpt_models[1].embedding is not None + # run a2a overlap + capture_0 = {"outputs": []} + capture_1 = {"outputs": []} + a2a_captures = [capture_0, capture_1] + for i in range(microbatches): + # 1st forward + if i > 0: + assert ( + schedule_plans[0].pre_process is None + ), "pre_process should be released after backward" + schedule_plans[0] = gpt_models[0].build_schedule_plan(**datas[0]) + schedule_plans[1] = gpt_models[1].build_schedule_plan(**datas[1]) + f_input_0 = TransformerModelChunkSchedulePlan.run(schedule_plans[0], None) + capture_0["outputs"].append(f_input_0) + # overlap + f_input_1 = TransformerModelChunkSchedulePlan.run( + schedule_plans[1], schedule_plans[0], b_grad=torch.ones_like(f_input_0) + ) + capture_1["outputs"].append(f_input_1) + # last backward + TransformerModelChunkSchedulePlan.run( + None, schedule_plans[1], b_grad=torch.ones_like(f_input_1) + ) + for i in range(len(gpt_models)): + for name, param in gpt_models[i].named_parameters(): + a2a_captures[i][name] = param.grad + + # compare results + for i in range(len(ref_captures)): + comp_res = compare_captures(ref_captures[i], a2a_captures[i], True, True) + assert comp_res[0], f"[rank {torch.distributed.get_rank()}] {comp_res[1]}" + + # release resources is necessary, otherwise later testcases will oom + for i in range(len(schedule_plans)): + schedule_plans[i] = None + ref_captures[i] = None + a2a_captures[i] = None + for k in datas[i]: + datas[i][k] = None + datas[i] = None + gpt_models[i].zero_grad() + gpt_models[i] = None + gc.collect() + torch.cuda.empty_cache() diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 7fb97f6e586..5ec096e5a04 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -502,8 +502,8 @@ def test_mtp_layer_overlap(self, dispatcher_type, fp8_flag): position_ids = torch.tensor(data, dtype=torch.int64).repeat((1, 1)).cuda() attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool).cuda() # get rotary pos emb - _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _ = gpt_model._preprocess( - input_ids, position_ids + _, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, _, _padding_mask = ( + gpt_model._preprocess(input_ids, position_ids) ) # reset model params = reset_model(gpt_model) diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py index b1f78582383..f5726777383 100644 --- a/tests/unit_tests/transformer/moe/test_aux_loss.py +++ b/tests/unit_tests/transformer/moe/test_aux_loss.py @@ -576,3 +576,192 @@ def test_force_balanced_aux_loss(self, tp_size, ep_size, cp_size): reduce_from_tensor_model_parallel_region(aux_loss, router.tp_cp_group) assert aux_loss.item() == 1, f"{aux_loss_type}: {aux_loss.item()}" clear_aux_losses_tracker() + + +class TestPaddingMaskAuxLoss: + """Test padding mask support in various aux loss types.""" + + def setup_model_parallel(self, tp_size=1, ep_size=1, cp_size=1, sequence_parallel=False): + """Initialize model parallel with given configuration. + + Args: + tp_size: Tensor parallel size. + ep_size: Expert parallel size. + cp_size: Context parallel size. + """ + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=1, + context_parallel_size=cp_size, + expert_model_parallel_size=ep_size, + ) + _set_random_seed(seed_=123, data_parallel_random_init=False) + + # Store parallel configuration + self.tp_size = tp_size + self.ep_size = ep_size + self.cp_size = cp_size + + # Default configuration + self.default_transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=8, + num_moe_experts=32, + use_cpu_initialization=True, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=8, + moe_aux_loss_coeff=1.0, + bf16=True, + params_dtype=torch.bfloat16, + add_bias_linear=False, + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + context_parallel_size=cp_size, + sequence_parallel=sequence_parallel and tp_size > 1, + ) + + def new_router(self, **kwargs): + """Create a new router with updated configuration.""" + pg_collection = get_default_pg_collection() + new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs) + router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection) + router.set_layer_number(0) + return router + + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize("sequence_parallel", [True, False]) + @pytest.mark.parametrize("aux_loss_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"]) + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_padding_mask_removes_padding_tokens( + self, aux_loss_type, tp_size, ep_size, cp_size, sequence_parallel + ): + """Test that padding tokens are correctly excluded from aux loss calculation.""" + # Initialize model parallel with given configuration + self.setup_model_parallel( + tp_size=tp_size, ep_size=ep_size, cp_size=cp_size, sequence_parallel=sequence_parallel + ) + + try: + clear_aux_losses_tracker() + + router = self.new_router( + moe_router_load_balancing_type=aux_loss_type, + moe_aux_loss_coeff=1.0, + moe_router_dtype="fp64", + ).cuda() + + seq_len = 32 + batch_size = 2 + hidden_size = router.config.hidden_size + + # Create input with padding + hidden_states_full = torch.randn( + (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' + ) + + # Create padding mask: first half valid (False), second half padding (True) + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True + + # Test with padding mask + router.weight.grad = None + scores_with_mask, routing_map_with_mask = router( + hidden_states_full, padding_mask=padding_mask + ) + scores_with_mask.backward(torch.zeros_like(scores_with_mask)) + + loss_name = { + "aux_loss": "load_balancing_loss", + "seq_aux_loss": "seq_load_balancing_loss", + "global_aux_loss": "global_load_balancing_loss", + }[aux_loss_type] + + tracker = get_moe_layer_wise_logging_tracker() + aux_loss_with_mask = tracker[loss_name]["values"][0].clone() + grad_with_mask = router.weight.grad.clone() + + # Test without padding (with only half of the tokens) + clear_aux_losses_tracker() + router.weight.grad = None + hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] + scores_without_mask, routing_map_without_mask = router(hidden_states_valid) + scores_without_mask.backward(torch.zeros_like(scores_without_mask)) + + aux_loss_without_mask = tracker[loss_name]["values"][0].clone() + grad_without_mask = router.weight.grad.clone() + + # The aux loss with mask should be close to the aux loss without mask + assert torch.equal(aux_loss_with_mask, aux_loss_without_mask) + assert torch.equal(grad_with_mask, grad_without_mask) + + clear_aux_losses_tracker() + finally: + # Always cleanup model parallel + Utils.destroy_model_parallel() + + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_padding_mask_with_z_loss(self, tp_size, ep_size, cp_size): + """Test that padding mask works correctly with z_loss.""" + # Initialize model parallel with given configuration + self.setup_model_parallel(tp_size=tp_size, ep_size=ep_size, cp_size=cp_size) + + try: + clear_aux_losses_tracker() + + router = self.new_router( + moe_router_load_balancing_type="aux_loss", + moe_aux_loss_coeff=0.0, + moe_z_loss_coeff=1.0, + moe_router_dtype="fp32", + ).cuda() + + seq_len = 32 + batch_size = 2 + hidden_size = router.config.hidden_size + + # Create input + hidden_states_full = torch.randn( + (seq_len, batch_size, hidden_size), dtype=torch.bfloat16, device='cuda' + ) + + # Create padding mask: first half valid (False), second half padding (True) + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True + + # Test with padding mask + router.weight.grad = None + scores_with_mask, _ = router(hidden_states_full, padding_mask=padding_mask) + scores_with_mask.sum().backward() + + tracker = get_moe_layer_wise_logging_tracker() + z_loss_with_mask = tracker["z_loss"]["values"][0].clone() + grad_with_mask = router.weight.grad.clone() + + # Test without padding (with only half of the tokens) + clear_aux_losses_tracker() + router.weight.grad = None + hidden_states_valid = hidden_states_full[: seq_len // 2, :, :] + scores_without_mask, _ = router(hidden_states_valid) + scores_without_mask.sum().backward() + + z_loss_without_mask = tracker["z_loss"]["values"][0].clone() + grad_without_mask = router.weight.grad.clone() + + # The z_loss with mask should be close to the z_loss without mask + assert torch.equal(z_loss_with_mask, z_loss_without_mask) + assert torch.equal(grad_with_mask, grad_without_mask) + + clear_aux_losses_tracker() + finally: + # Always cleanup model parallel + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py index 677d938cdc7..abd1a4db2dc 100644 --- a/tests/unit_tests/transformer/moe/test_routers.py +++ b/tests/unit_tests/transformer/moe/test_routers.py @@ -125,6 +125,53 @@ def test_aux_loss(self): out.sum().mul_(0).backward() assert self.sequential_mlp.router.weight.grad.abs().sum() > 0 + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_router_with_padding_mask(self): + """Test that padding mask correctly excludes padding tokens from routing.""" + self.router = self.router.cuda() + seq_len = 32 + batch_size = 2 + hidden_size = self.router.config.hidden_size + + # Create input with shape [seq_len, batch_size, hidden_size] + hidden_states = torch.randn((seq_len, batch_size, hidden_size)).cuda().bfloat16() + + # Create padding mask: first half valid (False), second half padding (True) + # padding_mask shape: [seq_len, batch_size] + # Convention: True = padding (exclude), False = valid (include) + padding_mask = torch.zeros((seq_len, batch_size), dtype=torch.bool, device='cuda') + padding_mask[seq_len // 2 :, :] = True # Second half is padding + + # Test forward pass with padding mask + with torch.no_grad(): + probs_with_mask, routing_map_with_mask = self.router( + hidden_states, padding_mask=padding_mask + ) + + # Test forward pass without padding mask (only valid tokens) + hidden_states_valid = hidden_states[: seq_len // 2, :, :] + probs_without_mask, routing_map_without_mask = self.router(hidden_states_valid) + + # The valid part of routing with mask should match routing without mask + probs_valid_part = probs_with_mask.reshape(seq_len, batch_size, -1)[ + : seq_len // 2, :, : + ] + probs_valid_part = probs_valid_part.reshape(-1, probs_valid_part.shape[-1]) + + # Check that shapes are as expected + assert probs_with_mask.shape == ( + seq_len * batch_size, + self.router.config.num_moe_experts, + ) + assert routing_map_with_mask.shape == ( + seq_len * batch_size, + self.router.config.num_moe_experts, + ) + + # Verify that probs for valid tokens are similar + assert torch.equal(probs_valid_part, probs_without_mask) + @pytest.mark.internal @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_router_dtype(self): From 5823534a4078b030134e7e2d703d7817b1a64df9 Mon Sep 17 00:00:00 2001 From: Kunlun Li <94586211+kunlunl@users.noreply.github.com> Date: Wed, 7 Jan 2026 01:25:07 +0800 Subject: [PATCH 215/248] [dev] Reapply fsdp mxfp8 (#2828) Signed-off-by: jianbinc Co-authored-by: jianbinc --- .../distributed/fsdp/mcore_fsdp_adapter.py | 4 + .../fsdp/src/megatron_fsdp/megatron_fsdp.py | 157 +++--- .../fsdp/src/megatron_fsdp/mixed_precision.py | 334 +++++++++++++ .../megatron_fsdp/param_and_grad_buffer.py | 450 +++++++++++++----- .../fsdp/src/megatron_fsdp/utils.py | 252 +--------- megatron/training/arguments.py | 7 + 6 files changed, 783 insertions(+), 421 deletions(-) create mode 100644 megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py diff --git a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py index 7432a7f9a36..d6384e70488 100644 --- a/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +++ b/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py @@ -111,6 +111,9 @@ def __init__( dist_index=self.megatron_fsdp_dist_index, calculate_per_token_loss=config.calculate_per_token_loss, init_model_with_meta_device=config.init_model_with_meta_device, + enable_fine_grained_param_gather_hook=( + config.fp8_recipe == "mxfp8" and ddp_config.fp8_param_gather + ), ), ) self.param_and_grad_buffer = self.module.param_and_grad_buffer @@ -123,6 +126,7 @@ def __init__( self.broadcast_params = self.module.broadcast_params self.module.state_dict_for_save_checkpoint = self.module.state_dict self.state_dict_for_save_checkpoint = self.state_dict + self.module.config = config self.sync_rng_states_across_tp_group() diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py index 5e953e8c6c2..e2cbccf4356 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py @@ -23,6 +23,20 @@ import torch.nn as nn from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten +from .mixed_precision import ( + fp8_create_transpose_cache, + fp8_discard_transpose_cache, + is_float8tensor, +) +from .param_and_grad_buffer import ( + AllGatherPipeline, + BucketingPolicy, + GradReducePipeline, + ParamAndGradBuffer, + PrefetchOrder, + override_sharded_param_methods_with_safety_checks, + to_local_if_dtensor, +) from .utils import FSDPDistributedIndex logger = logging.getLogger(__name__) @@ -34,23 +48,12 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) - from megatron.core.fp8_utils import is_float8tensor from megatron.core.utils import is_submodule except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import is_float8tensor, is_submodule - -from .param_and_grad_buffer import ( - AllGatherPipeline, - BucketingPolicy, - GradReducePipeline, - ParamAndGradBuffer, - PrefetchOrder, - override_sharded_param_methods_with_safety_checks, - to_local_if_dtensor, -) + from .utils import is_submodule class TrainingState(Enum): @@ -168,6 +171,7 @@ def __init__( nccl_ub: bool = False, fsdp_double_buffer: bool = False, disable_symmetric_registration: bool = False, + enable_fine_grained_param_gather_hook: bool = False, ): super().__init__() # If device is not specified, use the current device. @@ -217,6 +221,7 @@ def __init__( self.calculate_per_token_loss = calculate_per_token_loss self.init_model_with_meta_device = init_model_with_meta_device + self.enable_fine_grained_param_gather_hook = enable_fine_grained_param_gather_hook # Whether to constantly synchronize the model every training iteration, # which defaults to False to overlap communication with computation @@ -406,6 +411,7 @@ def all_gather_and_wait_parameters_ready( prefetch=True, prefetch_order=PrefetchOrder.FORWARD_PASS_ORDER, wait_bucket_ready=True, + bwd=False, ): """ All-gather parameters across the data parallel group and wait for @@ -432,11 +438,14 @@ def all_gather_and_wait_parameters_ready( and self.ddp_config.outer_dp_sharding_strategy != "no_shard" and (self.microbatch_count == 0 or self.model_auto_sync) ), + bwd=bwd, ) if wait_bucket_ready: for param in params: bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - ag_pipeline.wait_bucket_ready(bucket_id) + ag_pipeline.wait_bucket_ready(bucket_id, bwd) + if bwd and is_float8tensor(param): + fp8_create_transpose_cache(param) for param in params: # This setting is needed to make FSDP store the weight object when used @@ -495,19 +504,17 @@ def _register_fsdp_hooks(self, root_module): """ fsdp_unit_modules = self.fsdp_unit_modules - def release_module_parameters(module, *unused): + def release_module_parameters(module, bwd, *unused): for param in module.parameters(): bucket_id = self.param_and_grad_buffer.param_to_param_group[param] - self.all_gather_pipeline.release_bucket(bucket_id) - + self.all_gather_pipeline.release_bucket(bucket_id, bwd) if not self.ddp_config.keep_fp8_transpose_cache: release_params_fp8_transpose_cache(module.parameters()) def release_params_fp8_transpose_cache(params): for param in params: if is_float8tensor(param): - param._transpose_invalid = True - param._transpose = None + fp8_discard_transpose_cache(param) def _grad_acc(param): """ @@ -564,12 +571,15 @@ def _post_backward(module, *unused): if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params": # Deallocate the module parameters after the backward pass, # because we have our data-parallel gradients computed. - release_module_parameters(module) + release_module_parameters(module, bwd=True) module._training_state = TrainingState.IDLE param_list = list(module.parameters()) else: param_list = list(module.parameters(recurse=False)) + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + # If the parameter is shared, we do not accumulate gradients # here, as the gradients will be accumulated in the # root post-backward hook. @@ -621,6 +631,9 @@ def _pre_forward_param_unshard( # to allocate as little memory as possible for this forward pass. param_list = list(module.parameters(recurse=False)) + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + # All-gather the parameters before the forward pass. self.all_gather_and_wait_parameters_ready( params=param_list, @@ -720,7 +733,7 @@ def _root_post_backward(*unused): if self.model_auto_sync: self.finish_grad_sync() - def _pre_backward(module: nn.Module, *unused): + def _pre_backward_param_unshard(module: nn.Module, *unused): """ Sub-module pre-backward hook to all-gather the module parameters before the backward pass. @@ -729,11 +742,19 @@ def _pre_backward(module: nn.Module, *unused): # and unsharding operations when performing activation recomputation # / gradient checkpointing. module._training_state = TrainingState.PRE_BACKWARD + if isinstance(module, tuple(fsdp_unit_modules)): - # All-gather / unshard the module parameters before the backward pass. - self.all_gather_and_wait_parameters_ready( - list(module.parameters()), prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER - ) + param_list = list(module.parameters()) + else: + param_list = list(module.parameters(recurse=False)) + + if self.enable_fine_grained_param_gather_hook: + param_list = list(module.parameters(recurse=False)) + + # All-gather / unshard the module parameters before the backward pass. + self.all_gather_and_wait_parameters_ready( + param_list, prefetch_order=PrefetchOrder.BACKWARD_PASS_ORDER, bwd=True + ) self._root_pre_backward_hook_issued = False @@ -760,7 +781,9 @@ def _root_pre_backward(module: nn.Module, *unused): for bucket_id in range(ag_pipeline.num_buckets): group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.fsdp_unit_id is not None: - ag_pipeline.bucket_can_be_released[bucket_id] = True + ag_pipeline.bucket_can_be_released[ + ag_pipeline.get_bucket_key(bucket_id, bwd=False) + ] = True # Track parameters that require gradient reduction and optimization. self._params_require_handle_grad = set() for param_group in self.param_and_grad_buffer.parameter_groups: @@ -782,8 +805,12 @@ def _post_forward(module: nn.Module, input: Any, output: Any): # during activation recomputation / gradient checkpointing. return output + assert isinstance( + module, tuple(fsdp_unit_modules) + ), "_post_forward hook should only be registered on FSDP unit modules." + # Release the module parameters after the forward pass to save memory. - release_module_parameters(module) + release_module_parameters(module, bwd=False) module._training_state = TrainingState.IDLE return output @@ -824,21 +851,55 @@ def forward_hook(_module, inputs, output): # on the output tensor(s). return module.register_forward_hook(forward_hook) + def _register_pre_forward_param_unshard_hook(module): + """ + Register the forward pre-hook to unshard parameters before the forward pass. + If we are not sharding anything, we do not have a model weight buffer and thus + have nothing to all-gather / un-shard. + """ + if self.ddp_config.data_parallel_sharding_strategy != "no_shard": + self.forward_pre_hooks[f"{module._get_name()} parameter unshard"] = ( + module.register_forward_pre_hook( + _pre_forward_param_unshard, prepend=True, with_kwargs=True + ) + ) + + def _register_pre_backward_param_unshard_hook(module): + """ + Register the backward pre-hook to unshard FSDP unit module parameters + immediately before the backward pass via attaching a gradient-triggered + hook to the output tensor(s) of a module during a post-forward hook. + """ + self.backward_pre_hooks[f"all-gather {module._get_name()} parameters"] = ( + create_custom_backward_hook(module, _pre_backward_param_unshard) + ) + + def _register_grad_acc_and_reduce_hook(module): + """ + Register the post-backward hook to deallocate model parameters and + reduce-scatter gradients immediately after the module backward pass + has completed to conserve memory for the subsequent backward pass. + """ + self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( + module.register_forward_pre_hook( + functools.partial(_register_post_backward_hook, _post_backward), + with_kwargs=True, + ) + ) + fsdp_modules = [] for name, module in root_module.named_modules(): + if self.enable_fine_grained_param_gather_hook: + _register_pre_forward_param_unshard_hook(module) + _register_pre_backward_param_unshard_hook(module) + _register_grad_acc_and_reduce_hook(module) + # Skip if the module is already registered in fsdp_modules. if any(is_submodule(module, fsdp_module) for fsdp_module in fsdp_modules): continue - # Register the forward pre-hook to unshard parameters before the forward pass. - # If we are not sharding anything, we do not have a model weight buffer and thus - # have nothing to all-gather / un-shard. - if self.ddp_config.data_parallel_sharding_strategy != "no_shard": - self.forward_pre_hooks[f"module {name} parameter unshard"] = ( - module.register_forward_pre_hook( - _pre_forward_param_unshard, prepend=True, with_kwargs=True - ) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_pre_forward_param_unshard_hook(module) if isinstance(module, tuple(fsdp_unit_modules)): fsdp_modules.append(module) @@ -849,12 +910,8 @@ def forward_hook(_module, inputs, output): module.register_forward_hook(_post_forward, prepend=False) ) - # Register the backward pre-hook to unshard FSDP unit module parameters - # immediately before the backward pass via attaching a gradient-triggered - # hook to the output tensor(s) of a module during a post-forward hook. - self.backward_pre_hooks[f"all-gather module {name} parameters"] = ( - create_custom_backward_hook(module, _pre_backward) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_pre_backward_param_unshard_hook(module) elif ( not self.ddp_config.keep_fp8_transpose_cache and self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params" @@ -867,15 +924,8 @@ def forward_hook(_module, inputs, output): module.register_forward_hook(_release_module_fp8_transpose_cache, prepend=False) ) - # Register the post-backward hook to deallocate model parameters and - # reduce-scatter gradients immediately after the module backward pass - # has completed to conserve memory for the subsequent backward pass. - self.forward_pre_hooks[f"module {name} register post-backward hook"] = ( - module.register_forward_pre_hook( - functools.partial(_register_post_backward_hook, _post_backward), - with_kwargs=True, - ) - ) + if not self.enable_fine_grained_param_gather_hook: + _register_grad_acc_and_reduce_hook(module) # Register root module pre- and post-backward hooks in cases where the # forward function of root module is not called, but rather the forward @@ -992,7 +1042,7 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo else: self.synchronize_param_gather() for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id) + self.all_gather_pipeline.async_bucket_gather(bucket_id=bucket_id, bwd=False) group = self.param_and_grad_buffer.parameter_groups[bucket_id] if group.model_weight_buffer is None: continue @@ -1000,9 +1050,10 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo if group.model_weight_buffer.is_data_distributed: # If model weight is sharded, we wait for the all-gather to complete and # then release the bucket immediately to save memory usage. - self.all_gather_pipeline.wait_bucket_ready(bucket_id) + self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) + for bucket_id in range(self.all_gather_pipeline.num_buckets): - self.all_gather_pipeline.wait_bucket_ready(bucket_id) + self.all_gather_pipeline.wait_bucket_ready(bucket_id, False) def start_grad_sync(self, *unused): """ diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py new file mode 100644 index 00000000000..d7156bea5c6 --- /dev/null +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py @@ -0,0 +1,334 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from importlib.metadata import version +from typing import List, Optional, Tuple + +import torch +from packaging.version import Version as PkgVersion + +logger = logging.getLogger(__name__) + +# Detect if Transformer Engine is installed +try: + import transformer_engine # pylint: disable=W0611 + from transformer_engine.pytorch.module.base import TransformerEngineBaseModule + + HAVE_TE = True +except (ImportError, ModuleNotFoundError): + TransformerEngineBaseModule = None + HAVE_TE = False + logger.info("Using Megatron-FSDP without Transformer Engine.") + +# Detect the Transformer Engine version +try: + import transformer_engine as te + + if hasattr(te, "__version__"): + TE_VERSION = PkgVersion(str(te.__version__)) + else: + TE_VERSION = PkgVersion(version("transformer-engine")) +except: + TE_VERSION = None + +# Detect the FP8 tensor class +try: + from transformer_engine.pytorch.tensor import QuantizedTensor + + HAVE_TE_FP8_TENSOR_CLASS = True + FP8_TENSOR_CLASS = QuantizedTensor +except: + try: + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE_FP8_TENSOR_CLASS = True + FP8_TENSOR_CLASS = Float8Tensor + except: + HAVE_TE_FP8_TENSOR_CLASS = False + +# Detect the MXFP8 tensor class +try: + from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor + + HAVE_TE_MXFP8TENSOR = True +except: + HAVE_TE_MXFP8TENSOR = False + +# Detect the Blockwise FP8 tensor class +try: + from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockwiseQTensor + + HAVE_TE_BLOCKWISE_FP8TENSOR = True +except: + HAVE_TE_BLOCKWISE_FP8TENSOR = False + +# Detect the "cast_master_weights_to_fp8" function of Transformer Engine +try: + from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 + + HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = True +except: + HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8 = False + + # Try to import multi_tensor_apply, used in the fallback of fp8 quantization. + try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale + + multi_tensor_scale_impl = multi_tensor_scale + except ImportError: + try: + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier + + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + import warnings + + warnings.warn( + "Transformer Engine and Apex are not installed. " + "Falling back to local implementations of " + "multi_tensor_applier and multi_tensor_scale" + ) + + def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): + """Multi tensor op applier""" + return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) + + def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): + """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" + for src, dst in zip(tensor_lists[0], tensor_lists[1]): + dst.copy_(src * scale) + + multi_tensor_applier = local_multi_tensor_applier + multi_tensor_scale_impl = local_multi_tensor_scale + + def _multi_tensor_copy_this_to_that( + this: List[torch.Tensor], + that: List[torch.Tensor], + overflow_buf: Optional[torch.Tensor] = None, + ): + """ + Use multi-tensor-applier to copy values from one list to another. + We don't have a bfloat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16. + """ + if overflow_buf is not None: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + +# Detect the "post_all_gather_processing" function of Transformer Engine +try: + from transformer_engine.pytorch.tensor.utils import post_all_gather_processing + + HAVE_TE_POST_ALL_GATHER_PROCESSING = True +except: + HAVE_TE_POST_ALL_GATHER_PROCESSING = False + + +def is_te_min_version(vers, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if not isinstance(TE_VERSION, PkgVersion): + return False + + if check_equality: + return TE_VERSION >= PkgVersion(vers) + else: + return TE_VERSION > PkgVersion(vers) + + +def is_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a FP8 tensor.""" + return HAVE_TE and isinstance(tensor, FP8_TENSOR_CLASS) + + +def is_blockwise_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a Blockwise FP8 tensor.""" + return HAVE_TE_BLOCKWISE_FP8TENSOR and isinstance(tensor, Float8BlockwiseQTensor) + + +def fp8_need_transpose_data(tensor: torch.Tensor) -> bool: + """Check if a FP8 tensor needs transpose data.""" + return HAVE_TE_MXFP8TENSOR and isinstance(tensor, MXFP8Tensor) + + +def fp8_need_transpose_data_for_meta_device_init(module: TransformerEngineBaseModule) -> bool: + """Check if a FP8 tensor needs transpose data, for meta device init scenario.""" + return HAVE_TE_MXFP8TENSOR and module.fp8_meta["recipe"].mxfp8() + + +def fp8_discard_transpose_cache(tensor: torch.Tensor) -> None: + """Discard the transpose cache of a FP8 tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if hasattr(tensor, "_transpose_invalid"): + tensor._transpose_invalid = True + tensor._transpose = None + elif not fp8_need_transpose_data(tensor): + tensor.update_usage(rowwise_usage=True, columnwise_usage=False) + + +def fp8_create_transpose_cache(tensors: List[torch.Tensor]) -> None: + """Create the transpose cache of a FP8 tensor.""" + if HAVE_TE_POST_ALL_GATHER_PROCESSING: + post_all_gather_processing(tensors) + else: + _fp8_create_transpose_cache_fallback(tensors) + + +def _fp8_create_transpose_cache_fallback(tensors: List[torch.Tensor]) -> None: + if not isinstance(tensors, list): + tensors = [tensors] + for tensor in tensors: + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + if hasattr(tensor, "_create_transpose"): + tensor._create_transpose() + else: + tensor._create_columnwise() + + +def fp8_set_raw_data(tensor: torch.Tensor, data: torch.Tensor, set_transpose: bool = False) -> None: + """Set the raw data of a Transformer Engine Float8Tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if set_transpose: + assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" + data_attr = "_columnwise_data" + else: + data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" + + old_data = getattr(tensor, data_attr) + if old_data is not None: + assert ( + old_data.dtype == data.dtype + ), f"The data types of raw data don't match {old_data.dtype} vs {data.dtype}" + assert ( + old_data.shape == data.shape + ), f"Shape {old_data.shape} of old_data doesn't match {data.shape} of new_data" + setattr(tensor, data_attr, data) + + +def fp8_get_raw_data(tensor: torch.Tensor, get_transpose: bool = False) -> torch.Tensor: + """Get the underlying raw storage of a FP8 tensor.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + + if get_transpose: + assert fp8_need_transpose_data(tensor), f"Type {type(tensor)} does not need transpose data" + data_attr = "_columnwise_data" + else: + data_attr = "_rowwise_data" if hasattr(tensor, "_rowwise_data") else "_data" + + return getattr(tensor, data_attr) + + +def fp8_dequantize(tensor: torch.Tensor) -> torch.Tensor: + """Dequantize a FP8 tensor to a higher precision.""" + assert is_float8tensor(tensor), f"Type {type(tensor)} is not a FP8 tensor" + assert is_te_min_version( + "2.0" + ), "Transformer Engine >= 2.0 is required for dequantizing parameters." + return tensor.dequantize() + + +def fp8_quantize( + model_params: List[torch.Tensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: torch.distributed.ProcessGroup, + fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], +) -> None: + """Quantize sharded parameters to FP8.""" + if len(model_params) == 0: + return + fsdp_shard_model_params = [x[0] if x[1] is None else x for x in fsdp_shard_model_params] + + if HAVE_TE_CAST_MASTER_WEIGHTS_TO_FP8: + cast_master_weights_to_fp8( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params + ) + else: + _fp8_quantize_fallback( + model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params + ) + + +def _fp8_quantize_fallback( + model_params: List[torch.Tensor], + main_params: List[torch.Tensor], + start_offsets: List[int], + data_parallel_group: torch.distributed.ProcessGroup, + fsdp_shard_model_params: List[Tuple[torch.Tensor, Optional[torch.Tensor]]], +) -> None: + for model_param, main_param, start_offset, fsdp_shard_model_param in zip( + model_params, main_params, start_offsets, fsdp_shard_model_params + ): + if main_param is None: + continue + + if fsdp_shard_model_param is not None: + shard_model_param = fsdp_shard_model_param + else: + shard_model_param = model_param._data.view(-1)[ + start_offset : start_offset + main_param.numel() + ] + + quantizer = model_param._quantizer + # When not using fp8 params, the main_param (fp32) is first cast to bf16/fp16, and then + # cast to fp8 during forward. This logic keeps numerical consistency with bf16 params. + main_param = main_param.to(model_param.dtype) + out = Float8Tensor( + shape=main_param.size(), + dtype=model_param.dtype, + requires_grad=False, + data=shard_model_param, + fp8_scale_inv=model_param._scale_inv, + fp8_dtype=model_param._fp8_dtype, + quantizer=quantizer, + ) + quantizer.update_quantized(main_param, out) + + amaxes = [] + scales = [] + scale_invs = [] + for model_param in model_params: + quantizer = model_param._quantizer + amaxes.append(quantizer.amax.view(1)) + scales.append(quantizer.scale.view(1)) + scale_invs.append(model_param._scale_inv.view(1)) + model_param._reset_caches() + + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") + + # Update scaling factors. + packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) + packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] + _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) + torch.reciprocal(packed_scales, out=packed_scales) + _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) + + # Reduce amaxes. + # Note: Assume each param has a separate amax. + packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) + packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] + _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) + torch.distributed.all_reduce( + packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group + ) + _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py index 64fbe84e7eb..04ea09970f4 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py @@ -32,6 +32,17 @@ from torch.distributed import _coalescing_manager from torch.distributed.tensor import DTensor, Replicate, Shard +from .mixed_precision import ( + fp8_discard_transpose_cache, + fp8_get_raw_data, + fp8_need_transpose_data, + fp8_need_transpose_data_for_meta_device_init, + fp8_quantize, + fp8_set_raw_data, + is_blockwise_float8tensor, + is_float8tensor, + is_te_min_version, +) from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor from .utils import ( _MODEL_PARALLEL_RNG_TRACKER_NAME, @@ -50,27 +61,15 @@ from megatron.core.distributed.distributed_data_parallel_config import ( DistributedDataParallelConfig, ) - from megatron.core.fp8_utils import ( - is_float8tensor, - modify_underlying_storage, - quantize_param_shard, - ) from megatron.core.tensor_parallel import get_cuda_rng_tracker - from megatron.core.utils import is_submodule, is_te_min_version + from megatron.core.utils import is_submodule logger.info("Detected Megatron Core, using Megatron-FSDP with Megatron.") except ImportError: # Megatron-LM is not installed, use Megatron-FSDP as a standalone module. from .distributed_data_parallel_config import DistributedDataParallelConfig - from .utils import ( - get_cuda_rng_tracker, - is_float8tensor, - is_submodule, - is_te_min_version, - modify_underlying_storage, - quantize_param_shard, - ) + from .utils import get_cuda_rng_tracker, is_submodule logger.info("Megatron Core is not installed, Megatron-FSDP will run without Megatron Core.") @@ -816,7 +815,7 @@ def __init__( data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, dp_rank: Optional[int] = None, temporary_bucket_allocator: Optional[TemporaryBucketAllocator] = None, - is_dtype_float8: bool = False, + is_transpose_buffer: bool = False, gradient_scaling_factor: Optional[float] = None, chunk_size_factor: int = 1, mem_alloc_context: Optional[Callable] = None, @@ -849,7 +848,7 @@ def __init__( self.temporary_bucket_allocator = ( temporary_bucket_allocator if temporary_bucket_allocator else TemporaryBucketAllocator() ) - self.is_dtype_float8 = is_dtype_float8 + self.is_transpose_buffer = is_transpose_buffer self.gradient_scaling_factor = gradient_scaling_factor self.mem_alloc_context = mem_alloc_context if mem_alloc_context else nullcontext @@ -945,11 +944,11 @@ def fetch_bucket( for p in self.params: item_id = self.param_idx[p] p = to_local_if_dtensor(p) + data = self.get_item_from_bucket(bucket, item_id).view(p.shape) if is_float8tensor(p): - p._data = self.get_item_from_bucket(bucket, item_id).view(p.shape) + fp8_set_raw_data(p, data, self.is_transpose_buffer) else: - p.data = self.get_item_from_bucket(bucket, item_id).view(p.shape) - + p.data = data return bucket def free_bucket_storage(self): @@ -1118,6 +1117,9 @@ def set_item(self, item_id: int, item_data: torch.Tensor) -> None: # When fully sharded, we need to get the slice of the item to be stored in this shard. # Otherwise, we can just flatten the entire item since this buffer contains # the entire bucket. + if is_float8tensor(item_data): + item_data = fp8_get_raw_data(item_data, self.is_transpose_buffer) + if self.is_data_distributed: # Get the coordinates of the slice of the item that is contained in this shard. slice_start, slice_end = self._get_item_slice_in_shard(item_id) @@ -1224,6 +1226,8 @@ class ParameterGroup: Factor determining chunk size for grouped parameter processing. model_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store model weights for data-parallel operations. + transpose_weight_buffer (Optional[DataParallelBuffer]): + Buffer used to store transpose weights for data-parallel operations. main_weight_buffer (Optional[DataParallelBuffer]): Buffer used to store main model weights for data-parallel operations. main_grad_buffer (Optional[DataParallelBuffer]): @@ -1243,6 +1247,7 @@ class ParameterGroup: fsdp_unit_id: Optional[int] = None chunk_size_factor: int = 1 model_weight_buffer: Optional[DataParallelBuffer] = None + transpose_weight_buffer: Optional[DataParallelBuffer] = None main_weight_buffer: Optional[DataParallelBuffer] = None main_grad_buffer: Optional[DataParallelBuffer] = None hsdp_wbuf: Optional[DataParallelBuffer] = None @@ -1313,12 +1318,10 @@ def _does_param_require_new_bucket(param): parameter_groups = [] for name, param in module.named_parameters(): # We need this information to correctly dynamically allocate Tensors! + is_fp8 = is_float8tensor(param) + is_fp8_meta_device_init = meta_device_init_fp8_params.get(name, (False, False))[0] param_attrs = dict( - dtype=( - "float8" - if is_float8tensor(param) or meta_device_init_fp8_params.get(name, False) - else param.dtype - ), + dtype="float8" if (is_fp8 or is_fp8_meta_device_init) else param.dtype, is_expert_param=is_expert_parameter(name, param), requires_grad=param.requires_grad, fsdp_unit_id=None, @@ -1641,7 +1644,10 @@ def __init__( # to determine whether this parameter is fp8 or not. fp8_meta_index = m.param_init_meta[name].fp8_meta_index if m.primary_weights_in_fp8 and fp8_meta_index is not None: - meta_device_init_fp8_params[self.param_to_name[param]] = True + meta_device_init_fp8_params[self.param_to_name[param]] = ( + True, + fp8_need_transpose_data_for_meta_device_init(m), + ) # Get the parameter groups. (self.parameter_groups, self.param_to_param_group, self.bucket_to_bucket_group) = ( @@ -1768,6 +1774,7 @@ def _bytes_to_mb(bytes_val: int) -> str: numel = sum(to_local_if_dtensor(p).shape.numel() for p in group.params) buffers = { "weight": group.model_weight_buffer, + "transpose_weight": group.transpose_weight_buffer, "main_weight": group.main_weight_buffer, "grad": group.main_grad_buffer, } @@ -1837,12 +1844,18 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): self.weight_alloc = FixedPoolAllocator( name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) + self.transpose_weight_alloc = FixedPoolAllocator( + name="fsdp_fp8_transpose_params", + fsdp_param_groups=self.parameter_groups, + size=UB_BUFFER_NUM, + ) self.main_grad_alloc = FixedPoolAllocator( name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM ) self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units else: self.weight_alloc = StorageResizeBasedBucketAllocator() + self.transpose_weight_alloc = StorageResizeBasedBucketAllocator() self.main_grad_alloc = None self.double_buf_units = [] @@ -1882,8 +1895,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ) # Check if the parameter group is FP8. one_param = group.params[0] - is_dtype_float8 = is_float8tensor(one_param) or meta_device_init_fp8_params.get( - self.param_to_name[one_param], False + is_dtype_float8 = ( + is_float8tensor(one_param) + or meta_device_init_fp8_params.get(self.param_to_name[one_param], (False, False))[0] ) if is_dtype_float8: param_dtype = torch.uint8 @@ -1892,6 +1906,16 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): param_dtype = group.params[0].dtype grad_dtype = param_dtype + # Check if the parameter group needs a transpose buffer for model weights. + # Currently, only mxfp8 needs it. + need_transpose_data = is_float8tensor(one_param) and fp8_need_transpose_data(one_param) + need_transpose_data_for_meta_device_init = meta_device_init_fp8_params.get( + self.param_to_name[one_param], (False, False) + )[1] + should_create_transpose_weight_buffer = ( + need_transpose_data or need_transpose_data_for_meta_device_init + ) + # Check if the parameter group requires a grad buffer or main weight buffer. should_create_grad_buffer_or_main_weight_buffer = ( not self.only_create_grad_buffer_and_main_weight_buffer_for_param_requires_grad @@ -1908,13 +1932,29 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=param_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_dtype_float8=is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, mem_alloc_context=self.mem_alloc_context, **main_buf_extra_kwargs, ) + if should_create_transpose_weight_buffer: + group.transpose_weight_buffer = DataParallelBuffer( + self.ddp_config, + group.params, + is_data_distributed=is_model_weight_buffer_distributed + and main_buf_dp_group.size() > 1, + dtype=param_dtype, + device=self.device, + data_parallel_group=main_buf_dp_group, + is_transpose_buffer=True, + temporary_bucket_allocator=self.transpose_weight_alloc, + bucket_id=group_id, + chunk_size_factor=group.chunk_size_factor, + mem_alloc_context=self.mem_alloc_context, + **main_buf_extra_kwargs, + ) # Initialize the main weight buffer. if should_create_grad_buffer_or_main_weight_buffer and preserve_fp32_weights: @@ -1946,7 +1986,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=torch.float32 if grad_reduce_in_fp32 else grad_dtype, device=self.device, data_parallel_group=main_buf_dp_group, - is_dtype_float8=False, + is_transpose_buffer=False, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -1970,7 +2010,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=wbuf.dtype, device=wbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_dtype_float8=wbuf.is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.weight_alloc, bucket_id=group_id, chunk_size_factor=group.chunk_size_factor, @@ -1986,6 +2026,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): ), ) + if group.transpose_weight_buffer is not None: + raise NotImplementedError("HSDP for transpose buffer is not implemented yet") + if should_create_grad_buffer_or_main_weight_buffer: # Initialize the HSDP grad buffer. gbuf = group.main_grad_buffer @@ -1997,7 +2040,7 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): dtype=gbuf.dtype, device=gbuf.device, data_parallel_group=hsdp_buf_dp_group, - is_dtype_float8=gbuf.is_dtype_float8, + is_transpose_buffer=False, temporary_bucket_allocator=self.main_grad_alloc, gradient_scaling_factor=gradient_scaling_factor, bucket_id=group_id, @@ -2080,6 +2123,20 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): torch.empty(wbuf.data_size, dtype=wbuf.dtype, device=self.device) ) bucket = wbuf.fetch_bucket() + + tbuf = group.transpose_weight_buffer + if tbuf: + with self.mem_alloc_context(): + if group.hsdp_wbuf: + raise NotImplementedError( + "HSDP for transpose buffer is not implemented yet" + ) + else: + tbuf.init_data( + torch.empty(tbuf.data_size, dtype=tbuf.dtype, device=self.device) + ) + transpose_bucket = tbuf.fetch_bucket() + mbuf = group.main_weight_buffer if mbuf: # Manually instantiate an empty tensor into the main weight buffer. @@ -2133,25 +2190,41 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): if not self.ddp_config.keep_fp8_transpose_cache: for _param in m.parameters(recurse=False): if is_float8tensor(_param): - _param._transpose_invalid = True - _param._transpose = None + fp8_discard_transpose_cache(_param) # Raise error if a meta parameter still exists after initialization. assert not p.is_meta, (self.param_to_name[p], module_reset_flag) + p_local = to_local_if_dtensor(p) + # Copy the model weight parameter tensor into the buffer. # When distributed, this shards and preserves the data across all ranks. - wbuf.set_item(item_id, to_local_if_dtensor(p)) + wbuf.set_item(item_id, p_local) + if tbuf: + tbuf.set_item(item_id, p_local) # Retrieve the newly allocated parameter data from the global bucket. # Attach the bucket-allocated parameter data to the module parameter, # to use the bucket-allocated data for autograd and NCCL. - new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view( - to_local_if_dtensor(p).shape - ) - if is_float8tensor(p): - # Needed to instantiate FP8 parameters. Requires installing - # TransformerEngine. - modify_underlying_storage(p, new_param_data) + new_param_data = wbuf.get_item_from_bucket(bucket, item_id).view(p_local.shape) + if tbuf: + new_transpose_data = tbuf.get_item_from_bucket( + transpose_bucket, item_id + ).view(p_local.shape) + else: + new_transpose_data = None + + if is_float8tensor(p_local): + old_param_data = fp8_get_raw_data(p_local) + assert old_param_data._base is None + new_param_data.detach().copy_(old_param_data) + fp8_set_raw_data(p_local, new_param_data) + del old_param_data + if new_transpose_data is not None: + old_transpose_data = fp8_get_raw_data(p_local, True) + assert old_transpose_data._base is None + new_transpose_data.detach().copy_(old_transpose_data) + fp8_set_raw_data(p_local, new_transpose_data, True) + del old_transpose_data elif isinstance(p, DTensor): old_param_data = p._local_tensor.data p._local_tensor.data = new_param_data @@ -2189,7 +2262,12 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # the (high-precision) main weight buffer. # Nothing else needs to be done, because the main weights # do not require autograd operations, only possibly sharding. - mbuf.set_item(item_id, to_local_if_dtensor(p)) + p_local = to_local_if_dtensor(p) + assert not is_float8tensor(p_local), ( + self.param_to_name[p], + "fp8 param should use get_high_precision_init_val method.", + ) + mbuf.set_item(item_id, p_local) if wbuf and wbuf.is_data_distributed: # Free the memory backing the temporarily-allocated bucket associated @@ -2201,6 +2279,9 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params): # before forward activations and gradients are allocated in training. wbuf.free_bucket_storage() + if tbuf and tbuf.is_data_distributed: + tbuf.free_bucket_storage() + # Allocate the main_weight buffer and main_grad buffer data in one buffer. if self.buffer_all_in_one: with self.mem_alloc_context(): @@ -2324,6 +2405,7 @@ def _reset_parameters(self, old_params, new_params): group.params[item_id] = new_p for buf in [ group.model_weight_buffer, + group.transpose_weight_buffer, group.main_weight_buffer, group.main_grad_buffer, group.hsdp_wbuf, @@ -2371,6 +2453,7 @@ def _init_distributed_params(self): dist_main_weight = {} for pg in self.parameter_groups: wbuf = pg.model_weight_buffer + tbuf = pg.transpose_weight_buffer mbuf = pg.main_weight_buffer for item_id, orig_param in enumerate(pg.params): param_name = self.param_to_name[orig_param] @@ -2398,6 +2481,7 @@ def _init_distributed_params(self): ) dist_main_weight[param_name] = dist_param elif wbuf: + assert tbuf is None, "Transpose buffer should only exist when main params exist" dist_param = make_fsdp_dtensor( local_tensor=wbuf.get_item(item_id, only_shard=sharded_optimizer_state), param=orig_param, @@ -2567,9 +2651,54 @@ def copy_main_weights_to_model_weights(self): expert_param_quantize_kwargs = copy.deepcopy(dense_param_quantize_kwargs) data_parallel_group = None expert_data_parallel_group = None + clear_quantize_kwargs = lambda kwargs: [d.clear() for d in kwargs.values()] + + def _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs): + if len(dense_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 parameters, we need to quantize them. + fp8_quantize(data_parallel_group=data_parallel_group, **dense_param_quantize_kwargs) + + if len(expert_param_quantize_kwargs["model_params"]) > 0: + # If we have FP8 expert parameters, we need to quantize them. + fp8_quantize( + data_parallel_group=expert_data_parallel_group, **expert_param_quantize_kwargs + ) + + clear_quantize_kwargs(dense_param_quantize_kwargs) + clear_quantize_kwargs(expert_param_quantize_kwargs) + + # Special handling of blockwise FP8 + BATCH_QUANT_MEMORY_LIMIT_BYTES = 5 * 1024**3 # 5 GB + blockwise_fp8_weight_buffers = [] + blockwise_fp8_param_buffers = [] + + def _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers + ): + if len(blockwise_fp8_param_buffers) == 0: + return + + # Copy original param shards into their blockwise FP8 working buffers + for bufs in blockwise_fp8_param_buffers: + bufs["bucket_param"].copy_(bufs["param"]) + + # Apply FP8 quantization to blockwise FP8 parameters + _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) + + # Copy quantized params back from working buffers to original param tensors + for bufs in blockwise_fp8_param_buffers: + bufs["param"].copy_(bufs["bucket_param"]) + blockwise_fp8_param_buffers.clear() + + # Free bucket storage for blockwise FP8 weight buffers + for wbuf in blockwise_fp8_weight_buffers: + wbuf.free_bucket_storage() + blockwise_fp8_weight_buffers.clear() + for pg in self.parameter_groups: mbuf = pg.main_weight_buffer wbuf = pg.model_weight_buffer + tbuf = pg.transpose_weight_buffer if mbuf is None: continue @@ -2585,44 +2714,88 @@ def copy_main_weights_to_model_weights(self): shard_offsets_in_fp8 = quantize_func_kwargs["start_offsets"] shard_model_params = quantize_func_kwargs["fsdp_shard_model_params"] + has_blockwise_fp8_param = False for param in pg.params: item_id = mbuf.param_idx[param] if wbuf: if wbuf.is_data_distributed or mbuf.is_data_distributed: model_param = wbuf.get_item(item_id, only_shard=True) + if tbuf: + transpose_param = tbuf.get_item(item_id, only_shard=True) + else: + transpose_param = None main_weight = mbuf.get_item(item_id, only_shard=True) else: model_param = wbuf.get_item(item_id) + if tbuf: + transpose_param = tbuf.get_item(item_id) + else: + transpose_param = None main_weight = mbuf.get_item(item_id) else: assert not mbuf.is_data_distributed model_param = to_local_if_dtensor(param) main_weight = mbuf.get_item(item_id) + if is_blockwise_float8tensor(param): + fp8_params.append(param) + if model_param.numel() == 0: + shard_fp32_from_fp8.append(None) + shard_offsets_in_fp8.append(None) + shard_model_params.append([None, None]) + else: + shard_fp32_from_fp8.append(main_weight) + shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) + bucket = wbuf.fetch_bucket() + b_model_param = wbuf.get_item_from_bucket(bucket, item_id)[ + slice(*wbuf.locate_item_in_global_item(item_id)) + ] + assert ( + transpose_param is None + ), "Blockwise FP8 does not support transpose param." + shard_model_params.append([b_model_param, None]) + assert b_model_param.numel() == model_param.numel(), ( + f"Blockwise FP8 bucket param numel {b_model_param.numel()} does" + f" not match model param numel {model_param.numel()}" + f" name: {self.param_to_name[param]}" + ) + blockwise_fp8_param_buffers.append( + {"bucket_param": b_model_param, "param": model_param} + ) + has_blockwise_fp8_param = True + continue + if is_float8tensor(param): fp8_params.append(param) if model_param.numel() == 0: shard_fp32_from_fp8.append(None) shard_offsets_in_fp8.append(None) - shard_model_params.append(None) + shard_model_params.append([None, None]) else: shard_fp32_from_fp8.append(main_weight) shard_offsets_in_fp8.append(wbuf.locate_item_in_global_item(item_id)[0]) - shard_model_params.append(model_param) + shard_model_params.append([model_param, transpose_param]) continue if model_param.numel() > 0: model_param.data.copy_(main_weight.view(model_param.shape)) - if len(dense_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 parameters, we need to quantize them. - dense_param_quantize_kwargs["data_parallel_group"] = data_parallel_group - quantize_param_shard(**dense_param_quantize_kwargs) + if has_blockwise_fp8_param: + blockwise_fp8_weight_buffers.append(wbuf) + if ( + sum([wbuf.bucket_index.size for wbuf in blockwise_fp8_weight_buffers]) + > BATCH_QUANT_MEMORY_LIMIT_BYTES + ): + _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, + expert_param_quantize_kwargs, + blockwise_fp8_param_buffers, + ) - if len(expert_param_quantize_kwargs["model_params"]) > 0: - # If we have FP8 expert parameters, we need to quantize them. - expert_param_quantize_kwargs["data_parallel_group"] = expert_data_parallel_group - quantize_param_shard(**expert_param_quantize_kwargs) + _batch_quantize_blockwise_fp8_params( + dense_param_quantize_kwargs, expert_param_quantize_kwargs, blockwise_fp8_param_buffers + ) + _fp8_quantize_params(dense_param_quantize_kwargs, expert_param_quantize_kwargs) @torch.no_grad() def copy_model_weights_to_main_weights(self): @@ -2640,6 +2813,7 @@ def copy_model_weights_to_main_weights(self): f"Master weight buffer size {mbuf.data.numel()} does not match " f"model weight buffer size {copyin_data.numel()}" ) + # TODO(mxfp8): Make sure it's not a fp8 buf? mbuf.data.copy_(copyin_data.data) def all_gather_parameters(self, async_op: bool = True): @@ -2657,15 +2831,18 @@ def all_gather_parameters(self, async_op: bool = True): all_gather_ops = [] for g in self.parameter_groups: - shard = g.model_weight_buffer.get_shard_from_local_buffer() - all_gather_handler = torch.distributed.all_gather_into_tensor( - output_tensor=g.model_weight_buffer.data, - input_tensor=shard, - group=g.model_weight_buffer.data_parallel_group, - async_op=async_op, - ) - if async_op: - all_gather_ops.append(all_gather_handler) + for buf in [g.model_weight_buffer, g.transpose_weight_buffer]: + if buf is None: + continue + shard = buf.get_shard_from_local_buffer() + all_gather_handler = torch.distributed.all_gather_into_tensor( + output_tensor=buf.data, + input_tensor=shard, + group=buf.data_parallel_group, + async_op=async_op, + ) + if async_op: + all_gather_ops.append(all_gather_handler) for op in all_gather_ops: op.wait() @@ -2686,7 +2863,7 @@ def reduce_scatter_gradients(self, async_op: bool = True): reduce_scatter_ops = [] for g in self.parameter_groups: gbuf = g.main_grad_buffer - if gbuf is not None: + if gbuf is None: continue scaling_factor = gbuf.gradient_scaling_factor reduce_op = gradient_reduce_preprocessing(gbuf.data, scaling_factor, self.ddp_config) @@ -3136,9 +3313,16 @@ def __init__( # Track the status of all-gather operations for each bucket. self.param_gather_event_map = {} # All buckets are initially deallocated / empty after initialization of ParamAndGradBuffer. - self.bucket_status = {i: BucketStatus.EMPTY for i in range(self.buffer.num_buckets)} + self.bucket_status = {} + for i in range(self.buffer.num_buckets): + for bwd in [False, True]: + self.bucket_status[self.get_bucket_key(i, bwd)] = BucketStatus.EMPTY + # Track whether each bucket can be deallocated. - self.bucket_can_be_released = {i: False for i in range(self.buffer.num_buckets)} + self.bucket_can_be_released = {} + for i in range(self.buffer.num_buckets): + for bwd in [False, True]: + self.bucket_can_be_released[self.get_bucket_key(i, bwd)] = False # Map each bucket to the bucket group it belongs to by enumerated ID. # Made to collect a subset of buckets in the same bucket group. @@ -3163,6 +3347,13 @@ def __init__( # all-gather parameters across groups. self.outer_fsdp_group_param_gather_stream = torch.cuda.Stream() + def get_bucket_key(self, bucket_id, bwd): + """Get the key for the bucket.""" + has_transpose_buffer = ( + self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None + ) + return (bucket_id, has_transpose_buffer and bwd) + @property def num_buckets(self): """Return the number of buckets.""" @@ -3179,10 +3370,11 @@ def reset(self): UserWarning, ) while len(self.param_gather_event_map) > 0: - bucket_id = next(iter(self.param_gather_event_map)) - self.wait_bucket_ready(bucket_id) + (bucket_id, bwd) = next(iter(self.param_gather_event_map)) + self.wait_bucket_ready(bucket_id, bwd) for bucket_id in range(self.num_buckets): - self.bucket_can_be_released[bucket_id] = True + for bwd in [False, True]: + self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = True self.recycle_unused_buckets() assert all([status is BucketStatus.EMPTY for status in self.bucket_status.values()]), ( @@ -3204,6 +3396,7 @@ def all_gather_params( suggested_AG_prefetch_size: Optional[int] = None, async_param_gather: bool = True, outer_fsdp_group_param_gather: bool = False, + bwd: bool = False, ): """All-gather the params. If prefetch is enabled, prefetch next buckets in the order of `prefetch_order`. @@ -3238,7 +3431,7 @@ def all_gather_params( # Do not release the buckets that are being all-gathered. for bucket_id in ag_buckets: - self.bucket_can_be_released[bucket_id] = False + self.bucket_can_be_released[self.get_bucket_key(bucket_id, bwd)] = False # If prefetch is enabled, we will add prefetch buckets to ag_buckets. if prefetch: @@ -3310,7 +3503,11 @@ def need_skip_prefetch(bucket_id): bucket_id = next_bucket_id(ag_buckets) # Only all-gather on buckets that have not been allocated yet. - ag_buckets = [i for i in ag_buckets if self.bucket_status[i] == BucketStatus.EMPTY] + ag_buckets = [ + bucket_id + for bucket_id in ag_buckets + if self.bucket_status[self.get_bucket_key(bucket_id, bwd)] == BucketStatus.EMPTY + ] if len(ag_buckets) == 0: return @@ -3329,6 +3526,7 @@ def need_skip_prefetch(bucket_id): self.ag_stream if self.ag_stream is not None else torch.cuda.current_stream() ) if outer_fsdp_group_param_gather: + # TODO(mxfp8): Support hsdp self.outer_fsdp_group_param_gather_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self.outer_fsdp_group_param_gather_stream): outer_fsdp_group = self.buffer.dist_index.get_outer_fsdp_group() @@ -3356,12 +3554,13 @@ def need_skip_prefetch(bucket_id): for bucket_id in buckets: # All-gather the module weights from each FSDP buffer shard # into an allocated bucket containing unsharded weights. - self.async_bucket_gather(bucket_id) + self.async_bucket_gather(bucket_id, bwd) # Replace the parameter all-gather event with coalescing event. for bucket_id in buckets: - _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_id] - self.param_gather_event_map[bucket_id] = ( + bucket_key = self.get_bucket_key(bucket_id, bwd) + _, mark_bucket_ready_to_use = self.param_gather_event_map[bucket_key] + self.param_gather_event_map[bucket_key] = ( coalescing_event, mark_bucket_ready_to_use, ) @@ -3369,14 +3568,16 @@ def need_skip_prefetch(bucket_id): # Wait for all-gather to finish if not async_param_gather: for bucket_id in buckets: - self.wait_bucket_ready(bucket_id) + self.wait_bucket_ready(bucket_id, bwd) - def wait_bucket_ready(self, bucket_id, empty_ok=False): + def wait_bucket_ready(self, bucket_id, bwd, empty_ok=False): """Wait for the bucket to be ready.""" - if self.bucket_status[bucket_id] == BucketStatus.READY_TO_USE: + bucket_key = self.get_bucket_key(bucket_id, bwd) + + if self.bucket_status[bucket_key] == BucketStatus.READY_TO_USE: # Already ready to use. return - if self.bucket_status[bucket_id] == BucketStatus.EMPTY: + if self.bucket_status[bucket_key] == BucketStatus.EMPTY: if empty_ok: return # Bucket shouldn't be empty, this implies that the bucket @@ -3384,48 +3585,64 @@ def wait_bucket_ready(self, bucket_id, empty_ok=False): raise ValueError(f"Bucket {bucket_id} is empty.") # Wait for asynchronous / overlapped NCCL operations to complete. - param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_id) + param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_key) param_gather_event.wait() mark_bucket_ready_to_use() @torch.no_grad() - def release_bucket(self, bucket_id: int): + def release_bucket(self, bucket_id, bwd): """Release the bucket.""" - if self.bucket_status[bucket_id] == BucketStatus.EMPTY: + # TODO(mxfp8): In some cases, there won't be ag before bwd? + bucket_key = self.get_bucket_key(bucket_id, bwd) + + if self.bucket_status[bucket_key] == BucketStatus.EMPTY: return - self.wait_bucket_ready(bucket_id, empty_ok=True) - if self.bucket_status[bucket_id] == BucketStatus.COMMUNICATING: + self.wait_bucket_ready(bucket_id, bwd, empty_ok=True) + if self.bucket_status[bucket_key] == BucketStatus.COMMUNICATING: raise ValueError(f"Bucket {bucket_id} is communicating.") - wbuf = self.buffer.parameter_groups[bucket_id].model_weight_buffer - wbuf.free_bucket_storage() - self.bucket_status[bucket_id] = BucketStatus.EMPTY + if bwd and self.buffer.parameter_groups[bucket_id].transpose_weight_buffer is not None: + buf = self.buffer.parameter_groups[bucket_id].transpose_weight_buffer + else: + buf = self.buffer.parameter_groups[bucket_id].model_weight_buffer + + buf.free_bucket_storage() + self.bucket_status[bucket_key] = BucketStatus.EMPTY def recycle_unused_buckets(self): """Recycle the unused buckets.""" - for bucket_id, can_be_released in self.bucket_can_be_released.items(): + for bucket_key, can_be_released in self.bucket_can_be_released.items(): if can_be_released: - self.release_bucket(bucket_id) - self.bucket_can_be_released[bucket_id] = False + bucket_id, is_transpose_weight = bucket_key[0], bucket_key[1] + self.release_bucket(bucket_id, is_transpose_weight) + self.bucket_can_be_released[bucket_key] = False - def get_fsdp_buffer(self, bucket_id: int) -> DataParallelBuffer: + def get_fsdp_buffer(self, bucket_id: int, bwd=False) -> DataParallelBuffer: """Get the FSDP buffer with the given bucket ID.""" param_group = self.buffer.parameter_groups[bucket_id] if self.buffer.ddp_config.outer_dp_sharding_strategy != "no_shard": - return param_group.hsdp_wbuf - return param_group.model_weight_buffer + if bwd and param_group.transpose_weight_buffer is not None: + raise RuntimeError("Transpose buffer is not supported for HSDP") + else: + return param_group.hsdp_wbuf + if bwd and param_group.transpose_weight_buffer is not None: + return param_group.transpose_weight_buffer + else: + return param_group.model_weight_buffer @torch.no_grad() - def async_bucket_gather(self, bucket_id: int) -> None: + def async_bucket_gather(self, bucket_id, bwd) -> None: """All-gather the bucket and set the items.""" - self.bucket_can_be_released[bucket_id] = False - if self.bucket_status[bucket_id] != BucketStatus.EMPTY: + bucket_key = self.get_bucket_key(bucket_id, bwd) + + self.bucket_can_be_released[bucket_key] = False + if self.bucket_status[bucket_key] != BucketStatus.EMPTY: return - self.bucket_status[bucket_id] = BucketStatus.COMMUNICATING + self.bucket_status[bucket_key] = BucketStatus.COMMUNICATING - wbuf = self.get_fsdp_buffer(bucket_id) + wbuf = self.get_fsdp_buffer(bucket_id, bwd) # Lazy release the unused buckets. self.recycle_unused_buckets() @@ -3440,18 +3657,21 @@ def async_bucket_gather(self, bucket_id: int) -> None: async_op=True, ) - def get_closure(bucket_id): + def get_closure(bucket_id, bwd): @torch.no_grad() def mark_bucket_ready_to_use(): # Mark the bucket as ready to use - all NCCL operations are complete. - self.bucket_status[bucket_id] = BucketStatus.READY_TO_USE + self.bucket_status[self.get_bucket_key(bucket_id, bwd)] = BucketStatus.READY_TO_USE return mark_bucket_ready_to_use - mark_bucket_ready_to_use = get_closure(bucket_id) + mark_bucket_ready_to_use = get_closure(bucket_id, bwd) # Track the async all-gather operation for the bucket. - self.param_gather_event_map[bucket_id] = (param_gather_event, mark_bucket_ready_to_use) + self.param_gather_event_map[self.get_bucket_key(bucket_id, bwd)] = ( + param_gather_event, + mark_bucket_ready_to_use, + ) @torch.no_grad() @@ -3544,15 +3764,13 @@ def override_sharded_param_methods_with_safety_checks(params, all_gather_pipelin def override_sharded_param_to_function_closure(p, to_function): def override_sharded_param_to_function(*args, **kwargs): - bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] - status = all_gather_pipeline.bucket_status[bucket_id] - if status == BucketStatus.READY_TO_USE: - return to_function(*args, **kwargs) - raise RuntimeError( - "This parameter is already shard by MCore FSDP and the " - "shared-state parameter does not support 'to' function." - "please define the dtype and device of the parameter before FSDP wrap." - ) + if p._typed_storage()._size() == 0: + warnings.warn( + "The parameter may be sharded by Megatron-FSDP, " + "no actual 'to' operation is performed." + ) + return torch.empty([]) + return to_function(*args, **kwargs) return override_sharded_param_to_function @@ -3560,15 +3778,13 @@ def override_sharded_param_to_function(*args, **kwargs): def override_sharded_param_cpu_function_closure(p, cpu_function): def override_sharded_param_cpu_function(*args, **kwargs): - bucket_id = all_gather_pipeline.buffer.param_to_param_group[p] - status = all_gather_pipeline.bucket_status[bucket_id] - if status == BucketStatus.READY_TO_USE: - return cpu_function(*args, **kwargs) - warnings.warn( - "The parameters are sharded by MCore FSDP, and no actual cpu " - "operation is performed." - ) - return torch.empty([], device="cpu") + if p._typed_storage()._size() == 0: + warnings.warn( + "The parameter may be sharded by Megatron-FSDP, " + "no actual 'cpu' operation is performed." + ) + return torch.empty([], device="cpu") + return cpu_function(*args, **kwargs) return override_sharded_param_cpu_function diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py index e3e9996335e..01523929ae1 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py @@ -19,7 +19,7 @@ from contextlib import nullcontext from functools import reduce from importlib.metadata import version -from typing import Callable, List, Optional, Sequence, Union +from typing import Callable, Optional, Sequence, Union try: import einops @@ -78,52 +78,6 @@ def is_te_min_version(vers, check_equality=True): return te_version > PkgVersion(vers) -# Check if Transformer Engine has class for fp8 tensors. -try: - if is_te_min_version("2.0"): - # In TE2.x, QuantizedTensor is the base class for all different type of fp8 tensors, - # including fp8 tensor for delayed scaling, current scaling and mxfp8, etc. - from transformer_engine.pytorch.tensor import QuantizedTensor as FP8_TENSOR_CLASS - else: - from transformer_engine.pytorch.float8_tensor import Float8Tensor as FP8_TENSOR_CLASS - - HAVE_TE_FP8_TENSOR_CLASS = True -except (ImportError, ModuleNotFoundError): - # FP8 tensor class not found - HAVE_TE_FP8_TENSOR_CLASS = False - -try: - from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale - - multi_tensor_scale_impl = multi_tensor_scale -except ImportError: - try: - import amp_C - from apex.multi_tensor_apply import multi_tensor_applier - - multi_tensor_scale_impl = amp_C.multi_tensor_scale - except ImportError: - import warnings - - warnings.warn( - "Transformer Engine and Apex are not installed. " - "Falling back to local implementations of " - "multi_tensor_applier and multi_tensor_scale" - ) - - def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): - """Multi tensor op applier""" - return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) - - def local_multi_tensor_scale(chunk_size, noop_flag, tensor_lists, scale): - """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" - for src, dst in zip(tensor_lists[0], tensor_lists[1]): - dst.copy_(src * scale) - - multi_tensor_applier = local_multi_tensor_applier - multi_tensor_scale_impl = local_multi_tensor_scale - - def is_submodule(module, parent_module, strict=True): """ Check if a module is a submodule of another module. @@ -137,18 +91,6 @@ def is_submodule(module, parent_module, strict=True): return False -def is_float8tensor(tensor: torch.Tensor) -> bool: - """Check if a tensor is a Transformer Engine Float8Tensor. - - Note that in TE2.x, in order to support more recipes, the design of the fp8 tensor class has - changed. Now Float8Tensor is only used for current scaling and delayed scaling. And mxfp8 - and blockwise scaling have their own fp8 tensor classes. These different fp8 tensor classes - are both inherited from QuantizedTensor. So, for TE1.x, FP8_TENSOR_CLASS is Float8Tensor, - and for TE2.x, FP8_TENSOR_CLASS is QuantizedTensor. - """ - return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS) - - def get_mesh_names( device_mesh: Optional[DeviceMesh] = None, only_submesh_dims: bool = False ) -> list[str]: @@ -210,198 +152,6 @@ def contains_submesh( return all(submesh_name in device_mesh_names for submesh_name in submesh_names) -def _multi_tensor_copy_this_to_that( - this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None -): - """ - Use multi-tensor-applier to copy values from one list to another. - We don't have a bfloat16 implementation so for now if the overflow_buf - is not provided, we default back to simple loop copy to be compatible - with bfloat16. - """ - if overflow_buf is not None: - overflow_buf.fill_(0) - # Scaling with factor `1.0` is equivalent to copy. - multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) - else: - for this_, that_ in zip(this, that): - that_.copy_(this_) - - -""" -The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into -several functions. It provides different implementations for each function based on different -versions of TE, ensuring compatibility across various TE versions. - -Currently, there are three functions: - - modify_underlying_storage - This function is used in DDP to place all parameters into a contiguous buffer. For - non-fp8 tensors, replacing their data is simple, just using code like - "tensor.data = new_data". However, for fp8 tensors, their raw data is not stored in the - ".data" attribute, and it varies with different TE versions and different recipes. This - function provides a unified interface to replace the underlying storage of a fp8 tensor. - - quantize_param_shard - This function is used in dist-opt to cast fp32 main params to fp8 params. For non-fp8 - params, this casting is as simple as "bf16_params.copy_(fp32_main_params)"; but for fp8 - params, the casting logic varies with different TE versions and different recipes. This - function provides a unified interface to cast fp32 main params to fp8 params, and also - updates the necessary attributes (like amax, scale, scale_inv or transpose cache) of the - fp8 model params. - - correct_amax_history_if_needed - This function is used to correct the amax history of fp8 tensors. In TE1.x, some inplace - copy operations will write unwanted values to the amax_history of fp8 tensors. This function - corrects the amax_history back. For TE2.x, it's an empty function. - Only useful for delayed scaling. -""" -if HAVE_TE and is_te_min_version("2.2"): - # Supported TE versions: 2.2+ - from transformer_engine.pytorch.tensor import QuantizedTensor - - def _modify_underlying_storage_impl( - fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor - ) -> None: - from transformer_engine.pytorch.tensor.utils import replace_raw_data - - replace_raw_data(fp8_tensor, new_raw_data) - - def _quantize_param_shard_impl( - model_params: List[QuantizedTensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: ProcessGroup, - fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, - ) -> None: - if len(model_params) == 0: - return - - from transformer_engine.pytorch.tensor.utils import cast_master_weights_to_fp8 - - args = [model_params, main_params, start_offsets, data_parallel_group] - if fsdp_shard_model_params is not None: - if get_te_version() == PkgVersion("2.3.0.dev0+5fdd7bb") or is_te_min_version("2.3.0"): - args.append(fsdp_shard_model_params) - else: - raise NotImplementedError( - f"FSDP with --fp8-param-gather is not supported in TE v{get_te_version()}" - ) - cast_master_weights_to_fp8(*args) - -elif HAVE_TE and is_te_min_version("2.0"): - # Supported TE versions: 2.0 - from transformer_engine.pytorch.tensor import QuantizedTensor - from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor - - def _modify_underlying_storage_impl( - fp8_tensor: QuantizedTensor, new_raw_data: torch.Tensor - ) -> None: - old_raw_data = fp8_tensor._data - assert old_raw_data.dtype == new_raw_data.dtype - new_raw_data.detach().copy_(old_raw_data) - fp8_tensor._data = new_raw_data - del old_raw_data - - def _quantize_param_shard_impl( - model_params: List[QuantizedTensor], - main_params: List[torch.Tensor], - start_offsets: List[int], - data_parallel_group: ProcessGroup, - fsdp_shard_model_params: Optional[List[torch.Tensor]] = None, - ) -> None: - if len(model_params) == 0: - return - - if fsdp_shard_model_params is None: - fsdp_shard_model_params = [None] * len(model_params) - - for model_param, main_param, start_offset, fsdp_shard_model_param in zip( - model_params, main_params, start_offsets, fsdp_shard_model_params - ): - if main_param is None: - continue - - if fsdp_shard_model_param is not None: - shard_model_param = fsdp_shard_model_param - else: - shard_model_param = model_param._data.view(-1)[ - start_offset : start_offset + main_param.numel() - ] - - quantizer = model_param._quantizer - # When not using --fp8-param-gather, the main_param (fp32) is first cast to bf16/fp16, - # and then cast to fp8 during forward. - # Although it's not necessary when --fp8-param-gather is enabled, we still keep this - # logic to keep numerical consistency. So here cast the main_param to model_param.dtype. - main_param = main_param.to(model_param.dtype) - out = Float8Tensor( - shape=main_param.size(), - dtype=model_param.dtype, - requires_grad=False, - data=shard_model_param, - fp8_scale_inv=model_param._scale_inv, - fp8_dtype=model_param._fp8_dtype, - quantizer=quantizer, - ) - quantizer.update_quantized(main_param, out) - - amaxes = [] - scales = [] - scale_invs = [] - for model_param in model_params: - quantizer = model_param._quantizer - amaxes.append(quantizer.amax.view(1)) - scales.append(quantizer.scale.view(1)) - scale_invs.append(model_param._scale_inv.view(1)) - model_param._reset_caches() - - dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda") - - # Update scaling factors. - packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) - packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] - _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) - torch.reciprocal(packed_scales, out=packed_scales) - _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) - - # Reduce amaxes. - # Note: Assume each param has a separate amax. - packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) - packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] - _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) - torch.distributed.all_reduce( - packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=data_parallel_group - ) - _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) - -else: - # Fallback impl if TE version is invalid or TE is not installed. - def _modify_underlying_storage_impl(*args, **kwargs): - raise RuntimeError( - "Invalid Transformer Engine version for FP8 distributed optimizer, " - "please install Transformer Engine 2.0+ or install Megatron-Core" - ) - - def _quantize_param_shard_impl(*args, **kwargs): - raise RuntimeError( - "Invalid Transformer Engine version for FP8 distributed optimizer, " - "please install Transformer Engine 2.0+ or install Megatron-Core" - ) - - -def modify_underlying_storage(tensor: torch.Tensor, new_raw_data: torch.Tensor): - """Replace the underlying raw data of a tensor with new data.""" - _modify_underlying_storage_impl(tensor, new_raw_data) - - -def quantize_param_shard( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params=None -): - """Cast shard fp32 main params to fp8 model params.""" - assert HAVE_TE, "Transformer Engine is required for quantizing parameters." - _quantize_param_shard_impl( - model_params, main_params, start_offsets, data_parallel_group, fsdp_shard_model_params - ) - - def _get_cuda_rng_state( device: Union[int, str, torch.device] = "cuda", clone: bool = False, graph_safe: bool = False ) -> torch.Tensor: diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 9d9bfcd7e90..9aba3a7cb8e 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -744,6 +744,13 @@ def validate_args(args, defaults={}): assert args.ckpt_format == "fsdp_dtensor", \ "Megatron FSDP only supports fsdp_dtensor checkpoint format" + + if args.use_megatron_fsdp: + args.reuse_grad_buf_for_mxfp8_param_ag = False + + if args.fsdp_manual_registration: + assert args.use_megatron_fsdp, "FSDP manual registration is only supported with Megatron FSDP" + assert args.nccl_ub, "FSDP manual registration is only supported with nccl-ub option" # Parameters dtype. args.params_dtype = torch.float From 1ec0beb1eb973058fad8d7a4ab9b6a0699485199 Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Wed, 7 Jan 2026 04:23:47 +0800 Subject: [PATCH 216/248] [Dev] Partial CUDA Graph support for EP Overlap (#2810) --- .../common/model_chunk_schedule_plan.py | 40 +- .../core/models/gpt/fine_grained_callables.py | 208 +++--- megatron/core/pipeline_parallel/schedules.py | 105 ++++ megatron/core/pipeline_parallel/utils.py | 4 +- megatron/core/transformer/cuda_graphs.py | 84 ++- megatron/core/transformer/moe/moe_layer.py | 7 +- .../core/transformer/transformer_config.py | 15 + .../core/transformer/transformer_layer.py | 36 ++ .../golden_values_dev_dgx_h100.json | 592 +++++++++--------- .../model_config.yaml | 5 +- .../test_cuda_graphed_schedule_chunk_1f1b.py | 372 +++++++++++ .../a2a_overlap/test_schedule_layer_1f1b.py | 2 +- tests/unit_tests/a2a_overlap/utils.py | 1 + .../pipeline_parallel/test_schedules.py | 48 ++ .../transformer/test_submodule_callables.py | 16 +- 15 files changed, 1109 insertions(+), 426 deletions(-) create mode 100644 tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index 07bab1cb486..b8f11ed9d38 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -17,6 +17,7 @@ get_comm_stream, get_comp_stream, ) +from megatron.core.transformer.enums import CudaGraphScope class ModelChunkState: @@ -37,23 +38,20 @@ class TransformerLayerSchedulePlan: mtp post process nodes. layer (TransformerLayerSchedulePlan) - ├── attn (TransformerLayerNode): attention module - ├── post_attn (TransformerLayerNode): layernorm -> router -> dispatch preprocess + ├── attn (TransformerLayerNode): attention -> router -> dispatch preprocess ├── moe_dispatch (TransformerLayerNode): dispatch All2All ├── mlp (TransformerLayerNode): mlp module ├── moe_combine (TransformerLayerNode): combine All2All └── mtp_post_process (PostProcessNode): mtp post process Note that MTP layer has the same operation and execution order with TransformerLayer regarding - post_attn, moe_dispatch, mlp, moe_combine, but contains extra operations in attn and - mtp_post_process: + moe_dispatch, mlp, moe_combine, but contains extra operations in attn and mtp_post_process: * mtp.attn wraps around transformer_layer.attn with extra norm, proj and embedding operations. * mtp.mtp_post_process contains output_layer, mtp loss operations, whereas transformer_layer.mtp_post_process is empty. """ attn = None - post_attn = None moe_dispatch = None mlp = None moe_combine = None @@ -117,7 +115,7 @@ def release_state(self): def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): """ Builds the callable nodes for the transformer/mtp layer: - attn, post_attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. + attn, mlp, moe_dispatch and moe_combine, and mtp_post_process. """ from megatron.core.models.gpt.fine_grained_callables import ( TransformerLayerNode, @@ -137,16 +135,7 @@ def _build_callable_nodes(self, event, comp_stream, comm_stream, extra_args): else isinstance(self.layer.mlp, MoELayer) ) - enable_deepep = ( - self.layer.config.moe_token_dispatcher_type == "flex" - and self.layer.config.moe_flex_dispatcher_backend == "deepep" - ) - enable_hybridep = ( - self.layer.config.moe_token_dispatcher_type == "flex" - and self.layer.config.moe_flex_dispatcher_backend == "hybridep" - ) - extra_args["enable_deepep"] = enable_deepep - extra_args["enable_hybridep"] = enable_hybridep + extra_args["config"] = self.layer.config extra_args["is_moe"] = is_moe extra_args["delay_wgrad_compute"] = self.layer.config.delay_wgrad_compute extra_args["is_mtp"] = is_mtp @@ -167,7 +156,6 @@ def create_node(stream, module, name): ( attn_module, - post_attn_module, moe_dispatch_module, mlp_module, moe_combine_module, @@ -179,11 +167,9 @@ def create_node(stream, module, name): self.attn = create_node(comp_stream, attn_module, "attn") self.mlp = create_node(comp_stream, mlp_module, "mlp") if is_moe: - self.post_attn = create_node(comp_stream, post_attn_module, "post_attn") self.moe_dispatch = create_node(comm_stream, moe_dispatch_module, "moe_dispatch") self.moe_combine = create_node(comm_stream, moe_combine_module, "moe_combine") else: - self.post_attn = NoopScheduleNode() self.moe_dispatch = NoopScheduleNode() self.moe_combine = NoopScheduleNode() @@ -194,6 +180,11 @@ def create_node(stream, module, name): else: self.mtp_post_process = NoopScheduleNode() + # mlp and combine may receive dgrad from attn, which is managed by cuda graph. + if CudaGraphScope.attn in self.config.cuda_graph_scope: + self.mlp.manual_grads_release = False + self.moe_combine.manual_grads_release = False + def get_fp8_context(self): """ Get the fp8 context for the transformer layer. @@ -216,8 +207,8 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) to maximize parallelism and efficiency. When f_layer and b_layer are not None, forward and backward pass are overlapped as follows: - comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd - comp_stream: attn_fwd->post_attn_fwd| mlp_bwd->mlp_bwd_dw->mlp_fwd| post_attn_bwd->attn_bwd + comm_stream: combine_bwd | dispatch_fwd->dispatch_bwd | combine_fwd + comp_stream: attn_fwd | mlp_bwd->mlp_bwd_dw->mlp_fwd| attn_bwd For MTP, mtp_post_process_fwd is executed after the combine_fwd in the comp_stream, and mtp_post_process_bwd is executed before the combine_bwd in the comp_stream. @@ -240,7 +231,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) if f_layer is not None: with f_layer.get_fp8_context(): f_input = f_layer.attn.forward(f_input) - f_input = f_layer.post_attn.forward(f_input) if b_layer is not None: b_grad = b_layer.mlp.backward(b_grad) @@ -254,7 +244,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) b_grad = b_layer.moe_dispatch.backward(b_grad) if b_layer is not None and b_layer.config.ep_overlap_early_attn_memory_release: - b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) if f_layer is not None: @@ -267,7 +256,6 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False) f_input = f_layer.mtp_post_process.forward(f_input) if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release: - b_grad = b_layer.post_attn.backward(b_grad) b_grad = b_layer.attn.backward(b_grad) # Delay the last attn_dw in backward pass (attn_dw of the first layer) @@ -371,6 +359,10 @@ def __init__( model, self._model_chunk_state, self._event, comp_stream ) + # preprocess may receive dgrad from attn, which is managed by cuda graph. + if CudaGraphScope.attn in model.config.cuda_graph_scope: + self.pre_process.manual_grads_release = False + def _build_layer_schedule_plan(self, module, comp_stream, comm_stream): if module is None: return diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 5913dfaba33..b4879cd1e13 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -6,14 +6,17 @@ from typing import Optional import torch +from torch import Tensor from megatron.core import tensor_parallel +from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( fine_grained_offloading_group_commit, fine_grained_offloading_group_start, get_fine_grained_offloading_context, ) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.module import float16_to_fp32 from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.multi_token_prediction import ( @@ -42,14 +45,13 @@ def wrapped_func(*args, **kwarg): @internal_api -def should_free_input(name, is_moe, enable_deepep, enable_hybridep): +def should_free_input(name, is_moe, config): """Determine if the node should free its input memory. Args: name: Node name is_moe: Whether it's a MoE model - enable_deepep: Whether to use DeepEP dispatcher - enable_hybridep: Whether to use HybridEP dispatcher + config: TransformerConfig object Returns: bool: Whether to free input memory @@ -57,6 +59,14 @@ def should_free_input(name, is_moe, enable_deepep, enable_hybridep): # For dense layers [attn, fake, mlp, fake], the input is needed during backward pass if not is_moe: return False + enable_deepep = ( + config.moe_token_dispatcher_type == "flex" + and config.moe_flex_dispatcher_backend == "deepep" + ) + enable_hybridep = ( + config.moe_token_dispatcher_type == "flex" + and config.moe_flex_dispatcher_backend == "hybridep" + ) # Define which nodes should free input memory # Since we split the computing graph into multiple nodes, we can manually control # when and how to free the input memory. @@ -69,7 +79,10 @@ def should_free_input(name, is_moe, enable_deepep, enable_hybridep): # and probs before dispatch A2A and it's not needed anymore after the forward pass # For DeepEP and HybridEP dispatcher mode, they are both needed in backward pass # and cannot be freed. - "moe_dispatch": not (enable_deepep or enable_hybridep), + # If moe_preprocess is in cuda graph scope, tokens and probs are fixed size tensors, + # so they cannot be freed. + "moe_dispatch": not (enable_deepep or enable_hybridep) + and (CudaGraphScope.moe_preprocess not in config.cuda_graph_scope), } return free_input_nodes.get(name, False) @@ -239,13 +252,13 @@ def __init__( it's the per_batch_state_context, o.w. nullcontext name (str): Node name, also used to determine memory strategy bwd_dw_callables (list): List of weight gradient functions for the layer. - extra_args (dict): Extra arguments for nodes: is_moe, enable_deepep, enable_hybridep. + extra_args (dict): Extra arguments for the node: is_moe, config. """ # determine whether to free input memory + config = extra_args.get("config", None) + assert config is not None, "model config must be passed to TransformerLayerNode." is_moe = extra_args.get("is_moe", False) - enable_deepep = extra_args.get("enable_deepep", False) - enable_hybridep = extra_args.get("enable_hybridep", False) - free_input = should_free_input(name, is_moe, enable_deepep, enable_hybridep) + free_input = should_free_input(name, is_moe, config) self.delay_wgrad_compute = extra_args.get("delay_wgrad_compute", False) super().__init__( @@ -310,8 +323,8 @@ def backward_dw(self): module.backward_dw() # the output grad memory is last used in wgrad compute, should be safe to release. - assert self.delay_grads_release, "output grad memory should be valid before wgrad." - if self.manual_release_grads: + if self.manual_grads_release: + assert self.delay_grads_release, "output grad memory should be valid before wgrad." for tensor in self.output_grads: tensor.untyped_storage().resize_(0) self.output_grads = None @@ -364,11 +377,101 @@ def build_transformer_layer_callables(layer: TransformerLayer): and layer.config.moe_flex_dispatcher_backend == "hybridep" ) + class _BackwardDWWrapper: + def __init__(self): + self.graphed_backward_dw_callable = None + self.attn_dw_callable = layer.self_attention.backward_dw + if isinstance(layer.mlp, MoELayer): + self.shared_expert_dw_callable = partial( + layer.mlp.backward_dw, routed_experts=False, shared_experts=True + ) + else: + self.shared_expert_dw_callable = None + self.cuda_graph_scope = layer.config.cuda_graph_scope + + def set_graphed_backward_dw_callable(self, graphed_backward_dw_callable): + """Store the CUDA graphed backward weight gradient callable.""" + self.graphed_backward_dw_callable = graphed_backward_dw_callable + + def backward_dw(self): + """Execute weight gradients, skipping CUDA graphed components during replay.""" + is_replay = hasattr(layer, 'cuda_graphs') and layer.cuda_graphs + if self.shared_expert_dw_callable is not None and ( + not is_replay or CudaGraphScope.moe_router not in self.cuda_graph_scope + ): + self.shared_expert_dw_callable() + if not is_replay or CudaGraphScope.attn not in self.cuda_graph_scope: + self.attn_dw_callable() + if is_replay and self.graphed_backward_dw_callable is not None: + self.graphed_backward_dw_callable() + + attn_backward_dw_wrapper = _BackwardDWWrapper() + def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): """ - Performs same attnention forward logic as GPT Model. + Performs same attnention forward logic as GPT Model and forward pass for + computations between attention and dispatch: + pre mlp layernorm->router->dispatch preprocess """ - hidden_states, _ = layer._forward_attention( + + if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: + assert ( + CudaGraphScope.mlp not in layer.config.cuda_graph_scope + and CudaGraphScope.moe not in layer.config.cuda_graph_scope + ), ( + "Supported CUDA graph scope with EP overlap: " + "attn, moe_router, moe_preprocess, mlp, got {}".format( + layer.config.cuda_graph_scope + ) + ) + forward_func = layer._te_cuda_graph_replay + attn_backward_dw_wrapper.set_graphed_backward_dw_callable( + partial(layer.backward_dw_cudagraph, layer.current_microbatch) + ) + else: + # wrapper function that keeps consistent api with cuda graph replay + def forward_func( + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + rotary_pos_emb: Optional[Tensor] = None, + rotary_pos_cos: Optional[Tensor] = None, + rotary_pos_sin: Optional[Tensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + sequence_len_offset: Optional[Tensor] = None, + ): + hidden_states, _ = layer._forward_attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) + if not isinstance(layer.mlp, MoELayer): + return hidden_states, None, None, None + if layer.offload_mlp_norm: + hidden_states = fine_grained_offloading_group_start( + hidden_states, name="mlp_norm" + ) + if layer.recompute_pre_mlp_layernorm: + layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( + layer.pre_mlp_layernorm, hidden_states + ) + else: + with get_fine_grained_offloading_context(layer.offload_mlp_norm): + pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) + + shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) + probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) + local_tokens, probs, _ = layer.mlp.preprocess( + pre_mlp_layernorm_output, probs, routing_map + ) + return hidden_states, local_tokens, probs, shared_expert_output + + hidden_states, local_tokens, probs, shared_expert_output = forward_func( hidden_states=hidden_states, attention_mask=node.chunk_state.attention_mask, rotary_pos_emb=node.chunk_state.rotary_pos_emb, @@ -377,33 +480,14 @@ def submodule_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): packed_seq_params=node.chunk_state.packed_seq_params, sequence_len_offset=node.chunk_state.sequence_len_offset, ) - return hidden_states - - def submodule_post_attn_forward(node: ScheduleNode, hidden_states: torch.Tensor): - """ - Run forward pass for computations between attention and dispatch: - pre mlp layernorm->router->dispatch preprocess - """ - if layer.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") - if layer.recompute_pre_mlp_layernorm: - layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( - layer.pre_mlp_layernorm, hidden_states - ) - else: - with get_fine_grained_offloading_context(layer.offload_mlp_norm): - pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) - - probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) - local_tokens, probs, _ = layer.mlp.preprocess(pre_mlp_layernorm_output, probs, routing_map) + if not isinstance(layer.mlp, MoELayer): + return hidden_states # Detach here for mlp_bda residual connection node.layer_state.residual = node.detach(hidden_states) if layer.mlp.use_shared_expert and not layer.mlp.shared_expert_overlap: - # Detach here for shared expert connection - node.layer_state.pre_mlp_layernorm_output = node.detach(pre_mlp_layernorm_output) + # Detach here for shared expert connection in moe_combine + node.layer_state.shared_expert_output = node.detach(shared_expert_output) return local_tokens, probs @@ -428,7 +512,6 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): Run forward pass for computations between dispatch and combine: post dispatch->experts->combine preprocess """ - shared_expert_output = None dispatched_probs = node.layer_state.dispatched_probs token_dispatcher = layer.mlp.token_dispatcher if enable_deepep or enable_hybridep: @@ -436,10 +519,8 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # backward graph from connecting to dispatch submodule token_dispatcher._comm_manager.dispatched_probs = dispatched_probs - pre_mlp_layernorm_output = getattr(node.layer_state, 'pre_mlp_layernorm_output', None) - shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) - expert_output, mlp_bias = layer.mlp.routed_experts_compute( - dispatched_tokens, dispatched_probs, pre_mlp_layernorm_output + expert_output, _ = layer.mlp.routed_experts_compute( + dispatched_tokens, dispatched_probs, None ) if layer.recompute_pre_mlp_layernorm: @@ -449,16 +530,10 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # release tensor reference after use node.layer_state.dispatched_probs = None node.layer_state.pre_mlp_layernorm_output = None - if shared_expert_output is None: - # Return only expert_output, since shared_expert_output causes backward on None - return expert_output - return expert_output, shared_expert_output - - def submodule_combine_forward( - node: ScheduleNode, - output: torch.Tensor, - shared_expert_output: Optional[torch.Tensor] = None, - ): + + return expert_output + + def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor): """ # Triggers token combine and the remaining computation in the transformer layer. # The `mlp_bda` computation is placed after `mlp.combine` due to data dependency. @@ -468,10 +543,11 @@ def submodule_combine_forward( # with another microbatch's computation and expose the communication. """ residual = node.layer_state.residual - + shared_expert_output = getattr(node.layer_state, 'shared_expert_output', None) output = layer.mlp.combine(output, shared_expert_output) mlp_output_with_bias = (output, None) - + if hasattr(layer, 'cuda_graphs') and layer.cuda_graphs: + layer.mlp.cudagraph_tensor_store.clear() with layer.bias_dropout_add_exec_handler(): hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout @@ -507,13 +583,12 @@ def raise_not_implemented(*args): # Build forward and backward callable functions attn_func = submodule_attn_forward - post_attn_func = submodule_post_attn_forward if is_moe else raise_not_implemented dispatch_func = submodule_dispatch_forward if is_moe else raise_not_implemented mlp_func = submodule_moe_forward if is_moe else mlp_wrapper combine_func = submodule_combine_forward if is_moe else raise_not_implemented - forward_funcs = [attn_func, post_attn_func, dispatch_func, mlp_func, combine_func, None] - backward_dw = {"attn": layer.self_attention, "mlp": layer.mlp} + forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, None] + backward_dw = {"attn": attn_backward_dw_wrapper, "mlp": layer.mlp} return forward_funcs, backward_dw @@ -525,9 +600,7 @@ def build_mtp_layer_callables(layer): """ forward_funcs, backward_dw = build_transformer_layer_callables(layer.transformer_layer) - attn_forward, post_attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = ( - forward_funcs - ) + attn_forward, dispatch_forward, mlp_forward, combine_forward, _ = forward_funcs is_moe = isinstance(layer.transformer_layer.mlp, MoELayer) assert is_moe, "MTP layer in a2a overlap only supports MoE layer for now." @@ -588,24 +661,17 @@ def rng_context_wrapper(func, *args, **kwargs): # Build forward and backward callable functions # attn_forward already has rng context, no need to wrap attn_func = submodule_mtp_attn_forward - post_attn_func = partial(rng_context_wrapper, post_attn_forward) dispatch_func = partial(rng_context_wrapper, dispatch_forward) mlp_func = partial(rng_context_wrapper, mlp_forward) combine_func = partial(rng_context_wrapper, combine_forward) mtp_post_process_func = submodule_mtp_postprocess_forward - forward_funcs = [ - attn_func, - post_attn_func, - dispatch_func, - mlp_func, - combine_func, - mtp_post_process_func, - ] - backward_dw = { - "attn": [layer.transformer_layer.self_attention, layer.eh_proj], - "mlp": layer.transformer_layer.mlp, - } + forward_funcs = [attn_func, dispatch_func, mlp_func, combine_func, mtp_post_process_func] + if isinstance(backward_dw["attn"], list): + backward_dw["attn"].append(layer.eh_proj) + else: + backward_dw["attn"] = [backward_dw["attn"], layer.eh_proj] + return forward_funcs, backward_dw diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index a35ccac504a..9dc79ed11f7 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -2,6 +2,7 @@ import contextlib from functools import partial +from itertools import zip_longest from typing import Callable, Iterator, List, Optional, Union import torch @@ -845,6 +846,110 @@ def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, s return order +def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chunk, capture_wgrad_graph): + """ + This functions gets the order for overlap_moe_expert_parallel_comm schedule for the original + chunk-wise order list. Each chunk is transformered to chunks with only 1 layer so that + layers between 2 chunks can now overlap with each other while following the graph order. + If capture_wgrad_graph is True, the wgrad backward graph is also added to the order by + decreasing the layer id by 0.5. + + Args: + order (List[int]): The original chunk-wise order list. Positive values represent forward + passes for chunks, negative values represent backward passes. The absolute value + indicates the chunk ID (1-indexed). + num_layers_per_chunk (List[int]): Number of graphable layers in each chunk. The length + of this list equals the number of chunks. + capture_wgrad_graph (bool): If True, weight gradient computation graphs are added to the + order by appending entries with layer_id - 0.5. + + Returns: + Tuple[List[float], List[Optional[List[int]]]]: A tuple containing: + - new_order: The layer-wise order list where each chunk is expanded to individual + layers. Positive values are forward passes, negative values are backward passes. + Values with .5 suffix indicate weight gradient computations. + - chunk_id_list: A list parallel to new_order. For forward passes, contains + [chunk_id, layer_index_within_chunk]. For backward passes, contains None. + + Example: + original_order: [1, 2, -2, 1, -1, -1] + num_layers_per_chunk: [1, 2] + capture_wgrad_graph=True: + new_order: [1, 2, 3, 1, -3, -3.5, -2, -2.5, -1, -1.5, -1, -1.5] + chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, + None, None, None, None, None, None, None] + capture_wgrad_graph=False: + new_order: [1, 2, 3, 1, -3, -2, -1, -1] + chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, None, None, None] + """ + + def _add_order(new_order, chunk_id_list, c_id, layer_id, is_wgrad=False, index=None): + if is_wgrad: + new_order.append(layer_id - 0.5) + else: + new_order.append(layer_id) + if c_id > 0: + chunk_id_list.append([abs(c_id) - 1, index]) + else: + chunk_id_list.append(None) + + new_order = [] + chunk_id_list = [] + add_order = partial(_add_order, new_order, chunk_id_list) + first_backward_idx, last_forward_idx = None, None + for idx, c_id in enumerate(order): + if first_backward_idx is None and c_id < 0: + first_backward_idx = idx + if c_id > 0: + last_forward_idx = idx + + def get_layer_range(c_id): + num_layers = num_layers_per_chunk[abs(c_id) - 1] + num_layers_previous_chunks = sum(num_layers_per_chunk[: abs(c_id) - 1]) + if c_id > 0: + return list( + range(num_layers_previous_chunks + 1, num_layers_previous_chunks + num_layers + 1) + ) + return list(range(-num_layers_previous_chunks - num_layers, -num_layers_previous_chunks)) + + # warmup stage + for c_id in order[:first_backward_idx]: + layer_range = get_layer_range(c_id) + new_order += layer_range + chunk_id_list.extend([abs(c_id) - 1, i] for i in range(len(layer_range))) + + # 1f1b overlap stage + if first_backward_idx < last_forward_idx: + for c_id_b, c_id_f in zip( + order[first_backward_idx : last_forward_idx + 1 : 2], + order[first_backward_idx + 1 : last_forward_idx + 1 : 2], + ): + layer_range_f = get_layer_range(c_id_f) + layer_range_b = get_layer_range(c_id_b) + index = 0 + for l_b, l_f in zip_longest(layer_range_b, layer_range_f, fillvalue=0): + # always forward graph before backward graph + if l_f != 0: + add_order(c_id_f, l_f, index=index) + if l_b != 0: + add_order(c_id_b, l_b) + if capture_wgrad_graph and index < len(layer_range_b) - 1: + add_order(c_id_b, l_b, is_wgrad=True) + index += 1 + # last wgrad backward + if capture_wgrad_graph and layer_range_b: + add_order(c_id_b, layer_range_b[-1], is_wgrad=True) + + # cool down stage, backward graphs only + for c_id in order[last_forward_idx + 1 :]: + for l_b in get_layer_range(c_id): + add_order(c_id, l_b) + if capture_wgrad_graph: + add_order(c_id, l_b, is_wgrad=True) + + return new_order, chunk_id_list + + def forward_backward_pipelining_with_interleaving( *, forward_step_func, diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index e7e416f99bd..d38f6d702c0 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -182,8 +182,8 @@ def __init__( self.free_input = free_input self.inputs = None self.outputs = None + self.manual_grads_release = False self.delay_grads_release = False - self.manual_release_grads = False def default_backward_func(self, outputs, output_grad): """Default backward function""" @@ -269,7 +269,7 @@ def _backward(self, *output_grad): # to avoid delayed garbage collection. If # delay_grads_release is True, dgrad is last used in # wgrad compute and skip the release here. - if self.manual_release_grads and not self.delay_grads_release: + if self.manual_grads_release and not self.delay_grads_release: g.untyped_storage().resize_(0) grads = self.get_grad() diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index 27e6c65c738..b566c1830dc 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -10,6 +10,7 @@ from contextlib import nullcontext from dataclasses import fields, is_dataclass from enum import Enum +from math import ceil from typing import Any, Dict, List, Optional import torch @@ -1510,7 +1511,7 @@ def graphs_created(self): """ return self._graphs_created - def _get_sample_arguments(self, order): + def _get_sample_arguments(self, order, chunk_id_list=None): """ Generate sample arguments and keyword arguments for CUDA Graph capturing with memory-optimized buffer reuse. @@ -1539,6 +1540,9 @@ def _get_sample_arguments(self, order): order (List[int]): The forward/backward execution order from convert_schedule_table_to_order(). Positive integers represent forward passes (1-indexed chunk ID), negative integers represent backward passes. + chunk_id_list (List[Tuple[int, int]]): The list of chunk IDs and layer IDs in the + order. This is useful only when overlap_moe_expert_parallel_comm is enabled, + the order maps each layers' idx to their original chunk id. Returns: Tuple[List[Tuple], List[Dict]]: A tuple containing: @@ -1560,9 +1564,11 @@ def _get_sample_arguments(self, order): assert self.num_model_chunks == max( order ), "num_model_chunks must match the max chunk id in order." - assert ( - self.num_microbatches == len(order) // self.num_model_chunks // 2 - ), "num_microbatches must match the number of microbatches in order." + if chunk_id_list is None: + # check only if 1f1b overlap is disabled. + assert ( + self.num_microbatches == len(order) // self.num_model_chunks // 2 + ), "num_microbatches must match the number of microbatches in order." # Generate sample arguments and keyword arguments for capturing. sample_args = [None] * (len(self.flattened_callables) * self.num_microbatches) @@ -1645,8 +1651,8 @@ def get_rotary_pos_emb(transformer_module, transformer_input): consumed_sample_queue = {} layer_sample_keys_cache = {} fwd_idx = [0] * self.num_model_chunks - for chunk_id in order: - model_chunk_idx = abs(chunk_id) - 1 + for idx, chunk_id in enumerate(order): + model_chunk_idx = abs(ceil(chunk_id)) - 1 if chunk_id > 0: if model_chunk_idx not in fwd_sample_queues: @@ -1655,7 +1661,14 @@ def get_rotary_pos_emb(transformer_module, transformer_input): sample_start_idx = (prefix_num_layers[model_chunk_idx] * self.num_microbatches) + ( fwd_idx[model_chunk_idx] * self.num_layers_per_chunk[model_chunk_idx] ) - for layer_idx, layer in enumerate(self.callables_per_chunk[model_chunk_idx]): + if chunk_id_list: + model_chunk_idx = chunk_id_list[idx][0] + callables_curr_chunk = [ + self.callables_per_chunk[model_chunk_idx][chunk_id_list[idx][1]] + ] + else: + callables_curr_chunk = self.callables_per_chunk[model_chunk_idx] + for layer_idx, layer in enumerate(callables_curr_chunk): per_callable_fwd_idx = sample_start_idx + layer_idx # Get sample_args and sample_kwargs for index per_callable_fwd_idx. @@ -1692,7 +1705,7 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # reuse the static inputs of a previous forward pass for this forward pass. # If not, we still need to generate the new static inputs. sample_keys = layer_sample_keys_cache[id(layer)] - + model_chunk_idx = abs(chunk_id) - 1 fwd_sample_queues[model_chunk_idx].append((sample_keys, per_callable_fwd_idx)) if consumed_sample_queue.get(sample_keys, []): # We can reuse the static inputs of a previous forward pass for this @@ -1714,13 +1727,16 @@ def get_rotary_pos_emb(transformer_module, transformer_input): # Unfortunately, no previous static inputs are available for reuse, # sample_args is still None. Last attempt: generate the new static inputs # for this forward pass. + if chunk_id_list: + model_chunk_idx = chunk_id_list[idx][0] sample_args[per_callable_fwd_idx], sample_kwargs[per_callable_fwd_idx] = ( _get_layer_static_inputs( layer, self.chunks_with_decoder[model_chunk_idx] ) ) + model_chunk_idx = abs(chunk_id) - 1 fwd_idx[model_chunk_idx] += 1 - else: + elif ceil(chunk_id) == chunk_id: num_consumed_samples = min( len(fwd_sample_queues[model_chunk_idx]), self.num_layers_per_chunk[model_chunk_idx], @@ -1734,6 +1750,9 @@ def get_rotary_pos_emb(transformer_module, transformer_input): fwd_sample_queues[model_chunk_idx] = fwd_sample_queues[model_chunk_idx][ num_consumed_samples: ] + else: + # skip register static inputs for wgrad backward graphs + continue return sample_args, sample_kwargs @@ -1746,12 +1765,16 @@ def _get_cuda_graph_input_data(self): # Get the PP and VPP scheduling order. from megatron.core.pipeline_parallel.schedules import ( convert_schedule_table_to_order, + get_overlap_moe_expert_parallel_comm_order, get_pp_rank_microbatches, get_schedule_table, ) # If PP is not enabled, we only need to capture one microbatch. - if parallel_state.get_pipeline_model_parallel_world_size() == 1: + if ( + parallel_state.get_pipeline_model_parallel_world_size() == 1 + and not self.config.overlap_moe_expert_parallel_comm + ): assert ( self.num_model_chunks == 1 ), "If PP is not enabled, there should be only one model chunk." @@ -1780,9 +1803,36 @@ def _get_cuda_graph_input_data(self): level=logging.DEBUG, msg=f'Rank {torch.distributed.get_rank()}: ORDER {order}', ) + chunk_id_list = None + if self.config.overlap_moe_expert_parallel_comm: + wgrad_in_graph_scope = CudaGraphScope.attn in self.config.cuda_graph_scope or ( + CudaGraphScope.moe_router in self.config.cuda_graph_scope + and self.config.moe_shared_expert_intermediate_size is not None + and not self.config.moe_shared_expert_overlap + ) + capture_wgrad_graph = self.config.delay_wgrad_compute and wgrad_in_graph_scope + order, chunk_id_list = get_overlap_moe_expert_parallel_comm_order( + order, self.num_layers_per_chunk, capture_wgrad_graph + ) + self.num_layers_per_chunk = [1] * sum(self.num_layers_per_chunk) + self.num_model_chunks = max(order) + _order_without_wgrad = [] + for c_id in order: + if ceil(c_id) != c_id: + continue + _order_without_wgrad.append(c_id) + self.num_microbatches = len(_order_without_wgrad) // self.num_model_chunks // 2 + log_on_each_pipeline_stage( + logger=logger, + tp_group=None, + dp_cp_group=None, + level=logging.DEBUG, + msg=f'Rank {torch.distributed.get_rank()}: ' + f'ORDER after overlap_moe_expert_parallel_comm {order}', + ) # Generate sample arguments and keyword arguments for capturing. - sample_args, sample_kwargs = self._get_sample_arguments(order) + sample_args, sample_kwargs = self._get_sample_arguments(order, chunk_id_list) def get_make_graphed_callables_kwargs(): kwargs = {'allow_unused_input': True, '_order': order} @@ -1920,13 +1970,17 @@ def create_cudagraphs(self): for layer_number, layer in enumerate(layers): layer.cuda_graphs = [] for batch_number in range(self.num_microbatches): - layer.cuda_graphs.append( - graphs[ + if self.config.overlap_moe_expert_parallel_comm: + graph_idx = ( + num_layers_accumulated + layer_number + ) * self.num_microbatches + batch_number + else: + graph_idx = ( num_layers_accumulated * self.num_microbatches + batch_number * len(layers) + layer_number - ] - ) + ) + layer.cuda_graphs.append(graphs[graph_idx]) num_layers_accumulated += len(layers) self._finish_capturing(start_time) diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index 3742d064508..e44d8647bd6 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -377,10 +377,11 @@ def custom_forward(hidden_states, padding_mask=None): return outputs - def backward_dw(self): + def backward_dw(self, routed_experts: bool = True, shared_experts: bool = False): """Compute weight gradients for experts and shared experts.""" - self.experts.backward_dw() - if self.use_shared_expert and not self.shared_expert_overlap: + if routed_experts: + self.experts.backward_dw() + if shared_experts and self.use_shared_expert and not self.shared_expert_overlap: self.shared_experts.backward_dw() def set_for_recompute_pre_mlp_layernorm(self): diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 365c7a265eb..3a57f09f6cf 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1869,6 +1869,16 @@ def __post_init__(self): 'when enabling overlap_moe_expert_parallel_comm with MTP layer.' ) + if self.cuda_graph_impl != "none": + assert ( + self.cuda_graph_impl == "transformer_engine" + and CudaGraphScope.moe not in self.cuda_graph_scope + and CudaGraphScope.mlp not in self.cuda_graph_scope + ), ( + 'CUDA graph scope on moe and mlp is not ' + 'supported with overlap_moe_expert_parallel_comm' + ) + # Check delay_wgrad_compute compatibility if self.delay_wgrad_compute: assert ( @@ -1877,6 +1887,11 @@ def __post_init__(self): assert ( not self.moe_use_legacy_grouped_gemm ), 'delay_wgrad_compute is not supported with legacy groupedgemm implementation' + if self.cuda_graph_impl == "transformer_engine": + assert is_te_min_version("2.10.0"), ( + 'TE version >= 2.10.0 is required for delay_wgrad_compute with ' + 'partial cuda graph' + ) if self.ep_overlap_early_attn_memory_release: assert self.overlap_moe_expert_parallel_comm, ( diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 5c310cc81e4..53a1470c492 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -888,6 +888,10 @@ def _te_cuda_graph_replay(self, *args, **kwargs): # CUDA Graph captures the whole MLP/MoE part. CUDA Graph output is the layer output. assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." output = cuda_graph_output.pop() + assert ( + not self.config.overlap_moe_expert_parallel_comm + ), "EP overlap must be \ + disabled when CUDA graph captures the whole MLP/MoE part." elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. @@ -930,12 +934,35 @@ def _te_cuda_graph_replay(self, *args, **kwargs): residual=residual, shared_expert_output=shared_expert_output, ) + # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables + # and should be skipped here. + if self.config.overlap_moe_expert_parallel_comm: + probs, routing_map = self.mlp.route(hidden_states) + hidden_states, probs, residual = self.mlp.preprocess( + hidden_states, probs, routing_map + ) + nvtx_range_pop(suffix="mlp") + return mlp_residual, hidden_states, probs, shared_expert_output mlp_output_with_bias = self.mlp(hidden_states) self.mlp.cudagraph_tensor_store.clear() nvtx_range_pop(suffix="mlp") output = self._forward_post_mlp(mlp_output_with_bias, mlp_residual) else: + # If EP overlap is enabled, needs to return same outputs as submodule.attn + if self.config.overlap_moe_expert_parallel_comm: + assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." + mlp_residual = cuda_graph_output.pop() + if not self.is_moe_layer: + return mlp_residual, None, None, None + hidden_states = self.pre_mlp_layernorm(mlp_residual) + shared_expert_output = self.mlp.shared_experts_compute(hidden_states) + probs, routing_map = self.mlp.route(hidden_states) + hidden_states, probs, residual = self.mlp.preprocess( + hidden_states, probs, routing_map + ) + return mlp_residual, hidden_states, probs, shared_expert_output + # CUDA Graph does not capture the MLP/MoE part at all. output = self._forward_mlp(*cuda_graph_output) return output, context @@ -1023,6 +1050,15 @@ def _should_call_local_cudagraph(self, *args, **kwargs): return True return False + def backward_dw_cudagraph(self, microbatch_idx): + """ + CUDA Graph backward weight gradient computation for this layer. + """ + cg_index = microbatch_idx % len(self.cuda_graphs) + if not hasattr(self.cuda_graphs[cg_index], 'backward_dw'): + return + self.cuda_graphs[cg_index].backward_dw() + def __call__(self, *args, **kwargs): if self._should_call_local_cudagraph(*args, **kwargs): # Inference mode. diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json index e7da3fb2265..51e9d7154c9 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.06693, "2": 11.0602, - "3": 10.21183, - "4": 9.95418, - "5": 10.12235, - "6": 8.8232, - "7": 9.52776, - "8": 8.44297, - "9": 7.84862, - "10": 7.0731, - "11": 9.29877, - "12": 9.14048, - "13": 7.86753, - "14": 8.20366, - "15": 8.2163, - "16": 8.17366, - "17": 8.20571, - "18": 7.48715, - "19": 8.08859, - "20": 7.6351, - "21": 7.94948, - "22": 7.29052, - "23": 7.93234, - "24": 7.43607, - "25": 8.23632, - "26": 7.75037, - "27": 7.69922, - "28": 7.65432, - "29": 7.75197, - "30": 7.56043, - "31": 7.81763, - "32": 6.46365, - "33": 7.20218, - "34": 7.7734, - "35": 7.72752, - "36": 6.71703, - "37": 8.09101, - "38": 7.61439, - "39": 7.96641, - "40": 7.49902, - "41": 7.49619, - "42": 6.10035, - "43": 7.59169, - "44": 7.9135, - "45": 6.83091, - "46": 7.40862, - "47": 7.78798, - "48": 7.87259, - "49": 7.58321, - "50": 6.84073 + "3": 10.21167, + "4": 9.95277, + "5": 10.12388, + "6": 8.82369, + "7": 9.52785, + "8": 8.44289, + "9": 7.85041, + "10": 7.07093, + "11": 9.28562, + "12": 9.13324, + "13": 7.86224, + "14": 8.19705, + "15": 8.22932, + "16": 8.17783, + "17": 8.2161, + "18": 7.50358, + "19": 8.08893, + "20": 7.64905, + "21": 7.95183, + "22": 7.29849, + "23": 7.93348, + "24": 7.43565, + "25": 8.2385, + "26": 7.75634, + "27": 7.70075, + "28": 7.66089, + "29": 7.75606, + "30": 7.56072, + "31": 7.81859, + "32": 6.46861, + "33": 7.20532, + "34": 7.77706, + "35": 7.73113, + "36": 6.72448, + "37": 8.09344, + "38": 7.62008, + "39": 7.96872, + "40": 7.4992, + "41": 7.49916, + "42": 6.11993, + "43": 7.59389, + "44": 7.91482, + "45": 6.83633, + "46": 7.41335, + "47": 7.78887, + "48": 7.87666, + "49": 7.58746, + "50": 6.84352 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 47165232.0, - "2": 46897932.0, - "3": 49538636.0, - "4": 293970432.0, - "5": 569239168.0, - "6": 649282112.0, - "7": 1024299712.0, - "8": 745969216.0, - "9": 849837376.0, - "10": 671136704.0, - "11": 820579712.0, - "12": 808020608.0, - "13": 642603904.0, - "14": 628553728.0, - "15": 703673088.0, - "16": 861425280.0, - "17": 658078464.0, - "18": 805612544.0, - "19": 902126016.0, - "20": 890704960.0, - "21": 670006528.0, - "22": 761263488.0, - "23": 761663488.0, - "24": 767542784.0, - "25": 638744256.0, - "26": 742320640.0, - "27": 745099136.0, - "28": 720589184.0, - "29": 751754368.0, - "30": 742684032.0, - "31": 656692864.0, - "32": 790831616.0, - "33": 789798208.0, - "34": 780255872.0, - "35": 776100992.0, - "36": 736753344.0, - "37": 740480640.0, - "38": 715119872.0, - "39": 739264064.0, - "40": 723054656.0, - "41": 698221312.0, - "42": 667945792.0, - "43": 654024448.0, - "44": 651974656.0, - "45": 625754432.0, - "46": 616508224.0, - "47": 607837184.0, - "48": 581971328.0, - "49": 562630912.0, - "50": 544389376.0 + "1": 47165160.0, + "2": 46897928.0, + "3": 52684380.0, + "4": 297108064.0, + "5": 556667648.0, + "6": 661861120.0, + "7": 1027446592.0, + "8": 742822528.0, + "9": 846651648.0, + "10": 693167680.0, + "11": 826875520.0, + "12": 814304768.0, + "13": 642608768.0, + "14": 606554752.0, + "15": 728814528.0, + "16": 845696384.0, + "17": 667529728.0, + "18": 673504384.0, + "19": 889544960.0, + "20": 890696768.0, + "21": 676302464.0, + "22": 688965120.0, + "23": 789972480.0, + "24": 761249536.0, + "25": 648185280.0, + "26": 789507392.0, + "27": 641355648.0, + "28": 805511168.0, + "29": 773780224.0, + "30": 811888960.0, + "31": 688167744.0, + "32": 834871424.0, + "33": 792944256.0, + "34": 777109568.0, + "35": 763515136.0, + "36": 733607744.0, + "37": 743626240.0, + "38": 746577024.0, + "39": 732972864.0, + "40": 735645696.0, + "41": 556711680.0, + "42": 680528384.0, + "43": 669752960.0, + "44": 667702912.0, + "45": 635197248.0, + "46": 629093120.0, + "47": 626713344.0, + "48": 600843456.0, + "49": 581506752.0, + "50": 572705728.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5249984000.0, - "2": 5250123264.0, - "3": 5250260480.0, - "4": 5249986048.0, - "5": 5250123264.0, - "6": 5250260480.0, - "7": 5250397696.0, - "8": 5250534912.0, - "9": 5250672128.0, - "10": 5250809344.0, - "11": 5250946560.0, - "12": 5251083776.0, - "13": 5251220992.0, - "14": 5251358208.0, - "15": 5251495424.0, - "16": 5251632640.0, - "17": 5251769856.0, - "18": 5251907072.0, - "19": 5252044288.0, - "20": 5252181504.0, - "21": 5252318720.0, - "22": 5252455936.0, - "23": 5252593152.0, - "24": 5252730368.0, - "25": 5252867584.0, - "26": 5253004800.0, - "27": 5253142016.0, - "28": 5253279232.0, - "29": 5253416448.0, - "30": 5253553664.0, - "31": 5253690880.0, - "32": 5253828096.0, - "33": 5253965312.0, - "34": 5254102528.0, - "35": 5254239744.0, - "36": 5254376960.0, - "37": 5254514176.0, - "38": 5254651392.0, - "39": 5254788608.0, - "40": 5254925824.0, - "41": 5255063040.0, - "42": 5255200256.0, - "43": 5255337472.0, - "44": 5255474688.0, - "45": 5255611904.0, - "46": 5255749120.0, - "47": 5255886336.0, - "48": 5256023552.0, - "49": 5256160768.0, - "50": 5256297984.0 + "1": 5275215360.0, + "2": 5275420160.0, + "3": 5275622912.0, + "4": 5275217408.0, + "5": 5275420160.0, + "6": 5275622912.0, + "7": 5275825664.0, + "8": 5276028416.0, + "9": 5276231168.0, + "10": 5276433920.0, + "11": 5276636672.0, + "12": 5276839424.0, + "13": 5277042176.0, + "14": 5277244928.0, + "15": 5277447680.0, + "16": 5277650432.0, + "17": 5277853184.0, + "18": 5278055936.0, + "19": 5278258688.0, + "20": 5278461440.0, + "21": 5278664192.0, + "22": 5278866944.0, + "23": 5279069696.0, + "24": 5279272448.0, + "25": 5279475200.0, + "26": 5279677952.0, + "27": 5279880704.0, + "28": 5280083456.0, + "29": 5280286208.0, + "30": 5280488960.0, + "31": 5280691712.0, + "32": 5280894464.0, + "33": 5281097216.0, + "34": 5281299968.0, + "35": 5281502720.0, + "36": 5281705472.0, + "37": 5281908224.0, + "38": 5282110976.0, + "39": 5282313728.0, + "40": 5282516480.0, + "41": 5282719232.0, + "42": 5282921984.0, + "43": 5283124736.0, + "44": 5283327488.0, + "45": 5283530240.0, + "46": 5283732992.0, + "47": 5283935744.0, + "48": 5284138496.0, + "49": 5284341248.0, + "50": 5284544000.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 6101398016.0, - "2": 8124549632.0, - "3": 8124549632.0, - "4": 8124549632.0, - "5": 8124549632.0, - "6": 8127293952.0, - "7": 8146633216.0, - "8": 8146633216.0, - "9": 8151443968.0, - "10": 8151443968.0, - "11": 8153425408.0, - "12": 8153425408.0, - "13": 8153425408.0, - "14": 8153425408.0, - "15": 8153425408.0, - "16": 8169207296.0, - "17": 8190995456.0, - "18": 8190995456.0, - "19": 8190995456.0, - "20": 8206373376.0, - "21": 8206373376.0, - "22": 8209894400.0, - "23": 8209894400.0, - "24": 8209894400.0, - "25": 8209894400.0, - "26": 8209894400.0, - "27": 8209894400.0, - "28": 8209894400.0, - "29": 8209894400.0, - "30": 8231049216.0, - "31": 8231049216.0, - "32": 8231049216.0, - "33": 8231049216.0, - "34": 8231049216.0, - "35": 8231049216.0, - "36": 8231049216.0, - "37": 8231049216.0, - "38": 8231049216.0, - "39": 8231049216.0, - "40": 8231049216.0, - "41": 8231049216.0, - "42": 8231049216.0, - "43": 8231049216.0, - "44": 8231049216.0, - "45": 8231049216.0, - "46": 8231049216.0, - "47": 8231049216.0, - "48": 8231049216.0, - "49": 8231049216.0, - "50": 8231049216.0 + "1": 6208857600.0, + "2": 8233667072.0, + "3": 8233667072.0, + "4": 8233667072.0, + "5": 8233667072.0, + "6": 8233667072.0, + "7": 8233667072.0, + "8": 8233667072.0, + "9": 8233667072.0, + "10": 8233667072.0, + "11": 8262715904.0, + "12": 8262715904.0, + "13": 8262715904.0, + "14": 8262715904.0, + "15": 8262715904.0, + "16": 8268117504.0, + "17": 8288236032.0, + "18": 8288236032.0, + "19": 8288236032.0, + "20": 8288236032.0, + "21": 8288236032.0, + "22": 8299924992.0, + "23": 8302176768.0, + "24": 8302176768.0, + "25": 8302176768.0, + "26": 8302176768.0, + "27": 8302176768.0, + "28": 8302176768.0, + "29": 8302176768.0, + "30": 8302176768.0, + "31": 8302176768.0, + "32": 8302176768.0, + "33": 8302176768.0, + "34": 8302176768.0, + "35": 8302176768.0, + "36": 8302176768.0, + "37": 8302176768.0, + "38": 8313753088.0, + "39": 8313753088.0, + "40": 8313753088.0, + "41": 8313753088.0, + "42": 8313753088.0, + "43": 8313753088.0, + "44": 8313753088.0, + "45": 8313753088.0, + "46": 8313753088.0, + "47": 8313753088.0, + "48": 8313753088.0, + "49": 8313753088.0, + "50": 8313753088.0 } }, "mtp_1 loss": { @@ -234,54 +234,54 @@ "values": { "1": 11.07401, "2": 11.0927, - "3": 10.82643, - "4": 10.27622, - "5": 10.45336, - "6": 8.32745, - "7": 9.82615, - "8": 8.0154, - "9": 7.47567, - "10": 6.7579, - "11": 8.9295, - "12": 8.98788, - "13": 7.8023, - "14": 8.02404, - "15": 8.11201, - "16": 8.1414, - "17": 8.13011, - "18": 7.44461, - "19": 8.03519, - "20": 7.53958, - "21": 7.90042, - "22": 7.27752, - "23": 7.88457, - "24": 7.37662, - "25": 8.17118, - "26": 7.69984, - "27": 7.62511, - "28": 7.61547, - "29": 7.69882, - "30": 7.48104, - "31": 7.73945, - "32": 6.36982, - "33": 7.14012, - "34": 7.71799, - "35": 7.6339, - "36": 6.61216, - "37": 8.03046, - "38": 7.58074, - "39": 7.89628, - "40": 7.41236, - "41": 7.42281, - "42": 6.01575, - "43": 7.48966, - "44": 7.86842, - "45": 6.74992, - "46": 7.30434, - "47": 7.72759, - "48": 7.78813, - "49": 7.49091, - "50": 6.75731 + "3": 10.8262, + "4": 10.27574, + "5": 10.45324, + "6": 8.32758, + "7": 9.82629, + "8": 8.01538, + "9": 7.47611, + "10": 6.75851, + "11": 8.92961, + "12": 8.98772, + "13": 7.80203, + "14": 8.02221, + "15": 8.11372, + "16": 8.14498, + "17": 8.13435, + "18": 7.45035, + "19": 8.03784, + "20": 7.54246, + "21": 7.90269, + "22": 7.28093, + "23": 7.88727, + "24": 7.37587, + "25": 8.17289, + "26": 7.70083, + "27": 7.62668, + "28": 7.61747, + "29": 7.69888, + "30": 7.48586, + "31": 7.74301, + "32": 6.37542, + "33": 7.13919, + "34": 7.7198, + "35": 7.63387, + "36": 6.6127, + "37": 8.03449, + "38": 7.58334, + "39": 7.89887, + "40": 7.41168, + "41": 7.42316, + "42": 6.01689, + "43": 7.48867, + "44": 7.86976, + "45": 6.75113, + "46": 7.3054, + "47": 7.73281, + "48": 7.79017, + "49": 7.48985, + "50": 6.75753 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 89.01124, - "2": 2.6502, - "3": 2.63345, - "4": 4.59488, - "5": 2.67282, - "6": 2.75196, - "7": 2.38279, - "8": 1.95041, - "9": 2.55604, - "10": 1.89736, - "11": 1.9113, - "12": 2.59681, - "13": 1.87891, - "14": 1.89422, - "15": 1.89013, - "16": 1.88538, - "17": 1.91699, - "18": 1.88747, - "19": 1.93691, - "20": 1.88026, - "21": 1.94991, - "22": 1.90744, - "23": 1.8723, - "24": 1.87253, - "25": 1.93307, - "26": 1.93367, - "27": 1.88847, - "28": 1.93732, - "29": 1.95357, - "30": 1.93714, - "31": 1.89529, - "32": 1.87856, - "33": 1.96722, - "34": 1.88912, - "35": 1.88862, - "36": 1.88927, - "37": 1.8706, - "38": 1.85827, - "39": 1.86274, - "40": 1.9308, - "41": 1.93374, - "42": 1.88512, - "43": 1.89015, - "44": 1.90068, - "45": 1.89028, - "46": 1.89124, - "47": 1.87497, - "48": 1.86585, - "49": 1.87712, - "50": 1.95776 + "1": 64.76466, + "2": 2.42359, + "3": 2.56054, + "4": 2.61199, + "5": 2.3272, + "6": 2.19806, + "7": 2.16133, + "8": 1.97339, + "9": 2.14238, + "10": 2.05512, + "11": 2.00856, + "12": 1.96198, + "13": 2.08656, + "14": 1.96948, + "15": 1.96059, + "16": 1.97248, + "17": 1.97639, + "18": 2.01386, + "19": 1.9606, + "20": 1.94716, + "21": 2.00286, + "22": 1.965, + "23": 2.03401, + "24": 2.00528, + "25": 2.03321, + "26": 1.95999, + "27": 1.96395, + "28": 1.98191, + "29": 1.99346, + "30": 1.97579, + "31": 1.95097, + "32": 1.95726, + "33": 1.9399, + "34": 1.99177, + "35": 1.91153, + "36": 1.97534, + "37": 1.95691, + "38": 1.96206, + "39": 1.9414, + "40": 1.96027, + "41": 1.97807, + "42": 1.98861, + "43": 1.94856, + "44": 1.96339, + "45": 1.96835, + "46": 1.99733, + "47": 1.9716, + "48": 1.96591, + "49": 1.93865, + "50": 1.95198 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml index c657b9087e7..be34eb9aec5 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -5,6 +5,9 @@ ENV_VARS: NCCL_NVLS_ENABLE: 0 PYTHONWARNINGS: ignore NCCL_DEBUG: VERSION + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: ':4096:8' MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -29,8 +32,6 @@ MODEL_ARGS: --exit-duration-in-mins: 230 --no-check-for-nan-in-loss-and-grad: true --no-rope-fusion: true - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native --manual-gc: true --manual-gc-interval: 100 --recompute-granularity: selective diff --git a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py new file mode 100644 index 00000000000..91c74fe1bb6 --- /dev/null +++ b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py @@ -0,0 +1,372 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import gc +import os +import sys + +import pytest +import torch + +from megatron.core.enums import ModelType +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_decoder_block_spec, + get_gpt_mtp_block_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator +from megatron.core.pipeline_parallel.utils import set_streams +from megatron.core.tensor_parallel.random import HAVE_TE, model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import CudaGraphScope +from megatron.core.transformer.module import float16_to_fp32 +from megatron.core.utils import is_te_min_version, unwrap_model +from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args +from megatron.training.global_vars import ( + destroy_global_vars, + get_args, + set_args, + set_global_variables, +) +from megatron.training.training import setup_model_and_optimizer +from tests.unit_tests.test_utilities import Utils + + +def is_deep_ep_available(): + from megatron.core.transformer.moe.fused_a2a import HAVE_DEEP_EP + + return HAVE_DEEP_EP + + +def is_hybrid_ep_available(): + from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP + + return HAVE_HYBRIDEP + + +def save(fn, message): + with open(fn, 'w') as f: + f.write(message) + + +class TestPartialCudaGraphedA2AOverlap: + """Test that CUDA graph outputs match ep-overlapped CUDA graph outputs for various scopes.""" + + def setup_method(self, method): + self.seq_length = 512 + self.micro_batch_size = 2 + # Store original environment variable values + self.original_env = { + 'CUDA_DEVICE_MAX_CONNECTIONS': os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS'), + 'NVTE_ALLOW_NONDETERMINISTIC_ALGO': os.environ.get('NVTE_ALLOW_NONDETERMINISTIC_ALGO'), + } + self.cuda_graph_helper = None + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + os.environ['NVTE_ALLOW_NONDETERMINISTIC_ALGO'] = '0' + + def teardown_method(self, method): + # Restore original environment variable values + for key, value in self.original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): + self.cuda_graph_helper.delete_cuda_graphs() + self.cuda_graph_helper = None + + gc.collect() + + def model_provider( + self, + pre_process=True, + post_process=True, + layer_spec_fn=get_gpt_decoder_block_spec, + **config_kwargs, + ): + model_parallel_cuda_manual_seed(123) + args = get_args() + config = core_transformer_config_from_args(args) + transformer_layer_spec = layer_spec_fn( + config, + use_transformer_engine=True, + normalization=args.normalization, + qk_l2_norm=args.qk_l2_norm, + ) + if args.mtp_num_layers: + mtp_block_spec = get_gpt_mtp_block_spec( + config, transformer_layer_spec, use_transformer_engine=True + ) + else: + mtp_block_spec = None + return GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + mtp_block_spec=mtp_block_spec, + ) + + def create_test_args( + self, cuda_graph_impl, cuda_graph_scope, cuda_graph_warmup_steps, ep_size, **kwargs + ): + destroy_global_vars() + destroy_num_microbatches_calculator() + + sys.argv = ['test_cuda_graphs.py'] + args = parse_args() + args.num_layers = 1 + args.mtp_num_layers = None + args.vocab_size = 1024 + args.hidden_size = 128 + args.num_attention_heads = 8 + args.max_position_embeddings = 512 + args.global_batch_size = self.micro_batch_size * 8 + args.micro_batch_size = self.micro_batch_size + args.create_attention_mask_in_dataloader = True + args.seq_length = self.seq_length + args.tensor_model_parallel_size = 2 + args.sequence_parallel = True + args.pipeline_model_parallel_size = 1 + args.context_parallel_size = 1 + args.expert_model_parallel_size = ep_size + args.train_iters = 10 + args.lr = 3e-5 + args.bf16 = True + args.add_bias_linear = False + args.swiglu = True + args.use_distributed_optimizer = True + args.position_embedding_type = "rope" + args.rotary_percent = 1.0 + args.hidden_dropout = 0.0 + args.attention_dropout = 0.0 + args.untie_embeddings_and_output_weights = True + + # MoE settings + args.num_experts = 16 + args.expert_model_parallel_size = ep_size + args.moe_shared_expert_intermediate_size = 1024 + args.moe_layer_freq = kwargs.get("moe_layer_freq", "[0,0,1,1]") + args.moe_permute_fusion = True + args.moe_router_fusion = True + args.moe_router_topk = 2 + + # CUDA graph settings + args.cuda_graph_impl = cuda_graph_impl + args.cuda_graph_scope = cuda_graph_scope + args.cuda_graph_warmup_steps = cuda_graph_warmup_steps + args.use_te_rng_tracker = cuda_graph_impl != "none" + + for key, value in kwargs.items(): + assert hasattr(args, key) + setattr(args, key, value) + + validate_args(args) + set_global_variables(args, False) + return args + + def get_batch(self, seq_length, micro_batch_size): + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + labels = 1 + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, seq_length, seq_length), dtype=bool + ).cuda() + loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda() + return input_ids, labels, position_ids, attention_mask, loss_mask + + def _run_1f1b_helper(self, gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps): + from megatron.core.models.common.model_chunk_schedule_plan import ( + TransformerModelChunkSchedulePlan, + ) + from megatron.core.pipeline_parallel.schedules import set_current_microbatch + + schedule_plans = [] + losses = [] + set_current_microbatch(gpt_model[0], 1) + + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + assert cuda_graph_warmup_steps > 0, "cuda_graph_warmup_steps must be greater than 0" + for fwd_mb_idx in range(num_iters + 1): + # Capture CUDA graphs after warmup if helper is provided + if self.cuda_graph_helper is not None and fwd_mb_idx == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + + if fwd_mb_idx < cuda_graph_warmup_steps: + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + output = gpt_model[0].forward(**data) + schedule_plans.append(None) + else: + if fwd_mb_idx == cuda_graph_warmup_steps: + extra_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) + TransformerModelChunkSchedulePlan.run(extra_schedule_plan, None) + schedule_plans[-1] = extra_schedule_plan + f_schedule_plan = unwrap_model(gpt_model[0]).build_schedule_plan(**data) + b_schedule_plan = schedule_plans[-1] + schedule_plans.append(f_schedule_plan) + if b_schedule_plan is not None: + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + output = TransformerModelChunkSchedulePlan.run( + f_schedule_plan, + b_schedule_plan, + b_grad=torch.ones_like(output) if fwd_mb_idx > 0 else None, + ) + # Check output shapes + if fwd_mb_idx < num_iters: + assert output is not None + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length + losses.append(output) + + if fwd_mb_idx < cuda_graph_warmup_steps: + output.backward(torch.ones_like(output)) + + for param in gpt_model[0].parameters(): + assert param.main_grad is not None + + update_successful, _, _ = optimizer.step() + assert update_successful + + return losses + + def _run_test_helper( + self, + ep_size, + cuda_graph_impl, + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_overlap=False, + **kwargs, + ): + """Test fp8_param with gpt_model.""" + args = self.create_test_args( + cuda_graph_impl, + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_size, + overlap_moe_expert_parallel_comm=ep_overlap, + **kwargs, + ) + if ep_overlap: + set_streams() + set_args(args) + torch.manual_seed(123) + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, expert_model_parallel_size=ep_size + ) + + input_ids, labels, position_ids, attention_mask, loss_mask = self.get_batch( + self.seq_length, self.micro_batch_size + ) + + gpt_model, optimizer, _ = setup_model_and_optimizer( + self.model_provider, ModelType.encoder_or_decoder + ) + assert len(gpt_model) == 1 # Assume only one model in the model provider. + + loss_list = [] + + if cuda_graph_impl == "transformer_engine": + from megatron.core.transformer.cuda_graphs import TECudaGraphHelper + + self.cuda_graph_helper = TECudaGraphHelper( + model=gpt_model, + config=gpt_model[0].config, + seq_length=self.seq_length, + micro_batch_size=self.micro_batch_size, + optimizers=[optimizer], + ) + + num_iters = cuda_graph_warmup_steps + 2 + data = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "labels": labels, + "loss_mask": loss_mask, + } + if not ep_overlap: + for i in range(num_iters): + gpt_model[0].zero_grad_buffer() + optimizer.zero_grad() + + # Capture CUDA graphs after warmup if helper is provided + if self.cuda_graph_helper is not None and i == cuda_graph_warmup_steps: + self.cuda_graph_helper.create_cudagraphs() + + output = unwrap_model(gpt_model[0]).forward(**data) + output = float16_to_fp32(output) + + # Check output shapes + assert output.shape[0] == self.micro_batch_size + assert output.shape[1] == self.seq_length + + # Verify gradients + output.backward(torch.ones_like(output)) + for param in gpt_model[0].parameters(): + assert param.main_grad is not None + + update_successful, _, _ = optimizer.step() + assert update_successful + + loss_list.append(output) + else: + loss_list = self._run_1f1b_helper( + gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps + ) + + return loss_list + + @pytest.mark.skipif( + not (HAVE_TE and is_te_min_version("2.10.0")), + reason="Partial CUDA graph support requires TransformerEngine version >= 2.10.0", + ) + @pytest.mark.parametrize("moe_dispatcher_type", ["alltoall", "deepep"]) + def test_moe_partial_cudagraph_with_ep_overlap(self, moe_dispatcher_type): + extra_kwargs = {"moe_layer_freq": 1} + if moe_dispatcher_type == "deepep": + if not is_deep_ep_available(): + pytest.skip("Deep EP is not available") + extra_kwargs["moe_token_dispatcher_type"] = "flex" + extra_kwargs["moe_flex_dispatcher_backend"] = "deepep" + extra_kwargs["moe_router_dtype"] = "fp32" + elif moe_dispatcher_type == "hybridep": + if not is_hybrid_ep_available(): + pytest.skip("Hybrid EP is not available") + extra_kwargs["moe_token_dispatcher_type"] = "flex" + extra_kwargs["moe_flex_dispatcher_backend"] = "hybridep" + else: + extra_kwargs["moe_token_dispatcher_type"] = moe_dispatcher_type + + loss_list_ref = self._run_test_helper(4, "none", None, 3, **extra_kwargs) + for cuda_graph_scope in [ + [CudaGraphScope.attn], + [CudaGraphScope.attn, CudaGraphScope.moe_router], + [CudaGraphScope.attn, CudaGraphScope.moe_router, CudaGraphScope.moe_preprocess], + ]: + cuda_graph_warmup_steps = 3 + loss_list = self._run_test_helper( + 4, + "transformer_engine", + cuda_graph_scope, + cuda_graph_warmup_steps, + ep_overlap=True, + **extra_kwargs, + ) + assert len(loss_list) == len(loss_list_ref) + for i in range(len(loss_list)): + assert torch.equal( + loss_list[i].mean(), loss_list_ref[i].mean() + ), f"scope={cuda_graph_scope}, i={i},loss_list={loss_list[i]}, loss_list_ref={loss_list_ref[i]}" + print(f"[DEBUG] Pass {cuda_graph_scope}") diff --git a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py index 5ec096e5a04..c6c4a75af99 100644 --- a/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_schedule_layer_1f1b.py @@ -306,7 +306,7 @@ def test_transformer_layer_overlap_shared_expert(self): "moe_shared_expert_intermediate_size": 512, } overlap_config = get_test_config(extra_kwargs=extra_kwargs) - extra_kwargs["moe_shared_expert_overlap"] = True + extra_kwargs["moe_shared_expert_overlap"] = False ref_config = get_test_config(extra_kwargs=extra_kwargs) microbatches = 4 with deterministic_mode(): diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py index 7db4256a849..a52843956df 100644 --- a/tests/unit_tests/a2a_overlap/utils.py +++ b/tests/unit_tests/a2a_overlap/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os from contextlib import contextmanager from dataclasses import dataclass diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py index b861aa2df49..86b9219fe0f 100644 --- a/tests/unit_tests/pipeline_parallel/test_schedules.py +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + import os import pytest @@ -127,6 +129,52 @@ def test_get_pipeline_parallel_order( for k, v in order_cnt.items(): assert -k in order_cnt and order_cnt[-k] == v + layers_per_chunk = 2 + num_layers_per_chunk = [layers_per_chunk] * num_model_chunks + # disable wgrad compute + overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( + order, num_layers_per_chunk, False + ) + assert max(overlapped_order) == num_model_chunks * layers_per_chunk + assert len(overlapped_order) == len(order) * layers_per_chunk + assert len(chunk_id_list) == len(overlapped_order) + order_cnt = {} + accumulated_order = 0 + for o in overlapped_order: + order_cnt[o] = order_cnt.get(o, 0) + 1 + if o < 0: + assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] + elif -o in order_cnt: + assert order_cnt[-o] < order_cnt[o] + accumulated_order += o + assert accumulated_order >= 0 + assert accumulated_order == 0 + + # enable wgrad compute + overlapped_order, chunk_id_list = schedule.get_overlap_moe_expert_parallel_comm_order( + order, num_layers_per_chunk, True + ) + assert max(overlapped_order) == num_model_chunks * layers_per_chunk + assert len(overlapped_order) == len(order) * layers_per_chunk * 3 // 2 + assert len(chunk_id_list) == len(overlapped_order) + from math import ceil + + order_cnt = {} + accumulated_order = 0 + prev_o = 0 + for o in overlapped_order: + if ceil(o) != o: + assert prev_o - 0.5 == o + else: + order_cnt[o] = order_cnt.get(o, 0) + 1 + if o < 0: + assert -o in order_cnt and order_cnt[-o] >= order_cnt[o] + elif -o in order_cnt: + assert order_cnt[-o] < order_cnt[o] + accumulated_order += o + prev_o = o + assert accumulated_order < 0 + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/test_submodule_callables.py b/tests/unit_tests/transformer/test_submodule_callables.py index 1ccb6fd5be8..73059495c06 100644 --- a/tests/unit_tests/transformer/test_submodule_callables.py +++ b/tests/unit_tests/transformer/test_submodule_callables.py @@ -64,7 +64,7 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): output_tensors = [] # get callables callables, dw = build_layer_callables(model) - attn, post_attn, dispatch, moe, combine, post_process = callables + attn, dispatch, moe, combine, post_process = callables assert post_process is None dummy_model = DummyState() dummy_model.decoder = DummyState() @@ -76,24 +76,16 @@ def run_model_submodules_with_capture(model, input_tensors, microbatches): node.chunk_state.model = dummy_model # attn fwd - hidden_states = attn(node, input_tensors[i]) - - # post attn fwd - local_tokens, probs = post_attn(node, hidden_states) + local_tokens, probs = attn(node, input_tensors[i]) # dispatch fwd dispatched_tokens = dispatch(node, local_tokens, probs) # moe fwd - expert_outputs = moe(node, dispatched_tokens) - if model.mlp.use_shared_expert: - expert_output, shared_expert_output = expert_outputs - else: - expert_output = expert_outputs - shared_expert_output = None + expert_output = moe(node, dispatched_tokens) # combine fwd - hidden_states = combine(node, expert_output, shared_expert_output) + hidden_states = combine(node, expert_output) # loss output_tensors.append(hidden_states) From 0bc4114957a22d186e7c700e42b1c131b806e78b Mon Sep 17 00:00:00 2001 From: Pingtian Li <158665726+Wohox@users.noreply.github.com> Date: Wed, 7 Jan 2026 12:13:19 +0800 Subject: [PATCH 217/248] [Dev] fix EP Overlap Partial Cuda Graph Unit Test hang issue (#2838) --- .../a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py index 91c74fe1bb6..719bd5df18f 100644 --- a/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py +++ b/tests/unit_tests/a2a_overlap/test_cuda_graphed_schedule_chunk_1f1b.py @@ -71,12 +71,15 @@ def teardown_method(self, method): Utils.destroy_model_parallel() destroy_global_vars() destroy_num_microbatches_calculator() + self.delete_cuda_graphs() + + gc.collect() + + def delete_cuda_graphs(self): if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created(): self.cuda_graph_helper.delete_cuda_graphs() self.cuda_graph_helper = None - gc.collect() - def model_provider( self, pre_process=True, @@ -326,6 +329,8 @@ def _run_test_helper( gpt_model, optimizer, data, num_iters, cuda_graph_warmup_steps ) + self.delete_cuda_graphs() + return loss_list @pytest.mark.skipif( From 28c586e91506631835d8c5f29bf325a4e5aefddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 8 Jan 2026 20:13:19 +0100 Subject: [PATCH 218/248] build: Bump jet-client (#2877) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- docker/Dockerfile.ci.dev | 4 +++- docker/Dockerfile.ci.nemo | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index fa4d84bcad0..3f440efcd47 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -1,3 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + # syntax=docker/dockerfile:1.3-labs ARG FROM_IMAGE_NAME @@ -90,6 +92,6 @@ RUN --mount=type=secret,id=JET_INDEX_URLS \ LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL) uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger" uv pip install --no-cache-dir --upgrade "setuptools<80.0.0" - uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=3.0" + uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=4.0" EOF ### diff --git a/docker/Dockerfile.ci.nemo b/docker/Dockerfile.ci.nemo index 2369602f54d..93fe23bfd6f 100644 --- a/docker/Dockerfile.ci.nemo +++ b/docker/Dockerfile.ci.nemo @@ -1,3 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + # syntax=docker/dockerfile:1.3-labs ARG FROM_IMAGE_NAME @@ -14,7 +16,7 @@ FROM main as jet ARG JET_API_VERSION RUN --mount=type=secret,id=JET_INDEX_URLS \ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ - pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=3.0" --upgrade $JET_INDEX_URLS + pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=4.0" --upgrade $JET_INDEX_URLS ENV PATH="$PATH:/opt/jet/bin" ### From 46d1f47d74c782f45c0bcdf4da001aed982c8de9 Mon Sep 17 00:00:00 2001 From: vasunvidia <108759426+vasunvidia@users.noreply.github.com> Date: Thu, 8 Jan 2026 17:00:06 -0800 Subject: [PATCH 219/248] FP8 attention knob for nvFP4 recipe (#2818) --- megatron/core/fp4_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/core/fp4_utils.py b/megatron/core/fp4_utils.py index 4f9e7e5d026..a4cc172796b 100644 --- a/megatron/core/fp4_utils.py +++ b/megatron/core/fp4_utils.py @@ -86,7 +86,9 @@ def get_fp4_recipe(config: TransformerConfig): if is_te_min_version("2.7.0.dev0"): if config.fp4_recipe == Fp4Recipe.nvfp4: try: - fp4_recipe = transformer_engine.common.recipe.NVFP4BlockScaling() + fp4_recipe = transformer_engine.common.recipe.NVFP4BlockScaling( + fp8_dpa=config.fp8_dot_product_attention + ) except AttributeError: raise ValueError( """NVFP4BlockScaling recipe is not available in this version of From ed6ebff3021e5eb5fc45aa13c00c9cdca889288f Mon Sep 17 00:00:00 2001 From: Zhongbo Zhu <42691305+zhongbozhu@users.noreply.github.com> Date: Thu, 8 Jan 2026 20:00:59 -0800 Subject: [PATCH 220/248] [DEV][NVFP4][MOE] 128 Zero Padding for Grouped Quantization kernels and Cuda Graph Support (#2654) Signed-off-by: Zhongbo Zhu Co-authored-by: Xin Yao --- megatron/core/fp4_utils.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/megatron/core/fp4_utils.py b/megatron/core/fp4_utils.py index a4cc172796b..95368d7c2b7 100644 --- a/megatron/core/fp4_utils.py +++ b/megatron/core/fp4_utils.py @@ -61,13 +61,23 @@ def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int: Note that since we are also random hadamard transform for NVFP4 training, we want fused group nvfp4 quantize plus hadamard transform. Hadamard transform will leverage tensor core instructions for better performance, while group quantize kernels also - prefer a more aligned size in token dimension M. Therefore, we apply align size 64 - here for better performance in MOE. + prefer a more aligned size in token dimension M. The efficiently leverage grouped + kernels, padding needs to be 64 multiple, but 128 multiple will bring even faster. + + When it comes to MOE cuda graph support, the number of tokens for each expert should + be a buffer on device memory, which means that we don't know the token dimension for + each expertin host, therefore we cannot calculate the zero padded scaling factors shape + on host to comply with the NVFP4 GEMM scaling factor layout. However, if we have already + zero padded the tokens to 128 multiple, then there is no need for such padding, so that + host doesn't need to copy the token distribution from device to host (which will break + the CUDA graph). Paper link: https://arxiv.org/pdf/2509.25149 + Scaling factor layout: https://docs.nvidia.com/cuda/cublas/#d-block-scaling-factors-layout + TE NVFP4 Grouped Quantization: https://github.com/NVIDIA/TransformerEngine/pull/2411 """ # pylint: disable=unused-argument - return 64 + return 128 def dequantize_fp4_tensor(fp4_tensor: torch.Tensor) -> torch.Tensor: From ebe7079ba472894e5f6ec845ca0027e1fd0c0e10 Mon Sep 17 00:00:00 2001 From: vasunvidia <108759426+vasunvidia@users.noreply.github.com> Date: Thu, 8 Jan 2026 20:01:01 -0800 Subject: [PATCH 221/248] Add check for full_iteration scope before instantiating CudaGraphManager (#2657) Co-authored-by: Xin Yao Co-authored-by: Zijie Yan --- megatron/core/transformer/module.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py index 2330df91b52..d68f34ffd0b 100644 --- a/megatron/core/transformer/module.py +++ b/megatron/core/transformer/module.py @@ -9,6 +9,7 @@ from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.transformer.enums import CudaGraphScope from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import ( ensure_metadata_has_dp_cp_group, @@ -167,7 +168,10 @@ def __init__(self, config: TransformerConfig, vp_stage: Optional[int] = None): assert isinstance(config, TransformerConfig), "config must be a TransformerConfig" # Enable cuda graphs. - if config.cuda_graph_impl == "local": + if ( + config.cuda_graph_impl == "local" + and CudaGraphScope.full_iteration not in config.cuda_graph_scope + ): from megatron.core.transformer.cuda_graphs import CudaGraphManager self.cudagraph_manager = CudaGraphManager(config, vp_stage=vp_stage) From 736da3cff027dd7f3849d1340dad0f8586b02666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 9 Jan 2026 10:06:58 +0100 Subject: [PATCH 222/248] Reapply "[Dev] Use the latest Hybrid-EP (#2423)" (#2867) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- docker/Dockerfile.ci.dev | 4 +- megatron/core/transformer/moe/fused_a2a.py | 51 +++++-------------- .../core/transformer/moe/token_dispatcher.py | 15 ++---- 3 files changed, 19 insertions(+), 51 deletions(-) diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 3f440efcd47..d8c1dd33942 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -62,9 +62,9 @@ RUN bash -ex <<"EOF" ln -s libnvshmem_host.so.3 libnvshmem_host.so popd - git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git + git clone --branch hybrid-ep https://github.com/Autumn1998/DeepEP.git pushd DeepEP - git checkout 1dddd194c26911c35b4f53a148617dd73de0ffc9 + git checkout df375b40f24e5c495e2db36e808125266661652c patch -p1 < /workspace/deepep.patch popd TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py index 045a93039b3..aa13b9b5b5b 100644 --- a/megatron/core/transformer/moe/fused_a2a.py +++ b/megatron/core/transformer/moe/fused_a2a.py @@ -3,6 +3,7 @@ # Copyright (c) 2025 DeepSeek # Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE +from megatron.core.utils import internal_api try: from deep_ep import Buffer @@ -328,6 +329,7 @@ def reset_hybrid_ep_buffer(): _hybrid_ep_buffer = None +@internal_api class HybridEPDispatch(torch.autograd.Function): ''' Fused dispatch operation for permute + dispatch a2a + permute using the HybridEP backend @@ -343,7 +345,6 @@ def forward( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, - num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -362,11 +363,9 @@ def forward( num_sms_combine_api, fp8_dispatch, ) - # Defaultly, the output token_per_expert and num_dispatched_tokens_tensor - # will be put on the CPU to avoid the potential sync in combine/backward pass, - # but if we provide the num_dispatched_tokens and num_permuted_tokens on CPU, - # we do not need to the D2H here. - use_host_meta = num_dispatched_tokens is None or num_permuted_tokens is None + # If we provide the num_permuted_tokens, we do not need to use sync to + # wait for the data in pinned memory ready + non_blocking = num_permuted_tokens is not None # Process the dispatch ( dispatched_hidden, @@ -381,14 +380,12 @@ def forward( scaling_factor=None, num_of_experts_per_rank=num_local_experts, pad_multiple=pad_multiple, - num_dispatched_tokens=num_dispatched_tokens, num_permuted_tokens=num_permuted_tokens, - use_host_meta=use_host_meta, + non_blocking=non_blocking, ) ctx.handle = handle ctx.pad_multiple = pad_multiple - ctx.num_dispatched_tokens = num_dispatched_tokens return ( dispatched_hidden, dispatched_probs, @@ -404,36 +401,27 @@ def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_exper ''' handle = ctx.handle combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute( - hidden=grad_x, - probs=grad_probs, - handle=handle, - pad_multiple=ctx.pad_multiple, - num_dispatched_tokens=ctx.num_dispatched_tokens, + hidden=grad_x, probs=grad_probs, handle=handle, pad_multiple=ctx.pad_multiple ) return combined_hidden, None, combined_probs, None, None, None, None, None, None, None +@internal_api class HybridEPCombine(torch.autograd.Function): ''' Fused combine operation for permute + combine a2a + permute using the HybridEP backend ''' @staticmethod - def forward( - ctx, x, handle, num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None - ): + def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None): ''' Forward pass of fused combine of the HybridEP backend ''' combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute( - hidden=x, - handle=handle, - pad_multiple=pad_multiple, - num_dispatched_tokens=num_dispatched_tokens, + hidden=x, handle=handle, pad_multiple=pad_multiple ) ctx.handle = handle ctx.pad_multiple = pad_multiple - ctx.num_dispatched_tokens = num_dispatched_tokens ctx.num_permuted_tokens = num_permuted_tokens return combined_hidden @@ -448,7 +436,6 @@ def backward(ctx, grad_x): scaling_factor=None, handle=handle, pad_multiple=ctx.pad_multiple, - num_dispatched_tokens=ctx.num_dispatched_tokens, num_permuted_tokens=ctx.num_permuted_tokens, ) return dispatched_hidden, None, None, None, None @@ -456,6 +443,7 @@ def backward(ctx, grad_x): if HAVE_HYBRIDEP: + @internal_api def hybrid_ep_dispatch( x, routing_map, @@ -464,7 +452,6 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api=24, num_sms_combine_api=24, - num_dispatched_tokens=None, num_permuted_tokens=None, pad_multiple=None, ): @@ -487,10 +474,6 @@ def hybrid_ep_dispatch( Number of SMs used by the dispatch API. num_sms_combine_api (int): Number of SMs used by the combine API. - num_dispatched_tokens (int): - Number of tokens after dispatch but before permute. HybridEP uses this - to allocate buffers. If not provided, HybridEP obtains the size from - a GPU tensor, which causes a D2H synchronization. num_permuted_tokens (int): Number of tokens after permute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, @@ -507,12 +490,12 @@ def hybrid_ep_dispatch( num_local_experts, num_sms_dispatch_api, num_sms_combine_api, - num_dispatched_tokens, num_permuted_tokens, pad_multiple, ) - def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple): + @internal_api + def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple): ''' Perform fused combine operation for unpermute + combine a2a + unpermute using the HybridEP backend @@ -522,10 +505,6 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad Input hidden states to combine handle (EventHandle): Communication handle from dispatch operation - num_dispatched_tokens (int): - The number of tokens after unpermute but before combine. HybridEP uses this - to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, - which causes a D2H synchronization. num_permuted_tokens (int): The number of tokens before unpermute. HybridEP uses this to allocate buffers. If not provided, HybridEP obtains the size from a GPU tensor, which causes a D2H synchronization. @@ -533,9 +512,7 @@ def hybrid_ep_combine(x, handle, num_dispatched_tokens, num_permuted_tokens, pad The alignment multiple required for FP8 GEMM. If not provided, no padding is performed. ''' - return HybridEPCombine.apply( - x, handle, num_dispatched_tokens, num_permuted_tokens, pad_multiple - ) + return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple) else: hybrid_ep_dispatch = None diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index 61ef0b5f084..d0da38d6322 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -985,11 +985,8 @@ def __init__( if self.drop_and_pad: assert self.capacity_factor is not None self.capacity = None - # The up-bound for the number of tokens after dispatch op, -1 means no up-bound, - # which will cause a CPU sync - self.num_dispatched_tokens = None - # Actually the sum of tokens_per_expert, the up-bound for the number of tokens - # after permute op, -1 means no up-bound, will cause a CPU sync + # Actually the the up-bound for the number of tokens + # after permute op, None means no up-bound, will cause a CPU sync self.num_permuted_tokens = None # Metadata @@ -1018,12 +1015,9 @@ def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor): num_experts=self.num_experts, capacity_factor=self.capacity_factor, ) - # We cannot predict the actual number of tokens after the dispatch op, - # so we set it to the worst case in drop_and_pad mode - self.num_dispatched_tokens = self.capacity * self.group.size() * self.num_local_experts # In drop_and_pad mode, the number of tokens after the permute op # can be computed on the CPU - self.num_permuted_tokens = self.num_dispatched_tokens + self.num_permuted_tokens = self.capacity * self.group.size() * self.num_local_experts self.tokens_per_expert = torch.full( (self.num_local_experts,), self.capacity * self.group.size(), dtype=torch.long ) @@ -1052,7 +1046,6 @@ def dispatch( num_local_experts=self.num_local_experts, num_sms_dispatch_api=self.config.moe_hybridep_num_sms, num_sms_combine_api=self.config.moe_hybridep_num_sms, - num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1074,7 +1067,6 @@ def combine( hidden_states = hybrid_ep_combine( x=hidden_states, handle=self.handle, - num_dispatched_tokens=self.num_dispatched_tokens, num_permuted_tokens=self.num_permuted_tokens, pad_multiple=self.pad_multiple, ) @@ -1084,7 +1076,6 @@ def combine( self.handle = None if not self.drop_and_pad: self.num_permuted_tokens = None - self.num_dispatched_tokens = None return hidden_states def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor) -> torch.Tensor: From 9d741cf674fd29fca38988e54ae2f36505a7cc6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 13 Jan 2026 00:12:33 +0100 Subject: [PATCH 223/248] build: Main dependency bump for 26.02 (#2682) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 1 + .gitlab/stages/01.build.yml | 8 +- docker/.ngc_version.dev | 2 +- pyproject.toml | 5 +- .../launch_nemo_run_workload.py | 1 + uv.lock | 1228 ++++++++--------- 6 files changed, 604 insertions(+), 641 deletions(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 5c35385b036..a17b4a9a8c1 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -77,6 +77,7 @@ runs: export PYTHONPATH=$(pwd) export NEMORUN_HOME=$(pwd) + export NCCL_DEBUG=INFO pip install --no-cache-dir uv uv sync --only-group test uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index d67225311f6..b3ab8cc5bd5 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -16,13 +16,13 @@ services: - name: docker:24.0.5-dind variables: - HEALTHCHECK_TCP_PORT: "2376" + HEALTHCHECK_TCP_PORT: '2376' timeout: 180m variables: DOCKER_HOST: tcp://docker:2376 - DOCKER_TLS_CERTDIR: "/certs" + DOCKER_TLS_CERTDIR: '/certs' DOCKER_TLS_VERIFY: 1 - DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client" + DOCKER_CERT_PATH: '$DOCKER_TLS_CERTDIR/client' TAG: purpose/builder-large STAGE: jet MCORE_BACKWARDS_REF: core_r0.14.0 @@ -59,7 +59,7 @@ test:build_image: - IMAGE: CI_MCORE_DEV_IMAGE FILE: Dockerfile.ci.dev IMAGE_TYPE: dev - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3 + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3 - IMAGE: UTILITY_IMAGE FILE: Dockerfile.linting BASE_IMAGE: python:3.10 diff --git a/docker/.ngc_version.dev b/docker/.ngc_version.dev index 6b72812b34f..8e8108b9a9a 100644 --- a/docker/.ngc_version.dev +++ b/docker/.ngc_version.dev @@ -1 +1 @@ -nvcr.io/nvidia/pytorch:25.09-py3 \ No newline at end of file +nvcr.io/nvidia/pytorch:25.11-py3 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 120db5b2ad7..22ee405cb4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"] dev = [ "nvidia-modelopt[torch]; sys_platform != 'darwin'", - "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.11.0", + "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0", "nvidia-resiliency-ext", "tqdm", "einops~=0.8", @@ -174,10 +174,11 @@ override-dependencies = [ ] [tool.uv.sources] + flash_mla = [ { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" }, ] -# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.10" } # on `release_v2.10` +transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.11" } nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" } emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "v0.1.0" } diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index 6e2b73e430f..26a7dbd79f5 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -115,6 +115,7 @@ def main( "ENABLE_LIGHTWEIGHT_MODE": str(enable_lightweight_mode).lower(), "N_REPEAT": "1", "CLUSTER": "dgxh100_dgxc", + "NCCL_DEBUG": "INFO", }, packager=run.Packager(), volumes=artifacts, diff --git a/uv.lock b/uv.lock index b36351849fe..15892827c83 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.2" +version = "3.13.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -87,110 +87,110 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1c/ce/3b83ebba6b3207a7135e5fcaba49706f8a4b6008153b4e30540c982fae26/aiohttp-3.13.2.tar.gz", hash = "sha256:40176a52c186aefef6eb3cad2cdd30cd06e3afbe88fe8ab2af9c0b90f228daca", size = 7837994, upload-time = "2025-10-28T20:59:39.937Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/34/939730e66b716b76046dedfe0842995842fa906ccc4964bba414ff69e429/aiohttp-3.13.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2372b15a5f62ed37789a6b383ff7344fc5b9f243999b0cd9b629d8bc5f5b4155", size = 736471, upload-time = "2025-10-28T20:55:27.924Z" }, - { url = "https://files.pythonhosted.org/packages/fd/cf/dcbdf2df7f6ca72b0bb4c0b4509701f2d8942cf54e29ca197389c214c07f/aiohttp-3.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7f8659a48995edee7229522984bd1009c1213929c769c2daa80b40fe49a180c", size = 493985, upload-time = "2025-10-28T20:55:29.456Z" }, - { url = "https://files.pythonhosted.org/packages/9d/87/71c8867e0a1d0882dcbc94af767784c3cb381c1c4db0943ab4aae4fed65e/aiohttp-3.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:939ced4a7add92296b0ad38892ce62b98c619288a081170695c6babe4f50e636", size = 489274, upload-time = "2025-10-28T20:55:31.134Z" }, - { url = "https://files.pythonhosted.org/packages/38/0f/46c24e8dae237295eaadd113edd56dee96ef6462adf19b88592d44891dc5/aiohttp-3.13.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6315fb6977f1d0dd41a107c527fee2ed5ab0550b7d885bc15fee20ccb17891da", size = 1668171, upload-time = "2025-10-28T20:55:36.065Z" }, - { url = "https://files.pythonhosted.org/packages/eb/c6/4cdfb4440d0e28483681a48f69841fa5e39366347d66ef808cbdadddb20e/aiohttp-3.13.2-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6e7352512f763f760baaed2637055c49134fd1d35b37c2dedfac35bfe5cf8725", size = 1636036, upload-time = "2025-10-28T20:55:37.576Z" }, - { url = "https://files.pythonhosted.org/packages/84/37/8708cf678628216fb678ab327a4e1711c576d6673998f4f43e86e9ae90dd/aiohttp-3.13.2-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e09a0a06348a2dd73e7213353c90d709502d9786219f69b731f6caa0efeb46f5", size = 1727975, upload-time = "2025-10-28T20:55:39.457Z" }, - { url = "https://files.pythonhosted.org/packages/e6/2e/3ebfe12fdcb9b5f66e8a0a42dffcd7636844c8a018f261efb2419f68220b/aiohttp-3.13.2-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a09a6d073fb5789456545bdee2474d14395792faa0527887f2f4ec1a486a59d3", size = 1815823, upload-time = "2025-10-28T20:55:40.958Z" }, - { url = "https://files.pythonhosted.org/packages/a1/4f/ca2ef819488cbb41844c6cf92ca6dd15b9441e6207c58e5ae0e0fc8d70ad/aiohttp-3.13.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b59d13c443f8e049d9e94099c7e412e34610f1f49be0f230ec656a10692a5802", size = 1669374, upload-time = "2025-10-28T20:55:42.745Z" }, - { url = "https://files.pythonhosted.org/packages/f8/fe/1fe2e1179a0d91ce09c99069684aab619bf2ccde9b20bd6ca44f8837203e/aiohttp-3.13.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:20db2d67985d71ca033443a1ba2001c4b5693fe09b0e29f6d9358a99d4d62a8a", size = 1555315, upload-time = "2025-10-28T20:55:44.264Z" }, - { url = "https://files.pythonhosted.org/packages/5a/2b/f3781899b81c45d7cbc7140cddb8a3481c195e7cbff8e36374759d2ab5a5/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:960c2fc686ba27b535f9fd2b52d87ecd7e4fd1cf877f6a5cba8afb5b4a8bd204", size = 1639140, upload-time = "2025-10-28T20:55:46.626Z" }, - { url = "https://files.pythonhosted.org/packages/72/27/c37e85cd3ece6f6c772e549bd5a253d0c122557b25855fb274224811e4f2/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6c00dbcf5f0d88796151e264a8eab23de2997c9303dd7c0bf622e23b24d3ce22", size = 1645496, upload-time = "2025-10-28T20:55:48.933Z" }, - { url = "https://files.pythonhosted.org/packages/66/20/3af1ab663151bd3780b123e907761cdb86ec2c4e44b2d9b195ebc91fbe37/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fed38a5edb7945f4d1bcabe2fcd05db4f6ec7e0e82560088b754f7e08d93772d", size = 1697625, upload-time = "2025-10-28T20:55:50.377Z" }, - { url = "https://files.pythonhosted.org/packages/95/eb/ae5cab15efa365e13d56b31b0d085a62600298bf398a7986f8388f73b598/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b395bbca716c38bef3c764f187860e88c724b342c26275bc03e906142fc5964f", size = 1542025, upload-time = "2025-10-28T20:55:51.861Z" }, - { url = "https://files.pythonhosted.org/packages/e9/2d/1683e8d67ec72d911397fe4e575688d2a9b8f6a6e03c8fdc9f3fd3d4c03f/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:204ffff2426c25dfda401ba08da85f9c59525cdc42bda26660463dd1cbcfec6f", size = 1714918, upload-time = "2025-10-28T20:55:53.515Z" }, - { url = "https://files.pythonhosted.org/packages/99/a2/ffe8e0e1c57c5e542d47ffa1fcf95ef2b3ea573bf7c4d2ee877252431efc/aiohttp-3.13.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:05c4dd3c48fb5f15db31f57eb35374cb0c09afdde532e7fb70a75aede0ed30f6", size = 1656113, upload-time = "2025-10-28T20:55:55.438Z" }, - { url = "https://files.pythonhosted.org/packages/0d/42/d511aff5c3a2b06c09d7d214f508a4ad8ac7799817f7c3d23e7336b5e896/aiohttp-3.13.2-cp310-cp310-win32.whl", hash = "sha256:e574a7d61cf10351d734bcddabbe15ede0eaa8a02070d85446875dc11189a251", size = 432290, upload-time = "2025-10-28T20:55:56.96Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ea/1c2eb7098b5bad4532994f2b7a8228d27674035c9b3234fe02c37469ef14/aiohttp-3.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:364f55663085d658b8462a1c3f17b2b84a5c2e1ba858e1b79bff7b2e24ad1514", size = 455075, upload-time = "2025-10-28T20:55:58.373Z" }, - { url = "https://files.pythonhosted.org/packages/35/74/b321e7d7ca762638cdf8cdeceb39755d9c745aff7a64c8789be96ddf6e96/aiohttp-3.13.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4647d02df098f6434bafd7f32ad14942f05a9caa06c7016fdcc816f343997dd0", size = 743409, upload-time = "2025-10-28T20:56:00.354Z" }, - { url = "https://files.pythonhosted.org/packages/99/3d/91524b905ec473beaf35158d17f82ef5a38033e5809fe8742e3657cdbb97/aiohttp-3.13.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e3403f24bcb9c3b29113611c3c16a2a447c3953ecf86b79775e7be06f7ae7ccb", size = 497006, upload-time = "2025-10-28T20:56:01.85Z" }, - { url = "https://files.pythonhosted.org/packages/eb/d3/7f68bc02a67716fe80f063e19adbd80a642e30682ce74071269e17d2dba1/aiohttp-3.13.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:43dff14e35aba17e3d6d5ba628858fb8cb51e30f44724a2d2f0c75be492c55e9", size = 493195, upload-time = "2025-10-28T20:56:03.314Z" }, - { url = "https://files.pythonhosted.org/packages/98/31/913f774a4708775433b7375c4f867d58ba58ead833af96c8af3621a0d243/aiohttp-3.13.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2a9ea08e8c58bb17655630198833109227dea914cd20be660f52215f6de5613", size = 1747759, upload-time = "2025-10-28T20:56:04.904Z" }, - { url = "https://files.pythonhosted.org/packages/e8/63/04efe156f4326f31c7c4a97144f82132c3bb21859b7bb84748d452ccc17c/aiohttp-3.13.2-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53b07472f235eb80e826ad038c9d106c2f653584753f3ddab907c83f49eedead", size = 1704456, upload-time = "2025-10-28T20:56:06.986Z" }, - { url = "https://files.pythonhosted.org/packages/8e/02/4e16154d8e0a9cf4ae76f692941fd52543bbb148f02f098ca73cab9b1c1b/aiohttp-3.13.2-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e736c93e9c274fce6419af4aac199984d866e55f8a4cec9114671d0ea9688780", size = 1807572, upload-time = "2025-10-28T20:56:08.558Z" }, - { url = "https://files.pythonhosted.org/packages/34/58/b0583defb38689e7f06798f0285b1ffb3a6fb371f38363ce5fd772112724/aiohttp-3.13.2-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ff5e771f5dcbc81c64898c597a434f7682f2259e0cd666932a913d53d1341d1a", size = 1895954, upload-time = "2025-10-28T20:56:10.545Z" }, - { url = "https://files.pythonhosted.org/packages/6b/f3/083907ee3437425b4e376aa58b2c915eb1a33703ec0dc30040f7ae3368c6/aiohttp-3.13.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3b6fb0c207cc661fa0bf8c66d8d9b657331ccc814f4719468af61034b478592", size = 1747092, upload-time = "2025-10-28T20:56:12.118Z" }, - { url = "https://files.pythonhosted.org/packages/ac/61/98a47319b4e425cc134e05e5f3fc512bf9a04bf65aafd9fdcda5d57ec693/aiohttp-3.13.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:97a0895a8e840ab3520e2288db7cace3a1981300d48babeb50e7425609e2e0ab", size = 1606815, upload-time = "2025-10-28T20:56:14.191Z" }, - { url = "https://files.pythonhosted.org/packages/97/4b/e78b854d82f66bb974189135d31fce265dee0f5344f64dd0d345158a5973/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9e8f8afb552297aca127c90cb840e9a1d4bfd6a10d7d8f2d9176e1acc69bad30", size = 1723789, upload-time = "2025-10-28T20:56:16.101Z" }, - { url = "https://files.pythonhosted.org/packages/ed/fc/9d2ccc794fc9b9acd1379d625c3a8c64a45508b5091c546dea273a41929e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed2f9c7216e53c3df02264f25d824b079cc5914f9e2deba94155190ef648ee40", size = 1718104, upload-time = "2025-10-28T20:56:17.655Z" }, - { url = "https://files.pythonhosted.org/packages/66/65/34564b8765ea5c7d79d23c9113135d1dd3609173da13084830f1507d56cf/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:99c5280a329d5fa18ef30fd10c793a190d996567667908bef8a7f81f8202b948", size = 1785584, upload-time = "2025-10-28T20:56:19.238Z" }, - { url = "https://files.pythonhosted.org/packages/30/be/f6a7a426e02fc82781afd62016417b3948e2207426d90a0e478790d1c8a4/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ca6ffef405fc9c09a746cb5d019c1672cd7f402542e379afc66b370833170cf", size = 1595126, upload-time = "2025-10-28T20:56:20.836Z" }, - { url = "https://files.pythonhosted.org/packages/e5/c7/8e22d5d28f94f67d2af496f14a83b3c155d915d1fe53d94b66d425ec5b42/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:47f438b1a28e926c37632bff3c44df7d27c9b57aaf4e34b1def3c07111fdb782", size = 1800665, upload-time = "2025-10-28T20:56:22.922Z" }, - { url = "https://files.pythonhosted.org/packages/d1/11/91133c8b68b1da9fc16555706aa7276fdf781ae2bb0876c838dd86b8116e/aiohttp-3.13.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9acda8604a57bb60544e4646a4615c1866ee6c04a8edef9b8ee6fd1d8fa2ddc8", size = 1739532, upload-time = "2025-10-28T20:56:25.924Z" }, - { url = "https://files.pythonhosted.org/packages/17/6b/3747644d26a998774b21a616016620293ddefa4d63af6286f389aedac844/aiohttp-3.13.2-cp311-cp311-win32.whl", hash = "sha256:868e195e39b24aaa930b063c08bb0c17924899c16c672a28a65afded9c46c6ec", size = 431876, upload-time = "2025-10-28T20:56:27.524Z" }, - { url = "https://files.pythonhosted.org/packages/c3/63/688462108c1a00eb9f05765331c107f95ae86f6b197b865d29e930b7e462/aiohttp-3.13.2-cp311-cp311-win_amd64.whl", hash = "sha256:7fd19df530c292542636c2a9a85854fab93474396a52f1695e799186bbd7f24c", size = 456205, upload-time = "2025-10-28T20:56:29.062Z" }, - { url = "https://files.pythonhosted.org/packages/29/9b/01f00e9856d0a73260e86dd8ed0c2234a466c5c1712ce1c281548df39777/aiohttp-3.13.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b1e56bab2e12b2b9ed300218c351ee2a3d8c8fdab5b1ec6193e11a817767e47b", size = 737623, upload-time = "2025-10-28T20:56:30.797Z" }, - { url = "https://files.pythonhosted.org/packages/5a/1b/4be39c445e2b2bd0aab4ba736deb649fabf14f6757f405f0c9685019b9e9/aiohttp-3.13.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:364e25edaabd3d37b1db1f0cbcee8c73c9a3727bfa262b83e5e4cf3489a2a9dc", size = 492664, upload-time = "2025-10-28T20:56:32.708Z" }, - { url = "https://files.pythonhosted.org/packages/28/66/d35dcfea8050e131cdd731dff36434390479b4045a8d0b9d7111b0a968f1/aiohttp-3.13.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c5c94825f744694c4b8db20b71dba9a257cd2ba8e010a803042123f3a25d50d7", size = 491808, upload-time = "2025-10-28T20:56:34.57Z" }, - { url = "https://files.pythonhosted.org/packages/00/29/8e4609b93e10a853b65f8291e64985de66d4f5848c5637cddc70e98f01f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba2715d842ffa787be87cbfce150d5e88c87a98e0b62e0f5aa489169a393dbbb", size = 1738863, upload-time = "2025-10-28T20:56:36.377Z" }, - { url = "https://files.pythonhosted.org/packages/9d/fa/4ebdf4adcc0def75ced1a0d2d227577cd7b1b85beb7edad85fcc87693c75/aiohttp-3.13.2-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:585542825c4bc662221fb257889e011a5aa00f1ae4d75d1d246a5225289183e3", size = 1700586, upload-time = "2025-10-28T20:56:38.034Z" }, - { url = "https://files.pythonhosted.org/packages/da/04/73f5f02ff348a3558763ff6abe99c223381b0bace05cd4530a0258e52597/aiohttp-3.13.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:39d02cb6025fe1aabca329c5632f48c9532a3dabccd859e7e2f110668972331f", size = 1768625, upload-time = "2025-10-28T20:56:39.75Z" }, - { url = "https://files.pythonhosted.org/packages/f8/49/a825b79ffec124317265ca7d2344a86bcffeb960743487cb11988ffb3494/aiohttp-3.13.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e67446b19e014d37342f7195f592a2a948141d15a312fe0e700c2fd2f03124f6", size = 1867281, upload-time = "2025-10-28T20:56:41.471Z" }, - { url = "https://files.pythonhosted.org/packages/b9/48/adf56e05f81eac31edcfae45c90928f4ad50ef2e3ea72cb8376162a368f8/aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4356474ad6333e41ccefd39eae869ba15a6c5299c9c01dfdcfdd5c107be4363e", size = 1752431, upload-time = "2025-10-28T20:56:43.162Z" }, - { url = "https://files.pythonhosted.org/packages/30/ab/593855356eead019a74e862f21523db09c27f12fd24af72dbc3555b9bfd9/aiohttp-3.13.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eeacf451c99b4525f700f078becff32c32ec327b10dcf31306a8a52d78166de7", size = 1562846, upload-time = "2025-10-28T20:56:44.85Z" }, - { url = "https://files.pythonhosted.org/packages/39/0f/9f3d32271aa8dc35036e9668e31870a9d3b9542dd6b3e2c8a30931cb27ae/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8a9b889aeabd7a4e9af0b7f4ab5ad94d42e7ff679aaec6d0db21e3b639ad58d", size = 1699606, upload-time = "2025-10-28T20:56:46.519Z" }, - { url = "https://files.pythonhosted.org/packages/2c/3c/52d2658c5699b6ef7692a3f7128b2d2d4d9775f2a68093f74bca06cf01e1/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fa89cb11bc71a63b69568d5b8a25c3ca25b6d54c15f907ca1c130d72f320b76b", size = 1720663, upload-time = "2025-10-28T20:56:48.528Z" }, - { url = "https://files.pythonhosted.org/packages/9b/d4/8f8f3ff1fb7fb9e3f04fcad4e89d8a1cd8fc7d05de67e3de5b15b33008ff/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8aa7c807df234f693fed0ecd507192fc97692e61fee5702cdc11155d2e5cadc8", size = 1737939, upload-time = "2025-10-28T20:56:50.77Z" }, - { url = "https://files.pythonhosted.org/packages/03/d3/ddd348f8a27a634daae39a1b8e291ff19c77867af438af844bf8b7e3231b/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9eb3e33fdbe43f88c3c75fa608c25e7c47bbd80f48d012763cb67c47f39a7e16", size = 1555132, upload-time = "2025-10-28T20:56:52.568Z" }, - { url = "https://files.pythonhosted.org/packages/39/b8/46790692dc46218406f94374903ba47552f2f9f90dad554eed61bfb7b64c/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9434bc0d80076138ea986833156c5a48c9c7a8abb0c96039ddbb4afc93184169", size = 1764802, upload-time = "2025-10-28T20:56:54.292Z" }, - { url = "https://files.pythonhosted.org/packages/ba/e4/19ce547b58ab2a385e5f0b8aa3db38674785085abcf79b6e0edd1632b12f/aiohttp-3.13.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ff15c147b2ad66da1f2cbb0622313f2242d8e6e8f9b79b5206c84523a4473248", size = 1719512, upload-time = "2025-10-28T20:56:56.428Z" }, - { url = "https://files.pythonhosted.org/packages/70/30/6355a737fed29dcb6dfdd48682d5790cb5eab050f7b4e01f49b121d3acad/aiohttp-3.13.2-cp312-cp312-win32.whl", hash = "sha256:27e569eb9d9e95dbd55c0fc3ec3a9335defbf1d8bc1d20171a49f3c4c607b93e", size = 426690, upload-time = "2025-10-28T20:56:58.736Z" }, - { url = "https://files.pythonhosted.org/packages/0a/0d/b10ac09069973d112de6ef980c1f6bb31cb7dcd0bc363acbdad58f927873/aiohttp-3.13.2-cp312-cp312-win_amd64.whl", hash = "sha256:8709a0f05d59a71f33fd05c17fc11fcb8c30140506e13c2f5e8ee1b8964e1b45", size = 453465, upload-time = "2025-10-28T20:57:00.795Z" }, - { url = "https://files.pythonhosted.org/packages/bf/78/7e90ca79e5aa39f9694dcfd74f4720782d3c6828113bb1f3197f7e7c4a56/aiohttp-3.13.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7519bdc7dfc1940d201651b52bf5e03f5503bda45ad6eacf64dda98be5b2b6be", size = 732139, upload-time = "2025-10-28T20:57:02.455Z" }, - { url = "https://files.pythonhosted.org/packages/db/ed/1f59215ab6853fbaa5c8495fa6cbc39edfc93553426152b75d82a5f32b76/aiohttp-3.13.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:088912a78b4d4f547a1f19c099d5a506df17eacec3c6f4375e2831ec1d995742", size = 490082, upload-time = "2025-10-28T20:57:04.784Z" }, - { url = "https://files.pythonhosted.org/packages/68/7b/fe0fe0f5e05e13629d893c760465173a15ad0039c0a5b0d0040995c8075e/aiohttp-3.13.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5276807b9de9092af38ed23ce120539ab0ac955547b38563a9ba4f5b07b95293", size = 489035, upload-time = "2025-10-28T20:57:06.894Z" }, - { url = "https://files.pythonhosted.org/packages/d2/04/db5279e38471b7ac801d7d36a57d1230feeee130bbe2a74f72731b23c2b1/aiohttp-3.13.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1237c1375eaef0db4dcd7c2559f42e8af7b87ea7d295b118c60c36a6e61cb811", size = 1720387, upload-time = "2025-10-28T20:57:08.685Z" }, - { url = "https://files.pythonhosted.org/packages/31/07/8ea4326bd7dae2bd59828f69d7fdc6e04523caa55e4a70f4a8725a7e4ed2/aiohttp-3.13.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:96581619c57419c3d7d78703d5b78c1e5e5fc0172d60f555bdebaced82ded19a", size = 1688314, upload-time = "2025-10-28T20:57:10.693Z" }, - { url = "https://files.pythonhosted.org/packages/48/ab/3d98007b5b87ffd519d065225438cc3b668b2f245572a8cb53da5dd2b1bc/aiohttp-3.13.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2713a95b47374169409d18103366de1050fe0ea73db358fc7a7acb2880422d4", size = 1756317, upload-time = "2025-10-28T20:57:12.563Z" }, - { url = "https://files.pythonhosted.org/packages/97/3d/801ca172b3d857fafb7b50c7c03f91b72b867a13abca982ed6b3081774ef/aiohttp-3.13.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:228a1cd556b3caca590e9511a89444925da87d35219a49ab5da0c36d2d943a6a", size = 1858539, upload-time = "2025-10-28T20:57:14.623Z" }, - { url = "https://files.pythonhosted.org/packages/f7/0d/4764669bdf47bd472899b3d3db91fffbe925c8e3038ec591a2fd2ad6a14d/aiohttp-3.13.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac6cde5fba8d7d8c6ac963dbb0256a9854e9fafff52fbcc58fdf819357892c3e", size = 1739597, upload-time = "2025-10-28T20:57:16.399Z" }, - { url = "https://files.pythonhosted.org/packages/c4/52/7bd3c6693da58ba16e657eb904a5b6decfc48ecd06e9ac098591653b1566/aiohttp-3.13.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2bef8237544f4e42878c61cef4e2839fee6346dc60f5739f876a9c50be7fcdb", size = 1555006, upload-time = "2025-10-28T20:57:18.288Z" }, - { url = "https://files.pythonhosted.org/packages/48/30/9586667acec5993b6f41d2ebcf96e97a1255a85f62f3c653110a5de4d346/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:16f15a4eac3bc2d76c45f7ebdd48a65d41b242eb6c31c2245463b40b34584ded", size = 1683220, upload-time = "2025-10-28T20:57:20.241Z" }, - { url = "https://files.pythonhosted.org/packages/71/01/3afe4c96854cfd7b30d78333852e8e851dceaec1c40fd00fec90c6402dd2/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:bb7fb776645af5cc58ab804c58d7eba545a97e047254a52ce89c157b5af6cd0b", size = 1712570, upload-time = "2025-10-28T20:57:22.253Z" }, - { url = "https://files.pythonhosted.org/packages/11/2c/22799d8e720f4697a9e66fd9c02479e40a49de3de2f0bbe7f9f78a987808/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e1b4951125ec10c70802f2cb09736c895861cd39fd9dcb35107b4dc8ae6220b8", size = 1733407, upload-time = "2025-10-28T20:57:24.37Z" }, - { url = "https://files.pythonhosted.org/packages/34/cb/90f15dd029f07cebbd91f8238a8b363978b530cd128488085b5703683594/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:550bf765101ae721ee1d37d8095f47b1f220650f85fe1af37a90ce75bab89d04", size = 1550093, upload-time = "2025-10-28T20:57:26.257Z" }, - { url = "https://files.pythonhosted.org/packages/69/46/12dce9be9d3303ecbf4d30ad45a7683dc63d90733c2d9fe512be6716cd40/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fe91b87fc295973096251e2d25a811388e7d8adf3bd2b97ef6ae78bc4ac6c476", size = 1758084, upload-time = "2025-10-28T20:57:28.349Z" }, - { url = "https://files.pythonhosted.org/packages/f9/c8/0932b558da0c302ffd639fc6362a313b98fdf235dc417bc2493da8394df7/aiohttp-3.13.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e0c8e31cfcc4592cb200160344b2fb6ae0f9e4effe06c644b5a125d4ae5ebe23", size = 1716987, upload-time = "2025-10-28T20:57:30.233Z" }, - { url = "https://files.pythonhosted.org/packages/5d/8b/f5bd1a75003daed099baec373aed678f2e9b34f2ad40d85baa1368556396/aiohttp-3.13.2-cp313-cp313-win32.whl", hash = "sha256:0740f31a60848d6edb296a0df827473eede90c689b8f9f2a4cdde74889eb2254", size = 425859, upload-time = "2025-10-28T20:57:32.105Z" }, - { url = "https://files.pythonhosted.org/packages/5d/28/a8a9fc6957b2cee8902414e41816b5ab5536ecf43c3b1843c10e82c559b2/aiohttp-3.13.2-cp313-cp313-win_amd64.whl", hash = "sha256:a88d13e7ca367394908f8a276b89d04a3652044612b9a408a0bb22a5ed976a1a", size = 452192, upload-time = "2025-10-28T20:57:34.166Z" }, - { url = "https://files.pythonhosted.org/packages/9b/36/e2abae1bd815f01c957cbf7be817b3043304e1c87bad526292a0410fdcf9/aiohttp-3.13.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2475391c29230e063ef53a66669b7b691c9bfc3f1426a0f7bcdf1216bdbac38b", size = 735234, upload-time = "2025-10-28T20:57:36.415Z" }, - { url = "https://files.pythonhosted.org/packages/ca/e3/1ee62dde9b335e4ed41db6bba02613295a0d5b41f74a783c142745a12763/aiohttp-3.13.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:f33c8748abef4d8717bb20e8fb1b3e07c6adacb7fd6beaae971a764cf5f30d61", size = 490733, upload-time = "2025-10-28T20:57:38.205Z" }, - { url = "https://files.pythonhosted.org/packages/1a/aa/7a451b1d6a04e8d15a362af3e9b897de71d86feac3babf8894545d08d537/aiohttp-3.13.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ae32f24bbfb7dbb485a24b30b1149e2f200be94777232aeadba3eecece4d0aa4", size = 491303, upload-time = "2025-10-28T20:57:40.122Z" }, - { url = "https://files.pythonhosted.org/packages/57/1e/209958dbb9b01174870f6a7538cd1f3f28274fdbc88a750c238e2c456295/aiohttp-3.13.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d7f02042c1f009ffb70067326ef183a047425bb2ff3bc434ead4dd4a4a66a2b", size = 1717965, upload-time = "2025-10-28T20:57:42.28Z" }, - { url = "https://files.pythonhosted.org/packages/08/aa/6a01848d6432f241416bc4866cae8dc03f05a5a884d2311280f6a09c73d6/aiohttp-3.13.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:93655083005d71cd6c072cdab54c886e6570ad2c4592139c3fb967bfc19e4694", size = 1667221, upload-time = "2025-10-28T20:57:44.869Z" }, - { url = "https://files.pythonhosted.org/packages/87/4f/36c1992432d31bbc789fa0b93c768d2e9047ec8c7177e5cd84ea85155f36/aiohttp-3.13.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0db1e24b852f5f664cd728db140cf11ea0e82450471232a394b3d1a540b0f906", size = 1757178, upload-time = "2025-10-28T20:57:47.216Z" }, - { url = "https://files.pythonhosted.org/packages/ac/b4/8e940dfb03b7e0f68a82b88fd182b9be0a65cb3f35612fe38c038c3112cf/aiohttp-3.13.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b009194665bcd128e23eaddef362e745601afa4641930848af4c8559e88f18f9", size = 1838001, upload-time = "2025-10-28T20:57:49.337Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ef/39f3448795499c440ab66084a9db7d20ca7662e94305f175a80f5b7e0072/aiohttp-3.13.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c038a8fdc8103cd51dbd986ecdce141473ffd9775a7a8057a6ed9c3653478011", size = 1716325, upload-time = "2025-10-28T20:57:51.327Z" }, - { url = "https://files.pythonhosted.org/packages/d7/51/b311500ffc860b181c05d91c59a1313bdd05c82960fdd4035a15740d431e/aiohttp-3.13.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:66bac29b95a00db411cd758fea0e4b9bdba6d549dfe333f9a945430f5f2cc5a6", size = 1547978, upload-time = "2025-10-28T20:57:53.554Z" }, - { url = "https://files.pythonhosted.org/packages/31/64/b9d733296ef79815226dab8c586ff9e3df41c6aff2e16c06697b2d2e6775/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4ebf9cfc9ba24a74cf0718f04aac2a3bbe745902cc7c5ebc55c0f3b5777ef213", size = 1682042, upload-time = "2025-10-28T20:57:55.617Z" }, - { url = "https://files.pythonhosted.org/packages/3f/30/43d3e0f9d6473a6db7d472104c4eff4417b1e9df01774cb930338806d36b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a4b88ebe35ce54205c7074f7302bd08a4cb83256a3e0870c72d6f68a3aaf8e49", size = 1680085, upload-time = "2025-10-28T20:57:57.59Z" }, - { url = "https://files.pythonhosted.org/packages/16/51/c709f352c911b1864cfd1087577760ced64b3e5bee2aa88b8c0c8e2e4972/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:98c4fb90bb82b70a4ed79ca35f656f4281885be076f3f970ce315402b53099ae", size = 1728238, upload-time = "2025-10-28T20:57:59.525Z" }, - { url = "https://files.pythonhosted.org/packages/19/e2/19bd4c547092b773caeb48ff5ae4b1ae86756a0ee76c16727fcfd281404b/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:ec7534e63ae0f3759df3a1ed4fa6bc8f75082a924b590619c0dd2f76d7043caa", size = 1544395, upload-time = "2025-10-28T20:58:01.914Z" }, - { url = "https://files.pythonhosted.org/packages/cf/87/860f2803b27dfc5ed7be532832a3498e4919da61299b4a1f8eb89b8ff44d/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5b927cf9b935a13e33644cbed6c8c4b2d0f25b713d838743f8fe7191b33829c4", size = 1742965, upload-time = "2025-10-28T20:58:03.972Z" }, - { url = "https://files.pythonhosted.org/packages/67/7f/db2fc7618925e8c7a601094d5cbe539f732df4fb570740be88ed9e40e99a/aiohttp-3.13.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:88d6c017966a78c5265d996c19cdb79235be5e6412268d7e2ce7dee339471b7a", size = 1697585, upload-time = "2025-10-28T20:58:06.189Z" }, - { url = "https://files.pythonhosted.org/packages/0c/07/9127916cb09bb38284db5036036042b7b2c514c8ebaeee79da550c43a6d6/aiohttp-3.13.2-cp314-cp314-win32.whl", hash = "sha256:f7c183e786e299b5d6c49fb43a769f8eb8e04a2726a2bd5887b98b5cc2d67940", size = 431621, upload-time = "2025-10-28T20:58:08.636Z" }, - { url = "https://files.pythonhosted.org/packages/fb/41/554a8a380df6d3a2bba8a7726429a23f4ac62aaf38de43bb6d6cde7b4d4d/aiohttp-3.13.2-cp314-cp314-win_amd64.whl", hash = "sha256:fe242cd381e0fb65758faf5ad96c2e460df6ee5b2de1072fe97e4127927e00b4", size = 457627, upload-time = "2025-10-28T20:58:11Z" }, - { url = "https://files.pythonhosted.org/packages/c7/8e/3824ef98c039d3951cb65b9205a96dd2b20f22241ee17d89c5701557c826/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:f10d9c0b0188fe85398c61147bbd2a657d616c876863bfeff43376e0e3134673", size = 767360, upload-time = "2025-10-28T20:58:13.358Z" }, - { url = "https://files.pythonhosted.org/packages/a4/0f/6a03e3fc7595421274fa34122c973bde2d89344f8a881b728fa8c774e4f1/aiohttp-3.13.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e7c952aefdf2460f4ae55c5e9c3e80aa72f706a6317e06020f80e96253b1accd", size = 504616, upload-time = "2025-10-28T20:58:15.339Z" }, - { url = "https://files.pythonhosted.org/packages/c6/aa/ed341b670f1bc8a6f2c6a718353d13b9546e2cef3544f573c6a1ff0da711/aiohttp-3.13.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c20423ce14771d98353d2e25e83591fa75dfa90a3c1848f3d7c68243b4fbded3", size = 509131, upload-time = "2025-10-28T20:58:17.693Z" }, - { url = "https://files.pythonhosted.org/packages/7f/f0/c68dac234189dae5c4bbccc0f96ce0cc16b76632cfc3a08fff180045cfa4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e96eb1a34396e9430c19d8338d2ec33015e4a87ef2b4449db94c22412e25ccdf", size = 1864168, upload-time = "2025-10-28T20:58:20.113Z" }, - { url = "https://files.pythonhosted.org/packages/8f/65/75a9a76db8364b5d0e52a0c20eabc5d52297385d9af9c35335b924fafdee/aiohttp-3.13.2-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:23fb0783bc1a33640036465019d3bba069942616a6a2353c6907d7fe1ccdaf4e", size = 1719200, upload-time = "2025-10-28T20:58:22.583Z" }, - { url = "https://files.pythonhosted.org/packages/f5/55/8df2ed78d7f41d232f6bd3ff866b6f617026551aa1d07e2f03458f964575/aiohttp-3.13.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1a9bea6244a1d05a4e57c295d69e159a5c50d8ef16aa390948ee873478d9a5", size = 1843497, upload-time = "2025-10-28T20:58:24.672Z" }, - { url = "https://files.pythonhosted.org/packages/e9/e0/94d7215e405c5a02ccb6a35c7a3a6cfff242f457a00196496935f700cde5/aiohttp-3.13.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0a3d54e822688b56e9f6b5816fb3de3a3a64660efac64e4c2dc435230ad23bad", size = 1935703, upload-time = "2025-10-28T20:58:26.758Z" }, - { url = "https://files.pythonhosted.org/packages/0b/78/1eeb63c3f9b2d1015a4c02788fb543141aad0a03ae3f7a7b669b2483f8d4/aiohttp-3.13.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a653d872afe9f33497215745da7a943d1dc15b728a9c8da1c3ac423af35178e", size = 1792738, upload-time = "2025-10-28T20:58:29.787Z" }, - { url = "https://files.pythonhosted.org/packages/41/75/aaf1eea4c188e51538c04cc568040e3082db263a57086ea74a7d38c39e42/aiohttp-3.13.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:56d36e80d2003fa3fc0207fac644216d8532e9504a785ef9a8fd013f84a42c61", size = 1624061, upload-time = "2025-10-28T20:58:32.529Z" }, - { url = "https://files.pythonhosted.org/packages/9b/c2/3b6034de81fbcc43de8aeb209073a2286dfb50b86e927b4efd81cf848197/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:78cd586d8331fb8e241c2dd6b2f4061778cc69e150514b39a9e28dd050475661", size = 1789201, upload-time = "2025-10-28T20:58:34.618Z" }, - { url = "https://files.pythonhosted.org/packages/c9/38/c15dcf6d4d890217dae79d7213988f4e5fe6183d43893a9cf2fe9e84ca8d/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:20b10bbfbff766294fe99987f7bb3b74fdd2f1a2905f2562132641ad434dcf98", size = 1776868, upload-time = "2025-10-28T20:58:38.835Z" }, - { url = "https://files.pythonhosted.org/packages/04/75/f74fd178ac81adf4f283a74847807ade5150e48feda6aef024403716c30c/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9ec49dff7e2b3c85cdeaa412e9d438f0ecd71676fde61ec57027dd392f00c693", size = 1790660, upload-time = "2025-10-28T20:58:41.507Z" }, - { url = "https://files.pythonhosted.org/packages/e7/80/7368bd0d06b16b3aba358c16b919e9c46cf11587dc572091031b0e9e3ef0/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:94f05348c4406450f9d73d38efb41d669ad6cd90c7ee194810d0eefbfa875a7a", size = 1617548, upload-time = "2025-10-28T20:58:43.674Z" }, - { url = "https://files.pythonhosted.org/packages/7d/4b/a6212790c50483cb3212e507378fbe26b5086d73941e1ec4b56a30439688/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:fa4dcb605c6f82a80c7f95713c2b11c3b8e9893b3ebd2bc9bde93165ed6107be", size = 1817240, upload-time = "2025-10-28T20:58:45.787Z" }, - { url = "https://files.pythonhosted.org/packages/ff/f7/ba5f0ba4ea8d8f3c32850912944532b933acbf0f3a75546b89269b9b7dde/aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cf00e5db968c3f67eccd2778574cf64d8b27d95b237770aa32400bd7a1ca4f6c", size = 1762334, upload-time = "2025-10-28T20:58:47.936Z" }, - { url = "https://files.pythonhosted.org/packages/7e/83/1a5a1856574588b1cad63609ea9ad75b32a8353ac995d830bf5da9357364/aiohttp-3.13.2-cp314-cp314t-win32.whl", hash = "sha256:d23b5fe492b0805a50d3371e8a728a9134d8de5447dce4c885f5587294750734", size = 464685, upload-time = "2025-10-28T20:58:50.642Z" }, - { url = "https://files.pythonhosted.org/packages/9f/4d/d22668674122c08f4d56972297c51a624e64b3ed1efaa40187607a7cb66e/aiohttp-3.13.2-cp314-cp314t-win_amd64.whl", hash = "sha256:ff0a7b0a82a7ab905cbda74006318d1b12e37c797eb1b0d4eb3e316cf47f658f", size = 498093, upload-time = "2025-10-28T20:58:52.782Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/d6/5aec9313ee6ea9c7cde8b891b69f4ff4001416867104580670a31daeba5b/aiohttp-3.13.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d5a372fd5afd301b3a89582817fdcdb6c34124787c70dbcc616f259013e7eef7", size = 738950, upload-time = "2026-01-03T17:29:13.002Z" }, + { url = "https://files.pythonhosted.org/packages/68/03/8fa90a7e6d11ff20a18837a8e2b5dd23db01aabc475aa9271c8ad33299f5/aiohttp-3.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:147e422fd1223005c22b4fe080f5d93ced44460f5f9c105406b753612b587821", size = 496099, upload-time = "2026-01-03T17:29:15.268Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/b81f744d402510a8366b74eb420fc0cc1170d0c43daca12d10814df85f10/aiohttp-3.13.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:859bd3f2156e81dd01432f5849fc73e2243d4a487c4fd26609b1299534ee1845", size = 491072, upload-time = "2026-01-03T17:29:16.922Z" }, + { url = "https://files.pythonhosted.org/packages/d5/e1/56d1d1c0dd334cd203dd97706ce004c1aa24b34a813b0b8daf3383039706/aiohttp-3.13.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dca68018bf48c251ba17c72ed479f4dafe9dbd5a73707ad8d28a38d11f3d42af", size = 1671588, upload-time = "2026-01-03T17:29:18.539Z" }, + { url = "https://files.pythonhosted.org/packages/5f/34/8d7f962604f4bc2b4e39eb1220dac7d4e4cba91fb9ba0474b4ecd67db165/aiohttp-3.13.3-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fee0c6bc7db1de362252affec009707a17478a00ec69f797d23ca256e36d5940", size = 1640334, upload-time = "2026-01-03T17:29:21.028Z" }, + { url = "https://files.pythonhosted.org/packages/94/1d/fcccf2c668d87337ddeef9881537baee13c58d8f01f12ba8a24215f2b804/aiohttp-3.13.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c048058117fd649334d81b4b526e94bde3ccaddb20463a815ced6ecbb7d11160", size = 1722656, upload-time = "2026-01-03T17:29:22.531Z" }, + { url = "https://files.pythonhosted.org/packages/aa/98/c6f3b081c4c606bc1e5f2ec102e87d6411c73a9ef3616fea6f2d5c98c062/aiohttp-3.13.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:215a685b6fbbfcf71dfe96e3eba7a6f58f10da1dfdf4889c7dd856abe430dca7", size = 1817625, upload-time = "2026-01-03T17:29:24.276Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c0/cfcc3d2e11b477f86e1af2863f3858c8850d751ce8dc39c4058a072c9e54/aiohttp-3.13.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2c184bb1fe2cbd2cefba613e9db29a5ab559323f994b6737e370d3da0ac455", size = 1672604, upload-time = "2026-01-03T17:29:26.099Z" }, + { url = "https://files.pythonhosted.org/packages/1e/77/6b4ffcbcac4c6a5d041343a756f34a6dd26174ae07f977a64fe028dda5b0/aiohttp-3.13.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:75ca857eba4e20ce9f546cd59c7007b33906a4cd48f2ff6ccf1ccfc3b646f279", size = 1554370, upload-time = "2026-01-03T17:29:28.121Z" }, + { url = "https://files.pythonhosted.org/packages/f2/f0/e3ddfa93f17d689dbe014ba048f18e0c9f9b456033b70e94349a2e9048be/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81e97251d9298386c2b7dbeb490d3d1badbdc69107fb8c9299dd04eb39bddc0e", size = 1642023, upload-time = "2026-01-03T17:29:30.002Z" }, + { url = "https://files.pythonhosted.org/packages/eb/45/c14019c9ec60a8e243d06d601b33dcc4fd92379424bde3021725859d7f99/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c0e2d366af265797506f0283487223146af57815b388623f0357ef7eac9b209d", size = 1649680, upload-time = "2026-01-03T17:29:31.782Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fd/09c9451dae5aa5c5ed756df95ff9ef549d45d4be663bafd1e4954fd836f0/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4e239d501f73d6db1522599e14b9b321a7e3b1de66ce33d53a765d975e9f4808", size = 1692407, upload-time = "2026-01-03T17:29:33.392Z" }, + { url = "https://files.pythonhosted.org/packages/a6/81/938bc2ec33c10efd6637ccb3d22f9f3160d08e8f3aa2587a2c2d5ab578eb/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0db318f7a6f065d84cb1e02662c526294450b314a02bd9e2a8e67f0d8564ce40", size = 1543047, upload-time = "2026-01-03T17:29:34.855Z" }, + { url = "https://files.pythonhosted.org/packages/f7/23/80488ee21c8d567c83045e412e1d9b7077d27171591a4eb7822586e8c06a/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:bfc1cc2fe31a6026a8a88e4ecfb98d7f6b1fec150cfd708adbfd1d2f42257c29", size = 1715264, upload-time = "2026-01-03T17:29:36.389Z" }, + { url = "https://files.pythonhosted.org/packages/e2/83/259a8da6683182768200b368120ab3deff5370bed93880fb9a3a86299f34/aiohttp-3.13.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af71fff7bac6bb7508956696dce8f6eec2bbb045eceb40343944b1ae62b5ef11", size = 1657275, upload-time = "2026-01-03T17:29:38.162Z" }, + { url = "https://files.pythonhosted.org/packages/3f/4f/2c41f800a0b560785c10fb316216ac058c105f9be50bdc6a285de88db625/aiohttp-3.13.3-cp310-cp310-win32.whl", hash = "sha256:37da61e244d1749798c151421602884db5270faf479cf0ef03af0ff68954c9dd", size = 434053, upload-time = "2026-01-03T17:29:40.074Z" }, + { url = "https://files.pythonhosted.org/packages/80/df/29cd63c7ecfdb65ccc12f7d808cac4fa2a19544660c06c61a4a48462de0c/aiohttp-3.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:7e63f210bc1b57ef699035f2b4b6d9ce096b5914414a49b0997c839b2bd2223c", size = 456687, upload-time = "2026-01-03T17:29:41.819Z" }, + { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, + { url = "https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, + { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, + { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, + { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, + { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, + { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 1804590, upload-time = "2026-01-03T17:30:07.135Z" }, + { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, + { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, + { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, + { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, + { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, + { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" }, + { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, + { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" }, + { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = "2026-01-03T17:30:41.081Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, + { url = "https://files.pythonhosted.org/packages/97/8a/12ca489246ca1faaf5432844adbfce7ff2cc4997733e0af120869345643a/aiohttp-3.13.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5dff64413671b0d3e7d5918ea490bdccb97a4ad29b3f311ed423200b2203e01c", size = 734190, upload-time = "2026-01-03T17:30:45.832Z" }, + { url = "https://files.pythonhosted.org/packages/32/08/de43984c74ed1fca5c014808963cc83cb00d7bb06af228f132d33862ca76/aiohttp-3.13.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:87b9aab6d6ed88235aa2970294f496ff1a1f9adcd724d800e9b952395a80ffd9", size = 491783, upload-time = "2026-01-03T17:30:47.466Z" }, + { url = "https://files.pythonhosted.org/packages/17/f8/8dd2cf6112a5a76f81f81a5130c57ca829d101ad583ce57f889179accdda/aiohttp-3.13.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:425c126c0dc43861e22cb1c14ba4c8e45d09516d0a3ae0a3f7494b79f5f233a3", size = 490704, upload-time = "2026-01-03T17:30:49.373Z" }, + { url = "https://files.pythonhosted.org/packages/6d/40/a46b03ca03936f832bc7eaa47cfbb1ad012ba1be4790122ee4f4f8cba074/aiohttp-3.13.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f9120f7093c2a32d9647abcaf21e6ad275b4fbec5b55969f978b1a97c7c86bf", size = 1720652, upload-time = "2026-01-03T17:30:50.974Z" }, + { url = "https://files.pythonhosted.org/packages/f7/7e/917fe18e3607af92657e4285498f500dca797ff8c918bd7d90b05abf6c2a/aiohttp-3.13.3-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:697753042d57f4bf7122cab985bf15d0cef23c770864580f5af4f52023a56bd6", size = 1692014, upload-time = "2026-01-03T17:30:52.729Z" }, + { url = "https://files.pythonhosted.org/packages/71/b6/cefa4cbc00d315d68973b671cf105b21a609c12b82d52e5d0c9ae61d2a09/aiohttp-3.13.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6de499a1a44e7de70735d0b39f67c8f25eb3d91eb3103be99ca0fa882cdd987d", size = 1759777, upload-time = "2026-01-03T17:30:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/fb/e3/e06ee07b45e59e6d81498b591fc589629be1553abb2a82ce33efe2a7b068/aiohttp-3.13.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:37239e9f9a7ea9ac5bf6b92b0260b01f8a22281996da609206a84df860bc1261", size = 1861276, upload-time = "2026-01-03T17:30:56.512Z" }, + { url = "https://files.pythonhosted.org/packages/7c/24/75d274228acf35ceeb2850b8ce04de9dd7355ff7a0b49d607ee60c29c518/aiohttp-3.13.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f76c1e3fe7d7c8afad7ed193f89a292e1999608170dcc9751a7462a87dfd5bc0", size = 1743131, upload-time = "2026-01-03T17:30:58.256Z" }, + { url = "https://files.pythonhosted.org/packages/04/98/3d21dde21889b17ca2eea54fdcff21b27b93f45b7bb94ca029c31ab59dc3/aiohttp-3.13.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fc290605db2a917f6e81b0e1e0796469871f5af381ce15c604a3c5c7e51cb730", size = 1556863, upload-time = "2026-01-03T17:31:00.445Z" }, + { url = "https://files.pythonhosted.org/packages/9e/84/da0c3ab1192eaf64782b03971ab4055b475d0db07b17eff925e8c93b3aa5/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4021b51936308aeea0367b8f006dc999ca02bc118a0cc78c303f50a2ff6afb91", size = 1682793, upload-time = "2026-01-03T17:31:03.024Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0f/5802ada182f575afa02cbd0ec5180d7e13a402afb7c2c03a9aa5e5d49060/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:49a03727c1bba9a97d3e93c9f93ca03a57300f484b6e935463099841261195d3", size = 1716676, upload-time = "2026-01-03T17:31:04.842Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8c/714d53bd8b5a4560667f7bbbb06b20c2382f9c7847d198370ec6526af39c/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3d9908a48eb7416dc1f4524e69f1d32e5d90e3981e4e37eb0aa1cd18f9cfa2a4", size = 1733217, upload-time = "2026-01-03T17:31:06.868Z" }, + { url = "https://files.pythonhosted.org/packages/7d/79/e2176f46d2e963facea939f5be2d26368ce543622be6f00a12844d3c991f/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2712039939ec963c237286113c68dbad80a82a4281543f3abf766d9d73228998", size = 1552303, upload-time = "2026-01-03T17:31:08.958Z" }, + { url = "https://files.pythonhosted.org/packages/ab/6a/28ed4dea1759916090587d1fe57087b03e6c784a642b85ef48217b0277ae/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7bfdc049127717581866fa4708791220970ce291c23e28ccf3922c700740fdc0", size = 1763673, upload-time = "2026-01-03T17:31:10.676Z" }, + { url = "https://files.pythonhosted.org/packages/e8/35/4a3daeb8b9fab49240d21c04d50732313295e4bd813a465d840236dd0ce1/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8057c98e0c8472d8846b9c79f56766bcc57e3e8ac7bfd510482332366c56c591", size = 1721120, upload-time = "2026-01-03T17:31:12.575Z" }, + { url = "https://files.pythonhosted.org/packages/bc/9f/d643bb3c5fb99547323e635e251c609fbbc660d983144cfebec529e09264/aiohttp-3.13.3-cp313-cp313-win32.whl", hash = "sha256:1449ceddcdbcf2e0446957863af03ebaaa03f94c090f945411b61269e2cb5daf", size = 427383, upload-time = "2026-01-03T17:31:14.382Z" }, + { url = "https://files.pythonhosted.org/packages/4e/f1/ab0395f8a79933577cdd996dd2f9aa6014af9535f65dddcf88204682fe62/aiohttp-3.13.3-cp313-cp313-win_amd64.whl", hash = "sha256:693781c45a4033d31d4187d2436f5ac701e7bbfe5df40d917736108c1cc7436e", size = 453899, upload-time = "2026-01-03T17:31:15.958Z" }, + { url = "https://files.pythonhosted.org/packages/99/36/5b6514a9f5d66f4e2597e40dea2e3db271e023eb7a5d22defe96ba560996/aiohttp-3.13.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:ea37047c6b367fd4bd632bff8077449b8fa034b69e812a18e0132a00fae6e808", size = 737238, upload-time = "2026-01-03T17:31:17.909Z" }, + { url = "https://files.pythonhosted.org/packages/f7/49/459327f0d5bcd8c6c9ca69e60fdeebc3622861e696490d8674a6d0cb90a6/aiohttp-3.13.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6fc0e2337d1a4c3e6acafda6a78a39d4c14caea625124817420abceed36e2415", size = 492292, upload-time = "2026-01-03T17:31:19.919Z" }, + { url = "https://files.pythonhosted.org/packages/e8/0b/b97660c5fd05d3495b4eb27f2d0ef18dc1dc4eff7511a9bf371397ff0264/aiohttp-3.13.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c685f2d80bb67ca8c3837823ad76196b3694b0159d232206d1e461d3d434666f", size = 493021, upload-time = "2026-01-03T17:31:21.636Z" }, + { url = "https://files.pythonhosted.org/packages/54/d4/438efabdf74e30aeceb890c3290bbaa449780583b1270b00661126b8aae4/aiohttp-3.13.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e377758516d262bde50c2584fc6c578af272559c409eecbdd2bae1601184d6", size = 1717263, upload-time = "2026-01-03T17:31:23.296Z" }, + { url = "https://files.pythonhosted.org/packages/71/f2/7bddc7fd612367d1459c5bcf598a9e8f7092d6580d98de0e057eb42697ad/aiohttp-3.13.3-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:34749271508078b261c4abb1767d42b8d0c0cc9449c73a4df494777dc55f0687", size = 1669107, upload-time = "2026-01-03T17:31:25.334Z" }, + { url = "https://files.pythonhosted.org/packages/00/5a/1aeaecca40e22560f97610a329e0e5efef5e0b5afdf9f857f0d93839ab2e/aiohttp-3.13.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:82611aeec80eb144416956ec85b6ca45a64d76429c1ed46ae1b5f86c6e0c9a26", size = 1760196, upload-time = "2026-01-03T17:31:27.394Z" }, + { url = "https://files.pythonhosted.org/packages/f8/f8/0ff6992bea7bd560fc510ea1c815f87eedd745fe035589c71ce05612a19a/aiohttp-3.13.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2fff83cfc93f18f215896e3a190e8e5cb413ce01553901aca925176e7568963a", size = 1843591, upload-time = "2026-01-03T17:31:29.238Z" }, + { url = "https://files.pythonhosted.org/packages/e3/d1/e30e537a15f53485b61f5be525f2157da719819e8377298502aebac45536/aiohttp-3.13.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bbe7d4cecacb439e2e2a8a1a7b935c25b812af7a5fd26503a66dadf428e79ec1", size = 1720277, upload-time = "2026-01-03T17:31:31.053Z" }, + { url = "https://files.pythonhosted.org/packages/84/45/23f4c451d8192f553d38d838831ebbc156907ea6e05557f39563101b7717/aiohttp-3.13.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b928f30fe49574253644b1ca44b1b8adbd903aa0da4b9054a6c20fc7f4092a25", size = 1548575, upload-time = "2026-01-03T17:31:32.87Z" }, + { url = "https://files.pythonhosted.org/packages/6a/ed/0a42b127a43712eda7807e7892c083eadfaf8429ca8fb619662a530a3aab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7b5e8fe4de30df199155baaf64f2fcd604f4c678ed20910db8e2c66dc4b11603", size = 1679455, upload-time = "2026-01-03T17:31:34.76Z" }, + { url = "https://files.pythonhosted.org/packages/2e/b5/c05f0c2b4b4fe2c9d55e73b6d3ed4fd6c9dc2684b1d81cbdf77e7fad9adb/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:8542f41a62bcc58fc7f11cf7c90e0ec324ce44950003feb70640fc2a9092c32a", size = 1687417, upload-time = "2026-01-03T17:31:36.699Z" }, + { url = "https://files.pythonhosted.org/packages/c9/6b/915bc5dad66aef602b9e459b5a973529304d4e89ca86999d9d75d80cbd0b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5e1d8c8b8f1d91cd08d8f4a3c2b067bfca6ec043d3ff36de0f3a715feeedf926", size = 1729968, upload-time = "2026-01-03T17:31:38.622Z" }, + { url = "https://files.pythonhosted.org/packages/11/3b/e84581290a9520024a08640b63d07673057aec5ca548177a82026187ba73/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:90455115e5da1c3c51ab619ac57f877da8fd6d73c05aacd125c5ae9819582aba", size = 1545690, upload-time = "2026-01-03T17:31:40.57Z" }, + { url = "https://files.pythonhosted.org/packages/f5/04/0c3655a566c43fd647c81b895dfe361b9f9ad6d58c19309d45cff52d6c3b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:042e9e0bcb5fba81886c8b4fbb9a09d6b8a00245fd8d88e4d989c1f96c74164c", size = 1746390, upload-time = "2026-01-03T17:31:42.857Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/71165b26978f719c3419381514c9690bd5980e764a09440a10bb816ea4ab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2eb752b102b12a76ca02dff751a801f028b4ffbbc478840b473597fc91a9ed43", size = 1702188, upload-time = "2026-01-03T17:31:44.984Z" }, + { url = "https://files.pythonhosted.org/packages/29/a7/cbe6c9e8e136314fa1980da388a59d2f35f35395948a08b6747baebb6aa6/aiohttp-3.13.3-cp314-cp314-win32.whl", hash = "sha256:b556c85915d8efaed322bf1bdae9486aa0f3f764195a0fb6ee962e5c71ef5ce1", size = 433126, upload-time = "2026-01-03T17:31:47.463Z" }, + { url = "https://files.pythonhosted.org/packages/de/56/982704adea7d3b16614fc5936014e9af85c0e34b58f9046655817f04306e/aiohttp-3.13.3-cp314-cp314-win_amd64.whl", hash = "sha256:9bf9f7a65e7aa20dd764151fb3d616c81088f91f8df39c3893a536e279b4b984", size = 459128, upload-time = "2026-01-03T17:31:49.2Z" }, + { url = "https://files.pythonhosted.org/packages/6c/2a/3c79b638a9c3d4658d345339d22070241ea341ed4e07b5ac60fb0f418003/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:05861afbbec40650d8a07ea324367cb93e9e8cc7762e04dd4405df99fa65159c", size = 769512, upload-time = "2026-01-03T17:31:51.134Z" }, + { url = "https://files.pythonhosted.org/packages/29/b9/3e5014d46c0ab0db8707e0ac2711ed28c4da0218c358a4e7c17bae0d8722/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2fc82186fadc4a8316768d61f3722c230e2c1dcab4200d52d2ebdf2482e47592", size = 506444, upload-time = "2026-01-03T17:31:52.85Z" }, + { url = "https://files.pythonhosted.org/packages/90/03/c1d4ef9a054e151cd7839cdc497f2638f00b93cbe8043983986630d7a80c/aiohttp-3.13.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0add0900ff220d1d5c5ebbf99ed88b0c1bbf87aa7e4262300ed1376a6b13414f", size = 510798, upload-time = "2026-01-03T17:31:54.91Z" }, + { url = "https://files.pythonhosted.org/packages/ea/76/8c1e5abbfe8e127c893fe7ead569148a4d5a799f7cf958d8c09f3eedf097/aiohttp-3.13.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:568f416a4072fbfae453dcf9a99194bbb8bdeab718e08ee13dfa2ba0e4bebf29", size = 1868835, upload-time = "2026-01-03T17:31:56.733Z" }, + { url = "https://files.pythonhosted.org/packages/8e/ac/984c5a6f74c363b01ff97adc96a3976d9c98940b8969a1881575b279ac5d/aiohttp-3.13.3-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:add1da70de90a2569c5e15249ff76a631ccacfe198375eead4aadf3b8dc849dc", size = 1720486, upload-time = "2026-01-03T17:31:58.65Z" }, + { url = "https://files.pythonhosted.org/packages/b2/9a/b7039c5f099c4eb632138728828b33428585031a1e658d693d41d07d89d1/aiohttp-3.13.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b47b7ba335d2e9b1239fa571131a87e2d8ec96b333e68b2a305e7a98b0bae2", size = 1847951, upload-time = "2026-01-03T17:32:00.989Z" }, + { url = "https://files.pythonhosted.org/packages/3c/02/3bec2b9a1ba3c19ff89a43a19324202b8eb187ca1e928d8bdac9bbdddebd/aiohttp-3.13.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3dd4dce1c718e38081c8f35f323209d4c1df7d4db4bab1b5c88a6b4d12b74587", size = 1941001, upload-time = "2026-01-03T17:32:03.122Z" }, + { url = "https://files.pythonhosted.org/packages/37/df/d879401cedeef27ac4717f6426c8c36c3091c6e9f08a9178cc87549c537f/aiohttp-3.13.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34bac00a67a812570d4a460447e1e9e06fae622946955f939051e7cc895cfab8", size = 1797246, upload-time = "2026-01-03T17:32:05.255Z" }, + { url = "https://files.pythonhosted.org/packages/8d/15/be122de1f67e6953add23335c8ece6d314ab67c8bebb3f181063010795a7/aiohttp-3.13.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a19884d2ee70b06d9204b2727a7b9f983d0c684c650254679e716b0b77920632", size = 1627131, upload-time = "2026-01-03T17:32:07.607Z" }, + { url = "https://files.pythonhosted.org/packages/12/12/70eedcac9134cfa3219ab7af31ea56bc877395b1ac30d65b1bc4b27d0438/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ca7f2bb6ba8348a3614c7918cc4bb73268c5ac2a207576b7afea19d3d9f64", size = 1795196, upload-time = "2026-01-03T17:32:09.59Z" }, + { url = "https://files.pythonhosted.org/packages/32/11/b30e1b1cd1f3054af86ebe60df96989c6a414dd87e27ad16950eee420bea/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b0d95340658b9d2f11d9697f59b3814a9d3bb4b7a7c20b131df4bcef464037c0", size = 1782841, upload-time = "2026-01-03T17:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/88/0d/d98a9367b38912384a17e287850f5695c528cff0f14f791ce8ee2e4f7796/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1e53262fd202e4b40b70c3aff944a8155059beedc8a89bba9dc1f9ef06a1b56", size = 1795193, upload-time = "2026-01-03T17:32:13.705Z" }, + { url = "https://files.pythonhosted.org/packages/43/a5/a2dfd1f5ff5581632c7f6a30e1744deda03808974f94f6534241ef60c751/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d60ac9663f44168038586cab2157e122e46bdef09e9368b37f2d82d354c23f72", size = 1621979, upload-time = "2026-01-03T17:32:15.965Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f0/12973c382ae7c1cccbc4417e129c5bf54c374dfb85af70893646e1f0e749/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:90751b8eed69435bac9ff4e3d2f6b3af1f57e37ecb0fbeee59c0174c9e2d41df", size = 1822193, upload-time = "2026-01-03T17:32:18.219Z" }, + { url = "https://files.pythonhosted.org/packages/3c/5f/24155e30ba7f8c96918af1350eb0663e2430aad9e001c0489d89cd708ab1/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fc353029f176fd2b3ec6cfc71be166aba1936fe5d73dd1992ce289ca6647a9aa", size = 1769801, upload-time = "2026-01-03T17:32:20.25Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f8/7314031ff5c10e6ece114da79b338ec17eeff3a079e53151f7e9f43c4723/aiohttp-3.13.3-cp314-cp314t-win32.whl", hash = "sha256:2e41b18a58da1e474a057b3d35248d8320029f61d70a37629535b16a0c8f3767", size = 466523, upload-time = "2026-01-03T17:32:22.215Z" }, + { url = "https://files.pythonhosted.org/packages/b4/63/278a98c715ae467624eafe375542d8ba9b4383a016df8fdefe0ae28382a7/aiohttp-3.13.3-cp314-cp314t-win_amd64.whl", hash = "sha256:44531a36aa2264a1860089ffd4dce7baf875ee5a6079d5fb42e261c704ef7344", size = 499694, upload-time = "2026-01-03T17:32:24.546Z" }, ] [[package]] @@ -274,37 +274,37 @@ wheels = [ [[package]] name = "apache-tvm-ffi" -version = "0.1.6" +version = "0.1.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/45/20/8da071821b2142bdeed757d2859dede4817e0b82a96e9a4d8cfbffd49006/apache_tvm_ffi-0.1.6.tar.gz", hash = "sha256:53088126f7fce11823ddf0fb101e968a90298d79fd68829c0a981f25467a574c", size = 2387987, upload-time = "2025-12-16T19:00:33.523Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/f8/6bc29ca8945a8a0b52997fd1e564c783f5b2578b6125315ed30dd0b1d0e4/apache_tvm_ffi-0.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ecda748ad9139593296cde3581223e9ddf1be3feca987adea676708b98f297ac", size = 1806165, upload-time = "2025-12-16T18:59:40.928Z" }, - { url = "https://files.pythonhosted.org/packages/1c/12/310a9953d6a35c2975e0d585f5bdd936858ec6b5b9daee34dc49dd4e3e2e/apache_tvm_ffi-0.1.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d976e347d0e6f6695103ce90cc739c717b3623fb9fd4867ffc395e2fe006f345", size = 1965883, upload-time = "2025-12-16T18:59:42.54Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e1/37326821f2976167f142d23ded0e80f15ca05408ab49d87a2151ff246c76/apache_tvm_ffi-0.1.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e6caf9fdc209c3a6f618a462fc8c0925525246f16912f6333424819f19484c06", size = 2037885, upload-time = "2025-12-16T18:59:43.846Z" }, - { url = "https://files.pythonhosted.org/packages/28/d2/614d397d69b20ccf86d07f3e02d77e0056415f82e81816905ae1d11cd6e5/apache_tvm_ffi-0.1.6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d43d8540bc38eb7f5173f8516a7963b2b0a8cdbc3fe315600d856fe2e3ed0f6f", size = 1909586, upload-time = "2025-12-16T18:59:45.111Z" }, - { url = "https://files.pythonhosted.org/packages/1c/3a/79aac72fbf67aac585757d34a57770d17c0ee34e9e46f668ab62df5c16ce/apache_tvm_ffi-0.1.6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f08cb6638dd2cd2e9f1cdc5126be676632ecaf09edb1ad6d43f836baa2f02845", size = 2019954, upload-time = "2025-12-16T18:59:46.612Z" }, - { url = "https://files.pythonhosted.org/packages/73/99/857e1497bfec2e3622ec21ca706b9af6f2ec94bca162d1216855cc617752/apache_tvm_ffi-0.1.6-cp310-cp310-win_amd64.whl", hash = "sha256:017576fc9a638a37cb2fc7024a3b2f9071a54db62545daf166efc8f9c8fda8a3", size = 1777727, upload-time = "2025-12-16T18:59:47.908Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d1/dc4878dcca3d244918fa815a00c558652209f68a1678280b01cd79cdcc01/apache_tvm_ffi-0.1.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:52e9213b553e729e9bcf9acb2bfa0d7e3000fc4756f86ed375827b1e4b53692f", size = 1807748, upload-time = "2025-12-16T18:59:49.709Z" }, - { url = "https://files.pythonhosted.org/packages/fb/44/9e33ca98ee36f1ddf81246d8aad64a87728e03590dae71f3a99b8647c853/apache_tvm_ffi-0.1.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9532d721f208e4b9989f0e1b3a2d785c6b26d27d3e2b378b945c60d9c29e86ce", size = 1965166, upload-time = "2025-12-16T18:59:51.239Z" }, - { url = "https://files.pythonhosted.org/packages/c0/04/f1f580c53271795b6c231e4f9d65b1b263c4288413601abf4e3b175a474e/apache_tvm_ffi-0.1.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e93fe06aa0266faec4bd63de82a77af2005dc4b793cc6dd3dcc941eb05d4ba47", size = 2037588, upload-time = "2025-12-16T18:59:52.474Z" }, - { url = "https://files.pythonhosted.org/packages/56/7c/a0fc4194742766919a4d2664a1845561b81f4488d6088835f1d1c311680a/apache_tvm_ffi-0.1.6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c1b8ca3e79d4a37266ab9b15c8e265fd9fd7131d351302149cff0a948f37986c", size = 1909384, upload-time = "2025-12-16T18:59:54.931Z" }, - { url = "https://files.pythonhosted.org/packages/f1/e1/c228f2314ad14bc72dd80c883108b0d84988b655f7afe74b5336e38224e1/apache_tvm_ffi-0.1.6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4cdcba21a2425a40b72367d0a4299ee268ad1d19d5f4c2b9e55e02dadf4c2465", size = 2020174, upload-time = "2025-12-16T18:59:56.449Z" }, - { url = "https://files.pythonhosted.org/packages/5e/3a/42edbd6d5cc6eb403981e5ff0e1548a16794687d75d1dbbf04fa187adc62/apache_tvm_ffi-0.1.6-cp311-cp311-win_amd64.whl", hash = "sha256:bc9973e71c54cd77a9e9d3937534f304bc9079edc42df00598778c115380cb1c", size = 1778243, upload-time = "2025-12-16T18:59:58.077Z" }, - { url = "https://files.pythonhosted.org/packages/1f/de/4ae5dd4d493b1cea755a25d59088895486432c053cff5a3287b75e36ce54/apache_tvm_ffi-0.1.6-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:5f4c0678854dbf3bfaa37795465f570d79c68759896b04b3d31774af0a03bcb8", size = 1779381, upload-time = "2025-12-16T18:59:59.593Z" }, - { url = "https://files.pythonhosted.org/packages/2d/40/2e943cbda764c3266a6966a34e582d3f0ac6046ab6aaa756631df9afd7bf/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:653f1d4c8ffd6bca5300fd1825a81373a5be82f31dc79353d1c476fa31cf377a", size = 1936756, upload-time = "2025-12-16T19:00:00.844Z" }, - { url = "https://files.pythonhosted.org/packages/a3/91/fc43f155b4d4363e61707655c1f4bee75af1d6dd4a76680f4956dd9846fe/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6a2cdfa90860a80e3cfb2364ce3b66a559fa5748de8d593a203b2e5992d92bc1", size = 2013641, upload-time = "2025-12-16T19:00:02.479Z" }, - { url = "https://files.pythonhosted.org/packages/14/9b/45208f2a9c70a88fd8e65668c0628f3917625d64668800ff55a2390d7fe0/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223ac7ac08b34a6dbabe7085f23939b4aaa70666e72ddad41015659034e095af", size = 1881149, upload-time = "2025-12-16T19:00:03.776Z" }, - { url = "https://files.pythonhosted.org/packages/7d/c5/e3ba08379127578bb3417605b61e9cd5e513184a6947ec7f3fac93d16355/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05cedb3ba7600dc9ae35c17b7325d44ecf02c56c3ba1b62668dca8390da7ec28", size = 1992886, upload-time = "2025-12-16T19:00:05.047Z" }, - { url = "https://files.pythonhosted.org/packages/d6/7b/4df1e523ae4bcbfbe65a3e7ef3c8810cb76e9ae44fa9b44c9fac152ecc2b/apache_tvm_ffi-0.1.6-cp312-abi3-win_amd64.whl", hash = "sha256:a6c29ba9dbc6273f4534bfc0e8a52a784f264724eb62df62daedc2b349dabe85", size = 1758454, upload-time = "2025-12-16T19:00:06.498Z" }, - { url = "https://files.pythonhosted.org/packages/65/b5/17d994698417882e3d0f4531390abfeec8eab08de3cf8117e22041a70f67/apache_tvm_ffi-0.1.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:23b1a7a7ca409189147d4c517b72676d12538fcbb1631437ad06919107ab91a3", size = 1809885, upload-time = "2025-12-16T19:00:08.028Z" }, - { url = "https://files.pythonhosted.org/packages/32/d6/32fd7385878ac4c721e23c6e01e7d914147ff175105f5f24696e5316ffb8/apache_tvm_ffi-0.1.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2720594c9d2bc5a50768b80b966ab9ef942e0f7a0aeb91e9fd7fd35703cfd944", size = 1950167, upload-time = "2025-12-16T19:00:09.365Z" }, - { url = "https://files.pythonhosted.org/packages/4d/ad/2877cc6d4c21d78783452e082b430a0d0cdcacaab6cec162d2542b753f75/apache_tvm_ffi-0.1.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d27fbdf7c0f41be14a56a043a55c056548cbc0a76031c4fb3c6157d487afdec", size = 2021788, upload-time = "2025-12-16T19:00:10.681Z" }, - { url = "https://files.pythonhosted.org/packages/57/3c/8252539e4b03305e0c78508f90441ff5a73070cdac499c40a68fb533716f/apache_tvm_ffi-0.1.6-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c258313a49e246e878391bd2d9469f287bd3089ce53dcb379eee07bb78ad0675", size = 1894013, upload-time = "2025-12-16T19:00:11.963Z" }, - { url = "https://files.pythonhosted.org/packages/07/e8/199779b4ad83e570dface5c7727f2e4a288d07bec8a7ceec21e51a5e96dc/apache_tvm_ffi-0.1.6-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4378ca283d680fa4af296cc430f6e050746434f487b29724273a56c169af2282", size = 2003016, upload-time = "2025-12-16T19:00:13.569Z" }, - { url = "https://files.pythonhosted.org/packages/fc/9f/0ffac1066ffb06b4c9645a74e6423ecae25228d26bae4c0a77abd0c032a0/apache_tvm_ffi-0.1.6-cp314-cp314t-win_amd64.whl", hash = "sha256:05fc0bde38884c9973126f9c87f3d296255b46b51fa4051c693d8ee559ba14ed", size = 1818312, upload-time = "2025-12-16T19:00:15.406Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/3d/07/6fbc8fbef1d04bd290f2dcdb3091ae784ac526b62649ec52993a41c65f72/apache_tvm_ffi-0.1.7.tar.gz", hash = "sha256:737cd4a067d6c6c7ad7dd909a0708eb3dc28540299039ea636f8ff5766b122be", size = 2397940, upload-time = "2025-12-28T09:13:25.52Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/00/e6c7e0710344ccfb2a42be68e04dfd1920864c25bab4a7411a48a4809a1a/apache_tvm_ffi-0.1.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cc6334f55ad8b4cb3c084dcdf33720b47665d0ea488c36a1b4f1b99445ae5a12", size = 1816700, upload-time = "2025-12-28T09:12:22.223Z" }, + { url = "https://files.pythonhosted.org/packages/84/68/82799768095fe83640f0def07eda01891c9d713a9db8770316ca460a6114/apache_tvm_ffi-0.1.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f69f1195ad7701b0a024a84914b934487a30d5975a9e5d5044c57eb9f9b0fcf7", size = 1976292, upload-time = "2025-12-28T09:12:24.623Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ab/0c01ac5c3d545c04d1adf03a154f8167dc5884c0fdcbb519714107426028/apache_tvm_ffi-0.1.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b6444a322279cc33ada0bb2a0482e3433c31028becda106dcb0d48c30fb2de0", size = 2048671, upload-time = "2025-12-28T09:12:26.457Z" }, + { url = "https://files.pythonhosted.org/packages/0a/e3/449fcdbe7ebd8df4b830399171fb325e7f77b2babe958c6fa6c537281e26/apache_tvm_ffi-0.1.7-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d5e9e668620ba3b78b1c1f393dee67a63850882b0713dba31972c5f854f02860", size = 1920010, upload-time = "2025-12-28T09:12:27.81Z" }, + { url = "https://files.pythonhosted.org/packages/a2/98/737ffc4576af7d4da97f3c73bf347f69d269497cfe9ac089517af5900919/apache_tvm_ffi-0.1.7-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5f7deaa48cfd720949dd1638dfbd4cc7d5285008c7f3f342887e2bf33cf1f5be", size = 2030727, upload-time = "2025-12-28T09:12:29.38Z" }, + { url = "https://files.pythonhosted.org/packages/f1/36/8ea373c1758c812a504a856a06fc08d8761df1c0e2515e6867c22168fea7/apache_tvm_ffi-0.1.7-cp310-cp310-win_amd64.whl", hash = "sha256:c1fd70f6e7578eeec5e5d8ed0fb814b12280b724531487ff4d899edddd188d97", size = 1787864, upload-time = "2025-12-28T09:12:31.194Z" }, + { url = "https://files.pythonhosted.org/packages/0a/e7/33ece51ba1670fa77a1897745720b9c8bdac854acb0e09d45e64340948f4/apache_tvm_ffi-0.1.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:20a8847f4609f1fe61015b7547bced99eba38072ed422799fc7bd15371d6d83c", size = 1818328, upload-time = "2025-12-28T09:12:32.784Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b9/3bb4099a82b4c7198823b67067a3d206ec8a0b32204a559c5cca1bee54bd/apache_tvm_ffi-0.1.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f0e010e61d1f220ec4ce3d15053db3f8c8d9c79230ea763343fc5e4acf53ef17", size = 1975412, upload-time = "2025-12-28T09:12:34.737Z" }, + { url = "https://files.pythonhosted.org/packages/48/53/423788fb9b26460b3d7ceb8588d172dfe7ae4abcc335931fcbf08a859904/apache_tvm_ffi-0.1.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9b05155b4b60ebd3642213d0489b6ef24aff17b268960dbb5f106a39899bb8b1", size = 2047974, upload-time = "2025-12-28T09:12:36.296Z" }, + { url = "https://files.pythonhosted.org/packages/a6/30/45d4acf7f99e1fc79a8663f2111901b8031e1f9b316860af7acf4859c964/apache_tvm_ffi-0.1.7-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cceaddc7636060231aca4ada2632814189b1169224b2b451f41984145ef615fc", size = 1919697, upload-time = "2025-12-28T09:12:38.15Z" }, + { url = "https://files.pythonhosted.org/packages/dd/bb/fa5042076bf6e7daaf9774389f99149c1851434fc0d8e4cb34aa0c4a3810/apache_tvm_ffi-0.1.7-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5769cadc42e70522e2a523f1dfe24f48dbe3bf384e63f95df251f9d572ffcf23", size = 2030760, upload-time = "2025-12-28T09:12:39.813Z" }, + { url = "https://files.pythonhosted.org/packages/fe/74/fd06e97699e9cbf36d887c5fbbc56b14e896e2652bbe1781ab84cef82a40/apache_tvm_ffi-0.1.7-cp311-cp311-win_amd64.whl", hash = "sha256:b5c7716429ce2beb0a5b00c5a3bdd90b8a5891838afb782491c576ade42ba7c4", size = 1788026, upload-time = "2025-12-28T09:12:42.142Z" }, + { url = "https://files.pythonhosted.org/packages/26/4e/43a41ac023a5989803952d527dfea6e63da71fe223f6e010d4ec71ca0526/apache_tvm_ffi-0.1.7-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:12950ca9f9f4f4436869afe17845a6bfc85cbcd8a15dfa2b16095f7e6f49d06f", size = 1790152, upload-time = "2025-12-28T09:12:43.975Z" }, + { url = "https://files.pythonhosted.org/packages/b9/d3/05ba0a63baba1e3aec0f6303c4bc567493fb1c070d9f298f929a7703c0fb/apache_tvm_ffi-0.1.7-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d0e579234ce6fb2899377335a881ecf15d0197d833e2d370c9269ea6ca578f6f", size = 1947362, upload-time = "2025-12-28T09:12:45.921Z" }, + { url = "https://files.pythonhosted.org/packages/f1/11/b69df7685d75144fd9f57e5155cdf4ff91d6617a9f8b89b1415204863da0/apache_tvm_ffi-0.1.7-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:258a4aecc16e963def8ba0ab07f585147c7e7f586156b9496bfdf34af229443d", size = 2024240, upload-time = "2025-12-28T09:12:47.337Z" }, + { url = "https://files.pythonhosted.org/packages/cf/b6/31459f4141ea8621377fecac7c29e1568d494cbf95c5aa1ddf2cbc12a8ff/apache_tvm_ffi-0.1.7-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:363701589349e11a945dabce026578203bd83cb8de71af9a066beadd77af085a", size = 1891485, upload-time = "2025-12-28T09:12:49.171Z" }, + { url = "https://files.pythonhosted.org/packages/a5/4d/d21874eda6e3ea59c5a84aa010b24b84617e3b286ad759ac5eadccb1a88c/apache_tvm_ffi-0.1.7-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fbbf87df625930bafbd979c2c510d5bd989e9171098e5bb65320d0e7336d0095", size = 2003196, upload-time = "2025-12-28T09:12:50.891Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d4/37102d96e359386107f5ce3751c4e2a8c1b8df3d34f65b701810ba59465c/apache_tvm_ffi-0.1.7-cp312-abi3-win_amd64.whl", hash = "sha256:d2fb56f53e33c7ddf7d6d340d44cbc440d205f7dab4bc5ed1ad20c8fc779250f", size = 1768697, upload-time = "2025-12-28T09:12:52.394Z" }, + { url = "https://files.pythonhosted.org/packages/92/c3/aa4b950032251c24b9db7d725b86d7d683b62d9919f8a32f478c28951dc3/apache_tvm_ffi-0.1.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dc4a02e0252599d0c4eb2d2fa91b7756f0446b3bc42479b05c140e9d336b9b8b", size = 1820520, upload-time = "2025-12-28T09:12:54.29Z" }, + { url = "https://files.pythonhosted.org/packages/19/70/55ee17b8a340ef8ffc0d6c0587ff5a0c7e7c85a94e6cb202e682838a42c7/apache_tvm_ffi-0.1.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:41e50f2c8d98d706923c70ac19fd5f605bf71b8ffa43c0c2e9e1e22c2d60d4e0", size = 1960686, upload-time = "2025-12-28T09:12:56.206Z" }, + { url = "https://files.pythonhosted.org/packages/b6/0f/ca4f7b4836e1e03386b6e486a0ba88812644723a96965a01e2072f551f2e/apache_tvm_ffi-0.1.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:835bd391c6f3388e84e36f0ea2347761992241a3953be6ebb319bf1c2ac855d8", size = 2032237, upload-time = "2025-12-28T09:12:58.113Z" }, + { url = "https://files.pythonhosted.org/packages/89/b6/35be0035f8ed9e10ae6d9ffb7e91397ba381eb734f85ff852efe56eb3012/apache_tvm_ffi-0.1.7-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7d8b53e94c2bc28e961934e8291a9763d7868f84f9759cbae462b77ca801e5b", size = 1904414, upload-time = "2025-12-28T09:12:59.624Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5f/1f57863c2c68389d1453fe147d89da22910a0e4f645a8be29cc8f461850f/apache_tvm_ffi-0.1.7-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e135b70c7be8627661c5ec4a466e17e1aba260ffd7c6bccfe231c9ea975875e7", size = 2013039, upload-time = "2025-12-28T09:13:01.37Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3f/08d1931c6ebca557051176d400e15c1d7f6cf9096fc02f8c90ac7ee309ac/apache_tvm_ffi-0.1.7-cp314-cp314t-win_amd64.whl", hash = "sha256:408bb2c1fa585260afd556e53d65e2735f201f358202fda2b07d08a6cbfaf91f", size = 1828344, upload-time = "2025-12-28T09:13:03.359Z" }, ] [[package]] @@ -686,11 +686,11 @@ sdist = { url = "https://files.pythonhosted.org/packages/64/cb/104778c728dc3d5ea [[package]] name = "certifi" -version = "2025.11.12" +version = "2026.1.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, + { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, ] [[package]] @@ -905,101 +905,101 @@ wheels = [ [[package]] name = "coverage" -version = "7.13.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b6/45/2c665ca77ec32ad67e25c77daf1cee28ee4558f3bc571cdbaf88a00b9f23/coverage-7.13.0.tar.gz", hash = "sha256:a394aa27f2d7ff9bc04cf703817773a59ad6dfbd577032e690f961d2460ee936", size = 820905, upload-time = "2025-12-08T13:14:38.055Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/08/bdd7ccca14096f7eb01412b87ac11e5d16e4cb54b6e328afc9dee8bdaec1/coverage-7.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:02d9fb9eccd48f6843c98a37bd6817462f130b86da8660461e8f5e54d4c06070", size = 217979, upload-time = "2025-12-08T13:12:14.505Z" }, - { url = "https://files.pythonhosted.org/packages/fa/f0/d1302e3416298a28b5663ae1117546a745d9d19fde7e28402b2c5c3e2109/coverage-7.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:367449cf07d33dc216c083f2036bb7d976c6e4903ab31be400ad74ad9f85ce98", size = 218496, upload-time = "2025-12-08T13:12:16.237Z" }, - { url = "https://files.pythonhosted.org/packages/07/26/d36c354c8b2a320819afcea6bffe72839efd004b98d1d166b90801d49d57/coverage-7.13.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cdb3c9f8fef0a954c632f64328a3935988d33a6604ce4bf67ec3e39670f12ae5", size = 245237, upload-time = "2025-12-08T13:12:17.858Z" }, - { url = "https://files.pythonhosted.org/packages/91/52/be5e85631e0eec547873d8b08dd67a5f6b111ecfe89a86e40b89b0c1c61c/coverage-7.13.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d10fd186aac2316f9bbb46ef91977f9d394ded67050ad6d84d94ed6ea2e8e54e", size = 247061, upload-time = "2025-12-08T13:12:19.132Z" }, - { url = "https://files.pythonhosted.org/packages/0f/45/a5e8fa0caf05fbd8fa0402470377bff09cc1f026d21c05c71e01295e55ab/coverage-7.13.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f88ae3e69df2ab62fb0bc5219a597cb890ba5c438190ffa87490b315190bb33", size = 248928, upload-time = "2025-12-08T13:12:20.702Z" }, - { url = "https://files.pythonhosted.org/packages/f5/42/ffb5069b6fd1b95fae482e02f3fecf380d437dd5a39bae09f16d2e2e7e01/coverage-7.13.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c4be718e51e86f553bcf515305a158a1cd180d23b72f07ae76d6017c3cc5d791", size = 245931, upload-time = "2025-12-08T13:12:22.243Z" }, - { url = "https://files.pythonhosted.org/packages/95/6e/73e809b882c2858f13e55c0c36e94e09ce07e6165d5644588f9517efe333/coverage-7.13.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a00d3a393207ae12f7c49bb1c113190883b500f48979abb118d8b72b8c95c032", size = 246968, upload-time = "2025-12-08T13:12:23.52Z" }, - { url = "https://files.pythonhosted.org/packages/87/08/64ebd9e64b6adb8b4a4662133d706fbaccecab972e0b3ccc23f64e2678ad/coverage-7.13.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a7b1cd820e1b6116f92c6128f1188e7afe421c7e1b35fa9836b11444e53ebd9", size = 244972, upload-time = "2025-12-08T13:12:24.781Z" }, - { url = "https://files.pythonhosted.org/packages/12/97/f4d27c6fe0cb375a5eced4aabcaef22de74766fb80a3d5d2015139e54b22/coverage-7.13.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:37eee4e552a65866f15dedd917d5e5f3d59805994260720821e2c1b51ac3248f", size = 245241, upload-time = "2025-12-08T13:12:28.041Z" }, - { url = "https://files.pythonhosted.org/packages/0c/94/42f8ae7f633bf4c118bf1038d80472f9dade88961a466f290b81250f7ab7/coverage-7.13.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:62d7c4f13102148c78d7353c6052af6d899a7f6df66a32bddcc0c0eb7c5326f8", size = 245847, upload-time = "2025-12-08T13:12:29.337Z" }, - { url = "https://files.pythonhosted.org/packages/a8/2f/6369ca22b6b6d933f4f4d27765d313d8914cc4cce84f82a16436b1a233db/coverage-7.13.0-cp310-cp310-win32.whl", hash = "sha256:24e4e56304fdb56f96f80eabf840eab043b3afea9348b88be680ec5986780a0f", size = 220573, upload-time = "2025-12-08T13:12:30.905Z" }, - { url = "https://files.pythonhosted.org/packages/f1/dc/a6a741e519acceaeccc70a7f4cfe5d030efc4b222595f0677e101af6f1f3/coverage-7.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:74c136e4093627cf04b26a35dab8cbfc9b37c647f0502fc313376e11726ba303", size = 221509, upload-time = "2025-12-08T13:12:32.09Z" }, - { url = "https://files.pythonhosted.org/packages/f1/dc/888bf90d8b1c3d0b4020a40e52b9f80957d75785931ec66c7dfaccc11c7d/coverage-7.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0dfa3855031070058add1a59fdfda0192fd3e8f97e7c81de0596c145dea51820", size = 218104, upload-time = "2025-12-08T13:12:33.333Z" }, - { url = "https://files.pythonhosted.org/packages/8d/ea/069d51372ad9c380214e86717e40d1a743713a2af191cfba30a0911b0a4a/coverage-7.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fdb6f54f38e334db97f72fa0c701e66d8479af0bc3f9bfb5b90f1c30f54500f", size = 218606, upload-time = "2025-12-08T13:12:34.498Z" }, - { url = "https://files.pythonhosted.org/packages/68/09/77b1c3a66c2aa91141b6c4471af98e5b1ed9b9e6d17255da5eb7992299e3/coverage-7.13.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7e442c013447d1d8d195be62852270b78b6e255b79b8675bad8479641e21fd96", size = 248999, upload-time = "2025-12-08T13:12:36.02Z" }, - { url = "https://files.pythonhosted.org/packages/0a/32/2e2f96e9d5691eaf1181d9040f850b8b7ce165ea10810fd8e2afa534cef7/coverage-7.13.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1ed5630d946859de835a85e9a43b721123a8a44ec26e2830b296d478c7fd4259", size = 250925, upload-time = "2025-12-08T13:12:37.221Z" }, - { url = "https://files.pythonhosted.org/packages/7b/45/b88ddac1d7978859b9a39a8a50ab323186148f1d64bc068f86fc77706321/coverage-7.13.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f15a931a668e58087bc39d05d2b4bf4b14ff2875b49c994bbdb1c2217a8daeb", size = 253032, upload-time = "2025-12-08T13:12:38.763Z" }, - { url = "https://files.pythonhosted.org/packages/71/cb/e15513f94c69d4820a34b6bf3d2b1f9f8755fa6021be97c7065442d7d653/coverage-7.13.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:30a3a201a127ea57f7e14ba43c93c9c4be8b7d17a26e03bb49e6966d019eede9", size = 249134, upload-time = "2025-12-08T13:12:40.382Z" }, - { url = "https://files.pythonhosted.org/packages/09/61/d960ff7dc9e902af3310ce632a875aaa7860f36d2bc8fc8b37ee7c1b82a5/coverage-7.13.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a485ff48fbd231efa32d58f479befce52dcb6bfb2a88bb7bf9a0b89b1bc8030", size = 250731, upload-time = "2025-12-08T13:12:41.992Z" }, - { url = "https://files.pythonhosted.org/packages/98/34/c7c72821794afc7c7c2da1db8f00c2c98353078aa7fb6b5ff36aac834b52/coverage-7.13.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:22486cdafba4f9e471c816a2a5745337742a617fef68e890d8baf9f3036d7833", size = 248795, upload-time = "2025-12-08T13:12:43.331Z" }, - { url = "https://files.pythonhosted.org/packages/0a/5b/e0f07107987a43b2def9aa041c614ddb38064cbf294a71ef8c67d43a0cdd/coverage-7.13.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:263c3dbccc78e2e331e59e90115941b5f53e85cfcc6b3b2fbff1fd4e3d2c6ea8", size = 248514, upload-time = "2025-12-08T13:12:44.546Z" }, - { url = "https://files.pythonhosted.org/packages/71/c2/c949c5d3b5e9fc6dd79e1b73cdb86a59ef14f3709b1d72bf7668ae12e000/coverage-7.13.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e5330fa0cc1f5c3c4c3bb8e101b742025933e7848989370a1d4c8c5e401ea753", size = 249424, upload-time = "2025-12-08T13:12:45.759Z" }, - { url = "https://files.pythonhosted.org/packages/11/f1/bbc009abd6537cec0dffb2cc08c17a7f03de74c970e6302db4342a6e05af/coverage-7.13.0-cp311-cp311-win32.whl", hash = "sha256:0f4872f5d6c54419c94c25dd6ae1d015deeb337d06e448cd890a1e89a8ee7f3b", size = 220597, upload-time = "2025-12-08T13:12:47.378Z" }, - { url = "https://files.pythonhosted.org/packages/c4/f6/d9977f2fb51c10fbaed0718ce3d0a8541185290b981f73b1d27276c12d91/coverage-7.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51a202e0f80f241ccb68e3e26e19ab5b3bf0f813314f2c967642f13ebcf1ddfe", size = 221536, upload-time = "2025-12-08T13:12:48.7Z" }, - { url = "https://files.pythonhosted.org/packages/be/ad/3fcf43fd96fb43e337a3073dea63ff148dcc5c41ba7a14d4c7d34efb2216/coverage-7.13.0-cp311-cp311-win_arm64.whl", hash = "sha256:d2a9d7f1c11487b1c69367ab3ac2d81b9b3721f097aa409a3191c3e90f8f3dd7", size = 220206, upload-time = "2025-12-08T13:12:50.365Z" }, - { url = "https://files.pythonhosted.org/packages/9b/f1/2619559f17f31ba00fc40908efd1fbf1d0a5536eb75dc8341e7d660a08de/coverage-7.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0b3d67d31383c4c68e19a88e28fc4c2e29517580f1b0ebec4a069d502ce1e0bf", size = 218274, upload-time = "2025-12-08T13:12:52.095Z" }, - { url = "https://files.pythonhosted.org/packages/2b/11/30d71ae5d6e949ff93b2a79a2c1b4822e00423116c5c6edfaeef37301396/coverage-7.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:581f086833d24a22c89ae0fe2142cfaa1c92c930adf637ddf122d55083fb5a0f", size = 218638, upload-time = "2025-12-08T13:12:53.418Z" }, - { url = "https://files.pythonhosted.org/packages/79/c2/fce80fc6ded8d77e53207489d6065d0fed75db8951457f9213776615e0f5/coverage-7.13.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0a3a30f0e257df382f5f9534d4ce3d4cf06eafaf5192beb1a7bd066cb10e78fb", size = 250129, upload-time = "2025-12-08T13:12:54.744Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b6/51b5d1eb6fcbb9a1d5d6984e26cbe09018475c2922d554fd724dd0f056ee/coverage-7.13.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:583221913fbc8f53b88c42e8dbb8fca1d0f2e597cb190ce45916662b8b9d9621", size = 252885, upload-time = "2025-12-08T13:12:56.401Z" }, - { url = "https://files.pythonhosted.org/packages/0d/f8/972a5affea41de798691ab15d023d3530f9f56a72e12e243f35031846ff7/coverage-7.13.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f5d9bd30756fff3e7216491a0d6d520c448d5124d3d8e8f56446d6412499e74", size = 253974, upload-time = "2025-12-08T13:12:57.718Z" }, - { url = "https://files.pythonhosted.org/packages/8a/56/116513aee860b2c7968aa3506b0f59b22a959261d1dbf3aea7b4450a7520/coverage-7.13.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a23e5a1f8b982d56fa64f8e442e037f6ce29322f1f9e6c2344cd9e9f4407ee57", size = 250538, upload-time = "2025-12-08T13:12:59.254Z" }, - { url = "https://files.pythonhosted.org/packages/d6/75/074476d64248fbadf16dfafbf93fdcede389ec821f74ca858d7c87d2a98c/coverage-7.13.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b01c22bc74a7fb44066aaf765224c0d933ddf1f5047d6cdfe4795504a4493f8", size = 251912, upload-time = "2025-12-08T13:13:00.604Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d2/aa4f8acd1f7c06024705c12609d8698c51b27e4d635d717cd1934c9668e2/coverage-7.13.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:898cce66d0836973f48dda4e3514d863d70142bdf6dfab932b9b6a90ea5b222d", size = 250054, upload-time = "2025-12-08T13:13:01.892Z" }, - { url = "https://files.pythonhosted.org/packages/19/98/8df9e1af6a493b03694a1e8070e024e7d2cdc77adedc225a35e616d505de/coverage-7.13.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:3ab483ea0e251b5790c2aac03acde31bff0c736bf8a86829b89382b407cd1c3b", size = 249619, upload-time = "2025-12-08T13:13:03.236Z" }, - { url = "https://files.pythonhosted.org/packages/d8/71/f8679231f3353018ca66ef647fa6fe7b77e6bff7845be54ab84f86233363/coverage-7.13.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1d84e91521c5e4cb6602fe11ece3e1de03b2760e14ae4fcf1a4b56fa3c801fcd", size = 251496, upload-time = "2025-12-08T13:13:04.511Z" }, - { url = "https://files.pythonhosted.org/packages/04/86/9cb406388034eaf3c606c22094edbbb82eea1fa9d20c0e9efadff20d0733/coverage-7.13.0-cp312-cp312-win32.whl", hash = "sha256:193c3887285eec1dbdb3f2bd7fbc351d570ca9c02ca756c3afbc71b3c98af6ef", size = 220808, upload-time = "2025-12-08T13:13:06.422Z" }, - { url = "https://files.pythonhosted.org/packages/1c/59/af483673df6455795daf5f447c2f81a3d2fcfc893a22b8ace983791f6f34/coverage-7.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:4f3e223b2b2db5e0db0c2b97286aba0036ca000f06aca9b12112eaa9af3d92ae", size = 221616, upload-time = "2025-12-08T13:13:07.95Z" }, - { url = "https://files.pythonhosted.org/packages/64/b0/959d582572b30a6830398c60dd419c1965ca4b5fb38ac6b7093a0d50ca8d/coverage-7.13.0-cp312-cp312-win_arm64.whl", hash = "sha256:086cede306d96202e15a4b77ace8472e39d9f4e5f9fd92dd4fecdfb2313b2080", size = 220261, upload-time = "2025-12-08T13:13:09.581Z" }, - { url = "https://files.pythonhosted.org/packages/7c/cc/bce226595eb3bf7d13ccffe154c3c487a22222d87ff018525ab4dd2e9542/coverage-7.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:28ee1c96109974af104028a8ef57cec21447d42d0e937c0275329272e370ebcf", size = 218297, upload-time = "2025-12-08T13:13:10.977Z" }, - { url = "https://files.pythonhosted.org/packages/3b/9f/73c4d34600aae03447dff3d7ad1d0ac649856bfb87d1ca7d681cfc913f9e/coverage-7.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d1e97353dcc5587b85986cda4ff3ec98081d7e84dd95e8b2a6d59820f0545f8a", size = 218673, upload-time = "2025-12-08T13:13:12.562Z" }, - { url = "https://files.pythonhosted.org/packages/63/ab/8fa097db361a1e8586535ae5073559e6229596b3489ec3ef2f5b38df8cb2/coverage-7.13.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:99acd4dfdfeb58e1937629eb1ab6ab0899b131f183ee5f23e0b5da5cba2fec74", size = 249652, upload-time = "2025-12-08T13:13:13.909Z" }, - { url = "https://files.pythonhosted.org/packages/90/3a/9bfd4de2ff191feb37ef9465855ca56a6f2f30a3bca172e474130731ac3d/coverage-7.13.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ff45e0cd8451e293b63ced93161e189780baf444119391b3e7d25315060368a6", size = 252251, upload-time = "2025-12-08T13:13:15.553Z" }, - { url = "https://files.pythonhosted.org/packages/df/61/b5d8105f016e1b5874af0d7c67542da780ccd4a5f2244a433d3e20ceb1ad/coverage-7.13.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f4f72a85316d8e13234cafe0a9f81b40418ad7a082792fa4165bd7d45d96066b", size = 253492, upload-time = "2025-12-08T13:13:16.849Z" }, - { url = "https://files.pythonhosted.org/packages/f3/b8/0fad449981803cc47a4694768b99823fb23632150743f9c83af329bb6090/coverage-7.13.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:11c21557d0e0a5a38632cbbaca5f008723b26a89d70db6315523df6df77d6232", size = 249850, upload-time = "2025-12-08T13:13:18.142Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e9/8d68337c3125014d918cf4327d5257553a710a2995a6a6de2ac77e5aa429/coverage-7.13.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:76541dc8d53715fb4f7a3a06b34b0dc6846e3c69bc6204c55653a85dd6220971", size = 251633, upload-time = "2025-12-08T13:13:19.56Z" }, - { url = "https://files.pythonhosted.org/packages/55/14/d4112ab26b3a1bc4b3c1295d8452dcf399ed25be4cf649002fb3e64b2d93/coverage-7.13.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6e9e451dee940a86789134b6b0ffbe31c454ade3b849bb8a9d2cca2541a8e91d", size = 249586, upload-time = "2025-12-08T13:13:20.883Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a9/22b0000186db663b0d82f86c2f1028099ae9ac202491685051e2a11a5218/coverage-7.13.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:5c67dace46f361125e6b9cace8fe0b729ed8479f47e70c89b838d319375c8137", size = 249412, upload-time = "2025-12-08T13:13:22.22Z" }, - { url = "https://files.pythonhosted.org/packages/a1/2e/42d8e0d9e7527fba439acdc6ed24a2b97613b1dc85849b1dd935c2cffef0/coverage-7.13.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f59883c643cb19630500f57016f76cfdcd6845ca8c5b5ea1f6e17f74c8e5f511", size = 251191, upload-time = "2025-12-08T13:13:23.899Z" }, - { url = "https://files.pythonhosted.org/packages/a4/af/8c7af92b1377fd8860536aadd58745119252aaaa71a5213e5a8e8007a9f5/coverage-7.13.0-cp313-cp313-win32.whl", hash = "sha256:58632b187be6f0be500f553be41e277712baa278147ecb7559983c6d9faf7ae1", size = 220829, upload-time = "2025-12-08T13:13:25.182Z" }, - { url = "https://files.pythonhosted.org/packages/58/f9/725e8bf16f343d33cbe076c75dc8370262e194ff10072c0608b8e5cf33a3/coverage-7.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:73419b89f812f498aca53f757dd834919b48ce4799f9d5cad33ca0ae442bdb1a", size = 221640, upload-time = "2025-12-08T13:13:26.836Z" }, - { url = "https://files.pythonhosted.org/packages/8a/ff/e98311000aa6933cc79274e2b6b94a2fe0fe3434fca778eba82003675496/coverage-7.13.0-cp313-cp313-win_arm64.whl", hash = "sha256:eb76670874fdd6091eedcc856128ee48c41a9bbbb9c3f1c7c3cf169290e3ffd6", size = 220269, upload-time = "2025-12-08T13:13:28.116Z" }, - { url = "https://files.pythonhosted.org/packages/cf/cf/bbaa2e1275b300343ea865f7d424cc0a2e2a1df6925a070b2b2d5d765330/coverage-7.13.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6e63ccc6e0ad8986386461c3c4b737540f20426e7ec932f42e030320896c311a", size = 218990, upload-time = "2025-12-08T13:13:29.463Z" }, - { url = "https://files.pythonhosted.org/packages/21/1d/82f0b3323b3d149d7672e7744c116e9c170f4957e0c42572f0366dbb4477/coverage-7.13.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:494f5459ffa1bd45e18558cd98710c36c0b8fbfa82a5eabcbe671d80ecffbfe8", size = 219340, upload-time = "2025-12-08T13:13:31.524Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e3/fe3fd4702a3832a255f4d43013eacb0ef5fc155a5960ea9269d8696db28b/coverage-7.13.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:06cac81bf10f74034e055e903f5f946e3e26fc51c09fc9f584e4a1605d977053", size = 260638, upload-time = "2025-12-08T13:13:32.965Z" }, - { url = "https://files.pythonhosted.org/packages/ad/01/63186cb000307f2b4da463f72af9b85d380236965574c78e7e27680a2593/coverage-7.13.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f2ffc92b46ed6e6760f1d47a71e56b5664781bc68986dbd1836b2b70c0ce2071", size = 262705, upload-time = "2025-12-08T13:13:34.378Z" }, - { url = "https://files.pythonhosted.org/packages/7c/a1/c0dacef0cc865f2455d59eed3548573ce47ed603205ffd0735d1d78b5906/coverage-7.13.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0602f701057c6823e5db1b74530ce85f17c3c5be5c85fc042ac939cbd909426e", size = 265125, upload-time = "2025-12-08T13:13:35.73Z" }, - { url = "https://files.pythonhosted.org/packages/ef/92/82b99223628b61300bd382c205795533bed021505eab6dd86e11fb5d7925/coverage-7.13.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:25dc33618d45456ccb1d37bce44bc78cf269909aa14c4db2e03d63146a8a1493", size = 259844, upload-time = "2025-12-08T13:13:37.69Z" }, - { url = "https://files.pythonhosted.org/packages/cf/2c/89b0291ae4e6cd59ef042708e1c438e2290f8c31959a20055d8768349ee2/coverage-7.13.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:71936a8b3b977ddd0b694c28c6a34f4fff2e9dd201969a4ff5d5fc7742d614b0", size = 262700, upload-time = "2025-12-08T13:13:39.525Z" }, - { url = "https://files.pythonhosted.org/packages/bf/f9/a5f992efae1996245e796bae34ceb942b05db275e4b34222a9a40b9fbd3b/coverage-7.13.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:936bc20503ce24770c71938d1369461f0c5320830800933bc3956e2a4ded930e", size = 260321, upload-time = "2025-12-08T13:13:41.172Z" }, - { url = "https://files.pythonhosted.org/packages/4c/89/a29f5d98c64fedbe32e2ac3c227fbf78edc01cc7572eee17d61024d89889/coverage-7.13.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:af0a583efaacc52ae2521f8d7910aff65cdb093091d76291ac5820d5e947fc1c", size = 259222, upload-time = "2025-12-08T13:13:43.282Z" }, - { url = "https://files.pythonhosted.org/packages/b3/c3/940fe447aae302a6701ee51e53af7e08b86ff6eed7631e5740c157ee22b9/coverage-7.13.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f1c23e24a7000da892a312fb17e33c5f94f8b001de44b7cf8ba2e36fbd15859e", size = 261411, upload-time = "2025-12-08T13:13:44.72Z" }, - { url = "https://files.pythonhosted.org/packages/eb/31/12a4aec689cb942a89129587860ed4d0fd522d5fda81237147fde554b8ae/coverage-7.13.0-cp313-cp313t-win32.whl", hash = "sha256:5f8a0297355e652001015e93be345ee54393e45dc3050af4a0475c5a2b767d46", size = 221505, upload-time = "2025-12-08T13:13:46.332Z" }, - { url = "https://files.pythonhosted.org/packages/65/8c/3b5fe3259d863572d2b0827642c50c3855d26b3aefe80bdc9eba1f0af3b0/coverage-7.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6abb3a4c52f05e08460bd9acf04fec027f8718ecaa0d09c40ffbc3fbd70ecc39", size = 222569, upload-time = "2025-12-08T13:13:47.79Z" }, - { url = "https://files.pythonhosted.org/packages/b0/39/f71fa8316a96ac72fc3908839df651e8eccee650001a17f2c78cdb355624/coverage-7.13.0-cp313-cp313t-win_arm64.whl", hash = "sha256:3ad968d1e3aa6ce5be295ab5fe3ae1bf5bb4769d0f98a80a0252d543a2ef2e9e", size = 220841, upload-time = "2025-12-08T13:13:49.243Z" }, - { url = "https://files.pythonhosted.org/packages/f8/4b/9b54bedda55421449811dcd5263a2798a63f48896c24dfb92b0f1b0845bd/coverage-7.13.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:453b7ec753cf5e4356e14fe858064e5520c460d3bbbcb9c35e55c0d21155c256", size = 218343, upload-time = "2025-12-08T13:13:50.811Z" }, - { url = "https://files.pythonhosted.org/packages/59/df/c3a1f34d4bba2e592c8979f924da4d3d4598b0df2392fbddb7761258e3dc/coverage-7.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:af827b7cbb303e1befa6c4f94fd2bf72f108089cfa0f8abab8f4ca553cf5ca5a", size = 218672, upload-time = "2025-12-08T13:13:52.284Z" }, - { url = "https://files.pythonhosted.org/packages/07/62/eec0659e47857698645ff4e6ad02e30186eb8afd65214fd43f02a76537cb/coverage-7.13.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:9987a9e4f8197a1000280f7cc089e3ea2c8b3c0a64d750537809879a7b4ceaf9", size = 249715, upload-time = "2025-12-08T13:13:53.791Z" }, - { url = "https://files.pythonhosted.org/packages/23/2d/3c7ff8b2e0e634c1f58d095f071f52ed3c23ff25be524b0ccae8b71f99f8/coverage-7.13.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3188936845cd0cb114fa6a51842a304cdbac2958145d03be2377ec41eb285d19", size = 252225, upload-time = "2025-12-08T13:13:55.274Z" }, - { url = "https://files.pythonhosted.org/packages/aa/ac/fb03b469d20e9c9a81093575003f959cf91a4a517b783aab090e4538764b/coverage-7.13.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2bdb3babb74079f021696cb46b8bb5f5661165c385d3a238712b031a12355be", size = 253559, upload-time = "2025-12-08T13:13:57.161Z" }, - { url = "https://files.pythonhosted.org/packages/29/62/14afa9e792383c66cc0a3b872a06ded6e4ed1079c7d35de274f11d27064e/coverage-7.13.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7464663eaca6adba4175f6c19354feea61ebbdd735563a03d1e472c7072d27bb", size = 249724, upload-time = "2025-12-08T13:13:58.692Z" }, - { url = "https://files.pythonhosted.org/packages/31/b7/333f3dab2939070613696ab3ee91738950f0467778c6e5a5052e840646b7/coverage-7.13.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8069e831f205d2ff1f3d355e82f511eb7c5522d7d413f5db5756b772ec8697f8", size = 251582, upload-time = "2025-12-08T13:14:00.642Z" }, - { url = "https://files.pythonhosted.org/packages/81/cb/69162bda9381f39b2287265d7e29ee770f7c27c19f470164350a38318764/coverage-7.13.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6fb2d5d272341565f08e962cce14cdf843a08ac43bd621783527adb06b089c4b", size = 249538, upload-time = "2025-12-08T13:14:02.556Z" }, - { url = "https://files.pythonhosted.org/packages/e0/76/350387b56a30f4970abe32b90b2a434f87d29f8b7d4ae40d2e8a85aacfb3/coverage-7.13.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5e70f92ef89bac1ac8a99b3324923b4749f008fdbd7aa9cb35e01d7a284a04f9", size = 249349, upload-time = "2025-12-08T13:14:04.015Z" }, - { url = "https://files.pythonhosted.org/packages/86/0d/7f6c42b8d59f4c7e43ea3059f573c0dcfed98ba46eb43c68c69e52ae095c/coverage-7.13.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4b5de7d4583e60d5fd246dd57fcd3a8aa23c6e118a8c72b38adf666ba8e7e927", size = 251011, upload-time = "2025-12-08T13:14:05.505Z" }, - { url = "https://files.pythonhosted.org/packages/d7/f1/4bb2dff379721bb0b5c649d5c5eaf438462cad824acf32eb1b7ca0c7078e/coverage-7.13.0-cp314-cp314-win32.whl", hash = "sha256:a6c6e16b663be828a8f0b6c5027d36471d4a9f90d28444aa4ced4d48d7d6ae8f", size = 221091, upload-time = "2025-12-08T13:14:07.127Z" }, - { url = "https://files.pythonhosted.org/packages/ba/44/c239da52f373ce379c194b0ee3bcc121020e397242b85f99e0afc8615066/coverage-7.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:0900872f2fdb3ee5646b557918d02279dc3af3dfb39029ac4e945458b13f73bc", size = 221904, upload-time = "2025-12-08T13:14:08.542Z" }, - { url = "https://files.pythonhosted.org/packages/89/1f/b9f04016d2a29c2e4a0307baefefad1a4ec5724946a2b3e482690486cade/coverage-7.13.0-cp314-cp314-win_arm64.whl", hash = "sha256:3a10260e6a152e5f03f26db4a407c4c62d3830b9af9b7c0450b183615f05d43b", size = 220480, upload-time = "2025-12-08T13:14:10.958Z" }, - { url = "https://files.pythonhosted.org/packages/16/d4/364a1439766c8e8647860584171c36010ca3226e6e45b1753b1b249c5161/coverage-7.13.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9097818b6cc1cfb5f174e3263eba4a62a17683bcfe5c4b5d07f4c97fa51fbf28", size = 219074, upload-time = "2025-12-08T13:14:13.345Z" }, - { url = "https://files.pythonhosted.org/packages/ce/f4/71ba8be63351e099911051b2089662c03d5671437a0ec2171823c8e03bec/coverage-7.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0018f73dfb4301a89292c73be6ba5f58722ff79f51593352759c1790ded1cabe", size = 219342, upload-time = "2025-12-08T13:14:15.02Z" }, - { url = "https://files.pythonhosted.org/packages/5e/25/127d8ed03d7711a387d96f132589057213e3aef7475afdaa303412463f22/coverage-7.13.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:166ad2a22ee770f5656e1257703139d3533b4a0b6909af67c6b4a3adc1c98657", size = 260713, upload-time = "2025-12-08T13:14:16.907Z" }, - { url = "https://files.pythonhosted.org/packages/fd/db/559fbb6def07d25b2243663b46ba9eb5a3c6586c0c6f4e62980a68f0ee1c/coverage-7.13.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f6aaef16d65d1787280943f1c8718dc32e9cf141014e4634d64446702d26e0ff", size = 262825, upload-time = "2025-12-08T13:14:18.68Z" }, - { url = "https://files.pythonhosted.org/packages/37/99/6ee5bf7eff884766edb43bd8736b5e1c5144d0fe47498c3779326fe75a35/coverage-7.13.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e999e2dcc094002d6e2c7bbc1fb85b58ba4f465a760a8014d97619330cdbbbf3", size = 265233, upload-time = "2025-12-08T13:14:20.55Z" }, - { url = "https://files.pythonhosted.org/packages/d8/90/92f18fe0356ea69e1f98f688ed80cec39f44e9f09a1f26a1bbf017cc67f2/coverage-7.13.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:00c3d22cf6fb1cf3bf662aaaa4e563be8243a5ed2630339069799835a9cc7f9b", size = 259779, upload-time = "2025-12-08T13:14:22.367Z" }, - { url = "https://files.pythonhosted.org/packages/90/5d/b312a8b45b37a42ea7d27d7d3ff98ade3a6c892dd48d1d503e773503373f/coverage-7.13.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22ccfe8d9bb0d6134892cbe1262493a8c70d736b9df930f3f3afae0fe3ac924d", size = 262700, upload-time = "2025-12-08T13:14:24.309Z" }, - { url = "https://files.pythonhosted.org/packages/63/f8/b1d0de5c39351eb71c366f872376d09386640840a2e09b0d03973d791e20/coverage-7.13.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:9372dff5ea15930fea0445eaf37bbbafbc771a49e70c0aeed8b4e2c2614cc00e", size = 260302, upload-time = "2025-12-08T13:14:26.068Z" }, - { url = "https://files.pythonhosted.org/packages/aa/7c/d42f4435bc40c55558b3109a39e2d456cddcec37434f62a1f1230991667a/coverage-7.13.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:69ac2c492918c2461bc6ace42d0479638e60719f2a4ef3f0815fa2df88e9f940", size = 259136, upload-time = "2025-12-08T13:14:27.604Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d3/23413241dc04d47cfe19b9a65b32a2edd67ecd0b817400c2843ebc58c847/coverage-7.13.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:739c6c051a7540608d097b8e13c76cfa85263ced467168dc6b477bae3df7d0e2", size = 261467, upload-time = "2025-12-08T13:14:29.09Z" }, - { url = "https://files.pythonhosted.org/packages/13/e6/6e063174500eee216b96272c0d1847bf215926786f85c2bd024cf4d02d2f/coverage-7.13.0-cp314-cp314t-win32.whl", hash = "sha256:fe81055d8c6c9de76d60c94ddea73c290b416e061d40d542b24a5871bad498b7", size = 221875, upload-time = "2025-12-08T13:14:31.106Z" }, - { url = "https://files.pythonhosted.org/packages/3b/46/f4fb293e4cbe3620e3ac2a3e8fd566ed33affb5861a9b20e3dd6c1896cbc/coverage-7.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:445badb539005283825959ac9fa4a28f712c214b65af3a2c464f1adc90f5fcbc", size = 222982, upload-time = "2025-12-08T13:14:33.1Z" }, - { url = "https://files.pythonhosted.org/packages/68/62/5b3b9018215ed9733fbd1ae3b2ed75c5de62c3b55377a52cae732e1b7805/coverage-7.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:de7f6748b890708578fc4b7bb967d810aeb6fcc9bff4bb77dbca77dab2f9df6a", size = 221016, upload-time = "2025-12-08T13:14:34.601Z" }, - { url = "https://files.pythonhosted.org/packages/8d/4c/1968f32fb9a2604645827e11ff84a31e59d532e01995f904723b4f5328b3/coverage-7.13.0-py3-none-any.whl", hash = "sha256:850d2998f380b1e266459ca5b47bc9e7daf9af1d070f66317972f382d46f1904", size = 210068, upload-time = "2025-12-08T13:14:36.236Z" }, +version = "7.13.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/23/f9/e92df5e07f3fc8d4c7f9a0f146ef75446bf870351cd37b788cf5897f8079/coverage-7.13.1.tar.gz", hash = "sha256:b7593fe7eb5feaa3fbb461ac79aac9f9fc0387a5ca8080b0c6fe2ca27b091afd", size = 825862, upload-time = "2025-12-28T15:42:56.969Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/9a/3742e58fd04b233df95c012ee9f3dfe04708a5e1d32613bd2d47d4e1be0d/coverage-7.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e1fa280b3ad78eea5be86f94f461c04943d942697e0dac889fa18fff8f5f9147", size = 218633, upload-time = "2025-12-28T15:40:10.165Z" }, + { url = "https://files.pythonhosted.org/packages/7e/45/7e6bdc94d89cd7c8017ce735cf50478ddfe765d4fbf0c24d71d30ea33d7a/coverage-7.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c3d8c679607220979434f494b139dfb00131ebf70bb406553d69c1ff01a5c33d", size = 219147, upload-time = "2025-12-28T15:40:12.069Z" }, + { url = "https://files.pythonhosted.org/packages/f7/38/0d6a258625fd7f10773fe94097dc16937a5f0e3e0cdf3adef67d3ac6baef/coverage-7.13.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:339dc63b3eba969067b00f41f15ad161bf2946613156fb131266d8debc8e44d0", size = 245894, upload-time = "2025-12-28T15:40:13.556Z" }, + { url = "https://files.pythonhosted.org/packages/27/58/409d15ea487986994cbd4d06376e9860e9b157cfbfd402b1236770ab8dd2/coverage-7.13.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:db622b999ffe49cb891f2fff3b340cdc2f9797d01a0a202a0973ba2562501d90", size = 247721, upload-time = "2025-12-28T15:40:15.37Z" }, + { url = "https://files.pythonhosted.org/packages/da/bf/6e8056a83fd7a96c93341f1ffe10df636dd89f26d5e7b9ca511ce3bcf0df/coverage-7.13.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1443ba9acbb593fa7c1c29e011d7c9761545fe35e7652e85ce7f51a16f7e08d", size = 249585, upload-time = "2025-12-28T15:40:17.226Z" }, + { url = "https://files.pythonhosted.org/packages/f4/15/e1daff723f9f5959acb63cbe35b11203a9df77ee4b95b45fffd38b318390/coverage-7.13.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c832ec92c4499ac463186af72f9ed4d8daec15499b16f0a879b0d1c8e5cf4a3b", size = 246597, upload-time = "2025-12-28T15:40:19.028Z" }, + { url = "https://files.pythonhosted.org/packages/74/a6/1efd31c5433743a6ddbc9d37ac30c196bb07c7eab3d74fbb99b924c93174/coverage-7.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:562ec27dfa3f311e0db1ba243ec6e5f6ab96b1edfcfc6cf86f28038bc4961ce6", size = 247626, upload-time = "2025-12-28T15:40:20.846Z" }, + { url = "https://files.pythonhosted.org/packages/6d/9f/1609267dd3e749f57fdd66ca6752567d1c13b58a20a809dc409b263d0b5f/coverage-7.13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4de84e71173d4dada2897e5a0e1b7877e5eefbfe0d6a44edee6ce31d9b8ec09e", size = 245629, upload-time = "2025-12-28T15:40:22.397Z" }, + { url = "https://files.pythonhosted.org/packages/e2/f6/6815a220d5ec2466383d7cc36131b9fa6ecbe95c50ec52a631ba733f306a/coverage-7.13.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:a5a68357f686f8c4d527a2dc04f52e669c2fc1cbde38f6f7eb6a0e58cbd17cae", size = 245901, upload-time = "2025-12-28T15:40:23.836Z" }, + { url = "https://files.pythonhosted.org/packages/ac/58/40576554cd12e0872faf6d2c0eb3bc85f71d78427946ddd19ad65201e2c0/coverage-7.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:77cc258aeb29a3417062758975521eae60af6f79e930d6993555eeac6a8eac29", size = 246505, upload-time = "2025-12-28T15:40:25.421Z" }, + { url = "https://files.pythonhosted.org/packages/3b/77/9233a90253fba576b0eee81707b5781d0e21d97478e5377b226c5b096c0f/coverage-7.13.1-cp310-cp310-win32.whl", hash = "sha256:bb4f8c3c9a9f34423dba193f241f617b08ffc63e27f67159f60ae6baf2dcfe0f", size = 221257, upload-time = "2025-12-28T15:40:27.217Z" }, + { url = "https://files.pythonhosted.org/packages/e0/43/e842ff30c1a0a623ec80db89befb84a3a7aad7bfe44a6ea77d5a3e61fedd/coverage-7.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:c8e2706ceb622bc63bac98ebb10ef5da80ed70fbd8a7999a5076de3afaef0fb1", size = 222191, upload-time = "2025-12-28T15:40:28.916Z" }, + { url = "https://files.pythonhosted.org/packages/b4/9b/77baf488516e9ced25fc215a6f75d803493fc3f6a1a1227ac35697910c2a/coverage-7.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a55d509a1dc5a5b708b5dad3b5334e07a16ad4c2185e27b40e4dba796ab7f88", size = 218755, upload-time = "2025-12-28T15:40:30.812Z" }, + { url = "https://files.pythonhosted.org/packages/d7/cd/7ab01154e6eb79ee2fab76bf4d89e94c6648116557307ee4ebbb85e5c1bf/coverage-7.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d010d080c4888371033baab27e47c9df7d6fb28d0b7b7adf85a4a49be9298b3", size = 219257, upload-time = "2025-12-28T15:40:32.333Z" }, + { url = "https://files.pythonhosted.org/packages/01/d5/b11ef7863ffbbdb509da0023fad1e9eda1c0eaea61a6d2ea5b17d4ac706e/coverage-7.13.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d938b4a840fb1523b9dfbbb454f652967f18e197569c32266d4d13f37244c3d9", size = 249657, upload-time = "2025-12-28T15:40:34.1Z" }, + { url = "https://files.pythonhosted.org/packages/f7/7c/347280982982383621d29b8c544cf497ae07ac41e44b1ca4903024131f55/coverage-7.13.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bf100a3288f9bb7f919b87eb84f87101e197535b9bd0e2c2b5b3179633324fee", size = 251581, upload-time = "2025-12-28T15:40:36.131Z" }, + { url = "https://files.pythonhosted.org/packages/82/f6/ebcfed11036ade4c0d75fa4453a6282bdd225bc073862766eec184a4c643/coverage-7.13.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef6688db9bf91ba111ae734ba6ef1a063304a881749726e0d3575f5c10a9facf", size = 253691, upload-time = "2025-12-28T15:40:37.626Z" }, + { url = "https://files.pythonhosted.org/packages/02/92/af8f5582787f5d1a8b130b2dcba785fa5e9a7a8e121a0bb2220a6fdbdb8a/coverage-7.13.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0b609fc9cdbd1f02e51f67f51e5aee60a841ef58a68d00d5ee2c0faf357481a3", size = 249799, upload-time = "2025-12-28T15:40:39.47Z" }, + { url = "https://files.pythonhosted.org/packages/24/aa/0e39a2a3b16eebf7f193863323edbff38b6daba711abaaf807d4290cf61a/coverage-7.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c43257717611ff5e9a1d79dce8e47566235ebda63328718d9b65dd640bc832ef", size = 251389, upload-time = "2025-12-28T15:40:40.954Z" }, + { url = "https://files.pythonhosted.org/packages/73/46/7f0c13111154dc5b978900c0ccee2e2ca239b910890e674a77f1363d483e/coverage-7.13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e09fbecc007f7b6afdfb3b07ce5bd9f8494b6856dd4f577d26c66c391b829851", size = 249450, upload-time = "2025-12-28T15:40:42.489Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ca/e80da6769e8b669ec3695598c58eef7ad98b0e26e66333996aee6316db23/coverage-7.13.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:a03a4f3a19a189919c7055098790285cc5c5b0b3976f8d227aea39dbf9f8bfdb", size = 249170, upload-time = "2025-12-28T15:40:44.279Z" }, + { url = "https://files.pythonhosted.org/packages/af/18/9e29baabdec1a8644157f572541079b4658199cfd372a578f84228e860de/coverage-7.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3820778ea1387c2b6a818caec01c63adc5b3750211af6447e8dcfb9b6f08dbba", size = 250081, upload-time = "2025-12-28T15:40:45.748Z" }, + { url = "https://files.pythonhosted.org/packages/00/f8/c3021625a71c3b2f516464d322e41636aea381018319050a8114105872ee/coverage-7.13.1-cp311-cp311-win32.whl", hash = "sha256:ff10896fa55167371960c5908150b434b71c876dfab97b69478f22c8b445ea19", size = 221281, upload-time = "2025-12-28T15:40:47.232Z" }, + { url = "https://files.pythonhosted.org/packages/27/56/c216625f453df6e0559ed666d246fcbaaa93f3aa99eaa5080cea1229aa3d/coverage-7.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:a998cc0aeeea4c6d5622a3754da5a493055d2d95186bad877b0a34ea6e6dbe0a", size = 222215, upload-time = "2025-12-28T15:40:49.19Z" }, + { url = "https://files.pythonhosted.org/packages/5c/9a/be342e76f6e531cae6406dc46af0d350586f24d9b67fdfa6daee02df71af/coverage-7.13.1-cp311-cp311-win_arm64.whl", hash = "sha256:fea07c1a39a22614acb762e3fbbb4011f65eedafcb2948feeef641ac78b4ee5c", size = 220886, upload-time = "2025-12-28T15:40:51.067Z" }, + { url = "https://files.pythonhosted.org/packages/ce/8a/87af46cccdfa78f53db747b09f5f9a21d5fc38d796834adac09b30a8ce74/coverage-7.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6f34591000f06e62085b1865c9bc5f7858df748834662a51edadfd2c3bfe0dd3", size = 218927, upload-time = "2025-12-28T15:40:52.814Z" }, + { url = "https://files.pythonhosted.org/packages/82/a8/6e22fdc67242a4a5a153f9438d05944553121c8f4ba70cb072af4c41362e/coverage-7.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b67e47c5595b9224599016e333f5ec25392597a89d5744658f837d204e16c63e", size = 219288, upload-time = "2025-12-28T15:40:54.262Z" }, + { url = "https://files.pythonhosted.org/packages/d0/0a/853a76e03b0f7c4375e2ca025df45c918beb367f3e20a0a8e91967f6e96c/coverage-7.13.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3e7b8bd70c48ffb28461ebe092c2345536fb18bbbf19d287c8913699735f505c", size = 250786, upload-time = "2025-12-28T15:40:56.059Z" }, + { url = "https://files.pythonhosted.org/packages/ea/b4/694159c15c52b9f7ec7adf49d50e5f8ee71d3e9ef38adb4445d13dd56c20/coverage-7.13.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c223d078112e90dc0e5c4e35b98b9584164bea9fbbd221c0b21c5241f6d51b62", size = 253543, upload-time = "2025-12-28T15:40:57.585Z" }, + { url = "https://files.pythonhosted.org/packages/96/b2/7f1f0437a5c855f87e17cf5d0dc35920b6440ff2b58b1ba9788c059c26c8/coverage-7.13.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:794f7c05af0763b1bbd1b9e6eff0e52ad068be3b12cd96c87de037b01390c968", size = 254635, upload-time = "2025-12-28T15:40:59.443Z" }, + { url = "https://files.pythonhosted.org/packages/e9/d1/73c3fdb8d7d3bddd9473c9c6a2e0682f09fc3dfbcb9c3f36412a7368bcab/coverage-7.13.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0642eae483cc8c2902e4af7298bf886d605e80f26382124cddc3967c2a3df09e", size = 251202, upload-time = "2025-12-28T15:41:01.328Z" }, + { url = "https://files.pythonhosted.org/packages/66/3c/f0edf75dcc152f145d5598329e864bbbe04ab78660fe3e8e395f9fff010f/coverage-7.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9f5e772ed5fef25b3de9f2008fe67b92d46831bd2bc5bdc5dd6bfd06b83b316f", size = 252566, upload-time = "2025-12-28T15:41:03.319Z" }, + { url = "https://files.pythonhosted.org/packages/17/b3/e64206d3c5f7dcbceafd14941345a754d3dbc78a823a6ed526e23b9cdaab/coverage-7.13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:45980ea19277dc0a579e432aef6a504fe098ef3a9032ead15e446eb0f1191aee", size = 250711, upload-time = "2025-12-28T15:41:06.411Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ad/28a3eb970a8ef5b479ee7f0c484a19c34e277479a5b70269dc652b730733/coverage-7.13.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:e4f18eca6028ffa62adbd185a8f1e1dd242f2e68164dba5c2b74a5204850b4cf", size = 250278, upload-time = "2025-12-28T15:41:08.285Z" }, + { url = "https://files.pythonhosted.org/packages/54/e3/c8f0f1a93133e3e1291ca76cbb63565bd4b5c5df63b141f539d747fff348/coverage-7.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8dca5590fec7a89ed6826fce625595279e586ead52e9e958d3237821fbc750c", size = 252154, upload-time = "2025-12-28T15:41:09.969Z" }, + { url = "https://files.pythonhosted.org/packages/d0/bf/9939c5d6859c380e405b19e736321f1c7d402728792f4c752ad1adcce005/coverage-7.13.1-cp312-cp312-win32.whl", hash = "sha256:ff86d4e85188bba72cfb876df3e11fa243439882c55957184af44a35bd5880b7", size = 221487, upload-time = "2025-12-28T15:41:11.468Z" }, + { url = "https://files.pythonhosted.org/packages/fa/dc/7282856a407c621c2aad74021680a01b23010bb8ebf427cf5eacda2e876f/coverage-7.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:16cc1da46c04fb0fb128b4dc430b78fa2aba8a6c0c9f8eb391fd5103409a6ac6", size = 222299, upload-time = "2025-12-28T15:41:13.386Z" }, + { url = "https://files.pythonhosted.org/packages/10/79/176a11203412c350b3e9578620013af35bcdb79b651eb976f4a4b32044fa/coverage-7.13.1-cp312-cp312-win_arm64.whl", hash = "sha256:8d9bc218650022a768f3775dd7fdac1886437325d8d295d923ebcfef4892ad5c", size = 220941, upload-time = "2025-12-28T15:41:14.975Z" }, + { url = "https://files.pythonhosted.org/packages/a3/a4/e98e689347a1ff1a7f67932ab535cef82eb5e78f32a9e4132e114bbb3a0a/coverage-7.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cb237bfd0ef4d5eb6a19e29f9e528ac67ac3be932ea6b44fb6cc09b9f3ecff78", size = 218951, upload-time = "2025-12-28T15:41:16.653Z" }, + { url = "https://files.pythonhosted.org/packages/32/33/7cbfe2bdc6e2f03d6b240d23dc45fdaf3fd270aaf2d640be77b7f16989ab/coverage-7.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1dcb645d7e34dcbcc96cd7c132b1fc55c39263ca62eb961c064eb3928997363b", size = 219325, upload-time = "2025-12-28T15:41:18.609Z" }, + { url = "https://files.pythonhosted.org/packages/59/f6/efdabdb4929487baeb7cb2a9f7dac457d9356f6ad1b255be283d58b16316/coverage-7.13.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3d42df8201e00384736f0df9be2ced39324c3907607d17d50d50116c989d84cd", size = 250309, upload-time = "2025-12-28T15:41:20.629Z" }, + { url = "https://files.pythonhosted.org/packages/12/da/91a52516e9d5aea87d32d1523f9cdcf7a35a3b298e6be05d6509ba3cfab2/coverage-7.13.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa3edde1aa8807de1d05934982416cb3ec46d1d4d91e280bcce7cca01c507992", size = 252907, upload-time = "2025-12-28T15:41:22.257Z" }, + { url = "https://files.pythonhosted.org/packages/75/38/f1ea837e3dc1231e086db1638947e00d264e7e8c41aa8ecacf6e1e0c05f4/coverage-7.13.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9edd0e01a343766add6817bc448408858ba6b489039eaaa2018474e4001651a4", size = 254148, upload-time = "2025-12-28T15:41:23.87Z" }, + { url = "https://files.pythonhosted.org/packages/7f/43/f4f16b881aaa34954ba446318dea6b9ed5405dd725dd8daac2358eda869a/coverage-7.13.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:985b7836931d033570b94c94713c6dba5f9d3ff26045f72c3e5dbc5fe3361e5a", size = 250515, upload-time = "2025-12-28T15:41:25.437Z" }, + { url = "https://files.pythonhosted.org/packages/84/34/8cba7f00078bd468ea914134e0144263194ce849ec3baad187ffb6203d1c/coverage-7.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ffed1e4980889765c84a5d1a566159e363b71d6b6fbaf0bebc9d3c30bc016766", size = 252292, upload-time = "2025-12-28T15:41:28.459Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a4/cffac66c7652d84ee4ac52d3ccb94c015687d3b513f9db04bfcac2ac800d/coverage-7.13.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8842af7f175078456b8b17f1b73a0d16a65dcbdc653ecefeb00a56b3c8c298c4", size = 250242, upload-time = "2025-12-28T15:41:30.02Z" }, + { url = "https://files.pythonhosted.org/packages/f4/78/9a64d462263dde416f3c0067efade7b52b52796f489b1037a95b0dc389c9/coverage-7.13.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:ccd7a6fca48ca9c131d9b0a2972a581e28b13416fc313fb98b6d24a03ce9a398", size = 250068, upload-time = "2025-12-28T15:41:32.007Z" }, + { url = "https://files.pythonhosted.org/packages/69/c8/a8994f5fece06db7c4a97c8fc1973684e178599b42e66280dded0524ef00/coverage-7.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0403f647055de2609be776965108447deb8e384fe4a553c119e3ff6bfbab4784", size = 251846, upload-time = "2025-12-28T15:41:33.946Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f7/91fa73c4b80305c86598a2d4e54ba22df6bf7d0d97500944af7ef155d9f7/coverage-7.13.1-cp313-cp313-win32.whl", hash = "sha256:549d195116a1ba1e1ae2f5ca143f9777800f6636eab917d4f02b5310d6d73461", size = 221512, upload-time = "2025-12-28T15:41:35.519Z" }, + { url = "https://files.pythonhosted.org/packages/45/0b/0768b4231d5a044da8f75e097a8714ae1041246bb765d6b5563bab456735/coverage-7.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:5899d28b5276f536fcf840b18b61a9fce23cc3aec1d114c44c07fe94ebeaa500", size = 222321, upload-time = "2025-12-28T15:41:37.371Z" }, + { url = "https://files.pythonhosted.org/packages/9b/b8/bdcb7253b7e85157282450262008f1366aa04663f3e3e4c30436f596c3e2/coverage-7.13.1-cp313-cp313-win_arm64.whl", hash = "sha256:868a2fae76dfb06e87291bcbd4dcbcc778a8500510b618d50496e520bd94d9b9", size = 220949, upload-time = "2025-12-28T15:41:39.553Z" }, + { url = "https://files.pythonhosted.org/packages/70/52/f2be52cc445ff75ea8397948c96c1b4ee14f7f9086ea62fc929c5ae7b717/coverage-7.13.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:67170979de0dacac3f3097d02b0ad188d8edcea44ccc44aaa0550af49150c7dc", size = 219643, upload-time = "2025-12-28T15:41:41.567Z" }, + { url = "https://files.pythonhosted.org/packages/47/79/c85e378eaa239e2edec0c5523f71542c7793fe3340954eafb0bc3904d32d/coverage-7.13.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f80e2bb21bfab56ed7405c2d79d34b5dc0bc96c2c1d2a067b643a09fb756c43a", size = 219997, upload-time = "2025-12-28T15:41:43.418Z" }, + { url = "https://files.pythonhosted.org/packages/fe/9b/b1ade8bfb653c0bbce2d6d6e90cc6c254cbb99b7248531cc76253cb4da6d/coverage-7.13.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f83351e0f7dcdb14d7326c3d8d8c4e915fa685cbfdc6281f9470d97a04e9dfe4", size = 261296, upload-time = "2025-12-28T15:41:45.207Z" }, + { url = "https://files.pythonhosted.org/packages/1f/af/ebf91e3e1a2473d523e87e87fd8581e0aa08741b96265730e2d79ce78d8d/coverage-7.13.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb3f6562e89bad0110afbe64e485aac2462efdce6232cdec7862a095dc3412f6", size = 263363, upload-time = "2025-12-28T15:41:47.163Z" }, + { url = "https://files.pythonhosted.org/packages/c4/8b/fb2423526d446596624ac7fde12ea4262e66f86f5120114c3cfd0bb2befa/coverage-7.13.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77545b5dcda13b70f872c3b5974ac64c21d05e65b1590b441c8560115dc3a0d1", size = 265783, upload-time = "2025-12-28T15:41:49.03Z" }, + { url = "https://files.pythonhosted.org/packages/9b/26/ef2adb1e22674913b89f0fe7490ecadcef4a71fa96f5ced90c60ec358789/coverage-7.13.1-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a4d240d260a1aed814790bbe1f10a5ff31ce6c21bc78f0da4a1e8268d6c80dbd", size = 260508, upload-time = "2025-12-28T15:41:51.035Z" }, + { url = "https://files.pythonhosted.org/packages/ce/7d/f0f59b3404caf662e7b5346247883887687c074ce67ba453ea08c612b1d5/coverage-7.13.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d2287ac9360dec3837bfdad969963a5d073a09a85d898bd86bea82aa8876ef3c", size = 263357, upload-time = "2025-12-28T15:41:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/1a/b1/29896492b0b1a047604d35d6fa804f12818fa30cdad660763a5f3159e158/coverage-7.13.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:0d2c11f3ea4db66b5cbded23b20185c35066892c67d80ec4be4bab257b9ad1e0", size = 260978, upload-time = "2025-12-28T15:41:54.589Z" }, + { url = "https://files.pythonhosted.org/packages/48/f2/971de1238a62e6f0a4128d37adadc8bb882ee96afbe03ff1570291754629/coverage-7.13.1-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:3fc6a169517ca0d7ca6846c3c5392ef2b9e38896f61d615cb75b9e7134d4ee1e", size = 259877, upload-time = "2025-12-28T15:41:56.263Z" }, + { url = "https://files.pythonhosted.org/packages/6a/fc/0474efcbb590ff8628830e9aaec5f1831594874360e3251f1fdec31d07a3/coverage-7.13.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d10a2ed46386e850bb3de503a54f9fe8192e5917fcbb143bfef653a9355e9a53", size = 262069, upload-time = "2025-12-28T15:41:58.093Z" }, + { url = "https://files.pythonhosted.org/packages/88/4f/3c159b7953db37a7b44c0eab8a95c37d1aa4257c47b4602c04022d5cb975/coverage-7.13.1-cp313-cp313t-win32.whl", hash = "sha256:75a6f4aa904301dab8022397a22c0039edc1f51e90b83dbd4464b8a38dc87842", size = 222184, upload-time = "2025-12-28T15:41:59.763Z" }, + { url = "https://files.pythonhosted.org/packages/58/a5/6b57d28f81417f9335774f20679d9d13b9a8fb90cd6160957aa3b54a2379/coverage-7.13.1-cp313-cp313t-win_amd64.whl", hash = "sha256:309ef5706e95e62578cda256b97f5e097916a2c26247c287bbe74794e7150df2", size = 223250, upload-time = "2025-12-28T15:42:01.52Z" }, + { url = "https://files.pythonhosted.org/packages/81/7c/160796f3b035acfbb58be80e02e484548595aa67e16a6345e7910ace0a38/coverage-7.13.1-cp313-cp313t-win_arm64.whl", hash = "sha256:92f980729e79b5d16d221038dbf2e8f9a9136afa072f9d5d6ed4cb984b126a09", size = 221521, upload-time = "2025-12-28T15:42:03.275Z" }, + { url = "https://files.pythonhosted.org/packages/aa/8e/ba0e597560c6563fc0adb902fda6526df5d4aa73bb10adf0574d03bd2206/coverage-7.13.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:97ab3647280d458a1f9adb85244e81587505a43c0c7cff851f5116cd2814b894", size = 218996, upload-time = "2025-12-28T15:42:04.978Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8e/764c6e116f4221dc7aa26c4061181ff92edb9c799adae6433d18eeba7a14/coverage-7.13.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8f572d989142e0908e6acf57ad1b9b86989ff057c006d13b76c146ec6a20216a", size = 219326, upload-time = "2025-12-28T15:42:06.691Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a6/6130dc6d8da28cdcbb0f2bf8865aeca9b157622f7c0031e48c6cf9a0e591/coverage-7.13.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d72140ccf8a147e94274024ff6fd8fb7811354cf7ef88b1f0a988ebaa5bc774f", size = 250374, upload-time = "2025-12-28T15:42:08.786Z" }, + { url = "https://files.pythonhosted.org/packages/82/2b/783ded568f7cd6b677762f780ad338bf4b4750205860c17c25f7c708995e/coverage-7.13.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d3c9f051b028810f5a87c88e5d6e9af3c0ff32ef62763bf15d29f740453ca909", size = 252882, upload-time = "2025-12-28T15:42:10.515Z" }, + { url = "https://files.pythonhosted.org/packages/cd/b2/9808766d082e6a4d59eb0cc881a57fc1600eb2c5882813eefff8254f71b5/coverage-7.13.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f398ba4df52d30b1763f62eed9de5620dcde96e6f491f4c62686736b155aa6e4", size = 254218, upload-time = "2025-12-28T15:42:12.208Z" }, + { url = "https://files.pythonhosted.org/packages/44/ea/52a985bb447c871cb4d2e376e401116520991b597c85afdde1ea9ef54f2c/coverage-7.13.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:132718176cc723026d201e347f800cd1a9e4b62ccd3f82476950834dad501c75", size = 250391, upload-time = "2025-12-28T15:42:14.21Z" }, + { url = "https://files.pythonhosted.org/packages/7f/1d/125b36cc12310718873cfc8209ecfbc1008f14f4f5fa0662aa608e579353/coverage-7.13.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9e549d642426e3579b3f4b92d0431543b012dcb6e825c91619d4e93b7363c3f9", size = 252239, upload-time = "2025-12-28T15:42:16.292Z" }, + { url = "https://files.pythonhosted.org/packages/6a/16/10c1c164950cade470107f9f14bbac8485f8fb8515f515fca53d337e4a7f/coverage-7.13.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:90480b2134999301eea795b3a9dbf606c6fbab1b489150c501da84a959442465", size = 250196, upload-time = "2025-12-28T15:42:18.54Z" }, + { url = "https://files.pythonhosted.org/packages/2a/c6/cd860fac08780c6fd659732f6ced1b40b79c35977c1356344e44d72ba6c4/coverage-7.13.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e825dbb7f84dfa24663dd75835e7257f8882629fc11f03ecf77d84a75134b864", size = 250008, upload-time = "2025-12-28T15:42:20.365Z" }, + { url = "https://files.pythonhosted.org/packages/f0/3a/a8c58d3d38f82a5711e1e0a67268362af48e1a03df27c03072ac30feefcf/coverage-7.13.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:623dcc6d7a7ba450bbdbeedbaa0c42b329bdae16491af2282f12a7e809be7eb9", size = 251671, upload-time = "2025-12-28T15:42:22.114Z" }, + { url = "https://files.pythonhosted.org/packages/f0/bc/fd4c1da651d037a1e3d53e8cb3f8182f4b53271ffa9a95a2e211bacc0349/coverage-7.13.1-cp314-cp314-win32.whl", hash = "sha256:6e73ebb44dca5f708dc871fe0b90cf4cff1a13f9956f747cc87b535a840386f5", size = 221777, upload-time = "2025-12-28T15:42:23.919Z" }, + { url = "https://files.pythonhosted.org/packages/4b/50/71acabdc8948464c17e90b5ffd92358579bd0910732c2a1c9537d7536aa6/coverage-7.13.1-cp314-cp314-win_amd64.whl", hash = "sha256:be753b225d159feb397bd0bf91ae86f689bad0da09d3b301478cd39b878ab31a", size = 222592, upload-time = "2025-12-28T15:42:25.619Z" }, + { url = "https://files.pythonhosted.org/packages/f7/c8/a6fb943081bb0cc926499c7907731a6dc9efc2cbdc76d738c0ab752f1a32/coverage-7.13.1-cp314-cp314-win_arm64.whl", hash = "sha256:228b90f613b25ba0019361e4ab81520b343b622fc657daf7e501c4ed6a2366c0", size = 221169, upload-time = "2025-12-28T15:42:27.629Z" }, + { url = "https://files.pythonhosted.org/packages/16/61/d5b7a0a0e0e40d62e59bc8c7aa1afbd86280d82728ba97f0673b746b78e2/coverage-7.13.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:60cfb538fe9ef86e5b2ab0ca8fc8d62524777f6c611dcaf76dc16fbe9b8e698a", size = 219730, upload-time = "2025-12-28T15:42:29.306Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2c/8881326445fd071bb49514d1ce97d18a46a980712b51fee84f9ab42845b4/coverage-7.13.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:57dfc8048c72ba48a8c45e188d811e5efd7e49b387effc8fb17e97936dde5bf6", size = 220001, upload-time = "2025-12-28T15:42:31.319Z" }, + { url = "https://files.pythonhosted.org/packages/b5/d7/50de63af51dfa3a7f91cc37ad8fcc1e244b734232fbc8b9ab0f3c834a5cd/coverage-7.13.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3f2f725aa3e909b3c5fdb8192490bdd8e1495e85906af74fe6e34a2a77ba0673", size = 261370, upload-time = "2025-12-28T15:42:32.992Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2c/d31722f0ec918fd7453b2758312729f645978d212b410cd0f7c2aed88a94/coverage-7.13.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9ee68b21909686eeb21dfcba2c3b81fee70dcf38b140dcd5aa70680995fa3aa5", size = 263485, upload-time = "2025-12-28T15:42:34.759Z" }, + { url = "https://files.pythonhosted.org/packages/fa/7a/2c114fa5c5fc08ba0777e4aec4c97e0b4a1afcb69c75f1f54cff78b073ab/coverage-7.13.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:724b1b270cb13ea2e6503476e34541a0b1f62280bc997eab443f87790202033d", size = 265890, upload-time = "2025-12-28T15:42:36.517Z" }, + { url = "https://files.pythonhosted.org/packages/65/d9/f0794aa1c74ceabc780fe17f6c338456bbc4e96bd950f2e969f48ac6fb20/coverage-7.13.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:916abf1ac5cf7eb16bc540a5bf75c71c43a676f5c52fcb9fe75a2bd75fb944e8", size = 260445, upload-time = "2025-12-28T15:42:38.646Z" }, + { url = "https://files.pythonhosted.org/packages/49/23/184b22a00d9bb97488863ced9454068c79e413cb23f472da6cbddc6cfc52/coverage-7.13.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:776483fd35b58d8afe3acbd9988d5de592ab6da2d2a865edfdbc9fdb43e7c486", size = 263357, upload-time = "2025-12-28T15:42:40.788Z" }, + { url = "https://files.pythonhosted.org/packages/7d/bd/58af54c0c9199ea4190284f389005779d7daf7bf3ce40dcd2d2b2f96da69/coverage-7.13.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b6f3b96617e9852703f5b633ea01315ca45c77e879584f283c44127f0f1ec564", size = 260959, upload-time = "2025-12-28T15:42:42.808Z" }, + { url = "https://files.pythonhosted.org/packages/4b/2a/6839294e8f78a4891bf1df79d69c536880ba2f970d0ff09e7513d6e352e9/coverage-7.13.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:bd63e7b74661fed317212fab774e2a648bc4bb09b35f25474f8e3325d2945cd7", size = 259792, upload-time = "2025-12-28T15:42:44.818Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c3/528674d4623283310ad676c5af7414b9850ab6d55c2300e8aa4b945ec554/coverage-7.13.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:933082f161bbb3e9f90d00990dc956120f608cdbcaeea15c4d897f56ef4fe416", size = 262123, upload-time = "2025-12-28T15:42:47.108Z" }, + { url = "https://files.pythonhosted.org/packages/06/c5/8c0515692fb4c73ac379d8dc09b18eaf0214ecb76ea6e62467ba7a1556ff/coverage-7.13.1-cp314-cp314t-win32.whl", hash = "sha256:18be793c4c87de2965e1c0f060f03d9e5aff66cfeae8e1dbe6e5b88056ec153f", size = 222562, upload-time = "2025-12-28T15:42:49.144Z" }, + { url = "https://files.pythonhosted.org/packages/05/0e/c0a0c4678cb30dac735811db529b321d7e1c9120b79bd728d4f4d6b010e9/coverage-7.13.1-cp314-cp314t-win_amd64.whl", hash = "sha256:0e42e0ec0cd3e0d851cb3c91f770c9301f48647cb2877cb78f74bdaa07639a79", size = 223670, upload-time = "2025-12-28T15:42:51.218Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5f/b177aa0011f354abf03a8f30a85032686d290fdeed4222b27d36b4372a50/coverage-7.13.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eaecf47ef10c72ece9a2a92118257da87e460e113b83cc0d2905cbbe931792b4", size = 221707, upload-time = "2025-12-28T15:42:53.034Z" }, + { url = "https://files.pythonhosted.org/packages/cc/48/d9f421cb8da5afaa1a64570d9989e00fb7955e6acddc5a12979f7666ef60/coverage-7.13.1-py3-none-any.whl", hash = "sha256:2016745cb3ba554469d02819d78958b571792bb68e31302610e898f80dd3a573", size = 210722, upload-time = "2025-12-28T15:42:54.901Z" }, ] [package.optional-dependencies] @@ -1095,45 +1095,45 @@ wheels = [ [[package]] name = "cython" -version = "3.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/39/e1/c0d92b1258722e1bc62a12e630c33f1f842fdab53fd8cd5de2f75c6449a9/cython-3.2.3.tar.gz", hash = "sha256:f13832412d633376ffc08d751cc18ed0d7d00a398a4065e2871db505258748a6", size = 3276650, upload-time = "2025-12-14T07:50:34.691Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/77/71c2aef97648548116ca22197c191f8293178f9d4e939e2cb4cbe912619e/cython-3.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:55c0157a5940fbf0b054508207fe0fc5cc796d0532af492c0fa35b5b41a883f7", size = 2959265, upload-time = "2025-12-14T07:50:46.035Z" }, - { url = "https://files.pythonhosted.org/packages/76/b8/bc06c6427dfe46164d36c0b35e45028d0427faac28d218e065da05edcce5/cython-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51fd1a56d0fc682c05ecc44f11927dbe28dd2867c30148557b62d7d1017a13d8", size = 3368365, upload-time = "2025-12-14T07:50:48.111Z" }, - { url = "https://files.pythonhosted.org/packages/c7/3e/7550e90ccd6493842dede63ac484181d4a254ed7332eaad01253ab789d36/cython-3.2.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1309bdce06f767e8514377f44b3a5b9e5b91e58af1348010cca10b572e1852ad", size = 3536996, upload-time = "2025-12-14T07:50:50.175Z" }, - { url = "https://files.pythonhosted.org/packages/33/94/df8d414d8fb3afd5a0350245ebc589e5bc25b655342ad7341e5cfc869cf5/cython-3.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:6b6dd6b7aca8447b2a6779b314cc402f1e4990754507a88477e535b3c8b41ad1", size = 2765625, upload-time = "2025-12-14T07:50:51.962Z" }, - { url = "https://files.pythonhosted.org/packages/c3/85/77315c92d29d782bee1b36e30b8d76ad1e731cb7ea0af17e285885f3bb68/cython-3.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c041f7e338cca2422e0924716b04fabeda57636214324fc1941396acce99e7c7", size = 2951618, upload-time = "2025-12-14T07:50:53.883Z" }, - { url = "https://files.pythonhosted.org/packages/cb/dd/a8209e0d424a0207ddb4a3097a97b667027af3cfada762d85f3bed08ccf8/cython-3.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:283262b8f902323ceb6ed3b643f275a2a963e7ab059f0714a467933383cbc56d", size = 3243636, upload-time = "2025-12-14T07:50:56.346Z" }, - { url = "https://files.pythonhosted.org/packages/1f/2d/bc1927fd7174f7928b86cc9b83589d39592b9273c8b1d2295ca0c0071984/cython-3.2.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22a624290c2883387b2c2cfb5224c15bff21432c6a2cf0c23ac8df3dcbd45e96", size = 3378528, upload-time = "2025-12-14T07:50:57.988Z" }, - { url = "https://files.pythonhosted.org/packages/ad/10/5add6a6e1721f9c36b5d5b4f3b75fa7af43196e4f2a474921a7277e31b7a/cython-3.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:26404441f733fd1cfb0dd9c45477f501437e7d51fad05bb402bd2feb4e127aa3", size = 2769341, upload-time = "2025-12-14T07:50:59.581Z" }, - { url = "https://files.pythonhosted.org/packages/b4/14/d16282d17c9eb2f78ca9ccd5801fed22f6c3360f5a55dbcce3c93cc70352/cython-3.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cf210228c15b5c625824d8e31d43b6fea25f9e13c81dac632f2f7d838e0229a5", size = 2968471, upload-time = "2025-12-14T07:51:01.207Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3c/46304a942dac5a636701c55f5b05ec00ad151e6722cd068fe3d0993349bb/cython-3.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f5bf0cebeb4147e172a114437d3fce5a507595d8fdd821be792b1bb25c691514", size = 3223581, upload-time = "2025-12-14T07:51:04.336Z" }, - { url = "https://files.pythonhosted.org/packages/29/ad/15da606d71f40bcf2c405f84ca3d4195cb252f4eaa2f551fe6b2e630ee7c/cython-3.2.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d1f8700ba89c977438744f083890d87187f15709507a5489e0f6d682053b7fa0", size = 3391391, upload-time = "2025-12-14T07:51:05.998Z" }, - { url = "https://files.pythonhosted.org/packages/51/9e/045b35eb678682edc3e2d57112cf5ac3581a9ef274eb220b638279195678/cython-3.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:25732f3981a93407826297f4423206e5e22c3cfccfc74e37bf444453bbdc076f", size = 2756814, upload-time = "2025-12-14T07:51:07.759Z" }, - { url = "https://files.pythonhosted.org/packages/d5/c2/35cedff7fcbc844e4e872c6719df5ece26551e14f37d76eb41c412d778c6/cython-3.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1d097ad4686b58b8c03d760d08eca28f79878d404ef7452c49636170571654e0", size = 2959019, upload-time = "2025-12-14T07:51:09.429Z" }, - { url = "https://files.pythonhosted.org/packages/44/1b/05787f71b4834a28b19a0a3edee44537c239924f9a7d96ea38ebba365e5c/cython-3.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2a18f2e3bcd018416157d0a83446e29b4a31437ab79061fe5504c077e70389d0", size = 3212912, upload-time = "2025-12-14T07:51:11.512Z" }, - { url = "https://files.pythonhosted.org/packages/48/fe/f5d560e3a2eb1891d55f465d17437179d9f5fbd4f46aebf2c00d01fa5e80/cython-3.2.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:73afc824896ffaf22bf8122d0a7107f0120e3188a353bdcfa92317fc0d9a87ce", size = 3375222, upload-time = "2025-12-14T07:51:13.762Z" }, - { url = "https://files.pythonhosted.org/packages/3d/b9/dcf5a68ac2ef89424657b03f751ca799861db097fa83bd52068bed198120/cython-3.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:9aa1a8abf3d8bb53cc19cfaa21c004afad8d4ccb17513f8aa11a788d1f525abd", size = 2754908, upload-time = "2025-12-14T07:51:15.575Z" }, - { url = "https://files.pythonhosted.org/packages/5c/07/93c65fbee4ab419767b7e54937e91cacae5c71d2d1277cc882ea3b1ce777/cython-3.2.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80f20369d7aaf4e76cfef902025256918a5cc6eb0aed6d8783e4b1c563e4f6c4", size = 2969476, upload-time = "2025-12-14T07:51:17.213Z" }, - { url = "https://files.pythonhosted.org/packages/00/ad/736b4cbcb42740608cae1315c790dd6a4419705545f0615af4074e267ea3/cython-3.2.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60d19376252722241a3d3ec8a695c5cae4deb053486d2e5f9a40cb569a0cf984", size = 3258714, upload-time = "2025-12-14T07:51:18.925Z" }, - { url = "https://files.pythonhosted.org/packages/a2/74/03c08a723a319640f0bb3eaca947e009caa2eb48957ff735bfd77b0be060/cython-3.2.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e4293f1861480b397809a6f021a6c12e15e918feae1c7add80c99d07af206578", size = 3384940, upload-time = "2025-12-14T07:51:20.593Z" }, - { url = "https://files.pythonhosted.org/packages/73/14/0871a0b407fa50257a79c57a608903ed50032c7619d9531451f7090a5ee3/cython-3.2.3-cp314-cp314-win_amd64.whl", hash = "sha256:84330e7c8bf220a82b633678b9f99e10227c8f4c406d67c5552449ab2afedef8", size = 2791923, upload-time = "2025-12-14T07:51:22.292Z" }, - { url = "https://files.pythonhosted.org/packages/43/49/afe1e3df87a770861cf17ba39f4a91f6d22a2571010fc1890b3708360630/cython-3.2.3-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:74f482da8b605c61b4df6ff716d013f20131949cb2fa59b03e63abd36ef5bac0", size = 2874467, upload-time = "2025-12-14T07:51:31.568Z" }, - { url = "https://files.pythonhosted.org/packages/c7/da/044f725a083e28fb4de5bd33d13ec13f0753734b6ae52d4bc07434610cc8/cython-3.2.3-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0a75a04688875b275a6c875565e672325bae04327dd6ec2fc25aeb5c6cf82fce", size = 3211272, upload-time = "2025-12-14T07:51:33.673Z" }, - { url = "https://files.pythonhosted.org/packages/95/14/af02ba6e2e03279f2ca2956e3024a44faed4c8496bda8170b663dc3ba6e8/cython-3.2.3-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6b01b36c9eb1b68c25bddbeef7379f7bfc37f7c9afc044e71840ffab761a2dd0", size = 2856058, upload-time = "2025-12-14T07:51:36.015Z" }, - { url = "https://files.pythonhosted.org/packages/69/16/d254359396c2f099ab154f89b2b35f5b8b0dd21a8102c2c96a7e00291434/cython-3.2.3-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:3829f99d611412288f44ff543e9d2b5c0c83274998b2a6680bbe5cca3539c1fd", size = 2993276, upload-time = "2025-12-14T07:51:37.863Z" }, - { url = "https://files.pythonhosted.org/packages/51/0e/1a071381923e896f751f8fbff2a01c5dc8860a8b9a90066f6ec8df561dc4/cython-3.2.3-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c2365a0c79ab9c0fa86d30a4a6ba7e37fc1be9537c48b79b9d63ee7e08bf2fef", size = 2890843, upload-time = "2025-12-14T07:51:40.409Z" }, - { url = "https://files.pythonhosted.org/packages/f4/46/1e93e10766db988e6bb8e5c6f7e2e90b9e62f1ac8dee4c1a6cf1fc170773/cython-3.2.3-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3141734fb15f8b5e9402b9240f8da8336edecae91742b41c85678c31ab68f66d", size = 3225339, upload-time = "2025-12-14T07:51:42.09Z" }, - { url = "https://files.pythonhosted.org/packages/d4/ae/c284b06ae6a9c95d5883bf8744d10466cf0df64cef041a4c80ccf9fd07bd/cython-3.2.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9a24cc653fad3adbd9cbaa638d80df3aa08a1fe27f62eb35850971c70be680df", size = 3114751, upload-time = "2025-12-14T07:51:44.088Z" }, - { url = "https://files.pythonhosted.org/packages/c6/d6/7795a4775c70256217134195f06b07233cf17b00f8905d5b3d782208af64/cython-3.2.3-cp39-abi3-win32.whl", hash = "sha256:b39dff92db70cbd95528f3b81d70e06bd6d3fc9c1dd91321e4d3b999ece3bceb", size = 2435616, upload-time = "2025-12-14T07:51:46.063Z" }, - { url = "https://files.pythonhosted.org/packages/18/9e/2a3edcb858ad74e6274448dccf32150c532bc6e423f112a71f65ff3b5680/cython-3.2.3-cp39-abi3-win_arm64.whl", hash = "sha256:18edc858e6a52de47fe03ffa97ea14dadf450e20069de0a8aef531006c4bbd93", size = 2440952, upload-time = "2025-12-14T07:51:47.943Z" }, - { url = "https://files.pythonhosted.org/packages/e5/41/54fd429ff8147475fc24ca43246f85d78fb4e747c27f227e68f1594648f1/cython-3.2.3-py3-none-any.whl", hash = "sha256:06a1317097f540d3bb6c7b81ed58a0d8b9dbfa97abf39dfd4c22ee87a6c7241e", size = 1255561, upload-time = "2025-12-14T07:50:31.217Z" }, +version = "3.2.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/85/7574c9cd44b69a27210444b6650f6477f56c75fee1b70d7672d3e4166167/cython-3.2.4.tar.gz", hash = "sha256:84226ecd313b233da27dc2eb3601b4f222b8209c3a7216d8733b031da1dc64e6", size = 3280291, upload-time = "2026-01-04T14:14:14.473Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/10/720e0fb84eab4c927c4dd6b61eb7993f7732dd83d29ba6d73083874eade9/cython-3.2.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02cb0cc0f23b9874ad262d7d2b9560aed9c7e2df07b49b920bda6f2cc9cb505e", size = 2960836, upload-time = "2026-01-04T14:14:51.103Z" }, + { url = "https://files.pythonhosted.org/packages/7d/3d/b26f29092c71c36e0462752885bdfb18c23c176af4de953fdae2772a8941/cython-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f136f379a4a54246facd0eb6f1ee15c3837cb314ce87b677582ec014db4c6845", size = 3370134, upload-time = "2026-01-04T14:14:53.627Z" }, + { url = "https://files.pythonhosted.org/packages/56/9e/539fb0d09e4f5251b5b14f8daf77e71fee021527f1013791038234618b6b/cython-3.2.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:35ab0632186057406ec729374c737c37051d2eacad9d515d94e5a3b3e58a9b02", size = 3537552, upload-time = "2026-01-04T14:14:56.852Z" }, + { url = "https://files.pythonhosted.org/packages/10/c6/82d19a451c050d1be0f05b1a3302267463d391db548f013ee88b5348a8e9/cython-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:ca2399dc75796b785f74fb85c938254fa10c80272004d573c455f9123eceed86", size = 2766191, upload-time = "2026-01-04T14:14:58.709Z" }, + { url = "https://files.pythonhosted.org/packages/85/cc/8f06145ec3efa121c8b1b67f06a640386ddacd77ee3e574da582a21b14ee/cython-3.2.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff9af2134c05e3734064808db95b4dd7341a39af06e8945d05ea358e1741aaed", size = 2953769, upload-time = "2026-01-04T14:15:00.361Z" }, + { url = "https://files.pythonhosted.org/packages/55/b0/706cf830eddd831666208af1b3058c2e0758ae157590909c1f634b53bed9/cython-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67922c9de058a0bfb72d2e75222c52d09395614108c68a76d9800f150296ddb3", size = 3243841, upload-time = "2026-01-04T14:15:02.066Z" }, + { url = "https://files.pythonhosted.org/packages/ac/25/58893afd4ef45f79e3d4db82742fa4ff874b936d67a83c92939053920ccd/cython-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b362819d155fff1482575e804e43e3a8825332d32baa15245f4642022664a3f4", size = 3378083, upload-time = "2026-01-04T14:15:04.248Z" }, + { url = "https://files.pythonhosted.org/packages/32/e4/424a004d7c0d8a4050c81846ebbd22272ececfa9a498cb340aa44fccbec2/cython-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:1a64a112a34ec719b47c01395647e54fb4cf088a511613f9a3a5196694e8e382", size = 2769990, upload-time = "2026-01-04T14:15:06.53Z" }, + { url = "https://files.pythonhosted.org/packages/91/4d/1eb0c7c196a136b1926f4d7f0492a96c6fabd604d77e6cd43b56a3a16d83/cython-3.2.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:64d7f71be3dd6d6d4a4c575bb3a4674ea06d1e1e5e4cd1b9882a2bc40ed3c4c9", size = 2970064, upload-time = "2026-01-04T14:15:08.567Z" }, + { url = "https://files.pythonhosted.org/packages/03/1c/46e34b08bea19a1cdd1e938a4c123e6299241074642db9d81983cef95e9f/cython-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:869487ea41d004f8b92171f42271fbfadb1ec03bede3158705d16cd570d6b891", size = 3226757, upload-time = "2026-01-04T14:15:10.812Z" }, + { url = "https://files.pythonhosted.org/packages/12/33/3298a44d201c45bcf0d769659725ae70e9c6c42adf8032f6d89c8241098d/cython-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:55b6c44cd30821f0b25220ceba6fe636ede48981d2a41b9bbfe3c7902ce44ea7", size = 3388969, upload-time = "2026-01-04T14:15:12.45Z" }, + { url = "https://files.pythonhosted.org/packages/bb/f3/4275cd3ea0a4cf4606f9b92e7f8766478192010b95a7f516d1b7cf22cb10/cython-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:767b143704bdd08a563153448955935844e53b852e54afdc552b43902ed1e235", size = 2756457, upload-time = "2026-01-04T14:15:14.67Z" }, + { url = "https://files.pythonhosted.org/packages/18/b5/1cfca43b7d20a0fdb1eac67313d6bb6b18d18897f82dd0f17436bdd2ba7f/cython-3.2.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:28e8075087a59756f2d059273184b8b639fe0f16cf17470bd91c39921bc154e0", size = 2960506, upload-time = "2026-01-04T14:15:16.733Z" }, + { url = "https://files.pythonhosted.org/packages/71/bb/8f28c39c342621047fea349a82fac712a5e2b37546d2f737bbde48d5143d/cython-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03893c88299a2c868bb741ba6513357acd104e7c42265809fd58dce1456a36fc", size = 3213148, upload-time = "2026-01-04T14:15:18.804Z" }, + { url = "https://files.pythonhosted.org/packages/7a/d2/16fa02f129ed2b627e88d9d9ebd5ade3eeb66392ae5ba85b259d2d52b047/cython-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f81eda419b5ada7b197bbc3c5f4494090e3884521ffd75a3876c93fbf66c9ca8", size = 3375764, upload-time = "2026-01-04T14:15:20.817Z" }, + { url = "https://files.pythonhosted.org/packages/91/3f/deb8f023a5c10c0649eb81332a58c180fad27c7533bb4aae138b5bc34d92/cython-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:83266c356c13c68ffe658b4905279c993d8a5337bb0160fa90c8a3e297ea9a2e", size = 2754238, upload-time = "2026-01-04T14:15:23.001Z" }, + { url = "https://files.pythonhosted.org/packages/ee/d7/3bda3efce0c5c6ce79cc21285dbe6f60369c20364e112f5a506ee8a1b067/cython-3.2.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d4b4fd5332ab093131fa6172e8362f16adef3eac3179fd24bbdc392531cb82fa", size = 2971496, upload-time = "2026-01-04T14:15:25.038Z" }, + { url = "https://files.pythonhosted.org/packages/89/ed/1021ffc80b9c4720b7ba869aea8422c82c84245ef117ebe47a556bdc00c3/cython-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3b5ac54e95f034bc7fb07313996d27cbf71abc17b229b186c1540942d2dc28e", size = 3256146, upload-time = "2026-01-04T14:15:26.741Z" }, + { url = "https://files.pythonhosted.org/packages/0c/51/ca221ec7e94b3c5dc4138dcdcbd41178df1729c1e88c5dfb25f9d30ba3da/cython-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90f43be4eaa6afd58ce20d970bb1657a3627c44e1760630b82aa256ba74b4acb", size = 3383458, upload-time = "2026-01-04T14:15:28.425Z" }, + { url = "https://files.pythonhosted.org/packages/79/2e/1388fc0243240cd54994bb74f26aaaf3b2e22f89d3a2cf8da06d75d46ca2/cython-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:983f9d2bb8a896e16fa68f2b37866ded35fa980195eefe62f764ddc5f9f5ef8e", size = 2791241, upload-time = "2026-01-04T14:15:30.448Z" }, + { url = "https://files.pythonhosted.org/packages/0a/8b/fd393f0923c82be4ec0db712fffb2ff0a7a131707b842c99bf24b549274d/cython-3.2.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:36bf3f5eb56d5281aafabecbaa6ed288bc11db87547bba4e1e52943ae6961ccf", size = 2875622, upload-time = "2026-01-04T14:15:39.749Z" }, + { url = "https://files.pythonhosted.org/packages/73/48/48530d9b9d64ec11dbe0dd3178a5fe1e0b27977c1054ecffb82be81e9b6a/cython-3.2.4-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6d5267f22b6451eb1e2e1b88f6f78a2c9c8733a6ddefd4520d3968d26b824581", size = 3210669, upload-time = "2026-01-04T14:15:41.911Z" }, + { url = "https://files.pythonhosted.org/packages/5e/91/4865fbfef1f6bb4f21d79c46104a53d1a3fa4348286237e15eafb26e0828/cython-3.2.4-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3b6e58f73a69230218d5381817850ce6d0da5bb7e87eb7d528c7027cbba40b06", size = 2856835, upload-time = "2026-01-04T14:15:43.815Z" }, + { url = "https://files.pythonhosted.org/packages/fa/39/60317957dbef179572398253f29d28f75f94ab82d6d39ea3237fb6c89268/cython-3.2.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e71efb20048358a6b8ec604a0532961c50c067b5e63e345e2e359fff72feaee8", size = 2994408, upload-time = "2026-01-04T14:15:45.422Z" }, + { url = "https://files.pythonhosted.org/packages/8d/30/7c24d9292650db4abebce98abc9b49c820d40fa7c87921c0a84c32f4efe7/cython-3.2.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:28b1e363b024c4b8dcf52ff68125e635cb9cb4b0ba997d628f25e32543a71103", size = 2891478, upload-time = "2026-01-04T14:15:47.394Z" }, + { url = "https://files.pythonhosted.org/packages/86/70/03dc3c962cde9da37a93cca8360e576f904d5f9beecfc9d70b1f820d2e5f/cython-3.2.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:31a90b4a2c47bb6d56baeb926948348ec968e932c1ae2c53239164e3e8880ccf", size = 3225663, upload-time = "2026-01-04T14:15:49.446Z" }, + { url = "https://files.pythonhosted.org/packages/b1/97/10b50c38313c37b1300325e2e53f48ea9a2c078a85c0c9572057135e31d5/cython-3.2.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e65e4773021f8dc8532010b4fbebe782c77f9a0817e93886e518c93bd6a44e9d", size = 3115628, upload-time = "2026-01-04T14:15:51.323Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b1/d6a353c9b147848122a0db370863601fdf56de2d983b5c4a6a11e6ee3cd7/cython-3.2.4-cp39-abi3-win32.whl", hash = "sha256:2b1f12c0e4798293d2754e73cd6f35fa5bbdf072bdc14bc6fc442c059ef2d290", size = 2437463, upload-time = "2026-01-04T14:15:53.787Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d8/319a1263b9c33b71343adfd407e5daffd453daef47ebc7b642820a8b68ed/cython-3.2.4-cp39-abi3-win_arm64.whl", hash = "sha256:3b8e62049afef9da931d55de82d8f46c9a147313b69d5ff6af6e9121d545ce7a", size = 2442754, upload-time = "2026-01-04T14:15:55.382Z" }, + { url = "https://files.pythonhosted.org/packages/ff/fa/d3c15189f7c52aaefbaea76fb012119b04b9013f4bf446cb4eb4c26c4e6b/cython-3.2.4-py3-none-any.whl", hash = "sha256:732fc93bc33ae4b14f6afaca663b916c2fdd5dcbfad7114e17fb2434eeaea45c", size = 1257078, upload-time = "2026-01-04T14:14:12.373Z" }, ] [[package]] name = "datasets" -version = "4.4.1" +version = "4.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dill" }, @@ -1143,7 +1143,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "multiprocess" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pandas" }, { name = "pyarrow" }, @@ -1152,9 +1152,9 @@ dependencies = [ { name = "tqdm" }, { name = "xxhash" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/93/bf/0dae295d6d1ba0b1a200a9dd216838464b5bbd05da01407cb1330b377445/datasets-4.4.1.tar.gz", hash = "sha256:80322699aa8c0bbbdb7caa87906da689c3c2e29523cff698775c67f28fdab1fc", size = 585341, upload-time = "2025-11-05T16:00:38.162Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/54/9359803da96bc65439a28fbb014dc2c90b7d4d8034a93b72362b0d40191f/datasets-4.4.2.tar.gz", hash = "sha256:9de16e415c4ba4713eac0493f7c7dc74f3aa21599297f00cc6ddab409cb7b24b", size = 586474, upload-time = "2025-12-19T15:03:09.129Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" }, + { url = "https://files.pythonhosted.org/packages/7b/b5/fefa518c809de7bced5cddb7c21c010da66fa2ae494bda96844a280cc6ce/datasets-4.4.2-py3-none-any.whl", hash = "sha256:6f5ef3417504d9cd663c71c1b90b9a494ff4c2076a2cd6a6e40ceee6ad95befc", size = 512268, upload-time = "2025-12-19T15:03:07.087Z" }, ] [[package]] @@ -1285,7 +1285,7 @@ wheels = [ [[package]] name = "fastapi" -version = "0.125.0" +version = "0.128.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-doc" }, @@ -1293,9 +1293,9 @@ dependencies = [ { name = "starlette" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/17/71/2df15009fb4bdd522a069d2fbca6007c6c5487fce5cb965be00fc335f1d1/fastapi-0.125.0.tar.gz", hash = "sha256:16b532691a33e2c5dee1dac32feb31dc6eb41a3dd4ff29a95f9487cb21c054c0", size = 370550, upload-time = "2025-12-17T21:41:44.15Z" } +sdist = { url = "https://files.pythonhosted.org/packages/52/08/8c8508db6c7b9aae8f7175046af41baad690771c9bcde676419965e338c7/fastapi-0.128.0.tar.gz", hash = "sha256:1cc179e1cef10a6be60ffe429f79b829dce99d8de32d7acb7e6c8dfdf7f2645a", size = 365682, upload-time = "2025-12-27T15:21:13.714Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/34/2f/ff2fcc98f500713368d8b650e1bbc4a0b3ebcdd3e050dcdaad5f5a13fd7e/fastapi-0.125.0-py3-none-any.whl", hash = "sha256:2570ec4f3aecf5cca8f0428aed2398b774fcdfee6c2116f86e80513f2f86a7a1", size = 112888, upload-time = "2025-12-17T21:41:41.286Z" }, + { url = "https://files.pythonhosted.org/packages/5c/05/5cbb59154b093548acd0f4c7c474a118eda06da25aa75c616b72d8fcd92a/fastapi-0.128.0-py3-none-any.whl", hash = "sha256:aebd93f9716ee3b4f4fcfe13ffb7cf308d99c9f3ab5622d8877441072561582d", size = 103094, upload-time = "2025-12-27T15:21:12.154Z" }, ] [[package]] @@ -1315,11 +1315,11 @@ wheels = [ [[package]] name = "filelock" -version = "3.20.1" +version = "3.20.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/23/ce7a1126827cedeb958fc043d61745754464eb56c5937c35bbf2b8e26f34/filelock-3.20.1.tar.gz", hash = "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c", size = 19476, upload-time = "2025-12-15T23:54:28.027Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c1/e0/a75dbe4bca1e7d41307323dad5ea2efdd95408f74ab2de8bd7dba9b51a1a/filelock-3.20.2.tar.gz", hash = "sha256:a2241ff4ddde2a7cebddf78e39832509cb045d18ec1a09d7248d6bfc6bfbbe64", size = 19510, upload-time = "2026-01-02T15:33:32.582Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" }, + { url = "https://files.pythonhosted.org/packages/9a/30/ab407e2ec752aa541704ed8f93c11e2a5d92c168b8a755d818b74a3c5c2d/filelock-3.20.2-py3-none-any.whl", hash = "sha256:fbba7237d6ea277175a32c54bb71ef814a8546d8601269e1bfc388de333974e8", size = 16697, upload-time = "2026-01-02T15:33:31.133Z" }, ] [[package]] @@ -1388,7 +1388,7 @@ dependencies = [ { name = "einops" }, { name = "ninja" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "nvidia-cudnn-frontend" }, { name = "nvidia-cutlass-dsl" }, { name = "nvidia-ml-py" }, @@ -1584,14 +1584,14 @@ wheels = [ [[package]] name = "gitpython" -version = "3.1.45" +version = "3.1.46" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "gitdb" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" } +sdist = { url = "https://files.pythonhosted.org/packages/df/b5/59d16470a1f0dfe8c793f9ef56fd3826093fc52b3bd96d6b9d6c26c7e27b/gitpython-3.1.46.tar.gz", hash = "sha256:400124c7d0ef4ea03f7310ac2fbf7151e09ff97f2a3288d64a440c584a29c37f", size = 215371, upload-time = "2026-01-01T15:37:32.073Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, + { url = "https://files.pythonhosted.org/packages/6a/09/e21df6aef1e1ffc0c816f0522ddc3f6dcded766c3261813131c78a704470/gitpython-3.1.46-py3-none-any.whl", hash = "sha256:79812ed143d9d25b6d176a10bb511de0f9c67b1fa641d82097b0ab90398a2058", size = 208620, upload-time = "2026-01-01T15:37:30.574Z" }, ] [[package]] @@ -2223,7 +2223,7 @@ name = "megatron-core" source = { editable = "." } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] @@ -2250,7 +2250,7 @@ dev = [ { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "tensorstore", version = "0.1.80", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "tqdm" }, - { name = "transformer-engine", extra = ["core-cu13", "pytorch"], marker = "extra == 'extra-13-megatron-core-dev'" }, + { name = "transformer-engine", marker = "extra == 'extra-13-megatron-core-dev'" }, { name = "wget" }, ] lts = [ @@ -2374,7 +2374,7 @@ requires-dist = [ { name = "torch" }, { name = "tqdm", marker = "extra == 'dev'" }, { name = "tqdm", marker = "extra == 'lts'" }, - { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", specifier = ">=2.9.0a0,<2.11.0" }, + { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.11" }, { name = "transformers", marker = "extra == 'mlm'" }, { name = "wandb", marker = "extra == 'mlm'" }, { name = "wget", marker = "extra == 'dev'" }, @@ -2441,7 +2441,7 @@ dependencies = [ { name = "click" }, { name = "multi-storage-client" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pillow" }, { name = "pyyaml" }, { name = "s3fs" }, @@ -2470,7 +2470,7 @@ version = "0.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" } wheels = [ @@ -2965,7 +2965,7 @@ wheels = [ [[package]] name = "numpy" -version = "2.3.5" +version = "2.4.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'linux'", @@ -2977,81 +2977,79 @@ resolution-markers = [ "python_full_version == '3.11.*' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform != 'linux'", ] -sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/77/84dd1d2e34d7e2792a236ba180b5e8fcc1e3e414e761ce0253f63d7f572e/numpy-2.3.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de5672f4a7b200c15a4127042170a694d4df43c992948f5e1af57f0174beed10", size = 17034641, upload-time = "2025-11-16T22:49:19.336Z" }, - { url = "https://files.pythonhosted.org/packages/2a/ea/25e26fa5837106cde46ae7d0b667e20f69cbbc0efd64cba8221411ab26ae/numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218", size = 12528324, upload-time = "2025-11-16T22:49:22.582Z" }, - { url = "https://files.pythonhosted.org/packages/4d/1a/e85f0eea4cf03d6a0228f5c0256b53f2df4bc794706e7df019fc622e47f1/numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d", size = 5356872, upload-time = "2025-11-16T22:49:25.408Z" }, - { url = "https://files.pythonhosted.org/packages/5c/bb/35ef04afd567f4c989c2060cde39211e4ac5357155c1833bcd1166055c61/numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5", size = 6893148, upload-time = "2025-11-16T22:49:27.549Z" }, - { url = "https://files.pythonhosted.org/packages/f2/2b/05bbeb06e2dff5eab512dfc678b1cc5ee94d8ac5956a0885c64b6b26252b/numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7", size = 14557282, upload-time = "2025-11-16T22:49:30.964Z" }, - { url = "https://files.pythonhosted.org/packages/65/fb/2b23769462b34398d9326081fad5655198fcf18966fcb1f1e49db44fbf31/numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4", size = 16897903, upload-time = "2025-11-16T22:49:34.191Z" }, - { url = "https://files.pythonhosted.org/packages/ac/14/085f4cf05fc3f1e8aa95e85404e984ffca9b2275a5dc2b1aae18a67538b8/numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e", size = 16341672, upload-time = "2025-11-16T22:49:37.2Z" }, - { url = "https://files.pythonhosted.org/packages/6f/3b/1f73994904142b2aa290449b3bb99772477b5fd94d787093e4f24f5af763/numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748", size = 18838896, upload-time = "2025-11-16T22:49:39.727Z" }, - { url = "https://files.pythonhosted.org/packages/cd/b9/cf6649b2124f288309ffc353070792caf42ad69047dcc60da85ee85fea58/numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c", size = 6563608, upload-time = "2025-11-16T22:49:42.079Z" }, - { url = "https://files.pythonhosted.org/packages/aa/44/9fe81ae1dcc29c531843852e2874080dc441338574ccc4306b39e2ff6e59/numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c", size = 13078442, upload-time = "2025-11-16T22:49:43.99Z" }, - { url = "https://files.pythonhosted.org/packages/6d/a7/f99a41553d2da82a20a2f22e93c94f928e4490bb447c9ff3c4ff230581d3/numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa", size = 10458555, upload-time = "2025-11-16T22:49:47.092Z" }, - { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" }, - { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" }, - { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" }, - { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" }, - { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" }, - { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" }, - { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" }, - { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" }, - { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" }, - { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" }, - { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" }, - { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" }, - { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" }, - { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" }, - { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" }, - { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" }, - { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" }, - { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" }, - { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" }, - { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" }, - { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" }, - { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" }, - { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" }, - { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" }, - { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" }, - { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" }, - { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" }, - { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" }, - { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" }, - { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" }, - { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" }, - { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" }, - { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" }, - { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" }, - { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" }, - { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" }, - { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" }, - { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" }, - { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" }, - { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" }, - { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" }, - { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" }, - { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" }, - { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" }, - { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" }, - { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" }, - { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" }, - { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" }, - { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" }, - { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" }, - { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" }, - { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" }, - { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" }, - { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" }, - { url = "https://files.pythonhosted.org/packages/c6/65/f9dea8e109371ade9c782b4e4756a82edf9d3366bca495d84d79859a0b79/numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310", size = 16910689, upload-time = "2025-11-16T22:52:23.247Z" }, - { url = "https://files.pythonhosted.org/packages/00/4f/edb00032a8fb92ec0a679d3830368355da91a69cab6f3e9c21b64d0bb986/numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c", size = 12457053, upload-time = "2025-11-16T22:52:26.367Z" }, - { url = "https://files.pythonhosted.org/packages/16/a4/e8a53b5abd500a63836a29ebe145fc1ab1f2eefe1cfe59276020373ae0aa/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18", size = 5285635, upload-time = "2025-11-16T22:52:29.266Z" }, - { url = "https://files.pythonhosted.org/packages/a3/2f/37eeb9014d9c8b3e9c55bc599c68263ca44fdbc12a93e45a21d1d56df737/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff", size = 6801770, upload-time = "2025-11-16T22:52:31.421Z" }, - { url = "https://files.pythonhosted.org/packages/7d/e4/68d2f474df2cb671b2b6c2986a02e520671295647dad82484cde80ca427b/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb", size = 14391768, upload-time = "2025-11-16T22:52:33.593Z" }, - { url = "https://files.pythonhosted.org/packages/b8/50/94ccd8a2b141cb50651fddd4f6a48874acb3c91c8f0842b08a6afc4b0b21/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7", size = 16729263, upload-time = "2025-11-16T22:52:36.369Z" }, - { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/a4/7a/6a3d14e205d292b738db449d0de649b373a59edb0d0b4493821d0a3e8718/numpy-2.4.0.tar.gz", hash = "sha256:6e504f7b16118198f138ef31ba24d985b124c2c469fe8467007cf30fd992f934", size = 20685720, upload-time = "2025-12-20T16:18:19.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/7e/7bae7cbcc2f8132271967aa03e03954fc1e48aa1f3bf32b29ca95fbef352/numpy-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:316b2f2584682318539f0bcaca5a496ce9ca78c88066579ebd11fd06f8e4741e", size = 16940166, upload-time = "2025-12-20T16:15:43.434Z" }, + { url = "https://files.pythonhosted.org/packages/0f/27/6c13f5b46776d6246ec884ac5817452672156a506d08a1f2abb39961930a/numpy-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2718c1de8504121714234b6f8241d0019450353276c88b9453c9c3d92e101db", size = 12641781, upload-time = "2025-12-20T16:15:45.701Z" }, + { url = "https://files.pythonhosted.org/packages/14/1c/83b4998d4860d15283241d9e5215f28b40ac31f497c04b12fa7f428ff370/numpy-2.4.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:21555da4ec4a0c942520ead42c3b0dc9477441e085c42b0fbdd6a084869a6f6b", size = 5470247, upload-time = "2025-12-20T16:15:47.943Z" }, + { url = "https://files.pythonhosted.org/packages/54/08/cbce72c835d937795571b0464b52069f869c9e78b0c076d416c5269d2718/numpy-2.4.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:413aa561266a4be2d06cd2b9665e89d9f54c543f418773076a76adcf2af08bc7", size = 6799807, upload-time = "2025-12-20T16:15:49.795Z" }, + { url = "https://files.pythonhosted.org/packages/ff/be/2e647961cd8c980591d75cdcd9e8f647d69fbe05e2a25613dc0a2ea5fb1a/numpy-2.4.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0feafc9e03128074689183031181fac0897ff169692d8492066e949041096548", size = 14701992, upload-time = "2025-12-20T16:15:51.615Z" }, + { url = "https://files.pythonhosted.org/packages/a2/fb/e1652fb8b6fd91ce6ed429143fe2e01ce714711e03e5b762615e7b36172c/numpy-2.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8fdfed3deaf1928fb7667d96e0567cdf58c2b370ea2ee7e586aa383ec2cb346", size = 16646871, upload-time = "2025-12-20T16:15:54.129Z" }, + { url = "https://files.pythonhosted.org/packages/62/23/d841207e63c4322842f7cd042ae981cffe715c73376dcad8235fb31debf1/numpy-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e06a922a469cae9a57100864caf4f8a97a1026513793969f8ba5b63137a35d25", size = 16487190, upload-time = "2025-12-20T16:15:56.147Z" }, + { url = "https://files.pythonhosted.org/packages/bc/a0/6a842c8421ebfdec0a230e65f61e0dabda6edbef443d999d79b87c273965/numpy-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:927ccf5cd17c48f801f4ed43a7e5673a2724bd2171460be3e3894e6e332ef83a", size = 18580762, upload-time = "2025-12-20T16:15:58.524Z" }, + { url = "https://files.pythonhosted.org/packages/0a/d1/c79e0046641186f2134dde05e6181825b911f8bdcef31b19ddd16e232847/numpy-2.4.0-cp311-cp311-win32.whl", hash = "sha256:882567b7ae57c1b1a0250208cc21a7976d8cbcc49d5a322e607e6f09c9e0bd53", size = 6233359, upload-time = "2025-12-20T16:16:00.938Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f0/74965001d231f28184d6305b8cdc1b6fcd4bf23033f6cb039cfe76c9fca7/numpy-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:8b986403023c8f3bf8f487c2e6186afda156174d31c175f747d8934dfddf3479", size = 12601132, upload-time = "2025-12-20T16:16:02.484Z" }, + { url = "https://files.pythonhosted.org/packages/65/32/55408d0f46dfebce38017f5bd931affa7256ad6beac1a92a012e1fbc67a7/numpy-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:3f3096405acc48887458bbf9f6814d43785ac7ba2a57ea6442b581dedbc60ce6", size = 10573977, upload-time = "2025-12-20T16:16:04.77Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ff/f6400ffec95de41c74b8e73df32e3fff1830633193a7b1e409be7fb1bb8c/numpy-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a8b6bb8369abefb8bd1801b054ad50e02b3275c8614dc6e5b0373c305291037", size = 16653117, upload-time = "2025-12-20T16:16:06.709Z" }, + { url = "https://files.pythonhosted.org/packages/fd/28/6c23e97450035072e8d830a3c411bf1abd1f42c611ff9d29e3d8f55c6252/numpy-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e284ca13d5a8367e43734148622caf0b261b275673823593e3e3634a6490f83", size = 12369711, upload-time = "2025-12-20T16:16:08.758Z" }, + { url = "https://files.pythonhosted.org/packages/bc/af/acbef97b630ab1bb45e6a7d01d1452e4251aa88ce680ac36e56c272120ec/numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:49ff32b09f5aa0cd30a20c2b39db3e669c845589f2b7fc910365210887e39344", size = 5198355, upload-time = "2025-12-20T16:16:10.902Z" }, + { url = "https://files.pythonhosted.org/packages/c1/c8/4e0d436b66b826f2e53330adaa6311f5cac9871a5b5c31ad773b27f25a74/numpy-2.4.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:36cbfb13c152b1c7c184ddac43765db8ad672567e7bafff2cc755a09917ed2e6", size = 6545298, upload-time = "2025-12-20T16:16:12.607Z" }, + { url = "https://files.pythonhosted.org/packages/ef/27/e1f5d144ab54eac34875e79037011d511ac57b21b220063310cb96c80fbc/numpy-2.4.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35ddc8f4914466e6fc954c76527aa91aa763682a4f6d73249ef20b418fe6effb", size = 14398387, upload-time = "2025-12-20T16:16:14.257Z" }, + { url = "https://files.pythonhosted.org/packages/67/64/4cb909dd5ab09a9a5d086eff9586e69e827b88a5585517386879474f4cf7/numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc578891de1db95b2a35001b695451767b580bb45753717498213c5ff3c41d63", size = 16363091, upload-time = "2025-12-20T16:16:17.32Z" }, + { url = "https://files.pythonhosted.org/packages/9d/9c/8efe24577523ec6809261859737cf117b0eb6fdb655abdfdc81b2e468ce4/numpy-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98e81648e0b36e325ab67e46b5400a7a6d4a22b8a7c8e8bbfe20e7db7906bf95", size = 16176394, upload-time = "2025-12-20T16:16:19.524Z" }, + { url = "https://files.pythonhosted.org/packages/61/f0/1687441ece7b47a62e45a1f82015352c240765c707928edd8aef875d5951/numpy-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d57b5046c120561ba8fa8e4030fbb8b822f3063910fa901ffadf16e2b7128ad6", size = 18287378, upload-time = "2025-12-20T16:16:22.866Z" }, + { url = "https://files.pythonhosted.org/packages/d3/6f/f868765d44e6fc466467ed810ba9d8d6db1add7d4a748abfa2a4c99a3194/numpy-2.4.0-cp312-cp312-win32.whl", hash = "sha256:92190db305a6f48734d3982f2c60fa30d6b5ee9bff10f2887b930d7b40119f4c", size = 5955432, upload-time = "2025-12-20T16:16:25.06Z" }, + { url = "https://files.pythonhosted.org/packages/d4/b5/94c1e79fcbab38d1ca15e13777477b2914dd2d559b410f96949d6637b085/numpy-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:680060061adb2d74ce352628cb798cfdec399068aa7f07ba9fb818b2b3305f98", size = 12306201, upload-time = "2025-12-20T16:16:26.979Z" }, + { url = "https://files.pythonhosted.org/packages/70/09/c39dadf0b13bb0768cd29d6a3aaff1fb7c6905ac40e9aaeca26b1c086e06/numpy-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:39699233bc72dd482da1415dcb06076e32f60eddc796a796c5fb6c5efce94667", size = 10308234, upload-time = "2025-12-20T16:16:29.417Z" }, + { url = "https://files.pythonhosted.org/packages/a7/0d/853fd96372eda07c824d24adf02e8bc92bb3731b43a9b2a39161c3667cc4/numpy-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a152d86a3ae00ba5f47b3acf3b827509fd0b6cb7d3259665e63dafbad22a75ea", size = 16649088, upload-time = "2025-12-20T16:16:31.421Z" }, + { url = "https://files.pythonhosted.org/packages/e3/37/cc636f1f2a9f585434e20a3e6e63422f70bfe4f7f6698e941db52ea1ac9a/numpy-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39b19251dec4de8ff8496cd0806cbe27bf0684f765abb1f4809554de93785f2d", size = 12364065, upload-time = "2025-12-20T16:16:33.491Z" }, + { url = "https://files.pythonhosted.org/packages/ed/69/0b78f37ca3690969beee54103ce5f6021709134e8020767e93ba691a72f1/numpy-2.4.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:009bd0ea12d3c784b6639a8457537016ce5172109e585338e11334f6a7bb88ee", size = 5192640, upload-time = "2025-12-20T16:16:35.636Z" }, + { url = "https://files.pythonhosted.org/packages/1d/2a/08569f8252abf590294dbb09a430543ec8f8cc710383abfb3e75cc73aeda/numpy-2.4.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5fe44e277225fd3dff6882d86d3d447205d43532c3627313d17e754fb3905a0e", size = 6541556, upload-time = "2025-12-20T16:16:37.276Z" }, + { url = "https://files.pythonhosted.org/packages/93/e9/a949885a4e177493d61519377952186b6cbfdf1d6002764c664ba28349b5/numpy-2.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f935c4493eda9069851058fa0d9e39dbf6286be690066509305e52912714dbb2", size = 14396562, upload-time = "2025-12-20T16:16:38.953Z" }, + { url = "https://files.pythonhosted.org/packages/99/98/9d4ad53b0e9ef901c2ef1d550d2136f5ac42d3fd2988390a6def32e23e48/numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cfa5f29a695cb7438965e6c3e8d06e0416060cf0d709c1b1c1653a939bf5c2a", size = 16351719, upload-time = "2025-12-20T16:16:41.503Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/5f3711a38341d6e8dd619f6353251a0cdd07f3d6d101a8fd46f4ef87f895/numpy-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba0cb30acd3ef11c94dc27fbfba68940652492bc107075e7ffe23057f9425681", size = 16176053, upload-time = "2025-12-20T16:16:44.552Z" }, + { url = "https://files.pythonhosted.org/packages/2a/5b/2a3753dc43916501b4183532e7ace862e13211042bceafa253afb5c71272/numpy-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:60e8c196cd82cbbd4f130b5290007e13e6de3eca79f0d4d38014769d96a7c475", size = 18277859, upload-time = "2025-12-20T16:16:47.174Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c5/a18bcdd07a941db3076ef489d036ab16d2bfc2eae0cf27e5a26e29189434/numpy-2.4.0-cp313-cp313-win32.whl", hash = "sha256:5f48cb3e88fbc294dc90e215d86fbaf1c852c63dbdb6c3a3e63f45c4b57f7344", size = 5953849, upload-time = "2025-12-20T16:16:49.554Z" }, + { url = "https://files.pythonhosted.org/packages/4f/f1/719010ff8061da6e8a26e1980cf090412d4f5f8060b31f0c45d77dd67a01/numpy-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:a899699294f28f7be8992853c0c60741f16ff199205e2e6cdca155762cbaa59d", size = 12302840, upload-time = "2025-12-20T16:16:51.227Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5a/b3d259083ed8b4d335270c76966cb6cf14a5d1b69e1a608994ac57a659e6/numpy-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:9198f447e1dc5647d07c9a6bbe2063cc0132728cc7175b39dbc796da5b54920d", size = 10308509, upload-time = "2025-12-20T16:16:53.313Z" }, + { url = "https://files.pythonhosted.org/packages/31/01/95edcffd1bb6c0633df4e808130545c4f07383ab629ac7e316fb44fff677/numpy-2.4.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74623f2ab5cc3f7c886add4f735d1031a1d2be4a4ae63c0546cfd74e7a31ddf6", size = 12491815, upload-time = "2025-12-20T16:16:55.496Z" }, + { url = "https://files.pythonhosted.org/packages/59/ea/5644b8baa92cc1c7163b4b4458c8679852733fa74ca49c942cfa82ded4e0/numpy-2.4.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:0804a8e4ab070d1d35496e65ffd3cf8114c136a2b81f61dfab0de4b218aacfd5", size = 5320321, upload-time = "2025-12-20T16:16:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/26/4e/e10938106d70bc21319bd6a86ae726da37edc802ce35a3a71ecdf1fdfe7f/numpy-2.4.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:02a2038eb27f9443a8b266a66911e926566b5a6ffd1a689b588f7f35b81e7dc3", size = 6641635, upload-time = "2025-12-20T16:16:59.379Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8d/a8828e3eaf5c0b4ab116924df82f24ce3416fa38d0674d8f708ddc6c8aac/numpy-2.4.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1889b3a3f47a7b5bee16bc25a2145bd7cb91897f815ce3499db64c7458b6d91d", size = 14456053, upload-time = "2025-12-20T16:17:01.768Z" }, + { url = "https://files.pythonhosted.org/packages/68/a1/17d97609d87d4520aa5ae2dcfb32305654550ac6a35effb946d303e594ce/numpy-2.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85eef4cb5625c47ee6425c58a3502555e10f45ee973da878ac8248ad58c136f3", size = 16401702, upload-time = "2025-12-20T16:17:04.235Z" }, + { url = "https://files.pythonhosted.org/packages/18/32/0f13c1b2d22bea1118356b8b963195446f3af124ed7a5adfa8fdecb1b6ca/numpy-2.4.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6dc8b7e2f4eb184b37655195f421836cfae6f58197b67e3ffc501f1333d993fa", size = 16242493, upload-time = "2025-12-20T16:17:06.856Z" }, + { url = "https://files.pythonhosted.org/packages/ae/23/48f21e3d309fbc137c068a1475358cbd3a901b3987dcfc97a029ab3068e2/numpy-2.4.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:44aba2f0cafd287871a495fb3163408b0bd25bbce135c6f621534a07f4f7875c", size = 18324222, upload-time = "2025-12-20T16:17:09.392Z" }, + { url = "https://files.pythonhosted.org/packages/ac/52/41f3d71296a3dcaa4f456aaa3c6fc8e745b43d0552b6bde56571bb4b4a0f/numpy-2.4.0-cp313-cp313t-win32.whl", hash = "sha256:20c115517513831860c573996e395707aa9fb691eb179200125c250e895fcd93", size = 6076216, upload-time = "2025-12-20T16:17:11.437Z" }, + { url = "https://files.pythonhosted.org/packages/35/ff/46fbfe60ab0710d2a2b16995f708750307d30eccbb4c38371ea9e986866e/numpy-2.4.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b48e35f4ab6f6a7597c46e301126ceba4c44cd3280e3750f85db48b082624fa4", size = 12444263, upload-time = "2025-12-20T16:17:13.182Z" }, + { url = "https://files.pythonhosted.org/packages/a3/e3/9189ab319c01d2ed556c932ccf55064c5d75bb5850d1df7a482ce0badead/numpy-2.4.0-cp313-cp313t-win_arm64.whl", hash = "sha256:4d1cfce39e511069b11e67cd0bd78ceff31443b7c9e5c04db73c7a19f572967c", size = 10378265, upload-time = "2025-12-20T16:17:15.211Z" }, + { url = "https://files.pythonhosted.org/packages/ab/ed/52eac27de39d5e5a6c9aadabe672bc06f55e24a3d9010cd1183948055d76/numpy-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c95eb6db2884917d86cde0b4d4cf31adf485c8ec36bf8696dd66fa70de96f36b", size = 16647476, upload-time = "2025-12-20T16:17:17.671Z" }, + { url = "https://files.pythonhosted.org/packages/77/c0/990ce1b7fcd4e09aeaa574e2a0a839589e4b08b2ca68070f1acb1fea6736/numpy-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:65167da969cd1ec3a1df31cb221ca3a19a8aaa25370ecb17d428415e93c1935e", size = 12374563, upload-time = "2025-12-20T16:17:20.216Z" }, + { url = "https://files.pythonhosted.org/packages/37/7c/8c5e389c6ae8f5fd2277a988600d79e9625db3fff011a2d87ac80b881a4c/numpy-2.4.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3de19cfecd1465d0dcf8a5b5ea8b3155b42ed0b639dba4b71e323d74f2a3be5e", size = 5203107, upload-time = "2025-12-20T16:17:22.47Z" }, + { url = "https://files.pythonhosted.org/packages/e6/94/ca5b3bd6a8a70a5eec9a0b8dd7f980c1eff4b8a54970a9a7fef248ef564f/numpy-2.4.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6c05483c3136ac4c91b4e81903cb53a8707d316f488124d0398499a4f8e8ef51", size = 6538067, upload-time = "2025-12-20T16:17:24.001Z" }, + { url = "https://files.pythonhosted.org/packages/79/43/993eb7bb5be6761dde2b3a3a594d689cec83398e3f58f4758010f3b85727/numpy-2.4.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36667db4d6c1cea79c8930ab72fadfb4060feb4bfe724141cd4bd064d2e5f8ce", size = 14411926, upload-time = "2025-12-20T16:17:25.822Z" }, + { url = "https://files.pythonhosted.org/packages/03/75/d4c43b61de473912496317a854dac54f1efec3eeb158438da6884b70bb90/numpy-2.4.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9a818668b674047fd88c4cddada7ab8f1c298812783e8328e956b78dc4807f9f", size = 16354295, upload-time = "2025-12-20T16:17:28.308Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0a/b54615b47ee8736a6461a4bb6749128dd3435c5a759d5663f11f0e9af4ac/numpy-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1ee32359fb7543b7b7bd0b2f46294db27e29e7bbdf70541e81b190836cd83ded", size = 16190242, upload-time = "2025-12-20T16:17:30.993Z" }, + { url = "https://files.pythonhosted.org/packages/98/ce/ea207769aacad6246525ec6c6bbd66a2bf56c72443dc10e2f90feed29290/numpy-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e493962256a38f58283de033d8af176c5c91c084ea30f15834f7545451c42059", size = 18280875, upload-time = "2025-12-20T16:17:33.327Z" }, + { url = "https://files.pythonhosted.org/packages/17/ef/ec409437aa962ea372ed601c519a2b141701683ff028f894b7466f0ab42b/numpy-2.4.0-cp314-cp314-win32.whl", hash = "sha256:6bbaebf0d11567fa8926215ae731e1d58e6ec28a8a25235b8a47405d301332db", size = 6002530, upload-time = "2025-12-20T16:17:35.729Z" }, + { url = "https://files.pythonhosted.org/packages/5f/4a/5cb94c787a3ed1ac65e1271b968686521169a7b3ec0b6544bb3ca32960b0/numpy-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:3d857f55e7fdf7c38ab96c4558c95b97d1c685be6b05c249f5fdafcbd6f9899e", size = 12435890, upload-time = "2025-12-20T16:17:37.599Z" }, + { url = "https://files.pythonhosted.org/packages/48/a0/04b89db963af9de1104975e2544f30de89adbf75b9e75f7dd2599be12c79/numpy-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:bb50ce5fb202a26fd5404620e7ef820ad1ab3558b444cb0b55beb7ef66cd2d63", size = 10591892, upload-time = "2025-12-20T16:17:39.649Z" }, + { url = "https://files.pythonhosted.org/packages/53/e5/d74b5ccf6712c06c7a545025a6a71bfa03bdc7e0568b405b0d655232fd92/numpy-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:355354388cba60f2132df297e2d53053d4063f79077b67b481d21276d61fc4df", size = 12494312, upload-time = "2025-12-20T16:17:41.714Z" }, + { url = "https://files.pythonhosted.org/packages/c2/08/3ca9cc2ddf54dfee7ae9a6479c071092a228c68aef08252aa08dac2af002/numpy-2.4.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:1d8f9fde5f6dc1b6fc34df8162f3b3079365468703fee7f31d4e0cc8c63baed9", size = 5322862, upload-time = "2025-12-20T16:17:44.145Z" }, + { url = "https://files.pythonhosted.org/packages/87/74/0bb63a68394c0c1e52670cfff2e309afa41edbe11b3327d9af29e4383f34/numpy-2.4.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:e0434aa22c821f44eeb4c650b81c7fbdd8c0122c6c4b5a576a76d5a35625ecd9", size = 6644986, upload-time = "2025-12-20T16:17:46.203Z" }, + { url = "https://files.pythonhosted.org/packages/06/8f/9264d9bdbcf8236af2823623fe2f3981d740fc3461e2787e231d97c38c28/numpy-2.4.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40483b2f2d3ba7aad426443767ff5632ec3156ef09742b96913787d13c336471", size = 14457958, upload-time = "2025-12-20T16:17:48.017Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d9/f9a69ae564bbc7236a35aa883319364ef5fd41f72aa320cc1cbe66148fe2/numpy-2.4.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9e6a7664ddd9746e20b7325351fe1a8408d0a2bf9c63b5e898290ddc8f09544", size = 16398394, upload-time = "2025-12-20T16:17:50.409Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/39241501408dde7f885d241a98caba5421061a2c6d2b2197ac5e3aa842d8/numpy-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ecb0019d44f4cdb50b676c5d0cb4b1eae8e15d1ed3d3e6639f986fc92b2ec52c", size = 16241044, upload-time = "2025-12-20T16:17:52.661Z" }, + { url = "https://files.pythonhosted.org/packages/7c/95/cae7effd90e065a95e59fe710eeee05d7328ed169776dfdd9f789e032125/numpy-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d0ffd9e2e4441c96a9c91ec1783285d80bf835b677853fc2770a89d50c1e48ac", size = 18321772, upload-time = "2025-12-20T16:17:54.947Z" }, + { url = "https://files.pythonhosted.org/packages/96/df/3c6c279accd2bfb968a76298e5b276310bd55d243df4fa8ac5816d79347d/numpy-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:77f0d13fa87036d7553bf81f0e1fe3ce68d14c9976c9851744e4d3e91127e95f", size = 6148320, upload-time = "2025-12-20T16:17:57.249Z" }, + { url = "https://files.pythonhosted.org/packages/92/8d/f23033cce252e7a75cae853d17f582e86534c46404dea1c8ee094a9d6d84/numpy-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b1f5b45829ac1848893f0ddf5cb326110604d6df96cdc255b0bf9edd154104d4", size = 12623460, upload-time = "2025-12-20T16:17:58.963Z" }, + { url = "https://files.pythonhosted.org/packages/a4/4f/1f8475907d1a7c4ef9020edf7f39ea2422ec896849245f00688e4b268a71/numpy-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:23a3e9d1a6f360267e8fbb38ba5db355a6a7e9be71d7fce7ab3125e88bb646c8", size = 10661799, upload-time = "2025-12-20T16:18:01.078Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ef/088e7c7342f300aaf3ee5f2c821c4b9996a1bef2aaf6a49cc8ab4883758e/numpy-2.4.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b54c83f1c0c0f1d748dca0af516062b8829d53d1f0c402be24b4257a9c48ada6", size = 16819003, upload-time = "2025-12-20T16:18:03.41Z" }, + { url = "https://files.pythonhosted.org/packages/ff/ce/a53017b5443b4b84517182d463fc7bcc2adb4faa8b20813f8e5f5aeb5faa/numpy-2.4.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:aabb081ca0ec5d39591fc33018cd4b3f96e1a2dd6756282029986d00a785fba4", size = 12567105, upload-time = "2025-12-20T16:18:05.594Z" }, + { url = "https://files.pythonhosted.org/packages/77/58/5ff91b161f2ec650c88a626c3905d938c89aaadabd0431e6d9c1330c83e2/numpy-2.4.0-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:8eafe7c36c8430b7794edeab3087dec7bf31d634d92f2af9949434b9d1964cba", size = 5395590, upload-time = "2025-12-20T16:18:08.031Z" }, + { url = "https://files.pythonhosted.org/packages/1d/4e/f1a084106df8c2df8132fc437e56987308e0524836aa7733721c8429d4fe/numpy-2.4.0-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2f585f52b2baf07ff3356158d9268ea095e221371f1074fadea2f42544d58b4d", size = 6709947, upload-time = "2025-12-20T16:18:09.836Z" }, + { url = "https://files.pythonhosted.org/packages/63/09/3d8aeb809c0332c3f642da812ac2e3d74fc9252b3021f8c30c82e99e3f3d/numpy-2.4.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:32ed06d0fe9cae27d8fb5f400c63ccee72370599c75e683a6358dd3a4fb50aaf", size = 14535119, upload-time = "2025-12-20T16:18:12.105Z" }, + { url = "https://files.pythonhosted.org/packages/fd/7f/68f0fc43a2cbdc6bb239160c754d87c922f60fbaa0fa3cd3d312b8a7f5ee/numpy-2.4.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:57c540ed8fb1f05cb997c6761cd56db72395b0d6985e90571ff660452ade4f98", size = 16475815, upload-time = "2025-12-20T16:18:14.433Z" }, + { url = "https://files.pythonhosted.org/packages/11/73/edeacba3167b1ca66d51b1a5a14697c2c40098b5ffa01811c67b1785a5ab/numpy-2.4.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a39fb973a726e63223287adc6dafe444ce75af952d711e400f3bf2b36ef55a7b", size = 12489376, upload-time = "2025-12-20T16:18:16.524Z" }, ] [[package]] @@ -3061,7 +3059,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "absl-py" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "torch", marker = "sys_platform == 'never'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/02/ad/046a097b63a96c1ba1d85f0031dbe7fcbdb33e6c445dfbaba2ffaefdd497/nv_grouped_gemm-1.1.4.post8.tar.gz", hash = "sha256:ab321693f0292cfd8a26dc7b6f14decd9eb00e209494de7218e4fad36191275d", size = 20821209, upload-time = "2025-12-17T02:22:38.432Z" } @@ -3151,21 +3149,21 @@ wheels = [ [[package]] name = "nvidia-cudnn-frontend" -version = "1.16.0" +version = "1.17.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/cf/3cd3cc682df5488288c6043fc0977090497ff015a082ab160076fecb080a/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83ecbe6d1145dc208a9ae82aa0b45b2c8f74ed8a43d3a102a13eef2117e2fedd", size = 1835542, upload-time = "2025-11-07T01:28:20.133Z" }, - { url = "https://files.pythonhosted.org/packages/92/45/87f3f2d94a928be21459949b03b0b8bcea13531d30094ad84a8ae4fca761/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77cb06b91877c8489363867434ba1d9936f3e10bf7ed98d82e98f5f578611920", size = 1950339, upload-time = "2025-11-07T01:31:41.69Z" }, - { url = "https://files.pythonhosted.org/packages/be/f5/1662f18084ef4441bfb3a01383cbf77194905b53474dcb51c0d0f373c74b/nvidia_cudnn_frontend-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:ee3f3886f107919dad48cbc905fa6ae9207c8d7d5a24165e55625ea96f0fe40f", size = 1367883, upload-time = "2025-11-07T01:25:17.791Z" }, - { url = "https://files.pythonhosted.org/packages/10/b7/d0a3a337f5e83f26ff79a7fd63a859181ff2911f1d905d6fbab5fc80170d/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c360d5840d6eb597aade9e9c8780e24aec283b8e6bc97d52881c821a35c92aa9", size = 1837573, upload-time = "2025-11-07T01:29:05.507Z" }, - { url = "https://files.pythonhosted.org/packages/95/dc/465a14f2d235778405f2e84fce336d07ab045bf1c7df6404bdf8033e06a8/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c4a8fc573d85a86e08b15d9bf37f729e2487298781867a492a59cde6ac295e2", size = 1952630, upload-time = "2025-11-07T01:32:00.242Z" }, - { url = "https://files.pythonhosted.org/packages/3b/89/f14435f616603a999975930c4456d6140127f6acb19a877c752beccad837/nvidia_cudnn_frontend-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:a257f10a932ffde9741f644efd3611acf77e2fd89d493d81bc6a8353c48f1ec2", size = 1368775, upload-time = "2025-11-07T01:25:42.252Z" }, - { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" }, - { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" }, - { url = "https://files.pythonhosted.org/packages/32/2c/b4376afef0a6342c56e82e3465c1f8f5c719f588293a50dd04019a22ae6e/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6bcb3a2fbff80538958e21e2227520f082a961164865aaeedaac527f61084f9", size = 1839805, upload-time = "2025-11-07T01:30:31.056Z" }, - { url = "https://files.pythonhosted.org/packages/71/13/836b90354036154ab82db3861210e5736983fe1fc44bb39c146ad93b333b/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cbdad88b2bec5dde837f8fa7632022334cddb4756f923b5421c06a712cb59d31", size = 1953953, upload-time = "2025-11-07T01:33:03.781Z" }, - { url = "https://files.pythonhosted.org/packages/e5/30/3025f34f2c86ceef85134dc1f323f8cf2a26d3ffddc5ada48528c80bfae1/nvidia_cudnn_frontend-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:138de2bc4697fabb2eb2f0f601a7e31f8fe97874908e26e33d737276f335473c", size = 1368359, upload-time = "2025-11-07T01:26:51.561Z" }, + { url = "https://files.pythonhosted.org/packages/14/94/b224e65becfb5ab02c5b331aeb73c98f6d95cde5326d7698a2fc0d20e84a/nvidia_cudnn_frontend-1.17.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4835ee3fc350782c89cdd290088ade69464faaa5dd66ccb0b215ad481ab3b41b", size = 1911670, upload-time = "2025-12-20T00:26:36.302Z" }, + { url = "https://files.pythonhosted.org/packages/d5/05/54afda6fc47838bd68a029067d8019e6b495dca0570d7e970cbb2c3e0b32/nvidia_cudnn_frontend-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1da7e972dbba939ad21111f1208815b8c8024cbf72aa6c1eb223b14b2049d4b6", size = 2033618, upload-time = "2025-12-20T00:24:42.991Z" }, + { url = "https://files.pythonhosted.org/packages/83/97/77ad90fac9372b0420885f16a2afaca95f78b082fa9d6a082d51a7c96bd3/nvidia_cudnn_frontend-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:21c5b2ce097f72c6510cbf974ce8ea9a31b34989dd9209d7187584a6100e57e5", size = 1440589, upload-time = "2025-12-20T00:29:17.641Z" }, + { url = "https://files.pythonhosted.org/packages/4e/4a/a903c57ef5aaa32aa074007ba4d50ed7cbc80a8092ddb84fe9d879a69bbb/nvidia_cudnn_frontend-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:961004000a2c21dd4a03f816534629105cf49125a643dbb49abbc97021e66d20", size = 1911775, upload-time = "2025-12-20T00:27:11.297Z" }, + { url = "https://files.pythonhosted.org/packages/15/20/80c4f5d62ebc58b8db8d25a2ee11f3246bb8947addea37c229540bcc05ac/nvidia_cudnn_frontend-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6ea44a8f2c0cfd20868b239ea13a2e0f32895dab868f6ff2bee01caf3778d273", size = 2035158, upload-time = "2025-12-20T00:25:00.9Z" }, + { url = "https://files.pythonhosted.org/packages/5f/18/c24375c8d579c53a99a2d7428397288a94c7ea411d1823e3b8dc3cef50dc/nvidia_cudnn_frontend-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:8dd6cc197a58d63da4d146a1febc1f99d425374d159f9b00628b140c65acb486", size = 1441316, upload-time = "2025-12-20T00:29:34.951Z" }, + { url = "https://files.pythonhosted.org/packages/42/d9/f58ed6292c9396f7422812a0a2d9f80cc5a623ea6c758bcb3d34d4795bb8/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de0c473f32d705abcf14f351615f7ffbeed7320e3499cf2195ae5689652a2592", size = 1917620, upload-time = "2025-12-20T00:27:46.179Z" }, + { url = "https://files.pythonhosted.org/packages/db/eb/c641135632bd2afc21339aadee96af4c5db1460dfa07ca74836de75a590f/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c913c87fca691a91385287f2587575531933acfebc85c33dbcecb191886c7a53", size = 2038994, upload-time = "2025-12-20T00:25:18.9Z" }, + { url = "https://files.pythonhosted.org/packages/82/49/a92da03eb43bde90be770a43666c5ab26b4f8b15f6e46c4b0b0e84f37994/nvidia_cudnn_frontend-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0d4cfd03961592108abd1ba246e43c8bb7540aed984df860256d0bff181de98", size = 1441271, upload-time = "2025-12-20T00:29:52.056Z" }, + { url = "https://files.pythonhosted.org/packages/99/96/4d55a559dff3175599fe15d83c853f051526b91994b083ec36b12caae776/nvidia_cudnn_frontend-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3800a1fe3d41a9206281475b1c8c438b02cb7e3c7e262d13f0a101edec223cb6", size = 1917065, upload-time = "2025-12-20T00:28:21.402Z" }, + { url = "https://files.pythonhosted.org/packages/20/f6/5af63c254d7260dd1e974b2300eae9b157998b9d958f79c98ddaada0a0bf/nvidia_cudnn_frontend-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5adaf4a930b3be5ed019e1a25cfec7cc2bf444592a54a7639c28149b9227c2a4", size = 2039180, upload-time = "2025-12-20T00:25:36.695Z" }, + { url = "https://files.pythonhosted.org/packages/64/ee/6de6aec1e42c859134312e6d5348d6f036b2f1b825e6eae92f9a429eccc4/nvidia_cudnn_frontend-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:5c6a120fb54b157585ce6587153fc7086081af961f284f2553e01ba7c7a80c1a", size = 1441177, upload-time = "2025-12-20T00:30:09.927Z" }, ] [[package]] @@ -3240,23 +3238,23 @@ wheels = [ [[package]] name = "nvidia-cutlass-dsl" -version = "4.3.3" +version = "4.3.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cuda-python" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/48/52907ac203c6de58b451511e251c8b1fc77c414dcb32aef3a0cd5194c7bd/nvidia_cutlass_dsl-4.3.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:dca550c8a7f7556a4c46bd4b85453342ae4e70600dc4aa3b5a6f1ebcd39a5ce5", size = 58734224, upload-time = "2025-12-10T09:45:22.008Z" }, - { url = "https://files.pythonhosted.org/packages/44/d7/f1936fdf697a8b76eea1f60d4bcfe41faa015e5bca925c4e767035e6857a/nvidia_cutlass_dsl-4.3.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:0d92144b9f161328be4a6734911c101d03c7d5335e307112ad579d826d7ac3b1", size = 58596215, upload-time = "2025-12-10T10:35:19.436Z" }, - { url = "https://files.pythonhosted.org/packages/53/ff/41a855a356067cab074c77e79ddb308a8d3df0e74659bdc2195f5c19bb10/nvidia_cutlass_dsl-4.3.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7d3914b3e865cf17334d3139c11d38aed1160b5855c29eaa4e3a470ea1fcfaba", size = 58731282, upload-time = "2025-12-17T09:17:36.918Z" }, - { url = "https://files.pythonhosted.org/packages/ef/75/79f494e08b85ea921eb376a5363a7a357db2352a6a1dfdfc659721fe94b2/nvidia_cutlass_dsl-4.3.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:046f3fc3fe3fa60f3207133e57512f2f5581ca36943f0763f3f7e8ab11180e16", size = 58596543, upload-time = "2025-12-10T10:09:13.657Z" }, - { url = "https://files.pythonhosted.org/packages/c6/d4/7c5ef53ccf75d7f99a9ea29cae9f9c0233229b75b3b22f85a4ef4f52e6ab/nvidia_cutlass_dsl-4.3.3-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3278526f54bddd920d8e539771e5820c6166c549a1e67813375025f39417dec6", size = 58734009, upload-time = "2025-12-10T09:23:29.305Z" }, - { url = "https://files.pythonhosted.org/packages/88/a8/a27562194cc4182c67793cd21c5dbf9468cd5a49c775a487153c6f28364c/nvidia_cutlass_dsl-4.3.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f2b25816b8bb8bc332bcbf6fc341347b5d728344cf185c65af0dd73e8503d5c7", size = 58596724, upload-time = "2025-12-10T11:01:07.228Z" }, - { url = "https://files.pythonhosted.org/packages/9d/dd/83679f3467ee5827084994c2390c97659f2cda35ad824bfa936ba56295fd/nvidia_cutlass_dsl-4.3.3-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5200ede1f51f2127c53ed5e7d38849895760469160861739813f24557e1230b8", size = 58733331, upload-time = "2025-12-10T09:03:12.607Z" }, - { url = "https://files.pythonhosted.org/packages/c6/94/1f591add7341a2ecdab76fabc0b2c7a07cadf9589bb0e78c041bd8a5a81f/nvidia_cutlass_dsl-4.3.3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:0eb90254eee0bfdc73087034cab40f1ef723c26961606d3dd68e0fd6fe11115f", size = 58597870, upload-time = "2025-12-10T11:15:48.138Z" }, + { url = "https://files.pythonhosted.org/packages/ba/1f/83e48a71e0b7bed6b33b01732ae53e9f2e61dc518ab273e56ec859bb05f1/nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:118508bc84f2a55ec7af3affd379bb713edf837d593218329909db67b518e700", size = 58736512, upload-time = "2025-12-21T07:40:34.715Z" }, + { url = "https://files.pythonhosted.org/packages/27/f1/21166ae0b6da766e11448d32c1e69fc60ba4023de9040f6ef9c333e7b0b5/nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3fdf0603ab7ec1bf6a499fbf72cff65e73b597d6e1359286808317c69aeb7c3d", size = 58598504, upload-time = "2025-12-21T07:39:43.124Z" }, + { url = "https://files.pythonhosted.org/packages/43/01/3067eaad7454a3e36523b6814f09344afa0d36f71719072a6eecd6c87a40/nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c5bd21ed877da171f115123a12aae4a920035fc47eb57c807f9fba9f3df97cf4", size = 58733573, upload-time = "2025-12-21T07:41:51.364Z" }, + { url = "https://files.pythonhosted.org/packages/86/3b/f8255a1fe6841955eea7a211bc9f30fd46bd8424ea15f361d5c09b29520a/nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:671936f1df909e7de377d0cc00cb4287a3458c013d34947600423e9deb827e41", size = 58598831, upload-time = "2025-12-21T07:39:17.853Z" }, + { url = "https://files.pythonhosted.org/packages/86/ee/53d22e2e14cb763927d85f7ec9748f6af6d27a2b7f43d52de014728da10e/nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:57693d87677919572ab9eefa386b3f39e8e888bc4a9db7ab8730a97e8dbe06b4", size = 58736300, upload-time = "2025-12-21T07:41:25.723Z" }, + { url = "https://files.pythonhosted.org/packages/66/f6/47489e07081cd4060f08bfa4166f8ff32beaecf71c06060d03bde88f3b6c/nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a48fbff859e44dd548f8f26819d97d0595acea70e3b057c91dfdb47929015c72", size = 58599014, upload-time = "2025-12-21T07:38:51.632Z" }, + { url = "https://files.pythonhosted.org/packages/c7/2e/3aaf6121842351ec0231d5ab9d9ebe9a6e2269e9a8f7345e02f096db1ba8/nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:36bde25160f461f393beba81868ef9e54d5ba2e0e7666ed3e44b6dbf788af493", size = 58735620, upload-time = "2025-12-21T07:40:59.729Z" }, + { url = "https://files.pythonhosted.org/packages/62/90/1da2583bda001bf678066bc970963aad3986036ac15e95eb38447fa1b51e/nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:be127f0f087028fa498f50a994c49f95b2c6a518e11e2567bc3d71528bf0a504", size = 58600158, upload-time = "2025-12-21T07:40:09.36Z" }, ] [[package]] @@ -3283,7 +3281,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ninja" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "nvidia-ml-py" }, { name = "packaging" }, { name = "pulp" }, @@ -3424,7 +3422,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ml-dtypes" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "protobuf" }, { name = "typing-extensions" }, ] @@ -3461,7 +3459,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ml-dtypes" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "onnx" }, { name = "typing-extensions" }, ] @@ -3477,7 +3475,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ml-dtypes" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "onnx" }, { name = "onnx-ir" }, { name = "packaging" }, @@ -3525,7 +3523,7 @@ version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -3598,11 +3596,11 @@ wheels = [ [[package]] name = "pathspec" -version = "0.12.1" +version = "1.0.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c2/97/39352be14d20d377a387828daf9d3f765fad1ff29bd49913d5bbf4cefe61/pathspec-1.0.0.tar.gz", hash = "sha256:9ada63a23541746b0cf7d5672a39ea77eac31dd23a80470be90df83537512131", size = 129410, upload-time = "2026-01-06T03:21:22.892Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, + { url = "https://files.pythonhosted.org/packages/05/bb/39e6768529454cc2b57e1e2fa0a0a18ff64397a16303270e215a3e03285f/pathspec-1.0.0-py3-none-any.whl", hash = "sha256:1373719036e64a2b9de3b8ddd9e30afb082a915619f07265ed76d9ae507800ae", size = 54316, upload-time = "2026-01-06T03:21:21.74Z" }, ] [[package]] @@ -3616,100 +3614,100 @@ wheels = [ [[package]] name = "pillow" -version = "12.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/08/26e68b6b5da219c2a2cb7b563af008b53bb8e6b6fcb3fa40715fcdb2523a/pillow-12.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:3adfb466bbc544b926d50fe8f4a4e6abd8c6bffd28a26177594e6e9b2b76572b", size = 5289809, upload-time = "2025-10-15T18:21:27.791Z" }, - { url = "https://files.pythonhosted.org/packages/cb/e9/4e58fb097fb74c7b4758a680aacd558810a417d1edaa7000142976ef9d2f/pillow-12.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1ac11e8ea4f611c3c0147424eae514028b5e9077dd99ab91e1bd7bc33ff145e1", size = 4650606, upload-time = "2025-10-15T18:21:29.823Z" }, - { url = "https://files.pythonhosted.org/packages/4b/e0/1fa492aa9f77b3bc6d471c468e62bfea1823056bf7e5e4f1914d7ab2565e/pillow-12.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d49e2314c373f4c2b39446fb1a45ed333c850e09d0c59ac79b72eb3b95397363", size = 6221023, upload-time = "2025-10-15T18:21:31.415Z" }, - { url = "https://files.pythonhosted.org/packages/c1/09/4de7cd03e33734ccd0c876f0251401f1314e819cbfd89a0fcb6e77927cc6/pillow-12.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c7b2a63fd6d5246349f3d3f37b14430d73ee7e8173154461785e43036ffa96ca", size = 8024937, upload-time = "2025-10-15T18:21:33.453Z" }, - { url = "https://files.pythonhosted.org/packages/2e/69/0688e7c1390666592876d9d474f5e135abb4acb39dcb583c4dc5490f1aff/pillow-12.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d64317d2587c70324b79861babb9c09f71fbb780bad212018874b2c013d8600e", size = 6334139, upload-time = "2025-10-15T18:21:35.395Z" }, - { url = "https://files.pythonhosted.org/packages/ed/1c/880921e98f525b9b44ce747ad1ea8f73fd7e992bafe3ca5e5644bf433dea/pillow-12.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d77153e14b709fd8b8af6f66a3afbb9ed6e9fc5ccf0b6b7e1ced7b036a228782", size = 7026074, upload-time = "2025-10-15T18:21:37.219Z" }, - { url = "https://files.pythonhosted.org/packages/28/03/96f718331b19b355610ef4ebdbbde3557c726513030665071fd025745671/pillow-12.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32ed80ea8a90ee3e6fa08c21e2e091bba6eda8eccc83dbc34c95169507a91f10", size = 6448852, upload-time = "2025-10-15T18:21:39.168Z" }, - { url = "https://files.pythonhosted.org/packages/3a/a0/6a193b3f0cc9437b122978d2c5cbce59510ccf9a5b48825096ed7472da2f/pillow-12.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c828a1ae702fc712978bda0320ba1b9893d99be0badf2647f693cc01cf0f04fa", size = 7117058, upload-time = "2025-10-15T18:21:40.997Z" }, - { url = "https://files.pythonhosted.org/packages/a7/c4/043192375eaa4463254e8e61f0e2ec9a846b983929a8d0a7122e0a6d6fff/pillow-12.0.0-cp310-cp310-win32.whl", hash = "sha256:bd87e140e45399c818fac4247880b9ce719e4783d767e030a883a970be632275", size = 6295431, upload-time = "2025-10-15T18:21:42.518Z" }, - { url = "https://files.pythonhosted.org/packages/92/c6/c2f2fc7e56301c21827e689bb8b0b465f1b52878b57471a070678c0c33cd/pillow-12.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:455247ac8a4cfb7b9bc45b7e432d10421aea9fc2e74d285ba4072688a74c2e9d", size = 7000412, upload-time = "2025-10-15T18:21:44.404Z" }, - { url = "https://files.pythonhosted.org/packages/b2/d2/5f675067ba82da7a1c238a73b32e3fd78d67f9d9f80fbadd33a40b9c0481/pillow-12.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:6ace95230bfb7cd79ef66caa064bbe2f2a1e63d93471c3a2e1f1348d9f22d6b7", size = 2435903, upload-time = "2025-10-15T18:21:46.29Z" }, - { url = "https://files.pythonhosted.org/packages/0e/5a/a2f6773b64edb921a756eb0729068acad9fc5208a53f4a349396e9436721/pillow-12.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0fd00cac9c03256c8b2ff58f162ebcd2587ad3e1f2e397eab718c47e24d231cc", size = 5289798, upload-time = "2025-10-15T18:21:47.763Z" }, - { url = "https://files.pythonhosted.org/packages/2e/05/069b1f8a2e4b5a37493da6c5868531c3f77b85e716ad7a590ef87d58730d/pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3475b96f5908b3b16c47533daaa87380c491357d197564e0ba34ae75c0f3257", size = 4650589, upload-time = "2025-10-15T18:21:49.515Z" }, - { url = "https://files.pythonhosted.org/packages/61/e3/2c820d6e9a36432503ead175ae294f96861b07600a7156154a086ba7111a/pillow-12.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:110486b79f2d112cf6add83b28b627e369219388f64ef2f960fef9ebaf54c642", size = 6230472, upload-time = "2025-10-15T18:21:51.052Z" }, - { url = "https://files.pythonhosted.org/packages/4f/89/63427f51c64209c5e23d4d52071c8d0f21024d3a8a487737caaf614a5795/pillow-12.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5269cc1caeedb67e6f7269a42014f381f45e2e7cd42d834ede3c703a1d915fe3", size = 8033887, upload-time = "2025-10-15T18:21:52.604Z" }, - { url = "https://files.pythonhosted.org/packages/f6/1b/c9711318d4901093c15840f268ad649459cd81984c9ec9887756cca049a5/pillow-12.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa5129de4e174daccbc59d0a3b6d20eaf24417d59851c07ebb37aeb02947987c", size = 6343964, upload-time = "2025-10-15T18:21:54.619Z" }, - { url = "https://files.pythonhosted.org/packages/41/1e/db9470f2d030b4995083044cd8738cdd1bf773106819f6d8ba12597d5352/pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bee2a6db3a7242ea309aa7ee8e2780726fed67ff4e5b40169f2c940e7eb09227", size = 7034756, upload-time = "2025-10-15T18:21:56.151Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b0/6177a8bdd5ee4ed87cba2de5a3cc1db55ffbbec6176784ce5bb75aa96798/pillow-12.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:90387104ee8400a7b4598253b4c406f8958f59fcf983a6cea2b50d59f7d63d0b", size = 6458075, upload-time = "2025-10-15T18:21:57.759Z" }, - { url = "https://files.pythonhosted.org/packages/bc/5e/61537aa6fa977922c6a03253a0e727e6e4a72381a80d63ad8eec350684f2/pillow-12.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc91a56697869546d1b8f0a3ff35224557ae7f881050e99f615e0119bf934b4e", size = 7125955, upload-time = "2025-10-15T18:21:59.372Z" }, - { url = "https://files.pythonhosted.org/packages/1f/3d/d5033539344ee3cbd9a4d69e12e63ca3a44a739eb2d4c8da350a3d38edd7/pillow-12.0.0-cp311-cp311-win32.whl", hash = "sha256:27f95b12453d165099c84f8a8bfdfd46b9e4bda9e0e4b65f0635430027f55739", size = 6298440, upload-time = "2025-10-15T18:22:00.982Z" }, - { url = "https://files.pythonhosted.org/packages/4d/42/aaca386de5cc8bd8a0254516957c1f265e3521c91515b16e286c662854c4/pillow-12.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b583dc9070312190192631373c6c8ed277254aa6e6084b74bdd0a6d3b221608e", size = 6999256, upload-time = "2025-10-15T18:22:02.617Z" }, - { url = "https://files.pythonhosted.org/packages/ba/f1/9197c9c2d5708b785f631a6dfbfa8eb3fb9672837cb92ae9af812c13b4ed/pillow-12.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:759de84a33be3b178a64c8ba28ad5c135900359e85fb662bc6e403ad4407791d", size = 2436025, upload-time = "2025-10-15T18:22:04.598Z" }, - { url = "https://files.pythonhosted.org/packages/2c/90/4fcce2c22caf044e660a198d740e7fbc14395619e3cb1abad12192c0826c/pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371", size = 5249377, upload-time = "2025-10-15T18:22:05.993Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082", size = 4650343, upload-time = "2025-10-15T18:22:07.718Z" }, - { url = "https://files.pythonhosted.org/packages/e7/a1/f81fdeddcb99c044bf7d6faa47e12850f13cee0849537a7d27eeab5534d4/pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f", size = 6232981, upload-time = "2025-10-15T18:22:09.287Z" }, - { url = "https://files.pythonhosted.org/packages/88/e1/9098d3ce341a8750b55b0e00c03f1630d6178f38ac191c81c97a3b047b44/pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d", size = 8041399, upload-time = "2025-10-15T18:22:10.872Z" }, - { url = "https://files.pythonhosted.org/packages/a7/62/a22e8d3b602ae8cc01446d0c57a54e982737f44b6f2e1e019a925143771d/pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953", size = 6347740, upload-time = "2025-10-15T18:22:12.769Z" }, - { url = "https://files.pythonhosted.org/packages/4f/87/424511bdcd02c8d7acf9f65caa09f291a519b16bd83c3fb3374b3d4ae951/pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8", size = 7040201, upload-time = "2025-10-15T18:22:14.813Z" }, - { url = "https://files.pythonhosted.org/packages/dc/4d/435c8ac688c54d11755aedfdd9f29c9eeddf68d150fe42d1d3dbd2365149/pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79", size = 6462334, upload-time = "2025-10-15T18:22:16.375Z" }, - { url = "https://files.pythonhosted.org/packages/2b/f2/ad34167a8059a59b8ad10bc5c72d4d9b35acc6b7c0877af8ac885b5f2044/pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba", size = 7134162, upload-time = "2025-10-15T18:22:17.996Z" }, - { url = "https://files.pythonhosted.org/packages/0c/b1/a7391df6adacf0a5c2cf6ac1cf1fcc1369e7d439d28f637a847f8803beb3/pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0", size = 6298769, upload-time = "2025-10-15T18:22:19.923Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0b/d87733741526541c909bbf159e338dcace4f982daac6e5a8d6be225ca32d/pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a", size = 7001107, upload-time = "2025-10-15T18:22:21.644Z" }, - { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" }, - { url = "https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" }, - { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" }, - { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" }, - { url = "https://files.pythonhosted.org/packages/a4/a4/a0a31467e3f83b94d37568294b01d22b43ae3c5d85f2811769b9c66389dd/pillow-12.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c50f36a62a22d350c96e49ad02d0da41dbd17ddc2e29750dbdba4323f85eb4a5", size = 5249132, upload-time = "2025-10-15T18:22:30.641Z" }, - { url = "https://files.pythonhosted.org/packages/83/06/48eab21dd561de2914242711434c0c0eb992ed08ff3f6107a5f44527f5e9/pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5193fde9a5f23c331ea26d0cf171fbf67e3f247585f50c08b3e205c7aeb4589b", size = 4650099, upload-time = "2025-10-15T18:22:32.73Z" }, - { url = "https://files.pythonhosted.org/packages/fc/bd/69ed99fd46a8dba7c1887156d3572fe4484e3f031405fcc5a92e31c04035/pillow-12.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bde737cff1a975b70652b62d626f7785e0480918dece11e8fef3c0cf057351c3", size = 6230808, upload-time = "2025-10-15T18:22:34.337Z" }, - { url = "https://files.pythonhosted.org/packages/ea/94/8fad659bcdbf86ed70099cb60ae40be6acca434bbc8c4c0d4ef356d7e0de/pillow-12.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6597ff2b61d121172f5844b53f21467f7082f5fb385a9a29c01414463f93b07", size = 8037804, upload-time = "2025-10-15T18:22:36.402Z" }, - { url = "https://files.pythonhosted.org/packages/20/39/c685d05c06deecfd4e2d1950e9a908aa2ca8bc4e6c3b12d93b9cafbd7837/pillow-12.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b817e7035ea7f6b942c13aa03bb554fc44fea70838ea21f8eb31c638326584e", size = 6345553, upload-time = "2025-10-15T18:22:38.066Z" }, - { url = "https://files.pythonhosted.org/packages/38/57/755dbd06530a27a5ed74f8cb0a7a44a21722ebf318edbe67ddbd7fb28f88/pillow-12.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4f1231b7dec408e8670264ce63e9c71409d9583dd21d32c163e25213ee2a344", size = 7037729, upload-time = "2025-10-15T18:22:39.769Z" }, - { url = "https://files.pythonhosted.org/packages/ca/b6/7e94f4c41d238615674d06ed677c14883103dce1c52e4af16f000338cfd7/pillow-12.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e51b71417049ad6ab14c49608b4a24d8fb3fe605e5dfabfe523b58064dc3d27", size = 6459789, upload-time = "2025-10-15T18:22:41.437Z" }, - { url = "https://files.pythonhosted.org/packages/9c/14/4448bb0b5e0f22dd865290536d20ec8a23b64e2d04280b89139f09a36bb6/pillow-12.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d120c38a42c234dc9a8c5de7ceaaf899cf33561956acb4941653f8bdc657aa79", size = 7130917, upload-time = "2025-10-15T18:22:43.152Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" }, - { url = "https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" }, - { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5e/9046b423735c21f0487ea6cb5b10f89ea8f8dfbe32576fe052b5ba9d4e5b/pillow-12.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7fa22993bac7b77b78cae22bad1e2a987ddf0d9015c63358032f84a53f23cdc3", size = 5251406, upload-time = "2025-10-15T18:22:49.905Z" }, - { url = "https://files.pythonhosted.org/packages/12/66/982ceebcdb13c97270ef7a56c3969635b4ee7cd45227fa707c94719229c5/pillow-12.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f135c702ac42262573fe9714dfe99c944b4ba307af5eb507abef1667e2cbbced", size = 4653218, upload-time = "2025-10-15T18:22:51.587Z" }, - { url = "https://files.pythonhosted.org/packages/16/b3/81e625524688c31859450119bf12674619429cab3119eec0e30a7a1029cb/pillow-12.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c85de1136429c524e55cfa4e033b4a7940ac5c8ee4d9401cc2d1bf48154bbc7b", size = 6266564, upload-time = "2025-10-15T18:22:53.215Z" }, - { url = "https://files.pythonhosted.org/packages/98/59/dfb38f2a41240d2408096e1a76c671d0a105a4a8471b1871c6902719450c/pillow-12.0.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38df9b4bfd3db902c9c2bd369bcacaf9d935b2fff73709429d95cc41554f7b3d", size = 8069260, upload-time = "2025-10-15T18:22:54.933Z" }, - { url = "https://files.pythonhosted.org/packages/dc/3d/378dbea5cd1874b94c312425ca77b0f47776c78e0df2df751b820c8c1d6c/pillow-12.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d87ef5795da03d742bf49439f9ca4d027cde49c82c5371ba52464aee266699a", size = 6379248, upload-time = "2025-10-15T18:22:56.605Z" }, - { url = "https://files.pythonhosted.org/packages/84/b0/d525ef47d71590f1621510327acec75ae58c721dc071b17d8d652ca494d8/pillow-12.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aff9e4d82d082ff9513bdd6acd4f5bd359f5b2c870907d2b0a9c5e10d40c88fe", size = 7066043, upload-time = "2025-10-15T18:22:58.53Z" }, - { url = "https://files.pythonhosted.org/packages/61/2c/aced60e9cf9d0cde341d54bf7932c9ffc33ddb4a1595798b3a5150c7ec4e/pillow-12.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8d8ca2b210ada074d57fcee40c30446c9562e542fc46aedc19baf758a93532ee", size = 6490915, upload-time = "2025-10-15T18:23:00.582Z" }, - { url = "https://files.pythonhosted.org/packages/ef/26/69dcb9b91f4e59f8f34b2332a4a0a951b44f547c4ed39d3e4dcfcff48f89/pillow-12.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:99a7f72fb6249302aa62245680754862a44179b545ded638cf1fef59befb57ef", size = 7157998, upload-time = "2025-10-15T18:23:02.627Z" }, - { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" }, - { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" }, - { url = "https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" }, - { url = "https://files.pythonhosted.org/packages/54/2a/9a8c6ba2c2c07b71bec92cf63e03370ca5e5f5c5b119b742bcc0cde3f9c5/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9", size = 4045531, upload-time = "2025-10-15T18:23:10.121Z" }, - { url = "https://files.pythonhosted.org/packages/84/54/836fdbf1bfb3d66a59f0189ff0b9f5f666cee09c6188309300df04ad71fa/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2", size = 4120554, upload-time = "2025-10-15T18:23:12.14Z" }, - { url = "https://files.pythonhosted.org/packages/0d/cd/16aec9f0da4793e98e6b54778a5fbce4f375c6646fe662e80600b8797379/pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a", size = 3576812, upload-time = "2025-10-15T18:23:13.962Z" }, - { url = "https://files.pythonhosted.org/packages/f6/b7/13957fda356dc46339298b351cae0d327704986337c3c69bb54628c88155/pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b", size = 5252689, upload-time = "2025-10-15T18:23:15.562Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f5/eae31a306341d8f331f43edb2e9122c7661b975433de5e447939ae61c5da/pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad", size = 4650186, upload-time = "2025-10-15T18:23:17.379Z" }, - { url = "https://files.pythonhosted.org/packages/86/62/2a88339aa40c4c77e79108facbd307d6091e2c0eb5b8d3cf4977cfca2fe6/pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01", size = 6230308, upload-time = "2025-10-15T18:23:18.971Z" }, - { url = "https://files.pythonhosted.org/packages/c7/33/5425a8992bcb32d1cb9fa3dd39a89e613d09a22f2c8083b7bf43c455f760/pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c", size = 8039222, upload-time = "2025-10-15T18:23:20.909Z" }, - { url = "https://files.pythonhosted.org/packages/d8/61/3f5d3b35c5728f37953d3eec5b5f3e77111949523bd2dd7f31a851e50690/pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e", size = 6346657, upload-time = "2025-10-15T18:23:23.077Z" }, - { url = "https://files.pythonhosted.org/packages/3a/be/ee90a3d79271227e0f0a33c453531efd6ed14b2e708596ba5dd9be948da3/pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e", size = 7038482, upload-time = "2025-10-15T18:23:25.005Z" }, - { url = "https://files.pythonhosted.org/packages/44/34/a16b6a4d1ad727de390e9bd9f19f5f669e079e5826ec0f329010ddea492f/pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9", size = 6461416, upload-time = "2025-10-15T18:23:27.009Z" }, - { url = "https://files.pythonhosted.org/packages/b6/39/1aa5850d2ade7d7ba9f54e4e4c17077244ff7a2d9e25998c38a29749eb3f/pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab", size = 7131584, upload-time = "2025-10-15T18:23:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/bf/db/4fae862f8fad0167073a7733973bfa955f47e2cac3dc3e3e6257d10fab4a/pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b", size = 6400621, upload-time = "2025-10-15T18:23:32.06Z" }, - { url = "https://files.pythonhosted.org/packages/2b/24/b350c31543fb0107ab2599464d7e28e6f856027aadda995022e695313d94/pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b", size = 7142916, upload-time = "2025-10-15T18:23:34.71Z" }, - { url = "https://files.pythonhosted.org/packages/0f/9b/0ba5a6fd9351793996ef7487c4fdbde8d3f5f75dbedc093bb598648fddf0/pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0", size = 2523836, upload-time = "2025-10-15T18:23:36.967Z" }, - { url = "https://files.pythonhosted.org/packages/f5/7a/ceee0840aebc579af529b523d530840338ecf63992395842e54edc805987/pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6", size = 5255092, upload-time = "2025-10-15T18:23:38.573Z" }, - { url = "https://files.pythonhosted.org/packages/44/76/20776057b4bfd1aef4eeca992ebde0f53a4dce874f3ae693d0ec90a4f79b/pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6", size = 4653158, upload-time = "2025-10-15T18:23:40.238Z" }, - { url = "https://files.pythonhosted.org/packages/82/3f/d9ff92ace07be8836b4e7e87e6a4c7a8318d47c2f1463ffcf121fc57d9cb/pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1", size = 6267882, upload-time = "2025-10-15T18:23:42.434Z" }, - { url = "https://files.pythonhosted.org/packages/9f/7a/4f7ff87f00d3ad33ba21af78bfcd2f032107710baf8280e3722ceec28cda/pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e", size = 8071001, upload-time = "2025-10-15T18:23:44.29Z" }, - { url = "https://files.pythonhosted.org/packages/75/87/fcea108944a52dad8cca0715ae6247e271eb80459364a98518f1e4f480c1/pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca", size = 6380146, upload-time = "2025-10-15T18:23:46.065Z" }, - { url = "https://files.pythonhosted.org/packages/91/52/0d31b5e571ef5fd111d2978b84603fce26aba1b6092f28e941cb46570745/pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925", size = 7067344, upload-time = "2025-10-15T18:23:47.898Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f4/2dd3d721f875f928d48e83bb30a434dee75a2531bca839bb996bb0aa5a91/pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8", size = 6491864, upload-time = "2025-10-15T18:23:49.607Z" }, - { url = "https://files.pythonhosted.org/packages/30/4b/667dfcf3d61fc309ba5a15b141845cece5915e39b99c1ceab0f34bf1d124/pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4", size = 7158911, upload-time = "2025-10-15T18:23:51.351Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2f/16cabcc6426c32218ace36bf0d55955e813f2958afddbf1d391849fee9d1/pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52", size = 6408045, upload-time = "2025-10-15T18:23:53.177Z" }, - { url = "https://files.pythonhosted.org/packages/35/73/e29aa0c9c666cf787628d3f0dcf379f4791fba79f4936d02f8b37165bdf8/pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a", size = 7148282, upload-time = "2025-10-15T18:23:55.316Z" }, - { url = "https://files.pythonhosted.org/packages/c1/70/6b41bdcddf541b437bbb9f47f94d2db5d9ddef6c37ccab8c9107743748a4/pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7", size = 2525630, upload-time = "2025-10-15T18:23:57.149Z" }, - { url = "https://files.pythonhosted.org/packages/1d/b3/582327e6c9f86d037b63beebe981425d6811104cb443e8193824ef1a2f27/pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8", size = 5215068, upload-time = "2025-10-15T18:23:59.594Z" }, - { url = "https://files.pythonhosted.org/packages/fd/d6/67748211d119f3b6540baf90f92fae73ae51d5217b171b0e8b5f7e5d558f/pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a", size = 4614994, upload-time = "2025-10-15T18:24:01.669Z" }, - { url = "https://files.pythonhosted.org/packages/2d/e1/f8281e5d844c41872b273b9f2c34a4bf64ca08905668c8ae730eedc7c9fa/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197", size = 5246639, upload-time = "2025-10-15T18:24:03.403Z" }, - { url = "https://files.pythonhosted.org/packages/94/5a/0d8ab8ffe8a102ff5df60d0de5af309015163bf710c7bb3e8311dd3b3ad0/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aeaefa96c768fc66818730b952a862235d68825c178f1b3ffd4efd7ad2edcb7c", size = 6986839, upload-time = "2025-10-15T18:24:05.344Z" }, - { url = "https://files.pythonhosted.org/packages/20/2e/3434380e8110b76cd9eb00a363c484b050f949b4bbe84ba770bb8508a02c/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f2d0abef9e4e2f349305a4f8cc784a8a6c2f58a8c4892eea13b10a943bd26e", size = 5313505, upload-time = "2025-10-15T18:24:07.137Z" }, - { url = "https://files.pythonhosted.org/packages/57/ca/5a9d38900d9d74785141d6580950fe705de68af735ff6e727cb911b64740/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdee52571a343d721fb2eb3b090a82d959ff37fc631e3f70422e0c2e029f3e76", size = 5963654, upload-time = "2025-10-15T18:24:09.579Z" }, - { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" }, +version = "12.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/41/f73d92b6b883a579e79600d391f2e21cb0df767b2714ecbd2952315dfeef/pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd", size = 5304089, upload-time = "2026-01-02T09:10:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/94/55/7aca2891560188656e4a91ed9adba305e914a4496800da6b5c0a15f09edf/pillow-12.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cad302dc10fac357d3467a74a9561c90609768a6f73a1923b0fd851b6486f8b0", size = 4657815, upload-time = "2026-01-02T09:10:27.063Z" }, + { url = "https://files.pythonhosted.org/packages/e9/d2/b28221abaa7b4c40b7dba948f0f6a708bd7342c4d47ce342f0ea39643974/pillow-12.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a40905599d8079e09f25027423aed94f2823adaf2868940de991e53a449e14a8", size = 6222593, upload-time = "2026-01-02T09:10:29.115Z" }, + { url = "https://files.pythonhosted.org/packages/71/b8/7a61fb234df6a9b0b479f69e66901209d89ff72a435b49933f9122f94cac/pillow-12.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92a7fe4225365c5e3a8e598982269c6d6698d3e783b3b1ae979e7819f9cd55c1", size = 8027579, upload-time = "2026-01-02T09:10:31.182Z" }, + { url = "https://files.pythonhosted.org/packages/ea/51/55c751a57cc524a15a0e3db20e5cde517582359508d62305a627e77fd295/pillow-12.1.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f10c98f49227ed8383d28174ee95155a675c4ed7f85e2e573b04414f7e371bda", size = 6335760, upload-time = "2026-01-02T09:10:33.02Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7c/60e3e6f5e5891a1a06b4c910f742ac862377a6fe842f7184df4a274ce7bf/pillow-12.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8637e29d13f478bc4f153d8daa9ffb16455f0a6cb287da1b432fdad2bfbd66c7", size = 7027127, upload-time = "2026-01-02T09:10:35.009Z" }, + { url = "https://files.pythonhosted.org/packages/06/37/49d47266ba50b00c27ba63a7c898f1bb41a29627ced8c09e25f19ebec0ff/pillow-12.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:21e686a21078b0f9cb8c8a961d99e6a4ddb88e0fc5ea6e130172ddddc2e5221a", size = 6449896, upload-time = "2026-01-02T09:10:36.793Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e5/67fd87d2913902462cd9b79c6211c25bfe95fcf5783d06e1367d6d9a741f/pillow-12.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2415373395a831f53933c23ce051021e79c8cd7979822d8cc478547a3f4da8ef", size = 7151345, upload-time = "2026-01-02T09:10:39.064Z" }, + { url = "https://files.pythonhosted.org/packages/bd/15/f8c7abf82af68b29f50d77c227e7a1f87ce02fdc66ded9bf603bc3b41180/pillow-12.1.0-cp310-cp310-win32.whl", hash = "sha256:e75d3dba8fc1ddfec0cd752108f93b83b4f8d6ab40e524a95d35f016b9683b09", size = 6325568, upload-time = "2026-01-02T09:10:41.035Z" }, + { url = "https://files.pythonhosted.org/packages/d4/24/7d1c0e160b6b5ac2605ef7d8be537e28753c0db5363d035948073f5513d7/pillow-12.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:64efdf00c09e31efd754448a383ea241f55a994fd079866b92d2bbff598aad91", size = 7032367, upload-time = "2026-01-02T09:10:43.09Z" }, + { url = "https://files.pythonhosted.org/packages/f4/03/41c038f0d7a06099254c60f618d0ec7be11e79620fc23b8e85e5b31d9a44/pillow-12.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:f188028b5af6b8fb2e9a76ac0f841a575bd1bd396e46ef0840d9b88a48fdbcea", size = 2452345, upload-time = "2026-01-02T09:10:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/43/c4/bf8328039de6cc22182c3ef007a2abfbbdab153661c0a9aa78af8d706391/pillow-12.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:a83e0850cb8f5ac975291ebfc4170ba481f41a28065277f7f735c202cd8e0af3", size = 5304057, upload-time = "2026-01-02T09:10:46.627Z" }, + { url = "https://files.pythonhosted.org/packages/43/06/7264c0597e676104cc22ca73ee48f752767cd4b1fe084662620b17e10120/pillow-12.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b6e53e82ec2db0717eabb276aa56cf4e500c9a7cec2c2e189b55c24f65a3e8c0", size = 4657811, upload-time = "2026-01-02T09:10:49.548Z" }, + { url = "https://files.pythonhosted.org/packages/72/64/f9189e44474610daf83da31145fa56710b627b5c4c0b9c235e34058f6b31/pillow-12.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40a8e3b9e8773876d6e30daed22f016509e3987bab61b3b7fe309d7019a87451", size = 6232243, upload-time = "2026-01-02T09:10:51.62Z" }, + { url = "https://files.pythonhosted.org/packages/ef/30/0df458009be6a4caca4ca2c52975e6275c387d4e5c95544e34138b41dc86/pillow-12.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:800429ac32c9b72909c671aaf17ecd13110f823ddb7db4dfef412a5587c2c24e", size = 8037872, upload-time = "2026-01-02T09:10:53.446Z" }, + { url = "https://files.pythonhosted.org/packages/e4/86/95845d4eda4f4f9557e25381d70876aa213560243ac1a6d619c46caaedd9/pillow-12.1.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b022eaaf709541b391ee069f0022ee5b36c709df71986e3f7be312e46f42c84", size = 6345398, upload-time = "2026-01-02T09:10:55.426Z" }, + { url = "https://files.pythonhosted.org/packages/5c/1f/8e66ab9be3aaf1435bc03edd1ebdf58ffcd17f7349c1d970cafe87af27d9/pillow-12.1.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f345e7bc9d7f368887c712aa5054558bad44d2a301ddf9248599f4161abc7c0", size = 7034667, upload-time = "2026-01-02T09:10:57.11Z" }, + { url = "https://files.pythonhosted.org/packages/f9/f6/683b83cb9b1db1fb52b87951b1c0b99bdcfceaa75febf11406c19f82cb5e/pillow-12.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d70347c8a5b7ccd803ec0c85c8709f036e6348f1e6a5bf048ecd9c64d3550b8b", size = 6458743, upload-time = "2026-01-02T09:10:59.331Z" }, + { url = "https://files.pythonhosted.org/packages/9a/7d/de833d63622538c1d58ce5395e7c6cb7e7dce80decdd8bde4a484e095d9f/pillow-12.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1fcc52d86ce7a34fd17cb04e87cfdb164648a3662a6f20565910a99653d66c18", size = 7159342, upload-time = "2026-01-02T09:11:01.82Z" }, + { url = "https://files.pythonhosted.org/packages/8c/40/50d86571c9e5868c42b81fe7da0c76ca26373f3b95a8dd675425f4a92ec1/pillow-12.1.0-cp311-cp311-win32.whl", hash = "sha256:3ffaa2f0659e2f740473bcf03c702c39a8d4b2b7ffc629052028764324842c64", size = 6328655, upload-time = "2026-01-02T09:11:04.556Z" }, + { url = "https://files.pythonhosted.org/packages/6c/af/b1d7e301c4cd26cd45d4af884d9ee9b6fab893b0ad2450d4746d74a6968c/pillow-12.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:806f3987ffe10e867bab0ddad45df1148a2b98221798457fa097ad85d6e8bc75", size = 7031469, upload-time = "2026-01-02T09:11:06.538Z" }, + { url = "https://files.pythonhosted.org/packages/48/36/d5716586d887fb2a810a4a61518a327a1e21c8b7134c89283af272efe84b/pillow-12.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:9f5fefaca968e700ad1a4a9de98bf0869a94e397fe3524c4c9450c1445252304", size = 2452515, upload-time = "2026-01-02T09:11:08.226Z" }, + { url = "https://files.pythonhosted.org/packages/20/31/dc53fe21a2f2996e1b7d92bf671cdb157079385183ef7c1ae08b485db510/pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b", size = 5262642, upload-time = "2026-01-02T09:11:10.138Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c1/10e45ac9cc79419cedf5121b42dcca5a50ad2b601fa080f58c22fb27626e/pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551", size = 4657464, upload-time = "2026-01-02T09:11:12.319Z" }, + { url = "https://files.pythonhosted.org/packages/ad/26/7b82c0ab7ef40ebede7a97c72d473bda5950f609f8e0c77b04af574a0ddb/pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208", size = 6234878, upload-time = "2026-01-02T09:11:14.096Z" }, + { url = "https://files.pythonhosted.org/packages/76/25/27abc9792615b5e886ca9411ba6637b675f1b77af3104710ac7353fe5605/pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5", size = 8044868, upload-time = "2026-01-02T09:11:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/0a/ea/f200a4c36d836100e7bc738fc48cd963d3ba6372ebc8298a889e0cfc3359/pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661", size = 6349468, upload-time = "2026-01-02T09:11:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/11/8f/48d0b77ab2200374c66d344459b8958c86693be99526450e7aee714e03e4/pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17", size = 7041518, upload-time = "2026-01-02T09:11:19.389Z" }, + { url = "https://files.pythonhosted.org/packages/1d/23/c281182eb986b5d31f0a76d2a2c8cd41722d6fb8ed07521e802f9bba52de/pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670", size = 6462829, upload-time = "2026-01-02T09:11:21.28Z" }, + { url = "https://files.pythonhosted.org/packages/25/ef/7018273e0faac099d7b00982abdcc39142ae6f3bd9ceb06de09779c4a9d6/pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616", size = 7166756, upload-time = "2026-01-02T09:11:23.559Z" }, + { url = "https://files.pythonhosted.org/packages/8f/c8/993d4b7ab2e341fe02ceef9576afcf5830cdec640be2ac5bee1820d693d4/pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7", size = 6328770, upload-time = "2026-01-02T09:11:25.661Z" }, + { url = "https://files.pythonhosted.org/packages/a7/87/90b358775a3f02765d87655237229ba64a997b87efa8ccaca7dd3e36e7a7/pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d", size = 7033406, upload-time = "2026-01-02T09:11:27.474Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cf/881b457eccacac9e5b2ddd97d5071fb6d668307c57cbf4e3b5278e06e536/pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c", size = 2452612, upload-time = "2026-01-02T09:11:29.309Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, + { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, + { url = "https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/28ab865de622e14b747f0cd7877510848252d950e43002e224fb1c9ababf/pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587", size = 5262410, upload-time = "2026-01-02T09:11:36.682Z" }, + { url = "https://files.pythonhosted.org/packages/1c/34/583420a1b55e715937a85bd48c5c0991598247a1fd2eb5423188e765ea02/pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac", size = 4657312, upload-time = "2026-01-02T09:11:38.535Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fd/f5a0896839762885b3376ff04878f86ab2b097c2f9a9cdccf4eda8ba8dc0/pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b", size = 6232605, upload-time = "2026-01-02T09:11:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/98/aa/938a09d127ac1e70e6ed467bd03834350b33ef646b31edb7452d5de43792/pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea", size = 8041617, upload-time = "2026-01-02T09:11:42.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/e8/538b24cb426ac0186e03f80f78bc8dc7246c667f58b540bdd57c71c9f79d/pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c", size = 6346509, upload-time = "2026-01-02T09:11:44.955Z" }, + { url = "https://files.pythonhosted.org/packages/01/9a/632e58ec89a32738cabfd9ec418f0e9898a2b4719afc581f07c04a05e3c9/pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc", size = 7038117, upload-time = "2026-01-02T09:11:46.736Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a2/d40308cf86eada842ca1f3ffa45d0ca0df7e4ab33c83f81e73f5eaed136d/pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644", size = 6460151, upload-time = "2026-01-02T09:11:48.625Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/f5b058ad6453a085c5266660a1417bdad590199da1b32fb4efcff9d33b05/pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c", size = 7164534, upload-time = "2026-01-02T09:11:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ba/970b7d85ba01f348dee4d65412476321d40ee04dcb51cd3735b9dc94eb58/pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d", size = 5264816, upload-time = "2026-01-02T09:11:58.227Z" }, + { url = "https://files.pythonhosted.org/packages/10/60/650f2fb55fdba7a510d836202aa52f0baac633e50ab1cf18415d332188fb/pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0", size = 4660472, upload-time = "2026-01-02T09:12:00.798Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/5273a99478956a099d533c4f46cbaa19fd69d606624f4334b85e50987a08/pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554", size = 6268974, upload-time = "2026-01-02T09:12:02.572Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/0bf714bc2e73d5267887d47931d53c4ceeceea6978148ed2ab2a4e6463c4/pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e", size = 8073070, upload-time = "2026-01-02T09:12:04.75Z" }, + { url = "https://files.pythonhosted.org/packages/43/cf/1ea826200de111a9d65724c54f927f3111dc5ae297f294b370a670c17786/pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82", size = 6380176, upload-time = "2026-01-02T09:12:06.626Z" }, + { url = "https://files.pythonhosted.org/packages/03/e0/7938dd2b2013373fd85d96e0f38d62b7a5a262af21ac274250c7ca7847c9/pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4", size = 7067061, upload-time = "2026-01-02T09:12:08.624Z" }, + { url = "https://files.pythonhosted.org/packages/86/ad/a2aa97d37272a929a98437a8c0ac37b3cf012f4f8721e1bd5154699b2518/pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0", size = 6491824, upload-time = "2026-01-02T09:12:10.488Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/80e46611b288d51b115826f136fb3465653c28f491068a72d3da49b54cd4/pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b", size = 7190911, upload-time = "2026-01-02T09:12:12.772Z" }, + { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/bdf971d8bbcf80a348cc3bacfcb239f5882100fe80534b0ce67a784181d8/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91", size = 4062533, upload-time = "2026-01-02T09:12:20.791Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4f/5eb37a681c68d605eb7034c004875c81f86ec9ef51f5be4a63eadd58859a/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796", size = 4138546, upload-time = "2026-01-02T09:12:23.664Z" }, + { url = "https://files.pythonhosted.org/packages/11/6d/19a95acb2edbace40dcd582d077b991646b7083c41b98da4ed7555b59733/pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd", size = 3601163, upload-time = "2026-01-02T09:12:26.338Z" }, + { url = "https://files.pythonhosted.org/packages/fc/36/2b8138e51cb42e4cc39c3297713455548be855a50558c3ac2beebdc251dd/pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13", size = 5266086, upload-time = "2026-01-02T09:12:28.782Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/649056e4d22e1caa90816bf99cef0884aed607ed38075bd75f091a607a38/pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e", size = 4657344, upload-time = "2026-01-02T09:12:31.117Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6b/c5742cea0f1ade0cd61485dc3d81f05261fc2276f537fbdc00802de56779/pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643", size = 6232114, upload-time = "2026-01-02T09:12:32.936Z" }, + { url = "https://files.pythonhosted.org/packages/bf/8f/9f521268ce22d63991601aafd3d48d5ff7280a246a1ef62d626d67b44064/pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5", size = 8042708, upload-time = "2026-01-02T09:12:34.78Z" }, + { url = "https://files.pythonhosted.org/packages/1a/eb/257f38542893f021502a1bbe0c2e883c90b5cff26cc33b1584a841a06d30/pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de", size = 6347762, upload-time = "2026-01-02T09:12:36.748Z" }, + { url = "https://files.pythonhosted.org/packages/c4/5a/8ba375025701c09b309e8d5163c5a4ce0102fa86bbf8800eb0d7ac87bc51/pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9", size = 7039265, upload-time = "2026-01-02T09:12:39.082Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/cf5e4cdb3db533f539e88a7bbf9f190c64ab8a08a9bc7a4ccf55067872e4/pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a", size = 6462341, upload-time = "2026-01-02T09:12:40.946Z" }, + { url = "https://files.pythonhosted.org/packages/d0/47/0291a25ac9550677e22eda48510cfc4fa4b2ef0396448b7fbdc0a6946309/pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a", size = 7165395, upload-time = "2026-01-02T09:12:42.706Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4c/e005a59393ec4d9416be06e6b45820403bb946a778e39ecec62f5b2b991e/pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030", size = 6431413, upload-time = "2026-01-02T09:12:44.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/af/f23697f587ac5f9095d67e31b81c95c0249cd461a9798a061ed6709b09b5/pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94", size = 7176779, upload-time = "2026-01-02T09:12:46.727Z" }, + { url = "https://files.pythonhosted.org/packages/b3/36/6a51abf8599232f3e9afbd16d52829376a68909fe14efe29084445db4b73/pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4", size = 2543105, upload-time = "2026-01-02T09:12:49.243Z" }, + { url = "https://files.pythonhosted.org/packages/82/54/2e1dd20c8749ff225080d6ba465a0cab4387f5db0d1c5fb1439e2d99923f/pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2", size = 5268571, upload-time = "2026-01-02T09:12:51.11Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/571163a5ef86ec0cf30d265ac2a70ae6fc9e28413d1dc94fa37fae6bda89/pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61", size = 4660426, upload-time = "2026-01-02T09:12:52.865Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e1/53ee5163f794aef1bf84243f755ee6897a92c708505350dd1923f4afec48/pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51", size = 6269908, upload-time = "2026-01-02T09:12:54.884Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/b4b4106ff0ee1afa1dc599fde6ab230417f800279745124f6c50bcffed8e/pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc", size = 8074733, upload-time = "2026-01-02T09:12:56.802Z" }, + { url = "https://files.pythonhosted.org/packages/19/9f/80b411cbac4a732439e629a26ad3ef11907a8c7fc5377b7602f04f6fe4e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14", size = 6381431, upload-time = "2026-01-02T09:12:58.823Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b7/d65c45db463b66ecb6abc17c6ba6917a911202a07662247e1355ce1789e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8", size = 7068529, upload-time = "2026-01-02T09:13:00.885Z" }, + { url = "https://files.pythonhosted.org/packages/50/96/dfd4cd726b4a45ae6e3c669fc9e49deb2241312605d33aba50499e9d9bd1/pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924", size = 6492981, upload-time = "2026-01-02T09:13:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1c/b5dc52cf713ae46033359c5ca920444f18a6359ce1020dd3e9c553ea5bc6/pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef", size = 7191878, upload-time = "2026-01-02T09:13:05.276Z" }, + { url = "https://files.pythonhosted.org/packages/53/26/c4188248bd5edaf543864fe4834aebe9c9cb4968b6f573ce014cc42d0720/pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988", size = 6438703, upload-time = "2026-01-02T09:13:07.491Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0e/69ed296de8ea05cb03ee139cee600f424ca166e632567b2d66727f08c7ed/pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6", size = 7182927, upload-time = "2026-01-02T09:13:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/68334c015eed9b5cff77814258717dec591ded209ab5b6fb70e2ae873d1d/pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831", size = 2545104, upload-time = "2026-01-02T09:13:12.068Z" }, + { url = "https://files.pythonhosted.org/packages/8b/bc/224b1d98cffd7164b14707c91aac83c07b047fbd8f58eba4066a3e53746a/pillow-12.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ca94b6aac0d7af2a10ba08c0f888b3d5114439b6b3ef39968378723622fed377", size = 5228605, upload-time = "2026-01-02T09:13:14.084Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ca/49ca7769c4550107de049ed85208240ba0f330b3f2e316f24534795702ce/pillow-12.1.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:351889afef0f485b84078ea40fe33727a0492b9af3904661b0abbafee0355b72", size = 4622245, upload-time = "2026-01-02T09:13:15.964Z" }, + { url = "https://files.pythonhosted.org/packages/73/48/fac807ce82e5955bcc2718642b94b1bd22a82a6d452aea31cbb678cddf12/pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb0984b30e973f7e2884362b7d23d0a348c7143ee559f38ef3eaab640144204c", size = 5247593, upload-time = "2026-01-02T09:13:17.913Z" }, + { url = "https://files.pythonhosted.org/packages/d2/95/3e0742fe358c4664aed4fd05d5f5373dcdad0b27af52aa0972568541e3f4/pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:84cabc7095dd535ca934d57e9ce2a72ffd216e435a84acb06b2277b1de2689bd", size = 6989008, upload-time = "2026-01-02T09:13:20.083Z" }, + { url = "https://files.pythonhosted.org/packages/5a/74/fe2ac378e4e202e56d50540d92e1ef4ff34ed687f3c60f6a121bcf99437e/pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53d8b764726d3af1a138dd353116f774e3862ec7e3794e0c8781e30db0f35dfc", size = 5313824, upload-time = "2026-01-02T09:13:22.405Z" }, + { url = "https://files.pythonhosted.org/packages/f3/77/2a60dee1adee4e2655ac328dd05c02a955c1cd683b9f1b82ec3feb44727c/pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5da841d81b1a05ef940a8567da92decaa15bc4d7dedb540a8c219ad83d91808a", size = 5963278, upload-time = "2026-01-02T09:13:24.706Z" }, + { url = "https://files.pythonhosted.org/packages/2d/71/64e9b1c7f04ae0027f788a248e6297d7fcc29571371fe7d45495a78172c0/pillow-12.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:75af0b4c229ac519b155028fa1be632d812a519abba9b46b20e50c6caa184f19", size = 7029809, upload-time = "2026-01-02T09:13:26.541Z" }, ] [[package]] @@ -3907,28 +3905,30 @@ wheels = [ [[package]] name = "psutil" -version = "7.1.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/93/0c49e776b8734fef56ec9c5c57f923922f2cf0497d62e0f419465f28f3d0/psutil-7.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0005da714eee687b4b8decd3d6cc7c6db36215c9e74e5ad2264b90c3df7d92dc", size = 239751, upload-time = "2025-11-02T12:25:58.161Z" }, - { url = "https://files.pythonhosted.org/packages/6f/8d/b31e39c769e70780f007969815195a55c81a63efebdd4dbe9e7a113adb2f/psutil-7.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:19644c85dcb987e35eeeaefdc3915d059dac7bd1167cdcdbf27e0ce2df0c08c0", size = 240368, upload-time = "2025-11-02T12:26:00.491Z" }, - { url = "https://files.pythonhosted.org/packages/62/61/23fd4acc3c9eebbf6b6c78bcd89e5d020cfde4acf0a9233e9d4e3fa698b4/psutil-7.1.3-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95ef04cf2e5ba0ab9eaafc4a11eaae91b44f4ef5541acd2ee91d9108d00d59a7", size = 287134, upload-time = "2025-11-02T12:26:02.613Z" }, - { url = "https://files.pythonhosted.org/packages/30/1c/f921a009ea9ceb51aa355cb0cc118f68d354db36eae18174bab63affb3e6/psutil-7.1.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1068c303be3a72f8e18e412c5b2a8f6d31750fb152f9cb106b54090296c9d251", size = 289904, upload-time = "2025-11-02T12:26:05.207Z" }, - { url = "https://files.pythonhosted.org/packages/a6/82/62d68066e13e46a5116df187d319d1724b3f437ddd0f958756fc052677f4/psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa", size = 249642, upload-time = "2025-11-02T12:26:07.447Z" }, - { url = "https://files.pythonhosted.org/packages/df/ad/c1cd5fe965c14a0392112f68362cfceb5230819dbb5b1888950d18a11d9f/psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee", size = 245518, upload-time = "2025-11-02T12:26:09.719Z" }, - { url = "https://files.pythonhosted.org/packages/2e/bb/6670bded3e3236eb4287c7bcdc167e9fae6e1e9286e437f7111caed2f909/psutil-7.1.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b403da1df4d6d43973dc004d19cee3b848e998ae3154cc8097d139b77156c353", size = 239843, upload-time = "2025-11-02T12:26:11.968Z" }, - { url = "https://files.pythonhosted.org/packages/b8/66/853d50e75a38c9a7370ddbeefabdd3d3116b9c31ef94dc92c6729bc36bec/psutil-7.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad81425efc5e75da3f39b3e636293360ad8d0b49bed7df824c79764fb4ba9b8b", size = 240369, upload-time = "2025-11-02T12:26:14.358Z" }, - { url = "https://files.pythonhosted.org/packages/41/bd/313aba97cb5bfb26916dc29cf0646cbe4dd6a89ca69e8c6edce654876d39/psutil-7.1.3-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f33a3702e167783a9213db10ad29650ebf383946e91bc77f28a5eb083496bc9", size = 288210, upload-time = "2025-11-02T12:26:16.699Z" }, - { url = "https://files.pythonhosted.org/packages/c2/fa/76e3c06e760927a0cfb5705eb38164254de34e9bd86db656d4dbaa228b04/psutil-7.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fac9cd332c67f4422504297889da5ab7e05fd11e3c4392140f7370f4208ded1f", size = 291182, upload-time = "2025-11-02T12:26:18.848Z" }, - { url = "https://files.pythonhosted.org/packages/0f/1d/5774a91607035ee5078b8fd747686ebec28a962f178712de100d00b78a32/psutil-7.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:3792983e23b69843aea49c8f5b8f115572c5ab64c153bada5270086a2123c7e7", size = 250466, upload-time = "2025-11-02T12:26:21.183Z" }, - { url = "https://files.pythonhosted.org/packages/00/ca/e426584bacb43a5cb1ac91fae1937f478cd8fbe5e4ff96574e698a2c77cd/psutil-7.1.3-cp314-cp314t-win_arm64.whl", hash = "sha256:31d77fcedb7529f27bb3a0472bea9334349f9a04160e8e6e5020f22c59893264", size = 245756, upload-time = "2025-11-02T12:26:23.148Z" }, - { url = "https://files.pythonhosted.org/packages/ef/94/46b9154a800253e7ecff5aaacdf8ebf43db99de4a2dfa18575b02548654e/psutil-7.1.3-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bdbcd0e58ca14996a42adf3621a6244f1bb2e2e528886959c72cf1e326677ab", size = 238359, upload-time = "2025-11-02T12:26:25.284Z" }, - { url = "https://files.pythonhosted.org/packages/68/3a/9f93cff5c025029a36d9a92fef47220ab4692ee7f2be0fba9f92813d0cb8/psutil-7.1.3-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:bc31fa00f1fbc3c3802141eede66f3a2d51d89716a194bf2cd6fc68310a19880", size = 239171, upload-time = "2025-11-02T12:26:27.23Z" }, - { url = "https://files.pythonhosted.org/packages/ce/b1/5f49af514f76431ba4eea935b8ad3725cdeb397e9245ab919dbc1d1dc20f/psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb428f9f05c1225a558f53e30ccbad9930b11c3fc206836242de1091d3e7dd3", size = 263261, upload-time = "2025-11-02T12:26:29.48Z" }, - { url = "https://files.pythonhosted.org/packages/e0/95/992c8816a74016eb095e73585d747e0a8ea21a061ed3689474fabb29a395/psutil-7.1.3-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d974e02ca2c8eb4812c3f76c30e28836fffc311d55d979f1465c1feeb2b68b", size = 264635, upload-time = "2025-11-02T12:26:31.74Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" }, - { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, +version = "7.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/73/cb/09e5184fb5fc0358d110fc3ca7f6b1d033800734d34cac10f4136cfac10e/psutil-7.2.1.tar.gz", hash = "sha256:f7583aec590485b43ca601dd9cea0dcd65bd7bb21d30ef4ddbf4ea6b5ed1bdd3", size = 490253, upload-time = "2025-12-29T08:26:00.169Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/8e/f0c242053a368c2aa89584ecd1b054a18683f13d6e5a318fc9ec36582c94/psutil-7.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ba9f33bb525b14c3ea563b2fd521a84d2fa214ec59e3e6a2858f78d0844dd60d", size = 129624, upload-time = "2025-12-29T08:26:04.255Z" }, + { url = "https://files.pythonhosted.org/packages/26/97/a58a4968f8990617decee234258a2b4fc7cd9e35668387646c1963e69f26/psutil-7.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:81442dac7abfc2f4f4385ea9e12ddf5a796721c0f6133260687fec5c3780fa49", size = 130132, upload-time = "2025-12-29T08:26:06.228Z" }, + { url = "https://files.pythonhosted.org/packages/db/6d/ed44901e830739af5f72a85fa7ec5ff1edea7f81bfbf4875e409007149bd/psutil-7.2.1-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ea46c0d060491051d39f0d2cff4f98d5c72b288289f57a21556cc7d504db37fc", size = 180612, upload-time = "2025-12-29T08:26:08.276Z" }, + { url = "https://files.pythonhosted.org/packages/c7/65/b628f8459bca4efbfae50d4bf3feaab803de9a160b9d5f3bd9295a33f0c2/psutil-7.2.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35630d5af80d5d0d49cfc4d64c1c13838baf6717a13effb35869a5919b854cdf", size = 183201, upload-time = "2025-12-29T08:26:10.622Z" }, + { url = "https://files.pythonhosted.org/packages/fb/23/851cadc9764edcc18f0effe7d0bf69f727d4cf2442deb4a9f78d4e4f30f2/psutil-7.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:923f8653416604e356073e6e0bccbe7c09990acef442def2f5640dd0faa9689f", size = 139081, upload-time = "2025-12-29T08:26:12.483Z" }, + { url = "https://files.pythonhosted.org/packages/59/82/d63e8494ec5758029f31c6cb06d7d161175d8281e91d011a4a441c8a43b5/psutil-7.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cfbe6b40ca48019a51827f20d830887b3107a74a79b01ceb8cc8de4ccb17b672", size = 134767, upload-time = "2025-12-29T08:26:14.528Z" }, + { url = "https://files.pythonhosted.org/packages/05/c2/5fb764bd61e40e1fe756a44bd4c21827228394c17414ade348e28f83cd79/psutil-7.2.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:494c513ccc53225ae23eec7fe6e1482f1b8a44674241b54561f755a898650679", size = 129716, upload-time = "2025-12-29T08:26:16.017Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d2/935039c20e06f615d9ca6ca0ab756cf8408a19d298ffaa08666bc18dc805/psutil-7.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3fce5f92c22b00cdefd1645aa58ab4877a01679e901555067b1bd77039aa589f", size = 130133, upload-time = "2025-12-29T08:26:18.009Z" }, + { url = "https://files.pythonhosted.org/packages/77/69/19f1eb0e01d24c2b3eacbc2f78d3b5add8a89bf0bb69465bc8d563cc33de/psutil-7.2.1-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93f3f7b0bb07711b49626e7940d6fe52aa9940ad86e8f7e74842e73189712129", size = 181518, upload-time = "2025-12-29T08:26:20.241Z" }, + { url = "https://files.pythonhosted.org/packages/e1/6d/7e18b1b4fa13ad370787626c95887b027656ad4829c156bb6569d02f3262/psutil-7.2.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d34d2ca888208eea2b5c68186841336a7f5e0b990edec929be909353a202768a", size = 184348, upload-time = "2025-12-29T08:26:22.215Z" }, + { url = "https://files.pythonhosted.org/packages/98/60/1672114392dd879586d60dd97896325df47d9a130ac7401318005aab28ec/psutil-7.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2ceae842a78d1603753561132d5ad1b2f8a7979cb0c283f5b52fb4e6e14b1a79", size = 140400, upload-time = "2025-12-29T08:26:23.993Z" }, + { url = "https://files.pythonhosted.org/packages/fb/7b/d0e9d4513c46e46897b46bcfc410d51fc65735837ea57a25170f298326e6/psutil-7.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:08a2f175e48a898c8eb8eace45ce01777f4785bc744c90aa2cc7f2fa5462a266", size = 135430, upload-time = "2025-12-29T08:26:25.999Z" }, + { url = "https://files.pythonhosted.org/packages/c5/cf/5180eb8c8bdf6a503c6919f1da28328bd1e6b3b1b5b9d5b01ae64f019616/psutil-7.2.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b2e953fcfaedcfbc952b44744f22d16575d3aa78eb4f51ae74165b4e96e55f42", size = 128137, upload-time = "2025-12-29T08:26:27.759Z" }, + { url = "https://files.pythonhosted.org/packages/c5/2c/78e4a789306a92ade5000da4f5de3255202c534acdadc3aac7b5458fadef/psutil-7.2.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:05cc68dbb8c174828624062e73078e7e35406f4ca2d0866c272c2410d8ef06d1", size = 128947, upload-time = "2025-12-29T08:26:29.548Z" }, + { url = "https://files.pythonhosted.org/packages/29/f8/40e01c350ad9a2b3cb4e6adbcc8a83b17ee50dd5792102b6142385937db5/psutil-7.2.1-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e38404ca2bb30ed7267a46c02f06ff842e92da3bb8c5bfdadbd35a5722314d8", size = 154694, upload-time = "2025-12-29T08:26:32.147Z" }, + { url = "https://files.pythonhosted.org/packages/06/e4/b751cdf839c011a9714a783f120e6a86b7494eb70044d7d81a25a5cd295f/psutil-7.2.1-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab2b98c9fc19f13f59628d94df5cc4cc4844bc572467d113a8b517d634e362c6", size = 156136, upload-time = "2025-12-29T08:26:34.079Z" }, + { url = "https://files.pythonhosted.org/packages/44/ad/bbf6595a8134ee1e94a4487af3f132cef7fce43aef4a93b49912a48c3af7/psutil-7.2.1-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:f78baafb38436d5a128f837fab2d92c276dfb48af01a240b861ae02b2413ada8", size = 148108, upload-time = "2025-12-29T08:26:36.225Z" }, + { url = "https://files.pythonhosted.org/packages/1c/15/dd6fd869753ce82ff64dcbc18356093471a5a5adf4f77ed1f805d473d859/psutil-7.2.1-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99a4cd17a5fdd1f3d014396502daa70b5ec21bf4ffe38393e152f8e449757d67", size = 147402, upload-time = "2025-12-29T08:26:39.21Z" }, + { url = "https://files.pythonhosted.org/packages/34/68/d9317542e3f2b180c4306e3f45d3c922d7e86d8ce39f941bb9e2e9d8599e/psutil-7.2.1-cp37-abi3-win_amd64.whl", hash = "sha256:b1b0671619343aa71c20ff9767eced0483e4fc9e1f489d50923738caf6a03c17", size = 136938, upload-time = "2025-12-29T08:26:41.036Z" }, + { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" }, ] [[package]] @@ -4245,39 +4245,37 @@ wheels = [ [[package]] name = "pynacl" -version = "1.6.1" +version = "1.6.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" }, - { url = "https://files.pythonhosted.org/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" }, - { url = "https://files.pythonhosted.org/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" }, - { url = "https://files.pythonhosted.org/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" }, - { url = "https://files.pythonhosted.org/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" }, - { url = "https://files.pythonhosted.org/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" }, - { url = "https://files.pythonhosted.org/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" }, - { url = "https://files.pythonhosted.org/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" }, - { url = "https://files.pythonhosted.org/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" }, - { url = "https://files.pythonhosted.org/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" }, - { url = "https://files.pythonhosted.org/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" }, - { url = "https://files.pythonhosted.org/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" }, - { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" }, - { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" }, - { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" }, - { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" }, - { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" }, - { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" }, - { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" }, - { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" }, - { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" }, - { url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" }, - { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" }, - { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" }, - { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/d9/9a/4019b524b03a13438637b11538c82781a5eda427394380381af8f04f467a/pynacl-1.6.2.tar.gz", hash = "sha256:018494d6d696ae03c7e656e5e74cdfd8ea1326962cc401bcf018f1ed8436811c", size = 3511692, upload-time = "2026-01-01T17:48:10.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/79/0e3c34dc3c4671f67d251c07aa8eb100916f250ee470df230b0ab89551b4/pynacl-1.6.2-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:622d7b07cc5c02c666795792931b50c91f3ce3c2649762efb1ef0d5684c81594", size = 390064, upload-time = "2026-01-01T17:31:57.264Z" }, + { url = "https://files.pythonhosted.org/packages/eb/1c/23a26e931736e13b16483795c8a6b2f641bf6a3d5238c22b070a5112722c/pynacl-1.6.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d071c6a9a4c94d79eb665db4ce5cedc537faf74f2355e4d502591d850d3913c0", size = 809370, upload-time = "2026-01-01T17:31:59.198Z" }, + { url = "https://files.pythonhosted.org/packages/87/74/8d4b718f8a22aea9e8dcc8b95deb76d4aae380e2f5b570cc70b5fd0a852d/pynacl-1.6.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe9847ca47d287af41e82be1dd5e23023d3c31a951da134121ab02e42ac218c9", size = 1408304, upload-time = "2026-01-01T17:32:01.162Z" }, + { url = "https://files.pythonhosted.org/packages/fd/73/be4fdd3a6a87fe8a4553380c2b47fbd1f7f58292eb820902f5c8ac7de7b0/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:04316d1fc625d860b6c162fff704eb8426b1a8bcd3abacea11142cbd99a6b574", size = 844871, upload-time = "2026-01-01T17:32:02.824Z" }, + { url = "https://files.pythonhosted.org/packages/55/ad/6efc57ab75ee4422e96b5f2697d51bbcf6cdcc091e66310df91fbdc144a8/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44081faff368d6c5553ccf55322ef2819abb40e25afaec7e740f159f74813634", size = 1446356, upload-time = "2026-01-01T17:32:04.452Z" }, + { url = "https://files.pythonhosted.org/packages/78/b7/928ee9c4779caa0a915844311ab9fb5f99585621c5d6e4574538a17dca07/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:a9f9932d8d2811ce1a8ffa79dcbdf3970e7355b5c8eb0c1a881a57e7f7d96e88", size = 826814, upload-time = "2026-01-01T17:32:06.078Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a9/1bdba746a2be20f8809fee75c10e3159d75864ef69c6b0dd168fc60e485d/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:bc4a36b28dd72fb4845e5d8f9760610588a96d5a51f01d84d8c6ff9849968c14", size = 1411742, upload-time = "2026-01-01T17:32:07.651Z" }, + { url = "https://files.pythonhosted.org/packages/f3/2f/5e7ea8d85f9f3ea5b6b87db1d8388daa3587eed181bdeb0306816fdbbe79/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bffb6d0f6becacb6526f8f42adfb5efb26337056ee0831fb9a7044d1a964444", size = 801714, upload-time = "2026-01-01T17:32:09.558Z" }, + { url = "https://files.pythonhosted.org/packages/06/ea/43fe2f7eab5f200e40fb10d305bf6f87ea31b3bbc83443eac37cd34a9e1e/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2fef529ef3ee487ad8113d287a593fa26f48ee3620d92ecc6f1d09ea38e0709b", size = 1372257, upload-time = "2026-01-01T17:32:11.026Z" }, + { url = "https://files.pythonhosted.org/packages/4d/54/c9ea116412788629b1347e415f72195c25eb2f3809b2d3e7b25f5c79f13a/pynacl-1.6.2-cp314-cp314t-win32.whl", hash = "sha256:a84bf1c20339d06dc0c85d9aea9637a24f718f375d861b2668b2f9f96fa51145", size = 231319, upload-time = "2026-01-01T17:32:12.46Z" }, + { url = "https://files.pythonhosted.org/packages/ce/04/64e9d76646abac2dccf904fccba352a86e7d172647557f35b9fe2a5ee4a1/pynacl-1.6.2-cp314-cp314t-win_amd64.whl", hash = "sha256:320ef68a41c87547c91a8b58903c9caa641ab01e8512ce291085b5fe2fcb7590", size = 244044, upload-time = "2026-01-01T17:32:13.781Z" }, + { url = "https://files.pythonhosted.org/packages/33/33/7873dc161c6a06f43cda13dec67b6fe152cb2f982581151956fa5e5cdb47/pynacl-1.6.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d29bfe37e20e015a7d8b23cfc8bd6aa7909c92a1b8f41ee416bbb3e79ef182b2", size = 188740, upload-time = "2026-01-01T17:32:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/be/7b/4845bbf88e94586ec47a432da4e9107e3fc3ce37eb412b1398630a37f7dd/pynacl-1.6.2-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:c949ea47e4206af7c8f604b8278093b674f7c79ed0d4719cc836902bf4517465", size = 388458, upload-time = "2026-01-01T17:32:16.829Z" }, + { url = "https://files.pythonhosted.org/packages/1e/b4/e927e0653ba63b02a4ca5b4d852a8d1d678afbf69b3dbf9c4d0785ac905c/pynacl-1.6.2-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8845c0631c0be43abdd865511c41eab235e0be69c81dc66a50911594198679b0", size = 800020, upload-time = "2026-01-01T17:32:18.34Z" }, + { url = "https://files.pythonhosted.org/packages/7f/81/d60984052df5c97b1d24365bc1e30024379b42c4edcd79d2436b1b9806f2/pynacl-1.6.2-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:22de65bb9010a725b0dac248f353bb072969c94fa8d6b1f34b87d7953cf7bbe4", size = 1399174, upload-time = "2026-01-01T17:32:20.239Z" }, + { url = "https://files.pythonhosted.org/packages/68/f7/322f2f9915c4ef27d140101dd0ed26b479f7e6f5f183590fd32dfc48c4d3/pynacl-1.6.2-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46065496ab748469cdd999246d17e301b2c24ae2fdf739132e580a0e94c94a87", size = 835085, upload-time = "2026-01-01T17:32:22.24Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d0/f301f83ac8dbe53442c5a43f6a39016f94f754d7a9815a875b65e218a307/pynacl-1.6.2-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a66d6fb6ae7661c58995f9c6435bda2b1e68b54b598a6a10247bfcdadac996c", size = 1437614, upload-time = "2026-01-01T17:32:23.766Z" }, + { url = "https://files.pythonhosted.org/packages/c4/58/fc6e649762b029315325ace1a8c6be66125e42f67416d3dbd47b69563d61/pynacl-1.6.2-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:26bfcd00dcf2cf160f122186af731ae30ab120c18e8375684ec2670dccd28130", size = 818251, upload-time = "2026-01-01T17:32:25.69Z" }, + { url = "https://files.pythonhosted.org/packages/c9/a8/b917096b1accc9acd878819a49d3d84875731a41eb665f6ebc826b1af99e/pynacl-1.6.2-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:c8a231e36ec2cab018c4ad4358c386e36eede0319a0c41fed24f840b1dac59f6", size = 1402859, upload-time = "2026-01-01T17:32:27.215Z" }, + { url = "https://files.pythonhosted.org/packages/85/42/fe60b5f4473e12c72f977548e4028156f4d340b884c635ec6b063fe7e9a5/pynacl-1.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:68be3a09455743ff9505491220b64440ced8973fe930f270c8e07ccfa25b1f9e", size = 791926, upload-time = "2026-01-01T17:32:29.314Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f9/e40e318c604259301cc091a2a63f237d9e7b424c4851cafaea4ea7c4834e/pynacl-1.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8b097553b380236d51ed11356c953bf8ce36a29a3e596e934ecabe76c985a577", size = 1363101, upload-time = "2026-01-01T17:32:31.263Z" }, + { url = "https://files.pythonhosted.org/packages/48/47/e761c254f410c023a469284a9bc210933e18588ca87706ae93002c05114c/pynacl-1.6.2-cp38-abi3-win32.whl", hash = "sha256:5811c72b473b2f38f7e2a3dc4f8642e3a3e9b5e7317266e4ced1fba85cae41aa", size = 227421, upload-time = "2026-01-01T17:32:33.076Z" }, + { url = "https://files.pythonhosted.org/packages/41/ad/334600e8cacc7d86587fe5f565480fde569dfb487389c8e1be56ac21d8ac/pynacl-1.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:62985f233210dee6548c223301b6c25440852e13d59a8b81490203c3227c5ba0", size = 239754, upload-time = "2026-01-01T17:32:34.557Z" }, + { url = "https://files.pythonhosted.org/packages/29/7d/5945b5af29534641820d3bd7b00962abbbdfee84ec7e19f0d5b3175f9a31/pynacl-1.6.2-cp38-abi3-win_arm64.whl", hash = "sha256:834a43af110f743a754448463e8fd61259cd4ab5bbedcf70f9dabad1d28a394c", size = 184801, upload-time = "2026-01-01T17:32:36.309Z" }, ] [[package]] @@ -4376,15 +4374,15 @@ wheels = [ [[package]] name = "python-gitlab" -version = "7.0.0" +version = "7.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "requests-toolbelt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5e/c4/0b613303b4f0fcda69b3d2e03d0a1fb1b6b079a7c7832e03a8d92461e9fe/python_gitlab-7.0.0.tar.gz", hash = "sha256:e4d934430f64efc09e6208b782c61cc0a3389527765e03ffbef17f4323dce441", size = 400568, upload-time = "2025-10-29T15:06:02.069Z" } +sdist = { url = "https://files.pythonhosted.org/packages/31/98/0b5d0a0367b90aec818298390b60ae65e6a08989cf5140271d0ee0206882/python_gitlab-7.1.0.tar.gz", hash = "sha256:1c34da3de40ad21675d788136f73d20a60649513e692f52c5a9720434db97c46", size = 401058, upload-time = "2025-12-28T01:27:01.369Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/9e/811edc46a15f8deb828cba7ef8aab3451dc11ca72d033f3df72a5af865d9/python_gitlab-7.0.0-py3-none-any.whl", hash = "sha256:712a6c8c5e79e7e66f6dabb25d8fe7831a6b238d4a5132f8231df6b3b890ceff", size = 144415, upload-time = "2025-10-29T15:06:00.232Z" }, + { url = "https://files.pythonhosted.org/packages/14/44/70fa1e395731b6a4b1f249d5f7326f3bb6281e2cf94d6535f679239f4b93/python_gitlab-7.1.0-py3-none-any.whl", hash = "sha256:8e42030cf27674e7ec9ea1f6d2fedcaaef0a6210f5fa22c80721abaa3a4fec90", size = 144441, upload-time = "2025-12-28T01:26:59.726Z" }, ] [[package]] @@ -4517,7 +4515,7 @@ wheels = [ [[package]] name = "ray" -version = "2.51.2" +version = "2.53.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -4530,21 +4528,21 @@ dependencies = [ { name = "requests" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/ad/59270b7d1003152ef231b65c38c3721066fc970b2a2475314e7c8ee81990/ray-2.51.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:eb9b995de9ba3110373f00e77dda86f6a55a80a58114b1eae5e6daf1f5697338", size = 68040029, upload-time = "2025-11-29T00:28:25.435Z" }, - { url = "https://files.pythonhosted.org/packages/bc/bf/43442642cf4f29ac9ef721d9b184512ed84436e65d8244f1867e31b1ecdb/ray-2.51.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:983adacd9cecf2f74f7915560036f14c5d4fabdf6f65d959debc92820373729d", size = 70344819, upload-time = "2025-11-29T00:28:32.157Z" }, - { url = "https://files.pythonhosted.org/packages/57/78/79d8b884492b28c5d9ec99fd8750baaf30e311e79013e9f137dafee3b246/ray-2.51.2-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:572d8f7e95e506d6264c7b916fe70e765e3367d5f1bc9755bc1d73c8607a2ac6", size = 71172369, upload-time = "2025-11-29T00:28:38.511Z" }, - { url = "https://files.pythonhosted.org/packages/6a/26/632c509eda0742f6c9e8c876ebe308cfdefdd2cdd414fcb4e65c37490995/ray-2.51.2-cp310-cp310-win_amd64.whl", hash = "sha256:05d1cdd0352f9da10555899cb6212ac9a2e783b05c20c2989cae09531c1b1969", size = 26696512, upload-time = "2025-11-29T00:28:42.955Z" }, - { url = "https://files.pythonhosted.org/packages/6d/fa/4ee6a516d9de9d5fa7ecd0e59888c9ab1a2bedaec06fe9c6b91d0f9523b2/ray-2.51.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:26100d25b0ca5162e7404d57247ad697514709c6f41db7efb3d312d78a5ef292", size = 68044847, upload-time = "2025-11-29T00:28:47.902Z" }, - { url = "https://files.pythonhosted.org/packages/92/ca/06b1b761e8c4398c2818f0ac04e14c2f2937fa79bf9be6ffc74d785641fb/ray-2.51.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:1102471b4edb08605001be781f094c2291805d8e4a118ad8b59b833b12d4f13f", size = 70464861, upload-time = "2025-11-29T00:28:53.591Z" }, - { url = "https://files.pythonhosted.org/packages/7c/b0/7dda0bf542f3cf08fae67c57ec61422d4f8b3d0342d0d03057eefb93886e/ray-2.51.2-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:ad6aafbb7f67d1edbe3cad72b9e33ee99b0ed31ca7210ee8c6af9db1d1c4d850", size = 71286437, upload-time = "2025-11-29T00:28:59.26Z" }, - { url = "https://files.pythonhosted.org/packages/57/c9/31289a53bf4418b9fe71be8f7780ee520ef5f76fb5a5cdd5dcff9e41fb0b/ray-2.51.2-cp311-cp311-win_amd64.whl", hash = "sha256:a48e3871cc2b526bca7de84527fdf56875115829fab518cc938dd4c64e0174b9", size = 26692167, upload-time = "2025-11-29T00:29:03.786Z" }, - { url = "https://files.pythonhosted.org/packages/70/54/66fcfebd26c9747d908e2ac24f3a8a5502e84f19ea1e7a9b7f4d4a12bc34/ray-2.51.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:461b0e711f73cebc68128bca7202bef8db2c0e14dc6d49140f96549e5e752eb1", size = 68030141, upload-time = "2025-11-29T00:29:08.67Z" }, - { url = "https://files.pythonhosted.org/packages/0e/9e/7add3c78a5a3d05f9c702d247da83a8a3e30d57eae153985f48ec3309c82/ray-2.51.2-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:5c97f29574072e3568a2714a84e6948fb457ce09eefd251c919221584b2d458d", size = 70506728, upload-time = "2025-11-29T00:29:14.051Z" }, - { url = "https://files.pythonhosted.org/packages/b3/8e/5d1325619399d7eb9563e2f883f8e782fb26b39a122d6d629e54c8989a5a/ray-2.51.2-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:7b2a842744a1d4b47af8f3c0665a319736139518dd2e26fb9e18114281d8f9ea", size = 71359570, upload-time = "2025-11-29T00:29:19.508Z" }, - { url = "https://files.pythonhosted.org/packages/ba/96/ec1ee03fb1731d9e09d94d7ba6d9e47fce886d7cc79aac47e8422fe9c528/ray-2.51.2-cp312-cp312-win_amd64.whl", hash = "sha256:6b04ca7dccf540da2ab07fd7073009dfe04d9d084d705e337572272fa3e56485", size = 26675734, upload-time = "2025-11-29T00:29:24.27Z" }, - { url = "https://files.pythonhosted.org/packages/70/89/255ac2a70928a1d439c98fca9f3437cabbbebd3ac767523df608cce39197/ray-2.51.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:c9ed290667868c809eb467ad8830d887fdce10dac2c674b3d43d3b3b5f9c7b07", size = 67975149, upload-time = "2025-11-29T00:29:28.995Z" }, - { url = "https://files.pythonhosted.org/packages/d3/05/1e3bb04e263a2bc1eacd762b37a0013d18f76341de0a7199d84a5a00b372/ray-2.51.2-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:554bd393e97bed9dfa5f73f47e4fbf42aa35d81b1228081aa93ccb7cdd5d4b34", size = 70414911, upload-time = "2025-11-29T00:29:34.286Z" }, - { url = "https://files.pythonhosted.org/packages/c4/85/f6994a74cf5e6fa6ebc959c27ff6f1f5352b78e71b947b4b302c6bb0a203/ray-2.51.2-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:e3bf004ed23971ec5d324ed9748aed23f6645d56696a44cdbe35d331f66c4619", size = 71275062, upload-time = "2025-11-29T00:29:39.379Z" }, + { url = "https://files.pythonhosted.org/packages/2f/99/21986c7f8135dafbf7c49229c52faaa9d2d365db7d86fffe978dde8ee967/ray-2.53.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4db914a0a6dd608fa49c066929a1282745a2dbd73caee67d7b80fe684ca65bdd", size = 69473649, upload-time = "2025-12-20T16:05:40.58Z" }, + { url = "https://files.pythonhosted.org/packages/70/d9/58b5426a3f11993851db3c93841358cebdddd948153481d355b720f31f9d/ray-2.53.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:4108280d8a1cb90d7d68e5c954c35e63b8bb9a4ba15f88c5e7da0e2025647712", size = 71342662, upload-time = "2025-12-20T16:05:46.936Z" }, + { url = "https://files.pythonhosted.org/packages/c5/05/4aa32370b313481c2d1d41cb53ec786daebdb2ef665b01ef2ac43d9cf457/ray-2.53.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:4dbb5fce1364763f29741055f50abe33cf726397141f9cc0e845dd3cc963e455", size = 72188620, upload-time = "2025-12-20T16:05:52.817Z" }, + { url = "https://files.pythonhosted.org/packages/f7/c6/21efe5886898421df20078a333b0984eade7d7aa4bdc68a336f0c66db27e/ray-2.53.0-cp310-cp310-win_amd64.whl", hash = "sha256:90faf630d20b6abf3135997fb3edb5842134aff92e04ee709865db04816d97ef", size = 27200553, upload-time = "2025-12-20T16:05:57.655Z" }, + { url = "https://files.pythonhosted.org/packages/bf/64/d5c29a4b014d8b9a624203a88b67630072c1d6960425dbf7a1f0fa5d6b74/ray-2.53.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bd3ec4c342776ddac23ae2b108c64f5939f417ccc4875900d586c7c978463269", size = 69479296, upload-time = "2025-12-20T16:06:05.111Z" }, + { url = "https://files.pythonhosted.org/packages/c6/41/9e19d1e5d9458a5ba157c36642e2874bcb22fddbd7c1e77b668e5afc3f3d/ray-2.53.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:a0bbb98b0b0f25a3ee075ca10171e1260e70b6bc690cd509ecd7ce1228af854d", size = 71463449, upload-time = "2025-12-20T16:06:10.983Z" }, + { url = "https://files.pythonhosted.org/packages/63/de/58c19906b0dd16ea06b4f2465b7327f5f180e6b6e1c8c9b610d7c589ea5f/ray-2.53.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:eb000c17f7301071fdd15c44c4cd3ac0f7953bb4c7c227e61719fe7048195bcd", size = 72305102, upload-time = "2025-12-20T16:06:17.989Z" }, + { url = "https://files.pythonhosted.org/packages/b1/43/72cc1cfe17d26abe62a793eab10445f9546dce24192b85a6cd0cdc47ed86/ray-2.53.0-cp311-cp311-win_amd64.whl", hash = "sha256:4a1bb3fe09ab4cd0d16ddc96b9f60c9ed83b3f93b87aa8506e0d3b746fd4e825", size = 27194174, upload-time = "2025-12-20T16:06:23.042Z" }, + { url = "https://files.pythonhosted.org/packages/b2/44/562718a634e63e8ef7985285288a167d4af62bc2a7decce3300cf937776a/ray-2.53.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:d8b95d047d947493803fb8417aea31225dcacdab15afdc75b8a238901949d457", size = 69463763, upload-time = "2025-12-20T16:06:28.685Z" }, + { url = "https://files.pythonhosted.org/packages/38/68/8e59b8413f3751fe7ce8b98ee8787d13964b47a4043587950790a9dd2151/ray-2.53.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:65e2ce58d3dc6baa3cf45824d889c1968ebde565ee54dfd80a98af8f31af8e4a", size = 71504450, upload-time = "2025-12-20T16:06:34.922Z" }, + { url = "https://files.pythonhosted.org/packages/2a/db/978a50d264565ca42e2a4bf115ec9a1f04f19ca5e620e6aa2f280747b644/ray-2.53.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:14f46363e9b4cf0c1c8b4d8623ec337c5bd408377831b5e5b50067930137bbca", size = 72370424, upload-time = "2025-12-20T16:06:40.821Z" }, + { url = "https://files.pythonhosted.org/packages/8d/6c/bba6f22a9d83ee8f236000ba315f0c197bdc79888b4fa42fd762f729cbbd/ray-2.53.0-cp312-cp312-win_amd64.whl", hash = "sha256:b828c147f9ff2f277b1d254e4fe9a746fdfaee7e313a93a97c7edf4dae9b81a4", size = 27178106, upload-time = "2025-12-20T16:06:45.594Z" }, + { url = "https://files.pythonhosted.org/packages/3d/38/450cf9cf3c490fa4cc6d470597f819444da60f85579d2b34b95ee79fcb6f/ray-2.53.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:85b472ab6fb8f1189f8cef81913fd91b24dd69b3fa7dcca7e144827bd924f6c0", size = 69409819, upload-time = "2025-12-20T16:06:50.668Z" }, + { url = "https://files.pythonhosted.org/packages/71/5e/d452970b07174d5e4f8688abae889d01321b51ced827db1f1d1cb7d56d44/ray-2.53.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:7196e5358dfcc8211be864f45e6dfe4827202df294af3c7a76ff8fbc080e0522", size = 71409529, upload-time = "2025-12-20T16:06:56.2Z" }, + { url = "https://files.pythonhosted.org/packages/cb/84/50b317a125617a638a64694c12f56183edd5df01828a35fa4c55c7b13c66/ray-2.53.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:73dbbaa7962a7f5e38aa8cf9483e0e9817205e989aa3dc859c738c2af1ae01df", size = 72283961, upload-time = "2025-12-20T16:07:05.831Z" }, ] [[package]] @@ -4991,7 +4989,7 @@ resolution-markers = [ "python_full_version == '3.11.*' and sys_platform != 'linux'", ] dependencies = [ - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } wheels = [ @@ -5213,7 +5211,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } wheels = [ @@ -5486,7 +5484,7 @@ dependencies = [ { name = "grpcio" }, { name = "markdown" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pillow" }, { name = "protobuf" }, @@ -5560,7 +5558,7 @@ resolution-markers = [ ] dependencies = [ { name = "ml-dtypes", marker = "python_full_version >= '3.11'" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/88/18/7b91daa9cf29dbb6bfdd603154f355c9069a9cd8c757038fe52b0f613611/tensorstore-0.1.80.tar.gz", hash = "sha256:4158fe76b96f62d12a37d7868150d836e089b5280b2bdd363c43c5d651f10e26", size = 7090032, upload-time = "2025-12-10T21:35:10.941Z" } wheels = [ @@ -5653,27 +5651,32 @@ wheels = [ [[package]] name = "tokenizers" -version = "0.22.1" +version = "0.22.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "huggingface-hub" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" }, - { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" }, - { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" }, - { url = "https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" }, - { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" }, - { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" }, - { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" }, - { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" }, - { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" }, - { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" }, - { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" }, - { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" }, - { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" }, - { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" }, + { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" }, + { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" }, + { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" }, + { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" }, + { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" }, + { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" }, + { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" }, + { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" }, + { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" }, + { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" }, + { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" }, + { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" }, + { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" }, + { url = "https://files.pythonhosted.org/packages/84/04/655b79dbcc9b3ac5f1479f18e931a344af67e5b7d3b251d2dcdcd7558592/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:753d47ebd4542742ef9261d9da92cd545b2cacbb48349a1225466745bb866ec4", size = 3282301, upload-time = "2026-01-05T10:40:34.858Z" }, + { url = "https://files.pythonhosted.org/packages/46/cd/e4851401f3d8f6f45d8480262ab6a5c8cb9c4302a790a35aa14eeed6d2fd/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e10bf9113d209be7cd046d40fbabbaf3278ff6d18eb4da4c500443185dc1896c", size = 3161308, upload-time = "2026-01-05T10:40:40.737Z" }, + { url = "https://files.pythonhosted.org/packages/6f/6e/55553992a89982cd12d4a66dddb5e02126c58677ea3931efcbe601d419db/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64d94e84f6660764e64e7e0b22baa72f6cd942279fdbb21d46abd70d179f0195", size = 3718964, upload-time = "2026-01-05T10:40:46.56Z" }, + { url = "https://files.pythonhosted.org/packages/59/8c/b1c87148aa15e099243ec9f0cf9d0e970cc2234c3257d558c25a2c5304e6/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f01a9c019878532f98927d2bacb79bbb404b43d3437455522a00a30718cdedb5", size = 3373542, upload-time = "2026-01-05T10:40:52.803Z" }, ] [[package]] @@ -5810,7 +5813,7 @@ version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "torch", marker = "sys_platform == 'never'" }, { name = "torchvision", marker = "sys_platform == 'never'" }, ] @@ -5825,7 +5828,7 @@ version = "0.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pillow", marker = "sys_platform != 'linux'" }, { name = "torch", marker = "sys_platform == 'never'" }, ] @@ -5894,60 +5897,17 @@ wheels = [ [[package]] name = "transformer-engine" -version = "2.10.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/45/b3402a4931c0850ac662b532888d7cb89d5d8f22324309ae8d24557340ee/transformer_engine-2.10.0-py3-none-any.whl", hash = "sha256:a14ccf4e887409be062c0bd8c4a341df55a77baad6aea6aabfe39c24e38252e5", size = 696221, upload-time = "2025-12-02T20:53:17.688Z" }, -] - -[package.optional-dependencies] -core-cu13 = [ - { name = "transformer-engine-cu13" }, -] -pytorch = [ - { name = "transformer-engine-torch" }, -] - -[[package]] -name = "transformer-engine-cu12" -version = "2.10.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "packaging" }, - { name = "pydantic" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/24/3c/9f480a555c4707cd7b091c5341cc96db1af80b5bfb1a2eae834fb704283b/transformer_engine_cu12-2.10.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:ddd6f4f1f2a8f2c450ea0210d04a08a7b8ceff49a4d900f27b3858980502f21b", size = 286567840, upload-time = "2025-12-02T20:50:26.438Z" }, - { url = "https://files.pythonhosted.org/packages/29/c7/b63b6989262fcf37402a910112aaee9f3273338d9d1d854478e022f5deb7/transformer_engine_cu12-2.10.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:6766d4ea1643a2606d498aa396d4e7da1046fe01580fdef2047c2c8aa37936b0", size = 287067223, upload-time = "2025-12-02T20:52:11.248Z" }, -] - -[[package]] -name = "transformer-engine-cu13" -version = "2.10.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "packaging" }, - { name = "pydantic" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/31/bf/34a93b94ec3a8e707e9c5660c76533316357e3b84d08f5cc676787a196c5/transformer_engine_cu13-2.10.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7409c48a5478acc15b7ac88231be3c45aa9e7c9d17f4875ad31d1bc1650595dd", size = 176560075, upload-time = "2025-12-02T20:48:52.307Z" }, - { url = "https://files.pythonhosted.org/packages/48/80/1f08d928e7e0ce3f10c6cfa6871b17d13cec070dffb8b88ed9308653ac77/transformer_engine_cu13-2.10.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:b8ee7bd6cef455e07bad61d645af290940aa58856d70fda05e1f973353a85349", size = 177257305, upload-time = "2025-12-02T20:51:36.94Z" }, -] - -[[package]] -name = "transformer-engine-torch" -version = "2.10.0" -source = { registry = "https://pypi.org/simple" } +version = "2.11.0+c188b533" +source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.11#c188b533cc3721ca9c6bbfd26148f5cf60108c25" } dependencies = [ { name = "einops" }, + { name = "importlib-metadata" }, { name = "onnx" }, { name = "onnxscript" }, + { name = "packaging" }, + { name = "pydantic" }, { name = "torch", marker = "sys_platform == 'never'" }, - { name = "transformer-engine-cu12" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/18/94/609a7772569d3acdba34261be7fd30b75f5ff4e5f704117c9e0da517b079/transformer_engine_torch-2.10.0.tar.gz", hash = "sha256:71faff8e3def742553ad74b4e32d2d12e91be9acfb13d1699c89e1e18dd4ecd6", size = 220302, upload-time = "2025-12-02T20:53:57.876Z" } [[package]] name = "transformers" @@ -5957,7 +5917,7 @@ dependencies = [ { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, @@ -6003,7 +5963,7 @@ wheels = [ [[package]] name = "typer" -version = "0.20.0" +version = "0.21.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -6011,9 +5971,9 @@ dependencies = [ { name = "shellingham" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" } +sdist = { url = "https://files.pythonhosted.org/packages/36/bf/8825b5929afd84d0dabd606c67cd57b8388cb3ec385f7ef19c5cc2202069/typer-0.21.1.tar.gz", hash = "sha256:ea835607cd752343b6b2b7ce676893e5a0324082268b48f27aa058bdb7d2145d", size = 110371, upload-time = "2026-01-06T11:21:10.989Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" }, ] [[package]] @@ -6070,16 +6030,16 @@ wheels = [ [[package]] name = "uvicorn" -version = "0.38.0" +version = "0.40.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, { name = "h11" }, { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/cb/ce/f06b84e2697fef4688ca63bdb2fdf113ca0a3be33f94488f2cadb690b0cf/uvicorn-0.38.0.tar.gz", hash = "sha256:fd97093bdd120a2609fc0d3afe931d4d4ad688b6e75f0f929fde1bc36fe0e91d", size = 80605, upload-time = "2025-10-18T13:46:44.63Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/d1/8f3c683c9561a4e6689dd3b1d345c815f10f86acd044ee1fb9a4dcd0b8c5/uvicorn-0.40.0.tar.gz", hash = "sha256:839676675e87e73694518b5574fd0f24c9d97b46bea16df7b8c05ea1a51071ea", size = 81761, upload-time = "2025-12-21T14:16:22.45Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ee/d9/d88e73ca598f4f6ff671fb5fde8a32925c2e08a637303a1d12883c7305fa/uvicorn-0.38.0-py3-none-any.whl", hash = "sha256:48c0afd214ceb59340075b4a052ea1ee91c16fbc2a9b1469cca0e54566977b02", size = 68109, upload-time = "2025-10-18T13:46:42.958Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, ] [[package]] @@ -6242,7 +6202,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "braceexpand" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, - { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" }, { name = "pyyaml" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" } From de866fa56682b00a9e332c1116142e6173e13edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 13 Jan 2026 09:38:07 +0100 Subject: [PATCH 224/248] ci(fix): Update golden values (#2921) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_dev_dgx_h100.json | 480 ++-- .../golden_values_dev_dgx_h100.json | 480 ++-- .../golden_values_dev_dgx_h100.json | 480 ++-- .../golden_values_dev_dgx_h100.json | 480 ++-- .../golden_values_lts_dgx_a100.json | 538 +---- .../golden_values_dev_dgx_h100.json | 2050 ++++++++--------- .../golden_values_dev_dgx_h100.json | 492 ++-- .../golden_values_dev_dgx_h100.json | 446 ++-- .../golden_values_dev_dgx_h100.json | 494 ++-- .../golden_values_dev_dgx_h100.json | 472 ++-- .../golden_values_dev_dgx_h100.json | 390 ++-- .../golden_values_dev_dgx_h100.json | 1140 ++++----- 12 files changed, 3703 insertions(+), 4239 deletions(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json index 02b4683ea0b..81005995dad 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2/golden_values_dev_dgx_h100.json @@ -4,55 +4,55 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.86535, - "2": 10.85873, - "3": 10.86284, - "4": 10.84009, + "1": 10.86539, + "2": 10.85871, + "3": 10.86282, + "4": 10.84007, "5": 10.87856, - "6": 10.88856, - "7": 10.86532, - "8": 10.86017, - "9": 10.8599, - "10": 10.82981, - "11": 10.8895, - "12": 10.8751, - "13": 10.87423, + "6": 10.88852, + "7": 10.86536, + "8": 10.86015, + "9": 10.85991, + "10": 10.82982, + "11": 10.88947, + "12": 10.87511, + "13": 10.87422, "14": 10.89675, - "15": 10.82054, - "16": 10.82504, + "15": 10.82056, + "16": 10.82497, "17": 10.78983, "18": 10.81029, - "19": 10.80535, - "20": 10.70398, - "21": 10.66993, - "22": 10.50643, - "23": 10.69004, - "24": 10.56314, - "25": 10.4942, - "26": 10.56628, - "27": 10.58025, + "19": 10.80528, + "20": 10.70396, + "21": 10.6699, + "22": 10.50641, + "23": 10.69006, + "24": 10.56312, + "25": 10.49418, + "26": 10.56627, + "27": 10.58023, "28": 10.51571, - "29": 10.55299, - "30": 10.30549, - "31": 10.02245, - "32": 10.40614, + "29": 10.55296, + "30": 10.30551, + "31": 10.02244, + "32": 10.40618, "33": 10.39874, - "34": 10.13771, + "34": 10.1377, "35": 10.20184, - "36": 10.16052, - "37": 10.28973, - "38": 10.11474, + "36": 10.1605, + "37": 10.28975, + "38": 10.11483, "39": 10.361, - "40": 10.01903, + "40": 10.01905, "41": 10.07292, - "42": 10.14698, - "43": 9.74687, - "44": 9.87766, - "45": 9.74966, - "46": 9.73383, - "47": 10.07535, - "48": 9.78068, - "49": 9.44784, + "42": 10.14697, + "43": 9.74684, + "44": 9.87763, + "45": 9.74962, + "46": 9.73382, + "47": 10.07536, + "48": 9.78071, + "49": 9.44783, "50": 9.8399 } }, @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 653.0, - "2": 642.0, - "3": 630.0, - "4": 585.0, - "5": 635.0, - "6": 687.0, - "7": 615.0, - "8": 601.0, - "9": 607.0, - "10": 522.0, - "11": 637.0, - "12": 675.0, - "13": 649.0, - "14": 648.0, - "15": 640.0, - "16": 602.0, - "17": 668.0, - "18": 634.0, - "19": 593.0, - "20": 579.0, - "21": 633.0, - "22": 597.0, - "23": 756.0, - "24": 612.0, - "25": 591.0, - "26": 620.0, - "27": 700.0, - "28": 705.0, - "29": 795.0, - "30": 752.0, - "31": 628.0, - "32": 712.0, - "33": 752.0, - "34": 737.0, - "35": 741.0, - "36": 770.0, - "37": 861.0, - "38": 823.0, - "39": 812.0, - "40": 814.0, - "41": 826.0, - "42": 801.0, - "43": 769.0, - "44": 822.0, - "45": 777.0, - "46": 828.0, - "47": 878.0, - "48": 915.0, - "49": 908.0, - "50": 848.0 + "1": 572.0, + "2": 656.0, + "3": 649.0, + "4": 631.0, + "5": 658.0, + "6": 636.0, + "7": 636.0, + "8": 542.0, + "9": 653.0, + "10": 551.0, + "11": 681.0, + "12": 642.0, + "13": 624.0, + "14": 658.0, + "15": 682.0, + "16": 659.0, + "17": 620.0, + "18": 603.0, + "19": 634.0, + "20": 639.0, + "21": 634.0, + "22": 602.0, + "23": 731.0, + "24": 620.0, + "25": 611.0, + "26": 626.0, + "27": 683.0, + "28": 668.0, + "29": 713.0, + "30": 712.0, + "31": 616.0, + "32": 786.0, + "33": 800.0, + "34": 702.0, + "35": 684.0, + "36": 664.0, + "37": 831.0, + "38": 802.0, + "39": 919.0, + "40": 802.0, + "41": 791.0, + "42": 840.0, + "43": 718.0, + "44": 756.0, + "45": 765.0, + "46": 809.0, + "47": 839.0, + "48": 827.0, + "49": 935.0, + "50": 839.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 510689792.0, - "2": 510689792.0, - "3": 510689792.0, - "4": 510689792.0, - "5": 510689792.0, - "6": 510689792.0, - "7": 510689792.0, - "8": 510689792.0, - "9": 510689792.0, - "10": 510689792.0, - "11": 510689792.0, - "12": 510689792.0, - "13": 510689792.0, - "14": 510689792.0, - "15": 510689792.0, - "16": 510689792.0, - "17": 510689792.0, - "18": 510689792.0, - "19": 510689792.0, - "20": 510689792.0, - "21": 510689792.0, - "22": 510689792.0, - "23": 510689792.0, - "24": 510689792.0, - "25": 510689792.0, - "26": 510689792.0, - "27": 510689792.0, - "28": 510689792.0, - "29": 510689792.0, - "30": 510689792.0, - "31": 510689792.0, - "32": 510689792.0, - "33": 510689792.0, - "34": 510689792.0, - "35": 510689792.0, - "36": 510689792.0, - "37": 510689792.0, - "38": 510689792.0, - "39": 510689792.0, - "40": 510689792.0, - "41": 510689792.0, - "42": 510689792.0, - "43": 510689792.0, - "44": 510689792.0, - "45": 510689792.0, - "46": 510689792.0, - "47": 510689792.0, - "48": 510689792.0, - "49": 510689792.0, - "50": 510689792.0 + "1": 509641216.0, + "2": 509641216.0, + "3": 509641216.0, + "4": 509641216.0, + "5": 509641216.0, + "6": 509641216.0, + "7": 509641216.0, + "8": 509641216.0, + "9": 509641216.0, + "10": 509641216.0, + "11": 509641216.0, + "12": 509641216.0, + "13": 509641216.0, + "14": 509641216.0, + "15": 509641216.0, + "16": 509641216.0, + "17": 509641216.0, + "18": 509641216.0, + "19": 509641216.0, + "20": 509641216.0, + "21": 509641216.0, + "22": 509641216.0, + "23": 509641216.0, + "24": 509641216.0, + "25": 509641216.0, + "26": 509641216.0, + "27": 509641216.0, + "28": 509641216.0, + "29": 509641216.0, + "30": 509641216.0, + "31": 509641216.0, + "32": 509641216.0, + "33": 509641216.0, + "34": 509641216.0, + "35": 509641216.0, + "36": 509641216.0, + "37": 509641216.0, + "38": 509641216.0, + "39": 509641216.0, + "40": 509641216.0, + "41": 509641216.0, + "42": 509641216.0, + "43": 509641216.0, + "44": 509641216.0, + "45": 509641216.0, + "46": 509641216.0, + "47": 509641216.0, + "48": 509641216.0, + "49": 509641216.0, + "50": 509641216.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 757801472.0, - "2": 933156352.0, - "3": 933156352.0, - "4": 933156352.0, - "5": 933156352.0, - "6": 933156352.0, - "7": 933156352.0, - "8": 933156352.0, - "9": 933156352.0, - "10": 933156352.0, - "11": 933156352.0, - "12": 933156352.0, - "13": 933156352.0, - "14": 933156352.0, - "15": 933156352.0, - "16": 933156352.0, - "17": 933156352.0, - "18": 933156352.0, - "19": 933156352.0, - "20": 933156352.0, - "21": 933156352.0, - "22": 933156352.0, - "23": 933156352.0, - "24": 933156352.0, - "25": 933156352.0, - "26": 933156352.0, - "27": 933156352.0, - "28": 933156352.0, - "29": 933156352.0, - "30": 933156352.0, - "31": 933156352.0, - "32": 933156352.0, - "33": 933156352.0, - "34": 933156352.0, - "35": 933156352.0, - "36": 933156352.0, - "37": 933156352.0, - "38": 933156352.0, - "39": 933156352.0, - "40": 933156352.0, - "41": 933156352.0, - "42": 933156352.0, - "43": 933156352.0, - "44": 933156352.0, - "45": 933156352.0, - "46": 933156352.0, - "47": 933156352.0, - "48": 933156352.0, - "49": 933156352.0, - "50": 933156352.0 + "1": 756751872.0, + "2": 932632064.0, + "3": 932632064.0, + "4": 932632064.0, + "5": 932632064.0, + "6": 932632064.0, + "7": 932632064.0, + "8": 932632064.0, + "9": 932632064.0, + "10": 933679616.0, + "11": 933679616.0, + "12": 933679616.0, + "13": 933679616.0, + "14": 933679616.0, + "15": 933679616.0, + "16": 933679616.0, + "17": 933679616.0, + "18": 933679616.0, + "19": 933679616.0, + "20": 933679616.0, + "21": 933679616.0, + "22": 933679616.0, + "23": 933679616.0, + "24": 933679616.0, + "25": 933679616.0, + "26": 933679616.0, + "27": 933679616.0, + "28": 933679616.0, + "29": 933679616.0, + "30": 933679616.0, + "31": 933679616.0, + "32": 933679616.0, + "33": 933679616.0, + "34": 933679616.0, + "35": 933679616.0, + "36": 933679616.0, + "37": 933679616.0, + "38": 933679616.0, + "39": 933679616.0, + "40": 933679616.0, + "41": 933679616.0, + "42": 933679616.0, + "43": 933679616.0, + "44": 933679616.0, + "45": 933680640.0, + "46": 933680640.0, + "47": 933680640.0, + "48": 933680640.0, + "49": 933680640.0, + "50": 933680640.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 15.78036, - "2": 0.34723, - "3": 0.33492, - "4": 0.3292, - "5": 0.33036, - "6": 0.34971, - "7": 0.33848, - "8": 0.33262, - "9": 0.34028, - "10": 0.3518, - "11": 0.34239, - "12": 0.33211, - "13": 0.32961, - "14": 0.33263, - "15": 0.32808, - "16": 0.33152, - "17": 0.33313, - "18": 0.329, - "19": 0.3317, - "20": 0.33143, - "21": 0.34166, - "22": 0.33873, - "23": 0.34817, - "24": 0.3415, - "25": 0.34495, - "26": 0.32592, - "27": 0.32935, - "28": 0.33233, - "29": 0.328, - "30": 0.32746, - "31": 0.3275, - "32": 0.327, - "33": 0.32765, - "34": 0.32542, - "35": 0.32703, - "36": 0.33052, - "37": 0.33413, - "38": 0.32701, - "39": 0.32816, - "40": 0.32555, - "41": 0.33676, - "42": 0.33367, - "43": 0.33748, - "44": 0.33125, - "45": 0.32793, - "46": 0.33387, - "47": 0.32628, - "48": 0.32993, - "49": 0.32747, - "50": 0.327 + "1": 42.02117, + "2": 0.34315, + "3": 0.31657, + "4": 0.29715, + "5": 0.29109, + "6": 0.28638, + "7": 0.28745, + "8": 0.29318, + "9": 0.30075, + "10": 0.29578, + "11": 0.30101, + "12": 0.29769, + "13": 0.2954, + "14": 0.2989, + "15": 0.29627, + "16": 0.29342, + "17": 0.29396, + "18": 0.29431, + "19": 0.29408, + "20": 0.29286, + "21": 0.29361, + "22": 0.29448, + "23": 0.29521, + "24": 0.29494, + "25": 0.29812, + "26": 0.29413, + "27": 0.2949, + "28": 0.29469, + "29": 0.29393, + "30": 0.29682, + "31": 0.2951, + "32": 0.29532, + "33": 0.29449, + "34": 0.29334, + "35": 0.29679, + "36": 0.29557, + "37": 0.29495, + "38": 0.29826, + "39": 0.29574, + "40": 0.2972, + "41": 0.29568, + "42": 0.29643, + "43": 0.29627, + "44": 0.29491, + "45": 0.29476, + "46": 0.29707, + "47": 0.35995, + "48": 0.28743, + "49": 0.28604, + "50": 0.28593 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json index f2adbef4530..873d08f92a3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss/golden_values_dev_dgx_h100.json @@ -4,55 +4,55 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.86535, - "2": 10.85873, - "3": 10.86284, - "4": 10.84009, + "1": 10.86539, + "2": 10.85871, + "3": 10.86282, + "4": 10.84007, "5": 10.87856, - "6": 10.88856, - "7": 10.86532, - "8": 10.86017, - "9": 10.8599, - "10": 10.82981, - "11": 10.8895, - "12": 10.8751, - "13": 10.87423, + "6": 10.88852, + "7": 10.86536, + "8": 10.86015, + "9": 10.85991, + "10": 10.82982, + "11": 10.88947, + "12": 10.87511, + "13": 10.87422, "14": 10.89675, - "15": 10.82054, - "16": 10.82504, + "15": 10.82056, + "16": 10.82497, "17": 10.78983, "18": 10.81029, - "19": 10.80535, - "20": 10.70398, - "21": 10.66993, - "22": 10.50643, - "23": 10.69004, - "24": 10.56314, - "25": 10.4942, - "26": 10.56628, - "27": 10.58025, + "19": 10.80528, + "20": 10.70396, + "21": 10.6699, + "22": 10.50641, + "23": 10.69006, + "24": 10.56312, + "25": 10.49418, + "26": 10.56627, + "27": 10.58023, "28": 10.51571, - "29": 10.55299, - "30": 10.30549, - "31": 10.02245, - "32": 10.40614, + "29": 10.55296, + "30": 10.30551, + "31": 10.02244, + "32": 10.40618, "33": 10.39874, - "34": 10.13771, + "34": 10.1377, "35": 10.20184, - "36": 10.16052, - "37": 10.28973, - "38": 10.11474, + "36": 10.1605, + "37": 10.28975, + "38": 10.11483, "39": 10.361, - "40": 10.01903, + "40": 10.01905, "41": 10.07292, - "42": 10.14698, - "43": 9.74687, - "44": 9.87766, - "45": 9.74966, - "46": 9.73383, - "47": 10.07535, - "48": 9.78068, - "49": 9.44784, + "42": 10.14697, + "43": 9.74684, + "44": 9.87763, + "45": 9.74962, + "46": 9.73382, + "47": 10.07536, + "48": 9.78071, + "49": 9.44783, "50": 9.8399 } }, @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 653.0, - "2": 642.0, - "3": 630.0, - "4": 585.0, - "5": 635.0, - "6": 687.0, - "7": 615.0, - "8": 601.0, - "9": 607.0, - "10": 522.0, - "11": 637.0, - "12": 675.0, - "13": 649.0, - "14": 648.0, - "15": 640.0, - "16": 602.0, - "17": 668.0, - "18": 634.0, - "19": 593.0, - "20": 579.0, - "21": 633.0, - "22": 597.0, - "23": 756.0, - "24": 612.0, - "25": 591.0, - "26": 620.0, - "27": 700.0, - "28": 705.0, - "29": 795.0, - "30": 752.0, - "31": 628.0, - "32": 712.0, - "33": 752.0, - "34": 737.0, - "35": 741.0, - "36": 770.0, - "37": 861.0, - "38": 823.0, - "39": 812.0, - "40": 814.0, - "41": 826.0, - "42": 801.0, - "43": 769.0, - "44": 822.0, - "45": 777.0, - "46": 828.0, - "47": 878.0, - "48": 915.0, - "49": 908.0, - "50": 848.0 + "1": 572.0, + "2": 656.0, + "3": 649.0, + "4": 631.0, + "5": 658.0, + "6": 636.0, + "7": 636.0, + "8": 542.0, + "9": 653.0, + "10": 551.0, + "11": 681.0, + "12": 642.0, + "13": 624.0, + "14": 658.0, + "15": 682.0, + "16": 659.0, + "17": 620.0, + "18": 603.0, + "19": 634.0, + "20": 639.0, + "21": 634.0, + "22": 602.0, + "23": 731.0, + "24": 620.0, + "25": 611.0, + "26": 626.0, + "27": 683.0, + "28": 668.0, + "29": 713.0, + "30": 712.0, + "31": 616.0, + "32": 786.0, + "33": 800.0, + "34": 702.0, + "35": 684.0, + "36": 664.0, + "37": 831.0, + "38": 802.0, + "39": 919.0, + "40": 802.0, + "41": 791.0, + "42": 840.0, + "43": 718.0, + "44": 756.0, + "45": 765.0, + "46": 809.0, + "47": 839.0, + "48": 827.0, + "49": 935.0, + "50": 839.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 510689792.0, - "2": 510689792.0, - "3": 510689792.0, - "4": 510689792.0, - "5": 510689792.0, - "6": 510689792.0, - "7": 510689792.0, - "8": 510689792.0, - "9": 510689792.0, - "10": 510689792.0, - "11": 510689792.0, - "12": 510689792.0, - "13": 510689792.0, - "14": 510689792.0, - "15": 510689792.0, - "16": 510689792.0, - "17": 510689792.0, - "18": 510689792.0, - "19": 510689792.0, - "20": 510689792.0, - "21": 510689792.0, - "22": 510689792.0, - "23": 510689792.0, - "24": 510689792.0, - "25": 510689792.0, - "26": 510689792.0, - "27": 510689792.0, - "28": 510689792.0, - "29": 510689792.0, - "30": 510689792.0, - "31": 510689792.0, - "32": 510689792.0, - "33": 510689792.0, - "34": 510689792.0, - "35": 510689792.0, - "36": 510689792.0, - "37": 510689792.0, - "38": 510689792.0, - "39": 510689792.0, - "40": 510689792.0, - "41": 510689792.0, - "42": 510689792.0, - "43": 510689792.0, - "44": 510689792.0, - "45": 510689792.0, - "46": 510689792.0, - "47": 510689792.0, - "48": 510689792.0, - "49": 510689792.0, - "50": 510689792.0 + "1": 511214080.0, + "2": 511214080.0, + "3": 511214080.0, + "4": 511214080.0, + "5": 511214080.0, + "6": 511214080.0, + "7": 511214080.0, + "8": 511214080.0, + "9": 511214080.0, + "10": 511214080.0, + "11": 511214080.0, + "12": 511214080.0, + "13": 511214080.0, + "14": 511214080.0, + "15": 511214080.0, + "16": 511214080.0, + "17": 511214080.0, + "18": 511214080.0, + "19": 511214080.0, + "20": 511214080.0, + "21": 511214080.0, + "22": 511214080.0, + "23": 511214080.0, + "24": 511214080.0, + "25": 511214080.0, + "26": 511214080.0, + "27": 511214080.0, + "28": 511214080.0, + "29": 511214080.0, + "30": 511214080.0, + "31": 511214080.0, + "32": 511214080.0, + "33": 511214080.0, + "34": 511214080.0, + "35": 511214080.0, + "36": 511214080.0, + "37": 511214080.0, + "38": 511214080.0, + "39": 511214080.0, + "40": 511214080.0, + "41": 511214080.0, + "42": 511214080.0, + "43": 511214080.0, + "44": 511214080.0, + "45": 511214080.0, + "46": 511214080.0, + "47": 511214080.0, + "48": 511214080.0, + "49": 511214080.0, + "50": 511214080.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 759898624.0, - "2": 933156352.0, - "3": 933156352.0, - "4": 933156352.0, - "5": 933156352.0, - "6": 933156352.0, - "7": 933156352.0, - "8": 933156352.0, - "9": 933156352.0, - "10": 933156352.0, - "11": 933156352.0, - "12": 933156352.0, - "13": 933156352.0, - "14": 933156352.0, - "15": 933156352.0, - "16": 933156352.0, - "17": 933156352.0, - "18": 933156352.0, - "19": 933156352.0, - "20": 933156352.0, - "21": 933156352.0, - "22": 933156352.0, - "23": 933156352.0, - "24": 933156352.0, - "25": 933156352.0, - "26": 933156352.0, - "27": 933156352.0, - "28": 933156352.0, - "29": 933156352.0, - "30": 933156352.0, - "31": 933156352.0, - "32": 933156352.0, - "33": 933156352.0, - "34": 933156352.0, - "35": 933156352.0, - "36": 933156352.0, - "37": 933156352.0, - "38": 933156352.0, - "39": 933156352.0, - "40": 933156352.0, - "41": 933156352.0, - "42": 933156352.0, - "43": 933156352.0, - "44": 933156352.0, - "45": 933156352.0, - "46": 933156352.0, - "47": 933156352.0, - "48": 933156352.0, - "49": 933156352.0, - "50": 933156352.0 + "1": 756753920.0, + "2": 935776768.0, + "3": 935777792.0, + "4": 935777792.0, + "5": 935777792.0, + "6": 935777792.0, + "7": 935777792.0, + "8": 935777792.0, + "9": 935777792.0, + "10": 935777792.0, + "11": 935777792.0, + "12": 935777792.0, + "13": 935777792.0, + "14": 935777792.0, + "15": 935777792.0, + "16": 935777792.0, + "17": 935777792.0, + "18": 935777792.0, + "19": 935777792.0, + "20": 935777792.0, + "21": 935777792.0, + "22": 935777792.0, + "23": 935777792.0, + "24": 935777792.0, + "25": 935777792.0, + "26": 935777792.0, + "27": 935777792.0, + "28": 935777792.0, + "29": 935777792.0, + "30": 935777792.0, + "31": 935777792.0, + "32": 935777792.0, + "33": 935777792.0, + "34": 935777792.0, + "35": 935777792.0, + "36": 935777792.0, + "37": 935777792.0, + "38": 935777792.0, + "39": 935777792.0, + "40": 935777792.0, + "41": 935777792.0, + "42": 935777792.0, + "43": 935777792.0, + "44": 935777792.0, + "45": 935777792.0, + "46": 935777792.0, + "47": 935777792.0, + "48": 935777792.0, + "49": 935777792.0, + "50": 935777792.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 16.72434, - "2": 0.40342, - "3": 0.32477, - "4": 0.32459, - "5": 0.32511, - "6": 0.32478, - "7": 0.32469, - "8": 0.32479, - "9": 0.32229, - "10": 0.32534, - "11": 0.32568, - "12": 0.32325, - "13": 0.3234, - "14": 0.32735, - "15": 0.32264, - "16": 0.32664, - "17": 0.32289, - "18": 0.32328, - "19": 0.32997, - "20": 0.32955, - "21": 0.32699, - "22": 0.3292, - "23": 0.32982, - "24": 0.32452, - "25": 0.32644, - "26": 0.32596, - "27": 0.32426, - "28": 0.32527, - "29": 0.32409, - "30": 0.32549, - "31": 0.32259, - "32": 0.32488, - "33": 0.32331, - "34": 0.3242, - "35": 0.3261, - "36": 0.32048, - "37": 0.32127, - "38": 0.32479, - "39": 0.32338, - "40": 0.32137, - "41": 0.32292, - "42": 0.32202, - "43": 0.32321, - "44": 0.32105, - "45": 0.32265, - "46": 0.32148, - "47": 0.32443, - "48": 0.32158, - "49": 0.32089, - "50": 0.32389 + "1": 44.927, + "2": 0.34811, + "3": 0.31209, + "4": 0.29049, + "5": 0.28904, + "6": 0.28728, + "7": 0.28884, + "8": 0.29393, + "9": 0.28153, + "10": 0.28717, + "11": 0.28861, + "12": 0.29265, + "13": 0.29015, + "14": 0.29189, + "15": 0.29081, + "16": 0.29742, + "17": 0.29933, + "18": 0.29528, + "19": 0.29058, + "20": 0.29304, + "21": 0.29307, + "22": 0.29297, + "23": 0.2889, + "24": 0.29028, + "25": 0.29626, + "26": 0.29321, + "27": 0.29347, + "28": 0.29303, + "29": 0.2812, + "30": 0.28971, + "31": 0.28878, + "32": 0.28499, + "33": 0.28119, + "34": 0.27908, + "35": 0.28101, + "36": 0.2794, + "37": 0.2798, + "38": 0.27799, + "39": 0.28519, + "40": 0.28246, + "41": 0.28126, + "42": 0.28572, + "43": 0.28647, + "44": 0.28772, + "45": 0.28736, + "46": 0.29677, + "47": 0.29247, + "48": 0.29174, + "49": 0.29182, + "50": 0.29085 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json index f64661824cb..84e2331d673 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last/golden_values_dev_dgx_h100.json @@ -4,55 +4,55 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.86535, - "2": 10.85873, - "3": 10.86284, - "4": 10.84009, + "1": 10.86539, + "2": 10.85871, + "3": 10.86282, + "4": 10.84007, "5": 10.87856, - "6": 10.88856, - "7": 10.86532, - "8": 10.86017, - "9": 10.8599, - "10": 10.82981, - "11": 10.8895, - "12": 10.8751, - "13": 10.87423, + "6": 10.88852, + "7": 10.86536, + "8": 10.86015, + "9": 10.85991, + "10": 10.82982, + "11": 10.88947, + "12": 10.87511, + "13": 10.87422, "14": 10.89675, - "15": 10.82054, - "16": 10.82504, + "15": 10.82056, + "16": 10.82497, "17": 10.78983, "18": 10.81029, - "19": 10.80535, - "20": 10.70398, - "21": 10.66993, - "22": 10.50643, - "23": 10.69004, - "24": 10.56314, - "25": 10.4942, - "26": 10.56628, - "27": 10.58025, + "19": 10.80528, + "20": 10.70396, + "21": 10.6699, + "22": 10.50641, + "23": 10.69006, + "24": 10.56312, + "25": 10.49418, + "26": 10.56627, + "27": 10.58023, "28": 10.51571, - "29": 10.55299, - "30": 10.30549, - "31": 10.02245, - "32": 10.40614, + "29": 10.55296, + "30": 10.30551, + "31": 10.02244, + "32": 10.40618, "33": 10.39874, - "34": 10.13771, + "34": 10.1377, "35": 10.20184, - "36": 10.16052, - "37": 10.28973, - "38": 10.11474, + "36": 10.1605, + "37": 10.28975, + "38": 10.11483, "39": 10.361, - "40": 10.01903, + "40": 10.01905, "41": 10.07292, - "42": 10.14698, - "43": 9.74687, - "44": 9.87766, - "45": 9.74966, - "46": 9.73383, - "47": 10.07535, - "48": 9.78068, - "49": 9.44784, + "42": 10.14697, + "43": 9.74684, + "44": 9.87763, + "45": 9.74962, + "46": 9.73382, + "47": 10.07536, + "48": 9.78071, + "49": 9.44783, "50": 9.8399 } }, @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 653.0, - "2": 642.0, - "3": 630.0, - "4": 585.0, - "5": 635.0, - "6": 687.0, - "7": 615.0, - "8": 601.0, - "9": 607.0, - "10": 522.0, - "11": 637.0, - "12": 675.0, - "13": 649.0, - "14": 648.0, - "15": 640.0, - "16": 602.0, - "17": 668.0, - "18": 634.0, - "19": 593.0, - "20": 579.0, - "21": 633.0, - "22": 597.0, - "23": 756.0, - "24": 612.0, - "25": 591.0, - "26": 620.0, - "27": 700.0, - "28": 705.0, - "29": 795.0, - "30": 752.0, - "31": 628.0, - "32": 712.0, - "33": 752.0, - "34": 737.0, - "35": 741.0, - "36": 770.0, - "37": 861.0, - "38": 823.0, - "39": 812.0, - "40": 814.0, - "41": 826.0, - "42": 801.0, - "43": 769.0, - "44": 822.0, - "45": 777.0, - "46": 828.0, - "47": 878.0, - "48": 915.0, - "49": 908.0, - "50": 848.0 + "1": 572.0, + "2": 656.0, + "3": 649.0, + "4": 631.0, + "5": 658.0, + "6": 636.0, + "7": 636.0, + "8": 542.0, + "9": 653.0, + "10": 551.0, + "11": 681.0, + "12": 642.0, + "13": 624.0, + "14": 658.0, + "15": 682.0, + "16": 659.0, + "17": 620.0, + "18": 603.0, + "19": 634.0, + "20": 639.0, + "21": 634.0, + "22": 602.0, + "23": 731.0, + "24": 620.0, + "25": 611.0, + "26": 626.0, + "27": 683.0, + "28": 668.0, + "29": 713.0, + "30": 712.0, + "31": 616.0, + "32": 786.0, + "33": 800.0, + "34": 702.0, + "35": 684.0, + "36": 664.0, + "37": 831.0, + "38": 802.0, + "39": 919.0, + "40": 802.0, + "41": 791.0, + "42": 840.0, + "43": 718.0, + "44": 756.0, + "45": 765.0, + "46": 809.0, + "47": 839.0, + "48": 827.0, + "49": 935.0, + "50": 839.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 510689792.0, - "2": 510689792.0, - "3": 510689792.0, - "4": 510689792.0, - "5": 510689792.0, - "6": 510689792.0, - "7": 510689792.0, - "8": 510689792.0, - "9": 510689792.0, - "10": 510689792.0, - "11": 510689792.0, - "12": 510689792.0, - "13": 510689792.0, - "14": 510689792.0, - "15": 510689792.0, - "16": 510689792.0, - "17": 510689792.0, - "18": 510689792.0, - "19": 510689792.0, - "20": 510689792.0, - "21": 510689792.0, - "22": 510689792.0, - "23": 510689792.0, - "24": 510689792.0, - "25": 510689792.0, - "26": 510689792.0, - "27": 510689792.0, - "28": 510689792.0, - "29": 510689792.0, - "30": 510689792.0, - "31": 510689792.0, - "32": 510689792.0, - "33": 510689792.0, - "34": 510689792.0, - "35": 510689792.0, - "36": 510689792.0, - "37": 510689792.0, - "38": 510689792.0, - "39": 510689792.0, - "40": 510689792.0, - "41": 510689792.0, - "42": 510689792.0, - "43": 510689792.0, - "44": 510689792.0, - "45": 510689792.0, - "46": 510689792.0, - "47": 510689792.0, - "48": 510689792.0, - "49": 510689792.0, - "50": 510689792.0 + "1": 511214080.0, + "2": 511214080.0, + "3": 511214080.0, + "4": 511214080.0, + "5": 511214080.0, + "6": 511214080.0, + "7": 511214080.0, + "8": 511214080.0, + "9": 511214080.0, + "10": 511214080.0, + "11": 511214080.0, + "12": 511214080.0, + "13": 511214080.0, + "14": 511214080.0, + "15": 511214080.0, + "16": 511214080.0, + "17": 511214080.0, + "18": 511214080.0, + "19": 511214080.0, + "20": 511214080.0, + "21": 511214080.0, + "22": 511214080.0, + "23": 511214080.0, + "24": 511214080.0, + "25": 511214080.0, + "26": 511214080.0, + "27": 511214080.0, + "28": 511214080.0, + "29": 511214080.0, + "30": 511214080.0, + "31": 511214080.0, + "32": 511214080.0, + "33": 511214080.0, + "34": 511214080.0, + "35": 511214080.0, + "36": 511214080.0, + "37": 511214080.0, + "38": 511214080.0, + "39": 511214080.0, + "40": 511214080.0, + "41": 511214080.0, + "42": 511214080.0, + "43": 511214080.0, + "44": 511214080.0, + "45": 511214080.0, + "46": 511214080.0, + "47": 511214080.0, + "48": 511214080.0, + "49": 511214080.0, + "50": 511214080.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 759898624.0, - "2": 933156352.0, - "3": 933156352.0, - "4": 933156352.0, - "5": 933156352.0, - "6": 933156352.0, - "7": 933156352.0, - "8": 933156352.0, - "9": 933156352.0, - "10": 933156352.0, - "11": 933156352.0, - "12": 933156352.0, - "13": 933156352.0, - "14": 933156352.0, - "15": 933156352.0, - "16": 933156352.0, - "17": 933156352.0, - "18": 933156352.0, - "19": 933156352.0, - "20": 933156352.0, - "21": 933156352.0, - "22": 933156352.0, - "23": 933156352.0, - "24": 933156352.0, - "25": 933156352.0, - "26": 933156352.0, - "27": 933156352.0, - "28": 933156352.0, - "29": 933156352.0, - "30": 933156352.0, - "31": 933156352.0, - "32": 933156352.0, - "33": 933156352.0, - "34": 933156352.0, - "35": 933156352.0, - "36": 933156352.0, - "37": 933156352.0, - "38": 933156352.0, - "39": 933156352.0, - "40": 933156352.0, - "41": 933156352.0, - "42": 933156352.0, - "43": 933156352.0, - "44": 933156352.0, - "45": 933156352.0, - "46": 933156352.0, - "47": 933156352.0, - "48": 933156352.0, - "49": 933156352.0, - "50": 933156352.0 + "1": 759899136.0, + "2": 936824320.0, + "3": 936824832.0, + "4": 936824832.0, + "5": 936824832.0, + "6": 936824832.0, + "7": 936824832.0, + "8": 936824832.0, + "9": 936824832.0, + "10": 936824832.0, + "11": 936824832.0, + "12": 936824832.0, + "13": 936824832.0, + "14": 936824832.0, + "15": 936824832.0, + "16": 936824832.0, + "17": 936824832.0, + "18": 936824832.0, + "19": 936824832.0, + "20": 936824832.0, + "21": 936824832.0, + "22": 936824832.0, + "23": 936824832.0, + "24": 936824832.0, + "25": 936824832.0, + "26": 936824832.0, + "27": 936824832.0, + "28": 936824832.0, + "29": 936824832.0, + "30": 936824832.0, + "31": 936824832.0, + "32": 936824832.0, + "33": 936824832.0, + "34": 936824832.0, + "35": 936824832.0, + "36": 936824832.0, + "37": 936824832.0, + "38": 936824832.0, + "39": 936824832.0, + "40": 936824832.0, + "41": 936824832.0, + "42": 936824832.0, + "43": 936824832.0, + "44": 936824832.0, + "45": 936824832.0, + "46": 936824832.0, + "47": 936824832.0, + "48": 936824832.0, + "49": 936824832.0, + "50": 936824832.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 18.71096, - "2": 0.39649, - "3": 0.33228, - "4": 0.33042, - "5": 0.33036, - "6": 0.3326, - "7": 0.33962, - "8": 0.37041, - "9": 0.33077, - "10": 0.33179, - "11": 0.33053, - "12": 0.33332, - "13": 0.33149, - "14": 0.32928, - "15": 0.33252, - "16": 0.3321, - "17": 0.32661, - "18": 0.32933, - "19": 0.32718, - "20": 0.32982, - "21": 0.32827, - "22": 0.3313, - "23": 0.32836, - "24": 0.3287, - "25": 0.33025, - "26": 0.32605, - "27": 0.33501, - "28": 0.32889, - "29": 0.32971, - "30": 0.3318, - "31": 0.33458, - "32": 0.33222, - "33": 0.33434, - "34": 0.3337, - "35": 0.33221, - "36": 0.32984, - "37": 0.32779, - "38": 0.33131, - "39": 0.33056, - "40": 0.32941, - "41": 0.32351, - "42": 0.32946, - "43": 0.32913, - "44": 0.3283, - "45": 0.32845, - "46": 0.32474, - "47": 0.33097, - "48": 0.32791, - "49": 0.33143, - "50": 0.33005 + "1": 45.68343, + "2": 0.392, + "3": 0.35818, + "4": 0.28793, + "5": 0.28609, + "6": 0.28869, + "7": 0.28726, + "8": 0.28725, + "9": 0.28787, + "10": 0.2834, + "11": 0.28813, + "12": 0.28685, + "13": 0.28453, + "14": 0.28421, + "15": 0.28504, + "16": 0.28118, + "17": 0.28123, + "18": 0.28302, + "19": 0.28937, + "20": 0.28486, + "21": 0.28762, + "22": 0.28121, + "23": 0.28289, + "24": 0.28379, + "25": 0.28305, + "26": 0.28337, + "27": 0.28236, + "28": 0.28063, + "29": 0.27814, + "30": 0.2808, + "31": 0.27908, + "32": 0.28085, + "33": 0.28065, + "34": 0.28226, + "35": 0.28009, + "36": 0.2802, + "37": 0.28283, + "38": 0.27963, + "39": 0.28465, + "40": 0.28297, + "41": 0.28176, + "42": 0.28166, + "43": 0.2805, + "44": 0.28385, + "45": 0.28053, + "46": 0.27883, + "47": 0.28037, + "48": 0.28067, + "49": 0.27929, + "50": 0.27864 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json index cd45ff021d9..e8b9cea88e0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last/golden_values_dev_dgx_h100.json @@ -4,55 +4,55 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 10.86535, - "2": 10.85873, - "3": 10.86284, - "4": 10.84009, + "1": 10.86539, + "2": 10.85871, + "3": 10.86282, + "4": 10.84007, "5": 10.87856, - "6": 10.88856, - "7": 10.86532, - "8": 10.86017, - "9": 10.8599, - "10": 10.82981, - "11": 10.8895, - "12": 10.8751, - "13": 10.87423, + "6": 10.88852, + "7": 10.86536, + "8": 10.86015, + "9": 10.85991, + "10": 10.82982, + "11": 10.88947, + "12": 10.87511, + "13": 10.87422, "14": 10.89675, - "15": 10.82054, - "16": 10.82504, + "15": 10.82056, + "16": 10.82497, "17": 10.78983, "18": 10.81029, - "19": 10.80535, - "20": 10.70398, - "21": 10.66993, - "22": 10.50643, - "23": 10.69004, - "24": 10.56314, - "25": 10.4942, - "26": 10.56628, - "27": 10.58025, + "19": 10.80528, + "20": 10.70396, + "21": 10.6699, + "22": 10.50641, + "23": 10.69006, + "24": 10.56312, + "25": 10.49418, + "26": 10.56627, + "27": 10.58023, "28": 10.51571, - "29": 10.55299, - "30": 10.30549, - "31": 10.02245, - "32": 10.40614, + "29": 10.55296, + "30": 10.30551, + "31": 10.02244, + "32": 10.40618, "33": 10.39874, - "34": 10.13771, + "34": 10.1377, "35": 10.20184, - "36": 10.16052, - "37": 10.28973, - "38": 10.11474, + "36": 10.1605, + "37": 10.28975, + "38": 10.11483, "39": 10.361, - "40": 10.01903, + "40": 10.01905, "41": 10.07292, - "42": 10.14698, - "43": 9.74687, - "44": 9.87766, - "45": 9.74966, - "46": 9.73383, - "47": 10.07535, - "48": 9.78068, - "49": 9.44784, + "42": 10.14697, + "43": 9.74684, + "44": 9.87763, + "45": 9.74962, + "46": 9.73382, + "47": 10.07536, + "48": 9.78071, + "49": 9.44783, "50": 9.8399 } }, @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 653.0, - "2": 642.0, - "3": 630.0, - "4": 585.0, - "5": 635.0, - "6": 687.0, - "7": 615.0, - "8": 601.0, - "9": 607.0, - "10": 522.0, - "11": 637.0, - "12": 675.0, - "13": 649.0, - "14": 648.0, - "15": 640.0, - "16": 602.0, - "17": 668.0, - "18": 634.0, - "19": 593.0, - "20": 579.0, - "21": 633.0, - "22": 597.0, - "23": 756.0, - "24": 612.0, - "25": 591.0, - "26": 620.0, - "27": 700.0, - "28": 705.0, - "29": 795.0, - "30": 752.0, - "31": 628.0, - "32": 712.0, - "33": 752.0, - "34": 737.0, - "35": 741.0, - "36": 770.0, - "37": 861.0, - "38": 823.0, - "39": 812.0, - "40": 814.0, - "41": 826.0, - "42": 801.0, - "43": 769.0, - "44": 822.0, - "45": 777.0, - "46": 828.0, - "47": 878.0, - "48": 915.0, - "49": 908.0, - "50": 848.0 + "1": 572.0, + "2": 656.0, + "3": 649.0, + "4": 631.0, + "5": 658.0, + "6": 636.0, + "7": 636.0, + "8": 542.0, + "9": 653.0, + "10": 551.0, + "11": 681.0, + "12": 642.0, + "13": 624.0, + "14": 658.0, + "15": 682.0, + "16": 659.0, + "17": 620.0, + "18": 603.0, + "19": 634.0, + "20": 639.0, + "21": 634.0, + "22": 602.0, + "23": 731.0, + "24": 620.0, + "25": 611.0, + "26": 626.0, + "27": 683.0, + "28": 668.0, + "29": 713.0, + "30": 712.0, + "31": 616.0, + "32": 786.0, + "33": 800.0, + "34": 702.0, + "35": 684.0, + "36": 664.0, + "37": 831.0, + "38": 802.0, + "39": 919.0, + "40": 802.0, + "41": 791.0, + "42": 840.0, + "43": 718.0, + "44": 756.0, + "45": 765.0, + "46": 809.0, + "47": 839.0, + "48": 827.0, + "49": 935.0, + "50": 839.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 510689792.0, - "2": 510689792.0, - "3": 510689792.0, - "4": 510689792.0, - "5": 510689792.0, - "6": 510689792.0, - "7": 510689792.0, - "8": 510689792.0, - "9": 510689792.0, - "10": 510689792.0, - "11": 510689792.0, - "12": 510689792.0, - "13": 510689792.0, - "14": 510689792.0, - "15": 510689792.0, - "16": 510689792.0, - "17": 510689792.0, - "18": 510689792.0, - "19": 510689792.0, - "20": 510689792.0, - "21": 510689792.0, - "22": 510689792.0, - "23": 510689792.0, - "24": 510689792.0, - "25": 510689792.0, - "26": 510689792.0, - "27": 510689792.0, - "28": 510689792.0, - "29": 510689792.0, - "30": 510689792.0, - "31": 510689792.0, - "32": 510689792.0, - "33": 510689792.0, - "34": 510689792.0, - "35": 510689792.0, - "36": 510689792.0, - "37": 510689792.0, - "38": 510689792.0, - "39": 510689792.0, - "40": 510689792.0, - "41": 510689792.0, - "42": 510689792.0, - "43": 510689792.0, - "44": 510689792.0, - "45": 510689792.0, - "46": 510689792.0, - "47": 510689792.0, - "48": 510689792.0, - "49": 510689792.0, - "50": 510689792.0 + "1": 511214080.0, + "2": 511214080.0, + "3": 511214080.0, + "4": 511214080.0, + "5": 511214080.0, + "6": 511214080.0, + "7": 511214080.0, + "8": 511214080.0, + "9": 511214080.0, + "10": 511214080.0, + "11": 511214080.0, + "12": 511214080.0, + "13": 511214080.0, + "14": 511214080.0, + "15": 511214080.0, + "16": 511214080.0, + "17": 511214080.0, + "18": 511214080.0, + "19": 511214080.0, + "20": 511214080.0, + "21": 511214080.0, + "22": 511214080.0, + "23": 511214080.0, + "24": 511214080.0, + "25": 511214080.0, + "26": 511214080.0, + "27": 511214080.0, + "28": 511214080.0, + "29": 511214080.0, + "30": 511214080.0, + "31": 511214080.0, + "32": 511214080.0, + "33": 511214080.0, + "34": 511214080.0, + "35": 511214080.0, + "36": 511214080.0, + "37": 511214080.0, + "38": 511214080.0, + "39": 511214080.0, + "40": 511214080.0, + "41": 511214080.0, + "42": 511214080.0, + "43": 511214080.0, + "44": 511214080.0, + "45": 511214080.0, + "46": 511214080.0, + "47": 511214080.0, + "48": 511214080.0, + "49": 511214080.0, + "50": 511214080.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 759895552.0, - "2": 933156352.0, - "3": 933156352.0, - "4": 933156352.0, - "5": 933156352.0, - "6": 933156352.0, - "7": 933156352.0, - "8": 933156352.0, - "9": 933156352.0, - "10": 933156352.0, - "11": 933156352.0, - "12": 933156352.0, - "13": 933156352.0, - "14": 933156352.0, - "15": 933156352.0, - "16": 933156352.0, - "17": 933156352.0, - "18": 933156352.0, - "19": 933156352.0, - "20": 933156352.0, - "21": 933156352.0, - "22": 933156352.0, - "23": 933156352.0, - "24": 933156352.0, - "25": 933156352.0, - "26": 933156352.0, - "27": 933156352.0, - "28": 933156352.0, - "29": 933156352.0, - "30": 933156352.0, - "31": 933156352.0, - "32": 933156352.0, - "33": 934201856.0, - "34": 934201856.0, - "35": 934201856.0, - "36": 934201856.0, - "37": 934201856.0, - "38": 934201856.0, - "39": 934201856.0, - "40": 934201856.0, - "41": 934201856.0, - "42": 934201856.0, - "43": 934201856.0, - "44": 934201856.0, - "45": 934201856.0, - "46": 934201856.0, - "47": 934201856.0, - "48": 934201856.0, - "49": 934201856.0, - "50": 934201856.0 + "1": 757801984.0, + "2": 935777792.0, + "3": 935777792.0, + "4": 935777792.0, + "5": 935777792.0, + "6": 935777792.0, + "7": 935777792.0, + "8": 935777792.0, + "9": 935777792.0, + "10": 935777792.0, + "11": 935777792.0, + "12": 935777792.0, + "13": 935777792.0, + "14": 935777792.0, + "15": 935777792.0, + "16": 935777792.0, + "17": 935777792.0, + "18": 935777792.0, + "19": 935777792.0, + "20": 935777792.0, + "21": 935777792.0, + "22": 935777792.0, + "23": 935777792.0, + "24": 935777792.0, + "25": 935777792.0, + "26": 935777792.0, + "27": 935777792.0, + "28": 935777792.0, + "29": 935777792.0, + "30": 935777792.0, + "31": 935777792.0, + "32": 935777792.0, + "33": 935777792.0, + "34": 935777792.0, + "35": 935777792.0, + "36": 935777792.0, + "37": 935777792.0, + "38": 935777792.0, + "39": 935777792.0, + "40": 935777792.0, + "41": 935777792.0, + "42": 935777792.0, + "43": 935777792.0, + "44": 935777792.0, + "45": 935777792.0, + "46": 935777792.0, + "47": 935777792.0, + "48": 935777792.0, + "49": 935777792.0, + "50": 935777792.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 17.72917, - "2": 0.36269, - "3": 0.33585, - "4": 0.33878, - "5": 0.33758, - "6": 0.33453, - "7": 0.33628, - "8": 0.33416, - "9": 0.33309, - "10": 0.33521, - "11": 0.33536, - "12": 0.33148, - "13": 0.33565, - "14": 0.33401, - "15": 0.33029, - "16": 0.33788, - "17": 0.33302, - "18": 0.33337, - "19": 0.33761, - "20": 0.33672, - "21": 0.33256, - "22": 0.3374, - "23": 0.33652, - "24": 0.33672, - "25": 0.33982, - "26": 0.3335, - "27": 0.3328, - "28": 0.33835, - "29": 0.33338, - "30": 0.33371, - "31": 0.33991, - "32": 0.33259, - "33": 0.33537, - "34": 0.33777, - "35": 0.33494, - "36": 0.33504, - "37": 0.33915, - "38": 0.33462, - "39": 0.33387, - "40": 0.33791, - "41": 0.33426, - "42": 0.33834, - "43": 0.33785, - "44": 0.32761, - "45": 0.32857, - "46": 0.33205, - "47": 0.3355, - "48": 0.33535, - "49": 0.33792, - "50": 0.33613 + "1": 44.86787, + "2": 0.36349, + "3": 0.3142, + "4": 0.29456, + "5": 0.29609, + "6": 0.29566, + "7": 0.29467, + "8": 0.2899, + "9": 0.28864, + "10": 0.28994, + "11": 0.28355, + "12": 0.28608, + "13": 0.28278, + "14": 0.2823, + "15": 0.28087, + "16": 0.28237, + "17": 0.28556, + "18": 0.28363, + "19": 0.28381, + "20": 0.28356, + "21": 0.28235, + "22": 0.29036, + "23": 0.28491, + "24": 0.28322, + "25": 0.28412, + "26": 0.28352, + "27": 0.28643, + "28": 0.2853, + "29": 0.28809, + "30": 0.28258, + "31": 0.28114, + "32": 0.281, + "33": 0.28135, + "34": 0.27914, + "35": 0.28099, + "36": 0.28267, + "37": 0.28236, + "38": 0.28102, + "39": 0.31493, + "40": 0.28173, + "41": 0.28058, + "42": 0.28033, + "43": 0.28335, + "44": 0.28253, + "45": 0.28169, + "46": 0.28078, + "47": 0.28082, + "48": 0.2819, + "49": 0.28087, + "50": 0.28 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json index cac9c570ec1..9e26dfeeb6e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json @@ -1,537 +1 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 10.8583, - "2": 10.85411, - "3": 10.8543, - "4": 10.84407, - "5": 10.87282, - "6": 10.8793, - "7": 10.84658, - "8": 10.86139, - "9": 10.87078, - "10": 10.83266, - "11": 10.86332, - "12": 10.87295, - "13": 10.87798, - "14": 10.88588, - "15": 10.82104, - "16": 10.82759, - "17": 10.80303, - "18": 10.82092, - "19": 10.80032, - "20": 10.71379, - "21": 10.69818, - "22": 10.57542, - "23": 10.72119, - "24": 10.60091, - "25": 10.5476, - "26": 10.61127, - "27": 10.61393, - "28": 10.57777, - "29": 10.57888, - "30": 10.36791, - "31": 10.13451, - "32": 10.47063, - "33": 10.47371, - "34": 10.23442, - "35": 10.28457, - "36": 10.23595, - "37": 10.35351, - "38": 10.20695, - "39": 10.40581, - "40": 10.08924, - "41": 10.16388, - "42": 10.22671, - "43": 9.86336, - "44": 9.98189, - "45": 9.84555, - "46": 9.85753, - "47": 10.16884, - "48": 9.86474, - "49": 9.54712, - "50": 9.91942, - "51": 9.86179, - "52": 9.76162, - "53": 10.08383, - "54": 9.96743, - "55": 9.89199, - "56": 9.63777, - "57": 9.49339, - "58": 9.83897, - "59": 9.59641, - "60": 9.50823, - "61": 9.70513, - "62": 9.99499, - "63": 9.38054, - "64": 9.78296, - "65": 8.95946, - "66": 9.71045, - "67": 9.38075, - "68": 9.78884, - "69": 9.79451, - "70": 9.73441, - "71": 9.62146, - "72": 9.58792, - "73": 9.49657, - "74": 8.9434, - "75": 9.43112, - "76": 9.09716, - "77": 10.0681, - "78": 9.73005, - "79": 9.37764, - "80": 9.41097, - "81": 9.48622, - "82": 9.69669, - "83": 9.3163, - "84": 9.42182, - "85": 9.61516, - "86": 9.07553, - "87": 9.59851, - "88": 9.75046, - "89": 9.61112, - "90": 9.82373, - "91": 9.35278, - "92": 9.36495, - "93": 9.08811, - "94": 8.83656, - "95": 9.52256, - "96": 9.52793, - "97": 9.31634, - "98": 9.67876, - "99": 8.89321, - "100": 9.40801 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 1708.0, - "2": 1804.0, - "3": 1725.0, - "4": 1881.0, - "5": 2019.0, - "6": 2015.0, - "7": 2086.0, - "8": 1730.0, - "9": 2024.0, - "10": 1515.0, - "11": 2162.0, - "12": 1847.0, - "13": 2125.0, - "14": 2050.0, - "15": 1946.0, - "16": 2000.0, - "17": 1996.0, - "18": 1874.0, - "19": 2011.0, - "20": 1771.0, - "21": 2099.0, - "22": 1892.0, - "23": 2171.0, - "24": 1834.0, - "25": 1790.0, - "26": 1803.0, - "27": 1998.0, - "28": 2211.0, - "29": 2129.0, - "30": 2147.0, - "31": 1623.0, - "32": 2174.0, - "33": 2364.0, - "34": 2035.0, - "35": 2089.0, - "36": 2202.0, - "37": 2603.0, - "38": 2468.0, - "39": 2623.0, - "40": 2383.0, - "41": 2519.0, - "42": 2522.0, - "43": 2235.0, - "44": 2275.0, - "45": 2319.0, - "46": 2632.0, - "47": 2675.0, - "48": 2697.0, - "49": 2551.0, - "50": 2814.0, - "51": 2767.0, - "52": 2804.0, - "53": 3231.0, - "54": 2905.0, - "55": 2575.0, - "56": 3077.0, - "57": 2587.0, - "58": 3346.0, - "59": 3056.0, - "60": 2695.0, - "61": 3191.0, - "62": 2637.0, - "63": 2649.0, - "64": 3176.0, - "65": 2756.0, - "66": 3481.0, - "67": 2905.0, - "68": 3114.0, - "69": 3133.0, - "70": 3533.0, - "71": 3225.0, - "72": 2621.0, - "73": 3297.0, - "74": 2145.0, - "75": 2799.0, - "76": 3354.0, - "77": 3466.0, - "78": 3485.0, - "79": 3464.0, - "80": 3614.0, - "81": 4011.0, - "82": 3694.0, - "83": 3201.0, - "84": 3655.0, - "85": 3597.0, - "86": 3096.0, - "87": 4103.0, - "88": 3306.0, - "89": 3839.0, - "90": 3352.0, - "91": 2980.0, - "92": 3452.0, - "93": 2967.0, - "94": 3773.0, - "95": 3589.0, - "96": 3800.0, - "97": 3412.0, - "98": 3998.0, - "99": 3483.0, - "100": 3651.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 232422400.0, - "2": 232422400.0, - "3": 232422400.0, - "4": 232422400.0, - "5": 232422400.0, - "6": 233470976.0, - "7": 232422400.0, - "8": 233470976.0, - "9": 232422400.0, - "10": 232422400.0, - "11": 232422400.0, - "12": 232422400.0, - "13": 232422400.0, - "14": 233470976.0, - "15": 232422400.0, - "16": 232422400.0, - "17": 232422400.0, - "18": 232422400.0, - "19": 232422400.0, - "20": 232422400.0, - "21": 232422400.0, - "22": 232422400.0, - "23": 232422400.0, - "24": 232422400.0, - "25": 232422400.0, - "26": 232422400.0, - "27": 232422400.0, - "28": 232422400.0, - "29": 232422400.0, - "30": 232422400.0, - "31": 232422400.0, - "32": 232422400.0, - "33": 232422400.0, - "34": 232422400.0, - "35": 232422400.0, - "36": 232422400.0, - "37": 232422400.0, - "38": 232422400.0, - "39": 232422400.0, - "40": 232422400.0, - "41": 232422400.0, - "42": 232422400.0, - "43": 232422400.0, - "44": 232422400.0, - "45": 232422400.0, - "46": 232422400.0, - "47": 232422400.0, - "48": 232422400.0, - "49": 233470976.0, - "50": 232422400.0, - "51": 232422400.0, - "52": 232422400.0, - "53": 232422400.0, - "54": 232422400.0, - "55": 233470976.0, - "56": 232422400.0, - "57": 233470976.0, - "58": 232422400.0, - "59": 232422400.0, - "60": 232422400.0, - "61": 232422400.0, - "62": 232422400.0, - "63": 232422400.0, - "64": 232422400.0, - "65": 232422400.0, - "66": 232422400.0, - "67": 232422400.0, - "68": 232422400.0, - "69": 232422400.0, - "70": 232422400.0, - "71": 232422400.0, - "72": 232422400.0, - "73": 232422400.0, - "74": 232422400.0, - "75": 232422400.0, - "76": 232422400.0, - "77": 232422400.0, - "78": 232422400.0, - "79": 232422400.0, - "80": 232422400.0, - "81": 232422400.0, - "82": 232422400.0, - "83": 232422400.0, - "84": 232422400.0, - "85": 232422400.0, - "86": 232422400.0, - "87": 232422400.0, - "88": 232422400.0, - "89": 232422400.0, - "90": 232422400.0, - "91": 232422400.0, - "92": 232422400.0, - "93": 232422400.0, - "94": 232422400.0, - "95": 232422400.0, - "96": 232422400.0, - "97": 232422400.0, - "98": 232422400.0, - "99": 233470976.0, - "100": 232422400.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 683423744.0, - "2": 773273600.0, - "3": 773276672.0, - "4": 773276672.0, - "5": 773276672.0, - "6": 773276672.0, - "7": 773276672.0, - "8": 773276672.0, - "9": 773276672.0, - "10": 773276672.0, - "11": 773276672.0, - "12": 773276672.0, - "13": 773276672.0, - "14": 773276672.0, - "15": 773276672.0, - "16": 773276672.0, - "17": 773276672.0, - "18": 773276672.0, - "19": 773276672.0, - "20": 773276672.0, - "21": 773276672.0, - "22": 773276672.0, - "23": 773276672.0, - "24": 773276672.0, - "25": 773276672.0, - "26": 773276672.0, - "27": 773276672.0, - "28": 773276672.0, - "29": 773276672.0, - "30": 773276672.0, - "31": 773276672.0, - "32": 773276672.0, - "33": 773276672.0, - "34": 773276672.0, - "35": 773276672.0, - "36": 773276672.0, - "37": 773276672.0, - "38": 773276672.0, - "39": 773276672.0, - "40": 773276672.0, - "41": 773276672.0, - "42": 773276672.0, - "43": 773276672.0, - "44": 773276672.0, - "45": 773276672.0, - "46": 773276672.0, - "47": 773276672.0, - "48": 773276672.0, - "49": 773276672.0, - "50": 775372800.0, - "51": 775372800.0, - "52": 775372800.0, - "53": 775372800.0, - "54": 775372800.0, - "55": 775372800.0, - "56": 775372800.0, - "57": 775372800.0, - "58": 775372800.0, - "59": 775372800.0, - "60": 775372800.0, - "61": 775372800.0, - "62": 775372800.0, - "63": 775372800.0, - "64": 775372800.0, - "65": 775372800.0, - "66": 775372800.0, - "67": 775372800.0, - "68": 775372800.0, - "69": 775372800.0, - "70": 775372800.0, - "71": 775372800.0, - "72": 775372800.0, - "73": 775372800.0, - "74": 775372800.0, - "75": 775372800.0, - "76": 775372800.0, - "77": 775372800.0, - "78": 775372800.0, - "79": 775372800.0, - "80": 775372800.0, - "81": 775372800.0, - "82": 775372800.0, - "83": 775372800.0, - "84": 775372800.0, - "85": 775372800.0, - "86": 775372800.0, - "87": 775372800.0, - "88": 775372800.0, - "89": 775372800.0, - "90": 775372800.0, - "91": 775372800.0, - "92": 775372800.0, - "93": 775372800.0, - "94": 775372800.0, - "95": 775372800.0, - "96": 775372800.0, - "97": 775372800.0, - "98": 775372800.0, - "99": 775373312.0, - "100": 775373312.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 16.23173, - "2": 0.48632, - "3": 0.3184, - "4": 0.31067, - "5": 0.31575, - "6": 0.3127, - "7": 0.3096, - "8": 0.31392, - "9": 0.31591, - "10": 0.30891, - "11": 0.31209, - "12": 0.31271, - "13": 0.30582, - "14": 0.31032, - "15": 0.30879, - "16": 0.3077, - "17": 0.30689, - "18": 0.30824, - "19": 0.30953, - "20": 0.30728, - "21": 0.31141, - "22": 0.31157, - "23": 0.30569, - "24": 0.30896, - "25": 0.30916, - "26": 0.30674, - "27": 0.31017, - "28": 0.30716, - "29": 0.30734, - "30": 0.30698, - "31": 0.30881, - "32": 0.3089, - "33": 0.30647, - "34": 0.3112, - "35": 0.311, - "36": 0.30632, - "37": 0.30856, - "38": 0.30986, - "39": 0.30502, - "40": 0.31035, - "41": 0.306, - "42": 0.30943, - "43": 0.30773, - "44": 0.30886, - "45": 0.30942, - "46": 0.30579, - "47": 0.31121, - "48": 0.31407, - "49": 0.30981, - "50": 0.30966, - "51": 0.3347, - "52": 0.35543, - "53": 0.31067, - "54": 0.30931, - "55": 0.31517, - "56": 0.30883, - "57": 0.30908, - "58": 0.31373, - "59": 0.30746, - "60": 0.31113, - "61": 0.31473, - "62": 0.30775, - "63": 0.31034, - "64": 0.31108, - "65": 0.3103, - "66": 0.3085, - "67": 0.31036, - "68": 0.31412, - "69": 0.30947, - "70": 0.30646, - "71": 0.31133, - "72": 0.30734, - "73": 0.31043, - "74": 0.31583, - "75": 0.3074, - "76": 0.30939, - "77": 0.3182, - "78": 0.30755, - "79": 0.30953, - "80": 0.3085, - "81": 0.31023, - "82": 0.30621, - "83": 0.30705, - "84": 0.31232, - "85": 0.30864, - "86": 0.31017, - "87": 0.3124, - "88": 0.30667, - "89": 0.31086, - "90": 0.31626, - "91": 0.30744, - "92": 0.30887, - "93": 0.31054, - "94": 0.31172, - "95": 0.31164, - "96": 0.31058, - "97": 0.31089, - "98": 0.30676, - "99": 0.3105, - "100": 0.31337 - } - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json index 9be8a9dc0ca..b31640a2a28 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json @@ -1,1028 +1,1028 @@ { "throughput": [ - 94.6087716527102, - 115.85992244026639, - 138.9562527069375, - 133.18726531918395, - 81.97861561771212, - 134.30726469422635, - 86.456140428456, - 114.99456351298251, - 147.3101800153954, - 3.0364623744653003, - 124.7590786954667, - 134.2276982994434, - 3.0580463134110167, - 117.03969654341354, - 130.92134521286803, - 48.493091604204935, - 1.4498729599486508, - 128.01470907994928, - 1.8330770354872434, - 66.31842482241125, - 82.24189975425459, - 1.07058112939944, - 1.8815468970982412, - 0.9373246942729808, - 134.9963160815443, - 2.285771114682068, - 43.068220270070434, - 134.9677086822377, - 82.44946740133796, - 47.71839155542011, - 114.4199568886962, - 29.67621576315833, - 144.1589742491705, - 95.8164720809401, - 122.80562228460093, - 39.21436814433054, - 3.041180292262413, - 3.2867844729646842, - 72.43808226229888, - 0.8371525937296347, - 1.2212635079980698, - 145.6869075644325, - 42.317711349146016, - 109.1196064871946, - 73.6281770453198, - 140.4495689387567, - 1.219834296561022, - 138.66856497329005, - 23.33818821323391, - 67.82342558671365, - 130.09683254313987, - 147.60199288178146, - 0.9427431720755464, - 3.2856495013162523, - 79.12426666101076, - 86.41557345094756, - 120.17346279825053, - 137.16615251640926, - 108.93291864542198, - 110.10504114490513, - 46.19253755421628, - 0.950218846923012, - 136.50642826951463, - 142.73168666846448, - 1.2206786818073785, - 1.898581377105612, - 131.72636154091063, - 2.2842414327001976, - 89.76521170090028, - 114.66053545744656, - 58.64474290044525, - 0.8367865961030284, - 128.01767795820945, - 60.87292097103301, - 124.20016865241587, - 119.59336898055426, - 0.9425820346281929, - 93.70053305431952, - 1.0728113870213674, - 135.7596767309971, - 112.89357243644062, - 89.2743296587299, - 137.86411291342458, - 135.6974706051771, - 102.59633828443238, - 129.82058179399326, - 139.57672703148444, - 140.5642311163746, - 78.49182953675201, - 123.40912657074227, - 82.74099904578694, - 75.5490641626476, - 93.38596238341951, - 141.19058076067225, - 1.072254167577298, - 100.8669047802279, - 132.77382347347034, - 92.29086179175866, - 137.20301032384705, - 89.57723938765776, - 67.5465256589703, - 0.9498935124108836, - 1.0716887464650027, - 0.8365472180547067, - 137.902625307774, - 132.67132600219722, - 1.45201860416265, - 1.8366476879619427, - 88.65095604379363, - 132.1806036761347, - 126.0481874394642, - 127.43750324083169, - 93.27238135265156, - 109.83884164204308, - 102.30516355984702, - 141.10387096377744, - 0.9425154448032942, - 95.04281981148903, - 103.11525529548061, - 0.8361762901534399, - 135.3171561172067, - 123.30032998064965, - 118.75691144485415, - 82.21375599642211, - 66.37216333263251, - 120.02349229491865, - 27.339414655466246, - 133.1312422227687, - 123.02377779863252, - 111.0798894329, - 58.88405247768833, - 131.31767475108893, - 40.19076958615912, - 123.58362152151858, - 130.6541142941889, - 61.39555613504246, - 43.92154495664044, - 1.037012527495492, - 127.16052127606021, - 137.06554800183082, - 85.67161160523041, - 1.0253417447981334, - 139.20903624514017, - 140.19068787455728, - 117.67416498245059, - 23.410837515725987, - 130.73052473972666, - 22.561824695346466, - 1.028901717647808, - 119.30712483977753, - 117.77548263464804, - 135.2959098119142, - 142.10193821260228, - 1.0366044325624144, - 1.0350271698893887, - 132.8943567509843, - 51.50353963446039, - 113.39559408843714, - 124.25424103796537, - 129.60407993083075, - 136.8566687186031, - 1.036163010240988, - 1.0345739017743927, - 118.72350056844492, - 32.453707095990595, - 43.851925176925825, - 139.39206855448938, - 141.0979597861742, - 132.81461728578432, - 80.95956255477945, - 133.42483643501154, - 57.27721135575491, - 81.47649794801364, - 79.39765285063396, - 56.40255861789973, - 0.8890603607397893, - 137.59325887086797, - 118.03982850100024, - 53.04390121587005, - 88.31177924841927, - 1.0287550608831881, - 54.67393025836421, - 54.73556135447348, - 129.6143036059356, - 123.57095756116274, - 146.05184555314386, - 55.506024155977386, - 84.40666358740559, - 62.68531518105107, - 147.42894642823578, - 1.0274253590993496, - 145.9063526676371, - 76.36231256557768, - 1.035808949157935, - 136.1858098182613, - 93.13144140533397, - 54.57886608953819, - 1.0251956490815057, - 1.0270063804838983, - 67.96952180390161, - 136.90103479290272, - 78.62986077133174, - 129.97235998681177, - 70.57784076609056, - 1.028567312218149, - 69.64434330087829, - 1.0266016363366386, - 25.142311727265525, - 139.54750333578679, - 118.80547132463877, - 1.0342055876192149, - 132.79991800938092, - 88.25494664060619, - 132.4600307114398, - 1.026200775415348, - 111.33264788932784, - 1.031301270403004, - 104.45912302410692, - 1.0337771723701492, - 124.53550504281608, - 1.0283501183885058, - 126.53361938982871, - 139.83512785200963, - 102.28350299734186, - 122.68389734539087, - 139.27095111763788, - 1.0333552237490158, - 97.04945381465573, - 60.63422077140298, - 1.0248694052483192, - 96.77644543721476, - 118.38370846079931, - 1.0309087229819596, - 136.0487423665781, - 1.032932214377732, - 104.96525711514936, - 50.75370028394122, - 125.67617176346853, - 125.47392048276225, - 101.59371483024698, - 119.1183231384482, - 134.24568445137294, - 1.0323996653747745, - 119.28563313083153, - 50.183581144589674, - 107.50817556608582, - 127.4693561344537, - 116.0234844098742, - 149.0429439759437, - 127.77855747904051, - 1.0319900690130652, - 129.7400124946839, - 60.27584011696136, - 1.0245534026749026, - 113.8687773549026, - 129.9927880985222, - 41.55332067297356, - 12.991853549713621, - 144.9384518471586, - 127.77570879015505, - 79.09214991388126, - 1.0326234729165304, - 144.50618896622706, - 44.461452482592826, - 145.75357879817352, - 150.5618330832813, - 123.17802281879979, - 147.0133924731902, - 57.07203337285457, - 140.17944630269687, - 44.5066568841284, - 150.2834791394652, - 146.37106237628518, - 135.59553639884948, - 21.91845075979551, - 1.0391172002596458, - 92.42182316100705, - 14.98578222593142, - 19.944740287073653, - 32.75622847272977, - 58.94666795839769, - 1.0428676908165904, - 97.94938911630567, - 140.5399781540016, - 36.397689902912774, - 1.0322919875583962, - 33.76444948259586, - 147.54902815924785, - 51.316830076622495, - 153.55703202636914, - 46.423895018386204, - 140.271682540213, - 1.0340651759548871, - 85.22971449383292, - 141.80480996358014, - 1.0234621691055457, - 1.0355322329825165, - 136.96321865236195, - 138.2293990177049, - 136.89440582973347, - 96.94919171687799, - 54.992986423891566, - 142.91167590864902, - 138.73615931624403, - 86.32837448704223, - 1.0424247604140402, - 127.58052889290863, - 138.2472241943501, - 1.0338260095695477, - 1.0317372756221133, - 150.59249576769173, - 1.0229533138894364, - 149.1711141084735, - 1.0419379125129562, - 1.040305113121658, - 150.13261057757276, - 62.47975017460808, - 70.20443057037575, - 76.88821624674898, - 1.0225242667788867, - 136.83301633777177, - 1.0414381555227956, - 131.6044067829552, - 1.038902005769604, - 1.0335832618537684, - 83.38230404797935, - 3.047737981863063, - 140.9843162162637, - 1.0352264324041114, - 1.0409374510445146, - 103.17228299164871, - 1.0383219913492376, - 67.5151836065632, - 126.94018489907108, - 95.29974174831813, - 1.022161551972834, - 1.0348032799350415, - 93.24855217625235, - 140.00831851627856, - 142.46553219867087, - 80.52507876480331, - 149.47939431741142, - 125.60095189608528, - 92.57991472689042, - 153.09192667088175, - 98.78787611117323, - 136.9802701171813, - 1.0378200246498124, - 79.05370338483348, - 145.63143231877774, - 107.86253722014555, - 113.1390555766259, - 150.4596904971142, - 6.010262757833046, - 138.11675690694213, - 1.0371929842524894, - 55.1702723554103, - 148.4142582794926, - 108.62464742566522, - 142.2515578682958, - 149.5588988951372, - 1.0310870179234204, - 32.798276334675066, - 145.8363475163408, - 82.52497836005318, - 144.77105210255448, - 140.95035733017403, - 145.4844811663436, - 145.0646083055648, - 139.1641494303434, - 1.0401220454548914, - 146.10598185112948, - 1.0335329080843159, - 1.0316085392161136, - 133.98012837767038, - 129.62059667226987, - 151.2681266565858, - 1.030719335336581, - 135.9600336007384, - 1.0366589924031362, - 107.70864165999221, - 118.06361914834272, - 148.4615541738592, - 135.1206190516379, - 1.0788915925864082, - 1.0662361391973343, - 1.0784094142292293, - 145.5492563111853, - 100.1745158858024, - 89.97448812790176, - 140.13008352060388, - 8.378443606045758, - 19.841723966559687, - 31.11972559764219, - 127.75589035167928, - 144.649118240912, - 83.40454687650907, - 13.609558087727212, - 144.14916775068022, - 143.0831699051951, - 144.53789580070173, - 129.35689525213576, - 126.54760361436873, - 136.72725454688293, - 83.66753329456253, - 35.238850690537326, - 138.73588075606074, - 148.39285997484404, - 141.43706957675556, - 35.20788617289704, - 140.22918428708584, - 141.42288954532623, - 80.8071906111917, - 53.480908541665116, - 96.60869116876205, - 138.83030943256392, - 146.89537016655746, - 1.0659353965573166, - 138.66041009897964, - 138.0783824554628, - 54.95061283513892, - 1.0688789370964418, - 145.4981195236156, - 107.91672388693667, - 147.39387423946786, - 143.49840246862203, - 1.0781871694837721, - 125.37215873599833, - 46.390553110182545, - 1.0683430650310588, - 60.55314896188811, - 128.32962060837178, - 142.6648214311374, - 1.065532502621677, - 145.06202945295232, - 149.5985088362253, - 43.61426254132819, - 139.2120402464869, - 138.80120892663803, - 142.59390751862693, - 147.27000174003754, - 139.5980537408405, - 142.37081759892675, - 76.47257166426981, - 0.8663971721944621, - 1.067847671923619, - 1.0752972325757186, - 139.11225337731244, - 154.1012640338781, - 91.85315813315137, - 7.34066705730821, - 1.0763437477764217, - 56.03391448680589, - 1.067309924884827, - 1.0747789028833068, - 1.057667310022394, - 146.4284745539176, - 142.32867288307636, - 132.81801172672715, - 142.5746724111237, - 43.178263922620026, - 140.19958418325498, - 1.0742201855279276, - 139.95237701874325, - 124.69044225989671, - 89.93275546978569, - 1.0778110524743836, - 108.03753008375865, - 0.8649825661375887, - 101.22782607000799, - 138.6615942910557, - 1.0572642952018412, - 143.509260845593, - 1.0651693329533294, - 97.454990956795, - 1.075960473594851, - 104.89429761368234, - 153.46849816095335, - 143.28204379991922, - 112.57923589922926, - 145.35468060283986, - 119.53338040876814, - 132.53105489182144, - 146.60735281445733, - 0.8648000721123511, - 132.61504628627392, - 140.81953388748138, - 1.05684091289561, - 147.29646966899597, - 1.0646855258714663, - 1.0772400203863821, - 137.87592499226204, - 101.79954304062817, - 134.45893707567646, - 1.0737967838723397, - 147.3289039421509, - 142.95955673278567, - 123.11846557585149, - 139.7223884224781, - 5.274894457437767, - 0.8646226703470901, - 135.27010135142623, - 134.53222451904563, - 140.4520894166607, - 148.6784682726068, - 148.83999547746723, - 144.76059628877204, - 146.09818079047014, - 0.8644123666240657, - 133.05795012757028, - 141.21253159110282, - 147.08086640702987, - 153.13511211461227, - 147.72437078211334, - 53.87242850230838, - 61.34701685378028, - 74.50771860339175, - 16.40780504974564, - 16.448796993269678, - 144.08505364828036, - 143.78069847853888, - 145.08382905436133, - 139.4144567792124, - 1.113422304912727, - 23.732299099149245, - 146.716938504402, - 1.1150428401994323, - 1.1070863332993708, - 147.462815334713, - 15.300506166735937, - 142.89311901203018, - 35.881455163220174, - 0.8959120615185874, - 134.50389621984408, - 79.91603718165896, - 145.31776951960734, - 153.19384567886857, - 142.494036234602, - 130.58249312188119, - 1.1128817603274543, - 56.157995916719756, - 35.81413980204931, - 116.5213087641768, - 63.30354399512571, - 55.0117106848875, - 47.52954249314361, - 153.04709230401787, - 1.112276523473745, - 80.1523559974256, - 136.20373724941714, - 1.114673225365626, - 1.1067132158651183, - 149.29883052073288, - 145.10950784560325, - 130.53765167080937, - 1.111788125890117, - 0.8957719496064405, - 1.1050775451489783, - 17.522300994030367, - 154.45472111064055, - 152.07616582090188, - 1.1020107149905272, - 138.6808068419634, - 76.87873177159636, - 51.43702839643221, - 138.95045176064437, - 138.64177504011988, - 140.72197385602811, - 132.80947742972836, - 149.78872816785005, - 139.94034036065392, - 154.2632802491591, - 55.57148538150843, - 1.1044580058296936, - 147.1712801496827, - 77.84198065949245, - 142.38330204183904, - 151.76812011990265, - 145.19131540821485, - 147.26566215388425, - 87.12413393605841, - 1.1038403429439656, - 141.4935550752979, - 145.7397470598185, - 3.3080164659931235, - 123.0327553358976, - 146.24080278853327, - 148.10448175245884, - 29.234562433775857, - 151.30177873039895, - 135.4653748135468, - 144.3293913931314, - 148.16163203136404, - 1.1015876034201657, - 1.1114790318458536, - 136.68047783885697, - 77.72584511329579, - 125.73692105352463, - 106.98755729483561, - 96.25926845246491, - 1.109721323323522, - 141.71073652156545, - 130.22006710827588, - 145.24478945746003, - 80.67459353439743, - 1.1033551544760267, - 150.03177939272493, - 154.12875534463626, - 150.04771421074818, - 1.1010813815407388, - 1.1110434127990452, - 145.385699877379, - 86.86487551811825, - 130.16687493633253, - 143.8726181331947, - 111.91340621077623, - 146.0394914387852, - 1.1006353022455784, - 134.47903589563677, - 148.6907436994389, - 102.87151097507036, - 137.41724911494663, - 1.1146766644704549, - 143.85952373403495, - 146.92280951248307, - 1.100156488603178, - 144.04783334738536, - 148.53630346113712, - 58.74848466983248, - 147.0485685726298, - 141.32891699761203, - 142.8441702922343, - 131.04366253726744, - 128.6305301075303, - 1.1106412111686195, - 147.90025888582002, - 0.8959265584913588, - 149.5194069726666, - 137.43649451567626, - 1.1068068376551545, - 68.05269425995475, - 138.94056631255367, - 138.43818227469507, - 69.60391199895408, - 114.83395091462887, - 151.34107787433956, - 141.57237630997332, - 146.07433910500515, - 9.941778754980154, - 131.297822968639, - 10.386636719874664, - 10.545636067043365, - 114.58677137445733, - 75.28902943071078, - 90.63452059810655, - 143.58694736923238, - 9.901118804514459, - 144.5206530902411, - 144.78737732574044, - 79.81136215142409, - 84.9314508821071, - 120.18939827456474, - 10.225253542151219, - 9.702822548173124, - 103.1188517219872, - 138.5008491242522, - 92.02238700298246, - 151.99592340131602, - 9.807595290716304, - 150.0447954775559, - 134.2614008494909, - 149.38544573345007, - 149.62298116309924, - 124.32358754465251, - 132.817456221544, - 10.50607995390264, - 9.78317681034783, - 151.07916494121415, - 146.93545537009487, - 118.45851163082196, - 145.03008316360754, - 154.4449202186591, - 146.86002069809945, - 150.6932855951215, - 110.74803327496042, - 127.40788523389726, - 150.81323854197058, - 150.0047673310006, - 149.6063654551971, - 133.87244996538675, - 10.329695475492791, - 9.414695716712222, - 106.77032789813472, - 118.34636653947105, - 123.44441062862572, - 144.9015592115516, - 153.74652990582067, - 10.065713405335144, - 129.38998560194165, - 117.69087049838025, - 99.15650839997046, - 127.90462338199198, - 147.3574863739125, - 9.696544883885949, - 9.8853852911422, - 128.35872796896587, - 145.2939860705264, - 128.72081963712404, - 94.09935653689803, - 142.8780531031409, - 130.5213122981276, - 126.89288883528536, - 153.36107852781166, - 149.17239657923582, - 9.177632630803961, - 9.387171298727486, - 109.68196882316985, - 148.55536204011432, - 152.61730207818772, - 9.648922236946333, - 132.805446535875, - 138.74295200738652, - 141.66118217831166, - 124.0399127789103, - 113.05005278683446, - 149.71230902297984, - 25.727698431920004, - 129.56419655827216, - 130.40687823665095, - 128.46470366050013, - 150.46298369674685, - 9.22073843893938, - 110.36443029340542, - 148.23878821929193, - 10.219508495480236, - 9.615051521185155, - 9.8723813087942, - 149.91378148843256, - 9.149056684599877, - 130.37704092008303, - 114.86611671621016, - 134.53633480709703, - 131.11593468604048, - 149.74665952988033, - 136.60701891253495, - 146.50864617645632, - 9.094221140419737, - 149.69902295915708, - 126.93245475406366, - 141.2463933703881, - 10.18172163650932, - 136.76582155059438, - 155.5823388453975, - 144.68082947663285, - 142.0128061769988, - 116.20800508912414, - 101.13756407758095, - 10.050927550768915, - 10.14139856150474, - 9.573219645146107, - 146.33874064646594, - 137.22302119976462, - 132.14965518046, - 148.08190796641483, - 117.6843964457568, - 153.04352772565807, - 146.79238076404926, - 9.522740968586977, - 145.93484469600287, - 13.925952420322696, - 12.697420287309185, - 146.39122941822845, - 113.94298610788566, - 13.844109957456581, - 154.57922917096633, - 13.525210269101805, - 103.83976095796662, - 97.75660804271413, - 135.83818209343426, - 158.60060111529293, - 111.57793188874757, - 13.768524263105455, - 154.2203592546867, - 108.85242762118563, - 111.15752259030245, - 149.5942138872604, - 119.77102605185765, - 120.68065341205389, - 105.29698904913548, - 151.41465167808087, - 138.90606724001483, - 13.437371194424983, - 119.97194649055415, - 144.6223725248399, - 146.9934910169238, - 149.45319992777343, - 121.48260402443249, - 13.662736071688842, - 14.448955892498802, - 144.5545360346381, - 154.00382983055897, - 151.8635735223181, - 137.2321484611102, - 119.71487519948164, - 88.24978714231261, - 147.74815341218743, - 142.1113258863455, - 132.08775922189477, - 124.63351274554526, - 145.72256212355262, - 100.50708502243579, - 139.16363846809003, - 114.82662827063822, - 154.78307253831395, - 149.22879563842886, - 152.6744734255461, - 145.81022434241217, - 152.68018782123758, - 116.75549006136289, - 12.968595875688791, - 6.824624970615158, - 125.05116103474757, - 147.66072487793718, - 147.5735120742967, - 139.1302141298083, - 146.48542990069834, - 12.674865288395944, - 147.88858853602966, - 6.8124480142416175, - 137.54766974463703, - 130.89979405333307, - 13.364169845161861, - 14.116086127002273, - 130.3002929300388, - 116.98398239487472, - 152.70827610346095, - 98.51470626500011, - 135.1252373635164, - 14.405992358855888, - 154.13709739001223, - 146.28661687368685, - 137.87827066214206, - 12.621081453489012, - 154.04574874294514, - 6.802625211185703, - 152.18661864386252, - 149.30257880598677, - 13.244501725269068, - 138.34068638798834, - 150.95140747506372, - 141.8441899037163, - 152.99022366652198, - 103.95004802425926, - 140.28144756248412, - 154.51222806007945, - 85.40777548962518, - 154.7067128296305, - 120.47843952303268, - 12.568053995018431, - 12.916583075889136, - 105.92477484543576, - 137.92878859711615, - 135.13853669037294, - 137.88549737290148, - 157.83019925734393, - 145.48927689323145, - 12.509532718065461, - 150.6233829715981, - 119.23669844460764, - 138.49099023171033, - 154.0870149904812, - 140.1862744667834, - 148.860174031694, - 147.54629689336036, - 12.448861769003683, - 152.4711466483636, - 102.47079224461186, - 152.40864885890767, - 156.21773232766026, - 13.139291580904986, - 150.30653960489693, - 145.43571147072188, - 132.8965387342577, - 144.85972103961666, - 125.5438694385711, - 158.07457773478276, - 14.359506122440205, - 137.7658155977229, - 153.68125116011197, - 156.57780724945528, - 12.394708947912125, - 12.874702780202174, - 110.61518572692995, - 149.4338565730422, - 149.67552030435513, - 146.20909415912828, - 9.308833539527914, - 26.176147260970783, - 8.701217384742513, - 66.92241449340185, - 105.12940849136734, - 145.25326276553395, - 139.68219350261262, - 131.60335890332783, - 150.53420884400245, - 17.552483447968918, - 99.60476667168517, - 9.003208512207522, - 8.539560747895454, - 9.946172723540226, - 150.55644446784382, - 9.608936841972842, - 104.80864366760326, - 25.95068644438624, - 99.42592550150236, - 108.35979254469888, - 113.9171427720856, - 9.905905876631499, - 131.1684982861573, - 154.7989292174601, - 151.34753888952145, - 150.11816141981262, - 143.00557828542912, - 126.2310299151925, - 113.53830001728545, - 148.13405630794878, - 150.7564429392251, - 155.252325076404, - 18.20048176554747, - 25.725436761645142, - 8.678711562613207, - 143.3683328827327, - 127.0294451168928, - 137.50119476282134, - 10.068367539846923, - 155.64822784014916, - 153.2789382926615, - 25.46950813818654, - 142.9138107220956, - 155.10510899417167, - 107.40557834412083, - 9.871948602847068, - 144.4712732194919, - 140.17802930301565, - 9.286026243902361, - 129.1488895575147, - 124.35586045151207, - 140.1410811550992, - 96.63692877337894, - 153.62093095799207, - 156.05800033315097, - 9.587609950939838, - 140.09721428165886, - 134.898750425008, - 8.652809034763463, - 8.989448046931262, - 107.64260577858933, - 9.825071080298192, - 150.6237132142087, - 143.76058852986372, - 154.01627264735168, - 140.85322298632985, - 143.63714834446708, - 149.7259575806535, - 8.53942846683121, - 157.02635815805976, - 150.83913162907433, - 154.0283691261865, - 9.246842209481716, - 154.5851361854829, - 133.4662155767381, - 137.55396410787307, - 105.77910782321499, - 148.97953057255376, - 111.3041581371634, - 9.543858351726714, - 142.71996301994741, - 144.2417836324451, - 148.5293262803374, - 8.95331376662564, - 105.2724164655814, - 149.16646109060707, - 151.1947852118465, - 9.503293907683512, - 133.40055362812345, - 8.776394391795916, - 148.3675722527084, - 154.66946641450528, - 122.71674068416665, - 149.62192317697068, - 153.40159484208397, - 9.46860898864519, - 146.10526710538994, - 143.96020057925128, - 8.62472208077336, - 8.906885562515198, - 105.7754218686014, - 150.17957794387223, - 144.0451331512576, - 149.95461039551162, - 151.46311089131117, - 142.22104279807664, - 147.3679944003333, - 140.5394711174869, - 123.62157744638432, - 152.32796921399395, - 156.6603241829257, - 9.43621164630811, - 158.2241383954169, - 149.33346139426692, - 144.12074054746773, - 143.1977521817863, - 8.536662624511228, - 9.785635570067782, - 147.61880087321424, - 9.402323265876474, - 159.1161790596516, - 146.56796834276156, - 147.64890403285438, - 157.70847517328534, - 114.64282143770687, - 148.5000942425868, - 10.052761003641129, - 147.38801074409378 + 41.46611265659158, + 44.4918071112372, + 46.926673665513704, + 46.30487800041612, + 45.31117511724168, + 39.48427257480573, + 41.73807567318408, + 44.986328772700176, + 46.79460518580979, + 2.1481645603133406, + 45.3304673980315, + 46.361305003734564, + 1.2216768370041928, + 35.39842883637453, + 44.9539795483452, + 39.212326267312775, + 1.0742220506708642, + 45.596949876501405, + 1.656518545685144, + 41.1853065101293, + 45.186903991589205, + 2.733636984435035, + 1.8859234764357438, + 4.103119744826081, + 45.69245622017379, + 1.6582215083936738, + 37.954906657600475, + 46.5127757873931, + 45.29733823530308, + 23.1754689963102, + 43.44487109471452, + 33.311038622351724, + 46.400400898475304, + 43.13207624251721, + 45.26221685255157, + 38.89631907864675, + 1.0766827581902934, + 3.1955625641377354, + 41.00672778846412, + 1.225434086753332, + 0.951420354873873, + 47.29759062957134, + 37.27931328255301, + 44.02626192577354, + 44.567351509891715, + 41.19817412895097, + 1.4117117845102758, + 46.974942144500005, + 26.16803432928029, + 40.79104304470394, + 45.98186302516314, + 47.4055947551752, + 1.076201435026891, + 3.1796394093402074, + 41.23717257081556, + 42.85213590859161, + 44.28329201807133, + 46.527540336613534, + 43.08848614726634, + 44.40830753324719, + 41.37604170752994, + 0.9482378607333808, + 45.48122547719385, + 47.20316588665498, + 0.9510683482370443, + 1.9012380421663475, + 46.19550253488152, + 2.7330118039774067, + 45.74495207812405, + 34.67238053318697, + 38.85119722571936, + 1.225081100472964, + 45.15238085691014, + 40.396011557170766, + 45.488921919651816, + 45.29351001493665, + 1.0758273605231232, + 29.808026495079588, + 1.2280820949811997, + 46.586185131212794, + 42.89263913245724, + 42.15612175451927, + 46.693253798156995, + 46.57003199283068, + 46.509087816223484, + 38.12557546239959, + 45.81548305523131, + 46.07453120649211, + 40.81605463432999, + 45.228424339779814, + 42.086064813661196, + 42.78740035356858, + 45.98922633164769, + 41.28717865700289, + 1.2274351142907918, + 43.46971411790415, + 45.4498626576556, + 42.51719188567606, + 46.624215728553786, + 43.26045159027894, + 43.962414509948275, + 0.9481540147597537, + 1.2267700611313974, + 1.2246727704472544, + 45.950324312195605, + 46.02559998344755, + 1.413545795432525, + 2.1538932898075407, + 45.57032628071106, + 38.877775528665516, + 44.5660811280025, + 45.98326532911864, + 41.78435738761637, + 44.118449498817554, + 43.11682781122976, + 46.80957208928424, + 1.0755822711089933, + 29.775928132799514, + 42.492052303926506, + 1.2241095107799485, + 45.796086216431775, + 45.258843364665246, + 44.97308057669771, + 42.89527265230854, + 43.91533758581356, + 35.81442349583988, + 30.65358830169187, + 46.3182793971083, + 44.145493159555286, + 44.2651994526335, + 40.09824843769361, + 45.68707977480025, + 39.990813212941646, + 35.79658562417175, + 44.86013694329229, + 41.83115806056866, + 37.15064410140025, + 0.996787320025337, + 45.66808620182929, + 46.6130598481811, + 45.60972037064592, + 0.9940425141246046, + 45.591900274871186, + 46.96840985185615, + 43.393354375970155, + 25.5248831966376, + 45.77235244972332, + 24.590561326831967, + 0.9773483444490005, + 34.09417278739622, + 43.586572958161206, + 46.535859932274164, + 45.946757322805404, + 0.9962165194499956, + 0.992874583950711, + 46.119932829039165, + 42.179658293228435, + 32.997191121192365, + 44.17582132320044, + 46.14366473770965, + 45.81106545186327, + 0.9957624959115234, + 0.9924622264244217, + 39.42192933951627, + 37.64229442727469, + 21.26565173458009, + 45.593412953334585, + 46.87304671516134, + 45.216027572946594, + 42.43765019133474, + 46.197382024442064, + 40.692114254409056, + 45.33796853087654, + 27.766522112160985, + 40.02641706822085, + 1.3017150918854614, + 45.591631786019235, + 44.34279696011747, + 39.28257190816356, + 43.72958684288255, + 0.9771143356157014, + 23.874882409185425, + 38.84831650281934, + 46.04825715862786, + 44.318350427904555, + 47.26086876225989, + 39.433419122254435, + 42.94084765393213, + 43.44077111651132, + 42.4775425505976, + 0.9890763303083981, + 47.353878858820345, + 40.99026973150018, + 0.9955331259047124, + 46.52810662522569, + 43.71121305319187, + 43.098140605333754, + 0.9941110054345192, + 0.9887007080233833, + 41.60423122999918, + 45.81533148936388, + 42.37614297709579, + 45.84171517205181, + 41.73162426832469, + 0.976838541947363, + 14.558863836592382, + 0.988317986920056, + 27.41518624216025, + 46.00613760472248, + 44.605125117227445, + 0.9923556095766691, + 46.06453996269855, + 45.69598995103852, + 38.29204120955434, + 0.9879204612413145, + 45.051133494631664, + 0.974139430894493, + 43.52911731376158, + 0.9919675926934881, + 45.37964604415822, + 0.976397605350521, + 36.30289308241207, + 45.597233615462315, + 43.61071649968794, + 43.122470348017536, + 46.76087701561043, + 0.9915593888202096, + 43.301652472823534, + 43.35874933591963, + 0.9940066207204965, + 42.186091123827985, + 45.37749985977852, + 0.9738097357420213, + 46.47531110944141, + 0.9911618676375942, + 43.561154900046205, + 42.50481546978642, + 36.28178246877416, + 44.229193258120816, + 43.274122438133034, + 43.16603619055846, + 46.24123104179791, + 0.9907652867200517, + 44.808052346983644, + 42.157257924432415, + 30.810167635761594, + 44.5009455404432, + 44.803133707609575, + 46.717718944658586, + 45.328295623099564, + 0.9903649151763216, + 45.98765051561304, + 43.15949033247262, + 0.9938810855133485, + 42.5272021864534, + 46.202556875553654, + 37.69680010665373, + 13.506488443568907, + 47.084518208092895, + 45.34409129030842, + 45.528670127709155, + 1.0839758382565585, + 45.77369572816552, + 40.36600389536794, + 46.346373598961115, + 47.59928731210073, + 45.213230445194775, + 46.97741000418462, + 43.73589527028813, + 38.21138599701667, + 39.80440406603509, + 47.546574744238036, + 46.363044750837105, + 45.73935328577624, + 22.79542790283351, + 1.0852955230764447, + 46.31190530756646, + 10.103645571001175, + 20.743583307847267, + 34.08924086156784, + 40.34233471572178, + 1.0825832325439408, + 42.93380762165118, + 46.538540446937695, + 40.56431787179345, + 1.0837596134259624, + 35.02268200701654, + 47.136990718638934, + 38.591258432063235, + 47.93266376947172, + 40.53416662878643, + 46.663334136659614, + 1.0714520955139675, + 27.88935756664922, + 45.48047962233704, + 1.0758750615408978, + 1.0683190801502396, + 46.009876361978876, + 46.59268594380503, + 46.02812612004097, + 46.372356575684854, + 22.894765755636868, + 45.64436406976758, + 46.20773355624579, + 42.364426646383905, + 1.0822510357556412, + 44.863056156314066, + 46.46090797778492, + 1.0710544669423023, + 1.083596675232654, + 46.253226306136575, + 1.075461579555405, + 46.46757181265049, + 1.081777244820761, + 1.079157130525964, + 47.44728077576711, + 44.18890905454099, + 25.69445080780143, + 41.61341063520841, + 1.0749834632245117, + 45.18278804232428, + 1.0813046939407982, + 45.584290798191994, + 1.0851558601194167, + 1.0706298125469418, + 27.277652622917802, + 3.13795203228774, + 46.596243996630385, + 1.0680343711445561, + 1.0808489429820316, + 44.07771833504717, + 1.0782837622370247, + 44.620236842054005, + 33.66037405692795, + 42.88981761147569, + 1.0745719383443746, + 1.067541523615096, + 43.3531928586852, + 46.45260807995745, + 46.301433990064965, + 45.45037480313856, + 42.01190688214572, + 43.97592120992246, + 44.22612202356458, + 46.93790632881387, + 43.35324044647867, + 46.24983553374027, + 1.0779013969854039, + 45.68642573969881, + 40.71576971597602, + 43.609256041900395, + 44.75345611987869, + 46.683440264062696, + 6.250364298356673, + 46.58797465847453, + 1.0773923535890582, + 43.82763570204923, + 41.62940460437239, + 42.91661388574536, + 46.901610347450095, + 46.61677212391794, + 1.080583826854443, + 34.07713605907777, + 46.92641126499492, + 45.79075334582258, + 40.14409222341034, + 45.361779654878845, + 46.88204342817273, + 46.35566639777504, + 46.36704829301128, + 1.079068056447631, + 46.774512434519465, + 1.0704507990204184, + 1.0837001046492374, + 44.56501843026455, + 45.92497594226974, + 46.819599375484145, + 1.0801577199815187, + 46.01182819769449, + 1.0770346495733834, + 46.950613182781744, + 30.797706097998343, + 46.18180484355316, + 46.16072338065117, + 1.1133090433838153, + 1.1264329475750274, + 1.1236172122377037, + 47.045544454610436, + 46.77875324298633, + 28.03992244253687, + 45.334641615839494, + 8.780689100623139, + 20.7913981632672, + 32.723036948097274, + 45.13282209264667, + 46.65435200771115, + 45.96287965580367, + 9.076296968757461, + 45.4816339150996, + 46.902872519542036, + 46.16846796984993, + 45.756891597403175, + 44.88315382035088, + 46.23903054578556, + 45.83324366902273, + 17.750809391531607, + 45.20000225981293, + 47.302482301226895, + 45.60218665990497, + 36.97764728135097, + 46.59609042040382, + 46.604767462324304, + 45.96159537616419, + 22.37221435902452, + 43.859502782475616, + 46.5164446015921, + 46.29329085467359, + 1.1262112315718147, + 46.308551190848824, + 46.12319048896243, + 43.60305812792925, + 0.9422659923955576, + 45.850627271010616, + 45.017760412103506, + 46.45017372234843, + 46.681005137311296, + 1.1235052275623567, + 45.024655731975905, + 42.551907139236725, + 0.9419457570631012, + 41.1118024425248, + 45.63421048620437, + 46.022116096626675, + 1.1258383546403372, + 47.1081443735114, + 47.030126605956774, + 42.86500455064436, + 37.358353939700315, + 45.34461986882157, + 46.86806884248587, + 46.417501701989885, + 46.351389315230215, + 46.78447423742242, + 43.74686698408526, + 1.116867665232356, + 0.9417093885501255, + 1.1193255628248941, + 46.36628759364972, + 47.0182927090698, + 44.33757352470002, + 7.691634088129115, + 1.1283438070497074, + 43.879143747221455, + 0.9414915905260655, + 1.1187592356622462, + 1.1221505116978934, + 46.07747894106487, + 46.579798906537704, + 45.766896552621894, + 46.65247758283254, + 43.302159908237364, + 37.720159108605536, + 1.1182282725285237, + 46.39182837285494, + 44.636636353923784, + 43.44450203063323, + 1.1233649178804157, + 45.04855028838785, + 1.1165108506849695, + 29.25784442036365, + 44.92016113045485, + 1.1217307674387187, + 46.08594914883392, + 1.1256588113160433, + 44.33658350966423, + 1.1279641443945907, + 46.995953225218045, + 43.09174152350243, + 45.522175701238005, + 44.54660682798267, + 46.26002914896281, + 45.121721334753246, + 45.99661519970516, + 46.999367551883665, + 1.1162274151428622, + 34.79092708982097, + 45.466303894602824, + 1.1214388358967042, + 46.3611527229414, + 1.1253775196067384, + 1.1231558495643674, + 45.46781022594765, + 46.83967784020296, + 35.37244717495285, + 1.1180685191822184, + 47.0281597759591, + 45.004932496628875, + 44.35708507257986, + 46.65855899768837, + 5.505111079406215, + 1.115802761131929, + 35.602590093008914, + 44.671751586624886, + 46.281278781026465, + 46.65874233841448, + 47.449917573209895, + 47.11754288927177, + 46.84313387306054, + 1.1152851890752418, + 26.693730551391678, + 45.574691537692864, + 47.110350441661474, + 46.950895044828556, + 47.10814947984309, + 42.35670263948847, + 43.399091167413815, + 45.65945467138436, + 10.323879128717438, + 17.406756102821927, + 46.70765041608834, + 46.265154949804675, + 46.966387230240066, + 46.58181691440536, + 1.1794390054814614, + 40.240832270343546, + 39.59688963721167, + 1.169177901708881, + 1.176889456593387, + 46.512318262726104, + 16.255791986842784, + 46.90191826875892, + 38.002332039368945, + 1.1673839996531623, + 32.855434627015846, + 43.339268319257165, + 46.75273409704357, + 46.82224515218503, + 46.7787448289983, + 46.08633464118119, + 1.1789416201176985, + 45.01880600815589, + 17.692981429746695, + 43.82069805510859, + 42.693302457425894, + 40.895519742462156, + 43.141099312595934, + 48.08036522096514, + 1.178390117026328, + 45.95511642215028, + 35.29568405980472, + 1.1687957641452225, + 1.1765143734981645, + 46.688387154545254, + 47.06125638807941, + 45.346066735128574, + 1.1777709765320192, + 1.166989666506321, + 0.9847523589742398, + 18.562855771239047, + 47.9065264813057, + 46.73354514650198, + 1.1735046304883543, + 46.412712735423334, + 45.16100408019957, + 43.83022094061403, + 35.89794593782671, + 44.97192473982221, + 46.7633180339843, + 44.329869977212624, + 47.38342947643397, + 46.79402738420473, + 47.634269098703626, + 44.0213863595159, + 0.9845269249937244, + 45.78778499348287, + 43.90149865817902, + 45.65368969409286, + 47.746456721033944, + 47.21697228426952, + 47.01924612843149, + 46.3245200194134, + 0.9842560530393194, + 45.26992712182612, + 46.89243421872701, + 3.4924828727877877, + 45.25207572636316, + 47.25700297914972, + 46.94730150195301, + 39.12367514310055, + 42.117856976344655, + 44.28179459170351, + 46.596840500912684, + 45.392754933120926, + 1.1731165363524663, + 1.1755941425503302, + 46.46126582671268, + 45.79994582850055, + 31.36362072652773, + 43.50384100878153, + 45.440038476775335, + 1.1661505662188223, + 46.52744939333318, + 45.250414658311975, + 46.53386354717518, + 45.796239735104564, + 0.9841302985201961, + 46.27883497779145, + 47.83598353847002, + 46.607837943658275, + 1.1726681962992465, + 1.1751504766334446, + 46.84845290565303, + 46.07497571222637, + 33.33732005606778, + 45.813985387630716, + 45.57964157112892, + 46.41818933014048, + 1.1721397028860254, + 45.89252926130944, + 47.09569465450331, + 47.250364539349285, + 35.22784278442342, + 1.1688030911620526, + 46.42186257421796, + 46.25658899517002, + 1.171409947579052, + 45.16137403712752, + 47.22442045049697, + 44.82261712339744, + 32.494327996097915, + 44.219079390101115, + 46.87735465561079, + 44.699203955991905, + 45.12568915598884, + 1.1747532937483116, + 47.069832959511444, + 1.1670956785442357, + 41.217948435045656, + 44.93033926516496, + 1.1766349885441727, + 35.47522021954888, + 46.21124702140885, + 46.24628779612773, + 34.53125955420697, + 46.66578037331865, + 43.65856477535035, + 45.03361057951491, + 46.76526122602155, + 10.182019712559228, + 45.71366318720834, + 9.833945628376052, + 9.322117004081543, + 46.537564499785105, + 31.262138808373493, + 37.90592059294092, + 46.820091937863225, + 10.139423148881114, + 46.75580347295349, + 46.89455728317566, + 39.52390472502032, + 42.643467900988064, + 38.90725083946543, + 9.086630150053459, + 8.937192123351853, + 40.9872575801166, + 46.394128489242924, + 41.193529101734704, + 47.34329154675404, + 10.054610354639179, + 43.31828144588645, + 44.553079069624026, + 46.98279134065351, + 46.830147489351724, + 45.31329233494219, + 45.552850223950976, + 9.295212965663417, + 10.01436272470524, + 43.57022598341257, + 45.70609566213184, + 43.449062338174066, + 46.855675373016474, + 47.68860594538369, + 47.09689498272573, + 47.173878516378814, + 46.069788054621185, + 38.92002107306488, + 46.38712908030891, + 47.104897416242906, + 46.938337511897245, + 45.36212980855197, + 9.7037632831636, + 9.265430506589102, + 46.11721659871563, + 38.06187391881914, + 43.25827348162763, + 46.84719251692419, + 47.03682707869591, + 9.90500846057903, + 45.68739012850455, + 43.47148156475432, + 45.23323967788647, + 39.81125388088527, + 45.95084232488125, + 8.919454342379801, + 8.706571515609426, + 45.29003523159025, + 46.867399234540684, + 45.35240769107086, + 44.80265358061401, + 41.83510960528982, + 43.92616077285124, + 44.61292075723489, + 46.86625528407582, + 47.230904823696534, + 9.643361950798496, + 9.236779459262468, + 46.27993094745158, + 43.29062809284174, + 46.53130368901898, + 8.891092687715933, + 45.323215643957305, + 46.38559644193777, + 46.8553797027437, + 45.16725651833185, + 46.26177304715086, + 43.16649621953115, + 19.53072875578119, + 44.16107832748164, + 44.46643011473998, + 45.302511702487166, + 47.59950805589659, + 9.206283803180765, + 46.31521045156664, + 42.932315734513345, + 9.081962094633843, + 8.862645496755041, + 8.681026899042758, + 47.175946890403075, + 9.613647025719098, + 45.37459772842735, + 46.657937572561956, + 40.090063197986055, + 43.91176191056239, + 47.1764939819939, + 44.932347492473085, + 46.951971869749755, + 9.588107858966847, + 46.890536209011636, + 47.457220061858926, + 41.820791051617206, + 9.051934235829219, + 45.46750284471863, + 47.1114848526844, + 46.90614671206355, + 46.81408948407702, + 44.76508972637772, + 44.94143445208981, + 10.013702243637548, + 9.016326405341099, + 8.836765675846252, + 46.724030690708, + 45.670931647965055, + 45.52105012345985, + 46.760404038674345, + 46.879394746618935, + 44.17372013338399, + 45.75158023561404, + 8.805217872024683, + 45.797390838433785, + 13.147893146580197, + 10.47047709122617, + 46.61575812332005, + 46.51823693220529, + 4.823033237525791, + 46.77438522864306, + 12.978009554740229, + 38.60487947846694, + 42.776667803234396, + 46.400158258735026, + 47.945284694706544, + 46.56814403610221, + 4.817274157491479, + 46.62284523101857, + 43.12368820615556, + 41.32670008561977, + 47.18041683967238, + 43.946314235571926, + 44.21062282398479, + 46.19942835901387, + 43.058732279332816, + 45.38189559700182, + 12.884302510247224, + 41.31993708388949, + 46.47169213829526, + 47.19006572402318, + 47.14982705362978, + 47.06368907184152, + 4.812880414029111, + 11.16220592067454, + 46.574241250493166, + 46.97994816848278, + 47.45816665639938, + 46.13083135931701, + 44.32000975084153, + 43.41804159092183, + 42.66169852490167, + 45.48613569289166, + 44.33345445574926, + 43.452008302705025, + 46.81171828117368, + 43.10993692872848, + 45.994793877105536, + 46.800586622051604, + 44.27154316655175, + 46.105917327794614, + 47.46844284412024, + 46.26483577817879, + 47.53682651754337, + 44.570703276937955, + 13.903655242145248, + 11.480956559418479, + 39.336500908555834, + 45.90660459732642, + 46.77917515765938, + 45.088381020490885, + 46.506580602768324, + 10.416775312398924, + 46.58444309156844, + 11.387487180031048, + 40.66527760299146, + 43.83362837067986, + 12.535722984692502, + 10.862075986088263, + 45.57849071079437, + 44.54752207894966, + 47.368339209936586, + 44.99292457355705, + 40.53083756344339, + 11.0636299214144, + 47.688667053142176, + 46.49150277169404, + 45.74006902822907, + 10.33525884882965, + 47.48557960393818, + 11.308966508889716, + 43.29259854243531, + 46.1099584752184, + 12.17957601526656, + 45.17415787692287, + 47.42069363597441, + 46.61857073840612, + 47.2421945434337, + 45.43588217737557, + 40.87274833234901, + 46.70759606653805, + 36.65554403597885, + 47.00974843039727, + 44.27238095134427, + 10.215116571612004, + 13.7852700376187, + 46.056843647274086, + 40.6532114020977, + 44.73992298080998, + 45.68916428641405, + 47.31026005200245, + 46.82535713731543, + 10.130547297609347, + 47.03536361799409, + 46.991892284267614, + 40.158116078863046, + 46.709887162762875, + 46.67477141304538, + 46.52127067854677, + 46.8876604645323, + 10.042145383707755, + 47.028109894652104, + 45.7372913308103, + 43.35504560755716, + 46.94810107337359, + 11.8541419498795, + 46.48396692070885, + 46.650791251635994, + 45.251645228092976, + 46.90500963017914, + 47.44769079351513, + 45.17830741847997, + 10.999409433497265, + 46.47750683850478, + 46.775120397902185, + 47.814786925390884, + 9.948141267257297, + 13.587316761063226, + 46.55485731583328, + 42.77962873201528, + 45.79657353014755, + 46.78648032853886, + 6.092950585496579, + 16.427217699690395, + 6.041669306781378, + 33.44834000640586, + 45.71021173581392, + 40.44649791159415, + 44.41704966518361, + 45.16867811008679, + 46.553484065254395, + 11.951659518508801, + 40.964520355583325, + 17.222473173678548, + 15.810785212495478, + 5.896598504159821, + 46.15486957962745, + 6.267247605496281, + 38.65955739206124, + 16.334240831872595, + 40.92114763036668, + 44.25538155878388, + 46.79667178943268, + 5.886210147826818, + 45.086831193223446, + 47.3009972481073, + 47.07801971653764, + 46.80397795995714, + 46.806845163101094, + 43.42411625011456, + 46.37426980773864, + 41.17909401763616, + 46.16226579941339, + 47.44507636385267, + 11.930205494257288, + 16.233747914032552, + 6.031411752952078, + 45.92910900092996, + 47.47110773753601, + 39.494621036199604, + 16.734374432604927, + 47.37802539239185, + 46.74469194379278, + 16.087259096423576, + 46.92051488410033, + 47.34732444333283, + 46.40587690730415, + 5.872780467931287, + 44.55593583365237, + 45.7052618242163, + 6.085826627872682, + 44.846431805065144, + 45.41689502907426, + 45.289189315257374, + 44.95210230627078, + 42.99904025714732, + 46.839026962763846, + 6.250954782033121, + 44.8453124032084, + 45.278261112862296, + 6.020810288080093, + 17.182296973833214, + 46.63633652424215, + 5.866101016705892, + 46.160696572751434, + 46.32038287353405, + 46.89907461120633, + 45.95374406526204, + 46.925975948392896, + 46.42837166656114, + 15.78999329881552, + 44.465193132950446, + 46.21771478110725, + 47.314131714710484, + 6.0756954521719475, + 47.654756058723834, + 45.70610138140926, + 46.42506531228388, + 46.278376731444745, + 42.38396099575264, + 42.30031354989153, + 6.238343970049818, + 44.63197875047801, + 45.842276161134954, + 47.290515920449934, + 17.100464476837107, + 46.03336595920761, + 42.199011552033475, + 46.12151306088509, + 6.22230433569469, + 42.38409981463419, + 16.065182030558717, + 47.159068653554634, + 47.325440650358736, + 47.304702743784624, + 41.95305830151048, + 46.32090634094613, + 6.205841232502227, + 45.21525043209204, + 46.68630635575757, + 6.014917714514858, + 16.99660741175496, + 46.04707312586917, + 42.19662106675615, + 45.454018018858854, + 47.15352407193948, + 46.93603762078255, + 46.83396897378934, + 47.15013333226566, + 46.77541231643884, + 47.24502443147304, + 42.759813321329425, + 47.001201569266215, + 6.192232905623395, + 47.13098385966453, + 47.01234120088298, + 46.79153288884898, + 46.373378014241005, + 15.754365078113269, + 5.8675558701311985, + 45.42074545020536, + 6.176488223442546, + 47.27337589918247, + 46.90578973015155, + 47.16448140788897, + 47.56000914081759, + 46.62586586855627, + 41.982557140496446, + 16.770559660054925, + 47.00638722437522 ] -} +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json index 221abd48c74..a47b94faa75 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.04733, "2": 11.03572, - "3": 9.5878, - "4": 9.25791, - "5": 9.51585, - "6": 9.91425, - "7": 9.49022, - "8": 8.94619, - "9": 8.65195, - "10": 9.06313, - "11": 8.49654, - "12": 8.52749, - "13": 8.45919, - "14": 7.99341, - "15": 8.05353, - "16": 8.08327, - "17": 8.10021, - "18": 7.77408, - "19": 8.14992, - "20": 7.89646, - "21": 7.60027, - "22": 7.55248, - "23": 7.43137, - "24": 7.43223, - "25": 7.68057, - "26": 7.07422, - "27": 7.62201, - "28": 7.33353, - "29": 7.49795, - "30": 7.64414, - "31": 7.39519, - "32": 7.59013, - "33": 7.64569, - "34": 7.70593, - "35": 7.2143, - "36": 7.08788, - "37": 7.43168, - "38": 7.19723, - "39": 7.55557, - "40": 7.54844, - "41": 7.49611, - "42": 7.25383, - "43": 7.23801, - "44": 7.42036, - "45": 7.19742, - "46": 6.90447, - "47": 7.30251, - "48": 7.14379, - "49": 7.59525, - "50": 7.04023 + "3": 9.58776, + "4": 9.25801, + "5": 9.53164, + "6": 9.90992, + "7": 9.48661, + "8": 8.93947, + "9": 8.65725, + "10": 9.0567, + "11": 8.49436, + "12": 8.52422, + "13": 8.45295, + "14": 7.97674, + "15": 8.04629, + "16": 8.08024, + "17": 8.08398, + "18": 7.76141, + "19": 8.15001, + "20": 7.89339, + "21": 7.58212, + "22": 7.54491, + "23": 7.43428, + "24": 7.42622, + "25": 7.67267, + "26": 7.07291, + "27": 7.61503, + "28": 7.31789, + "29": 7.48965, + "30": 7.64357, + "31": 7.3927, + "32": 7.58407, + "33": 7.63624, + "34": 7.69746, + "35": 7.21377, + "36": 7.08367, + "37": 7.4245, + "38": 7.18783, + "39": 7.5498, + "40": 7.54133, + "41": 7.48816, + "42": 7.24677, + "43": 7.23194, + "44": 7.41471, + "45": 7.18838, + "46": 6.89674, + "47": 7.29904, + "48": 7.13855, + "49": 7.58882, + "50": 7.03386 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802616.0, - "2": 38543540.0, - "3": 38741560.0, - "4": 273652640.0, - "5": 246619984.0, - "6": 255713984.0, - "7": 585904576.0, - "8": 775188544.0, - "9": 683552384.0, - "10": 678184384.0, - "11": 709420544.0, - "12": 771913024.0, - "13": 884572992.0, - "14": 805905152.0, - "15": 771490816.0, - "16": 932248832.0, - "17": 721261824.0, - "18": 683711296.0, - "19": 963724352.0, - "20": 998655872.0, - "21": 756360320.0, - "22": 969720704.0, - "23": 762708416.0, - "24": 889305088.0, - "25": 865191296.0, - "26": 828440320.0, - "27": 806905024.0, - "28": 837449408.0, - "29": 783497856.0, - "30": 772494272.0, - "31": 793774528.0, - "32": 774902528.0, - "33": 752992128.0, - "34": 721632000.0, - "35": 728225216.0, - "36": 542603008.0, - "37": 723530816.0, - "38": 677573184.0, - "39": 686397568.0, - "40": 651324224.0, - "41": 604614656.0, - "42": 582812544.0, - "43": 564189760.0, - "44": 569972864.0, - "45": 536820928.0, - "46": 334504672.0, - "47": 494444000.0, - "48": 504118016.0, - "49": 475199808.0, - "50": 350261056.0 + "1": 38802552.0, + "2": 38543496.0, + "3": 38742496.0, + "4": 276808768.0, + "5": 252900224.0, + "6": 262014400.0, + "7": 604765376.0, + "8": 778329280.0, + "9": 664674944.0, + "10": 728521920.0, + "11": 718868480.0, + "12": 787622592.0, + "13": 900296192.0, + "14": 831151488.0, + "15": 762029184.0, + "16": 938532864.0, + "17": 633234048.0, + "18": 708920704.0, + "19": 976315584.0, + "20": 986060288.0, + "21": 781551744.0, + "22": 762139648.0, + "23": 888477824.0, + "24": 851552512.0, + "25": 827443072.0, + "26": 812721088.0, + "27": 806914304.0, + "28": 802850496.0, + "29": 748894592.0, + "30": 731604672.0, + "31": 752878144.0, + "32": 762315520.0, + "33": 737258304.0, + "34": 746789888.0, + "35": 734508928.0, + "36": 674695808.0, + "37": 673198208.0, + "38": 633526912.0, + "39": 620340928.0, + "40": 613575552.0, + "41": 566869312.0, + "42": 557646592.0, + "43": 554752576.0, + "44": 547950784.0, + "45": 527374464.0, + "46": 347107200.0, + "47": 497586496.0, + "48": 497828864.0, + "49": 465758912.0, + "50": 450885792.0 } }, "mem-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 55051542528.0, - "2": 57803964416.0, - "3": 57918414848.0, - "4": 57918414848.0, - "5": 57918414848.0, - "6": 57918414848.0, - "7": 57918414848.0, - "8": 57918414848.0, - "9": 57918414848.0, - "10": 57918414848.0, - "11": 57918414848.0, - "12": 57918414848.0, - "13": 57918414848.0, - "14": 57918414848.0, - "15": 57918414848.0, - "16": 57918414848.0, - "17": 57918414848.0, - "18": 57918414848.0, - "19": 57918414848.0, - "20": 57918414848.0, - "21": 57918414848.0, - "22": 57918414848.0, - "23": 57918414848.0, - "24": 57918414848.0, - "25": 57918414848.0, - "26": 57918414848.0, - "27": 57918414848.0, - "28": 57918414848.0, - "29": 57918414848.0, - "30": 57918414848.0, - "31": 57918414848.0, - "32": 57918414848.0, - "33": 57918414848.0, - "34": 57918414848.0, - "35": 57918414848.0, - "36": 57918414848.0, - "37": 57918414848.0, - "38": 57918414848.0, - "39": 57918414848.0, - "40": 57918414848.0, - "41": 57918414848.0, - "42": 57918414848.0, - "43": 57918414848.0, - "44": 57981075456.0, - "45": 58164338688.0, - "46": 58164338688.0, - "47": 58164338688.0, - "48": 58164338688.0, - "49": 58164338688.0, - "50": 58164338688.0 + "1": 54204293120.0, + "2": 56956715008.0, + "3": 57074692096.0, + "4": 57074692096.0, + "5": 57074692096.0, + "6": 57074692096.0, + "7": 57074692096.0, + "8": 57074692096.0, + "9": 57074692096.0, + "10": 57074692096.0, + "11": 57074692096.0, + "12": 57074692096.0, + "13": 57074692096.0, + "14": 57074692096.0, + "15": 57074692096.0, + "16": 57074692096.0, + "17": 57074692096.0, + "18": 57074692096.0, + "19": 57074692096.0, + "20": 57074692096.0, + "21": 57074692096.0, + "22": 57074692096.0, + "23": 57074692096.0, + "24": 57074692096.0, + "25": 57074692096.0, + "26": 57211289600.0, + "27": 57211289600.0, + "28": 57211289600.0, + "29": 57368535040.0, + "30": 57742073856.0, + "31": 57742073856.0, + "32": 57742073856.0, + "33": 57742073856.0, + "34": 57744101376.0, + "35": 58293194752.0, + "36": 58293194752.0, + "37": 58293194752.0, + "38": 58293194752.0, + "39": 58293194752.0, + "40": 58293194752.0, + "41": 58293194752.0, + "42": 58293194752.0, + "43": 58293194752.0, + "44": 58293194752.0, + "45": 58293194752.0, + "46": 58293194752.0, + "47": 58293194752.0, + "48": 58293194752.0, + "49": 58293194752.0, + "50": 58293194752.0 } }, "mtp_1 loss": { @@ -234,54 +234,54 @@ "values": { "1": 11.0765, "2": 11.07404, - "3": 10.5387, - "4": 10.09807, - "5": 9.81158, - "6": 10.07371, - "7": 9.79765, - "8": 9.06972, - "9": 8.86823, - "10": 9.12665, - "11": 8.49944, - "12": 8.5346, - "13": 8.42954, - "14": 7.8522, - "15": 7.99476, - "16": 8.05407, - "17": 8.0055, - "18": 7.73795, - "19": 8.11808, - "20": 7.83141, - "21": 7.53056, - "22": 7.50549, - "23": 7.37363, - "24": 7.37845, - "25": 7.62115, - "26": 7.02061, - "27": 7.5605, - "28": 7.2695, - "29": 7.44668, - "30": 7.58971, - "31": 7.32847, - "32": 7.50861, - "33": 7.57687, - "34": 7.63939, - "35": 7.15634, - "36": 7.02394, - "37": 7.35539, - "38": 7.13177, - "39": 7.49132, - "40": 7.47677, - "41": 7.42456, - "42": 7.1802, - "43": 7.16487, - "44": 7.34808, - "45": 7.12903, - "46": 6.83012, - "47": 7.2395, - "48": 7.08268, - "49": 7.51404, - "50": 6.97693 + "3": 10.53863, + "4": 10.0981, + "5": 9.81152, + "6": 10.0744, + "7": 9.79944, + "8": 9.07176, + "9": 8.87116, + "10": 9.12759, + "11": 8.49894, + "12": 8.53114, + "13": 8.42531, + "14": 7.84784, + "15": 7.99147, + "16": 8.05102, + "17": 8.00126, + "18": 7.73217, + "19": 8.11102, + "20": 7.83055, + "21": 7.52608, + "22": 7.49979, + "23": 7.37315, + "24": 7.37265, + "25": 7.61392, + "26": 7.01833, + "27": 7.55877, + "28": 7.26822, + "29": 7.44363, + "30": 7.58581, + "31": 7.3265, + "32": 7.50876, + "33": 7.57264, + "34": 7.63783, + "35": 7.15428, + "36": 7.02086, + "37": 7.35313, + "38": 7.12909, + "39": 7.48882, + "40": 7.47518, + "41": 7.42231, + "42": 7.17726, + "43": 7.16243, + "44": 7.34345, + "45": 7.12344, + "46": 6.8279, + "47": 7.23665, + "48": 7.08061, + "49": 7.51184, + "50": 6.9731 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 90.94511, - "2": 1.54793, - "3": 1.33035, - "4": 2.25969, - "5": 1.82487, - "6": 1.71972, - "7": 2.15404, - "8": 1.61956, - "9": 1.77326, - "10": 1.72086, - "11": 1.01952, - "12": 1.02588, - "13": 1.02874, - "14": 1.02703, - "15": 1.03114, - "16": 1.03244, - "17": 1.03532, - "18": 1.04017, - "19": 1.03111, - "20": 1.03139, - "21": 1.03293, - "22": 1.03136, - "23": 1.03187, - "24": 1.0297, - "25": 1.03561, - "26": 1.5512, - "27": 1.03857, - "28": 1.02247, - "29": 1.03252, - "30": 1.02351, - "31": 1.02701, - "32": 1.0267, - "33": 1.02921, - "34": 1.02405, - "35": 1.02405, - "36": 1.04177, - "37": 1.0449, - "38": 1.04688, - "39": 1.05181, - "40": 1.04378, - "41": 1.0421, - "42": 1.04502, - "43": 1.0336, - "44": 1.05112, - "45": 1.04838, - "46": 1.03386, - "47": 1.04806, - "48": 1.04195, - "49": 1.04121, - "50": 1.03797 + "1": 97.95665, + "2": 1.66988, + "3": 1.35644, + "4": 2.24552, + "5": 2.14285, + "6": 1.60272, + "7": 1.5113, + "8": 2.10932, + "9": 1.69738, + "10": 1.0561, + "11": 1.04064, + "12": 1.0335, + "13": 1.03186, + "14": 1.03406, + "15": 1.05897, + "16": 1.03516, + "17": 1.04396, + "18": 1.08073, + "19": 1.06079, + "20": 1.04178, + "21": 1.03726, + "22": 1.03706, + "23": 1.03878, + "24": 1.04111, + "25": 1.04952, + "26": 1.04497, + "27": 1.04672, + "28": 1.03793, + "29": 1.03092, + "30": 1.04813, + "31": 1.03205, + "32": 1.03729, + "33": 1.02557, + "34": 1.03623, + "35": 1.04247, + "36": 1.03261, + "37": 1.03911, + "38": 1.04764, + "39": 1.0376, + "40": 1.04918, + "41": 1.03907, + "42": 1.05227, + "43": 1.04186, + "44": 1.04266, + "45": 1.03786, + "46": 1.04673, + "47": 1.05766, + "48": 1.04958, + "49": 1.05312, + "50": 1.05239 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json index 644d5284b7a..a76d8667ec6 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.0474, "2": 11.03765, - "3": 9.60584, - "4": 9.26463, - "5": 9.32776, - "6": 9.30982, - "7": 9.1645, - "8": 8.78939, - "9": 8.69677, - "10": 8.91589, - "11": 8.38321, - "12": 8.44094, - "13": 8.35341, - "14": 7.80742, - "15": 7.95516, - "16": 7.99761, - "17": 7.95082, - "18": 7.67707, - "19": 8.07009, - "20": 7.78393, - "21": 7.48374, - "22": 7.4799, - "23": 7.35056, - "24": 7.34597, - "25": 7.62236, - "26": 7.01653, - "27": 7.55175, - "28": 7.27173, - "29": 7.44209, - "30": 7.57394, - "31": 7.33713, - "32": 7.52234, - "33": 7.5745, - "34": 7.62003, - "35": 7.15235, - "36": 7.01753, - "37": 7.35428, - "38": 7.12808, - "39": 7.47832, - "40": 7.48784, - "41": 7.42289, - "42": 7.19117, - "43": 7.17856, - "44": 7.35808, - "45": 7.12045, - "46": 6.85278, - "47": 7.23963, - "48": 7.07274, - "49": 7.54922, - "50": 6.97811 + "3": 9.6074, + "4": 9.2648, + "5": 9.42291, + "6": 9.09511, + "7": 9.12753, + "8": 8.75686, + "9": 8.61627, + "10": 8.89295, + "11": 8.37933, + "12": 8.39932, + "13": 8.32626, + "14": 7.81437, + "15": 7.93661, + "16": 7.99492, + "17": 7.95458, + "18": 7.67733, + "19": 8.07234, + "20": 7.78815, + "21": 7.48342, + "22": 7.48177, + "23": 7.34879, + "24": 7.34465, + "25": 7.61117, + "26": 7.01605, + "27": 7.54878, + "28": 7.26655, + "29": 7.43507, + "30": 7.56529, + "31": 7.32669, + "32": 7.50645, + "33": 7.5577, + "34": 7.60977, + "35": 7.14607, + "36": 7.00597, + "37": 7.34071, + "38": 7.11796, + "39": 7.46649, + "40": 7.47443, + "41": 7.41032, + "42": 7.17365, + "43": 7.16495, + "44": 7.34265, + "45": 7.10918, + "46": 6.83934, + "47": 7.22335, + "48": 7.05732, + "49": 7.53394, + "50": 6.95951 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802592.0, - "2": 38543572.0, - "3": 38743144.0, - "4": 270609984.0, - "5": 224754048.0, - "6": 372389344.0, - "7": 598920768.0, - "8": 850687488.0, - "9": 708853952.0, - "10": 684582272.0, - "11": 621544192.0, - "12": 630341056.0, - "13": 639368448.0, - "14": 548278592.0, - "15": 617425984.0, - "16": 702795968.0, - "17": 567344064.0, - "18": 589440000.0, - "19": 630362240.0, - "20": 669614592.0, - "21": 564495744.0, - "22": 586578304.0, - "23": 542928576.0, - "24": 511907552.0, - "25": 547508864.0, - "26": 661787712.0, - "27": 479817696.0, - "28": 466314688.0, - "29": 491018048.0, - "30": 470632640.0, - "31": 623908992.0, - "32": 523373440.0, - "33": 435529664.0, - "34": 405444992.0, - "35": 489248416.0, - "36": 322730176.0, - "37": 339782720.0, - "38": 281398720.0, - "39": 249171440.0, - "40": 343532416.0, - "41": 400160576.0, - "42": 384640608.0, - "43": 378621824.0, - "44": 374955616.0, - "45": 241150752.0, - "46": 340828096.0, - "47": 280778400.0, - "48": 284051968.0, - "49": 173319200.0, - "50": 197102384.0 + "1": 38802536.0, + "2": 38543540.0, + "3": 38739408.0, + "4": 273756736.0, + "5": 205853584.0, + "6": 284244640.0, + "7": 652227968.0, + "8": 790994816.0, + "9": 762295424.0, + "10": 665870592.0, + "11": 618336384.0, + "12": 639816192.0, + "13": 699169600.0, + "14": 620502464.0, + "15": 623699456.0, + "16": 847396864.0, + "17": 601834432.0, + "18": 642855744.0, + "19": 668078912.0, + "20": 574651008.0, + "21": 608590080.0, + "22": 599821504.0, + "23": 558380672.0, + "24": 688014720.0, + "25": 500623296.0, + "26": 532887808.0, + "27": 506526976.0, + "28": 450900800.0, + "29": 528748480.0, + "30": 445603872.0, + "31": 457250368.0, + "32": 400653888.0, + "33": 347460640.0, + "34": 268919904.0, + "35": 495515584.0, + "36": 332139008.0, + "37": 446760768.0, + "38": 391328576.0, + "39": 378290400.0, + "40": 261331328.0, + "41": 368680832.0, + "42": 337485280.0, + "43": 337755968.0, + "44": 324657920.0, + "45": 216104608.0, + "46": 218159872.0, + "47": 302569184.0, + "48": 296505312.0, + "49": 280170176.0, + "50": 268486912.0 } }, "mem-allocated-bytes": { @@ -198,33 +198,33 @@ "21": 56295710720.0, "22": 56295710720.0, "23": 56295710720.0, - "24": 56295710720.0, - "25": 56502132736.0, - "26": 56578957312.0, - "27": 57159032832.0, - "28": 57159032832.0, - "29": 57159032832.0, - "30": 57159032832.0, - "31": 57159032832.0, - "32": 57159032832.0, - "33": 57159032832.0, - "34": 57159032832.0, - "35": 57159032832.0, - "36": 57159032832.0, - "37": 57159032832.0, - "38": 57159032832.0, - "39": 57159032832.0, - "40": 57159032832.0, - "41": 57159032832.0, - "42": 57296633856.0, - "43": 57314361344.0, - "44": 57498943488.0, - "45": 57649999872.0, - "46": 57649999872.0, - "47": 57649999872.0, - "48": 57649999872.0, - "49": 57649999872.0, - "50": 57649999872.0 + "24": 56738553856.0, + "25": 56738553856.0, + "26": 56777162752.0, + "27": 56777162752.0, + "28": 56777162752.0, + "29": 56777162752.0, + "30": 56777162752.0, + "31": 56777162752.0, + "32": 56777162752.0, + "33": 56777162752.0, + "34": 56824344576.0, + "35": 57080135680.0, + "36": 57331695616.0, + "37": 57331695616.0, + "38": 57577013248.0, + "39": 57577013248.0, + "40": 57577013248.0, + "41": 57577013248.0, + "42": 57577013248.0, + "43": 57587191808.0, + "44": 57596944384.0, + "45": 57705652224.0, + "46": 57790390272.0, + "47": 57790390272.0, + "48": 57790390272.0, + "49": 57790390272.0, + "50": 57790390272.0 } }, "mtp_1 loss": { @@ -234,54 +234,54 @@ "values": { "1": 11.07756, "2": 11.07651, - "3": 10.53059, - "4": 10.08643, - "5": 9.86147, - "6": 9.55598, - "7": 9.64192, - "8": 8.9278, - "9": 8.73566, - "10": 9.03281, - "11": 8.40329, - "12": 8.42578, - "13": 8.32864, - "14": 7.77688, - "15": 7.92204, - "16": 7.97443, - "17": 7.92322, - "18": 7.65613, - "19": 8.04247, - "20": 7.76026, - "21": 7.44933, - "22": 7.43739, - "23": 7.31015, - "24": 7.31285, - "25": 7.56522, - "26": 6.97802, - "27": 7.50958, - "28": 7.22284, - "29": 7.40631, - "30": 7.53948, - "31": 7.2872, - "32": 7.474, - "33": 7.53734, - "34": 7.59617, - "35": 7.12168, - "36": 6.98902, - "37": 7.32682, - "38": 7.10026, - "39": 7.4584, - "40": 7.44943, - "41": 7.39421, - "42": 7.15113, - "43": 7.13405, - "44": 7.31917, - "45": 7.09081, - "46": 6.80653, - "47": 7.21079, - "48": 7.0516, - "49": 7.48755, - "50": 6.95113 + "3": 10.53063, + "4": 10.08611, + "5": 9.87524, + "6": 9.55366, + "7": 9.62345, + "8": 8.91012, + "9": 8.72228, + "10": 9.02504, + "11": 8.39501, + "12": 8.42504, + "13": 8.32334, + "14": 7.76976, + "15": 7.91789, + "16": 7.97018, + "17": 7.92051, + "18": 7.65266, + "19": 8.0377, + "20": 7.76074, + "21": 7.44752, + "22": 7.43657, + "23": 7.30984, + "24": 7.31186, + "25": 7.56562, + "26": 6.97201, + "27": 7.50933, + "28": 7.2266, + "29": 7.40633, + "30": 7.53569, + "31": 7.28904, + "32": 7.47424, + "33": 7.53526, + "34": 7.59404, + "35": 7.11968, + "36": 6.9867, + "37": 7.32338, + "38": 7.09605, + "39": 7.45524, + "40": 7.44706, + "41": 7.39271, + "42": 7.14573, + "43": 7.13128, + "44": 7.31399, + "45": 7.08836, + "46": 6.80158, + "47": 7.2062, + "48": 7.0468, + "49": 7.47982, + "50": 6.94494 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 97.6542, - "2": 1.64943, - "3": 1.32578, - "4": 1.75905, - "5": 1.13768, - "6": 1.90299, - "7": 1.09961, - "8": 1.09819, - "9": 1.09778, - "10": 1.11461, - "11": 1.09709, - "12": 1.10879, - "13": 1.11446, - "14": 1.10227, - "15": 1.10064, - "16": 1.10154, - "17": 1.10307, - "18": 1.11422, - "19": 1.11171, - "20": 1.10785, - "21": 1.10391, - "22": 1.10739, - "23": 1.09617, - "24": 1.09808, - "25": 1.10211, - "26": 1.09861, - "27": 1.11235, - "28": 1.10628, - "29": 1.08834, - "30": 1.08904, - "31": 1.09002, - "32": 1.08833, - "33": 1.08496, - "34": 1.09187, - "35": 1.09656, - "36": 1.0944, - "37": 1.0819, - "38": 1.08992, - "39": 1.10447, - "40": 1.08684, - "41": 1.0921, - "42": 1.10087, - "43": 1.09566, - "44": 1.08789, - "45": 1.09029, - "46": 1.08534, - "47": 1.08796, - "48": 1.10222, - "49": 1.09817, - "50": 1.07925 + "1": 102.52307, + "2": 1.75305, + "3": 1.36681, + "4": 1.62808, + "5": 1.13714, + "6": 1.45805, + "7": 1.6121, + "8": 1.20031, + "9": 1.09784, + "10": 1.10383, + "11": 1.10878, + "12": 1.18093, + "13": 1.43808, + "14": 1.17223, + "15": 1.11575, + "16": 1.1159, + "17": 1.11727, + "18": 1.10751, + "19": 1.11189, + "20": 1.1082, + "21": 1.10459, + "22": 1.11252, + "23": 1.10744, + "24": 1.12218, + "25": 1.09823, + "26": 1.11657, + "27": 1.08949, + "28": 1.10254, + "29": 1.10189, + "30": 1.08963, + "31": 1.10454, + "32": 1.09654, + "33": 1.08747, + "34": 1.09674, + "35": 1.09106, + "36": 1.08904, + "37": 1.1178, + "38": 1.09379, + "39": 1.10306, + "40": 1.09998, + "41": 1.08808, + "42": 1.0941, + "43": 1.0919, + "44": 1.0813, + "45": 1.08715, + "46": 1.07061, + "47": 1.07098, + "48": 1.07438, + "49": 1.07469, + "50": 1.0719 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json index f50f32bf276..c55faf839a8 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 10.94944, "2": 10.95158, - "3": 10.50291, - "4": 9.96373, - "5": 9.94051, - "6": 9.67323, - "7": 10.22821, - "8": 9.49736, - "9": 9.54323, - "10": 9.79347, + "3": 10.50318, + "4": 9.964, + "5": 9.94016, + "6": 9.67332, + "7": 10.23184, + "8": 9.4965, + "9": 9.54631, + "10": 9.79388, "11": 9.3003, - "12": 9.40372, - "13": 9.39468, - "14": 8.84935, - "15": 9.02277, - "16": 9.06983, - "17": 9.04403, - "18": 8.75568, - "19": 9.17822, - "20": 8.86078, - "21": 8.53542, - "22": 8.54991, - "23": 8.42524, - "24": 8.37607, - "25": 8.63809, - "26": 7.96681, - "27": 8.57149, - "28": 8.19023, - "29": 8.39544, - "30": 8.67048, - "31": 8.28487, - "32": 8.43358, - "33": 8.55518, - "34": 8.65834, - "35": 8.07752, - "36": 7.94541, - "37": 8.29246, - "38": 7.97753, - "39": 8.38915, - "40": 8.35513, - "41": 8.31736, - "42": 8.05606, - "43": 8.03035, - "44": 8.23838, - "45": 8.09696, - "46": 7.61491, - "47": 8.15046, - "48": 8.0039, - "49": 8.38371, - "50": 7.81253 + "12": 9.40451, + "13": 9.39562, + "14": 8.8513, + "15": 9.02474, + "16": 9.07111, + "17": 9.04534, + "18": 8.75805, + "19": 9.1794, + "20": 8.86325, + "21": 8.5391, + "22": 8.55134, + "23": 8.42688, + "24": 8.38109, + "25": 8.63783, + "26": 7.96861, + "27": 8.57603, + "28": 8.1922, + "29": 8.3971, + "30": 8.67285, + "31": 8.28458, + "32": 8.43378, + "33": 8.55597, + "34": 8.65985, + "35": 8.07899, + "36": 7.94715, + "37": 8.29413, + "38": 7.97958, + "39": 8.39117, + "40": 8.35496, + "41": 8.31782, + "42": 8.05717, + "43": 8.03152, + "44": 8.24042, + "45": 8.0999, + "46": 7.61677, + "47": 8.15178, + "48": 8.00508, + "49": 8.38458, + "50": 7.81369 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 19403592.0, - "2": 19274176.0, - "3": 20945222.0, - "4": 89687760.0, - "5": 151693248.0, - "6": 138938096.0, - "7": 164021920.0, - "8": 198936768.0, - "9": 160969488.0, - "10": 159820768.0, - "11": 216424656.0, - "12": 209851488.0, - "13": 225333088.0, - "14": 222140112.0, - "15": 231619680.0, - "16": 216080960.0, - "17": 288314816.0, - "18": 170463296.0, - "19": 167479232.0, - "20": 178590448.0, - "21": 241500624.0, - "22": 220658528.0, - "23": 197474784.0, - "24": 226071040.0, - "25": 237749008.0, - "26": 288417664.0, - "27": 232076720.0, - "28": 286654304.0, - "29": 258070544.0, - "30": 214923920.0, - "31": 241275712.0, - "32": 214510896.0, - "33": 203527888.0, - "34": 228752368.0, - "35": 194293392.0, - "36": 236711744.0, - "37": 162157968.0, - "38": 225545168.0, - "39": 214299328.0, - "40": 218746384.0, - "41": 163931104.0, - "42": 162458624.0, - "43": 192453632.0, - "44": 149739552.0, - "45": 175646608.0, - "46": 129510480.0, - "47": 170153408.0, - "48": 157697168.0, - "49": 92955200.0, - "50": 157824256.0 + "1": 19403652.0, + "2": 19274102.0, + "3": 19373168.0, + "4": 86562120.0, + "5": 151677296.0, + "6": 142091232.0, + "7": 167132032.0, + "8": 197337088.0, + "9": 168836496.0, + "10": 162963792.0, + "11": 211653824.0, + "12": 214575616.0, + "13": 231549168.0, + "14": 220571728.0, + "15": 250508240.0, + "16": 168968368.0, + "17": 294610112.0, + "18": 167327952.0, + "19": 156385504.0, + "20": 177007072.0, + "21": 219468816.0, + "22": 217511168.0, + "23": 194318208.0, + "24": 208788192.0, + "25": 240820928.0, + "26": 250667072.0, + "27": 235205856.0, + "28": 285071552.0, + "29": 270668736.0, + "30": 241596448.0, + "31": 256938208.0, + "32": 252232640.0, + "33": 213058752.0, + "34": 217720576.0, + "35": 172316416.0, + "36": 246137120.0, + "37": 228162320.0, + "38": 238162048.0, + "39": 211207168.0, + "40": 206162560.0, + "41": 151397232.0, + "42": 206473424.0, + "43": 175165248.0, + "44": 182768560.0, + "45": 158317856.0, + "46": 159388704.0, + "47": 152897904.0, + "48": 143548896.0, + "49": 124357696.0, + "50": 151519648.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4876471296.0, - "2": 4876535296.0, - "3": 4875369984.0, - "4": 4874512896.0, - "5": 4874505728.0, - "6": 4876898816.0, - "7": 4875386368.0, - "8": 4876464640.0, - "9": 4876400128.0, - "10": 4877448704.0, - "11": 4876193280.0, - "12": 4874407424.0, - "13": 4875226624.0, - "14": 4875415040.0, - "15": 4876397056.0, - "16": 4877806080.0, - "17": 4876205568.0, - "18": 4876743168.0, - "19": 4875044352.0, - "20": 4877310464.0, - "21": 4875642368.0, - "22": 4874806784.0, - "23": 4875531776.0, - "24": 4878220800.0, - "25": 4875477504.0, - "26": 4877613568.0, - "27": 4875030016.0, - "28": 4875365888.0, - "29": 4876291584.0, - "30": 4876465664.0, - "31": 4874710528.0, - "32": 4875980288.0, - "33": 4874096128.0, - "34": 4875379200.0, - "35": 4875995648.0, - "36": 4876016128.0, - "37": 4874497536.0, - "38": 4875453952.0, - "39": 4875932160.0, - "40": 4876112384.0, - "41": 4875683328.0, - "42": 4877188608.0, - "43": 4875977216.0, - "44": 4878347776.0, - "45": 4876845568.0, - "46": 4875212288.0, - "47": 4876330496.0, - "48": 4875971072.0, - "49": 4875368960.0, - "50": 4875349504.0 + "1": 4875597824.0, + "2": 4875363840.0, + "3": 4874979840.0, + "4": 4874899968.0, + "5": 4875749888.0, + "6": 4876656128.0, + "7": 4875178496.0, + "8": 4874036736.0, + "9": 4876568064.0, + "10": 4876058112.0, + "11": 4876045824.0, + "12": 4874515968.0, + "13": 4875086336.0, + "14": 4874568192.0, + "15": 4875987456.0, + "16": 4874790400.0, + "17": 4875477504.0, + "18": 4875512320.0, + "19": 4876186112.0, + "20": 4875747840.0, + "21": 4874790400.0, + "22": 4876221952.0, + "23": 4874534400.0, + "24": 4875733504.0, + "25": 4875019776.0, + "26": 4875168256.0, + "27": 4874978816.0, + "28": 4875781632.0, + "29": 4876329472.0, + "30": 4875107840.0, + "31": 4874253824.0, + "32": 4874167808.0, + "33": 4876044800.0, + "34": 4875914752.0, + "35": 4874962432.0, + "36": 4875862528.0, + "37": 4877336064.0, + "38": 4875002368.0, + "39": 4874599936.0, + "40": 4874880512.0, + "41": 4875294208.0, + "42": 4875419136.0, + "43": 4875780608.0, + "44": 4874780160.0, + "45": 4875191808.0, + "46": 4875717120.0, + "47": 4874050048.0, + "48": 4875580928.0, + "49": 4875412992.0, + "50": 4875462144.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41199984640.0, - "2": 41199984640.0, - "3": 41199984640.0, - "4": 41199984640.0, - "5": 41199984640.0, - "6": 41199984640.0, - "7": 41199984640.0, - "8": 41199984640.0, - "9": 41199984640.0, - "10": 41199984640.0, - "11": 41199984640.0, - "12": 41199984640.0, - "13": 41199984640.0, - "14": 41199984640.0, - "15": 41199984640.0, - "16": 41199984640.0, - "17": 41199984640.0, - "18": 41199984640.0, - "19": 41199984640.0, - "20": 41199984640.0, - "21": 41199984640.0, - "22": 41199984640.0, - "23": 41199984640.0, - "24": 41199984640.0, - "25": 41199984640.0, - "26": 41199984640.0, - "27": 41199984640.0, - "28": 41199984640.0, - "29": 41199984640.0, - "30": 41199984640.0, - "31": 41199984640.0, - "32": 41199984640.0, - "33": 41199984640.0, - "34": 41199984640.0, - "35": 41199984640.0, - "36": 41199984640.0, - "37": 41199984640.0, - "38": 41199984640.0, - "39": 41199984640.0, - "40": 41199984640.0, - "41": 41199984640.0, - "42": 41199984640.0, - "43": 41199984640.0, - "44": 41199984640.0, - "45": 41199984640.0, - "46": 41199984640.0, - "47": 41199984640.0, - "48": 41199984640.0, - "49": 41199984640.0, - "50": 41199984640.0 + "1": 41201033216.0, + "2": 41201033216.0, + "3": 41201033216.0, + "4": 41201033216.0, + "5": 41201033216.0, + "6": 41201033216.0, + "7": 41201033216.0, + "8": 41201033216.0, + "9": 41201033216.0, + "10": 41201033216.0, + "11": 41201033216.0, + "12": 41201033216.0, + "13": 41201033216.0, + "14": 41201033216.0, + "15": 41201033216.0, + "16": 41201033216.0, + "17": 41201033216.0, + "18": 41201033216.0, + "19": 41201033216.0, + "20": 41201033216.0, + "21": 41201033216.0, + "22": 41201033216.0, + "23": 41201033216.0, + "24": 41201033216.0, + "25": 41201033216.0, + "26": 41201033216.0, + "27": 41201033216.0, + "28": 41201033216.0, + "29": 41201033216.0, + "30": 41201033216.0, + "31": 41201033216.0, + "32": 41201033216.0, + "33": 41201033216.0, + "34": 41201033216.0, + "35": 41201033216.0, + "36": 41201033216.0, + "37": 41201033216.0, + "38": 41201033216.0, + "39": 41201033216.0, + "40": 41201033216.0, + "41": 41201033216.0, + "42": 41201033216.0, + "43": 41201033216.0, + "44": 41201033216.0, + "45": 41201033216.0, + "46": 41201033216.0, + "47": 41201033216.0, + "48": 41201033216.0, + "49": 41201033216.0, + "50": 41201033216.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 86.59245, - "2": 1.11188, - "3": 0.94659, - "4": 0.89686, - "5": 1.40432, - "6": 1.06239, - "7": 1.03181, - "8": 1.07838, - "9": 0.88529, - "10": 0.87346, - "11": 0.9764, - "12": 0.87397, - "13": 0.87922, - "14": 0.87464, - "15": 0.86356, - "16": 0.88539, - "17": 0.86198, - "18": 0.86676, - "19": 0.85335, - "20": 0.85904, - "21": 0.84697, - "22": 0.84984, - "23": 0.84683, - "24": 0.85172, - "25": 0.84975, - "26": 0.86347, - "27": 0.86726, - "28": 0.84853, - "29": 0.84946, - "30": 0.85197, - "31": 0.85026, - "32": 0.84681, - "33": 0.84571, - "34": 0.85295, - "35": 0.8568, - "36": 0.84946, - "37": 0.8495, - "38": 0.84754, - "39": 0.85264, - "40": 0.8452, - "41": 0.84517, - "42": 0.84876, - "43": 0.84152, - "44": 0.84772, - "45": 0.84803, - "46": 0.84148, - "47": 0.84697, - "48": 0.84232, - "49": 0.84236, - "50": 0.84249 + "1": 84.85893, + "2": 1.16099, + "3": 0.98814, + "4": 0.90006, + "5": 1.44704, + "6": 1.12424, + "7": 1.08423, + "8": 1.07558, + "9": 1.1513, + "10": 0.88417, + "11": 1.07532, + "12": 0.88519, + "13": 0.87318, + "14": 0.87758, + "15": 0.87276, + "16": 0.8776, + "17": 0.86863, + "18": 0.87011, + "19": 0.86845, + "20": 0.86617, + "21": 0.85521, + "22": 0.86783, + "23": 0.86126, + "24": 0.85746, + "25": 0.85758, + "26": 0.86093, + "27": 0.85634, + "28": 0.85365, + "29": 0.86147, + "30": 0.86891, + "31": 0.85512, + "32": 0.85344, + "33": 0.85409, + "34": 0.85597, + "35": 0.85605, + "36": 0.84565, + "37": 0.84908, + "38": 0.85623, + "39": 0.8586, + "40": 0.87856, + "41": 0.85187, + "42": 0.86298, + "43": 0.85814, + "44": 0.85706, + "45": 0.85473, + "46": 0.85417, + "47": 0.85861, + "48": 0.85261, + "49": 0.85118, + "50": 0.84383 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json index 51e9d7154c9..bc1062ce151 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.06693, "2": 11.0602, - "3": 10.21167, - "4": 9.95277, - "5": 10.12388, - "6": 8.82369, - "7": 9.52785, - "8": 8.44289, - "9": 7.85041, - "10": 7.07093, - "11": 9.28562, - "12": 9.13324, - "13": 7.86224, - "14": 8.19705, - "15": 8.22932, - "16": 8.17783, - "17": 8.2161, - "18": 7.50358, - "19": 8.08893, - "20": 7.64905, - "21": 7.95183, - "22": 7.29849, - "23": 7.93348, - "24": 7.43565, - "25": 8.2385, - "26": 7.75634, - "27": 7.70075, - "28": 7.66089, - "29": 7.75606, - "30": 7.56072, - "31": 7.81859, - "32": 6.46861, - "33": 7.20532, - "34": 7.77706, - "35": 7.73113, - "36": 6.72448, - "37": 8.09344, - "38": 7.62008, - "39": 7.96872, - "40": 7.4992, - "41": 7.49916, - "42": 6.11993, - "43": 7.59389, - "44": 7.91482, - "45": 6.83633, - "46": 7.41335, - "47": 7.78887, - "48": 7.87666, - "49": 7.58746, - "50": 6.84352 + "3": 10.21173, + "4": 9.95255, + "5": 10.12502, + "6": 8.8231, + "7": 9.52825, + "8": 8.44297, + "9": 7.84977, + "10": 7.0728, + "11": 9.30154, + "12": 9.14531, + "13": 7.86583, + "14": 8.21069, + "15": 8.2169, + "16": 8.17413, + "17": 8.21514, + "18": 7.49348, + "19": 8.08414, + "20": 7.63479, + "21": 7.95116, + "22": 7.29475, + "23": 7.9358, + "24": 7.43073, + "25": 8.23819, + "26": 7.75508, + "27": 7.6991, + "28": 7.65492, + "29": 7.75272, + "30": 7.56401, + "31": 7.81794, + "32": 6.46781, + "33": 7.20433, + "34": 7.77611, + "35": 7.72648, + "36": 6.71848, + "37": 8.09106, + "38": 7.61823, + "39": 7.96665, + "40": 7.49555, + "41": 7.49366, + "42": 6.10456, + "43": 7.59158, + "44": 7.91315, + "45": 6.83253, + "46": 7.4064, + "47": 7.78787, + "48": 7.87227, + "49": 7.58424, + "50": 6.83739 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 47165160.0, - "2": 46897928.0, - "3": 52684380.0, - "4": 297108064.0, - "5": 556667648.0, - "6": 661861120.0, - "7": 1027446592.0, - "8": 742822528.0, - "9": 846651648.0, - "10": 693167680.0, - "11": 826875520.0, - "12": 814304768.0, - "13": 642608768.0, - "14": 606554752.0, - "15": 728814528.0, - "16": 845696384.0, - "17": 667529728.0, - "18": 673504384.0, - "19": 889544960.0, - "20": 890696768.0, - "21": 676302464.0, - "22": 688965120.0, - "23": 789972480.0, - "24": 761249536.0, - "25": 648185280.0, - "26": 789507392.0, - "27": 641355648.0, - "28": 805511168.0, - "29": 773780224.0, - "30": 811888960.0, - "31": 688167744.0, - "32": 834871424.0, - "33": 792944256.0, - "34": 777109568.0, - "35": 763515136.0, - "36": 733607744.0, - "37": 743626240.0, - "38": 746577024.0, - "39": 732972864.0, - "40": 735645696.0, - "41": 556711680.0, - "42": 680528384.0, - "43": 669752960.0, - "44": 667702912.0, - "45": 635197248.0, - "46": 629093120.0, - "47": 626713344.0, - "48": 600843456.0, - "49": 581506752.0, - "50": 572705728.0 + "1": 47165248.0, + "2": 46897896.0, + "3": 52684328.0, + "4": 297102368.0, + "5": 569266880.0, + "6": 661848704.0, + "7": 1027448384.0, + "8": 752263424.0, + "9": 852974912.0, + "10": 683720576.0, + "11": 833170624.0, + "12": 814312640.0, + "13": 639456320.0, + "14": 628553664.0, + "15": 706814592.0, + "16": 848848256.0, + "17": 676948992.0, + "18": 676681088.0, + "19": 892688576.0, + "20": 890700864.0, + "21": 676293696.0, + "22": 701562304.0, + "23": 796268224.0, + "24": 786414720.0, + "25": 667072192.0, + "26": 767487552.0, + "27": 773408512.0, + "28": 758333696.0, + "29": 770627840.0, + "30": 758410304.0, + "31": 644127616.0, + "32": 806561088.0, + "33": 811820352.0, + "34": 780254848.0, + "35": 757223808.0, + "36": 758778496.0, + "37": 753072832.0, + "38": 752875328.0, + "39": 767575744.0, + "40": 760803392.0, + "41": 742253440.0, + "42": 718278848.0, + "43": 676047424.0, + "44": 673998592.0, + "45": 635196864.0, + "46": 629090048.0, + "47": 623565376.0, + "48": 600849984.0, + "49": 578357504.0, + "50": 585291904.0 } }, "mem-allocated-bytes": { @@ -185,46 +185,46 @@ "8": 8233667072.0, "9": 8233667072.0, "10": 8233667072.0, - "11": 8262715904.0, - "12": 8262715904.0, - "13": 8262715904.0, - "14": 8262715904.0, - "15": 8262715904.0, - "16": 8268117504.0, - "17": 8288236032.0, - "18": 8288236032.0, - "19": 8288236032.0, - "20": 8288236032.0, - "21": 8288236032.0, - "22": 8299924992.0, - "23": 8302176768.0, - "24": 8302176768.0, - "25": 8302176768.0, - "26": 8302176768.0, - "27": 8302176768.0, - "28": 8302176768.0, - "29": 8302176768.0, - "30": 8302176768.0, - "31": 8302176768.0, - "32": 8302176768.0, - "33": 8302176768.0, - "34": 8302176768.0, - "35": 8302176768.0, - "36": 8302176768.0, - "37": 8302176768.0, - "38": 8313753088.0, - "39": 8313753088.0, - "40": 8313753088.0, - "41": 8313753088.0, - "42": 8313753088.0, - "43": 8313753088.0, - "44": 8313753088.0, - "45": 8313753088.0, - "46": 8313753088.0, - "47": 8313753088.0, - "48": 8313753088.0, - "49": 8313753088.0, - "50": 8313753088.0 + "11": 8262763008.0, + "12": 8262763008.0, + "13": 8262763008.0, + "14": 8262763008.0, + "15": 8262763008.0, + "16": 8273029632.0, + "17": 8282915328.0, + "18": 8282915328.0, + "19": 8284467712.0, + "20": 8294910464.0, + "21": 8294910464.0, + "22": 8303365632.0, + "23": 8303365632.0, + "24": 8303365632.0, + "25": 8303365632.0, + "26": 8303365632.0, + "27": 8303365632.0, + "28": 8303365632.0, + "29": 8303365632.0, + "30": 8328921600.0, + "31": 8328921600.0, + "32": 8328921600.0, + "33": 8328921600.0, + "34": 8342317568.0, + "35": 8352083456.0, + "36": 8352083456.0, + "37": 8352083456.0, + "38": 8352083456.0, + "39": 8352083456.0, + "40": 8352083456.0, + "41": 8352083456.0, + "42": 8352083456.0, + "43": 8352083456.0, + "44": 8352083456.0, + "45": 8352083456.0, + "46": 8352083456.0, + "47": 8352083456.0, + "48": 8352083456.0, + "49": 8352083456.0, + "50": 8352083456.0 } }, "mtp_1 loss": { @@ -234,54 +234,54 @@ "values": { "1": 11.07401, "2": 11.0927, - "3": 10.8262, - "4": 10.27574, - "5": 10.45324, - "6": 8.32758, - "7": 9.82629, - "8": 8.01538, - "9": 7.47611, - "10": 6.75851, - "11": 8.92961, - "12": 8.98772, - "13": 7.80203, - "14": 8.02221, - "15": 8.11372, - "16": 8.14498, - "17": 8.13435, - "18": 7.45035, - "19": 8.03784, - "20": 7.54246, - "21": 7.90269, - "22": 7.28093, - "23": 7.88727, - "24": 7.37587, - "25": 8.17289, - "26": 7.70083, - "27": 7.62668, - "28": 7.61747, - "29": 7.69888, - "30": 7.48586, - "31": 7.74301, - "32": 6.37542, - "33": 7.13919, - "34": 7.7198, - "35": 7.63387, - "36": 6.6127, - "37": 8.03449, - "38": 7.58334, - "39": 7.89887, - "40": 7.41168, - "41": 7.42316, - "42": 6.01689, - "43": 7.48867, - "44": 7.86976, - "45": 6.75113, - "46": 7.3054, - "47": 7.73281, - "48": 7.79017, - "49": 7.48985, - "50": 6.75753 + "3": 10.82644, + "4": 10.27575, + "5": 10.45332, + "6": 8.3277, + "7": 9.8265, + "8": 8.01558, + "9": 7.47586, + "10": 6.7581, + "11": 8.9297, + "12": 8.98829, + "13": 7.80214, + "14": 8.02436, + "15": 8.11251, + "16": 8.14258, + "17": 8.13031, + "18": 7.44579, + "19": 8.03606, + "20": 7.54064, + "21": 7.90046, + "22": 7.27709, + "23": 7.88548, + "24": 7.37576, + "25": 8.17071, + "26": 7.69849, + "27": 7.62829, + "28": 7.61349, + "29": 7.69754, + "30": 7.47936, + "31": 7.73926, + "32": 6.37137, + "33": 7.1379, + "34": 7.71901, + "35": 7.63544, + "36": 6.61321, + "37": 8.03174, + "38": 7.58067, + "39": 7.89473, + "40": 7.41418, + "41": 7.42196, + "42": 6.01401, + "43": 7.49099, + "44": 7.86625, + "45": 6.74951, + "46": 7.30637, + "47": 7.72653, + "48": 7.78872, + "49": 7.48917, + "50": 6.75533 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 64.76466, - "2": 2.42359, - "3": 2.56054, - "4": 2.61199, - "5": 2.3272, - "6": 2.19806, - "7": 2.16133, - "8": 1.97339, - "9": 2.14238, - "10": 2.05512, - "11": 2.00856, - "12": 1.96198, - "13": 2.08656, - "14": 1.96948, - "15": 1.96059, - "16": 1.97248, - "17": 1.97639, - "18": 2.01386, - "19": 1.9606, - "20": 1.94716, - "21": 2.00286, - "22": 1.965, - "23": 2.03401, - "24": 2.00528, - "25": 2.03321, - "26": 1.95999, - "27": 1.96395, - "28": 1.98191, - "29": 1.99346, - "30": 1.97579, - "31": 1.95097, - "32": 1.95726, - "33": 1.9399, - "34": 1.99177, - "35": 1.91153, - "36": 1.97534, - "37": 1.95691, - "38": 1.96206, - "39": 1.9414, - "40": 1.96027, - "41": 1.97807, - "42": 1.98861, - "43": 1.94856, - "44": 1.96339, - "45": 1.96835, - "46": 1.99733, - "47": 1.9716, - "48": 1.96591, - "49": 1.93865, - "50": 1.95198 + "1": 88.9425, + "2": 2.91855, + "3": 2.58352, + "4": 3.73409, + "5": 2.63585, + "6": 2.48926, + "7": 2.27523, + "8": 2.50563, + "9": 2.45577, + "10": 1.90482, + "11": 1.96806, + "12": 2.42331, + "13": 1.88872, + "14": 1.89773, + "15": 1.90418, + "16": 1.885, + "17": 1.91181, + "18": 1.89194, + "19": 1.97889, + "20": 1.88063, + "21": 1.88612, + "22": 1.90981, + "23": 1.87053, + "24": 1.87293, + "25": 1.89611, + "26": 1.96035, + "27": 1.9067, + "28": 1.91982, + "29": 1.94441, + "30": 1.88208, + "31": 1.9521, + "32": 1.89063, + "33": 1.9571, + "34": 1.93481, + "35": 1.87558, + "36": 1.88538, + "37": 1.89041, + "38": 1.97023, + "39": 1.89001, + "40": 1.87859, + "41": 1.89949, + "42": 1.88775, + "43": 1.94805, + "44": 1.90575, + "45": 1.89185, + "46": 1.87259, + "47": 1.89396, + "48": 1.8747, + "49": 1.88874, + "50": 1.91915 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index 162edd4f113..ca64f30b0fb 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.01693, "2": 11.06263, - "3": 10.1782, - "4": 10.86126, - "5": 9.81699, - "6": 9.10047, - "7": 9.6123, - "8": 8.39574, - "9": 7.79397, - "10": 7.15194, - "11": 9.06709, - "12": 12.4321, - "13": 8.58689, - "14": 8.37208, - "15": 8.32207, - "16": 8.28873, - "17": 8.33948, - "18": 7.62098, - "19": 8.20737, - "20": 7.71874, - "21": 8.02566, - "22": 7.37552, - "23": 7.97218, - "24": 7.52837, - "25": 8.3433, - "26": 7.79595, - "27": 7.73606, - "28": 7.71545, - "29": 7.78466, - "30": 7.57814, - "31": 7.86251, - "32": 6.53514, - "33": 7.24722, - "34": 7.81689, - "35": 7.75181, - "36": 6.74644, - "37": 8.15937, - "38": 7.62962, - "39": 7.9886, - "40": 7.53058, - "41": 7.54209, - "42": 6.14029, - "43": 7.61626, - "44": 7.97638, - "45": 6.85528, - "46": 7.44245, - "47": 7.84386, - "48": 7.89235, - "49": 7.61461, - "50": 6.86695 + "3": 10.17828, + "4": 10.86162, + "5": 9.8171, + "6": 9.10066, + "7": 9.61216, + "8": 8.39629, + "9": 7.79624, + "10": 7.15182, + "11": 9.06686, + "12": 12.41529, + "13": 8.05859, + "14": 8.25078, + "15": 8.25932, + "16": 8.33199, + "17": 8.33144, + "18": 7.58852, + "19": 8.19681, + "20": 7.68193, + "21": 8.00256, + "22": 7.37928, + "23": 7.95036, + "24": 7.52138, + "25": 8.32313, + "26": 7.80137, + "27": 7.73067, + "28": 7.70985, + "29": 7.77487, + "30": 7.57653, + "31": 7.85303, + "32": 6.5208, + "33": 7.2477, + "34": 7.80024, + "35": 7.74614, + "36": 6.73365, + "37": 8.154, + "38": 7.62714, + "39": 7.97924, + "40": 7.524, + "41": 7.52079, + "42": 6.11188, + "43": 7.6025, + "44": 7.97264, + "45": 6.84479, + "46": 7.4241, + "47": 7.82528, + "48": 7.87668, + "49": 7.5987, + "50": 6.8481 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 47167904.0, - "2": 46900672.0, - "3": 81004512.0, - "4": 231040016.0, - "5": 477984896.0, - "6": 558059904.0, - "7": 958271680.0, - "8": 723959296.0, - "9": 802607040.0, - "10": 715176064.0, - "11": 657024320.0, - "12": 565795136.0, - "13": 541943680.0, - "14": 773290880.0, - "15": 810566400.0, - "16": 748195712.0, - "17": 730395008.0, - "18": 733261760.0, - "19": 729119744.0, - "20": 859242112.0, - "21": 846155136.0, - "22": 648056832.0, - "23": 774244288.0, - "24": 629192960.0, - "25": 843192448.0, - "26": 846129280.0, - "27": 804864512.0, - "28": 789783424.0, - "29": 817814656.0, - "30": 808743168.0, - "31": 662987648.0, - "32": 841163840.0, - "33": 676597440.0, - "34": 808569792.0, - "35": 804410048.0, - "36": 749336000.0, - "37": 759355904.0, - "38": 768597888.0, - "39": 758146688.0, - "40": 767096448.0, - "41": 735961920.0, - "42": 705693632.0, - "43": 694921152.0, - "44": 692872768.0, - "45": 638337792.0, - "46": 654254336.0, - "47": 655022208.0, - "48": 648030848.0, - "49": 622397184.0, - "50": 582138304.0 + "1": 47167760.0, + "2": 46900544.0, + "3": 84151152.0, + "4": 237329488.0, + "5": 471710816.0, + "6": 558040704.0, + "7": 958277696.0, + "8": 723945792.0, + "9": 812038208.0, + "10": 721441280.0, + "11": 622437632.0, + "12": 556346176.0, + "13": 633166464.0, + "14": 700920576.0, + "15": 766532480.0, + "16": 719878656.0, + "17": 673785280.0, + "18": 733291456.0, + "19": 713440768.0, + "20": 859244608.0, + "21": 836730112.0, + "22": 789566720.0, + "23": 808848960.0, + "24": 644896128.0, + "25": 852631104.0, + "26": 836696384.0, + "27": 550069504.0, + "28": 604192832.0, + "29": 761193792.0, + "30": 758412160.0, + "31": 782509568.0, + "32": 765664256.0, + "33": 745758912.0, + "34": 569510656.0, + "35": 728914304.0, + "36": 699003840.0, + "37": 705883072.0, + "38": 705682240.0, + "39": 685787136.0, + "40": 656996352.0, + "41": 484325760.0, + "42": 633345536.0, + "43": 641441984.0, + "44": 466413888.0, + "45": 427604864.0, + "46": 566181184.0, + "47": 563795904.0, + "48": 421565312.0, + "49": 537463040.0, + "50": 494058176.0 } }, "mem-allocated-bytes": { @@ -178,53 +178,53 @@ "1": 4305060864.0, "2": 5850929152.0, "3": 5850929152.0, - "4": 5857025536.0, - "5": 5857025536.0, - "6": 5857025536.0, - "7": 5857025536.0, - "8": 5857025536.0, - "9": 5857025536.0, - "10": 5857025536.0, - "11": 5857025536.0, - "12": 5857025536.0, - "13": 5857025536.0, - "14": 5857025536.0, - "15": 5857025536.0, - "16": 5857025536.0, - "17": 5857025536.0, - "18": 5857025536.0, - "19": 5857025536.0, - "20": 5857025536.0, - "21": 5857025536.0, - "22": 5857025536.0, - "23": 5857025536.0, - "24": 5857025536.0, - "25": 5857025536.0, - "26": 5857025536.0, - "27": 5857025536.0, - "28": 5857025536.0, - "29": 5857025536.0, - "30": 5857025536.0, - "31": 5857025536.0, - "32": 5857025536.0, - "33": 5857025536.0, - "34": 5857025536.0, - "35": 5857025536.0, - "36": 5857025536.0, - "37": 5857025536.0, - "38": 5857025536.0, - "39": 5857025536.0, - "40": 5857025536.0, - "41": 5857025536.0, - "42": 5857025536.0, - "43": 5857025536.0, - "44": 5857025536.0, - "45": 5857025536.0, - "46": 5857025536.0, - "47": 5857025536.0, - "48": 5857025536.0, - "49": 5857025536.0, - "50": 5860186112.0 + "4": 5857061888.0, + "5": 5857061888.0, + "6": 5857061888.0, + "7": 5857061888.0, + "8": 5857061888.0, + "9": 5857061888.0, + "10": 5857061888.0, + "11": 5857061888.0, + "12": 5857061888.0, + "13": 5857061888.0, + "14": 5857061888.0, + "15": 5857061888.0, + "16": 5857061888.0, + "17": 5857061888.0, + "18": 5857061888.0, + "19": 5857061888.0, + "20": 5857061888.0, + "21": 5857061888.0, + "22": 5857061888.0, + "23": 5857061888.0, + "24": 5857061888.0, + "25": 5857061888.0, + "26": 5857061888.0, + "27": 5857061888.0, + "28": 5857061888.0, + "29": 5857061888.0, + "30": 5857061888.0, + "31": 5857061888.0, + "32": 5857061888.0, + "33": 5857061888.0, + "34": 5857061888.0, + "35": 5857061888.0, + "36": 5857061888.0, + "37": 5857061888.0, + "38": 5857061888.0, + "39": 5860414976.0, + "40": 5860414976.0, + "41": 5860414976.0, + "42": 5860414976.0, + "43": 5860414976.0, + "44": 5860414976.0, + "45": 5860414976.0, + "46": 5860414976.0, + "47": 5860414976.0, + "48": 5860414976.0, + "49": 5860414976.0, + "50": 5860414976.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 89.57975, - "2": 3.08398, - "3": 3.39072, - "4": 2.95563, - "5": 3.89951, - "6": 1.99592, - "7": 2.70541, - "8": 1.95431, - "9": 1.95178, - "10": 1.95311, - "11": 2.53128, - "12": 2.03561, - "13": 2.63986, - "14": 1.9956, - "15": 1.94751, - "16": 1.94319, - "17": 1.96972, - "18": 2.07225, - "19": 1.94281, - "20": 1.9489, - "21": 1.94199, - "22": 1.95565, - "23": 1.94632, - "24": 1.94485, - "25": 1.94325, - "26": 1.96685, - "27": 2.00745, - "28": 1.94741, - "29": 1.95606, - "30": 1.95414, - "31": 2.57092, - "32": 1.95172, - "33": 1.94952, - "34": 1.95519, - "35": 1.95735, - "36": 1.94985, - "37": 1.95117, - "38": 1.96384, - "39": 1.98373, - "40": 1.98071, - "41": 1.96168, - "42": 1.97892, - "43": 1.97654, - "44": 1.95705, - "45": 1.95269, - "46": 2.02666, - "47": 1.96138, - "48": 1.9657, - "49": 1.96155, - "50": 1.96872 + "1": 92.74621, + "2": 3.05215, + "3": 3.87635, + "4": 2.96691, + "5": 3.09601, + "6": 1.94793, + "7": 2.58283, + "8": 2.00403, + "9": 1.96081, + "10": 1.955, + "11": 1.95251, + "12": 2.07845, + "13": 2.01952, + "14": 1.96206, + "15": 1.96234, + "16": 1.97406, + "17": 2.0423, + "18": 1.96841, + "19": 1.95796, + "20": 2.48713, + "21": 2.55338, + "22": 1.97633, + "23": 1.95723, + "24": 1.98425, + "25": 1.95827, + "26": 1.95919, + "27": 1.95629, + "28": 1.96685, + "29": 1.95089, + "30": 2.55672, + "31": 1.93918, + "32": 1.95892, + "33": 1.95987, + "34": 1.95394, + "35": 1.96053, + "36": 1.96074, + "37": 1.96542, + "38": 1.97304, + "39": 2.00073, + "40": 1.98223, + "41": 1.95986, + "42": 1.96976, + "43": 1.94793, + "44": 1.95897, + "45": 1.96904, + "46": 1.96519, + "47": 1.95996, + "48": 1.96564, + "49": 1.96485, + "50": 1.97038 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json index 06c61dd41cd..a77eac20664 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json @@ -4,106 +4,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 10.93667, - "2": 10.93264, - "3": 10.94261, - "4": 10.94946, - "5": 10.9505, - "6": 10.94178, - "7": 10.94476, - "8": 10.93699, - "9": 10.94972, - "10": 10.93759, - "11": 10.9406, - "12": 10.93716, - "13": 10.92358, - "14": 10.93371, - "15": 10.88706, - "16": 10.87515, - "17": 10.86873, - "18": 10.86098, - "19": 10.86339, - "20": 10.78129, - "21": 10.73115, - "22": 10.60306, - "23": 10.73333, - "24": 10.61855, - "25": 10.55193, - "26": 10.62733, - "27": 10.63863, - "28": 10.59011, - "29": 10.59838, - "30": 10.37855, - "31": 10.12094, - "32": 10.4607, - "33": 10.45529, - "34": 10.20066, - "35": 10.25786, - "36": 10.20915, - "37": 10.33728, - "38": 10.1679, - "39": 10.40892, - "40": 10.05215, - "41": 10.09403, - "42": 10.17856, - "43": 9.74184, - "44": 9.89065, - "45": 9.73999, - "46": 9.72711, - "47": 10.0914, - "48": 9.75297, - "49": 9.40165, - "50": 9.83664, - "51": 9.77026, - "52": 9.65357, - "53": 10.03083, - "54": 9.87876, - "55": 9.79584, - "56": 9.53186, - "57": 9.36615, - "58": 9.75299, - "59": 9.48086, - "60": 9.40843, - "61": 9.6013, - "62": 9.90762, - "63": 9.25801, - "64": 9.68466, - "65": 8.79874, - "66": 9.60761, - "67": 9.25475, - "68": 9.71411, - "69": 9.71658, - "70": 9.66191, - "71": 9.52462, - "72": 9.47118, - "73": 9.38807, - "74": 8.8033, - "75": 9.33989, - "76": 8.93556, - "77": 9.99334, - "78": 9.6476, - "79": 9.28161, - "80": 9.29609, - "81": 9.39641, - "82": 9.60864, - "83": 9.21675, - "84": 9.34039, - "85": 9.53003, - "86": 8.95526, - "87": 9.51627, - "88": 9.68227, - "89": 9.50564, - "90": 9.75275, - "91": 9.23417, - "92": 9.25974, - "93": 8.94473, - "94": 8.6919, - "95": 9.44561, - "96": 9.40972, - "97": 9.20069, - "98": 9.58166, - "99": 8.75941, - "100": 9.2944 + "1": 10.93691, + "2": 10.93262, + "3": 10.94243, + "4": 10.95011, + "5": 10.9502, + "6": 10.94175, + "7": 10.94469, + "8": 10.93675, + "9": 10.94939, + "10": 10.9367, + "11": 10.94082, + "12": 10.93794, + "13": 10.92338, + "14": 10.93415, + "15": 10.88723, + "16": 10.87495, + "17": 10.86864, + "18": 10.86127, + "19": 10.86341, + "20": 10.78125, + "21": 10.73131, + "22": 10.60371, + "23": 10.73309, + "24": 10.61865, + "25": 10.55175, + "26": 10.62651, + "27": 10.63921, + "28": 10.59104, + "29": 10.5981, + "30": 10.37817, + "31": 10.12235, + "32": 10.46117, + "33": 10.45537, + "34": 10.20087, + "35": 10.25661, + "36": 10.20876, + "37": 10.33662, + "38": 10.16683, + "39": 10.40916, + "40": 10.05209, + "41": 10.09427, + "42": 10.17821, + "43": 9.74204, + "44": 9.89005, + "45": 9.74011, + "46": 9.72669, + "47": 10.09152, + "48": 9.75295, + "49": 9.40186, + "50": 9.83645, + "51": 9.77036, + "52": 9.65641, + "53": 10.03067, + "54": 9.87916, + "55": 9.79619, + "56": 9.52858, + "57": 9.36596, + "58": 9.75327, + "59": 9.48259, + "60": 9.40835, + "61": 9.60202, + "62": 9.90742, + "63": 9.25777, + "64": 9.68411, + "65": 8.79911, + "66": 9.60796, + "67": 9.25427, + "68": 9.71419, + "69": 9.71666, + "70": 9.6613, + "71": 9.52439, + "72": 9.4709, + "73": 9.38862, + "74": 8.80286, + "75": 9.34004, + "76": 8.93543, + "77": 9.99337, + "78": 9.64723, + "79": 9.28126, + "80": 9.29633, + "81": 9.39609, + "82": 9.60877, + "83": 9.21694, + "84": 9.34008, + "85": 9.53009, + "86": 8.95652, + "87": 9.51691, + "88": 9.68221, + "89": 9.50553, + "90": 9.753, + "91": 9.2347, + "92": 9.26019, + "93": 8.94568, + "94": 8.69194, + "95": 9.44616, + "96": 9.41008, + "97": 9.20125, + "98": 9.58169, + "99": 8.75946, + "100": 9.29483 } }, "num-zeros": { @@ -111,106 +111,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 22750340.0, - "2": 22953240.0, - "3": 22604372.0, - "4": 23266290.0, - "5": 22735592.0, - "6": 23061820.0, - "7": 22793344.0, - "8": 22960844.0, - "9": 22865576.0, - "10": 22950400.0, - "11": 22499694.0, - "12": 22456048.0, - "13": 22948070.0, - "14": 22384472.0, - "15": 22846226.0, - "16": 22856726.0, - "17": 22836416.0, - "18": 22590156.0, - "19": 22627028.0, - "20": 22712304.0, - "21": 22762708.0, - "22": 22816860.0, - "23": 22545234.0, - "24": 22794360.0, - "25": 22842012.0, - "26": 22549648.0, - "27": 22464794.0, - "28": 22453688.0, - "29": 22534550.0, - "30": 22636280.0, - "31": 22989464.0, - "32": 22594058.0, - "33": 22565896.0, - "34": 22855566.0, - "35": 22813548.0, - "36": 22595456.0, - "37": 22499328.0, - "38": 22926188.0, - "39": 22825288.0, - "40": 22675666.0, - "41": 22671440.0, - "42": 22682290.0, - "43": 23013968.0, - "44": 22764432.0, - "45": 22682616.0, - "46": 22911524.0, - "47": 23691920.0, - "48": 22954152.0, - "49": 23786644.0, - "50": 22934374.0, - "51": 23866192.0, - "52": 23807216.0, - "53": 24007492.0, - "54": 22868900.0, - "55": 23571312.0, - "56": 23954240.0, - "57": 23162470.0, - "58": 23914490.0, - "59": 22722768.0, - "60": 23813636.0, - "61": 23813616.0, - "62": 23739838.0, - "63": 23916666.0, - "64": 23899012.0, - "65": 24148300.0, - "66": 23796396.0, - "67": 25032292.0, - "68": 23675750.0, - "69": 23646956.0, - "70": 23903548.0, - "71": 24864524.0, - "72": 24767004.0, - "73": 24850716.0, - "74": 24133058.0, - "75": 24146156.0, - "76": 25025568.0, - "77": 24358296.0, - "78": 24910078.0, - "79": 23808274.0, - "80": 24821470.0, - "81": 25020448.0, - "82": 23851480.0, - "83": 23911932.0, - "84": 25143880.0, - "85": 24823452.0, - "86": 23154428.0, - "87": 24850248.0, - "88": 24749204.0, - "89": 22506446.0, - "90": 25108540.0, - "91": 23839404.0, - "92": 23875080.0, - "93": 24769680.0, - "94": 23992436.0, - "95": 25189956.0, - "96": 23908992.0, - "97": 24713120.0, - "98": 23832428.0, - "99": 23983742.0, - "100": 24101128.0 + "1": 22750372.0, + "2": 22953180.0, + "3": 22604424.0, + "4": 23266362.0, + "5": 22735560.0, + "6": 23061884.0, + "7": 22793368.0, + "8": 22960792.0, + "9": 22865612.0, + "10": 22950328.0, + "11": 22499656.0, + "12": 22456052.0, + "13": 22948014.0, + "14": 22384498.0, + "15": 22846334.0, + "16": 22856854.0, + "17": 22836340.0, + "18": 22590220.0, + "19": 22627128.0, + "20": 22712376.0, + "21": 22762744.0, + "22": 22816900.0, + "23": 22545168.0, + "24": 22794340.0, + "25": 22841898.0, + "26": 22549680.0, + "27": 22464852.0, + "28": 22453780.0, + "29": 22534588.0, + "30": 22636160.0, + "31": 22989382.0, + "32": 22594002.0, + "33": 22566000.0, + "34": 22855476.0, + "35": 22813640.0, + "36": 22595484.0, + "37": 22499348.0, + "38": 22926172.0, + "39": 22825344.0, + "40": 22675752.0, + "41": 22671542.0, + "42": 22682408.0, + "43": 23014140.0, + "44": 22768504.0, + "45": 22679044.0, + "46": 22912572.0, + "47": 23691904.0, + "48": 24003148.0, + "49": 23786764.0, + "50": 22931654.0, + "51": 23866164.0, + "52": 23807242.0, + "53": 24007504.0, + "54": 22867916.0, + "55": 23571280.0, + "56": 23954212.0, + "57": 24211680.0, + "58": 23914512.0, + "59": 22722820.0, + "60": 23813508.0, + "61": 23796364.0, + "62": 23739896.0, + "63": 24965914.0, + "64": 23898698.0, + "65": 24150860.0, + "66": 23796512.0, + "67": 25032960.0, + "68": 23673048.0, + "69": 23644684.0, + "70": 23903614.0, + "71": 24864656.0, + "72": 24766928.0, + "73": 24850636.0, + "74": 24133166.0, + "75": 24143912.0, + "76": 25025406.0, + "77": 24358344.0, + "78": 24910132.0, + "79": 23808164.0, + "80": 23772256.0, + "81": 25020440.0, + "82": 23851242.0, + "83": 23911824.0, + "84": 25143864.0, + "85": 24823592.0, + "86": 23153228.0, + "87": 24850332.0, + "88": 24749368.0, + "89": 22505174.0, + "90": 25108752.0, + "91": 23838548.0, + "92": 24923816.0, + "93": 24769484.0, + "94": 25041572.0, + "95": 25189350.0, + "96": 23909318.0, + "97": 23664104.0, + "98": 23832392.0, + "99": 23981812.0, + "100": 24101144.0 } }, "mem-allocated-bytes": { @@ -219,105 +219,105 @@ "step_interval": 1, "values": { "1": 773784064.0, - "2": 763563008.0, - "3": 766700544.0, - "4": 935098368.0, + "2": 776621056.0, + "3": 764709888.0, + "4": 937392128.0, "5": 935098368.0, - "6": 937392128.0, - "7": 937392128.0, - "8": 935639040.0, - "9": 937392128.0, - "10": 937392128.0, - "11": 935098368.0, - "12": 937392128.0, - "13": 937392128.0, - "14": 935098368.0, + "6": 935098368.0, + "7": 935639040.0, + "8": 937392128.0, + "9": 935098368.0, + "10": 936785920.0, + "11": 937392128.0, + "12": 935098368.0, + "13": 935098368.0, + "14": 935639040.0, "15": 937392128.0, - "16": 936785920.0, - "17": 935098368.0, + "16": 935098368.0, + "17": 935639040.0, "18": 937392128.0, "19": 937392128.0, "20": 935098368.0, - "21": 937392128.0, - "22": 936785920.0, - "23": 935098368.0, + "21": 936785920.0, + "22": 937392128.0, + "23": 936785920.0, "24": 937392128.0, - "25": 935639040.0, - "26": 937392128.0, - "27": 937392128.0, - "28": 935098368.0, + "25": 935098368.0, + "26": 935098368.0, + "27": 936245248.0, + "28": 937392128.0, "29": 937392128.0, - "30": 935639040.0, + "30": 935098368.0, "31": 935098368.0, - "32": 937392128.0, - "33": 935098368.0, + "32": 935639040.0, + "33": 936785920.0, "34": 937392128.0, - "35": 936245248.0, - "36": 935098368.0, - "37": 937392128.0, + "35": 937392128.0, + "36": 937392128.0, + "37": 935098368.0, "38": 935098368.0, - "39": 937392128.0, - "40": 937392128.0, - "41": 935098368.0, + "39": 935098368.0, + "40": 936785920.0, + "41": 937392128.0, "42": 937392128.0, - "43": 935098368.0, + "43": 937392128.0, "44": 937392128.0, - "45": 936245248.0, + "45": 937392128.0, "46": 937392128.0, - "47": 937392128.0, + "47": 935098368.0, "48": 935098368.0, "49": 937392128.0, - "50": 935639040.0, - "51": 937392128.0, - "52": 935098368.0, - "53": 937392128.0, - "54": 936245248.0, - "55": 935098368.0, - "56": 937392128.0, + "50": 937392128.0, + "51": 935098368.0, + "52": 935639040.0, + "53": 936785920.0, + "54": 937392128.0, + "55": 937392128.0, + "56": 935098368.0, "57": 935098368.0, - "58": 937392128.0, + "58": 935098368.0, "59": 935639040.0, - "60": 937392128.0, + "60": 936245248.0, "61": 936785920.0, - "62": 937392128.0, - "63": 936785920.0, - "64": 935098368.0, + "62": 936785920.0, + "63": 937392128.0, + "64": 937392128.0, "65": 937392128.0, "66": 935098368.0, - "67": 937392128.0, - "68": 935098368.0, - "69": 937392128.0, - "70": 935098368.0, + "67": 935098368.0, + "68": 935639040.0, + "69": 936245248.0, + "70": 936785920.0, "71": 937392128.0, - "72": 935098368.0, + "72": 937392128.0, "73": 937392128.0, - "74": 936245248.0, - "75": 937392128.0, - "76": 936785920.0, + "74": 937392128.0, + "75": 935098368.0, + "76": 937392128.0, "77": 937392128.0, - "78": 936785920.0, - "79": 935098368.0, + "78": 935098368.0, + "79": 935639040.0, "80": 937392128.0, - "81": 935098368.0, - "82": 937392128.0, - "83": 935098368.0, + "81": 937392128.0, + "82": 935098368.0, + "83": 936785920.0, "84": 937392128.0, - "85": 935639040.0, - "86": 937392128.0, - "87": 937392128.0, - "88": 935098368.0, - "89": 937392128.0, + "85": 937392128.0, + "86": 935098368.0, + "87": 936785920.0, + "88": 937392128.0, + "89": 935098368.0, "90": 935639040.0, "91": 937392128.0, - "92": 936785920.0, - "93": 935098368.0, - "94": 937392128.0, + "92": 937392128.0, + "93": 937392128.0, + "94": 935098368.0, "95": 935098368.0, - "96": 937392128.0, - "97": 936785920.0, - "98": 935098368.0, - "99": 937392128.0, - "100": 935098368.0 + "96": 935639040.0, + "97": 936245248.0, + "98": 937392128.0, + "99": 935098368.0, + "100": 936785920.0 } }, "mem-max-allocated-bytes": { @@ -325,106 +325,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 940788224.0, - "2": 1157431808.0, - "3": 1157431808.0, - "4": 1247832064.0, - "5": 1247832064.0, - "6": 1247832064.0, - "7": 1247832064.0, - "8": 1248165376.0, - "9": 1248165376.0, - "10": 1248305664.0, - "11": 1248305664.0, - "12": 1248305664.0, - "13": 1248305664.0, - "14": 1248979968.0, - "15": 1248979968.0, - "16": 1248979968.0, - "17": 1248979968.0, - "18": 1248979968.0, - "19": 1249688576.0, - "20": 1249688576.0, - "21": 1249688576.0, - "22": 1249688576.0, - "23": 1249688576.0, - "24": 1249688576.0, - "25": 1249688576.0, - "26": 1249688576.0, - "27": 1249688576.0, - "28": 1249688576.0, - "29": 1249688576.0, - "30": 1249688576.0, - "31": 1249688576.0, - "32": 1249688576.0, - "33": 1249688576.0, - "34": 1249688576.0, - "35": 1249688576.0, - "36": 1249688576.0, - "37": 1249688576.0, - "38": 1249688576.0, - "39": 1249688576.0, - "40": 1249688576.0, - "41": 1249688576.0, - "42": 1249688576.0, - "43": 1249688576.0, - "44": 1249688576.0, - "45": 1249688576.0, - "46": 1249688576.0, - "47": 1249688576.0, - "48": 1249688576.0, - "49": 1249688576.0, - "50": 1249688576.0, - "51": 1249688576.0, - "52": 1249688576.0, - "53": 1249688576.0, - "54": 1249688576.0, - "55": 1249688576.0, - "56": 1249688576.0, - "57": 1249688576.0, - "58": 1249688576.0, - "59": 1249688576.0, - "60": 1249688576.0, - "61": 1249688576.0, - "62": 1249688576.0, - "63": 1249688576.0, - "64": 1249688576.0, - "65": 1249688576.0, - "66": 1249688576.0, - "67": 1249688576.0, - "68": 1249688576.0, - "69": 1249688576.0, - "70": 1249688576.0, - "71": 1249688576.0, - "72": 1249688576.0, - "73": 1249688576.0, - "74": 1249688576.0, - "75": 1249688576.0, - "76": 1249688576.0, - "77": 1249688576.0, - "78": 1249688576.0, - "79": 1249688576.0, - "80": 1249688576.0, - "81": 1249688576.0, - "82": 1249688576.0, - "83": 1249688576.0, - "84": 1249688576.0, - "85": 1249688576.0, - "86": 1249688576.0, - "87": 1249688576.0, - "88": 1249688576.0, - "89": 1249688576.0, - "90": 1249688576.0, - "91": 1249688576.0, - "92": 1249688576.0, - "93": 1249688576.0, - "94": 1249688576.0, - "95": 1249688576.0, - "96": 1249688576.0, - "97": 1249688576.0, - "98": 1249688576.0, - "99": 1249688576.0, - "100": 1249688576.0 + "1": 936453632.0, + "2": 1158617088.0, + "3": 1158617088.0, + "4": 1246761472.0, + "5": 1247365632.0, + "6": 1247365632.0, + "7": 1247765504.0, + "8": 1247765504.0, + "9": 1247765504.0, + "10": 1252415488.0, + "11": 1252415488.0, + "12": 1252415488.0, + "13": 1252415488.0, + "14": 1252415488.0, + "15": 1252415488.0, + "16": 1252415488.0, + "17": 1252415488.0, + "18": 1252415488.0, + "19": 1252415488.0, + "20": 1252415488.0, + "21": 1252415488.0, + "22": 1252415488.0, + "23": 1252415488.0, + "24": 1252415488.0, + "25": 1252415488.0, + "26": 1252415488.0, + "27": 1252415488.0, + "28": 1252415488.0, + "29": 1252415488.0, + "30": 1252415488.0, + "31": 1252415488.0, + "32": 1252415488.0, + "33": 1252415488.0, + "34": 1252415488.0, + "35": 1252415488.0, + "36": 1252415488.0, + "37": 1252415488.0, + "38": 1252415488.0, + "39": 1252415488.0, + "40": 1252415488.0, + "41": 1252415488.0, + "42": 1252415488.0, + "43": 1252415488.0, + "44": 1252415488.0, + "45": 1252415488.0, + "46": 1252415488.0, + "47": 1252415488.0, + "48": 1252415488.0, + "49": 1252415488.0, + "50": 1252415488.0, + "51": 1252415488.0, + "52": 1252415488.0, + "53": 1252415488.0, + "54": 1252415488.0, + "55": 1252415488.0, + "56": 1252415488.0, + "57": 1252415488.0, + "58": 1252415488.0, + "59": 1252415488.0, + "60": 1252415488.0, + "61": 1252415488.0, + "62": 1252415488.0, + "63": 1252415488.0, + "64": 1252415488.0, + "65": 1252415488.0, + "66": 1252415488.0, + "67": 1252415488.0, + "68": 1252415488.0, + "69": 1252415488.0, + "70": 1252415488.0, + "71": 1252415488.0, + "72": 1252415488.0, + "73": 1252415488.0, + "74": 1252415488.0, + "75": 1252415488.0, + "76": 1252415488.0, + "77": 1252415488.0, + "78": 1252415488.0, + "79": 1252415488.0, + "80": 1252415488.0, + "81": 1252415488.0, + "82": 1252415488.0, + "83": 1252415488.0, + "84": 1252415488.0, + "85": 1252415488.0, + "86": 1252415488.0, + "87": 1252415488.0, + "88": 1252415488.0, + "89": 1252415488.0, + "90": 1252415488.0, + "91": 1252415488.0, + "92": 1252415488.0, + "93": 1252415488.0, + "94": 1252415488.0, + "95": 1252415488.0, + "96": 1252415488.0, + "97": 1252415488.0, + "98": 1252415488.0, + "99": 1252415488.0, + "100": 1252415488.0 } }, "mtp_1 loss": { @@ -432,106 +432,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 10.88688, - "2": 10.90482, - "3": 10.9087, - "4": 10.86893, - "5": 10.91659, - "6": 10.90568, - "7": 10.90273, - "8": 10.89003, - "9": 10.90367, - "10": 10.89165, - "11": 10.93407, - "12": 10.91649, - "13": 10.91113, - "14": 10.91972, - "15": 10.88512, - "16": 10.90762, + "1": 10.88691, + "2": 10.90544, + "3": 10.90868, + "4": 10.86912, + "5": 10.91636, + "6": 10.90651, + "7": 10.90278, + "8": 10.88975, + "9": 10.90453, + "10": 10.89162, + "11": 10.93392, + "12": 10.91634, + "13": 10.91136, + "14": 10.91999, + "15": 10.88538, + "16": 10.90717, "17": 10.87525, - "18": 10.91396, - "19": 10.90949, - "20": 10.87811, - "21": 10.87944, - "22": 10.85495, - "23": 10.87985, - "24": 10.87289, - "25": 10.85849, - "26": 10.86957, - "27": 10.87683, - "28": 10.88682, - "29": 10.88885, - "30": 10.85468, - "31": 10.79756, - "32": 10.86606, - "33": 10.87767, - "34": 10.84002, - "35": 10.84197, - "36": 10.8501, - "37": 10.85593, - "38": 10.8371, - "39": 10.86345, - "40": 10.82902, - "41": 10.83425, - "42": 10.84438, - "43": 10.78764, - "44": 10.82077, - "45": 10.78834, - "46": 10.78249, - "47": 10.82884, - "48": 10.79035, - "49": 10.71167, - "50": 10.77366, - "51": 10.76725, - "52": 10.74037, - "53": 10.80261, - "54": 10.77356, - "55": 10.76019, - "56": 10.71045, - "57": 10.66667, - "58": 10.74362, - "59": 10.69036, - "60": 10.66502, - "61": 10.70788, - "62": 10.772, - "63": 10.61853, - "64": 10.71765, - "65": 10.49451, - "66": 10.67121, - "67": 10.57549, - "68": 10.68782, - "69": 10.68291, - "70": 10.6695, - "71": 10.64584, - "72": 10.60876, - "73": 10.56523, - "74": 10.37039, - "75": 10.51086, - "76": 10.39869, - "77": 10.75172, - "78": 10.62677, - "79": 10.46664, - "80": 10.47405, - "81": 10.51052, - "82": 10.58766, - "83": 10.43963, - "84": 10.44967, - "85": 10.55157, - "86": 10.28464, - "87": 10.51164, - "88": 10.6034, - "89": 10.50879, - "90": 10.60395, - "91": 10.38241, - "92": 10.38669, - "93": 10.22995, - "94": 10.08283, - "95": 10.42553, - "96": 10.44856, - "97": 10.32063, - "98": 10.49615, - "99": 10.04594, - "100": 10.33373 + "18": 10.91409, + "19": 10.90936, + "20": 10.87835, + "21": 10.8786, + "22": 10.85481, + "23": 10.87937, + "24": 10.87208, + "25": 10.85798, + "26": 10.86991, + "27": 10.87718, + "28": 10.88667, + "29": 10.88859, + "30": 10.85479, + "31": 10.79701, + "32": 10.86609, + "33": 10.87789, + "34": 10.8397, + "35": 10.84184, + "36": 10.85, + "37": 10.85585, + "38": 10.83714, + "39": 10.86361, + "40": 10.82866, + "41": 10.83386, + "42": 10.84447, + "43": 10.78747, + "44": 10.82127, + "45": 10.78826, + "46": 10.78323, + "47": 10.82894, + "48": 10.7901, + "49": 10.71201, + "50": 10.77359, + "51": 10.76681, + "52": 10.74029, + "53": 10.8027, + "54": 10.77345, + "55": 10.76133, + "56": 10.71153, + "57": 10.66673, + "58": 10.74318, + "59": 10.69182, + "60": 10.66418, + "61": 10.70712, + "62": 10.77164, + "63": 10.61759, + "64": 10.71667, + "65": 10.4936, + "66": 10.67118, + "67": 10.57515, + "68": 10.68716, + "69": 10.68277, + "70": 10.66908, + "71": 10.64566, + "72": 10.60905, + "73": 10.56507, + "74": 10.37106, + "75": 10.5114, + "76": 10.39856, + "77": 10.75192, + "78": 10.62708, + "79": 10.4675, + "80": 10.47474, + "81": 10.51003, + "82": 10.58819, + "83": 10.43946, + "84": 10.45015, + "85": 10.55142, + "86": 10.2831, + "87": 10.51182, + "88": 10.60318, + "89": 10.50948, + "90": 10.60407, + "91": 10.38208, + "92": 10.38708, + "93": 10.23019, + "94": 10.08381, + "95": 10.4259, + "96": 10.4489, + "97": 10.32133, + "98": 10.49668, + "99": 10.04795, + "100": 10.33446 } }, "iteration-time": { @@ -539,106 +539,106 @@ "end_step": 100, "step_interval": 1, "values": { - "1": 67.25594, - "2": 2.02448, - "3": 1.31909, - "4": 3.51713, - "5": 0.68118, - "6": 0.68517, - "7": 0.6825, - "8": 0.66566, - "9": 0.66522, - "10": 0.67133, - "11": 0.66857, - "12": 0.66644, - "13": 0.67083, - "14": 0.66571, - "15": 0.66315, - "16": 0.66494, - "17": 0.66971, - "18": 0.67036, - "19": 0.66993, - "20": 0.66906, - "21": 0.66515, - "22": 0.66541, - "23": 0.66633, - "24": 0.66527, - "25": 0.66367, - "26": 0.66301, - "27": 0.6633, - "28": 0.66152, - "29": 0.66022, - "30": 0.66204, - "31": 0.66645, - "32": 0.66494, - "33": 0.66029, - "34": 0.66391, - "35": 0.65922, - "36": 0.66135, - "37": 0.6625, - "38": 0.65862, - "39": 0.65997, - "40": 0.68187, - "41": 0.65886, - "42": 0.65824, - "43": 0.65934, - "44": 0.65661, - "45": 0.65819, - "46": 0.66081, - "47": 0.65905, - "48": 0.66151, - "49": 0.66043, - "50": 0.65818, - "51": 0.74732, - "52": 0.65757, - "53": 0.66273, - "54": 0.65899, - "55": 0.66722, - "56": 0.65747, - "57": 0.65863, - "58": 0.66051, - "59": 0.65938, - "60": 0.65822, - "61": 0.65548, - "62": 0.65759, - "63": 0.65386, - "64": 0.65424, - "65": 0.65305, - "66": 0.65491, - "67": 0.6567, - "68": 0.65495, - "69": 0.65344, - "70": 0.65619, - "71": 0.65258, - "72": 0.65965, - "73": 0.66093, - "74": 0.65552, - "75": 0.65731, - "76": 0.6542, - "77": 0.65449, - "78": 0.65305, - "79": 0.65456, - "80": 0.65355, - "81": 0.65662, - "82": 0.65633, - "83": 0.6568, - "84": 0.65869, - "85": 0.66387, - "86": 0.66145, - "87": 0.66045, - "88": 0.66082, - "89": 0.66365, - "90": 0.66413, - "91": 0.66268, - "92": 0.6594, - "93": 0.66184, - "94": 0.65968, - "95": 0.66219, - "96": 0.66239, - "97": 0.66014, - "98": 0.66265, - "99": 0.66054, - "100": 0.66123 + "1": 74.16337, + "2": 1.6487, + "3": 1.45105, + "4": 4.39166, + "5": 0.72113, + "6": 0.82637, + "7": 0.7985, + "8": 0.73623, + "9": 0.7398, + "10": 0.74065, + "11": 0.73395, + "12": 0.73395, + "13": 0.79806, + "14": 0.7251, + "15": 0.7312, + "16": 0.75102, + "17": 0.72379, + "18": 0.72614, + "19": 0.73367, + "20": 0.73334, + "21": 0.72408, + "22": 0.74787, + "23": 0.75535, + "24": 0.72783, + "25": 0.7314, + "26": 0.71985, + "27": 0.7246, + "28": 0.72236, + "29": 0.71945, + "30": 0.72182, + "31": 0.72292, + "32": 0.71754, + "33": 0.7157, + "34": 0.70975, + "35": 0.72388, + "36": 0.71455, + "37": 0.71511, + "38": 0.71163, + "39": 0.71376, + "40": 0.72067, + "41": 0.71279, + "42": 0.70858, + "43": 0.7086, + "44": 0.70995, + "45": 0.70901, + "46": 0.70881, + "47": 0.71115, + "48": 0.72369, + "49": 0.73908, + "50": 0.81598, + "51": 0.73667, + "52": 0.71381, + "53": 0.72282, + "54": 0.73549, + "55": 0.70748, + "56": 0.7102, + "57": 0.70853, + "58": 0.70998, + "59": 0.71846, + "60": 0.70825, + "61": 0.70848, + "62": 0.70734, + "63": 0.7097, + "64": 0.72007, + "65": 0.71061, + "66": 0.7223, + "67": 0.71411, + "68": 0.71437, + "69": 0.70943, + "70": 0.70895, + "71": 0.71052, + "72": 0.70672, + "73": 0.72725, + "74": 0.70761, + "75": 0.7334, + "76": 0.7387, + "77": 0.72758, + "78": 0.72748, + "79": 0.73386, + "80": 0.72774, + "81": 0.71859, + "82": 0.71526, + "83": 0.75425, + "84": 0.72064, + "85": 0.72017, + "86": 0.72277, + "87": 0.73635, + "88": 0.72228, + "89": 0.73388, + "90": 0.74435, + "91": 0.7281, + "92": 0.71839, + "93": 0.71175, + "94": 0.71437, + "95": 0.71311, + "96": 0.71386, + "97": 0.71412, + "98": 0.72944, + "99": 0.7486, + "100": 0.74015 } } } \ No newline at end of file From ae3dbc04b6ec04091b85f4d7ec3acc53becbafe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 13 Jan 2026 16:01:15 +0000 Subject: [PATCH 225/248] ci(hotfix): Re-add `gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone` value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .../golden_values_lts_dgx_a100.json | 538 +++++++++++++++++- 1 file changed, 537 insertions(+), 1 deletion(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json index 9e26dfeeb6e..f273ff540d3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_lts_dgx_a100.json @@ -1 +1,537 @@ -{} \ No newline at end of file +{ + "lm loss": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 10.85936, + "2": 10.8548, + "3": 10.85199, + "4": 10.84317, + "5": 10.87247, + "6": 10.87857, + "7": 10.84622, + "8": 10.86369, + "9": 10.87211, + "10": 10.8311, + "11": 10.86068, + "12": 10.87273, + "13": 10.87992, + "14": 10.88657, + "15": 10.82029, + "16": 10.82684, + "17": 10.7998, + "18": 10.81985, + "19": 10.80035, + "20": 10.71399, + "21": 10.69893, + "22": 10.57449, + "23": 10.71973, + "24": 10.60285, + "25": 10.54611, + "26": 10.61041, + "27": 10.61227, + "28": 10.57731, + "29": 10.58005, + "30": 10.36705, + "31": 10.13447, + "32": 10.47127, + "33": 10.47454, + "34": 10.23198, + "35": 10.28443, + "36": 10.23436, + "37": 10.35346, + "38": 10.20696, + "39": 10.40599, + "40": 10.08972, + "41": 10.16331, + "42": 10.2256, + "43": 9.8639, + "44": 9.98246, + "45": 9.84548, + "46": 9.8581, + "47": 10.1689, + "48": 9.86658, + "49": 9.54555, + "50": 9.91937, + "51": 9.86074, + "52": 9.76116, + "53": 10.08415, + "54": 9.96563, + "55": 9.89123, + "56": 9.63923, + "57": 9.4936, + "58": 9.83871, + "59": 9.59623, + "60": 9.5091, + "61": 9.70544, + "62": 9.99513, + "63": 9.38104, + "64": 9.78222, + "65": 8.95962, + "66": 9.71006, + "67": 9.38013, + "68": 9.78827, + "69": 9.79425, + "70": 9.73517, + "71": 9.62218, + "72": 9.58801, + "73": 9.49714, + "74": 8.94242, + "75": 9.4322, + "76": 9.09757, + "77": 10.06853, + "78": 9.73055, + "79": 9.37759, + "80": 9.41116, + "81": 9.48631, + "82": 9.69758, + "83": 9.31674, + "84": 9.42151, + "85": 9.61502, + "86": 9.07627, + "87": 9.59887, + "88": 9.75047, + "89": 9.61233, + "90": 9.82363, + "91": 9.35377, + "92": 9.36525, + "93": 9.08833, + "94": 8.83614, + "95": 9.5226, + "96": 9.52736, + "97": 9.3169, + "98": 9.67961, + "99": 8.89276, + "100": 9.40803 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 1768.0, + "2": 1871.0, + "3": 1757.0, + "4": 1902.0, + "5": 2016.0, + "6": 1943.0, + "7": 1981.0, + "8": 1667.0, + "9": 1973.0, + "10": 1477.0, + "11": 2178.0, + "12": 1985.0, + "13": 2137.0, + "14": 2021.0, + "15": 1944.0, + "16": 2053.0, + "17": 1967.0, + "18": 1922.0, + "19": 2031.0, + "20": 1837.0, + "21": 2048.0, + "22": 1917.0, + "23": 2190.0, + "24": 1787.0, + "25": 1869.0, + "26": 1882.0, + "27": 2143.0, + "28": 2147.0, + "29": 2222.0, + "30": 2046.0, + "31": 1734.0, + "32": 2171.0, + "33": 2380.0, + "34": 2046.0, + "35": 2147.0, + "36": 2149.0, + "37": 2645.0, + "38": 2416.0, + "39": 2672.0, + "40": 2441.0, + "41": 2585.0, + "42": 2483.0, + "43": 2262.0, + "44": 2344.0, + "45": 2300.0, + "46": 2560.0, + "47": 2755.0, + "48": 2764.0, + "49": 2505.0, + "50": 2723.0, + "51": 2806.0, + "52": 2805.0, + "53": 3225.0, + "54": 3028.0, + "55": 2486.0, + "56": 3093.0, + "57": 2588.0, + "58": 3219.0, + "59": 3021.0, + "60": 2649.0, + "61": 3247.0, + "62": 2649.0, + "63": 2637.0, + "64": 3140.0, + "65": 3038.0, + "66": 3422.0, + "67": 2933.0, + "68": 3039.0, + "69": 3167.0, + "70": 3539.0, + "71": 3213.0, + "72": 2597.0, + "73": 3290.0, + "74": 2140.0, + "75": 2837.0, + "76": 3342.0, + "77": 3444.0, + "78": 3504.0, + "79": 3513.0, + "80": 3733.0, + "81": 4024.0, + "82": 3670.0, + "83": 3199.0, + "84": 3539.0, + "85": 3585.0, + "86": 2979.0, + "87": 3951.0, + "88": 3286.0, + "89": 3787.0, + "90": 3341.0, + "91": 3070.0, + "92": 3410.0, + "93": 2923.0, + "94": 3868.0, + "95": 3627.0, + "96": 3787.0, + "97": 3549.0, + "98": 4026.0, + "99": 3531.0, + "100": 3649.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 232398336.0, + "2": 232398336.0, + "3": 232398336.0, + "4": 232398336.0, + "5": 232398336.0, + "6": 232398336.0, + "7": 232398336.0, + "8": 232398336.0, + "9": 232398336.0, + "10": 232398336.0, + "11": 232398336.0, + "12": 232398336.0, + "13": 232398336.0, + "14": 232398336.0, + "15": 232398336.0, + "16": 232398336.0, + "17": 232398336.0, + "18": 232398336.0, + "19": 232398336.0, + "20": 232398336.0, + "21": 232398336.0, + "22": 232398336.0, + "23": 232398336.0, + "24": 232398336.0, + "25": 232398336.0, + "26": 232398336.0, + "27": 232398336.0, + "28": 232398336.0, + "29": 232398336.0, + "30": 232398336.0, + "31": 232398336.0, + "32": 232398336.0, + "33": 232398336.0, + "34": 232398336.0, + "35": 232398336.0, + "36": 232398336.0, + "37": 232398336.0, + "38": 232398336.0, + "39": 232398336.0, + "40": 232398336.0, + "41": 232398336.0, + "42": 232398336.0, + "43": 232398336.0, + "44": 232398336.0, + "45": 232398336.0, + "46": 232398336.0, + "47": 232398336.0, + "48": 232398336.0, + "49": 232398336.0, + "50": 232398336.0, + "51": 232398336.0, + "52": 232398336.0, + "53": 232398336.0, + "54": 232398336.0, + "55": 232398336.0, + "56": 232398336.0, + "57": 232398336.0, + "58": 232398336.0, + "59": 232398336.0, + "60": 232398336.0, + "61": 232398336.0, + "62": 232398336.0, + "63": 232398336.0, + "64": 232398336.0, + "65": 232398336.0, + "66": 232398336.0, + "67": 232398336.0, + "68": 232398336.0, + "69": 232398336.0, + "70": 232398336.0, + "71": 232398336.0, + "72": 232398336.0, + "73": 232398336.0, + "74": 232398336.0, + "75": 232398336.0, + "76": 232398336.0, + "77": 232398336.0, + "78": 232398336.0, + "79": 232398336.0, + "80": 232398336.0, + "81": 232398336.0, + "82": 232398336.0, + "83": 232398336.0, + "84": 232398336.0, + "85": 232398336.0, + "86": 232398336.0, + "87": 232398336.0, + "88": 232398336.0, + "89": 232398336.0, + "90": 232398336.0, + "91": 232398336.0, + "92": 232398336.0, + "93": 232398336.0, + "94": 232398336.0, + "95": 232398336.0, + "96": 232398336.0, + "97": 232398336.0, + "98": 232398336.0, + "99": 232398336.0, + "100": 232398336.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 685490688.0, + "2": 773246464.0, + "3": 773246464.0, + "4": 773246464.0, + "5": 773246464.0, + "6": 773246464.0, + "7": 773246464.0, + "8": 773246464.0, + "9": 773246464.0, + "10": 773246464.0, + "11": 773246464.0, + "12": 773246464.0, + "13": 773246464.0, + "14": 773246464.0, + "15": 773246464.0, + "16": 773246464.0, + "17": 773246464.0, + "18": 773246464.0, + "19": 773246464.0, + "20": 773246464.0, + "21": 773246464.0, + "22": 773246464.0, + "23": 773246464.0, + "24": 773246464.0, + "25": 773246464.0, + "26": 773246464.0, + "27": 773246464.0, + "28": 773246464.0, + "29": 773246464.0, + "30": 773246464.0, + "31": 773246464.0, + "32": 773246464.0, + "33": 773246464.0, + "34": 773246464.0, + "35": 773246464.0, + "36": 773246464.0, + "37": 773246464.0, + "38": 773246464.0, + "39": 773246464.0, + "40": 773246464.0, + "41": 773246464.0, + "42": 773246464.0, + "43": 773246464.0, + "44": 773246464.0, + "45": 773246464.0, + "46": 773246464.0, + "47": 773246464.0, + "48": 773246464.0, + "49": 773246464.0, + "50": 773246464.0, + "51": 773246464.0, + "52": 773246464.0, + "53": 773246464.0, + "54": 773246464.0, + "55": 773246464.0, + "56": 773246464.0, + "57": 773246464.0, + "58": 773246464.0, + "59": 773246464.0, + "60": 773246464.0, + "61": 773246464.0, + "62": 773246464.0, + "63": 773246464.0, + "64": 773246464.0, + "65": 773246464.0, + "66": 773246464.0, + "67": 773246464.0, + "68": 773246464.0, + "69": 773246464.0, + "70": 773246464.0, + "71": 773246464.0, + "72": 773246464.0, + "73": 773246464.0, + "74": 773246464.0, + "75": 773246464.0, + "76": 773246464.0, + "77": 773246464.0, + "78": 773246464.0, + "79": 773246464.0, + "80": 773246464.0, + "81": 773246464.0, + "82": 773246464.0, + "83": 773246464.0, + "84": 773246464.0, + "85": 773246464.0, + "86": 773246464.0, + "87": 773246464.0, + "88": 773246464.0, + "89": 773246464.0, + "90": 773246464.0, + "91": 773246464.0, + "92": 773246464.0, + "93": 773246464.0, + "94": 773246464.0, + "95": 773246464.0, + "96": 773246464.0, + "97": 773246464.0, + "98": 773246464.0, + "99": 773246464.0, + "100": 773246464.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 100, + "step_interval": 1, + "values": { + "1": 9.93671, + "2": 0.44025, + "3": 0.31978, + "4": 0.30044, + "5": 0.29939, + "6": 0.29882, + "7": 0.29791, + "8": 0.29478, + "9": 0.29711, + "10": 0.29556, + "11": 0.29815, + "12": 0.29967, + "13": 0.29479, + "14": 0.29726, + "15": 0.29661, + "16": 0.29615, + "17": 0.29592, + "18": 0.29568, + "19": 0.29536, + "20": 0.29486, + "21": 0.29478, + "22": 0.29533, + "23": 0.29472, + "24": 0.29577, + "25": 0.29612, + "26": 0.29259, + "27": 0.28753, + "28": 0.28697, + "29": 0.70578, + "30": 0.29095, + "31": 0.29056, + "32": 0.29195, + "33": 0.29198, + "34": 0.29205, + "35": 0.29049, + "36": 0.28947, + "37": 0.29052, + "38": 0.29096, + "39": 0.29096, + "40": 0.29115, + "41": 0.29128, + "42": 0.29068, + "43": 0.29094, + "44": 0.29228, + "45": 0.29059, + "46": 0.29108, + "47": 0.29102, + "48": 0.29077, + "49": 0.29062, + "50": 0.2902, + "51": 0.30007, + "52": 0.63804, + "53": 0.28911, + "54": 0.46416, + "55": 0.29262, + "56": 0.37133, + "57": 0.29216, + "58": 0.32564, + "59": 0.29296, + "60": 0.2903, + "61": 0.29162, + "62": 0.28953, + "63": 0.28969, + "64": 0.28976, + "65": 0.64598, + "66": 0.28891, + "67": 0.55309, + "68": 0.67465, + "69": 0.35714, + "70": 0.3918, + "71": 0.2878, + "72": 0.33397, + "73": 0.41898, + "74": 0.29045, + "75": 0.31982, + "76": 0.28797, + "77": 0.34091, + "78": 0.52101, + "79": 0.29094, + "80": 0.299, + "81": 0.43963, + "82": 0.28851, + "83": 0.38734, + "84": 0.38974, + "85": 0.38902, + "86": 0.69087, + "87": 0.37076, + "88": 0.29102, + "89": 0.55341, + "90": 0.54278, + "91": 0.28909, + "92": 0.31421, + "93": 0.29166, + "94": 0.29126, + "95": 0.32114, + "96": 0.29039, + "97": 0.30171, + "98": 0.29192, + "99": 0.29197, + "100": 0.31795 + } + } +} \ No newline at end of file From 583dd584fe2d0525f88a3d6b55732bcc5c4f10cd Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 13 Jan 2026 17:28:44 -0600 Subject: [PATCH 226/248] ci: Skip broken tests after dependency update (#2935) Signed-off-by: Charlie Truong --- tests/test_utils/recipes/moe.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml index d702fd1ac71..02c3f68b5f1 100644 --- a/tests/test_utils/recipes/moe.yaml +++ b/tests/test_utils/recipes/moe.yaml @@ -89,7 +89,7 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph] products: - environment: [dev] - scope: [mr] + scope: [mr-broken] platforms: [dgx_h100] # hang: #513 # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental] # products: @@ -151,7 +151,7 @@ products: - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading] products: - environment: [dev] - scope: [mr, mr-github] + scope: [mr-broken, mr-github] platforms: [dgx_h100] - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading] products: @@ -187,13 +187,13 @@ products: - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed] products: - environment: [dev] - scope: [mr] + scope: [mr-broken] platforms: [dgx_h100] - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8] products: - environment: [dev] - scope: [mr] + scope: [mr-broken] platforms: [dgx_h100] # hang: #513 - environment: [dev] - scope: [mr-slim] + scope: [mr-slim-broken] platforms: [dgx_h100] From b0a702b2813f088b7107457e8091695b0cb8e66e Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 13 Jan 2026 22:50:55 -1000 Subject: [PATCH 227/248] Cherry-pick optimizer override refactor from #2723 (#2835) Signed-off-by: John St John Signed-off-by: John St. John Signed-off-by: Boxiang Wang Co-authored-by: John St John Co-authored-by: Boxiang Wang --- megatron/core/optimizer/__init__.py | 237 ++++++++++-------- megatron/core/optimizer/optimizer_config.py | 69 ++++- megatron/core/optimizer_param_scheduler.py | 69 ++++- megatron/training/training.py | 17 +- .../test_layer_wise_optimizer.py | 8 +- tests/unit_tests/optimizer/__init__.py | 1 + .../optimizer/test_optimizer_config.py | 38 +++ tests/unit_tests/test_optimizer.py | 156 +++++++++++- tests/unit_tests/test_utilities.py | 5 +- 9 files changed, 476 insertions(+), 124 deletions(-) create mode 100644 tests/unit_tests/optimizer/__init__.py create mode 100644 tests/unit_tests/optimizer/test_optimizer_config.py diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index 234bee274be..b4d15daefd2 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -3,7 +3,7 @@ import logging import warnings from dataclasses import astuple -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from torch.optim import SGD as CPUSGD @@ -35,6 +35,11 @@ from megatron.core import parallel_state from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer +from megatron.core.optimizer_param_scheduler import ( + ParamGroupOverride, + combine_param_group_overrides, + param_group_override_to_tuple, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.fsdp_dtensor_checkpoint import get_global_unique_param_name @@ -50,66 +55,84 @@ MegatronOptimizer, param_group_identifier_keys, ) -from .optimizer_config import AdamOptimizerConfig, OptimizerConfig, ParamKey, SGDOptimizerConfig +from .optimizer_config import ( + AdamOptimizerConfig, + OptimizerConfig, + ParamKey, + ParamPredicate, + SGDOptimizerConfig, +) logger = logging.getLogger(__name__) -def _matches(param: torch.nn.Parameter, param_name: str, param_key: ParamKey) -> bool: - """Returns true if passed-in parameter (with name) matches `param_key`. +def get_standard_config_overrides( + decoupled_lr: float | None = None, decoupled_min_lr: float | None = None +) -> Dict[ParamKey, ParamGroupOverride]: + """Get standard config overrides for the optimizer, handling decoupled LR and common wd skips. Args: - param (torch.nn.Parameter): Handle to parameter object. - param_name (str): Name of parameter in underlying PyTorch module. - param_key (ParamKey): ParamKey object. + decoupled_lr (float | None): decoupled learning rate. + decoupled_min_lr (float | None): decoupled minimum learning rate. Returns: - bool: True if parameter matches passed-in param_key. + Dict[ParamKey, ParamGroupOverride]: standard config overrides. """ + config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = {} + if decoupled_lr is not None: + decoupled_lr_config: ParamGroupOverride = {"max_lr": decoupled_lr} + decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") + if decoupled_min_lr is not None: + decoupled_lr_config["min_lr"] = decoupled_min_lr + config_overrides[decoupled_param_key] = decoupled_lr_config + + # Next construct the standard param group overrides for no weight decay on bias parameters + # as well as any length 1 parameters. + param_length_1_match = ParamPredicate( + name="param_len_1", fn=lambda param: len(param.shape) == 1 + ) + param_wd_mult_key = ParamKey(name="*.bias", predicate=param_length_1_match) + config_overrides[param_wd_mult_key] = ParamGroupOverride(wd_mult=0.0) - # Check if name matches. - if isinstance(param_key.name, str): - target_names = [param_key.name] - else: - target_names = list(param_key.name) - for target_name in target_names: - if param_name in target_name: - return True - - # Check if attribute matches. - if isinstance(param_key.attr, str): - target_attrs = [param_key.attr] - else: - target_attrs = list(param_key.attr) - for target_attr in target_attrs: - if getattr(param, target_attr, False): - return True - - return False + return config_overrides def _get_param_groups( model_chunks: List[MegatronModule], config: OptimizerConfig, - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], + config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]], ) -> List[Dict]: """Create parameter groups for optimizer. Creates parameter groups from provided optimizer config object. + NOTE There can be more than one match between a ParamKey and a parameter. + What we do is merge all of the matching ParamKey overrides into a single ParamGroupOverride + for that parameter and use that as the key for that parameter. Any parameters that get + the same set of merged overrides will be mapped into the same parameter group. + Args: model_chunks (List[MegatronModule]): model chunks to create parameter groups for. config (OptimizerConfig): optimizer configuration object. - config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, - specified on a per-layer basis. + config_overrides (Optional[Dict[ParamKey, ParamGroupOverride]): optimizer overrides, + specified on a per-layer basis. NOTE: if you want to skip applying weight decay on bias + and length 1 parameters, and also do not want to do any other overrides, set this to an + empty dictionary rather than the default value of None. Returns: List of parameter groups. """ - # Map (wd_mult, is_expert_parallel, param_group_hyperparameters_config) to params. + # Map (pg_overrides, is_expert_parallel) to params. params_map = {} - configs_map = {} + + if config_overrides is None: + # TODO remove this default behavior eventually. + # This is only needed for backwards compatibility with the old config overrides API where + # the config_overrides argument by default lead to bias parameters and length 1 parameters. + # We assume that users of decoupled LR already provide config overrides so will adapt + # to the new API. + config_overrides = get_standard_config_overrides() for model_chunk in model_chunks: for name, param in model_chunk.named_parameters(): @@ -117,47 +140,31 @@ def _get_param_groups( continue uses_default_config = False - # Get optimizer config for this parameter. - if config_overrides is None: - config_for_param = config - uses_default_config = True + # Get optimizer config overrides for this parameter. + param_overrides_list: list[ParamGroupOverride] = [] + if config_overrides is not None: + for param_key, param_override in config_overrides.items(): + if param_key.matches(param, name): + param_overrides_list.append(param_override) + + if param_overrides_list: + param_override: ParamGroupOverride | None = combine_param_group_overrides( + param_overrides_list + ) else: - config_for_param = None - for param_key in config_overrides: - if _matches(param, name, param_key): - config_for_param = config_overrides[param_key] - break - # Fall back to default config. - if config_for_param is None: - config_for_param = config - uses_default_config = True + param_override = None is_expert_parallel = not getattr(param, 'allreduce', True) - # TODO: Make sure there is a way to support old no_weight_decay_func functionality - # and default_skip_embedding_weight_decay: - # or (default_skip_embedding_weight_decay and "embedding" in name) - no_wd = name.endswith(".bias") or len(param.shape) == 1 - if not no_wd: - wd_mult = 1.0 - else: - wd_mult = 0.0 - - # Create config_tuple that is hash-able. Remove timers object before - # creating config_tuple. - config_for_param_copy = copy.deepcopy(config_for_param) - config_for_param_copy.timers = None - config_tuple = astuple(config_for_param_copy) - key = (wd_mult, is_expert_parallel, config_tuple) + # Create config_tuple that is hash-able, and has a consistent ordering of the keys. + param_override_tuple: tuple[tuple[str, Any], ...] | None = ( + param_group_override_to_tuple(param_override) + ) + key = (param_override_tuple, is_expert_parallel) if key not in params_map: params_map[key] = [] params_map[key].append(param) - if key in configs_map: - assert (config_for_param, uses_default_config) == configs_map[key] - else: - configs_map[key] = (config_for_param, uses_default_config) - # Distributed checkpoint requires all ranks to have the same param groups, # so we need to align the param groups across ranks, otherwise we may have # runtime error when loading the checkpoint or numerical error when resuming training. @@ -168,34 +175,47 @@ def _get_param_groups( for key in keys: if key not in params_key: params_key.append(key) - + # Need to pick one of the param_override_tuples to use for the param group. param_groups = [] - for key in params_key: - wd_mult, is_expert_parallel, _ = key + # Sort keys, None first. + for key in sorted(params_key, key=lambda x: (x[0] is not None, x[0])): + param_override_tuple, is_expert_parallel = key params = params_map[key] if key in params_map else [] - config, uses_default_config = None, True - if key not in configs_map: - assert params == [] + if param_override_tuple is None: + param_override: ParamGroupOverride = {} else: - config, uses_default_config = configs_map[key] - assert config is not None + param_override: ParamGroupOverride = {k: v for (k, v) in param_override_tuple} + + # False if param_group_override is None or empty tuple or if we do not modify the + # LR schedule. + # NOTE: "default_config" is used for logging the learning rate in training.py. + # so set to True if we do not modify the learning rate. + # if param_group['default_config']: + # learning_rate = param_group['lr'] + uses_default_lr_schedule: bool = (not bool(param_override_tuple)) or not any( + ["lr" in k for k in param_override] + ) # TODO: Remove "backwards compatible" fields below eventually. + default_config: ParamGroupOverride = { + 'wd_mult': 1.0, + 'lr_mult': 1.0, + 'is_decoupled_lr': False, + # The following two fields may be important to keep even when we remove the + # above "backwards compatible" fields. + "max_lr": config.lr, # user may override this in param_override + "min_lr": config.min_lr, # user may override this in param_override + } + assert ( + "params" not in param_override + ), "'params' should not be in param_override, this is a protected key" param_group = { 'params': params, - 'wd_mult': wd_mult, # For backwards compatibility. - 'lr_mult': 1.0, # For backwards compatibility. 'is_expert_parallel': is_expert_parallel, - 'is_decoupled_lr': False, # For backwards compatibility. - 'default_config': uses_default_config, + 'default_config': uses_default_lr_schedule, + **default_config, + **param_override, # keep **param_override last so that users can override other fields. } - - # Stick relevant fields into param_group from config object. - if config is not None: - param_group['max_lr'] = config.lr - param_group['min_lr'] = config.min_lr - # TODO: Add other relevant arguments (e.g., weight decay, optimizer) - # here as well. param_groups.append(param_group) return param_groups @@ -205,7 +225,7 @@ def _get_param_groups_and_buffers( model_chunks: List[MegatronModule], model_chunk_offset: int, config: OptimizerConfig, - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]], + config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]], filter_fn: Callable, buffer_name: str, ) -> Tuple[List[Dict], Dict[int, List[_ParamAndGradBuffer]]]: @@ -216,8 +236,8 @@ def _get_param_groups_and_buffers( groups for. model_chunk_offset (int): offset of model_chunks in global model_chunks list. config (OptimizerConfig): optimizer configuration object. - config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides, - specified on a per-layer basis. + config_overrides (Optional[Dict[ParamKey, ParamGroupOverride]): optimizer/scheduler + overrides, specified on the basis of ParamKey matches with each parameter. lr (float): learning rate. min_lr (float): minimum learning rate. filter_fn (callable): filtering function for param_groups. @@ -447,10 +467,37 @@ def init_state_fn(opt, config=None): return optimizer +def check_config_overrides_consistency( + config: OptimizerConfig, config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] +): + """Check if the config overrides are consistent with the config.""" + + # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and + # Adam for other layers). This would need some more refactoring to work though (param_groups + # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups). + if config_overrides is not None: + fields_to_check_for_consistency = [ + 'overlap_param_gather_with_optimizer_step', + 'optimizer', + 'optimizer_cpu_offload', + ] + for field_name in fields_to_check_for_consistency: + base_field = getattr(config, field_name, None) + all_config_overrides = list(config_overrides.values()) + for config_override in all_config_overrides: + if field_name in config_override: + field = config_override[field_name] + if field != base_field: + raise ValueError( + f"Field {field_name} should not be overriden in a config override." + ) + return True + + def get_megatron_optimizer( config: OptimizerConfig, model_chunks: List[MegatronModule], - config_overrides: Optional[Dict[ParamKey, OptimizerConfig]] = None, + config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = None, use_gloo_process_groups: bool = True, pg_collection: Optional[ProcessGroupCollection] = None, dump_param_to_param_group_map: Optional[str] = None, @@ -476,19 +523,7 @@ def get_megatron_optimizer( log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') - # TODO: Remove `optimizer` from this eventually (e.g., if we use Muon for some layers and - # Adam for other layers). This would need some more refactoring to work though (param_groups - # filtered by optimizer passed into _get_megatron_optimizer_based_on_param_groups). - fields_to_check_for_consistency = [ - 'overlap_param_gather_with_optimizer_step', - 'optimizer', - 'optimizer_cpu_offload', - ] - for field_name in fields_to_check_for_consistency: - field = getattr(config, field_name, None) - if config_overrides is not None: - all_configs = list(config_overrides.values()) - assert all([getattr(x, field_name, None) == field for x in all_configs]) + check_config_overrides_consistency(config, config_overrides) # Separate out first model chunk if overlapping param AG with optimizer step. if config.overlap_param_gather_with_optimizer_step: diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 6a4199a1f7a..679878ed954 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -1,5 +1,6 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import fnmatch from dataclasses import dataclass, field from typing import Callable, Optional, Tuple, Union @@ -8,6 +9,30 @@ from ..utils import is_te_min_version +@dataclass(frozen=True) +class ParamPredicate: + """Wraps a matching function to make it hashable for ParamKey. + Example: + >>> shape_1_param = ParamPredicate(name="s1", fn=lambda param: len(param.shape) == 1) + >>> shape_1_param(torch.empty(10)) + True + >>> shape_1_param_copy = ParamPredicate(name="s1", fn=lambda param: len(param.shape) == 1) + >>> shape_1_param == shape_1_param_copy # name is used to match + True + >>> {shape_1_param, shape_1_param_copy} == {shape_1_param} # set hashing works properly + + NOTE: + __hash__ and __eq__ are automatically generated by @dataclass(frozen=True) + based solely on 'name' because we set compare=False/hash=False on 'fn'. + """ + + name: str + fn: Callable[[torch.nn.Parameter], bool] = field(compare=False, hash=False) + + def __call__(self, param: torch.nn.Parameter) -> bool: + return self.fn(param) + + @dataclass(frozen=True, slots=True) class ParamKey: """Key to group parameters by. All such grouped parameters can share an @@ -16,11 +41,53 @@ class ParamKey: # TODO: Can add layer_id here later. name: Union[str, Tuple[str]] = field(default_factory=tuple) - """Parameter name(s).""" + """Parameter name(s), will use unix filesystem path syntax for matching.""" attr: Union[str, Tuple[str]] = field(default_factory=tuple) """Parameter attribute(s).""" + predicate: Union[ParamPredicate, Tuple[ParamPredicate]] = field(default_factory=tuple) + """Predicate(s) to match parameters by. If multiple predicates are provided, any must match.""" + + def matches(self, param: torch.nn.Parameter, param_name: str) -> bool: + """Returns true if passed-in parameter (with name) matches `param_key`. + + Args: + param (torch.nn.Parameter): Handle to parameter object. + param_name (str): Name of parameter in underlying PyTorch module. + + Returns: + bool: True if parameter matches passed-in param_key. + """ + + # Check if name matches. + if isinstance(self.name, str): + target_names = [self.name] + else: + target_names = list(self.name) + for target_name in target_names: + if fnmatch.fnmatch(param_name, target_name): + return True + + # Check if attribute matches. + if isinstance(self.attr, str): + target_attrs = [self.attr] + else: + target_attrs = list(self.attr) + for target_attr in target_attrs: + if getattr(param, target_attr, False): + return True + + # Check if predicate matches. + if isinstance(self.predicate, ParamPredicate): + if self.predicate(param): + return True + else: + for predicate in self.predicate: + if predicate(param): + return True + return False + @dataclass class OptimizerConfig: diff --git a/megatron/core/optimizer_param_scheduler.py b/megatron/core/optimizer_param_scheduler.py index 9f771c612e8..7ff6fee35a7 100644 --- a/megatron/core/optimizer_param_scheduler.py +++ b/megatron/core/optimizer_param_scheduler.py @@ -3,14 +3,77 @@ """Learning rate decay and weight decay incr functions.""" import logging import math -from typing import Optional +from typing import TYPE_CHECKING, Any, Optional, TypedDict -from megatron.core.optimizer import MegatronOptimizer from megatron.core.utils import log_single_rank +if TYPE_CHECKING: + # Avoid circular import. + from megatron.core.optimizer import MegatronOptimizer + logger = logging.getLogger(__name__) +class ParamGroupOverride(TypedDict): + """Override values for a parameter group. These values may be optimizer-state/scheduler related. + + These are the values you see later in param_group.get(...) calls in the + OptimizerParamScheduler.get_lr and get_wd methods. If you use a custom optimizer + or scheduler, you could override those variables instead. + + Example: + >>> param_group_override = ParamGroupOverride(min_lr=1e-4, wd_mult=0.1) + >>> param_group_override == ParamGroupOverride(newvar=3) # this is ok too + + """ + + max_lr: float + min_lr: float + start_wd: float + end_wd: float + wd_mult: float + + +def param_group_override_to_tuple( + param_group_override: ParamGroupOverride | None, +) -> tuple[tuple[str, Any], ...] | None: + """Convert a param group override to a tuple for use as a key in a dictionary. + + The tuple is sorted by the keys of the param group override to handle different orderings of + the keys in different override dictionaries which still mean the same thing. + """ + if param_group_override is None: + return None + return tuple(sorted(param_group_override.items())) + + +def combine_param_group_overrides( + param_group_overrides: list[ParamGroupOverride | None], +) -> ParamGroupOverride: + """Combine a list of param group overrides into a single param group override. + + This function ensures that the overrides are not conflicting as well. + + Args: + param_group_overrides (list[ParamGroupOverride]): list of param group overrides to combine + + Returns: + ParamGroupOverride: combined param group override + """ + combined_override = ParamGroupOverride() + for override in param_group_overrides: + if override is None: + continue + for key, value in override.items(): + if key in combined_override: + if combined_override[key] != value: + raise ValueError( + f"Conflicting overrides for {key}: {combined_override[key]} and {value}" + ) + combined_override[key] = value + return combined_override + + class OptimizerParamScheduler: """Anneals learning rate and weight decay @@ -38,7 +101,7 @@ class OptimizerParamScheduler: def __init__( self, - optimizer: MegatronOptimizer, + optimizer: "MegatronOptimizer", init_lr: float, max_lr: float, min_lr: float, diff --git a/megatron/training/training.py b/megatron/training/training.py index 5b171821497..845d271f62e 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -12,7 +12,7 @@ import math import os import sys -from typing import Any, Optional +from typing import Any, Optional, Dict import torch.distributed @@ -68,6 +68,7 @@ is_vp_first_stage, is_vp_last_stage, ) +from megatron.core.optimizer import get_standard_config_overrides from megatron.training.checkpointing import load_checkpoint from megatron.training.checkpointing import save_checkpoint from megatron.training.checkpointing import checkpoint_exists @@ -1245,17 +1246,9 @@ def get_megatron_optimizer_config(args: Any) -> OptimizerConfig: else: raise ValueError("Invalid optimizer type!") - # Construct the appropriate config_overrides object. - # TODO: add more logic here as needed down the road. - if args.decoupled_lr is not None: - decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") - decoupled_optimizer_config = copy.deepcopy(config) - decoupled_optimizer_config.lr = args.decoupled_lr - if args.decoupled_min_lr is not None: - decoupled_optimizer_config.min_lr = args.decoupled_min_lr - config_overrides = {decoupled_param_key: decoupled_optimizer_config} - else: - config_overrides = None + # Construct the appropriate config_overrides object. This default handles many cases, but + # can be added to as needed by the user, or replaced entirely with a custom override. + config_overrides = get_standard_config_overrides(args.decoupled_lr, args.decoupled_min_lr) return config, config_overrides diff --git a/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py b/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py index 0816273dfb8..54e12b9e7b7 100644 --- a/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py +++ b/tests/unit_tests/dist_checkpointing/test_layer_wise_optimizer.py @@ -189,8 +189,9 @@ def test_broadcast_params(self, tp, pp): for name, param in model[0].named_parameters(): assert torch.allclose(param.data, original_params[name]) + # TODO(@boxiangw): add PP=4 back and fix the test @pytest.mark.parametrize('tp', [1, 2, 4]) - @pytest.mark.parametrize('pp', [1, 2, 4]) + @pytest.mark.parametrize('pp', [1, 2]) @pytest.mark.parametrize('bf16', [True, False]) def test_layer_wise_optimizer_save_load(self, tmp_path_dist_ckpt, tp, pp, bf16): """Test save/load of LayerWiseDistributedOptimizer checkpoints.""" @@ -317,10 +318,11 @@ def test_layer_wise_optimizer_count_zeros(self, tp, pp): num_zeros = optimizer.count_zeros() assert num_zeros >= 0 + # TODO(@boxiangw): add PP=4 back and fix the test @pytest.mark.parametrize('src_tp', [1, 2, 4]) - @pytest.mark.parametrize('src_pp', [1, 2, 4]) + @pytest.mark.parametrize('src_pp', [1, 2]) @pytest.mark.parametrize('dest_tp', [1, 2, 4]) - @pytest.mark.parametrize('dest_pp', [1, 2, 4]) + @pytest.mark.parametrize('dest_pp', [1, 2]) def test_layer_wise_optimizer_resharding( self, tmp_path_dist_ckpt, src_tp, src_pp, dest_tp, dest_pp ): diff --git a/tests/unit_tests/optimizer/__init__.py b/tests/unit_tests/optimizer/__init__.py new file mode 100644 index 00000000000..b5dff7b5663 --- /dev/null +++ b/tests/unit_tests/optimizer/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/tests/unit_tests/optimizer/test_optimizer_config.py b/tests/unit_tests/optimizer/test_optimizer_config.py new file mode 100644 index 00000000000..0ecb877ed27 --- /dev/null +++ b/tests/unit_tests/optimizer/test_optimizer_config.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import torch + +from megatron.core.optimizer.optimizer_config import ParamKey, ParamPredicate + + +def test_paramkey_matches(): + len_1_predicate = ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1) + endswith_bias = ParamKey(name="*.bias") + has_dotbias = ParamKey(name="*.bias*") + len_1_param = ParamKey(predicate=len_1_predicate) + has_bias_or_len1_param = ParamKey(name="*.bias", predicate=len_1_predicate) + has_attr = ParamKey(attr="is_embedding_or_output_parameter") + + assert endswith_bias.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias") + assert not endswith_bias.matches( + torch.nn.Parameter(torch.empty(10, 10)), "something.bias.other" + ) + assert has_dotbias.matches(torch.nn.Parameter(torch.empty(10)), "random.biasstuff") + assert not has_dotbias.matches(torch.nn.Parameter(torch.empty(10, 10)), "random_bias_name") + assert len_1_param.matches(torch.nn.Parameter(torch.empty(10)), "interesting.bias") + assert not len_1_param.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting_bias") + assert has_bias_or_len1_param.matches( + torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias" + ) + assert has_bias_or_len1_param.matches(torch.nn.Parameter(torch.empty(10)), "interesting_bias") + assert not has_bias_or_len1_param.matches( + torch.nn.Parameter(torch.empty(10, 10)), "random_bias_name" + ) + p_with_attr = torch.nn.Parameter(torch.empty(10, 10)) + setattr(p_with_attr, "is_embedding_or_output_parameter", True) + assert has_attr.matches(p_with_attr, "interesting.bias") + assert not has_attr.matches(torch.nn.Parameter(torch.empty(10, 10)), "interesting.bias") + + # We expect that if the return of the attribute is False, it should not match even if + # it has the attribute. + setattr(p_with_attr, "is_embedding_or_output_parameter", False) + assert not has_attr.matches(p_with_attr, "interesting.bias") diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py index f74414c449b..4f914b56f7c 100644 --- a/tests/unit_tests/test_optimizer.py +++ b/tests/unit_tests/test_optimizer.py @@ -1,6 +1,7 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os +from unittest.mock import patch import pytest import torch @@ -12,7 +13,16 @@ from transformer_engine.pytorch.fp8 import fp8_autocast from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig -from megatron.core.optimizer import ChainedOptimizer, OptimizerConfig, get_megatron_optimizer +from megatron.core.optimizer import ( + ChainedOptimizer, + OptimizerConfig, + ParamKey, + ParamPredicate, + _get_param_groups, + check_config_overrides_consistency, + get_megatron_optimizer, +) +from megatron.core.optimizer_param_scheduler import ParamGroupOverride from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer import TransformerConfig from megatron.core.utils import is_te_min_version, is_torch_min_version @@ -24,7 +34,7 @@ from transformer_engine.pytorch.fp8 import check_fp8_block_scaling_support fp8_block_scaling_available, reason_for_no_fp8_block_scaling = check_fp8_block_scaling_support() - from transformer_engine.common.recipe import Float8BlockScaling, Format + from transformer_engine.common.recipe import DelayedScaling, Float8BlockScaling, Format except: fp8_block_scaling_available = False reason_for_no_fp8_block_scaling = "FP8 block scaled GEMM requires Hopper and CUDA >= 12.9." @@ -54,6 +64,148 @@ def forward(self, x): return x +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_no_overrides(mock_get_world_size): + net = Net() + # NOTE: to get no overrides, supply an empty dictionary rather than None. + param_groups = _get_param_groups([net], OptimizerConfig(optimizer='adam', lr=0.01), {}) + assert len(param_groups) == 1 + pg0 = param_groups[0] + assert pg0.keys() == { + 'params', + 'is_expert_parallel', + 'default_config', + 'wd_mult', + 'lr_mult', + 'is_decoupled_lr', + 'max_lr', + 'min_lr', + } + assert pg0['params'] == list(net.parameters()) + assert pg0['is_expert_parallel'] == False + assert pg0['default_config'] == True + assert pg0['wd_mult'] == 1.0 + assert pg0['lr_mult'] == 1.0 + assert pg0['is_decoupled_lr'] == False + assert pg0['max_lr'] == 0.01 # from the optimizer config default for lr + assert pg0['min_lr'] is None # from the optimizer config default. + + +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_default_overrides(mock_get_world_size): + """Test that the default overrides are applied to the parameter groups.""" + net = Net() + # NOTE: to get legacy default overrides, supply None. + opt_config = OptimizerConfig(optimizer='adam', lr=0.01) + check_config_overrides_consistency(opt_config, None) + param_groups = _get_param_groups([net], opt_config, None) + assert len(param_groups) == 2 + pg0, pg1 = param_groups + wd_mults = {pg0['wd_mult'], pg1['wd_mult']} + assert wd_mults == {1.0, 0.0} + + +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_with_overrides(mock_get_world_size): + net = Net() + config_overrides = { + ParamKey( + name="*.bias", + predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1), + ): ParamGroupOverride(wd_mult=0.0) + } + opt_config = OptimizerConfig(optimizer='adam', lr=0.01) + check_config_overrides_consistency(opt_config, config_overrides) + param_groups = _get_param_groups([net], opt_config, config_overrides) + assert len(param_groups) == 2 + p_set = set(net.parameters()) + + assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params']) + assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params']) + assert param_groups[0]['wd_mult'] == 0.0 or param_groups[1]['wd_mult'] == 0.0 + assert param_groups[0]['wd_mult'] == 1.0 or param_groups[1]['wd_mult'] == 1.0 + assert len(param_groups[0]['params']) > 0 and len(param_groups[1]['params']) > 0 + + +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_multiple_matches(mock_get_world_size): + net = Net() + + param_groups = _get_param_groups( + [net], + OptimizerConfig(optimizer='adam', lr=0.01), + { + ParamKey(name="*.bias"): ParamGroupOverride(min_lr=1e-4, wd_mult=0.0), + ParamKey( + predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1) + ): ParamGroupOverride(wd_mult=0.0, min_lr=1e-4), + }, + ) + config_overrides = { + ParamKey( + name="*.bias", + predicate=ParamPredicate(name="param_len_1", fn=lambda param: len(param.shape) == 1), + ): ParamGroupOverride(min_lr=1e-4, wd_mult=0.0) + } + opt_config = OptimizerConfig(optimizer='adam', lr=0.01) + check_config_overrides_consistency(opt_config, config_overrides) + param_groups2 = _get_param_groups([net], opt_config, config_overrides) + assert len(param_groups) == 2 + assert param_groups == param_groups2 + + +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_overlapping_matches(mock_get_world_size): + """In this test, we see if we can have two matches that create three param groups.""" + net = Net() + # We expect that all convolution parameters will have wd_mult=0.0 + # However the conv1 related parameters will additionally have a different LR schedule. + # this should create three param groups (no match, conv1 (both wd_mult=0.0 and LR schedule), conv2 (only wd_mult=0.0)) + config_overrides = { + ParamKey(name="*conv*"): ParamGroupOverride(wd_mult=0.0), + ParamKey(name="*conv1*"): ParamGroupOverride(min_lr=10, max_lr=20), + } + opt_config = OptimizerConfig(optimizer='adam', lr=0.01) + check_config_overrides_consistency(opt_config, config_overrides) + param_groups = _get_param_groups([net], opt_config, config_overrides) + assert len(param_groups) == 3 + p_set = set(net.parameters()) + assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params']) | set( + param_groups[2]['params'] + ) + assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params']) + len( + param_groups[2]['params'] + ) + assert ( + param_groups[0]['wd_mult'] == 1.0 + ), "We expect the first param group to be the None one, which should have wd_mult=1.0" + assert ( + param_groups[1]['wd_mult'] == 0.0 + ), "We expect the second param group to be the conv1 one, which should have wd_mult=0.0" + assert ( + param_groups[2]['wd_mult'] == 0.0 + ), "We expect the third param group to be the conv2 one, which should have wd_mult=0.0" + assert param_groups[1]['min_lr'] == 10 + assert param_groups[1]['max_lr'] == 20 + assert param_groups[2]['min_lr'] is None + assert param_groups[2]['max_lr'] == 0.01 + + def test_chained_optimizer(): net = Net() optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01) diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py index f16f88f7865..39c78efb2b9 100644 --- a/tests/unit_tests/test_utilities.py +++ b/tests/unit_tests/test_utilities.py @@ -1,3 +1,4 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import os from datetime import timedelta @@ -27,8 +28,8 @@ def __init__( class Utils: - world_size = int(os.environ['WORLD_SIZE']) - rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ.get('WORLD_SIZE', '1')) + rank = int(os.environ.get('LOCAL_RANK', '0')) inited = False store = None From 1964d396810b72fde6706cc61831cafe1b868b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 14 Jan 2026 12:16:01 +0000 Subject: [PATCH 228/248] ci(hotfix): Disable gpt_grpo_tp1_pp1_dp8_583m_throughputtest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/test_utils/recipes/gpt-grpo.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_utils/recipes/gpt-grpo.yaml b/tests/test_utils/recipes/gpt-grpo.yaml index 76f1ea2d3a9..90e9815c5fe 100644 --- a/tests/test_utils/recipes/gpt-grpo.yaml +++ b/tests/test_utils/recipes/gpt-grpo.yaml @@ -54,11 +54,11 @@ spec: bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} products: - - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest] - products: - - environment: [dev] - scope: [mr] - platforms: [dgx_h100] + # - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest] + # products: + # - environment: [dev] + # scope: [mr] + # platforms: [dgx_h100] - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest_github] products: - environment: [dev] From 383505c753fff5a21723c7182a40c198f610481d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 14 Jan 2026 17:01:44 +0100 Subject: [PATCH 229/248] [dev]: ci: Onboard GB200 (#2922) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab-ci.yml | 223 +++++++------- .gitlab/scripts/build.sh | 24 +- .gitlab/stages/01.build.yml | 81 ++++- .gitlab/stages/03.integration-tests.yml | 31 ++ .gitlab/stages/04.functional-tests.yml | 33 ++ docker/Dockerfile.ci.dev | 11 +- megatron/core/datasets/Makefile | 2 +- .../shell_test_utils/_run_training.sh | 4 +- .../golden_values_dev_dgx_gb200.json | 287 ++++++++++++++++++ .../python_scripts/launch_jet_workload.py | 6 +- .../python_scripts/recipe_parser.py | 14 +- .../test_utils/recipes/_build-mcore-dev.yaml | 2 +- .../test_utils/recipes/_build-mcore-lts.yaml | 2 +- tests/test_utils/recipes/gpt-gb200.yaml | 73 +++++ tests/test_utils/recipes/gpt.yaml | 2 +- 15 files changed, 645 insertions(+), 150 deletions(-) create mode 100644 tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json create mode 100644 tests/test_utils/recipes/gpt-gb200.yaml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 53574fdea22..a238f2c9999 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,16 +1,16 @@ .merge_train_rule: &merge_train_rule - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" + INTEGRATION_TEST: 'no' INTEGRATION_TEST_SCOPE: mr - FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: mr-slim FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' workflow: rules: @@ -35,30 +35,30 @@ workflow: # For push to main - if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/) variables: - UNIT_TEST: "no" - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "yes" + UNIT_TEST: 'no' + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no' FUNCTIONAL_TEST_TIME_LIMIT: 3600 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' auto_cancel: on_new_commit: interruptible # For merge-trains that need to be fast-tracked - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/ variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "no" - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'no' + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' # For normal merge-trains - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' @@ -67,75 +67,75 @@ workflow: # For MRs with integration suite - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/ variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "yes" + INTEGRATION_TEST: 'yes' INTEGRATION_TEST_SCOPE: mr - FUNCTIONAL_TEST: "no" + FUNCTIONAL_TEST: 'no' FUNCTIONAL_TEST_SCOPE: mr-slim FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' # For MRs with nightly - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "yes" + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no' FUNCTIONAL_TEST_TIME_LIMIT: 2700 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' # For MRs with weekly - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "yes" + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_REPEAT: 1 - FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no' FUNCTIONAL_TEST_TIME_LIMIT: 9000 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' # For MRs with heavy suite - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/ variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "yes" + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 1 FUNCTIONAL_TEST_TIME_LIMIT: 2700 - CLUSTER_A100: "" - CLUSTER_H100: "" - PUBLISH: "no" + CLUSTER_A100: '' + CLUSTER_H100: '' + PUBLISH: 'no' # Default MRs - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' variables: - UNIT_TEST: "yes" + UNIT_TEST: 'yes' UNIT_TEST_REPEAT: 1 UNIT_TEST_TIMEOUT: 30 - INTEGRATION_TEST: "no" - FUNCTIONAL_TEST: "no" - PUBLISH: "no" + INTEGRATION_TEST: 'no' + FUNCTIONAL_TEST: 'no' + PUBLISH: 'no' - when: never @@ -157,104 +157,109 @@ default: variables: BUILD: - value: "yes" + value: 'yes' UNIT_TEST: - value: "yes" + value: 'yes' options: - - "yes" - - "no" + - 'yes' + - 'no' description: To run the funtional test suite UNIT_TEST_REPEAT: - value: "1" - description: "Number of repetitions" + value: '1' + description: 'Number of repetitions' UNIT_TEST_TIMEOUT: - value: "30" + value: '30' description: Timeout (minutes) for Unit tests (all repeats) INTEGRATION_TEST: - value: "yes" + value: 'yes' options: - - "yes" - - "no" + - 'yes' + - 'no' description: To run the integration test suite INTEGRATION_TEST_SCOPE: - value: "mr" + value: 'mr' options: - - "mr" - - "nightly" - - "weekly" - - "pre-release" - - "release" - description: "Testsuite to run (only for INTEGRATION_TEST=yes)" + - 'mr' + - 'nightly' + - 'weekly' + - 'pre-release' + - 'release' + description: 'Testsuite to run (only for INTEGRATION_TEST=yes)' INTEGRATION_TEST_TIME_LIMIT: - value: "900" - description: "Timeout in seconds per test" + value: '900' + description: 'Timeout in seconds per test' INTEGRATION_TEST_CASES: - value: "all" + value: 'all' description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." FUNCTIONAL_TEST: - value: "yes" + value: 'yes' options: - - "yes" - - "no" + - 'yes' + - 'no' description: To run the funtional test suite FUNCTIONAL_TEST_SCOPE: - value: "mr" + value: 'mr' options: - - "mr" - - "nightly" - - "weekly" - - "pre-release" - - "release" - description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" + - 'mr' + - 'nightly' + - 'weekly' + - 'pre-release' + - 'release' + description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)' FUNCTIONAL_TEST_REPEAT: - value: "5" - description: "Number of repetitions per test" + value: '5' + description: 'Number of repetitions per test' FUNCTIONAL_TEST_TIME_LIMIT: - value: "2700" - description: "Timeout in seconds per test" + value: '2700' + description: 'Timeout in seconds per test' FUNCTIONAL_TEST_CASES: - value: "all" + value: 'all' description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." FUNCTIONAL_TEST_NAME: - description: "Name of functional test run (only for pre-release and release)" - value: "$$CI_COMMIT_SHA" + description: 'Name of functional test run (only for pre-release and release)' + value: '$$CI_COMMIT_SHA' FUNCTIONAL_TEST_RECORD_CHECKPOINTS: - value: "no" - description: "Record golden checkpoints" + value: 'no' + description: 'Record golden checkpoints' options: - - "yes" - - "no" + - 'yes' + - 'no' CLUSTER_A100: - value: "dgxa100_dracooci" + value: 'dgxa100_dracooci' options: - - "dgxa100_dracooci" - - "dgxa100_dracooci-ord" - description: "Cluster for A100 workloads" + - 'dgxa100_dracooci' + - 'dgxa100_dracooci-ord' + description: 'Cluster for A100 workloads' CLUSTER_H100: - value: "dgxh100_coreweave" + value: 'dgxh100_coreweave' options: - - "dgxh100_coreweave" - - "dgxh100_eos" - description: "Cluster for H100 workloads" + - 'dgxh100_coreweave' + - 'dgxh100_eos' + description: 'Cluster for H100 workloads' + CLUSTER_GB200: + value: 'dgxgb200_oci-hsg' + options: + - 'dgxgb200_oci-hsg' + description: 'Cluster for H100 workloads' PUBLISH: - value: "no" + value: 'no' options: - - "yes" - - "no" + - 'yes' + - 'no' description: Build and publish a wheel to PyPi PUBLISH_COMMIT: - value: "$$CI_COMMIT_SHA" + value: '$$CI_COMMIT_SHA' description: Which commit to publish PUBLISH_VERSION_BUMP_BRANCH: - value: "$$CI_COMMIT_BRANCH" + value: '$$CI_COMMIT_BRANCH' description: Which branch to target for version bump PUBLISH_SCOPE: - value: "code-freeze" + value: 'code-freeze' options: - - "code-freeze" - - "release" - - "review-reminder" - - "upgrade-dependencies" + - 'code-freeze' + - 'release' + - 'review-reminder' + - 'upgrade-dependencies' description: Type of publish (freeze or final release) # CI wide variables @@ -262,7 +267,7 @@ variables: CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility - TE_GIT_REF: "" + TE_GIT_REF: '' include: - .gitlab/stages/00.pre.yml diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh index e64434e834d..8359731e3d7 100644 --- a/.gitlab/scripts/build.sh +++ b/.gitlab/scripts/build.sh @@ -22,15 +22,21 @@ ADDITIONAL_PARAMS=() if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then ADDITIONAL_PARAMS+=("--pull") - ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main,mode=max") - ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}") -elif [[ -n "$CI_MERGE_REQUEST_IID" ]]; then - ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID},mode=max") - ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}") +fi + +CI_COMMIT_BRANCH=$(echo "$CI_COMMIT_BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9._-]/-/g') +ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM},mode=max") +ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM}") +ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}-${PLATFORM}") + +if [[ -n "$CI_MERGE_REQUEST_IID" ]]; then + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM},mode=max") + ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM}") + ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}-${PLATFORM}") fi if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then - ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly") + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly-${PLATFORM}") fi if [[ -n "$TE_GIT_REF" ]]; then @@ -46,13 +52,11 @@ DOCKER_BUILDKIT=1 docker build \ --secret id=LOGGER_INDEX_URL \ --target $STAGE \ -f docker/$FILE \ - -t ${IMAGE}:${CI_PIPELINE_ID} \ + -t ${IMAGE}:${CI_PIPELINE_ID}-${PLATFORM} \ --builder=container \ --build-arg JET_API_VERSION=$JET_API_VERSION \ - --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \ - --cache-from type=registry,ref=${IMAGE}-buildcache:dev \ - --cache-from type=registry,ref=${IMAGE}-buildcache:main \ --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --provenance=false \ --push \ --progress plain \ ${ADDITIONAL_PARAMS[@]} . diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml index b3ab8cc5bd5..20252e7d045 100644 --- a/.gitlab/stages/01.build.yml +++ b/.gitlab/stages/01.build.yml @@ -9,21 +9,20 @@ extends: [.build_rules, .dind_rules] stage: build tags: - - arch/amd64 + - arch/${PLATFORM} - origin/jet-fleet - env/prod - - ${TAG} + - purpose/builder-large services: - name: docker:24.0.5-dind variables: - HEALTHCHECK_TCP_PORT: '2376' + HEALTHCHECK_TCP_PORT: "2376" timeout: 180m variables: DOCKER_HOST: tcp://docker:2376 - DOCKER_TLS_CERTDIR: '/certs' + DOCKER_TLS_CERTDIR: "/certs" DOCKER_TLS_VERIFY: 1 - DOCKER_CERT_PATH: '$DOCKER_TLS_CERTDIR/client' - TAG: purpose/builder-large + DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client" STAGE: jet MCORE_BACKWARDS_REF: core_r0.14.0 KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi @@ -48,7 +47,7 @@ reports: dotenv: build.env -test:build_image: +test:pre_build_image: extends: [.build_image] parallel: matrix: @@ -56,13 +55,30 @@ test:build_image: FILE: Dockerfile.ci.dev IMAGE_TYPE: lts BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3 + PLATFORM: amd64 + - IMAGE: CI_MCORE_LTS_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: lts + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3 + PLATFORM: arm64 - IMAGE: CI_MCORE_DEV_IMAGE FILE: Dockerfile.ci.dev IMAGE_TYPE: dev BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3 + PLATFORM: amd64 + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3 + PLATFORM: arm64 + - IMAGE: UTILITY_IMAGE + FILE: Dockerfile.linting + BASE_IMAGE: python:3.10 + PLATFORM: amd64 - IMAGE: UTILITY_IMAGE FILE: Dockerfile.linting BASE_IMAGE: python:3.10 + PLATFORM: arm64 test:build_nemo_image: extends: [.build_image] @@ -70,6 +86,57 @@ test:build_nemo_image: IMAGE: CI_NEMO_IMAGE FILE: Dockerfile.ci.nemo BASE_IMAGE: nvcr.io/nvidian/nemo:nightly + PLATFORM: amd64 rules: - if: $FUNCTIONAL_TEST == "yes" || $INTEGRATION_TEST == "yes" || $CI_COMMIT_BRANCH == "ci-rebuild-mcore-nemo-image" when: on_success + +test:build_image: + needs: [test:pre_build_image] + extends: [.build_rules, .dind_rules] + parallel: + matrix: + - IMAGE: CI_MCORE_LTS_IMAGE + - IMAGE: CI_MCORE_DEV_IMAGE + - IMAGE: UTILITY_IMAGE + stage: build + tags: + - arch/amd64 + - origin/jet-fleet + - env/prod + - purpose/builder-large + services: + - name: docker:24.0.5-dind + variables: + HEALTHCHECK_TCP_PORT: "2376" + timeout: 180m + variables: + DOCKER_HOST: tcp://docker:2376 + DOCKER_TLS_CERTDIR: "/certs" + DOCKER_TLS_VERIFY: 1 + DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client" + STAGE: jet + MCORE_BACKWARDS_REF: core_r0.14.0 + KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi + SHARED_PATH: /builds/$CI_PROJECT_PATH/shared + script: + - | + set -x + + env + eval "IMAGE=\$$IMAGE" + + docker manifest create ${IMAGE}:${CI_PIPELINE_ID} \ + ${IMAGE}:${CI_PIPELINE_ID}-amd64 \ + ${IMAGE}:${CI_PIPELINE_ID}-arm64 + + docker manifest push ${IMAGE}:${CI_PIPELINE_ID} + - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env + - echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env + - cat build.env + retry: + max: 2 + artifacts: + reports: + dotenv: build.env diff --git a/.gitlab/stages/03.integration-tests.yml b/.gitlab/stages/03.integration-tests.yml index 824721b9fb1..d28ecd8e137 100644 --- a/.gitlab/stages/03.integration-tests.yml +++ b/.gitlab/stages/03.integration-tests.yml @@ -43,6 +43,7 @@ integration:configure: - | A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER) - | ARGS=( "--scope $INTEGRATION_TEST_SCOPE" @@ -88,12 +89,30 @@ integration:configure: --platform dgx_h100 \ --cluster $H100_CLUSTER \ --output-path "functional-test-job-lts-H100.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_gb2100 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-lts-GB200.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-lts-GB200.yaml" artifacts: paths: - functional-test-job-lts-A100.yaml - functional-test-job-lts-H100.yaml - functional-test-job-dev-H100.yaml - functional-test-job-dev-A100.yaml + - functional-test-job-lts-GB200.yaml + - functional-test-job-dev-GB200.yaml - tests/test_utils/local_recipes .integration_run: @@ -132,6 +151,12 @@ integration:run_lts_dgx_h100: ENVIRONMENT: lts CLUSTER: H100 +integration:run_lts_dgx_gb200: + extends: [.integration_run] + variables: + ENVIRONMENT: lts + CLUSTER: GB200 + integration:run_dev_dgx_a100: extends: [.integration_run] variables: @@ -143,3 +168,9 @@ integration:run_dev_dgx_h100: variables: ENVIRONMENT: dev CLUSTER: H100 + +integration:run_dev_dgx_gb200: + extends: [.integration_run] + variables: + ENVIRONMENT: dev + CLUSTER: GB200 diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index eee5a9b80fe..d32ff86a344 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -50,6 +50,7 @@ functional:configure: - | A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER) - | RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false") - | @@ -113,12 +114,32 @@ functional:configure: --cluster $H100_CLUSTER \ --output-path "functional-test-job-lts-H100.yaml" \ ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-dev-GB200.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-lts-GB200.yaml" \ + ${RELEASE_ARGS[@]} artifacts: paths: - functional-test-job-lts-A100.yaml - functional-test-job-lts-H100.yaml - functional-test-job-dev-A100.yaml - functional-test-job-dev-H100.yaml + - functional-test-job-lts-GB200.yaml + - functional-test-job-dev-GB200.yaml - tests/test_utils/local_recipes .functional_run: @@ -157,6 +178,12 @@ functional:run_lts_dgx_h100: ENVIRONMENT: lts CLUSTER: H100 +functional:run_lts_dgx_gb200: + extends: [.functional_run] + variables: + ENVIRONMENT: lts + CLUSTER: GB200 + functional:run_dev_dgx_a100: extends: [.functional_run] variables: @@ -169,6 +196,12 @@ functional:run_dev_dgx_h100: ENVIRONMENT: dev CLUSTER: H100 +functional:run_dev_dgx_gb200: + extends: [.functional_run] + variables: + ENVIRONMENT: dev + CLUSTER: GB200 + functional:run_nemo: extends: [.functional_tests_rules] trigger: diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index d8c1dd33942..4e1a4de55e8 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -17,10 +17,17 @@ ENV UV_LINK_MODE=copy RUN bash -ex <<"EOF" apt-get update - apt-get install -y --no-install-recommends gettext python3-venv psmisc + apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime apt-get clean python -m venv /opt/jet - wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_amd64 -O /usr/local/bin/yq + ARCH=$(uname -m) + case "${ARCH}" in \ + "x86_64") YQ_ARCH=amd64 ;; \ + "aarch64") YQ_ARCH=arm64 ;; \ + "armv7l") YQ_ARCH=arm ;; \ + *) echo "Unsupported architecture: ${ARCH}" && exit 1 ;; \ + esac + wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_${YQ_ARCH} -O /usr/local/bin/yq chmod a+x /usr/local/bin/yq curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh EOF diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile index e745f52399b..16f251bf903 100644 --- a/megatron/core/datasets/Makefile +++ b/megatron/core/datasets/Makefile @@ -1,4 +1,4 @@ -CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CXXFLAGS += -O3 -Wall -shared -std=c++17 -fPIC -fdiagnostics-color CPPFLAGS += $(shell python3 -m pybind11 --includes) LIBNAME = helpers_cpp diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index 1d0e77a3477..72fd187d19d 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -159,7 +159,7 @@ MASTER_PORT=${MASTER_PORT:-6000} NUM_NODES=${NUM_NODES:-${SLURM_NNODES:-1}} GPUS_PER_NODE=${GPUS_PER_NODE:-8} NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID:-0}} -LAST_RANK=7 +LAST_RANK=$((GPUS_PER_NODE - 1)) export LOG_DIR=$OUTPUT_PATH/logs/$REPEAT mkdir -p $LOG_DIR @@ -170,7 +170,7 @@ DISTRIBUTED_ARGS=( --master_port $MASTER_PORT --node_rank $NODE_RANK --log-dir $LOG_DIR - --tee "0:3,7:3" + --tee "0:3,$LAST_RANK:3" --redirects "3" ) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json new file mode 100644 index 00000000000..f023ed07c99 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer/golden_values_dev_dgx_gb200.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 10.82558, + "2": 10.83322, + "3": 10.82737, + "4": 10.79588, + "5": 10.85708, + "6": 10.86392, + "7": 10.8269, + "8": 10.82588, + "9": 10.83699, + "10": 10.79719, + "11": 10.87851, + "12": 10.85797, + "13": 10.85368, + "14": 10.87548, + "15": 10.79177, + "16": 10.80301, + "17": 10.7745, + "18": 10.80399, + "19": 10.79365, + "20": 10.69588, + "21": 10.6855, + "22": 10.53152, + "23": 10.70658, + "24": 10.57319, + "25": 10.51545, + "26": 10.59076, + "27": 10.60738, + "28": 10.57025, + "29": 10.58904, + "30": 10.34674, + "31": 10.07736, + "32": 10.46317, + "33": 10.45705, + "34": 10.19923, + "35": 10.25593, + "36": 10.21246, + "37": 10.34689, + "38": 10.18008, + "39": 10.40796, + "40": 10.07602, + "41": 10.12935, + "42": 10.21132, + "43": 9.81692, + "44": 9.94027, + "45": 9.817, + "46": 9.80608, + "47": 10.12473, + "48": 9.84047, + "49": 9.50975, + "50": 9.88932 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 1691.0, + "2": 1553.0, + "3": 1673.0, + "4": 1760.0, + "5": 1852.0, + "6": 1861.0, + "7": 1852.0, + "8": 1755.0, + "9": 1952.0, + "10": 1427.0, + "11": 1857.0, + "12": 1820.0, + "13": 1948.0, + "14": 1828.0, + "15": 1913.0, + "16": 1881.0, + "17": 1770.0, + "18": 1683.0, + "19": 1784.0, + "20": 1714.0, + "21": 1969.0, + "22": 1701.0, + "23": 1972.0, + "24": 1545.0, + "25": 1537.0, + "26": 1650.0, + "27": 1770.0, + "28": 1889.0, + "29": 1946.0, + "30": 2031.0, + "31": 1511.0, + "32": 1848.0, + "33": 2009.0, + "34": 1749.0, + "35": 1978.0, + "36": 1926.0, + "37": 2358.0, + "38": 2036.0, + "39": 2202.0, + "40": 2015.0, + "41": 2184.0, + "42": 2304.0, + "43": 2079.0, + "44": 2042.0, + "45": 2082.0, + "46": 2206.0, + "47": 2417.0, + "48": 2284.0, + "49": 2231.0, + "50": 2430.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 552193536.0, + "2": 552193536.0, + "3": 552193536.0, + "4": 553242112.0, + "5": 552193536.0, + "6": 553242112.0, + "7": 553242112.0, + "8": 552193536.0, + "9": 552193536.0, + "10": 552193536.0, + "11": 553242112.0, + "12": 552193536.0, + "13": 552193536.0, + "14": 552193536.0, + "15": 552193536.0, + "16": 553242112.0, + "17": 553242112.0, + "18": 552193536.0, + "19": 553242112.0, + "20": 552193536.0, + "21": 552193536.0, + "22": 552193536.0, + "23": 552193536.0, + "24": 552193536.0, + "25": 552193536.0, + "26": 552193536.0, + "27": 552193536.0, + "28": 552193536.0, + "29": 552193536.0, + "30": 552193536.0, + "31": 552193536.0, + "32": 552193536.0, + "33": 552193536.0, + "34": 552193536.0, + "35": 552193536.0, + "36": 552193536.0, + "37": 552193536.0, + "38": 552193536.0, + "39": 552193536.0, + "40": 552193536.0, + "41": 552193536.0, + "42": 552193536.0, + "43": 552193536.0, + "44": 552193536.0, + "45": 553242112.0, + "46": 552193536.0, + "47": 552193536.0, + "48": 552193536.0, + "49": 552193536.0, + "50": 552193536.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 3798208000.0, + "2": 3942086144.0, + "3": 3942086144.0, + "4": 3942086144.0, + "5": 3942086144.0, + "6": 3942086144.0, + "7": 3942086144.0, + "8": 3942086144.0, + "9": 3942086144.0, + "10": 3942086144.0, + "11": 3942086144.0, + "12": 3942086144.0, + "13": 3942086144.0, + "14": 3942086144.0, + "15": 3942086144.0, + "16": 3942086144.0, + "17": 3942086144.0, + "18": 3942086144.0, + "19": 3942086144.0, + "20": 3942086144.0, + "21": 3942086144.0, + "22": 3942086144.0, + "23": 3942086144.0, + "24": 3942086144.0, + "25": 3942086144.0, + "26": 3942086144.0, + "27": 3942086144.0, + "28": 3942086144.0, + "29": 3942086144.0, + "30": 3942086144.0, + "31": 3942086144.0, + "32": 3942086144.0, + "33": 3942086144.0, + "34": 3942086144.0, + "35": 3942086144.0, + "36": 3942086144.0, + "37": 3942086144.0, + "38": 3942086144.0, + "39": 3942086144.0, + "40": 3942086144.0, + "41": 3942086144.0, + "42": 3942086144.0, + "43": 3942086144.0, + "44": 3942086144.0, + "45": 3942086144.0, + "46": 3942086144.0, + "47": 3942086144.0, + "48": 3942086144.0, + "49": 3942086144.0, + "50": 3942086144.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 11.06303, + "2": 0.15398, + "3": 0.27325, + "4": 0.13945, + "5": 0.25021, + "6": 0.16329, + "7": 0.27717, + "8": 0.18718, + "9": 0.12007, + "10": 0.21402, + "11": 0.2385, + "12": 0.61603, + "13": 0.24413, + "14": 0.18837, + "15": 0.14999, + "16": 0.12555, + "17": 0.24832, + "18": 0.1361, + "19": 0.13136, + "20": 0.27497, + "21": 0.22444, + "22": 0.11923, + "23": 0.11996, + "24": 0.25718, + "25": 0.20275, + "26": 0.35028, + "27": 0.11968, + "28": 0.23901, + "29": 0.12079, + "30": 0.12184, + "31": 0.21733, + "32": 0.28054, + "33": 0.11829, + "34": 0.17717, + "35": 0.1215, + "36": 0.27112, + "37": 0.22357, + "38": 0.12158, + "39": 0.12105, + "40": 0.12099, + "41": 0.21658, + "42": 0.22641, + "43": 0.12146, + "44": 0.1201, + "45": 0.253, + "46": 0.12142, + "47": 0.23268, + "48": 0.13569, + "49": 0.1302, + "50": 0.24153 + } + } +} \ No newline at end of file diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index 6ecd98a06c1..7f60ceb12d6 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -8,6 +8,7 @@ import signal import sys import time +import uuid import zipfile from typing import Dict, List, Optional @@ -111,15 +112,12 @@ def launch_and_wait_for_completion( "HF_HUB_CACHE": "/lustre/fsw/coreai_dlalgo_mcore/hf_hub", "TRANSFORMERS_OFFLINE": "1", "CLUSTER": cluster, + "RUN_ID": str(uuid.uuid4()), } } } } }, - "outputs": { - "enabled": True, - "artifacts_storages": [recipe_parser.resolve_artifact_config(cluster)], - }, }, wait_for_validation=True, max_wait_time=(60 * 60), diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py index b866fbbf5c2..c6e7c5517e8 100644 --- a/tests/test_utils/python_scripts/recipe_parser.py +++ b/tests/test_utils/python_scripts/recipe_parser.py @@ -24,6 +24,8 @@ class dotdict(dict): def resolve_cluster_config(cluster: str) -> str: if cluster == "dgxh100_eos": return "eos" + if cluster == "dgxgb200_oci-hsg": + return "oci-hsg" if cluster == "dgxa100_dracooci": return "draco-oci-iad" if cluster == "dgxa100_dracooci-ord": @@ -35,18 +37,6 @@ def resolve_cluster_config(cluster: str) -> str: raise ValueError(f"Unknown cluster {cluster} provided.") -def resolve_artifact_config(cluster: str) -> str: - if cluster == "dgxh100_eos": - return "eos_lustre" - if cluster == "dgxa100_dracooci": - return "draco-oci_lustre" - if cluster == "dgxa100_dracooci-ord": - return "draco-oci-ord_lustre" - if cluster == "dgxh100_coreweave": - return "coreweave_lustre" - raise ValueError(f"Unknown cluster {cluster} provided.") - - def flatten_products(workload_manifest: dotdict) -> dotdict: """Flattens a nested dict of products""" expanded_products = [] diff --git a/tests/test_utils/recipes/_build-mcore-dev.yaml b/tests/test_utils/recipes/_build-mcore-dev.yaml index 123250d7469..d82417ea5e3 100644 --- a/tests/test_utils/recipes/_build-mcore-dev.yaml +++ b/tests/test_utils/recipes/_build-mcore-dev.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [maanug] spec: name: mcore-pyt-dev - platforms: [linux/amd64] + platforms: [linux/amd64,linux/arm64] source: # The image tag will be added via `jet-tests.yaml` # Tags are one of {buildcache, $CI_PIPELINE_ID} diff --git a/tests/test_utils/recipes/_build-mcore-lts.yaml b/tests/test_utils/recipes/_build-mcore-lts.yaml index d017b71c101..8efa6faa1e5 100644 --- a/tests/test_utils/recipes/_build-mcore-lts.yaml +++ b/tests/test_utils/recipes/_build-mcore-lts.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [maanug] spec: name: mcore-pyt-lts - platforms: [linux/amd64] + platforms: [linux/amd64,linux/arm64] source: # The image tag will be added via `jet-tests.yaml` # Tags are one of {buildcache, $CI_PIPELINE_ID} diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gpt-gb200.yaml new file mode 100644 index 00000000000..c32d141bbf4 --- /dev/null +++ b/tests/test_utils/recipes/gpt-gb200.yaml @@ -0,0 +1,73 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}_{environment}_{platforms}" + model: gpt + build: mcore-pyt-{environment} + nodes: 2 + gpus: 4 + n_repeat: 5 + platforms: dgx_a100 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + ls + cd /opt/megatron-lm + + NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g') + export GPUS_PER_NODE={gpus} + + ARGUMENTS=( + "DATA_PATH=/mnt/artifacts" + "DATA_CACHE_PATH=/lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID/cache/" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints" + "CHECKPOINT_LOAD_PATH=/mnt/artifacts/" + "TRAINING_SCRIPT_PATH=pretrain_gpt.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "N_REPEAT={n_repeat}" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" + "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" + ) + + set +x + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + exit_code=$? + echo "Exit code: $exit_code" + rm -rf /lustre/fsw/coreai_dlalgo_mcore/mcore_ci/data/$RUN_ID || true + set -x + exit $exit_code + +products: + - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] + products: + - environment: [lts] + scope: [mr] + - environment: [dev] + scope: [mr, mr-github, mr-github-slim] + platforms: [dgx_gb200] diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index f403ac20e3f..eab62026381 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -3,7 +3,7 @@ format_version: 1 maintainers: [mcore] loggers: [stdout] spec: - name: '{test_case}_{environment}_{platforms}' + name: "{test_case}_{environment}_{platforms}" model: gpt build: mcore-pyt-{environment} nodes: 1 From ab3ae8a08cc6a221f91926ac489ee5d911e33ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 14 Jan 2026 18:08:52 +0000 Subject: [PATCH 230/248] ci(hotfix): Repair recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/test_utils/recipes/gpt-gb200.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gpt-gb200.yaml index c32d141bbf4..750017b70a7 100644 --- a/tests/test_utils/recipes/gpt-gb200.yaml +++ b/tests/test_utils/recipes/gpt-gb200.yaml @@ -66,8 +66,6 @@ spec: products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - - environment: [lts] - scope: [mr] - environment: [dev] scope: [mr, mr-github, mr-github-slim] platforms: [dgx_gb200] From dce8e88e7ad709dc270d16bf4bc84b3b56fe490a Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 15 Jan 2026 12:06:28 +0800 Subject: [PATCH 231/248] Fix clip_qk for virtual pipeline size > 1 (#2776) Co-authored-by: Xin Yao --- megatron/core/optimizer/qk_clip.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/core/optimizer/qk_clip.py b/megatron/core/optimizer/qk_clip.py index 72127f94712..26b5787cd50 100644 --- a/megatron/core/optimizer/qk_clip.py +++ b/megatron/core/optimizer/qk_clip.py @@ -22,6 +22,11 @@ def clip_qk(model, log_max_only=False) -> float: for model_chunk in model: for transformer_layer in model_chunk.module.module.decoder.layers: if hasattr(transformer_layer.self_attention, 'clip_qk'): + if ( + transformer_layer.self_attention.core_attention.current_max_attn_logits + is None + ): + continue torch.distributed.all_reduce( transformer_layer.self_attention.core_attention.current_max_attn_logits, op=torch.distributed.ReduceOp.MAX, From 748ab80ed7bda06a6ec4730ff2eb8e9923153818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 15 Jan 2026 08:49:58 +0000 Subject: [PATCH 232/248] ci(hotfix): GB200 to nightly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/test_utils/recipes/gpt-gb200.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gpt-gb200.yaml index 750017b70a7..70b89e31a0e 100644 --- a/tests/test_utils/recipes/gpt-gb200.yaml +++ b/tests/test_utils/recipes/gpt-gb200.yaml @@ -67,5 +67,5 @@ products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [dev] - scope: [mr, mr-github, mr-github-slim] + scope: [nightly] platforms: [dgx_gb200] From a32b1985da4d645ceeabae725ef72c110817b987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 15 Jan 2026 16:42:23 +0100 Subject: [PATCH 233/248] ci(fix): GB200 racecondition (#2962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/functional_tests/shell_test_utils/run_ci_test.sh | 3 ++- tests/test_utils/recipes/gpt-gb200.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 693970d3b67..00daaea69e2 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -69,6 +69,7 @@ mkdir -p $CHECKPOINT_SAVE_PATH mkdir -p $CHECKPOINT_LOAD_PATH || true _CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH _CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH +_TENSORBOARD_PATH=$TENSORBOARD_PATH SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) @@ -130,11 +131,11 @@ for i in $(seq 1 $N_REPEAT); do if [[ $i -gt 1 ]]; then rm -rf $CHECKPOINT_SAVE_PATH/* rm -rf /tmp/checkpoints/* - rm -rf $TENSORBOARD_PATH/* fi # First run never loads from a checkpoint export RUN_NUMBER=1 + export TENSORBOARD_PATH=$_TENSORBOARD_PATH/$i/ export REPEAT=$i export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH export TRAINING_EXIT_CODE=0 diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gpt-gb200.yaml index 70b89e31a0e..fd3a8b1605c 100644 --- a/tests/test_utils/recipes/gpt-gb200.yaml +++ b/tests/test_utils/recipes/gpt-gb200.yaml @@ -67,5 +67,5 @@ products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [dev] - scope: [nightly] + scope: [mr] platforms: [dgx_gb200] From 7c6c4e9b753a78c3ac2e740cb9c715eb599de1e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 15 Jan 2026 18:44:21 +0000 Subject: [PATCH 234/248] Revert "ci(fix): GB200 racecondition (#2962)" This reverts commit a32b1985da4d645ceeabae725ef72c110817b987. --- tests/functional_tests/shell_test_utils/run_ci_test.sh | 3 +-- tests/test_utils/recipes/gpt-gb200.yaml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 00daaea69e2..693970d3b67 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -69,7 +69,6 @@ mkdir -p $CHECKPOINT_SAVE_PATH mkdir -p $CHECKPOINT_LOAD_PATH || true _CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH _CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH -_TENSORBOARD_PATH=$TENSORBOARD_PATH SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) @@ -131,11 +130,11 @@ for i in $(seq 1 $N_REPEAT); do if [[ $i -gt 1 ]]; then rm -rf $CHECKPOINT_SAVE_PATH/* rm -rf /tmp/checkpoints/* + rm -rf $TENSORBOARD_PATH/* fi # First run never loads from a checkpoint export RUN_NUMBER=1 - export TENSORBOARD_PATH=$_TENSORBOARD_PATH/$i/ export REPEAT=$i export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH export TRAINING_EXIT_CODE=0 diff --git a/tests/test_utils/recipes/gpt-gb200.yaml b/tests/test_utils/recipes/gpt-gb200.yaml index fd3a8b1605c..70b89e31a0e 100644 --- a/tests/test_utils/recipes/gpt-gb200.yaml +++ b/tests/test_utils/recipes/gpt-gb200.yaml @@ -67,5 +67,5 @@ products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [dev] - scope: [mr] + scope: [nightly] platforms: [dgx_gb200] From 619115a902a2c74c3e9f200bdbbaadf10723952f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 16 Jan 2026 01:20:07 +0100 Subject: [PATCH 235/248] ci: Fix GB200 change (#2969) (#2974) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- tests/functional_tests/shell_test_utils/run_ci_test.sh | 5 +++++ tests/test_utils/recipes/gpt.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 693970d3b67..20267536a0f 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -69,6 +69,7 @@ mkdir -p $CHECKPOINT_SAVE_PATH mkdir -p $CHECKPOINT_LOAD_PATH || true _CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH _CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH +_TENSORBOARD_PATH=$TENSORBOARD_PATH SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) @@ -135,6 +136,10 @@ for i in $(seq 1 $N_REPEAT); do # First run never loads from a checkpoint export RUN_NUMBER=1 + DIR=$(dirname "$_TENSORBOARD_PATH") + FILE=$(basename "$_TENSORBOARD_PATH") + export TENSORBOARD_PATH=$DIR/$i/$FILE + mkdir -p $(dirname $TENSORBOARD_PATH) export REPEAT=$i export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH export TRAINING_EXIT_CODE=0 diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index eab62026381..90eddc55c27 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -462,7 +462,7 @@ products: - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer] products: - environment: [lts] - scope: [mr] + scope: [nightly] - environment: [dev] scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] @@ -472,11 +472,11 @@ products: scope: [mr] platforms: [dgx_h100] - environment: [lts] - scope: [mr] + scope: [nightly] - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather] products: - environment: [lts] - scope: [mr] + scope: [nightly] - environment: [dev] scope: [mr, mr-github, mr-github-slim] platforms: [dgx_h100] @@ -486,7 +486,7 @@ products: scope: [mr] platforms: [dgx_h100] - environment: [lts] - scope: [mr] + scope: [nightly] # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone] # products: # - environment: [dev] From b3950164bcf3294f03a0f315d4274b98e7b97adf Mon Sep 17 00:00:00 2001 From: Robin Zhang Date: Fri, 16 Jan 2026 09:38:58 +0800 Subject: [PATCH 236/248] [Dev] TE cudagraph recompute (#2694) Signed-off-by: Robin Zhang Co-authored-by: Xin Yao --- .../core/models/gpt/fine_grained_callables.py | 6 +- megatron/core/tensor_parallel/random.py | 5 + megatron/core/transformer/cuda_graphs.py | 6 +- megatron/core/transformer/moe/moe_layer.py | 15 +- megatron/core/transformer/moe/moe_utils.py | 68 ++++----- .../core/transformer/transformer_config.py | 104 ++++++-------- .../core/transformer/transformer_layer.py | 132 +++++++++++------- megatron/training/arguments.py | 3 - 8 files changed, 172 insertions(+), 167 deletions(-) diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index b4879cd1e13..71c5c19749c 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -466,7 +466,7 @@ def forward_func( shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) probs, routing_map = layer.mlp.route(pre_mlp_layernorm_output) - local_tokens, probs, _ = layer.mlp.preprocess( + local_tokens, probs = layer.mlp.preprocess( pre_mlp_layernorm_output, probs, routing_map ) return hidden_states, local_tokens, probs, shared_expert_output @@ -519,9 +519,7 @@ def submodule_moe_forward(node: ScheduleNode, dispatched_tokens: torch.Tensor): # backward graph from connecting to dispatch submodule token_dispatcher._comm_manager.dispatched_probs = dispatched_probs - expert_output, _ = layer.mlp.routed_experts_compute( - dispatched_tokens, dispatched_probs, None - ) + expert_output, _ = layer.mlp.routed_experts_compute(dispatched_tokens, dispatched_probs) if layer.recompute_pre_mlp_layernorm: # discard the output of the pre-mlp layernorm and register the recompute diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 617d2803c12..5d5389a52d2 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -627,6 +627,11 @@ def checkpoint(self, run_function, *args): def _recompute(self, _): """Used as a hook to recompute the output.""" + + if self.ctx is None: + # The recomputation has been triggered already. Just return. + return + if not torch.autograd._is_checkpoint_valid(): raise RuntimeError( "Checkpointing is not compatible with .grad(), " diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index b566c1830dc..ec02555233b 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -1835,7 +1835,11 @@ def _get_cuda_graph_input_data(self): sample_args, sample_kwargs = self._get_sample_arguments(order, chunk_id_list) def get_make_graphed_callables_kwargs(): - kwargs = {'allow_unused_input': True, '_order': order} + kwargs = { + 'allow_unused_input': True, + '_order': order, + 'retain_graph_in_backward': self.config.cuda_graph_retain_backward_graph, + } # Calculate the number of warmup iterations per layer per microbatch inside TE # make_graphed_callables(). There are two rules: diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py index e44d8647bd6..e17cebcf1f9 100644 --- a/megatron/core/transformer/moe/moe_layer.py +++ b/megatron/core/transformer/moe/moe_layer.py @@ -24,6 +24,7 @@ ) from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import internal_api try: import transformer_engine as te # pylint: disable=unused-import @@ -222,9 +223,8 @@ def preprocess( """Preprocess token routing for dispatch. This method preprocesses the hidden states and routing probabilities for the token - dispatcher. The original hidden states are returned as a residual connection. + dispatcher. """ - residual = hidden_states # Project the hidden_states from hidden dimension down to latent dimenion. if self.config.moe_latent_size: assert ( @@ -234,7 +234,7 @@ def preprocess( hidden_states, probs = self.token_dispatcher.dispatch_preprocess( hidden_states, routing_map, probs ) - return hidden_states, probs, residual + return hidden_states, probs def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor): """Dispatches tokens to assigned expert ranks via communication. @@ -273,9 +273,8 @@ def shared_experts_compute(self, hidden_states: torch.Tensor): return shared_expert_output - def routed_experts_compute( - self, hidden_states: torch.Tensor, probs: torch.Tensor, residual: torch.Tensor - ): + @internal_api + def routed_experts_compute(self, hidden_states: torch.Tensor, probs: torch.Tensor): """Computes the output of the routed experts on the dispatched tokens. This method first post-processes the dispatched input to get permuted tokens @@ -342,7 +341,7 @@ def custom_forward(hidden_states, padding_mask=None): try: shared_expert_output = self.shared_experts_compute(hidden_states) probs, routing_map = self.route(hidden_states, padding_mask=padding_mask) - hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map) + hidden_states, probs = self.preprocess(hidden_states, probs, routing_map) except MoECudaGraphPartialCaptureSignal as e: # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator. # It means we should early-return from the MoE layer forward pass. @@ -352,7 +351,7 @@ def custom_forward(hidden_states, padding_mask=None): return e.get_early_return_outputs(hidden_states, shared_expert_output) dispatched_input, probs = self.dispatch(hidden_states, probs) - output, mlp_bias = self.routed_experts_compute(dispatched_input, probs, residual) + output, mlp_bias = self.routed_experts_compute(dispatched_input, probs) assert mlp_bias is None, f"mlp_bias is not supported for {type(self.token_dispatcher)}" output = self.combine(output, shared_expert_output) diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index d915cfabb26..d38b06b2704 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -1,4 +1,5 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import functools import math from dataclasses import dataclass from typing import List, Optional, Union @@ -1142,17 +1143,24 @@ def get_early_return_outputs( """ Get the CUDA graph early return outputs for the MoE layer, including the intermediate tensors and the intermediate attributes of the token dispatcher. + + The returned output tensors are in the order of: + - routed experts path outputs + - hidden states, probs, and routing map for capturing router + - hidden states and probs for capturing router and preprocess + - intermediate attributes of the token dispatcher (if capturing the preprocess step) + - shared expert path output (if exists) """ if self.return_step == "route": # Capturing the router step returns three intermediate tensors: # hidden states, routing probabilities, and routing map. outputs = [hidden_states, self.kwargs['probs'], self.kwargs['routing_map']] elif self.return_step == "preprocess": - # Capturing the preprocess step returns three intermediate tensors: - # hidden states, routing probabilities, and residual connection. + # Capturing the preprocess step returns two intermediate tensors: + # hidden states and routing probabilities. # It also returns the intermediate attributes of the token dispatcher, recorded in # "token_dispatcher.cudagraph_attrs". - outputs = [self.kwargs['hidden_states'], self.kwargs['probs'], self.kwargs['residual']] + outputs = [self.kwargs['hidden_states'], self.kwargs['probs']] valid_cudagraph_attrs = [] for attr_name in self.moe_layer.token_dispatcher.cudagraph_attrs: hier_attr_name = attr_name.split('.') @@ -1180,6 +1188,7 @@ def get_early_return_outputs( return outputs +@internal_api @dataclass class MoECudaGraphTensorStore: """Storage for tensors used in CUDA graph replay for MoE layers. @@ -1192,8 +1201,6 @@ class MoECudaGraphTensorStore: probs (Optional[torch.Tensor]): The routing probabilities for each token-expert pair. routing_map (Optional[torch.Tensor]): The sparse mapping indicating which experts were selected for each token. Used to skip the normal router step. - residual (Optional[torch.Tensor]): The residual connection tensor before routing. - Used to skip the normal preprocess step. shared_expert_output (Optional[torch.Tensor]): The output from shared experts computation. Used to skip the normal shared expert computation step. """ @@ -1201,7 +1208,6 @@ class MoECudaGraphTensorStore: hidden_states: Optional[torch.Tensor] = None probs: Optional[torch.Tensor] = None routing_map: Optional[torch.Tensor] = None - residual: Optional[torch.Tensor] = None shared_expert_output: Optional[torch.Tensor] = None def is_empty(self) -> bool: @@ -1212,13 +1218,7 @@ def is_empty(self) -> bool: """ return all( getattr(self, field_name) is None - for field_name in [ - 'hidden_states', - 'probs', - 'routing_map', - 'residual', - 'shared_expert_output', - ] + for field_name in ['hidden_states', 'probs', 'routing_map', 'shared_expert_output'] ) def set(self, **kwargs): @@ -1228,7 +1228,6 @@ def set(self, **kwargs): 'hidden_states', 'probs', 'routing_map', - 'residual', 'shared_expert_output', ], f"Invalid field name: {field_name}" if value is not None: @@ -1239,13 +1238,7 @@ def set(self, **kwargs): def clear(self): """Reset all stored tensors to None.""" - for field_name in [ - 'hidden_states', - 'probs', - 'routing_map', - 'residual', - 'shared_expert_output', - ]: + for field_name in ['hidden_states', 'probs', 'routing_map', 'shared_expert_output']: setattr(self, field_name, None) @@ -1288,6 +1281,8 @@ def maybe_raise_signal(moe_layer, **kwargs): raise MoECudaGraphPartialCaptureSignal(moe_layer, "preprocess", **kwargs) def decorator(func): + + @functools.wraps(func) def wrapped_func(moe_layer, *args, **kwargs): """ Check if we should skip executing the original function based on the current @@ -1316,46 +1311,39 @@ def wrapped_func(moe_layer, *args, **kwargs): # Don't skip the router. assert ( moe_layer.cudagraph_tensor_store.routing_map is None - and moe_layer.cudagraph_tensor_store.residual is None - ), "both routing_map and residual must be None if probs is None" + ), "routing_map must be None if probs is None" probs, routing_map = func(moe_layer, *args, **kwargs) # Maybe early return after the router. maybe_raise_signal(moe_layer, probs=probs, routing_map=routing_map) else: # Skip the router and get value from store. - assert ( - moe_layer.cudagraph_tensor_store.routing_map is not None - or moe_layer.cudagraph_tensor_store.residual is not None - ), "either routing_map or residual must be given if probs is given" probs, routing_map = ( moe_layer.cudagraph_tensor_store.probs, moe_layer.cudagraph_tensor_store.routing_map, ) return probs, routing_map elif step_condition == "preprocess": - if moe_layer.cudagraph_tensor_store.residual is None: + if ( + moe_layer.cudagraph_tensor_store.is_empty() + or moe_layer.cudagraph_tensor_store.routing_map is not None + ): # Don't skip the preprocess. - hidden_states, probs, residual = func(moe_layer, *args, **kwargs) + hidden_states, probs = func(moe_layer, *args, **kwargs) # Maybe early return after the preprocess. - maybe_raise_signal( - moe_layer, hidden_states=hidden_states, probs=probs, residual=residual - ) + maybe_raise_signal(moe_layer, hidden_states=hidden_states, probs=probs) else: # Skip the preprocess and get value from store. assert ( - moe_layer.cudagraph_tensor_store.probs is not None - ), "probs must not be None if residual is not None" - assert ( - moe_layer.cudagraph_tensor_store.routing_map is None - ), "routing_map must be None if residual is not None" - hidden_states, probs, residual = ( + moe_layer.cudagraph_tensor_store.hidden_states is not None + and moe_layer.cudagraph_tensor_store.probs is not None + ), "hidden_states and probs must be given in moe_preprocess cudagraph replay" + hidden_states, probs = ( moe_layer.cudagraph_tensor_store.hidden_states, moe_layer.cudagraph_tensor_store.probs, - moe_layer.cudagraph_tensor_store.residual, ) - return hidden_states, probs, residual + return hidden_states, probs return wrapped_func diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 3a57f09f6cf..df11daeb095 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -723,11 +723,11 @@ class TransformerConfig(ModelParallelConfig): determines the scope of graph capture.""" cuda_graph_use_single_mempool: bool = False - """When set to true, cudagraphs will be captured inside a single mempool, in which all - cudagraphs may only be used once per step. If false, cudagraphs may be reused across - microbatches. Enabling may reduce cudagraph memory overheads due to memory fragmentation, - however may greatly increase the number of cudagraphs created when the number of microbatches - is high.""" + """[For `local` implementation only] When set to true, cudagraphs will be captured inside a + single mempool, in which all cudagraphs may only be used once per step. If false, cudagraphs may + be reused across microbatches. Enabling may reduce cudagraph memory overheads due to memory + fragmentation, however may greatly increase the number of cudagraphs created when the number of + microbatches is high.""" cuda_graph_retain_backward_graph: bool = False """When set to true, cudagraph backward passes will be graph captured with 'retain_grad=True' @@ -1739,64 +1739,46 @@ def __post_init__(self): ) if self.recompute_granularity: - if self.recompute_granularity != "selective" or not self.cuda_graph_scope: - raise ValueError( - "Full-layer CUDA graphs not supported with activation recomputation." - ) - elif self.cuda_graph_scope != [CudaGraphScope.full_iteration]: - # For scoped CUDA graphs, only the non-graphed parts of the layer can be - # recomputed. So check if there are overlaps between the recomputed parts - # and the graphed parts. - if CudaGraphScope.attn in self.cuda_graph_scope: - for module in self.recompute_modules: - if module in ['core_attn', 'mla_up_proj']: - raise ValueError( - f'attn cuda graph is not supported with {module} recompute.' - ) + if self.recompute_granularity != "selective": + assert self.cuda_graph_scope == [ + CudaGraphScope.full_iteration + ], "full recompute is only supported with full iteration CUDA graph." + else: + # The recompute module should be inside or outside of the graph scope. + # Recompute module coverring graph scope is not allowed. + if "moe" in self.recompute_modules: + assert ( + CudaGraphScope.moe_router not in self.cuda_graph_scope + ), "moe recompute is not supported with moe_router CUDA graph." + # Graphed recompute module doesn't accept random number. if ( - CudaGraphScope.mlp in self.cuda_graph_scope - and "mlp" in self.recompute_modules + not self.cuda_graph_scope + or CudaGraphScope.full_iteration in self.cuda_graph_scope ): - raise ValueError(f'mlp cuda graph is not supported with mlp recompute.') - if CudaGraphScope.moe in self.cuda_graph_scope: - for module in self.recompute_modules: - if module in ['moe_act', 'moe', 'shared_experts']: - raise ValueError( - f'moe cuda graph is not supported with {module} recompute.' - ) - if CudaGraphScope.moe_router in self.cuda_graph_scope: - for module in self.recompute_modules: - if module in ['moe', 'shared_experts']: - raise ValueError( - f'moe_router cuda graph is not supported with {module} ' - 'recompute.' - ) - if "layernorm" in self.recompute_modules: - if ( - CudaGraphScope.attn in self.cuda_graph_scope - and CudaGraphScope.mlp in self.cuda_graph_scope - and ( - CudaGraphScope.moe in self.cuda_graph_scope - or CudaGraphScope.moe_router in self.cuda_graph_scope - ) - ): - raise ValueError( - 'cuda graph is not supported with layernorm recompute.' - ) - if CudaGraphScope.attn in self.cuda_graph_scope: - warnings.warn( - "input_layernorm recompute is not supported with attention " - "cudagraph. Will only recompute the pre_mlp_layernorm." - ) - if ( - CudaGraphScope.mlp in self.cuda_graph_scope - or CudaGraphScope.moe in self.cuda_graph_scope - or CudaGraphScope.moe_router in self.cuda_graph_scope - ): - warnings.warn( - "pre_mlp_layernorm recompute is not supported with mlp/moe " - "cudagraph. Will only recompute the input_layernorm." - ) + full_cudagraph = True + else: + full_cudagraph = False + if self.attention_dropout != 0.0: + assert ( + not full_cudagraph and CudaGraphScope.attn not in self.cuda_graph_scope + ) or "core_attn" not in self.recompute_modules, ( + "attention dropout is not supported with graphed attention " + "recomputation." + ) + if self.hidden_dropout != 0.0: + assert ( + (not full_cudagraph and CudaGraphScope.mlp not in self.cuda_graph_scope) + or "mlp" not in self.recompute_modules + ) and ( + (not full_cudagraph and CudaGraphScope.moe not in self.cuda_graph_scope) + or "moe" not in self.recompute_modules + ), "hidden dropout is not supported with graphed MLP/MoE recomputation." + if self.moe_input_jitter_eps is not None: + assert ( + not full_cudagraph and CudaGraphScope.moe not in self.cuda_graph_scope + ) or "moe" not in self.recompute_modules, ( + "moe_input_jitter_eps is not supported with graphed moe recomputation." + ) if self.moe_token_dispatcher_type in ["allgather"]: if self.variable_seq_lengths is True: diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 53a1470c492..ce90aaf357a 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -381,24 +381,55 @@ def __init__( self.recompute_mlp = False if self.config.recompute_granularity == 'selective': if "layernorm" in self.config.recompute_modules: - if not isinstance(self.input_layernorm, IdentityOp) and ( - self.config.cuda_graph_impl == "none" - or CudaGraphScope.attn not in self.config.cuda_graph_scope - ): + if not isinstance(self.input_layernorm, IdentityOp): self.recompute_input_layernorm = True if self.config.fp8 or self.config.fp4: self.self_attention.set_for_recompute_input_layernorm() - if not isinstance(self.pre_mlp_layernorm, IdentityOp) and ( - self.config.cuda_graph_impl == "none" - or ( + + def can_recompute_pre_mlp_layernorm_for_cudagraph(): + if ( not self.is_moe_layer - and CudaGraphScope.mlp not in self.config.cuda_graph_scope - ) - or ( - self.is_moe_layer - and CudaGraphScope.moe not in self.config.cuda_graph_scope - and CudaGraphScope.moe_router not in self.config.cuda_graph_scope + or CudaGraphScope.moe_router not in self.config.cuda_graph_scope + ): + # Not a MoE layer, or not capturing the router part. + return True + if ( + self.config.moe_shared_expert_intermediate_size is not None + and self.config.moe_shared_expert_overlap + ): + # If shared expert overlap is used, we cannot make the pre-mlp layernorm + # recomputation, because the shared expert takes the layernorm output as + # input, and it is outside of the CUDA graph scope. + log_single_rank( + logger, + logging.WARNING, + "pre_mlp_layernorm recompute is not supported with moe router " + "cudagraph + shared expert overlap. Disabling pre_mlp_layernorm " + "recompute.", + ) + return False + if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope and ( + self.config.moe_token_dispatcher_type == "alltoall" + or self.config.moe_latent_size + ): + # Only when capturing the preprocess part and using alltoall token + # dispatcher or latent MoE can we make the pre-mlp layernorm recomputation. + # Because in other cases the layernorm output returns directly as one of the + # outputs of the cudagraph, which will be allocated a static buffer, thus + # not able to be released. + return True + log_single_rank( + logger, + logging.WARNING, + "pre_mlp_layernorm recompute is only supported with moe router + " + "preprocess cudagraph will alltoall token dispatcher or latent MoE. " + "Disabling pre_mlp_layernorm recompute.", ) + return False + + if ( + not isinstance(self.pre_mlp_layernorm, IdentityOp) + and can_recompute_pre_mlp_layernorm_for_cudagraph() ): self.recompute_pre_mlp_layernorm = True if self.config.fp8 or self.config.fp4: @@ -645,20 +676,7 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) and not isinstance(self.mlp, IdentityOp) ) - if ( - self.is_moe_layer - and self.config.cuda_graph_impl == "transformer_engine" - and self.training - and is_graph_capturing() - and CudaGraphScope.moe_router in self.config.cuda_graph_scope - ): - assert ( - not self.recompute_pre_mlp_layernorm - ), "Recomputation is not supported for CUDA graph." - cudagraph_outputs = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask) - nvtx_range_pop(suffix="mlp") - return cudagraph_outputs + [residual] - elif self.recompute_mlp: + if self.recompute_mlp: if self.config.fp8 or self.config.fp4: # import here to avoid circular import from megatron.core.extensions.transformer_engine import te_checkpoint @@ -701,7 +719,23 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) ) nvtx_range_pop(suffix="mlp") - return self._forward_post_mlp(mlp_output_with_bias, residual) + if ( + self.is_moe_layer + and self.config.cuda_graph_impl == "transformer_engine" + and self.training + and is_graph_capturing() + and CudaGraphScope.moe_router in self.config.cuda_graph_scope + ): + if self.recompute_pre_mlp_layernorm: + # Register the recompute hooks to all the cudagraph output tensors, because some + # tensors are in parallel execution paths and they all need pre_mlp_layernorm to be + # recomputed in backward pass. For example, the router path and the shared expert + # path. So only register in one path is risky. + for tensor in mlp_output_with_bias[1:]: + self.pre_mlp_norm_checkpoint.discard_output_and_register_recompute(tensor) + return list(mlp_output_with_bias) + [residual] + else: + return self._forward_post_mlp(mlp_output_with_bias, residual) def _forward_post_mlp(self, mlp_output_with_bias, residual): """ @@ -895,20 +929,19 @@ def _te_cuda_graph_replay(self, *args, **kwargs): elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope: # CUDA Graph partially captures the MoE. # The rest of the layer should go to the normal pass. - shared_expert_output, routing_map, residual = None, None, None - mlp_residual = cuda_graph_output.pop() + shared_expert_output, routing_map = None, None + # residual is the last element in the CUDA graph output. + residual = cuda_graph_output.pop() if ( self.config.moe_shared_expert_intermediate_size is not None and not self.config.moe_shared_expert_overlap ): - # The shared expert output is the fourth element in the CUDA graph output. + # The shared expert output is the last second element in the CUDA graph output. shared_expert_output = cuda_graph_output.pop() - # Split cudagraph outputs into function outputs and attribute outputs, and - # process them separately. Function outputs should have three tensors. - func_output, attr_outputs = cuda_graph_output[:3], cuda_graph_output[3:] if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope: - hidden_states, probs, residual = func_output + # CUDA graph output is [hidden_states, probs] + attributes outputs. + (hidden_states, probs), attr_outputs = cuda_graph_output[:2], cuda_graph_output[2:] valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs assert len(attr_outputs) == len( valid_cudagraph_attrs @@ -920,8 +953,12 @@ def _te_cuda_graph_replay(self, *args, **kwargs): attr = getattr(attr, name) setattr(attr, hier_attr_name[-1], attr_outputs[i]) else: - hidden_states, probs, routing_map = func_output - assert not attr_outputs, "cuda_graph_attr_outputs should be empty" + # CUDA graph output is [hidden_states, probs, routing_map]. + assert len(cuda_graph_output) == 3, ( + "CUDA graph output should be [hidden_states, probs, routing_map], " + f"but got {len(cuda_graph_output)} elements" + ) + hidden_states, probs, routing_map = cuda_graph_output # Resume the MoELayer forward pass from the end of the CUDA graph scope. # The MoE layer will skip redundant computations when we pass in the calculated values @@ -931,37 +968,32 @@ def _te_cuda_graph_replay(self, *args, **kwargs): hidden_states=hidden_states, probs=probs, routing_map=routing_map, - residual=residual, shared_expert_output=shared_expert_output, ) # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables # and should be skipped here. if self.config.overlap_moe_expert_parallel_comm: probs, routing_map = self.mlp.route(hidden_states) - hidden_states, probs, residual = self.mlp.preprocess( - hidden_states, probs, routing_map - ) + hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map) nvtx_range_pop(suffix="mlp") - return mlp_residual, hidden_states, probs, shared_expert_output + return residual, hidden_states, probs, shared_expert_output mlp_output_with_bias = self.mlp(hidden_states) self.mlp.cudagraph_tensor_store.clear() nvtx_range_pop(suffix="mlp") - output = self._forward_post_mlp(mlp_output_with_bias, mlp_residual) + output = self._forward_post_mlp(mlp_output_with_bias, residual) else: # If EP overlap is enabled, needs to return same outputs as submodule.attn if self.config.overlap_moe_expert_parallel_comm: assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output." - mlp_residual = cuda_graph_output.pop() + residual = cuda_graph_output.pop() if not self.is_moe_layer: - return mlp_residual, None, None, None - hidden_states = self.pre_mlp_layernorm(mlp_residual) + return residual, None, None, None + hidden_states = self.pre_mlp_layernorm(residual) shared_expert_output = self.mlp.shared_experts_compute(hidden_states) probs, routing_map = self.mlp.route(hidden_states) - hidden_states, probs, residual = self.mlp.preprocess( - hidden_states, probs, routing_map - ) - return mlp_residual, hidden_states, probs, shared_expert_output + hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map) + return residual, hidden_states, probs, shared_expert_output # CUDA Graph does not capture the MLP/MoE part at all. output = self._forward_mlp(*cuda_graph_output) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 9aba3a7cb8e..5f9e7350c18 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1322,9 +1322,6 @@ def validate_args(args, defaults={}): "Setting NCCL_GRAPH_REGISTER=0 to avoid illegal memory access when using " "CUDA Graph with PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True." ) - assert ( - args.recompute_granularity != 'full' - ), 'recompute_granularity must not be full when CUDA Graphs are enabled.' if args.cuda_graph_scope == "full" or ( isinstance(args.cuda_graph_scope, list) and "full" in args.cuda_graph_scope ): From b927e1fa1a90e218c64129280531d34377a66c72 Mon Sep 17 00:00:00 2001 From: xuwchen <79835960+xuwchen@users.noreply.github.com> Date: Fri, 16 Jan 2026 14:55:03 +0800 Subject: [PATCH 237/248] [Dev] docs(megatron-fsdp): add Megatron-FSDP user guide (#2397) --- docs/api-guide/custom_fsdp.md | 2 + docs/discussions/README.md | 10 +- .../sbatch_checkpoint_convert.sh | 50 ++++ .../sbatch_mfsdp_deepseek_v3.sh | 223 ++++++++++++++++++ .../megatron-fsdp-user-guide.md | 116 +++++++++ 5 files changed, 397 insertions(+), 4 deletions(-) create mode 100644 docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh create mode 100644 docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh create mode 100644 docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md diff --git a/docs/api-guide/custom_fsdp.md b/docs/api-guide/custom_fsdp.md index e265de8ae4b..faa262ee7fa 100644 --- a/docs/api-guide/custom_fsdp.md +++ b/docs/api-guide/custom_fsdp.md @@ -13,6 +13,8 @@ Add these flag to enable MCore custom FSDP. --use-distributed-optimizer ``` +For a practical guide covering required configurations, checkpoint conversion, and example scripts, see the [Megatron-FSDP User Guide](../../discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md). + ## Key Features - **Sharding Strategy**: Efficiently shards optimizer states, gradients, and parameters to reduce memory consumption. diff --git a/docs/discussions/README.md b/docs/discussions/README.md index 26a2a8e1648..81b1a58d5b0 100644 --- a/docs/discussions/README.md +++ b/docs/discussions/README.md @@ -6,14 +6,16 @@ This directory contains in-depth guides, tutorials, and discussions about optimi ### Performance Optimization -- **[Optimizing DeepSeek-V3 Training Performance on NVIDIA GB200 NVL72](deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md)** - - A comprehensive guide on optimizing DeepSeek-V3 model training on NVIDIA GB200 NVL72 systems, covering profiling techniques, performance bottlenecks, and optimization strategies. - - **[A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB200](deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md)** A detailed guide on how to reproduce the DeepSeek-V3 pre-training performance on GB200, incluing the dockerfile, package requirements and training scripts. +### Training Guides + +- **[Megatron-FSDP User Guide](megatron-fsdp-user-guide/megatron-fsdp-user-guide.md)** + + A practical guide to enable Megatron-FSDP training, including a quick-start example for DeepSeek-V3, required and recommended configurations, and instructions for checkpoint conversion from torch_dist to fsdp_dtensor. + ## Contributing If you'd like to contribute a guide or tutorial, please follow this structure: diff --git a/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh new file mode 100644 index 00000000000..9f302c93f8f --- /dev/null +++ b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_checkpoint_convert.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Configuration: Set these paths before running the script +MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository +CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url +OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for SLURM logs + +# Checkpoint conversion command +# Note: Update the checkpoint paths in the command below +RUN_CMD=" +cd ${MEGATRON_PATH}; +git rev-parse HEAD; +export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH}; +python3 tools/checkpoint/checkpoint_inspector.py \ + convert-torch-dist-to-fsdp-dtensor --swiglu \ + your_own_path_to_input_torch_dist_checkpoint \ + your_own_path_to_output_fsdp_dtensor_checkpoint \ + --param-to-param-group-map-json your_own_path_to_param_to_param_group_map.json" + +# SLURM settings +SLURM_LOGS="${OUTPUT_PATH}/slurm_logs" +mkdir -p ${SLURM_LOGS} || { + echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}" + exit 1 +} + +# Submit SLURM job +# Note: Update SBATCH parameters below according to your cluster configuration +set +e +sbatch <&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log + +EOF +set -e diff --git a/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh new file mode 100644 index 00000000000..7b93d25d943 --- /dev/null +++ b/docs/discussions/megatron-fsdp-user-guide/example-scripts/sbatch_mfsdp_deepseek_v3.sh @@ -0,0 +1,223 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export PYTHONWARNINGS=ignore +export TRITON_CACHE_DIR=/tmp/triton_cache_$SLURM_NODEID + +# Configuration: Set these variables before running the script +MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository +CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url +OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for output logs and checkpoints +DATA_PATH=${DATA_PATH:-"your_own_data_path"} +USE_MEGATRON_FSDP=${USE_MEGATRON_FSDP:-1} +SHARDING_STRATEGY=${SHARDING_STRATEGY:-"optim_grads_params"} +PROFILE=${PROFILE:-0} +WANDB=${WANDB:-1} + +TP=${TP:-1} +EP=${EP:-8} +MBS=${MBS:-4} +GBS=${GBS:-2048} +COMMENT=${COMMENT:-"hybridep-selective-recompute"} + +PRETRAIN_ARGS=( + --distributed-timeout-minutes 60 + --tensor-model-parallel-size ${TP} + --expert-model-parallel-size ${EP} + --expert-tensor-parallel-size 1 + --context-parallel-size 1 + --use-distributed-optimizer + --overlap-grad-reduce + --overlap-param-gather + --use-mcore-models + --sequence-parallel + --use-flash-attn + --disable-bias-linear + --micro-batch-size ${MBS} + --global-batch-size ${GBS} + --train-samples 585937500 + --exit-duration-in-mins 220 + --no-check-for-nan-in-loss-and-grad + --manual-gc + --manual-gc-interval 10 + --recompute-granularity selective + --recompute-modules mlp moe mla_up_proj layernorm + --transformer-impl transformer_engine + --seq-length 4096 + --data-cache-path ${OUTPUT_PATH}/cache + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model deepseek-ai/DeepSeek-V3 + --data-path ${DATA_PATH} + --split 99,1,0 + --no-mmap-bin-files + --no-create-attention-mask-in-dataloader + --num-workers 6 + --num-layers 61 + --hidden-size 7168 + --ffn-hidden-size 18432 + --num-attention-heads 128 + --kv-channels 128 + --max-position-embeddings 4096 + --position-embedding-type rope + --rotary-base 10000 + --make-vocab-size-divisible-by 3232 + --normalization RMSNorm + --norm-epsilon 1e-6 + --swiglu + --untie-embeddings-and-output-weights + --multi-latent-attention + --attention-dropout 0.0 + --hidden-dropout 0.0 + --clip-grad 1.0 + --weight-decay 0.1 + --qk-layernorm + --lr-decay-samples 584765624 + --lr-warmup-samples 1536000 + --lr-warmup-init 3.9e-7 + --lr 3.9e-6 + --min-lr 3.9e-7 + --lr-decay-style cosine + --adam-beta1 0.9 + --adam-beta2 0.95 + --num-experts 256 + --moe-layer-freq [0]*3+[1]*58 + --moe-ffn-hidden-size 2048 + --moe-shared-expert-intermediate-size 2048 + --moe-router-load-balancing-type seq_aux_loss + --moe-router-topk 8 + --moe-token-dispatcher-type flex + --moe-flex-dispatcher-backend hybridep + --moe-router-pre-softmax + --moe-grouped-gemm + --moe-aux-loss-coeff 1e-4 + --moe-router-group-topk 4 + --moe-router-num-groups 8 + --moe-router-topk-scaling-factor 2.5 + --moe-router-score-function sigmoid + --moe-router-enable-expert-bias + --moe-router-bias-update-rate 1e-3 + --moe-router-dtype fp32 + --moe-permute-fusion + --moe-router-force-load-balancing + --q-lora-rank 1536 + --kv-lora-rank 512 + --qk-head-dim 128 + --qk-pos-emb-head-dim 64 + --v-head-dim 128 + --rotary-scaling-factor 40 + --mscale 1.0 + --mscale-all-dim 1.0 + --mtp-num-layers 1 + --mtp-loss-scaling-factor 0.1 + --eval-iters 32 + --eval-interval 100 + --auto-detect-ckpt-format + --load ${OUTPUT_PATH}/checkpoints + --save ${OUTPUT_PATH}/checkpoints + --save-interval 100 + --dist-ckpt-strictness log_all + --init-method-std 0.02 + --log-timers-to-tensorboard + --log-memory-to-tensorboard + --log-num-zeros-in-grad + --log-params-norm + --log-validation-ppl-to-tensorboard + --log-throughput + --log-interval 1 + --logging-level 40 + --tensorboard-dir ${OUTPUT_PATH}/tensorboard + --bf16 + --enable-experimental +) + +if [ "${USE_MEGATRON_FSDP}" = 1 ]; then + unset CUDA_DEVICE_MAX_CONNECTIONS + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --use-megatron-fsdp + --data-parallel-sharding-strategy ${SHARDING_STRATEGY} + --no-gradient-accumulation-fusion + --use-distributed-optimizer + --calculate-per-token-loss + --init-model-with-meta-device + --ckpt-format fsdp_dtensor + --grad-reduce-in-bf16 + --fsdp-double-buffer + --use-nccl-ub + ) +fi + +# Profiling command +if [ "${PROFILE}" = 1 ]; then + PROFILE_CMD="nsys profile --sample=none --cpuctxsw=none --trace=cuda,nvtx,cublas,cudnn \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --cuda-graph-trace=node \ + --cuda-memory-usage=true \ + -f true -x true \ + -o ${OUTPUT_PATH}/nsys/Megatron-FSDP-Deepseek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT}" + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --profile + --profile-step-start 10 + --profile-step-end 12 + --profile-ranks 0 + ) + echo "PROFILE_CMD=" + echo $PROFILE_CMD +else + PROFILE_CMD="" +fi + +if [ "${WANDB}" = 1 ]; then + export WANDB_API_KEY=${WANDB_API_KEY:-"your_own_wandb_api_key"} + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --wandb-project your_own_wandb_project + --wandb-exp-name DeepSeek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT} + ) +fi + +TRAINING_CMD=" +cd ${MEGATRON_PATH}; +git rev-parse HEAD; +export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH}; +${PROFILE_CMD} python ${MEGATRON_PATH}/pretrain_gpt.py ${PRETRAIN_ARGS[@]}" + +# SLURM settings +SLURM_LOGS="${OUTPUT_PATH}/slurm_logs" +mkdir -p ${SLURM_LOGS} || { + echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}" + exit 1 +} + +# Submit SLURM job +# Note: Update SBATCH parameters below according to your cluster configuration +set +e +sbatch <&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log + +EOF +set -e diff --git a/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md new file mode 100644 index 00000000000..c2354ad07f0 --- /dev/null +++ b/docs/discussions/megatron-fsdp-user-guide/megatron-fsdp-user-guide.md @@ -0,0 +1,116 @@ +# Megatron-FSDP User Guide + +## Table of Contents + +- [Megatron-FSDP Quick Start](#megatron-fsdp-quick-start) +- [Checkpoint Conversion from 3D-Parallel to Megatron-FSDP](#checkpoint-conversion-from-3d-parallel-to-megatron-fsdp) + +## Megatron-FSDP Quick Start + +We recommend using the latest [NVIDIA NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags), which provides a tested software stack and optimized performance. + +For your reference, we provide an example launch script for DeepSeek-V3: [`sbatch_mfsdp_deepseek_v3.sh`](./example-scripts/sbatch_mfsdp_deepseek_v3.sh). + +### Required Configurations + +To enable Megatron-FSDP, add the following required flags to your training script: + +```bash +--use-megatron-fsdp +--data-parallel-sharding-strategy optim_grads_params +--no-gradient-accumulation-fusion +--use-distributed-optimizer +--ckpt-format fsdp_dtensor +``` + +### Recommended Configurations + +We also recommend adding the following configurations to further improve performance: + +```bash +unset CUDA_DEVICE_MAX_CONNECTIONS +``` +```bash +--calculate-per-token-loss +--init-model-with-meta-device +--grad-reduce-in-bf16 +--fsdp-double-buffer +--use-nccl-ub +``` + +💡 **Detailed explanations of these configurations are provided below.** + +#### 1. Disable `CUDA_DEVICE_MAX_CONNECTIONS` + +To ensure full parallelization of FSDP communication and computation, disable the CUDA_DEVICE_MAX_CONNECTIONS environment variable. This step avoids potential bubbles in the CUDA stream. (But it may slow down TP and CP to some extent.) + +#### 2. Add `--calculate-per-token-loss` + +For gradients sharding mode optimization, include the `--calculate-per-token-loss` flag in your training script. This improves performance by reducing the frequency of gradient scaling, which is also a sizable drain on SM resources. + +#### 3. Add `--init-model-with-meta-device` + +Allows model initialization using meta device, followed by layer-by-layer initialization of distributed model weight buffers via the `Module.reset_parameters` API, facilitating the initialization of extremely large models. + +#### 4. Add `--grad-reduce-in-bf16` + +Enables gradient reduction in BF16 precision instead of FP32, reducing communication volume and accelerating the backward pass. + +#### 5. Add `--fsdp-double-buffer` + +Uses persistently allocated double buffers for temporarily-defined memory needed in `MegatronFSDP` communications. While having persistent double buffers may increase peak VRAM utilization, it is necessary to register NCCL user buffers (`nccl_ub=True`) for `MegatronFSDP`. Currently, this is supported only for simple repetitive model structures such as GPT. + +- **Only effective when using Megatron-LM.** +- Defaults to `False`. Automatically overridden to `True` when `nccl_ub` is enabled. + +#### 6. Add `--use-nccl-ub` + +Allocates and [registers NCCL user buffers](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#) for param and grad buffers. This option enables an SM-efficient NCCL algorithm that could improve the performance of overlapped computations. This flag will be much more effective when used together with [SHARP](https://docs.nvidia.com/networking/display/sharpv3130) if the FSDP communication includes both NVL and IB domains. Enabling this option will cause additional memory overhead due to the requirement to enable the `fsdp_double_buffer` option. + +- **Only effective when using Megatron-LM.** +- Defaults to `False`. +- By default we try to use NCCL window (symmetric) registration if it is available. If not it falls back to conventional local registration. +- **Incompatible with PyTorch's segmentable allocator:** Do not set `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` when using `--use-nccl-ub`, as this will cause a runtime error due to compatibility issues with the `torch.cuda.MemPool` API. + +## Checkpoint Conversion from 3D-Parallel to Megatron-FSDP + +Megatron-FSDP introduces `fsdp_dtensor`, a DTensor-based distributed checkpoint format that serves as its standard. To help you smoothly transition from 3D-Parallel to Megatron-FSDP, we provide a script for converting checkpoints from the `torch_dist` format to the `fsdp_dtensor` format. Using DeepSeek-V3 as an example, the detailed conversion process is described below. + +### Step 1: Generate 3D-Parallel Checkpoint with `param_to_param_group_map` + +Run your 3D-parallel + EP training script to generate a `torch_dist` checkpoint along with a directory containing `param_to_param_group_map` files. Add the following flag to your training script: + +```bash +--dump-param-to-param-group-map /path/to/param_to_param_group_map +``` + +If you already have a `torch_dist` checkpoint, simply specify the `--dump-param-to-param-group-map /path/to/param_to_param_group_map` flag and run a very short experiment-this will create the `param_to_param_group_map` you need without full pretraining. + +### Step 2: Export `param_to_param_group_map` to a JSON File + +Convert the `param_to_param_group_map` into a JSON file for easier processing by running: + +```bash +python tools/checkpoint/checkpoint_inspector.py print-torch-dcp-in-json /path/to/param_to_param_group_map +``` + +This will create a `param_to_param_group_map.json` file in the `/path/to/param_to_param_group_map` directory. + +### Step 3: Convert Checkpoint from `torch_dist` to `fsdp_dtensor` + +Convert your `torch_dist` checkpoint to the `fsdp_dtensor` format using the parameter to `param_to_param_group_map` JSON file: + +```bash +torchrun --nproc_per_node=8 --nnodes=1 \ + tools/checkpoint/checkpoint_inspector.py \ + convert-torch-dist-to-fsdp-dtensor --swiglu \ + /path/to/input_torch_dist_checkpoint \ + /path/to/output_fsdp_dtensor_checkpoint \ + --param-to-param-group-map-json /path/to/param_to_param_group_map.json +``` + +**Note:** For multi-node conversion tasks, please refer to the example script: [`sbatch_checkpoint_convert.sh`](./example-scripts/sbatch_checkpoint_convert.sh). + +### Step 4: Launch Megatron-FSDP Training + +Start your Megatron-FSDP training job using the converted `fsdp_dtensor` checkpoint. \ No newline at end of file From 6b157e007138c28f5ea25d79a7f4661800f3f8b4 Mon Sep 17 00:00:00 2001 From: hx Date: Fri, 16 Jan 2026 23:17:05 +0800 Subject: [PATCH 238/248] [Dev] Optimizer State and Master Weight Offloading (#2760) Co-authored-by: Xin Yao --- .../optimizer_state_offloader.py | 315 ++++++++++++++++ megatron/core/optimizer/distrib_optimizer.py | 25 ++ megatron/core/optimizer/optimizer_config.py | 6 + megatron/training/arguments.py | 13 + megatron/training/training.py | 30 +- .../test_optimizer_state_offloading.py | 337 ++++++++++++++++++ 6 files changed, 725 insertions(+), 1 deletion(-) create mode 100644 megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py create mode 100644 tests/unit_tests/test_optimizer_state_offloading.py diff --git a/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py new file mode 100644 index 00000000000..81fd116c8ba --- /dev/null +++ b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py @@ -0,0 +1,315 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +"""Optimizer state offloading class.""" + +from typing import TYPE_CHECKING, Dict, List, Tuple + +import torch + +if TYPE_CHECKING: + from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer + + +class OptimizerStateOffloader: + """ + Manages offloading of optimizer states and master weights to CPU. + Used with DistributedOptimizer to reduce GPU memory usage. + + Supports overlapped D2H/H2D transfers using CUDA streams. + + Master weights can be stored in two locations: + - In adam optimizer state (when use_precision_aware_optimizer_no_fp8_or_ds_fp8 is True) + - In mcore's shard_fp32_from_float16_groups + """ + + OPTIMIZER_STATE_KEYS = ('exp_avg', 'exp_avg_sq') + MASTER_WEIGHT_KEY = 'master_param' + + def __init__(self, distrib_optimizer: "DistributedOptimizer"): + """ + Args: + distrib_optimizer: The DistributedOptimizer to offload states and master weights from. + """ + self.dist_optimizer = distrib_optimizer + self.adam_optimizer = distrib_optimizer.optimizer + + # Only support TE FusedAdam optimizer for now. + try: + from transformer_engine.pytorch.optimizers import FusedAdam + + assert isinstance(self.adam_optimizer, FusedAdam), ( + f"OptimizerStateOffloader requires TE FusedAdam optimizer, " + f"but got {type(self.adam_optimizer).__name__}" + ) + except ImportError: + raise ImportError( + "OptimizerStateOffloader requires transformer_engine.pytorch.optimizers.FusedAdam" + ) + + # Check if master weights are stored in adam optimizer state + self.optimizer_contains_master_weights = self.adam_optimizer.master_weights + + # CUDA streams for async transfers + self._d2h_stream = torch.cuda.Stream() + self._h2d_stream = torch.cuda.Stream() + + # CPU buffers for optimizer states: {param: {key: cpu_tensor}} + self._opt_state_cpu_buffers: Dict[torch.Tensor, Dict[str, torch.Tensor]] = {} + + # CPU buffers for mcore master weights, matching the structure of source groups + # List[List[cpu_tensor]] + self._shard_fp32_from_float16_cpu_buffers: List[List[torch.Tensor]] = [] + + # State tracking + self._offloaded = False + self._offloaded_state_keys: Tuple[str, ...] = () + self._offloaded_mcore_master_weights = False + + # Track whether optimizer states (exp_avg, exp_avg_sq) have been initialized. + # These are lazily initialized by FusedAdam during the first optimizer.step(). + # Master weights (shard_fp32_from_float16_groups) are available from the start. + self._optimizer_states_initialized = False + + def mark_optimizer_states_initialized(self): + """ + Mark that optimizer states (exp_avg, exp_avg_sq) are now available. + Should be called after the first optimizer.step() completes. + """ + self._optimizer_states_initialized = True + + def _get_state_keys_to_offload( + self, offload_optimizer_states: bool, offload_master_weights: bool + ) -> Tuple[str, ...]: + """Get the state keys in FusedAdam to offload based on configuration.""" + keys = [] + # Skip optimizer states offloading if they haven't been initialized yet. + # Optimizer states are lazily initialized by FusedAdam during the first optimizer.step(). + if self._optimizer_states_initialized: + if offload_optimizer_states: + keys.extend(self.OPTIMIZER_STATE_KEYS) + if offload_master_weights and self.optimizer_contains_master_weights: + keys.append(self.MASTER_WEIGHT_KEY) + return tuple(keys) + + def _ensure_state_cpu_buffer( + self, param: torch.Tensor, state_key: str, gpu_tensor: torch.Tensor, pin_memory: bool = True + ) -> torch.Tensor: + """Get or create a CPU buffer for a state tensor.""" + if param not in self._opt_state_cpu_buffers: + self._opt_state_cpu_buffers[param] = {} + + if state_key not in self._opt_state_cpu_buffers[param]: + cpu_buffer = torch.empty( + gpu_tensor.size(), + dtype=gpu_tensor.dtype, + layout=gpu_tensor.layout, + device='cpu', + pin_memory=pin_memory, + ) + self._opt_state_cpu_buffers[param][state_key] = cpu_buffer + + return self._opt_state_cpu_buffers[param][state_key] + + def _offload_shard_groups( + self, + shard_groups: List[List[torch.Tensor]], + cpu_buffers: List[List[torch.Tensor]], + pin_memory: bool = True, + ): + """Offload a shard group to CPU buffers.""" + # Initialize CPU buffers on first call + if len(cpu_buffers) == 0: + for group in shard_groups: + group_buffers = [] + for gpu_tensor in group: + cpu_buffer = torch.empty( + gpu_tensor.size(), + dtype=gpu_tensor.dtype, + layout=gpu_tensor.layout, + device='cpu', + pin_memory=pin_memory, + ) + group_buffers.append(cpu_buffer) + cpu_buffers.append(group_buffers) + + # Copy D2H + for group_idx, group in enumerate(shard_groups): + for param_idx, gpu_tensor in enumerate(group): + cpu_buffer = cpu_buffers[group_idx][param_idx] + cpu_buffer.copy_(gpu_tensor, non_blocking=pin_memory) + gpu_tensor.record_stream(self._d2h_stream) + + def _offload_states( + self, + offload_optimizer_states: bool, + offload_master_weights: bool, + use_pin_memory: bool = True, + ): + """Offload optimizer states and/or master weights to CPU.""" + # Offload states from adam optimizer + self._offloaded_state_keys = self._get_state_keys_to_offload( + offload_optimizer_states, offload_master_weights + ) + states = self.adam_optimizer.state + + for param, param_state in states.items(): + for state_key in self._offloaded_state_keys: + if state_key not in param_state: + continue + + gpu_tensor = param_state[state_key] + if not isinstance(gpu_tensor, torch.Tensor) or not gpu_tensor.is_cuda: + continue + + cpu_buffer = self._ensure_state_cpu_buffer( + param, state_key, gpu_tensor, use_pin_memory + ) + cpu_buffer.copy_(gpu_tensor, non_blocking=use_pin_memory) + gpu_tensor.record_stream(self._d2h_stream) + + # Offload mcore master weights if not in optimizer state + if offload_master_weights and not self.optimizer_contains_master_weights: + self._offload_shard_groups( + self.dist_optimizer.shard_fp32_from_float16_groups, + self._shard_fp32_from_float16_cpu_buffers, + use_pin_memory, + ) + self._offloaded_mcore_master_weights = True + + def _release_states(self): + """Replace optimizer state GPU tensors with CPU tensors to free GPU memory.""" + states = self.adam_optimizer.state + + for param, param_state in states.items(): + if param not in self._opt_state_cpu_buffers: + continue + + for state_key in self._offloaded_state_keys: + if state_key not in self._opt_state_cpu_buffers[param]: + continue + + param_state[state_key].untyped_storage().resize_(0) + + if self._offloaded_mcore_master_weights: + for group in self.dist_optimizer.shard_fp32_from_float16_groups: + for gpu_tensor in group: + gpu_tensor.untyped_storage().resize_(0) + + def _reload_shard_groups( + self, + shard_groups: List[List[torch.Tensor]], + cpu_buffers: List[List[torch.Tensor]], + is_allocate_stage: bool, + ): + """Reload shard groups from CPU to GPU.""" + for group_idx, group in enumerate(shard_groups): + for param_idx, _ in enumerate(group): + cpu_buffer = cpu_buffers[group_idx][param_idx] + if is_allocate_stage: + shard_groups[group_idx][param_idx].untyped_storage().resize_( + cpu_buffer.untyped_storage().size() + ) + else: + shard_groups[group_idx][param_idx].copy_( + cpu_buffer, non_blocking=cpu_buffer.is_pinned() + ) + + def _reload_states(self, is_allocate_stage: bool): + """ + Reload optimizer states and/or master weights from CPU to GPU. + + If is_allocate_stage is True, only allocate GPU memory for the states and master weights, + but do not copy the data from CPU to GPU. Otherwise, copy the data from CPU to GPU. + The two processes are separated to make sure that the GPU memory is allocated on the + default stream to avoid fragmentation. + """ + # Reload states to adam optimizer + states = self.adam_optimizer.state + + for param, param_state in states.items(): + if param not in self._opt_state_cpu_buffers: + continue + + for state_key in self._offloaded_state_keys: + if state_key not in self._opt_state_cpu_buffers[param]: + continue + + cpu_buffer = self._opt_state_cpu_buffers[param][state_key] + if is_allocate_stage: + param_state[state_key].untyped_storage().resize_( + cpu_buffer.untyped_storage().size() + ) + else: + param_state[state_key].copy_(cpu_buffer, non_blocking=cpu_buffer.is_pinned()) + + # Reload mcore master weights if not in optimizer state + if self._offloaded_mcore_master_weights: + self._reload_shard_groups( + self.dist_optimizer.shard_fp32_from_float16_groups, + self._shard_fp32_from_float16_cpu_buffers, + is_allocate_stage, + ) + + def offload(self, offload_optimizer_states: bool = True, offload_master_weights: bool = True): + """ + Offload optimizer states and/or master weights to CPU. + Starts async D2H transfer that can overlap with other operations. + + Args: + offload_optimizer_states: Whether to offload exp_avg, exp_avg_sq. + offload_master_weights: Whether to offload master weights. + """ + if not offload_optimizer_states and not offload_master_weights: + return + + # Wait for current stream finishing updating the optimizer states. + self._d2h_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._d2h_stream): + self._offload_states(offload_optimizer_states, offload_master_weights) + + self._offloaded = True + + def release_gpu_memory(self): + """ + Release GPU memory for optimizer states and master weights after D2H copy completes. + + This is separated from offload() to allow delayed GPU memory release, + which is needed for mxfp8 + overlap_param_gather case where master weights + must remain on GPU until after _copy_main_params_to_param_buffer() is called. + """ + if not self._offloaded: + return + + self._release_states() + + def reload(self): + """ + Reload optimizer states and/or master weights from CPU to GPU. + Call before optimizer.step() to ensure states are on GPU. + """ + if not self._offloaded: + return + + # Allocate GPU memory on the current stream to avoid fragmentation. + self._reload_states(is_allocate_stage=True) + + self._h2d_stream.wait_stream(self._d2h_stream) + self._h2d_stream.wait_stream(torch.cuda.current_stream()) + + # Reload states on the h2d stream to overlap with other operations. + with torch.cuda.stream(self._h2d_stream): + self._reload_states(is_allocate_stage=False) + + self._offloaded_state_keys = () + self._offloaded_mcore_master_weights = False + self._offloaded = False + + def sync_before_step(self): + """ + Wait for H2D reload to complete before optimizer.step(). + Must be called to ensure states are on GPU before optimizer uses them. + + This is separated from reload() to make it possible to move the reload ahead of time. + """ + torch.cuda.current_stream().wait_stream(self._h2d_stream) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 6e093f96f7e..9536bc4f9ef 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -49,6 +49,7 @@ from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule +from .cpu_offloading.optimizer_state_offloader import OptimizerStateOffloader from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys from .optimizer_config import OptimizerConfig @@ -604,6 +605,10 @@ def __init__( self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges] self.optimizer.load_state_dict(self.optimizer.state_dict()) + self._state_offloader: Optional[OptimizerStateOffloader] = None + if self.config.offload_optimizer_states: + self._state_offloader = OptimizerStateOffloader(self) + def _get_model_param_range_map(self, param: torch.nn.Parameter): """ Given a model param, get the index sub-range of the param that this @@ -2580,6 +2585,8 @@ def step_with_ready_grads(self) -> bool: Under the hood, either launch synchronous param all-gathers or get ready to launch asynchorous all-gathers that get overlapped with the next forward pass. """ + if self._state_offloader is not None: + self._state_offloader.sync_before_step() update_successful = super().step_with_ready_grads() timers = self.config.timers @@ -2600,4 +2607,22 @@ def step_with_ready_grads(self) -> bool: if timers is not None: timers('params-all-gather').stop() + if self._state_offloader is not None: + self._state_offloader.mark_optimizer_states_initialized() + return update_successful + + def offload_states(self): + """Offload states to CPU.""" + if self._state_offloader is not None: + self._state_offloader.offload() + + def reload_offloaded_states(self): + """Start async reload of offloaded states.""" + if self._state_offloader is not None: + self._state_offloader.reload() + + def release_offloaded_gpu_states(self): + """Release GPU memory after D2H completes. For delayed release case.""" + if self._state_offloader is not None: + self._state_offloader.release_gpu_memory() diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 679878ed954..1813488d7bd 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -266,6 +266,12 @@ class OptimizerConfig: pin_cpu_params: bool = True """If True, pin the optimizer parameters to CPU memory.""" + offload_optimizer_states: bool = False + """ + If True, offload optimizer states to CPU after each optimizer step and + reload them before the next optimizer step. + """ + ################ # Miscellaneous ################ diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 5f9e7350c18..8a70772cc3d 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1271,6 +1271,11 @@ def validate_args(args, defaults={}): "must be used in conjunction with `--fp8-recipe delayed`." ) + if args.offload_optimizer_states: + assert args.use_distributed_optimizer, "offload_optimizer_states is only supported with distributed optimizer" + assert args.optimizer == 'adam', "offload_optimizer_states is only supported with adam optimizer" + assert not args.use_megatron_fsdp, "offload_optimizer_states does not support Megatron-FSDP for now." + if args.non_persistent_ckpt_type == "local": assert args.non_persistent_local_ckpt_dir is not None, "Tried to use local checkpointing without specifying --local-ckpt-dir!" if args.replication: @@ -2386,6 +2391,14 @@ def _add_training_args(parser): help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', help='Disable pinning of CPU memory for parameters.') + group.add_argument('--offload-optimizer-states', + action='store_true', + dest='offload_optimizer_states', + help='Offload optimizer states to CPU after each optimizer step and ' + 'reload them before the next optimizer step. ' + 'Only support TE FusedAdam optimizer.' + 'Note that this still uses pure GPU optimizer instead of ' + 'HybridDeviceOptimizer for --optimizer-cpu-offload.') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic', 'external'], help='Single pass vs multiple pass data loader') diff --git a/megatron/training/training.py b/megatron/training/training.py index 845d271f62e..8aff2556d14 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1425,6 +1425,12 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch rerun_state_machine = get_rerun_state_machine() while rerun_state_machine.should_run_forward_backward(data_iterator): + # Offload optimizer states to CPU if enabled. + if args.offload_optimizer_states: + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.offload_states() + # Set grad to zero. for model_chunk in model: model_chunk.zero_grad_buffer() @@ -1458,6 +1464,14 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch if isinstance(optim_instance, DistributedOptimizer): optim_instance._copy_main_params_to_param_buffer() + # Release GPU memory for offloaded optimizer states. + # This needs to be done after _copy_main_params_to_param_buffer(). + # Separate offload and release to allow early D2H transfer to overlap with other operations. + if args.offload_optimizer_states: + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.release_offloaded_gpu_states() + # Forward pass. losses_reduced = forward_backward_func( forward_step_func=forward_step_func, @@ -2305,7 +2319,21 @@ def train( config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model] if len(model) == 1: config.param_sync_func = config.param_sync_func[0] - config.finalize_model_grads_func = finalize_model_grads + + # Wrap finalize_model_grads to reload offloaded optimizer states before grad finalization. + # This allows H2D transfer to overlap with grad all-reduce. + if args.offload_optimizer_states: + + def finalize_model_grads_with_state_reload(*fmg_args, **fmg_kwargs): + # Reload offloaded states for all DistributedOptimizer instances + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.reload_offloaded_states() + return finalize_model_grads(*fmg_args, **fmg_kwargs) + + config.finalize_model_grads_func = finalize_model_grads_with_state_reload + else: + config.finalize_model_grads_func = finalize_model_grads if args.log_energy: energy_monitor.setup() diff --git a/tests/unit_tests/test_optimizer_state_offloading.py b/tests/unit_tests/test_optimizer_state_offloading.py new file mode 100644 index 00000000000..baaab355182 --- /dev/null +++ b/tests/unit_tests/test_optimizer_state_offloading.py @@ -0,0 +1,337 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +"""Unit tests for OptimizerStateOffloader.""" + +import pytest +import torch +import torch.nn as nn + +from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.transformer import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +try: + from transformer_engine.pytorch.optimizers import FusedAdam # noqa: F401 + + TE_FUSED_ADAM_AVAILABLE = True +except ImportError: + TE_FUSED_ADAM_AVAILABLE = False + + +class SimpleModel(nn.Module): + """Simple model for testing.""" + + def __init__(self, hidden_size=256): + super().__init__() + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + + def forward(self, x): + return self.fc2(torch.relu(self.fc1(x))) + + +def create_model_and_optimizer(hidden_size=256, offload_optimizer_states=True, **optimizer_kwargs): + """Helper to create model and optimizer for tests.""" + model = SimpleModel(hidden_size=hidden_size).bfloat16().cuda() + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + default_config = dict( + optimizer='adam', + bf16=True, + lr=0.001, + use_distributed_optimizer=True, + offload_optimizer_states=offload_optimizer_states, + ) + default_config.update(optimizer_kwargs) + + optimizer_config = OptimizerConfig(**default_config) + optim = get_megatron_optimizer(optimizer_config, [model]) + return model, optim + + +def run_forward_backward_step(model, optim, hidden_size=256): + """Run a single forward-backward-step cycle.""" + input_tensor = torch.randn(8, hidden_size, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + output.sum().backward() + optim.step() + optim.zero_grad() + + +# ============================================================================= +# Test 1: Basic OptimizerStateOffloader Initialization +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_offloader_initialization(): + """Test that OptimizerStateOffloader initializes correctly.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Offloader is created in __init__ when offload_optimizer_states=True + assert dist_optim._state_offloader is not None + offloader = dist_optim._state_offloader + + # Verify offloader properties + assert offloader.adam_optimizer is not None + assert offloader._d2h_stream is not None + assert offloader._h2d_stream is not None + assert offloader._offloaded is False + + # Before first step, optimizer states are not initialized yet + assert offloader._optimizer_states_initialized is False + + # Run one step to initialize optimizer states + run_forward_backward_step(model, optim) + + # After first step, optimizer states should be marked as initialized + assert offloader._optimizer_states_initialized is True + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 2: Early Master Weight Offloading Before First Step +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_early_master_weight_offloading(): + """Test that master weights can be offloaded before the first optimizer step.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Offloader is created in __init__ + assert dist_optim._state_offloader is not None + offloader = dist_optim._state_offloader + + # Before first step, optimizer states are not initialized + assert offloader._optimizer_states_initialized is False + + # Capture original master weights before offload + original_master_weights = [] + for group in dist_optim.shard_fp32_from_float16_groups: + group_weights = [tensor.clone() for tensor in group] + original_master_weights.append(group_weights) + + # Offload before first step - should only offload master weights + offloader.offload() + offloader.release_gpu_memory() + torch.cuda.synchronize() + + # Verify master weights were offloaded (storage resized to 0) + for group in dist_optim.shard_fp32_from_float16_groups: + for tensor in group: + assert tensor.untyped_storage().size() == 0, "Master weight should be offloaded" + + # Reload master weights + offloader.reload() + offloader.sync_before_step() + + # Verify master weights match after reload + for group_idx, group in enumerate(dist_optim.shard_fp32_from_float16_groups): + for param_idx, tensor in enumerate(group): + original = original_master_weights[group_idx][param_idx] + torch.testing.assert_close( + tensor, + original, + msg=f"Master weight [{group_idx}][{param_idx}] mismatch after offload/reload", + ) + + # Now run a step and verify optimizer states can be offloaded after + run_forward_backward_step(model, optim) + assert offloader._optimizer_states_initialized is True + + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 3: Offload and Reload Correctness +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +@pytest.mark.parametrize("offload_optimizer_states", [True, False]) +@pytest.mark.parametrize("offload_master_weights", [True, False]) +def test_offload_reload_correctness(offload_optimizer_states, offload_master_weights): + """Test that offload/reload preserves optimizer state values.""" + if not offload_optimizer_states and not offload_master_weights: + pytest.skip("At least one offload type required") + + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Run steps to build up optimizer state + for _ in range(3): + run_forward_backward_step(model, optim) + + offloader = dist_optim._state_offloader + + # Capture original states before offload + original_states = {} + for param, state in offloader.adam_optimizer.state.items(): + original_states[param] = { + k: v.clone() for k, v in state.items() if isinstance(v, torch.Tensor) + } + + # Offload + offloader.offload( + offload_optimizer_states=offload_optimizer_states, + offload_master_weights=offload_master_weights, + ) + + # Release GPU memory + offloader.release_gpu_memory() + torch.cuda.synchronize() + + # Reload + offloader.reload() + offloader.sync_before_step() + + # Verify states match after reload + for param, state in offloader.adam_optimizer.state.items(): + if param in original_states: + for key, original_tensor in original_states[param].items(): + if key in state and isinstance(state[key], torch.Tensor): + reloaded_tensor = state[key] + assert reloaded_tensor.device.type == 'cuda', f"State {key} should be on GPU" + torch.testing.assert_close( + reloaded_tensor, + original_tensor, + msg=f"State {key} mismatch after offload/reload", + ) + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 4: GPU Memory Release Verification +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_gpu_memory_release(): + """Test that GPU memory is actually freed after release_gpu_memory().""" + Utils.initialize_model_parallel() + # Use larger model for measurable memory impact + model, optim = create_model_and_optimizer(hidden_size=1024) + dist_optim = optim.chained_optimizers[0] + + # Initialize optimizer states + run_forward_backward_step(model, optim, hidden_size=1024) + + offloader = dist_optim._state_offloader + + # Measure memory before offload + torch.cuda.synchronize() + torch.cuda.empty_cache() + memory_before = torch.cuda.memory_allocated() + + # Offload and release + offloader.offload() + offloader.release_gpu_memory() + + # Wait for async operations + torch.cuda.synchronize() + torch.cuda.empty_cache() + memory_after = torch.cuda.memory_allocated() + + # Memory should decrease + memory_freed = memory_before - memory_after + assert memory_freed > 0, f"Expected memory to be freed, but got {memory_freed} bytes difference" + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 5: Multiple Offload/Reload Cycles +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_multiple_offload_reload_cycles(): + """Test that multiple offload/reload cycles work correctly.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Initialize + run_forward_backward_step(model, optim) + + offloader = dist_optim._state_offloader + + # Run multiple cycles + for cycle in range(5): + # Offload + offloader.offload() + offloader.release_gpu_memory() + + # Reload + offloader.reload() + offloader.sync_before_step() + + # Run optimizer step + run_forward_backward_step(model, optim) + + # Verify model can still produce valid outputs + input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + assert not output.isnan().any(), "Model output contains NaN after multiple cycles" + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 6: Training Correctness with Offloading +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_training_correctness_with_offloading(): + """Test that training with offloading produces same results as without.""" + Utils.initialize_model_parallel() + torch.manual_seed(42) + + # Model 1: with offloading + model1, optim1 = create_model_and_optimizer(offload_optimizer_states=True, lr=0.01) + + # Model 2: without offloading (reference) + torch.manual_seed(42) + model2, optim2 = create_model_and_optimizer(offload_optimizer_states=False, lr=0.01) + + # Train both models + n_steps = 10 + torch.manual_seed(123) + dist_optim1 = optim1.chained_optimizers[0] + + # Offloader is created in __init__ when offload_optimizer_states=True + assert dist_optim1._state_offloader is not None + offloader = dist_optim1._state_offloader + + for step in range(n_steps): + input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') + + # Model 1 with offloading + # Offload states (master weights can be offloaded from the start, + # optimizer states will be skipped until after first step) + offloader.offload() + offloader.release_gpu_memory() + + output1 = model1(input_tensor) + loss1 = output1.sum() + loss1.backward() + + offloader.reload() + offloader.sync_before_step() + optim1.step() + optim1.zero_grad() + + # Model 2 without offloading + output2 = model2(input_tensor) + loss2 = output2.sum() + loss2.backward() + optim2.step() + optim2.zero_grad() + + # Compare final model weights + for (n1, p1), (n2, p2) in zip(model1.named_parameters(), model2.named_parameters()): + torch.testing.assert_close( + p1.data, + p2.data, + atol=1e-5, + rtol=1e-4, + msg=f"Parameter {n1} mismatch between offloaded and non-offloaded training", + ) + Utils.destroy_model_parallel() From 8ac3a9f43c1034c63547c01434c97835febb5234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 16 Jan 2026 17:28:06 +0100 Subject: [PATCH 239/248] Revert "[Dev] Optimizer State and Master Weight Offloading (#2760)" (#2984) --- .../optimizer_state_offloader.py | 315 ---------------- megatron/core/optimizer/distrib_optimizer.py | 25 -- megatron/core/optimizer/optimizer_config.py | 6 - megatron/training/arguments.py | 13 - megatron/training/training.py | 30 +- .../test_optimizer_state_offloading.py | 337 ------------------ 6 files changed, 1 insertion(+), 725 deletions(-) delete mode 100644 megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py delete mode 100644 tests/unit_tests/test_optimizer_state_offloading.py diff --git a/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py deleted file mode 100644 index 81fd116c8ba..00000000000 --- a/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -"""Optimizer state offloading class.""" - -from typing import TYPE_CHECKING, Dict, List, Tuple - -import torch - -if TYPE_CHECKING: - from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer - - -class OptimizerStateOffloader: - """ - Manages offloading of optimizer states and master weights to CPU. - Used with DistributedOptimizer to reduce GPU memory usage. - - Supports overlapped D2H/H2D transfers using CUDA streams. - - Master weights can be stored in two locations: - - In adam optimizer state (when use_precision_aware_optimizer_no_fp8_or_ds_fp8 is True) - - In mcore's shard_fp32_from_float16_groups - """ - - OPTIMIZER_STATE_KEYS = ('exp_avg', 'exp_avg_sq') - MASTER_WEIGHT_KEY = 'master_param' - - def __init__(self, distrib_optimizer: "DistributedOptimizer"): - """ - Args: - distrib_optimizer: The DistributedOptimizer to offload states and master weights from. - """ - self.dist_optimizer = distrib_optimizer - self.adam_optimizer = distrib_optimizer.optimizer - - # Only support TE FusedAdam optimizer for now. - try: - from transformer_engine.pytorch.optimizers import FusedAdam - - assert isinstance(self.adam_optimizer, FusedAdam), ( - f"OptimizerStateOffloader requires TE FusedAdam optimizer, " - f"but got {type(self.adam_optimizer).__name__}" - ) - except ImportError: - raise ImportError( - "OptimizerStateOffloader requires transformer_engine.pytorch.optimizers.FusedAdam" - ) - - # Check if master weights are stored in adam optimizer state - self.optimizer_contains_master_weights = self.adam_optimizer.master_weights - - # CUDA streams for async transfers - self._d2h_stream = torch.cuda.Stream() - self._h2d_stream = torch.cuda.Stream() - - # CPU buffers for optimizer states: {param: {key: cpu_tensor}} - self._opt_state_cpu_buffers: Dict[torch.Tensor, Dict[str, torch.Tensor]] = {} - - # CPU buffers for mcore master weights, matching the structure of source groups - # List[List[cpu_tensor]] - self._shard_fp32_from_float16_cpu_buffers: List[List[torch.Tensor]] = [] - - # State tracking - self._offloaded = False - self._offloaded_state_keys: Tuple[str, ...] = () - self._offloaded_mcore_master_weights = False - - # Track whether optimizer states (exp_avg, exp_avg_sq) have been initialized. - # These are lazily initialized by FusedAdam during the first optimizer.step(). - # Master weights (shard_fp32_from_float16_groups) are available from the start. - self._optimizer_states_initialized = False - - def mark_optimizer_states_initialized(self): - """ - Mark that optimizer states (exp_avg, exp_avg_sq) are now available. - Should be called after the first optimizer.step() completes. - """ - self._optimizer_states_initialized = True - - def _get_state_keys_to_offload( - self, offload_optimizer_states: bool, offload_master_weights: bool - ) -> Tuple[str, ...]: - """Get the state keys in FusedAdam to offload based on configuration.""" - keys = [] - # Skip optimizer states offloading if they haven't been initialized yet. - # Optimizer states are lazily initialized by FusedAdam during the first optimizer.step(). - if self._optimizer_states_initialized: - if offload_optimizer_states: - keys.extend(self.OPTIMIZER_STATE_KEYS) - if offload_master_weights and self.optimizer_contains_master_weights: - keys.append(self.MASTER_WEIGHT_KEY) - return tuple(keys) - - def _ensure_state_cpu_buffer( - self, param: torch.Tensor, state_key: str, gpu_tensor: torch.Tensor, pin_memory: bool = True - ) -> torch.Tensor: - """Get or create a CPU buffer for a state tensor.""" - if param not in self._opt_state_cpu_buffers: - self._opt_state_cpu_buffers[param] = {} - - if state_key not in self._opt_state_cpu_buffers[param]: - cpu_buffer = torch.empty( - gpu_tensor.size(), - dtype=gpu_tensor.dtype, - layout=gpu_tensor.layout, - device='cpu', - pin_memory=pin_memory, - ) - self._opt_state_cpu_buffers[param][state_key] = cpu_buffer - - return self._opt_state_cpu_buffers[param][state_key] - - def _offload_shard_groups( - self, - shard_groups: List[List[torch.Tensor]], - cpu_buffers: List[List[torch.Tensor]], - pin_memory: bool = True, - ): - """Offload a shard group to CPU buffers.""" - # Initialize CPU buffers on first call - if len(cpu_buffers) == 0: - for group in shard_groups: - group_buffers = [] - for gpu_tensor in group: - cpu_buffer = torch.empty( - gpu_tensor.size(), - dtype=gpu_tensor.dtype, - layout=gpu_tensor.layout, - device='cpu', - pin_memory=pin_memory, - ) - group_buffers.append(cpu_buffer) - cpu_buffers.append(group_buffers) - - # Copy D2H - for group_idx, group in enumerate(shard_groups): - for param_idx, gpu_tensor in enumerate(group): - cpu_buffer = cpu_buffers[group_idx][param_idx] - cpu_buffer.copy_(gpu_tensor, non_blocking=pin_memory) - gpu_tensor.record_stream(self._d2h_stream) - - def _offload_states( - self, - offload_optimizer_states: bool, - offload_master_weights: bool, - use_pin_memory: bool = True, - ): - """Offload optimizer states and/or master weights to CPU.""" - # Offload states from adam optimizer - self._offloaded_state_keys = self._get_state_keys_to_offload( - offload_optimizer_states, offload_master_weights - ) - states = self.adam_optimizer.state - - for param, param_state in states.items(): - for state_key in self._offloaded_state_keys: - if state_key not in param_state: - continue - - gpu_tensor = param_state[state_key] - if not isinstance(gpu_tensor, torch.Tensor) or not gpu_tensor.is_cuda: - continue - - cpu_buffer = self._ensure_state_cpu_buffer( - param, state_key, gpu_tensor, use_pin_memory - ) - cpu_buffer.copy_(gpu_tensor, non_blocking=use_pin_memory) - gpu_tensor.record_stream(self._d2h_stream) - - # Offload mcore master weights if not in optimizer state - if offload_master_weights and not self.optimizer_contains_master_weights: - self._offload_shard_groups( - self.dist_optimizer.shard_fp32_from_float16_groups, - self._shard_fp32_from_float16_cpu_buffers, - use_pin_memory, - ) - self._offloaded_mcore_master_weights = True - - def _release_states(self): - """Replace optimizer state GPU tensors with CPU tensors to free GPU memory.""" - states = self.adam_optimizer.state - - for param, param_state in states.items(): - if param not in self._opt_state_cpu_buffers: - continue - - for state_key in self._offloaded_state_keys: - if state_key not in self._opt_state_cpu_buffers[param]: - continue - - param_state[state_key].untyped_storage().resize_(0) - - if self._offloaded_mcore_master_weights: - for group in self.dist_optimizer.shard_fp32_from_float16_groups: - for gpu_tensor in group: - gpu_tensor.untyped_storage().resize_(0) - - def _reload_shard_groups( - self, - shard_groups: List[List[torch.Tensor]], - cpu_buffers: List[List[torch.Tensor]], - is_allocate_stage: bool, - ): - """Reload shard groups from CPU to GPU.""" - for group_idx, group in enumerate(shard_groups): - for param_idx, _ in enumerate(group): - cpu_buffer = cpu_buffers[group_idx][param_idx] - if is_allocate_stage: - shard_groups[group_idx][param_idx].untyped_storage().resize_( - cpu_buffer.untyped_storage().size() - ) - else: - shard_groups[group_idx][param_idx].copy_( - cpu_buffer, non_blocking=cpu_buffer.is_pinned() - ) - - def _reload_states(self, is_allocate_stage: bool): - """ - Reload optimizer states and/or master weights from CPU to GPU. - - If is_allocate_stage is True, only allocate GPU memory for the states and master weights, - but do not copy the data from CPU to GPU. Otherwise, copy the data from CPU to GPU. - The two processes are separated to make sure that the GPU memory is allocated on the - default stream to avoid fragmentation. - """ - # Reload states to adam optimizer - states = self.adam_optimizer.state - - for param, param_state in states.items(): - if param not in self._opt_state_cpu_buffers: - continue - - for state_key in self._offloaded_state_keys: - if state_key not in self._opt_state_cpu_buffers[param]: - continue - - cpu_buffer = self._opt_state_cpu_buffers[param][state_key] - if is_allocate_stage: - param_state[state_key].untyped_storage().resize_( - cpu_buffer.untyped_storage().size() - ) - else: - param_state[state_key].copy_(cpu_buffer, non_blocking=cpu_buffer.is_pinned()) - - # Reload mcore master weights if not in optimizer state - if self._offloaded_mcore_master_weights: - self._reload_shard_groups( - self.dist_optimizer.shard_fp32_from_float16_groups, - self._shard_fp32_from_float16_cpu_buffers, - is_allocate_stage, - ) - - def offload(self, offload_optimizer_states: bool = True, offload_master_weights: bool = True): - """ - Offload optimizer states and/or master weights to CPU. - Starts async D2H transfer that can overlap with other operations. - - Args: - offload_optimizer_states: Whether to offload exp_avg, exp_avg_sq. - offload_master_weights: Whether to offload master weights. - """ - if not offload_optimizer_states and not offload_master_weights: - return - - # Wait for current stream finishing updating the optimizer states. - self._d2h_stream.wait_stream(torch.cuda.current_stream()) - - with torch.cuda.stream(self._d2h_stream): - self._offload_states(offload_optimizer_states, offload_master_weights) - - self._offloaded = True - - def release_gpu_memory(self): - """ - Release GPU memory for optimizer states and master weights after D2H copy completes. - - This is separated from offload() to allow delayed GPU memory release, - which is needed for mxfp8 + overlap_param_gather case where master weights - must remain on GPU until after _copy_main_params_to_param_buffer() is called. - """ - if not self._offloaded: - return - - self._release_states() - - def reload(self): - """ - Reload optimizer states and/or master weights from CPU to GPU. - Call before optimizer.step() to ensure states are on GPU. - """ - if not self._offloaded: - return - - # Allocate GPU memory on the current stream to avoid fragmentation. - self._reload_states(is_allocate_stage=True) - - self._h2d_stream.wait_stream(self._d2h_stream) - self._h2d_stream.wait_stream(torch.cuda.current_stream()) - - # Reload states on the h2d stream to overlap with other operations. - with torch.cuda.stream(self._h2d_stream): - self._reload_states(is_allocate_stage=False) - - self._offloaded_state_keys = () - self._offloaded_mcore_master_weights = False - self._offloaded = False - - def sync_before_step(self): - """ - Wait for H2D reload to complete before optimizer.step(). - Must be called to ensure states are on GPU before optimizer uses them. - - This is separated from reload() to make it possible to move the reload ahead of time. - """ - torch.cuda.current_stream().wait_stream(self._h2d_stream) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 9536bc4f9ef..6e093f96f7e 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -49,7 +49,6 @@ from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule -from .cpu_offloading.optimizer_state_offloader import OptimizerStateOffloader from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys from .optimizer_config import OptimizerConfig @@ -605,10 +604,6 @@ def __init__( self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges] self.optimizer.load_state_dict(self.optimizer.state_dict()) - self._state_offloader: Optional[OptimizerStateOffloader] = None - if self.config.offload_optimizer_states: - self._state_offloader = OptimizerStateOffloader(self) - def _get_model_param_range_map(self, param: torch.nn.Parameter): """ Given a model param, get the index sub-range of the param that this @@ -2585,8 +2580,6 @@ def step_with_ready_grads(self) -> bool: Under the hood, either launch synchronous param all-gathers or get ready to launch asynchorous all-gathers that get overlapped with the next forward pass. """ - if self._state_offloader is not None: - self._state_offloader.sync_before_step() update_successful = super().step_with_ready_grads() timers = self.config.timers @@ -2607,22 +2600,4 @@ def step_with_ready_grads(self) -> bool: if timers is not None: timers('params-all-gather').stop() - if self._state_offloader is not None: - self._state_offloader.mark_optimizer_states_initialized() - return update_successful - - def offload_states(self): - """Offload states to CPU.""" - if self._state_offloader is not None: - self._state_offloader.offload() - - def reload_offloaded_states(self): - """Start async reload of offloaded states.""" - if self._state_offloader is not None: - self._state_offloader.reload() - - def release_offloaded_gpu_states(self): - """Release GPU memory after D2H completes. For delayed release case.""" - if self._state_offloader is not None: - self._state_offloader.release_gpu_memory() diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 1813488d7bd..679878ed954 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -266,12 +266,6 @@ class OptimizerConfig: pin_cpu_params: bool = True """If True, pin the optimizer parameters to CPU memory.""" - offload_optimizer_states: bool = False - """ - If True, offload optimizer states to CPU after each optimizer step and - reload them before the next optimizer step. - """ - ################ # Miscellaneous ################ diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 8a70772cc3d..5f9e7350c18 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1271,11 +1271,6 @@ def validate_args(args, defaults={}): "must be used in conjunction with `--fp8-recipe delayed`." ) - if args.offload_optimizer_states: - assert args.use_distributed_optimizer, "offload_optimizer_states is only supported with distributed optimizer" - assert args.optimizer == 'adam', "offload_optimizer_states is only supported with adam optimizer" - assert not args.use_megatron_fsdp, "offload_optimizer_states does not support Megatron-FSDP for now." - if args.non_persistent_ckpt_type == "local": assert args.non_persistent_local_ckpt_dir is not None, "Tried to use local checkpointing without specifying --local-ckpt-dir!" if args.replication: @@ -2391,14 +2386,6 @@ def _add_training_args(parser): help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', help='Disable pinning of CPU memory for parameters.') - group.add_argument('--offload-optimizer-states', - action='store_true', - dest='offload_optimizer_states', - help='Offload optimizer states to CPU after each optimizer step and ' - 'reload them before the next optimizer step. ' - 'Only support TE FusedAdam optimizer.' - 'Note that this still uses pure GPU optimizer instead of ' - 'HybridDeviceOptimizer for --optimizer-cpu-offload.') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic', 'external'], help='Single pass vs multiple pass data loader') diff --git a/megatron/training/training.py b/megatron/training/training.py index 8aff2556d14..845d271f62e 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1425,12 +1425,6 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch rerun_state_machine = get_rerun_state_machine() while rerun_state_machine.should_run_forward_backward(data_iterator): - # Offload optimizer states to CPU if enabled. - if args.offload_optimizer_states: - for optim_instance in optimizer.chained_optimizers: - if isinstance(optim_instance, DistributedOptimizer): - optim_instance.offload_states() - # Set grad to zero. for model_chunk in model: model_chunk.zero_grad_buffer() @@ -1464,14 +1458,6 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch if isinstance(optim_instance, DistributedOptimizer): optim_instance._copy_main_params_to_param_buffer() - # Release GPU memory for offloaded optimizer states. - # This needs to be done after _copy_main_params_to_param_buffer(). - # Separate offload and release to allow early D2H transfer to overlap with other operations. - if args.offload_optimizer_states: - for optim_instance in optimizer.chained_optimizers: - if isinstance(optim_instance, DistributedOptimizer): - optim_instance.release_offloaded_gpu_states() - # Forward pass. losses_reduced = forward_backward_func( forward_step_func=forward_step_func, @@ -2319,21 +2305,7 @@ def train( config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model] if len(model) == 1: config.param_sync_func = config.param_sync_func[0] - - # Wrap finalize_model_grads to reload offloaded optimizer states before grad finalization. - # This allows H2D transfer to overlap with grad all-reduce. - if args.offload_optimizer_states: - - def finalize_model_grads_with_state_reload(*fmg_args, **fmg_kwargs): - # Reload offloaded states for all DistributedOptimizer instances - for optim_instance in optimizer.chained_optimizers: - if isinstance(optim_instance, DistributedOptimizer): - optim_instance.reload_offloaded_states() - return finalize_model_grads(*fmg_args, **fmg_kwargs) - - config.finalize_model_grads_func = finalize_model_grads_with_state_reload - else: - config.finalize_model_grads_func = finalize_model_grads + config.finalize_model_grads_func = finalize_model_grads if args.log_energy: energy_monitor.setup() diff --git a/tests/unit_tests/test_optimizer_state_offloading.py b/tests/unit_tests/test_optimizer_state_offloading.py deleted file mode 100644 index baaab355182..00000000000 --- a/tests/unit_tests/test_optimizer_state_offloading.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -"""Unit tests for OptimizerStateOffloader.""" - -import pytest -import torch -import torch.nn as nn - -from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig -from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer -from megatron.core.transformer import TransformerConfig -from tests.unit_tests.test_utilities import Utils - -try: - from transformer_engine.pytorch.optimizers import FusedAdam # noqa: F401 - - TE_FUSED_ADAM_AVAILABLE = True -except ImportError: - TE_FUSED_ADAM_AVAILABLE = False - - -class SimpleModel(nn.Module): - """Simple model for testing.""" - - def __init__(self, hidden_size=256): - super().__init__() - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.fc2 = nn.Linear(hidden_size, hidden_size) - - def forward(self, x): - return self.fc2(torch.relu(self.fc1(x))) - - -def create_model_and_optimizer(hidden_size=256, offload_optimizer_states=True, **optimizer_kwargs): - """Helper to create model and optimizer for tests.""" - model = SimpleModel(hidden_size=hidden_size).bfloat16().cuda() - ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) - model = DistributedDataParallel( - TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model - ) - - default_config = dict( - optimizer='adam', - bf16=True, - lr=0.001, - use_distributed_optimizer=True, - offload_optimizer_states=offload_optimizer_states, - ) - default_config.update(optimizer_kwargs) - - optimizer_config = OptimizerConfig(**default_config) - optim = get_megatron_optimizer(optimizer_config, [model]) - return model, optim - - -def run_forward_backward_step(model, optim, hidden_size=256): - """Run a single forward-backward-step cycle.""" - input_tensor = torch.randn(8, hidden_size, dtype=torch.bfloat16, device='cuda') - output = model(input_tensor) - output.sum().backward() - optim.step() - optim.zero_grad() - - -# ============================================================================= -# Test 1: Basic OptimizerStateOffloader Initialization -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -def test_offloader_initialization(): - """Test that OptimizerStateOffloader initializes correctly.""" - Utils.initialize_model_parallel() - model, optim = create_model_and_optimizer() - dist_optim = optim.chained_optimizers[0] - - # Offloader is created in __init__ when offload_optimizer_states=True - assert dist_optim._state_offloader is not None - offloader = dist_optim._state_offloader - - # Verify offloader properties - assert offloader.adam_optimizer is not None - assert offloader._d2h_stream is not None - assert offloader._h2d_stream is not None - assert offloader._offloaded is False - - # Before first step, optimizer states are not initialized yet - assert offloader._optimizer_states_initialized is False - - # Run one step to initialize optimizer states - run_forward_backward_step(model, optim) - - # After first step, optimizer states should be marked as initialized - assert offloader._optimizer_states_initialized is True - Utils.destroy_model_parallel() - - -# ============================================================================= -# Test 2: Early Master Weight Offloading Before First Step -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -def test_early_master_weight_offloading(): - """Test that master weights can be offloaded before the first optimizer step.""" - Utils.initialize_model_parallel() - model, optim = create_model_and_optimizer() - dist_optim = optim.chained_optimizers[0] - - # Offloader is created in __init__ - assert dist_optim._state_offloader is not None - offloader = dist_optim._state_offloader - - # Before first step, optimizer states are not initialized - assert offloader._optimizer_states_initialized is False - - # Capture original master weights before offload - original_master_weights = [] - for group in dist_optim.shard_fp32_from_float16_groups: - group_weights = [tensor.clone() for tensor in group] - original_master_weights.append(group_weights) - - # Offload before first step - should only offload master weights - offloader.offload() - offloader.release_gpu_memory() - torch.cuda.synchronize() - - # Verify master weights were offloaded (storage resized to 0) - for group in dist_optim.shard_fp32_from_float16_groups: - for tensor in group: - assert tensor.untyped_storage().size() == 0, "Master weight should be offloaded" - - # Reload master weights - offloader.reload() - offloader.sync_before_step() - - # Verify master weights match after reload - for group_idx, group in enumerate(dist_optim.shard_fp32_from_float16_groups): - for param_idx, tensor in enumerate(group): - original = original_master_weights[group_idx][param_idx] - torch.testing.assert_close( - tensor, - original, - msg=f"Master weight [{group_idx}][{param_idx}] mismatch after offload/reload", - ) - - # Now run a step and verify optimizer states can be offloaded after - run_forward_backward_step(model, optim) - assert offloader._optimizer_states_initialized is True - - Utils.destroy_model_parallel() - - -# ============================================================================= -# Test 3: Offload and Reload Correctness -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -@pytest.mark.parametrize("offload_optimizer_states", [True, False]) -@pytest.mark.parametrize("offload_master_weights", [True, False]) -def test_offload_reload_correctness(offload_optimizer_states, offload_master_weights): - """Test that offload/reload preserves optimizer state values.""" - if not offload_optimizer_states and not offload_master_weights: - pytest.skip("At least one offload type required") - - Utils.initialize_model_parallel() - model, optim = create_model_and_optimizer() - dist_optim = optim.chained_optimizers[0] - - # Run steps to build up optimizer state - for _ in range(3): - run_forward_backward_step(model, optim) - - offloader = dist_optim._state_offloader - - # Capture original states before offload - original_states = {} - for param, state in offloader.adam_optimizer.state.items(): - original_states[param] = { - k: v.clone() for k, v in state.items() if isinstance(v, torch.Tensor) - } - - # Offload - offloader.offload( - offload_optimizer_states=offload_optimizer_states, - offload_master_weights=offload_master_weights, - ) - - # Release GPU memory - offloader.release_gpu_memory() - torch.cuda.synchronize() - - # Reload - offloader.reload() - offloader.sync_before_step() - - # Verify states match after reload - for param, state in offloader.adam_optimizer.state.items(): - if param in original_states: - for key, original_tensor in original_states[param].items(): - if key in state and isinstance(state[key], torch.Tensor): - reloaded_tensor = state[key] - assert reloaded_tensor.device.type == 'cuda', f"State {key} should be on GPU" - torch.testing.assert_close( - reloaded_tensor, - original_tensor, - msg=f"State {key} mismatch after offload/reload", - ) - Utils.destroy_model_parallel() - - -# ============================================================================= -# Test 4: GPU Memory Release Verification -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -def test_gpu_memory_release(): - """Test that GPU memory is actually freed after release_gpu_memory().""" - Utils.initialize_model_parallel() - # Use larger model for measurable memory impact - model, optim = create_model_and_optimizer(hidden_size=1024) - dist_optim = optim.chained_optimizers[0] - - # Initialize optimizer states - run_forward_backward_step(model, optim, hidden_size=1024) - - offloader = dist_optim._state_offloader - - # Measure memory before offload - torch.cuda.synchronize() - torch.cuda.empty_cache() - memory_before = torch.cuda.memory_allocated() - - # Offload and release - offloader.offload() - offloader.release_gpu_memory() - - # Wait for async operations - torch.cuda.synchronize() - torch.cuda.empty_cache() - memory_after = torch.cuda.memory_allocated() - - # Memory should decrease - memory_freed = memory_before - memory_after - assert memory_freed > 0, f"Expected memory to be freed, but got {memory_freed} bytes difference" - Utils.destroy_model_parallel() - - -# ============================================================================= -# Test 5: Multiple Offload/Reload Cycles -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -def test_multiple_offload_reload_cycles(): - """Test that multiple offload/reload cycles work correctly.""" - Utils.initialize_model_parallel() - model, optim = create_model_and_optimizer() - dist_optim = optim.chained_optimizers[0] - - # Initialize - run_forward_backward_step(model, optim) - - offloader = dist_optim._state_offloader - - # Run multiple cycles - for cycle in range(5): - # Offload - offloader.offload() - offloader.release_gpu_memory() - - # Reload - offloader.reload() - offloader.sync_before_step() - - # Run optimizer step - run_forward_backward_step(model, optim) - - # Verify model can still produce valid outputs - input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') - output = model(input_tensor) - assert not output.isnan().any(), "Model output contains NaN after multiple cycles" - Utils.destroy_model_parallel() - - -# ============================================================================= -# Test 6: Training Correctness with Offloading -# ============================================================================= -@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") -def test_training_correctness_with_offloading(): - """Test that training with offloading produces same results as without.""" - Utils.initialize_model_parallel() - torch.manual_seed(42) - - # Model 1: with offloading - model1, optim1 = create_model_and_optimizer(offload_optimizer_states=True, lr=0.01) - - # Model 2: without offloading (reference) - torch.manual_seed(42) - model2, optim2 = create_model_and_optimizer(offload_optimizer_states=False, lr=0.01) - - # Train both models - n_steps = 10 - torch.manual_seed(123) - dist_optim1 = optim1.chained_optimizers[0] - - # Offloader is created in __init__ when offload_optimizer_states=True - assert dist_optim1._state_offloader is not None - offloader = dist_optim1._state_offloader - - for step in range(n_steps): - input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') - - # Model 1 with offloading - # Offload states (master weights can be offloaded from the start, - # optimizer states will be skipped until after first step) - offloader.offload() - offloader.release_gpu_memory() - - output1 = model1(input_tensor) - loss1 = output1.sum() - loss1.backward() - - offloader.reload() - offloader.sync_before_step() - optim1.step() - optim1.zero_grad() - - # Model 2 without offloading - output2 = model2(input_tensor) - loss2 = output2.sum() - loss2.backward() - optim2.step() - optim2.zero_grad() - - # Compare final model weights - for (n1, p1), (n2, p2) in zip(model1.named_parameters(), model2.named_parameters()): - torch.testing.assert_close( - p1.data, - p2.data, - atol=1e-5, - rtol=1e-4, - msg=f"Parameter {n1} mismatch between offloaded and non-offloaded training", - ) - Utils.destroy_model_parallel() From bd8411c39332651120ce7505bb64b37d73075801 Mon Sep 17 00:00:00 2001 From: Nan Zheng <80790206+nanz-nv@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:36:47 +0800 Subject: [PATCH 240/248] Forced load imbalance (#2917) Co-authored-by: Dennis(Zhenhuan) Liu --- megatron/core/transformer/moe/moe_utils.py | 54 ++++++++++++++++++- megatron/core/transformer/moe/router.py | 7 +++ .../core/transformer/transformer_config.py | 7 +++ megatron/training/arguments.py | 6 +++ 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index d38b06b2704..60878155fd4 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -10,7 +10,11 @@ from megatron.core.fp4_utils import get_fp4_align_size from megatron.core.fp8_utils import get_fp8_align_size from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel import ( + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, + get_expert_parallel_rng_tracker_name, +) from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope @@ -1021,6 +1025,54 @@ def apply_random_logits(logits): return RandomSTE.apply(logits) +@internal_api +class RandomSTEShared(torch.autograd.Function): + """ + STE that generates random values with shared seed across all ranks. + When std < 0, caches and reuses values per layer. + """ + + _cache = {} + + @staticmethod + def forward(ctx, logits, std, layer_number): + """Forward pass: apply random bias to logits.""" + # Check cache if reuse mode (negative std) + if std < 0 and layer_number in RandomSTEShared._cache: + return logits + RandomSTEShared._cache[layer_number] + + # Generate random bias with shared seed across all ranks + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + bias = torch.empty(logits.shape[-1], device=logits.device, dtype=logits.dtype).normal_( + std=abs(std) + ) + + # Cache if reuse mode + if std < 0 and layer_number is not None: + RandomSTEShared._cache[layer_number] = bias + + return logits + bias + + @staticmethod + def backward(ctx, grad_output): + """Backward pass: pass through gradients.""" + return grad_output, None, None + + +def apply_biased_logits(logits, std, layer_number=None): + """ + Apply random bias to logits. All ranks get the same random values. + + Args: + logits: Input logits tensor [num_tokens, num_experts] + std: Standard deviation for random bias. If negative, generate once + per layer and reuse (using abs(std) as actual std). + layer_number: Layer number for caching when std is negative. + """ + logits = apply_random_logits(logits) + return RandomSTEShared.apply(logits, std, layer_number) + + class RouterGatingLinearFunction(torch.autograd.Function): """ Autograd function for router gating linear. diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index bbfb01fec8b..003043bc18d 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -10,6 +10,7 @@ from megatron.core.transformer.moe.moe_utils import ( MoEAuxLossAutoScaler, ProcessGroupCollection, + apply_biased_logits, apply_random_logits, apply_router_token_dropping, compute_routing_scores_for_aux_loss, @@ -654,6 +655,12 @@ def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = No # Apply force load balancing with random logits for benchmark logits = apply_random_logits(logits) + if self.config.moe_router_force_biased is not None: + # Apply biased logits with shared random bias across all ranks + logits = apply_biased_logits( + logits, self.config.moe_router_force_biased, self.layer_number + ) + probs, routing_map = self.routing(logits, padding_mask=padding_mask) return probs, routing_map diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index df11daeb095..18cea44c51f 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -616,6 +616,13 @@ class TransformerConfig(ModelParallelConfig): """[Experimental] Force load balancing with random logits for MoE router, supports naive topk and group-limited topk. This is an experimental feature and only for benchmark.""" + moe_router_force_biased: Optional[float] = None + """[Experimental] Apply random expert bias in normal distribution with specified std + to router logits. Shared seed across all ranks ensures identical bias. + If positive, generates new random bias each forward pass. + If negative, generates bias once per layer and reuses it (abs value is std). + This is an experimental feature for benchmarking purposes.""" + moe_grouped_gemm: bool = False """When there are multiple experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 5f9e7350c18..096d63985d9 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -3332,6 +3332,12 @@ def _add_moe_args(parser): 'The default value 1e-3 is same as that used in DeepSeekV3.') group.add_argument('--moe-router-force-load-balancing', action='store_true', help='[Experimental] Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only!') + group.add_argument('--moe-router-force-biased', type=float, default=None, + help='[Experimental] Apply random expert bias in normal distribution with specified std to router logits. ' + 'Shared seed across all ranks ensures identical bias. ' + 'If positive, generates new random bias each forward pass. ' + 'If negative, generates bias once per layer and reuses it (abs value is std). ' + 'This experimental feature is for benchmarking purposes only!') group.add_argument('--moe-router-padding-for-quantization', action='store_true', help='Pad the routing_map to make sure the number of tokens each expert received ' 'is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for ' From 0a2e01fdcade766a9d1ebd0119387ba159358b61 Mon Sep 17 00:00:00 2001 From: hx Date: Mon, 19 Jan 2026 15:51:50 +0800 Subject: [PATCH 241/248] [Dev] [Reapply] Optimizer State and Master Weight Offloading (#2987) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Xin Yao Co-authored-by: oliver könig --- .../optimizer_state_offloader.py | 315 ++++++++++++++++ megatron/core/optimizer/distrib_optimizer.py | 26 ++ megatron/core/optimizer/optimizer_config.py | 6 + megatron/training/arguments.py | 13 + megatron/training/training.py | 30 +- .../test_optimizer_state_offloading.py | 337 ++++++++++++++++++ 6 files changed, 726 insertions(+), 1 deletion(-) create mode 100644 megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py create mode 100644 tests/unit_tests/test_optimizer_state_offloading.py diff --git a/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py new file mode 100644 index 00000000000..81fd116c8ba --- /dev/null +++ b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py @@ -0,0 +1,315 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +"""Optimizer state offloading class.""" + +from typing import TYPE_CHECKING, Dict, List, Tuple + +import torch + +if TYPE_CHECKING: + from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer + + +class OptimizerStateOffloader: + """ + Manages offloading of optimizer states and master weights to CPU. + Used with DistributedOptimizer to reduce GPU memory usage. + + Supports overlapped D2H/H2D transfers using CUDA streams. + + Master weights can be stored in two locations: + - In adam optimizer state (when use_precision_aware_optimizer_no_fp8_or_ds_fp8 is True) + - In mcore's shard_fp32_from_float16_groups + """ + + OPTIMIZER_STATE_KEYS = ('exp_avg', 'exp_avg_sq') + MASTER_WEIGHT_KEY = 'master_param' + + def __init__(self, distrib_optimizer: "DistributedOptimizer"): + """ + Args: + distrib_optimizer: The DistributedOptimizer to offload states and master weights from. + """ + self.dist_optimizer = distrib_optimizer + self.adam_optimizer = distrib_optimizer.optimizer + + # Only support TE FusedAdam optimizer for now. + try: + from transformer_engine.pytorch.optimizers import FusedAdam + + assert isinstance(self.adam_optimizer, FusedAdam), ( + f"OptimizerStateOffloader requires TE FusedAdam optimizer, " + f"but got {type(self.adam_optimizer).__name__}" + ) + except ImportError: + raise ImportError( + "OptimizerStateOffloader requires transformer_engine.pytorch.optimizers.FusedAdam" + ) + + # Check if master weights are stored in adam optimizer state + self.optimizer_contains_master_weights = self.adam_optimizer.master_weights + + # CUDA streams for async transfers + self._d2h_stream = torch.cuda.Stream() + self._h2d_stream = torch.cuda.Stream() + + # CPU buffers for optimizer states: {param: {key: cpu_tensor}} + self._opt_state_cpu_buffers: Dict[torch.Tensor, Dict[str, torch.Tensor]] = {} + + # CPU buffers for mcore master weights, matching the structure of source groups + # List[List[cpu_tensor]] + self._shard_fp32_from_float16_cpu_buffers: List[List[torch.Tensor]] = [] + + # State tracking + self._offloaded = False + self._offloaded_state_keys: Tuple[str, ...] = () + self._offloaded_mcore_master_weights = False + + # Track whether optimizer states (exp_avg, exp_avg_sq) have been initialized. + # These are lazily initialized by FusedAdam during the first optimizer.step(). + # Master weights (shard_fp32_from_float16_groups) are available from the start. + self._optimizer_states_initialized = False + + def mark_optimizer_states_initialized(self): + """ + Mark that optimizer states (exp_avg, exp_avg_sq) are now available. + Should be called after the first optimizer.step() completes. + """ + self._optimizer_states_initialized = True + + def _get_state_keys_to_offload( + self, offload_optimizer_states: bool, offload_master_weights: bool + ) -> Tuple[str, ...]: + """Get the state keys in FusedAdam to offload based on configuration.""" + keys = [] + # Skip optimizer states offloading if they haven't been initialized yet. + # Optimizer states are lazily initialized by FusedAdam during the first optimizer.step(). + if self._optimizer_states_initialized: + if offload_optimizer_states: + keys.extend(self.OPTIMIZER_STATE_KEYS) + if offload_master_weights and self.optimizer_contains_master_weights: + keys.append(self.MASTER_WEIGHT_KEY) + return tuple(keys) + + def _ensure_state_cpu_buffer( + self, param: torch.Tensor, state_key: str, gpu_tensor: torch.Tensor, pin_memory: bool = True + ) -> torch.Tensor: + """Get or create a CPU buffer for a state tensor.""" + if param not in self._opt_state_cpu_buffers: + self._opt_state_cpu_buffers[param] = {} + + if state_key not in self._opt_state_cpu_buffers[param]: + cpu_buffer = torch.empty( + gpu_tensor.size(), + dtype=gpu_tensor.dtype, + layout=gpu_tensor.layout, + device='cpu', + pin_memory=pin_memory, + ) + self._opt_state_cpu_buffers[param][state_key] = cpu_buffer + + return self._opt_state_cpu_buffers[param][state_key] + + def _offload_shard_groups( + self, + shard_groups: List[List[torch.Tensor]], + cpu_buffers: List[List[torch.Tensor]], + pin_memory: bool = True, + ): + """Offload a shard group to CPU buffers.""" + # Initialize CPU buffers on first call + if len(cpu_buffers) == 0: + for group in shard_groups: + group_buffers = [] + for gpu_tensor in group: + cpu_buffer = torch.empty( + gpu_tensor.size(), + dtype=gpu_tensor.dtype, + layout=gpu_tensor.layout, + device='cpu', + pin_memory=pin_memory, + ) + group_buffers.append(cpu_buffer) + cpu_buffers.append(group_buffers) + + # Copy D2H + for group_idx, group in enumerate(shard_groups): + for param_idx, gpu_tensor in enumerate(group): + cpu_buffer = cpu_buffers[group_idx][param_idx] + cpu_buffer.copy_(gpu_tensor, non_blocking=pin_memory) + gpu_tensor.record_stream(self._d2h_stream) + + def _offload_states( + self, + offload_optimizer_states: bool, + offload_master_weights: bool, + use_pin_memory: bool = True, + ): + """Offload optimizer states and/or master weights to CPU.""" + # Offload states from adam optimizer + self._offloaded_state_keys = self._get_state_keys_to_offload( + offload_optimizer_states, offload_master_weights + ) + states = self.adam_optimizer.state + + for param, param_state in states.items(): + for state_key in self._offloaded_state_keys: + if state_key not in param_state: + continue + + gpu_tensor = param_state[state_key] + if not isinstance(gpu_tensor, torch.Tensor) or not gpu_tensor.is_cuda: + continue + + cpu_buffer = self._ensure_state_cpu_buffer( + param, state_key, gpu_tensor, use_pin_memory + ) + cpu_buffer.copy_(gpu_tensor, non_blocking=use_pin_memory) + gpu_tensor.record_stream(self._d2h_stream) + + # Offload mcore master weights if not in optimizer state + if offload_master_weights and not self.optimizer_contains_master_weights: + self._offload_shard_groups( + self.dist_optimizer.shard_fp32_from_float16_groups, + self._shard_fp32_from_float16_cpu_buffers, + use_pin_memory, + ) + self._offloaded_mcore_master_weights = True + + def _release_states(self): + """Replace optimizer state GPU tensors with CPU tensors to free GPU memory.""" + states = self.adam_optimizer.state + + for param, param_state in states.items(): + if param not in self._opt_state_cpu_buffers: + continue + + for state_key in self._offloaded_state_keys: + if state_key not in self._opt_state_cpu_buffers[param]: + continue + + param_state[state_key].untyped_storage().resize_(0) + + if self._offloaded_mcore_master_weights: + for group in self.dist_optimizer.shard_fp32_from_float16_groups: + for gpu_tensor in group: + gpu_tensor.untyped_storage().resize_(0) + + def _reload_shard_groups( + self, + shard_groups: List[List[torch.Tensor]], + cpu_buffers: List[List[torch.Tensor]], + is_allocate_stage: bool, + ): + """Reload shard groups from CPU to GPU.""" + for group_idx, group in enumerate(shard_groups): + for param_idx, _ in enumerate(group): + cpu_buffer = cpu_buffers[group_idx][param_idx] + if is_allocate_stage: + shard_groups[group_idx][param_idx].untyped_storage().resize_( + cpu_buffer.untyped_storage().size() + ) + else: + shard_groups[group_idx][param_idx].copy_( + cpu_buffer, non_blocking=cpu_buffer.is_pinned() + ) + + def _reload_states(self, is_allocate_stage: bool): + """ + Reload optimizer states and/or master weights from CPU to GPU. + + If is_allocate_stage is True, only allocate GPU memory for the states and master weights, + but do not copy the data from CPU to GPU. Otherwise, copy the data from CPU to GPU. + The two processes are separated to make sure that the GPU memory is allocated on the + default stream to avoid fragmentation. + """ + # Reload states to adam optimizer + states = self.adam_optimizer.state + + for param, param_state in states.items(): + if param not in self._opt_state_cpu_buffers: + continue + + for state_key in self._offloaded_state_keys: + if state_key not in self._opt_state_cpu_buffers[param]: + continue + + cpu_buffer = self._opt_state_cpu_buffers[param][state_key] + if is_allocate_stage: + param_state[state_key].untyped_storage().resize_( + cpu_buffer.untyped_storage().size() + ) + else: + param_state[state_key].copy_(cpu_buffer, non_blocking=cpu_buffer.is_pinned()) + + # Reload mcore master weights if not in optimizer state + if self._offloaded_mcore_master_weights: + self._reload_shard_groups( + self.dist_optimizer.shard_fp32_from_float16_groups, + self._shard_fp32_from_float16_cpu_buffers, + is_allocate_stage, + ) + + def offload(self, offload_optimizer_states: bool = True, offload_master_weights: bool = True): + """ + Offload optimizer states and/or master weights to CPU. + Starts async D2H transfer that can overlap with other operations. + + Args: + offload_optimizer_states: Whether to offload exp_avg, exp_avg_sq. + offload_master_weights: Whether to offload master weights. + """ + if not offload_optimizer_states and not offload_master_weights: + return + + # Wait for current stream finishing updating the optimizer states. + self._d2h_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._d2h_stream): + self._offload_states(offload_optimizer_states, offload_master_weights) + + self._offloaded = True + + def release_gpu_memory(self): + """ + Release GPU memory for optimizer states and master weights after D2H copy completes. + + This is separated from offload() to allow delayed GPU memory release, + which is needed for mxfp8 + overlap_param_gather case where master weights + must remain on GPU until after _copy_main_params_to_param_buffer() is called. + """ + if not self._offloaded: + return + + self._release_states() + + def reload(self): + """ + Reload optimizer states and/or master weights from CPU to GPU. + Call before optimizer.step() to ensure states are on GPU. + """ + if not self._offloaded: + return + + # Allocate GPU memory on the current stream to avoid fragmentation. + self._reload_states(is_allocate_stage=True) + + self._h2d_stream.wait_stream(self._d2h_stream) + self._h2d_stream.wait_stream(torch.cuda.current_stream()) + + # Reload states on the h2d stream to overlap with other operations. + with torch.cuda.stream(self._h2d_stream): + self._reload_states(is_allocate_stage=False) + + self._offloaded_state_keys = () + self._offloaded_mcore_master_weights = False + self._offloaded = False + + def sync_before_step(self): + """ + Wait for H2D reload to complete before optimizer.step(). + Must be called to ensure states are on GPU before optimizer uses them. + + This is separated from reload() to make it possible to move the reload ahead of time. + """ + torch.cuda.current_stream().wait_stream(self._h2d_stream) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index 6e093f96f7e..2f5876fa48a 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -49,6 +49,7 @@ from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict from ..transformer.module import MegatronModule +from .cpu_offloading.optimizer_state_offloader import OptimizerStateOffloader from .grad_scaler import MegatronGradScaler from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys from .optimizer_config import OptimizerConfig @@ -516,6 +517,8 @@ def __init__( "due to checkpointing requirements." ) + self._state_offloader: Optional[OptimizerStateOffloader] = None + # when freezing sub-models we have no real optimizer # but still need a stub DistributedOptimizer class if optimizer is None: @@ -604,6 +607,9 @@ def __init__( self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges] self.optimizer.load_state_dict(self.optimizer.state_dict()) + if self.config.offload_optimizer_states: + self._state_offloader = OptimizerStateOffloader(self) + def _get_model_param_range_map(self, param: torch.nn.Parameter): """ Given a model param, get the index sub-range of the param that this @@ -2580,6 +2586,8 @@ def step_with_ready_grads(self) -> bool: Under the hood, either launch synchronous param all-gathers or get ready to launch asynchorous all-gathers that get overlapped with the next forward pass. """ + if self._state_offloader is not None: + self._state_offloader.sync_before_step() update_successful = super().step_with_ready_grads() timers = self.config.timers @@ -2600,4 +2608,22 @@ def step_with_ready_grads(self) -> bool: if timers is not None: timers('params-all-gather').stop() + if self._state_offloader is not None: + self._state_offloader.mark_optimizer_states_initialized() + return update_successful + + def offload_states(self): + """Offload states to CPU.""" + if self._state_offloader is not None: + self._state_offloader.offload() + + def reload_offloaded_states(self): + """Start async reload of offloaded states.""" + if self._state_offloader is not None: + self._state_offloader.reload() + + def release_offloaded_gpu_states(self): + """Release GPU memory after D2H completes. For delayed release case.""" + if self._state_offloader is not None: + self._state_offloader.release_gpu_memory() diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 679878ed954..1813488d7bd 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -266,6 +266,12 @@ class OptimizerConfig: pin_cpu_params: bool = True """If True, pin the optimizer parameters to CPU memory.""" + offload_optimizer_states: bool = False + """ + If True, offload optimizer states to CPU after each optimizer step and + reload them before the next optimizer step. + """ + ################ # Miscellaneous ################ diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 096d63985d9..a65f1cd6469 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1271,6 +1271,11 @@ def validate_args(args, defaults={}): "must be used in conjunction with `--fp8-recipe delayed`." ) + if args.offload_optimizer_states: + assert args.use_distributed_optimizer, "offload_optimizer_states is only supported with distributed optimizer" + assert args.optimizer == 'adam', "offload_optimizer_states is only supported with adam optimizer" + assert not args.use_megatron_fsdp, "offload_optimizer_states does not support Megatron-FSDP for now." + if args.non_persistent_ckpt_type == "local": assert args.non_persistent_local_ckpt_dir is not None, "Tried to use local checkpointing without specifying --local-ckpt-dir!" if args.replication: @@ -2386,6 +2391,14 @@ def _add_training_args(parser): help='Disable pinning of CPU memory for gradients.') group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params', help='Disable pinning of CPU memory for parameters.') + group.add_argument('--offload-optimizer-states', + action='store_true', + dest='offload_optimizer_states', + help='Offload optimizer states to CPU after each optimizer step and ' + 'reload them before the next optimizer step. ' + 'Only support TE FusedAdam optimizer.' + 'Note that this still uses pure GPU optimizer instead of ' + 'HybridDeviceOptimizer for --optimizer-cpu-offload.') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic', 'external'], help='Single pass vs multiple pass data loader') diff --git a/megatron/training/training.py b/megatron/training/training.py index 845d271f62e..8aff2556d14 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1425,6 +1425,12 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch rerun_state_machine = get_rerun_state_machine() while rerun_state_machine.should_run_forward_backward(data_iterator): + # Offload optimizer states to CPU if enabled. + if args.offload_optimizer_states: + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.offload_states() + # Set grad to zero. for model_chunk in model: model_chunk.zero_grad_buffer() @@ -1458,6 +1464,14 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch if isinstance(optim_instance, DistributedOptimizer): optim_instance._copy_main_params_to_param_buffer() + # Release GPU memory for offloaded optimizer states. + # This needs to be done after _copy_main_params_to_param_buffer(). + # Separate offload and release to allow early D2H transfer to overlap with other operations. + if args.offload_optimizer_states: + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.release_offloaded_gpu_states() + # Forward pass. losses_reduced = forward_backward_func( forward_step_func=forward_step_func, @@ -2305,7 +2319,21 @@ def train( config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model] if len(model) == 1: config.param_sync_func = config.param_sync_func[0] - config.finalize_model_grads_func = finalize_model_grads + + # Wrap finalize_model_grads to reload offloaded optimizer states before grad finalization. + # This allows H2D transfer to overlap with grad all-reduce. + if args.offload_optimizer_states: + + def finalize_model_grads_with_state_reload(*fmg_args, **fmg_kwargs): + # Reload offloaded states for all DistributedOptimizer instances + for optim_instance in optimizer.chained_optimizers: + if isinstance(optim_instance, DistributedOptimizer): + optim_instance.reload_offloaded_states() + return finalize_model_grads(*fmg_args, **fmg_kwargs) + + config.finalize_model_grads_func = finalize_model_grads_with_state_reload + else: + config.finalize_model_grads_func = finalize_model_grads if args.log_energy: energy_monitor.setup() diff --git a/tests/unit_tests/test_optimizer_state_offloading.py b/tests/unit_tests/test_optimizer_state_offloading.py new file mode 100644 index 00000000000..baaab355182 --- /dev/null +++ b/tests/unit_tests/test_optimizer_state_offloading.py @@ -0,0 +1,337 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +"""Unit tests for OptimizerStateOffloader.""" + +import pytest +import torch +import torch.nn as nn + +from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.transformer import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +try: + from transformer_engine.pytorch.optimizers import FusedAdam # noqa: F401 + + TE_FUSED_ADAM_AVAILABLE = True +except ImportError: + TE_FUSED_ADAM_AVAILABLE = False + + +class SimpleModel(nn.Module): + """Simple model for testing.""" + + def __init__(self, hidden_size=256): + super().__init__() + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + + def forward(self, x): + return self.fc2(torch.relu(self.fc1(x))) + + +def create_model_and_optimizer(hidden_size=256, offload_optimizer_states=True, **optimizer_kwargs): + """Helper to create model and optimizer for tests.""" + model = SimpleModel(hidden_size=hidden_size).bfloat16().cuda() + ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) + model = DistributedDataParallel( + TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model + ) + + default_config = dict( + optimizer='adam', + bf16=True, + lr=0.001, + use_distributed_optimizer=True, + offload_optimizer_states=offload_optimizer_states, + ) + default_config.update(optimizer_kwargs) + + optimizer_config = OptimizerConfig(**default_config) + optim = get_megatron_optimizer(optimizer_config, [model]) + return model, optim + + +def run_forward_backward_step(model, optim, hidden_size=256): + """Run a single forward-backward-step cycle.""" + input_tensor = torch.randn(8, hidden_size, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + output.sum().backward() + optim.step() + optim.zero_grad() + + +# ============================================================================= +# Test 1: Basic OptimizerStateOffloader Initialization +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_offloader_initialization(): + """Test that OptimizerStateOffloader initializes correctly.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Offloader is created in __init__ when offload_optimizer_states=True + assert dist_optim._state_offloader is not None + offloader = dist_optim._state_offloader + + # Verify offloader properties + assert offloader.adam_optimizer is not None + assert offloader._d2h_stream is not None + assert offloader._h2d_stream is not None + assert offloader._offloaded is False + + # Before first step, optimizer states are not initialized yet + assert offloader._optimizer_states_initialized is False + + # Run one step to initialize optimizer states + run_forward_backward_step(model, optim) + + # After first step, optimizer states should be marked as initialized + assert offloader._optimizer_states_initialized is True + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 2: Early Master Weight Offloading Before First Step +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_early_master_weight_offloading(): + """Test that master weights can be offloaded before the first optimizer step.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Offloader is created in __init__ + assert dist_optim._state_offloader is not None + offloader = dist_optim._state_offloader + + # Before first step, optimizer states are not initialized + assert offloader._optimizer_states_initialized is False + + # Capture original master weights before offload + original_master_weights = [] + for group in dist_optim.shard_fp32_from_float16_groups: + group_weights = [tensor.clone() for tensor in group] + original_master_weights.append(group_weights) + + # Offload before first step - should only offload master weights + offloader.offload() + offloader.release_gpu_memory() + torch.cuda.synchronize() + + # Verify master weights were offloaded (storage resized to 0) + for group in dist_optim.shard_fp32_from_float16_groups: + for tensor in group: + assert tensor.untyped_storage().size() == 0, "Master weight should be offloaded" + + # Reload master weights + offloader.reload() + offloader.sync_before_step() + + # Verify master weights match after reload + for group_idx, group in enumerate(dist_optim.shard_fp32_from_float16_groups): + for param_idx, tensor in enumerate(group): + original = original_master_weights[group_idx][param_idx] + torch.testing.assert_close( + tensor, + original, + msg=f"Master weight [{group_idx}][{param_idx}] mismatch after offload/reload", + ) + + # Now run a step and verify optimizer states can be offloaded after + run_forward_backward_step(model, optim) + assert offloader._optimizer_states_initialized is True + + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 3: Offload and Reload Correctness +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +@pytest.mark.parametrize("offload_optimizer_states", [True, False]) +@pytest.mark.parametrize("offload_master_weights", [True, False]) +def test_offload_reload_correctness(offload_optimizer_states, offload_master_weights): + """Test that offload/reload preserves optimizer state values.""" + if not offload_optimizer_states and not offload_master_weights: + pytest.skip("At least one offload type required") + + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Run steps to build up optimizer state + for _ in range(3): + run_forward_backward_step(model, optim) + + offloader = dist_optim._state_offloader + + # Capture original states before offload + original_states = {} + for param, state in offloader.adam_optimizer.state.items(): + original_states[param] = { + k: v.clone() for k, v in state.items() if isinstance(v, torch.Tensor) + } + + # Offload + offloader.offload( + offload_optimizer_states=offload_optimizer_states, + offload_master_weights=offload_master_weights, + ) + + # Release GPU memory + offloader.release_gpu_memory() + torch.cuda.synchronize() + + # Reload + offloader.reload() + offloader.sync_before_step() + + # Verify states match after reload + for param, state in offloader.adam_optimizer.state.items(): + if param in original_states: + for key, original_tensor in original_states[param].items(): + if key in state and isinstance(state[key], torch.Tensor): + reloaded_tensor = state[key] + assert reloaded_tensor.device.type == 'cuda', f"State {key} should be on GPU" + torch.testing.assert_close( + reloaded_tensor, + original_tensor, + msg=f"State {key} mismatch after offload/reload", + ) + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 4: GPU Memory Release Verification +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_gpu_memory_release(): + """Test that GPU memory is actually freed after release_gpu_memory().""" + Utils.initialize_model_parallel() + # Use larger model for measurable memory impact + model, optim = create_model_and_optimizer(hidden_size=1024) + dist_optim = optim.chained_optimizers[0] + + # Initialize optimizer states + run_forward_backward_step(model, optim, hidden_size=1024) + + offloader = dist_optim._state_offloader + + # Measure memory before offload + torch.cuda.synchronize() + torch.cuda.empty_cache() + memory_before = torch.cuda.memory_allocated() + + # Offload and release + offloader.offload() + offloader.release_gpu_memory() + + # Wait for async operations + torch.cuda.synchronize() + torch.cuda.empty_cache() + memory_after = torch.cuda.memory_allocated() + + # Memory should decrease + memory_freed = memory_before - memory_after + assert memory_freed > 0, f"Expected memory to be freed, but got {memory_freed} bytes difference" + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 5: Multiple Offload/Reload Cycles +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_multiple_offload_reload_cycles(): + """Test that multiple offload/reload cycles work correctly.""" + Utils.initialize_model_parallel() + model, optim = create_model_and_optimizer() + dist_optim = optim.chained_optimizers[0] + + # Initialize + run_forward_backward_step(model, optim) + + offloader = dist_optim._state_offloader + + # Run multiple cycles + for cycle in range(5): + # Offload + offloader.offload() + offloader.release_gpu_memory() + + # Reload + offloader.reload() + offloader.sync_before_step() + + # Run optimizer step + run_forward_backward_step(model, optim) + + # Verify model can still produce valid outputs + input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') + output = model(input_tensor) + assert not output.isnan().any(), "Model output contains NaN after multiple cycles" + Utils.destroy_model_parallel() + + +# ============================================================================= +# Test 6: Training Correctness with Offloading +# ============================================================================= +@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam") +def test_training_correctness_with_offloading(): + """Test that training with offloading produces same results as without.""" + Utils.initialize_model_parallel() + torch.manual_seed(42) + + # Model 1: with offloading + model1, optim1 = create_model_and_optimizer(offload_optimizer_states=True, lr=0.01) + + # Model 2: without offloading (reference) + torch.manual_seed(42) + model2, optim2 = create_model_and_optimizer(offload_optimizer_states=False, lr=0.01) + + # Train both models + n_steps = 10 + torch.manual_seed(123) + dist_optim1 = optim1.chained_optimizers[0] + + # Offloader is created in __init__ when offload_optimizer_states=True + assert dist_optim1._state_offloader is not None + offloader = dist_optim1._state_offloader + + for step in range(n_steps): + input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda') + + # Model 1 with offloading + # Offload states (master weights can be offloaded from the start, + # optimizer states will be skipped until after first step) + offloader.offload() + offloader.release_gpu_memory() + + output1 = model1(input_tensor) + loss1 = output1.sum() + loss1.backward() + + offloader.reload() + offloader.sync_before_step() + optim1.step() + optim1.zero_grad() + + # Model 2 without offloading + output2 = model2(input_tensor) + loss2 = output2.sum() + loss2.backward() + optim2.step() + optim2.zero_grad() + + # Compare final model weights + for (n1, p1), (n2, p2) in zip(model1.named_parameters(), model2.named_parameters()): + torch.testing.assert_close( + p1.data, + p2.data, + atol=1e-5, + rtol=1e-4, + msg=f"Parameter {n1} mismatch between offloaded and non-offloaded training", + ) + Utils.destroy_model_parallel() From 8abc08640a3dfc11510d2849f358d65784507fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 19 Jan 2026 15:51:08 +0100 Subject: [PATCH 242/248] ci(fix): CI_COMMIT_BRANCH on forks (#2982) (#2989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .gitlab/scripts/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh index 8359731e3d7..9bcf5d45712 100644 --- a/.gitlab/scripts/build.sh +++ b/.gitlab/scripts/build.sh @@ -20,6 +20,8 @@ docker buildx create --name container --driver=docker-container --use tls-enviro ADDITIONAL_PARAMS=() +CI_COMMIT_BRANCH="${CI_COMMIT_BRANCH:-$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then ADDITIONAL_PARAMS+=("--pull") fi From 5b17f19fc7d0ed6e00aabb1a3154769d276c68fe Mon Sep 17 00:00:00 2001 From: "Dennis(Zhenhuan) Liu" Date: Tue, 20 Jan 2026 00:56:53 +0800 Subject: [PATCH 243/248] [Dev] Update MoE readme. (#2808) Co-authored-by: Zijie Yan --- megatron/core/transformer/moe/README.md | 931 +++++++++++++++--------- 1 file changed, 584 insertions(+), 347 deletions(-) diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md index a44daea38e2..71dfa17fda0 100644 --- a/megatron/core/transformer/moe/README.md +++ b/megatron/core/transformer/moe/README.md @@ -1,159 +1,396 @@ # Megatron Core MoE -Megatron-Core MoE provides comprehensive parallelism strategies, seamlessly integrating Expert Parallelism with tensor, data, sequence, and pipeline parallelism. With MCore v0.9, we've achieved remarkable performance of **468 TFLOPS** for Mixtral 8X7B bf16 training. Additionally, we support state-of-the-art MoE model architectures including DeepSeek-V3 and Qwen-MoE. - -### What's New -- **Support for DeepSeek-V3 architecture** - - Enable TP for MLA and DeepSeek-V3 - - Enable CP for MLA and DeepSeek-V3 - - Requires TransformerEngine >= 2.5.0 - - Many thanks to [SuperCB](https://github.com/SuperCB) from Xiaohongshu Inc. and [RandMist](https://github.com/RandMist) from WeChat Infra Department, Tencent Inc. for their contributions. - - Support aux-loss-free load balancing strategy - - Support node-limited routing - - Support Multi-Token Prediction (MTP) - - Batch-level overlapping to hide EP-A2A communication -- **Support DeepSeek's DeepEP for efficient token dispatching and combining** -- Support HybridEP for efficient token dispatching and combining within intra-node and MNNVL scenarios. -- Add fusion for token permutation and unpermutation -- Support Uneven virtual pipeline parallel split -- Support output-discarding checkpointing on some submodules - -### Parallelism -- **Expert Parallelism** - - A specific method of parallelism for MoE models, where experts are partitioned onto different workers and each worker processes a different batch of training samples, each worker process one or more experts for each MoE layer. -- **3D Parallelism**: Data Parallelism, Tensor Parallelism, Pipeline Parallelism - - Note: When using MoE with expert parallelism and tensor parallelism, sequence parallelism must be enabled. -- **Context Parallelism**: - - Split the sequence dimension to support long context training. -- **Richer parallel mappings**: EP can be combined with DP/TP/PP/CP for handling larger MoE variants. -- **MoE Parallel Folding**: Support for setting different parallelism strategies for Attention and MoE components, enabling more flexible and efficient model sharding. See detailed documentation below. -- **Full distributed optimizer support.** - -### Router and Load Balancing -- Router type: - - Top-K MLP router -- Load Balancing algorithms: - - Sinkhorn (S-BASE) - - Aux loss / Load balancing loss - - Aux-loss-free load balancing strategy -- CUDA fused routing and load balancing kernels +Megatron Core MoE is a production-ready framework for training large-scale Mixture-of-Experts models, providing the foundational architecture, performance optimizations, and best practices that guide MoE framework development across the industry. + +## Table of Contents + +- [What's New](#whats-new) +- [Overview of MCore MoE Supported Features and Architectures](#overview-of-mcore-moe-supported-features-and-architectures) +- [Quick Start Guide](#quick-start-guide) + - [Basic MoE Training](#basic-moe-training-in-megatron-lm) + - [Pre-defined Configs for Popular Models](#use-the-pre-defined-config-to-train-the-popular-moe-models) + - [General Performance Tips](#general-performance-tips) +- [Best Practices for High Performance MoE Training](#best-practices-to-achieve-high-performance-on-moe-training) + - [Step 1: Find Feasible Parallel Mapping](#step-1-find-the-feasible-parallel-mapping-under-the-memory-capacity-of-the-gpu) + - [Step 2: Select Optimal Parallelism Strategy](#step-2-select-optimal-parallelism-strategy) + - [Step 3: Enable Performance Features](#step-3-enable-performance-features-based-on-profiling-bottlenecks) +- [Feature Documentation](#feature-documentation) + - [Router and Load Balancing](#router-and-load-balancing) + - [Token Dispatching](#token-dispatching) + - [Upcycling](#upcycling) +- [Training Optimizations](#training-optimizations) + - [MoE Parallel Folding](#moe-parallel-folding) + - [Memory Optimization](#memory-optimization) + - [Communication Optimization](#communication-optimization) + - [Compute Optimization](#compute-optimization) + - [FP8 Training](#fp8-training) + - [CUDA Graph](#cuda-graph) +- [MoE Arguments Reference](#moe-arguments-reference) +- [Examples](#examples) +- [Contributing](#contributing) +- [Citation](#citation) + +## What's New +For latest features and architectures, please refer to the [MCore dev roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729). + +### 🔥 [MCore dev] (2026/01) +- 🚀 Pipeline-aware fine-grained activation offloading +- 🚀 Qwen3-Next model support +- 🚀 Muon and Layer-wise distributed optimizer + +### 🔥 [MCore v0.15] (2025/11) +- 🚀 Add HybridEP backend to Flex Dispatcher(GB200, B200, H100 supported) +- 🚀 Support FSDP with EP for MoE models + +### 🔥 [MCore v0.14] (2025/09) +- 🚀 Batch-level overlapping to hide EP-A2A communication (--overlap-moe-expert-parallel-comm --delay-wgrad-compute) +- 🚀 FP8 support for Fine-grained Recomputations +- Router fusion kernels for MoE models (--moe-router-fusion) +- Context Parallelism (CP) support for MTP and MLA + +### 🔥 [MCore v0.13] (2025/07) +- Support bf16 dtype for optimizer states to use precision-aware optimizer in TransformerEngine (--use-precision-aware-optimizer) +- Flexible Asymmetric Virtual Pipeline Parallelism with Custom Pipeline Layout (--pipeline-model-parallel-layout) +- Add Hybrid Shard Data-Parallel support for MoE models (--num-distributed-optimizer-instances) +- Fine-grained recomputation to reduce activation memory. (--recompute-modules with --recompute-granularity selective) +- Memory efficient token permutation by moving the probs multiplication from unpermutation to activation function of GroupedMLP. + +### 🔥 [MCore v0.12] (2025/05) +- Support DeepSeek's DeepEP for efficient token dispatching (--moe-token-dispatcher-type flex --moe-enable-deepep) +- Support Multi-Token Prediction (MTP) (--mtp-num-layers 1) +- CUDA Graph support for dropless MoE models with attention only capture (--te-rng-track --external-cuda-graph --cuda-graph-scope attn) + +## Overview of MCore MoE Supported Features and Architectures + +### Model Support +- ✅ **DeepSeek** + - ✅ DeepSeek-V2 + - ✅ DeepSeek-V3, including MTP +- ✅ **Qwen** + - ✅ Qwen2-57B-A14B + - ✅ Qwen3-30B-A3B + - ✅ Qwen3-235B-A22B +- ✅ **Mixtral** + - ✅ Mixtral-8x7B + - ✅ Mixtral-8x22B + +### Core MoE Functionality +- ✅ Token dropless MoE (dMoE) - Advanced routing without token dropping +- ✅ Top-K Router with flexible K selection +- ✅ Load balancing losses for expert utilization optimization + +### Advanced Parallelism +- ✅ Expert Parallel (EP) with 3D parallelism integration +- ✅ Full parallelism combo: EP + DP + TP + PP + SP support +- ✅ Context Parallel (CP) for long sequence MoE training +- ✅ Parallel Folding Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training +- ✅ Distributed Optimizer for MoE (ZeRO-1 equivalent) ### Performance Optimizations -- (Experimental) **DeepEP** is integrated for efficient token communication in large-scale MoE training. -- GroupedGEMM when num local experts > 1 - - Supported dtype: bf16 - - Performance improvements for larger MoE models -- Enable `--tp-comm-overlap` for MoE -- FP8 training support - -### Token Dispatch Mechanism -- Dropless / No token drop -- Token drop, with or without padding to capacity -- Token permutation / Unpermutation fusion +- ✅ Memory Efficient token permutation +- ✅ Fine-grained Recomputations (mla, moe, mlp, moe_act, norm) +- ✅ MLA TP Support for Mixture of Linear Attention +- ✅ GroupedGEMM and GA Fusion +- ✅ DP/PP/TP Communication Overlapping +- ✅ Overlapped Shared Expert execution +- ✅ Router Fusion optimizations +- ✅ Token (un)permutation Fusion kernels +- ✅ cuDNN fused Attention integration + +### Hardware & Precision Support +- ✅ DeepEP support for H100 and B200 +- ✅ GroupedGEMM including FP8/MXFP8 support +- ✅ FP8 weights with BF16 optimizer states +- ✅ FP8 training full support + +### Developer Experience +- ✅ MoE Model Zoo with pre-training best practices +- ✅ Distributed Checkpointing for MoE models +- ✅ Upcycling Support for model scaling +- ✅ MCore2HF Converter for ecosystem compatibility +- ✅ Layer-wise logging for detailed monitoring +- ✅ Runtime Upcycling capabilities + +## Quick Start Guide + +### Basic MoE Training in Megatron-LM + +To train a top-2 MoE model with 8 experts and auxiliary loss, add the following arguments to your megatron training script: -### Ease of use -- Checkpoint converter for Mixtral models, see the [example](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mixtral) for details. -- MoE Layer Frequency to customize the hybrid MoE/Dense layer architecture -- Distributed checkpoining -- Per-layer logging -- Upcycling Support +```bash +## Set MoE Hidden site +--num-experts 8 +--moe-shared-expert-intermediate-size: 2048 +## Set router config +--moe-router-load-balancing-type aux_loss +--moe-router-topk 2 +--moe-aux-loss-coeff 1e-2 +## Set token dispatcher +--moe-token-dispatcher-type alltoall +``` -# User Guide +Detailed documentation for each feature is available in the [Feature Documentation](#feature-documentation) section. -## Usage +### Use the pre-defined config to train the popular MoE models +We have provided some pre-defined config to train the popular MoE models in the [Megatron-MoE-Model-Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo/tree/main) repository. You can use them as a reference to configure your training script. Currently we have added the config for Mixtral 8x7B, Mixtral 8x22B, DeepSeek-V3, Qwen3-30B-A3B, Qwen3-235B-A22B. -### Quick Start -To train a top-2 MoE model with 8 experts and auxiliary loss, include the following arguments: +### General Performance Tips +#### Training arguments +The following flags are general performance flags that can help to achieve higher performance on almost all workloads. Check if you have enabled all of them in your training script. ```bash ---num-experts 8 ---expert-model-parallel-size 8 +## Enable DeepEP token dispatcher +--moe-token-dispatcher-type flex +--moe-flex-dispatcher-backend deepep +## Enable GroupedGEMM --moe-grouped-gemm +## Enable fusion kernels +--moe-router-fusion --moe-permute-fusion ---moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. ---moe-router-topk 2 ---moe-aux-loss-coeff 1e-2 +--cross-entropy-loss-fusion +--cross-entropy-fusion-impl te + +## Communication optimization --use-distributed-optimizer ---moe-token-dispatcher-type alltoall -``` +--overlap-param-gather +--overlap-grad-reduce +--tp-comm-overlap -To enable the token drop mechanism, such as GShard and SwitchTransformer, include the following arguments: +## Enable manual gc to prevent python jitter +--manual-gc: true +--manual-gc-interval: 10 +``` +#### Environment variables +Below are some environment variables that can be useful. ```bash ---moe-expert-capacity-factor 1.0 ---moe-pad-expert-input-to-capacity # Optional +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # Enable expandable segments to prevent memory fragmentation +export NCCL_NVLS_ENABLE=0 # Disable NVLS to prevent memory overhead ``` +#### Dependencies +- Use the latest version of [TransformerEngine](https://github.com/NVIDIA/TransformerEngine). +- Use the latest [NGC PyTorch Docker Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) -The following figure illustrates differenting dropping strategies in MCore: - - - -1. The default dropless strategy will not drop or pad any token. -2. By setting `--moe-expert-capacity-factor`, the tokens exceed the capacity of expert will be dropped based on their selected probabilities. - The dropping is performed before the token exchange operation between EP ranks when EP > 1. - The formula of capacity is `capacity = num_tokens_per_rank * topk * capacity_factor / num_experts`. -3. By setting `--moe-pad-expert-input-to-capacity`, the experts with tokens less than capacity will be padded to the capacity. - -### Fine-tuning Mixtral Models -Megatron-Core has full support for Mixtral MoE models, and we provide the checkpoint converter for Mixtral models from huggingface format to MCore format. - - -### Distributed Checkpointing -MCore v0.7 introduced fully parallel and asynchronous saving capabilities to distributed checkpointing, -which addresses the issues of low efficiency in the traditional checkpoint saving methods. -It also solved the problem of incompatibility between checkpoints of different parallel mappings in the traditional format. -With the new distributed checkpointing solution, MCore can achieve flexible parallelism configurations by saving and loading the unified format checkpoints. -Compared to native PyTorch solution, MCore achieves up to 50x reduction in checkpointing overhead. - -From MCore v0.8, MoE supports Distributed Checkpointing, which means users can save and load with any combination of parallelism and it is currently available, including expert parallel. -1. Loading weight and distributed optimizer states with TPxCPxEPxPP resharding with SequentialMLP is supported in version 0.8. -2. GroupedMLP weight resharding is supported in version 0.8.0 and optimizer state resharding is supported in version 0.10.0. Switching between GroupedMLP/SequentialMLP when loading and saving is partially supported. -3. TEGroupedMLP has fully support on distributed checkpointing and is fully exchangable with SequentialMLP in version 0.9.0. -4. Optimizer state resharding cannot do across EP=1 with EP>1 due to the different optimizer type. - -Usage -- `--ckpt-format torch_dist` The main argument, it will attempt to save and load using distributed checkpointing. -- `--auto-detect-ckpt-format` With this, it can load both distributed checkpointing and legacy checkpointing. - -Checkpoint compatibility across SequentialMLP, GroupedMLP, and TEGroupedMLP: -```text - ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ - │ GroupedMLP │ │ SequentialMLP │ │ TEGroupedMLP │ - │ │ │ │ │ │ - │ │ │ │ │ │ - │ ┌───────────┐ │ │ ┌───────────┐ │ │ ┌───────────┐ │ - │ │legacy ckpt│ │ │ │legacy ckpt│ │ │ │legacy ckpt│ │ - │ └─────┬─────┘ │ │ └─────┬─────┘ │ │ └─────┬─────┘ │ - │ ▼ │ │ ▼ │ │ ▼ │ - │ ┌─────────┐ │ │ ┌─────────┐ │ │ ┌─────────┐ │ - │ │dist ckpt│ │ │ │dist ckpt│ │ │ │dist ckpt│ │ -┌──►│ │ weight │ │◄────────►│ │ weight │ │◄────────►│ │ weight │ │◄──┐ -│ │ └─────────┘ │ │ └─────────┘ │ │ └─────────┘ │ │ -└───┼───────────────┼──────────┼───────────────┼──────────┼───────────────┼───┘ - │┌─────────────┐│ │┌─────────────┐│ │┌─────────────┐│ - ││ dist ckpt ││ ││ dist ckpt ││ ││ dist ckpt ││ - ││optim states ││ ││optim states ││◄────────►││optim states ││ - │└─────────────┘│ │└─────────────┘│ │└─────────────┘│ - └───────────────┘ └───────────────┘ └───────────────┘ -``` +## Best Practices to achieve high performance on MoE training + +Distributed training involves complex trade-offs between **communication**, **memory**, and **computation**, making it challenging to find an optimal parallelism configuration. This section provides a systematic workflow to help you identify the best parallel mapping for your model and hardware. + +### Step 1: Find the feasible parallel mapping under the memory capacity of the GPU +To find the best parallel mapping, we need to first know the feasible parallel mapping for the model under the memory capacity of the GPU. +The consumption of memory consists of three parts: +- Activation memory +- Weight and gradient memory +- Optimizer states memory +Different parallel strategies will shard these tensor memory in different ways. + +| Parallel Strategy | Peak Activation Memory | Weight Memory | Optimizer states | Communication (Per-Layer) | +|:-----------------:|:-------------------------------:|:--------------:|:---------------------------------:|:-------------------------:| +| TP | 1/N (with SP on) | 1/N | 1/N | High | +| EP | ~1 (varies with EP balancing) | 1/N in MoELayer| 1/N | Medium | +| PP | 1 (>1 with virtual pipeline) | 1/N | 1/N | Medium | +| CP | 1/N | 1 | 1/N (with distributed optimizer) | Medium | +| DP | 1 | 1 | 1/N (with distributed optimizer) | Low | + +We provide the argument of `--fake-init-process-group` to emulate distributed training on one GPU. This is useful to find the feasible parallel mapping under the memory capacity of the GPU. See https://github.com/NVIDIA/Megatron-LM/pull/2254 for detailed usage. + +### Step 2: Select Optimal Parallelism Strategy + +The optimal parallelism configuration varies based on **model architecture**, **sequence length**, and **hardware platform**. Below are general guidelines to help you achieve high throughput. + +#### Guideline 1: Minimize Model Parallelism, Maximize Data Parallelism + +| Aspect | Recommendation | +|--------|----------------| +| **Goal** | Keep TP/EP/PP as small as possible while avoiding OOM | +| **Why** | Model parallelism introduces communication overhead that hurts performance | +| **How** | Use distributed optimizer (`--use-distributed-optimizer`) to shard optimizer states across DP ranks, freeing memory for larger DP size | + +#### Guideline 2: Keep EP and TP Communication Within NVLink Domain + +| Aspect | Recommendation | +|--------|----------------| +| **Goal** | Ensure EP×TP fits within a single node (typically 8 GPUs) | +| **Why** | EP and TP are communication-intensive; NVLink provides much higher bandwidth than cross-node interconnects | +| **Scaling** | When scaling beyond one node, prefer PP over expanding TP/EP across nodes | + +**Note:** +For very large MoE models like DeepSeek-V3, the EP communication may exceed the NVLink bandwidth. In this case, consider using 1F1B A2A Overlap to overlap the EP communication. + +#### Guideline 3: Use Pipeline Parallelism (PP) for Multi-Node Scaling + +| Aspect | Recommendation | +|--------|----------------| +| **Goal** | Use PP to distribute layers across nodes while keeping EP×TP within NVLink | +| **VPP** | Enable Virtual Pipeline Parallelism to reduce pipeline bubbles when `PP ≥ 2` | +| **Config** | Set `--num-layers-per-virtual-pipeline-stage` to control VPP size | + +**VPP Size Tuning:** +- Valid values: all divisors of `num_layers / PP_size` +- Example: `num_layers=24, PP=4` → valid VPP sizes: `{1, 2, 3, 6}` +- Trade-off: Larger VPP = fewer bubbles but more P2P communications +- Recommendation: A middle value often gives the best balance + +#### Guideline 4: Prefer EP over TP for Expert Layers + +| EP Advantages | Details | +|---------------|---------| +| **Better GEMM efficiency** | Larger local matrix sizes improve GPU utilization | +| **Lower communication** | EP has less communication overhead than TP for MoE layers | +| **Simpler computation graph** | Easier to overlap communication with computation | +| **Token permutation** | When `EP = num_experts`, local token permutation is eliminated | + +**Example:** For Mixtral 8x7B, `EP8×TP1` outperforms `EP4×TP2`. + +#### Guideline 5: Enable Context Parallelism (CP) for Long Sequences + +| Aspect | Recommendation | +|--------|----------------| +| **When to use** | Sequence length ≥ 8K tokens | +| **Key factor** | CP efficiency depends on overlapping communication with computation | +| **Config** | Set `--context-parallel-size` to partition sequences across GPUs | + +### Step 3: Enable Performance Features Based on Profiling Bottlenecks + +After establishing a working parallel configuration, profile your training to identify bottlenecks and apply targeted optimizations. + +#### Memory Bottleneck + +**Symptom**: Forced to use full recomputation or excessively large parallelism degrees to avoid OOM. + +**Solutions**: +| Optimization | Overhead | Config | Reference | +|--------------|----------|--------|---------| +| Selective Recomputation | Low | `--recompute-granularity selective --recompute-modules ...` | [Fine-grained Recomputation](#fine-grained-recomputation) | +| Activation Offloading | Medium | `--fine-grained-activation-offloading --offload-modules ...` | [Fine-grained Activation Offloading](#fine-grained-activation-offloading) | +| Optimizer Offloading | Medium | `--optimizer-cpu-offload` | --- | + +#### Communication Bottleneck + +**Symptom**: Profiling shows significant time spent in collective operations. + +**Solutions**: Identify which communication is the bottleneck and enable corresponding overlap: +| Communication Type | Overlap Config | +|--------------------|----------------| +| DP gradient reduce | `--overlap-grad-reduce` | +| DP param gather | `--overlap-param-gather` | +| TP communication | `--tp-comm-overlap` | +| EP All-to-All | `--overlap-moe-expert-parallel-comm --delay-wgrad-compute` | +| PP send/recv | Enable VPP with `--num-layers-per-virtual-pipeline-stage` | + +#### CPU Overhead Bottleneck + +**Symptom**: Nsight Systems timeline shows gaps between GPU kernels where CPU cannot launch kernels fast enough. + +**Solutions**: +| Optimization | Config | +|--------------|--------| +| Disable Python GC | `--manual-gc --manual-gc-interval 100` | +| Enable CUDA Graphs | `--cuda-graph-impl transformer_engine --cuda-graph-scope attn moe_router moe_preprocess` | +| Reduce kernel launches | Decrease TP size or increase micro-batch size | + +#### Computation Bottleneck + +**Symptom**: GPU utilization is low despite no communication or CPU bottlenecks. + +**Solutions**: +| Optimization | Config | +|--------------|--------| +| Enable kernel fusions | `--moe-router-fusion --moe-grouped-gemm --moe-permute-fusion` | +| Use FP8 precision | `--fp8-format e4m3 --fp8-recipe blockwise` | + + +## Feature Documentation + +### Router and Load Balancing + +Routers determine which expert(s) handle each token. A lightweight MLP scores every token and applies `softmax` or `sigmoid` to compute routing probabilities. The router then selects the top-K experts for each token. + +> **Note**: The router logits is better to remain in **FP32** or **FP64** rather than BF16 by --moe-router-dtype fp32. At high expert counts, FP32 precision yields better accuracy because output hidden states of experts are multiplied by router scores and accumulated to get the final output. + +#### Router Types + +| Router Types | Description | Config | +|-------------|-------------|----------| +| **Top-K Router** | Standard routing with configurable K, uses softmax for probability computation | --moe-router-topk 8 | +| **Group Top-K Router** | Selects top-K expert groups, then routes experts in selected groups | --moe-router-num-groups 8 --moe-router-group-topk 4 | +| **Router score function** | Score function to calculate the probs from output logits of router | --moe-router-score-function softmax/sigmoid | + +#### Load Balancing Strategies + +| Strategy | Description | Config | +|----------|-------------|--------| +| **aux_loss** | Auxiliary loss for balancing expert usage on a micro-batch | `--moe-router-load-balancing-type aux_loss` | +| **seq_aux_loss** | Sequence-level auxiliary loss for balancing expert usage on each sequence| `--moe-router-load-balancing-type seq_aux_loss` | +| **global_aux_loss** | Global auxiliary loss for balancing expert usage on a global batch across all ranks | `--moe-router-load-balancing-type global_aux_loss` | +| **sinkhorn** | Optimal transport formulation for balancing expert usage | `--moe-router-load-balancing-type sinkhorn` | +| **aux loss free** | Dynamic bias-based load balancing strategy without auxiliary loss | `--moe-router-enable-expert-bias --moe-router-bias-update-rate 1e-3`| +| **none** | No load balancing | `--moe-router-load-balancing-type none` | + +### Token Dispatching + +After routing, tokens are **dispatched** to the GPU hosting the assigned expert. After expert computation, tokens are sent back and **combined** to restore the original sequence. + +| Dispatcher | Description | Best For | Config | +|------------|-------------|----------|--------| +| **alltoall** | NCCL-based All-to-All communication for token exchange | Standard EP > 1 setups | `--moe-token-dispatcher-type alltoall` | +| **FlexDispatcher with [DeepEP](https://github.com/deepseek-ai/DeepEP) backend** | Removes redundant tokens during cross-node communication, fuses intra/inter-node communication into single kernel | Cross-node EP, fine-grained MoE (DeepSeek-V3) | `--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend deepep` | +| **FlexDispatcher with [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) backend** | NVIDIA's optimized dispatcher using TMA and IBGDA, fewer SMs, native MNNVL support | GB200 NVL72, Multi-Node NVLink | `--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep` | +| **allgather** | Gathers all tokens to each GPU, no inter-GPU token movement | TP-only setups, small EP, large Top-K | `--moe-token-dispatcher-type allgather` | + +### Upcycling +Use `--moe-use-upcycling` to enable upcycling, which loads the dense model from the `--load` directory, converts it to an MoE model at runtime, and starts training. The converted model is saved to the `--save` path before training begins. Upcycling is built on distributed checkpointing, supporting parallel modes different from existing dense checkpoints, such as arbitrary expert parallelism during upcycling. + +In addition to the default upcycling strategy, we also support granular upcycling strategy which is a more state-of-the-art upcycling strategy from [our recent research work](https://arxiv.org/abs/2410.07524). For the default upcycling strategy, we duplicate the existing MLP to multiple experts, with each expert starting from a copy of the MLP. For the granular upcycling strategy, we use `--moe-upcycling-granularity` to specify how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set `--moe-upcycling-granularity` as a positive integer. If this param is set to 1, it means using the default upcycling strategy. + +Note: The MoE model structure is defined through script arguments. All MoE-related arguments (such as `--num-experts`) can be customized; however, other model structure arguments must be consistent with those of the dense model. For granular upcycling strategy, the moe's FFN hidden size should be set as dense FFN hidden size divided by `--moe-upcycling-granularity`. + +## Training Optimizations +MoE training faces three fundamental performance bottlenecks: **Memory Wall**, **Communication Wall**, and **Compute Efficiency Wall**. The following optimizations address each of these challenges. + +### MoE Parallel Folding +**The Problem with Traditional Approaches:** +- Prior MoE frameworks constrain **EP ≤ DP** (Expert Parallelism must be a sub-group of Data Parallelism), which severely limits scalability. +- Applying the same TP/CP to both attention and MoE is suboptimal: + - High TP benefits attention but hurts MoE (small per-expert dims make TP overhead prohibitive) + - High CP benefits long-context attention but is unnecessary for MoE (tokens processed independently) + +**MoE Parallel Folding** is Megatron Core's solution that **decouples attention and MoE parallelism**: + +| Parallelism Group | Attention Layers | MoE Layers | +|-------------------|------------------|------------| +| **Dimensions** | TP × CP × DP × PP | ETP × EP × EDP × PP | + +#### Key Benefits + +1. **Breaks the EP ≤ DP Constraint** + - Traditional: TP=4, CP=2, DP=8, PP=4 → max EP=8 + - With Folding: Same attention config, but MoE uses ETP=1, EP=64, EDP=1 → 8× more expert parallelism -Best practices for distributed checkpointing: -1. Convert a legacy checkpoint to a distributed checkpoint. To achieve this, we can add both `--ckpt-format torch_dist --auto-detect-ckpt-format`, then it will load the legacy one and save as the distributed checkpoint format later when the training progress tries to save checkpoints. -2. Convert checkpoint of the legacy GroupedMLP to TEGroupedMLP. This is only supported for the weight parts. To achieve this, we can use the above method to convert the legacy checkpoint to a distributed checkpoint of the legacy GroupedMLP. After updating the libraries and using TEGroupedMLP, we can directly load the previously saved checkpoint by adding argument `--no-load-optim`. +2. **Reduces Minimum GPU Requirements** + - Traditional CP=8, EP=8 requires at least 64 GPUs + - With Folding: CP and EP are folded together, only 8 GPUs needed -### Shared Experts -MCore v0.9 introduced the shared expert feature. We can enable this feature by setting suitable `--moe-shared-expert-intermediate-size`. +3. **Enables Independent Optimization** + - Use high TP for attention (memory efficiency) + - Use ETP=1 for MoE (better GEMM efficiency, less communication) -The parallelism patterns of the shared experts follow the settings of the dense part, i.e., the attention module. The shared experts are not distributed but replicated in EP ranks. +4. **Keeps High-Bandwidth Communication in NVLink Domain** + - Both CP and EP communication can remain within NVLink domain -We also have an experimental feature that tries to overlap the communications and computations in the shared experts and the dispatcher. -We can set `--moe-shared-expert-overlap` and use `alltoall` dispatcher to enable it. -The overlapping relies on the envirionment setting `CUDA_DEVICE_MAX_CONNECTIONS=1`. -The `AllGather` and `ReduceScatter` communications in the shared experts are overlapped with `permute`/`unpermute` in the dispatcher. -The `MLP` computation part in the shared experts are overlapped with the `AlltoAll` communications in the dispatcher. -Both the forward and the backward pass can overlap. But to get the overlapping in the backward pass, the PyTorch version should `>= 2.2.0`. +> **Reference**: [MoE Parallel Folding: Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training](https://arxiv.org/abs/2504.14960) -### Checkpointing +### Memory Optimization + +Memory optimization is critical for large-scale MoE training, as MoE models maintain all expert parameters even though only a subset is activated per token. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **Fine-grained Recomputation** | Selectively recomputes specific modules (e.g., `mla_up_proj`, `layernorm`, `moe_act`) instead of full layers | `--recompute-granularity selective --recompute-modules mla_up_proj layernorm moe_act` | +| **Fine-grained Activation Offloading** | Offloads activations to CPU memory, overlapping D2H/H2D transfers with computation | See `docs/source/api-guide/fine_grained_activation_offloading.md` | +| **Precision-aware Optimizer** | Stores optimizer states (exp_avg, exp_avg_sq) in BF16 instead of FP32, reducing optimizer memory by 50% | `--use-precision-aware-optimizer --exp-avg-dtype bf16 --exp-avg-sq-dtype bf16` | +| **Optimizer Offloading** | Offloads optimizer states to CPU memory. | `--optimizer-cpu-offload` | + +#### Fine-grained Recomputation A new output-discarding checkpointing method is also supported. This method discards the output memory of certain submodules during the forward pass and recomputes them during the backward pass, which can save memory compared to standard checkpointing. This can be enabled for specific submodules using the `--recompute-granularity selective --recompute-modules [submodule1, submodule2, ...]` argument. The supported submodules are: * `moe_act`: Recompute the GroupedMLP activation function. @@ -163,137 +400,214 @@ A new output-discarding checkpointing method is also supported. This method disc * `mlp`: Recompute the dense MLP submodule (uses standard checkpointing rather than output-discarding) which is useful for hybrid-models like DeepSeek-V3. * `moe`: Recompute the MoE layer submodule (uses standard checkpointing rather than output-discarding). -### Upcycling -Use `--moe-use-upcycling` to enable upcycling, which loads the dense model from the `--load` directory, converts it to an MoE model at runtime, and starts training. The converted model is saved to the `--save` path before training begins. Upcycling is built on distributed checkpointing, supporting parallel modes different from existing dense checkpoints, such as arbitrary expert parallelism during upcycling. +#### Fine-grained Activation Offloading -In addition to the default upcycling strategy, we also support granular upcycling strategy which is a more state-of-the-art upcycling strategy from [our recent research work](https://arxiv.org/abs/2410.07524). For the default upcycling strategy, we duplicate the existing MLP to multiple experts, with each expert starting from a copy of the MLP. For the granular upcycling strategy, we use `--moe-upcycling-granularity` to specify how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set `--moe-upcycling-granularity` as a positive integer. If this param is set to 1, it means using the default upcycling strategy. +Unlike recomputation (which trades compute for memory), offloading trades **GPU-CPU bandwidth for memory**: activations are transferred to CPU during forward pass and retrieved during backward pass. The key is hiding transfer latency behind computation using asynchronous D2H/H2D transfers. -Note: The MoE model structure is defined through script arguments. All MoE-related arguments (such as `--num-experts`) can be customized; however, other model structure arguments must be consistent with those of the dense model. For granular upcycling strategy, the moe's FFN hidden size should be set as dense FFN hidden size divided by `--moe-upcycling-granularity`. +**Key Features:** +- **Module-level granularity**: Target specific modules rather than entire layers +- **Computation-offloading overlap**: Asynchronous transfers via independent CUDA streams +- **Compatible with PP/VPP**: Works with pipeline parallelism and fine-grained recomputation -### Leverage DeepSeek's DeepEP for High-Performance Cross-Node Token Dispatching -- [DeepSeek-DeepEP](https://github.com/deepseek-ai/deepep) provides a highly optimized implementation for MoE token dispatching and combining operations, specifically designed for large-scale MoE training scenarios. -- DeepEP is particularly recommended for training large-scale, fine-grained MoE architectures such as DeepSeek-V3 and other advanced MoE models. -- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-flex-dispatcher-backend=deepep` in your command line arguments. +**Usage** +```bash +--fine-grained-activation-offloading +--offload-modules expert_fc1 moe_act # Choices: attn_norm, core_attn, attn_proj, mlp_norm, expert_fc1, moe_act +``` -### Integrate HybridEP for High-Performance Intra-Node Token Dispatching -- [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is developed by NVIDIA as an optimized solution for large-scale MoE (Mixture of Experts) all-to-all communication. It is designed to leverage NVIDIA GPU hardware capabilities, significantly reducing Streaming Multiprocessor (SM) resource usage. -- HybridEP currently supports intra-node and multi-node NVLink scenarios. -- To enable HybridEP, set `--moe-token-dispatcher-type=flex` and - `--moe-flex-dispatcher-backend=hybridep` in your command line arguments. +For more details, see `docs/source/api-guide/fine_grained_activation_offloading.md` -### CUDA Graph Support -CUDA Graph functionality can be enabled through the `--cuda-graph-impl` option. There are two implementations: +### Communication Optimization -1. `--cuda-graph-impl=local`: Captures cuda graphs using the MCore-internal cuda graph manager. -2. `--cuda-graph-impl=transformer_engine`: Captures cuda graphs using the TE `make_graphed_callables()` interface. +Distributed training introduces communication overhead from various parallelism strategies. Megatron Core supports overlapping communication with computation to hide latency and improve throughput. -To use `--cuda-graph-impl=transformer_engine`, the user should call related methods `TECudaGraphHelper.create_cudagraphs()` and `TECudaGraphHelper.cuda_graph_set_manual_hooks()` in the training script. Please refer to the usage in `megatron/training/training.py`. +#### Data Parallel (DP) Communication Overlap -For MoE models, certain configurations may prevent CUDA Graph capture of MoE layers. Specifically, when `--moe-expert-capacity-factor` and `--moe-pad-expert-input-to-capacity` are not set, the resulting dynamic shapes make MoE layers uncapturable. In such cases, you can still leverage CUDA Graphs for the attention layers (operations in `TransformerLayer._forward_attention()`) by setting `--cuda-graph-scope=attn`, while leaving the MoE layers (operations in `TransformerLayer._forward_mlp()`) unmodified. See the argument description for more usage of `--cuda-graph-scope`. +With distributed optimizer, DP introduces **reduce-scatter** (gradients) and **all-gather** (parameters) communications, chunked by Transformer layer granularity. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **Gradient Reduce Overlap** | Overlaps gradient reduce-scatter with backward computation | `--overlap-grad-reduce` | +| **Param Gather Overlap** | Overlaps parameter all-gather with forward computation | `--overlap-param-gather` | +| **BF16 Gradient Reduce** | Reduces gradients in BF16 instead of FP32 for better performance | `--grad-reduce-in-fp32 false` (via mixed precision config) | +| **FP8 Param Gather** | Conducts parameter all-gather in FP8, reducing overhead by 50% | `--fp8-param-gather` | + +#### Tensor Parallel (TP) Communication Overlap + +TP with sequence parallelism introduces activation all-gather and reduce-scatter operations. Communications are overlapped in **bulk** (no dependency) or **pipelined** (with dependency) fashion. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **TP Comm Overlap** | Enables bulk and pipelined TP communication overlap | `--tp-comm-overlap` | + +> **Requirements**: `tensor_model_parallel_size >= 2` and `--sequence-parallel` + +#### Pipeline Parallel (PP) Communication Overlap + +PP introduces P2P activation sends/receives between pipeline stages. Overlap is automatic in the 1F1B pipelining phase when VPP is enabled. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **P2P Comm Overlap** | Overlaps PP P2P communications with non-dependent computations | `--overlap-p2p-comm` (auto-enabled with VPP) | +| **VPP for Better Overlap** | Increases overlap opportunities by reducing layers per virtual stage | `--num-layers-per-virtual-pipeline-stage` | + +#### Expert Parallel (EP) Communication Overlap + +EP All-to-All can consume 30-40% of training time without optimization. These features hide or reduce EP communication overhead. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **EP A2A Overlap** | Overlaps All-to-All with computation by merging FWD-BWD passes of adjacent microbatches | `--overlap-moe-expert-parallel-comm --delay-wgrad-compute` | +| **Shared Expert Overlap** | Runs shared expert computation concurrently with EP token transfer | `--moe-shared-expert-overlap` | +> **Requirements for EP A2A Overlap**: `expert_model_parallel_size > 1`, CUDA_DEVICE_MAX_CONNECTIONS > 1. -### Batch-Level EP-A2A hidding -Enable A2A overlap across different batches inspired by the DSv3 DualPipe implmentation. \ -**Features** -- Hide ep a2a communication by batch-level overlapping -- Split weight gradient and activation gradient computations for better overlap with communications -- Support interleaved pipelined parallelism -- Support FP8 training -- Support MTP (`-mtp-num-layers 1` only, multiple MTP layers are not supported yet.) +### Compute Optimization +Fine-grained MoE produces many small operations that can underutilize GPU resources. These optimizations reduce kernel launch overhead and improve GPU utilization. + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **Grouped GEMM** | Batches multiple expert GEMM operations into a single kernel call, improving GPU utilization | `--moe-grouped-gemm` | +| **Router Fusion** | Fuses router projection, top-k selection, softmax, and auxiliary loss into fewer kernels | `--moe-router-fusion` | +| **Permute Fusion** | Fuses token permutation/unpermutation operations into optimized single kernels | `--moe-permute-fusion` | +| **FP8 Training** | Uses FP8 Tensor Core operations for faster GEMMs on Hopper/Blackwell GPUs | `--fp8 --fp8-recipe blockwise` | + + +### FP8 Training + +FP8 training provides benefits across all three performance walls: + +| Wall | FP8 Benefit | Impact | +|------|-------------|--------| +| **Compute** | Faster Tensor Core GEMMs | FP8 ops on Hopper/Blackwell are faster than BF16 | +| **Memory** | 50% activation reduction | Stores linear layer inputs in FP8 instead of BF16 | +| **Communication** | 50% parameter all-gather | With FP8 primary weights (except MXFP8) | + +#### FP8 Recipes + +| Recipe | Scaling Granularity | Format | Platform | Use Case | +|--------|---------------------|--------|----------|----------| +| **Per-tensor** | Whole tensor | E4M3/E5M2 hybrid | Hopper, Blackwell | Conservative, initial experimentation | +| **Blockwise** | 1×128 (activations), 128×128 (weights) | E4M3 | Hopper | **Production-proven** (DeepSeek-V3, Minimax-M2) | +| **MXFP8** | 1×32 | E4M3 + E8M0 scaling | Blackwell | Native hardware support on GB200 | + +> **Recommendation**: Use **blockwise FP8** on Hopper for production training. It has been validated at scale on DeepSeek-V3 class models. + +#### MoE-Specific FP8 Optimizations + +| Optimization | Description | Config | +|--------------|-------------|--------| +| **Routing Map Padding** | Pads routing map (not tokens) to align M dimension to 16/32, avoiding per-tensor padding overhead | `--moe-router-padding-for-fp8` | +| **FP8 Primary Weights** | Casts FP32 master weights directly to FP8, eliminating BF16 intermediate copy | `--fp8-param-gather` (Need additional `--reuse-grad-buf-for-mxfp8-param-ag` for MXFP8) | + + +#### Example Configuration -**Usage** ```bash -# Add the following flags to your training scripts ---overlap-moe-expert-parallel-comm -# [optional] only works with specific TE version ---delay-wgrad-compute +# Blockwise FP8 on Hopper (recommended for production) +--fp8-format e4m3 +--fp8-recipe blockwise +--fp8-param-gather +--moe-router-padding-for-fp8 + +# MXFP8 on Blackwell +--fp8-format e4m3 +--fp8-recipe mxfp8 +--moe-router-padding-for-fp8 +--fp8-param-gather +--reuse-grad-buf-for-mxfp8-param-ag ``` -### Fine-grained Activation Offloading (collaborated with rednote) -Offload the input activation at the granularity of modules +> **Note**: For blockwise and MXFP8 recipes with current scaling, training loss curves show negligible difference compared to BF16 baselines. -**Usage** -```bash -# Enable fine-grained activation offloading ---fine-grained-activation-offloading -# Specify which modules are going to offload its input -# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". ---offload-modules expert_fc1 -``` -For more details, please refer to the ```docs/source/api-guide/fine_grained_activation_offloading.md``` - -### MoE Related Arguments -| Item | Description | -| --- | --- | -| --num-experts | Number of Experts in MoE (None means no MoE) | -| --expert-model-parallel-size | Degree of expert model parallelism. Default is 1. | -| --moe-ffn-hidden-size | MoE Feed-Forward Network hidden size. Default is None. | - -
    - View all MoE related arguments. - -| Item | Description | -| --- | --- | -| --num-experts | Number of Experts in MoE (None means no MoE) | -| --expert-model-parallel-size | Degree of expert model parallelism. Default is 1. | -| --moe-ffn-hidden-size | MoE Feed-Forward Network hidden size. Default is None. | -| --expert-tensor-parallel-size | Degree of tensor model parallelism of expert layer. Default is same to --tensor-model-parallel-size. | -| --moe-layer-freq | Frequency between MoE layers and Dense layers. Accepts either: 1) An integer N for 1:N ratio (one expert layer for every N-1 dense layers), 2) A string "N" for the same ratio, or 3) A string with Python list expression for custom patterns like `([1]*3+[0]*1)*3` which gives [1,1,1,0,1,1,1,0,1,1,1,0] where 1=expert layer and 0=dense layer. Examples: `([0]+[1]*23)` for 1 dense layer followed by 23 experts layers, `([1]*3+[0]*2)*2` for three expert layers followed by two dense layers, repeated twice. Default is 1. | -| --moe-grouped-gemm | When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine. | -| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2 and DeepSeekV3, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". | -| --moe-router-dtype | Data type for routing computation and expert output weighted averaging. Options are 'fp32' and 'fp64'. This can improve numerical stability, particularly when using a large number of experts. The throughput/memory impact should be negligible when used with --moe-permute-fusion. Default is None (no dtype promotion). | -| --moe-router-topk | Number of experts to route to for each token. The default is 2. | -| --moe-router-score-function | Score function for MoE routing. Can be "softmax" or "sigmoid". Default is "softmax". | -| --moe-router-pre-softmax | Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k. | -| --moe-router-num-groups | Number of groups to divide experts into for group-limited routing. When using group-limited routing: 1) Experts are divided into equal-sized groups, 2) For each token, a subset of groups are selected based on routing scores (sum of top-2 expert scores within each group), 3) From these selected groups, moe_router_topk experts are chosen. Two common use cases: 1) Device-limited routing: Set equal to expert parallel size (EP) to limit each token to experts on a subset of devices (See DeepSeek-V2: https://arxiv.org/pdf/2405.04434) 2) Node-limited routing: Set equal to number of nodes in EP group to limit each token to experts on a subset of nodes (See DeepSeek-V3: https://arxiv.org/pdf/2412.19437)) | -| --moe-router-group-topk | Number of selected groups for group-limited routing. | -| --moe-router-topk-scaling-factor | Scaling factor for routing score in top-k selection, only works when --moe-router-pre-softmax enabled. Defaults to None, which means no scaling. | -| --moe-router-enable-expert-bias | TopK routing with dynamic per-expert bias in the aux-loss-free load balancing strategy. The routing decision is based on the sum of the routing scores and the expert bias. See https://arxiv.org/abs/2408.15664 for details. | -| --moe-router-fusion | Enable fusion for MoE TopK routing and aux-loss computation. This is only supported in TransformerEngine 2.7.0 and above. | -| --moe-router-bias-update-rate | The expert bias is updated based on the number of assigned tokens to each expert in a global batch, where the bias is increased for experts with less assigned tokens and decreased for experts with more assigned tokens. Default is 1e-3 same as that used in DeepSeekV3. | -| --moe-router-force-load-balancing | (Experimental) Force override routing to balance token distribution using random logits for MoE routers, supporting naive top-k and group-limited top-k. This experimental feature is for benchmarking purposes only! | -| --moe-router-padding-for-quantization | Pad the routing_map to make sure the number of tokens each expert received is a multiple of 16/32 for FP8/FP4 precision. It is suggested to enable this for dropless training with FP8 precision when num_local_experts > 1. This is a more efficient way to pad for FP8 which eliminates the explicit padding in the GroupedMLP layer. | -| --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. | -| --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. | -| --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. | -| --moe-token-dispatcher-type | Determines the token dispatcher type. Choices are "allgather", "alltoall". Default is "allgather". We recommend using 'alltoall' if expert parallelism is applied. We have upgraded the "alltoall" dispatcher in place during MCore v0.9, while the original implementation renamed as "alltoall_seq" is retained until MCore v0.13.| -| --moe-flex-dispatcher-backend | (Experimental) Select the backend for the flex token dispatcher. Supported options: "deepep", "hybridep". Enables efficient token dispatching and combining for MoE models. | -| --moe-per-layer-logging | Enable per-layer logging for MoE, currently supports auxiliary loss and z loss. | -| --moe-expert-capacity-factor | The capacity factor for each expert, None means no token will be dropped. Default is None. | -| --moe-pad-expert-input-to-capacity | Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set. | -| --moe-token-drop-policy | The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. | -| --moe-layer-recompute | Enable activation checkpointing for moe_layer, should be used when memory is not sufficient. | -| --moe-permute-fusion | Fuse token rearrangement ops during token dispatching. | -| --moe-shared-expert-intermediate-size | Set shared expert total ffn hidden size. It should be equal to `num_shared_experts * ffn_size_of_each_shared_expert` if there are multiple shared experts. None means no shared expert. | -| --moe-shared-expert-overlap | (Experimental, may change) If this is set, the communications/computations in the shared experts and the dispatcher will overlap (The `alltoall` dispatcher is needed.) Otherwise, the shared expert runs after the routed experts. | -| --moe-use-upcycling | Load the dense model checkpoint, convert it into an MoE model at runtime and start training. The converted model will be saved to the path specified by `--save` before training begins. Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.| -| --overlap-moe-expert-parallel-comm | Enable batch-level overlapping in 1f1b stage. | -| --delay-wgrad-compute | Enable split dgrad and wgrad for `overlap-moe-expert-parallel-comm` execution. Increasing room to hide communication latency by more finegrained control. | -| --pipeline-model-parallel-layout | (Experimental, may change) A string containing a Python list expression that defines a custom pipeline model parallel layout. | -| --moe-upcycling-granularity | This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.| +### CUDA Graph +CUDA Graph functionality can be enabled through the `--cuda-graph-impl` option. There are two implementations: -
    +1. `--cuda-graph-impl=local`: Captures cuda graphs using the MCore-internal cuda graph manager. +2. `--cuda-graph-impl=transformer_engine`: Captures cuda graphs using the TE `make_graphed_callables()` interface. -## MoE training example: -
    -Click here. +To use `--cuda-graph-impl=transformer_engine`, the user should call related methods `TECudaGraphHelper.create_cudagraphs()` and `TECudaGraphHelper.cuda_graph_set_manual_hooks()` in the training script. Please refer to the usage in `megatron/training/training.py`. + +For MoE models, certain configurations may prevent CUDA Graph capture of MoE layers. Specifically, when `--moe-expert-capacity-factor` and `--moe-pad-expert-input-to-capacity` are not set, the resulting dynamic shapes make MoE layers uncapturable. In such cases, you can still leverage CUDA Graphs for the attention layers (operations in `TransformerLayer._forward_attention()`) by setting `--cuda-graph-scope=attn`, while leaving the MoE layers (operations in `TransformerLayer._forward_mlp()`) unmodified. See the argument description for more usage of `--cuda-graph-scope`. +## MoE Arguments Reference +### Core Arguments +| Argument | Description | Default | +|----------|-------------|---------| +| --num-experts | Number of Experts in MoE | None | +| --expert-model-parallel-size | Degree of expert model parallelism | 1 | +| --moe-ffn-hidden-size | MoE FFN hidden size | FFN hidden size of the dense model | +| --expert-tensor-parallel-size | Expert layer tensor parallelism | Same as TP(Recommeded to set to 1 for fine-grained MoE models) | +| --moe-layer-freq | MoE layer frequency pattern | 1 | + +### Router Arguments +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-router-load-balancing-type | Load balancing: aux_loss, sinkhorn, seq_aux_loss, none | aux_loss | +| --moe-router-topk | Number of experts per token | 2 | +| --moe-router-score-function | Score function: softmax, sigmoid | softmax | +| --moe-router-pre-softmax | Softmax before top-k | False | +| --moe-router-num-groups | Groups for group-limited routing | None | +| --moe-router-group-topk | Selected groups in group-limited routing | None | +| --moe-router-enable-expert-bias | Dynamic per-expert bias | False | +| --moe-router-bias-update-rate | Bias update rate | 1e-3 | +| --moe-router-fusion | Enable router fusion | False | +| --moe-router-dtype | Router precision: fp32, fp64 | None | +| --moe-router-padding-for-fp8 | Pad for FP8 alignment | False | + +### Loss and Regularization +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-aux-loss-coeff | Auxiliary loss coefficient | 0.0 | +| --moe-z-loss-coeff | Z-loss coefficient | None | +| --moe-input-jitter-eps | Input jitter epsilon | None | + +### Token Dispatching +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-token-dispatcher-type | Dispatcher: allgather, alltoall, flex | allgather | +| --moe-enable-deepep | Enable DeepEP (with flex) | False | +| --moe-expert-capacity-factor | Capacity factor | None | +| --moe-pad-expert-input-to-capacity | Pad to capacity | False | +| --moe-token-drop-policy | Drop policy: probs, position | probs | +| --moe-permute-fusion | Fuse permutation ops | False | + +### Performance Optimization +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-grouped-gemm | Use GroupedGEMM | False | +| --overlap-moe-expert-parallel-comm | Batch-level EP overlap | False | +| --delay-wgrad-compute | Split dgrad/wgrad compute | False | +| --moe-shared-expert-intermediate-size | Shared expert FFN size | None | +| --moe-shared-expert-overlap | Overlap shared expert | False | + +### Memory and Checkpointing +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-layer-recompute | Recompute MoE layer | False | +| --moe-use-upcycling | Enable upcycling | False | +| --moe-upcycling-granularity | Upcycling granularity | 1 | + +### Miscellaneous +| Argument | Description | Default | +|----------|-------------|---------| +| --moe-per-layer-logging | Per-layer logging | False | +| --moe-router-force-load-balancing | Force load balancing (experimental) | False | + +## Examples ```bash #!/bin/bash # Runs Mixtral 8x7B model on 32 H100/A100 GPUs -# The Dropless MoE suffers from an imbalanced token distribution at the early stage of training (the first few hundred iterations), which may lead to poor performance and out-of-memory (OOM) issues. -# To check the performance of a Dropless MoE model, we should run the model for at least 500 iterations or resume from trained checkpoints. export CUDA_DEVICE_MAX_CONNECTIONS=1 GPUS_PER_NODE=8 -# Change for multinode config MASTER_ADDR=${MASTER_ADDR:-"localhost"} MASTER_PORT=${MASTER_PORT:-"6000"} -NNODES=${NNODES:-"1"} +NNODES=${NNODES:-"4"} NODE_RANK=${RANK:-"0"} WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) @@ -333,11 +647,12 @@ MODEL_ARGS=( MOE_ARGS=( --num-experts 8 --expert-model-parallel-size 8 - --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, None. Default is aux_loss. + --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2 --moe-grouped-gemm --moe-permute-fusion + --moe-token-dispatcher-type alltoall ) DATA_ARGS=( @@ -372,24 +687,17 @@ MODEL_PARALLEL_ARGS=( ) LOGGING_ARGS=( - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ - --no-load-optim \ - --no-load-rng + --log-interval 1 + --save-interval 10000 + --eval-interval 1000 + --eval-iters 10 + --save $CHECKPOINT_PATH + --load $CHECKPOINT_PATH + --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" + --ckpt-format torch_dist + --auto-detect-ckpt-format ) -if [ -n "${WANDB_API_KEY}" ]; then - LOGGING_ARGS+=( - --wandb-project ${WANDB_PROJECT:-"Mixtral-Finetuning"} - --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} - ) -fi - torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ ${MODEL_ARGS[@]} \ ${MOE_ARGS[@]} \ @@ -398,107 +706,36 @@ torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ ${MODEL_PARALLEL_ARGS[@]} \ ${LOGGING_ARGS[@]} ``` +
    -# Performance Best Practice +## Contributing -### Tuning Guide of Parallel Mappings +We welcome contributions! Please see [CONTRIBUTING.md](../../../../CONTRIBUTING.md) for guidelines. -To find a good parallel mapping that help you achieve a high throughput of a new model, there are some general rule that could help. Here is an overview of properties in different aspects for each parallel strategy. +## Support -| Parallel Strategy | Peak Activation Memory | Weight Memory | Optimizer states | Communication (Per-Layer) | -|:-----------------:|:-------------------------------:|:--------------:|:---------------------------------:|:-------------------------:| -| TP | 1/N (with SP on) | 1/N | 1/N | High | -| EP | 1 | 1/N in MoELayer| 1/N | Medium | -| PP | 1 (>1 with virtual pipeline) | 1/N | 1/N | Medium | -| CP | 1/N | 1 | 1/N (with distributed optimizer) | Medium | -| DP | 1 | 1 | 1/N (with distributed optimizer) | Low | +- GitHub Issues: [Report bugs or request features](https://github.com/NVIDIA/Megatron-LM/issues) +- Documentation: [Full documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) -For a specific model, the best parallel mapping varies based on the model architecture, trained sequence length and the hardware platform. -Here we provide some general rules to get better performance: -1. Keep the model parallism size as small as possible. - - For the large language models, model parallism is often required to prevent OOM, but it will bring communication overhead and hurt performance. - - With distributed optimizer, master weights and optimizer states will be sharded across all DP ranks with slight communication overhead. - So try to reduce the model parallism size and increase data parallism size when there are lots of free GPU memory during training. -2. Ensure the EPxTP communication winthin the NVLink domain. - - Communications of EP and TP should remain within the NVLink domain as much as possible, as both are communication-intensive. - - If the model is too large and requires scaling across multiple nodes, consider PP before TP and EP. See item 3 for details. -3. Use Pipeline Parallelism to scale the model further. - - Enable Virtual Pipeline Parallelism(VPP) to reduce pp bubbles when PP_size >= 2 by setting `num_layers_per_virtual_pipeline_stage`. - - VPP_size tuning: the legal values of vpp_size are all common divisors of num_layers/pp_size, E.g., num_layers=24, pp_size=4, then we can pick vpp_size from {1, 2, 3, 6}. The larger the vpp_size, the lower the pipeline bubbles, while the larger number of P2P communications between each PP stages. Empirically a value in the middle often gives the best trade-off. `VPP_size=num_layers / PP_size / num_layers_per_virtual_pipeline_stage` -4. Prefer EP over TP for the expert layer when possible: - - TP saves more memory than EP, but EP can achieve better GEMM efficiency and less communication overhead than TP. - - If EP size increased to the number of expert, the local token permutation/un-permutation for experts computation are omitted. - - Simplify the computation graph of MoE layers, more convenient for performing potential comm-computation overlapping. - - In practice, EP8TP1 is better than EP4TP2 for 8x7B. -5. Enable Context Parallelism for long context training. - - The efficiency of CP largely depends on whether its communication can be overlapped with computation. - - Empirically, use CP when sequence length >= 8K. -### MoE Parallel Folding +## Citation -MoE Parallel Folding separates the MoE related parallel groups from Dense groups. -1. Traditional MoE parallel groups are entangled with dense by using a 5-dimension parallel group generator with default order `tp-cp-ep-dp-pp`. The EP group in MoE is a sub-group of DP in Attention. -2. With MoE Parallel Folding, we use a parallel group generator with `tp-cp-dp-pp` for Attention, and another with `tp-ep-dp-pp` for MoE. The EPxTP group in MoE is a sub-group of DPxCPxTP in Attention. - -By setting `--expert-tensor-parallel-size`, we can set MoE-specific TP size. - -#### Advantages of MoE Parallel Folding -1. The CP and EP group are folded together by defualt, such that: - 1. It reduces the minimal required GPUs to turn on both CP and EP. For example, the traditional way with (CP=8, EP=8) needs at least 64 GPUs, for now it only requires 8 GPUs. - 2. The CP and EP communication can be both put in the NVLink domain. -2. We can set different TP sizes for Attention and MoE part. - 1. For MoE, EP is often more efficient than TP. But in the traditional way, only using EP can get OOM for most models. - 2. With MoE parallel folding, we can turn on TP for Attention part and setting TP=1 for MoE models, which often gets better MFU. - -### End-to-End Training Practice -**Use the latest NVIDIA PyTorch or NeMo Docker Image** -- [NGC PyTorch Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) -- [NGC NeMo Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) - -**Token Dispatcher Choices** -- Token Dispatcher sends tokens to the designated expert, involves tensor rearangement and communications. -- Dispatcher `allgather` is the default option. It achieves better performance and efficiency when only tensor parallelism is used or when the Top-k value is very large. -- Dispatcher `alltoall` is recommended if expert parallelism is applied. -- Dispatcher `flex` is a new dispatcher decouples communication group from model parallelism. It supports two backends(DeepEP and HybridEP) selectable via `--moe-flex-dispatcher-backend`. - -**Enable Communication Overlap** -- Enable `--overlap-param-gather` and `--overlap-grad-reduce` with distributed optimizer. -- Enable `--tp-comm-overlap` when TP>1. -- Enable p2p comm overlap when PP > 1 by setting `num_layers_per_virtual_pipeline_stage`. - -**Enable GroupedGEMM when num_local_experts>1 with `--moe-grouped-gemm`** -- GroupedGEMM has higher efficiency than vanilla sequential GEMMs for each expert. -- Recommend to use the TE version of Grouped GEMM (by upgrading to MCore v0.8 and TE v1.9), which support Gradient Accumulation Fusion and FP8 Training. - -**OOM Caused by Token Distribution Imbalance when Training From Scratch** -MoE suffers from a severe load imbalance issue when the router is under-trained, leading to the model easily running out of memory (OOM), which typically occurs in the first 100~300 steps when training from scratch. -Therefore, there are two recommended ways during the first 200 steps to avoid the OOM problem, which can be removed after the token distribution is more stable: -1. Increase the `expert-tensor-parallel-size` and decrease `expert-model-parallel-size` to replace EP with TP in MoELayer, this can prevent the load imbalancing between EP ranks. Since current ETP implementation has some memeory overhead, you can further enable activation recomputation only for MoE Layer by adding `--moe-layer-recompute`. -2. Setting capacity factor to a relatively small number like 1.0 by adding `--moe-token-capacity-factor 1.0`. - -**Leverage DeepSeek's DeepEP for High-Performance Cross-Node Token Dispatching** -- The primary advantage of DeepEP is its cross-node token communication efficiency, which delivers substantial performance improvements when deploying expert parallelism across multiple nodes with large TopK values. -- To enable DeepEP in your training configuration, simply set `--moe-token-dispatcher-type=flex` and `--moe-enable-deepep` in your command line arguments. - -**FP8 Training Best Practice** -- Using latest version of [TransformerEngine](https://github.com/NVIDIA/TransformerEngine). -- Enable router padding with `--moe-router-padding-for-quantization` to reduce padding overhead. -- Enable native FP8 weights with `--fp8-param-gather` to reduce weights memory cost. - -### Reference Best Parallel Mapping - -Here are the reference parallel mappings of MCore v0.8 for Mixtral 8x7B and 8x22B models: -| Model | Vocab Size| Dispatcher | Precision | #GPUs | SEQ LEN | TP | EP | PP | VP | MBS | GBS | -|:-----------------------:|:---------:|:----------:|:---------:|:-----:|:-------:|:--:|:--:|:--:|:--:|:---:|:---:| -| Mixtral 8x7B(Dropless) | 32K | All-to-All | BF16 | 64 | 4096 | 1 | 8 | 4 | 8 | 1 | 256 | -| Mixtral 8x22B(Dropless) | 32K | All-to-All | BF16 | 128 | 4096 | 4 | 2 | 8 | 7 | 1 | 256 | - -Detailed Benchmark Information: -Server: -- 8xH100 80GB HBM3 -- NVLink 4th Generation -- InfiniBand 8x400 Gbit/s - -Docker Image: -- PyTorch 24.09 with TransformerEngine v1.11 +If you use Megatron-Core MoE in your research, please cite: + +```bibtex + +@article{megatron-lm, + title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, + author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan}, + journal={arXiv preprint arXiv:1909.08053}, + year={2019} +} + +@article{moe-parallel-folding, + title={MoE Parallel Folding: Heterogeneous Parallelism Mappings for Efficient Large-Scale MoE Model Training with Megatron Core}, + author={Liu, Dennis and Yan, Zijie and Yao, Xin and Liu, Tong and Korthikanti, Vijay and Wu, Evan and Fan, Shiqing and Deng, Gao and Bai, Hongxiao and Chang, Jianbin and Aithal, Ashwath and Andersch, Michael and Shoeybi, Mohammad and Yao, Jiajie and Zhou, Chandler and Wu, David and Li, Xipeng and Yang, June}, + year={2025}, + journal={arXiv preprint arXiv:2504.14960}, +} +``` From 9ea50a9d500c187798571d42ffaafe1bb77758c5 Mon Sep 17 00:00:00 2001 From: litianjian <45817262+litianjian@users.noreply.github.com> Date: Tue, 20 Jan 2026 22:52:05 +0800 Subject: [PATCH 244/248] feat: add routing replay for Mcore (#2693) Co-authored-by: litianjian Co-authored-by: Zijie Yan --- docs/source/api-guide/router_replay.md | 176 ++++++++++++++++++ megatron/core/transformer/moe/moe_utils.py | 16 +- megatron/core/transformer/moe/router.py | 6 + .../core/transformer/moe/router_replay.py | 161 ++++++++++++++++ .../core/transformer/transformer_config.py | 3 + megatron/training/arguments.py | 3 + .../transformer/moe/test_router_replay.py | 95 ++++++++++ 7 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 docs/source/api-guide/router_replay.md create mode 100644 megatron/core/transformer/moe/router_replay.py create mode 100644 tests/unit_tests/transformer/moe/test_router_replay.py diff --git a/docs/source/api-guide/router_replay.md b/docs/source/api-guide/router_replay.md new file mode 100644 index 00000000000..334a29c78a6 --- /dev/null +++ b/docs/source/api-guide/router_replay.md @@ -0,0 +1,176 @@ +# Design Document: MoE Router Replay Feature + +### 1. Overview + +This document provides a detailed description of the "Router Replay" feature implemented within the Megatron-LM Core for Mixture-of-Experts (MoE) models. + +This feature is designed to enhance determinism and analyzability in MoE model training and inference. It enables the model to load routing decisions from a predefined file and enforce their use during the forward pass, thereby bypassing the real-time routing computation. + +### 2. Motivation + +* **Determinism & Reproducibility**: In distributed training, MoE routing decisions can exhibit minor variations due to factors like floating-point precision. By replaying a fixed routing table, the MoE computation path is guaranteed to be identical across runs, which facilitates debugging and reproducing experimental results. +* **Performance Profiling**: The router's own computation (e.g., logits calculation, top-k selection) incurs overhead. In replay mode, this part of the computation can be completely skipped, allowing for more precise isolation and profiling of performance bottlenecks within the Expert Layers themselves. +* **Debugging Aid**: When issues arise in the model, fixing the routing decisions helps to isolate variables, making it easier to determine whether the problem lies with the routing mechanism or the expert computations. + +### 3. Design and Architecture + +The design follows the principles of being non-intrusive and on-demand, with the core idea of activating the replay logic only when explicitly requested by the user. + +* **Core Components**: + * `RouterReplay` (located in `megatron/core/transformer/moe/router_replay.py`): A utility class for replaying MoE routing decisions. When enabled via the `enable_routing_replay` flag, a separate instance of `RouterReplay` is created for each MoE layer's router. Each instance is responsible for loading routing data and providing the deterministic routing decisions for its corresponding layer during the forward pass. + * `enable_routing_replay` (located in `megatron/core/transformer/transformer_config.py`): A boolean global configuration flag that serves as the sole entry point for enabling this feature. + +* **Workflow**: + The feature supports different modes, such as recording and replaying, controlled by a `RouterReplayAction`. + + 1. **Enabling the Feature**: The user sets `enable_routing_replay` to `True` in the model configuration. + 2. **Initialization**: When `enable_routing_replay` is true, each `TopKRouter` creates its own `RouterReplay` instance. + 3. **Mode Configuration**: The user must programmatically set the desired router replay action (e.g., `record`, `forward_replay`, `backward_replay`) on the `RouterReplay` instances. + 4. **Execution Flow (within a mini-batch)**: + * **Forward Pass**: + * For each micro-batch, the `topk_routing_with_score_function` checks the `router_replay_action`. + * **In `record` mode**: The dynamically computed `top-k` expert indices are captured and stored. + * **In `forward_replay` mode**: The function retrieves pre-loaded expert indices from `target_topk_idx`. These indices are used for the forward computation and are also appended to the `replay_backward_list` to prepare for the backward pass. + * **Backward Pass**: + * For each micro-batch (processed in reverse order in pipeline parallelism), the `router_replay_action` is checked again. + * **In `backward_replay` mode**: The function retrieves the expert indices for the corresponding micro-batch by popping them from the `replay_backward_list`. This mode is intended for training recomputation (e.g., activation checkpointing and pipeline recompute) so the same routing decisions are used during recompute/backward as in forward, ensuring determinism and correctness. + +### 4. Implementation Details + +The implementation cleanly separates the replay logic from the router's core computation. + +* **`megatron/core/transformer/transformer_config.py`**: + * Adds the configuration option `enable_routing_replay: bool = False`. + +* **`megatron/core/transformer/moe/moe_utils.py`**: + * Introduces the `RouterReplay` class to manage the state for recording and replaying routing decisions for a single MoE layer. + * `target_topk_idx`: An attribute holding the expert indices for the current micro-batch during forward replay mode. + * `recorded_topk_idx`: An attribute for storing the computed expert indices when in record mode. + * `replay_backward_list`: A list that accumulates the `top-k` indices used during the forward passes of a mini-batch. This list is consumed in FIFO order during the backward pass to ensure correctness under pipeline parallelism. + * `set_target_indices()`: A method to load the replay indices into `target_topk_idx` for the forward pass. + * `record_indices()`: A method to save the computed indices. + * The `topk_routing_with_score_function` is modified to contain the core logic. It checks the `router_replay_action` on the `router_replay` instance and accordingly performs one of the following actions: computes and records indices, replays indices from `target_topk_idx` (for forward), replays indices from `replay_backward_list` (for backward), or falls through to the default dynamic routing. + +#### Training recompute usage +- During forward replay, `set_target_indices()` prepares `replay_backward_list` so each micro-batch’s indices are available for recomputation. +- During recompute/backward, set action to `REPLAY_BACKWARD` so indices are consumed in FIFO order to mirror the forward sequence. + +### 5. Usage Guide + +1. **Enable & Instantiate** + - Create one `RouterReplay` instance per MoE router layer when building the model. + - Optionally use the global helpers to set/clear actions across all layers. +2. **Record Routing Decisions** + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)`. + - Run the model; retrieve per-layer indices via `RouterReplay.get_recorded_data()` and persist. +3. **Forward Replay** + - Load indices and distribute: `RouterReplay.set_replay_data(list_of_tensors)`. + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)`. + - Run the model; dynamic top‑k is bypassed and target indices are used. +4. **Backward Replay** + - For training recomputation (activation checkpointing or pipeline recompute), set action: `REPLAY_BACKWARD` during recomputation. + - Per micro‑batch indices are consumed from `replay_backward_list` in FIFO order. +5. **Cleanup** + - Use `RouterReplay.clear_global_indices()`, `RouterReplay.clear_global_router_replay_action()`, and `RouterReplay.clear_global_router_replay_instances()` to restore default behavior and prevent memory leaks. + +#### Quick usage with `topk_routing_with_score_function` + +```python +import torch +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction +from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function + +rr = RouterReplay() + +# Record +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) +logits = torch.randn(8, 16) +probs_rec, routing_map_rec = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) +recorded = rr.get_recorded_indices() +torch.save(recorded, "/tmp/replay.pt") + +# Forward replay +rr.clear_router_replay_action() +rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD) +target = torch.load("/tmp/replay.pt") +rr.set_target_indices(target) +probs_rep, routing_map_rep = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) + +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +``` + +### 6. Minimal Demo + +Here is a minimal code example showing how to use RouterReplay for recording and replaying: + +```python +import torch +import torch.distributed as dist +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.moe.router import TopKRouter +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction + + +# Initialize distributed training +if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + +# Create a transformer config with RouterReplay enabled +config = TransformerConfig( + num_experts=8, + expert_model_parallel_size=1, + num_top_k=2, + enable_routing_replay=True +) + +# Create a TopKRouter instance +router = TopKRouter(config) + +# Generate sample input (batch_size, sequence_length, hidden_size) +logits = torch.randn(16, 32, 8).to(torch.cuda.current_device()) + +# ----------------- +# 1. Recording Mode +# ----------------- +print("=== Recording Mode ===") +# Set global router replay action to RECORD +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) + +# Perform routing +routing_output = router.forward(logits) +print(f"Recorded top-k indices shape: {routing_output.top_k_idx.shape}") + +# ----------------- +# 2. Forward Replay Mode +# ----------------- +print("\n=== Forward Replay Mode ===") +# Save recorded indices to a file +torch.save(routing_output.top_k_idx, "/tmp/replay.pt") + +# Load indices from file and set as target for replay +replay_indices = torch.load("/tmp/replay.pt") +for router_instance in RouterReplay.global_router_replay_instances: + router_instance.target_topk_idx = replay_indices + +# Set global router replay action to REPLAY_FORWARD +RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + +# Perform routing again - this will use the replayed indices +replay_routing_output = router.forward(logits) +print(f"Replayed top-k indices shape: {replay_routing_output.top_k_idx.shape}") +print(f"Are indices the same? {torch.equal(routing_output.top_k_idx, replay_routing_output.top_k_idx)}") + + +# Clean up +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +if dist.is_initialized(): + dist.destroy_process_group() +``` diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 60878155fd4..e5e06f05758 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -18,6 +18,7 @@ from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region from megatron.core.transformer.cuda_graphs import is_graph_capturing from megatron.core.transformer.enums import CudaGraphScope +from megatron.core.transformer.moe.router_replay import RouterReplay from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import internal_api @@ -580,6 +581,7 @@ def topk_routing_with_score_function( score_function: str = "softmax", expert_bias: Optional[torch.Tensor] = None, fused: bool = False, + router_replay: Optional['RouterReplay'] = None, ): """Compute the routing probabilities and map for top-k selection with score function. Args: @@ -591,6 +593,9 @@ def topk_routing_with_score_function( scaling_factor (float): Scaling factor of routing score in top-k selection. score_function (str): The score function to use. Can be either "softmax" or "sigmoid". expert_bias (torch.Tensor): The bias added to logits for expert routing. + router_replay (Optional['RouterReplay']): For debugging and development, allows for + deterministic routing by replaying a previously + recorded routing sequence. Returns: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing @@ -617,7 +622,7 @@ def topk_routing_with_score_function( expert_bias=expert_bias, ) - def compute_topk(scores, topk, num_groups=None, group_topk=None): + def _compute_topk(scores, topk, num_groups=None, group_topk=None): if group_topk: return group_limited_topk( scores=scores, @@ -630,6 +635,15 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None): else: return torch.topk(scores, k=topk, dim=1) + def compute_topk(scores, topk, num_groups=None, group_topk=None): + # Default behavior if no replay is active + if router_replay is None: + return _compute_topk(scores, topk, num_groups=num_groups, group_topk=group_topk) + else: + return router_replay.get_replay_topk( + scores, topk, num_groups, group_topk, _compute_topk + ) + if score_function == "softmax": if use_pre_softmax: scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 003043bc18d..01238e425d9 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -22,6 +22,7 @@ topk_routing_with_score_function, z_loss_func, ) +from megatron.core.transformer.moe.router_replay import RouterReplay from megatron.core.transformer.transformer_config import TransformerConfig @@ -202,6 +203,10 @@ def __init__( self.global_tokens_per_expert = None self.ga_steps = None + self.router_replay = None + if self.config.enable_routing_replay: + self.router_replay = RouterReplay() + def _maintain_float32_expert_bias(self): """ Maintain the expert bias in float32. @@ -580,6 +585,7 @@ def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = N score_function=self.score_function, expert_bias=self.expert_bias, fused=self.config.moe_router_fusion, + router_replay=self.router_replay, ) # Apply token dropping to probs and routing_map. diff --git a/megatron/core/transformer/moe/router_replay.py b/megatron/core/transformer/moe/router_replay.py new file mode 100644 index 00000000000..b6b8e26a0a6 --- /dev/null +++ b/megatron/core/transformer/moe/router_replay.py @@ -0,0 +1,161 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +from enum import Enum +from typing import Callable, List, Optional, Tuple + +import torch + + +class RouterReplayAction(Enum): + """ + A Enum to define the actions for router replay. + """ + + RECORD = "record" # Record the topk indices for replay + REPLAY_FORWARD = "replay_forward" # Replay the recorded topk indices for forward pass + REPLAY_BACKWARD = "replay_backward" # Replay topk indices for re-compute during backward pass + + +class RouterReplay: + """ + A class to manage the recording and replaying of MoE routing decisions. + It holds all router instances and provides static methods to globally + control recording and replaying. + """ + + # Static variable to hold all router instances, one per MoE layer. + global_router_replay_instances: List['RouterReplay'] = [] + + @staticmethod + def set_replay_data(all_layers_topk_indices: List[torch.Tensor]): + """ + Distributes the topk indices for all layers to their respective RouterReplay instances. + :param all_layers_topk_indices: A list of tensors, where each tensor contains the + topk indices for a specific layer. The order + must match the instantiation order of the routers. + """ + if len(all_layers_topk_indices) != len(RouterReplay.global_router_replay_instances): + raise ValueError( + f"The number of replay tensors ({len(all_layers_topk_indices)}) " + f"does not match instances ({len(RouterReplay.global_router_replay_instances)})." + ) + for i, router_instance in enumerate(RouterReplay.global_router_replay_instances): + router_instance.set_target_indices(all_layers_topk_indices[i]) + + @staticmethod + def get_recorded_data() -> List[torch.Tensor]: + """ + Collects the recorded topk indices from all RouterReplay instances. + :return: A list of tensors, each containing the recorded topk indices for a layer. + """ + return [ + router.get_recorded_indices() for router in RouterReplay.global_router_replay_instances + ] + + @staticmethod + def clear_global_indices(): + """Clears the recorded and target topk indices in all instances.""" + for router in RouterReplay.global_router_replay_instances: + router.clear_indices() + + @staticmethod + def set_global_router_replay_action(router_replay_action: RouterReplayAction): + """Sets the router replay action for all router instances.""" + for router in RouterReplay.global_router_replay_instances: + router.set_router_replay_action(router_replay_action) + + @staticmethod + def clear_global_router_replay_action(): + """Clears the router replay action for all router instances.""" + for router in RouterReplay.global_router_replay_instances: + router.clear_router_replay_action() + + @staticmethod + def clear_global_router_replay_instances(): + """Clear the global list of router replay instances to prevent memory leaks.""" + RouterReplay.global_router_replay_instances.clear() + + def __init__(self): + """Initializes a RouterReplay instance for a specific layer.""" + self.target_topk_idx: Optional[torch.Tensor] = None # Target topk indices for replay + self.recorded_topk_idx: Optional[torch.Tensor] = None # Recorded topk indices for replay + self.router_replay_action: Optional[RouterReplayAction] = ( + None # Router replay action for this layer + ) + self.replay_backward_list: List[torch.Tensor] = ( + [] + ) # List of tensors for backward pass replay + RouterReplay.global_router_replay_instances.append(self) + + def set_target_indices(self, topk_indices: torch.Tensor): + """Sets the target topk indices for replay.""" + self.target_topk_idx = topk_indices + self.replay_backward_list.append(topk_indices) + + def get_recorded_indices(self) -> Optional[torch.Tensor]: + """Returns the recorded topk indices.""" + return self.recorded_topk_idx + + def record_indices(self, topk_indices: torch.Tensor): + """Records the topk indices.""" + self.recorded_topk_idx = topk_indices + + def clear_indices(self): + """Clears the recorded and target topk indices.""" + self.recorded_topk_idx = None + self.target_topk_idx = None + self.replay_backward_list = [] + + def set_router_replay_action(self, router_replay_action: RouterReplayAction): + """Sets the router replay action for this layer.""" + self.router_replay_action = router_replay_action + + def clear_router_replay_action(self): + """Clears the router replay action for this layer.""" + self.router_replay_action = None + + def get_replay_topk( + self, + scores: torch.Tensor, + topk: int, + num_groups: Optional[int] = None, + group_topk: Optional[int] = None, + default_compute_topk: Callable[ + [torch.Tensor, int, Optional[int], Optional[int]], Tuple[torch.Tensor, torch.Tensor] + ] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + A wrapper for top-k computation that handles different replay actions. + + Args: + scores (torch.Tensor): The scores to compute top-k on. + topk (int): The number of top elements to select. + num_groups (Optional[int]): Number of expert groups for group-limited routing. + group_topk (Optional[int]): Number of groups to select for each token. + default_compute_topk (Callable): The default top-k computation function, which + should return a tuple of (values, indices). + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing the top-k values and indices. + """ + if self.router_replay_action == RouterReplayAction.RECORD: + probs, top_indices = default_compute_topk( + scores, topk, num_groups=num_groups, group_topk=group_topk + ) + self.record_indices(top_indices) + return probs, top_indices + elif self.router_replay_action == RouterReplayAction.REPLAY_FORWARD: + top_indices = self.target_topk_idx + # Ensure indices are on the correct device + top_indices = top_indices.to(scores.device) + # Gather the scores for the replayed indices to get the probabilities + probs = scores.gather(1, top_indices) + return probs, top_indices + elif self.router_replay_action == RouterReplayAction.REPLAY_BACKWARD: + top_indices = self.replay_backward_list.pop(0) + # Ensure indices are on the correct device + top_indices = top_indices.to(scores.device) + # Gather the scores for the replayed indices to get the probabilities + probs = scores.gather(1, top_indices) + return probs, top_indices + else: + return default_compute_topk(scores, topk, num_groups, group_topk) diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 18cea44c51f..875d8a92049 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -551,6 +551,9 @@ class TransformerConfig(ModelParallelConfig): moe_router_topk: int = 2 """Number of experts to route to for each token.""" + enable_routing_replay: bool = False + """Enable routing replay for MoE.""" + moe_router_topk_limited_devices: Optional[int] = None """Number of EP ranks to consider for each token in group-limited routing, DEPRECATED and replaced by moe_router_num_groups and moe_router_group_topk. diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index a65f1cd6469..7744869f80e 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -3325,6 +3325,9 @@ def _add_moe_args(parser): help='Score function for MoE TopK routing. Can be "softmax" or "sigmoid".') group.add_argument('--moe-router-topk', type=int, default=2, help='Number of experts to route to for each token. The default is 2.') + group.add_argument('--enable-routing-replay', action='store_true', + help='Enable routing replay for MoE routers. When enabled, the router will ' + 'use a pre-defined routing table instead of computing it on the fly.') group.add_argument('--moe-router-pre-softmax', action='store_true', help='Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k.') group.add_argument('--moe-router-num-groups', type=int, default=None, diff --git a/tests/unit_tests/transformer/moe/test_router_replay.py b/tests/unit_tests/transformer/moe/test_router_replay.py new file mode 100644 index 00000000000..840fc0fd269 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_router_replay.py @@ -0,0 +1,95 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +import pytest +import torch + +from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction + + +def setup_function(): + RouterReplay.global_router_replay_instances.clear() + + +def teardown_function(): + RouterReplay.global_router_replay_instances.clear() + + +def test_record_mode_with_topk_routing_softmax_post(): + rr = RouterReplay() + rr.set_router_replay_action(RouterReplayAction.RECORD) + logits = torch.randn(4, 6) + probs, routing_map = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, router_replay=rr, score_function="softmax" + ) + recorded = rr.get_recorded_indices() + expected_idx = torch.topk(logits, k=2, dim=1).indices + assert recorded is not None + assert torch.equal(recorded, expected_idx) + assert probs.shape == (4, 6) + assert routing_map.shape == (4, 6) + assert routing_map.sum(dim=1).eq(2).all() + + +def test_replay_forward_with_topk_routing_softmax_pre(): + rr = RouterReplay() + rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + logits = torch.randn(3, 5) + target = torch.tensor([[1, 2], [0, 3], [2, 4]], dtype=torch.long) + rr.set_target_indices(target) + probs, routing_map = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=True, router_replay=rr, score_function="softmax" + ) + assert routing_map.sum(dim=1).eq(2).all() + scores = torch.softmax(logits, dim=-1) + assert torch.equal(probs.gather(1, target), scores.gather(1, target)) + + +def test_replay_forward_with_topk_routing_softmax_post(): + rr = RouterReplay() + rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + logits = torch.randn(3, 6) + target = torch.tensor([[1, 2], [0, 5], [3, 4]], dtype=torch.long) + rr.set_target_indices(target) + probs, routing_map = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, router_replay=rr, score_function="softmax" + ) + selected = torch.softmax(logits.gather(1, target), dim=-1) + assert torch.equal(probs.gather(1, target), selected) + assert routing_map.sum(dim=1).eq(2).all() + + +def test_global_set_get_clear_indices(): + r1 = RouterReplay() + r2 = RouterReplay() + t1 = torch.tensor([[0, 1]], dtype=torch.long) + t2 = torch.tensor([[1, 0]], dtype=torch.long) + RouterReplay.set_replay_data([t1, t2]) + assert torch.equal(r1.target_topk_idx, t1) + assert torch.equal(r2.target_topk_idx, t2) + r1.record_indices(t1) + r2.record_indices(t2) + rec = RouterReplay.get_recorded_data() + assert len(rec) == 2 + assert torch.equal(rec[0], t1) + assert torch.equal(rec[1], t2) + RouterReplay.clear_global_indices() + assert r1.target_topk_idx is None and r2.target_topk_idx is None + assert r1.get_recorded_indices() is None and r2.get_recorded_indices() is None + + +def test_global_action_set_and_clear(): + r1 = RouterReplay() + r2 = RouterReplay() + RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + assert r1.router_replay_action == RouterReplayAction.REPLAY_FORWARD + assert r2.router_replay_action == RouterReplayAction.REPLAY_FORWARD + RouterReplay.clear_global_router_replay_action() + assert r1.router_replay_action is None and r2.router_replay_action is None + + +def test_set_replay_data_length_mismatch(): + _ = RouterReplay() + with pytest.raises(ValueError): + RouterReplay.set_replay_data( + [torch.tensor([[0, 1]], dtype=torch.long), torch.tensor([[1, 0]], dtype=torch.long)] + ) From ac9f665c149e8114a9e8fb2294f7e1dd825b4c25 Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Wed, 21 Jan 2026 13:49:18 +0800 Subject: [PATCH 245/248] [dev] feat(moe): Support apply wd to qk layernorm for Qwen3-Next (#2825) Signed-off-by: John St. John Co-authored-by: John St. John Co-authored-by: Deepak Narayanan <2724038+deepakn94@users.noreply.github.com> --- megatron/core/optimizer/__init__.py | 44 ++++++---- megatron/core/optimizer/optimizer_config.py | 57 +++++++++++++ megatron/core/ssm/gated_delta_net.py | 2 +- megatron/training/arguments.py | 25 ++++-- megatron/training/training.py | 2 +- .../model_config.yaml | 2 +- tests/unit_tests/test_optimizer.py | 82 ++++++++++++++++++- 7 files changed, 186 insertions(+), 28 deletions(-) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py index b4d15daefd2..11aa6c49585 100644 --- a/megatron/core/optimizer/__init__.py +++ b/megatron/core/optimizer/__init__.py @@ -60,40 +60,48 @@ OptimizerConfig, ParamKey, ParamPredicate, + ParamWithNamePredicate, SGDOptimizerConfig, ) logger = logging.getLogger(__name__) -def get_standard_config_overrides( - decoupled_lr: float | None = None, decoupled_min_lr: float | None = None -) -> Dict[ParamKey, ParamGroupOverride]: +def get_standard_config_overrides(config: OptimizerConfig) -> Dict[ParamKey, ParamGroupOverride]: """Get standard config overrides for the optimizer, handling decoupled LR and common wd skips. Args: - decoupled_lr (float | None): decoupled learning rate. - decoupled_min_lr (float | None): decoupled minimum learning rate. + config (OptimizerConfig): optimizer configuration object. Returns: Dict[ParamKey, ParamGroupOverride]: standard config overrides. """ config_overrides: Optional[Dict[ParamKey, ParamGroupOverride]] = {} - if decoupled_lr is not None: - decoupled_lr_config: ParamGroupOverride = {"max_lr": decoupled_lr} - decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") - if decoupled_min_lr is not None: - decoupled_lr_config["min_lr"] = decoupled_min_lr - config_overrides[decoupled_param_key] = decoupled_lr_config + # First, figure out how we are going to do wd skipping. The two main approaches are: + # 1. The classic megatron approach of skipping all len 1 and bias parameters. + # 2. The Qwen3-Next approach of doing 1, other than qk layernorm parameters. + if config.apply_wd_to_qk_layernorm: + shape_1_not_qkln_param = ParamWithNamePredicate( + name="s1_not_qkln", + fn=lambda param, name: (len(param.shape) == 1 or name.endswith(".bias")) + and not ("q_layernorm." in name or "k_layernorm." in name), + ) + param_wd_mult_key = ParamKey(with_name_predicate=shape_1_not_qkln_param) + else: + param_length_1_match = ParamPredicate( + name="param_len_1", fn=lambda param: len(param.shape) == 1 + ) + param_wd_mult_key = ParamKey(name="*.bias", predicate=param_length_1_match) - # Next construct the standard param group overrides for no weight decay on bias parameters - # as well as any length 1 parameters. - param_length_1_match = ParamPredicate( - name="param_len_1", fn=lambda param: len(param.shape) == 1 - ) - param_wd_mult_key = ParamKey(name="*.bias", predicate=param_length_1_match) config_overrides[param_wd_mult_key] = ParamGroupOverride(wd_mult=0.0) + if config.decoupled_lr is not None: + decoupled_lr_config: ParamGroupOverride = {"max_lr": config.decoupled_lr} + decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") + if config.decoupled_min_lr is not None: + decoupled_lr_config["min_lr"] = config.decoupled_min_lr + config_overrides[decoupled_param_key] = decoupled_lr_config + return config_overrides @@ -132,7 +140,7 @@ def _get_param_groups( # the config_overrides argument by default lead to bias parameters and length 1 parameters. # We assume that users of decoupled LR already provide config overrides so will adapt # to the new API. - config_overrides = get_standard_config_overrides() + config_overrides = get_standard_config_overrides(config=config) for model_chunk in model_chunks: for name, param in model_chunk.named_parameters(): diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py index 1813488d7bd..a1429b7a170 100644 --- a/megatron/core/optimizer/optimizer_config.py +++ b/megatron/core/optimizer/optimizer_config.py @@ -33,6 +33,34 @@ def __call__(self, param: torch.nn.Parameter) -> bool: return self.fn(param) +@dataclass(frozen=True) +class ParamWithNamePredicate: + """Wraps a matching function to make it hashable for ParamKey. + Example: + >>> shape_1_not_qkln_param = ParamWithNamePredicate( + name="s1_not_qkln", + fn=lambda param, name: ( + len(param.shape) == 1 or name.endswith(".bias") + and not ("q_layernorm." in name or "k_layernorm." in name) + ) + ) + >>> shape_1_not_qkln_param(torch.empty(10), "interesting.bias") + True + >>> shape_1_not_qkln_param(torch.empty(10), "interesting.q_layernorm.bias") + False + + NOTE: + __hash__ and __eq__ are automatically generated by @dataclass(frozen=True) + based solely on 'name' because we set compare=False/hash=False on 'fn'. + """ + + name: str + fn: Callable[[torch.nn.Parameter, str], bool] = field(compare=False, hash=False) + + def __call__(self, param: torch.nn.Parameter, name: str) -> bool: + return self.fn(param, name) + + @dataclass(frozen=True, slots=True) class ParamKey: """Key to group parameters by. All such grouped parameters can share an @@ -49,6 +77,15 @@ class ParamKey: predicate: Union[ParamPredicate, Tuple[ParamPredicate]] = field(default_factory=tuple) """Predicate(s) to match parameters by. If multiple predicates are provided, any must match.""" + with_name_predicate: Union[ParamWithNamePredicate, Tuple[ParamWithNamePredicate]] = field( + default_factory=tuple + ) + """ + Predicate(s) to match parameters with their name. If multiple predicates are provided, + any must match. This is useful if you need to filter out some parameters from an otherwise + positive match by their name. + """ + def matches(self, param: torch.nn.Parameter, param_name: str) -> bool: """Returns true if passed-in parameter (with name) matches `param_key`. @@ -86,6 +123,15 @@ def matches(self, param: torch.nn.Parameter, param_name: str) -> bool: for predicate in self.predicate: if predicate(param): return True + + # Check if with_name_predicate matches. + if isinstance(self.with_name_predicate, ParamWithNamePredicate): + if self.with_name_predicate(param, param_name): + return True + else: + for predicate in self.with_name_predicate: + if predicate(param, param_name): + return True return False @@ -104,9 +150,20 @@ class OptimizerConfig: min_lr: Optional[float] = None """Minumum value for learning rate. The scheduler clip values below this threshold.""" + decoupled_lr: Optional[float] = None + """Separate learning rate for the input and output layer.""" + + decoupled_min_lr: Optional[float] = None + """Minimum value for learning rate for the input and output layer. The scheduler clip values + below this threshold. + """ + weight_decay: float = 0.01 """Weight decay coefficient for L2 regularization.""" + apply_wd_to_qk_layernorm: bool = False + """If true, apply weight decay to qk layernorm as a special case.""" + ############## # Precision ############## diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py index 2b0a18b433b..a08d043bdb3 100644 --- a/megatron/core/ssm/gated_delta_net.py +++ b/megatron/core/ssm/gated_delta_net.py @@ -246,7 +246,7 @@ def reset_parameters(self): dtype=self.config.params_dtype, device=torch.cuda.current_device(), ).uniform_(*self.A_init_range) - self.A_log.data.copy_(A) + self.A_log.data.copy_(torch.log(A)) def forward( self, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 7744869f80e..c85228e1136 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -911,6 +911,17 @@ def validate_args(args, defaults={}): dc = torch.cuda.get_device_capability() assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels." + if args.no_weight_decay_cond_type is not None: + print_rank_0( + 'WARNING: --no-weight-decay-cond-type is deprecated. Please use --apply-wd-to-qk-layernorm instead.', + args.rank, + ) + if args.no_weight_decay_cond_type == "apply_wd_to_qk_layernorm": + args.apply_wd_to_qk_layernorm = True + else: + raise ValueError(f"Invalid no_weight_decay_cond_type: {args.no_weight_decay_cond_type}") + args.no_weight_decay_cond_type = None + if args.weight_decay_incr_style == 'constant': assert args.start_weight_decay is None assert args.end_weight_decay is None @@ -2083,12 +2094,8 @@ def _add_regularization_args(parser): group.add_argument('--weight-decay-incr-style', type=str, default='constant', choices=['constant', 'linear', 'cosine'], help='Weight decay increment function.') - group.add_argument('--no-weight-decay-cond-type', type=str, choices=['apply_wd_to_qk_layernorm'], - help='Type of no weight decay condition. Choices: ' - 'None (default): param no weight decay if and only if it is 1D; or it is bias; ' - 'or it is embedding and embedding_init_method_std is not None. ' - '"apply_wd_to_qk_layernorm": In addition to the default rules, ' - 'apply weight decay to qk layernorm as a special case.') + group.add_argument('--apply-wd-to-qk-layernorm', action='store_true', + help='Apply weight decay to qk layernorm as a special case.') group.add_argument('--clip-grad', type=float, default=1.0, help='Gradient clipping based on global L2 norm.') group.add_argument('--adam-beta1', type=float, default=0.9, @@ -2123,6 +2130,12 @@ def _add_regularization_args(parser): group.add_argument('--muon-extra-scale-factor', type=float, default=1.0, help='Additional scale factor for the muon update') + group.add_argument('--no-weight-decay-cond-type', type=str, choices=['apply_wd_to_qk_layernorm'], + help='Type of no weight decay condition. Choices: ' + 'None (default): apply weight decay to 1D weights and biases.' + '"apply_wd_to_qk_layernorm": additionally apply weight decay to ' + 'qk layernorm as a special case.' + 'DEPRECATED. Please use --apply-wd-to-qk-layernorm instead. ') return parser diff --git a/megatron/training/training.py b/megatron/training/training.py index 8aff2556d14..60156e1f227 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1248,7 +1248,7 @@ def get_megatron_optimizer_config(args: Any) -> OptimizerConfig: # Construct the appropriate config_overrides object. This default handles many cases, but # can be added to as needed by the user, or replaced entirely with a custom override. - config_overrides = get_standard_config_overrides(args.decoupled_lr, args.decoupled_min_lr) + config_overrides = get_standard_config_overrides(config=config) return config, config_overrides diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml index 5f63de867d9..37933a0e0a7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_gdn/model_config.yaml @@ -18,7 +18,7 @@ MODEL_ARGS: --no-rope-fusion: true #TODO: We can remove this once upgrading to the DEV container --apply-layernorm-1p: true --attention-output-gate: true - --no-weight-decay-cond-type: apply_wd_to_qk_layernorm + --apply-wd-to-qk-layernorm: true --experimental-attention-variant: gated_delta_net --linear-attention-freq: 3 --linear-conv-kernel-dim: 4 diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py index 4f914b56f7c..1f5bbc3f14c 100644 --- a/tests/unit_tests/test_optimizer.py +++ b/tests/unit_tests/test_optimizer.py @@ -21,6 +21,7 @@ _get_param_groups, check_config_overrides_consistency, get_megatron_optimizer, + get_standard_config_overrides, ) from megatron.core.optimizer_param_scheduler import ParamGroupOverride from megatron.core.process_groups_config import ProcessGroupCollection @@ -45,7 +46,7 @@ class Net(nn.Module): - def __init__(self): + def __init__(self, add_layernorm=False): super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) @@ -53,6 +54,10 @@ def __init__(self): self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) + if add_layernorm: + self.q_layernorm = nn.LayerNorm(10, bias=False) + self.k_layernorm = nn.LayerNorm(10, bias=False) + self.layernorm = nn.LayerNorm(10, bias=False) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) @@ -206,6 +211,81 @@ def test_get_param_groups_overlapping_matches(mock_get_world_size): assert param_groups[2]['max_lr'] == 0.01 +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_with_standard_config_overrides(apply_wd_to_qk_layernorm: bool): + """In this test, we see if the standard config overrides are applied correctly.""" + + # Initialize the model with layernorm + net = Net() + + config = OptimizerConfig(optimizer='adam', lr=0.01) + config_overrides = get_standard_config_overrides(config=config) + param_groups = _get_param_groups([net], config, config_overrides) + + assert len(param_groups) == 2 + p_set = set(net.parameters()) + + assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params']) + assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params']) + assert param_groups[0]['wd_mult'] == 0.0 or param_groups[1]['wd_mult'] == 0.0 + assert param_groups[0]['wd_mult'] == 1.0 or param_groups[1]['wd_mult'] == 1.0 + assert len(param_groups[0]['params']) > 0 and len(param_groups[1]['params']) > 0 + + # Both param groups should have 5 parameters. + # Param group A (wd_mult=1.0): conv1.weight, conv2.weight, fc1.weight, fc2.weight, fc3.weight + # Param group B (wd_mult=0.0): conv1.bias, conv2.bias, fc1.bias, fc2.bias, fc3.bias + assert len(param_groups[0]['params']) == 5, ( + f"Expected 5 parameters in the first param group, " + f"but got {len(param_groups[0]['params'])}" + ) + assert len(param_groups[1]['params']) == 5, ( + f"Expected 5 parameters in the second param group, " + f"but got {len(param_groups[1]['params'])}" + ) + + +@patch('torch.distributed.get_world_size', return_value=1) +@patch( + 'torch.distributed.all_gather_object', lambda output_list, obj: output_list.__setitem__(0, obj) +) +def test_get_param_groups_appling_wd_to_qk_layernorm(apply_wd_to_qk_layernorm: bool): + """In this test, we see if the `apply_wd_to_qk_layernorm` config is applied correctly.""" + + # Initialize the model with layernorm + net = Net(add_layernorm=True) + + config = OptimizerConfig( + optimizer='adam', lr=0.01, apply_wd_to_qk_layernorm=apply_wd_to_qk_layernorm + ) + config_overrides = get_standard_config_overrides(config=config) + param_groups = _get_param_groups([net], config, config_overrides) + + assert len(param_groups) == 2 + p_set = set(net.parameters()) + + assert p_set == set(param_groups[0]['params']) | set(param_groups[1]['params']) + assert len(p_set) == len(param_groups[0]['params']) + len(param_groups[1]['params']) + assert param_groups[0]['wd_mult'] == 1.0 + assert param_groups[1]['wd_mult'] == 0.0 + + # There are two param groups, having 7, and 6 parameters respectively. + # Param group A (wd_mult=1.0): conv1.weight, conv2.weight, fc1.weight, fc2.weight, fc3.weight, + # q_layernorm.weight, k_layernorm.weight + # Param group B (wd_mult=0.0): conv1.bias, conv2.bias, fc1.bias, fc2.bias, fc3.bias, + # layernorm.weight + assert len(param_groups[0]['params']) == 7, ( + f"Expected 5 parameters in the first param group, " + f"but got {len(param_groups[0]['params'])}" + ) + assert len(param_groups[1]['params']) == 6, ( + f"Expected 6 parameters in the second param group, " + f"but got {len(param_groups[1]['params'])}" + ) + + def test_chained_optimizer(): net = Net() optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01) From 6e2153b9e3c7a71c07bdb1aa417bef0177809f01 Mon Sep 17 00:00:00 2001 From: Yuzhong Wang Date: Wed, 21 Jan 2026 14:19:46 +0800 Subject: [PATCH 246/248] [dev] feat(moe): Cherry-pick #1989 back to dev (#3011) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig Co-authored-by: oliver könig --- gpt_builders.py | 27 +- ...rimental_attention_variant_module_specs.py | 467 +++++++++++++-- megatron/core/models/gpt/gpt_layer_specs.py | 530 +++++++----------- megatron/core/ssm/gated_delta_net.py | 4 +- .../dot_product_attention_context_parallel.py | 3 + megatron/core/transformer/spec_utils.py | 1 + .../core/transformer/transformer_config.py | 43 +- megatron/training/arguments.py | 26 +- megatron/training/checkpointing.py | 8 +- megatron/training/training.py | 66 ++- .../test_modelopt_module_spec.py | 1 + tests/unit_tests/ssm/test_gated_delta_net.py | 33 +- .../unit_tests/transformer/test_attention.py | 43 +- 13 files changed, 749 insertions(+), 503 deletions(-) diff --git a/gpt_builders.py b/gpt_builders.py index 293475b06b6..0be64edaab6 100644 --- a/gpt_builders.py +++ b/gpt_builders.py @@ -10,7 +10,8 @@ get_gpt_decoder_layer_specs, ) from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( - is_linear_attention_variant, + get_transformer_block_with_experimental_attention_variant_spec, + get_transformer_layer_with_experimental_attention_variant_spec, ) from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import ( get_gpt_heterogeneous_layer_spec, @@ -46,7 +47,13 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_ else: use_te = args.transformer_impl == "transformer_engine" - if args.num_experts or is_linear_attention_variant(args.experimental_attention_variant): + if args.experimental_attention_variant is not None: + transformer_layer_spec = ( + get_transformer_block_with_experimental_attention_variant_spec( + config=config, vp_stage=vp_stage + ) + ) + elif args.num_experts: assert not (config.transformer_impl == "inference_optimized") # Define the decoder block spec transformer_layer_spec = get_gpt_decoder_block_spec( @@ -70,9 +77,19 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_ mtp_transformer_layer_spec = import_module(args.spec) else: # Define the decoder block spec - decoder_layer_specs = get_gpt_decoder_layer_specs( - config, use_transformer_engine=use_te, normalization=args.normalization, qk_l2_norm=args.qk_l2_norm, vp_stage=vp_stage - ) + if args.experimental_attention_variant is not None: + decoder_layer_specs = ( + get_transformer_layer_with_experimental_attention_variant_spec( + config=config + ) + ) + else: + decoder_layer_specs = get_gpt_decoder_layer_specs( + config, + use_transformer_engine=use_te, + normalization=args.normalization, + qk_l2_norm=args.qk_l2_norm, + ) mtp_transformer_layer_spec = decoder_layer_specs[-1] # Use spec of the last layer in decoder block as spec of the transformer layer in MTP mtp_block_spec = get_gpt_mtp_block_spec( diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py index e6d6fa03ce7..7649a0b2165 100644 --- a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py +++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py @@ -1,10 +1,11 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -from typing import Optional +from typing import List, Optional +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.models.backends import BackendSpecProvider from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules -from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.enums import AttnMaskType, LayerType from megatron.core.transformer.experimental_attention_variant.dsa import ( DSAIndexer, DSAIndexerSubmodules, @@ -17,19 +18,50 @@ MLASelfAttentionSubmodules, ) from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import ( + TransformerBlockSubmodules, + get_num_layers_to_build, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import ( + TransformerLayer, + TransformerLayerSubmodules, + get_transformer_layer_offset, +) +try: + import transformer_engine as te # type: ignore[import-untyped] # pylint: disable=unused-import -def is_linear_attention_variant(experimental_attention_variant: str) -> bool: - """Check if the experimental attention variant is a linear attention variant.""" - linear_attention_variants = ["gated_delta_net"] - return experimental_attention_variant in linear_attention_variants + from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import nvidia_kitchen # type: ignore[import-not-found] # pylint: disable=unused-import + + from megatron.core.extensions.kitchen import KitchenSpecProvider + HAVE_KITCHEN = True +except ImportError: + HAVE_KITCHEN = False -def get_gated_delta_net_module_spec_for_backend( - backend: BackendSpecProvider, normalization: Optional[str] = None + +########## +# Experimental Attention Variant Module Specs +########## + + +def get_gated_delta_net_module_spec( + config: TransformerConfig, backend: BackendSpecProvider = None ) -> ModuleSpec: - """Helper function to get module spec for Linear Attention""" - rms_norm = normalization == "RMSNorm" + """Build module spec for GatedDeltaNet attention.""" + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + rms_norm = config.normalization == "RMSNorm" attention = ModuleSpec( module=GatedDeltaNet, submodules=GatedDeltaNetSubmodules( @@ -43,27 +75,22 @@ def get_gated_delta_net_module_spec_for_backend( def get_dsa_module_spec_for_backend( - backend: BackendSpecProvider, - qk_layernorm: Optional[bool] = False, - qk_l2_norm: Optional[bool] = False, - multi_latent_attention: Optional[bool] = False, - mla_down_proj_use_column_parallel: Optional[bool] = False, - normalization: Optional[str] = None, - fallback_to_eager_attn: Optional[bool] = False, + config: TransformerConfig, backend: BackendSpecProvider = None ) -> ModuleSpec: """Helper function to get module spec for Sparse Attention.""" - assert multi_latent_attention, "Currently only MLA supports sparse attention." - assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - assert fallback_to_eager_attn is False, "Fallback to eager attention is not supported with DSA." + assert config.multi_latent_attention, "Currently only MLA supports sparse attention." + assert config.qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - linear_q_down_proj = ( - backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear() + linear_q_up_proj = ( + backend.column_parallel_layer_norm_linear() + if config.qk_layernorm + else backend.column_parallel_linear() ) - linear_kv_down_proj = ( - backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear() + linear_kv_up_proj = ( + backend.column_parallel_layer_norm_linear() + if config.qk_layernorm + else backend.column_parallel_linear() ) - linear_q_up_proj = backend.column_parallel_linear() - linear_kv_up_proj = backend.column_parallel_linear() # Because TransformerEngine does not support sparse attention yet, we use local # implementation whether the backend is TransformerEngine or not. @@ -82,23 +109,19 @@ def get_dsa_module_spec_for_backend( ), ) - # Adjust for RMS norm. - rms_norm = normalization == "RMSNorm" - qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) if qk_layernorm else IdentityOp - attention = ModuleSpec( module=MLASelfAttention, params={"attn_mask_type": AttnMaskType.causal}, submodules=MLASelfAttentionSubmodules( linear_q_proj=backend.column_parallel_linear(), - linear_q_down_proj=linear_q_down_proj, + linear_q_down_proj=backend.linear(), linear_q_up_proj=linear_q_up_proj, - linear_kv_down_proj=linear_kv_down_proj, + linear_kv_down_proj=backend.linear(), linear_kv_up_proj=linear_kv_up_proj, core_attention=core_attention, linear_proj=backend.row_parallel_linear(), - q_layernorm=qk_norm, - kv_layernorm=qk_norm, + q_layernorm=IdentityOp, + kv_layernorm=IdentityOp, ), metainfo={"fuse_input_layernorm": False}, ) @@ -106,33 +129,359 @@ def get_dsa_module_spec_for_backend( return attention -def get_experimental_attention_variant_module_spec_for_backend( - backend: BackendSpecProvider, - sharded_state_dict_keys_map: dict, - experimental_attention_variant: Optional[str] = None, - qk_layernorm: Optional[bool] = False, - qk_l2_norm: Optional[bool] = False, - multi_latent_attention: Optional[bool] = False, - mla_down_proj_use_column_parallel: Optional[bool] = False, - normalization: Optional[str] = None, - fallback_to_eager_attn: Optional[bool] = False, +def get_experimental_attention_variant_module_spec( + config: TransformerConfig, backend: BackendSpecProvider = None ) -> ModuleSpec: - """Helper function to get module spec for Attention""" - if experimental_attention_variant == "gated_delta_net": - return get_gated_delta_net_module_spec_for_backend( - backend=backend, normalization=normalization + """Helper function to get module spec for experimental attention variant""" + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + if config.experimental_attention_variant == "gated_delta_net": + return get_gated_delta_net_module_spec(config=config, backend=backend) + else: + raise ValueError( + f"Invalid experimental attention variant: {config.experimental_attention_variant}" ) - elif experimental_attention_variant == "dsa": - return get_dsa_module_spec_for_backend( - backend=backend, - qk_layernorm=qk_layernorm, - qk_l2_norm=qk_l2_norm, - multi_latent_attention=multi_latent_attention, - mla_down_proj_use_column_parallel=mla_down_proj_use_column_parallel, - normalization=normalization, - fallback_to_eager_attn=fallback_to_eager_attn, + + +########## +# Experimental GPT Decoder Block Spec +########## + + +def get_transformer_layer_with_experimental_attention_variant_spec( + config: TransformerConfig, backend: BackendSpecProvider = None +) -> List[ModuleSpec]: + """Build transformer layer specs with experimental attention variants (e.g., linear attention). + + This function is for constructing a heterogeneous transformer that supports mixing different + attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers. + **Note that, this API is a experimental API in the short term, and might be deprecated in the + future. In the long run, we will move to a new design that better support hybrid models.** + + Key Design: + 1. Attention and MLP patterns: The attention pattern and MLP pattern are orthogonal + and determined independently. This allows flexible combinations (e.g., linear attention + with MoE, or standard attention with dense MLP). + - Attention pattern: derived from `config.linear_attention_freq` or + `config.experimental_attention_variant`. + - MLP pattern: derived from `config.moe_layer_freq`. + + 2. Per-Layer Spec Construction: Iterates through layers, constructing transformer + layer specs based on attention and MLP patterns. + + Args: + config: Transformer configuration containing model hyperparameters and feature flags. + + Returns: + List[ModuleSpec] containing per-layer specs. + + Note: + Currently only supports transformer_engine backend. Kitchen backend can be used as a + wrapper with TE fallback for unsupported operations. + """ + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + # Get attention patterns and specs + experimental_attention_pattern = [0] * config.num_layers + if is_linear_attention_variant(config.experimental_attention_variant): + experimental_attention_pattern = get_linear_attention_pattern(config=config) + elif config.experimental_attention_variant is not None: + experimental_attention_pattern = [1] * config.num_layers + + if 1 in experimental_attention_pattern: + experimental_attention_spec = get_experimental_attention_variant_module_spec( + config=config, backend=backend + ) + else: + experimental_attention_spec = None + + if 0 in experimental_attention_pattern: + standard_attention_spec = _get_self_attention_module_spec(config=config, backend=backend) + else: + standard_attention_spec = None + + # Get MLP patterns and specs + if config.num_moe_experts is not None: + moe_layer_pattern = get_moe_layer_pattern(config=config) + else: + moe_layer_pattern = [0] * config.num_layers + + if 1 in moe_layer_pattern: + moe_layer_spec = _get_moe_module_spec(config=config, backend=backend) + else: + moe_layer_spec = None + + if 0 in moe_layer_pattern: + dense_mlp_layer_spec = _get_dense_mlp_module_spec(config=config, backend=backend) + else: + dense_mlp_layer_spec = None + + # Get GPT decoder block layer specs + rms_norm = config.normalization == "RMSNorm" + layer_specs = [] + for layer_number in range(config.num_layers): + attention = ( + experimental_attention_spec + if experimental_attention_pattern[layer_number] == 1 + else standard_attention_spec + ) + mlp = moe_layer_spec if moe_layer_pattern[layer_number] == 1 else dense_mlp_layer_spec + input_layernorm = ( + IdentityOp + if attention.metainfo["fuse_input_layernorm"] + else backend.layer_norm(rms_norm=rms_norm, for_qk=False) + ) + pre_mlp_layernorm = ( + IdentityOp + if mlp.metainfo["fuse_pre_mlp_layernorm"] + else backend.layer_norm(rms_norm=rms_norm, for_qk=False) + ) + + layer_specs.append( + ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=input_layernorm, + self_attention=attention, + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=pre_mlp_layernorm, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + ) + + return layer_specs + + +def get_transformer_block_with_experimental_attention_variant_spec( + config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None +) -> TransformerBlockSubmodules: + """Build transformer block spec with experimental attention variants (e.g., linear attention). + + This function constructs a heterogeneous transformer block that supports mixing different + attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers. + **Note that, this API is a experimental API in the short term, and might be deprecated in the + future. In the long run, we will move to a new design that better support hybrid models.** + + Constructing transformer layer specs by + `get_transformer_layer_with_experimental_attention_variant_spec` and then slicing the + layer specs to only include the layers that are built in this pipeline stage. + + Args: + config: Transformer configuration containing model hyperparameters and feature flags. + vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism. + pp_rank: Pipeline model parallel rank. + + Returns: + TransformerBlockSubmodules containing per-layer specs and final layer norm. + + Note: + Currently only supports transformer_engine backend. Kitchen backend can be used as a + wrapper with TE fallback for unsupported operations. + """ + + backend = _get_backend_spec_provider(config=config) + + layer_specs = get_transformer_layer_with_experimental_attention_variant_spec( + config=config, backend=backend + ) + + # Slice the layer specs to only include the layers that are built in this pipeline stage. + if config.pipeline_model_parallel_layout is not None: + local_layer_ids = config.pipeline_model_parallel_layout.get_layer_id_list( + layer_type=LayerType.decoder, vp_stage=vp_stage, pp_rank=pp_rank + ) + else: + offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank) + num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank) + local_layer_ids = range(offset, offset + num_layers_to_build) + + layer_specs = [layer_specs[layer_id] for layer_id in local_layer_ids] + + # Get GPT decoder block spec + rms_norm = config.normalization == "RMSNorm" + gpt_decoder_block_spec = TransformerBlockSubmodules( + layer_specs=layer_specs, layer_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False) + ) + + return gpt_decoder_block_spec + + +########## +# Utilities +########## + + +def is_linear_attention_variant(experimental_attention_variant: Optional[str]) -> bool: + """Check if the experimental attention variant is a linear attention variant.""" + linear_attention_variants = ["gated_delta_net"] + return experimental_attention_variant in linear_attention_variants + + +def get_moe_layer_pattern(config: TransformerConfig) -> List[int]: + """Parse config.moe_layer_freq to get per-layer MoE pattern (1=MoE, 0=dense). + + - int N: one MoE layer every N layers (e.g., N=2 -> [1,0,1,0,...]) + - list: use directly as the pattern.""" + + if isinstance(config.moe_layer_freq, int): + # [1,0,0,...,0,1,0,0,...,0,...] + moe_layer_pattern = [ + 1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.num_layers) + ] + elif isinstance(config.moe_layer_freq, list): + moe_layer_pattern = config.moe_layer_freq + assert len(moe_layer_pattern) == config.num_layers, ( + f"Invalid length of moe_layer_pattern: {len(moe_layer_pattern)}, " + f"expected {config.num_layers}, " + f"current moe layer pattern: {config.moe_layer_freq}" ) else: raise ValueError( - f"Invalid experimental attention variant: {experimental_attention_variant}" + f"Invalid moe_layer_freq: {type(config.moe_layer_freq)}, {config.moe_layer_freq}" + ) + return moe_layer_pattern + + +def get_linear_attention_pattern(config: TransformerConfig) -> List[int]: + """Parse config.linear_attention_freq to get per-layer attention pattern (1=LA, 0=SDPA). + + - int N: one SDPA layer every N layers (e.g., N=4 -> [1,1,1,0,1,1,1,0,...]) + - list: use directly as the pattern.""" + + if isinstance(config.linear_attention_freq, int): + linear_attention_pattern = [ + # [1,1,...,1,0,1,1,...,1,0,...] + 0 if ((i + 1) % config.linear_attention_freq == 0) else 1 + for i in range(config.num_layers) + ] + elif isinstance(config.linear_attention_freq, list): + linear_attention_pattern = config.linear_attention_freq + assert len(linear_attention_pattern) == config.num_layers, ( + f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, " + f"expected {config.num_layers}, " + f"current linear attention pattern: {config.linear_attention_freq}" + ) + elif config.linear_attention_freq is None: + if not is_linear_attention_variant(config.experimental_attention_variant): + linear_attention_pattern = [0] * config.num_layers + else: + # This should be caught by config validation, but raise here as a safety check + raise ValueError( + f"Linear attention type {config.experimental_attention_variant} is specified " + "but linear_attention_freq is None. " + "Please set linear_attention_freq to specify the LA/SDPA layer pattern." + ) + else: + raise ValueError( + f"Invalid linear_attention_freq: {type(config.linear_attention_freq)}," + f" {config.linear_attention_freq}" + ) + return linear_attention_pattern + + +def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpecProvider: + """Get backend spec provider for experimental attention variant.""" + + assert config.transformer_impl == "transformer_engine", ( + "Experimental GPT decoder block spec only supports " + "transformer engine implementation for now." + ) + backend: BackendSpecProvider = ( + KitchenSpecProvider( + fallback=TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn), + use_kitchen_attention=config.use_kitchen_attention, + kitchen_attention_backend=config.kitchen_attention_backend, ) + if config.use_kitchen + else TESpecProvider() + ) + return backend + + +########## +# Spec functions for non-experimental self attention and MLP layer. +########## + + +def _get_self_attention_module_spec( + config: TransformerConfig, backend: BackendSpecProvider = None +) -> ModuleSpec: + """Get non-experimental self-attention module spec. + For hybrid models that mix experimental and non-experimental attention architectures. + + Warning: This function may be deprecated in the future.""" + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=config.num_moe_experts, + moe_grouped_gemm=config.moe_grouped_gemm, + qk_layernorm=config.qk_layernorm, + multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + qk_l2_norm=config.qk_l2_norm, + use_kitchen=config.use_kitchen, + use_te_activation_func=config.use_te_activation_func, + fallback_to_eager_attn=config.fallback_to_eager_attn, + use_kitchen_attention=config.use_kitchen_attention, + kitchen_attention_backend=config.kitchen_attention_backend, + ) + attn_spec = layer_spec.submodules.self_attention + if config.multi_latent_attention: + attn_spec.metainfo["fuse_input_layernorm"] = False + else: + attn_spec.metainfo["fuse_input_layernorm"] = backend.fuse_layernorm_and_linear() + + return attn_spec + + +def _get_dense_mlp_module_spec( + config: TransformerConfig, backend: BackendSpecProvider = None +) -> ModuleSpec: + """Get dense MLP module spec. + For hybrid models that mix dense MLP and experimental attention architectures. + + Warning: This function may be deprecated in the future.""" + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec_for_backend + + mlp_spec = get_mlp_module_spec_for_backend(backend=backend, num_experts=None) + mlp_spec.metainfo["fuse_pre_mlp_layernorm"] = backend.fuse_layernorm_and_linear() + + return mlp_spec + + +def _get_moe_module_spec( + config: TransformerConfig, backend: BackendSpecProvider = None +) -> ModuleSpec: + """Get MoE module spec. + For hybrid models that mix MoE and experimental attention architectures. + + Warning: This function may be deprecated in the future.""" + + if backend is None: + backend = _get_backend_spec_provider(config=config) + + from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend + + moe_spec = get_moe_module_spec_for_backend( + backend=backend, + num_experts=config.num_moe_experts, + moe_grouped_gemm=config.moe_grouped_gemm, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + use_te_activation_func=config.use_te_activation_func, + ) + moe_spec.metainfo["fuse_pre_mlp_layernorm"] = False + return moe_spec diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index 1db3b939530..70f0a8244ca 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -9,13 +9,8 @@ InferenceSpecProvider, LocalSpecProvider, ) -from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( - get_experimental_attention_variant_module_spec_for_backend, - is_linear_attention_variant, -) from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType, LayerType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules @@ -45,7 +40,7 @@ from megatron.core.utils import is_te_min_version try: - import transformer_engine as te # type: ignore[import-untyped] # pylint: disable=unused-import + import transformer_engine as te # pylint: disable=unused-import from megatron.core.extensions.transformer_engine import TEFusedMLP, TENorm from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider @@ -55,7 +50,7 @@ HAVE_TE = False try: - import nvidia_kitchen # type: ignore[import-not-found] # pylint: disable=unused-import + import nvidia_kitchen # pylint: disable=unused-import from megatron.core.extensions.kitchen import KitchenSpecProvider @@ -64,7 +59,7 @@ HAVE_KITCHEN = False try: - import apex # type: ignore[import-untyped] # pylint: disable=unused-import + import apex # pylint: disable=unused-import from megatron.core.fusions.fused_layer_norm import FusedLayerNorm @@ -181,10 +176,8 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - experimental_attention_variant: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, - normalization: Optional[str] = None, qk_l2_norm: Optional[bool] = False, use_te_op_fuser: Optional[bool] = False, use_kitchen: bool = False, @@ -200,15 +193,10 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. - experimental_attention_variant (str, optional): The type of experimental attention variant. - Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. - normalization (str, optional): The normalization to use. Defaults to None. qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. - use_kitchen (bool, optional): To use KitchenSpecProvider. Defaults to False. use_te_op_fuser (bool, optional): Use Transformer Engine's operation-based API, which may enable certain operation fusions. Defaults to False. @@ -236,23 +224,8 @@ def get_gpt_layer_with_transformer_engine_spec( else: backend = TESpecProvider(fallback_to_eager_attn=fallback_to_eager_attn) - sharded_state_dict_keys_map = {} - - attention = get_attention_module_spec_for_backend( - backend=backend, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, - experimental_attention_variant=experimental_attention_variant, - qk_layernorm=qk_layernorm, - qk_l2_norm=qk_l2_norm, - multi_latent_attention=multi_latent_attention, - mla_down_proj_use_column_parallel=False, - normalization=normalization, - fallback_to_eager_attn=fallback_to_eager_attn, - ) - mlp = get_mlp_module_spec_for_backend( backend=backend, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, @@ -260,13 +233,77 @@ def get_gpt_layer_with_transformer_engine_spec( use_te_activation_func=use_te_activation_func, ) - return get_transformer_layer_spec_for_backend( - backend=backend, - attention=attention, - mlp=mlp, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, - normalization=normalization, - ) + if multi_latent_attention: + assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." + linear_q_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm + else backend.column_parallel_linear() + ) + linear_kv_up_proj = ( + backend.column_parallel_layer_norm_linear() + if qk_layernorm + else backend.column_parallel_linear() + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=backend.layer_norm(), + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=backend.column_parallel_linear(), + linear_q_down_proj=backend.linear(), + linear_q_up_proj=linear_q_up_proj, + linear_kv_down_proj=backend.linear(), + linear_kv_up_proj=linear_kv_up_proj, + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=IdentityOp, + kv_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + else: + qk_norm = backend.layer_norm(for_qk=True) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=backend.column_parallel_layer_norm_linear(), + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + k_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", + "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", + "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", + "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", + "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", + "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", + }, + ), + ) def get_gpt_layer_local_spec( @@ -274,7 +311,6 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - experimental_attention_variant: Optional[str] = None, fp8: Optional[str] = None, # pylint: disable=unused-argument moe_use_legacy_grouped_gemm: Optional[bool] = False, normalization: Optional[str] = None, @@ -290,15 +326,10 @@ def get_gpt_layer_local_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False. - experimental_attention_variant (str, optional): The type of experimental attention variant. - Defaults to None. fp8 (str, optional): Deprecated. For temporary Nemo compatibility. moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. Defaults to False. - normalization (str, optional): The normalization to use. Defaults to None. qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False. - use_kitchen (bool, optional): To use KitchenSpecProvider. Defaults to False. Returns: ModuleSpec: Module specification with Megatron-Core modules @@ -313,6 +344,13 @@ def get_gpt_layer_local_spec( ) else: backend = LocalSpecProvider() + # Adjust for RMS norm. + if normalization == "RMSNorm": + layer_norm = backend.layer_norm(rms_norm=True, for_qk=False) + qk_norm = backend.layer_norm(rms_norm=True, for_qk=True) + else: + layer_norm = backend.layer_norm(rms_norm=False, for_qk=False) + qk_norm = backend.layer_norm(rms_norm=False, for_qk=True) if fp8 is not None: warnings.warn( @@ -320,25 +358,6 @@ def get_gpt_layer_local_spec( " and will be removed soon. Please update your code accordingly." ) - if experimental_attention_variant is not None: - raise NotImplementedError( - "Experimental attention variant is not supported with local spec yet." - ) - - sharded_state_dict_keys_map = {} - - attention = get_attention_module_spec_for_backend( - backend=backend, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, - experimental_attention_variant=experimental_attention_variant, - qk_layernorm=qk_layernorm, - qk_l2_norm=qk_l2_norm, - multi_latent_attention=multi_latent_attention, - mla_down_proj_use_column_parallel=True, - normalization=normalization, - fallback_to_eager_attn=False, - ) - mlp = get_mlp_module_spec_for_backend( backend=backend, num_experts=num_experts, @@ -346,170 +365,63 @@ def get_gpt_layer_local_spec( moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) - return get_transformer_layer_spec_for_backend( - backend=backend, - attention=attention, - mlp=mlp, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, - normalization=normalization, - ) - - -def get_transformer_layer_spec_for_backend( - backend: BackendSpecProvider, - attention: ModuleSpec, - mlp: ModuleSpec, - sharded_state_dict_keys_map: Optional[dict] = None, - normalization: Optional[str] = None, -) -> ModuleSpec: - """Helper function to get module spec for TransformerLayer""" - - rms_norm = normalization == "RMSNorm" - - input_layernorm = ( - IdentityOp - if attention.metainfo["fuse_input_layernorm"] - else backend.layer_norm(rms_norm=rms_norm, for_qk=False) - ) - pre_mlp_layernorm = ( - IdentityOp - if mlp.metainfo["fuse_pre_mlp_layernorm"] - else backend.layer_norm(rms_norm=rms_norm, for_qk=False) - ) - - transformer_layer = ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=input_layernorm, - self_attention=attention, - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=pre_mlp_layernorm, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map=sharded_state_dict_keys_map, - ), - ) - return transformer_layer - - -def get_attention_module_spec_for_backend( - backend: BackendSpecProvider, - sharded_state_dict_keys_map: dict, - experimental_attention_variant: Optional[str] = None, - qk_layernorm: Optional[bool] = False, - qk_l2_norm: Optional[bool] = False, - multi_latent_attention: Optional[bool] = False, - mla_down_proj_use_column_parallel: Optional[bool] = False, - normalization: Optional[str] = None, - fallback_to_eager_attn: Optional[bool] = False, -) -> ModuleSpec: - """Helper function to get module spec for Attention""" - - if experimental_attention_variant is not None: - return get_experimental_attention_variant_module_spec_for_backend( - backend, - sharded_state_dict_keys_map, - experimental_attention_variant, - qk_layernorm, - qk_l2_norm, - multi_latent_attention, - mla_down_proj_use_column_parallel, - normalization, - fallback_to_eager_attn, - ) - - # Adjust for RMS norm. - rms_norm = normalization == "RMSNorm" - qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) - - core_attention = backend.core_attention() if not fallback_to_eager_attn else DotProductAttention if multi_latent_attention: assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA." - linear_q_down_proj = ( - backend.column_parallel_linear() - if mla_down_proj_use_column_parallel - else backend.linear() - ) - linear_kv_down_proj = ( - backend.column_parallel_linear() - if mla_down_proj_use_column_parallel - else backend.linear() - ) - linear_q_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm and backend.fuse_layernorm_and_linear() - else backend.column_parallel_linear() - ) - linear_kv_up_proj = ( - backend.column_parallel_layer_norm_linear() - if qk_layernorm and backend.fuse_layernorm_and_linear() - else backend.column_parallel_linear() - ) - qk_norm = ( - backend.layer_norm(rms_norm=rms_norm, for_qk=True) - if qk_layernorm and not backend.fuse_layernorm_and_linear() - else IdentityOp - ) - attention = ModuleSpec( - module=MLASelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=MLASelfAttentionSubmodules( - linear_q_proj=backend.column_parallel_linear(), - linear_q_down_proj=linear_q_down_proj, - linear_q_up_proj=linear_q_up_proj, - linear_kv_down_proj=linear_kv_down_proj, - linear_kv_up_proj=linear_kv_up_proj, - core_attention=core_attention, - linear_proj=backend.row_parallel_linear(), - q_layernorm=qk_norm, - kv_layernorm=qk_norm, + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=layer_norm, + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=backend.column_parallel_linear(), + linear_q_down_proj=backend.column_parallel_linear(), + linear_q_up_proj=backend.column_parallel_linear(), + linear_kv_down_proj=backend.column_parallel_linear(), + linear_kv_up_proj=backend.column_parallel_linear(), + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=qk_norm if qk_layernorm else IdentityOp, + kv_layernorm=qk_norm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=layer_norm, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - metainfo={"fuse_input_layernorm": False}, ) else: - linear_qkv = ( - backend.column_parallel_layer_norm_linear() - if backend.fuse_layernorm_and_linear() - else backend.column_parallel_linear() - ) - if qk_l2_norm: - qk_norm = L2Norm - elif qk_layernorm: - qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) - else: - qk_norm = IdentityOp - attention = ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=linear_qkv, - core_attention=core_attention, - linear_proj=backend.row_parallel_linear(), - q_layernorm=qk_norm, - k_layernorm=qk_norm, - ), - metainfo={"fuse_input_layernorm": backend.fuse_layernorm_and_linear()}, - ) - if backend.fuse_layernorm_and_linear(): - sharded_state_dict_keys_map.update( - { - "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight", - "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias", - "mlp.1.basic_ops.0.weight": "mlp.linear_fc1.weight", - "mlp.1.basic_ops.1.bias": "mlp.linear_fc1.bias", - "mlp.3.basic_ops.0.weight": "mlp.linear_fc2.weight", - "mlp.3.basic_ops.1.bias": "mlp.linear_fc2.bias", - } - ) - else: - sharded_state_dict_keys_map.update( - { + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=layer_norm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=backend.column_parallel_linear(), + core_attention=backend.core_attention(), + linear_proj=backend.row_parallel_linear(), + q_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + k_layernorm=( + L2Norm if qk_l2_norm else (qk_norm if qk_layernorm else IdentityOp) + ), + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=layer_norm, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ "input_layernorm.": "self_attention.linear_qkv.layer_norm_", "pre_mlp_layernorm.": "mlp.linear_fc1.layer_norm_", - } - ) - - return attention + }, + ), + ) def _get_mlp_module_spec( @@ -568,7 +480,6 @@ def get_mlp_module_spec( def get_mlp_module_spec_for_backend( backend: BackendSpecProvider, - sharded_state_dict_keys_map: Optional[dict] = None, num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, moe_use_legacy_grouped_gemm: Optional[bool] = False, @@ -586,16 +497,13 @@ def get_mlp_module_spec_for_backend( if backend.fuse_layernorm_and_linear(): linear_fc1 = backend.column_parallel_layer_norm_linear() assert linear_fc1 is not None - fuse_pre_mlp_layernorm = True else: linear_fc1 = backend.column_parallel_linear() - fuse_pre_mlp_layernorm = False return ModuleSpec( module=module, submodules=MLPSubmodules( linear_fc1=linear_fc1, linear_fc2=linear_fc2, activation_func=activation_func ), - metainfo={"fuse_pre_mlp_layernorm": fuse_pre_mlp_layernorm}, ) else: # Mixture of experts with modules in megatron core. @@ -613,76 +521,61 @@ def get_gpt_decoder_layer_specs( use_transformer_engine: bool, normalization: Optional[str] = None, qk_l2_norm: Optional[bool] = False, - vp_stage: Optional[int] = None, - pp_rank: Optional[int] = None, ) -> TransformerBlockSubmodules: - """Helper function to get GPT block spec. - - Return a list of transformer layer spec of the current pipeline stage.""" - - get_layer_spec_kwargs = { - "qk_layernorm": config.qk_layernorm, - "moe_use_legacy_grouped_gemm": config.moe_use_legacy_grouped_gemm, - "qk_l2_norm": qk_l2_norm, - "use_kitchen": config.use_kitchen, - "normalization": normalization, - "use_kitchen_attention": config.use_kitchen_attention, - "kitchen_attention_backend": config.kitchen_attention_backend, - } + """GPT block spec.""" + assert config.experimental_attention_variant is None, ( + "Experimental attention variant is not supported with get_gpt_decoder_layer_specs, " + f"but got {config.experimental_attention_variant=}." + ) + if use_transformer_engine: - layer_norm_impl = TENorm - get_layer_spec_kwargs["use_te_activation_func"] = config.use_te_activation_func - get_layer_spec_kwargs['fallback_to_eager_attn'] = config.fallback_to_eager_attn - get_layer_spec_fn = get_gpt_layer_with_transformer_engine_spec + dense_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=None, + moe_grouped_gemm=False, + qk_layernorm=config.qk_layernorm, + multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + qk_l2_norm=qk_l2_norm, + use_kitchen=config.use_kitchen, + use_te_activation_func=config.use_te_activation_func, + ) + moe_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=config.num_moe_experts, + moe_grouped_gemm=config.moe_grouped_gemm, + qk_layernorm=config.qk_layernorm, + multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + qk_l2_norm=qk_l2_norm, + use_kitchen=config.use_kitchen, + use_te_activation_func=config.use_te_activation_func, + ) else: - layer_norm_impl = LNImpl - get_layer_spec_fn = get_gpt_layer_local_spec - - layer_spec_dict = {} - for mlp_type in ["dense", "moe"]: - for attention_type in ["softmax_attention", "linear_attention"]: - if mlp_type == "moe": - if config.moe_layer_freq is None: - # Skip if there is no MoE layer in the model. - continue - num_experts = config.num_moe_experts - moe_grouped_gemm = config.moe_grouped_gemm - else: - num_experts = None - moe_grouped_gemm = None - if attention_type == "linear_attention": - multi_latent_attention = None - if is_linear_attention_variant(config.experimental_attention_variant): - # There exists linear attention layer in the model. - experimental_attention_variant = config.experimental_attention_variant - else: - # Skip if there is no linear attention layer in the model. - continue - else: - multi_latent_attention = config.multi_latent_attention - if is_linear_attention_variant(config.experimental_attention_variant): - # experimental_attention_variant is a linear attention variant, - # so softmax attention is regular attention layer. - experimental_attention_variant = None - else: - # Softmax attention is an experimental attention variant. - experimental_attention_variant = config.experimental_attention_variant - - layer_spec_key = f"{mlp_type}_{attention_type}" - layer_spec_dict[layer_spec_key] = get_layer_spec_fn( - num_experts=num_experts, - moe_grouped_gemm=moe_grouped_gemm, - multi_latent_attention=multi_latent_attention, - experimental_attention_variant=experimental_attention_variant, - **get_layer_spec_kwargs, - ) + dense_layer_spec = get_gpt_layer_local_spec( + num_experts=None, + moe_grouped_gemm=False, + qk_layernorm=config.qk_layernorm, + multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + normalization=normalization, + qk_l2_norm=qk_l2_norm, + use_kitchen=config.use_kitchen, + ) + moe_layer_spec = get_gpt_layer_local_spec( + num_experts=config.num_moe_experts, + moe_grouped_gemm=config.moe_grouped_gemm, + qk_layernorm=config.qk_layernorm, + multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, + normalization=normalization, + qk_l2_norm=qk_l2_norm, + use_kitchen=config.use_kitchen, + ) # Parse config.moe_layer_freq to determine the pattern of expert/dense layers. # 0 stands for dense layers, 1 stands for expert layers. # For integer N: Creates a pattern with one expert layer every N layers. # For string pattern: Evaluates the str directly (e.g. "[1,0,1]" for alternating expert/dense). if isinstance(config.moe_layer_freq, int): - # [1,0,0,...,0,1,0,0,...,0,...] moe_layer_pattern = [ 1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.num_layers) ] @@ -698,50 +591,15 @@ def get_gpt_decoder_layer_specs( f"Invalid moe_layer_freq: {type(config.moe_layer_freq)}, {config.moe_layer_freq}" ) - # Parse config.linear_attention_freq to determine the pattern of expert/dense layers. - # 0 stands for SDPA layers, 1 stands for LA layers. - # For integer N: Creates a pattern with (N-1) LA layers and 1 SDPA layer every N layers. - # For string pattern: Evaluates the str directly (e.g. "[1,0,1]" for alternating LA/SDPA). - if isinstance(config.linear_attention_freq, int): - linear_attention_pattern = [ - # [1,1,...,1,0,1,1,...,1,0,...] - 0 if ((i + 1) % config.linear_attention_freq == 0) else 1 - for i in range(config.num_layers) - ] - elif isinstance(config.linear_attention_freq, list): - linear_attention_pattern = config.linear_attention_freq - assert len(linear_attention_pattern) == config.num_layers, ( - f"Invalid length of linear_attention_pattern: {len(linear_attention_pattern)}, " - f"expected {config.num_layers}, " - f"current linear attention pattern: {config.linear_attention_freq}" - ) - elif config.linear_attention_freq is None: - if not is_linear_attention_variant(config.experimental_attention_variant): - linear_attention_pattern = [0] * config.num_layers - else: - linear_attention_pattern = [1] * config.num_layers - warnings.warn( - f"Linear attention type {config.experimental_attention_variant} is specified " - "but linear_attention_freq is None. " - "Setting linear_attention_pattern to [1] * config.num_layers as default." - ) - else: - raise ValueError( - f"Invalid linear_attention_freq: {type(config.linear_attention_freq)}," - f" {config.linear_attention_freq}" - ) - # Create the layer specs for the model. layer_specs = [] for layer_number in range(config.num_layers): - mlp_type = "moe" if moe_layer_pattern[layer_number] else "dense" - attention_type = ( - "linear_attention" if linear_attention_pattern[layer_number] else "softmax_attention" - ) - layer_spec_key = f"{mlp_type}_{attention_type}" - if layer_spec_key not in layer_spec_dict: - raise ValueError(f"Invalid layer spec key: {layer_spec_key}") - layer_specs.append(layer_spec_dict[layer_spec_key]) + if moe_layer_pattern[layer_number] == 1: + layer_specs.append(moe_layer_spec) + elif moe_layer_pattern[layer_number] == 0: + layer_specs.append(dense_layer_spec) + else: + raise ValueError(f"Invalid layer pattern: {moe_layer_pattern}") return layer_specs @@ -758,13 +616,16 @@ def get_gpt_decoder_block_spec( layer_specs = get_gpt_decoder_layer_specs( config, use_transformer_engine, normalization, qk_l2_norm ) + # Slice the layer specs to only include the layers that are built in this pipeline stage. # Note: MCore layer_number starts at 1 num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank) if config.pipeline_model_parallel_layout is not None: layout = config.pipeline_model_parallel_layout - assert isinstance(layout, PipelineParallelLayerLayout) + assert isinstance( + layout, PipelineParallelLayerLayout + ), f"Invalid pipeline model parallel layout: {layout}" local_layer_specs = [ layer_specs[layer_id] for layer_id in layout.get_layer_id_list( @@ -775,11 +636,11 @@ def get_gpt_decoder_block_spec( offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank) local_layer_specs = layer_specs[offset : offset + num_layers_to_build] + # Block spec. if use_transformer_engine: layer_norm_impl = TENorm else: layer_norm_impl = LNImpl - # Block spec. block_spec = TransformerBlockSubmodules( layer_specs=local_layer_specs, layer_norm=layer_norm_impl ) @@ -796,22 +657,17 @@ def get_gpt_mtp_block_spec( ) -> MultiTokenPredictionBlockSubmodules: """GPT Multi-Token Prediction (MTP) block spec.""" if use_transformer_engine: - backend: BackendSpecProvider = ( - KitchenSpecProvider( + if config.use_kitchen: + backend: BackendSpecProvider = KitchenSpecProvider( fallback=TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn), use_kitchen_attention=config.use_kitchen_attention, kitchen_attention_backend=config.kitchen_attention_backend, ) - if config.use_kitchen - else TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn) - ) + else: + backend = TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn) else: backend = ( - KitchenSpecProvider( - fallback=LocalSpecProvider(), - use_kitchen_attention=config.use_kitchen_attention, - kitchen_attention_backend=config.kitchen_attention_backend, - ) + KitchenSpecProvider(fallback=LocalSpecProvider()) if config.use_kitchen else LocalSpecProvider() ) diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py index a08d043bdb3..16dc3a79ebb 100644 --- a/megatron/core/ssm/gated_delta_net.py +++ b/megatron/core/ssm/gated_delta_net.py @@ -104,7 +104,9 @@ def __init__( """ if not HAVE_FLA: - raise ImportError("FLA is not installed. Please install it with `pip install fla`.") + raise ImportError( + "FLA is not installed. Please install it with `pip install flash-linear-attention`." + ) super().__init__(config) diff --git a/megatron/core/transformer/dot_product_attention_context_parallel.py b/megatron/core/transformer/dot_product_attention_context_parallel.py index 89659a1d743..aaf08d40ade 100644 --- a/megatron/core/transformer/dot_product_attention_context_parallel.py +++ b/megatron/core/transformer/dot_product_attention_context_parallel.py @@ -185,6 +185,9 @@ def forward(ctx, q, k, v, attention_mask, attention_dropout, softmax_scale, pg): comm.all_gather(kv_buffer_copy[1], v_0) # Prepare attention bias + assert ( + attention_mask is not None + ), "Attention mask is required for the native attention function with context parallelism" attn_bias = to_zz_mask_attn_bias( attention_mask, cp_size, nheads, nheads_k, heads_k_stride, q.device, q.dtype ) diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py index 24df1add0eb..dbd2e08bccb 100644 --- a/megatron/core/transformer/spec_utils.py +++ b/megatron/core/transformer/spec_utils.py @@ -46,6 +46,7 @@ def import_module(module_path: Tuple[str]): return vars(module)[name] +# pylint: disable=missing-function-docstring def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs): """Retrieve the module class or function specified by a ModuleSpec or return it as is if already provided. diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index 875d8a92049..8f5462ff55b 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -194,6 +194,9 @@ class TransformerConfig(ModelParallelConfig): qk_layernorm: bool = False """Whether to apply `normalization` type of normalization to the query and key embeddings.""" + qk_l2_norm: bool = False + """Whether to apply llama 4-style qk L2 norm.""" + qk_clip: bool = False """Whether to clip the query and key weights. Needed for Muon MLA Model training.""" @@ -234,7 +237,26 @@ class TransformerConfig(ModelParallelConfig): """Type of attention variant to use. Currently support gated_delta_net and dsa.""" #################### - # attention variant: gated_delta_net + # DSA + #################### + dsa_indexer_n_heads: Optional[int] = None + """Number of DSA indexer heads.""" + + dsa_indexer_head_dim: Optional[int] = None + """Dimension per DSA indexer head.""" + + dsa_indexer_topk: Optional[int] = None + """Number of top-k tokens to select in DSA indexer.""" + + dsa_indexer_loss_coeff: Optional[float] = None + """Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss.""" + + dsa_indexer_use_sparse_loss: Optional[bool] = None + """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the + top-k indices.""" + + #################### + # linear attention #################### linear_attention_type: Optional[str] = None """Type of linear attention to use. @@ -262,25 +284,6 @@ class TransformerConfig(ModelParallelConfig): linear_num_value_heads: Optional[int] = None """Number of value and gate heads for the gated delta net.""" - #################### - # attention variant: dsa - #################### - dsa_indexer_n_heads: Optional[int] = None - """Number of DSA indexer heads.""" - - dsa_indexer_head_dim: Optional[int] = None - """Dimension per DSA indexer head.""" - - dsa_indexer_topk: Optional[int] = None - """Number of top-k tokens to select in DSA indexer.""" - - dsa_indexer_loss_coeff: Optional[float] = None - """Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss.""" - - dsa_indexer_use_sparse_loss: Optional[bool] = None - """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the - top-k indices.""" - #################### # initialization #################### diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index c85228e1136..027449b1729 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2467,7 +2467,6 @@ def _add_training_args(parser): 'which only ensures bitwise identical results when the same inputs are processed in the same batch configuration. ' 'This will significantly affect speed of training and inference as the kernels are not full optimized.') - return parser @@ -3454,7 +3453,17 @@ def _add_experimental_attention_variant_args(parser): group = parser.add_argument_group(title="experimental_attention_variant") group.add_argument('--experimental-attention-variant', default=None, choices=['gated_delta_net', 'dsa'], type=str, help='Type of attention variant to use. Currently support gated_delta_net and dsa.') - + # DSA + group.add_argument('--dsa-indexer-n-heads', default=None, type=int, + help='Number of indexer heads for sparse attention. If not set, defaults to num-attention-heads.') + group.add_argument('--dsa-indexer-head-dim', default=None, type=int, + help='Dimension per indexer head for sparse attention. If not set, defaults to kv-channels.') + group.add_argument('--dsa-indexer-topk', default=None, type=int, + help='Number of top-k tokens to select in sparse attention indexer.') + group.add_argument('--dsa-indexer-loss-coeff', default=0.0, type=float, + help='Coefficient for the indexer KL divergence loss. Set to 0 to disable indexer loss.') + group.add_argument('--dsa-indexer-use-sparse-loss', action='store_true', + help='Use sparse indexer loss. If set, the indexer loss will be computed using the top-k indices.') # Linear attention group.add_argument('--linear-attention-type', default=None, choices=['gated_delta_net'], type=str, help='(Deprecated, use --experimental-attention-variant instead) Type of linear attention to use. Currently support gated_delta_net.') @@ -3477,19 +3486,6 @@ def _add_experimental_attention_variant_args(parser): help='Number of query and key heads for the gated delta net.') group.add_argument('--linear-num-value-heads', default=32, type=int, help='Number of value and gate heads for the gated delta net.') - - # DSA - group.add_argument('--dsa-indexer-n-heads', default=None, type=int, - help='Number of indexer heads for sparse attention. If not set, defaults to num-attention-heads.') - group.add_argument('--dsa-indexer-head-dim', default=None, type=int, - help='Dimension per indexer head for sparse attention. If not set, defaults to kv-channels.') - group.add_argument('--dsa-indexer-topk', default=None, type=int, - help='Number of top-k tokens to select in sparse attention indexer.') - group.add_argument('--dsa-indexer-loss-coeff', default=0.0, type=float, - help='Coefficient for the indexer KL divergence loss. Set to 0 to disable indexer loss.') - group.add_argument('--dsa-indexer-use-sparse-loss', action='store_true', - help='Use sparse indexer loss. If set, the indexer loss will be computed using the top-k indices.') - return parser def _add_heterogeneous_args(parser): diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index 77b17b07e13..f7ff7cd2775 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1472,13 +1472,13 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', ckpt_args = state_dict.get("args") if not hasattr(ckpt_args, "tensor_model_parallel_size"): - print_rank_0("WARNING: TP size not found in checkpoint args, using 0 as default.") + print_rank_0("WARNING: TP size not found in checkpoint args, using 1 as default.") if not hasattr(ckpt_args, "pipeline_model_parallel_size"): - print_rank_0("WARNING: PP size not found in checkpoint args, using 0 as default.") + print_rank_0("WARNING: PP size not found in checkpoint args, using 1 as default.") ckpt_tp_pp = ( - getattr(ckpt_args, "tensor_model_parallel_size", 0), - getattr(ckpt_args, "pipeline_model_parallel_size", 0), + getattr(ckpt_args, "tensor_model_parallel_size", 1), + getattr(ckpt_args, "pipeline_model_parallel_size", 1), ) run_tp_pp = ( args.tensor_model_parallel_size, diff --git a/megatron/training/training.py b/megatron/training/training.py index 60156e1f227..5c52f907fc6 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -332,18 +332,15 @@ def transformer_flops(): if args.moe_shared_expert_intermediate_size is None else args.moe_shared_expert_intermediate_size ) - # SwiGLU. - gated_linear_multiplier = 3 / 2 if args.swiglu else 1 - # The 12x term below comes from the following factors; for more details, see - # "APPENDIX: FLOATING-POINT OPERATIONS" in https://arxiv.org/abs/2104.04473. # - 3x: Each GEMM in the model needs to be performed 3 times (forward pass, # backward wgrad [weight gradient], backward dgrad [data gradient]). - # - 2x: GEMMs of a particular size are stacked twice in the standard Transformer model - # architectures implemented in this codebase (e.g., h->ffn_h GEMM and ffn_h->h GEMM - # in MLP layer). + forward_backward_expansion_factor = 3 # - 2x: A GEMM of a m*n tensor with a n*k tensor requires 2mnk floating-point operations. - expansion_factor = 3 * 2 * 2 + fma_expansion_factor = 2 + # - 3x (SwiGLU enabled): h->2*ffn_h GEMM and ffn_h->h GEMM are stacked. + # - 2x (SwiGLU disabled): h->ffn_h GEMM and ffn_h->h GEMM are stacked. + ffn_expansion_factor = 3 if args.swiglu else 2 if args.multi_latent_attention: assert not args.group_query_attention @@ -374,8 +371,8 @@ def transformer_flops(): + 1 ) standard_self_attn_term = ( - 3 - * 2 # fwd(1) + bwd(2) *FMA + forward_backward_expansion_factor + * fma_expansion_factor * ( ## q lora + rope + q norm q_term @@ -402,13 +399,19 @@ def transformer_flops(): query_projection_size = args.kv_channels * args.num_attention_heads key_projection_size = args.kv_channels * args.num_query_groups value_projection_size = args.kv_channels * args.num_query_groups + gate_projection_size = query_projection_size if args.attention_output_gate else 0 standard_self_attn_term = ( - 3 - * 2 # fwd(1) + bwd(2) *FMA + forward_backward_expansion_factor + * fma_expansion_factor * ( ## qkv proj args.hidden_size - * (query_projection_size + key_projection_size + value_projection_size) + * ( + query_projection_size + + key_projection_size + + value_projection_size + + gate_projection_size + ) ## core attention + query_projection_size * args.seq_length @@ -436,7 +439,12 @@ def transformer_flops(): f"current linear attention pattern: {args.linear_attention_freq}" ) elif args.linear_attention_freq is None: - linear_attention_pattern = [1] * num_layers + # This should be caught by config validation, but raise here as a safety check + raise ValueError( + f"Linear attention type {args.experimental_attention_variant} is specified " + "but linear_attention_freq is None. " + "Please set linear_attention_freq to specify the LA/SDPA layer pattern." + ) else: raise ValueError( f"Invalid linear_attention_freq: {type(args.linear_attention_freq)}," @@ -454,8 +462,8 @@ def transformer_flops(): qk_dim = qk_head_dim * num_qk_heads v_dim = v_head_dim * num_v_heads linear_self_attn_term = ( - 3 - * 2 # fwd(1) + bwd(2) *FMA + forward_backward_expansion_factor + * fma_expansion_factor * ( ## in proj args.hidden_size @@ -492,25 +500,25 @@ def transformer_flops(): * args.seq_length * ( # MLP - expansion_factor - * num_layers + forward_backward_expansion_factor + * fma_expansion_factor * args.hidden_size * ( # dense layer (deepseek v2, v3 style) - (args.ffn_hidden_size * gated_linear_multiplier) - * (num_dense_layers / num_layers) + (args.ffn_hidden_size * ffn_expansion_factor) + * num_dense_layers # routed experts - + (moe_ffn_hidden_size * num_experts_routed_to * gated_linear_multiplier) - * (num_moe_layers / num_layers) + + (moe_ffn_hidden_size * num_experts_routed_to * ffn_expansion_factor) + * num_moe_layers # Shared Experts. - + (shared_expert_ffn_hidden_size * gated_linear_multiplier) - * (num_moe_layers / num_layers) + + (shared_expert_ffn_hidden_size * ffn_expansion_factor) + * num_moe_layers ) # Self Attention + self_attn_term # MTP norms and proj - + 3 - * 2 + + forward_backward_expansion_factor + * fma_expansion_factor * mtp_num_layers * ( # MTP eh norm + final nrom @@ -519,7 +527,11 @@ def transformer_flops(): + 2 * args.hidden_size * args.hidden_size ) # Logit. - + 3 * 2 * args.hidden_size * args.padded_vocab_size * (mtp_num_layers + 1) + + forward_backward_expansion_factor + * fma_expansion_factor + * args.hidden_size + * args.padded_vocab_size + * (mtp_num_layers + 1) # MTP + final logit ) ) return total_floating_point_operations diff --git a/tests/unit_tests/post_training/test_modelopt_module_spec.py b/tests/unit_tests/post_training/test_modelopt_module_spec.py index ec80fcb1a72..dac96785bc0 100644 --- a/tests/unit_tests/post_training/test_modelopt_module_spec.py +++ b/tests/unit_tests/post_training/test_modelopt_module_spec.py @@ -173,6 +173,7 @@ def setup_method(self, method): moe_ffn_hidden_size=128, moe_shared_expert_intermediate_size=128, qk_layernorm=True, + qk_l2_norm=True, use_cpu_initialization=True, ) default_spec = get_gpt_decoder_block_spec( diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py index 725d18fbc06..81f8eed0574 100644 --- a/tests/unit_tests/ssm/test_gated_delta_net.py +++ b/tests/unit_tests/ssm/test_gated_delta_net.py @@ -11,7 +11,10 @@ from megatron.core.models.common.embeddings.rope_utils import ( get_pos_emb_on_this_cp_rank as get_tensor_on_this_cp_rank, ) -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.experimental_attention_variant_module_specs import ( + get_experimental_attention_variant_module_spec, + get_transformer_block_with_experimental_attention_variant_spec, +) from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.ssm.gated_delta_net import GatedDeltaNet @@ -82,10 +85,13 @@ def setup_method(self, tp_size, sp, cp_size): tensor_model_parallel_size=tp_size, sequence_parallel=sp, context_parallel_size=cp_size, + experimental_attention_variant="gated_delta_net", + linear_attention_freq=[1], + transformer_impl="transformer_engine", ) - gdn_submodules = get_gpt_layer_with_transformer_engine_spec( - experimental_attention_variant="gated_delta_net", normalization="RMSNorm" - ).submodules.self_attention.submodules + gdn_submodules = get_experimental_attention_variant_module_spec( + config=self.transformer_config + ).submodules self.gdn = GatedDeltaNet( self.transformer_config, @@ -159,10 +165,13 @@ def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp): num_attention_heads=8, activation_func=F.silu, bf16=True, + experimental_attention_variant="gated_delta_net", + linear_attention_freq=[1], + transformer_impl="transformer_engine", ) - transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( - experimental_attention_variant="gated_delta_net", normalization="RMSNorm" + transformer_layer_spec = get_transformer_block_with_experimental_attention_variant_spec( + config=transformer_config, vp_stage=None, pp_rank=0 ) if cp: @@ -171,5 +180,15 @@ def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp): atol, rtol = 5e-4, 5e-4 _test_parallel_attention_correctness( - transformer_config, transformer_layer_spec, tmp_path_dist_ckpt, tp, sp, cp + transformer_config=transformer_config, + transformer_layer_spec=transformer_layer_spec, + tmp_path_dist_ckpt=tmp_path_dist_ckpt, + atol=atol, + rtol=rtol, + tp=tp, + sp=sp, + cp=cp, + seed=123, + sequence_length=256, + micro_batch_size=4, ) diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py index cd7ca916091..b5f2857d622 100644 --- a/tests/unit_tests/transformer/test_attention.py +++ b/tests/unit_tests/transformer/test_attention.py @@ -875,6 +875,7 @@ def get_tensor_on_this_rank(tensor): Utils.destroy_model_parallel() +# TODO(yuzhongw): Add test case for fallback_to_eager_attn @pytest.mark.parametrize("apply_rope_fusion", [False, True]) @pytest.mark.parametrize( ("tp", "sp", "cp"), @@ -887,25 +888,15 @@ def get_tensor_on_this_rank(tensor): ], ) @pytest.mark.parametrize("qk_layernorm", [False, True]) -@pytest.mark.parametrize("fallback_to_eager_attn", [False, True]) @pytest.mark.parametrize("output_gate", [False, True]) def test_parallel_attention_correctness( - tmp_path_dist_ckpt, - apply_rope_fusion, - tp, - sp, - cp, - qk_layernorm, - fallback_to_eager_attn, - output_gate, + tmp_path_dist_ckpt, apply_rope_fusion, tp, sp, cp, qk_layernorm, output_gate ): transformer_config = TransformerConfig( num_layers=1, hidden_size=128, num_attention_heads=4, - context_parallel_size=1, - tensor_model_parallel_size=1, - sequence_parallel=False, + normalization="RMSNorm", bf16=True, qk_layernorm=qk_layernorm, apply_rope_fusion=apply_rope_fusion, @@ -914,24 +905,20 @@ def test_parallel_attention_correctness( attention_dropout=0.0, ) - transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( - fallback_to_eager_attn=fallback_to_eager_attn, - normalization="RMSNorm", - qk_layernorm=qk_layernorm, - ) - if cp > 1: - if qk_layernorm: - atol, rtol = 2e-2, 2e-2 - else: - atol, rtol = 5e-3, 5e-3 - else: - if qk_layernorm: - atol, rtol = 1e-2, 1e-2 - else: - atol, rtol = 2e-3, 2e-3 + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(qk_layernorm=qk_layernorm) + atol, rtol = 1e-2, 1e-2 _test_parallel_attention_correctness( - transformer_config, transformer_layer_spec, tmp_path_dist_ckpt, tp, sp, cp + transformer_config, + transformer_layer_spec, + tmp_path_dist_ckpt, + atol=atol, + rtol=rtol, + tp=tp, + sp=sp, + cp=cp, + seed=123, + sequence_length=256, ) From 68e5fec01969afbb7cd466a40909a2d2fc6da91d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 22 Jan 2026 11:26:06 +0800 Subject: [PATCH 247/248] [Dev]feat(moe): code refactor for fine grained activation offloading (#2905) Signed-off-by: Hongbin Liu Signed-off-by: root Co-authored-by: root --- .../fine_grained_activation_offloading.md | 2 +- .../offloading_and_recomputing.png | Bin .../common/model_chunk_schedule_plan.py | 9 +- .../core/models/gpt/fine_grained_callables.py | 20 +- megatron/core/models/gpt/gpt_model.py | 10 +- .../fine_grained_activation_offload.py | 1037 ++++++++++++++--- megatron/core/pipeline_parallel/schedules.py | 19 +- megatron/core/pipeline_parallel/utils.py | 25 +- megatron/core/transformer/attention.py | 27 +- megatron/core/transformer/moe/experts.py | 29 +- .../transformer/multi_latent_attention.py | 68 +- .../transformer/multi_token_prediction.py | 5 - .../core/transformer/transformer_block.py | 8 - .../core/transformer/transformer_layer.py | 29 +- megatron/training/arguments.py | 3 + megatron/training/training.py | 7 +- .../golden_values_dev_dgx_h100.json | 102 +- .../model_config.yaml | 2 +- .../golden_values_dev_dgx_h100.json | 102 +- .../model_config.yaml | 7 +- ...test_fine_grained_activation_offloading.py | 720 +++++++++--- 21 files changed, 1638 insertions(+), 593 deletions(-) rename docs/{source => }/images/fine_grained_activation_offloading/offloading_and_recomputing.png (100%) diff --git a/docs/api-guide/fine_grained_activation_offloading.md b/docs/api-guide/fine_grained_activation_offloading.md index 969098263fc..53211d1d06c 100644 --- a/docs/api-guide/fine_grained_activation_offloading.md +++ b/docs/api-guide/fine_grained_activation_offloading.md @@ -28,4 +28,4 @@ Currently, the supported offloading modules are `"attn_norm", "core_attn", "attn - For other modules, use offloading to reduce memory footprint; - Make sure the offloading/reloading could be overlapped with computing; -![Fine-grained Activation Offloading and Fine-grained Recomputation](../images/fine_grained_activation_offloading/offloading_and_recomputing.png) +![Fine-grained Activation Offloading and Fine-grained Recomputation](../../images/fine_grained_activation_offloading/offloading_and_recomputing.png) diff --git a/docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png similarity index 100% rename from docs/source/images/fine_grained_activation_offloading/offloading_and_recomputing.png rename to docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py index b8f11ed9d38..0c29423edab 100644 --- a/megatron/core/models/common/model_chunk_schedule_plan.py +++ b/megatron/core/models/common/model_chunk_schedule_plan.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. from contextlib import nullcontext from typing import Optional @@ -8,9 +8,6 @@ from megatron.core.enums import Fp8Recipe from megatron.core.fp8_utils import get_fp8_context -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.pipeline_parallel.utils import ( AbstractSchedulePlan, NoopScheduleNode, @@ -488,8 +485,6 @@ def run( # combined forward and backward pass for overlapped layers for i in range(overlapped_layers): f_layer = f_schedule_plan.get_layer(i) - if f_layer.layer.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(i == f_num_layers - 1) b_layer = b_schedule_plan.pop_layer() torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_schedule_plan.num_layers()}b") f_input, b_grad = TransformerLayerSchedulePlan.run( @@ -518,8 +513,6 @@ def run( for i in range(overlapped_layers, f_num_layers): f_layer = f_schedule_plan.get_layer(i) torch.cuda.nvtx.range_push(f"layer_{i}f") - if f_layer.layer.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(i == f_num_layers - 1) f_input, _ = TransformerLayerSchedulePlan.run(f_layer, None, f_input=f_input) torch.cuda.nvtx.range_pop() diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py index 71c5c19749c..5a365b015b2 100644 --- a/megatron/core/models/gpt/fine_grained_callables.py +++ b/megatron/core/models/gpt/fine_grained_callables.py @@ -11,9 +11,7 @@ from megatron.core import tensor_parallel from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.pipeline_parallel.utils import ScheduleNode, make_viewless from megatron.core.transformer.enums import CudaGraphScope @@ -450,18 +448,18 @@ def forward_func( ) if not isinstance(layer.mlp, MoELayer): return hidden_states, None, None, None - if layer.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start( - hidden_states, name="mlp_norm" - ) if layer.recompute_pre_mlp_layernorm: layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(layer.offload_mlp_norm): + with off_interface( + layer.offload_mlp_norm, hidden_states, "mlp_norm" + ) as hidden_states: pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint( layer.pre_mlp_layernorm, hidden_states ) else: - with get_fine_grained_offloading_context(layer.offload_mlp_norm): + with off_interface( + layer.offload_mlp_norm, hidden_states, "mlp_norm" + ) as hidden_states: pre_mlp_layernorm_output = layer.pre_mlp_layernorm(hidden_states) shared_expert_output = layer.mlp.shared_experts_compute(pre_mlp_layernorm_output) @@ -550,8 +548,10 @@ def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor): hidden_states = layer.mlp_bda(layer.training, layer.config.bias_dropout_fusion)( mlp_output_with_bias, residual, layer.hidden_dropout ) + # Delay the offload of the mlp norm until after the mlp_bda has been computed + # because the residual is needed in the mlp_bda. if layer.offload_mlp_norm: - (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states = off_interface.group_commit( hidden_states, name="mlp_norm", forced_released_tensors=[residual] ) output = make_viewless_tensor( diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 9e70c677226..16462d6e426 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -19,7 +19,7 @@ from megatron.core.models.common.language_module.language_module import LanguageModule from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_init_chunk_handler, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.quantization.utils import get_quant_config_or_none @@ -431,20 +431,20 @@ def _preprocess( def preprocess_for_fine_grained_offloading(self): """Preprocess for fine-grained activation offloading.""" - fine_grained_offloading_init_chunk_handler( + off_interface.init_chunk_handler( vp_size=self.config.virtual_pipeline_model_parallel_size, vp_stage=self.vp_stage, min_offloaded_tensor_size=self.config.min_offloaded_tensor_size, ) if self.disable_param_offloading: for param in self.decoder.parameters(): - param.offloading_activation = False + off_interface.mark_not_offloadable(param) if self.mtp_process: for param in self.mtp.parameters(): - param.offloading_activation = False + off_interface.mark_not_offloadable(param) if self.post_process: for param in self.output_layer.parameters(): - param.offloading_activation = False + off_interface.mark_not_offloadable(param) self.disable_param_offloading = False def forward( diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 138dcd8f7b1..9996c9b57a4 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -2,16 +2,16 @@ from collections import deque from contextlib import nullcontext -from typing import Any +from typing import Any, Dict, Tuple import torch -from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu - # CPU offload implementation for pipeline parallelism DEBUG = False DEBUG_RANK = 0 +from megatron.core.transformer.cuda_graphs import is_graph_capturing + def debug_rank(message): """Print debug message for a specific rank when DEBUG is enabled.""" @@ -23,6 +23,362 @@ def debug_rank(message): print(message) +def print_offload_summary_table(total_offload_bytes: Dict[str, int]): + """ + Print an ASCII table summarizing offload bytes across all ranks. + + Gathers offload data from all ranks and prints a formatted table on rank 0, + with rows representing ranks and columns representing groups. + + Args: + total_offload_bytes: Dict mapping group names to offload bytes for this rank. + """ + # pylint: disable=bad-builtin + assert torch.distributed.is_initialized() + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + # Gather all group names across ranks + local_names = list(total_offload_bytes.keys()) + all_names_list = [None] * world_size + torch.distributed.all_gather_object(all_names_list, local_names) + all_group_names = sorted(set(name for names in all_names_list for name in names)) + + # Gather offload bytes from all ranks: each rank sends a list of bytes per group + local_bytes = [total_offload_bytes.get(name, 0) for name in all_group_names] + all_bytes_list = [None] * world_size + torch.distributed.all_gather_object(all_bytes_list, local_bytes) + + # Print ASCII table on rank 0 + if rank == 0: + # Calculate column widths + col_width = max(12, max((len(name) for name in all_group_names), default=8) + 2) + rank_col_width = max(6, len(f"Rank {world_size - 1}") + 2) + + # Build header + header = "Rank".ljust(rank_col_width) + header += "".join(name.rjust(col_width) for name in all_group_names) + header += "Total".rjust(col_width) + separator = "-" * len(header) + + print("\n" + "=" * len(header)) + print("Activation Offload Summary (MB)".center(len(header))) + print("=" * len(header)) + print(header) + print(separator) + + # Build rows for each rank + grand_total = 0 + col_totals = [0] * len(all_group_names) + for r in range(world_size): + row_bytes = all_bytes_list[r] + row_total = sum(row_bytes) + grand_total += row_total + for i, b in enumerate(row_bytes): + col_totals[i] += b + row_str = f"Rank {r}".ljust(rank_col_width) + for b in row_bytes: + row_str += f"{b / (1024 * 1024):.2f}".rjust(col_width) + row_str += f"{row_total / (1024 * 1024):.2f}".rjust(col_width) + print(row_str) + + # Print totals row + print(separator) + totals_row = "Total".ljust(rank_col_width) + for ct in col_totals: + totals_row += f"{ct / (1024 * 1024):.2f}".rjust(col_width) + totals_row += f"{grand_total / (1024 * 1024):.2f}".rjust(col_width) + print(totals_row) + print("=" * len(header) + "\n") + + torch.distributed.barrier() + + +class GPUTensorPool: + """ + GPU memory pool for efficient allocation and deallocation of tensors. + + Features: + - Supports multiple tensor shapes and dtypes, each with its own pool + - Dynamic allocation: tensors are created on-demand during allocation + - Efficient reuse: freed tensors are returned to the pool for reuse + - Uses queue-based management for O(1) allocation and deallocation + + Example: + pool = GPUTensorPool(device='cuda:0') + tensor = pool.allocate((128, 512), dtype=torch.float32) + # ... use tensor ... + pool.free(tensor, (128, 512), dtype=torch.float32) + """ + + def __init__(self, device: str = 'cuda', pin_memory: bool = False): + """ + Initialize GPU tensor pool. + + Args: + device: GPU device, default 'cuda' + pin_memory: Whether to use pinned memory (mainly for CPU tensors) + """ + self.device = torch.device(device) + self.pin_memory = pin_memory + + # Maintain a separate pool for each (shape, dtype) combination + # Structure: {(shape, dtype): {'free': deque, 'all': list, 'allocated_count': int}} + self._pools: Dict[Tuple, Dict[str, Any]] = {} + + # Statistics + self._stats = { + 'total_allocated': 0, # Total number of tensors ever allocated + 'current_in_use': 0, # Number of tensors currently in use + 'allocation_requests': 0, # Number of allocation requests + 'free_requests': 0, # Number of free requests + 'pool_hits': 0, # Number of times a tensor was reused from pool + 'pool_misses': 0, # Number of times a new tensor was created + } + + debug_rank("GPUTensorPool: Initialized with dynamic allocation") + + def _get_pool_key(self, shape: Tuple, dtype: torch.dtype) -> Tuple: + """Generate a unique key for the pool based on shape and dtype.""" + return (shape, dtype) + + @staticmethod + def _calculate_memory_size(shape: Tuple, dtype: torch.dtype) -> int: + """Calculate memory size in bytes.""" + element_size = torch.tensor([], dtype=dtype).element_size() + numel = 1 + for dim in shape: + numel *= dim + return numel * element_size + + def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -> torch.Tensor: + """ + Allocate a tensor with the specified shape and dtype. + + Args: + shape: Shape of the tensor + dtype: Data type of the tensor, default torch.float32 + + Returns: + Allocated tensor + """ + self._stats['allocation_requests'] += 1 + + pool_key = self._get_pool_key(shape, dtype) + + # Create pool for this (shape, dtype) if it doesn't exist + if pool_key not in self._pools: + self._pools[pool_key] = { + 'free': deque(), # Queue of available tensors + 'all': [], # List of all tensors (for tracking) + 'allocated_count': 0, # Number of allocated tensors + } + + pool = self._pools[pool_key] + + # Try to reuse a tensor from the pool + if len(pool['free']) > 0: + tensor = pool['free'].popleft() + self._stats['pool_hits'] += 1 + debug_rank( + f"GPUTensorPool.allocate: Reused tensor from pool, " + f"shape={shape}, dtype={dtype}, " + f"remaining in pool={len(pool['free'])}" + ) + else: + # Allocate a new tensor + tensor = torch.empty(shape, dtype=dtype, device=self.device, pin_memory=self.pin_memory) + pool['all'].append(tensor) + self._stats['total_allocated'] += 1 + self._stats['pool_misses'] += 1 + + memory_mb = self._calculate_memory_size(shape, dtype) / (1024**2) + debug_rank( + f"GPUTensorPool.allocate: Created new tensor, " + f"shape={shape}, dtype={dtype}, " + f"memory={memory_mb:.2f} MB, " + f"total_created={len(pool['all'])}" + ) + + pool['allocated_count'] += 1 + self._stats['current_in_use'] += 1 + + return tensor + + def free(self, tensor: torch.Tensor): + """ + Return a tensor to the pool for reuse. + + Args: + tensor: Tensor to free + + Raises: + ValueError: If tensor doesn't belong to this pool + """ + self._stats['free_requests'] += 1 + + shape = tensor.shape + dtype = tensor.dtype + + pool_key = self._get_pool_key(shape, dtype) + + if pool_key not in self._pools: + raise ValueError( + f"No pool exists for shape={shape}, dtype={dtype}. " + f"Available pools: {list(self._pools.keys())}" + ) + + pool = self._pools[pool_key] + + # Verify tensor belongs to this pool (use identity check, not value comparison) + tensor_found = any(tensor is t for t in pool['all']) + if not tensor_found: + raise ValueError( + f"Attempting to free a tensor that doesn't belong to this pool " + f"(shape={shape}, dtype={dtype})" + ) + + # Return tensor to the free queue + pool['free'].append(tensor) + pool['allocated_count'] -= 1 + self._stats['current_in_use'] -= 1 + + debug_rank( + f"GPUTensorPool.free: shape={shape}, dtype={dtype}, " + f"available in pool={len(pool['free'])}" + ) + + def get_pool_status(self, shape: Tuple = None, dtype: torch.dtype = None) -> Dict[str, Any]: + """ + Get the status of the memory pool. + + Args: + shape: If specified along with dtype, return status for that specific pool + dtype: Data type (required if shape is specified) + + Returns: + Dictionary containing status information + """ + if shape is not None: + if dtype is None: + raise ValueError("dtype must be specified when shape is provided") + + pool_key = self._get_pool_key(shape, dtype) + + if pool_key not in self._pools: + raise ValueError(f"No pool exists for shape={shape}, dtype={dtype}") + + pool = self._pools[pool_key] + total_count = len(pool['all']) + + return { + 'shape': shape, + 'dtype': dtype, + 'total_count': total_count, + 'allocated_count': pool['allocated_count'], + 'free_count': len(pool['free']), + 'utilization': ( + pool['allocated_count'] / total_count * 100 if total_count > 0 else 0 + ), + } + else: + # Return status for all pools + status = {'global_stats': self._stats.copy(), 'pools': {}} + + for pool_key in self._pools: + shape, dtype = pool_key + status['pools'][pool_key] = self.get_pool_status(shape, dtype) + + return status + + def reset(self): + """Reset the pool, marking all tensors as available.""" + debug_rank("GPUTensorPool: Resetting pool...") + + for pool_key, pool in self._pools.items(): + # Clear and refill the free queue + pool['free'].clear() + for tensor in pool['all']: + pool['free'].append(tensor) + pool['allocated_count'] = 0 + + self._stats['current_in_use'] = 0 + debug_rank("GPUTensorPool: Reset complete") + + def clear(self): + """Clear the pool and release all GPU memory.""" + debug_rank("GPUTensorPool: Clearing pool...") + + for pool_key, pool in self._pools.items(): + # Clear all references, allowing PyTorch GC to reclaim memory + pool['free'].clear() + pool['all'].clear() + + self._pools.clear() + self._stats['current_in_use'] = 0 + + # Trigger GPU cache cleanup + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + debug_rank("GPUTensorPool: Clear complete") + + def __del__(self): + """Destructor to ensure resources are released.""" + self.clear() + + +class OffloadTensorGroup: + """ + A group of tensors to be offloaded together. + """ + + def __init__(self, name): + self._name = name + self._tensors = {} + self._offload_event = torch.cuda.Event() + self._reload_event = torch.cuda.Event() + self.offload = True + self.total_offload_bytes = 0 + self.total_tensor_count = 0 + # Using memory pool is for the compatibility with cuda graph. + # Shapes of tensors for expert_fc1 and moe_act are not known in advance, + # so we do not use CPU pool for them. + if name == "expert_fc1" or name == "moe_act": + self.use_cpu_pool = False + else: + self.use_cpu_pool = True + + def push_tensor(self, tag, tensor): + """Push a tensor to the group.""" + self._tensors[tag] = tensor + + def pop_tensor(self, tag): + """Pop a tensor from the group.""" + return self._tensors.pop(tag) + + def record_offload_event(self, stream): + """Record the offload event.""" + self._offload_event.record(stream) + + def wait_offload_event(self, stream): + """Wait for the offload event.""" + stream.wait_event(self._offload_event) + + def record_reload_event(self, stream): + """Record the reload event.""" + self._reload_event.record(stream) + + def wait_reload_event(self, stream): + """Wait for the reload event.""" + stream.wait_event(self._reload_event) + + def update_offload_info(self, tensor): + """Update the offload information.""" + self.total_offload_bytes += tensor.numel() * tensor.element_size() + self.total_tensor_count += 1 + + class PipelineOffloadManager: """ Singleton manager for coordinating activation offloading across pipeline stages. @@ -39,6 +395,12 @@ def get_instance(cls): cls.OFFLOAD_MGR = PipelineOffloadManager() return cls.OFFLOAD_MGR + @classmethod + def reset_instance(cls): + """Reset the singleton instance of PipelineOffloadManager.""" + cls.OFFLOAD_MGR = None + cls.OFFLOAD_MGR = PipelineOffloadManager() + def __init__(self): """Initialize the manager with queues and dedicated CUDA streams.""" # Queue to store chunk handlers for backward pass @@ -48,6 +410,27 @@ def __init__(self): # allocate streams and events for synchronization self._d2h_stream = torch.cuda.Stream() self._h2d_stream = torch.cuda.Stream() + # Shared CPU tensor pool for all chunks to improve reuse efficiency + self._cpu_tensor_pool = GPUTensorPool(device="cpu", pin_memory=True) + + # Whether the manager is in warmup phase. + self._is_warmup = True + # Cache OffloadChunkHandler objects for each virtual pipeline stage and each forward pass. + self._cached_chunks_forward = [] + # Cache OffloadChunkHandler objects for each virtual pipeline stage and each backward pass. + self._cached_chunks_backward = [] + # Index of the current backward chunk in the cached chunks backward. + self._cached_chunks_index_backward = 0 + # Index of the current forward chunk in the cached chunks forward. + self._cached_chunks_index_forward = 0 + + self.do_offload = True + + # Do not offload the last X groups so that the reloading won't block the computing stream. + self._offload_margin = 0 + # Sometimes we need to delay the offloading and launch it later. + # The delayed offload groups are stored in a queue. + self._delayed_offload_groups = [] self.reset() @property @@ -60,14 +443,52 @@ def h2d_stream(self): """Get the host-to-device (CPU to GPU) transfer stream.""" return self._h2d_stream + @property + def cpu_tensor_pool(self): + """Get the shared CPU tensor pool.""" + return self._cpu_tensor_pool + + def push_offload_groups(self, group_hook, forced_released_tensors): + """Push the offload groups to the delayed queue.""" + debug_rank(f"pushing offload groups to the delayed queue") + self._delayed_offload_groups.append((group_hook, forced_released_tensors)) + + def flush_delayed_groups(self): + """Flush the delayed groups.""" + debug_rank("flushing delayed groups") + # Flush the delayed groups in reverse order to maintain the order of the groups. + for group_hook, forced_released_tensors in reversed(self._delayed_offload_groups): + group_hook(forced_released_tensors) + self._delayed_offload_groups = [] + def reset(self): """Reset manager state for a new training iteration.""" - set_ideal_affinity_for_current_gpu() self._inside_context = False self._cur_forward_chunk = None self._cur_backward_chunk = None - # Track the first microbatch of the last virtual pipeline stage - self._is_first_last_vpp_chunk = True + # Reset CPU tensor pool to reuse all CPU tensors for next iteration + if hasattr(self, '_cpu_tensor_pool'): + self._cpu_tensor_pool.reset() + + # Call post_warmup_callback after warmup to collect the offload information. + if self._is_warmup and len(self._cached_chunks_forward) > 0: + self.post_warmup_callback() + self._cached_chunks_index_backward = 0 + self._cached_chunks_index_forward = 0 + + for chunk in self._cached_chunks_forward: + chunk.reset() + self._delayed_offload_groups = [] + + @property + def offload_summary_bytes(self) -> Dict[str, int]: + """Offload summary bytes per group collected after warmup.""" + return self._offload_summary_bytes + + @property + def offload_summary_total_bytes(self) -> int: + """Total offloaded bytes collected after warmup.""" + return self._offload_summary_total_bytes def flush(self): """Flush all staged chunks to the backward queue in reverse order.""" @@ -84,33 +505,107 @@ def flush(self): for i in range(self._vpp): self._stages[i] = [] + def disable_offload(self): + """Disable the offload.""" + debug_rank("disable_offload") + self.do_offload = False + for chunk in self._cached_chunks_forward: + chunk.do_offload = False + + def enable_offload(self): + """Enable the offload.""" + debug_rank("enable_offload") + self.do_offload = True + for chunk in self._cached_chunks_forward: + chunk.do_offload = True + + def post_warmup_callback(self): + """Callback after warmup.""" + # pylint: disable=bad-builtin + debug_rank("post_warmup_callback") + self._is_warmup = False + assert len(self._cached_chunks_forward) == len( + self._cached_chunks_backward + ), "Cached chunks forward and backward must have the same length" + for chunk in self._cached_chunks_forward: + chunk.is_warmup = False + assert ( + chunk in self._cached_chunks_backward + ), "Chunk not found in cached chunks backward" + # Update the offload margin to the maximum number of deduplicated groups + self._offload_margin = max(self._offload_margin, chunk.get_max_deduplicated_groups()) + debug_rank(f"offload margin {self._offload_margin}") + # Find the last group with the same name in the cached chunks backward + last_group_with_same_name = {} + for chunk_idx, chunk in enumerate(reversed(self._cached_chunks_backward)): + for group in chunk.offload_groups: + last_group_with_same_name[group._name] = group + # Mark the last group with the same name as not offloadable to make sure + # the reloading won't block the main stream. + for name, group in last_group_with_same_name.items(): + if self._offload_margin > 0: + group.offload = False + self._offload_margin -= 1 + debug_rank(f"setting offload to false for group {name} at chunk index {chunk_idx}") + else: + break + debug_rank(f"offload margin {self._offload_margin}") + assert self._offload_margin == 0, "Offload margin is not 0" + # Dump the offload information + total_tensor_count = {} + total_offload_bytes = {} + for chunk in self._cached_chunks_forward: + for group in chunk.offload_groups: + if group.offload: + if group._name not in total_tensor_count: + total_tensor_count[group._name] = 0 + total_tensor_count[group._name] += group.total_tensor_count + if group._name not in total_offload_bytes: + total_offload_bytes[group._name] = 0 + total_offload_bytes[group._name] += group.total_offload_bytes + # Stop statistics at the first backward chunk after which 1F1B is running, + # where the memory cost will not increase anymore. + if chunk is self._cached_chunks_backward[0]: + break + # Cache summary for downstream consumers (e.g., unit tests). + self._offload_summary_bytes = dict(total_offload_bytes) + self._offload_summary_total_bytes = int(sum(total_offload_bytes.values())) + print_offload_summary_table(total_offload_bytes) + def push(self, handler): """Add a chunk handler to the backward queue.""" debug_rank(f"pushing handler {handler}") self._queue.append(handler) + if self._is_warmup: + self._cached_chunks_backward.append(handler) - def pop(self): - """Remove and set the next non-empty chunk as the current backward chunk.""" - assert self.size(), "Cannot pop from empty queue" - while self._queue: - self._cur_backward_chunk = self._queue.popleft() - if not self._cur_backward_chunk.is_empty_chunk(): + def pop_backward_chunk(self, name=None): + """Get the next non-empty backward chunk containing the group with the given name.""" + self._cur_backward_chunk = None + debug_rank(f"popping backward chunk {self._cached_chunks_index_backward}") + debug_rank(f"cached chunks backward {self._cached_chunks_backward}") + for idx, handler in enumerate( + self._cached_chunks_backward[self._cached_chunks_index_backward :] + ): + self._cached_chunks_index_backward += 1 + if not handler.is_empty_chunk(name): + self._cur_backward_chunk = ( + handler # set the first non-empty chunk as the current backward chunk + ) + debug_rank(f"handler {handler} at index {idx} is not empty") break - debug_rank(f"popping handler {self._cur_backward_chunk}") - - def front(self): - """Get the first non-empty chunk handler without removing it from the queue.""" - if not self.size(): - return None - for chunk_handler in self._queue: - if not chunk_handler.is_empty_chunk(): - return chunk_handler + assert self._cur_backward_chunk is not None, "No non-empty chunk found" + + def front_backward_chunk(self, name=None): + """Get the first non-empty backward chunk containing the group with the given name.""" + for idx, handler in enumerate( + self._cached_chunks_backward[self._cached_chunks_index_backward :] + ): + if not handler.is_empty_chunk(name): + debug_rank(f"front handler {handler} at index {idx}") + return handler return None - def size(self): - """Return the number of chunk handlers in the queue.""" - return len(self._queue) - def init_model_chunk_offload_handler( self, vp_size, vp_stage, min_offloaded_tensor_size=1024 * 1024 ): @@ -122,8 +617,11 @@ def init_model_chunk_offload_handler( vp_stage: Virtual pipeline stage index (None means stage 0) min_offloaded_tensor_size: Minimum tensor size (in elements) to offload """ + if not self._is_warmup: + return + + vp_size = 1 if vp_size is None else vp_size if self._stages is None: - vp_size = 1 if vp_size is None else vp_size self._vpp = vp_size self._stages = [[] for _ in range(vp_size)] @@ -132,26 +630,34 @@ def init_model_chunk_offload_handler( else: cur_vpp_rank = vp_stage - is_first_last_vpp_chunk = self._is_first_last_vpp_chunk # Flush staged chunks when reaching the last virtual pipeline stage if cur_vpp_rank == self._vpp - 1: self.flush() - # Determine if this is the first microbatch of the last virtual pipeline stage - is_first_last_vpp_chunk = is_first_last_vpp_chunk and (cur_vpp_rank == self._vpp - 1) - cur_chunk = ChunkOffloadHandler(is_first_last_vpp_chunk, min_offloaded_tensor_size) + # Use shared CPU tensor pool for better reuse across chunks + cur_chunk = ChunkOffloadHandler(min_offloaded_tensor_size, self._cpu_tensor_pool) + debug_rank(f"init_model_chunk_offload_handler {cur_chunk}") self._stages[cur_vpp_rank].append(cur_chunk) # For the last stage, push immediately and flush if cur_vpp_rank == self._vpp - 1: - self._is_first_last_vpp_chunk = False self.push(cur_chunk) self.flush() self._cur_forward_chunk = cur_chunk cur_chunk.vpp_rank = cur_vpp_rank - - def set_last_layer(self, is_last_layer): - """Mark whether the current forward chunk is processing the last layer.""" - self._cur_forward_chunk.is_last_layer = is_last_layer + self._cached_chunks_forward.append(cur_chunk) + + def pop_forward_chunk(self, name=None): + """Get the next forward pass chunk handler.""" + debug_rank(f"pop_forward_chunk {self._cur_forward_chunk}") + if not self.do_offload: + return self._cur_forward_chunk + while not self._is_warmup and ( + self._cur_forward_chunk is None or self._cur_forward_chunk.finish_all_groups(name) + ): + self._cur_forward_chunk = self._cached_chunks_forward[self._cached_chunks_index_forward] + self._cached_chunks_index_forward += 1 + debug_rank(f"new cur_forward_chunk {self._cur_forward_chunk}") + return self._cur_forward_chunk def cur_forward_chunk(self): """Get the current forward pass chunk handler.""" @@ -161,9 +667,16 @@ def cur_backward_chunk(self): """Get the current backward pass chunk handler.""" return self._cur_backward_chunk + def mark_not_offloadable(self, tensor: torch.Tensor): + """Mark the current forward chunk as not offloadable.""" + if tensor is not None: + tensor.offloading_activation = False + def __enter__(self): """Enter context manager to enable activation offloading hooks.""" debug_rank("----__enter__") + if self._cur_forward_chunk is None or not self.cur_forward_chunk().do_offload: + return from megatron.core.extensions.transformer_engine import cpu_offload if cpu_offload is not None: @@ -179,6 +692,8 @@ def __enter__(self): def __exit__(self, *args: Any): """Exit context manager and restore original tensor saving behavior.""" debug_rank("----__exit__") + if self._cur_forward_chunk is None or not self.cur_forward_chunk().do_offload: + return from megatron.core.extensions.transformer_engine import cpu_offload if cpu_offload is not None: @@ -212,69 +727,103 @@ class ChunkOffloadHandler: Manages tensor groups, coordinates asynchronous GPU-CPU transfers, and handles synchronization. """ - @staticmethod - def offload(src_tensor, pin_memory=True): + def offload(self, src_tensor, pin_memory=True, use_cpu_pool=True): """Offload.""" debug_rank("--------offload") if not src_tensor.is_contiguous(): src_tensor = src_tensor.contiguous() - cpu_backup = torch.empty( - src_tensor.size(), - dtype=src_tensor.dtype, - layout=src_tensor.layout, - device="cpu", - pin_memory=pin_memory, - ) + if use_cpu_pool: + cpu_backup = self.cpu_tensor_pool.allocate(src_tensor.shape, dtype=src_tensor.dtype) + else: + cpu_backup = torch.empty( + src_tensor.shape, dtype=src_tensor.dtype, device="cpu", pin_memory=pin_memory + ) cpu_backup.copy_(src_tensor, non_blocking=pin_memory) - state = (src_tensor.device, cpu_backup) + state = (src_tensor.device, cpu_backup, use_cpu_pool) return state - @staticmethod - def reload(state, non_blocking=None): + def reload(self, state, non_blocking=None): """Reload.""" debug_rank("------reload") - dev, cpu_backup = state + dev, cpu_backup, use_cpu_pool = state if non_blocking is None: non_blocking = cpu_backup.is_pinned() - return cpu_backup.to(dev, non_blocking=non_blocking) + gpu_tensor = torch.empty( + cpu_backup.size(), dtype=cpu_backup.dtype, layout=cpu_backup.layout, device=dev + ) + gpu_tensor.copy_(cpu_backup, non_blocking=non_blocking) + if use_cpu_pool: + self.cpu_tensor_pool.free(cpu_backup) + return gpu_tensor - def __init__(self, is_first_last_vpp_chunk, min_offloaded_tensor_size): - # Data Structure to maintain reference to activation tensors - self._tensor_tag_to_state = {} - # Mark the first microbatch of the last virtual pipeline stage - self._is_first_last_vpp_chunk = is_first_last_vpp_chunk + def __init__(self, min_offloaded_tensor_size, cpu_tensor_pool): + self.do_offload = True # Group management for batching offload/reload operations + self.offload_groups = [] self._offloaded_group_index = 0 + # Groups to be offloaded. self._groups_to_offload = [] + # Groups to be reloaded. self._groups_to_reload = [] + # Tensor count for the current group. self._tensor_count_current_group = 0 - + # Maximum number of groups to offload or reload. + self._max_group_size = 0 + # Groups being reloaded. + self._reloading_group = [] # Counter for special torch tensor types (FakeTensor, FunctionalTensor) self.torch_tensor_count = 0 self.d2h_stream = PipelineOffloadManager.get_instance().d2h_stream self.h2d_stream = PipelineOffloadManager.get_instance().h2d_stream - self._offload_events = {} - self._reload_events = {} self.min_offloaded_tensor_size = min_offloaded_tensor_size - self.is_last_layer = False + self.cpu_tensor_pool = cpu_tensor_pool + self.is_warmup = True + + def reset(self): + """Reset the chunk offload handler.""" + self._offloaded_group_index = 0 + self._groups_to_offload = [] + self._groups_to_reload = [] + self._tensor_count_current_group = 0 + self._reloading_group = [] - def is_empty_chunk(self): + def find_group_with_name(self, name: str, start_index: int = 0): + """Find the group with the given name starting from the given index.""" + return next( + (group for group in self.offload_groups[start_index:] if group._name == name), None + ) + + def is_empty_chunk(self, name=None): """Check if this chunk has no tensors to manage.""" - return len(self._tensor_tag_to_state) == 0 + debug_rank(f"------is_empty_chunk {self._max_group_size}") + if name is not None: + return self.find_group_with_name(name) is None + return self._max_group_size == 0 - def is_first_last_layer(self): - """ - Check if this is the last layer of the first microbatch of the last vp stage. - These tensors should not be offloaded to avoid unnecessary overhead. - """ + def finish_all_groups(self, name=None) -> bool: + """Finish all groups.""" debug_rank( - f"------is_first_last_layer {self._is_first_last_vpp_chunk} {self.is_last_layer}" + f"------finish_all_groups {self} {self._max_group_size} {self._offloaded_group_index}" ) - return self._is_first_last_vpp_chunk and self.is_last_layer + # TODO: check if this is correct + # Mark it as finished when there are no groups to offload or reload + if ( + len(self._groups_to_reload) == 0 + and len(self._groups_to_offload) == 0 + and self._offloaded_group_index > 0 + ): + return True + assert name is not None, "Name is required" + return self.find_group_with_name(name, self._offloaded_group_index) is None + + def find_next_group(self, name=None): + """Find the next group with the given name.""" + assert name is not None, "Name is required" + return self.find_group_with_name(name, self._offloaded_group_index) def tensor_push(self, tensor): """Push tensor to the offload handler.""" @@ -285,26 +834,20 @@ def tensor_push(self, tensor): torch._subclasses.functional_tensor.FunctionalTensor, ), ) + assert not torch_stray_tensor, "Stray tensor should not be offloaded" - if not torch_stray_tensor: - # Assign unique tag based on group index and position within group - tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group) - self._tensor_count_current_group += 1 - assert tensor_tag not in self._tensor_tag_to_state, "Duplicate tensor tag" - self._tensor_tag_to_state[tensor_tag] = tensor - else: - # Use negative group ID for special tensor types - tensor_tag = (-1, self.torch_tensor_count) - self.torch_tensor_count += 1 - self._tensor_tag_to_state[tensor_tag] = tensor + # Assign unique tag based on group index and position within group + tensor_tag = (self._offloaded_group_index, self._tensor_count_current_group) + self._tensor_count_current_group += 1 + self.offload_groups[self._offloaded_group_index - 1].push_tensor(tensor_tag, tensor) debug_rank(f"--------tensor_push {tensor_tag}") return tensor_tag def tensor_pop(self, tensor_tag): """Pop tensor from the offload handler.""" debug_rank(f"--------tensor_pop {tensor_tag}") - assert tensor_tag in self._tensor_tag_to_state, f"Tag {tensor_tag} not found" - tensor = self._tensor_tag_to_state.pop(tensor_tag) + group_id, idx = tensor_tag + tensor = self.offload_groups[group_id - 1].pop_tensor(tensor_tag) # If tensor is offloaded (stored as tuple), reload it if isinstance(tensor, tuple): tensor = self.reload(tensor) @@ -313,6 +856,9 @@ def tensor_pop(self, tensor_tag): def tensor_need_offloading_checker(self, tensor): """Check if the tensor needs to be offloaded.""" + debug_rank( + f"tensor_need_offloading_checker {getattr(tensor, 'offloading_activation', None)}" + ) if tensor.numel() < self.min_offloaded_tensor_size: return False # Respect tensor's offload preference if specified @@ -320,83 +866,82 @@ def tensor_need_offloading_checker(self, tensor): return False return True - def bulk_offload_group(self, group_to_offload): + def bulk_offload_group(self): """offload a group of tensors recorded in tensor_push().""" debug_rank("------bulk_offload_group") - assert not self.is_first_last_layer(), "Should not offload first-last layer" - group_id_to_offload, name = group_to_offload - torch.cuda.nvtx.range_push("activation offloading " + name) + group_to_offload = self._groups_to_offload[-1] + torch.cuda.nvtx.range_push("activation offloading " + group_to_offload._name) with torch.cuda.stream(self.d2h_stream): - for tensor_tag, state in self._tensor_tag_to_state.items(): - group_id, _ = tensor_tag - if group_id == group_id_to_offload: - debug_rank(f"------tensor_tag {tensor_tag}") - debug_rank(f"------group_to_offload {group_to_offload}") - assert not isinstance(state, tuple), "Tensor already offloaded" - tensor_on_device = state - if self.tensor_need_offloading_checker(tensor_on_device): - state = self.offload(tensor_on_device) - event = torch.cuda.Event() - event.record(self.d2h_stream) - self._offload_events[name] = event - tensor_on_device.record_stream(self.d2h_stream) - self._tensor_tag_to_state[tensor_tag] = state + for tensor_tag, tensor_on_device in group_to_offload._tensors.items(): + if self.tensor_need_offloading_checker(tensor_on_device): + state = self.offload( + tensor_on_device, use_cpu_pool=group_to_offload.use_cpu_pool + ) + if self.is_warmup: + group_to_offload.update_offload_info(tensor_on_device) + tensor_on_device.record_stream(self.d2h_stream) + group_to_offload.push_tensor(tensor_tag, state) + group_to_offload.record_offload_event(self.d2h_stream) + self._groups_to_offload.pop() torch.cuda.nvtx.range_pop() - def get_offload_event(self, name): - """Get the CUDA event for a named offload operation.""" - return self._offload_events.get(name, None) - - def get_reload_event(self, name): - """Get the CUDA event for a named reload operation.""" - return self._reload_events.get(name, None) + def get_max_deduplicated_groups(self): + """Get the maximum number of deduplicated groups.""" + count_modules = [] + for group in self.offload_groups: + if group._name not in count_modules: + count_modules.append(group._name) + return len(count_modules) - def bulk_reload_group(self, group_to_reload): + def bulk_reload_group(self): """Bulk reload group.""" debug_rank("----bulk_reload_group") - found_reload_group = False - group_id_to_reload, name = group_to_reload - torch.cuda.nvtx.range_push("activation reloading " + name) + group_to_reload = self._groups_to_reload[-1] + torch.cuda.nvtx.range_push("activation reloading " + group_to_reload._name) with torch.cuda.stream(self.h2d_stream): - for tensor_label, state in self._tensor_tag_to_state.items(): - group_id, _ = tensor_label - if group_id == group_id_to_reload: - debug_rank(f"----tensor_label {tensor_label}") - found_reload_group = True - event = self.get_offload_event(name) - # Only reload if tensor was offloaded (stored as tuple) - if isinstance(state, tuple): - # Wait for offload to complete before reloading - torch.cuda.current_stream().wait_event(event) - recovered_tensor = self.reload(state) - event.record(self.h2d_stream) - self._reload_events[name] = event - debug_rank(f"----recovered_tensor {recovered_tensor.shape}") - self._tensor_tag_to_state[tensor_label] = recovered_tensor + # Wait for offload to complete before reloading + if not is_graph_capturing(): + group_to_reload.wait_offload_event(self.h2d_stream) + for tensor_tag, state in group_to_reload._tensors.items(): + # Only reload if tensor was offloaded (stored as tuple) + if isinstance(state, tuple): + recovered_tensor = self.reload(state) + debug_rank(f"----recovered_tensor {recovered_tensor.shape}") + group_to_reload.push_tensor(tensor_tag, recovered_tensor) + group_to_reload.record_reload_event(self.h2d_stream) + self._groups_to_reload.pop() + # Add the group to the reloading group to wait for the reload event. + self._reloading_group.append(group_to_reload) torch.cuda.nvtx.range_pop() - return found_reload_group def pre_reload_last_layer(self): """Pre-reload the last layer of this chunk to hide reload latency.""" debug_rank("pre_reload_last_layer") - assert not self._is_first_last_vpp_chunk, "Should not pre-reload first chunk" debug_rank(f"len(self._groups_to_reload) {len(self._groups_to_reload)}") if len(self._groups_to_reload) > 0: # Reload the last group (last layer) early - if self.bulk_reload_group(self._groups_to_reload[-1]): - self._groups_to_reload.pop() + self.bulk_reload_group() def should_bulk_offload(self): """Determine if the current group should be offloaded.""" - # Don't offload the first backward chunk's last layer - if self.is_first_last_layer(): + assert len(self._groups_to_offload) > 0, "No groups to offload" + group = self._groups_to_offload[-1] + debug_rank(f"should_bulk_offload {self.is_warmup} {group.offload}") + # Don't offload if the chunk is not in warmup stage + if self.is_warmup: + return True + # Don't offload if the group is marked as not offloadable + if not group.offload: return False # Check if next backward chunk is this chunk (for last pipeline stage) - next_backward_chunk = PipelineOffloadManager.get_instance().front() + next_backward_chunk = PipelineOffloadManager.get_instance().front_backward_chunk( + group._name + ) if next_backward_chunk is not None and next_backward_chunk is self: - # Don't offload last layer if it's about to be used immediately - if self.is_last_layer: + # Don't offload the last group with the same name if it's about to be used immediately + if self.find_next_group(group._name) is None: + debug_rank(f"next group {group._name} is not found") return False return True @@ -405,9 +950,8 @@ def bulk_offload(self, forced_released_tensors): """Offload a group of tensors and optionally release their GPU memory.""" debug_rank("----bulk_offload") if self.should_bulk_offload(): - group_to_offload = self._groups_to_offload.pop() - self._groups_to_reload.append(group_to_offload) - self.bulk_offload_group(group_to_offload) + self._groups_to_reload.append(self._groups_to_offload[-1]) + self.bulk_offload_group() # Manually release tensors not auto-freed by torch GC if len(forced_released_tensors) > 0: cur_stream = torch.cuda.current_stream() @@ -419,6 +963,8 @@ def bulk_offload(self, forced_released_tensors): def on_group_commit_forward(self, forced_released_tensors): """Called at the end of a layer group's forward pass to trigger offloading.""" + if not self.do_offload: + return debug_rank("--on_group_commit_forward") # Wait for compute to finish before starting offload self.d2h_stream.wait_stream(torch.cuda.current_stream()) @@ -429,13 +975,16 @@ def bulk_reload(self): debug_rank("--bulk_reload") if len(self._groups_to_reload) > 0: # Reload the next layer group - if self.bulk_reload_group(self._groups_to_reload[-1]): - debug_rank(f"--bulk_reload_group {self._groups_to_reload}") - self._groups_to_reload.pop() + self.bulk_reload_group() else: # Pre-load the last layer of the next backward chunk to hide latency - next_backward_chunk = PipelineOffloadManager.get_instance().front() - if next_backward_chunk is not None: + next_backward_chunk = PipelineOffloadManager.get_instance().front_backward_chunk() + # Don't pre-reload the last layer if the next backward chunk hasn't finished fprop yet. + if ( + next_backward_chunk is not None + and next_backward_chunk._offloaded_group_index + == next_backward_chunk._max_group_size + ): next_backward_chunk.pre_reload_last_layer() def on_group_commit_backward(self, name): @@ -443,40 +992,70 @@ def on_group_commit_backward(self, name): Called at the end of a layer group's backward pass. Ensures correct chunk is active and synchronizes reloads. """ + if not self.do_offload: + return debug_rank("--on_group_commit_backward") cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() # Switch to this chunk if it's not already current if cur_backward_chunk is not self: - PipelineOffloadManager.get_instance().pop() + PipelineOffloadManager.get_instance().pop_backward_chunk(name) cur_backward_chunk = PipelineOffloadManager.get_instance().cur_backward_chunk() - assert cur_backward_chunk is self, "Chunk mismatch" + assert cur_backward_chunk is self, f"Chunk mismatch {cur_backward_chunk} {self}" # Wait for reload to complete before using tensors - event = self.get_reload_event(name) - if event is not None: - torch.cuda.current_stream().wait_event(event) - self._offloaded_group_index = self._offloaded_group_index - 1 + if not is_graph_capturing() and len(self._reloading_group) > 0: + for reloading_group in self._reloading_group: + if reloading_group._name == name: + reloading_group.wait_reload_event(torch.cuda.current_stream()) + self._reloading_group.remove(reloading_group) + break def on_group_start_forward(self, name): """ Called at the start of a layer group's forward pass. Increments group index and prepares for offloading. """ - debug_rank(f"--on_group_start_forward") + if not self.do_offload: + return + debug_rank(f"--on_group_start_forward {name}") self._offloaded_group_index = self._offloaded_group_index + 1 + if self.is_warmup: + self.offload_groups.append(OffloadTensorGroup(name)) + self._max_group_size = max(self._max_group_size, self._offloaded_group_index) + debug_rank(f"max group size {self._max_group_size}") + else: + for group in self.offload_groups[self._offloaded_group_index - 1 :]: + if group._name == name: + break + self._offloaded_group_index = self._offloaded_group_index + 1 self._tensor_count_current_group = 0 - self._groups_to_offload.append((self._offloaded_group_index, name)) + self._groups_to_offload.append(self.offload_groups[self._offloaded_group_index - 1]) + debug_rank(f"groups to offload {self._groups_to_offload}") def on_group_start_backward(self): """ Called at the start of a layer group's backward pass. Triggers reloading of tensors from CPU. """ - debug_rank("--on_group_start_backward") + if not self.do_offload: + return + debug_rank(f"--on_group_start_backward {self}") # Wait for compute to finish before starting reload self.h2d_stream.wait_stream(torch.cuda.current_stream()) self.bulk_reload() +def fine_grained_offloading_disable_offload(): + """Disable the offload.""" + debug_rank("fine_grained_offloading_disable_offload") + PipelineOffloadManager.get_instance().disable_offload() + + +def fine_grained_offloading_enable_offload(): + """Enable the offload.""" + debug_rank("fine_grained_offloading_enable_offload") + PipelineOffloadManager.get_instance().enable_offload() + + class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): """ Identity operation that marks the end of a layer group for offload synchronization. @@ -484,19 +1063,18 @@ class FineGrainedOffloadingGroupCommitFunction(torch.autograd.Function): """ @staticmethod - def forward(ctx, *args): + def forward(ctx, tensor, cur_forward_chunk, name, forced_released_tensors, delay_offload): # pylint: disable=missing-function-docstring debug_rank("FineGrainedOffloadingGroupCommitFunction forward") - forced_released_tensors = args[-1] - name = args[-2] - cpu_offload_handler = args[-3] - tensor = args[:-3] - cpu_offload_handler.on_group_commit_forward(forced_released_tensors) - ctx.cpu_offload_handler = cpu_offload_handler + if delay_offload: + PipelineOffloadManager.get_instance().push_offload_groups( + cur_forward_chunk.on_group_commit_forward, forced_released_tensors + ) + else: + cur_forward_chunk.on_group_commit_forward(forced_released_tensors) + ctx.cpu_offload_handler = cur_forward_chunk ctx.name = name - - # return the identical tensor return tensor @staticmethod @@ -506,19 +1084,49 @@ def backward(ctx, *grad_output): cpu_offload_handler = ctx.cpu_offload_handler cpu_offload_handler.on_group_commit_backward(ctx.name) - return grad_output + (None, None, None) + return grad_output + (None, None, None, None) -def fine_grained_offloading_group_commit(*tensor, name, forced_released_tensors=[]): +def fine_grained_offloading_group_commit( + tensor, name, forced_released_tensors=None, delay_offload=False +): """ Specify the tensors to be released after offloading. forced_released_tensors is a list of tensors to be released after offloading. The tensors will be untyped_storage().resize_(0) after offloading. Note: specify the tensors only when they are not automatically released by torch gc. """ + # Be permissive: callers may pass a tuple/list of outputs (e.g., (q, k, v)). + # We only need to insert a single identity op into the autograd graph; applying + # it to the first tensor output is sufficient and keeps callers' code minimal. + if forced_released_tensors is None: + forced_released_tensors = [] + if isinstance(tensor, tuple): + if len(tensor) == 0: + return tensor + committed0 = fine_grained_offloading_group_commit( + tensor[0], + name=name, + forced_released_tensors=forced_released_tensors, + delay_offload=delay_offload, + ) + return (committed0,) + tensor[1:] + if isinstance(tensor, list): + if len(tensor) == 0: + return tensor + committed0 = fine_grained_offloading_group_commit( + tensor[0], + name=name, + forced_released_tensors=forced_released_tensors, + delay_offload=delay_offload, + ) + return [committed0] + tensor[1:] + cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + if cur_forward_chunk is None: + return tensor return FineGrainedOffloadingGroupCommitFunction.apply( - *tensor, cur_forward_chunk, name, forced_released_tensors + tensor, cur_forward_chunk, name, forced_released_tensors, delay_offload ) @@ -544,32 +1152,105 @@ def backward(ctx, grad_output): debug_rank("FineGrainedOffloadingGroupStartFunction backward") cpu_offload_handler = ctx.cpu_offload_handler cpu_offload_handler.on_group_start_backward() - return grad_output, None, None + return grad_output, None, None, None def fine_grained_offloading_group_start(tensor, name=None): """Mark the start of a layer group and prepare for offload/reload.""" - cur_forward_chunk = PipelineOffloadManager.get_instance().cur_forward_chunk() + cur_forward_chunk = PipelineOffloadManager.get_instance().pop_forward_chunk(name=name) + if cur_forward_chunk is None: + return tensor return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name) -def get_fine_grained_offloading_context(flag): - """Get the fine-grained offload context""" - return PipelineOffloadManager.get_instance() if flag else nullcontext() +class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function): + """ + Identity operation that marks the end of a layer group for offload synchronization. + Triggers offload during forward and synchronizes reload during backward. + """ + @staticmethod + def forward(ctx, tensor, event: torch.cuda.Event) -> torch.Tensor: + """Forward pass for cuda graph capture.""" + ctx.event = event + return tensor + + @staticmethod + def backward(ctx, grad_output): + """Record the backward event and wait for the h2d stream on cuda graph stream.""" + h2d_stream = PipelineOffloadManager.get_instance().h2d_stream + torch.cuda.current_stream().record_event(ctx.event) + torch.cuda.current_stream().wait_stream(h2d_stream) + return grad_output, None -def fine_grained_offloading_set_last_layer(is_last_layer): - """Set the last layer flag.""" - PipelineOffloadManager.get_instance().set_last_layer(is_last_layer) +class FineGrainedActivationOffloadingInterface: + """Interface for fine-grained activation offloading.""" -def fine_grained_offloading_init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size): - """Initialize the chunk handler, called at the start of a microbatch forward pass.""" - PipelineOffloadManager.get_instance().init_model_chunk_offload_handler( - vp_size, vp_stage, min_offloaded_tensor_size - ) + def __init__(self, offload: bool, tensor: torch.Tensor, name: str): + self.offload = offload + self.tensor = tensor + self.name = name + def __enter__(self): + """Enter context manager to enable activation offloading hooks.""" + if self.offload: + self.tensor = fine_grained_offloading_group_start(self.tensor, self.name) + PipelineOffloadManager.get_instance().__enter__() + return self.tensor -def fine_grained_offloading_reset(): - """Reset the chunk handler, called at the start of a training iteration.""" - PipelineOffloadManager.get_instance().reset() + def __exit__(self, *args: Any): + """Exit context manager to disable activation offloading hooks.""" + if self.offload: + PipelineOffloadManager.get_instance().__exit__() + + @staticmethod + def init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size): + """Initialize the chunk handler, called at the start of a microbatch forward pass.""" + PipelineOffloadManager.get_instance().init_model_chunk_offload_handler( + vp_size, vp_stage, min_offloaded_tensor_size + ) + + @staticmethod + def get_context(flag): + """Get the fine-grained offload context""" + return PipelineOffloadManager.get_instance() if flag else nullcontext() + + @staticmethod + def group_commit(tensor, name, forced_released_tensors=None, delay_offload=False): + """Group commit the tensors.""" + return fine_grained_offloading_group_commit( + tensor, name, forced_released_tensors, delay_offload + ) + + @staticmethod + def mark_not_offloadable(tensor: torch.Tensor): + """Mark the tensor as not offloadable.""" + PipelineOffloadManager.get_instance().mark_not_offloadable(tensor) + + @staticmethod + def forward_record(event: torch.cuda.Event) -> None: + """Record the forward event for cuda graph capture.""" + d2h_stream = PipelineOffloadManager.get_instance().d2h_stream + torch.cuda.current_stream().record_event(event) + torch.cuda.current_stream().wait_stream(d2h_stream) + + @staticmethod + def backward_record(tensor, event: torch.cuda.Event) -> torch.Tensor: + """Record the backward event for cuda graph capture.""" + return FineGrainedOffloadingBackwardRecordFunction.apply(tensor, event) + + @staticmethod + def reset(): + """Reset the chunk handler.""" + PipelineOffloadManager.get_instance().reset() + + @staticmethod + def reset_instance(): + """Reset the singleton instance.""" + PipelineOffloadManager.reset_instance() + + @staticmethod + def flush_delayed_groups(): + """Flush the delayed groups.""" + PipelineOffloadManager.get_instance().flush_delayed_groups() diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 9dc79ed11f7..dadbd199ab7 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -11,7 +11,7 @@ from megatron.core import parallel_state from megatron.core.enums import ModelType from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_reset, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator from megatron.core.pipeline_parallel.utils import ( @@ -581,9 +581,6 @@ def forward_backward_no_pipelining( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext @@ -682,6 +679,9 @@ def forward_backward_no_pipelining( pg_collection=pg_collection, ) + if not forward_only and config.fine_grained_activation_offloading: + off_interface.reset() + if config.timers is not None: config.timers('forward-backward').stop() @@ -1042,9 +1042,6 @@ def forward_backward_pipelining_with_interleaving( adjust_tensor_shapes_fn is None ), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism" - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - if config.overlap_p2p_comm and config.batch_p2p_comm: raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") @@ -2049,6 +2046,8 @@ def pp_post_backward(input_tensor_grad, vp_stage=None): pg_collection=pg_collection, ) + if not forward_only and config.fine_grained_activation_offloading: + off_interface.reset() # Restore config.grad_sync_func and config.param_sync_func. if forward_only: config.grad_sync_func, config.param_sync_func = grad_sync_func, param_sync_func @@ -2190,9 +2189,6 @@ def forward_backward_pipelining_without_interleaving( if config.timers is not None: config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) - if not forward_only and config.fine_grained_activation_offloading: - fine_grained_offloading_reset() - # Disable async grad reductions no_sync_func = config.no_sync_func if no_sync_func is None: @@ -2440,6 +2436,9 @@ def enable_grad_sync(): pg_collection=pg_collection, ) + if not forward_only and config.fine_grained_activation_offloading: + off_interface.reset() + if config.timers is not None: config.timers('forward-backward').stop() diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index d38f6d702c0..bda6334fc4b 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +import logging from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Callable, Optional @@ -7,7 +8,9 @@ import torch from torch.autograd import Variable -from megatron.core.utils import get_pg_rank, get_pg_size, make_viewless_tensor +from megatron.core.utils import get_pg_rank, get_pg_size, log_single_rank, make_viewless_tensor + +logger = logging.getLogger(__name__) def is_pp_first_stage(pp_group: torch.distributed.ProcessGroup): @@ -87,19 +90,13 @@ def set_ideal_affinity_for_current_gpu(): try: import cuda.bindings.driver as cuda_driver import cuda.bindings.runtime as cuda_runtime - except ImportError: + except: try: import cuda.cuda as cuda_driver import cuda.cudart as cuda_runtime - except ImportError: - # print("cuda-python may not be installed, skipping GPU affinity setting") - warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") - return - try: - import pynvml - except ImportError: - warnings.warn("pynvml is not installed, skipping GPU affinity setting") - return + except: + raise RuntimeError("Please install cuda-python to enable GPU affinity setting") + import pynvml # Get current CUDA device ID err, device_id = cuda_runtime.cudaGetDevice() @@ -112,6 +109,12 @@ def set_ideal_affinity_for_current_gpu(): handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) pynvml.nvmlDeviceSetCpuAffinity(handle) + log_single_rank( + logger, + logging.WARNING, + f"Set CPU affinity for all GPUs for optimal host-device transfer performance", + ) + @contextmanager def stream_acquire_context(stream, event): diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 0c5309a5876..c3c7dad250a 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -25,9 +25,7 @@ get_tensor_model_parallel_world_size, ) from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.mappings import all_gather_last_dim_from_tensor_parallel_region @@ -830,14 +828,13 @@ def forward( if output_gate: assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor." - if self.offload_qkv_linear: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="qkv_linear") - with get_fine_grained_offloading_context(self.offload_qkv_linear): + with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states: qkv_output = self.get_query_key_value_tensors( hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv ) if self.offload_qkv_linear: - (qkv_output,) = fine_grained_offloading_group_commit( + # `qkv_output` may be a tuple; commit supports tuple/list and will keep structure. + qkv_output = off_interface.group_commit( qkv_output, name="qkv_linear", forced_released_tensors=[] ) @@ -989,11 +986,11 @@ def forward( packed_seq_params=packed_seq_params, ) else: - if self.offload_core_attention and self.training: - query = fine_grained_offloading_group_start(query, name="core_attn") if inference_context is None or inference_context.is_static_batching(): # Static batching attention kernel. - with get_fine_grained_offloading_context(self.offload_core_attention): + with off_interface( + self.offload_core_attention and self.training, query, "core_attn" + ) as query: core_attn_out = self.core_attention( query, key, @@ -1023,7 +1020,7 @@ def forward( ) core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') if self.offload_core_attention and self.training: - (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out = off_interface.group_commit( core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] ) @@ -1046,13 +1043,11 @@ def forward( # ================= nvtx_range_push(suffix="linear_proj") - if self.offload_attn_proj: - core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") - with get_fine_grained_offloading_context(self.offload_attn_proj): + with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out: output, bias = self.linear_proj(core_attn_out) if self.offload_attn_proj: - output, bias = fine_grained_offloading_group_commit( - output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + output = off_interface.group_commit( + output, name="attn_proj", forced_released_tensors=[core_attn_out] ) nvtx_range_pop(suffix="linear_proj") diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py index aec5ac00bab..615e12e09d6 100644 --- a/megatron/core/transformer/moe/experts.py +++ b/megatron/core/transformer/moe/experts.py @@ -26,9 +26,7 @@ from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl from megatron.core.jit import jit_fuser from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.tensor_parallel.layers import ( _initialize_affine_weight_cpu, @@ -662,7 +660,7 @@ def __init__( set_save_original_input(self.linear_fc2) # This is to avoid the CPU overhead of multiple d2h copies - if self.offload_expert_fc1 and not (self.config.fp8 or self.config.fp4): + if self.offload_expert_fc1: from megatron.core.extensions.transformer_engine import set_save_original_input set_save_original_input(self.linear_fc1) @@ -731,18 +729,15 @@ def forward( # Probs already applied, so reset to 1. permuted_probs = torch.ones_like(permuted_probs) - if self.offload_expert_fc1: - permuted_local_hidden_states = fine_grained_offloading_group_start( - permuted_local_hidden_states, name="expert_fc1" - ) - with get_fine_grained_offloading_context(self.offload_expert_fc1): + with off_interface( + self.offload_expert_fc1, permuted_local_hidden_states, "expert_fc1" + ) as permuted_local_hidden_states: fc1_output, bias_parallel = self.linear_fc1( permuted_local_hidden_states, tokens_per_expert ) if self.offload_expert_fc1: - fc1_output, bias_parallel = fine_grained_offloading_group_commit( + fc1_output = off_interface.group_commit( fc1_output, - bias_parallel, name="expert_fc1", forced_released_tensors=[permuted_local_hidden_states], ) @@ -805,24 +800,24 @@ def glu(x): intermediate_parallel = intermediate_parallel.to(original_dtype) return intermediate_parallel - if self.offload_moe_act: - fc1_output = fine_grained_offloading_group_start(fc1_output, name="moe_act") - if self.activation_recompute: self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_moe_act): + with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output: bias_act_output = self.activation_checkpoint.checkpoint( bias_act_func, fc1_output, bias_parallel, permuted_probs ) else: - with get_fine_grained_offloading_context(self.offload_moe_act): + with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output: bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs) output, output_bias = self.linear_fc2(bias_act_output, tokens_per_expert) if self.activation_recompute: self.activation_checkpoint.discard_output_and_register_recompute(output) + + # Delay the offload of the moe act until after the linear_fc2 has been computed + # to make sure the fc1_output is reloaded to GPU before recomputing moe_act. if self.offload_moe_act: - (output,) = fine_grained_offloading_group_commit( + output = off_interface.group_commit( output, name="moe_act", forced_released_tensors=[fc1_output] ) diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py index ed90fdffa97..9689056e325 100644 --- a/megatron/core/transformer/multi_latent_attention.py +++ b/megatron/core/transformer/multi_latent_attention.py @@ -23,9 +23,7 @@ apply_rotary_pos_emb, ) from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel.layers import ColumnParallelLinear @@ -244,27 +242,32 @@ def forward( # Get the query, key and value tensors based on the type of attention - # self or cross attn. # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] - if self.config.experimental_attention_variant is None: - query, key, value = self.get_query_key_value_tensors( - hidden_states, - key_value_states, - position_ids, - packed_seq_params, - inference_context=inference_context, - ) - elif self.config.experimental_attention_variant == "dsa": - query, key, value, q_compressed, _ = self.get_query_key_value_tensors( - hidden_states, - key_value_states, - position_ids, - packed_seq_params, - inference_context=inference_context, - return_compressed_tensors=True, - ) - else: - raise ValueError( - f"Unsupported experimental attention variant: " - f"{self.config.experimental_attention_variant}" + with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states: + if self.config.experimental_attention_variant is None: + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_context=inference_context, + ) + elif self.config.experimental_attention_variant == "dsa": + query, key, value, q_compressed, _ = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_context=inference_context, + return_compressed_tensors=True, + ) + else: + raise ValueError( + f"Unsupported experimental attention variant: " + f"{self.config.experimental_attention_variant}" + ) + if self.offload_qkv_linear: + query = off_interface.group_commit( + query, name="qkv_linear", forced_released_tensors=[hidden_states] ) # =================================================== @@ -292,11 +295,10 @@ def forward( query, key, value, attention_mask, packed_seq_params=packed_seq_params ) else: - if self.offload_core_attention and self.training: - query = fine_grained_offloading_group_start(query, name="core_attn") - if inference_context is None or inference_context.is_static_batching(): - with get_fine_grained_offloading_context(self.offload_core_attention): + with off_interface( + self.offload_core_attention and self.training, query, "core_attn" + ) as query: if self.config.experimental_attention_variant is None: core_attn_out = self.core_attention( query, @@ -346,7 +348,7 @@ def forward( if not inference_context.is_decode_only(): core_attn_out = rearrange(core_attn_out, 's b h d -> s b (h d)') if self.offload_core_attention and self.training: - (core_attn_out,) = fine_grained_offloading_group_commit( + core_attn_out = off_interface.group_commit( core_attn_out, name="core_attn", forced_released_tensors=[query, key, value] ) @@ -374,13 +376,11 @@ def forward( # ================= # Output. [sq, b, h] # ================= - if self.offload_attn_proj: - core_attn_out = fine_grained_offloading_group_start(core_attn_out, name="attn_proj") - with get_fine_grained_offloading_context(self.offload_attn_proj): + with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out: output, bias = self.linear_proj(core_attn_out) if self.offload_attn_proj: - output, bias = fine_grained_offloading_group_commit( - output, bias, name="attn_proj", forced_released_tensors=[core_attn_out] + output = off_interface.group_commit( + output, name="attn_proj", forced_released_tensors=[core_attn_out] ) return output, bias diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index bde3149f5f4..8d5c479aa59 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -13,9 +13,6 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( gather_from_tensor_model_parallel_region, @@ -1114,8 +1111,6 @@ def forward( hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0)) hidden_states = hidden_states_list[offset] for layer_number in range(len(self.layers)): - if self.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer(layer_number == len(self.layers) - 1) (hidden_states, input_ids, position_ids) = self.layers[layer_number]( input_ids=input_ids, position_ids=position_ids, diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index cbbd7ec00eb..b28a66400e0 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -16,9 +16,6 @@ from megatron.core.fusions.fused_layer_norm import FusedLayerNorm from megatron.core.inference.contexts import BaseInferenceContext from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer, -) from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.transformer.enums import CudaGraphScope, LayerType @@ -736,11 +733,6 @@ def forward( else: inner_quantization_context = nullcontext() - if self.config.fine_grained_activation_offloading: - fine_grained_offloading_set_last_layer( - l_no == self.num_layers_per_pipeline_rank - 1 - ) - with self.offload_context, inner_quantization_context: hidden_states, context = layer( hidden_states=hidden_states, diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index ce90aaf357a..a486b6ed3d5 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -543,9 +543,7 @@ def _forward_attention( otherwise None. """ from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) inference_context = deprecate_inference_params(inference_context, inference_params) @@ -553,17 +551,15 @@ def _forward_attention( # Residual connection. residual = hidden_states - if self.offload_attn_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="attn_norm") # Optional Input Layer norm if self.recompute_input_layernorm: self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_attn_norm): + with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states: input_layernorm_output = self.input_layernorm_checkpoint.checkpoint( self.input_layernorm, hidden_states ) else: - with get_fine_grained_offloading_context(self.offload_attn_norm): + with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states: input_layernorm_output = self.input_layernorm(hidden_states) # Self attention. @@ -598,8 +594,10 @@ def _forward_attention( ) nvtx_range_pop(suffix="self_attn_bda") + # Delay the offload of the attention norm until after the self_attn_bda has been computed + # because the residual is needed in the self_attn_bda. if self.offload_attn_norm: - (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states = off_interface.group_commit( hidden_states, name="attn_norm", forced_released_tensors=[residual] ) @@ -647,24 +645,21 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None) """ from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_start, - get_fine_grained_offloading_context, + FineGrainedActivationOffloadingInterface as off_interface, ) # Residual connection. residual = hidden_states - if self.offload_mlp_norm: - hidden_states = fine_grained_offloading_group_start(hidden_states, name="mlp_norm") # Optional Layer norm post the cross-attention. if self.recompute_pre_mlp_layernorm: self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput() - with get_fine_grained_offloading_context(self.offload_mlp_norm): + with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states: pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint( self.pre_mlp_layernorm, hidden_states ) else: - with get_fine_grained_offloading_context(self.offload_mlp_norm): + with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states: pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) nvtx_range_push(suffix="mlp") @@ -750,7 +745,7 @@ def _forward_post_mlp(self, mlp_output_with_bias, residual): """ from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_group_commit, + FineGrainedActivationOffloadingInterface as off_interface, ) # TODO: could we move `bias_dropout_add_exec_handler` itself @@ -761,8 +756,10 @@ def _forward_post_mlp(self, mlp_output_with_bias, residual): mlp_output_with_bias, residual, self.hidden_dropout ) nvtx_range_pop(suffix="mlp_bda") + # Delay the offload of the mlp norm until after the mlp_bda has been computed + # because the residual is needed in the mlp_bda. if self.offload_mlp_norm: - (hidden_states,) = fine_grained_offloading_group_commit( + hidden_states = off_interface.group_commit( hidden_states, name="mlp_norm", forced_released_tensors=[residual] ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 027449b1729..b94b5b45544 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1312,6 +1312,9 @@ def validate_args(args, defaults={}): if args.fine_grained_activation_offloading: assert args.transformer_impl == 'transformer_engine', \ "Fine-grained activation offloading is only supported with transformer_engine implementation" + if is_te_min_version("2.10.0"): + assert os.getenv("NVTE_CPU_OFFLOAD_V1", "0") == "1", \ + "For fine-grained activation offloading with TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 should be set to 1 to avoid offloading weights." if args.mtp_num_layers: assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)." diff --git a/megatron/training/training.py b/megatron/training/training.py index 5c52f907fc6..13ad0025e43 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -728,11 +728,16 @@ def pretrain( args = get_args() timers = get_timers() + if args.fine_grained_activation_offloading: + from megatron.core.pipeline_parallel.utils import ( + set_ideal_affinity_for_current_gpu + ) + set_ideal_affinity_for_current_gpu() + if args.batch_invariant_mode: print_rank_0("Enabling batch invariant mode globally",flush=True) enable_batch_invariant_mode() - if args.log_progress: append_to_progress_log("Starting job") diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json index bc1062ce151..038ed2be724 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5275215360.0, - "2": 5275420160.0, - "3": 5275622912.0, - "4": 5275217408.0, - "5": 5275420160.0, - "6": 5275622912.0, - "7": 5275825664.0, - "8": 5276028416.0, - "9": 5276231168.0, - "10": 5276433920.0, - "11": 5276636672.0, - "12": 5276839424.0, - "13": 5277042176.0, - "14": 5277244928.0, - "15": 5277447680.0, - "16": 5277650432.0, - "17": 5277853184.0, - "18": 5278055936.0, - "19": 5278258688.0, - "20": 5278461440.0, - "21": 5278664192.0, - "22": 5278866944.0, - "23": 5279069696.0, - "24": 5279272448.0, - "25": 5279475200.0, - "26": 5279677952.0, - "27": 5279880704.0, - "28": 5280083456.0, - "29": 5280286208.0, - "30": 5280488960.0, - "31": 5280691712.0, - "32": 5280894464.0, - "33": 5281097216.0, - "34": 5281299968.0, - "35": 5281502720.0, - "36": 5281705472.0, - "37": 5281908224.0, - "38": 5282110976.0, - "39": 5282313728.0, - "40": 5282516480.0, - "41": 5282719232.0, - "42": 5282921984.0, - "43": 5283124736.0, - "44": 5283327488.0, - "45": 5283530240.0, - "46": 5283732992.0, - "47": 5283935744.0, - "48": 5284138496.0, - "49": 5284341248.0, - "50": 5284544000.0 + "1": 5283616256.0, + "2": 5288015360.0, + "3": 5288218112.0, + "4": 5288420864.0, + "5": 5288623616.0, + "6": 5287812608.0, + "7": 5288015360.0, + "8": 5288218112.0, + "9": 5287711232.0, + "10": 5287913984.0, + "11": 5288116736.0, + "12": 5288319488.0, + "13": 5288522240.0, + "14": 5288724992.0, + "15": 5288927744.0, + "16": 5289130496.0, + "17": 5289333248.0, + "18": 5289536000.0, + "19": 5289738752.0, + "20": 5289941504.0, + "21": 5290144256.0, + "22": 5290347008.0, + "23": 5290549760.0, + "24": 5290752512.0, + "25": 5290955264.0, + "26": 5291158016.0, + "27": 5291360768.0, + "28": 5291563520.0, + "29": 5291766272.0, + "30": 5291969024.0, + "31": 5292171776.0, + "32": 5292374528.0, + "33": 5292577280.0, + "34": 5292780032.0, + "35": 5292982784.0, + "36": 5293185536.0, + "37": 5293388288.0, + "38": 5293591040.0, + "39": 5293793792.0, + "40": 5293996544.0, + "41": 5294199296.0, + "42": 5294402048.0, + "43": 5294604800.0, + "44": 5294807552.0, + "45": 5295010304.0, + "46": 5295213056.0, + "47": 5295415808.0, + "48": 5295618560.0, + "49": 5295821312.0, + "50": 5296024064.0 } }, "mem-max-allocated-bytes": { @@ -341,4 +341,4 @@ "50": 1.91915 } } -} \ No newline at end of file +} diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml index be34eb9aec5..38528836659 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -5,6 +5,7 @@ ENV_VARS: NCCL_NVLS_ENABLE: 0 PYTHONWARNINGS: ignore NCCL_DEBUG: VERSION + NVTE_CPU_OFFLOAD_V1: 1 NVTE_FUSED_ATTN: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: ':4096:8' @@ -134,7 +135,6 @@ TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: # - "iteration-time" - "lm loss" - - "num-zeros" - "mem-allocated-bytes" - "mem-max-allocated-bytes" - "mtp_1 loss" diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index ca64f30b0fb..9cc2fa69da7 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4305058304.0, - "2": 4305059840.0, - "3": 4305059840.0, - "4": 4305059840.0, - "5": 4305059840.0, - "6": 4305059840.0, - "7": 4305059840.0, - "8": 4305059840.0, - "9": 4305059840.0, - "10": 4305059840.0, - "11": 4305059840.0, - "12": 4305059840.0, - "13": 4305059840.0, - "14": 4305059840.0, - "15": 4305059840.0, - "16": 4305059840.0, - "17": 4305059840.0, - "18": 4305059840.0, - "19": 4305059840.0, - "20": 4305059840.0, - "21": 4305059840.0, - "22": 4305059840.0, - "23": 4305059840.0, - "24": 4305059840.0, - "25": 4305059840.0, - "26": 4305059840.0, - "27": 4305059840.0, - "28": 4305059840.0, - "29": 4305059840.0, - "30": 4305059840.0, - "31": 4305059840.0, - "32": 4305059840.0, - "33": 4305059840.0, - "34": 4305059840.0, - "35": 4305059840.0, - "36": 4305059840.0, - "37": 4305059840.0, - "38": 4305059840.0, - "39": 4305059840.0, - "40": 4305059840.0, - "41": 4305059840.0, - "42": 4305059840.0, - "43": 4305059840.0, - "44": 4305059840.0, - "45": 4305059840.0, - "46": 4305059840.0, - "47": 4305059840.0, - "48": 4305059840.0, - "49": 4305059840.0, - "50": 4305059840.0 + "1": 4313446912.0, + "2": 4313448448.0, + "3": 4313448448.0, + "4": 4313448448.0, + "5": 4313448448.0, + "6": 4313448448.0, + "7": 4313448448.0, + "8": 4313448448.0, + "9": 4313448448.0, + "10": 4313448448.0, + "11": 4313448448.0, + "12": 4313448448.0, + "13": 4313448448.0, + "14": 4313448448.0, + "15": 4313448448.0, + "16": 4313448448.0, + "17": 4313448448.0, + "18": 4313448448.0, + "19": 4313448448.0, + "20": 4313448448.0, + "21": 4313448448.0, + "22": 4313448448.0, + "23": 4313448448.0, + "24": 4313448448.0, + "25": 4313448448.0, + "26": 4313448448.0, + "27": 4313448448.0, + "28": 4313448448.0, + "29": 4313448448.0, + "30": 4313448448.0, + "31": 4313448448.0, + "32": 4313448448.0, + "33": 4313448448.0, + "34": 4313448448.0, + "35": 4313448448.0, + "36": 4313448448.0, + "37": 4313448448.0, + "38": 4313448448.0, + "39": 4313448448.0, + "40": 4313448448.0, + "41": 4313448448.0, + "42": 4313448448.0, + "43": 4313448448.0, + "44": 4313448448.0, + "45": 4313448448.0, + "46": 4313448448.0, + "47": 4313448448.0, + "48": 4313448448.0, + "49": 4313448448.0, + "50": 4313448448.0 } }, "mem-max-allocated-bytes": { @@ -284,4 +284,4 @@ "50": 1.97038 } } -} \ No newline at end of file +} diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml index 5b177ed116d..d1fcd8fd4b7 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml @@ -5,6 +5,10 @@ ENV_VARS: NCCL_NVLS_ENABLE: 0 PYTHONWARNINGS: ignore NCCL_DEBUG: VERSION + NVTE_CPU_OFFLOAD_V1: 1 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: ':4096:8' MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -29,8 +33,6 @@ MODEL_ARGS: --exit-duration-in-mins: 230 --no-check-for-nan-in-loss-and-grad: true --no-rope-fusion: true - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native --manual-gc: true --manual-gc-interval: 100 --recompute-granularity: selective @@ -129,6 +131,5 @@ TEST_TYPE: regular # Usually ckpt-resume, but as a WAR to #513 set to regular METRICS: # - "iteration-time" - "lm loss" - - "num-zeros" - "mem-allocated-bytes" - "mem-max-allocated-bytes" diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py index 7c1b7f1fe4b..558c6934a0c 100644 --- a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py +++ b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py @@ -1,187 +1,573 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import gc +import os +from contextlib import nullcontext +from typing import Dict, List, Optional, Tuple import pytest import torch -EPSILON = 0.1 +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + FineGrainedActivationOffloadingInterface as off_interface, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import AttnBackend +from megatron.core.transformer.transformer_config import MLATransformerConfig, TransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils -# Skip all tests if CUDA is not available -cuda_available = torch.cuda.is_available() +# Tolerance for memory expectation check (GPU allocator jitter etc). +EPSILON = 0.30 +EPSILON_A2A = 0.30 +DELTA = 20 # MiB -def _reset_cuda_memory(): +def _reset_cuda_memory() -> None: gc.collect() - if cuda_available: + if torch.cuda.is_available(): torch.cuda.empty_cache() + torch.cuda.synchronize() + + +def _build_gpt_model( + *, + seed: int, + num_layers: int, + hidden_size: int, + num_attention_heads: int, + vocab_size: int, + seq_length: int, + num_experts: Optional[int], + fine_grained_activation_offloading: bool, + offload_modules: Optional[List[str]], + min_offloaded_tensor_size: int, + is_mla: bool, +) -> GPTModel: + """Build a GPTModel that uses TE-based transformer layer spec.""" + model_parallel_cuda_manual_seed(seed) + torch.manual_seed(seed) + ConfigClass = MLATransformerConfig if is_mla else TransformerConfig + transformer_config = ConfigClass( + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + use_cpu_initialization=True, + attention_backend=AttnBackend.unfused, + bf16=True, + # Recompute + recompute_modules=["layernorm", "moe_act"] if num_experts is not None else ["layernorm"], + recompute_granularity="selective", + # MoE + num_moe_experts=num_experts, + moe_grouped_gemm=(num_experts is not None), + # Fine-grained activation offloading + fine_grained_activation_offloading=fine_grained_activation_offloading, + offload_modules=offload_modules, + min_offloaded_tensor_size=min_offloaded_tensor_size, + ) + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec( + num_experts=num_experts, + moe_grouped_gemm=num_experts is not None, + moe_use_legacy_grouped_gemm=False, + multi_latent_attention=is_mla, + ), + vocab_size=vocab_size, + max_sequence_length=seq_length, + ).bfloat16() + return gpt_model + + +def _make_gpt_inputs( + *, seq_length: int, micro_batch_size: int, device: torch.device +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device) + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device) + attention_mask = torch.ones((micro_batch_size, 1, seq_length, seq_length), dtype=bool).to( + device + ) + return input_ids, position_ids, attention_mask + + +def _run_one_iter_and_capture( + model: GPTModel, + *, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + attention_mask: torch.Tensor, + enable_offload_reset: bool, +) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], int]: + """ + Run a single forward+backward iteration. + + Returns: + - logits (CPU float32) + - selected grads (CPU float32) + - peak_memory_allocated (bytes) during the iteration + """ + + if enable_offload_reset: + off_interface.reset() + + # for p in model.parameters(): + # if p.grad is not None: + # p.grad = None + + torch.cuda.reset_peak_memory_stats() + logits = model(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask) + loss = logits.float().sum() + loss.backward() + torch.cuda.synchronize() + peak_bytes = int(torch.cuda.max_memory_allocated()) + + # capture all gradients for correctness + grads: Dict[str, torch.Tensor] = {} + for name, p in model.named_parameters(): + grads[name] = p.grad.detach().float().cpu() if p.grad is not None else None + + return logits.detach().float().cpu(), grads, peak_bytes + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.") +@pytest.mark.parametrize( + "is_moe, is_mla, offload_modules", + [ + # Dense GPT modules + (False, True, ["attn_norm"]), + (True, False, ["qkv_linear"]), + (True, False, ["core_attn"]), + # # attn_proj depends on core_attn (validated in TransformerConfig.__post_init__) + (True, True, ["core_attn", "attn_proj"]), + (True, False, ["mlp_norm"]), + (True, False, ["expert_fc1"]), + (True, False, ["moe_act"]), + ], +) +def test_gpt_fine_grained_activation_offloading_correctness_and_memory( + is_moe: bool, is_mla: bool, offload_modules: List[str] +): + """ + Initialize a GPTModel and verify: + - forward output correctness under each offload_modules setting + - backward gradient correctness (subset) + - peak GPU memory is reduced roughly as expected (based on recorded offload bytes) + """ + # setup distributed/model-parallel (same pattern as other UTs) + os.environ.pop("NVTE_FUSED_ATTN", None) + os.environ.pop("NVTE_FLASH_ATTN", None) + os.environ.pop("NVTE_UNFUSED_ATTN", None) + # os.environ["NVTE_FLASH_ATTN"] = "1" + Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + + seed = 123 + # Choose shapes large enough to make memory deltas stable but still fast. + num_experts = 4 if is_moe else None + num_layers = 8 + hidden_size = 2048 if num_experts is None else 1024 + num_attention_heads = 16 if hidden_size >= 2048 else 8 + vocab_size = 1024 + seq_length = 1024 + micro_batch_size = 2 + device = torch.device("cuda") + + input_ids, position_ids, attention_mask = _make_gpt_inputs( + seq_length=seq_length, micro_batch_size=micro_batch_size, device=device + ) + from megatron.core.pipeline_parallel import fine_grained_activation_offload as off -class ToyModel(torch.nn.Module): - def __init__(self, hidden_size: int = 2048, num_layers: int = 4, dtype=torch.bfloat16): - super().__init__() - layers = [] - for _ in range(num_layers): - layers.append( - torch.nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device="cuda") + off_interface.reset_instance() + + try: + # 1) Baseline run (no offloading) + _reset_cuda_memory() + base_model = _build_gpt_model( + seed=seed, + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + vocab_size=vocab_size, + seq_length=seq_length, + num_experts=num_experts, + fine_grained_activation_offloading=False, + offload_modules=None, + min_offloaded_tensor_size=1024 * 1024, + is_mla=is_mla, + ).cuda() + base_model.train() + + # Warmup baseline once for allocator stability + _run_one_iter_and_capture( + base_model, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + enable_offload_reset=False, + ) + _reset_cuda_memory() + base_logits, base_grads, base_peak = _run_one_iter_and_capture( + base_model, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + enable_offload_reset=False, + ) + # Free baseline model GPU memory before offload path + del base_model + _reset_cuda_memory() + + # 2) Offload run (warmup to record bytes + steady-state measurement) + off_model = _build_gpt_model( + seed=seed, + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + vocab_size=vocab_size, + seq_length=seq_length, + num_experts=num_experts, + fine_grained_activation_offloading=True, + offload_modules=offload_modules, + min_offloaded_tensor_size=1024, # force offloading for UT determinism + is_mla=is_mla, + ).cuda() + off_model.train() + + # Warmup 1 iter to populate cached chunks, then reset to finish warmup bookkeeping. + _run_one_iter_and_capture( + off_model, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + enable_offload_reset=True, + ) + # Reset once more to trigger post_warmup_callback and apply steady-state offload decisions. + off_interface.reset() + + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + PipelineOffloadManager, + ) + + mgr = PipelineOffloadManager.get_instance() + expected_offload_bytes = int( + sum(mgr.offload_summary_bytes.get(k, 0) for k in offload_modules) + ) + expected_offload_mib = expected_offload_bytes / (1024**2) + + _reset_cuda_memory() + off_logits, off_grads, off_peak = _run_one_iter_and_capture( + off_model, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + enable_offload_reset=True, + ) + del off_model + _reset_cuda_memory() + + # 3) Correctness checks (forward + selected grads) + assert torch.allclose(off_logits, base_logits, rtol=1e-3, atol=1e-3) + assert set(off_grads.keys()) == set(base_grads.keys()) + for name, gb in base_grads.items(): + go = off_grads[name] + if gb is None or go is None: + assert gb is None and go is None, f"Grad None mismatch for {name}" + continue + assert torch.allclose(go, gb, rtol=1e-3, atol=1e-3), f"Grad mismatch for {name}" + + # 4) Memory checks (peak allocated over forward+backward) + saved_mib = (base_peak - off_peak) / (1024**2) + assert saved_mib > 0.0, ( + f"Expected GPU peak memory reduction for offload_modules={offload_modules}, " + f"but got saved={saved_mib:.2f}MiB (base={base_peak/(1024**2):.2f}MiB, " + f"off={off_peak/(1024**2):.2f}MiB)" + ) + + # If expectation is large enough, enforce approximate match. + # For tiny expectations, allocator noise may dominate; we only require a positive reduction. + if expected_offload_mib >= 2.0: + rel_err = abs(saved_mib - expected_offload_mib) / max(expected_offload_mib, 1e-6) + abs_err = abs(saved_mib - expected_offload_mib) + assert rel_err <= EPSILON and abs_err <= DELTA, ( + f"Memory saving mismatch for offload_modules={offload_modules}: " + f"saved={saved_mib:.2f}MiB expected~={expected_offload_mib:.2f}MiB " + f"(rel_err={rel_err:.2f}, abs_err={abs_err:.2f})" ) - self.net = torch.nn.Sequential(*layers).to(device="cuda", dtype=dtype) - self.hidden_size = hidden_size - self.num_layers = num_layers - self.dtype = dtype - - # Prevent weights/bias from being considered activation tensors for offload; - # ensure we only count activation tensors (inputs x) in memory accounting. - for p in self.parameters(): - try: - setattr(p, "offloading_activation", False) - except Exception: - pass - - def forward(self, x, use_offload: bool = False): - from megatron.core.pipeline_parallel import fine_grained_activation_offload as off - - if use_offload: - # Initialize a new chunk (microbatch) and enable offload context. - with off.get_fine_grained_offloading_context(True): - off.fine_grained_offloading_init_chunk_handler( - vp_size=1, vp_stage=None, min_offloaded_tensor_size=1 - ) - for i, layer in enumerate(self.net): - # Group by module; with this linear-only model, each group corresponds to a layer. - off.fine_grained_offloading_set_last_layer(i == len(self.net) - 1) - x = off.fine_grained_offloading_group_start(x, name=f"layer_{i}") - x = layer(x) - # Commit the group; returns a tuple of tensors - (x,) = off.fine_grained_offloading_group_commit( - x, name=f"layer_{i}", forced_released_tensors=[] - ) - return x - # Baseline path (no offload hooks) - with ( - torch.autocast(device_type="cuda", dtype=self.dtype) - if self.dtype in (torch.float16, torch.bfloat16) - else torch.cuda.amp.autocast(enabled=False) - ): - for layer in self.net: - x = layer(x) - return x - - -@pytest.fixture(autouse=True) -def _monkeypatch_offload_deps(monkeypatch): - # Avoid requiring torch.distributed initialization and NVML in tests - import megatron.core.pipeline_parallel.fine_grained_activation_offload as off - - monkeypatch.setattr(off, "debug_rank", lambda *args, **kwargs: None, raising=False) - monkeypatch.setattr(off, "set_ideal_affinity_for_current_gpu", lambda: None, raising=False) - # Ensure a clean state each test - off.fine_grained_offloading_reset() - yield - off.fine_grained_offloading_reset() - - -def test_fine_grained_activation_offload_memory_reduction(): - torch.manual_seed(1234) - # Use a linear-only stack so theoretical saved memory equals sum of per-layer input x bytes. - model = ToyModel(hidden_size=2048, num_layers=8, dtype=torch.bfloat16).eval() - - # Create input - inp = torch.randn( - (2048, model.hidden_size), device="cuda", dtype=torch.bfloat16, requires_grad=True + print( + f"Rank {torch.distributed.get_rank()}: Saved {saved_mib:.2f}MiB, expected {expected_offload_mib:.2f}MiB" + ) + finally: + Utils.destroy_model_parallel() + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.") +@pytest.mark.skipif( + not is_te_min_version("1.9.0.dev0"), + reason="EP A2A overlap requires TE 1.9.0.dev0+ in this repo's tests.", +) +@pytest.mark.parametrize( + "dispatcher_backend, is_mla, offload_modules", + [ + ("alltoall", True, ["attn_norm"]), + ("alltoall", True, ["core_attn"]), + ("alltoall", True, ["attn_norm", "core_attn", "attn_proj"]), + ("alltoall", True, ["mlp_norm"]), + ("alltoall", False, ["expert_fc1"]), + ("alltoall", False, ["moe_act"]), + ("alltoall", False, ["mlp_norm", "expert_fc1", "moe_act"]), + ( + "alltoall", + True, + ["attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"], + ), + ( + "alltoall", + False, + ["attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"], + ), + ], +) +def test_fine_grained_activation_offload_with_ep_a2a_overlap_compatibility( + dispatcher_backend: str, is_mla: bool, offload_modules: List[str] +): + """ + Compatibility test for: + - fine-grained activation offloading + - EP all-to-all overlap (overlap_moe_expert_parallel_comm) + - memory saving roughly matches expected offload bytes (when expectation is large enough) + + The EP A2A overlap initialization pattern is aligned with + `tests/unit_tests/a2a_overlap/test_schedule_chunk_1f1b.py`. + """ + from megatron.core.models.common.model_chunk_schedule_plan import ( + TransformerModelChunkSchedulePlan, ) + from megatron.core.pipeline_parallel.utils import set_streams + from tests.unit_tests.a2a_overlap.utils import deterministic_mode + + # EP overlap requires distributed initialization with EP groups. + ep_size = 4 + if Utils.world_size % ep_size != 0: + pytest.skip( + f"Skipping: WORLD_SIZE={Utils.world_size} must be divisible by ep_size={ep_size}." + ) + + seed = 123 + num_experts = 8 # must be divisible by ep_size + if num_experts % ep_size != 0: + pytest.skip( + f"Skipping: num_moe_experts={num_experts} must be divisible by ep_size={ep_size}." + ) + + # Small shapes to keep this compatibility test fast. + num_layers = 8 + hidden_size = 1024 + num_attention_heads = 16 + vocab_size = 1024 + seq_length = 1024 + micro_batch_size = 2 + device = torch.device("cuda") - # Warmup to stabilize allocator behavior - _reset_cuda_memory() - out = model(inp, use_offload=False) - (out.sum()).backward() - torch.cuda.synchronize() - _reset_cuda_memory() - - # Baseline memory measurement (no offload) - _reset_cuda_memory() - inp_baseline = inp.detach().clone().requires_grad_(True) - baseline_mem_before = torch.cuda.memory_allocated() / (1024**2) - out_base = model(inp_baseline, use_offload=False) - baseline_mem_after = (torch.cuda.memory_allocated() - out_base.nbytes) / (1024**2) - (out_base.sum()).backward() - torch.cuda.synchronize() - baseline_delta = baseline_mem_after - baseline_mem_before - - # Offload memory measurement from megatron.core.pipeline_parallel import fine_grained_activation_offload as off - off.fine_grained_offloading_reset() - _reset_cuda_memory() - inp_off = inp.detach().clone().requires_grad_(True) - offload_mem_before = torch.cuda.memory_allocated() / (1024**2) - out_off = model(inp_off, use_offload=True) - offload_mem_after = (torch.cuda.memory_allocated() - out_off.nbytes) / (1024**2) - (out_off.sum()).backward() - torch.cuda.synchronize() - offload_delta = offload_mem_after - offload_mem_before - - # Offload should reduce peak cached memory usage after forward - assert ( - offload_delta < baseline_delta - ), f"offload did not reduce memory: off={offload_delta:.2f}MiB base={baseline_delta:.2f}MiB" - - # Theoretical savings: storing per-layer input x (same shape each layer). - bytes_per_elem = inp.element_size() # 2 for bfloat16 - input_bytes = inp.numel() * bytes_per_elem - # -2 because the first and last activations are not offloaded - expected_saved_mib = (model.num_layers - 2) * (input_bytes / (1024**2)) - - # Actual savings ≈ baseline_delta - offload_delta (both exclude output tensor memory). - actual_saved_mib = baseline_delta - offload_delta - - # Allow slack for allocator jitter and extra intermediates; magnitudes should match. - rel_err = abs(actual_saved_mib - expected_saved_mib) / max(expected_saved_mib, 1e-6) - assert ( - rel_err <= EPSILON - ), f"saved mismatch: actual={actual_saved_mib:.2f}MiB expected~={expected_saved_mib:.2f}MiB (rel_err={rel_err:.2f})" - - -def test_fine_grained_activation_offload_output_and_grad_consistency(): - torch.manual_seed(2025) - hidden = 1024 - layers = 3 - - # Create identical models by resetting seed - torch.manual_seed(2025) - model_base = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() - torch.manual_seed(2025) - model_off = ToyModel(hidden_size=hidden, num_layers=layers, dtype=torch.bfloat16).train() - - # Same input and target - inp = torch.randn((32, hidden), device="cuda", dtype=torch.bfloat16, requires_grad=True) - target = torch.randn_like(inp) - - # Baseline forward/backward - out_base = model_base(inp, use_offload=False) - loss_base = torch.nn.functional.mse_loss(out_base, target) - loss_base.backward() - grads_base = [ - p.grad.detach().clone() if p.grad is not None else None for p in model_base.parameters() - ] - - # Offload forward/backward - from megatron.core.pipeline_parallel import fine_grained_activation_offload as off + def _make_schedule_inputs() -> Dict[str, torch.Tensor]: + data = list(range(seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device) + position_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).to(device) + ) + attention_mask = torch.ones((micro_batch_size, 1, seq_length, seq_length), dtype=bool).to( + device + ) + labels = input_ids.clone() + return { + "input_ids": input_ids, + "labels": labels, + "position_ids": position_ids, + "attention_mask": attention_mask, + } + + def _capture_params(model: torch.nn.Module) -> Dict[str, torch.Tensor]: + params: Dict[str, torch.Tensor] = {} + for name, p in model.named_parameters(): + params[name] = p.detach().clone() + return params + + def _restore_params(model: torch.nn.Module, params: Dict[str, torch.Tensor]) -> None: + for name, p in model.named_parameters(): + p.data.copy_(params[name]) + + def _build_overlap_moe_gpt( + *, enable_offload: bool, is_mla: bool, dispatcher_backend: str + ) -> GPTModel: + model_parallel_cuda_manual_seed(seed) + torch.manual_seed(seed) + ConfigClass = MLATransformerConfig if is_mla else TransformerConfig + transformer_config = ConfigClass( + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + use_cpu_initialization=True, + attention_backend=AttnBackend.unfused, + # Recompute + recompute_modules=["layernorm", "moe_act"], + recompute_granularity="selective", + bf16=True, + # MoE + EP overlap + num_moe_experts=num_experts, + moe_grouped_gemm=True, + expert_model_parallel_size=ep_size, + moe_token_dispatcher_type="alltoall" if dispatcher_backend == "alltoall" else "flex", + moe_flex_dispatcher_backend=dispatcher_backend, + moe_router_dtype="fp32" if dispatcher_backend == "hybridep" else "fp64", + overlap_moe_expert_parallel_comm=True, + delay_wgrad_compute=True, + # Fine-grained activation offloading + fine_grained_activation_offloading=enable_offload, + offload_modules=offload_modules if enable_offload else None, + min_offloaded_tensor_size=1024, # force offloading to exercise the code path + ) + return ( + GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec( + num_experts=num_experts, + moe_grouped_gemm=True, + moe_use_legacy_grouped_gemm=False, + multi_latent_attention=is_mla, + ), + vocab_size=vocab_size, + max_sequence_length=seq_length, + ) + .bfloat16() + .cuda() + ) + + def _run_schedule_1f1b_two_microbatches( + model: GPTModel, *, enable_offload_reset: bool + ) -> Tuple[List[torch.Tensor], Dict[str, torch.Tensor], int]: + """ + Run a minimal 1F1B schedule (2 microbatches) using ModelChunkSchedulePlan.run(). + This is the execution path that exercises EP A2A overlap scheduling. + """ + if enable_offload_reset: + off_interface.reset() + + data0 = _make_schedule_inputs() + data1 = _make_schedule_inputs() + plan0 = model.build_schedule_plan(**data0) + + torch.cuda.reset_peak_memory_stats() + out0 = TransformerModelChunkSchedulePlan.run(plan0, None) + plan1 = model.build_schedule_plan(**data1) + out1 = TransformerModelChunkSchedulePlan.run(plan1, plan0, b_grad=torch.ones_like(out0)) + TransformerModelChunkSchedulePlan.run(None, plan1, b_grad=torch.ones_like(out1)) + torch.cuda.synchronize() + peak_bytes = int(torch.cuda.max_memory_allocated()) + + # capture outputs and grads + outputs = [out0.detach().float().cpu(), out1.detach().float().cpu()] + grads: Dict[str, torch.Tensor] = {} + for name, p in model.named_parameters(): + grads[name] = p.grad.detach().float().cpu() if p.grad is not None else None + return outputs, grads, peak_bytes + + # setup distributed/model-parallel + os.environ.pop("NVTE_FUSED_ATTN", None) + os.environ.pop("NVTE_FLASH_ATTN", None) + os.environ.pop("NVTE_UNFUSED_ATTN", None) + + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + expert_model_parallel_size=ep_size, + ) + set_streams() + + off_interface.reset_instance() + + try: + with deterministic_mode(): + # Baseline: EP overlap on, offload off. + _reset_cuda_memory() + base_model = _build_overlap_moe_gpt( + enable_offload=False, is_mla=is_mla, dispatcher_backend=dispatcher_backend + ) + base_model.train() + base_params = _capture_params(base_model) + # Warmup once for allocator stability / graph caching + _run_schedule_1f1b_two_microbatches(base_model, enable_offload_reset=False) + _reset_cuda_memory() + base_outs, base_grads, base_peak = _run_schedule_1f1b_two_microbatches( + base_model, enable_offload_reset=False + ) + del base_model + _reset_cuda_memory() + + # Offload: EP overlap on, fine-grained offload on. + off_model = _build_overlap_moe_gpt( + enable_offload=True, is_mla=is_mla, dispatcher_backend=dispatcher_backend + ) + _restore_params(off_model, base_params) + off_model.train() + # Warmup once to populate cached chunks, then reset to apply steady-state offload decisions. + off_interface.reset() + _run_schedule_1f1b_two_microbatches(off_model, enable_offload_reset=False) + off_interface.reset() + from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + PipelineOffloadManager, + ) + + mgr = PipelineOffloadManager.get_instance() + expected_offload_bytes = int( + sum(mgr.offload_summary_bytes.get(k, 0) for k in offload_modules) + ) + expected_offload_mib = expected_offload_bytes / (1024**2) - off.fine_grained_offloading_reset() - out_off = model_off(inp.detach().clone().requires_grad_(True), use_offload=True) - loss_off = torch.nn.functional.mse_loss(out_off, target) - loss_off.backward() - grads_off = [ - p.grad.detach().clone() if p.grad is not None else None for p in model_off.parameters() - ] - - # Compare outputs - assert torch.allclose(out_off.float(), out_base.float(), rtol=1e-3, atol=1e-3) - - # Compare gradients parameter-wise - for gb, go in zip(grads_base, grads_off): - if gb is None and go is None: - continue - assert gb is not None and go is not None - assert torch.allclose(go.float(), gb.float(), rtol=1e-3, atol=1e-3) + _reset_cuda_memory() + off_outs, off_grads, off_peak = _run_schedule_1f1b_two_microbatches( + off_model, enable_offload_reset=True + ) + del off_model + _reset_cuda_memory() + + # Correctness (forward outputs + all grads) + assert len(off_outs) == len(base_outs) == 2 + for i in range(2): + assert torch.allclose(off_outs[i], base_outs[i], rtol=1e-3, atol=1e-3) + assert set(off_grads.keys()) == set(base_grads.keys()) + for name, gb in base_grads.items(): + go = off_grads[name] + if gb is None or go is None: + assert gb is None and go is None, f"Grad None mismatch for {name}" + continue + assert torch.allclose( + go, gb, rtol=1e-3, atol=1e-3 + ), f"Rank {torch.distributed.get_rank()}: Grad mismatch for {name}" + + # Memory checks (peak allocated during the scheduled 1F1B run) + saved_mib = (base_peak - off_peak) / (1024**2) + assert saved_mib > 0.0, ( + f"Expected GPU peak memory reduction for offload_modules={offload_modules}, " + f"but got saved={saved_mib:.2f}MiB (base={base_peak/(1024**2):.2f}MiB, " + f"off={off_peak/(1024**2):.2f}MiB)" + ) + # If expectation is large enough, enforce approximate match. + if expected_offload_mib >= 2.0: + rel_err = abs(saved_mib - expected_offload_mib) / max(expected_offload_mib, 1e-6) + abs_err = abs(saved_mib - expected_offload_mib) + print( + f"Rank {torch.distributed.get_rank()}: Saved {saved_mib:.2f}MiB, expected {expected_offload_mib:.2f}MiB" + ) + if abs_err > DELTA: + assert rel_err <= EPSILON_A2A, ( + f"Memory saving mismatch for offload_modules={offload_modules}: " + f"saved={saved_mib:.2f}MiB expected~={expected_offload_mib:.2f}MiB " + f"(rel_err={rel_err:.2f}, abs_err={abs_err:.2f})" + ) + finally: + Utils.destroy_model_parallel() From 6807df4ff4f97e1b56b978877b891328a25b8b7a Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 22 Jan 2026 21:10:35 +0800 Subject: [PATCH 248/248] [Dev] [fix] Bug fix for offloading in evaluate() (#3041) Signed-off-by: Hongbin Liu --- .../core/pipeline_parallel/fine_grained_activation_offload.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 9996c9b57a4..01c3a0c3aa0 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -654,6 +654,9 @@ def pop_forward_chunk(self, name=None): while not self._is_warmup and ( self._cur_forward_chunk is None or self._cur_forward_chunk.finish_all_groups(name) ): + if self._cached_chunks_index_forward >= len(self._cached_chunks_forward): + self._cur_forward_chunk = None + break self._cur_forward_chunk = self._cached_chunks_forward[self._cached_chunks_index_forward] self._cached_chunks_index_forward += 1 debug_rank(f"new cur_forward_chunk {self._cur_forward_chunk}")